Diffstat (limited to 'storage/bdb/mp/mp_alloc.c')
-rw-r--r--  storage/bdb/mp/mp_alloc.c  |  442
1 files changed, 442 insertions, 0 deletions
diff --git a/storage/bdb/mp/mp_alloc.c b/storage/bdb/mp/mp_alloc.c
new file mode 100644
index 00000000000..96dd612d7ba
--- /dev/null
+++ b/storage/bdb/mp/mp_alloc.c
@@ -0,0 +1,442 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2002
+ * Sleepycat Software. All rights reserved.
+ */
+#include "db_config.h"
+
+#ifndef lint
+static const char revid[] = "$Id: mp_alloc.c,v 11.31 2002/08/14 17:21:37 ubell Exp $";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+#include <string.h>
+#endif
+
+#include "db_int.h"
+#include "dbinc/db_shash.h"
+#include "dbinc/mp.h"
+
+typedef struct {
+ DB_MPOOL_HASH *bucket;
+ u_int32_t priority;
+} HS;
+
+static void __memp_bad_buffer __P((DB_MPOOL_HASH *));
+static void __memp_reset_lru __P((DB_ENV *, REGINFO *, MPOOL *));
+
+/*
+ * __memp_alloc --
+ * Allocate some space from a cache region.
+ *
+ * PUBLIC: int __memp_alloc __P((DB_MPOOL *,
+ * PUBLIC: REGINFO *, MPOOLFILE *, size_t, roff_t *, void *));
+ */
+int
+__memp_alloc(dbmp, memreg, mfp, len, offsetp, retp)
+ DB_MPOOL *dbmp;
+ REGINFO *memreg;
+ MPOOLFILE *mfp;
+ size_t len;
+ roff_t *offsetp;
+ void *retp;
+{
+ BH *bhp;
+ DB_ENV *dbenv;
+ DB_MPOOL_HASH *dbht, *hp, *hp_end, *hp_tmp;
+ DB_MUTEX *mutexp;
+ MPOOL *c_mp;
+ MPOOLFILE *bh_mfp;
+ size_t freed_space;
+ u_int32_t buckets, buffers, high_priority, max_na, priority;
+ int aggressive, ret;
+ void *p;
+
+ dbenv = dbmp->dbenv;
+ c_mp = memreg->primary;
+ dbht = R_ADDR(memreg, c_mp->htab);
+ hp_end = &dbht[c_mp->htab_buckets];
+
+ buckets = buffers = 0;
+ aggressive = 0;
+
+ c_mp->stat.st_alloc++;
+
+ /*
+	 * Get aggressive if we've tried to flush as many pages as there are
+	 * in the system without finding space.
+ */
+ max_na = 5 * c_mp->htab_buckets;
+
+ /*
+ * If we're allocating a buffer, and the one we're discarding is the
+ * same size, we don't want to waste the time to re-integrate it into
+ * the shared memory free list. If the DB_MPOOLFILE argument isn't
+ * NULL, we'll compare the underlying page sizes of the two buffers
+	 * before freeing and re-allocating buffers.
+ */
+ if (mfp != NULL)
+ len = (sizeof(BH) - sizeof(u_int8_t)) + mfp->stat.st_pagesize;
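+	/*
+	 * Assuming the usual BH layout in dbinc/mp.h (the struct ends in a
+	 * one-byte buf[] placeholder for the page image), the
+	 * sizeof(u_int8_t) subtraction avoids counting that placeholder
+	 * byte twice when the page size is added.
+	 */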
+
+ R_LOCK(dbenv, memreg);
+
+ /*
+ * On every buffer allocation we update the buffer generation number
+ * and check for wraparound.
+ */
+ if (++c_mp->lru_count == UINT32_T_MAX)
+ __memp_reset_lru(dbenv, memreg, c_mp);
+
+ /*
+ * Anything newer than 1/10th of the buffer pool is ignored during
+ * allocation (unless allocation starts failing).
+ */
+ DB_ASSERT(c_mp->lru_count > c_mp->stat.st_pages / 10);
+ high_priority = c_mp->lru_count - c_mp->stat.st_pages / 10;
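+	/*
+	 * For example, with purely illustrative numbers: if lru_count is
+	 * 100000 and st_pages is 5000, high_priority is 99500, so only
+	 * buffers whose priority was set within the last 500 allocations
+	 * are treated as too new to consider for eviction.
+	 */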
+
+ /*
+ * First we try to allocate from free memory. If that fails, scan the
+ * buffer pool to find buffers with low priorities. We consider small
+ * sets of hash buckets each time to limit the amount of work needing
+ * to be done. This approximates LRU, but not very well. We either
+ * find a buffer of the same size to use, or we will free 3 times what
+ * we need in the hopes it will coalesce into a contiguous chunk of the
+ * right size. In the latter case we branch back here and try again.
+ */
+alloc: if ((ret = __db_shalloc(memreg->addr, len, MUTEX_ALIGN, &p)) == 0) {
+ if (mfp != NULL)
+ c_mp->stat.st_pages++;
+ R_UNLOCK(dbenv, memreg);
+
+found: if (offsetp != NULL)
+ *offsetp = R_OFFSET(memreg, p);
+ *(void **)retp = p;
+
+ /*
+ * Update the search statistics.
+ *
+		 * We're not holding the region locked here, so these statistics
+ * can't be trusted.
+ */
+ if (buckets != 0) {
+ if (buckets > c_mp->stat.st_alloc_max_buckets)
+ c_mp->stat.st_alloc_max_buckets = buckets;
+ c_mp->stat.st_alloc_buckets += buckets;
+ }
+ if (buffers != 0) {
+ if (buffers > c_mp->stat.st_alloc_max_pages)
+ c_mp->stat.st_alloc_max_pages = buffers;
+ c_mp->stat.st_alloc_pages += buffers;
+ }
+ return (0);
+ }
+
+ /*
+ * We re-attempt the allocation every time we've freed 3 times what
+ * we need. Reset our free-space counter.
+ */
+ freed_space = 0;
+
+ /*
+ * Walk the hash buckets and find the next two with potentially useful
+ * buffers. Free the buffer with the lowest priority from the buckets'
+ * chains.
+ */
+ for (hp_tmp = NULL;;) {
+ /* Check for wrap around. */
+ hp = &dbht[c_mp->last_checked++];
+ if (hp >= hp_end) {
+ c_mp->last_checked = 0;
+
+ /*
+ * If we've gone through all of the hash buckets, try
+ * an allocation. If the cache is small, the old page
+ * size is small, and the new page size is large, we
+ * might have freed enough memory (but not 3 times the
+ * memory).
+ */
+ goto alloc;
+ }
+
+ /*
+ * Skip empty buckets.
+ *
+ * We can check for empty buckets before locking as we
+ * only care if the pointer is zero or non-zero.
+ */
+ if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL)
+ continue;
+
+ /*
+ * The failure mode is when there are too many buffers we can't
+ * write or there's not enough memory in the system. We don't
+ * have a metric for deciding if allocation has no possible way
+		 * to succeed, so we never fail; we assume memory will become
+		 * available if we wait long enough.
+ *
+		 * Get aggressive if we've tried to flush 5 times as many hash
+		 * buckets as there are in the system -- it's possible we have
+ * been repeatedly trying to flush the same buffers, although
+ * it's unlikely. Aggressive means:
+ *
+ * a: set a flag to attempt to flush high priority buffers as
+ * well as other buffers.
+ * b: sync the mpool to force out queue extent pages. While we
+ * might not have enough space for what we want and flushing
+ * is expensive, why not?
+ * c: sleep for a second -- hopefully someone else will run and
+ * free up some memory. Try to allocate memory too, in case
+ * the other thread returns its memory to the region.
+ * d: look at a buffer in every hash bucket rather than choose
+ * the more preferable of two.
+ *
+ * !!!
+ * This test ignores pathological cases like no buffers in the
+ * system -- that shouldn't be possible.
+ */
+ if ((++buckets % max_na) == 0) {
+ aggressive = 1;
+
+ R_UNLOCK(dbenv, memreg);
+
+ (void)__memp_sync_int(
+ dbenv, NULL, 0, DB_SYNC_ALLOC, NULL);
+
+ (void)__os_sleep(dbenv, 1, 0);
+
+ R_LOCK(dbenv, memreg);
+ goto alloc;
+ }
+
+ if (!aggressive) {
+ /* Skip high priority buckets. */
+ if (hp->hash_priority > high_priority)
+ continue;
+
+ /*
+ * Find two buckets and select the one with the lowest
+ * priority. Performance testing shows that looking
+ * at two improves the LRUness and looking at more only
+ * does a little better.
+ */
+ if (hp_tmp == NULL) {
+ hp_tmp = hp;
+ continue;
+ }
+ if (hp->hash_priority > hp_tmp->hash_priority)
+ hp = hp_tmp;
+ hp_tmp = NULL;
+ }
+
+ /* Remember the priority of the buffer we're looking for. */
+ priority = hp->hash_priority;
+
+ /* Unlock the region and lock the hash bucket. */
+ R_UNLOCK(dbenv, memreg);
+ mutexp = &hp->hash_mutex;
+ MUTEX_LOCK(dbenv, mutexp);
+
+#ifdef DIAGNOSTIC
+ __memp_check_order(hp);
+#endif
+ /*
+ * The lowest priority page is first in the bucket, as they are
+ * maintained in sorted order.
+ *
+ * The buffer may have been freed or its priority changed while
+ * we switched from the region lock to the hash lock. If so,
+ * we have to restart. We will still take the first buffer on
+ * the bucket's list, though, if it has a low enough priority.
+ */
+ if ((bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)) == NULL ||
+ bhp->ref != 0 || bhp->priority > priority)
+ goto next_hb;
+
+ buffers++;
+
+ /* Find the associated MPOOLFILE. */
+ bh_mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
+
+ /* If the page is dirty, pin it and write it. */
+ ret = 0;
+ if (F_ISSET(bhp, BH_DIRTY)) {
+ ++bhp->ref;
+ ret = __memp_bhwrite(dbmp, hp, bh_mfp, bhp, 0);
+ --bhp->ref;
+ if (ret == 0)
+ ++c_mp->stat.st_rw_evict;
+ } else
+ ++c_mp->stat.st_ro_evict;
+
+ /*
+ * If a write fails for any reason, we can't proceed.
+ *
+ * We released the hash bucket lock while doing I/O, so another
+ * thread may have acquired this buffer and incremented the ref
+ * count after we wrote it, in which case we can't have it.
+ *
+ * If there's a write error, avoid selecting this buffer again
+ * by making it the bucket's least-desirable buffer.
+ */
+ if (ret != 0 || bhp->ref != 0) {
+ if (ret != 0 && aggressive)
+ __memp_bad_buffer(hp);
+ goto next_hb;
+ }
+
+ /*
+ * Check to see if the buffer is the size we're looking for.
+		 * If so, we can simply reuse it.  Otherwise, free the buffer
+		 * and its space and keep looking.
+ */
+ if (mfp != NULL &&
+ mfp->stat.st_pagesize == bh_mfp->stat.st_pagesize) {
+ __memp_bhfree(dbmp, hp, bhp, 0);
+
+ p = bhp;
+ goto found;
+ }
+
+ freed_space += __db_shsizeof(bhp);
+ __memp_bhfree(dbmp, hp, bhp, 1);
+
+ /*
+ * Unlock this hash bucket and re-acquire the region lock. If
+ * we're reaching here as a result of calling memp_bhfree, the
+ * hash bucket lock has already been discarded.
+ */
+ if (0) {
+next_hb: MUTEX_UNLOCK(dbenv, mutexp);
+ }
+ R_LOCK(dbenv, memreg);
+
+ /*
+ * Retry the allocation as soon as we've freed up sufficient
+		 * space.  We're likely to have to coalesce chunks of memory to
+		 * satisfy the request, so don't try until it's likely (possible?)
+ * we'll succeed.
+ */
+ if (freed_space >= 3 * len)
+ goto alloc;
+ }
+ /* NOTREACHED */
+}
+
+/*
+ * __memp_bad_buffer --
+ * Make the first buffer in a hash bucket the least desirable buffer.
+ */
+static void
+__memp_bad_buffer(hp)
+ DB_MPOOL_HASH *hp;
+{
+ BH *bhp, *t_bhp;
+ u_int32_t priority;
+
+ /* Remove the first buffer from the bucket. */
+ bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
+ SH_TAILQ_REMOVE(&hp->hash_bucket, bhp, hq, __bh);
+
+ /*
+ * Find the highest priority buffer in the bucket. Buffers are
+ * sorted by priority, so it's the last one in the bucket.
+ *
+ * XXX
+ * Should use SH_TAILQ_LAST, but I think that macro is broken.
+ */
+ priority = bhp->priority;
+ for (t_bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
+ t_bhp != NULL; t_bhp = SH_TAILQ_NEXT(t_bhp, hq, __bh))
+ priority = t_bhp->priority;
+
+ /*
+ * Set our buffer's priority to be just as bad, and append it to
+ * the bucket.
+ */
+ bhp->priority = priority;
+ SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, bhp, hq);
+
+ /* Reset the hash bucket's priority. */
+ hp->hash_priority = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)->priority;
+}
+
+/*
+ * __memp_reset_lru --
+ * Reset the cache LRU counter.
+ */
+static void
+__memp_reset_lru(dbenv, memreg, c_mp)
+ DB_ENV *dbenv;
+ REGINFO *memreg;
+ MPOOL *c_mp;
+{
+ BH *bhp;
+ DB_MPOOL_HASH *hp;
+ int bucket;
+
+ /*
+ * Update the counter so all future allocations will start at the
+ * bottom.
+ */
+ c_mp->lru_count -= MPOOL_BASE_DECREMENT;
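+	/*
+	 * Decrementing the counter here and (below) each buffer's priority
+	 * by the same fixed amount keeps recently used buffers in the same
+	 * relative LRU order; buffers already at or below
+	 * MPOOL_BASE_DECREMENT are left alone, so they remain early
+	 * eviction candidates.
+	 */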
+
+ /* Release the region lock. */
+ R_UNLOCK(dbenv, memreg);
+
+ /* Adjust the priority of every buffer in the system. */
+ for (hp = R_ADDR(memreg, c_mp->htab),
+ bucket = 0; bucket < c_mp->htab_buckets; ++hp, ++bucket) {
+ /*
+ * Skip empty buckets.
+ *
+ * We can check for empty buckets before locking as we
+ * only care if the pointer is zero or non-zero.
+ */
+ if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL)
+ continue;
+
+ MUTEX_LOCK(dbenv, &hp->hash_mutex);
+ for (bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
+ bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh))
+ if (bhp->priority != UINT32_T_MAX &&
+ bhp->priority > MPOOL_BASE_DECREMENT)
+ bhp->priority -= MPOOL_BASE_DECREMENT;
+ MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
+ }
+
+ /* Reacquire the region lock. */
+ R_LOCK(dbenv, memreg);
+}
+
+#ifdef DIAGNOSTIC
+/*
+ * __memp_check_order --
+ * Verify the priority ordering of a hash bucket chain.
+ *
+ * PUBLIC: #ifdef DIAGNOSTIC
+ * PUBLIC: void __memp_check_order __P((DB_MPOOL_HASH *));
+ * PUBLIC: #endif
+ */
+void
+__memp_check_order(hp)
+ DB_MPOOL_HASH *hp;
+{
+ BH *bhp;
+ u_int32_t priority;
+
+ /*
+ * Assumes the hash bucket is locked.
+ */
+ if ((bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)) == NULL)
+ return;
+
+ DB_ASSERT(bhp->priority == hp->hash_priority);
+
+ for (priority = bhp->priority;
+ (bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) != NULL;
+ priority = bhp->priority)
+ DB_ASSERT(priority <= bhp->priority);
+}
+#endif