Diffstat (limited to 'bdb/mp/mp_alloc.c')
-rw-r--r--  bdb/mp/mp_alloc.c  430
1 file changed, 360 insertions, 70 deletions
diff --git a/bdb/mp/mp_alloc.c b/bdb/mp/mp_alloc.c
index 731f569f57f..96dd612d7ba 100644
--- a/bdb/mp/mp_alloc.c
+++ b/bdb/mp/mp_alloc.c
@@ -1,22 +1,31 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Copyright (c) 1996-2002
* Sleepycat Software. All rights reserved.
*/
#include "db_config.h"
#ifndef lint
-static const char revid[] = "$Id: mp_alloc.c,v 11.7 2000/04/20 21:14:18 bostic Exp $";
+static const char revid[] = "$Id: mp_alloc.c,v 11.31 2002/08/14 17:21:37 ubell Exp $";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
#include <sys/types.h>
+#include <string.h>
#endif
#include "db_int.h"
-#include "db_shash.h"
-#include "mp.h"
+#include "dbinc/db_shash.h"
+#include "dbinc/mp.h"
+
+typedef struct {
+ DB_MPOOL_HASH *bucket;
+ u_int32_t priority;
+} HS;
+
+static void __memp_bad_buffer __P((DB_MPOOL_HASH *));
+static void __memp_reset_lru __P((DB_ENV *, REGINFO *, MPOOL *));
/*
* __memp_alloc --
@@ -34,14 +43,32 @@ __memp_alloc(dbmp, memreg, mfp, len, offsetp, retp)
roff_t *offsetp;
void *retp;
{
- BH *bhp, *nbhp;
+ BH *bhp;
+ DB_ENV *dbenv;
+ DB_MPOOL_HASH *dbht, *hp, *hp_end, *hp_tmp;
+ DB_MUTEX *mutexp;
MPOOL *c_mp;
MPOOLFILE *bh_mfp;
- size_t total;
- int nomore, restart, ret, wrote;
+ size_t freed_space;
+ u_int32_t buckets, buffers, high_priority, max_na, priority;
+ int aggressive, ret;
void *p;
+ dbenv = dbmp->dbenv;
c_mp = memreg->primary;
+ dbht = R_ADDR(memreg, c_mp->htab);
+ hp_end = &dbht[c_mp->htab_buckets];
+
+ buckets = buffers = 0;
+ aggressive = 0;
+
+ c_mp->stat.st_alloc++;
+
+ /*
+ * Get aggressive if we've tried to flush as many pages as there are
+ * in the system without finding space.
+ */
+ max_na = 5 * c_mp->htab_buckets;
/*
* If we're allocating a buffer, and the one we're discarding is the
@@ -53,100 +80,363 @@ __memp_alloc(dbmp, memreg, mfp, len, offsetp, retp)
if (mfp != NULL)
len = (sizeof(BH) - sizeof(u_int8_t)) + mfp->stat.st_pagesize;
- nomore = 0;
+ R_LOCK(dbenv, memreg);
+
+ /*
+ * On every buffer allocation we update the buffer generation number
+ * and check for wraparound.
+ */
+ if (++c_mp->lru_count == UINT32_T_MAX)
+ __memp_reset_lru(dbenv, memreg, c_mp);
+
+ /*
+ * Anything newer than 1/10th of the buffer pool is ignored during
+ * allocation (unless allocation starts failing).
+ */
+ DB_ASSERT(c_mp->lru_count > c_mp->stat.st_pages / 10);
+ high_priority = c_mp->lru_count - c_mp->stat.st_pages / 10;
+
+ /*
+ * First we try to allocate from free memory. If that fails, scan the
+ * buffer pool to find buffers with low priorities. We consider small
+ * sets of hash buckets each time to limit the amount of work needing
+ * to be done. This approximates LRU, but not very well. We either
+ * find a buffer of the same size to use, or we will free 3 times what
+ * we need in the hopes it will coalesce into a contiguous chunk of the
+ * right size. In the latter case we branch back here and try again.
+ */
alloc: if ((ret = __db_shalloc(memreg->addr, len, MUTEX_ALIGN, &p)) == 0) {
- if (offsetp != NULL)
+ if (mfp != NULL)
+ c_mp->stat.st_pages++;
+ R_UNLOCK(dbenv, memreg);
+
+found: if (offsetp != NULL)
*offsetp = R_OFFSET(memreg, p);
*(void **)retp = p;
+
+ /*
+ * Update the search statistics.
+ *
+ * We're not holding the region locked here, so these statistics
+ * can't be trusted.
+ */
+ if (buckets != 0) {
+ if (buckets > c_mp->stat.st_alloc_max_buckets)
+ c_mp->stat.st_alloc_max_buckets = buckets;
+ c_mp->stat.st_alloc_buckets += buckets;
+ }
+ if (buffers != 0) {
+ if (buffers > c_mp->stat.st_alloc_max_pages)
+ c_mp->stat.st_alloc_max_pages = buffers;
+ c_mp->stat.st_alloc_pages += buffers;
+ }
return (0);
}
- if (nomore) {
- __db_err(dbmp->dbenv,
- "Unable to allocate %lu bytes from mpool shared region: %s\n",
- (u_long)len, db_strerror(ret));
- return (ret);
- }
-retry: /* Find a buffer we can flush; pure LRU. */
- restart = total = 0;
- for (bhp =
- SH_TAILQ_FIRST(&c_mp->bhq, __bh); bhp != NULL; bhp = nbhp) {
- nbhp = SH_TAILQ_NEXT(bhp, q, __bh);
+ /*
+ * We re-attempt the allocation every time we've freed 3 times what
+ * we need. Reset our free-space counter.
+ */
+ freed_space = 0;
- /* Ignore pinned or locked (I/O in progress) buffers. */
- if (bhp->ref != 0 || F_ISSET(bhp, BH_LOCKED))
+ /*
+ * Walk the hash buckets and find the next two with potentially useful
+ * buffers. Free the buffer with the lowest priority from the buckets'
+ * chains.
+ */
+ for (hp_tmp = NULL;;) {
+ /* Check for wrap around. */
+ hp = &dbht[c_mp->last_checked++];
+ if (hp >= hp_end) {
+ c_mp->last_checked = 0;
+
+ /*
+ * If we've gone through all of the hash buckets, try
+ * an allocation. If the cache is small, the old page
+ * size is small, and the new page size is large, we
+ * might have freed enough memory (but not 3 times the
+ * memory).
+ */
+ goto alloc;
+ }
+
+ /*
+ * Skip empty buckets.
+ *
+ * We can check for empty buckets before locking as we
+ * only care if the pointer is zero or non-zero.
+ */
+ if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL)
continue;
- /* Find the associated MPOOLFILE. */
- bh_mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
+ /*
+ * The failure mode is when there are too many buffers we can't
+ * write or there's not enough memory in the system. We don't
+ * have a metric for deciding if allocation has no possible way
+ * to succeed, so we never fail; we assume memory will be
+ * available if we wait long enough.
+ *
+ * Get aggressive if we've tried to flush 5 times as many hash
+ * buckets as there are in the system -- it's possible we have
+ * been repeatedly trying to flush the same buffers, although
+ * it's unlikely. Aggressive means:
+ *
+ * a: set a flag to attempt to flush high priority buffers as
+ * well as other buffers.
+ * b: sync the mpool to force out queue extent pages. While we
+ * might not have enough space for what we want and flushing
+ * is expensive, why not?
+ * c: sleep for a second -- hopefully someone else will run and
+ * free up some memory. Try to allocate memory too, in case
+ * the other thread returns its memory to the region.
+ * d: look at a buffer in every hash bucket rather than choosing
+ * the preferable of two.
+ *
+ * !!!
+ * This test ignores pathological cases like no buffers in the
+ * system -- that shouldn't be possible.
+ */
+ if ((++buckets % max_na) == 0) {
+ aggressive = 1;
- /* Write the page if it's dirty. */
- if (F_ISSET(bhp, BH_DIRTY)) {
- ++bhp->ref;
- if ((ret = __memp_bhwrite(dbmp,
- bh_mfp, bhp, &restart, &wrote)) != 0)
- return (ret);
- --bhp->ref;
+ R_UNLOCK(dbenv, memreg);
- /*
- * Another process may have acquired this buffer and
- * incremented the ref count after we wrote it.
- */
- if (bhp->ref != 0)
- goto retry;
+ (void)__memp_sync_int(
+ dbenv, NULL, 0, DB_SYNC_ALLOC, NULL);
+
+ (void)__os_sleep(dbenv, 1, 0);
+
+ R_LOCK(dbenv, memreg);
+ goto alloc;
+ }
+
+ if (!aggressive) {
+ /* Skip high priority buckets. */
+ if (hp->hash_priority > high_priority)
+ continue;
/*
- * If we wrote the page, continue and free the buffer.
- * We don't have to rewalk the list to acquire the
- * buffer because it was never available for any other
- * process to modify it.
- *
- * If we didn't write the page, but we discarded and
- * reacquired the region lock, restart the list walk.
- *
- * If we neither wrote the buffer nor discarded the
- * region lock, continue down the buffer list.
+ * Find two buckets and select the one with the lower
+ * priority. Performance testing shows that looking
+ * at two improves the LRUness and that looking at more
+ * only does a little better.
*/
- if (wrote)
- ++c_mp->stat.st_rw_evict;
- else {
- if (restart)
- goto retry;
+ if (hp_tmp == NULL) {
+ hp_tmp = hp;
continue;
}
+ if (hp->hash_priority > hp_tmp->hash_priority)
+ hp = hp_tmp;
+ hp_tmp = NULL;
+ }
+
+ /* Remember the priority of the buffer we're looking for. */
+ priority = hp->hash_priority;
+
+ /* Unlock the region and lock the hash bucket. */
+ R_UNLOCK(dbenv, memreg);
+ mutexp = &hp->hash_mutex;
+ MUTEX_LOCK(dbenv, mutexp);
+
+#ifdef DIAGNOSTIC
+ __memp_check_order(hp);
+#endif
+ /*
+ * The lowest priority page is first in the bucket, as they are
+ * maintained in sorted order.
+ *
+ * The buffer may have been freed or its priority changed while
+ * we switched from the region lock to the hash lock. If so,
+ * we have to restart. We will still take the first buffer on
+ * the bucket's list, though, if it has a low enough priority.
+ */
+ if ((bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)) == NULL ||
+ bhp->ref != 0 || bhp->priority > priority)
+ goto next_hb;
+
+ buffers++;
+
+ /* Find the associated MPOOLFILE. */
+ bh_mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
+
+ /* If the page is dirty, pin it and write it. */
+ ret = 0;
+ if (F_ISSET(bhp, BH_DIRTY)) {
+ ++bhp->ref;
+ ret = __memp_bhwrite(dbmp, hp, bh_mfp, bhp, 0);
+ --bhp->ref;
+ if (ret == 0)
+ ++c_mp->stat.st_rw_evict;
} else
++c_mp->stat.st_ro_evict;
/*
+ * If a write fails for any reason, we can't proceed.
+ *
+ * We released the hash bucket lock while doing I/O, so another
+ * thread may have acquired this buffer and incremented the ref
+ * count after we wrote it, in which case we can't have it.
+ *
+ * If there's a write error, avoid selecting this buffer again
+ * by making it the bucket's least-desirable buffer.
+ */
+ if (ret != 0 || bhp->ref != 0) {
+ if (ret != 0 && aggressive)
+ __memp_bad_buffer(hp);
+ goto next_hb;
+ }
+
+ /*
* Check to see if the buffer is the size we're looking for.
- * If it is, simply reuse it.
+ * If so, we can simply reuse it. Otherwise, free the buffer
+ * and its space and keep looking.
*/
if (mfp != NULL &&
mfp->stat.st_pagesize == bh_mfp->stat.st_pagesize) {
- __memp_bhfree(dbmp, bhp, 0);
+ __memp_bhfree(dbmp, hp, bhp, 0);
- if (offsetp != NULL)
- *offsetp = R_OFFSET(memreg, bhp);
- *(void **)retp = bhp;
- return (0);
+ p = bhp;
+ goto found;
}
- /* Note how much space we've freed, and free the buffer. */
- total += __db_shsizeof(bhp);
- __memp_bhfree(dbmp, bhp, 1);
+ freed_space += __db_shsizeof(bhp);
+ __memp_bhfree(dbmp, hp, bhp, 1);
/*
- * Retry as soon as we've freed up sufficient space. If we
- * have to coalesce of memory to satisfy the request, don't
- * try until it's likely (possible?) that we'll succeed.
+ * Unlock this hash bucket and re-acquire the region lock. If
+ * we're reaching here as a result of calling memp_bhfree, the
+ * hash bucket lock has already been discarded.
*/
- if (total >= 3 * len)
+ if (0) {
+next_hb: MUTEX_UNLOCK(dbenv, mutexp);
+ }
+ R_LOCK(dbenv, memreg);
+
+ /*
+ * Retry the allocation as soon as we've freed up sufficient
+ * space. We're likely to have to coalesce chunks of memory to
+ * satisfy the request, so don't try until it's likely (possible?)
+ * that we'll succeed.
+ */
+ if (freed_space >= 3 * len)
goto alloc;
+ }
+ /* NOTREACHED */
+}
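
The loop above approximates LRU by comparing hash buckets two at a time and
evicting from the cheaper one. The following standalone sketch models just
that heuristic; it is not Berkeley DB code, and every name in it (bucket_t,
pick_victim, NBUCKETS) is invented for illustration, with a plain integer
standing in for the priority of the bucket's cheapest buffer.

#include <stdio.h>
#include <stdint.h>

#define NBUCKETS 8

typedef struct {
	uint32_t priority;	/* priority of the bucket's cheapest buffer */
} bucket_t;

/*
 * Scan buckets round-robin from *cursor, compare them in pairs, and
 * return the bucket whose cheapest buffer has the lower priority --
 * the better eviction candidate, as in the hp/hp_tmp loop above.
 */
static bucket_t *
pick_victim(bucket_t *ht, unsigned *cursor)
{
	bucket_t *hp, *hp_tmp;

	for (hp_tmp = NULL;;) {
		hp = &ht[(*cursor)++ % NBUCKETS];
		if (hp_tmp == NULL) {		/* remember the first bucket... */
			hp_tmp = hp;
			continue;
		}
		/* ...and keep whichever of the two is cheaper. */
		return (hp->priority > hp_tmp->priority ? hp_tmp : hp);
	}
}

int
main(void)
{
	bucket_t ht[NBUCKETS] =
	    { {40}, {10}, {70}, {20}, {90}, {30}, {60}, {50} };
	unsigned cursor = 0;
	int i;

	/* Prints 10, 20, 30, 50: the lower priority of each pair. */
	for (i = 0; i < 4; ++i)
		printf("victim priority: %u\n",
		    (unsigned)pick_victim(ht, &cursor)->priority);
	return (0);
}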
+
+/*
+ * __memp_bad_buffer --
+ * Make the first buffer in a hash bucket the least desirable buffer.
+ */
+static void
+__memp_bad_buffer(hp)
+ DB_MPOOL_HASH *hp;
+{
+ BH *bhp, *t_bhp;
+ u_int32_t priority;
+
+ /* Remove the first buffer from the bucket. */
+ bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
+ SH_TAILQ_REMOVE(&hp->hash_bucket, bhp, hq, __bh);
+
+ /*
+ * Find the highest priority buffer in the bucket. Buffers are
+ * sorted by priority, so it's the last one in the bucket.
+ *
+ * XXX
+ * Should use SH_TAILQ_LAST, but I think that macro is broken.
+ */
+ priority = bhp->priority;
+ for (t_bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
+ t_bhp != NULL; t_bhp = SH_TAILQ_NEXT(t_bhp, hq, __bh))
+ priority = t_bhp->priority;
+
+ /*
+ * Set our buffer's priority to be just as bad, and append it to
+ * the bucket.
+ */
+ bhp->priority = priority;
+ SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, bhp, hq);
- /* Restart the walk if we discarded the region lock. */
- if (restart)
- goto retry;
+ /* Reset the hash bucket's priority. */
+ hp->hash_priority = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)->priority;
+}
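
A minimal model of the demotion performed above, using an ordinary singly
linked list in place of the shared-memory SH_TAILQ macros; the names node
and demote_head are invented for this sketch, not taken from Berkeley DB.

#include <stdio.h>
#include <stdint.h>

typedef struct node {
	uint32_t priority;
	struct node *next;
} node;

/*
 * Move the head (lowest-priority) node to the tail, giving it the
 * tail's priority so it becomes the least-desirable entry -- the
 * same treatment __memp_bad_buffer gives a write-error buffer.
 */
static node *
demote_head(node *head)
{
	node *bad, *t;

	if (head == NULL || head->next == NULL)
		return (head);

	bad = head;
	head = head->next;
	for (t = head; t->next != NULL; t = t->next)
		;
	bad->priority = t->priority;	/* just as bad as the worst */
	t->next = bad;
	bad->next = NULL;
	return (head);
}

int
main(void)
{
	node c = { 12, NULL }, b = { 8, &c }, a = { 5, &b }, *hp;

	for (hp = demote_head(&a); hp != NULL; hp = hp->next)
		printf("%u ", (unsigned)hp->priority);	/* prints: 8 12 12 */
	printf("\n");
	return (0);
}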
+
+/*
+ * __memp_reset_lru --
+ * Reset the cache LRU counter.
+ */
+static void
+__memp_reset_lru(dbenv, memreg, c_mp)
+ DB_ENV *dbenv;
+ REGINFO *memreg;
+ MPOOL *c_mp;
+{
+ BH *bhp;
+ DB_MPOOL_HASH *hp;
+ int bucket;
+
+ /*
+ * Update the counter so all future allocations will start at the
+ * bottom.
+ */
+ c_mp->lru_count -= MPOOL_BASE_DECREMENT;
+
+ /* Release the region lock. */
+ R_UNLOCK(dbenv, memreg);
+
+ /* Adjust the priority of every buffer in the system. */
+ for (hp = R_ADDR(memreg, c_mp->htab),
+ bucket = 0; bucket < c_mp->htab_buckets; ++hp, ++bucket) {
+ /*
+ * Skip empty buckets.
+ *
+ * We can check for empty buckets before locking as we
+ * only care if the pointer is zero or non-zero.
+ */
+ if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL)
+ continue;
+
+ MUTEX_LOCK(dbenv, &hp->hash_mutex);
+ for (bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
+ bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh))
+ if (bhp->priority != UINT32_T_MAX &&
+ bhp->priority > MPOOL_BASE_DECREMENT)
+ bhp->priority -= MPOOL_BASE_DECREMENT;
+ MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
}
- nomore = 1;
- goto alloc;
+
+ /* Reacquire the region lock. */
+ R_LOCK(dbenv, memreg);
+}
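
To see why the reset preserves eviction order: the counter and every
sufficiently large priority are decremented by the same constant, so their
relative ordering is untouched. A small demonstration follows, where
BASE_DECREMENT is an arbitrary stand-in value, not MPOOL_BASE_DECREMENT's
actual definition from the Berkeley DB headers.

#include <stdio.h>
#include <stdint.h>

#define BASE_DECREMENT	0xC0000000u	/* stand-in value, not BDB's */

int
main(void)
{
	uint32_t lru_count = UINT32_MAX;	/* counter about to wrap */
	uint32_t prio[] = { 0xF0000000u, 0xF8000000u, 0xFFFFFF F0u };
	int i;

	lru_count -= BASE_DECREMENT;
	for (i = 0; i < 3; ++i)
		if (prio[i] != UINT32_MAX && prio[i] > BASE_DECREMENT)
			prio[i] -= BASE_DECREMENT;

	/* Relative order survives: 0x30000000 < 0x38000000 < 0x3ffffff0. */
	printf("count=%#x prios=%#x %#x %#x\n", (unsigned)lru_count,
	    (unsigned)prio[0], (unsigned)prio[1], (unsigned)prio[2]);
	return (0);
}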
+
+#ifdef DIAGNOSTIC
+/*
+ * __memp_check_order --
+ * Verify the priority ordering of a hash bucket chain.
+ *
+ * PUBLIC: #ifdef DIAGNOSTIC
+ * PUBLIC: void __memp_check_order __P((DB_MPOOL_HASH *));
+ * PUBLIC: #endif
+ */
+void
+__memp_check_order(hp)
+ DB_MPOOL_HASH *hp;
+{
+ BH *bhp;
+ u_int32_t priority;
+
+ /*
+ * Assumes the hash bucket is locked.
+ */
+ if ((bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)) == NULL)
+ return;
+
+ DB_ASSERT(bhp->priority == hp->hash_priority);
+
+ for (priority = bhp->priority;
+ (bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) != NULL;
+ priority = bhp->priority)
+ DB_ASSERT(priority <= bhp->priority);
}
+#endif
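
One final note on the high_priority computation near the top of
__memp_alloc: because each buffer carries the lru_count stamp from its last
access, "ignore anything newer than 1/10th of the buffer pool" reduces to
simple counter arithmetic. A sketch with invented values, not drawn from
any real cache:

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint32_t lru_count = 100000;	/* current generation stamp */
	uint32_t st_pages = 5000;	/* buffers in the pool */
	/* Stamps within the newest tenth of the pool are protected. */
	uint32_t high_priority = lru_count - st_pages / 10;
	uint32_t stamps[] = { 99900, 99400, 42 };
	int i;

	for (i = 0; i < 3; ++i)
		printf("stamp %5u: %s\n", (unsigned)stamps[i],
		    stamps[i] > high_priority ?
		    "skipped (too recently used)" : "eviction candidate");
	return (0);
}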