Diffstat (limited to 'bdb/mp')
-rw-r--r--  bdb/mp/Design        |   52
-rw-r--r--  bdb/mp/mp_alloc.c    |  430
-rw-r--r--  bdb/mp/mp_bh.c       |  568
-rw-r--r--  bdb/mp/mp_fget.c     |  763
-rw-r--r--  bdb/mp/mp_fopen.c    | 1167
-rw-r--r--  bdb/mp/mp_fput.c     |  196
-rw-r--r--  bdb/mp/mp_fset.c     |   63
-rw-r--r--  bdb/mp/mp_method.c   |  109
-rw-r--r--  bdb/mp/mp_region.c   |  211
-rw-r--r--  bdb/mp/mp_register.c |   33
-rw-r--r--  bdb/mp/mp_stat.c     |  325
-rw-r--r--  bdb/mp/mp_sync.c     |  909
-rw-r--r--  bdb/mp/mp_trickle.c  |  136
13 files changed, 2917 insertions, 2045 deletions
diff --git a/bdb/mp/Design b/bdb/mp/Design
deleted file mode 100644
index 1b26aae6cba..00000000000
--- a/bdb/mp/Design
+++ /dev/null
@@ -1,52 +0,0 @@
-$Id: Design,v 11.2 1999/11/21 23:08:27 bostic Exp $
-
-There are three ways we do locking in the mpool code:
-
-Locking a handle mutex to provide concurrency for DB_THREAD operations.
-Locking the region mutex to provide mutual exclusion while reading and
- writing structures in the shared region.
-Locking buffer header mutexes during I/O.
-
-The first will not be further described here. We use the shared mpool
-region lock to provide mutual exclusion while reading/modifying all of
-the data structures, including the buffer headers. We use a per-buffer
-header lock to wait on buffer I/O. The order of locking is as follows:
-
-Searching for a buffer:
- Acquire the region lock.
- Find the buffer header.
- Increment the reference count (guarantee the buffer stays).
- While the BH_LOCKED flag is set (I/O is going on) {
- Release the region lock.
- Explicitly yield the processor if it's not the first pass
- through this loop, otherwise, we can simply spin because
- we'll be simply switching between the two locks.
- Request the buffer lock.
- The I/O will complete...
- Acquire the buffer lock.
- Release the buffer lock.
- Acquire the region lock.
- }
- Return the buffer.
-
-Reading/writing a buffer:
- Acquire the region lock.
- Find/create the buffer header.
- If reading, increment the reference count (guarantee the buffer stays).
- Set the BH_LOCKED flag.
- Acquire the buffer lock (guaranteed not to block).
- Release the region lock.
- Do the I/O and/or initialize the buffer contents.
- Release the buffer lock.
- At this point, the buffer lock is available, but the logical
- operation (flagged by BH_LOCKED) is not yet completed. For
- this reason, among others, threads checking the BH_LOCKED flag
- must loop around their test.
- Acquire the region lock.
- Clear the BH_LOCKED flag.
- Release the region lock.
- Return/discard the buffer.
-
-Pointers to DB_MPOOL, MPOOL, DB_MPOOLFILE and MPOOLFILE structures are
-not reacquired when a region lock is reacquired because they couldn't
-have been closed/discarded and because they never move in memory.
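
The search protocol the deleted Design notes describe maps to a short loop.
The following is a minimal sketch, not code from the tree: the helpers
region_lock/region_unlock, buffer_lock/buffer_unlock, find_bh, and
yield_processor are hypothetical stand-ins for the region mutex, the
per-buffer mutex, the hash lookup, and __os_yield.

    BH *
    search_buffer(MPOOL *mp, db_pgno_t pgno)
    {
        BH *bhp;
        int first;

        region_lock(mp);
        bhp = find_bh(mp, pgno);    /* Find the buffer header. */
        ++bhp->ref;                 /* Guarantee the buffer stays. */

        /* While BH_LOCKED is set, I/O is going on. */
        for (first = 1; F_ISSET(bhp, BH_LOCKED); first = 0) {
            region_unlock(mp);
            /*
             * Yield after the first pass -- otherwise we would just
             * bounce between the two locks.
             */
            if (!first)
                yield_processor();
            buffer_lock(bhp);       /* Blocks until the I/O completes. */
            buffer_unlock(bhp);
            region_lock(mp);
        }
        /* Return the buffer, still holding the region lock. */
        return (bhp);
    }
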
diff --git a/bdb/mp/mp_alloc.c b/bdb/mp/mp_alloc.c
index 731f569f57f..96dd612d7ba 100644
--- a/bdb/mp/mp_alloc.c
+++ b/bdb/mp/mp_alloc.c
@@ -1,22 +1,31 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Copyright (c) 1996-2002
* Sleepycat Software. All rights reserved.
*/
#include "db_config.h"
#ifndef lint
-static const char revid[] = "$Id: mp_alloc.c,v 11.7 2000/04/20 21:14:18 bostic Exp $";
+static const char revid[] = "$Id: mp_alloc.c,v 11.31 2002/08/14 17:21:37 ubell Exp $";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
#include <sys/types.h>
+#include <string.h>
#endif
#include "db_int.h"
-#include "db_shash.h"
-#include "mp.h"
+#include "dbinc/db_shash.h"
+#include "dbinc/mp.h"
+
+typedef struct {
+ DB_MPOOL_HASH *bucket;
+ u_int32_t priority;
+} HS;
+
+static void __memp_bad_buffer __P((DB_MPOOL_HASH *));
+static void __memp_reset_lru __P((DB_ENV *, REGINFO *, MPOOL *));
/*
* __memp_alloc --
@@ -34,14 +43,32 @@ __memp_alloc(dbmp, memreg, mfp, len, offsetp, retp)
roff_t *offsetp;
void *retp;
{
- BH *bhp, *nbhp;
+ BH *bhp;
+ DB_ENV *dbenv;
+ DB_MPOOL_HASH *dbht, *hp, *hp_end, *hp_tmp;
+ DB_MUTEX *mutexp;
MPOOL *c_mp;
MPOOLFILE *bh_mfp;
- size_t total;
- int nomore, restart, ret, wrote;
+ size_t freed_space;
+ u_int32_t buckets, buffers, high_priority, max_na, priority;
+ int aggressive, ret;
void *p;
+ dbenv = dbmp->dbenv;
c_mp = memreg->primary;
+ dbht = R_ADDR(memreg, c_mp->htab);
+ hp_end = &dbht[c_mp->htab_buckets];
+
+ buckets = buffers = 0;
+ aggressive = 0;
+
+ c_mp->stat.st_alloc++;
+
+ /*
+	 * Get aggressive if we've tried to flush as many pages as there
+	 * are in the system without finding space.
+ */
+ max_na = 5 * c_mp->htab_buckets;
/*
* If we're allocating a buffer, and the one we're discarding is the
@@ -53,100 +80,363 @@ __memp_alloc(dbmp, memreg, mfp, len, offsetp, retp)
if (mfp != NULL)
len = (sizeof(BH) - sizeof(u_int8_t)) + mfp->stat.st_pagesize;
- nomore = 0;
+ R_LOCK(dbenv, memreg);
+
+ /*
+ * On every buffer allocation we update the buffer generation number
+ * and check for wraparound.
+ */
+ if (++c_mp->lru_count == UINT32_T_MAX)
+ __memp_reset_lru(dbenv, memreg, c_mp);
+
+ /*
+ * Anything newer than 1/10th of the buffer pool is ignored during
+ * allocation (unless allocation starts failing).
+ */
+ DB_ASSERT(c_mp->lru_count > c_mp->stat.st_pages / 10);
+ high_priority = c_mp->lru_count - c_mp->stat.st_pages / 10;
+
+ /*
+ * First we try to allocate from free memory. If that fails, scan the
+ * buffer pool to find buffers with low priorities. We consider small
+ * sets of hash buckets each time to limit the amount of work needing
+ * to be done. This approximates LRU, but not very well. We either
+ * find a buffer of the same size to use, or we will free 3 times what
+ * we need in the hopes it will coalesce into a contiguous chunk of the
+ * right size. In the latter case we branch back here and try again.
+ */
alloc: if ((ret = __db_shalloc(memreg->addr, len, MUTEX_ALIGN, &p)) == 0) {
- if (offsetp != NULL)
+ if (mfp != NULL)
+ c_mp->stat.st_pages++;
+ R_UNLOCK(dbenv, memreg);
+
+found: if (offsetp != NULL)
*offsetp = R_OFFSET(memreg, p);
*(void **)retp = p;
+
+ /*
+ * Update the search statistics.
+ *
+ * We're not holding the region locked here, these statistics
+ * can't be trusted.
+ */
+ if (buckets != 0) {
+ if (buckets > c_mp->stat.st_alloc_max_buckets)
+ c_mp->stat.st_alloc_max_buckets = buckets;
+ c_mp->stat.st_alloc_buckets += buckets;
+ }
+ if (buffers != 0) {
+ if (buffers > c_mp->stat.st_alloc_max_pages)
+ c_mp->stat.st_alloc_max_pages = buffers;
+ c_mp->stat.st_alloc_pages += buffers;
+ }
return (0);
}
- if (nomore) {
- __db_err(dbmp->dbenv,
- "Unable to allocate %lu bytes from mpool shared region: %s\n",
- (u_long)len, db_strerror(ret));
- return (ret);
- }
-retry: /* Find a buffer we can flush; pure LRU. */
- restart = total = 0;
- for (bhp =
- SH_TAILQ_FIRST(&c_mp->bhq, __bh); bhp != NULL; bhp = nbhp) {
- nbhp = SH_TAILQ_NEXT(bhp, q, __bh);
+ /*
+ * We re-attempt the allocation every time we've freed 3 times what
+ * we need. Reset our free-space counter.
+ */
+ freed_space = 0;
- /* Ignore pinned or locked (I/O in progress) buffers. */
- if (bhp->ref != 0 || F_ISSET(bhp, BH_LOCKED))
+ /*
+ * Walk the hash buckets and find the next two with potentially useful
+ * buffers. Free the buffer with the lowest priority from the buckets'
+ * chains.
+ */
+ for (hp_tmp = NULL;;) {
+ /* Check for wrap around. */
+ hp = &dbht[c_mp->last_checked++];
+ if (hp >= hp_end) {
+ c_mp->last_checked = 0;
+
+ /*
+ * If we've gone through all of the hash buckets, try
+ * an allocation. If the cache is small, the old page
+ * size is small, and the new page size is large, we
+ * might have freed enough memory (but not 3 times the
+ * memory).
+ */
+ goto alloc;
+ }
+
+ /*
+ * Skip empty buckets.
+ *
+ * We can check for empty buckets before locking as we
+ * only care if the pointer is zero or non-zero.
+ */
+ if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL)
continue;
- /* Find the associated MPOOLFILE. */
- bh_mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
+ /*
+ * The failure mode is when there are too many buffers we can't
+ * write or there's not enough memory in the system. We don't
+ * have a metric for deciding if allocation has no possible way
+ * to succeed, so we don't ever fail, we assume memory will be
+ * available if we wait long enough.
+ *
+		 * Get aggressive if we've tried to flush 5 times as many
+		 * hash buckets as there are in the system -- it's possible we
+ * been repeatedly trying to flush the same buffers, although
+ * it's unlikely. Aggressive means:
+ *
+ * a: set a flag to attempt to flush high priority buffers as
+ * well as other buffers.
+ * b: sync the mpool to force out queue extent pages. While we
+ * might not have enough space for what we want and flushing
+ * is expensive, why not?
+ * c: sleep for a second -- hopefully someone else will run and
+ * free up some memory. Try to allocate memory too, in case
+ * the other thread returns its memory to the region.
+ * d: look at a buffer in every hash bucket rather than choose
+ * the more preferable of two.
+ *
+ * !!!
+ * This test ignores pathological cases like no buffers in the
+ * system -- that shouldn't be possible.
+ */
+ if ((++buckets % max_na) == 0) {
+ aggressive = 1;
- /* Write the page if it's dirty. */
- if (F_ISSET(bhp, BH_DIRTY)) {
- ++bhp->ref;
- if ((ret = __memp_bhwrite(dbmp,
- bh_mfp, bhp, &restart, &wrote)) != 0)
- return (ret);
- --bhp->ref;
+ R_UNLOCK(dbenv, memreg);
- /*
- * Another process may have acquired this buffer and
- * incremented the ref count after we wrote it.
- */
- if (bhp->ref != 0)
- goto retry;
+ (void)__memp_sync_int(
+ dbenv, NULL, 0, DB_SYNC_ALLOC, NULL);
+
+ (void)__os_sleep(dbenv, 1, 0);
+
+ R_LOCK(dbenv, memreg);
+ goto alloc;
+ }
+
+ if (!aggressive) {
+ /* Skip high priority buckets. */
+ if (hp->hash_priority > high_priority)
+ continue;
/*
- * If we wrote the page, continue and free the buffer.
- * We don't have to rewalk the list to acquire the
- * buffer because it was never available for any other
- * process to modify it.
- *
- * If we didn't write the page, but we discarded and
- * reacquired the region lock, restart the list walk.
- *
- * If we neither wrote the buffer nor discarded the
- * region lock, continue down the buffer list.
+ * Find two buckets and select the one with the lowest
+ * priority. Performance testing shows that looking
+ * at two improves the LRUness and looking at more only
+ * does a little better.
*/
- if (wrote)
- ++c_mp->stat.st_rw_evict;
- else {
- if (restart)
- goto retry;
+ if (hp_tmp == NULL) {
+ hp_tmp = hp;
continue;
}
+ if (hp->hash_priority > hp_tmp->hash_priority)
+ hp = hp_tmp;
+ hp_tmp = NULL;
+ }
+
+ /* Remember the priority of the buffer we're looking for. */
+ priority = hp->hash_priority;
+
+ /* Unlock the region and lock the hash bucket. */
+ R_UNLOCK(dbenv, memreg);
+ mutexp = &hp->hash_mutex;
+ MUTEX_LOCK(dbenv, mutexp);
+
+#ifdef DIAGNOSTIC
+ __memp_check_order(hp);
+#endif
+ /*
+ * The lowest priority page is first in the bucket, as they are
+ * maintained in sorted order.
+ *
+ * The buffer may have been freed or its priority changed while
+ * we switched from the region lock to the hash lock. If so,
+ * we have to restart. We will still take the first buffer on
+ * the bucket's list, though, if it has a low enough priority.
+ */
+ if ((bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)) == NULL ||
+ bhp->ref != 0 || bhp->priority > priority)
+ goto next_hb;
+
+ buffers++;
+
+ /* Find the associated MPOOLFILE. */
+ bh_mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
+
+ /* If the page is dirty, pin it and write it. */
+ ret = 0;
+ if (F_ISSET(bhp, BH_DIRTY)) {
+ ++bhp->ref;
+ ret = __memp_bhwrite(dbmp, hp, bh_mfp, bhp, 0);
+ --bhp->ref;
+ if (ret == 0)
+ ++c_mp->stat.st_rw_evict;
} else
++c_mp->stat.st_ro_evict;
/*
+ * If a write fails for any reason, we can't proceed.
+ *
+ * We released the hash bucket lock while doing I/O, so another
+ * thread may have acquired this buffer and incremented the ref
+ * count after we wrote it, in which case we can't have it.
+ *
+ * If there's a write error, avoid selecting this buffer again
+ * by making it the bucket's least-desirable buffer.
+ */
+ if (ret != 0 || bhp->ref != 0) {
+ if (ret != 0 && aggressive)
+ __memp_bad_buffer(hp);
+ goto next_hb;
+ }
+
+ /*
* Check to see if the buffer is the size we're looking for.
- * If it is, simply reuse it.
+ * If so, we can simply reuse it. Else, free the buffer and
+ * its space and keep looking.
*/
if (mfp != NULL &&
mfp->stat.st_pagesize == bh_mfp->stat.st_pagesize) {
- __memp_bhfree(dbmp, bhp, 0);
+ __memp_bhfree(dbmp, hp, bhp, 0);
- if (offsetp != NULL)
- *offsetp = R_OFFSET(memreg, bhp);
- *(void **)retp = bhp;
- return (0);
+ p = bhp;
+ goto found;
}
- /* Note how much space we've freed, and free the buffer. */
- total += __db_shsizeof(bhp);
- __memp_bhfree(dbmp, bhp, 1);
+ freed_space += __db_shsizeof(bhp);
+ __memp_bhfree(dbmp, hp, bhp, 1);
/*
- * Retry as soon as we've freed up sufficient space. If we
- * have to coalesce of memory to satisfy the request, don't
- * try until it's likely (possible?) that we'll succeed.
+ * Unlock this hash bucket and re-acquire the region lock. If
+ * we're reaching here as a result of calling memp_bhfree, the
+ * hash bucket lock has already been discarded.
*/
- if (total >= 3 * len)
+ if (0) {
+next_hb: MUTEX_UNLOCK(dbenv, mutexp);
+ }
+ R_LOCK(dbenv, memreg);
+
+ /*
+ * Retry the allocation as soon as we've freed up sufficient
+		 * space.  We're likely to have to coalesce memory to
+		 * satisfy the request, so don't try until it's likely (possible?)
+ * we'll succeed.
+ */
+ if (freed_space >= 3 * len)
goto alloc;
+ }
+ /* NOTREACHED */
+}
+
+/*
+ * __memp_bad_buffer --
+ * Make the first buffer in a hash bucket the least desirable buffer.
+ */
+static void
+__memp_bad_buffer(hp)
+ DB_MPOOL_HASH *hp;
+{
+ BH *bhp, *t_bhp;
+ u_int32_t priority;
+
+ /* Remove the first buffer from the bucket. */
+ bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
+ SH_TAILQ_REMOVE(&hp->hash_bucket, bhp, hq, __bh);
+
+ /*
+ * Find the highest priority buffer in the bucket. Buffers are
+ * sorted by priority, so it's the last one in the bucket.
+ *
+ * XXX
+ * Should use SH_TAILQ_LAST, but I think that macro is broken.
+ */
+ priority = bhp->priority;
+ for (t_bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
+ t_bhp != NULL; t_bhp = SH_TAILQ_NEXT(t_bhp, hq, __bh))
+ priority = t_bhp->priority;
+
+ /*
+ * Set our buffer's priority to be just as bad, and append it to
+ * the bucket.
+ */
+ bhp->priority = priority;
+ SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, bhp, hq);
- /* Restart the walk if we discarded the region lock. */
- if (restart)
- goto retry;
+ /* Reset the hash bucket's priority. */
+ hp->hash_priority = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)->priority;
+}
+
+/*
+ * __memp_reset_lru --
+ * Reset the cache LRU counter.
+ */
+static void
+__memp_reset_lru(dbenv, memreg, c_mp)
+ DB_ENV *dbenv;
+ REGINFO *memreg;
+ MPOOL *c_mp;
+{
+ BH *bhp;
+ DB_MPOOL_HASH *hp;
+ int bucket;
+
+ /*
+ * Update the counter so all future allocations will start at the
+ * bottom.
+ */
+ c_mp->lru_count -= MPOOL_BASE_DECREMENT;
+
+ /* Release the region lock. */
+ R_UNLOCK(dbenv, memreg);
+
+ /* Adjust the priority of every buffer in the system. */
+ for (hp = R_ADDR(memreg, c_mp->htab),
+ bucket = 0; bucket < c_mp->htab_buckets; ++hp, ++bucket) {
+ /*
+ * Skip empty buckets.
+ *
+ * We can check for empty buckets before locking as we
+ * only care if the pointer is zero or non-zero.
+ */
+ if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL)
+ continue;
+
+ MUTEX_LOCK(dbenv, &hp->hash_mutex);
+ for (bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
+ bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh))
+ if (bhp->priority != UINT32_T_MAX &&
+ bhp->priority > MPOOL_BASE_DECREMENT)
+ bhp->priority -= MPOOL_BASE_DECREMENT;
+ MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
}
- nomore = 1;
- goto alloc;
+
+ /* Reacquire the region lock. */
+ R_LOCK(dbenv, memreg);
+}
+
+#ifdef DIAGNOSTIC
+/*
+ * __memp_check_order --
+ * Verify the priority ordering of a hash bucket chain.
+ *
+ * PUBLIC: #ifdef DIAGNOSTIC
+ * PUBLIC: void __memp_check_order __P((DB_MPOOL_HASH *));
+ * PUBLIC: #endif
+ */
+void
+__memp_check_order(hp)
+ DB_MPOOL_HASH *hp;
+{
+ BH *bhp;
+ u_int32_t priority;
+
+ /*
+ * Assumes the hash bucket is locked.
+ */
+ if ((bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)) == NULL)
+ return;
+
+ DB_ASSERT(bhp->priority == hp->hash_priority);
+
+ for (priority = bhp->priority;
+ (bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) != NULL;
+ priority = bhp->priority)
+ DB_ASSERT(priority <= bhp->priority);
}
+#endif
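
The core of the new allocation strategy in mp_alloc.c is the hash-bucket
scan above: skip buckets whose cheapest buffer is too young, then compare
pairs of buckets and evict from the one with the lower priority.  The
condensed sketch below shows just that selection; it is not the shipped
function -- the aggressive mode, all locking, and the give-up-and-allocate
path at table wrap are omitted, and the wrap of the scan cursor is
simplified to a modulus.

    DB_MPOOL_HASH *
    pick_victim_bucket(DB_MPOOL_HASH *dbht, u_int32_t nbuckets,
        u_int32_t *cursor, u_int32_t high_priority)
    {
        DB_MPOOL_HASH *hp, *hp_tmp;

        for (hp_tmp = NULL;;) {
            /* Walk the table, wrapping at the end. */
            hp = &dbht[(*cursor)++ % nbuckets];

            /* Skip empty buckets and recently-used (high priority) ones. */
            if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL ||
                hp->hash_priority > high_priority)
                continue;

            /* Remember the first candidate, then compare it to a second. */
            if (hp_tmp == NULL) {
                hp_tmp = hp;
                continue;
            }

            /*
             * Taking the lower-priority of two buckets approximates LRU;
             * per the comments above, looking at more buys little.
             */
            return (hp->hash_priority < hp_tmp->hash_priority ?
                hp : hp_tmp);
        }
    }
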
diff --git a/bdb/mp/mp_bh.c b/bdb/mp/mp_bh.c
index e802b165b2d..85d15218abf 100644
--- a/bdb/mp/mp_bh.c
+++ b/bdb/mp/mp_bh.c
@@ -1,13 +1,13 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Copyright (c) 1996-2002
* Sleepycat Software. All rights reserved.
*/
#include "db_config.h"
#ifndef lint
-static const char revid[] = "$Id: mp_bh.c,v 11.25 2001/01/10 04:50:53 ubell Exp $";
+static const char revid[] = "$Id: mp_bh.c,v 11.71 2002/09/04 19:06:45 margo Exp $";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
@@ -18,40 +18,41 @@ static const char revid[] = "$Id: mp_bh.c,v 11.25 2001/01/10 04:50:53 ubell Exp
#endif
#include "db_int.h"
-#include "db_shash.h"
-#include "mp.h"
-#include "log.h"
-#include "db_page.h"
+#include "dbinc/db_shash.h"
+#include "dbinc/mp.h"
+#include "dbinc/log.h"
+#include "dbinc/db_page.h"
+static int __memp_pgwrite
+ __P((DB_MPOOL *, DB_MPOOLFILE *, DB_MPOOL_HASH *, BH *));
static int __memp_upgrade __P((DB_MPOOL *, DB_MPOOLFILE *, MPOOLFILE *));
/*
* __memp_bhwrite --
- * Write the page associated with a given bucket header.
+ * Write the page associated with a given buffer header.
*
- * PUBLIC: int __memp_bhwrite
- * PUBLIC: __P((DB_MPOOL *, MPOOLFILE *, BH *, int *, int *));
+ * PUBLIC: int __memp_bhwrite __P((DB_MPOOL *,
+ * PUBLIC: DB_MPOOL_HASH *, MPOOLFILE *, BH *, int));
*/
int
-__memp_bhwrite(dbmp, mfp, bhp, restartp, wrotep)
+__memp_bhwrite(dbmp, hp, mfp, bhp, open_extents)
DB_MPOOL *dbmp;
+ DB_MPOOL_HASH *hp;
MPOOLFILE *mfp;
BH *bhp;
- int *restartp, *wrotep;
+ int open_extents;
{
+ DB_ENV *dbenv;
DB_MPOOLFILE *dbmfp;
DB_MPREG *mpreg;
- int incremented, ret;
+ int local_open, incremented, ret;
- if (restartp != NULL)
- *restartp = 0;
- if (wrotep != NULL)
- *wrotep = 0;
- incremented = 0;
+ dbenv = dbmp->dbenv;
+ local_open = incremented = 0;
/*
- * If the file has been removed or is a closed temporary file, Jump
- * right ahead and pretend that we've found the file we want-- the
+ * If the file has been removed or is a closed temporary file, jump
+ * right ahead and pretend that we've found the file we want -- the
* page-write function knows how to handle the fact that we don't have
* (or need!) any real file descriptor information.
*/
@@ -66,52 +67,60 @@ __memp_bhwrite(dbmp, mfp, bhp, restartp, wrotep)
* If we find a descriptor on the file that's not open for writing, we
* try and upgrade it to make it writeable. If that fails, we're done.
*/
- MUTEX_THREAD_LOCK(dbmp->dbenv, dbmp->mutexp);
+ MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp);
for (dbmfp = TAILQ_FIRST(&dbmp->dbmfq);
dbmfp != NULL; dbmfp = TAILQ_NEXT(dbmfp, q))
if (dbmfp->mfp == mfp) {
if (F_ISSET(dbmfp, MP_READONLY) &&
- __memp_upgrade(dbmp, dbmfp, mfp)) {
- MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp);
- return (0);
+ !F_ISSET(dbmfp, MP_UPGRADE) &&
+ (F_ISSET(dbmfp, MP_UPGRADE_FAIL) ||
+ __memp_upgrade(dbmp, dbmfp, mfp))) {
+ MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
+ return (EPERM);
}
/*
* Increment the reference count -- see the comment in
- * memp_fclose().
+ * __memp_fclose_int().
*/
++dbmfp->ref;
incremented = 1;
break;
}
- MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp);
+ MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
+
if (dbmfp != NULL)
goto found;
/*
* !!!
+ * It's the caller's choice if we're going to open extent files.
+ */
+ if (!open_extents && F_ISSET(mfp, MP_EXTENT))
+ return (EPERM);
+
+ /*
+ * !!!
* Don't try to attach to temporary files. There are two problems in
* trying to do that. First, if we have different privileges than the
* process that "owns" the temporary file, we might create the backing
* disk file such that the owning process couldn't read/write its own
- * buffers, e.g., memp_trickle() running as root creating a file owned
+ * buffers, e.g., memp_trickle running as root creating a file owned
* as root, mode 600. Second, if the temporary file has already been
* created, we don't have any way of finding out what its real name is,
* and, even if we did, it was already unlinked (so that it won't be
* left if the process dies horribly). This decision causes a problem,
* however: if the temporary file consumes the entire buffer cache,
* and the owner doesn't flush the buffers to disk, we could end up
- * with resource starvation, and the memp_trickle() thread couldn't do
+ * with resource starvation, and the memp_trickle thread couldn't do
* anything about it. That's a pretty unlikely scenario, though.
*
- * Note that we should never get here when the temporary file
- * in question has already been closed in another process, in which
- * case it should be marked MP_DEADFILE.
+ * Note we should never get here when the temporary file in question
+ * has already been closed in another process, in which case it should
+ * be marked MP_DEADFILE.
*/
- if (F_ISSET(mfp, MP_TEMP)) {
- DB_ASSERT(!F_ISSET(mfp, MP_DEADFILE));
- return (0);
- }
+ if (F_ISSET(mfp, MP_TEMP))
+ return (EPERM);
/*
* It's not a page from a file we've opened. If the file requires
@@ -120,14 +129,14 @@ __memp_bhwrite(dbmp, mfp, bhp, restartp, wrotep)
* nothing we can do.
*/
if (mfp->ftype != 0) {
- MUTEX_THREAD_LOCK(dbmp->dbenv, dbmp->mutexp);
+ MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp);
for (mpreg = LIST_FIRST(&dbmp->dbregq);
mpreg != NULL; mpreg = LIST_NEXT(mpreg, q))
if (mpreg->ftype == mfp->ftype)
break;
- MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp);
+ MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
if (mpreg == NULL)
- return (0);
+ return (EPERM);
}
/*
@@ -138,17 +147,24 @@ __memp_bhwrite(dbmp, mfp, bhp, restartp, wrotep)
* There's no negative cache, so we may repeatedly try and open files
* that we have previously tried (and failed) to open.
*/
- if (__memp_fopen(dbmp, mfp, R_ADDR(dbmp->reginfo, mfp->path_off),
- 0, 0, mfp->stat.st_pagesize, 0, NULL, &dbmfp) != 0)
- return (0);
+ if ((ret = dbenv->memp_fcreate(dbenv, &dbmfp, 0)) != 0)
+ return (ret);
+ if ((ret = __memp_fopen_int(dbmfp, mfp,
+ R_ADDR(dbmp->reginfo, mfp->path_off),
+ 0, 0, mfp->stat.st_pagesize)) != 0) {
+ (void)dbmfp->close(dbmfp, 0);
+ return (ret);
+ }
+ local_open = 1;
-found: ret = __memp_pgwrite(dbmp, dbmfp, bhp, restartp, wrotep);
+found: ret = __memp_pgwrite(dbmp, dbmfp, hp, bhp);
- if (incremented) {
- MUTEX_THREAD_LOCK(dbmp->dbenv, dbmp->mutexp);
+ MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp);
+ if (incremented)
--dbmfp->ref;
- MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp);
- }
+ else if (local_open)
+ F_SET(dbmfp, MP_FLUSH);
+ MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
return (ret);
}
@@ -157,11 +173,12 @@ found: ret = __memp_pgwrite(dbmp, dbmfp, bhp, restartp, wrotep);
* __memp_pgread --
* Read a page from a file.
*
- * PUBLIC: int __memp_pgread __P((DB_MPOOLFILE *, BH *, int));
+ * PUBLIC: int __memp_pgread __P((DB_MPOOLFILE *, DB_MUTEX *, BH *, int));
*/
int
-__memp_pgread(dbmfp, bhp, can_create)
+__memp_pgread(dbmfp, mutexp, bhp, can_create)
DB_MPOOLFILE *dbmfp;
+ DB_MUTEX *mutexp;
BH *bhp;
int can_create;
{
@@ -169,171 +186,129 @@ __memp_pgread(dbmfp, bhp, can_create)
DB_ENV *dbenv;
DB_MPOOL *dbmp;
MPOOLFILE *mfp;
- size_t len, pagesize;
- size_t nr;
- int created, ret;
+ size_t len, nr, pagesize;
+ int ret;
dbmp = dbmfp->dbmp;
dbenv = dbmp->dbenv;
mfp = dbmfp->mfp;
pagesize = mfp->stat.st_pagesize;
+ /* We should never be called with a dirty or a locked buffer. */
+ DB_ASSERT(!F_ISSET(bhp, BH_DIRTY | BH_DIRTY_CREATE | BH_LOCKED));
+
+ /* Lock the buffer and swap the hash bucket lock for the buffer lock. */
F_SET(bhp, BH_LOCKED | BH_TRASH);
- MUTEX_LOCK(dbenv, &bhp->mutex, dbenv->lockfhp);
- R_UNLOCK(dbenv, dbmp->reginfo);
+ MUTEX_LOCK(dbenv, &bhp->mutex);
+ MUTEX_UNLOCK(dbenv, mutexp);
/*
* Temporary files may not yet have been created. We don't create
* them now, we create them when the pages have to be flushed.
*/
nr = 0;
- if (F_ISSET(&dbmfp->fh, DB_FH_VALID)) {
- /*
- * Ignore read errors if we have permission to create the page.
- * Assume that the page doesn't exist, and that we'll create it
- * when we write it out.
- *
- * XXX
- * Theoretically, we could overwrite a page of data if it were
- * possible for a file to be successfully opened for reading
- * and then for the read to fail. Shouldn't ever happen, but
- * it might be worth checking to see if the offset is past the
- * known end-of-file.
- */
- db_io.fhp = &dbmfp->fh;
+ if (F_ISSET(dbmfp->fhp, DB_FH_VALID)) {
+ db_io.fhp = dbmfp->fhp;
db_io.mutexp = dbmfp->mutexp;
db_io.pagesize = db_io.bytes = pagesize;
db_io.pgno = bhp->pgno;
db_io.buf = bhp->buf;
- ret = __os_io(dbenv, &db_io, DB_IO_READ, &nr);
- } else
- ret = 0;
+ /*
+ * The page may not exist; if it doesn't, nr may well be 0,
+ * but we expect the underlying OS calls not to return an
+ * error code in this case.
+ */
+ if ((ret = __os_io(dbenv, &db_io, DB_IO_READ, &nr)) != 0)
+ goto err;
+ }
- created = 0;
if (nr < pagesize) {
- if (can_create)
- created = 1;
- else {
- /*
- * If we had a short read, ret may be 0. This may not
- * be an error -- in particular DB recovery processing
- * may request pages that have never been written to
- * disk, in which case we won't find the page. So, the
- * caller must know how to handle the error.
- */
- if (ret == 0)
- ret = EIO;
+ /*
+ * Don't output error messages for short reads. In particular,
+ * DB recovery processing may request pages never written to
+ * disk or for which only some part have been written to disk,
+ * in which case we won't find the page. The caller must know
+ * how to handle the error.
+ */
+ if (can_create == 0) {
+ ret = DB_PAGE_NOTFOUND;
goto err;
}
- }
- /*
- * Clear any bytes we didn't read that need to be cleared. If we're
- * running in diagnostic mode, smash any bytes on the page that are
- * unknown quantities for the caller.
- */
- if (nr != pagesize) {
+ /* Clear any bytes that need to be cleared. */
len = mfp->clear_len == 0 ? pagesize : mfp->clear_len;
- if (nr < len)
- memset(bhp->buf + nr, 0, len - nr);
-#ifdef DIAGNOSTIC
- if (nr > len)
- len = nr;
+ memset(bhp->buf, 0, len);
+
+#if defined(DIAGNOSTIC) || defined(UMRW)
+ /*
+ * If we're running in diagnostic mode, corrupt any bytes on
+ * the page that are unknown quantities for the caller.
+ */
if (len < pagesize)
memset(bhp->buf + len, CLEAR_BYTE, pagesize - len);
#endif
- }
+ ++mfp->stat.st_page_create;
+ } else
+ ++mfp->stat.st_page_in;
/* Call any pgin function. */
ret = mfp->ftype == 0 ? 0 : __memp_pg(dbmfp, bhp, 1);
- /* Unlock the buffer and reacquire the region lock. */
+ /* Unlock the buffer and reacquire the hash bucket lock. */
err: MUTEX_UNLOCK(dbenv, &bhp->mutex);
- R_LOCK(dbenv, dbmp->reginfo);
+ MUTEX_LOCK(dbenv, mutexp);
/*
* If no errors occurred, the data is now valid, clear the BH_TRASH
* flag; regardless, clear the lock bit and let other threads proceed.
*/
F_CLR(bhp, BH_LOCKED);
- if (ret == 0) {
+ if (ret == 0)
F_CLR(bhp, BH_TRASH);
- /* Update the statistics. */
- if (created)
- ++mfp->stat.st_page_create;
- else
- ++mfp->stat.st_page_in;
- }
-
return (ret);
}
/*
* __memp_pgwrite --
* Write a page to a file.
- *
- * PUBLIC: int __memp_pgwrite
- * PUBLIC: __P((DB_MPOOL *, DB_MPOOLFILE *, BH *, int *, int *));
*/
-int
-__memp_pgwrite(dbmp, dbmfp, bhp, restartp, wrotep)
+static int
+__memp_pgwrite(dbmp, dbmfp, hp, bhp)
DB_MPOOL *dbmp;
DB_MPOOLFILE *dbmfp;
+ DB_MPOOL_HASH *hp;
BH *bhp;
- int *restartp, *wrotep;
{
DB_ENV *dbenv;
DB_IO db_io;
DB_LSN lsn;
- MPOOL *c_mp, *mp;
MPOOLFILE *mfp;
size_t nw;
- int callpgin, dosync, ret, syncfail;
- const char *fail;
+ int callpgin, ret;
dbenv = dbmp->dbenv;
- mp = dbmp->reginfo[0].primary;
mfp = dbmfp == NULL ? NULL : dbmfp->mfp;
-
- if (restartp != NULL)
- *restartp = 0;
- if (wrotep != NULL)
- *wrotep = 0;
- callpgin = 0;
+ callpgin = ret = 0;
/*
- * Check the dirty bit -- this buffer may have been written since we
- * decided to write it.
+ * We should never be called with a clean or trash buffer.
+ * The sync code does call us with already locked buffers.
*/
- if (!F_ISSET(bhp, BH_DIRTY)) {
- if (wrotep != NULL)
- *wrotep = 1;
- return (0);
- }
-
- MUTEX_LOCK(dbenv, &bhp->mutex, dbenv->lockfhp);
+ DB_ASSERT(F_ISSET(bhp, BH_DIRTY));
+ DB_ASSERT(!F_ISSET(bhp, BH_TRASH));
/*
- * If there were two writers, we may have just been waiting while the
- * other writer completed I/O on this buffer. Check the dirty bit one
- * more time.
+ * If we have not already traded the hash bucket lock for the buffer
+ * lock, do so now.
*/
- if (!F_ISSET(bhp, BH_DIRTY)) {
- MUTEX_UNLOCK(dbenv, &bhp->mutex);
-
- if (wrotep != NULL)
- *wrotep = 1;
- return (0);
+ if (!F_ISSET(bhp, BH_LOCKED)) {
+ F_SET(bhp, BH_LOCKED);
+ MUTEX_LOCK(dbenv, &bhp->mutex);
+ MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
}
- F_SET(bhp, BH_LOCKED);
- R_UNLOCK(dbenv, dbmp->reginfo);
-
- if (restartp != NULL)
- *restartp = 1;
-
/*
* It's possible that the underlying file doesn't exist, either
* because of an outright removal or because it was a temporary
@@ -347,155 +322,122 @@ __memp_pgwrite(dbmp, dbmfp, bhp, restartp, wrotep)
goto file_dead;
/*
- * Ensure the appropriate log records are on disk. If the page is
- * being written as part of a sync operation, the flush has already
- * been done, unless it was written by the application *after* the
- * sync was scheduled.
+ * If the page is in a file for which we have LSN information, we have
+ * to ensure the appropriate log records are on disk.
*/
- if (LOGGING_ON(dbenv) &&
- (!F_ISSET(bhp, BH_SYNC) || F_ISSET(bhp, BH_SYNC_LOGFLSH))) {
+ if (LOGGING_ON(dbenv) && mfp->lsn_off != -1) {
memcpy(&lsn, bhp->buf + mfp->lsn_off, sizeof(DB_LSN));
- if ((ret = log_flush(dbenv, &lsn)) != 0)
+ if ((ret = dbenv->log_flush(dbenv, &lsn)) != 0)
goto err;
}
- DB_ASSERT(!LOGGING_ON(dbenv) ||
- log_compare(&((LOG *)((DB_LOG *)
- dbenv->lg_handle)->reginfo.primary)->s_lsn, &LSN(bhp->buf)) > 0);
+
+#ifdef DIAGNOSTIC
+ /*
+ * Verify write-ahead logging semantics.
+ *
+ * !!!
+ * One special case. There is a single field on the meta-data page,
+ * the last-page-number-in-the-file field, for which we do not log
+ * changes. If the page was originally created in a database that
+ * didn't have logging turned on, we can see a page marked dirty but
+ * for which no corresponding log record has been written. However,
+ * the only way that a page can be created for which there isn't a
+ * previous log record and valid LSN is when the page was created
+ * without logging turned on, and so we check for that special-case
+ * LSN value.
+ */
+ if (LOGGING_ON(dbenv) && !IS_NOT_LOGGED_LSN(LSN(bhp->buf))) {
+ /*
+ * There is a potential race here. If we are in the midst of
+ * switching log files, it's possible we could test against the
+ * old file and the new offset in the log region's LSN. If we
+ * fail the first test, acquire the log mutex and check again.
+ */
+ DB_LOG *dblp;
+ LOG *lp;
+
+ dblp = dbenv->lg_handle;
+ lp = dblp->reginfo.primary;
+ if (!IS_NOT_LOGGED_LSN(LSN(bhp->buf)) &&
+ log_compare(&lp->s_lsn, &LSN(bhp->buf)) <= 0) {
+ R_LOCK(dbenv, &dblp->reginfo);
+ DB_ASSERT(log_compare(&lp->s_lsn, &LSN(bhp->buf)) > 0);
+ R_UNLOCK(dbenv, &dblp->reginfo);
+ }
+ }
+#endif
/*
* Call any pgout function. We set the callpgin flag so that we flag
* that the contents of the buffer will need to be passed through pgin
* before they are reused.
*/
- if (mfp->ftype == 0)
- ret = 0;
- else {
+ if (mfp->ftype != 0) {
callpgin = 1;
if ((ret = __memp_pg(dbmfp, bhp, 0)) != 0)
goto err;
}
/* Temporary files may not yet have been created. */
- if (!F_ISSET(&dbmfp->fh, DB_FH_VALID)) {
+ if (!F_ISSET(dbmfp->fhp, DB_FH_VALID)) {
MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp);
- if (!F_ISSET(&dbmfp->fh, DB_FH_VALID) &&
- ((ret = __db_appname(dbenv, DB_APP_TMP, NULL, NULL,
- DB_OSO_CREATE | DB_OSO_EXCL | DB_OSO_TEMP,
- &dbmfp->fh, NULL)) != 0 ||
- !F_ISSET(&dbmfp->fh, DB_FH_VALID))) {
- MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
+ ret = F_ISSET(dbmfp->fhp, DB_FH_VALID) ? 0 :
+ __db_appname(dbenv, DB_APP_TMP, NULL,
+ F_ISSET(dbenv, DB_ENV_DIRECT_DB) ? DB_OSO_DIRECT : 0,
+ dbmfp->fhp, NULL);
+ MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
+ if (ret != 0) {
__db_err(dbenv,
"unable to create temporary backing file");
goto err;
}
- MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
}
/* Write the page. */
- db_io.fhp = &dbmfp->fh;
+ db_io.fhp = dbmfp->fhp;
db_io.mutexp = dbmfp->mutexp;
db_io.pagesize = db_io.bytes = mfp->stat.st_pagesize;
db_io.pgno = bhp->pgno;
db_io.buf = bhp->buf;
if ((ret = __os_io(dbenv, &db_io, DB_IO_WRITE, &nw)) != 0) {
- ret = __db_panic(dbenv, ret);
- fail = "write";
- goto syserr;
- }
- if (nw != mfp->stat.st_pagesize) {
- ret = EIO;
- fail = "write";
- goto syserr;
+ __db_err(dbenv, "%s: write failed for page %lu",
+ __memp_fn(dbmfp), (u_long)bhp->pgno);
+ goto err;
}
+ ++mfp->stat.st_page_out;
+err:
file_dead:
/*
* !!!
* Once we pass this point, dbmfp and mfp may be NULL, we may not have
* a valid file reference.
*
- * Unlock the buffer and reacquire the region lock.
+ * Unlock the buffer and reacquire the hash lock.
*/
MUTEX_UNLOCK(dbenv, &bhp->mutex);
- R_LOCK(dbenv, dbmp->reginfo);
+ MUTEX_LOCK(dbenv, &hp->hash_mutex);
/*
- * Clean up the flags based on a successful write.
- *
* If we rewrote the page, it will need processing by the pgin
* routine before reuse.
*/
if (callpgin)
F_SET(bhp, BH_CALLPGIN);
- F_CLR(bhp, BH_DIRTY | BH_LOCKED);
/*
- * If we write a buffer for which a checkpoint is waiting, update
- * the count of pending buffers (both in the mpool as a whole and
- * for this file). If the count for this file goes to zero, set a
- * flag so we flush the writes.
+ * Update the hash bucket statistics, reset the flags.
+ * If we were successful, the page is no longer dirty.
*/
- dosync = 0;
- if (F_ISSET(bhp, BH_SYNC)) {
- F_CLR(bhp, BH_SYNC | BH_SYNC_LOGFLSH);
-
- --mp->lsn_cnt;
- if (mfp != NULL)
- dosync = --mfp->lsn_cnt == 0 ? 1 : 0;
- }
-
- /* Update the page clean/dirty statistics. */
- c_mp = BH_TO_CACHE(dbmp, bhp);
- ++c_mp->stat.st_page_clean;
- --c_mp->stat.st_page_dirty;
-
- /* Update I/O statistics. */
- if (mfp != NULL)
- ++mfp->stat.st_page_out;
+ if (ret == 0) {
+ DB_ASSERT(hp->hash_page_dirty != 0);
+ --hp->hash_page_dirty;
- /*
- * Do the sync after everything else has been updated, so any incoming
- * checkpoint doesn't see inconsistent information.
- *
- * XXX:
- * Don't lock the region around the sync, fsync(2) has no atomicity
- * issues.
- *
- * XXX:
- * We ignore errors from the sync -- it makes no sense to return an
- * error to the calling process, so set a flag causing the checkpoint
- * to be retried later. There is a possibility, of course, that a
- * subsequent checkpoint was started and that we're going to force it
- * to fail. That should be unlikely, and fixing it would be difficult.
- */
- if (dosync) {
- R_UNLOCK(dbenv, dbmp->reginfo);
- syncfail = __os_fsync(dbenv, &dbmfp->fh) != 0;
- R_LOCK(dbenv, dbmp->reginfo);
- if (syncfail)
- F_SET(mp, MP_LSN_RETRY);
+ F_CLR(bhp, BH_DIRTY | BH_DIRTY_CREATE);
}
- if (wrotep != NULL)
- *wrotep = 1;
-
- return (0);
-
-syserr: __db_err(dbenv, "%s: %s failed for page %lu",
- __memp_fn(dbmfp), fail, (u_long)bhp->pgno);
-
-err: /* Unlock the buffer and reacquire the region lock. */
- MUTEX_UNLOCK(dbenv, &bhp->mutex);
- R_LOCK(dbenv, dbmp->reginfo);
-
- /*
- * Clean up the flags based on a failure.
- *
- * The page remains dirty but we remove our lock. If we rewrote the
- * page, it will need processing by the pgin routine before reuse.
- */
- if (callpgin)
- F_SET(bhp, BH_CALLPGIN);
+ /* Regardless, clear any sync wait-for count and remove our lock. */
+ bhp->ref_sync = 0;
F_CLR(bhp, BH_LOCKED);
return (ret);
@@ -514,15 +456,17 @@ __memp_pg(dbmfp, bhp, is_pgin)
int is_pgin;
{
DBT dbt, *dbtp;
+ DB_ENV *dbenv;
DB_MPOOL *dbmp;
DB_MPREG *mpreg;
MPOOLFILE *mfp;
int ftype, ret;
dbmp = dbmfp->dbmp;
+ dbenv = dbmp->dbenv;
mfp = dbmfp->mfp;
- MUTEX_THREAD_LOCK(dbmp->dbenv, dbmp->mutexp);
+ MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp);
ftype = mfp->ftype;
for (mpreg = LIST_FIRST(&dbmp->dbregq);
@@ -536,28 +480,28 @@ __memp_pg(dbmfp, bhp, is_pgin)
dbt.data = R_ADDR(dbmp->reginfo, mfp->pgcookie_off);
dbtp = &dbt;
}
- MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp);
+ MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
if (is_pgin) {
if (mpreg->pgin != NULL &&
- (ret = mpreg->pgin(dbmp->dbenv,
+ (ret = mpreg->pgin(dbenv,
bhp->pgno, bhp->buf, dbtp)) != 0)
goto err;
} else
if (mpreg->pgout != NULL &&
- (ret = mpreg->pgout(dbmp->dbenv,
+ (ret = mpreg->pgout(dbenv,
bhp->pgno, bhp->buf, dbtp)) != 0)
goto err;
break;
}
if (mpreg == NULL)
- MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp);
+ MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
return (0);
-err: MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp);
- __db_err(dbmp->dbenv, "%s: %s failed for page %lu",
+err: MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
+ __db_err(dbenv, "%s: %s failed for page %lu",
__memp_fn(dbmfp), is_pgin ? "pgin" : "pgout", (u_long)bhp->pgno);
return (ret);
}
@@ -566,55 +510,78 @@ err: MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp);
* __memp_bhfree --
* Free a bucket header and its referenced data.
*
- * PUBLIC: void __memp_bhfree __P((DB_MPOOL *, BH *, int));
+ * PUBLIC: void __memp_bhfree __P((DB_MPOOL *, DB_MPOOL_HASH *, BH *, int));
*/
void
-__memp_bhfree(dbmp, bhp, free_mem)
+__memp_bhfree(dbmp, hp, bhp, free_mem)
DB_MPOOL *dbmp;
+ DB_MPOOL_HASH *hp;
BH *bhp;
int free_mem;
{
- DB_HASHTAB *dbht;
+ DB_ENV *dbenv;
MPOOL *c_mp, *mp;
MPOOLFILE *mfp;
- int n_bucket, n_cache;
+ u_int32_t n_cache;
+ /*
+ * Assumes the hash bucket is locked and the MPOOL is not.
+ */
+ dbenv = dbmp->dbenv;
mp = dbmp->reginfo[0].primary;
- c_mp = BH_TO_CACHE(dbmp, bhp);
- n_cache = NCACHE(mp, bhp->pgno);
- n_bucket = NBUCKET(c_mp, bhp->mf_offset, bhp->pgno);
- dbht = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab);
+ n_cache = NCACHE(mp, bhp->mf_offset, bhp->pgno);
- /* Delete the buffer header from the hash bucket queue. */
- SH_TAILQ_REMOVE(&dbht[n_bucket], bhp, hq, __bh);
+ /*
+ * Delete the buffer header from the hash bucket queue and reset
+ * the hash bucket's priority, if necessary.
+ */
+ SH_TAILQ_REMOVE(&hp->hash_bucket, bhp, hq, __bh);
+ if (bhp->priority == hp->hash_priority)
+ hp->hash_priority =
+ SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL ?
+ 0 : SH_TAILQ_FIRST(&hp->hash_bucket, __bh)->priority;
- /* Delete the buffer header from the LRU queue. */
- SH_TAILQ_REMOVE(&c_mp->bhq, bhp, q, __bh);
+ /*
+ * Discard the hash bucket's mutex, it's no longer needed, and
+ * we don't want to be holding it when acquiring other locks.
+ */
+ MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
- /* Clear the mutex this buffer recorded */
- __db_shlocks_clear(&bhp->mutex, &dbmp->reginfo[n_cache],
- (REGMAINT *)R_ADDR(&dbmp->reginfo[n_cache], mp->maint_off));
/*
* Find the underlying MPOOLFILE and decrement its reference count.
* If this is its last reference, remove it.
*/
mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
+ MUTEX_LOCK(dbenv, &mfp->mutex);
if (--mfp->block_cnt == 0 && mfp->mpf_cnt == 0)
__memp_mf_discard(dbmp, mfp);
+ else
+ MUTEX_UNLOCK(dbenv, &mfp->mutex);
+
+ R_LOCK(dbenv, &dbmp->reginfo[n_cache]);
+
+ /*
+ * Clear the mutex this buffer recorded; requires the region lock
+ * be held.
+ */
+ __db_shlocks_clear(&bhp->mutex, &dbmp->reginfo[n_cache],
+ (REGMAINT *)R_ADDR(&dbmp->reginfo[n_cache], mp->maint_off));
/*
- * If we're not reusing it immediately, free the buffer header
+ * If we're not reusing the buffer immediately, free the buffer header
* and data for real.
*/
if (free_mem) {
- --c_mp->stat.st_page_clean;
__db_shalloc_free(dbmp->reginfo[n_cache].addr, bhp);
+ c_mp = dbmp->reginfo[n_cache].primary;
+ c_mp->stat.st_pages--;
}
+ R_UNLOCK(dbenv, &dbmp->reginfo[n_cache]);
}
/*
* __memp_upgrade --
- * Upgrade a file descriptor from readonly to readwrite.
+ * Upgrade a file descriptor from read-only to read-write.
*/
static int
__memp_upgrade(dbmp, dbmfp, mfp)
@@ -622,41 +589,58 @@ __memp_upgrade(dbmp, dbmfp, mfp)
DB_MPOOLFILE *dbmfp;
MPOOLFILE *mfp;
{
- DB_FH fh;
+ DB_ENV *dbenv;
+ DB_FH *fhp, *tfhp;
int ret;
char *rpath;
- /*
- * !!!
- * We expect the handle to already be locked.
- */
-
- /* Check to see if we've already upgraded. */
- if (F_ISSET(dbmfp, MP_UPGRADE))
- return (0);
-
- /* Check to see if we've already failed. */
- if (F_ISSET(dbmfp, MP_UPGRADE_FAIL))
- return (1);
+ dbenv = dbmp->dbenv;
+ fhp = NULL;
+ rpath = NULL;
/*
* Calculate the real name for this file and try to open it read/write.
* We know we have a valid pathname for the file because it's the only
* way we could have gotten a file descriptor of any kind.
*/
- if ((ret = __db_appname(dbmp->dbenv, DB_APP_DATA,
- NULL, R_ADDR(dbmp->reginfo, mfp->path_off), 0, NULL, &rpath)) != 0)
- return (ret);
- if (__os_open(dbmp->dbenv, rpath, 0, 0, &fh) != 0) {
+ if ((ret = __os_calloc(dbenv, 1, sizeof(DB_FH), &fhp)) != 0)
+ goto err;
+
+ if ((ret = __db_appname(dbenv, DB_APP_DATA,
+ R_ADDR(dbmp->reginfo, mfp->path_off), 0, NULL, &rpath)) != 0)
+ goto err;
+
+ if (__os_open(dbenv, rpath,
+ F_ISSET(mfp, MP_DIRECT) ? DB_OSO_DIRECT : 0, 0, fhp) != 0) {
F_SET(dbmfp, MP_UPGRADE_FAIL);
- ret = 1;
- } else {
- /* Swap the descriptors and set the upgrade flag. */
- (void)__os_closehandle(&dbmfp->fh);
- dbmfp->fh = fh;
- F_SET(dbmfp, MP_UPGRADE);
- ret = 0;
+ goto err;
}
- __os_freestr(rpath);
+
+ /*
+ * Swap the descriptors and set the upgrade flag.
+ *
+ * XXX
+ * There is a race here. If another process schedules a read using the
+ * existing file descriptor and is swapped out before making the system
+ * call, this code could theoretically close the file descriptor out
+ * from under it. While it's very unlikely, this code should still be
+ * rewritten.
+ */
+ tfhp = dbmfp->fhp;
+ dbmfp->fhp = fhp;
+ fhp = tfhp;
+
+ (void)__os_closehandle(dbenv, fhp);
+ F_SET(dbmfp, MP_UPGRADE);
+
+ ret = 0;
+ if (0) {
+err: ret = 1;
+ }
+ if (fhp != NULL)
+ __os_free(dbenv, fhp);
+ if (rpath != NULL)
+ __os_free(dbenv, rpath);
+
return (ret);
}
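
Both __memp_pgread and the now-static __memp_pgwrite rely on the same
lock-trading pattern: pin the buffer, swap the hash bucket lock for the
per-buffer lock around the I/O, then swap back.  A skeleton of that
pattern follows, with do_page_io standing in (hypothetically) for the
__os_io call and its setup:

    static int
    locked_page_io(DB_ENV *dbenv, DB_MUTEX *hash_mutexp, BH *bhp, int is_read)
    {
        int ret;

        /* Mark the buffer busy, then trade bucket lock for buffer lock. */
        F_SET(bhp, BH_LOCKED);
        MUTEX_LOCK(dbenv, &bhp->mutex);
        MUTEX_UNLOCK(dbenv, hash_mutexp);

        /* Do the I/O without holding the hash bucket lock. */
        ret = do_page_io(dbenv, bhp, is_read);

        /* Trade back: release the buffer, reacquire the hash bucket. */
        MUTEX_UNLOCK(dbenv, &bhp->mutex);
        MUTEX_LOCK(dbenv, hash_mutexp);
        F_CLR(bhp, BH_LOCKED);
        return (ret);
    }

Threads that find BH_LOCKED set (see __memp_fget below) block on
bhp->mutex rather than spinning on the bucket, which is exactly the wait
described in the old Design notes.
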
diff --git a/bdb/mp/mp_fget.c b/bdb/mp/mp_fget.c
index 1bff5e136ab..be0785a2184 100644
--- a/bdb/mp/mp_fget.c
+++ b/bdb/mp/mp_fget.c
@@ -1,13 +1,13 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Copyright (c) 1996-2002
* Sleepycat Software. All rights reserved.
*/
#include "db_config.h"
#ifndef lint
-static const char revid[] = "$Id: mp_fget.c,v 11.28 2001/01/10 04:50:53 ubell Exp $";
+static const char revid[] = "$Id: mp_fget.c,v 11.68 2002/08/06 04:58:09 bostic Exp $";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
@@ -16,51 +16,54 @@ static const char revid[] = "$Id: mp_fget.c,v 11.28 2001/01/10 04:50:53 ubell Ex
#include <string.h>
#endif
-#ifdef HAVE_RPC
-#include "db_server.h"
-#endif
-
#include "db_int.h"
-#include "db_shash.h"
-#include "mp.h"
+#include "dbinc/db_shash.h"
+#include "dbinc/mp.h"
-#ifdef HAVE_RPC
-#include "gen_client_ext.h"
-#include "rpc_client_ext.h"
+#ifdef HAVE_FILESYSTEM_NOTZERO
+static int __memp_fs_notzero
+ __P((DB_ENV *, DB_MPOOLFILE *, MPOOLFILE *, db_pgno_t *));
#endif
/*
- * memp_fget --
+ * __memp_fget --
* Get a page from the file.
+ *
+ * PUBLIC: int __memp_fget
+ * PUBLIC: __P((DB_MPOOLFILE *, db_pgno_t *, u_int32_t, void *));
*/
int
-memp_fget(dbmfp, pgnoaddr, flags, addrp)
+__memp_fget(dbmfp, pgnoaddr, flags, addrp)
DB_MPOOLFILE *dbmfp;
db_pgno_t *pgnoaddr;
u_int32_t flags;
void *addrp;
{
- BH *bhp;
+ enum { FIRST_FOUND, FIRST_MISS, SECOND_FOUND, SECOND_MISS } state;
+ BH *alloc_bhp, *bhp;
DB_ENV *dbenv;
DB_MPOOL *dbmp;
- DB_HASHTAB *dbht;
+ DB_MPOOL_HASH *hp;
MPOOL *c_mp, *mp;
MPOOLFILE *mfp;
- size_t n_bucket, n_cache, mf_offset;
- u_int32_t st_hsearch;
- int b_incr, first, ret;
+ roff_t mf_offset;
+ u_int32_t n_cache, st_hsearch;
+ int b_incr, extending, first, ret;
+
+ *(void **)addrp = NULL;
dbmp = dbmfp->dbmp;
dbenv = dbmp->dbenv;
- mp = dbmp->reginfo[0].primary;
- mfp = dbmfp->mfp;
-#ifdef HAVE_RPC
- if (F_ISSET(dbenv, DB_ENV_RPCCLIENT))
- return (__dbcl_memp_fget(dbmfp, pgnoaddr, flags, addrp));
-#endif
PANIC_CHECK(dbenv);
+ mp = dbmp->reginfo[0].primary;
+ mfp = dbmfp->mfp;
+ mf_offset = R_OFFSET(dbmp->reginfo, mfp);
+ alloc_bhp = bhp = NULL;
+ hp = NULL;
+ b_incr = extending = ret = 0;
+
/*
* Validate arguments.
*
@@ -74,100 +77,35 @@ memp_fget(dbmfp, pgnoaddr, flags, addrp)
* is to keep database files small. It's sleazy as hell, but we catch
* any attempt to actually write the file in memp_fput().
*/
-#define OKFLAGS \
- (DB_MPOOL_CREATE | DB_MPOOL_LAST | \
- DB_MPOOL_NEW | DB_MPOOL_NEW_GROUP | DB_MPOOL_EXTENT)
+#define OKFLAGS (DB_MPOOL_CREATE | DB_MPOOL_LAST | DB_MPOOL_NEW)
if (flags != 0) {
if ((ret = __db_fchk(dbenv, "memp_fget", flags, OKFLAGS)) != 0)
return (ret);
- switch (flags & ~DB_MPOOL_EXTENT) {
+ switch (flags) {
case DB_MPOOL_CREATE:
+ break;
case DB_MPOOL_LAST:
+ /* Get the last page number in the file. */
+ if (flags == DB_MPOOL_LAST) {
+ R_LOCK(dbenv, dbmp->reginfo);
+ *pgnoaddr = mfp->last_pgno;
+ R_UNLOCK(dbenv, dbmp->reginfo);
+ }
+ break;
case DB_MPOOL_NEW:
- case DB_MPOOL_NEW_GROUP:
- case 0:
+ /*
+ * If always creating a page, skip the first search
+ * of the hash bucket.
+ */
+ if (flags == DB_MPOOL_NEW)
+ goto alloc;
break;
default:
return (__db_ferr(dbenv, "memp_fget", 1));
}
}
-#ifdef DIAGNOSTIC
- /*
- * XXX
- * We want to switch threads as often as possible. Yield every time
- * we get a new page to ensure contention.
- */
- if (DB_GLOBAL(db_pageyield))
- __os_yield(dbenv, 1);
-#endif
-
- /* Initialize remaining local variables. */
- mf_offset = R_OFFSET(dbmp->reginfo, mfp);
- bhp = NULL;
- st_hsearch = 0;
- b_incr = ret = 0;
-
- R_LOCK(dbenv, dbmp->reginfo);
-
- /*
- * Check for the new, last or last + 1 page requests.
- *
- * Examine and update the file's last_pgno value. We don't care if
- * the last_pgno value immediately changes due to another thread --
- * at this instant in time, the value is correct. We do increment the
- * current last_pgno value if the thread is asking for a new page,
- * however, to ensure that two threads creating pages don't get the
- * same one.
- *
- * If we create a page, there is the potential that a page after it
- * in the file will be written before it will be written. Recovery
- * depends on pages that are "created" in the file by subsequent pages
- * being written be zeroed out, not have random garbage. Ensure that
- * the OS agrees.
- *
- * !!!
- * DB_MPOOL_NEW_GROUP is undocumented -- the hash access method needs
- * to allocate contiguous groups of pages in order to do subdatabases.
- * We return the first page in the group, but the caller must put an
- * LSN on the *last* page and write it, otherwise after a crash we may
- * not create all of the pages we need to create.
- */
- if (LF_ISSET(DB_MPOOL_LAST | DB_MPOOL_NEW | DB_MPOOL_NEW_GROUP)) {
- if (LF_ISSET(DB_MPOOL_NEW)) {
- if (F_ISSET(&dbmfp->fh, DB_FH_VALID) && (ret =
- __os_fpinit(dbenv, &dbmfp->fh, mfp->last_pgno + 1,
- 1, mfp->stat.st_pagesize)) != 0) {
- R_UNLOCK(dbenv, dbmp->reginfo);
- return (ret);
- }
- ++mfp->last_pgno;
- }
- if (LF_ISSET(DB_MPOOL_NEW_GROUP)) {
- if (F_ISSET(&dbmfp->fh, DB_FH_VALID) && (ret =
- __os_fpinit(dbenv, &dbmfp->fh, mfp->last_pgno + 1,
- (int)*pgnoaddr, mfp->stat.st_pagesize)) != 0) {
- R_UNLOCK(dbenv, dbmp->reginfo);
- return (ret);
- }
- mfp->last_pgno += *pgnoaddr;
- }
- *pgnoaddr = mfp->last_pgno;
- }
-
- /*
- * Determine the hash bucket where this page will live, and get local
- * pointers to the cache and its hash table.
- */
- n_cache = NCACHE(mp, *pgnoaddr);
- c_mp = dbmp->reginfo[n_cache].primary;
- n_bucket = NBUCKET(c_mp, mf_offset, *pgnoaddr);
- dbht = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab);
-
- if (LF_ISSET(DB_MPOOL_NEW | DB_MPOOL_NEW_GROUP))
- goto alloc;
-
/*
* If mmap'ing the file and the page is not past the end of the file,
* just return a pointer.
@@ -183,235 +121,534 @@ memp_fget(dbmfp, pgnoaddr, flags, addrp)
* goes through the cache. All pages previously returned will be safe,
* as long as the correct locking protocol was observed.
*
- * XXX
* We don't discard the map because we don't know when all of the
* pages will have been discarded from the process' address space.
* It would be possible to do so by reference counting the open
* pages from the mmap, but it's unclear to me that it's worth it.
*/
- if (dbmfp->addr != NULL && F_ISSET(mfp, MP_CAN_MMAP)) {
- if (*pgnoaddr > mfp->orig_last_pgno) {
- /*
- * !!!
- * See the comment above about non-existent pages and
- * the hash access method.
- */
- if (!LF_ISSET(DB_MPOOL_CREATE)) {
- if (!LF_ISSET(DB_MPOOL_EXTENT))
- __db_err(dbenv,
- "%s: page %lu doesn't exist",
- __memp_fn(dbmfp), (u_long)*pgnoaddr);
- ret = EINVAL;
- goto err;
- }
- } else {
- *(void **)addrp =
- R_ADDR(dbmfp, *pgnoaddr * mfp->stat.st_pagesize);
- ++mfp->stat.st_map;
- goto done;
- }
+ if (dbmfp->addr != NULL &&
+ F_ISSET(mfp, MP_CAN_MMAP) && *pgnoaddr <= mfp->orig_last_pgno) {
+ *(void **)addrp =
+ R_ADDR(dbmfp, *pgnoaddr * mfp->stat.st_pagesize);
+ ++mfp->stat.st_map;
+ return (0);
}
+hb_search:
+ /*
+ * Determine the cache and hash bucket where this page lives and get
+ * local pointers to them. Reset on each pass through this code, the
+ * page number can change.
+ */
+ n_cache = NCACHE(mp, mf_offset, *pgnoaddr);
+ c_mp = dbmp->reginfo[n_cache].primary;
+ hp = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab);
+ hp = &hp[NBUCKET(c_mp, mf_offset, *pgnoaddr)];
+
/* Search the hash chain for the page. */
- for (bhp = SH_TAILQ_FIRST(&dbht[n_bucket], __bh);
+retry: st_hsearch = 0;
+ MUTEX_LOCK(dbenv, &hp->hash_mutex);
+ for (bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) {
++st_hsearch;
if (bhp->pgno != *pgnoaddr || bhp->mf_offset != mf_offset)
continue;
- /* Increment the reference count. */
+ /*
+ * Increment the reference count. We may discard the hash
+ * bucket lock as we evaluate and/or read the buffer, so we
+ * need to ensure it doesn't move and its contents remain
+ * unchanged.
+ */
if (bhp->ref == UINT16_T_MAX) {
__db_err(dbenv,
"%s: page %lu: reference count overflow",
__memp_fn(dbmfp), (u_long)bhp->pgno);
ret = EINVAL;
+ MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
goto err;
}
-
- /*
- * Increment the reference count. We may discard the region
- * lock as we evaluate and/or read the buffer, so we need to
- * ensure that it doesn't move and that its contents remain
- * unchanged.
- */
++bhp->ref;
b_incr = 1;
/*
- * Any buffer we find might be trouble.
- *
* BH_LOCKED --
- * I/O is in progress. Because we've incremented the buffer
- * reference count, we know the buffer can't move. Unlock
- * the region lock, wait for the I/O to complete, and reacquire
- * the region.
+ * I/O is in progress or sync is waiting on the buffer to write
+ * it. Because we've incremented the buffer reference count,
+ * we know the buffer can't move. Unlock the bucket lock, wait
+ * for the buffer to become available, reacquire the bucket.
*/
- for (first = 1; F_ISSET(bhp, BH_LOCKED); first = 0) {
- R_UNLOCK(dbenv, dbmp->reginfo);
+ for (first = 1; F_ISSET(bhp, BH_LOCKED) &&
+ !F_ISSET(dbenv, DB_ENV_NOLOCKING); first = 0) {
+ /*
+ * If someone is trying to sync this buffer and the
+ * buffer is hot, they may never get in. Give up
+ * and try again.
+ */
+ if (!first && bhp->ref_sync != 0) {
+ --bhp->ref;
+ b_incr = 0;
+ MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
+ __os_yield(dbenv, 1);
+ goto retry;
+ }
+ MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
/*
- * Explicitly yield the processor if it's not the first
- * pass through this loop -- if we don't, we might end
- * up running to the end of our CPU quantum as we will
- * simply be swapping between the two locks.
+ * Explicitly yield the processor if not the first pass
+ * through this loop -- if we don't, we might run to the
+ * end of our CPU quantum as we will simply be swapping
+ * between the two locks.
*/
if (!first)
__os_yield(dbenv, 1);
- MUTEX_LOCK(dbenv, &bhp->mutex, dbenv->lockfhp);
+ MUTEX_LOCK(dbenv, &bhp->mutex);
/* Wait for I/O to finish... */
MUTEX_UNLOCK(dbenv, &bhp->mutex);
- R_LOCK(dbenv, dbmp->reginfo);
+ MUTEX_LOCK(dbenv, &hp->hash_mutex);
+ }
+
+ ++mfp->stat.st_cache_hit;
+ break;
+ }
+
+ /*
+ * Update the hash bucket search statistics -- do now because our next
+ * search may be for a different bucket.
+ */
+ ++c_mp->stat.st_hash_searches;
+ if (st_hsearch > c_mp->stat.st_hash_longest)
+ c_mp->stat.st_hash_longest = st_hsearch;
+ c_mp->stat.st_hash_examined += st_hsearch;
+
+ /*
+ * There are 4 possible paths to this location:
+ *
+ * FIRST_MISS:
+ * Didn't find the page in the hash bucket on our first pass:
+ * bhp == NULL, alloc_bhp == NULL
+ *
+ * FIRST_FOUND:
+ * Found the page in the hash bucket on our first pass:
+ * bhp != NULL, alloc_bhp == NULL
+ *
+ * SECOND_FOUND:
+ * Didn't find the page in the hash bucket on the first pass,
+ * allocated space, and found the page in the hash bucket on
+ * our second pass:
+ * bhp != NULL, alloc_bhp != NULL
+ *
+ * SECOND_MISS:
+ * Didn't find the page in the hash bucket on the first pass,
+ * allocated space, and didn't find the page in the hash bucket
+ * on our second pass:
+ * bhp == NULL, alloc_bhp != NULL
+ */
+ state = bhp == NULL ?
+ (alloc_bhp == NULL ? FIRST_MISS : SECOND_MISS) :
+ (alloc_bhp == NULL ? FIRST_FOUND : SECOND_FOUND);
+ switch (state) {
+ case FIRST_FOUND:
+ /* We found the buffer in our first check -- we're done. */
+ break;
+ case FIRST_MISS:
+ /*
+ * We didn't find the buffer in our first check. Figure out
+ * if the page exists, and allocate structures so we can add
+ * the page to the buffer pool.
+ */
+ MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
+
+alloc: /*
+ * If DB_MPOOL_NEW is set, we have to allocate a page number.
+		 * If neither DB_MPOOL_CREATE nor DB_MPOOL_NEW is set, then
+		 * it's an error to try to get a page past the end of the file.
+ */
+ COMPQUIET(n_cache, 0);
+
+ extending = ret = 0;
+ R_LOCK(dbenv, dbmp->reginfo);
+ switch (flags) {
+ case DB_MPOOL_NEW:
+ extending = 1;
+ *pgnoaddr = mfp->last_pgno + 1;
+ break;
+ case DB_MPOOL_CREATE:
+ extending = *pgnoaddr > mfp->last_pgno;
+ break;
+ default:
+ ret = *pgnoaddr > mfp->last_pgno ? DB_PAGE_NOTFOUND : 0;
+ break;
}
+ R_UNLOCK(dbenv, dbmp->reginfo);
+ if (ret != 0)
+ goto err;
/*
- * BH_TRASH --
- * The contents of the buffer are garbage. Shouldn't happen,
- * and this read is likely to fail, but might as well try.
+ * !!!
+ * In the DB_MPOOL_NEW code path, mf_offset and n_cache have
+ * not yet been initialized.
*/
- if (F_ISSET(bhp, BH_TRASH))
- goto reread;
+ mf_offset = R_OFFSET(dbmp->reginfo, mfp);
+ n_cache = NCACHE(mp, mf_offset, *pgnoaddr);
+ /* Allocate a new buffer header and data space. */
+ if ((ret = __memp_alloc(dbmp,
+ &dbmp->reginfo[n_cache], mfp, 0, NULL, &alloc_bhp)) != 0)
+ goto err;
+#ifdef DIAGNOSTIC
+ if ((db_alignp_t)alloc_bhp->buf & (sizeof(size_t) - 1)) {
+ __db_err(dbenv,
+ "Error: buffer data is NOT size_t aligned");
+ ret = EINVAL;
+ goto err;
+ }
+#endif
/*
- * BH_CALLPGIN --
- * The buffer was converted so it could be written, and the
- * contents need to be converted again.
+ * If we are extending the file, we'll need the region lock
+ * again.
*/
- if (F_ISSET(bhp, BH_CALLPGIN)) {
- if ((ret = __memp_pg(dbmfp, bhp, 1)) != 0)
+ if (extending)
+ R_LOCK(dbenv, dbmp->reginfo);
+
+ /*
+ * DB_MPOOL_NEW does not guarantee you a page unreferenced by
+ * any other thread of control. (That guarantee is interesting
+ * for DB_MPOOL_NEW, unlike DB_MPOOL_CREATE, because the caller
+ * did not specify the page number, and so, may reasonably not
+ * have any way to lock the page outside of mpool.) Regardless,
+ * if we allocate the page, and some other thread of control
+ * requests the page by number, we will not detect that and the
+ * thread of control that allocated using DB_MPOOL_NEW may not
+ * have a chance to initialize the page. (Note: we *could*
+ * detect this case if we set a flag in the buffer header which
+ * guaranteed that no gets of the page would succeed until the
+ * reference count went to 0, that is, until the creating page
+ * put the page.) What we do guarantee is that if two threads
+ * of control are both doing DB_MPOOL_NEW calls, they won't
+ * collide, that is, they won't both get the same page.
+ *
+ * There's a possibility that another thread allocated the page
+ * we were planning to allocate while we were off doing buffer
+		 * allocation.  We can check for that by making sure the page
+		 * number
+ * we were going to use is still available. If it's not, then
+ * we check to see if the next available page number hashes to
+ * the same mpool region as the old one -- if it does, we can
+ * continue, otherwise, we have to start over.
+ */
+ if (flags == DB_MPOOL_NEW && *pgnoaddr != mfp->last_pgno + 1) {
+ *pgnoaddr = mfp->last_pgno + 1;
+ if (n_cache != NCACHE(mp, mf_offset, *pgnoaddr)) {
+ __db_shalloc_free(
+ dbmp->reginfo[n_cache].addr, alloc_bhp);
+ /*
+ * flags == DB_MPOOL_NEW, so extending is set
+ * and we're holding the region locked.
+ */
+ R_UNLOCK(dbenv, dbmp->reginfo);
+
+ alloc_bhp = NULL;
+ goto alloc;
+ }
+ }
+
+ /*
+ * We released the region lock, so another thread might have
+ * extended the file. Update the last_pgno and initialize
+ * the file, as necessary, if we extended the file.
+ */
+ if (extending) {
+#ifdef HAVE_FILESYSTEM_NOTZERO
+ if (*pgnoaddr > mfp->last_pgno &&
+ __os_fs_notzero() &&
+ F_ISSET(dbmfp->fhp, DB_FH_VALID))
+ ret = __memp_fs_notzero(
+ dbenv, dbmfp, mfp, pgnoaddr);
+ else
+ ret = 0;
+#endif
+ if (ret == 0 && *pgnoaddr > mfp->last_pgno)
+ mfp->last_pgno = *pgnoaddr;
+
+ R_UNLOCK(dbenv, dbmp->reginfo);
+ if (ret != 0)
goto err;
- F_CLR(bhp, BH_CALLPGIN);
}
+ goto hb_search;
+ case SECOND_FOUND:
+ /*
+ * We allocated buffer space for the requested page, but then
+ * found the page in the buffer cache on our second check.
+ * That's OK -- we can use the page we found in the pool,
+ * unless DB_MPOOL_NEW is set.
+ *
+ * Free the allocated memory, we no longer need it. Since we
+ * can't acquire the region lock while holding the hash bucket
+ * lock, we have to release the hash bucket and re-acquire it.
+ * That's OK, because we have the buffer pinned down.
+ */
+ MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
+ R_LOCK(dbenv, &dbmp->reginfo[n_cache]);
+ __db_shalloc_free(dbmp->reginfo[n_cache].addr, alloc_bhp);
+ alloc_bhp = NULL;
+ R_UNLOCK(dbenv, &dbmp->reginfo[n_cache]);
+ MUTEX_LOCK(dbenv, &hp->hash_mutex);
- ++mfp->stat.st_cache_hit;
- *(void **)addrp = bhp->buf;
- goto done;
- }
+ /*
+ * We can't use the page we found in the pool if DB_MPOOL_NEW
+ * was set. (For details, see the above comment beginning
+ * "DB_MPOOL_NEW does not guarantee you a page unreferenced by
+ * any other thread of control".) If DB_MPOOL_NEW is set, we
+ * release our pin on this particular buffer, and try to get
+ * another one.
+ */
+ if (flags == DB_MPOOL_NEW) {
+ --bhp->ref;
+ b_incr = 0;
+ goto alloc;
+ }
+ break;
+ case SECOND_MISS:
+ /*
+ * We allocated buffer space for the requested page, and found
+ * the page still missing on our second pass through the buffer
+ * cache. Instantiate the page.
+ */
+ bhp = alloc_bhp;
+ alloc_bhp = NULL;
-alloc: /* Allocate new buffer header and data space. */
- if ((ret = __memp_alloc(dbmp,
- &dbmp->reginfo[n_cache], mfp, 0, NULL, &bhp)) != 0)
- goto err;
+ /*
+ * Initialize all the BH and hash bucket fields so we can call
+ * __memp_bhfree if an error occurs.
+ *
+ * Append the buffer to the tail of the bucket list and update
+ * the hash bucket's priority.
+ */
+ b_incr = 1;
+
+ memset(bhp, 0, sizeof(BH));
+ bhp->ref = 1;
+ bhp->priority = UINT32_T_MAX;
+ bhp->pgno = *pgnoaddr;
+ bhp->mf_offset = mf_offset;
+ SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, bhp, hq);
+ hp->hash_priority =
+ SH_TAILQ_FIRST(&hp->hash_bucket, __bh)->priority;
+
+ /* If we extended the file, make sure the page is never lost. */
+ if (extending) {
+ ++hp->hash_page_dirty;
+ F_SET(bhp, BH_DIRTY | BH_DIRTY_CREATE);
+ }
- ++c_mp->stat.st_page_clean;
+ /*
+ * If we created the page, zero it out. If we didn't create
+ * the page, read from the backing file.
+ *
+ * !!!
+ * DB_MPOOL_NEW doesn't call the pgin function.
+ *
+ * If DB_MPOOL_CREATE is used, then the application's pgin
+ * function has to be able to handle pages of 0's -- if it
+ * uses DB_MPOOL_NEW, it can detect all of its page creates,
+ * and not bother.
+ *
+ * If we're running in diagnostic mode, smash any bytes on the
+ * page that are unknown quantities for the caller.
+ *
+ * Otherwise, read the page into memory, optionally creating it
+ * if DB_MPOOL_CREATE is set.
+ */
+ if (extending) {
+ if (mfp->clear_len == 0)
+ memset(bhp->buf, 0, mfp->stat.st_pagesize);
+ else {
+ memset(bhp->buf, 0, mfp->clear_len);
+#if defined(DIAGNOSTIC) || defined(UMRW)
+ memset(bhp->buf + mfp->clear_len, CLEAR_BYTE,
+ mfp->stat.st_pagesize - mfp->clear_len);
+#endif
+ }
- /*
- * Initialize the BH fields so that we can call the __memp_bhfree
- * routine if an error occurs.
- */
- memset(bhp, 0, sizeof(BH));
- bhp->ref = 1;
- bhp->pgno = *pgnoaddr;
- bhp->mf_offset = mf_offset;
+ if (flags == DB_MPOOL_CREATE && mfp->ftype != 0)
+ F_SET(bhp, BH_CALLPGIN);
- /* Increment the count of buffers referenced by this MPOOLFILE. */
- ++mfp->block_cnt;
+ ++mfp->stat.st_page_create;
+ } else {
+ F_SET(bhp, BH_TRASH);
+ ++mfp->stat.st_cache_miss;
+ }
- /*
- * Prepend the bucket header to the head of the appropriate MPOOL
- * bucket hash list. Append the bucket header to the tail of the
- * MPOOL LRU chain.
- */
- SH_TAILQ_INSERT_HEAD(&dbht[n_bucket], bhp, hq, __bh);
- SH_TAILQ_INSERT_TAIL(&c_mp->bhq, bhp, q);
+ /* Increment buffer count referenced by MPOOLFILE. */
+ MUTEX_LOCK(dbenv, &mfp->mutex);
+ ++mfp->block_cnt;
+ MUTEX_UNLOCK(dbenv, &mfp->mutex);
-#ifdef DIAGNOSTIC
- if ((db_alignp_t)bhp->buf & (sizeof(size_t) - 1)) {
- __db_err(dbenv, "Internal error: BH data NOT size_t aligned.");
- ret = EINVAL;
- __memp_bhfree(dbmp, bhp, 1);
- goto err;
+ /*
+ * Initialize the mutex. This is the last initialization step,
+ * because it's the only one that can fail, and everything else
+ * must already be set up, because jumping to the err label
+ * calls __memp_bhfree.
+ */
+ if ((ret = __db_mutex_setup(dbenv,
+ &dbmp->reginfo[n_cache], &bhp->mutex, 0)) != 0)
+ goto err;
}
-#endif
- if ((ret = __db_shmutex_init(dbenv, &bhp->mutex,
- R_OFFSET(dbmp->reginfo, &bhp->mutex) + DB_FCNTL_OFF_MPOOL,
- 0, &dbmp->reginfo[n_cache],
- (REGMAINT *)R_ADDR(&dbmp->reginfo[n_cache], c_mp->maint_off)))
- != 0) {
- __memp_bhfree(dbmp, bhp, 1);
- goto err;
+ DB_ASSERT(bhp->ref != 0);
+
+ /*
+ * If we're the only reference, update buffer and bucket priorities.
+ * We may be about to release the hash bucket lock, so everything
+ * should be correct first. (We've already done this if we created
+ * the buffer, so there is no need to do it again.)
+ */
+ if (state != SECOND_MISS && bhp->ref == 1) {
+ bhp->priority = UINT32_T_MAX;
+ SH_TAILQ_REMOVE(&hp->hash_bucket, bhp, hq, __bh);
+ SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, bhp, hq);
+ hp->hash_priority =
+ SH_TAILQ_FIRST(&hp->hash_bucket, __bh)->priority;
}
/*
- * If we created the page, zero it out and continue.
- *
- * !!!
- * Note: DB_MPOOL_NEW specifically doesn't call the pgin function.
- * If DB_MPOOL_CREATE is used, then the application's pgin function
- * has to be able to handle pages of 0's -- if it uses DB_MPOOL_NEW,
- * it can detect all of its page creates, and not bother.
+ * BH_TRASH --
+ * The buffer we found may need to be filled from the disk.
*
- * If we're running in diagnostic mode, smash any bytes on the
- * page that are unknown quantities for the caller.
- *
- * Otherwise, read the page into memory, optionally creating it if
- * DB_MPOOL_CREATE is set.
+ * It's possible for the read function to fail, which means we fail as
+ * well. Note, the __memp_pgread() function discards and reacquires
+ * the hash lock, so the buffer must be pinned down so that it cannot
+ * move and its contents are unchanged. Discard the buffer on failure
+ * unless another thread is waiting on our I/O to complete. It's OK to
+ * leave the buffer around, as the waiting thread will see the BH_TRASH
+ * flag set, and will also attempt to discard it. If there's a waiter,
+ * we need to decrement our reference count.
*/
- if (LF_ISSET(DB_MPOOL_NEW | DB_MPOOL_NEW_GROUP)) {
- if (mfp->clear_len == 0)
- memset(bhp->buf, 0, mfp->stat.st_pagesize);
- else {
- memset(bhp->buf, 0, mfp->clear_len);
-#ifdef DIAGNOSTIC
- memset(bhp->buf + mfp->clear_len, CLEAR_BYTE,
- mfp->stat.st_pagesize - mfp->clear_len);
-#endif
- }
+ if (F_ISSET(bhp, BH_TRASH) &&
+ (ret = __memp_pgread(dbmfp,
+ &hp->hash_mutex, bhp, LF_ISSET(DB_MPOOL_CREATE) ? 1 : 0)) != 0)
+ goto err;
- ++mfp->stat.st_page_create;
- } else {
- /*
- * It's possible for the read function to fail, which means
- * that we fail as well. Note, the __memp_pgread() function
- * discards the region lock, so the buffer must be pinned
- * down so that it cannot move and its contents are unchanged.
- */
-reread: if ((ret = __memp_pgread(dbmfp,
- bhp, LF_ISSET(DB_MPOOL_CREATE|DB_MPOOL_EXTENT))) != 0) {
- /*
- * !!!
- * Discard the buffer unless another thread is waiting
- * on our I/O to complete. Regardless, the header has
- * the BH_TRASH flag set.
- */
- if (bhp->ref == 1)
- __memp_bhfree(dbmp, bhp, 1);
+ /*
+ * BH_CALLPGIN --
+ * The buffer was converted so it could be written to disk, and
+ * its contents now have to be converted back before use.
+ */
+ if (F_ISSET(bhp, BH_CALLPGIN)) {
+ if ((ret = __memp_pg(dbmfp, bhp, 1)) != 0)
goto err;
- }
-
- ++mfp->stat.st_cache_miss;
+ F_CLR(bhp, BH_CALLPGIN);
}
+ MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
+
+#ifdef DIAGNOSTIC
+ /* Update the file's pinned reference count. */
+ R_LOCK(dbenv, dbmp->reginfo);
+ ++dbmfp->pinref;
+ R_UNLOCK(dbenv, dbmp->reginfo);
+
/*
- * If we're returning a page after our current notion of the last-page,
- * update our information. Note, there's no way to un-instantiate this
- * page, it's going to exist whether it's returned to us dirty or not.
+ * We want to switch threads as often as possible, and at awkward
+ * times. Yield every time we get a new page to ensure contention.
*/
- if (bhp->pgno > mfp->last_pgno)
- mfp->last_pgno = bhp->pgno;
+ if (F_ISSET(dbenv, DB_ENV_YIELDCPU))
+ __os_yield(dbenv, 1);
+#endif
*(void **)addrp = bhp->buf;
+ return (0);
-done: /* Update the chain search statistics. */
- if (st_hsearch) {
- ++c_mp->stat.st_hash_searches;
- if (st_hsearch > c_mp->stat.st_hash_longest)
- c_mp->stat.st_hash_longest = st_hsearch;
- c_mp->stat.st_hash_examined += st_hsearch;
+err: /*
+ * Discard our reference. If we're the only reference, discard the
+ * buffer entirely. If we held a reference to a buffer, we are
+ * also still holding the hash bucket mutex.
+ */
+ if (b_incr) {
+ if (bhp->ref == 1)
+ (void)__memp_bhfree(dbmp, hp, bhp, 1);
+ else {
+ --bhp->ref;
+ MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
+ }
}
- ++dbmfp->pinref;
+ /* If alloc_bhp is set, free the memory. */
+ if (alloc_bhp != NULL)
+ __db_shalloc_free(dbmp->reginfo[n_cache].addr, alloc_bhp);
- R_UNLOCK(dbenv, dbmp->reginfo);
+ return (ret);
+}
- return (0);
+#ifdef HAVE_FILESYSTEM_NOTZERO
+/*
+ * __memp_fs_notzero --
+ * Initialize the underlying allocated pages in the file.
+ */
+static int
+__memp_fs_notzero(dbenv, dbmfp, mfp, pgnoaddr)
+ DB_ENV *dbenv;
+ DB_MPOOLFILE *dbmfp;
+ MPOOLFILE *mfp;
+ db_pgno_t *pgnoaddr;
+{
+ DB_IO db_io;
+ u_int32_t i, npages;
+ size_t nw;
+ int ret;
+ u_int8_t *page;
+ char *fail;
-err: /* Discard our reference. */
- if (b_incr)
- --bhp->ref;
- R_UNLOCK(dbenv, dbmp->reginfo);
+ /*
+ * Pages allocated by writing pages past end-of-file are not zeroed
+ * on some systems. Recovery could theoretically be fooled by a page
+ * showing up that contained garbage. In order to avoid this, we
+ * have to write the pages out to disk, and flush them. The reason
+ * for the flush is because if we don't sync, the allocation of another
+ * page subsequent to this one might reach the disk first, and if we
+ * crashed at the right moment, leave us with this page as the one
+ * allocated by writing a page past it in the file.
+ *
+ * Hash is the only access method that allocates groups of pages. We
+ * know that it will use the existence of the last page in a group to
+ * signify that the entire group is OK; so, write all the pages but
+ * the last one in the group, flush them to disk, and then write the
+ * last one to disk and flush it.
+ */
+ if ((ret = __os_calloc(dbenv, 1, mfp->stat.st_pagesize, &page)) != 0)
+ return (ret);
+
+ db_io.fhp = dbmfp->fhp;
+ db_io.mutexp = dbmfp->mutexp;
+ db_io.pagesize = db_io.bytes = mfp->stat.st_pagesize;
+ db_io.buf = page;
+
+ npages = *pgnoaddr - mfp->last_pgno;
+ for (i = 1; i < npages; ++i) {
+ db_io.pgno = mfp->last_pgno + i;
+ if ((ret = __os_io(dbenv, &db_io, DB_IO_WRITE, &nw)) != 0) {
+ fail = "write";
+ goto err;
+ }
+ }
+ if (i != 1 && (ret = __os_fsync(dbenv, dbmfp->fhp)) != 0) {
+ fail = "sync";
+ goto err;
+ }
- *(void **)addrp = NULL;
+ db_io.pgno = mfp->last_pgno + npages;
+ if ((ret = __os_io(dbenv, &db_io, DB_IO_WRITE, &nw)) != 0) {
+ fail = "write";
+ goto err;
+ }
+ if ((ret = __os_fsync(dbenv, dbmfp->fhp)) != 0) {
+ fail = "sync";
+err: __db_err(dbenv, "%s: %s failed for page %lu",
+ __memp_fn(dbmfp), fail, (u_long)db_io.pgno);
+ }
+
+ __os_free(dbenv, page);
return (ret);
}
+#endif
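
The get path above pairs every successful page get with exactly one later put, and the page stays pinned in between. A minimal caller-side sketch of that protocol, using the DB_MPOOLFILE methods wired up in mp_fopen.c below; the function and its page contents are illustrative, not library code:

#include <string.h>
#include "db.h"

int
use_page(DB_MPOOLFILE *mpf, db_pgno_t pgno)
{
	void *addr;
	int ret;

	/* Pin the page, creating it (zero-filled) if it doesn't exist. */
	if ((ret = mpf->get(mpf, &pgno, DB_MPOOL_CREATE, &addr)) != 0)
		return (ret);

	memset(addr, 0, 16);		/* ... use the page ... */

	/* Unpin it; DB_MPOOL_DIRTY schedules the page to be written. */
	return (mpf->put(mpf, addr, DB_MPOOL_DIRTY));
}

DB_MPOOL_NEW would instead allocate a fresh page number into pgno, subject to the collision caveats in the long comment above.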
diff --git a/bdb/mp/mp_fopen.c b/bdb/mp/mp_fopen.c
index 3611ded18f4..a91bf264652 100644
--- a/bdb/mp/mp_fopen.c
+++ b/bdb/mp/mp_fopen.c
@@ -1,13 +1,13 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Copyright (c) 1996-2002
* Sleepycat Software. All rights reserved.
*/
#include "db_config.h"
#ifndef lint
-static const char revid[] = "$Id: mp_fopen.c,v 11.41 2001/01/10 04:50:53 ubell Exp $";
+static const char revid[] = "$Id: mp_fopen.c,v 11.90 2002/08/26 15:22:01 bostic Exp $";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
@@ -16,211 +16,464 @@ static const char revid[] = "$Id: mp_fopen.c,v 11.41 2001/01/10 04:50:53 ubell E
#include <string.h>
#endif
-#ifdef HAVE_RPC
-#include "db_server.h"
-#endif
-
#include "db_int.h"
-#include "db_shash.h"
-#include "mp.h"
-
-#ifdef HAVE_RPC
-#include "gen_client_ext.h"
-#include "rpc_client_ext.h"
-#endif
-
-static int __memp_mf_open __P((DB_MPOOL *, const char *,
- size_t, db_pgno_t, DB_MPOOL_FINFO *, u_int32_t, MPOOLFILE **));
+#include "dbinc/db_shash.h"
+#include "dbinc/mp.h"
+
+static int __memp_fclose __P((DB_MPOOLFILE *, u_int32_t));
+static int __memp_fopen __P((DB_MPOOLFILE *,
+ const char *, u_int32_t, int, size_t));
+static void __memp_get_fileid __P((DB_MPOOLFILE *, u_int8_t *));
+static void __memp_last_pgno __P((DB_MPOOLFILE *, db_pgno_t *));
+static void __memp_refcnt __P((DB_MPOOLFILE *, db_pgno_t *));
+static int __memp_set_clear_len __P((DB_MPOOLFILE *, u_int32_t));
+static int __memp_set_fileid __P((DB_MPOOLFILE *, u_int8_t *));
+static int __memp_set_ftype __P((DB_MPOOLFILE *, int));
+static int __memp_set_lsn_offset __P((DB_MPOOLFILE *, int32_t));
+static int __memp_set_pgcookie __P((DB_MPOOLFILE *, DBT *));
+static int __memp_set_priority __P((DB_MPOOLFILE *, DB_CACHE_PRIORITY));
+static void __memp_set_unlink __P((DB_MPOOLFILE *, int));
+
+/* Initialization methods cannot be called after open is called. */
+#define MPF_ILLEGAL_AFTER_OPEN(dbmfp, name) \
+ if (F_ISSET(dbmfp, MP_OPEN_CALLED)) \
+ return (__db_mi_open((dbmfp)->dbmp->dbenv, name, 1));
/*
- * MEMP_FREMOVE --
- * Discard an MPOOLFILE and any buffers it references: update the flags
- * so we never try to write buffers associated with the file, nor can we
- * find it when looking for files to join. In addition, clear the ftype
- * field, there's no reason to post-process pages, they can be discarded
- * by any thread.
- */
-#define MEMP_FREMOVE(mfp) { \
- mfp->ftype = 0; \
- F_SET(mfp, MP_DEADFILE); \
-}
-
-/*
- * memp_fopen --
- * Open a backing file for the memory pool.
+ * __memp_fcreate --
+ * Create a DB_MPOOLFILE handle.
+ *
+ * PUBLIC: int __memp_fcreate __P((DB_ENV *, DB_MPOOLFILE **, u_int32_t));
*/
int
-memp_fopen(dbenv, path, flags, mode, pagesize, finfop, retp)
+__memp_fcreate(dbenv, retp, flags)
DB_ENV *dbenv;
- const char *path;
- u_int32_t flags;
- int mode;
- size_t pagesize;
- DB_MPOOL_FINFO *finfop;
DB_MPOOLFILE **retp;
+ u_int32_t flags;
{
DB_MPOOL *dbmp;
+ DB_MPOOLFILE *dbmfp;
int ret;
-#ifdef HAVE_RPC
- if (F_ISSET(dbenv, DB_ENV_RPCCLIENT))
- return (__dbcl_memp_fopen(dbenv, path, flags,
- mode, pagesize, finfop, retp));
-#endif
-
PANIC_CHECK(dbenv);
- ENV_REQUIRES_CONFIG(dbenv, dbenv->mp_handle, DB_INIT_MPOOL);
+ ENV_REQUIRES_CONFIG(dbenv,
+ dbenv->mp_handle, "memp_fcreate", DB_INIT_MPOOL);
dbmp = dbenv->mp_handle;
/* Validate arguments. */
- if ((ret = __db_fchk(dbenv, "memp_fopen", flags,
- DB_CREATE |
- DB_NOMMAP | DB_ODDFILESIZE | DB_RDONLY | DB_TRUNCATE)) != 0)
+ if ((ret = __db_fchk(dbenv, "memp_fcreate", flags, 0)) != 0)
return (ret);
- /* Require a non-zero pagesize. */
- if (pagesize == 0 ||
- (finfop != NULL && finfop->clear_len > pagesize)) {
- __db_err(dbenv, "memp_fopen: illegal page size.");
- return (EINVAL);
+ /* Allocate and initialize the per-process structure. */
+ if ((ret = __os_calloc(dbenv, 1, sizeof(DB_MPOOLFILE), &dbmfp)) != 0)
+ return (ret);
+ if ((ret = __os_calloc(dbenv, 1, sizeof(DB_FH), &dbmfp->fhp)) != 0)
+ goto err;
+
+ /* Allocate and initialize a mutex if necessary. */
+ if (F_ISSET(dbenv, DB_ENV_THREAD) &&
+ (ret = __db_mutex_setup(dbenv, dbmp->reginfo, &dbmfp->mutexp,
+ MUTEX_ALLOC | MUTEX_THREAD)) != 0)
+ goto err;
+
+ dbmfp->ref = 1;
+ dbmfp->lsn_offset = -1;
+ dbmfp->dbmp = dbmp;
+ dbmfp->mfp = INVALID_ROFF;
+
+ dbmfp->close = __memp_fclose;
+ dbmfp->get = __memp_fget;
+ dbmfp->get_fileid = __memp_get_fileid;
+ dbmfp->last_pgno = __memp_last_pgno;
+ dbmfp->open = __memp_fopen;
+ dbmfp->put = __memp_fput;
+ dbmfp->refcnt = __memp_refcnt;
+ dbmfp->set = __memp_fset;
+ dbmfp->set_clear_len = __memp_set_clear_len;
+ dbmfp->set_fileid = __memp_set_fileid;
+ dbmfp->set_ftype = __memp_set_ftype;
+ dbmfp->set_lsn_offset = __memp_set_lsn_offset;
+ dbmfp->set_pgcookie = __memp_set_pgcookie;
+ dbmfp->set_priority = __memp_set_priority;
+ dbmfp->set_unlink = __memp_set_unlink;
+ dbmfp->sync = __memp_fsync;
+
+ *retp = dbmfp;
+ return (0);
+
+err: if (dbmfp != NULL) {
+ if (dbmfp->fhp != NULL)
+ (void)__os_free(dbenv, dbmfp->fhp);
+ (void)__os_free(dbenv, dbmfp);
}
+ return (ret);
+}
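
Every configuration method below is guarded by MPF_ILLEGAL_AFTER_OPEN, so a handle is configured strictly between memp_fcreate and open. A sketch of that order, with an illustrative file name, page size, and clear length:

#include "db.h"

int
open_pool_file(DB_ENV *dbenv, DB_MPOOLFILE **mpfp)
{
	DB_MPOOLFILE *mpf;
	int ret;

	if ((ret = dbenv->memp_fcreate(dbenv, &mpf, 0)) != 0)
		return (ret);

	/* Configuration must precede open. */
	(void)mpf->set_clear_len(mpf, 32);
	(void)mpf->set_ftype(mpf, 1);

	if ((ret = mpf->open(mpf, "example.db", DB_CREATE, 0644, 1024)) != 0) {
		(void)mpf->close(mpf, 0);
		return (ret);
	}
	*mpfp = mpf;
	return (0);
}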
- return (__memp_fopen(dbmp,
- NULL, path, flags, mode, pagesize, 1, finfop, retp));
+/*
+ * __memp_set_clear_len --
+ * Set the clear length.
+ */
+static int
+__memp_set_clear_len(dbmfp, clear_len)
+ DB_MPOOLFILE *dbmfp;
+ u_int32_t clear_len;
+{
+ MPF_ILLEGAL_AFTER_OPEN(dbmfp, "set_clear_len");
+
+ dbmfp->clear_len = clear_len;
+ return (0);
}
/*
- * __memp_set_unlink -- set unlink on last close flag.
- *
- * PUBLIC: void __memp_set_unlink __P((DB_MPOOLFILE *));
+ * __memp_set_fileid --
+ * Set the file ID.
*/
-void
-__memp_set_unlink(dbmpf)
- DB_MPOOLFILE *dbmpf;
+static int
+__memp_set_fileid(dbmfp, fileid)
+ DB_MPOOLFILE *dbmfp;
+ u_int8_t *fileid;
{
- DB_MPOOL *dbmp;
- dbmp = dbmpf->dbmp;
+ MPF_ILLEGAL_AFTER_OPEN(dbmfp, "set_fileid");
- R_LOCK(dbmp->dbenv, dbmp->reginfo);
- F_SET(dbmpf->mfp, MP_UNLINK);
- R_UNLOCK(dbmp->dbenv, dbmp->reginfo);
+ /*
+ * XXX
+ * This is dangerous -- we're saving the caller's pointer instead
+ * of allocating memory and copying the contents.
+ */
+ dbmfp->fileid = fileid;
+ return (0);
}
/*
- * __memp_clear_unlink -- clear unlink on last close flag.
- *
- * PUBLIC: void __memp_clear_unlink __P((DB_MPOOLFILE *));
+ * __memp_set_ftype --
+ * Set the file type (as registered).
*/
-void
-__memp_clear_unlink(dbmpf)
- DB_MPOOLFILE *dbmpf;
+static int
+__memp_set_ftype(dbmfp, ftype)
+ DB_MPOOLFILE *dbmfp;
+ int ftype;
+{
+ MPF_ILLEGAL_AFTER_OPEN(dbmfp, "set_ftype");
+
+ dbmfp->ftype = ftype;
+ return (0);
+}
+
+/*
+ * __memp_set_lsn_offset --
+ * Set the page's LSN offset.
+ */
+static int
+__memp_set_lsn_offset(dbmfp, lsn_offset)
+ DB_MPOOLFILE *dbmfp;
+ int32_t lsn_offset;
+{
+ MPF_ILLEGAL_AFTER_OPEN(dbmfp, "set_lsn_offset");
+
+ dbmfp->lsn_offset = lsn_offset;
+ return (0);
+}
+
+/*
+ * __memp_set_pgcookie --
+ * Set the pgin/pgout cookie.
+ */
+static int
+__memp_set_pgcookie(dbmfp, pgcookie)
+ DB_MPOOLFILE *dbmfp;
+ DBT *pgcookie;
+{
+ MPF_ILLEGAL_AFTER_OPEN(dbmfp, "set_pgcookie");
+
+ dbmfp->pgcookie = pgcookie;
+ return (0);
+}
+
+/*
+ * __memp_set_priority --
+ * Set the cache priority for pages from this file.
+ */
+static int
+__memp_set_priority(dbmfp, priority)
+ DB_MPOOLFILE *dbmfp;
+ DB_CACHE_PRIORITY priority;
+{
+ switch (priority) {
+ case DB_PRIORITY_VERY_LOW:
+ dbmfp->mfp->priority = MPOOL_PRI_VERY_LOW;
+ break;
+ case DB_PRIORITY_LOW:
+ dbmfp->mfp->priority = MPOOL_PRI_LOW;
+ break;
+ case DB_PRIORITY_DEFAULT:
+ dbmfp->mfp->priority = MPOOL_PRI_DEFAULT;
+ break;
+ case DB_PRIORITY_HIGH:
+ dbmfp->mfp->priority = MPOOL_PRI_HIGH;
+ break;
+ case DB_PRIORITY_VERY_HIGH:
+ dbmfp->mfp->priority = MPOOL_PRI_VERY_HIGH;
+ break;
+ default:
+ __db_err(dbmfp->dbmp->dbenv,
+ "Unknown priority value: %d", priority);
+ return (EINVAL);
+ }
+
+ return (0);
+}
+
+/*
+ * __memp_fopen --
+ * Open a backing file for the memory pool.
+ */
+static int
+__memp_fopen(dbmfp, path, flags, mode, pagesize)
+ DB_MPOOLFILE *dbmfp;
+ const char *path;
+ u_int32_t flags;
+ int mode;
+ size_t pagesize;
{
+ DB_ENV *dbenv;
DB_MPOOL *dbmp;
- dbmp = dbmpf->dbmp;
+ int ret;
+
+ dbmp = dbmfp->dbmp;
+ dbenv = dbmp->dbenv;
+
+ PANIC_CHECK(dbenv);
+
+ /* Validate arguments. */
+ if ((ret = __db_fchk(dbenv, "memp_fopen", flags,
+ DB_CREATE | DB_DIRECT | DB_EXTENT |
+ DB_NOMMAP | DB_ODDFILESIZE | DB_RDONLY | DB_TRUNCATE)) != 0)
+ return (ret);
/*
- * This bit is protected in the queue code because the metapage
- * is locked so we can avoid geting the region lock.
- * If this gets used from other than the queue code, we cannot.
+ * Require a non-zero, power-of-two pagesize that is at least as
+ * large as the clear length.
*/
- if (!F_ISSET(dbmpf->mfp, MP_UNLINK))
- return;
- R_LOCK(dbmp->dbenv, dbmp->reginfo);
- F_CLR(dbmpf->mfp, MP_UNLINK);
- R_UNLOCK(dbmp->dbenv, dbmp->reginfo);
+ if (pagesize == 0 || !POWER_OF_TWO(pagesize)) {
+ __db_err(dbenv,
+ "memp_fopen: page sizes must be a power-of-2");
+ return (EINVAL);
+ }
+ if (dbmfp->clear_len > pagesize) {
+ __db_err(dbenv,
+ "memp_fopen: clear length larger than page size");
+ return (EINVAL);
+ }
+
+ /* Read-only checks, and local flag. */
+ if (LF_ISSET(DB_RDONLY) && path == NULL) {
+ __db_err(dbenv,
+ "memp_fopen: temporary files can't be readonly");
+ return (EINVAL);
+ }
+
+ return (__memp_fopen_int(dbmfp, NULL, path, flags, mode, pagesize));
}
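
POWER_OF_TWO itself comes from a header outside this diff; a plausible definition, assuming the usual bit trick, is:

/*
 * Assumed definition: a power of two has exactly one bit set, so
 * clearing the lowest set bit must leave zero.  (x == 0 also passes
 * this test, which is why the pagesize == 0 case is rejected first.)
 */
#define	POWER_OF_TWO(x)	(((x) & ((x) - 1)) == 0)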
/*
- * __memp_fopen --
+ * __memp_fopen_int --
* Open a backing file for the memory pool; internal version.
*
- * PUBLIC: int __memp_fopen __P((DB_MPOOL *, MPOOLFILE *, const char *,
- * PUBLIC: u_int32_t, int, size_t, int, DB_MPOOL_FINFO *, DB_MPOOLFILE **));
+ * PUBLIC: int __memp_fopen_int __P((DB_MPOOLFILE *,
+ * PUBLIC: MPOOLFILE *, const char *, u_int32_t, int, size_t));
*/
int
-__memp_fopen(dbmp, mfp, path, flags, mode, pagesize, needlock, finfop, retp)
- DB_MPOOL *dbmp;
+__memp_fopen_int(dbmfp, mfp, path, flags, mode, pagesize)
+ DB_MPOOLFILE *dbmfp;
MPOOLFILE *mfp;
const char *path;
u_int32_t flags;
- int mode, needlock;
+ int mode;
size_t pagesize;
- DB_MPOOL_FINFO *finfop;
- DB_MPOOLFILE **retp;
{
DB_ENV *dbenv;
- DB_MPOOLFILE *dbmfp;
- DB_MPOOL_FINFO finfo;
+ DB_MPOOL *dbmp;
+ MPOOL *mp;
db_pgno_t last_pgno;
size_t maxmap;
u_int32_t mbytes, bytes, oflags;
- int ret;
+ int mfp_alloc, ret;
u_int8_t idbuf[DB_FILE_ID_LEN];
char *rpath;
+ void *p;
+ dbmp = dbmfp->dbmp;
dbenv = dbmp->dbenv;
- ret = 0;
+ mp = dbmp->reginfo[0].primary;
+ mfp_alloc = ret = 0;
rpath = NULL;
/*
- * If mfp is provided, we take the DB_MPOOL_FINFO information from
- * the mfp. We don't bother initializing everything, because some
- * of them are expensive to acquire. If no mfp is provided and the
- * finfop argument is NULL, we default the values.
+ * Set the page size so os_open can decide whether to turn buffering
+ * off if the DB_DIRECT_DB flag is set.
*/
- if (finfop == NULL) {
- memset(&finfo, 0, sizeof(finfo));
- if (mfp != NULL) {
- finfo.ftype = mfp->ftype;
- finfo.pgcookie = NULL;
- finfo.fileid = NULL;
- finfo.lsn_offset = mfp->lsn_off;
- finfo.clear_len = mfp->clear_len;
- } else {
- finfo.ftype = 0;
- finfo.pgcookie = NULL;
- finfo.fileid = NULL;
- finfo.lsn_offset = -1;
- finfo.clear_len = 0;
- }
- finfop = &finfo;
- }
+ dbmfp->fhp->pagesize = (u_int32_t)pagesize;
- /* Allocate and initialize the per-process structure. */
- if ((ret = __os_calloc(dbenv, 1, sizeof(DB_MPOOLFILE), &dbmfp)) != 0)
- return (ret);
- dbmfp->dbmp = dbmp;
- dbmfp->ref = 1;
- if (LF_ISSET(DB_RDONLY))
+ /*
+ * If it's a temporary file, delay the open until we actually need
+ * to write the file, and we know we can't join any existing files.
+ */
+ if (path == NULL)
+ goto alloc;
+
+ /*
+ * Get the real name for this file and open it. If it's a Queue extent
+ * file, it may not exist, and that's OK.
+ */
+ oflags = 0;
+ if (LF_ISSET(DB_CREATE))
+ oflags |= DB_OSO_CREATE;
+ if (LF_ISSET(DB_DIRECT))
+ oflags |= DB_OSO_DIRECT;
+ if (LF_ISSET(DB_RDONLY)) {
F_SET(dbmfp, MP_READONLY);
+ oflags |= DB_OSO_RDONLY;
+ }
+ if ((ret =
+ __db_appname(dbenv, DB_APP_DATA, path, 0, NULL, &rpath)) != 0)
+ goto err;
+ if ((ret = __os_open(dbenv, rpath, oflags, mode, dbmfp->fhp)) != 0) {
+ if (!LF_ISSET(DB_EXTENT))
+ __db_err(dbenv, "%s: %s", rpath, db_strerror(ret));
+ goto err;
+ }
- if (path == NULL) {
- if (LF_ISSET(DB_RDONLY)) {
- __db_err(dbenv,
- "memp_fopen: temporary files can't be readonly");
- ret = EINVAL;
+ /*
+ * Get the file id if we weren't given one. Generated file id's
+ * don't use timestamps, otherwise there'd be no chance of any
+ * other process joining the party.
+ */
+ if (dbmfp->fileid == NULL) {
+ if ((ret = __os_fileid(dbenv, rpath, 0, idbuf)) != 0)
goto err;
+ dbmfp->fileid = idbuf;
+ }
+
+ /*
+ * If our caller knows what mfp we're using, increment the ref count,
+ * no need to search.
+ *
+ * We don't need to acquire a lock other than the mfp itself, because
+ * we know there's another reference and it's not going away.
+ */
+ if (mfp != NULL) {
+ MUTEX_LOCK(dbenv, &mfp->mutex);
+ ++mfp->mpf_cnt;
+ MUTEX_UNLOCK(dbenv, &mfp->mutex);
+ goto check_map;
+ }
+
+ /*
+ * If not creating a temporary file, walk the list of MPOOLFILE's,
+ * looking for a matching file. Files backed by temporary files
+ * or previously removed files can't match.
+ *
+ * DB_TRUNCATE support.
+ *
+ * The fileID is a filesystem unique number (e.g., a UNIX dev/inode
+ * pair) plus a timestamp. If files are removed and created in less
+ * than a second, the fileID can be repeated. The problem with
+ * repetition happens when the file that previously had the fileID
+ * value still has pages in the pool, since we don't want to use them
+ * to satisfy requests for the new file.
+ *
+ * Because the DB_TRUNCATE flag reuses the dev/inode pair, repeated
+ * opens with that flag set guarantee matching fileIDs when the
+ * machine can open a file and then re-open with truncate within a
+ * second. For this reason, we pass that flag down, and, if we find
+ * a matching entry, we ensure that it's never found again, and we
+ * create a new entry for the current request.
+ */
+ R_LOCK(dbenv, dbmp->reginfo);
+ for (mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile);
+ mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) {
+ /* Skip dead files and temporary files. */
+ if (F_ISSET(mfp, MP_DEADFILE | MP_TEMP))
+ continue;
+
+ /* Skip non-matching files. */
+ if (memcmp(dbmfp->fileid, R_ADDR(dbmp->reginfo,
+ mfp->fileid_off), DB_FILE_ID_LEN) != 0)
+ continue;
+
+ /*
+ * If the file is being truncated, remove it from the system
+ * and create a new entry.
+ *
+ * !!!
+ * We should be able to set mfp to NULL and break out of the
+ * loop, but I like the idea of checking all the entries.
+ */
+ if (LF_ISSET(DB_TRUNCATE)) {
+ MUTEX_LOCK(dbenv, &mfp->mutex);
+ MPOOLFILE_IGNORE(mfp);
+ MUTEX_UNLOCK(dbenv, &mfp->mutex);
+ continue;
}
- last_pgno = 0;
- } else {
- /* Get the real name for this file and open it. */
- if ((ret = __db_appname(dbenv,
- DB_APP_DATA, NULL, path, 0, NULL, &rpath)) != 0)
- goto err;
- oflags = 0;
- if (LF_ISSET(DB_CREATE))
- oflags |= DB_OSO_CREATE;
- if (LF_ISSET(DB_RDONLY))
- oflags |= DB_OSO_RDONLY;
- if ((ret =
- __os_open(dbenv, rpath, oflags, mode, &dbmfp->fh)) != 0) {
- if (!LF_ISSET(DB_EXTENT))
- __db_err(dbenv,
- "%s: %s", rpath, db_strerror(ret));
+
+ /*
+ * Some things about a file cannot be changed: the clear length,
+ * page size, or LSN location.
+ *
+ * The file type can change if the application's pre- and post-
+ * processing needs change. For example, an application might
+ * create a hash subdatabase in a database that was previously
+ * all btree.
+ *
+ * XXX
+ * We do not check to see if the pgcookie information changed,
+ * or update it if it has; this might be a bug.
+ */
+ if (dbmfp->clear_len != mfp->clear_len ||
+ pagesize != mfp->stat.st_pagesize ||
+ dbmfp->lsn_offset != mfp->lsn_off) {
+ __db_err(dbenv,
+ "%s: clear length, page size or LSN location changed",
+ path);
+ R_UNLOCK(dbenv, dbmp->reginfo);
+ ret = EINVAL;
goto err;
}
+ if (dbmfp->ftype != 0)
+ mfp->ftype = dbmfp->ftype;
+
+ MUTEX_LOCK(dbenv, &mfp->mutex);
+ ++mfp->mpf_cnt;
+ MUTEX_UNLOCK(dbenv, &mfp->mutex);
+ break;
+ }
+ R_UNLOCK(dbenv, dbmp->reginfo);
+
+ if (mfp != NULL)
+ goto check_map;
+
+alloc: /* Allocate and initialize a new MPOOLFILE. */
+ if ((ret = __memp_alloc(
+ dbmp, dbmp->reginfo, NULL, sizeof(MPOOLFILE), NULL, &mfp)) != 0)
+ goto err;
+ mfp_alloc = 1;
+ memset(mfp, 0, sizeof(MPOOLFILE));
+ mfp->mpf_cnt = 1;
+ mfp->ftype = dbmfp->ftype;
+ mfp->stat.st_pagesize = pagesize;
+ mfp->lsn_off = dbmfp->lsn_offset;
+ mfp->clear_len = dbmfp->clear_len;
+
+ if (LF_ISSET(DB_DIRECT))
+ F_SET(mfp, MP_DIRECT);
+ if (LF_ISSET(DB_EXTENT))
+ F_SET(mfp, MP_EXTENT);
+
+ if (path == NULL)
+ F_SET(mfp, MP_TEMP);
+ else {
/*
* Don't permit files that aren't a multiple of the pagesize,
* and find the number of the last page in the file, all the
@@ -234,93 +487,84 @@ __memp_fopen(dbmp, mfp, path, flags, mode, pagesize, needlock, finfop, retp)
* environments where an off_t is 32-bits, but still run where
* offsets are 64-bits, and they pay us a lot of money.
*/
- if ((ret = __os_ioinfo(dbenv, rpath,
- &dbmfp->fh, &mbytes, &bytes, NULL)) != 0) {
+ if ((ret = __os_ioinfo(
+ dbenv, rpath, dbmfp->fhp, &mbytes, &bytes, NULL)) != 0) {
__db_err(dbenv, "%s: %s", rpath, db_strerror(ret));
goto err;
}
/*
- * If we're doing a verify, we might have to cope with
- * a truncated file; if the file size is not a multiple
- * of the page size, round down to a page--we'll
- * take care of the partial page outside the memp system.
+ * During verify or recovery, we might have to cope with a
+ * truncated file; if the file size is not a multiple of the
+ * page size, round down to a page; we'll take care of the
+ * partial page outside the mpool system.
*/
-
- /* Page sizes have to be a power-of-two, ignore mbytes. */
if (bytes % pagesize != 0) {
if (LF_ISSET(DB_ODDFILESIZE))
- /*
- * If we're doing a verify, we might
- * have to cope with a truncated file;
- * round down, we'll worry about the partial
- * page outside the memp system.
- */
- bytes -= (bytes % pagesize);
+ bytes -= (u_int32_t)(bytes % pagesize);
else {
__db_err(dbenv,
- "%s: file size not a multiple of the pagesize",
- rpath);
+ "%s: file size not a multiple of the pagesize", rpath);
ret = EINVAL;
goto err;
}
}
- last_pgno = mbytes * (MEGABYTE / pagesize);
- last_pgno += bytes / pagesize;
-
- /* Correction: page numbers are zero-based, not 1-based. */
+ /*
+ * If the user specifies DB_MPOOL_LAST or DB_MPOOL_NEW on a
+ * page get, we have to know the last page in the file.
+ * Figure it out and save it away.
+ *
+ * Note correction: page numbers are zero-based, not 1-based.
+ */
+ last_pgno = (db_pgno_t)(mbytes * (MEGABYTE / pagesize));
+ last_pgno += (db_pgno_t)(bytes / pagesize);
if (last_pgno != 0)
--last_pgno;
+ mfp->orig_last_pgno = mfp->last_pgno = last_pgno;
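+ /*
+ * Worked example, with illustrative numbers: a 1024-byte pagesize
+ * and a file of 2 megabytes + 3072 bytes gives mbytes = 2 and
+ * bytes = 3072, so last_pgno = 2 * (1048576 / 1024) + 3072 / 1024
+ * = 2048 + 3 = 2051 pages in all; the zero-based correction then
+ * leaves last_pgno = 2050.
+ */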
- /*
- * Get the file id if we weren't given one. Generated file id's
- * don't use timestamps, otherwise there'd be no chance of any
- * other process joining the party.
- */
- if (finfop->fileid == NULL) {
- if ((ret = __os_fileid(dbenv, rpath, 0, idbuf)) != 0)
- goto err;
- finfop->fileid = idbuf;
- }
- }
+ /* Copy the file path into shared memory. */
+ if ((ret = __memp_alloc(dbmp, dbmp->reginfo,
+ NULL, strlen(path) + 1, &mfp->path_off, &p)) != 0)
+ goto err;
+ memcpy(p, path, strlen(path) + 1);
- /*
- * If we weren't provided an underlying shared object to join with,
- * find/allocate the shared file objects. Also allocate space for
- * for the per-process thread lock.
- */
- if (needlock)
- R_LOCK(dbenv, dbmp->reginfo);
- if (mfp == NULL)
- ret = __memp_mf_open(
- dbmp, path, pagesize, last_pgno, finfop, flags, &mfp);
- else {
- ++mfp->mpf_cnt;
- ret = 0;
+ /* Copy the file identification string into shared memory. */
+ if ((ret = __memp_alloc(dbmp, dbmp->reginfo,
+ NULL, DB_FILE_ID_LEN, &mfp->fileid_off, &p)) != 0)
+ goto err;
+ memcpy(p, dbmfp->fileid, DB_FILE_ID_LEN);
}
- if (needlock)
- R_UNLOCK(dbenv, dbmp->reginfo);
- if (ret != 0)
- goto err;
- if (F_ISSET(dbenv, DB_ENV_THREAD)) {
- if ((ret = __db_mutex_alloc(
- dbenv, dbmp->reginfo, &dbmfp->mutexp)) != 0)
- goto err;
- if ((ret = __db_mutex_init(
- dbenv, dbmfp->mutexp, 0, MUTEX_THREAD)) != 0)
+ /* Copy the page cookie into shared memory. */
+ if (dbmfp->pgcookie == NULL || dbmfp->pgcookie->size == 0) {
+ mfp->pgcookie_len = 0;
+ mfp->pgcookie_off = 0;
+ } else {
+ if ((ret = __memp_alloc(dbmp, dbmp->reginfo,
+ NULL, dbmfp->pgcookie->size, &mfp->pgcookie_off, &p)) != 0)
goto err;
-
- /* XXX: KEITH: CLOSE THE FILE ON FAILURE? */
+ memcpy(p, dbmfp->pgcookie->data, dbmfp->pgcookie->size);
+ mfp->pgcookie_len = dbmfp->pgcookie->size;
}
- dbmfp->mfp = mfp;
+ /*
+ * Prepend the MPOOLFILE to the list of MPOOLFILE's.
+ */
+ R_LOCK(dbenv, dbmp->reginfo);
+ ret = __db_mutex_setup(dbenv, dbmp->reginfo, &mfp->mutex,
+ MUTEX_NO_RLOCK);
+ if (ret == 0)
+ SH_TAILQ_INSERT_HEAD(&mp->mpfq, mfp, q, __mpoolfile);
+ R_UNLOCK(dbenv, dbmp->reginfo);
+ if (ret != 0)
+ goto err;
+check_map:
/*
* If a file:
- * + is read-only
* + isn't temporary
+ * + is read-only
* + doesn't require any pgin/pgout support
* + the DB_NOMMAP flag wasn't set (in either the file open or
* the environment in which it was opened)
@@ -332,7 +576,6 @@ __memp_fopen(dbmp, mfp, path, flags, mode, pagesize, needlock, finfop, retp)
* NFS mounted partition, and we can fail in buffer I/O just as easily
* as here.
*
- * XXX
* We'd like to test to see if the file is too big to mmap. Since we
* don't know what size or type off_t's or size_t's are, or the largest
* unsigned integral type is, or what random insanity the local C
@@ -341,11 +584,11 @@ __memp_fopen(dbmp, mfp, path, flags, mode, pagesize, needlock, finfop, retp)
*/
#define DB_MAXMMAPSIZE (10 * 1024 * 1024) /* 10 Mb. */
if (F_ISSET(mfp, MP_CAN_MMAP)) {
- if (!F_ISSET(dbmfp, MP_READONLY))
- F_CLR(mfp, MP_CAN_MMAP);
if (path == NULL)
F_CLR(mfp, MP_CAN_MMAP);
- if (finfop->ftype != 0)
+ if (!F_ISSET(dbmfp, MP_READONLY))
+ F_CLR(mfp, MP_CAN_MMAP);
+ if (dbmfp->ftype != 0)
F_CLR(mfp, MP_CAN_MMAP);
if (LF_ISSET(DB_NOMMAP) || F_ISSET(dbenv, DB_ENV_NOMMAP))
F_CLR(mfp, MP_CAN_MMAP);
@@ -354,260 +597,239 @@ __memp_fopen(dbmp, mfp, path, flags, mode, pagesize, needlock, finfop, retp)
if (mbytes > maxmap / MEGABYTE ||
(mbytes == maxmap / MEGABYTE && bytes >= maxmap % MEGABYTE))
F_CLR(mfp, MP_CAN_MMAP);
- }
- dbmfp->addr = NULL;
- if (F_ISSET(mfp, MP_CAN_MMAP)) {
- dbmfp->len = (size_t)mbytes * MEGABYTE + bytes;
- if (__os_mapfile(dbenv, rpath,
- &dbmfp->fh, dbmfp->len, 1, &dbmfp->addr) != 0) {
- dbmfp->addr = NULL;
- F_CLR(mfp, MP_CAN_MMAP);
+
+ dbmfp->addr = NULL;
+ if (F_ISSET(mfp, MP_CAN_MMAP)) {
+ dbmfp->len = (size_t)mbytes * MEGABYTE + bytes;
+ if (__os_mapfile(dbenv, rpath,
+ dbmfp->fhp, dbmfp->len, 1, &dbmfp->addr) != 0) {
+ dbmfp->addr = NULL;
+ F_CLR(mfp, MP_CAN_MMAP);
+ }
}
}
- if (rpath != NULL)
- __os_freestr(rpath);
+ dbmfp->mfp = mfp;
+
+ F_SET(dbmfp, MP_OPEN_CALLED);
+
+ /* Add the file to the process' list of DB_MPOOLFILEs. */
MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp);
TAILQ_INSERT_TAIL(&dbmp->dbmfq, dbmfp, q);
MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
- *retp = dbmfp;
- return (0);
+ if (0) {
+err: if (F_ISSET(dbmfp->fhp, DB_FH_VALID))
+ (void)__os_closehandle(dbenv, dbmfp->fhp);
+
+ if (mfp_alloc) {
+ R_LOCK(dbenv, dbmp->reginfo);
+ if (mfp->path_off != 0)
+ __db_shalloc_free(dbmp->reginfo[0].addr,
+ R_ADDR(dbmp->reginfo, mfp->path_off));
+ if (mfp->fileid_off != 0)
+ __db_shalloc_free(dbmp->reginfo[0].addr,
+ R_ADDR(dbmp->reginfo, mfp->fileid_off));
+ __db_shalloc_free(dbmp->reginfo[0].addr, mfp);
+ R_UNLOCK(dbenv, dbmp->reginfo);
+ }
-err: /*
- * Note that we do not have to free the thread mutex, because we
- * never get to here after we have successfully allocated it.
- */
- if (rpath != NULL)
- __os_freestr(rpath);
- if (F_ISSET(&dbmfp->fh, DB_FH_VALID))
- (void)__os_closehandle(&dbmfp->fh);
- if (dbmfp != NULL) {
- if (dbmfp->mutexp != NULL)
- __db_mutex_free(dbenv, dbmp->reginfo, dbmfp->mutexp);
- __os_free(dbmfp, sizeof(DB_MPOOLFILE));
}
+ if (rpath != NULL)
+ __os_free(dbenv, rpath);
return (ret);
}
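
Restated as a standalone predicate, the MP_CAN_MMAP narrowing above reads as follows; this is a reader's sketch with illustrative names, not library code:

#include <stddef.h>

static int
can_mmap(int is_temp, int is_readonly, int ftype,
    int nommap_flag, size_t len, size_t maxmap)
{
	if (is_temp)		/* Temporary files have no backing file. */
		return (0);
	if (!is_readonly)	/* Writable files must go through the cache. */
		return (0);
	if (ftype != 0)		/* pgin/pgout conversion precludes mapping. */
		return (0);
	if (nommap_flag)	/* DB_NOMMAP or DB_ENV_NOMMAP was given. */
		return (0);
	return (len <= maxmap);	/* Don't map very large files. */
}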
/*
- * __memp_mf_open --
- * Open an MPOOLFILE.
+ * __memp_get_fileid --
+ * Return the file ID.
+ *
+ * XXX
+ * Undocumented interface: DB private.
*/
-static int
-__memp_mf_open(dbmp, path, pagesize, last_pgno, finfop, flags, retp)
- DB_MPOOL *dbmp;
- const char *path;
- size_t pagesize;
- db_pgno_t last_pgno;
- DB_MPOOL_FINFO *finfop;
- u_int32_t flags;
- MPOOLFILE **retp;
+static void
+__memp_get_fileid(dbmfp, fidp)
+ DB_MPOOLFILE *dbmfp;
+ u_int8_t *fidp;
{
- MPOOL *mp;
- MPOOLFILE *mfp;
- int ret;
- void *p;
-
-#define ISTEMPORARY (path == NULL)
-
/*
- * If not creating a temporary file, walk the list of MPOOLFILE's,
- * looking for a matching file. Files backed by temporary files
- * or previously removed files can't match.
+ * No lock needed -- we're using the handle, it had better not
+ * be going away.
*
- * DB_TRUNCATE support.
- *
- * The fileID is a filesystem unique number (e.g., a UNIX dev/inode
- * pair) plus a timestamp. If files are removed and created in less
- * than a second, the fileID can be repeated. The problem with
- * repetition happens when the file that previously had the fileID
- * value still has pages in the pool, since we don't want to use them
- * to satisfy requests for the new file.
- *
- * Because the DB_TRUNCATE flag reuses the dev/inode pair, repeated
- * opens with that flag set guarantees matching fileIDs when the
- * machine can open a file and then re-open with truncate within a
- * second. For this reason, we pass that flag down, and, if we find
- * a matching entry, we ensure that it's never found again, and we
- * create a new entry for the current request.
+ * !!!
+ * Get the fileID out of the region, not out of the DB_MPOOLFILE
+ * structure because the DB_MPOOLFILE reference is possibly short
+ * lived, and isn't to be trusted.
*/
- if (!ISTEMPORARY) {
- mp = dbmp->reginfo[0].primary;
- for (mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile);
- mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) {
- if (F_ISSET(mfp, MP_DEADFILE | MP_TEMP))
- continue;
- if (memcmp(finfop->fileid, R_ADDR(dbmp->reginfo,
- mfp->fileid_off), DB_FILE_ID_LEN) == 0) {
- if (LF_ISSET(DB_TRUNCATE)) {
- MEMP_FREMOVE(mfp);
- continue;
- }
- if (finfop->clear_len != mfp->clear_len ||
- pagesize != mfp->stat.st_pagesize) {
- __db_err(dbmp->dbenv,
- "%s: page size or clear length changed",
- path);
- return (EINVAL);
- }
-
- /*
- * It's possible that our needs for pre- and
- * post-processing are changing. For example,
- * an application created a hash subdatabase
- * in a database that was previously all btree.
- */
- if (finfop->ftype != 0)
- mfp->ftype = finfop->ftype;
-
- ++mfp->mpf_cnt;
-
- *retp = mfp;
- return (0);
- }
- }
- }
+ memcpy(fidp, R_ADDR(
+ dbmfp->dbmp->reginfo, dbmfp->mfp->fileid_off), DB_FILE_ID_LEN);
+}
- /* Allocate a new MPOOLFILE. */
- if ((ret = __memp_alloc(
- dbmp, dbmp->reginfo, NULL, sizeof(MPOOLFILE), NULL, &mfp)) != 0)
- goto mem_err;
- *retp = mfp;
+/*
+ * __memp_last_pgno --
+ * Return the page number of the last page in the file.
+ *
+ * XXX
+ * Undocumented interface: DB private.
+ */
+static void
+__memp_last_pgno(dbmfp, pgnoaddr)
+ DB_MPOOLFILE *dbmfp;
+ db_pgno_t *pgnoaddr;
+{
+ DB_ENV *dbenv;
+ DB_MPOOL *dbmp;
- /* Initialize the structure. */
- memset(mfp, 0, sizeof(MPOOLFILE));
- mfp->mpf_cnt = 1;
- mfp->ftype = finfop->ftype;
- mfp->lsn_off = finfop->lsn_offset;
- mfp->clear_len = finfop->clear_len;
+ dbmp = dbmfp->dbmp;
+ dbenv = dbmp->dbenv;
- /*
- * If the user specifies DB_MPOOL_LAST or DB_MPOOL_NEW on a memp_fget,
- * we have to know the last page in the file. Figure it out and save
- * it away.
- */
- mfp->stat.st_pagesize = pagesize;
- mfp->orig_last_pgno = mfp->last_pgno = last_pgno;
+ R_LOCK(dbenv, dbmp->reginfo);
+ *pgnoaddr = dbmfp->mfp->last_pgno;
+ R_UNLOCK(dbenv, dbmp->reginfo);
+}
- if (ISTEMPORARY)
- F_SET(mfp, MP_TEMP);
- else {
- /* Copy the file path into shared memory. */
- if ((ret = __memp_alloc(dbmp, dbmp->reginfo,
- NULL, strlen(path) + 1, &mfp->path_off, &p)) != 0)
- goto err;
- memcpy(p, path, strlen(path) + 1);
+/*
+ * __memp_refcnt --
+ * Return the current reference count.
+ *
+ * XXX
+ * Undocumented interface: DB private.
+ */
+static void
+__memp_refcnt(dbmfp, cntp)
+ DB_MPOOLFILE *dbmfp;
+ db_pgno_t *cntp;
+{
+ DB_ENV *dbenv;
- /* Copy the file identification string into shared memory. */
- if ((ret = __memp_alloc(dbmp, dbmp->reginfo,
- NULL, DB_FILE_ID_LEN, &mfp->fileid_off, &p)) != 0)
- goto err;
- memcpy(p, finfop->fileid, DB_FILE_ID_LEN);
+ dbenv = dbmfp->dbmp->dbenv;
- F_SET(mfp, MP_CAN_MMAP);
- }
+ MUTEX_LOCK(dbenv, &dbmfp->mfp->mutex);
+ *cntp = dbmfp->mfp->mpf_cnt;
+ MUTEX_UNLOCK(dbenv, &dbmfp->mfp->mutex);
+}
- /* Copy the page cookie into shared memory. */
- if (finfop->pgcookie == NULL || finfop->pgcookie->size == 0) {
- mfp->pgcookie_len = 0;
- mfp->pgcookie_off = 0;
- } else {
- if ((ret = __memp_alloc(dbmp, dbmp->reginfo,
- NULL, finfop->pgcookie->size, &mfp->pgcookie_off, &p)) != 0)
- goto err;
- memcpy(p, finfop->pgcookie->data, finfop->pgcookie->size);
- mfp->pgcookie_len = finfop->pgcookie->size;
- }
+/*
+ * __memp_set_unlink --
+ * Set unlink on last close flag.
+ *
+ * XXX
+ * Undocumented interface: DB private.
+ */
+static void
+__memp_set_unlink(dbmpf, set)
+ DB_MPOOLFILE *dbmpf;
+ int set;
+{
+ DB_ENV *dbenv;
- /* Prepend the MPOOLFILE to the list of MPOOLFILE's. */
- mp = dbmp->reginfo[0].primary;
- SH_TAILQ_INSERT_HEAD(&mp->mpfq, mfp, q, __mpoolfile);
+ dbenv = dbmpf->dbmp->dbenv;
- if (0) {
-err: if (mfp->path_off != 0)
- __db_shalloc_free(dbmp->reginfo[0].addr,
- R_ADDR(dbmp->reginfo, mfp->path_off));
- if (mfp->fileid_off != 0)
- __db_shalloc_free(dbmp->reginfo[0].addr,
- R_ADDR(dbmp->reginfo, mfp->fileid_off));
- if (mfp != NULL)
- __db_shalloc_free(dbmp->reginfo[0].addr, mfp);
-mem_err: __db_err(dbmp->dbenv,
- "Unable to allocate memory for mpool file");
- }
- return (ret);
+ MUTEX_LOCK(dbenv, &dbmpf->mfp->mutex);
+ if (set)
+ F_SET(dbmpf->mfp, MP_UNLINK);
+ else
+ F_CLR(dbmpf->mfp, MP_UNLINK);
+ MUTEX_UNLOCK(dbenv, &dbmpf->mfp->mutex);
}
/*
* memp_fclose --
* Close a backing file for the memory pool.
*/
+static int
+__memp_fclose(dbmfp, flags)
+ DB_MPOOLFILE *dbmfp;
+ u_int32_t flags;
+{
+ DB_ENV *dbenv;
+ int ret, t_ret;
+
+ dbenv = dbmfp->dbmp->dbenv;
+
+ PANIC_CHECK(dbenv);
+
+ /*
+ * XXX
+ * DB_MPOOL_DISCARD: Undocumented flag: DB private.
+ */
+ ret = __db_fchk(dbenv, "DB_MPOOLFILE->close", flags, DB_MPOOL_DISCARD);
+
+ if ((t_ret = __memp_fclose_int(dbmfp, flags)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __memp_fclose_int --
+ * Internal version of __memp_fclose.
+ *
+ * PUBLIC: int __memp_fclose_int __P((DB_MPOOLFILE *, u_int32_t));
+ */
int
-memp_fclose(dbmfp)
+__memp_fclose_int(dbmfp, flags)
DB_MPOOLFILE *dbmfp;
+ u_int32_t flags;
{
DB_ENV *dbenv;
DB_MPOOL *dbmp;
MPOOLFILE *mfp;
char *rpath;
- int ret, t_ret;
+ int deleted, ret, t_ret;
dbmp = dbmfp->dbmp;
dbenv = dbmp->dbenv;
ret = 0;
- PANIC_CHECK(dbenv);
-
-#ifdef HAVE_RPC
- if (F_ISSET(dbenv, DB_ENV_RPCCLIENT))
- return (__dbcl_memp_fclose(dbmfp));
-#endif
-
/*
- * Remove the DB_MPOOLFILE from the queue. This has to happen before
- * we perform any action that can fail, otherwise __memp_close may
- * loop infinitely when calling us to discard all of the DB_MPOOLFILEs.
+ * We have to reference count DB_MPOOLFILE structures as other threads
+ * in the process may be using them. Here's the problem:
+ *
+ * Thread A opens a database.
+ * Thread B uses thread A's DB_MPOOLFILE to write a buffer
+ * in order to free up memory in the mpool cache.
+ * Thread A closes the database while thread B is using the
+ * DB_MPOOLFILE structure.
+ *
+ * By opening all databases before creating any threads, and closing
+ * the databases after all the threads have exited, applications get
+ * better performance and avoid the problem path entirely.
+ *
+ * Regardless, holding the DB_MPOOLFILE to flush a dirty buffer is a
+ * short-term lock, even in the worst case, since we had better be the only
+ * thread of control using the DB_MPOOLFILE structure to read pages
+ * *into* the cache. Wait until we're the only reference holder and
+ * remove the DB_MPOOLFILE structure from the list, so nobody else can
+ * find it. We do this, rather than have the last reference holder
+ * (whoever that might be) discard the DB_MPOOLFILE structure, because
+ * we'd rather write error messages to the application in the close
+ * routine, not in the checkpoint/sync routine.
+ *
+ * !!!
+ * It's possible the DB_MPOOLFILE was never added to the DB_MPOOLFILE
+ * list; check the MP_OPEN_CALLED flag to be sure.
*/
- for (;;) {
+ for (deleted = 0;;) {
MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp);
-
- /*
- * We have to reference count DB_MPOOLFILE structures as other
- * threads may be using them. The problem only happens if the
- * application makes a bad design choice. Here's the path:
- *
- * Thread A opens a database.
- * Thread B uses thread A's DB_MPOOLFILE to write a buffer
- * in order to free up memory in the mpool cache.
- * Thread A closes the database while thread B is using the
- * DB_MPOOLFILE structure.
- *
- * By opening all databases before creating the threads, and
- * closing them after the threads have exited, applications
- * get better performance and avoid the problem path entirely.
- *
- * Regardless, holding the DB_MPOOLFILE to flush a dirty buffer
- * is a short-term lock, even in worst case, since we better be
- * the only thread of control using the DB_MPOOLFILE structure
- * to read pages *into* the cache. Wait until we're the only
- * reference holder and remove the DB_MPOOLFILE structure from
- * the list, so nobody else can even find it.
- */
if (dbmfp->ref == 1) {
- TAILQ_REMOVE(&dbmp->dbmfq, dbmfp, q);
- break;
+ if (F_ISSET(dbmfp, MP_OPEN_CALLED))
+ TAILQ_REMOVE(&dbmp->dbmfq, dbmfp, q);
+ deleted = 1;
}
MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
- (void)__os_sleep(dbenv, 1, 0);
+ if (deleted)
+ break;
+ __os_sleep(dbenv, 1, 0);
}
- MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
/* Complain if pinned blocks never returned. */
- if (dbmfp->pinref != 0)
+ if (dbmfp->pinref != 0) {
__db_err(dbenv, "%s: close: %lu blocks left pinned",
__memp_fn(dbmfp), (u_long)dbmfp->pinref);
+ ret = __db_panic(dbenv, DB_RUNRECOVERY);
+ }
/* Discard any mmap information. */
if (dbmfp->addr != NULL &&
@@ -615,11 +837,11 @@ memp_fclose(dbmfp)
__db_err(dbenv, "%s: %s", __memp_fn(dbmfp), db_strerror(ret));
/* Close the file; temporary files may not yet have been created. */
- if (F_ISSET(&dbmfp->fh, DB_FH_VALID) &&
- (t_ret = __os_closehandle(&dbmfp->fh)) != 0) {
+ if (F_ISSET(dbmfp->fhp, DB_FH_VALID) &&
+ (t_ret = __os_closehandle(dbenv, dbmfp->fhp)) != 0) {
__db_err(dbenv, "%s: %s", __memp_fn(dbmfp), db_strerror(t_ret));
- if (ret != 0)
- t_ret = ret;
+ if (ret == 0)
+ ret = t_ret;
}
/* Discard the thread mutex. */
@@ -628,38 +850,51 @@ memp_fclose(dbmfp)
/*
* Discard our reference on the underlying MPOOLFILE, and close
- * it if it's no longer useful to anyone.
- *
- * If we're not discarding it, and it's a temp file, this means
- * all the outstanding references belong to unflushed buffers.
- * (A temp file can only be referenced by one DB_MPOOLFILE).
- * We don't care about preserving any of those buffers, so mark
- * the MPOOLFILE as dead so that when we try to flush them,
- * even the dirty ones just get discarded.
+ * it if it's no longer useful to anyone. It's possible the open of
+ * the file never happened or wasn't successful, in which case, mfp
+ * will be NULL.
*/
- R_LOCK(dbenv, dbmp->reginfo);
- mfp = dbmfp->mfp;
- if (--mfp->mpf_cnt == 0) {
+ if ((mfp = dbmfp->mfp) == NULL)
+ goto done;
+
+ /*
+ * If it's a temp file, all outstanding references belong to unflushed
+ * buffers. (A temp file can only be referenced by one DB_MPOOLFILE).
+ * We don't care about preserving any of those buffers, so mark the
+ * MPOOLFILE as dead so that even the dirty ones just get discarded
+ * when we try to flush them.
+ */
+ deleted = 0;
+ MUTEX_LOCK(dbenv, &mfp->mutex);
+ if (--mfp->mpf_cnt == 0 || LF_ISSET(DB_MPOOL_DISCARD)) {
+ if (LF_ISSET(DB_MPOOL_DISCARD) ||
+ F_ISSET(mfp, MP_TEMP | MP_UNLINK))
+ MPOOLFILE_IGNORE(mfp);
if (F_ISSET(mfp, MP_UNLINK)) {
- MEMP_FREMOVE(mfp);
if ((t_ret = __db_appname(dbmp->dbenv,
- DB_APP_DATA, NULL, R_ADDR(dbmp->reginfo,
+ DB_APP_DATA, R_ADDR(dbmp->reginfo,
mfp->path_off), 0, NULL, &rpath)) != 0 && ret == 0)
ret = t_ret;
- if (t_ret == 0 && (t_ret =
- __os_unlink(dbmp->dbenv, rpath) != 0 && ret == 0))
+ if (t_ret == 0) {
+ if ((t_ret = __os_unlink(
+ dbmp->dbenv, rpath)) != 0 && ret == 0)
+ ret = t_ret;
+ __os_free(dbenv, rpath);
+ }
+ }
+ if (mfp->block_cnt == 0) {
+ if ((t_ret =
+ __memp_mf_discard(dbmp, mfp)) != 0 && ret == 0)
ret = t_ret;
- __os_free(rpath, 0);
+ deleted = 1;
}
- if (mfp->block_cnt == 0)
- __memp_mf_discard(dbmp, mfp);
}
- else if (F_ISSET(mfp, MP_TEMP))
- MEMP_FREMOVE(mfp);
- R_UNLOCK(dbenv, dbmp->reginfo);
+ if (deleted == 0)
+ MUTEX_UNLOCK(dbenv, &mfp->mutex);
/* Discard the DB_MPOOLFILE structure. */
- __os_free(dbmfp, sizeof(DB_MPOOLFILE));
+done: __os_free(dbenv, dbmfp->fhp);
+ __os_free(dbenv, dbmfp);
return (ret);
}
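
The wait loop above is the reason the comment recommends opening handles before spawning threads and closing them only after every thread has exited; a sketch of that lifecycle, assuming POSIX threads and an illustrative worker:

#include <pthread.h>
#include "db.h"

static void *
worker(void *arg)
{
	DB_MPOOLFILE *mpf = arg;

	(void)mpf;	/* ... mpf->get and mpf->put calls go here ... */
	return (NULL);
}

int
run_workers(DB_MPOOLFILE *mpf)
{
	pthread_t tids[4];
	int i;

	/* The handle is already open; create the threads... */
	for (i = 0; i < 4; ++i)
		(void)pthread_create(&tids[i], NULL, worker, mpf);
	for (i = 0; i < 4; ++i)
		(void)pthread_join(tids[i], NULL);

	/* ...and close only after they have all exited, so the loop
	 * above never has to sleep waiting for dbmfp->ref to drain. */
	return (mpf->close(mpf, 0));
}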
@@ -668,20 +903,69 @@ memp_fclose(dbmfp)
* __memp_mf_discard --
* Discard an MPOOLFILE.
*
- * PUBLIC: void __memp_mf_discard __P((DB_MPOOL *, MPOOLFILE *));
+ * PUBLIC: int __memp_mf_discard __P((DB_MPOOL *, MPOOLFILE *));
*/
-void
+int
__memp_mf_discard(dbmp, mfp)
DB_MPOOL *dbmp;
MPOOLFILE *mfp;
{
+ DB_ENV *dbenv;
+ DB_FH fh;
+ DB_MPOOL_STAT *sp;
MPOOL *mp;
+ char *rpath;
+ int ret;
+ dbenv = dbmp->dbenv;
mp = dbmp->reginfo[0].primary;
+ ret = 0;
+
+ /*
+ * Expects caller to be holding the MPOOLFILE mutex.
+ *
+ * When discarding a file, we have to flush writes from it to disk.
+ * The scenario is that dirty buffers from this file need to be
+ * flushed to satisfy a future checkpoint, but when the checkpoint
+ * calls mpool sync, the sync code won't know anything about them.
+ */
+ if (!F_ISSET(mfp, MP_DEADFILE) &&
+ (ret = __db_appname(dbenv, DB_APP_DATA,
+ R_ADDR(dbmp->reginfo, mfp->path_off), 0, NULL, &rpath)) == 0) {
+ if ((ret = __os_open(dbenv, rpath, 0, 0, &fh)) == 0) {
+ ret = __os_fsync(dbenv, &fh);
+ (void)__os_closehandle(dbenv, &fh);
+ }
+ __os_free(dbenv, rpath);
+ }
+
+ /*
+ * We have to release the MPOOLFILE lock before acquiring the region
+ * lock so that we don't deadlock. Make sure nobody ever looks at
+ * this structure again.
+ */
+ MPOOLFILE_IGNORE(mfp);
+
+ /* Discard the mutex we're holding. */
+ MUTEX_UNLOCK(dbenv, &mfp->mutex);
/* Delete from the list of MPOOLFILEs. */
+ R_LOCK(dbenv, dbmp->reginfo);
SH_TAILQ_REMOVE(&mp->mpfq, mfp, q, __mpoolfile);
+ /* Copy the statistics into the region. */
+ sp = &mp->stat;
+ sp->st_cache_hit += mfp->stat.st_cache_hit;
+ sp->st_cache_miss += mfp->stat.st_cache_miss;
+ sp->st_map += mfp->stat.st_map;
+ sp->st_page_create += mfp->stat.st_page_create;
+ sp->st_page_in += mfp->stat.st_page_in;
+ sp->st_page_out += mfp->stat.st_page_out;
+
+ /* Clear the mutex this MPOOLFILE recorded. */
+ __db_shlocks_clear(&mfp->mutex, dbmp->reginfo,
+ (REGMAINT *)R_ADDR(dbmp->reginfo, mp->maint_off));
+
/* Free the space. */
if (mfp->path_off != 0)
__db_shalloc_free(dbmp->reginfo[0].addr,
@@ -693,35 +977,10 @@ __memp_mf_discard(dbmp, mfp)
__db_shalloc_free(dbmp->reginfo[0].addr,
R_ADDR(dbmp->reginfo, mfp->pgcookie_off));
__db_shalloc_free(dbmp->reginfo[0].addr, mfp);
-}
-
-/*
- * __memp_fremove --
- * Remove an underlying file from the system.
- *
- * PUBLIC: int __memp_fremove __P((DB_MPOOLFILE *));
- */
-int
-__memp_fremove(dbmfp)
- DB_MPOOLFILE *dbmfp;
-{
- DB_ENV *dbenv;
- DB_MPOOL *dbmp;
- MPOOLFILE *mfp;
-
- dbmp = dbmfp->dbmp;
- dbenv = dbmp->dbenv;
- mfp = dbmfp->mfp;
-
- PANIC_CHECK(dbenv);
-
- R_LOCK(dbenv, dbmp->reginfo);
-
- MEMP_FREMOVE(mfp);
R_UNLOCK(dbenv, dbmp->reginfo);
- return (0);
+ return (ret);
}
/*
diff --git a/bdb/mp/mp_fput.c b/bdb/mp/mp_fput.c
index be03b721f36..271e44a4ef8 100644
--- a/bdb/mp/mp_fput.c
+++ b/bdb/mp/mp_fput.c
@@ -1,13 +1,13 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Copyright (c) 1996-2002
* Sleepycat Software. All rights reserved.
*/
#include "db_config.h"
#ifndef lint
-static const char revid[] = "$Id: mp_fput.c,v 11.16 2000/11/30 00:58:41 ubell Exp $";
+static const char revid[] = "$Id: mp_fput.c,v 11.36 2002/08/09 19:04:11 bostic Exp $";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
@@ -15,43 +15,32 @@ static const char revid[] = "$Id: mp_fput.c,v 11.16 2000/11/30 00:58:41 ubell Ex
#endif
-#ifdef HAVE_RPC
-#include "db_server.h"
-#endif
-
#include "db_int.h"
-#include "db_shash.h"
-#include "mp.h"
-
-#ifdef HAVE_RPC
-#include "gen_client_ext.h"
-#include "rpc_client_ext.h"
-#endif
+#include "dbinc/db_shash.h"
+#include "dbinc/mp.h"
/*
- * memp_fput --
+ * __memp_fput --
* Mpool file put function.
+ *
+ * PUBLIC: int __memp_fput __P((DB_MPOOLFILE *, void *, u_int32_t));
*/
int
-memp_fput(dbmfp, pgaddr, flags)
+__memp_fput(dbmfp, pgaddr, flags)
DB_MPOOLFILE *dbmfp;
void *pgaddr;
u_int32_t flags;
{
- BH *bhp;
+ BH *argbhp, *bhp, *prev;
DB_ENV *dbenv;
DB_MPOOL *dbmp;
- MPOOL *c_mp, *mp;
- int ret, wrote;
+ DB_MPOOL_HASH *hp;
+ MPOOL *c_mp;
+ u_int32_t n_cache;
+ int adjust, ret;
dbmp = dbmfp->dbmp;
dbenv = dbmp->dbenv;
- mp = dbmp->reginfo[0].primary;
-
-#ifdef HAVE_RPC
- if (F_ISSET(dbenv, DB_ENV_RPCCLIENT))
- return (__dbcl_memp_fput(dbmfp, pgaddr, flags));
-#endif
PANIC_CHECK(dbenv);
@@ -72,17 +61,6 @@ memp_fput(dbmfp, pgaddr, flags)
}
}
- R_LOCK(dbenv, dbmp->reginfo);
-
- /* Decrement the pinned reference count. */
- if (dbmfp->pinref == 0) {
- __db_err(dbenv,
- "%s: more pages returned than retrieved", __memp_fn(dbmfp));
- R_UNLOCK(dbenv, dbmp->reginfo);
- return (EINVAL);
- } else
- --dbmfp->pinref;
-
/*
* If we're mapping the file, there's nothing to do. Because we can
* stop mapping the file at any time, we have to check on each buffer
@@ -90,97 +68,135 @@ memp_fput(dbmfp, pgaddr, flags)
* region.
*/
if (dbmfp->addr != NULL && pgaddr >= dbmfp->addr &&
- (u_int8_t *)pgaddr <= (u_int8_t *)dbmfp->addr + dbmfp->len) {
- R_UNLOCK(dbenv, dbmp->reginfo);
+ (u_int8_t *)pgaddr <= (u_int8_t *)dbmfp->addr + dbmfp->len)
return (0);
+
+#ifdef DIAGNOSTIC
+ /*
+ * Decrement the per-file pinned buffer count (mapped pages aren't
+ * counted).
+ */
+ R_LOCK(dbenv, dbmp->reginfo);
+ if (dbmfp->pinref == 0) {
+ ret = EINVAL;
+ __db_err(dbenv,
+ "%s: more pages returned than retrieved", __memp_fn(dbmfp));
+ } else {
+ ret = 0;
+ --dbmfp->pinref;
}
+ R_UNLOCK(dbenv, dbmp->reginfo);
+ if (ret != 0)
+ return (ret);
+#endif
- /* Convert the page address to a buffer header. */
+ /* Convert a page address to a buffer header and hash bucket. */
bhp = (BH *)((u_int8_t *)pgaddr - SSZA(BH, buf));
+ n_cache = NCACHE(dbmp->reginfo[0].primary, bhp->mf_offset, bhp->pgno);
+ c_mp = dbmp->reginfo[n_cache].primary;
+ hp = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab);
+ hp = &hp[NBUCKET(c_mp, bhp->mf_offset, bhp->pgno)];
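+ /*
+ * SSZA(BH, buf) is the byte offset of the buf[] array within the
+ * BH structure, so subtracting it from the page address recovers
+ * the enclosing buffer header -- a container-of computation. NCACHE
+ * and NBUCKET then rehash the same (mf_offset, pgno) pair the
+ * header was originally filed under.
+ */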
- /* Convert the buffer header to a cache. */
- c_mp = BH_TO_CACHE(dbmp, bhp);
-
-/* UNLOCK THE REGION, LOCK THE CACHE. */
+ MUTEX_LOCK(dbenv, &hp->hash_mutex);
/* Set/clear the page bits. */
- if (LF_ISSET(DB_MPOOL_CLEAN) && F_ISSET(bhp, BH_DIRTY)) {
- ++c_mp->stat.st_page_clean;
- --c_mp->stat.st_page_dirty;
+ if (LF_ISSET(DB_MPOOL_CLEAN) &&
+ F_ISSET(bhp, BH_DIRTY) && !F_ISSET(bhp, BH_DIRTY_CREATE)) {
+ DB_ASSERT(hp->hash_page_dirty != 0);
+ --hp->hash_page_dirty;
F_CLR(bhp, BH_DIRTY);
}
if (LF_ISSET(DB_MPOOL_DIRTY) && !F_ISSET(bhp, BH_DIRTY)) {
- --c_mp->stat.st_page_clean;
- ++c_mp->stat.st_page_dirty;
+ ++hp->hash_page_dirty;
F_SET(bhp, BH_DIRTY);
}
if (LF_ISSET(DB_MPOOL_DISCARD))
F_SET(bhp, BH_DISCARD);
/*
- * If the page is dirty and being scheduled to be written as part of
- * a checkpoint, we no longer know that the log is up-to-date.
- */
- if (F_ISSET(bhp, BH_DIRTY) && F_ISSET(bhp, BH_SYNC))
- F_SET(bhp, BH_SYNC_LOGFLSH);
-
- /*
* Check for a reference count going to zero. This can happen if the
* application returns a page twice.
*/
if (bhp->ref == 0) {
__db_err(dbenv, "%s: page %lu: unpinned page returned",
__memp_fn(dbmfp), (u_long)bhp->pgno);
- R_UNLOCK(dbenv, dbmp->reginfo);
+ MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
return (EINVAL);
}
/*
- * If more than one reference to the page, we're done. Ignore the
- * discard flags (for now) and leave it at its position in the LRU
- * chain. The rest gets done at last reference close.
+ * If more than one reference to the page or a reference other than a
+ * thread waiting to flush the buffer to disk, we're done. Ignore the
+ * discard flags (for now) and leave the buffer's priority alone.
*/
- if (--bhp->ref > 0) {
- R_UNLOCK(dbenv, dbmp->reginfo);
+ if (--bhp->ref > 1 || (bhp->ref == 1 && !F_ISSET(bhp, BH_LOCKED))) {
+ MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
return (0);
}
+ /* Update priority values. */
+ if (F_ISSET(bhp, BH_DISCARD) ||
+ dbmfp->mfp->priority == MPOOL_PRI_VERY_LOW)
+ bhp->priority = 0;
+ else {
+ /*
+		 * We don't lock the LRU counter or the stat.st_pages field; if
+		 * we get garbage (which won't happen on a 32-bit machine), it
+		 * only means a buffer has the wrong priority.
+ */
+ bhp->priority = c_mp->lru_count;
+
+ adjust = 0;
+ if (dbmfp->mfp->priority != 0)
+ adjust =
+ (int)c_mp->stat.st_pages / dbmfp->mfp->priority;
+ if (F_ISSET(bhp, BH_DIRTY))
+ adjust += c_mp->stat.st_pages / MPOOL_PRI_DIRTY;
+
+ if (adjust > 0) {
+			if (UINT32_T_MAX - bhp->priority >= (u_int32_t)adjust)
+ bhp->priority += adjust;
+ } else if (adjust < 0)
+ if (bhp->priority > (u_int32_t)-adjust)
+ bhp->priority += adjust;
+ }
+
/*
- * Move the buffer to the head/tail of the LRU chain. We do this
- * before writing the buffer for checkpoint purposes, as the write
- * can discard the region lock and allow another process to acquire
- * buffer. We could keep that from happening, but there seems no
- * reason to do so.
+ * Buffers on hash buckets are sorted by priority -- move the buffer
+ * to the correct position in the list.
*/
- SH_TAILQ_REMOVE(&c_mp->bhq, bhp, q, __bh);
- if (F_ISSET(bhp, BH_DISCARD))
- SH_TAILQ_INSERT_HEAD(&c_mp->bhq, bhp, q, __bh);
+ argbhp = bhp;
+ SH_TAILQ_REMOVE(&hp->hash_bucket, argbhp, hq, __bh);
+
+ prev = NULL;
+ for (bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
+ bhp != NULL; prev = bhp, bhp = SH_TAILQ_NEXT(bhp, hq, __bh))
+ if (bhp->priority > argbhp->priority)
+ break;
+ if (prev == NULL)
+ SH_TAILQ_INSERT_HEAD(&hp->hash_bucket, argbhp, hq, __bh);
else
- SH_TAILQ_INSERT_TAIL(&c_mp->bhq, bhp, q);
+ SH_TAILQ_INSERT_AFTER(&hp->hash_bucket, prev, argbhp, hq, __bh);
+
+ /* Reset the hash bucket's priority. */
+ hp->hash_priority = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)->priority;
+
+#ifdef DIAGNOSTIC
+ __memp_check_order(hp);
+#endif
/*
- * If this buffer is scheduled for writing because of a checkpoint, we
- * need to write it (if it's dirty), or update the checkpoint counters
- * (if it's not dirty). If we try to write it and can't, that's not
- * necessarily an error as it's not completely unreasonable that the
- * application have permission to write the underlying file, but set a
- * flag so that the next time the memp_sync function is called we try
- * writing it there, as the checkpoint thread of control better be able
- * to write all of the files.
+ * The sync code has a separate counter for buffers on which it waits.
+ * It reads that value without holding a lock so we update it as the
+ * last thing we do. Once that value goes to 0, we won't see another
+ * reference to that buffer being returned to the cache until the sync
+ * code has finished, so we're safe as long as we don't let the value
+ * go to 0 before we finish with the buffer.
*/
- if (F_ISSET(bhp, BH_SYNC)) {
- if (F_ISSET(bhp, BH_DIRTY)) {
- if (__memp_bhwrite(dbmp,
- dbmfp->mfp, bhp, NULL, &wrote) != 0 || !wrote)
- F_SET(mp, MP_LSN_RETRY);
- } else {
- F_CLR(bhp, BH_SYNC);
-
- --mp->lsn_cnt;
- --dbmfp->mfp->lsn_cnt;
- }
- }
+ if (F_ISSET(argbhp, BH_LOCKED) && argbhp->ref_sync != 0)
+ --argbhp->ref_sync;
+
+ MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
- R_UNLOCK(dbenv, dbmp->reginfo);
return (0);
}
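
The priority arithmetic introduced above replaces the old single LRU chain and is easy to misread in diff form. The following is a minimal standalone sketch of the calculation, under stated assumptions: pool_pages and file_priority stand in for c_mp->stat.st_pages and the MPOOLFILE's priority, and the PRI_* values are placeholders for the MPOOL_PRI_* constants in dbinc/mp.h, not the real definitions.

#include <limits.h>

#define	PRI_VERY_LOW	1		/* placeholder for MPOOL_PRI_VERY_LOW */
#define	PRI_DIRTY	10		/* placeholder for MPOOL_PRI_DIRTY */

static unsigned int
buffer_priority(unsigned int lru_count, unsigned int pool_pages,
    unsigned int file_priority, int is_dirty, int discard)
{
	unsigned int priority;
	int adjust;

	/* Discardable buffers become eviction candidates immediately. */
	if (discard || file_priority == PRI_VERY_LOW)
		return (0);

	/* The base priority is the cache's LRU clock at release time. */
	priority = lru_count;

	/* Pages from higher-priority files and dirty pages age more slowly. */
	adjust = 0;
	if (file_priority != 0)
		adjust = (int)(pool_pages / file_priority);
	if (is_dirty)
		adjust += (int)(pool_pages / PRI_DIRTY);

	/* Apply the adjustment only when it cannot wrap the counter. */
	if (adjust > 0 && UINT_MAX - priority >= (unsigned int)adjust)
		priority += adjust;
	else if (adjust < 0 && priority > (unsigned int)-adjust)
		priority += adjust;

	return (priority);
}

Buffers within a hash bucket are then kept sorted by this value, so eviction can take the lowest-priority buffer from the head of the bucket.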
diff --git a/bdb/mp/mp_fset.c b/bdb/mp/mp_fset.c
index 08313c9b6f5..65cd6286ac9 100644
--- a/bdb/mp/mp_fset.c
+++ b/bdb/mp/mp_fset.c
@@ -1,13 +1,13 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Copyright (c) 1996-2002
* Sleepycat Software. All rights reserved.
*/
#include "db_config.h"
#ifndef lint
-static const char revid[] = "$Id: mp_fset.c,v 11.13 2000/11/30 00:58:41 ubell Exp $";
+static const char revid[] = "$Id: mp_fset.c,v 11.25 2002/05/03 15:21:17 bostic Exp $";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
@@ -15,25 +15,18 @@ static const char revid[] = "$Id: mp_fset.c,v 11.13 2000/11/30 00:58:41 ubell Ex
#endif
-#ifdef HAVE_RPC
-#include "db_server.h"
-#endif
-
#include "db_int.h"
-#include "db_shash.h"
-#include "mp.h"
-
-#ifdef HAVE_RPC
-#include "gen_client_ext.h"
-#include "rpc_client_ext.h"
-#endif
+#include "dbinc/db_shash.h"
+#include "dbinc/mp.h"
/*
- * memp_fset --
+ * __memp_fset --
* Mpool page set-flag routine.
+ *
+ * PUBLIC: int __memp_fset __P((DB_MPOOLFILE *, void *, u_int32_t));
*/
int
-memp_fset(dbmfp, pgaddr, flags)
+__memp_fset(dbmfp, pgaddr, flags)
DB_MPOOLFILE *dbmfp;
void *pgaddr;
u_int32_t flags;
@@ -41,17 +34,13 @@ memp_fset(dbmfp, pgaddr, flags)
BH *bhp;
DB_ENV *dbenv;
DB_MPOOL *dbmp;
- MPOOL *c_mp, *mp;
+ DB_MPOOL_HASH *hp;
+ MPOOL *c_mp;
+ u_int32_t n_cache;
int ret;
dbmp = dbmfp->dbmp;
dbenv = dbmp->dbenv;
- mp = dbmp->reginfo[0].primary;
-
-#ifdef HAVE_RPC
- if (F_ISSET(dbenv, DB_ENV_RPCCLIENT))
- return (__dbcl_memp_fset(dbmfp, pgaddr, flags));
-#endif
PANIC_CHECK(dbenv);
@@ -60,7 +49,7 @@ memp_fset(dbmfp, pgaddr, flags)
return (__db_ferr(dbenv, "memp_fset", 1));
if ((ret = __db_fchk(dbenv, "memp_fset", flags,
- DB_MPOOL_DIRTY | DB_MPOOL_CLEAN | DB_MPOOL_DISCARD)) != 0)
+ DB_MPOOL_CLEAN | DB_MPOOL_DIRTY | DB_MPOOL_DISCARD)) != 0)
return (ret);
if ((ret = __db_fcchk(dbenv, "memp_fset",
flags, DB_MPOOL_CLEAN, DB_MPOOL_DIRTY)) != 0)
@@ -72,27 +61,29 @@ memp_fset(dbmfp, pgaddr, flags)
return (EACCES);
}
- /* Convert the page address to a buffer header. */
+ /* Convert the page address to a buffer header and hash bucket. */
bhp = (BH *)((u_int8_t *)pgaddr - SSZA(BH, buf));
-
- /* Convert the buffer header to a cache. */
- c_mp = BH_TO_CACHE(dbmp, bhp);
-
- R_LOCK(dbenv, dbmp->reginfo);
-
- if (LF_ISSET(DB_MPOOL_CLEAN) && F_ISSET(bhp, BH_DIRTY)) {
- ++c_mp->stat.st_page_clean;
- --c_mp->stat.st_page_dirty;
+ n_cache = NCACHE(dbmp->reginfo[0].primary, bhp->mf_offset, bhp->pgno);
+ c_mp = dbmp->reginfo[n_cache].primary;
+ hp = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab);
+ hp = &hp[NBUCKET(c_mp, bhp->mf_offset, bhp->pgno)];
+
+ MUTEX_LOCK(dbenv, &hp->hash_mutex);
+
+ /* Set/clear the page bits. */
+ if (LF_ISSET(DB_MPOOL_CLEAN) &&
+ F_ISSET(bhp, BH_DIRTY) && !F_ISSET(bhp, BH_DIRTY_CREATE)) {
+ DB_ASSERT(hp->hash_page_dirty != 0);
+ --hp->hash_page_dirty;
F_CLR(bhp, BH_DIRTY);
}
if (LF_ISSET(DB_MPOOL_DIRTY) && !F_ISSET(bhp, BH_DIRTY)) {
- --c_mp->stat.st_page_clean;
- ++c_mp->stat.st_page_dirty;
+ ++hp->hash_page_dirty;
F_SET(bhp, BH_DIRTY);
}
if (LF_ISSET(DB_MPOOL_DISCARD))
F_SET(bhp, BH_DISCARD);
- R_UNLOCK(dbenv, dbmp->reginfo);
+ MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
return (0);
}
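
Both __memp_fput and __memp_fset recover the buffer header the same way: the page the caller holds is the buf array embedded at the tail of its BH, so backing up by the offset of buf yields the header, whose (mf_offset, pgno) pair then selects the cache region and hash bucket. A sketch of that mapping follows; the hash mixing is invented for illustration (the real NCACHE and NBUCKET macros live in dbinc/mp.h).

#include <stddef.h>

struct bh_like {			/* trimmed-down buffer header */
	unsigned long mf_offset;	/* region offset of the file */
	unsigned long pgno;		/* page number within the file */
	unsigned char buf[1];		/* page data follows the header */
};

/* Illustrative hash mixing only; the real macros differ. */
#define	CACHE_OF(nreg, mf_off, pgno)	(((pgno) ^ ((mf_off) >> 3)) % (nreg))
#define	BUCKET_OF(nbuck, mf_off, pgno)	(((pgno) ^ ((mf_off) << 9)) % (nbuck))

static struct bh_like *
page_to_header(void *pgaddr)
{
	/* Back up from the page data to the enclosing buffer header. */
	return ((struct bh_like *)
	    ((unsigned char *)pgaddr - offsetof(struct bh_like, buf)));
}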
diff --git a/bdb/mp/mp_method.c b/bdb/mp/mp_method.c
index 85a6239b032..38f0a645f16 100644
--- a/bdb/mp/mp_method.c
+++ b/bdb/mp/mp_method.c
@@ -1,30 +1,30 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Copyright (c) 1996-2002
* Sleepycat Software. All rights reserved.
*/
#include "db_config.h"
#ifndef lint
-static const char revid[] = "$Id: mp_method.c,v 11.10 2000/04/04 20:12:04 bostic Exp $";
+static const char revid[] = "$Id: mp_method.c,v 11.29 2002/03/27 04:32:27 bostic Exp $";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
#include <sys/types.h>
-#endif
-#ifdef HAVE_RPC
-#include "db_server.h"
+#ifdef HAVE_RPC
+#include <rpc/rpc.h>
+#endif
#endif
#include "db_int.h"
-#include "db_shash.h"
-#include "mp.h"
+#include "dbinc/db_shash.h"
+#include "dbinc/mp.h"
#ifdef HAVE_RPC
-#include "gen_client_ext.h"
-#include "rpc_client_ext.h"
+#include "dbinc_auto/db_server.h"
+#include "dbinc_auto/rpc_client_ext.h"
#endif
static int __memp_set_cachesize __P((DB_ENV *, u_int32_t, u_int32_t, int));
@@ -41,29 +41,46 @@ __memp_dbenv_create(dbenv)
DB_ENV *dbenv;
{
/*
+ * !!!
+ * Our caller has not yet had the opportunity to reset the panic
+ * state or turn off mutex locking, and so we can neither check
+	 * state or turn off mutex locking, and so we can neither check
+	 * the panic state nor acquire a mutex in the DB_ENV create path.
* We default to 32 8K pages. We don't default to a flat 256K, because
* some systems require significantly more memory to hold 32 pages than
* others. For example, HP-UX with POSIX pthreads needs 88 bytes for
* a POSIX pthread mutex and almost 200 bytes per buffer header, while
- * Solaris needs 24 and 52 bytes for the same structures.
+ * Solaris needs 24 and 52 bytes for the same structures. The minimum
+	 * number of hash buckets is 37, each of which also contains a mutex.
*/
- dbenv->mp_bytes = 32 * ((8 * 1024) + sizeof(BH));
+ dbenv->mp_bytes =
+ 32 * ((8 * 1024) + sizeof(BH)) + 37 * sizeof(DB_MPOOL_HASH);
dbenv->mp_ncache = 1;
- dbenv->set_mp_mmapsize = __memp_set_mp_mmapsize;
- dbenv->set_cachesize = __memp_set_cachesize;
-
-#ifdef HAVE_RPC
- /*
- * If we have a client, overwrite what we just setup to
- * point to client functions.
- */
+#ifdef HAVE_RPC
if (F_ISSET(dbenv, DB_ENV_RPCCLIENT)) {
dbenv->set_cachesize = __dbcl_env_cachesize;
dbenv->set_mp_mmapsize = __dbcl_set_mp_mmapsize;
- }
+ dbenv->memp_dump_region = NULL;
+ dbenv->memp_fcreate = __dbcl_memp_fcreate;
+ dbenv->memp_nameop = NULL;
+ dbenv->memp_register = __dbcl_memp_register;
+ dbenv->memp_stat = __dbcl_memp_stat;
+ dbenv->memp_sync = __dbcl_memp_sync;
+ dbenv->memp_trickle = __dbcl_memp_trickle;
+ } else
#endif
-
+ {
+ dbenv->set_cachesize = __memp_set_cachesize;
+ dbenv->set_mp_mmapsize = __memp_set_mp_mmapsize;
+ dbenv->memp_dump_region = __memp_dump_region;
+ dbenv->memp_fcreate = __memp_fcreate;
+ dbenv->memp_nameop = __memp_nameop;
+ dbenv->memp_register = __memp_register;
+ dbenv->memp_stat = __memp_stat;
+ dbenv->memp_sync = __memp_sync;
+ dbenv->memp_trickle = __memp_trickle;
+ }
}
/*
@@ -78,26 +95,50 @@ __memp_set_cachesize(dbenv, gbytes, bytes, ncache)
{
ENV_ILLEGAL_AFTER_OPEN(dbenv, "set_cachesize");
- dbenv->mp_gbytes = gbytes + bytes / GIGABYTE;
- dbenv->mp_bytes = bytes % GIGABYTE;
- dbenv->mp_ncache = ncache == 0 ? 1 : ncache;
+ /* Normalize the values. */
+ if (ncache == 0)
+ ncache = 1;
/*
- * If the application requested less than 500Mb, increase the
- * cachesize by 25% to account for our overhead. (I'm guessing
- * that caches over 500Mb are specifically sized, i.e., it's
- * a large server and the application actually knows how much
- * memory is available.)
+ * You can only store 4GB-1 in an unsigned 32-bit value, so correct for
+ * applications that specify 4GB cache sizes -- we know what they meant.
+ */
+ if (gbytes / ncache == 4 && bytes == 0) {
+ --gbytes;
+ bytes = GIGABYTE - 1;
+ } else {
+ gbytes += bytes / GIGABYTE;
+ bytes %= GIGABYTE;
+ }
+
+	/* Avoid too-large cache sizes; they result in a region size of zero. */
+ if (gbytes / ncache > 4 || (gbytes / ncache == 4 && bytes != 0)) {
+ __db_err(dbenv, "individual cache size too large");
+ return (EINVAL);
+ }
+
+ /*
+ * If the application requested less than 500Mb, increase the cachesize
+ * by 25% and factor in the size of the hash buckets to account for our
+ * overhead. (I'm guessing caches over 500Mb are specifically sized,
+ * that is, it's a large server and the application actually knows how
+ * much memory is available. We only document the 25% overhead number,
+ * not the hash buckets, but I don't see a reason to confuse the issue,
+ * it shouldn't matter to an application.)
*
* There is a minimum cache size, regardless.
*/
- if (dbenv->mp_gbytes == 0) {
- if (dbenv->mp_bytes < 500 * MEGABYTE)
- dbenv->mp_bytes += dbenv->mp_bytes / 4;
- if (dbenv->mp_bytes < DB_CACHESIZE_MIN)
- dbenv->mp_bytes = DB_CACHESIZE_MIN;
+ if (gbytes == 0) {
+ if (bytes < 500 * MEGABYTE)
+ bytes += (bytes / 4) + 37 * sizeof(DB_MPOOL_HASH);
+ if (bytes / ncache < DB_CACHESIZE_MIN)
+ bytes = ncache * DB_CACHESIZE_MIN;
}
+ dbenv->mp_gbytes = gbytes;
+ dbenv->mp_bytes = bytes;
+ dbenv->mp_ncache = ncache;
+
return (0);
}
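
The cache-size normalization above compresses several rules into a few branches. Here is a straight-line sketch under stated assumptions: CACHE_MIN is a guess at DB_CACHESIZE_MIN, and the real code also folds space for the 37 minimum hash buckets into the under-500MB bump, which this sketch omits.

#define	GB		(1024UL * 1024 * 1024)
#define	MB		(1024UL * 1024)
#define	CACHE_MIN	(20UL * 1024)	/* assumed DB_CACHESIZE_MIN */

static int
normalize_cachesize(unsigned long *gbytes, unsigned long *bytes,
    unsigned int *ncache)
{
	if (*ncache == 0)
		*ncache = 1;

	/* A 4GB cache doesn't fit in 32 bits; treat it as 4GB-1. */
	if (*gbytes / *ncache == 4 && *bytes == 0) {
		--*gbytes;
		*bytes = GB - 1;
	} else {
		*gbytes += *bytes / GB;
		*bytes %= GB;
	}

	/* Anything larger would compute a region size of zero. */
	if (*gbytes / *ncache > 4 || (*gbytes / *ncache == 4 && *bytes != 0))
		return (-1);		/* individual cache size too large */

	/* Under 500MB, add 25% for overhead, then enforce the minimum. */
	if (*gbytes == 0) {
		if (*bytes < 500 * MB)
			*bytes += *bytes / 4;
		if (*bytes / *ncache < CACHE_MIN)
			*bytes = *ncache * CACHE_MIN;
	}
	return (0);
}

For example, gbytes=4, bytes=0, ncache=1 becomes 3GB plus GB-1 bytes instead of wrapping a 32-bit region size to zero.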
diff --git a/bdb/mp/mp_region.c b/bdb/mp/mp_region.c
index 4b85466ce63..06eca2f8646 100644
--- a/bdb/mp/mp_region.c
+++ b/bdb/mp/mp_region.c
@@ -1,13 +1,13 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Copyright (c) 1996-2002
* Sleepycat Software. All rights reserved.
*/
#include "db_config.h"
#ifndef lint
-static const char revid[] = "$Id: mp_region.c,v 11.26 2000/11/30 00:58:41 ubell Exp $";
+static const char revid[] = "$Id: mp_region.c,v 11.49 2002/05/07 18:42:20 bostic Exp $";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
@@ -17,11 +17,11 @@ static const char revid[] = "$Id: mp_region.c,v 11.26 2000/11/30 00:58:41 ubell
#endif
#include "db_int.h"
-#include "db_shash.h"
-#include "mp.h"
+#include "dbinc/db_shash.h"
+#include "dbinc/mp.h"
static int __mpool_init __P((DB_ENV *, DB_MPOOL *, int, int));
-#ifdef MUTEX_SYSTEM_RESOURCES
+#ifdef HAVE_MUTEX_SYSTEM_RESOURCES
static size_t __mpool_region_maint __P((REGINFO *));
#endif
@@ -119,6 +119,8 @@ __memp_open(dbenv)
regids[i] = dbmp->reginfo[i].id;
}
+
+ R_UNLOCK(dbenv, dbmp->reginfo);
} else {
/*
* Determine how many regions there are going to be, allocate
@@ -135,6 +137,19 @@ __memp_open(dbenv)
dbmp->reginfo[i].id = INVALID_REGION_ID;
dbmp->reginfo[0] = reginfo;
+ /*
+ * We have to unlock the primary mpool region before we attempt
+ * to join the additional mpool regions. If we don't, we can
+ * deadlock. The scenario is that we hold the primary mpool
+ * region lock. We then try to attach to an additional mpool
+ * region, which requires the acquisition/release of the main
+ * region lock (to search the list of regions). If another
+ * thread of control already holds the main region lock and is
+ * waiting on our primary mpool region lock, we'll deadlock.
+ * See [#4696] for more information.
+ */
+ R_UNLOCK(dbenv, dbmp->reginfo);
+
/* Join remaining regions. */
regids = R_ADDR(dbmp->reginfo, mp->regids);
for (i = 1; i < dbmp->nreg; ++i) {
@@ -155,17 +170,10 @@ __memp_open(dbenv)
R_ADDR(&dbmp->reginfo[i], dbmp->reginfo[i].rp->primary);
/* If the region is threaded, allocate a mutex to lock the handles. */
- if (F_ISSET(dbenv, DB_ENV_THREAD)) {
- if ((ret = __db_mutex_alloc(
- dbenv, dbmp->reginfo, &dbmp->mutexp)) != 0) {
- goto err;
- }
- if ((ret =
- __db_mutex_init(dbenv, dbmp->mutexp, 0, MUTEX_THREAD)) != 0)
- goto err;
- }
-
- R_UNLOCK(dbenv, dbmp->reginfo);
+ if (F_ISSET(dbenv, DB_ENV_THREAD) &&
+ (ret = __db_mutex_setup(dbenv, dbmp->reginfo, &dbmp->mutexp,
+ MUTEX_ALLOC | MUTEX_THREAD)) != 0)
+ goto err;
dbenv->mp_handle = dbmp;
return (0);
@@ -180,12 +188,11 @@ err: if (dbmp->reginfo != NULL && dbmp->reginfo[0].addr != NULL) {
if (dbmp->reginfo[i].id != INVALID_REGION_ID)
(void)__db_r_detach(
dbenv, &dbmp->reginfo[i], 0);
- __os_free(dbmp->reginfo,
- dbmp->nreg * sizeof(*dbmp->reginfo));
+ __os_free(dbenv, dbmp->reginfo);
}
if (dbmp->mutexp != NULL)
__db_mutex_free(dbenv, dbmp->reginfo, dbmp->mutexp);
- __os_free(dbmp, sizeof(*dbmp));
+ __os_free(dbenv, dbmp);
return (ret);
}
@@ -199,13 +206,13 @@ __mpool_init(dbenv, dbmp, reginfo_off, htab_buckets)
DB_MPOOL *dbmp;
int reginfo_off, htab_buckets;
{
- DB_HASHTAB *htab;
+ DB_MPOOL_HASH *htab;
MPOOL *mp;
REGINFO *reginfo;
-#ifdef MUTEX_SYSTEM_RESOURCES
+#ifdef HAVE_MUTEX_SYSTEM_RESOURCES
size_t maint_size;
#endif
- int ret;
+ int i, ret;
void *p;
mp = NULL;
@@ -218,7 +225,7 @@ __mpool_init(dbenv, dbmp, reginfo_off, htab_buckets)
mp = reginfo->primary;
memset(mp, 0, sizeof(*mp));
-#ifdef MUTEX_SYSTEM_RESOURCES
+#ifdef HAVE_MUTEX_SYSTEM_RESOURCES
maint_size = __mpool_region_maint(reginfo);
/* Allocate room for the maintenance info and initialize it. */
if ((ret = __db_shalloc(reginfo->addr,
@@ -231,14 +238,7 @@ __mpool_init(dbenv, dbmp, reginfo_off, htab_buckets)
if (reginfo_off == 0) {
SH_TAILQ_INIT(&mp->mpfq);
- if ((ret = __db_shmutex_init(dbenv, &mp->sync_mutex,
- R_OFFSET(dbmp->reginfo, &mp->sync_mutex) +
- DB_FCNTL_OFF_MPOOL, 0, dbmp->reginfo,
- (REGMAINT *)R_ADDR(dbmp->reginfo, mp->maint_off))) != 0)
- goto err;
-
ZERO_LSN(mp->lsn);
- mp->lsn_cnt = 0;
mp->nreg = dbmp->nreg;
if ((ret = __db_shalloc(dbmp->reginfo[0].addr,
@@ -247,32 +247,41 @@ __mpool_init(dbenv, dbmp, reginfo_off, htab_buckets)
mp->regids = R_OFFSET(dbmp->reginfo, p);
}
- SH_TAILQ_INIT(&mp->bhq);
-
/* Allocate hash table space and initialize it. */
if ((ret = __db_shalloc(reginfo->addr,
- htab_buckets * sizeof(DB_HASHTAB), 0, &htab)) != 0)
+ htab_buckets * sizeof(DB_MPOOL_HASH), 0, &htab)) != 0)
goto mem_err;
- __db_hashinit(htab, htab_buckets);
mp->htab = R_OFFSET(reginfo, htab);
- mp->htab_buckets = htab_buckets;
+ for (i = 0; i < htab_buckets; i++) {
+ if ((ret = __db_mutex_setup(dbenv,
+ reginfo, &htab[i].hash_mutex,
+ MUTEX_NO_RLOCK)) != 0)
+ return (ret);
+ SH_TAILQ_INIT(&htab[i].hash_bucket);
+ htab[i].hash_page_dirty = htab[i].hash_priority = 0;
+ }
+ mp->htab_buckets = mp->stat.st_hash_buckets = htab_buckets;
+ /*
+ * Only the environment creator knows the total cache size, fill in
+ * those statistics now.
+ */
+ mp->stat.st_gbytes = dbenv->mp_gbytes;
+ mp->stat.st_bytes = dbenv->mp_bytes;
return (0);
mem_err:__db_err(dbenv, "Unable to allocate memory for mpool region");
-err: if (reginfo->primary != NULL)
- __db_shalloc_free(reginfo->addr, reginfo->primary);
return (ret);
}
/*
- * __memp_close --
- * Internal version of memp_close: only called from DB_ENV->close.
+ * __memp_dbenv_refresh --
+ * Clean up after the mpool system on a close or failed open.
*
- * PUBLIC: int __memp_close __P((DB_ENV *));
+ * PUBLIC: int __memp_dbenv_refresh __P((DB_ENV *));
*/
int
-__memp_close(dbenv)
+__memp_dbenv_refresh(dbenv)
DB_ENV *dbenv;
{
DB_MPOOL *dbmp;
@@ -287,12 +296,12 @@ __memp_close(dbenv)
/* Discard DB_MPREGs. */
while ((mpreg = LIST_FIRST(&dbmp->dbregq)) != NULL) {
LIST_REMOVE(mpreg, q);
- __os_free(mpreg, sizeof(DB_MPREG));
+ __os_free(dbenv, mpreg);
}
/* Discard DB_MPOOLFILEs. */
while ((dbmfp = TAILQ_FIRST(&dbmp->dbmfq)) != NULL)
- if ((t_ret = memp_fclose(dbmfp)) != 0 && ret == 0)
+ if ((t_ret = __memp_fclose_int(dbmfp, 0)) != 0 && ret == 0)
ret = t_ret;
/* Discard the thread mutex. */
@@ -305,14 +314,14 @@ __memp_close(dbenv)
dbenv, &dbmp->reginfo[i], 0)) != 0 && ret == 0)
ret = t_ret;
- __os_free(dbmp->reginfo, dbmp->nreg * sizeof(*dbmp->reginfo));
- __os_free(dbmp, sizeof(*dbmp));
+ __os_free(dbenv, dbmp->reginfo);
+ __os_free(dbenv, dbmp);
dbenv->mp_handle = NULL;
return (ret);
}
-#ifdef MUTEX_SYSTEM_RESOURCES
+#ifdef HAVE_MUTEX_SYSTEM_RESOURCES
/*
* __mpool_region_maint --
* Return the amount of space needed for region maintenance info.
@@ -328,9 +337,11 @@ __mpool_region_maint(infop)
/*
* For mutex maintenance we need one mutex per possible page.
* Compute the maximum number of pages this cache can have.
- * Also add in an mpool mutex.
+ * Also add in an mpool mutex and mutexes for all dbenv and db
+ * handles.
*/
numlocks = ((infop->rp->size / DB_MIN_PGSIZE) + 1);
+ numlocks += DB_MAX_HANDLES;
s = sizeof(roff_t) * numlocks;
return (s);
}
@@ -347,11 +358,109 @@ __mpool_region_destroy(dbenv, infop)
DB_ENV *dbenv;
REGINFO *infop;
{
- MPOOL *mp;
+ __db_shlocks_destroy(infop, (REGMAINT *)R_ADDR(infop,
+ ((MPOOL *)R_ADDR(infop, infop->rp->primary))->maint_off));
COMPQUIET(dbenv, NULL);
- mp = R_ADDR(infop, infop->rp->primary);
+ COMPQUIET(infop, NULL);
+}
+
+/*
+ * __memp_nameop
+ * Remove or rename a file in the pool.
+ *
+ * PUBLIC: int __memp_nameop __P((DB_ENV *,
+ * PUBLIC: u_int8_t *, const char *, const char *, const char *));
+ *
+ * XXX
+ * Undocumented interface: DB private.
+ */
+int
+__memp_nameop(dbenv, fileid, newname, fullold, fullnew)
+ DB_ENV *dbenv;
+ u_int8_t *fileid;
+ const char *newname, *fullold, *fullnew;
+{
+ DB_MPOOL *dbmp;
+ MPOOL *mp;
+ MPOOLFILE *mfp;
+ roff_t newname_off;
+ int locked, ret;
+ void *p;
+
+ locked = 0;
+ dbmp = NULL;
- __db_shlocks_destroy(infop, (REGMAINT *)R_ADDR(infop, mp->maint_off));
- return;
+ if (!MPOOL_ON(dbenv))
+ goto fsop;
+
+ dbmp = dbenv->mp_handle;
+ mp = dbmp->reginfo[0].primary;
+
+ /*
+ * Remove or rename a file that the mpool might know about. We assume
+ * that the fop layer has the file locked for exclusive access, so we
+ * don't worry about locking except for the mpool mutexes. Checkpoint
+ * can happen at any time, independent of file locking, so we have to
+ * do the actual unlink or rename system call to avoid any race.
+ *
+ * If this is a rename, allocate first, because we can't recursively
+ * grab the region lock.
+ */
+ if (newname == NULL)
+ p = NULL;
+ else {
+ if ((ret = __memp_alloc(dbmp, dbmp->reginfo,
+ NULL, strlen(newname) + 1, &newname_off, &p)) != 0)
+ return (ret);
+ memcpy(p, newname, strlen(newname) + 1);
+ }
+
+ locked = 1;
+ R_LOCK(dbenv, dbmp->reginfo);
+
+ /*
+ * Find the file -- if mpool doesn't know about this file, that's not
+	 * an error -- we may not have it open.
+ */
+ for (mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile);
+ mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) {
+ /* Ignore non-active files. */
+ if (F_ISSET(mfp, MP_DEADFILE | MP_TEMP))
+ continue;
+
+ /* Ignore non-matching files. */
+ if (memcmp(fileid, R_ADDR(
+ dbmp->reginfo, mfp->fileid_off), DB_FILE_ID_LEN) != 0)
+ continue;
+
+ /* If newname is NULL, we're removing the file. */
+ if (newname == NULL) {
+ MUTEX_LOCK(dbenv, &mfp->mutex);
+ MPOOLFILE_IGNORE(mfp);
+ MUTEX_UNLOCK(dbenv, &mfp->mutex);
+ } else {
+ /*
+ * Else, it's a rename. We've allocated memory
+ * for the new name. Swap it with the old one.
+ */
+ p = R_ADDR(dbmp->reginfo, mfp->path_off);
+ mfp->path_off = newname_off;
+ }
+ break;
+ }
+
+ /* Delete the memory we no longer need. */
+ if (p != NULL)
+ __db_shalloc_free(dbmp->reginfo[0].addr, p);
+
+fsop: if (newname == NULL)
+ (void)__os_unlink(dbenv, fullold);
+ else
+ (void)__os_rename(dbenv, fullold, fullnew, 1);
+
+ if (locked)
+ R_UNLOCK(dbenv, dbmp->reginfo);
+
+ return (0);
}
diff --git a/bdb/mp/mp_register.c b/bdb/mp/mp_register.c
index 27859f69d7b..46eefad986f 100644
--- a/bdb/mp/mp_register.c
+++ b/bdb/mp/mp_register.c
@@ -1,38 +1,33 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Copyright (c) 1996-2002
* Sleepycat Software. All rights reserved.
*/
#include "db_config.h"
#ifndef lint
-static const char revid[] = "$Id: mp_register.c,v 11.12 2000/11/15 19:25:39 sue Exp $";
+static const char revid[] = "$Id: mp_register.c,v 11.21 2002/03/27 04:32:27 bostic Exp $";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
#include <sys/types.h>
#endif
-#ifdef HAVE_RPC
-#include "db_server.h"
-#endif
-
#include "db_int.h"
-#include "db_shash.h"
-#include "mp.h"
-
-#ifdef HAVE_RPC
-#include "gen_client_ext.h"
-#include "rpc_client_ext.h"
-#endif
+#include "dbinc/db_shash.h"
+#include "dbinc/mp.h"
/*
* memp_register --
* Register a file type's pgin, pgout routines.
+ *
+ * PUBLIC: int __memp_register __P((DB_ENV *, int,
+ * PUBLIC: int (*)(DB_ENV *, db_pgno_t, void *, DBT *),
+ * PUBLIC: int (*)(DB_ENV *, db_pgno_t, void *, DBT *)));
*/
int
-memp_register(dbenv, ftype, pgin, pgout)
+__memp_register(dbenv, ftype, pgin, pgout)
DB_ENV *dbenv;
int ftype;
int (*pgin) __P((DB_ENV *, db_pgno_t, void *, DBT *));
@@ -42,13 +37,9 @@ memp_register(dbenv, ftype, pgin, pgout)
DB_MPREG *mpreg;
int ret;
-#ifdef HAVE_RPC
- if (F_ISSET(dbenv, DB_ENV_RPCCLIENT))
- return (__dbcl_memp_register(dbenv, ftype, pgin, pgout));
-#endif
-
PANIC_CHECK(dbenv);
- ENV_REQUIRES_CONFIG(dbenv, dbenv->mp_handle, DB_INIT_MPOOL);
+ ENV_REQUIRES_CONFIG(dbenv,
+ dbenv->mp_handle, "DB_ENV->memp_register", DB_INIT_MPOOL);
dbmp = dbenv->mp_handle;
@@ -70,7 +61,7 @@ memp_register(dbenv, ftype, pgin, pgout)
return (0);
/* New entry. */
- if ((ret = __os_malloc(dbenv, sizeof(DB_MPREG), NULL, &mpreg)) != 0)
+ if ((ret = __os_malloc(dbenv, sizeof(DB_MPREG), &mpreg)) != 0)
return (ret);
mpreg->ftype = ftype;
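
With the rename to __memp_register, applications now reach this code through the method table wired up in __memp_dbenv_create. A usage sketch of the callback signatures declared in the PUBLIC comment above; the file-type tag and the no-op conversion bodies are illustrative only.

#include <db.h>

#define	MY_FTYPE	42		/* illustrative file-type tag */

static int
my_pgin(DB_ENV *dbenv, db_pgno_t pgno, void *pgaddr, DBT *pgcookie)
{
	/* Convert a page to in-memory form after it is read; no-op here. */
	return (0);
}

static int
my_pgout(DB_ENV *dbenv, db_pgno_t pgno, void *pgaddr, DBT *pgcookie)
{
	/* Convert a page back to disk form before it is written. */
	return (0);
}

int
register_my_type(DB_ENV *dbenv)
{
	return (dbenv->memp_register(dbenv, MY_FTYPE, my_pgin, my_pgout));
}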
diff --git a/bdb/mp/mp_stat.c b/bdb/mp/mp_stat.c
index 7982513448d..12e72b91d70 100644
--- a/bdb/mp/mp_stat.c
+++ b/bdb/mp/mp_stat.c
@@ -1,13 +1,13 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Copyright (c) 1996-2002
* Sleepycat Software. All rights reserved.
*/
#include "db_config.h"
#ifndef lint
-static const char revid[] = "$Id: mp_stat.c,v 11.21 2001/01/09 16:59:30 bostic Exp $";
+static const char revid[] = "$Id: mp_stat.c,v 11.51 2002/08/06 06:13:47 bostic Exp $";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
@@ -18,123 +18,150 @@ static const char revid[] = "$Id: mp_stat.c,v 11.21 2001/01/09 16:59:30 bostic E
#include <unistd.h>
#endif
-#ifdef HAVE_RPC
-#include "db_server.h"
-#endif
-
#include "db_int.h"
-#include "db_page.h"
-#include "db_shash.h"
-#include "db_am.h"
-#include "mp.h"
-
-#ifdef HAVE_RPC
-#include "gen_client_ext.h"
-#include "rpc_client_ext.h"
-#endif
+#include "dbinc/db_page.h"
+#include "dbinc/db_shash.h"
+#include "dbinc/db_am.h"
+#include "dbinc/mp.h"
-static void __memp_dumpcache
- __P((DB_MPOOL *, REGINFO *, size_t *, FILE *, u_int32_t));
+static void __memp_dumpcache __P((DB_ENV *,
+ DB_MPOOL *, REGINFO *, size_t *, FILE *, u_int32_t));
static void __memp_pbh __P((DB_MPOOL *, BH *, size_t *, FILE *));
+static void __memp_stat_wait __P((REGINFO *, MPOOL *, DB_MPOOL_STAT *, int));
/*
- * memp_stat --
+ * __memp_stat --
* Display MPOOL statistics.
+ *
+ * PUBLIC: int __memp_stat
+ * PUBLIC: __P((DB_ENV *, DB_MPOOL_STAT **, DB_MPOOL_FSTAT ***, u_int32_t));
*/
int
-memp_stat(dbenv, gspp, fspp, db_malloc)
+__memp_stat(dbenv, gspp, fspp, flags)
DB_ENV *dbenv;
DB_MPOOL_STAT **gspp;
DB_MPOOL_FSTAT ***fspp;
- void *(*db_malloc) __P((size_t));
+ u_int32_t flags;
{
DB_MPOOL *dbmp;
DB_MPOOL_FSTAT **tfsp, *tstruct;
DB_MPOOL_STAT *sp;
MPOOL *c_mp, *mp;
MPOOLFILE *mfp;
- char *tname;
- size_t len, nlen;
- u_int32_t i;
+ size_t len, nlen, pagesize;
+ u_int32_t pages, i;
int ret;
- char *name;
-
-#ifdef HAVE_RPC
- if (F_ISSET(dbenv, DB_ENV_RPCCLIENT))
- return (__dbcl_memp_stat(dbenv, gspp, fspp, db_malloc));
-#endif
+ char *name, *tname;
PANIC_CHECK(dbenv);
- ENV_REQUIRES_CONFIG(dbenv, dbenv->mp_handle, DB_INIT_MPOOL);
+ ENV_REQUIRES_CONFIG(dbenv,
+ dbenv->mp_handle, "memp_stat", DB_INIT_MPOOL);
+
+ if ((ret = __db_fchk(dbenv,
+ "DB_ENV->memp_stat", flags, DB_STAT_CLEAR)) != 0)
+ return (ret);
dbmp = dbenv->mp_handle;
- sp = NULL;
+ mp = dbmp->reginfo[0].primary;
/* Global statistics. */
- mp = dbmp->reginfo[0].primary;
if (gspp != NULL) {
*gspp = NULL;
- if ((ret = __os_calloc(dbenv, 1, sizeof(**gspp), gspp)) != 0)
+ if ((ret = __os_umalloc(dbenv, sizeof(**gspp), gspp)) != 0)
return (ret);
+ memset(*gspp, 0, sizeof(**gspp));
sp = *gspp;
/*
* Initialization and information that is not maintained on
* a per-cache basis.
*/
- sp->st_hash_longest = 0;
- sp->st_region_wait = dbmp->reginfo[0].rp->mutex.mutex_set_wait;
- sp->st_region_nowait =
- dbmp->reginfo[0].rp->mutex.mutex_set_nowait;
- sp->st_gbytes = dbenv->mp_gbytes;
- sp->st_bytes = dbenv->mp_bytes;
+ c_mp = dbmp->reginfo[0].primary;
+ sp->st_gbytes = c_mp->stat.st_gbytes;
+ sp->st_bytes = c_mp->stat.st_bytes;
sp->st_ncache = dbmp->nreg;
sp->st_regsize = dbmp->reginfo[0].rp->size;
- R_LOCK(dbenv, dbmp->reginfo);
-
/* Walk the cache list and accumulate the global information. */
for (i = 0; i < mp->nreg; ++i) {
c_mp = dbmp->reginfo[i].primary;
+
+ sp->st_map += c_mp->stat.st_map;
sp->st_cache_hit += c_mp->stat.st_cache_hit;
sp->st_cache_miss += c_mp->stat.st_cache_miss;
- sp->st_map += c_mp->stat.st_map;
sp->st_page_create += c_mp->stat.st_page_create;
sp->st_page_in += c_mp->stat.st_page_in;
sp->st_page_out += c_mp->stat.st_page_out;
sp->st_ro_evict += c_mp->stat.st_ro_evict;
sp->st_rw_evict += c_mp->stat.st_rw_evict;
+ sp->st_page_trickle += c_mp->stat.st_page_trickle;
+ sp->st_pages += c_mp->stat.st_pages;
+ /*
+ * st_page_dirty calculated by __memp_stat_hash
+ * st_page_clean calculated here
+ */
+ __memp_stat_hash(
+ &dbmp->reginfo[i], c_mp, &sp->st_page_dirty);
+ sp->st_page_clean = sp->st_pages - sp->st_page_dirty;
sp->st_hash_buckets += c_mp->stat.st_hash_buckets;
sp->st_hash_searches += c_mp->stat.st_hash_searches;
- if (c_mp->stat.st_hash_longest > sp->st_hash_longest)
- sp->st_hash_longest =
- c_mp->stat.st_hash_longest;
+ sp->st_hash_longest += c_mp->stat.st_hash_longest;
sp->st_hash_examined += c_mp->stat.st_hash_examined;
- sp->st_page_clean += c_mp->stat.st_page_clean;
- sp->st_page_dirty += c_mp->stat.st_page_dirty;
- sp->st_page_trickle += c_mp->stat.st_page_trickle;
- sp->st_region_wait += c_mp->stat.st_region_wait;
- sp->st_region_nowait += c_mp->stat.st_region_nowait;
+ /*
+ * st_hash_nowait calculated by __memp_stat_wait
+ * st_hash_wait
+ */
+ __memp_stat_wait(&dbmp->reginfo[i], c_mp, sp, flags);
+ sp->st_region_nowait +=
+ dbmp->reginfo[i].rp->mutex.mutex_set_nowait;
+ sp->st_region_wait +=
+ dbmp->reginfo[i].rp->mutex.mutex_set_wait;
+ sp->st_alloc += c_mp->stat.st_alloc;
+ sp->st_alloc_buckets += c_mp->stat.st_alloc_buckets;
+ if (sp->st_alloc_max_buckets <
+ c_mp->stat.st_alloc_max_buckets)
+ sp->st_alloc_max_buckets =
+ c_mp->stat.st_alloc_max_buckets;
+ sp->st_alloc_pages += c_mp->stat.st_alloc_pages;
+ if (sp->st_alloc_max_pages <
+ c_mp->stat.st_alloc_max_pages)
+ sp->st_alloc_max_pages =
+ c_mp->stat.st_alloc_max_pages;
+
+ if (LF_ISSET(DB_STAT_CLEAR)) {
+ dbmp->reginfo[i].rp->mutex.mutex_set_wait = 0;
+ dbmp->reginfo[i].rp->mutex.mutex_set_nowait = 0;
+ pages = c_mp->stat.st_pages;
+ memset(&c_mp->stat, 0, sizeof(c_mp->stat));
+ c_mp->stat.st_hash_buckets = c_mp->htab_buckets;
+ c_mp->stat.st_pages = pages;
+ }
}
/*
- * We have duplicate statistics fields in the cache and
- * per-file structures. The counters are only incremented
- * in the per-file structures, though. The intent is that
- * if we ever flush files from the pool we can save their
- * last known totals in the cache structure.
+ * We have duplicate statistics fields in per-file structures
+ * and the cache. The counters are only incremented in the
+ * per-file structures, except if a file is flushed from the
+ * mpool, at which time we copy its information into the cache
+ * statistics. We added the cache information above, now we
+ * add the per-file information.
*/
+ R_LOCK(dbenv, dbmp->reginfo);
for (mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile);
mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) {
+ sp->st_map += mfp->stat.st_map;
sp->st_cache_hit += mfp->stat.st_cache_hit;
sp->st_cache_miss += mfp->stat.st_cache_miss;
- sp->st_map += mfp->stat.st_map;
sp->st_page_create += mfp->stat.st_page_create;
sp->st_page_in += mfp->stat.st_page_in;
sp->st_page_out += mfp->stat.st_page_out;
+ if (fspp == NULL && LF_ISSET(DB_STAT_CLEAR)) {
+ pagesize = mfp->stat.st_pagesize;
+ memset(&mfp->stat, 0, sizeof(mfp->stat));
+ mfp->stat.st_pagesize = pagesize;
+ }
}
-
R_UNLOCK(dbenv, dbmp->reginfo);
}
@@ -142,9 +169,8 @@ memp_stat(dbenv, gspp, fspp, db_malloc)
if (fspp != NULL) {
*fspp = NULL;
- R_LOCK(dbenv, dbmp->reginfo);
-
/* Count the MPOOLFILE structures. */
+ R_LOCK(dbenv, dbmp->reginfo);
for (i = 0, len = 0,
mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile);
mfp != NULL;
@@ -153,18 +179,15 @@ memp_stat(dbenv, gspp, fspp, db_malloc)
sizeof(DB_MPOOL_FSTAT) +
strlen(__memp_fns(dbmp, mfp)) + 1;
len += sizeof(DB_MPOOL_FSTAT *); /* Trailing NULL */
-
R_UNLOCK(dbenv, dbmp->reginfo);
- if (len == 0)
+ if (i == 0)
return (0);
/* Allocate space */
- if ((ret = __os_malloc(dbenv, len, db_malloc, fspp)) != 0)
+ if ((ret = __os_umalloc(dbenv, len, fspp)) != 0)
return (ret);
- R_LOCK(dbenv, dbmp->reginfo);
-
/*
	 * Build each individual entry.  We assume that an array of
	 * pointers is aligned correctly to be followed by an array
@@ -179,20 +202,30 @@ memp_stat(dbenv, gspp, fspp, db_malloc)
tstruct = (DB_MPOOL_FSTAT *)(tfsp + i + 1);
tname = (char *)(tstruct + i);
+ /*
+		 * Files may have been opened since we counted; don't walk
+ * off the end of the allocated space.
+ */
+ R_LOCK(dbenv, dbmp->reginfo);
for (mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile);
- mfp != NULL;
+ mfp != NULL && i-- > 0;
++tfsp, ++tstruct, tname += nlen,
mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) {
name = __memp_fns(dbmp, mfp);
nlen = strlen(name) + 1;
*tfsp = tstruct;
*tstruct = mfp->stat;
+ if (LF_ISSET(DB_STAT_CLEAR)) {
+ pagesize = mfp->stat.st_pagesize;
+ memset(&mfp->stat, 0, sizeof(mfp->stat));
+ mfp->stat.st_pagesize = pagesize;
+ }
tstruct->file_name = tname;
memcpy(tname, name, nlen);
}
- *tfsp = NULL;
-
R_UNLOCK(dbenv, dbmp->reginfo);
+
+ *tfsp = NULL;
}
return (0);
}
@@ -200,7 +233,6 @@ memp_stat(dbenv, gspp, fspp, db_malloc)
#define FMAP_ENTRIES 200 /* Files we map. */
#define MPOOL_DUMP_HASH 0x01 /* Debug hash chains. */
-#define MPOOL_DUMP_LRU 0x02 /* Debug LRU chains. */
#define MPOOL_DUMP_MEM 0x04 /* Debug region memory. */
#define MPOOL_DUMP_ALL 0x07 /* Debug all. */
@@ -208,14 +240,23 @@ memp_stat(dbenv, gspp, fspp, db_malloc)
* __memp_dump_region --
* Display MPOOL structures.
*
- * PUBLIC: void __memp_dump_region __P((DB_ENV *, char *, FILE *));
+ * PUBLIC: int __memp_dump_region __P((DB_ENV *, char *, FILE *));
*/
-void
+int
__memp_dump_region(dbenv, area, fp)
DB_ENV *dbenv;
char *area;
FILE *fp;
{
+ static const FN fn[] = {
+ { MP_CAN_MMAP, "mmapped" },
+ { MP_DEADFILE, "dead" },
+ { MP_DIRECT, "no buffer" },
+ { MP_EXTENT, "extent" },
+ { MP_TEMP, "temporary" },
+ { MP_UNLINK, "unlink" },
+ { 0, NULL }
+ };
DB_MPOOL *dbmp;
DB_MPOOLFILE *dbmfp;
MPOOL *mp;
@@ -225,6 +266,10 @@ __memp_dump_region(dbenv, area, fp)
int cnt;
u_int8_t *p;
+ PANIC_CHECK(dbenv);
+ ENV_REQUIRES_CONFIG(dbenv,
+ dbenv->mp_handle, "memp_dump_region", DB_INIT_MPOOL);
+
dbmp = dbenv->mp_handle;
/* Make it easy to call from the debugger. */
@@ -239,40 +284,42 @@ __memp_dump_region(dbenv, area, fp)
case 'h':
LF_SET(MPOOL_DUMP_HASH);
break;
- case 'l':
- LF_SET(MPOOL_DUMP_LRU);
- break;
case 'm':
LF_SET(MPOOL_DUMP_MEM);
break;
}
- R_LOCK(dbenv, dbmp->reginfo);
-
mp = dbmp->reginfo[0].primary;
/* Display MPOOL structures. */
(void)fprintf(fp, "%s\nPool (region addr 0x%lx)\n",
- DB_LINE, (u_long)dbmp->reginfo[0].addr);
+ DB_LINE, P_TO_ULONG(dbmp->reginfo[0].addr));
/* Display the MPOOLFILE structures. */
- cnt = 0;
- for (mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile);
+ R_LOCK(dbenv, dbmp->reginfo);
+ for (cnt = 0, mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile);
mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile), ++cnt) {
- (void)fprintf(fp, "File #%d: %s: type %ld, %s\n\t [UID: ",
- cnt + 1, __memp_fns(dbmp, mfp), (long)mfp->ftype,
- F_ISSET(mfp, MP_CAN_MMAP) ? "mmap" : "read/write");
+ (void)fprintf(fp, "File #%d: %s: pagesize %lu\n", cnt + 1,
+ __memp_fns(dbmp, mfp), (u_long)mfp->stat.st_pagesize);
+ (void)fprintf(fp, "\t type %ld; ref %lu; blocks %lu; last %lu;",
+ (long)mfp->ftype, (u_long)mfp->mpf_cnt,
+ (u_long)mfp->block_cnt, (u_long)mfp->last_pgno);
+ __db_prflags(mfp->flags, fn, fp);
+
+ (void)fprintf(fp, "\n\t UID: ");
p = R_ADDR(dbmp->reginfo, mfp->fileid_off);
- for (i = 0; i < DB_FILE_ID_LEN; ++i) {
- (void)fprintf(fp, "%x", *p++);
+ for (i = 0; i < DB_FILE_ID_LEN; ++i, ++p) {
+ (void)fprintf(fp, "%x", (u_int)*p);
if (i < DB_FILE_ID_LEN - 1)
(void)fprintf(fp, " ");
}
- (void)fprintf(fp, "]\n");
+ (void)fprintf(fp, "\n");
if (cnt < FMAP_ENTRIES)
fmap[cnt] = R_OFFSET(dbmp->reginfo, mfp);
}
+ R_UNLOCK(dbenv, dbmp->reginfo);
+ MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp);
for (dbmfp = TAILQ_FIRST(&dbmp->dbmfq);
dbmfp != NULL; dbmfp = TAILQ_NEXT(dbmfp, q), ++cnt) {
(void)fprintf(fp, "File #%d: %s: per-process, %s\n",
@@ -281,6 +328,7 @@ __memp_dump_region(dbenv, area, fp)
if (cnt < FMAP_ENTRIES)
fmap[cnt] = R_OFFSET(dbmp->reginfo, mfp);
}
+ MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
if (cnt < FMAP_ENTRIES)
fmap[cnt] = INVALID_ROFF;
else
@@ -289,13 +337,14 @@ __memp_dump_region(dbenv, area, fp)
/* Dump the memory pools. */
for (i = 0; i < mp->nreg; ++i) {
(void)fprintf(fp, "%s\nCache #%d:\n", DB_LINE, i + 1);
- __memp_dumpcache(dbmp, &dbmp->reginfo[i], fmap, fp, flags);
+ __memp_dumpcache(
+ dbenv, dbmp, &dbmp->reginfo[i], fmap, fp, flags);
}
- R_UNLOCK(dbenv, dbmp->reginfo);
-
/* Flush in case we're debugging. */
(void)fflush(fp);
+
+ return (0);
}
/*
@@ -303,7 +352,8 @@ __memp_dump_region(dbenv, area, fp)
* Display statistics for a cache.
*/
static void
-__memp_dumpcache(dbmp, reginfo, fmap, fp, flags)
+__memp_dumpcache(dbenv, dbmp, reginfo, fmap, fp, flags)
+ DB_ENV *dbenv;
DB_MPOOL *dbmp;
REGINFO *reginfo;
size_t *fmap;
@@ -311,7 +361,7 @@ __memp_dumpcache(dbmp, reginfo, fmap, fp, flags)
u_int32_t flags;
{
BH *bhp;
- DB_HASHTAB *dbht;
+ DB_MPOOL_HASH *hp;
MPOOL *c_mp;
int bucket;
@@ -320,27 +370,24 @@ __memp_dumpcache(dbmp, reginfo, fmap, fp, flags)
/* Display the hash table list of BH's. */
if (LF_ISSET(MPOOL_DUMP_HASH)) {
(void)fprintf(fp,
- "%s\nBH hash table (%lu hash slots)\npageno, file, ref, address\n",
+ "%s\nBH hash table (%lu hash slots)\nbucket (priority):\n",
DB_LINE, (u_long)c_mp->htab_buckets);
- for (dbht = R_ADDR(reginfo, c_mp->htab),
- bucket = 0; bucket < c_mp->htab_buckets; ++dbht, ++bucket) {
- if (SH_TAILQ_FIRST(dbht, __bh) != NULL)
- (void)fprintf(fp, "%lu:\n", (u_long)bucket);
- for (bhp = SH_TAILQ_FIRST(dbht, __bh);
- bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh))
+ (void)fprintf(fp,
+ "\tpageno, file, ref, address [LSN] priority\n");
+
+ for (hp = R_ADDR(reginfo, c_mp->htab),
+ bucket = 0; bucket < c_mp->htab_buckets; ++hp, ++bucket) {
+ MUTEX_LOCK(dbenv, &hp->hash_mutex);
+ if ((bhp =
+ SH_TAILQ_FIRST(&hp->hash_bucket, __bh)) != NULL)
+ (void)fprintf(fp, "%lu (%u):\n",
+ (u_long)bucket, hp->hash_priority);
+ for (; bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh))
__memp_pbh(dbmp, bhp, fmap, fp);
+ MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
}
}
- /* Display the LRU list of BH's. */
- if (LF_ISSET(MPOOL_DUMP_LRU)) {
- (void)fprintf(fp, "%s\nBH LRU list\n", DB_LINE);
- (void)fprintf(fp, "pageno, file, ref, address\n");
- for (bhp = SH_TAILQ_FIRST(&c_mp->bhq, __bh);
- bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh))
- __memp_pbh(dbmp, bhp, fmap, fp);
- }
-
/* Dump the memory pool. */
if (LF_ISSET(MPOOL_DUMP_MEM))
__db_shalloc_dump(reginfo->addr, fp);
@@ -360,10 +407,9 @@ __memp_pbh(dbmp, bhp, fmap, fp)
static const FN fn[] = {
{ BH_CALLPGIN, "callpgin" },
{ BH_DIRTY, "dirty" },
+ { BH_DIRTY_CREATE, "created" },
{ BH_DISCARD, "discard" },
{ BH_LOCKED, "locked" },
- { BH_SYNC, "sync" },
- { BH_SYNC_LOGFLSH, "sync:logflush" },
{ BH_TRASH, "trash" },
{ 0, NULL }
};
@@ -374,15 +420,72 @@ __memp_pbh(dbmp, bhp, fmap, fp)
break;
if (fmap[i] == INVALID_ROFF)
- (void)fprintf(fp, " %4lu, %lu, %2lu, %lu",
+ (void)fprintf(fp, "\t%5lu, %lu, %2lu, %8lu [%lu,%lu] %lu",
(u_long)bhp->pgno, (u_long)bhp->mf_offset,
- (u_long)bhp->ref, (u_long)R_OFFSET(dbmp->reginfo, bhp));
+ (u_long)bhp->ref, (u_long)R_OFFSET(dbmp->reginfo, bhp),
+ (u_long)LSN(bhp->buf).file, (u_long)LSN(bhp->buf).offset,
+ (u_long)bhp->priority);
else
- (void)fprintf(fp, " %4lu, #%d, %2lu, %lu",
+ (void)fprintf(fp, "\t%5lu, #%d, %2lu, %8lu [%lu,%lu] %lu",
(u_long)bhp->pgno, i + 1,
- (u_long)bhp->ref, (u_long)R_OFFSET(dbmp->reginfo, bhp));
+ (u_long)bhp->ref, (u_long)R_OFFSET(dbmp->reginfo, bhp),
+ (u_long)LSN(bhp->buf).file, (u_long)LSN(bhp->buf).offset,
+ (u_long)bhp->priority);
__db_prflags(bhp->flags, fn, fp);
(void)fprintf(fp, "\n");
}
+
+/*
+ * __memp_stat_hash --
+ * Total hash bucket stats (other than mutex wait) into the region.
+ *
+ * PUBLIC: void __memp_stat_hash __P((REGINFO *, MPOOL *, u_int32_t *));
+ */
+void
+__memp_stat_hash(reginfo, mp, dirtyp)
+ REGINFO *reginfo;
+ MPOOL *mp;
+ u_int32_t *dirtyp;
+{
+ DB_MPOOL_HASH *hp;
+ u_int32_t dirty;
+ int i;
+
+ hp = R_ADDR(reginfo, mp->htab);
+ for (i = 0, dirty = 0; i < mp->htab_buckets; i++, hp++)
+ dirty += hp->hash_page_dirty;
+ *dirtyp = dirty;
+}
+
+/*
+ * __memp_stat_wait --
+ * Total hash bucket wait stats into the region.
+ */
+static void
+__memp_stat_wait(reginfo, mp, mstat, flags)
+ REGINFO *reginfo;
+ MPOOL *mp;
+ DB_MPOOL_STAT *mstat;
+ int flags;
+{
+ DB_MPOOL_HASH *hp;
+ DB_MUTEX *mutexp;
+ int i;
+
+ mstat->st_hash_max_wait = 0;
+ hp = R_ADDR(reginfo, mp->htab);
+ for (i = 0; i < mp->htab_buckets; i++, hp++) {
+ mutexp = &hp->hash_mutex;
+ mstat->st_hash_nowait += mutexp->mutex_set_nowait;
+ mstat->st_hash_wait += mutexp->mutex_set_wait;
+ if (mutexp->mutex_set_wait > mstat->st_hash_max_wait)
+ mstat->st_hash_max_wait = mutexp->mutex_set_wait;
+
+ if (LF_ISSET(DB_STAT_CLEAR)) {
+ mutexp->mutex_set_wait = 0;
+ mutexp->mutex_set_nowait = 0;
+ }
+ }
+}
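
The DB_STAT_CLEAR handling above repeats one idiom several times: zero the activity counters, but save and restore the fields that describe configuration rather than activity (st_pagesize, st_pages, st_hash_buckets). A reduced sketch of the idiom, with an illustrative structure rather than the real DB_MPOOL_FSTAT:

#include <string.h>

struct mp_fstat_like {
	unsigned long st_pagesize;	/* configuration: preserved */
	unsigned long st_cache_hit;	/* activity: cleared */
	unsigned long st_cache_miss;	/* activity: cleared */
};

static void
stat_clear(struct mp_fstat_like *sp)
{
	unsigned long pagesize;

	pagesize = sp->st_pagesize;	/* save the configuration field, */
	memset(sp, 0, sizeof(*sp));	/* zero the whole structure, */
	sp->st_pagesize = pagesize;	/* then restore what was saved */
}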
diff --git a/bdb/mp/mp_sync.c b/bdb/mp/mp_sync.c
index 1b0751db709..03b42208b39 100644
--- a/bdb/mp/mp_sync.c
+++ b/bdb/mp/mp_sync.c
@@ -1,13 +1,13 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Copyright (c) 1996-2002
* Sleepycat Software. All rights reserved.
*/
#include "db_config.h"
#ifndef lint
-static const char revid[] = "$Id: mp_sync.c,v 11.29 2001/01/11 18:19:53 bostic Exp $";
+static const char revid[] = "$Id: mp_sync.c,v 11.64 2002/08/25 16:00:27 bostic Exp $";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
@@ -16,339 +16,92 @@ static const char revid[] = "$Id: mp_sync.c,v 11.29 2001/01/11 18:19:53 bostic E
#include <stdlib.h>
#endif
-#ifdef HAVE_RPC
-#include "db_server.h"
-#endif
-
#include "db_int.h"
-#include "db_shash.h"
-#include "mp.h"
+#include "dbinc/db_shash.h"
+#include "dbinc/mp.h"
-#ifdef HAVE_RPC
-#include "gen_client_ext.h"
-#include "rpc_client_ext.h"
-#endif
+typedef struct {
+ DB_MPOOL_HASH *track_hp; /* Hash bucket. */
+
+ roff_t track_off; /* Page file offset. */
+ db_pgno_t track_pgno; /* Page number. */
+} BH_TRACK;
static int __bhcmp __P((const void *, const void *));
-static int __memp_fsync __P((DB_MPOOLFILE *));
-static int __memp_sballoc __P((DB_ENV *, BH ***, u_int32_t *));
+static int __memp_close_flush_files __P((DB_ENV *, DB_MPOOL *));
+static int __memp_sync_files __P((DB_ENV *, DB_MPOOL *));
/*
- * memp_sync --
+ * __memp_sync --
* Mpool sync function.
+ *
+ * PUBLIC: int __memp_sync __P((DB_ENV *, DB_LSN *));
*/
int
-memp_sync(dbenv, lsnp)
+__memp_sync(dbenv, lsnp)
DB_ENV *dbenv;
DB_LSN *lsnp;
{
- BH *bhp, **bharray;
DB_MPOOL *dbmp;
- DB_LSN tlsn;
- MPOOL *c_mp, *mp;
- MPOOLFILE *mfp;
- u_int32_t ar_cnt, i, ndirty;
- int ret, retry_done, retry_need, wrote;
-
-#ifdef HAVE_RPC
- if (F_ISSET(dbenv, DB_ENV_RPCCLIENT))
- return (__dbcl_memp_sync(dbenv, lsnp));
-#endif
+ MPOOL *mp;
+ int ret;
PANIC_CHECK(dbenv);
- ENV_REQUIRES_CONFIG(dbenv, dbenv->mp_handle, DB_INIT_MPOOL);
-
- dbmp = dbenv->mp_handle;
- mp = dbmp->reginfo[0].primary;
+ ENV_REQUIRES_CONFIG(dbenv,
+ dbenv->mp_handle, "memp_sync", DB_INIT_MPOOL);
/*
- * If no LSN is provided, flush the entire cache.
- *
- * !!!
- * Our current behavior is to flush the entire cache, so there's
- * nothing special we have to do here other than deal with NULL
- * pointers.
+ * If no LSN is provided, flush the entire cache (reasonable usage
+ * even if there's no log subsystem configured).
*/
- if (lsnp == NULL) {
- ZERO_LSN(tlsn);
- lsnp = &tlsn;
- F_SET(mp, MP_LSN_RETRY);
- } else if (!LOGGING_ON(dbenv)) {
- __db_err(dbenv, "memp_sync: requires logging");
- return (EINVAL);
- }
+ if (lsnp != NULL)
+ ENV_REQUIRES_CONFIG(dbenv,
+ dbenv->lg_handle, "memp_sync", DB_INIT_LOG);
- /*
- * Sync calls are single-threaded so that we don't have multiple
- * threads, with different checkpoint LSNs, walking the caches
- * and updating the checkpoint LSNs and how many buffers remain
- * to be written for the checkpoint. This shouldn't be a problem,
- * any application that has multiple checkpoint threads isn't what
- * I'd call trustworthy.
- */
- MUTEX_LOCK(dbenv, &mp->sync_mutex, dbenv->lockfhp);
+ dbmp = dbenv->mp_handle;
+ mp = dbmp->reginfo[0].primary;
- /*
- * If the application is asking about a previous call to memp_sync(),
- * and we haven't found any buffers that the application holding the
- * pin couldn't write, return yes or no based on the current count.
- * Note, if the application is asking about a LSN *smaller* than one
- * we've already handled or are currently handling, then we return a
- * result based on the count for the larger LSN.
- */
- R_LOCK(dbenv, dbmp->reginfo);
- if (!IS_ZERO_LSN(*lsnp) &&
- !F_ISSET(mp, MP_LSN_RETRY) && log_compare(lsnp, &mp->lsn) <= 0) {
- if (mp->lsn_cnt == 0) {
+ /* If we've flushed to the requested LSN, return that information. */
+ if (lsnp != NULL) {
+ R_LOCK(dbenv, dbmp->reginfo);
+ if (log_compare(lsnp, &mp->lsn) <= 0) {
*lsnp = mp->lsn;
- ret = 0;
- } else
- ret = DB_INCOMPLETE;
+ R_UNLOCK(dbenv, dbmp->reginfo);
+ return (0);
+ }
R_UNLOCK(dbenv, dbmp->reginfo);
- MUTEX_UNLOCK(dbenv, &mp->sync_mutex);
- return (ret);
}
- /*
- * Allocate room for a list of buffers, and decide how many buffers
- * we can pin down.
- *
- * !!!
- * Note: __memp_sballoc has released the region lock if we're not
- * continuing forward.
- */
- if ((ret =
- __memp_sballoc(dbenv, &bharray, &ndirty)) != 0 || ndirty == 0) {
- MUTEX_UNLOCK(dbenv, &mp->sync_mutex);
+ if ((ret = __memp_sync_int(dbenv, NULL, 0, DB_SYNC_CACHE, NULL)) != 0)
return (ret);
- }
- retry_done = 0;
-retry: retry_need = 0;
- /*
- * Start a new checkpoint.
- *
- * Save the LSN. We know that it's a new LSN, a retry, or larger than
- * the one for which we were already doing a checkpoint. (BTW, I don't
- * expect to see multiple LSN's from the same or multiple processes,
- * but You Just Never Know. Responding as if they all called with the
- * largest of the LSNs specified makes everything work.)
- *
- * We don't currently use the LSN we save. We could potentially save
- * the last-written LSN in each buffer header and use it to determine
- * what buffers need to be written. The problem with this is that it's
- * sizeof(LSN) more bytes of buffer header. We currently write all the
- * dirty buffers instead, but with a sufficiently large cache that's
- * going to be a problem.
- */
- mp->lsn = *lsnp;
-
- /*
- * Clear the global count of buffers waiting to be written, walk the
- * list of files clearing the count of buffers waiting to be written.
- *
- * Clear the retry flag.
- */
- mp->lsn_cnt = 0;
- for (mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile);
- mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile))
- mfp->lsn_cnt = 0;
- F_CLR(mp, MP_LSN_RETRY);
-
- /*
- * Walk each cache's list of buffers and mark all dirty buffers to be
- * written and all pinned buffers to be potentially written (we can't
- * know if they'll need to be written until the holder returns them to
- * the cache). We do this in one pass while holding the region locked
- * so that processes can't make new buffers dirty, causing us to never
- * finish. Since the application may have restarted the sync using a
- * different LSN value, clear any BH_SYNC | BH_SYNC_LOGFLSH flags that
- * appear leftover from previous calls.
- *
- * Keep a count of the total number of buffers we need to write in
- * MPOOL->lsn_cnt, and for each file, in MPOOLFILE->lsn_count.
- */
- for (ar_cnt = 0, i = 0; i < mp->nreg; ++i) {
- c_mp = dbmp->reginfo[i].primary;
- for (bhp = SH_TAILQ_FIRST(&c_mp->bhq, __bh);
- bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh)) {
- if (F_ISSET(bhp, BH_DIRTY) || bhp->ref != 0) {
- F_SET(bhp, BH_SYNC);
-
- ++mp->lsn_cnt;
-
- mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
- ++mfp->lsn_cnt;
-
- /*
- * If the buffer isn't being used, we can write
- * it immediately, so increment its reference
- * count to lock it down, and save a reference
- * to it.
- *
- * If we've run out space to store buffer refs,
- * we're screwed. We don't want to realloc the
- * array while holding a region lock, so we set
- * a flag and deal with it later.
- */
- if (bhp->ref == 0) {
- ++bhp->ref;
- bharray[ar_cnt] = bhp;
-
- if (++ar_cnt >= ndirty) {
- retry_need = 1;
- break;
- }
- }
- } else
- if (F_ISSET(bhp, BH_SYNC))
- F_CLR(bhp, BH_SYNC | BH_SYNC_LOGFLSH);
- }
- if (ar_cnt >= ndirty)
- break;
- }
-
- /* If there no buffers we can write immediately, we're done. */
- if (ar_cnt == 0) {
- ret = mp->lsn_cnt ? DB_INCOMPLETE : 0;
- goto done;
- }
-
- R_UNLOCK(dbenv, dbmp->reginfo);
-
- /*
- * Sort the buffers we're going to write immediately.
- *
- * We try and write the buffers in file/page order: it should reduce
- * seeks by the underlying filesystem and possibly reduce the actual
- * number of writes.
- */
- if (ar_cnt > 1)
- qsort(bharray, ar_cnt, sizeof(BH *), __bhcmp);
-
- /*
- * Flush the log. We have to ensure the log records reflecting the
- * changes on the database pages we're writing have already made it
- * to disk. We usually do that as we write each page, but if we
- * are going to write a large number of pages, repeatedly acquiring
- * the log region lock is going to be expensive. Flush the entire
- * log now, so that sync doesn't require any more log flushes.
- */
- if (LOGGING_ON(dbenv) && (ret = log_flush(dbenv, NULL)) != 0)
- goto done;
-
- R_LOCK(dbenv, dbmp->reginfo);
-
- /* Walk the array, writing buffers. */
- for (i = 0; i < ar_cnt; ++i) {
- /*
- * It's possible for a thread to have gotten the buffer since
- * we listed it for writing. If the reference count is still
- * 1, we're the only ones using the buffer, go ahead and write.
- * If it's >1, then skip the buffer and assume that it will be
- * written when it's returned to the cache.
- */
- if (bharray[i]->ref > 1) {
- --bharray[i]->ref;
- continue;
- }
-
- /* Write the buffer. */
- mfp = R_ADDR(dbmp->reginfo, bharray[i]->mf_offset);
- ret = __memp_bhwrite(dbmp, mfp, bharray[i], NULL, &wrote);
-
- /* Release the buffer. */
- --bharray[i]->ref;
-
- if (ret == 0 && wrote)
- continue;
-
- /*
- * Any process syncing the shared memory buffer pool had best
- * be able to write to any underlying file. Be understanding,
- * but firm, on this point.
- */
- if (ret == 0) {
- __db_err(dbenv, "%s: unable to flush page: %lu",
- __memp_fns(dbmp, mfp), (u_long)bharray[i]->pgno);
- ret = EPERM;
- }
-
- /*
- * On error, clear MPOOL->lsn and set MP_LSN_RETRY so that no
- * future checkpoint return can depend on this failure. Clear
- * the buffer's BH_SYNC flag, because it's used to determine
- * if lsn_cnt values are incremented/decremented. Don't bother
- * to reset/clear:
- *
- * MPOOL->lsn_cnt
- * MPOOLFILE->lsn_cnt
- *
- * they don't make any difference.
- */
- ZERO_LSN(mp->lsn);
- F_SET(mp, MP_LSN_RETRY);
-
- /* Release any buffers we're still pinning down. */
- while (++i < ar_cnt) {
- bhp = bharray[i];
- --bhp->ref;
- F_CLR(bhp, BH_SYNC | BH_SYNC_LOGFLSH);
- }
-
- goto done;
- }
-
- ret = mp->lsn_cnt != 0 ? DB_INCOMPLETE : 0;
-
- /*
- * If there were too many buffers and we're not returning an error, we
- * re-try the checkpoint once -- since we allocated 80% of the total
- * buffer count, once should be enough. If it still doesn't work, some
- * other thread of control is dirtying buffers as fast as we're writing
- * them, and we might as well give up for now. In the latter case, set
- * the global retry flag, we'll have to start from scratch on the next
- * checkpoint.
- */
- if (retry_need) {
- if (retry_done) {
- ret = DB_INCOMPLETE;
- F_SET(mp, MP_LSN_RETRY);
- } else {
- retry_done = 1;
- goto retry;
- }
+ if (lsnp != NULL) {
+ R_LOCK(dbenv, dbmp->reginfo);
+ if (log_compare(lsnp, &mp->lsn) > 0)
+ mp->lsn = *lsnp;
+ R_UNLOCK(dbenv, dbmp->reginfo);
}
-done: R_UNLOCK(dbenv, dbmp->reginfo);
- MUTEX_UNLOCK(dbenv, &mp->sync_mutex);
-
- __os_free(bharray, ndirty * sizeof(BH *));
-
- return (ret);
+ return (0);
}
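
The rewritten __memp_sync above reduces the old checkpoint bookkeeping to a compare-flush-advance protocol around the stored LSN. A self-contained sketch of that protocol, with a pthread mutex standing in for the region lock and flush_cache() standing in for the __memp_sync_int call; all names here are invented:

#include <pthread.h>

struct lsn { unsigned long file, offset; };

static struct lsn synced_lsn;		/* stand-in for mp->lsn */
static pthread_mutex_t region = PTHREAD_MUTEX_INITIALIZER;

static int
lsn_cmp(const struct lsn *a, const struct lsn *b)
{
	if (a->file != b->file)
		return (a->file < b->file ? -1 : 1);
	return (a->offset < b->offset ? -1 :
	    a->offset > b->offset ? 1 : 0);
}

static int
flush_cache(void)			/* stands in for the full cache flush */
{
	return (0);
}

int
sync_to_lsn(struct lsn *lsnp)
{
	int ret;

	/* Fast path: a previous sync already flushed past this LSN. */
	pthread_mutex_lock(&region);
	if (lsn_cmp(lsnp, &synced_lsn) <= 0) {
		*lsnp = synced_lsn;
		pthread_mutex_unlock(&region);
		return (0);
	}
	pthread_mutex_unlock(&region);

	if ((ret = flush_cache()) != 0)
		return (ret);

	/* Advance the recorded LSN to what we just flushed through. */
	pthread_mutex_lock(&region);
	if (lsn_cmp(lsnp, &synced_lsn) > 0)
		synced_lsn = *lsnp;
	pthread_mutex_unlock(&region);
	return (0);
}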
/*
- * memp_fsync --
+ * __memp_fsync --
* Mpool file sync function.
+ *
+ * PUBLIC: int __memp_fsync __P((DB_MPOOLFILE *));
*/
int
-memp_fsync(dbmfp)
+__memp_fsync(dbmfp)
DB_MPOOLFILE *dbmfp;
{
DB_ENV *dbenv;
DB_MPOOL *dbmp;
- int is_tmp;
dbmp = dbmfp->dbmp;
dbenv = dbmp->dbenv;
-#ifdef HAVE_RPC
- if (F_ISSET(dbenv, DB_ENV_RPCCLIENT))
- return (__dbcl_memp_fsync(dbmfp));
-#endif
-
PANIC_CHECK(dbenv);
/*
@@ -359,13 +112,10 @@ memp_fsync(dbmfp)
if (F_ISSET(dbmfp, MP_READONLY))
return (0);
- R_LOCK(dbenv, dbmp->reginfo);
- is_tmp = F_ISSET(dbmfp->mfp, MP_TEMP);
- R_UNLOCK(dbenv, dbmp->reginfo);
- if (is_tmp)
+ if (F_ISSET(dbmfp->mfp, MP_TEMP))
return (0);
- return (__memp_fsync(dbmfp));
+ return (__memp_sync_int(dbenv, dbmfp, 0, DB_SYNC_FILE, NULL));
}
/*
@@ -379,6 +129,7 @@ __mp_xxx_fh(dbmfp, fhp)
DB_MPOOLFILE *dbmfp;
DB_FH **fhp;
{
+ DB_ENV *dbenv;
/*
* This is a truly spectacular layering violation, intended ONLY to
* support compatibility for the DB 1.85 DB->fd call.
@@ -393,239 +144,457 @@ __mp_xxx_fh(dbmfp, fhp)
* because we want to write to the backing file regardless so that
* we get a file descriptor to return.
*/
- *fhp = &dbmfp->fh;
- return (F_ISSET(&dbmfp->fh, DB_FH_VALID) ? 0 : __memp_fsync(dbmfp));
+ *fhp = dbmfp->fhp;
+ if (F_ISSET(dbmfp->fhp, DB_FH_VALID))
+ return (0);
+ dbenv = dbmfp->dbmp->dbenv;
+
+ return (__memp_sync_int(dbenv, dbmfp, 0, DB_SYNC_FILE, NULL));
}
/*
- * __memp_fsync --
- * Mpool file internal sync function.
+ * __memp_sync_int --
+ * Mpool sync internal function.
+ *
+ * PUBLIC: int __memp_sync_int
+ * PUBLIC: __P((DB_ENV *, DB_MPOOLFILE *, int, db_sync_op, int *));
*/
-static int
-__memp_fsync(dbmfp)
+int
+__memp_sync_int(dbenv, dbmfp, ar_max, op, wrotep)
+ DB_ENV *dbenv;
DB_MPOOLFILE *dbmfp;
+ int ar_max, *wrotep;
+ db_sync_op op;
{
- BH *bhp, **bharray;
- DB_ENV *dbenv;
+ BH *bhp;
+ BH_TRACK *bharray;
DB_MPOOL *dbmp;
+ DB_MPOOL_HASH *hp;
+ DB_MUTEX *mutexp;
MPOOL *c_mp, *mp;
- size_t mf_offset;
- u_int32_t ar_cnt, i, ndirty;
- int incomplete, ret, retry_done, retry_need, wrote;
+ MPOOLFILE *mfp;
+ u_int32_t n_cache;
+ int ar_cnt, hb_lock, i, pass, remaining, ret, t_ret, wait_cnt, wrote;
- dbmp = dbmfp->dbmp;
- dbenv = dbmp->dbenv;
+ dbmp = dbenv->mp_handle;
mp = dbmp->reginfo[0].primary;
-
- R_LOCK(dbenv, dbmp->reginfo);
+ pass = wrote = 0;
/*
- * Allocate room for a list of buffers, and decide how many buffers
- * we can pin down.
- *
- * !!!
- * Note: __memp_sballoc has released our region lock if we're not
- * continuing forward.
+	 * If the caller does not specify how many pages, assume one
+ * per bucket.
*/
+ if (ar_max == 0)
+ ar_max = mp->nreg * mp->htab_buckets;
+
if ((ret =
- __memp_sballoc(dbenv, &bharray, &ndirty)) != 0 || ndirty == 0)
+ __os_malloc(dbenv, ar_max * sizeof(BH_TRACK), &bharray)) != 0)
return (ret);
- retry_done = 0;
-retry: retry_need = 0;
/*
* Walk each cache's list of buffers and mark all dirty buffers to be
- * written and all pinned buffers to be potentially written (we can't
- * know if they'll need to be written until the holder returns them to
- * the cache). We do this in one pass while holding the region locked
- * so that processes can't make new buffers dirty, causing us to never
- * finish.
+ * written and all pinned buffers to be potentially written, depending
+ * on our flags.
*/
- mf_offset = R_OFFSET(dbmp->reginfo, dbmfp->mfp);
- for (ar_cnt = 0, incomplete = 0, i = 0; i < mp->nreg; ++i) {
- c_mp = dbmp->reginfo[i].primary;
- for (bhp = SH_TAILQ_FIRST(&c_mp->bhq, __bh);
- bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh)) {
- if (!F_ISSET(bhp, BH_DIRTY) ||
- bhp->mf_offset != mf_offset)
- continue;
- if (bhp->ref != 0 || F_ISSET(bhp, BH_LOCKED)) {
- incomplete = 1;
- continue;
- }
+ for (ar_cnt = 0, n_cache = 0; n_cache < mp->nreg; ++n_cache) {
+ c_mp = dbmp->reginfo[n_cache].primary;
+ hp = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab);
+ for (i = 0; i < c_mp->htab_buckets; i++, hp++) {
/*
- * If the buffer isn't being used, we can write
- * it immediately, so increment its reference
- * count to lock it down, and save a reference
- * to it.
- *
- * If we've run out space to store buffer refs,
- * we're screwed. We don't want to realloc the
- * array while holding a region lock, so we set
- * a flag and deal with it later.
+ * We can check for empty buckets before locking as we
+ * only care if the pointer is zero or non-zero. We
+			 * can ignore empty buckets because we only need to
+			 * write buffers that were dirty before we started.
*/
- ++bhp->ref;
- bharray[ar_cnt] = bhp;
- if (++ar_cnt >= ndirty) {
- retry_need = 1;
- break;
+ if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL)
+ continue;
+
+ MUTEX_LOCK(dbenv, &hp->hash_mutex);
+ for (bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
+ bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) {
+ /* Always ignore unreferenced, clean pages. */
+ if (bhp->ref == 0 && !F_ISSET(bhp, BH_DIRTY))
+ continue;
+
+ /*
+ * Checkpoints have to wait on all pinned pages,
+ * as pages may be marked dirty when returned to
+ * the cache.
+ *
+ * File syncs only wait on pages both pinned and
+ * dirty. (We don't care if pages are marked
+				 * dirty when returned to the cache; that means
+ * there's another writing thread and flushing
+ * the cache for this handle is meaningless.)
+ */
+ if (op == DB_SYNC_FILE &&
+ !F_ISSET(bhp, BH_DIRTY))
+ continue;
+
+ mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
+
+ /*
+ * Ignore temporary files -- this means you
+ * can't even flush temporary files by handle.
+ * (Checkpoint doesn't require temporary files
+				 * be flushed and the underlying buffer write
+				 * routine may not be able to write it anyway.)
+ */
+ if (F_ISSET(mfp, MP_TEMP))
+ continue;
+
+ /*
+ * If we're flushing a specific file, see if
+ * this page is from that file.
+ */
+ if (dbmfp != NULL && mfp != dbmfp->mfp)
+ continue;
+
+ /*
+ * Ignore files that aren't involved in DB's
+ * transactional operations during checkpoints.
+ */
+ if (dbmfp == NULL && mfp->lsn_off == -1)
+ continue;
+
+ /* Track the buffer, we want it. */
+ bharray[ar_cnt].track_hp = hp;
+ bharray[ar_cnt].track_pgno = bhp->pgno;
+ bharray[ar_cnt].track_off = bhp->mf_offset;
+ ar_cnt++;
+
+ if (ar_cnt >= ar_max) {
+ if ((ret = __os_realloc(dbenv,
+ (ar_max * 2) * sizeof(BH_TRACK),
+ &bharray)) != 0)
+ break;
+ ar_max *= 2;
+ }
}
+ MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
+
+ if (ret != 0)
+ goto err;
}
- if (ar_cnt >= ndirty)
- break;
}
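
The collection pass records where each dirty buffer lives (hash bucket, page number, file offset) instead of holding buffer pointers, and doubles the array when the initial guess runs out. A self-contained toy of that snapshot-and-grow pattern, with plain types standing in for BH_TRACK and __os_realloc:

#include <stdio.h>
#include <stdlib.h>

typedef struct {			/* stands in for BH_TRACK */
	unsigned long track_off;	/* file's region offset */
	unsigned long track_pgno;	/* page number */
} track_t;

int
main()
{
	track_t *ar, *t;
	int ar_cnt, ar_max, i;

	ar_max = 4;			/* initial guess, like ar_max above */
	if ((ar = malloc(ar_max * sizeof(track_t))) == NULL)
		return (1);
	for (ar_cnt = 0; ar_cnt < 10; ++ar_cnt) {
		if (ar_cnt >= ar_max) {	/* grow by doubling */
			if ((t = realloc(ar,
			    (ar_max * 2) * sizeof(track_t))) == NULL) {
				free(ar);
				return (1);
			}
			ar = t;
			ar_max *= 2;
		}
		ar[ar_cnt].track_off = ar_cnt % 3;
		ar[ar_cnt].track_pgno = ar_cnt;
	}
	for (i = 0; i < ar_cnt; ++i)
		printf("%lu/%lu\n", ar[i].track_off, ar[i].track_pgno);
	free(ar);
	return (0);
}
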
- /* If there no buffers we can write immediately, we're done. */
- if (ar_cnt == 0) {
- ret = 0;
+	/* If there are no buffers to write, we're done. */
+ if (ar_cnt == 0)
goto done;
- }
- R_UNLOCK(dbenv, dbmp->reginfo);
-
- /* Sort the buffers we're going to write. */
+ /*
+ * Write the buffers in file/page order, trying to reduce seeks by the
+ * filesystem and, when pages are smaller than filesystem block sizes,
+	 * to reduce the actual number of writes.
+ */
if (ar_cnt > 1)
- qsort(bharray, ar_cnt, sizeof(BH *), __bhcmp);
+ qsort(bharray, ar_cnt, sizeof(BH_TRACK), __bhcmp);
- R_LOCK(dbenv, dbmp->reginfo);
+ /*
+ * If we're trickling buffers, only write enough to reach the correct
+ * percentage for this region. We may not write enough if the dirty
+ * buffers have an unbalanced distribution among the regions, but that
+ * seems unlikely.
+ */
+ if (op == DB_SYNC_TRICKLE && ar_cnt > ar_max / (int)mp->nreg)
+ ar_cnt = ar_max / (int)mp->nreg;
+
+ /*
+ * Flush the log. We have to ensure the log records reflecting the
+ * changes on the database pages we're writing have already made it
+ * to disk. We still have to check the log each time we write a page
+ * (because pages we are about to write may be modified after we have
+ * flushed the log), but in general this will at least avoid any I/O
+ * on the log's part.
+ */
+ if (LOGGING_ON(dbenv) && (ret = dbenv->log_flush(dbenv, NULL)) != 0)
+ goto err;
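
The log flush enforces the write-ahead rule: no database page may reach disk before the log records describing its changes are stable. A minimal sketch of the per-page comparison the comment alludes to, using a toy LSN type in place of DB_LSN and log_compare:

#include <stdio.h>

typedef struct { unsigned int file, offset; } lsn_t;	/* toy DB_LSN */

/* Toy log_compare: which LSN is further along in the log? */
static int
lsn_cmp(const lsn_t *a, const lsn_t *b)
{
	if (a->file != b->file)
		return (a->file < b->file ? -1 : 1);
	if (a->offset != b->offset)
		return (a->offset < b->offset ? -1 : 1);
	return (0);
}

int
main()
{
	lsn_t page_lsn = {1, 500};	/* LSN stamped on a dirty page */
	lsn_t flushed = {1, 200};	/* log known stable up to here */

	/* The page may not go to disk while its LSN is past the
	 * durable end of the log; flush the log first. */
	if (lsn_cmp(&page_lsn, &flushed) > 0)
		printf("flush log through 1/500 before writing page\n");
	return (0);
}
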
+
+ /*
+ * Walk the array, writing buffers. When we write a buffer, we NULL
+ * out its hash bucket pointer so we don't process a slot more than
+ * once.
+ */
+ for (remaining = ar_cnt, i = pass = 0; remaining > 0; ++i) {
+ if (i >= ar_cnt) {
+ i = 0;
+ ++pass;
+ __os_sleep(dbenv, 1, 0);
+ }
+ if ((hp = bharray[i].track_hp) == NULL)
+ continue;
+
+ /* Lock the hash bucket and find the buffer. */
+ mutexp = &hp->hash_mutex;
+ MUTEX_LOCK(dbenv, mutexp);
+ for (bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
+ bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh))
+ if (bhp->pgno == bharray[i].track_pgno &&
+ bhp->mf_offset == bharray[i].track_off)
+ break;
- /* Walk the array, writing buffers. */
- for (i = 0; i < ar_cnt;) {
/*
- * It's possible for a thread to have gotten the buffer since
- * we listed it for writing. If the reference count is still
- * 1, we're the only ones using the buffer, go ahead and write.
- * If it's >1, then skip the buffer and assume that it will be
- * written when it's returned to the cache.
+		 * If we can't find the buffer, we're done; somebody else
+		 * must have written it.
+		 *
+		 * If the buffer isn't pinned or dirty, we're done; there's
+		 * no work needed.
*/
- if (bharray[i]->ref > 1) {
- incomplete = 1;
- --bharray[i++]->ref;
+ if (bhp == NULL || (bhp->ref == 0 && !F_ISSET(bhp, BH_DIRTY))) {
+ MUTEX_UNLOCK(dbenv, mutexp);
+ --remaining;
+ bharray[i].track_hp = NULL;
continue;
}
- /* Write the buffer. */
- ret = __memp_pgwrite(dbmp, dbmfp, bharray[i], NULL, &wrote);
+ /*
+ * If the buffer is locked by another thread, ignore it, we'll
+ * come back to it.
+ *
+ * If the buffer is pinned and it's only the first or second
+ * time we have looked at it, ignore it, we'll come back to
+ * it.
+ *
+ * In either case, skip the buffer if we're not required to
+ * write it.
+ */
+ if (F_ISSET(bhp, BH_LOCKED) || (bhp->ref != 0 && pass < 2)) {
+ MUTEX_UNLOCK(dbenv, mutexp);
+ if (op != DB_SYNC_CACHE && op != DB_SYNC_FILE) {
+ --remaining;
+ bharray[i].track_hp = NULL;
+ }
+ continue;
+ }
+
+ /*
+ * The buffer is either pinned or dirty.
+ *
+ * Set the sync wait-for count, used to count down outstanding
+ * references to this buffer as they are returned to the cache.
+ */
+ bhp->ref_sync = bhp->ref;
- /* Release the buffer. */
- --bharray[i++]->ref;
+ /* Pin the buffer into memory and lock it. */
+ ++bhp->ref;
+ F_SET(bhp, BH_LOCKED);
+ MUTEX_LOCK(dbenv, &bhp->mutex);
- if (ret == 0) {
- if (!wrote)
- incomplete = 1;
- continue;
+ /*
+ * Unlock the hash bucket and wait for the wait-for count to
+ * go to 0. No new thread can acquire the buffer because we
+ * have it locked.
+ *
+ * If a thread attempts to re-pin a page, the wait-for count
+ * will never go to 0 (the thread spins on our buffer lock,
+ * while we spin on the thread's ref count). Give up if we
+		 * don't get the buffer in 3 seconds; we can try again later.
+ *
+ * If, when the wait-for count goes to 0, the buffer is found
+ * to be dirty, write it.
+ */
+ MUTEX_UNLOCK(dbenv, mutexp);
+ for (wait_cnt = 1;
+ bhp->ref_sync != 0 && wait_cnt < 4; ++wait_cnt)
+ __os_sleep(dbenv, 1, 0);
+ MUTEX_LOCK(dbenv, mutexp);
+ hb_lock = 1;
+
+ /*
+ * If the ref_sync count has gone to 0, we're going to be done
+ * with this buffer no matter what happens.
+ */
+ if (bhp->ref_sync == 0) {
+ --remaining;
+ bharray[i].track_hp = NULL;
}
/*
- * On error:
+ * If the ref_sync count has gone to 0 and the buffer is still
+ * dirty, we write it. We only try to write the buffer once.
+ * Any process checkpointing or trickle-flushing the pool
+ * must be able to write any underlying file -- if the write
+		 * fails, error out.  It would be very strange for a file
+		 * sync to fail to write, but if it does, we don't care.
+ */
+ if (bhp->ref_sync == 0 && F_ISSET(bhp, BH_DIRTY)) {
+ hb_lock = 0;
+ MUTEX_UNLOCK(dbenv, mutexp);
+
+ mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
+ if ((ret = __memp_bhwrite(dbmp, hp, mfp, bhp, 1)) == 0)
+ ++wrote;
+ else if (op == DB_SYNC_CACHE || op == DB_SYNC_TRICKLE)
+ __db_err(dbenv, "%s: unable to flush page: %lu",
+ __memp_fns(dbmp, mfp), (u_long)bhp->pgno);
+ else
+ ret = 0;
+ }
+
+ /*
+ * If ref_sync count never went to 0, the buffer was written
+ * by another thread, or the write failed, we still have the
+ * buffer locked.
+ *
+ * We may or may not currently hold the hash bucket mutex. If
+ * the __memp_bhwrite -> __memp_pgwrite call was successful,
+ * then __memp_pgwrite will have swapped the buffer lock for
+ * the hash lock. All other call paths will leave us without
+ * the hash bucket lock.
*
- * Release any buffers we're still pinning down.
+ * The order of mutexes above was to acquire the buffer lock
+ * while holding the hash bucket lock. Don't deadlock here,
+ * release the buffer lock and then acquire the hash bucket
+ * lock.
*/
- while (i < ar_cnt)
- --bharray[i++]->ref;
- break;
- }
+ if (F_ISSET(bhp, BH_LOCKED)) {
+ F_CLR(bhp, BH_LOCKED);
+ MUTEX_UNLOCK(dbenv, &bhp->mutex);
- /*
- * If there were too many buffers and we're not returning an error, we
- * re-try the flush once -- since we allocated 80% of the total
- * buffer count, once should be enough. If it still doesn't work, some
- * other thread of control is dirtying buffers as fast as we're writing
- * them, and we might as well give up.
- */
- if (retry_need) {
- if (retry_done)
- incomplete = 1;
- else {
- retry_done = 1;
- goto retry;
+ if (!hb_lock)
+ MUTEX_LOCK(dbenv, mutexp);
}
- }
-done: R_UNLOCK(dbenv, dbmp->reginfo);
+ /*
+		 * Reset the ref_sync count regardless of our success; we're
+ * done with this buffer for now.
+ */
+ bhp->ref_sync = 0;
+
+ /* Discard our reference and unlock the bucket. */
+ --bhp->ref;
+ MUTEX_UNLOCK(dbenv, mutexp);
- __os_free(bharray, ndirty * sizeof(BH *));
+ if (ret != 0)
+ break;
+ }
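
The writer loop trades blocking for passes: locked or pinned buffers are skipped and revisited, each wrap of the array costs a one-second sleep, and only on pass 2 and later do cache- and file-wide syncs sit out the ref_sync countdown (which is presumably decremented as holders return pages through the mp_fput.c path in this same change). A runnable toy of the pass discipline, with made-up flags standing in for the pin state:

#include <stdio.h>

#define	NBUF	5

int
main()
{
	int busy[NBUF] = {0, 1, 0, 1, 0};	/* toy "pinned" flags */
	int done[NBUF] = {0};
	int i, pass, remaining;

	for (remaining = NBUF, i = pass = 0; remaining > 0; ++i) {
		if (i >= NBUF) {	/* wrapped: start another pass */
			i = 0;
			++pass;
			/* __os_sleep(dbenv, 1, 0) goes here above */
		}
		if (done[i])
			continue;
		if (busy[i] && pass < 2)
			continue;	/* revisit on a later pass */
		done[i] = 1;		/* "write" the buffer */
		--remaining;
		printf("wrote slot %d on pass %d\n", i, pass);
	}
	return (0);
}
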
+
+done: /* If we've opened files to flush pages, close them. */
+ if ((t_ret = __memp_close_flush_files(dbenv, dbmp)) != 0 && ret == 0)
+ ret = t_ret;
/*
- * Sync the underlying file as the last thing we do, so that the OS
- * has a maximal opportunity to flush buffers before we request it.
- *
- * !!!:
- * Don't lock the region around the sync, fsync(2) has no atomicity
- * issues.
+ * If doing a checkpoint or flushing a file for the application, we
+ * have to force the pages to disk. We don't do this as we go along
+ * because we want to give the OS as much time as possible to lazily
+ * flush, and because we have to flush files that might not even have
+ * had dirty buffers in the cache, so we have to walk the files list.
*/
- if (ret == 0)
- ret = incomplete ?
- DB_INCOMPLETE : __os_fsync(dbenv, &dbmfp->fh);
+ if (ret == 0 && (op == DB_SYNC_CACHE || op == DB_SYNC_FILE)) {
+ if (dbmfp == NULL)
+ ret = __memp_sync_files(dbenv, dbmp);
+ else
+ ret = __os_fsync(dbenv, dbmfp->fhp);
+ }
+
+err: __os_free(dbenv, bharray);
+ if (wrotep != NULL)
+ *wrotep = wrote;
return (ret);
}
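
With checkpoint, per-file flush, and trickle folded into one routine, the callers differ only in their arguments. A hedged sketch of the three entry points as this diff suggests them (`need` and `wrote` are illustrative locals; the checkpoint call site is outside this hunk):

	/* Checkpoint: every file's dirty pages, then fsync the files. */
	ret = __memp_sync_int(dbenv, NULL, 0, DB_SYNC_CACHE, NULL);

	/* Per-handle flush, as in the fd-returning path above. */
	ret = __memp_sync_int(dbenv, dbmfp, 0, DB_SYNC_FILE, NULL);

	/* Trickle: write at most `need` buffers, report how many. */
	ret = __memp_sync_int(dbenv, NULL, need, DB_SYNC_TRICKLE, &wrote);
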
/*
- * __memp_sballoc --
- * Allocate room for a list of buffers.
+ * __memp_sync_files --
+ * Sync all the files in the environment, open or not.
*/
-static int
-__memp_sballoc(dbenv, bharrayp, ndirtyp)
+static int
+__memp_sync_files(dbenv, dbmp)
DB_ENV *dbenv;
- BH ***bharrayp;
- u_int32_t *ndirtyp;
-{
DB_MPOOL *dbmp;
- MPOOL *c_mp, *mp;
- u_int32_t i, nclean, ndirty, maxpin;
- int ret;
+{
+ DB_MPOOLFILE *dbmfp;
+ MPOOL *mp;
+ MPOOLFILE *mfp;
+ int ret, t_ret;
- dbmp = dbenv->mp_handle;
+ ret = 0;
mp = dbmp->reginfo[0].primary;
- /*
- * We don't want to hold the region lock while we write the buffers,
- * so only lock it while we create a list.
- *
- * Walk through the list of caches, figuring out how many buffers
- * we're going to need.
- *
- * Make a point of not holding the region lock across the library
- * allocation call.
- */
- for (nclean = ndirty = 0, i = 0; i < mp->nreg; ++i) {
- c_mp = dbmp->reginfo[i].primary;
- ndirty += c_mp->stat.st_page_dirty;
- nclean += c_mp->stat.st_page_clean;
+ R_LOCK(dbenv, dbmp->reginfo);
+ for (mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile);
+ mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) {
+ if (mfp->stat.st_page_out == 0 ||
+ F_ISSET(mfp, MP_DEADFILE | MP_TEMP))
+ continue;
+
+ /* Look for an already open handle. */
+ ret = 0;
+ MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp);
+ for (dbmfp = TAILQ_FIRST(&dbmp->dbmfq);
+ dbmfp != NULL; dbmfp = TAILQ_NEXT(dbmfp, q))
+ if (dbmfp->mfp == mfp) {
+ ret = __os_fsync(dbenv, dbmfp->fhp);
+ break;
+ }
+ MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
+ if (ret != 0)
+ goto err;
+
+ /* If we don't find one, open one. */
+ if (dbmfp == NULL) {
+ if ((ret = dbenv->memp_fcreate(dbenv, &dbmfp, 0)) != 0)
+ goto err;
+ ret = __memp_fopen_int(
+ dbmfp, mfp, R_ADDR(dbmp->reginfo, mfp->path_off),
+ 0, 0, mfp->stat.st_pagesize);
+ if (ret == 0)
+ ret = __os_fsync(dbenv, dbmfp->fhp);
+ if ((t_ret =
+ __memp_fclose_int(dbmfp, 0)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ goto err;
+ }
}
- R_UNLOCK(dbenv, dbmp->reginfo);
- if (ndirty == 0) {
- *ndirtyp = 0;
- return (0);
+
+ if (0) {
+err: __db_err(dbenv, "%s: cannot sync: %s",
+ R_ADDR(dbmp->reginfo, mfp->path_off), db_strerror(ret));
}
+ R_UNLOCK(dbenv, dbmp->reginfo);
- /*
- * We don't want to pin down the entire buffer cache, otherwise we'll
- * starve threads needing new pages. Don't pin down more than 80% of
- * the cache, making sure that we don't screw up just because only a
- * few pages have been created.
- */
- maxpin = ((ndirty + nclean) * 8) / 10;
- if (maxpin < 10)
- maxpin = 10;
+ return (ret);
+}
+
+/*
+ * __memp_close_flush_files --
+ * Close files opened only to flush buffers.
+ */
+static int
+__memp_close_flush_files(dbenv, dbmp)
+ DB_ENV *dbenv;
+ DB_MPOOL *dbmp;
+{
+ DB_MPOOLFILE *dbmfp;
+ int ret;
/*
- * Get a good-sized block of memory to hold buffer pointers, we don't
- * want to run out, but correct if we want to allocate more than we
- * would be allowed to store, regardless.
+ * The routine exists because we must close files opened by sync to
+ * flush buffers. There are two cases: first, extent files have to
+ * be closed so they may be removed when empty. Second, regular
+ * files have to be closed so we don't run out of descriptors (for
+	 * example, an application partitioning its data into databases
+ * based on timestamps, so there's a continually increasing set of
+ * files).
+ *
+ * We mark files opened in the __memp_bhwrite() function with the
+ * MP_FLUSH flag. Here we walk through our file descriptor list,
+ * and, if a file was opened by __memp_bhwrite(), we close it.
*/
- ndirty += ndirty / 2 + 10;
- if (ndirty > maxpin)
- ndirty = maxpin;
- if ((ret =
- __os_malloc(dbenv, ndirty * sizeof(BH *), NULL, bharrayp)) != 0)
- return (ret);
-
- *ndirtyp = ndirty;
-
- R_LOCK(dbenv, dbmp->reginfo);
+retry: MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp);
+ for (dbmfp = TAILQ_FIRST(&dbmp->dbmfq);
+ dbmfp != NULL; dbmfp = TAILQ_NEXT(dbmfp, q))
+ if (F_ISSET(dbmfp, MP_FLUSH)) {
+ F_CLR(dbmfp, MP_FLUSH);
+ MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
+ if ((ret = __memp_fclose_int(dbmfp, 0)) != 0)
+ return (ret);
+ goto retry;
+ }
+ MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
return (0);
}
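
Closing a handle can unlink it from the list being walked, so the loop restarts from the head after each close instead of trusting a stale successor pointer. A self-contained toy of that unlock-close-rescan shape, with an int standing in for the MP_FLUSH flag and comments marking where the mutex operations would sit:

#include <stdio.h>
#include <stdlib.h>

struct node { struct node *next; int flush; };

int
main()
{
	struct node *head, *p;
	int i;

	for (head = NULL, i = 0; i < 5; ++i) {	/* build a toy list */
		if ((p = malloc(sizeof(*p))) == NULL)
			return (1);
		p->flush = i & 1;		/* flag two elements */
		p->next = head;
		head = p;
	}
retry:	/* MUTEX_THREAD_LOCK would be taken here */
	for (p = head; p != NULL; p = p->next)
		if (p->flush) {
			p->flush = 0;	/* don't pick it again */
			/* drop the mutex, close the handle (which may
			 * unlink p), then rescan from the head */
			printf("closed one flagged handle\n");
			goto retry;
		}
	/* MUTEX_THREAD_UNLOCK here */
	while (head != NULL) {			/* tear down the toy list */
		p = head->next;
		free(head);
		head = p;
	}
	return (0);
}
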
@@ -634,15 +603,15 @@ static int
__bhcmp(p1, p2)
const void *p1, *p2;
{
- BH *bhp1, *bhp2;
+ BH_TRACK *bhp1, *bhp2;
- bhp1 = *(BH * const *)p1;
- bhp2 = *(BH * const *)p2;
+ bhp1 = (BH_TRACK *)p1;
+ bhp2 = (BH_TRACK *)p2;
/* Sort by file (shared memory pool offset). */
- if (bhp1->mf_offset < bhp2->mf_offset)
+ if (bhp1->track_off < bhp2->track_off)
return (-1);
- if (bhp1->mf_offset > bhp2->mf_offset)
+ if (bhp1->track_off > bhp2->track_off)
return (1);
/*
@@ -650,9 +619,9 @@ __bhcmp(p1, p2)
* Defend against badly written quicksort code calling the comparison
* function with two identical pointers (e.g., WATCOM C++ (Power++)).
*/
- if (bhp1->pgno < bhp2->pgno)
+ if (bhp1->track_pgno < bhp2->track_pgno)
return (-1);
- if (bhp1->pgno > bhp2->pgno)
+ if (bhp1->track_pgno > bhp2->track_pgno)
return (1);
return (0);
}
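
A runnable illustration of the two-level order __bhcmp produces, file offset first and page number second, using a stand-in type for BH_TRACK:

#include <stdio.h>
#include <stdlib.h>

typedef struct {			/* stands in for BH_TRACK */
	unsigned long track_off, track_pgno;
} T;

static int
cmp(const void *p1, const void *p2)
{
	const T *a = p1, *b = p2;

	if (a->track_off != b->track_off)
		return (a->track_off < b->track_off ? -1 : 1);
	if (a->track_pgno != b->track_pgno)
		return (a->track_pgno < b->track_pgno ? -1 : 1);
	return (0);
}

int
main()
{
	T ar[] = { {2, 1}, {1, 9}, {1, 3}, {2, 0} };
	size_t i;

	qsort(ar, sizeof(ar) / sizeof(ar[0]), sizeof(ar[0]), cmp);
	for (i = 0; i < sizeof(ar) / sizeof(ar[0]); ++i)
		printf("%lu/%lu\n",		/* prints 1/3 1/9 2/0 2/1 */
		    ar[i].track_off, ar[i].track_pgno);
	return (0);
}
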
diff --git a/bdb/mp/mp_trickle.c b/bdb/mp/mp_trickle.c
index f937805cf40..71077ab60cc 100644
--- a/bdb/mp/mp_trickle.c
+++ b/bdb/mp/mp_trickle.c
@@ -1,13 +1,13 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Copyright (c) 1996-2002
* Sleepycat Software. All rights reserved.
*/
#include "db_config.h"
#ifndef lint
-static const char revid[] = "$Id: mp_trickle.c,v 11.12 2000/11/30 00:58:41 ubell Exp $";
+static const char revid[] = "$Id: mp_trickle.c,v 11.24 2002/08/06 06:13:53 bostic Exp $";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
@@ -16,42 +16,29 @@ static const char revid[] = "$Id: mp_trickle.c,v 11.12 2000/11/30 00:58:41 ubell
#include <stdlib.h>
#endif
-#ifdef HAVE_RPC
-#include "db_server.h"
-#endif
-
#include "db_int.h"
-#include "db_shash.h"
-#include "mp.h"
-
-#ifdef HAVE_RPC
-#include "gen_client_ext.h"
-#include "rpc_client_ext.h"
-#endif
-
-static int __memp_trick __P((DB_ENV *, int, int, int *));
+#include "dbinc/db_shash.h"
+#include "dbinc/mp.h"
/*
- * memp_trickle --
+ * __memp_trickle --
* Keep a specified percentage of the buffers clean.
+ *
+ * PUBLIC: int __memp_trickle __P((DB_ENV *, int, int *));
*/
int
-memp_trickle(dbenv, pct, nwrotep)
+__memp_trickle(dbenv, pct, nwrotep)
DB_ENV *dbenv;
int pct, *nwrotep;
{
DB_MPOOL *dbmp;
- MPOOL *mp;
- u_int32_t i;
- int ret;
-
-#ifdef HAVE_RPC
- if (F_ISSET(dbenv, DB_ENV_RPCCLIENT))
- return (__dbcl_memp_trickle(dbenv, pct, nwrotep));
-#endif
+ MPOOL *c_mp, *mp;
+ u_int32_t clean, dirty, i, total, dtmp;
+ int ret, wrote;
PANIC_CHECK(dbenv);
- ENV_REQUIRES_CONFIG(dbenv, dbenv->mp_handle, DB_INIT_MPOOL);
+ ENV_REQUIRES_CONFIG(dbenv,
+ dbenv->mp_handle, "memp_trickle", DB_INIT_MPOOL);
dbmp = dbenv->mp_handle;
mp = dbmp->reginfo[0].primary;
@@ -62,88 +49,35 @@ memp_trickle(dbenv, pct, nwrotep)
if (pct < 1 || pct > 100)
return (EINVAL);
- R_LOCK(dbenv, dbmp->reginfo);
-
- /* Loop through the caches... */
- for (ret = 0, i = 0; i < mp->nreg; ++i)
- if ((ret = __memp_trick(dbenv, i, pct, nwrotep)) != 0)
- break;
-
- R_UNLOCK(dbenv, dbmp->reginfo);
- return (ret);
-}
-
-/*
- * __memp_trick --
- * Trickle a single cache.
- */
-static int
-__memp_trick(dbenv, ncache, pct, nwrotep)
- DB_ENV *dbenv;
- int ncache, pct, *nwrotep;
-{
- BH *bhp;
- DB_MPOOL *dbmp;
- MPOOL *c_mp;
- MPOOLFILE *mfp;
- db_pgno_t pgno;
- u_long total;
- int ret, wrote;
-
- dbmp = dbenv->mp_handle;
- c_mp = dbmp->reginfo[ncache].primary;
-
/*
- * If there are sufficient clean buffers, or no buffers or no dirty
+	 * If there are sufficient clean buffers, no buffers, or no dirty
* buffers, we're done.
*
* XXX
- * Using st_page_clean and st_page_dirty is our only choice at the
- * moment, but it's not as correct as we might like in the presence
- * of pools with more than one buffer size, as a free 512-byte buffer
- * isn't the same as a free 8K buffer.
+ * Using hash_page_dirty is our only choice at the moment, but it's not
+ * as correct as we might like in the presence of pools having more
+ * than one page size, as a free 512B buffer isn't the same as a free
+ * 8KB buffer.
+ *
+ * Loop through the caches counting total/dirty buffers.
*/
-loop: total = c_mp->stat.st_page_clean + c_mp->stat.st_page_dirty;
- if (total == 0 || c_mp->stat.st_page_dirty == 0 ||
- (c_mp->stat.st_page_clean * 100) / total >= (u_long)pct)
- return (0);
-
- /* Loop until we write a buffer. */
- for (bhp = SH_TAILQ_FIRST(&c_mp->bhq, __bh);
- bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh)) {
- if (bhp->ref != 0 ||
- !F_ISSET(bhp, BH_DIRTY) || F_ISSET(bhp, BH_LOCKED))
- continue;
-
- mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
-
- /*
- * We can't write to temporary files -- see the comment in
- * mp_bh.c:__memp_bhwrite().
- */
- if (F_ISSET(mfp, MP_TEMP))
- continue;
+ for (ret = 0, i = dirty = total = 0; i < mp->nreg; ++i) {
+ c_mp = dbmp->reginfo[i].primary;
+ total += c_mp->stat.st_pages;
+ __memp_stat_hash(&dbmp->reginfo[i], c_mp, &dtmp);
+ dirty += dtmp;
+ }
- pgno = bhp->pgno;
- if ((ret = __memp_bhwrite(dbmp, mfp, bhp, NULL, &wrote)) != 0)
- return (ret);
+ clean = total - dirty;
+ if (clean == total || (clean * 100) / total >= (u_long)pct)
+ return (0);
- /*
- * Any process syncing the shared memory buffer pool had better
- * be able to write to any underlying file. Be understanding,
- * but firm, on this point.
- */
- if (!wrote) {
- __db_err(dbenv, "%s: unable to flush page: %lu",
- __memp_fns(dbmp, mfp), (u_long)pgno);
- return (EPERM);
- }
+ if (nwrotep == NULL)
+ nwrotep = &wrote;
+ ret = __memp_sync_int(dbenv, NULL,
+ ((total * pct) / 100) - clean, DB_SYNC_TRICKLE, nwrotep);
- ++c_mp->stat.st_page_trickle;
- if (nwrotep != NULL)
- ++*nwrotep;
- goto loop;
- }
+ mp->stat.st_page_trickle += *nwrotep;
- return (0);
+ return (ret);
}
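
The trickle target is plain integer arithmetic. A worked example with made-up numbers, mirroring the clean-percentage test and the count passed as ar_max to __memp_sync_int above:

#include <stdio.h>

int
main()
{
	unsigned long total = 1000, dirty = 970, pct = 20;
	unsigned long clean = total - dirty;		/* 30 */

	/* (30 * 100) / 1000 = 3%, below the 20% target, so ask
	 * __memp_sync_int for (1000 * 20) / 100 - 30 buffers. */
	if (clean != total && (clean * 100) / total < pct)
		printf("trickle %lu buffers\n",
		    (total * pct) / 100 - clean);	/* prints 170 */
	return (0);
}
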