summaryrefslogtreecommitdiff
path: root/bdb/mp
diff options
context:
space:
mode:
Diffstat (limited to 'bdb/mp')
-rw-r--r--bdb/mp/Design52
-rw-r--r--bdb/mp/mp_alloc.c152
-rw-r--r--bdb/mp/mp_bh.c662
-rw-r--r--bdb/mp/mp_fget.c417
-rw-r--r--bdb/mp/mp_fopen.c756
-rw-r--r--bdb/mp/mp_fput.c186
-rw-r--r--bdb/mp/mp_fset.c98
-rw-r--r--bdb/mp/mp_method.c115
-rw-r--r--bdb/mp/mp_region.c357
-rw-r--r--bdb/mp/mp_register.c85
-rw-r--r--bdb/mp/mp_stat.c388
-rw-r--r--bdb/mp/mp_sync.c658
-rw-r--r--bdb/mp/mp_trickle.c149
13 files changed, 4075 insertions, 0 deletions
diff --git a/bdb/mp/Design b/bdb/mp/Design
new file mode 100644
index 00000000000..1b26aae6cba
--- /dev/null
+++ b/bdb/mp/Design
@@ -0,0 +1,52 @@
+$Id: Design,v 11.2 1999/11/21 23:08:27 bostic Exp $
+
+There are three ways we do locking in the mpool code:
+
+Locking a handle mutex to provide concurrency for DB_THREAD operations.
+Locking the region mutex to provide mutual exclusion while reading and
+ writing structures in the shared region.
+Locking buffer header mutexes during I/O.
+
+The first will not be further described here. We use the shared mpool
+region lock to provide mutual exclusion while reading/modifying all of
+the data structures, including the buffer headers. We use a per-buffer
+header lock to wait on buffer I/O. The order of locking is as follows:
+
+Searching for a buffer:
+ Acquire the region lock.
+ Find the buffer header.
+ Increment the reference count (guarantee the buffer stays).
+ While the BH_LOCKED flag is set (I/O is going on) {
+ Release the region lock.
+ Explicitly yield the processor if it's not the first pass
+ through this loop; otherwise, we may simply spin, using up our
+ CPU quantum while switching between the two locks.
+ Request the buffer lock.
+ The I/O will complete...
+ Acquire the buffer lock.
+ Release the buffer lock.
+ Acquire the region lock.
+ }
+ Return the buffer.
+
+Reading/writing a buffer:
+ Acquire the region lock.
+ Find/create the buffer header.
+ If reading, increment the reference count (guarantee the buffer stays).
+ Set the BH_LOCKED flag.
+ Acquire the buffer lock (guaranteed not to block).
+ Release the region lock.
+ Do the I/O and/or initialize the buffer contents.
+ Release the buffer lock.
+ At this point, the buffer lock is available, but the logical
+ operation (flagged by BH_LOCKED) is not yet completed. For
+ this reason, among others, threads checking the BH_LOCKED flag
+ must loop around their test.
+ Acquire the region lock.
+ Clear the BH_LOCKED flag.
+ Release the region lock.
+ Return/discard the buffer.
+
+Pointers to DB_MPOOL, MPOOL, DB_MPOOLFILE and MPOOLFILE structures are
+not reacquired when a region lock is reacquired because they couldn't
+have been closed/discarded and because they never move in memory.
diff --git a/bdb/mp/mp_alloc.c b/bdb/mp/mp_alloc.c
new file mode 100644
index 00000000000..731f569f57f
--- /dev/null
+++ b/bdb/mp/mp_alloc.c
@@ -0,0 +1,152 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Sleepycat Software. All rights reserved.
+ */
+#include "db_config.h"
+
+#ifndef lint
+static const char revid[] = "$Id: mp_alloc.c,v 11.7 2000/04/20 21:14:18 bostic Exp $";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+#endif
+
+#include "db_int.h"
+#include "db_shash.h"
+#include "mp.h"
+
+/*
+ * __memp_alloc --
+ *	Allocate some space from a cache region.
+ *
+ *	A plain shared-memory allocation is tried first.  If that fails, the
+ *	cache's LRU queue is walked, evicting unpinned, unlocked buffers
+ *	(dirty ones are written first) until either a buffer with the same
+ *	page size can be handed back directly, or enough space has been
+ *	freed to make retrying the allocation worthwhile.
+ *
+ *	dbmp	per-process mpool handle.
+ *	memreg	cache region to allocate from.
+ *	mfp	if non-NULL, a page buffer for this file is being allocated:
+ *		len is ignored and recomputed from the file's page size.
+ *	len	number of bytes wanted (only used when mfp is NULL).
+ *	offsetp	if non-NULL, set to the region offset of the allocation.
+ *	retp	really a "void **"; set to the address of the allocation.
+ *
+ *	Returns 0 on success, otherwise the error returned by __db_shalloc
+ *	or __memp_bhwrite.
+ *
+ * PUBLIC: int __memp_alloc __P((DB_MPOOL *,
+ * PUBLIC:     REGINFO *, MPOOLFILE *, size_t, roff_t *, void *));
+ */
+int
+__memp_alloc(dbmp, memreg, mfp, len, offsetp, retp)
+	DB_MPOOL *dbmp;
+	REGINFO *memreg;
+	MPOOLFILE *mfp;
+	size_t len;
+	roff_t *offsetp;
+	void *retp;
+{
+	BH *bhp, *nbhp;
+	MPOOL *c_mp;
+	MPOOLFILE *bh_mfp;
+	size_t total;
+	int nomore, restart, ret, wrote;
+	void *p;
+
+	c_mp = memreg->primary;
+
+	/*
+	 * If we're allocating a buffer, and the one we're discarding is the
+	 * same size, we don't want to waste the time to re-integrate it into
+	 * the shared memory free list.  If the DB_MPOOLFILE argument isn't
+	 * NULL, we'll compare the underlying page sizes of the two buffers
+	 * before free-ing and re-allocating buffers.
+	 */
+	if (mfp != NULL)
+		len = (sizeof(BH) - sizeof(u_int8_t)) + mfp->stat.st_pagesize;
+
+	/* nomore is set once the whole LRU list has been walked in vain. */
+	nomore = 0;
+alloc:	if ((ret = __db_shalloc(memreg->addr, len, MUTEX_ALIGN, &p)) == 0) {
+		if (offsetp != NULL)
+			*offsetp = R_OFFSET(memreg, p);
+		*(void **)retp = p;
+		return (0);
+	}
+	if (nomore) {
+		__db_err(dbmp->dbenv,
+	    "Unable to allocate %lu bytes from mpool shared region: %s\n",
+		    (u_long)len, db_strerror(ret));
+		return (ret);
+	}
+
+retry:	/* Find a buffer we can flush; pure LRU. */
+	restart = total = 0;
+	for (bhp =
+	    SH_TAILQ_FIRST(&c_mp->bhq, __bh); bhp != NULL; bhp = nbhp) {
+		/* Fetch the successor now: bhp may be freed below. */
+		nbhp = SH_TAILQ_NEXT(bhp, q, __bh);
+
+		/* Ignore pinned or locked (I/O in progress) buffers. */
+		if (bhp->ref != 0 || F_ISSET(bhp, BH_LOCKED))
+			continue;
+
+		/* Find the associated MPOOLFILE. */
+		bh_mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
+
+		/* Write the page if it's dirty. */
+		if (F_ISSET(bhp, BH_DIRTY)) {
+			/*
+			 * Pin the buffer across the write.  The pin must be
+			 * dropped on failure as well as on success, otherwise
+			 * a failed write leaves the buffer permanently
+			 * referenced and it can never be evicted.
+			 */
+			++bhp->ref;
+			ret = __memp_bhwrite(dbmp,
+			    bh_mfp, bhp, &restart, &wrote);
+			--bhp->ref;
+			if (ret != 0)
+				return (ret);
+
+			/*
+			 * Another process may have acquired this buffer and
+			 * incremented the ref count after we wrote it.
+			 */
+			if (bhp->ref != 0)
+				goto retry;
+
+			/*
+			 * If we wrote the page, continue and free the buffer.
+			 * We don't have to rewalk the list to acquire the
+			 * buffer because it was never available for any other
+			 * process to modify it.
+			 *
+			 * If we didn't write the page, but we discarded and
+			 * reacquired the region lock, restart the list walk.
+			 *
+			 * If we neither wrote the buffer nor discarded the
+			 * region lock, continue down the buffer list.
+			 */
+			if (wrote)
+				++c_mp->stat.st_rw_evict;
+			else {
+				if (restart)
+					goto retry;
+				continue;
+			}
+		} else
+			++c_mp->stat.st_ro_evict;
+
+		/*
+		 * Check to see if the buffer is the size we're looking for.
+		 * If it is, simply reuse it.
+		 */
+		if (mfp != NULL &&
+		    mfp->stat.st_pagesize == bh_mfp->stat.st_pagesize) {
+			/* Unlink the buffer but keep its memory. */
+			__memp_bhfree(dbmp, bhp, 0);
+
+			if (offsetp != NULL)
+				*offsetp = R_OFFSET(memreg, bhp);
+			*(void **)retp = bhp;
+			return (0);
+		}
+
+		/* Note how much space we've freed, and free the buffer. */
+		total += __db_shsizeof(bhp);
+		__memp_bhfree(dbmp, bhp, 1);
+
+		/*
+		 * Retry as soon as we've freed up sufficient space.  If we
+		 * have to coalesce memory to satisfy the request, don't
+		 * try until it's likely (possible?) that we'll succeed.
+		 */
+		if (total >= 3 * len)
+			goto alloc;
+
+		/* Restart the walk if we discarded the region lock. */
+		if (restart)
+			goto retry;
+	}
+
+	/* Nothing left to evict: make one last attempt, then give up. */
+	nomore = 1;
+	goto alloc;
+}
diff --git a/bdb/mp/mp_bh.c b/bdb/mp/mp_bh.c
new file mode 100644
index 00000000000..e802b165b2d
--- /dev/null
+++ b/bdb/mp/mp_bh.c
@@ -0,0 +1,662 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Sleepycat Software. All rights reserved.
+ */
+#include "db_config.h"
+
+#ifndef lint
+static const char revid[] = "$Id: mp_bh.c,v 11.25 2001/01/10 04:50:53 ubell Exp $";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <string.h>
+#include <unistd.h>
+#endif
+
+#include "db_int.h"
+#include "db_shash.h"
+#include "mp.h"
+#include "log.h"
+#include "db_page.h"
+
+static int __memp_upgrade __P((DB_MPOOL *, DB_MPOOLFILE *, MPOOLFILE *));
+
+/*
+ * __memp_bhwrite --
+ * Write the page associated with a given bucket header.
+ *
+ * Locates (or opens) a per-process DB_MPOOLFILE handle for the file that
+ * owns the buffer and hands the buffer to __memp_pgwrite.
+ *
+ * dbmp per-process mpool handle.
+ * mfp shared-region file structure that owns the buffer.
+ * bhp buffer header of the page to write.
+ * restartp if non-NULL, cleared here and then set by __memp_pgwrite.
+ * wrotep if non-NULL, cleared here and then set by __memp_pgwrite.
+ *
+ * Returns 0, with *wrotep left at 0, when this process has no way to
+ * write the file (read-only handle that can't be upgraded, a temporary
+ * file we have no handle for, no registered pgin/pgout functions for the
+ * file type, or a failed open). Otherwise returns whatever
+ * __memp_pgwrite returns.
+ *
+ * PUBLIC: int __memp_bhwrite
+ * PUBLIC: __P((DB_MPOOL *, MPOOLFILE *, BH *, int *, int *));
+ */
+int
+__memp_bhwrite(dbmp, mfp, bhp, restartp, wrotep)
+ DB_MPOOL *dbmp;
+ MPOOLFILE *mfp;
+ BH *bhp;
+ int *restartp, *wrotep;
+{
+ DB_MPOOLFILE *dbmfp;
+ DB_MPREG *mpreg;
+ int incremented, ret;
+
+ if (restartp != NULL)
+ *restartp = 0;
+ if (wrotep != NULL)
+ *wrotep = 0;
+ incremented = 0;
+
+ /*
+ * If the file has been removed or is a closed temporary file, jump
+ * right ahead and pretend that we've found the file we want-- the
+ * page-write function knows how to handle the fact that we don't have
+ * (or need!) any real file descriptor information.
+ */
+ if (F_ISSET(mfp, MP_DEADFILE)) {
+ dbmfp = NULL;
+ goto found;
+ }
+
+ /*
+ * Walk the process' DB_MPOOLFILE list and find a file descriptor for
+ * the file. We also check that the descriptor is open for writing.
+ * If we find a descriptor on the file that's not open for writing, we
+ * try and upgrade it to make it writeable. If that fails, we're done.
+ */
+ MUTEX_THREAD_LOCK(dbmp->dbenv, dbmp->mutexp);
+ for (dbmfp = TAILQ_FIRST(&dbmp->dbmfq);
+ dbmfp != NULL; dbmfp = TAILQ_NEXT(dbmfp, q))
+ if (dbmfp->mfp == mfp) {
+ /* A non-zero return from __memp_upgrade means failure. */
+ if (F_ISSET(dbmfp, MP_READONLY) &&
+ __memp_upgrade(dbmp, dbmfp, mfp)) {
+ MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp);
+ return (0);
+ }
+
+ /*
+ * Increment the reference count -- see the comment in
+ * memp_fclose().
+ */
+ ++dbmfp->ref;
+ incremented = 1;
+ break;
+ }
+ MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp);
+ /* dbmfp is NULL here only if the loop ran off the end of the list. */
+ if (dbmfp != NULL)
+ goto found;
+
+ /*
+ * !!!
+ * Don't try to attach to temporary files. There are two problems in
+ * trying to do that. First, if we have different privileges than the
+ * process that "owns" the temporary file, we might create the backing
+ * disk file such that the owning process couldn't read/write its own
+ * buffers, e.g., memp_trickle() running as root creating a file owned
+ * as root, mode 600. Second, if the temporary file has already been
+ * created, we don't have any way of finding out what its real name is,
+ * and, even if we did, it was already unlinked (so that it won't be
+ * left if the process dies horribly). This decision causes a problem,
+ * however: if the temporary file consumes the entire buffer cache,
+ * and the owner doesn't flush the buffers to disk, we could end up
+ * with resource starvation, and the memp_trickle() thread couldn't do
+ * anything about it. That's a pretty unlikely scenario, though.
+ *
+ * Note that we should never get here when the temporary file
+ * in question has already been closed in another process, in which
+ * case it should be marked MP_DEADFILE.
+ */
+ if (F_ISSET(mfp, MP_TEMP)) {
+ DB_ASSERT(!F_ISSET(mfp, MP_DEADFILE));
+ return (0);
+ }
+
+ /*
+ * It's not a page from a file we've opened. If the file requires
+ * input/output processing, see if this process has ever registered
+ * information as to how to write this type of file. If not, there's
+ * nothing we can do.
+ */
+ if (mfp->ftype != 0) {
+ MUTEX_THREAD_LOCK(dbmp->dbenv, dbmp->mutexp);
+ for (mpreg = LIST_FIRST(&dbmp->dbregq);
+ mpreg != NULL; mpreg = LIST_NEXT(mpreg, q))
+ if (mpreg->ftype == mfp->ftype)
+ break;
+ MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp);
+ if (mpreg == NULL)
+ return (0);
+ }
+
+ /*
+ * Try and open the file, attaching to the underlying shared area.
+ * Ignore any error, assume it's a permissions problem.
+ *
+ * XXX
+ * There's no negative cache, so we may repeatedly try and open files
+ * that we have previously tried (and failed) to open.
+ */
+ if (__memp_fopen(dbmp, mfp, R_ADDR(dbmp->reginfo, mfp->path_off),
+ 0, 0, mfp->stat.st_pagesize, 0, NULL, &dbmfp) != 0)
+ return (0);
+
+ /* dbmfp may be NULL here (dead file); __memp_pgwrite copes with that. */
+found: ret = __memp_pgwrite(dbmp, dbmfp, bhp, restartp, wrotep);
+
+ /* Drop the handle reference taken in the list walk above, if any. */
+ if (incremented) {
+ MUTEX_THREAD_LOCK(dbmp->dbenv, dbmp->mutexp);
+ --dbmfp->ref;
+ MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp);
+ }
+
+ return (ret);
+}
+
+/*
+ * __memp_pgread --
+ * Read a page from a file.
+ *
+ * Marks the buffer BH_LOCKED | BH_TRASH, takes the buffer mutex, and
+ * releases the region lock for the duration of the I/O; the region lock
+ * is reacquired before returning.
+ *
+ * dbmfp per-process handle for the file being read.
+ * bhp buffer header to fill; bhp->pgno selects the page.
+ * can_create if non-zero, a missing or short page is not an error: the
+ * buffer is cleared and counted as a created page.
+ *
+ * Returns 0 on success (BH_TRASH is cleared), otherwise an error; EIO is
+ * returned for a short read that reported no error of its own.
+ *
+ * PUBLIC: int __memp_pgread __P((DB_MPOOLFILE *, BH *, int));
+ */
+int
+__memp_pgread(dbmfp, bhp, can_create)
+ DB_MPOOLFILE *dbmfp;
+ BH *bhp;
+ int can_create;
+{
+ DB_IO db_io;
+ DB_ENV *dbenv;
+ DB_MPOOL *dbmp;
+ MPOOLFILE *mfp;
+ size_t len, pagesize;
+ size_t nr;
+ int created, ret;
+
+ dbmp = dbmfp->dbmp;
+ dbenv = dbmp->dbenv;
+ mfp = dbmfp->mfp;
+ pagesize = mfp->stat.st_pagesize;
+
+ /*
+ * Flag the I/O and take the buffer mutex before dropping the region
+ * lock, so other threads finding this buffer wait on the mutex.
+ */
+ F_SET(bhp, BH_LOCKED | BH_TRASH);
+ MUTEX_LOCK(dbenv, &bhp->mutex, dbenv->lockfhp);
+ R_UNLOCK(dbenv, dbmp->reginfo);
+
+ /*
+ * Temporary files may not yet have been created. We don't create
+ * them now, we create them when the pages have to be flushed.
+ */
+ nr = 0;
+ if (F_ISSET(&dbmfp->fh, DB_FH_VALID)) {
+ /*
+ * Ignore read errors if we have permission to create the page.
+ * Assume that the page doesn't exist, and that we'll create it
+ * when we write it out.
+ *
+ * XXX
+ * Theoretically, we could overwrite a page of data if it were
+ * possible for a file to be successfully opened for reading
+ * and then for the read to fail. Shouldn't ever happen, but
+ * it might be worth checking to see if the offset is past the
+ * known end-of-file.
+ */
+ db_io.fhp = &dbmfp->fh;
+ db_io.mutexp = dbmfp->mutexp;
+ db_io.pagesize = db_io.bytes = pagesize;
+ db_io.pgno = bhp->pgno;
+ db_io.buf = bhp->buf;
+
+ ret = __os_io(dbenv, &db_io, DB_IO_READ, &nr);
+ } else
+ ret = 0;
+
+ /* nr is still 0 if there was no valid file handle to read from. */
+ created = 0;
+ if (nr < pagesize) {
+ if (can_create)
+ created = 1;
+ else {
+ /*
+ * If we had a short read, ret may be 0. This may not
+ * be an error -- in particular DB recovery processing
+ * may request pages that have never been written to
+ * disk, in which case we won't find the page. So, the
+ * caller must know how to handle the error.
+ */
+ if (ret == 0)
+ ret = EIO;
+ goto err;
+ }
+ }
+
+ /*
+ * Clear any bytes we didn't read that need to be cleared. If we're
+ * running in diagnostic mode, smash any bytes on the page that are
+ * unknown quantities for the caller.
+ */
+ if (nr != pagesize) {
+ /* A clear_len of 0 means the whole page must be cleared. */
+ len = mfp->clear_len == 0 ? pagesize : mfp->clear_len;
+ if (nr < len)
+ memset(bhp->buf + nr, 0, len - nr);
+#ifdef DIAGNOSTIC
+ if (nr > len)
+ len = nr;
+ if (len < pagesize)
+ memset(bhp->buf + len, CLEAR_BYTE, pagesize - len);
+#endif
+ }
+
+ /*
+ * Call any pgin function. This assignment also discards any read
+ * error that was deliberately ignored above because can_create was
+ * set.
+ */
+ ret = mfp->ftype == 0 ? 0 : __memp_pg(dbmfp, bhp, 1);
+
+ /* Unlock the buffer and reacquire the region lock. */
+err: MUTEX_UNLOCK(dbenv, &bhp->mutex);
+ R_LOCK(dbenv, dbmp->reginfo);
+
+ /*
+ * If no errors occurred, the data is now valid, clear the BH_TRASH
+ * flag; regardless, clear the lock bit and let other threads proceed.
+ */
+ F_CLR(bhp, BH_LOCKED);
+ if (ret == 0) {
+ F_CLR(bhp, BH_TRASH);
+
+ /* Update the statistics. */
+ if (created)
+ ++mfp->stat.st_page_create;
+ else
+ ++mfp->stat.st_page_in;
+ }
+
+ return (ret);
+}
+
+/*
+ * __memp_pgwrite --
+ * Write a page to a file.
+ *
+ * dbmp per-process mpool handle.
+ * dbmfp per-process handle for the file, or NULL when the underlying
+ * file is gone; in that case no I/O is done and the buffer is
+ * simply marked clean.
+ * bhp buffer header of the page to write.
+ * restartp if non-NULL, set to 1 once the region lock has been released
+ * (and so any list walk by the caller must be restarted),
+ * otherwise left at 0.
+ * wrotep if non-NULL, set to 1 if the buffer is clean on return
+ * (written here, found already clean, or belonging to a dead
+ * file), otherwise left at 0.
+ *
+ * The region lock is released around the I/O and reacquired before
+ * returning. Returns 0 on success, otherwise an error; on error the
+ * buffer remains dirty.
+ *
+ * PUBLIC: int __memp_pgwrite
+ * PUBLIC: __P((DB_MPOOL *, DB_MPOOLFILE *, BH *, int *, int *));
+ */
+int
+__memp_pgwrite(dbmp, dbmfp, bhp, restartp, wrotep)
+ DB_MPOOL *dbmp;
+ DB_MPOOLFILE *dbmfp;
+ BH *bhp;
+ int *restartp, *wrotep;
+{
+ DB_ENV *dbenv;
+ DB_IO db_io;
+ DB_LSN lsn;
+ MPOOL *c_mp, *mp;
+ MPOOLFILE *mfp;
+ size_t nw;
+ int callpgin, dosync, ret, syncfail;
+ const char *fail;
+
+ dbenv = dbmp->dbenv;
+ mp = dbmp->reginfo[0].primary;
+ mfp = dbmfp == NULL ? NULL : dbmfp->mfp;
+
+ if (restartp != NULL)
+ *restartp = 0;
+ if (wrotep != NULL)
+ *wrotep = 0;
+ callpgin = 0;
+
+ /*
+ * Check the dirty bit -- this buffer may have been written since we
+ * decided to write it.
+ */
+ if (!F_ISSET(bhp, BH_DIRTY)) {
+ if (wrotep != NULL)
+ *wrotep = 1;
+ return (0);
+ }
+
+ MUTEX_LOCK(dbenv, &bhp->mutex, dbenv->lockfhp);
+
+ /*
+ * If there were two writers, we may have just been waiting while the
+ * other writer completed I/O on this buffer. Check the dirty bit one
+ * more time.
+ */
+ if (!F_ISSET(bhp, BH_DIRTY)) {
+ MUTEX_UNLOCK(dbenv, &bhp->mutex);
+
+ if (wrotep != NULL)
+ *wrotep = 1;
+ return (0);
+ }
+
+ F_SET(bhp, BH_LOCKED);
+ R_UNLOCK(dbenv, dbmp->reginfo);
+
+ /* The region lock has now been dropped: tell the caller. */
+ if (restartp != NULL)
+ *restartp = 1;
+
+ /*
+ * It's possible that the underlying file doesn't exist, either
+ * because of an outright removal or because it was a temporary
+ * file that's been closed.
+ *
+ * !!!
+ * Once we pass this point, we know that dbmfp and mfp aren't NULL,
+ * and that we have a valid file reference.
+ */
+ if (mfp == NULL || F_ISSET(mfp, MP_DEADFILE))
+ goto file_dead;
+
+ /*
+ * Ensure the appropriate log records are on disk. If the page is
+ * being written as part of a sync operation, the flush has already
+ * been done, unless it was written by the application *after* the
+ * sync was scheduled.
+ */
+ if (LOGGING_ON(dbenv) &&
+ (!F_ISSET(bhp, BH_SYNC) || F_ISSET(bhp, BH_SYNC_LOGFLSH))) {
+ /* The page's LSN is stored lsn_off bytes into the page. */
+ memcpy(&lsn, bhp->buf + mfp->lsn_off, sizeof(DB_LSN));
+ if ((ret = log_flush(dbenv, &lsn)) != 0)
+ goto err;
+ }
+ DB_ASSERT(!LOGGING_ON(dbenv) ||
+ log_compare(&((LOG *)((DB_LOG *)
+ dbenv->lg_handle)->reginfo.primary)->s_lsn, &LSN(bhp->buf)) > 0);
+
+ /*
+ * Call any pgout function. We set the callpgin flag so that we flag
+ * that the contents of the buffer will need to be passed through pgin
+ * before they are reused.
+ */
+ if (mfp->ftype == 0)
+ ret = 0;
+ else {
+ callpgin = 1;
+ if ((ret = __memp_pg(dbmfp, bhp, 0)) != 0)
+ goto err;
+ }
+
+ /* Temporary files may not yet have been created. */
+ if (!F_ISSET(&dbmfp->fh, DB_FH_VALID)) {
+ /*
+ * Re-test under the handle mutex so only one thread creates
+ * the backing file.
+ *
+ * NOTE(review): if __db_appname returns 0 but leaves the handle
+ * invalid, we jump to err with ret == 0 and so return success
+ * without having written the page -- confirm this is intended.
+ */
+ MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp);
+ if (!F_ISSET(&dbmfp->fh, DB_FH_VALID) &&
+ ((ret = __db_appname(dbenv, DB_APP_TMP, NULL, NULL,
+ DB_OSO_CREATE | DB_OSO_EXCL | DB_OSO_TEMP,
+ &dbmfp->fh, NULL)) != 0 ||
+ !F_ISSET(&dbmfp->fh, DB_FH_VALID))) {
+ MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
+ __db_err(dbenv,
+ "unable to create temporary backing file");
+ goto err;
+ }
+ MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
+ }
+
+ /* Write the page. */
+ db_io.fhp = &dbmfp->fh;
+ db_io.mutexp = dbmfp->mutexp;
+ db_io.pagesize = db_io.bytes = mfp->stat.st_pagesize;
+ db_io.pgno = bhp->pgno;
+ db_io.buf = bhp->buf;
+ if ((ret = __os_io(dbenv, &db_io, DB_IO_WRITE, &nw)) != 0) {
+ ret = __db_panic(dbenv, ret);
+ fail = "write";
+ goto syserr;
+ }
+ /* A short write is reported as an I/O error. */
+ if (nw != mfp->stat.st_pagesize) {
+ ret = EIO;
+ fail = "write";
+ goto syserr;
+ }
+
+file_dead:
+ /*
+ * !!!
+ * Once we pass this point, dbmfp and mfp may be NULL, we may not have
+ * a valid file reference.
+ *
+ * Unlock the buffer and reacquire the region lock.
+ */
+ MUTEX_UNLOCK(dbenv, &bhp->mutex);
+ R_LOCK(dbenv, dbmp->reginfo);
+
+ /*
+ * Clean up the flags based on a successful write.
+ *
+ * If we rewrote the page, it will need processing by the pgin
+ * routine before reuse.
+ */
+ if (callpgin)
+ F_SET(bhp, BH_CALLPGIN);
+ F_CLR(bhp, BH_DIRTY | BH_LOCKED);
+
+ /*
+ * If we write a buffer for which a checkpoint is waiting, update
+ * the count of pending buffers (both in the mpool as a whole and
+ * for this file). If the count for this file goes to zero, set a
+ * flag so we flush the writes.
+ */
+ dosync = 0;
+ if (F_ISSET(bhp, BH_SYNC)) {
+ F_CLR(bhp, BH_SYNC | BH_SYNC_LOGFLSH);
+
+ --mp->lsn_cnt;
+ if (mfp != NULL)
+ dosync = --mfp->lsn_cnt == 0 ? 1 : 0;
+ }
+
+ /* Update the page clean/dirty statistics. */
+ c_mp = BH_TO_CACHE(dbmp, bhp);
+ ++c_mp->stat.st_page_clean;
+ --c_mp->stat.st_page_dirty;
+
+ /* Update I/O statistics. */
+ if (mfp != NULL)
+ ++mfp->stat.st_page_out;
+
+ /*
+ * Do the sync after everything else has been updated, so any incoming
+ * checkpoint doesn't see inconsistent information.
+ *
+ * XXX:
+ * Don't lock the region around the sync, fsync(2) has no atomicity
+ * issues.
+ *
+ * XXX:
+ * We ignore errors from the sync -- it makes no sense to return an
+ * error to the calling process, so set a flag causing the checkpoint
+ * to be retried later. There is a possibility, of course, that a
+ * subsequent checkpoint was started and that we're going to force it
+ * to fail. That should be unlikely, and fixing it would be difficult.
+ */
+ /* dosync is only ever set when mfp, and therefore dbmfp, is non-NULL. */
+ if (dosync) {
+ R_UNLOCK(dbenv, dbmp->reginfo);
+ syncfail = __os_fsync(dbenv, &dbmfp->fh) != 0;
+ R_LOCK(dbenv, dbmp->reginfo);
+ if (syncfail)
+ F_SET(mp, MP_LSN_RETRY);
+ }
+
+ if (wrotep != NULL)
+ *wrotep = 1;
+
+ return (0);
+
+syserr: __db_err(dbenv, "%s: %s failed for page %lu",
+ __memp_fn(dbmfp), fail, (u_long)bhp->pgno);
+
+err: /* Unlock the buffer and reacquire the region lock. */
+ MUTEX_UNLOCK(dbenv, &bhp->mutex);
+ R_LOCK(dbenv, dbmp->reginfo);
+
+ /*
+ * Clean up the flags based on a failure.
+ *
+ * The page remains dirty but we remove our lock. If we rewrote the
+ * page, it will need processing by the pgin routine before reuse.
+ */
+ if (callpgin)
+ F_SET(bhp, BH_CALLPGIN);
+ F_CLR(bhp, BH_LOCKED);
+
+ return (ret);
+}
+
+/*
+ * __memp_pg --
+ *	Call the pgin/pgout routine.
+ *
+ *	Looks up the conversion functions registered in this process for the
+ *	file's type and applies the page-in or page-out function to the
+ *	buffer.  If none are registered for the type, or the relevant
+ *	function pointer is NULL, nothing is done.
+ *
+ *	dbmfp	 per-process handle for the file that owns the page.
+ *	bhp	 buffer header of the page to convert.
+ *	is_pgin	 non-zero to call the pgin function, zero to call pgout.
+ *
+ *	Returns 0 on success, otherwise the non-zero value returned by the
+ *	pgin/pgout function.
+ *
+ * PUBLIC: int __memp_pg __P((DB_MPOOLFILE *, BH *, int));
+ */
+int
+__memp_pg(dbmfp, bhp, is_pgin)
+	DB_MPOOLFILE *dbmfp;
+	BH *bhp;
+	int is_pgin;
+{
+	DBT dbt, *dbtp;
+	DB_MPOOL *dbmp;
+	DB_MPREG *mpreg;
+	MPOOLFILE *mfp;
+	int ftype, ret;
+
+	dbmp = dbmfp->dbmp;
+	mfp = dbmfp->mfp;
+
+	/* The registration list is protected by the handle mutex. */
+	MUTEX_THREAD_LOCK(dbmp->dbenv, dbmp->mutexp);
+
+	ftype = mfp->ftype;
+	for (mpreg = LIST_FIRST(&dbmp->dbregq);
+	    mpreg != NULL; mpreg = LIST_NEXT(mpreg, q)) {
+		if (ftype != mpreg->ftype)
+			continue;
+
+		/* Pass the file's cookie, if it has one, to the function. */
+		if (mfp->pgcookie_len == 0)
+			dbtp = NULL;
+		else {
+			dbt.size = mfp->pgcookie_len;
+			dbt.data = R_ADDR(dbmp->reginfo, mfp->pgcookie_off);
+			dbtp = &dbt;
+		}
+
+		/* Don't hold the handle mutex across the callback. */
+		MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp);
+
+		if (is_pgin) {
+			if (mpreg->pgin != NULL &&
+			    (ret = mpreg->pgin(dbmp->dbenv,
+			    bhp->pgno, bhp->buf, dbtp)) != 0)
+				goto err;
+		} else
+			if (mpreg->pgout != NULL &&
+			    (ret = mpreg->pgout(dbmp->dbenv,
+			    bhp->pgno, bhp->buf, dbtp)) != 0)
+				goto err;
+		break;
+	}
+
+	/* If no entry matched, the mutex is still held: release it. */
+	if (mpreg == NULL)
+		MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp);
+
+	return (0);
+
+err:	/*
+	 * The handle mutex was released before the pgin/pgout function was
+	 * called, so it must not be released a second time here.
+	 */
+	__db_err(dbmp->dbenv, "%s: %s failed for page %lu",
+	    __memp_fn(dbmfp), is_pgin ? "pgin" : "pgout", (u_long)bhp->pgno);
+	return (ret);
+}
+
+/*
+ * __memp_bhfree --
+ *	Free a bucket header and its referenced data.
+ *
+ *	Unlinks the buffer from its hash bucket and from the cache's LRU
+ *	queue, clears the record of its mutex, and drops the owning
+ *	MPOOLFILE's buffer count, discarding the MPOOLFILE when both its
+ *	buffer count and its mpf_cnt are zero.
+ *
+ *	dbmp	  per-process mpool handle.
+ *	bhp	  buffer header to remove.
+ *	free_mem  if non-zero, the memory is returned to the region and the
+ *		  clean-page statistic is decremented; if zero, the caller
+ *		  keeps the memory for immediate reuse.
+ *
+ * PUBLIC: void __memp_bhfree __P((DB_MPOOL *, BH *, int));
+ */
+void
+__memp_bhfree(dbmp, bhp, free_mem)
+	DB_MPOOL *dbmp;
+	BH *bhp;
+	int free_mem;
+{
+	DB_HASHTAB *dbht;
+	MPOOL *c_mp, *mp;
+	MPOOLFILE *mfp;
+	int n_bucket, n_cache;
+
+	mp = dbmp->reginfo[0].primary;
+	c_mp = BH_TO_CACHE(dbmp, bhp);
+	n_cache = NCACHE(mp, bhp->pgno);
+	n_bucket = NBUCKET(c_mp, bhp->mf_offset, bhp->pgno);
+	dbht = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab);
+
+	/* Delete the buffer header from the hash bucket queue. */
+	SH_TAILQ_REMOVE(&dbht[n_bucket], bhp, hq, __bh);
+
+	/* Delete the buffer header from the LRU queue. */
+	SH_TAILQ_REMOVE(&c_mp->bhq, bhp, q, __bh);
+
+	/*
+	 * Clear the mutex this buffer recorded.  The maintenance area offset
+	 * is relative to the buffer's own cache region, so it must be taken
+	 * from that cache's MPOOL (c_mp), matching the __db_shmutex_init
+	 * call in memp_fget, and not from the primary region's MPOOL.
+	 */
+	__db_shlocks_clear(&bhp->mutex, &dbmp->reginfo[n_cache],
+	    (REGMAINT *)R_ADDR(&dbmp->reginfo[n_cache], c_mp->maint_off));
+
+	/*
+	 * Find the underlying MPOOLFILE and decrement its reference count.
+	 * If this is its last reference, remove it.
+	 */
+	mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
+	if (--mfp->block_cnt == 0 && mfp->mpf_cnt == 0)
+		__memp_mf_discard(dbmp, mfp);
+
+	/*
+	 * If we're not reusing it immediately, free the buffer header
+	 * and data for real.
+	 */
+	if (free_mem) {
+		--c_mp->stat.st_page_clean;
+		__db_shalloc_free(dbmp->reginfo[n_cache].addr, bhp);
+	}
+}
+
+/*
+ * __memp_upgrade --
+ *	Upgrade a file descriptor from readonly to readwrite.
+ *
+ *	Returns 0 if the handle is (or already was) upgraded, and non-zero
+ *	if the upgrade failed now or on an earlier attempt.
+ */
+static int
+__memp_upgrade(dbmp, dbmfp, mfp)
+	DB_MPOOL *dbmp;
+	DB_MPOOLFILE *dbmfp;
+	MPOOLFILE *mfp;
+{
+	DB_FH fh;
+	int ret;
+	char *rpath;
+
+	/*
+	 * !!!
+	 * The caller is expected to hold the handle lock.
+	 */
+
+	/* An earlier call already settled the question: reuse its answer. */
+	if (F_ISSET(dbmfp, MP_UPGRADE))
+		return (0);
+	if (F_ISSET(dbmfp, MP_UPGRADE_FAIL))
+		return (1);
+
+	/*
+	 * Build the file's real path.  A valid pathname must exist for the
+	 * file, since that's the only way a descriptor of any kind could
+	 * have been obtained for it.
+	 */
+	ret = __db_appname(dbmp->dbenv, DB_APP_DATA,
+	    NULL, R_ADDR(dbmp->reginfo, mfp->path_off), 0, NULL, &rpath);
+	if (ret != 0)
+		return (ret);
+
+	/* Try to reopen the file read/write. */
+	if (__os_open(dbmp->dbenv, rpath, 0, 0, &fh) == 0) {
+		/* Replace the old descriptor and note the upgrade. */
+		(void)__os_closehandle(&dbmfp->fh);
+		dbmfp->fh = fh;
+		F_SET(dbmfp, MP_UPGRADE);
+		ret = 0;
+	} else {
+		/* Remember the failure so the open isn't retried. */
+		F_SET(dbmfp, MP_UPGRADE_FAIL);
+		ret = 1;
+	}
+
+	__os_freestr(rpath);
+	return (ret);
+}
diff --git a/bdb/mp/mp_fget.c b/bdb/mp/mp_fget.c
new file mode 100644
index 00000000000..1bff5e136ab
--- /dev/null
+++ b/bdb/mp/mp_fget.c
@@ -0,0 +1,417 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Sleepycat Software. All rights reserved.
+ */
+#include "db_config.h"
+
+#ifndef lint
+static const char revid[] = "$Id: mp_fget.c,v 11.28 2001/01/10 04:50:53 ubell Exp $";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <string.h>
+#endif
+
+#ifdef HAVE_RPC
+#include "db_server.h"
+#endif
+
+#include "db_int.h"
+#include "db_shash.h"
+#include "mp.h"
+
+#ifdef HAVE_RPC
+#include "gen_client_ext.h"
+#include "rpc_client_ext.h"
+#endif
+
+/*
+ * memp_fget --
+ * Get a page from the file.
+ */
+int
+memp_fget(dbmfp, pgnoaddr, flags, addrp)
+ DB_MPOOLFILE *dbmfp;
+ db_pgno_t *pgnoaddr;
+ u_int32_t flags;
+ void *addrp;
+{
+ BH *bhp;
+ DB_ENV *dbenv;
+ DB_MPOOL *dbmp;
+ DB_HASHTAB *dbht;
+ MPOOL *c_mp, *mp;
+ MPOOLFILE *mfp;
+ size_t n_bucket, n_cache, mf_offset;
+ u_int32_t st_hsearch;
+ int b_incr, first, ret;
+
+ dbmp = dbmfp->dbmp;
+ dbenv = dbmp->dbenv;
+ mp = dbmp->reginfo[0].primary;
+ mfp = dbmfp->mfp;
+#ifdef HAVE_RPC
+ if (F_ISSET(dbenv, DB_ENV_RPCCLIENT))
+ return (__dbcl_memp_fget(dbmfp, pgnoaddr, flags, addrp));
+#endif
+
+ PANIC_CHECK(dbenv);
+
+ /*
+ * Validate arguments.
+ *
+ * !!!
+ * Don't test for DB_MPOOL_CREATE and DB_MPOOL_NEW flags for readonly
+ * files here, and create non-existent pages in readonly files if the
+ * flags are set, later. The reason is that the hash access method
+ * wants to get empty pages that don't really exist in readonly files.
+ * The only alternative is for hash to write the last "bucket" all the
+ * time, which we don't want to do because one of our big goals in life
+ * is to keep database files small. It's sleazy as hell, but we catch
+ * any attempt to actually write the file in memp_fput().
+ */
+#define OKFLAGS \
+ (DB_MPOOL_CREATE | DB_MPOOL_LAST | \
+ DB_MPOOL_NEW | DB_MPOOL_NEW_GROUP | DB_MPOOL_EXTENT)
+ if (flags != 0) {
+ if ((ret = __db_fchk(dbenv, "memp_fget", flags, OKFLAGS)) != 0)
+ return (ret);
+
+ switch (flags & ~DB_MPOOL_EXTENT) {
+ case DB_MPOOL_CREATE:
+ case DB_MPOOL_LAST:
+ case DB_MPOOL_NEW:
+ case DB_MPOOL_NEW_GROUP:
+ case 0:
+ break;
+ default:
+ return (__db_ferr(dbenv, "memp_fget", 1));
+ }
+ }
+
+#ifdef DIAGNOSTIC
+ /*
+ * XXX
+ * We want to switch threads as often as possible. Yield every time
+ * we get a new page to ensure contention.
+ */
+ if (DB_GLOBAL(db_pageyield))
+ __os_yield(dbenv, 1);
+#endif
+
+ /* Initialize remaining local variables. */
+ mf_offset = R_OFFSET(dbmp->reginfo, mfp);
+ bhp = NULL;
+ st_hsearch = 0;
+ b_incr = ret = 0;
+
+ R_LOCK(dbenv, dbmp->reginfo);
+
+ /*
+ * Check for the new, last or last + 1 page requests.
+ *
+ * Examine and update the file's last_pgno value. We don't care if
+ * the last_pgno value immediately changes due to another thread --
+ * at this instant in time, the value is correct. We do increment the
+ * current last_pgno value if the thread is asking for a new page,
+ * however, to ensure that two threads creating pages don't get the
+ * same one.
+ *
+ * If we create a page, there is the potential that a page after it
+ * in the file will be written before it will be written. Recovery
+ * depends on pages that are "created" in the file by subsequent pages
+ * being written be zeroed out, not have random garbage. Ensure that
+ * the OS agrees.
+ *
+ * !!!
+ * DB_MPOOL_NEW_GROUP is undocumented -- the hash access method needs
+ * to allocate contiguous groups of pages in order to do subdatabases.
+ * We return the first page in the group, but the caller must put an
+ * LSN on the *last* page and write it, otherwise after a crash we may
+ * not create all of the pages we need to create.
+ */
+ if (LF_ISSET(DB_MPOOL_LAST | DB_MPOOL_NEW | DB_MPOOL_NEW_GROUP)) {
+ if (LF_ISSET(DB_MPOOL_NEW)) {
+ if (F_ISSET(&dbmfp->fh, DB_FH_VALID) && (ret =
+ __os_fpinit(dbenv, &dbmfp->fh, mfp->last_pgno + 1,
+ 1, mfp->stat.st_pagesize)) != 0) {
+ R_UNLOCK(dbenv, dbmp->reginfo);
+ return (ret);
+ }
+ ++mfp->last_pgno;
+ }
+ if (LF_ISSET(DB_MPOOL_NEW_GROUP)) {
+ if (F_ISSET(&dbmfp->fh, DB_FH_VALID) && (ret =
+ __os_fpinit(dbenv, &dbmfp->fh, mfp->last_pgno + 1,
+ (int)*pgnoaddr, mfp->stat.st_pagesize)) != 0) {
+ R_UNLOCK(dbenv, dbmp->reginfo);
+ return (ret);
+ }
+ mfp->last_pgno += *pgnoaddr;
+ }
+ *pgnoaddr = mfp->last_pgno;
+ }
+
+ /*
+ * Determine the hash bucket where this page will live, and get local
+ * pointers to the cache and its hash table.
+ */
+ n_cache = NCACHE(mp, *pgnoaddr);
+ c_mp = dbmp->reginfo[n_cache].primary;
+ n_bucket = NBUCKET(c_mp, mf_offset, *pgnoaddr);
+ dbht = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab);
+
+ if (LF_ISSET(DB_MPOOL_NEW | DB_MPOOL_NEW_GROUP))
+ goto alloc;
+
+ /*
+ * If mmap'ing the file and the page is not past the end of the file,
+ * just return a pointer.
+ *
+ * The page may be past the end of the file, so check the page number
+ * argument against the original length of the file. If we previously
+ * returned pages past the original end of the file, last_pgno will
+ * have been updated to match the "new" end of the file, and checking
+ * against it would return pointers past the end of the mmap'd region.
+ *
+ * If another process has opened the file for writing since we mmap'd
+ * it, we will start playing the game by their rules, i.e. everything
+ * goes through the cache. All pages previously returned will be safe,
+ * as long as the correct locking protocol was observed.
+ *
+ * XXX
+ * We don't discard the map because we don't know when all of the
+ * pages will have been discarded from the process' address space.
+ * It would be possible to do so by reference counting the open
+ * pages from the mmap, but it's unclear to me that it's worth it.
+ */
+ if (dbmfp->addr != NULL && F_ISSET(mfp, MP_CAN_MMAP)) {
+ if (*pgnoaddr > mfp->orig_last_pgno) {
+ /*
+ * !!!
+ * See the comment above about non-existent pages and
+ * the hash access method.
+ */
+ if (!LF_ISSET(DB_MPOOL_CREATE)) {
+ if (!LF_ISSET(DB_MPOOL_EXTENT))
+ __db_err(dbenv,
+ "%s: page %lu doesn't exist",
+ __memp_fn(dbmfp), (u_long)*pgnoaddr);
+ ret = EINVAL;
+ goto err;
+ }
+ } else {
+ *(void **)addrp =
+ R_ADDR(dbmfp, *pgnoaddr * mfp->stat.st_pagesize);
+ ++mfp->stat.st_map;
+ goto done;
+ }
+ }
+
+ /* Search the hash chain for the page. */
+ for (bhp = SH_TAILQ_FIRST(&dbht[n_bucket], __bh);
+ bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) {
+ ++st_hsearch;
+ if (bhp->pgno != *pgnoaddr || bhp->mf_offset != mf_offset)
+ continue;
+
+ /* Increment the reference count. */
+ if (bhp->ref == UINT16_T_MAX) {
+ __db_err(dbenv,
+ "%s: page %lu: reference count overflow",
+ __memp_fn(dbmfp), (u_long)bhp->pgno);
+ ret = EINVAL;
+ goto err;
+ }
+
+ /*
+ * Increment the reference count. We may discard the region
+ * lock as we evaluate and/or read the buffer, so we need to
+ * ensure that it doesn't move and that its contents remain
+ * unchanged.
+ */
+ ++bhp->ref;
+ b_incr = 1;
+
+ /*
+ * Any buffer we find might be trouble.
+ *
+ * BH_LOCKED --
+ * I/O is in progress. Because we've incremented the buffer
+ * reference count, we know the buffer can't move. Unlock
+ * the region lock, wait for the I/O to complete, and reacquire
+ * the region.
+ */
+ for (first = 1; F_ISSET(bhp, BH_LOCKED); first = 0) {
+ R_UNLOCK(dbenv, dbmp->reginfo);
+
+ /*
+ * Explicitly yield the processor if it's not the first
+ * pass through this loop -- if we don't, we might end
+ * up running to the end of our CPU quantum as we will
+ * simply be swapping between the two locks.
+ */
+ if (!first)
+ __os_yield(dbenv, 1);
+
+ MUTEX_LOCK(dbenv, &bhp->mutex, dbenv->lockfhp);
+ /* Wait for I/O to finish... */
+ MUTEX_UNLOCK(dbenv, &bhp->mutex);
+ R_LOCK(dbenv, dbmp->reginfo);
+ }
+
+ /*
+ * BH_TRASH --
+ * The contents of the buffer are garbage. Shouldn't happen,
+ * and this read is likely to fail, but might as well try.
+ */
+ if (F_ISSET(bhp, BH_TRASH))
+ goto reread;
+
+ /*
+ * BH_CALLPGIN --
+ * The buffer was converted so it could be written, and the
+ * contents need to be converted again.
+ */
+ if (F_ISSET(bhp, BH_CALLPGIN)) {
+ if ((ret = __memp_pg(dbmfp, bhp, 1)) != 0)
+ goto err;
+ F_CLR(bhp, BH_CALLPGIN);
+ }
+
+ ++mfp->stat.st_cache_hit;
+ *(void **)addrp = bhp->buf;
+ goto done;
+ }
+
+alloc: /* Allocate new buffer header and data space. */
+ if ((ret = __memp_alloc(dbmp,
+ &dbmp->reginfo[n_cache], mfp, 0, NULL, &bhp)) != 0)
+ goto err;
+
+ ++c_mp->stat.st_page_clean;
+
+ /*
+ * Initialize the BH fields so that we can call the __memp_bhfree
+ * routine if an error occurs.
+ */
+ memset(bhp, 0, sizeof(BH));
+ bhp->ref = 1;
+ bhp->pgno = *pgnoaddr;
+ bhp->mf_offset = mf_offset;
+
+ /* Increment the count of buffers referenced by this MPOOLFILE. */
+ ++mfp->block_cnt;
+
+ /*
+ * Prepend the bucket header to the head of the appropriate MPOOL
+ * bucket hash list. Append the bucket header to the tail of the
+ * MPOOL LRU chain.
+ */
+ SH_TAILQ_INSERT_HEAD(&dbht[n_bucket], bhp, hq, __bh);
+ SH_TAILQ_INSERT_TAIL(&c_mp->bhq, bhp, q);
+
+#ifdef DIAGNOSTIC
+ if ((db_alignp_t)bhp->buf & (sizeof(size_t) - 1)) {
+ __db_err(dbenv, "Internal error: BH data NOT size_t aligned.");
+ ret = EINVAL;
+ __memp_bhfree(dbmp, bhp, 1);
+ goto err;
+ }
+#endif
+
+ if ((ret = __db_shmutex_init(dbenv, &bhp->mutex,
+ R_OFFSET(dbmp->reginfo, &bhp->mutex) + DB_FCNTL_OFF_MPOOL,
+ 0, &dbmp->reginfo[n_cache],
+ (REGMAINT *)R_ADDR(&dbmp->reginfo[n_cache], c_mp->maint_off)))
+ != 0) {
+ __memp_bhfree(dbmp, bhp, 1);
+ goto err;
+ }
+
+ /*
+ * If we created the page, zero it out and continue.
+ *
+ * !!!
+ * Note: DB_MPOOL_NEW specifically doesn't call the pgin function.
+ * If DB_MPOOL_CREATE is used, then the application's pgin function
+ * has to be able to handle pages of 0's -- if it uses DB_MPOOL_NEW,
+ * it can detect all of its page creates, and not bother.
+ *
+ * If we're running in diagnostic mode, smash any bytes on the
+ * page that are unknown quantities for the caller.
+ *
+ * Otherwise, read the page into memory, optionally creating it if
+ * DB_MPOOL_CREATE is set.
+ */
+ if (LF_ISSET(DB_MPOOL_NEW | DB_MPOOL_NEW_GROUP)) {
+ if (mfp->clear_len == 0)
+ memset(bhp->buf, 0, mfp->stat.st_pagesize);
+ else {
+ memset(bhp->buf, 0, mfp->clear_len);
+#ifdef DIAGNOSTIC
+ memset(bhp->buf + mfp->clear_len, CLEAR_BYTE,
+ mfp->stat.st_pagesize - mfp->clear_len);
+#endif
+ }
+
+ ++mfp->stat.st_page_create;
+ } else {
+ /*
+ * It's possible for the read function to fail, which means
+ * that we fail as well. Note, the __memp_pgread() function
+ * discards the region lock, so the buffer must be pinned
+ * down so that it cannot move and its contents are unchanged.
+ */
+reread: if ((ret = __memp_pgread(dbmfp,
+ bhp, LF_ISSET(DB_MPOOL_CREATE|DB_MPOOL_EXTENT))) != 0) {
+ /*
+ * !!!
+ * Discard the buffer unless another thread is waiting
+ * on our I/O to complete. Regardless, the header has
+ * the BH_TRASH flag set.
+ */
+ if (bhp->ref == 1)
+ __memp_bhfree(dbmp, bhp, 1);
+ goto err;
+ }
+
+ ++mfp->stat.st_cache_miss;
+ }
+
+ /*
+ * If we're returning a page after our current notion of the last-page,
+ * update our information. Note, there's no way to un-instantiate this
+ * page, it's going to exist whether it's returned to us dirty or not.
+ */
+ if (bhp->pgno > mfp->last_pgno)
+ mfp->last_pgno = bhp->pgno;
+
+ *(void **)addrp = bhp->buf;
+
+done: /* Update the chain search statistics. */
+ if (st_hsearch) {
+ ++c_mp->stat.st_hash_searches;
+ if (st_hsearch > c_mp->stat.st_hash_longest)
+ c_mp->stat.st_hash_longest = st_hsearch;
+ c_mp->stat.st_hash_examined += st_hsearch;
+ }
+
+ ++dbmfp->pinref;
+
+ R_UNLOCK(dbenv, dbmp->reginfo);
+
+ return (0);
+
+err: /* Discard our reference. */
+ if (b_incr)
+ --bhp->ref;
+ R_UNLOCK(dbenv, dbmp->reginfo);
+
+ *(void **)addrp = NULL;
+ return (ret);
+}
diff --git a/bdb/mp/mp_fopen.c b/bdb/mp/mp_fopen.c
new file mode 100644
index 00000000000..3611ded18f4
--- /dev/null
+++ b/bdb/mp/mp_fopen.c
@@ -0,0 +1,756 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Sleepycat Software. All rights reserved.
+ */
+#include "db_config.h"
+
+#ifndef lint
+static const char revid[] = "$Id: mp_fopen.c,v 11.41 2001/01/10 04:50:53 ubell Exp $";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <string.h>
+#endif
+
+#ifdef HAVE_RPC
+#include "db_server.h"
+#endif
+
+#include "db_int.h"
+#include "db_shash.h"
+#include "mp.h"
+
+#ifdef HAVE_RPC
+#include "gen_client_ext.h"
+#include "rpc_client_ext.h"
+#endif
+
+static int __memp_mf_open __P((DB_MPOOL *, const char *,
+ size_t, db_pgno_t, DB_MPOOL_FINFO *, u_int32_t, MPOOLFILE **));
+
+/*
+ * MEMP_FREMOVE --
+ * Discard an MPOOLFILE and any buffers it references: update the flags
+ * so we never try to write buffers associated with the file, nor can we
+ * find it when looking for files to join. In addition, clear the ftype
+ * field, there's no reason to post-process pages, they can be discarded
+ * by any thread.
+ */
+#define MEMP_FREMOVE(mfp) { \
+ mfp->ftype = 0; \
+ F_SET(mfp, MP_DEADFILE); \
+}
+
+/*
+ * memp_fopen --
+ *	Open a backing file for the memory pool.
+ *
+ *	Public entry point: checks the environment and the arguments, then
+ *	hands the request to __memp_fopen() with no pre-existing MPOOLFILE
+ *	and with region locking enabled.
+ */
+int
+memp_fopen(dbenv, path, flags, mode, pagesize, finfop, retp)
+	DB_ENV *dbenv;
+	const char *path;
+	u_int32_t flags;
+	int mode;
+	size_t pagesize;
+	DB_MPOOL_FINFO *finfop;
+	DB_MPOOLFILE **retp;
+{
+	int t_ret;
+
+#ifdef HAVE_RPC
+	/* RPC clients forward the whole request. */
+	if (F_ISSET(dbenv, DB_ENV_RPCCLIENT))
+		return (__dbcl_memp_fopen(dbenv, path, flags,
+		    mode, pagesize, finfop, retp));
+#endif
+
+	PANIC_CHECK(dbenv);
+	ENV_REQUIRES_CONFIG(dbenv, dbenv->mp_handle, DB_INIT_MPOOL);
+
+	/* Reject any flag outside the supported set. */
+	t_ret = __db_fchk(dbenv, "memp_fopen", flags,
+	    DB_CREATE |
+	    DB_NOMMAP | DB_ODDFILESIZE | DB_RDONLY | DB_TRUNCATE);
+	if (t_ret != 0)
+		return (t_ret);
+
+	/*
+	 * The page size must be non-zero, and a caller-supplied clear
+	 * length may not be larger than a page.
+	 */
+	if (pagesize != 0 &&
+	    (finfop == NULL || finfop->clear_len <= pagesize))
+		return (__memp_fopen(dbenv->mp_handle,
+		    NULL, path, flags, mode, pagesize, 1, finfop, retp));
+
+	__db_err(dbenv, "memp_fopen: illegal page size.");
+	return (EINVAL);
+}
+
+/*
+ * __memp_set_unlink -- set unlink on last close flag.
+ *
+ *	Sets MP_UNLINK on the handle's shared MPOOLFILE while holding the
+ *	region lock.
+ *
+ * PUBLIC: void __memp_set_unlink __P((DB_MPOOLFILE *));
+ */
+void
+__memp_set_unlink(dbmpf)
+	DB_MPOOLFILE *dbmpf;
+{
+	DB_MPOOL *pool;
+
+	pool = dbmpf->dbmp;
+
+	R_LOCK(pool->dbenv, pool->reginfo);
+	F_SET(dbmpf->mfp, MP_UNLINK);
+	R_UNLOCK(pool->dbenv, pool->reginfo);
+}
+
+/*
+ * __memp_clear_unlink -- clear unlink on last close flag.
+ *
+ *	Clears MP_UNLINK on the handle's shared MPOOLFILE.  The region lock
+ *	is only taken when the flag is actually set.
+ *
+ * PUBLIC: void __memp_clear_unlink __P((DB_MPOOLFILE *));
+ */
+void
+__memp_clear_unlink(dbmpf)
+	DB_MPOOLFILE *dbmpf;
+{
+	DB_MPOOL *pool;
+
+	pool = dbmpf->dbmp;
+
+	/*
+	 * This bit is protected in the queue code because the metapage
+	 * is locked, so we can avoid getting the region lock for the test.
+	 * If this gets used from other than the queue code, we cannot.
+	 */
+	if (F_ISSET(dbmpf->mfp, MP_UNLINK)) {
+		R_LOCK(pool->dbenv, pool->reginfo);
+		F_CLR(dbmpf->mfp, MP_UNLINK);
+		R_UNLOCK(pool->dbenv, pool->reginfo);
+	}
+}
+
+/*
+ * __memp_fopen --
+ *	Open a backing file for the memory pool; internal version.
+ *
+ *	dbmp		per-process pool handle
+ *	mfp		shared MPOOLFILE to join, or NULL to find/create one
+ *	path		file name, or NULL for a temporary file
+ *	flags		DB_CREATE, DB_RDONLY, DB_NOMMAP, ... as checked by
+ *			the public memp_fopen() wrapper
+ *	mode		file mode passed to __os_open()
+ *	pagesize	page size of the file
+ *	needlock	non-zero if the region lock must be acquired here
+ *			around the MPOOLFILE lookup/reference
+ *	finfop		optional file information; defaulted when NULL
+ *	retp		set to the new DB_MPOOLFILE on success
+ *
+ *	Returns 0 on success and a non-zero error value on failure.
+ *
+ * PUBLIC: int __memp_fopen __P((DB_MPOOL *, MPOOLFILE *, const char *,
+ * PUBLIC:    u_int32_t, int, size_t, int, DB_MPOOL_FINFO *, DB_MPOOLFILE **));
+ */
+int
+__memp_fopen(dbmp, mfp, path, flags, mode, pagesize, needlock, finfop, retp)
+	DB_MPOOL *dbmp;
+	MPOOLFILE *mfp;
+	const char *path;
+	u_int32_t flags;
+	int mode, needlock;
+	size_t pagesize;
+	DB_MPOOL_FINFO *finfop;
+	DB_MPOOLFILE **retp;
+{
+	DB_ENV *dbenv;
+	DB_MPOOLFILE *dbmfp;
+	DB_MPOOL_FINFO finfo;
+	db_pgno_t last_pgno;
+	size_t maxmap;
+	u_int32_t mbytes, bytes, oflags;
+	int ret;
+	u_int8_t idbuf[DB_FILE_ID_LEN];
+	char *rpath;
+
+	dbenv = dbmp->dbenv;
+	ret = 0;
+	rpath = NULL;
+
+	/*
+	 * The file size is only looked up for named files.  Start from zero
+	 * so the mmap size test below never reads indeterminate values when
+	 * path is NULL.
+	 */
+	mbytes = bytes = 0;
+
+	/*
+	 * If mfp is provided, we take the DB_MPOOL_FINFO information from
+	 * the mfp.  We don't bother initializing everything, because some
+	 * of them are expensive to acquire.  If no mfp is provided and the
+	 * finfop argument is NULL, we default the values.
+	 */
+	if (finfop == NULL) {
+		memset(&finfo, 0, sizeof(finfo));
+		if (mfp != NULL) {
+			finfo.ftype = mfp->ftype;
+			finfo.pgcookie = NULL;
+			finfo.fileid = NULL;
+			finfo.lsn_offset = mfp->lsn_off;
+			finfo.clear_len = mfp->clear_len;
+		} else {
+			finfo.ftype = 0;
+			finfo.pgcookie = NULL;
+			finfo.fileid = NULL;
+			finfo.lsn_offset = -1;
+			finfo.clear_len = 0;
+		}
+		finfop = &finfo;
+	}
+
+	/* Allocate and initialize the per-process structure. */
+	if ((ret = __os_calloc(dbenv, 1, sizeof(DB_MPOOLFILE), &dbmfp)) != 0)
+		return (ret);
+	dbmfp->dbmp = dbmp;
+	dbmfp->ref = 1;
+	if (LF_ISSET(DB_RDONLY))
+		F_SET(dbmfp, MP_READONLY);
+
+	if (path == NULL) {
+		if (LF_ISSET(DB_RDONLY)) {
+			__db_err(dbenv,
+			    "memp_fopen: temporary files can't be readonly");
+			ret = EINVAL;
+			goto err;
+		}
+		last_pgno = 0;
+	} else {
+		/* Get the real name for this file and open it. */
+		if ((ret = __db_appname(dbenv,
+		    DB_APP_DATA, NULL, path, 0, NULL, &rpath)) != 0)
+			goto err;
+		oflags = 0;
+		if (LF_ISSET(DB_CREATE))
+			oflags |= DB_OSO_CREATE;
+		if (LF_ISSET(DB_RDONLY))
+			oflags |= DB_OSO_RDONLY;
+		if ((ret =
+		    __os_open(dbenv, rpath, oflags, mode, &dbmfp->fh)) != 0) {
+			if (!LF_ISSET(DB_EXTENT))
+				__db_err(dbenv,
+				    "%s: %s", rpath, db_strerror(ret));
+			goto err;
+		}
+
+		/*
+		 * Don't permit files that aren't a multiple of the pagesize,
+		 * and find the number of the last page in the file, all the
+		 * time being careful not to overflow 32 bits.
+		 *
+		 * !!!
+		 * We can't use off_t's here, or in any code in the mainline
+		 * library for that matter.  (We have to use them in the os
+		 * stubs, of course, as there are system calls that take them
+		 * as arguments.)  The reason is that some customers build in
+		 * environments where an off_t is 32-bits, but still run where
+		 * offsets are 64-bits, and they pay us a lot of money.
+		 */
+		if ((ret = __os_ioinfo(dbenv, rpath,
+		    &dbmfp->fh, &mbytes, &bytes, NULL)) != 0) {
+			__db_err(dbenv, "%s: %s", rpath, db_strerror(ret));
+			goto err;
+		}
+
+		/* Page sizes have to be a power-of-two, ignore mbytes. */
+		if (bytes % pagesize != 0) {
+			if (LF_ISSET(DB_ODDFILESIZE))
+				/*
+				 * If we're doing a verify, we might
+				 * have to cope with a truncated file;
+				 * round down, we'll worry about the partial
+				 * page outside the memp system.
+				 */
+				bytes -= (bytes % pagesize);
+			else {
+				__db_err(dbenv,
+			    "%s: file size not a multiple of the pagesize",
+				    rpath);
+				ret = EINVAL;
+				goto err;
+			}
+		}
+
+		last_pgno = mbytes * (MEGABYTE / pagesize);
+		last_pgno += bytes / pagesize;
+
+		/* Correction: page numbers are zero-based, not 1-based. */
+		if (last_pgno != 0)
+			--last_pgno;
+
+		/*
+		 * Get the file id if we weren't given one.  Generated file id's
+		 * don't use timestamps, otherwise there'd be no chance of any
+		 * other process joining the party.
+		 */
+		if (finfop->fileid == NULL) {
+			if ((ret = __os_fileid(dbenv, rpath, 0, idbuf)) != 0)
+				goto err;
+			finfop->fileid = idbuf;
+		}
+	}
+
+	/*
+	 * If we weren't provided an underlying shared object to join with,
+	 * find/allocate the shared file objects.  Also allocate space for
+	 * the per-process thread lock.
+	 */
+	if (needlock)
+		R_LOCK(dbenv, dbmp->reginfo);
+	if (mfp == NULL)
+		ret = __memp_mf_open(
+		    dbmp, path, pagesize, last_pgno, finfop, flags, &mfp);
+	else {
+		++mfp->mpf_cnt;
+		ret = 0;
+	}
+	if (needlock)
+		R_UNLOCK(dbenv, dbmp->reginfo);
+	if (ret != 0)
+		goto err;
+
+	if (F_ISSET(dbenv, DB_ENV_THREAD)) {
+		if ((ret = __db_mutex_alloc(
+		    dbenv, dbmp->reginfo, &dbmfp->mutexp)) != 0)
+			goto err;
+		if ((ret = __db_mutex_init(
+		    dbenv, dbmfp->mutexp, 0, MUTEX_THREAD)) != 0)
+			goto err;
+
+		/* XXX: KEITH: CLOSE THE FILE ON FAILURE? */
+	}
+
+	dbmfp->mfp = mfp;
+
+	/*
+	 * If a file:
+	 *	+ is read-only
+	 *	+ isn't temporary
+	 *	+ doesn't require any pgin/pgout support
+	 *	+ the DB_NOMMAP flag wasn't set (in either the file open or
+	 *	  the environment in which it was opened)
+	 *	+ and is less than mp_mmapsize bytes in size
+	 *
+	 * we can mmap it instead of reading/writing buffers.  Don't do error
+	 * checking based on the mmap call failure.  We want to do normal I/O
+	 * on the file if the reason we failed was because the file was on an
+	 * NFS mounted partition, and we can fail in buffer I/O just as easily
+	 * as here.
+	 *
+	 * XXX
+	 * We'd like to test to see if the file is too big to mmap.  Since we
+	 * don't know what size or type off_t's or size_t's are, or the largest
+	 * unsigned integral type is, or what random insanity the local C
+	 * compiler will perpetrate, doing the comparison in a portable way is
+	 * flatly impossible.  Hope that mmap fails if the file is too large.
+	 */
+#define	DB_MAXMMAPSIZE	(10 * 1024 * 1024)	/* 10 Mb. */
+	if (F_ISSET(mfp, MP_CAN_MMAP)) {
+		if (!F_ISSET(dbmfp, MP_READONLY))
+			F_CLR(mfp, MP_CAN_MMAP);
+		if (path == NULL)
+			F_CLR(mfp, MP_CAN_MMAP);
+		if (finfop->ftype != 0)
+			F_CLR(mfp, MP_CAN_MMAP);
+		if (LF_ISSET(DB_NOMMAP) || F_ISSET(dbenv, DB_ENV_NOMMAP))
+			F_CLR(mfp, MP_CAN_MMAP);
+		maxmap = dbenv->mp_mmapsize == 0 ?
+		    DB_MAXMMAPSIZE : dbenv->mp_mmapsize;
+		if (mbytes > maxmap / MEGABYTE ||
+		    (mbytes == maxmap / MEGABYTE && bytes >= maxmap % MEGABYTE))
+			F_CLR(mfp, MP_CAN_MMAP);
+	}
+	dbmfp->addr = NULL;
+	if (F_ISSET(mfp, MP_CAN_MMAP)) {
+		dbmfp->len = (size_t)mbytes * MEGABYTE + bytes;
+		if (__os_mapfile(dbenv, rpath,
+		    &dbmfp->fh, dbmfp->len, 1, &dbmfp->addr) != 0) {
+			dbmfp->addr = NULL;
+			F_CLR(mfp, MP_CAN_MMAP);
+		}
+	}
+	if (rpath != NULL)
+		__os_freestr(rpath);
+
+	/* Make the new handle visible on the per-process list. */
+	MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp);
+	TAILQ_INSERT_TAIL(&dbmp->dbmfq, dbmfp, q);
+	MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
+
+	*retp = dbmfp;
+	return (0);
+
+err:	/*
+	 * dbmfp is always valid here: the only failure that precedes its
+	 * allocation returns directly.  The thread mutex may have been
+	 * allocated if its initialization failed, so release it along with
+	 * everything else we acquired.
+	 *
+	 * NOTE(review): if we fail after taking a reference on the
+	 * MPOOLFILE above (mpf_cnt), that reference does not appear to be
+	 * dropped on this path -- confirm whether it should be.
+	 */
+	if (rpath != NULL)
+		__os_freestr(rpath);
+	if (F_ISSET(&dbmfp->fh, DB_FH_VALID))
+		(void)__os_closehandle(&dbmfp->fh);
+	if (dbmfp->mutexp != NULL)
+		__db_mutex_free(dbenv, dbmp->reginfo, dbmfp->mutexp);
+	__os_free(dbmfp, sizeof(DB_MPOOLFILE));
+	return (ret);
+}
+
+/*
+ * __memp_mf_open --
+ *	Open an MPOOLFILE.
+ *
+ *	For a named file, first look for an existing shared MPOOLFILE with
+ *	the same file id and join it.  Otherwise allocate a new MPOOLFILE
+ *	in the region, fill it in and put it at the head of the file list.
+ *	On success *retp refers to the MPOOLFILE and 0 is returned.
+ */
+static int
+__memp_mf_open(dbmp, path, pagesize, last_pgno, finfop, flags, retp)
+	DB_MPOOL *dbmp;
+	const char *path;
+	size_t pagesize;
+	db_pgno_t last_pgno;
+	DB_MPOOL_FINFO *finfop;
+	u_int32_t flags;
+	MPOOLFILE **retp;
+{
+	MPOOL *mp;
+	MPOOLFILE *mfp;
+	size_t pathlen;
+	int ret;
+	void *p;
+
+#define	ISTEMPORARY	(path == NULL)
+
+	mp = dbmp->reginfo[0].primary;
+
+	/*
+	 * If not creating a temporary file, walk the list of MPOOLFILE's,
+	 * looking for a matching file.  Files backed by temporary files
+	 * or previously removed files can't match.
+	 *
+	 * DB_TRUNCATE support.
+	 *
+	 * The fileID is a filesystem unique number (e.g., a UNIX dev/inode
+	 * pair) plus a timestamp.  If files are removed and created in less
+	 * than a second, the fileID can be repeated.  The problem with
+	 * repetition happens when the file that previously had the fileID
+	 * value still has pages in the pool, since we don't want to use them
+	 * to satisfy requests for the new file.
+	 *
+	 * Because the DB_TRUNCATE flag reuses the dev/inode pair, repeated
+	 * opens with that flag set guarantees matching fileIDs when the
+	 * machine can open a file and then re-open with truncate within a
+	 * second.  For this reason, we pass that flag down, and, if we find
+	 * a matching entry, we ensure that it's never found again, and we
+	 * create a new entry for the current request.
+	 */
+	if (!ISTEMPORARY)
+		for (mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile);
+		    mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) {
+			/* Dead and temporary files never match. */
+			if (F_ISSET(mfp, MP_DEADFILE | MP_TEMP))
+				continue;
+			if (memcmp(finfop->fileid, R_ADDR(dbmp->reginfo,
+			    mfp->fileid_off), DB_FILE_ID_LEN) != 0)
+				continue;
+
+			/* Same file id: retire it if we're truncating. */
+			if (LF_ISSET(DB_TRUNCATE)) {
+				MEMP_FREMOVE(mfp);
+				continue;
+			}
+
+			if (finfop->clear_len != mfp->clear_len ||
+			    pagesize != mfp->stat.st_pagesize) {
+				__db_err(dbmp->dbenv,
+				    "%s: page size or clear length changed",
+				    path);
+				return (EINVAL);
+			}
+
+			/*
+			 * It's possible that our needs for pre- and
+			 * post-processing are changing.  For example,
+			 * an application created a hash subdatabase
+			 * in a database that was previously all btree.
+			 */
+			if (finfop->ftype != 0)
+				mfp->ftype = finfop->ftype;
+
+			/* Join the existing MPOOLFILE. */
+			++mfp->mpf_cnt;
+			*retp = mfp;
+			return (0);
+		}
+
+	/* Nothing to join: allocate a new MPOOLFILE. */
+	if ((ret = __memp_alloc(
+	    dbmp, dbmp->reginfo, NULL, sizeof(MPOOLFILE), NULL, &mfp)) != 0)
+		goto mem_err;
+	*retp = mfp;
+
+	/* Initialize the structure. */
+	memset(mfp, 0, sizeof(MPOOLFILE));
+	mfp->mpf_cnt = 1;
+	mfp->ftype = finfop->ftype;
+	mfp->lsn_off = finfop->lsn_offset;
+	mfp->clear_len = finfop->clear_len;
+
+	/*
+	 * If the user specifies DB_MPOOL_LAST or DB_MPOOL_NEW on a memp_fget,
+	 * we have to know the last page in the file.  Figure it out and save
+	 * it away.
+	 */
+	mfp->stat.st_pagesize = pagesize;
+	mfp->orig_last_pgno = mfp->last_pgno = last_pgno;
+
+	if (ISTEMPORARY)
+		F_SET(mfp, MP_TEMP);
+	else {
+		/* Copy the file path (with its NUL) into shared memory. */
+		pathlen = strlen(path) + 1;
+		if ((ret = __memp_alloc(dbmp, dbmp->reginfo,
+		    NULL, pathlen, &mfp->path_off, &p)) != 0)
+			goto err;
+		memcpy(p, path, pathlen);
+
+		/* Copy the file identification string into shared memory. */
+		if ((ret = __memp_alloc(dbmp, dbmp->reginfo,
+		    NULL, DB_FILE_ID_LEN, &mfp->fileid_off, &p)) != 0)
+			goto err;
+		memcpy(p, finfop->fileid, DB_FILE_ID_LEN);
+
+		F_SET(mfp, MP_CAN_MMAP);
+	}
+
+	/* Copy the page cookie, if there is one, into shared memory. */
+	if (finfop->pgcookie != NULL && finfop->pgcookie->size != 0) {
+		if ((ret = __memp_alloc(dbmp, dbmp->reginfo,
+		    NULL, finfop->pgcookie->size, &mfp->pgcookie_off, &p)) != 0)
+			goto err;
+		memcpy(p, finfop->pgcookie->data, finfop->pgcookie->size);
+		mfp->pgcookie_len = finfop->pgcookie->size;
+	} else {
+		mfp->pgcookie_len = 0;
+		mfp->pgcookie_off = 0;
+	}
+
+	/* Prepend the MPOOLFILE to the list of MPOOLFILE's. */
+	SH_TAILQ_INSERT_HEAD(&mp->mpfq, mfp, q, __mpoolfile);
+	return (ret);
+
+err:	/* Release whatever pieces were allocated before the failure. */
+	if (mfp->path_off != 0)
+		__db_shalloc_free(dbmp->reginfo[0].addr,
+		    R_ADDR(dbmp->reginfo, mfp->path_off));
+	if (mfp->fileid_off != 0)
+		__db_shalloc_free(dbmp->reginfo[0].addr,
+		    R_ADDR(dbmp->reginfo, mfp->fileid_off));
+	if (mfp != NULL)
+		__db_shalloc_free(dbmp->reginfo[0].addr, mfp);
+
+mem_err:
+	__db_err(dbmp->dbenv,
+	    "Unable to allocate memory for mpool file");
+	return (ret);
+}
+
+/*
+ * memp_fclose --
+ *	Close a backing file for the memory pool.
+ *
+ *	Waits until this is the only reference to the DB_MPOOLFILE, takes
+ *	it off the per-process list, releases the mapping, file handle and
+ *	thread mutex, drops the reference on the shared MPOOLFILE and frees
+ *	the handle.  Teardown always runs to completion; the first error
+ *	encountered along the way is the one returned.
+ */
+int
+memp_fclose(dbmfp)
+	DB_MPOOLFILE *dbmfp;
+{
+	DB_ENV *dbenv;
+	DB_MPOOL *dbmp;
+	MPOOLFILE *mfp;
+	char *rpath;
+	int ret, t_ret;
+
+	dbmp = dbmfp->dbmp;
+	dbenv = dbmp->dbenv;
+	ret = 0;
+
+	PANIC_CHECK(dbenv);
+
+#ifdef HAVE_RPC
+	if (F_ISSET(dbenv, DB_ENV_RPCCLIENT))
+		return (__dbcl_memp_fclose(dbmfp));
+#endif
+
+	/*
+	 * Remove the DB_MPOOLFILE from the queue.  This has to happen before
+	 * we perform any action that can fail, otherwise __memp_close may
+	 * loop infinitely when calling us to discard all of the DB_MPOOLFILEs.
+	 */
+	for (;;) {
+		MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp);
+
+		/*
+		 * We have to reference count DB_MPOOLFILE structures as other
+		 * threads may be using them.  The problem only happens if the
+		 * application makes a bad design choice.  Here's the path:
+		 *
+		 * Thread A opens a database.
+		 * Thread B uses thread A's DB_MPOOLFILE to write a buffer
+		 *    in order to free up memory in the mpool cache.
+		 * Thread A closes the database while thread B is using the
+		 *    DB_MPOOLFILE structure.
+		 *
+		 * By opening all databases before creating the threads, and
+		 * closing them after the threads have exited, applications
+		 * get better performance and avoid the problem path entirely.
+		 *
+		 * Regardless, holding the DB_MPOOLFILE to flush a dirty buffer
+		 * is a short-term lock, even in worst case, since we better be
+		 * the only thread of control using the DB_MPOOLFILE structure
+		 * to read pages *into* the cache.  Wait until we're the only
+		 * reference holder and remove the DB_MPOOLFILE structure from
+		 * the list, so nobody else can even find it.
+		 */
+		if (dbmfp->ref == 1) {
+			TAILQ_REMOVE(&dbmp->dbmfq, dbmfp, q);
+			break;
+		}
+		MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
+
+		(void)__os_sleep(dbenv, 1, 0);
+	}
+	MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
+
+	/* Complain if pinned blocks never returned. */
+	if (dbmfp->pinref != 0)
+		__db_err(dbenv, "%s: close: %lu blocks left pinned",
+		    __memp_fn(dbmfp), (u_long)dbmfp->pinref);
+
+	/* Discard any mmap information. */
+	if (dbmfp->addr != NULL &&
+	    (ret = __os_unmapfile(dbenv, dbmfp->addr, dbmfp->len)) != 0)
+		__db_err(dbenv, "%s: %s", __memp_fn(dbmfp), db_strerror(ret));
+
+	/*
+	 * Close the file; temporary files may not yet have been created.
+	 * Keep an earlier error if there was one, otherwise report this one.
+	 */
+	if (F_ISSET(&dbmfp->fh, DB_FH_VALID) &&
+	    (t_ret = __os_closehandle(&dbmfp->fh)) != 0) {
+		__db_err(dbenv, "%s: %s", __memp_fn(dbmfp), db_strerror(t_ret));
+		if (ret == 0)
+			ret = t_ret;
+	}
+
+	/* Discard the thread mutex. */
+	if (dbmfp->mutexp != NULL)
+		__db_mutex_free(dbenv, dbmp->reginfo, dbmfp->mutexp);
+
+	/*
+	 * Discard our reference on the underlying MPOOLFILE, and close
+	 * it if it's no longer useful to anyone.
+	 *
+	 * If we're not discarding it, and it's a temp file, this means
+	 * all the outstanding references belong to unflushed buffers.
+	 * (A temp file can only be referenced by one DB_MPOOLFILE).
+	 * We don't care about preserving any of those buffers, so mark
+	 * the MPOOLFILE as dead so that when we try to flush them,
+	 * even the dirty ones just get discarded.
+	 */
+	R_LOCK(dbenv, dbmp->reginfo);
+	mfp = dbmfp->mfp;
+	if (--mfp->mpf_cnt == 0) {
+		if (F_ISSET(mfp, MP_UNLINK)) {
+			MEMP_FREMOVE(mfp);
+
+			/*
+			 * Resolve the real path and remove the file.  t_ret
+			 * holds the actual error value from whichever step
+			 * failed; rpath is only freed if it was set.
+			 */
+			rpath = NULL;
+			if ((t_ret = __db_appname(dbenv, DB_APP_DATA, NULL,
+			    R_ADDR(dbmp->reginfo, mfp->path_off),
+			    0, NULL, &rpath)) == 0)
+				t_ret = __os_unlink(dbenv, rpath);
+			if (t_ret != 0 && ret == 0)
+				ret = t_ret;
+			if (rpath != NULL)
+				__os_freestr(rpath);
+		}
+		if (mfp->block_cnt == 0)
+			__memp_mf_discard(dbmp, mfp);
+	} else if (F_ISSET(mfp, MP_TEMP))
+		MEMP_FREMOVE(mfp);
+	R_UNLOCK(dbenv, dbmp->reginfo);
+
+	/* Discard the DB_MPOOLFILE structure. */
+	__os_free(dbmfp, sizeof(DB_MPOOLFILE));
+
+	return (ret);
+}
+
+/*
+ * __memp_mf_discard --
+ *	Discard an MPOOLFILE.
+ *
+ *	Unlinks the MPOOLFILE from the pool's file list and returns it, and
+ *	the path, file id and page cookie it references, to the region's
+ *	shared allocator.
+ *
+ * PUBLIC: void __memp_mf_discard __P((DB_MPOOL *, MPOOLFILE *));
+ */
+void
+__memp_mf_discard(dbmp, mfp)
+	DB_MPOOL *dbmp;
+	MPOOLFILE *mfp;
+{
+	MPOOL *primary;
+
+	/* Take it off the list of MPOOLFILEs first. */
+	primary = dbmp->reginfo[0].primary;
+	SH_TAILQ_REMOVE(&primary->mpfq, mfp, q, __mpoolfile);
+
+	/* Release each optional piece; a zero offset means "not allocated". */
+	if (mfp->path_off != 0)
+		__db_shalloc_free(dbmp->reginfo[0].addr,
+		    R_ADDR(dbmp->reginfo, mfp->path_off));
+
+	if (mfp->fileid_off != 0)
+		__db_shalloc_free(dbmp->reginfo[0].addr,
+		    R_ADDR(dbmp->reginfo, mfp->fileid_off));
+
+	if (mfp->pgcookie_off != 0)
+		__db_shalloc_free(dbmp->reginfo[0].addr,
+		    R_ADDR(dbmp->reginfo, mfp->pgcookie_off));
+
+	/* Finally, the MPOOLFILE itself. */
+	__db_shalloc_free(dbmp->reginfo[0].addr, mfp);
+}
+
+/*
+ * __memp_fremove --
+ *	Remove an underlying file from the system.
+ *
+ *	Marks the handle's shared MPOOLFILE dead (see MEMP_FREMOVE) while
+ *	holding the region lock.  Always returns 0 once the panic check
+ *	has passed.
+ *
+ * PUBLIC: int __memp_fremove __P((DB_MPOOLFILE *));
+ */
+int
+__memp_fremove(dbmfp)
+	DB_MPOOLFILE *dbmfp;
+{
+	DB_ENV *dbenv;
+	DB_MPOOL *pool;
+	MPOOLFILE *shared;
+
+	pool = dbmfp->dbmp;
+	shared = dbmfp->mfp;
+	dbenv = pool->dbenv;
+
+	PANIC_CHECK(dbenv);
+
+	R_LOCK(dbenv, pool->reginfo);
+	MEMP_FREMOVE(shared);
+	R_UNLOCK(dbenv, pool->reginfo);
+
+	return (0);
+}
+
+/*
+ * __memp_fn --
+ *	On errors we print whatever is available as the file name.
+ *
+ *	Convenience wrapper around __memp_fns() for a DB_MPOOLFILE handle.
+ *
+ * PUBLIC: char * __memp_fn __P((DB_MPOOLFILE *));
+ */
+char *
+__memp_fn(dbmfp)
+	DB_MPOOLFILE *dbmfp;
+{
+	DB_MPOOL *pool;
+	MPOOLFILE *shared;
+
+	pool = dbmfp->dbmp;
+	shared = dbmfp->mfp;
+
+	return (__memp_fns(pool, shared));
+}
+
+/*
+ * __memp_fns --
+ *	On errors we print whatever is available as the file name.
+ *
+ *	Returns the path stored in the region for the MPOOLFILE, or the
+ *	literal "temporary" when no path offset is recorded.
+ *
+ * PUBLIC: char * __memp_fns __P((DB_MPOOL *, MPOOLFILE *));
+ *
+ */
+char *
+__memp_fns(dbmp, mfp)
+	DB_MPOOL *dbmp;
+	MPOOLFILE *mfp;
+{
+	return (mfp->path_off != 0 ?
+	    (char *)R_ADDR(dbmp->reginfo, mfp->path_off) :
+	    (char *)"temporary");
+}
diff --git a/bdb/mp/mp_fput.c b/bdb/mp/mp_fput.c
new file mode 100644
index 00000000000..be03b721f36
--- /dev/null
+++ b/bdb/mp/mp_fput.c
@@ -0,0 +1,186 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Sleepycat Software. All rights reserved.
+ */
+#include "db_config.h"
+
+#ifndef lint
+static const char revid[] = "$Id: mp_fput.c,v 11.16 2000/11/30 00:58:41 ubell Exp $";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#endif
+
+#ifdef HAVE_RPC
+#include "db_server.h"
+#endif
+
+#include "db_int.h"
+#include "db_shash.h"
+#include "mp.h"
+
+#ifdef HAVE_RPC
+#include "gen_client_ext.h"
+#include "rpc_client_ext.h"
+#endif
+
+/*
+ * memp_fput --
+ *	Mpool file put function.
+ *
+ *	Returns a page previously obtained through this handle.  flags may
+ *	be 0 or a combination of DB_MPOOL_CLEAN, DB_MPOOL_DIRTY and
+ *	DB_MPOOL_DISCARD (CLEAN and DIRTY are mutually exclusive).
+ *
+ *	Returns 0 on success, EACCES if DB_MPOOL_DIRTY is given for a
+ *	read-only handle, EINVAL if more pages are returned than were
+ *	retrieved or the page is not pinned, or the error from the flag
+ *	checks.
+ */
+int
+memp_fput(dbmfp, pgaddr, flags)
+	DB_MPOOLFILE *dbmfp;
+	void *pgaddr;
+	u_int32_t flags;
+{
+	BH *bhp;
+	DB_ENV *dbenv;
+	DB_MPOOL *dbmp;
+	MPOOL *c_mp, *mp;
+	int ret, wrote;
+
+	dbmp = dbmfp->dbmp;
+	dbenv = dbmp->dbenv;
+	mp = dbmp->reginfo[0].primary;
+
+#ifdef HAVE_RPC
+	if (F_ISSET(dbenv, DB_ENV_RPCCLIENT))
+		return (__dbcl_memp_fput(dbmfp, pgaddr, flags));
+#endif
+
+	PANIC_CHECK(dbenv);
+
+	/* Validate arguments. */
+	if (flags) {
+		if ((ret = __db_fchk(dbenv, "memp_fput", flags,
+		    DB_MPOOL_CLEAN | DB_MPOOL_DIRTY | DB_MPOOL_DISCARD)) != 0)
+			return (ret);
+		if ((ret = __db_fcchk(dbenv, "memp_fput",
+		    flags, DB_MPOOL_CLEAN, DB_MPOOL_DIRTY)) != 0)
+			return (ret);
+
+		if (LF_ISSET(DB_MPOOL_DIRTY) && F_ISSET(dbmfp, MP_READONLY)) {
+			__db_err(dbenv,
+			    "%s: dirty flag set for readonly file page",
+			    __memp_fn(dbmfp));
+			return (EACCES);
+		}
+	}
+
+	R_LOCK(dbenv, dbmp->reginfo);
+
+	/* Decrement the pinned reference count. */
+	if (dbmfp->pinref == 0) {
+		__db_err(dbenv,
+		    "%s: more pages returned than retrieved", __memp_fn(dbmfp));
+		R_UNLOCK(dbenv, dbmp->reginfo);
+		return (EINVAL);
+	} else
+		--dbmfp->pinref;
+
+	/*
+	 * If we're mapping the file, there's nothing to do.  Because we can
+	 * stop mapping the file at any time, we have to check on each buffer
+	 * to see if the address we gave the application was part of the map
+	 * region.  The mapping covers [addr, addr + len), so the upper bound
+	 * is exclusive.
+	 */
+	if (dbmfp->addr != NULL && pgaddr >= dbmfp->addr &&
+	    (u_int8_t *)pgaddr < (u_int8_t *)dbmfp->addr + dbmfp->len) {
+		R_UNLOCK(dbenv, dbmp->reginfo);
+		return (0);
+	}
+
+	/* Convert the page address to a buffer header. */
+	bhp = (BH *)((u_int8_t *)pgaddr - SSZA(BH, buf));
+
+	/* Convert the buffer header to a cache. */
+	c_mp = BH_TO_CACHE(dbmp, bhp);
+
+/* UNLOCK THE REGION, LOCK THE CACHE. */
+
+	/*
+	 * Check for a reference count going to zero.  This can happen if the
+	 * application returns a page twice.  Do this before touching the
+	 * buffer's flags or the cache statistics, so that a bogus return
+	 * leaves them unchanged.
+	 */
+	if (bhp->ref == 0) {
+		__db_err(dbenv, "%s: page %lu: unpinned page returned",
+		    __memp_fn(dbmfp), (u_long)bhp->pgno);
+		R_UNLOCK(dbenv, dbmp->reginfo);
+		return (EINVAL);
+	}
+
+	/* Set/clear the page bits. */
+	if (LF_ISSET(DB_MPOOL_CLEAN) && F_ISSET(bhp, BH_DIRTY)) {
+		++c_mp->stat.st_page_clean;
+		--c_mp->stat.st_page_dirty;
+		F_CLR(bhp, BH_DIRTY);
+	}
+	if (LF_ISSET(DB_MPOOL_DIRTY) && !F_ISSET(bhp, BH_DIRTY)) {
+		--c_mp->stat.st_page_clean;
+		++c_mp->stat.st_page_dirty;
+		F_SET(bhp, BH_DIRTY);
+	}
+	if (LF_ISSET(DB_MPOOL_DISCARD))
+		F_SET(bhp, BH_DISCARD);
+
+	/*
+	 * If the page is dirty and being scheduled to be written as part of
+	 * a checkpoint, we no longer know that the log is up-to-date.
+	 */
+	if (F_ISSET(bhp, BH_DIRTY) && F_ISSET(bhp, BH_SYNC))
+		F_SET(bhp, BH_SYNC_LOGFLSH);
+
+	/*
+	 * If more than one reference to the page, we're done.  Ignore the
+	 * discard flags (for now) and leave it at its position in the LRU
+	 * chain.  The rest gets done at last reference close.
+	 */
+	if (--bhp->ref > 0) {
+		R_UNLOCK(dbenv, dbmp->reginfo);
+		return (0);
+	}
+
+	/*
+	 * Move the buffer to the head/tail of the LRU chain.  We do this
+	 * before writing the buffer for checkpoint purposes, as the write
+	 * can discard the region lock and allow another process to acquire
+	 * buffer.  We could keep that from happening, but there seems no
+	 * reason to do so.
+	 */
+	SH_TAILQ_REMOVE(&c_mp->bhq, bhp, q, __bh);
+	if (F_ISSET(bhp, BH_DISCARD))
+		SH_TAILQ_INSERT_HEAD(&c_mp->bhq, bhp, q, __bh);
+	else
+		SH_TAILQ_INSERT_TAIL(&c_mp->bhq, bhp, q);
+
+	/*
+	 * If this buffer is scheduled for writing because of a checkpoint, we
+	 * need to write it (if it's dirty), or update the checkpoint counters
+	 * (if it's not dirty).  If we try to write it and can't, that's not
+	 * necessarily an error as it's not completely unreasonable that the
+	 * application have permission to write the underlying file, but set a
+	 * flag so that the next time the memp_sync function is called we try
+	 * writing it there, as the checkpoint thread of control better be able
+	 * to write all of the files.
+	 */
+	if (F_ISSET(bhp, BH_SYNC)) {
+		if (F_ISSET(bhp, BH_DIRTY)) {
+			if (__memp_bhwrite(dbmp,
+			    dbmfp->mfp, bhp, NULL, &wrote) != 0 || !wrote)
+				F_SET(mp, MP_LSN_RETRY);
+		} else {
+			F_CLR(bhp, BH_SYNC);
+
+			--mp->lsn_cnt;
+			--dbmfp->mfp->lsn_cnt;
+		}
+	}
+
+	R_UNLOCK(dbenv, dbmp->reginfo);
+	return (0);
+}
diff --git a/bdb/mp/mp_fset.c b/bdb/mp/mp_fset.c
new file mode 100644
index 00000000000..08313c9b6f5
--- /dev/null
+++ b/bdb/mp/mp_fset.c
@@ -0,0 +1,98 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Sleepycat Software. All rights reserved.
+ */
+#include "db_config.h"
+
+#ifndef lint
+static const char revid[] = "$Id: mp_fset.c,v 11.13 2000/11/30 00:58:41 ubell Exp $";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#endif
+
+#ifdef HAVE_RPC
+#include "db_server.h"
+#endif
+
+#include "db_int.h"
+#include "db_shash.h"
+#include "mp.h"
+
+#ifdef HAVE_RPC
+#include "gen_client_ext.h"
+#include "rpc_client_ext.h"
+#endif
+
+/*
+ * memp_fset --
+ *	Mpool page set-flag routine.
+ *
+ *	Applies DB_MPOOL_CLEAN, DB_MPOOL_DIRTY and/or DB_MPOOL_DISCARD to
+ *	the buffer behind pgaddr without returning the page.  At least one
+ *	flag is required, and CLEAN and DIRTY are mutually exclusive.
+ *
+ *	Returns 0 on success, EACCES if DB_MPOOL_DIRTY is given for a
+ *	read-only handle, or the error from the flag checks.
+ */
+int
+memp_fset(dbmfp, pgaddr, flags)
+	DB_MPOOLFILE *dbmfp;
+	void *pgaddr;
+	u_int32_t flags;
+{
+	BH *bhp;
+	DB_ENV *dbenv;
+	DB_MPOOL *dbmp;
+	MPOOL *c_mp;
+	int ret;
+
+	dbmp = dbmfp->dbmp;
+	dbenv = dbmp->dbenv;
+
+#ifdef HAVE_RPC
+	if (F_ISSET(dbenv, DB_ENV_RPCCLIENT))
+		return (__dbcl_memp_fset(dbmfp, pgaddr, flags));
+#endif
+
+	PANIC_CHECK(dbenv);
+
+	/* Validate arguments. */
+	if (flags == 0)
+		return (__db_ferr(dbenv, "memp_fset", 1));
+
+	if ((ret = __db_fchk(dbenv, "memp_fset", flags,
+	    DB_MPOOL_DIRTY | DB_MPOOL_CLEAN | DB_MPOOL_DISCARD)) != 0)
+		return (ret);
+	if ((ret = __db_fcchk(dbenv, "memp_fset",
+	    flags, DB_MPOOL_CLEAN, DB_MPOOL_DIRTY)) != 0)
+		return (ret);
+
+	if (LF_ISSET(DB_MPOOL_DIRTY) && F_ISSET(dbmfp, MP_READONLY)) {
+		__db_err(dbenv, "%s: dirty flag set for readonly file page",
+		    __memp_fn(dbmfp));
+		return (EACCES);
+	}
+
+	R_LOCK(dbenv, dbmp->reginfo);
+
+	/*
+	 * A page handed out from the file's memory mapping has no buffer
+	 * header in front of it, so there are no flags to set.  This is the
+	 * same address-range test memp_fput() applies before it converts a
+	 * page address to a buffer header.
+	 */
+	if (dbmfp->addr != NULL && pgaddr >= dbmfp->addr &&
+	    (u_int8_t *)pgaddr < (u_int8_t *)dbmfp->addr + dbmfp->len) {
+		R_UNLOCK(dbenv, dbmp->reginfo);
+		return (0);
+	}
+
+	/* Convert the page address to a buffer header. */
+	bhp = (BH *)((u_int8_t *)pgaddr - SSZA(BH, buf));
+
+	/* Convert the buffer header to a cache. */
+	c_mp = BH_TO_CACHE(dbmp, bhp);
+
+	/* Keep the clean/dirty page counters in step with the flag. */
+	if (LF_ISSET(DB_MPOOL_CLEAN) && F_ISSET(bhp, BH_DIRTY)) {
+		++c_mp->stat.st_page_clean;
+		--c_mp->stat.st_page_dirty;
+		F_CLR(bhp, BH_DIRTY);
+	}
+	if (LF_ISSET(DB_MPOOL_DIRTY) && !F_ISSET(bhp, BH_DIRTY)) {
+		--c_mp->stat.st_page_clean;
+		++c_mp->stat.st_page_dirty;
+		F_SET(bhp, BH_DIRTY);
+	}
+	if (LF_ISSET(DB_MPOOL_DISCARD))
+		F_SET(bhp, BH_DISCARD);
+
+	R_UNLOCK(dbenv, dbmp->reginfo);
+	return (0);
+}
diff --git a/bdb/mp/mp_method.c b/bdb/mp/mp_method.c
new file mode 100644
index 00000000000..85a6239b032
--- /dev/null
+++ b/bdb/mp/mp_method.c
@@ -0,0 +1,115 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Sleepycat Software. All rights reserved.
+ */
+#include "db_config.h"
+
+#ifndef lint
+static const char revid[] = "$Id: mp_method.c,v 11.10 2000/04/04 20:12:04 bostic Exp $";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+#endif
+
+#ifdef HAVE_RPC
+#include "db_server.h"
+#endif
+
+#include "db_int.h"
+#include "db_shash.h"
+#include "mp.h"
+
+#ifdef HAVE_RPC
+#include "gen_client_ext.h"
+#include "rpc_client_ext.h"
+#endif
+
+static int __memp_set_cachesize __P((DB_ENV *, u_int32_t, u_int32_t, int));
+static int __memp_set_mp_mmapsize __P((DB_ENV *, size_t));
+
+/*
+ * __memp_dbenv_create --
+ *	Install the mpool defaults and configuration methods in a DB_ENV.
+ *
+ * PUBLIC: void __memp_dbenv_create __P((DB_ENV *));
+ */
+void
+__memp_dbenv_create(dbenv)
+	DB_ENV *dbenv;
+{
+	/*
+	 * The default is a single cache of 32 8K pages.  The size is
+	 * computed per page, rather than as a flat 256K, because the
+	 * memory needed to hold 32 pages varies between systems: HP-UX
+	 * with POSIX pthreads needs 88 bytes for a pthread mutex and
+	 * almost 200 bytes per buffer header, while Solaris needs 24 and
+	 * 52 bytes for the same structures.
+	 */
+	dbenv->mp_ncache = 1;
+	dbenv->mp_bytes = 32 * ((8 * 1024) + sizeof(BH));
+
+#ifdef HAVE_RPC
+	/* An RPC client gets the client versions of the methods. */
+	if (F_ISSET(dbenv, DB_ENV_RPCCLIENT)) {
+		dbenv->set_cachesize = __dbcl_env_cachesize;
+		dbenv->set_mp_mmapsize = __dbcl_set_mp_mmapsize;
+		return;
+	}
+#endif
+
+	dbenv->set_cachesize = __memp_set_cachesize;
+	dbenv->set_mp_mmapsize = __memp_set_mp_mmapsize;
+}
+
+/*
+ * __memp_set_cachesize --
+ *	Record the requested cache size and number of caches.
+ *
+ * The size is given as gbytes gigabytes plus bytes bytes; an ncache of
+ * 0 is treated as 1.  Always returns 0 once past the after-open check.
+ */
+static int
+__memp_set_cachesize(dbenv, gbytes, bytes, ncache)
+	DB_ENV *dbenv;
+	u_int32_t gbytes, bytes;
+	int ncache;
+{
+	ENV_ILLEGAL_AFTER_OPEN(dbenv, "set_cachesize");
+
+	/* Fold whole gigabytes out of the byte count. */
+	dbenv->mp_gbytes = gbytes + bytes / GIGABYTE;
+	dbenv->mp_bytes = bytes % GIGABYTE;
+
+	if (ncache == 0)
+		ncache = 1;
+	dbenv->mp_ncache = ncache;
+
+	/* Caches of a gigabyte or more are used exactly as requested. */
+	if (dbenv->mp_gbytes != 0)
+		return (0);
+
+	/*
+	 * Requests under 500Mb are grown by 25% to cover our overhead.
+	 * (The guess is that larger caches are specifically sized, i.e.,
+	 * it's a large server and the application knows how much memory
+	 * is available.)
+	 */
+	if (dbenv->mp_bytes < 500 * MEGABYTE)
+		dbenv->mp_bytes += dbenv->mp_bytes / 4;
+
+	/* There is a minimum cache size, regardless. */
+	if (dbenv->mp_bytes < DB_CACHESIZE_MIN)
+		dbenv->mp_bytes = DB_CACHESIZE_MIN;
+
+	return (0);
+}
+
+/*
+ * __memp_set_mp_mmapsize --
+ *	Record the maximum mapped file size in the environment handle.
+ *	Always returns 0.
+ */
+static int
+__memp_set_mp_mmapsize(dbenv, mp_mmapsize)
+	DB_ENV *dbenv;
+	size_t mp_mmapsize;
+{
+	dbenv->mp_mmapsize = mp_mmapsize;
+
+	return (0);
+}
diff --git a/bdb/mp/mp_region.c b/bdb/mp/mp_region.c
new file mode 100644
index 00000000000..4b85466ce63
--- /dev/null
+++ b/bdb/mp/mp_region.c
@@ -0,0 +1,357 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Sleepycat Software. All rights reserved.
+ */
+#include "db_config.h"
+
+#ifndef lint
+static const char revid[] = "$Id: mp_region.c,v 11.26 2000/11/30 00:58:41 ubell Exp $";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <string.h>
+#endif
+
+#include "db_int.h"
+#include "db_shash.h"
+#include "mp.h"
+
+static int __mpool_init __P((DB_ENV *, DB_MPOOL *, int, int));
+#ifdef MUTEX_SYSTEM_RESOURCES
+static size_t __mpool_region_maint __P((REGINFO *));
+#endif
+
+/*
+ * __memp_open --
+ *	Internal version of memp_open: only called from DB_ENV->open.
+ *
+ * Creates the per-process DB_MPOOL handle, joins or creates the first
+ * mpool region, then creates or joins the remaining cache regions and
+ * records their local addresses.  On success the handle is stored in
+ * dbenv->mp_handle and 0 is returned.  On failure every attached region
+ * is detached, the handle is freed and the error is returned; if this
+ * process created the first region the error is passed to __db_panic.
+ *
+ * PUBLIC: int __memp_open __P((DB_ENV *));
+ */
+int
+__memp_open(dbenv)
+	DB_ENV *dbenv;
+{
+	DB_MPOOL *dbmp;
+	MPOOL *mp;
+	REGINFO reginfo;
+	roff_t reg_size, *regids;
+	u_int32_t i;
+	int htab_buckets, ret;
+
+	/* Figure out how big each cache region is. */
+	reg_size = (dbenv->mp_gbytes / dbenv->mp_ncache) * GIGABYTE;
+	reg_size += ((dbenv->mp_gbytes %
+	    dbenv->mp_ncache) * GIGABYTE) / dbenv->mp_ncache;
+	reg_size += dbenv->mp_bytes / dbenv->mp_ncache;
+
+	/*
+	 * Figure out how many hash buckets each region will have.  Assume we
+	 * want to keep the hash chains with under 10 pages on each chain.  We
+	 * don't know the pagesize in advance, and it may differ for different
+	 * files.  Use a pagesize of 1K for the calculation -- we walk these
+	 * chains a lot, they must be kept short.
+	 */
+	htab_buckets = __db_tablesize((reg_size / (1 * 1024)) / 10);
+
+	/* Create and initialize the DB_MPOOL structure. */
+	if ((ret = __os_calloc(dbenv, 1, sizeof(*dbmp), &dbmp)) != 0)
+		return (ret);
+	LIST_INIT(&dbmp->dbregq);
+	TAILQ_INIT(&dbmp->dbmfq);
+	dbmp->dbenv = dbenv;
+
+	/* Join/create the first mpool region. */
+	memset(&reginfo, 0, sizeof(REGINFO));
+	reginfo.type = REGION_TYPE_MPOOL;
+	reginfo.id = INVALID_REGION_ID;
+	reginfo.mode = dbenv->db_mode;
+	reginfo.flags = REGION_JOIN_OK;
+	if (F_ISSET(dbenv, DB_ENV_CREATE))
+		F_SET(&reginfo, REGION_CREATE_OK);
+	if ((ret = __db_r_attach(dbenv, &reginfo, reg_size)) != 0)
+		goto err;
+
+	/*
+	 * If we created the region, initialize it.  Create or join any
+	 * additional regions.
+	 */
+	if (F_ISSET(&reginfo, REGION_CREATE)) {
+		/*
+		 * We define how many regions there are going to be, allocate
+		 * the REGINFO structures and create them.
+		 */
+		dbmp->nreg = dbenv->mp_ncache;
+		if ((ret = __os_calloc(dbenv,
+		    dbmp->nreg, sizeof(REGINFO), &dbmp->reginfo)) != 0)
+			goto err;
+		/* Make sure we don't clear the wrong entries on error. */
+		for (i = 0; i < dbmp->nreg; ++i)
+			dbmp->reginfo[i].id = INVALID_REGION_ID;
+		dbmp->reginfo[0] = reginfo;
+
+		/* Initialize the first region. */
+		if ((ret = __mpool_init(dbenv, dbmp, 0, htab_buckets)) != 0)
+			goto err;
+
+		/*
+		 * Create/initialize remaining regions and copy their IDs into
+		 * the first region.
+		 */
+		mp = R_ADDR(dbmp->reginfo, dbmp->reginfo[0].rp->primary);
+		regids = R_ADDR(dbmp->reginfo, mp->regids);
+		for (i = 1; i < dbmp->nreg; ++i) {
+			dbmp->reginfo[i].type = REGION_TYPE_MPOOL;
+			dbmp->reginfo[i].id = INVALID_REGION_ID;
+			dbmp->reginfo[i].mode = dbenv->db_mode;
+			dbmp->reginfo[i].flags = REGION_CREATE_OK;
+			if ((ret = __db_r_attach(
+			    dbenv, &dbmp->reginfo[i], reg_size)) != 0)
+				goto err;
+			if ((ret =
+			    __mpool_init(dbenv, dbmp, i, htab_buckets)) != 0)
+				goto err;
+			R_UNLOCK(dbenv, &dbmp->reginfo[i]);
+
+			regids[i] = dbmp->reginfo[i].id;
+		}
+	} else {
+		/*
+		 * Determine how many regions there are going to be, allocate
+		 * the REGINFO structures and fill in local copies of that
+		 * information.
+		 */
+		mp = R_ADDR(&reginfo, reginfo.rp->primary);
+		dbmp->nreg = mp->nreg;
+		if ((ret = __os_calloc(dbenv,
+		    dbmp->nreg, sizeof(REGINFO), &dbmp->reginfo)) != 0)
+			goto err;
+		/* Make sure we don't clear the wrong entries on error. */
+		for (i = 0; i < dbmp->nreg; ++i)
+			dbmp->reginfo[i].id = INVALID_REGION_ID;
+		dbmp->reginfo[0] = reginfo;
+
+		/* Join remaining regions. */
+		regids = R_ADDR(dbmp->reginfo, mp->regids);
+		for (i = 1; i < dbmp->nreg; ++i) {
+			dbmp->reginfo[i].type = REGION_TYPE_MPOOL;
+			dbmp->reginfo[i].id = regids[i];
+			dbmp->reginfo[i].mode = 0;
+			dbmp->reginfo[i].flags = REGION_JOIN_OK;
+			if ((ret = __db_r_attach(
+			    dbenv, &dbmp->reginfo[i], 0)) != 0)
+				goto err;
+			R_UNLOCK(dbenv, &dbmp->reginfo[i]);
+		}
+	}
+
+	/* Set the local addresses for the regions. */
+	for (i = 0; i < dbmp->nreg; ++i)
+		dbmp->reginfo[i].primary =
+		    R_ADDR(&dbmp->reginfo[i], dbmp->reginfo[i].rp->primary);
+
+	/* If the region is threaded, allocate a mutex to lock the handles. */
+	if (F_ISSET(dbenv, DB_ENV_THREAD)) {
+		if ((ret = __db_mutex_alloc(
+		    dbenv, dbmp->reginfo, &dbmp->mutexp)) != 0) {
+			goto err;
+		}
+		if ((ret =
+		    __db_mutex_init(dbenv, dbmp->mutexp, 0, MUTEX_THREAD)) != 0)
+			goto err;
+	}
+
+	R_UNLOCK(dbenv, dbmp->reginfo);
+
+	dbenv->mp_handle = dbmp;
+	return (0);
+
+err:	if (dbmp->reginfo != NULL && dbmp->reginfo[0].addr != NULL) {
+		if (F_ISSET(dbmp->reginfo, REGION_CREATE))
+			ret = __db_panic(dbenv, ret);
+
+		R_UNLOCK(dbenv, dbmp->reginfo);
+
+		/*
+		 * Discard the handle mutex before detaching: it was allocated
+		 * against dbmp->reginfo, so both the REGINFO array and the
+		 * first region must still be valid when it is freed.  The
+		 * mutex is only ever allocated after the REGINFO array has
+		 * been set up, so it cannot exist outside this branch.
+		 */
+		if (dbmp->mutexp != NULL)
+			__db_mutex_free(dbenv, dbmp->reginfo, dbmp->mutexp);
+
+		for (i = 0; i < dbmp->nreg; ++i)
+			if (dbmp->reginfo[i].id != INVALID_REGION_ID)
+				(void)__db_r_detach(
+				    dbenv, &dbmp->reginfo[i], 0);
+		__os_free(dbmp->reginfo,
+		    dbmp->nreg * sizeof(*dbmp->reginfo));
+	}
+	__os_free(dbmp, sizeof(*dbmp));
+	return (ret);
+}
+
+/*
+ * __mpool_init --
+ *	Initialize a MPOOL structure in shared memory.
+ *
+ * Allocates the MPOOL for region number reginfo_off from that region
+ * and initializes its buffer queue and a hash table of htab_buckets
+ * buckets.  For the first region (reginfo_off == 0) it also sets up the
+ * file queue, the sync mutex, the checkpoint LSN fields and the array
+ * of region IDs.
+ *
+ * Returns 0 on success.  On failure the MPOOL allocation is released
+ * and the error from the failing call is returned.
+ */
+static int
+__mpool_init(dbenv, dbmp, reginfo_off, htab_buckets)
+	DB_ENV *dbenv;
+	DB_MPOOL *dbmp;
+	int reginfo_off, htab_buckets;
+{
+	DB_HASHTAB *htab;
+	MPOOL *mp;
+	REGINFO *reginfo;
+#ifdef MUTEX_SYSTEM_RESOURCES
+	size_t maint_size;
+#endif
+	int ret;
+	void *p;
+
+	mp = NULL;
+
+	reginfo = &dbmp->reginfo[reginfo_off];
+	if ((ret = __db_shalloc(reginfo->addr,
+	    sizeof(MPOOL), MUTEX_ALIGN, &reginfo->primary)) != 0)
+		goto mem_err;
+	reginfo->rp->primary = R_OFFSET(reginfo, reginfo->primary);
+	mp = reginfo->primary;
+	memset(mp, 0, sizeof(*mp));
+
+#ifdef MUTEX_SYSTEM_RESOURCES
+	maint_size = __mpool_region_maint(reginfo);
+	/* Allocate room for the maintenance info and initialize it. */
+	if ((ret = __db_shalloc(reginfo->addr,
+	    sizeof(REGMAINT) + maint_size, 0, &p)) != 0)
+		goto mem_err;
+	__db_maintinit(reginfo, p, maint_size);
+	mp->maint_off = R_OFFSET(reginfo, p);
+#endif
+
+	if (reginfo_off == 0) {
+		SH_TAILQ_INIT(&mp->mpfq);
+
+		if ((ret = __db_shmutex_init(dbenv, &mp->sync_mutex,
+		    R_OFFSET(dbmp->reginfo, &mp->sync_mutex) +
+		    DB_FCNTL_OFF_MPOOL, 0, dbmp->reginfo,
+		    (REGMAINT *)R_ADDR(dbmp->reginfo, mp->maint_off))) != 0)
+			goto err;
+
+		ZERO_LSN(mp->lsn);
+		mp->lsn_cnt = 0;
+
+		/*
+		 * Allocate the region ID array.  __memp_open reads and
+		 * writes it through a roff_t pointer, so it is sized in
+		 * roff_t elements; sizing it in ints would be too small
+		 * wherever roff_t is wider than int.
+		 */
+		mp->nreg = dbmp->nreg;
+		if ((ret = __db_shalloc(dbmp->reginfo[0].addr,
+		    dbmp->nreg * sizeof(roff_t), 0, &p)) != 0)
+			goto mem_err;
+		mp->regids = R_OFFSET(dbmp->reginfo, p);
+	}
+
+	SH_TAILQ_INIT(&mp->bhq);
+
+	/* Allocate hash table space and initialize it. */
+	if ((ret = __db_shalloc(reginfo->addr,
+	    htab_buckets * sizeof(DB_HASHTAB), 0, &htab)) != 0)
+		goto mem_err;
+	__db_hashinit(htab, htab_buckets);
+	mp->htab = R_OFFSET(reginfo, htab);
+	mp->htab_buckets = htab_buckets;
+
+	return (0);
+
+mem_err:__db_err(dbenv, "Unable to allocate memory for mpool region");
+err:	if (reginfo->primary != NULL)
+		__db_shalloc_free(reginfo->addr, reginfo->primary);
+	return (ret);
+}
+
+/*
+ * __memp_close --
+ *	Internal version of memp_close: only called from DB_ENV->close.
+ *
+ * Tears down the per-process mpool handle: frees the registered
+ * pgin/pgout entries, closes every open DB_MPOOLFILE, discards the
+ * thread mutex and detaches from each cache region.  All steps are
+ * attempted; the first error seen is the one returned.
+ *
+ * PUBLIC: int __memp_close __P((DB_ENV *));
+ */
+int
+__memp_close(dbenv)
+	DB_ENV *dbenv;
+{
+	DB_MPOOL *dbmp;
+	DB_MPOOLFILE *dbmfp;
+	DB_MPREG *mpreg;
+	u_int32_t n;
+	int ret, t_ret;
+
+	dbmp = dbenv->mp_handle;
+	ret = 0;
+
+	/* Discard DB_MPREGs. */
+	for (mpreg = LIST_FIRST(&dbmp->dbregq);
+	    mpreg != NULL; mpreg = LIST_FIRST(&dbmp->dbregq)) {
+		LIST_REMOVE(mpreg, q);
+		__os_free(mpreg, sizeof(DB_MPREG));
+	}
+
+	/*
+	 * Discard DB_MPOOLFILEs.  The loop always re-reads the head of the
+	 * queue, so it depends on memp_fclose unlinking the handle.
+	 */
+	for (dbmfp = TAILQ_FIRST(&dbmp->dbmfq);
+	    dbmfp != NULL; dbmfp = TAILQ_FIRST(&dbmp->dbmfq)) {
+		t_ret = memp_fclose(dbmfp);
+		if (ret == 0 && t_ret != 0)
+			ret = t_ret;
+	}
+
+	/* Discard the thread mutex. */
+	if (dbmp->mutexp != NULL)
+		__db_mutex_free(dbenv, dbmp->reginfo, dbmp->mutexp);
+
+	/* Detach from the region(s). */
+	for (n = 0; n < dbmp->nreg; ++n) {
+		t_ret = __db_r_detach(dbenv, &dbmp->reginfo[n], 0);
+		if (ret == 0 && t_ret != 0)
+			ret = t_ret;
+	}
+
+	__os_free(dbmp->reginfo, dbmp->nreg * sizeof(*dbmp->reginfo));
+	__os_free(dbmp, sizeof(*dbmp));
+	dbenv->mp_handle = NULL;
+
+	return (ret);
+}
+
+#ifdef MUTEX_SYSTEM_RESOURCES
+/*
+ * __mpool_region_maint --
+ *	Return the number of bytes needed for a region's mutex
+ *	maintenance information.
+ */
+static size_t
+__mpool_region_maint(infop)
+	REGINFO *infop;
+{
+	int numlocks;
+
+	/*
+	 * Mutex maintenance needs one slot per possible page, so use the
+	 * maximum number of pages the cache can hold (the region size
+	 * divided by the smallest page size), plus one for an mpool mutex.
+	 */
+	numlocks = (infop->rp->size / DB_MIN_PGSIZE) + 1;
+
+	return (sizeof(roff_t) * numlocks);
+}
+#endif
+
+/*
+ * __mpool_region_destroy
+ *	Destroy any region maintenance info.  The dbenv argument is
+ *	not used.
+ *
+ * PUBLIC: void __mpool_region_destroy __P((DB_ENV *, REGINFO *));
+ */
+void
+__mpool_region_destroy(dbenv, infop)
+	DB_ENV *dbenv;
+	REGINFO *infop;
+{
+	MPOOL *mp;
+
+	COMPQUIET(dbenv, NULL);
+
+	/* Locate the region's MPOOL, then its maintenance area. */
+	mp = R_ADDR(infop, infop->rp->primary);
+	__db_shlocks_destroy(infop, (REGMAINT *)R_ADDR(infop, mp->maint_off));
+}
diff --git a/bdb/mp/mp_register.c b/bdb/mp/mp_register.c
new file mode 100644
index 00000000000..27859f69d7b
--- /dev/null
+++ b/bdb/mp/mp_register.c
@@ -0,0 +1,85 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Sleepycat Software. All rights reserved.
+ */
+#include "db_config.h"
+
+#ifndef lint
+static const char revid[] = "$Id: mp_register.c,v 11.12 2000/11/15 19:25:39 sue Exp $";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+#endif
+
+#ifdef HAVE_RPC
+#include "db_server.h"
+#endif
+
+#include "db_int.h"
+#include "db_shash.h"
+#include "mp.h"
+
+#ifdef HAVE_RPC
+#include "gen_client_ext.h"
+#include "rpc_client_ext.h"
+#endif
+
+/*
+ * memp_register --
+ *	Register a file type's pgin, pgout routines.
+ *
+ * If ftype is already registered its routines are replaced, otherwise a
+ * new entry is added at the head of the handle's list.  Returns 0 on
+ * success or the allocation error.
+ */
+int
+memp_register(dbenv, ftype, pgin, pgout)
+	DB_ENV *dbenv;
+	int ftype;
+	int (*pgin) __P((DB_ENV *, db_pgno_t, void *, DBT *));
+	int (*pgout) __P((DB_ENV *, db_pgno_t, void *, DBT *));
+{
+	DB_MPOOL *dbmp;
+	DB_MPREG *mpreg;
+	int ret;
+
+#ifdef HAVE_RPC
+	if (F_ISSET(dbenv, DB_ENV_RPCCLIENT))
+		return (__dbcl_memp_register(dbenv, ftype, pgin, pgout));
+#endif
+
+	PANIC_CHECK(dbenv);
+	ENV_REQUIRES_CONFIG(dbenv, dbenv->mp_handle, DB_INIT_MPOOL);
+
+	dbmp = dbenv->mp_handle;
+
+	/*
+	 * The DB access methods are the usual callers, so the type has
+	 * most likely been registered already.  In that case refresh the
+	 * existing entry, although it's probably unchanged.
+	 */
+	MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp);
+	mpreg = LIST_FIRST(&dbmp->dbregq);
+	while (mpreg != NULL && mpreg->ftype != ftype)
+		mpreg = LIST_NEXT(mpreg, q);
+	if (mpreg != NULL) {
+		mpreg->pgin = pgin;
+		mpreg->pgout = pgout;
+	}
+	MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
+
+	if (mpreg != NULL)
+		return (0);
+
+	/* Not found: build a new entry and link it in. */
+	if ((ret = __os_malloc(dbenv, sizeof(DB_MPREG), NULL, &mpreg)) != 0)
+		return (ret);
+
+	mpreg->pgout = pgout;
+	mpreg->pgin = pgin;
+	mpreg->ftype = ftype;
+
+	MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp);
+	LIST_INSERT_HEAD(&dbmp->dbregq, mpreg, q);
+	MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
+
+	return (0);
+}
diff --git a/bdb/mp/mp_stat.c b/bdb/mp/mp_stat.c
new file mode 100644
index 00000000000..7982513448d
--- /dev/null
+++ b/bdb/mp/mp_stat.c
@@ -0,0 +1,388 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Sleepycat Software. All rights reserved.
+ */
+#include "db_config.h"
+
+#ifndef lint
+static const char revid[] = "$Id: mp_stat.c,v 11.21 2001/01/09 16:59:30 bostic Exp $";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#endif
+
+#ifdef HAVE_RPC
+#include "db_server.h"
+#endif
+
+#include "db_int.h"
+#include "db_page.h"
+#include "db_shash.h"
+#include "db_am.h"
+#include "mp.h"
+
+#ifdef HAVE_RPC
+#include "gen_client_ext.h"
+#include "rpc_client_ext.h"
+#endif
+
+static void __memp_dumpcache
+ __P((DB_MPOOL *, REGINFO *, size_t *, FILE *, u_int32_t));
+static void __memp_pbh __P((DB_MPOOL *, BH *, size_t *, FILE *));
+
+/*
+ * memp_stat --
+ *	Display MPOOL statistics.
+ *
+ * If gspp is not NULL, a DB_MPOOL_STAT is allocated with __os_calloc,
+ * filled with totals accumulated over every cache and every file, and
+ * returned through it.
+ *
+ * If fspp is not NULL, a single block is allocated with __os_malloc
+ * (passing db_malloc) and returned through it.  The block holds a
+ * NULL-terminated array of DB_MPOOL_FSTAT pointers, followed by the
+ * structures themselves and then the file name strings.  The array is
+ * always allocated, even when there are no files, in which case it
+ * holds only the terminating NULL.
+ *
+ * Returns 0 on success or the error from a failed allocation.
+ */
+int
+memp_stat(dbenv, gspp, fspp, db_malloc)
+	DB_ENV *dbenv;
+	DB_MPOOL_STAT **gspp;
+	DB_MPOOL_FSTAT ***fspp;
+	void *(*db_malloc) __P((size_t));
+{
+	DB_MPOOL *dbmp;
+	DB_MPOOL_FSTAT **tfsp, *tstruct;
+	DB_MPOOL_STAT *sp;
+	MPOOL *c_mp, *mp;
+	MPOOLFILE *mfp;
+	char *tend, *tname;
+	size_t len, nlen;
+	u_int32_t i;
+	int ret;
+	char *name;
+
+#ifdef HAVE_RPC
+	if (F_ISSET(dbenv, DB_ENV_RPCCLIENT))
+		return (__dbcl_memp_stat(dbenv, gspp, fspp, db_malloc));
+#endif
+
+	PANIC_CHECK(dbenv);
+	ENV_REQUIRES_CONFIG(dbenv, dbenv->mp_handle, DB_INIT_MPOOL);
+
+	dbmp = dbenv->mp_handle;
+	sp = NULL;
+
+	/* Global statistics. */
+	mp = dbmp->reginfo[0].primary;
+	if (gspp != NULL) {
+		*gspp = NULL;
+
+		if ((ret = __os_calloc(dbenv, 1, sizeof(**gspp), gspp)) != 0)
+			return (ret);
+		sp = *gspp;
+
+		/*
+		 * Initialization and information that is not maintained on
+		 * a per-cache basis.
+		 */
+		sp->st_hash_longest = 0;
+		sp->st_region_wait = dbmp->reginfo[0].rp->mutex.mutex_set_wait;
+		sp->st_region_nowait =
+		    dbmp->reginfo[0].rp->mutex.mutex_set_nowait;
+		sp->st_gbytes = dbenv->mp_gbytes;
+		sp->st_bytes = dbenv->mp_bytes;
+		sp->st_ncache = dbmp->nreg;
+		sp->st_regsize = dbmp->reginfo[0].rp->size;
+
+		R_LOCK(dbenv, dbmp->reginfo);
+
+		/* Walk the cache list and accumulate the global information. */
+		for (i = 0; i < mp->nreg; ++i) {
+			c_mp = dbmp->reginfo[i].primary;
+			sp->st_cache_hit += c_mp->stat.st_cache_hit;
+			sp->st_cache_miss += c_mp->stat.st_cache_miss;
+			sp->st_map += c_mp->stat.st_map;
+			sp->st_page_create += c_mp->stat.st_page_create;
+			sp->st_page_in += c_mp->stat.st_page_in;
+			sp->st_page_out += c_mp->stat.st_page_out;
+			sp->st_ro_evict += c_mp->stat.st_ro_evict;
+			sp->st_rw_evict += c_mp->stat.st_rw_evict;
+			sp->st_hash_buckets += c_mp->stat.st_hash_buckets;
+			sp->st_hash_searches += c_mp->stat.st_hash_searches;
+			if (c_mp->stat.st_hash_longest > sp->st_hash_longest)
+				sp->st_hash_longest =
+				    c_mp->stat.st_hash_longest;
+			sp->st_hash_examined += c_mp->stat.st_hash_examined;
+			sp->st_page_clean += c_mp->stat.st_page_clean;
+			sp->st_page_dirty += c_mp->stat.st_page_dirty;
+			sp->st_page_trickle += c_mp->stat.st_page_trickle;
+			sp->st_region_wait += c_mp->stat.st_region_wait;
+			sp->st_region_nowait += c_mp->stat.st_region_nowait;
+		}
+
+		/*
+		 * We have duplicate statistics fields in the cache and
+		 * per-file structures.  The counters are only incremented
+		 * in the per-file structures, though.  The intent is that
+		 * if we ever flush files from the pool we can save their
+		 * last known totals in the cache structure.
+		 */
+		for (mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile);
+		    mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) {
+			sp->st_cache_hit += mfp->stat.st_cache_hit;
+			sp->st_cache_miss += mfp->stat.st_cache_miss;
+			sp->st_map += mfp->stat.st_map;
+			sp->st_page_create += mfp->stat.st_page_create;
+			sp->st_page_in += mfp->stat.st_page_in;
+			sp->st_page_out += mfp->stat.st_page_out;
+		}
+
+		R_UNLOCK(dbenv, dbmp->reginfo);
+	}
+
+	/* Per-file statistics. */
+	if (fspp != NULL) {
+		*fspp = NULL;
+
+		R_LOCK(dbenv, dbmp->reginfo);
+
+		/*
+		 * Count the MPOOLFILE structures and the space they need.
+		 * len always includes the trailing NULL slot, so it is
+		 * never zero.
+		 */
+		for (i = 0, len = 0,
+		    mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile);
+		    mfp != NULL;
+		    ++i, mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile))
+			len += sizeof(DB_MPOOL_FSTAT *) +
+			    sizeof(DB_MPOOL_FSTAT) +
+			    strlen(__memp_fns(dbmp, mfp)) + 1;
+		len += sizeof(DB_MPOOL_FSTAT *);	/* Trailing NULL */
+
+		R_UNLOCK(dbenv, dbmp->reginfo);
+
+		/* Allocate space */
+		if ((ret = __os_malloc(dbenv, len, db_malloc, fspp)) != 0)
+			return (ret);
+
+		R_LOCK(dbenv, dbmp->reginfo);
+
+		/*
+		 * Build each individual entry.  We assume that an array of
+		 * pointers are aligned correctly to be followed by an array
+		 * of structures, which should be safe (in this particular
+		 * case, the first element of the structure is a pointer, so
+		 * we're doubly safe).  The array is followed by space for
+		 * the text file names.
+		 *
+		 * Add 1 to i because we need to skip over the NULL.
+		 */
+		tfsp = *fspp;
+		tstruct = (DB_MPOOL_FSTAT *)(tfsp + i + 1);
+		tname = (char *)(tstruct + i);
+		tend = (char *)*fspp + len;
+
+		/*
+		 * The region lock was dropped while allocating, so the file
+		 * list may have changed since it was measured.  Never fill
+		 * more entries than were counted, and stop if a name no
+		 * longer fits in the space left for names.
+		 */
+		for (mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile);
+		    mfp != NULL && i > 0;
+		    --i, ++tfsp, ++tstruct, tname += nlen,
+		    mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) {
+			name = __memp_fns(dbmp, mfp);
+			nlen = strlen(name) + 1;
+			if (nlen > (size_t)(tend - tname))
+				break;
+			*tfsp = tstruct;
+			*tstruct = mfp->stat;
+			tstruct->file_name = tname;
+			memcpy(tname, name, nlen);
+		}
+		*tfsp = NULL;
+
+		R_UNLOCK(dbenv, dbmp->reginfo);
+	}
+	return (0);
+}
+
+#define FMAP_ENTRIES 200 /* Files we map. */
+
+#define MPOOL_DUMP_HASH 0x01 /* Debug hash chains. */
+#define MPOOL_DUMP_LRU 0x02 /* Debug LRU chains. */
+#define MPOOL_DUMP_MEM 0x04 /* Debug region memory. */
+#define MPOOL_DUMP_ALL 0x07 /* Debug all. */
+
+/*
+ * __memp_dump_region --
+ *	Display MPOOL structures.
+ *
+ * Each character of area selects something to dump for every cache:
+ * 'h' the hash chains, 'l' the LRU chains, 'm' the region memory and
+ * 'A' all three; other characters are ignored.  The shared and the
+ * per-process file lists are always printed.  Output goes to fp, or to
+ * stderr if fp is NULL.
+ *
+ * PUBLIC: void __memp_dump_region __P((DB_ENV *, char *, FILE *));
+ */
+void
+__memp_dump_region(dbenv, area, fp)
+	DB_ENV *dbenv;
+	char *area;
+	FILE *fp;
+{
+	DB_MPOOL *dbmp;
+	DB_MPOOLFILE *dbmfp;
+	MPOOL *mp;
+	MPOOLFILE *mfp;
+	size_t fmap[FMAP_ENTRIES + 1];
+	u_int32_t i, flags;
+	int cnt;
+	u_int8_t *p;
+
+	dbmp = dbenv->mp_handle;
+
+	/* Make it easy to call from the debugger. */
+	if (fp == NULL)
+		fp = stderr;
+
+	for (flags = 0; *area != '\0'; ++area)
+		switch (*area) {
+		case 'A':
+			LF_SET(MPOOL_DUMP_ALL);
+			break;
+		case 'h':
+			LF_SET(MPOOL_DUMP_HASH);
+			break;
+		case 'l':
+			LF_SET(MPOOL_DUMP_LRU);
+			break;
+		case 'm':
+			LF_SET(MPOOL_DUMP_MEM);
+			break;
+		}
+
+	R_LOCK(dbenv, dbmp->reginfo);
+
+	mp = dbmp->reginfo[0].primary;
+
+	/* Display MPOOL structures. */
+	(void)fprintf(fp, "%s\nPool (region addr 0x%lx)\n",
+	    DB_LINE, (u_long)dbmp->reginfo[0].addr);
+
+	/*
+	 * Display the MPOOLFILE structures.  fmap records the region
+	 * offset of each file in display order, so that buffers can later
+	 * be labelled with the file number printed here.
+	 */
+	cnt = 0;
+	for (mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile);
+	    mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile), ++cnt) {
+		(void)fprintf(fp, "File #%d: %s: type %ld, %s\n\t [UID: ",
+		    cnt + 1, __memp_fns(dbmp, mfp), (long)mfp->ftype,
+		    F_ISSET(mfp, MP_CAN_MMAP) ? "mmap" : "read/write");
+		p = R_ADDR(dbmp->reginfo, mfp->fileid_off);
+		for (i = 0; i < DB_FILE_ID_LEN; ++i) {
+			(void)fprintf(fp, "%x", *p++);
+			if (i < DB_FILE_ID_LEN - 1)
+				(void)fprintf(fp, " ");
+		}
+		(void)fprintf(fp, "]\n");
+		if (cnt < FMAP_ENTRIES)
+			fmap[cnt] = R_OFFSET(dbmp->reginfo, mfp);
+	}
+
+	/*
+	 * Display the per-process handles.  Map each one to the MPOOLFILE
+	 * behind the handle: the iterator left over from the loop above
+	 * is always NULL by this point and must not be used.
+	 */
+	for (dbmfp = TAILQ_FIRST(&dbmp->dbmfq);
+	    dbmfp != NULL; dbmfp = TAILQ_NEXT(dbmfp, q), ++cnt) {
+		(void)fprintf(fp, "File #%d: %s: per-process, %s\n",
+		    cnt + 1, __memp_fn(dbmfp),
+		    F_ISSET(dbmfp, MP_READONLY) ? "readonly" : "read/write");
+		if (cnt < FMAP_ENTRIES)
+			fmap[cnt] = R_OFFSET(dbmp->reginfo, dbmfp->mfp);
+	}
+
+	/* Terminate the map, truncating it if there were too many files. */
+	if (cnt < FMAP_ENTRIES)
+		fmap[cnt] = INVALID_ROFF;
+	else
+		fmap[FMAP_ENTRIES] = INVALID_ROFF;
+
+	/* Dump the memory pools. */
+	for (i = 0; i < mp->nreg; ++i) {
+		(void)fprintf(fp, "%s\nCache #%d:\n", DB_LINE, i + 1);
+		__memp_dumpcache(dbmp, &dbmp->reginfo[i], fmap, fp, flags);
+	}
+
+	R_UNLOCK(dbenv, dbmp->reginfo);
+
+	/* Flush in case we're debugging. */
+	(void)fflush(fp);
+}
+
+/*
+ * __memp_dumpcache --
+ *	Display the contents of a single cache.
+ *
+ * Depending on flags, prints the buffers on each non-empty hash chain,
+ * the buffers on the LRU list, and a dump of the region's memory.
+ */
+static void
+__memp_dumpcache(dbmp, reginfo, fmap, fp, flags)
+	DB_MPOOL *dbmp;
+	REGINFO *reginfo;
+	size_t *fmap;
+	FILE *fp;
+	u_int32_t flags;
+{
+	BH *bhp;
+	DB_HASHTAB *dbht;
+	MPOOL *c_mp;
+	int bucket;
+
+	c_mp = reginfo->primary;
+
+	/* Display the hash table list of BH's. */
+	if (LF_ISSET(MPOOL_DUMP_HASH)) {
+		(void)fprintf(fp,
+	    "%s\nBH hash table (%lu hash slots)\npageno, file, ref, address\n",
+		    DB_LINE, (u_long)c_mp->htab_buckets);
+
+		dbht = R_ADDR(reginfo, c_mp->htab);
+		for (bucket = 0;
+		    bucket < c_mp->htab_buckets; ++bucket, ++dbht) {
+			/* Empty chains are not listed at all. */
+			bhp = SH_TAILQ_FIRST(dbht, __bh);
+			if (bhp == NULL)
+				continue;
+
+			(void)fprintf(fp, "%lu:\n", (u_long)bucket);
+			for (; bhp != NULL;
+			    bhp = SH_TAILQ_NEXT(bhp, hq, __bh))
+				__memp_pbh(dbmp, bhp, fmap, fp);
+		}
+	}
+
+	/* Display the LRU list of BH's. */
+	if (LF_ISSET(MPOOL_DUMP_LRU)) {
+		(void)fprintf(fp, "%s\nBH LRU list\n", DB_LINE);
+		(void)fprintf(fp, "pageno, file, ref, address\n");
+
+		bhp = SH_TAILQ_FIRST(&c_mp->bhq, __bh);
+		while (bhp != NULL) {
+			__memp_pbh(dbmp, bhp, fmap, fp);
+			bhp = SH_TAILQ_NEXT(bhp, q, __bh);
+		}
+	}
+
+	/* Dump the memory pool. */
+	if (LF_ISSET(MPOOL_DUMP_MEM))
+		__db_shalloc_dump(reginfo->addr, fp);
+}
+
+/*
+ * __memp_pbh --
+ *	Display a BH structure.
+ *
+ * Prints the page number, owning file, reference count, region offset
+ * and flags of a buffer header on one line.  The file is shown as its
+ * number ("#N") when its offset is found in fmap, and as the raw
+ * offset otherwise.
+ */
+static void
+__memp_pbh(dbmp, bhp, fmap, fp)
+	DB_MPOOL *dbmp;
+	BH *bhp;
+	size_t *fmap;
+	FILE *fp;
+{
+	static const FN fn[] = {
+		{ BH_CALLPGIN,		"callpgin" },
+		{ BH_DIRTY,		"dirty" },
+		{ BH_DISCARD,		"discard" },
+		{ BH_LOCKED,		"locked" },
+		{ BH_SYNC,		"sync" },
+		{ BH_SYNC_LOGFLSH,	"sync:logflush" },
+		{ BH_TRASH,		"trash" },
+		{ 0,			NULL }
+	};
+	int i;
+
+	/* Scan the map until the file or the end marker is reached. */
+	i = 0;
+	while (i < FMAP_ENTRIES &&
+	    fmap[i] != INVALID_ROFF && fmap[i] != bhp->mf_offset)
+		++i;
+
+	if (fmap[i] != INVALID_ROFF)
+		(void)fprintf(fp, " %4lu, #%d, %2lu, %lu",
+		    (u_long)bhp->pgno, i + 1,
+		    (u_long)bhp->ref, (u_long)R_OFFSET(dbmp->reginfo, bhp));
+	else
+		(void)fprintf(fp, " %4lu, %lu, %2lu, %lu",
+		    (u_long)bhp->pgno, (u_long)bhp->mf_offset,
+		    (u_long)bhp->ref, (u_long)R_OFFSET(dbmp->reginfo, bhp));
+
+	__db_prflags(bhp->flags, fn, fp);
+
+	(void)fprintf(fp, "\n");
+}
diff --git a/bdb/mp/mp_sync.c b/bdb/mp/mp_sync.c
new file mode 100644
index 00000000000..1b0751db709
--- /dev/null
+++ b/bdb/mp/mp_sync.c
@@ -0,0 +1,658 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Sleepycat Software. All rights reserved.
+ */
+#include "db_config.h"
+
+#ifndef lint
+static const char revid[] = "$Id: mp_sync.c,v 11.29 2001/01/11 18:19:53 bostic Exp $";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <stdlib.h>
+#endif
+
+#ifdef HAVE_RPC
+#include "db_server.h"
+#endif
+
+#include "db_int.h"
+#include "db_shash.h"
+#include "mp.h"
+
+#ifdef HAVE_RPC
+#include "gen_client_ext.h"
+#include "rpc_client_ext.h"
+#endif
+
+static int __bhcmp __P((const void *, const void *));
+static int __memp_fsync __P((DB_MPOOLFILE *));
+static int __memp_sballoc __P((DB_ENV *, BH ***, u_int32_t *));
+
+/*
+ * memp_sync --
+ *	Mpool sync function.
+ *
+ * Marks every dirty or pinned buffer as belonging to a checkpoint, writes
+ * the buffers nobody else is using, and reports whether any remain.
+ *
+ * dbenv: environment handle (checked with ENV_REQUIRES_CONFIG for
+ *	  DB_INIT_MPOOL).
+ * lsnp:  checkpoint LSN, or NULL to flush the cache without one.  If the
+ *	  call is answered from an earlier, completed checkpoint, *lsnp is
+ *	  overwritten with the LSN recorded for that checkpoint.
+ *
+ * Returns 0 if no buffers are waiting to be written, DB_INCOMPLETE if some
+ * are, EINVAL if an LSN is supplied but logging is not on, EPERM if a
+ * page could not be flushed, or the error returned by a called routine.
+ *
+ * Locking: mp->sync_mutex is held for the whole call.  The region lock is
+ * dropped while sorting the buffer list and flushing the log, and every
+ * path that reaches the "done" label must hold it again.
+ */
+int
+memp_sync(dbenv, lsnp)
+	DB_ENV *dbenv;
+	DB_LSN *lsnp;
+{
+	BH *bhp, **bharray;
+	DB_MPOOL *dbmp;
+	DB_LSN tlsn;
+	MPOOL *c_mp, *mp;
+	MPOOLFILE *mfp;
+	u_int32_t ar_cnt, i, ndirty;
+	int ret, retry_done, retry_need, wrote;
+
+#ifdef HAVE_RPC
+	if (F_ISSET(dbenv, DB_ENV_RPCCLIENT))
+		return (__dbcl_memp_sync(dbenv, lsnp));
+#endif
+
+	PANIC_CHECK(dbenv);
+	ENV_REQUIRES_CONFIG(dbenv, dbenv->mp_handle, DB_INIT_MPOOL);
+
+	dbmp = dbenv->mp_handle;
+	mp = dbmp->reginfo[0].primary;
+
+	/*
+	 * If no LSN is provided, flush the entire cache.
+	 *
+	 * !!!
+	 * Our current behavior is to flush the entire cache, so there's
+	 * nothing special we have to do here other than deal with NULL
+	 * pointers.
+	 */
+	if (lsnp == NULL) {
+		ZERO_LSN(tlsn);
+		lsnp = &tlsn;
+		F_SET(mp, MP_LSN_RETRY);
+	} else if (!LOGGING_ON(dbenv)) {
+		__db_err(dbenv, "memp_sync: requires logging");
+		return (EINVAL);
+	}
+
+	/*
+	 * Sync calls are single-threaded so that we don't have multiple
+	 * threads, with different checkpoint LSNs, walking the caches
+	 * and updating the checkpoint LSNs and how many buffers remain
+	 * to be written for the checkpoint.  This shouldn't be a problem,
+	 * any application that has multiple checkpoint threads isn't what
+	 * I'd call trustworthy.
+	 */
+	MUTEX_LOCK(dbenv, &mp->sync_mutex, dbenv->lockfhp);
+
+	/*
+	 * If the application is asking about a previous call to memp_sync(),
+	 * and we haven't found any buffers that the application holding the
+	 * pin couldn't write, return yes or no based on the current count.
+	 * Note, if the application is asking about a LSN *smaller* than one
+	 * we've already handled or are currently handling, then we return a
+	 * result based on the count for the larger LSN.
+	 */
+	R_LOCK(dbenv, dbmp->reginfo);
+	if (!IS_ZERO_LSN(*lsnp) &&
+	    !F_ISSET(mp, MP_LSN_RETRY) && log_compare(lsnp, &mp->lsn) <= 0) {
+		if (mp->lsn_cnt == 0) {
+			*lsnp = mp->lsn;
+			ret = 0;
+		} else
+			ret = DB_INCOMPLETE;
+
+		R_UNLOCK(dbenv, dbmp->reginfo);
+		MUTEX_UNLOCK(dbenv, &mp->sync_mutex);
+		return (ret);
+	}
+
+	/*
+	 * Allocate room for a list of buffers, and decide how many buffers
+	 * we can pin down.
+	 *
+	 * !!!
+	 * Note: __memp_sballoc has released the region lock if we're not
+	 * continuing forward.
+	 */
+	if ((ret =
+	    __memp_sballoc(dbenv, &bharray, &ndirty)) != 0 || ndirty == 0) {
+		MUTEX_UNLOCK(dbenv, &mp->sync_mutex);
+		return (ret);
+	}
+
+	retry_done = 0;
+retry:	retry_need = 0;
+	/*
+	 * Start a new checkpoint.
+	 *
+	 * Save the LSN.  We know that it's a new LSN, a retry, or larger than
+	 * the one for which we were already doing a checkpoint.  (BTW, I don't
+	 * expect to see multiple LSN's from the same or multiple processes,
+	 * but You Just Never Know.  Responding as if they all called with the
+	 * largest of the LSNs specified makes everything work.)
+	 *
+	 * We don't currently use the LSN we save.  We could potentially save
+	 * the last-written LSN in each buffer header and use it to determine
+	 * what buffers need to be written.  The problem with this is that it's
+	 * sizeof(LSN) more bytes of buffer header.  We currently write all the
+	 * dirty buffers instead, but with a sufficiently large cache that's
+	 * going to be a problem.
+	 */
+	mp->lsn = *lsnp;
+
+	/*
+	 * Clear the global count of buffers waiting to be written, walk the
+	 * list of files clearing the count of buffers waiting to be written.
+	 *
+	 * Clear the retry flag.
+	 */
+	mp->lsn_cnt = 0;
+	for (mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile);
+	    mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile))
+		mfp->lsn_cnt = 0;
+	F_CLR(mp, MP_LSN_RETRY);
+
+	/*
+	 * Walk each cache's list of buffers and mark all dirty buffers to be
+	 * written and all pinned buffers to be potentially written (we can't
+	 * know if they'll need to be written until the holder returns them to
+	 * the cache).  We do this in one pass while holding the region locked
+	 * so that processes can't make new buffers dirty, causing us to never
+	 * finish.  Since the application may have restarted the sync using a
+	 * different LSN value, clear any BH_SYNC | BH_SYNC_LOGFLSH flags that
+	 * appear leftover from previous calls.
+	 *
+	 * Keep a count of the total number of buffers we need to write in
+	 * MPOOL->lsn_cnt, and for each file, in MPOOLFILE->lsn_cnt.
+	 */
+	for (ar_cnt = 0, i = 0; i < mp->nreg; ++i) {
+		c_mp = dbmp->reginfo[i].primary;
+		for (bhp = SH_TAILQ_FIRST(&c_mp->bhq, __bh);
+		    bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh)) {
+			if (F_ISSET(bhp, BH_DIRTY) || bhp->ref != 0) {
+				F_SET(bhp, BH_SYNC);
+
+				++mp->lsn_cnt;
+
+				mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
+				++mfp->lsn_cnt;
+
+				/*
+				 * If the buffer isn't being used, we can write
+				 * it immediately, so increment its reference
+				 * count to lock it down, and save a reference
+				 * to it.
+				 *
+				 * If we've run out of space to store buffer
+				 * refs, we're screwed.  We don't want to
+				 * realloc the array while holding a region
+				 * lock, so we set a flag and deal with it
+				 * later.
+				 */
+				if (bhp->ref == 0) {
+					++bhp->ref;
+					bharray[ar_cnt] = bhp;
+
+					if (++ar_cnt >= ndirty) {
+						retry_need = 1;
+						break;
+					}
+				}
+			} else
+				if (F_ISSET(bhp, BH_SYNC))
+					F_CLR(bhp, BH_SYNC | BH_SYNC_LOGFLSH);
+		}
+		if (ar_cnt >= ndirty)
+			break;
+	}
+
+	/* If there are no buffers we can write immediately, we're done. */
+	if (ar_cnt == 0) {
+		ret = mp->lsn_cnt ? DB_INCOMPLETE : 0;
+		goto done;
+	}
+
+	R_UNLOCK(dbenv, dbmp->reginfo);
+
+	/*
+	 * Sort the buffers we're going to write immediately.
+	 *
+	 * We try and write the buffers in file/page order: it should reduce
+	 * seeks by the underlying filesystem and possibly reduce the actual
+	 * number of writes.
+	 */
+	if (ar_cnt > 1)
+		qsort(bharray, ar_cnt, sizeof(BH *), __bhcmp);
+
+	/*
+	 * Flush the log.  We have to ensure the log records reflecting the
+	 * changes on the database pages we're writing have already made it
+	 * to disk.  We usually do that as we write each page, but if we
+	 * are going to write a large number of pages, repeatedly acquiring
+	 * the log region lock is going to be expensive.  Flush the entire
+	 * log now, so that sync doesn't require any more log flushes.
+	 */
+	if (LOGGING_ON(dbenv) && (ret = log_flush(dbenv, NULL)) != 0) {
+		/*
+		 * The region lock was dropped above and the "done" label
+		 * unlocks it, so take it back before unwinding.  Treat the
+		 * failure the same way as a failed page write below: forget
+		 * this checkpoint and release every buffer we pinned, so
+		 * none of them is left holding our reference.
+		 */
+		R_LOCK(dbenv, dbmp->reginfo);
+
+		ZERO_LSN(mp->lsn);
+		F_SET(mp, MP_LSN_RETRY);
+
+		for (i = 0; i < ar_cnt; ++i) {
+			bhp = bharray[i];
+			--bhp->ref;
+			F_CLR(bhp, BH_SYNC | BH_SYNC_LOGFLSH);
+		}
+
+		goto done;
+	}
+
+	R_LOCK(dbenv, dbmp->reginfo);
+
+	/* Walk the array, writing buffers. */
+	for (i = 0; i < ar_cnt; ++i) {
+		/*
+		 * It's possible for a thread to have gotten the buffer since
+		 * we listed it for writing.  If the reference count is still
+		 * 1, we're the only ones using the buffer, go ahead and write.
+		 * If it's >1, then skip the buffer and assume that it will be
+		 * written when it's returned to the cache.
+		 */
+		if (bharray[i]->ref > 1) {
+			--bharray[i]->ref;
+			continue;
+		}
+
+		/* Write the buffer. */
+		mfp = R_ADDR(dbmp->reginfo, bharray[i]->mf_offset);
+		ret = __memp_bhwrite(dbmp, mfp, bharray[i], NULL, &wrote);
+
+		/* Release the buffer. */
+		--bharray[i]->ref;
+
+		if (ret == 0 && wrote)
+			continue;
+
+		/*
+		 * Any process syncing the shared memory buffer pool had best
+		 * be able to write to any underlying file.  Be understanding,
+		 * but firm, on this point.
+		 */
+		if (ret == 0) {
+			__db_err(dbenv, "%s: unable to flush page: %lu",
+			    __memp_fns(dbmp, mfp), (u_long)bharray[i]->pgno);
+			ret = EPERM;
+		}
+
+		/*
+		 * On error, clear MPOOL->lsn and set MP_LSN_RETRY so that no
+		 * future checkpoint return can depend on this failure.  Clear
+		 * the buffer's BH_SYNC flag, because it's used to determine
+		 * if lsn_cnt values are incremented/decremented.  Don't bother
+		 * to reset/clear:
+		 *
+		 *	MPOOL->lsn_cnt
+		 *	MPOOLFILE->lsn_cnt
+		 *
+		 * they don't make any difference.
+		 */
+		ZERO_LSN(mp->lsn);
+		F_SET(mp, MP_LSN_RETRY);
+
+		/* Release any buffers we're still pinning down. */
+		while (++i < ar_cnt) {
+			bhp = bharray[i];
+			--bhp->ref;
+			F_CLR(bhp, BH_SYNC | BH_SYNC_LOGFLSH);
+		}
+
+		goto done;
+	}
+
+	ret = mp->lsn_cnt != 0 ? DB_INCOMPLETE : 0;
+
+	/*
+	 * If there were too many buffers and we're not returning an error, we
+	 * re-try the checkpoint once -- since we allocated 80% of the total
+	 * buffer count, once should be enough.  If it still doesn't work, some
+	 * other thread of control is dirtying buffers as fast as we're writing
+	 * them, and we might as well give up for now.  In the latter case, set
+	 * the global retry flag, we'll have to start from scratch on the next
+	 * checkpoint.
+	 */
+	if (retry_need) {
+		if (retry_done) {
+			ret = DB_INCOMPLETE;
+			F_SET(mp, MP_LSN_RETRY);
+		} else {
+			retry_done = 1;
+			goto retry;
+		}
+	}
+
+	/* Every jump to "done" arrives with the region lock held. */
+done:	R_UNLOCK(dbenv, dbmp->reginfo);
+	MUTEX_UNLOCK(dbenv, &mp->sync_mutex);
+
+	__os_free(bharray, ndirty * sizeof(BH *));
+
+	return (ret);
+}
+
+/*
+ * memp_fsync --
+ *	Mpool file sync function.
+ *
+ * Public entry point: hands the work to __memp_fsync() unless the handle
+ * is flagged MP_READONLY or its underlying file is flagged MP_TEMP, in
+ * which case it returns 0 without doing anything.
+ */
+int
+memp_fsync(dbmfp)
+	DB_MPOOLFILE *dbmfp;
+{
+	DB_ENV *dbenv;
+	DB_MPOOL *dbmp;
+	int nothing_to_do;
+
+	dbmp = dbmfp->dbmp;
+	dbenv = dbmp->dbenv;
+
+#ifdef HAVE_RPC
+	if (F_ISSET(dbenv, DB_ENV_RPCCLIENT))
+		return (__dbcl_memp_fsync(dbmfp));
+#endif
+
+	PANIC_CHECK(dbenv);
+
+	/*
+	 * A handle that isn't open for writing can't flush anything, and a
+	 * temporary file has no reason to be flushed.  The MP_TEMP flag is
+	 * read from the shared MPOOLFILE, so look at it under the region
+	 * lock.
+	 */
+	nothing_to_do = F_ISSET(dbmfp, MP_READONLY) != 0;
+	if (!nothing_to_do) {
+		R_LOCK(dbenv, dbmp->reginfo);
+		nothing_to_do = F_ISSET(dbmfp->mfp, MP_TEMP) != 0;
+		R_UNLOCK(dbenv, dbmp->reginfo);
+	}
+
+	return (nothing_to_do ? 0 : __memp_fsync(dbmfp));
+}
+
+/*
+ * __mp_xxx_fh --
+ *	Return a file descriptor for DB 1.85 compatibility locking.
+ *
+ * Stores the address of the handle's DB_FH in *fhp.  Returns 0 if that
+ * file handle is already flagged DB_FH_VALID, otherwise the result of
+ * __memp_fsync() on the handle.
+ *
+ * PUBLIC: int __mp_xxx_fh __P((DB_MPOOLFILE *, DB_FH **));
+ */
+int
+__mp_xxx_fh(dbmfp, fhp)
+	DB_MPOOLFILE *dbmfp;
+	DB_FH **fhp;
+{
+	/*
+	 * This is a truly spectacular layering violation, intended ONLY to
+	 * support compatibility for the DB 1.85 DB->fd call.
+	 *
+	 * Sync the database file to disk, creating the file as necessary.
+	 *
+	 * The MP_READONLY and MP_TEMP tests made by memp_fsync(3) are
+	 * deliberately skipped.  MP_READONLY isn't interesting: either we
+	 * already have a file descriptor (the database file was opened for
+	 * reading) or we aren't readonly (we created the database, which
+	 * requires write privileges).  MP_TEMP isn't interesting: we want to
+	 * write to the backing file regardless, so that there is a file
+	 * descriptor to return.
+	 */
+	*fhp = &dbmfp->fh;
+
+	if (F_ISSET(&dbmfp->fh, DB_FH_VALID))
+		return (0);
+
+	return (__memp_fsync(dbmfp));
+}
+
+/*
+ * __memp_fsync --
+ *	Mpool file internal sync function.
+ *
+ * Writes the dirty, unpinned buffers belonging to dbmfp's file and then
+ * syncs the file handle.
+ *
+ * Returns 0 on success, DB_INCOMPLETE if some dirty buffer of the file
+ * could not be written (it was pinned, locked, not written by
+ * __memp_pgwrite, or there were still too many buffers after one retry),
+ * or the error returned by a called routine.
+ *
+ * Locking: takes the region lock itself; it is dropped while sorting the
+ * buffer list and released before the final __os_fsync call.
+ */
+static int
+__memp_fsync(dbmfp)
+	DB_MPOOLFILE *dbmfp;
+{
+	BH *bhp, **bharray;
+	DB_ENV *dbenv;
+	DB_MPOOL *dbmp;
+	MPOOL *c_mp, *mp;
+	size_t mf_offset;
+	u_int32_t ar_cnt, i, ndirty;
+	int incomplete, ret, retry_done, retry_need, wrote;
+
+	dbmp = dbmfp->dbmp;
+	dbenv = dbmp->dbenv;
+	mp = dbmp->reginfo[0].primary;
+
+	R_LOCK(dbenv, dbmp->reginfo);
+
+	/*
+	 * Allocate room for a list of buffers, and decide how many buffers
+	 * we can pin down.
+	 *
+	 * !!!
+	 * Note: __memp_sballoc has released our region lock if we're not
+	 * continuing forward.
+	 */
+	if ((ret =
+	    __memp_sballoc(dbenv, &bharray, &ndirty)) != 0 || ndirty == 0)
+		return (ret);
+
+	retry_done = 0;
+retry:	retry_need = 0;
+	/*
+	 * Walk each cache's list of buffers and mark all dirty buffers to be
+	 * written and all pinned buffers to be potentially written (we can't
+	 * know if they'll need to be written until the holder returns them to
+	 * the cache).  We do this in one pass while holding the region locked
+	 * so that processes can't make new buffers dirty, causing us to never
+	 * finish.
+	 */
+	mf_offset = R_OFFSET(dbmp->reginfo, dbmfp->mfp);
+	for (ar_cnt = 0, incomplete = 0, i = 0; i < mp->nreg; ++i) {
+		c_mp = dbmp->reginfo[i].primary;
+		for (bhp = SH_TAILQ_FIRST(&c_mp->bhq, __bh);
+		    bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh)) {
+			/* Only dirty buffers of this file are interesting. */
+			if (!F_ISSET(bhp, BH_DIRTY) ||
+			    bhp->mf_offset != mf_offset)
+				continue;
+			/* In use or mid-I/O: we can't write it ourselves. */
+			if (bhp->ref != 0 || F_ISSET(bhp, BH_LOCKED)) {
+				incomplete = 1;
+				continue;
+			}
+
+			/*
+			 * If the buffer isn't being used, we can write
+			 * it immediately, so increment its reference
+			 * count to lock it down, and save a reference
+			 * to it.
+			 *
+			 * If we've run out of space to store buffer refs,
+			 * we're screwed.  We don't want to realloc the
+			 * array while holding a region lock, so we set
+			 * a flag and deal with it later.
+			 */
+			++bhp->ref;
+			bharray[ar_cnt] = bhp;
+			if (++ar_cnt >= ndirty) {
+				retry_need = 1;
+				break;
+			}
+		}
+		if (ar_cnt >= ndirty)
+			break;
+	}
+
+	/* If there are no buffers we can write immediately, we're done. */
+	if (ar_cnt == 0) {
+		ret = 0;
+		goto done;
+	}
+
+	R_UNLOCK(dbenv, dbmp->reginfo);
+
+	/* Sort the buffers we're going to write. */
+	if (ar_cnt > 1)
+		qsort(bharray, ar_cnt, sizeof(BH *), __bhcmp);
+
+	R_LOCK(dbenv, dbmp->reginfo);
+
+	/* Walk the array, writing buffers. */
+	for (i = 0; i < ar_cnt;) {
+		/*
+		 * It's possible for a thread to have gotten the buffer since
+		 * we listed it for writing.  If the reference count is still
+		 * 1, we're the only ones using the buffer, go ahead and write.
+		 * If it's >1, then skip the buffer and assume that it will be
+		 * written when it's returned to the cache.
+		 */
+		if (bharray[i]->ref > 1) {
+			incomplete = 1;
+			--bharray[i++]->ref;
+			continue;
+		}
+
+		/* Write the buffer. */
+		ret = __memp_pgwrite(dbmp, dbmfp, bharray[i], NULL, &wrote);
+
+		/* Release the buffer. */
+		--bharray[i++]->ref;
+
+		if (ret == 0) {
+			if (!wrote)
+				incomplete = 1;
+			continue;
+		}
+
+		/*
+		 * On error:
+		 *
+		 * Release any buffers we're still pinning down.
+		 */
+		while (i < ar_cnt)
+			--bharray[i++]->ref;
+		break;
+	}
+
+	/*
+	 * If there were too many buffers and we're not returning an error, we
+	 * re-try the flush once -- since we allocated 80% of the total
+	 * buffer count, once should be enough.  If it still doesn't work, some
+	 * other thread of control is dirtying buffers as fast as we're writing
+	 * them, and we might as well give up.
+	 *
+	 * The retry is skipped after a write error: a second pass would
+	 * overwrite ret and the caller would never see the failure.
+	 */
+	if (ret == 0 && retry_need) {
+		if (retry_done)
+			incomplete = 1;
+		else {
+			retry_done = 1;
+			goto retry;
+		}
+	}
+
+done:	R_UNLOCK(dbenv, dbmp->reginfo);
+
+	__os_free(bharray, ndirty * sizeof(BH *));
+
+	/*
+	 * Sync the underlying file as the last thing we do, so that the OS
+	 * has a maximal opportunity to flush buffers before we request it.
+	 *
+	 * !!!:
+	 * Don't lock the region around the sync, fsync(2) has no atomicity
+	 * issues.
+	 */
+	if (ret == 0)
+		ret = incomplete ?
+		    DB_INCOMPLETE : __os_fsync(dbenv, &dbmfp->fh);
+
+	return (ret);
+}
+
+/*
+ * __memp_sballoc --
+ *	Allocate room for a list of buffers.
+ *
+ * Sizes and allocates the array of BH pointers used by the sync routines.
+ * On success with dirty pages present, *bharrayp receives the array and
+ * *ndirtyp its capacity in entries.  If no pages are dirty, *ndirtyp is
+ * set to 0, nothing is allocated and 0 is returned.  On allocation
+ * failure the error is returned.
+ *
+ * Locking: this function always releases the region lock, and takes it
+ * again only on the path that returns 0 with an allocated array.
+ */
+static int
+__memp_sballoc(dbenv, bharrayp, ndirtyp)
+	DB_ENV *dbenv;
+	BH ***bharrayp;
+	u_int32_t *ndirtyp;
+{
+	DB_MPOOL *dbmp;
+	MPOOL *cache, *mp;
+	u_int32_t clean_total, dirty_total, nslots, pin_limit, reg;
+	int ret;
+
+	dbmp = dbenv->mp_handle;
+	mp = dbmp->reginfo[0].primary;
+
+	/*
+	 * The region lock should not be held while buffers are written, so
+	 * it only covers building the list.
+	 *
+	 * Add up the clean and dirty page counts of every cache to work out
+	 * how many buffers we're going to need, then drop the lock so that
+	 * it isn't held across the library allocation call.
+	 */
+	clean_total = 0;
+	dirty_total = 0;
+	for (reg = 0; reg < mp->nreg; ++reg) {
+		cache = dbmp->reginfo[reg].primary;
+		dirty_total += cache->stat.st_page_dirty;
+		clean_total += cache->stat.st_page_clean;
+	}
+	R_UNLOCK(dbenv, dbmp->reginfo);
+
+	/* Nothing dirty, nothing to allocate. */
+	if (dirty_total == 0) {
+		*ndirtyp = 0;
+		return (0);
+	}
+
+	/*
+	 * Pinning down the entire buffer cache would starve threads needing
+	 * new pages, so cap the list at 80% of the cache -- with a floor of
+	 * 10 so that a cache holding only a few pages still works.
+	 */
+	pin_limit = ((dirty_total + clean_total) * 8) / 10;
+	if (pin_limit < 10)
+		pin_limit = 10;
+
+	/*
+	 * Ask for a good-sized block of buffer pointers (half again the
+	 * dirty count, plus 10) so that we don't run out, but never more
+	 * than we would be allowed to store.
+	 */
+	nslots = dirty_total + (dirty_total / 2 + 10);
+	if (nslots > pin_limit)
+		nslots = pin_limit;
+
+	ret = __os_malloc(dbenv, nslots * sizeof(BH *), NULL, bharrayp);
+	if (ret != 0)
+		return (ret);
+
+	*ndirtyp = nslots;
+
+	R_LOCK(dbenv, dbmp->reginfo);
+
+	return (0);
+}
+
+/*
+ * __bhcmp --
+ *	qsort comparison function for an array of BH pointers: order by
+ *	file (mf_offset) first, then by page number within a file.
+ */
+static int
+__bhcmp(p1, p2)
+	const void *p1, *p2;
+{
+	BH *lhs, *rhs;
+
+	lhs = *(BH * const *)p1;
+	rhs = *(BH * const *)p2;
+
+	/* Sort by file (shared memory pool offset). */
+	if (lhs->mf_offset != rhs->mf_offset)
+		return (lhs->mf_offset < rhs->mf_offset ? -1 : 1);
+
+	/*
+	 * !!!
+	 * Defend against badly written quicksort code calling the comparison
+	 * function with two identical pointers (e.g., WATCOM C++ (Power++)):
+	 * equal page numbers compare as equal rather than being assumed
+	 * impossible.
+	 */
+	if (lhs->pgno != rhs->pgno)
+		return (lhs->pgno < rhs->pgno ? -1 : 1);
+
+	return (0);
+}
diff --git a/bdb/mp/mp_trickle.c b/bdb/mp/mp_trickle.c
new file mode 100644
index 00000000000..f937805cf40
--- /dev/null
+++ b/bdb/mp/mp_trickle.c
@@ -0,0 +1,149 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Sleepycat Software. All rights reserved.
+ */
+#include "db_config.h"
+
+#ifndef lint
+static const char revid[] = "$Id: mp_trickle.c,v 11.12 2000/11/30 00:58:41 ubell Exp $";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <stdlib.h>
+#endif
+
+#ifdef HAVE_RPC
+#include "db_server.h"
+#endif
+
+#include "db_int.h"
+#include "db_shash.h"
+#include "mp.h"
+
+#ifdef HAVE_RPC
+#include "gen_client_ext.h"
+#include "rpc_client_ext.h"
+#endif
+
+static int __memp_trick __P((DB_ENV *, int, int, int *));
+
+/*
+ * memp_trickle --
+ *	Keep a specified percentage of the buffers clean.
+ *
+ * dbenv:   environment handle.
+ * pct:     target percentage of clean buffers; must be in 1..100.
+ * nwrotep: if not NULL, zeroed on entry and incremented for every buffer
+ *	    written.
+ *
+ * Returns 0 on success, EINVAL if pct is out of range, or the first
+ * error returned while trickling a cache.
+ */
+int
+memp_trickle(dbenv, pct, nwrotep)
+	DB_ENV *dbenv;
+	int pct, *nwrotep;
+{
+	DB_MPOOL *dbmp;
+	MPOOL *mp;
+	u_int32_t cache;
+	int ret;
+
+#ifdef HAVE_RPC
+	if (F_ISSET(dbenv, DB_ENV_RPCCLIENT))
+		return (__dbcl_memp_trickle(dbenv, pct, nwrotep));
+#endif
+
+	PANIC_CHECK(dbenv);
+	ENV_REQUIRES_CONFIG(dbenv, dbenv->mp_handle, DB_INIT_MPOOL);
+
+	dbmp = dbenv->mp_handle;
+	mp = dbmp->reginfo[0].primary;
+
+	/* The count is reset even when the percentage is rejected below. */
+	if (nwrotep != NULL)
+		*nwrotep = 0;
+
+	if (pct < 1 || pct > 100)
+		return (EINVAL);
+
+	R_LOCK(dbenv, dbmp->reginfo);
+
+	/* Trickle each cache in turn, stopping at the first failure. */
+	ret = 0;
+	cache = 0;
+	while (cache < mp->nreg &&
+	    (ret = __memp_trick(dbenv, cache, pct, nwrotep)) == 0)
+		++cache;
+
+	R_UNLOCK(dbenv, dbmp->reginfo);
+	return (ret);
+}
+
+/*
+ * __memp_trick --
+ *	Trickle a single cache.
+ *
+ * Writes dirty, unreferenced, unlocked buffers of cache number ncache,
+ * one at a time, until at least pct percent of its pages are clean or no
+ * writable buffer is left.  Buffers of MP_TEMP files are skipped.
+ *
+ * Returns 0 on success, EPERM if a buffer could not be flushed, or the
+ * error returned by __memp_bhwrite.  *nwrotep (when not NULL) and the
+ * cache's st_page_trickle statistic are incremented per buffer written.
+ */
+static int
+__memp_trick(dbenv, ncache, pct, nwrotep)
+	DB_ENV *dbenv;
+	int ncache, pct, *nwrotep;
+{
+	BH *bhp;
+	DB_MPOOL *dbmp;
+	MPOOL *c_mp;
+	MPOOLFILE *mfp;
+	db_pgno_t pgno;
+	u_long total;
+	int ret, wrote;
+
+	dbmp = dbenv->mp_handle;
+	c_mp = dbmp->reginfo[ncache].primary;
+
+	/*
+	 * Each pass re-reads the statistics and rescans the buffer list
+	 * from its head; a pass ends as soon as one buffer has been
+	 * written.
+	 */
+	for (;;) {
+		/*
+		 * If there are sufficient clean buffers, or no buffers or no
+		 * dirty buffers, we're done.
+		 *
+		 * XXX
+		 * Using st_page_clean and st_page_dirty is our only choice at
+		 * the moment, but it's not as correct as we might like in the
+		 * presence of pools with more than one buffer size, as a free
+		 * 512-byte buffer isn't the same as a free 8K buffer.
+		 */
+		total = c_mp->stat.st_page_clean + c_mp->stat.st_page_dirty;
+		if (total == 0 || c_mp->stat.st_page_dirty == 0 ||
+		    (c_mp->stat.st_page_clean * 100) / total >= (u_long)pct)
+			return (0);
+
+		/* Scan until we write a buffer. */
+		for (bhp = SH_TAILQ_FIRST(&c_mp->bhq, __bh);
+		    bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh)) {
+			/* Only idle, dirty, unlocked buffers qualify. */
+			if (bhp->ref != 0)
+				continue;
+			if (!F_ISSET(bhp, BH_DIRTY))
+				continue;
+			if (F_ISSET(bhp, BH_LOCKED))
+				continue;
+
+			mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
+
+			/*
+			 * We can't write to temporary files -- see the
+			 * comment in mp_bh.c:__memp_bhwrite().
+			 */
+			if (F_ISSET(mfp, MP_TEMP))
+				continue;
+
+			/* Save the page number for the error message. */
+			pgno = bhp->pgno;
+			ret = __memp_bhwrite(dbmp, mfp, bhp, NULL, &wrote);
+			if (ret != 0)
+				return (ret);
+
+			/*
+			 * Any process syncing the shared memory buffer pool
+			 * had better be able to write to any underlying file.
+			 * Be understanding, but firm, on this point.
+			 */
+			if (!wrote) {
+				__db_err(dbenv, "%s: unable to flush page: %lu",
+				    __memp_fns(dbmp, mfp), (u_long)pgno);
+				return (EPERM);
+			}
+
+			++c_mp->stat.st_page_trickle;
+			if (nwrotep != NULL)
+				++*nwrotep;
+
+			/* One buffer written: start the next pass. */
+			break;
+		}
+
+		/* The scan ran off the end without writing anything. */
+		if (bhp == NULL)
+			return (0);
+	}
+}