diff options
Diffstat (limited to 'bdb/mp')
-rw-r--r-- | bdb/mp/Design | 52 | ||||
-rw-r--r-- | bdb/mp/mp_alloc.c | 152 | ||||
-rw-r--r-- | bdb/mp/mp_bh.c | 662 | ||||
-rw-r--r-- | bdb/mp/mp_fget.c | 417 | ||||
-rw-r--r-- | bdb/mp/mp_fopen.c | 756 | ||||
-rw-r--r-- | bdb/mp/mp_fput.c | 186 | ||||
-rw-r--r-- | bdb/mp/mp_fset.c | 98 | ||||
-rw-r--r-- | bdb/mp/mp_method.c | 115 | ||||
-rw-r--r-- | bdb/mp/mp_region.c | 357 | ||||
-rw-r--r-- | bdb/mp/mp_register.c | 85 | ||||
-rw-r--r-- | bdb/mp/mp_stat.c | 388 | ||||
-rw-r--r-- | bdb/mp/mp_sync.c | 658 | ||||
-rw-r--r-- | bdb/mp/mp_trickle.c | 149 |
13 files changed, 4075 insertions, 0 deletions
diff --git a/bdb/mp/Design b/bdb/mp/Design new file mode 100644 index 00000000000..1b26aae6cba --- /dev/null +++ b/bdb/mp/Design @@ -0,0 +1,52 @@ +$Id: Design,v 11.2 1999/11/21 23:08:27 bostic Exp $ + +There are three ways we do locking in the mpool code: + +Locking a handle mutex to provide concurrency for DB_THREAD operations. +Locking the region mutex to provide mutual exclusion while reading and + writing structures in the shared region. +Locking buffer header mutexes during I/O. + +The first will not be further described here. We use the shared mpool +region lock to provide mutual exclusion while reading/modifying all of +the data structures, including the buffer headers. We use a per-buffer +header lock to wait on buffer I/O. The order of locking is as follows: + +Searching for a buffer: + Acquire the region lock. + Find the buffer header. + Increment the reference count (guarantee the buffer stays). + While the BH_LOCKED flag is set (I/O is going on) { + Release the region lock. + Explicitly yield the processor if it's not the first pass + through this loop; otherwise, we would simply spin, because + we'd just be switching between the two locks. + Request the buffer lock. + The I/O will complete... + Acquire the buffer lock. + Release the buffer lock. + Acquire the region lock. + } + Return the buffer. + +Reading/writing a buffer: + Acquire the region lock. + Find/create the buffer header. + If reading, increment the reference count (guarantee the buffer stays). + Set the BH_LOCKED flag. + Acquire the buffer lock (guaranteed not to block). + Release the region lock. + Do the I/O and/or initialize the buffer contents. + Release the buffer lock. + At this point, the buffer lock is available, but the logical + operation (flagged by BH_LOCKED) is not yet completed. For + this reason, among others, threads checking the BH_LOCKED flag + must loop around their test. + Acquire the region lock. + Clear the BH_LOCKED flag. + Release the region lock. 
+ Return/discard the buffer. + +Pointers to DB_MPOOL, MPOOL, DB_MPOOLFILE and MPOOLFILE structures are +not reacquired when a region lock is reacquired because they couldn't +have been closed/discarded and because they never move in memory. diff --git a/bdb/mp/mp_alloc.c b/bdb/mp/mp_alloc.c new file mode 100644 index 00000000000..731f569f57f --- /dev/null +++ b/bdb/mp/mp_alloc.c @@ -0,0 +1,152 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Sleepycat Software. All rights reserved. + */ +#include "db_config.h" + +#ifndef lint +static const char revid[] = "$Id: mp_alloc.c,v 11.7 2000/04/20 21:14:18 bostic Exp $"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> +#endif + +#include "db_int.h" +#include "db_shash.h" +#include "mp.h" + +/* + * __memp_alloc -- + * Allocate some space from a cache region. + * + * PUBLIC: int __memp_alloc __P((DB_MPOOL *, + * PUBLIC: REGINFO *, MPOOLFILE *, size_t, roff_t *, void *)); + */ +int +__memp_alloc(dbmp, memreg, mfp, len, offsetp, retp) + DB_MPOOL *dbmp; + REGINFO *memreg; + MPOOLFILE *mfp; + size_t len; + roff_t *offsetp; + void *retp; +{ + BH *bhp, *nbhp; + MPOOL *c_mp; + MPOOLFILE *bh_mfp; + size_t total; + int nomore, restart, ret, wrote; + void *p; + + c_mp = memreg->primary; + + /* + * If we're allocating a buffer, and the one we're discarding is the + * same size, we don't want to waste the time to re-integrate it into + * the shared memory free list. If the DB_MPOOLFILE argument isn't + * NULL, we'll compare the underlying page sizes of the two buffers + * before free-ing and re-allocating buffers. 
+ */ + if (mfp != NULL) + len = (sizeof(BH) - sizeof(u_int8_t)) + mfp->stat.st_pagesize; + + nomore = 0; +alloc: if ((ret = __db_shalloc(memreg->addr, len, MUTEX_ALIGN, &p)) == 0) { + if (offsetp != NULL) + *offsetp = R_OFFSET(memreg, p); + *(void **)retp = p; + return (0); + } + if (nomore) { + __db_err(dbmp->dbenv, + "Unable to allocate %lu bytes from mpool shared region: %s\n", + (u_long)len, db_strerror(ret)); + return (ret); + } + +retry: /* Find a buffer we can flush; pure LRU. */ + restart = total = 0; + for (bhp = + SH_TAILQ_FIRST(&c_mp->bhq, __bh); bhp != NULL; bhp = nbhp) { + nbhp = SH_TAILQ_NEXT(bhp, q, __bh); + + /* Ignore pinned or locked (I/O in progress) buffers. */ + if (bhp->ref != 0 || F_ISSET(bhp, BH_LOCKED)) + continue; + + /* Find the associated MPOOLFILE. */ + bh_mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset); + + /* Write the page if it's dirty. */ + if (F_ISSET(bhp, BH_DIRTY)) { + ++bhp->ref; + if ((ret = __memp_bhwrite(dbmp, + bh_mfp, bhp, &restart, &wrote)) != 0) + return (ret); + --bhp->ref; + + /* + * Another process may have acquired this buffer and + * incremented the ref count after we wrote it. + */ + if (bhp->ref != 0) + goto retry; + + /* + * If we wrote the page, continue and free the buffer. + * We don't have to rewalk the list to acquire the + * buffer because it was never available for any other + * process to modify it. + * + * If we didn't write the page, but we discarded and + * reacquired the region lock, restart the list walk. + * + * If we neither wrote the buffer nor discarded the + * region lock, continue down the buffer list. + */ + if (wrote) + ++c_mp->stat.st_rw_evict; + else { + if (restart) + goto retry; + continue; + } + } else + ++c_mp->stat.st_ro_evict; + + /* + * Check to see if the buffer is the size we're looking for. + * If it is, simply reuse it. 
+ */ + if (mfp != NULL && + mfp->stat.st_pagesize == bh_mfp->stat.st_pagesize) { + __memp_bhfree(dbmp, bhp, 0); + + if (offsetp != NULL) + *offsetp = R_OFFSET(memreg, bhp); + *(void **)retp = bhp; + return (0); + } + + /* Note how much space we've freed, and free the buffer. */ + total += __db_shsizeof(bhp); + __memp_bhfree(dbmp, bhp, 1); + + /* + * Retry as soon as we've freed up sufficient space. If we + * have to coalesce memory to satisfy the request, don't + * try until it's likely (possible?) that we'll succeed. + */ + if (total >= 3 * len) + goto alloc; + + /* Restart the walk if we discarded the region lock. */ + if (restart) + goto retry; + } + nomore = 1; + goto alloc; +} diff --git a/bdb/mp/mp_bh.c b/bdb/mp/mp_bh.c new file mode 100644 index 00000000000..e802b165b2d --- /dev/null +++ b/bdb/mp/mp_bh.c @@ -0,0 +1,662 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Sleepycat Software. All rights reserved. + */ +#include "db_config.h" + +#ifndef lint +static const char revid[] = "$Id: mp_bh.c,v 11.25 2001/01/10 04:50:53 ubell Exp $"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <string.h> +#include <unistd.h> +#endif + +#include "db_int.h" +#include "db_shash.h" +#include "mp.h" +#include "log.h" +#include "db_page.h" + +static int __memp_upgrade __P((DB_MPOOL *, DB_MPOOLFILE *, MPOOLFILE *)); + +/* + * __memp_bhwrite -- + * Write the page associated with a given bucket header. 
+ * + * PUBLIC: int __memp_bhwrite + * PUBLIC: __P((DB_MPOOL *, MPOOLFILE *, BH *, int *, int *)); + */ +int +__memp_bhwrite(dbmp, mfp, bhp, restartp, wrotep) + DB_MPOOL *dbmp; + MPOOLFILE *mfp; + BH *bhp; + int *restartp, *wrotep; +{ + DB_MPOOLFILE *dbmfp; + DB_MPREG *mpreg; + int incremented, ret; + + if (restartp != NULL) + *restartp = 0; + if (wrotep != NULL) + *wrotep = 0; + incremented = 0; + + /* + * If the file has been removed or is a closed temporary file, Jump + * right ahead and pretend that we've found the file we want-- the + * page-write function knows how to handle the fact that we don't have + * (or need!) any real file descriptor information. + */ + if (F_ISSET(mfp, MP_DEADFILE)) { + dbmfp = NULL; + goto found; + } + + /* + * Walk the process' DB_MPOOLFILE list and find a file descriptor for + * the file. We also check that the descriptor is open for writing. + * If we find a descriptor on the file that's not open for writing, we + * try and upgrade it to make it writeable. If that fails, we're done. + */ + MUTEX_THREAD_LOCK(dbmp->dbenv, dbmp->mutexp); + for (dbmfp = TAILQ_FIRST(&dbmp->dbmfq); + dbmfp != NULL; dbmfp = TAILQ_NEXT(dbmfp, q)) + if (dbmfp->mfp == mfp) { + if (F_ISSET(dbmfp, MP_READONLY) && + __memp_upgrade(dbmp, dbmfp, mfp)) { + MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp); + return (0); + } + + /* + * Increment the reference count -- see the comment in + * memp_fclose(). + */ + ++dbmfp->ref; + incremented = 1; + break; + } + MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp); + if (dbmfp != NULL) + goto found; + + /* + * !!! + * Don't try to attach to temporary files. There are two problems in + * trying to do that. First, if we have different privileges than the + * process that "owns" the temporary file, we might create the backing + * disk file such that the owning process couldn't read/write its own + * buffers, e.g., memp_trickle() running as root creating a file owned + * as root, mode 600. 
Second, if the temporary file has already been + * created, we don't have any way of finding out what its real name is, + * and, even if we did, it was already unlinked (so that it won't be + * left if the process dies horribly). This decision causes a problem, + * however: if the temporary file consumes the entire buffer cache, + * and the owner doesn't flush the buffers to disk, we could end up + * with resource starvation, and the memp_trickle() thread couldn't do + * anything about it. That's a pretty unlikely scenario, though. + * + * Note that we should never get here when the temporary file + * in question has already been closed in another process, in which + * case it should be marked MP_DEADFILE. + */ + if (F_ISSET(mfp, MP_TEMP)) { + DB_ASSERT(!F_ISSET(mfp, MP_DEADFILE)); + return (0); + } + + /* + * It's not a page from a file we've opened. If the file requires + * input/output processing, see if this process has ever registered + * information as to how to write this type of file. If not, there's + * nothing we can do. + */ + if (mfp->ftype != 0) { + MUTEX_THREAD_LOCK(dbmp->dbenv, dbmp->mutexp); + for (mpreg = LIST_FIRST(&dbmp->dbregq); + mpreg != NULL; mpreg = LIST_NEXT(mpreg, q)) + if (mpreg->ftype == mfp->ftype) + break; + MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp); + if (mpreg == NULL) + return (0); + } + + /* + * Try and open the file, attaching to the underlying shared area. + * Ignore any error, assume it's a permissions problem. + * + * XXX + * There's no negative cache, so we may repeatedly try and open files + * that we have previously tried (and failed) to open. 
+ */ + if (__memp_fopen(dbmp, mfp, R_ADDR(dbmp->reginfo, mfp->path_off), + 0, 0, mfp->stat.st_pagesize, 0, NULL, &dbmfp) != 0) + return (0); + +found: ret = __memp_pgwrite(dbmp, dbmfp, bhp, restartp, wrotep); + + if (incremented) { + MUTEX_THREAD_LOCK(dbmp->dbenv, dbmp->mutexp); + --dbmfp->ref; + MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp); + } + + return (ret); +} + +/* + * __memp_pgread -- + * Read a page from a file. + * + * PUBLIC: int __memp_pgread __P((DB_MPOOLFILE *, BH *, int)); + */ +int +__memp_pgread(dbmfp, bhp, can_create) + DB_MPOOLFILE *dbmfp; + BH *bhp; + int can_create; +{ + DB_IO db_io; + DB_ENV *dbenv; + DB_MPOOL *dbmp; + MPOOLFILE *mfp; + size_t len, pagesize; + size_t nr; + int created, ret; + + dbmp = dbmfp->dbmp; + dbenv = dbmp->dbenv; + mfp = dbmfp->mfp; + pagesize = mfp->stat.st_pagesize; + + F_SET(bhp, BH_LOCKED | BH_TRASH); + MUTEX_LOCK(dbenv, &bhp->mutex, dbenv->lockfhp); + R_UNLOCK(dbenv, dbmp->reginfo); + + /* + * Temporary files may not yet have been created. We don't create + * them now, we create them when the pages have to be flushed. + */ + nr = 0; + if (F_ISSET(&dbmfp->fh, DB_FH_VALID)) { + /* + * Ignore read errors if we have permission to create the page. + * Assume that the page doesn't exist, and that we'll create it + * when we write it out. + * + * XXX + * Theoretically, we could overwrite a page of data if it were + * possible for a file to be successfully opened for reading + * and then for the read to fail. Shouldn't ever happen, but + * it might be worth checking to see if the offset is past the + * known end-of-file. + */ + db_io.fhp = &dbmfp->fh; + db_io.mutexp = dbmfp->mutexp; + db_io.pagesize = db_io.bytes = pagesize; + db_io.pgno = bhp->pgno; + db_io.buf = bhp->buf; + + ret = __os_io(dbenv, &db_io, DB_IO_READ, &nr); + } else + ret = 0; + + created = 0; + if (nr < pagesize) { + if (can_create) + created = 1; + else { + /* + * If we had a short read, ret may be 0. 
This may not + * be an error -- in particular DB recovery processing + * may request pages that have never been written to + * disk, in which case we won't find the page. So, the + * caller must know how to handle the error. + */ + if (ret == 0) + ret = EIO; + goto err; + } + } + + /* + * Clear any bytes we didn't read that need to be cleared. If we're + * running in diagnostic mode, smash any bytes on the page that are + * unknown quantities for the caller. + */ + if (nr != pagesize) { + len = mfp->clear_len == 0 ? pagesize : mfp->clear_len; + if (nr < len) + memset(bhp->buf + nr, 0, len - nr); +#ifdef DIAGNOSTIC + if (nr > len) + len = nr; + if (len < pagesize) + memset(bhp->buf + len, CLEAR_BYTE, pagesize - len); +#endif + } + + /* Call any pgin function. */ + ret = mfp->ftype == 0 ? 0 : __memp_pg(dbmfp, bhp, 1); + + /* Unlock the buffer and reacquire the region lock. */ +err: MUTEX_UNLOCK(dbenv, &bhp->mutex); + R_LOCK(dbenv, dbmp->reginfo); + + /* + * If no errors occurred, the data is now valid, clear the BH_TRASH + * flag; regardless, clear the lock bit and let other threads proceed. + */ + F_CLR(bhp, BH_LOCKED); + if (ret == 0) { + F_CLR(bhp, BH_TRASH); + + /* Update the statistics. */ + if (created) + ++mfp->stat.st_page_create; + else + ++mfp->stat.st_page_in; + } + + return (ret); +} + +/* + * __memp_pgwrite -- + * Write a page to a file. + * + * PUBLIC: int __memp_pgwrite + * PUBLIC: __P((DB_MPOOL *, DB_MPOOLFILE *, BH *, int *, int *)); + */ +int +__memp_pgwrite(dbmp, dbmfp, bhp, restartp, wrotep) + DB_MPOOL *dbmp; + DB_MPOOLFILE *dbmfp; + BH *bhp; + int *restartp, *wrotep; +{ + DB_ENV *dbenv; + DB_IO db_io; + DB_LSN lsn; + MPOOL *c_mp, *mp; + MPOOLFILE *mfp; + size_t nw; + int callpgin, dosync, ret, syncfail; + const char *fail; + + dbenv = dbmp->dbenv; + mp = dbmp->reginfo[0].primary; + mfp = dbmfp == NULL ? 
NULL : dbmfp->mfp; + + if (restartp != NULL) + *restartp = 0; + if (wrotep != NULL) + *wrotep = 0; + callpgin = 0; + + /* + * Check the dirty bit -- this buffer may have been written since we + * decided to write it. + */ + if (!F_ISSET(bhp, BH_DIRTY)) { + if (wrotep != NULL) + *wrotep = 1; + return (0); + } + + MUTEX_LOCK(dbenv, &bhp->mutex, dbenv->lockfhp); + + /* + * If there were two writers, we may have just been waiting while the + * other writer completed I/O on this buffer. Check the dirty bit one + * more time. + */ + if (!F_ISSET(bhp, BH_DIRTY)) { + MUTEX_UNLOCK(dbenv, &bhp->mutex); + + if (wrotep != NULL) + *wrotep = 1; + return (0); + } + + F_SET(bhp, BH_LOCKED); + R_UNLOCK(dbenv, dbmp->reginfo); + + if (restartp != NULL) + *restartp = 1; + + /* + * It's possible that the underlying file doesn't exist, either + * because of an outright removal or because it was a temporary + * file that's been closed. + * + * !!! + * Once we pass this point, we know that dbmfp and mfp aren't NULL, + * and that we have a valid file reference. + */ + if (mfp == NULL || F_ISSET(mfp, MP_DEADFILE)) + goto file_dead; + + /* + * Ensure the appropriate log records are on disk. If the page is + * being written as part of a sync operation, the flush has already + * been done, unless it was written by the application *after* the + * sync was scheduled. + */ + if (LOGGING_ON(dbenv) && + (!F_ISSET(bhp, BH_SYNC) || F_ISSET(bhp, BH_SYNC_LOGFLSH))) { + memcpy(&lsn, bhp->buf + mfp->lsn_off, sizeof(DB_LSN)); + if ((ret = log_flush(dbenv, &lsn)) != 0) + goto err; + } + DB_ASSERT(!LOGGING_ON(dbenv) || + log_compare(&((LOG *)((DB_LOG *) + dbenv->lg_handle)->reginfo.primary)->s_lsn, &LSN(bhp->buf)) > 0); + + /* + * Call any pgout function. We set the callpgin flag so that we flag + * that the contents of the buffer will need to be passed through pgin + * before they are reused. 
+ */ + if (mfp->ftype == 0) + ret = 0; + else { + callpgin = 1; + if ((ret = __memp_pg(dbmfp, bhp, 0)) != 0) + goto err; + } + + /* Temporary files may not yet have been created. */ + if (!F_ISSET(&dbmfp->fh, DB_FH_VALID)) { + MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp); + if (!F_ISSET(&dbmfp->fh, DB_FH_VALID) && + ((ret = __db_appname(dbenv, DB_APP_TMP, NULL, NULL, + DB_OSO_CREATE | DB_OSO_EXCL | DB_OSO_TEMP, + &dbmfp->fh, NULL)) != 0 || + !F_ISSET(&dbmfp->fh, DB_FH_VALID))) { + MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp); + __db_err(dbenv, + "unable to create temporary backing file"); + goto err; + } + MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp); + } + + /* Write the page. */ + db_io.fhp = &dbmfp->fh; + db_io.mutexp = dbmfp->mutexp; + db_io.pagesize = db_io.bytes = mfp->stat.st_pagesize; + db_io.pgno = bhp->pgno; + db_io.buf = bhp->buf; + if ((ret = __os_io(dbenv, &db_io, DB_IO_WRITE, &nw)) != 0) { + ret = __db_panic(dbenv, ret); + fail = "write"; + goto syserr; + } + if (nw != mfp->stat.st_pagesize) { + ret = EIO; + fail = "write"; + goto syserr; + } + +file_dead: + /* + * !!! + * Once we pass this point, dbmfp and mfp may be NULL, we may not have + * a valid file reference. + * + * Unlock the buffer and reacquire the region lock. + */ + MUTEX_UNLOCK(dbenv, &bhp->mutex); + R_LOCK(dbenv, dbmp->reginfo); + + /* + * Clean up the flags based on a successful write. + * + * If we rewrote the page, it will need processing by the pgin + * routine before reuse. + */ + if (callpgin) + F_SET(bhp, BH_CALLPGIN); + F_CLR(bhp, BH_DIRTY | BH_LOCKED); + + /* + * If we write a buffer for which a checkpoint is waiting, update + * the count of pending buffers (both in the mpool as a whole and + * for this file). If the count for this file goes to zero, set a + * flag so we flush the writes. + */ + dosync = 0; + if (F_ISSET(bhp, BH_SYNC)) { + F_CLR(bhp, BH_SYNC | BH_SYNC_LOGFLSH); + + --mp->lsn_cnt; + if (mfp != NULL) + dosync = --mfp->lsn_cnt == 0 ? 
1 : 0; + } + + /* Update the page clean/dirty statistics. */ + c_mp = BH_TO_CACHE(dbmp, bhp); + ++c_mp->stat.st_page_clean; + --c_mp->stat.st_page_dirty; + + /* Update I/O statistics. */ + if (mfp != NULL) + ++mfp->stat.st_page_out; + + /* + * Do the sync after everything else has been updated, so any incoming + * checkpoint doesn't see inconsistent information. + * + * XXX: + * Don't lock the region around the sync, fsync(2) has no atomicity + * issues. + * + * XXX: + * We ignore errors from the sync -- it makes no sense to return an + * error to the calling process, so set a flag causing the checkpoint + * to be retried later. There is a possibility, of course, that a + * subsequent checkpoint was started and that we're going to force it + * to fail. That should be unlikely, and fixing it would be difficult. + */ + if (dosync) { + R_UNLOCK(dbenv, dbmp->reginfo); + syncfail = __os_fsync(dbenv, &dbmfp->fh) != 0; + R_LOCK(dbenv, dbmp->reginfo); + if (syncfail) + F_SET(mp, MP_LSN_RETRY); + } + + if (wrotep != NULL) + *wrotep = 1; + + return (0); + +syserr: __db_err(dbenv, "%s: %s failed for page %lu", + __memp_fn(dbmfp), fail, (u_long)bhp->pgno); + +err: /* Unlock the buffer and reacquire the region lock. */ + MUTEX_UNLOCK(dbenv, &bhp->mutex); + R_LOCK(dbenv, dbmp->reginfo); + + /* + * Clean up the flags based on a failure. + * + * The page remains dirty but we remove our lock. If we rewrote the + * page, it will need processing by the pgin routine before reuse. + */ + if (callpgin) + F_SET(bhp, BH_CALLPGIN); + F_CLR(bhp, BH_LOCKED); + + return (ret); +} + +/* + * __memp_pg -- + * Call the pgin/pgout routine. 
+ * + * PUBLIC: int __memp_pg __P((DB_MPOOLFILE *, BH *, int)); + */ +int +__memp_pg(dbmfp, bhp, is_pgin) + DB_MPOOLFILE *dbmfp; + BH *bhp; + int is_pgin; +{ + DBT dbt, *dbtp; + DB_MPOOL *dbmp; + DB_MPREG *mpreg; + MPOOLFILE *mfp; + int ftype, ret; + + dbmp = dbmfp->dbmp; + mfp = dbmfp->mfp; + + MUTEX_THREAD_LOCK(dbmp->dbenv, dbmp->mutexp); + + ftype = mfp->ftype; + for (mpreg = LIST_FIRST(&dbmp->dbregq); + mpreg != NULL; mpreg = LIST_NEXT(mpreg, q)) { + if (ftype != mpreg->ftype) + continue; + if (mfp->pgcookie_len == 0) + dbtp = NULL; + else { + dbt.size = mfp->pgcookie_len; + dbt.data = R_ADDR(dbmp->reginfo, mfp->pgcookie_off); + dbtp = &dbt; + } + MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp); + + if (is_pgin) { + if (mpreg->pgin != NULL && + (ret = mpreg->pgin(dbmp->dbenv, + bhp->pgno, bhp->buf, dbtp)) != 0) + goto err; + } else + if (mpreg->pgout != NULL && + (ret = mpreg->pgout(dbmp->dbenv, + bhp->pgno, bhp->buf, dbtp)) != 0) + goto err; + break; + } + + if (mpreg == NULL) + MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp); + + return (0); + +err: MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp); + __db_err(dbmp->dbenv, "%s: %s failed for page %lu", + __memp_fn(dbmfp), is_pgin ? "pgin" : "pgout", (u_long)bhp->pgno); + return (ret); +} + +/* + * __memp_bhfree -- + * Free a bucket header and its referenced data. + * + * PUBLIC: void __memp_bhfree __P((DB_MPOOL *, BH *, int)); + */ +void +__memp_bhfree(dbmp, bhp, free_mem) + DB_MPOOL *dbmp; + BH *bhp; + int free_mem; +{ + DB_HASHTAB *dbht; + MPOOL *c_mp, *mp; + MPOOLFILE *mfp; + int n_bucket, n_cache; + + mp = dbmp->reginfo[0].primary; + c_mp = BH_TO_CACHE(dbmp, bhp); + n_cache = NCACHE(mp, bhp->pgno); + n_bucket = NBUCKET(c_mp, bhp->mf_offset, bhp->pgno); + dbht = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab); + + /* Delete the buffer header from the hash bucket queue. */ + SH_TAILQ_REMOVE(&dbht[n_bucket], bhp, hq, __bh); + + /* Delete the buffer header from the LRU queue. 
*/ + SH_TAILQ_REMOVE(&c_mp->bhq, bhp, q, __bh); + + /* Clear the mutex this buffer recorded */ + __db_shlocks_clear(&bhp->mutex, &dbmp->reginfo[n_cache], + (REGMAINT *)R_ADDR(&dbmp->reginfo[n_cache], mp->maint_off)); + /* + * Find the underlying MPOOLFILE and decrement its reference count. + * If this is its last reference, remove it. + */ + mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset); + if (--mfp->block_cnt == 0 && mfp->mpf_cnt == 0) + __memp_mf_discard(dbmp, mfp); + + /* + * If we're not reusing it immediately, free the buffer header + * and data for real. + */ + if (free_mem) { + --c_mp->stat.st_page_clean; + __db_shalloc_free(dbmp->reginfo[n_cache].addr, bhp); + } +} + +/* + * __memp_upgrade -- + * Upgrade a file descriptor from readonly to readwrite. + */ +static int +__memp_upgrade(dbmp, dbmfp, mfp) + DB_MPOOL *dbmp; + DB_MPOOLFILE *dbmfp; + MPOOLFILE *mfp; +{ + DB_FH fh; + int ret; + char *rpath; + + /* + * !!! + * We expect the handle to already be locked. + */ + + /* Check to see if we've already upgraded. */ + if (F_ISSET(dbmfp, MP_UPGRADE)) + return (0); + + /* Check to see if we've already failed. */ + if (F_ISSET(dbmfp, MP_UPGRADE_FAIL)) + return (1); + + /* + * Calculate the real name for this file and try to open it read/write. + * We know we have a valid pathname for the file because it's the only + * way we could have gotten a file descriptor of any kind. + */ + if ((ret = __db_appname(dbmp->dbenv, DB_APP_DATA, + NULL, R_ADDR(dbmp->reginfo, mfp->path_off), 0, NULL, &rpath)) != 0) + return (ret); + if (__os_open(dbmp->dbenv, rpath, 0, 0, &fh) != 0) { + F_SET(dbmfp, MP_UPGRADE_FAIL); + ret = 1; + } else { + /* Swap the descriptors and set the upgrade flag. 
*/ + (void)__os_closehandle(&dbmfp->fh); + dbmfp->fh = fh; + F_SET(dbmfp, MP_UPGRADE); + ret = 0; + } + __os_freestr(rpath); + return (ret); +} diff --git a/bdb/mp/mp_fget.c b/bdb/mp/mp_fget.c new file mode 100644 index 00000000000..1bff5e136ab --- /dev/null +++ b/bdb/mp/mp_fget.c @@ -0,0 +1,417 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Sleepycat Software. All rights reserved. + */ +#include "db_config.h" + +#ifndef lint +static const char revid[] = "$Id: mp_fget.c,v 11.28 2001/01/10 04:50:53 ubell Exp $"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <string.h> +#endif + +#ifdef HAVE_RPC +#include "db_server.h" +#endif + +#include "db_int.h" +#include "db_shash.h" +#include "mp.h" + +#ifdef HAVE_RPC +#include "gen_client_ext.h" +#include "rpc_client_ext.h" +#endif + +/* + * memp_fget -- + * Get a page from the file. + */ +int +memp_fget(dbmfp, pgnoaddr, flags, addrp) + DB_MPOOLFILE *dbmfp; + db_pgno_t *pgnoaddr; + u_int32_t flags; + void *addrp; +{ + BH *bhp; + DB_ENV *dbenv; + DB_MPOOL *dbmp; + DB_HASHTAB *dbht; + MPOOL *c_mp, *mp; + MPOOLFILE *mfp; + size_t n_bucket, n_cache, mf_offset; + u_int32_t st_hsearch; + int b_incr, first, ret; + + dbmp = dbmfp->dbmp; + dbenv = dbmp->dbenv; + mp = dbmp->reginfo[0].primary; + mfp = dbmfp->mfp; +#ifdef HAVE_RPC + if (F_ISSET(dbenv, DB_ENV_RPCCLIENT)) + return (__dbcl_memp_fget(dbmfp, pgnoaddr, flags, addrp)); +#endif + + PANIC_CHECK(dbenv); + + /* + * Validate arguments. + * + * !!! + * Don't test for DB_MPOOL_CREATE and DB_MPOOL_NEW flags for readonly + * files here, and create non-existent pages in readonly files if the + * flags are set, later. The reason is that the hash access method + * wants to get empty pages that don't really exist in readonly files. 
+ * The only alternative is for hash to write the last "bucket" all the + * time, which we don't want to do because one of our big goals in life + * is to keep database files small. It's sleazy as hell, but we catch + * any attempt to actually write the file in memp_fput(). + */ +#define OKFLAGS \ + (DB_MPOOL_CREATE | DB_MPOOL_LAST | \ + DB_MPOOL_NEW | DB_MPOOL_NEW_GROUP | DB_MPOOL_EXTENT) + if (flags != 0) { + if ((ret = __db_fchk(dbenv, "memp_fget", flags, OKFLAGS)) != 0) + return (ret); + + switch (flags & ~DB_MPOOL_EXTENT) { + case DB_MPOOL_CREATE: + case DB_MPOOL_LAST: + case DB_MPOOL_NEW: + case DB_MPOOL_NEW_GROUP: + case 0: + break; + default: + return (__db_ferr(dbenv, "memp_fget", 1)); + } + } + +#ifdef DIAGNOSTIC + /* + * XXX + * We want to switch threads as often as possible. Yield every time + * we get a new page to ensure contention. + */ + if (DB_GLOBAL(db_pageyield)) + __os_yield(dbenv, 1); +#endif + + /* Initialize remaining local variables. */ + mf_offset = R_OFFSET(dbmp->reginfo, mfp); + bhp = NULL; + st_hsearch = 0; + b_incr = ret = 0; + + R_LOCK(dbenv, dbmp->reginfo); + + /* + * Check for the new, last or last + 1 page requests. + * + * Examine and update the file's last_pgno value. We don't care if + * the last_pgno value immediately changes due to another thread -- + * at this instant in time, the value is correct. We do increment the + * current last_pgno value if the thread is asking for a new page, + * however, to ensure that two threads creating pages don't get the + * same one. + * + * If we create a page, there is the potential that a page after it + * in the file will be written before it will be written. Recovery + * depends on pages that are "created" in the file by subsequent pages + * being written be zeroed out, not have random garbage. Ensure that + * the OS agrees. + * + * !!! + * DB_MPOOL_NEW_GROUP is undocumented -- the hash access method needs + * to allocate contiguous groups of pages in order to do subdatabases. 
+ * We return the first page in the group, but the caller must put an + * LSN on the *last* page and write it, otherwise after a crash we may + * not create all of the pages we need to create. + */ + if (LF_ISSET(DB_MPOOL_LAST | DB_MPOOL_NEW | DB_MPOOL_NEW_GROUP)) { + if (LF_ISSET(DB_MPOOL_NEW)) { + if (F_ISSET(&dbmfp->fh, DB_FH_VALID) && (ret = + __os_fpinit(dbenv, &dbmfp->fh, mfp->last_pgno + 1, + 1, mfp->stat.st_pagesize)) != 0) { + R_UNLOCK(dbenv, dbmp->reginfo); + return (ret); + } + ++mfp->last_pgno; + } + if (LF_ISSET(DB_MPOOL_NEW_GROUP)) { + if (F_ISSET(&dbmfp->fh, DB_FH_VALID) && (ret = + __os_fpinit(dbenv, &dbmfp->fh, mfp->last_pgno + 1, + (int)*pgnoaddr, mfp->stat.st_pagesize)) != 0) { + R_UNLOCK(dbenv, dbmp->reginfo); + return (ret); + } + mfp->last_pgno += *pgnoaddr; + } + *pgnoaddr = mfp->last_pgno; + } + + /* + * Determine the hash bucket where this page will live, and get local + * pointers to the cache and its hash table. + */ + n_cache = NCACHE(mp, *pgnoaddr); + c_mp = dbmp->reginfo[n_cache].primary; + n_bucket = NBUCKET(c_mp, mf_offset, *pgnoaddr); + dbht = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab); + + if (LF_ISSET(DB_MPOOL_NEW | DB_MPOOL_NEW_GROUP)) + goto alloc; + + /* + * If mmap'ing the file and the page is not past the end of the file, + * just return a pointer. + * + * The page may be past the end of the file, so check the page number + * argument against the original length of the file. If we previously + * returned pages past the original end of the file, last_pgno will + * have been updated to match the "new" end of the file, and checking + * against it would return pointers past the end of the mmap'd region. + * + * If another process has opened the file for writing since we mmap'd + * it, we will start playing the game by their rules, i.e. everything + * goes through the cache. All pages previously returned will be safe, + * as long as the correct locking protocol was observed. 
+ * + * XXX + * We don't discard the map because we don't know when all of the + * pages will have been discarded from the process' address space. + * It would be possible to do so by reference counting the open + * pages from the mmap, but it's unclear to me that it's worth it. + */ + if (dbmfp->addr != NULL && F_ISSET(mfp, MP_CAN_MMAP)) { + if (*pgnoaddr > mfp->orig_last_pgno) { + /* + * !!! + * See the comment above about non-existent pages and + * the hash access method. + */ + if (!LF_ISSET(DB_MPOOL_CREATE)) { + if (!LF_ISSET(DB_MPOOL_EXTENT)) + __db_err(dbenv, + "%s: page %lu doesn't exist", + __memp_fn(dbmfp), (u_long)*pgnoaddr); + ret = EINVAL; + goto err; + } + } else { + *(void **)addrp = + R_ADDR(dbmfp, *pgnoaddr * mfp->stat.st_pagesize); + ++mfp->stat.st_map; + goto done; + } + } + + /* Search the hash chain for the page. */ + for (bhp = SH_TAILQ_FIRST(&dbht[n_bucket], __bh); + bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) { + ++st_hsearch; + if (bhp->pgno != *pgnoaddr || bhp->mf_offset != mf_offset) + continue; + + /* Fail rather than overflow the reference count. */ + if (bhp->ref == UINT16_T_MAX) { + __db_err(dbenv, + "%s: page %lu: reference count overflow", + __memp_fn(dbmfp), (u_long)bhp->pgno); + ret = EINVAL; + goto err; + } + + /* + * Increment the reference count. We may discard the region + * lock as we evaluate and/or read the buffer, so we need to + * ensure that it doesn't move and that its contents remain + * unchanged. + */ + ++bhp->ref; + b_incr = 1; + + /* + * Any buffer we find might be trouble. + * + * BH_LOCKED -- + * I/O is in progress. Because we've incremented the buffer + * reference count, we know the buffer can't move. Unlock + * the region lock, wait for the I/O to complete, and reacquire + * the region. 
+ */ + for (first = 1; F_ISSET(bhp, BH_LOCKED); first = 0) { + R_UNLOCK(dbenv, dbmp->reginfo); + + /* + * Explicitly yield the processor if it's not the first + * pass through this loop -- if we don't, we might end + * up running to the end of our CPU quantum as we will + * simply be swapping between the two locks. + */ + if (!first) + __os_yield(dbenv, 1); + + MUTEX_LOCK(dbenv, &bhp->mutex, dbenv->lockfhp); + /* Wait for I/O to finish... */ + MUTEX_UNLOCK(dbenv, &bhp->mutex); + R_LOCK(dbenv, dbmp->reginfo); + } + + /* + * BH_TRASH -- + * The contents of the buffer are garbage. Shouldn't happen, + * and this read is likely to fail, but might as well try. + */ + if (F_ISSET(bhp, BH_TRASH)) + goto reread; + + /* + * BH_CALLPGIN -- + * The buffer was converted so it could be written, and the + * contents need to be converted again. + */ + if (F_ISSET(bhp, BH_CALLPGIN)) { + if ((ret = __memp_pg(dbmfp, bhp, 1)) != 0) + goto err; + F_CLR(bhp, BH_CALLPGIN); + } + + ++mfp->stat.st_cache_hit; + *(void **)addrp = bhp->buf; + goto done; + } + +alloc: /* Allocate new buffer header and data space. */ + if ((ret = __memp_alloc(dbmp, + &dbmp->reginfo[n_cache], mfp, 0, NULL, &bhp)) != 0) + goto err; + + ++c_mp->stat.st_page_clean; + + /* + * Initialize the BH fields so that we can call the __memp_bhfree + * routine if an error occurs. + */ + memset(bhp, 0, sizeof(BH)); + bhp->ref = 1; + bhp->pgno = *pgnoaddr; + bhp->mf_offset = mf_offset; + + /* Increment the count of buffers referenced by this MPOOLFILE. */ + ++mfp->block_cnt; + + /* + * Prepend the bucket header to the head of the appropriate MPOOL + * bucket hash list. Append the bucket header to the tail of the + * MPOOL LRU chain. 
+ */ + SH_TAILQ_INSERT_HEAD(&dbht[n_bucket], bhp, hq, __bh); + SH_TAILQ_INSERT_TAIL(&c_mp->bhq, bhp, q); + +#ifdef DIAGNOSTIC + if ((db_alignp_t)bhp->buf & (sizeof(size_t) - 1)) { + __db_err(dbenv, "Internal error: BH data NOT size_t aligned."); + ret = EINVAL; + __memp_bhfree(dbmp, bhp, 1); + goto err; + } +#endif + + if ((ret = __db_shmutex_init(dbenv, &bhp->mutex, + R_OFFSET(dbmp->reginfo, &bhp->mutex) + DB_FCNTL_OFF_MPOOL, + 0, &dbmp->reginfo[n_cache], + (REGMAINT *)R_ADDR(&dbmp->reginfo[n_cache], c_mp->maint_off))) + != 0) { + __memp_bhfree(dbmp, bhp, 1); + goto err; + } + + /* + * If we created the page, zero it out and continue. + * + * !!! + * Note: DB_MPOOL_NEW specifically doesn't call the pgin function. + * If DB_MPOOL_CREATE is used, then the application's pgin function + * has to be able to handle pages of 0's -- if it uses DB_MPOOL_NEW, + * it can detect all of its page creates, and not bother. + * + * If we're running in diagnostic mode, smash any bytes on the + * page that are unknown quantities for the caller. + * + * Otherwise, read the page into memory, optionally creating it if + * DB_MPOOL_CREATE is set. + */ + if (LF_ISSET(DB_MPOOL_NEW | DB_MPOOL_NEW_GROUP)) { + if (mfp->clear_len == 0) + memset(bhp->buf, 0, mfp->stat.st_pagesize); + else { + memset(bhp->buf, 0, mfp->clear_len); +#ifdef DIAGNOSTIC + memset(bhp->buf + mfp->clear_len, CLEAR_BYTE, + mfp->stat.st_pagesize - mfp->clear_len); +#endif + } + + ++mfp->stat.st_page_create; + } else { + /* + * It's possible for the read function to fail, which means + * that we fail as well. Note, the __memp_pgread() function + * discards the region lock, so the buffer must be pinned + * down so that it cannot move and its contents are unchanged. + */ +reread: if ((ret = __memp_pgread(dbmfp, + bhp, LF_ISSET(DB_MPOOL_CREATE|DB_MPOOL_EXTENT))) != 0) { + /* + * !!! + * Discard the buffer unless another thread is waiting + * on our I/O to complete. Regardless, the header has + * the BH_TRASH flag set. 
+ */ + if (bhp->ref == 1) + __memp_bhfree(dbmp, bhp, 1); + goto err; + } + + ++mfp->stat.st_cache_miss; + } + + /* + * If we're returning a page after our current notion of the last-page, + * update our information. Note, there's no way to un-instantiate this + * page, it's going to exist whether it's returned to us dirty or not. + */ + if (bhp->pgno > mfp->last_pgno) + mfp->last_pgno = bhp->pgno; + + *(void **)addrp = bhp->buf; + +done: /* Update the chain search statistics. */ + if (st_hsearch) { + ++c_mp->stat.st_hash_searches; + if (st_hsearch > c_mp->stat.st_hash_longest) + c_mp->stat.st_hash_longest = st_hsearch; + c_mp->stat.st_hash_examined += st_hsearch; + } + + ++dbmfp->pinref; + + R_UNLOCK(dbenv, dbmp->reginfo); + + return (0); + +err: /* Discard our reference. */ + if (b_incr) + --bhp->ref; + R_UNLOCK(dbenv, dbmp->reginfo); + + *(void **)addrp = NULL; + return (ret); +} diff --git a/bdb/mp/mp_fopen.c b/bdb/mp/mp_fopen.c new file mode 100644 index 00000000000..3611ded18f4 --- /dev/null +++ b/bdb/mp/mp_fopen.c @@ -0,0 +1,756 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Sleepycat Software. All rights reserved. + */ +#include "db_config.h" + +#ifndef lint +static const char revid[] = "$Id: mp_fopen.c,v 11.41 2001/01/10 04:50:53 ubell Exp $"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <string.h> +#endif + +#ifdef HAVE_RPC +#include "db_server.h" +#endif + +#include "db_int.h" +#include "db_shash.h" +#include "mp.h" + +#ifdef HAVE_RPC +#include "gen_client_ext.h" +#include "rpc_client_ext.h" +#endif + +static int __memp_mf_open __P((DB_MPOOL *, const char *, + size_t, db_pgno_t, DB_MPOOL_FINFO *, u_int32_t, MPOOLFILE **)); + +/* + * MEMP_FREMOVE -- + * Discard an MPOOLFILE and any buffers it references: update the flags + * so we never try to write buffers associated with the file, nor can we + * find it when looking for files to join. 
In addition, clear the ftype + * field, there's no reason to post-process pages, they can be discarded + * by any thread. + */ +#define MEMP_FREMOVE(mfp) { \ + mfp->ftype = 0; \ + F_SET(mfp, MP_DEADFILE); \ +} + +/* + * memp_fopen -- + * Open a backing file for the memory pool. + */ +int +memp_fopen(dbenv, path, flags, mode, pagesize, finfop, retp) + DB_ENV *dbenv; + const char *path; + u_int32_t flags; + int mode; + size_t pagesize; + DB_MPOOL_FINFO *finfop; + DB_MPOOLFILE **retp; +{ + DB_MPOOL *dbmp; + int ret; + +#ifdef HAVE_RPC + if (F_ISSET(dbenv, DB_ENV_RPCCLIENT)) + return (__dbcl_memp_fopen(dbenv, path, flags, + mode, pagesize, finfop, retp)); +#endif + + PANIC_CHECK(dbenv); + ENV_REQUIRES_CONFIG(dbenv, dbenv->mp_handle, DB_INIT_MPOOL); + + dbmp = dbenv->mp_handle; + + /* Validate arguments. */ + if ((ret = __db_fchk(dbenv, "memp_fopen", flags, + DB_CREATE | + DB_NOMMAP | DB_ODDFILESIZE | DB_RDONLY | DB_TRUNCATE)) != 0) + return (ret); + + /* Require a non-zero pagesize. */ + if (pagesize == 0 || + (finfop != NULL && finfop->clear_len > pagesize)) { + __db_err(dbenv, "memp_fopen: illegal page size."); + return (EINVAL); + } + + return (__memp_fopen(dbmp, + NULL, path, flags, mode, pagesize, 1, finfop, retp)); +} + +/* + * __memp_set_unlink -- set unlink on last close flag. + * + * PUBLIC: void __memp_set_unlink __P((DB_MPOOLFILE *)); + */ +void +__memp_set_unlink(dbmpf) + DB_MPOOLFILE *dbmpf; +{ + DB_MPOOL *dbmp; + dbmp = dbmpf->dbmp; + + R_LOCK(dbmp->dbenv, dbmp->reginfo); + F_SET(dbmpf->mfp, MP_UNLINK); + R_UNLOCK(dbmp->dbenv, dbmp->reginfo); +} + +/* + * __memp_clear_unlink -- clear unlink on last close flag. + * + * PUBLIC: void __memp_clear_unlink __P((DB_MPOOLFILE *)); + */ +void +__memp_clear_unlink(dbmpf) + DB_MPOOLFILE *dbmpf; +{ + DB_MPOOL *dbmp; + dbmp = dbmpf->dbmp; + + /* + * This bit is protected in the queue code because the metapage + * is locked so we can avoid getting the region lock. 
+ * If this gets used from other than the queue code, we cannot. + */ + if (!F_ISSET(dbmpf->mfp, MP_UNLINK)) + return; + R_LOCK(dbmp->dbenv, dbmp->reginfo); + F_CLR(dbmpf->mfp, MP_UNLINK); + R_UNLOCK(dbmp->dbenv, dbmp->reginfo); +} + +/* + * __memp_fopen -- + * Open a backing file for the memory pool; internal version. + * + * PUBLIC: int __memp_fopen __P((DB_MPOOL *, MPOOLFILE *, const char *, + * PUBLIC: u_int32_t, int, size_t, int, DB_MPOOL_FINFO *, DB_MPOOLFILE **)); + */ +int +__memp_fopen(dbmp, mfp, path, flags, mode, pagesize, needlock, finfop, retp) + DB_MPOOL *dbmp; + MPOOLFILE *mfp; + const char *path; + u_int32_t flags; + int mode, needlock; + size_t pagesize; + DB_MPOOL_FINFO *finfop; + DB_MPOOLFILE **retp; +{ + DB_ENV *dbenv; + DB_MPOOLFILE *dbmfp; + DB_MPOOL_FINFO finfo; + db_pgno_t last_pgno; + size_t maxmap; + u_int32_t mbytes, bytes, oflags; + int ret; + u_int8_t idbuf[DB_FILE_ID_LEN]; + char *rpath; + + dbenv = dbmp->dbenv; + ret = 0; + rpath = NULL; + + /* + * If mfp is provided, we take the DB_MPOOL_FINFO information from + * the mfp. We don't bother initializing everything, because some + * of them are expensive to acquire. If no mfp is provided and the + * finfop argument is NULL, we default the values. + */ + if (finfop == NULL) { + memset(&finfo, 0, sizeof(finfo)); + if (mfp != NULL) { + finfo.ftype = mfp->ftype; + finfo.pgcookie = NULL; + finfo.fileid = NULL; + finfo.lsn_offset = mfp->lsn_off; + finfo.clear_len = mfp->clear_len; + } else { + finfo.ftype = 0; + finfo.pgcookie = NULL; + finfo.fileid = NULL; + finfo.lsn_offset = -1; + finfo.clear_len = 0; + } + finfop = &finfo; + } + + /* Allocate and initialize the per-process structure. 
*/ + if ((ret = __os_calloc(dbenv, 1, sizeof(DB_MPOOLFILE), &dbmfp)) != 0) + return (ret); + dbmfp->dbmp = dbmp; + dbmfp->ref = 1; + if (LF_ISSET(DB_RDONLY)) + F_SET(dbmfp, MP_READONLY); + + if (path == NULL) { + if (LF_ISSET(DB_RDONLY)) { + __db_err(dbenv, + "memp_fopen: temporary files can't be readonly"); + ret = EINVAL; + goto err; + } + last_pgno = 0; + } else { + /* Get the real name for this file and open it. */ + if ((ret = __db_appname(dbenv, + DB_APP_DATA, NULL, path, 0, NULL, &rpath)) != 0) + goto err; + oflags = 0; + if (LF_ISSET(DB_CREATE)) + oflags |= DB_OSO_CREATE; + if (LF_ISSET(DB_RDONLY)) + oflags |= DB_OSO_RDONLY; + if ((ret = + __os_open(dbenv, rpath, oflags, mode, &dbmfp->fh)) != 0) { + if (!LF_ISSET(DB_EXTENT)) + __db_err(dbenv, + "%s: %s", rpath, db_strerror(ret)); + goto err; + } + + /* + * Don't permit files that aren't a multiple of the pagesize, + * and find the number of the last page in the file, all the + * time being careful not to overflow 32 bits. + * + * !!! + * We can't use off_t's here, or in any code in the mainline + * library for that matter. (We have to use them in the os + * stubs, of course, as there are system calls that take them + * as arguments.) The reason is that some customers build in + * environments where an off_t is 32-bits, but still run where + * offsets are 64-bits, and they pay us a lot of money. + */ + if ((ret = __os_ioinfo(dbenv, rpath, + &dbmfp->fh, &mbytes, &bytes, NULL)) != 0) { + __db_err(dbenv, "%s: %s", rpath, db_strerror(ret)); + goto err; + } + + /* + * If we're doing a verify, we might have to cope with + * a truncated file; if the file size is not a multiple + * of the page size, round down to a page--we'll + * take care of the partial page outside the memp system. + */ + + /* Page sizes have to be a power-of-two, ignore mbytes. 
*/ + if (bytes % pagesize != 0) { + if (LF_ISSET(DB_ODDFILESIZE)) + /* + * If we're doing a verify, we might + * have to cope with a truncated file; + * round down, we'll worry about the partial + * page outside the memp system. + */ + bytes -= (bytes % pagesize); + else { + __db_err(dbenv, + "%s: file size not a multiple of the pagesize", + rpath); + ret = EINVAL; + goto err; + } + } + + last_pgno = mbytes * (MEGABYTE / pagesize); + last_pgno += bytes / pagesize; + + /* Correction: page numbers are zero-based, not 1-based. */ + if (last_pgno != 0) + --last_pgno; + + /* + * Get the file id if we weren't given one. Generated file id's + * don't use timestamps, otherwise there'd be no chance of any + * other process joining the party. + */ + if (finfop->fileid == NULL) { + if ((ret = __os_fileid(dbenv, rpath, 0, idbuf)) != 0) + goto err; + finfop->fileid = idbuf; + } + } + + /* + * If we weren't provided an underlying shared object to join with, + * find/allocate the shared file objects. Also allocate space + * for the per-process thread lock. + */ + if (needlock) + R_LOCK(dbenv, dbmp->reginfo); + if (mfp == NULL) + ret = __memp_mf_open( + dbmp, path, pagesize, last_pgno, finfop, flags, &mfp); + else { + ++mfp->mpf_cnt; + ret = 0; + } + if (needlock) + R_UNLOCK(dbenv, dbmp->reginfo); + if (ret != 0) + goto err; + + if (F_ISSET(dbenv, DB_ENV_THREAD)) { + if ((ret = __db_mutex_alloc( + dbenv, dbmp->reginfo, &dbmfp->mutexp)) != 0) + goto err; + if ((ret = __db_mutex_init( + dbenv, dbmfp->mutexp, 0, MUTEX_THREAD)) != 0) + goto err; + + /* XXX: KEITH: CLOSE THE FILE ON FAILURE? */ + } + + dbmfp->mfp = mfp; + + /* + * If a file: + * + is read-only + * + isn't temporary + * + doesn't require any pgin/pgout support + * + the DB_NOMMAP flag wasn't set (in either the file open or + * the environment in which it was opened) + * + and is less than mp_mmapsize bytes in size + * + * we can mmap it instead of reading/writing buffers. 
Don't do error + * checking based on the mmap call failure. We want to do normal I/O + * on the file if the reason we failed was because the file was on an + * NFS mounted partition, and we can fail in buffer I/O just as easily + * as here. + * + * XXX + * We'd like to test to see if the file is too big to mmap. Since we + * don't know what size or type off_t's or size_t's are, or the largest + * unsigned integral type is, or what random insanity the local C + * compiler will perpetrate, doing the comparison in a portable way is + * flatly impossible. Hope that mmap fails if the file is too large. + */ +#define DB_MAXMMAPSIZE (10 * 1024 * 1024) /* 10 Mb. */ + if (F_ISSET(mfp, MP_CAN_MMAP)) { + if (!F_ISSET(dbmfp, MP_READONLY)) + F_CLR(mfp, MP_CAN_MMAP); + if (path == NULL) + F_CLR(mfp, MP_CAN_MMAP); + if (finfop->ftype != 0) + F_CLR(mfp, MP_CAN_MMAP); + if (LF_ISSET(DB_NOMMAP) || F_ISSET(dbenv, DB_ENV_NOMMAP)) + F_CLR(mfp, MP_CAN_MMAP); + maxmap = dbenv->mp_mmapsize == 0 ? + DB_MAXMMAPSIZE : dbenv->mp_mmapsize; + if (mbytes > maxmap / MEGABYTE || + (mbytes == maxmap / MEGABYTE && bytes >= maxmap % MEGABYTE)) + F_CLR(mfp, MP_CAN_MMAP); + } + dbmfp->addr = NULL; + if (F_ISSET(mfp, MP_CAN_MMAP)) { + dbmfp->len = (size_t)mbytes * MEGABYTE + bytes; + if (__os_mapfile(dbenv, rpath, + &dbmfp->fh, dbmfp->len, 1, &dbmfp->addr) != 0) { + dbmfp->addr = NULL; + F_CLR(mfp, MP_CAN_MMAP); + } + } + if (rpath != NULL) + __os_freestr(rpath); + + MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp); + TAILQ_INSERT_TAIL(&dbmp->dbmfq, dbmfp, q); + MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp); + + *retp = dbmfp; + return (0); + +err: /* + * Note that we do not have to free the thread mutex, because we + * never get to here after we have successfully allocated it. 
+ */ + if (rpath != NULL) + __os_freestr(rpath); + if (F_ISSET(&dbmfp->fh, DB_FH_VALID)) + (void)__os_closehandle(&dbmfp->fh); + if (dbmfp != NULL) { + if (dbmfp->mutexp != NULL) + __db_mutex_free(dbenv, dbmp->reginfo, dbmfp->mutexp); + __os_free(dbmfp, sizeof(DB_MPOOLFILE)); + } + return (ret); +} + +/* + * __memp_mf_open -- + * Open an MPOOLFILE. + */ +static int +__memp_mf_open(dbmp, path, pagesize, last_pgno, finfop, flags, retp) + DB_MPOOL *dbmp; + const char *path; + size_t pagesize; + db_pgno_t last_pgno; + DB_MPOOL_FINFO *finfop; + u_int32_t flags; + MPOOLFILE **retp; +{ + MPOOL *mp; + MPOOLFILE *mfp; + int ret; + void *p; + +#define ISTEMPORARY (path == NULL) + + /* + * If not creating a temporary file, walk the list of MPOOLFILE's, + * looking for a matching file. Files backed by temporary files + * or previously removed files can't match. + * + * DB_TRUNCATE support. + * + * The fileID is a filesystem unique number (e.g., a UNIX dev/inode + * pair) plus a timestamp. If files are removed and created in less + * than a second, the fileID can be repeated. The problem with + * repetition happens when the file that previously had the fileID + * value still has pages in the pool, since we don't want to use them + * to satisfy requests for the new file. + * + * Because the DB_TRUNCATE flag reuses the dev/inode pair, repeated + * opens with that flag set guarantees matching fileIDs when the + * machine can open a file and then re-open with truncate within a + * second. For this reason, we pass that flag down, and, if we find + * a matching entry, we ensure that it's never found again, and we + * create a new entry for the current request. 
+ */ + if (!ISTEMPORARY) { + mp = dbmp->reginfo[0].primary; + for (mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile); + mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) { + if (F_ISSET(mfp, MP_DEADFILE | MP_TEMP)) + continue; + if (memcmp(finfop->fileid, R_ADDR(dbmp->reginfo, + mfp->fileid_off), DB_FILE_ID_LEN) == 0) { + if (LF_ISSET(DB_TRUNCATE)) { + MEMP_FREMOVE(mfp); + continue; + } + if (finfop->clear_len != mfp->clear_len || + pagesize != mfp->stat.st_pagesize) { + __db_err(dbmp->dbenv, + "%s: page size or clear length changed", + path); + return (EINVAL); + } + + /* + * It's possible that our needs for pre- and + * post-processing are changing. For example, + * an application created a hash subdatabase + * in a database that was previously all btree. + */ + if (finfop->ftype != 0) + mfp->ftype = finfop->ftype; + + ++mfp->mpf_cnt; + + *retp = mfp; + return (0); + } + } + } + + /* Allocate a new MPOOLFILE. */ + if ((ret = __memp_alloc( + dbmp, dbmp->reginfo, NULL, sizeof(MPOOLFILE), NULL, &mfp)) != 0) + goto mem_err; + *retp = mfp; + + /* Initialize the structure. */ + memset(mfp, 0, sizeof(MPOOLFILE)); + mfp->mpf_cnt = 1; + mfp->ftype = finfop->ftype; + mfp->lsn_off = finfop->lsn_offset; + mfp->clear_len = finfop->clear_len; + + /* + * If the user specifies DB_MPOOL_LAST or DB_MPOOL_NEW on a memp_fget, + * we have to know the last page in the file. Figure it out and save + * it away. + */ + mfp->stat.st_pagesize = pagesize; + mfp->orig_last_pgno = mfp->last_pgno = last_pgno; + + if (ISTEMPORARY) + F_SET(mfp, MP_TEMP); + else { + /* Copy the file path into shared memory. */ + if ((ret = __memp_alloc(dbmp, dbmp->reginfo, + NULL, strlen(path) + 1, &mfp->path_off, &p)) != 0) + goto err; + memcpy(p, path, strlen(path) + 1); + + /* Copy the file identification string into shared memory. 
*/ + if ((ret = __memp_alloc(dbmp, dbmp->reginfo, + NULL, DB_FILE_ID_LEN, &mfp->fileid_off, &p)) != 0) + goto err; + memcpy(p, finfop->fileid, DB_FILE_ID_LEN); + + F_SET(mfp, MP_CAN_MMAP); + } + + /* Copy the page cookie into shared memory. */ + if (finfop->pgcookie == NULL || finfop->pgcookie->size == 0) { + mfp->pgcookie_len = 0; + mfp->pgcookie_off = 0; + } else { + if ((ret = __memp_alloc(dbmp, dbmp->reginfo, + NULL, finfop->pgcookie->size, &mfp->pgcookie_off, &p)) != 0) + goto err; + memcpy(p, finfop->pgcookie->data, finfop->pgcookie->size); + mfp->pgcookie_len = finfop->pgcookie->size; + } + + /* Prepend the MPOOLFILE to the list of MPOOLFILE's. */ + mp = dbmp->reginfo[0].primary; + SH_TAILQ_INSERT_HEAD(&mp->mpfq, mfp, q, __mpoolfile); + + if (0) { +err: if (mfp->path_off != 0) + __db_shalloc_free(dbmp->reginfo[0].addr, + R_ADDR(dbmp->reginfo, mfp->path_off)); + if (mfp->fileid_off != 0) + __db_shalloc_free(dbmp->reginfo[0].addr, + R_ADDR(dbmp->reginfo, mfp->fileid_off)); + if (mfp != NULL) + __db_shalloc_free(dbmp->reginfo[0].addr, mfp); +mem_err: __db_err(dbmp->dbenv, + "Unable to allocate memory for mpool file"); + } + return (ret); +} + +/* + * memp_fclose -- + * Close a backing file for the memory pool. + */ +int +memp_fclose(dbmfp) + DB_MPOOLFILE *dbmfp; +{ + DB_ENV *dbenv; + DB_MPOOL *dbmp; + MPOOLFILE *mfp; + char *rpath; + int ret, t_ret; + + dbmp = dbmfp->dbmp; + dbenv = dbmp->dbenv; + ret = 0; + + PANIC_CHECK(dbenv); + +#ifdef HAVE_RPC + if (F_ISSET(dbenv, DB_ENV_RPCCLIENT)) + return (__dbcl_memp_fclose(dbmfp)); +#endif + + /* + * Remove the DB_MPOOLFILE from the queue. This has to happen before + * we perform any action that can fail, otherwise __memp_close may + * loop infinitely when calling us to discard all of the DB_MPOOLFILEs. + */ + for (;;) { + MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp); + + /* + * We have to reference count DB_MPOOLFILE structures as other + * threads may be using them. 
The problem only happens if the + * application makes a bad design choice. Here's the path: + * + * Thread A opens a database. + * Thread B uses thread A's DB_MPOOLFILE to write a buffer + * in order to free up memory in the mpool cache. + * Thread A closes the database while thread B is using the + * DB_MPOOLFILE structure. + * + * By opening all databases before creating the threads, and + * closing them after the threads have exited, applications + * get better performance and avoid the problem path entirely. + * + * Regardless, holding the DB_MPOOLFILE to flush a dirty buffer + * is a short-term lock, even in worst case, since we better be + * the only thread of control using the DB_MPOOLFILE structure + * to read pages *into* the cache. Wait until we're the only + * reference holder and remove the DB_MPOOLFILE structure from + * the list, so nobody else can even find it. + */ + if (dbmfp->ref == 1) { + TAILQ_REMOVE(&dbmp->dbmfq, dbmfp, q); + break; + } + MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp); + + (void)__os_sleep(dbenv, 1, 0); + } + MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp); + + /* Complain if pinned blocks never returned. */ + if (dbmfp->pinref != 0) + __db_err(dbenv, "%s: close: %lu blocks left pinned", + __memp_fn(dbmfp), (u_long)dbmfp->pinref); + + /* Discard any mmap information. */ + if (dbmfp->addr != NULL && + (ret = __os_unmapfile(dbenv, dbmfp->addr, dbmfp->len)) != 0) + __db_err(dbenv, "%s: %s", __memp_fn(dbmfp), db_strerror(ret)); + + /* Close the file; temporary files may not yet have been created. */ + if (F_ISSET(&dbmfp->fh, DB_FH_VALID) && + (t_ret = __os_closehandle(&dbmfp->fh)) != 0) { + __db_err(dbenv, "%s: %s", __memp_fn(dbmfp), db_strerror(t_ret)); + if (ret != 0) + t_ret = ret; + } + + /* Discard the thread mutex. */ + if (dbmfp->mutexp != NULL) + __db_mutex_free(dbenv, dbmp->reginfo, dbmfp->mutexp); + + /* + * Discard our reference on the underlying MPOOLFILE, and close + * it if it's no longer useful to anyone. 
+ * + * If we're not discarding it, and it's a temp file, this means + * all the outstanding references belong to unflushed buffers. + * (A temp file can only be referenced by one DB_MPOOLFILE). + * We don't care about preserving any of those buffers, so mark + * the MPOOLFILE as dead so that when we try to flush them, + * even the dirty ones just get discarded. + */ + R_LOCK(dbenv, dbmp->reginfo); + mfp = dbmfp->mfp; + if (--mfp->mpf_cnt == 0) { + if (F_ISSET(mfp, MP_UNLINK)) { + MEMP_FREMOVE(mfp); + if ((t_ret = __db_appname(dbmp->dbenv, + DB_APP_DATA, NULL, R_ADDR(dbmp->reginfo, + mfp->path_off), 0, NULL, &rpath)) != 0 && ret == 0) + ret = t_ret; + if (t_ret == 0 && (t_ret = + __os_unlink(dbmp->dbenv, rpath) != 0 && ret == 0)) + ret = t_ret; + __os_free(rpath, 0); + } + if (mfp->block_cnt == 0) + __memp_mf_discard(dbmp, mfp); + } + else if (F_ISSET(mfp, MP_TEMP)) + MEMP_FREMOVE(mfp); + R_UNLOCK(dbenv, dbmp->reginfo); + + /* Discard the DB_MPOOLFILE structure. */ + __os_free(dbmfp, sizeof(DB_MPOOLFILE)); + + return (ret); +} + +/* + * __memp_mf_discard -- + * Discard an MPOOLFILE. + * + * PUBLIC: void __memp_mf_discard __P((DB_MPOOL *, MPOOLFILE *)); + */ +void +__memp_mf_discard(dbmp, mfp) + DB_MPOOL *dbmp; + MPOOLFILE *mfp; +{ + MPOOL *mp; + + mp = dbmp->reginfo[0].primary; + + /* Delete from the list of MPOOLFILEs. */ + SH_TAILQ_REMOVE(&mp->mpfq, mfp, q, __mpoolfile); + + /* Free the space. */ + if (mfp->path_off != 0) + __db_shalloc_free(dbmp->reginfo[0].addr, + R_ADDR(dbmp->reginfo, mfp->path_off)); + if (mfp->fileid_off != 0) + __db_shalloc_free(dbmp->reginfo[0].addr, + R_ADDR(dbmp->reginfo, mfp->fileid_off)); + if (mfp->pgcookie_off != 0) + __db_shalloc_free(dbmp->reginfo[0].addr, + R_ADDR(dbmp->reginfo, mfp->pgcookie_off)); + __db_shalloc_free(dbmp->reginfo[0].addr, mfp); +} + +/* + * __memp_fremove -- + * Remove an underlying file from the system. 
+ * + * PUBLIC: int __memp_fremove __P((DB_MPOOLFILE *)); + */ +int +__memp_fremove(dbmfp) + DB_MPOOLFILE *dbmfp; +{ + DB_ENV *dbenv; + DB_MPOOL *dbmp; + MPOOLFILE *mfp; + + dbmp = dbmfp->dbmp; + dbenv = dbmp->dbenv; + mfp = dbmfp->mfp; + + PANIC_CHECK(dbenv); + + R_LOCK(dbenv, dbmp->reginfo); + + MEMP_FREMOVE(mfp); + + R_UNLOCK(dbenv, dbmp->reginfo); + + return (0); +} + +/* + * __memp_fn -- + * On errors we print whatever is available as the file name. + * + * PUBLIC: char * __memp_fn __P((DB_MPOOLFILE *)); + */ +char * +__memp_fn(dbmfp) + DB_MPOOLFILE *dbmfp; +{ + return (__memp_fns(dbmfp->dbmp, dbmfp->mfp)); +} + +/* + * __memp_fns -- + * On errors we print whatever is available as the file name. + * + * PUBLIC: char * __memp_fns __P((DB_MPOOL *, MPOOLFILE *)); + * + */ +char * +__memp_fns(dbmp, mfp) + DB_MPOOL *dbmp; + MPOOLFILE *mfp; +{ + if (mfp->path_off == 0) + return ((char *)"temporary"); + + return ((char *)R_ADDR(dbmp->reginfo, mfp->path_off)); +} diff --git a/bdb/mp/mp_fput.c b/bdb/mp/mp_fput.c new file mode 100644 index 00000000000..be03b721f36 --- /dev/null +++ b/bdb/mp/mp_fput.c @@ -0,0 +1,186 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Sleepycat Software. All rights reserved. + */ +#include "db_config.h" + +#ifndef lint +static const char revid[] = "$Id: mp_fput.c,v 11.16 2000/11/30 00:58:41 ubell Exp $"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#endif + +#ifdef HAVE_RPC +#include "db_server.h" +#endif + +#include "db_int.h" +#include "db_shash.h" +#include "mp.h" + +#ifdef HAVE_RPC +#include "gen_client_ext.h" +#include "rpc_client_ext.h" +#endif + +/* + * memp_fput -- + * Mpool file put function. 
+ */ +int +memp_fput(dbmfp, pgaddr, flags) + DB_MPOOLFILE *dbmfp; + void *pgaddr; + u_int32_t flags; +{ + BH *bhp; + DB_ENV *dbenv; + DB_MPOOL *dbmp; + MPOOL *c_mp, *mp; + int ret, wrote; + + dbmp = dbmfp->dbmp; + dbenv = dbmp->dbenv; + mp = dbmp->reginfo[0].primary; + +#ifdef HAVE_RPC + if (F_ISSET(dbenv, DB_ENV_RPCCLIENT)) + return (__dbcl_memp_fput(dbmfp, pgaddr, flags)); +#endif + + PANIC_CHECK(dbenv); + + /* Validate arguments. */ + if (flags) { + if ((ret = __db_fchk(dbenv, "memp_fput", flags, + DB_MPOOL_CLEAN | DB_MPOOL_DIRTY | DB_MPOOL_DISCARD)) != 0) + return (ret); + if ((ret = __db_fcchk(dbenv, "memp_fput", + flags, DB_MPOOL_CLEAN, DB_MPOOL_DIRTY)) != 0) + return (ret); + + if (LF_ISSET(DB_MPOOL_DIRTY) && F_ISSET(dbmfp, MP_READONLY)) { + __db_err(dbenv, + "%s: dirty flag set for readonly file page", + __memp_fn(dbmfp)); + return (EACCES); + } + } + + R_LOCK(dbenv, dbmp->reginfo); + + /* Decrement the pinned reference count. */ + if (dbmfp->pinref == 0) { + __db_err(dbenv, + "%s: more pages returned than retrieved", __memp_fn(dbmfp)); + R_UNLOCK(dbenv, dbmp->reginfo); + return (EINVAL); + } else + --dbmfp->pinref; + + /* + * If we're mapping the file, there's nothing to do. Because we can + * stop mapping the file at any time, we have to check on each buffer + * to see if the address we gave the application was part of the map + * region. + */ + if (dbmfp->addr != NULL && pgaddr >= dbmfp->addr && + (u_int8_t *)pgaddr <= (u_int8_t *)dbmfp->addr + dbmfp->len) { + R_UNLOCK(dbenv, dbmp->reginfo); + return (0); + } + + /* Convert the page address to a buffer header. */ + bhp = (BH *)((u_int8_t *)pgaddr - SSZA(BH, buf)); + + /* Convert the buffer header to a cache. */ + c_mp = BH_TO_CACHE(dbmp, bhp); + +/* UNLOCK THE REGION, LOCK THE CACHE. */ + + /* Set/clear the page bits. 
*/ + if (LF_ISSET(DB_MPOOL_CLEAN) && F_ISSET(bhp, BH_DIRTY)) { + ++c_mp->stat.st_page_clean; + --c_mp->stat.st_page_dirty; + F_CLR(bhp, BH_DIRTY); + } + if (LF_ISSET(DB_MPOOL_DIRTY) && !F_ISSET(bhp, BH_DIRTY)) { + --c_mp->stat.st_page_clean; + ++c_mp->stat.st_page_dirty; + F_SET(bhp, BH_DIRTY); + } + if (LF_ISSET(DB_MPOOL_DISCARD)) + F_SET(bhp, BH_DISCARD); + + /* + * If the page is dirty and being scheduled to be written as part of + * a checkpoint, we no longer know that the log is up-to-date. + */ + if (F_ISSET(bhp, BH_DIRTY) && F_ISSET(bhp, BH_SYNC)) + F_SET(bhp, BH_SYNC_LOGFLSH); + + /* + * Check for a reference count going to zero. This can happen if the + * application returns a page twice. + */ + if (bhp->ref == 0) { + __db_err(dbenv, "%s: page %lu: unpinned page returned", + __memp_fn(dbmfp), (u_long)bhp->pgno); + R_UNLOCK(dbenv, dbmp->reginfo); + return (EINVAL); + } + + /* + * If more than one reference to the page, we're done. Ignore the + * discard flags (for now) and leave it at its position in the LRU + * chain. The rest gets done at last reference close. + */ + if (--bhp->ref > 0) { + R_UNLOCK(dbenv, dbmp->reginfo); + return (0); + } + + /* + * Move the buffer to the head/tail of the LRU chain. We do this + * before writing the buffer for checkpoint purposes, as the write + * can discard the region lock and allow another process to acquire + * buffer. We could keep that from happening, but there seems no + * reason to do so. + */ + SH_TAILQ_REMOVE(&c_mp->bhq, bhp, q, __bh); + if (F_ISSET(bhp, BH_DISCARD)) + SH_TAILQ_INSERT_HEAD(&c_mp->bhq, bhp, q, __bh); + else + SH_TAILQ_INSERT_TAIL(&c_mp->bhq, bhp, q); + + /* + * If this buffer is scheduled for writing because of a checkpoint, we + * need to write it (if it's dirty), or update the checkpoint counters + * (if it's not dirty). 
If we try to write it and can't, that's not + * necessarily an error as it's not completely unreasonable that the + * application have permission to write the underlying file, but set a + * flag so that the next time the memp_sync function is called we try + * writing it there, as the checkpoint thread of control better be able + * to write all of the files. + */ + if (F_ISSET(bhp, BH_SYNC)) { + if (F_ISSET(bhp, BH_DIRTY)) { + if (__memp_bhwrite(dbmp, + dbmfp->mfp, bhp, NULL, &wrote) != 0 || !wrote) + F_SET(mp, MP_LSN_RETRY); + } else { + F_CLR(bhp, BH_SYNC); + + --mp->lsn_cnt; + --dbmfp->mfp->lsn_cnt; + } + } + + R_UNLOCK(dbenv, dbmp->reginfo); + return (0); +} diff --git a/bdb/mp/mp_fset.c b/bdb/mp/mp_fset.c new file mode 100644 index 00000000000..08313c9b6f5 --- /dev/null +++ b/bdb/mp/mp_fset.c @@ -0,0 +1,98 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Sleepycat Software. All rights reserved. + */ +#include "db_config.h" + +#ifndef lint +static const char revid[] = "$Id: mp_fset.c,v 11.13 2000/11/30 00:58:41 ubell Exp $"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#endif + +#ifdef HAVE_RPC +#include "db_server.h" +#endif + +#include "db_int.h" +#include "db_shash.h" +#include "mp.h" + +#ifdef HAVE_RPC +#include "gen_client_ext.h" +#include "rpc_client_ext.h" +#endif + +/* + * memp_fset -- + * Mpool page set-flag routine. + */ +int +memp_fset(dbmfp, pgaddr, flags) + DB_MPOOLFILE *dbmfp; + void *pgaddr; + u_int32_t flags; +{ + BH *bhp; + DB_ENV *dbenv; + DB_MPOOL *dbmp; + MPOOL *c_mp, *mp; + int ret; + + dbmp = dbmfp->dbmp; + dbenv = dbmp->dbenv; + mp = dbmp->reginfo[0].primary; + +#ifdef HAVE_RPC + if (F_ISSET(dbenv, DB_ENV_RPCCLIENT)) + return (__dbcl_memp_fset(dbmfp, pgaddr, flags)); +#endif + + PANIC_CHECK(dbenv); + + /* Validate arguments. 
*/ + if (flags == 0) + return (__db_ferr(dbenv, "memp_fset", 1)); + + if ((ret = __db_fchk(dbenv, "memp_fset", flags, + DB_MPOOL_DIRTY | DB_MPOOL_CLEAN | DB_MPOOL_DISCARD)) != 0) + return (ret); + if ((ret = __db_fcchk(dbenv, "memp_fset", + flags, DB_MPOOL_CLEAN, DB_MPOOL_DIRTY)) != 0) + return (ret); + + if (LF_ISSET(DB_MPOOL_DIRTY) && F_ISSET(dbmfp, MP_READONLY)) { + __db_err(dbenv, "%s: dirty flag set for readonly file page", + __memp_fn(dbmfp)); + return (EACCES); + } + + /* Convert the page address to a buffer header. */ + bhp = (BH *)((u_int8_t *)pgaddr - SSZA(BH, buf)); + + /* Convert the buffer header to a cache. */ + c_mp = BH_TO_CACHE(dbmp, bhp); + + R_LOCK(dbenv, dbmp->reginfo); + + if (LF_ISSET(DB_MPOOL_CLEAN) && F_ISSET(bhp, BH_DIRTY)) { + ++c_mp->stat.st_page_clean; + --c_mp->stat.st_page_dirty; + F_CLR(bhp, BH_DIRTY); + } + if (LF_ISSET(DB_MPOOL_DIRTY) && !F_ISSET(bhp, BH_DIRTY)) { + --c_mp->stat.st_page_clean; + ++c_mp->stat.st_page_dirty; + F_SET(bhp, BH_DIRTY); + } + if (LF_ISSET(DB_MPOOL_DISCARD)) + F_SET(bhp, BH_DISCARD); + + R_UNLOCK(dbenv, dbmp->reginfo); + return (0); +} diff --git a/bdb/mp/mp_method.c b/bdb/mp/mp_method.c new file mode 100644 index 00000000000..85a6239b032 --- /dev/null +++ b/bdb/mp/mp_method.c @@ -0,0 +1,115 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Sleepycat Software. All rights reserved. 
+ */
+#include "db_config.h"
+
+#ifndef lint
+static const char revid[] = "$Id: mp_method.c,v 11.10 2000/04/04 20:12:04 bostic Exp $";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+#endif
+
+#ifdef HAVE_RPC
+#include "db_server.h"
+#endif
+
+#include "db_int.h"
+#include "db_shash.h"
+#include "mp.h"
+
+#ifdef HAVE_RPC
+#include "gen_client_ext.h"
+#include "rpc_client_ext.h"
+#endif
+
+static int __memp_set_cachesize __P((DB_ENV *, u_int32_t, u_int32_t, int));
+static int __memp_set_mp_mmapsize __P((DB_ENV *, size_t));
+
+/*
+ * __memp_dbenv_create --
+ *	Mpool specific creation of the DB_ENV structure.
+ *
+ *	Stores the default cache size and cache count in the handle and
+ *	installs the set_cachesize/set_mp_mmapsize methods; for an RPC
+ *	client the client-side versions are installed instead.
+ *
+ * PUBLIC: void __memp_dbenv_create __P((DB_ENV *));
+ */
+void
+__memp_dbenv_create(dbenv)
+	DB_ENV *dbenv;
+{
+	/*
+	 * We default to 32 8K pages.  We don't default to a flat 256K, because
+	 * some systems require significantly more memory to hold 32 pages than
+	 * others.  For example, HP-UX with POSIX pthreads needs 88 bytes for
+	 * a POSIX pthread mutex and almost 200 bytes per buffer header, while
+	 * Solaris needs 24 and 52 bytes for the same structures.
+	 */
+	dbenv->mp_bytes = 32 * ((8 * 1024) + sizeof(BH));
+	dbenv->mp_ncache = 1;
+
+	dbenv->set_mp_mmapsize = __memp_set_mp_mmapsize;
+	dbenv->set_cachesize = __memp_set_cachesize;
+
+#ifdef HAVE_RPC
+	/*
+	 * If we have a client, overwrite what we just setup to
+	 * point to client functions.
+	 */
+	if (F_ISSET(dbenv, DB_ENV_RPCCLIENT)) {
+		dbenv->set_cachesize = __dbcl_env_cachesize;
+		dbenv->set_mp_mmapsize = __dbcl_set_mp_mmapsize;
+	}
+#endif
+
+}
+
+/*
+ * __memp_set_cachesize --
+ *	Initialize the cache size.
+ *
+ *	The request is normalized so that mp_bytes is always less than one
+ *	gigabyte (any excess is carried into mp_gbytes), and an ncache of 0
+ *	is treated as 1.
+ */
+static int
+__memp_set_cachesize(dbenv, gbytes, bytes, ncache)
+	DB_ENV *dbenv;
+	u_int32_t gbytes, bytes;
+	int ncache;
+{
+	ENV_ILLEGAL_AFTER_OPEN(dbenv, "set_cachesize");
+
+	dbenv->mp_gbytes = gbytes + bytes / GIGABYTE;
+	dbenv->mp_bytes = bytes % GIGABYTE;
+	dbenv->mp_ncache = ncache == 0 ? 1 : ncache;
+
+	/*
+	 * If the application requested less than 500Mb, increase the
+	 * cachesize by 25% to account for our overhead.  (I'm guessing
+	 * that caches over 500Mb are specifically sized, i.e., it's
+	 * a large server and the application actually knows how much
+	 * memory is available.)
+	 *
+	 * There is a minimum cache size, regardless.
+	 */
+	if (dbenv->mp_gbytes == 0) {
+		if (dbenv->mp_bytes < 500 * MEGABYTE)
+			dbenv->mp_bytes += dbenv->mp_bytes / 4;
+		if (dbenv->mp_bytes < DB_CACHESIZE_MIN)
+			dbenv->mp_bytes = DB_CACHESIZE_MIN;
+	}
+
+	return (0);
+}
+
+/*
+ * __memp_set_mp_mmapsize --
+ *	Set the maximum mapped file size.
+ */
+static int
+__memp_set_mp_mmapsize(dbenv, mp_mmapsize)
+	DB_ENV *dbenv;
+	size_t mp_mmapsize;
+{
+	dbenv->mp_mmapsize = mp_mmapsize;
+	return (0);
+}
diff --git a/bdb/mp/mp_region.c b/bdb/mp/mp_region.c
new file mode 100644
index 00000000000..4b85466ce63
--- /dev/null
+++ b/bdb/mp/mp_region.c
@@ -0,0 +1,357 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ *	Sleepycat Software.  All rights reserved.
+ */
+#include "db_config.h"
+
+#ifndef lint
+static const char revid[] = "$Id: mp_region.c,v 11.26 2000/11/30 00:58:41 ubell Exp $";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <string.h>
+#endif
+
+#include "db_int.h"
+#include "db_shash.h"
+#include "mp.h"
+
+static int __mpool_init __P((DB_ENV *, DB_MPOOL *, int, int));
+#ifdef MUTEX_SYSTEM_RESOURCES
+static size_t __mpool_region_maint __P((REGINFO *));
+#endif
+
+/*
+ * __memp_open --
+ *	Internal version of memp_open: only called from DB_ENV->open.
+ *
+ *	Allocates the per-process DB_MPOOL handle, joins or creates the
+ *	first cache region, then creates (if we created the first region)
+ *	or joins (otherwise) the remaining regions.  On success the handle
+ *	is stored in dbenv->mp_handle and 0 is returned.  On failure
+ *	everything acquired here is released and the error is returned; if
+ *	this call created the first region the error is passed through
+ *	__db_panic first.
+ *
+ * PUBLIC: int __memp_open __P((DB_ENV *));
+ */
+int
+__memp_open(dbenv)
+	DB_ENV *dbenv;
+{
+	DB_MPOOL *dbmp;
+	MPOOL *mp;
+	REGINFO reginfo;
+	roff_t reg_size, *regids;
+	u_int32_t i;
+	int htab_buckets, ret;
+
+	/* Figure out how big each cache region is. */
+	reg_size = (dbenv->mp_gbytes / dbenv->mp_ncache) * GIGABYTE;
+	reg_size += ((dbenv->mp_gbytes %
+	    dbenv->mp_ncache) * GIGABYTE) / dbenv->mp_ncache;
+	reg_size += dbenv->mp_bytes / dbenv->mp_ncache;
+
+	/*
+	 * Figure out how many hash buckets each region will have.  Assume we
+	 * want to keep the hash chains with under 10 pages on each chain.  We
+	 * don't know the pagesize in advance, and it may differ for different
+	 * files.  Use a pagesize of 1K for the calculation -- we walk these
+	 * chains a lot, they must be kept short.
+	 */
+	htab_buckets = __db_tablesize((reg_size / (1 * 1024)) / 10);
+
+	/* Create and initialize the DB_MPOOL structure. */
+	if ((ret = __os_calloc(dbenv, 1, sizeof(*dbmp), &dbmp)) != 0)
+		return (ret);
+	LIST_INIT(&dbmp->dbregq);
+	TAILQ_INIT(&dbmp->dbmfq);
+	dbmp->dbenv = dbenv;
+
+	/* Join/create the first mpool region. */
+	memset(&reginfo, 0, sizeof(REGINFO));
+	reginfo.type = REGION_TYPE_MPOOL;
+	reginfo.id = INVALID_REGION_ID;
+	reginfo.mode = dbenv->db_mode;
+	reginfo.flags = REGION_JOIN_OK;
+	if (F_ISSET(dbenv, DB_ENV_CREATE))
+		F_SET(&reginfo, REGION_CREATE_OK);
+	if ((ret = __db_r_attach(dbenv, &reginfo, reg_size)) != 0)
+		goto err;
+
+	/*
+	 * If we created the region, initialize it.  Create or join any
+	 * additional regions.
+	 */
+	if (F_ISSET(&reginfo, REGION_CREATE)) {
+		/*
+		 * We define how many regions there are going to be, allocate
+		 * the REGINFO structures and create them.  Make sure we don't
+		 * clear the wrong entries on error.
+		 */
+		dbmp->nreg = dbenv->mp_ncache;
+		if ((ret = __os_calloc(dbenv,
+		    dbmp->nreg, sizeof(REGINFO), &dbmp->reginfo)) != 0)
+			goto err;
+		/* Make sure we don't clear the wrong entries on error. */
+		for (i = 0; i < dbmp->nreg; ++i)
+			dbmp->reginfo[i].id = INVALID_REGION_ID;
+		dbmp->reginfo[0] = reginfo;
+
+		/* Initialize the first region. */
+		if ((ret = __mpool_init(dbenv, dbmp, 0, htab_buckets)) != 0)
+			goto err;
+
+		/*
+		 * Create/initialize remaining regions and copy their IDs into
+		 * the first region.
+		 */
+		mp = R_ADDR(dbmp->reginfo, dbmp->reginfo[0].rp->primary);
+		regids = R_ADDR(dbmp->reginfo, mp->regids);
+		for (i = 1; i < dbmp->nreg; ++i) {
+			dbmp->reginfo[i].type = REGION_TYPE_MPOOL;
+			dbmp->reginfo[i].id = INVALID_REGION_ID;
+			dbmp->reginfo[i].mode = dbenv->db_mode;
+			dbmp->reginfo[i].flags = REGION_CREATE_OK;
+			if ((ret = __db_r_attach(
+			    dbenv, &dbmp->reginfo[i], reg_size)) != 0)
+				goto err;
+			if ((ret =
+			    __mpool_init(dbenv, dbmp, i, htab_buckets)) != 0)
+				goto err;
+			R_UNLOCK(dbenv, &dbmp->reginfo[i]);
+
+			regids[i] = dbmp->reginfo[i].id;
+		}
+	} else {
+		/*
+		 * Determine how many regions there are going to be, allocate
+		 * the REGINFO structures and fill in local copies of that
+		 * information.
+		 */
+		mp = R_ADDR(&reginfo, reginfo.rp->primary);
+		dbmp->nreg = mp->nreg;
+		if ((ret = __os_calloc(dbenv,
+		    dbmp->nreg, sizeof(REGINFO), &dbmp->reginfo)) != 0)
+			goto err;
+		/* Make sure we don't clear the wrong entries on error. */
+		for (i = 0; i < dbmp->nreg; ++i)
+			dbmp->reginfo[i].id = INVALID_REGION_ID;
+		dbmp->reginfo[0] = reginfo;
+
+		/* Join remaining regions. */
+		regids = R_ADDR(dbmp->reginfo, mp->regids);
+		for (i = 1; i < dbmp->nreg; ++i) {
+			dbmp->reginfo[i].type = REGION_TYPE_MPOOL;
+			dbmp->reginfo[i].id = regids[i];
+			dbmp->reginfo[i].mode = 0;
+			dbmp->reginfo[i].flags = REGION_JOIN_OK;
+			if ((ret = __db_r_attach(
+			    dbenv, &dbmp->reginfo[i], 0)) != 0)
+				goto err;
+			R_UNLOCK(dbenv, &dbmp->reginfo[i]);
+		}
+	}
+
+	/* Set the local addresses for the regions. */
+	for (i = 0; i < dbmp->nreg; ++i)
+		dbmp->reginfo[i].primary =
+		    R_ADDR(&dbmp->reginfo[i], dbmp->reginfo[i].rp->primary);
+
+	/* If the region is threaded, allocate a mutex to lock the handles. */
+	if (F_ISSET(dbenv, DB_ENV_THREAD)) {
+		if ((ret = __db_mutex_alloc(
+		    dbenv, dbmp->reginfo, &dbmp->mutexp)) != 0) {
+			goto err;
+		}
+		if ((ret =
+		    __db_mutex_init(dbenv, dbmp->mutexp, 0, MUTEX_THREAD)) != 0)
+			goto err;
+	}
+
+	R_UNLOCK(dbenv, dbmp->reginfo);
+
+	dbenv->mp_handle = dbmp;
+	return (0);
+
+err:	if (dbmp->reginfo != NULL && dbmp->reginfo[0].addr != NULL) {
+		if (F_ISSET(dbmp->reginfo, REGION_CREATE))
+			ret = __db_panic(dbenv, ret);
+
+		R_UNLOCK(dbenv, dbmp->reginfo);
+
+		/*
+		 * The handle mutex is released through dbmp->reginfo, so it
+		 * has to go before the regions are detached and the REGINFO
+		 * array is freed (the same order __memp_close uses).  The
+		 * mutex is only ever allocated after dbmp->reginfo has been
+		 * set up, so it cannot be non-NULL outside this branch.
+		 */
+		if (dbmp->mutexp != NULL)
+			__db_mutex_free(dbenv, dbmp->reginfo, dbmp->mutexp);
+
+		for (i = 0; i < dbmp->nreg; ++i)
+			if (dbmp->reginfo[i].id != INVALID_REGION_ID)
+				(void)__db_r_detach(
+				    dbenv, &dbmp->reginfo[i], 0);
+		__os_free(dbmp->reginfo,
+		    dbmp->nreg * sizeof(*dbmp->reginfo));
+	}
+	/*
+	 * NOTE(review): if the REGINFO array allocation itself fails, the
+	 * first region was attached through the local `reginfo' and is not
+	 * detached here -- confirm whether __db_r_attach's caller is
+	 * expected to clean that up.
+	 */
+	__os_free(dbmp, sizeof(*dbmp));
+	return (ret);
+}
+
+/*
+ * __mpool_init --
+ *	Initialize a MPOOL structure in shared memory.
+ *
+ *	reginfo_off is the index into dbmp->reginfo of the region to set up;
+ *	htab_buckets is the number of hash buckets to allocate in it.  The
+ *	first region (index 0) additionally gets the file queue, the sync
+ *	mutex, the checkpoint LSN state and the array of region IDs.
+ */
+static int
+__mpool_init(dbenv, dbmp, reginfo_off, htab_buckets)
+	DB_ENV *dbenv;
+	DB_MPOOL *dbmp;
+	int reginfo_off, htab_buckets;
+{
+	DB_HASHTAB *htab;
+	MPOOL *mp;
+	REGINFO *reginfo;
+#ifdef MUTEX_SYSTEM_RESOURCES
+	size_t maint_size;
+#endif
+	int ret;
+	void *p;
+
+	mp = NULL;
+
+	reginfo = &dbmp->reginfo[reginfo_off];
+	if ((ret = __db_shalloc(reginfo->addr,
+	    sizeof(MPOOL), MUTEX_ALIGN, &reginfo->primary)) != 0)
+		goto mem_err;
+	reginfo->rp->primary = R_OFFSET(reginfo, reginfo->primary);
+	mp = reginfo->primary;
+	memset(mp, 0, sizeof(*mp));
+
+#ifdef MUTEX_SYSTEM_RESOURCES
+	maint_size = __mpool_region_maint(reginfo);
+	/* Allocate room for the maintenance info and initialize it. */
+	if ((ret = __db_shalloc(reginfo->addr,
+	    sizeof(REGMAINT) + maint_size, 0, &p)) != 0)
+		goto mem_err;
+	__db_maintinit(reginfo, p, maint_size);
+	mp->maint_off = R_OFFSET(reginfo, p);
+#endif
+
+	if (reginfo_off == 0) {
+		SH_TAILQ_INIT(&mp->mpfq);
+
+		if ((ret = __db_shmutex_init(dbenv, &mp->sync_mutex,
+		    R_OFFSET(dbmp->reginfo, &mp->sync_mutex) +
+		    DB_FCNTL_OFF_MPOOL, 0, dbmp->reginfo,
+		    (REGMAINT *)R_ADDR(dbmp->reginfo, mp->maint_off))) != 0)
+			goto err;
+
+		ZERO_LSN(mp->lsn);
+		mp->lsn_cnt = 0;
+
+		mp->nreg = dbmp->nreg;
+		if ((ret = __db_shalloc(dbmp->reginfo[0].addr,
+		    dbmp->nreg * sizeof(int), 0, &p)) != 0)
+			goto mem_err;
+		mp->regids = R_OFFSET(dbmp->reginfo, p);
+	}
+
+	SH_TAILQ_INIT(&mp->bhq);
+
+	/* Allocate hash table space and initialize it. */
+	if ((ret = __db_shalloc(reginfo->addr,
+	    htab_buckets * sizeof(DB_HASHTAB), 0, &htab)) != 0)
+		goto mem_err;
+	__db_hashinit(htab, htab_buckets);
+	mp->htab = R_OFFSET(reginfo, htab);
+	mp->htab_buckets = htab_buckets;
+
+	return (0);
+
+mem_err:__db_err(dbenv, "Unable to allocate memory for mpool region");
+err:	if (reginfo->primary != NULL)
+		__db_shalloc_free(reginfo->addr, reginfo->primary);
+	return (ret);
+}
+
+/*
+ * __memp_close --
+ *	Internal version of memp_close: only called from DB_ENV->close.
+ *
+ *	Discards the registered pgin/pgout entries, closes any files still
+ *	open in this handle, frees the handle mutex and detaches from every
+ *	region.  Cleanup continues after a failure; the first error seen is
+ *	the one returned.
+ *
+ * PUBLIC: int __memp_close __P((DB_ENV *));
+ */
+int
+__memp_close(dbenv)
+	DB_ENV *dbenv;
+{
+	DB_MPOOL *dbmp;
+	DB_MPOOLFILE *dbmfp;
+	DB_MPREG *mpreg;
+	u_int32_t i;
+	int ret, t_ret;
+
+	ret = 0;
+	dbmp = dbenv->mp_handle;
+
+	/* Discard DB_MPREGs. */
+	while ((mpreg = LIST_FIRST(&dbmp->dbregq)) != NULL) {
+		LIST_REMOVE(mpreg, q);
+		__os_free(mpreg, sizeof(DB_MPREG));
+	}
+
+	/* Discard DB_MPOOLFILEs. */
+	while ((dbmfp = TAILQ_FIRST(&dbmp->dbmfq)) != NULL)
+		if ((t_ret = memp_fclose(dbmfp)) != 0 && ret == 0)
+			ret = t_ret;
+
+	/* Discard the thread mutex. */
+	if (dbmp->mutexp != NULL)
+		__db_mutex_free(dbenv, dbmp->reginfo, dbmp->mutexp);
+
+	/* Detach from the region(s). */
+	for (i = 0; i < dbmp->nreg; ++i)
+		if ((t_ret = __db_r_detach(
+		    dbenv, &dbmp->reginfo[i], 0)) != 0 && ret == 0)
+			ret = t_ret;
+
+	__os_free(dbmp->reginfo, dbmp->nreg * sizeof(*dbmp->reginfo));
+	__os_free(dbmp, sizeof(*dbmp));
+
+	dbenv->mp_handle = NULL;
+	return (ret);
+}
+
+#ifdef MUTEX_SYSTEM_RESOURCES
+/*
+ * __mpool_region_maint --
+ *	Return the amount of space needed for region maintenance info.
+ *
+ */
+static size_t
+__mpool_region_maint(infop)
+	REGINFO *infop;
+{
+	size_t s;
+	int numlocks;
+
+	/*
+	 * For mutex maintenance we need one mutex per possible page.
+	 * Compute the maximum number of pages this cache can have.
+	 * Also add in an mpool mutex.
+	 */
+	numlocks = ((infop->rp->size / DB_MIN_PGSIZE) + 1);
+	s = sizeof(roff_t) * numlocks;
+	return (s);
+}
+#endif
+
+/*
+ * __mpool_region_destroy
+ *	Destroy any region maintenance info.
+ *
+ * PUBLIC: void __mpool_region_destroy __P((DB_ENV *, REGINFO *));
+ */
+void
+__mpool_region_destroy(dbenv, infop)
+	DB_ENV *dbenv;
+	REGINFO *infop;
+{
+	MPOOL *mp;
+
+	COMPQUIET(dbenv, NULL);
+	mp = R_ADDR(infop, infop->rp->primary);
+
+	__db_shlocks_destroy(infop, (REGMAINT *)R_ADDR(infop, mp->maint_off));
+	return;
+}
diff --git a/bdb/mp/mp_register.c b/bdb/mp/mp_register.c
new file mode 100644
index 00000000000..27859f69d7b
--- /dev/null
+++ b/bdb/mp/mp_register.c
@@ -0,0 +1,85 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ *	Sleepycat Software.  All rights reserved.
+ */
+#include "db_config.h"
+
+#ifndef lint
+static const char revid[] = "$Id: mp_register.c,v 11.12 2000/11/15 19:25:39 sue Exp $";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+#endif
+
+#ifdef HAVE_RPC
+#include "db_server.h"
+#endif
+
+#include "db_int.h"
+#include "db_shash.h"
+#include "mp.h"
+
+#ifdef HAVE_RPC
+#include "gen_client_ext.h"
+#include "rpc_client_ext.h"
+#endif
+
+/*
+ * memp_register --
+ *	Register a file type's pgin, pgout routines.
+ *
+ *	If an entry for ftype already exists in this handle its routines are
+ *	replaced, otherwise a new entry is added.  Returns 0 on success or
+ *	the error from allocating the new entry.
+ */
+int
+memp_register(dbenv, ftype, pgin, pgout)
+	DB_ENV *dbenv;
+	int ftype;
+	int (*pgin) __P((DB_ENV *, db_pgno_t, void *, DBT *));
+	int (*pgout) __P((DB_ENV *, db_pgno_t, void *, DBT *));
+{
+	DB_MPOOL *dbmp;
+	DB_MPREG *mpreg, *new_mpreg;
+	int ret;
+
+#ifdef HAVE_RPC
+	if (F_ISSET(dbenv, DB_ENV_RPCCLIENT))
+		return (__dbcl_memp_register(dbenv, ftype, pgin, pgout));
+#endif
+
+	PANIC_CHECK(dbenv);
+	ENV_REQUIRES_CONFIG(dbenv, dbenv->mp_handle, DB_INIT_MPOOL);
+
+	dbmp = dbenv->mp_handle;
+
+	/*
+	 * Chances are good that the item has already been registered, as the
+	 * DB access methods are the folks that call this routine.  If already
+	 * registered, just update the entry, although it's probably unchanged.
+	 */
+	MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp);
+	for (mpreg = LIST_FIRST(&dbmp->dbregq);
+	    mpreg != NULL; mpreg = LIST_NEXT(mpreg, q))
+		if (mpreg->ftype == ftype) {
+			mpreg->pgin = pgin;
+			mpreg->pgout = pgout;
+			break;
+		}
+	MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
+	if (mpreg != NULL)
+		return (0);
+
+	/* New entry: allocate it without holding the handle mutex. */
+	if ((ret =
+	    __os_malloc(dbenv, sizeof(DB_MPREG), NULL, &new_mpreg)) != 0)
+		return (ret);
+
+	new_mpreg->ftype = ftype;
+	new_mpreg->pgin = pgin;
+	new_mpreg->pgout = pgout;
+
+	/*
+	 * The mutex was dropped while we allocated, so another thread may
+	 * have registered the same ftype in the meantime.  Look again before
+	 * inserting so the list never holds two entries for one type.
+	 */
+	MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp);
+	for (mpreg = LIST_FIRST(&dbmp->dbregq);
+	    mpreg != NULL; mpreg = LIST_NEXT(mpreg, q))
+		if (mpreg->ftype == ftype) {
+			mpreg->pgin = pgin;
+			mpreg->pgout = pgout;
+			break;
+		}
+	if (mpreg == NULL)
+		LIST_INSERT_HEAD(&dbmp->dbregq, new_mpreg, q);
+	MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
+
+	/* Somebody beat us to it: the existing entry was updated instead. */
+	if (mpreg != NULL)
+		__os_free(new_mpreg, sizeof(DB_MPREG));
+
+	return (0);
+}
diff --git a/bdb/mp/mp_stat.c b/bdb/mp/mp_stat.c
new file mode 100644
index 00000000000..7982513448d
--- /dev/null
+++ b/bdb/mp/mp_stat.c
@@ -0,0 +1,388 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ *	Sleepycat Software.  All rights reserved.
+ */
+#include "db_config.h"
+
+#ifndef lint
+static const char revid[] = "$Id: mp_stat.c,v 11.21 2001/01/09 16:59:30 bostic Exp $";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#endif
+
+#ifdef HAVE_RPC
+#include "db_server.h"
+#endif
+
+#include "db_int.h"
+#include "db_page.h"
+#include "db_shash.h"
+#include "db_am.h"
+#include "mp.h"
+
+#ifdef HAVE_RPC
+#include "gen_client_ext.h"
+#include "rpc_client_ext.h"
+#endif
+
+static void __memp_dumpcache
+		__P((DB_MPOOL *, REGINFO *, size_t *, FILE *, u_int32_t));
+static void __memp_pbh __P((DB_MPOOL *, BH *, size_t *, FILE *));
+
+/*
+ * memp_stat --
+ *	Display MPOOL statistics.
+ *
+ *	gspp, if not NULL, receives a newly allocated DB_MPOOL_STAT holding
+ *	totals accumulated over every cache region and every file.  fspp, if
+ *	not NULL, receives a single allocation (made through db_malloc)
+ *	holding a NULL-terminated array of DB_MPOOL_FSTAT pointers, followed
+ *	by the structures themselves and then the file name strings.
+ */
+int
+memp_stat(dbenv, gspp, fspp, db_malloc)
+	DB_ENV *dbenv;
+	DB_MPOOL_STAT **gspp;
+	DB_MPOOL_FSTAT ***fspp;
+	void *(*db_malloc) __P((size_t));
+{
+	DB_MPOOL *dbmp;
+	DB_MPOOL_FSTAT **tfsp, *tstruct;
+	DB_MPOOL_STAT *sp;
+	MPOOL *c_mp, *mp;
+	MPOOLFILE *mfp;
+	char *tname;
+	size_t len, nlen;
+	u_int32_t i;
+	int ret;
+	char *name;
+
+#ifdef HAVE_RPC
+	if (F_ISSET(dbenv, DB_ENV_RPCCLIENT))
+		return (__dbcl_memp_stat(dbenv, gspp, fspp, db_malloc));
+#endif
+
+	PANIC_CHECK(dbenv);
+	ENV_REQUIRES_CONFIG(dbenv, dbenv->mp_handle, DB_INIT_MPOOL);
+
+	dbmp = dbenv->mp_handle;
+	sp = NULL;
+
+	/* Global statistics. */
+	mp = dbmp->reginfo[0].primary;
+	if (gspp != NULL) {
+		*gspp = NULL;
+
+		if ((ret = __os_calloc(dbenv, 1, sizeof(**gspp), gspp)) != 0)
+			return (ret);
+		sp = *gspp;
+
+		/*
+		 * Initialization and information that is not maintained on
+		 * a per-cache basis.
+		 */
+		sp->st_hash_longest = 0;
+		sp->st_region_wait = dbmp->reginfo[0].rp->mutex.mutex_set_wait;
+		sp->st_region_nowait =
+		    dbmp->reginfo[0].rp->mutex.mutex_set_nowait;
+		sp->st_gbytes = dbenv->mp_gbytes;
+		sp->st_bytes = dbenv->mp_bytes;
+		sp->st_ncache = dbmp->nreg;
+		sp->st_regsize = dbmp->reginfo[0].rp->size;
+
+		R_LOCK(dbenv, dbmp->reginfo);
+
+		/* Walk the cache list and accumulate the global information. */
+		for (i = 0; i < mp->nreg; ++i) {
+			c_mp = dbmp->reginfo[i].primary;
+			sp->st_cache_hit += c_mp->stat.st_cache_hit;
+			sp->st_cache_miss += c_mp->stat.st_cache_miss;
+			sp->st_map += c_mp->stat.st_map;
+			sp->st_page_create += c_mp->stat.st_page_create;
+			sp->st_page_in += c_mp->stat.st_page_in;
+			sp->st_page_out += c_mp->stat.st_page_out;
+			sp->st_ro_evict += c_mp->stat.st_ro_evict;
+			sp->st_rw_evict += c_mp->stat.st_rw_evict;
+			sp->st_hash_buckets += c_mp->stat.st_hash_buckets;
+			sp->st_hash_searches += c_mp->stat.st_hash_searches;
+			if (c_mp->stat.st_hash_longest > sp->st_hash_longest)
+				sp->st_hash_longest =
+				    c_mp->stat.st_hash_longest;
+			sp->st_hash_examined += c_mp->stat.st_hash_examined;
+			sp->st_page_clean += c_mp->stat.st_page_clean;
+			sp->st_page_dirty += c_mp->stat.st_page_dirty;
+			sp->st_page_trickle += c_mp->stat.st_page_trickle;
+			sp->st_region_wait += c_mp->stat.st_region_wait;
+			sp->st_region_nowait += c_mp->stat.st_region_nowait;
+		}
+
+		/*
+		 * We have duplicate statistics fields in the cache and
+		 * per-file structures.  The counters are only incremented
+		 * in the per-file structures, though.  The intent is that
+		 * if we ever flush files from the pool we can save their
+		 * last known totals in the cache structure.
+		 */
+		for (mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile);
+		    mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) {
+			sp->st_cache_hit += mfp->stat.st_cache_hit;
+			sp->st_cache_miss += mfp->stat.st_cache_miss;
+			sp->st_map += mfp->stat.st_map;
+			sp->st_page_create += mfp->stat.st_page_create;
+			sp->st_page_in += mfp->stat.st_page_in;
+			sp->st_page_out += mfp->stat.st_page_out;
+		}
+
+		R_UNLOCK(dbenv, dbmp->reginfo);
+	}
+
+	/* Per-file statistics. */
+	if (fspp != NULL) {
+		*fspp = NULL;
+
+		R_LOCK(dbenv, dbmp->reginfo);
+
+		/* Count the MPOOLFILE structures. */
+		for (i = 0, len = 0,
+		    mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile);
+		    mfp != NULL;
+		    ++i, mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile))
+			len += sizeof(DB_MPOOL_FSTAT *) +
+			    sizeof(DB_MPOOL_FSTAT) +
+			    strlen(__memp_fns(dbmp, mfp)) + 1;
+		len += sizeof(DB_MPOOL_FSTAT *);	/* Trailing NULL */
+
+		R_UNLOCK(dbenv, dbmp->reginfo);
+
+		/*
+		 * NOTE(review): len always includes the trailing NULL slot,
+		 * so this test can never be true as written; with no files
+		 * the caller gets an array holding only the NULL terminator.
+		 */
+		if (len == 0)
+			return (0);
+
+		/*
+		 * Allocate space.
+		 *
+		 * NOTE(review): the region lock is not held across the
+		 * allocation, so the file list walked below is not guaranteed
+		 * to be the one that was sized above -- confirm that callers
+		 * cannot open files concurrently.
+		 */
+		if ((ret = __os_malloc(dbenv, len, db_malloc, fspp)) != 0)
+			return (ret);
+
+		R_LOCK(dbenv, dbmp->reginfo);
+
+		/*
+		 * Build each individual entry.  We assume that an array of
+		 * pointers are aligned correctly to be followed by an array
+		 * of structures, which should be safe (in this particular
+		 * case, the first element of the structure is a pointer, so
+		 * we're doubly safe).  The array is followed by space for
+		 * the text file names.
+		 *
+		 * Add 1 to i because we need to skip over the NULL.
+		 */
+		tfsp = *fspp;
+		tstruct = (DB_MPOOL_FSTAT *)(tfsp + i + 1);
+		tname = (char *)(tstruct + i);
+
+		for (mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile);
+		    mfp != NULL;
+		    ++tfsp, ++tstruct, tname += nlen,
+		    mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) {
+			name = __memp_fns(dbmp, mfp);
+			nlen = strlen(name) + 1;
+			*tfsp = tstruct;
+			*tstruct = mfp->stat;
+			tstruct->file_name = tname;
+			memcpy(tname, name, nlen);
+		}
+		*tfsp = NULL;
+
+		R_UNLOCK(dbenv, dbmp->reginfo);
+	}
+	return (0);
+}
+
+#define	FMAP_ENTRIES	200			/* Files we map.
*/
+
+#define	MPOOL_DUMP_HASH	0x01			/* Debug hash chains. */
+#define	MPOOL_DUMP_LRU	0x02			/* Debug LRU chains. */
+#define	MPOOL_DUMP_MEM	0x04			/* Debug region memory. */
+#define	MPOOL_DUMP_ALL	0x07			/* Debug all. */
+
+/*
+ * __memp_dump_region --
+ *	Display MPOOL structures.
+ *
+ *	area is a string of option letters selecting what to dump for each
+ *	cache: 'A' everything, 'h' hash chains, 'l' LRU chains, 'm' region
+ *	memory; other characters are ignored.  Output goes to fp, or to
+ *	stderr when fp is NULL.
+ *
+ * PUBLIC: void __memp_dump_region __P((DB_ENV *, char *, FILE *));
+ */
+void
+__memp_dump_region(dbenv, area, fp)
+	DB_ENV *dbenv;
+	char *area;
+	FILE *fp;
+{
+	DB_MPOOL *dbmp;
+	DB_MPOOLFILE *dbmfp;
+	MPOOL *mp;
+	MPOOLFILE *mfp;
+	size_t fmap[FMAP_ENTRIES + 1];
+	u_int32_t i, flags;
+	int cnt;
+	u_int8_t *p;
+
+	dbmp = dbenv->mp_handle;
+
+	/* Make it easy to call from the debugger. */
+	if (fp == NULL)
+		fp = stderr;
+
+	for (flags = 0; *area != '\0'; ++area)
+		switch (*area) {
+		case 'A':
+			LF_SET(MPOOL_DUMP_ALL);
+			break;
+		case 'h':
+			LF_SET(MPOOL_DUMP_HASH);
+			break;
+		case 'l':
+			LF_SET(MPOOL_DUMP_LRU);
+			break;
+		case 'm':
+			LF_SET(MPOOL_DUMP_MEM);
+			break;
+		}
+
+	R_LOCK(dbenv, dbmp->reginfo);
+
+	mp = dbmp->reginfo[0].primary;
+
+	/* Display MPOOL structures. */
+	(void)fprintf(fp, "%s\nPool (region addr 0x%lx)\n",
+	    DB_LINE, (u_long)dbmp->reginfo[0].addr);
+
+	/*
+	 * Display the MPOOLFILE structures.  While listing them, record each
+	 * file's region offset in fmap[] so that __memp_pbh can print a
+	 * buffer's file as the "File #N" number shown here.
+	 */
+	cnt = 0;
+	for (mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile);
+	    mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile), ++cnt) {
+		(void)fprintf(fp, "File #%d: %s: type %ld, %s\n\t [UID: ",
+		    cnt + 1, __memp_fns(dbmp, mfp), (long)mfp->ftype,
+		    F_ISSET(mfp, MP_CAN_MMAP) ? "mmap" : "read/write");
+		p = R_ADDR(dbmp->reginfo, mfp->fileid_off);
+		for (i = 0; i < DB_FILE_ID_LEN; ++i) {
+			(void)fprintf(fp, "%x", *p++);
+			if (i < DB_FILE_ID_LEN - 1)
+				(void)fprintf(fp, " ");
+		}
+		(void)fprintf(fp, "]\n");
+		if (cnt < FMAP_ENTRIES)
+			fmap[cnt] = R_OFFSET(dbmp->reginfo, mfp);
+	}
+
+	for (dbmfp = TAILQ_FIRST(&dbmp->dbmfq);
+	    dbmfp != NULL; dbmfp = TAILQ_NEXT(dbmfp, q), ++cnt) {
+		(void)fprintf(fp, "File #%d: %s: per-process, %s\n",
+		    cnt + 1, __memp_fn(dbmfp),
+		    F_ISSET(dbmfp, MP_READONLY) ? "readonly" : "read/write");
+		/*
+		 * Map the per-process handle to the shared MPOOLFILE it
+		 * refers to.  (The loop above always finishes with mfp set
+		 * to NULL, so mfp must not be used here.)
+		 */
+		if (cnt < FMAP_ENTRIES)
+			fmap[cnt] = R_OFFSET(dbmp->reginfo, dbmfp->mfp);
+	}
+	/* Terminate the map; the array has one spare slot for this. */
+	if (cnt < FMAP_ENTRIES)
+		fmap[cnt] = INVALID_ROFF;
+	else
+		fmap[FMAP_ENTRIES] = INVALID_ROFF;
+
+	/* Dump the memory pools. */
+	for (i = 0; i < mp->nreg; ++i) {
+		(void)fprintf(fp, "%s\nCache #%d:\n", DB_LINE, i + 1);
+		__memp_dumpcache(dbmp, &dbmp->reginfo[i], fmap, fp, flags);
+	}
+
+	R_UNLOCK(dbenv, dbmp->reginfo);
+
+	/* Flush in case we're debugging. */
+	(void)fflush(fp);
+}
+
+/*
+ * __memp_dumpcache --
+ *	Display statistics for a cache.
+ *
+ *	flags is the set of MPOOL_DUMP_* bits selecting which of the hash
+ *	table, the LRU list and the raw region memory are printed.
+ */
+static void
+__memp_dumpcache(dbmp, reginfo, fmap, fp, flags)
+	DB_MPOOL *dbmp;
+	REGINFO *reginfo;
+	size_t *fmap;
+	FILE *fp;
+	u_int32_t flags;
+{
+	BH *bhp;
+	DB_HASHTAB *dbht;
+	MPOOL *c_mp;
+	int bucket;
+
+	c_mp = reginfo->primary;
+
+	/* Display the hash table list of BH's. */
+	if (LF_ISSET(MPOOL_DUMP_HASH)) {
+		(void)fprintf(fp,
+	    "%s\nBH hash table (%lu hash slots)\npageno, file, ref, address\n",
+		    DB_LINE, (u_long)c_mp->htab_buckets);
+		for (dbht = R_ADDR(reginfo, c_mp->htab),
+		    bucket = 0; bucket < c_mp->htab_buckets; ++dbht, ++bucket) {
+			if (SH_TAILQ_FIRST(dbht, __bh) != NULL)
+				(void)fprintf(fp, "%lu:\n", (u_long)bucket);
+			for (bhp = SH_TAILQ_FIRST(dbht, __bh);
+			    bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh))
+				__memp_pbh(dbmp, bhp, fmap, fp);
+		}
+	}
+
+	/* Display the LRU list of BH's. */
+	if (LF_ISSET(MPOOL_DUMP_LRU)) {
+		(void)fprintf(fp, "%s\nBH LRU list\n", DB_LINE);
+		(void)fprintf(fp, "pageno, file, ref, address\n");
+		for (bhp = SH_TAILQ_FIRST(&c_mp->bhq, __bh);
+		    bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh))
+			__memp_pbh(dbmp, bhp, fmap, fp);
+	}
+
+	/* Dump the memory pool. */
+	if (LF_ISSET(MPOOL_DUMP_MEM))
+		__db_shalloc_dump(reginfo->addr, fp);
+}
+
+/*
+ * __memp_pbh --
+ *	Display a BH structure.
+ *
+ *	fmap is the INVALID_ROFF-terminated table of file offsets built by
+ *	__memp_dump_region.  If the buffer's file is found there it is
+ *	printed as "#N", otherwise the raw file offset is printed.
+ */
+static void
+__memp_pbh(dbmp, bhp, fmap, fp)
+	DB_MPOOL *dbmp;
+	BH *bhp;
+	size_t *fmap;
+	FILE *fp;
+{
+	static const FN fn[] = {
+		{ BH_CALLPGIN,		"callpgin" },
+		{ BH_DIRTY,		"dirty" },
+		{ BH_DISCARD,		"discard" },
+		{ BH_LOCKED,		"locked" },
+		{ BH_SYNC,		"sync" },
+		{ BH_SYNC_LOGFLSH,	"sync:logflush" },
+		{ BH_TRASH,		"trash" },
+		{ 0,			NULL }
+	};
+	int i;
+
+	for (i = 0; i < FMAP_ENTRIES; ++i)
+		if (fmap[i] == INVALID_ROFF || fmap[i] == bhp->mf_offset)
+			break;
+
+	if (fmap[i] == INVALID_ROFF)
+		(void)fprintf(fp, " %4lu, %lu, %2lu, %lu",
+		    (u_long)bhp->pgno, (u_long)bhp->mf_offset,
+		    (u_long)bhp->ref, (u_long)R_OFFSET(dbmp->reginfo, bhp));
+	else
+		(void)fprintf(fp, " %4lu, #%d, %2lu, %lu",
+		    (u_long)bhp->pgno, i + 1,
+		    (u_long)bhp->ref, (u_long)R_OFFSET(dbmp->reginfo, bhp));
+
+	__db_prflags(bhp->flags, fn, fp);
+
+	(void)fprintf(fp, "\n");
+}
diff --git a/bdb/mp/mp_sync.c b/bdb/mp/mp_sync.c
new file mode 100644
index 00000000000..1b0751db709
--- /dev/null
+++ b/bdb/mp/mp_sync.c
@@ -0,0 +1,658 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ *	Sleepycat Software.  All rights reserved.
+ */
+#include "db_config.h"
+
+#ifndef lint
+static const char revid[] = "$Id: mp_sync.c,v 11.29 2001/01/11 18:19:53 bostic Exp $";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <stdlib.h>
+#endif
+
+#ifdef HAVE_RPC
+#include "db_server.h"
+#endif
+
+#include "db_int.h"
+#include "db_shash.h"
+#include "mp.h"
+
+#ifdef HAVE_RPC
+#include "gen_client_ext.h"
+#include "rpc_client_ext.h"
+#endif
+
+static int __bhcmp __P((const void *, const void *));
+static int __memp_fsync __P((DB_MPOOLFILE *));
+static int __memp_sballoc __P((DB_ENV *, BH ***, u_int32_t *));
+
+/*
+ * memp_sync --
+ *	Mpool sync function.
+ *
+ *	Marks every dirty or pinned buffer as part of the checkpoint, writes
+ *	the ones nobody is using, and returns 0 if nothing remains to be
+ *	written, DB_INCOMPLETE if some buffers are still outstanding, or an
+ *	error.  A NULL lsnp flushes the entire cache; a non-NULL lsnp
+ *	requires logging to be configured (EINVAL otherwise).
+ */
+int
+memp_sync(dbenv, lsnp)
+	DB_ENV *dbenv;
+	DB_LSN *lsnp;
+{
+	BH *bhp, **bharray;
+	DB_MPOOL *dbmp;
+	DB_LSN tlsn;
+	MPOOL *c_mp, *mp;
+	MPOOLFILE *mfp;
+	u_int32_t ar_cnt, i, ndirty;
+	int ret, retry_done, retry_need, wrote;
+
+#ifdef HAVE_RPC
+	if (F_ISSET(dbenv, DB_ENV_RPCCLIENT))
+		return (__dbcl_memp_sync(dbenv, lsnp));
+#endif
+
+	PANIC_CHECK(dbenv);
+	ENV_REQUIRES_CONFIG(dbenv, dbenv->mp_handle, DB_INIT_MPOOL);
+
+	dbmp = dbenv->mp_handle;
+	mp = dbmp->reginfo[0].primary;
+
+	/*
+	 * If no LSN is provided, flush the entire cache.
+	 *
+	 * !!!
+	 * Our current behavior is to flush the entire cache, so there's
+	 * nothing special we have to do here other than deal with NULL
+	 * pointers.
+	 */
+	if (lsnp == NULL) {
+		ZERO_LSN(tlsn);
+		lsnp = &tlsn;
+		F_SET(mp, MP_LSN_RETRY);
+	} else if (!LOGGING_ON(dbenv)) {
+		__db_err(dbenv, "memp_sync: requires logging");
+		return (EINVAL);
+	}
+
+	/*
+	 * Sync calls are single-threaded so that we don't have multiple
+	 * threads, with different checkpoint LSNs, walking the caches
+	 * and updating the checkpoint LSNs and how many buffers remain
+	 * to be written for the checkpoint.  This shouldn't be a problem,
+	 * any application that has multiple checkpoint threads isn't what
+	 * I'd call trustworthy.
+	 */
+	MUTEX_LOCK(dbenv, &mp->sync_mutex, dbenv->lockfhp);
+
+	/*
+	 * If the application is asking about a previous call to memp_sync(),
+	 * and we haven't found any buffers that the application holding the
+	 * pin couldn't write, return yes or no based on the current count.
+	 * Note, if the application is asking about a LSN *smaller* than one
+	 * we've already handled or are currently handling, then we return a
+	 * result based on the count for the larger LSN.
+	 */
+	R_LOCK(dbenv, dbmp->reginfo);
+	if (!IS_ZERO_LSN(*lsnp) &&
+	    !F_ISSET(mp, MP_LSN_RETRY) && log_compare(lsnp, &mp->lsn) <= 0) {
+		if (mp->lsn_cnt == 0) {
+			*lsnp = mp->lsn;
+			ret = 0;
+		} else
+			ret = DB_INCOMPLETE;
+
+		R_UNLOCK(dbenv, dbmp->reginfo);
+		MUTEX_UNLOCK(dbenv, &mp->sync_mutex);
+		return (ret);
+	}
+
+	/*
+	 * Allocate room for a list of buffers, and decide how many buffers
+	 * we can pin down.
+	 *
+	 * !!!
+	 * Note: __memp_sballoc has released the region lock if we're not
+	 * continuing forward.
+	 */
+	if ((ret =
+	    __memp_sballoc(dbenv, &bharray, &ndirty)) != 0 || ndirty == 0) {
+		MUTEX_UNLOCK(dbenv, &mp->sync_mutex);
+		return (ret);
+	}
+
+	retry_done = 0;
+retry:	retry_need = 0;
+	/*
+	 * Start a new checkpoint.
+	 *
+	 * Save the LSN.  We know that it's a new LSN, a retry, or larger than
+	 * the one for which we were already doing a checkpoint.  (BTW, I don't
+	 * expect to see multiple LSN's from the same or multiple processes,
+	 * but You Just Never Know.  Responding as if they all called with the
+	 * largest of the LSNs specified makes everything work.)
+	 *
+	 * We don't currently use the LSN we save.  We could potentially save
+	 * the last-written LSN in each buffer header and use it to determine
+	 * what buffers need to be written.  The problem with this is that it's
+	 * sizeof(LSN) more bytes of buffer header.  We currently write all the
+	 * dirty buffers instead, but with a sufficiently large cache that's
+	 * going to be a problem.
+	 */
+	mp->lsn = *lsnp;
+
+	/*
+	 * Clear the global count of buffers waiting to be written, walk the
+	 * list of files clearing the count of buffers waiting to be written.
+	 *
+	 * Clear the retry flag.
+	 */
+	mp->lsn_cnt = 0;
+	for (mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile);
+	    mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile))
+		mfp->lsn_cnt = 0;
+	F_CLR(mp, MP_LSN_RETRY);
+
+	/*
+	 * Walk each cache's list of buffers and mark all dirty buffers to be
+	 * written and all pinned buffers to be potentially written (we can't
+	 * know if they'll need to be written until the holder returns them to
+	 * the cache).  We do this in one pass while holding the region locked
+	 * so that processes can't make new buffers dirty, causing us to never
+	 * finish.  Since the application may have restarted the sync using a
+	 * different LSN value, clear any BH_SYNC | BH_SYNC_LOGFLSH flags that
+	 * appear leftover from previous calls.
+	 *
+	 * Keep a count of the total number of buffers we need to write in
+	 * MPOOL->lsn_cnt, and for each file, in MPOOLFILE->lsn_count.
+	 */
+	for (ar_cnt = 0, i = 0; i < mp->nreg; ++i) {
+		c_mp = dbmp->reginfo[i].primary;
+		for (bhp = SH_TAILQ_FIRST(&c_mp->bhq, __bh);
+		    bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh)) {
+			if (F_ISSET(bhp, BH_DIRTY) || bhp->ref != 0) {
+				F_SET(bhp, BH_SYNC);
+
+				++mp->lsn_cnt;
+
+				mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
+				++mfp->lsn_cnt;
+
+				/*
+				 * If the buffer isn't being used, we can write
+				 * it immediately, so increment its reference
+				 * count to lock it down, and save a reference
+				 * to it.
+				 *
+				 * If we've run out space to store buffer refs,
+				 * we're screwed.  We don't want to realloc the
+				 * array while holding a region lock, so we set
+				 * a flag and deal with it later.
+				 */
+				if (bhp->ref == 0) {
+					++bhp->ref;
+					bharray[ar_cnt] = bhp;
+
+					if (++ar_cnt >= ndirty) {
+						retry_need = 1;
+						break;
+					}
+				}
+			} else
+				if (F_ISSET(bhp, BH_SYNC))
+					F_CLR(bhp, BH_SYNC | BH_SYNC_LOGFLSH);
+		}
+		if (ar_cnt >= ndirty)
+			break;
+	}
+
+	/* If there no buffers we can write immediately, we're done. */
+	if (ar_cnt == 0) {
+		ret = mp->lsn_cnt ? DB_INCOMPLETE : 0;
+		goto done;
+	}
+
+	R_UNLOCK(dbenv, dbmp->reginfo);
+
+	/*
+	 * Sort the buffers we're going to write immediately.
+	 *
+	 * We try and write the buffers in file/page order: it should reduce
+	 * seeks by the underlying filesystem and possibly reduce the actual
+	 * number of writes.
+	 */
+	if (ar_cnt > 1)
+		qsort(bharray, ar_cnt, sizeof(BH *), __bhcmp);
+
+	/*
+	 * Flush the log.  We have to ensure the log records reflecting the
+	 * changes on the database pages we're writing have already made it
+	 * to disk.  We usually do that as we write each page, but if we
+	 * are going to write a large number of pages, repeatedly acquiring
+	 * the log region lock is going to be expensive.  Flush the entire
+	 * log now, so that sync doesn't require any more log flushes.
+	 */
+	if (LOGGING_ON(dbenv) && (ret = log_flush(dbenv, NULL)) != 0) {
+		/*
+		 * The region lock was dropped above but the "done" label
+		 * releases it, and every buffer in bharray still carries the
+		 * reference we added.  Retake the lock and undo the pins
+		 * before leaving, treating this like the write failure below:
+		 * clear the saved LSN and force a retry so no later call can
+		 * report this checkpoint as complete.
+		 */
+		R_LOCK(dbenv, dbmp->reginfo);
+
+		ZERO_LSN(mp->lsn);
+		F_SET(mp, MP_LSN_RETRY);
+
+		for (i = 0; i < ar_cnt; ++i) {
+			bhp = bharray[i];
+			--bhp->ref;
+			F_CLR(bhp, BH_SYNC | BH_SYNC_LOGFLSH);
+		}
+
+		goto done;
+	}
+
+	R_LOCK(dbenv, dbmp->reginfo);
+
+	/* Walk the array, writing buffers. */
+	for (i = 0; i < ar_cnt; ++i) {
+		/*
+		 * It's possible for a thread to have gotten the buffer since
+		 * we listed it for writing.  If the reference count is still
+		 * 1, we're the only ones using the buffer, go ahead and write.
+		 * If it's >1, then skip the buffer and assume that it will be
+		 * written when it's returned to the cache.
+		 */
+		if (bharray[i]->ref > 1) {
+			--bharray[i]->ref;
+			continue;
+		}
+
+		/* Write the buffer. */
+		mfp = R_ADDR(dbmp->reginfo, bharray[i]->mf_offset);
+		ret = __memp_bhwrite(dbmp, mfp, bharray[i], NULL, &wrote);
+
+		/* Release the buffer. */
+		--bharray[i]->ref;
+
+		if (ret == 0 && wrote)
+			continue;
+
+		/*
+		 * Any process syncing the shared memory buffer pool had best
+		 * be able to write to any underlying file.  Be understanding,
+		 * but firm, on this point.
+		 */
+		if (ret == 0) {
+			__db_err(dbenv, "%s: unable to flush page: %lu",
+			    __memp_fns(dbmp, mfp), (u_long)bharray[i]->pgno);
+			ret = EPERM;
+		}
+
+		/*
+		 * On error, clear MPOOL->lsn and set MP_LSN_RETRY so that no
+		 * future checkpoint return can depend on this failure.  Clear
+		 * the buffer's BH_SYNC flag, because it's used to determine
+		 * if lsn_cnt values are incremented/decremented.  Don't bother
+		 * to reset/clear:
+		 *
+		 *	MPOOL->lsn_cnt
+		 *	MPOOLFILE->lsn_cnt
+		 *
+		 * they don't make any difference.
+		 */
+		ZERO_LSN(mp->lsn);
+		F_SET(mp, MP_LSN_RETRY);
+
+		/* Release any buffers we're still pinning down. */
+		while (++i < ar_cnt) {
+			bhp = bharray[i];
+			--bhp->ref;
+			F_CLR(bhp, BH_SYNC | BH_SYNC_LOGFLSH);
+		}
+
+		goto done;
+	}
+
+	ret = mp->lsn_cnt != 0 ? DB_INCOMPLETE : 0;
+
+	/*
+	 * If there were too many buffers and we're not returning an error, we
+	 * re-try the checkpoint once -- since we allocated 80% of the total
+	 * buffer count, once should be enough.  If it still doesn't work, some
+	 * other thread of control is dirtying buffers as fast as we're writing
+	 * them, and we might as well give up for now.  In the latter case, set
+	 * the global retry flag, we'll have to start from scratch on the next
+	 * checkpoint.
+	 */
+	if (retry_need) {
+		if (retry_done) {
+			ret = DB_INCOMPLETE;
+			F_SET(mp, MP_LSN_RETRY);
+		} else {
+			retry_done = 1;
+			goto retry;
+		}
+	}
+
+	/* Every path that reaches here must hold the region lock. */
+done:	R_UNLOCK(dbenv, dbmp->reginfo);
+	MUTEX_UNLOCK(dbenv, &mp->sync_mutex);
+
+	__os_free(bharray, ndirty * sizeof(BH *));
+
+	return (ret);
+}
+
+/*
+ * memp_fsync --
+ *	Mpool file sync function.
+ */ +int +memp_fsync(dbmfp) + DB_MPOOLFILE *dbmfp; +{ + DB_ENV *dbenv; + DB_MPOOL *dbmp; + int is_tmp; + + dbmp = dbmfp->dbmp; + dbenv = dbmp->dbenv; + +#ifdef HAVE_RPC + if (F_ISSET(dbenv, DB_ENV_RPCCLIENT)) + return (__dbcl_memp_fsync(dbmfp)); +#endif + + PANIC_CHECK(dbenv); + + /* + * If this handle doesn't have a file descriptor that's open for + * writing, or if the file is a temporary, there's no reason to + * proceed further. + */ + if (F_ISSET(dbmfp, MP_READONLY)) + return (0); + + R_LOCK(dbenv, dbmp->reginfo); + is_tmp = F_ISSET(dbmfp->mfp, MP_TEMP); + R_UNLOCK(dbenv, dbmp->reginfo); + if (is_tmp) + return (0); + + return (__memp_fsync(dbmfp)); +} + +/* + * __mp_xxx_fh -- + * Return a file descriptor for DB 1.85 compatibility locking. + * + * PUBLIC: int __mp_xxx_fh __P((DB_MPOOLFILE *, DB_FH **)); + */ +int +__mp_xxx_fh(dbmfp, fhp) + DB_MPOOLFILE *dbmfp; + DB_FH **fhp; +{ + /* + * This is a truly spectacular layering violation, intended ONLY to + * support compatibility for the DB 1.85 DB->fd call. + * + * Sync the database file to disk, creating the file as necessary. + * + * We skip the MP_READONLY and MP_TEMP tests done by memp_fsync(3). + * The MP_READONLY test isn't interesting because we will either + * already have a file descriptor (we opened the database file for + * reading) or we aren't readonly (we created the database which + * requires write privileges). The MP_TEMP test isn't interesting + * because we want to write to the backing file regardless so that + * we get a file descriptor to return. + */ + *fhp = &dbmfp->fh; + return (F_ISSET(&dbmfp->fh, DB_FH_VALID) ? 0 : __memp_fsync(dbmfp)); +} + +/* + * __memp_fsync -- + * Mpool file internal sync function. 
+ */ +static int +__memp_fsync(dbmfp) + DB_MPOOLFILE *dbmfp; +{ + BH *bhp, **bharray; + DB_ENV *dbenv; + DB_MPOOL *dbmp; + MPOOL *c_mp, *mp; + size_t mf_offset; + u_int32_t ar_cnt, i, ndirty; + int incomplete, ret, retry_done, retry_need, wrote; + + dbmp = dbmfp->dbmp; + dbenv = dbmp->dbenv; + mp = dbmp->reginfo[0].primary; + + R_LOCK(dbenv, dbmp->reginfo); + + /* + * Allocate room for a list of buffers, and decide how many buffers + * we can pin down. + * + * !!! + * Note: __memp_sballoc has released our region lock if we're not + * continuing forward. + */ + if ((ret = + __memp_sballoc(dbenv, &bharray, &ndirty)) != 0 || ndirty == 0) + return (ret); + + retry_done = 0; +retry: retry_need = 0; + /* + * Walk each cache's list of buffers and mark all dirty buffers to be + * written and all pinned buffers to be potentially written (we can't + * know if they'll need to be written until the holder returns them to + * the cache). We do this in one pass while holding the region locked + * so that processes can't make new buffers dirty, causing us to never + * finish. + */ + mf_offset = R_OFFSET(dbmp->reginfo, dbmfp->mfp); + for (ar_cnt = 0, incomplete = 0, i = 0; i < mp->nreg; ++i) { + c_mp = dbmp->reginfo[i].primary; + for (bhp = SH_TAILQ_FIRST(&c_mp->bhq, __bh); + bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh)) { + if (!F_ISSET(bhp, BH_DIRTY) || + bhp->mf_offset != mf_offset) + continue; + if (bhp->ref != 0 || F_ISSET(bhp, BH_LOCKED)) { + incomplete = 1; + continue; + } + + /* + * If the buffer isn't being used, we can write + * it immediately, so increment its reference + * count to lock it down, and save a reference + * to it. + * + * If we've run out space to store buffer refs, + * we're screwed. We don't want to realloc the + * array while holding a region lock, so we set + * a flag and deal with it later. 
+ */ + ++bhp->ref; + bharray[ar_cnt] = bhp; + if (++ar_cnt >= ndirty) { + retry_need = 1; + break; + } + } + if (ar_cnt >= ndirty) + break; + } + + /* If there no buffers we can write immediately, we're done. */ + if (ar_cnt == 0) { + ret = 0; + goto done; + } + + R_UNLOCK(dbenv, dbmp->reginfo); + + /* Sort the buffers we're going to write. */ + if (ar_cnt > 1) + qsort(bharray, ar_cnt, sizeof(BH *), __bhcmp); + + R_LOCK(dbenv, dbmp->reginfo); + + /* Walk the array, writing buffers. */ + for (i = 0; i < ar_cnt;) { + /* + * It's possible for a thread to have gotten the buffer since + * we listed it for writing. If the reference count is still + * 1, we're the only ones using the buffer, go ahead and write. + * If it's >1, then skip the buffer and assume that it will be + * written when it's returned to the cache. + */ + if (bharray[i]->ref > 1) { + incomplete = 1; + --bharray[i++]->ref; + continue; + } + + /* Write the buffer. */ + ret = __memp_pgwrite(dbmp, dbmfp, bharray[i], NULL, &wrote); + + /* Release the buffer. */ + --bharray[i++]->ref; + + if (ret == 0) { + if (!wrote) + incomplete = 1; + continue; + } + + /* + * On error: + * + * Release any buffers we're still pinning down. + */ + while (i < ar_cnt) + --bharray[i++]->ref; + break; + } + + /* + * If there were too many buffers and we're not returning an error, we + * re-try the flush once -- since we allocated 80% of the total + * buffer count, once should be enough. If it still doesn't work, some + * other thread of control is dirtying buffers as fast as we're writing + * them, and we might as well give up. + */ + if (retry_need) { + if (retry_done) + incomplete = 1; + else { + retry_done = 1; + goto retry; + } + } + +done: R_UNLOCK(dbenv, dbmp->reginfo); + + __os_free(bharray, ndirty * sizeof(BH *)); + + /* + * Sync the underlying file as the last thing we do, so that the OS + * has a maximal opportunity to flush buffers before we request it. 
+ * + * !!!: + * Don't lock the region around the sync, fsync(2) has no atomicity + * issues. + */ + if (ret == 0) + ret = incomplete ? + DB_INCOMPLETE : __os_fsync(dbenv, &dbmfp->fh); + + return (ret); +} + +/* + * __memp_sballoc -- + * Allocate room for a list of buffers. + */ +static int +__memp_sballoc(dbenv, bharrayp, ndirtyp) + DB_ENV *dbenv; + BH ***bharrayp; + u_int32_t *ndirtyp; +{ + DB_MPOOL *dbmp; + MPOOL *c_mp, *mp; + u_int32_t i, nclean, ndirty, maxpin; + int ret; + + dbmp = dbenv->mp_handle; + mp = dbmp->reginfo[0].primary; + + /* + * We don't want to hold the region lock while we write the buffers, + * so only lock it while we create a list. + * + * Walk through the list of caches, figuring out how many buffers + * we're going to need. + * + * Make a point of not holding the region lock across the library + * allocation call. + */ + for (nclean = ndirty = 0, i = 0; i < mp->nreg; ++i) { + c_mp = dbmp->reginfo[i].primary; + ndirty += c_mp->stat.st_page_dirty; + nclean += c_mp->stat.st_page_clean; + } + R_UNLOCK(dbenv, dbmp->reginfo); + if (ndirty == 0) { + *ndirtyp = 0; + return (0); + } + + /* + * We don't want to pin down the entire buffer cache, otherwise we'll + * starve threads needing new pages. Don't pin down more than 80% of + * the cache, making sure that we don't screw up just because only a + * few pages have been created. + */ + maxpin = ((ndirty + nclean) * 8) / 10; + if (maxpin < 10) + maxpin = 10; + + /* + * Get a good-sized block of memory to hold buffer pointers, we don't + * want to run out, but correct if we want to allocate more than we + * would be allowed to store, regardless. 
+ */ + ndirty += ndirty / 2 + 10; + if (ndirty > maxpin) + ndirty = maxpin; + if ((ret = + __os_malloc(dbenv, ndirty * sizeof(BH *), NULL, bharrayp)) != 0) + return (ret); + + *ndirtyp = ndirty; + + R_LOCK(dbenv, dbmp->reginfo); + + return (0); +} + +static int +__bhcmp(p1, p2) + const void *p1, *p2; +{ + BH *bhp1, *bhp2; + + bhp1 = *(BH * const *)p1; + bhp2 = *(BH * const *)p2; + + /* Sort by file (shared memory pool offset). */ + if (bhp1->mf_offset < bhp2->mf_offset) + return (-1); + if (bhp1->mf_offset > bhp2->mf_offset) + return (1); + + /* + * !!! + * Defend against badly written quicksort code calling the comparison + * function with two identical pointers (e.g., WATCOM C++ (Power++)). + */ + if (bhp1->pgno < bhp2->pgno) + return (-1); + if (bhp1->pgno > bhp2->pgno) + return (1); + return (0); +} diff --git a/bdb/mp/mp_trickle.c b/bdb/mp/mp_trickle.c new file mode 100644 index 00000000000..f937805cf40 --- /dev/null +++ b/bdb/mp/mp_trickle.c @@ -0,0 +1,149 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Sleepycat Software. All rights reserved. + */ +#include "db_config.h" + +#ifndef lint +static const char revid[] = "$Id: mp_trickle.c,v 11.12 2000/11/30 00:58:41 ubell Exp $"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <stdlib.h> +#endif + +#ifdef HAVE_RPC +#include "db_server.h" +#endif + +#include "db_int.h" +#include "db_shash.h" +#include "mp.h" + +#ifdef HAVE_RPC +#include "gen_client_ext.h" +#include "rpc_client_ext.h" +#endif + +static int __memp_trick __P((DB_ENV *, int, int, int *)); + +/* + * memp_trickle -- + * Keep a specified percentage of the buffers clean. 
+ */ +int +memp_trickle(dbenv, pct, nwrotep) + DB_ENV *dbenv; + int pct, *nwrotep; +{ + DB_MPOOL *dbmp; + MPOOL *mp; + u_int32_t i; + int ret; + +#ifdef HAVE_RPC + if (F_ISSET(dbenv, DB_ENV_RPCCLIENT)) + return (__dbcl_memp_trickle(dbenv, pct, nwrotep)); +#endif + + PANIC_CHECK(dbenv); + ENV_REQUIRES_CONFIG(dbenv, dbenv->mp_handle, DB_INIT_MPOOL); + + dbmp = dbenv->mp_handle; + mp = dbmp->reginfo[0].primary; + + if (nwrotep != NULL) + *nwrotep = 0; + + if (pct < 1 || pct > 100) + return (EINVAL); + + R_LOCK(dbenv, dbmp->reginfo); + + /* Loop through the caches... */ + for (ret = 0, i = 0; i < mp->nreg; ++i) + if ((ret = __memp_trick(dbenv, i, pct, nwrotep)) != 0) + break; + + R_UNLOCK(dbenv, dbmp->reginfo); + return (ret); +} + +/* + * __memp_trick -- + * Trickle a single cache. + */ +static int +__memp_trick(dbenv, ncache, pct, nwrotep) + DB_ENV *dbenv; + int ncache, pct, *nwrotep; +{ + BH *bhp; + DB_MPOOL *dbmp; + MPOOL *c_mp; + MPOOLFILE *mfp; + db_pgno_t pgno; + u_long total; + int ret, wrote; + + dbmp = dbenv->mp_handle; + c_mp = dbmp->reginfo[ncache].primary; + + /* + * If there are sufficient clean buffers, or no buffers or no dirty + * buffers, we're done. + * + * XXX + * Using st_page_clean and st_page_dirty is our only choice at the + * moment, but it's not as correct as we might like in the presence + * of pools with more than one buffer size, as a free 512-byte buffer + * isn't the same as a free 8K buffer. + */ +loop: total = c_mp->stat.st_page_clean + c_mp->stat.st_page_dirty; + if (total == 0 || c_mp->stat.st_page_dirty == 0 || + (c_mp->stat.st_page_clean * 100) / total >= (u_long)pct) + return (0); + + /* Loop until we write a buffer. 
*/ + for (bhp = SH_TAILQ_FIRST(&c_mp->bhq, __bh); + bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh)) { + if (bhp->ref != 0 || + !F_ISSET(bhp, BH_DIRTY) || F_ISSET(bhp, BH_LOCKED)) + continue; + + mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset); + + /* + * We can't write to temporary files -- see the comment in + * mp_bh.c:__memp_bhwrite(). + */ + if (F_ISSET(mfp, MP_TEMP)) + continue; + + pgno = bhp->pgno; + if ((ret = __memp_bhwrite(dbmp, mfp, bhp, NULL, &wrote)) != 0) + return (ret); + + /* + * Any process syncing the shared memory buffer pool had better + * be able to write to any underlying file. Be understanding, + * but firm, on this point. + */ + if (!wrote) { + __db_err(dbenv, "%s: unable to flush page: %lu", + __memp_fns(dbmp, mfp), (u_long)pgno); + return (EPERM); + } + + ++c_mp->stat.st_page_trickle; + if (nwrotep != NULL) + ++*nwrotep; + goto loop; + } + + return (0); +} |