diff options
Diffstat (limited to 'bdb/mp')
-rw-r--r-- | bdb/mp/mp_alloc.c | 442 | ||||
-rw-r--r-- | bdb/mp/mp_bh.c | 646 | ||||
-rw-r--r-- | bdb/mp/mp_fget.c | 654 | ||||
-rw-r--r-- | bdb/mp/mp_fopen.c | 1018 | ||||
-rw-r--r-- | bdb/mp/mp_fput.c | 202 | ||||
-rw-r--r-- | bdb/mp/mp_fset.c | 89 | ||||
-rw-r--r-- | bdb/mp/mp_method.c | 156 | ||||
-rw-r--r-- | bdb/mp/mp_region.c | 466 | ||||
-rw-r--r-- | bdb/mp/mp_register.c | 76 | ||||
-rw-r--r-- | bdb/mp/mp_stat.c | 491 | ||||
-rw-r--r-- | bdb/mp/mp_sync.c | 627 | ||||
-rw-r--r-- | bdb/mp/mp_trickle.c | 83 |
12 files changed, 0 insertions, 4950 deletions
diff --git a/bdb/mp/mp_alloc.c b/bdb/mp/mp_alloc.c deleted file mode 100644 index 96dd612d7ba..00000000000 --- a/bdb/mp/mp_alloc.c +++ /dev/null @@ -1,442 +0,0 @@ -/*- - * See the file LICENSE for redistribution information. - * - * Copyright (c) 1996-2002 - * Sleepycat Software. All rights reserved. - */ -#include "db_config.h" - -#ifndef lint -static const char revid[] = "$Id: mp_alloc.c,v 11.31 2002/08/14 17:21:37 ubell Exp $"; -#endif /* not lint */ - -#ifndef NO_SYSTEM_INCLUDES -#include <sys/types.h> -#include <string.h> -#endif - -#include "db_int.h" -#include "dbinc/db_shash.h" -#include "dbinc/mp.h" - -typedef struct { - DB_MPOOL_HASH *bucket; - u_int32_t priority; -} HS; - -static void __memp_bad_buffer __P((DB_MPOOL_HASH *)); -static void __memp_reset_lru __P((DB_ENV *, REGINFO *, MPOOL *)); - -/* - * __memp_alloc -- - * Allocate some space from a cache region. - * - * PUBLIC: int __memp_alloc __P((DB_MPOOL *, - * PUBLIC: REGINFO *, MPOOLFILE *, size_t, roff_t *, void *)); - */ -int -__memp_alloc(dbmp, memreg, mfp, len, offsetp, retp) - DB_MPOOL *dbmp; - REGINFO *memreg; - MPOOLFILE *mfp; - size_t len; - roff_t *offsetp; - void *retp; -{ - BH *bhp; - DB_ENV *dbenv; - DB_MPOOL_HASH *dbht, *hp, *hp_end, *hp_tmp; - DB_MUTEX *mutexp; - MPOOL *c_mp; - MPOOLFILE *bh_mfp; - size_t freed_space; - u_int32_t buckets, buffers, high_priority, max_na, priority; - int aggressive, ret; - void *p; - - dbenv = dbmp->dbenv; - c_mp = memreg->primary; - dbht = R_ADDR(memreg, c_mp->htab); - hp_end = &dbht[c_mp->htab_buckets]; - - buckets = buffers = 0; - aggressive = 0; - - c_mp->stat.st_alloc++; - - /* - * Get aggressive if we've tried to flush the number of pages as are - * in the system without finding space. - */ - max_na = 5 * c_mp->htab_buckets; - - /* - * If we're allocating a buffer, and the one we're discarding is the - * same size, we don't want to waste the time to re-integrate it into - * the shared memory free list. If the DB_MPOOLFILE argument isn't - * NULL, we'll compare the underlying page sizes of the two buffers - * before free-ing and re-allocating buffers. - */ - if (mfp != NULL) - len = (sizeof(BH) - sizeof(u_int8_t)) + mfp->stat.st_pagesize; - - R_LOCK(dbenv, memreg); - - /* - * On every buffer allocation we update the buffer generation number - * and check for wraparound. - */ - if (++c_mp->lru_count == UINT32_T_MAX) - __memp_reset_lru(dbenv, memreg, c_mp); - - /* - * Anything newer than 1/10th of the buffer pool is ignored during - * allocation (unless allocation starts failing). - */ - DB_ASSERT(c_mp->lru_count > c_mp->stat.st_pages / 10); - high_priority = c_mp->lru_count - c_mp->stat.st_pages / 10; - - /* - * First we try to allocate from free memory. If that fails, scan the - * buffer pool to find buffers with low priorities. We consider small - * sets of hash buckets each time to limit the amount of work needing - * to be done. This approximates LRU, but not very well. We either - * find a buffer of the same size to use, or we will free 3 times what - * we need in the hopes it will coalesce into a contiguous chunk of the - * right size. In the latter case we branch back here and try again. - */ -alloc: if ((ret = __db_shalloc(memreg->addr, len, MUTEX_ALIGN, &p)) == 0) { - if (mfp != NULL) - c_mp->stat.st_pages++; - R_UNLOCK(dbenv, memreg); - -found: if (offsetp != NULL) - *offsetp = R_OFFSET(memreg, p); - *(void **)retp = p; - - /* - * Update the search statistics. - * - * We're not holding the region locked here, these statistics - * can't be trusted. - */ - if (buckets != 0) { - if (buckets > c_mp->stat.st_alloc_max_buckets) - c_mp->stat.st_alloc_max_buckets = buckets; - c_mp->stat.st_alloc_buckets += buckets; - } - if (buffers != 0) { - if (buffers > c_mp->stat.st_alloc_max_pages) - c_mp->stat.st_alloc_max_pages = buffers; - c_mp->stat.st_alloc_pages += buffers; - } - return (0); - } - - /* - * We re-attempt the allocation every time we've freed 3 times what - * we need. Reset our free-space counter. - */ - freed_space = 0; - - /* - * Walk the hash buckets and find the next two with potentially useful - * buffers. Free the buffer with the lowest priority from the buckets' - * chains. - */ - for (hp_tmp = NULL;;) { - /* Check for wrap around. */ - hp = &dbht[c_mp->last_checked++]; - if (hp >= hp_end) { - c_mp->last_checked = 0; - - /* - * If we've gone through all of the hash buckets, try - * an allocation. If the cache is small, the old page - * size is small, and the new page size is large, we - * might have freed enough memory (but not 3 times the - * memory). - */ - goto alloc; - } - - /* - * Skip empty buckets. - * - * We can check for empty buckets before locking as we - * only care if the pointer is zero or non-zero. - */ - if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL) - continue; - - /* - * The failure mode is when there are too many buffers we can't - * write or there's not enough memory in the system. We don't - * have a metric for deciding if allocation has no possible way - * to succeed, so we don't ever fail, we assume memory will be - * available if we wait long enough. - * - * Get aggressive if we've tried to flush 5 times the number of - * hash buckets as are in the system -- it's possible we have - * been repeatedly trying to flush the same buffers, although - * it's unlikely. Aggressive means: - * - * a: set a flag to attempt to flush high priority buffers as - * well as other buffers. - * b: sync the mpool to force out queue extent pages. While we - * might not have enough space for what we want and flushing - * is expensive, why not? - * c: sleep for a second -- hopefully someone else will run and - * free up some memory. Try to allocate memory too, in case - * the other thread returns its memory to the region. - * d: look at a buffer in every hash bucket rather than choose - * the more preferable of two. - * - * !!! - * This test ignores pathological cases like no buffers in the - * system -- that shouldn't be possible. - */ - if ((++buckets % max_na) == 0) { - aggressive = 1; - - R_UNLOCK(dbenv, memreg); - - (void)__memp_sync_int( - dbenv, NULL, 0, DB_SYNC_ALLOC, NULL); - - (void)__os_sleep(dbenv, 1, 0); - - R_LOCK(dbenv, memreg); - goto alloc; - } - - if (!aggressive) { - /* Skip high priority buckets. */ - if (hp->hash_priority > high_priority) - continue; - - /* - * Find two buckets and select the one with the lowest - * priority. Performance testing shows that looking - * at two improves the LRUness and looking at more only - * does a little better. - */ - if (hp_tmp == NULL) { - hp_tmp = hp; - continue; - } - if (hp->hash_priority > hp_tmp->hash_priority) - hp = hp_tmp; - hp_tmp = NULL; - } - - /* Remember the priority of the buffer we're looking for. */ - priority = hp->hash_priority; - - /* Unlock the region and lock the hash bucket. */ - R_UNLOCK(dbenv, memreg); - mutexp = &hp->hash_mutex; - MUTEX_LOCK(dbenv, mutexp); - -#ifdef DIAGNOSTIC - __memp_check_order(hp); -#endif - /* - * The lowest priority page is first in the bucket, as they are - * maintained in sorted order. - * - * The buffer may have been freed or its priority changed while - * we switched from the region lock to the hash lock. If so, - * we have to restart. We will still take the first buffer on - * the bucket's list, though, if it has a low enough priority. - */ - if ((bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)) == NULL || - bhp->ref != 0 || bhp->priority > priority) - goto next_hb; - - buffers++; - - /* Find the associated MPOOLFILE. */ - bh_mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset); - - /* If the page is dirty, pin it and write it. */ - ret = 0; - if (F_ISSET(bhp, BH_DIRTY)) { - ++bhp->ref; - ret = __memp_bhwrite(dbmp, hp, bh_mfp, bhp, 0); - --bhp->ref; - if (ret == 0) - ++c_mp->stat.st_rw_evict; - } else - ++c_mp->stat.st_ro_evict; - - /* - * If a write fails for any reason, we can't proceed. - * - * We released the hash bucket lock while doing I/O, so another - * thread may have acquired this buffer and incremented the ref - * count after we wrote it, in which case we can't have it. - * - * If there's a write error, avoid selecting this buffer again - * by making it the bucket's least-desirable buffer. - */ - if (ret != 0 || bhp->ref != 0) { - if (ret != 0 && aggressive) - __memp_bad_buffer(hp); - goto next_hb; - } - - /* - * Check to see if the buffer is the size we're looking for. - * If so, we can simply reuse it. Else, free the buffer and - * its space and keep looking. - */ - if (mfp != NULL && - mfp->stat.st_pagesize == bh_mfp->stat.st_pagesize) { - __memp_bhfree(dbmp, hp, bhp, 0); - - p = bhp; - goto found; - } - - freed_space += __db_shsizeof(bhp); - __memp_bhfree(dbmp, hp, bhp, 1); - - /* - * Unlock this hash bucket and re-acquire the region lock. If - * we're reaching here as a result of calling memp_bhfree, the - * hash bucket lock has already been discarded. - */ - if (0) { -next_hb: MUTEX_UNLOCK(dbenv, mutexp); - } - R_LOCK(dbenv, memreg); - - /* - * Retry the allocation as soon as we've freed up sufficient - * space. We're likely to have to coalesce of memory to - * satisfy the request, don't try until it's likely (possible?) - * we'll succeed. - */ - if (freed_space >= 3 * len) - goto alloc; - } - /* NOTREACHED */ -} - -/* - * __memp_bad_buffer -- - * Make the first buffer in a hash bucket the least desirable buffer. - */ -static void -__memp_bad_buffer(hp) - DB_MPOOL_HASH *hp; -{ - BH *bhp, *t_bhp; - u_int32_t priority; - - /* Remove the first buffer from the bucket. */ - bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh); - SH_TAILQ_REMOVE(&hp->hash_bucket, bhp, hq, __bh); - - /* - * Find the highest priority buffer in the bucket. Buffers are - * sorted by priority, so it's the last one in the bucket. - * - * XXX - * Should use SH_TAILQ_LAST, but I think that macro is broken. - */ - priority = bhp->priority; - for (t_bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh); - t_bhp != NULL; t_bhp = SH_TAILQ_NEXT(t_bhp, hq, __bh)) - priority = t_bhp->priority; - - /* - * Set our buffer's priority to be just as bad, and append it to - * the bucket. - */ - bhp->priority = priority; - SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, bhp, hq); - - /* Reset the hash bucket's priority. */ - hp->hash_priority = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)->priority; -} - -/* - * __memp_reset_lru -- - * Reset the cache LRU counter. - */ -static void -__memp_reset_lru(dbenv, memreg, c_mp) - DB_ENV *dbenv; - REGINFO *memreg; - MPOOL *c_mp; -{ - BH *bhp; - DB_MPOOL_HASH *hp; - int bucket; - - /* - * Update the counter so all future allocations will start at the - * bottom. - */ - c_mp->lru_count -= MPOOL_BASE_DECREMENT; - - /* Release the region lock. */ - R_UNLOCK(dbenv, memreg); - - /* Adjust the priority of every buffer in the system. */ - for (hp = R_ADDR(memreg, c_mp->htab), - bucket = 0; bucket < c_mp->htab_buckets; ++hp, ++bucket) { - /* - * Skip empty buckets. - * - * We can check for empty buckets before locking as we - * only care if the pointer is zero or non-zero. - */ - if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL) - continue; - - MUTEX_LOCK(dbenv, &hp->hash_mutex); - for (bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh); - bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) - if (bhp->priority != UINT32_T_MAX && - bhp->priority > MPOOL_BASE_DECREMENT) - bhp->priority -= MPOOL_BASE_DECREMENT; - MUTEX_UNLOCK(dbenv, &hp->hash_mutex); - } - - /* Reacquire the region lock. */ - R_LOCK(dbenv, memreg); -} - -#ifdef DIAGNOSTIC -/* - * __memp_check_order -- - * Verify the priority ordering of a hash bucket chain. - * - * PUBLIC: #ifdef DIAGNOSTIC - * PUBLIC: void __memp_check_order __P((DB_MPOOL_HASH *)); - * PUBLIC: #endif - */ -void -__memp_check_order(hp) - DB_MPOOL_HASH *hp; -{ - BH *bhp; - u_int32_t priority; - - /* - * Assumes the hash bucket is locked. - */ - if ((bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)) == NULL) - return; - - DB_ASSERT(bhp->priority == hp->hash_priority); - - for (priority = bhp->priority; - (bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) != NULL; - priority = bhp->priority) - DB_ASSERT(priority <= bhp->priority); -} -#endif diff --git a/bdb/mp/mp_bh.c b/bdb/mp/mp_bh.c deleted file mode 100644 index 85d15218abf..00000000000 --- a/bdb/mp/mp_bh.c +++ /dev/null @@ -1,646 +0,0 @@ -/*- - * See the file LICENSE for redistribution information. - * - * Copyright (c) 1996-2002 - * Sleepycat Software. All rights reserved. - */ -#include "db_config.h" - -#ifndef lint -static const char revid[] = "$Id: mp_bh.c,v 11.71 2002/09/04 19:06:45 margo Exp $"; -#endif /* not lint */ - -#ifndef NO_SYSTEM_INCLUDES -#include <sys/types.h> - -#include <string.h> -#include <unistd.h> -#endif - -#include "db_int.h" -#include "dbinc/db_shash.h" -#include "dbinc/mp.h" -#include "dbinc/log.h" -#include "dbinc/db_page.h" - -static int __memp_pgwrite - __P((DB_MPOOL *, DB_MPOOLFILE *, DB_MPOOL_HASH *, BH *)); -static int __memp_upgrade __P((DB_MPOOL *, DB_MPOOLFILE *, MPOOLFILE *)); - -/* - * __memp_bhwrite -- - * Write the page associated with a given buffer header. - * - * PUBLIC: int __memp_bhwrite __P((DB_MPOOL *, - * PUBLIC: DB_MPOOL_HASH *, MPOOLFILE *, BH *, int)); - */ -int -__memp_bhwrite(dbmp, hp, mfp, bhp, open_extents) - DB_MPOOL *dbmp; - DB_MPOOL_HASH *hp; - MPOOLFILE *mfp; - BH *bhp; - int open_extents; -{ - DB_ENV *dbenv; - DB_MPOOLFILE *dbmfp; - DB_MPREG *mpreg; - int local_open, incremented, ret; - - dbenv = dbmp->dbenv; - local_open = incremented = 0; - - /* - * If the file has been removed or is a closed temporary file, jump - * right ahead and pretend that we've found the file we want -- the - * page-write function knows how to handle the fact that we don't have - * (or need!) any real file descriptor information. - */ - if (F_ISSET(mfp, MP_DEADFILE)) { - dbmfp = NULL; - goto found; - } - - /* - * Walk the process' DB_MPOOLFILE list and find a file descriptor for - * the file. We also check that the descriptor is open for writing. - * If we find a descriptor on the file that's not open for writing, we - * try and upgrade it to make it writeable. If that fails, we're done. - */ - MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp); - for (dbmfp = TAILQ_FIRST(&dbmp->dbmfq); - dbmfp != NULL; dbmfp = TAILQ_NEXT(dbmfp, q)) - if (dbmfp->mfp == mfp) { - if (F_ISSET(dbmfp, MP_READONLY) && - !F_ISSET(dbmfp, MP_UPGRADE) && - (F_ISSET(dbmfp, MP_UPGRADE_FAIL) || - __memp_upgrade(dbmp, dbmfp, mfp))) { - MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp); - return (EPERM); - } - - /* - * Increment the reference count -- see the comment in - * __memp_fclose_int(). - */ - ++dbmfp->ref; - incremented = 1; - break; - } - MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp); - - if (dbmfp != NULL) - goto found; - - /* - * !!! - * It's the caller's choice if we're going to open extent files. - */ - if (!open_extents && F_ISSET(mfp, MP_EXTENT)) - return (EPERM); - - /* - * !!! - * Don't try to attach to temporary files. There are two problems in - * trying to do that. First, if we have different privileges than the - * process that "owns" the temporary file, we might create the backing - * disk file such that the owning process couldn't read/write its own - * buffers, e.g., memp_trickle running as root creating a file owned - * as root, mode 600. Second, if the temporary file has already been - * created, we don't have any way of finding out what its real name is, - * and, even if we did, it was already unlinked (so that it won't be - * left if the process dies horribly). This decision causes a problem, - * however: if the temporary file consumes the entire buffer cache, - * and the owner doesn't flush the buffers to disk, we could end up - * with resource starvation, and the memp_trickle thread couldn't do - * anything about it. That's a pretty unlikely scenario, though. - * - * Note we should never get here when the temporary file in question - * has already been closed in another process, in which case it should - * be marked MP_DEADFILE. - */ - if (F_ISSET(mfp, MP_TEMP)) - return (EPERM); - - /* - * It's not a page from a file we've opened. If the file requires - * input/output processing, see if this process has ever registered - * information as to how to write this type of file. If not, there's - * nothing we can do. - */ - if (mfp->ftype != 0) { - MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp); - for (mpreg = LIST_FIRST(&dbmp->dbregq); - mpreg != NULL; mpreg = LIST_NEXT(mpreg, q)) - if (mpreg->ftype == mfp->ftype) - break; - MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp); - if (mpreg == NULL) - return (EPERM); - } - - /* - * Try and open the file, attaching to the underlying shared area. - * Ignore any error, assume it's a permissions problem. - * - * XXX - * There's no negative cache, so we may repeatedly try and open files - * that we have previously tried (and failed) to open. - */ - if ((ret = dbenv->memp_fcreate(dbenv, &dbmfp, 0)) != 0) - return (ret); - if ((ret = __memp_fopen_int(dbmfp, mfp, - R_ADDR(dbmp->reginfo, mfp->path_off), - 0, 0, mfp->stat.st_pagesize)) != 0) { - (void)dbmfp->close(dbmfp, 0); - return (ret); - } - local_open = 1; - -found: ret = __memp_pgwrite(dbmp, dbmfp, hp, bhp); - - MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp); - if (incremented) - --dbmfp->ref; - else if (local_open) - F_SET(dbmfp, MP_FLUSH); - MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp); - - return (ret); -} - -/* - * __memp_pgread -- - * Read a page from a file. - * - * PUBLIC: int __memp_pgread __P((DB_MPOOLFILE *, DB_MUTEX *, BH *, int)); - */ -int -__memp_pgread(dbmfp, mutexp, bhp, can_create) - DB_MPOOLFILE *dbmfp; - DB_MUTEX *mutexp; - BH *bhp; - int can_create; -{ - DB_IO db_io; - DB_ENV *dbenv; - DB_MPOOL *dbmp; - MPOOLFILE *mfp; - size_t len, nr, pagesize; - int ret; - - dbmp = dbmfp->dbmp; - dbenv = dbmp->dbenv; - mfp = dbmfp->mfp; - pagesize = mfp->stat.st_pagesize; - - /* We should never be called with a dirty or a locked buffer. */ - DB_ASSERT(!F_ISSET(bhp, BH_DIRTY | BH_DIRTY_CREATE | BH_LOCKED)); - - /* Lock the buffer and swap the hash bucket lock for the buffer lock. */ - F_SET(bhp, BH_LOCKED | BH_TRASH); - MUTEX_LOCK(dbenv, &bhp->mutex); - MUTEX_UNLOCK(dbenv, mutexp); - - /* - * Temporary files may not yet have been created. We don't create - * them now, we create them when the pages have to be flushed. - */ - nr = 0; - if (F_ISSET(dbmfp->fhp, DB_FH_VALID)) { - db_io.fhp = dbmfp->fhp; - db_io.mutexp = dbmfp->mutexp; - db_io.pagesize = db_io.bytes = pagesize; - db_io.pgno = bhp->pgno; - db_io.buf = bhp->buf; - - /* - * The page may not exist; if it doesn't, nr may well be 0, - * but we expect the underlying OS calls not to return an - * error code in this case. - */ - if ((ret = __os_io(dbenv, &db_io, DB_IO_READ, &nr)) != 0) - goto err; - } - - if (nr < pagesize) { - /* - * Don't output error messages for short reads. In particular, - * DB recovery processing may request pages never written to - * disk or for which only some part have been written to disk, - * in which case we won't find the page. The caller must know - * how to handle the error. - */ - if (can_create == 0) { - ret = DB_PAGE_NOTFOUND; - goto err; - } - - /* Clear any bytes that need to be cleared. */ - len = mfp->clear_len == 0 ? pagesize : mfp->clear_len; - memset(bhp->buf, 0, len); - -#if defined(DIAGNOSTIC) || defined(UMRW) - /* - * If we're running in diagnostic mode, corrupt any bytes on - * the page that are unknown quantities for the caller. - */ - if (len < pagesize) - memset(bhp->buf + len, CLEAR_BYTE, pagesize - len); -#endif - ++mfp->stat.st_page_create; - } else - ++mfp->stat.st_page_in; - - /* Call any pgin function. */ - ret = mfp->ftype == 0 ? 0 : __memp_pg(dbmfp, bhp, 1); - - /* Unlock the buffer and reacquire the hash bucket lock. */ -err: MUTEX_UNLOCK(dbenv, &bhp->mutex); - MUTEX_LOCK(dbenv, mutexp); - - /* - * If no errors occurred, the data is now valid, clear the BH_TRASH - * flag; regardless, clear the lock bit and let other threads proceed. - */ - F_CLR(bhp, BH_LOCKED); - if (ret == 0) - F_CLR(bhp, BH_TRASH); - - return (ret); -} - -/* - * __memp_pgwrite -- - * Write a page to a file. - */ -static int -__memp_pgwrite(dbmp, dbmfp, hp, bhp) - DB_MPOOL *dbmp; - DB_MPOOLFILE *dbmfp; - DB_MPOOL_HASH *hp; - BH *bhp; -{ - DB_ENV *dbenv; - DB_IO db_io; - DB_LSN lsn; - MPOOLFILE *mfp; - size_t nw; - int callpgin, ret; - - dbenv = dbmp->dbenv; - mfp = dbmfp == NULL ? NULL : dbmfp->mfp; - callpgin = ret = 0; - - /* - * We should never be called with a clean or trash buffer. - * The sync code does call us with already locked buffers. - */ - DB_ASSERT(F_ISSET(bhp, BH_DIRTY)); - DB_ASSERT(!F_ISSET(bhp, BH_TRASH)); - - /* - * If we have not already traded the hash bucket lock for the buffer - * lock, do so now. - */ - if (!F_ISSET(bhp, BH_LOCKED)) { - F_SET(bhp, BH_LOCKED); - MUTEX_LOCK(dbenv, &bhp->mutex); - MUTEX_UNLOCK(dbenv, &hp->hash_mutex); - } - - /* - * It's possible that the underlying file doesn't exist, either - * because of an outright removal or because it was a temporary - * file that's been closed. - * - * !!! - * Once we pass this point, we know that dbmfp and mfp aren't NULL, - * and that we have a valid file reference. - */ - if (mfp == NULL || F_ISSET(mfp, MP_DEADFILE)) - goto file_dead; - - /* - * If the page is in a file for which we have LSN information, we have - * to ensure the appropriate log records are on disk. - */ - if (LOGGING_ON(dbenv) && mfp->lsn_off != -1) { - memcpy(&lsn, bhp->buf + mfp->lsn_off, sizeof(DB_LSN)); - if ((ret = dbenv->log_flush(dbenv, &lsn)) != 0) - goto err; - } - -#ifdef DIAGNOSTIC - /* - * Verify write-ahead logging semantics. - * - * !!! - * One special case. There is a single field on the meta-data page, - * the last-page-number-in-the-file field, for which we do not log - * changes. If the page was originally created in a database that - * didn't have logging turned on, we can see a page marked dirty but - * for which no corresponding log record has been written. However, - * the only way that a page can be created for which there isn't a - * previous log record and valid LSN is when the page was created - * without logging turned on, and so we check for that special-case - * LSN value. - */ - if (LOGGING_ON(dbenv) && !IS_NOT_LOGGED_LSN(LSN(bhp->buf))) { - /* - * There is a potential race here. If we are in the midst of - * switching log files, it's possible we could test against the - * old file and the new offset in the log region's LSN. If we - * fail the first test, acquire the log mutex and check again. - */ - DB_LOG *dblp; - LOG *lp; - - dblp = dbenv->lg_handle; - lp = dblp->reginfo.primary; - if (!IS_NOT_LOGGED_LSN(LSN(bhp->buf)) && - log_compare(&lp->s_lsn, &LSN(bhp->buf)) <= 0) { - R_LOCK(dbenv, &dblp->reginfo); - DB_ASSERT(log_compare(&lp->s_lsn, &LSN(bhp->buf)) > 0); - R_UNLOCK(dbenv, &dblp->reginfo); - } - } -#endif - - /* - * Call any pgout function. We set the callpgin flag so that we flag - * that the contents of the buffer will need to be passed through pgin - * before they are reused. - */ - if (mfp->ftype != 0) { - callpgin = 1; - if ((ret = __memp_pg(dbmfp, bhp, 0)) != 0) - goto err; - } - - /* Temporary files may not yet have been created. */ - if (!F_ISSET(dbmfp->fhp, DB_FH_VALID)) { - MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp); - ret = F_ISSET(dbmfp->fhp, DB_FH_VALID) ? 0 : - __db_appname(dbenv, DB_APP_TMP, NULL, - F_ISSET(dbenv, DB_ENV_DIRECT_DB) ? DB_OSO_DIRECT : 0, - dbmfp->fhp, NULL); - MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp); - if (ret != 0) { - __db_err(dbenv, - "unable to create temporary backing file"); - goto err; - } - } - - /* Write the page. */ - db_io.fhp = dbmfp->fhp; - db_io.mutexp = dbmfp->mutexp; - db_io.pagesize = db_io.bytes = mfp->stat.st_pagesize; - db_io.pgno = bhp->pgno; - db_io.buf = bhp->buf; - if ((ret = __os_io(dbenv, &db_io, DB_IO_WRITE, &nw)) != 0) { - __db_err(dbenv, "%s: write failed for page %lu", - __memp_fn(dbmfp), (u_long)bhp->pgno); - goto err; - } - ++mfp->stat.st_page_out; - -err: -file_dead: - /* - * !!! - * Once we pass this point, dbmfp and mfp may be NULL, we may not have - * a valid file reference. - * - * Unlock the buffer and reacquire the hash lock. - */ - MUTEX_UNLOCK(dbenv, &bhp->mutex); - MUTEX_LOCK(dbenv, &hp->hash_mutex); - - /* - * If we rewrote the page, it will need processing by the pgin - * routine before reuse. - */ - if (callpgin) - F_SET(bhp, BH_CALLPGIN); - - /* - * Update the hash bucket statistics, reset the flags. - * If we were successful, the page is no longer dirty. - */ - if (ret == 0) { - DB_ASSERT(hp->hash_page_dirty != 0); - --hp->hash_page_dirty; - - F_CLR(bhp, BH_DIRTY | BH_DIRTY_CREATE); - } - - /* Regardless, clear any sync wait-for count and remove our lock. */ - bhp->ref_sync = 0; - F_CLR(bhp, BH_LOCKED); - - return (ret); -} - -/* - * __memp_pg -- - * Call the pgin/pgout routine. - * - * PUBLIC: int __memp_pg __P((DB_MPOOLFILE *, BH *, int)); - */ -int -__memp_pg(dbmfp, bhp, is_pgin) - DB_MPOOLFILE *dbmfp; - BH *bhp; - int is_pgin; -{ - DBT dbt, *dbtp; - DB_ENV *dbenv; - DB_MPOOL *dbmp; - DB_MPREG *mpreg; - MPOOLFILE *mfp; - int ftype, ret; - - dbmp = dbmfp->dbmp; - dbenv = dbmp->dbenv; - mfp = dbmfp->mfp; - - MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp); - - ftype = mfp->ftype; - for (mpreg = LIST_FIRST(&dbmp->dbregq); - mpreg != NULL; mpreg = LIST_NEXT(mpreg, q)) { - if (ftype != mpreg->ftype) - continue; - if (mfp->pgcookie_len == 0) - dbtp = NULL; - else { - dbt.size = mfp->pgcookie_len; - dbt.data = R_ADDR(dbmp->reginfo, mfp->pgcookie_off); - dbtp = &dbt; - } - MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp); - - if (is_pgin) { - if (mpreg->pgin != NULL && - (ret = mpreg->pgin(dbenv, - bhp->pgno, bhp->buf, dbtp)) != 0) - goto err; - } else - if (mpreg->pgout != NULL && - (ret = mpreg->pgout(dbenv, - bhp->pgno, bhp->buf, dbtp)) != 0) - goto err; - break; - } - - if (mpreg == NULL) - MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp); - - return (0); - -err: MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp); - __db_err(dbenv, "%s: %s failed for page %lu", - __memp_fn(dbmfp), is_pgin ? "pgin" : "pgout", (u_long)bhp->pgno); - return (ret); -} - -/* - * __memp_bhfree -- - * Free a bucket header and its referenced data. - * - * PUBLIC: void __memp_bhfree __P((DB_MPOOL *, DB_MPOOL_HASH *, BH *, int)); - */ -void -__memp_bhfree(dbmp, hp, bhp, free_mem) - DB_MPOOL *dbmp; - DB_MPOOL_HASH *hp; - BH *bhp; - int free_mem; -{ - DB_ENV *dbenv; - MPOOL *c_mp, *mp; - MPOOLFILE *mfp; - u_int32_t n_cache; - - /* - * Assumes the hash bucket is locked and the MPOOL is not. - */ - dbenv = dbmp->dbenv; - mp = dbmp->reginfo[0].primary; - n_cache = NCACHE(mp, bhp->mf_offset, bhp->pgno); - - /* - * Delete the buffer header from the hash bucket queue and reset - * the hash bucket's priority, if necessary. - */ - SH_TAILQ_REMOVE(&hp->hash_bucket, bhp, hq, __bh); - if (bhp->priority == hp->hash_priority) - hp->hash_priority = - SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL ? - 0 : SH_TAILQ_FIRST(&hp->hash_bucket, __bh)->priority; - - /* - * Discard the hash bucket's mutex, it's no longer needed, and - * we don't want to be holding it when acquiring other locks. - */ - MUTEX_UNLOCK(dbenv, &hp->hash_mutex); - - /* - * Find the underlying MPOOLFILE and decrement its reference count. - * If this is its last reference, remove it. - */ - mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset); - MUTEX_LOCK(dbenv, &mfp->mutex); - if (--mfp->block_cnt == 0 && mfp->mpf_cnt == 0) - __memp_mf_discard(dbmp, mfp); - else - MUTEX_UNLOCK(dbenv, &mfp->mutex); - - R_LOCK(dbenv, &dbmp->reginfo[n_cache]); - - /* - * Clear the mutex this buffer recorded; requires the region lock - * be held. - */ - __db_shlocks_clear(&bhp->mutex, &dbmp->reginfo[n_cache], - (REGMAINT *)R_ADDR(&dbmp->reginfo[n_cache], mp->maint_off)); - - /* - * If we're not reusing the buffer immediately, free the buffer header - * and data for real. - */ - if (free_mem) { - __db_shalloc_free(dbmp->reginfo[n_cache].addr, bhp); - c_mp = dbmp->reginfo[n_cache].primary; - c_mp->stat.st_pages--; - } - R_UNLOCK(dbenv, &dbmp->reginfo[n_cache]); -} - -/* - * __memp_upgrade -- - * Upgrade a file descriptor from read-only to read-write. - */ -static int -__memp_upgrade(dbmp, dbmfp, mfp) - DB_MPOOL *dbmp; - DB_MPOOLFILE *dbmfp; - MPOOLFILE *mfp; -{ - DB_ENV *dbenv; - DB_FH *fhp, *tfhp; - int ret; - char *rpath; - - dbenv = dbmp->dbenv; - fhp = NULL; - rpath = NULL; - - /* - * Calculate the real name for this file and try to open it read/write. - * We know we have a valid pathname for the file because it's the only - * way we could have gotten a file descriptor of any kind. - */ - if ((ret = __os_calloc(dbenv, 1, sizeof(DB_FH), &fhp)) != 0) - goto err; - - if ((ret = __db_appname(dbenv, DB_APP_DATA, - R_ADDR(dbmp->reginfo, mfp->path_off), 0, NULL, &rpath)) != 0) - goto err; - - if (__os_open(dbenv, rpath, - F_ISSET(mfp, MP_DIRECT) ? DB_OSO_DIRECT : 0, 0, fhp) != 0) { - F_SET(dbmfp, MP_UPGRADE_FAIL); - goto err; - } - - /* - * Swap the descriptors and set the upgrade flag. - * - * XXX - * There is a race here. If another process schedules a read using the - * existing file descriptor and is swapped out before making the system - * call, this code could theoretically close the file descriptor out - * from under it. While it's very unlikely, this code should still be - * rewritten. - */ - tfhp = dbmfp->fhp; - dbmfp->fhp = fhp; - fhp = tfhp; - - (void)__os_closehandle(dbenv, fhp); - F_SET(dbmfp, MP_UPGRADE); - - ret = 0; - if (0) { -err: ret = 1; - } - if (fhp != NULL) - __os_free(dbenv, fhp); - if (rpath != NULL) - __os_free(dbenv, rpath); - - return (ret); -} diff --git a/bdb/mp/mp_fget.c b/bdb/mp/mp_fget.c deleted file mode 100644 index be0785a2184..00000000000 --- a/bdb/mp/mp_fget.c +++ /dev/null @@ -1,654 +0,0 @@ -/*- - * See the file LICENSE for redistribution information. - * - * Copyright (c) 1996-2002 - * Sleepycat Software. All rights reserved. - */ -#include "db_config.h" - -#ifndef lint -static const char revid[] = "$Id: mp_fget.c,v 11.68 2002/08/06 04:58:09 bostic Exp $"; -#endif /* not lint */ - -#ifndef NO_SYSTEM_INCLUDES -#include <sys/types.h> - -#include <string.h> -#endif - -#include "db_int.h" -#include "dbinc/db_shash.h" -#include "dbinc/mp.h" - -#ifdef HAVE_FILESYSTEM_NOTZERO -static int __memp_fs_notzero - __P((DB_ENV *, DB_MPOOLFILE *, MPOOLFILE *, db_pgno_t *)); -#endif - -/* - * __memp_fget -- - * Get a page from the file. - * - * PUBLIC: int __memp_fget - * PUBLIC: __P((DB_MPOOLFILE *, db_pgno_t *, u_int32_t, void *)); - */ -int -__memp_fget(dbmfp, pgnoaddr, flags, addrp) - DB_MPOOLFILE *dbmfp; - db_pgno_t *pgnoaddr; - u_int32_t flags; - void *addrp; -{ - enum { FIRST_FOUND, FIRST_MISS, SECOND_FOUND, SECOND_MISS } state; - BH *alloc_bhp, *bhp; - DB_ENV *dbenv; - DB_MPOOL *dbmp; - DB_MPOOL_HASH *hp; - MPOOL *c_mp, *mp; - MPOOLFILE *mfp; - roff_t mf_offset; - u_int32_t n_cache, st_hsearch; - int b_incr, extending, first, ret; - - *(void **)addrp = NULL; - - dbmp = dbmfp->dbmp; - dbenv = dbmp->dbenv; - - PANIC_CHECK(dbenv); - - mp = dbmp->reginfo[0].primary; - mfp = dbmfp->mfp; - mf_offset = R_OFFSET(dbmp->reginfo, mfp); - alloc_bhp = bhp = NULL; - hp = NULL; - b_incr = extending = ret = 0; - - /* - * Validate arguments. - * - * !!! - * Don't test for DB_MPOOL_CREATE and DB_MPOOL_NEW flags for readonly - * files here, and create non-existent pages in readonly files if the - * flags are set, later. The reason is that the hash access method - * wants to get empty pages that don't really exist in readonly files. - * The only alternative is for hash to write the last "bucket" all the - * time, which we don't want to do because one of our big goals in life - * is to keep database files small. It's sleazy as hell, but we catch - * any attempt to actually write the file in memp_fput(). - */ -#define OKFLAGS (DB_MPOOL_CREATE | DB_MPOOL_LAST | DB_MPOOL_NEW) - if (flags != 0) { - if ((ret = __db_fchk(dbenv, "memp_fget", flags, OKFLAGS)) != 0) - return (ret); - - switch (flags) { - case DB_MPOOL_CREATE: - break; - case DB_MPOOL_LAST: - /* Get the last page number in the file. */ - if (flags == DB_MPOOL_LAST) { - R_LOCK(dbenv, dbmp->reginfo); - *pgnoaddr = mfp->last_pgno; - R_UNLOCK(dbenv, dbmp->reginfo); - } - break; - case DB_MPOOL_NEW: - /* - * If always creating a page, skip the first search - * of the hash bucket. - */ - if (flags == DB_MPOOL_NEW) - goto alloc; - break; - default: - return (__db_ferr(dbenv, "memp_fget", 1)); - } - } - - /* - * If mmap'ing the file and the page is not past the end of the file, - * just return a pointer. - * - * The page may be past the end of the file, so check the page number - * argument against the original length of the file. If we previously - * returned pages past the original end of the file, last_pgno will - * have been updated to match the "new" end of the file, and checking - * against it would return pointers past the end of the mmap'd region. - * - * If another process has opened the file for writing since we mmap'd - * it, we will start playing the game by their rules, i.e. everything - * goes through the cache. All pages previously returned will be safe, - * as long as the correct locking protocol was observed. - * - * We don't discard the map because we don't know when all of the - * pages will have been discarded from the process' address space. - * It would be possible to do so by reference counting the open - * pages from the mmap, but it's unclear to me that it's worth it. - */ - if (dbmfp->addr != NULL && - F_ISSET(mfp, MP_CAN_MMAP) && *pgnoaddr <= mfp->orig_last_pgno) { - *(void **)addrp = - R_ADDR(dbmfp, *pgnoaddr * mfp->stat.st_pagesize); - ++mfp->stat.st_map; - return (0); - } - -hb_search: - /* - * Determine the cache and hash bucket where this page lives and get - * local pointers to them. Reset on each pass through this code, the - * page number can change. - */ - n_cache = NCACHE(mp, mf_offset, *pgnoaddr); - c_mp = dbmp->reginfo[n_cache].primary; - hp = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab); - hp = &hp[NBUCKET(c_mp, mf_offset, *pgnoaddr)]; - - /* Search the hash chain for the page. */ -retry: st_hsearch = 0; - MUTEX_LOCK(dbenv, &hp->hash_mutex); - for (bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh); - bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) { - ++st_hsearch; - if (bhp->pgno != *pgnoaddr || bhp->mf_offset != mf_offset) - continue; - - /* - * Increment the reference count. We may discard the hash - * bucket lock as we evaluate and/or read the buffer, so we - * need to ensure it doesn't move and its contents remain - * unchanged. - */ - if (bhp->ref == UINT16_T_MAX) { - __db_err(dbenv, - "%s: page %lu: reference count overflow", - __memp_fn(dbmfp), (u_long)bhp->pgno); - ret = EINVAL; - MUTEX_UNLOCK(dbenv, &hp->hash_mutex); - goto err; - } - ++bhp->ref; - b_incr = 1; - - /* - * BH_LOCKED -- - * I/O is in progress or sync is waiting on the buffer to write - * it. Because we've incremented the buffer reference count, - * we know the buffer can't move. Unlock the bucket lock, wait - * for the buffer to become available, reacquire the bucket. - */ - for (first = 1; F_ISSET(bhp, BH_LOCKED) && - !F_ISSET(dbenv, DB_ENV_NOLOCKING); first = 0) { - /* - * If someone is trying to sync this buffer and the - * buffer is hot, they may never get in. Give up - * and try again. - */ - if (!first && bhp->ref_sync != 0) { - --bhp->ref; - b_incr = 0; - MUTEX_UNLOCK(dbenv, &hp->hash_mutex); - __os_yield(dbenv, 1); - goto retry; - } - - MUTEX_UNLOCK(dbenv, &hp->hash_mutex); - /* - * Explicitly yield the processor if not the first pass - * through this loop -- if we don't, we might run to the - * end of our CPU quantum as we will simply be swapping - * between the two locks. - */ - if (!first) - __os_yield(dbenv, 1); - - MUTEX_LOCK(dbenv, &bhp->mutex); - /* Wait for I/O to finish... */ - MUTEX_UNLOCK(dbenv, &bhp->mutex); - MUTEX_LOCK(dbenv, &hp->hash_mutex); - } - - ++mfp->stat.st_cache_hit; - break; - } - - /* - * Update the hash bucket search statistics -- do now because our next - * search may be for a different bucket. - */ - ++c_mp->stat.st_hash_searches; - if (st_hsearch > c_mp->stat.st_hash_longest) - c_mp->stat.st_hash_longest = st_hsearch; - c_mp->stat.st_hash_examined += st_hsearch; - - /* - * There are 4 possible paths to this location: - * - * FIRST_MISS: - * Didn't find the page in the hash bucket on our first pass: - * bhp == NULL, alloc_bhp == NULL - * - * FIRST_FOUND: - * Found the page in the hash bucket on our first pass: - * bhp != NULL, alloc_bhp == NULL - * - * SECOND_FOUND: - * Didn't find the page in the hash bucket on the first pass, - * allocated space, and found the page in the hash bucket on - * our second pass: - * bhp != NULL, alloc_bhp != NULL - * - * SECOND_MISS: - * Didn't find the page in the hash bucket on the first pass, - * allocated space, and didn't find the page in the hash bucket - * on our second pass: - * bhp == NULL, alloc_bhp != NULL - */ - state = bhp == NULL ? - (alloc_bhp == NULL ? FIRST_MISS : SECOND_MISS) : - (alloc_bhp == NULL ? FIRST_FOUND : SECOND_FOUND); - switch (state) { - case FIRST_FOUND: - /* We found the buffer in our first check -- we're done. */ - break; - case FIRST_MISS: - /* - * We didn't find the buffer in our first check. Figure out - * if the page exists, and allocate structures so we can add - * the page to the buffer pool. - */ - MUTEX_UNLOCK(dbenv, &hp->hash_mutex); - -alloc: /* - * If DB_MPOOL_NEW is set, we have to allocate a page number. - * If neither DB_MPOOL_CREATE or DB_MPOOL_CREATE is set, then - * it's an error to try and get a page past the end of file. - */ - COMPQUIET(n_cache, 0); - - extending = ret = 0; - R_LOCK(dbenv, dbmp->reginfo); - switch (flags) { - case DB_MPOOL_NEW: - extending = 1; - *pgnoaddr = mfp->last_pgno + 1; - break; - case DB_MPOOL_CREATE: - extending = *pgnoaddr > mfp->last_pgno; - break; - default: - ret = *pgnoaddr > mfp->last_pgno ? DB_PAGE_NOTFOUND : 0; - break; - } - R_UNLOCK(dbenv, dbmp->reginfo); - if (ret != 0) - goto err; - - /* - * !!! - * In the DB_MPOOL_NEW code path, mf_offset and n_cache have - * not yet been initialized. - */ - mf_offset = R_OFFSET(dbmp->reginfo, mfp); - n_cache = NCACHE(mp, mf_offset, *pgnoaddr); - - /* Allocate a new buffer header and data space. */ - if ((ret = __memp_alloc(dbmp, - &dbmp->reginfo[n_cache], mfp, 0, NULL, &alloc_bhp)) != 0) - goto err; -#ifdef DIAGNOSTIC - if ((db_alignp_t)alloc_bhp->buf & (sizeof(size_t) - 1)) { - __db_err(dbenv, - "Error: buffer data is NOT size_t aligned"); - ret = EINVAL; - goto err; - } -#endif - /* - * If we are extending the file, we'll need the region lock - * again. - */ - if (extending) - R_LOCK(dbenv, dbmp->reginfo); - - /* - * DB_MPOOL_NEW does not guarantee you a page unreferenced by - * any other thread of control. (That guarantee is interesting - * for DB_MPOOL_NEW, unlike DB_MPOOL_CREATE, because the caller - * did not specify the page number, and so, may reasonably not - * have any way to lock the page outside of mpool.) Regardless, - * if we allocate the page, and some other thread of control - * requests the page by number, we will not detect that and the - * thread of control that allocated using DB_MPOOL_NEW may not - * have a chance to initialize the page. (Note: we *could* - * detect this case if we set a flag in the buffer header which - * guaranteed that no gets of the page would succeed until the - * reference count went to 0, that is, until the creating page - * put the page.) What we do guarantee is that if two threads - * of control are both doing DB_MPOOL_NEW calls, they won't - * collide, that is, they won't both get the same page. - * - * There's a possibility that another thread allocated the page - * we were planning to allocate while we were off doing buffer - * allocation. We can do that by making sure the page number - * we were going to use is still available. If it's not, then - * we check to see if the next available page number hashes to - * the same mpool region as the old one -- if it does, we can - * continue, otherwise, we have to start over. - */ - if (flags == DB_MPOOL_NEW && *pgnoaddr != mfp->last_pgno + 1) { - *pgnoaddr = mfp->last_pgno + 1; - if (n_cache != NCACHE(mp, mf_offset, *pgnoaddr)) { - __db_shalloc_free( - dbmp->reginfo[n_cache].addr, alloc_bhp); - /* - * flags == DB_MPOOL_NEW, so extending is set - * and we're holding the region locked. - */ - R_UNLOCK(dbenv, dbmp->reginfo); - - alloc_bhp = NULL; - goto alloc; - } - } - - /* - * We released the region lock, so another thread might have - * extended the file. Update the last_pgno and initialize - * the file, as necessary, if we extended the file. - */ - if (extending) { -#ifdef HAVE_FILESYSTEM_NOTZERO - if (*pgnoaddr > mfp->last_pgno && - __os_fs_notzero() && - F_ISSET(dbmfp->fhp, DB_FH_VALID)) - ret = __memp_fs_notzero( - dbenv, dbmfp, mfp, pgnoaddr); - else - ret = 0; -#endif - if (ret == 0 && *pgnoaddr > mfp->last_pgno) - mfp->last_pgno = *pgnoaddr; - - R_UNLOCK(dbenv, dbmp->reginfo); - if (ret != 0) - goto err; - } - goto hb_search; - case SECOND_FOUND: - /* - * We allocated buffer space for the requested page, but then - * found the page in the buffer cache on our second check. - * That's OK -- we can use the page we found in the pool, - * unless DB_MPOOL_NEW is set. - * - * Free the allocated memory, we no longer need it. Since we - * can't acquire the region lock while holding the hash bucket - * lock, we have to release the hash bucket and re-acquire it. - * That's OK, because we have the buffer pinned down. - */ - MUTEX_UNLOCK(dbenv, &hp->hash_mutex); - R_LOCK(dbenv, &dbmp->reginfo[n_cache]); - __db_shalloc_free(dbmp->reginfo[n_cache].addr, alloc_bhp); - alloc_bhp = NULL; - R_UNLOCK(dbenv, &dbmp->reginfo[n_cache]); - MUTEX_LOCK(dbenv, &hp->hash_mutex); - - /* - * We can't use the page we found in the pool if DB_MPOOL_NEW - * was set. (For details, see the above comment beginning - * "DB_MPOOL_NEW does not guarantee you a page unreferenced by - * any other thread of control".) If DB_MPOOL_NEW is set, we - * release our pin on this particular buffer, and try to get - * another one. - */ - if (flags == DB_MPOOL_NEW) { - --bhp->ref; - b_incr = 0; - goto alloc; - } - break; - case SECOND_MISS: - /* - * We allocated buffer space for the requested page, and found - * the page still missing on our second pass through the buffer - * cache. Instantiate the page. - */ - bhp = alloc_bhp; - alloc_bhp = NULL; - - /* - * Initialize all the BH and hash bucket fields so we can call - * __memp_bhfree if an error occurs. - * - * Append the buffer to the tail of the bucket list and update - * the hash bucket's priority. - */ - b_incr = 1; - - memset(bhp, 0, sizeof(BH)); - bhp->ref = 1; - bhp->priority = UINT32_T_MAX; - bhp->pgno = *pgnoaddr; - bhp->mf_offset = mf_offset; - SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, bhp, hq); - hp->hash_priority = - SH_TAILQ_FIRST(&hp->hash_bucket, __bh)->priority; - - /* If we extended the file, make sure the page is never lost. */ - if (extending) { - ++hp->hash_page_dirty; - F_SET(bhp, BH_DIRTY | BH_DIRTY_CREATE); - } - - /* - * If we created the page, zero it out. If we didn't create - * the page, read from the backing file. - * - * !!! - * DB_MPOOL_NEW doesn't call the pgin function. - * - * If DB_MPOOL_CREATE is used, then the application's pgin - * function has to be able to handle pages of 0's -- if it - * uses DB_MPOOL_NEW, it can detect all of its page creates, - * and not bother. - * - * If we're running in diagnostic mode, smash any bytes on the - * page that are unknown quantities for the caller. - * - * Otherwise, read the page into memory, optionally creating it - * if DB_MPOOL_CREATE is set. - */ - if (extending) { - if (mfp->clear_len == 0) - memset(bhp->buf, 0, mfp->stat.st_pagesize); - else { - memset(bhp->buf, 0, mfp->clear_len); -#if defined(DIAGNOSTIC) || defined(UMRW) - memset(bhp->buf + mfp->clear_len, CLEAR_BYTE, - mfp->stat.st_pagesize - mfp->clear_len); -#endif - } - - if (flags == DB_MPOOL_CREATE && mfp->ftype != 0) - F_SET(bhp, BH_CALLPGIN); - - ++mfp->stat.st_page_create; - } else { - F_SET(bhp, BH_TRASH); - ++mfp->stat.st_cache_miss; - } - - /* Increment buffer count referenced by MPOOLFILE. */ - MUTEX_LOCK(dbenv, &mfp->mutex); - ++mfp->block_cnt; - MUTEX_UNLOCK(dbenv, &mfp->mutex); - - /* - * Initialize the mutex. This is the last initialization step, - * because it's the only one that can fail, and everything else - * must be set up or we can't jump to the err label because it - * will call __memp_bhfree. - */ - if ((ret = __db_mutex_setup(dbenv, - &dbmp->reginfo[n_cache], &bhp->mutex, 0)) != 0) - goto err; - } - - DB_ASSERT(bhp->ref != 0); - - /* - * If we're the only reference, update buffer and bucket priorities. - * We may be about to release the hash bucket lock, and everything - * should be correct, first. (We've already done this if we created - * the buffer, so there is no need to do it again.) - */ - if (state != SECOND_MISS && bhp->ref == 1) { - bhp->priority = UINT32_T_MAX; - SH_TAILQ_REMOVE(&hp->hash_bucket, bhp, hq, __bh); - SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, bhp, hq); - hp->hash_priority = - SH_TAILQ_FIRST(&hp->hash_bucket, __bh)->priority; - } - - /* - * BH_TRASH -- - * The buffer we found may need to be filled from the disk. - * - * It's possible for the read function to fail, which means we fail as - * well. Note, the __memp_pgread() function discards and reacquires - * the hash lock, so the buffer must be pinned down so that it cannot - * move and its contents are unchanged. Discard the buffer on failure - * unless another thread is waiting on our I/O to complete. It's OK to - * leave the buffer around, as the waiting thread will see the BH_TRASH - * flag set, and will also attempt to discard it. If there's a waiter, - * we need to decrement our reference count. - */ - if (F_ISSET(bhp, BH_TRASH) && - (ret = __memp_pgread(dbmfp, - &hp->hash_mutex, bhp, LF_ISSET(DB_MPOOL_CREATE) ? 1 : 0)) != 0) - goto err; - - /* - * BH_CALLPGIN -- - * The buffer was processed for being written to disk, and now has - * to be re-converted for use. - */ - if (F_ISSET(bhp, BH_CALLPGIN)) { - if ((ret = __memp_pg(dbmfp, bhp, 1)) != 0) - goto err; - F_CLR(bhp, BH_CALLPGIN); - } - - MUTEX_UNLOCK(dbenv, &hp->hash_mutex); - -#ifdef DIAGNOSTIC - /* Update the file's pinned reference count. */ - R_LOCK(dbenv, dbmp->reginfo); - ++dbmfp->pinref; - R_UNLOCK(dbenv, dbmp->reginfo); - - /* - * We want to switch threads as often as possible, and at awkward - * times. Yield every time we get a new page to ensure contention. - */ - if (F_ISSET(dbenv, DB_ENV_YIELDCPU)) - __os_yield(dbenv, 1); -#endif - - *(void **)addrp = bhp->buf; - return (0); - -err: /* - * Discard our reference. If we're the only reference, discard the - * the buffer entirely. If we held a reference to a buffer, we are - * also still holding the hash bucket mutex. - */ - if (b_incr) { - if (bhp->ref == 1) - (void)__memp_bhfree(dbmp, hp, bhp, 1); - else { - --bhp->ref; - MUTEX_UNLOCK(dbenv, &hp->hash_mutex); - } - } - - /* If alloc_bhp is set, free the memory. */ - if (alloc_bhp != NULL) - __db_shalloc_free(dbmp->reginfo[n_cache].addr, alloc_bhp); - - return (ret); -} - -#ifdef HAVE_FILESYSTEM_NOTZERO -/* - * __memp_fs_notzero -- - * Initialize the underlying allocated pages in the file. - */ -static int -__memp_fs_notzero(dbenv, dbmfp, mfp, pgnoaddr) - DB_ENV *dbenv; - DB_MPOOLFILE *dbmfp; - MPOOLFILE *mfp; - db_pgno_t *pgnoaddr; -{ - DB_IO db_io; - u_int32_t i, npages; - size_t nw; - int ret; - u_int8_t *page; - char *fail; - - /* - * Pages allocated by writing pages past end-of-file are not zeroed, - * on some systems. Recovery could theoretically be fooled by a page - * showing up that contained garbage. In order to avoid this, we - * have to write the pages out to disk, and flush them. The reason - * for the flush is because if we don't sync, the allocation of another - * page subsequent to this one might reach the disk first, and if we - * crashed at the right moment, leave us with this page as the one - * allocated by writing a page past it in the file. - * - * Hash is the only access method that allocates groups of pages. We - * know that it will use the existence of the last page in a group to - * signify that the entire group is OK; so, write all the pages but - * the last one in the group, flush them to disk, and then write the - * last one to disk and flush it. - */ - if ((ret = __os_calloc(dbenv, 1, mfp->stat.st_pagesize, &page)) != 0) - return (ret); - - db_io.fhp = dbmfp->fhp; - db_io.mutexp = dbmfp->mutexp; - db_io.pagesize = db_io.bytes = mfp->stat.st_pagesize; - db_io.buf = page; - - npages = *pgnoaddr - mfp->last_pgno; - for (i = 1; i < npages; ++i) { - db_io.pgno = mfp->last_pgno + i; - if ((ret = __os_io(dbenv, &db_io, DB_IO_WRITE, &nw)) != 0) { - fail = "write"; - goto err; - } - } - if (i != 1 && (ret = __os_fsync(dbenv, dbmfp->fhp)) != 0) { - fail = "sync"; - goto err; - } - - db_io.pgno = mfp->last_pgno + npages; - if ((ret = __os_io(dbenv, &db_io, DB_IO_WRITE, &nw)) != 0) { - fail = "write"; - goto err; - } - if ((ret = __os_fsync(dbenv, dbmfp->fhp)) != 0) { - fail = "sync"; -err: __db_err(dbenv, "%s: %s failed for page %lu", - __memp_fn(dbmfp), fail, (u_long)db_io.pgno); - } - - __os_free(dbenv, page); - return (ret); -} -#endif diff --git a/bdb/mp/mp_fopen.c b/bdb/mp/mp_fopen.c deleted file mode 100644 index 8fdefb0f5e9..00000000000 --- a/bdb/mp/mp_fopen.c +++ /dev/null @@ -1,1018 +0,0 @@ -/*- - * See the file LICENSE for redistribution information. - * - * Copyright (c) 1996-2002 - * Sleepycat Software. All rights reserved. - */ -#include "db_config.h" - -#ifndef lint -static const char revid[] = "$Id: mp_fopen.c,v 11.90 2002/08/26 15:22:01 bostic Exp $"; -#endif /* not lint */ - -#ifndef NO_SYSTEM_INCLUDES -#include <sys/types.h> - -#include <string.h> -#endif - -#include "db_int.h" -#include "dbinc/db_shash.h" -#include "dbinc/mp.h" - -static int __memp_fclose __P((DB_MPOOLFILE *, u_int32_t)); -static int __memp_fopen __P((DB_MPOOLFILE *, - const char *, u_int32_t, int, size_t)); -static void __memp_get_fileid __P((DB_MPOOLFILE *, u_int8_t *)); -static void __memp_last_pgno __P((DB_MPOOLFILE *, db_pgno_t *)); -static void __memp_refcnt __P((DB_MPOOLFILE *, db_pgno_t *)); -static int __memp_set_clear_len __P((DB_MPOOLFILE *, u_int32_t)); -static int __memp_set_fileid __P((DB_MPOOLFILE *, u_int8_t *)); -static int __memp_set_ftype __P((DB_MPOOLFILE *, int)); -static int __memp_set_lsn_offset __P((DB_MPOOLFILE *, int32_t)); -static int __memp_set_pgcookie __P((DB_MPOOLFILE *, DBT *)); -static int __memp_set_priority __P((DB_MPOOLFILE *, DB_CACHE_PRIORITY)); -static void __memp_set_unlink __P((DB_MPOOLFILE *, int)); - -/* Initialization methods cannot be called after open is called. */ -#define MPF_ILLEGAL_AFTER_OPEN(dbmfp, name) \ - if (F_ISSET(dbmfp, MP_OPEN_CALLED)) \ - return (__db_mi_open((dbmfp)->dbmp->dbenv, name, 1)); - -/* - * __memp_fcreate -- - * Create a DB_MPOOLFILE handle. - * - * PUBLIC: int __memp_fcreate __P((DB_ENV *, DB_MPOOLFILE **, u_int32_t)); - */ -int -__memp_fcreate(dbenv, retp, flags) - DB_ENV *dbenv; - DB_MPOOLFILE **retp; - u_int32_t flags; -{ - DB_MPOOL *dbmp; - DB_MPOOLFILE *dbmfp; - int ret; - - PANIC_CHECK(dbenv); - ENV_REQUIRES_CONFIG(dbenv, - dbenv->mp_handle, "memp_fcreate", DB_INIT_MPOOL); - - dbmp = dbenv->mp_handle; - - /* Validate arguments. */ - if ((ret = __db_fchk(dbenv, "memp_fcreate", flags, 0)) != 0) - return (ret); - - /* Allocate and initialize the per-process structure. */ - if ((ret = __os_calloc(dbenv, 1, sizeof(DB_MPOOLFILE), &dbmfp)) != 0) - return (ret); - if ((ret = __os_calloc(dbenv, 1, sizeof(DB_FH), &dbmfp->fhp)) != 0) - goto err; - - /* Allocate and initialize a mutex if necessary. */ - if (F_ISSET(dbenv, DB_ENV_THREAD) && - (ret = __db_mutex_setup(dbenv, dbmp->reginfo, &dbmfp->mutexp, - MUTEX_ALLOC | MUTEX_THREAD)) != 0) - goto err; - - dbmfp->ref = 1; - dbmfp->lsn_offset = -1; - dbmfp->dbmp = dbmp; - dbmfp->mfp = INVALID_ROFF; - - dbmfp->close = __memp_fclose; - dbmfp->get = __memp_fget; - dbmfp->get_fileid = __memp_get_fileid; - dbmfp->last_pgno = __memp_last_pgno; - dbmfp->open = __memp_fopen; - dbmfp->put = __memp_fput; - dbmfp->refcnt = __memp_refcnt; - dbmfp->set = __memp_fset; - dbmfp->set_clear_len = __memp_set_clear_len; - dbmfp->set_fileid = __memp_set_fileid; - dbmfp->set_ftype = __memp_set_ftype; - dbmfp->set_lsn_offset = __memp_set_lsn_offset; - dbmfp->set_pgcookie = __memp_set_pgcookie; - dbmfp->set_priority = __memp_set_priority; - dbmfp->set_unlink = __memp_set_unlink; - dbmfp->sync = __memp_fsync; - - *retp = dbmfp; - return (0); - -err: if (dbmfp != NULL) { - if (dbmfp->fhp != NULL) - (void)__os_free(dbenv, dbmfp->fhp); - (void)__os_free(dbenv, dbmfp); - } - return (ret); -} - -/* - * __memp_set_clear_len -- - * Set the clear length. - */ -static int -__memp_set_clear_len(dbmfp, clear_len) - DB_MPOOLFILE *dbmfp; - u_int32_t clear_len; -{ - MPF_ILLEGAL_AFTER_OPEN(dbmfp, "set_clear_len"); - - dbmfp->clear_len = clear_len; - return (0); -} - -/* - * __memp_set_fileid -- - * Set the file ID. - */ -static int -__memp_set_fileid(dbmfp, fileid) - DB_MPOOLFILE *dbmfp; - u_int8_t *fileid; -{ - MPF_ILLEGAL_AFTER_OPEN(dbmfp, "set_fileid"); - - /* - * XXX - * This is dangerous -- we're saving the caller's pointer instead - * of allocating memory and copying the contents. - */ - dbmfp->fileid = fileid; - return (0); -} - -/* - * __memp_set_ftype -- - * Set the file type (as registered). - */ -static int -__memp_set_ftype(dbmfp, ftype) - DB_MPOOLFILE *dbmfp; - int ftype; -{ - MPF_ILLEGAL_AFTER_OPEN(dbmfp, "set_ftype"); - - dbmfp->ftype = ftype; - return (0); -} - -/* - * __memp_set_lsn_offset -- - * Set the page's LSN offset. - */ -static int -__memp_set_lsn_offset(dbmfp, lsn_offset) - DB_MPOOLFILE *dbmfp; - int32_t lsn_offset; -{ - MPF_ILLEGAL_AFTER_OPEN(dbmfp, "set_lsn_offset"); - - dbmfp->lsn_offset = lsn_offset; - return (0); -} - -/* - * __memp_set_pgcookie -- - * Set the pgin/pgout cookie. - */ -static int -__memp_set_pgcookie(dbmfp, pgcookie) - DB_MPOOLFILE *dbmfp; - DBT *pgcookie; -{ - MPF_ILLEGAL_AFTER_OPEN(dbmfp, "set_pgcookie"); - - dbmfp->pgcookie = pgcookie; - return (0); -} - -/* - * __memp_set_priority -- - * Set the cache priority for pages from this file. - */ -static int -__memp_set_priority(dbmfp, priority) - DB_MPOOLFILE *dbmfp; - DB_CACHE_PRIORITY priority; -{ - switch (priority) { - case DB_PRIORITY_VERY_LOW: - dbmfp->mfp->priority = MPOOL_PRI_VERY_LOW; - break; - case DB_PRIORITY_LOW: - dbmfp->mfp->priority = MPOOL_PRI_LOW; - break; - case DB_PRIORITY_DEFAULT: - dbmfp->mfp->priority = MPOOL_PRI_DEFAULT; - break; - case DB_PRIORITY_HIGH: - dbmfp->mfp->priority = MPOOL_PRI_HIGH; - break; - case DB_PRIORITY_VERY_HIGH: - dbmfp->mfp->priority = MPOOL_PRI_VERY_HIGH; - break; - default: - __db_err(dbmfp->dbmp->dbenv, - "Unknown priority value: %d", priority); - return (EINVAL); - } - - return (0); -} - -/* - * __memp_fopen -- - * Open a backing file for the memory pool. - */ -static int -__memp_fopen(dbmfp, path, flags, mode, pagesize) - DB_MPOOLFILE *dbmfp; - const char *path; - u_int32_t flags; - int mode; - size_t pagesize; -{ - DB_ENV *dbenv; - DB_MPOOL *dbmp; - int ret; - - dbmp = dbmfp->dbmp; - dbenv = dbmp->dbenv; - - PANIC_CHECK(dbenv); - - /* Validate arguments. */ - if ((ret = __db_fchk(dbenv, "memp_fopen", flags, - DB_CREATE | DB_DIRECT | DB_EXTENT | - DB_NOMMAP | DB_ODDFILESIZE | DB_RDONLY | DB_TRUNCATE)) != 0) - return (ret); - - /* - * Require a non-zero, power-of-two pagesize, smaller than the - * clear length. - */ - if (pagesize == 0 || !POWER_OF_TWO(pagesize)) { - __db_err(dbenv, - "memp_fopen: page sizes must be a power-of-2"); - return (EINVAL); - } - if (dbmfp->clear_len > pagesize) { - __db_err(dbenv, - "memp_fopen: clear length larger than page size"); - return (EINVAL); - } - - /* Read-only checks, and local flag. */ - if (LF_ISSET(DB_RDONLY) && path == NULL) { - __db_err(dbenv, - "memp_fopen: temporary files can't be readonly"); - return (EINVAL); - } - - return (__memp_fopen_int(dbmfp, NULL, path, flags, mode, pagesize)); -} - -/* - * __memp_fopen_int -- - * Open a backing file for the memory pool; internal version. - * - * PUBLIC: int __memp_fopen_int __P((DB_MPOOLFILE *, - * PUBLIC: MPOOLFILE *, const char *, u_int32_t, int, size_t)); - */ -int -__memp_fopen_int(dbmfp, mfp, path, flags, mode, pagesize) - DB_MPOOLFILE *dbmfp; - MPOOLFILE *mfp; - const char *path; - u_int32_t flags; - int mode; - size_t pagesize; -{ - DB_ENV *dbenv; - DB_MPOOL *dbmp; - MPOOL *mp; - db_pgno_t last_pgno; - size_t maxmap; - u_int32_t mbytes, bytes, oflags; - int mfp_alloc, ret; - u_int8_t idbuf[DB_FILE_ID_LEN]; - char *rpath; - void *p; - - dbmp = dbmfp->dbmp; - dbenv = dbmp->dbenv; - mp = dbmp->reginfo[0].primary; - mfp_alloc = ret = 0; - rpath = NULL; - - /* - * Set the page size so os_open can decide whether to turn buffering - * off if the DB_DIRECT_DB flag is set. - */ - dbmfp->fhp->pagesize = (u_int32_t)pagesize; - - /* - * If it's a temporary file, delay the open until we actually need - * to write the file, and we know we can't join any existing files. - */ - if (path == NULL) - goto alloc; - - /* - * Get the real name for this file and open it. If it's a Queue extent - * file, it may not exist, and that's OK. - */ - oflags = 0; - if (LF_ISSET(DB_CREATE)) - oflags |= DB_OSO_CREATE; - if (LF_ISSET(DB_DIRECT)) - oflags |= DB_OSO_DIRECT; - if (LF_ISSET(DB_RDONLY)) { - F_SET(dbmfp, MP_READONLY); - oflags |= DB_OSO_RDONLY; - } - if ((ret = - __db_appname(dbenv, DB_APP_DATA, path, 0, NULL, &rpath)) != 0) - goto err; - if ((ret = __os_open(dbenv, rpath, oflags, mode, dbmfp->fhp)) != 0) { - if (!LF_ISSET(DB_EXTENT)) - __db_err(dbenv, "%s: %s", rpath, db_strerror(ret)); - goto err; - } - - /* - * Figure out the file's size. - * - * !!! - * We can't use off_t's here, or in any code in the mainline library - * for that matter. (We have to use them in the os stubs, of course, - * as there are system calls that take them as arguments.) The reason - * is some customers build in environments where an off_t is 32-bits, - * but still run where offsets are 64-bits, and they pay us a lot of - * money. - */ - if ((ret = __os_ioinfo( - dbenv, rpath, dbmfp->fhp, &mbytes, &bytes, NULL)) != 0) { - __db_err(dbenv, "%s: %s", rpath, db_strerror(ret)); - goto err; - } - - /* - * Get the file id if we weren't given one. Generated file id's - * don't use timestamps, otherwise there'd be no chance of any - * other process joining the party. - */ - if (dbmfp->fileid == NULL) { - if ((ret = __os_fileid(dbenv, rpath, 0, idbuf)) != 0) - goto err; - dbmfp->fileid = idbuf; - } - - /* - * If our caller knows what mfp we're using, increment the ref count, - * no need to search. - * - * We don't need to acquire a lock other than the mfp itself, because - * we know there's another reference and it's not going away. - */ - if (mfp != NULL) { - MUTEX_LOCK(dbenv, &mfp->mutex); - ++mfp->mpf_cnt; - MUTEX_UNLOCK(dbenv, &mfp->mutex); - goto check_map; - } - - /* - * If not creating a temporary file, walk the list of MPOOLFILE's, - * looking for a matching file. Files backed by temporary files - * or previously removed files can't match. - * - * DB_TRUNCATE support. - * - * The fileID is a filesystem unique number (e.g., a UNIX dev/inode - * pair) plus a timestamp. If files are removed and created in less - * than a second, the fileID can be repeated. The problem with - * repetition happens when the file that previously had the fileID - * value still has pages in the pool, since we don't want to use them - * to satisfy requests for the new file. - * - * Because the DB_TRUNCATE flag reuses the dev/inode pair, repeated - * opens with that flag set guarantees matching fileIDs when the - * machine can open a file and then re-open with truncate within a - * second. For this reason, we pass that flag down, and, if we find - * a matching entry, we ensure that it's never found again, and we - * create a new entry for the current request. - */ - R_LOCK(dbenv, dbmp->reginfo); - for (mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile); - mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) { - /* Skip dead files and temporary files. */ - if (F_ISSET(mfp, MP_DEADFILE | MP_TEMP)) - continue; - - /* Skip non-matching files. */ - if (memcmp(dbmfp->fileid, R_ADDR(dbmp->reginfo, - mfp->fileid_off), DB_FILE_ID_LEN) != 0) - continue; - - /* - * If the file is being truncated, remove it from the system - * and create a new entry. - * - * !!! - * We should be able to set mfp to NULL and break out of the - * loop, but I like the idea of checking all the entries. - */ - if (LF_ISSET(DB_TRUNCATE)) { - MUTEX_LOCK(dbenv, &mfp->mutex); - MPOOLFILE_IGNORE(mfp); - MUTEX_UNLOCK(dbenv, &mfp->mutex); - continue; - } - - /* - * Some things about a file cannot be changed: the clear length, - * page size, or lSN location. - * - * The file type can change if the application's pre- and post- - * processing needs change. For example, an application that - * created a hash subdatabase in a database that was previously - * all btree. - * - * XXX - * We do not check to see if the pgcookie information changed, - * or update it if it is, this might be a bug. - */ - if (dbmfp->clear_len != mfp->clear_len || - pagesize != mfp->stat.st_pagesize || - dbmfp->lsn_offset != mfp->lsn_off) { - __db_err(dbenv, - "%s: clear length, page size or LSN location changed", - path); - R_UNLOCK(dbenv, dbmp->reginfo); - ret = EINVAL; - goto err; - } - - if (dbmfp->ftype != 0) - mfp->ftype = dbmfp->ftype; - - MUTEX_LOCK(dbenv, &mfp->mutex); - ++mfp->mpf_cnt; - MUTEX_UNLOCK(dbenv, &mfp->mutex); - break; - } - R_UNLOCK(dbenv, dbmp->reginfo); - - if (mfp != NULL) - goto check_map; - -alloc: /* Allocate and initialize a new MPOOLFILE. */ - if ((ret = __memp_alloc( - dbmp, dbmp->reginfo, NULL, sizeof(MPOOLFILE), NULL, &mfp)) != 0) - goto err; - mfp_alloc = 1; - memset(mfp, 0, sizeof(MPOOLFILE)); - mfp->mpf_cnt = 1; - mfp->ftype = dbmfp->ftype; - mfp->stat.st_pagesize = pagesize; - mfp->lsn_off = dbmfp->lsn_offset; - mfp->clear_len = dbmfp->clear_len; - - if (LF_ISSET(DB_DIRECT)) - F_SET(mfp, MP_DIRECT); - if (LF_ISSET(DB_EXTENT)) - F_SET(mfp, MP_EXTENT); - F_SET(mfp, MP_CAN_MMAP); - - if (path == NULL) - F_SET(mfp, MP_TEMP); - else { - /* - * Don't permit files that aren't a multiple of the pagesize, - * and find the number of the last page in the file, all the - * time being careful not to overflow 32 bits. - * - * During verify or recovery, we might have to cope with a - * truncated file; if the file size is not a multiple of the - * page size, round down to a page, we'll take care of the - * partial page outside the mpool system. - */ - if (bytes % pagesize != 0) { - if (LF_ISSET(DB_ODDFILESIZE)) - bytes -= (u_int32_t)(bytes % pagesize); - else { - __db_err(dbenv, - "%s: file size not a multiple of the pagesize", rpath); - ret = EINVAL; - goto err; - } - } - - /* - * If the user specifies DB_MPOOL_LAST or DB_MPOOL_NEW on a - * page get, we have to increment the last page in the file. - * Figure it out and save it away. - * - * Note correction: page numbers are zero-based, not 1-based. - */ - last_pgno = (db_pgno_t)(mbytes * (MEGABYTE / pagesize)); - last_pgno += (db_pgno_t)(bytes / pagesize); - if (last_pgno != 0) - --last_pgno; - mfp->orig_last_pgno = mfp->last_pgno = last_pgno; - - /* Copy the file path into shared memory. */ - if ((ret = __memp_alloc(dbmp, dbmp->reginfo, - NULL, strlen(path) + 1, &mfp->path_off, &p)) != 0) - goto err; - memcpy(p, path, strlen(path) + 1); - - /* Copy the file identification string into shared memory. */ - if ((ret = __memp_alloc(dbmp, dbmp->reginfo, - NULL, DB_FILE_ID_LEN, &mfp->fileid_off, &p)) != 0) - goto err; - memcpy(p, dbmfp->fileid, DB_FILE_ID_LEN); - } - - /* Copy the page cookie into shared memory. */ - if (dbmfp->pgcookie == NULL || dbmfp->pgcookie->size == 0) { - mfp->pgcookie_len = 0; - mfp->pgcookie_off = 0; - } else { - if ((ret = __memp_alloc(dbmp, dbmp->reginfo, - NULL, dbmfp->pgcookie->size, &mfp->pgcookie_off, &p)) != 0) - goto err; - memcpy(p, dbmfp->pgcookie->data, dbmfp->pgcookie->size); - mfp->pgcookie_len = dbmfp->pgcookie->size; - } - - /* - * Prepend the MPOOLFILE to the list of MPOOLFILE's. - */ - R_LOCK(dbenv, dbmp->reginfo); - ret = __db_mutex_setup(dbenv, dbmp->reginfo, &mfp->mutex, - MUTEX_NO_RLOCK); - if (ret == 0) - SH_TAILQ_INSERT_HEAD(&mp->mpfq, mfp, q, __mpoolfile); - R_UNLOCK(dbenv, dbmp->reginfo); - if (ret != 0) - goto err; - -check_map: - /* - * If a file: - * + isn't temporary - * + is read-only - * + doesn't require any pgin/pgout support - * + the DB_NOMMAP flag wasn't set (in either the file open or - * the environment in which it was opened) - * + and is less than mp_mmapsize bytes in size - * - * we can mmap it instead of reading/writing buffers. Don't do error - * checking based on the mmap call failure. We want to do normal I/O - * on the file if the reason we failed was because the file was on an - * NFS mounted partition, and we can fail in buffer I/O just as easily - * as here. - * - * We'd like to test to see if the file is too big to mmap. Since we - * don't know what size or type off_t's or size_t's are, or the largest - * unsigned integral type is, or what random insanity the local C - * compiler will perpetrate, doing the comparison in a portable way is - * flatly impossible. Hope that mmap fails if the file is too large. - */ -#define DB_MAXMMAPSIZE (10 * 1024 * 1024) /* 10 MB. */ - if (F_ISSET(mfp, MP_CAN_MMAP)) { - if (path == NULL) - F_CLR(mfp, MP_CAN_MMAP); - if (!F_ISSET(dbmfp, MP_READONLY)) - F_CLR(mfp, MP_CAN_MMAP); - if (dbmfp->ftype != 0) - F_CLR(mfp, MP_CAN_MMAP); - if (LF_ISSET(DB_NOMMAP) || F_ISSET(dbenv, DB_ENV_NOMMAP)) - F_CLR(mfp, MP_CAN_MMAP); - maxmap = dbenv->mp_mmapsize == 0 ? - DB_MAXMMAPSIZE : dbenv->mp_mmapsize; - if (mbytes > maxmap / MEGABYTE || - (mbytes == maxmap / MEGABYTE && bytes >= maxmap % MEGABYTE)) - F_CLR(mfp, MP_CAN_MMAP); - - dbmfp->addr = NULL; - if (F_ISSET(mfp, MP_CAN_MMAP)) { - dbmfp->len = (size_t)mbytes * MEGABYTE + bytes; - if (__os_mapfile(dbenv, rpath, - dbmfp->fhp, dbmfp->len, 1, &dbmfp->addr) != 0) { - dbmfp->addr = NULL; - F_CLR(mfp, MP_CAN_MMAP); - } - } - } - - dbmfp->mfp = mfp; - - F_SET(dbmfp, MP_OPEN_CALLED); - - /* Add the file to the process' list of DB_MPOOLFILEs. */ - MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp); - TAILQ_INSERT_TAIL(&dbmp->dbmfq, dbmfp, q); - MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp); - - if (0) { -err: if (F_ISSET(dbmfp->fhp, DB_FH_VALID)) - (void)__os_closehandle(dbenv, dbmfp->fhp); - - if (mfp_alloc) { - R_LOCK(dbenv, dbmp->reginfo); - if (mfp->path_off != 0) - __db_shalloc_free(dbmp->reginfo[0].addr, - R_ADDR(dbmp->reginfo, mfp->path_off)); - if (mfp->fileid_off != 0) - __db_shalloc_free(dbmp->reginfo[0].addr, - R_ADDR(dbmp->reginfo, mfp->fileid_off)); - __db_shalloc_free(dbmp->reginfo[0].addr, mfp); - R_UNLOCK(dbenv, dbmp->reginfo); - } - - } - if (rpath != NULL) - __os_free(dbenv, rpath); - return (ret); -} - -/* - * __memp_get_fileid -- - * Return the file ID. - * - * XXX - * Undocumented interface: DB private. - */ -static void -__memp_get_fileid(dbmfp, fidp) - DB_MPOOLFILE *dbmfp; - u_int8_t *fidp; -{ - /* - * No lock needed -- we're using the handle, it had better not - * be going away. - * - * !!! - * Get the fileID out of the region, not out of the DB_MPOOLFILE - * structure because the DB_MPOOLFILE reference is possibly short - * lived, and isn't to be trusted. - */ - memcpy(fidp, R_ADDR( - dbmfp->dbmp->reginfo, dbmfp->mfp->fileid_off), DB_FILE_ID_LEN); -} - -/* - * __memp_last_pgno -- - * Return the page number of the last page in the file. - * - * XXX - * Undocumented interface: DB private. - */ -static void -__memp_last_pgno(dbmfp, pgnoaddr) - DB_MPOOLFILE *dbmfp; - db_pgno_t *pgnoaddr; -{ - DB_ENV *dbenv; - DB_MPOOL *dbmp; - - dbmp = dbmfp->dbmp; - dbenv = dbmp->dbenv; - - R_LOCK(dbenv, dbmp->reginfo); - *pgnoaddr = dbmfp->mfp->last_pgno; - R_UNLOCK(dbenv, dbmp->reginfo); -} - -/* - * __memp_refcnt -- - * Return the current reference count. - * - * XXX - * Undocumented interface: DB private. - */ -static void -__memp_refcnt(dbmfp, cntp) - DB_MPOOLFILE *dbmfp; - db_pgno_t *cntp; -{ - DB_ENV *dbenv; - - dbenv = dbmfp->dbmp->dbenv; - - MUTEX_LOCK(dbenv, &dbmfp->mfp->mutex); - *cntp = dbmfp->mfp->mpf_cnt; - MUTEX_UNLOCK(dbenv, &dbmfp->mfp->mutex); -} - -/* - * __memp_set_unlink -- - * Set unlink on last close flag. - * - * XXX - * Undocumented interface: DB private. - */ -static void -__memp_set_unlink(dbmpf, set) - DB_MPOOLFILE *dbmpf; - int set; -{ - DB_ENV *dbenv; - - dbenv = dbmpf->dbmp->dbenv; - - MUTEX_LOCK(dbenv, &dbmpf->mfp->mutex); - if (set) - F_SET(dbmpf->mfp, MP_UNLINK); - else - F_CLR(dbmpf->mfp, MP_UNLINK); - MUTEX_UNLOCK(dbenv, &dbmpf->mfp->mutex); -} - -/* - * memp_fclose -- - * Close a backing file for the memory pool. - */ -static int -__memp_fclose(dbmfp, flags) - DB_MPOOLFILE *dbmfp; - u_int32_t flags; -{ - DB_ENV *dbenv; - int ret, t_ret; - - dbenv = dbmfp->dbmp->dbenv; - - PANIC_CHECK(dbenv); - - /* - * XXX - * DB_MPOOL_DISCARD: Undocumented flag: DB private. - */ - ret = __db_fchk(dbenv, "DB_MPOOLFILE->close", flags, DB_MPOOL_DISCARD); - - if ((t_ret = __memp_fclose_int(dbmfp, flags)) != 0 && ret == 0) - ret = t_ret; - - return (ret); -} - -/* - * __memp_fclose_int -- - * Internal version of __memp_fclose. - * - * PUBLIC: int __memp_fclose_int __P((DB_MPOOLFILE *, u_int32_t)); - */ -int -__memp_fclose_int(dbmfp, flags) - DB_MPOOLFILE *dbmfp; - u_int32_t flags; -{ - DB_ENV *dbenv; - DB_MPOOL *dbmp; - MPOOLFILE *mfp; - char *rpath; - int deleted, ret, t_ret; - - dbmp = dbmfp->dbmp; - dbenv = dbmp->dbenv; - ret = 0; - - /* - * We have to reference count DB_MPOOLFILE structures as other threads - * in the process may be using them. Here's the problem: - * - * Thread A opens a database. - * Thread B uses thread A's DB_MPOOLFILE to write a buffer - * in order to free up memory in the mpool cache. - * Thread A closes the database while thread B is using the - * DB_MPOOLFILE structure. - * - * By opening all databases before creating any threads, and closing - * the databases after all the threads have exited, applications get - * better performance and avoid the problem path entirely. - * - * Regardless, holding the DB_MPOOLFILE to flush a dirty buffer is a - * short-term lock, even in worst case, since we better be the only - * thread of control using the DB_MPOOLFILE structure to read pages - * *into* the cache. Wait until we're the only reference holder and - * remove the DB_MPOOLFILE structure from the list, so nobody else can - * find it. We do this, rather than have the last reference holder - * (whoever that might be) discard the DB_MPOOLFILE structure, because - * we'd rather write error messages to the application in the close - * routine, not in the checkpoint/sync routine. - * - * !!! - * It's possible the DB_MPOOLFILE was never added to the DB_MPOOLFILE - * file list, check the DB_OPEN_CALLED flag to be sure. - */ - for (deleted = 0;;) { - MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp); - if (dbmfp->ref == 1) { - if (F_ISSET(dbmfp, MP_OPEN_CALLED)) - TAILQ_REMOVE(&dbmp->dbmfq, dbmfp, q); - deleted = 1; - } - MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp); - - if (deleted) - break; - __os_sleep(dbenv, 1, 0); - } - - /* Complain if pinned blocks never returned. */ - if (dbmfp->pinref != 0) { - __db_err(dbenv, "%s: close: %lu blocks left pinned", - __memp_fn(dbmfp), (u_long)dbmfp->pinref); - ret = __db_panic(dbenv, DB_RUNRECOVERY); - } - - /* Discard any mmap information. */ - if (dbmfp->addr != NULL && - (ret = __os_unmapfile(dbenv, dbmfp->addr, dbmfp->len)) != 0) - __db_err(dbenv, "%s: %s", __memp_fn(dbmfp), db_strerror(ret)); - - /* Close the file; temporary files may not yet have been created. */ - if (F_ISSET(dbmfp->fhp, DB_FH_VALID) && - (t_ret = __os_closehandle(dbenv, dbmfp->fhp)) != 0) { - __db_err(dbenv, "%s: %s", __memp_fn(dbmfp), db_strerror(t_ret)); - if (ret == 0) - ret = t_ret; - } - - /* Discard the thread mutex. */ - if (dbmfp->mutexp != NULL) - __db_mutex_free(dbenv, dbmp->reginfo, dbmfp->mutexp); - - /* - * Discard our reference on the the underlying MPOOLFILE, and close - * it if it's no longer useful to anyone. It possible the open of - * the file never happened or wasn't successful, in which case, mpf - * will be NULL; - */ - if ((mfp = dbmfp->mfp) == NULL) - goto done; - - /* - * If it's a temp file, all outstanding references belong to unflushed - * buffers. (A temp file can only be referenced by one DB_MPOOLFILE). - * We don't care about preserving any of those buffers, so mark the - * MPOOLFILE as dead so that even the dirty ones just get discarded - * when we try to flush them. - */ - deleted = 0; - MUTEX_LOCK(dbenv, &mfp->mutex); - if (--mfp->mpf_cnt == 0 || LF_ISSET(DB_MPOOL_DISCARD)) { - if (LF_ISSET(DB_MPOOL_DISCARD) || - F_ISSET(mfp, MP_TEMP | MP_UNLINK)) - MPOOLFILE_IGNORE(mfp); - if (F_ISSET(mfp, MP_UNLINK)) { - if ((t_ret = __db_appname(dbmp->dbenv, - DB_APP_DATA, R_ADDR(dbmp->reginfo, - mfp->path_off), 0, NULL, &rpath)) != 0 && ret == 0) - ret = t_ret; - if (t_ret == 0) { - if ((t_ret = __os_unlink( - dbmp->dbenv, rpath) != 0) && ret == 0) - ret = t_ret; - __os_free(dbenv, rpath); - } - } - if (mfp->block_cnt == 0) { - if ((t_ret = - __memp_mf_discard(dbmp, mfp)) != 0 && ret == 0) - ret = t_ret; - deleted = 1; - } - } - if (deleted == 0) - MUTEX_UNLOCK(dbenv, &mfp->mutex); - - /* Discard the DB_MPOOLFILE structure. */ -done: __os_free(dbenv, dbmfp->fhp); - __os_free(dbenv, dbmfp); - - return (ret); -} - -/* - * __memp_mf_discard -- - * Discard an MPOOLFILE. - * - * PUBLIC: int __memp_mf_discard __P((DB_MPOOL *, MPOOLFILE *)); - */ -int -__memp_mf_discard(dbmp, mfp) - DB_MPOOL *dbmp; - MPOOLFILE *mfp; -{ - DB_ENV *dbenv; - DB_FH fh; - DB_MPOOL_STAT *sp; - MPOOL *mp; - char *rpath; - int ret; - - dbenv = dbmp->dbenv; - mp = dbmp->reginfo[0].primary; - ret = 0; - - /* - * Expects caller to be holding the MPOOLFILE mutex. - * - * When discarding a file, we have to flush writes from it to disk. - * The scenario is that dirty buffers from this file need to be - * flushed to satisfy a future checkpoint, but when the checkpoint - * calls mpool sync, the sync code won't know anything about them. - */ - if (!F_ISSET(mfp, MP_DEADFILE) && - (ret = __db_appname(dbenv, DB_APP_DATA, - R_ADDR(dbmp->reginfo, mfp->path_off), 0, NULL, &rpath)) == 0) { - if ((ret = __os_open(dbenv, rpath, 0, 0, &fh)) == 0) { - ret = __os_fsync(dbenv, &fh); - (void)__os_closehandle(dbenv, &fh); - } - __os_free(dbenv, rpath); - } - - /* - * We have to release the MPOOLFILE lock before acquiring the region - * lock so that we don't deadlock. Make sure nobody ever looks at - * this structure again. - */ - MPOOLFILE_IGNORE(mfp); - - /* Discard the mutex we're holding. */ - MUTEX_UNLOCK(dbenv, &mfp->mutex); - - /* Delete from the list of MPOOLFILEs. */ - R_LOCK(dbenv, dbmp->reginfo); - SH_TAILQ_REMOVE(&mp->mpfq, mfp, q, __mpoolfile); - - /* Copy the statistics into the region. */ - sp = &mp->stat; - sp->st_cache_hit += mfp->stat.st_cache_hit; - sp->st_cache_miss += mfp->stat.st_cache_miss; - sp->st_map += mfp->stat.st_map; - sp->st_page_create += mfp->stat.st_page_create; - sp->st_page_in += mfp->stat.st_page_in; - sp->st_page_out += mfp->stat.st_page_out; - - /* Clear the mutex this MPOOLFILE recorded. */ - __db_shlocks_clear(&mfp->mutex, dbmp->reginfo, - (REGMAINT *)R_ADDR(dbmp->reginfo, mp->maint_off)); - - /* Free the space. */ - if (mfp->path_off != 0) - __db_shalloc_free(dbmp->reginfo[0].addr, - R_ADDR(dbmp->reginfo, mfp->path_off)); - if (mfp->fileid_off != 0) - __db_shalloc_free(dbmp->reginfo[0].addr, - R_ADDR(dbmp->reginfo, mfp->fileid_off)); - if (mfp->pgcookie_off != 0) - __db_shalloc_free(dbmp->reginfo[0].addr, - R_ADDR(dbmp->reginfo, mfp->pgcookie_off)); - __db_shalloc_free(dbmp->reginfo[0].addr, mfp); - - R_UNLOCK(dbenv, dbmp->reginfo); - - return (ret); -} - -/* - * __memp_fn -- - * On errors we print whatever is available as the file name. - * - * PUBLIC: char * __memp_fn __P((DB_MPOOLFILE *)); - */ -char * -__memp_fn(dbmfp) - DB_MPOOLFILE *dbmfp; -{ - return (__memp_fns(dbmfp->dbmp, dbmfp->mfp)); -} - -/* - * __memp_fns -- - * On errors we print whatever is available as the file name. - * - * PUBLIC: char * __memp_fns __P((DB_MPOOL *, MPOOLFILE *)); - * - */ -char * -__memp_fns(dbmp, mfp) - DB_MPOOL *dbmp; - MPOOLFILE *mfp; -{ - if (mfp->path_off == 0) - return ((char *)"temporary"); - - return ((char *)R_ADDR(dbmp->reginfo, mfp->path_off)); -} diff --git a/bdb/mp/mp_fput.c b/bdb/mp/mp_fput.c deleted file mode 100644 index 271e44a4ef8..00000000000 --- a/bdb/mp/mp_fput.c +++ /dev/null @@ -1,202 +0,0 @@ -/*- - * See the file LICENSE for redistribution information. - * - * Copyright (c) 1996-2002 - * Sleepycat Software. All rights reserved. - */ -#include "db_config.h" - -#ifndef lint -static const char revid[] = "$Id: mp_fput.c,v 11.36 2002/08/09 19:04:11 bostic Exp $"; -#endif /* not lint */ - -#ifndef NO_SYSTEM_INCLUDES -#include <sys/types.h> - -#endif - -#include "db_int.h" -#include "dbinc/db_shash.h" -#include "dbinc/mp.h" - -/* - * __memp_fput -- - * Mpool file put function. - * - * PUBLIC: int __memp_fput __P((DB_MPOOLFILE *, void *, u_int32_t)); - */ -int -__memp_fput(dbmfp, pgaddr, flags) - DB_MPOOLFILE *dbmfp; - void *pgaddr; - u_int32_t flags; -{ - BH *argbhp, *bhp, *prev; - DB_ENV *dbenv; - DB_MPOOL *dbmp; - DB_MPOOL_HASH *hp; - MPOOL *c_mp; - u_int32_t n_cache; - int adjust, ret; - - dbmp = dbmfp->dbmp; - dbenv = dbmp->dbenv; - - PANIC_CHECK(dbenv); - - /* Validate arguments. */ - if (flags) { - if ((ret = __db_fchk(dbenv, "memp_fput", flags, - DB_MPOOL_CLEAN | DB_MPOOL_DIRTY | DB_MPOOL_DISCARD)) != 0) - return (ret); - if ((ret = __db_fcchk(dbenv, "memp_fput", - flags, DB_MPOOL_CLEAN, DB_MPOOL_DIRTY)) != 0) - return (ret); - - if (LF_ISSET(DB_MPOOL_DIRTY) && F_ISSET(dbmfp, MP_READONLY)) { - __db_err(dbenv, - "%s: dirty flag set for readonly file page", - __memp_fn(dbmfp)); - return (EACCES); - } - } - - /* - * If we're mapping the file, there's nothing to do. Because we can - * stop mapping the file at any time, we have to check on each buffer - * to see if the address we gave the application was part of the map - * region. - */ - if (dbmfp->addr != NULL && pgaddr >= dbmfp->addr && - (u_int8_t *)pgaddr <= (u_int8_t *)dbmfp->addr + dbmfp->len) - return (0); - -#ifdef DIAGNOSTIC - /* - * Decrement the per-file pinned buffer count (mapped pages aren't - * counted). - */ - R_LOCK(dbenv, dbmp->reginfo); - if (dbmfp->pinref == 0) { - ret = EINVAL; - __db_err(dbenv, - "%s: more pages returned than retrieved", __memp_fn(dbmfp)); - } else { - ret = 0; - --dbmfp->pinref; - } - R_UNLOCK(dbenv, dbmp->reginfo); - if (ret != 0) - return (ret); -#endif - - /* Convert a page address to a buffer header and hash bucket. */ - bhp = (BH *)((u_int8_t *)pgaddr - SSZA(BH, buf)); - n_cache = NCACHE(dbmp->reginfo[0].primary, bhp->mf_offset, bhp->pgno); - c_mp = dbmp->reginfo[n_cache].primary; - hp = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab); - hp = &hp[NBUCKET(c_mp, bhp->mf_offset, bhp->pgno)]; - - MUTEX_LOCK(dbenv, &hp->hash_mutex); - - /* Set/clear the page bits. */ - if (LF_ISSET(DB_MPOOL_CLEAN) && - F_ISSET(bhp, BH_DIRTY) && !F_ISSET(bhp, BH_DIRTY_CREATE)) { - DB_ASSERT(hp->hash_page_dirty != 0); - --hp->hash_page_dirty; - F_CLR(bhp, BH_DIRTY); - } - if (LF_ISSET(DB_MPOOL_DIRTY) && !F_ISSET(bhp, BH_DIRTY)) { - ++hp->hash_page_dirty; - F_SET(bhp, BH_DIRTY); - } - if (LF_ISSET(DB_MPOOL_DISCARD)) - F_SET(bhp, BH_DISCARD); - - /* - * Check for a reference count going to zero. This can happen if the - * application returns a page twice. - */ - if (bhp->ref == 0) { - __db_err(dbenv, "%s: page %lu: unpinned page returned", - __memp_fn(dbmfp), (u_long)bhp->pgno); - MUTEX_UNLOCK(dbenv, &hp->hash_mutex); - return (EINVAL); - } - - /* - * If more than one reference to the page or a reference other than a - * thread waiting to flush the buffer to disk, we're done. Ignore the - * discard flags (for now) and leave the buffer's priority alone. - */ - if (--bhp->ref > 1 || (bhp->ref == 1 && !F_ISSET(bhp, BH_LOCKED))) { - MUTEX_UNLOCK(dbenv, &hp->hash_mutex); - return (0); - } - - /* Update priority values. */ - if (F_ISSET(bhp, BH_DISCARD) || - dbmfp->mfp->priority == MPOOL_PRI_VERY_LOW) - bhp->priority = 0; - else { - /* - * We don't lock the LRU counter or the stat.st_pages field, if - * we get garbage (which won't happen on a 32-bit machine), it - * only means a buffer has the wrong priority. - */ - bhp->priority = c_mp->lru_count; - - adjust = 0; - if (dbmfp->mfp->priority != 0) - adjust = - (int)c_mp->stat.st_pages / dbmfp->mfp->priority; - if (F_ISSET(bhp, BH_DIRTY)) - adjust += c_mp->stat.st_pages / MPOOL_PRI_DIRTY; - - if (adjust > 0) { - if (UINT32_T_MAX - bhp->priority <= (u_int32_t)adjust) - bhp->priority += adjust; - } else if (adjust < 0) - if (bhp->priority > (u_int32_t)-adjust) - bhp->priority += adjust; - } - - /* - * Buffers on hash buckets are sorted by priority -- move the buffer - * to the correct position in the list. - */ - argbhp = bhp; - SH_TAILQ_REMOVE(&hp->hash_bucket, argbhp, hq, __bh); - - prev = NULL; - for (bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh); - bhp != NULL; prev = bhp, bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) - if (bhp->priority > argbhp->priority) - break; - if (prev == NULL) - SH_TAILQ_INSERT_HEAD(&hp->hash_bucket, argbhp, hq, __bh); - else - SH_TAILQ_INSERT_AFTER(&hp->hash_bucket, prev, argbhp, hq, __bh); - - /* Reset the hash bucket's priority. */ - hp->hash_priority = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)->priority; - -#ifdef DIAGNOSTIC - __memp_check_order(hp); -#endif - - /* - * The sync code has a separate counter for buffers on which it waits. - * It reads that value without holding a lock so we update it as the - * last thing we do. Once that value goes to 0, we won't see another - * reference to that buffer being returned to the cache until the sync - * code has finished, so we're safe as long as we don't let the value - * go to 0 before we finish with the buffer. - */ - if (F_ISSET(argbhp, BH_LOCKED) && argbhp->ref_sync != 0) - --argbhp->ref_sync; - - MUTEX_UNLOCK(dbenv, &hp->hash_mutex); - - return (0); -} diff --git a/bdb/mp/mp_fset.c b/bdb/mp/mp_fset.c deleted file mode 100644 index 65cd6286ac9..00000000000 --- a/bdb/mp/mp_fset.c +++ /dev/null @@ -1,89 +0,0 @@ -/*- - * See the file LICENSE for redistribution information. - * - * Copyright (c) 1996-2002 - * Sleepycat Software. All rights reserved. - */ -#include "db_config.h" - -#ifndef lint -static const char revid[] = "$Id: mp_fset.c,v 11.25 2002/05/03 15:21:17 bostic Exp $"; -#endif /* not lint */ - -#ifndef NO_SYSTEM_INCLUDES -#include <sys/types.h> - -#endif - -#include "db_int.h" -#include "dbinc/db_shash.h" -#include "dbinc/mp.h" - -/* - * __memp_fset -- - * Mpool page set-flag routine. - * - * PUBLIC: int __memp_fset __P((DB_MPOOLFILE *, void *, u_int32_t)); - */ -int -__memp_fset(dbmfp, pgaddr, flags) - DB_MPOOLFILE *dbmfp; - void *pgaddr; - u_int32_t flags; -{ - BH *bhp; - DB_ENV *dbenv; - DB_MPOOL *dbmp; - DB_MPOOL_HASH *hp; - MPOOL *c_mp; - u_int32_t n_cache; - int ret; - - dbmp = dbmfp->dbmp; - dbenv = dbmp->dbenv; - - PANIC_CHECK(dbenv); - - /* Validate arguments. */ - if (flags == 0) - return (__db_ferr(dbenv, "memp_fset", 1)); - - if ((ret = __db_fchk(dbenv, "memp_fset", flags, - DB_MPOOL_CLEAN | DB_MPOOL_DIRTY | DB_MPOOL_DISCARD)) != 0) - return (ret); - if ((ret = __db_fcchk(dbenv, "memp_fset", - flags, DB_MPOOL_CLEAN, DB_MPOOL_DIRTY)) != 0) - return (ret); - - if (LF_ISSET(DB_MPOOL_DIRTY) && F_ISSET(dbmfp, MP_READONLY)) { - __db_err(dbenv, "%s: dirty flag set for readonly file page", - __memp_fn(dbmfp)); - return (EACCES); - } - - /* Convert the page address to a buffer header and hash bucket. */ - bhp = (BH *)((u_int8_t *)pgaddr - SSZA(BH, buf)); - n_cache = NCACHE(dbmp->reginfo[0].primary, bhp->mf_offset, bhp->pgno); - c_mp = dbmp->reginfo[n_cache].primary; - hp = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab); - hp = &hp[NBUCKET(c_mp, bhp->mf_offset, bhp->pgno)]; - - MUTEX_LOCK(dbenv, &hp->hash_mutex); - - /* Set/clear the page bits. */ - if (LF_ISSET(DB_MPOOL_CLEAN) && - F_ISSET(bhp, BH_DIRTY) && !F_ISSET(bhp, BH_DIRTY_CREATE)) { - DB_ASSERT(hp->hash_page_dirty != 0); - --hp->hash_page_dirty; - F_CLR(bhp, BH_DIRTY); - } - if (LF_ISSET(DB_MPOOL_DIRTY) && !F_ISSET(bhp, BH_DIRTY)) { - ++hp->hash_page_dirty; - F_SET(bhp, BH_DIRTY); - } - if (LF_ISSET(DB_MPOOL_DISCARD)) - F_SET(bhp, BH_DISCARD); - - MUTEX_UNLOCK(dbenv, &hp->hash_mutex); - return (0); -} diff --git a/bdb/mp/mp_method.c b/bdb/mp/mp_method.c deleted file mode 100644 index 38f0a645f16..00000000000 --- a/bdb/mp/mp_method.c +++ /dev/null @@ -1,156 +0,0 @@ -/*- - * See the file LICENSE for redistribution information. - * - * Copyright (c) 1996-2002 - * Sleepycat Software. All rights reserved. - */ -#include "db_config.h" - -#ifndef lint -static const char revid[] = "$Id: mp_method.c,v 11.29 2002/03/27 04:32:27 bostic Exp $"; -#endif /* not lint */ - -#ifndef NO_SYSTEM_INCLUDES -#include <sys/types.h> - -#ifdef HAVE_RPC -#include <rpc/rpc.h> -#endif -#endif - -#include "db_int.h" -#include "dbinc/db_shash.h" -#include "dbinc/mp.h" - -#ifdef HAVE_RPC -#include "dbinc_auto/db_server.h" -#include "dbinc_auto/rpc_client_ext.h" -#endif - -static int __memp_set_cachesize __P((DB_ENV *, u_int32_t, u_int32_t, int)); -static int __memp_set_mp_mmapsize __P((DB_ENV *, size_t)); - -/* - * __memp_dbenv_create -- - * Mpool specific creation of the DB_ENV structure. - * - * PUBLIC: void __memp_dbenv_create __P((DB_ENV *)); - */ -void -__memp_dbenv_create(dbenv) - DB_ENV *dbenv; -{ - /* - * !!! - * Our caller has not yet had the opportunity to reset the panic - * state or turn off mutex locking, and so we can neither check - * the panic state or acquire a mutex in the DB_ENV create path. - * - * We default to 32 8K pages. We don't default to a flat 256K, because - * some systems require significantly more memory to hold 32 pages than - * others. For example, HP-UX with POSIX pthreads needs 88 bytes for - * a POSIX pthread mutex and almost 200 bytes per buffer header, while - * Solaris needs 24 and 52 bytes for the same structures. The minimum - * number of hash buckets is 37. These contain a mutex also. - */ - dbenv->mp_bytes = - 32 * ((8 * 1024) + sizeof(BH)) + 37 * sizeof(DB_MPOOL_HASH); - dbenv->mp_ncache = 1; - -#ifdef HAVE_RPC - if (F_ISSET(dbenv, DB_ENV_RPCCLIENT)) { - dbenv->set_cachesize = __dbcl_env_cachesize; - dbenv->set_mp_mmapsize = __dbcl_set_mp_mmapsize; - dbenv->memp_dump_region = NULL; - dbenv->memp_fcreate = __dbcl_memp_fcreate; - dbenv->memp_nameop = NULL; - dbenv->memp_register = __dbcl_memp_register; - dbenv->memp_stat = __dbcl_memp_stat; - dbenv->memp_sync = __dbcl_memp_sync; - dbenv->memp_trickle = __dbcl_memp_trickle; - } else -#endif - { - dbenv->set_cachesize = __memp_set_cachesize; - dbenv->set_mp_mmapsize = __memp_set_mp_mmapsize; - dbenv->memp_dump_region = __memp_dump_region; - dbenv->memp_fcreate = __memp_fcreate; - dbenv->memp_nameop = __memp_nameop; - dbenv->memp_register = __memp_register; - dbenv->memp_stat = __memp_stat; - dbenv->memp_sync = __memp_sync; - dbenv->memp_trickle = __memp_trickle; - } -} - -/* - * __memp_set_cachesize -- - * Initialize the cache size. - */ -static int -__memp_set_cachesize(dbenv, gbytes, bytes, ncache) - DB_ENV *dbenv; - u_int32_t gbytes, bytes; - int ncache; -{ - ENV_ILLEGAL_AFTER_OPEN(dbenv, "set_cachesize"); - - /* Normalize the values. */ - if (ncache == 0) - ncache = 1; - - /* - * You can only store 4GB-1 in an unsigned 32-bit value, so correct for - * applications that specify 4GB cache sizes -- we know what they meant. - */ - if (gbytes / ncache == 4 && bytes == 0) { - --gbytes; - bytes = GIGABYTE - 1; - } else { - gbytes += bytes / GIGABYTE; - bytes %= GIGABYTE; - } - - /* Avoid too-large cache sizes, they result in a region size of zero. */ - if (gbytes / ncache > 4 || (gbytes / ncache == 4 && bytes != 0)) { - __db_err(dbenv, "individual cache size too large"); - return (EINVAL); - } - - /* - * If the application requested less than 500Mb, increase the cachesize - * by 25% and factor in the size of the hash buckets to account for our - * overhead. (I'm guessing caches over 500Mb are specifically sized, - * that is, it's a large server and the application actually knows how - * much memory is available. We only document the 25% overhead number, - * not the hash buckets, but I don't see a reason to confuse the issue, - * it shouldn't matter to an application.) - * - * There is a minimum cache size, regardless. - */ - if (gbytes == 0) { - if (bytes < 500 * MEGABYTE) - bytes += (bytes / 4) + 37 * sizeof(DB_MPOOL_HASH); - if (bytes / ncache < DB_CACHESIZE_MIN) - bytes = ncache * DB_CACHESIZE_MIN; - } - - dbenv->mp_gbytes = gbytes; - dbenv->mp_bytes = bytes; - dbenv->mp_ncache = ncache; - - return (0); -} - -/* - * __memp_set_mp_mmapsize -- - * Set the maximum mapped file size. - */ -static int -__memp_set_mp_mmapsize(dbenv, mp_mmapsize ) - DB_ENV *dbenv; - size_t mp_mmapsize; -{ - dbenv->mp_mmapsize = mp_mmapsize; - return (0); -} diff --git a/bdb/mp/mp_region.c b/bdb/mp/mp_region.c deleted file mode 100644 index 06eca2f8646..00000000000 --- a/bdb/mp/mp_region.c +++ /dev/null @@ -1,466 +0,0 @@ -/*- - * See the file LICENSE for redistribution information. - * - * Copyright (c) 1996-2002 - * Sleepycat Software. All rights reserved. - */ -#include "db_config.h" - -#ifndef lint -static const char revid[] = "$Id: mp_region.c,v 11.49 2002/05/07 18:42:20 bostic Exp $"; -#endif /* not lint */ - -#ifndef NO_SYSTEM_INCLUDES -#include <sys/types.h> - -#include <string.h> -#endif - -#include "db_int.h" -#include "dbinc/db_shash.h" -#include "dbinc/mp.h" - -static int __mpool_init __P((DB_ENV *, DB_MPOOL *, int, int)); -#ifdef HAVE_MUTEX_SYSTEM_RESOURCES -static size_t __mpool_region_maint __P((REGINFO *)); -#endif - -/* - * __memp_open -- - * Internal version of memp_open: only called from DB_ENV->open. - * - * PUBLIC: int __memp_open __P((DB_ENV *)); - */ -int -__memp_open(dbenv) - DB_ENV *dbenv; -{ - DB_MPOOL *dbmp; - MPOOL *mp; - REGINFO reginfo; - roff_t reg_size, *regids; - u_int32_t i; - int htab_buckets, ret; - - /* Figure out how big each cache region is. */ - reg_size = (dbenv->mp_gbytes / dbenv->mp_ncache) * GIGABYTE; - reg_size += ((dbenv->mp_gbytes % - dbenv->mp_ncache) * GIGABYTE) / dbenv->mp_ncache; - reg_size += dbenv->mp_bytes / dbenv->mp_ncache; - - /* - * Figure out how many hash buckets each region will have. Assume we - * want to keep the hash chains with under 10 pages on each chain. We - * don't know the pagesize in advance, and it may differ for different - * files. Use a pagesize of 1K for the calculation -- we walk these - * chains a lot, they must be kept short. - */ - htab_buckets = __db_tablesize((reg_size / (1 * 1024)) / 10); - - /* Create and initialize the DB_MPOOL structure. */ - if ((ret = __os_calloc(dbenv, 1, sizeof(*dbmp), &dbmp)) != 0) - return (ret); - LIST_INIT(&dbmp->dbregq); - TAILQ_INIT(&dbmp->dbmfq); - dbmp->dbenv = dbenv; - - /* Join/create the first mpool region. */ - memset(®info, 0, sizeof(REGINFO)); - reginfo.type = REGION_TYPE_MPOOL; - reginfo.id = INVALID_REGION_ID; - reginfo.mode = dbenv->db_mode; - reginfo.flags = REGION_JOIN_OK; - if (F_ISSET(dbenv, DB_ENV_CREATE)) - F_SET(®info, REGION_CREATE_OK); - if ((ret = __db_r_attach(dbenv, ®info, reg_size)) != 0) - goto err; - - /* - * If we created the region, initialize it. Create or join any - * additional regions. - */ - if (F_ISSET(®info, REGION_CREATE)) { - /* - * We define how many regions there are going to be, allocate - * the REGINFO structures and create them. Make sure we don't - * clear the wrong entries on error. - */ - dbmp->nreg = dbenv->mp_ncache; - if ((ret = __os_calloc(dbenv, - dbmp->nreg, sizeof(REGINFO), &dbmp->reginfo)) != 0) - goto err; - /* Make sure we don't clear the wrong entries on error. */ - for (i = 0; i < dbmp->nreg; ++i) - dbmp->reginfo[i].id = INVALID_REGION_ID; - dbmp->reginfo[0] = reginfo; - - /* Initialize the first region. */ - if ((ret = __mpool_init(dbenv, dbmp, 0, htab_buckets)) != 0) - goto err; - - /* - * Create/initialize remaining regions and copy their IDs into - * the first region. - */ - mp = R_ADDR(dbmp->reginfo, dbmp->reginfo[0].rp->primary); - regids = R_ADDR(dbmp->reginfo, mp->regids); - for (i = 1; i < dbmp->nreg; ++i) { - dbmp->reginfo[i].type = REGION_TYPE_MPOOL; - dbmp->reginfo[i].id = INVALID_REGION_ID; - dbmp->reginfo[i].mode = dbenv->db_mode; - dbmp->reginfo[i].flags = REGION_CREATE_OK; - if ((ret = __db_r_attach( - dbenv, &dbmp->reginfo[i], reg_size)) != 0) - goto err; - if ((ret = - __mpool_init(dbenv, dbmp, i, htab_buckets)) != 0) - goto err; - R_UNLOCK(dbenv, &dbmp->reginfo[i]); - - regids[i] = dbmp->reginfo[i].id; - } - - R_UNLOCK(dbenv, dbmp->reginfo); - } else { - /* - * Determine how many regions there are going to be, allocate - * the REGINFO structures and fill in local copies of that - * information. - */ - mp = R_ADDR(®info, reginfo.rp->primary); - dbmp->nreg = mp->nreg; - if ((ret = __os_calloc(dbenv, - dbmp->nreg, sizeof(REGINFO), &dbmp->reginfo)) != 0) - goto err; - /* Make sure we don't clear the wrong entries on error. */ - for (i = 0; i < dbmp->nreg; ++i) - dbmp->reginfo[i].id = INVALID_REGION_ID; - dbmp->reginfo[0] = reginfo; - - /* - * We have to unlock the primary mpool region before we attempt - * to join the additional mpool regions. If we don't, we can - * deadlock. The scenario is that we hold the primary mpool - * region lock. We then try to attach to an additional mpool - * region, which requires the acquisition/release of the main - * region lock (to search the list of regions). If another - * thread of control already holds the main region lock and is - * waiting on our primary mpool region lock, we'll deadlock. - * See [#4696] for more information. - */ - R_UNLOCK(dbenv, dbmp->reginfo); - - /* Join remaining regions. */ - regids = R_ADDR(dbmp->reginfo, mp->regids); - for (i = 1; i < dbmp->nreg; ++i) { - dbmp->reginfo[i].type = REGION_TYPE_MPOOL; - dbmp->reginfo[i].id = regids[i]; - dbmp->reginfo[i].mode = 0; - dbmp->reginfo[i].flags = REGION_JOIN_OK; - if ((ret = __db_r_attach( - dbenv, &dbmp->reginfo[i], 0)) != 0) - goto err; - R_UNLOCK(dbenv, &dbmp->reginfo[i]); - } - } - - /* Set the local addresses for the regions. */ - for (i = 0; i < dbmp->nreg; ++i) - dbmp->reginfo[i].primary = - R_ADDR(&dbmp->reginfo[i], dbmp->reginfo[i].rp->primary); - - /* If the region is threaded, allocate a mutex to lock the handles. */ - if (F_ISSET(dbenv, DB_ENV_THREAD) && - (ret = __db_mutex_setup(dbenv, dbmp->reginfo, &dbmp->mutexp, - MUTEX_ALLOC | MUTEX_THREAD)) != 0) - goto err; - - dbenv->mp_handle = dbmp; - return (0); - -err: if (dbmp->reginfo != NULL && dbmp->reginfo[0].addr != NULL) { - if (F_ISSET(dbmp->reginfo, REGION_CREATE)) - ret = __db_panic(dbenv, ret); - - R_UNLOCK(dbenv, dbmp->reginfo); - - for (i = 0; i < dbmp->nreg; ++i) - if (dbmp->reginfo[i].id != INVALID_REGION_ID) - (void)__db_r_detach( - dbenv, &dbmp->reginfo[i], 0); - __os_free(dbenv, dbmp->reginfo); - } - if (dbmp->mutexp != NULL) - __db_mutex_free(dbenv, dbmp->reginfo, dbmp->mutexp); - __os_free(dbenv, dbmp); - return (ret); -} - -/* - * __mpool_init -- - * Initialize a MPOOL structure in shared memory. - */ -static int -__mpool_init(dbenv, dbmp, reginfo_off, htab_buckets) - DB_ENV *dbenv; - DB_MPOOL *dbmp; - int reginfo_off, htab_buckets; -{ - DB_MPOOL_HASH *htab; - MPOOL *mp; - REGINFO *reginfo; -#ifdef HAVE_MUTEX_SYSTEM_RESOURCES - size_t maint_size; -#endif - int i, ret; - void *p; - - mp = NULL; - - reginfo = &dbmp->reginfo[reginfo_off]; - if ((ret = __db_shalloc(reginfo->addr, - sizeof(MPOOL), MUTEX_ALIGN, ®info->primary)) != 0) - goto mem_err; - reginfo->rp->primary = R_OFFSET(reginfo, reginfo->primary); - mp = reginfo->primary; - memset(mp, 0, sizeof(*mp)); - -#ifdef HAVE_MUTEX_SYSTEM_RESOURCES - maint_size = __mpool_region_maint(reginfo); - /* Allocate room for the maintenance info and initialize it. */ - if ((ret = __db_shalloc(reginfo->addr, - sizeof(REGMAINT) + maint_size, 0, &p)) != 0) - goto mem_err; - __db_maintinit(reginfo, p, maint_size); - mp->maint_off = R_OFFSET(reginfo, p); -#endif - - if (reginfo_off == 0) { - SH_TAILQ_INIT(&mp->mpfq); - - ZERO_LSN(mp->lsn); - - mp->nreg = dbmp->nreg; - if ((ret = __db_shalloc(dbmp->reginfo[0].addr, - dbmp->nreg * sizeof(int), 0, &p)) != 0) - goto mem_err; - mp->regids = R_OFFSET(dbmp->reginfo, p); - } - - /* Allocate hash table space and initialize it. */ - if ((ret = __db_shalloc(reginfo->addr, - htab_buckets * sizeof(DB_MPOOL_HASH), 0, &htab)) != 0) - goto mem_err; - mp->htab = R_OFFSET(reginfo, htab); - for (i = 0; i < htab_buckets; i++) { - if ((ret = __db_mutex_setup(dbenv, - reginfo, &htab[i].hash_mutex, - MUTEX_NO_RLOCK)) != 0) - return (ret); - SH_TAILQ_INIT(&htab[i].hash_bucket); - htab[i].hash_page_dirty = htab[i].hash_priority = 0; - } - mp->htab_buckets = mp->stat.st_hash_buckets = htab_buckets; - - /* - * Only the environment creator knows the total cache size, fill in - * those statistics now. - */ - mp->stat.st_gbytes = dbenv->mp_gbytes; - mp->stat.st_bytes = dbenv->mp_bytes; - return (0); - -mem_err:__db_err(dbenv, "Unable to allocate memory for mpool region"); - return (ret); -} - -/* - * __memp_dbenv_refresh -- - * Clean up after the mpool system on a close or failed open. - * - * PUBLIC: int __memp_dbenv_refresh __P((DB_ENV *)); - */ -int -__memp_dbenv_refresh(dbenv) - DB_ENV *dbenv; -{ - DB_MPOOL *dbmp; - DB_MPOOLFILE *dbmfp; - DB_MPREG *mpreg; - u_int32_t i; - int ret, t_ret; - - ret = 0; - dbmp = dbenv->mp_handle; - - /* Discard DB_MPREGs. */ - while ((mpreg = LIST_FIRST(&dbmp->dbregq)) != NULL) { - LIST_REMOVE(mpreg, q); - __os_free(dbenv, mpreg); - } - - /* Discard DB_MPOOLFILEs. */ - while ((dbmfp = TAILQ_FIRST(&dbmp->dbmfq)) != NULL) - if ((t_ret = __memp_fclose_int(dbmfp, 0)) != 0 && ret == 0) - ret = t_ret; - - /* Discard the thread mutex. */ - if (dbmp->mutexp != NULL) - __db_mutex_free(dbenv, dbmp->reginfo, dbmp->mutexp); - - /* Detach from the region(s). */ - for (i = 0; i < dbmp->nreg; ++i) - if ((t_ret = __db_r_detach( - dbenv, &dbmp->reginfo[i], 0)) != 0 && ret == 0) - ret = t_ret; - - __os_free(dbenv, dbmp->reginfo); - __os_free(dbenv, dbmp); - - dbenv->mp_handle = NULL; - return (ret); -} - -#ifdef HAVE_MUTEX_SYSTEM_RESOURCES -/* - * __mpool_region_maint -- - * Return the amount of space needed for region maintenance info. - * - */ -static size_t -__mpool_region_maint(infop) - REGINFO *infop; -{ - size_t s; - int numlocks; - - /* - * For mutex maintenance we need one mutex per possible page. - * Compute the maximum number of pages this cache can have. - * Also add in an mpool mutex and mutexes for all dbenv and db - * handles. - */ - numlocks = ((infop->rp->size / DB_MIN_PGSIZE) + 1); - numlocks += DB_MAX_HANDLES; - s = sizeof(roff_t) * numlocks; - return (s); -} -#endif - -/* - * __mpool_region_destroy - * Destroy any region maintenance info. - * - * PUBLIC: void __mpool_region_destroy __P((DB_ENV *, REGINFO *)); - */ -void -__mpool_region_destroy(dbenv, infop) - DB_ENV *dbenv; - REGINFO *infop; -{ - __db_shlocks_destroy(infop, (REGMAINT *)R_ADDR(infop, - ((MPOOL *)R_ADDR(infop, infop->rp->primary))->maint_off)); - - COMPQUIET(dbenv, NULL); - COMPQUIET(infop, NULL); -} - -/* - * __memp_nameop - * Remove or rename a file in the pool. - * - * PUBLIC: int __memp_nameop __P((DB_ENV *, - * PUBLIC: u_int8_t *, const char *, const char *, const char *)); - * - * XXX - * Undocumented interface: DB private. - */ -int -__memp_nameop(dbenv, fileid, newname, fullold, fullnew) - DB_ENV *dbenv; - u_int8_t *fileid; - const char *newname, *fullold, *fullnew; -{ - DB_MPOOL *dbmp; - MPOOL *mp; - MPOOLFILE *mfp; - roff_t newname_off; - int locked, ret; - void *p; - - locked = 0; - dbmp = NULL; - - if (!MPOOL_ON(dbenv)) - goto fsop; - - dbmp = dbenv->mp_handle; - mp = dbmp->reginfo[0].primary; - - /* - * Remove or rename a file that the mpool might know about. We assume - * that the fop layer has the file locked for exclusive access, so we - * don't worry about locking except for the mpool mutexes. Checkpoint - * can happen at any time, independent of file locking, so we have to - * do the actual unlink or rename system call to avoid any race. - * - * If this is a rename, allocate first, because we can't recursively - * grab the region lock. - */ - if (newname == NULL) - p = NULL; - else { - if ((ret = __memp_alloc(dbmp, dbmp->reginfo, - NULL, strlen(newname) + 1, &newname_off, &p)) != 0) - return (ret); - memcpy(p, newname, strlen(newname) + 1); - } - - locked = 1; - R_LOCK(dbenv, dbmp->reginfo); - - /* - * Find the file -- if mpool doesn't know about this file, that's not - * an error-- we may not have it open. - */ - for (mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile); - mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) { - /* Ignore non-active files. */ - if (F_ISSET(mfp, MP_DEADFILE | MP_TEMP)) - continue; - - /* Ignore non-matching files. */ - if (memcmp(fileid, R_ADDR( - dbmp->reginfo, mfp->fileid_off), DB_FILE_ID_LEN) != 0) - continue; - - /* If newname is NULL, we're removing the file. */ - if (newname == NULL) { - MUTEX_LOCK(dbenv, &mfp->mutex); - MPOOLFILE_IGNORE(mfp); - MUTEX_UNLOCK(dbenv, &mfp->mutex); - } else { - /* - * Else, it's a rename. We've allocated memory - * for the new name. Swap it with the old one. - */ - p = R_ADDR(dbmp->reginfo, mfp->path_off); - mfp->path_off = newname_off; - } - break; - } - - /* Delete the memory we no longer need. */ - if (p != NULL) - __db_shalloc_free(dbmp->reginfo[0].addr, p); - -fsop: if (newname == NULL) - (void)__os_unlink(dbenv, fullold); - else - (void)__os_rename(dbenv, fullold, fullnew, 1); - - if (locked) - R_UNLOCK(dbenv, dbmp->reginfo); - - return (0); -} diff --git a/bdb/mp/mp_register.c b/bdb/mp/mp_register.c deleted file mode 100644 index 46eefad986f..00000000000 --- a/bdb/mp/mp_register.c +++ /dev/null @@ -1,76 +0,0 @@ -/*- - * See the file LICENSE for redistribution information. - * - * Copyright (c) 1996-2002 - * Sleepycat Software. All rights reserved. - */ -#include "db_config.h" - -#ifndef lint -static const char revid[] = "$Id: mp_register.c,v 11.21 2002/03/27 04:32:27 bostic Exp $"; -#endif /* not lint */ - -#ifndef NO_SYSTEM_INCLUDES -#include <sys/types.h> -#endif - -#include "db_int.h" -#include "dbinc/db_shash.h" -#include "dbinc/mp.h" - -/* - * memp_register -- - * Register a file type's pgin, pgout routines. - * - * PUBLIC: int __memp_register __P((DB_ENV *, int, - * PUBLIC: int (*)(DB_ENV *, db_pgno_t, void *, DBT *), - * PUBLIC: int (*)(DB_ENV *, db_pgno_t, void *, DBT *))); - */ -int -__memp_register(dbenv, ftype, pgin, pgout) - DB_ENV *dbenv; - int ftype; - int (*pgin) __P((DB_ENV *, db_pgno_t, void *, DBT *)); - int (*pgout) __P((DB_ENV *, db_pgno_t, void *, DBT *)); -{ - DB_MPOOL *dbmp; - DB_MPREG *mpreg; - int ret; - - PANIC_CHECK(dbenv); - ENV_REQUIRES_CONFIG(dbenv, - dbenv->mp_handle, "DB_ENV->memp_register", DB_INIT_MPOOL); - - dbmp = dbenv->mp_handle; - - /* - * Chances are good that the item has already been registered, as the - * DB access methods are the folks that call this routine. If already - * registered, just update the entry, although it's probably unchanged. - */ - MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp); - for (mpreg = LIST_FIRST(&dbmp->dbregq); - mpreg != NULL; mpreg = LIST_NEXT(mpreg, q)) - if (mpreg->ftype == ftype) { - mpreg->pgin = pgin; - mpreg->pgout = pgout; - break; - } - MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp); - if (mpreg != NULL) - return (0); - - /* New entry. */ - if ((ret = __os_malloc(dbenv, sizeof(DB_MPREG), &mpreg)) != 0) - return (ret); - - mpreg->ftype = ftype; - mpreg->pgin = pgin; - mpreg->pgout = pgout; - - MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp); - LIST_INSERT_HEAD(&dbmp->dbregq, mpreg, q); - MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp); - - return (0); -} diff --git a/bdb/mp/mp_stat.c b/bdb/mp/mp_stat.c deleted file mode 100644 index 12e72b91d70..00000000000 --- a/bdb/mp/mp_stat.c +++ /dev/null @@ -1,491 +0,0 @@ -/*- - * See the file LICENSE for redistribution information. - * - * Copyright (c) 1996-2002 - * Sleepycat Software. All rights reserved. - */ -#include "db_config.h" - -#ifndef lint -static const char revid[] = "$Id: mp_stat.c,v 11.51 2002/08/06 06:13:47 bostic Exp $"; -#endif /* not lint */ - -#ifndef NO_SYSTEM_INCLUDES -#include <sys/types.h> - -#include <stdio.h> -#include <string.h> -#include <unistd.h> -#endif - -#include "db_int.h" -#include "dbinc/db_page.h" -#include "dbinc/db_shash.h" -#include "dbinc/db_am.h" -#include "dbinc/mp.h" - -static void __memp_dumpcache __P((DB_ENV *, - DB_MPOOL *, REGINFO *, size_t *, FILE *, u_int32_t)); -static void __memp_pbh __P((DB_MPOOL *, BH *, size_t *, FILE *)); -static void __memp_stat_wait __P((REGINFO *, MPOOL *, DB_MPOOL_STAT *, int)); - -/* - * __memp_stat -- - * Display MPOOL statistics. - * - * PUBLIC: int __memp_stat - * PUBLIC: __P((DB_ENV *, DB_MPOOL_STAT **, DB_MPOOL_FSTAT ***, u_int32_t)); - */ -int -__memp_stat(dbenv, gspp, fspp, flags) - DB_ENV *dbenv; - DB_MPOOL_STAT **gspp; - DB_MPOOL_FSTAT ***fspp; - u_int32_t flags; -{ - DB_MPOOL *dbmp; - DB_MPOOL_FSTAT **tfsp, *tstruct; - DB_MPOOL_STAT *sp; - MPOOL *c_mp, *mp; - MPOOLFILE *mfp; - size_t len, nlen, pagesize; - u_int32_t pages, i; - int ret; - char *name, *tname; - - PANIC_CHECK(dbenv); - ENV_REQUIRES_CONFIG(dbenv, - dbenv->mp_handle, "memp_stat", DB_INIT_MPOOL); - - if ((ret = __db_fchk(dbenv, - "DB_ENV->memp_stat", flags, DB_STAT_CLEAR)) != 0) - return (ret); - - dbmp = dbenv->mp_handle; - mp = dbmp->reginfo[0].primary; - - /* Global statistics. */ - if (gspp != NULL) { - *gspp = NULL; - - if ((ret = __os_umalloc(dbenv, sizeof(**gspp), gspp)) != 0) - return (ret); - memset(*gspp, 0, sizeof(**gspp)); - sp = *gspp; - - /* - * Initialization and information that is not maintained on - * a per-cache basis. - */ - c_mp = dbmp->reginfo[0].primary; - sp->st_gbytes = c_mp->stat.st_gbytes; - sp->st_bytes = c_mp->stat.st_bytes; - sp->st_ncache = dbmp->nreg; - sp->st_regsize = dbmp->reginfo[0].rp->size; - - /* Walk the cache list and accumulate the global information. */ - for (i = 0; i < mp->nreg; ++i) { - c_mp = dbmp->reginfo[i].primary; - - sp->st_map += c_mp->stat.st_map; - sp->st_cache_hit += c_mp->stat.st_cache_hit; - sp->st_cache_miss += c_mp->stat.st_cache_miss; - sp->st_page_create += c_mp->stat.st_page_create; - sp->st_page_in += c_mp->stat.st_page_in; - sp->st_page_out += c_mp->stat.st_page_out; - sp->st_ro_evict += c_mp->stat.st_ro_evict; - sp->st_rw_evict += c_mp->stat.st_rw_evict; - sp->st_page_trickle += c_mp->stat.st_page_trickle; - sp->st_pages += c_mp->stat.st_pages; - /* - * st_page_dirty calculated by __memp_stat_hash - * st_page_clean calculated here - */ - __memp_stat_hash( - &dbmp->reginfo[i], c_mp, &sp->st_page_dirty); - sp->st_page_clean = sp->st_pages - sp->st_page_dirty; - sp->st_hash_buckets += c_mp->stat.st_hash_buckets; - sp->st_hash_searches += c_mp->stat.st_hash_searches; - sp->st_hash_longest += c_mp->stat.st_hash_longest; - sp->st_hash_examined += c_mp->stat.st_hash_examined; - /* - * st_hash_nowait calculated by __memp_stat_wait - * st_hash_wait - */ - __memp_stat_wait(&dbmp->reginfo[i], c_mp, sp, flags); - sp->st_region_nowait += - dbmp->reginfo[i].rp->mutex.mutex_set_nowait; - sp->st_region_wait += - dbmp->reginfo[i].rp->mutex.mutex_set_wait; - sp->st_alloc += c_mp->stat.st_alloc; - sp->st_alloc_buckets += c_mp->stat.st_alloc_buckets; - if (sp->st_alloc_max_buckets < - c_mp->stat.st_alloc_max_buckets) - sp->st_alloc_max_buckets = - c_mp->stat.st_alloc_max_buckets; - sp->st_alloc_pages += c_mp->stat.st_alloc_pages; - if (sp->st_alloc_max_pages < - c_mp->stat.st_alloc_max_pages) - sp->st_alloc_max_pages = - c_mp->stat.st_alloc_max_pages; - - if (LF_ISSET(DB_STAT_CLEAR)) { - dbmp->reginfo[i].rp->mutex.mutex_set_wait = 0; - dbmp->reginfo[i].rp->mutex.mutex_set_nowait = 0; - pages = c_mp->stat.st_pages; - memset(&c_mp->stat, 0, sizeof(c_mp->stat)); - c_mp->stat.st_hash_buckets = c_mp->htab_buckets; - c_mp->stat.st_pages = pages; - } - } - - /* - * We have duplicate statistics fields in per-file structures - * and the cache. The counters are only incremented in the - * per-file structures, except if a file is flushed from the - * mpool, at which time we copy its information into the cache - * statistics. We added the cache information above, now we - * add the per-file information. - */ - R_LOCK(dbenv, dbmp->reginfo); - for (mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile); - mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) { - sp->st_map += mfp->stat.st_map; - sp->st_cache_hit += mfp->stat.st_cache_hit; - sp->st_cache_miss += mfp->stat.st_cache_miss; - sp->st_page_create += mfp->stat.st_page_create; - sp->st_page_in += mfp->stat.st_page_in; - sp->st_page_out += mfp->stat.st_page_out; - if (fspp == NULL && LF_ISSET(DB_STAT_CLEAR)) { - pagesize = mfp->stat.st_pagesize; - memset(&mfp->stat, 0, sizeof(mfp->stat)); - mfp->stat.st_pagesize = pagesize; - } - } - R_UNLOCK(dbenv, dbmp->reginfo); - } - - /* Per-file statistics. */ - if (fspp != NULL) { - *fspp = NULL; - - /* Count the MPOOLFILE structures. */ - R_LOCK(dbenv, dbmp->reginfo); - for (i = 0, len = 0, - mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile); - mfp != NULL; - ++i, mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) - len += sizeof(DB_MPOOL_FSTAT *) + - sizeof(DB_MPOOL_FSTAT) + - strlen(__memp_fns(dbmp, mfp)) + 1; - len += sizeof(DB_MPOOL_FSTAT *); /* Trailing NULL */ - R_UNLOCK(dbenv, dbmp->reginfo); - - if (i == 0) - return (0); - - /* Allocate space */ - if ((ret = __os_umalloc(dbenv, len, fspp)) != 0) - return (ret); - - /* - * Build each individual entry. We assume that an array of - * pointers are aligned correctly to be followed by an array - * of structures, which should be safe (in this particular - * case, the first element of the structure is a pointer, so - * we're doubly safe). The array is followed by space for - * the text file names. - * - * Add 1 to i because we need to skip over the NULL. - */ - tfsp = *fspp; - tstruct = (DB_MPOOL_FSTAT *)(tfsp + i + 1); - tname = (char *)(tstruct + i); - - /* - * Files may have been opened since we counted, don't walk - * off the end of the allocated space. - */ - R_LOCK(dbenv, dbmp->reginfo); - for (mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile); - mfp != NULL && i-- > 0; - ++tfsp, ++tstruct, tname += nlen, - mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) { - name = __memp_fns(dbmp, mfp); - nlen = strlen(name) + 1; - *tfsp = tstruct; - *tstruct = mfp->stat; - if (LF_ISSET(DB_STAT_CLEAR)) { - pagesize = mfp->stat.st_pagesize; - memset(&mfp->stat, 0, sizeof(mfp->stat)); - mfp->stat.st_pagesize = pagesize; - } - tstruct->file_name = tname; - memcpy(tname, name, nlen); - } - R_UNLOCK(dbenv, dbmp->reginfo); - - *tfsp = NULL; - } - return (0); -} - -#define FMAP_ENTRIES 200 /* Files we map. */ - -#define MPOOL_DUMP_HASH 0x01 /* Debug hash chains. */ -#define MPOOL_DUMP_MEM 0x04 /* Debug region memory. */ -#define MPOOL_DUMP_ALL 0x07 /* Debug all. */ - -/* - * __memp_dump_region -- - * Display MPOOL structures. - * - * PUBLIC: int __memp_dump_region __P((DB_ENV *, char *, FILE *)); - */ -int -__memp_dump_region(dbenv, area, fp) - DB_ENV *dbenv; - char *area; - FILE *fp; -{ - static const FN fn[] = { - { MP_CAN_MMAP, "mmapped" }, - { MP_DEADFILE, "dead" }, - { MP_DIRECT, "no buffer" }, - { MP_EXTENT, "extent" }, - { MP_TEMP, "temporary" }, - { MP_UNLINK, "unlink" }, - { 0, NULL } - }; - DB_MPOOL *dbmp; - DB_MPOOLFILE *dbmfp; - MPOOL *mp; - MPOOLFILE *mfp; - size_t fmap[FMAP_ENTRIES + 1]; - u_int32_t i, flags; - int cnt; - u_int8_t *p; - - PANIC_CHECK(dbenv); - ENV_REQUIRES_CONFIG(dbenv, - dbenv->mp_handle, "memp_dump_region", DB_INIT_MPOOL); - - dbmp = dbenv->mp_handle; - - /* Make it easy to call from the debugger. */ - if (fp == NULL) - fp = stderr; - - for (flags = 0; *area != '\0'; ++area) - switch (*area) { - case 'A': - LF_SET(MPOOL_DUMP_ALL); - break; - case 'h': - LF_SET(MPOOL_DUMP_HASH); - break; - case 'm': - LF_SET(MPOOL_DUMP_MEM); - break; - } - - mp = dbmp->reginfo[0].primary; - - /* Display MPOOL structures. */ - (void)fprintf(fp, "%s\nPool (region addr 0x%lx)\n", - DB_LINE, P_TO_ULONG(dbmp->reginfo[0].addr)); - - /* Display the MPOOLFILE structures. */ - R_LOCK(dbenv, dbmp->reginfo); - for (cnt = 0, mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile); - mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile), ++cnt) { - (void)fprintf(fp, "File #%d: %s: pagesize %lu\n", cnt + 1, - __memp_fns(dbmp, mfp), (u_long)mfp->stat.st_pagesize); - (void)fprintf(fp, "\t type %ld; ref %lu; blocks %lu; last %lu;", - (long)mfp->ftype, (u_long)mfp->mpf_cnt, - (u_long)mfp->block_cnt, (u_long)mfp->last_pgno); - __db_prflags(mfp->flags, fn, fp); - - (void)fprintf(fp, "\n\t UID: "); - p = R_ADDR(dbmp->reginfo, mfp->fileid_off); - for (i = 0; i < DB_FILE_ID_LEN; ++i, ++p) { - (void)fprintf(fp, "%x", (u_int)*p); - if (i < DB_FILE_ID_LEN - 1) - (void)fprintf(fp, " "); - } - (void)fprintf(fp, "\n"); - if (cnt < FMAP_ENTRIES) - fmap[cnt] = R_OFFSET(dbmp->reginfo, mfp); - } - R_UNLOCK(dbenv, dbmp->reginfo); - - MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp); - for (dbmfp = TAILQ_FIRST(&dbmp->dbmfq); - dbmfp != NULL; dbmfp = TAILQ_NEXT(dbmfp, q), ++cnt) { - (void)fprintf(fp, "File #%d: %s: per-process, %s\n", - cnt + 1, __memp_fn(dbmfp), - F_ISSET(dbmfp, MP_READONLY) ? "readonly" : "read/write"); - if (cnt < FMAP_ENTRIES) - fmap[cnt] = R_OFFSET(dbmp->reginfo, mfp); - } - MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp); - if (cnt < FMAP_ENTRIES) - fmap[cnt] = INVALID_ROFF; - else - fmap[FMAP_ENTRIES] = INVALID_ROFF; - - /* Dump the memory pools. */ - for (i = 0; i < mp->nreg; ++i) { - (void)fprintf(fp, "%s\nCache #%d:\n", DB_LINE, i + 1); - __memp_dumpcache( - dbenv, dbmp, &dbmp->reginfo[i], fmap, fp, flags); - } - - /* Flush in case we're debugging. */ - (void)fflush(fp); - - return (0); -} - -/* - * __memp_dumpcache -- - * Display statistics for a cache. - */ -static void -__memp_dumpcache(dbenv, dbmp, reginfo, fmap, fp, flags) - DB_ENV *dbenv; - DB_MPOOL *dbmp; - REGINFO *reginfo; - size_t *fmap; - FILE *fp; - u_int32_t flags; -{ - BH *bhp; - DB_MPOOL_HASH *hp; - MPOOL *c_mp; - int bucket; - - c_mp = reginfo->primary; - - /* Display the hash table list of BH's. */ - if (LF_ISSET(MPOOL_DUMP_HASH)) { - (void)fprintf(fp, - "%s\nBH hash table (%lu hash slots)\nbucket (priority):\n", - DB_LINE, (u_long)c_mp->htab_buckets); - (void)fprintf(fp, - "\tpageno, file, ref, address [LSN] priority\n"); - - for (hp = R_ADDR(reginfo, c_mp->htab), - bucket = 0; bucket < c_mp->htab_buckets; ++hp, ++bucket) { - MUTEX_LOCK(dbenv, &hp->hash_mutex); - if ((bhp = - SH_TAILQ_FIRST(&hp->hash_bucket, __bh)) != NULL) - (void)fprintf(fp, "%lu (%u):\n", - (u_long)bucket, hp->hash_priority); - for (; bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) - __memp_pbh(dbmp, bhp, fmap, fp); - MUTEX_UNLOCK(dbenv, &hp->hash_mutex); - } - } - - /* Dump the memory pool. */ - if (LF_ISSET(MPOOL_DUMP_MEM)) - __db_shalloc_dump(reginfo->addr, fp); -} - -/* - * __memp_pbh -- - * Display a BH structure. - */ -static void -__memp_pbh(dbmp, bhp, fmap, fp) - DB_MPOOL *dbmp; - BH *bhp; - size_t *fmap; - FILE *fp; -{ - static const FN fn[] = { - { BH_CALLPGIN, "callpgin" }, - { BH_DIRTY, "dirty" }, - { BH_DIRTY_CREATE, "created" }, - { BH_DISCARD, "discard" }, - { BH_LOCKED, "locked" }, - { BH_TRASH, "trash" }, - { 0, NULL } - }; - int i; - - for (i = 0; i < FMAP_ENTRIES; ++i) - if (fmap[i] == INVALID_ROFF || fmap[i] == bhp->mf_offset) - break; - - if (fmap[i] == INVALID_ROFF) - (void)fprintf(fp, "\t%5lu, %lu, %2lu, %8lu [%lu,%lu] %lu", - (u_long)bhp->pgno, (u_long)bhp->mf_offset, - (u_long)bhp->ref, (u_long)R_OFFSET(dbmp->reginfo, bhp), - (u_long)LSN(bhp->buf).file, (u_long)LSN(bhp->buf).offset, - (u_long)bhp->priority); - else - (void)fprintf(fp, "\t%5lu, #%d, %2lu, %8lu [%lu,%lu] %lu", - (u_long)bhp->pgno, i + 1, - (u_long)bhp->ref, (u_long)R_OFFSET(dbmp->reginfo, bhp), - (u_long)LSN(bhp->buf).file, (u_long)LSN(bhp->buf).offset, - (u_long)bhp->priority); - - __db_prflags(bhp->flags, fn, fp); - - (void)fprintf(fp, "\n"); -} - -/* - * __memp_stat_hash -- - * Total hash bucket stats (other than mutex wait) into the region. - * - * PUBLIC: void __memp_stat_hash __P((REGINFO *, MPOOL *, u_int32_t *)); - */ -void -__memp_stat_hash(reginfo, mp, dirtyp) - REGINFO *reginfo; - MPOOL *mp; - u_int32_t *dirtyp; -{ - DB_MPOOL_HASH *hp; - u_int32_t dirty; - int i; - - hp = R_ADDR(reginfo, mp->htab); - for (i = 0, dirty = 0; i < mp->htab_buckets; i++, hp++) - dirty += hp->hash_page_dirty; - *dirtyp = dirty; -} - -/* - * __memp_stat_wait -- - * Total hash bucket wait stats into the region. - */ -static void -__memp_stat_wait(reginfo, mp, mstat, flags) - REGINFO *reginfo; - MPOOL *mp; - DB_MPOOL_STAT *mstat; - int flags; -{ - DB_MPOOL_HASH *hp; - DB_MUTEX *mutexp; - int i; - - mstat->st_hash_max_wait = 0; - hp = R_ADDR(reginfo, mp->htab); - for (i = 0; i < mp->htab_buckets; i++, hp++) { - mutexp = &hp->hash_mutex; - mstat->st_hash_nowait += mutexp->mutex_set_nowait; - mstat->st_hash_wait += mutexp->mutex_set_wait; - if (mutexp->mutex_set_wait > mstat->st_hash_max_wait) - mstat->st_hash_max_wait = mutexp->mutex_set_wait; - - if (LF_ISSET(DB_STAT_CLEAR)) { - mutexp->mutex_set_wait = 0; - mutexp->mutex_set_nowait = 0; - } - } -} diff --git a/bdb/mp/mp_sync.c b/bdb/mp/mp_sync.c deleted file mode 100644 index 03b42208b39..00000000000 --- a/bdb/mp/mp_sync.c +++ /dev/null @@ -1,627 +0,0 @@ -/*- - * See the file LICENSE for redistribution information. - * - * Copyright (c) 1996-2002 - * Sleepycat Software. All rights reserved. - */ -#include "db_config.h" - -#ifndef lint -static const char revid[] = "$Id: mp_sync.c,v 11.64 2002/08/25 16:00:27 bostic Exp $"; -#endif /* not lint */ - -#ifndef NO_SYSTEM_INCLUDES -#include <sys/types.h> - -#include <stdlib.h> -#endif - -#include "db_int.h" -#include "dbinc/db_shash.h" -#include "dbinc/mp.h" - -typedef struct { - DB_MPOOL_HASH *track_hp; /* Hash bucket. */ - - roff_t track_off; /* Page file offset. */ - db_pgno_t track_pgno; /* Page number. */ -} BH_TRACK; - -static int __bhcmp __P((const void *, const void *)); -static int __memp_close_flush_files __P((DB_ENV *, DB_MPOOL *)); -static int __memp_sync_files __P((DB_ENV *, DB_MPOOL *)); - -/* - * __memp_sync -- - * Mpool sync function. - * - * PUBLIC: int __memp_sync __P((DB_ENV *, DB_LSN *)); - */ -int -__memp_sync(dbenv, lsnp) - DB_ENV *dbenv; - DB_LSN *lsnp; -{ - DB_MPOOL *dbmp; - MPOOL *mp; - int ret; - - PANIC_CHECK(dbenv); - ENV_REQUIRES_CONFIG(dbenv, - dbenv->mp_handle, "memp_sync", DB_INIT_MPOOL); - - /* - * If no LSN is provided, flush the entire cache (reasonable usage - * even if there's no log subsystem configured). - */ - if (lsnp != NULL) - ENV_REQUIRES_CONFIG(dbenv, - dbenv->lg_handle, "memp_sync", DB_INIT_LOG); - - dbmp = dbenv->mp_handle; - mp = dbmp->reginfo[0].primary; - - /* If we've flushed to the requested LSN, return that information. */ - if (lsnp != NULL) { - R_LOCK(dbenv, dbmp->reginfo); - if (log_compare(lsnp, &mp->lsn) <= 0) { - *lsnp = mp->lsn; - - R_UNLOCK(dbenv, dbmp->reginfo); - return (0); - } - R_UNLOCK(dbenv, dbmp->reginfo); - } - - if ((ret = __memp_sync_int(dbenv, NULL, 0, DB_SYNC_CACHE, NULL)) != 0) - return (ret); - - if (lsnp != NULL) { - R_LOCK(dbenv, dbmp->reginfo); - if (log_compare(lsnp, &mp->lsn) > 0) - mp->lsn = *lsnp; - R_UNLOCK(dbenv, dbmp->reginfo); - } - - return (0); -} - -/* - * __memp_fsync -- - * Mpool file sync function. - * - * PUBLIC: int __memp_fsync __P((DB_MPOOLFILE *)); - */ -int -__memp_fsync(dbmfp) - DB_MPOOLFILE *dbmfp; -{ - DB_ENV *dbenv; - DB_MPOOL *dbmp; - - dbmp = dbmfp->dbmp; - dbenv = dbmp->dbenv; - - PANIC_CHECK(dbenv); - - /* - * If this handle doesn't have a file descriptor that's open for - * writing, or if the file is a temporary, there's no reason to - * proceed further. - */ - if (F_ISSET(dbmfp, MP_READONLY)) - return (0); - - if (F_ISSET(dbmfp->mfp, MP_TEMP)) - return (0); - - return (__memp_sync_int(dbenv, dbmfp, 0, DB_SYNC_FILE, NULL)); -} - -/* - * __mp_xxx_fh -- - * Return a file descriptor for DB 1.85 compatibility locking. - * - * PUBLIC: int __mp_xxx_fh __P((DB_MPOOLFILE *, DB_FH **)); - */ -int -__mp_xxx_fh(dbmfp, fhp) - DB_MPOOLFILE *dbmfp; - DB_FH **fhp; -{ - DB_ENV *dbenv; - /* - * This is a truly spectacular layering violation, intended ONLY to - * support compatibility for the DB 1.85 DB->fd call. - * - * Sync the database file to disk, creating the file as necessary. - * - * We skip the MP_READONLY and MP_TEMP tests done by memp_fsync(3). - * The MP_READONLY test isn't interesting because we will either - * already have a file descriptor (we opened the database file for - * reading) or we aren't readonly (we created the database which - * requires write privileges). The MP_TEMP test isn't interesting - * because we want to write to the backing file regardless so that - * we get a file descriptor to return. - */ - *fhp = dbmfp->fhp; - if (F_ISSET(dbmfp->fhp, DB_FH_VALID)) - return (0); - dbenv = dbmfp->dbmp->dbenv; - - return (__memp_sync_int(dbenv, dbmfp, 0, DB_SYNC_FILE, NULL)); -} - -/* - * __memp_sync_int -- - * Mpool sync internal function. - * - * PUBLIC: int __memp_sync_int - * PUBLIC: __P((DB_ENV *, DB_MPOOLFILE *, int, db_sync_op, int *)); - */ -int -__memp_sync_int(dbenv, dbmfp, ar_max, op, wrotep) - DB_ENV *dbenv; - DB_MPOOLFILE *dbmfp; - int ar_max, *wrotep; - db_sync_op op; -{ - BH *bhp; - BH_TRACK *bharray; - DB_MPOOL *dbmp; - DB_MPOOL_HASH *hp; - DB_MUTEX *mutexp; - MPOOL *c_mp, *mp; - MPOOLFILE *mfp; - u_int32_t n_cache; - int ar_cnt, hb_lock, i, pass, remaining, ret, t_ret, wait_cnt, wrote; - - dbmp = dbenv->mp_handle; - mp = dbmp->reginfo[0].primary; - pass = wrote = 0; - - /* - * If the caller does not specify how many pages assume one - * per bucket. - */ - if (ar_max == 0) - ar_max = mp->nreg * mp->htab_buckets; - - if ((ret = - __os_malloc(dbenv, ar_max * sizeof(BH_TRACK), &bharray)) != 0) - return (ret); - - /* - * Walk each cache's list of buffers and mark all dirty buffers to be - * written and all pinned buffers to be potentially written, depending - * on our flags. - */ - for (ar_cnt = 0, n_cache = 0; n_cache < mp->nreg; ++n_cache) { - c_mp = dbmp->reginfo[n_cache].primary; - - hp = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab); - for (i = 0; i < c_mp->htab_buckets; i++, hp++) { - /* - * We can check for empty buckets before locking as we - * only care if the pointer is zero or non-zero. We - * can ignore empty buckets because we only need write - * buffers that were dirty before we started. - */ - if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL) - continue; - - MUTEX_LOCK(dbenv, &hp->hash_mutex); - for (bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh); - bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) { - /* Always ignore unreferenced, clean pages. */ - if (bhp->ref == 0 && !F_ISSET(bhp, BH_DIRTY)) - continue; - - /* - * Checkpoints have to wait on all pinned pages, - * as pages may be marked dirty when returned to - * the cache. - * - * File syncs only wait on pages both pinned and - * dirty. (We don't care if pages are marked - * dirty when returned to the cache, that means - * there's another writing thread and flushing - * the cache for this handle is meaningless.) - */ - if (op == DB_SYNC_FILE && - !F_ISSET(bhp, BH_DIRTY)) - continue; - - mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset); - - /* - * Ignore temporary files -- this means you - * can't even flush temporary files by handle. - * (Checkpoint doesn't require temporary files - * be flushed and the underlying buffer write - * write routine may not be able to write it - * anyway.) - */ - if (F_ISSET(mfp, MP_TEMP)) - continue; - - /* - * If we're flushing a specific file, see if - * this page is from that file. - */ - if (dbmfp != NULL && mfp != dbmfp->mfp) - continue; - - /* - * Ignore files that aren't involved in DB's - * transactional operations during checkpoints. - */ - if (dbmfp == NULL && mfp->lsn_off == -1) - continue; - - /* Track the buffer, we want it. */ - bharray[ar_cnt].track_hp = hp; - bharray[ar_cnt].track_pgno = bhp->pgno; - bharray[ar_cnt].track_off = bhp->mf_offset; - ar_cnt++; - - if (ar_cnt >= ar_max) { - if ((ret = __os_realloc(dbenv, - (ar_max * 2) * sizeof(BH_TRACK), - &bharray)) != 0) - break; - ar_max *= 2; - } - } - MUTEX_UNLOCK(dbenv, &hp->hash_mutex); - - if (ret != 0) - goto err; - } - } - - /* If there no buffers to write, we're done. */ - if (ar_cnt == 0) - goto done; - - /* - * Write the buffers in file/page order, trying to reduce seeks by the - * filesystem and, when pages are smaller than filesystem block sizes, - * reduce the actual number of writes. - */ - if (ar_cnt > 1) - qsort(bharray, ar_cnt, sizeof(BH_TRACK), __bhcmp); - - /* - * If we're trickling buffers, only write enough to reach the correct - * percentage for this region. We may not write enough if the dirty - * buffers have an unbalanced distribution among the regions, but that - * seems unlikely. - */ - if (op == DB_SYNC_TRICKLE && ar_cnt > ar_max / (int)mp->nreg) - ar_cnt = ar_max / (int)mp->nreg; - - /* - * Flush the log. We have to ensure the log records reflecting the - * changes on the database pages we're writing have already made it - * to disk. We still have to check the log each time we write a page - * (because pages we are about to write may be modified after we have - * flushed the log), but in general this will at least avoid any I/O - * on the log's part. - */ - if (LOGGING_ON(dbenv) && (ret = dbenv->log_flush(dbenv, NULL)) != 0) - goto err; - - /* - * Walk the array, writing buffers. When we write a buffer, we NULL - * out its hash bucket pointer so we don't process a slot more than - * once. - */ - for (remaining = ar_cnt, i = pass = 0; remaining > 0; ++i) { - if (i >= ar_cnt) { - i = 0; - ++pass; - __os_sleep(dbenv, 1, 0); - } - if ((hp = bharray[i].track_hp) == NULL) - continue; - - /* Lock the hash bucket and find the buffer. */ - mutexp = &hp->hash_mutex; - MUTEX_LOCK(dbenv, mutexp); - for (bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh); - bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) - if (bhp->pgno == bharray[i].track_pgno && - bhp->mf_offset == bharray[i].track_off) - break; - - /* - * If we can't find the buffer we're done, somebody else had - * to have written it. - * - * If the buffer isn't pinned or dirty, we're done, there's - * no work needed. - */ - if (bhp == NULL || (bhp->ref == 0 && !F_ISSET(bhp, BH_DIRTY))) { - MUTEX_UNLOCK(dbenv, mutexp); - --remaining; - bharray[i].track_hp = NULL; - continue; - } - - /* - * If the buffer is locked by another thread, ignore it, we'll - * come back to it. - * - * If the buffer is pinned and it's only the first or second - * time we have looked at it, ignore it, we'll come back to - * it. - * - * In either case, skip the buffer if we're not required to - * write it. - */ - if (F_ISSET(bhp, BH_LOCKED) || (bhp->ref != 0 && pass < 2)) { - MUTEX_UNLOCK(dbenv, mutexp); - if (op != DB_SYNC_CACHE && op != DB_SYNC_FILE) { - --remaining; - bharray[i].track_hp = NULL; - } - continue; - } - - /* - * The buffer is either pinned or dirty. - * - * Set the sync wait-for count, used to count down outstanding - * references to this buffer as they are returned to the cache. - */ - bhp->ref_sync = bhp->ref; - - /* Pin the buffer into memory and lock it. */ - ++bhp->ref; - F_SET(bhp, BH_LOCKED); - MUTEX_LOCK(dbenv, &bhp->mutex); - - /* - * Unlock the hash bucket and wait for the wait-for count to - * go to 0. No new thread can acquire the buffer because we - * have it locked. - * - * If a thread attempts to re-pin a page, the wait-for count - * will never go to 0 (the thread spins on our buffer lock, - * while we spin on the thread's ref count). Give up if we - * don't get the buffer in 3 seconds, we can try again later. - * - * If, when the wait-for count goes to 0, the buffer is found - * to be dirty, write it. - */ - MUTEX_UNLOCK(dbenv, mutexp); - for (wait_cnt = 1; - bhp->ref_sync != 0 && wait_cnt < 4; ++wait_cnt) - __os_sleep(dbenv, 1, 0); - MUTEX_LOCK(dbenv, mutexp); - hb_lock = 1; - - /* - * If the ref_sync count has gone to 0, we're going to be done - * with this buffer no matter what happens. - */ - if (bhp->ref_sync == 0) { - --remaining; - bharray[i].track_hp = NULL; - } - - /* - * If the ref_sync count has gone to 0 and the buffer is still - * dirty, we write it. We only try to write the buffer once. - * Any process checkpointing or trickle-flushing the pool - * must be able to write any underlying file -- if the write - * fails, error out. It would be very strange if file sync - * failed to write, but we don't care if it happens. - */ - if (bhp->ref_sync == 0 && F_ISSET(bhp, BH_DIRTY)) { - hb_lock = 0; - MUTEX_UNLOCK(dbenv, mutexp); - - mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset); - if ((ret = __memp_bhwrite(dbmp, hp, mfp, bhp, 1)) == 0) - ++wrote; - else if (op == DB_SYNC_CACHE || op == DB_SYNC_TRICKLE) - __db_err(dbenv, "%s: unable to flush page: %lu", - __memp_fns(dbmp, mfp), (u_long)bhp->pgno); - else - ret = 0; - } - - /* - * If ref_sync count never went to 0, the buffer was written - * by another thread, or the write failed, we still have the - * buffer locked. - * - * We may or may not currently hold the hash bucket mutex. If - * the __memp_bhwrite -> __memp_pgwrite call was successful, - * then __memp_pgwrite will have swapped the buffer lock for - * the hash lock. All other call paths will leave us without - * the hash bucket lock. - * - * The order of mutexes above was to acquire the buffer lock - * while holding the hash bucket lock. Don't deadlock here, - * release the buffer lock and then acquire the hash bucket - * lock. - */ - if (F_ISSET(bhp, BH_LOCKED)) { - F_CLR(bhp, BH_LOCKED); - MUTEX_UNLOCK(dbenv, &bhp->mutex); - - if (!hb_lock) - MUTEX_LOCK(dbenv, mutexp); - } - - /* - * Reset the ref_sync count regardless of our success, we're - * done with this buffer for now. - */ - bhp->ref_sync = 0; - - /* Discard our reference and unlock the bucket. */ - --bhp->ref; - MUTEX_UNLOCK(dbenv, mutexp); - - if (ret != 0) - break; - } - -done: /* If we've opened files to flush pages, close them. */ - if ((t_ret = __memp_close_flush_files(dbenv, dbmp)) != 0 && ret == 0) - ret = t_ret; - - /* - * If doing a checkpoint or flushing a file for the application, we - * have to force the pages to disk. We don't do this as we go along - * because we want to give the OS as much time as possible to lazily - * flush, and because we have to flush files that might not even have - * had dirty buffers in the cache, so we have to walk the files list. - */ - if (ret == 0 && (op == DB_SYNC_CACHE || op == DB_SYNC_FILE)) { - if (dbmfp == NULL) - ret = __memp_sync_files(dbenv, dbmp); - else - ret = __os_fsync(dbenv, dbmfp->fhp); - } - -err: __os_free(dbenv, bharray); - if (wrotep != NULL) - *wrotep = wrote; - - return (ret); -} - -/* - * __memp_sync_files -- - * Sync all the files in the environment, open or not. - */ -static -int __memp_sync_files(dbenv, dbmp) - DB_ENV *dbenv; - DB_MPOOL *dbmp; -{ - DB_MPOOLFILE *dbmfp; - MPOOL *mp; - MPOOLFILE *mfp; - int ret, t_ret; - - ret = 0; - mp = dbmp->reginfo[0].primary; - - R_LOCK(dbenv, dbmp->reginfo); - for (mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile); - mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) { - if (mfp->stat.st_page_out == 0 || - F_ISSET(mfp, MP_DEADFILE | MP_TEMP)) - continue; - - /* Look for an already open handle. */ - ret = 0; - MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp); - for (dbmfp = TAILQ_FIRST(&dbmp->dbmfq); - dbmfp != NULL; dbmfp = TAILQ_NEXT(dbmfp, q)) - if (dbmfp->mfp == mfp) { - ret = __os_fsync(dbenv, dbmfp->fhp); - break; - } - MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp); - if (ret != 0) - goto err; - - /* If we don't find one, open one. */ - if (dbmfp == NULL) { - if ((ret = dbenv->memp_fcreate(dbenv, &dbmfp, 0)) != 0) - goto err; - ret = __memp_fopen_int( - dbmfp, mfp, R_ADDR(dbmp->reginfo, mfp->path_off), - 0, 0, mfp->stat.st_pagesize); - if (ret == 0) - ret = __os_fsync(dbenv, dbmfp->fhp); - if ((t_ret = - __memp_fclose_int(dbmfp, 0)) != 0 && ret == 0) - ret = t_ret; - if (ret != 0) - goto err; - } - } - - if (0) { -err: __db_err(dbenv, "%s: cannot sync: %s", - R_ADDR(dbmp->reginfo, mfp->path_off), db_strerror(ret)); - } - R_UNLOCK(dbenv, dbmp->reginfo); - - return (ret); -} - -/* - * __memp_close_flush_files -- - * Close files opened only to flush buffers. - */ -static int -__memp_close_flush_files(dbenv, dbmp) - DB_ENV *dbenv; - DB_MPOOL *dbmp; -{ - DB_MPOOLFILE *dbmfp; - int ret; - - /* - * The routine exists because we must close files opened by sync to - * flush buffers. There are two cases: first, extent files have to - * be closed so they may be removed when empty. Second, regular - * files have to be closed so we don't run out of descriptors (for - * example, and application partitioning its data into databases - * based on timestamps, so there's a continually increasing set of - * files). - * - * We mark files opened in the __memp_bhwrite() function with the - * MP_FLUSH flag. Here we walk through our file descriptor list, - * and, if a file was opened by __memp_bhwrite(), we close it. - */ -retry: MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp); - for (dbmfp = TAILQ_FIRST(&dbmp->dbmfq); - dbmfp != NULL; dbmfp = TAILQ_NEXT(dbmfp, q)) - if (F_ISSET(dbmfp, MP_FLUSH)) { - F_CLR(dbmfp, MP_FLUSH); - MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp); - if ((ret = __memp_fclose_int(dbmfp, 0)) != 0) - return (ret); - goto retry; - } - MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp); - - return (0); -} - -static int -__bhcmp(p1, p2) - const void *p1, *p2; -{ - BH_TRACK *bhp1, *bhp2; - - bhp1 = (BH_TRACK *)p1; - bhp2 = (BH_TRACK *)p2; - - /* Sort by file (shared memory pool offset). */ - if (bhp1->track_off < bhp2->track_off) - return (-1); - if (bhp1->track_off > bhp2->track_off) - return (1); - - /* - * !!! - * Defend against badly written quicksort code calling the comparison - * function with two identical pointers (e.g., WATCOM C++ (Power++)). - */ - if (bhp1->track_pgno < bhp2->track_pgno) - return (-1); - if (bhp1->track_pgno > bhp2->track_pgno) - return (1); - return (0); -} diff --git a/bdb/mp/mp_trickle.c b/bdb/mp/mp_trickle.c deleted file mode 100644 index 71077ab60cc..00000000000 --- a/bdb/mp/mp_trickle.c +++ /dev/null @@ -1,83 +0,0 @@ -/*- - * See the file LICENSE for redistribution information. - * - * Copyright (c) 1996-2002 - * Sleepycat Software. All rights reserved. - */ -#include "db_config.h" - -#ifndef lint -static const char revid[] = "$Id: mp_trickle.c,v 11.24 2002/08/06 06:13:53 bostic Exp $"; -#endif /* not lint */ - -#ifndef NO_SYSTEM_INCLUDES -#include <sys/types.h> - -#include <stdlib.h> -#endif - -#include "db_int.h" -#include "dbinc/db_shash.h" -#include "dbinc/mp.h" - -/* - * __memp_trickle -- - * Keep a specified percentage of the buffers clean. - * - * PUBLIC: int __memp_trickle __P((DB_ENV *, int, int *)); - */ -int -__memp_trickle(dbenv, pct, nwrotep) - DB_ENV *dbenv; - int pct, *nwrotep; -{ - DB_MPOOL *dbmp; - MPOOL *c_mp, *mp; - u_int32_t clean, dirty, i, total, dtmp; - int ret, wrote; - - PANIC_CHECK(dbenv); - ENV_REQUIRES_CONFIG(dbenv, - dbenv->mp_handle, "memp_trickle", DB_INIT_MPOOL); - - dbmp = dbenv->mp_handle; - mp = dbmp->reginfo[0].primary; - - if (nwrotep != NULL) - *nwrotep = 0; - - if (pct < 1 || pct > 100) - return (EINVAL); - - /* - * If there are sufficient clean buffers, no buffers or no dirty - * buffers, we're done. - * - * XXX - * Using hash_page_dirty is our only choice at the moment, but it's not - * as correct as we might like in the presence of pools having more - * than one page size, as a free 512B buffer isn't the same as a free - * 8KB buffer. - * - * Loop through the caches counting total/dirty buffers. - */ - for (ret = 0, i = dirty = total = 0; i < mp->nreg; ++i) { - c_mp = dbmp->reginfo[i].primary; - total += c_mp->stat.st_pages; - __memp_stat_hash(&dbmp->reginfo[i], c_mp, &dtmp); - dirty += dtmp; - } - - clean = total - dirty; - if (clean == total || (clean * 100) / total >= (u_long)pct) - return (0); - - if (nwrotep == NULL) - nwrotep = &wrote; - ret = __memp_sync_int(dbenv, NULL, - ((total * pct) / 100) - clean, DB_SYNC_TRICKLE, nwrotep); - - mp->stat.st_page_trickle += *nwrotep; - - return (ret); -} |