Diffstat (limited to 'bdb/mp')
-rw-r--r-- | bdb/mp/Design | 52
-rw-r--r-- | bdb/mp/mp_alloc.c | 430
-rw-r--r-- | bdb/mp/mp_bh.c | 568
-rw-r--r-- | bdb/mp/mp_fget.c | 763
-rw-r--r-- | bdb/mp/mp_fopen.c | 1167
-rw-r--r-- | bdb/mp/mp_fput.c | 196
-rw-r--r-- | bdb/mp/mp_fset.c | 63
-rw-r--r-- | bdb/mp/mp_method.c | 109
-rw-r--r-- | bdb/mp/mp_region.c | 211
-rw-r--r-- | bdb/mp/mp_register.c | 33
-rw-r--r-- | bdb/mp/mp_stat.c | 325
-rw-r--r-- | bdb/mp/mp_sync.c | 909
-rw-r--r-- | bdb/mp/mp_trickle.c | 136
13 files changed, 2917 insertions, 2045 deletions
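The first hunk below deletes the mp/Design notes, which describe mpool's locking protocol: a single region lock protects all shared structures, including the buffer headers, while a per-buffer mutex is held for the duration of any I/O so that other threads can wait on the BH_LOCKED flag. As background for reading that hunk, here is a minimal illustrative sketch of the buffer-search wait loop the notes describe. The names (region_lock, find_buffer, yield_cpu) are hypothetical stand-ins, and plain POSIX mutexes substitute for Berkeley DB's shared-memory mutex layer; this is not the library's code.

#include <pthread.h>
#include <stddef.h>

/* Hypothetical stand-ins for the structures the Design notes discuss. */
#define BH_LOCKED 0x01                    /* set while I/O is in progress */

struct buffer {
	pthread_mutex_t mutex;            /* per-buffer I/O lock */
	int flags;
	int ref;                          /* reference count */
};

extern pthread_mutex_t region_lock;              /* region-wide mutex */
extern struct buffer *find_buffer(int pgno);     /* hash lookup (assumed) */
extern void yield_cpu(void);                     /* e.g., sched_yield() */

/* Search for a buffer, waiting out any in-progress I/O on it. */
struct buffer *
search_buffer(int pgno)
{
	struct buffer *bhp;
	int first;

	pthread_mutex_lock(&region_lock);
	if ((bhp = find_buffer(pgno)) == NULL) {  /* assume page resident */
		pthread_mutex_unlock(&region_lock);
		return (NULL);
	}
	++bhp->ref;                       /* guarantee the buffer stays */
	for (first = 1; bhp->flags & BH_LOCKED; first = 0) {
		pthread_mutex_unlock(&region_lock);
		/*
		 * Yield after the first pass; otherwise we would spend
		 * our quantum ping-ponging between the two locks.
		 */
		if (!first)
			yield_cpu();
		pthread_mutex_lock(&bhp->mutex);  /* blocks until I/O done */
		pthread_mutex_unlock(&bhp->mutex);
		pthread_mutex_lock(&region_lock);
	}
	pthread_mutex_unlock(&region_lock);
	return (bhp);
}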
diff --git a/bdb/mp/Design b/bdb/mp/Design deleted file mode 100644 index 1b26aae6cba..00000000000 --- a/bdb/mp/Design +++ /dev/null @@ -1,52 +0,0 @@ -$Id: Design,v 11.2 1999/11/21 23:08:27 bostic Exp $ - -There are three ways we do locking in the mpool code: - -Locking a handle mutex to provide concurrency for DB_THREAD operations. -Locking the region mutex to provide mutual exclusion while reading and - writing structures in the shared region. -Locking buffer header mutexes during I/O. - -The first will not be further described here. We use the shared mpool -region lock to provide mutual exclusion while reading/modifying all of -the data structures, including the buffer headers. We use a per-buffer -header lock to wait on buffer I/O. The order of locking is as follows: - -Searching for a buffer: - Acquire the region lock. - Find the buffer header. - Increment the reference count (guarantee the buffer stays). - While the BH_LOCKED flag is set (I/O is going on) { - Release the region lock. - Explicitly yield the processor if it's not the first pass - through this loop, otherwise, we can simply spin because - we'll be simply switching between the two locks. - Request the buffer lock. - The I/O will complete... - Acquire the buffer lock. - Release the buffer lock. - Acquire the region lock. - } - Return the buffer. - -Reading/writing a buffer: - Acquire the region lock. - Find/create the buffer header. - If reading, increment the reference count (guarantee the buffer stays). - Set the BH_LOCKED flag. - Acquire the buffer lock (guaranteed not to block). - Release the region lock. - Do the I/O and/or initialize the buffer contents. - Release the buffer lock. - At this point, the buffer lock is available, but the logical - operation (flagged by BH_LOCKED) is not yet completed. For - this reason, among others, threads checking the BH_LOCKED flag - must loop around their test. - Acquire the region lock. - Clear the BH_LOCKED flag. - Release the region lock. - Return/discard the buffer. - -Pointers to DB_MPOOL, MPOOL, DB_MPOOLFILE and MPOOLFILE structures are -not reacquired when a region lock is reacquired because they couldn't -have been closed/discarded and because they never move in memory. diff --git a/bdb/mp/mp_alloc.c b/bdb/mp/mp_alloc.c index 731f569f57f..96dd612d7ba 100644 --- a/bdb/mp/mp_alloc.c +++ b/bdb/mp/mp_alloc.c @@ -1,22 +1,31 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Copyright (c) 1996-2002 * Sleepycat Software. All rights reserved. 
*/ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: mp_alloc.c,v 11.7 2000/04/20 21:14:18 bostic Exp $"; +static const char revid[] = "$Id: mp_alloc.c,v 11.31 2002/08/14 17:21:37 ubell Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES #include <sys/types.h> +#include <string.h> #endif #include "db_int.h" -#include "db_shash.h" -#include "mp.h" +#include "dbinc/db_shash.h" +#include "dbinc/mp.h" + +typedef struct { + DB_MPOOL_HASH *bucket; + u_int32_t priority; +} HS; + +static void __memp_bad_buffer __P((DB_MPOOL_HASH *)); +static void __memp_reset_lru __P((DB_ENV *, REGINFO *, MPOOL *)); /* * __memp_alloc -- @@ -34,14 +43,32 @@ __memp_alloc(dbmp, memreg, mfp, len, offsetp, retp) roff_t *offsetp; void *retp; { - BH *bhp, *nbhp; + BH *bhp; + DB_ENV *dbenv; + DB_MPOOL_HASH *dbht, *hp, *hp_end, *hp_tmp; + DB_MUTEX *mutexp; MPOOL *c_mp; MPOOLFILE *bh_mfp; - size_t total; - int nomore, restart, ret, wrote; + size_t freed_space; + u_int32_t buckets, buffers, high_priority, max_na, priority; + int aggressive, ret; void *p; + dbenv = dbmp->dbenv; c_mp = memreg->primary; + dbht = R_ADDR(memreg, c_mp->htab); + hp_end = &dbht[c_mp->htab_buckets]; + + buckets = buffers = 0; + aggressive = 0; + + c_mp->stat.st_alloc++; + + /* + * Get aggressive if we've tried to flush the number of pages as are + * in the system without finding space. + */ + max_na = 5 * c_mp->htab_buckets; /* * If we're allocating a buffer, and the one we're discarding is the @@ -53,100 +80,363 @@ __memp_alloc(dbmp, memreg, mfp, len, offsetp, retp) if (mfp != NULL) len = (sizeof(BH) - sizeof(u_int8_t)) + mfp->stat.st_pagesize; - nomore = 0; + R_LOCK(dbenv, memreg); + + /* + * On every buffer allocation we update the buffer generation number + * and check for wraparound. + */ + if (++c_mp->lru_count == UINT32_T_MAX) + __memp_reset_lru(dbenv, memreg, c_mp); + + /* + * Anything newer than 1/10th of the buffer pool is ignored during + * allocation (unless allocation starts failing). + */ + DB_ASSERT(c_mp->lru_count > c_mp->stat.st_pages / 10); + high_priority = c_mp->lru_count - c_mp->stat.st_pages / 10; + + /* + * First we try to allocate from free memory. If that fails, scan the + * buffer pool to find buffers with low priorities. We consider small + * sets of hash buckets each time to limit the amount of work needing + * to be done. This approximates LRU, but not very well. We either + * find a buffer of the same size to use, or we will free 3 times what + * we need in the hopes it will coalesce into a contiguous chunk of the + * right size. In the latter case we branch back here and try again. + */ alloc: if ((ret = __db_shalloc(memreg->addr, len, MUTEX_ALIGN, &p)) == 0) { - if (offsetp != NULL) + if (mfp != NULL) + c_mp->stat.st_pages++; + R_UNLOCK(dbenv, memreg); + +found: if (offsetp != NULL) *offsetp = R_OFFSET(memreg, p); *(void **)retp = p; + + /* + * Update the search statistics. + * + * We're not holding the region locked here, these statistics + * can't be trusted. 
+ */ + if (buckets != 0) { + if (buckets > c_mp->stat.st_alloc_max_buckets) + c_mp->stat.st_alloc_max_buckets = buckets; + c_mp->stat.st_alloc_buckets += buckets; + } + if (buffers != 0) { + if (buffers > c_mp->stat.st_alloc_max_pages) + c_mp->stat.st_alloc_max_pages = buffers; + c_mp->stat.st_alloc_pages += buffers; + } return (0); } - if (nomore) { - __db_err(dbmp->dbenv, - "Unable to allocate %lu bytes from mpool shared region: %s\n", - (u_long)len, db_strerror(ret)); - return (ret); - } -retry: /* Find a buffer we can flush; pure LRU. */ - restart = total = 0; - for (bhp = - SH_TAILQ_FIRST(&c_mp->bhq, __bh); bhp != NULL; bhp = nbhp) { - nbhp = SH_TAILQ_NEXT(bhp, q, __bh); + /* + * We re-attempt the allocation every time we've freed 3 times what + * we need. Reset our free-space counter. + */ + freed_space = 0; - /* Ignore pinned or locked (I/O in progress) buffers. */ - if (bhp->ref != 0 || F_ISSET(bhp, BH_LOCKED)) + /* + * Walk the hash buckets and find the next two with potentially useful + * buffers. Free the buffer with the lowest priority from the buckets' + * chains. + */ + for (hp_tmp = NULL;;) { + /* Check for wrap around. */ + hp = &dbht[c_mp->last_checked++]; + if (hp >= hp_end) { + c_mp->last_checked = 0; + + /* + * If we've gone through all of the hash buckets, try + * an allocation. If the cache is small, the old page + * size is small, and the new page size is large, we + * might have freed enough memory (but not 3 times the + * memory). + */ + goto alloc; + } + + /* + * Skip empty buckets. + * + * We can check for empty buckets before locking as we + * only care if the pointer is zero or non-zero. + */ + if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL) continue; - /* Find the associated MPOOLFILE. */ - bh_mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset); + /* + * The failure mode is when there are too many buffers we can't + * write or there's not enough memory in the system. We don't + * have a metric for deciding if allocation has no possible way + * to succeed, so we don't ever fail, we assume memory will be + * available if we wait long enough. + * + * Get aggressive if we've tried to flush 5 times the number of + * hash buckets as are in the system -- it's possible we have + * been repeatedly trying to flush the same buffers, although + * it's unlikely. Aggressive means: + * + * a: set a flag to attempt to flush high priority buffers as + * well as other buffers. + * b: sync the mpool to force out queue extent pages. While we + * might not have enough space for what we want and flushing + * is expensive, why not? + * c: sleep for a second -- hopefully someone else will run and + * free up some memory. Try to allocate memory too, in case + * the other thread returns its memory to the region. + * d: look at a buffer in every hash bucket rather than choose + * the more preferable of two. + * + * !!! + * This test ignores pathological cases like no buffers in the + * system -- that shouldn't be possible. + */ + if ((++buckets % max_na) == 0) { + aggressive = 1; - /* Write the page if it's dirty. */ - if (F_ISSET(bhp, BH_DIRTY)) { - ++bhp->ref; - if ((ret = __memp_bhwrite(dbmp, - bh_mfp, bhp, &restart, &wrote)) != 0) - return (ret); - --bhp->ref; + R_UNLOCK(dbenv, memreg); - /* - * Another process may have acquired this buffer and - * incremented the ref count after we wrote it. 
- */ - if (bhp->ref != 0) - goto retry; + (void)__memp_sync_int( + dbenv, NULL, 0, DB_SYNC_ALLOC, NULL); + + (void)__os_sleep(dbenv, 1, 0); + + R_LOCK(dbenv, memreg); + goto alloc; + } + + if (!aggressive) { + /* Skip high priority buckets. */ + if (hp->hash_priority > high_priority) + continue; /* - * If we wrote the page, continue and free the buffer. - * We don't have to rewalk the list to acquire the - * buffer because it was never available for any other - * process to modify it. - * - * If we didn't write the page, but we discarded and - * reacquired the region lock, restart the list walk. - * - * If we neither wrote the buffer nor discarded the - * region lock, continue down the buffer list. + * Find two buckets and select the one with the lowest + * priority. Performance testing shows that looking + * at two improves the LRUness and looking at more only + * does a little better. */ - if (wrote) - ++c_mp->stat.st_rw_evict; - else { - if (restart) - goto retry; + if (hp_tmp == NULL) { + hp_tmp = hp; continue; } + if (hp->hash_priority > hp_tmp->hash_priority) + hp = hp_tmp; + hp_tmp = NULL; + } + + /* Remember the priority of the buffer we're looking for. */ + priority = hp->hash_priority; + + /* Unlock the region and lock the hash bucket. */ + R_UNLOCK(dbenv, memreg); + mutexp = &hp->hash_mutex; + MUTEX_LOCK(dbenv, mutexp); + +#ifdef DIAGNOSTIC + __memp_check_order(hp); +#endif + /* + * The lowest priority page is first in the bucket, as they are + * maintained in sorted order. + * + * The buffer may have been freed or its priority changed while + * we switched from the region lock to the hash lock. If so, + * we have to restart. We will still take the first buffer on + * the bucket's list, though, if it has a low enough priority. + */ + if ((bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)) == NULL || + bhp->ref != 0 || bhp->priority > priority) + goto next_hb; + + buffers++; + + /* Find the associated MPOOLFILE. */ + bh_mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset); + + /* If the page is dirty, pin it and write it. */ + ret = 0; + if (F_ISSET(bhp, BH_DIRTY)) { + ++bhp->ref; + ret = __memp_bhwrite(dbmp, hp, bh_mfp, bhp, 0); + --bhp->ref; + if (ret == 0) + ++c_mp->stat.st_rw_evict; } else ++c_mp->stat.st_ro_evict; /* + * If a write fails for any reason, we can't proceed. + * + * We released the hash bucket lock while doing I/O, so another + * thread may have acquired this buffer and incremented the ref + * count after we wrote it, in which case we can't have it. + * + * If there's a write error, avoid selecting this buffer again + * by making it the bucket's least-desirable buffer. + */ + if (ret != 0 || bhp->ref != 0) { + if (ret != 0 && aggressive) + __memp_bad_buffer(hp); + goto next_hb; + } + + /* * Check to see if the buffer is the size we're looking for. - * If it is, simply reuse it. + * If so, we can simply reuse it. Else, free the buffer and + * its space and keep looking. */ if (mfp != NULL && mfp->stat.st_pagesize == bh_mfp->stat.st_pagesize) { - __memp_bhfree(dbmp, bhp, 0); + __memp_bhfree(dbmp, hp, bhp, 0); - if (offsetp != NULL) - *offsetp = R_OFFSET(memreg, bhp); - *(void **)retp = bhp; - return (0); + p = bhp; + goto found; } - /* Note how much space we've freed, and free the buffer. */ - total += __db_shsizeof(bhp); - __memp_bhfree(dbmp, bhp, 1); + freed_space += __db_shsizeof(bhp); + __memp_bhfree(dbmp, hp, bhp, 1); /* - * Retry as soon as we've freed up sufficient space. 
If we - * have to coalesce of memory to satisfy the request, don't - * try until it's likely (possible?) that we'll succeed. + * Unlock this hash bucket and re-acquire the region lock. If + * we're reaching here as a result of calling memp_bhfree, the + * hash bucket lock has already been discarded. */ - if (total >= 3 * len) + if (0) { +next_hb: MUTEX_UNLOCK(dbenv, mutexp); + } + R_LOCK(dbenv, memreg); + + /* + * Retry the allocation as soon as we've freed up sufficient + * space. We're likely to have to coalesce of memory to + * satisfy the request, don't try until it's likely (possible?) + * we'll succeed. + */ + if (freed_space >= 3 * len) goto alloc; + } + /* NOTREACHED */ +} + +/* + * __memp_bad_buffer -- + * Make the first buffer in a hash bucket the least desirable buffer. + */ +static void +__memp_bad_buffer(hp) + DB_MPOOL_HASH *hp; +{ + BH *bhp, *t_bhp; + u_int32_t priority; + + /* Remove the first buffer from the bucket. */ + bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh); + SH_TAILQ_REMOVE(&hp->hash_bucket, bhp, hq, __bh); + + /* + * Find the highest priority buffer in the bucket. Buffers are + * sorted by priority, so it's the last one in the bucket. + * + * XXX + * Should use SH_TAILQ_LAST, but I think that macro is broken. + */ + priority = bhp->priority; + for (t_bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh); + t_bhp != NULL; t_bhp = SH_TAILQ_NEXT(t_bhp, hq, __bh)) + priority = t_bhp->priority; + + /* + * Set our buffer's priority to be just as bad, and append it to + * the bucket. + */ + bhp->priority = priority; + SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, bhp, hq); - /* Restart the walk if we discarded the region lock. */ - if (restart) - goto retry; + /* Reset the hash bucket's priority. */ + hp->hash_priority = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)->priority; +} + +/* + * __memp_reset_lru -- + * Reset the cache LRU counter. + */ +static void +__memp_reset_lru(dbenv, memreg, c_mp) + DB_ENV *dbenv; + REGINFO *memreg; + MPOOL *c_mp; +{ + BH *bhp; + DB_MPOOL_HASH *hp; + int bucket; + + /* + * Update the counter so all future allocations will start at the + * bottom. + */ + c_mp->lru_count -= MPOOL_BASE_DECREMENT; + + /* Release the region lock. */ + R_UNLOCK(dbenv, memreg); + + /* Adjust the priority of every buffer in the system. */ + for (hp = R_ADDR(memreg, c_mp->htab), + bucket = 0; bucket < c_mp->htab_buckets; ++hp, ++bucket) { + /* + * Skip empty buckets. + * + * We can check for empty buckets before locking as we + * only care if the pointer is zero or non-zero. + */ + if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL) + continue; + + MUTEX_LOCK(dbenv, &hp->hash_mutex); + for (bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh); + bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) + if (bhp->priority != UINT32_T_MAX && + bhp->priority > MPOOL_BASE_DECREMENT) + bhp->priority -= MPOOL_BASE_DECREMENT; + MUTEX_UNLOCK(dbenv, &hp->hash_mutex); } - nomore = 1; - goto alloc; + + /* Reacquire the region lock. */ + R_LOCK(dbenv, memreg); +} + +#ifdef DIAGNOSTIC +/* + * __memp_check_order -- + * Verify the priority ordering of a hash bucket chain. + * + * PUBLIC: #ifdef DIAGNOSTIC + * PUBLIC: void __memp_check_order __P((DB_MPOOL_HASH *)); + * PUBLIC: #endif + */ +void +__memp_check_order(hp) + DB_MPOOL_HASH *hp; +{ + BH *bhp; + u_int32_t priority; + + /* + * Assumes the hash bucket is locked. 
+ */ + if ((bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)) == NULL) + return; + + DB_ASSERT(bhp->priority == hp->hash_priority); + + for (priority = bhp->priority; + (bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) != NULL; + priority = bhp->priority) + DB_ASSERT(priority <= bhp->priority); } +#endif diff --git a/bdb/mp/mp_bh.c b/bdb/mp/mp_bh.c index e802b165b2d..85d15218abf 100644 --- a/bdb/mp/mp_bh.c +++ b/bdb/mp/mp_bh.c @@ -1,13 +1,13 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Copyright (c) 1996-2002 * Sleepycat Software. All rights reserved. */ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: mp_bh.c,v 11.25 2001/01/10 04:50:53 ubell Exp $"; +static const char revid[] = "$Id: mp_bh.c,v 11.71 2002/09/04 19:06:45 margo Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -18,40 +18,41 @@ static const char revid[] = "$Id: mp_bh.c,v 11.25 2001/01/10 04:50:53 ubell Exp #endif #include "db_int.h" -#include "db_shash.h" -#include "mp.h" -#include "log.h" -#include "db_page.h" +#include "dbinc/db_shash.h" +#include "dbinc/mp.h" +#include "dbinc/log.h" +#include "dbinc/db_page.h" +static int __memp_pgwrite + __P((DB_MPOOL *, DB_MPOOLFILE *, DB_MPOOL_HASH *, BH *)); static int __memp_upgrade __P((DB_MPOOL *, DB_MPOOLFILE *, MPOOLFILE *)); /* * __memp_bhwrite -- - * Write the page associated with a given bucket header. + * Write the page associated with a given buffer header. * - * PUBLIC: int __memp_bhwrite - * PUBLIC: __P((DB_MPOOL *, MPOOLFILE *, BH *, int *, int *)); + * PUBLIC: int __memp_bhwrite __P((DB_MPOOL *, + * PUBLIC: DB_MPOOL_HASH *, MPOOLFILE *, BH *, int)); */ int -__memp_bhwrite(dbmp, mfp, bhp, restartp, wrotep) +__memp_bhwrite(dbmp, hp, mfp, bhp, open_extents) DB_MPOOL *dbmp; + DB_MPOOL_HASH *hp; MPOOLFILE *mfp; BH *bhp; - int *restartp, *wrotep; + int open_extents; { + DB_ENV *dbenv; DB_MPOOLFILE *dbmfp; DB_MPREG *mpreg; - int incremented, ret; + int local_open, incremented, ret; - if (restartp != NULL) - *restartp = 0; - if (wrotep != NULL) - *wrotep = 0; - incremented = 0; + dbenv = dbmp->dbenv; + local_open = incremented = 0; /* - * If the file has been removed or is a closed temporary file, Jump - * right ahead and pretend that we've found the file we want-- the + * If the file has been removed or is a closed temporary file, jump + * right ahead and pretend that we've found the file we want -- the * page-write function knows how to handle the fact that we don't have * (or need!) any real file descriptor information. */ @@ -66,52 +67,60 @@ __memp_bhwrite(dbmp, mfp, bhp, restartp, wrotep) * If we find a descriptor on the file that's not open for writing, we * try and upgrade it to make it writeable. If that fails, we're done. */ - MUTEX_THREAD_LOCK(dbmp->dbenv, dbmp->mutexp); + MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp); for (dbmfp = TAILQ_FIRST(&dbmp->dbmfq); dbmfp != NULL; dbmfp = TAILQ_NEXT(dbmfp, q)) if (dbmfp->mfp == mfp) { if (F_ISSET(dbmfp, MP_READONLY) && - __memp_upgrade(dbmp, dbmfp, mfp)) { - MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp); - return (0); + !F_ISSET(dbmfp, MP_UPGRADE) && + (F_ISSET(dbmfp, MP_UPGRADE_FAIL) || + __memp_upgrade(dbmp, dbmfp, mfp))) { + MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp); + return (EPERM); } /* * Increment the reference count -- see the comment in - * memp_fclose(). + * __memp_fclose_int(). 
*/ ++dbmfp->ref; incremented = 1; break; } - MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp); + MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp); + if (dbmfp != NULL) goto found; /* * !!! + * It's the caller's choice if we're going to open extent files. + */ + if (!open_extents && F_ISSET(mfp, MP_EXTENT)) + return (EPERM); + + /* + * !!! * Don't try to attach to temporary files. There are two problems in * trying to do that. First, if we have different privileges than the * process that "owns" the temporary file, we might create the backing * disk file such that the owning process couldn't read/write its own - * buffers, e.g., memp_trickle() running as root creating a file owned + * buffers, e.g., memp_trickle running as root creating a file owned * as root, mode 600. Second, if the temporary file has already been * created, we don't have any way of finding out what its real name is, * and, even if we did, it was already unlinked (so that it won't be * left if the process dies horribly). This decision causes a problem, * however: if the temporary file consumes the entire buffer cache, * and the owner doesn't flush the buffers to disk, we could end up - * with resource starvation, and the memp_trickle() thread couldn't do + * with resource starvation, and the memp_trickle thread couldn't do * anything about it. That's a pretty unlikely scenario, though. * - * Note that we should never get here when the temporary file - * in question has already been closed in another process, in which - * case it should be marked MP_DEADFILE. + * Note we should never get here when the temporary file in question + * has already been closed in another process, in which case it should + * be marked MP_DEADFILE. */ - if (F_ISSET(mfp, MP_TEMP)) { - DB_ASSERT(!F_ISSET(mfp, MP_DEADFILE)); - return (0); - } + if (F_ISSET(mfp, MP_TEMP)) + return (EPERM); /* * It's not a page from a file we've opened. If the file requires @@ -120,14 +129,14 @@ __memp_bhwrite(dbmp, mfp, bhp, restartp, wrotep) * nothing we can do. */ if (mfp->ftype != 0) { - MUTEX_THREAD_LOCK(dbmp->dbenv, dbmp->mutexp); + MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp); for (mpreg = LIST_FIRST(&dbmp->dbregq); mpreg != NULL; mpreg = LIST_NEXT(mpreg, q)) if (mpreg->ftype == mfp->ftype) break; - MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp); + MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp); if (mpreg == NULL) - return (0); + return (EPERM); } /* @@ -138,17 +147,24 @@ __memp_bhwrite(dbmp, mfp, bhp, restartp, wrotep) * There's no negative cache, so we may repeatedly try and open files * that we have previously tried (and failed) to open. 
*/ - if (__memp_fopen(dbmp, mfp, R_ADDR(dbmp->reginfo, mfp->path_off), - 0, 0, mfp->stat.st_pagesize, 0, NULL, &dbmfp) != 0) - return (0); + if ((ret = dbenv->memp_fcreate(dbenv, &dbmfp, 0)) != 0) + return (ret); + if ((ret = __memp_fopen_int(dbmfp, mfp, + R_ADDR(dbmp->reginfo, mfp->path_off), + 0, 0, mfp->stat.st_pagesize)) != 0) { + (void)dbmfp->close(dbmfp, 0); + return (ret); + } + local_open = 1; -found: ret = __memp_pgwrite(dbmp, dbmfp, bhp, restartp, wrotep); +found: ret = __memp_pgwrite(dbmp, dbmfp, hp, bhp); - if (incremented) { - MUTEX_THREAD_LOCK(dbmp->dbenv, dbmp->mutexp); + MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp); + if (incremented) --dbmfp->ref; - MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp); - } + else if (local_open) + F_SET(dbmfp, MP_FLUSH); + MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp); return (ret); } @@ -157,11 +173,12 @@ found: ret = __memp_pgwrite(dbmp, dbmfp, bhp, restartp, wrotep); * __memp_pgread -- * Read a page from a file. * - * PUBLIC: int __memp_pgread __P((DB_MPOOLFILE *, BH *, int)); + * PUBLIC: int __memp_pgread __P((DB_MPOOLFILE *, DB_MUTEX *, BH *, int)); */ int -__memp_pgread(dbmfp, bhp, can_create) +__memp_pgread(dbmfp, mutexp, bhp, can_create) DB_MPOOLFILE *dbmfp; + DB_MUTEX *mutexp; BH *bhp; int can_create; { @@ -169,171 +186,129 @@ __memp_pgread(dbmfp, bhp, can_create) DB_ENV *dbenv; DB_MPOOL *dbmp; MPOOLFILE *mfp; - size_t len, pagesize; - size_t nr; - int created, ret; + size_t len, nr, pagesize; + int ret; dbmp = dbmfp->dbmp; dbenv = dbmp->dbenv; mfp = dbmfp->mfp; pagesize = mfp->stat.st_pagesize; + /* We should never be called with a dirty or a locked buffer. */ + DB_ASSERT(!F_ISSET(bhp, BH_DIRTY | BH_DIRTY_CREATE | BH_LOCKED)); + + /* Lock the buffer and swap the hash bucket lock for the buffer lock. */ F_SET(bhp, BH_LOCKED | BH_TRASH); - MUTEX_LOCK(dbenv, &bhp->mutex, dbenv->lockfhp); - R_UNLOCK(dbenv, dbmp->reginfo); + MUTEX_LOCK(dbenv, &bhp->mutex); + MUTEX_UNLOCK(dbenv, mutexp); /* * Temporary files may not yet have been created. We don't create * them now, we create them when the pages have to be flushed. */ nr = 0; - if (F_ISSET(&dbmfp->fh, DB_FH_VALID)) { - /* - * Ignore read errors if we have permission to create the page. - * Assume that the page doesn't exist, and that we'll create it - * when we write it out. - * - * XXX - * Theoretically, we could overwrite a page of data if it were - * possible for a file to be successfully opened for reading - * and then for the read to fail. Shouldn't ever happen, but - * it might be worth checking to see if the offset is past the - * known end-of-file. - */ - db_io.fhp = &dbmfp->fh; + if (F_ISSET(dbmfp->fhp, DB_FH_VALID)) { + db_io.fhp = dbmfp->fhp; db_io.mutexp = dbmfp->mutexp; db_io.pagesize = db_io.bytes = pagesize; db_io.pgno = bhp->pgno; db_io.buf = bhp->buf; - ret = __os_io(dbenv, &db_io, DB_IO_READ, &nr); - } else - ret = 0; + /* + * The page may not exist; if it doesn't, nr may well be 0, + * but we expect the underlying OS calls not to return an + * error code in this case. + */ + if ((ret = __os_io(dbenv, &db_io, DB_IO_READ, &nr)) != 0) + goto err; + } - created = 0; if (nr < pagesize) { - if (can_create) - created = 1; - else { - /* - * If we had a short read, ret may be 0. This may not - * be an error -- in particular DB recovery processing - * may request pages that have never been written to - * disk, in which case we won't find the page. So, the - * caller must know how to handle the error. - */ - if (ret == 0) - ret = EIO; + /* + * Don't output error messages for short reads. 
In particular, + * DB recovery processing may request pages never written to + * disk or for which only some part have been written to disk, + * in which case we won't find the page. The caller must know + * how to handle the error. + */ + if (can_create == 0) { + ret = DB_PAGE_NOTFOUND; goto err; } - } - /* - * Clear any bytes we didn't read that need to be cleared. If we're - * running in diagnostic mode, smash any bytes on the page that are - * unknown quantities for the caller. - */ - if (nr != pagesize) { + /* Clear any bytes that need to be cleared. */ len = mfp->clear_len == 0 ? pagesize : mfp->clear_len; - if (nr < len) - memset(bhp->buf + nr, 0, len - nr); -#ifdef DIAGNOSTIC - if (nr > len) - len = nr; + memset(bhp->buf, 0, len); + +#if defined(DIAGNOSTIC) || defined(UMRW) + /* + * If we're running in diagnostic mode, corrupt any bytes on + * the page that are unknown quantities for the caller. + */ if (len < pagesize) memset(bhp->buf + len, CLEAR_BYTE, pagesize - len); #endif - } + ++mfp->stat.st_page_create; + } else + ++mfp->stat.st_page_in; /* Call any pgin function. */ ret = mfp->ftype == 0 ? 0 : __memp_pg(dbmfp, bhp, 1); - /* Unlock the buffer and reacquire the region lock. */ + /* Unlock the buffer and reacquire the hash bucket lock. */ err: MUTEX_UNLOCK(dbenv, &bhp->mutex); - R_LOCK(dbenv, dbmp->reginfo); + MUTEX_LOCK(dbenv, mutexp); /* * If no errors occurred, the data is now valid, clear the BH_TRASH * flag; regardless, clear the lock bit and let other threads proceed. */ F_CLR(bhp, BH_LOCKED); - if (ret == 0) { + if (ret == 0) F_CLR(bhp, BH_TRASH); - /* Update the statistics. */ - if (created) - ++mfp->stat.st_page_create; - else - ++mfp->stat.st_page_in; - } - return (ret); } /* * __memp_pgwrite -- * Write a page to a file. - * - * PUBLIC: int __memp_pgwrite - * PUBLIC: __P((DB_MPOOL *, DB_MPOOLFILE *, BH *, int *, int *)); */ -int -__memp_pgwrite(dbmp, dbmfp, bhp, restartp, wrotep) +static int +__memp_pgwrite(dbmp, dbmfp, hp, bhp) DB_MPOOL *dbmp; DB_MPOOLFILE *dbmfp; + DB_MPOOL_HASH *hp; BH *bhp; - int *restartp, *wrotep; { DB_ENV *dbenv; DB_IO db_io; DB_LSN lsn; - MPOOL *c_mp, *mp; MPOOLFILE *mfp; size_t nw; - int callpgin, dosync, ret, syncfail; - const char *fail; + int callpgin, ret; dbenv = dbmp->dbenv; - mp = dbmp->reginfo[0].primary; mfp = dbmfp == NULL ? NULL : dbmfp->mfp; - - if (restartp != NULL) - *restartp = 0; - if (wrotep != NULL) - *wrotep = 0; - callpgin = 0; + callpgin = ret = 0; /* - * Check the dirty bit -- this buffer may have been written since we - * decided to write it. + * We should never be called with a clean or trash buffer. + * The sync code does call us with already locked buffers. */ - if (!F_ISSET(bhp, BH_DIRTY)) { - if (wrotep != NULL) - *wrotep = 1; - return (0); - } - - MUTEX_LOCK(dbenv, &bhp->mutex, dbenv->lockfhp); + DB_ASSERT(F_ISSET(bhp, BH_DIRTY)); + DB_ASSERT(!F_ISSET(bhp, BH_TRASH)); /* - * If there were two writers, we may have just been waiting while the - * other writer completed I/O on this buffer. Check the dirty bit one - * more time. + * If we have not already traded the hash bucket lock for the buffer + * lock, do so now. 
*/ - if (!F_ISSET(bhp, BH_DIRTY)) { - MUTEX_UNLOCK(dbenv, &bhp->mutex); - - if (wrotep != NULL) - *wrotep = 1; - return (0); + if (!F_ISSET(bhp, BH_LOCKED)) { + F_SET(bhp, BH_LOCKED); + MUTEX_LOCK(dbenv, &bhp->mutex); + MUTEX_UNLOCK(dbenv, &hp->hash_mutex); } - F_SET(bhp, BH_LOCKED); - R_UNLOCK(dbenv, dbmp->reginfo); - - if (restartp != NULL) - *restartp = 1; - /* * It's possible that the underlying file doesn't exist, either * because of an outright removal or because it was a temporary @@ -347,155 +322,122 @@ __memp_pgwrite(dbmp, dbmfp, bhp, restartp, wrotep) goto file_dead; /* - * Ensure the appropriate log records are on disk. If the page is - * being written as part of a sync operation, the flush has already - * been done, unless it was written by the application *after* the - * sync was scheduled. + * If the page is in a file for which we have LSN information, we have + * to ensure the appropriate log records are on disk. */ - if (LOGGING_ON(dbenv) && - (!F_ISSET(bhp, BH_SYNC) || F_ISSET(bhp, BH_SYNC_LOGFLSH))) { + if (LOGGING_ON(dbenv) && mfp->lsn_off != -1) { memcpy(&lsn, bhp->buf + mfp->lsn_off, sizeof(DB_LSN)); - if ((ret = log_flush(dbenv, &lsn)) != 0) + if ((ret = dbenv->log_flush(dbenv, &lsn)) != 0) goto err; } - DB_ASSERT(!LOGGING_ON(dbenv) || - log_compare(&((LOG *)((DB_LOG *) - dbenv->lg_handle)->reginfo.primary)->s_lsn, &LSN(bhp->buf)) > 0); + +#ifdef DIAGNOSTIC + /* + * Verify write-ahead logging semantics. + * + * !!! + * One special case. There is a single field on the meta-data page, + * the last-page-number-in-the-file field, for which we do not log + * changes. If the page was originally created in a database that + * didn't have logging turned on, we can see a page marked dirty but + * for which no corresponding log record has been written. However, + * the only way that a page can be created for which there isn't a + * previous log record and valid LSN is when the page was created + * without logging turned on, and so we check for that special-case + * LSN value. + */ + if (LOGGING_ON(dbenv) && !IS_NOT_LOGGED_LSN(LSN(bhp->buf))) { + /* + * There is a potential race here. If we are in the midst of + * switching log files, it's possible we could test against the + * old file and the new offset in the log region's LSN. If we + * fail the first test, acquire the log mutex and check again. + */ + DB_LOG *dblp; + LOG *lp; + + dblp = dbenv->lg_handle; + lp = dblp->reginfo.primary; + if (!IS_NOT_LOGGED_LSN(LSN(bhp->buf)) && + log_compare(&lp->s_lsn, &LSN(bhp->buf)) <= 0) { + R_LOCK(dbenv, &dblp->reginfo); + DB_ASSERT(log_compare(&lp->s_lsn, &LSN(bhp->buf)) > 0); + R_UNLOCK(dbenv, &dblp->reginfo); + } + } +#endif /* * Call any pgout function. We set the callpgin flag so that we flag * that the contents of the buffer will need to be passed through pgin * before they are reused. */ - if (mfp->ftype == 0) - ret = 0; - else { + if (mfp->ftype != 0) { callpgin = 1; if ((ret = __memp_pg(dbmfp, bhp, 0)) != 0) goto err; } /* Temporary files may not yet have been created. */ - if (!F_ISSET(&dbmfp->fh, DB_FH_VALID)) { + if (!F_ISSET(dbmfp->fhp, DB_FH_VALID)) { MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp); - if (!F_ISSET(&dbmfp->fh, DB_FH_VALID) && - ((ret = __db_appname(dbenv, DB_APP_TMP, NULL, NULL, - DB_OSO_CREATE | DB_OSO_EXCL | DB_OSO_TEMP, - &dbmfp->fh, NULL)) != 0 || - !F_ISSET(&dbmfp->fh, DB_FH_VALID))) { - MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp); + ret = F_ISSET(dbmfp->fhp, DB_FH_VALID) ? 0 : + __db_appname(dbenv, DB_APP_TMP, NULL, + F_ISSET(dbenv, DB_ENV_DIRECT_DB) ? 
DB_OSO_DIRECT : 0, + dbmfp->fhp, NULL); + MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp); + if (ret != 0) { __db_err(dbenv, "unable to create temporary backing file"); goto err; } - MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp); } /* Write the page. */ - db_io.fhp = &dbmfp->fh; + db_io.fhp = dbmfp->fhp; db_io.mutexp = dbmfp->mutexp; db_io.pagesize = db_io.bytes = mfp->stat.st_pagesize; db_io.pgno = bhp->pgno; db_io.buf = bhp->buf; if ((ret = __os_io(dbenv, &db_io, DB_IO_WRITE, &nw)) != 0) { - ret = __db_panic(dbenv, ret); - fail = "write"; - goto syserr; - } - if (nw != mfp->stat.st_pagesize) { - ret = EIO; - fail = "write"; - goto syserr; + __db_err(dbenv, "%s: write failed for page %lu", + __memp_fn(dbmfp), (u_long)bhp->pgno); + goto err; } + ++mfp->stat.st_page_out; +err: file_dead: /* * !!! * Once we pass this point, dbmfp and mfp may be NULL, we may not have * a valid file reference. * - * Unlock the buffer and reacquire the region lock. + * Unlock the buffer and reacquire the hash lock. */ MUTEX_UNLOCK(dbenv, &bhp->mutex); - R_LOCK(dbenv, dbmp->reginfo); + MUTEX_LOCK(dbenv, &hp->hash_mutex); /* - * Clean up the flags based on a successful write. - * * If we rewrote the page, it will need processing by the pgin * routine before reuse. */ if (callpgin) F_SET(bhp, BH_CALLPGIN); - F_CLR(bhp, BH_DIRTY | BH_LOCKED); /* - * If we write a buffer for which a checkpoint is waiting, update - * the count of pending buffers (both in the mpool as a whole and - * for this file). If the count for this file goes to zero, set a - * flag so we flush the writes. + * Update the hash bucket statistics, reset the flags. + * If we were successful, the page is no longer dirty. */ - dosync = 0; - if (F_ISSET(bhp, BH_SYNC)) { - F_CLR(bhp, BH_SYNC | BH_SYNC_LOGFLSH); - - --mp->lsn_cnt; - if (mfp != NULL) - dosync = --mfp->lsn_cnt == 0 ? 1 : 0; - } - - /* Update the page clean/dirty statistics. */ - c_mp = BH_TO_CACHE(dbmp, bhp); - ++c_mp->stat.st_page_clean; - --c_mp->stat.st_page_dirty; - - /* Update I/O statistics. */ - if (mfp != NULL) - ++mfp->stat.st_page_out; + if (ret == 0) { + DB_ASSERT(hp->hash_page_dirty != 0); + --hp->hash_page_dirty; - /* - * Do the sync after everything else has been updated, so any incoming - * checkpoint doesn't see inconsistent information. - * - * XXX: - * Don't lock the region around the sync, fsync(2) has no atomicity - * issues. - * - * XXX: - * We ignore errors from the sync -- it makes no sense to return an - * error to the calling process, so set a flag causing the checkpoint - * to be retried later. There is a possibility, of course, that a - * subsequent checkpoint was started and that we're going to force it - * to fail. That should be unlikely, and fixing it would be difficult. - */ - if (dosync) { - R_UNLOCK(dbenv, dbmp->reginfo); - syncfail = __os_fsync(dbenv, &dbmfp->fh) != 0; - R_LOCK(dbenv, dbmp->reginfo); - if (syncfail) - F_SET(mp, MP_LSN_RETRY); + F_CLR(bhp, BH_DIRTY | BH_DIRTY_CREATE); } - if (wrotep != NULL) - *wrotep = 1; - - return (0); - -syserr: __db_err(dbenv, "%s: %s failed for page %lu", - __memp_fn(dbmfp), fail, (u_long)bhp->pgno); - -err: /* Unlock the buffer and reacquire the region lock. */ - MUTEX_UNLOCK(dbenv, &bhp->mutex); - R_LOCK(dbenv, dbmp->reginfo); - - /* - * Clean up the flags based on a failure. - * - * The page remains dirty but we remove our lock. If we rewrote the - * page, it will need processing by the pgin routine before reuse. - */ - if (callpgin) - F_SET(bhp, BH_CALLPGIN); + /* Regardless, clear any sync wait-for count and remove our lock. 
*/ + bhp->ref_sync = 0; F_CLR(bhp, BH_LOCKED); return (ret); @@ -514,15 +456,17 @@ __memp_pg(dbmfp, bhp, is_pgin) int is_pgin; { DBT dbt, *dbtp; + DB_ENV *dbenv; DB_MPOOL *dbmp; DB_MPREG *mpreg; MPOOLFILE *mfp; int ftype, ret; dbmp = dbmfp->dbmp; + dbenv = dbmp->dbenv; mfp = dbmfp->mfp; - MUTEX_THREAD_LOCK(dbmp->dbenv, dbmp->mutexp); + MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp); ftype = mfp->ftype; for (mpreg = LIST_FIRST(&dbmp->dbregq); @@ -536,28 +480,28 @@ __memp_pg(dbmfp, bhp, is_pgin) dbt.data = R_ADDR(dbmp->reginfo, mfp->pgcookie_off); dbtp = &dbt; } - MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp); + MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp); if (is_pgin) { if (mpreg->pgin != NULL && - (ret = mpreg->pgin(dbmp->dbenv, + (ret = mpreg->pgin(dbenv, bhp->pgno, bhp->buf, dbtp)) != 0) goto err; } else if (mpreg->pgout != NULL && - (ret = mpreg->pgout(dbmp->dbenv, + (ret = mpreg->pgout(dbenv, bhp->pgno, bhp->buf, dbtp)) != 0) goto err; break; } if (mpreg == NULL) - MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp); + MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp); return (0); -err: MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp); - __db_err(dbmp->dbenv, "%s: %s failed for page %lu", +err: MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp); + __db_err(dbenv, "%s: %s failed for page %lu", __memp_fn(dbmfp), is_pgin ? "pgin" : "pgout", (u_long)bhp->pgno); return (ret); } @@ -566,55 +510,78 @@ err: MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp); * __memp_bhfree -- * Free a bucket header and its referenced data. * - * PUBLIC: void __memp_bhfree __P((DB_MPOOL *, BH *, int)); + * PUBLIC: void __memp_bhfree __P((DB_MPOOL *, DB_MPOOL_HASH *, BH *, int)); */ void -__memp_bhfree(dbmp, bhp, free_mem) +__memp_bhfree(dbmp, hp, bhp, free_mem) DB_MPOOL *dbmp; + DB_MPOOL_HASH *hp; BH *bhp; int free_mem; { - DB_HASHTAB *dbht; + DB_ENV *dbenv; MPOOL *c_mp, *mp; MPOOLFILE *mfp; - int n_bucket, n_cache; + u_int32_t n_cache; + /* + * Assumes the hash bucket is locked and the MPOOL is not. + */ + dbenv = dbmp->dbenv; mp = dbmp->reginfo[0].primary; - c_mp = BH_TO_CACHE(dbmp, bhp); - n_cache = NCACHE(mp, bhp->pgno); - n_bucket = NBUCKET(c_mp, bhp->mf_offset, bhp->pgno); - dbht = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab); + n_cache = NCACHE(mp, bhp->mf_offset, bhp->pgno); - /* Delete the buffer header from the hash bucket queue. */ - SH_TAILQ_REMOVE(&dbht[n_bucket], bhp, hq, __bh); + /* + * Delete the buffer header from the hash bucket queue and reset + * the hash bucket's priority, if necessary. + */ + SH_TAILQ_REMOVE(&hp->hash_bucket, bhp, hq, __bh); + if (bhp->priority == hp->hash_priority) + hp->hash_priority = + SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL ? + 0 : SH_TAILQ_FIRST(&hp->hash_bucket, __bh)->priority; - /* Delete the buffer header from the LRU queue. */ - SH_TAILQ_REMOVE(&c_mp->bhq, bhp, q, __bh); + /* + * Discard the hash bucket's mutex, it's no longer needed, and + * we don't want to be holding it when acquiring other locks. + */ + MUTEX_UNLOCK(dbenv, &hp->hash_mutex); - /* Clear the mutex this buffer recorded */ - __db_shlocks_clear(&bhp->mutex, &dbmp->reginfo[n_cache], - (REGMAINT *)R_ADDR(&dbmp->reginfo[n_cache], mp->maint_off)); /* * Find the underlying MPOOLFILE and decrement its reference count. * If this is its last reference, remove it. 
*/ mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset); + MUTEX_LOCK(dbenv, &mfp->mutex); if (--mfp->block_cnt == 0 && mfp->mpf_cnt == 0) __memp_mf_discard(dbmp, mfp); + else + MUTEX_UNLOCK(dbenv, &mfp->mutex); + + R_LOCK(dbenv, &dbmp->reginfo[n_cache]); + + /* + * Clear the mutex this buffer recorded; requires the region lock + * be held. + */ + __db_shlocks_clear(&bhp->mutex, &dbmp->reginfo[n_cache], + (REGMAINT *)R_ADDR(&dbmp->reginfo[n_cache], mp->maint_off)); /* - * If we're not reusing it immediately, free the buffer header + * If we're not reusing the buffer immediately, free the buffer header * and data for real. */ if (free_mem) { - --c_mp->stat.st_page_clean; __db_shalloc_free(dbmp->reginfo[n_cache].addr, bhp); + c_mp = dbmp->reginfo[n_cache].primary; + c_mp->stat.st_pages--; } + R_UNLOCK(dbenv, &dbmp->reginfo[n_cache]); } /* * __memp_upgrade -- - * Upgrade a file descriptor from readonly to readwrite. + * Upgrade a file descriptor from read-only to read-write. */ static int __memp_upgrade(dbmp, dbmfp, mfp) @@ -622,41 +589,58 @@ __memp_upgrade(dbmp, dbmfp, mfp) DB_MPOOLFILE *dbmfp; MPOOLFILE *mfp; { - DB_FH fh; + DB_ENV *dbenv; + DB_FH *fhp, *tfhp; int ret; char *rpath; - /* - * !!! - * We expect the handle to already be locked. - */ - - /* Check to see if we've already upgraded. */ - if (F_ISSET(dbmfp, MP_UPGRADE)) - return (0); - - /* Check to see if we've already failed. */ - if (F_ISSET(dbmfp, MP_UPGRADE_FAIL)) - return (1); + dbenv = dbmp->dbenv; + fhp = NULL; + rpath = NULL; /* * Calculate the real name for this file and try to open it read/write. * We know we have a valid pathname for the file because it's the only * way we could have gotten a file descriptor of any kind. */ - if ((ret = __db_appname(dbmp->dbenv, DB_APP_DATA, - NULL, R_ADDR(dbmp->reginfo, mfp->path_off), 0, NULL, &rpath)) != 0) - return (ret); - if (__os_open(dbmp->dbenv, rpath, 0, 0, &fh) != 0) { + if ((ret = __os_calloc(dbenv, 1, sizeof(DB_FH), &fhp)) != 0) + goto err; + + if ((ret = __db_appname(dbenv, DB_APP_DATA, + R_ADDR(dbmp->reginfo, mfp->path_off), 0, NULL, &rpath)) != 0) + goto err; + + if (__os_open(dbenv, rpath, + F_ISSET(mfp, MP_DIRECT) ? DB_OSO_DIRECT : 0, 0, fhp) != 0) { F_SET(dbmfp, MP_UPGRADE_FAIL); - ret = 1; - } else { - /* Swap the descriptors and set the upgrade flag. */ - (void)__os_closehandle(&dbmfp->fh); - dbmfp->fh = fh; - F_SET(dbmfp, MP_UPGRADE); - ret = 0; + goto err; } - __os_freestr(rpath); + + /* + * Swap the descriptors and set the upgrade flag. + * + * XXX + * There is a race here. If another process schedules a read using the + * existing file descriptor and is swapped out before making the system + * call, this code could theoretically close the file descriptor out + * from under it. While it's very unlikely, this code should still be + * rewritten. + */ + tfhp = dbmfp->fhp; + dbmfp->fhp = fhp; + fhp = tfhp; + + (void)__os_closehandle(dbenv, fhp); + F_SET(dbmfp, MP_UPGRADE); + + ret = 0; + if (0) { +err: ret = 1; + } + if (fhp != NULL) + __os_free(dbenv, fhp); + if (rpath != NULL) + __os_free(dbenv, rpath); + return (ret); } diff --git a/bdb/mp/mp_fget.c b/bdb/mp/mp_fget.c index 1bff5e136ab..be0785a2184 100644 --- a/bdb/mp/mp_fget.c +++ b/bdb/mp/mp_fget.c @@ -1,13 +1,13 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Copyright (c) 1996-2002 * Sleepycat Software. All rights reserved. 
*/ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: mp_fget.c,v 11.28 2001/01/10 04:50:53 ubell Exp $"; +static const char revid[] = "$Id: mp_fget.c,v 11.68 2002/08/06 04:58:09 bostic Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -16,51 +16,54 @@ static const char revid[] = "$Id: mp_fget.c,v 11.28 2001/01/10 04:50:53 ubell Ex #include <string.h> #endif -#ifdef HAVE_RPC -#include "db_server.h" -#endif - #include "db_int.h" -#include "db_shash.h" -#include "mp.h" +#include "dbinc/db_shash.h" +#include "dbinc/mp.h" -#ifdef HAVE_RPC -#include "gen_client_ext.h" -#include "rpc_client_ext.h" +#ifdef HAVE_FILESYSTEM_NOTZERO +static int __memp_fs_notzero + __P((DB_ENV *, DB_MPOOLFILE *, MPOOLFILE *, db_pgno_t *)); #endif /* - * memp_fget -- + * __memp_fget -- * Get a page from the file. + * + * PUBLIC: int __memp_fget + * PUBLIC: __P((DB_MPOOLFILE *, db_pgno_t *, u_int32_t, void *)); */ int -memp_fget(dbmfp, pgnoaddr, flags, addrp) +__memp_fget(dbmfp, pgnoaddr, flags, addrp) DB_MPOOLFILE *dbmfp; db_pgno_t *pgnoaddr; u_int32_t flags; void *addrp; { - BH *bhp; + enum { FIRST_FOUND, FIRST_MISS, SECOND_FOUND, SECOND_MISS } state; + BH *alloc_bhp, *bhp; DB_ENV *dbenv; DB_MPOOL *dbmp; - DB_HASHTAB *dbht; + DB_MPOOL_HASH *hp; MPOOL *c_mp, *mp; MPOOLFILE *mfp; - size_t n_bucket, n_cache, mf_offset; - u_int32_t st_hsearch; - int b_incr, first, ret; + roff_t mf_offset; + u_int32_t n_cache, st_hsearch; + int b_incr, extending, first, ret; + + *(void **)addrp = NULL; dbmp = dbmfp->dbmp; dbenv = dbmp->dbenv; - mp = dbmp->reginfo[0].primary; - mfp = dbmfp->mfp; -#ifdef HAVE_RPC - if (F_ISSET(dbenv, DB_ENV_RPCCLIENT)) - return (__dbcl_memp_fget(dbmfp, pgnoaddr, flags, addrp)); -#endif PANIC_CHECK(dbenv); + mp = dbmp->reginfo[0].primary; + mfp = dbmfp->mfp; + mf_offset = R_OFFSET(dbmp->reginfo, mfp); + alloc_bhp = bhp = NULL; + hp = NULL; + b_incr = extending = ret = 0; + /* * Validate arguments. * @@ -74,100 +77,35 @@ memp_fget(dbmfp, pgnoaddr, flags, addrp) * is to keep database files small. It's sleazy as hell, but we catch * any attempt to actually write the file in memp_fput(). */ -#define OKFLAGS \ - (DB_MPOOL_CREATE | DB_MPOOL_LAST | \ - DB_MPOOL_NEW | DB_MPOOL_NEW_GROUP | DB_MPOOL_EXTENT) +#define OKFLAGS (DB_MPOOL_CREATE | DB_MPOOL_LAST | DB_MPOOL_NEW) if (flags != 0) { if ((ret = __db_fchk(dbenv, "memp_fget", flags, OKFLAGS)) != 0) return (ret); - switch (flags & ~DB_MPOOL_EXTENT) { + switch (flags) { case DB_MPOOL_CREATE: + break; case DB_MPOOL_LAST: + /* Get the last page number in the file. */ + if (flags == DB_MPOOL_LAST) { + R_LOCK(dbenv, dbmp->reginfo); + *pgnoaddr = mfp->last_pgno; + R_UNLOCK(dbenv, dbmp->reginfo); + } + break; case DB_MPOOL_NEW: - case DB_MPOOL_NEW_GROUP: - case 0: + /* + * If always creating a page, skip the first search + * of the hash bucket. + */ + if (flags == DB_MPOOL_NEW) + goto alloc; break; default: return (__db_ferr(dbenv, "memp_fget", 1)); } } -#ifdef DIAGNOSTIC - /* - * XXX - * We want to switch threads as often as possible. Yield every time - * we get a new page to ensure contention. - */ - if (DB_GLOBAL(db_pageyield)) - __os_yield(dbenv, 1); -#endif - - /* Initialize remaining local variables. */ - mf_offset = R_OFFSET(dbmp->reginfo, mfp); - bhp = NULL; - st_hsearch = 0; - b_incr = ret = 0; - - R_LOCK(dbenv, dbmp->reginfo); - - /* - * Check for the new, last or last + 1 page requests. - * - * Examine and update the file's last_pgno value. 
We don't care if - * the last_pgno value immediately changes due to another thread -- - * at this instant in time, the value is correct. We do increment the - * current last_pgno value if the thread is asking for a new page, - * however, to ensure that two threads creating pages don't get the - * same one. - * - * If we create a page, there is the potential that a page after it - * in the file will be written before it will be written. Recovery - * depends on pages that are "created" in the file by subsequent pages - * being written be zeroed out, not have random garbage. Ensure that - * the OS agrees. - * - * !!! - * DB_MPOOL_NEW_GROUP is undocumented -- the hash access method needs - * to allocate contiguous groups of pages in order to do subdatabases. - * We return the first page in the group, but the caller must put an - * LSN on the *last* page and write it, otherwise after a crash we may - * not create all of the pages we need to create. - */ - if (LF_ISSET(DB_MPOOL_LAST | DB_MPOOL_NEW | DB_MPOOL_NEW_GROUP)) { - if (LF_ISSET(DB_MPOOL_NEW)) { - if (F_ISSET(&dbmfp->fh, DB_FH_VALID) && (ret = - __os_fpinit(dbenv, &dbmfp->fh, mfp->last_pgno + 1, - 1, mfp->stat.st_pagesize)) != 0) { - R_UNLOCK(dbenv, dbmp->reginfo); - return (ret); - } - ++mfp->last_pgno; - } - if (LF_ISSET(DB_MPOOL_NEW_GROUP)) { - if (F_ISSET(&dbmfp->fh, DB_FH_VALID) && (ret = - __os_fpinit(dbenv, &dbmfp->fh, mfp->last_pgno + 1, - (int)*pgnoaddr, mfp->stat.st_pagesize)) != 0) { - R_UNLOCK(dbenv, dbmp->reginfo); - return (ret); - } - mfp->last_pgno += *pgnoaddr; - } - *pgnoaddr = mfp->last_pgno; - } - - /* - * Determine the hash bucket where this page will live, and get local - * pointers to the cache and its hash table. - */ - n_cache = NCACHE(mp, *pgnoaddr); - c_mp = dbmp->reginfo[n_cache].primary; - n_bucket = NBUCKET(c_mp, mf_offset, *pgnoaddr); - dbht = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab); - - if (LF_ISSET(DB_MPOOL_NEW | DB_MPOOL_NEW_GROUP)) - goto alloc; - /* * If mmap'ing the file and the page is not past the end of the file, * just return a pointer. @@ -183,235 +121,534 @@ memp_fget(dbmfp, pgnoaddr, flags, addrp) * goes through the cache. All pages previously returned will be safe, * as long as the correct locking protocol was observed. * - * XXX * We don't discard the map because we don't know when all of the * pages will have been discarded from the process' address space. * It would be possible to do so by reference counting the open * pages from the mmap, but it's unclear to me that it's worth it. */ - if (dbmfp->addr != NULL && F_ISSET(mfp, MP_CAN_MMAP)) { - if (*pgnoaddr > mfp->orig_last_pgno) { - /* - * !!! - * See the comment above about non-existent pages and - * the hash access method. - */ - if (!LF_ISSET(DB_MPOOL_CREATE)) { - if (!LF_ISSET(DB_MPOOL_EXTENT)) - __db_err(dbenv, - "%s: page %lu doesn't exist", - __memp_fn(dbmfp), (u_long)*pgnoaddr); - ret = EINVAL; - goto err; - } - } else { - *(void **)addrp = - R_ADDR(dbmfp, *pgnoaddr * mfp->stat.st_pagesize); - ++mfp->stat.st_map; - goto done; - } + if (dbmfp->addr != NULL && + F_ISSET(mfp, MP_CAN_MMAP) && *pgnoaddr <= mfp->orig_last_pgno) { + *(void **)addrp = + R_ADDR(dbmfp, *pgnoaddr * mfp->stat.st_pagesize); + ++mfp->stat.st_map; + return (0); } +hb_search: + /* + * Determine the cache and hash bucket where this page lives and get + * local pointers to them. Reset on each pass through this code, the + * page number can change. 
+ */ + n_cache = NCACHE(mp, mf_offset, *pgnoaddr); + c_mp = dbmp->reginfo[n_cache].primary; + hp = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab); + hp = &hp[NBUCKET(c_mp, mf_offset, *pgnoaddr)]; + /* Search the hash chain for the page. */ - for (bhp = SH_TAILQ_FIRST(&dbht[n_bucket], __bh); +retry: st_hsearch = 0; + MUTEX_LOCK(dbenv, &hp->hash_mutex); + for (bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh); bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) { ++st_hsearch; if (bhp->pgno != *pgnoaddr || bhp->mf_offset != mf_offset) continue; - /* Increment the reference count. */ + /* + * Increment the reference count. We may discard the hash + * bucket lock as we evaluate and/or read the buffer, so we + * need to ensure it doesn't move and its contents remain + * unchanged. + */ if (bhp->ref == UINT16_T_MAX) { __db_err(dbenv, "%s: page %lu: reference count overflow", __memp_fn(dbmfp), (u_long)bhp->pgno); ret = EINVAL; + MUTEX_UNLOCK(dbenv, &hp->hash_mutex); goto err; } - - /* - * Increment the reference count. We may discard the region - * lock as we evaluate and/or read the buffer, so we need to - * ensure that it doesn't move and that its contents remain - * unchanged. - */ ++bhp->ref; b_incr = 1; /* - * Any buffer we find might be trouble. - * * BH_LOCKED -- - * I/O is in progress. Because we've incremented the buffer - * reference count, we know the buffer can't move. Unlock - * the region lock, wait for the I/O to complete, and reacquire - * the region. + * I/O is in progress or sync is waiting on the buffer to write + * it. Because we've incremented the buffer reference count, + * we know the buffer can't move. Unlock the bucket lock, wait + * for the buffer to become available, reacquire the bucket. */ - for (first = 1; F_ISSET(bhp, BH_LOCKED); first = 0) { - R_UNLOCK(dbenv, dbmp->reginfo); + for (first = 1; F_ISSET(bhp, BH_LOCKED) && + !F_ISSET(dbenv, DB_ENV_NOLOCKING); first = 0) { + /* + * If someone is trying to sync this buffer and the + * buffer is hot, they may never get in. Give up + * and try again. + */ + if (!first && bhp->ref_sync != 0) { + --bhp->ref; + b_incr = 0; + MUTEX_UNLOCK(dbenv, &hp->hash_mutex); + __os_yield(dbenv, 1); + goto retry; + } + MUTEX_UNLOCK(dbenv, &hp->hash_mutex); /* - * Explicitly yield the processor if it's not the first - * pass through this loop -- if we don't, we might end - * up running to the end of our CPU quantum as we will - * simply be swapping between the two locks. + * Explicitly yield the processor if not the first pass + * through this loop -- if we don't, we might run to the + * end of our CPU quantum as we will simply be swapping + * between the two locks. */ if (!first) __os_yield(dbenv, 1); - MUTEX_LOCK(dbenv, &bhp->mutex, dbenv->lockfhp); + MUTEX_LOCK(dbenv, &bhp->mutex); /* Wait for I/O to finish... */ MUTEX_UNLOCK(dbenv, &bhp->mutex); - R_LOCK(dbenv, dbmp->reginfo); + MUTEX_LOCK(dbenv, &hp->hash_mutex); + } + + ++mfp->stat.st_cache_hit; + break; + } + + /* + * Update the hash bucket search statistics -- do now because our next + * search may be for a different bucket. 
+ */ + ++c_mp->stat.st_hash_searches; + if (st_hsearch > c_mp->stat.st_hash_longest) + c_mp->stat.st_hash_longest = st_hsearch; + c_mp->stat.st_hash_examined += st_hsearch; + + /* + * There are 4 possible paths to this location: + * + * FIRST_MISS: + * Didn't find the page in the hash bucket on our first pass: + * bhp == NULL, alloc_bhp == NULL + * + * FIRST_FOUND: + * Found the page in the hash bucket on our first pass: + * bhp != NULL, alloc_bhp == NULL + * + * SECOND_FOUND: + * Didn't find the page in the hash bucket on the first pass, + * allocated space, and found the page in the hash bucket on + * our second pass: + * bhp != NULL, alloc_bhp != NULL + * + * SECOND_MISS: + * Didn't find the page in the hash bucket on the first pass, + * allocated space, and didn't find the page in the hash bucket + * on our second pass: + * bhp == NULL, alloc_bhp != NULL + */ + state = bhp == NULL ? + (alloc_bhp == NULL ? FIRST_MISS : SECOND_MISS) : + (alloc_bhp == NULL ? FIRST_FOUND : SECOND_FOUND); + switch (state) { + case FIRST_FOUND: + /* We found the buffer in our first check -- we're done. */ + break; + case FIRST_MISS: + /* + * We didn't find the buffer in our first check. Figure out + * if the page exists, and allocate structures so we can add + * the page to the buffer pool. + */ + MUTEX_UNLOCK(dbenv, &hp->hash_mutex); + +alloc: /* + * If DB_MPOOL_NEW is set, we have to allocate a page number. + * If neither DB_MPOOL_CREATE or DB_MPOOL_CREATE is set, then + * it's an error to try and get a page past the end of file. + */ + COMPQUIET(n_cache, 0); + + extending = ret = 0; + R_LOCK(dbenv, dbmp->reginfo); + switch (flags) { + case DB_MPOOL_NEW: + extending = 1; + *pgnoaddr = mfp->last_pgno + 1; + break; + case DB_MPOOL_CREATE: + extending = *pgnoaddr > mfp->last_pgno; + break; + default: + ret = *pgnoaddr > mfp->last_pgno ? DB_PAGE_NOTFOUND : 0; + break; } + R_UNLOCK(dbenv, dbmp->reginfo); + if (ret != 0) + goto err; /* - * BH_TRASH -- - * The contents of the buffer are garbage. Shouldn't happen, - * and this read is likely to fail, but might as well try. + * !!! + * In the DB_MPOOL_NEW code path, mf_offset and n_cache have + * not yet been initialized. */ - if (F_ISSET(bhp, BH_TRASH)) - goto reread; + mf_offset = R_OFFSET(dbmp->reginfo, mfp); + n_cache = NCACHE(mp, mf_offset, *pgnoaddr); + /* Allocate a new buffer header and data space. */ + if ((ret = __memp_alloc(dbmp, + &dbmp->reginfo[n_cache], mfp, 0, NULL, &alloc_bhp)) != 0) + goto err; +#ifdef DIAGNOSTIC + if ((db_alignp_t)alloc_bhp->buf & (sizeof(size_t) - 1)) { + __db_err(dbenv, + "Error: buffer data is NOT size_t aligned"); + ret = EINVAL; + goto err; + } +#endif /* - * BH_CALLPGIN -- - * The buffer was converted so it could be written, and the - * contents need to be converted again. + * If we are extending the file, we'll need the region lock + * again. */ - if (F_ISSET(bhp, BH_CALLPGIN)) { - if ((ret = __memp_pg(dbmfp, bhp, 1)) != 0) + if (extending) + R_LOCK(dbenv, dbmp->reginfo); + + /* + * DB_MPOOL_NEW does not guarantee you a page unreferenced by + * any other thread of control. (That guarantee is interesting + * for DB_MPOOL_NEW, unlike DB_MPOOL_CREATE, because the caller + * did not specify the page number, and so, may reasonably not + * have any way to lock the page outside of mpool.) Regardless, + * if we allocate the page, and some other thread of control + * requests the page by number, we will not detect that and the + * thread of control that allocated using DB_MPOOL_NEW may not + * have a chance to initialize the page. 
(Note: we *could* + * detect this case if we set a flag in the buffer header which + * guaranteed that no gets of the page would succeed until the + * reference count went to 0, that is, until the creating page + * put the page.) What we do guarantee is that if two threads + * of control are both doing DB_MPOOL_NEW calls, they won't + * collide, that is, they won't both get the same page. + * + * There's a possibility that another thread allocated the page + * we were planning to allocate while we were off doing buffer + * allocation. We can do that by making sure the page number + * we were going to use is still available. If it's not, then + * we check to see if the next available page number hashes to + * the same mpool region as the old one -- if it does, we can + * continue, otherwise, we have to start over. + */ + if (flags == DB_MPOOL_NEW && *pgnoaddr != mfp->last_pgno + 1) { + *pgnoaddr = mfp->last_pgno + 1; + if (n_cache != NCACHE(mp, mf_offset, *pgnoaddr)) { + __db_shalloc_free( + dbmp->reginfo[n_cache].addr, alloc_bhp); + /* + * flags == DB_MPOOL_NEW, so extending is set + * and we're holding the region locked. + */ + R_UNLOCK(dbenv, dbmp->reginfo); + + alloc_bhp = NULL; + goto alloc; + } + } + + /* + * We released the region lock, so another thread might have + * extended the file. Update the last_pgno and initialize + * the file, as necessary, if we extended the file. + */ + if (extending) { +#ifdef HAVE_FILESYSTEM_NOTZERO + if (*pgnoaddr > mfp->last_pgno && + __os_fs_notzero() && + F_ISSET(dbmfp->fhp, DB_FH_VALID)) + ret = __memp_fs_notzero( + dbenv, dbmfp, mfp, pgnoaddr); + else + ret = 0; +#endif + if (ret == 0 && *pgnoaddr > mfp->last_pgno) + mfp->last_pgno = *pgnoaddr; + + R_UNLOCK(dbenv, dbmp->reginfo); + if (ret != 0) goto err; - F_CLR(bhp, BH_CALLPGIN); } + goto hb_search; + case SECOND_FOUND: + /* + * We allocated buffer space for the requested page, but then + * found the page in the buffer cache on our second check. + * That's OK -- we can use the page we found in the pool, + * unless DB_MPOOL_NEW is set. + * + * Free the allocated memory, we no longer need it. Since we + * can't acquire the region lock while holding the hash bucket + * lock, we have to release the hash bucket and re-acquire it. + * That's OK, because we have the buffer pinned down. + */ + MUTEX_UNLOCK(dbenv, &hp->hash_mutex); + R_LOCK(dbenv, &dbmp->reginfo[n_cache]); + __db_shalloc_free(dbmp->reginfo[n_cache].addr, alloc_bhp); + alloc_bhp = NULL; + R_UNLOCK(dbenv, &dbmp->reginfo[n_cache]); + MUTEX_LOCK(dbenv, &hp->hash_mutex); - ++mfp->stat.st_cache_hit; - *(void **)addrp = bhp->buf; - goto done; - } + /* + * We can't use the page we found in the pool if DB_MPOOL_NEW + * was set. (For details, see the above comment beginning + * "DB_MPOOL_NEW does not guarantee you a page unreferenced by + * any other thread of control".) If DB_MPOOL_NEW is set, we + * release our pin on this particular buffer, and try to get + * another one. + */ + if (flags == DB_MPOOL_NEW) { + --bhp->ref; + b_incr = 0; + goto alloc; + } + break; + case SECOND_MISS: + /* + * We allocated buffer space for the requested page, and found + * the page still missing on our second pass through the buffer + * cache. Instantiate the page. + */ + bhp = alloc_bhp; + alloc_bhp = NULL; -alloc: /* Allocate new buffer header and data space. */ - if ((ret = __memp_alloc(dbmp, - &dbmp->reginfo[n_cache], mfp, 0, NULL, &bhp)) != 0) - goto err; + /* + * Initialize all the BH and hash bucket fields so we can call + * __memp_bhfree if an error occurs. 
+ * + * Append the buffer to the tail of the bucket list and update + * the hash bucket's priority. + */ + b_incr = 1; + + memset(bhp, 0, sizeof(BH)); + bhp->ref = 1; + bhp->priority = UINT32_T_MAX; + bhp->pgno = *pgnoaddr; + bhp->mf_offset = mf_offset; + SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, bhp, hq); + hp->hash_priority = + SH_TAILQ_FIRST(&hp->hash_bucket, __bh)->priority; + + /* If we extended the file, make sure the page is never lost. */ + if (extending) { + ++hp->hash_page_dirty; + F_SET(bhp, BH_DIRTY | BH_DIRTY_CREATE); + } - ++c_mp->stat.st_page_clean; + /* + * If we created the page, zero it out. If we didn't create + * the page, read from the backing file. + * + * !!! + * DB_MPOOL_NEW doesn't call the pgin function. + * + * If DB_MPOOL_CREATE is used, then the application's pgin + * function has to be able to handle pages of 0's -- if it + * uses DB_MPOOL_NEW, it can detect all of its page creates, + * and not bother. + * + * If we're running in diagnostic mode, smash any bytes on the + * page that are unknown quantities for the caller. + * + * Otherwise, read the page into memory, optionally creating it + * if DB_MPOOL_CREATE is set. + */ + if (extending) { + if (mfp->clear_len == 0) + memset(bhp->buf, 0, mfp->stat.st_pagesize); + else { + memset(bhp->buf, 0, mfp->clear_len); +#if defined(DIAGNOSTIC) || defined(UMRW) + memset(bhp->buf + mfp->clear_len, CLEAR_BYTE, + mfp->stat.st_pagesize - mfp->clear_len); +#endif + } - /* - * Initialize the BH fields so that we can call the __memp_bhfree - * routine if an error occurs. - */ - memset(bhp, 0, sizeof(BH)); - bhp->ref = 1; - bhp->pgno = *pgnoaddr; - bhp->mf_offset = mf_offset; + if (flags == DB_MPOOL_CREATE && mfp->ftype != 0) + F_SET(bhp, BH_CALLPGIN); - /* Increment the count of buffers referenced by this MPOOLFILE. */ - ++mfp->block_cnt; + ++mfp->stat.st_page_create; + } else { + F_SET(bhp, BH_TRASH); + ++mfp->stat.st_cache_miss; + } - /* - * Prepend the bucket header to the head of the appropriate MPOOL - * bucket hash list. Append the bucket header to the tail of the - * MPOOL LRU chain. - */ - SH_TAILQ_INSERT_HEAD(&dbht[n_bucket], bhp, hq, __bh); - SH_TAILQ_INSERT_TAIL(&c_mp->bhq, bhp, q); + /* Increment buffer count referenced by MPOOLFILE. */ + MUTEX_LOCK(dbenv, &mfp->mutex); + ++mfp->block_cnt; + MUTEX_UNLOCK(dbenv, &mfp->mutex); -#ifdef DIAGNOSTIC - if ((db_alignp_t)bhp->buf & (sizeof(size_t) - 1)) { - __db_err(dbenv, "Internal error: BH data NOT size_t aligned."); - ret = EINVAL; - __memp_bhfree(dbmp, bhp, 1); - goto err; + /* + * Initialize the mutex. This is the last initialization step, + * because it's the only one that can fail, and everything else + * must be set up or we can't jump to the err label because it + * will call __memp_bhfree. + */ + if ((ret = __db_mutex_setup(dbenv, + &dbmp->reginfo[n_cache], &bhp->mutex, 0)) != 0) + goto err; } -#endif - if ((ret = __db_shmutex_init(dbenv, &bhp->mutex, - R_OFFSET(dbmp->reginfo, &bhp->mutex) + DB_FCNTL_OFF_MPOOL, - 0, &dbmp->reginfo[n_cache], - (REGMAINT *)R_ADDR(&dbmp->reginfo[n_cache], c_mp->maint_off))) - != 0) { - __memp_bhfree(dbmp, bhp, 1); - goto err; + DB_ASSERT(bhp->ref != 0); + + /* + * If we're the only reference, update buffer and bucket priorities. + * We may be about to release the hash bucket lock, and everything + * should be correct, first. (We've already done this if we created + * the buffer, so there is no need to do it again.) 
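The invariant the allocator maintains here is worth stating once in isolation: a bucket's priority always mirrors its lowest-priority (head) buffer, and a buffer with a single reference is re-pinned at the tail with the maximum priority. The sketch below is illustrative only -- an ordinary linked list stands in for the shared-memory SH_TAILQ macros, UINT_MAX for UINT32_T_MAX, and no locking is shown.

#include <limits.h>
#include <stdio.h>

struct buf {
	unsigned int priority;		/* LRU counter value at release */
	struct buf *next;
};

struct bucket {
	struct buf *head;		/* sorted, lowest priority first */
	unsigned int priority;		/* always mirrors head->priority */
};

/* Insert in ascending priority order, then refresh the bucket priority. */
static void
bucket_insert(struct bucket *hp, struct buf *bhp)
{
	struct buf **bpp;

	for (bpp = &hp->head; *bpp != NULL; bpp = &(*bpp)->next)
		if ((*bpp)->priority > bhp->priority)
			break;
	bhp->next = *bpp;
	*bpp = bhp;
	hp->priority = hp->head->priority;
}

/*
 * Pinning a buffer makes it ineligible for eviction: give it the maximum
 * priority and move it to the tail of the bucket, as the code above does
 * for a buffer with a single reference.
 */
static void
bucket_pin(struct bucket *hp, struct buf *bhp)
{
	struct buf **bpp;

	for (bpp = &hp->head; *bpp != bhp; bpp = &(*bpp)->next)
		;
	*bpp = bhp->next;
	bhp->priority = UINT_MAX;
	bucket_insert(hp, bhp);
}

int
main(void)
{
	struct buf a = { 10, NULL }, b = { 30, NULL }, c = { 20, NULL };
	struct bucket hp = { NULL, 0 };

	bucket_insert(&hp, &a);
	bucket_insert(&hp, &b);
	bucket_insert(&hp, &c);
	bucket_pin(&hp, &a);			/* a moves to the tail */
	printf("bucket priority: %u\n", hp.priority);	/* prints 20 */
	return (0);
}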
+ */ + if (state != SECOND_MISS && bhp->ref == 1) { + bhp->priority = UINT32_T_MAX; + SH_TAILQ_REMOVE(&hp->hash_bucket, bhp, hq, __bh); + SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, bhp, hq); + hp->hash_priority = + SH_TAILQ_FIRST(&hp->hash_bucket, __bh)->priority; } /* - * If we created the page, zero it out and continue. - * - * !!! - * Note: DB_MPOOL_NEW specifically doesn't call the pgin function. - * If DB_MPOOL_CREATE is used, then the application's pgin function - * has to be able to handle pages of 0's -- if it uses DB_MPOOL_NEW, - * it can detect all of its page creates, and not bother. + * BH_TRASH -- + * The buffer we found may need to be filled from the disk. * - * If we're running in diagnostic mode, smash any bytes on the - * page that are unknown quantities for the caller. - * - * Otherwise, read the page into memory, optionally creating it if - * DB_MPOOL_CREATE is set. + * It's possible for the read function to fail, which means we fail as + * well. Note, the __memp_pgread() function discards and reacquires + * the hash lock, so the buffer must be pinned down so that it cannot + * move and its contents are unchanged. Discard the buffer on failure + * unless another thread is waiting on our I/O to complete. It's OK to + * leave the buffer around, as the waiting thread will see the BH_TRASH + * flag set, and will also attempt to discard it. If there's a waiter, + * we need to decrement our reference count. */ - if (LF_ISSET(DB_MPOOL_NEW | DB_MPOOL_NEW_GROUP)) { - if (mfp->clear_len == 0) - memset(bhp->buf, 0, mfp->stat.st_pagesize); - else { - memset(bhp->buf, 0, mfp->clear_len); -#ifdef DIAGNOSTIC - memset(bhp->buf + mfp->clear_len, CLEAR_BYTE, - mfp->stat.st_pagesize - mfp->clear_len); -#endif - } + if (F_ISSET(bhp, BH_TRASH) && + (ret = __memp_pgread(dbmfp, + &hp->hash_mutex, bhp, LF_ISSET(DB_MPOOL_CREATE) ? 1 : 0)) != 0) + goto err; - ++mfp->stat.st_page_create; - } else { - /* - * It's possible for the read function to fail, which means - * that we fail as well. Note, the __memp_pgread() function - * discards the region lock, so the buffer must be pinned - * down so that it cannot move and its contents are unchanged. - */ -reread: if ((ret = __memp_pgread(dbmfp, - bhp, LF_ISSET(DB_MPOOL_CREATE|DB_MPOOL_EXTENT))) != 0) { - /* - * !!! - * Discard the buffer unless another thread is waiting - * on our I/O to complete. Regardless, the header has - * the BH_TRASH flag set. - */ - if (bhp->ref == 1) - __memp_bhfree(dbmp, bhp, 1); + /* + * BH_CALLPGIN -- + * The buffer was processed for being written to disk, and now has + * to be re-converted for use. + */ + if (F_ISSET(bhp, BH_CALLPGIN)) { + if ((ret = __memp_pg(dbmfp, bhp, 1)) != 0) goto err; - } - - ++mfp->stat.st_cache_miss; + F_CLR(bhp, BH_CALLPGIN); } + MUTEX_UNLOCK(dbenv, &hp->hash_mutex); + +#ifdef DIAGNOSTIC + /* Update the file's pinned reference count. */ + R_LOCK(dbenv, dbmp->reginfo); + ++dbmfp->pinref; + R_UNLOCK(dbenv, dbmp->reginfo); + /* - * If we're returning a page after our current notion of the last-page, - * update our information. Note, there's no way to un-instantiate this - * page, it's going to exist whether it's returned to us dirty or not. + * We want to switch threads as often as possible, and at awkward + * times. Yield every time we get a new page to ensure contention. */ - if (bhp->pgno > mfp->last_pgno) - mfp->last_pgno = bhp->pgno; + if (F_ISSET(dbenv, DB_ENV_YIELDCPU)) + __os_yield(dbenv, 1); +#endif *(void **)addrp = bhp->buf; + return (0); -done: /* Update the chain search statistics. 
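The two buffer fixups above always run in this order: BH_TRASH (fill the contents from the backing file) before BH_CALLPGIN (convert the contents for in-memory use). A stand-alone restatement, with stub functions in place of __memp_pgread and the registered pgin method:

#include <string.h>

#define	BH_TRASH	0x01		/* contents must be read from disk */
#define	BH_CALLPGIN	0x02		/* contents must be converted */

struct bh {
	unsigned int flags;
	char buf[512];
};

/* Stubs standing in for __memp_pgread and the application's pgin. */
static int read_page(struct bh *bhp) { memset(bhp->buf, 0, sizeof(bhp->buf)); return (0); }
static int pgin(struct bh *bhp) { (void)bhp->buf; return (0); }

static int
fixup(struct bh *bhp)
{
	int ret;

	/* First fill garbage contents from the backing file... */
	if (bhp->flags & BH_TRASH) {
		if ((ret = read_page(bhp)) != 0)
			return (ret);
		bhp->flags &= ~BH_TRASH;
	}

	/* ...then run the application's byte-swap/conversion pass. */
	if (bhp->flags & BH_CALLPGIN) {
		if ((ret = pgin(bhp)) != 0)
			return (ret);
		bhp->flags &= ~BH_CALLPGIN;
	}
	return (0);
}

int
main(void)
{
	struct bh b;

	b.flags = BH_TRASH | BH_CALLPGIN;
	return (fixup(&b) == 0 && b.flags == 0 ? 0 : 1);
}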
*/ - if (st_hsearch) { - ++c_mp->stat.st_hash_searches; - if (st_hsearch > c_mp->stat.st_hash_longest) - c_mp->stat.st_hash_longest = st_hsearch; - c_mp->stat.st_hash_examined += st_hsearch; +err: /* + * Discard our reference. If we're the only reference, discard the + * the buffer entirely. If we held a reference to a buffer, we are + * also still holding the hash bucket mutex. + */ + if (b_incr) { + if (bhp->ref == 1) + (void)__memp_bhfree(dbmp, hp, bhp, 1); + else { + --bhp->ref; + MUTEX_UNLOCK(dbenv, &hp->hash_mutex); + } } - ++dbmfp->pinref; + /* If alloc_bhp is set, free the memory. */ + if (alloc_bhp != NULL) + __db_shalloc_free(dbmp->reginfo[n_cache].addr, alloc_bhp); - R_UNLOCK(dbenv, dbmp->reginfo); + return (ret); +} - return (0); +#ifdef HAVE_FILESYSTEM_NOTZERO +/* + * __memp_fs_notzero -- + * Initialize the underlying allocated pages in the file. + */ +static int +__memp_fs_notzero(dbenv, dbmfp, mfp, pgnoaddr) + DB_ENV *dbenv; + DB_MPOOLFILE *dbmfp; + MPOOLFILE *mfp; + db_pgno_t *pgnoaddr; +{ + DB_IO db_io; + u_int32_t i, npages; + size_t nw; + int ret; + u_int8_t *page; + char *fail; -err: /* Discard our reference. */ - if (b_incr) - --bhp->ref; - R_UNLOCK(dbenv, dbmp->reginfo); + /* + * Pages allocated by writing pages past end-of-file are not zeroed, + * on some systems. Recovery could theoretically be fooled by a page + * showing up that contained garbage. In order to avoid this, we + * have to write the pages out to disk, and flush them. The reason + * for the flush is because if we don't sync, the allocation of another + * page subsequent to this one might reach the disk first, and if we + * crashed at the right moment, leave us with this page as the one + * allocated by writing a page past it in the file. + * + * Hash is the only access method that allocates groups of pages. We + * know that it will use the existence of the last page in a group to + * signify that the entire group is OK; so, write all the pages but + * the last one in the group, flush them to disk, and then write the + * last one to disk and flush it. + */ + if ((ret = __os_calloc(dbenv, 1, mfp->stat.st_pagesize, &page)) != 0) + return (ret); + + db_io.fhp = dbmfp->fhp; + db_io.mutexp = dbmfp->mutexp; + db_io.pagesize = db_io.bytes = mfp->stat.st_pagesize; + db_io.buf = page; + + npages = *pgnoaddr - mfp->last_pgno; + for (i = 1; i < npages; ++i) { + db_io.pgno = mfp->last_pgno + i; + if ((ret = __os_io(dbenv, &db_io, DB_IO_WRITE, &nw)) != 0) { + fail = "write"; + goto err; + } + } + if (i != 1 && (ret = __os_fsync(dbenv, dbmfp->fhp)) != 0) { + fail = "sync"; + goto err; + } - *(void **)addrp = NULL; + db_io.pgno = mfp->last_pgno + npages; + if ((ret = __os_io(dbenv, &db_io, DB_IO_WRITE, &nw)) != 0) { + fail = "write"; + goto err; + } + if ((ret = __os_fsync(dbenv, dbmfp->fhp)) != 0) { + fail = "sync"; +err: __db_err(dbenv, "%s: %s failed for page %lu", + __memp_fn(dbmfp), fail, (u_long)db_io.pgno); + } + + __os_free(dbenv, page); return (ret); } +#endif diff --git a/bdb/mp/mp_fopen.c b/bdb/mp/mp_fopen.c index 3611ded18f4..a91bf264652 100644 --- a/bdb/mp/mp_fopen.c +++ b/bdb/mp/mp_fopen.c @@ -1,13 +1,13 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Copyright (c) 1996-2002 * Sleepycat Software. All rights reserved. 
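Returning briefly to __memp_fs_notzero above: the two syncs enforce ordering, so the last page of the group cannot reach stable storage before the pages it vouches for. A minimal POSIX sketch of the same ordering, assuming n >= 1 and that fd, base, and n come from the caller; this is not the DB code, which goes through __os_io and __os_fsync:

#include <sys/types.h>
#include <stdlib.h>
#include <unistd.h>

static int
zero_fill_group(int fd, off_t base, unsigned int n, size_t pagesize)
{
	char *page;
	unsigned int i;
	int ret;

	if ((page = calloc(1, pagesize)) == NULL)
		return (-1);
	ret = 0;

	/* Write and sync all pages but the last one first... */
	for (i = 0; i < n - 1; ++i)
		if (pwrite(fd, page, pagesize,
		    base + (off_t)i * (off_t)pagesize) != (ssize_t)pagesize) {
			ret = -1;
			goto err;
		}
	if (n > 1 && fsync(fd) != 0) {
		ret = -1;
		goto err;
	}

	/* ...then write and sync the sentinel last page. */
	if (pwrite(fd, page, pagesize,
	    base + (off_t)(n - 1) * (off_t)pagesize) != (ssize_t)pagesize ||
	    fsync(fd) != 0)
		ret = -1;

err:	free(page);
	return (ret);
}

int
main(void)
{
	char tmpl[] = "/tmp/nzXXXXXX";
	int fd;

	if ((fd = mkstemp(tmpl)) == -1)
		return (1);
	(void)unlink(tmpl);
	return (zero_fill_group(fd, 0, 4, 4096) == 0 ? 0 : 1);
}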
*/ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: mp_fopen.c,v 11.41 2001/01/10 04:50:53 ubell Exp $"; +static const char revid[] = "$Id: mp_fopen.c,v 11.90 2002/08/26 15:22:01 bostic Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -16,211 +16,464 @@ static const char revid[] = "$Id: mp_fopen.c,v 11.41 2001/01/10 04:50:53 ubell E #include <string.h> #endif -#ifdef HAVE_RPC -#include "db_server.h" -#endif - #include "db_int.h" -#include "db_shash.h" -#include "mp.h" - -#ifdef HAVE_RPC -#include "gen_client_ext.h" -#include "rpc_client_ext.h" -#endif - -static int __memp_mf_open __P((DB_MPOOL *, const char *, - size_t, db_pgno_t, DB_MPOOL_FINFO *, u_int32_t, MPOOLFILE **)); +#include "dbinc/db_shash.h" +#include "dbinc/mp.h" + +static int __memp_fclose __P((DB_MPOOLFILE *, u_int32_t)); +static int __memp_fopen __P((DB_MPOOLFILE *, + const char *, u_int32_t, int, size_t)); +static void __memp_get_fileid __P((DB_MPOOLFILE *, u_int8_t *)); +static void __memp_last_pgno __P((DB_MPOOLFILE *, db_pgno_t *)); +static void __memp_refcnt __P((DB_MPOOLFILE *, db_pgno_t *)); +static int __memp_set_clear_len __P((DB_MPOOLFILE *, u_int32_t)); +static int __memp_set_fileid __P((DB_MPOOLFILE *, u_int8_t *)); +static int __memp_set_ftype __P((DB_MPOOLFILE *, int)); +static int __memp_set_lsn_offset __P((DB_MPOOLFILE *, int32_t)); +static int __memp_set_pgcookie __P((DB_MPOOLFILE *, DBT *)); +static int __memp_set_priority __P((DB_MPOOLFILE *, DB_CACHE_PRIORITY)); +static void __memp_set_unlink __P((DB_MPOOLFILE *, int)); + +/* Initialization methods cannot be called after open is called. */ +#define MPF_ILLEGAL_AFTER_OPEN(dbmfp, name) \ + if (F_ISSET(dbmfp, MP_OPEN_CALLED)) \ + return (__db_mi_open((dbmfp)->dbmp->dbenv, name, 1)); /* - * MEMP_FREMOVE -- - * Discard an MPOOLFILE and any buffers it references: update the flags - * so we never try to write buffers associated with the file, nor can we - * find it when looking for files to join. In addition, clear the ftype - * field, there's no reason to post-process pages, they can be discarded - * by any thread. - */ -#define MEMP_FREMOVE(mfp) { \ - mfp->ftype = 0; \ - F_SET(mfp, MP_DEADFILE); \ -} - -/* - * memp_fopen -- - * Open a backing file for the memory pool. + * __memp_fcreate -- + * Create a DB_MPOOLFILE handle. + * + * PUBLIC: int __memp_fcreate __P((DB_ENV *, DB_MPOOLFILE **, u_int32_t)); */ int -memp_fopen(dbenv, path, flags, mode, pagesize, finfop, retp) +__memp_fcreate(dbenv, retp, flags) DB_ENV *dbenv; - const char *path; - u_int32_t flags; - int mode; - size_t pagesize; - DB_MPOOL_FINFO *finfop; DB_MPOOLFILE **retp; + u_int32_t flags; { DB_MPOOL *dbmp; + DB_MPOOLFILE *dbmfp; int ret; -#ifdef HAVE_RPC - if (F_ISSET(dbenv, DB_ENV_RPCCLIENT)) - return (__dbcl_memp_fopen(dbenv, path, flags, - mode, pagesize, finfop, retp)); -#endif - PANIC_CHECK(dbenv); - ENV_REQUIRES_CONFIG(dbenv, dbenv->mp_handle, DB_INIT_MPOOL); + ENV_REQUIRES_CONFIG(dbenv, + dbenv->mp_handle, "memp_fcreate", DB_INIT_MPOOL); dbmp = dbenv->mp_handle; /* Validate arguments. */ - if ((ret = __db_fchk(dbenv, "memp_fopen", flags, - DB_CREATE | - DB_NOMMAP | DB_ODDFILESIZE | DB_RDONLY | DB_TRUNCATE)) != 0) + if ((ret = __db_fchk(dbenv, "memp_fcreate", flags, 0)) != 0) return (ret); - /* Require a non-zero pagesize. */ - if (pagesize == 0 || - (finfop != NULL && finfop->clear_len > pagesize)) { - __db_err(dbenv, "memp_fopen: illegal page size."); - return (EINVAL); + /* Allocate and initialize the per-process structure. 
*/ + if ((ret = __os_calloc(dbenv, 1, sizeof(DB_MPOOLFILE), &dbmfp)) != 0) + return (ret); + if ((ret = __os_calloc(dbenv, 1, sizeof(DB_FH), &dbmfp->fhp)) != 0) + goto err; + + /* Allocate and initialize a mutex if necessary. */ + if (F_ISSET(dbenv, DB_ENV_THREAD) && + (ret = __db_mutex_setup(dbenv, dbmp->reginfo, &dbmfp->mutexp, + MUTEX_ALLOC | MUTEX_THREAD)) != 0) + goto err; + + dbmfp->ref = 1; + dbmfp->lsn_offset = -1; + dbmfp->dbmp = dbmp; + dbmfp->mfp = INVALID_ROFF; + + dbmfp->close = __memp_fclose; + dbmfp->get = __memp_fget; + dbmfp->get_fileid = __memp_get_fileid; + dbmfp->last_pgno = __memp_last_pgno; + dbmfp->open = __memp_fopen; + dbmfp->put = __memp_fput; + dbmfp->refcnt = __memp_refcnt; + dbmfp->set = __memp_fset; + dbmfp->set_clear_len = __memp_set_clear_len; + dbmfp->set_fileid = __memp_set_fileid; + dbmfp->set_ftype = __memp_set_ftype; + dbmfp->set_lsn_offset = __memp_set_lsn_offset; + dbmfp->set_pgcookie = __memp_set_pgcookie; + dbmfp->set_priority = __memp_set_priority; + dbmfp->set_unlink = __memp_set_unlink; + dbmfp->sync = __memp_fsync; + + *retp = dbmfp; + return (0); + +err: if (dbmfp != NULL) { + if (dbmfp->fhp != NULL) + (void)__os_free(dbenv, dbmfp->fhp); + (void)__os_free(dbenv, dbmfp); } + return (ret); +} - return (__memp_fopen(dbmp, - NULL, path, flags, mode, pagesize, 1, finfop, retp)); +/* + * __memp_set_clear_len -- + * Set the clear length. + */ +static int +__memp_set_clear_len(dbmfp, clear_len) + DB_MPOOLFILE *dbmfp; + u_int32_t clear_len; +{ + MPF_ILLEGAL_AFTER_OPEN(dbmfp, "set_clear_len"); + + dbmfp->clear_len = clear_len; + return (0); } /* - * __memp_set_unlink -- set unlink on last close flag. - * - * PUBLIC: void __memp_set_unlink __P((DB_MPOOLFILE *)); + * __memp_set_fileid -- + * Set the file ID. */ -void -__memp_set_unlink(dbmpf) - DB_MPOOLFILE *dbmpf; +static int +__memp_set_fileid(dbmfp, fileid) + DB_MPOOLFILE *dbmfp; + u_int8_t *fileid; { - DB_MPOOL *dbmp; - dbmp = dbmpf->dbmp; + MPF_ILLEGAL_AFTER_OPEN(dbmfp, "set_fileid"); - R_LOCK(dbmp->dbenv, dbmp->reginfo); - F_SET(dbmpf->mfp, MP_UNLINK); - R_UNLOCK(dbmp->dbenv, dbmp->reginfo); + /* + * XXX + * This is dangerous -- we're saving the caller's pointer instead + * of allocating memory and copying the contents. + */ + dbmfp->fileid = fileid; + return (0); } /* - * __memp_clear_unlink -- clear unlink on last close flag. - * - * PUBLIC: void __memp_clear_unlink __P((DB_MPOOLFILE *)); + * __memp_set_ftype -- + * Set the file type (as registered). */ -void -__memp_clear_unlink(dbmpf) - DB_MPOOLFILE *dbmpf; +static int +__memp_set_ftype(dbmfp, ftype) + DB_MPOOLFILE *dbmfp; + int ftype; +{ + MPF_ILLEGAL_AFTER_OPEN(dbmfp, "set_ftype"); + + dbmfp->ftype = ftype; + return (0); +} + +/* + * __memp_set_lsn_offset -- + * Set the page's LSN offset. + */ +static int +__memp_set_lsn_offset(dbmfp, lsn_offset) + DB_MPOOLFILE *dbmfp; + int32_t lsn_offset; +{ + MPF_ILLEGAL_AFTER_OPEN(dbmfp, "set_lsn_offset"); + + dbmfp->lsn_offset = lsn_offset; + return (0); +} + +/* + * __memp_set_pgcookie -- + * Set the pgin/pgout cookie. + */ +static int +__memp_set_pgcookie(dbmfp, pgcookie) + DB_MPOOLFILE *dbmfp; + DBT *pgcookie; +{ + MPF_ILLEGAL_AFTER_OPEN(dbmfp, "set_pgcookie"); + + dbmfp->pgcookie = pgcookie; + return (0); +} + +/* + * __memp_set_priority -- + * Set the cache priority for pages from this file. 
+ */ +static int +__memp_set_priority(dbmfp, priority) + DB_MPOOLFILE *dbmfp; + DB_CACHE_PRIORITY priority; +{ + switch (priority) { + case DB_PRIORITY_VERY_LOW: + dbmfp->mfp->priority = MPOOL_PRI_VERY_LOW; + break; + case DB_PRIORITY_LOW: + dbmfp->mfp->priority = MPOOL_PRI_LOW; + break; + case DB_PRIORITY_DEFAULT: + dbmfp->mfp->priority = MPOOL_PRI_DEFAULT; + break; + case DB_PRIORITY_HIGH: + dbmfp->mfp->priority = MPOOL_PRI_HIGH; + break; + case DB_PRIORITY_VERY_HIGH: + dbmfp->mfp->priority = MPOOL_PRI_VERY_HIGH; + break; + default: + __db_err(dbmfp->dbmp->dbenv, + "Unknown priority value: %d", priority); + return (EINVAL); + } + + return (0); +} + +/* + * __memp_fopen -- + * Open a backing file for the memory pool. + */ +static int +__memp_fopen(dbmfp, path, flags, mode, pagesize) + DB_MPOOLFILE *dbmfp; + const char *path; + u_int32_t flags; + int mode; + size_t pagesize; { + DB_ENV *dbenv; DB_MPOOL *dbmp; - dbmp = dbmpf->dbmp; + int ret; + + dbmp = dbmfp->dbmp; + dbenv = dbmp->dbenv; + + PANIC_CHECK(dbenv); + + /* Validate arguments. */ + if ((ret = __db_fchk(dbenv, "memp_fopen", flags, + DB_CREATE | DB_DIRECT | DB_EXTENT | + DB_NOMMAP | DB_ODDFILESIZE | DB_RDONLY | DB_TRUNCATE)) != 0) + return (ret); /* - * This bit is protected in the queue code because the metapage - * is locked so we can avoid geting the region lock. - * If this gets used from other than the queue code, we cannot. + * Require a non-zero, power-of-two pagesize, smaller than the + * clear length. */ - if (!F_ISSET(dbmpf->mfp, MP_UNLINK)) - return; - R_LOCK(dbmp->dbenv, dbmp->reginfo); - F_CLR(dbmpf->mfp, MP_UNLINK); - R_UNLOCK(dbmp->dbenv, dbmp->reginfo); + if (pagesize == 0 || !POWER_OF_TWO(pagesize)) { + __db_err(dbenv, + "memp_fopen: page sizes must be a power-of-2"); + return (EINVAL); + } + if (dbmfp->clear_len > pagesize) { + __db_err(dbenv, + "memp_fopen: clear length larger than page size"); + return (EINVAL); + } + + /* Read-only checks, and local flag. */ + if (LF_ISSET(DB_RDONLY) && path == NULL) { + __db_err(dbenv, + "memp_fopen: temporary files can't be readonly"); + return (EINVAL); + } + + return (__memp_fopen_int(dbmfp, NULL, path, flags, mode, pagesize)); } /* - * __memp_fopen -- + * __memp_fopen_int -- * Open a backing file for the memory pool; internal version. * - * PUBLIC: int __memp_fopen __P((DB_MPOOL *, MPOOLFILE *, const char *, - * PUBLIC: u_int32_t, int, size_t, int, DB_MPOOL_FINFO *, DB_MPOOLFILE **)); + * PUBLIC: int __memp_fopen_int __P((DB_MPOOLFILE *, + * PUBLIC: MPOOLFILE *, const char *, u_int32_t, int, size_t)); */ int -__memp_fopen(dbmp, mfp, path, flags, mode, pagesize, needlock, finfop, retp) - DB_MPOOL *dbmp; +__memp_fopen_int(dbmfp, mfp, path, flags, mode, pagesize) + DB_MPOOLFILE *dbmfp; MPOOLFILE *mfp; const char *path; u_int32_t flags; - int mode, needlock; + int mode; size_t pagesize; - DB_MPOOL_FINFO *finfop; - DB_MPOOLFILE **retp; { DB_ENV *dbenv; - DB_MPOOLFILE *dbmfp; - DB_MPOOL_FINFO finfo; + DB_MPOOL *dbmp; + MPOOL *mp; db_pgno_t last_pgno; size_t maxmap; u_int32_t mbytes, bytes, oflags; - int ret; + int mfp_alloc, ret; u_int8_t idbuf[DB_FILE_ID_LEN]; char *rpath; + void *p; + dbmp = dbmfp->dbmp; dbenv = dbmp->dbenv; - ret = 0; + mp = dbmp->reginfo[0].primary; + mfp_alloc = ret = 0; rpath = NULL; /* - * If mfp is provided, we take the DB_MPOOL_FINFO information from - * the mfp. We don't bother initializing everything, because some - * of them are expensive to acquire. If no mfp is provided and the - * finfop argument is NULL, we default the values. 
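Taken together, the method table built by __memp_fcreate and the MPF_ILLEGAL_AFTER_OPEN guard enforce a create / configure / open / use / close lifecycle on the handle. A minimal usage fragment (not a complete program), assuming an environment already opened with DB_INIT_MPOOL and the 4.1-era method signatures shown in this diff; error handling is abbreviated:

#include <db.h>

int
use_mpoolfile(DB_ENV *dbenv)
{
	DB_MPOOLFILE *mpf;
	db_pgno_t pgno;
	void *addr;
	int ret;

	if ((ret = dbenv->memp_fcreate(dbenv, &mpf, 0)) != 0)
		return (ret);

	/* Configure first: these methods fail once open has been called. */
	if ((ret = mpf->set_clear_len(mpf, 32)) != 0)
		goto err;

	/* A power-of-two page size no smaller than the clear length. */
	if ((ret = mpf->open(mpf, "a.db", DB_CREATE, 0660, 1024)) != 0)
		goto err;

	pgno = 0;
	if ((ret = mpf->get(mpf, &pgno, DB_MPOOL_CREATE, &addr)) != 0)
		goto err;
	/* ... use the page ... */
	ret = mpf->put(mpf, addr, DB_MPOOL_DIRTY);

err:	(void)mpf->close(mpf, 0);
	return (ret);
}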
+ * Set the page size so os_open can decide whether to turn buffering + * off if the DB_DIRECT_DB flag is set. */ - if (finfop == NULL) { - memset(&finfo, 0, sizeof(finfo)); - if (mfp != NULL) { - finfo.ftype = mfp->ftype; - finfo.pgcookie = NULL; - finfo.fileid = NULL; - finfo.lsn_offset = mfp->lsn_off; - finfo.clear_len = mfp->clear_len; - } else { - finfo.ftype = 0; - finfo.pgcookie = NULL; - finfo.fileid = NULL; - finfo.lsn_offset = -1; - finfo.clear_len = 0; - } - finfop = &finfo; - } + dbmfp->fhp->pagesize = (u_int32_t)pagesize; - /* Allocate and initialize the per-process structure. */ - if ((ret = __os_calloc(dbenv, 1, sizeof(DB_MPOOLFILE), &dbmfp)) != 0) - return (ret); - dbmfp->dbmp = dbmp; - dbmfp->ref = 1; - if (LF_ISSET(DB_RDONLY)) + /* + * If it's a temporary file, delay the open until we actually need + * to write the file, and we know we can't join any existing files. + */ + if (path == NULL) + goto alloc; + + /* + * Get the real name for this file and open it. If it's a Queue extent + * file, it may not exist, and that's OK. + */ + oflags = 0; + if (LF_ISSET(DB_CREATE)) + oflags |= DB_OSO_CREATE; + if (LF_ISSET(DB_DIRECT)) + oflags |= DB_OSO_DIRECT; + if (LF_ISSET(DB_RDONLY)) { F_SET(dbmfp, MP_READONLY); + oflags |= DB_OSO_RDONLY; + } + if ((ret = + __db_appname(dbenv, DB_APP_DATA, path, 0, NULL, &rpath)) != 0) + goto err; + if ((ret = __os_open(dbenv, rpath, oflags, mode, dbmfp->fhp)) != 0) { + if (!LF_ISSET(DB_EXTENT)) + __db_err(dbenv, "%s: %s", rpath, db_strerror(ret)); + goto err; + } - if (path == NULL) { - if (LF_ISSET(DB_RDONLY)) { - __db_err(dbenv, - "memp_fopen: temporary files can't be readonly"); - ret = EINVAL; + /* + * Get the file id if we weren't given one. Generated file id's + * don't use timestamps, otherwise there'd be no chance of any + * other process joining the party. + */ + if (dbmfp->fileid == NULL) { + if ((ret = __os_fileid(dbenv, rpath, 0, idbuf)) != 0) goto err; + dbmfp->fileid = idbuf; + } + + /* + * If our caller knows what mfp we're using, increment the ref count, + * no need to search. + * + * We don't need to acquire a lock other than the mfp itself, because + * we know there's another reference and it's not going away. + */ + if (mfp != NULL) { + MUTEX_LOCK(dbenv, &mfp->mutex); + ++mfp->mpf_cnt; + MUTEX_UNLOCK(dbenv, &mfp->mutex); + goto check_map; + } + + /* + * If not creating a temporary file, walk the list of MPOOLFILE's, + * looking for a matching file. Files backed by temporary files + * or previously removed files can't match. + * + * DB_TRUNCATE support. + * + * The fileID is a filesystem unique number (e.g., a UNIX dev/inode + * pair) plus a timestamp. If files are removed and created in less + * than a second, the fileID can be repeated. The problem with + * repetition happens when the file that previously had the fileID + * value still has pages in the pool, since we don't want to use them + * to satisfy requests for the new file. + * + * Because the DB_TRUNCATE flag reuses the dev/inode pair, repeated + * opens with that flag set guarantees matching fileIDs when the + * machine can open a file and then re-open with truncate within a + * second. For this reason, we pass that flag down, and, if we find + * a matching entry, we ensure that it's never found again, and we + * create a new entry for the current request. + */ + R_LOCK(dbenv, dbmp->reginfo); + for (mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile); + mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) { + /* Skip dead files and temporary files. 
*/ + if (F_ISSET(mfp, MP_DEADFILE | MP_TEMP)) + continue; + + /* Skip non-matching files. */ + if (memcmp(dbmfp->fileid, R_ADDR(dbmp->reginfo, + mfp->fileid_off), DB_FILE_ID_LEN) != 0) + continue; + + /* + * If the file is being truncated, remove it from the system + * and create a new entry. + * + * !!! + * We should be able to set mfp to NULL and break out of the + * loop, but I like the idea of checking all the entries. + */ + if (LF_ISSET(DB_TRUNCATE)) { + MUTEX_LOCK(dbenv, &mfp->mutex); + MPOOLFILE_IGNORE(mfp); + MUTEX_UNLOCK(dbenv, &mfp->mutex); + continue; } - last_pgno = 0; - } else { - /* Get the real name for this file and open it. */ - if ((ret = __db_appname(dbenv, - DB_APP_DATA, NULL, path, 0, NULL, &rpath)) != 0) - goto err; - oflags = 0; - if (LF_ISSET(DB_CREATE)) - oflags |= DB_OSO_CREATE; - if (LF_ISSET(DB_RDONLY)) - oflags |= DB_OSO_RDONLY; - if ((ret = - __os_open(dbenv, rpath, oflags, mode, &dbmfp->fh)) != 0) { - if (!LF_ISSET(DB_EXTENT)) - __db_err(dbenv, - "%s: %s", rpath, db_strerror(ret)); + + /* + * Some things about a file cannot be changed: the clear length, + * page size, or lSN location. + * + * The file type can change if the application's pre- and post- + * processing needs change. For example, an application that + * created a hash subdatabase in a database that was previously + * all btree. + * + * XXX + * We do not check to see if the pgcookie information changed, + * or update it if it is, this might be a bug. + */ + if (dbmfp->clear_len != mfp->clear_len || + pagesize != mfp->stat.st_pagesize || + dbmfp->lsn_offset != mfp->lsn_off) { + __db_err(dbenv, + "%s: clear length, page size or LSN location changed", + path); + R_UNLOCK(dbenv, dbmp->reginfo); + ret = EINVAL; goto err; } + if (dbmfp->ftype != 0) + mfp->ftype = dbmfp->ftype; + + MUTEX_LOCK(dbenv, &mfp->mutex); + ++mfp->mpf_cnt; + MUTEX_UNLOCK(dbenv, &mfp->mutex); + break; + } + R_UNLOCK(dbenv, dbmp->reginfo); + + if (mfp != NULL) + goto check_map; + +alloc: /* Allocate and initialize a new MPOOLFILE. */ + if ((ret = __memp_alloc( + dbmp, dbmp->reginfo, NULL, sizeof(MPOOLFILE), NULL, &mfp)) != 0) + goto err; + mfp_alloc = 1; + memset(mfp, 0, sizeof(MPOOLFILE)); + mfp->mpf_cnt = 1; + mfp->ftype = dbmfp->ftype; + mfp->stat.st_pagesize = pagesize; + mfp->lsn_off = dbmfp->lsn_offset; + mfp->clear_len = dbmfp->clear_len; + + if (LF_ISSET(DB_DIRECT)) + F_SET(mfp, MP_DIRECT); + if (LF_ISSET(DB_EXTENT)) + F_SET(mfp, MP_EXTENT); + + if (path == NULL) + F_SET(mfp, MP_TEMP); + else { /* * Don't permit files that aren't a multiple of the pagesize, * and find the number of the last page in the file, all the @@ -234,93 +487,84 @@ __memp_fopen(dbmp, mfp, path, flags, mode, pagesize, needlock, finfop, retp) * environments where an off_t is 32-bits, but still run where * offsets are 64-bits, and they pay us a lot of money. */ - if ((ret = __os_ioinfo(dbenv, rpath, - &dbmfp->fh, &mbytes, &bytes, NULL)) != 0) { + if ((ret = __os_ioinfo( + dbenv, rpath, dbmfp->fhp, &mbytes, &bytes, NULL)) != 0) { __db_err(dbenv, "%s: %s", rpath, db_strerror(ret)); goto err; } /* - * If we're doing a verify, we might have to cope with - * a truncated file; if the file size is not a multiple - * of the page size, round down to a page--we'll - * take care of the partial page outside the memp system. + * During verify or recovery, we might have to cope with a + * truncated file; if the file size is not a multiple of the + * page size, round down to a page, we'll take care of the + * partial page outside the mpool system. 
*/ - - /* Page sizes have to be a power-of-two, ignore mbytes. */ if (bytes % pagesize != 0) { if (LF_ISSET(DB_ODDFILESIZE)) - /* - * If we're doing a verify, we might - * have to cope with a truncated file; - * round down, we'll worry about the partial - * page outside the memp system. - */ - bytes -= (bytes % pagesize); + bytes -= (u_int32_t)(bytes % pagesize); else { __db_err(dbenv, - "%s: file size not a multiple of the pagesize", - rpath); + "%s: file size not a multiple of the pagesize", rpath); ret = EINVAL; goto err; } } - last_pgno = mbytes * (MEGABYTE / pagesize); - last_pgno += bytes / pagesize; - - /* Correction: page numbers are zero-based, not 1-based. */ + /* + * If the user specifies DB_MPOOL_LAST or DB_MPOOL_NEW on a + * page get, we have to increment the last page in the file. + * Figure it out and save it away. + * + * Note correction: page numbers are zero-based, not 1-based. + */ + last_pgno = (db_pgno_t)(mbytes * (MEGABYTE / pagesize)); + last_pgno += (db_pgno_t)(bytes / pagesize); if (last_pgno != 0) --last_pgno; + mfp->orig_last_pgno = mfp->last_pgno = last_pgno; - /* - * Get the file id if we weren't given one. Generated file id's - * don't use timestamps, otherwise there'd be no chance of any - * other process joining the party. - */ - if (finfop->fileid == NULL) { - if ((ret = __os_fileid(dbenv, rpath, 0, idbuf)) != 0) - goto err; - finfop->fileid = idbuf; - } - } + /* Copy the file path into shared memory. */ + if ((ret = __memp_alloc(dbmp, dbmp->reginfo, + NULL, strlen(path) + 1, &mfp->path_off, &p)) != 0) + goto err; + memcpy(p, path, strlen(path) + 1); - /* - * If we weren't provided an underlying shared object to join with, - * find/allocate the shared file objects. Also allocate space for - * for the per-process thread lock. - */ - if (needlock) - R_LOCK(dbenv, dbmp->reginfo); - if (mfp == NULL) - ret = __memp_mf_open( - dbmp, path, pagesize, last_pgno, finfop, flags, &mfp); - else { - ++mfp->mpf_cnt; - ret = 0; + /* Copy the file identification string into shared memory. */ + if ((ret = __memp_alloc(dbmp, dbmp->reginfo, + NULL, DB_FILE_ID_LEN, &mfp->fileid_off, &p)) != 0) + goto err; + memcpy(p, dbmfp->fileid, DB_FILE_ID_LEN); } - if (needlock) - R_UNLOCK(dbenv, dbmp->reginfo); - if (ret != 0) - goto err; - if (F_ISSET(dbenv, DB_ENV_THREAD)) { - if ((ret = __db_mutex_alloc( - dbenv, dbmp->reginfo, &dbmfp->mutexp)) != 0) - goto err; - if ((ret = __db_mutex_init( - dbenv, dbmfp->mutexp, 0, MUTEX_THREAD)) != 0) + /* Copy the page cookie into shared memory. */ + if (dbmfp->pgcookie == NULL || dbmfp->pgcookie->size == 0) { + mfp->pgcookie_len = 0; + mfp->pgcookie_off = 0; + } else { + if ((ret = __memp_alloc(dbmp, dbmp->reginfo, + NULL, dbmfp->pgcookie->size, &mfp->pgcookie_off, &p)) != 0) goto err; - - /* XXX: KEITH: CLOSE THE FILE ON FAILURE? */ + memcpy(p, dbmfp->pgcookie->data, dbmfp->pgcookie->size); + mfp->pgcookie_len = dbmfp->pgcookie->size; } - dbmfp->mfp = mfp; + /* + * Prepend the MPOOLFILE to the list of MPOOLFILE's. 
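The last-page arithmetic above is easy to get wrong, so a worked example may help. With invented sizes -- __os_ioinfo reporting one megabyte plus 8192 bytes, and 4096-byte pages -- the file holds 258 pages and the zero-based last page is 257:

#include <stdio.h>

#define	MEGABYTE	1048576

int
main(void)
{
	unsigned int mbytes, bytes, pagesize, last_pgno;

	mbytes = 1;			/* __os_ioinfo: 1MB ... */
	bytes = 8192;			/* ... plus 8192 bytes */
	pagesize = 4096;

	last_pgno = mbytes * (MEGABYTE / pagesize);	/* 256 pages */
	last_pgno += bytes / pagesize;			/* + 2 = 258 pages */
	if (last_pgno != 0)
		--last_pgno;		/* zero-based: last page is 257 */

	printf("last_pgno = %u\n", last_pgno);
	return (0);
}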
+ */ + R_LOCK(dbenv, dbmp->reginfo); + ret = __db_mutex_setup(dbenv, dbmp->reginfo, &mfp->mutex, + MUTEX_NO_RLOCK); + if (ret == 0) + SH_TAILQ_INSERT_HEAD(&mp->mpfq, mfp, q, __mpoolfile); + R_UNLOCK(dbenv, dbmp->reginfo); + if (ret != 0) + goto err; +check_map: /* * If a file: - * + is read-only * + isn't temporary + * + is read-only * + doesn't require any pgin/pgout support * + the DB_NOMMAP flag wasn't set (in either the file open or * the environment in which it was opened) @@ -332,7 +576,6 @@ __memp_fopen(dbmp, mfp, path, flags, mode, pagesize, needlock, finfop, retp) * NFS mounted partition, and we can fail in buffer I/O just as easily * as here. * - * XXX * We'd like to test to see if the file is too big to mmap. Since we * don't know what size or type off_t's or size_t's are, or the largest * unsigned integral type is, or what random insanity the local C @@ -341,11 +584,11 @@ __memp_fopen(dbmp, mfp, path, flags, mode, pagesize, needlock, finfop, retp) */ #define DB_MAXMMAPSIZE (10 * 1024 * 1024) /* 10 Mb. */ if (F_ISSET(mfp, MP_CAN_MMAP)) { - if (!F_ISSET(dbmfp, MP_READONLY)) - F_CLR(mfp, MP_CAN_MMAP); if (path == NULL) F_CLR(mfp, MP_CAN_MMAP); - if (finfop->ftype != 0) + if (!F_ISSET(dbmfp, MP_READONLY)) + F_CLR(mfp, MP_CAN_MMAP); + if (dbmfp->ftype != 0) F_CLR(mfp, MP_CAN_MMAP); if (LF_ISSET(DB_NOMMAP) || F_ISSET(dbenv, DB_ENV_NOMMAP)) F_CLR(mfp, MP_CAN_MMAP); @@ -354,260 +597,239 @@ __memp_fopen(dbmp, mfp, path, flags, mode, pagesize, needlock, finfop, retp) if (mbytes > maxmap / MEGABYTE || (mbytes == maxmap / MEGABYTE && bytes >= maxmap % MEGABYTE)) F_CLR(mfp, MP_CAN_MMAP); - } - dbmfp->addr = NULL; - if (F_ISSET(mfp, MP_CAN_MMAP)) { - dbmfp->len = (size_t)mbytes * MEGABYTE + bytes; - if (__os_mapfile(dbenv, rpath, - &dbmfp->fh, dbmfp->len, 1, &dbmfp->addr) != 0) { - dbmfp->addr = NULL; - F_CLR(mfp, MP_CAN_MMAP); + + dbmfp->addr = NULL; + if (F_ISSET(mfp, MP_CAN_MMAP)) { + dbmfp->len = (size_t)mbytes * MEGABYTE + bytes; + if (__os_mapfile(dbenv, rpath, + dbmfp->fhp, dbmfp->len, 1, &dbmfp->addr) != 0) { + dbmfp->addr = NULL; + F_CLR(mfp, MP_CAN_MMAP); + } } } - if (rpath != NULL) - __os_freestr(rpath); + dbmfp->mfp = mfp; + + F_SET(dbmfp, MP_OPEN_CALLED); + + /* Add the file to the process' list of DB_MPOOLFILEs. */ MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp); TAILQ_INSERT_TAIL(&dbmp->dbmfq, dbmfp, q); MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp); - *retp = dbmfp; - return (0); + if (0) { +err: if (F_ISSET(dbmfp->fhp, DB_FH_VALID)) + (void)__os_closehandle(dbenv, dbmfp->fhp); + + if (mfp_alloc) { + R_LOCK(dbenv, dbmp->reginfo); + if (mfp->path_off != 0) + __db_shalloc_free(dbmp->reginfo[0].addr, + R_ADDR(dbmp->reginfo, mfp->path_off)); + if (mfp->fileid_off != 0) + __db_shalloc_free(dbmp->reginfo[0].addr, + R_ADDR(dbmp->reginfo, mfp->fileid_off)); + __db_shalloc_free(dbmp->reginfo[0].addr, mfp); + R_UNLOCK(dbenv, dbmp->reginfo); + } -err: /* - * Note that we do not have to free the thread mutex, because we - * never get to here after we have successfully allocated it. - */ - if (rpath != NULL) - __os_freestr(rpath); - if (F_ISSET(&dbmfp->fh, DB_FH_VALID)) - (void)__os_closehandle(&dbmfp->fh); - if (dbmfp != NULL) { - if (dbmfp->mutexp != NULL) - __db_mutex_free(dbenv, dbmp->reginfo, dbmfp->mutexp); - __os_free(dbmfp, sizeof(DB_MPOOLFILE)); } + if (rpath != NULL) + __os_free(dbenv, rpath); return (ret); } /* - * __memp_mf_open -- - * Open an MPOOLFILE. + * __memp_get_fileid -- + * Return the file ID. + * + * XXX + * Undocumented interface: DB private. 
*/ -static int -__memp_mf_open(dbmp, path, pagesize, last_pgno, finfop, flags, retp) - DB_MPOOL *dbmp; - const char *path; - size_t pagesize; - db_pgno_t last_pgno; - DB_MPOOL_FINFO *finfop; - u_int32_t flags; - MPOOLFILE **retp; +static void +__memp_get_fileid(dbmfp, fidp) + DB_MPOOLFILE *dbmfp; + u_int8_t *fidp; { - MPOOL *mp; - MPOOLFILE *mfp; - int ret; - void *p; - -#define ISTEMPORARY (path == NULL) - /* - * If not creating a temporary file, walk the list of MPOOLFILE's, - * looking for a matching file. Files backed by temporary files - * or previously removed files can't match. + * No lock needed -- we're using the handle, it had better not + * be going away. * - * DB_TRUNCATE support. - * - * The fileID is a filesystem unique number (e.g., a UNIX dev/inode - * pair) plus a timestamp. If files are removed and created in less - * than a second, the fileID can be repeated. The problem with - * repetition happens when the file that previously had the fileID - * value still has pages in the pool, since we don't want to use them - * to satisfy requests for the new file. - * - * Because the DB_TRUNCATE flag reuses the dev/inode pair, repeated - * opens with that flag set guarantees matching fileIDs when the - * machine can open a file and then re-open with truncate within a - * second. For this reason, we pass that flag down, and, if we find - * a matching entry, we ensure that it's never found again, and we - * create a new entry for the current request. + * !!! + * Get the fileID out of the region, not out of the DB_MPOOLFILE + * structure because the DB_MPOOLFILE reference is possibly short + * lived, and isn't to be trusted. */ - if (!ISTEMPORARY) { - mp = dbmp->reginfo[0].primary; - for (mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile); - mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) { - if (F_ISSET(mfp, MP_DEADFILE | MP_TEMP)) - continue; - if (memcmp(finfop->fileid, R_ADDR(dbmp->reginfo, - mfp->fileid_off), DB_FILE_ID_LEN) == 0) { - if (LF_ISSET(DB_TRUNCATE)) { - MEMP_FREMOVE(mfp); - continue; - } - if (finfop->clear_len != mfp->clear_len || - pagesize != mfp->stat.st_pagesize) { - __db_err(dbmp->dbenv, - "%s: page size or clear length changed", - path); - return (EINVAL); - } - - /* - * It's possible that our needs for pre- and - * post-processing are changing. For example, - * an application created a hash subdatabase - * in a database that was previously all btree. - */ - if (finfop->ftype != 0) - mfp->ftype = finfop->ftype; - - ++mfp->mpf_cnt; - - *retp = mfp; - return (0); - } - } - } + memcpy(fidp, R_ADDR( + dbmfp->dbmp->reginfo, dbmfp->mfp->fileid_off), DB_FILE_ID_LEN); +} - /* Allocate a new MPOOLFILE. */ - if ((ret = __memp_alloc( - dbmp, dbmp->reginfo, NULL, sizeof(MPOOLFILE), NULL, &mfp)) != 0) - goto mem_err; - *retp = mfp; +/* + * __memp_last_pgno -- + * Return the page number of the last page in the file. + * + * XXX + * Undocumented interface: DB private. + */ +static void +__memp_last_pgno(dbmfp, pgnoaddr) + DB_MPOOLFILE *dbmfp; + db_pgno_t *pgnoaddr; +{ + DB_ENV *dbenv; + DB_MPOOL *dbmp; - /* Initialize the structure. */ - memset(mfp, 0, sizeof(MPOOLFILE)); - mfp->mpf_cnt = 1; - mfp->ftype = finfop->ftype; - mfp->lsn_off = finfop->lsn_offset; - mfp->clear_len = finfop->clear_len; + dbmp = dbmfp->dbmp; + dbenv = dbmp->dbenv; - /* - * If the user specifies DB_MPOOL_LAST or DB_MPOOL_NEW on a memp_fget, - * we have to know the last page in the file. Figure it out and save - * it away. 
- */ - mfp->stat.st_pagesize = pagesize; - mfp->orig_last_pgno = mfp->last_pgno = last_pgno; + R_LOCK(dbenv, dbmp->reginfo); + *pgnoaddr = dbmfp->mfp->last_pgno; + R_UNLOCK(dbenv, dbmp->reginfo); +} - if (ISTEMPORARY) - F_SET(mfp, MP_TEMP); - else { - /* Copy the file path into shared memory. */ - if ((ret = __memp_alloc(dbmp, dbmp->reginfo, - NULL, strlen(path) + 1, &mfp->path_off, &p)) != 0) - goto err; - memcpy(p, path, strlen(path) + 1); +/* + * __memp_refcnt -- + * Return the current reference count. + * + * XXX + * Undocumented interface: DB private. + */ +static void +__memp_refcnt(dbmfp, cntp) + DB_MPOOLFILE *dbmfp; + db_pgno_t *cntp; +{ + DB_ENV *dbenv; - /* Copy the file identification string into shared memory. */ - if ((ret = __memp_alloc(dbmp, dbmp->reginfo, - NULL, DB_FILE_ID_LEN, &mfp->fileid_off, &p)) != 0) - goto err; - memcpy(p, finfop->fileid, DB_FILE_ID_LEN); + dbenv = dbmfp->dbmp->dbenv; - F_SET(mfp, MP_CAN_MMAP); - } + MUTEX_LOCK(dbenv, &dbmfp->mfp->mutex); + *cntp = dbmfp->mfp->mpf_cnt; + MUTEX_UNLOCK(dbenv, &dbmfp->mfp->mutex); +} - /* Copy the page cookie into shared memory. */ - if (finfop->pgcookie == NULL || finfop->pgcookie->size == 0) { - mfp->pgcookie_len = 0; - mfp->pgcookie_off = 0; - } else { - if ((ret = __memp_alloc(dbmp, dbmp->reginfo, - NULL, finfop->pgcookie->size, &mfp->pgcookie_off, &p)) != 0) - goto err; - memcpy(p, finfop->pgcookie->data, finfop->pgcookie->size); - mfp->pgcookie_len = finfop->pgcookie->size; - } +/* + * __memp_set_unlink -- + * Set unlink on last close flag. + * + * XXX + * Undocumented interface: DB private. + */ +static void +__memp_set_unlink(dbmpf, set) + DB_MPOOLFILE *dbmpf; + int set; +{ + DB_ENV *dbenv; - /* Prepend the MPOOLFILE to the list of MPOOLFILE's. */ - mp = dbmp->reginfo[0].primary; - SH_TAILQ_INSERT_HEAD(&mp->mpfq, mfp, q, __mpoolfile); + dbenv = dbmpf->dbmp->dbenv; - if (0) { -err: if (mfp->path_off != 0) - __db_shalloc_free(dbmp->reginfo[0].addr, - R_ADDR(dbmp->reginfo, mfp->path_off)); - if (mfp->fileid_off != 0) - __db_shalloc_free(dbmp->reginfo[0].addr, - R_ADDR(dbmp->reginfo, mfp->fileid_off)); - if (mfp != NULL) - __db_shalloc_free(dbmp->reginfo[0].addr, mfp); -mem_err: __db_err(dbmp->dbenv, - "Unable to allocate memory for mpool file"); - } - return (ret); + MUTEX_LOCK(dbenv, &dbmpf->mfp->mutex); + if (set) + F_SET(dbmpf->mfp, MP_UNLINK); + else + F_CLR(dbmpf->mfp, MP_UNLINK); + MUTEX_UNLOCK(dbenv, &dbmpf->mfp->mutex); } /* * memp_fclose -- * Close a backing file for the memory pool. */ +static int +__memp_fclose(dbmfp, flags) + DB_MPOOLFILE *dbmfp; + u_int32_t flags; +{ + DB_ENV *dbenv; + int ret, t_ret; + + dbenv = dbmfp->dbmp->dbenv; + + PANIC_CHECK(dbenv); + + /* + * XXX + * DB_MPOOL_DISCARD: Undocumented flag: DB private. + */ + ret = __db_fchk(dbenv, "DB_MPOOLFILE->close", flags, DB_MPOOL_DISCARD); + + if ((t_ret = __memp_fclose_int(dbmfp, flags)) != 0 && ret == 0) + ret = t_ret; + + return (ret); +} + +/* + * __memp_fclose_int -- + * Internal version of __memp_fclose. + * + * PUBLIC: int __memp_fclose_int __P((DB_MPOOLFILE *, u_int32_t)); + */ int -memp_fclose(dbmfp) +__memp_fclose_int(dbmfp, flags) DB_MPOOLFILE *dbmfp; + u_int32_t flags; { DB_ENV *dbenv; DB_MPOOL *dbmp; MPOOLFILE *mfp; char *rpath; - int ret, t_ret; + int deleted, ret, t_ret; dbmp = dbmfp->dbmp; dbenv = dbmp->dbenv; ret = 0; - PANIC_CHECK(dbenv); - -#ifdef HAVE_RPC - if (F_ISSET(dbenv, DB_ENV_RPCCLIENT)) - return (__dbcl_memp_fclose(dbmfp)); -#endif - /* - * Remove the DB_MPOOLFILE from the queue. 
This has to happen before - * we perform any action that can fail, otherwise __memp_close may - * loop infinitely when calling us to discard all of the DB_MPOOLFILEs. + * We have to reference count DB_MPOOLFILE structures as other threads + * in the process may be using them. Here's the problem: + * + * Thread A opens a database. + * Thread B uses thread A's DB_MPOOLFILE to write a buffer + * in order to free up memory in the mpool cache. + * Thread A closes the database while thread B is using the + * DB_MPOOLFILE structure. + * + * By opening all databases before creating any threads, and closing + * the databases after all the threads have exited, applications get + * better performance and avoid the problem path entirely. + * + * Regardless, holding the DB_MPOOLFILE to flush a dirty buffer is a + * short-term lock, even in worst case, since we better be the only + * thread of control using the DB_MPOOLFILE structure to read pages + * *into* the cache. Wait until we're the only reference holder and + * remove the DB_MPOOLFILE structure from the list, so nobody else can + * find it. We do this, rather than have the last reference holder + * (whoever that might be) discard the DB_MPOOLFILE structure, because + * we'd rather write error messages to the application in the close + * routine, not in the checkpoint/sync routine. + * + * !!! + * It's possible the DB_MPOOLFILE was never added to the DB_MPOOLFILE + * file list, check the DB_OPEN_CALLED flag to be sure. */ - for (;;) { + for (deleted = 0;;) { MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp); - - /* - * We have to reference count DB_MPOOLFILE structures as other - * threads may be using them. The problem only happens if the - * application makes a bad design choice. Here's the path: - * - * Thread A opens a database. - * Thread B uses thread A's DB_MPOOLFILE to write a buffer - * in order to free up memory in the mpool cache. - * Thread A closes the database while thread B is using the - * DB_MPOOLFILE structure. - * - * By opening all databases before creating the threads, and - * closing them after the threads have exited, applications - * get better performance and avoid the problem path entirely. - * - * Regardless, holding the DB_MPOOLFILE to flush a dirty buffer - * is a short-term lock, even in worst case, since we better be - * the only thread of control using the DB_MPOOLFILE structure - * to read pages *into* the cache. Wait until we're the only - * reference holder and remove the DB_MPOOLFILE structure from - * the list, so nobody else can even find it. - */ if (dbmfp->ref == 1) { - TAILQ_REMOVE(&dbmp->dbmfq, dbmfp, q); - break; + if (F_ISSET(dbmfp, MP_OPEN_CALLED)) + TAILQ_REMOVE(&dbmp->dbmfq, dbmfp, q); + deleted = 1; } MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp); - (void)__os_sleep(dbenv, 1, 0); + if (deleted) + break; + __os_sleep(dbenv, 1, 0); } - MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp); /* Complain if pinned blocks never returned. */ - if (dbmfp->pinref != 0) + if (dbmfp->pinref != 0) { __db_err(dbenv, "%s: close: %lu blocks left pinned", __memp_fn(dbmfp), (u_long)dbmfp->pinref); + ret = __db_panic(dbenv, DB_RUNRECOVERY); + } /* Discard any mmap information. */ if (dbmfp->addr != NULL && @@ -615,11 +837,11 @@ memp_fclose(dbmfp) __db_err(dbenv, "%s: %s", __memp_fn(dbmfp), db_strerror(ret)); /* Close the file; temporary files may not yet have been created. 
*/ - if (F_ISSET(&dbmfp->fh, DB_FH_VALID) && - (t_ret = __os_closehandle(&dbmfp->fh)) != 0) { + if (F_ISSET(dbmfp->fhp, DB_FH_VALID) && + (t_ret = __os_closehandle(dbenv, dbmfp->fhp)) != 0) { __db_err(dbenv, "%s: %s", __memp_fn(dbmfp), db_strerror(t_ret)); - if (ret != 0) - t_ret = ret; + if (ret == 0) + ret = t_ret; } /* Discard the thread mutex. */ @@ -628,38 +850,51 @@ memp_fclose(dbmfp) /* * Discard our reference on the the underlying MPOOLFILE, and close - * it if it's no longer useful to anyone. - * - * If we're not discarding it, and it's a temp file, this means - * all the outstanding references belong to unflushed buffers. - * (A temp file can only be referenced by one DB_MPOOLFILE). - * We don't care about preserving any of those buffers, so mark - * the MPOOLFILE as dead so that when we try to flush them, - * even the dirty ones just get discarded. + * it if it's no longer useful to anyone. It possible the open of + * the file never happened or wasn't successful, in which case, mpf + * will be NULL; */ - R_LOCK(dbenv, dbmp->reginfo); - mfp = dbmfp->mfp; - if (--mfp->mpf_cnt == 0) { + if ((mfp = dbmfp->mfp) == NULL) + goto done; + + /* + * If it's a temp file, all outstanding references belong to unflushed + * buffers. (A temp file can only be referenced by one DB_MPOOLFILE). + * We don't care about preserving any of those buffers, so mark the + * MPOOLFILE as dead so that even the dirty ones just get discarded + * when we try to flush them. + */ + deleted = 0; + MUTEX_LOCK(dbenv, &mfp->mutex); + if (--mfp->mpf_cnt == 0 || LF_ISSET(DB_MPOOL_DISCARD)) { + if (LF_ISSET(DB_MPOOL_DISCARD) || + F_ISSET(mfp, MP_TEMP | MP_UNLINK)) + MPOOLFILE_IGNORE(mfp); if (F_ISSET(mfp, MP_UNLINK)) { - MEMP_FREMOVE(mfp); if ((t_ret = __db_appname(dbmp->dbenv, - DB_APP_DATA, NULL, R_ADDR(dbmp->reginfo, + DB_APP_DATA, R_ADDR(dbmp->reginfo, mfp->path_off), 0, NULL, &rpath)) != 0 && ret == 0) ret = t_ret; - if (t_ret == 0 && (t_ret = - __os_unlink(dbmp->dbenv, rpath) != 0 && ret == 0)) + if (t_ret == 0) { + if ((t_ret = __os_unlink( + dbmp->dbenv, rpath) != 0) && ret == 0) + ret = t_ret; + __os_free(dbenv, rpath); + } + } + if (mfp->block_cnt == 0) { + if ((t_ret = + __memp_mf_discard(dbmp, mfp)) != 0 && ret == 0) ret = t_ret; - __os_free(rpath, 0); + deleted = 1; } - if (mfp->block_cnt == 0) - __memp_mf_discard(dbmp, mfp); } - else if (F_ISSET(mfp, MP_TEMP)) - MEMP_FREMOVE(mfp); - R_UNLOCK(dbenv, dbmp->reginfo); + if (deleted == 0) + MUTEX_UNLOCK(dbenv, &mfp->mutex); /* Discard the DB_MPOOLFILE structure. */ - __os_free(dbmfp, sizeof(DB_MPOOLFILE)); +done: __os_free(dbenv, dbmfp->fhp); + __os_free(dbenv, dbmfp); return (ret); } @@ -668,20 +903,69 @@ memp_fclose(dbmfp) * __memp_mf_discard -- * Discard an MPOOLFILE. * - * PUBLIC: void __memp_mf_discard __P((DB_MPOOL *, MPOOLFILE *)); + * PUBLIC: int __memp_mf_discard __P((DB_MPOOL *, MPOOLFILE *)); */ -void +int __memp_mf_discard(dbmp, mfp) DB_MPOOL *dbmp; MPOOLFILE *mfp; { + DB_ENV *dbenv; + DB_FH fh; + DB_MPOOL_STAT *sp; MPOOL *mp; + char *rpath; + int ret; + dbenv = dbmp->dbenv; mp = dbmp->reginfo[0].primary; + ret = 0; + + /* + * Expects caller to be holding the MPOOLFILE mutex. + * + * When discarding a file, we have to flush writes from it to disk. + * The scenario is that dirty buffers from this file need to be + * flushed to satisfy a future checkpoint, but when the checkpoint + * calls mpool sync, the sync code won't know anything about them. 
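The close path above reduces to a small decision tree: the last handle (or an explicit discard) marks temporary, discarded, and unlink-on-close files dead; unlink-on-close files are removed from the filesystem; and the shared MPOOLFILE is reclaimed only once no buffers remain. A compact restatement, with stand-in fields for the MPOOLFILE flags and all mutexes omitted:

struct mfile {
	int mpf_cnt;			/* open handles on this file */
	int block_cnt;			/* buffers still in the cache */
	int is_temp, unlink_on_close, dead;
};

static void file_ignore(struct mfile *m) { m->dead = 1; }
static void file_unlink(struct mfile *m) { (void)m; }
static void file_discard(struct mfile *m) { (void)m; }

static void
mfile_close(struct mfile *m, int discard_flag)
{
	if (--m->mpf_cnt == 0 || discard_flag) {
		/* Dead files: remaining buffers are dropped, not written. */
		if (discard_flag || m->is_temp || m->unlink_on_close)
			file_ignore(m);
		if (m->unlink_on_close)
			file_unlink(m);
		/* Reclaim the region memory only once no buffers remain. */
		if (m->block_cnt == 0)
			file_discard(m);
	}
}

int
main(void)
{
	struct mfile m = { 1, 0, 1, 0, 0 };

	mfile_close(&m, 0);	/* temp file, last handle: dead, discarded */
	return (m.dead ? 0 : 1);
}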
+ */ + if (!F_ISSET(mfp, MP_DEADFILE) && + (ret = __db_appname(dbenv, DB_APP_DATA, + R_ADDR(dbmp->reginfo, mfp->path_off), 0, NULL, &rpath)) == 0) { + if ((ret = __os_open(dbenv, rpath, 0, 0, &fh)) == 0) { + ret = __os_fsync(dbenv, &fh); + (void)__os_closehandle(dbenv, &fh); + } + __os_free(dbenv, rpath); + } + + /* + * We have to release the MPOOLFILE lock before acquiring the region + * lock so that we don't deadlock. Make sure nobody ever looks at + * this structure again. + */ + MPOOLFILE_IGNORE(mfp); + + /* Discard the mutex we're holding. */ + MUTEX_UNLOCK(dbenv, &mfp->mutex); /* Delete from the list of MPOOLFILEs. */ + R_LOCK(dbenv, dbmp->reginfo); SH_TAILQ_REMOVE(&mp->mpfq, mfp, q, __mpoolfile); + /* Copy the statistics into the region. */ + sp = &mp->stat; + sp->st_cache_hit += mfp->stat.st_cache_hit; + sp->st_cache_miss += mfp->stat.st_cache_miss; + sp->st_map += mfp->stat.st_map; + sp->st_page_create += mfp->stat.st_page_create; + sp->st_page_in += mfp->stat.st_page_in; + sp->st_page_out += mfp->stat.st_page_out; + + /* Clear the mutex this MPOOLFILE recorded. */ + __db_shlocks_clear(&mfp->mutex, dbmp->reginfo, + (REGMAINT *)R_ADDR(dbmp->reginfo, mp->maint_off)); + /* Free the space. */ if (mfp->path_off != 0) __db_shalloc_free(dbmp->reginfo[0].addr, @@ -693,35 +977,10 @@ __memp_mf_discard(dbmp, mfp) __db_shalloc_free(dbmp->reginfo[0].addr, R_ADDR(dbmp->reginfo, mfp->pgcookie_off)); __db_shalloc_free(dbmp->reginfo[0].addr, mfp); -} - -/* - * __memp_fremove -- - * Remove an underlying file from the system. - * - * PUBLIC: int __memp_fremove __P((DB_MPOOLFILE *)); - */ -int -__memp_fremove(dbmfp) - DB_MPOOLFILE *dbmfp; -{ - DB_ENV *dbenv; - DB_MPOOL *dbmp; - MPOOLFILE *mfp; - - dbmp = dbmfp->dbmp; - dbenv = dbmp->dbenv; - mfp = dbmfp->mfp; - - PANIC_CHECK(dbenv); - - R_LOCK(dbenv, dbmp->reginfo); - - MEMP_FREMOVE(mfp); R_UNLOCK(dbenv, dbmp->reginfo); - return (0); + return (ret); } /* diff --git a/bdb/mp/mp_fput.c b/bdb/mp/mp_fput.c index be03b721f36..271e44a4ef8 100644 --- a/bdb/mp/mp_fput.c +++ b/bdb/mp/mp_fput.c @@ -1,13 +1,13 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Copyright (c) 1996-2002 * Sleepycat Software. All rights reserved. */ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: mp_fput.c,v 11.16 2000/11/30 00:58:41 ubell Exp $"; +static const char revid[] = "$Id: mp_fput.c,v 11.36 2002/08/09 19:04:11 bostic Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -15,43 +15,32 @@ static const char revid[] = "$Id: mp_fput.c,v 11.16 2000/11/30 00:58:41 ubell Ex #endif -#ifdef HAVE_RPC -#include "db_server.h" -#endif - #include "db_int.h" -#include "db_shash.h" -#include "mp.h" - -#ifdef HAVE_RPC -#include "gen_client_ext.h" -#include "rpc_client_ext.h" -#endif +#include "dbinc/db_shash.h" +#include "dbinc/mp.h" /* - * memp_fput -- + * __memp_fput -- * Mpool file put function. 
+ * + * PUBLIC: int __memp_fput __P((DB_MPOOLFILE *, void *, u_int32_t)); */ int -memp_fput(dbmfp, pgaddr, flags) +__memp_fput(dbmfp, pgaddr, flags) DB_MPOOLFILE *dbmfp; void *pgaddr; u_int32_t flags; { - BH *bhp; + BH *argbhp, *bhp, *prev; DB_ENV *dbenv; DB_MPOOL *dbmp; - MPOOL *c_mp, *mp; - int ret, wrote; + DB_MPOOL_HASH *hp; + MPOOL *c_mp; + u_int32_t n_cache; + int adjust, ret; dbmp = dbmfp->dbmp; dbenv = dbmp->dbenv; - mp = dbmp->reginfo[0].primary; - -#ifdef HAVE_RPC - if (F_ISSET(dbenv, DB_ENV_RPCCLIENT)) - return (__dbcl_memp_fput(dbmfp, pgaddr, flags)); -#endif PANIC_CHECK(dbenv); @@ -72,17 +61,6 @@ memp_fput(dbmfp, pgaddr, flags) } } - R_LOCK(dbenv, dbmp->reginfo); - - /* Decrement the pinned reference count. */ - if (dbmfp->pinref == 0) { - __db_err(dbenv, - "%s: more pages returned than retrieved", __memp_fn(dbmfp)); - R_UNLOCK(dbenv, dbmp->reginfo); - return (EINVAL); - } else - --dbmfp->pinref; - /* * If we're mapping the file, there's nothing to do. Because we can * stop mapping the file at any time, we have to check on each buffer @@ -90,97 +68,135 @@ memp_fput(dbmfp, pgaddr, flags) * region. */ if (dbmfp->addr != NULL && pgaddr >= dbmfp->addr && - (u_int8_t *)pgaddr <= (u_int8_t *)dbmfp->addr + dbmfp->len) { - R_UNLOCK(dbenv, dbmp->reginfo); + (u_int8_t *)pgaddr <= (u_int8_t *)dbmfp->addr + dbmfp->len) return (0); + +#ifdef DIAGNOSTIC + /* + * Decrement the per-file pinned buffer count (mapped pages aren't + * counted). + */ + R_LOCK(dbenv, dbmp->reginfo); + if (dbmfp->pinref == 0) { + ret = EINVAL; + __db_err(dbenv, + "%s: more pages returned than retrieved", __memp_fn(dbmfp)); + } else { + ret = 0; + --dbmfp->pinref; } + R_UNLOCK(dbenv, dbmp->reginfo); + if (ret != 0) + return (ret); +#endif - /* Convert the page address to a buffer header. */ + /* Convert a page address to a buffer header and hash bucket. */ bhp = (BH *)((u_int8_t *)pgaddr - SSZA(BH, buf)); + n_cache = NCACHE(dbmp->reginfo[0].primary, bhp->mf_offset, bhp->pgno); + c_mp = dbmp->reginfo[n_cache].primary; + hp = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab); + hp = &hp[NBUCKET(c_mp, bhp->mf_offset, bhp->pgno)]; - /* Convert the buffer header to a cache. */ - c_mp = BH_TO_CACHE(dbmp, bhp); - -/* UNLOCK THE REGION, LOCK THE CACHE. */ + MUTEX_LOCK(dbenv, &hp->hash_mutex); /* Set/clear the page bits. */ - if (LF_ISSET(DB_MPOOL_CLEAN) && F_ISSET(bhp, BH_DIRTY)) { - ++c_mp->stat.st_page_clean; - --c_mp->stat.st_page_dirty; + if (LF_ISSET(DB_MPOOL_CLEAN) && + F_ISSET(bhp, BH_DIRTY) && !F_ISSET(bhp, BH_DIRTY_CREATE)) { + DB_ASSERT(hp->hash_page_dirty != 0); + --hp->hash_page_dirty; F_CLR(bhp, BH_DIRTY); } if (LF_ISSET(DB_MPOOL_DIRTY) && !F_ISSET(bhp, BH_DIRTY)) { - --c_mp->stat.st_page_clean; - ++c_mp->stat.st_page_dirty; + ++hp->hash_page_dirty; F_SET(bhp, BH_DIRTY); } if (LF_ISSET(DB_MPOOL_DISCARD)) F_SET(bhp, BH_DISCARD); /* - * If the page is dirty and being scheduled to be written as part of - * a checkpoint, we no longer know that the log is up-to-date. - */ - if (F_ISSET(bhp, BH_DIRTY) && F_ISSET(bhp, BH_SYNC)) - F_SET(bhp, BH_SYNC_LOGFLSH); - - /* * Check for a reference count going to zero. This can happen if the * application returns a page twice. */ if (bhp->ref == 0) { __db_err(dbenv, "%s: page %lu: unpinned page returned", __memp_fn(dbmfp), (u_long)bhp->pgno); - R_UNLOCK(dbenv, dbmp->reginfo); + MUTEX_UNLOCK(dbenv, &hp->hash_mutex); return (EINVAL); } /* - * If more than one reference to the page, we're done. Ignore the - * discard flags (for now) and leave it at its position in the LRU - * chain. 
The rest gets done at last reference close.
+	 * If more than one reference to the page or a reference other than a
+	 * thread waiting to flush the buffer to disk, we're done.  Ignore the
+	 * discard flags (for now) and leave the buffer's priority alone.
 	 */
-	if (--bhp->ref > 0) {
-		R_UNLOCK(dbenv, dbmp->reginfo);
+	if (--bhp->ref > 1 || (bhp->ref == 1 && !F_ISSET(bhp, BH_LOCKED))) {
+		MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
 		return (0);
 	}
 
+	/* Update priority values. */
+	if (F_ISSET(bhp, BH_DISCARD) ||
+	    dbmfp->mfp->priority == MPOOL_PRI_VERY_LOW)
+		bhp->priority = 0;
+	else {
+		/*
+		 * We don't lock the LRU counter or the stat.st_pages field;
+		 * if we get garbage (which won't happen on a 32-bit machine),
+		 * it only means a buffer has the wrong priority.
+		 */
+		bhp->priority = c_mp->lru_count;
+
+		adjust = 0;
+		if (dbmfp->mfp->priority != 0)
+			adjust =
+			    (int)c_mp->stat.st_pages / dbmfp->mfp->priority;
+		if (F_ISSET(bhp, BH_DIRTY))
+			adjust += c_mp->stat.st_pages / MPOOL_PRI_DIRTY;
+
+		if (adjust > 0) {
+			if (UINT32_T_MAX - bhp->priority >= (u_int32_t)adjust)
+				bhp->priority += adjust;
+		} else if (adjust < 0)
+			if (bhp->priority > (u_int32_t)-adjust)
+				bhp->priority += adjust;
+	}
+
 	/*
-	 * Move the buffer to the head/tail of the LRU chain.  We do this
-	 * before writing the buffer for checkpoint purposes, as the write
-	 * can discard the region lock and allow another process to acquire
-	 * the buffer.  We could keep that from happening, but there seems
-	 * no reason to do so.
+	 * Buffers on hash buckets are sorted by priority -- move the buffer
+	 * to the correct position in the list.
 	 */
-	SH_TAILQ_REMOVE(&c_mp->bhq, bhp, q, __bh);
-	if (F_ISSET(bhp, BH_DISCARD))
-		SH_TAILQ_INSERT_HEAD(&c_mp->bhq, bhp, q, __bh);
+	argbhp = bhp;
+	SH_TAILQ_REMOVE(&hp->hash_bucket, argbhp, hq, __bh);
+
+	prev = NULL;
+	for (bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
+	    bhp != NULL; prev = bhp, bhp = SH_TAILQ_NEXT(bhp, hq, __bh))
+		if (bhp->priority > argbhp->priority)
+			break;
+	if (prev == NULL)
+		SH_TAILQ_INSERT_HEAD(&hp->hash_bucket, argbhp, hq, __bh);
 	else
-		SH_TAILQ_INSERT_TAIL(&c_mp->bhq, bhp, q);
+		SH_TAILQ_INSERT_AFTER(&hp->hash_bucket, prev, argbhp, hq, __bh);
+
+	/* Reset the hash bucket's priority. */
+	hp->hash_priority = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)->priority;
+
+#ifdef DIAGNOSTIC
+	__memp_check_order(hp);
+#endif
 
 	/*
-	 * If this buffer is scheduled for writing because of a checkpoint, we
-	 * need to write it (if it's dirty), or update the checkpoint counters
-	 * (if it's not dirty).  If we try to write it and can't, that's not
-	 * necessarily an error as it's not completely unreasonable that the
-	 * application doesn't have permission to write the underlying file;
-	 * set a flag so that the next time the memp_sync function is called
-	 * we try writing it there, as the checkpoint thread of control had
-	 * better be able to write all of the files.
+	 * The sync code has a separate counter for buffers on which it waits.
+	 * It reads that value without holding a lock so we update it as the
+	 * last thing we do.  Once that value goes to 0, we won't see another
+	 * reference to that buffer being returned to the cache until the sync
+	 * code has finished, so we're safe as long as we don't let the value
+	 * go to 0 before we finish with the buffer.
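The priority calculation above is easier to see with numbers. The values below are invented -- a 1000-page cache, an LRU counter at 50000, a file priority divisor of 10, and a dirty buffer -- and UINT_MAX stands in for UINT32_T_MAX; the guard applies the adjustment only when it cannot wrap the 32-bit counter:

#include <limits.h>
#include <stdio.h>

#define	MPOOL_PRI_DIRTY	10		/* stand-in for the real constant */

int
main(void)
{
	unsigned int lru_count, st_pages, priority;
	int adjust, dirty, file_pri;

	lru_count = 50000;		/* c_mp->lru_count */
	st_pages = 1000;		/* c_mp->stat.st_pages */
	file_pri = 10;			/* mfp->priority; 0 means default */
	dirty = 1;			/* F_ISSET(bhp, BH_DIRTY) */

	priority = lru_count;
	adjust = 0;
	if (file_pri != 0)
		adjust = (int)st_pages / file_pri;		/* +100 */
	if (dirty)
		adjust += (int)(st_pages / MPOOL_PRI_DIRTY);	/* +100 */

	/* Apply the adjustment only when it cannot wrap the counter. */
	if (adjust > 0) {
		if (UINT_MAX - priority >= (unsigned int)adjust)
			priority += (unsigned int)adjust;
	} else if (adjust < 0)
		if (priority > (unsigned int)-adjust)
			priority += (unsigned int)adjust;

	printf("priority = %u\n", priority);	/* prints 50200 */
	return (0);
}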
*/ - if (F_ISSET(bhp, BH_SYNC)) { - if (F_ISSET(bhp, BH_DIRTY)) { - if (__memp_bhwrite(dbmp, - dbmfp->mfp, bhp, NULL, &wrote) != 0 || !wrote) - F_SET(mp, MP_LSN_RETRY); - } else { - F_CLR(bhp, BH_SYNC); - - --mp->lsn_cnt; - --dbmfp->mfp->lsn_cnt; - } - } + if (F_ISSET(argbhp, BH_LOCKED) && argbhp->ref_sync != 0) + --argbhp->ref_sync; + + MUTEX_UNLOCK(dbenv, &hp->hash_mutex); - R_UNLOCK(dbenv, dbmp->reginfo); return (0); } diff --git a/bdb/mp/mp_fset.c b/bdb/mp/mp_fset.c index 08313c9b6f5..65cd6286ac9 100644 --- a/bdb/mp/mp_fset.c +++ b/bdb/mp/mp_fset.c @@ -1,13 +1,13 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Copyright (c) 1996-2002 * Sleepycat Software. All rights reserved. */ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: mp_fset.c,v 11.13 2000/11/30 00:58:41 ubell Exp $"; +static const char revid[] = "$Id: mp_fset.c,v 11.25 2002/05/03 15:21:17 bostic Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -15,25 +15,18 @@ static const char revid[] = "$Id: mp_fset.c,v 11.13 2000/11/30 00:58:41 ubell Ex #endif -#ifdef HAVE_RPC -#include "db_server.h" -#endif - #include "db_int.h" -#include "db_shash.h" -#include "mp.h" - -#ifdef HAVE_RPC -#include "gen_client_ext.h" -#include "rpc_client_ext.h" -#endif +#include "dbinc/db_shash.h" +#include "dbinc/mp.h" /* - * memp_fset -- + * __memp_fset -- * Mpool page set-flag routine. + * + * PUBLIC: int __memp_fset __P((DB_MPOOLFILE *, void *, u_int32_t)); */ int -memp_fset(dbmfp, pgaddr, flags) +__memp_fset(dbmfp, pgaddr, flags) DB_MPOOLFILE *dbmfp; void *pgaddr; u_int32_t flags; @@ -41,17 +34,13 @@ memp_fset(dbmfp, pgaddr, flags) BH *bhp; DB_ENV *dbenv; DB_MPOOL *dbmp; - MPOOL *c_mp, *mp; + DB_MPOOL_HASH *hp; + MPOOL *c_mp; + u_int32_t n_cache; int ret; dbmp = dbmfp->dbmp; dbenv = dbmp->dbenv; - mp = dbmp->reginfo[0].primary; - -#ifdef HAVE_RPC - if (F_ISSET(dbenv, DB_ENV_RPCCLIENT)) - return (__dbcl_memp_fset(dbmfp, pgaddr, flags)); -#endif PANIC_CHECK(dbenv); @@ -60,7 +49,7 @@ memp_fset(dbmfp, pgaddr, flags) return (__db_ferr(dbenv, "memp_fset", 1)); if ((ret = __db_fchk(dbenv, "memp_fset", flags, - DB_MPOOL_DIRTY | DB_MPOOL_CLEAN | DB_MPOOL_DISCARD)) != 0) + DB_MPOOL_CLEAN | DB_MPOOL_DIRTY | DB_MPOOL_DISCARD)) != 0) return (ret); if ((ret = __db_fcchk(dbenv, "memp_fset", flags, DB_MPOOL_CLEAN, DB_MPOOL_DIRTY)) != 0) @@ -72,27 +61,29 @@ memp_fset(dbmfp, pgaddr, flags) return (EACCES); } - /* Convert the page address to a buffer header. */ + /* Convert the page address to a buffer header and hash bucket. */ bhp = (BH *)((u_int8_t *)pgaddr - SSZA(BH, buf)); - - /* Convert the buffer header to a cache. */ - c_mp = BH_TO_CACHE(dbmp, bhp); - - R_LOCK(dbenv, dbmp->reginfo); - - if (LF_ISSET(DB_MPOOL_CLEAN) && F_ISSET(bhp, BH_DIRTY)) { - ++c_mp->stat.st_page_clean; - --c_mp->stat.st_page_dirty; + n_cache = NCACHE(dbmp->reginfo[0].primary, bhp->mf_offset, bhp->pgno); + c_mp = dbmp->reginfo[n_cache].primary; + hp = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab); + hp = &hp[NBUCKET(c_mp, bhp->mf_offset, bhp->pgno)]; + + MUTEX_LOCK(dbenv, &hp->hash_mutex); + + /* Set/clear the page bits. 
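
The NCACHE and NBUCKET macros live in dbinc/mp.h and aren't shown in this hunk; they hash the file's region offset and the page number together, then reduce modulo the cache count and the table size. A plausible reduction (the real macros' mixing may differ):

	static unsigned int
	pick_cache(unsigned int mf_offset, unsigned int pgno, unsigned int nreg)
	{
		return ((pgno ^ (mf_offset >> 3)) % nreg);	/* which region */
	}

	static unsigned int
	pick_bucket(unsigned int mf_offset, unsigned int pgno, unsigned int nbuckets)
	{
		return ((pgno ^ (mf_offset >> 3)) % nbuckets);	/* which bucket */
	}
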
*/ + if (LF_ISSET(DB_MPOOL_CLEAN) && + F_ISSET(bhp, BH_DIRTY) && !F_ISSET(bhp, BH_DIRTY_CREATE)) { + DB_ASSERT(hp->hash_page_dirty != 0); + --hp->hash_page_dirty; F_CLR(bhp, BH_DIRTY); } if (LF_ISSET(DB_MPOOL_DIRTY) && !F_ISSET(bhp, BH_DIRTY)) { - --c_mp->stat.st_page_clean; - ++c_mp->stat.st_page_dirty; + ++hp->hash_page_dirty; F_SET(bhp, BH_DIRTY); } if (LF_ISSET(DB_MPOOL_DISCARD)) F_SET(bhp, BH_DISCARD); - R_UNLOCK(dbenv, dbmp->reginfo); + MUTEX_UNLOCK(dbenv, &hp->hash_mutex); return (0); } diff --git a/bdb/mp/mp_method.c b/bdb/mp/mp_method.c index 85a6239b032..38f0a645f16 100644 --- a/bdb/mp/mp_method.c +++ b/bdb/mp/mp_method.c @@ -1,30 +1,30 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Copyright (c) 1996-2002 * Sleepycat Software. All rights reserved. */ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: mp_method.c,v 11.10 2000/04/04 20:12:04 bostic Exp $"; +static const char revid[] = "$Id: mp_method.c,v 11.29 2002/03/27 04:32:27 bostic Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES #include <sys/types.h> -#endif -#ifdef HAVE_RPC -#include "db_server.h" +#ifdef HAVE_RPC +#include <rpc/rpc.h> +#endif #endif #include "db_int.h" -#include "db_shash.h" -#include "mp.h" +#include "dbinc/db_shash.h" +#include "dbinc/mp.h" #ifdef HAVE_RPC -#include "gen_client_ext.h" -#include "rpc_client_ext.h" +#include "dbinc_auto/db_server.h" +#include "dbinc_auto/rpc_client_ext.h" #endif static int __memp_set_cachesize __P((DB_ENV *, u_int32_t, u_int32_t, int)); @@ -41,29 +41,46 @@ __memp_dbenv_create(dbenv) DB_ENV *dbenv; { /* + * !!! + * Our caller has not yet had the opportunity to reset the panic + * state or turn off mutex locking, and so we can neither check + * the panic state or acquire a mutex in the DB_ENV create path. + * * We default to 32 8K pages. We don't default to a flat 256K, because * some systems require significantly more memory to hold 32 pages than * others. For example, HP-UX with POSIX pthreads needs 88 bytes for * a POSIX pthread mutex and almost 200 bytes per buffer header, while - * Solaris needs 24 and 52 bytes for the same structures. + * Solaris needs 24 and 52 bytes for the same structures. The minimum + * number of hash buckets is 37. These contain a mutex also. */ - dbenv->mp_bytes = 32 * ((8 * 1024) + sizeof(BH)); + dbenv->mp_bytes = + 32 * ((8 * 1024) + sizeof(BH)) + 37 * sizeof(DB_MPOOL_HASH); dbenv->mp_ncache = 1; - dbenv->set_mp_mmapsize = __memp_set_mp_mmapsize; - dbenv->set_cachesize = __memp_set_cachesize; - -#ifdef HAVE_RPC - /* - * If we have a client, overwrite what we just setup to - * point to client functions. 
- */ +#ifdef HAVE_RPC if (F_ISSET(dbenv, DB_ENV_RPCCLIENT)) { dbenv->set_cachesize = __dbcl_env_cachesize; dbenv->set_mp_mmapsize = __dbcl_set_mp_mmapsize; - } + dbenv->memp_dump_region = NULL; + dbenv->memp_fcreate = __dbcl_memp_fcreate; + dbenv->memp_nameop = NULL; + dbenv->memp_register = __dbcl_memp_register; + dbenv->memp_stat = __dbcl_memp_stat; + dbenv->memp_sync = __dbcl_memp_sync; + dbenv->memp_trickle = __dbcl_memp_trickle; + } else #endif - + { + dbenv->set_cachesize = __memp_set_cachesize; + dbenv->set_mp_mmapsize = __memp_set_mp_mmapsize; + dbenv->memp_dump_region = __memp_dump_region; + dbenv->memp_fcreate = __memp_fcreate; + dbenv->memp_nameop = __memp_nameop; + dbenv->memp_register = __memp_register; + dbenv->memp_stat = __memp_stat; + dbenv->memp_sync = __memp_sync; + dbenv->memp_trickle = __memp_trickle; + } } /* @@ -78,26 +95,50 @@ __memp_set_cachesize(dbenv, gbytes, bytes, ncache) { ENV_ILLEGAL_AFTER_OPEN(dbenv, "set_cachesize"); - dbenv->mp_gbytes = gbytes + bytes / GIGABYTE; - dbenv->mp_bytes = bytes % GIGABYTE; - dbenv->mp_ncache = ncache == 0 ? 1 : ncache; + /* Normalize the values. */ + if (ncache == 0) + ncache = 1; /* - * If the application requested less than 500Mb, increase the - * cachesize by 25% to account for our overhead. (I'm guessing - * that caches over 500Mb are specifically sized, i.e., it's - * a large server and the application actually knows how much - * memory is available.) + * You can only store 4GB-1 in an unsigned 32-bit value, so correct for + * applications that specify 4GB cache sizes -- we know what they meant. + */ + if (gbytes / ncache == 4 && bytes == 0) { + --gbytes; + bytes = GIGABYTE - 1; + } else { + gbytes += bytes / GIGABYTE; + bytes %= GIGABYTE; + } + + /* Avoid too-large cache sizes, they result in a region size of zero. */ + if (gbytes / ncache > 4 || (gbytes / ncache == 4 && bytes != 0)) { + __db_err(dbenv, "individual cache size too large"); + return (EINVAL); + } + + /* + * If the application requested less than 500Mb, increase the cachesize + * by 25% and factor in the size of the hash buckets to account for our + * overhead. (I'm guessing caches over 500Mb are specifically sized, + * that is, it's a large server and the application actually knows how + * much memory is available. We only document the 25% overhead number, + * not the hash buckets, but I don't see a reason to confuse the issue, + * it shouldn't matter to an application.) * * There is a minimum cache size, regardless. */ - if (dbenv->mp_gbytes == 0) { - if (dbenv->mp_bytes < 500 * MEGABYTE) - dbenv->mp_bytes += dbenv->mp_bytes / 4; - if (dbenv->mp_bytes < DB_CACHESIZE_MIN) - dbenv->mp_bytes = DB_CACHESIZE_MIN; + if (gbytes == 0) { + if (bytes < 500 * MEGABYTE) + bytes += (bytes / 4) + 37 * sizeof(DB_MPOOL_HASH); + if (bytes / ncache < DB_CACHESIZE_MIN) + bytes = ncache * DB_CACHESIZE_MIN; } + dbenv->mp_gbytes = gbytes; + dbenv->mp_bytes = bytes; + dbenv->mp_ncache = ncache; + return (0); } diff --git a/bdb/mp/mp_region.c b/bdb/mp/mp_region.c index 4b85466ce63..06eca2f8646 100644 --- a/bdb/mp/mp_region.c +++ b/bdb/mp/mp_region.c @@ -1,13 +1,13 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Copyright (c) 1996-2002 * Sleepycat Software. All rights reserved. 
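
A worked example of the normalization above, with local macros standing in for the db_int.h values and the too-large check elided: a 100MB request in one cache comes out 25% larger, before the hash-bucket overhead the library adds on top.

	#include <stdio.h>

	#define	GIGABYTE	(1024 * 1024 * 1024)
	#define	MEGABYTE	(1024 * 1024)

	int
	main()
	{
		unsigned int gbytes, bytes, ncache;

		gbytes = 0, bytes = 100 * MEGABYTE, ncache = 1;

		if (ncache == 0)
			ncache = 1;
		if (gbytes / ncache == 4 && bytes == 0) {
			--gbytes;		/* a 4GB request becomes 4GB-1 */
			bytes = GIGABYTE - 1;
		} else {
			gbytes += bytes / GIGABYTE;
			bytes %= GIGABYTE;
		}
		if (gbytes == 0 && bytes < 500 * MEGABYTE)
			bytes += bytes / 4;	/* the real call also adds
						 * 37 * sizeof(DB_MPOOL_HASH) */

		/* Prints 0GB + 131072000 bytes: 100MB grew to 125MB. */
		printf("%uGB + %u bytes in %u cache(s)\n", gbytes, bytes, ncache);
		return (0);
	}
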
*/ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: mp_region.c,v 11.26 2000/11/30 00:58:41 ubell Exp $"; +static const char revid[] = "$Id: mp_region.c,v 11.49 2002/05/07 18:42:20 bostic Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -17,11 +17,11 @@ static const char revid[] = "$Id: mp_region.c,v 11.26 2000/11/30 00:58:41 ubell #endif #include "db_int.h" -#include "db_shash.h" -#include "mp.h" +#include "dbinc/db_shash.h" +#include "dbinc/mp.h" static int __mpool_init __P((DB_ENV *, DB_MPOOL *, int, int)); -#ifdef MUTEX_SYSTEM_RESOURCES +#ifdef HAVE_MUTEX_SYSTEM_RESOURCES static size_t __mpool_region_maint __P((REGINFO *)); #endif @@ -119,6 +119,8 @@ __memp_open(dbenv) regids[i] = dbmp->reginfo[i].id; } + + R_UNLOCK(dbenv, dbmp->reginfo); } else { /* * Determine how many regions there are going to be, allocate @@ -135,6 +137,19 @@ __memp_open(dbenv) dbmp->reginfo[i].id = INVALID_REGION_ID; dbmp->reginfo[0] = reginfo; + /* + * We have to unlock the primary mpool region before we attempt + * to join the additional mpool regions. If we don't, we can + * deadlock. The scenario is that we hold the primary mpool + * region lock. We then try to attach to an additional mpool + * region, which requires the acquisition/release of the main + * region lock (to search the list of regions). If another + * thread of control already holds the main region lock and is + * waiting on our primary mpool region lock, we'll deadlock. + * See [#4696] for more information. + */ + R_UNLOCK(dbenv, dbmp->reginfo); + /* Join remaining regions. */ regids = R_ADDR(dbmp->reginfo, mp->regids); for (i = 1; i < dbmp->nreg; ++i) { @@ -155,17 +170,10 @@ __memp_open(dbenv) R_ADDR(&dbmp->reginfo[i], dbmp->reginfo[i].rp->primary); /* If the region is threaded, allocate a mutex to lock the handles. */ - if (F_ISSET(dbenv, DB_ENV_THREAD)) { - if ((ret = __db_mutex_alloc( - dbenv, dbmp->reginfo, &dbmp->mutexp)) != 0) { - goto err; - } - if ((ret = - __db_mutex_init(dbenv, dbmp->mutexp, 0, MUTEX_THREAD)) != 0) - goto err; - } - - R_UNLOCK(dbenv, dbmp->reginfo); + if (F_ISSET(dbenv, DB_ENV_THREAD) && + (ret = __db_mutex_setup(dbenv, dbmp->reginfo, &dbmp->mutexp, + MUTEX_ALLOC | MUTEX_THREAD)) != 0) + goto err; dbenv->mp_handle = dbmp; return (0); @@ -180,12 +188,11 @@ err: if (dbmp->reginfo != NULL && dbmp->reginfo[0].addr != NULL) { if (dbmp->reginfo[i].id != INVALID_REGION_ID) (void)__db_r_detach( dbenv, &dbmp->reginfo[i], 0); - __os_free(dbmp->reginfo, - dbmp->nreg * sizeof(*dbmp->reginfo)); + __os_free(dbenv, dbmp->reginfo); } if (dbmp->mutexp != NULL) __db_mutex_free(dbenv, dbmp->reginfo, dbmp->mutexp); - __os_free(dbmp, sizeof(*dbmp)); + __os_free(dbenv, dbmp); return (ret); } @@ -199,13 +206,13 @@ __mpool_init(dbenv, dbmp, reginfo_off, htab_buckets) DB_MPOOL *dbmp; int reginfo_off, htab_buckets; { - DB_HASHTAB *htab; + DB_MPOOL_HASH *htab; MPOOL *mp; REGINFO *reginfo; -#ifdef MUTEX_SYSTEM_RESOURCES +#ifdef HAVE_MUTEX_SYSTEM_RESOURCES size_t maint_size; #endif - int ret; + int i, ret; void *p; mp = NULL; @@ -218,7 +225,7 @@ __mpool_init(dbenv, dbmp, reginfo_off, htab_buckets) mp = reginfo->primary; memset(mp, 0, sizeof(*mp)); -#ifdef MUTEX_SYSTEM_RESOURCES +#ifdef HAVE_MUTEX_SYSTEM_RESOURCES maint_size = __mpool_region_maint(reginfo); /* Allocate room for the maintenance info and initialize it. 
*/ if ((ret = __db_shalloc(reginfo->addr, @@ -231,14 +238,7 @@ __mpool_init(dbenv, dbmp, reginfo_off, htab_buckets) if (reginfo_off == 0) { SH_TAILQ_INIT(&mp->mpfq); - if ((ret = __db_shmutex_init(dbenv, &mp->sync_mutex, - R_OFFSET(dbmp->reginfo, &mp->sync_mutex) + - DB_FCNTL_OFF_MPOOL, 0, dbmp->reginfo, - (REGMAINT *)R_ADDR(dbmp->reginfo, mp->maint_off))) != 0) - goto err; - ZERO_LSN(mp->lsn); - mp->lsn_cnt = 0; mp->nreg = dbmp->nreg; if ((ret = __db_shalloc(dbmp->reginfo[0].addr, @@ -247,32 +247,41 @@ __mpool_init(dbenv, dbmp, reginfo_off, htab_buckets) mp->regids = R_OFFSET(dbmp->reginfo, p); } - SH_TAILQ_INIT(&mp->bhq); - /* Allocate hash table space and initialize it. */ if ((ret = __db_shalloc(reginfo->addr, - htab_buckets * sizeof(DB_HASHTAB), 0, &htab)) != 0) + htab_buckets * sizeof(DB_MPOOL_HASH), 0, &htab)) != 0) goto mem_err; - __db_hashinit(htab, htab_buckets); mp->htab = R_OFFSET(reginfo, htab); - mp->htab_buckets = htab_buckets; + for (i = 0; i < htab_buckets; i++) { + if ((ret = __db_mutex_setup(dbenv, + reginfo, &htab[i].hash_mutex, + MUTEX_NO_RLOCK)) != 0) + return (ret); + SH_TAILQ_INIT(&htab[i].hash_bucket); + htab[i].hash_page_dirty = htab[i].hash_priority = 0; + } + mp->htab_buckets = mp->stat.st_hash_buckets = htab_buckets; + /* + * Only the environment creator knows the total cache size, fill in + * those statistics now. + */ + mp->stat.st_gbytes = dbenv->mp_gbytes; + mp->stat.st_bytes = dbenv->mp_bytes; return (0); mem_err:__db_err(dbenv, "Unable to allocate memory for mpool region"); -err: if (reginfo->primary != NULL) - __db_shalloc_free(reginfo->addr, reginfo->primary); return (ret); } /* - * __memp_close -- - * Internal version of memp_close: only called from DB_ENV->close. + * __memp_dbenv_refresh -- + * Clean up after the mpool system on a close or failed open. * - * PUBLIC: int __memp_close __P((DB_ENV *)); + * PUBLIC: int __memp_dbenv_refresh __P((DB_ENV *)); */ int -__memp_close(dbenv) +__memp_dbenv_refresh(dbenv) DB_ENV *dbenv; { DB_MPOOL *dbmp; @@ -287,12 +296,12 @@ __memp_close(dbenv) /* Discard DB_MPREGs. */ while ((mpreg = LIST_FIRST(&dbmp->dbregq)) != NULL) { LIST_REMOVE(mpreg, q); - __os_free(mpreg, sizeof(DB_MPREG)); + __os_free(dbenv, mpreg); } /* Discard DB_MPOOLFILEs. */ while ((dbmfp = TAILQ_FIRST(&dbmp->dbmfq)) != NULL) - if ((t_ret = memp_fclose(dbmfp)) != 0 && ret == 0) + if ((t_ret = __memp_fclose_int(dbmfp, 0)) != 0 && ret == 0) ret = t_ret; /* Discard the thread mutex. */ @@ -305,14 +314,14 @@ __memp_close(dbenv) dbenv, &dbmp->reginfo[i], 0)) != 0 && ret == 0) ret = t_ret; - __os_free(dbmp->reginfo, dbmp->nreg * sizeof(*dbmp->reginfo)); - __os_free(dbmp, sizeof(*dbmp)); + __os_free(dbenv, dbmp->reginfo); + __os_free(dbenv, dbmp); dbenv->mp_handle = NULL; return (ret); } -#ifdef MUTEX_SYSTEM_RESOURCES +#ifdef HAVE_MUTEX_SYSTEM_RESOURCES /* * __mpool_region_maint -- * Return the amount of space needed for region maintenance info. @@ -328,9 +337,11 @@ __mpool_region_maint(infop) /* * For mutex maintenance we need one mutex per possible page. * Compute the maximum number of pages this cache can have. - * Also add in an mpool mutex. + * Also add in an mpool mutex and mutexes for all dbenv and db + * handles. 
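
The per-bucket setup amounts to the following sketch, with pthreads and heap allocation standing in for DB's mutex abstraction and shared-region allocator (so this is the shape of the loop, not its actual environment):

	#include <pthread.h>
	#include <stdlib.h>

	struct hash_bucket {			/* simplified DB_MPOOL_HASH */
		pthread_mutex_t mutex;		/* per-bucket latch */
		void *head;			/* priority-sorted buffer chain */
		unsigned int page_dirty;	/* dirty pages in this bucket */
		unsigned int priority;		/* lowest priority present */
	};

	static struct hash_bucket *
	init_table(unsigned int nbuckets)
	{
		struct hash_bucket *htab;
		unsigned int i;

		if ((htab = calloc(nbuckets, sizeof(*htab))) == NULL)
			return (NULL);
		for (i = 0; i < nbuckets; i++)
			if (pthread_mutex_init(&htab[i].mutex, NULL) != 0) {
				free(htab);
				return (NULL);
			}
		return (htab);
	}
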
*/ numlocks = ((infop->rp->size / DB_MIN_PGSIZE) + 1); + numlocks += DB_MAX_HANDLES; s = sizeof(roff_t) * numlocks; return (s); } @@ -347,11 +358,109 @@ __mpool_region_destroy(dbenv, infop) DB_ENV *dbenv; REGINFO *infop; { - MPOOL *mp; + __db_shlocks_destroy(infop, (REGMAINT *)R_ADDR(infop, + ((MPOOL *)R_ADDR(infop, infop->rp->primary))->maint_off)); COMPQUIET(dbenv, NULL); - mp = R_ADDR(infop, infop->rp->primary); + COMPQUIET(infop, NULL); +} + +/* + * __memp_nameop + * Remove or rename a file in the pool. + * + * PUBLIC: int __memp_nameop __P((DB_ENV *, + * PUBLIC: u_int8_t *, const char *, const char *, const char *)); + * + * XXX + * Undocumented interface: DB private. + */ +int +__memp_nameop(dbenv, fileid, newname, fullold, fullnew) + DB_ENV *dbenv; + u_int8_t *fileid; + const char *newname, *fullold, *fullnew; +{ + DB_MPOOL *dbmp; + MPOOL *mp; + MPOOLFILE *mfp; + roff_t newname_off; + int locked, ret; + void *p; + + locked = 0; + dbmp = NULL; - __db_shlocks_destroy(infop, (REGMAINT *)R_ADDR(infop, mp->maint_off)); - return; + if (!MPOOL_ON(dbenv)) + goto fsop; + + dbmp = dbenv->mp_handle; + mp = dbmp->reginfo[0].primary; + + /* + * Remove or rename a file that the mpool might know about. We assume + * that the fop layer has the file locked for exclusive access, so we + * don't worry about locking except for the mpool mutexes. Checkpoint + * can happen at any time, independent of file locking, so we have to + * do the actual unlink or rename system call to avoid any race. + * + * If this is a rename, allocate first, because we can't recursively + * grab the region lock. + */ + if (newname == NULL) + p = NULL; + else { + if ((ret = __memp_alloc(dbmp, dbmp->reginfo, + NULL, strlen(newname) + 1, &newname_off, &p)) != 0) + return (ret); + memcpy(p, newname, strlen(newname) + 1); + } + + locked = 1; + R_LOCK(dbenv, dbmp->reginfo); + + /* + * Find the file -- if mpool doesn't know about this file, that's not + * an error-- we may not have it open. + */ + for (mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile); + mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) { + /* Ignore non-active files. */ + if (F_ISSET(mfp, MP_DEADFILE | MP_TEMP)) + continue; + + /* Ignore non-matching files. */ + if (memcmp(fileid, R_ADDR( + dbmp->reginfo, mfp->fileid_off), DB_FILE_ID_LEN) != 0) + continue; + + /* If newname is NULL, we're removing the file. */ + if (newname == NULL) { + MUTEX_LOCK(dbenv, &mfp->mutex); + MPOOLFILE_IGNORE(mfp); + MUTEX_UNLOCK(dbenv, &mfp->mutex); + } else { + /* + * Else, it's a rename. We've allocated memory + * for the new name. Swap it with the old one. + */ + p = R_ADDR(dbmp->reginfo, mfp->path_off); + mfp->path_off = newname_off; + } + break; + } + + /* Delete the memory we no longer need. */ + if (p != NULL) + __db_shalloc_free(dbmp->reginfo[0].addr, p); + +fsop: if (newname == NULL) + (void)__os_unlink(dbenv, fullold); + else + (void)__os_rename(dbenv, fullold, fullnew, 1); + + if (locked) + R_UNLOCK(dbenv, dbmp->reginfo); + + return (0); } diff --git a/bdb/mp/mp_register.c b/bdb/mp/mp_register.c index 27859f69d7b..46eefad986f 100644 --- a/bdb/mp/mp_register.c +++ b/bdb/mp/mp_register.c @@ -1,38 +1,33 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Copyright (c) 1996-2002 * Sleepycat Software. All rights reserved. 
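
The allocate-then-lock ordering matters because __memp_alloc itself takes the region lock. The same pattern, reduced to a sketch with malloc and a plain mutex standing in for the region primitives:

	#include <pthread.h>
	#include <stdlib.h>
	#include <string.h>

	static pthread_mutex_t region_lock = PTHREAD_MUTEX_INITIALIZER;
	static char *current_path;

	static int
	rename_in_pool(const char *newname)
	{
		char *p, *old;

		if ((p = strdup(newname)) == NULL)	/* allocate first */
			return (-1);

		pthread_mutex_lock(&region_lock);	/* then swap, locked */
		old = current_path;
		current_path = p;
		pthread_mutex_unlock(&region_lock);

		free(old);				/* discard the old name */
		return (0);
	}
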
*/ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: mp_register.c,v 11.12 2000/11/15 19:25:39 sue Exp $"; +static const char revid[] = "$Id: mp_register.c,v 11.21 2002/03/27 04:32:27 bostic Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES #include <sys/types.h> #endif -#ifdef HAVE_RPC -#include "db_server.h" -#endif - #include "db_int.h" -#include "db_shash.h" -#include "mp.h" - -#ifdef HAVE_RPC -#include "gen_client_ext.h" -#include "rpc_client_ext.h" -#endif +#include "dbinc/db_shash.h" +#include "dbinc/mp.h" /* * memp_register -- * Register a file type's pgin, pgout routines. + * + * PUBLIC: int __memp_register __P((DB_ENV *, int, + * PUBLIC: int (*)(DB_ENV *, db_pgno_t, void *, DBT *), + * PUBLIC: int (*)(DB_ENV *, db_pgno_t, void *, DBT *))); */ int -memp_register(dbenv, ftype, pgin, pgout) +__memp_register(dbenv, ftype, pgin, pgout) DB_ENV *dbenv; int ftype; int (*pgin) __P((DB_ENV *, db_pgno_t, void *, DBT *)); @@ -42,13 +37,9 @@ memp_register(dbenv, ftype, pgin, pgout) DB_MPREG *mpreg; int ret; -#ifdef HAVE_RPC - if (F_ISSET(dbenv, DB_ENV_RPCCLIENT)) - return (__dbcl_memp_register(dbenv, ftype, pgin, pgout)); -#endif - PANIC_CHECK(dbenv); - ENV_REQUIRES_CONFIG(dbenv, dbenv->mp_handle, DB_INIT_MPOOL); + ENV_REQUIRES_CONFIG(dbenv, + dbenv->mp_handle, "DB_ENV->memp_register", DB_INIT_MPOOL); dbmp = dbenv->mp_handle; @@ -70,7 +61,7 @@ memp_register(dbenv, ftype, pgin, pgout) return (0); /* New entry. */ - if ((ret = __os_malloc(dbenv, sizeof(DB_MPREG), NULL, &mpreg)) != 0) + if ((ret = __os_malloc(dbenv, sizeof(DB_MPREG), &mpreg)) != 0) return (ret); mpreg->ftype = ftype; diff --git a/bdb/mp/mp_stat.c b/bdb/mp/mp_stat.c index 7982513448d..12e72b91d70 100644 --- a/bdb/mp/mp_stat.c +++ b/bdb/mp/mp_stat.c @@ -1,13 +1,13 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Copyright (c) 1996-2002 * Sleepycat Software. All rights reserved. */ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: mp_stat.c,v 11.21 2001/01/09 16:59:30 bostic Exp $"; +static const char revid[] = "$Id: mp_stat.c,v 11.51 2002/08/06 06:13:47 bostic Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -18,123 +18,150 @@ static const char revid[] = "$Id: mp_stat.c,v 11.21 2001/01/09 16:59:30 bostic E #include <unistd.h> #endif -#ifdef HAVE_RPC -#include "db_server.h" -#endif - #include "db_int.h" -#include "db_page.h" -#include "db_shash.h" -#include "db_am.h" -#include "mp.h" - -#ifdef HAVE_RPC -#include "gen_client_ext.h" -#include "rpc_client_ext.h" -#endif +#include "dbinc/db_page.h" +#include "dbinc/db_shash.h" +#include "dbinc/db_am.h" +#include "dbinc/mp.h" -static void __memp_dumpcache - __P((DB_MPOOL *, REGINFO *, size_t *, FILE *, u_int32_t)); +static void __memp_dumpcache __P((DB_ENV *, + DB_MPOOL *, REGINFO *, size_t *, FILE *, u_int32_t)); static void __memp_pbh __P((DB_MPOOL *, BH *, size_t *, FILE *)); +static void __memp_stat_wait __P((REGINFO *, MPOOL *, DB_MPOOL_STAT *, int)); /* - * memp_stat -- + * __memp_stat -- * Display MPOOL statistics. 
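
Applications drive this through the DB_ENV method. A sketch of registering byte-order conversion callbacks for an application page type; the type number and the callback bodies here are placeholders, only the signatures follow the interface:

	#include <db.h>

	#define	MY_FTYPE	1		/* application-chosen page type */

	static int
	my_pgin(DB_ENV *dbenv, db_pgno_t pgno, void *pgaddr, DBT *pgcookie)
	{
		/* Convert the page image at pgaddr to host byte order here. */
		return (0);
	}

	static int
	my_pgout(DB_ENV *dbenv, db_pgno_t pgno, void *pgaddr, DBT *pgcookie)
	{
		/* Convert the page image back to disk byte order here. */
		return (0);
	}

	static int
	register_conversion(DB_ENV *dbenv)
	{
		return (dbenv->memp_register(dbenv, MY_FTYPE, my_pgin, my_pgout));
	}
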
+ * + * PUBLIC: int __memp_stat + * PUBLIC: __P((DB_ENV *, DB_MPOOL_STAT **, DB_MPOOL_FSTAT ***, u_int32_t)); */ int -memp_stat(dbenv, gspp, fspp, db_malloc) +__memp_stat(dbenv, gspp, fspp, flags) DB_ENV *dbenv; DB_MPOOL_STAT **gspp; DB_MPOOL_FSTAT ***fspp; - void *(*db_malloc) __P((size_t)); + u_int32_t flags; { DB_MPOOL *dbmp; DB_MPOOL_FSTAT **tfsp, *tstruct; DB_MPOOL_STAT *sp; MPOOL *c_mp, *mp; MPOOLFILE *mfp; - char *tname; - size_t len, nlen; - u_int32_t i; + size_t len, nlen, pagesize; + u_int32_t pages, i; int ret; - char *name; - -#ifdef HAVE_RPC - if (F_ISSET(dbenv, DB_ENV_RPCCLIENT)) - return (__dbcl_memp_stat(dbenv, gspp, fspp, db_malloc)); -#endif + char *name, *tname; PANIC_CHECK(dbenv); - ENV_REQUIRES_CONFIG(dbenv, dbenv->mp_handle, DB_INIT_MPOOL); + ENV_REQUIRES_CONFIG(dbenv, + dbenv->mp_handle, "memp_stat", DB_INIT_MPOOL); + + if ((ret = __db_fchk(dbenv, + "DB_ENV->memp_stat", flags, DB_STAT_CLEAR)) != 0) + return (ret); dbmp = dbenv->mp_handle; - sp = NULL; + mp = dbmp->reginfo[0].primary; /* Global statistics. */ - mp = dbmp->reginfo[0].primary; if (gspp != NULL) { *gspp = NULL; - if ((ret = __os_calloc(dbenv, 1, sizeof(**gspp), gspp)) != 0) + if ((ret = __os_umalloc(dbenv, sizeof(**gspp), gspp)) != 0) return (ret); + memset(*gspp, 0, sizeof(**gspp)); sp = *gspp; /* * Initialization and information that is not maintained on * a per-cache basis. */ - sp->st_hash_longest = 0; - sp->st_region_wait = dbmp->reginfo[0].rp->mutex.mutex_set_wait; - sp->st_region_nowait = - dbmp->reginfo[0].rp->mutex.mutex_set_nowait; - sp->st_gbytes = dbenv->mp_gbytes; - sp->st_bytes = dbenv->mp_bytes; + c_mp = dbmp->reginfo[0].primary; + sp->st_gbytes = c_mp->stat.st_gbytes; + sp->st_bytes = c_mp->stat.st_bytes; sp->st_ncache = dbmp->nreg; sp->st_regsize = dbmp->reginfo[0].rp->size; - R_LOCK(dbenv, dbmp->reginfo); - /* Walk the cache list and accumulate the global information. 
*/ for (i = 0; i < mp->nreg; ++i) { c_mp = dbmp->reginfo[i].primary; + + sp->st_map += c_mp->stat.st_map; sp->st_cache_hit += c_mp->stat.st_cache_hit; sp->st_cache_miss += c_mp->stat.st_cache_miss; - sp->st_map += c_mp->stat.st_map; sp->st_page_create += c_mp->stat.st_page_create; sp->st_page_in += c_mp->stat.st_page_in; sp->st_page_out += c_mp->stat.st_page_out; sp->st_ro_evict += c_mp->stat.st_ro_evict; sp->st_rw_evict += c_mp->stat.st_rw_evict; + sp->st_page_trickle += c_mp->stat.st_page_trickle; + sp->st_pages += c_mp->stat.st_pages; + /* + * st_page_dirty calculated by __memp_stat_hash + * st_page_clean calculated here + */ + __memp_stat_hash( + &dbmp->reginfo[i], c_mp, &sp->st_page_dirty); + sp->st_page_clean = sp->st_pages - sp->st_page_dirty; sp->st_hash_buckets += c_mp->stat.st_hash_buckets; sp->st_hash_searches += c_mp->stat.st_hash_searches; - if (c_mp->stat.st_hash_longest > sp->st_hash_longest) - sp->st_hash_longest = - c_mp->stat.st_hash_longest; + sp->st_hash_longest += c_mp->stat.st_hash_longest; sp->st_hash_examined += c_mp->stat.st_hash_examined; - sp->st_page_clean += c_mp->stat.st_page_clean; - sp->st_page_dirty += c_mp->stat.st_page_dirty; - sp->st_page_trickle += c_mp->stat.st_page_trickle; - sp->st_region_wait += c_mp->stat.st_region_wait; - sp->st_region_nowait += c_mp->stat.st_region_nowait; + /* + * st_hash_nowait calculated by __memp_stat_wait + * st_hash_wait + */ + __memp_stat_wait(&dbmp->reginfo[i], c_mp, sp, flags); + sp->st_region_nowait += + dbmp->reginfo[i].rp->mutex.mutex_set_nowait; + sp->st_region_wait += + dbmp->reginfo[i].rp->mutex.mutex_set_wait; + sp->st_alloc += c_mp->stat.st_alloc; + sp->st_alloc_buckets += c_mp->stat.st_alloc_buckets; + if (sp->st_alloc_max_buckets < + c_mp->stat.st_alloc_max_buckets) + sp->st_alloc_max_buckets = + c_mp->stat.st_alloc_max_buckets; + sp->st_alloc_pages += c_mp->stat.st_alloc_pages; + if (sp->st_alloc_max_pages < + c_mp->stat.st_alloc_max_pages) + sp->st_alloc_max_pages = + c_mp->stat.st_alloc_max_pages; + + if (LF_ISSET(DB_STAT_CLEAR)) { + dbmp->reginfo[i].rp->mutex.mutex_set_wait = 0; + dbmp->reginfo[i].rp->mutex.mutex_set_nowait = 0; + pages = c_mp->stat.st_pages; + memset(&c_mp->stat, 0, sizeof(c_mp->stat)); + c_mp->stat.st_hash_buckets = c_mp->htab_buckets; + c_mp->stat.st_pages = pages; + } } /* - * We have duplicate statistics fields in the cache and - * per-file structures. The counters are only incremented - * in the per-file structures, though. The intent is that - * if we ever flush files from the pool we can save their - * last known totals in the cache structure. + * We have duplicate statistics fields in per-file structures + * and the cache. The counters are only incremented in the + * per-file structures, except if a file is flushed from the + * mpool, at which time we copy its information into the cache + * statistics. We added the cache information above, now we + * add the per-file information. 
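
A sketch of the calling side, assuming no application-specific allocator is configured, in which case the returned blocks come from the library's malloc and are released with free():

	#include <stdio.h>
	#include <stdlib.h>
	#include <db.h>

	static int
	print_cache_stats(DB_ENV *dbenv)
	{
		DB_MPOOL_STAT *gsp;
		DB_MPOOL_FSTAT **fsp, **p;
		int ret;

		/* Fetch global and per-file statistics, clearing as we go. */
		if ((ret = dbenv->memp_stat(dbenv, &gsp, &fsp, DB_STAT_CLEAR)) != 0)
			return (ret);

		printf("hit %lu, miss %lu\n",
		    (unsigned long)gsp->st_cache_hit,
		    (unsigned long)gsp->st_cache_miss);
		for (p = fsp; p != NULL && *p != NULL; ++p)
			printf("%s: %lu pages read in\n",
			    (*p)->file_name, (unsigned long)(*p)->st_page_in);

		free(gsp);	/* both blocks were allocated for us */
		free(fsp);
		return (0);
	}
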
*/ + R_LOCK(dbenv, dbmp->reginfo); for (mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile); mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) { + sp->st_map += mfp->stat.st_map; sp->st_cache_hit += mfp->stat.st_cache_hit; sp->st_cache_miss += mfp->stat.st_cache_miss; - sp->st_map += mfp->stat.st_map; sp->st_page_create += mfp->stat.st_page_create; sp->st_page_in += mfp->stat.st_page_in; sp->st_page_out += mfp->stat.st_page_out; + if (fspp == NULL && LF_ISSET(DB_STAT_CLEAR)) { + pagesize = mfp->stat.st_pagesize; + memset(&mfp->stat, 0, sizeof(mfp->stat)); + mfp->stat.st_pagesize = pagesize; + } } - R_UNLOCK(dbenv, dbmp->reginfo); } @@ -142,9 +169,8 @@ memp_stat(dbenv, gspp, fspp, db_malloc) if (fspp != NULL) { *fspp = NULL; - R_LOCK(dbenv, dbmp->reginfo); - /* Count the MPOOLFILE structures. */ + R_LOCK(dbenv, dbmp->reginfo); for (i = 0, len = 0, mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile); mfp != NULL; @@ -153,18 +179,15 @@ memp_stat(dbenv, gspp, fspp, db_malloc) sizeof(DB_MPOOL_FSTAT) + strlen(__memp_fns(dbmp, mfp)) + 1; len += sizeof(DB_MPOOL_FSTAT *); /* Trailing NULL */ - R_UNLOCK(dbenv, dbmp->reginfo); - if (len == 0) + if (i == 0) return (0); /* Allocate space */ - if ((ret = __os_malloc(dbenv, len, db_malloc, fspp)) != 0) + if ((ret = __os_umalloc(dbenv, len, fspp)) != 0) return (ret); - R_LOCK(dbenv, dbmp->reginfo); - /* * Build each individual entry. We assume that an array of * pointers are aligned correctly to be followed by an array @@ -179,20 +202,30 @@ memp_stat(dbenv, gspp, fspp, db_malloc) tstruct = (DB_MPOOL_FSTAT *)(tfsp + i + 1); tname = (char *)(tstruct + i); + /* + * Files may have been opened since we counted, don't walk + * off the end of the allocated space. + */ + R_LOCK(dbenv, dbmp->reginfo); for (mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile); - mfp != NULL; + mfp != NULL && i-- > 0; ++tfsp, ++tstruct, tname += nlen, mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) { name = __memp_fns(dbmp, mfp); nlen = strlen(name) + 1; *tfsp = tstruct; *tstruct = mfp->stat; + if (LF_ISSET(DB_STAT_CLEAR)) { + pagesize = mfp->stat.st_pagesize; + memset(&mfp->stat, 0, sizeof(mfp->stat)); + mfp->stat.st_pagesize = pagesize; + } tstruct->file_name = tname; memcpy(tname, name, nlen); } - *tfsp = NULL; - R_UNLOCK(dbenv, dbmp->reginfo); + + *tfsp = NULL; } return (0); } @@ -200,7 +233,6 @@ memp_stat(dbenv, gspp, fspp, db_malloc) #define FMAP_ENTRIES 200 /* Files we map. */ #define MPOOL_DUMP_HASH 0x01 /* Debug hash chains. */ -#define MPOOL_DUMP_LRU 0x02 /* Debug LRU chains. */ #define MPOOL_DUMP_MEM 0x04 /* Debug region memory. */ #define MPOOL_DUMP_ALL 0x07 /* Debug all. */ @@ -208,14 +240,23 @@ memp_stat(dbenv, gspp, fspp, db_malloc) * __memp_dump_region -- * Display MPOOL structures. * - * PUBLIC: void __memp_dump_region __P((DB_ENV *, char *, FILE *)); + * PUBLIC: int __memp_dump_region __P((DB_ENV *, char *, FILE *)); */ -void +int __memp_dump_region(dbenv, area, fp) DB_ENV *dbenv; char *area; FILE *fp; { + static const FN fn[] = { + { MP_CAN_MMAP, "mmapped" }, + { MP_DEADFILE, "dead" }, + { MP_DIRECT, "no buffer" }, + { MP_EXTENT, "extent" }, + { MP_TEMP, "temporary" }, + { MP_UNLINK, "unlink" }, + { 0, NULL } + }; DB_MPOOL *dbmp; DB_MPOOLFILE *dbmfp; MPOOL *mp; @@ -225,6 +266,10 @@ __memp_dump_region(dbenv, area, fp) int cnt; u_int8_t *p; + PANIC_CHECK(dbenv); + ENV_REQUIRES_CONFIG(dbenv, + dbenv->mp_handle, "memp_dump_region", DB_INIT_MPOOL); + dbmp = dbenv->mp_handle; /* Make it easy to call from the debugger. 
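
Given the debugger note, the intended use is presumably along these lines, using the "h" and "m" area flags handled below (the old "l" LRU listing is gone now that buffers live on per-bucket priority-sorted chains):

	(gdb) call __memp_dump_region(dbenv, "hm", stderr)
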
*/ @@ -239,40 +284,42 @@ __memp_dump_region(dbenv, area, fp) case 'h': LF_SET(MPOOL_DUMP_HASH); break; - case 'l': - LF_SET(MPOOL_DUMP_LRU); - break; case 'm': LF_SET(MPOOL_DUMP_MEM); break; } - R_LOCK(dbenv, dbmp->reginfo); - mp = dbmp->reginfo[0].primary; /* Display MPOOL structures. */ (void)fprintf(fp, "%s\nPool (region addr 0x%lx)\n", - DB_LINE, (u_long)dbmp->reginfo[0].addr); + DB_LINE, P_TO_ULONG(dbmp->reginfo[0].addr)); /* Display the MPOOLFILE structures. */ - cnt = 0; - for (mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile); + R_LOCK(dbenv, dbmp->reginfo); + for (cnt = 0, mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile); mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile), ++cnt) { - (void)fprintf(fp, "File #%d: %s: type %ld, %s\n\t [UID: ", - cnt + 1, __memp_fns(dbmp, mfp), (long)mfp->ftype, - F_ISSET(mfp, MP_CAN_MMAP) ? "mmap" : "read/write"); + (void)fprintf(fp, "File #%d: %s: pagesize %lu\n", cnt + 1, + __memp_fns(dbmp, mfp), (u_long)mfp->stat.st_pagesize); + (void)fprintf(fp, "\t type %ld; ref %lu; blocks %lu; last %lu;", + (long)mfp->ftype, (u_long)mfp->mpf_cnt, + (u_long)mfp->block_cnt, (u_long)mfp->last_pgno); + __db_prflags(mfp->flags, fn, fp); + + (void)fprintf(fp, "\n\t UID: "); p = R_ADDR(dbmp->reginfo, mfp->fileid_off); - for (i = 0; i < DB_FILE_ID_LEN; ++i) { - (void)fprintf(fp, "%x", *p++); + for (i = 0; i < DB_FILE_ID_LEN; ++i, ++p) { + (void)fprintf(fp, "%x", (u_int)*p); if (i < DB_FILE_ID_LEN - 1) (void)fprintf(fp, " "); } - (void)fprintf(fp, "]\n"); + (void)fprintf(fp, "\n"); if (cnt < FMAP_ENTRIES) fmap[cnt] = R_OFFSET(dbmp->reginfo, mfp); } + R_UNLOCK(dbenv, dbmp->reginfo); + MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp); for (dbmfp = TAILQ_FIRST(&dbmp->dbmfq); dbmfp != NULL; dbmfp = TAILQ_NEXT(dbmfp, q), ++cnt) { (void)fprintf(fp, "File #%d: %s: per-process, %s\n", @@ -281,6 +328,7 @@ __memp_dump_region(dbenv, area, fp) if (cnt < FMAP_ENTRIES) fmap[cnt] = R_OFFSET(dbmp->reginfo, mfp); } + MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp); if (cnt < FMAP_ENTRIES) fmap[cnt] = INVALID_ROFF; else @@ -289,13 +337,14 @@ __memp_dump_region(dbenv, area, fp) /* Dump the memory pools. */ for (i = 0; i < mp->nreg; ++i) { (void)fprintf(fp, "%s\nCache #%d:\n", DB_LINE, i + 1); - __memp_dumpcache(dbmp, &dbmp->reginfo[i], fmap, fp, flags); + __memp_dumpcache( + dbenv, dbmp, &dbmp->reginfo[i], fmap, fp, flags); } - R_UNLOCK(dbenv, dbmp->reginfo); - /* Flush in case we're debugging. */ (void)fflush(fp); + + return (0); } /* @@ -303,7 +352,8 @@ __memp_dump_region(dbenv, area, fp) * Display statistics for a cache. */ static void -__memp_dumpcache(dbmp, reginfo, fmap, fp, flags) +__memp_dumpcache(dbenv, dbmp, reginfo, fmap, fp, flags) + DB_ENV *dbenv; DB_MPOOL *dbmp; REGINFO *reginfo; size_t *fmap; @@ -311,7 +361,7 @@ __memp_dumpcache(dbmp, reginfo, fmap, fp, flags) u_int32_t flags; { BH *bhp; - DB_HASHTAB *dbht; + DB_MPOOL_HASH *hp; MPOOL *c_mp; int bucket; @@ -320,27 +370,24 @@ __memp_dumpcache(dbmp, reginfo, fmap, fp, flags) /* Display the hash table list of BH's. 
*/ if (LF_ISSET(MPOOL_DUMP_HASH)) { (void)fprintf(fp, - "%s\nBH hash table (%lu hash slots)\npageno, file, ref, address\n", + "%s\nBH hash table (%lu hash slots)\nbucket (priority):\n", DB_LINE, (u_long)c_mp->htab_buckets); - for (dbht = R_ADDR(reginfo, c_mp->htab), - bucket = 0; bucket < c_mp->htab_buckets; ++dbht, ++bucket) { - if (SH_TAILQ_FIRST(dbht, __bh) != NULL) - (void)fprintf(fp, "%lu:\n", (u_long)bucket); - for (bhp = SH_TAILQ_FIRST(dbht, __bh); - bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) + (void)fprintf(fp, + "\tpageno, file, ref, address [LSN] priority\n"); + + for (hp = R_ADDR(reginfo, c_mp->htab), + bucket = 0; bucket < c_mp->htab_buckets; ++hp, ++bucket) { + MUTEX_LOCK(dbenv, &hp->hash_mutex); + if ((bhp = + SH_TAILQ_FIRST(&hp->hash_bucket, __bh)) != NULL) + (void)fprintf(fp, "%lu (%u):\n", + (u_long)bucket, hp->hash_priority); + for (; bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) __memp_pbh(dbmp, bhp, fmap, fp); + MUTEX_UNLOCK(dbenv, &hp->hash_mutex); } } - /* Display the LRU list of BH's. */ - if (LF_ISSET(MPOOL_DUMP_LRU)) { - (void)fprintf(fp, "%s\nBH LRU list\n", DB_LINE); - (void)fprintf(fp, "pageno, file, ref, address\n"); - for (bhp = SH_TAILQ_FIRST(&c_mp->bhq, __bh); - bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh)) - __memp_pbh(dbmp, bhp, fmap, fp); - } - /* Dump the memory pool. */ if (LF_ISSET(MPOOL_DUMP_MEM)) __db_shalloc_dump(reginfo->addr, fp); @@ -360,10 +407,9 @@ __memp_pbh(dbmp, bhp, fmap, fp) static const FN fn[] = { { BH_CALLPGIN, "callpgin" }, { BH_DIRTY, "dirty" }, + { BH_DIRTY_CREATE, "created" }, { BH_DISCARD, "discard" }, { BH_LOCKED, "locked" }, - { BH_SYNC, "sync" }, - { BH_SYNC_LOGFLSH, "sync:logflush" }, { BH_TRASH, "trash" }, { 0, NULL } }; @@ -374,15 +420,72 @@ __memp_pbh(dbmp, bhp, fmap, fp) break; if (fmap[i] == INVALID_ROFF) - (void)fprintf(fp, " %4lu, %lu, %2lu, %lu", + (void)fprintf(fp, "\t%5lu, %lu, %2lu, %8lu [%lu,%lu] %lu", (u_long)bhp->pgno, (u_long)bhp->mf_offset, - (u_long)bhp->ref, (u_long)R_OFFSET(dbmp->reginfo, bhp)); + (u_long)bhp->ref, (u_long)R_OFFSET(dbmp->reginfo, bhp), + (u_long)LSN(bhp->buf).file, (u_long)LSN(bhp->buf).offset, + (u_long)bhp->priority); else - (void)fprintf(fp, " %4lu, #%d, %2lu, %lu", + (void)fprintf(fp, "\t%5lu, #%d, %2lu, %8lu [%lu,%lu] %lu", (u_long)bhp->pgno, i + 1, - (u_long)bhp->ref, (u_long)R_OFFSET(dbmp->reginfo, bhp)); + (u_long)bhp->ref, (u_long)R_OFFSET(dbmp->reginfo, bhp), + (u_long)LSN(bhp->buf).file, (u_long)LSN(bhp->buf).offset, + (u_long)bhp->priority); __db_prflags(bhp->flags, fn, fp); (void)fprintf(fp, "\n"); } + +/* + * __memp_stat_hash -- + * Total hash bucket stats (other than mutex wait) into the region. + * + * PUBLIC: void __memp_stat_hash __P((REGINFO *, MPOOL *, u_int32_t *)); + */ +void +__memp_stat_hash(reginfo, mp, dirtyp) + REGINFO *reginfo; + MPOOL *mp; + u_int32_t *dirtyp; +{ + DB_MPOOL_HASH *hp; + u_int32_t dirty; + int i; + + hp = R_ADDR(reginfo, mp->htab); + for (i = 0, dirty = 0; i < mp->htab_buckets; i++, hp++) + dirty += hp->hash_page_dirty; + *dirtyp = dirty; +} + +/* + * __memp_stat_wait -- + * Total hash bucket wait stats into the region. 
+ */ +static void +__memp_stat_wait(reginfo, mp, mstat, flags) + REGINFO *reginfo; + MPOOL *mp; + DB_MPOOL_STAT *mstat; + int flags; +{ + DB_MPOOL_HASH *hp; + DB_MUTEX *mutexp; + int i; + + mstat->st_hash_max_wait = 0; + hp = R_ADDR(reginfo, mp->htab); + for (i = 0; i < mp->htab_buckets; i++, hp++) { + mutexp = &hp->hash_mutex; + mstat->st_hash_nowait += mutexp->mutex_set_nowait; + mstat->st_hash_wait += mutexp->mutex_set_wait; + if (mutexp->mutex_set_wait > mstat->st_hash_max_wait) + mstat->st_hash_max_wait = mutexp->mutex_set_wait; + + if (LF_ISSET(DB_STAT_CLEAR)) { + mutexp->mutex_set_wait = 0; + mutexp->mutex_set_nowait = 0; + } + } +} diff --git a/bdb/mp/mp_sync.c b/bdb/mp/mp_sync.c index 1b0751db709..03b42208b39 100644 --- a/bdb/mp/mp_sync.c +++ b/bdb/mp/mp_sync.c @@ -1,13 +1,13 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Copyright (c) 1996-2002 * Sleepycat Software. All rights reserved. */ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: mp_sync.c,v 11.29 2001/01/11 18:19:53 bostic Exp $"; +static const char revid[] = "$Id: mp_sync.c,v 11.64 2002/08/25 16:00:27 bostic Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -16,339 +16,92 @@ static const char revid[] = "$Id: mp_sync.c,v 11.29 2001/01/11 18:19:53 bostic E #include <stdlib.h> #endif -#ifdef HAVE_RPC -#include "db_server.h" -#endif - #include "db_int.h" -#include "db_shash.h" -#include "mp.h" +#include "dbinc/db_shash.h" +#include "dbinc/mp.h" -#ifdef HAVE_RPC -#include "gen_client_ext.h" -#include "rpc_client_ext.h" -#endif +typedef struct { + DB_MPOOL_HASH *track_hp; /* Hash bucket. */ + + roff_t track_off; /* Page file offset. */ + db_pgno_t track_pgno; /* Page number. */ +} BH_TRACK; static int __bhcmp __P((const void *, const void *)); -static int __memp_fsync __P((DB_MPOOLFILE *)); -static int __memp_sballoc __P((DB_ENV *, BH ***, u_int32_t *)); +static int __memp_close_flush_files __P((DB_ENV *, DB_MPOOL *)); +static int __memp_sync_files __P((DB_ENV *, DB_MPOOL *)); /* - * memp_sync -- + * __memp_sync -- * Mpool sync function. + * + * PUBLIC: int __memp_sync __P((DB_ENV *, DB_LSN *)); */ int -memp_sync(dbenv, lsnp) +__memp_sync(dbenv, lsnp) DB_ENV *dbenv; DB_LSN *lsnp; { - BH *bhp, **bharray; DB_MPOOL *dbmp; - DB_LSN tlsn; - MPOOL *c_mp, *mp; - MPOOLFILE *mfp; - u_int32_t ar_cnt, i, ndirty; - int ret, retry_done, retry_need, wrote; - -#ifdef HAVE_RPC - if (F_ISSET(dbenv, DB_ENV_RPCCLIENT)) - return (__dbcl_memp_sync(dbenv, lsnp)); -#endif + MPOOL *mp; + int ret; PANIC_CHECK(dbenv); - ENV_REQUIRES_CONFIG(dbenv, dbenv->mp_handle, DB_INIT_MPOOL); - - dbmp = dbenv->mp_handle; - mp = dbmp->reginfo[0].primary; + ENV_REQUIRES_CONFIG(dbenv, + dbenv->mp_handle, "memp_sync", DB_INIT_MPOOL); /* - * If no LSN is provided, flush the entire cache. - * - * !!! - * Our current behavior is to flush the entire cache, so there's - * nothing special we have to do here other than deal with NULL - * pointers. + * If no LSN is provided, flush the entire cache (reasonable usage + * even if there's no log subsystem configured). 
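
A sketch of the usual application-level call through the DB_ENV method: a NULL LSN asks for the entire cache to be flushed, while a real LSN returns immediately once the cache has already been flushed that far.

	#include <db.h>

	static int
	flush_cache(DB_ENV *dbenv)
	{
		return (dbenv->memp_sync(dbenv, NULL));
	}
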
*/ - if (lsnp == NULL) { - ZERO_LSN(tlsn); - lsnp = &tlsn; - F_SET(mp, MP_LSN_RETRY); - } else if (!LOGGING_ON(dbenv)) { - __db_err(dbenv, "memp_sync: requires logging"); - return (EINVAL); - } + if (lsnp != NULL) + ENV_REQUIRES_CONFIG(dbenv, + dbenv->lg_handle, "memp_sync", DB_INIT_LOG); - /* - * Sync calls are single-threaded so that we don't have multiple - * threads, with different checkpoint LSNs, walking the caches - * and updating the checkpoint LSNs and how many buffers remain - * to be written for the checkpoint. This shouldn't be a problem, - * any application that has multiple checkpoint threads isn't what - * I'd call trustworthy. - */ - MUTEX_LOCK(dbenv, &mp->sync_mutex, dbenv->lockfhp); + dbmp = dbenv->mp_handle; + mp = dbmp->reginfo[0].primary; - /* - * If the application is asking about a previous call to memp_sync(), - * and we haven't found any buffers that the application holding the - * pin couldn't write, return yes or no based on the current count. - * Note, if the application is asking about a LSN *smaller* than one - * we've already handled or are currently handling, then we return a - * result based on the count for the larger LSN. - */ - R_LOCK(dbenv, dbmp->reginfo); - if (!IS_ZERO_LSN(*lsnp) && - !F_ISSET(mp, MP_LSN_RETRY) && log_compare(lsnp, &mp->lsn) <= 0) { - if (mp->lsn_cnt == 0) { + /* If we've flushed to the requested LSN, return that information. */ + if (lsnp != NULL) { + R_LOCK(dbenv, dbmp->reginfo); + if (log_compare(lsnp, &mp->lsn) <= 0) { *lsnp = mp->lsn; - ret = 0; - } else - ret = DB_INCOMPLETE; + R_UNLOCK(dbenv, dbmp->reginfo); + return (0); + } R_UNLOCK(dbenv, dbmp->reginfo); - MUTEX_UNLOCK(dbenv, &mp->sync_mutex); - return (ret); } - /* - * Allocate room for a list of buffers, and decide how many buffers - * we can pin down. - * - * !!! - * Note: __memp_sballoc has released the region lock if we're not - * continuing forward. - */ - if ((ret = - __memp_sballoc(dbenv, &bharray, &ndirty)) != 0 || ndirty == 0) { - MUTEX_UNLOCK(dbenv, &mp->sync_mutex); + if ((ret = __memp_sync_int(dbenv, NULL, 0, DB_SYNC_CACHE, NULL)) != 0) return (ret); - } - retry_done = 0; -retry: retry_need = 0; - /* - * Start a new checkpoint. - * - * Save the LSN. We know that it's a new LSN, a retry, or larger than - * the one for which we were already doing a checkpoint. (BTW, I don't - * expect to see multiple LSN's from the same or multiple processes, - * but You Just Never Know. Responding as if they all called with the - * largest of the LSNs specified makes everything work.) - * - * We don't currently use the LSN we save. We could potentially save - * the last-written LSN in each buffer header and use it to determine - * what buffers need to be written. The problem with this is that it's - * sizeof(LSN) more bytes of buffer header. We currently write all the - * dirty buffers instead, but with a sufficiently large cache that's - * going to be a problem. - */ - mp->lsn = *lsnp; - - /* - * Clear the global count of buffers waiting to be written, walk the - * list of files clearing the count of buffers waiting to be written. - * - * Clear the retry flag. - */ - mp->lsn_cnt = 0; - for (mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile); - mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) - mfp->lsn_cnt = 0; - F_CLR(mp, MP_LSN_RETRY); - - /* - * Walk each cache's list of buffers and mark all dirty buffers to be - * written and all pinned buffers to be potentially written (we can't - * know if they'll need to be written until the holder returns them to - * the cache). 
We do this in one pass while holding the region locked - * so that processes can't make new buffers dirty, causing us to never - * finish. Since the application may have restarted the sync using a - * different LSN value, clear any BH_SYNC | BH_SYNC_LOGFLSH flags that - * appear leftover from previous calls. - * - * Keep a count of the total number of buffers we need to write in - * MPOOL->lsn_cnt, and for each file, in MPOOLFILE->lsn_count. - */ - for (ar_cnt = 0, i = 0; i < mp->nreg; ++i) { - c_mp = dbmp->reginfo[i].primary; - for (bhp = SH_TAILQ_FIRST(&c_mp->bhq, __bh); - bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh)) { - if (F_ISSET(bhp, BH_DIRTY) || bhp->ref != 0) { - F_SET(bhp, BH_SYNC); - - ++mp->lsn_cnt; - - mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset); - ++mfp->lsn_cnt; - - /* - * If the buffer isn't being used, we can write - * it immediately, so increment its reference - * count to lock it down, and save a reference - * to it. - * - * If we've run out space to store buffer refs, - * we're screwed. We don't want to realloc the - * array while holding a region lock, so we set - * a flag and deal with it later. - */ - if (bhp->ref == 0) { - ++bhp->ref; - bharray[ar_cnt] = bhp; - - if (++ar_cnt >= ndirty) { - retry_need = 1; - break; - } - } - } else - if (F_ISSET(bhp, BH_SYNC)) - F_CLR(bhp, BH_SYNC | BH_SYNC_LOGFLSH); - } - if (ar_cnt >= ndirty) - break; - } - - /* If there no buffers we can write immediately, we're done. */ - if (ar_cnt == 0) { - ret = mp->lsn_cnt ? DB_INCOMPLETE : 0; - goto done; - } - - R_UNLOCK(dbenv, dbmp->reginfo); - - /* - * Sort the buffers we're going to write immediately. - * - * We try and write the buffers in file/page order: it should reduce - * seeks by the underlying filesystem and possibly reduce the actual - * number of writes. - */ - if (ar_cnt > 1) - qsort(bharray, ar_cnt, sizeof(BH *), __bhcmp); - - /* - * Flush the log. We have to ensure the log records reflecting the - * changes on the database pages we're writing have already made it - * to disk. We usually do that as we write each page, but if we - * are going to write a large number of pages, repeatedly acquiring - * the log region lock is going to be expensive. Flush the entire - * log now, so that sync doesn't require any more log flushes. - */ - if (LOGGING_ON(dbenv) && (ret = log_flush(dbenv, NULL)) != 0) - goto done; - - R_LOCK(dbenv, dbmp->reginfo); - - /* Walk the array, writing buffers. */ - for (i = 0; i < ar_cnt; ++i) { - /* - * It's possible for a thread to have gotten the buffer since - * we listed it for writing. If the reference count is still - * 1, we're the only ones using the buffer, go ahead and write. - * If it's >1, then skip the buffer and assume that it will be - * written when it's returned to the cache. - */ - if (bharray[i]->ref > 1) { - --bharray[i]->ref; - continue; - } - - /* Write the buffer. */ - mfp = R_ADDR(dbmp->reginfo, bharray[i]->mf_offset); - ret = __memp_bhwrite(dbmp, mfp, bharray[i], NULL, &wrote); - - /* Release the buffer. */ - --bharray[i]->ref; - - if (ret == 0 && wrote) - continue; - - /* - * Any process syncing the shared memory buffer pool had best - * be able to write to any underlying file. Be understanding, - * but firm, on this point. - */ - if (ret == 0) { - __db_err(dbenv, "%s: unable to flush page: %lu", - __memp_fns(dbmp, mfp), (u_long)bharray[i]->pgno); - ret = EPERM; - } - - /* - * On error, clear MPOOL->lsn and set MP_LSN_RETRY so that no - * future checkpoint return can depend on this failure. 
Clear - * the buffer's BH_SYNC flag, because it's used to determine - * if lsn_cnt values are incremented/decremented. Don't bother - * to reset/clear: - * - * MPOOL->lsn_cnt - * MPOOLFILE->lsn_cnt - * - * they don't make any difference. - */ - ZERO_LSN(mp->lsn); - F_SET(mp, MP_LSN_RETRY); - - /* Release any buffers we're still pinning down. */ - while (++i < ar_cnt) { - bhp = bharray[i]; - --bhp->ref; - F_CLR(bhp, BH_SYNC | BH_SYNC_LOGFLSH); - } - - goto done; - } - - ret = mp->lsn_cnt != 0 ? DB_INCOMPLETE : 0; - - /* - * If there were too many buffers and we're not returning an error, we - * re-try the checkpoint once -- since we allocated 80% of the total - * buffer count, once should be enough. If it still doesn't work, some - * other thread of control is dirtying buffers as fast as we're writing - * them, and we might as well give up for now. In the latter case, set - * the global retry flag, we'll have to start from scratch on the next - * checkpoint. - */ - if (retry_need) { - if (retry_done) { - ret = DB_INCOMPLETE; - F_SET(mp, MP_LSN_RETRY); - } else { - retry_done = 1; - goto retry; - } + if (lsnp != NULL) { + R_LOCK(dbenv, dbmp->reginfo); + if (log_compare(lsnp, &mp->lsn) > 0) + mp->lsn = *lsnp; + R_UNLOCK(dbenv, dbmp->reginfo); } -done: R_UNLOCK(dbenv, dbmp->reginfo); - MUTEX_UNLOCK(dbenv, &mp->sync_mutex); - - __os_free(bharray, ndirty * sizeof(BH *)); - - return (ret); + return (0); } /* - * memp_fsync -- + * __memp_fsync -- * Mpool file sync function. + * + * PUBLIC: int __memp_fsync __P((DB_MPOOLFILE *)); */ int -memp_fsync(dbmfp) +__memp_fsync(dbmfp) DB_MPOOLFILE *dbmfp; { DB_ENV *dbenv; DB_MPOOL *dbmp; - int is_tmp; dbmp = dbmfp->dbmp; dbenv = dbmp->dbenv; -#ifdef HAVE_RPC - if (F_ISSET(dbenv, DB_ENV_RPCCLIENT)) - return (__dbcl_memp_fsync(dbmfp)); -#endif - PANIC_CHECK(dbenv); /* @@ -359,13 +112,10 @@ memp_fsync(dbmfp) if (F_ISSET(dbmfp, MP_READONLY)) return (0); - R_LOCK(dbenv, dbmp->reginfo); - is_tmp = F_ISSET(dbmfp->mfp, MP_TEMP); - R_UNLOCK(dbenv, dbmp->reginfo); - if (is_tmp) + if (F_ISSET(dbmfp->mfp, MP_TEMP)) return (0); - return (__memp_fsync(dbmfp)); + return (__memp_sync_int(dbenv, dbmfp, 0, DB_SYNC_FILE, NULL)); } /* @@ -379,6 +129,7 @@ __mp_xxx_fh(dbmfp, fhp) DB_MPOOLFILE *dbmfp; DB_FH **fhp; { + DB_ENV *dbenv; /* * This is a truly spectacular layering violation, intended ONLY to * support compatibility for the DB 1.85 DB->fd call. @@ -393,239 +144,457 @@ __mp_xxx_fh(dbmfp, fhp) * because we want to write to the backing file regardless so that * we get a file descriptor to return. */ - *fhp = &dbmfp->fh; - return (F_ISSET(&dbmfp->fh, DB_FH_VALID) ? 0 : __memp_fsync(dbmfp)); + *fhp = dbmfp->fhp; + if (F_ISSET(dbmfp->fhp, DB_FH_VALID)) + return (0); + dbenv = dbmfp->dbmp->dbenv; + + return (__memp_sync_int(dbenv, dbmfp, 0, DB_SYNC_FILE, NULL)); } /* - * __memp_fsync -- - * Mpool file internal sync function. + * __memp_sync_int -- + * Mpool sync internal function. 
+ * + * PUBLIC: int __memp_sync_int + * PUBLIC: __P((DB_ENV *, DB_MPOOLFILE *, int, db_sync_op, int *)); */ -static int -__memp_fsync(dbmfp) +int +__memp_sync_int(dbenv, dbmfp, ar_max, op, wrotep) + DB_ENV *dbenv; DB_MPOOLFILE *dbmfp; + int ar_max, *wrotep; + db_sync_op op; { - BH *bhp, **bharray; - DB_ENV *dbenv; + BH *bhp; + BH_TRACK *bharray; DB_MPOOL *dbmp; + DB_MPOOL_HASH *hp; + DB_MUTEX *mutexp; MPOOL *c_mp, *mp; - size_t mf_offset; - u_int32_t ar_cnt, i, ndirty; - int incomplete, ret, retry_done, retry_need, wrote; + MPOOLFILE *mfp; + u_int32_t n_cache; + int ar_cnt, hb_lock, i, pass, remaining, ret, t_ret, wait_cnt, wrote; - dbmp = dbmfp->dbmp; - dbenv = dbmp->dbenv; + dbmp = dbenv->mp_handle; mp = dbmp->reginfo[0].primary; - - R_LOCK(dbenv, dbmp->reginfo); + pass = wrote = 0; /* - * Allocate room for a list of buffers, and decide how many buffers - * we can pin down. - * - * !!! - * Note: __memp_sballoc has released our region lock if we're not - * continuing forward. + * If the caller does not specify how many pages assume one + * per bucket. */ + if (ar_max == 0) + ar_max = mp->nreg * mp->htab_buckets; + if ((ret = - __memp_sballoc(dbenv, &bharray, &ndirty)) != 0 || ndirty == 0) + __os_malloc(dbenv, ar_max * sizeof(BH_TRACK), &bharray)) != 0) return (ret); - retry_done = 0; -retry: retry_need = 0; /* * Walk each cache's list of buffers and mark all dirty buffers to be - * written and all pinned buffers to be potentially written (we can't - * know if they'll need to be written until the holder returns them to - * the cache). We do this in one pass while holding the region locked - * so that processes can't make new buffers dirty, causing us to never - * finish. + * written and all pinned buffers to be potentially written, depending + * on our flags. */ - mf_offset = R_OFFSET(dbmp->reginfo, dbmfp->mfp); - for (ar_cnt = 0, incomplete = 0, i = 0; i < mp->nreg; ++i) { - c_mp = dbmp->reginfo[i].primary; - for (bhp = SH_TAILQ_FIRST(&c_mp->bhq, __bh); - bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh)) { - if (!F_ISSET(bhp, BH_DIRTY) || - bhp->mf_offset != mf_offset) - continue; - if (bhp->ref != 0 || F_ISSET(bhp, BH_LOCKED)) { - incomplete = 1; - continue; - } + for (ar_cnt = 0, n_cache = 0; n_cache < mp->nreg; ++n_cache) { + c_mp = dbmp->reginfo[n_cache].primary; + hp = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab); + for (i = 0; i < c_mp->htab_buckets; i++, hp++) { /* - * If the buffer isn't being used, we can write - * it immediately, so increment its reference - * count to lock it down, and save a reference - * to it. - * - * If we've run out space to store buffer refs, - * we're screwed. We don't want to realloc the - * array while holding a region lock, so we set - * a flag and deal with it later. + * We can check for empty buckets before locking as we + * only care if the pointer is zero or non-zero. We + * can ignore empty buckets because we only need write + * buffers that were dirty before we started. */ - ++bhp->ref; - bharray[ar_cnt] = bhp; - if (++ar_cnt >= ndirty) { - retry_need = 1; - break; + if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL) + continue; + + MUTEX_LOCK(dbenv, &hp->hash_mutex); + for (bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh); + bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) { + /* Always ignore unreferenced, clean pages. */ + if (bhp->ref == 0 && !F_ISSET(bhp, BH_DIRTY)) + continue; + + /* + * Checkpoints have to wait on all pinned pages, + * as pages may be marked dirty when returned to + * the cache. 
+				 *
+				 * File syncs only wait on pages both pinned and
+				 * dirty.  (We don't care if pages are marked
+				 * dirty when returned to the cache, that means
+				 * there's another writing thread and flushing
+				 * the cache for this handle is meaningless.)
+				 */
+				if (op == DB_SYNC_FILE &&
+				    !F_ISSET(bhp, BH_DIRTY))
+					continue;
+
+				mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
+
+				/*
+				 * Ignore temporary files -- this means you
+				 * can't even flush temporary files by handle.
+				 * (Checkpoint doesn't require temporary files
+				 * be flushed and the underlying buffer write
+				 * routine may not be able to write it anyway.)
+				 */
+				if (F_ISSET(mfp, MP_TEMP))
+					continue;
+
+				/*
+				 * If we're flushing a specific file, see if
+				 * this page is from that file.
+				 */
+				if (dbmfp != NULL && mfp != dbmfp->mfp)
+					continue;
+
+				/*
+				 * Ignore files that aren't involved in DB's
+				 * transactional operations during checkpoints.
+				 */
+				if (dbmfp == NULL && mfp->lsn_off == -1)
+					continue;
+
+				/* Track the buffer, we want it. */
+				bharray[ar_cnt].track_hp = hp;
+				bharray[ar_cnt].track_pgno = bhp->pgno;
+				bharray[ar_cnt].track_off = bhp->mf_offset;
+				ar_cnt++;
+
+				if (ar_cnt >= ar_max) {
+					if ((ret = __os_realloc(dbenv,
+					    (ar_max * 2) * sizeof(BH_TRACK),
+					    &bharray)) != 0)
+						break;
+					ar_max *= 2;
+				}
 			}
+			MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
+
+			if (ret != 0)
+				goto err;
 		}
-		if (ar_cnt >= ndirty)
-			break;
 	}
 
-	/* If there no buffers we can write immediately, we're done. */
-	if (ar_cnt == 0) {
-		ret = 0;
+	/* If there are no buffers to write, we're done. */
+	if (ar_cnt == 0)
 		goto done;
-	}
 
-	R_UNLOCK(dbenv, dbmp->reginfo);
-
-	/* Sort the buffers we're going to write. */
+	/*
+	 * Write the buffers in file/page order, trying to reduce seeks by the
+	 * filesystem and, when pages are smaller than filesystem block sizes,
+	 * reduce the actual number of writes.
+	 */
 	if (ar_cnt > 1)
-		qsort(bharray, ar_cnt, sizeof(BH *), __bhcmp);
+		qsort(bharray, ar_cnt, sizeof(BH_TRACK), __bhcmp);
 
-	R_LOCK(dbenv, dbmp->reginfo);
+	/*
+	 * If we're trickling buffers, only write enough to reach the correct
+	 * percentage for this region.  We may not write enough if the dirty
+	 * buffers have an unbalanced distribution among the regions, but that
+	 * seems unlikely.
+	 */
+	if (op == DB_SYNC_TRICKLE && ar_cnt > ar_max / (int)mp->nreg)
+		ar_cnt = ar_max / (int)mp->nreg;
+
+	/*
+	 * Flush the log.  We have to ensure the log records reflecting the
+	 * changes on the database pages we're writing have already made it
+	 * to disk.  We still have to check the log each time we write a page
+	 * (because pages we are about to write may be modified after we have
+	 * flushed the log), but in general this will at least avoid any I/O
+	 * on the log's part.
+	 */
+	if (LOGGING_ON(dbenv) && (ret = dbenv->log_flush(dbenv, NULL)) != 0)
+		goto err;
+
+	/*
+	 * Walk the array, writing buffers.  When we write a buffer, we NULL
+	 * out its hash bucket pointer so we don't process a slot more than
+	 * once.
+	 */
+	for (remaining = ar_cnt, i = pass = 0; remaining > 0; ++i) {
+		if (i >= ar_cnt) {
+			i = 0;
+			++pass;
+			__os_sleep(dbenv, 1, 0);
+		}
+		if ((hp = bharray[i].track_hp) == NULL)
+			continue;
+
+		/* Lock the hash bucket and find the buffer. */
+		mutexp = &hp->hash_mutex;
+		MUTEX_LOCK(dbenv, mutexp);
+		for (bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
+		    bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh))
+			if (bhp->pgno == bharray[i].track_pgno &&
+			    bhp->mf_offset == bharray[i].track_off)
+				break;
 
-	/* Walk the array, writing buffers.
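
__bhcmp's body falls outside this hunk; given the BH_TRACK layout it presumably orders by file (the shared-region offset) and then by page number, so writes land sequentially within each file. Along these lines:

	static int
	__bhcmp_sketch(const void *p1, const void *p2)
	{
		const BH_TRACK *bhp1 = p1, *bhp2 = p2;

		/* Sort by file, then by page number within the file. */
		if (bhp1->track_off != bhp2->track_off)
			return (bhp1->track_off < bhp2->track_off ? -1 : 1);
		if (bhp1->track_pgno != bhp2->track_pgno)
			return (bhp1->track_pgno < bhp2->track_pgno ? -1 : 1);
		return (0);
	}
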
*/ - for (i = 0; i < ar_cnt;) { /* - * It's possible for a thread to have gotten the buffer since - * we listed it for writing. If the reference count is still - * 1, we're the only ones using the buffer, go ahead and write. - * If it's >1, then skip the buffer and assume that it will be - * written when it's returned to the cache. + * If we can't find the buffer we're done, somebody else had + * to have written it. + * + * If the buffer isn't pinned or dirty, we're done, there's + * no work needed. */ - if (bharray[i]->ref > 1) { - incomplete = 1; - --bharray[i++]->ref; + if (bhp == NULL || (bhp->ref == 0 && !F_ISSET(bhp, BH_DIRTY))) { + MUTEX_UNLOCK(dbenv, mutexp); + --remaining; + bharray[i].track_hp = NULL; continue; } - /* Write the buffer. */ - ret = __memp_pgwrite(dbmp, dbmfp, bharray[i], NULL, &wrote); + /* + * If the buffer is locked by another thread, ignore it, we'll + * come back to it. + * + * If the buffer is pinned and it's only the first or second + * time we have looked at it, ignore it, we'll come back to + * it. + * + * In either case, skip the buffer if we're not required to + * write it. + */ + if (F_ISSET(bhp, BH_LOCKED) || (bhp->ref != 0 && pass < 2)) { + MUTEX_UNLOCK(dbenv, mutexp); + if (op != DB_SYNC_CACHE && op != DB_SYNC_FILE) { + --remaining; + bharray[i].track_hp = NULL; + } + continue; + } + + /* + * The buffer is either pinned or dirty. + * + * Set the sync wait-for count, used to count down outstanding + * references to this buffer as they are returned to the cache. + */ + bhp->ref_sync = bhp->ref; - /* Release the buffer. */ - --bharray[i++]->ref; + /* Pin the buffer into memory and lock it. */ + ++bhp->ref; + F_SET(bhp, BH_LOCKED); + MUTEX_LOCK(dbenv, &bhp->mutex); - if (ret == 0) { - if (!wrote) - incomplete = 1; - continue; + /* + * Unlock the hash bucket and wait for the wait-for count to + * go to 0. No new thread can acquire the buffer because we + * have it locked. + * + * If a thread attempts to re-pin a page, the wait-for count + * will never go to 0 (the thread spins on our buffer lock, + * while we spin on the thread's ref count). Give up if we + * don't get the buffer in 3 seconds, we can try again later. + * + * If, when the wait-for count goes to 0, the buffer is found + * to be dirty, write it. + */ + MUTEX_UNLOCK(dbenv, mutexp); + for (wait_cnt = 1; + bhp->ref_sync != 0 && wait_cnt < 4; ++wait_cnt) + __os_sleep(dbenv, 1, 0); + MUTEX_LOCK(dbenv, mutexp); + hb_lock = 1; + + /* + * If the ref_sync count has gone to 0, we're going to be done + * with this buffer no matter what happens. + */ + if (bhp->ref_sync == 0) { + --remaining; + bharray[i].track_hp = NULL; } /* - * On error: + * If the ref_sync count has gone to 0 and the buffer is still + * dirty, we write it. We only try to write the buffer once. + * Any process checkpointing or trickle-flushing the pool + * must be able to write any underlying file -- if the write + * fails, error out. It would be very strange if file sync + * failed to write, but we don't care if it happens. 
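
The DB_SYNC_TRICKLE case above is presumably what DB_ENV->memp_trickle drives (mp_trickle.c is part of this change). A sketch of the application-level call, asking that at least 20% of the cache be clean and learning how many buffers that cost:

	#include <db.h>

	static int
	trickle_some(DB_ENV *dbenv)
	{
		int nwrote;

		return (dbenv->memp_trickle(dbenv, 20, &nwrote));
	}
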
 /*
- * On error:
+ * If the ref_sync count has gone to 0 and the buffer is still
+ * dirty, we write it. We only try to write the buffer once.
+ * Any process checkpointing or trickle-flushing the pool
+ * must be able to write any underlying file -- if the write
+ * fails, error out. It would be very strange if file sync
+ * failed to write, but we don't care if it happens.
+ */
+ if (bhp->ref_sync == 0 && F_ISSET(bhp, BH_DIRTY)) {
+ hb_lock = 0;
+ MUTEX_UNLOCK(dbenv, mutexp);
+
+ mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
+ if ((ret = __memp_bhwrite(dbmp, hp, mfp, bhp, 1)) == 0)
+ ++wrote;
+ else if (op == DB_SYNC_CACHE || op == DB_SYNC_TRICKLE)
+ __db_err(dbenv, "%s: unable to flush page: %lu",
+ __memp_fns(dbmp, mfp), (u_long)bhp->pgno);
+ else
+ ret = 0;
+ }
+
+ /*
+ * If ref_sync count never went to 0, the buffer was written
+ * by another thread, or the write failed, we still have the
+ * buffer locked.
+ *
+ * We may or may not currently hold the hash bucket mutex. If
+ * the __memp_bhwrite -> __memp_pgwrite call was successful,
+ * then __memp_pgwrite will have swapped the buffer lock for
+ * the hash lock. All other call paths will leave us without
+ * the hash bucket lock.
 *
- * Release any buffers we're still pinning down.
+ * The order of mutexes above was to acquire the buffer lock
+ * while holding the hash bucket lock. Don't deadlock here,
+ * release the buffer lock and then acquire the hash bucket
+ * lock.
 */
- while (i < ar_cnt)
- --bharray[i++]->ref;
- break;
- }
+ if (F_ISSET(bhp, BH_LOCKED)) {
+ F_CLR(bhp, BH_LOCKED);
+ MUTEX_UNLOCK(dbenv, &bhp->mutex);
- /*
- * If there were too many buffers and we're not returning an error, we
- * re-try the flush once -- since we allocated 80% of the total
- * buffer count, once should be enough. If it still doesn't work, some
- * other thread of control is dirtying buffers as fast as we're writing
- * them, and we might as well give up.
- */
- if (retry_need) {
- if (retry_done)
- incomplete = 1;
- else {
- retry_done = 1;
- goto retry;
+ if (!hb_lock)
+ MUTEX_LOCK(dbenv, mutexp);
 }
- }
-done: R_UNLOCK(dbenv, dbmp->reginfo);
+ /*
+ * Reset the ref_sync count regardless of our success, we're
+ * done with this buffer for now.
+ */
+ bhp->ref_sync = 0;
+
+ /* Discard our reference and unlock the bucket. */
+ --bhp->ref;
+ MUTEX_UNLOCK(dbenv, mutexp);
- __os_free(bharray, ndirty * sizeof(BH *));
+ if (ret != 0)
+ break;
+ }
+
+done: /* If we've opened files to flush pages, close them. */
+ if ((t_ret = __memp_close_flush_files(dbenv, dbmp)) != 0 && ret == 0)
+ ret = t_ret;
 /*
- * Sync the underlying file as the last thing we do, so that the OS
- * has a maximal opportunity to flush buffers before we request it.
- *
- * !!!:
- * Don't lock the region around the sync, fsync(2) has no atomicity
- * issues.
+ * If doing a checkpoint or flushing a file for the application, we
+ * have to force the pages to disk. We don't do this as we go along
+ * because we want to give the OS as much time as possible to lazily
+ * flush, and because we have to flush files that might not even have
+ * had dirty buffers in the cache, so we have to walk the files list.
 */
- if (ret == 0)
- ret = incomplete ?
- DB_INCOMPLETE : __os_fsync(dbenv, &dbmfp->fh);
+ if (ret == 0 && (op == DB_SYNC_CACHE || op == DB_SYNC_FILE)) {
+ if (dbmfp == NULL)
+ ret = __memp_sync_files(dbenv, dbmp);
+ else
+ ret = __os_fsync(dbenv, dbmfp->fhp);
+ }
+
+err: __os_free(dbenv, bharray);
+ if (wrotep != NULL)
+ *wrotep = wrote;
 return (ret);
 }
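The function ends by separating write scheduling from durability: page writes are issued as the array is walked, and only in the done path are files forced to disk, giving the OS time to flush lazily. A simplified single-file sketch of that two-phase shape; flush_then_force and its parameters are invented, and error handling is trimmed:

#include <sys/types.h>
#include <unistd.h>

/*
 * flush_then_force --
 *	Phase 1: issue every dirty-page write.  Phase 2: a single
 *	fsync makes them all durable at once.
 */
static int
flush_then_force(int dbfd,
    const off_t *offsets, char *const *pages, int npages, size_t pgsz)
{
	int i;

	for (i = 0; i < npages; ++i)
		if (pwrite(dbfd, pages[i], pgsz, offsets[i]) != (ssize_t)pgsz)
			return (-1);
	return (fsync(dbfd));	/* one sync per file, not one per page */
}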
 /*
- * __memp_sballoc --
- * Allocate room for a list of buffers.
+ * __memp_sync_files --
+ * Sync all the files in the environment, open or not.
 */
-static int
-__memp_sballoc(dbenv, bharrayp, ndirtyp)
+static int
+__memp_sync_files(dbenv, dbmp)
 DB_ENV *dbenv;
- BH ***bharrayp;
- u_int32_t *ndirtyp;
-{
 DB_MPOOL *dbmp;
- MPOOL *c_mp, *mp;
- u_int32_t i, nclean, ndirty, maxpin;
- int ret;
+{
+ DB_MPOOLFILE *dbmfp;
+ MPOOL *mp;
+ MPOOLFILE *mfp;
+ int ret, t_ret;
- dbmp = dbenv->mp_handle;
+ ret = 0;
 mp = dbmp->reginfo[0].primary;
- /*
- * We don't want to hold the region lock while we write the buffers,
- * so only lock it while we create a list.
- *
- * Walk through the list of caches, figuring out how many buffers
- * we're going to need.
- *
- * Make a point of not holding the region lock across the library
- * allocation call.
- */
- for (nclean = ndirty = 0, i = 0; i < mp->nreg; ++i) {
- c_mp = dbmp->reginfo[i].primary;
- ndirty += c_mp->stat.st_page_dirty;
- nclean += c_mp->stat.st_page_clean;
+ R_LOCK(dbenv, dbmp->reginfo);
+ for (mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile);
+ mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) {
+ if (mfp->stat.st_page_out == 0 ||
+ F_ISSET(mfp, MP_DEADFILE | MP_TEMP))
+ continue;
+
+ /* Look for an already open handle. */
+ ret = 0;
+ MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp);
+ for (dbmfp = TAILQ_FIRST(&dbmp->dbmfq);
+ dbmfp != NULL; dbmfp = TAILQ_NEXT(dbmfp, q))
+ if (dbmfp->mfp == mfp) {
+ ret = __os_fsync(dbenv, dbmfp->fhp);
+ break;
+ }
+ MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
+ if (ret != 0)
+ goto err;
+
+ /* If we don't find one, open one. */
+ if (dbmfp == NULL) {
+ if ((ret = dbenv->memp_fcreate(dbenv, &dbmfp, 0)) != 0)
+ goto err;
+ ret = __memp_fopen_int(
+ dbmfp, mfp, R_ADDR(dbmp->reginfo, mfp->path_off),
+ 0, 0, mfp->stat.st_pagesize);
+ if (ret == 0)
+ ret = __os_fsync(dbenv, dbmfp->fhp);
+ if ((t_ret =
+ __memp_fclose_int(dbmfp, 0)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ goto err;
+ }
 }
- R_UNLOCK(dbenv, dbmp->reginfo);
- if (ndirty == 0) {
- *ndirtyp = 0;
- return (0);
+
+ if (0) {
+err: __db_err(dbenv, "%s: cannot sync: %s",
+ R_ADDR(dbmp->reginfo, mfp->path_off), db_strerror(ret));
 }
+ R_UNLOCK(dbenv, dbmp->reginfo);
- /*
- * We don't want to pin down the entire buffer cache, otherwise we'll
- * starve threads needing new pages. Don't pin down more than 80% of
- * the cache, making sure that we don't screw up just because only a
- * few pages have been created.
- */
- maxpin = ((ndirty + nclean) * 8) / 10;
- if (maxpin < 10)
- maxpin = 10;
+ return (ret);
+}
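__memp_sync_files() has to sync every file whether or not the environment currently holds an open handle for it, so it reuses a handle when one exists and otherwise opens, syncs and closes a throwaway one. The same reuse-or-open shape in bare POSIX terms; sync_one_file is hypothetical, and -1 stands for "no open descriptor":

#include <fcntl.h>
#include <unistd.h>

/*
 * sync_one_file --
 *	Reuse an already open descriptor if we have one; otherwise
 *	open, fsync and close a temporary descriptor.
 */
static int
sync_one_file(const char *path, int openfd)
{
	int fd, ret;

	if (openfd != -1)
		return (fsync(openfd));
	if ((fd = open(path, O_RDWR)) == -1)
		return (-1);
	ret = fsync(fd);
	(void)close(fd);
	return (ret);
}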
+
+/*
+ * __memp_close_flush_files --
+ * Close files opened only to flush buffers.
+ */
+static int
+__memp_close_flush_files(dbenv, dbmp)
+ DB_ENV *dbenv;
+ DB_MPOOL *dbmp;
+{
+ DB_MPOOLFILE *dbmfp;
+ int ret;
 /*
- * Get a good-sized block of memory to hold buffer pointers, we don't
- * want to run out, but correct if we want to allocate more than we
- * would be allowed to store, regardless.
+ * The routine exists because we must close files opened by sync to
+ * flush buffers. There are two cases: first, extent files have to
+ * be closed so they may be removed when empty. Second, regular
+ * files have to be closed so we don't run out of descriptors (for
+ * example, an application partitioning its data into databases
+ * based on timestamps, so there's a continually increasing set of
+ * files).
+ *
+ * We mark files opened in the __memp_bhwrite() function with the
+ * MP_FLUSH flag. Here we walk through our file descriptor list,
+ * and, if a file was opened by __memp_bhwrite(), we close it.
 */
- ndirty += ndirty / 2 + 10;
- if (ndirty > maxpin)
- ndirty = maxpin;
- if ((ret =
- __os_malloc(dbenv, ndirty * sizeof(BH *), NULL, bharrayp)) != 0)
- return (ret);
-
- *ndirtyp = ndirty;
-
- R_LOCK(dbenv, dbmp->reginfo);
+retry: MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp);
+ for (dbmfp = TAILQ_FIRST(&dbmp->dbmfq);
+ dbmfp != NULL; dbmfp = TAILQ_NEXT(dbmfp, q))
+ if (F_ISSET(dbmfp, MP_FLUSH)) {
+ F_CLR(dbmfp, MP_FLUSH);
+ MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
+ if ((ret = __memp_fclose_int(dbmfp, 0)) != 0)
+ return (ret);
+ goto retry;
+ }
+ MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
 return (0);
 }
@@ -634,15 +603,15 @@ static int
 __bhcmp(p1, p2)
 const void *p1, *p2;
 {
- BH *bhp1, *bhp2;
+ BH_TRACK *bhp1, *bhp2;
- bhp1 = *(BH * const *)p1;
- bhp2 = *(BH * const *)p2;
+ bhp1 = (BH_TRACK *)p1;
+ bhp2 = (BH_TRACK *)p2;
 /* Sort by file (shared memory pool offset). */
- if (bhp1->mf_offset < bhp2->mf_offset)
+ if (bhp1->track_off < bhp2->track_off)
 return (-1);
- if (bhp1->mf_offset > bhp2->mf_offset)
+ if (bhp1->track_off > bhp2->track_off)
 return (1);
 /*
@@ -650,9 +619,9 @@ __bhcmp(p1, p2)
 * Defend against badly written quicksort code calling the comparison
 * function with two identical pointers (e.g., WATCOM C++ (Power++)).
 */
- if (bhp1->pgno < bhp2->pgno)
+ if (bhp1->track_pgno < bhp2->track_pgno)
 return (-1);
- if (bhp1->pgno > bhp2->pgno)
+ if (bhp1->track_pgno > bhp2->track_pgno)
 return (1);
 return (0);
 }
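The __bhcmp() rewrite above orders BH_TRACK records by file and then by page, and is careful to return 0 for truly equal keys. A self-contained equivalent over an invented TRACK struct, suitable for qsort():

#include <stdint.h>
#include <stdlib.h>

typedef struct {		/* stand-in for BH_TRACK */
	uint32_t file_id;	/* plays the role of mf_offset */
	uint32_t pgno;
} TRACK;

/* Sort by file first, then by page number within each file. */
static int
track_cmp(const void *p1, const void *p2)
{
	const TRACK *t1 = p1, *t2 = p2;

	if (t1->file_id != t2->file_id)
		return (t1->file_id < t2->file_id ? -1 : 1);
	if (t1->pgno != t2->pgno)
		return (t1->pgno < t2->pgno ? -1 : 1);
	return (0);		/* identical keys really compare equal */
}

/* Usage: qsort(trackarray, n, sizeof(TRACK), track_cmp); */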
diff --git a/bdb/mp/mp_trickle.c b/bdb/mp/mp_trickle.c
index f937805cf40..71077ab60cc 100644
--- a/bdb/mp/mp_trickle.c
+++ b/bdb/mp/mp_trickle.c
@@ -1,13 +1,13 @@
 /*-
 * See the file LICENSE for redistribution information.
 *
- * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Copyright (c) 1996-2002
 * Sleepycat Software. All rights reserved.
 */
 #include "db_config.h"
 #ifndef lint
-static const char revid[] = "$Id: mp_trickle.c,v 11.12 2000/11/30 00:58:41 ubell Exp $";
+static const char revid[] = "$Id: mp_trickle.c,v 11.24 2002/08/06 06:13:53 bostic Exp $";
 #endif /* not lint */
 #ifndef NO_SYSTEM_INCLUDES
@@ -16,42 +16,29 @@ static const char revid[] = "$Id: mp_trickle.c,v 11.12 2000/11/30 00:58:41 ubell
 #include <stdlib.h>
 #endif
-#ifdef HAVE_RPC
-#include "db_server.h"
-#endif
-
 #include "db_int.h"
-#include "db_shash.h"
-#include "mp.h"
-
-#ifdef HAVE_RPC
-#include "gen_client_ext.h"
-#include "rpc_client_ext.h"
-#endif
-
-static int __memp_trick __P((DB_ENV *, int, int, int *));
+#include "dbinc/db_shash.h"
+#include "dbinc/mp.h"
 /*
- * memp_trickle --
+ * __memp_trickle --
 * Keep a specified percentage of the buffers clean.
+ *
+ * PUBLIC: int __memp_trickle __P((DB_ENV *, int, int *));
 */
 int
-memp_trickle(dbenv, pct, nwrotep)
+__memp_trickle(dbenv, pct, nwrotep)
 DB_ENV *dbenv;
 int pct, *nwrotep;
 {
 DB_MPOOL *dbmp;
- MPOOL *mp;
- u_int32_t i;
- int ret;
-
-#ifdef HAVE_RPC
- if (F_ISSET(dbenv, DB_ENV_RPCCLIENT))
- return (__dbcl_memp_trickle(dbenv, pct, nwrotep));
-#endif
+ MPOOL *c_mp, *mp;
+ u_int32_t clean, dirty, i, total, dtmp;
+ int ret, wrote;
 PANIC_CHECK(dbenv);
- ENV_REQUIRES_CONFIG(dbenv, dbenv->mp_handle, DB_INIT_MPOOL);
+ ENV_REQUIRES_CONFIG(dbenv,
+ dbenv->mp_handle, "memp_trickle", DB_INIT_MPOOL);
 dbmp = dbenv->mp_handle;
 mp = dbmp->reginfo[0].primary;
@@ -62,88 +49,35 @@ memp_trickle(dbenv, pct, nwrotep)
 if (pct < 1 || pct > 100)
 return (EINVAL);
- R_LOCK(dbenv, dbmp->reginfo);
-
- /* Loop through the caches... */
- for (ret = 0, i = 0; i < mp->nreg; ++i)
- if ((ret = __memp_trick(dbenv, i, pct, nwrotep)) != 0)
- break;
-
- R_UNLOCK(dbenv, dbmp->reginfo);
- return (ret);
-}
-
-/*
- * __memp_trick --
- * Trickle a single cache.
- */
-static int
-__memp_trick(dbenv, ncache, pct, nwrotep)
- DB_ENV *dbenv;
- int ncache, pct, *nwrotep;
-{
- BH *bhp;
- DB_MPOOL *dbmp;
- MPOOL *c_mp;
- MPOOLFILE *mfp;
- db_pgno_t pgno;
- u_long total;
- int ret, wrote;
-
- dbmp = dbenv->mp_handle;
- c_mp = dbmp->reginfo[ncache].primary;
-
 /*
- * If there are sufficient clean buffers, or no buffers or no dirty
+ * If there are sufficient clean buffers, no buffers or no dirty
 * buffers, we're done.
 *
 * XXX
- * Using st_page_clean and st_page_dirty is our only choice at the
- * moment, but it's not as correct as we might like in the presence
- * of pools with more than one buffer size, as a free 512-byte buffer
- * isn't the same as a free 8K buffer.
+ * Using hash_page_dirty is our only choice at the moment, but it's not
+ * as correct as we might like in the presence of pools having more
+ * than one page size, as a free 512B buffer isn't the same as a free
+ * 8KB buffer.
+ *
+ * Loop through the caches counting total/dirty buffers.
 */
-loop: total = c_mp->stat.st_page_clean + c_mp->stat.st_page_dirty;
- if (total == 0 || c_mp->stat.st_page_dirty == 0 ||
- (c_mp->stat.st_page_clean * 100) / total >= (u_long)pct)
- return (0);
-
- /* Loop until we write a buffer. */
- for (bhp = SH_TAILQ_FIRST(&c_mp->bhq, __bh);
- bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh)) {
- if (bhp->ref != 0 ||
- !F_ISSET(bhp, BH_DIRTY) || F_ISSET(bhp, BH_LOCKED))
- continue;
-
- mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
-
- /*
- * We can't write to temporary files -- see the comment in
- * mp_bh.c:__memp_bhwrite().
- */
- if (F_ISSET(mfp, MP_TEMP))
- continue;
+ for (ret = 0, i = dirty = total = 0; i < mp->nreg; ++i) {
+ c_mp = dbmp->reginfo[i].primary;
+ total += c_mp->stat.st_pages;
+ __memp_stat_hash(&dbmp->reginfo[i], c_mp, &dtmp);
+ dirty += dtmp;
+ }
- pgno = bhp->pgno;
- if ((ret = __memp_bhwrite(dbmp, mfp, bhp, NULL, &wrote)) != 0)
- return (ret);
+ clean = total - dirty;
+ if (clean == total || (clean * 100) / total >= (u_long)pct)
+ return (0);
- /*
- * Any process syncing the shared memory buffer pool had better
- * be able to write to any underlying file. Be understanding,
- * but firm, on this point.
- */
- if (!wrote) {
- __db_err(dbenv, "%s: unable to flush page: %lu",
- __memp_fns(dbmp, mfp), (u_long)pgno);
- return (EPERM);
- }
+ if (nwrotep == NULL)
+ nwrotep = &wrote;
+ ret = __memp_sync_int(dbenv, NULL,
+ ((total * pct) / 100) - clean, DB_SYNC_TRICKLE, nwrotep);
- ++c_mp->stat.st_page_trickle;
- if (nwrotep != NULL)
- ++*nwrotep;
- goto loop;
- }
+ mp->stat.st_page_trickle += *nwrotep;
- return (0);
+ return (ret);
 }
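The rewritten trickle path reduces to one computation: if the clean fraction is already at least pct percent of the pool, do nothing; otherwise ask __memp_sync_int() to write (total * pct) / 100 - clean buffers. For example, with total = 1000 pages, clean = 300 and pct = 50, the target is 500 - 300 = 200 buffers. A hypothetical helper showing just that arithmetic:

/*
 * trickle_target --
 *	How many buffers must be written to bring the clean fraction
 *	of a "total"-page pool up to "pct" percent.  (Integer overflow
 *	for very large pools is ignored, as in the diff above.)
 */
static unsigned int
trickle_target(unsigned int total, unsigned int clean, unsigned int pct)
{
	if (total == 0 || (clean * 100) / total >= pct)
		return (0);	/* already at or above the target */
	return ((total * pct) / 100 - clean);
}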