Diffstat (limited to 'bdb/mp/mp_fget.c')
-rw-r--r-- | bdb/mp/mp_fget.c | 763
1 file changed, 500 insertions, 263 deletions
diff --git a/bdb/mp/mp_fget.c b/bdb/mp/mp_fget.c
index 1bff5e136ab..be0785a2184 100644
--- a/bdb/mp/mp_fget.c
+++ b/bdb/mp/mp_fget.c
@@ -1,13 +1,13 @@
 /*-
  * See the file LICENSE for redistribution information.
  *
- * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Copyright (c) 1996-2002
  *	Sleepycat Software.  All rights reserved.
  */
 #include "db_config.h"
 
 #ifndef lint
-static const char revid[] = "$Id: mp_fget.c,v 11.28 2001/01/10 04:50:53 ubell Exp $";
+static const char revid[] = "$Id: mp_fget.c,v 11.68 2002/08/06 04:58:09 bostic Exp $";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -16,51 +16,54 @@ static const char revid[] = "$Id: mp_fget.c,v 11.28 2001/01/10 04:50:53 ubell Ex
 #include <string.h>
 #endif
 
-#ifdef HAVE_RPC
-#include "db_server.h"
-#endif
-
 #include "db_int.h"
-#include "db_shash.h"
-#include "mp.h"
+#include "dbinc/db_shash.h"
+#include "dbinc/mp.h"
 
-#ifdef HAVE_RPC
-#include "gen_client_ext.h"
-#include "rpc_client_ext.h"
+#ifdef HAVE_FILESYSTEM_NOTZERO
+static int __memp_fs_notzero
+    __P((DB_ENV *, DB_MPOOLFILE *, MPOOLFILE *, db_pgno_t *));
 #endif
 
 /*
- * memp_fget --
+ * __memp_fget --
  *	Get a page from the file.
+ *
+ * PUBLIC: int __memp_fget
+ * PUBLIC:     __P((DB_MPOOLFILE *, db_pgno_t *, u_int32_t, void *));
  */
 int
-memp_fget(dbmfp, pgnoaddr, flags, addrp)
+__memp_fget(dbmfp, pgnoaddr, flags, addrp)
 	DB_MPOOLFILE *dbmfp;
 	db_pgno_t *pgnoaddr;
 	u_int32_t flags;
 	void *addrp;
 {
-	BH *bhp;
+	enum { FIRST_FOUND, FIRST_MISS, SECOND_FOUND, SECOND_MISS } state;
+	BH *alloc_bhp, *bhp;
 	DB_ENV *dbenv;
 	DB_MPOOL *dbmp;
-	DB_HASHTAB *dbht;
+	DB_MPOOL_HASH *hp;
 	MPOOL *c_mp, *mp;
 	MPOOLFILE *mfp;
-	size_t n_bucket, n_cache, mf_offset;
-	u_int32_t st_hsearch;
-	int b_incr, first, ret;
+	roff_t mf_offset;
+	u_int32_t n_cache, st_hsearch;
+	int b_incr, extending, first, ret;
+
+	*(void **)addrp = NULL;
 
 	dbmp = dbmfp->dbmp;
 	dbenv = dbmp->dbenv;
-	mp = dbmp->reginfo[0].primary;
-	mfp = dbmfp->mfp;
 
-#ifdef HAVE_RPC
-	if (F_ISSET(dbenv, DB_ENV_RPCCLIENT))
-		return (__dbcl_memp_fget(dbmfp, pgnoaddr, flags, addrp));
-#endif
 	PANIC_CHECK(dbenv);
 
+	mp = dbmp->reginfo[0].primary;
+	mfp = dbmfp->mfp;
+	mf_offset = R_OFFSET(dbmp->reginfo, mfp);
+	alloc_bhp = bhp = NULL;
+	hp = NULL;
+	b_incr = extending = ret = 0;
+
 	/*
 	 * Validate arguments.
 	 *
@@ -74,100 +77,35 @@ memp_fget(dbmfp, pgnoaddr, flags, addrp)
 	 * is to keep database files small.  It's sleazy as hell, but we catch
 	 * any attempt to actually write the file in memp_fput().
 	 */
-#define	OKFLAGS \
-	(DB_MPOOL_CREATE | DB_MPOOL_LAST | \
-	    DB_MPOOL_NEW | DB_MPOOL_NEW_GROUP | DB_MPOOL_EXTENT)
+#define	OKFLAGS	(DB_MPOOL_CREATE | DB_MPOOL_LAST | DB_MPOOL_NEW)
 	if (flags != 0) {
 		if ((ret = __db_fchk(dbenv, "memp_fget", flags, OKFLAGS)) != 0)
 			return (ret);
 
-		switch (flags & ~DB_MPOOL_EXTENT) {
+		switch (flags) {
 		case DB_MPOOL_CREATE:
+			break;
 		case DB_MPOOL_LAST:
+			/* Get the last page number in the file. */
+			if (flags == DB_MPOOL_LAST) {
+				R_LOCK(dbenv, dbmp->reginfo);
+				*pgnoaddr = mfp->last_pgno;
+				R_UNLOCK(dbenv, dbmp->reginfo);
+			}
+			break;
 		case DB_MPOOL_NEW:
-		case DB_MPOOL_NEW_GROUP:
-		case 0:
+			/*
+			 * If always creating a page, skip the first search
+			 * of the hash bucket.
+			 */
+			if (flags == DB_MPOOL_NEW)
+				goto alloc;
 			break;
 		default:
			return (__db_ferr(dbenv, "memp_fget", 1));
 		}
 	}
 
-#ifdef DIAGNOSTIC
-	/*
-	 * XXX
-	 * We want to switch threads as often as possible.  Yield every time
-	 * we get a new page to ensure contention.
-	 */
-	if (DB_GLOBAL(db_pageyield))
-		__os_yield(dbenv, 1);
-#endif
-
-	/* Initialize remaining local variables. */
-	mf_offset = R_OFFSET(dbmp->reginfo, mfp);
-	bhp = NULL;
-	st_hsearch = 0;
-	b_incr = ret = 0;
-
-	R_LOCK(dbenv, dbmp->reginfo);
-
-	/*
-	 * Check for the new, last or last + 1 page requests.
-	 *
-	 * Examine and update the file's last_pgno value.  We don't care if
-	 * the last_pgno value immediately changes due to another thread --
-	 * at this instant in time, the value is correct.  We do increment the
-	 * current last_pgno value if the thread is asking for a new page,
-	 * however, to ensure that two threads creating pages don't get the
-	 * same one.
-	 *
-	 * If we create a page, there is the potential that a page after it
-	 * in the file will be written before it will be written.  Recovery
-	 * depends on pages that are "created" in the file by subsequent pages
-	 * being written be zeroed out, not have random garbage.  Ensure that
-	 * the OS agrees.
-	 *
-	 * !!!
-	 * DB_MPOOL_NEW_GROUP is undocumented -- the hash access method needs
-	 * to allocate contiguous groups of pages in order to do subdatabases.
-	 * We return the first page in the group, but the caller must put an
-	 * LSN on the *last* page and write it, otherwise after a crash we may
-	 * not create all of the pages we need to create.
-	 */
-	if (LF_ISSET(DB_MPOOL_LAST | DB_MPOOL_NEW | DB_MPOOL_NEW_GROUP)) {
-		if (LF_ISSET(DB_MPOOL_NEW)) {
-			if (F_ISSET(&dbmfp->fh, DB_FH_VALID) && (ret =
-			    __os_fpinit(dbenv, &dbmfp->fh, mfp->last_pgno + 1,
-			    1, mfp->stat.st_pagesize)) != 0) {
-				R_UNLOCK(dbenv, dbmp->reginfo);
-				return (ret);
-			}
-			++mfp->last_pgno;
-		}
-		if (LF_ISSET(DB_MPOOL_NEW_GROUP)) {
-			if (F_ISSET(&dbmfp->fh, DB_FH_VALID) && (ret =
-			    __os_fpinit(dbenv, &dbmfp->fh, mfp->last_pgno + 1,
-			    (int)*pgnoaddr, mfp->stat.st_pagesize)) != 0) {
-				R_UNLOCK(dbenv, dbmp->reginfo);
-				return (ret);
-			}
-			mfp->last_pgno += *pgnoaddr;
-		}
-		*pgnoaddr = mfp->last_pgno;
-	}
-
-	/*
-	 * Determine the hash bucket where this page will live, and get local
-	 * pointers to the cache and its hash table.
-	 */
-	n_cache = NCACHE(mp, *pgnoaddr);
-	c_mp = dbmp->reginfo[n_cache].primary;
-	n_bucket = NBUCKET(c_mp, mf_offset, *pgnoaddr);
-	dbht = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab);
-
-	if (LF_ISSET(DB_MPOOL_NEW | DB_MPOOL_NEW_GROUP))
-		goto alloc;
-
 	/*
 	 * If mmap'ing the file and the page is not past the end of the file,
 	 * just return a pointer.
@@ -183,235 +121,534 @@ memp_fget(dbmfp, pgnoaddr, flags, addrp)
 	 * goes through the cache.  All pages previously returned will be safe,
 	 * as long as the correct locking protocol was observed.
 	 *
-	 * XXX
 	 * We don't discard the map because we don't know when all of the
 	 * pages will have been discarded from the process' address space.
 	 * It would be possible to do so by reference counting the open
 	 * pages from the mmap, but it's unclear to me that it's worth it.
 	 */
-	if (dbmfp->addr != NULL && F_ISSET(mfp, MP_CAN_MMAP)) {
-		if (*pgnoaddr > mfp->orig_last_pgno) {
-			/*
-			 * !!!
-			 * See the comment above about non-existent pages and
-			 * the hash access method.
-			 */
-			if (!LF_ISSET(DB_MPOOL_CREATE)) {
-				if (!LF_ISSET(DB_MPOOL_EXTENT))
-					__db_err(dbenv,
-					    "%s: page %lu doesn't exist",
-					    __memp_fn(dbmfp), (u_long)*pgnoaddr);
-				ret = EINVAL;
-				goto err;
-			}
-		} else {
-			*(void **)addrp =
-			    R_ADDR(dbmfp, *pgnoaddr * mfp->stat.st_pagesize);
-			++mfp->stat.st_map;
-			goto done;
-		}
+	if (dbmfp->addr != NULL &&
+	    F_ISSET(mfp, MP_CAN_MMAP) && *pgnoaddr <= mfp->orig_last_pgno) {
+		*(void **)addrp =
+		    R_ADDR(dbmfp, *pgnoaddr * mfp->stat.st_pagesize);
+		++mfp->stat.st_map;
+		return (0);
 	}
 
+hb_search:
+	/*
+	 * Determine the cache and hash bucket where this page lives and get
+	 * local pointers to them.  Reset on each pass through this code, the
+	 * page number can change.
+	 */
+	n_cache = NCACHE(mp, mf_offset, *pgnoaddr);
+	c_mp = dbmp->reginfo[n_cache].primary;
+	hp = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab);
+	hp = &hp[NBUCKET(c_mp, mf_offset, *pgnoaddr)];
+
 	/* Search the hash chain for the page. */
-	for (bhp = SH_TAILQ_FIRST(&dbht[n_bucket], __bh);
+retry:	st_hsearch = 0;
+	MUTEX_LOCK(dbenv, &hp->hash_mutex);
+	for (bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
 	    bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) {
 		++st_hsearch;
 		if (bhp->pgno != *pgnoaddr || bhp->mf_offset != mf_offset)
 			continue;
 
-		/* Increment the reference count. */
+		/*
+		 * Increment the reference count.  We may discard the hash
+		 * bucket lock as we evaluate and/or read the buffer, so we
+		 * need to ensure it doesn't move and its contents remain
+		 * unchanged.
+		 */
 		if (bhp->ref == UINT16_T_MAX) {
 			__db_err(dbenv,
 			    "%s: page %lu: reference count overflow",
 			    __memp_fn(dbmfp), (u_long)bhp->pgno);
 			ret = EINVAL;
+			MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
 			goto err;
 		}
-
-		/*
-		 * Increment the reference count.  We may discard the region
-		 * lock as we evaluate and/or read the buffer, so we need to
-		 * ensure that it doesn't move and that its contents remain
-		 * unchanged.
-		 */
 		++bhp->ref;
 		b_incr = 1;
 
 		/*
-		 * Any buffer we find might be trouble.
-		 *
 		 * BH_LOCKED --
-		 * I/O is in progress.  Because we've incremented the buffer
-		 * reference count, we know the buffer can't move.  Unlock
-		 * the region lock, wait for the I/O to complete, and reacquire
-		 * the region.
+		 * I/O is in progress or sync is waiting on the buffer to write
+		 * it.  Because we've incremented the buffer reference count,
+		 * we know the buffer can't move.  Unlock the bucket lock, wait
+		 * for the buffer to become available, reacquire the bucket.
 		 */
-		for (first = 1; F_ISSET(bhp, BH_LOCKED); first = 0) {
-			R_UNLOCK(dbenv, dbmp->reginfo);
+		for (first = 1; F_ISSET(bhp, BH_LOCKED) &&
+		    !F_ISSET(dbenv, DB_ENV_NOLOCKING); first = 0) {
+			/*
+			 * If someone is trying to sync this buffer and the
+			 * buffer is hot, they may never get in.  Give up
+			 * and try again.
+			 */
+			if (!first && bhp->ref_sync != 0) {
+				--bhp->ref;
+				b_incr = 0;
+				MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
+				__os_yield(dbenv, 1);
+				goto retry;
+			}
+			MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
 
 			/*
-			 * Explicitly yield the processor if it's not the first
-			 * pass through this loop -- if we don't, we might end
-			 * up running to the end of our CPU quantum as we will
-			 * simply be swapping between the two locks.
+			 * Explicitly yield the processor if not the first pass
+			 * through this loop -- if we don't, we might run to the
+			 * end of our CPU quantum as we will simply be swapping
+			 * between the two locks.
 			 */
 			if (!first)
 				__os_yield(dbenv, 1);
 
-			MUTEX_LOCK(dbenv, &bhp->mutex, dbenv->lockfhp);
+			MUTEX_LOCK(dbenv, &bhp->mutex);
 			/* Wait for I/O to finish... */
 			MUTEX_UNLOCK(dbenv, &bhp->mutex);
-			R_LOCK(dbenv, dbmp->reginfo);
+			MUTEX_LOCK(dbenv, &hp->hash_mutex);
+		}
+
+		++mfp->stat.st_cache_hit;
+		break;
+	}
+
+	/*
+	 * Update the hash bucket search statistics -- do now because our next
+	 * search may be for a different bucket.
+	 */
+	++c_mp->stat.st_hash_searches;
+	if (st_hsearch > c_mp->stat.st_hash_longest)
+		c_mp->stat.st_hash_longest = st_hsearch;
+	c_mp->stat.st_hash_examined += st_hsearch;
+
+	/*
+	 * There are 4 possible paths to this location:
+	 *
+	 * FIRST_MISS:
+	 *	Didn't find the page in the hash bucket on our first pass:
+	 *	bhp == NULL, alloc_bhp == NULL
+	 *
+	 * FIRST_FOUND:
+	 *	Found the page in the hash bucket on our first pass:
+	 *	bhp != NULL, alloc_bhp == NULL
+	 *
+	 * SECOND_FOUND:
+	 *	Didn't find the page in the hash bucket on the first pass,
+	 *	allocated space, and found the page in the hash bucket on
+	 *	our second pass:
+	 *	bhp != NULL, alloc_bhp != NULL
+	 *
+	 * SECOND_MISS:
+	 *	Didn't find the page in the hash bucket on the first pass,
+	 *	allocated space, and didn't find the page in the hash bucket
+	 *	on our second pass:
+	 *	bhp == NULL, alloc_bhp != NULL
+	 */
+	state = bhp == NULL ?
+	    (alloc_bhp == NULL ? FIRST_MISS : SECOND_MISS) :
+	    (alloc_bhp == NULL ? FIRST_FOUND : SECOND_FOUND);
+	switch (state) {
+	case FIRST_FOUND:
+		/* We found the buffer in our first check -- we're done. */
+		break;
+	case FIRST_MISS:
+		/*
+		 * We didn't find the buffer in our first check.  Figure out
+		 * if the page exists, and allocate structures so we can add
+		 * the page to the buffer pool.
+		 */
+		MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
+
+alloc:		/*
+		 * If DB_MPOOL_NEW is set, we have to allocate a page number.
+		 * If neither DB_MPOOL_CREATE nor DB_MPOOL_NEW is set, then
+		 * it's an error to try and get a page past the end of file.
+		 */
+		COMPQUIET(n_cache, 0);
+
+		extending = ret = 0;
+		R_LOCK(dbenv, dbmp->reginfo);
+		switch (flags) {
+		case DB_MPOOL_NEW:
+			extending = 1;
+			*pgnoaddr = mfp->last_pgno + 1;
+			break;
+		case DB_MPOOL_CREATE:
+			extending = *pgnoaddr > mfp->last_pgno;
+			break;
+		default:
+			ret = *pgnoaddr > mfp->last_pgno ? DB_PAGE_NOTFOUND : 0;
+			break;
+		}
+		R_UNLOCK(dbenv, dbmp->reginfo);
+		if (ret != 0)
+			goto err;
 
 		/*
-		 * BH_TRASH --
-		 * The contents of the buffer are garbage.  Shouldn't happen,
-		 * and this read is likely to fail, but might as well try.
+		 * !!!
+		 * In the DB_MPOOL_NEW code path, mf_offset and n_cache have
+		 * not yet been initialized.
 		 */
-		if (F_ISSET(bhp, BH_TRASH))
-			goto reread;
+		mf_offset = R_OFFSET(dbmp->reginfo, mfp);
+		n_cache = NCACHE(mp, mf_offset, *pgnoaddr);
 
+		/* Allocate a new buffer header and data space. */
+		if ((ret = __memp_alloc(dbmp,
+		    &dbmp->reginfo[n_cache], mfp, 0, NULL, &alloc_bhp)) != 0)
+			goto err;
+
+#ifdef DIAGNOSTIC
+		if ((db_alignp_t)alloc_bhp->buf & (sizeof(size_t) - 1)) {
+			__db_err(dbenv,
+			    "Error: buffer data is NOT size_t aligned");
+			ret = EINVAL;
+			goto err;
+		}
+#endif
+
 		/*
-		 * BH_CALLPGIN --
-		 * The buffer was converted so it could be written, and the
-		 * contents need to be converted again.
+		 * If we are extending the file, we'll need the region lock
+		 * again.
 		 */
-		if (F_ISSET(bhp, BH_CALLPGIN)) {
-			if ((ret = __memp_pg(dbmfp, bhp, 1)) != 0)
+		if (extending)
+			R_LOCK(dbenv, dbmp->reginfo);
+
+		/*
+		 * DB_MPOOL_NEW does not guarantee you a page unreferenced by
+		 * any other thread of control.  (That guarantee is interesting
+		 * for DB_MPOOL_NEW, unlike DB_MPOOL_CREATE, because the caller
+		 * did not specify the page number, and so, may reasonably not
+		 * have any way to lock the page outside of mpool.) Regardless,
+		 * if we allocate the page, and some other thread of control
+		 * requests the page by number, we will not detect that and the
+		 * thread of control that allocated using DB_MPOOL_NEW may not
+		 * have a chance to initialize the page.  (Note: we *could*
+		 * detect this case if we set a flag in the buffer header which
+		 * guaranteed that no gets of the page would succeed until the
+		 * reference count went to 0, that is, until the creating page
+		 * put the page.)  What we do guarantee is that if two threads
+		 * of control are both doing DB_MPOOL_NEW calls, they won't
+		 * collide, that is, they won't both get the same page.
+		 *
+		 * There's a possibility that another thread allocated the page
+		 * we were planning to allocate while we were off doing buffer
+		 * allocation.  We can detect that by making sure the page
+		 * number we were going to use is still available.  If it's
+		 * not, then we check to see if the next available page number
+		 * hashes to the same mpool region as the old one -- if it
+		 * does, we can continue, otherwise, we have to start over.
+		 */
+		if (flags == DB_MPOOL_NEW && *pgnoaddr != mfp->last_pgno + 1) {
+			*pgnoaddr = mfp->last_pgno + 1;
+			if (n_cache != NCACHE(mp, mf_offset, *pgnoaddr)) {
+				__db_shalloc_free(
+				    dbmp->reginfo[n_cache].addr, alloc_bhp);
+				/*
+				 * flags == DB_MPOOL_NEW, so extending is set
+				 * and we're holding the region locked.
+				 */
+				R_UNLOCK(dbenv, dbmp->reginfo);
+
+				alloc_bhp = NULL;
+				goto alloc;
+			}
+		}
+
+		/*
+		 * We released the region lock, so another thread might have
+		 * extended the file.  Update the last_pgno and initialize
+		 * the file, as necessary, if we extended the file.
+		 */
+		if (extending) {
+#ifdef HAVE_FILESYSTEM_NOTZERO
+			if (*pgnoaddr > mfp->last_pgno &&
+			    __os_fs_notzero() &&
+			    F_ISSET(dbmfp->fhp, DB_FH_VALID))
+				ret = __memp_fs_notzero(
+				    dbenv, dbmfp, mfp, pgnoaddr);
+			else
+				ret = 0;
+#endif
+			if (ret == 0 && *pgnoaddr > mfp->last_pgno)
+				mfp->last_pgno = *pgnoaddr;
+
+			R_UNLOCK(dbenv, dbmp->reginfo);
+			if (ret != 0)
 				goto err;
-			F_CLR(bhp, BH_CALLPGIN);
 		}
+		goto hb_search;
+	case SECOND_FOUND:
+		/*
+		 * We allocated buffer space for the requested page, but then
+		 * found the page in the buffer cache on our second check.
+		 * That's OK -- we can use the page we found in the pool,
+		 * unless DB_MPOOL_NEW is set.
+		 *
+		 * Free the allocated memory, we no longer need it.  Since we
+		 * can't acquire the region lock while holding the hash bucket
+		 * lock, we have to release the hash bucket and re-acquire it.
+		 * That's OK, because we have the buffer pinned down.
+		 */
+		MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
+		R_LOCK(dbenv, &dbmp->reginfo[n_cache]);
+		__db_shalloc_free(dbmp->reginfo[n_cache].addr, alloc_bhp);
+		alloc_bhp = NULL;
+		R_UNLOCK(dbenv, &dbmp->reginfo[n_cache]);
+		MUTEX_LOCK(dbenv, &hp->hash_mutex);
 
-		++mfp->stat.st_cache_hit;
-		*(void **)addrp = bhp->buf;
-		goto done;
-	}
+		/*
+		 * We can't use the page we found in the pool if DB_MPOOL_NEW
+		 * was set.  (For details, see the above comment beginning
+		 * "DB_MPOOL_NEW does not guarantee you a page unreferenced by
+		 * any other thread of control".)  If DB_MPOOL_NEW is set, we
+		 * release our pin on this particular buffer, and try to get
+		 * another one.
+ */ + if (flags == DB_MPOOL_NEW) { + --bhp->ref; + b_incr = 0; + goto alloc; + } + break; + case SECOND_MISS: + /* + * We allocated buffer space for the requested page, and found + * the page still missing on our second pass through the buffer + * cache. Instantiate the page. + */ + bhp = alloc_bhp; + alloc_bhp = NULL; -alloc: /* Allocate new buffer header and data space. */ - if ((ret = __memp_alloc(dbmp, - &dbmp->reginfo[n_cache], mfp, 0, NULL, &bhp)) != 0) - goto err; + /* + * Initialize all the BH and hash bucket fields so we can call + * __memp_bhfree if an error occurs. + * + * Append the buffer to the tail of the bucket list and update + * the hash bucket's priority. + */ + b_incr = 1; + + memset(bhp, 0, sizeof(BH)); + bhp->ref = 1; + bhp->priority = UINT32_T_MAX; + bhp->pgno = *pgnoaddr; + bhp->mf_offset = mf_offset; + SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, bhp, hq); + hp->hash_priority = + SH_TAILQ_FIRST(&hp->hash_bucket, __bh)->priority; + + /* If we extended the file, make sure the page is never lost. */ + if (extending) { + ++hp->hash_page_dirty; + F_SET(bhp, BH_DIRTY | BH_DIRTY_CREATE); + } - ++c_mp->stat.st_page_clean; + /* + * If we created the page, zero it out. If we didn't create + * the page, read from the backing file. + * + * !!! + * DB_MPOOL_NEW doesn't call the pgin function. + * + * If DB_MPOOL_CREATE is used, then the application's pgin + * function has to be able to handle pages of 0's -- if it + * uses DB_MPOOL_NEW, it can detect all of its page creates, + * and not bother. + * + * If we're running in diagnostic mode, smash any bytes on the + * page that are unknown quantities for the caller. + * + * Otherwise, read the page into memory, optionally creating it + * if DB_MPOOL_CREATE is set. + */ + if (extending) { + if (mfp->clear_len == 0) + memset(bhp->buf, 0, mfp->stat.st_pagesize); + else { + memset(bhp->buf, 0, mfp->clear_len); +#if defined(DIAGNOSTIC) || defined(UMRW) + memset(bhp->buf + mfp->clear_len, CLEAR_BYTE, + mfp->stat.st_pagesize - mfp->clear_len); +#endif + } - /* - * Initialize the BH fields so that we can call the __memp_bhfree - * routine if an error occurs. - */ - memset(bhp, 0, sizeof(BH)); - bhp->ref = 1; - bhp->pgno = *pgnoaddr; - bhp->mf_offset = mf_offset; + if (flags == DB_MPOOL_CREATE && mfp->ftype != 0) + F_SET(bhp, BH_CALLPGIN); - /* Increment the count of buffers referenced by this MPOOLFILE. */ - ++mfp->block_cnt; + ++mfp->stat.st_page_create; + } else { + F_SET(bhp, BH_TRASH); + ++mfp->stat.st_cache_miss; + } - /* - * Prepend the bucket header to the head of the appropriate MPOOL - * bucket hash list. Append the bucket header to the tail of the - * MPOOL LRU chain. - */ - SH_TAILQ_INSERT_HEAD(&dbht[n_bucket], bhp, hq, __bh); - SH_TAILQ_INSERT_TAIL(&c_mp->bhq, bhp, q); + /* Increment buffer count referenced by MPOOLFILE. */ + MUTEX_LOCK(dbenv, &mfp->mutex); + ++mfp->block_cnt; + MUTEX_UNLOCK(dbenv, &mfp->mutex); -#ifdef DIAGNOSTIC - if ((db_alignp_t)bhp->buf & (sizeof(size_t) - 1)) { - __db_err(dbenv, "Internal error: BH data NOT size_t aligned."); - ret = EINVAL; - __memp_bhfree(dbmp, bhp, 1); - goto err; + /* + * Initialize the mutex. This is the last initialization step, + * because it's the only one that can fail, and everything else + * must be set up or we can't jump to the err label because it + * will call __memp_bhfree. 
+ */ + if ((ret = __db_mutex_setup(dbenv, + &dbmp->reginfo[n_cache], &bhp->mutex, 0)) != 0) + goto err; } -#endif - if ((ret = __db_shmutex_init(dbenv, &bhp->mutex, - R_OFFSET(dbmp->reginfo, &bhp->mutex) + DB_FCNTL_OFF_MPOOL, - 0, &dbmp->reginfo[n_cache], - (REGMAINT *)R_ADDR(&dbmp->reginfo[n_cache], c_mp->maint_off))) - != 0) { - __memp_bhfree(dbmp, bhp, 1); - goto err; + DB_ASSERT(bhp->ref != 0); + + /* + * If we're the only reference, update buffer and bucket priorities. + * We may be about to release the hash bucket lock, and everything + * should be correct, first. (We've already done this if we created + * the buffer, so there is no need to do it again.) + */ + if (state != SECOND_MISS && bhp->ref == 1) { + bhp->priority = UINT32_T_MAX; + SH_TAILQ_REMOVE(&hp->hash_bucket, bhp, hq, __bh); + SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, bhp, hq); + hp->hash_priority = + SH_TAILQ_FIRST(&hp->hash_bucket, __bh)->priority; } /* - * If we created the page, zero it out and continue. - * - * !!! - * Note: DB_MPOOL_NEW specifically doesn't call the pgin function. - * If DB_MPOOL_CREATE is used, then the application's pgin function - * has to be able to handle pages of 0's -- if it uses DB_MPOOL_NEW, - * it can detect all of its page creates, and not bother. + * BH_TRASH -- + * The buffer we found may need to be filled from the disk. * - * If we're running in diagnostic mode, smash any bytes on the - * page that are unknown quantities for the caller. - * - * Otherwise, read the page into memory, optionally creating it if - * DB_MPOOL_CREATE is set. + * It's possible for the read function to fail, which means we fail as + * well. Note, the __memp_pgread() function discards and reacquires + * the hash lock, so the buffer must be pinned down so that it cannot + * move and its contents are unchanged. Discard the buffer on failure + * unless another thread is waiting on our I/O to complete. It's OK to + * leave the buffer around, as the waiting thread will see the BH_TRASH + * flag set, and will also attempt to discard it. If there's a waiter, + * we need to decrement our reference count. */ - if (LF_ISSET(DB_MPOOL_NEW | DB_MPOOL_NEW_GROUP)) { - if (mfp->clear_len == 0) - memset(bhp->buf, 0, mfp->stat.st_pagesize); - else { - memset(bhp->buf, 0, mfp->clear_len); -#ifdef DIAGNOSTIC - memset(bhp->buf + mfp->clear_len, CLEAR_BYTE, - mfp->stat.st_pagesize - mfp->clear_len); -#endif - } + if (F_ISSET(bhp, BH_TRASH) && + (ret = __memp_pgread(dbmfp, + &hp->hash_mutex, bhp, LF_ISSET(DB_MPOOL_CREATE) ? 1 : 0)) != 0) + goto err; - ++mfp->stat.st_page_create; - } else { - /* - * It's possible for the read function to fail, which means - * that we fail as well. Note, the __memp_pgread() function - * discards the region lock, so the buffer must be pinned - * down so that it cannot move and its contents are unchanged. - */ -reread: if ((ret = __memp_pgread(dbmfp, - bhp, LF_ISSET(DB_MPOOL_CREATE|DB_MPOOL_EXTENT))) != 0) { - /* - * !!! - * Discard the buffer unless another thread is waiting - * on our I/O to complete. Regardless, the header has - * the BH_TRASH flag set. - */ - if (bhp->ref == 1) - __memp_bhfree(dbmp, bhp, 1); + /* + * BH_CALLPGIN -- + * The buffer was processed for being written to disk, and now has + * to be re-converted for use. + */ + if (F_ISSET(bhp, BH_CALLPGIN)) { + if ((ret = __memp_pg(dbmfp, bhp, 1)) != 0) goto err; - } - - ++mfp->stat.st_cache_miss; + F_CLR(bhp, BH_CALLPGIN); } + MUTEX_UNLOCK(dbenv, &hp->hash_mutex); + +#ifdef DIAGNOSTIC + /* Update the file's pinned reference count. 
+	R_LOCK(dbenv, dbmp->reginfo);
+	++dbmfp->pinref;
+	R_UNLOCK(dbenv, dbmp->reginfo);
+
 	/*
-	 * If we're returning a page after our current notion of the last-page,
-	 * update our information.  Note, there's no way to un-instantiate this
-	 * page, it's going to exist whether it's returned to us dirty or not.
+	 * We want to switch threads as often as possible, and at awkward
+	 * times.  Yield every time we get a new page to ensure contention.
 	 */
-	if (bhp->pgno > mfp->last_pgno)
-		mfp->last_pgno = bhp->pgno;
+	if (F_ISSET(dbenv, DB_ENV_YIELDCPU))
+		__os_yield(dbenv, 1);
+#endif
 
 	*(void **)addrp = bhp->buf;
+	return (0);
 
-done:	/* Update the chain search statistics. */
-	if (st_hsearch) {
-		++c_mp->stat.st_hash_searches;
-		if (st_hsearch > c_mp->stat.st_hash_longest)
-			c_mp->stat.st_hash_longest = st_hsearch;
-		c_mp->stat.st_hash_examined += st_hsearch;
+err:	/*
+	 * Discard our reference.  If we're the only reference, discard the
+	 * buffer entirely.  If we held a reference to a buffer, we are
+	 * also still holding the hash bucket mutex.
+	 */
+	if (b_incr) {
+		if (bhp->ref == 1)
+			(void)__memp_bhfree(dbmp, hp, bhp, 1);
+		else {
+			--bhp->ref;
+			MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
+		}
 	}
-	++dbmfp->pinref;
 
-	R_UNLOCK(dbenv, dbmp->reginfo);
+	/* If alloc_bhp is set, free the memory. */
+	if (alloc_bhp != NULL)
+		__db_shalloc_free(dbmp->reginfo[n_cache].addr, alloc_bhp);
 
-	return (0);
+	return (ret);
+}
 
-err:	/* Discard our reference. */
-	if (b_incr)
-		--bhp->ref;
-	R_UNLOCK(dbenv, dbmp->reginfo);
+#ifdef HAVE_FILESYSTEM_NOTZERO
+/*
+ * __memp_fs_notzero --
+ *	Initialize the underlying allocated pages in the file.
+ */
+static int
+__memp_fs_notzero(dbenv, dbmfp, mfp, pgnoaddr)
+	DB_ENV *dbenv;
+	DB_MPOOLFILE *dbmfp;
+	MPOOLFILE *mfp;
+	db_pgno_t *pgnoaddr;
+{
+	DB_IO db_io;
+	u_int32_t i, npages;
+	size_t nw;
+	int ret;
+	u_int8_t *page;
+	char *fail;
 
+	/*
+	 * Pages allocated by writing pages past end-of-file are not zeroed,
+	 * on some systems.  Recovery could theoretically be fooled by a page
+	 * showing up that contained garbage.  In order to avoid this, we
+	 * have to write the pages out to disk, and flush them.  The reason
+	 * for the flush is because if we don't sync, the allocation of another
+	 * page subsequent to this one might reach the disk first, and if we
+	 * crashed at the right moment, leave us with this page as the one
+	 * allocated by writing a page past it in the file.
+	 *
+	 * Hash is the only access method that allocates groups of pages.  We
+	 * know that it will use the existence of the last page in a group to
+	 * signify that the entire group is OK; so, write all the pages but
+	 * the last one in the group, flush them to disk, and then write the
+	 * last one to disk and flush it.
+	 */
+	if ((ret = __os_calloc(dbenv, 1, mfp->stat.st_pagesize, &page)) != 0)
+		return (ret);
+
+	db_io.fhp = dbmfp->fhp;
+	db_io.mutexp = dbmfp->mutexp;
+	db_io.pagesize = db_io.bytes = mfp->stat.st_pagesize;
+	db_io.buf = page;
+
+	npages = *pgnoaddr - mfp->last_pgno;
+	for (i = 1; i < npages; ++i) {
+		db_io.pgno = mfp->last_pgno + i;
+		if ((ret = __os_io(dbenv, &db_io, DB_IO_WRITE, &nw)) != 0) {
+			fail = "write";
+			goto err;
+		}
+	}
+	if (i != 1 && (ret = __os_fsync(dbenv, dbmfp->fhp)) != 0) {
+		fail = "sync";
+		goto err;
+	}
 
-	*(void **)addrp = NULL;
+	db_io.pgno = mfp->last_pgno + npages;
+	if ((ret = __os_io(dbenv, &db_io, DB_IO_WRITE, &nw)) != 0) {
+		fail = "write";
+		goto err;
+	}
+	if ((ret = __os_fsync(dbenv, dbmfp->fhp)) != 0) {
+		fail = "sync";
+err:		__db_err(dbenv, "%s: %s failed for page %lu",
+		    __memp_fn(dbmfp), fail, (u_long)db_io.pgno);
+	}
+
+	__os_free(dbenv, page);
 	return (ret);
 }
+#endif
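The heart of the rewritten __memp_fget is the two-pass lookup that produces the FIRST_FOUND, FIRST_MISS, SECOND_FOUND and SECOND_MISS states. The sketch below is illustrative C only, not Berkeley DB code: the toy_* names are invented and a single pthread mutex stands in for the per-bucket hash_mutex, but the control flow -- search under the lock, allocate with the lock dropped, search again, then either install or discard the allocation -- mirrors the code above.

/*
 * Hypothetical sketch of the two-pass cache lookup; all names invented.
 */
#include <pthread.h>
#include <stdlib.h>

struct toy_buf { unsigned pgno; int ref; struct toy_buf *next; };
struct toy_cache { pthread_mutex_t mtx; struct toy_buf *head; };

/* Walk the chain looking for a page; the caller holds c->mtx. */
static struct toy_buf *
toy_search(struct toy_cache *c, unsigned pgno)
{
	struct toy_buf *b;

	for (b = c->head; b != NULL; b = b->next)
		if (b->pgno == pgno)
			return (b);
	return (NULL);
}

struct toy_buf *
toy_lookup(struct toy_cache *c, unsigned pgno)
{
	struct toy_buf *b, *alloc_b = NULL;

	pthread_mutex_lock(&c->mtx);
	b = toy_search(c, pgno);		/* first pass */
	if (b == NULL) {
		/*
		 * FIRST_MISS: allocation may block, so do it without
		 * holding the bucket lock, then search again -- another
		 * thread may have instantiated the page meanwhile.
		 */
		pthread_mutex_unlock(&c->mtx);
		if ((alloc_b = calloc(1, sizeof(*alloc_b))) == NULL)
			return (NULL);
		pthread_mutex_lock(&c->mtx);
		b = toy_search(c, pgno);	/* second pass */
	}
	if (b == NULL) {			/* SECOND_MISS: install ours */
		b = alloc_b;
		b->pgno = pgno;
		b->next = c->head;
		c->head = b;
	} else if (alloc_b != NULL)		/* SECOND_FOUND: discard ours */
		free(alloc_b);
	++b->ref;				/* pin before dropping the lock */
	pthread_mutex_unlock(&c->mtx);
	return (b);
}

Dropping the lock around the allocation is exactly what makes the second pass necessary: another thread may instantiate the same page in the window, which is the SECOND_FOUND case, and the DB_MPOOL_NEW page-number recheck in the diff handles the analogous race on last_pgno.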
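__memp_fs_notzero's ordering -- write every page of the new group except the last, sync, then write and sync the last page -- is what lets recovery treat the presence of a group's last page as proof that the earlier pages are zeroed rather than filesystem garbage. Below is a minimal POSIX sketch of the same ordering; the zero_fill name and the fd/pagesize/first/last parameters are assumptions, and the real routine goes through the DB_IO/__os_io layer rather than pwrite().

/*
 * Hypothetical sketch: zero-fill pages (first, last] and make the last
 * page durable only after all earlier pages are.
 */
#include <sys/types.h>
#include <stdlib.h>
#include <unistd.h>

int
zero_fill(int fd, size_t pagesize, unsigned long first, unsigned long last)
{
	char *page;
	unsigned long pgno;

	if ((page = calloc(1, pagesize)) == NULL)
		return (-1);

	/* Write all pages but the last, then flush them to disk. */
	for (pgno = first; pgno < last; ++pgno)
		if (pwrite(fd, page,
		    pagesize, (off_t)pgno * pagesize) != (ssize_t)pagesize)
			goto err;
	if (pgno > first && fsync(fd) != 0)
		goto err;

	/*
	 * Only now write and flush the last page: its presence on disk
	 * implies every earlier page in the group is zeroed, not garbage.
	 */
	if (pwrite(fd, page,
	    pagesize, (off_t)last * pagesize) != (ssize_t)pagesize ||
	    fsync(fd) != 0)
		goto err;

	free(page);
	return (0);

err:	free(page);
	return (-1);
}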
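From the application side, the flags validated at the top of __memp_fget arrive through the DB_MPOOLFILE get method. A usage sketch, assuming a DB 4.x-style handle (mpf) already opened on a pool file; the demo name is invented and error handling is abbreviated.

/* Hypothetical caller of the memory-pool get/put interface. */
#include <db.h>

int
demo(DB_MPOOLFILE *mpf)
{
	db_pgno_t pgno;
	void *addr;
	int ret;

	/* Fetch an existing page; a page past end-of-file fails. */
	pgno = 2;
	if ((ret = mpf->get(mpf, &pgno, 0, &addr)) != 0)
		return (ret);
	if ((ret = mpf->put(mpf, addr, 0)) != 0)
		return (ret);

	/* Allocate a fresh, zero-filled page; mpool picks the number. */
	if ((ret = mpf->get(mpf, &pgno, DB_MPOOL_NEW, &addr)) != 0)
		return (ret);
	/* pgno now holds the newly allocated page's number. */
	return (mpf->put(mpf, addr, 0));
}

DB_MPOOL_NEW chooses the page number itself and hands back a zero-filled page, which is why the page-number argument is in-out; a plain get of a page past end-of-file fails (DB_PAGE_NOTFOUND in the code above) unless DB_MPOOL_CREATE is set.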