diff options
Diffstat (limited to 'bdb/mp/mp_fget.c')
-rw-r--r-- | bdb/mp/mp_fget.c | 654 |
1 files changed, 0 insertions, 654 deletions
diff --git a/bdb/mp/mp_fget.c b/bdb/mp/mp_fget.c deleted file mode 100644 index be0785a2184..00000000000 --- a/bdb/mp/mp_fget.c +++ /dev/null @@ -1,654 +0,0 @@ -/*- - * See the file LICENSE for redistribution information. - * - * Copyright (c) 1996-2002 - * Sleepycat Software. All rights reserved. - */ -#include "db_config.h" - -#ifndef lint -static const char revid[] = "$Id: mp_fget.c,v 11.68 2002/08/06 04:58:09 bostic Exp $"; -#endif /* not lint */ - -#ifndef NO_SYSTEM_INCLUDES -#include <sys/types.h> - -#include <string.h> -#endif - -#include "db_int.h" -#include "dbinc/db_shash.h" -#include "dbinc/mp.h" - -#ifdef HAVE_FILESYSTEM_NOTZERO -static int __memp_fs_notzero - __P((DB_ENV *, DB_MPOOLFILE *, MPOOLFILE *, db_pgno_t *)); -#endif - -/* - * __memp_fget -- - * Get a page from the file. - * - * PUBLIC: int __memp_fget - * PUBLIC: __P((DB_MPOOLFILE *, db_pgno_t *, u_int32_t, void *)); - */ -int -__memp_fget(dbmfp, pgnoaddr, flags, addrp) - DB_MPOOLFILE *dbmfp; - db_pgno_t *pgnoaddr; - u_int32_t flags; - void *addrp; -{ - enum { FIRST_FOUND, FIRST_MISS, SECOND_FOUND, SECOND_MISS } state; - BH *alloc_bhp, *bhp; - DB_ENV *dbenv; - DB_MPOOL *dbmp; - DB_MPOOL_HASH *hp; - MPOOL *c_mp, *mp; - MPOOLFILE *mfp; - roff_t mf_offset; - u_int32_t n_cache, st_hsearch; - int b_incr, extending, first, ret; - - *(void **)addrp = NULL; - - dbmp = dbmfp->dbmp; - dbenv = dbmp->dbenv; - - PANIC_CHECK(dbenv); - - mp = dbmp->reginfo[0].primary; - mfp = dbmfp->mfp; - mf_offset = R_OFFSET(dbmp->reginfo, mfp); - alloc_bhp = bhp = NULL; - hp = NULL; - b_incr = extending = ret = 0; - - /* - * Validate arguments. - * - * !!! - * Don't test for DB_MPOOL_CREATE and DB_MPOOL_NEW flags for readonly - * files here, and create non-existent pages in readonly files if the - * flags are set, later. The reason is that the hash access method - * wants to get empty pages that don't really exist in readonly files. - * The only alternative is for hash to write the last "bucket" all the - * time, which we don't want to do because one of our big goals in life - * is to keep database files small. It's sleazy as hell, but we catch - * any attempt to actually write the file in memp_fput(). - */ -#define OKFLAGS (DB_MPOOL_CREATE | DB_MPOOL_LAST | DB_MPOOL_NEW) - if (flags != 0) { - if ((ret = __db_fchk(dbenv, "memp_fget", flags, OKFLAGS)) != 0) - return (ret); - - switch (flags) { - case DB_MPOOL_CREATE: - break; - case DB_MPOOL_LAST: - /* Get the last page number in the file. */ - if (flags == DB_MPOOL_LAST) { - R_LOCK(dbenv, dbmp->reginfo); - *pgnoaddr = mfp->last_pgno; - R_UNLOCK(dbenv, dbmp->reginfo); - } - break; - case DB_MPOOL_NEW: - /* - * If always creating a page, skip the first search - * of the hash bucket. - */ - if (flags == DB_MPOOL_NEW) - goto alloc; - break; - default: - return (__db_ferr(dbenv, "memp_fget", 1)); - } - } - - /* - * If mmap'ing the file and the page is not past the end of the file, - * just return a pointer. - * - * The page may be past the end of the file, so check the page number - * argument against the original length of the file. If we previously - * returned pages past the original end of the file, last_pgno will - * have been updated to match the "new" end of the file, and checking - * against it would return pointers past the end of the mmap'd region. - * - * If another process has opened the file for writing since we mmap'd - * it, we will start playing the game by their rules, i.e. everything - * goes through the cache. All pages previously returned will be safe, - * as long as the correct locking protocol was observed. - * - * We don't discard the map because we don't know when all of the - * pages will have been discarded from the process' address space. - * It would be possible to do so by reference counting the open - * pages from the mmap, but it's unclear to me that it's worth it. - */ - if (dbmfp->addr != NULL && - F_ISSET(mfp, MP_CAN_MMAP) && *pgnoaddr <= mfp->orig_last_pgno) { - *(void **)addrp = - R_ADDR(dbmfp, *pgnoaddr * mfp->stat.st_pagesize); - ++mfp->stat.st_map; - return (0); - } - -hb_search: - /* - * Determine the cache and hash bucket where this page lives and get - * local pointers to them. Reset on each pass through this code, the - * page number can change. - */ - n_cache = NCACHE(mp, mf_offset, *pgnoaddr); - c_mp = dbmp->reginfo[n_cache].primary; - hp = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab); - hp = &hp[NBUCKET(c_mp, mf_offset, *pgnoaddr)]; - - /* Search the hash chain for the page. */ -retry: st_hsearch = 0; - MUTEX_LOCK(dbenv, &hp->hash_mutex); - for (bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh); - bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) { - ++st_hsearch; - if (bhp->pgno != *pgnoaddr || bhp->mf_offset != mf_offset) - continue; - - /* - * Increment the reference count. We may discard the hash - * bucket lock as we evaluate and/or read the buffer, so we - * need to ensure it doesn't move and its contents remain - * unchanged. - */ - if (bhp->ref == UINT16_T_MAX) { - __db_err(dbenv, - "%s: page %lu: reference count overflow", - __memp_fn(dbmfp), (u_long)bhp->pgno); - ret = EINVAL; - MUTEX_UNLOCK(dbenv, &hp->hash_mutex); - goto err; - } - ++bhp->ref; - b_incr = 1; - - /* - * BH_LOCKED -- - * I/O is in progress or sync is waiting on the buffer to write - * it. Because we've incremented the buffer reference count, - * we know the buffer can't move. Unlock the bucket lock, wait - * for the buffer to become available, reacquire the bucket. - */ - for (first = 1; F_ISSET(bhp, BH_LOCKED) && - !F_ISSET(dbenv, DB_ENV_NOLOCKING); first = 0) { - /* - * If someone is trying to sync this buffer and the - * buffer is hot, they may never get in. Give up - * and try again. - */ - if (!first && bhp->ref_sync != 0) { - --bhp->ref; - b_incr = 0; - MUTEX_UNLOCK(dbenv, &hp->hash_mutex); - __os_yield(dbenv, 1); - goto retry; - } - - MUTEX_UNLOCK(dbenv, &hp->hash_mutex); - /* - * Explicitly yield the processor if not the first pass - * through this loop -- if we don't, we might run to the - * end of our CPU quantum as we will simply be swapping - * between the two locks. - */ - if (!first) - __os_yield(dbenv, 1); - - MUTEX_LOCK(dbenv, &bhp->mutex); - /* Wait for I/O to finish... */ - MUTEX_UNLOCK(dbenv, &bhp->mutex); - MUTEX_LOCK(dbenv, &hp->hash_mutex); - } - - ++mfp->stat.st_cache_hit; - break; - } - - /* - * Update the hash bucket search statistics -- do now because our next - * search may be for a different bucket. - */ - ++c_mp->stat.st_hash_searches; - if (st_hsearch > c_mp->stat.st_hash_longest) - c_mp->stat.st_hash_longest = st_hsearch; - c_mp->stat.st_hash_examined += st_hsearch; - - /* - * There are 4 possible paths to this location: - * - * FIRST_MISS: - * Didn't find the page in the hash bucket on our first pass: - * bhp == NULL, alloc_bhp == NULL - * - * FIRST_FOUND: - * Found the page in the hash bucket on our first pass: - * bhp != NULL, alloc_bhp == NULL - * - * SECOND_FOUND: - * Didn't find the page in the hash bucket on the first pass, - * allocated space, and found the page in the hash bucket on - * our second pass: - * bhp != NULL, alloc_bhp != NULL - * - * SECOND_MISS: - * Didn't find the page in the hash bucket on the first pass, - * allocated space, and didn't find the page in the hash bucket - * on our second pass: - * bhp == NULL, alloc_bhp != NULL - */ - state = bhp == NULL ? - (alloc_bhp == NULL ? FIRST_MISS : SECOND_MISS) : - (alloc_bhp == NULL ? FIRST_FOUND : SECOND_FOUND); - switch (state) { - case FIRST_FOUND: - /* We found the buffer in our first check -- we're done. */ - break; - case FIRST_MISS: - /* - * We didn't find the buffer in our first check. Figure out - * if the page exists, and allocate structures so we can add - * the page to the buffer pool. - */ - MUTEX_UNLOCK(dbenv, &hp->hash_mutex); - -alloc: /* - * If DB_MPOOL_NEW is set, we have to allocate a page number. - * If neither DB_MPOOL_CREATE or DB_MPOOL_CREATE is set, then - * it's an error to try and get a page past the end of file. - */ - COMPQUIET(n_cache, 0); - - extending = ret = 0; - R_LOCK(dbenv, dbmp->reginfo); - switch (flags) { - case DB_MPOOL_NEW: - extending = 1; - *pgnoaddr = mfp->last_pgno + 1; - break; - case DB_MPOOL_CREATE: - extending = *pgnoaddr > mfp->last_pgno; - break; - default: - ret = *pgnoaddr > mfp->last_pgno ? DB_PAGE_NOTFOUND : 0; - break; - } - R_UNLOCK(dbenv, dbmp->reginfo); - if (ret != 0) - goto err; - - /* - * !!! - * In the DB_MPOOL_NEW code path, mf_offset and n_cache have - * not yet been initialized. - */ - mf_offset = R_OFFSET(dbmp->reginfo, mfp); - n_cache = NCACHE(mp, mf_offset, *pgnoaddr); - - /* Allocate a new buffer header and data space. */ - if ((ret = __memp_alloc(dbmp, - &dbmp->reginfo[n_cache], mfp, 0, NULL, &alloc_bhp)) != 0) - goto err; -#ifdef DIAGNOSTIC - if ((db_alignp_t)alloc_bhp->buf & (sizeof(size_t) - 1)) { - __db_err(dbenv, - "Error: buffer data is NOT size_t aligned"); - ret = EINVAL; - goto err; - } -#endif - /* - * If we are extending the file, we'll need the region lock - * again. - */ - if (extending) - R_LOCK(dbenv, dbmp->reginfo); - - /* - * DB_MPOOL_NEW does not guarantee you a page unreferenced by - * any other thread of control. (That guarantee is interesting - * for DB_MPOOL_NEW, unlike DB_MPOOL_CREATE, because the caller - * did not specify the page number, and so, may reasonably not - * have any way to lock the page outside of mpool.) Regardless, - * if we allocate the page, and some other thread of control - * requests the page by number, we will not detect that and the - * thread of control that allocated using DB_MPOOL_NEW may not - * have a chance to initialize the page. (Note: we *could* - * detect this case if we set a flag in the buffer header which - * guaranteed that no gets of the page would succeed until the - * reference count went to 0, that is, until the creating page - * put the page.) What we do guarantee is that if two threads - * of control are both doing DB_MPOOL_NEW calls, they won't - * collide, that is, they won't both get the same page. - * - * There's a possibility that another thread allocated the page - * we were planning to allocate while we were off doing buffer - * allocation. We can do that by making sure the page number - * we were going to use is still available. If it's not, then - * we check to see if the next available page number hashes to - * the same mpool region as the old one -- if it does, we can - * continue, otherwise, we have to start over. - */ - if (flags == DB_MPOOL_NEW && *pgnoaddr != mfp->last_pgno + 1) { - *pgnoaddr = mfp->last_pgno + 1; - if (n_cache != NCACHE(mp, mf_offset, *pgnoaddr)) { - __db_shalloc_free( - dbmp->reginfo[n_cache].addr, alloc_bhp); - /* - * flags == DB_MPOOL_NEW, so extending is set - * and we're holding the region locked. - */ - R_UNLOCK(dbenv, dbmp->reginfo); - - alloc_bhp = NULL; - goto alloc; - } - } - - /* - * We released the region lock, so another thread might have - * extended the file. Update the last_pgno and initialize - * the file, as necessary, if we extended the file. - */ - if (extending) { -#ifdef HAVE_FILESYSTEM_NOTZERO - if (*pgnoaddr > mfp->last_pgno && - __os_fs_notzero() && - F_ISSET(dbmfp->fhp, DB_FH_VALID)) - ret = __memp_fs_notzero( - dbenv, dbmfp, mfp, pgnoaddr); - else - ret = 0; -#endif - if (ret == 0 && *pgnoaddr > mfp->last_pgno) - mfp->last_pgno = *pgnoaddr; - - R_UNLOCK(dbenv, dbmp->reginfo); - if (ret != 0) - goto err; - } - goto hb_search; - case SECOND_FOUND: - /* - * We allocated buffer space for the requested page, but then - * found the page in the buffer cache on our second check. - * That's OK -- we can use the page we found in the pool, - * unless DB_MPOOL_NEW is set. - * - * Free the allocated memory, we no longer need it. Since we - * can't acquire the region lock while holding the hash bucket - * lock, we have to release the hash bucket and re-acquire it. - * That's OK, because we have the buffer pinned down. - */ - MUTEX_UNLOCK(dbenv, &hp->hash_mutex); - R_LOCK(dbenv, &dbmp->reginfo[n_cache]); - __db_shalloc_free(dbmp->reginfo[n_cache].addr, alloc_bhp); - alloc_bhp = NULL; - R_UNLOCK(dbenv, &dbmp->reginfo[n_cache]); - MUTEX_LOCK(dbenv, &hp->hash_mutex); - - /* - * We can't use the page we found in the pool if DB_MPOOL_NEW - * was set. (For details, see the above comment beginning - * "DB_MPOOL_NEW does not guarantee you a page unreferenced by - * any other thread of control".) If DB_MPOOL_NEW is set, we - * release our pin on this particular buffer, and try to get - * another one. - */ - if (flags == DB_MPOOL_NEW) { - --bhp->ref; - b_incr = 0; - goto alloc; - } - break; - case SECOND_MISS: - /* - * We allocated buffer space for the requested page, and found - * the page still missing on our second pass through the buffer - * cache. Instantiate the page. - */ - bhp = alloc_bhp; - alloc_bhp = NULL; - - /* - * Initialize all the BH and hash bucket fields so we can call - * __memp_bhfree if an error occurs. - * - * Append the buffer to the tail of the bucket list and update - * the hash bucket's priority. - */ - b_incr = 1; - - memset(bhp, 0, sizeof(BH)); - bhp->ref = 1; - bhp->priority = UINT32_T_MAX; - bhp->pgno = *pgnoaddr; - bhp->mf_offset = mf_offset; - SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, bhp, hq); - hp->hash_priority = - SH_TAILQ_FIRST(&hp->hash_bucket, __bh)->priority; - - /* If we extended the file, make sure the page is never lost. */ - if (extending) { - ++hp->hash_page_dirty; - F_SET(bhp, BH_DIRTY | BH_DIRTY_CREATE); - } - - /* - * If we created the page, zero it out. If we didn't create - * the page, read from the backing file. - * - * !!! - * DB_MPOOL_NEW doesn't call the pgin function. - * - * If DB_MPOOL_CREATE is used, then the application's pgin - * function has to be able to handle pages of 0's -- if it - * uses DB_MPOOL_NEW, it can detect all of its page creates, - * and not bother. - * - * If we're running in diagnostic mode, smash any bytes on the - * page that are unknown quantities for the caller. - * - * Otherwise, read the page into memory, optionally creating it - * if DB_MPOOL_CREATE is set. - */ - if (extending) { - if (mfp->clear_len == 0) - memset(bhp->buf, 0, mfp->stat.st_pagesize); - else { - memset(bhp->buf, 0, mfp->clear_len); -#if defined(DIAGNOSTIC) || defined(UMRW) - memset(bhp->buf + mfp->clear_len, CLEAR_BYTE, - mfp->stat.st_pagesize - mfp->clear_len); -#endif - } - - if (flags == DB_MPOOL_CREATE && mfp->ftype != 0) - F_SET(bhp, BH_CALLPGIN); - - ++mfp->stat.st_page_create; - } else { - F_SET(bhp, BH_TRASH); - ++mfp->stat.st_cache_miss; - } - - /* Increment buffer count referenced by MPOOLFILE. */ - MUTEX_LOCK(dbenv, &mfp->mutex); - ++mfp->block_cnt; - MUTEX_UNLOCK(dbenv, &mfp->mutex); - - /* - * Initialize the mutex. This is the last initialization step, - * because it's the only one that can fail, and everything else - * must be set up or we can't jump to the err label because it - * will call __memp_bhfree. - */ - if ((ret = __db_mutex_setup(dbenv, - &dbmp->reginfo[n_cache], &bhp->mutex, 0)) != 0) - goto err; - } - - DB_ASSERT(bhp->ref != 0); - - /* - * If we're the only reference, update buffer and bucket priorities. - * We may be about to release the hash bucket lock, and everything - * should be correct, first. (We've already done this if we created - * the buffer, so there is no need to do it again.) - */ - if (state != SECOND_MISS && bhp->ref == 1) { - bhp->priority = UINT32_T_MAX; - SH_TAILQ_REMOVE(&hp->hash_bucket, bhp, hq, __bh); - SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, bhp, hq); - hp->hash_priority = - SH_TAILQ_FIRST(&hp->hash_bucket, __bh)->priority; - } - - /* - * BH_TRASH -- - * The buffer we found may need to be filled from the disk. - * - * It's possible for the read function to fail, which means we fail as - * well. Note, the __memp_pgread() function discards and reacquires - * the hash lock, so the buffer must be pinned down so that it cannot - * move and its contents are unchanged. Discard the buffer on failure - * unless another thread is waiting on our I/O to complete. It's OK to - * leave the buffer around, as the waiting thread will see the BH_TRASH - * flag set, and will also attempt to discard it. If there's a waiter, - * we need to decrement our reference count. - */ - if (F_ISSET(bhp, BH_TRASH) && - (ret = __memp_pgread(dbmfp, - &hp->hash_mutex, bhp, LF_ISSET(DB_MPOOL_CREATE) ? 1 : 0)) != 0) - goto err; - - /* - * BH_CALLPGIN -- - * The buffer was processed for being written to disk, and now has - * to be re-converted for use. - */ - if (F_ISSET(bhp, BH_CALLPGIN)) { - if ((ret = __memp_pg(dbmfp, bhp, 1)) != 0) - goto err; - F_CLR(bhp, BH_CALLPGIN); - } - - MUTEX_UNLOCK(dbenv, &hp->hash_mutex); - -#ifdef DIAGNOSTIC - /* Update the file's pinned reference count. */ - R_LOCK(dbenv, dbmp->reginfo); - ++dbmfp->pinref; - R_UNLOCK(dbenv, dbmp->reginfo); - - /* - * We want to switch threads as often as possible, and at awkward - * times. Yield every time we get a new page to ensure contention. - */ - if (F_ISSET(dbenv, DB_ENV_YIELDCPU)) - __os_yield(dbenv, 1); -#endif - - *(void **)addrp = bhp->buf; - return (0); - -err: /* - * Discard our reference. If we're the only reference, discard the - * the buffer entirely. If we held a reference to a buffer, we are - * also still holding the hash bucket mutex. - */ - if (b_incr) { - if (bhp->ref == 1) - (void)__memp_bhfree(dbmp, hp, bhp, 1); - else { - --bhp->ref; - MUTEX_UNLOCK(dbenv, &hp->hash_mutex); - } - } - - /* If alloc_bhp is set, free the memory. */ - if (alloc_bhp != NULL) - __db_shalloc_free(dbmp->reginfo[n_cache].addr, alloc_bhp); - - return (ret); -} - -#ifdef HAVE_FILESYSTEM_NOTZERO -/* - * __memp_fs_notzero -- - * Initialize the underlying allocated pages in the file. - */ -static int -__memp_fs_notzero(dbenv, dbmfp, mfp, pgnoaddr) - DB_ENV *dbenv; - DB_MPOOLFILE *dbmfp; - MPOOLFILE *mfp; - db_pgno_t *pgnoaddr; -{ - DB_IO db_io; - u_int32_t i, npages; - size_t nw; - int ret; - u_int8_t *page; - char *fail; - - /* - * Pages allocated by writing pages past end-of-file are not zeroed, - * on some systems. Recovery could theoretically be fooled by a page - * showing up that contained garbage. In order to avoid this, we - * have to write the pages out to disk, and flush them. The reason - * for the flush is because if we don't sync, the allocation of another - * page subsequent to this one might reach the disk first, and if we - * crashed at the right moment, leave us with this page as the one - * allocated by writing a page past it in the file. - * - * Hash is the only access method that allocates groups of pages. We - * know that it will use the existence of the last page in a group to - * signify that the entire group is OK; so, write all the pages but - * the last one in the group, flush them to disk, and then write the - * last one to disk and flush it. - */ - if ((ret = __os_calloc(dbenv, 1, mfp->stat.st_pagesize, &page)) != 0) - return (ret); - - db_io.fhp = dbmfp->fhp; - db_io.mutexp = dbmfp->mutexp; - db_io.pagesize = db_io.bytes = mfp->stat.st_pagesize; - db_io.buf = page; - - npages = *pgnoaddr - mfp->last_pgno; - for (i = 1; i < npages; ++i) { - db_io.pgno = mfp->last_pgno + i; - if ((ret = __os_io(dbenv, &db_io, DB_IO_WRITE, &nw)) != 0) { - fail = "write"; - goto err; - } - } - if (i != 1 && (ret = __os_fsync(dbenv, dbmfp->fhp)) != 0) { - fail = "sync"; - goto err; - } - - db_io.pgno = mfp->last_pgno + npages; - if ((ret = __os_io(dbenv, &db_io, DB_IO_WRITE, &nw)) != 0) { - fail = "write"; - goto err; - } - if ((ret = __os_fsync(dbenv, dbmfp->fhp)) != 0) { - fail = "sync"; -err: __db_err(dbenv, "%s: %s failed for page %lu", - __memp_fn(dbmfp), fail, (u_long)db_io.pgno); - } - - __os_free(dbenv, page); - return (ret); -} -#endif |