Diffstat (limited to 'bdb/mp/mp_fget.c')
-rw-r--r--  bdb/mp/mp_fget.c  763
1 file changed, 500 insertions, 263 deletions
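The diff below replaces the public memp_fget() entry point with the internal
__memp_fget() and narrows the legal flags to DB_MPOOL_CREATE, DB_MPOOL_LAST,
and DB_MPOOL_NEW. As a minimal caller-side sketch of those flags (dbmfp is
assumed to be an already-opened DB_MPOOLFILE, error handling is abbreviated,
and callers in the public API would normally reach this code through the
DB_MPOOLFILE get method rather than calling __memp_fget() directly):

	db_pgno_t pgno;
	void *addr;
	int ret;

	/* Fetch an existing page by number; fails past end-of-file. */
	pgno = 2;
	if ((ret = __memp_fget(dbmfp, &pgno, 0, &addr)) != 0)
		return (ret);

	/* Fetch the last page; its number is returned through pgno. */
	if ((ret = __memp_fget(dbmfp, &pgno, DB_MPOOL_LAST, &addr)) != 0)
		return (ret);

	/* Fetch a page, creating it (and extending the file) if needed. */
	pgno = 100;
	if ((ret = __memp_fget(dbmfp, &pgno, DB_MPOOL_CREATE, &addr)) != 0)
		return (ret);

	/* Allocate a brand-new page; pgno is an output, not an input. */
	if ((ret = __memp_fget(dbmfp, &pgno, DB_MPOOL_NEW, &addr)) != 0)
		return (ret);

Each successful call pins the page in the cache; the pin is dropped by the
matching memp_fput() call.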
diff --git a/bdb/mp/mp_fget.c b/bdb/mp/mp_fget.c
index 1bff5e136ab..be0785a2184 100644
--- a/bdb/mp/mp_fget.c
+++ b/bdb/mp/mp_fget.c
@@ -1,13 +1,13 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Copyright (c) 1996-2002
* Sleepycat Software. All rights reserved.
*/
#include "db_config.h"
#ifndef lint
-static const char revid[] = "$Id: mp_fget.c,v 11.28 2001/01/10 04:50:53 ubell Exp $";
+static const char revid[] = "$Id: mp_fget.c,v 11.68 2002/08/06 04:58:09 bostic Exp $";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
@@ -16,51 +16,54 @@ static const char revid[] = "$Id: mp_fget.c,v 11.28 2001/01/10 04:50:53 ubell Ex
#include <string.h>
#endif
-#ifdef HAVE_RPC
-#include "db_server.h"
-#endif
-
#include "db_int.h"
-#include "db_shash.h"
-#include "mp.h"
+#include "dbinc/db_shash.h"
+#include "dbinc/mp.h"
-#ifdef HAVE_RPC
-#include "gen_client_ext.h"
-#include "rpc_client_ext.h"
+#ifdef HAVE_FILESYSTEM_NOTZERO
+static int __memp_fs_notzero
+ __P((DB_ENV *, DB_MPOOLFILE *, MPOOLFILE *, db_pgno_t *));
#endif
/*
- * memp_fget --
+ * __memp_fget --
* Get a page from the file.
+ *
+ * PUBLIC: int __memp_fget
+ * PUBLIC: __P((DB_MPOOLFILE *, db_pgno_t *, u_int32_t, void *));
*/
int
-memp_fget(dbmfp, pgnoaddr, flags, addrp)
+__memp_fget(dbmfp, pgnoaddr, flags, addrp)
DB_MPOOLFILE *dbmfp;
db_pgno_t *pgnoaddr;
u_int32_t flags;
void *addrp;
{
- BH *bhp;
+ enum { FIRST_FOUND, FIRST_MISS, SECOND_FOUND, SECOND_MISS } state;
+ BH *alloc_bhp, *bhp;
DB_ENV *dbenv;
DB_MPOOL *dbmp;
- DB_HASHTAB *dbht;
+ DB_MPOOL_HASH *hp;
MPOOL *c_mp, *mp;
MPOOLFILE *mfp;
- size_t n_bucket, n_cache, mf_offset;
- u_int32_t st_hsearch;
- int b_incr, first, ret;
+ roff_t mf_offset;
+ u_int32_t n_cache, st_hsearch;
+ int b_incr, extending, first, ret;
+
+ *(void **)addrp = NULL;
dbmp = dbmfp->dbmp;
dbenv = dbmp->dbenv;
- mp = dbmp->reginfo[0].primary;
- mfp = dbmfp->mfp;
-#ifdef HAVE_RPC
- if (F_ISSET(dbenv, DB_ENV_RPCCLIENT))
- return (__dbcl_memp_fget(dbmfp, pgnoaddr, flags, addrp));
-#endif
PANIC_CHECK(dbenv);
+ mp = dbmp->reginfo[0].primary;
+ mfp = dbmfp->mfp;
+ mf_offset = R_OFFSET(dbmp->reginfo, mfp);
+ alloc_bhp = bhp = NULL;
+ hp = NULL;
+ b_incr = extending = ret = 0;
+
/*
* Validate arguments.
*
@@ -74,100 +77,35 @@ memp_fget(dbmfp, pgnoaddr, flags, addrp)
* is to keep database files small. It's sleazy as hell, but we catch
* any attempt to actually write the file in memp_fput().
*/
-#define OKFLAGS \
- (DB_MPOOL_CREATE | DB_MPOOL_LAST | \
- DB_MPOOL_NEW | DB_MPOOL_NEW_GROUP | DB_MPOOL_EXTENT)
+#define OKFLAGS (DB_MPOOL_CREATE | DB_MPOOL_LAST | DB_MPOOL_NEW)
if (flags != 0) {
if ((ret = __db_fchk(dbenv, "memp_fget", flags, OKFLAGS)) != 0)
return (ret);
- switch (flags & ~DB_MPOOL_EXTENT) {
+ switch (flags) {
case DB_MPOOL_CREATE:
+ break;
case DB_MPOOL_LAST:
+ /* Get the last page number in the file. */
+ if (flags == DB_MPOOL_LAST) {
+ R_LOCK(dbenv, dbmp->reginfo);
+ *pgnoaddr = mfp->last_pgno;
+ R_UNLOCK(dbenv, dbmp->reginfo);
+ }
+ break;
case DB_MPOOL_NEW:
- case DB_MPOOL_NEW_GROUP:
- case 0:
+ /*
+ * If always creating a page, skip the first search
+ * of the hash bucket.
+ */
+ if (flags == DB_MPOOL_NEW)
+ goto alloc;
break;
default:
return (__db_ferr(dbenv, "memp_fget", 1));
}
}
-#ifdef DIAGNOSTIC
- /*
- * XXX
- * We want to switch threads as often as possible. Yield every time
- * we get a new page to ensure contention.
- */
- if (DB_GLOBAL(db_pageyield))
- __os_yield(dbenv, 1);
-#endif
-
- /* Initialize remaining local variables. */
- mf_offset = R_OFFSET(dbmp->reginfo, mfp);
- bhp = NULL;
- st_hsearch = 0;
- b_incr = ret = 0;
-
- R_LOCK(dbenv, dbmp->reginfo);
-
- /*
- * Check for the new, last or last + 1 page requests.
- *
- * Examine and update the file's last_pgno value. We don't care if
- * the last_pgno value immediately changes due to another thread --
- * at this instant in time, the value is correct. We do increment the
- * current last_pgno value if the thread is asking for a new page,
- * however, to ensure that two threads creating pages don't get the
- * same one.
- *
- * If we create a page, there is the potential that a page after it
- * in the file will be written before it will be written. Recovery
- * depends on pages that are "created" in the file by subsequent pages
- * being written be zeroed out, not have random garbage. Ensure that
- * the OS agrees.
- *
- * !!!
- * DB_MPOOL_NEW_GROUP is undocumented -- the hash access method needs
- * to allocate contiguous groups of pages in order to do subdatabases.
- * We return the first page in the group, but the caller must put an
- * LSN on the *last* page and write it, otherwise after a crash we may
- * not create all of the pages we need to create.
- */
- if (LF_ISSET(DB_MPOOL_LAST | DB_MPOOL_NEW | DB_MPOOL_NEW_GROUP)) {
- if (LF_ISSET(DB_MPOOL_NEW)) {
- if (F_ISSET(&dbmfp->fh, DB_FH_VALID) && (ret =
- __os_fpinit(dbenv, &dbmfp->fh, mfp->last_pgno + 1,
- 1, mfp->stat.st_pagesize)) != 0) {
- R_UNLOCK(dbenv, dbmp->reginfo);
- return (ret);
- }
- ++mfp->last_pgno;
- }
- if (LF_ISSET(DB_MPOOL_NEW_GROUP)) {
- if (F_ISSET(&dbmfp->fh, DB_FH_VALID) && (ret =
- __os_fpinit(dbenv, &dbmfp->fh, mfp->last_pgno + 1,
- (int)*pgnoaddr, mfp->stat.st_pagesize)) != 0) {
- R_UNLOCK(dbenv, dbmp->reginfo);
- return (ret);
- }
- mfp->last_pgno += *pgnoaddr;
- }
- *pgnoaddr = mfp->last_pgno;
- }
-
- /*
- * Determine the hash bucket where this page will live, and get local
- * pointers to the cache and its hash table.
- */
- n_cache = NCACHE(mp, *pgnoaddr);
- c_mp = dbmp->reginfo[n_cache].primary;
- n_bucket = NBUCKET(c_mp, mf_offset, *pgnoaddr);
- dbht = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab);
-
- if (LF_ISSET(DB_MPOOL_NEW | DB_MPOOL_NEW_GROUP))
- goto alloc;
-
/*
* If mmap'ing the file and the page is not past the end of the file,
* just return a pointer.
@@ -183,235 +121,534 @@ memp_fget(dbmfp, pgnoaddr, flags, addrp)
* goes through the cache. All pages previously returned will be safe,
* as long as the correct locking protocol was observed.
*
- * XXX
* We don't discard the map because we don't know when all of the
* pages will have been discarded from the process' address space.
* It would be possible to do so by reference counting the open
* pages from the mmap, but it's unclear to me that it's worth it.
*/
- if (dbmfp->addr != NULL && F_ISSET(mfp, MP_CAN_MMAP)) {
- if (*pgnoaddr > mfp->orig_last_pgno) {
- /*
- * !!!
- * See the comment above about non-existent pages and
- * the hash access method.
- */
- if (!LF_ISSET(DB_MPOOL_CREATE)) {
- if (!LF_ISSET(DB_MPOOL_EXTENT))
- __db_err(dbenv,
- "%s: page %lu doesn't exist",
- __memp_fn(dbmfp), (u_long)*pgnoaddr);
- ret = EINVAL;
- goto err;
- }
- } else {
- *(void **)addrp =
- R_ADDR(dbmfp, *pgnoaddr * mfp->stat.st_pagesize);
- ++mfp->stat.st_map;
- goto done;
- }
+ if (dbmfp->addr != NULL &&
+ F_ISSET(mfp, MP_CAN_MMAP) && *pgnoaddr <= mfp->orig_last_pgno) {
+ *(void **)addrp =
+ R_ADDR(dbmfp, *pgnoaddr * mfp->stat.st_pagesize);
+ ++mfp->stat.st_map;
+ return (0);
}
+hb_search:
+ /*
+ * Determine the cache and hash bucket where this page lives and get
+ * local pointers to them. These are reset on each pass through this
+ * code because the page number can change.
+ */
+ n_cache = NCACHE(mp, mf_offset, *pgnoaddr);
+ c_mp = dbmp->reginfo[n_cache].primary;
+ hp = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab);
+ hp = &hp[NBUCKET(c_mp, mf_offset, *pgnoaddr)];
+
/* Search the hash chain for the page. */
- for (bhp = SH_TAILQ_FIRST(&dbht[n_bucket], __bh);
+retry: st_hsearch = 0;
+ MUTEX_LOCK(dbenv, &hp->hash_mutex);
+ for (bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) {
++st_hsearch;
if (bhp->pgno != *pgnoaddr || bhp->mf_offset != mf_offset)
continue;
- /* Increment the reference count. */
+ /*
+ * Increment the reference count. We may discard the hash
+ * bucket lock as we evaluate and/or read the buffer, so we
+ * need to ensure it doesn't move and its contents remain
+ * unchanged.
+ */
if (bhp->ref == UINT16_T_MAX) {
__db_err(dbenv,
"%s: page %lu: reference count overflow",
__memp_fn(dbmfp), (u_long)bhp->pgno);
ret = EINVAL;
+ MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
goto err;
}
-
- /*
- * Increment the reference count. We may discard the region
- * lock as we evaluate and/or read the buffer, so we need to
- * ensure that it doesn't move and that its contents remain
- * unchanged.
- */
++bhp->ref;
b_incr = 1;
/*
- * Any buffer we find might be trouble.
- *
* BH_LOCKED --
- * I/O is in progress. Because we've incremented the buffer
- * reference count, we know the buffer can't move. Unlock
- * the region lock, wait for the I/O to complete, and reacquire
- * the region.
+ * I/O is in progress or sync is waiting on the buffer to write
+ * it. Because we've incremented the buffer reference count,
+ * we know the buffer can't move. Unlock the bucket lock, wait
+ * for the buffer to become available, reacquire the bucket.
*/
- for (first = 1; F_ISSET(bhp, BH_LOCKED); first = 0) {
- R_UNLOCK(dbenv, dbmp->reginfo);
+ for (first = 1; F_ISSET(bhp, BH_LOCKED) &&
+ !F_ISSET(dbenv, DB_ENV_NOLOCKING); first = 0) {
+ /*
+ * If someone is trying to sync this buffer and the
+ * buffer is hot, they may never get in. Give up
+ * and try again.
+ */
+ if (!first && bhp->ref_sync != 0) {
+ --bhp->ref;
+ b_incr = 0;
+ MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
+ __os_yield(dbenv, 1);
+ goto retry;
+ }
+ MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
/*
- * Explicitly yield the processor if it's not the first
- * pass through this loop -- if we don't, we might end
- * up running to the end of our CPU quantum as we will
- * simply be swapping between the two locks.
+ * Explicitly yield the processor if not the first pass
+ * through this loop -- if we don't, we might run to the
+ * end of our CPU quantum as we will simply be swapping
+ * between the two locks.
*/
if (!first)
__os_yield(dbenv, 1);
- MUTEX_LOCK(dbenv, &bhp->mutex, dbenv->lockfhp);
+ MUTEX_LOCK(dbenv, &bhp->mutex);
/* Wait for I/O to finish... */
MUTEX_UNLOCK(dbenv, &bhp->mutex);
- R_LOCK(dbenv, dbmp->reginfo);
+ MUTEX_LOCK(dbenv, &hp->hash_mutex);
+ }
+
+ ++mfp->stat.st_cache_hit;
+ break;
+ }
+
+ /*
+ * Update the hash bucket search statistics -- do this now because our
+ * next search may be for a different bucket.
+ */
+ ++c_mp->stat.st_hash_searches;
+ if (st_hsearch > c_mp->stat.st_hash_longest)
+ c_mp->stat.st_hash_longest = st_hsearch;
+ c_mp->stat.st_hash_examined += st_hsearch;
+
+ /*
+ * There are 4 possible paths to this location:
+ *
+ * FIRST_MISS:
+ * Didn't find the page in the hash bucket on our first pass:
+ * bhp == NULL, alloc_bhp == NULL
+ *
+ * FIRST_FOUND:
+ * Found the page in the hash bucket on our first pass:
+ * bhp != NULL, alloc_bhp == NULL
+ *
+ * SECOND_FOUND:
+ * Didn't find the page in the hash bucket on the first pass,
+ * allocated space, and found the page in the hash bucket on
+ * our second pass:
+ * bhp != NULL, alloc_bhp != NULL
+ *
+ * SECOND_MISS:
+ * Didn't find the page in the hash bucket on the first pass,
+ * allocated space, and didn't find the page in the hash bucket
+ * on our second pass:
+ * bhp == NULL, alloc_bhp != NULL
+ */
+ state = bhp == NULL ?
+ (alloc_bhp == NULL ? FIRST_MISS : SECOND_MISS) :
+ (alloc_bhp == NULL ? FIRST_FOUND : SECOND_FOUND);
+ switch (state) {
+ case FIRST_FOUND:
+ /* We found the buffer in our first check -- we're done. */
+ break;
+ case FIRST_MISS:
+ /*
+ * We didn't find the buffer in our first check. Figure out
+ * if the page exists, and allocate structures so we can add
+ * the page to the buffer pool.
+ */
+ MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
+
+alloc: /*
+ * If DB_MPOOL_NEW is set, we have to allocate a page number.
+ * If neither DB_MPOOL_CREATE nor DB_MPOOL_NEW is set, then
+ * it's an error to try to get a page past the end of the file.
+ */
+ COMPQUIET(n_cache, 0);
+
+ extending = ret = 0;
+ R_LOCK(dbenv, dbmp->reginfo);
+ switch (flags) {
+ case DB_MPOOL_NEW:
+ extending = 1;
+ *pgnoaddr = mfp->last_pgno + 1;
+ break;
+ case DB_MPOOL_CREATE:
+ extending = *pgnoaddr > mfp->last_pgno;
+ break;
+ default:
+ ret = *pgnoaddr > mfp->last_pgno ? DB_PAGE_NOTFOUND : 0;
+ break;
}
+ R_UNLOCK(dbenv, dbmp->reginfo);
+ if (ret != 0)
+ goto err;
/*
- * BH_TRASH --
- * The contents of the buffer are garbage. Shouldn't happen,
- * and this read is likely to fail, but might as well try.
+ * !!!
+ * In the DB_MPOOL_NEW code path, mf_offset and n_cache have
+ * not yet been initialized.
*/
- if (F_ISSET(bhp, BH_TRASH))
- goto reread;
+ mf_offset = R_OFFSET(dbmp->reginfo, mfp);
+ n_cache = NCACHE(mp, mf_offset, *pgnoaddr);
+ /* Allocate a new buffer header and data space. */
+ if ((ret = __memp_alloc(dbmp,
+ &dbmp->reginfo[n_cache], mfp, 0, NULL, &alloc_bhp)) != 0)
+ goto err;
+#ifdef DIAGNOSTIC
+ if ((db_alignp_t)alloc_bhp->buf & (sizeof(size_t) - 1)) {
+ __db_err(dbenv,
+ "Error: buffer data is NOT size_t aligned");
+ ret = EINVAL;
+ goto err;
+ }
+#endif
/*
- * BH_CALLPGIN --
- * The buffer was converted so it could be written, and the
- * contents need to be converted again.
+ * If we are extending the file, we'll need the region lock
+ * again.
*/
- if (F_ISSET(bhp, BH_CALLPGIN)) {
- if ((ret = __memp_pg(dbmfp, bhp, 1)) != 0)
+ if (extending)
+ R_LOCK(dbenv, dbmp->reginfo);
+
+ /*
+ * DB_MPOOL_NEW does not guarantee you a page unreferenced by
+ * any other thread of control. (That guarantee is interesting
+ * for DB_MPOOL_NEW, unlike DB_MPOOL_CREATE, because the caller
+ * did not specify the page number and so may reasonably not
+ * have any way to lock the page outside of mpool.) Regardless,
+ * if we allocate the page, and some other thread of control
+ * requests the page by number, we will not detect that and the
+ * thread of control that allocated using DB_MPOOL_NEW may not
+ * have a chance to initialize the page. (Note: we *could*
+ * detect this case if we set a flag in the buffer header which
+ * guaranteed that no gets of the page would succeed until the
+ * reference count went to 0, that is, until the creating page
+ * put the page.) What we do guarantee is that if two threads
+ * of control are both doing DB_MPOOL_NEW calls, they won't
+ * collide, that is, they won't both get the same page.
+ *
+ * There's a possibility that another thread allocated the page
+ * we were planning to allocate while we were off doing buffer
+ * allocation. We detect that by making sure the page number
+ * we were going to use is still available. If it's not, then
+ * we check to see if the next available page number hashes to
+ * the same mpool region as the old one -- if it does, we can
+ * continue, otherwise, we have to start over.
+ */
+ if (flags == DB_MPOOL_NEW && *pgnoaddr != mfp->last_pgno + 1) {
+ *pgnoaddr = mfp->last_pgno + 1;
+ if (n_cache != NCACHE(mp, mf_offset, *pgnoaddr)) {
+ __db_shalloc_free(
+ dbmp->reginfo[n_cache].addr, alloc_bhp);
+ /*
+ * flags == DB_MPOOL_NEW, so extending is set
+ * and we're holding the region locked.
+ */
+ R_UNLOCK(dbenv, dbmp->reginfo);
+
+ alloc_bhp = NULL;
+ goto alloc;
+ }
+ }
+
+ /*
+ * We released the region lock, so another thread might have
+ * extended the file. Update the last_pgno and initialize
+ * the file, as necessary, if we extended the file.
+ */
+ if (extending) {
+#ifdef HAVE_FILESYSTEM_NOTZERO
+ if (*pgnoaddr > mfp->last_pgno &&
+ __os_fs_notzero() &&
+ F_ISSET(dbmfp->fhp, DB_FH_VALID))
+ ret = __memp_fs_notzero(
+ dbenv, dbmfp, mfp, pgnoaddr);
+ else
+ ret = 0;
+#endif
+ if (ret == 0 && *pgnoaddr > mfp->last_pgno)
+ mfp->last_pgno = *pgnoaddr;
+
+ R_UNLOCK(dbenv, dbmp->reginfo);
+ if (ret != 0)
goto err;
- F_CLR(bhp, BH_CALLPGIN);
}
+ goto hb_search;
+ case SECOND_FOUND:
+ /*
+ * We allocated buffer space for the requested page, but then
+ * found the page in the buffer cache on our second check.
+ * That's OK -- we can use the page we found in the pool,
+ * unless DB_MPOOL_NEW is set.
+ *
+ * Free the allocated memory, we no longer need it. Since we
+ * can't acquire the region lock while holding the hash bucket
+ * lock, we have to release the hash bucket and re-acquire it.
+ * That's OK, because we have the buffer pinned down.
+ */
+ MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
+ R_LOCK(dbenv, &dbmp->reginfo[n_cache]);
+ __db_shalloc_free(dbmp->reginfo[n_cache].addr, alloc_bhp);
+ alloc_bhp = NULL;
+ R_UNLOCK(dbenv, &dbmp->reginfo[n_cache]);
+ MUTEX_LOCK(dbenv, &hp->hash_mutex);
- ++mfp->stat.st_cache_hit;
- *(void **)addrp = bhp->buf;
- goto done;
- }
+ /*
+ * We can't use the page we found in the pool if DB_MPOOL_NEW
+ * was set. (For details, see the above comment beginning
+ * "DB_MPOOL_NEW does not guarantee you a page unreferenced by
+ * any other thread of control".) If DB_MPOOL_NEW is set, we
+ * release our pin on this particular buffer, and try to get
+ * another one.
+ */
+ if (flags == DB_MPOOL_NEW) {
+ --bhp->ref;
+ b_incr = 0;
+ goto alloc;
+ }
+ break;
+ case SECOND_MISS:
+ /*
+ * We allocated buffer space for the requested page, and found
+ * the page still missing on our second pass through the buffer
+ * cache. Instantiate the page.
+ */
+ bhp = alloc_bhp;
+ alloc_bhp = NULL;
-alloc: /* Allocate new buffer header and data space. */
- if ((ret = __memp_alloc(dbmp,
- &dbmp->reginfo[n_cache], mfp, 0, NULL, &bhp)) != 0)
- goto err;
+ /*
+ * Initialize all the BH and hash bucket fields so we can call
+ * __memp_bhfree if an error occurs.
+ *
+ * Append the buffer to the tail of the bucket list and update
+ * the hash bucket's priority.
+ */
+ b_incr = 1;
+
+ memset(bhp, 0, sizeof(BH));
+ bhp->ref = 1;
+ bhp->priority = UINT32_T_MAX;
+ bhp->pgno = *pgnoaddr;
+ bhp->mf_offset = mf_offset;
+ SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, bhp, hq);
+ hp->hash_priority =
+ SH_TAILQ_FIRST(&hp->hash_bucket, __bh)->priority;
+
+ /* If we extended the file, make sure the page is never lost. */
+ if (extending) {
+ ++hp->hash_page_dirty;
+ F_SET(bhp, BH_DIRTY | BH_DIRTY_CREATE);
+ }
- ++c_mp->stat.st_page_clean;
+ /*
+ * If we created the page, zero it out. If we didn't create
+ * the page, read from the backing file.
+ *
+ * !!!
+ * DB_MPOOL_NEW doesn't call the pgin function.
+ *
+ * If DB_MPOOL_CREATE is used, then the application's pgin
+ * function has to be able to handle pages of 0's -- if it
+ * uses DB_MPOOL_NEW, it can detect all of its page creates,
+ * and not bother.
+ *
+ * If we're running in diagnostic mode, smash any bytes on the
+ * page that are unknown quantities for the caller.
+ *
+ * Otherwise, read the page into memory, optionally creating it
+ * if DB_MPOOL_CREATE is set.
+ */
+ if (extending) {
+ if (mfp->clear_len == 0)
+ memset(bhp->buf, 0, mfp->stat.st_pagesize);
+ else {
+ memset(bhp->buf, 0, mfp->clear_len);
+#if defined(DIAGNOSTIC) || defined(UMRW)
+ memset(bhp->buf + mfp->clear_len, CLEAR_BYTE,
+ mfp->stat.st_pagesize - mfp->clear_len);
+#endif
+ }
- /*
- * Initialize the BH fields so that we can call the __memp_bhfree
- * routine if an error occurs.
- */
- memset(bhp, 0, sizeof(BH));
- bhp->ref = 1;
- bhp->pgno = *pgnoaddr;
- bhp->mf_offset = mf_offset;
+ if (flags == DB_MPOOL_CREATE && mfp->ftype != 0)
+ F_SET(bhp, BH_CALLPGIN);
- /* Increment the count of buffers referenced by this MPOOLFILE. */
- ++mfp->block_cnt;
+ ++mfp->stat.st_page_create;
+ } else {
+ F_SET(bhp, BH_TRASH);
+ ++mfp->stat.st_cache_miss;
+ }
- /*
- * Prepend the bucket header to the head of the appropriate MPOOL
- * bucket hash list. Append the bucket header to the tail of the
- * MPOOL LRU chain.
- */
- SH_TAILQ_INSERT_HEAD(&dbht[n_bucket], bhp, hq, __bh);
- SH_TAILQ_INSERT_TAIL(&c_mp->bhq, bhp, q);
+ /* Increment buffer count referenced by MPOOLFILE. */
+ MUTEX_LOCK(dbenv, &mfp->mutex);
+ ++mfp->block_cnt;
+ MUTEX_UNLOCK(dbenv, &mfp->mutex);
-#ifdef DIAGNOSTIC
- if ((db_alignp_t)bhp->buf & (sizeof(size_t) - 1)) {
- __db_err(dbenv, "Internal error: BH data NOT size_t aligned.");
- ret = EINVAL;
- __memp_bhfree(dbmp, bhp, 1);
- goto err;
+ /*
+ * Initialize the mutex. This is the last initialization step,
+ * because it's the only one that can fail, and everything else
+ * must be set up before we can safely jump to the err label,
+ * which calls __memp_bhfree.
+ */
+ if ((ret = __db_mutex_setup(dbenv,
+ &dbmp->reginfo[n_cache], &bhp->mutex, 0)) != 0)
+ goto err;
}
-#endif
- if ((ret = __db_shmutex_init(dbenv, &bhp->mutex,
- R_OFFSET(dbmp->reginfo, &bhp->mutex) + DB_FCNTL_OFF_MPOOL,
- 0, &dbmp->reginfo[n_cache],
- (REGMAINT *)R_ADDR(&dbmp->reginfo[n_cache], c_mp->maint_off)))
- != 0) {
- __memp_bhfree(dbmp, bhp, 1);
- goto err;
+ DB_ASSERT(bhp->ref != 0);
+
+ /*
+ * If we're the only reference, update buffer and bucket priorities.
+ * We may be about to release the hash bucket lock, so everything
+ * should be correct first. (We've already done this if we created
+ * the buffer, so there is no need to do it again.)
+ */
+ if (state != SECOND_MISS && bhp->ref == 1) {
+ bhp->priority = UINT32_T_MAX;
+ SH_TAILQ_REMOVE(&hp->hash_bucket, bhp, hq, __bh);
+ SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, bhp, hq);
+ hp->hash_priority =
+ SH_TAILQ_FIRST(&hp->hash_bucket, __bh)->priority;
}
/*
- * If we created the page, zero it out and continue.
- *
- * !!!
- * Note: DB_MPOOL_NEW specifically doesn't call the pgin function.
- * If DB_MPOOL_CREATE is used, then the application's pgin function
- * has to be able to handle pages of 0's -- if it uses DB_MPOOL_NEW,
- * it can detect all of its page creates, and not bother.
+ * BH_TRASH --
+ * The buffer we found may need to be filled from the disk.
*
- * If we're running in diagnostic mode, smash any bytes on the
- * page that are unknown quantities for the caller.
- *
- * Otherwise, read the page into memory, optionally creating it if
- * DB_MPOOL_CREATE is set.
+ * It's possible for the read function to fail, which means we fail as
+ * well. Note, the __memp_pgread() function discards and reacquires
+ * the hash lock, so the buffer must be pinned down so that it cannot
+ * move and its contents are unchanged. Discard the buffer on failure
+ * unless another thread is waiting on our I/O to complete. It's OK to
+ * leave the buffer around, as the waiting thread will see the BH_TRASH
+ * flag set, and will also attempt to discard it. If there's a waiter,
+ * we need to decrement our reference count.
*/
- if (LF_ISSET(DB_MPOOL_NEW | DB_MPOOL_NEW_GROUP)) {
- if (mfp->clear_len == 0)
- memset(bhp->buf, 0, mfp->stat.st_pagesize);
- else {
- memset(bhp->buf, 0, mfp->clear_len);
-#ifdef DIAGNOSTIC
- memset(bhp->buf + mfp->clear_len, CLEAR_BYTE,
- mfp->stat.st_pagesize - mfp->clear_len);
-#endif
- }
+ if (F_ISSET(bhp, BH_TRASH) &&
+ (ret = __memp_pgread(dbmfp,
+ &hp->hash_mutex, bhp, LF_ISSET(DB_MPOOL_CREATE) ? 1 : 0)) != 0)
+ goto err;
- ++mfp->stat.st_page_create;
- } else {
- /*
- * It's possible for the read function to fail, which means
- * that we fail as well. Note, the __memp_pgread() function
- * discards the region lock, so the buffer must be pinned
- * down so that it cannot move and its contents are unchanged.
- */
-reread: if ((ret = __memp_pgread(dbmfp,
- bhp, LF_ISSET(DB_MPOOL_CREATE|DB_MPOOL_EXTENT))) != 0) {
- /*
- * !!!
- * Discard the buffer unless another thread is waiting
- * on our I/O to complete. Regardless, the header has
- * the BH_TRASH flag set.
- */
- if (bhp->ref == 1)
- __memp_bhfree(dbmp, bhp, 1);
+ /*
+ * BH_CALLPGIN --
+ * The buffer was converted so it could be written to disk, and now
+ * has to be converted back before use.
+ */
+ if (F_ISSET(bhp, BH_CALLPGIN)) {
+ if ((ret = __memp_pg(dbmfp, bhp, 1)) != 0)
goto err;
- }
-
- ++mfp->stat.st_cache_miss;
+ F_CLR(bhp, BH_CALLPGIN);
}
+ MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
+
+#ifdef DIAGNOSTIC
+ /* Update the file's pinned reference count. */
+ R_LOCK(dbenv, dbmp->reginfo);
+ ++dbmfp->pinref;
+ R_UNLOCK(dbenv, dbmp->reginfo);
+
/*
- * If we're returning a page after our current notion of the last-page,
- * update our information. Note, there's no way to un-instantiate this
- * page, it's going to exist whether it's returned to us dirty or not.
+ * We want to switch threads as often as possible, and at awkward
+ * times. Yield every time we get a new page to ensure contention.
*/
- if (bhp->pgno > mfp->last_pgno)
- mfp->last_pgno = bhp->pgno;
+ if (F_ISSET(dbenv, DB_ENV_YIELDCPU))
+ __os_yield(dbenv, 1);
+#endif
*(void **)addrp = bhp->buf;
+ return (0);
-done: /* Update the chain search statistics. */
- if (st_hsearch) {
- ++c_mp->stat.st_hash_searches;
- if (st_hsearch > c_mp->stat.st_hash_longest)
- c_mp->stat.st_hash_longest = st_hsearch;
- c_mp->stat.st_hash_examined += st_hsearch;
+err: /*
+ * Discard our reference. If we hold the only reference, discard
+ * the buffer entirely. If we held a reference to a buffer, we are
+ * also still holding the hash bucket mutex.
+ */
+ if (b_incr) {
+ if (bhp->ref == 1)
+ (void)__memp_bhfree(dbmp, hp, bhp, 1);
+ else {
+ --bhp->ref;
+ MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
+ }
}
- ++dbmfp->pinref;
+ /* If alloc_bhp is set, free the memory. */
+ if (alloc_bhp != NULL)
+ __db_shalloc_free(dbmp->reginfo[n_cache].addr, alloc_bhp);
- R_UNLOCK(dbenv, dbmp->reginfo);
+ return (ret);
+}
- return (0);
+#ifdef HAVE_FILESYSTEM_NOTZERO
+/*
+ * __memp_fs_notzero --
+ * Initialize the underlying allocated pages in the file.
+ */
+static int
+__memp_fs_notzero(dbenv, dbmfp, mfp, pgnoaddr)
+ DB_ENV *dbenv;
+ DB_MPOOLFILE *dbmfp;
+ MPOOLFILE *mfp;
+ db_pgno_t *pgnoaddr;
+{
+ DB_IO db_io;
+ u_int32_t i, npages;
+ size_t nw;
+ int ret;
+ u_int8_t *page;
+ char *fail;
-err: /* Discard our reference. */
- if (b_incr)
- --bhp->ref;
- R_UNLOCK(dbenv, dbmp->reginfo);
+ /*
+ * On some systems, pages allocated by writing past end-of-file are
+ * not zeroed. Recovery could theoretically be fooled by a page
+ * showing up that contained garbage. To avoid this, we have to
+ * write the pages out to disk and flush them. The flush is needed
+ * because if we don't sync, the allocation of another page after
+ * this one might reach the disk first; if we crashed at the right
+ * moment, this page would be the one implicitly allocated by that
+ * later write -- that is, left full of garbage.
+ *
+ * Hash is the only access method that allocates groups of pages. We
+ * know that it will use the existence of the last page in a group to
+ * signify that the entire group is OK; so we write all the pages but
+ * the last one in the group, flush them to disk, and then write the
+ * last one to disk and flush it.
+ */
+ if ((ret = __os_calloc(dbenv, 1, mfp->stat.st_pagesize, &page)) != 0)
+ return (ret);
+
+ db_io.fhp = dbmfp->fhp;
+ db_io.mutexp = dbmfp->mutexp;
+ db_io.pagesize = db_io.bytes = mfp->stat.st_pagesize;
+ db_io.buf = page;
+
+ npages = *pgnoaddr - mfp->last_pgno;
+ for (i = 1; i < npages; ++i) {
+ db_io.pgno = mfp->last_pgno + i;
+ if ((ret = __os_io(dbenv, &db_io, DB_IO_WRITE, &nw)) != 0) {
+ fail = "write";
+ goto err;
+ }
+ }
+ if (i != 1 && (ret = __os_fsync(dbenv, dbmfp->fhp)) != 0) {
+ fail = "sync";
+ goto err;
+ }
- *(void **)addrp = NULL;
+ db_io.pgno = mfp->last_pgno + npages;
+ if ((ret = __os_io(dbenv, &db_io, DB_IO_WRITE, &nw)) != 0) {
+ fail = "write";
+ goto err;
+ }
+ if ((ret = __os_fsync(dbenv, dbmfp->fhp)) != 0) {
+ fail = "sync";
+err: __db_err(dbenv, "%s: %s failed for page %lu",
+ __memp_fn(dbmfp), fail, (u_long)db_io.pgno);
+ }
+
+ __os_free(dbenv, page);
return (ret);
}
+#endif
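
The four-state logic in __memp_fget() above (FIRST_FOUND, FIRST_MISS,
SECOND_FOUND, SECOND_MISS) reduces to a search/allocate/re-search loop. A
simplified, self-contained sketch of that loop follows; hash_lookup(),
cache_alloc(), cache_free(), bucket_lock() and bucket_unlock() are
hypothetical stand-ins for the hash-chain walk, __memp_alloc(),
__db_shalloc_free() and the hash-bucket mutex, and details such as the
BH_LOCKED wait loop, the DB_MPOOL_NEW page-number race, and the
BH_TRASH/BH_CALLPGIN fixups are deliberately omitted:

	struct bh {
		int pgno;
		int ref;
	};

	extern struct bh *hash_lookup(int pgno);
	extern struct bh *cache_alloc(void);
	extern void cache_free(struct bh *bhp);
	extern void bucket_lock(void);
	extern void bucket_unlock(void);

	static struct bh *
	fget_sketch(int pgno)
	{
		struct bh *alloc_bhp, *bhp;

		alloc_bhp = NULL;
		for (;;) {
			bucket_lock();
			bhp = hash_lookup(pgno);
			if (bhp != NULL) {
				/* FIRST_FOUND or SECOND_FOUND: pin the buffer. */
				++bhp->ref;
				if (alloc_bhp != NULL)	/* Lost the race: free ours. */
					cache_free(alloc_bhp);
				bucket_unlock();
				return (bhp);
			}
			if (alloc_bhp != NULL) {
				/* SECOND_MISS: install the buffer we allocated. */
				alloc_bhp->pgno = pgno;
				alloc_bhp->ref = 1;
				/* ...link into the bucket, zero or read the page... */
				bucket_unlock();
				return (alloc_bhp);
			}
			/* FIRST_MISS: can't allocate while holding the bucket lock. */
			bucket_unlock();
			if ((alloc_bhp = cache_alloc()) == NULL)
				return (NULL);
		}
	}

The key invariant matches the real code: buffer memory is never allocated
while the hash bucket mutex is held, so a second pass over the bucket is
always required to detect a racing thread that installed the same page first.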