summaryrefslogtreecommitdiff
path: root/src/mp
diff options
context:
space:
mode:
Diffstat (limited to 'src/mp')
-rw-r--r--src/mp/mp_alloc.c320
-rw-r--r--src/mp/mp_backup.c7
-rw-r--r--src/mp/mp_bh.c14
-rw-r--r--src/mp/mp_fget.c170
-rw-r--r--src/mp/mp_fmethod.c58
-rw-r--r--src/mp/mp_fopen.c79
-rw-r--r--src/mp/mp_fput.c5
-rw-r--r--src/mp/mp_fset.c2
-rw-r--r--src/mp/mp_method.c21
-rw-r--r--src/mp/mp_mvcc.c20
-rw-r--r--src/mp/mp_region.c260
-rw-r--r--src/mp/mp_register.c2
-rw-r--r--src/mp/mp_resize.c121
-rw-r--r--src/mp/mp_stat.c73
-rw-r--r--src/mp/mp_sync.c21
-rw-r--r--src/mp/mp_trickle.c2
16 files changed, 858 insertions, 317 deletions
diff --git a/src/mp/mp_alloc.c b/src/mp/mp_alloc.c
index dc331215..011f54c6 100644
--- a/src/mp/mp_alloc.c
+++ b/src/mp/mp_alloc.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -22,8 +22,112 @@
#endif
/*
+ * __memp_bh_unreachable --
+ *
+ * Determine whether this buffer can not ever be seen again: is the next
+ * newer version visible to the same transaction which sees this one?
+ * If both versions are visible to the same transaction, there is no
+ * reason to keep the older one: it can be purged.
+ *
+ * If this buffer has a more recent version, and there is a transaction
+ * with a read_lsn between this buffer's and that more recent version's,
+ * the buffer is visible to at least that transaction, so return FALSE.
+ * Otherwise return TRUE.
+ *
+ * txns: 3/10 2/10 2/5 2/1 1/10
+ * vers: 3/15 2/15 2/14 2/10 2/8 1/150
+ * vis vis unreach vis unreach vis
+ * who new txns 3/10 2/10 2/5, 2/1
+ * sees
+ *
+ * Note: in the above example, the page was allocated after txn 1/10
+ * started. 1/10 would not see any version of the page.
+ *
+ * PUBLIC: int __memp_bh_unreachable __P((ENV *, BH *, DB_LSN *, int));
+ */
+int
+__memp_bh_unreachable(env, bhp, snapshots, n_snapshots)
+ ENV *env;
+ BH *bhp;
+ DB_LSN *snapshots;
+ int n_snapshots;
+{
+ BH *newer_bhp;
+ DB_LSN b_vlsn, n_vlsn;
+ int i, ret;
+#ifdef DIAGNOSTIC
+ DB_MPOOL *dbmp;
+ DB_MSGBUF mb;
+ MPOOLFILE *bh_mfp;
+#endif
+
+ /*
+ * The buffer can't be purged if it is being used, or is the most recent
+ * version, or the next newer version isn't a copy yet.
+ */
+ if (BH_REFCOUNT(bhp) != 0 ||
+ (newer_bhp = SH_CHAIN_NEXT(bhp, vc, __bh)) == NULL ||
+ newer_bhp->td_off == INVALID_ROFF)
+ return (FALSE);
+
+ /*
+ * Find the visibility LSNs for this buffer (b_vlsn) and the more recent,
+ * newer buffer (n_vlsn). If the newer version hasn't committed yet the
+ * bhp could be needed.
+ */
+ n_vlsn = *VISIBLE_LSN(env, newer_bhp);
+ if (IS_MAX_LSN(n_vlsn))
+ return (FALSE);
+ if (bhp->td_off == INVALID_ROFF)
+ INIT_LSN(b_vlsn);
+ else
+ b_vlsn = *VISIBLE_LSN(env, bhp);
+
+ ret = TRUE;
+ /*
+ * Look for a transaction which is between n_lsn and b_lsn - determining
+ * that bhp is reachable. Stop looking once the transactions get so
+ * small (old) that they precede the buffer's version; no earlier txn
+ * could be between n_vlsn and b_vlsn.
+ */
+ for (i = 0;
+ i < n_snapshots && LOG_COMPARE(&snapshots[i], &b_vlsn) >= 0;
+ i++) {
+ if (LOG_COMPARE(&snapshots[i], &n_vlsn) < 0) {
+ /*
+ * This txn can see (started after) bhp, but not
+ * newer_bhp (which committed after this txn started).
+ */
+ ret = FALSE;
+ break;
+ }
+ }
+
+#ifdef DIAGNOSTIC
+ if (FLD_ISSET(env->dbenv->verbose, DB_VERB_MVCC)) {
+ dbmp = env->mp_handle;
+ bh_mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
+ DB_MSGBUF_INIT(&mb);
+ __db_msgadd(env, &mb,
+ "bh_unreachable %s pgno %d %s %lu/%lu %x newer %lu/%lu txn #%d in\n",
+ __memp_fns(dbmp, bh_mfp), bhp->pgno,
+ ret ? "purgeable" : "needed",
+ (u_long)b_vlsn.file, (u_long)b_vlsn.offset, bhp->flags,
+ (u_long)n_vlsn.file, (u_long)n_vlsn.offset, i);
+ for (i = 0; i != n_snapshots; i++)
+ __db_msgadd(env, &mb, " %lu/%lu",
+ (u_long)snapshots[i].file,
+ (u_long)snapshots[i].offset);
+ DB_MSGBUF_FLUSH(env, &mb);
+ }
+#endif
+ return (ret);
+}
+
+/*
* __memp_alloc --
- * Allocate some space from a cache region.
+ * Allocate some space from a cache region. If the region is full then
+ * reuse one or more cache buffers.
*
* PUBLIC: int __memp_alloc __P((DB_MPOOL *,
* PUBLIC: REGINFO *, MPOOLFILE *, size_t, roff_t *, void *));
@@ -39,7 +143,7 @@ __memp_alloc(dbmp, infop, mfp, len, offsetp, retp)
{
BH *bhp, *current_bhp, *mvcc_bhp, *oldest_bhp;
BH_FROZEN_PAGE *frozen_bhp;
- DB_LSN oldest_reader, vlsn;
+ DB_LSN *snapshots, vlsn;
DB_MPOOL_HASH *dbht, *hp, *hp_end, *hp_saved, *hp_tmp;
ENV *env;
MPOOL *c_mp;
@@ -49,7 +153,7 @@ __memp_alloc(dbmp, infop, mfp, len, offsetp, retp)
u_int32_t dirty_eviction, high_priority, priority, versions;
u_int32_t priority_saved, put_counter, lru_generation, total_buckets;
int aggressive, alloc_freeze, b_lock, giveup;
- int h_locked, need_free, obsolete, ret, write_error;
+ int h_locked, need_free, n_snapshots, obsolete, ret, write_error;
u_int8_t *endp;
void *p;
@@ -58,11 +162,10 @@ __memp_alloc(dbmp, infop, mfp, len, offsetp, retp)
dbht = R_ADDR(infop, c_mp->htab);
hp_end = &dbht[c_mp->htab_buckets];
hp_saved = NULL;
- priority_saved = 0;
- write_error = 0;
-
+ snapshots = NULL;
+ priority_saved = write_error = 0;
buckets = buffers = put_counter = total_buckets = versions = 0;
- aggressive = alloc_freeze = giveup = h_locked = 0;
+ aggressive = alloc_freeze = giveup = h_locked = n_snapshots = 0;
/*
* If we're allocating a buffer, and the one we're discarding is the
@@ -138,13 +241,15 @@ found: if (offsetp != NULL)
c_mp->stat.st_alloc_pages, buffers, infop->id);
}
#endif
- return (0);
+ goto done;
} else if (giveup || c_mp->pages == 0) {
MPOOL_REGION_UNLOCK(env, infop);
__db_errx(env, DB_STR("3017",
"unable to allocate space from the buffer cache"));
- return ((ret == ENOMEM && write_error != 0) ? EIO : ret);
+ if (ret == ENOMEM && write_error != 0)
+ ret = EIO;
+ goto done;
}
search:
@@ -158,7 +263,6 @@ search:
lru_generation = c_mp->lru_generation;
ret = 0;
- MAX_LSN(oldest_reader);
/*
* We re-attempt the allocation every time we've freed 3 times what
@@ -222,6 +326,13 @@ search:
goto alloc;
MPOOL_REGION_UNLOCK(env, infop);
+ /* Refresh the list of mvcc reader transactions. */
+ if (snapshots != NULL)
+ __os_free(env, snapshots);
+ if ((ret = __txn_get_readers(
+ env, &snapshots, &n_snapshots)) != 0)
+ goto err;
+
aggressive++;
/*
* Once aggressive, we consider all buffers. By setting
@@ -266,13 +377,6 @@ search:
if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL)
continue;
- /* Set aggressive if we have already searched for too long. */
- if (aggressive == 0 && buckets >= MPOOL_ALLOC_SEARCH_LIMIT) {
- aggressive = 1;
- /* Once aggressive, we consider all buffers. */
- high_priority = MPOOL_LRU_MAX;
- }
-
/* Unlock the region and lock the hash bucket. */
MPOOL_REGION_UNLOCK(env, infop);
MUTEX_READLOCK(env, hp->mtx_hash);
@@ -280,29 +384,45 @@ search:
b_lock = 0;
/*
+ * Set aggressive to consider all buffers if we have already
+ * searched in too many buckets.
+ */
+ if (buckets > MPOOL_ALLOC_SEARCH_LIMIT && aggressive == 0) {
+ aggressive = 1;
+ /* Once aggressive, we consider all buffers. */
+ high_priority = MPOOL_LRU_MAX;
+ if (snapshots == NULL && (ret = __txn_get_readers(
+ env, &snapshots, &n_snapshots)) != 0)
+ goto err;
+ }
+
+ /*
* Find a buffer we can use.
+ * Skip over refcount > 0 buffers; we can't get rid of them.
*
- * We use the lowest-LRU singleton buffer if we find one and
- * it's better than the result of another hash bucket we've
+ * Without MVCC we use the lowest-LRU singleton buffer we find
+ * that's better than the result of another hash bucket we've
* reviewed. We do not use a buffer which has a priority
* greater than high_priority unless we are being aggressive.
*
- * With MVCC buffers, the situation is more complicated: we
- * don't want to free a buffer out of the middle of an MVCC
- * chain, since that requires I/O. So, walk the buffers,
- * looking for an obsolete buffer at the end of an MVCC chain.
- * Once a buffer becomes obsolete, its LRU priority is
- * irrelevant because that version can never be accessed again.
+ * MVCC requires looking at additional factors: we don't want to
+ * free a still-relevant buffer out of the middle of an MVCC
+ * chain, since that requires freezing - lots of I/O. So,
+ * walk the buffers, looking for an obsolete buffer at the
+ * end of the MVCC chain. Once a buffer becomes obsolete, its
+ * LRU priority is irrelevant because that version can never
+ * be accessed again.
*
* If we don't find any obsolete MVCC buffers, we will get
* aggressive, and in that case consider the lowest priority
* buffer within a chain.
- *
- * Ignore referenced buffers, we can't get rid of them.
*/
retry_search: bhp = NULL;
bucket_priority = high_priority;
obsolete = 0;
+ if (n_snapshots > 0 && LOG_COMPARE(&snapshots[n_snapshots - 1],
+ &hp->old_reader) > 0)
+ hp->old_reader = snapshots[n_snapshots - 1];
SH_TAILQ_FOREACH(current_bhp, &hp->hash_bucket, hq, __bh) {
/*
* First, do the standard LRU check for singletons.
@@ -340,55 +460,63 @@ retry_search: bhp = NULL;
mvcc_bhp != NULL;
oldest_bhp = mvcc_bhp,
mvcc_bhp = SH_CHAIN_PREV(mvcc_bhp, vc, __bh)) {
+ DB_ASSERT(env, mvcc_bhp !=
+ SH_CHAIN_PREV(mvcc_bhp, vc, __bh));
#ifdef MPOOL_ALLOC_SEARCH_DYN
if (aggressive == 0 &&
- ++high_priority >= c_mp->lru_priority)
+ ++high_priority >= c_mp->lru_priority) {
aggressive = 1;
+ if (snapshots == NULL && (ret =
+ __txn_readers(env,
+ &snapshots, &n_snapshots)) != 0)
+ goto err;
+ }
#endif
- DB_ASSERT(env, mvcc_bhp !=
- SH_CHAIN_PREV(mvcc_bhp, vc, __bh));
- if ((aggressive < 2 &&
- ++versions < (buffers >> 2)) ||
- BH_REFCOUNT(mvcc_bhp) != 0)
+ if (n_snapshots > 0 &&
+ __memp_bh_unreachable(env,
+ mvcc_bhp, snapshots, n_snapshots)) {
+ oldest_bhp = mvcc_bhp;
+ goto is_obsolete;
+ }
+ if (bhp != NULL &&
+ mvcc_bhp->priority >= bhp->priority)
+ continue;
+ if (BH_REFCOUNT(mvcc_bhp) != 0)
+ continue;
+ /*
+ * Since taking still-relevant versions requires
+ * freezing, skip over them at low aggression
+ * levels unless we see that a high proportion
+ * of buffers (over 1/4) are MVCC copies.
+ */
+ if (aggressive < 2 &&
+ ++versions < (buffers >> 2))
continue;
buffers++;
- if (!F_ISSET(mvcc_bhp, BH_FROZEN) &&
- (bhp == NULL ||
- bhp->priority > mvcc_bhp->priority)) {
- if (bhp != NULL)
- atomic_dec(env, &bhp->ref);
- bhp = mvcc_bhp;
- atomic_inc(env, &bhp->ref);
- }
+ if (F_ISSET(mvcc_bhp, BH_FROZEN))
+ continue;
+ /*
+ * Select mvcc_bhp as current best candidate,
+ * releasing the current candidate, if any.
+ */
+ if (bhp != NULL)
+ atomic_dec(env, &bhp->ref);
+ bhp = mvcc_bhp;
+ atomic_inc(env, &bhp->ref);
}
/*
* oldest_bhp is the last buffer on the MVCC chain, and
* an obsolete buffer at the end of the MVCC chain gets
- * used without further search. Before checking for
- * obsolescence, update the cached oldest reader LSN in
- * the bucket if it is older than call's oldest_reader.
+ * used without further search.
*/
if (BH_REFCOUNT(oldest_bhp) != 0)
continue;
- if (LOG_COMPARE(&oldest_reader, &hp->old_reader) > 0) {
- if (IS_MAX_LSN(oldest_reader) &&
- (ret = __txn_oldest_reader(
- env, &oldest_reader)) != 0) {
- MUTEX_UNLOCK(env, hp->mtx_hash);
- if (bhp != NULL)
- atomic_dec(env, &bhp->ref);
- return (ret);
- }
- if (LOG_COMPARE(&oldest_reader,
- &hp->old_reader) > 0)
- hp->old_reader = oldest_reader;
- }
-
if (BH_OBSOLETE(oldest_bhp, hp->old_reader, vlsn)) {
if (aggressive < 2)
buffers++;
+is_obsolete:
obsolete = 1;
if (bhp != NULL)
atomic_dec(env, &bhp->ref);
@@ -410,10 +538,18 @@ retry_search: bhp = NULL;
/*
* Compare two hash buckets and select the one with the lower
- * priority. Performance testing showed looking at two improves
- * the LRU-ness and looking at more only does a little better.
+ * priority, except mvcc at high aggression levels. Performance
+ * testing shows looking at two improves the LRU-ness and
+ * looking at more only does a little better.
*/
if (hp_saved == NULL) {
+ /*
+ * At high aggressive levels when mvcc is active, stop
+ * looking for candidate once one has been found.
+ * Freezing takes more time than writing out to a db.
+ */
+ if (aggressive > 1 && n_snapshots > 1)
+ goto this_buffer;
hp_saved = hp;
priority_saved = priority;
goto next_hb;
@@ -487,11 +623,15 @@ this_buffer: /*
/* We cannot block as the caller is probably holding locks. */
if ((ret = MUTEX_TRYLOCK(env, bhp->mtx_buf)) != 0) {
- if (ret != DB_LOCK_NOTGRANTED)
- return (ret);
+ if (ret != DB_LOCK_NOTGRANTED) {
+ goto err;
+ }
+ ret = 0;
goto next_hb;
}
F_SET(bhp, BH_EXCLUSIVE);
+ if (obsolete)
+ F_SET(bhp, BH_UNREACHABLE);
b_lock = 1;
/* Someone may have grabbed it while we got the lock. */
@@ -557,7 +697,7 @@ this_buffer: /*
F_CLR(bhp, BH_EXCLUSIVE);
MUTEX_UNLOCK(env, bhp->mtx_buf);
DB_ASSERT(env, !h_locked);
- return (ret);
+ goto err;
}
}
@@ -573,16 +713,25 @@ this_buffer: /*
if (BH_REFCOUNT(bhp) != 1 || F_ISSET(bhp, BH_DIRTY) ||
(SH_CHAIN_HASNEXT(bhp, vc) &&
SH_CHAIN_NEXTP(bhp, vc, __bh)->td_off != bhp->td_off &&
- !BH_OBSOLETE(bhp, hp->old_reader, vlsn)))
+ !(obsolete || BH_OBSOLETE(bhp, hp->old_reader, vlsn)))) {
+ if (FLD_ISSET(env->dbenv->verbose, DB_VERB_MVCC))
+ __db_msg(env,
+ "memp_alloc next_hb past bhp %lx flags %x ref %d %lx/%lx",
+ (u_long)R_OFFSET(infop, bhp), bhp->flags,
+ BH_REFCOUNT(bhp),
+ (u_long)R_OFFSET(infop, SH_CHAIN_NEXTP(bhp, vc, __bh)),
+ (u_long)R_OFFSET(infop, SH_CHAIN_PREVP(bhp, vc, __bh)));
goto next_hb;
+ }
/*
* If the buffer is frozen, thaw it and look for another one
- * we can use. (Calling __memp_bh_freeze above will not
- * mark bhp BH_FROZEN.)
+ * we can use. (Calling __memp_bh_freeze above will not mark
+ * this bhp BH_FROZEN; it creates another frozen one.)
*/
if (F_ISSET(bhp, BH_FROZEN)) {
- DB_ASSERT(env, obsolete || SH_CHAIN_SINGLETON(bhp, vc));
+ DB_ASSERT(env, SH_CHAIN_SINGLETON(bhp, vc) ||
+ obsolete || BH_OBSOLETE(bhp, hp->old_reader, vlsn));
DB_ASSERT(env, BH_REFCOUNT(bhp) > 0);
if (!F_ISSET(bhp, BH_THAWED)) {
/*
@@ -592,10 +741,10 @@ this_buffer: /*
*/
if ((ret = __memp_bh_thaw(dbmp,
infop, hp, bhp, NULL)) != 0)
- return (ret);
+ goto done;
MUTEX_READLOCK(env, hp->mtx_hash);
} else {
- need_free = (atomic_dec(env, &bhp->ref) == 0);
+ need_free = atomic_dec(env, &bhp->ref) == 0;
F_CLR(bhp, BH_EXCLUSIVE);
MUTEX_UNLOCK(env, bhp->mtx_buf);
if (need_free) {
@@ -626,7 +775,10 @@ this_buffer: /*
if (alloc_freeze) {
if ((ret = __memp_bhfree(dbmp,
infop, bh_mfp, hp, bhp, 0)) != 0)
- return (ret);
+ goto err;
+ DB_ASSERT(env, bhp->mtx_buf != MUTEX_INVALID);
+ if ((ret = __mutex_free(env, &bhp->mtx_buf)) != 0)
+ goto err;
b_lock = 0;
h_locked = 0;
@@ -654,23 +806,21 @@ this_buffer: /*
}
/*
- * Check to see if the buffer is the size we're looking for.
- * If so, we can simply reuse it. Otherwise, free the buffer
- * and its space and keep looking.
+ * If the buffer is the size we're looking for, we can simply
+ * reuse it. Otherwise, free it and keep looking.
*/
if (mfp != NULL && mfp->pagesize == bh_mfp->pagesize) {
if ((ret = __memp_bhfree(dbmp,
infop, bh_mfp, hp, bhp, 0)) != 0)
- return (ret);
+ goto err;
p = bhp;
goto found;
}
freed_space += sizeof(*bhp) + bh_mfp->pagesize;
- if ((ret =
- __memp_bhfree(dbmp, infop,
- bh_mfp, hp, bhp, BH_FREE_FREEMEM)) != 0)
- return (ret);
+ if ((ret = __memp_bhfree(dbmp,
+ infop, bh_mfp, hp, bhp, BH_FREE_FREEMEM)) != 0)
+ goto err;
/* Reset "aggressive" and "write_error" if we free any space. */
if (aggressive > 1)
@@ -689,12 +839,14 @@ next_hb: if (bhp != NULL) {
if (b_lock) {
F_CLR(bhp, BH_EXCLUSIVE);
MUTEX_UNLOCK(env, bhp->mtx_buf);
+ b_lock = 0;
}
}
if (h_locked)
MUTEX_UNLOCK(env, hp->mtx_hash);
h_locked = 0;
}
+ obsolete = 0;
MPOOL_REGION_LOCK(env, infop);
/*
@@ -706,7 +858,15 @@ next_hb: if (bhp != NULL) {
if (freed_space >= 3 * len)
goto alloc;
}
- /* NOTREACHED */
+err:
+ if (h_locked) {
+ MUTEX_UNLOCK(env, hp->mtx_hash);
+ h_locked = 0;
+ }
+done:
+ if (snapshots != NULL)
+ __os_free(env, snapshots);
+ return (ret);
}
/*
diff --git a/src/mp/mp_backup.c b/src/mp/mp_backup.c
index f376cda7..f1072292 100644
--- a/src/mp/mp_backup.c
+++ b/src/mp/mp_backup.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -145,6 +145,9 @@ __memp_backup_mpf(env, mpf, ip, first_pgno, last_pgno, fp, handle, flags)
if (backup == NULL || (len = backup->size) == 0)
len = MEGABYTE;
+ /* Ensure backup page size is at least as big as db page size */
+ if (len < mfp->pagesize)
+ len = mfp->pagesize;
if ((ret = __os_malloc(env, len, &buf)) != 0)
return (ret);
write_size = (u_int32_t)(len / mfp->pagesize);
@@ -188,7 +191,7 @@ __memp_backup_mpf(env, mpf, ip, first_pgno, last_pgno, fp, handle, flags)
if (backup != NULL && backup->write != NULL) {
if ((ret = backup->write(
- env->dbenv, gigs, off, (u_int32_t)nr,
+ env->dbenv, gigs, off, (u_int32_t)nr,
buf, handle)) != 0)
break;
} else {
diff --git a/src/mp/mp_bh.c b/src/mp/mp_bh.c
index 1df8e206..30293f29 100644
--- a/src/mp/mp_bh.c
+++ b/src/mp/mp_bh.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -157,7 +157,7 @@ __memp_bhwrite(dbmp, hp, mfp, bhp, open_extents)
opened = 1;
if ((ret = __memp_fopen(dbmfp, mfp, NULL,
NULL, DB_FLUSH | DB_DURABLE_UNKNOWN, 0, mfp->pagesize)) != 0) {
- dbmfp->ref--;
+ dbmfp->ref--;
(void)__memp_fclose(dbmfp, 0);
/*
@@ -264,7 +264,7 @@ __memp_pgread(dbmfp, bhp, can_create)
* how to handle the error.
*/
if (!can_create) {
- ret = DB_PAGE_NOTFOUND;
+ ret = USR_ERR(env, DB_PAGE_NOTFOUND);
goto err;
}
@@ -557,6 +557,9 @@ err: __db_errx(env, DB_STR_A("3016",
* __memp_bhfree --
* Free a bucket header and its referenced data.
*
+ * The hash bucket is unlocked before returning except when flags includes
+ * BH_FREE_UNLOCKED -- or there was no hp passed in to begin with.
+ *
* PUBLIC: int __memp_bhfree __P((DB_MPOOL *,
* PUBLIC: REGINFO *, MPOOLFILE *, DB_MPOOL_HASH *, BH *, u_int32_t));
*/
@@ -600,10 +603,13 @@ __memp_bhfree(dbmp, infop, mfp, hp, bhp, flags)
(SH_CHAIN_NEXTP(bhp, vc, __bh)->td_off == bhp->td_off ||
bhp->td_off == INVALID_ROFF ||
IS_MAX_LSN(*VISIBLE_LSN(env, bhp)) ||
+ F_ISSET(bhp, BH_UNREACHABLE) ||
BH_OBSOLETE(bhp, hp->old_reader, vlsn))));
PERFMON3(env, mpool, evict, __memp_fns(dbmp, mfp), bhp->pgno, bhp);
-
+ if (FLD_ISSET(env->dbenv->verbose, DB_VERB_MVCC))
+ __db_msg(env, "bhfree pgno %lu roff %lx",
+ (u_long)bhp->pgno, (u_long)R_OFFSET(dbmp->reginfo, bhp));
/*
* Delete the buffer header from the hash bucket queue or the
* version chain.
diff --git a/src/mp/mp_fget.c b/src/mp/mp_fget.c
index 5f9a4bf9..270135bd 100644
--- a/src/mp/mp_fget.c
+++ b/src/mp/mp_fget.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -53,15 +53,19 @@ __memp_fget_pp(dbmfp, pgnoaddr, txnp, flags, addrp)
* time, which we don't want to do because one of our big goals in life
* is to keep database files small. It's sleazy as hell, but we catch
* any attempt to actually write the file in memp_fput().
+ *
+ * CREATE, LAST, and NEW are mutually exclusive. DIRTY and EDIT are also
+ * mutually exclusive - that is checked in __memp_fget() itself.
*/
+#undef OKMODE
#undef OKFLAGS
-#define OKFLAGS (DB_MPOOL_CREATE | DB_MPOOL_DIRTY | \
- DB_MPOOL_EDIT | DB_MPOOL_LAST | DB_MPOOL_NEW)
+#define OKMODE (DB_MPOOL_CREATE | DB_MPOOL_LAST | DB_MPOOL_NEW)
+#define OKFLAGS (OKMODE | DB_MPOOL_DIRTY | DB_MPOOL_EDIT)
if (flags != 0) {
if ((ret = __db_fchk(env, "memp_fget", flags, OKFLAGS)) != 0)
return (ret);
- switch (FLD_CLR(flags, DB_MPOOL_DIRTY | DB_MPOOL_EDIT)) {
+ switch (FLD_ISSET(flags, OKMODE)) {
case DB_MPOOL_CREATE:
case DB_MPOOL_LAST:
case DB_MPOOL_NEW:
@@ -131,6 +135,7 @@ __memp_fget(dbmfp, pgnoaddr, ip, txn, flags, addrp)
#ifdef DIAGNOSTIC
DB_LOCKTAB *lt;
DB_LOCKER *locker;
+ int pagelock_err;
#endif
*(void **)addrp = NULL;
@@ -274,7 +279,7 @@ retry: MUTEX_LOCK(env, hp->mtx_hash);
* the BTREE in a subsequent txn).
*/
if (bhp == NULL) {
- ret = DB_PAGE_NOTFOUND;
+ ret = USR_ERR(env, DB_PAGE_NOTFOUND);
goto err;
}
}
@@ -303,7 +308,10 @@ retry: MUTEX_LOCK(env, hp->mtx_hash);
MUTEX_UNLOCK(env, hp->mtx_hash);
h_locked = 0;
if (dirty || extending || makecopy || F_ISSET(bhp, BH_FROZEN)) {
-xlatch: if (LF_ISSET(DB_MPOOL_TRY)) {
+#ifdef HAVE_SHARED_LATCHES
+xlatch:
+#endif
+ if (LF_ISSET(DB_MPOOL_TRY)) {
if ((ret =
MUTEX_TRYLOCK(env, bhp->mtx_buf)) != 0)
goto err;
@@ -373,11 +381,11 @@ thawed: need_free = (atomic_dec(env, &bhp->ref) == 0);
bhp = NULL;
goto retry;
} else if (dirty && SH_CHAIN_HASNEXT(bhp, vc)) {
- ret = DB_LOCK_DEADLOCK;
+ ret = USR_ERR(env, DB_LOCK_DEADLOCK);
goto err;
} else if (F_ISSET(bhp, BH_FREED) && flags != DB_MPOOL_CREATE &&
flags != DB_MPOOL_NEW && flags != DB_MPOOL_FREE) {
- ret = DB_PAGE_NOTFOUND;
+ ret = USR_ERR(env, DB_PAGE_NOTFOUND);
goto err;
}
@@ -508,9 +516,13 @@ revive: if (F_ISSET(bhp, BH_FREED))
/*
* With multiversion databases, we might need to
* allocate a new buffer into which we can copy the one
- * that we found. In that case, check the last buffer
+ * that we found. In that case, check the old versions
* in the chain to see whether we can reuse an obsolete
- * buffer.
+ * or unreachable buffer. First see whether the oldest
+ * version is truly obsolete. If not, look for somewhat
+ * more recent versions which are no longer needed
+ * because the snapshot transactions which once could
+ * have seen them have now exited.
*
* To provide snapshot isolation, we need to make sure
* that we've seen a buffer older than the oldest
@@ -523,24 +535,17 @@ reuse: if ((makecopy || F_ISSET(bhp, BH_FROZEN)) &&
}
if ((makecopy || F_ISSET(bhp, BH_FROZEN)) &&
SH_CHAIN_HASPREV(bhp, vc)) {
- oldest_bhp = SH_CHAIN_PREVP(bhp, vc, __bh);
- while (SH_CHAIN_HASPREV(oldest_bhp, vc))
- oldest_bhp = SH_CHAIN_PREVP(
- oldest_bhp, vc, __bh);
-
- if (BH_REFCOUNT(oldest_bhp) == 0 &&
- !BH_OBSOLETE(
- oldest_bhp, hp->old_reader, vlsn) &&
- (ret = __txn_oldest_reader(env,
- &hp->old_reader)) != 0)
+ if ((ret = __memp_find_obsolete_version(env,
+ bhp, hp, &oldest_bhp)) != 0)
goto err;
-
- if (BH_OBSOLETE(
- oldest_bhp, hp->old_reader, vlsn) &&
- BH_REFCOUNT(oldest_bhp) == 0) {
+ if (oldest_bhp != NULL) {
DB_ASSERT(env,
!F_ISSET(oldest_bhp, BH_DIRTY));
atomic_inc(env, &oldest_bhp->ref);
+#ifdef HAVE_STATISTICS
+ if (SH_CHAIN_HASPREV(oldest_bhp, vc))
+ c_mp->stat.st_mvcc_reused++;
+#endif
if (F_ISSET(oldest_bhp, BH_FROZEN)) {
/*
* This call will release the
@@ -606,7 +611,7 @@ newpg: /*
mfp->last_pgno >= mfp->maxpgno) {
__db_errx(env, DB_STR_A("3023",
"%s: file limited to %lu pages", "%s %lu"),
- __memp_fn(dbmfp), (u_long)mfp->maxpgno);
+ __memp_fn(dbmfp), (u_long)mfp->maxpgno + 1);
ret = ENOSPC;
} else
*pgnoaddr = mfp->last_pgno + 1;
@@ -615,7 +620,7 @@ newpg: /*
if (mfp->maxpgno != 0 && *pgnoaddr > mfp->maxpgno) {
__db_errx(env, DB_STR_A("3024",
"%s: file limited to %lu pages", "%s %lu"),
- __memp_fn(dbmfp), (u_long)mfp->maxpgno);
+ __memp_fn(dbmfp), (u_long)mfp->maxpgno + 1);
ret = ENOSPC;
} else if (!extending)
extending = *pgnoaddr > mfp->last_pgno;
@@ -937,8 +942,17 @@ alloc: /* Allocate a new buffer header and data space. */
* need to make copy, so we now need to allocate another buffer
* to hold the new copy.
*/
- if (alloc_bhp == NULL)
+ if (alloc_bhp == NULL) {
+ if (FLD_ISSET(env->dbenv->verbose, DB_VERB_MVCC))
+ __db_msg(env,
+ "fget makecopy txn %08x %lu/%lu going to reuse pgno %d from %lu/%lu",
+ txn->txnid, td == NULL ? 0L :
+ (u_long)td->read_lsn.file, td == NULL ? 0L :
+ (u_long)td->read_lsn.offset, bhp->pgno,
+ (u_long)VISIBLE_LSN(env, bhp)->file,
+ (u_long)VISIBLE_LSN(env, bhp)->offset);
goto reuse;
+ }
DB_ASSERT(env, bhp != NULL && alloc_bhp != bhp);
DB_ASSERT(env, bhp->td_off == INVALID_ROFF ||
@@ -1019,6 +1033,15 @@ alloc: /* Allocate a new buffer header and data space. */
F_CLR(bhp, BH_EXCLUSIVE);
MUTEX_UNLOCK(env, bhp->mtx_buf);
+ if (FLD_ISSET(env->dbenv->verbose, DB_VERB_MVCC))
+ __db_msg(env,
+ "fget makecopy txn %08x %lx pgno %d from %lu/%lu",
+ txn->txnid, (u_long)R_OFFSET(infop, bhp),
+ bhp->pgno, bhp->td_off == INVALID_ROFF ? 0L :
+ (u_long)VISIBLE_LSN(env, bhp)->file,
+ bhp->td_off == INVALID_ROFF ? 0L :
+ (u_long)VISIBLE_LSN(env, bhp)->offset);
+
bhp = alloc_bhp;
DB_ASSERT(env, BH_REFCOUNT(bhp) > 0);
b_incr = 1;
@@ -1164,8 +1187,15 @@ alloc: /* Allocate a new buffer header and data space. */
lt = env->lk_handle;
locker = (DB_LOCKER *)
(R_ADDR(&lt->reginfo, ip->dbth_locker));
- DB_ASSERT(env, __db_has_pagelock(env, locker, dbmfp,
- (PAGE*)bhp->buf, DB_LOCK_WRITE) == 0);
+ pagelock_err = __db_has_pagelock(env, locker, dbmfp,
+ (PAGE *)bhp->buf, DB_LOCK_WRITE);
+ if (pagelock_err != 0) {
+ if (pagelock_err == DB_RUNRECOVERY)
+ return (pagelock_err);
+ __db_syserr(env, pagelock_err,
+ "Locker %x has no page lock for pgno %d",
+ locker->id, ((PAGE *)bhp->buf)->pgno);
+ }
}
#endif
@@ -1228,3 +1258,85 @@ err: /*
return (ret);
}
+
+/*
+ * __memp_find_obsolete_version --
+ *
+ * Search the version chain, from oldest to youngest, looking for buffers
+ * which are no longer BH_VISIBLE() to any existing transaction.
+ *
+ * The hash bucket is locked, no buffer is locked.
+ *
+ * PUBLIC: int __memp_find_obsolete_version
+ * PUBLIC: __P((ENV *, BH *, DB_MPOOL_HASH *, BH **));
+ */
+int
+__memp_find_obsolete_version(env, vis_bhp, hp, foundp)
+ ENV *env;
+ BH *vis_bhp;
+ DB_MPOOL_HASH *hp;
+ BH **foundp;
+{
+ BH *bhp;
+ DB_LSN *readers, vlsn;
+ int n_readers, ret;
+
+ *foundp = NULL;
+ readers = NULL;
+ ret = 0;
+ bhp = SH_CHAIN_PREVP(vis_bhp, vc, __bh);
+ while (SH_CHAIN_HASPREV(bhp, vc))
+ bhp = SH_CHAIN_PREVP(bhp, vc, __bh);
+
+ /*
+ * The least-expensive case is finding an obsolete version without
+ * needing to build the active snapshot transaction list.
+ */
+ if (BH_OBSOLETE(bhp, hp->old_reader, vlsn) && BH_REFCOUNT(bhp) == 0) {
+ *foundp = bhp;
+ goto out;
+ }
+
+ if ((ret = __txn_get_readers(env, &readers, &n_readers)) != 0)
+ goto out;
+
+ if (LOG_COMPARE(&readers[n_readers - 1], &hp->old_reader) > 0) {
+ hp->old_reader = readers[n_readers - 1];
+ if (BH_OBSOLETE(bhp, hp->old_reader, vlsn) &&
+ BH_REFCOUNT(bhp) == 0) {
+ *foundp = bhp;
+ goto cleanup;
+ }
+ }
+
+ while ((bhp = SH_CHAIN_NEXT(bhp, vc, __bh)) != vis_bhp) {
+ if (BH_REFCOUNT(bhp) == 0 &&
+ __memp_bh_unreachable(env, bhp, readers, n_readers)) {
+ *foundp = bhp;
+#ifdef DIAGNOSTIC
+ /*
+ * Usually when the hash bucket is locked, the refcount
+ * is incremented and the bucket unlocked before the
+ * buffer is locked; this avoids mtx_buf deadlocks.
+ * This unreachable version cannot be involved with any
+ * deadlock-creating locking, though the head of the
+ * version chain could be locked. No TRYLOCK needed.
+ */
+ MUTEX_LOCK(env, bhp->mtx_buf);
+ F_SET(bhp, BH_UNREACHABLE);
+ MUTEX_UNLOCK(env, bhp->mtx_buf);
+#endif
+ break;
+ }
+ }
+
+cleanup:
+ if (readers != NULL)
+ __os_free(env, readers);
+out:
+ if (FLD_ISSET(env->dbenv->verbose, DB_VERB_MVCC) && *foundp != NULL)
+ __db_msg(env, "fget reusing %p pgno %d @%lu/%lu", bhp,
+ bhp->pgno, (u_long)VISIBLE_LSN(env, bhp)->file,
+ (u_long)VISIBLE_LSN(env, bhp)->offset);
+ return (ret);
+}
diff --git a/src/mp/mp_fmethod.c b/src/mp/mp_fmethod.c
index 41bd638c..4974f57c 100644
--- a/src/mp/mp_fmethod.c
+++ b/src/mp/mp_fmethod.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -315,7 +315,7 @@ __memp_set_lsn_offset(dbmfp, lsn_offset)
/*
* __memp_get_maxsize --
- * Get the file's maximum size.
+ * Get the file's maximum size, returning zeroes if none is set.
*/
static int
__memp_get_maxsize(dbmfp, gbytesp, bytesp)
@@ -334,11 +334,22 @@ __memp_get_maxsize(dbmfp, gbytesp, bytesp)
ENV_ENTER(env, ip);
MUTEX_LOCK(env, mfp->mutex);
- *gbytesp = (u_int32_t)
- (mfp->maxpgno / (GIGABYTE / mfp->pagesize));
- *bytesp = (u_int32_t)
- ((mfp->maxpgno % (GIGABYTE / mfp->pagesize)) *
- mfp->pagesize);
+ if (mfp->maxpgno == 0) {
+ *gbytesp = *bytesp = 0;
+ } else {
+ *gbytesp = (u_int32_t)
+ (mfp->maxpgno / (GIGABYTE / mfp->pagesize));
+ *bytesp = (u_int32_t) (mfp->maxpgno %
+ (GIGABYTE / mfp->pagesize) + 1) * mfp->pagesize;
+ /*
+ * After converting from 0-based maxpgno to #pages, we
+ * might have bumped into the next gigabyte boundary.
+ */
+ if (*bytesp >= GIGABYTE) {
+ *bytesp -= GIGABYTE;
+ *gbytesp += 1;
+ }
+ }
MUTEX_UNLOCK(env, mfp->mutex);
ENV_LEAVE(env, ip);
@@ -348,8 +359,34 @@ __memp_get_maxsize(dbmfp, gbytesp, bytesp)
}
/*
+ * __memp_set_maxpgno --
+ * Set the file's maxpgno from the configured max size. If that size is
+ * pagesize or less then the filesize limit is disabled.
+ *
+ * PUBLIC: void __memp_set_maxpgno __P((MPOOLFILE *, u_int32_t, u_int32_t));
+ */
+void
+__memp_set_maxpgno(mfp, gbytes, bytes)
+ MPOOLFILE *mfp;
+ u_int32_t gbytes, bytes;
+{
+ if (gbytes == 0 && bytes <= mfp->pagesize)
+ mfp->maxpgno = 0;
+ else {
+ mfp->maxpgno = (db_pgno_t)
+ (gbytes * (GIGABYTE / mfp->pagesize));
+ /* Round up to account for any fractional page. */
+ mfp->maxpgno += (db_pgno_t)
+ ((bytes + mfp->pagesize - 1) / mfp->pagesize);
+ /* Convert from #pages to the zero-based max pgno. */
+ mfp->maxpgno--;
+ }
+}
+
+/*
* __memp_set_maxsize --
- * Set the file's maximum size.
+ * Set the file's maximum size; if the size is <= pagesize then
+ * remove any file size limit.
*/
static int
__memp_set_maxsize(dbmfp, gbytes, bytes)
@@ -368,10 +405,7 @@ __memp_set_maxsize(dbmfp, gbytes, bytes)
ENV_ENTER(env, ip);
MUTEX_LOCK(env, mfp->mutex);
- mfp->maxpgno = (db_pgno_t)
- (gbytes * (GIGABYTE / mfp->pagesize));
- mfp->maxpgno += (db_pgno_t)
- ((bytes + mfp->pagesize - 1) / mfp->pagesize);
+ __memp_set_maxpgno(mfp, gbytes, bytes);
MUTEX_UNLOCK(env, mfp->mutex);
ENV_LEAVE(env, ip);
diff --git a/src/mp/mp_fopen.c b/src/mp/mp_fopen.c
index ef7f886a..dbe7b9c8 100644
--- a/src/mp/mp_fopen.c
+++ b/src/mp/mp_fopen.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -89,8 +89,9 @@ __memp_fopen_pp(dbmfp, path, flags, mode, pagesize)
* Generate the number of user opens. If there is no backing file
* there is an extra open count to keep the in memory db around.
*/
-#define MFP_OPEN_CNT(mfp) ((mfp)->mpf_cnt - ((mfp)->neutral_cnt + \
+#define MFP_OPEN_CNT(mfp) ((mfp)->mpf_cnt - ((mfp)->neutral_cnt + \
(u_int32_t)(mfp)->no_backing_file))
+#define MP_IOINFO_RETRIES 5
/*
* __memp_fopen --
* DB_MPOOLFILE->open.
@@ -118,7 +119,7 @@ __memp_fopen(dbmfp, mfp, path, dirp, flags, mode, pgsize)
size_t maxmap;
db_pgno_t last_pgno;
u_int32_t bucket, mbytes, bytes, oflags, pagesize;
- int refinc, ret, isdir;
+ int isdir, refinc, ret, tries;
char *rpath;
/* If this handle is already open, return. */
@@ -249,7 +250,7 @@ __memp_fopen(dbmfp, mfp, path, dirp, flags, mode, pgsize)
if (MFP_OPEN_CNT(mfp) > 0 &&
atomic_read(&mfp->multiversion) == 0) {
mvcc_err: __db_errx(env, DB_STR("3041",
-"DB_MULTIVERSION cannot be specified on a database file which is already open"));
+"DB_MULTIVERSION cannot be specified on a database file that is already open"));
ret = EINVAL;
goto err;
}
@@ -399,11 +400,44 @@ mvcc_err: __db_errx(env, DB_STR("3041",
if (LF_ISSET(DB_ODDFILESIZE))
bytes -= (u_int32_t)(bytes % pagesize);
else {
- __db_errx(env, DB_STR_A("3037",
- "%s: file size not a multiple of the pagesize", "%s"),
- rpath);
- ret = EINVAL;
- goto err;
+ /*
+ * If the file size is not a multiple of the
+ * pagesize, it is likely because the ioinfo
+ * call is racing with a write that is extending
+ * the file. Many file systems will extend
+ * in fs block size units, and if the pagesize
+ * is larger than that, we can briefly see a
+ * file size that is not a multiple of pagesize.
+ *
+ * Yield the processor to allow that to finish
+ * and try again a few times.
+ */
+ tries = 0;
+ STAT((mp->stat.st_oddfsize_detect++));
+ while (tries < MP_IOINFO_RETRIES) {
+ if ((ret = __os_ioinfo(env, rpath,
+ dbmfp->fhp, &mbytes, &bytes,
+ NULL)) != 0) {
+ __db_err(env, ret, "%s", rpath);
+ goto err;
+ }
+ if (bytes % pagesize != 0) {
+ __os_yield(env, 0, 50000);
+ tries++;
+ } else {
+ STAT((
+ mp->stat.st_oddfsize_resolve++));
+ break;
+ }
+ }
+ if (tries == MP_IOINFO_RETRIES) {
+ __db_errx(env, DB_STR_A("3043",
+ "%s: file size (%lu %lu) not a multiple of the pagesize %lu",
+ "%s %lu %lu %lu"),
+ rpath, (u_long)mbytes, (u_long)bytes, (u_long)pagesize);
+ ret = EINVAL;
+ goto err;
+ }
}
}
@@ -786,13 +820,7 @@ __memp_mpf_alloc(dbmp, dbmfp, path, pagesize, flags, retmfp)
mfp->lsn_off = dbmfp->lsn_offset;
mfp->clear_len = dbmfp->clear_len;
mfp->priority = dbmfp->priority;
- if (dbmfp->gbytes != 0 || dbmfp->bytes != 0) {
- mfp->maxpgno = (db_pgno_t)
- (dbmfp->gbytes * (GIGABYTE / mfp->pagesize));
- mfp->maxpgno += (db_pgno_t)
- ((dbmfp->bytes + mfp->pagesize - 1) /
- mfp->pagesize);
- }
+ __memp_set_maxpgno(mfp, dbmfp->gbytes, dbmfp->bytes);
if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE))
mfp->no_backing_file = 1;
if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_UNLINK))
@@ -1019,6 +1047,7 @@ __memp_fclose(dbmfp, flags)
ret = t_ret;
__os_free(env, rpath);
}
+ mfp->unlink_on_close = 0;
}
if (MFP_OPEN_CNT(mfp) == 0) {
F_CLR(mfp, MP_NOT_DURABLE);
@@ -1068,6 +1097,7 @@ __memp_mf_discard(dbmp, mfp, hp_locked)
DB_MPOOL_STAT *sp;
#endif
MPOOL *mp;
+ char *rpath;
int need_sync, ret, t_ret;
env = dbmp->env;
@@ -1095,6 +1125,23 @@ __memp_mf_discard(dbmp, mfp, hp_locked)
*/
mfp->deadfile = 1;
+ /* We should unlink the file if necessary. */
+ if (mfp->block_cnt == 0 && mfp->mpf_cnt == 0 && mfp->unlink_on_close &&
+ !F_ISSET(mfp, MP_TEMP) && !mfp->no_backing_file) {
+ if ((t_ret = __db_appname(env, DB_APP_DATA,
+ R_ADDR(dbmp->reginfo, mfp->path_off), NULL,
+ &rpath)) != 0 && ret == 0)
+ ret = t_ret;
+ if (t_ret == 0) {
+ if ((t_ret = __os_unlink(
+ dbmp->env, rpath, 0)) != 0 && ret == 0)
+ ret = t_ret;
+ __os_free(env, rpath);
+ }
+ mfp->unlink_on_close = 0;
+ need_sync = 0;
+ }
+
/* Discard the mutex we're holding and return it too the pool. */
MUTEX_UNLOCK(env, mfp->mutex);
if ((t_ret = __mutex_free(env, &mfp->mutex)) != 0 && ret == 0)
diff --git a/src/mp/mp_fput.c b/src/mp/mp_fput.c
index 7a900fd0..06b30fd4 100644
--- a/src/mp/mp_fput.c
+++ b/src/mp/mp_fput.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -52,7 +52,8 @@ __memp_fput_pp(dbmfp, pgaddr, priority, flags)
/*
* __memp_fput --
- * DB_MPOOLFILE->put.
+ * DB_MPOOLFILE->put. Release this reference to the page. If the reference
+ * count drops to zero, adjust the buffer's cache priority.
*
* PUBLIC: int __memp_fput __P((DB_MPOOLFILE *,
* PUBLIC: DB_THREAD_INFO *, void *, DB_CACHE_PRIORITY));
diff --git a/src/mp/mp_fset.c b/src/mp/mp_fset.c
index 1129853f..770ec5c8 100644
--- a/src/mp/mp_fset.c
+++ b/src/mp/mp_fset.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/mp/mp_method.c b/src/mp/mp_method.c
index 7afae248..56d6c42b 100644
--- a/src/mp/mp_method.c
+++ b/src/mp/mp_method.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -67,6 +67,7 @@ __memp_get_cachesize(dbenv, gbytesp, bytesp, ncachep)
int *ncachep;
{
DB_MPOOL *dbmp;
+ DB_THREAD_INFO *ip;
ENV *env;
MPOOL *mp;
@@ -78,12 +79,16 @@ __memp_get_cachesize(dbenv, gbytesp, bytesp, ncachep)
if (MPOOL_ON(env)) {
dbmp = env->mp_handle;
mp = dbmp->reginfo[0].primary;
+ ENV_ENTER(env, ip);
+ MUTEX_LOCK(env, mp->mtx_resize);
if (gbytesp != NULL)
*gbytesp = mp->gbytes;
if (bytesp != NULL)
*bytesp = mp->bytes;
if (ncachep != NULL)
*ncachep = (int)mp->nreg;
+ MUTEX_UNLOCK(env, mp->mtx_resize);
+ ENV_LEAVE(env, ip);
} else {
if (gbytesp != NULL)
*gbytesp = dbenv->mp_gbytes;
@@ -380,7 +385,7 @@ __memp_set_mp_max_write(dbenv, maxwrite, maxwrite_sleep)
env = dbenv->env;
ENV_NOT_CONFIGURED(env,
- env->mp_handle, "DB_ENV->get_mp_max_write", DB_INIT_MPOOL);
+ env->mp_handle, "DB_ENV->set_mp_max_write", DB_INIT_MPOOL);
if (MPOOL_ON(env)) {
dbmp = env->mp_handle;
@@ -448,7 +453,7 @@ __memp_set_mp_mmapsize(dbenv, mp_mmapsize)
env = dbenv->env;
ENV_NOT_CONFIGURED(env,
- env->mp_handle, "DB_ENV->set_mp_max_mmapsize", DB_INIT_MPOOL);
+ env->mp_handle, "DB_ENV->set_mp_mmapsize", DB_INIT_MPOOL);
if (MPOOL_ON(env)) {
dbmp = env->mp_handle;
@@ -512,7 +517,7 @@ __memp_set_mp_pagesize(dbenv, mp_pagesize)
env = dbenv->env;
ENV_NOT_CONFIGURED(env,
- env->mp_handle, "DB_ENV->get_mp_max_mmapsize", DB_INIT_MPOOL);
+ env->mp_handle, "DB_ENV->set_mp_pagesize", DB_INIT_MPOOL);
ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_mp_pagesize");
dbenv->mp_pagesize = mp_pagesize;
@@ -561,7 +566,7 @@ __memp_set_mp_tablesize(dbenv, mp_tablesize)
env = dbenv->env;
ENV_NOT_CONFIGURED(env,
- env->mp_handle, "DB_ENV->get_mp_max_mmapsize", DB_INIT_MPOOL);
+ env->mp_handle, "DB_ENV->set_mp_tablesize", DB_INIT_MPOOL);
ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_mp_tablesize");
dbenv->mp_tablesize = mp_tablesize;
@@ -583,7 +588,7 @@ __memp_get_mp_mtxcount(dbenv, mp_mtxcountp)
env = dbenv->env;
ENV_NOT_CONFIGURED(env,
- env->mp_handle, "DB_ENV->get_mp_max_mtxcount", DB_INIT_MPOOL);
+ env->mp_handle, "DB_ENV->get_mp_mtxcount", DB_INIT_MPOOL);
if (MPOOL_ON(env)) {
dbmp = env->mp_handle;
@@ -610,7 +615,7 @@ __memp_set_mp_mtxcount(dbenv, mp_mtxcount)
env = dbenv->env;
ENV_NOT_CONFIGURED(env,
- env->mp_handle, "DB_ENV->get_mp_max_mmapsize", DB_INIT_MPOOL);
+ env->mp_handle, "DB_ENV->set_mp_mtxcount", DB_INIT_MPOOL);
ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_mp_mtxcount");
dbenv->mp_mtxcount = mp_mtxcount;
@@ -870,7 +875,7 @@ __memp_ftruncate(dbmfp, txn, ip, pgno, flags)
!mfp->no_backing_file && pgno <= mfp->last_flushed_pgno)
#ifdef HAVE_FTRUNCATE
ret = __os_truncate(env,
- dbmfp->fhp, pgno, mfp->pagesize);
+ dbmfp->fhp, pgno, mfp->pagesize, 0);
#else
ret = __db_zero_extend(env,
dbmfp->fhp, pgno, mfp->last_pgno, mfp->pagesize);
diff --git a/src/mp/mp_mvcc.c b/src/mp/mp_mvcc.c
index 47531528..b51ae135 100644
--- a/src/mp/mp_mvcc.c
+++ b/src/mp/mp_mvcc.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2006, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2006, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -151,6 +151,11 @@ __memp_bh_freeze(dbmp, infop, hp, bhp, need_frozenp)
real_name = NULL;
fhp = NULL;
+ if (FLD_ISSET(env->dbenv->verbose, DB_VERB_MVCC))
+ __db_msg(env, "freeze %s %d @%lu/%lu", __memp_fns(dbmp, mfp),
+ bhp->pgno, (u_long)VISIBLE_LSN(env, bhp)->file,
+ (u_long)VISIBLE_LSN(env, bhp)->offset);
+
MVCC_MPROTECT(bhp->buf, pagesize, PROT_READ | PROT_WRITE);
MPOOL_REGION_LOCK(env, infop);
@@ -161,7 +166,7 @@ __memp_bh_freeze(dbmp, infop, hp, bhp, need_frozenp)
} else {
*need_frozenp = 1;
- /* There might be a small amount of unallocated space. */
+ /* There might be enough space for a single-item block. */
if (__env_alloc(infop,
sizeof(BH_FROZEN_ALLOC) + sizeof(BH_FROZEN_PAGE),
&frozen_alloc) == 0) {
@@ -405,6 +410,12 @@ __memp_bh_thaw(dbmp, infop, hp, frozen_bhp, alloc_bhp)
ret = 0;
real_name = NULL;
+ if (FLD_ISSET(env->dbenv->verbose, DB_VERB_MVCC))
+ __db_msg(env, "thaw %s %d @%lu/%lu", __memp_fns(dbmp, mfp),
+ frozen_bhp->pgno,
+ (u_long)VISIBLE_LSN(env, frozen_bhp)->file,
+ (u_long)VISIBLE_LSN(env, frozen_bhp)->offset);
+
MUTEX_REQUIRED(env, hp->mtx_hash);
DB_ASSERT(env, F_ISSET(frozen_bhp, BH_EXCLUSIVE) || alloc_bhp == NULL);
h_locked = 1;
@@ -414,7 +425,8 @@ __memp_bh_thaw(dbmp, infop, hp, frozen_bhp, alloc_bhp)
DB_ASSERT(env, alloc_bhp != NULL ||
SH_CHAIN_SINGLETON(frozen_bhp, vc) ||
(SH_CHAIN_HASNEXT(frozen_bhp, vc) &&
- BH_OBSOLETE(frozen_bhp, hp->old_reader, vlsn)));
+ BH_OBSOLETE(frozen_bhp, hp->old_reader, vlsn)) ||
+ F_ISSET(frozen_bhp, BH_UNREACHABLE));
DB_ASSERT(env, alloc_bhp == NULL || !F_ISSET(alloc_bhp, BH_FROZEN));
spgno = ((BH_FROZEN_PAGE *)frozen_bhp)->spgno;
@@ -516,7 +528,7 @@ __memp_bh_thaw(dbmp, infop, hp, frozen_bhp, alloc_bhp)
else {
maxpgno -= (db_pgno_t)ntrunc;
if ((ret = __os_truncate(env, fhp,
- maxpgno + 1, pagesize)) != 0)
+ maxpgno + 1, pagesize, 0)) != 0)
goto err;
/* Fix up the linked list */
diff --git a/src/mp/mp_region.c b/src/mp/mp_region.c
index 07134de7..ba836cf4 100644
--- a/src/mp/mp_region.c
+++ b/src/mp/mp_region.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -11,7 +11,7 @@
#include "db_int.h"
#include "dbinc/mp.h"
-static int __memp_init_config __P((ENV *, MPOOL *));
+static int __memp_init_config __P((ENV *, MPOOL *, int));
static void __memp_region_size __P((ENV *, roff_t *, u_int32_t *));
#define MPOOL_DEFAULT_PAGESIZE (4 * 1024)
@@ -34,7 +34,7 @@ __memp_open(env, create_ok)
roff_t cache_size, max_size, reg_size;
u_int i, max_nreg;
u_int32_t htab_buckets, *regids;
- int ret;
+ int create, ret;
dbenv = env->dbenv;
cache_size = 0;
@@ -77,7 +77,8 @@ __memp_open(env, create_ok)
* If we created the region, initialize it. Create or join any
* additional regions.
*/
- if (F_ISSET(&reginfo, REGION_CREATE)) {
+ create = F_ISSET(&reginfo, REGION_CREATE);
+ if (create) {
/*
* We define how many regions there are going to be, allocate
* the REGINFO structures and create them. Make sure we don't
@@ -167,23 +168,38 @@ __memp_open(env, create_ok)
env->mp_handle = dbmp;
/* A process joining the region may reset the mpool configuration. */
- if ((ret = __memp_init_config(env, mp)) != 0)
+ if ((ret = __memp_init_config(env, mp, create)) != 0)
return (ret);
return (0);
-err: env->mp_handle = NULL;
- if (dbmp->reginfo != NULL && dbmp->reginfo[0].addr != NULL) {
- for (i = 0; i < dbenv->mp_ncache; ++i)
+err: (void)__mutex_free(env, &dbmp->mutex);
+ (void)__memp_region_detach(env, dbmp);
+ return (ret);
+}
+
+/* __memp_region_detach
+ * Detach from any attached mempool regions.
+ *
+ * PUBLIC: int __memp_region_detach __P((ENV *, DB_MPOOL *));
+ */
+int
+__memp_region_detach(env, dbmp)
+ ENV *env;
+ DB_MPOOL *dbmp;
+{
+ u_int i;
+
+ if (dbmp != NULL &&
+ dbmp->reginfo != NULL && dbmp->reginfo[0].addr != NULL) {
+ for (i = 0; i < env->dbenv->mp_ncache; ++i)
if (dbmp->reginfo[i].id != INVALID_REGION_ID)
(void)__env_region_detach(
env, &dbmp->reginfo[i], 0);
__os_free(env, dbmp->reginfo);
}
-
- (void)__mutex_free(env, &dbmp->mutex);
- __os_free(env, dbmp);
- return (ret);
+ env->mp_handle = NULL;
+ return (0);
}
/*
@@ -207,7 +223,7 @@ __memp_init(env, dbmp, reginfo_off, htab_buckets, max_nreg)
MPOOL *mp, *main_mp;
REGINFO *infop;
db_mutex_t mtx_base, mtx_discard, mtx_prev;
- u_int32_t i;
+ u_int32_t i, mp_mtxcount;
int ret;
void *p;
@@ -224,6 +240,23 @@ __memp_init(env, dbmp, reginfo_off, htab_buckets, max_nreg)
__mutex_alloc(env, MTX_MPOOL_REGION, 0, &mp->mtx_region)) != 0)
return (ret);
+ /*
+	 * Initializing the first mpool region allocates the mpool region id
+ * array, file table and, if not ENV_PRIVATE, all the cache regions'
+ * hash bucket mutexes in a single contiguous block of mutex ids, which
+ * remain allocated when the cache is resized. The block is 'known' to
+ * start with the first id (mtx_base), and to end #regions * mp_mtxcount
+ * later. In private environments, mutex ids are not smallish integers,
+ * but __env_alloc()'d pointers. Since a range of (base, count) doesn't
+ * work for these likely-scattered mutexes, we allocate private threaded
+ * mutexes as they are needed. Private non-threaded caches don't need
+ * any mutexes at all.
+ */
+ if ((mp_mtxcount = dbenv->mp_mtxcount) == 0)
+ mp_mtxcount = dbenv->mp_mtxcount = htab_buckets;
+ if (!MUTEX_ON(env) ||
+ F_ISSET(env, ENV_PRIVATE | ENV_THREAD) == ENV_PRIVATE)
+ mp_mtxcount = dbenv->mp_mtxcount = 0;
if (reginfo_off == 0) {
ZERO_LSN(mp->lsn);
@@ -248,15 +281,10 @@ __memp_init(env, dbmp, reginfo_off, htab_buckets, max_nreg)
atomic_init(&htab[i].hash_page_dirty, 0);
}
- /*
- * Allocate all of the hash bucket mutexes up front. We do
- * this so that we don't need to free and reallocate mutexes as
- * the cache is resized.
- */
mtx_base = mtx_prev = MUTEX_INVALID;
- if (!MUTEX_ON(env) || F_ISSET(env, ENV_PRIVATE))
+ if (F_ISSET(env, ENV_PRIVATE))
goto no_prealloc;
- for (i = 0; i < mp->max_nreg * dbenv->mp_mtxcount; i++) {
+ for (i = 0; i < mp->max_nreg * mp_mtxcount; i++) {
if ((ret = __mutex_alloc(env, MTX_MPOOL_HASH_BUCKET,
DB_MUTEX_SHARED, &mtx_discard)) != 0)
return (ret);
@@ -274,13 +302,12 @@ __memp_init(env, dbmp, reginfo_off, htab_buckets, max_nreg)
}
/*
- * We preallocated all of the mutexes in a block, so for regions after
- * the first, we skip mutexes in use in earlier regions. Each region
- * has the same number of buckets
+ * If we preallocated all the mutexes, then in regions after the first,
+ * we skip mutexes in use in earlier regions. Each region has the same
+ * number of buckets.
*/
no_prealloc:
- if (MUTEX_ON(env))
- mtx_base += reginfo_off * dbenv->mp_mtxcount;
+ mtx_base += reginfo_off * mp_mtxcount;
/* Allocate hash table space and initialize it. */
if ((ret = __env_alloc(infop,
@@ -289,18 +316,21 @@ no_prealloc:
mp->htab = R_OFFSET(infop, htab);
for (i = 0; i < htab_buckets; i++) {
hp = &htab[i];
- if (!MUTEX_ON(env) || dbenv->mp_mtxcount == 0)
+ /*
+ * Set mtx_hash to do no locking, or share a mutex with an
+ * earlier hash bucket in this region, or assign it from the
+ * block of mutexes allocated above, or (in a private
+ * environment) allocate a new mutex.
+ */
+ if (mp_mtxcount == 0)
hp->mtx_hash = MUTEX_INVALID;
- else if (F_ISSET(env, ENV_PRIVATE)) {
- if (i >= dbenv->mp_mtxcount)
- hp->mtx_hash =
- htab[i % dbenv->mp_mtxcount].mtx_hash;
- else if
- ((ret = __mutex_alloc(env, MTX_MPOOL_HASH_BUCKET,
- DB_MUTEX_SHARED, &hp->mtx_hash)) != 0)
- return (ret);
- } else
- hp->mtx_hash = mtx_base + (i % dbenv->mp_mtxcount);
+ else if (i >= mp_mtxcount)
+ hp->mtx_hash = htab[i % mp_mtxcount].mtx_hash;
+ else if (!F_ISSET(env, ENV_PRIVATE))
+ hp->mtx_hash = mtx_base + i;
+ else if ((ret = __mutex_alloc(env, MTX_MPOOL_HASH_BUCKET,
+ DB_MUTEX_SHARED, &hp->mtx_hash)) != 0)
+ return (ret);
SH_TAILQ_INIT(&hp->hash_bucket);
atomic_init(&hp->hash_page_dirty, 0);
#ifdef HAVE_STATISTICS
@@ -311,7 +341,7 @@ no_prealloc:
ZERO_LSN(hp->old_reader);
}
mp->htab_buckets = htab_buckets;
- mp->htab_mutexes = dbenv->mp_mtxcount;
+ mp->htab_mutexes = mp_mtxcount;
mp->pagesize = dbenv->mp_pagesize == 0 ?
MPOOL_DEFAULT_PAGESIZE : dbenv->mp_pagesize;
@@ -443,11 +473,21 @@ __memp_region_mutex_count(env)
dbenv = env->dbenv;
__memp_region_size(env, &reg_size, &htab_buckets);
- if (F_ISSET(env->dbenv, DB_ENV_MULTIVERSION))
- pgsize = sizeof(BH_FROZEN_ALLOC) + sizeof(BH_FROZEN_PAGE);
- if ((pgsize = dbenv->mp_pagesize) == 0)
- pgsize = MPOOL_DEFAULT_PAGESIZE;
+ if (dbenv->mp_mtxcount != 0)
+ htab_buckets = dbenv->mp_mtxcount;
max_region = __memp_max_regions(env);
+ if ((pgsize = dbenv->mp_pagesize) == 0) {
+ /*
+ * If MVCC is on during environment creation, provide enough
+ * mutexes so that half the cache can be frozen buffer headers.
+ */
+ if (F_ISSET(env->dbenv, DB_ENV_MULTIVERSION))
+ pgsize = (MPOOL_DEFAULT_PAGESIZE +
+ sizeof(BH_FROZEN_ALLOC) +
+ sizeof(BH_FROZEN_PAGE)) / 2;
+ else
+ pgsize = MPOOL_DEFAULT_PAGESIZE;
+ }
/*
* We need a couple of mutexes for the region itself, one for each
@@ -456,10 +496,6 @@ __memp_region_mutex_count(env)
* hash bucket. We then need one mutex per page in the cache,
* the worst case is really big if the pages are 512 bytes.
*/
- if (dbenv->mp_mtxcount != 0)
- htab_buckets = dbenv->mp_mtxcount;
- else
- dbenv->mp_mtxcount = htab_buckets;
num_per_cache = htab_buckets + (u_int32_t)(reg_size / pgsize);
return ((max_region * num_per_cache) + 50 + MPOOL_FILE_BUCKETS);
}
@@ -469,23 +505,39 @@ __memp_region_mutex_count(env)
* Initialize shared configuration information.
*/
static int
-__memp_init_config(env, mp)
+__memp_init_config(env, mp, create)
ENV *env;
MPOOL *mp;
+ int create;
{
DB_ENV *dbenv;
dbenv = env->dbenv;
MPOOL_SYSTEM_LOCK(env);
- if (dbenv->mp_mmapsize != 0)
+ if (create) {
mp->mp_mmapsize = (db_size_t)dbenv->mp_mmapsize;
- if (dbenv->mp_maxopenfd != 0)
mp->mp_maxopenfd = dbenv->mp_maxopenfd;
- if (dbenv->mp_maxwrite != 0)
mp->mp_maxwrite = dbenv->mp_maxwrite;
- if (dbenv->mp_maxwrite_sleep != 0)
mp->mp_maxwrite_sleep = dbenv->mp_maxwrite_sleep;
+ } else {
+ if (dbenv->mp_mmapsize != 0 &&
+ mp->mp_mmapsize != (db_size_t)dbenv->mp_mmapsize)
+ __db_msg(env, DB_STR("3044",
+"Warning: Ignoring maximum memory map size when joining environment"));
+
+ if (dbenv->mp_maxopenfd != 0 &&
+ mp->mp_maxopenfd != dbenv->mp_maxopenfd)
+ __db_msg(env, DB_STR("3045",
+"Warning: Ignoring max open file descriptors value when joining environment"));
+
+ if ((dbenv->mp_maxwrite != 0 &&
+ mp->mp_maxwrite != dbenv->mp_maxwrite) ||
+ (dbenv->mp_maxwrite_sleep != 0 &&
+ mp->mp_maxwrite_sleep != dbenv->mp_maxwrite_sleep))
+ __db_msg(env, DB_STR("3046",
+"Warning: Ignoring maximum sequential writes value when joining environment"));
+ }
MPOOL_SYSTEM_UNLOCK(env);
return (0);
@@ -501,22 +553,18 @@ int
__memp_env_refresh(env)
ENV *env;
{
- BH *bhp;
- BH_FROZEN_ALLOC *frozen_alloc;
DB_MPOOL *dbmp;
DB_MPOOLFILE *dbmfp;
- DB_MPOOL_HASH *hp;
DB_MPREG *mpreg;
MPOOL *mp, *c_mp;
REGINFO *infop;
- u_int32_t bucket, i, nreg;
+ u_int32_t i, nreg;
int ret, t_ret;
ret = 0;
dbmp = env->mp_handle;
mp = dbmp->reginfo[0].primary;
nreg = mp->nreg;
- hp = R_ADDR(&dbmp->reginfo[0], mp->htab);
/*
* If a private region, return the memory to the heap. Not needed for
@@ -526,49 +574,20 @@ __memp_env_refresh(env)
if (!F_ISSET(env, ENV_PRIVATE))
goto not_priv;
- /* Discard buffers. */
for (i = 0; i < nreg; ++i) {
infop = &dbmp->reginfo[i];
- c_mp = infop->primary;
- for (hp = R_ADDR(infop, c_mp->htab), bucket = 0;
- bucket < c_mp->htab_buckets; ++hp, ++bucket) {
- while ((bhp = SH_TAILQ_FIRST(
- &hp->hash_bucket, __bh)) != NULL)
- if (F_ISSET(bhp, BH_FROZEN))
- SH_TAILQ_REMOVE(
- &hp->hash_bucket, bhp,
- hq, __bh);
- else {
- if (F_ISSET(bhp, BH_DIRTY)) {
- atomic_dec(env,
- &hp->hash_page_dirty);
- F_CLR(bhp,
- BH_DIRTY | BH_DIRTY_CREATE);
- }
- atomic_inc(env, &bhp->ref);
- if ((t_ret = __memp_bhfree(dbmp, infop,
- R_ADDR(dbmp->reginfo,
- bhp->mf_offset), hp, bhp,
- BH_FREE_FREEMEM |
- BH_FREE_UNLOCKED)) != 0 && ret == 0)
- ret = t_ret;
- }
- }
- MPOOL_REGION_LOCK(env, infop);
- while ((frozen_alloc = SH_TAILQ_FIRST(
- &c_mp->alloc_frozen, __bh_frozen_a)) != NULL) {
- SH_TAILQ_REMOVE(&c_mp->alloc_frozen, frozen_alloc,
- links, __bh_frozen_a);
- __env_alloc_free(infop, frozen_alloc);
- }
- MPOOL_REGION_UNLOCK(env, infop);
+ if ((t_ret = __memp_region_bhfree(infop)) != 0 && ret == 0)
+ ret = t_ret;
}
not_priv:
/* Discard DB_MPOOLFILEs. */
while ((dbmfp = TAILQ_FIRST(&dbmp->dbmfq)) != NULL)
- if ((t_ret = __memp_fclose(dbmfp, DB_FLUSH)) != 0 && ret == 0)
- ret = t_ret;
+ if ((t_ret = __memp_fclose(dbmfp, DB_FLUSH)) != 0) {
+ if (ret == 0)
+ ret = t_ret;
+ break;
+ }
/* Discard DB_MPREGs. */
if (dbmp->pg_inout != NULL)
@@ -618,3 +637,62 @@ not_priv:
env->mp_handle = NULL;
return (ret);
}
+
+/*
+ * __memp_region_bhfree --
+ * Discard the buffers for a region.
+ *
+ * PUBLIC: int __memp_region_bhfree __P((REGINFO *));
+ */
+int
+__memp_region_bhfree(infop)
+ REGINFO *infop;
+{
+ BH *bhp;
+ BH_FROZEN_ALLOC *frozen_alloc;
+ DB_MPOOL *dbmp;
+ DB_MPOOL_HASH *hp;
+ ENV *env;
+ MPOOL *c_mp;
+ u_int32_t bucket;
+ int ret, t_ret;
+
+ env = infop->env;
+ dbmp = env->mp_handle;
+ ret = 0;
+
+ /* Discard buffers. */
+ c_mp = infop->primary;
+ for (hp = R_ADDR(infop, c_mp->htab), bucket = 0;
+ bucket < c_mp->htab_buckets; ++hp, ++bucket) {
+ while ((bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)) != NULL)
+ if (F_ISSET(bhp, BH_FROZEN))
+ SH_TAILQ_REMOVE(&hp->hash_bucket,
+ bhp, hq, __bh);
+ else {
+ if (F_ISSET(bhp, BH_DIRTY)) {
+ atomic_dec(env, &hp->hash_page_dirty);
+ F_CLR(bhp, BH_DIRTY | BH_DIRTY_CREATE);
+ }
+ atomic_inc(env, &bhp->ref);
+ if ((t_ret = __memp_bhfree(dbmp, infop,
+ R_ADDR(dbmp->reginfo, bhp->mf_offset),
+ hp, bhp, BH_FREE_FREEMEM |
+ BH_FREE_UNLOCKED)) != 0) {
+ if (ret == 0)
+ ret = t_ret;
+ break;
+ }
+ }
+ }
+ MPOOL_REGION_LOCK(env, infop);
+ while ((frozen_alloc = SH_TAILQ_FIRST(
+ &c_mp->alloc_frozen, __bh_frozen_a)) != NULL) {
+ SH_TAILQ_REMOVE(&c_mp->alloc_frozen,
+ frozen_alloc, links, __bh_frozen_a);
+ __env_alloc_free(infop, frozen_alloc);
+ }
+ MPOOL_REGION_UNLOCK(env, infop);
+
+ return (ret);
+}
diff --git a/src/mp/mp_register.c b/src/mp/mp_register.c
index dc7015a7..cc59af9c 100644
--- a/src/mp/mp_register.c
+++ b/src/mp/mp_register.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
diff --git a/src/mp/mp_resize.c b/src/mp/mp_resize.c
index 97719554..932a1baa 100644
--- a/src/mp/mp_resize.c
+++ b/src/mp/mp_resize.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2006, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2006, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -126,12 +126,13 @@ __memp_merge_buckets(dbmp, new_nbuckets, old_bucket, new_bucket)
MPOOLFILE *mfp;
REGINFO *new_infop, *old_infop;
u_int32_t bucket, high_mask, new_region, old_region;
- int ret;
+ int expanding, ret;
env = dbmp->env;
mp = dbmp->reginfo[0].primary;
new_bhp = NULL;
ret = 0;
+ expanding = (mp->nbuckets > new_nbuckets) ? 0 : 1;
MP_MASK(new_nbuckets, high_mask);
@@ -150,36 +151,42 @@ __memp_merge_buckets(dbmp, new_nbuckets, old_bucket, new_bucket)
/*
* Before merging, we need to check that there are no old buffers left
* in the target hash bucket after a previous split.
+ * Only free the buffers if we are expanding into new buckets. If
+ * we are contracting, the buffers in the original (old) bucket should
+ * not be freed.
*/
free_old:
- MUTEX_LOCK(env, new_hp->mtx_hash);
- SH_TAILQ_FOREACH(bhp, &new_hp->hash_bucket, hq, __bh) {
- MP_BUCKET(bhp->mf_offset, bhp->pgno, mp->nbuckets, bucket);
+ if (expanding != 0) {
+ MUTEX_LOCK(env, new_hp->mtx_hash);
+ SH_TAILQ_FOREACH(bhp, &new_hp->hash_bucket, hq, __bh) {
+ MP_BUCKET(
+ bhp->mf_offset, bhp->pgno, mp->nbuckets, bucket);
+
+ if (bucket != new_bucket) {
+ /*
+ * There is no way that an old buffer can be
+ * locked after a split, since everyone will
+ * look for it in the new hash bucket.
+ */
+ DB_ASSERT(env, !F_ISSET(bhp, BH_DIRTY) &&
+ atomic_read(&bhp->ref) == 0);
+ atomic_inc(env, &bhp->ref);
+ mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
+ if ((ret = __memp_bhfree(dbmp, new_infop,
+ mfp, new_hp, bhp, BH_FREE_FREEMEM)) != 0) {
+ MUTEX_UNLOCK(env, new_hp->mtx_hash);
+ return (ret);
+ }
- if (bucket != new_bucket) {
- /*
- * There is no way that an old buffer can be locked
- * after a split, since everyone will look for it in
- * the new hash bucket.
- */
- DB_ASSERT(env, !F_ISSET(bhp, BH_DIRTY) &&
- atomic_read(&bhp->ref) == 0);
- atomic_inc(env, &bhp->ref);
- mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
- if ((ret = __memp_bhfree(dbmp, new_infop,
- mfp, new_hp, bhp, BH_FREE_FREEMEM)) != 0) {
- MUTEX_UNLOCK(env, new_hp->mtx_hash);
- return (ret);
+ /*
+ * The free has modified the list of buffers and
+ * dropped the mutex. We need to start again.
+ */
+ goto free_old;
}
-
- /*
- * The free has modified the list of buffers and
- * dropped the mutex. We need to start again.
- */
- goto free_old;
}
+ MUTEX_UNLOCK(env, new_hp->mtx_hash);
}
- MUTEX_UNLOCK(env, new_hp->mtx_hash);
/*
* Before we begin, make sure that all of the buffers we care about are
@@ -305,7 +312,9 @@ err: atomic_dec(env, &bhp->ref);
next_bhp, alloc_bhp, vc, __bh);
}
- DB_ASSERT(env, new_hp->mtx_hash != old_hp->mtx_hash);
+ /* The mutexes must be different, unless they aren't in use. */
+ DB_ASSERT(env, new_hp->mtx_hash != old_hp->mtx_hash ||
+ new_hp->mtx_hash == MUTEX_INVALID);
MUTEX_LOCK(env, new_hp->mtx_hash);
SH_TAILQ_INSERT_TAIL(&new_hp->hash_bucket, new_bhp, hq);
if (F_ISSET(new_bhp, BH_DIRTY))
@@ -362,16 +371,15 @@ __memp_add_region(dbmp)
MPOOL *mp;
REGINFO *infop;
int ret;
- roff_t cache_size, reg_size;
+ roff_t reg_size;
u_int i;
u_int32_t *regids;
env = dbmp->env;
mp = dbmp->reginfo[0].primary;
- cache_size = (roff_t)mp->gbytes * GIGABYTE + mp->bytes;
/* All cache regions are the same size. */
- reg_size = dbmp->reginfo[0].rp->size;
+ reg_size = dbmp->reginfo[0].rp->max;
ret = 0;
infop = &dbmp->reginfo[mp->nreg];
@@ -384,9 +392,6 @@ __memp_add_region(dbmp)
if ((ret = __memp_init(env,
dbmp, mp->nreg, mp->htab_buckets, mp->max_nreg)) != 0)
return (ret);
- cache_size += reg_size;
- mp->gbytes = (u_int32_t)(cache_size / GIGABYTE);
- mp->bytes = (u_int32_t)(cache_size % GIGABYTE);
regids = R_ADDR(dbmp->reginfo, mp->regids);
regids[mp->nreg++] = infop->id;
@@ -425,16 +430,13 @@ __memp_remove_region(dbmp)
{
DB_MPOOL_HASH *hp;
ENV *env;
- MPOOL *mp;
+ MPOOL *mp, *c_mp;
REGINFO *infop;
int ret;
- roff_t cache_size, reg_size;
u_int i;
env = dbmp->env;
mp = dbmp->reginfo[0].primary;
- reg_size = dbmp->reginfo[0].rp->size;
- cache_size = (roff_t)mp->gbytes * GIGABYTE + mp->bytes;
ret = 0;
if (mp->nreg == 1) {
@@ -448,21 +450,36 @@ __memp_remove_region(dbmp)
return (ret);
/* Detach from the region then destroy it. */
- infop = &dbmp->reginfo[mp->nreg];
+ infop = &dbmp->reginfo[mp->nreg - 1];
+ c_mp = infop->primary;
+ hp = R_ADDR(infop, c_mp->htab);
+ /*
+	 * For a private environment, we need to free everything, and
+ * for non-private environment, we need to refresh the mutexes
+ * so that they can be in a ready state for later resize.
+ */
if (F_ISSET(env, ENV_PRIVATE)) {
- hp = R_ADDR(infop, ((MPOOL*)infop->primary)->htab);
- for (i = 0; i < env->dbenv->mp_mtxcount; i++)
- if ((ret = __mutex_free(env, &hp[i].mtx_hash)) != 0)
+ if ((ret = __memp_region_bhfree(infop)) != 0)
+ return (ret);
+ if (MUTEX_ON(env)) {
+ DB_ASSERT(env,
+ env->dbenv->mp_mtxcount == mp->htab_mutexes);
+ for (i = 0; i < mp->htab_mutexes; i++)
+ if ((ret = __mutex_free(env,
+ &hp[i].mtx_hash)) != 0)
+ return (ret);
+ }
+ __env_alloc_free(infop, hp);
+ } else if (MUTEX_ON(env)) {
+ DB_ASSERT(env, env->dbenv->mp_mtxcount == mp->htab_mutexes);
+ for (i = 0; i < mp->htab_mutexes; i++)
+ if ((ret = __mutex_refresh(env, hp[i].mtx_hash)) != 0)
return (ret);
}
ret = __env_region_detach(env, infop, 1);
- if (ret == 0) {
+ if (ret == 0)
mp->nreg--;
- cache_size -= reg_size;
- mp->gbytes = (u_int32_t)(cache_size / GIGABYTE);
- mp->bytes = (u_int32_t)(cache_size % GIGABYTE);
- }
return (ret);
}
@@ -511,6 +528,9 @@ __memp_map_regions(dbmp)
}
/*
+ * __memp_resize --
+ * Change the overall cache size by adding or removing cache regions.
+ *
* PUBLIC: int __memp_resize __P((DB_MPOOL *, u_int32_t, u_int32_t));
*/
int
@@ -526,7 +546,7 @@ __memp_resize(dbmp, gbytes, bytes)
env = dbmp->env;
mp = dbmp->reginfo[0].primary;
- reg_size = dbmp->reginfo[0].rp->size;
+ reg_size = dbmp->reginfo[0].rp->max;
total_size = (roff_t)gbytes * GIGABYTE + bytes;
ncache = (u_int32_t)((total_size + reg_size / 2) / reg_size);
@@ -546,6 +566,9 @@ __memp_resize(dbmp, gbytes, bytes)
__memp_add_region(dbmp) :
__memp_remove_region(dbmp))) != 0)
break;
+ total_size = reg_size * (roff_t)mp->nreg;
+ mp->gbytes = (u_int32_t)(total_size / GIGABYTE);
+ mp->bytes = (u_int32_t)(total_size % GIGABYTE);
MUTEX_UNLOCK(env, mp->mtx_resize);
return (ret);
@@ -567,13 +590,13 @@ __memp_get_cache_max(dbenv, max_gbytesp, max_bytesp)
env = dbenv->env;
ENV_NOT_CONFIGURED(env,
- env->mp_handle, "DB_ENV->get_mp_max_ncache", DB_INIT_MPOOL);
+ env->mp_handle, "DB_ENV->get_cache_max", DB_INIT_MPOOL);
if (MPOOL_ON(env)) {
/* Cannot be set after open, no lock required to read. */
dbmp = env->mp_handle;
mp = dbmp->reginfo[0].primary;
- reg_size = dbmp->reginfo[0].rp->size;
+ reg_size = dbmp->reginfo[0].rp->max;
max_size = mp->max_nreg * reg_size;
*max_gbytesp = (u_int32_t)(max_size / GIGABYTE);
*max_bytesp = (u_int32_t)(max_size % GIGABYTE);
diff --git a/src/mp/mp_stat.c b/src/mp/mp_stat.c
index 246b44d7..81ea35c1 100644
--- a/src/mp/mp_stat.c
+++ b/src/mp/mp_stat.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -133,7 +133,14 @@ __memp_stat(env, gspp, fspp, flags)
sp->st_ro_evict += c_mp->stat.st_ro_evict;
sp->st_rw_evict += c_mp->stat.st_rw_evict;
sp->st_page_trickle += c_mp->stat.st_page_trickle;
+ sp->st_mvcc_reused += c_mp->stat.st_mvcc_reused;
sp->st_pages += c_mp->pages;
+ /* Undocumented field used by tests only. */
+ sp->st_oddfsize_detect +=
+ c_mp->stat.st_oddfsize_detect;
+ /* Undocumented field used by tests only. */
+ sp->st_oddfsize_resolve +=
+ c_mp->stat.st_oddfsize_resolve;
/*
* st_page_dirty calculated by __memp_stat_hash
* st_page_clean calculated here
@@ -195,7 +202,12 @@ __memp_stat(env, gspp, fspp, flags)
/* Count the MPOOLFILE structures. */
i = 0;
- len = 0;
+ /*
+ * Allow space for the first __memp_get_files() to align the
+ * structure array to uintmax_t, DB_MPOOL_STAT's most
+ * restrictive field. [#23150]
+ */
+ len = sizeof(uintmax_t);
if ((ret = __memp_walk_files(env,
mp, __memp_count_files, &len, &i, flags)) != 0)
return (ret);
@@ -252,6 +264,11 @@ __memp_file_stats(env, mfp, argp, countp, flags)
return (0);
}
+/*
+ * __memp_count_files --
+ * This __memp_walk_files() iterator counts the number of files as well as
+ * the space needed for their statistics, including file names.
+ */
static int
__memp_count_files(env, mfp, argp, countp, flags)
ENV *env;
@@ -277,13 +294,25 @@ __memp_count_files(env, mfp, argp, countp, flags)
/*
* __memp_get_files --
- * get file specific statistics
+ * get another file's specific statistics
*
- * Build each individual entry. We assume that an array of pointers are
- * aligned correctly to be followed by an array of structures, which should
- * be safe (in this particular case, the first element of the structure
- * is a pointer, so we're doubly safe). The array is followed by space
- * for the text file names.
+ * Add a file statistics entry to the current list. The chunk of memory
+ * starts with an array of DB_MPOOL_FSTAT pointers, a null pointer to mark
+ * the last one, then an aligned array of DB_MPOOL_FSTAT structures, then
+ * character space for the file names.
+ * +-----------------------------------------------+
+ * | count * DB_MPOOL_FSTAT pointers |
+ * +-----------------------------------------------+
+ *	| null pointer					   |
+ *	+-----------------------------------------------+
+ * | [space for aligning DB_MPOOL_FSTAT array] |
+ * +-----------------------------------------------+
+ * | count * DB_MPOOL_FSTAT structs |
+ * +-----------------------------------------------+
+ * | first file name | second file name | third... |
+ * +-----------------------------------------------+
+ * | file name | ... |
+ * +-----------------------------------------------+
*/
static int
__memp_get_files(env, mfp, argp, countp, flags)
@@ -305,11 +334,21 @@ __memp_get_files(env, mfp, argp, countp, flags)
tfsp = *(DB_MPOOL_FSTAT ***)argp;
if (*tfsp == NULL) {
- /* Add 1 to count because we need to skip over the NULL. */
- tstruct = (DB_MPOOL_FSTAT *)(tfsp + *countp + 1);
- tname = (char *)(tstruct + *countp);
+ /*
+ * Add 1 to count to skip over the NULL end marker.
+ * Align it further for DB_MPOOL_STAT's most restrictive field
+ * because uintmax_t might require stricter alignment than
+ * pointers; e.g., IP32 LL64 SPARC. [#23150]
+ */
+ tstruct = (DB_MPOOL_FSTAT *)&tfsp[*countp + 1];
+ tstruct = ALIGNP_INC(tstruct, sizeof(uintmax_t));
+ tname = (char *)&tstruct[*countp];
*tfsp = tstruct;
} else {
+ /*
+ * This stat struct follows the previous one; the file name
+ * follows the previous entry's filename.
+ */
tstruct = *tfsp + 1;
tname = (*tfsp)->file_name + strlen((*tfsp)->file_name) + 1;
*++tfsp = tstruct;
@@ -486,6 +525,8 @@ __memp_print_stats(env, flags)
(u_long)gsp->st_mvcc_thawed);
__db_dl(env, "The number of frozen buffers freed",
(u_long)gsp->st_mvcc_freed);
+ __db_dl(env, "The number of outdated intermediate versions reused",
+ (u_long)gsp->st_mvcc_reused);
__db_dl(env, "The number of page allocations", (u_long)gsp->st_alloc);
__db_dl(env,
"The number of hash buckets examined during allocations",
@@ -744,11 +785,18 @@ __memp_print_hash(env, dbmp, reginfo, fmap, flags)
vbhp != NULL;
vbhp = SH_CHAIN_PREV(vbhp, vc, __bh)) {
__memp_print_bh(env, dbmp,
- " next:\t", vbhp, fmap);
+ " prev:\t", vbhp, fmap);
}
}
MUTEX_UNLOCK(env, hp->mtx_hash);
}
+#ifdef DIAGNOSTIC
+ SH_TAILQ_FOREACH(bhp, &c_mp->free_frozen, hq, __bh) {
+ __db_msg(env, "free frozen %lu pgno %lu mtx_buf %lu",
+ (u_long)R_OFFSET(dbmp->reginfo, bhp),
+ (u_long)bhp->pgno, (u_long)bhp->mtx_buf);
+ }
+#endif
return (0);
}
@@ -775,6 +823,7 @@ __memp_print_bh(env, dbmp, prefix, bhp, fmap)
{ BH_FROZEN, "frozen" },
{ BH_TRASH, "trash" },
{ BH_THAWED, "thawed" },
+ { BH_UNREACHABLE, "unreachable" },
{ 0, NULL }
};
DB_MSGBUF mb;
diff --git a/src/mp/mp_sync.c b/src/mp/mp_sync.c
index fa06b1d4..82d5c8de 100644
--- a/src/mp/mp_sync.c
+++ b/src/mp/mp_sync.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -95,9 +95,11 @@ __memp_discard_all_mpfs (env, mp)
while ((mfp = SH_TAILQ_FIRST(
&hp->hash_bucket, __mpoolfile)) != NULL) {
MUTEX_LOCK(env, mfp->mutex);
- if ((t_ret = __memp_mf_discard(dbmp, mfp, 1)) != 0 &&
- ret == 0)
- ret = t_ret;
+ if ((t_ret = __memp_mf_discard(dbmp, mfp, 1)) != 0) {
+ if (ret == 0)
+ ret = t_ret;
+ break;
+ }
}
MUTEX_UNLOCK(env, hp->mtx_hash);
}
@@ -837,6 +839,7 @@ __memp_mf_sync(dbmp, mfp, locked)
MPOOLFILE *mfp;
int locked;
{
+ APPNAME appname;
DB_FH *fhp;
DB_MPOOL_HASH *hp;
ENV *env;
@@ -846,6 +849,7 @@ __memp_mf_sync(dbmp, mfp, locked)
COMPQUIET(hp, NULL);
env = dbmp->env;
+ appname = DB_APP_DATA;
/*
* We need to be holding the hash lock: we're using the path name
@@ -859,13 +863,20 @@ __memp_mf_sync(dbmp, mfp, locked)
MUTEX_LOCK(env, hp->mtx_hash);
}
- if ((ret = __db_appname(env, DB_APP_DATA,
+mpsync: if ((ret = __db_appname(env, appname,
R_ADDR(dbmp->reginfo, mfp->path_off), NULL, &rpath)) == 0) {
if ((ret = __os_open(env, rpath, 0, 0, 0, &fhp)) == 0) {
ret = __os_fsync(env, fhp);
if ((t_ret =
__os_closehandle(env, fhp)) != 0 && ret == 0)
ret = t_ret;
+ } else {
+ /* We may be syncing the blob meta db. */
+ if (appname != DB_APP_BLOB) {
+ __os_free(env, rpath);
+ appname = DB_APP_BLOB;
+ goto mpsync;
+ }
}
__os_free(env, rpath);
}
diff --git a/src/mp/mp_trickle.c b/src/mp/mp_trickle.c
index fba528b3..ff8cb875 100644
--- a/src/mp/mp_trickle.c
+++ b/src/mp/mp_trickle.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/