diff options
Diffstat (limited to 'src/mp')
-rw-r--r-- | src/mp/mp_alloc.c | 320 | ||||
-rw-r--r-- | src/mp/mp_backup.c | 7 | ||||
-rw-r--r-- | src/mp/mp_bh.c | 14 | ||||
-rw-r--r-- | src/mp/mp_fget.c | 170 | ||||
-rw-r--r-- | src/mp/mp_fmethod.c | 58 | ||||
-rw-r--r-- | src/mp/mp_fopen.c | 79 | ||||
-rw-r--r-- | src/mp/mp_fput.c | 5 | ||||
-rw-r--r-- | src/mp/mp_fset.c | 2 | ||||
-rw-r--r-- | src/mp/mp_method.c | 21 | ||||
-rw-r--r-- | src/mp/mp_mvcc.c | 20 | ||||
-rw-r--r-- | src/mp/mp_region.c | 260 | ||||
-rw-r--r-- | src/mp/mp_register.c | 2 | ||||
-rw-r--r-- | src/mp/mp_resize.c | 121 | ||||
-rw-r--r-- | src/mp/mp_stat.c | 73 | ||||
-rw-r--r-- | src/mp/mp_sync.c | 21 | ||||
-rw-r--r-- | src/mp/mp_trickle.c | 2 |
16 files changed, 858 insertions, 317 deletions
diff --git a/src/mp/mp_alloc.c b/src/mp/mp_alloc.c index dc331215..011f54c6 100644 --- a/src/mp/mp_alloc.c +++ b/src/mp/mp_alloc.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -22,8 +22,112 @@ #endif /* + * __memp_bh_unreachable -- + * + * Determine whether this buffer can not ever be seen again: is the next + * newer version visible to the same transaction which sees this one? + * If both versions are visible to the same transaction, there is no + * reason to keep the older one: it can be purged. + * + * If this buffer has a more recent version, and there is a transaction + * with a read_lsn between this buffer's and that more recent version's, + * the buffer is visible to at least that transaction, so return FALSE. + * Otherwise return TRUE. + * + * txns: 3/10 2/10 2/5 2/1 1/10 + * vers: 3/15 2/15 2/14 2/10 2/8 1/150 + * vis vis unreach vis unreach vis + * who new txns 3/10 2/10 2/5, 2/1 + * sees + * + * Note: in the above example, the page was allocated after txn 1/10 + * started. 1/10 would not see any version of the page. + * + * PUBLIC: int __memp_bh_unreachable __P((ENV *, BH *, DB_LSN *, int)); + */ +int +__memp_bh_unreachable(env, bhp, snapshots, n_snapshots) + ENV *env; + BH *bhp; + DB_LSN *snapshots; + int n_snapshots; +{ + BH *newer_bhp; + DB_LSN b_vlsn, n_vlsn; + int i, ret; +#ifdef DIAGNOSTIC + DB_MPOOL *dbmp; + DB_MSGBUF mb; + MPOOLFILE *bh_mfp; +#endif + + /* + * The buffer can't be purged if it is being used, or is the most recent + * version, or the next newer version isn't a copy yet. + */ + if (BH_REFCOUNT(bhp) != 0 || + (newer_bhp = SH_CHAIN_NEXT(bhp, vc, __bh)) == NULL || + newer_bhp->td_off == INVALID_ROFF) + return (FALSE); + + /* + * Find the visibility LSNs for this buffer (b_vlsn) and the more recent, + * newer buffer (n_vlsn). 
If the newer version hasn't committed yet the + * bhp could be needed. + */ + n_vlsn = *VISIBLE_LSN(env, newer_bhp); + if (IS_MAX_LSN(n_vlsn)) + return (FALSE); + if (bhp->td_off == INVALID_ROFF) + INIT_LSN(b_vlsn); + else + b_vlsn = *VISIBLE_LSN(env, bhp); + + ret = TRUE; + /* + * Look for a transaction which is between n_lsn and b_lsn - determining + * that bhp is reachable. Stop looking once the transactions get so + * small (old) that they precede the buffer's version; no earlier txn + * could be between n_vlsn and b_vlsn. + */ + for (i = 0; + i < n_snapshots && LOG_COMPARE(&snapshots[i], &b_vlsn) >= 0; + i++) { + if (LOG_COMPARE(&snapshots[i], &n_vlsn) < 0) { + /* + * This txn can see (started after) bhp, but not + * newer_bhp (which committed after this txn started). + */ + ret = FALSE; + break; + } + } + +#ifdef DIAGNOSTIC + if (FLD_ISSET(env->dbenv->verbose, DB_VERB_MVCC)) { + dbmp = env->mp_handle; + bh_mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset); + DB_MSGBUF_INIT(&mb); + __db_msgadd(env, &mb, + "bh_unreachable %s pgno %d %s %lu/%lu %x newer %lu/%lu txn #%d in\n", + __memp_fns(dbmp, bh_mfp), bhp->pgno, + ret ? "purgeable" : "needed", + (u_long)b_vlsn.file, (u_long)b_vlsn.offset, bhp->flags, + (u_long)n_vlsn.file, (u_long)n_vlsn.offset, i); + for (i = 0; i != n_snapshots; i++) + __db_msgadd(env, &mb, " %lu/%lu", + (u_long)snapshots[i].file, + (u_long)snapshots[i].offset); + DB_MSGBUF_FLUSH(env, &mb); + } +#endif + return (ret); +} + +/* * __memp_alloc -- - * Allocate some space from a cache region. + * Allocate some space from a cache region. If the region is full then + * reuse one or more cache buffers. 
* * PUBLIC: int __memp_alloc __P((DB_MPOOL *, * PUBLIC: REGINFO *, MPOOLFILE *, size_t, roff_t *, void *)); @@ -39,7 +143,7 @@ __memp_alloc(dbmp, infop, mfp, len, offsetp, retp) { BH *bhp, *current_bhp, *mvcc_bhp, *oldest_bhp; BH_FROZEN_PAGE *frozen_bhp; - DB_LSN oldest_reader, vlsn; + DB_LSN *snapshots, vlsn; DB_MPOOL_HASH *dbht, *hp, *hp_end, *hp_saved, *hp_tmp; ENV *env; MPOOL *c_mp; @@ -49,7 +153,7 @@ __memp_alloc(dbmp, infop, mfp, len, offsetp, retp) u_int32_t dirty_eviction, high_priority, priority, versions; u_int32_t priority_saved, put_counter, lru_generation, total_buckets; int aggressive, alloc_freeze, b_lock, giveup; - int h_locked, need_free, obsolete, ret, write_error; + int h_locked, need_free, n_snapshots, obsolete, ret, write_error; u_int8_t *endp; void *p; @@ -58,11 +162,10 @@ __memp_alloc(dbmp, infop, mfp, len, offsetp, retp) dbht = R_ADDR(infop, c_mp->htab); hp_end = &dbht[c_mp->htab_buckets]; hp_saved = NULL; - priority_saved = 0; - write_error = 0; - + snapshots = NULL; + priority_saved = write_error = 0; buckets = buffers = put_counter = total_buckets = versions = 0; - aggressive = alloc_freeze = giveup = h_locked = 0; + aggressive = alloc_freeze = giveup = h_locked = n_snapshots = 0; /* * If we're allocating a buffer, and the one we're discarding is the @@ -138,13 +241,15 @@ found: if (offsetp != NULL) c_mp->stat.st_alloc_pages, buffers, infop->id); } #endif - return (0); + goto done; } else if (giveup || c_mp->pages == 0) { MPOOL_REGION_UNLOCK(env, infop); __db_errx(env, DB_STR("3017", "unable to allocate space from the buffer cache")); - return ((ret == ENOMEM && write_error != 0) ? 
EIO : ret); + if (ret == ENOMEM && write_error != 0) + ret = EIO; + goto done; } search: @@ -158,7 +263,6 @@ search: lru_generation = c_mp->lru_generation; ret = 0; - MAX_LSN(oldest_reader); /* * We re-attempt the allocation every time we've freed 3 times what @@ -222,6 +326,13 @@ search: goto alloc; MPOOL_REGION_UNLOCK(env, infop); + /* Refresh the list of mvcc reader transactions. */ + if (snapshots != NULL) + __os_free(env, snapshots); + if ((ret = __txn_get_readers( + env, &snapshots, &n_snapshots)) != 0) + goto err; + aggressive++; /* * Once aggressive, we consider all buffers. By setting @@ -266,13 +377,6 @@ search: if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL) continue; - /* Set aggressive if we have already searched for too long. */ - if (aggressive == 0 && buckets >= MPOOL_ALLOC_SEARCH_LIMIT) { - aggressive = 1; - /* Once aggressive, we consider all buffers. */ - high_priority = MPOOL_LRU_MAX; - } - /* Unlock the region and lock the hash bucket. */ MPOOL_REGION_UNLOCK(env, infop); MUTEX_READLOCK(env, hp->mtx_hash); @@ -280,29 +384,45 @@ search: b_lock = 0; /* + * Set aggressive to consider all buffers if we have already + * searched in too many buckets. + */ + if (buckets > MPOOL_ALLOC_SEARCH_LIMIT && aggressive == 0) { + aggressive = 1; + /* Once aggressive, we consider all buffers. */ + high_priority = MPOOL_LRU_MAX; + if (snapshots == NULL && (ret = __txn_get_readers( + env, &snapshots, &n_snapshots)) != 0) + goto err; + } + + /* * Find a buffer we can use. + * Skip over refcount > 0 buffers; we can't get rid of them. * - * We use the lowest-LRU singleton buffer if we find one and - * it's better than the result of another hash bucket we've + * Without MVCC we use the lowest-LRU singleton buffer we find + * that's better than the result of another hash bucket we've * reviewed. We do not use a buffer which has a priority * greater than high_priority unless we are being aggressive. 
* - * With MVCC buffers, the situation is more complicated: we - * don't want to free a buffer out of the middle of an MVCC - * chain, since that requires I/O. So, walk the buffers, - * looking for an obsolete buffer at the end of an MVCC chain. - * Once a buffer becomes obsolete, its LRU priority is - * irrelevant because that version can never be accessed again. + * MVCC requires looking at additional factors: we don't want to + * free a still-relevant buffer out of the middle of an MVCC + * chain, since that requires freezing - lots of I/O. So, + * walk the buffers, looking for an obsolete buffer at the + * end of the MVCC chain. Once a buffer becomes obsolete, its + * LRU priority is irrelevant because that version can never + * be accessed again. * * If we don't find any obsolete MVCC buffers, we will get * aggressive, and in that case consider the lowest priority * buffer within a chain. - * - * Ignore referenced buffers, we can't get rid of them. */ retry_search: bhp = NULL; bucket_priority = high_priority; obsolete = 0; + if (n_snapshots > 0 && LOG_COMPARE(&snapshots[n_snapshots - 1], + &hp->old_reader) > 0) + hp->old_reader = snapshots[n_snapshots - 1]; SH_TAILQ_FOREACH(current_bhp, &hp->hash_bucket, hq, __bh) { /* * First, do the standard LRU check for singletons. 
@@ -340,55 +460,63 @@ retry_search: bhp = NULL; mvcc_bhp != NULL; oldest_bhp = mvcc_bhp, mvcc_bhp = SH_CHAIN_PREV(mvcc_bhp, vc, __bh)) { + DB_ASSERT(env, mvcc_bhp != + SH_CHAIN_PREV(mvcc_bhp, vc, __bh)); #ifdef MPOOL_ALLOC_SEARCH_DYN if (aggressive == 0 && - ++high_priority >= c_mp->lru_priority) + ++high_priority >= c_mp->lru_priority) { aggressive = 1; + if (snapshots == NULL && (ret = + __txn_readers(env, + &snapshots, &n_snapshots)) != 0) + goto err; + } #endif - DB_ASSERT(env, mvcc_bhp != - SH_CHAIN_PREV(mvcc_bhp, vc, __bh)); - if ((aggressive < 2 && - ++versions < (buffers >> 2)) || - BH_REFCOUNT(mvcc_bhp) != 0) + if (n_snapshots > 0 && + __memp_bh_unreachable(env, + mvcc_bhp, snapshots, n_snapshots)) { + oldest_bhp = mvcc_bhp; + goto is_obsolete; + } + if (bhp != NULL && + mvcc_bhp->priority >= bhp->priority) + continue; + if (BH_REFCOUNT(mvcc_bhp) != 0) + continue; + /* + * Since taking still-relevant versions requires + * freezing, skip over them at low aggression + * levels unless we see that a high proportion + * of buffers (over 1/4) are MVCC copies. + */ + if (aggressive < 2 && + ++versions < (buffers >> 2)) continue; buffers++; - if (!F_ISSET(mvcc_bhp, BH_FROZEN) && - (bhp == NULL || - bhp->priority > mvcc_bhp->priority)) { - if (bhp != NULL) - atomic_dec(env, &bhp->ref); - bhp = mvcc_bhp; - atomic_inc(env, &bhp->ref); - } + if (F_ISSET(mvcc_bhp, BH_FROZEN)) + continue; + /* + * Select mvcc_bhp as current best candidate, + * releasing the current candidate, if any. + */ + if (bhp != NULL) + atomic_dec(env, &bhp->ref); + bhp = mvcc_bhp; + atomic_inc(env, &bhp->ref); } /* * oldest_bhp is the last buffer on the MVCC chain, and * an obsolete buffer at the end of the MVCC chain gets - * used without further search. Before checking for - * obsolescence, update the cached oldest reader LSN in - * the bucket if it is older than call's oldest_reader. + * used without further search. 
*/ if (BH_REFCOUNT(oldest_bhp) != 0) continue; - if (LOG_COMPARE(&oldest_reader, &hp->old_reader) > 0) { - if (IS_MAX_LSN(oldest_reader) && - (ret = __txn_oldest_reader( - env, &oldest_reader)) != 0) { - MUTEX_UNLOCK(env, hp->mtx_hash); - if (bhp != NULL) - atomic_dec(env, &bhp->ref); - return (ret); - } - if (LOG_COMPARE(&oldest_reader, - &hp->old_reader) > 0) - hp->old_reader = oldest_reader; - } - if (BH_OBSOLETE(oldest_bhp, hp->old_reader, vlsn)) { if (aggressive < 2) buffers++; +is_obsolete: obsolete = 1; if (bhp != NULL) atomic_dec(env, &bhp->ref); @@ -410,10 +538,18 @@ retry_search: bhp = NULL; /* * Compare two hash buckets and select the one with the lower - * priority. Performance testing showed looking at two improves - * the LRU-ness and looking at more only does a little better. + * priority, except mvcc at high aggression levels. Performance + * testing shows looking at two improves the LRU-ness and + * looking at more only does a little better. */ if (hp_saved == NULL) { + /* + * At high aggressive levels when mvcc is active, stop + * looking for candidate once one has been found. + * Freezing takes more time than writing out to a db. + */ + if (aggressive > 1 && n_snapshots > 1) + goto this_buffer; hp_saved = hp; priority_saved = priority; goto next_hb; @@ -487,11 +623,15 @@ this_buffer: /* /* We cannot block as the caller is probably holding locks. */ if ((ret = MUTEX_TRYLOCK(env, bhp->mtx_buf)) != 0) { - if (ret != DB_LOCK_NOTGRANTED) - return (ret); + if (ret != DB_LOCK_NOTGRANTED) { + goto err; + } + ret = 0; goto next_hb; } F_SET(bhp, BH_EXCLUSIVE); + if (obsolete) + F_SET(bhp, BH_UNREACHABLE); b_lock = 1; /* Someone may have grabbed it while we got the lock. 
*/ @@ -557,7 +697,7 @@ this_buffer: /* F_CLR(bhp, BH_EXCLUSIVE); MUTEX_UNLOCK(env, bhp->mtx_buf); DB_ASSERT(env, !h_locked); - return (ret); + goto err; } } @@ -573,16 +713,25 @@ this_buffer: /* if (BH_REFCOUNT(bhp) != 1 || F_ISSET(bhp, BH_DIRTY) || (SH_CHAIN_HASNEXT(bhp, vc) && SH_CHAIN_NEXTP(bhp, vc, __bh)->td_off != bhp->td_off && - !BH_OBSOLETE(bhp, hp->old_reader, vlsn))) + !(obsolete || BH_OBSOLETE(bhp, hp->old_reader, vlsn)))) { + if (FLD_ISSET(env->dbenv->verbose, DB_VERB_MVCC)) + __db_msg(env, + "memp_alloc next_hb past bhp %lx flags %x ref %d %lx/%lx", + (u_long)R_OFFSET(infop, bhp), bhp->flags, + BH_REFCOUNT(bhp), + (u_long)R_OFFSET(infop, SH_CHAIN_NEXTP(bhp, vc, __bh)), + (u_long)R_OFFSET(infop, SH_CHAIN_PREVP(bhp, vc, __bh))); goto next_hb; + } /* * If the buffer is frozen, thaw it and look for another one - * we can use. (Calling __memp_bh_freeze above will not - * mark bhp BH_FROZEN.) + * we can use. (Calling __memp_bh_freeze above will not mark + * this bhp BH_FROZEN; it creates another frozen one.) 
*/ if (F_ISSET(bhp, BH_FROZEN)) { - DB_ASSERT(env, obsolete || SH_CHAIN_SINGLETON(bhp, vc)); + DB_ASSERT(env, SH_CHAIN_SINGLETON(bhp, vc) || + obsolete || BH_OBSOLETE(bhp, hp->old_reader, vlsn)); DB_ASSERT(env, BH_REFCOUNT(bhp) > 0); if (!F_ISSET(bhp, BH_THAWED)) { /* @@ -592,10 +741,10 @@ this_buffer: /* */ if ((ret = __memp_bh_thaw(dbmp, infop, hp, bhp, NULL)) != 0) - return (ret); + goto done; MUTEX_READLOCK(env, hp->mtx_hash); } else { - need_free = (atomic_dec(env, &bhp->ref) == 0); + need_free = atomic_dec(env, &bhp->ref) == 0; F_CLR(bhp, BH_EXCLUSIVE); MUTEX_UNLOCK(env, bhp->mtx_buf); if (need_free) { @@ -626,7 +775,10 @@ this_buffer: /* if (alloc_freeze) { if ((ret = __memp_bhfree(dbmp, infop, bh_mfp, hp, bhp, 0)) != 0) - return (ret); + goto err; + DB_ASSERT(env, bhp->mtx_buf != MUTEX_INVALID); + if ((ret = __mutex_free(env, &bhp->mtx_buf)) != 0) + goto err; b_lock = 0; h_locked = 0; @@ -654,23 +806,21 @@ this_buffer: /* } /* - * Check to see if the buffer is the size we're looking for. - * If so, we can simply reuse it. Otherwise, free the buffer - * and its space and keep looking. + * If the buffer is the size we're looking for, we can simply + * reuse it. Otherwise, free it and keep looking. */ if (mfp != NULL && mfp->pagesize == bh_mfp->pagesize) { if ((ret = __memp_bhfree(dbmp, infop, bh_mfp, hp, bhp, 0)) != 0) - return (ret); + goto err; p = bhp; goto found; } freed_space += sizeof(*bhp) + bh_mfp->pagesize; - if ((ret = - __memp_bhfree(dbmp, infop, - bh_mfp, hp, bhp, BH_FREE_FREEMEM)) != 0) - return (ret); + if ((ret = __memp_bhfree(dbmp, + infop, bh_mfp, hp, bhp, BH_FREE_FREEMEM)) != 0) + goto err; /* Reset "aggressive" and "write_error" if we free any space. 
*/ if (aggressive > 1) @@ -689,12 +839,14 @@ next_hb: if (bhp != NULL) { if (b_lock) { F_CLR(bhp, BH_EXCLUSIVE); MUTEX_UNLOCK(env, bhp->mtx_buf); + b_lock = 0; } } if (h_locked) MUTEX_UNLOCK(env, hp->mtx_hash); h_locked = 0; } + obsolete = 0; MPOOL_REGION_LOCK(env, infop); /* @@ -706,7 +858,15 @@ next_hb: if (bhp != NULL) { if (freed_space >= 3 * len) goto alloc; } - /* NOTREACHED */ +err: + if (h_locked) { + MUTEX_UNLOCK(env, hp->mtx_hash); + h_locked = 0; + } +done: + if (snapshots != NULL) + __os_free(env, snapshots); + return (ret); } /* diff --git a/src/mp/mp_backup.c b/src/mp/mp_backup.c index f376cda7..f1072292 100644 --- a/src/mp/mp_backup.c +++ b/src/mp/mp_backup.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -145,6 +145,9 @@ __memp_backup_mpf(env, mpf, ip, first_pgno, last_pgno, fp, handle, flags) if (backup == NULL || (len = backup->size) == 0) len = MEGABYTE; + /* Ensure backup page size is at least as big as db page size */ + if (len < mfp->pagesize) + len = mfp->pagesize; if ((ret = __os_malloc(env, len, &buf)) != 0) return (ret); write_size = (u_int32_t)(len / mfp->pagesize); @@ -188,7 +191,7 @@ __memp_backup_mpf(env, mpf, ip, first_pgno, last_pgno, fp, handle, flags) if (backup != NULL && backup->write != NULL) { if ((ret = backup->write( - env->dbenv, gigs, off, (u_int32_t)nr, + env->dbenv, gigs, off, (u_int32_t)nr, buf, handle)) != 0) break; } else { diff --git a/src/mp/mp_bh.c b/src/mp/mp_bh.c index 1df8e206..30293f29 100644 --- a/src/mp/mp_bh.c +++ b/src/mp/mp_bh.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ @@ -157,7 +157,7 @@ __memp_bhwrite(dbmp, hp, mfp, bhp, open_extents) opened = 1; if ((ret = __memp_fopen(dbmfp, mfp, NULL, NULL, DB_FLUSH | DB_DURABLE_UNKNOWN, 0, mfp->pagesize)) != 0) { - dbmfp->ref--; + dbmfp->ref--; (void)__memp_fclose(dbmfp, 0); /* @@ -264,7 +264,7 @@ __memp_pgread(dbmfp, bhp, can_create) * how to handle the error. */ if (!can_create) { - ret = DB_PAGE_NOTFOUND; + ret = USR_ERR(env, DB_PAGE_NOTFOUND); goto err; } @@ -557,6 +557,9 @@ err: __db_errx(env, DB_STR_A("3016", * __memp_bhfree -- * Free a bucket header and its referenced data. * + * The hash bucket is unlocked before returning except when flags includes + * BH_FREE_UNLOCKED -- or there was no hp passed in to begin with. + * * PUBLIC: int __memp_bhfree __P((DB_MPOOL *, * PUBLIC: REGINFO *, MPOOLFILE *, DB_MPOOL_HASH *, BH *, u_int32_t)); */ @@ -600,10 +603,13 @@ __memp_bhfree(dbmp, infop, mfp, hp, bhp, flags) (SH_CHAIN_NEXTP(bhp, vc, __bh)->td_off == bhp->td_off || bhp->td_off == INVALID_ROFF || IS_MAX_LSN(*VISIBLE_LSN(env, bhp)) || + F_ISSET(bhp, BH_UNREACHABLE) || BH_OBSOLETE(bhp, hp->old_reader, vlsn)))); PERFMON3(env, mpool, evict, __memp_fns(dbmp, mfp), bhp->pgno, bhp); - + if (FLD_ISSET(env->dbenv->verbose, DB_VERB_MVCC)) + __db_msg(env, "bhfree pgno %lu roff %lx", + (u_long)bhp->pgno, (u_long)R_OFFSET(dbmp->reginfo, bhp)); /* * Delete the buffer header from the hash bucket queue or the * version chain. diff --git a/src/mp/mp_fget.c b/src/mp/mp_fget.c index 5f9a4bf9..270135bd 100644 --- a/src/mp/mp_fget.c +++ b/src/mp/mp_fget.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -53,15 +53,19 @@ __memp_fget_pp(dbmfp, pgnoaddr, txnp, flags, addrp) * time, which we don't want to do because one of our big goals in life * is to keep database files small. 
It's sleazy as hell, but we catch * any attempt to actually write the file in memp_fput(). + * + * CREATE, LAST, and NEW are mutually exclusive. DIRTY and EDIT are also + * mutually exclusive - that is checked in __memp_fget() itself. */ +#undef OKMODE #undef OKFLAGS -#define OKFLAGS (DB_MPOOL_CREATE | DB_MPOOL_DIRTY | \ - DB_MPOOL_EDIT | DB_MPOOL_LAST | DB_MPOOL_NEW) +#define OKMODE (DB_MPOOL_CREATE | DB_MPOOL_LAST | DB_MPOOL_NEW) +#define OKFLAGS (OKMODE | DB_MPOOL_DIRTY | DB_MPOOL_EDIT) if (flags != 0) { if ((ret = __db_fchk(env, "memp_fget", flags, OKFLAGS)) != 0) return (ret); - switch (FLD_CLR(flags, DB_MPOOL_DIRTY | DB_MPOOL_EDIT)) { + switch (FLD_ISSET(flags, OKMODE)) { case DB_MPOOL_CREATE: case DB_MPOOL_LAST: case DB_MPOOL_NEW: @@ -131,6 +135,7 @@ __memp_fget(dbmfp, pgnoaddr, ip, txn, flags, addrp) #ifdef DIAGNOSTIC DB_LOCKTAB *lt; DB_LOCKER *locker; + int pagelock_err; #endif *(void **)addrp = NULL; @@ -274,7 +279,7 @@ retry: MUTEX_LOCK(env, hp->mtx_hash); * the BTREE in a subsequent txn). 
*/ if (bhp == NULL) { - ret = DB_PAGE_NOTFOUND; + ret = USR_ERR(env, DB_PAGE_NOTFOUND); goto err; } } @@ -303,7 +308,10 @@ retry: MUTEX_LOCK(env, hp->mtx_hash); MUTEX_UNLOCK(env, hp->mtx_hash); h_locked = 0; if (dirty || extending || makecopy || F_ISSET(bhp, BH_FROZEN)) { -xlatch: if (LF_ISSET(DB_MPOOL_TRY)) { +#ifdef HAVE_SHARED_LATCHES +xlatch: +#endif + if (LF_ISSET(DB_MPOOL_TRY)) { if ((ret = MUTEX_TRYLOCK(env, bhp->mtx_buf)) != 0) goto err; @@ -373,11 +381,11 @@ thawed: need_free = (atomic_dec(env, &bhp->ref) == 0); bhp = NULL; goto retry; } else if (dirty && SH_CHAIN_HASNEXT(bhp, vc)) { - ret = DB_LOCK_DEADLOCK; + ret = USR_ERR(env, DB_LOCK_DEADLOCK); goto err; } else if (F_ISSET(bhp, BH_FREED) && flags != DB_MPOOL_CREATE && flags != DB_MPOOL_NEW && flags != DB_MPOOL_FREE) { - ret = DB_PAGE_NOTFOUND; + ret = USR_ERR(env, DB_PAGE_NOTFOUND); goto err; } @@ -508,9 +516,13 @@ revive: if (F_ISSET(bhp, BH_FREED)) /* * With multiversion databases, we might need to * allocate a new buffer into which we can copy the one - * that we found. In that case, check the last buffer + * that we found. In that case, check the old versions * in the chain to see whether we can reuse an obsolete - * buffer. + * or unreachable buffer. First see whether the oldest + * version is truly obsolete. If not, look for somewhat + * more recent versions which are no longer needed + * because the snapshot transactions which once could + * have seen them have now exited. 
* * To provide snapshot isolation, we need to make sure * that we've seen a buffer older than the oldest @@ -523,24 +535,17 @@ reuse: if ((makecopy || F_ISSET(bhp, BH_FROZEN)) && } if ((makecopy || F_ISSET(bhp, BH_FROZEN)) && SH_CHAIN_HASPREV(bhp, vc)) { - oldest_bhp = SH_CHAIN_PREVP(bhp, vc, __bh); - while (SH_CHAIN_HASPREV(oldest_bhp, vc)) - oldest_bhp = SH_CHAIN_PREVP( - oldest_bhp, vc, __bh); - - if (BH_REFCOUNT(oldest_bhp) == 0 && - !BH_OBSOLETE( - oldest_bhp, hp->old_reader, vlsn) && - (ret = __txn_oldest_reader(env, - &hp->old_reader)) != 0) + if ((ret = __memp_find_obsolete_version(env, + bhp, hp, &oldest_bhp)) != 0) goto err; - - if (BH_OBSOLETE( - oldest_bhp, hp->old_reader, vlsn) && - BH_REFCOUNT(oldest_bhp) == 0) { + if (oldest_bhp != NULL) { DB_ASSERT(env, !F_ISSET(oldest_bhp, BH_DIRTY)); atomic_inc(env, &oldest_bhp->ref); +#ifdef HAVE_STATISTICS + if (SH_CHAIN_HASPREV(oldest_bhp, vc)) + c_mp->stat.st_mvcc_reused++; +#endif if (F_ISSET(oldest_bhp, BH_FROZEN)) { /* * This call will release the @@ -606,7 +611,7 @@ newpg: /* mfp->last_pgno >= mfp->maxpgno) { __db_errx(env, DB_STR_A("3023", "%s: file limited to %lu pages", "%s %lu"), - __memp_fn(dbmfp), (u_long)mfp->maxpgno); + __memp_fn(dbmfp), (u_long)mfp->maxpgno + 1); ret = ENOSPC; } else *pgnoaddr = mfp->last_pgno + 1; @@ -615,7 +620,7 @@ newpg: /* if (mfp->maxpgno != 0 && *pgnoaddr > mfp->maxpgno) { __db_errx(env, DB_STR_A("3024", "%s: file limited to %lu pages", "%s %lu"), - __memp_fn(dbmfp), (u_long)mfp->maxpgno); + __memp_fn(dbmfp), (u_long)mfp->maxpgno + 1); ret = ENOSPC; } else if (!extending) extending = *pgnoaddr > mfp->last_pgno; @@ -937,8 +942,17 @@ alloc: /* Allocate a new buffer header and data space. */ * need to make copy, so we now need to allocate another buffer * to hold the new copy. 
*/ - if (alloc_bhp == NULL) + if (alloc_bhp == NULL) { + if (FLD_ISSET(env->dbenv->verbose, DB_VERB_MVCC)) + __db_msg(env, + "fget makecopy txn %08x %lu/%lu going to reuse pgno %d from %lu/%lu", + txn->txnid, td == NULL ? 0L : + (u_long)td->read_lsn.file, td == NULL ? 0L : + (u_long)td->read_lsn.offset, bhp->pgno, + (u_long)VISIBLE_LSN(env, bhp)->file, + (u_long)VISIBLE_LSN(env, bhp)->offset); goto reuse; + } DB_ASSERT(env, bhp != NULL && alloc_bhp != bhp); DB_ASSERT(env, bhp->td_off == INVALID_ROFF || @@ -1019,6 +1033,15 @@ alloc: /* Allocate a new buffer header and data space. */ F_CLR(bhp, BH_EXCLUSIVE); MUTEX_UNLOCK(env, bhp->mtx_buf); + if (FLD_ISSET(env->dbenv->verbose, DB_VERB_MVCC)) + __db_msg(env, + "fget makecopy txn %08x %lx pgno %d from %lu/%lu", + txn->txnid, (u_long)R_OFFSET(infop, bhp), + bhp->pgno, bhp->td_off == INVALID_ROFF ? 0L : + (u_long)VISIBLE_LSN(env, bhp)->file, + bhp->td_off == INVALID_ROFF ? 0L : + (u_long)VISIBLE_LSN(env, bhp)->offset); + bhp = alloc_bhp; DB_ASSERT(env, BH_REFCOUNT(bhp) > 0); b_incr = 1; @@ -1164,8 +1187,15 @@ alloc: /* Allocate a new buffer header and data space. */ lt = env->lk_handle; locker = (DB_LOCKER *) (R_ADDR(&lt->reginfo, ip->dbth_locker)); - DB_ASSERT(env, __db_has_pagelock(env, locker, dbmfp, - (PAGE*)bhp->buf, DB_LOCK_WRITE) == 0); + pagelock_err = __db_has_pagelock(env, locker, dbmfp, + (PAGE *)bhp->buf, DB_LOCK_WRITE); + if (pagelock_err != 0) { + if (pagelock_err == DB_RUNRECOVERY) + return (pagelock_err); + __db_syserr(env, pagelock_err, + "Locker %x has no page lock for pgno %d", + locker->id, ((PAGE *)bhp->buf)->pgno); + } } #endif @@ -1228,3 +1258,85 @@ err: /* return (ret); } + +/* + * __memp_find_obsolete_version -- + * + * Search the version chain, from oldest to youngest, looking for buffers + * which are no longer BH_VISIBLE() to any existing transaction. + * + * The hash bucket is locked, no buffer is locked. 
+ * + * PUBLIC: int __memp_find_obsolete_version + * PUBLIC: __P((ENV *, BH *, DB_MPOOL_HASH *, BH **)); + */ +int +__memp_find_obsolete_version(env, vis_bhp, hp, foundp) + ENV *env; + BH *vis_bhp; + DB_MPOOL_HASH *hp; + BH **foundp; +{ + BH *bhp; + DB_LSN *readers, vlsn; + int n_readers, ret; + + *foundp = NULL; + readers = NULL; + ret = 0; + bhp = SH_CHAIN_PREVP(vis_bhp, vc, __bh); + while (SH_CHAIN_HASPREV(bhp, vc)) + bhp = SH_CHAIN_PREVP(bhp, vc, __bh); + + /* + * The least-expensive case is finding an obsolete version without + * needing to build the active snapshot transaction list. + */ + if (BH_OBSOLETE(bhp, hp->old_reader, vlsn) && BH_REFCOUNT(bhp) == 0) { + *foundp = bhp; + goto out; + } + + if ((ret = __txn_get_readers(env, &readers, &n_readers)) != 0) + goto out; + + if (LOG_COMPARE(&readers[n_readers - 1], &hp->old_reader) > 0) { + hp->old_reader = readers[n_readers - 1]; + if (BH_OBSOLETE(bhp, hp->old_reader, vlsn) && + BH_REFCOUNT(bhp) == 0) { + *foundp = bhp; + goto cleanup; + } + } + + while ((bhp = SH_CHAIN_NEXT(bhp, vc, __bh)) != vis_bhp) { + if (BH_REFCOUNT(bhp) == 0 && + __memp_bh_unreachable(env, bhp, readers, n_readers)) { + *foundp = bhp; +#ifdef DIAGNOSTIC + /* + * Usually when the hash bucket is locked, the refcount + * is incremented and the bucket unlocked before the + * buffer is locked; this avoids mtx_buf deadlocks. + * This unreachable version cannot be involved with any + * deadlock-creating locking, though the head of the + * version chain could be locked. No TRYLOCK needed. 
+ */ + MUTEX_LOCK(env, bhp->mtx_buf); + F_SET(bhp, BH_UNREACHABLE); + MUTEX_UNLOCK(env, bhp->mtx_buf); +#endif + break; + } + } + +cleanup: + if (readers != NULL) + __os_free(env, readers); +out: + if (FLD_ISSET(env->dbenv->verbose, DB_VERB_MVCC) && *foundp != NULL) + __db_msg(env, "fget reusing %p pgno %d @%lu/%lu", bhp, + bhp->pgno, (u_long)VISIBLE_LSN(env, bhp)->file, + (u_long)VISIBLE_LSN(env, bhp)->offset); + return (ret); +} diff --git a/src/mp/mp_fmethod.c b/src/mp/mp_fmethod.c index 41bd638c..4974f57c 100644 --- a/src/mp/mp_fmethod.c +++ b/src/mp/mp_fmethod.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -315,7 +315,7 @@ __memp_set_lsn_offset(dbmfp, lsn_offset) /* * __memp_get_maxsize -- - * Get the file's maximum size. + * Get the file's maximum size, returning zeroes if none is set. */ static int __memp_get_maxsize(dbmfp, gbytesp, bytesp) @@ -334,11 +334,22 @@ __memp_get_maxsize(dbmfp, gbytesp, bytesp) ENV_ENTER(env, ip); MUTEX_LOCK(env, mfp->mutex); - *gbytesp = (u_int32_t) - (mfp->maxpgno / (GIGABYTE / mfp->pagesize)); - *bytesp = (u_int32_t) - ((mfp->maxpgno % (GIGABYTE / mfp->pagesize)) * - mfp->pagesize); + if (mfp->maxpgno == 0) { + *gbytesp = *bytesp = 0; + } else { + *gbytesp = (u_int32_t) + (mfp->maxpgno / (GIGABYTE / mfp->pagesize)); + *bytesp = (u_int32_t) (mfp->maxpgno % + (GIGABYTE / mfp->pagesize) + 1) * mfp->pagesize; + /* + * After converting from 0-based maxpgno to #pages, we + * might have bumped into the next gigabyte boundary. + */ + if (*bytesp >= GIGABYTE) { + *bytesp -= GIGABYTE; + *gbytesp += 1; + } + } MUTEX_UNLOCK(env, mfp->mutex); ENV_LEAVE(env, ip); @@ -348,8 +359,34 @@ __memp_get_maxsize(dbmfp, gbytesp, bytesp) } /* + * __memp_set_maxpgno -- + * Set the file's maxpgno from the configured max size. 
If that size is + * pagesize or less then the filesize limit is disabled. + * + * PUBLIC: void __memp_set_maxpgno __P((MPOOLFILE *, u_int32_t, u_int32_t)); + */ +void +__memp_set_maxpgno(mfp, gbytes, bytes) + MPOOLFILE *mfp; + u_int32_t gbytes, bytes; +{ + if (gbytes == 0 && bytes <= mfp->pagesize) + mfp->maxpgno = 0; + else { + mfp->maxpgno = (db_pgno_t) + (gbytes * (GIGABYTE / mfp->pagesize)); + /* Round up to account for any fractional page. */ + mfp->maxpgno += (db_pgno_t) + ((bytes + mfp->pagesize - 1) / mfp->pagesize); + /* Convert from #pages to the zero-based max pgno. */ + mfp->maxpgno--; + } +} + +/* * __memp_set_maxsize -- - * Set the file's maximum size. + * Set the file's maximum size; if the size is <= pagesize then + * remove any file size limit. */ static int __memp_set_maxsize(dbmfp, gbytes, bytes) @@ -368,10 +405,7 @@ __memp_set_maxsize(dbmfp, gbytes, bytes) ENV_ENTER(env, ip); MUTEX_LOCK(env, mfp->mutex); - mfp->maxpgno = (db_pgno_t) - (gbytes * (GIGABYTE / mfp->pagesize)); - mfp->maxpgno += (db_pgno_t) - ((bytes + mfp->pagesize - 1) / mfp->pagesize); + __memp_set_maxpgno(mfp, gbytes, bytes); MUTEX_UNLOCK(env, mfp->mutex); ENV_LEAVE(env, ip); diff --git a/src/mp/mp_fopen.c b/src/mp/mp_fopen.c index ef7f886a..dbe7b9c8 100644 --- a/src/mp/mp_fopen.c +++ b/src/mp/mp_fopen.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -89,8 +89,9 @@ __memp_fopen_pp(dbmfp, path, flags, mode, pagesize) * Generate the number of user opens. If there is no backing file * there is an extra open count to keep the in memory db around. 
*/ -#define MFP_OPEN_CNT(mfp) ((mfp)->mpf_cnt - ((mfp)->neutral_cnt + \ +#define MFP_OPEN_CNT(mfp) ((mfp)->mpf_cnt - ((mfp)->neutral_cnt + \ (u_int32_t)(mfp)->no_backing_file)) +#define MP_IOINFO_RETRIES 5 /* * __memp_fopen -- * DB_MPOOLFILE->open. @@ -118,7 +119,7 @@ __memp_fopen(dbmfp, mfp, path, dirp, flags, mode, pgsize) size_t maxmap; db_pgno_t last_pgno; u_int32_t bucket, mbytes, bytes, oflags, pagesize; - int refinc, ret, isdir; + int isdir, refinc, ret, tries; char *rpath; /* If this handle is already open, return. */ @@ -249,7 +250,7 @@ __memp_fopen(dbmfp, mfp, path, dirp, flags, mode, pgsize) if (MFP_OPEN_CNT(mfp) > 0 && atomic_read(&mfp->multiversion) == 0) { mvcc_err: __db_errx(env, DB_STR("3041", -"DB_MULTIVERSION cannot be specified on a database file which is already open")); +"DB_MULTIVERSION cannot be specified on a database file that is already open")); ret = EINVAL; goto err; } @@ -399,11 +400,44 @@ mvcc_err: __db_errx(env, DB_STR("3041", if (LF_ISSET(DB_ODDFILESIZE)) bytes -= (u_int32_t)(bytes % pagesize); else { - __db_errx(env, DB_STR_A("3037", - "%s: file size not a multiple of the pagesize", "%s"), - rpath); - ret = EINVAL; - goto err; + /* + * If the file size is not a multiple of the + * pagesize, it is likely because the ioinfo + * call is racing with a write that is extending + * the file. Many file systems will extend + * in fs block size units, and if the pagesize + * is larger than that, we can briefly see a + * file size that is not a multiple of pagesize. + * + * Yield the processor to allow that to finish + * and try again a few times. 
+ */ + tries = 0; + STAT((mp->stat.st_oddfsize_detect++)); + while (tries < MP_IOINFO_RETRIES) { + if ((ret = __os_ioinfo(env, rpath, + dbmfp->fhp, &mbytes, &bytes, + NULL)) != 0) { + __db_err(env, ret, "%s", rpath); + goto err; + } + if (bytes % pagesize != 0) { + __os_yield(env, 0, 50000); + tries++; + } else { + STAT(( + mp->stat.st_oddfsize_resolve++)); + break; + } + } + if (tries == MP_IOINFO_RETRIES) { + __db_errx(env, DB_STR_A("3043", + "%s: file size (%lu %lu) not a multiple of the pagesize %lu", + "%s %lu %lu %lu"), + rpath, (u_long)mbytes, (u_long)bytes, (u_long)pagesize); + ret = EINVAL; + goto err; + } } } @@ -786,13 +820,7 @@ __memp_mpf_alloc(dbmp, dbmfp, path, pagesize, flags, retmfp) mfp->lsn_off = dbmfp->lsn_offset; mfp->clear_len = dbmfp->clear_len; mfp->priority = dbmfp->priority; - if (dbmfp->gbytes != 0 || dbmfp->bytes != 0) { - mfp->maxpgno = (db_pgno_t) - (dbmfp->gbytes * (GIGABYTE / mfp->pagesize)); - mfp->maxpgno += (db_pgno_t) - ((dbmfp->bytes + mfp->pagesize - 1) / - mfp->pagesize); - } + __memp_set_maxpgno(mfp, dbmfp->gbytes, dbmfp->bytes); if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE)) mfp->no_backing_file = 1; if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_UNLINK)) @@ -1019,6 +1047,7 @@ __memp_fclose(dbmfp, flags) ret = t_ret; __os_free(env, rpath); } + mfp->unlink_on_close = 0; } if (MFP_OPEN_CNT(mfp) == 0) { F_CLR(mfp, MP_NOT_DURABLE); @@ -1068,6 +1097,7 @@ __memp_mf_discard(dbmp, mfp, hp_locked) DB_MPOOL_STAT *sp; #endif MPOOL *mp; + char *rpath; int need_sync, ret, t_ret; env = dbmp->env; @@ -1095,6 +1125,23 @@ __memp_mf_discard(dbmp, mfp, hp_locked) */ mfp->deadfile = 1; + /* We should unlink the file if necessary. 
*/ + if (mfp->block_cnt == 0 && mfp->mpf_cnt == 0 && mfp->unlink_on_close && + !F_ISSET(mfp, MP_TEMP) && !mfp->no_backing_file) { + if ((t_ret = __db_appname(env, DB_APP_DATA, + R_ADDR(dbmp->reginfo, mfp->path_off), NULL, + &rpath)) != 0 && ret == 0) + ret = t_ret; + if (t_ret == 0) { + if ((t_ret = __os_unlink( + dbmp->env, rpath, 0)) != 0 && ret == 0) + ret = t_ret; + __os_free(env, rpath); + } + mfp->unlink_on_close = 0; + need_sync = 0; + } + /* Discard the mutex we're holding and return it too the pool. */ MUTEX_UNLOCK(env, mfp->mutex); if ((t_ret = __mutex_free(env, &mfp->mutex)) != 0 && ret == 0) diff --git a/src/mp/mp_fput.c b/src/mp/mp_fput.c index 7a900fd0..06b30fd4 100644 --- a/src/mp/mp_fput.c +++ b/src/mp/mp_fput.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -52,7 +52,8 @@ __memp_fput_pp(dbmfp, pgaddr, priority, flags) /* * __memp_fput -- - * DB_MPOOLFILE->put. + * DB_MPOOLFILE->put. Release this reference to the page. If the reference + * count drop to zero adjust the buffer's cache priority. * * PUBLIC: int __memp_fput __P((DB_MPOOLFILE *, * PUBLIC: DB_THREAD_INFO *, void *, DB_CACHE_PRIORITY)); diff --git a/src/mp/mp_fset.c b/src/mp/mp_fset.c index 1129853f..770ec5c8 100644 --- a/src/mp/mp_fset.c +++ b/src/mp/mp_fset.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/mp/mp_method.c b/src/mp/mp_method.c index 7afae248..56d6c42b 100644 --- a/src/mp/mp_method.c +++ b/src/mp/mp_method.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. 
All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -67,6 +67,7 @@ __memp_get_cachesize(dbenv, gbytesp, bytesp, ncachep) int *ncachep; { DB_MPOOL *dbmp; + DB_THREAD_INFO *ip; ENV *env; MPOOL *mp; @@ -78,12 +79,16 @@ __memp_get_cachesize(dbenv, gbytesp, bytesp, ncachep) if (MPOOL_ON(env)) { dbmp = env->mp_handle; mp = dbmp->reginfo[0].primary; + ENV_ENTER(env, ip); + MUTEX_LOCK(env, mp->mtx_resize); if (gbytesp != NULL) *gbytesp = mp->gbytes; if (bytesp != NULL) *bytesp = mp->bytes; if (ncachep != NULL) *ncachep = (int)mp->nreg; + MUTEX_UNLOCK(env, mp->mtx_resize); + ENV_LEAVE(env, ip); } else { if (gbytesp != NULL) *gbytesp = dbenv->mp_gbytes; @@ -380,7 +385,7 @@ __memp_set_mp_max_write(dbenv, maxwrite, maxwrite_sleep) env = dbenv->env; ENV_NOT_CONFIGURED(env, - env->mp_handle, "DB_ENV->get_mp_max_write", DB_INIT_MPOOL); + env->mp_handle, "DB_ENV->set_mp_max_write", DB_INIT_MPOOL); if (MPOOL_ON(env)) { dbmp = env->mp_handle; @@ -448,7 +453,7 @@ __memp_set_mp_mmapsize(dbenv, mp_mmapsize) env = dbenv->env; ENV_NOT_CONFIGURED(env, - env->mp_handle, "DB_ENV->set_mp_max_mmapsize", DB_INIT_MPOOL); + env->mp_handle, "DB_ENV->set_mp_mmapsize", DB_INIT_MPOOL); if (MPOOL_ON(env)) { dbmp = env->mp_handle; @@ -512,7 +517,7 @@ __memp_set_mp_pagesize(dbenv, mp_pagesize) env = dbenv->env; ENV_NOT_CONFIGURED(env, - env->mp_handle, "DB_ENV->get_mp_max_mmapsize", DB_INIT_MPOOL); + env->mp_handle, "DB_ENV->set_mp_pagesize", DB_INIT_MPOOL); ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_mp_pagesize"); dbenv->mp_pagesize = mp_pagesize; @@ -561,7 +566,7 @@ __memp_set_mp_tablesize(dbenv, mp_tablesize) env = dbenv->env; ENV_NOT_CONFIGURED(env, - env->mp_handle, "DB_ENV->get_mp_max_mmapsize", DB_INIT_MPOOL); + env->mp_handle, "DB_ENV->set_mp_tablesize", DB_INIT_MPOOL); ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_mp_tablesize"); dbenv->mp_tablesize = mp_tablesize; @@ -583,7 +588,7 @@ __memp_get_mp_mtxcount(dbenv, mp_mtxcountp) env = 
dbenv->env; ENV_NOT_CONFIGURED(env, - env->mp_handle, "DB_ENV->get_mp_max_mtxcount", DB_INIT_MPOOL); + env->mp_handle, "DB_ENV->get_mp_mtxcount", DB_INIT_MPOOL); if (MPOOL_ON(env)) { dbmp = env->mp_handle; @@ -610,7 +615,7 @@ __memp_set_mp_mtxcount(dbenv, mp_mtxcount) env = dbenv->env; ENV_NOT_CONFIGURED(env, - env->mp_handle, "DB_ENV->get_mp_max_mmapsize", DB_INIT_MPOOL); + env->mp_handle, "DB_ENV->set_mp_mtxcount", DB_INIT_MPOOL); ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_mp_mtxcount"); dbenv->mp_mtxcount = mp_mtxcount; @@ -870,7 +875,7 @@ __memp_ftruncate(dbmfp, txn, ip, pgno, flags) !mfp->no_backing_file && pgno <= mfp->last_flushed_pgno) #ifdef HAVE_FTRUNCATE ret = __os_truncate(env, - dbmfp->fhp, pgno, mfp->pagesize); + dbmfp->fhp, pgno, mfp->pagesize, 0); #else ret = __db_zero_extend(env, dbmfp->fhp, pgno, mfp->last_pgno, mfp->pagesize); diff --git a/src/mp/mp_mvcc.c b/src/mp/mp_mvcc.c index 47531528..b51ae135 100644 --- a/src/mp/mp_mvcc.c +++ b/src/mp/mp_mvcc.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2006, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2006, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -151,6 +151,11 @@ __memp_bh_freeze(dbmp, infop, hp, bhp, need_frozenp) real_name = NULL; fhp = NULL; + if (FLD_ISSET(env->dbenv->verbose, DB_VERB_MVCC)) + __db_msg(env, "freeze %s %d @%lu/%lu", __memp_fns(dbmp, mfp), + bhp->pgno, (u_long)VISIBLE_LSN(env, bhp)->file, + (u_long)VISIBLE_LSN(env, bhp)->offset); + MVCC_MPROTECT(bhp->buf, pagesize, PROT_READ | PROT_WRITE); MPOOL_REGION_LOCK(env, infop); @@ -161,7 +166,7 @@ __memp_bh_freeze(dbmp, infop, hp, bhp, need_frozenp) } else { *need_frozenp = 1; - /* There might be a small amount of unallocated space. */ + /* There might be enough space for a single-item block. 
*/ if (__env_alloc(infop, sizeof(BH_FROZEN_ALLOC) + sizeof(BH_FROZEN_PAGE), &frozen_alloc) == 0) { @@ -405,6 +410,12 @@ __memp_bh_thaw(dbmp, infop, hp, frozen_bhp, alloc_bhp) ret = 0; real_name = NULL; + if (FLD_ISSET(env->dbenv->verbose, DB_VERB_MVCC)) + __db_msg(env, "thaw %s %d @%lu/%lu", __memp_fns(dbmp, mfp), + frozen_bhp->pgno, + (u_long)VISIBLE_LSN(env, frozen_bhp)->file, + (u_long)VISIBLE_LSN(env, frozen_bhp)->offset); + MUTEX_REQUIRED(env, hp->mtx_hash); DB_ASSERT(env, F_ISSET(frozen_bhp, BH_EXCLUSIVE) || alloc_bhp == NULL); h_locked = 1; @@ -414,7 +425,8 @@ __memp_bh_thaw(dbmp, infop, hp, frozen_bhp, alloc_bhp) DB_ASSERT(env, alloc_bhp != NULL || SH_CHAIN_SINGLETON(frozen_bhp, vc) || (SH_CHAIN_HASNEXT(frozen_bhp, vc) && - BH_OBSOLETE(frozen_bhp, hp->old_reader, vlsn))); + BH_OBSOLETE(frozen_bhp, hp->old_reader, vlsn)) || + F_ISSET(frozen_bhp, BH_UNREACHABLE)); DB_ASSERT(env, alloc_bhp == NULL || !F_ISSET(alloc_bhp, BH_FROZEN)); spgno = ((BH_FROZEN_PAGE *)frozen_bhp)->spgno; @@ -516,7 +528,7 @@ __memp_bh_thaw(dbmp, infop, hp, frozen_bhp, alloc_bhp) else { maxpgno -= (db_pgno_t)ntrunc; if ((ret = __os_truncate(env, fhp, - maxpgno + 1, pagesize)) != 0) + maxpgno + 1, pagesize, 0)) != 0) goto err; /* Fix up the linked list */ diff --git a/src/mp/mp_region.c b/src/mp/mp_region.c index 07134de7..ba836cf4 100644 --- a/src/mp/mp_region.c +++ b/src/mp/mp_region.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ @@ -11,7 +11,7 @@ #include "db_int.h" #include "dbinc/mp.h" -static int __memp_init_config __P((ENV *, MPOOL *)); +static int __memp_init_config __P((ENV *, MPOOL *, int)); static void __memp_region_size __P((ENV *, roff_t *, u_int32_t *)); #define MPOOL_DEFAULT_PAGESIZE (4 * 1024) @@ -34,7 +34,7 @@ __memp_open(env, create_ok) roff_t cache_size, max_size, reg_size; u_int i, max_nreg; u_int32_t htab_buckets, *regids; - int ret; + int create, ret; dbenv = env->dbenv; cache_size = 0; @@ -77,7 +77,8 @@ __memp_open(env, create_ok) * If we created the region, initialize it. Create or join any * additional regions. */ - if (F_ISSET(®info, REGION_CREATE)) { + create = F_ISSET(®info, REGION_CREATE); + if (create) { /* * We define how many regions there are going to be, allocate * the REGINFO structures and create them. Make sure we don't @@ -167,23 +168,38 @@ __memp_open(env, create_ok) env->mp_handle = dbmp; /* A process joining the region may reset the mpool configuration. */ - if ((ret = __memp_init_config(env, mp)) != 0) + if ((ret = __memp_init_config(env, mp, create)) != 0) return (ret); return (0); -err: env->mp_handle = NULL; - if (dbmp->reginfo != NULL && dbmp->reginfo[0].addr != NULL) { - for (i = 0; i < dbenv->mp_ncache; ++i) +err: (void)__mutex_free(env, &dbmp->mutex); + (void)__memp_region_detach(env, dbmp); + return (ret); +} + +/* __memp_region_detach + * Detach from any attached mempool regions. 
+ * + * PUBLIC: int __memp_region_detach __P((ENV *, DB_MPOOL *)); + */ +int +__memp_region_detach(env, dbmp) + ENV *env; + DB_MPOOL *dbmp; +{ + u_int i; + + if (dbmp != NULL && + dbmp->reginfo != NULL && dbmp->reginfo[0].addr != NULL) { + for (i = 0; i < env->dbenv->mp_ncache; ++i) if (dbmp->reginfo[i].id != INVALID_REGION_ID) (void)__env_region_detach( env, &dbmp->reginfo[i], 0); __os_free(env, dbmp->reginfo); } - - (void)__mutex_free(env, &dbmp->mutex); - __os_free(env, dbmp); - return (ret); + env->mp_handle = NULL; + return (0); } /* @@ -207,7 +223,7 @@ __memp_init(env, dbmp, reginfo_off, htab_buckets, max_nreg) MPOOL *mp, *main_mp; REGINFO *infop; db_mutex_t mtx_base, mtx_discard, mtx_prev; - u_int32_t i; + u_int32_t i, mp_mtxcount; int ret; void *p; @@ -224,6 +240,23 @@ __memp_init(env, dbmp, reginfo_off, htab_buckets, max_nreg) __mutex_alloc(env, MTX_MPOOL_REGION, 0, &mp->mtx_region)) != 0) return (ret); + /* + * Intializing the first mpool region allocates the mpool region id + * array, file table and, if not ENV_PRIVATE, all the cache regions' + * hash bucket mutexes in a single contiguous block of mutex ids, which + * remain allocated when the cache is resized. The block is 'known' to + * start with the first id (mtx_base), and to end #regions * mp_mtxcount + * later. In private environments, mutex ids are not smallish integers, + * but __env_alloc()'d pointers. Since a range of (base, count) doesn't + * work for these likely-scattered mutexes, we allocate private threaded + * mutexes as they are needed. Private non-threaded caches don't need + * any mutexes at all. 
+ */ + if ((mp_mtxcount = dbenv->mp_mtxcount) == 0) + mp_mtxcount = dbenv->mp_mtxcount = htab_buckets; + if (!MUTEX_ON(env) || + F_ISSET(env, ENV_PRIVATE | ENV_THREAD) == ENV_PRIVATE) + mp_mtxcount = dbenv->mp_mtxcount = 0; if (reginfo_off == 0) { ZERO_LSN(mp->lsn); @@ -248,15 +281,10 @@ __memp_init(env, dbmp, reginfo_off, htab_buckets, max_nreg) atomic_init(&htab[i].hash_page_dirty, 0); } - /* - * Allocate all of the hash bucket mutexes up front. We do - * this so that we don't need to free and reallocate mutexes as - * the cache is resized. - */ mtx_base = mtx_prev = MUTEX_INVALID; - if (!MUTEX_ON(env) || F_ISSET(env, ENV_PRIVATE)) + if (F_ISSET(env, ENV_PRIVATE)) goto no_prealloc; - for (i = 0; i < mp->max_nreg * dbenv->mp_mtxcount; i++) { + for (i = 0; i < mp->max_nreg * mp_mtxcount; i++) { if ((ret = __mutex_alloc(env, MTX_MPOOL_HASH_BUCKET, DB_MUTEX_SHARED, &mtx_discard)) != 0) return (ret); @@ -274,13 +302,12 @@ __memp_init(env, dbmp, reginfo_off, htab_buckets, max_nreg) } /* - * We preallocated all of the mutexes in a block, so for regions after - * the first, we skip mutexes in use in earlier regions. Each region - * has the same number of buckets + * If we preallocated all the mutexes, then in regions after the first, + * we skip mutexes in use in earlier regions. Each region has the same + * number of buckets. */ no_prealloc: - if (MUTEX_ON(env)) - mtx_base += reginfo_off * dbenv->mp_mtxcount; + mtx_base += reginfo_off * mp_mtxcount; /* Allocate hash table space and initialize it. */ if ((ret = __env_alloc(infop, @@ -289,18 +316,21 @@ no_prealloc: mp->htab = R_OFFSET(infop, htab); for (i = 0; i < htab_buckets; i++) { hp = &htab[i]; - if (!MUTEX_ON(env) || dbenv->mp_mtxcount == 0) + /* + * Set mtx_hash to do no locking, or share a mutex with an + * earlier hash bucket in this region, or assign it from the + * block of mutexes allocated above, or (in a private + * environment) allocate a new mutex. 
+ */ + if (mp_mtxcount == 0) hp->mtx_hash = MUTEX_INVALID; - else if (F_ISSET(env, ENV_PRIVATE)) { - if (i >= dbenv->mp_mtxcount) - hp->mtx_hash = - htab[i % dbenv->mp_mtxcount].mtx_hash; - else if - ((ret = __mutex_alloc(env, MTX_MPOOL_HASH_BUCKET, - DB_MUTEX_SHARED, &hp->mtx_hash)) != 0) - return (ret); - } else - hp->mtx_hash = mtx_base + (i % dbenv->mp_mtxcount); + else if (i >= mp_mtxcount) + hp->mtx_hash = htab[i % mp_mtxcount].mtx_hash; + else if (!F_ISSET(env, ENV_PRIVATE)) + hp->mtx_hash = mtx_base + i; + else if ((ret = __mutex_alloc(env, MTX_MPOOL_HASH_BUCKET, + DB_MUTEX_SHARED, &hp->mtx_hash)) != 0) + return (ret); SH_TAILQ_INIT(&hp->hash_bucket); atomic_init(&hp->hash_page_dirty, 0); #ifdef HAVE_STATISTICS @@ -311,7 +341,7 @@ no_prealloc: ZERO_LSN(hp->old_reader); } mp->htab_buckets = htab_buckets; - mp->htab_mutexes = dbenv->mp_mtxcount; + mp->htab_mutexes = mp_mtxcount; mp->pagesize = dbenv->mp_pagesize == 0 ? MPOOL_DEFAULT_PAGESIZE : dbenv->mp_pagesize; @@ -443,11 +473,21 @@ __memp_region_mutex_count(env) dbenv = env->dbenv; __memp_region_size(env, ®_size, &htab_buckets); - if (F_ISSET(env->dbenv, DB_ENV_MULTIVERSION)) - pgsize = sizeof(BH_FROZEN_ALLOC) + sizeof(BH_FROZEN_PAGE); - if ((pgsize = dbenv->mp_pagesize) == 0) - pgsize = MPOOL_DEFAULT_PAGESIZE; + if (dbenv->mp_mtxcount != 0) + htab_buckets = dbenv->mp_mtxcount; max_region = __memp_max_regions(env); + if ((pgsize = dbenv->mp_pagesize) == 0) { + /* + * If MVCC is on during environment creation, provide enough + * mutexes so that half the cache can be frozen buffer headers. + */ + if (F_ISSET(env->dbenv, DB_ENV_MULTIVERSION)) + pgsize = (MPOOL_DEFAULT_PAGESIZE + + sizeof(BH_FROZEN_ALLOC) + + sizeof(BH_FROZEN_PAGE)) / 2; + else + pgsize = MPOOL_DEFAULT_PAGESIZE; + } /* * We need a couple of mutexes for the region itself, one for each @@ -456,10 +496,6 @@ __memp_region_mutex_count(env) * hash bucket. 
We then need one mutex per page in the cache, * the worst case is really big if the pages are 512 bytes. */ - if (dbenv->mp_mtxcount != 0) - htab_buckets = dbenv->mp_mtxcount; - else - dbenv->mp_mtxcount = htab_buckets; num_per_cache = htab_buckets + (u_int32_t)(reg_size / pgsize); return ((max_region * num_per_cache) + 50 + MPOOL_FILE_BUCKETS); } @@ -469,23 +505,39 @@ __memp_region_mutex_count(env) * Initialize shared configuration information. */ static int -__memp_init_config(env, mp) +__memp_init_config(env, mp, create) ENV *env; MPOOL *mp; + int create; { DB_ENV *dbenv; dbenv = env->dbenv; MPOOL_SYSTEM_LOCK(env); - if (dbenv->mp_mmapsize != 0) + if (create) { mp->mp_mmapsize = (db_size_t)dbenv->mp_mmapsize; - if (dbenv->mp_maxopenfd != 0) mp->mp_maxopenfd = dbenv->mp_maxopenfd; - if (dbenv->mp_maxwrite != 0) mp->mp_maxwrite = dbenv->mp_maxwrite; - if (dbenv->mp_maxwrite_sleep != 0) mp->mp_maxwrite_sleep = dbenv->mp_maxwrite_sleep; + } else { + if (dbenv->mp_mmapsize != 0 && + mp->mp_mmapsize != (db_size_t)dbenv->mp_mmapsize) + __db_msg(env, DB_STR("3044", +"Warning: Ignoring maximum memory map size when joining environment")); + + if (dbenv->mp_maxopenfd != 0 && + mp->mp_maxopenfd != dbenv->mp_maxopenfd) + __db_msg(env, DB_STR("3045", +"Warning: Ignoring max open file descriptors value when joining environment")); + + if ((dbenv->mp_maxwrite != 0 && + mp->mp_maxwrite != dbenv->mp_maxwrite) || + (dbenv->mp_maxwrite_sleep != 0 && + mp->mp_maxwrite_sleep != dbenv->mp_maxwrite_sleep)) + __db_msg(env, DB_STR("3046", +"Warning: Ignoring maximum sequential writes value when joining environment")); + } MPOOL_SYSTEM_UNLOCK(env); return (0); @@ -501,22 +553,18 @@ int __memp_env_refresh(env) ENV *env; { - BH *bhp; - BH_FROZEN_ALLOC *frozen_alloc; DB_MPOOL *dbmp; DB_MPOOLFILE *dbmfp; - DB_MPOOL_HASH *hp; DB_MPREG *mpreg; MPOOL *mp, *c_mp; REGINFO *infop; - u_int32_t bucket, i, nreg; + u_int32_t i, nreg; int ret, t_ret; ret = 0; dbmp = env->mp_handle; mp = 
dbmp->reginfo[0].primary; nreg = mp->nreg; - hp = R_ADDR(&dbmp->reginfo[0], mp->htab); /* * If a private region, return the memory to the heap. Not needed for @@ -526,49 +574,20 @@ __memp_env_refresh(env) if (!F_ISSET(env, ENV_PRIVATE)) goto not_priv; - /* Discard buffers. */ for (i = 0; i < nreg; ++i) { infop = &dbmp->reginfo[i]; - c_mp = infop->primary; - for (hp = R_ADDR(infop, c_mp->htab), bucket = 0; - bucket < c_mp->htab_buckets; ++hp, ++bucket) { - while ((bhp = SH_TAILQ_FIRST( - &hp->hash_bucket, __bh)) != NULL) - if (F_ISSET(bhp, BH_FROZEN)) - SH_TAILQ_REMOVE( - &hp->hash_bucket, bhp, - hq, __bh); - else { - if (F_ISSET(bhp, BH_DIRTY)) { - atomic_dec(env, - &hp->hash_page_dirty); - F_CLR(bhp, - BH_DIRTY | BH_DIRTY_CREATE); - } - atomic_inc(env, &bhp->ref); - if ((t_ret = __memp_bhfree(dbmp, infop, - R_ADDR(dbmp->reginfo, - bhp->mf_offset), hp, bhp, - BH_FREE_FREEMEM | - BH_FREE_UNLOCKED)) != 0 && ret == 0) - ret = t_ret; - } - } - MPOOL_REGION_LOCK(env, infop); - while ((frozen_alloc = SH_TAILQ_FIRST( - &c_mp->alloc_frozen, __bh_frozen_a)) != NULL) { - SH_TAILQ_REMOVE(&c_mp->alloc_frozen, frozen_alloc, - links, __bh_frozen_a); - __env_alloc_free(infop, frozen_alloc); - } - MPOOL_REGION_UNLOCK(env, infop); + if ((t_ret = __memp_region_bhfree(infop)) != 0 && ret == 0) + ret = t_ret; } not_priv: /* Discard DB_MPOOLFILEs. */ while ((dbmfp = TAILQ_FIRST(&dbmp->dbmfq)) != NULL) - if ((t_ret = __memp_fclose(dbmfp, DB_FLUSH)) != 0 && ret == 0) - ret = t_ret; + if ((t_ret = __memp_fclose(dbmfp, DB_FLUSH)) != 0) { + if (ret == 0) + ret = t_ret; + break; + } /* Discard DB_MPREGs. */ if (dbmp->pg_inout != NULL) @@ -618,3 +637,62 @@ not_priv: env->mp_handle = NULL; return (ret); } + +/* + * __memp_region_bhfree -- + * Discard the buffers for a region. 
+ * + * PUBLIC: int __memp_region_bhfree __P((REGINFO *)); + */ +int +__memp_region_bhfree(infop) + REGINFO *infop; +{ + BH *bhp; + BH_FROZEN_ALLOC *frozen_alloc; + DB_MPOOL *dbmp; + DB_MPOOL_HASH *hp; + ENV *env; + MPOOL *c_mp; + u_int32_t bucket; + int ret, t_ret; + + env = infop->env; + dbmp = env->mp_handle; + ret = 0; + + /* Discard buffers. */ + c_mp = infop->primary; + for (hp = R_ADDR(infop, c_mp->htab), bucket = 0; + bucket < c_mp->htab_buckets; ++hp, ++bucket) { + while ((bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)) != NULL) + if (F_ISSET(bhp, BH_FROZEN)) + SH_TAILQ_REMOVE(&hp->hash_bucket, + bhp, hq, __bh); + else { + if (F_ISSET(bhp, BH_DIRTY)) { + atomic_dec(env, &hp->hash_page_dirty); + F_CLR(bhp, BH_DIRTY | BH_DIRTY_CREATE); + } + atomic_inc(env, &bhp->ref); + if ((t_ret = __memp_bhfree(dbmp, infop, + R_ADDR(dbmp->reginfo, bhp->mf_offset), + hp, bhp, BH_FREE_FREEMEM | + BH_FREE_UNLOCKED)) != 0) { + if (ret == 0) + ret = t_ret; + break; + } + } + } + MPOOL_REGION_LOCK(env, infop); + while ((frozen_alloc = SH_TAILQ_FIRST( + &c_mp->alloc_frozen, __bh_frozen_a)) != NULL) { + SH_TAILQ_REMOVE(&c_mp->alloc_frozen, + frozen_alloc, links, __bh_frozen_a); + __env_alloc_free(infop, frozen_alloc); + } + MPOOL_REGION_UNLOCK(env, infop); + + return (ret); +} diff --git a/src/mp/mp_register.c b/src/mp/mp_register.c index dc7015a7..cc59af9c 100644 --- a/src/mp/mp_register.c +++ b/src/mp/mp_register.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ diff --git a/src/mp/mp_resize.c b/src/mp/mp_resize.c index 97719554..932a1baa 100644 --- a/src/mp/mp_resize.c +++ b/src/mp/mp_resize.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2006, 2012 Oracle and/or its affiliates. All rights reserved. 
+ * Copyright (c) 2006, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -126,12 +126,13 @@ __memp_merge_buckets(dbmp, new_nbuckets, old_bucket, new_bucket) MPOOLFILE *mfp; REGINFO *new_infop, *old_infop; u_int32_t bucket, high_mask, new_region, old_region; - int ret; + int expanding, ret; env = dbmp->env; mp = dbmp->reginfo[0].primary; new_bhp = NULL; ret = 0; + expanding = (mp->nbuckets > new_nbuckets) ? 0 : 1; MP_MASK(new_nbuckets, high_mask); @@ -150,36 +151,42 @@ __memp_merge_buckets(dbmp, new_nbuckets, old_bucket, new_bucket) /* * Before merging, we need to check that there are no old buffers left * in the target hash bucket after a previous split. + * Only free the buffers if we are expanding into new buckets. If + * we are contracting, the buffers in the original (old) bucket should + * not be freed. */ free_old: - MUTEX_LOCK(env, new_hp->mtx_hash); - SH_TAILQ_FOREACH(bhp, &new_hp->hash_bucket, hq, __bh) { - MP_BUCKET(bhp->mf_offset, bhp->pgno, mp->nbuckets, bucket); + if (expanding != 0) { + MUTEX_LOCK(env, new_hp->mtx_hash); + SH_TAILQ_FOREACH(bhp, &new_hp->hash_bucket, hq, __bh) { + MP_BUCKET( + bhp->mf_offset, bhp->pgno, mp->nbuckets, bucket); + + if (bucket != new_bucket) { + /* + * There is no way that an old buffer can be + * locked after a split, since everyone will + * look for it in the new hash bucket. + */ + DB_ASSERT(env, !F_ISSET(bhp, BH_DIRTY) && + atomic_read(&bhp->ref) == 0); + atomic_inc(env, &bhp->ref); + mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset); + if ((ret = __memp_bhfree(dbmp, new_infop, + mfp, new_hp, bhp, BH_FREE_FREEMEM)) != 0) { + MUTEX_UNLOCK(env, new_hp->mtx_hash); + return (ret); + } - if (bucket != new_bucket) { - /* - * There is no way that an old buffer can be locked - * after a split, since everyone will look for it in - * the new hash bucket. 
- */ - DB_ASSERT(env, !F_ISSET(bhp, BH_DIRTY) && - atomic_read(&bhp->ref) == 0); - atomic_inc(env, &bhp->ref); - mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset); - if ((ret = __memp_bhfree(dbmp, new_infop, - mfp, new_hp, bhp, BH_FREE_FREEMEM)) != 0) { - MUTEX_UNLOCK(env, new_hp->mtx_hash); - return (ret); + /* + * The free has modified the list of buffers and + * dropped the mutex. We need to start again. + */ + goto free_old; } - - /* - * The free has modified the list of buffers and - * dropped the mutex. We need to start again. - */ - goto free_old; } + MUTEX_UNLOCK(env, new_hp->mtx_hash); } - MUTEX_UNLOCK(env, new_hp->mtx_hash); /* * Before we begin, make sure that all of the buffers we care about are @@ -305,7 +312,9 @@ err: atomic_dec(env, &bhp->ref); next_bhp, alloc_bhp, vc, __bh); } - DB_ASSERT(env, new_hp->mtx_hash != old_hp->mtx_hash); + /* The mutexes must be different, unless they aren't in use. */ + DB_ASSERT(env, new_hp->mtx_hash != old_hp->mtx_hash || + new_hp->mtx_hash == MUTEX_INVALID); MUTEX_LOCK(env, new_hp->mtx_hash); SH_TAILQ_INSERT_TAIL(&new_hp->hash_bucket, new_bhp, hq); if (F_ISSET(new_bhp, BH_DIRTY)) @@ -362,16 +371,15 @@ __memp_add_region(dbmp) MPOOL *mp; REGINFO *infop; int ret; - roff_t cache_size, reg_size; + roff_t reg_size; u_int i; u_int32_t *regids; env = dbmp->env; mp = dbmp->reginfo[0].primary; - cache_size = (roff_t)mp->gbytes * GIGABYTE + mp->bytes; /* All cache regions are the same size. 
*/ - reg_size = dbmp->reginfo[0].rp->size; + reg_size = dbmp->reginfo[0].rp->max; ret = 0; infop = &dbmp->reginfo[mp->nreg]; @@ -384,9 +392,6 @@ __memp_add_region(dbmp) if ((ret = __memp_init(env, dbmp, mp->nreg, mp->htab_buckets, mp->max_nreg)) != 0) return (ret); - cache_size += reg_size; - mp->gbytes = (u_int32_t)(cache_size / GIGABYTE); - mp->bytes = (u_int32_t)(cache_size % GIGABYTE); regids = R_ADDR(dbmp->reginfo, mp->regids); regids[mp->nreg++] = infop->id; @@ -425,16 +430,13 @@ __memp_remove_region(dbmp) { DB_MPOOL_HASH *hp; ENV *env; - MPOOL *mp; + MPOOL *mp, *c_mp; REGINFO *infop; int ret; - roff_t cache_size, reg_size; u_int i; env = dbmp->env; mp = dbmp->reginfo[0].primary; - reg_size = dbmp->reginfo[0].rp->size; - cache_size = (roff_t)mp->gbytes * GIGABYTE + mp->bytes; ret = 0; if (mp->nreg == 1) { @@ -448,21 +450,36 @@ __memp_remove_region(dbmp) return (ret); /* Detach from the region then destroy it. */ - infop = &dbmp->reginfo[mp->nreg]; + infop = &dbmp->reginfo[mp->nreg - 1]; + c_mp = infop->primary; + hp = R_ADDR(infop, c_mp->htab); + /* + * For private enviroment, we need to free everything, and + * for non-private environment, we need to refresh the mutexes + * so that they can be in a ready state for later resize. 
+ */ if (F_ISSET(env, ENV_PRIVATE)) { - hp = R_ADDR(infop, ((MPOOL*)infop->primary)->htab); - for (i = 0; i < env->dbenv->mp_mtxcount; i++) - if ((ret = __mutex_free(env, &hp[i].mtx_hash)) != 0) + if ((ret = __memp_region_bhfree(infop)) != 0) + return (ret); + if (MUTEX_ON(env)) { + DB_ASSERT(env, + env->dbenv->mp_mtxcount == mp->htab_mutexes); + for (i = 0; i < mp->htab_mutexes; i++) + if ((ret = __mutex_free(env, + &hp[i].mtx_hash)) != 0) + return (ret); + } + __env_alloc_free(infop, hp); + } else if (MUTEX_ON(env)) { + DB_ASSERT(env, env->dbenv->mp_mtxcount == mp->htab_mutexes); + for (i = 0; i < mp->htab_mutexes; i++) + if ((ret = __mutex_refresh(env, hp[i].mtx_hash)) != 0) return (ret); } ret = __env_region_detach(env, infop, 1); - if (ret == 0) { + if (ret == 0) mp->nreg--; - cache_size -= reg_size; - mp->gbytes = (u_int32_t)(cache_size / GIGABYTE); - mp->bytes = (u_int32_t)(cache_size % GIGABYTE); - } return (ret); } @@ -511,6 +528,9 @@ __memp_map_regions(dbmp) } /* + * __memp_resize -- + * Change the overall cache size by adding or removing cache regions. 
+ * * PUBLIC: int __memp_resize __P((DB_MPOOL *, u_int32_t, u_int32_t)); */ int @@ -526,7 +546,7 @@ __memp_resize(dbmp, gbytes, bytes) env = dbmp->env; mp = dbmp->reginfo[0].primary; - reg_size = dbmp->reginfo[0].rp->size; + reg_size = dbmp->reginfo[0].rp->max; total_size = (roff_t)gbytes * GIGABYTE + bytes; ncache = (u_int32_t)((total_size + reg_size / 2) / reg_size); @@ -546,6 +566,9 @@ __memp_resize(dbmp, gbytes, bytes) __memp_add_region(dbmp) : __memp_remove_region(dbmp))) != 0) break; + total_size = reg_size * (roff_t)mp->nreg; + mp->gbytes = (u_int32_t)(total_size / GIGABYTE); + mp->bytes = (u_int32_t)(total_size % GIGABYTE); MUTEX_UNLOCK(env, mp->mtx_resize); return (ret); @@ -567,13 +590,13 @@ __memp_get_cache_max(dbenv, max_gbytesp, max_bytesp) env = dbenv->env; ENV_NOT_CONFIGURED(env, - env->mp_handle, "DB_ENV->get_mp_max_ncache", DB_INIT_MPOOL); + env->mp_handle, "DB_ENV->get_cache_max", DB_INIT_MPOOL); if (MPOOL_ON(env)) { /* Cannot be set after open, no lock required to read. */ dbmp = env->mp_handle; mp = dbmp->reginfo[0].primary; - reg_size = dbmp->reginfo[0].rp->size; + reg_size = dbmp->reginfo[0].rp->max; max_size = mp->max_nreg * reg_size; *max_gbytesp = (u_int32_t)(max_size / GIGABYTE); *max_bytesp = (u_int32_t)(max_size % GIGABYTE); diff --git a/src/mp/mp_stat.c b/src/mp/mp_stat.c index 246b44d7..81ea35c1 100644 --- a/src/mp/mp_stat.c +++ b/src/mp/mp_stat.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -133,7 +133,14 @@ __memp_stat(env, gspp, fspp, flags) sp->st_ro_evict += c_mp->stat.st_ro_evict; sp->st_rw_evict += c_mp->stat.st_rw_evict; sp->st_page_trickle += c_mp->stat.st_page_trickle; + sp->st_mvcc_reused += c_mp->stat.st_mvcc_reused; sp->st_pages += c_mp->pages; + /* Undocumented field used by tests only. 
*/ + sp->st_oddfsize_detect += + c_mp->stat.st_oddfsize_detect; + /* Undocumented field used by tests only. */ + sp->st_oddfsize_resolve += + c_mp->stat.st_oddfsize_resolve; /* * st_page_dirty calculated by __memp_stat_hash * st_page_clean calculated here @@ -195,7 +202,12 @@ __memp_stat(env, gspp, fspp, flags) /* Count the MPOOLFILE structures. */ i = 0; - len = 0; + /* + * Allow space for the first __memp_get_files() to align the + * structure array to uintmax_t, DB_MPOOL_STAT's most + * restrictive field. [#23150] + */ + len = sizeof(uintmax_t); if ((ret = __memp_walk_files(env, mp, __memp_count_files, &len, &i, flags)) != 0) return (ret); @@ -252,6 +264,11 @@ __memp_file_stats(env, mfp, argp, countp, flags) return (0); } +/* + * __memp_count_files -- + * This __memp_walk_files() iterator counts the number of files as well as + * the space needed for their statistics, including file names. + */ static int __memp_count_files(env, mfp, argp, countp, flags) ENV *env; @@ -277,13 +294,25 @@ __memp_count_files(env, mfp, argp, countp, flags) /* * __memp_get_files -- - * get file specific statistics + * get another file's specific statistics * - * Build each individual entry. We assume that an array of pointers are - * aligned correctly to be followed by an array of structures, which should - * be safe (in this particular case, the first element of the structure - * is a pointer, so we're doubly safe). The array is followed by space - * for the text file names. + * Add a file statistics entry to the current list. The chunk of memory + * starts with an array of DB_MPOOL_FSTAT pointers, a null pointer to mark + * the last one, then an aligned array of DB_MPOOL_FSTAT structures, then + * characters space for the file names. 
+ * +-----------------------------------------------+ + * | count * DB_MPOOL_FSTAT pointers | + * +-----------------------------------------------+ + * | null pointer + + * +-----------------------------------------------| + * | [space for aligning DB_MPOOL_FSTAT array] | + * +-----------------------------------------------+ + * | count * DB_MPOOL_FSTAT structs | + * +-----------------------------------------------+ + * | first file name | second file name | third... | + * +-----------------------------------------------+ + * | file name | ... | + * +-----------------------------------------------+ */ static int __memp_get_files(env, mfp, argp, countp, flags) @@ -305,11 +334,21 @@ __memp_get_files(env, mfp, argp, countp, flags) tfsp = *(DB_MPOOL_FSTAT ***)argp; if (*tfsp == NULL) { - /* Add 1 to count because we need to skip over the NULL. */ - tstruct = (DB_MPOOL_FSTAT *)(tfsp + *countp + 1); - tname = (char *)(tstruct + *countp); + /* + * Add 1 to count because to skip over the NULL end marker. + * Align it further for DB_MPOOL_STAT's most restrictive field + * because uintmax_t might require stricter alignment than + * pointers; e.g., IP32 LL64 SPARC. [#23150] + */ + tstruct = (DB_MPOOL_FSTAT *)&tfsp[*countp + 1]; + tstruct = ALIGNP_INC(tstruct, sizeof(uintmax_t)); + tname = (char *)&tstruct[*countp]; *tfsp = tstruct; } else { + /* + * This stat struct follows the previous one; the file name + * follows the previous entry's filename. 
+ */ tstruct = *tfsp + 1; tname = (*tfsp)->file_name + strlen((*tfsp)->file_name) + 1; *++tfsp = tstruct; @@ -486,6 +525,8 @@ __memp_print_stats(env, flags) (u_long)gsp->st_mvcc_thawed); __db_dl(env, "The number of frozen buffers freed", (u_long)gsp->st_mvcc_freed); + __db_dl(env, "The number of outdated intermediate versions reused", + (u_long)gsp->st_mvcc_reused); __db_dl(env, "The number of page allocations", (u_long)gsp->st_alloc); __db_dl(env, "The number of hash buckets examined during allocations", @@ -744,11 +785,18 @@ __memp_print_hash(env, dbmp, reginfo, fmap, flags) vbhp != NULL; vbhp = SH_CHAIN_PREV(vbhp, vc, __bh)) { __memp_print_bh(env, dbmp, - " next:\t", vbhp, fmap); + " prev:\t", vbhp, fmap); } } MUTEX_UNLOCK(env, hp->mtx_hash); } +#ifdef DIAGNOSTIC + SH_TAILQ_FOREACH(bhp, &c_mp->free_frozen, hq, __bh) { + __db_msg(env, "free frozen %lu pgno %lu mtx_buf %lu", + (u_long)R_OFFSET(dbmp->reginfo, bhp), + (u_long)bhp->pgno, (u_long)bhp->mtx_buf); + } +#endif return (0); } @@ -775,6 +823,7 @@ __memp_print_bh(env, dbmp, prefix, bhp, fmap) { BH_FROZEN, "frozen" }, { BH_TRASH, "trash" }, { BH_THAWED, "thawed" }, + { BH_UNREACHABLE, "unreachable" }, { 0, NULL } }; DB_MSGBUF mb; diff --git a/src/mp/mp_sync.c b/src/mp/mp_sync.c index fa06b1d4..82d5c8de 100644 --- a/src/mp/mp_sync.c +++ b/src/mp/mp_sync.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. 
* * $Id$ */ @@ -95,9 +95,11 @@ __memp_discard_all_mpfs (env, mp) while ((mfp = SH_TAILQ_FIRST( &hp->hash_bucket, __mpoolfile)) != NULL) { MUTEX_LOCK(env, mfp->mutex); - if ((t_ret = __memp_mf_discard(dbmp, mfp, 1)) != 0 && - ret == 0) - ret = t_ret; + if ((t_ret = __memp_mf_discard(dbmp, mfp, 1)) != 0) { + if (ret == 0) + ret = t_ret; + break; + } } MUTEX_UNLOCK(env, hp->mtx_hash); } @@ -837,6 +839,7 @@ __memp_mf_sync(dbmp, mfp, locked) MPOOLFILE *mfp; int locked; { + APPNAME appname; DB_FH *fhp; DB_MPOOL_HASH *hp; ENV *env; @@ -846,6 +849,7 @@ __memp_mf_sync(dbmp, mfp, locked) COMPQUIET(hp, NULL); env = dbmp->env; + appname = DB_APP_DATA; /* * We need to be holding the hash lock: we're using the path name @@ -859,13 +863,20 @@ __memp_mf_sync(dbmp, mfp, locked) MUTEX_LOCK(env, hp->mtx_hash); } - if ((ret = __db_appname(env, DB_APP_DATA, +mpsync: if ((ret = __db_appname(env, appname, R_ADDR(dbmp->reginfo, mfp->path_off), NULL, &rpath)) == 0) { if ((ret = __os_open(env, rpath, 0, 0, 0, &fhp)) == 0) { ret = __os_fsync(env, fhp); if ((t_ret = __os_closehandle(env, fhp)) != 0 && ret == 0) ret = t_ret; + } else { + /* We may be syncing the blob meta db. */ + if (appname != DB_APP_BLOB) { + __os_free(env, rpath); + appname = DB_APP_BLOB; + goto mpsync; + } } __os_free(env, rpath); } diff --git a/src/mp/mp_trickle.c b/src/mp/mp_trickle.c index fba528b3..ff8cb875 100644 --- a/src/mp/mp_trickle.c +++ b/src/mp/mp_trickle.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ |