summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Michael Cahill <michael.cahill@mongodb.com> 2016-11-15 17:02:07 +1100
committer: Michael Cahill <michael.cahill@mongodb.com> 2016-11-15 17:02:07 +1100
commit: 85ec028f36c0a48bb4e05fc1841b25c56c0f173b (patch)
tree: e38d422594801c8f22ba20f097c6672e214b3af3
parent: 038fc3672336622aee5f5957fcf6d1eeff3426e3 (diff)
parent: e11d885f11fc7c47f1a9160087f738da80567ad2 (diff)
download: mongo-85ec028f36c0a48bb4e05fc1841b25c56c0f173b.tar.gz
Merge branch 'develop' into mongodb-3.4
-rw-r--r-- src/btree/bt_handle.c | 4
-rw-r--r-- src/conn/conn_dhandle.c | 4
-rw-r--r-- src/conn/conn_open.c | 3
-rw-r--r-- src/conn/conn_sweep.c | 6
-rw-r--r-- src/evict/evict_lru.c | 2
-rw-r--r-- src/include/btree.i | 22
-rw-r--r-- src/include/meta.h | 3
-rw-r--r-- src/include/txn.h | 11
-rw-r--r-- src/include/txn.i | 9
-rw-r--r-- src/session/session_dhandle.c | 4
-rw-r--r-- src/txn/txn.c | 65
-rw-r--r-- src/txn/txn_ckpt.c | 8
12 files changed, 85 insertions, 56 deletions
diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c
index 337a3ea036f..47c7972dd57 100644
--- a/src/btree/bt_handle.c
+++ b/src/btree/bt_handle.c
@@ -341,7 +341,7 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt)
* always inherit from the connection.
*/
WT_RET(__wt_config_gets(session, cfg, "encryption.name", &cval));
- if (WT_IS_METADATA(session, btree->dhandle) || cval.len == 0)
+ if (WT_IS_METADATA(btree->dhandle) || cval.len == 0)
btree->kencryptor = conn->kencryptor;
else if (WT_STRING_MATCH("none", cval.str, cval.len))
btree->kencryptor = NULL;
@@ -432,7 +432,7 @@ __wt_btree_tree_open(
* Failure to open metadata means that the database is unavailable.
* Try to provide a helpful failure message.
*/
- if (ret != 0 && WT_IS_METADATA(session, session->dhandle)) {
+ if (ret != 0 && WT_IS_METADATA(session->dhandle)) {
__wt_errx(session,
"WiredTiger has failed to open its metadata");
__wt_errx(session, "This may be due to the database"
diff --git a/src/conn/conn_dhandle.c b/src/conn/conn_dhandle.c
index ac72e330b67..e9e3925c57e 100644
--- a/src/conn/conn_dhandle.c
+++ b/src/conn/conn_dhandle.c
@@ -447,7 +447,7 @@ __wt_conn_btree_apply(WT_SESSION_IMPL *session, const char *uri,
F_ISSET(dhandle, WT_DHANDLE_DEAD) ||
dhandle->checkpoint != NULL ||
!WT_PREFIX_MATCH(dhandle->name, "file:") ||
- WT_IS_METADATA(session, dhandle))
+ WT_IS_METADATA(dhandle))
continue;
WT_RET(__conn_btree_apply_internal(
session, dhandle, file_func, name_func, cfg));
@@ -627,7 +627,7 @@ __wt_conn_dhandle_discard(WT_SESSION_IMPL *session)
*/
restart:
TAILQ_FOREACH(dhandle, &conn->dhqh, q) {
- if (WT_IS_METADATA(session, dhandle))
+ if (WT_IS_METADATA(dhandle))
continue;
WT_WITH_DHANDLE(session, dhandle,
diff --git a/src/conn/conn_open.c b/src/conn/conn_open.c
index 69b50147bf5..6454503d6cb 100644
--- a/src/conn/conn_open.c
+++ b/src/conn/conn_open.c
@@ -95,7 +95,8 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn)
for (;;) {
WT_TRET(__wt_txn_update_oldest(session,
WT_TXN_OLDEST_STRICT | WT_TXN_OLDEST_WAIT));
- if (txn_global->oldest_id == txn_global->current)
+ if (txn_global->oldest_id == txn_global->current &&
+ txn_global->metadata_pinned == txn_global->current)
break;
__wt_yield();
}
diff --git a/src/conn/conn_sweep.c b/src/conn/conn_sweep.c
index dba37fa2eb0..d1254d8afcc 100644
--- a/src/conn/conn_sweep.c
+++ b/src/conn/conn_sweep.c
@@ -26,7 +26,7 @@ __sweep_mark(WT_SESSION_IMPL *session, time_t now)
conn = S2C(session);
TAILQ_FOREACH(dhandle, &conn->dhqh, q) {
- if (WT_IS_METADATA(session, dhandle))
+ if (WT_IS_METADATA(dhandle))
continue;
/*
@@ -122,7 +122,7 @@ __sweep_expire(WT_SESSION_IMPL *session, time_t now)
if (conn->open_btree_count < conn->sweep_handles_min)
break;
- if (WT_IS_METADATA(session, dhandle) ||
+ if (WT_IS_METADATA(dhandle) ||
!F_ISSET(dhandle, WT_DHANDLE_OPEN) ||
dhandle->session_inuse != 0 ||
dhandle->timeofdeath == 0 ||
@@ -228,7 +228,7 @@ __sweep_remove_handles(WT_SESSION_IMPL *session)
dhandle != NULL;
dhandle = dhandle_next) {
dhandle_next = TAILQ_NEXT(dhandle, q);
- if (WT_IS_METADATA(session, dhandle))
+ if (WT_IS_METADATA(dhandle))
continue;
if (!WT_DHANDLE_CAN_DISCARD(dhandle))
continue;
diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c
index 909fe813f98..dfa3fae48d9 100644
--- a/src/evict/evict_lru.c
+++ b/src/evict/evict_lru.c
@@ -1449,7 +1449,7 @@ __evict_walk_file(WT_SESSION_IMPL *session, WT_EVICT_QUEUE *queue,
/* Limit internal pages to 50% of the total. */
if (WT_PAGE_IS_INTERNAL(page) &&
- internal_pages >= (int)(evict - start) / 2)
+ internal_pages > (int)(evict - start) / 2)
continue;
/* If eviction gets aggressive, anything else is fair game. */
diff --git a/src/include/btree.i b/src/include/btree.i
index 9c8ec1dbdea..8f44bc4eddd 100644
--- a/src/include/btree.i
+++ b/src/include/btree.i
@@ -1562,7 +1562,7 @@ __wt_btree_lsm_switch_primary(WT_SESSION_IMPL *session, bool on)
cache = S2C(session)->cache;
root = btree->root.page;
- if (on && !F_ISSET(btree, WT_BTREE_LSM_PRIMARY))
+ if (!F_ISSET(btree, WT_BTREE_LSM_PRIMARY))
F_SET(btree, WT_BTREE_LSM_PRIMARY | WT_BTREE_NO_EVICTION);
if (!on && F_ISSET(btree, WT_BTREE_LSM_PRIMARY)) {
pindex = WT_INTL_INDEX_GET_SAFE(root);
@@ -1575,21 +1575,19 @@ __wt_btree_lsm_switch_primary(WT_SESSION_IMPL *session, bool on)
* We're reaching down into the page without a hazard pointer,
* but that's OK because we know that no-eviction is set so the
* page can't disappear.
- */
- child = first->page;
- if (first->state != WT_REF_MEM ||
- child->type != WT_PAGE_ROW_LEAF ||
- !__wt_page_is_modified(child))
- return;
-
- /*
+ *
* While this tree was the primary, its dirty bytes were not
* included in the cache accounting. Fix that now before we
* open it up for eviction.
*/
- size = child->modify->bytes_dirty;
- (void)__wt_atomic_add64(&btree->bytes_dirty_leaf, size);
- (void)__wt_atomic_add64(&cache->bytes_dirty_leaf, size);
+ child = first->page;
+ if (first->state == WT_REF_MEM &&
+ child->type == WT_PAGE_ROW_LEAF &&
+ __wt_page_is_modified(child)) {
+ size = child->modify->bytes_dirty;
+ (void)__wt_atomic_add64(&btree->bytes_dirty_leaf, size);
+ (void)__wt_atomic_add64(&cache->bytes_dirty_leaf, size);
+ }
F_CLR(btree, WT_BTREE_LSM_PRIMARY | WT_BTREE_NO_EVICTION);
}
diff --git a/src/include/meta.h b/src/include/meta.h
index 6d4a167a8e5..74df3c57ce4 100644
--- a/src/include/meta.h
+++ b/src/include/meta.h
@@ -32,8 +32,7 @@
* Optimize comparisons against the metafile URI, flag handles that reference
* the metadata file.
*/
-#define WT_IS_METADATA(session, dh) \
- F_ISSET((dh), WT_DHANDLE_IS_METADATA)
+#define WT_IS_METADATA(dh) F_ISSET((dh), WT_DHANDLE_IS_METADATA)
#define WT_METAFILE_ID 0 /* Metadata file ID */
#define WT_METADATA_VERSION "WiredTiger version" /* Version keys */
diff --git a/src/include/txn.h b/src/include/txn.h
index 8128e8e4cc2..774f635d7ba 100644
--- a/src/include/txn.h
+++ b/src/include/txn.h
@@ -67,6 +67,7 @@ struct __wt_named_snapshot {
struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_txn_state {
volatile uint64_t id;
volatile uint64_t pinned_id;
+ volatile uint64_t metadata_pinned;
};
struct __wt_txn_global {
@@ -94,12 +95,18 @@ struct __wt_txn_global {
* for a long time so we keep them out of regular visibility checks.
* Eviction and checkpoint operations know when they need to be aware
* of checkpoint transactions.
+ *
+ * We rely on the fact that (a) the only table a checkpoint updates is
+ * the metadata; and (b) once checkpoint has finished reading a table,
+ * it won't revisit it.
*/
volatile uint32_t checkpoint_id; /* Checkpoint's session ID */
- volatile uint64_t checkpoint_gen;
- volatile uint64_t checkpoint_pinned;
+ volatile uint64_t checkpoint_gen; /* Checkpoint generation */
+ volatile uint64_t checkpoint_pinned; /* Oldest ID for checkpoint */
volatile uint64_t checkpoint_txnid; /* Checkpoint's txn ID */
+ volatile uint64_t metadata_pinned; /* Oldest ID for metadata */
+
/* Named snapshot state. */
WT_RWLOCK *nsnap_rwlock;
volatile uint64_t nsnap_oldest_id;
diff --git a/src/include/txn.i b/src/include/txn.i
index 0f75b2e90e1..3e8dd45003e 100644
--- a/src/include/txn.i
+++ b/src/include/txn.i
@@ -112,6 +112,13 @@ __wt_txn_oldest_id(WT_SESSION_IMPL *session)
btree = S2BT_SAFE(session);
/*
+ * The metadata is tracked specially because of optimizations for
+ * checkpoints.
+ */
+ if (session->dhandle != NULL && WT_IS_METADATA(session->dhandle))
+ return (txn_global->metadata_pinned);
+
+ /*
* Take a local copy of these IDs in case they are updated while we are
* checking visibility. The read of the transaction ID pinned by a
* checkpoint needs to be carefully ordered: if a checkpoint is
@@ -482,6 +489,8 @@ __wt_txn_cursor_op(WT_SESSION_IMPL *session)
if (txn->isolation == WT_ISO_READ_UNCOMMITTED) {
if (txn_state->pinned_id == WT_TXN_NONE)
txn_state->pinned_id = txn_global->last_running;
+ if (txn_state->metadata_pinned == WT_TXN_NONE)
+ txn_state->metadata_pinned = txn_state->pinned_id;
} else if (!F_ISSET(txn, WT_TXN_HAS_SNAPSHOT))
__wt_txn_get_snapshot(session);
}
diff --git a/src/session/session_dhandle.c b/src/session/session_dhandle.c
index 725854c6001..94326aebe46 100644
--- a/src/session/session_dhandle.c
+++ b/src/session/session_dhandle.c
@@ -68,7 +68,7 @@ __session_find_dhandle(WT_SESSION_IMPL *session,
retry: TAILQ_FOREACH(dhandle_cache, &session->dhhash[bucket], hashq) {
dhandle = dhandle_cache->dhandle;
if (WT_DHANDLE_INACTIVE(dhandle) &&
- !WT_IS_METADATA(session, dhandle)) {
+ !WT_IS_METADATA(dhandle)) {
__session_discard_dhandle(session, dhandle_cache);
/* We deleted our entry, retry from the start. */
goto retry;
@@ -401,7 +401,7 @@ __session_dhandle_sweep(WT_SESSION_IMPL *session)
difftime(now, dhandle->timeofdeath) >
conn->sweep_idle_time))) {
WT_STAT_CONN_INCR(session, dh_session_handles);
- WT_ASSERT(session, !WT_IS_METADATA(session, dhandle));
+ WT_ASSERT(session, !WT_IS_METADATA(dhandle));
__session_discard_dhandle(session, dhandle_cache);
}
dhandle_cache = dhandle_cache_next;
diff --git a/src/txn/txn.c b/src/txn/txn.c
index 3dba51f5220..a70551cdeb2 100644
--- a/src/txn/txn.c
+++ b/src/txn/txn.c
@@ -100,7 +100,7 @@ __wt_txn_release_snapshot(WT_SESSION_IMPL *session)
session->txn.isolation == WT_ISO_READ_UNCOMMITTED ||
!__wt_txn_visible_all(session, txn_state->pinned_id));
- txn_state->pinned_id = WT_TXN_NONE;
+ txn_state->metadata_pinned = txn_state->pinned_id = WT_TXN_NONE;
F_CLR(txn, WT_TXN_HAS_SNAPSHOT);
}
@@ -137,8 +137,10 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session)
* metadata. We don't have to keep the checkpoint's changes pinned so
* don't include it in the published pinned ID.
*/
- if ((id = txn_global->checkpoint_txnid) != WT_TXN_NONE)
+ if ((id = txn_global->checkpoint_txnid) != WT_TXN_NONE) {
txn->snapshot[n++] = id;
+ txn_state->metadata_pinned = id;
+ }
/* For pure read-only workloads, avoid scanning. */
if (prev_oldest_id == current_id) {
@@ -188,14 +190,14 @@ done: __wt_readunlock(session, txn_global->scan_rwlock);
*/
static void
__txn_oldest_scan(WT_SESSION_IMPL *session,
- uint64_t *oldest_idp, uint64_t *last_runningp,
+ uint64_t *oldest_idp, uint64_t *last_runningp, uint64_t *metadata_pinnedp,
WT_SESSION_IMPL **oldest_sessionp)
{
WT_CONNECTION_IMPL *conn;
WT_SESSION_IMPL *oldest_session;
WT_TXN_GLOBAL *txn_global;
WT_TXN_STATE *s;
- uint64_t id, last_running, oldest_id, prev_oldest_id;
+ uint64_t id, last_running, metadata_pinned, oldest_id, prev_oldest_id;
uint32_t i, session_cnt;
conn = S2C(session);
@@ -204,24 +206,24 @@ __txn_oldest_scan(WT_SESSION_IMPL *session,
/* The oldest ID cannot change while we are holding the scan lock. */
prev_oldest_id = txn_global->oldest_id;
- oldest_id = last_running = txn_global->current;
+ last_running = oldest_id = txn_global->current;
+ if ((metadata_pinned = txn_global->checkpoint_txnid) == WT_TXN_NONE)
+ metadata_pinned = oldest_id;
/* Walk the array of concurrent transactions. */
WT_ORDERED_READ(session_cnt, conn->session_cnt);
for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) {
- /*
- * Update the oldest ID.
- *
- * Ignore: IDs older than the oldest ID we saw. This can happen
- * if we race with a thread that is allocating an ID -- the ID
- * will not be used because the thread will keep spinning until
- * it gets a valid one.
- */
+ /* Update the last running transaction ID. */
if ((id = s->id) != WT_TXN_NONE &&
WT_TXNID_LE(prev_oldest_id, id) &&
WT_TXNID_LT(id, last_running))
last_running = id;
+ /* Update the metadata pinned ID. */
+ if ((id = s->metadata_pinned) != WT_TXN_NONE &&
+ WT_TXNID_LT(id, metadata_pinned))
+ metadata_pinned = id;
+
/*
* !!!
* Note: Don't ignore pinned ID values older than the previous
@@ -245,9 +247,14 @@ __txn_oldest_scan(WT_SESSION_IMPL *session,
WT_TXNID_LT(id, oldest_id))
oldest_id = id;
+ /* The metadata pinned ID can't move past the oldest ID. */
+ if (WT_TXNID_LT(oldest_id, metadata_pinned))
+ metadata_pinned = oldest_id;
+
+ *last_runningp = last_running;
+ *metadata_pinnedp = metadata_pinned;
*oldest_idp = oldest_id;
*oldest_sessionp = oldest_session;
- *last_runningp = last_running;
}
/*
@@ -261,8 +268,8 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, uint32_t flags)
WT_DECL_RET;
WT_SESSION_IMPL *oldest_session;
WT_TXN_GLOBAL *txn_global;
- uint64_t current_id, last_running, oldest_id;
- uint64_t prev_last_running, prev_oldest_id;
+ uint64_t current_id, last_running, metadata_pinned, oldest_id;
+ uint64_t prev_last_running, prev_metadata_pinned, prev_oldest_id;
bool strict, wait;
conn = S2C(session);
@@ -270,15 +277,17 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, uint32_t flags)
strict = LF_ISSET(WT_TXN_OLDEST_STRICT);
wait = LF_ISSET(WT_TXN_OLDEST_WAIT);
- current_id = last_running = txn_global->current;
+ current_id = last_running = metadata_pinned = txn_global->current;
prev_last_running = txn_global->last_running;
+ prev_metadata_pinned = txn_global->metadata_pinned;
prev_oldest_id = txn_global->oldest_id;
/*
* For pure read-only workloads, or if the update isn't forced and the
* oldest ID isn't too far behind, avoid scanning.
*/
- if (prev_oldest_id == current_id ||
+ if ((prev_oldest_id == current_id &&
+ prev_metadata_pinned == current_id) ||
(!strict && WT_TXNID_LT(current_id, prev_oldest_id + 100)))
return (0);
@@ -288,7 +297,8 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, uint32_t flags)
else if ((ret =
__wt_try_readlock(session, txn_global->scan_rwlock)) != 0)
return (ret == EBUSY ? 0 : ret);
- __txn_oldest_scan(session, &oldest_id, &last_running, &oldest_session);
+ __txn_oldest_scan(session,
+ &oldest_id, &last_running, &metadata_pinned, &oldest_session);
__wt_readunlock(session, txn_global->scan_rwlock);
/*
@@ -298,7 +308,8 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, uint32_t flags)
if ((oldest_id == prev_oldest_id ||
(!strict && WT_TXNID_LT(oldest_id, prev_oldest_id + 100))) &&
((last_running == prev_last_running) ||
- (!strict && WT_TXNID_LT(last_running, prev_last_running + 100))))
+ (!strict && WT_TXNID_LT(last_running, prev_last_running + 100))) &&
+ metadata_pinned == prev_metadata_pinned)
return (0);
/* It looks like an update is necessary, wait for exclusive access. */
@@ -313,7 +324,8 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, uint32_t flags)
* scanning.
*/
if (WT_TXNID_LE(oldest_id, txn_global->oldest_id) &&
- WT_TXNID_LE(last_running, txn_global->last_running))
+ WT_TXNID_LE(last_running, txn_global->last_running) &&
+ WT_TXNID_LE(metadata_pinned, txn_global->metadata_pinned))
goto done;
/*
@@ -322,7 +334,8 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, uint32_t flags)
* sure that there isn't a thread that has got a snapshot locally but
* not yet published its snap_min.
*/
- __txn_oldest_scan(session, &oldest_id, &last_running, &oldest_session);
+ __txn_oldest_scan(session,
+ &oldest_id, &last_running, &metadata_pinned, &oldest_session);
#ifdef HAVE_DIAGNOSTIC
{
@@ -338,7 +351,9 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, uint32_t flags)
id == WT_TXN_NONE || !WT_TXNID_LT(id, oldest_id));
}
#endif
- /* Update the oldest ID. */
+ /* Update the public IDs. */
+ if (WT_TXNID_LT(txn_global->metadata_pinned, metadata_pinned))
+ txn_global->metadata_pinned = metadata_pinned;
if (WT_TXNID_LT(txn_global->oldest_id, oldest_id))
txn_global->oldest_id = oldest_id;
if (WT_TXNID_LT(txn_global->last_running, last_running)) {
@@ -749,7 +764,7 @@ __wt_txn_global_init(WT_SESSION_IMPL *session, const char *cfg[])
txn_global = &conn->txn_global;
txn_global->current = txn_global->last_running =
- txn_global->oldest_id = WT_TXN_FIRST;
+ txn_global->metadata_pinned = txn_global->oldest_id = WT_TXN_FIRST;
WT_RET(__wt_spin_init(session,
&txn_global->id_lock, "transaction id lock"));
@@ -765,7 +780,7 @@ __wt_txn_global_init(WT_SESSION_IMPL *session, const char *cfg[])
WT_CACHE_LINE_ALIGNMENT_VERIFY(session, txn_global->states);
for (i = 0, s = txn_global->states; i < conn->session_size; i++, s++)
- s->id = s->pinned_id = WT_TXN_NONE;
+ s->id = s->metadata_pinned = s->pinned_id = WT_TXN_NONE;
return (0);
}
diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c
index 8fc7bc821b0..698cae23562 100644
--- a/src/txn/txn_ckpt.c
+++ b/src/txn/txn_ckpt.c
@@ -299,7 +299,7 @@ __checkpoint_update_generation(WT_SESSION_IMPL *session)
WT_BTREE *btree;
btree = S2BT(session);
- if (!WT_IS_METADATA(session, session->dhandle))
+ if (!WT_IS_METADATA(session->dhandle))
WT_PUBLISH(btree->include_checkpoint_txn, false);
WT_PUBLISH(btree->checkpoint_gen,
@@ -1055,7 +1055,7 @@ __checkpoint_lock_tree(WT_SESSION_IMPL *session,
* - On connection close when we know there can't be any races.
*/
WT_ASSERT(session, !need_tracking ||
- WT_IS_METADATA(session, dhandle) || WT_META_TRACKING(session));
+ WT_IS_METADATA(dhandle) || WT_META_TRACKING(session));
/* Get the list of checkpoints for this file. */
WT_RET(__wt_meta_ckptlist_get(session, dhandle->name, &ckptbase));
@@ -1419,7 +1419,7 @@ fake: /*
* sync the file here or we could roll forward the metadata in
* recovery and open a checkpoint that isn't yet durable.
*/
- if (WT_IS_METADATA(session, dhandle) ||
+ if (WT_IS_METADATA(dhandle) ||
!F_ISSET(&session->txn, WT_TXN_RUNNING))
WT_ERR(__wt_checkpoint_sync(session, NULL));
@@ -1530,7 +1530,7 @@ __wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
WT_ASSERT(session, session->dhandle->checkpoint == NULL);
/* We must hold the metadata lock if checkpointing the metadata. */
- WT_ASSERT(session, !WT_IS_METADATA(session, session->dhandle) ||
+ WT_ASSERT(session, !WT_IS_METADATA(session->dhandle) ||
F_ISSET(session, WT_SESSION_LOCKED_METADATA));
WT_SAVE_DHANDLE(session,