diff options
author | Michael Cahill <michael.cahill@mongodb.com> | 2016-11-15 17:02:07 +1100 |
---|---|---|
committer | Michael Cahill <michael.cahill@mongodb.com> | 2016-11-15 17:02:07 +1100 |
commit | 85ec028f36c0a48bb4e05fc1841b25c56c0f173b (patch) | |
tree | e38d422594801c8f22ba20f097c6672e214b3af3 | |
parent | 038fc3672336622aee5f5957fcf6d1eeff3426e3 (diff) | |
parent | e11d885f11fc7c47f1a9160087f738da80567ad2 (diff) | |
download | mongo-85ec028f36c0a48bb4e05fc1841b25c56c0f173b.tar.gz |
Merge branch 'develop' into mongodb-3.4
-rw-r--r-- | src/btree/bt_handle.c | 4 | ||||
-rw-r--r-- | src/conn/conn_dhandle.c | 4 | ||||
-rw-r--r-- | src/conn/conn_open.c | 3 | ||||
-rw-r--r-- | src/conn/conn_sweep.c | 6 | ||||
-rw-r--r-- | src/evict/evict_lru.c | 2 | ||||
-rw-r--r-- | src/include/btree.i | 22 | ||||
-rw-r--r-- | src/include/meta.h | 3 | ||||
-rw-r--r-- | src/include/txn.h | 11 | ||||
-rw-r--r-- | src/include/txn.i | 9 | ||||
-rw-r--r-- | src/session/session_dhandle.c | 4 | ||||
-rw-r--r-- | src/txn/txn.c | 65 | ||||
-rw-r--r-- | src/txn/txn_ckpt.c | 8 |
12 files changed, 85 insertions, 56 deletions
diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c index 337a3ea036f..47c7972dd57 100644 --- a/src/btree/bt_handle.c +++ b/src/btree/bt_handle.c @@ -341,7 +341,7 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt) * always inherit from the connection. */ WT_RET(__wt_config_gets(session, cfg, "encryption.name", &cval)); - if (WT_IS_METADATA(session, btree->dhandle) || cval.len == 0) + if (WT_IS_METADATA(btree->dhandle) || cval.len == 0) btree->kencryptor = conn->kencryptor; else if (WT_STRING_MATCH("none", cval.str, cval.len)) btree->kencryptor = NULL; @@ -432,7 +432,7 @@ __wt_btree_tree_open( * Failure to open metadata means that the database is unavailable. * Try to provide a helpful failure message. */ - if (ret != 0 && WT_IS_METADATA(session, session->dhandle)) { + if (ret != 0 && WT_IS_METADATA(session->dhandle)) { __wt_errx(session, "WiredTiger has failed to open its metadata"); __wt_errx(session, "This may be due to the database" diff --git a/src/conn/conn_dhandle.c b/src/conn/conn_dhandle.c index ac72e330b67..e9e3925c57e 100644 --- a/src/conn/conn_dhandle.c +++ b/src/conn/conn_dhandle.c @@ -447,7 +447,7 @@ __wt_conn_btree_apply(WT_SESSION_IMPL *session, const char *uri, F_ISSET(dhandle, WT_DHANDLE_DEAD) || dhandle->checkpoint != NULL || !WT_PREFIX_MATCH(dhandle->name, "file:") || - WT_IS_METADATA(session, dhandle)) + WT_IS_METADATA(dhandle)) continue; WT_RET(__conn_btree_apply_internal( session, dhandle, file_func, name_func, cfg)); @@ -627,7 +627,7 @@ __wt_conn_dhandle_discard(WT_SESSION_IMPL *session) */ restart: TAILQ_FOREACH(dhandle, &conn->dhqh, q) { - if (WT_IS_METADATA(session, dhandle)) + if (WT_IS_METADATA(dhandle)) continue; WT_WITH_DHANDLE(session, dhandle, diff --git a/src/conn/conn_open.c b/src/conn/conn_open.c index 69b50147bf5..6454503d6cb 100644 --- a/src/conn/conn_open.c +++ b/src/conn/conn_open.c @@ -95,7 +95,8 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn) for (;;) { WT_TRET(__wt_txn_update_oldest(session, WT_TXN_OLDEST_STRICT | WT_TXN_OLDEST_WAIT)); - if (txn_global->oldest_id == txn_global->current) + if (txn_global->oldest_id == txn_global->current && + txn_global->metadata_pinned == txn_global->current) break; __wt_yield(); } diff --git a/src/conn/conn_sweep.c b/src/conn/conn_sweep.c index dba37fa2eb0..d1254d8afcc 100644 --- a/src/conn/conn_sweep.c +++ b/src/conn/conn_sweep.c @@ -26,7 +26,7 @@ __sweep_mark(WT_SESSION_IMPL *session, time_t now) conn = S2C(session); TAILQ_FOREACH(dhandle, &conn->dhqh, q) { - if (WT_IS_METADATA(session, dhandle)) + if (WT_IS_METADATA(dhandle)) continue; /* @@ -122,7 +122,7 @@ __sweep_expire(WT_SESSION_IMPL *session, time_t now) if (conn->open_btree_count < conn->sweep_handles_min) break; - if (WT_IS_METADATA(session, dhandle) || + if (WT_IS_METADATA(dhandle) || !F_ISSET(dhandle, WT_DHANDLE_OPEN) || dhandle->session_inuse != 0 || dhandle->timeofdeath == 0 || @@ -228,7 +228,7 @@ __sweep_remove_handles(WT_SESSION_IMPL *session) dhandle != NULL; dhandle = dhandle_next) { dhandle_next = TAILQ_NEXT(dhandle, q); - if (WT_IS_METADATA(session, dhandle)) + if (WT_IS_METADATA(dhandle)) continue; if (!WT_DHANDLE_CAN_DISCARD(dhandle)) continue; diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index 909fe813f98..dfa3fae48d9 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -1449,7 +1449,7 @@ __evict_walk_file(WT_SESSION_IMPL *session, WT_EVICT_QUEUE *queue, /* Limit internal pages to 50% of the total. */ if (WT_PAGE_IS_INTERNAL(page) && - internal_pages >= (int)(evict - start) / 2) + internal_pages > (int)(evict - start) / 2) continue; /* If eviction gets aggressive, anything else is fair game. */ diff --git a/src/include/btree.i b/src/include/btree.i index 9c8ec1dbdea..8f44bc4eddd 100644 --- a/src/include/btree.i +++ b/src/include/btree.i @@ -1562,7 +1562,7 @@ __wt_btree_lsm_switch_primary(WT_SESSION_IMPL *session, bool on) cache = S2C(session)->cache; root = btree->root.page; - if (on && !F_ISSET(btree, WT_BTREE_LSM_PRIMARY)) + if (!F_ISSET(btree, WT_BTREE_LSM_PRIMARY)) F_SET(btree, WT_BTREE_LSM_PRIMARY | WT_BTREE_NO_EVICTION); if (!on && F_ISSET(btree, WT_BTREE_LSM_PRIMARY)) { pindex = WT_INTL_INDEX_GET_SAFE(root); @@ -1575,21 +1575,19 @@ __wt_btree_lsm_switch_primary(WT_SESSION_IMPL *session, bool on) * We're reaching down into the page without a hazard pointer, * but that's OK because we know that no-eviction is set so the * page can't disappear. - */ - child = first->page; - if (first->state != WT_REF_MEM || - child->type != WT_PAGE_ROW_LEAF || - !__wt_page_is_modified(child)) - return; - - /* + * * While this tree was the primary, its dirty bytes were not * included in the cache accounting. Fix that now before we * open it up for eviction. */ - size = child->modify->bytes_dirty; - (void)__wt_atomic_add64(&btree->bytes_dirty_leaf, size); - (void)__wt_atomic_add64(&cache->bytes_dirty_leaf, size); + child = first->page; + if (first->state == WT_REF_MEM && + child->type == WT_PAGE_ROW_LEAF && + __wt_page_is_modified(child)) { + size = child->modify->bytes_dirty; + (void)__wt_atomic_add64(&btree->bytes_dirty_leaf, size); + (void)__wt_atomic_add64(&cache->bytes_dirty_leaf, size); + } F_CLR(btree, WT_BTREE_LSM_PRIMARY | WT_BTREE_NO_EVICTION); } diff --git a/src/include/meta.h b/src/include/meta.h index 6d4a167a8e5..74df3c57ce4 100644 --- a/src/include/meta.h +++ b/src/include/meta.h @@ -32,8 +32,7 @@ * Optimize comparisons against the metafile URI, flag handles that reference * the metadata file. */ -#define WT_IS_METADATA(session, dh) \ - F_ISSET((dh), WT_DHANDLE_IS_METADATA) +#define WT_IS_METADATA(dh) F_ISSET((dh), WT_DHANDLE_IS_METADATA) #define WT_METAFILE_ID 0 /* Metadata file ID */ #define WT_METADATA_VERSION "WiredTiger version" /* Version keys */ diff --git a/src/include/txn.h b/src/include/txn.h index 8128e8e4cc2..774f635d7ba 100644 --- a/src/include/txn.h +++ b/src/include/txn.h @@ -67,6 +67,7 @@ struct __wt_named_snapshot { struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_txn_state { volatile uint64_t id; volatile uint64_t pinned_id; + volatile uint64_t metadata_pinned; }; struct __wt_txn_global { @@ -94,12 +95,18 @@ struct __wt_txn_global { * for a long time so we keep them out of regular visibility checks. * Eviction and checkpoint operations know when they need to be aware * of checkpoint transactions. + * + * We rely on the fact that (a) the only table a checkpoint updates is + * the metadata; and (b) once checkpoint has finished reading a table, + * it won't revisit it. */ volatile uint32_t checkpoint_id; /* Checkpoint's session ID */ - volatile uint64_t checkpoint_gen; - volatile uint64_t checkpoint_pinned; + volatile uint64_t checkpoint_gen; /* Checkpoint generation */ + volatile uint64_t checkpoint_pinned; /* Oldest ID for checkpoint */ volatile uint64_t checkpoint_txnid; /* Checkpoint's txn ID */ + volatile uint64_t metadata_pinned; /* Oldest ID for metadata */ + /* Named snapshot state. */ WT_RWLOCK *nsnap_rwlock; volatile uint64_t nsnap_oldest_id; diff --git a/src/include/txn.i b/src/include/txn.i index 0f75b2e90e1..3e8dd45003e 100644 --- a/src/include/txn.i +++ b/src/include/txn.i @@ -112,6 +112,13 @@ __wt_txn_oldest_id(WT_SESSION_IMPL *session) btree = S2BT_SAFE(session); /* + * The metadata is tracked specially because of optimizations for + * checkpoints. + */ + if (session->dhandle != NULL && WT_IS_METADATA(session->dhandle)) + return (txn_global->metadata_pinned); + + /* * Take a local copy of these IDs in case they are updated while we are * checking visibility. The read of the transaction ID pinned by a * checkpoint needs to be carefully ordered: if a checkpoint is @@ -482,6 +489,8 @@ __wt_txn_cursor_op(WT_SESSION_IMPL *session) if (txn->isolation == WT_ISO_READ_UNCOMMITTED) { if (txn_state->pinned_id == WT_TXN_NONE) txn_state->pinned_id = txn_global->last_running; + if (txn_state->metadata_pinned == WT_TXN_NONE) + txn_state->metadata_pinned = txn_state->pinned_id; } else if (!F_ISSET(txn, WT_TXN_HAS_SNAPSHOT)) __wt_txn_get_snapshot(session); } diff --git a/src/session/session_dhandle.c b/src/session/session_dhandle.c index 725854c6001..94326aebe46 100644 --- a/src/session/session_dhandle.c +++ b/src/session/session_dhandle.c @@ -68,7 +68,7 @@ __session_find_dhandle(WT_SESSION_IMPL *session, retry: TAILQ_FOREACH(dhandle_cache, &session->dhhash[bucket], hashq) { dhandle = dhandle_cache->dhandle; if (WT_DHANDLE_INACTIVE(dhandle) && - !WT_IS_METADATA(session, dhandle)) { + !WT_IS_METADATA(dhandle)) { __session_discard_dhandle(session, dhandle_cache); /* We deleted our entry, retry from the start. */ goto retry; @@ -401,7 +401,7 @@ __session_dhandle_sweep(WT_SESSION_IMPL *session) difftime(now, dhandle->timeofdeath) > conn->sweep_idle_time))) { WT_STAT_CONN_INCR(session, dh_session_handles); - WT_ASSERT(session, !WT_IS_METADATA(session, dhandle)); + WT_ASSERT(session, !WT_IS_METADATA(dhandle)); __session_discard_dhandle(session, dhandle_cache); } dhandle_cache = dhandle_cache_next; diff --git a/src/txn/txn.c b/src/txn/txn.c index 3dba51f5220..a70551cdeb2 100644 --- a/src/txn/txn.c +++ b/src/txn/txn.c @@ -100,7 +100,7 @@ __wt_txn_release_snapshot(WT_SESSION_IMPL *session) session->txn.isolation == WT_ISO_READ_UNCOMMITTED || !__wt_txn_visible_all(session, txn_state->pinned_id)); - txn_state->pinned_id = WT_TXN_NONE; + txn_state->metadata_pinned = txn_state->pinned_id = WT_TXN_NONE; F_CLR(txn, WT_TXN_HAS_SNAPSHOT); } @@ -137,8 +137,10 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session) * metadata. We don't have to keep the checkpoint's changes pinned so * don't including it in the published pinned ID. */ - if ((id = txn_global->checkpoint_txnid) != WT_TXN_NONE) + if ((id = txn_global->checkpoint_txnid) != WT_TXN_NONE) { txn->snapshot[n++] = id; + txn_state->metadata_pinned = id; + } /* For pure read-only workloads, avoid scanning. */ if (prev_oldest_id == current_id) { @@ -188,14 +190,14 @@ done: __wt_readunlock(session, txn_global->scan_rwlock); */ static void __txn_oldest_scan(WT_SESSION_IMPL *session, - uint64_t *oldest_idp, uint64_t *last_runningp, + uint64_t *oldest_idp, uint64_t *last_runningp, uint64_t *metadata_pinnedp, WT_SESSION_IMPL **oldest_sessionp) { WT_CONNECTION_IMPL *conn; WT_SESSION_IMPL *oldest_session; WT_TXN_GLOBAL *txn_global; WT_TXN_STATE *s; - uint64_t id, last_running, oldest_id, prev_oldest_id; + uint64_t id, last_running, metadata_pinned, oldest_id, prev_oldest_id; uint32_t i, session_cnt; conn = S2C(session); @@ -204,24 +206,24 @@ __txn_oldest_scan(WT_SESSION_IMPL *session, /* The oldest ID cannot change while we are holding the scan lock. */ prev_oldest_id = txn_global->oldest_id; - oldest_id = last_running = txn_global->current; + last_running = oldest_id = txn_global->current; + if ((metadata_pinned = txn_global->checkpoint_txnid) == WT_TXN_NONE) + metadata_pinned = oldest_id; /* Walk the array of concurrent transactions. */ WT_ORDERED_READ(session_cnt, conn->session_cnt); for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) { - /* - * Update the oldest ID. - * - * Ignore: IDs older than the oldest ID we saw. This can happen - * if we race with a thread that is allocating an ID -- the ID - * will not be used because the thread will keep spinning until - * it gets a valid one. - */ + /* Update the last running transaction ID. */ if ((id = s->id) != WT_TXN_NONE && WT_TXNID_LE(prev_oldest_id, id) && WT_TXNID_LT(id, last_running)) last_running = id; + /* Update the metadata pinned ID. */ + if ((id = s->metadata_pinned) != WT_TXN_NONE && + WT_TXNID_LT(id, metadata_pinned)) + metadata_pinned = id; + /* * !!! * Note: Don't ignore pinned ID values older than the previous @@ -245,9 +247,14 @@ __txn_oldest_scan(WT_SESSION_IMPL *session, WT_TXNID_LT(id, oldest_id)) oldest_id = id; + /* The metadata pinned ID can't move past the oldest ID. */ + if (WT_TXNID_LT(oldest_id, metadata_pinned)) + metadata_pinned = oldest_id; + + *last_runningp = last_running; + *metadata_pinnedp = metadata_pinned; *oldest_idp = oldest_id; *oldest_sessionp = oldest_session; - *last_runningp = last_running; } /* @@ -261,8 +268,8 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, uint32_t flags) WT_DECL_RET; WT_SESSION_IMPL *oldest_session; WT_TXN_GLOBAL *txn_global; - uint64_t current_id, last_running, oldest_id; - uint64_t prev_last_running, prev_oldest_id; + uint64_t current_id, last_running, metadata_pinned, oldest_id; + uint64_t prev_last_running, prev_metadata_pinned, prev_oldest_id; bool strict, wait; conn = S2C(session); @@ -270,15 +277,17 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, uint32_t flags) strict = LF_ISSET(WT_TXN_OLDEST_STRICT); wait = LF_ISSET(WT_TXN_OLDEST_WAIT); - current_id = last_running = txn_global->current; + current_id = last_running = metadata_pinned = txn_global->current; prev_last_running = txn_global->last_running; + prev_metadata_pinned = txn_global->metadata_pinned; prev_oldest_id = txn_global->oldest_id; /* * For pure read-only workloads, or if the update isn't forced and the * oldest ID isn't too far behind, avoid scanning. */ - if (prev_oldest_id == current_id || + if ((prev_oldest_id == current_id && + prev_metadata_pinned == current_id) || (!strict && WT_TXNID_LT(current_id, prev_oldest_id + 100))) return (0); @@ -288,7 +297,8 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, uint32_t flags) else if ((ret = __wt_try_readlock(session, txn_global->scan_rwlock)) != 0) return (ret == EBUSY ? 0 : ret); - __txn_oldest_scan(session, &oldest_id, &last_running, &oldest_session); + __txn_oldest_scan(session, + &oldest_id, &last_running, &metadata_pinned, &oldest_session); __wt_readunlock(session, txn_global->scan_rwlock); /* @@ -298,7 +308,8 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, uint32_t flags) if ((oldest_id == prev_oldest_id || (!strict && WT_TXNID_LT(oldest_id, prev_oldest_id + 100))) && ((last_running == prev_last_running) || - (!strict && WT_TXNID_LT(last_running, prev_last_running + 100)))) + (!strict && WT_TXNID_LT(last_running, prev_last_running + 100))) && + metadata_pinned == prev_metadata_pinned) return (0); /* It looks like an update is necessary, wait for exclusive access. */ @@ -313,7 +324,8 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, uint32_t flags) * scanning. */ if (WT_TXNID_LE(oldest_id, txn_global->oldest_id) && - WT_TXNID_LE(last_running, txn_global->last_running)) + WT_TXNID_LE(last_running, txn_global->last_running) && + WT_TXNID_LE(metadata_pinned, txn_global->metadata_pinned)) goto done; /* @@ -322,7 +334,8 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, uint32_t flags) * sure that there isn't a thread that has got a snapshot locally but * not yet published its snap_min. */ - __txn_oldest_scan(session, &oldest_id, &last_running, &oldest_session); + __txn_oldest_scan(session, + &oldest_id, &last_running, &metadata_pinned, &oldest_session); #ifdef HAVE_DIAGNOSTIC { @@ -338,7 +351,9 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, uint32_t flags) id == WT_TXN_NONE || !WT_TXNID_LT(id, oldest_id)); } #endif - /* Update the oldest ID. */ + /* Update the public IDs. */ + if (WT_TXNID_LT(txn_global->metadata_pinned, metadata_pinned)) + txn_global->metadata_pinned = metadata_pinned; if (WT_TXNID_LT(txn_global->oldest_id, oldest_id)) txn_global->oldest_id = oldest_id; if (WT_TXNID_LT(txn_global->last_running, last_running)) { @@ -749,7 +764,7 @@ __wt_txn_global_init(WT_SESSION_IMPL *session, const char *cfg[]) txn_global = &conn->txn_global; txn_global->current = txn_global->last_running = - txn_global->oldest_id = WT_TXN_FIRST; + txn_global->metadata_pinned = txn_global->oldest_id = WT_TXN_FIRST; WT_RET(__wt_spin_init(session, &txn_global->id_lock, "transaction id lock")); @@ -765,7 +780,7 @@ __wt_txn_global_init(WT_SESSION_IMPL *session, const char *cfg[]) WT_CACHE_LINE_ALIGNMENT_VERIFY(session, txn_global->states); for (i = 0, s = txn_global->states; i < conn->session_size; i++, s++) - s->id = s->pinned_id = WT_TXN_NONE; + s->id = s->metadata_pinned = s->pinned_id = WT_TXN_NONE; return (0); } diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c index 8fc7bc821b0..698cae23562 100644 --- a/src/txn/txn_ckpt.c +++ b/src/txn/txn_ckpt.c @@ -299,7 +299,7 @@ __checkpoint_update_generation(WT_SESSION_IMPL *session) WT_BTREE *btree; btree = S2BT(session); - if (!WT_IS_METADATA(session, session->dhandle)) + if (!WT_IS_METADATA(session->dhandle)) WT_PUBLISH(btree->include_checkpoint_txn, false); WT_PUBLISH(btree->checkpoint_gen, @@ -1055,7 +1055,7 @@ __checkpoint_lock_tree(WT_SESSION_IMPL *session, * - On connection close when we know there can't be any races. */ WT_ASSERT(session, !need_tracking || - WT_IS_METADATA(session, dhandle) || WT_META_TRACKING(session)); + WT_IS_METADATA(dhandle) || WT_META_TRACKING(session)); /* Get the list of checkpoints for this file. */ WT_RET(__wt_meta_ckptlist_get(session, dhandle->name, &ckptbase)); @@ -1419,7 +1419,7 @@ fake: /* * sync the file here or we could roll forward the metadata in * recovery and open a checkpoint that isn't yet durable. */ - if (WT_IS_METADATA(session, dhandle) || + if (WT_IS_METADATA(dhandle) || !F_ISSET(&session->txn, WT_TXN_RUNNING)) WT_ERR(__wt_checkpoint_sync(session, NULL)); @@ -1530,7 +1530,7 @@ __wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) WT_ASSERT(session, session->dhandle->checkpoint == NULL); /* We must hold the metadata lock if checkpointing the metadata. */ - WT_ASSERT(session, !WT_IS_METADATA(session, session->dhandle) || + WT_ASSERT(session, !WT_IS_METADATA(session->dhandle) || F_ISSET(session, WT_SESSION_LOCKED_METADATA)); WT_SAVE_DHANDLE(session, |