author     Michael Cahill <michael.cahill@mongodb.com>   2017-03-17 11:02:25 +1100
committer  Michael Cahill <michael.cahill@mongodb.com>   2017-03-17 11:02:25 +1100
commit     5df5125fd63295a9b71d79e68a84ba51e0c1c87f (patch)
tree       e28bae1a5e7ec42bb236b40caef42bfc5df6089e /src
parent     f53a88a57da5788b355cb3037061372a706ccf0d (diff)
download   mongo-5df5125fd63295a9b71d79e68a84ba51e0c1c87f.tar.gz
Import wiredtiger: cc2f15f595b16479affd73791c207da334453bcc from branch mongodb-3.6
ref: e1bcc30da9..cc2f15f595
for: 3.5.5
WT-3149 Change eviction to start new walks from a random place in the tree
WT-3187 Hang on shutdown with a busy cache pool
WT-3188 Fix error handling in logging where fatal errors could lead to a hang
WT-3189 Fix a segfault in the eviction server random positioning
WT-3206 bug: core dump on NULL page index
WT-3218 unexpected checkpoint ordering failures
Diffstat (limited to 'src')
-rw-r--r--  src/third_party/wiredtiger/bench/wtperf/runners/many-table-stress.wtperf | 19
-rw-r--r--  src/third_party/wiredtiger/import.data                                   |  2
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_random.c                         | 10
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_split.c                          | 71
-rw-r--r--  src/third_party/wiredtiger/src/conn/conn_cache_pool.c                    | 56
-rw-r--r--  src/third_party/wiredtiger/src/conn/conn_dhandle.c                       |  8
-rw-r--r--  src/third_party/wiredtiger/src/evict/evict_lru.c                         | 48
-rw-r--r--  src/third_party/wiredtiger/src/include/btree.h                           |  6
-rw-r--r--  src/third_party/wiredtiger/src/log/log.c                                 | 19
-rw-r--r--  src/third_party/wiredtiger/src/log/log_slot.c                            |  2
10 files changed, 179 insertions, 62 deletions
diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/many-table-stress.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/many-table-stress.wtperf
new file mode 100644
index 00000000000..51d0bb0dd9d
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/runners/many-table-stress.wtperf
@@ -0,0 +1,19 @@
+# Create a set of tables with uneven distribution of data
+conn_config="cache_size=1G,eviction=(threads_max=8),file_manager=(close_idle_time=100000),checkpoint=(wait=20,log_size=2GB),statistics=(fast),statistics_log=(wait=5,json),session_max=1000"
+table_config="type=file"
+table_count=5000
+icount=0
+random_range=1000000000
+pareto=10
+range_partition=true
+report_interval=5
+
+run_ops=1000000
+populate_threads=0
+icount=0
+threads=((count=60,inserts=1))
+
+# Warn if a latency over 1 second is seen
+max_latency=1000
+sample_interval=5
+sample_rate=1
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index d2d2afda061..eae444f8f57 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -1,5 +1,5 @@
 {
-    "commit": "e1bcc30da91eedd0b17cebb725cc7e607ffa2340",
+    "commit": "cc2f15f595b16479affd73791c207da334453bcc",
     "github": "wiredtiger/wiredtiger.git",
     "branch": "mongodb-3.6"
 }
diff --git a/src/third_party/wiredtiger/src/btree/bt_random.c b/src/third_party/wiredtiger/src/btree/bt_random.c
index 44de511f787..4c7ff861d26 100644
--- a/src/third_party/wiredtiger/src/btree/bt_random.c
+++ b/src/third_party/wiredtiger/src/btree/bt_random.c
@@ -201,6 +201,16 @@ restart:
     current = &btree->root;
     for (;;) {
         page = current->page;
+        /*
+         * When walking a tree for eviction, an exclusive operation may
+         * be in progress, leaving the root page invalid. Just give up
+         * in that case.
+         */
+        if (page == NULL) {
+            WT_ASSERT(session, eviction);
+            break;
+        }
+
         if (!WT_PAGE_IS_INTERNAL(page))
             break;
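For reference, a minimal standalone sketch of the shape of the bt_random.c fix above. The `node` and `descend` names are simplified stand-ins for WT_REF/WT_PAGE and __wt_random_descent, not WiredTiger's real types, and the child selection is left trivial where WiredTiger picks a child at random:

    #include <assert.h>
    #include <stdbool.h>
    #include <stddef.h>

    struct node {
        bool leaf;
        struct node *child;     /* first child; NULL for an empty tree */
    };

    /*
     * Descend from the root toward a leaf. An exclusive operation may have
     * cleared the root; only an eviction-driven walk is expected to race
     * with that, so give up rather than dereference NULL.
     */
    static struct node *
    descend(struct node *root, bool eviction)
    {
        struct node *page;

        for (page = root;;) {
            if (page == NULL) {
                assert(eviction);
                return (NULL);
            }
            if (page->leaf)
                return (page);
            page = page->child; /* WiredTiger chooses randomly here */
        }
    }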
diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c
index 45550ff627f..6b2100ec7e3 100644
--- a/src/third_party/wiredtiger/src/btree/bt_split.c
+++ b/src/third_party/wiredtiger/src/btree/bt_split.c
@@ -187,7 +187,7 @@ __split_safe_free(WT_SESSION_IMPL *session,
         exclusive = true;

     if (exclusive) {
-        __wt_free(session, p);
+        __wt_overwrite_and_free_len(session, p, s);
         return (0);
     }
@@ -640,12 +640,12 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
     /* Start making real changes to the tree, errors are fatal. */
     complete = WT_ERR_PANIC;

-    /* Get a generation for this split, mark the root page. */
-    split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1);
-    root->pg_intl_split_gen = split_gen;
-
-    /* Prepare the WT_REFs for the move. */
-    __split_ref_prepare(session, alloc_index, split_gen, false);
+    /*
+     * Prepare the WT_REFs for the move: this requires a stable split
+     * generation to block splits in newly created pages, so get one.
+     */
+    WT_ENTER_PAGE_INDEX(session);
+    __split_ref_prepare(session, alloc_index, session->split_gen, false);

     /*
      * Confirm the root page's index hasn't moved, then update it, which
@@ -655,6 +655,16 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
     WT_INTL_INDEX_SET(root, alloc_index);
     alloc_index = NULL;

+    WT_LEAVE_PAGE_INDEX(session);
+
+    /*
+     * Get a generation for this split, mark the root page. This must be
+     * after the new index is swapped into place in order to know that no
+     * readers are looking at the old index.
+     */
+    split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1);
+    root->pg_intl_split_gen = split_gen;
+
 #ifdef HAVE_DIAGNOSTIC
     WT_WITH_PAGE_INDEX(session,
         ret = __split_verify_root(session, root));
@@ -825,10 +835,6 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
     /* Start making real changes to the tree, errors are fatal. */
     complete = WT_ERR_PANIC;

-    /* Get a generation for this split, mark the parent page. */
-    split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1);
-    parent->pg_intl_split_gen = split_gen;
-
     /*
      * Confirm the parent page's index hasn't moved then update it, which
      * makes the split visible to threads descending the tree.
@@ -838,6 +844,14 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
     alloc_index = NULL;

     /*
+     * Get a generation for this split, mark the page. This must be after
+     * the new index is swapped into place in order to know that no readers
+     * are looking at the old index.
+     */
+    split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1);
+    parent->pg_intl_split_gen = split_gen;
+
+    /*
      * If discarding the page's original WT_REF field, reset it to split.
      * Threads cursoring through the tree were blocked because that WT_REF
      * state was set to locked. Changing the locked state to split unblocks
@@ -1154,23 +1168,34 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
     /* Start making real changes to the tree, errors are fatal. */
     complete = WT_ERR_PANIC;

-    /* Get a generation for this split, mark the page. */
-    split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1);
-    page->pg_intl_split_gen = split_gen;
-
-    /* Prepare the WT_REFs for the move. */
-    __split_ref_prepare(session, alloc_index, split_gen, true);
+    /*
+     * Prepare the WT_REFs for the move: this requires a stable split
+     * generation to block splits in newly created pages, so get one.
+     */
+    WT_ENTER_PAGE_INDEX(session);
+    __split_ref_prepare(session, alloc_index, session->split_gen, true);

     /* Split into the parent. */
-    WT_ERR(__split_parent(session, page_ref, alloc_index->index,
-        alloc_index->entries, parent_incr, false, false));
+    if ((ret = __split_parent(session, page_ref, alloc_index->index,
+        alloc_index->entries, parent_incr, false, false)) == 0) {
+        /*
+         * Confirm the page's index hasn't moved, then update it, which
+         * makes the split visible to threads descending the tree.
+         */
+        WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(page) == pindex);
+        WT_INTL_INDEX_SET(page, replace_index);
+    }
+
+    WT_LEAVE_PAGE_INDEX(session);
+    WT_ERR(ret);

     /*
-     * Confirm the page's index hasn't moved, then update it, which makes
-     * the split visible to threads descending the tree.
+     * Get a generation for this split, mark the parent page. This must be
+     * after the new index is swapped into place in order to know that no
+     * readers are looking at the old index.
      */
-    WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(page) == pindex);
-    WT_INTL_INDEX_SET(page, replace_index);
+    split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1);
+    page->pg_intl_split_gen = split_gen;

 #ifdef HAVE_DIAGNOSTIC
     WT_WITH_PAGE_INDEX(session,
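The recurring comment in these bt_split.c hunks ("this must be after the new index is swapped into place") is the key ordering rule. A minimal sketch of why, using C11 atomics and hypothetical names (`pindex`, `page_index`, `swap_index`, `enter_index` are simplified stand-ins, not WiredTiger's real structures or API): readers snapshot the global generation before loading the index pointer, so memory behind the old index can only be reclaimed once every active reader entered at a generation taken after the swap. Bumping the generation first would let a reader hold the new generation while still loading the old pointer.

    #include <stdatomic.h>

    struct pindex { int entries; };

    static _Atomic(struct pindex *) page_index; /* the published index */
    static atomic_ulong split_gen;              /* global split generation */

    /* Writer: publish the new index, then take a generation. */
    static unsigned long
    swap_index(struct pindex *new_index)
    {
        atomic_store(&page_index, new_index);
        /*
         * Any reader entering after this increment must observe
         * new_index, so the old index is safe to free once all
         * readers' generations reach this value.
         */
        return (atomic_fetch_add(&split_gen, 1) + 1);
    }

    /* Reader: record the generation before dereferencing the index. */
    static struct pindex *
    enter_index(unsigned long *enter_genp)
    {
        *enter_genp = atomic_load(&split_gen);
        return (atomic_load(&page_index));
    }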
diff --git a/src/third_party/wiredtiger/src/conn/conn_cache_pool.c b/src/third_party/wiredtiger/src/conn/conn_cache_pool.c
index 49b766f4602..ed078991581 100644
--- a/src/third_party/wiredtiger/src/conn/conn_cache_pool.c
+++ b/src/third_party/wiredtiger/src/conn/conn_cache_pool.c
@@ -418,8 +418,9 @@ static void
 __cache_pool_balance(WT_SESSION_IMPL *session, bool forward)
 {
     WT_CACHE_POOL *cp;
-    bool adjusted;
     uint64_t bump_threshold, highest;
+    int i;
+    bool adjusted;

     cp = __wt_process.cache_pool;
     adjusted = false;
@@ -438,11 +439,17 @@ __cache_pool_balance(WT_SESSION_IMPL *session, bool forward)

     /*
      * Actively attempt to:
-     * - Reduce the amount allocated, if we are over the budget
+     * - Reduce the amount allocated, if we are over the budget.
      * - Increase the amount used if there is capacity and any pressure.
+     * Don't keep trying indefinitely: if we aren't succeeding in reducing
+     * the cache in use, re-assessing the participants' states is necessary.
+     * We are also holding a lock across this process, which can slow
+     * participant shutdown if we spend a long time balancing.
      */
-    while (F_ISSET(cp, WT_CACHE_POOL_ACTIVE) &&
-        F_ISSET(S2C(session)->cache, WT_CACHE_POOL_RUN)) {
+    for (i = 0;
+        i < 2 * WT_CACHE_POOL_BUMP_THRESHOLD &&
+        F_ISSET(cp, WT_CACHE_POOL_ACTIVE) &&
+        F_ISSET(S2C(session)->cache, WT_CACHE_POOL_RUN); i++) {
         __cache_pool_adjust(
             session, highest, bump_threshold, forward, &adjusted);
         /*
@@ -565,7 +572,7 @@ __cache_pool_adjust(WT_SESSION_IMPL *session,
     WT_CONNECTION_IMPL *entry;
     uint64_t adjustment, highest_percentile, pressure, reserved, smallest;
     u_int pct_full;
-    bool busy, pool_full, grow;
+    bool busy, decrease_ok, grow, pool_full;

     *adjustedp = false;
     cp = __wt_process.cache_pool;
@@ -612,6 +619,34 @@ __cache_pool_adjust(WT_SESSION_IMPL *session,
             continue;

         /*
+         * The bump threshold decreases as we try longer to balance
+         * the pool. Adjust how aggressively we free space from
+         * participants depending on how long we have been trying.
+         */
+        decrease_ok = false;
+        /*
+         * Any participant is a candidate if we have been trying
+         * for long enough.
+         */
+        if (bump_threshold == 0)
+            decrease_ok = true;
+        /*
+         * Participants that aren't doing application eviction and
+         * are showing a reasonable amount of usage are excluded
+         * even if we have been trying for a while.
+         */
+        else if (bump_threshold < WT_CACHE_POOL_BUMP_THRESHOLD / 3 &&
+            (!busy && highest > 1))
+            decrease_ok = true;
+        /*
+         * Any participant that is proportionally less busy is a
+         * candidate from the first attempt.
+         */
+        else if (highest > 1 &&
+            pressure < WT_CACHE_POOL_REDUCE_THRESHOLD)
+            decrease_ok = true;
+
+        /*
          * If the entry is currently allocated less than the reserved
          * size, increase its allocation. This should only happen if:
          * - it's the first time we've seen this member, or
@@ -624,17 +659,12 @@ __cache_pool_adjust(WT_SESSION_IMPL *session,
          * Conditions for reducing the amount of resources for an
          * entry:
          * - the pool is full,
-         * - application threads are not busy doing eviction already,
          * - this entry has more than the minimum amount of space in
          *   use,
-         * - the read pressure in this entry is below the threshold,
-         *   other entries need more cache, the entry has more than
-         *   the minimum space and there is no available space in the
-         *   pool.
+         * - it was determined that this slot is a good candidate
          */
-        } else if (pool_full && !busy &&
-            entry->cache_size > reserved &&
-            pressure < WT_CACHE_POOL_REDUCE_THRESHOLD && highest > 1) {
+        } else if (pool_full &&
+            entry->cache_size > reserved && decrease_ok) {
             grow = false;
             /*
              * Don't drop the size down too much - or it can
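The new decrease_ok logic is a tiered policy: candidates for shrinking broaden as bump_threshold counts down across balance passes. A condensed restatement as a standalone predicate (a sketch with stand-in constants; BUMP_THRESHOLD and REDUCE_THRESHOLD are hypothetical placeholders for the WT_CACHE_POOL_* values, and the function itself does not exist in the commit):

    #include <stdbool.h>
    #include <stdint.h>

    #define BUMP_THRESHOLD   10 /* stand-in for WT_CACHE_POOL_BUMP_THRESHOLD */
    #define REDUCE_THRESHOLD  2 /* stand-in for WT_CACHE_POOL_REDUCE_THRESHOLD */

    static bool
    decrease_ok(uint64_t bump_threshold, bool busy, uint64_t highest,
        uint64_t pressure)
    {
        /* Tried long enough: any participant is a candidate. */
        if (bump_threshold == 0)
            return (true);
        /* Tried a while: idle, lightly-used participants qualify. */
        if (bump_threshold < BUMP_THRESHOLD / 3 && !busy && highest > 1)
            return (true);
        /* From the first attempt: proportionally less busy participants. */
        if (highest > 1 && pressure < REDUCE_THRESHOLD)
            return (true);
        return (false);
    }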
diff --git a/src/third_party/wiredtiger/src/conn/conn_dhandle.c b/src/third_party/wiredtiger/src/conn/conn_dhandle.c
index 866b8633f71..99213c5b557 100644
--- a/src/third_party/wiredtiger/src/conn/conn_dhandle.c
+++ b/src/third_party/wiredtiger/src/conn/conn_dhandle.c
@@ -38,6 +38,14 @@ __wt_conn_dhandle_alloc(
     WT_DECL_RET;
     uint64_t bucket;

+    /*
+     * Ensure no one beat us to creating the handle now that we hold the
+     * write lock.
+     */
+    if ((ret =
+        __wt_conn_dhandle_find(session, uri, checkpoint)) != WT_NOTFOUND)
+        return (ret);
+
     WT_RET(__wt_calloc_one(session, &dhandle));
     __wt_rwlock_init(session, &dhandle->rwlock);
diff --git a/src/third_party/wiredtiger/src/evict/evict_lru.c b/src/third_party/wiredtiger/src/evict/evict_lru.c
index 42fe4d4608e..f1949a7c320 100644
--- a/src/third_party/wiredtiger/src/evict/evict_lru.c
+++ b/src/third_party/wiredtiger/src/evict/evict_lru.c
@@ -1654,31 +1654,33 @@ __evict_walk_file(WT_SESSION_IMPL *session,
         !F_ISSET(cache, WT_CACHE_EVICT_CLEAN))
         min_pages *= 10;

+    walk_flags =
+        WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_GEN | WT_READ_NO_WAIT;
+
     /*
      * Choose a random point in the tree if looking for candidates in a
      * tree with no starting point set. This is mostly aimed at ensuring
      * eviction fairly visits all pages in trees with a lot of in-cache
      * content.
      */
-    if (btree->evict_ref == NULL) {
-        /* Ensure internal page indexes remain valid for our walk */
-        WT_WITH_PAGE_INDEX(session, ret =
-            __wt_random_descent(session, &btree->evict_ref, true));
-        WT_RET_NOTFOUND_OK(ret);
-
-        /*
-         * Reverse the direction of the walk each time we start at a
-         * random point so both ends of the tree are equally likely to
-         * be visited.
-         */
-        btree->evict_walk_reverse = !btree->evict_walk_reverse;
-    }
-
-    walk_flags =
-        WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_GEN | WT_READ_NO_WAIT;
-
-    if (btree->evict_walk_reverse)
+    switch (btree->evict_walk_state) {
+    case WT_EVICT_WALK_NEXT:
+        break;
+    case WT_EVICT_WALK_PREV:
         FLD_SET(walk_flags, WT_READ_PREV);
+        break;
+    case WT_EVICT_WALK_RAND_PREV:
+        FLD_SET(walk_flags, WT_READ_PREV);
+        /* FALLTHROUGH */
+    case WT_EVICT_WALK_RAND_NEXT:
+        if (btree->evict_ref == NULL) {
+            /* Ensure internal page indexes remain valid */
+            WT_WITH_PAGE_INDEX(session, ret = __wt_random_descent(
+                session, &btree->evict_ref, true));
+            WT_RET_NOTFOUND_OK(ret);
+        }
+        break;
+    }

     /*
      * Get some more eviction candidate pages, starting at the last saved
@@ -1713,8 +1715,16 @@ __evict_walk_file(WT_SESSION_IMPL *session,
             pages_seen > min_pages &&
             (pages_queued == 0 ||
             (pages_seen / pages_queued) > (min_pages / target_pages));
-        if (give_up)
+        if (give_up) {
+            /*
+             * Try a different walk start point next time if a
+             * walk gave up.
+             */
+            btree->evict_walk_state =
+                (btree->evict_walk_state + 1) %
+                WT_EVICT_WALK_MAX_LEGAL_VALUE;
             break;
+        }

         if (ref == NULL) {
             if (++restarts == 2)
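Together with the btree.h change below, this gives each tree a four-way rotation of walk strategies instead of a single reverse flag. A hypothetical condensation of the mechanism (the enum names and next_walk_state helper are stand-ins, not the commit's identifiers):

    /* Rotate the eviction walk start strategy each time a walk gives up,
     * so successive walks vary both direction and random placement. */
    enum walk_state {
        WALK_NEXT,      /* forward from the saved position */
        WALK_PREV,      /* backward from the saved position */
        WALK_RAND_NEXT, /* forward from a random page */
        WALK_RAND_PREV  /* backward from a random page */
    };
    #define WALK_STATE_COUNT    (WALK_RAND_PREV + 1)

    static enum walk_state
    next_walk_state(enum walk_state s)
    {
        /* NEXT -> PREV -> RAND_NEXT -> RAND_PREV -> NEXT -> ... */
        return ((enum walk_state)((s + 1) % WALK_STATE_COUNT));
    }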
diff --git a/src/third_party/wiredtiger/src/include/btree.h b/src/third_party/wiredtiger/src/include/btree.h
index d742310bf8f..976c1d2110c 100644
--- a/src/third_party/wiredtiger/src/include/btree.h
+++ b/src/third_party/wiredtiger/src/include/btree.h
@@ -141,7 +141,11 @@ struct __wt_btree {
     u_int evict_walk_skips;        /* Number of walks skipped */
     u_int evict_disabled;          /* Eviction disabled count */
     volatile uint32_t evict_busy;  /* Count of threads in eviction */
-    bool evict_walk_reverse;       /* Walk direction */
+    enum {
+        WT_EVICT_WALK_NEXT, WT_EVICT_WALK_PREV,
+        WT_EVICT_WALK_RAND_NEXT, WT_EVICT_WALK_RAND_PREV
+    } evict_walk_state;            /* Eviction walk state */
+#define WT_EVICT_WALK_MAX_LEGAL_VALUE  (WT_EVICT_WALK_RAND_PREV + 1)

     enum {
         WT_CKPT_OFF, WT_CKPT_PREPARE, WT_CKPT_RUNNING
diff --git a/src/third_party/wiredtiger/src/log/log.c b/src/third_party/wiredtiger/src/log/log.c
index d6caa55f8c7..3477ca52502 100644
--- a/src/third_party/wiredtiger/src/log/log.c
+++ b/src/third_party/wiredtiger/src/log/log.c
@@ -24,7 +24,7 @@ static int __log_write_internal(
  * __log_wait_for_earlier_slot --
  *    Wait for write_lsn to catch up to this slot.
  */
-static void
+static int
 __log_wait_for_earlier_slot(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
 {
     WT_CONNECTION_IMPL *conn;
@@ -41,6 +41,7 @@ __log_wait_for_earlier_slot(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
          * unlock in case an earlier thread is trying to switch its
          * slot and complete its operation.
          */
+        WT_RET(WT_SESSION_CHECK_PANIC(session));
         if (F_ISSET(session, WT_SESSION_LOCKED_SLOT))
             __wt_spin_unlock(session, &log->log_slot_lock);
         __wt_cond_signal(session, conn->log_wrlsn_cond);
@@ -51,6 +52,7 @@ __log_wait_for_earlier_slot(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
         if (F_ISSET(session, WT_SESSION_LOCKED_SLOT))
             __wt_spin_lock(session, &log->log_slot_lock);
     }
+    return (0);
 }

 /*
@@ -70,7 +72,7 @@ __log_fs_write(WT_SESSION_IMPL *session,
      * be a hole at the end of the previous log file that we cannot detect.
      */
     if (slot->slot_release_lsn.l.file < slot->slot_start_lsn.l.file) {
-        __log_wait_for_earlier_slot(session, slot);
+        WT_RET(__log_wait_for_earlier_slot(session, slot));
         WT_RET(__wt_log_force_sync(session, &slot->slot_release_lsn));
     }
     if ((ret = __wt_write(session, slot->slot_fh, offset, len, buf)) != 0)
@@ -110,6 +112,7 @@ __wt_log_flush_lsn(WT_SESSION_IMPL *session, WT_LSN *lsn, bool start)
     conn = S2C(session);
     log = conn->log;
+    WT_RET(WT_SESSION_CHECK_PANIC(session));
     WT_RET(__wt_log_force_write(session, 1, NULL));
     __wt_log_wrlsn(session, NULL);
     if (start)
@@ -174,6 +177,7 @@ __wt_log_force_sync(WT_SESSION_IMPL *session, WT_LSN *min_lsn)
      * log file ready to close.
      */
     while (log->sync_lsn.l.file < min_lsn->l.file) {
+        WT_RET(WT_SESSION_CHECK_PANIC(session));
         __wt_cond_signal(session, S2C(session)->log_file_cond);
         __wt_cond_wait(session, log->log_sync_cond, 10000, NULL);
     }
@@ -1467,7 +1471,7 @@ __wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *freep)
      * be holes in the log file.
      */
     WT_STAT_CONN_INCR(session, log_release_write_lsn);
-    __log_wait_for_earlier_slot(session, slot);
+    WT_ERR(__log_wait_for_earlier_slot(session, slot));

     log->write_start_lsn = slot->slot_start_lsn;
     log->write_lsn = slot->slot_end_lsn;
@@ -1488,6 +1492,7 @@ __wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *freep)
      * current fsync completes and advance log->sync_lsn.
      */
     while (F_ISSET(slot, WT_SLOT_SYNC | WT_SLOT_SYNC_DIR)) {
+        WT_ERR(WT_SESSION_CHECK_PANIC(session));
         /*
          * We have to wait until earlier log files have finished their
          * sync operations. The most recent one will set the LSN to the
@@ -2178,15 +2183,19 @@ __log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp,
     if (LF_ISSET(WT_LOG_FLUSH)) {
         /* Wait for our writes to reach the OS */
         while (__wt_log_cmp(&log->write_lsn, &lsn) <= 0 &&
-            myslot.slot->slot_error == 0)
+            myslot.slot->slot_error == 0) {
+            WT_ERR(WT_SESSION_CHECK_PANIC(session));
             __wt_cond_wait(
                 session, log->log_write_cond, 10000, NULL);
+        }
     } else if (LF_ISSET(WT_LOG_FSYNC)) {
         /* Wait for our writes to reach disk */
         while (__wt_log_cmp(&log->sync_lsn, &lsn) <= 0 &&
-            myslot.slot->slot_error == 0)
+            myslot.slot->slot_error == 0) {
+            WT_ERR(WT_SESSION_CHECK_PANIC(session));
             __wt_cond_wait(
                 session, log->log_sync_cond, 10000, NULL);
+        }
     }

     /*
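The log.c changes (and the log_slot.c changes that follow) all apply one pattern for WT-3188: every formerly unbounded wait loop re-checks a connection-wide panic condition, so a fatal error elsewhere turns a potential shutdown hang into an error return. A minimal compilable sketch of the pattern, with hypothetical stand-ins (conn_panicked for whatever WT_SESSION_CHECK_PANIC inspects, cond_wait_us for __wt_cond_wait, and an illustrative error code):

    #include <stdatomic.h>
    #include <stdbool.h>

    static atomic_bool conn_panicked;          /* set by the first fatal error */
    static _Atomic long write_lsn, target_lsn; /* stand-ins for the real LSNs */

    #define EPANIC (-31804)                    /* illustrative error code */

    static void
    cond_wait_us(unsigned long usecs)          /* stand-in for __wt_cond_wait */
    {
        (void)usecs;                           /* a real wait would block here */
    }

    /* Wait for the write LSN to catch up, but never past a panic. */
    static int
    wait_for_write_lsn(void)
    {
        while (atomic_load(&write_lsn) < atomic_load(&target_lsn)) {
            if (atomic_load(&conn_panicked))
                return (EPANIC);               /* don't sleep forever */
            cond_wait_us(10000);               /* bounded wait, then re-check */
        }
        return (0);
    }

This is also why __log_wait_for_earlier_slot changes from void to int above: once the wait can fail, every caller has to propagate the error.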
diff --git a/src/third_party/wiredtiger/src/log/log_slot.c b/src/third_party/wiredtiger/src/log/log_slot.c
index 542f010ea53..b4655ff6c1a 100644
--- a/src/third_party/wiredtiger/src/log/log_slot.c
+++ b/src/third_party/wiredtiger/src/log/log_slot.c
@@ -220,6 +220,7 @@ __log_slot_switch_internal(
     if (slot != log->active_slot)
         return (0);

+    WT_RET(WT_SESSION_CHECK_PANIC(session));
     /*
      * We may come through here multiple times if we were able to close
      * a slot but could not set up a new one. If we closed it already,
@@ -582,6 +583,7 @@ __wt_log_slot_release(WT_SESSION_IMPL *session, WT_MYSLOT *myslot, int64_t size)
      * was written rather than the beginning record of the slot.
      */
     while ((cur_offset = slot->slot_last_offset) < my_start) {
+        WT_RET(WT_SESSION_CHECK_PANIC(session));
        /*
         * Set our offset if we are larger.
         */