summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorMichael Cahill <michael.cahill@mongodb.com>2017-03-17 11:02:25 +1100
committerMichael Cahill <michael.cahill@mongodb.com>2017-03-17 11:02:25 +1100
commit5df5125fd63295a9b71d79e68a84ba51e0c1c87f (patch)
treee28bae1a5e7ec42bb236b40caef42bfc5df6089e /src
parentf53a88a57da5788b355cb3037061372a706ccf0d (diff)
downloadmongo-5df5125fd63295a9b71d79e68a84ba51e0c1c87f.tar.gz
Import wiredtiger: cc2f15f595b16479affd73791c207da334453bcc from branch mongodb-3.6
ref: e1bcc30da9..cc2f15f595 for: 3.5.5 WT-3149 Change eviction to start new walks from a random place in the tree WT-3187 Hang on shutdown with a busy cache pool WT-3188 Fix error handling in logging where fatal errors could lead to a hang WT-3189 Fix a segfault in the eviction server random positioning WT-3206 bug: core dump on NULL page index WT-3218 unexpected checkpoint ordering failures
Diffstat (limited to 'src')
-rw-r--r--src/third_party/wiredtiger/bench/wtperf/runners/many-table-stress.wtperf19
-rw-r--r--src/third_party/wiredtiger/import.data2
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_random.c10
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_split.c71
-rw-r--r--src/third_party/wiredtiger/src/conn/conn_cache_pool.c56
-rw-r--r--src/third_party/wiredtiger/src/conn/conn_dhandle.c8
-rw-r--r--src/third_party/wiredtiger/src/evict/evict_lru.c48
-rw-r--r--src/third_party/wiredtiger/src/include/btree.h6
-rw-r--r--src/third_party/wiredtiger/src/log/log.c19
-rw-r--r--src/third_party/wiredtiger/src/log/log_slot.c2
10 files changed, 179 insertions, 62 deletions
diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/many-table-stress.wtperf b/src/third_party/wiredtiger/bench/wtperf/runners/many-table-stress.wtperf
new file mode 100644
index 00000000000..51d0bb0dd9d
--- /dev/null
+++ b/src/third_party/wiredtiger/bench/wtperf/runners/many-table-stress.wtperf
@@ -0,0 +1,19 @@
+# Create a set of tables with uneven distribution of data
+conn_config="cache_size=1G,eviction=(threads_max=8),file_manager=(close_idle_time=100000),checkpoint=(wait=20,log_size=2GB),statistics=(fast),statistics_log=(wait=5,json),session_max=1000"
+table_config="type=file"
+table_count=5000
+icount=0
+random_range=1000000000
+pareto=10
+range_partition=true
+report_interval=5
+
+run_ops=1000000
+populate_threads=0
+icount=0
+threads=((count=60,inserts=1))
+
+# Warn if a latency over 1 second is seen
+max_latency=1000
+sample_interval=5
+sample_rate=1
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index d2d2afda061..eae444f8f57 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -1,5 +1,5 @@
{
- "commit": "e1bcc30da91eedd0b17cebb725cc7e607ffa2340",
+ "commit": "cc2f15f595b16479affd73791c207da334453bcc",
"github": "wiredtiger/wiredtiger.git",
"branch": "mongodb-3.6"
}
diff --git a/src/third_party/wiredtiger/src/btree/bt_random.c b/src/third_party/wiredtiger/src/btree/bt_random.c
index 44de511f787..4c7ff861d26 100644
--- a/src/third_party/wiredtiger/src/btree/bt_random.c
+++ b/src/third_party/wiredtiger/src/btree/bt_random.c
@@ -201,6 +201,16 @@ restart: /*
current = &btree->root;
for (;;) {
page = current->page;
+ /*
+ * When walking a tree for eviction, an exclusive operation may
+ * be in progress, leaving the root page invalid. Just give
+ * up in that case.
+ */
+ if (page == NULL) {
+ WT_ASSERT(session, eviction);
+ break;
+ }
+
if (!WT_PAGE_IS_INTERNAL(page))
break;
diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c
index 45550ff627f..6b2100ec7e3 100644
--- a/src/third_party/wiredtiger/src/btree/bt_split.c
+++ b/src/third_party/wiredtiger/src/btree/bt_split.c
@@ -187,7 +187,7 @@ __split_safe_free(WT_SESSION_IMPL *session,
exclusive = true;
if (exclusive) {
- __wt_free(session, p);
+ __wt_overwrite_and_free_len(session, p, s);
return (0);
}
@@ -640,12 +640,12 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
/* Start making real changes to the tree, errors are fatal. */
complete = WT_ERR_PANIC;
- /* Get a generation for this split, mark the root page. */
- split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1);
- root->pg_intl_split_gen = split_gen;
-
- /* Prepare the WT_REFs for the move. */
- __split_ref_prepare(session, alloc_index, split_gen, false);
+ /*
+ * Prepare the WT_REFs for the move: this requires a stable split
+ * generation to block splits in newly created pages, so get one.
+ */
+ WT_ENTER_PAGE_INDEX(session);
+ __split_ref_prepare(session, alloc_index, session->split_gen, false);
/*
* Confirm the root page's index hasn't moved, then update it, which
@@ -655,6 +655,16 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
WT_INTL_INDEX_SET(root, alloc_index);
alloc_index = NULL;
+ WT_LEAVE_PAGE_INDEX(session);
+
+ /*
+ * Get a generation for this split, mark the root page. This must be
+ * after the new index is swapped into place in order to know that no
+ * readers are looking at the old index.
+ */
+ split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1);
+ root->pg_intl_split_gen = split_gen;
+
#ifdef HAVE_DIAGNOSTIC
WT_WITH_PAGE_INDEX(session,
ret = __split_verify_root(session, root));
@@ -825,10 +835,6 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
/* Start making real changes to the tree, errors are fatal. */
complete = WT_ERR_PANIC;
- /* Get a generation for this split, mark the parent page. */
- split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1);
- parent->pg_intl_split_gen = split_gen;
-
/*
* Confirm the parent page's index hasn't moved then update it, which
* makes the split visible to threads descending the tree.
@@ -838,6 +844,14 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
alloc_index = NULL;
/*
+ * Get a generation for this split, mark the parent page. This must be
+ * after the new index is swapped into place in order to know that no
+ * readers are looking at the old index.
+ */
+ split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1);
+ parent->pg_intl_split_gen = split_gen;
+
+ /*
* If discarding the page's original WT_REF field, reset it to split.
* Threads cursoring through the tree were blocked because that WT_REF
* state was set to locked. Changing the locked state to split unblocks
@@ -1154,23 +1168,34 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
/* Start making real changes to the tree, errors are fatal. */
complete = WT_ERR_PANIC;
- /* Get a generation for this split, mark the page. */
- split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1);
- page->pg_intl_split_gen = split_gen;
-
- /* Prepare the WT_REFs for the move. */
- __split_ref_prepare(session, alloc_index, split_gen, true);
+ /*
+ * Prepare the WT_REFs for the move: this requires a stable split
+ * generation to block splits in newly created pages, so get one.
+ */
+ WT_ENTER_PAGE_INDEX(session);
+ __split_ref_prepare(session, alloc_index, session->split_gen, true);
/* Split into the parent. */
- WT_ERR(__split_parent(session, page_ref, alloc_index->index,
- alloc_index->entries, parent_incr, false, false));
+ if ((ret = __split_parent(session, page_ref, alloc_index->index,
+ alloc_index->entries, parent_incr, false, false)) == 0) {
+ /*
+ * Confirm the page's index hasn't moved, then update it, which
+ * makes the split visible to threads descending the tree.
+ */
+ WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(page) == pindex);
+ WT_INTL_INDEX_SET(page, replace_index);
+ }
+
+ WT_LEAVE_PAGE_INDEX(session);
+ WT_ERR(ret);
/*
- * Confirm the page's index hasn't moved, then update it, which makes
- * the split visible to threads descending the tree.
+ * Get a generation for this split, mark the page. This must be after
+ * the new index is swapped into place in order to know that no readers
+ * are looking at the old index.
*/
- WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(page) == pindex);
- WT_INTL_INDEX_SET(page, replace_index);
+ split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1);
+ page->pg_intl_split_gen = split_gen;
#ifdef HAVE_DIAGNOSTIC
WT_WITH_PAGE_INDEX(session,
diff --git a/src/third_party/wiredtiger/src/conn/conn_cache_pool.c b/src/third_party/wiredtiger/src/conn/conn_cache_pool.c
index 49b766f4602..ed078991581 100644
--- a/src/third_party/wiredtiger/src/conn/conn_cache_pool.c
+++ b/src/third_party/wiredtiger/src/conn/conn_cache_pool.c
@@ -418,8 +418,9 @@ static void
__cache_pool_balance(WT_SESSION_IMPL *session, bool forward)
{
WT_CACHE_POOL *cp;
- bool adjusted;
uint64_t bump_threshold, highest;
+ int i;
+ bool adjusted;
cp = __wt_process.cache_pool;
adjusted = false;
@@ -438,11 +439,17 @@ __cache_pool_balance(WT_SESSION_IMPL *session, bool forward)
/*
* Actively attempt to:
- * - Reduce the amount allocated, if we are over the budget
+ * - Reduce the amount allocated, if we are over the budget.
* - Increase the amount used if there is capacity and any pressure.
+ * Don't keep trying indefinitely: if we aren't succeeding in reducing
+ * the cache in use, re-assessing the participants' states is necessary.
+ * We are also holding a lock across this process, which can slow
+ * participant shutdown if we spend a long time balancing.
*/
- while (F_ISSET(cp, WT_CACHE_POOL_ACTIVE) &&
- F_ISSET(S2C(session)->cache, WT_CACHE_POOL_RUN)) {
+ for (i = 0;
+ i < 2 * WT_CACHE_POOL_BUMP_THRESHOLD &&
+ F_ISSET(cp, WT_CACHE_POOL_ACTIVE) &&
+ F_ISSET(S2C(session)->cache, WT_CACHE_POOL_RUN); i++) {
__cache_pool_adjust(
session, highest, bump_threshold, forward, &adjusted);
/*
@@ -565,7 +572,7 @@ __cache_pool_adjust(WT_SESSION_IMPL *session,
WT_CONNECTION_IMPL *entry;
uint64_t adjustment, highest_percentile, pressure, reserved, smallest;
u_int pct_full;
- bool busy, pool_full, grow;
+ bool busy, decrease_ok, grow, pool_full;
*adjustedp = false;
cp = __wt_process.cache_pool;
@@ -612,6 +619,34 @@ __cache_pool_adjust(WT_SESSION_IMPL *session,
continue;
/*
+ * The bump threshold decreases as we try longer to balance
+ * the pool. Adjust how aggressively we free space from
+ * participants depending on how long we have been trying.
+ */
+ decrease_ok = false;
+ /*
+ * Any participant is a candidate if we have been trying
+ * for long enough.
+ */
+ if (bump_threshold == 0)
+ decrease_ok = true;
+ /*
+ * Participants that aren't doing application eviction and
+ * are showing a reasonable amount of usage are excluded
+ * even if we have been trying for a while.
+ */
+ else if (bump_threshold < WT_CACHE_POOL_BUMP_THRESHOLD / 3 &&
+ (!busy && highest > 1))
+ decrease_ok = true;
+ /*
+ * Any participant that is proportionally less busy is a
+ * candidate from the first attempt.
+ */
+ else if (highest > 1 &&
+ pressure < WT_CACHE_POOL_REDUCE_THRESHOLD)
+ decrease_ok = true;
+
+ /*
* If the entry is currently allocated less than the reserved
* size, increase its allocation. This should only happen if:
* - it's the first time we've seen this member, or
@@ -624,17 +659,12 @@ __cache_pool_adjust(WT_SESSION_IMPL *session,
* Conditions for reducing the amount of resources for an
* entry:
* - the pool is full,
- * - application threads are not busy doing eviction already,
* - this entry has more than the minimum amount of space in
* use,
- * - the read pressure in this entry is below the threshold,
- * other entries need more cache, the entry has more than
- * the minimum space and there is no available space in the
- * pool.
+ * - it was determined that this slot is a good candidate
*/
- } else if (pool_full && !busy &&
- entry->cache_size > reserved &&
- pressure < WT_CACHE_POOL_REDUCE_THRESHOLD && highest > 1) {
+ } else if (pool_full &&
+ entry->cache_size > reserved && decrease_ok) {
grow = false;
/*
* Don't drop the size down too much - or it can
diff --git a/src/third_party/wiredtiger/src/conn/conn_dhandle.c b/src/third_party/wiredtiger/src/conn/conn_dhandle.c
index 866b8633f71..99213c5b557 100644
--- a/src/third_party/wiredtiger/src/conn/conn_dhandle.c
+++ b/src/third_party/wiredtiger/src/conn/conn_dhandle.c
@@ -38,6 +38,14 @@ __wt_conn_dhandle_alloc(
WT_DECL_RET;
uint64_t bucket;
+ /*
+ * Ensure no one beat us to creating the handle now that we hold the
+ * write lock.
+ */
+ if ((ret =
+ __wt_conn_dhandle_find(session, uri, checkpoint)) != WT_NOTFOUND)
+ return (ret);
+
WT_RET(__wt_calloc_one(session, &dhandle));
__wt_rwlock_init(session, &dhandle->rwlock);
diff --git a/src/third_party/wiredtiger/src/evict/evict_lru.c b/src/third_party/wiredtiger/src/evict/evict_lru.c
index 42fe4d4608e..f1949a7c320 100644
--- a/src/third_party/wiredtiger/src/evict/evict_lru.c
+++ b/src/third_party/wiredtiger/src/evict/evict_lru.c
@@ -1654,31 +1654,33 @@ __evict_walk_file(WT_SESSION_IMPL *session,
!F_ISSET(cache, WT_CACHE_EVICT_CLEAN))
min_pages *= 10;
+ walk_flags =
+ WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_GEN | WT_READ_NO_WAIT;
+
/*
* Choose a random point in the tree if looking for candidates in a
* tree with no starting point set. This is mostly aimed at ensuring
* eviction fairly visits all pages in trees with a lot of in-cache
* content.
*/
- if (btree->evict_ref == NULL) {
- /* Ensure internal pages indexes remain valid for our walk */
- WT_WITH_PAGE_INDEX(session, ret =
- __wt_random_descent(session, &btree->evict_ref, true));
- WT_RET_NOTFOUND_OK(ret);
-
- /*
- * Reverse the direction of the walk each time we start at a
- * random point so both ends of the tree are equally likely to
- * be visited.
- */
- btree->evict_walk_reverse = !btree->evict_walk_reverse;
- }
-
- walk_flags =
- WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_GEN | WT_READ_NO_WAIT;
-
- if (btree->evict_walk_reverse)
+ switch (btree->evict_walk_state) {
+ case WT_EVICT_WALK_NEXT:
+ break;
+ case WT_EVICT_WALK_PREV:
FLD_SET(walk_flags, WT_READ_PREV);
+ break;
+ case WT_EVICT_WALK_RAND_PREV:
+ FLD_SET(walk_flags, WT_READ_PREV);
+ /* FALLTHROUGH */
+ case WT_EVICT_WALK_RAND_NEXT:
+ if (btree->evict_ref == NULL) {
+ /* Ensure internal page indexes remain valid */
+ WT_WITH_PAGE_INDEX(session, ret = __wt_random_descent(
+ session, &btree->evict_ref, true));
+ WT_RET_NOTFOUND_OK(ret);
+ }
+ break;
+ }
/*
* Get some more eviction candidate pages, starting at the last saved
@@ -1713,8 +1715,16 @@ __evict_walk_file(WT_SESSION_IMPL *session,
pages_seen > min_pages &&
(pages_queued == 0 || (pages_seen / pages_queued) >
(min_pages / target_pages));
- if (give_up)
+ if (give_up) {
+ /*
+ * Try a different walk start point next time if a
+ * walk gave up.
+ */
+ btree->evict_walk_state =
+ (btree->evict_walk_state + 1) %
+ WT_EVICT_WALK_MAX_LEGAL_VALUE;
break;
+ }
if (ref == NULL) {
if (++restarts == 2)
diff --git a/src/third_party/wiredtiger/src/include/btree.h b/src/third_party/wiredtiger/src/include/btree.h
index d742310bf8f..976c1d2110c 100644
--- a/src/third_party/wiredtiger/src/include/btree.h
+++ b/src/third_party/wiredtiger/src/include/btree.h
@@ -141,7 +141,11 @@ struct __wt_btree {
u_int evict_walk_skips; /* Number of walks skipped */
u_int evict_disabled; /* Eviction disabled count */
volatile uint32_t evict_busy; /* Count of threads in eviction */
- bool evict_walk_reverse; /* Walk direction */
+ enum {
+ WT_EVICT_WALK_NEXT, WT_EVICT_WALK_PREV,
+ WT_EVICT_WALK_RAND_NEXT, WT_EVICT_WALK_RAND_PREV
+ } evict_walk_state; /* Eviction walk state */
+#define WT_EVICT_WALK_MAX_LEGAL_VALUE (WT_EVICT_WALK_RAND_PREV + 1) /* Number of walk states */
enum {
WT_CKPT_OFF, WT_CKPT_PREPARE, WT_CKPT_RUNNING
diff --git a/src/third_party/wiredtiger/src/log/log.c b/src/third_party/wiredtiger/src/log/log.c
index d6caa55f8c7..3477ca52502 100644
--- a/src/third_party/wiredtiger/src/log/log.c
+++ b/src/third_party/wiredtiger/src/log/log.c
@@ -24,7 +24,7 @@ static int __log_write_internal(
* __log_wait_for_earlier_slot --
* Wait for write_lsn to catch up to this slot.
*/
-static void
+static int
__log_wait_for_earlier_slot(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
{
WT_CONNECTION_IMPL *conn;
@@ -41,6 +41,7 @@ __log_wait_for_earlier_slot(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
* unlock in case an earlier thread is trying to switch its
* slot and complete its operation.
*/
+ WT_RET(WT_SESSION_CHECK_PANIC(session));
if (F_ISSET(session, WT_SESSION_LOCKED_SLOT))
__wt_spin_unlock(session, &log->log_slot_lock);
__wt_cond_signal(session, conn->log_wrlsn_cond);
@@ -51,6 +52,7 @@ __log_wait_for_earlier_slot(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
if (F_ISSET(session, WT_SESSION_LOCKED_SLOT))
__wt_spin_lock(session, &log->log_slot_lock);
}
+ return (0);
}
/*
@@ -70,7 +72,7 @@ __log_fs_write(WT_SESSION_IMPL *session,
* be a hole at the end of the previous log file that we cannot detect.
*/
if (slot->slot_release_lsn.l.file < slot->slot_start_lsn.l.file) {
- __log_wait_for_earlier_slot(session, slot);
+ WT_RET(__log_wait_for_earlier_slot(session, slot));
WT_RET(__wt_log_force_sync(session, &slot->slot_release_lsn));
}
if ((ret = __wt_write(session, slot->slot_fh, offset, len, buf)) != 0)
@@ -110,6 +112,7 @@ __wt_log_flush_lsn(WT_SESSION_IMPL *session, WT_LSN *lsn, bool start)
conn = S2C(session);
log = conn->log;
+ WT_RET(WT_SESSION_CHECK_PANIC(session));
WT_RET(__wt_log_force_write(session, 1, NULL));
__wt_log_wrlsn(session, NULL);
if (start)
@@ -174,6 +177,7 @@ __wt_log_force_sync(WT_SESSION_IMPL *session, WT_LSN *min_lsn)
* log file ready to close.
*/
while (log->sync_lsn.l.file < min_lsn->l.file) {
+ WT_RET(WT_SESSION_CHECK_PANIC(session));
__wt_cond_signal(session, S2C(session)->log_file_cond);
__wt_cond_wait(session, log->log_sync_cond, 10000, NULL);
}
@@ -1467,7 +1471,7 @@ __wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *freep)
* be holes in the log file.
*/
WT_STAT_CONN_INCR(session, log_release_write_lsn);
- __log_wait_for_earlier_slot(session, slot);
+ WT_ERR(__log_wait_for_earlier_slot(session, slot));
log->write_start_lsn = slot->slot_start_lsn;
log->write_lsn = slot->slot_end_lsn;
@@ -1488,6 +1492,7 @@ __wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *freep)
* current fsync completes and advance log->sync_lsn.
*/
while (F_ISSET(slot, WT_SLOT_SYNC | WT_SLOT_SYNC_DIR)) {
+ WT_ERR(WT_SESSION_CHECK_PANIC(session));
/*
* We have to wait until earlier log files have finished their
* sync operations. The most recent one will set the LSN to the
@@ -2178,15 +2183,19 @@ __log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp,
if (LF_ISSET(WT_LOG_FLUSH)) {
/* Wait for our writes to reach the OS */
while (__wt_log_cmp(&log->write_lsn, &lsn) <= 0 &&
- myslot.slot->slot_error == 0)
+ myslot.slot->slot_error == 0) {
+ WT_ERR(WT_SESSION_CHECK_PANIC(session));
__wt_cond_wait(
session, log->log_write_cond, 10000, NULL);
+ }
} else if (LF_ISSET(WT_LOG_FSYNC)) {
/* Wait for our writes to reach disk */
while (__wt_log_cmp(&log->sync_lsn, &lsn) <= 0 &&
- myslot.slot->slot_error == 0)
+ myslot.slot->slot_error == 0) {
+ WT_ERR(WT_SESSION_CHECK_PANIC(session));
__wt_cond_wait(
session, log->log_sync_cond, 10000, NULL);
+ }
}
/*
diff --git a/src/third_party/wiredtiger/src/log/log_slot.c b/src/third_party/wiredtiger/src/log/log_slot.c
index 542f010ea53..b4655ff6c1a 100644
--- a/src/third_party/wiredtiger/src/log/log_slot.c
+++ b/src/third_party/wiredtiger/src/log/log_slot.c
@@ -220,6 +220,7 @@ __log_slot_switch_internal(
if (slot != log->active_slot)
return (0);
+ WT_RET(WT_SESSION_CHECK_PANIC(session));
/*
* We may come through here multiple times if we were able to close
* a slot but could not set up a new one. If we closed it already,
@@ -582,6 +583,7 @@ __wt_log_slot_release(WT_SESSION_IMPL *session, WT_MYSLOT *myslot, int64_t size)
* was written rather than the beginning record of the slot.
*/
while ((cur_offset = slot->slot_last_offset) < my_start) {
+ WT_RET(WT_SESSION_CHECK_PANIC(session));
/*
* Set our offset if we are larger.
*/