author    Alex Gorrod <alexander.gorrod@mongodb.com>  2017-02-20 15:19:37 +1100
committer Alex Gorrod <alexander.gorrod@mongodb.com>  2017-02-20 15:19:37 +1100
commit    d6659de8d742b9562d08c1ba5138be881f8e24fa (patch)
tree      1f947ab2c784f20836277a57e93dd026f1d5cf86
parent    e1bcc30da91eedd0b17cebb725cc7e607ffa2340 (diff)
parent    e7b2a53c33271598c9041eec8363c95ff37daa58 (diff)
Merge branch 'develop' into mongodb-3.6
-rw-r--r--  src/btree/bt_random.c        10
-rw-r--r--  src/conn/conn_cache_pool.c   56
-rw-r--r--  src/evict/evict_lru.c        48
-rw-r--r--  src/include/btree.h           6
-rw-r--r--  src/log/log.c                19
-rw-r--r--  src/log/log_slot.c            2
6 files changed, 103 insertions, 38 deletions
diff --git a/src/btree/bt_random.c b/src/btree/bt_random.c
index 44de511f787..4c7ff861d26 100644
--- a/src/btree/bt_random.c
+++ b/src/btree/bt_random.c
@@ -201,6 +201,16 @@ restart: /*
current = &btree->root;
for (;;) {
page = current->page;
+ /*
+ * When walking a tree for eviction, an exclusive operation may
+ * be in progress, leaving the root page invalid. Just give up
+ * in that case.
+ */
+ if (page == NULL) {
+ WT_ASSERT(session, eviction);
+ break;
+ }
+
if (!WT_PAGE_IS_INTERNAL(page))
break;
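
For reference, a minimal sketch (not the WiredTiger code) of the guard this hunk adds: an eviction-driven descent can find the root page cleared by a concurrent exclusive operation, and must give up rather than dereference NULL. The helper name is hypothetical; the types and macros are taken from the diff.

static int
__check_root_valid(WT_SESSION_IMPL *session, WT_REF *current, bool eviction)
{
	/* Only the eviction path expects to find an empty root. */
	if (current->page == NULL) {
		WT_ASSERT(session, eviction);
		return (WT_NOTFOUND);	/* Caller uses WT_RET_NOTFOUND_OK(). */
	}
	return (0);
}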
diff --git a/src/conn/conn_cache_pool.c b/src/conn/conn_cache_pool.c
index 49b766f4602..ed078991581 100644
--- a/src/conn/conn_cache_pool.c
+++ b/src/conn/conn_cache_pool.c
@@ -418,8 +418,9 @@ static void
__cache_pool_balance(WT_SESSION_IMPL *session, bool forward)
{
WT_CACHE_POOL *cp;
- bool adjusted;
uint64_t bump_threshold, highest;
+ int i;
+ bool adjusted;
cp = __wt_process.cache_pool;
adjusted = false;
@@ -438,11 +439,17 @@ __cache_pool_balance(WT_SESSION_IMPL *session, bool forward)
/*
* Actively attempt to:
- * - Reduce the amount allocated, if we are over the budget
+ * - Reduce the amount allocated, if we are over the budget.
* - Increase the amount used if there is capacity and any pressure.
+ * Don't keep trying indefinitely: if we aren't succeeding in
+ * reducing the cache in use, the participants' states need to be
+ * re-assessed.
+ * We are also holding a lock across this process, which can slow
+ * participant shutdown if we spend a long time balancing.
*/
- while (F_ISSET(cp, WT_CACHE_POOL_ACTIVE) &&
- F_ISSET(S2C(session)->cache, WT_CACHE_POOL_RUN)) {
+ for (i = 0;
+ i < 2 * WT_CACHE_POOL_BUMP_THRESHOLD &&
+ F_ISSET(cp, WT_CACHE_POOL_ACTIVE) &&
+ F_ISSET(S2C(session)->cache, WT_CACHE_POOL_RUN); i++) {
__cache_pool_adjust(
session, highest, bump_threshold, forward, &adjusted);
/*
@@ -565,7 +572,7 @@ __cache_pool_adjust(WT_SESSION_IMPL *session,
WT_CONNECTION_IMPL *entry;
uint64_t adjustment, highest_percentile, pressure, reserved, smallest;
u_int pct_full;
- bool busy, pool_full, grow;
+ bool busy, decrease_ok, grow, pool_full;
*adjustedp = false;
cp = __wt_process.cache_pool;
@@ -612,6 +619,34 @@ __cache_pool_adjust(WT_SESSION_IMPL *session,
continue;
/*
+ * The bump threshold decreases as we try longer to balance
+ * the pool. Adjust how aggressively we free space from
+ * participants depending on how long we have been trying.
+ */
+ decrease_ok = false;
+ /*
+ * Any participant is a candidate if we have been trying
+ * for long enough.
+ */
+ if (bump_threshold == 0)
+ decrease_ok = true;
+ /*
+ * Participants that aren't doing application eviction and
+ * are showing a reasonable amount of usage become candidates
+ * once we have been trying for a while.
+ */
+ else if (bump_threshold < WT_CACHE_POOL_BUMP_THRESHOLD / 3 &&
+ (!busy && highest > 1))
+ decrease_ok = true;
+ /*
+ * Any participant that is proportionally less busy is a
+ * candidate from the first attempt.
+ */
+ else if (highest > 1 &&
+ pressure < WT_CACHE_POOL_REDUCE_THRESHOLD)
+ decrease_ok = true;
+
+ /*
* If the entry is currently allocated less than the reserved
* size, increase its allocation. This should only happen if:
* - it's the first time we've seen this member, or
@@ -624,17 +659,12 @@ __cache_pool_adjust(WT_SESSION_IMPL *session,
* Conditions for reducing the amount of resources for an
* entry:
* - the pool is full,
- * - application threads are not busy doing eviction already,
* - this entry has more than the minimum amount of space in
* use,
- * - the read pressure in this entry is below the threshold,
- * other entries need more cache, the entry has more than
- * the minimum space and there is no available space in the
- * pool.
+ * - it was determined that this slot is a good candidate
*/
- } else if (pool_full && !busy &&
- entry->cache_size > reserved &&
- pressure < WT_CACHE_POOL_REDUCE_THRESHOLD && highest > 1) {
+ } else if (pool_full &&
+ entry->cache_size > reserved && decrease_ok) {
grow = false;
/*
* Don't drop the size down too much - or it can
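
The decrease_ok decision above reads as a three-case predicate. A hedged restatement as a standalone function (the function itself is hypothetical; the variable names and WT_CACHE_POOL_* thresholds come from the diff):

static bool
__cache_pool_decrease_ok(uint64_t bump_threshold, bool busy,
    uint64_t highest, uint64_t pressure)
{
	/* After trying long enough, any participant is a candidate. */
	if (bump_threshold == 0)
		return (true);
	/*
	 * Part way through, accept participants that aren't doing
	 * application eviction while the pool shows real usage.
	 */
	if (bump_threshold < WT_CACHE_POOL_BUMP_THRESHOLD / 3 &&
	    !busy && highest > 1)
		return (true);
	/* Proportionally less busy participants qualify immediately. */
	if (highest > 1 && pressure < WT_CACHE_POOL_REDUCE_THRESHOLD)
		return (true);
	return (false);
}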
diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c
index 42fe4d4608e..f1949a7c320 100644
--- a/src/evict/evict_lru.c
+++ b/src/evict/evict_lru.c
@@ -1654,31 +1654,33 @@ __evict_walk_file(WT_SESSION_IMPL *session,
!F_ISSET(cache, WT_CACHE_EVICT_CLEAN))
min_pages *= 10;
+ walk_flags =
+ WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_GEN | WT_READ_NO_WAIT;
+
/*
* Choose a random point in the tree if looking for candidates in a
* tree with no starting point set. This is mostly aimed at ensuring
* eviction fairly visits all pages in trees with a lot of in-cache
* content.
*/
- if (btree->evict_ref == NULL) {
- /* Ensure internal pages indexes remain valid for our walk */
- WT_WITH_PAGE_INDEX(session, ret =
- __wt_random_descent(session, &btree->evict_ref, true));
- WT_RET_NOTFOUND_OK(ret);
-
- /*
- * Reverse the direction of the walk each time we start at a
- * random point so both ends of the tree are equally likely to
- * be visited.
- */
- btree->evict_walk_reverse = !btree->evict_walk_reverse;
- }
-
- walk_flags =
- WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_GEN | WT_READ_NO_WAIT;
-
- if (btree->evict_walk_reverse)
+ switch (btree->evict_walk_state) {
+ case WT_EVICT_WALK_NEXT:
+ break;
+ case WT_EVICT_WALK_PREV:
FLD_SET(walk_flags, WT_READ_PREV);
+ break;
+ case WT_EVICT_WALK_RAND_PREV:
+ FLD_SET(walk_flags, WT_READ_PREV);
+ /* FALLTHROUGH */
+ case WT_EVICT_WALK_RAND_NEXT:
+ if (btree->evict_ref == NULL) {
+ /* Ensure internal page indexes remain valid */
+ WT_WITH_PAGE_INDEX(session, ret = __wt_random_descent(
+ session, &btree->evict_ref, true));
+ WT_RET_NOTFOUND_OK(ret);
+ }
+ break;
+ }
/*
* Get some more eviction candidate pages, starting at the last saved
@@ -1713,8 +1715,16 @@ __evict_walk_file(WT_SESSION_IMPL *session,
pages_seen > min_pages &&
(pages_queued == 0 || (pages_seen / pages_queued) >
(min_pages / target_pages));
- if (give_up)
+ if (give_up) {
+ /*
+ * Try a different walk start point next time if a
+ * walk gave up.
+ */
+ btree->evict_walk_state =
+ (btree->evict_walk_state + 1) %
+ WT_EVICT_WALK_MAX_LEGAL_VALUE;
break;
+ }
if (ref == NULL) {
if (++restarts == 2)
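
The give_up test referenced in the last hunk combines a minimum page count with a queued-to-seen ratio. A restatement as a standalone predicate (the helper is illustrative only; the expression mirrors the diff):

static bool
__evict_walk_give_up(uint64_t pages_seen, uint64_t pages_queued,
    uint64_t min_pages, uint64_t target_pages)
{
	/* Never give up before a minimum number of pages has been seen. */
	if (pages_seen <= min_pages)
		return (false);
	/*
	 * Give up if nothing was queued, or if pages are being queued at
	 * too low a rate relative to the target.
	 */
	return (pages_queued == 0 ||
	    pages_seen / pages_queued > min_pages / target_pages);
}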
diff --git a/src/include/btree.h b/src/include/btree.h
index d742310bf8f..976c1d2110c 100644
--- a/src/include/btree.h
+++ b/src/include/btree.h
@@ -141,7 +141,11 @@ struct __wt_btree {
u_int evict_walk_skips; /* Number of walks skipped */
u_int evict_disabled; /* Eviction disabled count */
volatile uint32_t evict_busy; /* Count of threads in eviction */
- bool evict_walk_reverse; /* Walk direction */
+ enum {
+ WT_EVICT_WALK_NEXT, WT_EVICT_WALK_PREV,
+ WT_EVICT_WALK_RAND_NEXT, WT_EVICT_WALK_RAND_PREV
+ } evict_walk_state; /* Eviction walk state */
+#define WT_EVICT_WALK_MAX_LEGAL_VALUE (WT_EVICT_WALK_RAND_PREV + 1)
enum {
WT_CKPT_OFF, WT_CKPT_PREPARE, WT_CKPT_RUNNING
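
Tying the new enum to the rotation in evict_lru.c: when a walk gives up, the state advances through all four values and wraps, so successive attempts alternate between sequential and random starting points. A sketch of the wraparound (the helper is hypothetical; WT_EVICT_WALK_MAX_LEGAL_VALUE evaluates to 4, one past the last legal state):

static inline void
__evict_walk_state_advance(WT_BTREE *btree)
{
	/* WT_EVICT_WALK_RAND_PREV wraps back to WT_EVICT_WALK_NEXT. */
	btree->evict_walk_state =
	    (btree->evict_walk_state + 1) % WT_EVICT_WALK_MAX_LEGAL_VALUE;
}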
diff --git a/src/log/log.c b/src/log/log.c
index d6caa55f8c7..3477ca52502 100644
--- a/src/log/log.c
+++ b/src/log/log.c
@@ -24,7 +24,7 @@ static int __log_write_internal(
* __log_wait_for_earlier_slot --
* Wait for write_lsn to catch up to this slot.
*/
-static void
+static int
__log_wait_for_earlier_slot(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
{
WT_CONNECTION_IMPL *conn;
@@ -41,6 +41,7 @@ __log_wait_for_earlier_slot(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
* unlock in case an earlier thread is trying to switch its
* slot and complete its operation.
*/
+ WT_RET(WT_SESSION_CHECK_PANIC(session));
if (F_ISSET(session, WT_SESSION_LOCKED_SLOT))
__wt_spin_unlock(session, &log->log_slot_lock);
__wt_cond_signal(session, conn->log_wrlsn_cond);
@@ -51,6 +52,7 @@ __log_wait_for_earlier_slot(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
if (F_ISSET(session, WT_SESSION_LOCKED_SLOT))
__wt_spin_lock(session, &log->log_slot_lock);
}
+ return (0);
}
/*
@@ -70,7 +72,7 @@ __log_fs_write(WT_SESSION_IMPL *session,
* be a hole at the end of the previous log file that we cannot detect.
*/
if (slot->slot_release_lsn.l.file < slot->slot_start_lsn.l.file) {
- __log_wait_for_earlier_slot(session, slot);
+ WT_RET(__log_wait_for_earlier_slot(session, slot));
WT_RET(__wt_log_force_sync(session, &slot->slot_release_lsn));
}
if ((ret = __wt_write(session, slot->slot_fh, offset, len, buf)) != 0)
@@ -110,6 +112,7 @@ __wt_log_flush_lsn(WT_SESSION_IMPL *session, WT_LSN *lsn, bool start)
conn = S2C(session);
log = conn->log;
+ WT_RET(WT_SESSION_CHECK_PANIC(session));
WT_RET(__wt_log_force_write(session, 1, NULL));
__wt_log_wrlsn(session, NULL);
if (start)
@@ -174,6 +177,7 @@ __wt_log_force_sync(WT_SESSION_IMPL *session, WT_LSN *min_lsn)
* log file ready to close.
*/
while (log->sync_lsn.l.file < min_lsn->l.file) {
+ WT_RET(WT_SESSION_CHECK_PANIC(session));
__wt_cond_signal(session, S2C(session)->log_file_cond);
__wt_cond_wait(session, log->log_sync_cond, 10000, NULL);
}
@@ -1467,7 +1471,7 @@ __wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *freep)
* be holes in the log file.
*/
WT_STAT_CONN_INCR(session, log_release_write_lsn);
- __log_wait_for_earlier_slot(session, slot);
+ WT_ERR(__log_wait_for_earlier_slot(session, slot));
log->write_start_lsn = slot->slot_start_lsn;
log->write_lsn = slot->slot_end_lsn;
@@ -1488,6 +1492,7 @@ __wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *freep)
* current fsync completes and advance log->sync_lsn.
*/
while (F_ISSET(slot, WT_SLOT_SYNC | WT_SLOT_SYNC_DIR)) {
+ WT_ERR(WT_SESSION_CHECK_PANIC(session));
/*
* We have to wait until earlier log files have finished their
* sync operations. The most recent one will set the LSN to the
@@ -2178,15 +2183,19 @@ __log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp,
if (LF_ISSET(WT_LOG_FLUSH)) {
/* Wait for our writes to reach the OS */
while (__wt_log_cmp(&log->write_lsn, &lsn) <= 0 &&
- myslot.slot->slot_error == 0)
+ myslot.slot->slot_error == 0) {
+ WT_ERR(WT_SESSION_CHECK_PANIC(session));
__wt_cond_wait(
session, log->log_write_cond, 10000, NULL);
+ }
} else if (LF_ISSET(WT_LOG_FSYNC)) {
/* Wait for our writes to reach disk */
while (__wt_log_cmp(&log->sync_lsn, &lsn) <= 0 &&
- myslot.slot->slot_error == 0)
+ myslot.slot->slot_error == 0) {
+ WT_ERR(WT_SESSION_CHECK_PANIC(session));
__wt_cond_wait(
session, log->log_sync_cond, 10000, NULL);
+ }
}
/*
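
The log.c changes share one pattern: every loop that can wait on a condition variable first checks whether the connection has panicked, so a failed log subsystem returns an error instead of blocking forever (this is also why __log_wait_for_earlier_slot now returns int). A minimal sketch of the pattern, with a hypothetical "done" flag standing in for the LSN comparisons used in the real loops:

static int
__log_wait_example(WT_SESSION_IMPL *session, WT_CONDVAR *cond,
    volatile bool *done)
{
	while (!*done) {
		/* Fail fast if the connection has panicked. */
		WT_RET(WT_SESSION_CHECK_PANIC(session));
		/* Otherwise wait briefly and re-check, as log.c does. */
		__wt_cond_wait(session, cond, 10000, NULL);
	}
	return (0);
}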
diff --git a/src/log/log_slot.c b/src/log/log_slot.c
index 542f010ea53..b4655ff6c1a 100644
--- a/src/log/log_slot.c
+++ b/src/log/log_slot.c
@@ -220,6 +220,7 @@ __log_slot_switch_internal(
if (slot != log->active_slot)
return (0);
+ WT_RET(WT_SESSION_CHECK_PANIC(session));
/*
* We may come through here multiple times if we were able to close
* a slot but could not set up a new one. If we closed it already,
@@ -582,6 +583,7 @@ __wt_log_slot_release(WT_SESSION_IMPL *session, WT_MYSLOT *myslot, int64_t size)
* was written rather than the beginning record of the slot.
*/
while ((cur_offset = slot->slot_last_offset) < my_start) {
+ WT_RET(WT_SESSION_CHECK_PANIC(session));
/*
* Set our offset if we are larger.
*/
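
The loop at the end of __wt_log_slot_release has the shape of an "advance a shared maximum" retry: keep trying to push slot_last_offset forward while it is still behind this thread's start offset, now with the panic check inside the retry. A generic, self-contained sketch of that shape using C11 atomics (illustrative only; WiredTiger uses its own atomic wrappers rather than <stdatomic.h>):

#include <stdatomic.h>
#include <stdint.h>

static void
advance_last_offset(_Atomic int64_t *last_offset, int64_t my_start)
{
	int64_t cur = atomic_load(last_offset);

	/* Retry the CAS until we win or the offset is already past us. */
	while (cur < my_start &&
	    !atomic_compare_exchange_weak(last_offset, &cur, my_start))
		;
}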