summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Michael Cahill <michael.cahill@mongodb.com> 2017-11-01 20:58:31 -0400
committer Alex Gorrod <alexander.gorrod@mongodb.com> 2017-11-02 11:58:31 +1100
commit 4907cf82a81b2eae27f086c44f8ce13d24e2038a (patch)
tree 061a8d25ccc1e82ea6f7e1d386c8d6b34ab0c045
parent 942901116294b5e60266aa5fa1036e2607d4196e (diff)
download mongo-4907cf82a81b2eae27f086c44f8ce13d24e2038a.tar.gz
WT-3675 Fix the lookaside interactions with checkpoints. (#3776)
Fix the lookaside info saved by reconciliation and how lookaside interacts with checkpoints. Previously, we tracked whether eviction was successful, and if so, continued the checkpoint from after the evicted page. That could skip over pages in some cases (presumably if eviction caused a split). Instead, simplify the loop to make eviction advisory. If eviction succeeds, it should leave the reference in a state where checkpoint can skip over it quickly. If eviction fails, it may still have written the reference and left it clean, saving work for checkpoint. Either way, checkpoint visits every reference in the tree regardless of splits.
-rw-r--r-- src/btree/bt_read.c 5
-rw-r--r-- src/btree/bt_sync.c 71
-rw-r--r-- src/include/btmem.h 4
-rw-r--r-- src/reconcile/rec_write.c 28
-rw-r--r-- src/txn/txn_timestamp.c 11
5 files changed, 58 insertions, 61 deletions
diff --git a/src/btree/bt_read.c b/src/btree/bt_read.c
index 7ecf1ca3bf4..fe6be6517a2 100644
--- a/src/btree/bt_read.c
+++ b/src/btree/bt_read.c
@@ -479,6 +479,11 @@ __las_page_skip(WT_SESSION_IMPL *session, WT_REF *ref)
* Skip lookaside pages if reading as of a timestamp and all the
* updates are in the future.
*/
+ WT_ASSERT(session,
+ !F_ISSET(&session->txn, WT_TXN_HAS_TS_READ) ||
+ __wt_timestamp_cmp(&ref->page_las->onpage_timestamp,
+ &session->txn.read_timestamp) <= 0);
+
if (F_ISSET(&session->txn, WT_TXN_HAS_TS_READ) &&
ref->page_las->las_skew_oldest &&
__wt_timestamp_cmp(
diff --git a/src/btree/bt_sync.c b/src/btree/bt_sync.c
index 0f81b626ed7..d15852af935 100644
--- a/src/btree/bt_sync.c
+++ b/src/btree/bt_sync.c
@@ -107,36 +107,6 @@ __sync_dup_walk(
}
/*
- * __sync_evict_page --
- * Attempt to evict a page during a checkpoint walk.
- */
-static int
-__sync_evict_page(WT_SESSION_IMPL *session, WT_REF **walkp, uint32_t flags)
-{
- WT_DECL_RET;
- WT_REF *next, *to_evict;
-
- to_evict = *walkp;
- next = NULL;
-
- /*
- * Get the ref after the page we're trying to evicting. If the
- * eviction is successful, the walk will continue from here.
- */
- WT_RET(__sync_dup_walk(session, to_evict, flags, &next));
- WT_ERR(__wt_tree_walk(session, &next, flags));
-
- WT_ERR(__wt_page_release_evict(session, to_evict));
-
- /* Success: continue the walk at the next page. */
- *walkp = next;
- return (0);
-
-err: WT_TRET(__wt_page_release(session, next, flags));
- return (ret);
-}
-
-/*
* __sync_file --
* Flush pages for a specific file.
*/
@@ -153,13 +123,13 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
uint64_t internal_bytes, internal_pages, leaf_bytes, leaf_pages;
uint64_t oldest_id, saved_pinned_id;
uint32_t flags;
- bool evict_failed, skip_walk, timer;
+ bool timer, tried_eviction;
conn = S2C(session);
btree = S2BT(session);
prev = walk = NULL;
txn = &session->txn;
- evict_failed = skip_walk = false;
+ tried_eviction = false;
flags = WT_READ_CACHE | WT_READ_NO_GEN;
internal_bytes = leaf_bytes = 0;
@@ -266,12 +236,8 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
LF_SET(WT_READ_LOOKASIDE | WT_READ_WONT_NEED);
for (;;) {
- if (!skip_walk) {
- WT_ERR(__sync_dup_walk(
- session, walk, flags, &prev));
- WT_ERR(__wt_tree_walk(session, &walk, flags));
- }
- skip_walk = false;
+ WT_ERR(__sync_dup_walk(session, walk, flags, &prev));
+ WT_ERR(__wt_tree_walk(session, &walk, flags));
if (walk == NULL)
break;
@@ -317,27 +283,26 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
* visit. We want to avoid this code being too special
* purpose, so try to reuse the ordinary eviction path.
*
- * If eviction succeeded, it steps to the next ref, so
- * we have to skip the next walk. If eviction fails,
- * remember so we don't retry it.
+ * Regardless of whether eviction succeeds or fails,
+ * the walk continues from the previous location. We
+ * remember whether we tried eviction, and don't try
+ * again. Even if eviction fails (the page may stay in
+ * cache clean but with history that cannot be
+ * discarded), that is not wasted effort because
+ * checkpoint doesn't need to write the page again.
*/
if (!WT_PAGE_IS_INTERNAL(page) &&
page->read_gen == WT_READGEN_WONT_NEED &&
- !evict_failed) {
- if ((ret = __sync_evict_page(
- session, &walk, flags)) == 0) {
- evict_failed = false;
- skip_walk = true;
- } else {
- walk = prev;
- prev = NULL;
- evict_failed = true;
- }
- WT_ERR_BUSY_OK(ret);
+ !tried_eviction) {
+ WT_ERR_BUSY_OK(
+ __wt_page_release_evict(session, walk));
+ walk = prev;
+ prev = NULL;
+ tried_eviction = true;
continue;
}
+ tried_eviction = false;
- evict_failed = false;
WT_ERR(__wt_reconcile(
session, walk, NULL, WT_REC_CHECKPOINT, NULL));
diff --git a/src/include/btmem.h b/src/include/btmem.h
index 4b51a6b48cd..c3646a2ae59 100644
--- a/src/include/btmem.h
+++ b/src/include/btmem.h
@@ -193,8 +193,8 @@ struct __wt_page_lookaside {
uint64_t las_pageid; /* Page ID in lookaside */
uint64_t las_max_txn; /* Maximum transaction ID in
lookaside */
- WT_DECL_TIMESTAMP(min_timestamp) /* Oldest timestamp in
- lookaside for the page */
+ WT_DECL_TIMESTAMP(min_timestamp) /* Min timestamp in lookaside */
+ WT_DECL_TIMESTAMP(onpage_timestamp) /* Max timestamp on page */
bool las_skew_oldest; /* On-page skewed to oldest */
};
diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c
index cd71521f873..01b6d100a0c 100644
--- a/src/reconcile/rec_write.c
+++ b/src/reconcile/rec_write.c
@@ -48,6 +48,7 @@ typedef struct {
/* Track the page's min/maximum transactions. */
uint64_t max_txn;
WT_DECL_TIMESTAMP(max_timestamp)
+ WT_DECL_TIMESTAMP(max_onpage_timestamp)
WT_DECL_TIMESTAMP(min_saved_timestamp)
u_int updates_seen; /* Count of updates seen. */
@@ -978,6 +979,7 @@ __rec_init(WT_SESSION_IMPL *session,
r->max_txn = WT_TXN_NONE;
#ifdef HAVE_TIMESTAMPS
__wt_timestamp_set_zero(&r->max_timestamp);
+ __wt_timestamp_set_zero(&r->max_onpage_timestamp);
__wt_timestamp_set_inf(&r->min_saved_timestamp);
#endif
@@ -1459,16 +1461,21 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
#ifdef HAVE_TIMESTAMPS
/* Track the oldest saved timestamp for lookaside. */
- if (F_ISSET(r, WT_REC_LOOKASIDE))
- for (upd = first_upd; upd != NULL; upd = upd->next) {
+ if (F_ISSET(r, WT_REC_LOOKASIDE)) {
+ /* If no updates had timestamps, we're done. */
+ if (first_ts_upd == NULL)
+ __wt_timestamp_set_zero(&r->min_saved_timestamp);
+ for (upd = first_upd; upd != *updp; upd = upd->next) {
if (upd->txnid != WT_TXN_ABORTED &&
__wt_timestamp_cmp(&upd->timestamp,
&r->min_saved_timestamp) < 0)
__wt_timestamp_set(&r->min_saved_timestamp,
&upd->timestamp);
- if (upd == *updp)
- break;
+
+ WT_ASSERT(session, upd->txnid == WT_TXN_ABORTED ||
+ WT_TXNID_LE(upd->txnid, r->max_txn));
}
+ }
#endif
check_original_value:
@@ -1493,6 +1500,12 @@ check_original_value:
WT_RET(
__rec_append_orig_value(session, page, first_upd, vpack));
+#ifdef HAVE_TIMESTAMPS
+ if ((upd = *updp) != NULL &&
+ __wt_timestamp_cmp(&upd->timestamp, &r->max_onpage_timestamp) > 0)
+ __wt_timestamp_set(&r->max_onpage_timestamp, &upd->timestamp);
+#endif
+
return (0);
}
@@ -3283,7 +3296,7 @@ __rec_split_write_supd(WT_SESSION_IMPL *session,
WT_RET(__rec_supd_move(session, multi, r->supd, r->supd_next));
r->supd_next = 0;
r->supd_memsize = 0;
- return (0);
+ goto done;
}
/*
@@ -3343,12 +3356,15 @@ __rec_split_write_supd(WT_SESSION_IMPL *session,
r->supd_next = j;
}
- /* Track the oldest timestamp seen so far. */
+done: /* Track the oldest timestamp seen so far. */
multi->page_las.las_skew_oldest = r->las_skew_oldest;
multi->page_las.las_max_txn = r->max_txn;
+ WT_ASSERT(session, r->max_txn != WT_TXN_NONE);
#ifdef HAVE_TIMESTAMPS
__wt_timestamp_set(
&multi->page_las.min_timestamp, &r->min_saved_timestamp);
+ __wt_timestamp_set(
+ &multi->page_las.onpage_timestamp, &r->max_onpage_timestamp);
#endif
err: __wt_scr_free(session, &key);
diff --git a/src/txn/txn_timestamp.c b/src/txn/txn_timestamp.c
index 0201036684d..c52700e6b69 100644
--- a/src/txn/txn_timestamp.c
+++ b/src/txn/txn_timestamp.c
@@ -687,6 +687,17 @@ __wt_txn_clear_read_timestamp(WT_SESSION_IMPL *session)
if (!F_ISSET(txn, WT_TXN_PUBLIC_TS_READ))
return;
+#ifdef HAVE_DIAGNOSTIC
+ {
+ wt_timestamp_t pinned_ts;
+
+ WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock,
+ __wt_timestamp_set(&pinned_ts, &txn_global->pinned_timestamp));
+ WT_ASSERT(session,
+ __wt_timestamp_cmp(&txn->read_timestamp, &pinned_ts) >= 0);
+ }
+#endif
+
__wt_writelock(session, &txn_global->read_timestamp_rwlock);
TAILQ_REMOVE(&txn_global->read_timestamph, txn, read_timestampq);
--txn_global->read_timestampq_len;