diff options
author | Michael Cahill <michael.cahill@mongodb.com> | 2017-11-01 20:58:31 -0400 |
---|---|---|
committer | Alex Gorrod <alexander.gorrod@mongodb.com> | 2017-11-02 11:58:31 +1100 |
commit | 4907cf82a81b2eae27f086c44f8ce13d24e2038a (patch) | |
tree | 061a8d25ccc1e82ea6f7e1d386c8d6b34ab0c045 /src | |
parent | 942901116294b5e60266aa5fa1036e2607d4196e (diff) | |
download | mongo-4907cf82a81b2eae27f086c44f8ce13d24e2038a.tar.gz |
WT-3675 Fix the lookaside interactions with checkpoints. (#3776)
Fix the lookaside info saved by reconciliation and how lookaside interacts with checkpoints.
Previously, we tracked whether eviction was successful, and if so,
continued the checkpoint from after the evicted page. That could skip
over pages in some cases (presumably if eviction caused a split).
Instead, simplify the loop to make eviction advisory. If eviction
succeeds, it should leave the reference in a state where checkpoint can
skip over it quickly. If eviction fails, it may still have written the
reference and leave it clean, saving work for checkpoint. Either way,
checkpoint visits every reference in the tree regardless of splits.
Diffstat (limited to 'src')
-rw-r--r-- | src/btree/bt_read.c | 5 | ||||
-rw-r--r-- | src/btree/bt_sync.c | 71 | ||||
-rw-r--r-- | src/include/btmem.h | 4 | ||||
-rw-r--r-- | src/reconcile/rec_write.c | 28 | ||||
-rw-r--r-- | src/txn/txn_timestamp.c | 11 |
5 files changed, 58 insertions, 61 deletions
diff --git a/src/btree/bt_read.c b/src/btree/bt_read.c index 7ecf1ca3bf4..fe6be6517a2 100644 --- a/src/btree/bt_read.c +++ b/src/btree/bt_read.c @@ -479,6 +479,11 @@ __las_page_skip(WT_SESSION_IMPL *session, WT_REF *ref) * Skip lookaside pages if reading as of a timestamp and all the * updates are in the future. */ + WT_ASSERT(session, + !F_ISSET(&session->txn, WT_TXN_HAS_TS_READ) || + __wt_timestamp_cmp(&ref->page_las->onpage_timestamp, + &session->txn.read_timestamp) <= 0); + if (F_ISSET(&session->txn, WT_TXN_HAS_TS_READ) && ref->page_las->las_skew_oldest && __wt_timestamp_cmp( diff --git a/src/btree/bt_sync.c b/src/btree/bt_sync.c index 0f81b626ed7..d15852af935 100644 --- a/src/btree/bt_sync.c +++ b/src/btree/bt_sync.c @@ -107,36 +107,6 @@ __sync_dup_walk( } /* - * __sync_evict_page -- - * Attempt to evict a page during a checkpoint walk. - */ -static int -__sync_evict_page(WT_SESSION_IMPL *session, WT_REF **walkp, uint32_t flags) -{ - WT_DECL_RET; - WT_REF *next, *to_evict; - - to_evict = *walkp; - next = NULL; - - /* - * Get the ref after the page we're trying to evicting. If the - * eviction is successful, the walk will continue from here. - */ - WT_RET(__sync_dup_walk(session, to_evict, flags, &next)); - WT_ERR(__wt_tree_walk(session, &next, flags)); - - WT_ERR(__wt_page_release_evict(session, to_evict)); - - /* Success: continue the walk at the next page. */ - *walkp = next; - return (0); - -err: WT_TRET(__wt_page_release(session, next, flags)); - return (ret); -} - -/* * __sync_file -- * Flush pages for a specific file. */ @@ -153,13 +123,13 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) uint64_t internal_bytes, internal_pages, leaf_bytes, leaf_pages; uint64_t oldest_id, saved_pinned_id; uint32_t flags; - bool evict_failed, skip_walk, timer; + bool timer, tried_eviction; conn = S2C(session); btree = S2BT(session); prev = walk = NULL; txn = &session->txn; - evict_failed = skip_walk = false; + tried_eviction = false; flags = WT_READ_CACHE | WT_READ_NO_GEN; internal_bytes = leaf_bytes = 0; @@ -266,12 +236,8 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) LF_SET(WT_READ_LOOKASIDE | WT_READ_WONT_NEED); for (;;) { - if (!skip_walk) { - WT_ERR(__sync_dup_walk( - session, walk, flags, &prev)); - WT_ERR(__wt_tree_walk(session, &walk, flags)); - } - skip_walk = false; + WT_ERR(__sync_dup_walk(session, walk, flags, &prev)); + WT_ERR(__wt_tree_walk(session, &walk, flags)); if (walk == NULL) break; @@ -317,27 +283,26 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) * visit. We want to avoid this code being too special * purpose, so try to reuse the ordinary eviction path. * - * If eviction succeeded, it steps to the next ref, so - * we have to skip the next walk. If eviction fails, - * remember so we don't retry it. + * Regardless of whether eviction succeeds or fails, + * the walk continues from the previous location. We + * remember whether we tried eviction, and don't try + * again. Even if eviction fails (the page may stay in + * cache clean but with history that cannot be + * discarded), that is not wasted effort because + * checkpoint doesn't need to write the page again. */ if (!WT_PAGE_IS_INTERNAL(page) && page->read_gen == WT_READGEN_WONT_NEED && - !evict_failed) { - if ((ret = __sync_evict_page( - session, &walk, flags)) == 0) { - evict_failed = false; - skip_walk = true; - } else { - walk = prev; - prev = NULL; - evict_failed = true; - } - WT_ERR_BUSY_OK(ret); + !tried_eviction) { + WT_ERR_BUSY_OK( + __wt_page_release_evict(session, walk)); + walk = prev; + prev = NULL; + tried_eviction = true; continue; } + tried_eviction = false; - evict_failed = false; WT_ERR(__wt_reconcile( session, walk, NULL, WT_REC_CHECKPOINT, NULL)); diff --git a/src/include/btmem.h b/src/include/btmem.h index 4b51a6b48cd..c3646a2ae59 100644 --- a/src/include/btmem.h +++ b/src/include/btmem.h @@ -193,8 +193,8 @@ struct __wt_page_lookaside { uint64_t las_pageid; /* Page ID in lookaside */ uint64_t las_max_txn; /* Maximum transaction ID in lookaside */ - WT_DECL_TIMESTAMP(min_timestamp) /* Oldest timestamp in - lookaside for the page */ + WT_DECL_TIMESTAMP(min_timestamp) /* Min timestamp in lookaside */ + WT_DECL_TIMESTAMP(onpage_timestamp) /* Max timestamp on page */ bool las_skew_oldest; /* On-page skewed to oldest */ }; diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c index cd71521f873..01b6d100a0c 100644 --- a/src/reconcile/rec_write.c +++ b/src/reconcile/rec_write.c @@ -48,6 +48,7 @@ typedef struct { /* Track the page's min/maximum transactions. */ uint64_t max_txn; WT_DECL_TIMESTAMP(max_timestamp) + WT_DECL_TIMESTAMP(max_onpage_timestamp) WT_DECL_TIMESTAMP(min_saved_timestamp) u_int updates_seen; /* Count of updates seen. */ @@ -978,6 +979,7 @@ __rec_init(WT_SESSION_IMPL *session, r->max_txn = WT_TXN_NONE; #ifdef HAVE_TIMESTAMPS __wt_timestamp_set_zero(&r->max_timestamp); + __wt_timestamp_set_zero(&r->max_onpage_timestamp); __wt_timestamp_set_inf(&r->min_saved_timestamp); #endif @@ -1459,16 +1461,21 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, #ifdef HAVE_TIMESTAMPS /* Track the oldest saved timestamp for lookaside. */ - if (F_ISSET(r, WT_REC_LOOKASIDE)) - for (upd = first_upd; upd != NULL; upd = upd->next) { + if (F_ISSET(r, WT_REC_LOOKASIDE)) { + /* If no updates had timestamps, we're done. */ + if (first_ts_upd == NULL) + __wt_timestamp_set_zero(&r->min_saved_timestamp); + for (upd = first_upd; upd != *updp; upd = upd->next) { if (upd->txnid != WT_TXN_ABORTED && __wt_timestamp_cmp(&upd->timestamp, &r->min_saved_timestamp) < 0) __wt_timestamp_set(&r->min_saved_timestamp, &upd->timestamp); - if (upd == *updp) - break; + + WT_ASSERT(session, upd->txnid == WT_TXN_ABORTED || + WT_TXNID_LE(upd->txnid, r->max_txn)); } + } #endif check_original_value: @@ -1493,6 +1500,12 @@ check_original_value: WT_RET( __rec_append_orig_value(session, page, first_upd, vpack)); +#ifdef HAVE_TIMESTAMPS + if ((upd = *updp) != NULL && + __wt_timestamp_cmp(&upd->timestamp, &r->max_onpage_timestamp) > 0) + __wt_timestamp_set(&r->max_onpage_timestamp, &upd->timestamp); +#endif + return (0); } @@ -3283,7 +3296,7 @@ __rec_split_write_supd(WT_SESSION_IMPL *session, WT_RET(__rec_supd_move(session, multi, r->supd, r->supd_next)); r->supd_next = 0; r->supd_memsize = 0; - return (0); + goto done; } /* @@ -3343,12 +3356,15 @@ __rec_split_write_supd(WT_SESSION_IMPL *session, r->supd_next = j; } - /* Track the oldest timestamp seen so far. */ +done: /* Track the oldest timestamp seen so far. */ multi->page_las.las_skew_oldest = r->las_skew_oldest; multi->page_las.las_max_txn = r->max_txn; + WT_ASSERT(session, r->max_txn != WT_TXN_NONE); #ifdef HAVE_TIMESTAMPS __wt_timestamp_set( &multi->page_las.min_timestamp, &r->min_saved_timestamp); + __wt_timestamp_set( + &multi->page_las.onpage_timestamp, &r->max_onpage_timestamp); #endif err: __wt_scr_free(session, &key); diff --git a/src/txn/txn_timestamp.c b/src/txn/txn_timestamp.c index 0201036684d..c52700e6b69 100644 --- a/src/txn/txn_timestamp.c +++ b/src/txn/txn_timestamp.c @@ -687,6 +687,17 @@ __wt_txn_clear_read_timestamp(WT_SESSION_IMPL *session) if (!F_ISSET(txn, WT_TXN_PUBLIC_TS_READ)) return; +#ifdef HAVE_DIAGNOSTIC + { + wt_timestamp_t pinned_ts; + + WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock, + __wt_timestamp_set(&pinned_ts, &txn_global->pinned_timestamp)); + WT_ASSERT(session, + __wt_timestamp_cmp(&txn->read_timestamp, &pinned_ts) >= 0); + } +#endif + __wt_writelock(session, &txn_global->read_timestamp_rwlock); TAILQ_REMOVE(&txn_global->read_timestamph, txn, read_timestampq); --txn_global->read_timestampq_len; |