summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Michael Cahill <michael.cahill@mongodb.com> 2017-10-26 14:55:30 -0400
committer GitHub <noreply@github.com> 2017-10-26 14:55:30 -0400
commit 073dc4a2e21d7315239367aa2318139068e43d58 (patch)
tree b1961597ee4e3df0936bd2c2529bd0b45497d226
parent 329d765593f861ef7260e88e4f26502fa4e28cc7 (diff)
download mongo-073dc4a2e21d7315239367aa2318139068e43d58.tar.gz
WT-3666 Fix lost updates with lookaside eviction. (#3759)
* Don't write uncommitted updates during eviction for checkpoints.
* Since blocks appear immediately in lookaside, retry cursor positioning.
* If checkpoint skips lookaside pages, the tree must stay dirty.
-rw-r--r-- src/btree/bt_read.c 18
-rw-r--r-- src/cache/cache_las.c 57
-rw-r--r-- src/include/extern.h 1
-rw-r--r-- src/reconcile/rec_write.c 38
4 files changed, 85 insertions, 29 deletions
diff --git a/src/btree/bt_read.c b/src/btree/bt_read.c
index 838c6845b08..2dd366d8d5e 100644
--- a/src/btree/bt_read.c
+++ b/src/btree/bt_read.c
@@ -88,7 +88,6 @@ __las_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t btree_id)
uint32_t las_id, session_flags;
const uint8_t *p;
uint8_t upd_type;
- int exact;
cursor = NULL;
page = ref->page;
@@ -112,14 +111,9 @@ __las_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t btree_id)
* in-order updates for a subsequent key. We process all of the updates
* for a key and then insert those updates into the page, then all the
* updates for the next key, and so on.
- *
- * Search for the block's unique prefix, stepping through any matching
- * records.
*/
- cursor->set_key(cursor,
- btree_id, ref->page_las->las_pageid, (uint64_t)0, &las_key);
- if ((ret = cursor->search_near(cursor, &exact)) == 0 && exact < 0)
- ret = cursor->next(cursor);
+ ret = __wt_las_cursor_position(
+ cursor, btree_id, ref->page_las->las_pageid);
for (; ret == 0; ret = cursor->next(cursor)) {
WT_ERR(cursor->get_key(cursor,
&las_id, &las_pageid, &las_counter, &las_key));
@@ -483,14 +477,18 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
/*
* Skip lookaside pages if reading as of a
* timestamp and all the updates are in the
- * future.
+ * future. If we skip a lookaside page, the
+ * tree cannot be left clean: it must be
+ * visited by future checkpoints.
*/
if (F_ISSET(
&session->txn, WT_TXN_HAS_TS_READ) &&
__wt_timestamp_cmp(
&ref->page_las->min_timestamp,
- &session->txn.read_timestamp) > 0)
+ &session->txn.read_timestamp) > 0) {
+ __wt_tree_modify_set(session);
return (WT_NOTFOUND);
+ }
#endif
}
diff --git a/src/cache/cache_las.c b/src/cache/cache_las.c
index 13516d80c58..1ad5501b28f 100644
--- a/src/cache/cache_las.c
+++ b/src/cache/cache_las.c
@@ -437,6 +437,57 @@ __wt_las_insert_block(WT_SESSION_IMPL *session,
}
/*
+ * __wt_las_cursor_position --
+ * Position a lookaside cursor at the beginning of a block.
+ *
+ * There may be no block of lookaside entries if they have been removed by
+ * WT_CONNECTION::rollback_to_stable.
+ */
+int
+__wt_las_cursor_position(WT_CURSOR *cursor, uint32_t btree_id, uint64_t pageid)
+{
+ WT_ITEM las_key;
+ uint64_t las_counter, las_pageid;
+ uint32_t las_id;
+ int exact;
+
+ /*
+ * Because of the special visibility rules for lookaside, a new block
+ * can appear in between our search and the block of interest. Keep
+ * trying until we find it.
+ */
+ for (;;) {
+ WT_CLEAR(las_key);
+ cursor->set_key(cursor,
+ btree_id, pageid, (uint64_t)0, &las_key);
+ WT_RET(cursor->search_near(cursor, &exact));
+ if (exact < 0) {
+ WT_RET(cursor->next(cursor));
+
+ /*
+ * Because of the special visibility rules for
+ * lookaside, a new block can appear in between our
+ * search and the block of interest. Keep trying while
+ * we have a key lower than we expect.
+ *
+ * There may be no block of lookaside entries if they
+ * have been removed by
+ * WT_CONNECTION::rollback_to_stable.
+ */
+ WT_RET(cursor->get_key(cursor,
+ &las_id, &las_pageid, &las_counter, &las_key));
+ if (las_id < btree_id || (las_id == btree_id &&
+ pageid != 0 && las_pageid < pageid))
+ continue;
+ }
+
+ return (0);
+ }
+
+ /* NOTREACHED */
+}
+
+/*
* __wt_las_remove_block --
* Remove all records matching a key prefix from the lookaside store.
*/
@@ -448,7 +499,6 @@ __wt_las_remove_block(WT_SESSION_IMPL *session,
WT_ITEM las_key;
uint64_t las_counter, las_pageid, remove_cnt;
uint32_t las_id, session_flags;
- int exact;
bool local_cursor;
remove_cnt = 0;
@@ -464,10 +514,7 @@ __wt_las_remove_block(WT_SESSION_IMPL *session,
* Search for the block's unique prefix and step through all matching
* records, removing them.
*/
- las_key.size = 0;
- cursor->set_key(cursor, btree_id, pageid, (uint64_t)0, &las_key);
- if ((ret = cursor->search_near(cursor, &exact)) == 0 && exact < 0)
- ret = cursor->next(cursor);
+ ret = __wt_las_cursor_position(cursor, btree_id, pageid);
for (; ret == 0; ret = cursor->next(cursor)) {
WT_ERR(cursor->get_key(cursor,
&las_id, &las_pageid, &las_counter, &las_key));
diff --git a/src/include/extern.h b/src/include/extern.h
index c4365911e74..bbe66abf753 100644
--- a/src/include/extern.h
+++ b/src/include/extern.h
@@ -207,6 +207,7 @@ extern int __wt_las_cursor_open(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRI
extern void __wt_las_cursor( WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t *session_flags);
extern int __wt_las_cursor_close( WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t session_flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_las_insert_block(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CURSOR *cursor, WT_MULTI *multi, WT_ITEM *key) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_las_cursor_position(WT_CURSOR *cursor, uint32_t btree_id, uint64_t pageid) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_las_remove_block(WT_SESSION_IMPL *session, WT_CURSOR *cursor, uint32_t btree_id, uint64_t pageid) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern uint32_t __wt_checksum_sw(const void *chunk, size_t len) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default")));
extern void __wt_checksum_init(void);
diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c
index 108d9cf15f9..ae856649ede 100644
--- a/src/reconcile/rec_write.c
+++ b/src/reconcile/rec_write.c
@@ -378,9 +378,16 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref,
* Otherwise we would need to keep updates in memory that go back older
* than the version in the disk image, and since modify operations
* aren't idempotent, that is problematic.
+ *
+ * If we try to do eviction using transaction visibility, we had better
+ * have a snapshot. This doesn't apply to checkpoints: there are
+ * (rare) cases where we write data at read-uncommitted isolation.
*/
WT_ASSERT(session, !LF_ISSET(WT_REC_UPDATE_RESTORE) ||
LF_ISSET(WT_REC_VISIBLE_ALL));
+ WT_ASSERT(session, !LF_ISSET(WT_REC_EVICT) ||
+ LF_ISSET(WT_REC_VISIBLE_ALL) ||
+ F_ISSET(&session->txn, WT_TXN_HAS_SNAPSHOT));
/* We shouldn't get called with a clean page, that's an error. */
WT_ASSERT(session, __wt_page_is_modified(page));
@@ -1248,6 +1255,8 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
if ((txnid = upd->txnid) == WT_TXN_ABORTED)
continue;
+ upd_memsize += WT_UPDATE_MEMSIZE(upd);
+
/*
* Track the first update in the chain that is not aborted and
* the maximum transaction ID.
@@ -1266,10 +1275,20 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
* concurrent transaction commits or rolls back while we are
* examining its updates.
*/
- if (WT_TXNID_LE(r->last_running, txnid))
+ if (F_ISSET(r, WT_REC_EVICT) &&
+ (F_ISSET(r, WT_REC_VISIBLE_ALL) ?
+ WT_TXNID_LE(r->last_running, txnid) :
+ !__txn_visible_id(session, txnid))) {
uncommitted = r->update_uncommitted = true;
+ continue;
+ }
- upd_memsize += WT_UPDATE_MEMSIZE(upd);
+#ifdef HAVE_TIMESTAMPS
+ /* Track the first update with non-zero timestamp. */
+ if (first_ts_upd == NULL &&
+ !__wt_timestamp_iszero(&upd->timestamp))
+ first_ts_upd = upd;
+#endif
/*
* Find the first update we can use.
@@ -1299,13 +1318,6 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
if (*updp == NULL)
*updp = upd;
-
-#ifdef HAVE_TIMESTAMPS
- /* Track the first update with non-zero timestamp. */
- if (first_ts_upd == NULL &&
- !__wt_timestamp_iszero(&upd->timestamp))
- first_ts_upd = upd;
-#endif
}
/* Reconciliation should never see an aborted or reserved update. */
@@ -1360,9 +1372,9 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
#else
timestampp = NULL;
#endif
- all_visible = *updp == first_txn_upd &&
+ all_visible = *updp == first_txn_upd && !uncommitted &&
(F_ISSET(r, WT_REC_VISIBLE_ALL) ?
- !uncommitted && __wt_txn_visible_all(session, max_txn, timestampp) :
+ __wt_txn_visible_all(session, max_txn, timestampp) :
__wt_txn_visible(session, max_txn, timestampp));
if (all_visible)
@@ -1409,9 +1421,7 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
#ifdef HAVE_TIMESTAMPS
/* Track the oldest saved timestamp for lookaside. */
- if (first_ts_upd == NULL)
- __wt_timestamp_set_zero(&r->min_saved_timestamp);
- else if (F_ISSET(r, WT_REC_LOOKASIDE))
+ if (F_ISSET(r, WT_REC_LOOKASIDE))
for (upd = first_upd; upd != NULL; upd = upd->next)
if (upd->txnid != WT_TXN_ABORTED &&
upd->txnid != WT_TXN_NONE &&