diff options
author | Michael Cahill <michael.cahill@mongodb.com> | 2017-10-26 14:55:30 -0400 |
---|---|---|
committer | GitHub <noreply@github.com> | 2017-10-26 14:55:30 -0400 |
commit | 073dc4a2e21d7315239367aa2318139068e43d58 (patch) | |
tree | b1961597ee4e3df0936bd2c2529bd0b45497d226 | |
parent | 329d765593f861ef7260e88e4f26502fa4e28cc7 (diff) | |
download | mongo-073dc4a2e21d7315239367aa2318139068e43d58.tar.gz |
WT-3666 Fix lost updates with lookaside eviction. (#3759)
* Don't write uncommitted updates during eviction for checkpoints.
* Since blocks appear immediately in lookaside, retry cursor positioning.
* If checkpoint skips lookaside pages, the tree must stay dirty.
-rw-r--r-- | src/btree/bt_read.c | 18 | ||||
-rw-r--r-- | src/cache/cache_las.c | 57 | ||||
-rw-r--r-- | src/include/extern.h | 1 | ||||
-rw-r--r-- | src/reconcile/rec_write.c | 38 |
4 files changed, 85 insertions, 29 deletions
diff --git a/src/btree/bt_read.c b/src/btree/bt_read.c index 838c6845b08..2dd366d8d5e 100644 --- a/src/btree/bt_read.c +++ b/src/btree/bt_read.c @@ -88,7 +88,6 @@ __las_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t btree_id) uint32_t las_id, session_flags; const uint8_t *p; uint8_t upd_type; - int exact; cursor = NULL; page = ref->page; @@ -112,14 +111,9 @@ __las_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t btree_id) * in-order updates for a subsequent key. We process all of the updates * for a key and then insert those updates into the page, then all the * updates for the next key, and so on. - * - * Search for the block's unique prefix, stepping through any matching - * records. */ - cursor->set_key(cursor, - btree_id, ref->page_las->las_pageid, (uint64_t)0, &las_key); - if ((ret = cursor->search_near(cursor, &exact)) == 0 && exact < 0) - ret = cursor->next(cursor); + ret = __wt_las_cursor_position( + cursor, btree_id, ref->page_las->las_pageid); for (; ret == 0; ret = cursor->next(cursor)) { WT_ERR(cursor->get_key(cursor, &las_id, &las_pageid, &las_counter, &las_key)); @@ -483,14 +477,18 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags /* * Skip lookaside pages if reading as of a * timestamp and all the updates are in the - * future. + * future. If we skip a lookaside page, the + * tree cannot be left clean: it must be + * visited by future checkpoints. 
*/ if (F_ISSET( &session->txn, WT_TXN_HAS_TS_READ) && __wt_timestamp_cmp( &ref->page_las->min_timestamp, - &session->txn.read_timestamp) > 0) + &session->txn.read_timestamp) > 0) { + __wt_tree_modify_set(session); return (WT_NOTFOUND); + } #endif } diff --git a/src/cache/cache_las.c b/src/cache/cache_las.c index 13516d80c58..1ad5501b28f 100644 --- a/src/cache/cache_las.c +++ b/src/cache/cache_las.c @@ -437,6 +437,57 @@ __wt_las_insert_block(WT_SESSION_IMPL *session, } /* + * __wt_las_cursor_position -- + * Position a lookaside cursor at the beginning of a block. + * + * There may be no block of lookaside entries if they have been removed by + * WT_CONNECTION::rollback_to_stable. + */ +int +__wt_las_cursor_position(WT_CURSOR *cursor, uint32_t btree_id, uint64_t pageid) +{ + WT_ITEM las_key; + uint64_t las_counter, las_pageid; + uint32_t las_id; + int exact; + + /* + * Because of the special visibility rules for lookaside, a new block + * can appear in between our search and the block of interest. Keep + * trying until we find it. + */ + for (;;) { + WT_CLEAR(las_key); + cursor->set_key(cursor, + btree_id, pageid, (uint64_t)0, &las_key); + WT_RET(cursor->search_near(cursor, &exact)); + if (exact < 0) { + WT_RET(cursor->next(cursor)); + + /* + * Because of the special visibility rules for + * lookaside, a new block can appear in between our + * search and the block of interest. Keep trying while + * we have a key lower than we expect. + * + * There may be no block of lookaside entries if they + * have been removed by + * WT_CONNECTION::rollback_to_stable. + */ + WT_RET(cursor->get_key(cursor, + &las_id, &las_pageid, &las_counter, &las_key)); + if (las_id < btree_id || (las_id == btree_id && + pageid != 0 && las_pageid < pageid)) + continue; + } + + return (0); + } + + /* NOTREACHED */ +} + +/* * __wt_las_remove_block -- * Remove all records matching a key prefix from the lookaside store. 
*/ @@ -448,7 +499,6 @@ __wt_las_remove_block(WT_SESSION_IMPL *session, WT_ITEM las_key; uint64_t las_counter, las_pageid, remove_cnt; uint32_t las_id, session_flags; - int exact; bool local_cursor; remove_cnt = 0; @@ -464,10 +514,7 @@ __wt_las_remove_block(WT_SESSION_IMPL *session, * Search for the block's unique prefix and step through all matching * records, removing them. */ - las_key.size = 0; - cursor->set_key(cursor, btree_id, pageid, (uint64_t)0, &las_key); - if ((ret = cursor->search_near(cursor, &exact)) == 0 && exact < 0) - ret = cursor->next(cursor); + ret = __wt_las_cursor_position(cursor, btree_id, pageid); for (; ret == 0; ret = cursor->next(cursor)) { WT_ERR(cursor->get_key(cursor, &las_id, &las_pageid, &las_counter, &las_key)); diff --git a/src/include/extern.h b/src/include/extern.h index c4365911e74..bbe66abf753 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -207,6 +207,7 @@ extern int __wt_las_cursor_open(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRI extern void __wt_las_cursor( WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t *session_flags); extern int __wt_las_cursor_close( WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t session_flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_las_insert_block(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CURSOR *cursor, WT_MULTI *multi, WT_ITEM *key) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_las_cursor_position(WT_CURSOR *cursor, uint32_t btree_id, uint64_t pageid) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_las_remove_block(WT_SESSION_IMPL *session, WT_CURSOR *cursor, uint32_t btree_id, uint64_t pageid) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern uint32_t __wt_checksum_sw(const void *chunk, size_t len) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))); extern void __wt_checksum_init(void); diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c index 108d9cf15f9..ae856649ede 
100644 --- a/src/reconcile/rec_write.c +++ b/src/reconcile/rec_write.c @@ -378,9 +378,16 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, * Otherwise we would need to keep updates in memory that go back older * than the version in the disk image, and since modify operations * aren't idempotent, that is problematic. + * + * If we try to do eviction using transaction visibility, we had better + * have a snapshot. This doesn't apply to checkpoints: there are + * (rare) cases where we write data at read-uncommitted isolation. */ WT_ASSERT(session, !LF_ISSET(WT_REC_UPDATE_RESTORE) || LF_ISSET(WT_REC_VISIBLE_ALL)); + WT_ASSERT(session, !LF_ISSET(WT_REC_EVICT) || + LF_ISSET(WT_REC_VISIBLE_ALL) || + F_ISSET(&session->txn, WT_TXN_HAS_SNAPSHOT)); /* We shouldn't get called with a clean page, that's an error. */ WT_ASSERT(session, __wt_page_is_modified(page)); @@ -1248,6 +1255,8 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, if ((txnid = upd->txnid) == WT_TXN_ABORTED) continue; + upd_memsize += WT_UPDATE_MEMSIZE(upd); + /* * Track the first update in the chain that is not aborted and * the maximum transaction ID. @@ -1266,10 +1275,20 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, * concurrent transaction commits or rolls back while we are * examining its updates. */ - if (WT_TXNID_LE(r->last_running, txnid)) + if (F_ISSET(r, WT_REC_EVICT) && + (F_ISSET(r, WT_REC_VISIBLE_ALL) ? + WT_TXNID_LE(r->last_running, txnid) : + !__txn_visible_id(session, txnid))) { uncommitted = r->update_uncommitted = true; + continue; + } - upd_memsize += WT_UPDATE_MEMSIZE(upd); +#ifdef HAVE_TIMESTAMPS + /* Track the first update with non-zero timestamp. */ + if (first_ts_upd == NULL && + !__wt_timestamp_iszero(&upd->timestamp)) + first_ts_upd = upd; +#endif /* * Find the first update we can use. 
@@ -1299,13 +1318,6 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, if (*updp == NULL) *updp = upd; - -#ifdef HAVE_TIMESTAMPS - /* Track the first update with non-zero timestamp. */ - if (first_ts_upd == NULL && - !__wt_timestamp_iszero(&upd->timestamp)) - first_ts_upd = upd; -#endif } /* Reconciliation should never see an aborted or reserved update. */ @@ -1360,9 +1372,9 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, #else timestampp = NULL; #endif - all_visible = *updp == first_txn_upd && + all_visible = *updp == first_txn_upd && !uncommitted && (F_ISSET(r, WT_REC_VISIBLE_ALL) ? - !uncommitted && __wt_txn_visible_all(session, max_txn, timestampp) : + __wt_txn_visible_all(session, max_txn, timestampp) : __wt_txn_visible(session, max_txn, timestampp)); if (all_visible) @@ -1409,9 +1421,7 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, #ifdef HAVE_TIMESTAMPS /* Track the oldest saved timestamp for lookaside. */ - if (first_ts_upd == NULL) - __wt_timestamp_set_zero(&r->min_saved_timestamp); - else if (F_ISSET(r, WT_REC_LOOKASIDE)) + if (F_ISSET(r, WT_REC_LOOKASIDE)) for (upd = first_upd; upd != NULL; upd = upd->next) if (upd->txnid != WT_TXN_ABORTED && upd->txnid != WT_TXN_NONE && |