From 00374e0c7f1a356a30355f59d4bbb9e60bbd94d3 Mon Sep 17 00:00:00 2001 From: Michael Cahill Date: Mon, 27 Nov 2017 08:53:34 +1100 Subject: WT-3765 Prevent eviction of pages being truncated. (#3809) When an application performs a truncate operation, WiredTiger marks pages deleted. If such a page is subsequently read with a view earlier than the truncate, the page is reinstantiated and all records deleted (as if truncate had taken the slow path). Such a page cannot be evicted: if the truncate is rolled back, it expects to find the page and any tombstones so it can roll them all back. If the page is evicted or split, the rollback will fail. This change takes two approaches: don't allow checkpoints to queue pages for urgent eviction, since checkpoints use special rules to determine whether eviction is permitted. In addition, check for uncommitted truncate operations before allowing any page to be evicted. --- src/include/btree.i | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/src/include/btree.i b/src/include/btree.i index 19b300908b1..ae0c2b4908c 100644 --- a/src/include/btree.i +++ b/src/include/btree.i @@ -1335,6 +1335,14 @@ __wt_page_can_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool *inmem_splitp) F_ISSET_ATOMIC(ref->home, WT_PAGE_OVERFLOW_KEYS)) return (false); + /* + * If the page was restored after a truncate, it can't be evicted until + * the truncate completes. + */ + if (ref->page_del != NULL && !__wt_txn_visible_all(session, + ref->page_del->txnid, WT_TIMESTAMP_NULL(&ref->page_del->timestamp))) + return (false); + /* * Check for in-memory splits before other eviction tests. If the page * should split in-memory, return success immediately and skip more @@ -1423,15 +1431,21 @@ __wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) * * Fast checks if eviction is disabled for this handle, operation or * tree, then perform a general check if eviction will be possible. + * + * Checkpoint should not queue pages for urgent eviction if it cannot + * evict them immediately: there is a special exemption that allows + * checkpoint to evict dirty pages in a tree that is being + * checkpointed, and no other thread can help with that. */ page = ref->page; if (WT_READGEN_EVICT_SOON(page->read_gen) && btree->evict_disabled == 0 && __wt_page_can_evict(session, ref, &inmem_split)) { - if ((LF_ISSET(WT_READ_NO_SPLIT) || (!inmem_split && - F_ISSET(session, WT_SESSION_NO_RECONCILE)))) - __wt_page_evict_urgent(session, ref); - else { + if (LF_ISSET(WT_READ_NO_SPLIT) || (!inmem_split && + F_ISSET(session, WT_SESSION_NO_RECONCILE))) { + if (!WT_SESSION_IS_CHECKPOINT(session)) + __wt_page_evict_urgent(session, ref); + } else { WT_RET_BUSY_OK(__wt_page_release_evict(session, ref)); return (0); } -- cgit v1.2.1 From 6b3dee7f0808a7877129c804b95d1e986e4e5fa6 Mon Sep 17 00:00:00 2001 From: Michael Cahill Date: Tue, 28 Nov 2017 08:02:44 +1100 Subject: WT-3764 Allow fast eviction of unwanted clean pages. (#3806) --- src/btree/bt_compact.c | 2 +- src/btree/bt_read.c | 2 +- src/evict/evict_page.c | 4 +--- src/include/btree.i | 16 ++++++++++++++-- 4 files changed, 17 insertions(+), 7 deletions(-) diff --git a/src/btree/bt_compact.c b/src/btree/bt_compact.c index b3e23a8251c..63015312232 100644 --- a/src/btree/bt_compact.c +++ b/src/btree/bt_compact.c @@ -35,7 +35,7 @@ __compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) * If the page is a replacement, test the replacement addresses. * Ignore empty pages, they get merged into the parent. */ - if (mod == NULL || mod->rec_result == 0) { + if (__wt_page_evict_clean(page)) { __wt_ref_info(ref, &addr, &addr_size, NULL); if (addr == NULL) return (0); diff --git a/src/btree/bt_read.c b/src/btree/bt_read.c index fd9a7597d73..7ce1522daa3 100644 --- a/src/btree/bt_read.c +++ b/src/btree/bt_read.c @@ -268,7 +268,7 @@ __evict_force_check(WT_SESSION_IMPL *session, WT_REF *ref) * It's hard to imagine a page with a huge memory footprint that has * never been modified, but check to be sure. */ - if (page->modify == NULL) + if (__wt_page_evict_clean(page)) return (false); /* Pages are usually small enough, check that first. */ diff --git a/src/evict/evict_page.c b/src/evict/evict_page.c index 65009dc3449..cf56b8cfe7a 100644 --- a/src/evict/evict_page.c +++ b/src/evict/evict_page.c @@ -121,7 +121,6 @@ __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool closing) WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_PAGE *page; - WT_PAGE_MODIFY *mod; bool clean_page, inmem_split, tree_dead; conn = S2C(session); @@ -166,8 +165,7 @@ __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool closing) conn->cache->evict_max_page_size = page->memory_footprint; /* Figure out whether reconciliation was done on the page */ - mod = page->modify; - clean_page = mod == NULL || mod->rec_result == 0; + clean_page = __wt_page_evict_clean(page); /* Update the reference and discard the page. */ if (__wt_ref_is_root(ref)) diff --git a/src/include/btree.i b/src/include/btree.i index ae0c2b4908c..9941fb038ce 100644 --- a/src/include/btree.i +++ b/src/include/btree.i @@ -27,6 +27,17 @@ __wt_page_is_empty(WT_PAGE *page) page->modify->rec_result == WT_PM_REC_EMPTY); } +/* + * __wt_page_evict_clean -- + * Return if the page can be evicted without dirtying the tree. + */ +static inline bool +__wt_page_evict_clean(WT_PAGE *page) +{ + return (page->modify == NULL || (page->modify->write_gen == 0 && + page->modify->rec_result == 0)); +} + /* * __wt_page_is_modified -- * Return if the page is dirty. @@ -1441,8 +1452,9 @@ __wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) if (WT_READGEN_EVICT_SOON(page->read_gen) && btree->evict_disabled == 0 && __wt_page_can_evict(session, ref, &inmem_split)) { - if (LF_ISSET(WT_READ_NO_SPLIT) || (!inmem_split && - F_ISSET(session, WT_SESSION_NO_RECONCILE))) { + if (!__wt_page_evict_clean(page) && + (LF_ISSET(WT_READ_NO_SPLIT) || (!inmem_split && + F_ISSET(session, WT_SESSION_NO_RECONCILE)))) { if (!WT_SESSION_IS_CHECKPOINT(session)) __wt_page_evict_urgent(session, ref); } else { -- cgit v1.2.1 From 42139b18dd8a105d0ccc3882786ae92bcb9a4c18 Mon Sep 17 00:00:00 2001 From: Michael Cahill Date: Tue, 28 Nov 2017 12:53:30 +1100 Subject: WT-3763 Tune eviction for various MongoDB workloads. (#3804) In particular, balance primary inserts, overflowing the cache to use the lookaside table, secondary inserts and secondary reads of the oplog (assuming the oplog is at least partially stored in the lookaside table). --- src/btree/bt_read.c | 3 +-- src/btree/bt_split.c | 5 +++-- src/cache/cache_las.c | 4 +++- src/evict/evict_lru.c | 20 ++++++++++++-------- src/evict/evict_page.c | 10 +++++----- src/include/btmem.h | 1 + src/include/btree.i | 41 +++++++++++++++++++++++++++-------------- src/reconcile/rec_write.c | 20 +++++++++++--------- 8 files changed, 63 insertions(+), 41 deletions(-) diff --git a/src/btree/bt_read.c b/src/btree/bt_read.c index 7ce1522daa3..dd39610a3e2 100644 --- a/src/btree/bt_read.c +++ b/src/btree/bt_read.c @@ -297,8 +297,7 @@ __evict_force_check(WT_SESSION_IMPL *session, WT_REF *ref) * skipping the page indefinitely or large records can lead to * extremely large memory footprints. */ - if (page->modify->update_restored && - !__wt_page_evict_retry(session, page)) + if (!__wt_page_evict_retry(session, page)) return (false); /* Trigger eviction on the next page release. */ diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c index a9643ed92a0..bf7ea54adb0 100644 --- a/src/btree/bt_split.c +++ b/src/btree/bt_split.c @@ -1493,9 +1493,10 @@ __split_multi_inmem( page->modify->first_dirty_txn = WT_TXN_FIRST; /* - * If the new page is modified, save the oldest ID from reconciliation - * to avoid repeatedly attempting eviction on the same page. + * If the new page is modified, save the eviction generation to avoid + * repeatedly attempting eviction on the same page. */ + page->modify->last_evict_pass_gen = orig->modify->last_evict_pass_gen; page->modify->last_eviction_id = orig->modify->last_eviction_id; __wt_timestamp_set(&page->modify->last_eviction_timestamp, &orig->modify->last_eviction_timestamp); diff --git a/src/cache/cache_las.c b/src/cache/cache_las.c index 9f8aeb7cc9e..f7b62b5f809 100644 --- a/src/cache/cache_las.c +++ b/src/cache/cache_las.c @@ -64,7 +64,7 @@ __wt_las_stats_update(WT_SESSION_IMPL *session) dstats = ((WT_CURSOR_BTREE *) cache->las_session[0]->las_cursor)->btree->dhandle->stats; - v = WT_STAT_READ(dstats, cursor_insert); + v = WT_STAT_READ(dstats, cursor_update); WT_STAT_SET(session, cstats, cache_lookaside_insert, v); v = WT_STAT_READ(dstats, cursor_remove); WT_STAT_SET(session, cstats, cache_lookaside_remove, v); @@ -433,6 +433,7 @@ __wt_las_insert_block(WT_SESSION_IMPL *session, WT_CURSOR *cursor, /* Wrap all the updates in a transaction. */ las_session = (WT_SESSION_IMPL *)cursor->session; WT_RET(__wt_txn_begin(las_session, NULL)); + las_session->txn.isolation = WT_TXN_ISO_READ_UNCOMMITTED; /* * Make sure there are no leftover entries (e.g., from a handle @@ -638,6 +639,7 @@ __wt_las_remove_block(WT_SESSION_IMPL *session, */ if (local_cursor) { WT_ERR(__wt_txn_begin(las_session, NULL)); + las_session->txn.isolation = WT_TXN_ISO_READ_UNCOMMITTED; local_txn = true; } diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index b1e42fcf489..fe389b65e4d 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -1864,6 +1864,10 @@ __evict_walk_file(WT_SESSION_IMPL *session, if (F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU)) continue; + /* Don't queue dirty pages in trees during checkpoints. */ + if (modified && btree->checkpointing != WT_CKPT_OFF) + continue; + /* * It's possible (but unlikely) to visit a page without a read * generation, if we race with the read instantiating the page. @@ -1944,14 +1948,13 @@ __evict_walk_file(WT_SESSION_IMPL *session, goto fast; /* - * If there are active transaction and oldest transaction - * hasn't changed since the last time this page was written, - * it's unlikely we can make progress. Similarly, if the most - * recent update on the page is not yet globally visible, - * eviction will fail. This heuristic avoids repeated attempts - * to evict the same page. + * If the global transaction state hasn't changed since the + * last time we tried eviction, it's unlikely we can make + * progress. Similarly, if the most recent update on the page + * is not yet globally visible, eviction will fail. This + * heuristic avoids repeated attempts to evict the same page. */ - if (modified && (!__wt_page_evict_retry(session, page) || + if (!__wt_page_evict_retry(session, page) || (modified && !__txn_visible_all_id(session, page->modify->update_txn))) continue; @@ -2050,9 +2053,10 @@ __evict_get_ref( cache = S2C(session)->cache; is_app = !F_ISSET(session, WT_SESSION_INTERNAL); server_only = is_server && !WT_EVICT_HAS_WORKERS(session); + /* Application threads do eviction when cache is full of dirty data */ urgent_ok = (!is_app && !is_server) || !WT_EVICT_HAS_WORKERS(session) || - (is_app && __wt_cache_aggressive(session)); + (is_app && F_ISSET(cache, WT_CACHE_EVICT_DIRTY_HARD)); urgent_queue = cache->evict_urgent_queue; WT_STAT_CONN_INCR(session, cache_eviction_get_ref); diff --git a/src/evict/evict_page.c b/src/evict/evict_page.c index cf56b8cfe7a..7a84f90eb81 100644 --- a/src/evict/evict_page.c +++ b/src/evict/evict_page.c @@ -567,13 +567,13 @@ __evict_review( if (F_ISSET(conn, WT_CONN_IN_MEMORY)) LF_SET(WT_REC_IN_MEMORY | WT_REC_SCRUB | WT_REC_UPDATE_RESTORE); + else if (WT_SESSION_IS_CHECKPOINT(session)) + LF_SET(WT_REC_LOOKASIDE); else if (!WT_IS_METADATA(session->dhandle)) { - if (!WT_SESSION_IS_CHECKPOINT(session)) { - LF_SET(WT_REC_UPDATE_RESTORE); + LF_SET(WT_REC_UPDATE_RESTORE); - if (F_ISSET(cache, WT_CACHE_EVICT_SCRUB)) - LF_SET(WT_REC_SCRUB); - } + if (F_ISSET(cache, WT_CACHE_EVICT_SCRUB)) + LF_SET(WT_REC_SCRUB); /* * If the cache is under pressure with many updates diff --git a/src/include/btmem.h b/src/include/btmem.h index d45b68d1972..54a0f7c3487 100644 --- a/src/include/btmem.h +++ b/src/include/btmem.h @@ -215,6 +215,7 @@ struct __wt_page_modify { uint64_t first_dirty_txn; /* The transaction state last time eviction was attempted. */ + uint64_t last_evict_pass_gen; uint64_t last_eviction_id; WT_DECL_TIMESTAMP(last_eviction_timestamp) diff --git a/src/include/btree.i b/src/include/btree.i index 9941fb038ce..560cc8eb212 100644 --- a/src/include/btree.i +++ b/src/include/btree.i @@ -1279,8 +1279,7 @@ __wt_leaf_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page) /* * __wt_page_evict_retry -- - * Check if there has been transaction progress since the last eviction - * attempt. + * Avoid busy-spinning attempting to evict the same page all the time. */ static inline bool __wt_page_evict_retry(WT_SESSION_IMPL *session, WT_PAGE *page) @@ -1290,29 +1289,43 @@ __wt_page_evict_retry(WT_SESSION_IMPL *session, WT_PAGE *page) txn_global = &S2C(session)->txn_global; - if ((mod = page->modify) == NULL) + /* + * If the page hasn't been through one round of update/restore, give it + * a try. + */ + if ((mod = page->modify) == NULL || !mod->update_restored) return (true); - if (txn_global->current != txn_global->oldest_id && - mod->last_eviction_id == __wt_txn_oldest_id(session)) - return (false); + /* + * Retry if a reasonable amount of eviction time has passed, the + * choice of 5 eviction passes as a reasonable amount of time is + * currently pretty arbitrary. + */ + if (__wt_cache_aggressive(session) || + mod->last_evict_pass_gen + 5 < S2C(session)->cache->evict_pass_gen) + return (true); + + /* Retry if the global transaction state has moved forward. */ + if (txn_global->current == txn_global->oldest_id || + mod->last_eviction_id != __wt_txn_oldest_id(session)) + return (true); #ifdef HAVE_TIMESTAMPS { bool same_timestamp; - if (__wt_timestamp_iszero(&mod->last_eviction_timestamp)) + same_timestamp = false; + if (!__wt_timestamp_iszero(&mod->last_eviction_timestamp)) + WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock, + same_timestamp = __wt_timestamp_cmp( + &mod->last_eviction_timestamp, + &txn_global->pinned_timestamp) == 0); + if (!same_timestamp) return (true); - - WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock, - same_timestamp = __wt_timestamp_cmp( - &mod->last_eviction_timestamp, &txn_global->pinned_timestamp) == 0); - if (same_timestamp) - return (false); } #endif - return (true); + return (false); } /* diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c index 77b8c2a2e78..18e8df4918c 100644 --- a/src/reconcile/rec_write.c +++ b/src/reconcile/rec_write.c @@ -428,7 +428,8 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, __wt_timestamp_set(&mod->last_eviction_timestamp, &S2C(session)->txn_global.pinned_timestamp)); #endif - } + mod->last_evict_pass_gen = S2C(session)->cache->evict_pass_gen; + } #ifdef HAVE_DIAGNOSTIC /* @@ -620,10 +621,11 @@ __rec_write_check_complete( /* * If we have used the lookaside table, check for a lookaside table and - * checkpoint collision. + * checkpoint collision. If there is no collision, go ahead with the + * eviction. */ - if (r->cache_write_lookaside && __rec_las_checkpoint_test(session, r)) - return (EBUSY); + if (r->cache_write_lookaside) + return (__rec_las_checkpoint_test(session, r) ? EBUSY : 0); /* * Fall back to lookaside eviction during checkpoints if a page can't @@ -644,8 +646,11 @@ __rec_write_check_complete( * likely get to write at least one of the blocks. If we've created a * page image for a page that previously didn't have one, or we had a * page image and it is now empty, that's also progress. + * + * Also check that the current reconciliation applied some updates, in + * which case evict/restore should gain us some space. */ - if (r->multi_next > 1) + if (r->multi_next > 1 && r->update_used) return (0); /* @@ -661,13 +666,10 @@ __rec_write_check_complete( return (0); /* - * Check if the current reconciliation applied some updates, in which - * case evict/restore should gain us some space. - * * Check if lookaside eviction is possible. If any of the updates we * saw were uncommitted, the lookaside table cannot be used. */ - if (r->update_uncommitted || r->update_used) + if (r->update_uncommitted) return (0); *lookaside_retryp = true; -- cgit v1.2.1 From c0ab6fd31c7d3ff3735b17a710365527c8e75098 Mon Sep 17 00:00:00 2001 From: Michael Cahill Date: Tue, 28 Nov 2017 16:35:54 +1100 Subject: WT-3763 Disable suffix compression on key with saved updates. (#3814) --- src/reconcile/rec_write.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c index 18e8df4918c..b69ca00de92 100644 --- a/src/reconcile/rec_write.c +++ b/src/reconcile/rec_write.c @@ -5675,10 +5675,10 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins) session, r, WT_INSERT_KEY_SIZE(ins))); /* - * Turn off prefix compression until a full key is - * written into the new page. + * Turn off prefix and suffix compression until a full + * key is written into the new page. */ - r->key_pfx_compress = false; + r->key_pfx_compress = r->key_sfx_compress = false; continue; } -- cgit v1.2.1 From 792bd7a3a976631e47568b4a4e34a09c2ab508dc Mon Sep 17 00:00:00 2001 From: nehakhatri5 <30581300+nehakhatri5@users.noreply.github.com> Date: Wed, 29 Nov 2017 16:31:10 +1100 Subject: WT-3763 Revert part of the change that made reconciliation more likely to use lookaside (#3816) --- src/reconcile/rec_write.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c index b69ca00de92..f0f9064d53e 100644 --- a/src/reconcile/rec_write.c +++ b/src/reconcile/rec_write.c @@ -621,11 +621,10 @@ __rec_write_check_complete( /* * If we have used the lookaside table, check for a lookaside table and - * checkpoint collision. If there is no collision, go ahead with the - * eviction. + * checkpoint collision. */ - if (r->cache_write_lookaside) - return (__rec_las_checkpoint_test(session, r) ? EBUSY : 0); + if (r->cache_write_lookaside && __rec_las_checkpoint_test(session, r)) + return (EBUSY); /* * Fall back to lookaside eviction during checkpoints if a page can't @@ -646,11 +645,8 @@ __rec_write_check_complete( * likely get to write at least one of the blocks. If we've created a * page image for a page that previously didn't have one, or we had a * page image and it is now empty, that's also progress. - * - * Also check that the current reconciliation applied some updates, in - * which case evict/restore should gain us some space. */ - if (r->multi_next > 1 && r->update_used) + if (r->multi_next > 1) return (0); /* @@ -666,10 +662,13 @@ __rec_write_check_complete( return (0); /* + * Check if the current reconciliation applied some updates, in which + * case evict/restore should gain us some space. + * * Check if lookaside eviction is possible. If any of the updates we * saw were uncommitted, the lookaside table cannot be used. */ - if (r->update_uncommitted) + if (r->update_uncommitted || r->update_used) return (0); *lookaside_retryp = true; -- cgit v1.2.1 From 6dcff54e40ce18729d14a9e96b1cbcb4fbc331fe Mon Sep 17 00:00:00 2001 From: Michael Cahill Date: Wed, 29 Nov 2017 17:22:07 +1100 Subject: WT-3773 Fix a bug calculating on-disk images for modify updates. (#3817) Previously, when evicting a page with modify updates, some of the modifications could be skipped when calculating the complete value to write to the new page. This could lead to updates being lost. --- src/btree/bt_read.c | 6 +++--- src/btree/bt_ret.c | 13 +++++++------ src/cache/cache_las.c | 2 +- src/include/btmem.h | 2 +- src/include/extern.h | 1 + src/reconcile/rec_write.c | 38 +++++++++++++++++++++++--------------- 6 files changed, 36 insertions(+), 26 deletions(-) diff --git a/src/btree/bt_read.c b/src/btree/bt_read.c index dd39610a3e2..19ff15fb21e 100644 --- a/src/btree/bt_read.c +++ b/src/btree/bt_read.c @@ -221,7 +221,7 @@ __las_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t btree_id) */ page->modify->first_dirty_txn = WT_TXN_FIRST; - if (!ref->page_las->las_skew_oldest && + if (ref->page_las->las_skew_newest && !S2C(session)->txn_global.has_stable_timestamp && __wt_txn_visible_all(session, ref->page_las->las_max_txn, WT_TIMESTAMP_NULL(&ref->page_las->onpage_timestamp))) { @@ -495,7 +495,7 @@ __las_page_skip(WT_SESSION_IMPL *session, WT_REF *ref) goto done; if (!F_ISSET(txn, WT_TXN_HAS_TS_READ) && - !ref->page_las->las_skew_oldest) { + ref->page_las->las_skew_newest) { skip = true; goto done; } @@ -511,7 +511,7 @@ __las_page_skip(WT_SESSION_IMPL *session, WT_REF *ref) &session->txn.read_timestamp) <= 0); if (F_ISSET(&session->txn, WT_TXN_HAS_TS_READ) && - ref->page_las->las_skew_oldest && + !ref->page_las->las_skew_newest && __wt_timestamp_cmp( &ref->page_las->min_timestamp, &session->txn.read_timestamp) > 0) { skip = true; diff --git a/src/btree/bt_ret.c b/src/btree/bt_ret.c index d63b5884fef..b24a4e1db45 100644 --- a/src/btree/bt_ret.c +++ b/src/btree/bt_ret.c @@ -137,13 +137,13 @@ __value_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) #define WT_MODIFY_ARRAY_SIZE (WT_MAX_MODIFY_UPDATE + 10) /* - * __value_return_upd -- + * __wt_value_return_upd -- * Change the cursor to reference an internal update structure return * value. */ -static inline int -__value_return_upd( - WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) +int +__wt_value_return_upd(WT_SESSION_IMPL *session, + WT_CURSOR_BTREE *cbt, WT_UPDATE *upd, bool ignore_visibility) { WT_CURSOR *cursor; WT_DECL_RET; @@ -173,7 +173,8 @@ __value_return_upd( * that are visible to us. */ for (i = 0, listp = list; upd != NULL; upd = upd->next) { - if (!__wt_txn_upd_visible(session, upd)) + if (upd->txnid == WT_TXN_ABORTED || + (!ignore_visibility && !__wt_txn_upd_visible(session, upd))) continue; if (WT_UPDATE_DATA_VALUE(upd)) @@ -273,7 +274,7 @@ __wt_value_return( if (upd == NULL) WT_RET(__value_return(session, cbt)); else - WT_RET(__value_return_upd(session, cbt, upd)); + WT_RET(__wt_value_return_upd(session, cbt, upd, false)); F_SET(cursor, WT_CURSTD_VALUE_INT); return (0); } diff --git a/src/cache/cache_las.c b/src/cache/cache_las.c index f7b62b5f809..e0d90ad836d 100644 --- a/src/cache/cache_las.c +++ b/src/cache/cache_las.c @@ -385,7 +385,7 @@ __las_insert_block_verbose(WT_SESSION_IMPL *session, WT_MULTI *multi) btree_id, multi->page_las.las_pageid, multi->page_las.las_max_txn, hex_timestamp, - multi->page_las.las_skew_oldest? "oldest" : "youngest", + multi->page_las.las_skew_newest? "newest" : "oldest", WT_STAT_READ(conn->stats, cache_lookaside_entries), pct_dirty, pct_full); } diff --git a/src/include/btmem.h b/src/include/btmem.h index 54a0f7c3487..c5cdfe5850a 100644 --- a/src/include/btmem.h +++ b/src/include/btmem.h @@ -203,7 +203,7 @@ struct __wt_page_lookaside { lookaside */ WT_DECL_TIMESTAMP(min_timestamp) /* Min timestamp in lookaside */ WT_DECL_TIMESTAMP(onpage_timestamp) /* Max timestamp on page */ - bool las_skew_oldest; /* On-page skewed to oldest */ + bool las_skew_newest; /* On-page skewed to newest */ }; /* diff --git a/src/include/extern.h b/src/include/extern.h index ce9e1e57a47..272544b08f7 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -166,6 +166,7 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags #endif ); extern int __wt_bt_rebalance(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_value_return_upd(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd, bool ignore_visibility) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_key_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_value_return( WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_bt_salvage(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c index f0f9064d53e..233e0ec61f6 100644 --- a/src/reconcile/rec_write.c +++ b/src/reconcile/rec_write.c @@ -40,9 +40,9 @@ typedef struct { /* * Track the oldest running transaction and whether to skew lookaside - * to the newest or oldest update. + * to the newest update. */ - bool las_skew_oldest; + bool las_skew_newest; uint64_t last_running; /* Track the page's min/maximum transactions. */ @@ -907,6 +907,7 @@ __rec_init(WT_SESSION_IMPL *session, WT_PAGE *page; WT_RECONCILE *r; WT_TXN_GLOBAL *txn_global; + bool las_skew_oldest; btree = S2BT(session); page = ref->page; @@ -952,10 +953,13 @@ __rec_init(WT_SESSION_IMPL *session, */ txn_global = &S2C(session)->txn_global; if (__wt_btree_immediately_durable(session)) - r->las_skew_oldest = false; + las_skew_oldest = false; else - WT_ORDERED_READ(r->las_skew_oldest, + WT_ORDERED_READ(las_skew_oldest, txn_global->has_stable_timestamp); + r->las_skew_newest = LF_ISSET(WT_REC_LOOKASIDE) && + LF_ISSET(WT_REC_VISIBLE_ALL) && !las_skew_oldest; + WT_ORDERED_READ(r->last_running, txn_global->last_running); /* @@ -1344,8 +1348,7 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, * version (but we save enough information that checkpoint can * fix things up if we choose an update that is too new). */ - if (*updp == NULL && F_ISSET(r, WT_REC_LOOKASIDE) && - F_ISSET(r, WT_REC_VISIBLE_ALL) && !r->las_skew_oldest) + if (*updp == NULL && r->las_skew_newest) *updp = upd; if (F_ISSET(r, WT_REC_VISIBLE_ALL) ? @@ -1480,7 +1483,6 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, * unresolved updates, move the entire update list. */ WT_RET(__rec_update_save(session, r, ins, ripcip, *updp, upd_memsize)); - if (upd_savedp != NULL) *upd_savedp = true; @@ -1523,7 +1525,7 @@ check_original_value: * - or any reconciliation of a backing overflow record that will be * physically removed once it's no longer needed. */ - if (*updp != NULL && ((*updp)->type == WT_UPDATE_MODIFIED || + if (*updp != NULL && (!WT_UPDATE_DATA_VALUE(*updp) || F_ISSET(r, WT_REC_LOOKASIDE) || (vpack != NULL && vpack->ovfl && vpack->raw != WT_CELL_VALUE_OVFL_RM))) WT_RET( @@ -3386,7 +3388,7 @@ __rec_split_write_supd(WT_SESSION_IMPL *session, } done: /* Track the oldest timestamp seen so far. */ - multi->page_las.las_skew_oldest = r->las_skew_oldest; + multi->page_las.las_skew_newest = r->las_skew_newest; multi->page_las.las_max_txn = r->max_txn; WT_ASSERT(session, r->max_txn != WT_TXN_NONE); #ifdef HAVE_TIMESTAMPS @@ -4629,8 +4631,9 @@ record_loop: /* break; case WT_UPDATE_MODIFIED: cbt->slot = WT_COL_SLOT(page, cip); - WT_ERR(__wt_value_return( - session, cbt, upd)); + WT_ERR(__wt_value_return_upd( + session, cbt, upd, + F_ISSET(r, WT_REC_VISIBLE_ALL))); data = cbt->iface.value.data; size = (uint32_t)cbt->iface.value.size; update_no_copy = false; @@ -4873,8 +4876,9 @@ compare: /* * on-page item. */ cbt->slot = UINT32_MAX; - WT_ERR(__wt_value_return( - session, cbt, upd)); + WT_ERR(__wt_value_return_upd( + session, cbt, upd, + F_ISSET(r, WT_REC_VISIBLE_ALL))); data = cbt->iface.value.data; size = (uint32_t)cbt->iface.value.size; update_no_copy = false; @@ -5471,7 +5475,9 @@ __rec_row_leaf(WT_SESSION_IMPL *session, goto leaf_insert; case WT_UPDATE_MODIFIED: cbt->slot = WT_ROW_SLOT(page, rip); - WT_ERR(__wt_value_return(session, cbt, upd)); + WT_ERR(__wt_value_return_upd( + session, cbt, upd, + F_ISSET(r, WT_REC_VISIBLE_ALL))); WT_ERR(__rec_cell_build_val(session, r, cbt->iface.value.data, cbt->iface.value.size, (uint64_t)0)); @@ -5690,7 +5696,9 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins) * item. */ cbt->slot = UINT32_MAX; - WT_RET(__wt_value_return(session, cbt, upd)); + WT_RET(__wt_value_return_upd( + session, cbt, upd, + F_ISSET(r, WT_REC_VISIBLE_ALL))); WT_RET(__rec_cell_build_val(session, r, cbt->iface.value.data, cbt->iface.value.size, (uint64_t)0)); -- cgit v1.2.1