From 6dcff54e40ce18729d14a9e96b1cbcb4fbc331fe Mon Sep 17 00:00:00 2001 From: Michael Cahill Date: Wed, 29 Nov 2017 17:22:07 +1100 Subject: WT-3773 Fix a bug calculating on-disk images for modify updates. (#3817) Previously, when evicting a page with modify updates, some of the modifications could be skipped when calculating the complete value to write to the new page. This could lead to updates being lost. --- src/btree/bt_read.c | 6 +++--- src/btree/bt_ret.c | 13 +++++++------ src/cache/cache_las.c | 2 +- src/include/btmem.h | 2 +- src/include/extern.h | 1 + src/reconcile/rec_write.c | 38 +++++++++++++++++++++++--------------- 6 files changed, 36 insertions(+), 26 deletions(-) diff --git a/src/btree/bt_read.c b/src/btree/bt_read.c index dd39610a3e2..19ff15fb21e 100644 --- a/src/btree/bt_read.c +++ b/src/btree/bt_read.c @@ -221,7 +221,7 @@ __las_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t btree_id) */ page->modify->first_dirty_txn = WT_TXN_FIRST; - if (!ref->page_las->las_skew_oldest && + if (ref->page_las->las_skew_newest && !S2C(session)->txn_global.has_stable_timestamp && __wt_txn_visible_all(session, ref->page_las->las_max_txn, WT_TIMESTAMP_NULL(&ref->page_las->onpage_timestamp))) { @@ -495,7 +495,7 @@ __las_page_skip(WT_SESSION_IMPL *session, WT_REF *ref) goto done; if (!F_ISSET(txn, WT_TXN_HAS_TS_READ) && - !ref->page_las->las_skew_oldest) { + ref->page_las->las_skew_newest) { skip = true; goto done; } @@ -511,7 +511,7 @@ __las_page_skip(WT_SESSION_IMPL *session, WT_REF *ref) &session->txn.read_timestamp) <= 0); if (F_ISSET(&session->txn, WT_TXN_HAS_TS_READ) && - ref->page_las->las_skew_oldest && + !ref->page_las->las_skew_newest && __wt_timestamp_cmp( &ref->page_las->min_timestamp, &session->txn.read_timestamp) > 0) { skip = true; diff --git a/src/btree/bt_ret.c b/src/btree/bt_ret.c index d63b5884fef..b24a4e1db45 100644 --- a/src/btree/bt_ret.c +++ b/src/btree/bt_ret.c @@ -137,13 +137,13 @@ __value_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) #define WT_MODIFY_ARRAY_SIZE (WT_MAX_MODIFY_UPDATE + 10) /* - * __value_return_upd -- + * __wt_value_return_upd -- * Change the cursor to reference an internal update structure return * value. */ -static inline int -__value_return_upd( - WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) +int +__wt_value_return_upd(WT_SESSION_IMPL *session, + WT_CURSOR_BTREE *cbt, WT_UPDATE *upd, bool ignore_visibility) { WT_CURSOR *cursor; WT_DECL_RET; @@ -173,7 +173,8 @@ __value_return_upd( * that are visible to us. */ for (i = 0, listp = list; upd != NULL; upd = upd->next) { - if (!__wt_txn_upd_visible(session, upd)) + if (upd->txnid == WT_TXN_ABORTED || + (!ignore_visibility && !__wt_txn_upd_visible(session, upd))) continue; if (WT_UPDATE_DATA_VALUE(upd)) @@ -273,7 +274,7 @@ __wt_value_return( if (upd == NULL) WT_RET(__value_return(session, cbt)); else - WT_RET(__value_return_upd(session, cbt, upd)); + WT_RET(__wt_value_return_upd(session, cbt, upd, false)); F_SET(cursor, WT_CURSTD_VALUE_INT); return (0); } diff --git a/src/cache/cache_las.c b/src/cache/cache_las.c index f7b62b5f809..e0d90ad836d 100644 --- a/src/cache/cache_las.c +++ b/src/cache/cache_las.c @@ -385,7 +385,7 @@ __las_insert_block_verbose(WT_SESSION_IMPL *session, WT_MULTI *multi) btree_id, multi->page_las.las_pageid, multi->page_las.las_max_txn, hex_timestamp, - multi->page_las.las_skew_oldest? "oldest" : "youngest", + multi->page_las.las_skew_newest? "newest" : "oldest", WT_STAT_READ(conn->stats, cache_lookaside_entries), pct_dirty, pct_full); } diff --git a/src/include/btmem.h b/src/include/btmem.h index 54a0f7c3487..c5cdfe5850a 100644 --- a/src/include/btmem.h +++ b/src/include/btmem.h @@ -203,7 +203,7 @@ struct __wt_page_lookaside { lookaside */ WT_DECL_TIMESTAMP(min_timestamp) /* Min timestamp in lookaside */ WT_DECL_TIMESTAMP(onpage_timestamp) /* Max timestamp on page */ - bool las_skew_oldest; /* On-page skewed to oldest */ + bool las_skew_newest; /* On-page skewed to newest */ }; /* diff --git a/src/include/extern.h b/src/include/extern.h index ce9e1e57a47..272544b08f7 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -166,6 +166,7 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags #endif ); extern int __wt_bt_rebalance(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_value_return_upd(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd, bool ignore_visibility) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_key_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_value_return( WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_bt_salvage(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c index f0f9064d53e..233e0ec61f6 100644 --- a/src/reconcile/rec_write.c +++ b/src/reconcile/rec_write.c @@ -40,9 +40,9 @@ typedef struct { /* * Track the oldest running transaction and whether to skew lookaside - * to the newest or oldest update. + * to the newest update. */ - bool las_skew_oldest; + bool las_skew_newest; uint64_t last_running; /* Track the page's min/maximum transactions. */ @@ -907,6 +907,7 @@ __rec_init(WT_SESSION_IMPL *session, WT_PAGE *page; WT_RECONCILE *r; WT_TXN_GLOBAL *txn_global; + bool las_skew_oldest; btree = S2BT(session); page = ref->page; @@ -952,10 +953,13 @@ __rec_init(WT_SESSION_IMPL *session, */ txn_global = &S2C(session)->txn_global; if (__wt_btree_immediately_durable(session)) - r->las_skew_oldest = false; + las_skew_oldest = false; else - WT_ORDERED_READ(r->las_skew_oldest, + WT_ORDERED_READ(las_skew_oldest, txn_global->has_stable_timestamp); + r->las_skew_newest = LF_ISSET(WT_REC_LOOKASIDE) && + LF_ISSET(WT_REC_VISIBLE_ALL) && !las_skew_oldest; + WT_ORDERED_READ(r->last_running, txn_global->last_running); /* @@ -1344,8 +1348,7 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, * version (but we save enough information that checkpoint can * fix things up if we choose an update that is too new). */ - if (*updp == NULL && F_ISSET(r, WT_REC_LOOKASIDE) && - F_ISSET(r, WT_REC_VISIBLE_ALL) && !r->las_skew_oldest) + if (*updp == NULL && r->las_skew_newest) *updp = upd; if (F_ISSET(r, WT_REC_VISIBLE_ALL) ? @@ -1480,7 +1483,6 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, * unresolved updates, move the entire update list. */ WT_RET(__rec_update_save(session, r, ins, ripcip, *updp, upd_memsize)); - if (upd_savedp != NULL) *upd_savedp = true; @@ -1523,7 +1525,7 @@ check_original_value: * - or any reconciliation of a backing overflow record that will be * physically removed once it's no longer needed. */ - if (*updp != NULL && ((*updp)->type == WT_UPDATE_MODIFIED || + if (*updp != NULL && (!WT_UPDATE_DATA_VALUE(*updp) || F_ISSET(r, WT_REC_LOOKASIDE) || (vpack != NULL && vpack->ovfl && vpack->raw != WT_CELL_VALUE_OVFL_RM))) WT_RET( @@ -3386,7 +3388,7 @@ __rec_split_write_supd(WT_SESSION_IMPL *session, } done: /* Track the oldest timestamp seen so far. */ - multi->page_las.las_skew_oldest = r->las_skew_oldest; + multi->page_las.las_skew_newest = r->las_skew_newest; multi->page_las.las_max_txn = r->max_txn; WT_ASSERT(session, r->max_txn != WT_TXN_NONE); #ifdef HAVE_TIMESTAMPS @@ -4629,8 +4631,9 @@ record_loop: /* break; case WT_UPDATE_MODIFIED: cbt->slot = WT_COL_SLOT(page, cip); - WT_ERR(__wt_value_return( - session, cbt, upd)); + WT_ERR(__wt_value_return_upd( + session, cbt, upd, + F_ISSET(r, WT_REC_VISIBLE_ALL))); data = cbt->iface.value.data; size = (uint32_t)cbt->iface.value.size; update_no_copy = false; @@ -4873,8 +4876,9 @@ compare: /* * on-page item. */ cbt->slot = UINT32_MAX; - WT_ERR(__wt_value_return( - session, cbt, upd)); + WT_ERR(__wt_value_return_upd( + session, cbt, upd, + F_ISSET(r, WT_REC_VISIBLE_ALL))); data = cbt->iface.value.data; size = (uint32_t)cbt->iface.value.size; update_no_copy = false; @@ -5471,7 +5475,9 @@ __rec_row_leaf(WT_SESSION_IMPL *session, goto leaf_insert; case WT_UPDATE_MODIFIED: cbt->slot = WT_ROW_SLOT(page, rip); - WT_ERR(__wt_value_return(session, cbt, upd)); + WT_ERR(__wt_value_return_upd( + session, cbt, upd, + F_ISSET(r, WT_REC_VISIBLE_ALL))); WT_ERR(__rec_cell_build_val(session, r, cbt->iface.value.data, cbt->iface.value.size, (uint64_t)0)); @@ -5690,7 +5696,9 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins) * item. */ cbt->slot = UINT32_MAX; - WT_RET(__wt_value_return(session, cbt, upd)); + WT_RET(__wt_value_return_upd( + session, cbt, upd, + F_ISSET(r, WT_REC_VISIBLE_ALL))); WT_RET(__rec_cell_build_val(session, r, cbt->iface.value.data, cbt->iface.value.size, (uint64_t)0)); -- cgit v1.2.1