summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMichael Cahill <michael.cahill@mongodb.com>2017-11-29 17:22:07 +1100
committerAlex Gorrod <alexander.gorrod@mongodb.com>2017-11-29 17:22:07 +1100
commit6dcff54e40ce18729d14a9e96b1cbcb4fbc331fe (patch)
tree58494372b075b8c48c081a77479124ba12b59716
parent792bd7a3a976631e47568b4a4e34a09c2ab508dc (diff)
downloadmongo-6dcff54e40ce18729d14a9e96b1cbcb4fbc331fe.tar.gz
WT-3773 Fix a bug calculating on-disk images for modify updates. (#3817)
Previously, when evicting a page with modify updates, some of the modifications could be skipped when calculating the complete value to write to the new page. This could lead to updates being lost.
-rw-r--r--src/btree/bt_read.c6
-rw-r--r--src/btree/bt_ret.c13
-rw-r--r--src/cache/cache_las.c2
-rw-r--r--src/include/btmem.h2
-rw-r--r--src/include/extern.h1
-rw-r--r--src/reconcile/rec_write.c38
6 files changed, 36 insertions, 26 deletions
diff --git a/src/btree/bt_read.c b/src/btree/bt_read.c
index dd39610a3e2..19ff15fb21e 100644
--- a/src/btree/bt_read.c
+++ b/src/btree/bt_read.c
@@ -221,7 +221,7 @@ __las_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t btree_id)
*/
page->modify->first_dirty_txn = WT_TXN_FIRST;
- if (!ref->page_las->las_skew_oldest &&
+ if (ref->page_las->las_skew_newest &&
!S2C(session)->txn_global.has_stable_timestamp &&
__wt_txn_visible_all(session, ref->page_las->las_max_txn,
WT_TIMESTAMP_NULL(&ref->page_las->onpage_timestamp))) {
@@ -495,7 +495,7 @@ __las_page_skip(WT_SESSION_IMPL *session, WT_REF *ref)
goto done;
if (!F_ISSET(txn, WT_TXN_HAS_TS_READ) &&
- !ref->page_las->las_skew_oldest) {
+ ref->page_las->las_skew_newest) {
skip = true;
goto done;
}
@@ -511,7 +511,7 @@ __las_page_skip(WT_SESSION_IMPL *session, WT_REF *ref)
&session->txn.read_timestamp) <= 0);
if (F_ISSET(&session->txn, WT_TXN_HAS_TS_READ) &&
- ref->page_las->las_skew_oldest &&
+ !ref->page_las->las_skew_newest &&
__wt_timestamp_cmp(
&ref->page_las->min_timestamp, &session->txn.read_timestamp) > 0) {
skip = true;
diff --git a/src/btree/bt_ret.c b/src/btree/bt_ret.c
index d63b5884fef..b24a4e1db45 100644
--- a/src/btree/bt_ret.c
+++ b/src/btree/bt_ret.c
@@ -137,13 +137,13 @@ __value_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
#define WT_MODIFY_ARRAY_SIZE (WT_MAX_MODIFY_UPDATE + 10)
/*
- * __value_return_upd --
+ * __wt_value_return_upd --
* Change the cursor to reference an internal update structure return
* value.
*/
-static inline int
-__value_return_upd(
- WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd)
+int
+__wt_value_return_upd(WT_SESSION_IMPL *session,
+ WT_CURSOR_BTREE *cbt, WT_UPDATE *upd, bool ignore_visibility)
{
WT_CURSOR *cursor;
WT_DECL_RET;
@@ -173,7 +173,8 @@ __value_return_upd(
* that are visible to us.
*/
for (i = 0, listp = list; upd != NULL; upd = upd->next) {
- if (!__wt_txn_upd_visible(session, upd))
+ if (upd->txnid == WT_TXN_ABORTED ||
+ (!ignore_visibility && !__wt_txn_upd_visible(session, upd)))
continue;
if (WT_UPDATE_DATA_VALUE(upd))
@@ -273,7 +274,7 @@ __wt_value_return(
if (upd == NULL)
WT_RET(__value_return(session, cbt));
else
- WT_RET(__value_return_upd(session, cbt, upd));
+ WT_RET(__wt_value_return_upd(session, cbt, upd, false));
F_SET(cursor, WT_CURSTD_VALUE_INT);
return (0);
}
diff --git a/src/cache/cache_las.c b/src/cache/cache_las.c
index f7b62b5f809..e0d90ad836d 100644
--- a/src/cache/cache_las.c
+++ b/src/cache/cache_las.c
@@ -385,7 +385,7 @@ __las_insert_block_verbose(WT_SESSION_IMPL *session, WT_MULTI *multi)
btree_id, multi->page_las.las_pageid,
multi->page_las.las_max_txn,
hex_timestamp,
- multi->page_las.las_skew_oldest? "oldest" : "youngest",
+ multi->page_las.las_skew_newest? "newest" : "oldest",
WT_STAT_READ(conn->stats, cache_lookaside_entries),
pct_dirty, pct_full);
}
diff --git a/src/include/btmem.h b/src/include/btmem.h
index 54a0f7c3487..c5cdfe5850a 100644
--- a/src/include/btmem.h
+++ b/src/include/btmem.h
@@ -203,7 +203,7 @@ struct __wt_page_lookaside {
lookaside */
WT_DECL_TIMESTAMP(min_timestamp) /* Min timestamp in lookaside */
WT_DECL_TIMESTAMP(onpage_timestamp) /* Max timestamp on page */
- bool las_skew_oldest; /* On-page skewed to oldest */
+ bool las_skew_newest; /* On-page skewed to newest */
};
/*
diff --git a/src/include/extern.h b/src/include/extern.h
index ce9e1e57a47..272544b08f7 100644
--- a/src/include/extern.h
+++ b/src/include/extern.h
@@ -166,6 +166,7 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
#endif
);
extern int __wt_bt_rebalance(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_value_return_upd(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd, bool ignore_visibility) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_key_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_value_return( WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_bt_salvage(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c
index f0f9064d53e..233e0ec61f6 100644
--- a/src/reconcile/rec_write.c
+++ b/src/reconcile/rec_write.c
@@ -40,9 +40,9 @@ typedef struct {
/*
* Track the oldest running transaction and whether to skew lookaside
- * to the newest or oldest update.
+ * to the newest update.
*/
- bool las_skew_oldest;
+ bool las_skew_newest;
uint64_t last_running;
/* Track the page's min/maximum transactions. */
@@ -907,6 +907,7 @@ __rec_init(WT_SESSION_IMPL *session,
WT_PAGE *page;
WT_RECONCILE *r;
WT_TXN_GLOBAL *txn_global;
+ bool las_skew_oldest;
btree = S2BT(session);
page = ref->page;
@@ -952,10 +953,13 @@ __rec_init(WT_SESSION_IMPL *session,
*/
txn_global = &S2C(session)->txn_global;
if (__wt_btree_immediately_durable(session))
- r->las_skew_oldest = false;
+ las_skew_oldest = false;
else
- WT_ORDERED_READ(r->las_skew_oldest,
+ WT_ORDERED_READ(las_skew_oldest,
txn_global->has_stable_timestamp);
+ r->las_skew_newest = LF_ISSET(WT_REC_LOOKASIDE) &&
+ LF_ISSET(WT_REC_VISIBLE_ALL) && !las_skew_oldest;
+
WT_ORDERED_READ(r->last_running, txn_global->last_running);
/*
@@ -1344,8 +1348,7 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
* version (but we save enough information that checkpoint can
* fix things up if we choose an update that is too new).
*/
- if (*updp == NULL && F_ISSET(r, WT_REC_LOOKASIDE) &&
- F_ISSET(r, WT_REC_VISIBLE_ALL) && !r->las_skew_oldest)
+ if (*updp == NULL && r->las_skew_newest)
*updp = upd;
if (F_ISSET(r, WT_REC_VISIBLE_ALL) ?
@@ -1480,7 +1483,6 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
* unresolved updates, move the entire update list.
*/
WT_RET(__rec_update_save(session, r, ins, ripcip, *updp, upd_memsize));
-
if (upd_savedp != NULL)
*upd_savedp = true;
@@ -1523,7 +1525,7 @@ check_original_value:
* - or any reconciliation of a backing overflow record that will be
* physically removed once it's no longer needed.
*/
- if (*updp != NULL && ((*updp)->type == WT_UPDATE_MODIFIED ||
+ if (*updp != NULL && (!WT_UPDATE_DATA_VALUE(*updp) ||
F_ISSET(r, WT_REC_LOOKASIDE) || (vpack != NULL &&
vpack->ovfl && vpack->raw != WT_CELL_VALUE_OVFL_RM)))
WT_RET(
@@ -3386,7 +3388,7 @@ __rec_split_write_supd(WT_SESSION_IMPL *session,
}
done: /* Track the oldest timestamp seen so far. */
- multi->page_las.las_skew_oldest = r->las_skew_oldest;
+ multi->page_las.las_skew_newest = r->las_skew_newest;
multi->page_las.las_max_txn = r->max_txn;
WT_ASSERT(session, r->max_txn != WT_TXN_NONE);
#ifdef HAVE_TIMESTAMPS
@@ -4629,8 +4631,9 @@ record_loop: /*
break;
case WT_UPDATE_MODIFIED:
cbt->slot = WT_COL_SLOT(page, cip);
- WT_ERR(__wt_value_return(
- session, cbt, upd));
+ WT_ERR(__wt_value_return_upd(
+ session, cbt, upd,
+ F_ISSET(r, WT_REC_VISIBLE_ALL)));
data = cbt->iface.value.data;
size = (uint32_t)cbt->iface.value.size;
update_no_copy = false;
@@ -4873,8 +4876,9 @@ compare: /*
* on-page item.
*/
cbt->slot = UINT32_MAX;
- WT_ERR(__wt_value_return(
- session, cbt, upd));
+ WT_ERR(__wt_value_return_upd(
+ session, cbt, upd,
+ F_ISSET(r, WT_REC_VISIBLE_ALL)));
data = cbt->iface.value.data;
size = (uint32_t)cbt->iface.value.size;
update_no_copy = false;
@@ -5471,7 +5475,9 @@ __rec_row_leaf(WT_SESSION_IMPL *session,
goto leaf_insert;
case WT_UPDATE_MODIFIED:
cbt->slot = WT_ROW_SLOT(page, rip);
- WT_ERR(__wt_value_return(session, cbt, upd));
+ WT_ERR(__wt_value_return_upd(
+ session, cbt, upd,
+ F_ISSET(r, WT_REC_VISIBLE_ALL)));
WT_ERR(__rec_cell_build_val(session, r,
cbt->iface.value.data,
cbt->iface.value.size, (uint64_t)0));
@@ -5690,7 +5696,9 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins)
* item.
*/
cbt->slot = UINT32_MAX;
- WT_RET(__wt_value_return(session, cbt, upd));
+ WT_RET(__wt_value_return_upd(
+ session, cbt, upd,
+ F_ISSET(r, WT_REC_VISIBLE_ALL)));
WT_RET(__rec_cell_build_val(session, r,
cbt->iface.value.data,
cbt->iface.value.size, (uint64_t)0));