summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKeith Bostic <keith.bostic@mongodb.com>2017-08-17 22:33:47 -0400
committerMichael Cahill <michael.cahill@mongodb.com>2017-08-18 12:33:47 +1000
commitebbdceba0d2f91c0b3a513101fecffa23dd7278e (patch)
tree39613b847036532910131d4d9f550da3cfc1fd8e
parentc0b7568a8ac1208993f0da710ef0fdcd0ada655a (diff)
downloadmongo-ebbdceba0d2f91c0b3a513101fecffa23dd7278e.tar.gz
WT-3498 snapshot isolation failure with overflow records (#3591)
Rework reconciliation to cache overflow records required by readers as part of selecting the WT_UPDATE to be written. That way we know the cached record is associated with the appropriate update. Instead of caching a single copy of an on-page overflow value that's being removed, and referencing it from both the WT_OVFL_TRACK structure (for existing reader threads), and from all of the WT_UPDATE affected chains, cache a complete copy of the on-disk overflow value itself everywhere it might be needed. This requires 2x the memory when updating row-store overflow tables, and potentially more for column-store tables with RLEs greater than 1, but it has the virtue of working. Additionally, the memory used should be discarded relatively quickly if there's cache pressure: from both the update chains and from the WT_OVFL_TRACK structure whenever we attempt to evict the page.
-rw-r--r--src/btree/bt_debug.c85
-rw-r--r--src/btree/bt_discard.c3
-rw-r--r--src/btree/bt_ovfl.c176
-rw-r--r--src/btree/bt_read.c4
-rw-r--r--src/btree/bt_split.c8
-rw-r--r--src/btree/row_modify.c14
-rw-r--r--src/include/btmem.h14
-rw-r--r--src/include/extern.h3
-rw-r--r--src/reconcile/rec_write.c190
9 files changed, 221 insertions, 276 deletions
diff --git a/src/btree/bt_debug.c b/src/btree/bt_debug.c
index 0e907ef25a8..d91ac027738 100644
--- a/src/btree/bt_debug.c
+++ b/src/btree/bt_debug.c
@@ -39,7 +39,6 @@ static int __debug_col_skip(WT_DBG *, WT_INSERT_HEAD *, const char *, bool);
static int __debug_config(WT_SESSION_IMPL *, WT_DBG *, const char *);
static int __debug_dsk_cell(WT_DBG *, const WT_PAGE_HEADER *);
static int __debug_dsk_col_fix(WT_DBG *, const WT_PAGE_HEADER *);
-static int __debug_item(WT_DBG *, const char *, const void *, size_t);
static int __debug_page(WT_DBG *, WT_REF *, uint32_t);
static int __debug_page_col_fix(WT_DBG *, WT_REF *);
static int __debug_page_col_int(WT_DBG *, WT_PAGE *, uint32_t);
@@ -81,6 +80,41 @@ __debug_hex_byte(WT_DBG *ds, uint8_t v)
}
/*
+ * __debug_bytes --
+ * Dump a single set of bytes.
+ */
+static int
+__debug_bytes(WT_DBG *ds, const void *data_arg, size_t size)
+{
+ size_t i;
+ u_char ch;
+ const uint8_t *data;
+
+ for (data = data_arg, i = 0; i < size; ++i, ++data) {
+ ch = data[0];
+ if (__wt_isprint(ch))
+ WT_RET(ds->f(ds, "%c", (int)ch));
+ else
+ WT_RET(__debug_hex_byte(ds, data[0]));
+ }
+ return (0);
+}
+
+/*
+ * __debug_item --
+ * Dump a single data/size pair, with an optional tag.
+ */
+static int
+__debug_item(WT_DBG *ds, const char *tag, const void *data_arg, size_t size)
+{
+ WT_RET(ds->f(ds,
+ "\t%s%s{", tag == NULL ? "" : tag, tag == NULL ? "" : " "));
+ WT_RET(__debug_bytes(ds, data_arg, size));
+ WT_RET(ds->f(ds, "}\n"));
+ return (0);
+}
+
+/*
* __dmsg_event --
* Send a debug message to the event handler.
*/
@@ -993,23 +1027,26 @@ static int
__debug_modified(WT_DBG *ds, WT_UPDATE *upd)
{
const size_t *p;
- int nentries;
+ size_t nentries, data_size, offset, size;
const uint8_t *data;
- void *modify;
- modify = upd->data;
-
- p = modify;
- nentries = (int)*p++;
- data = (uint8_t *)modify +
+ p = (size_t *)upd->data;
+ memcpy(&nentries, p++, sizeof(size_t));
+ data = upd->data +
sizeof(size_t) + ((size_t)nentries * 3 * sizeof(size_t));
- WT_RET(ds->f(ds, "%d: ", nentries));
- for (; nentries-- > 0; data += p[0], p += 3)
+ WT_RET(ds->f(ds, "%" WT_SIZET_FMT ": ", nentries));
+ for (; nentries-- > 0; data += data_size) {
+ memcpy(&data_size, p++, sizeof(size_t));
+ memcpy(&offset, p++, sizeof(size_t));
+ memcpy(&size, p++, sizeof(size_t));
WT_RET(ds->f(ds,
"{%" WT_SIZET_FMT ", %" WT_SIZET_FMT ", %" WT_SIZET_FMT
- ", %.*s}%s", p[0], p[1], p[2],
- (int)p[2], data, nentries == 0 ? "" : ", "));
+ ", ",
+ data_size, offset, size));
+ WT_RET(__debug_bytes(ds, data, data_size));
+ WT_RET(ds->f(ds, "}%s", nentries == 0 ? "" : ", "));
+ }
return (0);
}
@@ -1243,28 +1280,4 @@ __debug_cell_data(WT_DBG *ds,
return (ret);
}
-
-/*
- * __debug_item --
- * Dump a single data/size pair, with an optional tag.
- */
-static int
-__debug_item(WT_DBG *ds, const char *tag, const void *data_arg, size_t size)
-{
- size_t i;
- u_char ch;
- const uint8_t *data;
-
- WT_RET(ds->f(ds,
- "\t%s%s{", tag == NULL ? "" : tag, tag == NULL ? "" : " "));
- for (data = data_arg, i = 0; i < size; ++i, ++data) {
- ch = data[0];
- if (__wt_isprint(ch))
- WT_RET(ds->f(ds, "%c", (int)ch));
- else
- WT_RET(__debug_hex_byte(ds, data[0]));
- }
- WT_RET(ds->f(ds, "}\n"));
- return (0);
-}
#endif
diff --git a/src/btree/bt_discard.c b/src/btree/bt_discard.c
index bc9356e2669..806a9770057 100644
--- a/src/btree/bt_discard.c
+++ b/src/btree/bt_discard.c
@@ -249,9 +249,8 @@ __free_page_modify(WT_SESSION_IMPL *session, WT_PAGE *page)
/* Free the overflow on-page, reuse and transaction-cache skiplists. */
__wt_ovfl_reuse_free(session, page);
- if (mod->ovfl_track != NULL)
- __wt_free(session, mod->ovfl_track->remove);
__wt_ovfl_discard_free(session, page);
+ __wt_ovfl_discard_remove(session, page);
__wt_free(session, page->modify->ovfl_track);
__wt_spin_destroy(session, &page->modify->page_lock);
diff --git a/src/btree/bt_ovfl.c b/src/btree/bt_ovfl.c
index f933245eaef..fab38f3cc8d 100644
--- a/src/btree/bt_ovfl.c
+++ b/src/btree/bt_ovfl.c
@@ -49,7 +49,6 @@ __wt_ovfl_read(WT_SESSION_IMPL *session,
{
WT_DECL_RET;
WT_OVFL_TRACK *track;
- WT_UPDATE *upd;
size_t i;
*decoded = false;
@@ -74,14 +73,13 @@ __wt_ovfl_read(WT_SESSION_IMPL *session,
__wt_readlock(session, &S2BT(session)->ovfl_lock);
if (__wt_cell_type_raw(unpack->cell) == WT_CELL_VALUE_OVFL_RM) {
track = page->modify->ovfl_track;
- for (upd = NULL, i = 0; i < track->remove_next; ++i)
+ for (i = 0; i < track->remove_next; ++i)
if (track->remove[i].cell == unpack->cell) {
- upd = track->remove[i].upd;
+ store->data = track->remove[i].data;
+ store->size = track->remove[i].size;
break;
}
WT_ASSERT(session, i < track->remove_next);
- store->data = upd->data;
- store->size = upd->size;
*decoded = true;
} else
ret = __ovfl_read(session, unpack->data, unpack->size, store);
@@ -91,134 +89,56 @@ __wt_ovfl_read(WT_SESSION_IMPL *session,
}
/*
- * __ovfl_cache_col_visible --
- * column-store: check for a globally visible update.
+ * __wt_ovfl_discard_remove --
+ * Free the on-page overflow value cache.
*/
-static bool
-__ovfl_cache_col_visible(
- WT_SESSION_IMPL *session, WT_UPDATE *upd, WT_CELL_UNPACK *unpack)
+void
+__wt_ovfl_discard_remove(WT_SESSION_IMPL *session, WT_PAGE *page)
{
- /*
- * Column-store is harder than row_store: we're here because there's a
- * reader in the system that might read the original version of an
- * overflow record, which might match a number of records. For example,
- * the original overflow value was for records 100-200, we've replaced
- * each of those records individually, but there exists a reader that
- * might read any one of those records, and all of those records have
- * different update entries with different transaction IDs. Since it's
- * infeasible to determine if there's a globally visible update for each
- * reader for each record, we test the simple case where a single record
- * has a single, globally visible update. If that's not the case, cache
- * the value.
- */
- if (__wt_cell_rle(unpack) == 1 &&
- WT_UPDATE_DATA_VALUE(upd) && __wt_txn_upd_visible_all(session, upd))
- return (true);
- return (false);
-}
-
-/*
- * __ovfl_cache_row_visible --
- * row-store: check for a globally visible update.
- */
-static bool
-__ovfl_cache_row_visible(WT_SESSION_IMPL *session, WT_UPDATE *upd)
-{
- /* Check to see if there's a globally visible update. */
- for (; upd != NULL; upd = upd->next)
- if (WT_UPDATE_DATA_VALUE(upd) &&
- __wt_txn_upd_visible_all(session, upd))
- return (true);
-
- return (false);
+ WT_OVFL_TRACK *track;
+ uint32_t i;
+
+ if (page->modify != NULL &&
+ (track = page->modify->ovfl_track) != NULL) {
+ for (i = 0; i < track->remove_next; ++i)
+ __wt_free(session, track->remove[i].data);
+ __wt_free(session, page->modify->ovfl_track->remove);
+ track->remove_allocated = 0;
+ track->remove_next = 0;
+ }
}
/*
- * __ovfl_cache_append_update --
- * Append an overflow value to the update list.
+ * __ovfl_cache --
+ * Cache an overflow value.
*/
static int
-__ovfl_cache_append_update(WT_SESSION_IMPL *session, WT_PAGE *page,
- WT_UPDATE *upd_list, WT_CELL_UNPACK *unpack, WT_UPDATE **updp)
+__ovfl_cache(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK *unpack)
{
WT_DECL_ITEM(tmp);
WT_DECL_RET;
- WT_UPDATE *append, *upd;
- size_t size;
-
- *updp = NULL;
+ WT_OVFL_TRACK *track;
/* Read the overflow value. */
WT_RET(__wt_scr_alloc(session, 1024, &tmp));
WT_ERR(__wt_dsk_cell_data_ref(session, page->type, unpack, tmp));
- /*
- * Create an update entry with no transaction ID to ensure global
- * visibility, append it to the update list.
- *
- * We don't need locks or barriers in this function: any thread reading
- * the update list will see our newly appended record or not, it doesn't
- * matter until the on-page cell is set to WT_CELL_VALUE_OVFL_RM. That
- * involves atomic operations which will act as our barrier. Regardless,
- * we update the page footprint as part of this operation, which acts as
- * a barrier as well.
- *
- * The update transaction ID choice is tricky, to work around an issue
- * in variable-length column store. Imagine an overflow value with an
- * RLE greater than 1. We append a copy to the end of an update chain,
- * but it's possible it's the overflow value for more than one record,
- * and appending it to the end of one record's update chain means a
- * subsequent enter of a globally visible value to one of the records
- * would allow the truncation of the overflow chain that leaves other
- * records without a value. If appending such an overflow record, set
- * the transaction ID to the first possible transaction ID. That ID is
- * old enough to be globally visible, but we can use it as a flag if an
- * update record cannot be discarded when truncating an update chain.
- */
- WT_ERR(__wt_update_alloc(
- session, tmp, &append, &size, WT_UPDATE_STANDARD));
- append->txnid = page->type == WT_PAGE_COL_VAR &&
- __wt_cell_rle(unpack) > 1 ? WT_TXN_FIRST : WT_TXN_NONE;
- for (upd = upd_list; upd->next != NULL; upd = upd->next)
- ;
- WT_PUBLISH(upd->next, append);
-
- __wt_cache_page_inmem_incr(session, page, size);
-
- *updp = append;
-
-err: __wt_scr_free(session, &tmp);
- return (ret);
-}
-
-/*
- * __ovfl_cache --
- * Cache an overflow value.
- */
-static int
-__ovfl_cache(WT_SESSION_IMPL *session,
- WT_PAGE *page, WT_UPDATE *upd_list, WT_CELL_UNPACK *unpack)
-{
- WT_OVFL_TRACK *track;
- WT_UPDATE *upd;
-
- /* Append a copy of the overflow value to the update list. */
- WT_RET(__ovfl_cache_append_update(
- session, page, upd_list, unpack, &upd));
-
/* Allocating tracking structures as necessary. */
if (page->modify->ovfl_track == NULL)
- WT_RET(__wt_ovfl_track_init(session, page));
+ WT_ERR(__wt_ovfl_track_init(session, page));
track = page->modify->ovfl_track;
- /* Add the value's information to the update list. */
- WT_RET(__wt_realloc_def(session,
+ /* Copy the overflow item into place. */
+ WT_ERR(__wt_realloc_def(session,
&track->remove_allocated, track->remove_next + 1, &track->remove));
track->remove[track->remove_next].cell = unpack->cell;
- track->remove[track->remove_next].upd = upd;
+ WT_ERR(__wt_strndup(session,
+ tmp->data, tmp->size, &track->remove[track->remove_next].data));
+ track->remove[track->remove_next].size = tmp->size;
++track->remove_next;
- return (0);
+err: __wt_scr_free(session, &tmp);
+ return (ret);
}
/*
@@ -227,12 +147,14 @@ __ovfl_cache(WT_SESSION_IMPL *session,
*/
int
__wt_ovfl_remove(WT_SESSION_IMPL *session,
- WT_PAGE *page, WT_UPDATE *upd_list, WT_CELL_UNPACK *unpack)
+ WT_PAGE *page, WT_CELL_UNPACK *unpack, bool checkpoint)
{
- bool visible;
-
/*
- * This function solves a problem in reconciliation. The scenario is:
+ * This function solves two problems in reconciliation.
+ *
+ * The first problem is snapshot readers needing on-page overflow values
+ * that have been removed. The scenario is as follows:
+ *
* - reconciling a leaf page that references an overflow item
* - the item is updated and the update committed
* - a checkpoint runs, freeing the backing overflow blocks
@@ -263,28 +185,16 @@ __wt_ovfl_remove(WT_SESSION_IMPL *session,
* per overflow item. We don't do any of that because overflow values
* are supposed to be rare and we shouldn't see contention for the lock.
*
- * Check for a globally visible update. If there is a globally visible
- * update, we don't need to cache the item because it's not possible for
- * a running thread to have moved past it.
- */
- switch (page->type) {
- case WT_PAGE_COL_VAR:
- visible = __ovfl_cache_col_visible(session, upd_list, unpack);
- break;
- case WT_PAGE_ROW_LEAF:
- visible = __ovfl_cache_row_visible(session, upd_list);
- break;
- WT_ILLEGAL_VALUE(session);
- }
-
- /*
- * If there's no globally visible update, there's a reader in the system
- * that might try and read the old value, cache it.
+ * We only have to do this for checkpoints: in any eviction mode, there
+ * can't be threads sitting in our update lists.
*/
- if (!visible)
- WT_RET(__ovfl_cache(session, page, upd_list, unpack));
+ if (checkpoint)
+ WT_RET(__ovfl_cache(session, page, unpack));
/*
+ * The second problem is to only remove the underlying blocks once,
+ * solved by the WT_CELL_VALUE_OVFL_RM flag.
+ *
* Queue the on-page cell to be set to WT_CELL_VALUE_OVFL_RM and the
* underlying overflow value's blocks to be freed when reconciliation
* completes.
diff --git a/src/btree/bt_read.c b/src/btree/bt_read.c
index 49b12b2d4e9..0c3cb026421 100644
--- a/src/btree/bt_read.c
+++ b/src/btree/bt_read.c
@@ -96,7 +96,7 @@ __col_instantiate(WT_SESSION_IMPL *session,
/* Search the page and add updates. */
WT_RET(__wt_col_search(session, recno, ref, cbt));
WT_RET(__wt_col_modify(
- session, cbt, recno, NULL, updlist, updlist->type, false));
+ session, cbt, recno, NULL, updlist, WT_UPDATE_INVALID, false));
return (0);
}
@@ -121,7 +121,7 @@ __row_instantiate(WT_SESSION_IMPL *session,
/* Search the page and add updates. */
WT_RET(__wt_row_search(session, key, ref, cbt, true));
WT_RET(__wt_row_modify(
- session, cbt, key, NULL, updlist, updlist->type, false));
+ session, cbt, key, NULL, updlist, WT_UPDATE_INVALID, false));
return (0);
}
diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c
index a0db4457f62..ac90d6693d3 100644
--- a/src/btree/bt_split.c
+++ b/src/btree/bt_split.c
@@ -1446,8 +1446,8 @@ __split_multi_inmem(
WT_ERR(__wt_col_search(session, recno, ref, &cbt));
/* Apply the modification. */
- WT_ERR(__wt_col_modify(
- session, &cbt, recno, NULL, upd, upd->type, true));
+ WT_ERR(__wt_col_modify(session, &cbt,
+ recno, NULL, upd, WT_UPDATE_INVALID, true));
break;
case WT_PAGE_ROW_LEAF:
/* Build a key. */
@@ -1468,8 +1468,8 @@ __split_multi_inmem(
WT_ERR(__wt_row_search(session, key, ref, &cbt, true));
/* Apply the modification. */
- WT_ERR(__wt_row_modify(
- session, &cbt, key, NULL, upd, upd->type, true));
+ WT_ERR(__wt_row_modify(session,
+ &cbt, key, NULL, upd, WT_UPDATE_INVALID, true));
break;
WT_ILLEGAL_VALUE_ERR(session);
}
diff --git a/src/btree/row_modify.c b/src/btree/row_modify.c
index 6e610b86376..5e84899999a 100644
--- a/src/btree/row_modify.c
+++ b/src/btree/row_modify.c
@@ -263,6 +263,13 @@ __wt_update_alloc(WT_SESSION_IMPL *session, const WT_ITEM *value,
*updp = NULL;
/*
+ * The code paths leading here are convoluted: assert we never attempt
+ * to allocate an update structure if only intending to insert one we
+ * already have.
+ */
+ WT_ASSERT(session, modify_type != WT_UPDATE_INVALID);
+
+ /*
* Allocate the WT_UPDATE structure and room for the value, then copy
* the value into place.
*/
@@ -304,14 +311,11 @@ __wt_update_obsolete_check(
* Walk the list of updates, looking for obsolete updates at the end.
*
* Only updates with globally visible, self-contained data can terminate
- * update chains, ignore modified and reserved updates. Special case the
- * first transaction ID, it flags column-store overflow values which can
- * never be discarded.
+ * update chains.
*/
for (first = NULL, count = 0; upd != NULL; upd = upd->next, count++)
if (WT_UPDATE_DATA_VALUE(upd) &&
- __wt_txn_upd_visible_all(session, upd) &&
- upd->txnid != WT_TXN_FIRST) {
+ __wt_txn_upd_visible_all(session, upd)) {
if (first == NULL)
first = upd;
} else if (upd->txnid != WT_TXN_ABORTED)
diff --git a/src/include/btmem.h b/src/include/btmem.h
index 094f6f2c680..f0d810281c2 100644
--- a/src/include/btmem.h
+++ b/src/include/btmem.h
@@ -386,8 +386,9 @@ struct __wt_page_modify {
/* Cached overflow value cell/update address pairs. */
struct {
- WT_CELL *cell;
- WT_UPDATE *upd;
+ WT_CELL *cell;
+ uint8_t *data;
+ size_t size;
} *remove;
size_t remove_allocated;
uint32_t remove_next;
@@ -895,10 +896,11 @@ struct __wt_update {
uint32_t size; /* data length */
-#define WT_UPDATE_DELETED 0 /* deleted */
-#define WT_UPDATE_MODIFIED 1 /* partial-update modify value */
-#define WT_UPDATE_RESERVED 2 /* reserved */
-#define WT_UPDATE_STANDARD 3 /* complete value */
+#define WT_UPDATE_INVALID 0 /* diagnostic check */
+#define WT_UPDATE_DELETED 1 /* deleted */
+#define WT_UPDATE_MODIFIED 2 /* partial-update modify value */
+#define WT_UPDATE_RESERVED 3 /* reserved */
+#define WT_UPDATE_STANDARD 4 /* complete value */
uint8_t type; /* type (one byte to conserve memory) */
/* If the update includes a complete value. */
diff --git a/src/include/extern.h b/src/include/extern.h
index 58de2f98030..9a86dbc1a26 100644
--- a/src/include/extern.h
+++ b/src/include/extern.h
@@ -150,7 +150,8 @@ extern const char *__wt_cell_type_string(uint8_t type);
extern const char *__wt_page_addr_string(WT_SESSION_IMPL *session, WT_REF *ref, WT_ITEM *buf);
extern const char *__wt_addr_string(WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size, WT_ITEM *buf);
extern int __wt_ovfl_read(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK *unpack, WT_ITEM *store, bool *decoded) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern int __wt_ovfl_remove(WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE *upd_list, WT_CELL_UNPACK *unpack) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern void __wt_ovfl_discard_remove(WT_SESSION_IMPL *session, WT_PAGE *page);
+extern int __wt_ovfl_remove(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK *unpack, bool checkpoint) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_ovfl_discard(WT_SESSION_IMPL *session, WT_CELL *cell) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_page_alloc(WT_SESSION_IMPL *session, uint8_t type, uint32_t alloc_entries, bool alloc_refs, WT_PAGE **pagep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_page_inmem(WT_SESSION_IMPL *session, WT_REF *ref, const void *image, size_t memsize, uint32_t flags, WT_PAGE **pagep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c
index a3cb6a53a09..1eeb99e0a05 100644
--- a/src/reconcile/rec_write.c
+++ b/src/reconcile/rec_write.c
@@ -506,6 +506,13 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref,
WT_TRET(session->block_manager_cleanup(session));
WT_TRET(__rec_destroy_session(session));
+
+ /*
+ * We track removed overflow objects in case there's a reader
+ * in transit when they're removed. Any form of eviction locks
+ * out readers, we can discard them all.
+ */
+ __wt_ovfl_discard_remove(session, page);
}
WT_RET(ret);
@@ -1194,6 +1201,64 @@ __rec_update_move(WT_SESSION_IMPL *session, WT_BOUNDARY *bnd, WT_SAVE_UPD *supd)
}
/*
+ * __rec_append_orig_value --
+ * Append the key's original value to its update list.
+ */
+static int
+__rec_append_orig_value(WT_SESSION_IMPL *session,
+ WT_PAGE *page, WT_UPDATE *upd_list, WT_CELL_UNPACK *unpack)
+{
+ WT_DECL_ITEM(tmp);
+ WT_DECL_RET;
+ WT_UPDATE *append, *upd;
+ size_t size;
+
+ /* If at least one standard update is globally visible, we're done. */
+ for (upd = upd_list; upd != NULL; upd = upd->next)
+ if (WT_UPDATE_DATA_VALUE(upd) &&
+ __wt_txn_upd_visible_all(session, upd))
+ return (0);
+
+ /*
+ * We need the original on-page value for some reader: get a copy and
+ * append it to the end of the update list with a transaction ID that
+ * guarantees its visibility.
+ *
+ * If we don't have a value cell, it's an insert/append list key/value
+ * pair which simply doesn't exist for some reader; place a deleted
+ * record at the end of the update list.
+ */
+ append = NULL; /* -Wconditional-uninitialized */
+ size = 0; /* -Wconditional-uninitialized */
+ if (unpack == NULL || unpack->type == WT_CELL_DEL)
+ WT_RET(__wt_update_alloc(session,
+ NULL, &append, &size, WT_UPDATE_DELETED));
+ else {
+ WT_RET(__wt_scr_alloc(session, 0, &tmp));
+ WT_ERR(__wt_page_cell_data_ref(session, page, unpack, tmp));
+ WT_ERR(__wt_update_alloc(
+ session, tmp, &append, &size, WT_UPDATE_STANDARD));
+ }
+
+ /*
+ * Give the entry no transaction ID to ensure global visibility, append
+ * it to the update list.
+ *
+ * Note the change to the actual reader-accessible update list: from now
+ * on, the original on-page value appears at the end of the update list,
+ * even if this reconciliation subsequently fails.
+ */
+ append->txnid = WT_TXN_NONE;
+ for (upd = upd_list; upd->next != NULL; upd = upd->next)
+ ;
+ WT_PUBLISH(upd->next, append);
+ __wt_cache_page_inmem_incr(session, page, size);
+
+err: __wt_scr_free(session, &tmp);
+ return (ret);
+}
+
+/*
* __rec_txn_read --
* Return the update in a list that should be written (or NULL if none can
* be written).
@@ -1203,18 +1268,14 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
WT_INSERT *ins, void *ripcip, WT_CELL_UNPACK *vpack, WT_UPDATE **updp)
{
WT_BTREE *btree;
- WT_DECL_RET;
- WT_DECL_ITEM(tmp);
- WT_DECL_TIMESTAMP(min_timestamp)
WT_DECL_TIMESTAMP(max_timestamp)
WT_PAGE *page;
- WT_UPDATE *append, *upd, *upd_list;
- size_t size, update_mem;
- uint64_t max_txn, min_txn, txnid;
- bool append_origv, skipped;
+ WT_UPDATE *upd, *upd_list;
+ size_t update_mem;
+ uint64_t max_txn, txnid;
+ bool skipped;
*updp = NULL;
- append = NULL; /* -Wconditional-uninitialized */
btree = S2BT(session);
page = r->page;
@@ -1235,9 +1296,7 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
max_txn = WT_TXN_NONE;
#ifdef HAVE_TIMESTAMPS
__wt_timestamp_set_zero(&max_timestamp);
- __wt_timestamp_set_inf(&min_timestamp);
#endif
- min_txn = UINT64_MAX;
if (F_ISSET(r, WT_EVICTING)) {
/* Discard obsolete updates. */
@@ -1258,8 +1317,6 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
*/
if (WT_TXNID_LT(max_txn, txnid))
max_txn = txnid;
- if (WT_TXNID_LT(txnid, min_txn))
- min_txn = txnid;
/*
* Find the first update we can use.
@@ -1291,11 +1348,6 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
&max_timestamp, &upd->timestamp) < 0)
__wt_timestamp_set(
&max_timestamp, &upd->timestamp);
-
- if (__wt_timestamp_cmp(
- &min_timestamp, &upd->timestamp) > 0)
- __wt_timestamp_set(
- &min_timestamp, &upd->timestamp);
#endif
}
} else
@@ -1325,7 +1377,7 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
}
}
- /* Reconciliation should never see a reserved update. */
+ /* Reconciliation should never see an aborted or reserved update. */
WT_ASSERT(session, *updp == NULL ||
((*updp)->txnid != WT_TXN_ABORTED &&
(*updp)->type != WT_UPDATE_RESERVED));
@@ -1370,18 +1422,17 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
if (!skipped && (F_ISSET(btree, WT_BTREE_LOOKASIDE) ||
__wt_txn_visible_all(session,
max_txn, WT_TIMESTAMP_NULL(&max_timestamp)))) {
-#ifdef HAVE_DIAGNOSTIC
/*
* The checkpoint transaction is special. Make sure we never
* write (metadata) updates from a checkpoint in a concurrent
* session.
*/
- txnid = *updp == NULL ? WT_TXN_NONE : (*updp)->txnid;
- WT_ASSERT(session, txnid == WT_TXN_NONE ||
- txnid != S2C(session)->txn_global.checkpoint_state.id ||
+ WT_ASSERT(session, *updp == NULL ||
+ (*updp)->txnid !=
+ S2C(session)->txn_global.checkpoint_state.id ||
WT_SESSION_IS_CHECKPOINT(session));
-#endif
- return (0);
+
+ goto check_original_value;
}
/*
@@ -1400,7 +1451,7 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
*/
if (!F_ISSET(r, WT_EVICTING)) {
r->leave_dirty = true;
- return (0);
+ goto check_original_value;
}
/*
@@ -1441,7 +1492,6 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
if (skipped)
r->update_mem_uncommitted += update_mem;
- append_origv = false;
if (F_ISSET(r, WT_EVICT_UPDATE_RESTORE)) {
/*
* The save/restore eviction path.
@@ -1456,58 +1506,6 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
/* The page can't be marked clean. */
r->leave_dirty = true;
- } else {
- /*
- * The lookaside table eviction path.
- *
- * If at least one update is globally visible, copy the update
- * list and ignore the current on-page value. If no update is
- * globally visible, readers require the page's original value.
- */
- if (!__wt_txn_visible_all(
- session, min_txn, WT_TIMESTAMP_NULL(&min_timestamp)))
- append_origv = true;
- }
-
- /*
- * We need the original on-page value for some reason: get a copy and
- * append it to the end of the update list with a transaction ID that
- * guarantees its visibility.
- */
- if (append_origv) {
- /*
- * If we don't have a value cell, it's an insert/append list
- * key/value pair which simply doesn't exist for some reader;
- * place a deleted record at the end of the update list.
- */
- size = 0; /* -Wconditional-uninitialized */
- if (vpack == NULL || vpack->type == WT_CELL_DEL)
- WT_RET(__wt_update_alloc(session,
- NULL, &append, &size, WT_UPDATE_DELETED));
- else {
- WT_RET(__wt_scr_alloc(session, 0, &tmp));
- if ((ret = __wt_page_cell_data_ref(
- session, page, vpack, tmp)) == 0)
- ret = __wt_update_alloc(session,
- tmp, &append, &size, WT_UPDATE_STANDARD);
- __wt_scr_free(session, &tmp);
- WT_RET(ret);
- }
-
- /*
- * Give the entry no transaction ID to ensure global visibility,
- * append it to the update list.
- *
- * Note the change to the actual reader-accessible update list:
- * from now on, the original on-page value appears at the end
- * of the update list, even if this reconciliation subsequently
- * fails.
- */
- append->txnid = WT_TXN_NONE;
- for (upd = upd_list; upd->next != NULL; upd = upd->next)
- ;
- WT_PUBLISH(upd->next, append);
- __wt_cache_page_inmem_incr(session, page, size);
}
/*
@@ -1521,7 +1519,23 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
* that transaction ID is globally visible, we know we no longer need
* the lookaside table records, allowing them to be discarded.
*/
- return (__rec_update_save(session, r, ins, ripcip, *updp));
+ WT_RET(__rec_update_save(session, r, ins, ripcip, *updp));
+
+check_original_value:
+ /*
+ * Returning an update means the original on-page value might be lost,
+ * and that's a problem if there's a reader that needs it. There are
+ * two cases: any lookaside table eviction (because the backing disk
+ * image is rewritten), or any reconciliation of a backing overflow
+ * record that will be physically removed once it's no longer needed.
+ */
+ if (*updp != NULL &&
+ (F_ISSET(r, WT_EVICT_LOOKASIDE) ||
+ (vpack != NULL &&
+ vpack->ovfl && vpack->raw != WT_CELL_VALUE_OVFL_RM)))
+ WT_RET(__rec_append_orig_value(session, page, *updp, vpack));
+
+ return (0);
}
/*
@@ -4708,7 +4722,7 @@ __rec_col_var(WT_SESSION_IMPL *session,
* file, otherwise we'll leak blocks on the checkpoint.
* That's safe because if the backing overflow value is
* still needed by any running transaction, we'll cache
- * a copy in the reconciliation tracking structures.
+ * a copy in the update list.
*
* Regardless, we avoid copying in overflow records: if
* there's a WT_INSERT entry that modifies a reference
@@ -4793,8 +4807,8 @@ record_loop: /*
* The on-page value will never be accessed,
* write a placeholder record.
*/
- data = "@";
- size = 1;
+ data = "ovfl-unused";
+ size = WT_STORE_SIZE(strlen("ovfl-unused"));
} else {
update_no_copy = false; /* Maybe data copy */
@@ -4928,7 +4942,8 @@ compare: /*
*/
if (ovfl_state == OVFL_UNUSED &&
vpack->raw != WT_CELL_VALUE_OVFL_RM)
- WT_ERR(__wt_ovfl_remove(session, page, upd, vpack));
+ WT_ERR(__wt_ovfl_remove(
+ session, page, vpack, !F_ISSET(r, WT_EVICTING)));
}
/* Walk any append list. */
@@ -5535,8 +5550,9 @@ __rec_row_leaf(WT_SESSION_IMPL *session,
* The on-page value will never be accessed,
* write a placeholder record.
*/
- WT_ERR(__rec_cell_build_val(
- session, r, "@", 1, (uint64_t)0));
+ WT_ERR(__rec_cell_build_val(session, r,
+ "ovfl-unused", strlen("ovfl-unused"),
+ (uint64_t)0));
} else {
val->buf.data = val_cell;
val->buf.size = __wt_cell_total_len(vpack);
@@ -5554,8 +5570,8 @@ __rec_row_leaf(WT_SESSION_IMPL *session,
*/
if (vpack != NULL &&
vpack->ovfl && vpack->raw != WT_CELL_VALUE_OVFL_RM)
- WT_ERR(__wt_ovfl_remove(
- session, page, upd, vpack));
+ WT_ERR(__wt_ovfl_remove(session,
+ page, vpack, !F_ISSET(r, WT_EVICTING)));
switch (upd->type) {
case WT_UPDATE_DELETED: