diff options
author | Keith Bostic <keith.bostic@mongodb.com> | 2017-08-17 22:33:47 -0400 |
---|---|---|
committer | Michael Cahill <michael.cahill@mongodb.com> | 2017-08-18 12:33:47 +1000 |
commit | ebbdceba0d2f91c0b3a513101fecffa23dd7278e (patch) | |
tree | 39613b847036532910131d4d9f550da3cfc1fd8e | |
parent | c0b7568a8ac1208993f0da710ef0fdcd0ada655a (diff) | |
download | mongo-ebbdceba0d2f91c0b3a513101fecffa23dd7278e.tar.gz |
WT-3498 snapshot isolation failure with overflow records (#3591)
Rework reconciliation to cache overflow records required by readers as part of selecting the WT_UPDATE to be written. That way we know the cached record is associated with the appropriate update.
Instead of caching a single copy of an on-page overflow value that's being removed, and referencing it from both the WT_OVFL_TRACK structure (for existing reader threads), and from all of the WT_UPDATE affected chains, cache a complete copy of the on-disk overflow value itself everywhere it might be needed. This requires 2x the memory when updating row-store overflow tables, and potentially more for column-store tables with RLEs greater than 1, but it has the virtue of working. Additionally, the memory used should be discarded relatively quickly if
there's cache pressure: from both the update chains and from the WT_OVFL_TRACK structure whenever we attempt to evict the page.
-rw-r--r-- | src/btree/bt_debug.c | 85 | ||||
-rw-r--r-- | src/btree/bt_discard.c | 3 | ||||
-rw-r--r-- | src/btree/bt_ovfl.c | 176 | ||||
-rw-r--r-- | src/btree/bt_read.c | 4 | ||||
-rw-r--r-- | src/btree/bt_split.c | 8 | ||||
-rw-r--r-- | src/btree/row_modify.c | 14 | ||||
-rw-r--r-- | src/include/btmem.h | 14 | ||||
-rw-r--r-- | src/include/extern.h | 3 | ||||
-rw-r--r-- | src/reconcile/rec_write.c | 190 |
9 files changed, 221 insertions, 276 deletions
diff --git a/src/btree/bt_debug.c b/src/btree/bt_debug.c index 0e907ef25a8..d91ac027738 100644 --- a/src/btree/bt_debug.c +++ b/src/btree/bt_debug.c @@ -39,7 +39,6 @@ static int __debug_col_skip(WT_DBG *, WT_INSERT_HEAD *, const char *, bool); static int __debug_config(WT_SESSION_IMPL *, WT_DBG *, const char *); static int __debug_dsk_cell(WT_DBG *, const WT_PAGE_HEADER *); static int __debug_dsk_col_fix(WT_DBG *, const WT_PAGE_HEADER *); -static int __debug_item(WT_DBG *, const char *, const void *, size_t); static int __debug_page(WT_DBG *, WT_REF *, uint32_t); static int __debug_page_col_fix(WT_DBG *, WT_REF *); static int __debug_page_col_int(WT_DBG *, WT_PAGE *, uint32_t); @@ -81,6 +80,41 @@ __debug_hex_byte(WT_DBG *ds, uint8_t v) } /* + * __debug_bytes -- + * Dump a single set of bytes. + */ +static int +__debug_bytes(WT_DBG *ds, const void *data_arg, size_t size) +{ + size_t i; + u_char ch; + const uint8_t *data; + + for (data = data_arg, i = 0; i < size; ++i, ++data) { + ch = data[0]; + if (__wt_isprint(ch)) + WT_RET(ds->f(ds, "%c", (int)ch)); + else + WT_RET(__debug_hex_byte(ds, data[0])); + } + return (0); +} + +/* + * __debug_item -- + * Dump a single data/size pair, with an optional tag. + */ +static int +__debug_item(WT_DBG *ds, const char *tag, const void *data_arg, size_t size) +{ + WT_RET(ds->f(ds, + "\t%s%s{", tag == NULL ? "" : tag, tag == NULL ? "" : " ")); + WT_RET(__debug_bytes(ds, data_arg, size)); + WT_RET(ds->f(ds, "}\n")); + return (0); +} + +/* * __dmsg_event -- * Send a debug message to the event handler. */ @@ -993,23 +1027,26 @@ static int __debug_modified(WT_DBG *ds, WT_UPDATE *upd) { const size_t *p; - int nentries; + size_t nentries, data_size, offset, size; const uint8_t *data; - void *modify; - modify = upd->data; - - p = modify; - nentries = (int)*p++; - data = (uint8_t *)modify + + p = (size_t *)upd->data; + memcpy(&nentries, p++, sizeof(size_t)); + data = upd->data + sizeof(size_t) + ((size_t)nentries * 3 * sizeof(size_t)); - WT_RET(ds->f(ds, "%d: ", nentries)); - for (; nentries-- > 0; data += p[0], p += 3) + WT_RET(ds->f(ds, "%" WT_SIZET_FMT ": ", nentries)); + for (; nentries-- > 0; data += data_size) { + memcpy(&data_size, p++, sizeof(size_t)); + memcpy(&offset, p++, sizeof(size_t)); + memcpy(&size, p++, sizeof(size_t)); WT_RET(ds->f(ds, "{%" WT_SIZET_FMT ", %" WT_SIZET_FMT ", %" WT_SIZET_FMT - ", %.*s}%s", p[0], p[1], p[2], - (int)p[2], data, nentries == 0 ? "" : ", ")); + ", ", + data_size, offset, size)); + WT_RET(__debug_bytes(ds, data, data_size)); + WT_RET(ds->f(ds, "}%s", nentries == 0 ? "" : ", ")); + } return (0); } @@ -1243,28 +1280,4 @@ __debug_cell_data(WT_DBG *ds, return (ret); } - -/* - * __debug_item -- - * Dump a single data/size pair, with an optional tag. - */ -static int -__debug_item(WT_DBG *ds, const char *tag, const void *data_arg, size_t size) -{ - size_t i; - u_char ch; - const uint8_t *data; - - WT_RET(ds->f(ds, - "\t%s%s{", tag == NULL ? "" : tag, tag == NULL ? "" : " ")); - for (data = data_arg, i = 0; i < size; ++i, ++data) { - ch = data[0]; - if (__wt_isprint(ch)) - WT_RET(ds->f(ds, "%c", (int)ch)); - else - WT_RET(__debug_hex_byte(ds, data[0])); - } - WT_RET(ds->f(ds, "}\n")); - return (0); -} #endif diff --git a/src/btree/bt_discard.c b/src/btree/bt_discard.c index bc9356e2669..806a9770057 100644 --- a/src/btree/bt_discard.c +++ b/src/btree/bt_discard.c @@ -249,9 +249,8 @@ __free_page_modify(WT_SESSION_IMPL *session, WT_PAGE *page) /* Free the overflow on-page, reuse and transaction-cache skiplists. */ __wt_ovfl_reuse_free(session, page); - if (mod->ovfl_track != NULL) - __wt_free(session, mod->ovfl_track->remove); __wt_ovfl_discard_free(session, page); + __wt_ovfl_discard_remove(session, page); __wt_free(session, page->modify->ovfl_track); __wt_spin_destroy(session, &page->modify->page_lock); diff --git a/src/btree/bt_ovfl.c b/src/btree/bt_ovfl.c index f933245eaef..fab38f3cc8d 100644 --- a/src/btree/bt_ovfl.c +++ b/src/btree/bt_ovfl.c @@ -49,7 +49,6 @@ __wt_ovfl_read(WT_SESSION_IMPL *session, { WT_DECL_RET; WT_OVFL_TRACK *track; - WT_UPDATE *upd; size_t i; *decoded = false; @@ -74,14 +73,13 @@ __wt_ovfl_read(WT_SESSION_IMPL *session, __wt_readlock(session, &S2BT(session)->ovfl_lock); if (__wt_cell_type_raw(unpack->cell) == WT_CELL_VALUE_OVFL_RM) { track = page->modify->ovfl_track; - for (upd = NULL, i = 0; i < track->remove_next; ++i) + for (i = 0; i < track->remove_next; ++i) if (track->remove[i].cell == unpack->cell) { - upd = track->remove[i].upd; + store->data = track->remove[i].data; + store->size = track->remove[i].size; break; } WT_ASSERT(session, i < track->remove_next); - store->data = upd->data; - store->size = upd->size; *decoded = true; } else ret = __ovfl_read(session, unpack->data, unpack->size, store); @@ -91,134 +89,56 @@ __wt_ovfl_read(WT_SESSION_IMPL *session, } /* - * __ovfl_cache_col_visible -- - * column-store: check for a globally visible update. + * __wt_ovfl_discard_remove -- + * Free the on-page overflow value cache. */ -static bool -__ovfl_cache_col_visible( - WT_SESSION_IMPL *session, WT_UPDATE *upd, WT_CELL_UNPACK *unpack) +void +__wt_ovfl_discard_remove(WT_SESSION_IMPL *session, WT_PAGE *page) { - /* - * Column-store is harder than row_store: we're here because there's a - * reader in the system that might read the original version of an - * overflow record, which might match a number of records. For example, - * the original overflow value was for records 100-200, we've replaced - * each of those records individually, but there exists a reader that - * might read any one of those records, and all of those records have - * different update entries with different transaction IDs. Since it's - * infeasible to determine if there's a globally visible update for each - * reader for each record, we test the simple case where a single record - * has a single, globally visible update. If that's not the case, cache - * the value. - */ - if (__wt_cell_rle(unpack) == 1 && - WT_UPDATE_DATA_VALUE(upd) && __wt_txn_upd_visible_all(session, upd)) - return (true); - return (false); -} - -/* - * __ovfl_cache_row_visible -- - * row-store: check for a globally visible update. - */ -static bool -__ovfl_cache_row_visible(WT_SESSION_IMPL *session, WT_UPDATE *upd) -{ - /* Check to see if there's a globally visible update. */ - for (; upd != NULL; upd = upd->next) - if (WT_UPDATE_DATA_VALUE(upd) && - __wt_txn_upd_visible_all(session, upd)) - return (true); - - return (false); + WT_OVFL_TRACK *track; + uint32_t i; + + if (page->modify != NULL && + (track = page->modify->ovfl_track) != NULL) { + for (i = 0; i < track->remove_next; ++i) + __wt_free(session, track->remove[i].data); + __wt_free(session, page->modify->ovfl_track->remove); + track->remove_allocated = 0; + track->remove_next = 0; + } } /* - * __ovfl_cache_append_update -- - * Append an overflow value to the update list. + * __ovfl_cache -- + * Cache an overflow value. */ static int -__ovfl_cache_append_update(WT_SESSION_IMPL *session, WT_PAGE *page, - WT_UPDATE *upd_list, WT_CELL_UNPACK *unpack, WT_UPDATE **updp) +__ovfl_cache(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK *unpack) { WT_DECL_ITEM(tmp); WT_DECL_RET; - WT_UPDATE *append, *upd; - size_t size; - - *updp = NULL; + WT_OVFL_TRACK *track; /* Read the overflow value. */ WT_RET(__wt_scr_alloc(session, 1024, &tmp)); WT_ERR(__wt_dsk_cell_data_ref(session, page->type, unpack, tmp)); - /* - * Create an update entry with no transaction ID to ensure global - * visibility, append it to the update list. - * - * We don't need locks or barriers in this function: any thread reading - * the update list will see our newly appended record or not, it doesn't - * matter until the on-page cell is set to WT_CELL_VALUE_OVFL_RM. That - * involves atomic operations which will act as our barrier. Regardless, - * we update the page footprint as part of this operation, which acts as - * a barrier as well. - * - * The update transaction ID choice is tricky, to work around an issue - * in variable-length column store. Imagine an overflow value with an - * RLE greater than 1. We append a copy to the end of an update chain, - * but it's possible it's the overflow value for more than one record, - * and appending it to the end of one record's update chain means a - * subsequent enter of a globally visible value to one of the records - * would allow the truncation of the overflow chain that leaves other - * records without a value. If appending such an overflow record, set - * the transaction ID to the first possible transaction ID. That ID is - * old enough to be globally visible, but we can use it as a flag if an - * update record cannot be discarded when truncating an update chain. - */ - WT_ERR(__wt_update_alloc( - session, tmp, &append, &size, WT_UPDATE_STANDARD)); - append->txnid = page->type == WT_PAGE_COL_VAR && - __wt_cell_rle(unpack) > 1 ? WT_TXN_FIRST : WT_TXN_NONE; - for (upd = upd_list; upd->next != NULL; upd = upd->next) - ; - WT_PUBLISH(upd->next, append); - - __wt_cache_page_inmem_incr(session, page, size); - - *updp = append; - -err: __wt_scr_free(session, &tmp); - return (ret); -} - -/* - * __ovfl_cache -- - * Cache an overflow value. - */ -static int -__ovfl_cache(WT_SESSION_IMPL *session, - WT_PAGE *page, WT_UPDATE *upd_list, WT_CELL_UNPACK *unpack) -{ - WT_OVFL_TRACK *track; - WT_UPDATE *upd; - - /* Append a copy of the overflow value to the update list. */ - WT_RET(__ovfl_cache_append_update( - session, page, upd_list, unpack, &upd)); - /* Allocating tracking structures as necessary. */ if (page->modify->ovfl_track == NULL) - WT_RET(__wt_ovfl_track_init(session, page)); + WT_ERR(__wt_ovfl_track_init(session, page)); track = page->modify->ovfl_track; - /* Add the value's information to the update list. */ - WT_RET(__wt_realloc_def(session, + /* Copy the overflow item into place. */ + WT_ERR(__wt_realloc_def(session, &track->remove_allocated, track->remove_next + 1, &track->remove)); track->remove[track->remove_next].cell = unpack->cell; - track->remove[track->remove_next].upd = upd; + WT_ERR(__wt_strndup(session, + tmp->data, tmp->size, &track->remove[track->remove_next].data)); + track->remove[track->remove_next].size = tmp->size; ++track->remove_next; - return (0); +err: __wt_scr_free(session, &tmp); + return (ret); } /* @@ -227,12 +147,14 @@ __ovfl_cache(WT_SESSION_IMPL *session, */ int __wt_ovfl_remove(WT_SESSION_IMPL *session, - WT_PAGE *page, WT_UPDATE *upd_list, WT_CELL_UNPACK *unpack) + WT_PAGE *page, WT_CELL_UNPACK *unpack, bool checkpoint) { - bool visible; - /* - * This function solves a problem in reconciliation. The scenario is: + * This function solves two problems in reconciliation. + * + * The first problem is snapshot readers needing on-page overflow values + * that have been removed. The scenario is as follows: + * * - reconciling a leaf page that references an overflow item * - the item is updated and the update committed * - a checkpoint runs, freeing the backing overflow blocks @@ -263,28 +185,16 @@ __wt_ovfl_remove(WT_SESSION_IMPL *session, * per overflow item. We don't do any of that because overflow values * are supposed to be rare and we shouldn't see contention for the lock. * - * Check for a globally visible update. If there is a globally visible - * update, we don't need to cache the item because it's not possible for - * a running thread to have moved past it. - */ - switch (page->type) { - case WT_PAGE_COL_VAR: - visible = __ovfl_cache_col_visible(session, upd_list, unpack); - break; - case WT_PAGE_ROW_LEAF: - visible = __ovfl_cache_row_visible(session, upd_list); - break; - WT_ILLEGAL_VALUE(session); - } - - /* - * If there's no globally visible update, there's a reader in the system - * that might try and read the old value, cache it. + * We only have to do this for checkpoints: in any eviction mode, there + * can't be threads sitting in our update lists. */ - if (!visible) - WT_RET(__ovfl_cache(session, page, upd_list, unpack)); + if (checkpoint) + WT_RET(__ovfl_cache(session, page, unpack)); /* + * The second problem is to only remove the underlying blocks once, + * solved by the WT_CELL_VALUE_OVFL_RM flag. + * * Queue the on-page cell to be set to WT_CELL_VALUE_OVFL_RM and the * underlying overflow value's blocks to be freed when reconciliation * completes. diff --git a/src/btree/bt_read.c b/src/btree/bt_read.c index 49b12b2d4e9..0c3cb026421 100644 --- a/src/btree/bt_read.c +++ b/src/btree/bt_read.c @@ -96,7 +96,7 @@ __col_instantiate(WT_SESSION_IMPL *session, /* Search the page and add updates. */ WT_RET(__wt_col_search(session, recno, ref, cbt)); WT_RET(__wt_col_modify( - session, cbt, recno, NULL, updlist, updlist->type, false)); + session, cbt, recno, NULL, updlist, WT_UPDATE_INVALID, false)); return (0); } @@ -121,7 +121,7 @@ __row_instantiate(WT_SESSION_IMPL *session, /* Search the page and add updates. */ WT_RET(__wt_row_search(session, key, ref, cbt, true)); WT_RET(__wt_row_modify( - session, cbt, key, NULL, updlist, updlist->type, false)); + session, cbt, key, NULL, updlist, WT_UPDATE_INVALID, false)); return (0); } diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c index a0db4457f62..ac90d6693d3 100644 --- a/src/btree/bt_split.c +++ b/src/btree/bt_split.c @@ -1446,8 +1446,8 @@ __split_multi_inmem( WT_ERR(__wt_col_search(session, recno, ref, &cbt)); /* Apply the modification. */ - WT_ERR(__wt_col_modify( - session, &cbt, recno, NULL, upd, upd->type, true)); + WT_ERR(__wt_col_modify(session, &cbt, + recno, NULL, upd, WT_UPDATE_INVALID, true)); break; case WT_PAGE_ROW_LEAF: /* Build a key. */ @@ -1468,8 +1468,8 @@ __split_multi_inmem( WT_ERR(__wt_row_search(session, key, ref, &cbt, true)); /* Apply the modification. */ - WT_ERR(__wt_row_modify( - session, &cbt, key, NULL, upd, upd->type, true)); + WT_ERR(__wt_row_modify(session, + &cbt, key, NULL, upd, WT_UPDATE_INVALID, true)); break; WT_ILLEGAL_VALUE_ERR(session); } diff --git a/src/btree/row_modify.c b/src/btree/row_modify.c index 6e610b86376..5e84899999a 100644 --- a/src/btree/row_modify.c +++ b/src/btree/row_modify.c @@ -263,6 +263,13 @@ __wt_update_alloc(WT_SESSION_IMPL *session, const WT_ITEM *value, *updp = NULL; /* + * The code paths leading here are convoluted: assert we never attempt + * to allocate an update structure if only intending to insert one we + * already have. + */ + WT_ASSERT(session, modify_type != WT_UPDATE_INVALID); + + /* * Allocate the WT_UPDATE structure and room for the value, then copy * the value into place. */ @@ -304,14 +311,11 @@ __wt_update_obsolete_check( * Walk the list of updates, looking for obsolete updates at the end. * * Only updates with globally visible, self-contained data can terminate - * update chains, ignore modified and reserved updates. Special case the - * first transaction ID, it flags column-store overflow values which can - * never be discarded. + * update chains. */ for (first = NULL, count = 0; upd != NULL; upd = upd->next, count++) if (WT_UPDATE_DATA_VALUE(upd) && - __wt_txn_upd_visible_all(session, upd) && - upd->txnid != WT_TXN_FIRST) { + __wt_txn_upd_visible_all(session, upd)) { if (first == NULL) first = upd; } else if (upd->txnid != WT_TXN_ABORTED) diff --git a/src/include/btmem.h b/src/include/btmem.h index 094f6f2c680..f0d810281c2 100644 --- a/src/include/btmem.h +++ b/src/include/btmem.h @@ -386,8 +386,9 @@ struct __wt_page_modify { /* Cached overflow value cell/update address pairs. */ struct { - WT_CELL *cell; - WT_UPDATE *upd; + WT_CELL *cell; + uint8_t *data; + size_t size; } *remove; size_t remove_allocated; uint32_t remove_next; @@ -895,10 +896,11 @@ struct __wt_update { uint32_t size; /* data length */ -#define WT_UPDATE_DELETED 0 /* deleted */ -#define WT_UPDATE_MODIFIED 1 /* partial-update modify value */ -#define WT_UPDATE_RESERVED 2 /* reserved */ -#define WT_UPDATE_STANDARD 3 /* complete value */ +#define WT_UPDATE_INVALID 0 /* diagnostic check */ +#define WT_UPDATE_DELETED 1 /* deleted */ +#define WT_UPDATE_MODIFIED 2 /* partial-update modify value */ +#define WT_UPDATE_RESERVED 3 /* reserved */ +#define WT_UPDATE_STANDARD 4 /* complete value */ uint8_t type; /* type (one byte to conserve memory) */ /* If the update includes a complete value. */ diff --git a/src/include/extern.h b/src/include/extern.h index 58de2f98030..9a86dbc1a26 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -150,7 +150,8 @@ extern const char *__wt_cell_type_string(uint8_t type); extern const char *__wt_page_addr_string(WT_SESSION_IMPL *session, WT_REF *ref, WT_ITEM *buf); extern const char *__wt_addr_string(WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size, WT_ITEM *buf); extern int __wt_ovfl_read(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK *unpack, WT_ITEM *store, bool *decoded) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_ovfl_remove(WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE *upd_list, WT_CELL_UNPACK *unpack) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern void __wt_ovfl_discard_remove(WT_SESSION_IMPL *session, WT_PAGE *page); +extern int __wt_ovfl_remove(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK *unpack, bool checkpoint) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_ovfl_discard(WT_SESSION_IMPL *session, WT_CELL *cell) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_page_alloc(WT_SESSION_IMPL *session, uint8_t type, uint32_t alloc_entries, bool alloc_refs, WT_PAGE **pagep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_page_inmem(WT_SESSION_IMPL *session, WT_REF *ref, const void *image, size_t memsize, uint32_t flags, WT_PAGE **pagep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c index a3cb6a53a09..1eeb99e0a05 100644 --- a/src/reconcile/rec_write.c +++ b/src/reconcile/rec_write.c @@ -506,6 +506,13 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, WT_TRET(session->block_manager_cleanup(session)); WT_TRET(__rec_destroy_session(session)); + + /* + * We track removed overflow objects in case there's a reader + * in transit when they're removed. Any form of eviction locks + * out readers, we can discard them all. + */ + __wt_ovfl_discard_remove(session, page); } WT_RET(ret); @@ -1194,6 +1201,64 @@ __rec_update_move(WT_SESSION_IMPL *session, WT_BOUNDARY *bnd, WT_SAVE_UPD *supd) } /* + * __rec_append_orig_value -- + * Append the key's original value to its update list. + */ +static int +__rec_append_orig_value(WT_SESSION_IMPL *session, + WT_PAGE *page, WT_UPDATE *upd_list, WT_CELL_UNPACK *unpack) +{ + WT_DECL_ITEM(tmp); + WT_DECL_RET; + WT_UPDATE *append, *upd; + size_t size; + + /* If at least one standard update is globally visible, we're done. */ + for (upd = upd_list; upd != NULL; upd = upd->next) + if (WT_UPDATE_DATA_VALUE(upd) && + __wt_txn_upd_visible_all(session, upd)) + return (0); + + /* + * We need the original on-page value for some reader: get a copy and + * append it to the end of the update list with a transaction ID that + * guarantees its visibility. + * + * If we don't have a value cell, it's an insert/append list key/value + * pair which simply doesn't exist for some reader; place a deleted + * record at the end of the update list. + */ + append = NULL; /* -Wconditional-uninitialized */ + size = 0; /* -Wconditional-uninitialized */ + if (unpack == NULL || unpack->type == WT_CELL_DEL) + WT_RET(__wt_update_alloc(session, + NULL, &append, &size, WT_UPDATE_DELETED)); + else { + WT_RET(__wt_scr_alloc(session, 0, &tmp)); + WT_ERR(__wt_page_cell_data_ref(session, page, unpack, tmp)); + WT_ERR(__wt_update_alloc( + session, tmp, &append, &size, WT_UPDATE_STANDARD)); + } + + /* + * Give the entry no transaction ID to ensure global visibility, append + * it to the update list. + * + * Note the change to the actual reader-accessible update list: from now + * on, the original on-page value appears at the end of the update list, + * even if this reconciliation subsequently fails. + */ + append->txnid = WT_TXN_NONE; + for (upd = upd_list; upd->next != NULL; upd = upd->next) + ; + WT_PUBLISH(upd->next, append); + __wt_cache_page_inmem_incr(session, page, size); + +err: __wt_scr_free(session, &tmp); + return (ret); +} + +/* * __rec_txn_read -- * Return the update in a list that should be written (or NULL if none can * be written). @@ -1203,18 +1268,14 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, void *ripcip, WT_CELL_UNPACK *vpack, WT_UPDATE **updp) { WT_BTREE *btree; - WT_DECL_RET; - WT_DECL_ITEM(tmp); - WT_DECL_TIMESTAMP(min_timestamp) WT_DECL_TIMESTAMP(max_timestamp) WT_PAGE *page; - WT_UPDATE *append, *upd, *upd_list; - size_t size, update_mem; - uint64_t max_txn, min_txn, txnid; - bool append_origv, skipped; + WT_UPDATE *upd, *upd_list; + size_t update_mem; + uint64_t max_txn, txnid; + bool skipped; *updp = NULL; - append = NULL; /* -Wconditional-uninitialized */ btree = S2BT(session); page = r->page; @@ -1235,9 +1296,7 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, max_txn = WT_TXN_NONE; #ifdef HAVE_TIMESTAMPS __wt_timestamp_set_zero(&max_timestamp); - __wt_timestamp_set_inf(&min_timestamp); #endif - min_txn = UINT64_MAX; if (F_ISSET(r, WT_EVICTING)) { /* Discard obsolete updates. */ @@ -1258,8 +1317,6 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, */ if (WT_TXNID_LT(max_txn, txnid)) max_txn = txnid; - if (WT_TXNID_LT(txnid, min_txn)) - min_txn = txnid; /* * Find the first update we can use. @@ -1291,11 +1348,6 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, &max_timestamp, &upd->timestamp) < 0) __wt_timestamp_set( &max_timestamp, &upd->timestamp); - - if (__wt_timestamp_cmp( - &min_timestamp, &upd->timestamp) > 0) - __wt_timestamp_set( - &min_timestamp, &upd->timestamp); #endif } } else @@ -1325,7 +1377,7 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, } } - /* Reconciliation should never see a reserved update. */ + /* Reconciliation should never see an aborted or reserved update. */ WT_ASSERT(session, *updp == NULL || ((*updp)->txnid != WT_TXN_ABORTED && (*updp)->type != WT_UPDATE_RESERVED)); @@ -1370,18 +1422,17 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, if (!skipped && (F_ISSET(btree, WT_BTREE_LOOKASIDE) || __wt_txn_visible_all(session, max_txn, WT_TIMESTAMP_NULL(&max_timestamp)))) { -#ifdef HAVE_DIAGNOSTIC /* * The checkpoint transaction is special. Make sure we never * write (metadata) updates from a checkpoint in a concurrent * session. */ - txnid = *updp == NULL ? WT_TXN_NONE : (*updp)->txnid; - WT_ASSERT(session, txnid == WT_TXN_NONE || - txnid != S2C(session)->txn_global.checkpoint_state.id || + WT_ASSERT(session, *updp == NULL || + (*updp)->txnid != + S2C(session)->txn_global.checkpoint_state.id || WT_SESSION_IS_CHECKPOINT(session)); -#endif - return (0); + + goto check_original_value; } /* @@ -1400,7 +1451,7 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, */ if (!F_ISSET(r, WT_EVICTING)) { r->leave_dirty = true; - return (0); + goto check_original_value; } /* @@ -1441,7 +1492,6 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, if (skipped) r->update_mem_uncommitted += update_mem; - append_origv = false; if (F_ISSET(r, WT_EVICT_UPDATE_RESTORE)) { /* * The save/restore eviction path. @@ -1456,58 +1506,6 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, /* The page can't be marked clean. */ r->leave_dirty = true; - } else { - /* - * The lookaside table eviction path. - * - * If at least one update is globally visible, copy the update - * list and ignore the current on-page value. If no update is - * globally visible, readers require the page's original value. - */ - if (!__wt_txn_visible_all( - session, min_txn, WT_TIMESTAMP_NULL(&min_timestamp))) - append_origv = true; - } - - /* - * We need the original on-page value for some reason: get a copy and - * append it to the end of the update list with a transaction ID that - * guarantees its visibility. - */ - if (append_origv) { - /* - * If we don't have a value cell, it's an insert/append list - * key/value pair which simply doesn't exist for some reader; - * place a deleted record at the end of the update list. - */ - size = 0; /* -Wconditional-uninitialized */ - if (vpack == NULL || vpack->type == WT_CELL_DEL) - WT_RET(__wt_update_alloc(session, - NULL, &append, &size, WT_UPDATE_DELETED)); - else { - WT_RET(__wt_scr_alloc(session, 0, &tmp)); - if ((ret = __wt_page_cell_data_ref( - session, page, vpack, tmp)) == 0) - ret = __wt_update_alloc(session, - tmp, &append, &size, WT_UPDATE_STANDARD); - __wt_scr_free(session, &tmp); - WT_RET(ret); - } - - /* - * Give the entry no transaction ID to ensure global visibility, - * append it to the update list. - * - * Note the change to the actual reader-accessible update list: - * from now on, the original on-page value appears at the end - * of the update list, even if this reconciliation subsequently - * fails. - */ - append->txnid = WT_TXN_NONE; - for (upd = upd_list; upd->next != NULL; upd = upd->next) - ; - WT_PUBLISH(upd->next, append); - __wt_cache_page_inmem_incr(session, page, size); } /* @@ -1521,7 +1519,23 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, * that transaction ID is globally visible, we know we no longer need * the lookaside table records, allowing them to be discarded. */ - return (__rec_update_save(session, r, ins, ripcip, *updp)); + WT_RET(__rec_update_save(session, r, ins, ripcip, *updp)); + +check_original_value: + /* + * Returning an update means the original on-page value might be lost, + * and that's a problem if there's a reader that needs it. There are + * two cases: any lookaside table eviction (because the backing disk + * image is rewritten), or any reconciliation of a backing overflow + * record that will be physically removed once it's no longer needed. + */ + if (*updp != NULL && + (F_ISSET(r, WT_EVICT_LOOKASIDE) || + (vpack != NULL && + vpack->ovfl && vpack->raw != WT_CELL_VALUE_OVFL_RM))) + WT_RET(__rec_append_orig_value(session, page, *updp, vpack)); + + return (0); } /* @@ -4708,7 +4722,7 @@ __rec_col_var(WT_SESSION_IMPL *session, * file, otherwise we'll leak blocks on the checkpoint. * That's safe because if the backing overflow value is * still needed by any running transaction, we'll cache - * a copy in the reconciliation tracking structures. + * a copy in the update list. * * Regardless, we avoid copying in overflow records: if * there's a WT_INSERT entry that modifies a reference @@ -4793,8 +4807,8 @@ record_loop: /* * The on-page value will never be accessed, * write a placeholder record. */ - data = "@"; - size = 1; + data = "ovfl-unused"; + size = WT_STORE_SIZE(strlen("ovfl-unused")); } else { update_no_copy = false; /* Maybe data copy */ @@ -4928,7 +4942,8 @@ compare: /* */ if (ovfl_state == OVFL_UNUSED && vpack->raw != WT_CELL_VALUE_OVFL_RM) - WT_ERR(__wt_ovfl_remove(session, page, upd, vpack)); + WT_ERR(__wt_ovfl_remove( + session, page, vpack, !F_ISSET(r, WT_EVICTING))); } /* Walk any append list. */ @@ -5535,8 +5550,9 @@ __rec_row_leaf(WT_SESSION_IMPL *session, * The on-page value will never be accessed, * write a placeholder record. */ - WT_ERR(__rec_cell_build_val( - session, r, "@", 1, (uint64_t)0)); + WT_ERR(__rec_cell_build_val(session, r, + "ovfl-unused", strlen("ovfl-unused"), + (uint64_t)0)); } else { val->buf.data = val_cell; val->buf.size = __wt_cell_total_len(vpack); @@ -5554,8 +5570,8 @@ __rec_row_leaf(WT_SESSION_IMPL *session, */ if (vpack != NULL && vpack->ovfl && vpack->raw != WT_CELL_VALUE_OVFL_RM) - WT_ERR(__wt_ovfl_remove( - session, page, upd, vpack)); + WT_ERR(__wt_ovfl_remove(session, + page, vpack, !F_ISSET(r, WT_EVICTING))); switch (upd->type) { case WT_UPDATE_DELETED: |