diff options
Diffstat (limited to 'src/third_party/wiredtiger/src/reconcile/rec_visibility.c')
-rw-r--r-- | src/third_party/wiredtiger/src/reconcile/rec_visibility.c | 104 |
1 files changed, 54 insertions, 50 deletions
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_visibility.c b/src/third_party/wiredtiger/src/reconcile/rec_visibility.c index b4dd84d58ee..e25b02a3104 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_visibility.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_visibility.c @@ -63,67 +63,65 @@ __rec_append_orig_value( WT_ASSERT(session, upd != NULL && unpack != NULL && unpack->type != WT_CELL_DEL); - for (;; upd = upd->next) { - /* Done if at least one self-contained update is globally visible. */ - if (WT_UPDATE_DATA_VALUE(upd) && __wt_txn_upd_visible_all(session, upd)) - return (0); + append = tombstone = NULL; + total_size = 0; + /* Review the current update list, checking conditions that mean no work is needed. */ + for (;; upd = upd->next) { /* - * If the update is restored from the history store for the rollback to stable operation we - * don't need the on-disk value anymore and we're done. + * Done if the update was restored from the history store for the rollback to stable + * operation. */ if (F_ISSET(upd, WT_UPDATE_RESTORED_FOR_ROLLBACK)) return (0); - /* On page value already on chain */ - if (unpack != NULL && unpack->start_ts == upd->start_ts && unpack->start_txn == upd->txnid) + /* Done if the on page value already appears on the update list. */ + if (unpack->start_ts == upd->start_ts && unpack->start_txn == upd->txnid) + return (0); + + /* + * Done if at least one self-contained update is globally visible. It's tempting to pull + * this test out of the loop and only test the oldest self-contained update for global + * visibility (as visibility tests are expensive). However, when running at lower isolation + * levels, or when an application intentionally commits in out of timestamp order, it's + * possible for an update on the chain to be globally visible and followed by an (earlier) + * update that is not yet globally visible. + */ + if (WT_UPDATE_DATA_VALUE(upd) && __wt_txn_upd_visible_all(session, upd)) return (0); - /* Leave reference at the last item in the chain. */ + /* Leave reference pointing to the last item in the update list. */ if (upd->next == NULL) break; } - /* - * We need the original on-page value for some reader: get a copy and append it to the end of - * the update list with a transaction ID that guarantees its visibility. - * - * If we don't have a value cell, it's an insert/append list key/value pair which simply doesn't - * exist for some reader; place a deleted record at the end of the update list. - * - * If the an update is out of order so it masks the value in the cell, don't append. - */ - append = tombstone = NULL; /* -Wconditional-uninitialized */ - total_size = size = 0; /* -Wconditional-uninitialized */ + /* Done if the stop time pair of the onpage cell is globally visible. */ + if ((unpack->stop_ts != WT_TS_MAX || unpack->stop_txn != WT_TXN_MAX) && + __wt_txn_visible_all(session, unpack->stop_txn, unpack->stop_ts)) + return (0); + + /* We need the original on-page value for some reader: get a copy. */ + WT_ERR(__wt_scr_alloc(session, 0, &tmp)); + WT_ERR(__wt_page_cell_data_ref(session, page, unpack, tmp)); + WT_ERR(__wt_update_alloc(session, tmp, &append, &size, WT_UPDATE_STANDARD)); + total_size += size; + append->txnid = unpack->start_txn; + append->start_ts = unpack->start_ts; + append->durable_ts = unpack->durable_start_ts; /* - * We need to append a TOMBSTONE before the onpage value if the onpage value has a valid - * stop pair. - * - * Imagine a case we insert and delete a value respectively at timestamp 0 and 10, and later - * insert it again at 20. We need the TOMBSTONE to tell us there is no value between 10 and - * 20. + * Additionally, we need to append a tombstone before the onpage value we're about to append to + * the list, if the onpage value has a valid stop pair. Imagine a case where we insert and + * delete a value respectively at timestamp 0 and 10, and later insert it again at 20. We need + * the tombstone to tell us there is no value between 10 and 20. */ if (unpack->stop_ts != WT_TS_MAX || unpack->stop_txn != WT_TXN_MAX) { - /* No need to append anything if the stop time pair is globally visible. */ - if (__wt_txn_visible_all(session, unpack->stop_txn, unpack->stop_ts)) - return (0); WT_ERR(__wt_update_alloc(session, NULL, &tombstone, &size, WT_UPDATE_TOMBSTONE)); + total_size += size; tombstone->txnid = unpack->stop_txn; tombstone->start_ts = unpack->stop_ts; tombstone->durable_ts = unpack->durable_stop_ts; - total_size += size; - } - - WT_ERR(__wt_scr_alloc(session, 0, &tmp)); - WT_ERR(__wt_page_cell_data_ref(session, page, unpack, tmp)); - WT_ERR(__wt_update_alloc(session, tmp, &append, &size, WT_UPDATE_STANDARD)); - append->txnid = unpack->start_txn; - append->start_ts = unpack->start_ts; - append->durable_ts = unpack->durable_start_ts; - total_size += size; - if (tombstone != NULL) { tombstone->next = append; append = tombstone; } @@ -133,13 +131,12 @@ __rec_append_orig_value( __wt_cache_page_inmem_incr(session, page, total_size); + if (0) { err: - __wt_scr_free(session, &tmp); - /* Free append when tombstone allocation fails */ - if (ret != 0) { __wt_free(session, append); __wt_free(session, tombstone); } + __wt_scr_free(session, &tmp); return (ret); } @@ -156,8 +153,8 @@ __rec_need_save_upd( /* * Save updates for any reconciliation that doesn't involve history store (in-memory database - * and fixed length column store), except when the maximum timestamp and txnid are globally - * visible. + * and fixed length column store), except when the selected stop time pair or the selected start + * time pair is globally visible. */ if (!F_ISSET(r, WT_REC_HS) && !F_ISSET(r, WT_REC_IN_MEMORY) && r->page->type != WT_PAGE_COL_FIX) return (false); @@ -296,7 +293,8 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v * checkpoint in a concurrent session. */ WT_ASSERT(session, !WT_IS_METADATA(session->dhandle) || upd == NULL || - upd->txnid == WT_TXN_NONE || upd->txnid != S2C(session)->txn_global.checkpoint_state.id || + upd->txnid == WT_TXN_NONE || + upd->txnid != S2C(session)->txn_global.checkpoint_txn_shared.id || WT_SESSION_IS_CHECKPOINT(session)); /* If all of the updates were aborted, quit. */ @@ -464,11 +462,17 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v /* * Returning an update means the original on-page value might be lost, and that's a problem if - * there's a reader that needs it. This call makes a copy of the on-page value. We do that any - * time there are saved updates and during reconciliation of a backing overflow record that will - * be physically removed once it's no longer needed. + * there's a reader that needs it, make a copy of the on-page value. We do that any time there + * are saved updates (we may need the original on-page value to terminate the update chain, for + * example, in the case of an update that modifies the original value). Additionally, make a + * copy of the on-page value if the value is an overflow item and anything other than the + * on-page cell is being written. This is because the value's backing overflow blocks aren't + * part of the page, and they are physically removed by checkpoint writing this page, that is, + * the checkpoint doesn't include the overflow blocks so they're removed and future readers of + * this page won't be able to find them. */ - if (vpack != NULL && vpack->type != WT_CELL_DEL && upd_select->upd != NULL && upd_saved) + if (upd_select->upd != NULL && vpack != NULL && vpack->type != WT_CELL_DEL && + (upd_saved || F_ISSET(vpack, WT_CELL_UNPACK_OVERFLOW))) WT_ERR(__rec_append_orig_value(session, page, upd_select->upd, vpack)); err: |