diff options
Diffstat (limited to 'src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c')
-rw-r--r-- | src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c | 189 |
1 files changed, 110 insertions, 79 deletions
diff --git a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c index 04df57cf66f..6aee858e94b 100644 --- a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c +++ b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c @@ -155,17 +155,19 @@ __rollback_abort_update(WT_SESSION_IMPL *session, WT_ITEM *key, WT_UPDATE *first /* * __rollback_abort_insert_list -- - * Apply the update abort check to each entry in an insert skip list. + * Apply the update abort check to each entry in an insert skip list. Return how many entries + * had stable updates. */ static int __rollback_abort_insert_list(WT_SESSION_IMPL *session, WT_PAGE *page, WT_INSERT_HEAD *head, - wt_timestamp_t rollback_timestamp, bool *stable_update_found) + wt_timestamp_t rollback_timestamp, uint32_t *stable_updates_count) { WT_DECL_ITEM(key); WT_DECL_RET; WT_INSERT *ins; uint64_t recno; uint8_t *memp; + bool stable_update_found; WT_ERR( __wt_scr_alloc(session, page->type == WT_PAGE_ROW_LEAF ? 0 : WT_INTPACK64_MAXSIZE, &key)); @@ -182,7 +184,9 @@ __rollback_abort_insert_list(WT_SESSION_IMPL *session, WT_PAGE *page, WT_INSERT_ key->size = WT_PTRDIFF(memp, key->data); } WT_ERR(__rollback_abort_update( - session, key, ins->upd, rollback_timestamp, stable_update_found)); + session, key, ins->upd, rollback_timestamp, &stable_update_found)); + if (stable_update_found && stable_updates_count != NULL) + (*stable_updates_count)++; } err: @@ -191,6 +195,19 @@ err: } /* + * __rollback_has_stable_update -- + * Check if an update chain has a stable update on it. Assume the update chain has already been + * processed so all we need to do is look for a valid, non-aborted entry. + */ +static bool +__rollback_has_stable_update(WT_UPDATE *upd) +{ + while (upd != NULL && (upd->type == WT_UPDATE_INVALID || upd->txnid == WT_TXN_ABORTED)) + upd = upd->next; + return upd != NULL; +} + +/* * __rollback_col_modify -- * Add the provided update to the head of the update list. */ @@ -225,57 +242,27 @@ err: * Add the provided update to the head of the update list. */ static inline int -__rollback_row_modify(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip, WT_UPDATE *upd) +__rollback_row_modify(WT_SESSION_IMPL *session, WT_REF *ref, WT_UPDATE *upd, WT_ITEM *key) { + WT_CURSOR_BTREE cbt; WT_DECL_RET; - WT_PAGE_MODIFY *mod; - WT_UPDATE *last_upd, *old_upd, **upd_entry; - size_t upd_size; - - last_upd = NULL; - /* If we don't yet have a modify structure, we'll need one. */ - WT_RET(__wt_page_modify_init(session, page)); - mod = page->modify; - - /* Allocate an update array as necessary. */ - WT_PAGE_ALLOC_AND_SWAP(session, page, mod->mod_row_update, upd_entry, page->entries); - - /* Set the WT_UPDATE array reference. */ - upd_entry = &mod->mod_row_update[WT_ROW_SLOT(page, rip)]; - upd_size = __wt_update_list_memsize(upd); - /* If there are existing updates, append them after the new updates. */ - for (last_upd = upd; last_upd->next != NULL; last_upd = last_upd->next) - ; - last_upd->next = *upd_entry; - - /* - * We can either put a tombstone plus an update or a single update on the update chain. - * - * Set the "old" entry to the second update in the list so that the serialization function - * succeeds in swapping the first update into place. - */ - if (upd->next != NULL) - *upd_entry = upd->next; - old_upd = *upd_entry; + __wt_btcur_init(session, &cbt); + __wt_btcur_open(&cbt); - /* - * Point the new WT_UPDATE item to the next element in the list. The serialization function acts - * as our memory barrier to flush this write. - */ - upd->next = old_upd; + /* Search the page. */ + WT_ERR(__wt_row_search(&cbt, key, true, ref, true, NULL)); - /* - * Serialize the update. Rollback to stable doesn't need to check the visibility of the on page - * value to detect conflict. - */ - WT_ERR(__wt_update_serial(session, NULL, page, upd_entry, &upd, upd_size, true)); + /* Apply the modification. */ +#ifdef HAVE_DIAGNOSTIC + WT_ERR(__wt_row_modify(&cbt, key, NULL, upd, WT_UPDATE_INVALID, true, false)); +#else + WT_ERR(__wt_row_modify(&cbt, key, NULL, upd, WT_UPDATE_INVALID, true)); +#endif - if (0) { err: - if (last_upd != NULL) - last_upd->next = NULL; - } + /* Free any resources that may have been cached in the cursor. */ + WT_TRET(__wt_btcur_close(&cbt, true)); return (ret); } @@ -606,7 +593,7 @@ __rollback_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_REF *ref, WT_ROW *rip, } if (rip != NULL) - WT_ERR(__rollback_row_modify(session, page, rip, upd)); + WT_ERR(__rollback_row_modify(session, ref, upd, key)); else WT_ERR(__rollback_col_modify(session, ref, upd, recno)); @@ -644,6 +631,7 @@ __rollback_abort_ondisk_kv(WT_SESSION_IMPL *session, WT_REF *ref, WT_ROW *rip, u WT_ITEM *row_key, WT_CELL_UNPACK_KV *vpack, wt_timestamp_t rollback_timestamp, bool *is_ondisk_stable) { + WT_DECL_ITEM(key); WT_DECL_ITEM(tmp); WT_DECL_RET; WT_PAGE *page; @@ -766,14 +754,24 @@ __rollback_abort_ondisk_kv(WT_SESSION_IMPL *session, WT_REF *ref, WT_ROW *rip, u return (0); } - if (rip != NULL) - WT_ERR(__rollback_row_modify(session, page, rip, upd)); - else + if (rip != NULL) { + if (row_key != NULL) + key = row_key; + else { + /* Unpack a row key. */ + WT_ERR(__wt_scr_alloc(session, 0, &key)); + WT_ERR(__wt_row_leaf_key(session, page, rip, key, false)); + } + WT_ERR(__rollback_row_modify(session, ref, upd, key)); + } else WT_ERR(__rollback_col_modify(session, ref, upd, recno)); - return (0); + if (0) { err: - __wt_free(session, upd); + __wt_free(session, upd); + } + if (rip != NULL && row_key == NULL) + __wt_scr_free(session, &key); return (ret); } @@ -788,11 +786,12 @@ __rollback_abort_col_var(WT_SESSION_IMPL *session, WT_REF *ref, wt_timestamp_t r WT_CELL *kcell; WT_CELL_UNPACK_KV unpack; WT_COL *cip; - WT_INSERT_HEAD *ins; + WT_INSERT *ins; + WT_INSERT_HEAD *inshead; WT_PAGE *page; - uint64_t recno, rle; - uint32_t i, j; - bool is_ondisk_stable, stable_update_found; + uint64_t ins_recno, recno, rle; + uint32_t i, j, stable_updates_count; + bool is_ondisk_stable; page = ref->page; /* @@ -805,11 +804,11 @@ __rollback_abort_col_var(WT_SESSION_IMPL *session, WT_REF *ref, wt_timestamp_t r /* Review the changes to the original on-page data items. */ WT_COL_FOREACH (page, cip, i) { - stable_update_found = false; + stable_updates_count = 0; - if ((ins = WT_COL_UPDATE(page, cip)) != NULL) + if ((inshead = WT_COL_UPDATE(page, cip)) != NULL) WT_RET(__rollback_abort_insert_list( - session, page, ins, rollback_timestamp, &stable_update_found)); + session, page, inshead, rollback_timestamp, &stable_updates_count)); if (page->dsk != NULL) { /* Unpack the cell. We need its RLE count whether or not we're going to iterate it. */ @@ -818,44 +817,76 @@ __rollback_abort_col_var(WT_SESSION_IMPL *session, WT_REF *ref, wt_timestamp_t r rle = __wt_cell_rle(&unpack); /* - * If we found a stable update on the insert list, this key needs no further attention. - * Any other keys in this cell with stable updates also do not require attention. But - * beyond that, the on-disk value must be older than the update we found. That means it - * too is stable(*), so any keys in the cell that _don't_ have stable updates on the - * update list don't need further attention either. (And any unstable updates were just - * handled above.) Thus we can skip iterating over the cell. + * Each key whose on-disk value is not stable and has no stable update on the update + * list must be processed downstream. + * + * If we can determine that the cell's on-disk value is stable, we can skip iterating + * over the cell; likewise, if we can determine that every key in the cell has a stable + * update on the update list, we can skip the iteration. Otherwise we have to try each + * key. + * + * If the on-disk cell is deleted, it is stable, because cells only appear as deleted + * when there is no older value that might need to be restored. * - * Furthermore, if the cell is deleted it must be - * itself stable, because cells only appear as deleted if there is no older value that - * might need to be restored. We can skip iterating over the cell. + * Note that in a purely timestamped world, the presence of any stable update for any + * key in the cell means the on-disk value must be stable, because the update must be + * newer than the on-disk value. However, this is no longer true if the stable update + * has no timestamp. It may also not be true if the on-disk value is prepared, or other + * corner cases. Therefore, we must iterate the cell unless _every_ key has a stable + * update. * - * (*) Either that, or the update is not timestamped, in which case the on-disk value - * might not be stable but the non-timestamp update will hide it until the next - * reconciliation and then overwrite it. + * We can, however, stop iterating as soon as the downstream code reports back that the + * on-disk value is actually stable. */ - if (stable_update_found) - WT_STAT_CONN_DATA_INCR(session, txn_rts_stable_rle_skipped); - else if (unpack.type == WT_CELL_DEL) + if (unpack.type == WT_CELL_DEL) WT_STAT_CONN_DATA_INCR(session, txn_rts_delete_rle_skipped); + else if (stable_updates_count == rle) + WT_STAT_CONN_DATA_INCR(session, txn_rts_stable_rle_skipped); else { - for (j = 0; j < rle; j++) { + j = 0; + if (inshead != NULL) { + WT_SKIP_FOREACH (ins, inshead) { + /* If the update list goes past the end of the cell, something's wrong. */ + WT_ASSERT(session, j < rle); + ins_recno = WT_INSERT_RECNO(ins); + /* Process all the keys before this update. */ + while (recno + j < ins_recno) { + WT_RET(__rollback_abort_ondisk_kv(session, ref, NULL, recno + j, NULL, + &unpack, rollback_timestamp, &is_ondisk_stable)); + /* We can stop right away if the on-disk version is stable. */ + if (is_ondisk_stable) { + if (rle > 1) + WT_STAT_CONN_DATA_INCR(session, txn_rts_stable_rle_skipped); + goto stop; + } + j++; + } + /* If this key has a stable update, skip over it. */ + if (recno + j == ins_recno && __rollback_has_stable_update(ins->upd)) + j++; + } + } + /* Process the rest of the keys. */ + while (j < rle) { WT_RET(__rollback_abort_ondisk_kv(session, ref, NULL, recno + j, NULL, &unpack, rollback_timestamp, &is_ondisk_stable)); /* We can stop right away if the on-disk version is stable. */ if (is_ondisk_stable) { if (rle > 1) WT_STAT_CONN_DATA_INCR(session, txn_rts_stable_rle_skipped); - break; + goto stop; } + j++; } } +stop: recno += rle; } } /* Review the append list */ - if ((ins = WT_COL_APPEND(page)) != NULL) - WT_RET(__rollback_abort_insert_list(session, page, ins, rollback_timestamp, NULL)); + if ((inshead = WT_COL_APPEND(page)) != NULL) + WT_RET(__rollback_abort_insert_list(session, page, inshead, rollback_timestamp, NULL)); /* Mark the page as dirty to reconcile the page. */ if (page->modify) |