diff options
author | Luke Chen <luke.chen@mongodb.com> | 2022-11-21 14:13:56 +1100 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2022-11-21 03:46:22 +0000 |
commit | 4910ab4f20b59258cbfbd055db5175e1a6a1e5e8 (patch) | |
tree | fd76fb39e359cc98cebe58938c79e237a56ff632 | |
parent | 4cc5fbff67cee1d004e9ac70c772228010e8637b (diff) | |
download | mongo-4910ab4f20b59258cbfbd055db5175e1a6a1e5e8.tar.gz |
Import wiredtiger: 185cb659029fc4e5213783c5d290aa0fab5ab043 from branch mongodb-master
ref: 47951efac1..185cb65902
for: 6.3.0-rc0
WT-10164 Split apart the rollback to stable implementation into multiple files
19 files changed, 2179 insertions, 2068 deletions
diff --git a/src/third_party/wiredtiger/dist/filelist b/src/third_party/wiredtiger/dist/filelist index de5ac7fc1f1..b5de826bcf1 100644 --- a/src/third_party/wiredtiger/dist/filelist +++ b/src/third_party/wiredtiger/dist/filelist @@ -194,6 +194,12 @@ src/schema/schema_worker.c src/session/session_api.c src/session/session_compact.c src/session/session_dhandle.c +src/rollback_to_stable/rts_api.c +src/rollback_to_stable/rts_btree.c +src/rollback_to_stable/rts_btree_walk.c +src/rollback_to_stable/rts.c +src/rollback_to_stable/rts_history.c +src/rollback_to_stable/rts_visibility.c src/support/cond_auto.c src/support/crypto.c src/support/err.c @@ -221,5 +227,4 @@ src/txn/txn.c src/txn/txn_ckpt.c src/txn/txn_log.c src/txn/txn_recover.c -src/txn/txn_rollback_to_stable.c src/txn/txn_timestamp.c diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index c7c0ee94b22..a6ed00890a0 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -2,5 +2,5 @@ "vendor": "wiredtiger", "github": "wiredtiger/wiredtiger.git", "branch": "mongodb-master", - "commit": "47951efac16fbf8e5d44230ad18fce1b18858cda" + "commit": "185cb659029fc4e5213783c5d290aa0fab5ab043" } diff --git a/src/third_party/wiredtiger/src/conn/conn_api.c b/src/third_party/wiredtiger/src/conn/conn_api.c index 38bd79e6230..e6c8f0893fb 100644 --- a/src/third_party/wiredtiger/src/conn/conn_api.c +++ b/src/third_party/wiredtiger/src/conn/conn_api.c @@ -1361,7 +1361,7 @@ __conn_rollback_to_stable(WT_CONNECTION *wt_conn, const char *config) CONNECTION_API_CALL(conn, session, rollback_to_stable, config, cfg); WT_STAT_CONN_INCR(session, txn_rts); - ret = __wt_rollback_to_stable(session, cfg, false); + ret = conn->rts->rollback_to_stable(session, cfg, false); err: API_END_RET(session, ret); } diff --git a/src/third_party/wiredtiger/src/conn/conn_open.c b/src/third_party/wiredtiger/src/conn/conn_open.c index 322c7ba6b50..1ca0d683278 100644 --- a/src/third_party/wiredtiger/src/conn/conn_open.c +++ b/src/third_party/wiredtiger/src/conn/conn_open.c @@ -51,6 +51,7 @@ __wt_connection_open(WT_CONNECTION_IMPL *conn, const char *cfg[]) /* Initialize transaction support. */ WT_RET(__wt_txn_global_init(session, cfg)); + __wt_rollback_to_stable_init(conn); WT_STAT_CONN_SET(session, dh_conn_handle_size, sizeof(WT_DATA_HANDLE)); return (0); } diff --git a/src/third_party/wiredtiger/src/docs/arch-fast-truncate.dox b/src/third_party/wiredtiger/src/docs/arch-fast-truncate.dox index fba8e415089..a775c35008b 100644 --- a/src/third_party/wiredtiger/src/docs/arch-fast-truncate.dox +++ b/src/third_party/wiredtiger/src/docs/arch-fast-truncate.dox @@ -829,10 +829,10 @@ Call \c __wt_txn_op_delete_apply_prepare_state.} @row{txn.c, Hook, in \c __wt_txn_rollback, Call \c __wt_delete_page_rollback.} -@row{txn_rollback_to_stable.c, Hook, in \c __rollback_page_needs_abort, +@row{rts_visibility.c, Hook, in \c __wt_rollback_page_needs_abort, Check \c page_del when deciding whether the page contains unstable values that need to be examined.} -@row{txn_rollback_to_stable.c, Hook, in \c __rollback_to_stable_page_skip, +@row{rts_btree_walk.c, Hook, in \c __rollback_to_stable_page_skip, Check \c page del when deciding whether to skip over the page.} </table> diff --git a/src/third_party/wiredtiger/src/include/connection.h b/src/third_party/wiredtiger/src/include/connection.h index 629088c2aab..e68db066f72 100644 --- a/src/third_party/wiredtiger/src/include/connection.h +++ b/src/third_party/wiredtiger/src/include/connection.h @@ -497,6 +497,7 @@ struct __wt_connection_impl { uint16_t log_req_min; /* Min required log version */ uint32_t txn_logsync; /* Log sync configuration */ + WT_ROLLBACK_TO_STABLE *rts, _rts; /* Rollback to stable subsystem */ WT_SESSION_IMPL *meta_ckpt_session; /* Metadata checkpoint session */ /* diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index 3d88ffc24ca..6d466ba2ca3 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -33,6 +33,12 @@ extern bool __wt_page_evict_urgent(WT_SESSION_IMPL *session, WT_REF *ref) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern bool __wt_read_cell_time_window(WT_CURSOR_BTREE *cbt, WT_TIME_WINDOW *tw) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern bool __wt_rts_visibility_has_stable_update(WT_UPDATE *upd) + WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern bool __wt_rts_visibility_page_needs_abort(WT_SESSION_IMPL *session, WT_REF *ref, + wt_timestamp_t rollback_timestamp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern bool __wt_rts_visibility_txn_visible_id(WT_SESSION_IMPL *session, uint64_t id) + WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern bool __wt_rwlock_islocked(WT_SESSION_IMPL *session, WT_RWLOCK *l) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern bool __wt_txn_active(WT_SESSION_IMPL *session, uint64_t txnid) @@ -1284,10 +1290,6 @@ extern int __wt_remove_if_exists(WT_SESSION_IMPL *session, const char *name, boo WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_reset_blkmod(WT_SESSION_IMPL *session, const char *orig_config, WT_ITEM *buf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_rollback_to_stable(WT_SESSION_IMPL *session, const char *cfg[], bool no_ckpt) - WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_rollback_to_stable_one(WT_SESSION_IMPL *session, const char *uri, bool *skipp) - WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_row_ikey(WT_SESSION_IMPL *session, uint32_t cell_offset, const void *key, size_t size, WT_REF *ref) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_row_ikey_alloc(WT_SESSION_IMPL *session, uint32_t cell_offset, const void *key, @@ -1309,6 +1311,23 @@ extern int __wt_row_modify(WT_CURSOR_BTREE *cbt, const WT_ITEM *key, const WT_IT ) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_row_search(WT_CURSOR_BTREE *cbt, WT_ITEM *srch_key, bool insert, WT_REF *leaf, bool leaf_safe, bool *leaf_foundp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_rts_btree_abort_updates(WT_SESSION_IMPL *session, WT_REF *ref, + wt_timestamp_t rollback_timestamp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_rts_btree_apply_all(WT_SESSION_IMPL *session, wt_timestamp_t rollback_timestamp) + WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_rts_btree_walk_btree(WT_SESSION_IMPL *session, wt_timestamp_t rollback_timestamp) + WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_rts_btree_walk_btree_apply( + WT_SESSION_IMPL *session, const char *uri, const char *config, wt_timestamp_t rollback_timestamp) + WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_rts_check(WT_SESSION_IMPL *session) + WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_rts_history_btree_hs_truncate(WT_SESSION_IMPL *session, uint32_t btree_id) + WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_rts_history_delete_hs(WT_SESSION_IMPL *session, WT_ITEM *key, wt_timestamp_t ts) + WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_rts_history_final_pass(WT_SESSION_IMPL *session, wt_timestamp_t rollback_timestamp) + WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_rwlock_init(WT_SESSION_IMPL *session, WT_RWLOCK *l) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_salvage(WT_SESSION_IMPL *session, const char *cfg[]) @@ -1861,6 +1880,7 @@ extern void __wt_rec_dictionary_free(WT_SESSION_IMPL *session, WT_RECONCILE *r); extern void __wt_rec_dictionary_reset(WT_RECONCILE *r); extern void __wt_ref_addr_free(WT_SESSION_IMPL *session, WT_REF *ref); extern void __wt_ref_out(WT_SESSION_IMPL *session, WT_REF *ref); +extern void __wt_rollback_to_stable_init(WT_CONNECTION_IMPL *conn); extern void __wt_root_ref_init( WT_SESSION_IMPL *session, WT_REF *root_ref, WT_PAGE *root, bool is_recno); extern void __wt_rwlock_destroy(WT_SESSION_IMPL *session, WT_RWLOCK *l); diff --git a/src/third_party/wiredtiger/src/include/rollback_to_stable.h b/src/third_party/wiredtiger/src/include/rollback_to_stable.h new file mode 100644 index 00000000000..5c1fd555e43 --- /dev/null +++ b/src/third_party/wiredtiger/src/include/rollback_to_stable.h @@ -0,0 +1,28 @@ +/*- + * Copyright (c) 2014-present MongoDB, Inc. + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#define WT_CHECK_RECOVERY_FLAG_TXNID(session, txnid) \ + (F_ISSET(S2C(session), WT_CONN_RECOVERING) && S2C(session)->recovery_ckpt_snap_min != 0 && \ + (txnid) >= S2C(session)->recovery_ckpt_snap_min) + +/* Enable rollback to stable verbose messaging during recovery. */ +#define WT_VERB_RECOVERY_RTS(session) \ + (F_ISSET(S2C(session), WT_CONN_RECOVERING) ? \ + WT_DECL_VERBOSE_MULTI_CATEGORY(((WT_VERBOSE_CATEGORY[]){WT_VERB_RECOVERY, WT_VERB_RTS})) : \ + WT_DECL_VERBOSE_MULTI_CATEGORY(((WT_VERBOSE_CATEGORY[]){WT_VERB_RTS}))) + +/* + * WT_ROLLBACK_TO_STABLE -- + * Rollback to stable singleton, contains the interface to rollback to stable along + * with context used by rollback to stable. + */ +struct __wt_rollback_to_stable { + /* Methods */ + int (*rollback_to_stable_one)(WT_SESSION_IMPL *, const char *, bool *); + int (*rollback_to_stable)(WT_SESSION_IMPL *, const char *[], bool); +}; diff --git a/src/third_party/wiredtiger/src/include/wt_internal.h b/src/third_party/wiredtiger/src/include/wt_internal.h index 53072c2df5e..c5e53d12d36 100644 --- a/src/third_party/wiredtiger/src/include/wt_internal.h +++ b/src/third_party/wiredtiger/src/include/wt_internal.h @@ -313,6 +313,8 @@ struct __wt_ref; typedef struct __wt_ref WT_REF; struct __wt_ref_hist; typedef struct __wt_ref_hist WT_REF_HIST; +struct __wt_rollback_to_stable; +typedef struct __wt_rollback_to_stable WT_ROLLBACK_TO_STABLE; struct __wt_row; typedef struct __wt_row WT_ROW; struct __wt_rwlock; @@ -445,6 +447,7 @@ typedef uint64_t wt_timestamp_t; #include "optrack.h" #include "os.h" #include "reconcile.h" +#include "rollback_to_stable.h" #include "schema.h" #include "thread_group.h" #include "tiered.h" diff --git a/src/third_party/wiredtiger/src/rollback_to_stable/rts.c b/src/third_party/wiredtiger/src/rollback_to_stable/rts.c new file mode 100644 index 00000000000..7a0f35f5164 --- /dev/null +++ b/src/third_party/wiredtiger/src/rollback_to_stable/rts.c @@ -0,0 +1,151 @@ +/*- + * Copyright (c) 2014-present MongoDB, Inc. + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_rts_check -- + * Check to the extent possible that the rollback request is reasonable. + */ +int +__wt_rts_check(WT_SESSION_IMPL *session) +{ + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_SESSION_IMPL *session_in_list; + uint32_t i, session_cnt; + bool cursor_active, txn_active; + + conn = S2C(session); + cursor_active = txn_active = false; + + WT_STAT_CONN_INCR(session, txn_walk_sessions); + + /* + * Help the user comply with the requirement there be no concurrent user operations. It is okay + * to have a transaction in the prepared state. + * + * WT_TXN structures are allocated and freed as sessions are activated and closed. Lock the + * session open/close to ensure we don't race. This call is a rarely used RTS-only function, + * acquiring the lock shouldn't be an issue. + */ + __wt_spin_lock(session, &conn->api_lock); + + WT_ORDERED_READ(session_cnt, conn->session_cnt); + for (i = 0, session_in_list = conn->sessions; i < session_cnt; i++, session_in_list++) { + + /* Skip inactive or internal sessions. */ + if (!session_in_list->active || F_ISSET(session_in_list, WT_SESSION_INTERNAL)) + continue; + + /* Check if a user session has a running transaction. */ + if (F_ISSET(session_in_list->txn, WT_TXN_RUNNING)) { + txn_active = true; + break; + } + + /* Check if a user session has an active file cursor. */ + if (session_in_list->ncursors != 0) { + cursor_active = true; + break; + } + } + __wt_spin_unlock(session, &conn->api_lock); + + /* + * A new cursor may be positioned or a transaction may start after we return from this call and + * callers should be aware of this limitation. + */ + if (cursor_active) + WT_RET_MSG(session, EBUSY, "rollback_to_stable illegal with active file cursors"); + if (txn_active) { + ret = EBUSY; + WT_TRET(__wt_verbose_dump_txn(session)); + WT_RET_MSG(session, ret, "rollback_to_stable illegal with active transactions"); + } + return (0); +} + +/* + * __rts_progress_msg -- + * Log a verbose message about the progress of the current rollback to stable. + */ +static void +__rts_progress_msg(WT_SESSION_IMPL *session, struct timespec rollback_start, + uint64_t rollback_count, uint64_t *rollback_msg_count) +{ + struct timespec cur_time; + uint64_t time_diff; + + __wt_epoch(session, &cur_time); + + /* Time since the rollback started. */ + time_diff = WT_TIMEDIFF_SEC(cur_time, rollback_start); + + if ((time_diff / WT_PROGRESS_MSG_PERIOD) > *rollback_msg_count) { + __wt_verbose(session, WT_VERB_RECOVERY_PROGRESS, + "Rollback to stable has been running for %" PRIu64 " seconds and has inspected %" PRIu64 + " files. For more detailed logging, enable WT_VERB_RTS", + time_diff, rollback_count); + ++(*rollback_msg_count); + } +} + +/* + * __wt_rts_btree_apply_all -- + * Perform rollback to stable to all files listed in the metadata, apart from the metadata and + * history store files. + */ +int +__wt_rts_btree_apply_all(WT_SESSION_IMPL *session, wt_timestamp_t rollback_timestamp) +{ + struct timespec rollback_timer; + WT_CURSOR *cursor; + WT_DECL_RET; + uint64_t rollback_count, rollback_msg_count; + const char *config, *uri; + + /* Initialize the verbose tracking timer. */ + __wt_epoch(session, &rollback_timer); + rollback_count = 0; + rollback_msg_count = 0; + + WT_RET(__wt_metadata_cursor(session, &cursor)); + while ((ret = cursor->next(cursor)) == 0) { + /* Log a progress message. */ + __rts_progress_msg(session, rollback_timer, rollback_count, &rollback_msg_count); + ++rollback_count; + + WT_ERR(cursor->get_key(cursor, &uri)); + WT_ERR(cursor->get_value(cursor, &config)); + + F_SET(session, WT_SESSION_QUIET_CORRUPT_FILE); + ret = __wt_rts_btree_walk_btree_apply(session, uri, config, rollback_timestamp); + F_CLR(session, WT_SESSION_QUIET_CORRUPT_FILE); + + /* + * Ignore rollback to stable failures on files that don't exist or files where corruption is + * detected. + */ + if (ret == ENOENT || (ret == WT_ERROR && F_ISSET(S2C(session), WT_CONN_DATA_CORRUPTION))) { + __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session), + "%s: skipped performing rollback to stable because the file %s", uri, + ret == ENOENT ? "does not exist" : "is corrupted."); + continue; + } + WT_ERR(ret); + } + WT_ERR_NOTFOUND_OK(ret, false); + + if (F_ISSET(S2C(session), WT_CONN_RECOVERING)) + WT_ERR(__wt_rts_history_final_pass(session, rollback_timestamp)); + +err: + WT_TRET(__wt_metadata_cursor_release(session, &cursor)); + return (ret); +} diff --git a/src/third_party/wiredtiger/src/rollback_to_stable/rts_api.c b/src/third_party/wiredtiger/src/rollback_to_stable/rts_api.c new file mode 100644 index 00000000000..8a484070f79 --- /dev/null +++ b/src/third_party/wiredtiger/src/rollback_to_stable/rts_api.c @@ -0,0 +1,168 @@ +/*- + * Copyright (c) 2014-present MongoDB, Inc. + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __rollback_to_stable_int -- + * Rollback all modifications with timestamps more recent than the passed in timestamp. + */ +static int +__rollback_to_stable_int(WT_SESSION_IMPL *session, bool no_ckpt) +{ + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_TXN_GLOBAL *txn_global; + wt_timestamp_t rollback_timestamp; + char ts_string[2][WT_TS_INT_STRING_SIZE]; + + conn = S2C(session); + txn_global = &conn->txn_global; + + /* + * Rollback to stable should ignore tombstones in the history store since it needs to scan the + * entire table sequentially. + */ + F_SET(session, WT_SESSION_ROLLBACK_TO_STABLE); + + WT_ERR(__wt_rts_check(session)); + + /* + * Update the global time window state to have consistent view from global visibility rules for + * the rollback to stable to bring back the database into a consistent state. + * + * As part of the below function call, the oldest transaction id and pinned timestamps are + * updated. + */ + WT_ERR(__wt_txn_update_oldest(session, WT_TXN_OLDEST_STRICT | WT_TXN_OLDEST_WAIT)); + + WT_ASSERT_ALWAYS(session, + (txn_global->has_pinned_timestamp || !txn_global->has_oldest_timestamp), + "Database has no pinned timestamp but an oldest timestamp. Pinned timestamp is required to " + "find out the global visibility/obsolete of an update."); + + /* + * Copy the stable timestamp, otherwise we'd need to lock it each time it's accessed. Even + * though the stable timestamp isn't supposed to be updated while rolling back, accessing it + * without a lock would violate protocol. + */ + WT_ORDERED_READ(rollback_timestamp, txn_global->stable_timestamp); + __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session), + "performing rollback to stable with stable timestamp: %s and oldest timestamp: %s", + __wt_timestamp_to_string(rollback_timestamp, ts_string[0]), + __wt_timestamp_to_string(txn_global->oldest_timestamp, ts_string[1])); + + if (F_ISSET(conn, WT_CONN_RECOVERING)) + __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session), + "recovered checkpoint snapshot min: %" PRIu64 ", snapshot max: %" PRIu64 + ", snapshot count: %" PRIu32, + conn->recovery_ckpt_snap_min, conn->recovery_ckpt_snap_max, + conn->recovery_ckpt_snapshot_count); + + WT_ERR(__wt_rts_btree_apply_all(session, rollback_timestamp)); + + /* Rollback the global durable timestamp to the stable timestamp. */ + txn_global->has_durable_timestamp = txn_global->has_stable_timestamp; + txn_global->durable_timestamp = txn_global->stable_timestamp; + + /* + * If the configuration is not in-memory, forcibly log a checkpoint after rollback to stable to + * ensure that both in-memory and on-disk versions are the same unless caller requested for no + * checkpoint. + */ + if (!F_ISSET(conn, WT_CONN_IN_MEMORY) && !no_ckpt) + WT_ERR(session->iface.checkpoint(&session->iface, "force=1")); + +err: + F_CLR(session, WT_SESSION_ROLLBACK_TO_STABLE); + return (ret); +} + +/* + * __rollback_to_stable_one -- + * Perform rollback to stable on a single object. + */ +static int +__rollback_to_stable_one(WT_SESSION_IMPL *session, const char *uri, bool *skipp) +{ + WT_DECL_RET; + wt_timestamp_t rollback_timestamp; + char *config; + + /* + * This is confusing: the caller's boolean argument "skip" stops the schema-worker loop from + * processing this object and any underlying objects it may have (for example, a table with + * multiple underlying file objects). We rollback-to-stable all of the file objects an object + * may contain, so set the caller's skip argument to true on all file objects, else set the + * caller's skip argument to false so our caller continues down the tree of objects. + */ + *skipp = WT_BTREE_PREFIX(uri); + if (!*skipp) + return (0); + + WT_RET(__wt_metadata_search(session, uri, &config)); + + /* Read the stable timestamp once, when we first start up. */ + WT_ORDERED_READ(rollback_timestamp, S2C(session)->txn_global.stable_timestamp); + + F_SET(session, WT_SESSION_QUIET_CORRUPT_FILE); + ret = __wt_rts_btree_walk_btree_apply(session, uri, config, rollback_timestamp); + F_CLR(session, WT_SESSION_QUIET_CORRUPT_FILE); + + __wt_free(session, config); + + return (ret); +} + +/* + * __rollback_to_stable -- + * Rollback the database to the stable timestamp. + */ +static int +__rollback_to_stable(WT_SESSION_IMPL *session, const char *cfg[], bool no_ckpt) +{ + WT_DECL_RET; + + WT_UNUSED(cfg); + + /* + * Don't use the connection's default session: we are working on data handles and (a) don't want + * to cache all of them forever, plus (b) can't guarantee that no other method will be called + * concurrently. Copy parent session no logging option to the internal session to make sure that + * rollback to stable doesn't generate log records. + */ + WT_RET( + __wt_open_internal_session(S2C(session), "txn rollback_to_stable", true, 0, 0, &session)); + + WT_STAT_CONN_SET(session, txn_rollback_to_stable_running, 1); + WT_WITH_CHECKPOINT_LOCK( + session, WT_WITH_SCHEMA_LOCK(session, ret = __rollback_to_stable_int(session, no_ckpt))); + WT_STAT_CONN_SET(session, txn_rollback_to_stable_running, 0); + + WT_TRET(__wt_session_close_internal(session)); + + return (ret); +} + +/* + * __wt_rollback_to_stable_init -- + * Initialize the data structures for the rollback to stable subsystem + */ +void +__wt_rollback_to_stable_init(WT_CONNECTION_IMPL *conn) +{ + /* + * Setup the pointer so the data structure can be accessed easily while avoiding the need to do + * explicit memory management. + */ + conn->rts = &conn->_rts; + + /* Setup function pointers */ + conn->rts->rollback_to_stable = __rollback_to_stable; + conn->rts->rollback_to_stable_one = __rollback_to_stable_one; +} diff --git a/src/third_party/wiredtiger/src/rollback_to_stable/rts_btree.c b/src/third_party/wiredtiger/src/rollback_to_stable/rts_btree.c new file mode 100644 index 00000000000..34a2971939d --- /dev/null +++ b/src/third_party/wiredtiger/src/rollback_to_stable/rts_btree.c @@ -0,0 +1,1083 @@ +/*- + * Copyright (c) 2014-present MongoDB, Inc. + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __rts_btree_abort_update -- + * Abort updates in an update change with timestamps newer than the rollback timestamp. Also, + * clear the history store flag for the first stable update in the update. + */ +static int +__rts_btree_abort_update(WT_SESSION_IMPL *session, WT_ITEM *key, WT_UPDATE *first_upd, + wt_timestamp_t rollback_timestamp, bool *stable_update_found) +{ + WT_UPDATE *stable_upd, *tombstone, *upd; + char ts_string[2][WT_TS_INT_STRING_SIZE]; + bool txn_id_visible; + + stable_upd = tombstone = NULL; + txn_id_visible = false; + if (stable_update_found != NULL) + *stable_update_found = false; + for (upd = first_upd; upd != NULL; upd = upd->next) { + /* Skip the updates that are aborted. */ + if (upd->txnid == WT_TXN_ABORTED) + continue; + + /* + * An unstable update needs to be aborted if any of the following are true: + * 1. An update is invisible based on the checkpoint snapshot during recovery. + * 2. The update durable timestamp is greater than the stable timestamp. + * 3. The update is a prepared update. + * + * Usually during recovery, there are no in memory updates present on the page. But + * whenever an unstable fast truncate operation is written to the disk, as part + * of the rollback to stable page read, it instantiates the tombstones on the page. + * The transaction id validation is ignored in all scenarios except recovery. + */ + txn_id_visible = __wt_rts_visibility_txn_visible_id(session, upd->txnid); + if (!txn_id_visible || rollback_timestamp < upd->durable_ts || + upd->prepare_state == WT_PREPARE_INPROGRESS) { + __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session), + "rollback to stable update aborted with txnid: %" PRIu64 + ", txnid not visible: %s, or stable timestamp (%s) < durable timestamp (%s): %s, or " + "prepare state (%d) is in progress: %s", + upd->txnid, !txn_id_visible ? "true" : "false", + __wt_timestamp_to_string(rollback_timestamp, ts_string[1]), + __wt_timestamp_to_string(upd->durable_ts, ts_string[0]), + rollback_timestamp < upd->durable_ts ? "true" : "false", upd->prepare_state, + upd->prepare_state == WT_PREPARE_INPROGRESS ? "true" : "false"); + + upd->txnid = WT_TXN_ABORTED; + WT_STAT_CONN_INCR(session, txn_rts_upd_aborted); + } else { + /* Valid update is found. */ + stable_upd = upd; + break; + } + } + + /* + * Clear the history store flags for the stable update to indicate that this update should be + * written to the history store later. The next time when this update is moved into the history + * store, it will have a different stop time point. + */ + if (stable_upd != NULL) { + if (F_ISSET(stable_upd, WT_UPDATE_HS | WT_UPDATE_TO_DELETE_FROM_HS)) { + /* Find the update following a stable tombstone. */ + if (stable_upd->type == WT_UPDATE_TOMBSTONE) { + tombstone = stable_upd; + for (stable_upd = stable_upd->next; stable_upd != NULL; + stable_upd = stable_upd->next) { + if (stable_upd->txnid != WT_TXN_ABORTED) { + WT_ASSERT(session, + stable_upd->type != WT_UPDATE_TOMBSTONE && + F_ISSET(stable_upd, WT_UPDATE_HS | WT_UPDATE_TO_DELETE_FROM_HS)); + break; + } + } + } + + /* + * Delete the first stable update and any newer update from the history store. If the + * update following the stable tombstone is removed by obsolete check, no need to remove + * that update from the history store as it has a globally visible tombstone. In that + * case, it is enough to delete everything up until to the tombstone timestamp. + */ + WT_RET(__wt_rts_history_delete_hs( + session, key, stable_upd == NULL ? tombstone->start_ts : stable_upd->start_ts)); + + /* + * Clear the history store flags for the first stable update. Otherwise, it will not be + * moved to history store again. + */ + if (stable_upd != NULL) + F_CLR(stable_upd, WT_UPDATE_HS | WT_UPDATE_TO_DELETE_FROM_HS); + if (tombstone != NULL) + F_CLR(tombstone, WT_UPDATE_HS | WT_UPDATE_TO_DELETE_FROM_HS); + } + if (stable_update_found != NULL) + *stable_update_found = true; + } + + return (0); +} + +/* + * __rts_btree_abort_insert_list -- + * Apply the update abort check to each entry in an insert skip list. Return how many entries + * had stable updates. + */ +static int +__rts_btree_abort_insert_list(WT_SESSION_IMPL *session, WT_PAGE *page, WT_INSERT_HEAD *head, + wt_timestamp_t rollback_timestamp, uint32_t *stable_updates_count) +{ + WT_DECL_ITEM(key); + WT_DECL_RET; + WT_INSERT *ins; + uint64_t recno; + uint8_t *memp; + bool stable_update_found; + + WT_ERR( + __wt_scr_alloc(session, page->type == WT_PAGE_ROW_LEAF ? 0 : WT_INTPACK64_MAXSIZE, &key)); + + WT_SKIP_FOREACH (ins, head) + if (ins->upd != NULL) { + if (page->type == WT_PAGE_ROW_LEAF) { + key->data = WT_INSERT_KEY(ins); + key->size = WT_INSERT_KEY_SIZE(ins); + } else { + recno = WT_INSERT_RECNO(ins); + memp = key->mem; + WT_ERR(__wt_vpack_uint(&memp, 0, recno)); + key->size = WT_PTRDIFF(memp, key->data); + } + WT_ERR(__rts_btree_abort_update( + session, key, ins->upd, rollback_timestamp, &stable_update_found)); + if (stable_update_found && stable_updates_count != NULL) + (*stable_updates_count)++; + if (!stable_update_found && page->type == WT_PAGE_ROW_LEAF && + !F_ISSET(S2C(session), WT_CONN_IN_MEMORY)) + /* + * When a new key is added to a page and the page is then checkpointed, updates for + * that key can be present in the History Store while the key isn't present in the + * disk image. RTS will then only remove these updates when there is a stable update + * on-chain. These updates still need removing when no stable updates are on-chain, + * so do so here explicitly. Pass in rollback_timestamp + 1 as history store cleanup + * removes updates inclusive of the provided timestamp, but we only want to remove + * unstable updates. + * + * FIXME-WT-10017: WT-9846 is an interim fix only for row-store while we investigate + * the impacts of a long term correction in WT-10017. Once completed this change can + * be reverted. + */ + WT_ERR(__wt_rts_history_delete_hs(session, key, rollback_timestamp + 1)); + } + +err: + __wt_scr_free(session, &key); + return (ret); +} + +/* + * __rts_btree_col_modify -- + * Add the provided update to the head of the update list. + */ +static inline int +__rts_btree_col_modify(WT_SESSION_IMPL *session, WT_REF *ref, WT_UPDATE *upd, uint64_t recno) +{ + WT_CURSOR_BTREE cbt; + WT_DECL_RET; + + __wt_btcur_init(session, &cbt); + __wt_btcur_open(&cbt); + + /* Search the page. */ + WT_ERR(__wt_col_search(&cbt, recno, ref, true, NULL)); + + /* Apply the modification. */ +#ifdef HAVE_DIAGNOSTIC + WT_ERR(__wt_col_modify(&cbt, recno, NULL, upd, WT_UPDATE_INVALID, true, false)); +#else + WT_ERR(__wt_col_modify(&cbt, recno, NULL, upd, WT_UPDATE_INVALID, true)); +#endif + +err: + /* Free any resources that may have been cached in the cursor. */ + WT_TRET(__wt_btcur_close(&cbt, true)); + + return (ret); +} + +/* + * __rts_btree_row_modify -- + * Add the provided update to the head of the update list. + */ +static inline int +__rts_btree_row_modify(WT_SESSION_IMPL *session, WT_REF *ref, WT_UPDATE *upd, WT_ITEM *key) +{ + WT_CURSOR_BTREE cbt; + WT_DECL_RET; + + __wt_btcur_init(session, &cbt); + __wt_btcur_open(&cbt); + + /* Search the page. */ + WT_ERR(__wt_row_search(&cbt, key, true, ref, true, NULL)); + + /* Apply the modification. */ +#ifdef HAVE_DIAGNOSTIC + WT_ERR(__wt_row_modify(&cbt, key, NULL, upd, WT_UPDATE_INVALID, true, false)); +#else + WT_ERR(__wt_row_modify(&cbt, key, NULL, upd, WT_UPDATE_INVALID, true)); +#endif + +err: + /* Free any resources that may have been cached in the cursor. */ + WT_TRET(__wt_btcur_close(&cbt, true)); + + return (ret); +} + +/* + * __rts_btree_ondisk_fixup_key -- + * Abort updates in the history store and replace the on-disk value with an update that + * satisfies the given timestamp. + */ +static int +__rts_btree_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_REF *ref, WT_ROW *rip, uint64_t recno, + WT_ITEM *row_key, WT_CELL_UNPACK_KV *unpack, wt_timestamp_t rollback_timestamp) +{ + WT_CURSOR *hs_cursor; + WT_DECL_ITEM(full_value); + WT_DECL_ITEM(hs_key); + WT_DECL_ITEM(hs_value); + WT_DECL_ITEM(key); + WT_DECL_ITEM(key_string); + WT_DECL_RET; + WT_PAGE *page; + WT_TIME_WINDOW *hs_tw; + WT_UPDATE *tombstone, *upd; + wt_timestamp_t hs_durable_ts, hs_start_ts, hs_stop_durable_ts, newer_hs_durable_ts, pinned_ts; + uint64_t hs_counter, type_full; + uint32_t hs_btree_id; + uint8_t *memp; + uint8_t type; + char ts_string[4][WT_TS_INT_STRING_SIZE]; + char tw_string[WT_TIME_STRING_SIZE]; + bool valid_update_found; +#ifdef HAVE_DIAGNOSTIC + bool first_record; +#endif + + page = ref->page; + + hs_cursor = NULL; + tombstone = upd = NULL; + hs_durable_ts = hs_start_ts = hs_stop_durable_ts = WT_TS_NONE; + hs_btree_id = S2BT(session)->id; + valid_update_found = false; +#ifdef HAVE_DIAGNOSTIC + first_record = true; +#endif + + /* Allocate buffers for the data store and history store key. */ + WT_ERR(__wt_scr_alloc(session, 0, &hs_key)); + WT_ERR(__wt_scr_alloc(session, 0, &hs_value)); + + if (rip != NULL) { + if (row_key != NULL) + key = row_key; + else { + /* Unpack a row key. */ + WT_ERR(__wt_scr_alloc(session, 0, &key)); + WT_ERR(__wt_row_leaf_key(session, page, rip, key, false)); + } + } else { + /* Manufacture a column key. */ + WT_ERR(__wt_scr_alloc(session, WT_INTPACK64_MAXSIZE, &key)); + memp = key->mem; + WT_ERR(__wt_vpack_uint(&memp, 0, recno)); + key->size = WT_PTRDIFF(memp, key->data); + } + + WT_ERR(__wt_scr_alloc(session, 0, &key_string)); + __wt_verbose_level_multi(session, WT_VERB_RECOVERY_RTS(session), WT_VERBOSE_DEBUG_2, + "rolling back the on-disk key: %s", + __wt_key_string(session, key->data, key->size, S2BT(session)->key_format, key_string)); + + WT_ERR(__wt_scr_alloc(session, 0, &full_value)); + WT_ERR(__wt_page_cell_data_ref_kv(session, page, unpack, full_value)); + /* + * We can read overflow removed value if checkpoint has run before rollback to stable. In this + * case, we have already appended the on page value to the update chain. At this point, we have + * visited the update chain and decided the value is not stable. In addition, checkpoint must + * have moved this value to the history store as a full value. Therefore, we can safely ignore + * the on page value if it is overflow removed. + */ + if (__wt_cell_type_raw(unpack->cell) == WT_CELL_VALUE_OVFL_RM) + ret = 0; + else + WT_ERR(__wt_buf_set(session, full_value, full_value->data, full_value->size)); + + newer_hs_durable_ts = unpack->tw.durable_start_ts; + + __wt_txn_pinned_timestamp(session, &pinned_ts); + + /* Open a history store table cursor. */ + WT_ERR(__wt_curhs_open(session, NULL, &hs_cursor)); + /* + * Rollback-to-stable operates exclusively (i.e., it is the only active operation in the system) + * outside the constraints of transactions. Therefore, there is no need for snapshot based + * visibility checks. + */ + F_SET(hs_cursor, WT_CURSTD_HS_READ_ALL); + + /* + * Scan the history store for the given btree and key with maximum start timestamp to let the + * search point to the last version of the key and start traversing backwards to find out the + * satisfying record according the given timestamp. Any satisfying history store record is moved + * into data store and removed from history store. If none of the history store records satisfy + * the given timestamp, the key is removed from data store. + */ + hs_cursor->set_key(hs_cursor, 4, hs_btree_id, key, WT_TS_MAX, UINT64_MAX); + ret = __wt_curhs_search_near_before(session, hs_cursor); + for (; ret == 0; ret = hs_cursor->prev(hs_cursor)) { + WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, hs_key, &hs_start_ts, &hs_counter)); + + /* Get current value and convert to full update if it is a modify. */ + WT_ERR(hs_cursor->get_value( + hs_cursor, &hs_stop_durable_ts, &hs_durable_ts, &type_full, hs_value)); + type = (uint8_t)type_full; + + /* Retrieve the time window from the history cursor. */ + __wt_hs_upd_time_window(hs_cursor, &hs_tw); + + /* + * We have a tombstone on the history update and it is obsolete according to the timestamp + * and txnid, so no need to restore it. These obsolete updates are written to the disk when + * they are not obsolete at the time of reconciliation by an eviction thread and later they + * become obsolete according to the checkpoint. + */ + if (__wt_rts_visibility_txn_visible_id(session, hs_tw->stop_txn) && + hs_tw->durable_stop_ts <= pinned_ts) { + __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session), + "history store stop is obsolete with time window: %s and pinned timestamp: %s", + __wt_time_window_to_string(hs_tw, tw_string), + __wt_timestamp_to_string(pinned_ts, ts_string[0])); + WT_ERR(hs_cursor->remove(hs_cursor)); + WT_STAT_CONN_DATA_INCR(session, txn_rts_hs_removed); + continue; + } + + /* + * Do not include history store updates greater than on-disk data store version to construct + * a full update to restore except when the on-disk update is prepared. Including more + * recent updates than the on-disk version shouldn't be problem as the on-disk version in + * history store is always a full update. It is better to not to include those updates as it + * unnecessarily increases the rollback to stable time. + * + * Comparing with timestamps here has no problem unlike in search flow where the timestamps + * may be reset during reconciliation. RTS detects an on-disk update is unstable based on + * the written proper timestamp, so comparing against it with history store shouldn't have + * any problem. + */ + if (hs_tw->start_ts <= unpack->tw.start_ts || unpack->tw.prepare) { + if (type == WT_UPDATE_MODIFY) + WT_ERR(__wt_modify_apply_item( + session, S2BT(session)->value_format, full_value, hs_value->data)); + else { + WT_ASSERT(session, type == WT_UPDATE_STANDARD); + WT_ERR(__wt_buf_set(session, full_value, hs_value->data, hs_value->size)); + } + } else + __wt_verbose_level_multi(session, WT_VERB_RECOVERY_RTS(session), WT_VERBOSE_DEBUG_2, + "history store update more recent than on-disk update with time window: %s and type: " + "%" PRIu8, + __wt_time_window_to_string(hs_tw, tw_string), type); + + /* + * Verify the history store timestamps are in order. The start timestamp may be equal to the + * stop timestamp if the original update's commit timestamp is in order. We may see records + * newer than or equal to the onpage value if eviction runs concurrently with checkpoint. In + * that case, don't verify the first record. + * + * It is possible during a prepared transaction rollback, the history store update that have + * its own stop timestamp doesn't get removed leads to duplicate records in history store + * after further operations on that same key. Rollback to stable should ignore such records + * for timestamp ordering verification. + * + * If we have fixed the missing timestamps, then the newer update reinserted with an older + * timestamp may have a durable timestamp that is smaller than the current stop durable + * timestamp. + * + * It is possible that there can be an update in the history store with a max stop timestamp + * in the middle of the same key updates. This occurs when the checkpoint writes the + * committed prepared update and further updates on that key including the history store + * changes before the transaction fixes the history store update to have a proper stop + * timestamp. It is a rare scenario. + */ + WT_ASSERT(session, + hs_stop_durable_ts <= newer_hs_durable_ts || hs_start_ts == hs_stop_durable_ts || + hs_start_ts == newer_hs_durable_ts || newer_hs_durable_ts == hs_durable_ts || + first_record || hs_stop_durable_ts == WT_TS_MAX); + + if (hs_stop_durable_ts < newer_hs_durable_ts) + WT_STAT_CONN_DATA_INCR(session, txn_rts_hs_stop_older_than_newer_start); + + /* + * Validate the timestamps in the key and the cell are same. This must be validated only + * after verifying it's stop time window is not globally visible. The start timestamps of + * the time window are cleared when they are globally visible and there will be no stop + * timestamp in the history store whenever a prepared update is written to the data store. + */ + WT_ASSERT(session, + (hs_tw->start_ts == WT_TS_NONE || hs_tw->start_ts == hs_start_ts) && + (hs_tw->durable_start_ts == WT_TS_NONE || hs_tw->durable_start_ts == hs_durable_ts) && + ((hs_tw->durable_stop_ts == 0 && hs_stop_durable_ts == WT_TS_MAX) || + hs_tw->durable_stop_ts == hs_stop_durable_ts)); + + /* + * Stop processing when we find a stable update according to the given timestamp and + * transaction id. + */ + if (__wt_rts_visibility_txn_visible_id(session, hs_tw->start_txn) && + hs_tw->durable_start_ts <= rollback_timestamp) { + __wt_verbose_level_multi(session, WT_VERB_RECOVERY_RTS(session), WT_VERBOSE_DEBUG_2, + "history store update valid with time window: %s, type: %" PRIu8 + " and stable timestamp: %s", + __wt_time_window_to_string(hs_tw, tw_string), type, + __wt_timestamp_to_string(rollback_timestamp, ts_string[0])); + WT_ASSERT(session, unpack->tw.prepare || hs_tw->start_ts <= unpack->tw.start_ts); + valid_update_found = true; + break; + } + + __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session), + "history store update aborted with time window: %s, type: %" PRIu8 + " and stable timestamp: %s", + __wt_time_window_to_string(hs_tw, tw_string), type, + __wt_timestamp_to_string(rollback_timestamp, ts_string[3])); + + /* + * Start time point of the current record may be used as stop time point of the previous + * record. Save it to verify against the previous record and check if we need to append the + * stop time point as a tombstone when we rollback the history store record. + */ + newer_hs_durable_ts = hs_durable_ts; +#ifdef HAVE_DIAGNOSTIC + first_record = false; +#endif + + WT_ERR(hs_cursor->remove(hs_cursor)); + WT_STAT_CONN_DATA_INCR(session, txn_rts_hs_removed); + WT_STAT_CONN_DATA_INCR(session, cache_hs_key_truncate_rts_unstable); + } + + /* + * If we found a history value that satisfied the given timestamp, add it to the update list. + * Otherwise remove the key by adding a tombstone. + */ + if (valid_update_found) { + /* Retrieve the time window from the history cursor. */ + __wt_hs_upd_time_window(hs_cursor, &hs_tw); + WT_ASSERT(session, + hs_tw->start_ts < unpack->tw.start_ts || hs_tw->start_txn < unpack->tw.start_txn); + WT_ERR(__wt_upd_alloc(session, full_value, WT_UPDATE_STANDARD, &upd, NULL)); + + /* + * Set the transaction id of updates to WT_TXN_NONE when called from recovery, because the + * connections write generation will be initialized after rollback to stable and the updates + * in the cache will be problematic. The transaction id of pages which are in disk will be + * automatically reset as part of unpacking cell when loaded to cache. + */ + if (F_ISSET(S2C(session), WT_CONN_RECOVERING)) + upd->txnid = WT_TXN_NONE; + else + upd->txnid = hs_tw->start_txn; + upd->durable_ts = hs_tw->durable_start_ts; + upd->start_ts = hs_tw->start_ts; + __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session), + "history store update restored txnid: %" PRIu64 ", start_ts: %s and durable_ts: %s", + upd->txnid, __wt_timestamp_to_string(upd->start_ts, ts_string[0]), + __wt_timestamp_to_string(upd->durable_ts, ts_string[1])); + + /* + * Set the flag to indicate that this update has been restored from history store for the + * rollback to stable operation. + */ + F_SET(upd, WT_UPDATE_RESTORED_FROM_HS); + WT_STAT_CONN_DATA_INCR(session, txn_rts_hs_restore_updates); + + /* + * We have a tombstone on the original update chain and it is stable according to the + * timestamp and txnid, we need to restore that as well. + */ + if (__wt_rts_visibility_txn_visible_id(session, hs_tw->stop_txn) && + hs_tw->durable_stop_ts <= rollback_timestamp) { + /* + * The restoring tombstone timestamp must be zero or less than previous update start + * timestamp. + */ + WT_ASSERT(session, + hs_stop_durable_ts == WT_TS_NONE || hs_stop_durable_ts < newer_hs_durable_ts || + unpack->tw.prepare); + + WT_ERR(__wt_upd_alloc_tombstone(session, &tombstone, NULL)); + /* + * Set the transaction id of updates to WT_TXN_NONE when called from recovery, because + * the connections write generation will be initialized after rollback to stable and the + * updates in the cache will be problematic. The transaction id of pages which are in + * disk will be automatically reset as part of unpacking cell when loaded to cache. + */ + if (F_ISSET(S2C(session), WT_CONN_RECOVERING)) + tombstone->txnid = WT_TXN_NONE; + else + tombstone->txnid = hs_tw->stop_txn; + tombstone->durable_ts = hs_tw->durable_stop_ts; + tombstone->start_ts = hs_tw->stop_ts; + __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session), + "history store tombstone restored txnid: %" PRIu64 + ", start_ts: %s and durable_ts: %s", + tombstone->txnid, __wt_timestamp_to_string(tombstone->start_ts, ts_string[0]), + __wt_timestamp_to_string(tombstone->durable_ts, ts_string[1])); + + /* + * Set the flag to indicate that this update has been restored from history store for + * the rollback to stable operation. + */ + F_SET(tombstone, WT_UPDATE_RESTORED_FROM_HS); + + tombstone->next = upd; + upd = tombstone; + WT_STAT_CONN_DATA_INCR(session, txn_rts_hs_restore_tombstones); + } + } else { + WT_ERR(__wt_upd_alloc_tombstone(session, &upd, NULL)); + WT_STAT_CONN_DATA_INCR(session, txn_rts_keys_removed); + __wt_verbose_level_multi( + session, WT_VERB_RECOVERY_RTS(session), WT_VERBOSE_DEBUG_3, "%s", "key removed"); + } + + if (rip != NULL) + WT_ERR(__rts_btree_row_modify(session, ref, upd, key)); + else + WT_ERR(__rts_btree_col_modify(session, ref, upd, recno)); + + /* Finally remove that update from history store. */ + if (valid_update_found) { + /* Avoid freeing the updates while still in use if hs_cursor->remove fails. */ + upd = tombstone = NULL; + + WT_ERR(hs_cursor->remove(hs_cursor)); + WT_STAT_CONN_DATA_INCR(session, txn_rts_hs_removed); + WT_STAT_CONN_DATA_INCR(session, cache_hs_key_truncate_rts); + } + + if (0) { +err: + WT_ASSERT(session, tombstone == NULL || upd == tombstone); + __wt_free_update_list(session, &upd); + } + __wt_scr_free(session, &full_value); + __wt_scr_free(session, &hs_key); + __wt_scr_free(session, &hs_value); + if (rip == NULL || row_key == NULL) + __wt_scr_free(session, &key); + __wt_scr_free(session, &key_string); + if (hs_cursor != NULL) + WT_TRET(hs_cursor->close(hs_cursor)); + return (ret); +} + +/* + * __rts_btree_abort_ondisk_kv -- + * Fix the on-disk K/V version according to the given timestamp. + */ +static int +__rts_btree_abort_ondisk_kv(WT_SESSION_IMPL *session, WT_REF *ref, WT_ROW *rip, uint64_t recno, + WT_ITEM *row_key, WT_CELL_UNPACK_KV *vpack, wt_timestamp_t rollback_timestamp, + bool *is_ondisk_stable) +{ + WT_DECL_ITEM(key); + WT_DECL_ITEM(key_string); + WT_DECL_ITEM(tmp); + WT_DECL_RET; + WT_PAGE *page; + WT_UPDATE *upd; + uint8_t *memp; + char time_string[WT_TIME_STRING_SIZE]; + char ts_string[5][WT_TS_INT_STRING_SIZE]; + bool prepared; + + page = ref->page; + upd = NULL; + + /* Initialize the on-disk stable version flag. */ + if (is_ondisk_stable != NULL) + *is_ondisk_stable = false; + + prepared = vpack->tw.prepare; + if (WT_IS_HS(session->dhandle)) { + /* + * Abort the history store update with stop durable timestamp greater than the stable + * timestamp or the updates with max stop timestamp which implies that they are associated + * with prepared transactions. + */ + if (vpack->tw.durable_stop_ts > rollback_timestamp || vpack->tw.stop_ts == WT_TS_MAX) { + __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session), + "history store update aborted with start durable/commit timestamp: %s, %s, stop " + "durable/commit timestamp: %s, %s and stable timestamp: %s", + __wt_timestamp_to_string(vpack->tw.durable_start_ts, ts_string[0]), + __wt_timestamp_to_string(vpack->tw.start_ts, ts_string[1]), + __wt_timestamp_to_string(vpack->tw.durable_stop_ts, ts_string[2]), + __wt_timestamp_to_string(vpack->tw.stop_ts, ts_string[3]), + __wt_timestamp_to_string(rollback_timestamp, ts_string[4])); + WT_RET(__wt_upd_alloc_tombstone(session, &upd, NULL)); + WT_STAT_CONN_DATA_INCR(session, txn_rts_sweep_hs_keys); + } else + return (0); + } else if (vpack->tw.durable_start_ts > rollback_timestamp || + !__wt_rts_visibility_txn_visible_id(session, vpack->tw.start_txn) || + (!WT_TIME_WINDOW_HAS_STOP(&vpack->tw) && prepared)) { + __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session), + "on-disk update aborted with time window %s. Start durable timestamp > stable timestamp: " + "%s, or txnid is not visible: %s, or tw has not stop and is prepared: %s", + __wt_time_point_to_string( + vpack->tw.start_ts, vpack->tw.durable_start_ts, vpack->tw.start_txn, time_string), + vpack->tw.durable_start_ts > rollback_timestamp ? "true" : "false", + !__wt_rts_visibility_txn_visible_id(session, vpack->tw.start_txn) ? "true" : "false", + !WT_TIME_WINDOW_HAS_STOP(&vpack->tw) && prepared ? "true" : "false"); + if (!F_ISSET(S2C(session), WT_CONN_IN_MEMORY)) + return (__rts_btree_ondisk_fixup_key( + session, ref, rip, recno, row_key, vpack, rollback_timestamp)); + else { + /* + * In-memory database don't have a history store to provide a stable update, so remove + * the key. Note that an in-memory database will have saved old values in the update + * chain, so we should only get here for a key/value that never existed at all as of the + * rollback timestamp; thus, deleting it is the correct response. + */ + WT_RET(__wt_upd_alloc_tombstone(session, &upd, NULL)); + WT_STAT_CONN_DATA_INCR(session, txn_rts_keys_removed); + } + } else if (WT_TIME_WINDOW_HAS_STOP(&vpack->tw) && + (vpack->tw.durable_stop_ts > rollback_timestamp || + !__wt_rts_visibility_txn_visible_id(session, vpack->tw.stop_txn) || prepared)) { + /* + * For prepared transactions, it is possible that both the on-disk key start and stop time + * windows can be the same. To abort these updates, check for any stable update from history + * store or remove the key. + */ + if (vpack->tw.start_ts == vpack->tw.stop_ts && + vpack->tw.durable_start_ts == vpack->tw.durable_stop_ts && + vpack->tw.start_txn == vpack->tw.stop_txn) { + WT_ASSERT(session, prepared == true); + if (!F_ISSET(S2C(session), WT_CONN_IN_MEMORY)) + return (__rts_btree_ondisk_fixup_key( + session, ref, rip, recno, row_key, vpack, rollback_timestamp)); + else { + /* + * In-memory database don't have a history store to provide a stable update, so + * remove the key. + */ + WT_RET(__wt_upd_alloc_tombstone(session, &upd, NULL)); + WT_STAT_CONN_DATA_INCR(session, txn_rts_keys_removed); + } + } else { + /* + * Clear the remove operation from the key by inserting the original on-disk value as a + * standard update. + */ + WT_RET(__wt_scr_alloc(session, 0, &tmp)); + if ((ret = __wt_page_cell_data_ref_kv(session, page, vpack, tmp)) == 0) + ret = __wt_upd_alloc(session, tmp, WT_UPDATE_STANDARD, &upd, NULL); + __wt_scr_free(session, &tmp); + WT_RET(ret); + + /* + * Set the transaction id of updates to WT_TXN_NONE when called from recovery, because + * the connections write generation will be initialized after rollback to stable and the + * updates in the cache will be problematic. The transaction id of pages which are in + * disk will be automatically reset as part of unpacking cell when loaded to cache. + */ + if (F_ISSET(S2C(session), WT_CONN_RECOVERING)) + upd->txnid = WT_TXN_NONE; + else + upd->txnid = vpack->tw.start_txn; + upd->durable_ts = vpack->tw.durable_start_ts; + upd->start_ts = vpack->tw.start_ts; + F_SET(upd, WT_UPDATE_RESTORED_FROM_DS); + WT_STAT_CONN_DATA_INCR(session, txn_rts_keys_restored); + __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session), + "key restored with commit timestamp: %s, durable timestamp: %s, stable timestamp: " + "%s, " + "txnid: %" PRIu64 + " and removed commit timestamp: %s, durable timestamp: %s, txnid: %" PRIu64 + ", prepared: %s", + __wt_timestamp_to_string(upd->start_ts, ts_string[0]), + __wt_timestamp_to_string(upd->durable_ts, ts_string[1]), + __wt_timestamp_to_string(rollback_timestamp, ts_string[2]), upd->txnid, + __wt_timestamp_to_string(vpack->tw.stop_ts, ts_string[3]), + __wt_timestamp_to_string(vpack->tw.durable_stop_ts, ts_string[4]), vpack->tw.stop_txn, + prepared ? "true" : "false"); + } + } else { + /* Stable version according to the timestamp. */ + if (is_ondisk_stable != NULL) + *is_ondisk_stable = true; + return (0); + } + + if (rip != NULL) { + if (row_key != NULL) + key = row_key; + else { + /* Unpack a row key. */ + WT_ERR(__wt_scr_alloc(session, 0, &key)); + WT_ERR(__wt_row_leaf_key(session, page, rip, key, false)); + } + } else { + /* Manufacture a column key. */ + WT_ERR(__wt_scr_alloc(session, WT_INTPACK64_MAXSIZE, &key)); + memp = key->mem; + WT_ERR(__wt_vpack_uint(&memp, 0, recno)); + key->size = WT_PTRDIFF(memp, key->data); + } + + WT_ERR(__wt_scr_alloc(session, 0, &key_string)); + __wt_verbose_level_multi(session, WT_VERB_RECOVERY_RTS(session), WT_VERBOSE_DEBUG_2, + "removing the key%s: %s", upd->type == WT_UPDATE_TOMBSTONE ? "" : " tombstone", + __wt_key_string(session, key->data, key->size, S2BT(session)->key_format, key_string)); + + if (rip != NULL) + WT_ERR(__rts_btree_row_modify(session, ref, upd, key)); + else + WT_ERR(__rts_btree_col_modify(session, ref, upd, recno)); + + if (0) { +err: + __wt_free(session, upd); + } + if (rip == NULL || row_key == NULL) + __wt_scr_free(session, &key); + __wt_scr_free(session, &key_string); + return (ret); +} + +/* + * __rts_btree_abort_col_var -- + * Abort updates on a variable length col leaf page with timestamps newer than the rollback + * timestamp. + */ +static int +__rts_btree_abort_col_var(WT_SESSION_IMPL *session, WT_REF *ref, wt_timestamp_t rollback_timestamp) +{ + WT_CELL *kcell; + WT_CELL_UNPACK_KV unpack; + WT_COL *cip; + WT_INSERT *ins; + WT_INSERT_HEAD *inshead; + WT_PAGE *page; + uint64_t ins_recno, recno, rle; + uint32_t i, j, stable_updates_count; + bool is_ondisk_stable; + + page = ref->page; + /* + * If a disk image exists, start from the provided recno; or else start from 0. + */ + if (page->dsk != NULL) + recno = page->dsk->recno; + else + recno = 0; + + /* Review the changes to the original on-page data items. */ + WT_COL_FOREACH (page, cip, i) { + stable_updates_count = 0; + + if ((inshead = WT_COL_UPDATE(page, cip)) != NULL) + WT_RET(__rts_btree_abort_insert_list( + session, page, inshead, rollback_timestamp, &stable_updates_count)); + + if (page->dsk != NULL) { + /* Unpack the cell. We need its RLE count whether or not we're going to iterate it. */ + kcell = WT_COL_PTR(page, cip); + __wt_cell_unpack_kv(session, page->dsk, kcell, &unpack); + rle = __wt_cell_rle(&unpack); + + /* + * Each key whose on-disk value is not stable and has no stable update on the update + * list must be processed downstream. + * + * If we can determine that the cell's on-disk value is stable, we can skip iterating + * over the cell; likewise, if we can determine that every key in the cell has a stable + * update on the update list, we can skip the iteration. Otherwise we have to try each + * key. + * + * If the on-disk cell is deleted, it is stable, because cells only appear as deleted + * when there is no older value that might need to be restored. + * + * Note that in a purely timestamped world, the presence of any stable update for any + * key in the cell means the on-disk value must be stable, because the update must be + * newer than the on-disk value. However, this is no longer true if the stable update + * has no timestamp. It may also not be true if the on-disk value is prepared, or other + * corner cases. Therefore, we must iterate the cell unless _every_ key has a stable + * update. + * + * We can, however, stop iterating as soon as the downstream code reports back that the + * on-disk value is actually stable. + */ + if (unpack.type == WT_CELL_DEL) + WT_STAT_CONN_DATA_INCR(session, txn_rts_delete_rle_skipped); + else if (stable_updates_count == rle) + WT_STAT_CONN_DATA_INCR(session, txn_rts_stable_rle_skipped); + else { + j = 0; + if (inshead != NULL) { + WT_SKIP_FOREACH (ins, inshead) { + /* If the update list goes past the end of the cell, something's wrong. */ + WT_ASSERT(session, j < rle); + ins_recno = WT_INSERT_RECNO(ins); + /* Process all the keys before this update. */ + while (recno + j < ins_recno) { + WT_RET(__rts_btree_abort_ondisk_kv(session, ref, NULL, recno + j, NULL, + &unpack, rollback_timestamp, &is_ondisk_stable)); + /* We can stop right away if the on-disk version is stable. */ + if (is_ondisk_stable) { + if (rle > 1) + WT_STAT_CONN_DATA_INCR(session, txn_rts_stable_rle_skipped); + goto stop; + } + j++; + } + /* If this key has a stable update, skip over it. */ + if (recno + j == ins_recno && + __wt_rts_visibility_has_stable_update(ins->upd)) + j++; + } + } + /* Process the rest of the keys. */ + while (j < rle) { + WT_RET(__rts_btree_abort_ondisk_kv(session, ref, NULL, recno + j, NULL, &unpack, + rollback_timestamp, &is_ondisk_stable)); + /* We can stop right away if the on-disk version is stable. */ + if (is_ondisk_stable) { + if (rle > 1) + WT_STAT_CONN_DATA_INCR(session, txn_rts_stable_rle_skipped); + goto stop; + } + j++; + } + } +stop: + recno += rle; + } + } + + /* Review the append list */ + if ((inshead = WT_COL_APPEND(page)) != NULL) + WT_RET(__rts_btree_abort_insert_list(session, page, inshead, rollback_timestamp, NULL)); + + return (0); +} + +/* + * __rts_btree_abort_col_fix_one -- + * Handle one possibly unstable on-disk time window. + */ +static int +__rts_btree_abort_col_fix_one(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t tw, + uint32_t recno_offset, wt_timestamp_t rollback_timestamp) +{ + WT_BTREE *btree; + WT_CELL *cell; + WT_CELL_UNPACK_KV unpack; + WT_PAGE *page; + uint8_t value; + + btree = S2BT(session); + page = ref->page; + + /* Unpack the cell to get the time window. */ + cell = WT_COL_FIX_TW_CELL(page, &page->pg_fix_tws[tw]); + __wt_cell_unpack_kv(session, page->dsk, cell, &unpack); + + /* Fake up the value (which is not physically in the cell) in case it's wanted. */ + value = __bit_getv(page->pg_fix_bitf, recno_offset, btree->bitcnt); + unpack.data = &value; + unpack.size = 1; + + return (__rts_btree_abort_ondisk_kv(session, ref, NULL, page->dsk->recno + recno_offset, NULL, + &unpack, rollback_timestamp, NULL)); +} + +/* + * __rts_btree_abort_col_fix -- + * Abort updates on a fixed length col leaf page with timestamps newer than the rollback + * timestamp. + */ +static int +__rts_btree_abort_col_fix(WT_SESSION_IMPL *session, WT_REF *ref, wt_timestamp_t rollback_timestamp) +{ + WT_INSERT *ins; + WT_INSERT_HEAD *inshead; + WT_PAGE *page; + uint32_t ins_recno_offset, recno_offset, numtws, tw; + + page = ref->page; + + /* + * Review the changes to the original on-page data items. Note that while this can report back + * to us whether it saw a stable update, that information doesn't do us any good -- unlike in + * VLCS where the uniformity of cells lets us reason about the timestamps of all of them based + * on the timestamp of an update to any of them, in FLCS everything is just thrown together, so + * we'll need to iterate over all the keys anyway. + */ + if ((inshead = WT_COL_UPDATE_SINGLE(page)) != NULL) + WT_RET(__rts_btree_abort_insert_list(session, page, inshead, rollback_timestamp, NULL)); + + /* + * Iterate over all the keys, stopping only on keys that (a) have a time window on disk, and + * also (b) do not have a stable update remaining in the update list. Keys with no on-disk time + * window are stable. And we must not try to adjust the on-disk value for keys with stable + * updates, because the downstream code assumes that has already been checked and in some cases + * (e.g. in-memory databases) the wrong thing will happen. + * + * Iterate over the update list and carry along the iteration over the time window list in + * parallel, even though the code would perhaps make more sense the other way around, because + * this allows using the skiplist iterator macro instead of an open-coded mess. + */ + numtws = WT_COL_FIX_TWS_SET(page) ? page->pg_fix_numtws : 0; + WT_ASSERT(session, numtws == 0 || page->dsk != NULL); + tw = 0; + if (inshead != NULL) { + WT_SKIP_FOREACH (ins, inshead) { + /* Process all the keys before this update entry. */ + ins_recno_offset = (uint32_t)(WT_INSERT_RECNO(ins) - ref->ref_recno); + while (tw < numtws && + (recno_offset = page->pg_fix_tws[tw].recno_offset) < ins_recno_offset) { + + WT_RET(__rts_btree_abort_col_fix_one( + session, ref, tw, recno_offset, rollback_timestamp)); + tw++; + } + /* If this key has a stable update, skip over it. */ + if (tw < numtws && page->pg_fix_tws[tw].recno_offset == ins_recno_offset && + ins->upd != NULL && __wt_rts_visibility_has_stable_update(ins->upd)) + tw++; + } + } + /* Process the rest of the keys with time windows. */ + while (tw < numtws) { + recno_offset = page->pg_fix_tws[tw].recno_offset; + WT_RET(__rts_btree_abort_col_fix_one(session, ref, tw, recno_offset, rollback_timestamp)); + tw++; + } + + /* Review the append list. */ + if ((inshead = WT_COL_APPEND(page)) != NULL) + WT_RET(__rts_btree_abort_insert_list(session, page, inshead, rollback_timestamp, NULL)); + + return (0); +} + +/* + * __rts_btree_abort_row_leaf -- + * Abort updates on a row leaf page with timestamps newer than the rollback timestamp. + */ +static int +__rts_btree_abort_row_leaf(WT_SESSION_IMPL *session, WT_REF *ref, wt_timestamp_t rollback_timestamp) +{ + WT_CELL_UNPACK_KV *vpack, _vpack; + WT_DECL_ITEM(key); + WT_DECL_RET; + WT_INSERT_HEAD *insert; + WT_PAGE *page; + WT_ROW *rip; + WT_UPDATE *upd; + uint32_t i; + bool have_key, stable_update_found; + + page = ref->page; + + WT_RET(__wt_scr_alloc(session, 0, &key)); + + /* + * Review the insert list for keys before the first entry on the disk page. + */ + if ((insert = WT_ROW_INSERT_SMALLEST(page)) != NULL) + WT_ERR(__rts_btree_abort_insert_list(session, page, insert, rollback_timestamp, NULL)); + + /* + * Review updates that belong to keys that are on the disk image, as well as for keys inserted + * since the page was read from disk. + */ + WT_ROW_FOREACH (page, rip, i) { + stable_update_found = false; + if ((upd = WT_ROW_UPDATE(page, rip)) != NULL) { + WT_ERR(__wt_row_leaf_key(session, page, rip, key, false)); + WT_ERR(__rts_btree_abort_update( + session, key, upd, rollback_timestamp, &stable_update_found)); + have_key = true; + } else + have_key = false; + + if ((insert = WT_ROW_INSERT(page, rip)) != NULL) + WT_ERR(__rts_btree_abort_insert_list(session, page, insert, rollback_timestamp, NULL)); + + /* + * If there is no stable update found in the update list, abort any on-disk value. + */ + if (!stable_update_found) { + vpack = &_vpack; + __wt_row_leaf_value_cell(session, page, rip, vpack); + WT_ERR(__rts_btree_abort_ondisk_kv( + session, ref, rip, 0, have_key ? key : NULL, vpack, rollback_timestamp, NULL)); + } + } + +err: + __wt_scr_free(session, &key); + return (ret); +} + +/* + * __wt_rts_btree_abort_updates -- + * Abort updates on this page newer than the timestamp. + */ +int +__wt_rts_btree_abort_updates( + WT_SESSION_IMPL *session, WT_REF *ref, wt_timestamp_t rollback_timestamp) +{ + WT_PAGE *page; + bool modified; + + /* + * If we have a ref with clean page, find out whether the page has any modifications that are + * newer than the given timestamp. As eviction writes the newest version to page, even a clean + * page may also contain modifications that need rollback. + */ + page = ref->page; + modified = __wt_page_is_modified(page); + if (!modified && !__wt_rts_visibility_page_needs_abort(session, ref, rollback_timestamp)) { + __wt_verbose_level_multi(session, WT_VERB_RECOVERY_RTS(session), WT_VERBOSE_DEBUG_3, + "%p: unmodified stable page skipped", (void *)ref); + return (0); + } + + WT_STAT_CONN_INCR(session, txn_rts_pages_visited); + __wt_verbose_level_multi(session, WT_VERB_RECOVERY_RTS(session), WT_VERBOSE_DEBUG_2, + "%p: roll back %s page", (void *)ref, modified ? "modified" : "clean"); + + switch (page->type) { + case WT_PAGE_COL_FIX: + WT_RET(__rts_btree_abort_col_fix(session, ref, rollback_timestamp)); + break; + case WT_PAGE_COL_VAR: + WT_RET(__rts_btree_abort_col_var(session, ref, rollback_timestamp)); + break; + case WT_PAGE_ROW_LEAF: + WT_RET(__rts_btree_abort_row_leaf(session, ref, rollback_timestamp)); + break; + case WT_PAGE_COL_INT: + case WT_PAGE_ROW_INT: + /* This function is not called for internal pages. */ + WT_ASSERT(session, false); + /* Fall through. */ + default: + WT_RET(__wt_illegal_value(session, page->type)); + } + + /* Mark the page as dirty to reconcile the page. */ + if (page->modify) + __wt_page_modify_set(session, page); + return (0); +} diff --git a/src/third_party/wiredtiger/src/rollback_to_stable/rts_btree_walk.c b/src/third_party/wiredtiger/src/rollback_to_stable/rts_btree_walk.c new file mode 100644 index 00000000000..7045e2d2950 --- /dev/null +++ b/src/third_party/wiredtiger/src/rollback_to_stable/rts_btree_walk.c @@ -0,0 +1,320 @@ +/*- + * Copyright (c) 2014-present MongoDB, Inc. + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __rts_btree_walk_check_btree_modified -- + * Check that the rollback to stable btree is modified or not. + */ +static int +__rts_btree_walk_check_btree_modified(WT_SESSION_IMPL *session, const char *uri, bool *modified) +{ + WT_DECL_RET; + + ret = __wt_conn_dhandle_find(session, uri, NULL); + *modified = ret == 0 && S2BT(session)->modified; + return (ret); +} + +/* + * __rts_btree_walk_page_skip -- + * Skip if rollback to stable doesn't require reading this page. + */ +static int +__rts_btree_walk_page_skip( + WT_SESSION_IMPL *session, WT_REF *ref, void *context, bool visible_all, bool *skipp) +{ + WT_PAGE_DELETED *page_del; + wt_timestamp_t rollback_timestamp; + char time_string[WT_TIME_STRING_SIZE]; + + rollback_timestamp = *(wt_timestamp_t *)context; + WT_UNUSED(visible_all); + + *skipp = false; /* Default to reading */ + + /* + * Skip pages truncated at or before the RTS timestamp. (We could read the page, but that would + * unnecessarily instantiate it). If the page has no fast-delete information, that means either + * it was discarded because the delete is globally visible, or the internal page holding the + * cell was an old format page so none was loaded. In the latter case we should skip the page as + * there's no way to get correct behavior and skipping matches the historic behavior. Note that + * eviction is running; we must lock the WT_REF before examining the fast-delete information. + */ + if (ref->state == WT_REF_DELETED && + WT_REF_CAS_STATE(session, ref, WT_REF_DELETED, WT_REF_LOCKED)) { + page_del = ref->page_del; + if (page_del == NULL || + (__wt_rts_visibility_txn_visible_id(session, page_del->txnid) && + page_del->durable_timestamp <= rollback_timestamp)) { + /* + * We should never see a prepared truncate here; not at recovery time because prepared + * truncates can't be written to disk, and not during a runtime RTS either because it + * should not be possible to do that with an unresolved prepared transaction. + */ + WT_ASSERT(session, + page_del == NULL || page_del->prepare_state == WT_PREPARE_INIT || + page_del->prepare_state == WT_PREPARE_RESOLVED); + + if (page_del == NULL) + __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session), + "%p: deleted page walk skipped", (void *)ref); + else { + __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session), + "%p: deleted page walk skipped page_del %s", (void *)ref, + __wt_time_point_to_string(page_del->timestamp, page_del->durable_timestamp, + page_del->txnid, time_string)); + } + WT_STAT_CONN_INCR(session, txn_rts_tree_walk_skip_pages); + *skipp = true; + } + WT_REF_SET_STATE(ref, WT_REF_DELETED); + return (0); + } + + /* Otherwise, if the page state is other than on disk, we want to look at it. */ + if (ref->state != WT_REF_DISK) + return (0); + + /* + * Check whether this on-disk page has any updates to be aborted. We are not holding a hazard + * reference on the page and so we rely on there being no other threads of control in the tree, + * that is, eviction ignores WT_REF_DISK pages and no other thread is reading pages, this page + * cannot change state from on-disk to something else. + */ + if (!__wt_rts_visibility_page_needs_abort(session, ref, rollback_timestamp)) { + *skipp = true; + __wt_verbose_multi( + session, WT_VERB_RECOVERY_RTS(session), "%p: stable page walk skipped", (void *)ref); + WT_STAT_CONN_INCR(session, txn_rts_tree_walk_skip_pages); + } + + return (0); +} + +/* + * __rts_btree_walk -- + * Called for each open handle - choose to either skip or wipe the commits + */ +static int +__rts_btree_walk(WT_SESSION_IMPL *session, wt_timestamp_t rollback_timestamp) +{ + WT_DECL_RET; + WT_REF *ref; + + /* Walk the tree, marking commits aborted where appropriate. */ + ref = NULL; + while ( + (ret = __wt_tree_walk_custom_skip(session, &ref, __rts_btree_walk_page_skip, + &rollback_timestamp, WT_READ_NO_EVICT | WT_READ_VISIBLE_ALL | WT_READ_WONT_NEED)) == 0 && + ref != NULL) + if (F_ISSET(ref, WT_REF_FLAG_LEAF)) + WT_RET(__wt_rts_btree_abort_updates(session, ref, rollback_timestamp)); + + return (ret); +} + +/* + * __wt_rts_btree_walk_btree_apply -- + * Perform rollback to stable on a single file. + */ +int +__wt_rts_btree_walk_btree_apply( + WT_SESSION_IMPL *session, const char *uri, const char *config, wt_timestamp_t rollback_timestamp) +{ + WT_CONFIG ckptconf; + WT_CONFIG_ITEM cval, value, key; + WT_DECL_RET; + wt_timestamp_t max_durable_ts, newest_start_durable_ts, newest_stop_durable_ts; + size_t addr_size; + uint64_t rollback_txnid, write_gen; + uint32_t btree_id; + char ts_string[2][WT_TS_INT_STRING_SIZE]; + bool dhandle_allocated, durable_ts_found, has_txn_updates_gt_than_ckpt_snap, modified; + bool prepared_updates; + + /* Ignore non-btree objects as well as the metadata and history store files. */ + if (!WT_BTREE_PREFIX(uri) || strcmp(uri, WT_HS_URI) == 0 || strcmp(uri, WT_METAFILE_URI) == 0) + return (0); + + addr_size = 0; + rollback_txnid = 0; + write_gen = 0; + dhandle_allocated = false; + + /* Find out the max durable timestamp of the object from checkpoint. */ + newest_start_durable_ts = newest_stop_durable_ts = WT_TS_NONE; + durable_ts_found = prepared_updates = has_txn_updates_gt_than_ckpt_snap = false; + + WT_RET(__wt_config_getones(session, config, "checkpoint", &cval)); + __wt_config_subinit(session, &ckptconf, &cval); + for (; __wt_config_next(&ckptconf, &key, &cval) == 0;) { + ret = __wt_config_subgets(session, &cval, "newest_start_durable_ts", &value); + if (ret == 0) { + newest_start_durable_ts = WT_MAX(newest_start_durable_ts, (wt_timestamp_t)value.val); + durable_ts_found = true; + } + WT_RET_NOTFOUND_OK(ret); + ret = __wt_config_subgets(session, &cval, "newest_stop_durable_ts", &value); + if (ret == 0) { + newest_stop_durable_ts = WT_MAX(newest_stop_durable_ts, (wt_timestamp_t)value.val); + durable_ts_found = true; + } + WT_RET_NOTFOUND_OK(ret); + ret = __wt_config_subgets(session, &cval, "prepare", &value); + if (ret == 0) { + if (value.val) + prepared_updates = true; + } + WT_RET_NOTFOUND_OK(ret); + ret = __wt_config_subgets(session, &cval, "newest_txn", &value); + if (value.len != 0) + rollback_txnid = (uint64_t)value.val; + WT_RET_NOTFOUND_OK(ret); + ret = __wt_config_subgets(session, &cval, "addr", &value); + if (ret == 0) + addr_size = value.len; + WT_RET_NOTFOUND_OK(ret); + ret = __wt_config_subgets(session, &cval, "write_gen", &value); + if (ret == 0) + write_gen = (uint64_t)value.val; + WT_RET_NOTFOUND_OK(ret); + } + max_durable_ts = WT_MAX(newest_start_durable_ts, newest_stop_durable_ts); + + /* + * Perform rollback to stable when the newest written transaction of the btree is greater than + * or equal to the checkpoint snapshot. The snapshot comparison is valid only when the btree + * write generation number is greater than the last checkpoint connection base write generation + * to confirm that the btree is modified in the previous restart cycle. + */ + if (WT_CHECK_RECOVERY_FLAG_TXNID(session, rollback_txnid) && + (write_gen >= S2C(session)->last_ckpt_base_write_gen)) { + has_txn_updates_gt_than_ckpt_snap = true; + /* Increment the inconsistent checkpoint stats counter. */ + WT_STAT_CONN_DATA_INCR(session, txn_rts_inconsistent_ckpt); + } + + /* + * The rollback to stable will skip the tables during recovery and shutdown in the following + * conditions. + * 1. Empty table. + * 2. Table has timestamped updates without a stable timestamp. + */ + if ((F_ISSET(S2C(session), WT_CONN_RECOVERING) || + F_ISSET(S2C(session), WT_CONN_CLOSING_CHECKPOINT)) && + (addr_size == 0 || (rollback_timestamp == WT_TS_NONE && max_durable_ts != WT_TS_NONE))) { + __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session), + "skip rollback to stable on file %s because %s", uri, + addr_size == 0 ? "its checkpoint address length is 0" : + "it has timestamped updates and the stable timestamp is 0"); + return (0); + } + + /* + * The rollback operation should be performed on this file based on the following: + * 1. The dhandle is present in the cache and tree is modified. + * 2. The checkpoint durable start/stop timestamp is greater than the rollback timestamp. + * 3. The checkpoint has prepared updates written to disk. + * 4. There is no durable timestamp in any checkpoint. + * 5. The checkpoint newest txn is greater than snapshot min txn id. + */ + WT_WITHOUT_DHANDLE(session, + WT_WITH_HANDLE_LIST_READ_LOCK( + session, (ret = __rts_btree_walk_check_btree_modified(session, uri, &modified)))); + + WT_ERR_NOTFOUND_OK(ret, false); + + if (modified || max_durable_ts > rollback_timestamp || prepared_updates || !durable_ts_found || + has_txn_updates_gt_than_ckpt_snap) { + /* + * Open a handle; we're potentially opening a lot of handles and there's no reason to cache + * all of them for future unknown use, discard on close. + */ + ret = __wt_session_get_dhandle(session, uri, NULL, NULL, WT_DHANDLE_DISCARD); + if (ret != 0) + WT_ERR_MSG(session, ret, "%s: unable to open handle%s", uri, + ret == EBUSY ? ", error indicates handle is unavailable due to concurrent use" : ""); + dhandle_allocated = true; + + __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session), + "tree rolled back because it is modified: %s, or its durable timestamp (%s) > stable " + "timestamp (%s): " + "%s, or it has prepared updates: %s, or durable " + "timestamp is not found: %s, or txnid (%" PRIu64 + ") > recovery checkpoint snap min (%" PRIu64 "): %s", + S2BT(session)->modified ? "true" : "false", + __wt_timestamp_to_string(max_durable_ts, ts_string[0]), + __wt_timestamp_to_string(rollback_timestamp, ts_string[1]), + max_durable_ts > rollback_timestamp ? "true" : "false", + prepared_updates ? "true" : "false", !durable_ts_found ? "true" : "false", rollback_txnid, + S2C(session)->recovery_ckpt_snap_min, + has_txn_updates_gt_than_ckpt_snap ? "true" : "false"); + + WT_ERR(__wt_rts_btree_walk_btree(session, rollback_timestamp)); + } else + __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session), + "%s: tree skipped with durable timestamp: %s and stable timestamp: %s or txnid: %" PRIu64, + uri, __wt_timestamp_to_string(max_durable_ts, ts_string[0]), + __wt_timestamp_to_string(rollback_timestamp, ts_string[1]), rollback_txnid); + + /* + * Truncate history store entries for the non-timestamped table. + * Exceptions: + * 1. Modified tree - Scenarios where the tree is never checkpointed lead to zero + * durable timestamp even they are timestamped tables. Until we have a special + * indication of letting to know the table type other than checking checkpointed durable + * timestamp to WT_TS_NONE, we need this exception. + * 2. In-memory database - In this scenario, there is no history store to truncate. + */ + if ((!dhandle_allocated || !S2BT(session)->modified) && max_durable_ts == WT_TS_NONE && + !F_ISSET(S2C(session), WT_CONN_IN_MEMORY)) { + WT_ERR(__wt_config_getones(session, config, "id", &cval)); + btree_id = (uint32_t)cval.val; + WT_ERR(__wt_rts_history_btree_hs_truncate(session, btree_id)); + } + +err: + if (dhandle_allocated) + WT_TRET(__wt_session_release_dhandle(session)); + return (ret); +} + +/* + * __wt_rts_btree_walk_btree -- + * Called for each object handle - choose to either skip or wipe the commits + */ +int +__wt_rts_btree_walk_btree(WT_SESSION_IMPL *session, wt_timestamp_t rollback_timestamp) +{ + WT_BTREE *btree; + WT_CONNECTION_IMPL *conn; + + btree = S2BT(session); + conn = S2C(session); + + __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session), + "rollback to stable connection logging enabled: %s and btree logging enabled: %s", + FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) ? "true" : "false", + F_ISSET(btree, WT_BTREE_LOGGED) ? "true" : "false"); + + /* Files with commit-level durability (without timestamps), don't get their commits wiped. */ + if (F_ISSET(btree, WT_BTREE_LOGGED)) + return (0); + + /* There is never anything to do for checkpoint handles. */ + if (WT_READING_CHECKPOINT(session)) + return (0); + + /* There is nothing to do on an empty tree. */ + if (btree->root.page == NULL) + return (0); + + return (__rts_btree_walk(session, rollback_timestamp)); +} diff --git a/src/third_party/wiredtiger/src/rollback_to_stable/rts_history.c b/src/third_party/wiredtiger/src/rollback_to_stable/rts_history.c new file mode 100644 index 00000000000..2d1d4ecd292 --- /dev/null +++ b/src/third_party/wiredtiger/src/rollback_to_stable/rts_history.c @@ -0,0 +1,230 @@ +/*- + * Copyright (c) 2014-present MongoDB, Inc. + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_rts_history_delete_hs -- + * Delete the updates for a key in the history store until the first update (including) that is + * larger than or equal to the specified timestamp. + */ +int +__wt_rts_history_delete_hs(WT_SESSION_IMPL *session, WT_ITEM *key, wt_timestamp_t ts) +{ + WT_CURSOR *hs_cursor; + WT_DECL_ITEM(hs_key); + WT_DECL_RET; + WT_TIME_WINDOW *hs_tw; + + /* Open a history store table cursor. */ + WT_RET(__wt_curhs_open(session, NULL, &hs_cursor)); + /* + * Rollback-to-stable operates exclusively (i.e., it is the only active operation in the system) + * outside the constraints of transactions. Therefore, there is no need for snapshot based + * visibility checks. + */ + F_SET(hs_cursor, WT_CURSTD_HS_READ_COMMITTED); + + WT_ERR(__wt_scr_alloc(session, 0, &hs_key)); + + /* + * Scan the history store for the given btree and key with maximum start timestamp to let the + * search point to the last version of the key and start traversing backwards to delete all the + * records until the first update with the start timestamp larger than or equal to the specified + * timestamp. + */ + hs_cursor->set_key(hs_cursor, 4, S2BT(session)->id, key, WT_TS_MAX, UINT64_MAX); + ret = __wt_curhs_search_near_before(session, hs_cursor); + for (; ret == 0; ret = hs_cursor->prev(hs_cursor)) { + /* Retrieve the time window from the history cursor. */ + __wt_hs_upd_time_window(hs_cursor, &hs_tw); + + /* + * Remove all history store versions with a stop timestamp greater than the start/stop + * timestamp of a stable update in the data store. + */ + if (hs_tw->stop_ts <= ts) + break; + + WT_ERR(hs_cursor->remove(hs_cursor)); + WT_STAT_CONN_DATA_INCR(session, txn_rts_hs_removed); + + /* + * The globally visible start time window's are cleared during history store reconciliation. + * Treat them also as a stable entry removal from the history store. + */ + if (hs_tw->start_ts == ts || hs_tw->start_ts == WT_TS_NONE) + WT_STAT_CONN_DATA_INCR(session, cache_hs_key_truncate_rts); + else + WT_STAT_CONN_DATA_INCR(session, cache_hs_key_truncate_rts_unstable); + } + WT_ERR_NOTFOUND_OK(ret, false); + +err: + __wt_scr_free(session, &hs_key); + WT_TRET(hs_cursor->close(hs_cursor)); + return (ret); +} + +/* + * __wt_rts_history_btree_hs_truncate -- + * Wipe all history store updates for the btree (non-timestamped tables) + */ +int +__wt_rts_history_btree_hs_truncate(WT_SESSION_IMPL *session, uint32_t btree_id) +{ + WT_CURSOR *hs_cursor_start, *hs_cursor_stop; + WT_DECL_ITEM(hs_key); + WT_DECL_RET; + WT_SESSION *truncate_session; + wt_timestamp_t hs_start_ts; + uint64_t hs_counter; + uint32_t hs_btree_id; + + hs_cursor_start = hs_cursor_stop = NULL; + hs_btree_id = 0; + truncate_session = (WT_SESSION *)session; + + WT_RET(__wt_scr_alloc(session, 0, &hs_key)); + + /* Open a history store start cursor. */ + WT_ERR(__wt_curhs_open(session, NULL, &hs_cursor_start)); + F_SET(hs_cursor_start, WT_CURSTD_HS_READ_COMMITTED); + + hs_cursor_start->set_key(hs_cursor_start, 1, btree_id); + WT_ERR_NOTFOUND_OK(__wt_curhs_search_near_after(session, hs_cursor_start), true); + if (ret == WT_NOTFOUND) { + ret = 0; + goto done; + } + + /* Open a history store stop cursor. */ + WT_ERR(__wt_curhs_open(session, NULL, &hs_cursor_stop)); + F_SET(hs_cursor_stop, WT_CURSTD_HS_READ_COMMITTED | WT_CURSTD_HS_READ_ACROSS_BTREE); + + hs_cursor_stop->set_key(hs_cursor_stop, 1, btree_id + 1); + WT_ERR_NOTFOUND_OK(__wt_curhs_search_near_after(session, hs_cursor_stop), true); + +#ifdef HAVE_DIAGNOSTIC + /* If we get not found, we are at the largest btree id in the history store. */ + if (ret == 0) { + hs_cursor_stop->get_key(hs_cursor_stop, &hs_btree_id, hs_key, &hs_start_ts, &hs_counter); + WT_ASSERT(session, hs_btree_id > btree_id); + } +#endif + + do { + WT_ASSERT(session, ret == WT_NOTFOUND || hs_btree_id > btree_id); + + WT_ERR_NOTFOUND_OK(hs_cursor_stop->prev(hs_cursor_stop), true); + /* We can find the start point then we must be able to find the stop point. */ + if (ret == WT_NOTFOUND) + WT_ERR_PANIC( + session, ret, "cannot locate the stop point to truncate the history store."); + hs_cursor_stop->get_key(hs_cursor_stop, &hs_btree_id, hs_key, &hs_start_ts, &hs_counter); + } while (hs_btree_id != btree_id); + + WT_ERR( + truncate_session->truncate(truncate_session, NULL, hs_cursor_start, hs_cursor_stop, NULL)); + + WT_STAT_CONN_DATA_INCR(session, cache_hs_btree_truncate); + + __wt_verbose(session, WT_VERB_RECOVERY_PROGRESS, + "Rollback to stable has truncated records for btree %u from the history store", btree_id); + +done: +err: + __wt_scr_free(session, &hs_key); + if (hs_cursor_start != NULL) + WT_TRET(hs_cursor_start->close(hs_cursor_start)); + if (hs_cursor_stop != NULL) + WT_TRET(hs_cursor_stop->close(hs_cursor_stop)); + + return (ret); +} + +/* + * __wt_rts_history_final_pass -- + * Perform rollback to stable on the history store to remove any entries newer than the stable + * timestamp. + */ +int +__wt_rts_history_final_pass(WT_SESSION_IMPL *session, wt_timestamp_t rollback_timestamp) +{ + WT_CONFIG ckptconf; + WT_CONFIG_ITEM cval, durableval, key; + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + wt_timestamp_t max_durable_ts, newest_stop_durable_ts, newest_stop_ts; + size_t i; + char *config; + char ts_string[2][WT_TS_INT_STRING_SIZE]; + + config = NULL; + conn = S2C(session); + + WT_RET(__wt_metadata_search(session, WT_HS_URI, &config)); + + /* + * Find out the max durable timestamp of the history store from checkpoint. Most of the history + * store updates have stop timestamp either greater or equal to the start timestamp except for + * the updates written for the prepared updates on the data store. To abort the updates with no + * stop timestamp, we must include the newest stop timestamp also into the calculation of + * maximum timestamp of the history store. + */ + newest_stop_durable_ts = newest_stop_ts = WT_TS_NONE; + WT_ERR(__wt_config_getones(session, config, "checkpoint", &cval)); + __wt_config_subinit(session, &ckptconf, &cval); + for (; __wt_config_next(&ckptconf, &key, &cval) == 0;) { + ret = __wt_config_subgets(session, &cval, "newest_stop_durable_ts", &durableval); + if (ret == 0) + newest_stop_durable_ts = WT_MAX(newest_stop_durable_ts, (wt_timestamp_t)durableval.val); + WT_ERR_NOTFOUND_OK(ret, false); + ret = __wt_config_subgets(session, &cval, "newest_stop_ts", &durableval); + if (ret == 0) + newest_stop_ts = WT_MAX(newest_stop_ts, (wt_timestamp_t)durableval.val); + WT_ERR_NOTFOUND_OK(ret, false); + } + max_durable_ts = WT_MAX(newest_stop_ts, newest_stop_durable_ts); + WT_ERR(__wt_session_get_dhandle(session, WT_HS_URI, NULL, NULL, 0)); + + /* + * The rollback operation should be performed on the history store file when the checkpoint + * durable start/stop timestamp is greater than the rollback timestamp. But skip if there is no + * stable timestamp. + * + * Note that the corresponding code for RTS btree apply also checks whether there _are_ + * timestamped updates by checking max_durable_ts; that check is redundant here for several + * reasons, the most immediate being that max_durable_ts cannot be none (zero) because it's + * greater than rollback_timestamp, which is itself greater than zero. + */ + if (max_durable_ts > rollback_timestamp && rollback_timestamp != WT_TS_NONE) { + __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session), + "tree rolled back with durable timestamp: %s", + __wt_timestamp_to_string(max_durable_ts, ts_string[0])); + WT_TRET(__wt_rts_btree_walk_btree(session, rollback_timestamp)); + } else + __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session), + "tree skipped with durable timestamp: %s and stable timestamp: %s", + __wt_timestamp_to_string(max_durable_ts, ts_string[0]), + __wt_timestamp_to_string(rollback_timestamp, ts_string[1])); + + /* + * Truncate history store entries from the partial backup remove list. The list holds all of the + * btree ids that do not exist as part of the database anymore due to performing a selective + * restore from backup. + */ + if (F_ISSET(conn, WT_CONN_BACKUP_PARTIAL_RESTORE) && conn->partial_backup_remove_ids != NULL) + for (i = 0; conn->partial_backup_remove_ids[i] != 0; ++i) + WT_ERR(__wt_rts_history_btree_hs_truncate(session, conn->partial_backup_remove_ids[i])); +err: + if (session->dhandle != NULL) + WT_TRET(__wt_session_release_dhandle(session)); + __wt_free(session, config); + return (ret); +} diff --git a/src/third_party/wiredtiger/src/rollback_to_stable/rts_visibility.c b/src/third_party/wiredtiger/src/rollback_to_stable/rts_visibility.c new file mode 100644 index 00000000000..11bd6ac7a5e --- /dev/null +++ b/src/third_party/wiredtiger/src/rollback_to_stable/rts_visibility.c @@ -0,0 +1,156 @@ +/*- + * Copyright (c) 2014-present MongoDB, Inc. + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_rts_visibility_has_stable_update -- + * Check if an update chain has a stable update on it. Assume the update chain has already been + * processed so all we need to do is look for a valid, non-aborted entry. + */ +bool +__wt_rts_visibility_has_stable_update(WT_UPDATE *upd) +{ + while (upd != NULL && (upd->type == WT_UPDATE_INVALID || upd->txnid == WT_TXN_ABORTED)) + upd = upd->next; + return (upd != NULL); +} + +/* + * __wt_rts_visibility_txn_visible_id -- + * Check if the transaction id is visible or not. + */ +bool +__wt_rts_visibility_txn_visible_id(WT_SESSION_IMPL *session, uint64_t id) +{ + WT_CONNECTION_IMPL *conn; + + conn = S2C(session); + + /* If not recovery then assume all the data as visible. */ + if (!F_ISSET(conn, WT_CONN_RECOVERING)) + return (true); + + /* + * Only full checkpoint writes the metadata with snapshot. If the recovered checkpoint snapshot + * details are none then return false i.e, updates are visible. + */ + if (conn->recovery_ckpt_snap_min == WT_TXN_NONE && conn->recovery_ckpt_snap_max == WT_TXN_NONE) + return (true); + + return ( + __wt_txn_visible_id_snapshot(id, conn->recovery_ckpt_snap_min, conn->recovery_ckpt_snap_max, + conn->recovery_ckpt_snapshot, conn->recovery_ckpt_snapshot_count)); +} + +/* + * __rts_visibility_get_ref_max_durable_timestamp -- + * Returns the ref aggregated max durable timestamp. The max durable timestamp is calculated + * between both start and stop durable timestamps except for history store, because most of the + * history store updates have stop timestamp either greater or equal to the start timestamp + * except for the updates written for the prepared updates on the data store. To abort the + * updates with no stop timestamp, we must include the newest stop timestamp also into the + * calculation of maximum durable timestamp of the history store. + */ +static wt_timestamp_t +__rts_visibility_get_ref_max_durable_timestamp(WT_SESSION_IMPL *session, WT_TIME_AGGREGATE *ta) +{ + if (WT_IS_HS(session->dhandle)) + return (WT_MAX(ta->newest_stop_durable_ts, ta->newest_stop_ts)); + return (WT_MAX(ta->newest_start_durable_ts, ta->newest_stop_durable_ts)); +} + +/* + * __wt_rts_visibility_page_needs_abort -- + * Check whether the page needs rollback, returning true if the page has modifications newer + * than the given timestamp. + */ +bool +__wt_rts_visibility_page_needs_abort( + WT_SESSION_IMPL *session, WT_REF *ref, wt_timestamp_t rollback_timestamp) +{ + WT_ADDR *addr; + WT_CELL_UNPACK_ADDR vpack; + WT_MULTI *multi; + WT_PAGE_MODIFY *mod; + wt_timestamp_t durable_ts; + uint64_t newest_txn; + uint32_t i; + char ts_string[WT_TS_INT_STRING_SIZE]; + const char *tag; + bool prepared, result; + + addr = ref->addr; + mod = ref->page == NULL ? NULL : ref->page->modify; + durable_ts = WT_TS_NONE; + newest_txn = WT_TXN_NONE; + tag = "undefined state"; + prepared = result = false; + + /* + * The rollback operation should be performed on this page when any one of the following is + * greater than the given timestamp or during recovery if the newest transaction id on the page + * is greater than or equal to recovered checkpoint snapshot min: + * 1. The reconciled replace page max durable timestamp. + * 2. The reconciled multi page max durable timestamp. + * 3. For just-instantiated deleted pages that have not otherwise been modified, the durable + * timestamp in the page delete information. This timestamp isn't reflected in the address's + * time aggregate. + * 4. The on page address max durable timestamp. + * 5. The off page address max durable timestamp. + */ + if (mod != NULL && mod->rec_result == WT_PM_REC_REPLACE) { + tag = "reconciled replace block"; + durable_ts = __rts_visibility_get_ref_max_durable_timestamp(session, &mod->mod_replace.ta); + prepared = mod->mod_replace.ta.prepare; + result = (durable_ts > rollback_timestamp) || prepared; + } else if (mod != NULL && mod->rec_result == WT_PM_REC_MULTIBLOCK) { + tag = "reconciled multi block"; + /* Calculate the max durable timestamp by traversing all multi addresses. */ + for (multi = mod->mod_multi, i = 0; i < mod->mod_multi_entries; ++multi, ++i) { + durable_ts = WT_MAX( + durable_ts, __rts_visibility_get_ref_max_durable_timestamp(session, &multi->addr.ta)); + if (multi->addr.ta.prepare) + prepared = true; + } + result = (durable_ts > rollback_timestamp) || prepared; + } else if (mod != NULL && mod->instantiated && !__wt_page_is_modified(ref->page) && + ref->page_del != NULL) { + tag = "page_del info"; + durable_ts = ref->page_del->durable_timestamp; + prepared = ref->page_del->prepare_state == WT_PREPARE_INPROGRESS || + ref->page_del->prepare_state == WT_PREPARE_LOCKED; + newest_txn = ref->page_del->txnid; + result = (durable_ts > rollback_timestamp) || prepared || + WT_CHECK_RECOVERY_FLAG_TXNID(session, newest_txn); + } else if (!__wt_off_page(ref->home, addr)) { + tag = "on page cell"; + /* Check if the page is obsolete using the page disk address. */ + __wt_cell_unpack_addr(session, ref->home->dsk, (WT_CELL *)addr, &vpack); + durable_ts = __rts_visibility_get_ref_max_durable_timestamp(session, &vpack.ta); + prepared = vpack.ta.prepare; + newest_txn = vpack.ta.newest_txn; + result = (durable_ts > rollback_timestamp) || prepared || + WT_CHECK_RECOVERY_FLAG_TXNID(session, newest_txn); + } else if (addr != NULL) { + tag = "address"; + durable_ts = __rts_visibility_get_ref_max_durable_timestamp(session, &addr->ta); + prepared = addr->ta.prepare; + newest_txn = addr->ta.newest_txn; + result = (durable_ts > rollback_timestamp) || prepared || + WT_CHECK_RECOVERY_FLAG_TXNID(session, newest_txn); + } + + __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session), + "%p: page with %s durable timestamp: %s, newest txn: %" PRIu64 + " and prepared updates: %s needs abort: %s", + (void *)ref, tag, __wt_timestamp_to_string(durable_ts, ts_string), newest_txn, + prepared ? "true" : "false", result ? "true" : "false"); + + return (result); +} diff --git a/src/third_party/wiredtiger/src/session/session_api.c b/src/third_party/wiredtiger/src/session/session_api.c index 16e475bff58..574101c266f 100644 --- a/src/third_party/wiredtiger/src/session/session_api.c +++ b/src/third_party/wiredtiger/src/session/session_api.c @@ -1396,7 +1396,8 @@ __session_salvage_worker(WT_SESSION_IMPL *session, const char *uri, const char * { WT_RET(__wt_schema_worker( session, uri, __wt_salvage, NULL, cfg, WT_DHANDLE_EXCLUSIVE | WT_BTREE_SALVAGE)); - WT_RET(__wt_schema_worker(session, uri, NULL, __wt_rollback_to_stable_one, cfg, 0)); + WT_RET( + __wt_schema_worker(session, uri, NULL, S2C(session)->rts->rollback_to_stable_one, cfg, 0)); return (0); } diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c index 0aca6b79756..8ab3e75bf5a 100644 --- a/src/third_party/wiredtiger/src/txn/txn.c +++ b/src/third_party/wiredtiger/src/txn/txn.c @@ -2432,7 +2432,7 @@ __wt_txn_global_shutdown(WT_SESSION_IMPL *session, const char **cfg) __wt_verbose(session, WT_VERB_RTS, "performing shutdown rollback to stable with stable timestamp: %s", __wt_timestamp_to_string(conn->txn_global.stable_timestamp, ts_string)); - WT_TRET(__wt_rollback_to_stable(session, cfg, true)); + WT_TRET(conn->rts->rollback_to_stable(session, cfg, true)); } s = NULL; diff --git a/src/third_party/wiredtiger/src/txn/txn_recover.c b/src/third_party/wiredtiger/src/txn/txn_recover.c index 300798f3b31..3e95d24e600 100644 --- a/src/third_party/wiredtiger/src/txn/txn_recover.c +++ b/src/third_party/wiredtiger/src/txn/txn_recover.c @@ -1014,7 +1014,7 @@ done: __wt_timestamp_to_string(conn->txn_global.stable_timestamp, ts_string[0]), __wt_timestamp_to_string(conn->txn_global.oldest_timestamp, ts_string[1])); rts_executed = true; - WT_ERR(__wt_rollback_to_stable(session, NULL, true)); + WT_ERR(conn->rts->rollback_to_stable(session, NULL, true)); } /* diff --git a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c deleted file mode 100644 index 02a05cbd0bb..00000000000 --- a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c +++ /dev/null @@ -1,2056 +0,0 @@ -/*- - * Copyright (c) 2014-present MongoDB, Inc. - * Copyright (c) 2008-2014 WiredTiger, Inc. - * All rights reserved. - * - * See the file LICENSE for redistribution information. - */ - -#include "wt_internal.h" - -#define WT_CHECK_RECOVERY_FLAG_TXNID(session, txnid) \ - (F_ISSET(S2C(session), WT_CONN_RECOVERING) && S2C(session)->recovery_ckpt_snap_min != 0 && \ - (txnid) >= S2C(session)->recovery_ckpt_snap_min) - -/* Enable rollback to stable verbose messaging during recovery. */ -#define WT_VERB_RECOVERY_RTS(session) \ - (F_ISSET(S2C(session), WT_CONN_RECOVERING) ? \ - WT_DECL_VERBOSE_MULTI_CATEGORY(((WT_VERBOSE_CATEGORY[]){WT_VERB_RECOVERY, WT_VERB_RTS})) : \ - WT_DECL_VERBOSE_MULTI_CATEGORY(((WT_VERBOSE_CATEGORY[]){WT_VERB_RTS}))) - -static bool __rollback_txn_visible_id(WT_SESSION_IMPL *session, uint64_t id); - -/* - * __rollback_delete_hs -- - * Delete the updates for a key in the history store until the first update (including) that is - * larger than or equal to the specified timestamp. - */ -static int -__rollback_delete_hs(WT_SESSION_IMPL *session, WT_ITEM *key, wt_timestamp_t ts) -{ - WT_CURSOR *hs_cursor; - WT_DECL_ITEM(hs_key); - WT_DECL_RET; - WT_TIME_WINDOW *hs_tw; - - /* Open a history store table cursor. */ - WT_RET(__wt_curhs_open(session, NULL, &hs_cursor)); - /* - * Rollback-to-stable operates exclusively (i.e., it is the only active operation in the system) - * outside the constraints of transactions. Therefore, there is no need for snapshot based - * visibility checks. - */ - F_SET(hs_cursor, WT_CURSTD_HS_READ_COMMITTED); - - WT_ERR(__wt_scr_alloc(session, 0, &hs_key)); - - /* - * Scan the history store for the given btree and key with maximum start timestamp to let the - * search point to the last version of the key and start traversing backwards to delete all the - * records until the first update with the start timestamp larger than or equal to the specified - * timestamp. - */ - hs_cursor->set_key(hs_cursor, 4, S2BT(session)->id, key, WT_TS_MAX, UINT64_MAX); - ret = __wt_curhs_search_near_before(session, hs_cursor); - for (; ret == 0; ret = hs_cursor->prev(hs_cursor)) { - /* Retrieve the time window from the history cursor. */ - __wt_hs_upd_time_window(hs_cursor, &hs_tw); - - /* - * Remove all history store versions with a stop timestamp greater than the start/stop - * timestamp of a stable update in the data store. - */ - if (hs_tw->stop_ts <= ts) - break; - - WT_ERR(hs_cursor->remove(hs_cursor)); - WT_STAT_CONN_DATA_INCR(session, txn_rts_hs_removed); - - /* - * The globally visible start time window's are cleared during history store reconciliation. - * Treat them also as a stable entry removal from the history store. - */ - if (hs_tw->start_ts == ts || hs_tw->start_ts == WT_TS_NONE) - WT_STAT_CONN_DATA_INCR(session, cache_hs_key_truncate_rts); - else - WT_STAT_CONN_DATA_INCR(session, cache_hs_key_truncate_rts_unstable); - } - WT_ERR_NOTFOUND_OK(ret, false); - -err: - __wt_scr_free(session, &hs_key); - WT_TRET(hs_cursor->close(hs_cursor)); - return (ret); -} - -/* - * __rollback_abort_update -- - * Abort updates in an update change with timestamps newer than the rollback timestamp. Also, - * clear the history store flag for the first stable update in the update. - */ -static int -__rollback_abort_update(WT_SESSION_IMPL *session, WT_ITEM *key, WT_UPDATE *first_upd, - wt_timestamp_t rollback_timestamp, bool *stable_update_found) -{ - WT_UPDATE *stable_upd, *tombstone, *upd; - char ts_string[2][WT_TS_INT_STRING_SIZE]; - bool txn_id_visible; - - stable_upd = tombstone = NULL; - txn_id_visible = false; - if (stable_update_found != NULL) - *stable_update_found = false; - for (upd = first_upd; upd != NULL; upd = upd->next) { - /* Skip the updates that are aborted. */ - if (upd->txnid == WT_TXN_ABORTED) - continue; - - /* - * An unstable update needs to be aborted if any of the following are true: - * 1. An update is invisible based on the checkpoint snapshot during recovery. - * 2. The update durable timestamp is greater than the stable timestamp. - * 3. The update is a prepared update. - * - * Usually during recovery, there are no in memory updates present on the page. But - * whenever an unstable fast truncate operation is written to the disk, as part - * of the rollback to stable page read, it instantiates the tombstones on the page. - * The transaction id validation is ignored in all scenarios except recovery. - */ - txn_id_visible = __rollback_txn_visible_id(session, upd->txnid); - if (!txn_id_visible || rollback_timestamp < upd->durable_ts || - upd->prepare_state == WT_PREPARE_INPROGRESS) { - __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session), - "rollback to stable update aborted with txnid: %" PRIu64 - ", txnid not visible: %s, or stable timestamp (%s) < durable timestamp (%s): %s, or " - "prepare state (%d) is in progress: %s", - upd->txnid, !txn_id_visible ? "true" : "false", - __wt_timestamp_to_string(rollback_timestamp, ts_string[1]), - __wt_timestamp_to_string(upd->durable_ts, ts_string[0]), - rollback_timestamp < upd->durable_ts ? "true" : "false", upd->prepare_state, - upd->prepare_state == WT_PREPARE_INPROGRESS ? "true" : "false"); - - upd->txnid = WT_TXN_ABORTED; - WT_STAT_CONN_INCR(session, txn_rts_upd_aborted); - } else { - /* Valid update is found. */ - stable_upd = upd; - break; - } - } - - /* - * Clear the history store flags for the stable update to indicate that this update should be - * written to the history store later. The next time when this update is moved into the history - * store, it will have a different stop time point. - */ - if (stable_upd != NULL) { - if (F_ISSET(stable_upd, WT_UPDATE_HS | WT_UPDATE_TO_DELETE_FROM_HS)) { - /* Find the update following a stable tombstone. */ - if (stable_upd->type == WT_UPDATE_TOMBSTONE) { - tombstone = stable_upd; - for (stable_upd = stable_upd->next; stable_upd != NULL; - stable_upd = stable_upd->next) { - if (stable_upd->txnid != WT_TXN_ABORTED) { - WT_ASSERT(session, - stable_upd->type != WT_UPDATE_TOMBSTONE && - F_ISSET(stable_upd, WT_UPDATE_HS | WT_UPDATE_TO_DELETE_FROM_HS)); - break; - } - } - } - - /* - * Delete the first stable update and any newer update from the history store. If the - * update following the stable tombstone is removed by obsolete check, no need to remove - * that update from the history store as it has a globally visible tombstone. In that - * case, it is enough to delete everything up until to the tombstone timestamp. - */ - WT_RET(__rollback_delete_hs( - session, key, stable_upd == NULL ? tombstone->start_ts : stable_upd->start_ts)); - - /* - * Clear the history store flags for the first stable update. Otherwise, it will not be - * moved to history store again. - */ - if (stable_upd != NULL) - F_CLR(stable_upd, WT_UPDATE_HS | WT_UPDATE_TO_DELETE_FROM_HS); - if (tombstone != NULL) - F_CLR(tombstone, WT_UPDATE_HS | WT_UPDATE_TO_DELETE_FROM_HS); - } - if (stable_update_found != NULL) - *stable_update_found = true; - } - - return (0); -} - -/* - * __rollback_abort_insert_list -- - * Apply the update abort check to each entry in an insert skip list. Return how many entries - * had stable updates. - */ -static int -__rollback_abort_insert_list(WT_SESSION_IMPL *session, WT_PAGE *page, WT_INSERT_HEAD *head, - wt_timestamp_t rollback_timestamp, uint32_t *stable_updates_count) -{ - WT_DECL_ITEM(key); - WT_DECL_RET; - WT_INSERT *ins; - uint64_t recno; - uint8_t *memp; - bool stable_update_found; - - WT_ERR( - __wt_scr_alloc(session, page->type == WT_PAGE_ROW_LEAF ? 0 : WT_INTPACK64_MAXSIZE, &key)); - - WT_SKIP_FOREACH (ins, head) - if (ins->upd != NULL) { - if (page->type == WT_PAGE_ROW_LEAF) { - key->data = WT_INSERT_KEY(ins); - key->size = WT_INSERT_KEY_SIZE(ins); - } else { - recno = WT_INSERT_RECNO(ins); - memp = key->mem; - WT_ERR(__wt_vpack_uint(&memp, 0, recno)); - key->size = WT_PTRDIFF(memp, key->data); - } - WT_ERR(__rollback_abort_update( - session, key, ins->upd, rollback_timestamp, &stable_update_found)); - if (stable_update_found && stable_updates_count != NULL) - (*stable_updates_count)++; - if (!stable_update_found && page->type == WT_PAGE_ROW_LEAF && - !F_ISSET(S2C(session), WT_CONN_IN_MEMORY)) - /* - * When a new key is added to a page and the page is then checkpointed, updates for - * that key can be present in the History Store while the key isn't present in the - * disk image. RTS will then only remove these updates when there is a stable update - * on-chain. These updates still need removing when no stable updates are on-chain, - * so do so here explicitly. Pass in rollback_timestamp + 1 as __rollback_delete_hs - * removes updates inclusive of the provided timestamp, but we only want to remove - * unstable updates. - * - * FIXME-WT-10017: WT-9846 is an interim fix only for row-store while we investigate - * the impacts of a long term correction in WT-10017. Once completed this change can - * be reverted. - */ - WT_ERR(__rollback_delete_hs(session, key, rollback_timestamp + 1)); - } - -err: - __wt_scr_free(session, &key); - return (ret); -} - -/* - * __rollback_has_stable_update -- - * Check if an update chain has a stable update on it. Assume the update chain has already been - * processed so all we need to do is look for a valid, non-aborted entry. - */ -static bool -__rollback_has_stable_update(WT_UPDATE *upd) -{ - while (upd != NULL && (upd->type == WT_UPDATE_INVALID || upd->txnid == WT_TXN_ABORTED)) - upd = upd->next; - return (upd != NULL); -} - -/* - * __rollback_col_modify -- - * Add the provided update to the head of the update list. - */ -static inline int -__rollback_col_modify(WT_SESSION_IMPL *session, WT_REF *ref, WT_UPDATE *upd, uint64_t recno) -{ - WT_CURSOR_BTREE cbt; - WT_DECL_RET; - - __wt_btcur_init(session, &cbt); - __wt_btcur_open(&cbt); - - /* Search the page. */ - WT_ERR(__wt_col_search(&cbt, recno, ref, true, NULL)); - - /* Apply the modification. */ -#ifdef HAVE_DIAGNOSTIC - WT_ERR(__wt_col_modify(&cbt, recno, NULL, upd, WT_UPDATE_INVALID, true, false)); -#else - WT_ERR(__wt_col_modify(&cbt, recno, NULL, upd, WT_UPDATE_INVALID, true)); -#endif - -err: - /* Free any resources that may have been cached in the cursor. */ - WT_TRET(__wt_btcur_close(&cbt, true)); - - return (ret); -} - -/* - * __rollback_row_modify -- - * Add the provided update to the head of the update list. - */ -static inline int -__rollback_row_modify(WT_SESSION_IMPL *session, WT_REF *ref, WT_UPDATE *upd, WT_ITEM *key) -{ - WT_CURSOR_BTREE cbt; - WT_DECL_RET; - - __wt_btcur_init(session, &cbt); - __wt_btcur_open(&cbt); - - /* Search the page. */ - WT_ERR(__wt_row_search(&cbt, key, true, ref, true, NULL)); - - /* Apply the modification. */ -#ifdef HAVE_DIAGNOSTIC - WT_ERR(__wt_row_modify(&cbt, key, NULL, upd, WT_UPDATE_INVALID, true, false)); -#else - WT_ERR(__wt_row_modify(&cbt, key, NULL, upd, WT_UPDATE_INVALID, true)); -#endif - -err: - /* Free any resources that may have been cached in the cursor. */ - WT_TRET(__wt_btcur_close(&cbt, true)); - - return (ret); -} - -/* - * __rollback_txn_visible_id -- - * Check if the transaction id is visible or not. - */ -static bool -__rollback_txn_visible_id(WT_SESSION_IMPL *session, uint64_t id) -{ - WT_CONNECTION_IMPL *conn; - - conn = S2C(session); - - /* If not recovery then assume all the data as visible. */ - if (!F_ISSET(conn, WT_CONN_RECOVERING)) - return (true); - - /* - * Only full checkpoint writes the metadata with snapshot. If the recovered checkpoint snapshot - * details are none then return false i.e, updates are visible. - */ - if (conn->recovery_ckpt_snap_min == WT_TXN_NONE && conn->recovery_ckpt_snap_max == WT_TXN_NONE) - return (true); - - return ( - __wt_txn_visible_id_snapshot(id, conn->recovery_ckpt_snap_min, conn->recovery_ckpt_snap_max, - conn->recovery_ckpt_snapshot, conn->recovery_ckpt_snapshot_count)); -} - -/* - * __rollback_ondisk_fixup_key -- - * Abort updates in the history store and replace the on-disk value with an update that - * satisfies the given timestamp. - */ -static int -__rollback_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_REF *ref, WT_ROW *rip, uint64_t recno, - WT_ITEM *row_key, WT_CELL_UNPACK_KV *unpack, wt_timestamp_t rollback_timestamp) -{ - WT_CURSOR *hs_cursor; - WT_DECL_ITEM(full_value); - WT_DECL_ITEM(hs_key); - WT_DECL_ITEM(hs_value); - WT_DECL_ITEM(key); - WT_DECL_ITEM(key_string); - WT_DECL_RET; - WT_PAGE *page; - WT_TIME_WINDOW *hs_tw; - WT_UPDATE *tombstone, *upd; - wt_timestamp_t hs_durable_ts, hs_start_ts, hs_stop_durable_ts, newer_hs_durable_ts, pinned_ts; - uint64_t hs_counter, type_full; - uint32_t hs_btree_id; - uint8_t *memp; - uint8_t type; - char ts_string[4][WT_TS_INT_STRING_SIZE]; - char tw_string[WT_TIME_STRING_SIZE]; - bool valid_update_found; -#ifdef HAVE_DIAGNOSTIC - bool first_record; -#endif - - page = ref->page; - - hs_cursor = NULL; - tombstone = upd = NULL; - hs_durable_ts = hs_start_ts = hs_stop_durable_ts = WT_TS_NONE; - hs_btree_id = S2BT(session)->id; - valid_update_found = false; -#ifdef HAVE_DIAGNOSTIC - first_record = true; -#endif - - /* Allocate buffers for the data store and history store key. */ - WT_ERR(__wt_scr_alloc(session, 0, &hs_key)); - WT_ERR(__wt_scr_alloc(session, 0, &hs_value)); - - if (rip != NULL) { - if (row_key != NULL) - key = row_key; - else { - /* Unpack a row key. */ - WT_ERR(__wt_scr_alloc(session, 0, &key)); - WT_ERR(__wt_row_leaf_key(session, page, rip, key, false)); - } - } else { - /* Manufacture a column key. */ - WT_ERR(__wt_scr_alloc(session, WT_INTPACK64_MAXSIZE, &key)); - memp = key->mem; - WT_ERR(__wt_vpack_uint(&memp, 0, recno)); - key->size = WT_PTRDIFF(memp, key->data); - } - - WT_ERR(__wt_scr_alloc(session, 0, &key_string)); - __wt_verbose_level_multi(session, WT_VERB_RECOVERY_RTS(session), WT_VERBOSE_DEBUG_2, - "rolling back the on-disk key: %s", - __wt_key_string(session, key->data, key->size, S2BT(session)->key_format, key_string)); - - WT_ERR(__wt_scr_alloc(session, 0, &full_value)); - WT_ERR(__wt_page_cell_data_ref_kv(session, page, unpack, full_value)); - /* - * We can read overflow removed value if checkpoint has run before rollback to stable. In this - * case, we have already appended the on page value to the update chain. At this point, we have - * visited the update chain and decided the value is not stable. In addition, checkpoint must - * have moved this value to the history store as a full value. Therefore, we can safely ignore - * the on page value if it is overflow removed. - */ - if (__wt_cell_type_raw(unpack->cell) == WT_CELL_VALUE_OVFL_RM) - ret = 0; - else - WT_ERR(__wt_buf_set(session, full_value, full_value->data, full_value->size)); - - newer_hs_durable_ts = unpack->tw.durable_start_ts; - - __wt_txn_pinned_timestamp(session, &pinned_ts); - - /* Open a history store table cursor. */ - WT_ERR(__wt_curhs_open(session, NULL, &hs_cursor)); - /* - * Rollback-to-stable operates exclusively (i.e., it is the only active operation in the system) - * outside the constraints of transactions. Therefore, there is no need for snapshot based - * visibility checks. - */ - F_SET(hs_cursor, WT_CURSTD_HS_READ_ALL); - - /* - * Scan the history store for the given btree and key with maximum start timestamp to let the - * search point to the last version of the key and start traversing backwards to find out the - * satisfying record according the given timestamp. Any satisfying history store record is moved - * into data store and removed from history store. If none of the history store records satisfy - * the given timestamp, the key is removed from data store. - */ - hs_cursor->set_key(hs_cursor, 4, hs_btree_id, key, WT_TS_MAX, UINT64_MAX); - ret = __wt_curhs_search_near_before(session, hs_cursor); - for (; ret == 0; ret = hs_cursor->prev(hs_cursor)) { - WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, hs_key, &hs_start_ts, &hs_counter)); - - /* Get current value and convert to full update if it is a modify. */ - WT_ERR(hs_cursor->get_value( - hs_cursor, &hs_stop_durable_ts, &hs_durable_ts, &type_full, hs_value)); - type = (uint8_t)type_full; - - /* Retrieve the time window from the history cursor. */ - __wt_hs_upd_time_window(hs_cursor, &hs_tw); - - /* - * We have a tombstone on the history update and it is obsolete according to the timestamp - * and txnid, so no need to restore it. These obsolete updates are written to the disk when - * they are not obsolete at the time of reconciliation by an eviction thread and later they - * become obsolete according to the checkpoint. - */ - if (__rollback_txn_visible_id(session, hs_tw->stop_txn) && - hs_tw->durable_stop_ts <= pinned_ts) { - __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session), - "history store stop is obsolete with time window: %s and pinned timestamp: %s", - __wt_time_window_to_string(hs_tw, tw_string), - __wt_timestamp_to_string(pinned_ts, ts_string[0])); - WT_ERR(hs_cursor->remove(hs_cursor)); - WT_STAT_CONN_DATA_INCR(session, txn_rts_hs_removed); - continue; - } - - /* - * Do not include history store updates greater than on-disk data store version to construct - * a full update to restore except when the on-disk update is prepared. Including more - * recent updates than the on-disk version shouldn't be problem as the on-disk version in - * history store is always a full update. It is better to not to include those updates as it - * unnecessarily increases the rollback to stable time. - * - * Comparing with timestamps here has no problem unlike in search flow where the timestamps - * may be reset during reconciliation. RTS detects an on-disk update is unstable based on - * the written proper timestamp, so comparing against it with history store shouldn't have - * any problem. - */ - if (hs_tw->start_ts <= unpack->tw.start_ts || unpack->tw.prepare) { - if (type == WT_UPDATE_MODIFY) - WT_ERR(__wt_modify_apply_item( - session, S2BT(session)->value_format, full_value, hs_value->data)); - else { - WT_ASSERT(session, type == WT_UPDATE_STANDARD); - WT_ERR(__wt_buf_set(session, full_value, hs_value->data, hs_value->size)); - } - } else - __wt_verbose_level_multi(session, WT_VERB_RECOVERY_RTS(session), WT_VERBOSE_DEBUG_2, - "history store update more recent than on-disk update with time window: %s and type: " - "%" PRIu8, - __wt_time_window_to_string(hs_tw, tw_string), type); - - /* - * Verify the history store timestamps are in order. The start timestamp may be equal to the - * stop timestamp if the original update's commit timestamp is in order. We may see records - * newer than or equal to the onpage value if eviction runs concurrently with checkpoint. In - * that case, don't verify the first record. - * - * It is possible during a prepared transaction rollback, the history store update that have - * its own stop timestamp doesn't get removed leads to duplicate records in history store - * after further operations on that same key. Rollback to stable should ignore such records - * for timestamp ordering verification. - * - * If we have fixed the missing timestamps, then the newer update reinserted with an older - * timestamp may have a durable timestamp that is smaller than the current stop durable - * timestamp. - * - * It is possible that there can be an update in the history store with a max stop timestamp - * in the middle of the same key updates. This occurs when the checkpoint writes the - * committed prepared update and further updates on that key including the history store - * changes before the transaction fixes the history store update to have a proper stop - * timestamp. It is a rare scenario. - */ - WT_ASSERT(session, - hs_stop_durable_ts <= newer_hs_durable_ts || hs_start_ts == hs_stop_durable_ts || - hs_start_ts == newer_hs_durable_ts || newer_hs_durable_ts == hs_durable_ts || - first_record || hs_stop_durable_ts == WT_TS_MAX); - - if (hs_stop_durable_ts < newer_hs_durable_ts) - WT_STAT_CONN_DATA_INCR(session, txn_rts_hs_stop_older_than_newer_start); - - /* - * Validate the timestamps in the key and the cell are same. This must be validated only - * after verifying it's stop time window is not globally visible. The start timestamps of - * the time window are cleared when they are globally visible and there will be no stop - * timestamp in the history store whenever a prepared update is written to the data store. - */ - WT_ASSERT(session, - (hs_tw->start_ts == WT_TS_NONE || hs_tw->start_ts == hs_start_ts) && - (hs_tw->durable_start_ts == WT_TS_NONE || hs_tw->durable_start_ts == hs_durable_ts) && - ((hs_tw->durable_stop_ts == 0 && hs_stop_durable_ts == WT_TS_MAX) || - hs_tw->durable_stop_ts == hs_stop_durable_ts)); - - /* - * Stop processing when we find a stable update according to the given timestamp and - * transaction id. - */ - if (__rollback_txn_visible_id(session, hs_tw->start_txn) && - hs_tw->durable_start_ts <= rollback_timestamp) { - __wt_verbose_level_multi(session, WT_VERB_RECOVERY_RTS(session), WT_VERBOSE_DEBUG_2, - "history store update valid with time window: %s, type: %" PRIu8 - " and stable timestamp: %s", - __wt_time_window_to_string(hs_tw, tw_string), type, - __wt_timestamp_to_string(rollback_timestamp, ts_string[0])); - WT_ASSERT(session, unpack->tw.prepare || hs_tw->start_ts <= unpack->tw.start_ts); - valid_update_found = true; - break; - } - - __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session), - "history store update aborted with time window: %s, type: %" PRIu8 - " and stable timestamp: %s", - __wt_time_window_to_string(hs_tw, tw_string), type, - __wt_timestamp_to_string(rollback_timestamp, ts_string[3])); - - /* - * Start time point of the current record may be used as stop time point of the previous - * record. Save it to verify against the previous record and check if we need to append the - * stop time point as a tombstone when we rollback the history store record. - */ - newer_hs_durable_ts = hs_durable_ts; -#ifdef HAVE_DIAGNOSTIC - first_record = false; -#endif - - WT_ERR(hs_cursor->remove(hs_cursor)); - WT_STAT_CONN_DATA_INCR(session, txn_rts_hs_removed); - WT_STAT_CONN_DATA_INCR(session, cache_hs_key_truncate_rts_unstable); - } - - /* - * If we found a history value that satisfied the given timestamp, add it to the update list. - * Otherwise remove the key by adding a tombstone. - */ - if (valid_update_found) { - /* Retrieve the time window from the history cursor. */ - __wt_hs_upd_time_window(hs_cursor, &hs_tw); - WT_ASSERT(session, - hs_tw->start_ts < unpack->tw.start_ts || hs_tw->start_txn < unpack->tw.start_txn); - WT_ERR(__wt_upd_alloc(session, full_value, WT_UPDATE_STANDARD, &upd, NULL)); - - /* - * Set the transaction id of updates to WT_TXN_NONE when called from recovery, because the - * connections write generation will be initialized after rollback to stable and the updates - * in the cache will be problematic. The transaction id of pages which are in disk will be - * automatically reset as part of unpacking cell when loaded to cache. - */ - if (F_ISSET(S2C(session), WT_CONN_RECOVERING)) - upd->txnid = WT_TXN_NONE; - else - upd->txnid = hs_tw->start_txn; - upd->durable_ts = hs_tw->durable_start_ts; - upd->start_ts = hs_tw->start_ts; - __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session), - "history store update restored txnid: %" PRIu64 ", start_ts: %s and durable_ts: %s", - upd->txnid, __wt_timestamp_to_string(upd->start_ts, ts_string[0]), - __wt_timestamp_to_string(upd->durable_ts, ts_string[1])); - - /* - * Set the flag to indicate that this update has been restored from history store for the - * rollback to stable operation. - */ - F_SET(upd, WT_UPDATE_RESTORED_FROM_HS); - WT_STAT_CONN_DATA_INCR(session, txn_rts_hs_restore_updates); - - /* - * We have a tombstone on the original update chain and it is stable according to the - * timestamp and txnid, we need to restore that as well. - */ - if (__rollback_txn_visible_id(session, hs_tw->stop_txn) && - hs_tw->durable_stop_ts <= rollback_timestamp) { - /* - * The restoring tombstone timestamp must be zero or less than previous update start - * timestamp. - */ - WT_ASSERT(session, - hs_stop_durable_ts == WT_TS_NONE || hs_stop_durable_ts < newer_hs_durable_ts || - unpack->tw.prepare); - - WT_ERR(__wt_upd_alloc_tombstone(session, &tombstone, NULL)); - /* - * Set the transaction id of updates to WT_TXN_NONE when called from recovery, because - * the connections write generation will be initialized after rollback to stable and the - * updates in the cache will be problematic. The transaction id of pages which are in - * disk will be automatically reset as part of unpacking cell when loaded to cache. - */ - if (F_ISSET(S2C(session), WT_CONN_RECOVERING)) - tombstone->txnid = WT_TXN_NONE; - else - tombstone->txnid = hs_tw->stop_txn; - tombstone->durable_ts = hs_tw->durable_stop_ts; - tombstone->start_ts = hs_tw->stop_ts; - __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session), - "history store tombstone restored txnid: %" PRIu64 - ", start_ts: %s and durable_ts: %s", - tombstone->txnid, __wt_timestamp_to_string(tombstone->start_ts, ts_string[0]), - __wt_timestamp_to_string(tombstone->durable_ts, ts_string[1])); - - /* - * Set the flag to indicate that this update has been restored from history store for - * the rollback to stable operation. - */ - F_SET(tombstone, WT_UPDATE_RESTORED_FROM_HS); - - tombstone->next = upd; - upd = tombstone; - WT_STAT_CONN_DATA_INCR(session, txn_rts_hs_restore_tombstones); - } - } else { - WT_ERR(__wt_upd_alloc_tombstone(session, &upd, NULL)); - WT_STAT_CONN_DATA_INCR(session, txn_rts_keys_removed); - __wt_verbose_level_multi( - session, WT_VERB_RECOVERY_RTS(session), WT_VERBOSE_DEBUG_3, "%s", "key removed"); - } - - if (rip != NULL) - WT_ERR(__rollback_row_modify(session, ref, upd, key)); - else - WT_ERR(__rollback_col_modify(session, ref, upd, recno)); - - /* Finally remove that update from history store. */ - if (valid_update_found) { - /* Avoid freeing the updates while still in use if hs_cursor->remove fails. */ - upd = tombstone = NULL; - - WT_ERR(hs_cursor->remove(hs_cursor)); - WT_STAT_CONN_DATA_INCR(session, txn_rts_hs_removed); - WT_STAT_CONN_DATA_INCR(session, cache_hs_key_truncate_rts); - } - - if (0) { -err: - WT_ASSERT(session, tombstone == NULL || upd == tombstone); - __wt_free_update_list(session, &upd); - } - __wt_scr_free(session, &full_value); - __wt_scr_free(session, &hs_key); - __wt_scr_free(session, &hs_value); - if (rip == NULL || row_key == NULL) - __wt_scr_free(session, &key); - __wt_scr_free(session, &key_string); - if (hs_cursor != NULL) - WT_TRET(hs_cursor->close(hs_cursor)); - return (ret); -} - -/* - * __rollback_abort_ondisk_kv -- - * Fix the on-disk K/V version according to the given timestamp. - */ -static int -__rollback_abort_ondisk_kv(WT_SESSION_IMPL *session, WT_REF *ref, WT_ROW *rip, uint64_t recno, - WT_ITEM *row_key, WT_CELL_UNPACK_KV *vpack, wt_timestamp_t rollback_timestamp, - bool *is_ondisk_stable) -{ - WT_DECL_ITEM(key); - WT_DECL_ITEM(key_string); - WT_DECL_ITEM(tmp); - WT_DECL_RET; - WT_PAGE *page; - WT_UPDATE *upd; - uint8_t *memp; - char time_string[WT_TIME_STRING_SIZE]; - char ts_string[5][WT_TS_INT_STRING_SIZE]; - bool prepared; - - page = ref->page; - upd = NULL; - - /* Initialize the on-disk stable version flag. */ - if (is_ondisk_stable != NULL) - *is_ondisk_stable = false; - - prepared = vpack->tw.prepare; - if (WT_IS_HS(session->dhandle)) { - /* - * Abort the history store update with stop durable timestamp greater than the stable - * timestamp or the updates with max stop timestamp which implies that they are associated - * with prepared transactions. - */ - if (vpack->tw.durable_stop_ts > rollback_timestamp || vpack->tw.stop_ts == WT_TS_MAX) { - __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session), - "history store update aborted with start durable/commit timestamp: %s, %s, stop " - "durable/commit timestamp: %s, %s and stable timestamp: %s", - __wt_timestamp_to_string(vpack->tw.durable_start_ts, ts_string[0]), - __wt_timestamp_to_string(vpack->tw.start_ts, ts_string[1]), - __wt_timestamp_to_string(vpack->tw.durable_stop_ts, ts_string[2]), - __wt_timestamp_to_string(vpack->tw.stop_ts, ts_string[3]), - __wt_timestamp_to_string(rollback_timestamp, ts_string[4])); - WT_RET(__wt_upd_alloc_tombstone(session, &upd, NULL)); - WT_STAT_CONN_DATA_INCR(session, txn_rts_sweep_hs_keys); - } else - return (0); - } else if (vpack->tw.durable_start_ts > rollback_timestamp || - !__rollback_txn_visible_id(session, vpack->tw.start_txn) || - (!WT_TIME_WINDOW_HAS_STOP(&vpack->tw) && prepared)) { - __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session), - "on-disk update aborted with time window %s. Start durable timestamp > stable timestamp: " - "%s, or txnid is not visible: %s, or tw has not stop and is prepared: %s", - __wt_time_point_to_string( - vpack->tw.start_ts, vpack->tw.durable_start_ts, vpack->tw.start_txn, time_string), - vpack->tw.durable_start_ts > rollback_timestamp ? "true" : "false", - !__rollback_txn_visible_id(session, vpack->tw.start_txn) ? "true" : "false", - !WT_TIME_WINDOW_HAS_STOP(&vpack->tw) && prepared ? "true" : "false"); - if (!F_ISSET(S2C(session), WT_CONN_IN_MEMORY)) - return (__rollback_ondisk_fixup_key( - session, ref, rip, recno, row_key, vpack, rollback_timestamp)); - else { - /* - * In-memory database don't have a history store to provide a stable update, so remove - * the key. Note that an in-memory database will have saved old values in the update - * chain, so we should only get here for a key/value that never existed at all as of the - * rollback timestamp; thus, deleting it is the correct response. - */ - WT_RET(__wt_upd_alloc_tombstone(session, &upd, NULL)); - WT_STAT_CONN_DATA_INCR(session, txn_rts_keys_removed); - } - } else if (WT_TIME_WINDOW_HAS_STOP(&vpack->tw) && - (vpack->tw.durable_stop_ts > rollback_timestamp || - !__rollback_txn_visible_id(session, vpack->tw.stop_txn) || prepared)) { - /* - * For prepared transactions, it is possible that both the on-disk key start and stop time - * windows can be the same. To abort these updates, check for any stable update from history - * store or remove the key. - */ - if (vpack->tw.start_ts == vpack->tw.stop_ts && - vpack->tw.durable_start_ts == vpack->tw.durable_stop_ts && - vpack->tw.start_txn == vpack->tw.stop_txn) { - WT_ASSERT(session, prepared == true); - if (!F_ISSET(S2C(session), WT_CONN_IN_MEMORY)) - return (__rollback_ondisk_fixup_key( - session, ref, rip, recno, row_key, vpack, rollback_timestamp)); - else { - /* - * In-memory database don't have a history store to provide a stable update, so - * remove the key. - */ - WT_RET(__wt_upd_alloc_tombstone(session, &upd, NULL)); - WT_STAT_CONN_DATA_INCR(session, txn_rts_keys_removed); - } - } else { - /* - * Clear the remove operation from the key by inserting the original on-disk value as a - * standard update. - */ - WT_RET(__wt_scr_alloc(session, 0, &tmp)); - if ((ret = __wt_page_cell_data_ref_kv(session, page, vpack, tmp)) == 0) - ret = __wt_upd_alloc(session, tmp, WT_UPDATE_STANDARD, &upd, NULL); - __wt_scr_free(session, &tmp); - WT_RET(ret); - - /* - * Set the transaction id of updates to WT_TXN_NONE when called from recovery, because - * the connections write generation will be initialized after rollback to stable and the - * updates in the cache will be problematic. The transaction id of pages which are in - * disk will be automatically reset as part of unpacking cell when loaded to cache. - */ - if (F_ISSET(S2C(session), WT_CONN_RECOVERING)) - upd->txnid = WT_TXN_NONE; - else - upd->txnid = vpack->tw.start_txn; - upd->durable_ts = vpack->tw.durable_start_ts; - upd->start_ts = vpack->tw.start_ts; - F_SET(upd, WT_UPDATE_RESTORED_FROM_DS); - WT_STAT_CONN_DATA_INCR(session, txn_rts_keys_restored); - __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session), - "key restored with commit timestamp: %s, durable timestamp: %s, stable timestamp: " - "%s, " - "txnid: %" PRIu64 - " and removed commit timestamp: %s, durable timestamp: %s, txnid: %" PRIu64 - ", prepared: %s", - __wt_timestamp_to_string(upd->start_ts, ts_string[0]), - __wt_timestamp_to_string(upd->durable_ts, ts_string[1]), - __wt_timestamp_to_string(rollback_timestamp, ts_string[2]), upd->txnid, - __wt_timestamp_to_string(vpack->tw.stop_ts, ts_string[3]), - __wt_timestamp_to_string(vpack->tw.durable_stop_ts, ts_string[4]), vpack->tw.stop_txn, - prepared ? "true" : "false"); - } - } else { - /* Stable version according to the timestamp. */ - if (is_ondisk_stable != NULL) - *is_ondisk_stable = true; - return (0); - } - - if (rip != NULL) { - if (row_key != NULL) - key = row_key; - else { - /* Unpack a row key. */ - WT_ERR(__wt_scr_alloc(session, 0, &key)); - WT_ERR(__wt_row_leaf_key(session, page, rip, key, false)); - } - } else { - /* Manufacture a column key. */ - WT_ERR(__wt_scr_alloc(session, WT_INTPACK64_MAXSIZE, &key)); - memp = key->mem; - WT_ERR(__wt_vpack_uint(&memp, 0, recno)); - key->size = WT_PTRDIFF(memp, key->data); - } - - WT_ERR(__wt_scr_alloc(session, 0, &key_string)); - __wt_verbose_level_multi(session, WT_VERB_RECOVERY_RTS(session), WT_VERBOSE_DEBUG_2, - "removing the key%s: %s", upd->type == WT_UPDATE_TOMBSTONE ? "" : " tombstone", - __wt_key_string(session, key->data, key->size, S2BT(session)->key_format, key_string)); - - if (rip != NULL) - WT_ERR(__rollback_row_modify(session, ref, upd, key)); - else - WT_ERR(__rollback_col_modify(session, ref, upd, recno)); - - if (0) { -err: - __wt_free(session, upd); - } - if (rip == NULL || row_key == NULL) - __wt_scr_free(session, &key); - __wt_scr_free(session, &key_string); - return (ret); -} - -/* - * __rollback_abort_col_var -- - * Abort updates on a variable length col leaf page with timestamps newer than the rollback - * timestamp. - */ -static int -__rollback_abort_col_var(WT_SESSION_IMPL *session, WT_REF *ref, wt_timestamp_t rollback_timestamp) -{ - WT_CELL *kcell; - WT_CELL_UNPACK_KV unpack; - WT_COL *cip; - WT_INSERT *ins; - WT_INSERT_HEAD *inshead; - WT_PAGE *page; - uint64_t ins_recno, recno, rle; - uint32_t i, j, stable_updates_count; - bool is_ondisk_stable; - - page = ref->page; - /* - * If a disk image exists, start from the provided recno; or else start from 0. - */ - if (page->dsk != NULL) - recno = page->dsk->recno; - else - recno = 0; - - /* Review the changes to the original on-page data items. */ - WT_COL_FOREACH (page, cip, i) { - stable_updates_count = 0; - - if ((inshead = WT_COL_UPDATE(page, cip)) != NULL) - WT_RET(__rollback_abort_insert_list( - session, page, inshead, rollback_timestamp, &stable_updates_count)); - - if (page->dsk != NULL) { - /* Unpack the cell. We need its RLE count whether or not we're going to iterate it. */ - kcell = WT_COL_PTR(page, cip); - __wt_cell_unpack_kv(session, page->dsk, kcell, &unpack); - rle = __wt_cell_rle(&unpack); - - /* - * Each key whose on-disk value is not stable and has no stable update on the update - * list must be processed downstream. - * - * If we can determine that the cell's on-disk value is stable, we can skip iterating - * over the cell; likewise, if we can determine that every key in the cell has a stable - * update on the update list, we can skip the iteration. Otherwise we have to try each - * key. - * - * If the on-disk cell is deleted, it is stable, because cells only appear as deleted - * when there is no older value that might need to be restored. - * - * Note that in a purely timestamped world, the presence of any stable update for any - * key in the cell means the on-disk value must be stable, because the update must be - * newer than the on-disk value. However, this is no longer true if the stable update - * has no timestamp. It may also not be true if the on-disk value is prepared, or other - * corner cases. Therefore, we must iterate the cell unless _every_ key has a stable - * update. - * - * We can, however, stop iterating as soon as the downstream code reports back that the - * on-disk value is actually stable. - */ - if (unpack.type == WT_CELL_DEL) - WT_STAT_CONN_DATA_INCR(session, txn_rts_delete_rle_skipped); - else if (stable_updates_count == rle) - WT_STAT_CONN_DATA_INCR(session, txn_rts_stable_rle_skipped); - else { - j = 0; - if (inshead != NULL) { - WT_SKIP_FOREACH (ins, inshead) { - /* If the update list goes past the end of the cell, something's wrong. */ - WT_ASSERT(session, j < rle); - ins_recno = WT_INSERT_RECNO(ins); - /* Process all the keys before this update. */ - while (recno + j < ins_recno) { - WT_RET(__rollback_abort_ondisk_kv(session, ref, NULL, recno + j, NULL, - &unpack, rollback_timestamp, &is_ondisk_stable)); - /* We can stop right away if the on-disk version is stable. */ - if (is_ondisk_stable) { - if (rle > 1) - WT_STAT_CONN_DATA_INCR(session, txn_rts_stable_rle_skipped); - goto stop; - } - j++; - } - /* If this key has a stable update, skip over it. */ - if (recno + j == ins_recno && __rollback_has_stable_update(ins->upd)) - j++; - } - } - /* Process the rest of the keys. */ - while (j < rle) { - WT_RET(__rollback_abort_ondisk_kv(session, ref, NULL, recno + j, NULL, &unpack, - rollback_timestamp, &is_ondisk_stable)); - /* We can stop right away if the on-disk version is stable. */ - if (is_ondisk_stable) { - if (rle > 1) - WT_STAT_CONN_DATA_INCR(session, txn_rts_stable_rle_skipped); - goto stop; - } - j++; - } - } -stop: - recno += rle; - } - } - - /* Review the append list */ - if ((inshead = WT_COL_APPEND(page)) != NULL) - WT_RET(__rollback_abort_insert_list(session, page, inshead, rollback_timestamp, NULL)); - - return (0); -} - -/* - * __rollback_abort_col_fix_one -- - * Handle one possibly unstable on-disk time window. - */ -static int -__rollback_abort_col_fix_one(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t tw, - uint32_t recno_offset, wt_timestamp_t rollback_timestamp) -{ - WT_BTREE *btree; - WT_CELL *cell; - WT_CELL_UNPACK_KV unpack; - WT_PAGE *page; - uint8_t value; - - btree = S2BT(session); - page = ref->page; - - /* Unpack the cell to get the time window. */ - cell = WT_COL_FIX_TW_CELL(page, &page->pg_fix_tws[tw]); - __wt_cell_unpack_kv(session, page->dsk, cell, &unpack); - - /* Fake up the value (which is not physically in the cell) in case it's wanted. */ - value = __bit_getv(page->pg_fix_bitf, recno_offset, btree->bitcnt); - unpack.data = &value; - unpack.size = 1; - - return (__rollback_abort_ondisk_kv(session, ref, NULL, page->dsk->recno + recno_offset, NULL, - &unpack, rollback_timestamp, NULL)); -} - -/* - * __rollback_abort_col_fix -- - * Abort updates on a fixed length col leaf page with timestamps newer than the rollback - * timestamp. - */ -static int -__rollback_abort_col_fix(WT_SESSION_IMPL *session, WT_REF *ref, wt_timestamp_t rollback_timestamp) -{ - WT_INSERT *ins; - WT_INSERT_HEAD *inshead; - WT_PAGE *page; - uint32_t ins_recno_offset, recno_offset, numtws, tw; - - page = ref->page; - - /* - * Review the changes to the original on-page data items. Note that while this can report back - * to us whether it saw a stable update, that information doesn't do us any good -- unlike in - * VLCS where the uniformity of cells lets us reason about the timestamps of all of them based - * on the timestamp of an update to any of them, in FLCS everything is just thrown together, so - * we'll need to iterate over all the keys anyway. - */ - if ((inshead = WT_COL_UPDATE_SINGLE(page)) != NULL) - WT_RET(__rollback_abort_insert_list(session, page, inshead, rollback_timestamp, NULL)); - - /* - * Iterate over all the keys, stopping only on keys that (a) have a time window on disk, and - * also (b) do not have a stable update remaining in the update list. Keys with no on-disk time - * window are stable. And we must not try to adjust the on-disk value for keys with stable - * updates, because the downstream code assumes that has already been checked and in some cases - * (e.g. in-memory databases) the wrong thing will happen. - * - * Iterate over the update list and carry along the iteration over the time window list in - * parallel, even though the code would perhaps make more sense the other way around, because - * this allows using the skiplist iterator macro instead of an open-coded mess. - */ - numtws = WT_COL_FIX_TWS_SET(page) ? page->pg_fix_numtws : 0; - WT_ASSERT(session, numtws == 0 || page->dsk != NULL); - tw = 0; - if (inshead != NULL) { - WT_SKIP_FOREACH (ins, inshead) { - /* Process all the keys before this update entry. */ - ins_recno_offset = (uint32_t)(WT_INSERT_RECNO(ins) - ref->ref_recno); - while (tw < numtws && - (recno_offset = page->pg_fix_tws[tw].recno_offset) < ins_recno_offset) { - - WT_RET( - __rollback_abort_col_fix_one(session, ref, tw, recno_offset, rollback_timestamp)); - tw++; - } - /* If this key has a stable update, skip over it. */ - if (tw < numtws && page->pg_fix_tws[tw].recno_offset == ins_recno_offset && - ins->upd != NULL && __rollback_has_stable_update(ins->upd)) - tw++; - } - } - /* Process the rest of the keys with time windows. */ - while (tw < numtws) { - recno_offset = page->pg_fix_tws[tw].recno_offset; - WT_RET(__rollback_abort_col_fix_one(session, ref, tw, recno_offset, rollback_timestamp)); - tw++; - } - - /* Review the append list. */ - if ((inshead = WT_COL_APPEND(page)) != NULL) - WT_RET(__rollback_abort_insert_list(session, page, inshead, rollback_timestamp, NULL)); - - return (0); -} - -/* - * __rollback_abort_row_leaf -- - * Abort updates on a row leaf page with timestamps newer than the rollback timestamp. - */ -static int -__rollback_abort_row_leaf(WT_SESSION_IMPL *session, WT_REF *ref, wt_timestamp_t rollback_timestamp) -{ - WT_CELL_UNPACK_KV *vpack, _vpack; - WT_DECL_ITEM(key); - WT_DECL_RET; - WT_INSERT_HEAD *insert; - WT_PAGE *page; - WT_ROW *rip; - WT_UPDATE *upd; - uint32_t i; - bool have_key, stable_update_found; - - page = ref->page; - - WT_RET(__wt_scr_alloc(session, 0, &key)); - - /* - * Review the insert list for keys before the first entry on the disk page. - */ - if ((insert = WT_ROW_INSERT_SMALLEST(page)) != NULL) - WT_ERR(__rollback_abort_insert_list(session, page, insert, rollback_timestamp, NULL)); - - /* - * Review updates that belong to keys that are on the disk image, as well as for keys inserted - * since the page was read from disk. - */ - WT_ROW_FOREACH (page, rip, i) { - stable_update_found = false; - if ((upd = WT_ROW_UPDATE(page, rip)) != NULL) { - WT_ERR(__wt_row_leaf_key(session, page, rip, key, false)); - WT_ERR( - __rollback_abort_update(session, key, upd, rollback_timestamp, &stable_update_found)); - have_key = true; - } else - have_key = false; - - if ((insert = WT_ROW_INSERT(page, rip)) != NULL) - WT_ERR(__rollback_abort_insert_list(session, page, insert, rollback_timestamp, NULL)); - - /* - * If there is no stable update found in the update list, abort any on-disk value. - */ - if (!stable_update_found) { - vpack = &_vpack; - __wt_row_leaf_value_cell(session, page, rip, vpack); - WT_ERR(__rollback_abort_ondisk_kv( - session, ref, rip, 0, have_key ? key : NULL, vpack, rollback_timestamp, NULL)); - } - } - -err: - __wt_scr_free(session, &key); - return (ret); -} - -/* - * __rollback_get_ref_max_durable_timestamp -- - * Returns the ref aggregated max durable timestamp. The max durable timestamp is calculated - * between both start and stop durable timestamps except for history store, because most of the - * history store updates have stop timestamp either greater or equal to the start timestamp - * except for the updates written for the prepared updates on the data store. To abort the - * updates with no stop timestamp, we must include the newest stop timestamp also into the - * calculation of maximum durable timestamp of the history store. - */ -static wt_timestamp_t -__rollback_get_ref_max_durable_timestamp(WT_SESSION_IMPL *session, WT_TIME_AGGREGATE *ta) -{ - if (WT_IS_HS(session->dhandle)) - return (WT_MAX(ta->newest_stop_durable_ts, ta->newest_stop_ts)); - return (WT_MAX(ta->newest_start_durable_ts, ta->newest_stop_durable_ts)); -} - -/* - * __rollback_page_needs_abort -- - * Check whether the page needs rollback, returning true if the page has modifications newer - * than the given timestamp. - */ -static bool -__rollback_page_needs_abort( - WT_SESSION_IMPL *session, WT_REF *ref, wt_timestamp_t rollback_timestamp) -{ - WT_ADDR *addr; - WT_CELL_UNPACK_ADDR vpack; - WT_MULTI *multi; - WT_PAGE_MODIFY *mod; - wt_timestamp_t durable_ts; - uint64_t newest_txn; - uint32_t i; - char ts_string[WT_TS_INT_STRING_SIZE]; - const char *tag; - bool prepared, result; - - addr = ref->addr; - mod = ref->page == NULL ? NULL : ref->page->modify; - durable_ts = WT_TS_NONE; - newest_txn = WT_TXN_NONE; - tag = "undefined state"; - prepared = result = false; - - /* - * The rollback operation should be performed on this page when any one of the following is - * greater than the given timestamp or during recovery if the newest transaction id on the page - * is greater than or equal to recovered checkpoint snapshot min: - * 1. The reconciled replace page max durable timestamp. - * 2. The reconciled multi page max durable timestamp. - * 3. For just-instantiated deleted pages that have not otherwise been modified, the durable - * timestamp in the page delete information. This timestamp isn't reflected in the address's - * time aggregate. - * 4. The on page address max durable timestamp. - * 5. The off page address max durable timestamp. - */ - if (mod != NULL && mod->rec_result == WT_PM_REC_REPLACE) { - tag = "reconciled replace block"; - durable_ts = __rollback_get_ref_max_durable_timestamp(session, &mod->mod_replace.ta); - prepared = mod->mod_replace.ta.prepare; - result = (durable_ts > rollback_timestamp) || prepared; - } else if (mod != NULL && mod->rec_result == WT_PM_REC_MULTIBLOCK) { - tag = "reconciled multi block"; - /* Calculate the max durable timestamp by traversing all multi addresses. */ - for (multi = mod->mod_multi, i = 0; i < mod->mod_multi_entries; ++multi, ++i) { - durable_ts = WT_MAX( - durable_ts, __rollback_get_ref_max_durable_timestamp(session, &multi->addr.ta)); - if (multi->addr.ta.prepare) - prepared = true; - } - result = (durable_ts > rollback_timestamp) || prepared; - } else if (mod != NULL && mod->instantiated && !__wt_page_is_modified(ref->page) && - ref->page_del != NULL) { - tag = "page_del info"; - durable_ts = ref->page_del->durable_timestamp; - prepared = ref->page_del->prepare_state == WT_PREPARE_INPROGRESS || - ref->page_del->prepare_state == WT_PREPARE_LOCKED; - newest_txn = ref->page_del->txnid; - result = (durable_ts > rollback_timestamp) || prepared || - WT_CHECK_RECOVERY_FLAG_TXNID(session, newest_txn); - } else if (!__wt_off_page(ref->home, addr)) { - tag = "on page cell"; - /* Check if the page is obsolete using the page disk address. */ - __wt_cell_unpack_addr(session, ref->home->dsk, (WT_CELL *)addr, &vpack); - durable_ts = __rollback_get_ref_max_durable_timestamp(session, &vpack.ta); - prepared = vpack.ta.prepare; - newest_txn = vpack.ta.newest_txn; - result = (durable_ts > rollback_timestamp) || prepared || - WT_CHECK_RECOVERY_FLAG_TXNID(session, newest_txn); - } else if (addr != NULL) { - tag = "address"; - durable_ts = __rollback_get_ref_max_durable_timestamp(session, &addr->ta); - prepared = addr->ta.prepare; - newest_txn = addr->ta.newest_txn; - result = (durable_ts > rollback_timestamp) || prepared || - WT_CHECK_RECOVERY_FLAG_TXNID(session, newest_txn); - } - - __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session), - "%p: page with %s durable timestamp: %s, newest txn: %" PRIu64 - " and prepared updates: %s needs abort: %s", - (void *)ref, tag, __wt_timestamp_to_string(durable_ts, ts_string), newest_txn, - prepared ? "true" : "false", result ? "true" : "false"); - - return (result); -} - -/* - * __rollback_abort_updates -- - * Abort updates on this page newer than the timestamp. - */ -static int -__rollback_abort_updates(WT_SESSION_IMPL *session, WT_REF *ref, wt_timestamp_t rollback_timestamp) -{ - WT_PAGE *page; - bool modified; - - /* - * If we have a ref with clean page, find out whether the page has any modifications that are - * newer than the given timestamp. As eviction writes the newest version to page, even a clean - * page may also contain modifications that need rollback. - */ - page = ref->page; - modified = __wt_page_is_modified(page); - if (!modified && !__rollback_page_needs_abort(session, ref, rollback_timestamp)) { - __wt_verbose_level_multi(session, WT_VERB_RECOVERY_RTS(session), WT_VERBOSE_DEBUG_3, - "%p: unmodified stable page skipped", (void *)ref); - return (0); - } - - WT_STAT_CONN_INCR(session, txn_rts_pages_visited); - __wt_verbose_level_multi(session, WT_VERB_RECOVERY_RTS(session), WT_VERBOSE_DEBUG_2, - "%p: roll back %s page", (void *)ref, modified ? "modified" : "clean"); - - switch (page->type) { - case WT_PAGE_COL_FIX: - WT_RET(__rollback_abort_col_fix(session, ref, rollback_timestamp)); - break; - case WT_PAGE_COL_VAR: - WT_RET(__rollback_abort_col_var(session, ref, rollback_timestamp)); - break; - case WT_PAGE_ROW_LEAF: - WT_RET(__rollback_abort_row_leaf(session, ref, rollback_timestamp)); - break; - case WT_PAGE_COL_INT: - case WT_PAGE_ROW_INT: - /* This function is not called for internal pages. */ - WT_ASSERT(session, false); - /* Fall through. */ - default: - WT_RET(__wt_illegal_value(session, page->type)); - } - - /* Mark the page as dirty to reconcile the page. */ - if (page->modify) - __wt_page_modify_set(session, page); - return (0); -} - -/* - * __rollback_to_stable_page_skip -- - * Skip if rollback to stable doesn't require reading this page. - */ -static int -__rollback_to_stable_page_skip( - WT_SESSION_IMPL *session, WT_REF *ref, void *context, bool visible_all, bool *skipp) -{ - WT_PAGE_DELETED *page_del; - wt_timestamp_t rollback_timestamp; - char time_string[WT_TIME_STRING_SIZE]; - - rollback_timestamp = *(wt_timestamp_t *)context; - WT_UNUSED(visible_all); - - *skipp = false; /* Default to reading */ - - /* - * Skip pages truncated at or before the RTS timestamp. (We could read the page, but that would - * unnecessarily instantiate it). If the page has no fast-delete information, that means either - * it was discarded because the delete is globally visible, or the internal page holding the - * cell was an old format page so none was loaded. In the latter case we should skip the page as - * there's no way to get correct behavior and skipping matches the historic behavior. Note that - * eviction is running; we must lock the WT_REF before examining the fast-delete information. - */ - if (ref->state == WT_REF_DELETED && - WT_REF_CAS_STATE(session, ref, WT_REF_DELETED, WT_REF_LOCKED)) { - page_del = ref->page_del; - if (page_del == NULL || - (__rollback_txn_visible_id(session, page_del->txnid) && - page_del->durable_timestamp <= rollback_timestamp)) { - /* - * We should never see a prepared truncate here; not at recovery time because prepared - * truncates can't be written to disk, and not during a runtime RTS either because it - * should not be possible to do that with an unresolved prepared transaction. - */ - WT_ASSERT(session, - page_del == NULL || page_del->prepare_state == WT_PREPARE_INIT || - page_del->prepare_state == WT_PREPARE_RESOLVED); - - if (page_del == NULL) - __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session), - "%p: deleted page walk skipped", (void *)ref); - else { - __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session), - "%p: deleted page walk skipped page_del %s", (void *)ref, - __wt_time_point_to_string(page_del->timestamp, page_del->durable_timestamp, - page_del->txnid, time_string)); - } - WT_STAT_CONN_INCR(session, txn_rts_tree_walk_skip_pages); - *skipp = true; - } - WT_REF_SET_STATE(ref, WT_REF_DELETED); - return (0); - } - - /* Otherwise, if the page state is other than on disk, we want to look at it. */ - if (ref->state != WT_REF_DISK) - return (0); - - /* - * Check whether this on-disk page has any updates to be aborted. We are not holding a hazard - * reference on the page and so we rely on there being no other threads of control in the tree, - * that is, eviction ignores WT_REF_DISK pages and no other thread is reading pages, this page - * cannot change state from on-disk to something else. - */ - if (!__rollback_page_needs_abort(session, ref, rollback_timestamp)) { - *skipp = true; - __wt_verbose_multi( - session, WT_VERB_RECOVERY_RTS(session), "%p: stable page walk skipped", (void *)ref); - WT_STAT_CONN_INCR(session, txn_rts_tree_walk_skip_pages); - } - - return (0); -} - -/* - * __rollback_to_stable_btree_walk -- - * Called for each open handle - choose to either skip or wipe the commits - */ -static int -__rollback_to_stable_btree_walk(WT_SESSION_IMPL *session, wt_timestamp_t rollback_timestamp) -{ - WT_DECL_RET; - WT_REF *ref; - - /* Walk the tree, marking commits aborted where appropriate. */ - ref = NULL; - while ( - (ret = __wt_tree_walk_custom_skip(session, &ref, __rollback_to_stable_page_skip, - &rollback_timestamp, WT_READ_NO_EVICT | WT_READ_VISIBLE_ALL | WT_READ_WONT_NEED)) == 0 && - ref != NULL) - if (F_ISSET(ref, WT_REF_FLAG_LEAF)) - WT_RET(__rollback_abort_updates(session, ref, rollback_timestamp)); - - return (ret); -} - -/* - * __rollback_to_stable_btree -- - * Called for each object handle - choose to either skip or wipe the commits - */ -static int -__rollback_to_stable_btree(WT_SESSION_IMPL *session, wt_timestamp_t rollback_timestamp) -{ - WT_BTREE *btree; - WT_CONNECTION_IMPL *conn; - - btree = S2BT(session); - conn = S2C(session); - - __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session), - "rollback to stable connection logging enabled: %s and btree logging enabled: %s", - FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) ? "true" : "false", - F_ISSET(btree, WT_BTREE_LOGGED) ? "true" : "false"); - - /* Files with commit-level durability (without timestamps), don't get their commits wiped. */ - if (F_ISSET(btree, WT_BTREE_LOGGED)) - return (0); - - /* There is never anything to do for checkpoint handles. */ - if (WT_READING_CHECKPOINT(session)) - return (0); - - /* There is nothing to do on an empty tree. */ - if (btree->root.page == NULL) - return (0); - - return (__rollback_to_stable_btree_walk(session, rollback_timestamp)); -} - -/* - * __rollback_to_stable_check -- - * Check to the extent possible that the rollback request is reasonable. - */ -static int -__rollback_to_stable_check(WT_SESSION_IMPL *session) -{ - WT_CONNECTION_IMPL *conn; - WT_DECL_RET; - WT_SESSION_IMPL *session_in_list; - uint32_t i, session_cnt; - bool cursor_active, txn_active; - - conn = S2C(session); - cursor_active = txn_active = false; - - WT_STAT_CONN_INCR(session, txn_walk_sessions); - - /* - * Help the user comply with the requirement there be no concurrent user operations. It is okay - * to have a transaction in the prepared state. - * - * WT_TXN structures are allocated and freed as sessions are activated and closed. Lock the - * session open/close to ensure we don't race. This call is a rarely used RTS-only function, - * acquiring the lock shouldn't be an issue. - */ - __wt_spin_lock(session, &conn->api_lock); - - WT_ORDERED_READ(session_cnt, conn->session_cnt); - for (i = 0, session_in_list = conn->sessions; i < session_cnt; i++, session_in_list++) { - - /* Skip inactive or internal sessions. */ - if (!session_in_list->active || F_ISSET(session_in_list, WT_SESSION_INTERNAL)) - continue; - - /* Check if a user session has a running transaction. */ - if (F_ISSET(session_in_list->txn, WT_TXN_RUNNING)) { - txn_active = true; - break; - } - - /* Check if a user session has an active file cursor. */ - if (session_in_list->ncursors != 0) { - cursor_active = true; - break; - } - } - __wt_spin_unlock(session, &conn->api_lock); - - /* - * A new cursor may be positioned or a transaction may start after we return from this call and - * callers should be aware of this limitation. - */ - if (cursor_active) - WT_RET_MSG(session, EBUSY, "rollback_to_stable illegal with active file cursors"); - if (txn_active) { - ret = EBUSY; - WT_TRET(__wt_verbose_dump_txn(session)); - WT_RET_MSG(session, ret, "rollback_to_stable illegal with active transactions"); - } - return (0); -} - -/* - * __rollback_to_stable_btree_hs_truncate -- - * Wipe all history store updates for the btree (non-timestamped tables) - */ -static int -__rollback_to_stable_btree_hs_truncate(WT_SESSION_IMPL *session, uint32_t btree_id) -{ - WT_CURSOR *hs_cursor_start, *hs_cursor_stop; - WT_DECL_ITEM(hs_key); - WT_DECL_RET; - WT_SESSION *truncate_session; - wt_timestamp_t hs_start_ts; - uint64_t hs_counter; - uint32_t hs_btree_id; - - hs_cursor_start = hs_cursor_stop = NULL; - hs_btree_id = 0; - truncate_session = (WT_SESSION *)session; - - WT_RET(__wt_scr_alloc(session, 0, &hs_key)); - - /* Open a history store start cursor. */ - WT_ERR(__wt_curhs_open(session, NULL, &hs_cursor_start)); - F_SET(hs_cursor_start, WT_CURSTD_HS_READ_COMMITTED); - - hs_cursor_start->set_key(hs_cursor_start, 1, btree_id); - WT_ERR_NOTFOUND_OK(__wt_curhs_search_near_after(session, hs_cursor_start), true); - if (ret == WT_NOTFOUND) { - ret = 0; - goto done; - } - - /* Open a history store stop cursor. */ - WT_ERR(__wt_curhs_open(session, NULL, &hs_cursor_stop)); - F_SET(hs_cursor_stop, WT_CURSTD_HS_READ_COMMITTED | WT_CURSTD_HS_READ_ACROSS_BTREE); - - hs_cursor_stop->set_key(hs_cursor_stop, 1, btree_id + 1); - WT_ERR_NOTFOUND_OK(__wt_curhs_search_near_after(session, hs_cursor_stop), true); - -#ifdef HAVE_DIAGNOSTIC - /* If we get not found, we are at the largest btree id in the history store. */ - if (ret == 0) { - hs_cursor_stop->get_key(hs_cursor_stop, &hs_btree_id, hs_key, &hs_start_ts, &hs_counter); - WT_ASSERT(session, hs_btree_id > btree_id); - } -#endif - - do { - WT_ASSERT(session, ret == WT_NOTFOUND || hs_btree_id > btree_id); - - WT_ERR_NOTFOUND_OK(hs_cursor_stop->prev(hs_cursor_stop), true); - /* We can find the start point then we must be able to find the stop point. */ - if (ret == WT_NOTFOUND) - WT_ERR_PANIC( - session, ret, "cannot locate the stop point to truncate the history store."); - hs_cursor_stop->get_key(hs_cursor_stop, &hs_btree_id, hs_key, &hs_start_ts, &hs_counter); - } while (hs_btree_id != btree_id); - - WT_ERR( - truncate_session->truncate(truncate_session, NULL, hs_cursor_start, hs_cursor_stop, NULL)); - - WT_STAT_CONN_DATA_INCR(session, cache_hs_btree_truncate); - - __wt_verbose(session, WT_VERB_RECOVERY_PROGRESS, - "Rollback to stable has truncated records for btree %u from the history store", btree_id); - -done: -err: - __wt_scr_free(session, &hs_key); - if (hs_cursor_start != NULL) - WT_TRET(hs_cursor_start->close(hs_cursor_start)); - if (hs_cursor_stop != NULL) - WT_TRET(hs_cursor_stop->close(hs_cursor_stop)); - - return (ret); -} - -/* - * __rollback_to_stable_hs_final_pass -- - * Perform rollback to stable on the history store to remove any entries newer than the stable - * timestamp. - */ -static int -__rollback_to_stable_hs_final_pass(WT_SESSION_IMPL *session, wt_timestamp_t rollback_timestamp) -{ - WT_CONFIG ckptconf; - WT_CONFIG_ITEM cval, durableval, key; - WT_CONNECTION_IMPL *conn; - WT_DECL_RET; - wt_timestamp_t max_durable_ts, newest_stop_durable_ts, newest_stop_ts; - size_t i; - char *config; - char ts_string[2][WT_TS_INT_STRING_SIZE]; - - config = NULL; - conn = S2C(session); - - WT_RET(__wt_metadata_search(session, WT_HS_URI, &config)); - - /* - * Find out the max durable timestamp of the history store from checkpoint. Most of the history - * store updates have stop timestamp either greater or equal to the start timestamp except for - * the updates written for the prepared updates on the data store. To abort the updates with no - * stop timestamp, we must include the newest stop timestamp also into the calculation of - * maximum timestamp of the history store. - */ - newest_stop_durable_ts = newest_stop_ts = WT_TS_NONE; - WT_ERR(__wt_config_getones(session, config, "checkpoint", &cval)); - __wt_config_subinit(session, &ckptconf, &cval); - for (; __wt_config_next(&ckptconf, &key, &cval) == 0;) { - ret = __wt_config_subgets(session, &cval, "newest_stop_durable_ts", &durableval); - if (ret == 0) - newest_stop_durable_ts = WT_MAX(newest_stop_durable_ts, (wt_timestamp_t)durableval.val); - WT_ERR_NOTFOUND_OK(ret, false); - ret = __wt_config_subgets(session, &cval, "newest_stop_ts", &durableval); - if (ret == 0) - newest_stop_ts = WT_MAX(newest_stop_ts, (wt_timestamp_t)durableval.val); - WT_ERR_NOTFOUND_OK(ret, false); - } - max_durable_ts = WT_MAX(newest_stop_ts, newest_stop_durable_ts); - WT_ERR(__wt_session_get_dhandle(session, WT_HS_URI, NULL, NULL, 0)); - - /* - * The rollback operation should be performed on the history store file when the checkpoint - * durable start/stop timestamp is greater than the rollback timestamp. But skip if there is no - * stable timestamp. - * - * Note that the corresponding code in __rollback_to_stable_btree_apply also checks whether - * there _are_ timestamped updates by checking max_durable_ts; that check is redundant here for - * several reasons, the most immediate being that max_durable_ts cannot be none (zero) because - * it's greater than rollback_timestamp, which is itself greater than zero. - */ - if (max_durable_ts > rollback_timestamp && rollback_timestamp != WT_TS_NONE) { - __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session), - "tree rolled back with durable timestamp: %s", - __wt_timestamp_to_string(max_durable_ts, ts_string[0])); - WT_TRET(__rollback_to_stable_btree(session, rollback_timestamp)); - } else - __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session), - "tree skipped with durable timestamp: %s and stable timestamp: %s", - __wt_timestamp_to_string(max_durable_ts, ts_string[0]), - __wt_timestamp_to_string(rollback_timestamp, ts_string[1])); - - /* - * Truncate history store entries from the partial backup remove list. The list holds all of the - * btree ids that do not exist as part of the database anymore due to performing a selective - * restore from backup. - */ - if (F_ISSET(conn, WT_CONN_BACKUP_PARTIAL_RESTORE) && conn->partial_backup_remove_ids != NULL) - for (i = 0; conn->partial_backup_remove_ids[i] != 0; ++i) - WT_ERR( - __rollback_to_stable_btree_hs_truncate(session, conn->partial_backup_remove_ids[i])); -err: - if (session->dhandle != NULL) - WT_TRET(__wt_session_release_dhandle(session)); - __wt_free(session, config); - return (ret); -} - -/* - * __rollback_progress_msg -- - * Log a verbose message about the progress of the current rollback to stable. - */ -static void -__rollback_progress_msg(WT_SESSION_IMPL *session, struct timespec rollback_start, - uint64_t rollback_count, uint64_t *rollback_msg_count) -{ - struct timespec cur_time; - uint64_t time_diff; - - __wt_epoch(session, &cur_time); - - /* Time since the rollback started. */ - time_diff = WT_TIMEDIFF_SEC(cur_time, rollback_start); - - if ((time_diff / WT_PROGRESS_MSG_PERIOD) > *rollback_msg_count) { - __wt_verbose(session, WT_VERB_RECOVERY_PROGRESS, - "Rollback to stable has been running for %" PRIu64 " seconds and has inspected %" PRIu64 - " files. For more detailed logging, enable WT_VERB_RTS", - time_diff, rollback_count); - ++(*rollback_msg_count); - } -} - -/* - * __rollback_to_stable_check_btree_modified -- - * Check that the rollback to stable btree is modified or not. - */ -static int -__rollback_to_stable_check_btree_modified(WT_SESSION_IMPL *session, const char *uri, bool *modified) -{ - WT_DECL_RET; - - ret = __wt_conn_dhandle_find(session, uri, NULL); - *modified = ret == 0 && S2BT(session)->modified; - return (ret); -} - -/* - * __rollback_to_stable_btree_apply -- - * Perform rollback to stable on a single file. - */ -static int -__rollback_to_stable_btree_apply( - WT_SESSION_IMPL *session, const char *uri, const char *config, wt_timestamp_t rollback_timestamp) -{ - WT_CONFIG ckptconf; - WT_CONFIG_ITEM cval, value, key; - WT_DECL_RET; - wt_timestamp_t max_durable_ts, newest_start_durable_ts, newest_stop_durable_ts; - size_t addr_size; - uint64_t rollback_txnid, write_gen; - uint32_t btree_id; - char ts_string[2][WT_TS_INT_STRING_SIZE]; - bool dhandle_allocated, durable_ts_found, has_txn_updates_gt_than_ckpt_snap, modified; - bool prepared_updates; - - /* Ignore non-btree objects as well as the metadata and history store files. */ - if (!WT_BTREE_PREFIX(uri) || strcmp(uri, WT_HS_URI) == 0 || strcmp(uri, WT_METAFILE_URI) == 0) - return (0); - - addr_size = 0; - rollback_txnid = 0; - write_gen = 0; - dhandle_allocated = false; - - /* Find out the max durable timestamp of the object from checkpoint. */ - newest_start_durable_ts = newest_stop_durable_ts = WT_TS_NONE; - durable_ts_found = prepared_updates = has_txn_updates_gt_than_ckpt_snap = false; - - WT_RET(__wt_config_getones(session, config, "checkpoint", &cval)); - __wt_config_subinit(session, &ckptconf, &cval); - for (; __wt_config_next(&ckptconf, &key, &cval) == 0;) { - ret = __wt_config_subgets(session, &cval, "newest_start_durable_ts", &value); - if (ret == 0) { - newest_start_durable_ts = WT_MAX(newest_start_durable_ts, (wt_timestamp_t)value.val); - durable_ts_found = true; - } - WT_RET_NOTFOUND_OK(ret); - ret = __wt_config_subgets(session, &cval, "newest_stop_durable_ts", &value); - if (ret == 0) { - newest_stop_durable_ts = WT_MAX(newest_stop_durable_ts, (wt_timestamp_t)value.val); - durable_ts_found = true; - } - WT_RET_NOTFOUND_OK(ret); - ret = __wt_config_subgets(session, &cval, "prepare", &value); - if (ret == 0) { - if (value.val) - prepared_updates = true; - } - WT_RET_NOTFOUND_OK(ret); - ret = __wt_config_subgets(session, &cval, "newest_txn", &value); - if (value.len != 0) - rollback_txnid = (uint64_t)value.val; - WT_RET_NOTFOUND_OK(ret); - ret = __wt_config_subgets(session, &cval, "addr", &value); - if (ret == 0) - addr_size = value.len; - WT_RET_NOTFOUND_OK(ret); - ret = __wt_config_subgets(session, &cval, "write_gen", &value); - if (ret == 0) - write_gen = (uint64_t)value.val; - WT_RET_NOTFOUND_OK(ret); - } - max_durable_ts = WT_MAX(newest_start_durable_ts, newest_stop_durable_ts); - - /* - * Perform rollback to stable when the newest written transaction of the btree is greater than - * or equal to the checkpoint snapshot. The snapshot comparison is valid only when the btree - * write generation number is greater than the last checkpoint connection base write generation - * to confirm that the btree is modified in the previous restart cycle. - */ - if (WT_CHECK_RECOVERY_FLAG_TXNID(session, rollback_txnid) && - (write_gen >= S2C(session)->last_ckpt_base_write_gen)) { - has_txn_updates_gt_than_ckpt_snap = true; - /* Increment the inconsistent checkpoint stats counter. */ - WT_STAT_CONN_DATA_INCR(session, txn_rts_inconsistent_ckpt); - } - - /* - * The rollback to stable will skip the tables during recovery and shutdown in the following - * conditions. - * 1. Empty table. - * 2. Table has timestamped updates without a stable timestamp. - */ - if ((F_ISSET(S2C(session), WT_CONN_RECOVERING) || - F_ISSET(S2C(session), WT_CONN_CLOSING_CHECKPOINT)) && - (addr_size == 0 || (rollback_timestamp == WT_TS_NONE && max_durable_ts != WT_TS_NONE))) { - __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session), - "skip rollback to stable on file %s because %s", uri, - addr_size == 0 ? "its checkpoint address length is 0" : - "it has timestamped updates and the stable timestamp is 0"); - return (0); - } - - /* - * The rollback operation should be performed on this file based on the following: - * 1. The dhandle is present in the cache and tree is modified. - * 2. The checkpoint durable start/stop timestamp is greater than the rollback timestamp. - * 3. The checkpoint has prepared updates written to disk. - * 4. There is no durable timestamp in any checkpoint. - * 5. The checkpoint newest txn is greater than snapshot min txn id. - */ - WT_WITHOUT_DHANDLE(session, - WT_WITH_HANDLE_LIST_READ_LOCK( - session, (ret = __rollback_to_stable_check_btree_modified(session, uri, &modified)))); - - WT_ERR_NOTFOUND_OK(ret, false); - - if (modified || max_durable_ts > rollback_timestamp || prepared_updates || !durable_ts_found || - has_txn_updates_gt_than_ckpt_snap) { - /* - * Open a handle; we're potentially opening a lot of handles and there's no reason to cache - * all of them for future unknown use, discard on close. - */ - ret = __wt_session_get_dhandle(session, uri, NULL, NULL, WT_DHANDLE_DISCARD); - if (ret != 0) - WT_ERR_MSG(session, ret, "%s: unable to open handle%s", uri, - ret == EBUSY ? ", error indicates handle is unavailable due to concurrent use" : ""); - dhandle_allocated = true; - - __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session), - "tree rolled back because it is modified: %s, or its durable timestamp (%s) > stable " - "timestamp (%s): " - "%s, or it has prepared updates: %s, or durable " - "timestamp is not found: %s, or txnid (%" PRIu64 - ") > recovery checkpoint snap min (%" PRIu64 "): %s", - S2BT(session)->modified ? "true" : "false", - __wt_timestamp_to_string(max_durable_ts, ts_string[0]), - __wt_timestamp_to_string(rollback_timestamp, ts_string[1]), - max_durable_ts > rollback_timestamp ? "true" : "false", - prepared_updates ? "true" : "false", !durable_ts_found ? "true" : "false", rollback_txnid, - S2C(session)->recovery_ckpt_snap_min, - has_txn_updates_gt_than_ckpt_snap ? "true" : "false"); - - WT_ERR(__rollback_to_stable_btree(session, rollback_timestamp)); - } else - __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session), - "%s: tree skipped with durable timestamp: %s and stable timestamp: %s or txnid: %" PRIu64, - uri, __wt_timestamp_to_string(max_durable_ts, ts_string[0]), - __wt_timestamp_to_string(rollback_timestamp, ts_string[1]), rollback_txnid); - - /* - * Truncate history store entries for the non-timestamped table. - * Exceptions: - * 1. Modified tree - Scenarios where the tree is never checkpointed lead to zero - * durable timestamp even they are timestamped tables. Until we have a special - * indication of letting to know the table type other than checking checkpointed durable - * timestamp to WT_TS_NONE, we need this exception. - * 2. In-memory database - In this scenario, there is no history store to truncate. - */ - if ((!dhandle_allocated || !S2BT(session)->modified) && max_durable_ts == WT_TS_NONE && - !F_ISSET(S2C(session), WT_CONN_IN_MEMORY)) { - WT_ERR(__wt_config_getones(session, config, "id", &cval)); - btree_id = (uint32_t)cval.val; - WT_ERR(__rollback_to_stable_btree_hs_truncate(session, btree_id)); - } - -err: - if (dhandle_allocated) - WT_TRET(__wt_session_release_dhandle(session)); - return (ret); -} - -/* - * __wt_rollback_to_stable_one -- - * Perform rollback to stable on a single object. - */ -int -__wt_rollback_to_stable_one(WT_SESSION_IMPL *session, const char *uri, bool *skipp) -{ - WT_DECL_RET; - wt_timestamp_t rollback_timestamp; - char *config; - - /* - * This is confusing: the caller's boolean argument "skip" stops the schema-worker loop from - * processing this object and any underlying objects it may have (for example, a table with - * multiple underlying file objects). We rollback-to-stable all of the file objects an object - * may contain, so set the caller's skip argument to true on all file objects, else set the - * caller's skip argument to false so our caller continues down the tree of objects. - */ - *skipp = WT_BTREE_PREFIX(uri); - if (!*skipp) - return (0); - - WT_RET(__wt_metadata_search(session, uri, &config)); - - /* Read the stable timestamp once, when we first start up. */ - WT_ORDERED_READ(rollback_timestamp, S2C(session)->txn_global.stable_timestamp); - - F_SET(session, WT_SESSION_QUIET_CORRUPT_FILE); - ret = __rollback_to_stable_btree_apply(session, uri, config, rollback_timestamp); - F_CLR(session, WT_SESSION_QUIET_CORRUPT_FILE); - - __wt_free(session, config); - - return (ret); -} - -/* - * __rollback_to_stable_btree_apply_all -- - * Perform rollback to stable to all files listed in the metadata, apart from the metadata and - * history store files. - */ -static int -__rollback_to_stable_btree_apply_all(WT_SESSION_IMPL *session, wt_timestamp_t rollback_timestamp) -{ - struct timespec rollback_timer; - WT_CURSOR *cursor; - WT_DECL_RET; - uint64_t rollback_count, rollback_msg_count; - const char *config, *uri; - - /* Initialize the verbose tracking timer. */ - __wt_epoch(session, &rollback_timer); - rollback_count = 0; - rollback_msg_count = 0; - - WT_RET(__wt_metadata_cursor(session, &cursor)); - while ((ret = cursor->next(cursor)) == 0) { - /* Log a progress message. */ - __rollback_progress_msg(session, rollback_timer, rollback_count, &rollback_msg_count); - ++rollback_count; - - WT_ERR(cursor->get_key(cursor, &uri)); - WT_ERR(cursor->get_value(cursor, &config)); - - F_SET(session, WT_SESSION_QUIET_CORRUPT_FILE); - ret = __rollback_to_stable_btree_apply(session, uri, config, rollback_timestamp); - F_CLR(session, WT_SESSION_QUIET_CORRUPT_FILE); - - /* - * Ignore rollback to stable failures on files that don't exist or files where corruption is - * detected. - */ - if (ret == ENOENT || (ret == WT_ERROR && F_ISSET(S2C(session), WT_CONN_DATA_CORRUPTION))) { - __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session), - "%s: skipped performing rollback to stable because the file %s", uri, - ret == ENOENT ? "does not exist" : "is corrupted."); - continue; - } - WT_ERR(ret); - } - WT_ERR_NOTFOUND_OK(ret, false); - - if (F_ISSET(S2C(session), WT_CONN_RECOVERING)) - WT_ERR(__rollback_to_stable_hs_final_pass(session, rollback_timestamp)); - -err: - WT_TRET(__wt_metadata_cursor_release(session, &cursor)); - return (ret); -} - -/* - * __rollback_to_stable -- - * Rollback all modifications with timestamps more recent than the passed in timestamp. - */ -static int -__rollback_to_stable(WT_SESSION_IMPL *session, bool no_ckpt) -{ - WT_CONNECTION_IMPL *conn; - WT_DECL_RET; - WT_TXN_GLOBAL *txn_global; - wt_timestamp_t rollback_timestamp; - char ts_string[2][WT_TS_INT_STRING_SIZE]; - - conn = S2C(session); - txn_global = &conn->txn_global; - - /* - * Rollback to stable should ignore tombstones in the history store since it needs to scan the - * entire table sequentially. - */ - F_SET(session, WT_SESSION_ROLLBACK_TO_STABLE); - - WT_ERR(__rollback_to_stable_check(session)); - - /* - * Update the global time window state to have consistent view from global visibility rules for - * the rollback to stable to bring back the database into a consistent state. - * - * As part of the below function call, the oldest transaction id and pinned timestamps are - * updated. - */ - WT_ERR(__wt_txn_update_oldest(session, WT_TXN_OLDEST_STRICT | WT_TXN_OLDEST_WAIT)); - - WT_ASSERT_ALWAYS(session, - (txn_global->has_pinned_timestamp || !txn_global->has_oldest_timestamp), - "Database has no pinned timestamp but an oldest timestamp. Pinned timestamp is required to " - "find out the global visibility/obsolete of an update."); - - /* - * Copy the stable timestamp, otherwise we'd need to lock it each time it's accessed. Even - * though the stable timestamp isn't supposed to be updated while rolling back, accessing it - * without a lock would violate protocol. - */ - WT_ORDERED_READ(rollback_timestamp, txn_global->stable_timestamp); - __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session), - "performing rollback to stable with stable timestamp: %s and oldest timestamp: %s", - __wt_timestamp_to_string(rollback_timestamp, ts_string[0]), - __wt_timestamp_to_string(txn_global->oldest_timestamp, ts_string[1])); - - if (F_ISSET(conn, WT_CONN_RECOVERING)) - __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session), - "recovered checkpoint snapshot min: %" PRIu64 ", snapshot max: %" PRIu64 - ", snapshot count: %" PRIu32, - conn->recovery_ckpt_snap_min, conn->recovery_ckpt_snap_max, - conn->recovery_ckpt_snapshot_count); - - WT_ERR(__rollback_to_stable_btree_apply_all(session, rollback_timestamp)); - - /* Rollback the global durable timestamp to the stable timestamp. */ - txn_global->has_durable_timestamp = txn_global->has_stable_timestamp; - txn_global->durable_timestamp = txn_global->stable_timestamp; - - /* - * If the configuration is not in-memory, forcibly log a checkpoint after rollback to stable to - * ensure that both in-memory and on-disk versions are the same unless caller requested for no - * checkpoint. - */ - if (!F_ISSET(conn, WT_CONN_IN_MEMORY) && !no_ckpt) - WT_ERR(session->iface.checkpoint(&session->iface, "force=1")); - -err: - F_CLR(session, WT_SESSION_ROLLBACK_TO_STABLE); - return (ret); -} - -/* - * __wt_rollback_to_stable -- - * Rollback the database to the stable timestamp. - */ -int -__wt_rollback_to_stable(WT_SESSION_IMPL *session, const char *cfg[], bool no_ckpt) -{ - WT_DECL_RET; - - WT_UNUSED(cfg); - - /* - * Don't use the connection's default session: we are working on data handles and (a) don't want - * to cache all of them forever, plus (b) can't guarantee that no other method will be called - * concurrently. Copy parent session no logging option to the internal session to make sure that - * rollback to stable doesn't generate log records. - */ - WT_RET( - __wt_open_internal_session(S2C(session), "txn rollback_to_stable", true, 0, 0, &session)); - - WT_STAT_CONN_SET(session, txn_rollback_to_stable_running, 1); - WT_WITH_CHECKPOINT_LOCK( - session, WT_WITH_SCHEMA_LOCK(session, ret = __rollback_to_stable(session, no_ckpt))); - WT_STAT_CONN_SET(session, txn_rollback_to_stable_running, 0); - - WT_TRET(__wt_session_close_internal(session)); - - return (ret); -} |