summary | refs | log | tree | commit | diff
path: root/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
diff options
context:
space:
mode:
author: Luke Chen <luke.chen@mongodb.com> 2020-09-09 11:37:44 +1000
committer: Evergreen Agent <no-reply@evergreen.mongodb.com> 2020-09-09 01:52:32 +0000
commit: 0f0eb111c9dbd5dd4ca86529aee81bad59c6579b (patch)
tree: 94f3efa909755fcc8b381bf68214ebb9bb326964 /src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
parent: 103b545528e7cf05537bd68bda4bc8235bf6fefa (diff)
download: mongo-0f0eb111c9dbd5dd4ca86529aee81bad59c6579b.tar.gz
Import wiredtiger: 6a7db4f96fe828fdb1b9a31d83460c0573a2c3b1 from branch mongodb-4.4
ref: 579966149b..6a7db4f96f
for: 4.4.2
WT-5144 Use wt_clock instead of wt_epoch in perf programs
WT-5585 Remove cache_overflow config option
WT-5693 Enable test_wt4105_large_doc_small_upd
WT-5940 Migrate to Clang Format 10
WT-6000 Enhance incremental backup testing in format to support restart
WT-6006 Revert test binaries to 10 in checkpoint-stress-test
WT-6027 Fix docs spelling errors and warnings
WT-6322 Split full compatibility tests into smaller groups
WT-6390 Extend compact02 timeout from 8 => 10 minutes
WT-6404 Add timing stress that delays checkpoint after it calls __wt_txn_begin
WT-6451 Do not evict clean metadata pages if needed for historic reads
WT-6458 read row-store leaf pages with prepared updates in a single pass
WT-6463 History store operations should honor cache size
WT-6471 Avoid the error message for non-existent clang-format binary
WT-6472 Update timestamp_abort test cache configuration
WT-6478 Cursor cache statistics not getting incremented
WT-6505 Add debugging for missing file failure
WT-6507 Exit cache eviction worker after our operation has timed out
WT-6526 Fix assertion failure when opening DB in readonly mode after unclean shutdown
WT-6532 Consider update structure overhead in split length calculation
WT-6542 Add an assert to ensure we are not unintentionally returning empty values
WT-6544 Onpage value not appended to the tombstone restored from the data or history store
WT-6556 Fix internal sessions to use internal session close function than public API to avoid memory leak
WT-6559 Use the session id from the new session to determine statistics bucket
WT-6560 Fix usage of global salvage in WT utility
WT-6561 Provide MongoDB configuration in the wt utility usage output
WT-6569 Squash the prepared updates into a single update before writing it to data store
WT-6570 RTS to remove the left over updates in the history store without stop timestamp
WT-6571 Lseek cannot use error_sys_check because it does not return an int
WT-6577 History store dump outputs confusing time window
WT-6578 Prevent reconciliation from looking past the on-disk value
WT-6581 Fix class name in test_hs15
WT-6585 Panic if updates that are older than the updates in history store are inserted to history store
WT-6586 Tombstone inserted to history store should also be flagged as WT_UPDATE_HS
WT-6589 Fix disabled cursor cache python tests
WT-6591 Stop checkpoint thread before closing connection in Python tests
WT-6593 Retry conflicting operations in test_rollback_to_stable10
WT-6596 Increase cache for timestamp abort test and separate key spaces for all abort tests
WT-6598 Add new API allowing changing dhandle hash bucket size
WT-6602 Allow operation timeout ms to be passed to commit and rollback
WT-6604 Fix typo in the comment descibing WT_CELL structure
WT-6610 Fix incremental backup checkpoint parsing to handle upgrades
WT-6611 Revert enhancement allowing rename and incremental backup
WT-6613 Add python test for early_load flag
WT-6615 Initialize last_upd where it is actually used
WT-6619 Eliminate possibility of infinite loop in test_cursor13.py
WT-6623 Set the connection level file id in recovery file scan
WT-6625 Remove outdated TODO
WT-6635 Disable mix and column filetype test
WT-6640 Coverity: Failure to restore saved dhandle
WT-6641 Coverity: Unused value
Diffstat (limited to 'src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c')
-rw-r--r-- src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c | 109
1 files changed, 68 insertions, 41 deletions
diff --git a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
index 2c97ddf48c7..65ae870b8fe 100644
--- a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
+++ b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
@@ -35,7 +35,8 @@ __rollback_abort_newer_update(WT_SESSION_IMPL *session, WT_UPDATE *first_upd,
* is not configured for key consistency check, the timestamps could be out of order
* here.
*/
- WT_ASSERT(session, !FLD_ISSET(S2BT(session)->assert_flags, WT_ASSERT_COMMIT_TS_KEYS) ||
+ WT_ASSERT(session,
+ !FLD_ISSET(S2BT(session)->assert_flags, WT_ASSERT_COMMIT_TS_KEYS) ||
upd == first_upd);
first_upd = upd->next;
@@ -165,7 +166,7 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW
WT_UPDATE *hs_upd, *tombstone, *upd;
wt_timestamp_t hs_durable_ts, hs_start_ts, hs_stop_durable_ts, newer_hs_durable_ts;
uint64_t hs_counter, type_full;
- uint32_t hs_btree_id, session_flags;
+ uint32_t hs_btree_id;
uint8_t type;
int cmp;
char ts_string[4][WT_TS_INT_STRING_SIZE];
@@ -178,7 +179,6 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW
hs_upd = tombstone = upd = NULL;
hs_durable_ts = hs_start_ts = hs_stop_durable_ts = WT_TS_NONE;
hs_btree_id = S2BT(session)->id;
- session_flags = 0;
WT_CLEAR(full_value);
valid_update_found = false;
#ifdef HAVE_DIAGNOSTIC
@@ -200,7 +200,7 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW
newer_hs_durable_ts = unpack->tw.durable_start_ts;
/* Open a history store table cursor. */
- WT_ERR(__wt_hs_cursor_open(session, &session_flags));
+ WT_ERR(__wt_hs_cursor_open(session));
hs_cursor = session->hs_cursor;
cbt = (WT_CURSOR_BTREE *)hs_cursor;
@@ -263,8 +263,9 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW
* records newer than or equal to the onpage value if eviction runs concurrently with
* checkpoint. In that case, don't verify the first record.
*/
- WT_ASSERT(session, hs_stop_durable_ts <= newer_hs_durable_ts ||
- hs_start_ts == hs_stop_durable_ts || first_record);
+ WT_ASSERT(session,
+ hs_stop_durable_ts <= newer_hs_durable_ts || hs_start_ts == hs_stop_durable_ts ||
+ first_record);
if (hs_stop_durable_ts < newer_hs_durable_ts)
WT_STAT_CONN_INCR(session, txn_rts_hs_stop_older_than_newer_start);
@@ -286,8 +287,8 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW
/* Stop processing when we find a stable update according to the given timestamp. */
if (hs_durable_ts <= rollback_timestamp) {
__wt_verbose(session, WT_VERB_RTS,
- "history store update valid with start timestamp: %s, durable timestamp: %s, "
- "stop timestamp: %s and stable timestamp: %s",
+ "history store update valid with start timestamp: %s, durable timestamp: %s, stop "
+ "timestamp: %s and stable timestamp: %s",
__wt_timestamp_to_string(hs_start_ts, ts_string[0]),
__wt_timestamp_to_string(hs_durable_ts, ts_string[1]),
__wt_timestamp_to_string(hs_stop_durable_ts, ts_string[2]),
@@ -331,8 +332,9 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW
upd->txnid = cbt->upd_value->tw.start_txn;
upd->durable_ts = cbt->upd_value->tw.durable_start_ts;
upd->start_ts = cbt->upd_value->tw.start_ts;
- __wt_verbose(session, WT_VERB_RTS, "update restored from history store (txnid: %" PRIu64
- ", start_ts: %s, durable_ts: %s",
+ __wt_verbose(session, WT_VERB_RTS,
+ "update restored from history store (txnid: %" PRIu64
+ ", start_ts: %s, durable_ts: %s",
upd->txnid, __wt_timestamp_to_string(upd->start_ts, ts_string[0]),
__wt_timestamp_to_string(upd->durable_ts, ts_string[1]));
@@ -395,7 +397,7 @@ err:
__wt_scr_free(session, &hs_value);
__wt_scr_free(session, &key);
__wt_buf_free(session, &full_value);
- WT_TRET(__wt_hs_cursor_close(session, session_flags));
+ WT_TRET(__wt_hs_cursor_close(session));
return (ret);
}
@@ -421,10 +423,15 @@ __rollback_abort_row_ondisk_kv(
__wt_row_leaf_value_cell(session, page, rip, NULL, vpack);
prepared = vpack->tw.prepare;
if (WT_IS_HS(S2BT(session))) {
- if (vpack->tw.durable_stop_ts > rollback_timestamp) {
+ /*
+ * Abort the history store update with stop durable timestamp greater than the stable
+ * timestamp or the updates with max stop timestamp which implies that they are associated
+ * with prepared transactions.
+ */
+ if (vpack->tw.durable_stop_ts > rollback_timestamp || vpack->tw.stop_ts == WT_TS_MAX) {
__wt_verbose(session, WT_VERB_RTS,
- "hs update aborted with start durable/commit timestamp: %s, %s, "
- "stop durable/commit timestamp: %s, %s and stable timestamp: %s",
+ "hs update aborted with start durable/commit timestamp: %s, %s, stop durable/commit "
+ "timestamp: %s, %s and stable timestamp: %s",
__wt_timestamp_to_string(vpack->tw.durable_start_ts, ts_string[0]),
__wt_timestamp_to_string(vpack->tw.start_ts, ts_string[1]),
__wt_timestamp_to_string(vpack->tw.durable_stop_ts, ts_string[2]),
@@ -595,11 +602,11 @@ __rollback_abort_row_reconciled_page(
if (mod->rec_result == WT_PM_REC_REPLACE &&
(mod->mod_replace.ta.newest_start_durable_ts > rollback_timestamp ||
- mod->mod_replace.ta.newest_stop_durable_ts > rollback_timestamp ||
- mod->mod_replace.ta.prepare)) {
+ mod->mod_replace.ta.newest_stop_durable_ts > rollback_timestamp ||
+ mod->mod_replace.ta.prepare)) {
__wt_verbose(session, WT_VERB_RTS,
- "reconciled replace block page history store update removal on-disk with start "
- "durable timestamp: %s, stop durable timestamp: %s and stable timestamp: %s",
+ "reconciled replace block page history store update removal on-disk with start durable "
+ "timestamp: %s, stop durable timestamp: %s and stable timestamp: %s",
__wt_timestamp_to_string(mod->mod_replace.ta.newest_start_durable_ts, ts_string[0]),
__wt_timestamp_to_string(mod->mod_replace.ta.newest_stop_durable_ts, ts_string[1]),
__wt_timestamp_to_string(rollback_timestamp, ts_string[2]));
@@ -622,9 +629,8 @@ __rollback_abort_row_reconciled_page(
multi->addr.ta.newest_stop_durable_ts > rollback_timestamp ||
multi->addr.ta.prepare) {
__wt_verbose(session, WT_VERB_RTS,
- "reconciled multi block page history store update removal on-disk with "
- "start durable timestamp: %s, stop durable timestamp: %s and stable "
- "timestamp: %s",
+ "reconciled multi block page history store update removal on-disk with start "
+ "durable timestamp: %s, stop durable timestamp: %s and stable timestamp: %s",
__wt_timestamp_to_string(multi->addr.ta.newest_start_durable_ts, ts_string[0]),
__wt_timestamp_to_string(multi->addr.ta.newest_stop_durable_ts, ts_string[1]),
__wt_timestamp_to_string(rollback_timestamp, ts_string[2]));
@@ -696,6 +702,24 @@ __rollback_abort_newer_row_leaf(
}
/*
+ * __rollback_get_ref_max_durable_timestamp --
+ * Return the maximum durable timestamp of a time aggregate. When the session's btree is the
+ * history store this is the larger of the newest stop durable timestamp and the newest stop
+ * timestamp; otherwise it is the larger of the newest start and stop durable timestamps. Most
+ * history store updates have a stop timestamp greater than or equal to the start timestamp,
+ * except for the updates written for the prepared updates on the data store. To abort the
+ * updates with no stop timestamp, the newest stop timestamp must be part of the calculation.
+ */
+static wt_timestamp_t
+__rollback_get_ref_max_durable_timestamp(WT_SESSION_IMPL *session, WT_TIME_AGGREGATE *ta)
+{
+ if (WT_IS_HS(S2BT(session)))
+ return WT_MAX(ta->newest_stop_durable_ts, ta->newest_stop_ts);
+ else
+ return WT_MAX(ta->newest_start_durable_ts, ta->newest_stop_durable_ts);
+}
+
+/*
* __rollback_page_needs_abort --
* Check whether the page needs rollback. Return true if the page has modifications newer than
* the given timestamp Otherwise return false.
@@ -730,16 +754,15 @@ __rollback_page_needs_abort(
*/
if (mod != NULL && mod->rec_result == WT_PM_REC_REPLACE) {
tag = "reconciled replace block";
- durable_ts = WT_MAX(
- mod->mod_replace.ta.newest_start_durable_ts, mod->mod_replace.ta.newest_stop_durable_ts);
+ durable_ts = __rollback_get_ref_max_durable_timestamp(session, &mod->mod_replace.ta);
prepared = mod->mod_replace.ta.prepare;
result = (durable_ts > rollback_timestamp) || prepared;
} else if (mod != NULL && mod->rec_result == WT_PM_REC_MULTIBLOCK) {
tag = "reconciled multi block";
/* Calculate the max durable timestamp by traversing all multi addresses. */
for (multi = mod->mod_multi, i = 0; i < mod->mod_multi_entries; ++multi, ++i) {
- durable_ts = WT_MAX(durable_ts, multi->addr.ta.newest_start_durable_ts);
- durable_ts = WT_MAX(durable_ts, multi->addr.ta.newest_stop_durable_ts);
+ durable_ts = WT_MAX(
+ durable_ts, __rollback_get_ref_max_durable_timestamp(session, &multi->addr.ta));
if (multi->addr.ta.prepare)
prepared = true;
}
@@ -748,12 +771,12 @@ __rollback_page_needs_abort(
tag = "on page cell";
/* Check if the page is obsolete using the page disk address. */
__wt_cell_unpack_addr(session, ref->home->dsk, (WT_CELL *)addr, &vpack);
- durable_ts = WT_MAX(vpack.ta.newest_start_durable_ts, vpack.ta.newest_stop_durable_ts);
+ durable_ts = __rollback_get_ref_max_durable_timestamp(session, &vpack.ta);
prepared = vpack.ta.prepare;
result = (durable_ts > rollback_timestamp) || prepared;
} else if (addr != NULL) {
tag = "address";
- durable_ts = WT_MAX(addr->ta.newest_start_durable_ts, addr->ta.newest_stop_durable_ts);
+ durable_ts = __rollback_get_ref_max_durable_timestamp(session, &addr->ta);
prepared = addr->ta.prepare;
result = (durable_ts > rollback_timestamp) || prepared;
}
@@ -987,19 +1010,18 @@ __rollback_to_stable_btree_hs_truncate(WT_SESSION_IMPL *session, uint32_t btree_
WT_UPDATE *hs_upd;
wt_timestamp_t hs_start_ts;
uint64_t hs_counter;
- uint32_t hs_btree_id, session_flags;
+ uint32_t hs_btree_id;
int exact;
char ts_string[WT_TS_INT_STRING_SIZE];
hs_cursor = NULL;
WT_CLEAR(key);
hs_upd = NULL;
- session_flags = 0;
WT_RET(__wt_scr_alloc(session, 0, &hs_key));
/* Open a history store table cursor. */
- WT_ERR(__wt_hs_cursor_open(session, &session_flags));
+ WT_ERR(__wt_hs_cursor_open(session));
hs_cursor = session->hs_cursor;
cbt = (WT_CURSOR_BTREE *)hs_cursor;
@@ -1048,7 +1070,7 @@ __rollback_to_stable_btree_hs_truncate(WT_SESSION_IMPL *session, uint32_t btree_
err:
__wt_scr_free(session, &hs_key);
__wt_free(session, hs_upd);
- WT_TRET(__wt_hs_cursor_close(session, session_flags));
+ WT_TRET(__wt_hs_cursor_close(session));
return (ret);
}
@@ -1064,7 +1086,7 @@ __rollback_to_stable_hs_final_pass(WT_SESSION_IMPL *session, wt_timestamp_t roll
WT_CONFIG ckptconf;
WT_CONFIG_ITEM cval, durableval, key;
WT_DECL_RET;
- wt_timestamp_t max_durable_ts, newest_start_durable_ts, newest_stop_durable_ts;
+ wt_timestamp_t max_durable_ts, newest_stop_durable_ts, newest_stop_ts;
char *config;
char ts_string[2][WT_TS_INT_STRING_SIZE];
@@ -1072,22 +1094,27 @@ __rollback_to_stable_hs_final_pass(WT_SESSION_IMPL *session, wt_timestamp_t roll
WT_RET(__wt_metadata_search(session, WT_HS_URI, &config));
- /* Find out the max durable timestamp of the object from checkpoint. */
- newest_start_durable_ts = newest_stop_durable_ts = WT_TS_NONE;
+ /*
+ * Find out the max durable timestamp of the history store from checkpoint. Most of the history
+ * store updates have stop timestamp either greater or equal to the start timestamp except for
+ * the updates written for the prepared updates on the data store. To abort the updates with no
+ * stop timestamp, we must include the newest stop timestamp also into the calculation of
+ * maximum timestamp of the history store.
+ */
+ newest_stop_durable_ts = newest_stop_ts = WT_TS_NONE;
WT_ERR(__wt_config_getones(session, config, "checkpoint", &cval));
__wt_config_subinit(session, &ckptconf, &cval);
for (; __wt_config_next(&ckptconf, &key, &cval) == 0;) {
- ret = __wt_config_subgets(session, &cval, "newest_start_durable_ts", &durableval);
- if (ret == 0)
- newest_start_durable_ts =
- WT_MAX(newest_start_durable_ts, (wt_timestamp_t)durableval.val);
- WT_ERR_NOTFOUND_OK(ret, false);
ret = __wt_config_subgets(session, &cval, "newest_stop_durable_ts", &durableval);
if (ret == 0)
newest_stop_durable_ts = WT_MAX(newest_stop_durable_ts, (wt_timestamp_t)durableval.val);
WT_ERR_NOTFOUND_OK(ret, false);
+ ret = __wt_config_subgets(session, &cval, "newest_stop_ts", &durableval);
+ if (ret == 0)
+ newest_stop_ts = WT_MAX(newest_stop_ts, (wt_timestamp_t)durableval.val);
+ WT_ERR_NOTFOUND_OK(ret, false);
}
- max_durable_ts = WT_MAX(newest_start_durable_ts, newest_stop_durable_ts);
+ max_durable_ts = WT_MAX(newest_stop_ts, newest_stop_durable_ts);
WT_ERR(__wt_session_get_dhandle(session, WT_HS_URI, NULL, NULL, 0));
/*
@@ -1299,7 +1326,7 @@ __wt_rollback_to_stable(WT_SESSION_IMPL *session, const char *cfg[], bool no_ckp
*/
if (!F_ISSET(S2C(session), WT_CONN_IN_MEMORY) && !no_ckpt)
WT_TRET(session->iface.checkpoint(&session->iface, "force=1"));
- WT_TRET(session->iface.close(&session->iface, NULL));
+ WT_TRET(__wt_session_close_internal(session));
return (ret);
}