diff options
author | Luke Chen <luke.chen@mongodb.com> | 2020-09-09 11:37:44 +1000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2020-09-09 01:52:32 +0000 |
commit | 0f0eb111c9dbd5dd4ca86529aee81bad59c6579b (patch) | |
tree | 94f3efa909755fcc8b381bf68214ebb9bb326964 /src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c | |
parent | 103b545528e7cf05537bd68bda4bc8235bf6fefa (diff) | |
download | mongo-0f0eb111c9dbd5dd4ca86529aee81bad59c6579b.tar.gz |
Import wiredtiger: 6a7db4f96fe828fdb1b9a31d83460c0573a2c3b1 from branch mongodb-4.4
ref: 579966149b..6a7db4f96f
for: 4.4.2
WT-5144 Use wt_clock instead of wt_epoch in perf programs
WT-5585 Remove cache_overflow config option
WT-5693 Enable test_wt4105_large_doc_small_upd
WT-5940 Migrate to Clang Format 10
WT-6000 Enhance incremental backup testing in format to support restart
WT-6006 Revert test binaries to 10 in checkpoint-stress-test
WT-6027 Fix docs spelling errors and warnings
WT-6322 Split full compatibility tests into smaller groups
WT-6390 Extend compact02 timeout from 8 => 10 minutes
WT-6404 Add timing stress that delays checkpoint after it calls __wt_txn_begin
WT-6451 Do not evict clean metadata pages if needed for historic reads
WT-6458 read row-store leaf pages with prepared updates in a single pass
WT-6463 History store operations should honor cache size
WT-6471 Avoid the error message for non-existent clang-format binary
WT-6472 Update timestamp_abort test cache configuration
WT-6478 Cursor cache statistics not getting incremented
WT-6505 Add debugging for missing file failure
WT-6507 Exit cache eviction worker after our operation has timed out
WT-6526 Fix assertion failure when opening DB in readonly mode after unclean shutdown
WT-6532 Consider update structure overhead in split length calculation
WT-6542 Add an assert to ensure we are not unintentionally returning empty values
WT-6544 Onpage value not appended to the tombstone restored from the data or history store
WT-6556 Fix internal sessions to use internal session close function than public API to avoid memory leak
WT-6559 Use the session id from the new session to determine statistics bucket
WT-6560 Fix usage of global salvage in WT utility
WT-6561 Provide MongoDB configuration in the wt utility usage output
WT-6569 Squash the prepared updates into a single update before writing it to data store
WT-6570 RTS to remove the left over updates in the history store without stop timestamp
WT-6571 Lseek cannot use error_sys_check because it does not return an int
WT-6577 History store dump outputs confusing time window
WT-6578 Prevent reconciliation from looking past the on-disk value
WT-6581 Fix class name in test_hs15
WT-6585 Panic if updates that are older than the updates in history store are inserted to history store
WT-6586 Tombstone inserted to history store should also be flagged as WT_UPDATE_HS
WT-6589 Fix disabled cursor cache python tests
WT-6591 Stop checkpoint thread before closing connection in Python tests
WT-6593 Retry conflicting operations in test_rollback_to_stable10
WT-6596 Increase cache for timestamp abort test and separate key spaces for all abort tests
WT-6598 Add new API allowing changing dhandle hash bucket size
WT-6602 Allow operation timeout ms to be passed to commit and rollback
WT-6604 Fix typo in the comment descibing WT_CELL structure
WT-6610 Fix incremental backup checkpoint parsing to handle upgrades
WT-6611 Revert enhancement allowing rename and incremental backup
WT-6613 Add python test for early_load flag
WT-6615 Initialize last_upd where it is actually used
WT-6619 Eliminate possibility of infinite loop in test_cursor13.py
WT-6623 Set the connection level file id in recovery file scan
WT-6625 Remove outdated TODO
WT-6635 Disable mix and column filetype test
WT-6640 Coverity: Failure to restore saved dhandle
WT-6641 Coverity: Unused value
Diffstat (limited to 'src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c')
-rw-r--r-- | src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c | 109 |
1 files changed, 68 insertions, 41 deletions
diff --git a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c index 2c97ddf48c7..65ae870b8fe 100644 --- a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c +++ b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c @@ -35,7 +35,8 @@ __rollback_abort_newer_update(WT_SESSION_IMPL *session, WT_UPDATE *first_upd, * is not configured for key consistency check, the timestamps could be out of order * here. */ - WT_ASSERT(session, !FLD_ISSET(S2BT(session)->assert_flags, WT_ASSERT_COMMIT_TS_KEYS) || + WT_ASSERT(session, + !FLD_ISSET(S2BT(session)->assert_flags, WT_ASSERT_COMMIT_TS_KEYS) || upd == first_upd); first_upd = upd->next; @@ -165,7 +166,7 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW WT_UPDATE *hs_upd, *tombstone, *upd; wt_timestamp_t hs_durable_ts, hs_start_ts, hs_stop_durable_ts, newer_hs_durable_ts; uint64_t hs_counter, type_full; - uint32_t hs_btree_id, session_flags; + uint32_t hs_btree_id; uint8_t type; int cmp; char ts_string[4][WT_TS_INT_STRING_SIZE]; @@ -178,7 +179,6 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW hs_upd = tombstone = upd = NULL; hs_durable_ts = hs_start_ts = hs_stop_durable_ts = WT_TS_NONE; hs_btree_id = S2BT(session)->id; - session_flags = 0; WT_CLEAR(full_value); valid_update_found = false; #ifdef HAVE_DIAGNOSTIC @@ -200,7 +200,7 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW newer_hs_durable_ts = unpack->tw.durable_start_ts; /* Open a history store table cursor. */ - WT_ERR(__wt_hs_cursor_open(session, &session_flags)); + WT_ERR(__wt_hs_cursor_open(session)); hs_cursor = session->hs_cursor; cbt = (WT_CURSOR_BTREE *)hs_cursor; @@ -263,8 +263,9 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW * records newer than or equal to the onpage value if eviction runs concurrently with * checkpoint. In that case, don't verify the first record. */ - WT_ASSERT(session, hs_stop_durable_ts <= newer_hs_durable_ts || - hs_start_ts == hs_stop_durable_ts || first_record); + WT_ASSERT(session, + hs_stop_durable_ts <= newer_hs_durable_ts || hs_start_ts == hs_stop_durable_ts || + first_record); if (hs_stop_durable_ts < newer_hs_durable_ts) WT_STAT_CONN_INCR(session, txn_rts_hs_stop_older_than_newer_start); @@ -286,8 +287,8 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW /* Stop processing when we find a stable update according to the given timestamp. */ if (hs_durable_ts <= rollback_timestamp) { __wt_verbose(session, WT_VERB_RTS, - "history store update valid with start timestamp: %s, durable timestamp: %s, " - "stop timestamp: %s and stable timestamp: %s", + "history store update valid with start timestamp: %s, durable timestamp: %s, stop " + "timestamp: %s and stable timestamp: %s", __wt_timestamp_to_string(hs_start_ts, ts_string[0]), __wt_timestamp_to_string(hs_durable_ts, ts_string[1]), __wt_timestamp_to_string(hs_stop_durable_ts, ts_string[2]), @@ -331,8 +332,9 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW upd->txnid = cbt->upd_value->tw.start_txn; upd->durable_ts = cbt->upd_value->tw.durable_start_ts; upd->start_ts = cbt->upd_value->tw.start_ts; - __wt_verbose(session, WT_VERB_RTS, "update restored from history store (txnid: %" PRIu64 - ", start_ts: %s, durable_ts: %s", + __wt_verbose(session, WT_VERB_RTS, + "update restored from history store (txnid: %" PRIu64 + ", start_ts: %s, durable_ts: %s", upd->txnid, __wt_timestamp_to_string(upd->start_ts, ts_string[0]), __wt_timestamp_to_string(upd->durable_ts, ts_string[1])); @@ -395,7 +397,7 @@ err: __wt_scr_free(session, &hs_value); __wt_scr_free(session, &key); __wt_buf_free(session, &full_value); - WT_TRET(__wt_hs_cursor_close(session, session_flags)); + WT_TRET(__wt_hs_cursor_close(session)); return (ret); } @@ -421,10 +423,15 @@ __rollback_abort_row_ondisk_kv( __wt_row_leaf_value_cell(session, page, rip, NULL, vpack); prepared = vpack->tw.prepare; if (WT_IS_HS(S2BT(session))) { - if (vpack->tw.durable_stop_ts > rollback_timestamp) { + /* + * Abort the history store update with stop durable timestamp greater than the stable + * timestamp or the updates with max stop timestamp which implies that they are associated + * with prepared transactions. + */ + if (vpack->tw.durable_stop_ts > rollback_timestamp || vpack->tw.stop_ts == WT_TS_MAX) { __wt_verbose(session, WT_VERB_RTS, - "hs update aborted with start durable/commit timestamp: %s, %s, " - "stop durable/commit timestamp: %s, %s and stable timestamp: %s", + "hs update aborted with start durable/commit timestamp: %s, %s, stop durable/commit " + "timestamp: %s, %s and stable timestamp: %s", __wt_timestamp_to_string(vpack->tw.durable_start_ts, ts_string[0]), __wt_timestamp_to_string(vpack->tw.start_ts, ts_string[1]), __wt_timestamp_to_string(vpack->tw.durable_stop_ts, ts_string[2]), @@ -595,11 +602,11 @@ __rollback_abort_row_reconciled_page( if (mod->rec_result == WT_PM_REC_REPLACE && (mod->mod_replace.ta.newest_start_durable_ts > rollback_timestamp || - mod->mod_replace.ta.newest_stop_durable_ts > rollback_timestamp || - mod->mod_replace.ta.prepare)) { + mod->mod_replace.ta.newest_stop_durable_ts > rollback_timestamp || + mod->mod_replace.ta.prepare)) { __wt_verbose(session, WT_VERB_RTS, - "reconciled replace block page history store update removal on-disk with start " - "durable timestamp: %s, stop durable timestamp: %s and stable timestamp: %s", + "reconciled replace block page history store update removal on-disk with start durable " + "timestamp: %s, stop durable timestamp: %s and stable timestamp: %s", __wt_timestamp_to_string(mod->mod_replace.ta.newest_start_durable_ts, ts_string[0]), __wt_timestamp_to_string(mod->mod_replace.ta.newest_stop_durable_ts, ts_string[1]), __wt_timestamp_to_string(rollback_timestamp, ts_string[2])); @@ -622,9 +629,8 @@ __rollback_abort_row_reconciled_page( multi->addr.ta.newest_stop_durable_ts > rollback_timestamp || multi->addr.ta.prepare) { __wt_verbose(session, WT_VERB_RTS, - "reconciled multi block page history store update removal on-disk with " - "start durable timestamp: %s, stop durable timestamp: %s and stable " - "timestamp: %s", + "reconciled multi block page history store update removal on-disk with start " + "durable timestamp: %s, stop durable timestamp: %s and stable timestamp: %s", __wt_timestamp_to_string(multi->addr.ta.newest_start_durable_ts, ts_string[0]), __wt_timestamp_to_string(multi->addr.ta.newest_stop_durable_ts, ts_string[1]), __wt_timestamp_to_string(rollback_timestamp, ts_string[2])); @@ -696,6 +702,24 @@ __rollback_abort_newer_row_leaf( } /* + * __rollback_get_ref_max_durable_timestamp -- + * Returns the ref aggregated max durable timestamp. The max durable timestamp is calculated + * between both start and stop durable timestamps except for history store, because most of the + * history store updates have stop timestamp either greater or equal to the start timestamp + * except for the updates written for the prepared updates on the data store. To abort the + * updates with no stop timestamp, we must include the newest stop timestamp also into the + * calculation of maximum durable timestamp of the history store. + */ +static wt_timestamp_t +__rollback_get_ref_max_durable_timestamp(WT_SESSION_IMPL *session, WT_TIME_AGGREGATE *ta) +{ + if (WT_IS_HS(S2BT(session))) + return WT_MAX(ta->newest_stop_durable_ts, ta->newest_stop_ts); + else + return WT_MAX(ta->newest_start_durable_ts, ta->newest_stop_durable_ts); +} + +/* * __rollback_page_needs_abort -- * Check whether the page needs rollback. Return true if the page has modifications newer than * the given timestamp Otherwise return false. @@ -730,16 +754,15 @@ __rollback_page_needs_abort( */ if (mod != NULL && mod->rec_result == WT_PM_REC_REPLACE) { tag = "reconciled replace block"; - durable_ts = WT_MAX( - mod->mod_replace.ta.newest_start_durable_ts, mod->mod_replace.ta.newest_stop_durable_ts); + durable_ts = __rollback_get_ref_max_durable_timestamp(session, &mod->mod_replace.ta); prepared = mod->mod_replace.ta.prepare; result = (durable_ts > rollback_timestamp) || prepared; } else if (mod != NULL && mod->rec_result == WT_PM_REC_MULTIBLOCK) { tag = "reconciled multi block"; /* Calculate the max durable timestamp by traversing all multi addresses. */ for (multi = mod->mod_multi, i = 0; i < mod->mod_multi_entries; ++multi, ++i) { - durable_ts = WT_MAX(durable_ts, multi->addr.ta.newest_start_durable_ts); - durable_ts = WT_MAX(durable_ts, multi->addr.ta.newest_stop_durable_ts); + durable_ts = WT_MAX( + durable_ts, __rollback_get_ref_max_durable_timestamp(session, &multi->addr.ta)); if (multi->addr.ta.prepare) prepared = true; } @@ -748,12 +771,12 @@ __rollback_page_needs_abort( tag = "on page cell"; /* Check if the page is obsolete using the page disk address. */ __wt_cell_unpack_addr(session, ref->home->dsk, (WT_CELL *)addr, &vpack); - durable_ts = WT_MAX(vpack.ta.newest_start_durable_ts, vpack.ta.newest_stop_durable_ts); + durable_ts = __rollback_get_ref_max_durable_timestamp(session, &vpack.ta); prepared = vpack.ta.prepare; result = (durable_ts > rollback_timestamp) || prepared; } else if (addr != NULL) { tag = "address"; - durable_ts = WT_MAX(addr->ta.newest_start_durable_ts, addr->ta.newest_stop_durable_ts); + durable_ts = __rollback_get_ref_max_durable_timestamp(session, &addr->ta); prepared = addr->ta.prepare; result = (durable_ts > rollback_timestamp) || prepared; } @@ -987,19 +1010,18 @@ __rollback_to_stable_btree_hs_truncate(WT_SESSION_IMPL *session, uint32_t btree_ WT_UPDATE *hs_upd; wt_timestamp_t hs_start_ts; uint64_t hs_counter; - uint32_t hs_btree_id, session_flags; + uint32_t hs_btree_id; int exact; char ts_string[WT_TS_INT_STRING_SIZE]; hs_cursor = NULL; WT_CLEAR(key); hs_upd = NULL; - session_flags = 0; WT_RET(__wt_scr_alloc(session, 0, &hs_key)); /* Open a history store table cursor. */ - WT_ERR(__wt_hs_cursor_open(session, &session_flags)); + WT_ERR(__wt_hs_cursor_open(session)); hs_cursor = session->hs_cursor; cbt = (WT_CURSOR_BTREE *)hs_cursor; @@ -1048,7 +1070,7 @@ __rollback_to_stable_btree_hs_truncate(WT_SESSION_IMPL *session, uint32_t btree_ err: __wt_scr_free(session, &hs_key); __wt_free(session, hs_upd); - WT_TRET(__wt_hs_cursor_close(session, session_flags)); + WT_TRET(__wt_hs_cursor_close(session)); return (ret); } @@ -1064,7 +1086,7 @@ __rollback_to_stable_hs_final_pass(WT_SESSION_IMPL *session, wt_timestamp_t roll WT_CONFIG ckptconf; WT_CONFIG_ITEM cval, durableval, key; WT_DECL_RET; - wt_timestamp_t max_durable_ts, newest_start_durable_ts, newest_stop_durable_ts; + wt_timestamp_t max_durable_ts, newest_stop_durable_ts, newest_stop_ts; char *config; char ts_string[2][WT_TS_INT_STRING_SIZE]; @@ -1072,22 +1094,27 @@ __rollback_to_stable_hs_final_pass(WT_SESSION_IMPL *session, wt_timestamp_t roll WT_RET(__wt_metadata_search(session, WT_HS_URI, &config)); - /* Find out the max durable timestamp of the object from checkpoint. */ - newest_start_durable_ts = newest_stop_durable_ts = WT_TS_NONE; + /* + * Find out the max durable timestamp of the history store from checkpoint. Most of the history + * store updates have stop timestamp either greater or equal to the start timestamp except for + * the updates written for the prepared updates on the data store. To abort the updates with no + * stop timestamp, we must include the newest stop timestamp also into the calculation of + * maximum timestamp of the history store. + */ + newest_stop_durable_ts = newest_stop_ts = WT_TS_NONE; WT_ERR(__wt_config_getones(session, config, "checkpoint", &cval)); __wt_config_subinit(session, &ckptconf, &cval); for (; __wt_config_next(&ckptconf, &key, &cval) == 0;) { - ret = __wt_config_subgets(session, &cval, "newest_start_durable_ts", &durableval); - if (ret == 0) - newest_start_durable_ts = - WT_MAX(newest_start_durable_ts, (wt_timestamp_t)durableval.val); - WT_ERR_NOTFOUND_OK(ret, false); ret = __wt_config_subgets(session, &cval, "newest_stop_durable_ts", &durableval); if (ret == 0) newest_stop_durable_ts = WT_MAX(newest_stop_durable_ts, (wt_timestamp_t)durableval.val); WT_ERR_NOTFOUND_OK(ret, false); + ret = __wt_config_subgets(session, &cval, "newest_stop_ts", &durableval); + if (ret == 0) + newest_stop_ts = WT_MAX(newest_stop_ts, (wt_timestamp_t)durableval.val); + WT_ERR_NOTFOUND_OK(ret, false); } - max_durable_ts = WT_MAX(newest_start_durable_ts, newest_stop_durable_ts); + max_durable_ts = WT_MAX(newest_stop_ts, newest_stop_durable_ts); WT_ERR(__wt_session_get_dhandle(session, WT_HS_URI, NULL, NULL, 0)); /* @@ -1299,7 +1326,7 @@ __wt_rollback_to_stable(WT_SESSION_IMPL *session, const char *cfg[], bool no_ckp */ if (!F_ISSET(S2C(session), WT_CONN_IN_MEMORY) && !no_ckpt) WT_TRET(session->iface.checkpoint(&session->iface, "force=1")); - WT_TRET(session->iface.close(&session->iface, NULL)); + WT_TRET(__wt_session_close_internal(session)); return (ret); } |