diff options
-rw-r--r-- | src/third_party/wiredtiger/import.data | 2 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/history/hs_rec.c | 98 | ||||
-rwxr-xr-x | src/third_party/wiredtiger/test/suite/test_rollback_to_stable29.py | 100 |
3 files changed, 170 insertions, 30 deletions
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index f8f7fb490d4..baa8361c895 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -2,5 +2,5 @@ "vendor": "wiredtiger", "github": "wiredtiger/wiredtiger.git", "branch": "mongodb-master", - "commit": "d3735be6dff0b323bb0bc876c65b4e98ad0aed83" + "commit": "854df2c974559fddb3412f919d6421376f3b0ae7" } diff --git a/src/third_party/wiredtiger/src/history/hs_rec.c b/src/third_party/wiredtiger/src/history/hs_rec.c index 30fb40dc8eb..ec66f5b35a4 100644 --- a/src/third_party/wiredtiger/src/history/hs_rec.c +++ b/src/third_party/wiredtiger/src/history/hs_rec.c @@ -68,9 +68,7 @@ static int __hs_insert_record(WT_SESSION_IMPL *session, WT_CURSOR *cursor, WT_BTREE *btree, const WT_ITEM *key, const uint8_t type, const WT_ITEM *hs_value, WT_TIME_WINDOW *tw, bool error_on_ooo_ts) { -#ifdef HAVE_DIAGNOSTIC WT_CURSOR_BTREE *hs_cbt; -#endif WT_DECL_ITEM(hs_key); #ifdef HAVE_DIAGNOSTIC WT_DECL_ITEM(existing_val); @@ -87,7 +85,7 @@ __hs_insert_record(WT_SESSION_IMPL *session, WT_CURSOR *cursor, WT_BTREE *btree, uint64_t counter, hs_counter; uint32_t hs_btree_id; - counter = 0; + counter = hs_counter = 0; /* * We might be entering this code from application thread's context. We should make sure that we @@ -114,9 +112,10 @@ __hs_insert_record(WT_SESSION_IMPL *session, WT_CURSOR *cursor, WT_BTREE *btree, #ifdef HAVE_DIAGNOSTIC /* Allocate buffer for the existing history store value for the same key. */ WT_ERR(__wt_scr_alloc(session, 0, &existing_val)); - hs_cbt = __wt_curhs_get_cbt(cursor); #endif + hs_cbt = __wt_curhs_get_cbt(cursor); + /* Sanity check that the btree is not a history store btree. */ WT_ASSERT(session, !WT_IS_HS(btree)); @@ -185,9 +184,16 @@ __hs_insert_record(WT_SESSION_IMPL *session, WT_CURSOR *cursor, WT_BTREE *btree, * would have received WT_NOT_FOUND. In that case we need to search again with a higher * timestamp. */ - if (ret == 0) - WT_ERR_NOTFOUND_OK(cursor->next(cursor), true); - else { + if (ret == 0) { + /* + * Check if the current history store update's stop timestamp is out of order with respect + * to the update to be inserted before before moving onto the next record. + */ + if (hs_cbt->upd_value->tw.stop_ts <= tw->start_ts) + WT_ERR_NOTFOUND_OK(cursor->next(cursor), true); + else + counter = hs_counter + 1; + } else { cursor->set_key(cursor, 3, btree->id, key, tw->start_ts + 1); WT_ERR_NOTFOUND_OK(__wt_curhs_search_near_after(session, cursor), true); } @@ -869,8 +875,13 @@ __hs_delete_reinsert_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, ui WT_ERR(__wt_compare(session, NULL, &hs_key, key, &cmp)); WT_ASSERT(session, cmp == 0); #endif - /* We find a key that is larger or equal to the specified timestamp*/ - if (hs_ts >= ts) + /* + * We have found a key with a timestamp larger than or equal to the specified timestamp. + * Always use the start timestamp retrieved from the key instead of the start timestamp from + * the cell. The cell's start timestamp can be cleared during reconciliation if it is + * globally visible. + */ + if (hs_ts >= ts || twp->stop_ts >= ts) break; } if (ret == WT_NOTFOUND) @@ -896,30 +907,44 @@ __hs_delete_reinsert_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, ui * see them after they've been moved due to their transaction id. * * For example, if we're inserting an update at timestamp 3 with value ddd: - * btree key ts counter value - * 2 foo 5 0 aaa - * 2 foo 6 0 bbb - * 2 foo 7 0 ccc + * btree key ts counter value stop_ts + * 2 foo 5 0 aaa 6 + * 2 foo 6 0 bbb 7 + * 2 foo 7 0 ccc 8 * * We want to end up with this: - * btree key ts counter value - * 2 foo 3 0 aaa - * 2 foo 3 1 bbb - * 2 foo 3 2 ccc - * 2 foo 3 3 ddd + * btree key ts counter value stop_ts + * 2 foo 3 0 aaa 3 + * 2 foo 3 1 bbb 3 + * 2 foo 3 2 ccc 3 + * 2 foo 3 3 ddd 3 * * Another example, if we're inserting an update at timestamp 0 with value ddd: - * btree key ts counter value - * 2 foo 5 0 aaa - * 2 foo 6 0 bbb - * 2 foo 7 0 ccc + * btree key ts counter value stop_ts + * 2 foo 5 0 aaa 6 + * 2 foo 6 0 bbb 7 + * 2 foo 7 0 ccc 8 + * + * We want to end up with this: + * btree key ts counter value stop_ts + * 2 foo 0 0 aaa 0 + * 2 foo 0 1 bbb 0 + * 2 foo 0 2 ccc 0 + * 2 foo 0 3 ddd 0 + * + * Another example, if we're inserting an update at timestamp 3 with value ddd + * that is an out of order with a stop timestamp of 6: + * btree key ts counter value stop_ts + * 2 foo 1 0 aaa 6 + * 2 foo 6 0 bbb 7 + * 2 foo 7 0 ccc 8 * * We want to end up with this: - * btree key ts counter value - * 2 foo 0 0 aaa - * 2 foo 0 1 bbb - * 2 foo 0 2 ccc - * 2 foo 0 3 ddd + * btree key ts counter value stop_ts + * 2 foo 1 1 aaa 3 + * 2 foo 3 2 bbb 3 + * 2 foo 3 3 ccc 3 + * 2 foo 3 4 ddd 3 */ for (; ret == 0; ret = hs_cursor->next(hs_cursor)) { /* We shouldn't have crossed the btree and user key search space. */ @@ -935,8 +960,14 @@ __hs_delete_reinsert_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, ui * Our strategy to rectify this is to remove all records for the same key with a timestamp * higher or equal than the specified timestamp and reinsert them at the smaller timestamp, * which is the timestamp of the update we are about to insert to the history store. + * + * It is possible that the cursor next call can find an update that was reinserted when it + * had an out of order tombstone with respect to the new update. Continue the search by + * ignoring them. */ - WT_ASSERT(session, hs_ts >= ts); + __wt_hs_upd_time_window(hs_cursor, &twp); + if (hs_ts < ts && twp->stop_ts < ts) + continue; if (reinsert) { /* @@ -965,7 +996,16 @@ __hs_delete_reinsert_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, ui __wt_timestamp_to_string(hs_cbt->upd_value->tw.durable_stop_ts, ts_string[3]), __wt_timestamp_to_string(ts, ts_string[4])); - hs_insert_tw.start_ts = hs_insert_tw.durable_start_ts = ts - 1; + /* + * Use the original start time window's timestamps if it isn't out of order with respect + * to the new update. + */ + if (hs_cbt->upd_value->tw.start_ts >= ts) + hs_insert_tw.start_ts = hs_insert_tw.durable_start_ts = ts - 1; + else { + hs_insert_tw.start_ts = hs_cbt->upd_value->tw.start_ts; + hs_insert_tw.durable_start_ts = hs_cbt->upd_value->tw.durable_start_ts; + } hs_insert_tw.start_txn = hs_cbt->upd_value->tw.start_txn; /* diff --git a/src/third_party/wiredtiger/test/suite/test_rollback_to_stable29.py b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable29.py new file mode 100755 index 00000000000..d40ef12d7c8 --- /dev/null +++ b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable29.py @@ -0,0 +1,100 @@ +#!/usr/bin/env python +# +# Public Domain 2014-present MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +import os, threading, time +from wtthread import checkpoint_thread +import wiredtiger +from wiredtiger import stat +from wtdataset import SimpleDataSet +from wtscenario import make_scenarios +from helper import copy_wiredtiger_home, simulate_crash_restart +from test_rollback_to_stable01 import test_rollback_to_stable_base + +# test_rollback_to_stable29.py +# Test that the rollback to stable to verify the history store order when an out of order to a tombstone. +class test_rollback_to_stable29(test_rollback_to_stable_base): + conn_config = 'cache_size=25MB,statistics=(all),statistics_log=(json,on_close,wait=1),log=(enabled=true)' + + key_format_values = [ + ('column', dict(key_format='r')), + ('integer_row', dict(key_format='i')), + ] + + scenarios = make_scenarios(key_format_values) + + def test_rollback_to_stable(self): + uri = 'table:test_rollback_to_stable29' + nrows = 100 + + # Create our table. + ds = SimpleDataSet(self, uri, 0, key_format=self.key_format, value_format='S',config='log=(enabled=false)') + ds.populate() + + value_a = 'a' * 100 + value_b = 'b' * 100 + value_c = 'c' * 100 + value_d = 'd' * 100 + + # Pin oldest and stable to timestamp 1. + self.conn.set_timestamp('oldest_timestamp=' + self.timestamp_str(1) + + ',stable_timestamp=' + self.timestamp_str(1)) + + self.large_updates(uri, value_a, ds, nrows, False, 10) + self.large_removes(uri, ds, nrows, False, 30) + self.large_updates(uri, value_b, ds, nrows, False, 40) + self.check(value_b, uri, nrows, 40) + self.large_updates(uri, value_c, ds, nrows, False, 50) + self.check(value_c, uri, nrows, 50) + self.evict_cursor(uri, nrows, value_c) + + # Insert an out of order update. + self.session.breakpoint() + self.large_updates(uri, value_d, ds, nrows, False, 20) + + self.check(value_a, uri, nrows, 10) + self.check(value_d, uri, nrows, 40) + self.check(value_d, uri, nrows, 50) + self.check(value_d, uri, nrows, 20) + + # Pin stable to timestamp 10. + self.conn.set_timestamp('stable_timestamp=' + self.timestamp_str(10)) + self.session.checkpoint() + + # Simulate a crash by copying to a new directory(RESTART). + simulate_crash_restart(self, ".", "RESTART") + + self.check(value_a, uri, nrows, 10) + + stat_cursor = self.session.open_cursor('statistics:', None, None) + hs_removed = stat_cursor[stat.conn.txn_rts_hs_removed][2] + stat_cursor.close() + + self.assertGreaterEqual(hs_removed, 3 * nrows) + +if __name__ == '__main__': + wttest.run() |