summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLuke Chen <luke.chen@mongodb.com>2021-11-01 17:42:27 +1100
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2021-11-01 07:08:29 +0000
commitd410535b8fb7fd6f93033bc66f3f653b8216428b (patch)
treec21c6bea47fd51d716aed8631c70d10e20b0d062
parent7a8f97641ecf75e1872e46a88c7a03721df8b249 (diff)
downloadmongo-d410535b8fb7fd6f93033bc66f3f653b8216428b.tar.gz
Import wiredtiger: 854df2c974559fddb3412f919d6421376f3b0ae7 from branch mongodb-master
ref: d3735be6df..854df2c974 for: 5.2.0 WT-8281 Fix out of order handling with history store stop timestamp
-rw-r--r--src/third_party/wiredtiger/import.data2
-rw-r--r--src/third_party/wiredtiger/src/history/hs_rec.c98
-rwxr-xr-xsrc/third_party/wiredtiger/test/suite/test_rollback_to_stable29.py100
3 files changed, 170 insertions, 30 deletions
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index f8f7fb490d4..baa8361c895 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -2,5 +2,5 @@
"vendor": "wiredtiger",
"github": "wiredtiger/wiredtiger.git",
"branch": "mongodb-master",
- "commit": "d3735be6dff0b323bb0bc876c65b4e98ad0aed83"
+ "commit": "854df2c974559fddb3412f919d6421376f3b0ae7"
}
diff --git a/src/third_party/wiredtiger/src/history/hs_rec.c b/src/third_party/wiredtiger/src/history/hs_rec.c
index 30fb40dc8eb..ec66f5b35a4 100644
--- a/src/third_party/wiredtiger/src/history/hs_rec.c
+++ b/src/third_party/wiredtiger/src/history/hs_rec.c
@@ -68,9 +68,7 @@ static int
__hs_insert_record(WT_SESSION_IMPL *session, WT_CURSOR *cursor, WT_BTREE *btree, const WT_ITEM *key,
const uint8_t type, const WT_ITEM *hs_value, WT_TIME_WINDOW *tw, bool error_on_ooo_ts)
{
-#ifdef HAVE_DIAGNOSTIC
WT_CURSOR_BTREE *hs_cbt;
-#endif
WT_DECL_ITEM(hs_key);
#ifdef HAVE_DIAGNOSTIC
WT_DECL_ITEM(existing_val);
@@ -87,7 +85,7 @@ __hs_insert_record(WT_SESSION_IMPL *session, WT_CURSOR *cursor, WT_BTREE *btree,
uint64_t counter, hs_counter;
uint32_t hs_btree_id;
- counter = 0;
+ counter = hs_counter = 0;
/*
* We might be entering this code from application thread's context. We should make sure that we
@@ -114,9 +112,10 @@ __hs_insert_record(WT_SESSION_IMPL *session, WT_CURSOR *cursor, WT_BTREE *btree,
#ifdef HAVE_DIAGNOSTIC
/* Allocate buffer for the existing history store value for the same key. */
WT_ERR(__wt_scr_alloc(session, 0, &existing_val));
- hs_cbt = __wt_curhs_get_cbt(cursor);
#endif
+ hs_cbt = __wt_curhs_get_cbt(cursor);
+
/* Sanity check that the btree is not a history store btree. */
WT_ASSERT(session, !WT_IS_HS(btree));
@@ -185,9 +184,16 @@ __hs_insert_record(WT_SESSION_IMPL *session, WT_CURSOR *cursor, WT_BTREE *btree,
* would have received WT_NOT_FOUND. In that case we need to search again with a higher
* timestamp.
*/
- if (ret == 0)
- WT_ERR_NOTFOUND_OK(cursor->next(cursor), true);
- else {
+ if (ret == 0) {
+ /*
+ * Check if the current history store update's stop timestamp is out of order with respect
+ * to the update to be inserted before before moving onto the next record.
+ */
+ if (hs_cbt->upd_value->tw.stop_ts <= tw->start_ts)
+ WT_ERR_NOTFOUND_OK(cursor->next(cursor), true);
+ else
+ counter = hs_counter + 1;
+ } else {
cursor->set_key(cursor, 3, btree->id, key, tw->start_ts + 1);
WT_ERR_NOTFOUND_OK(__wt_curhs_search_near_after(session, cursor), true);
}
@@ -869,8 +875,13 @@ __hs_delete_reinsert_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, ui
WT_ERR(__wt_compare(session, NULL, &hs_key, key, &cmp));
WT_ASSERT(session, cmp == 0);
#endif
- /* We find a key that is larger or equal to the specified timestamp*/
- if (hs_ts >= ts)
+ /*
+ * We have found a key with a timestamp larger than or equal to the specified timestamp.
+ * Always use the start timestamp retrieved from the key instead of the start timestamp from
+ * the cell. The cell's start timestamp can be cleared during reconciliation if it is
+ * globally visible.
+ */
+ if (hs_ts >= ts || twp->stop_ts >= ts)
break;
}
if (ret == WT_NOTFOUND)
@@ -896,30 +907,44 @@ __hs_delete_reinsert_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, ui
* see them after they've been moved due to their transaction id.
*
* For example, if we're inserting an update at timestamp 3 with value ddd:
- * btree key ts counter value
- * 2 foo 5 0 aaa
- * 2 foo 6 0 bbb
- * 2 foo 7 0 ccc
+ * btree key ts counter value stop_ts
+ * 2 foo 5 0 aaa 6
+ * 2 foo 6 0 bbb 7
+ * 2 foo 7 0 ccc 8
*
* We want to end up with this:
- * btree key ts counter value
- * 2 foo 3 0 aaa
- * 2 foo 3 1 bbb
- * 2 foo 3 2 ccc
- * 2 foo 3 3 ddd
+ * btree key ts counter value stop_ts
+ * 2 foo 3 0 aaa 3
+ * 2 foo 3 1 bbb 3
+ * 2 foo 3 2 ccc 3
+ * 2 foo 3 3 ddd 3
*
* Another example, if we're inserting an update at timestamp 0 with value ddd:
- * btree key ts counter value
- * 2 foo 5 0 aaa
- * 2 foo 6 0 bbb
- * 2 foo 7 0 ccc
+ * btree key ts counter value stop_ts
+ * 2 foo 5 0 aaa 6
+ * 2 foo 6 0 bbb 7
+ * 2 foo 7 0 ccc 8
+ *
+ * We want to end up with this:
+ * btree key ts counter value stop_ts
+ * 2 foo 0 0 aaa 0
+ * 2 foo 0 1 bbb 0
+ * 2 foo 0 2 ccc 0
+ * 2 foo 0 3 ddd 0
+ *
+ * Another example, if we're inserting an update at timestamp 3 with value ddd
+ * that is an out of order with a stop timestamp of 6:
+ * btree key ts counter value stop_ts
+ * 2 foo 1 0 aaa 6
+ * 2 foo 6 0 bbb 7
+ * 2 foo 7 0 ccc 8
*
* We want to end up with this:
- * btree key ts counter value
- * 2 foo 0 0 aaa
- * 2 foo 0 1 bbb
- * 2 foo 0 2 ccc
- * 2 foo 0 3 ddd
+ * btree key ts counter value stop_ts
+ * 2 foo 1 1 aaa 3
+ * 2 foo 3 2 bbb 3
+ * 2 foo 3 3 ccc 3
+ * 2 foo 3 4 ddd 3
*/
for (; ret == 0; ret = hs_cursor->next(hs_cursor)) {
/* We shouldn't have crossed the btree and user key search space. */
@@ -935,8 +960,14 @@ __hs_delete_reinsert_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, ui
* Our strategy to rectify this is to remove all records for the same key with a timestamp
* higher or equal than the specified timestamp and reinsert them at the smaller timestamp,
* which is the timestamp of the update we are about to insert to the history store.
+ *
+ * It is possible that the cursor next call can find an update that was reinserted when it
+ * had an out of order tombstone with respect to the new update. Continue the search by
+ * ignoring them.
*/
- WT_ASSERT(session, hs_ts >= ts);
+ __wt_hs_upd_time_window(hs_cursor, &twp);
+ if (hs_ts < ts && twp->stop_ts < ts)
+ continue;
if (reinsert) {
/*
@@ -965,7 +996,16 @@ __hs_delete_reinsert_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, ui
__wt_timestamp_to_string(hs_cbt->upd_value->tw.durable_stop_ts, ts_string[3]),
__wt_timestamp_to_string(ts, ts_string[4]));
- hs_insert_tw.start_ts = hs_insert_tw.durable_start_ts = ts - 1;
+ /*
+ * Use the original start time window's timestamps if it isn't out of order with respect
+ * to the new update.
+ */
+ if (hs_cbt->upd_value->tw.start_ts >= ts)
+ hs_insert_tw.start_ts = hs_insert_tw.durable_start_ts = ts - 1;
+ else {
+ hs_insert_tw.start_ts = hs_cbt->upd_value->tw.start_ts;
+ hs_insert_tw.durable_start_ts = hs_cbt->upd_value->tw.durable_start_ts;
+ }
hs_insert_tw.start_txn = hs_cbt->upd_value->tw.start_txn;
/*
diff --git a/src/third_party/wiredtiger/test/suite/test_rollback_to_stable29.py b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable29.py
new file mode 100755
index 00000000000..d40ef12d7c8
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable29.py
@@ -0,0 +1,100 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-present MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import os, threading, time
+from wtthread import checkpoint_thread
+import wiredtiger
+from wiredtiger import stat
+from wtdataset import SimpleDataSet
+from wtscenario import make_scenarios
+from helper import copy_wiredtiger_home, simulate_crash_restart
+from test_rollback_to_stable01 import test_rollback_to_stable_base
+
+# test_rollback_to_stable29.py
+# Test that the rollback to stable to verify the history store order when an out of order to a tombstone.
+class test_rollback_to_stable29(test_rollback_to_stable_base):
+ conn_config = 'cache_size=25MB,statistics=(all),statistics_log=(json,on_close,wait=1),log=(enabled=true)'
+
+ key_format_values = [
+ ('column', dict(key_format='r')),
+ ('integer_row', dict(key_format='i')),
+ ]
+
+ scenarios = make_scenarios(key_format_values)
+
+ def test_rollback_to_stable(self):
+ uri = 'table:test_rollback_to_stable29'
+ nrows = 100
+
+ # Create our table.
+ ds = SimpleDataSet(self, uri, 0, key_format=self.key_format, value_format='S',config='log=(enabled=false)')
+ ds.populate()
+
+ value_a = 'a' * 100
+ value_b = 'b' * 100
+ value_c = 'c' * 100
+ value_d = 'd' * 100
+
+ # Pin oldest and stable to timestamp 1.
+ self.conn.set_timestamp('oldest_timestamp=' + self.timestamp_str(1) +
+ ',stable_timestamp=' + self.timestamp_str(1))
+
+ self.large_updates(uri, value_a, ds, nrows, False, 10)
+ self.large_removes(uri, ds, nrows, False, 30)
+ self.large_updates(uri, value_b, ds, nrows, False, 40)
+ self.check(value_b, uri, nrows, 40)
+ self.large_updates(uri, value_c, ds, nrows, False, 50)
+ self.check(value_c, uri, nrows, 50)
+ self.evict_cursor(uri, nrows, value_c)
+
+ # Insert an out of order update.
+ self.session.breakpoint()
+ self.large_updates(uri, value_d, ds, nrows, False, 20)
+
+ self.check(value_a, uri, nrows, 10)
+ self.check(value_d, uri, nrows, 40)
+ self.check(value_d, uri, nrows, 50)
+ self.check(value_d, uri, nrows, 20)
+
+ # Pin stable to timestamp 10.
+ self.conn.set_timestamp('stable_timestamp=' + self.timestamp_str(10))
+ self.session.checkpoint()
+
+ # Simulate a crash by copying to a new directory(RESTART).
+ simulate_crash_restart(self, ".", "RESTART")
+
+ self.check(value_a, uri, nrows, 10)
+
+ stat_cursor = self.session.open_cursor('statistics:', None, None)
+ hs_removed = stat_cursor[stat.conn.txn_rts_hs_removed][2]
+ stat_cursor.close()
+
+ self.assertGreaterEqual(hs_removed, 3 * nrows)
+
+if __name__ == '__main__':
+ wttest.run()