summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Luke Chen <luke.chen@mongodb.com> 2022-08-08 11:53:41 +1000
committer: Evergreen Agent <no-reply@evergreen.mongodb.com> 2022-08-08 02:21:40 +0000
commit: 7521e350018cf8c7f313a3f34a458beb61d777f4 (patch)
tree: 3504b682dd7f94a190a3a97456a840bef635a286
parent: 98b529b09006bff5a43f80bda685d6205b62c505 (diff)
download: mongo-7521e350018cf8c7f313a3f34a458beb61d777f4.tar.gz
Import wiredtiger: 0a0e7777d06232d588bd1d8dd94df5c3a83dced3 from branch mongodb-5.0
ref: f151b51d8c..0a0e7777d0 for: 5.0.11 WT-8847 Add tombstone to WT_SAVE_UPD to truncate the update list upon page restore (#7703)
-rw-r--r-- src/third_party/wiredtiger/import.data 2
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_split.c 30
-rw-r--r-- src/third_party/wiredtiger/src/include/btmem.h 1
-rw-r--r-- src/third_party/wiredtiger/src/reconcile/rec_visibility.c 6
-rw-r--r-- src/third_party/wiredtiger/test/suite/test_rollback_to_stable32.py 114
5 files changed, 145 insertions, 8 deletions
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index 80396926a21..e34c48d86a9 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -2,5 +2,5 @@
"vendor": "wiredtiger",
"github": "wiredtiger/wiredtiger.git",
"branch": "mongodb-5.0",
- "commit": "f151b51d8cac62b5465deb847ed6f19c4dfcbe20"
+ "commit": "0a0e7777d06232d588bd1d8dd94df5c3a83dced3"
}
diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c
index 54fc452bd49..107f99b7285 100644
--- a/src/third_party/wiredtiger/src/btree/bt_split.c
+++ b/src/third_party/wiredtiger/src/btree/bt_split.c
@@ -1357,7 +1357,7 @@ __split_multi_inmem(WT_SESSION_IMPL *session, WT_PAGE *orig, WT_MULTI *multi, WT
WT_PAGE *page;
WT_PAGE_MODIFY *mod;
WT_SAVE_UPD *supd;
- WT_UPDATE *prev_onpage, *upd;
+ WT_UPDATE *prev_onpage, *upd, *tmp;
uint64_t recno;
uint32_t i, slot;
bool prepare;
@@ -1440,19 +1440,39 @@ __split_multi_inmem(WT_SESSION_IMPL *session, WT_PAGE *orig, WT_MULTI *multi, WT
if (supd->onpage_upd != NULL && !F_ISSET(S2C(session), WT_CONN_IN_MEMORY) &&
orig->type != WT_PAGE_COL_FIX) {
/*
+ * If there is an on-page tombstone we need to remove it as well while performing update
+ * restore eviction.
+ */
+ tmp = supd->onpage_tombstone != NULL ? supd->onpage_tombstone : supd->onpage_upd;
+
+ /*
* We have decided to restore this update chain so it must have newer updates than the
* onpage value on it.
*/
- WT_ASSERT(session, upd != supd->onpage_upd);
+ WT_ASSERT(session, upd != tmp);
+ WT_ASSERT(session, F_ISSET(tmp, WT_UPDATE_DS));
+
/*
* Move the pointer to the position before the onpage value and truncate all the updates
* starting from the onpage value.
*/
- for (prev_onpage = upd;
- prev_onpage->next != NULL && prev_onpage->next != supd->onpage_upd;
+ for (prev_onpage = upd; prev_onpage->next != NULL && prev_onpage->next != tmp;
prev_onpage = prev_onpage->next)
;
- WT_ASSERT(session, prev_onpage->next == supd->onpage_upd);
+ WT_ASSERT(session, prev_onpage->next == tmp);
+#ifdef HAVE_DIAGNOSTIC
+ /*
+ * During update restore eviction we remove anything older than the on-page update,
+ * including the on-page update. However it is possible a tombstone is also written as
+ * the stop time of the on-page value. To handle this we also need to remove the
+ * tombstone from the update chain.
+ *
+ * This assertion checks that there aren't any unexpected updates between that tombstone
+ * and the subsequent value which both make up the on-page value.
+ */
+ for (; tmp != NULL && tmp != supd->onpage_upd; tmp = tmp->next)
+ WT_ASSERT(session, tmp == supd->onpage_tombstone || tmp->txnid == WT_TXN_ABORTED);
+#endif
prev_onpage->next = NULL;
}
diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h
index 8b4ee968dc4..b0cea3be6ff 100644
--- a/src/third_party/wiredtiger/src/include/btmem.h
+++ b/src/third_party/wiredtiger/src/include/btmem.h
@@ -255,6 +255,7 @@ struct __wt_save_upd {
WT_INSERT *ins; /* Insert list reference */
WT_ROW *rip; /* Original on-page reference */
WT_UPDATE *onpage_upd;
+ WT_UPDATE *onpage_tombstone;
bool restore; /* Whether to restore this saved update chain */
};
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_visibility.c b/src/third_party/wiredtiger/src/reconcile/rec_visibility.c
index ba947c13ac4..2deb26f587f 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_visibility.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_visibility.c
@@ -26,7 +26,7 @@ __rec_update_stable(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_UPDATE *upd)
*/
static inline int
__rec_update_save(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, WT_ROW *rip,
- WT_UPDATE *onpage_upd, bool supd_restore, size_t upd_memsize)
+ WT_UPDATE *onpage_upd, WT_UPDATE *tombstone, bool supd_restore, size_t upd_memsize)
{
WT_SAVE_UPD *supd;
@@ -44,6 +44,7 @@ __rec_update_save(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, WT_
supd->ins = ins;
supd->rip = rip;
supd->onpage_upd = onpage_upd;
+ supd->onpage_tombstone = tombstone;
supd->restore = supd_restore;
++r->supd_next;
r->supd_memsize += upd_memsize;
@@ -757,7 +758,8 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, W
(has_newer_updates || F_ISSET(S2C(session), WT_CONN_IN_MEMORY) ||
page->type == WT_PAGE_COL_FIX);
- WT_ERR(__rec_update_save(session, r, ins, rip, onpage_upd, supd_restore, upd_memsize));
+ WT_ERR(__rec_update_save(
+ session, r, ins, rip, onpage_upd, tombstone, supd_restore, upd_memsize));
/*
* Mark the selected update (and potentially the tombstone preceding it) as being destined
diff --git a/src/third_party/wiredtiger/test/suite/test_rollback_to_stable32.py b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable32.py
new file mode 100644
index 00000000000..850dc145fe8
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable32.py
@@ -0,0 +1,114 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-present MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+from helper import simulate_crash_restart
+from test_rollback_to_stable01 import test_rollback_to_stable_base
+from wiredtiger import stat
+from wtdataset import SimpleDataSet
+from wtscenario import make_scenarios
+
+# test_rollback_to_stable32.py
+# Test that update restore eviction correctly removes an on-disk
+# tombstone. Previously it would trigger an assertion in reconciliation.
+class test_rollback_to_stable32(test_rollback_to_stable_base):
+
+ format_values = [
+ ('column', dict(key_format='r', value_format='S')),
+ ('row_integer', dict(key_format='i', value_format='S')),
+ ]
+
+ prepare_values = [
+ ('no_prepare', dict(prepare=False)),
+ ('prepare', dict(prepare=True))
+ ]
+
+ scenarios = make_scenarios(format_values, prepare_values)
+
+ def conn_config(self):
+ config = 'cache_size=100MB,statistics=(all)'
+ return config
+
+ def test_rollback_to_stable_with_update_restore_evict(self):
+ nrows = 1000
+ # Create a table.
+ uri = "table:rollback_to_stable32"
+ ds = SimpleDataSet(self, uri, 0, key_format=self.key_format, value_format=self.value_format,
+ config='split_pct=50')
+ ds.populate()
+
+ if self.value_format == '8t':  # no scenario in format_values above uses '8t', so this branch is not taken here
+ value_a = 97
+ value_b = 98
+ value_c = 99
+ else:
+ value_a = "aaaaa" * 100
+ value_b = "bbbbb" * 100
+ value_c = "ccccc" * 100
+
+ # Pin oldest and stable to timestamp 10.
+ self.conn.set_timestamp('oldest_timestamp=' + self.timestamp_str(10) +
+ ',stable_timestamp=' + self.timestamp_str(10))
+
+ # Perform several updates with value_a (the trailing 20 is presumably the commit timestamp; confirm in large_updates).
+ self.large_updates(uri, value_a, ds, nrows, self.prepare, 20)
+ # Perform several updates with value_b (trailing argument 30).
+ self.large_updates(uri, value_b, ds, nrows, self.prepare, 30)
+ # Perform several removes (trailing argument 40).
+ self.large_removes(uri, ds, nrows, self.prepare, 40)
+ # Pin stable to timestamp 50 if prepare otherwise 40.
+ if self.prepare:
+ self.conn.set_timestamp('stable_timestamp=' + self.timestamp_str(50))
+ else:
+ self.conn.set_timestamp('stable_timestamp=' + self.timestamp_str(40))
+ # Perform several updates and checkpoint.
+ self.large_updates(uri, value_c, ds, nrows, self.prepare, 60)
+ self.session.checkpoint()
+
+ # Verify data is visible and correct.
+ # (FLCS is not among this test's scenarios; in FLCS, the removed rows should read back as zero.)
+ self.check(value_a, uri, nrows, 21 if self.prepare else 20)
+ self.check(None, uri, 0, 41 if self.prepare else 40)
+ self.check(value_c, uri, nrows, 61 if self.prepare else 60)
+ self.evict_cursor(uri, nrows, value_c)
+
+ self.conn.rollback_to_stable()
+
+ self.conn.reconfigure("debug_mode=(eviction=false)")
+
+ # Write value_c to every key inside a transaction, then roll that transaction back (no checkpoint here).
+ cursor = self.session.open_cursor(uri)
+ self.session.begin_transaction()
+ for i in range(1, nrows + 1):
+ cursor[ds.key(i)] = value_c
+ cursor.close()
+ self.session.rollback_transaction()
+ self.session.breakpoint()  # NOTE(review): looks like a leftover debugging hook -- confirm it is intended
+ # Perform several updates with value_c, then call evict_cursor; no checkpoint is taken here.
+ self.large_updates(uri, value_c, ds, nrows, self.prepare, 60)
+ self.evict_cursor(uri, nrows, value_c)
+ self.check(value_b, uri, nrows, 31 if self.prepare else 30)
+ self.conn.close()