summary refs log tree commit diff
diff options
context:
space:
mode:
author Chenhao Qu <chenhao.qu@mongodb.com> 2021-10-06 06:49:53 +0000
committer Evergreen Agent <no-reply@evergreen.mongodb.com> 2021-10-06 07:30:19 +0000
commit a393aecbf12975bcd76f338b28524afe63dc88c6 (patch)
tree f5cb15a0e67053bd5859965d3efaed0f80c8a209
parent 6510de96ffa12fc284377d2127977837438f9c06 (diff)
download mongo-a393aecbf12975bcd76f338b28524afe63dc88c6.tar.gz
Import wiredtiger: e5c230df2773ea131e76bc73ec2ec059748f2dde from branch mongodb-master
ref: 284d0eff1c..e5c230df27 for: 5.2.0 WT-8193 Wrong corner case in VLCS rollback-to-stable
-rw-r--r-- src/third_party/wiredtiger/import.data 2
-rw-r--r-- src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c 108
-rw-r--r-- src/third_party/wiredtiger/test/suite/test_rollback_to_stable27.py 116
3 files changed, 196 insertions, 30 deletions
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index 1926f489920..279154919c0 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -2,5 +2,5 @@
"vendor": "wiredtiger",
"github": "wiredtiger/wiredtiger.git",
"branch": "mongodb-master",
- "commit": "284d0eff1c7fda7c3ba52a989df8049de258d4b4"
+ "commit": "e5c230df2773ea131e76bc73ec2ec059748f2dde"
}
diff --git a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
index 5499f5fffbe..6aee858e94b 100644
--- a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
+++ b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
@@ -155,17 +155,19 @@ __rollback_abort_update(WT_SESSION_IMPL *session, WT_ITEM *key, WT_UPDATE *first
/*
* __rollback_abort_insert_list --
- * Apply the update abort check to each entry in an insert skip list.
+ * Apply the update abort check to each entry in an insert skip list. Return how many entries
+ * had stable updates.
*/
static int
__rollback_abort_insert_list(WT_SESSION_IMPL *session, WT_PAGE *page, WT_INSERT_HEAD *head,
- wt_timestamp_t rollback_timestamp, bool *stable_update_found)
+ wt_timestamp_t rollback_timestamp, uint32_t *stable_updates_count)
{
WT_DECL_ITEM(key);
WT_DECL_RET;
WT_INSERT *ins;
uint64_t recno;
uint8_t *memp;
+ bool stable_update_found;
WT_ERR(
__wt_scr_alloc(session, page->type == WT_PAGE_ROW_LEAF ? 0 : WT_INTPACK64_MAXSIZE, &key));
@@ -182,7 +184,9 @@ __rollback_abort_insert_list(WT_SESSION_IMPL *session, WT_PAGE *page, WT_INSERT_
key->size = WT_PTRDIFF(memp, key->data);
}
WT_ERR(__rollback_abort_update(
- session, key, ins->upd, rollback_timestamp, stable_update_found));
+ session, key, ins->upd, rollback_timestamp, &stable_update_found));
+ if (stable_update_found && stable_updates_count != NULL)
+ (*stable_updates_count)++;
}
err:
@@ -191,6 +195,19 @@ err:
}
/*
+ * __rollback_has_stable_update --
+ * Check if an update chain has a stable update on it. Assume the update chain has already been
+ * processed so all we need to do is look for a valid, non-aborted entry.
+ */
+static bool
+__rollback_has_stable_update(WT_UPDATE *upd)
+{
+ while (upd != NULL && (upd->type == WT_UPDATE_INVALID || upd->txnid == WT_TXN_ABORTED))
+ upd = upd->next;
+ return upd != NULL;
+}
+
+/*
* __rollback_col_modify --
* Add the provided update to the head of the update list.
*/
@@ -769,11 +786,12 @@ __rollback_abort_col_var(WT_SESSION_IMPL *session, WT_REF *ref, wt_timestamp_t r
WT_CELL *kcell;
WT_CELL_UNPACK_KV unpack;
WT_COL *cip;
- WT_INSERT_HEAD *ins;
+ WT_INSERT *ins;
+ WT_INSERT_HEAD *inshead;
WT_PAGE *page;
- uint64_t recno, rle;
- uint32_t i, j;
- bool is_ondisk_stable, stable_update_found;
+ uint64_t ins_recno, recno, rle;
+ uint32_t i, j, stable_updates_count;
+ bool is_ondisk_stable;
page = ref->page;
/*
@@ -786,11 +804,11 @@ __rollback_abort_col_var(WT_SESSION_IMPL *session, WT_REF *ref, wt_timestamp_t r
/* Review the changes to the original on-page data items. */
WT_COL_FOREACH (page, cip, i) {
- stable_update_found = false;
+ stable_updates_count = 0;
- if ((ins = WT_COL_UPDATE(page, cip)) != NULL)
+ if ((inshead = WT_COL_UPDATE(page, cip)) != NULL)
WT_RET(__rollback_abort_insert_list(
- session, page, ins, rollback_timestamp, &stable_update_found));
+ session, page, inshead, rollback_timestamp, &stable_updates_count));
if (page->dsk != NULL) {
/* Unpack the cell. We need its RLE count whether or not we're going to iterate it. */
@@ -799,44 +817,76 @@ __rollback_abort_col_var(WT_SESSION_IMPL *session, WT_REF *ref, wt_timestamp_t r
rle = __wt_cell_rle(&unpack);
/*
- * If we found a stable update on the insert list, this key needs no further attention.
- * Any other keys in this cell with stable updates also do not require attention. But
- * beyond that, the on-disk value must be older than the update we found. That means it
- * too is stable(*), so any keys in the cell that _don't_ have stable updates on the
- * update list don't need further attention either. (And any unstable updates were just
- * handled above.) Thus we can skip iterating over the cell.
+ * Each key whose on-disk value is not stable and has no stable update on the update
+ * list must be processed downstream.
*
- * Furthermore, if the cell is deleted it must be
- * itself stable, because cells only appear as deleted if there is no older value that
- * might need to be restored. We can skip iterating over the cell.
+ * If we can determine that the cell's on-disk value is stable, we can skip iterating
+ * over the cell; likewise, if we can determine that every key in the cell has a stable
+ * update on the update list, we can skip the iteration. Otherwise we have to try each
+ * key.
*
- * (*) Either that, or the update is not timestamped, in which case the on-disk value
- * might not be stable but the non-timestamp update will hide it until the next
- * reconciliation and then overwrite it.
+ * If the on-disk cell is deleted, it is stable, because cells only appear as deleted
+ * when there is no older value that might need to be restored.
+ *
+ * Note that in a purely timestamped world, the presence of any stable update for any
+ * key in the cell means the on-disk value must be stable, because the update must be
+ * newer than the on-disk value. However, this is no longer true if the stable update
+ * has no timestamp. It may also not be true if the on-disk value is prepared, or other
+ * corner cases. Therefore, we must iterate the cell unless _every_ key has a stable
+ * update.
+ *
+ * We can, however, stop iterating as soon as the downstream code reports back that the
+ * on-disk value is actually stable.
*/
- if (stable_update_found)
- WT_STAT_CONN_DATA_INCR(session, txn_rts_stable_rle_skipped);
- else if (unpack.type == WT_CELL_DEL)
+ if (unpack.type == WT_CELL_DEL)
WT_STAT_CONN_DATA_INCR(session, txn_rts_delete_rle_skipped);
+ else if (stable_updates_count == rle)
+ WT_STAT_CONN_DATA_INCR(session, txn_rts_stable_rle_skipped);
else {
- for (j = 0; j < rle; j++) {
+ j = 0;
+ if (inshead != NULL) {
+ WT_SKIP_FOREACH (ins, inshead) {
+ /* If the update list goes past the end of the cell, something's wrong. */
+ WT_ASSERT(session, j < rle);
+ ins_recno = WT_INSERT_RECNO(ins);
+ /* Process all the keys before this update. */
+ while (recno + j < ins_recno) {
+ WT_RET(__rollback_abort_ondisk_kv(session, ref, NULL, recno + j, NULL,
+ &unpack, rollback_timestamp, &is_ondisk_stable));
+ /* We can stop right away if the on-disk version is stable. */
+ if (is_ondisk_stable) {
+ if (rle > 1)
+ WT_STAT_CONN_DATA_INCR(session, txn_rts_stable_rle_skipped);
+ goto stop;
+ }
+ j++;
+ }
+ /* If this key has a stable update, skip over it. */
+ if (recno + j == ins_recno && __rollback_has_stable_update(ins->upd))
+ j++;
+ }
+ }
+ /* Process the rest of the keys. */
+ while (j < rle) {
WT_RET(__rollback_abort_ondisk_kv(session, ref, NULL, recno + j, NULL, &unpack,
rollback_timestamp, &is_ondisk_stable));
/* We can stop right away if the on-disk version is stable. */
if (is_ondisk_stable) {
if (rle > 1)
WT_STAT_CONN_DATA_INCR(session, txn_rts_stable_rle_skipped);
- break;
+ goto stop;
}
+ j++;
}
}
+stop:
recno += rle;
}
}
/* Review the append list */
- if ((ins = WT_COL_APPEND(page)) != NULL)
- WT_RET(__rollback_abort_insert_list(session, page, ins, rollback_timestamp, NULL));
+ if ((inshead = WT_COL_APPEND(page)) != NULL)
+ WT_RET(__rollback_abort_insert_list(session, page, inshead, rollback_timestamp, NULL));
/* Mark the page as dirty to reconcile the page. */
if (page->modify)
diff --git a/src/third_party/wiredtiger/test/suite/test_rollback_to_stable27.py b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable27.py
new file mode 100644
index 00000000000..ee0499e72da
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable27.py
@@ -0,0 +1,116 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-present MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+from test_rollback_to_stable01 import test_rollback_to_stable_base
+from wiredtiger import stat, Modify, WT_NOTFOUND
+from wtdataset import SimpleDataSet
+from wtscenario import make_scenarios
+
+# test_rollback_to_stable27.py
+#
+# Test mixing timestamped and non-timestamped updates on the same VLCS RLE cell.
+class test_rollback_to_stable27(test_rollback_to_stable_base):
+ session_config = 'isolation=snapshot'
+
+ # Run it all on row-store as well as a control group: if something odd arises from the
+ # RLE cell handling it won't happen in row-store.
+ key_format_values = [
+ ('column', dict(key_format='r')),
+ ('integer_row', dict(key_format='i')),
+ ]
+
+ in_memory_values = [
+ ('no_inmem', dict(in_memory=False)),
+ ('inmem', dict(in_memory=True))
+ ]
+
+ scenarios = make_scenarios(key_format_values, in_memory_values)
+
+ def conn_config(self):
+ if self.in_memory:
+ return 'in_memory=true'
+ else:
+ return 'in_memory=false'
+
+ # Evict the page to force reconciliation.
+ def evict(self, uri, key, check_value):
+ evict_cursor = self.session.open_cursor(uri, None, "debug=(release_evict)")
+ self.session.begin_transaction()
+ v = evict_cursor[1]
+ self.assertEqual(v, check_value)
+ self.assertEqual(evict_cursor.reset(), 0)
+ self.session.rollback_transaction()
+ evict_cursor.close()
+
+ def test_rollback_to_stable(self):
+ nrows = 10
+
+ # Create a table without logging.
+ uri = "table:rollback_to_stable27"
+ ds = SimpleDataSet(
+ self, uri, 0, key_format=self.key_format, value_format="S", config='log=(enabled=false)')
+ ds.populate()
+
+ value_a = "aaaaa" * 10
+ value_b = "bbbbb" * 10
+
+ # Pin oldest and stable to timestamp 10.
+ self.conn.set_timestamp('oldest_timestamp=' + self.timestamp_str(10) +
+ ',stable_timestamp=' + self.timestamp_str(10))
+
+ # Write aaaaaa to all the keys at time 20.
+ self.large_updates(uri, value_a, ds, nrows, False, 20)
+
+ # Evict the page to force reconciliation.
+ self.evict(uri, 1, value_a)
+
+ # Ideally here we'd check to make sure we actually have a single RLE cell, because
+ # if not the rest of the work isn't going to do much good. Maybe via stats...?
+
+ cursor = self.session.open_cursor(uri)
+ self.session.begin_transaction()
+ cursor[7] = value_b
+ self.session.commit_transaction()
+ cursor.close()
+
+ # Now roll back.
+ self.conn.set_timestamp('stable_timestamp=' + self.timestamp_str(15))
+ self.conn.rollback_to_stable()
+
+ # The only thing we should see (at any time) is value_b at key 7.
+ cursor = self.session.open_cursor(uri)
+ for ts in [10, 20, 30]:
+ self.session.begin_transaction('read_timestamp=' + self.timestamp_str(ts))
+ for k, v in cursor:
+ self.assertEqual(k, 7)
+ self.assertEqual(v, value_b)
+ self.session.rollback_transaction()
+ cursor.close()
+
+if __name__ == '__main__':
+ wttest.run()