From a393aecbf12975bcd76f338b28524afe63dc88c6 Mon Sep 17 00:00:00 2001 From: Chenhao Qu Date: Wed, 6 Oct 2021 06:49:53 +0000 Subject: Import wiredtiger: e5c230df2773ea131e76bc73ec2ec059748f2dde from branch mongodb-master ref: 284d0eff1c..e5c230df27 for: 5.2.0 WT-8193 Wrong corner case in VLCS rollback-to-stable --- src/third_party/wiredtiger/import.data | 2 +- .../wiredtiger/src/txn/txn_rollback_to_stable.c | 108 +++++++++++++------ .../test/suite/test_rollback_to_stable27.py | 116 +++++++++++++++++++++ 3 files changed, 196 insertions(+), 30 deletions(-) create mode 100644 src/third_party/wiredtiger/test/suite/test_rollback_to_stable27.py diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index 1926f489920..279154919c0 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -2,5 +2,5 @@ "vendor": "wiredtiger", "github": "wiredtiger/wiredtiger.git", "branch": "mongodb-master", - "commit": "284d0eff1c7fda7c3ba52a989df8049de258d4b4" + "commit": "e5c230df2773ea131e76bc73ec2ec059748f2dde" } diff --git a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c index 5499f5fffbe..6aee858e94b 100644 --- a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c +++ b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c @@ -155,17 +155,19 @@ __rollback_abort_update(WT_SESSION_IMPL *session, WT_ITEM *key, WT_UPDATE *first /* * __rollback_abort_insert_list -- - * Apply the update abort check to each entry in an insert skip list. + * Apply the update abort check to each entry in an insert skip list. Return how many entries + * had stable updates. */ static int __rollback_abort_insert_list(WT_SESSION_IMPL *session, WT_PAGE *page, WT_INSERT_HEAD *head, - wt_timestamp_t rollback_timestamp, bool *stable_update_found) + wt_timestamp_t rollback_timestamp, uint32_t *stable_updates_count) { WT_DECL_ITEM(key); WT_DECL_RET; WT_INSERT *ins; uint64_t recno; uint8_t *memp; + bool stable_update_found; WT_ERR( __wt_scr_alloc(session, page->type == WT_PAGE_ROW_LEAF ? 0 : WT_INTPACK64_MAXSIZE, &key)); @@ -182,7 +184,9 @@ __rollback_abort_insert_list(WT_SESSION_IMPL *session, WT_PAGE *page, WT_INSERT_ key->size = WT_PTRDIFF(memp, key->data); } WT_ERR(__rollback_abort_update( - session, key, ins->upd, rollback_timestamp, stable_update_found)); + session, key, ins->upd, rollback_timestamp, &stable_update_found)); + if (stable_update_found && stable_updates_count != NULL) + (*stable_updates_count)++; } err: @@ -190,6 +194,19 @@ err: return (ret); } +/* + * __rollback_has_stable_update -- + * Check if an update chain has a stable update on it. Assume the update chain has already been + * processed so all we need to do is look for a valid, non-aborted entry. + */ +static bool +__rollback_has_stable_update(WT_UPDATE *upd) +{ + while (upd != NULL && (upd->type == WT_UPDATE_INVALID || upd->txnid == WT_TXN_ABORTED)) + upd = upd->next; + return upd != NULL; +} + /* * __rollback_col_modify -- * Add the provided update to the head of the update list. @@ -769,11 +786,12 @@ __rollback_abort_col_var(WT_SESSION_IMPL *session, WT_REF *ref, wt_timestamp_t r WT_CELL *kcell; WT_CELL_UNPACK_KV unpack; WT_COL *cip; - WT_INSERT_HEAD *ins; + WT_INSERT *ins; + WT_INSERT_HEAD *inshead; WT_PAGE *page; - uint64_t recno, rle; - uint32_t i, j; - bool is_ondisk_stable, stable_update_found; + uint64_t ins_recno, recno, rle; + uint32_t i, j, stable_updates_count; + bool is_ondisk_stable; page = ref->page; /* @@ -786,11 +804,11 @@ __rollback_abort_col_var(WT_SESSION_IMPL *session, WT_REF *ref, wt_timestamp_t r /* Review the changes to the original on-page data items. */ WT_COL_FOREACH (page, cip, i) { - stable_update_found = false; + stable_updates_count = 0; - if ((ins = WT_COL_UPDATE(page, cip)) != NULL) + if ((inshead = WT_COL_UPDATE(page, cip)) != NULL) WT_RET(__rollback_abort_insert_list( - session, page, ins, rollback_timestamp, &stable_update_found)); + session, page, inshead, rollback_timestamp, &stable_updates_count)); if (page->dsk != NULL) { /* Unpack the cell. We need its RLE count whether or not we're going to iterate it. */ @@ -799,44 +817,76 @@ __rollback_abort_col_var(WT_SESSION_IMPL *session, WT_REF *ref, wt_timestamp_t r rle = __wt_cell_rle(&unpack); /* - * If we found a stable update on the insert list, this key needs no further attention. - * Any other keys in this cell with stable updates also do not require attention. But - * beyond that, the on-disk value must be older than the update we found. That means it - * too is stable(*), so any keys in the cell that _don't_ have stable updates on the - * update list don't need further attention either. (And any unstable updates were just - * handled above.) Thus we can skip iterating over the cell. + * Each key whose on-disk value is not stable and has no stable update on the update + * list must be processed downstream. * - * Furthermore, if the cell is deleted it must be - * itself stable, because cells only appear as deleted if there is no older value that - * might need to be restored. We can skip iterating over the cell. + * If we can determine that the cell's on-disk value is stable, we can skip iterating + * over the cell; likewise, if we can determine that every key in the cell has a stable + * update on the update list, we can skip the iteration. Otherwise we have to try each + * key. * - * (*) Either that, or the update is not timestamped, in which case the on-disk value - * might not be stable but the non-timestamp update will hide it until the next - * reconciliation and then overwrite it. + * If the on-disk cell is deleted, it is stable, because cells only appear as deleted + * when there is no older value that might need to be restored. + * + * Note that in a purely timestamped world, the presence of any stable update for any + * key in the cell means the on-disk value must be stable, because the update must be + * newer than the on-disk value. However, this is no longer true if the stable update + * has no timestamp. It may also not be true if the on-disk value is prepared, or other + * corner cases. Therefore, we must iterate the cell unless _every_ key has a stable + * update. + * + * We can, however, stop iterating as soon as the downstream code reports back that the + * on-disk value is actually stable. */ - if (stable_update_found) - WT_STAT_CONN_DATA_INCR(session, txn_rts_stable_rle_skipped); - else if (unpack.type == WT_CELL_DEL) + if (unpack.type == WT_CELL_DEL) WT_STAT_CONN_DATA_INCR(session, txn_rts_delete_rle_skipped); + else if (stable_updates_count == rle) + WT_STAT_CONN_DATA_INCR(session, txn_rts_stable_rle_skipped); else { - for (j = 0; j < rle; j++) { + j = 0; + if (inshead != NULL) { + WT_SKIP_FOREACH (ins, inshead) { + /* If the update list goes past the end of the cell, something's wrong. */ + WT_ASSERT(session, j < rle); + ins_recno = WT_INSERT_RECNO(ins); + /* Process all the keys before this update. */ + while (recno + j < ins_recno) { + WT_RET(__rollback_abort_ondisk_kv(session, ref, NULL, recno + j, NULL, + &unpack, rollback_timestamp, &is_ondisk_stable)); + /* We can stop right away if the on-disk version is stable. */ + if (is_ondisk_stable) { + if (rle > 1) + WT_STAT_CONN_DATA_INCR(session, txn_rts_stable_rle_skipped); + goto stop; + } + j++; + } + /* If this key has a stable update, skip over it. */ + if (recno + j == ins_recno && __rollback_has_stable_update(ins->upd)) + j++; + } + } + /* Process the rest of the keys. */ + while (j < rle) { WT_RET(__rollback_abort_ondisk_kv(session, ref, NULL, recno + j, NULL, &unpack, rollback_timestamp, &is_ondisk_stable)); /* We can stop right away if the on-disk version is stable. */ if (is_ondisk_stable) { if (rle > 1) WT_STAT_CONN_DATA_INCR(session, txn_rts_stable_rle_skipped); - break; + goto stop; } + j++; } } +stop: recno += rle; } } /* Review the append list */ - if ((ins = WT_COL_APPEND(page)) != NULL) - WT_RET(__rollback_abort_insert_list(session, page, ins, rollback_timestamp, NULL)); + if ((inshead = WT_COL_APPEND(page)) != NULL) + WT_RET(__rollback_abort_insert_list(session, page, inshead, rollback_timestamp, NULL)); /* Mark the page as dirty to reconcile the page. */ if (page->modify) diff --git a/src/third_party/wiredtiger/test/suite/test_rollback_to_stable27.py b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable27.py new file mode 100644 index 00000000000..ee0499e72da --- /dev/null +++ b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable27.py @@ -0,0 +1,116 @@ +#!/usr/bin/env python +# +# Public Domain 2014-present MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +from test_rollback_to_stable01 import test_rollback_to_stable_base +from wiredtiger import stat, Modify, WT_NOTFOUND +from wtdataset import SimpleDataSet +from wtscenario import make_scenarios + +# test_rollback_to_stable27.py +# +# Test mixing timestamped and non-timestamped updates on the same VLCS RLE cell. +class test_rollback_to_stable27(test_rollback_to_stable_base): + session_config = 'isolation=snapshot' + + # Run it all on row-store as well as a control group: if something odd arises from the + # RLE cell handling it won't happen in row-store. + key_format_values = [ + ('column', dict(key_format='r')), + ('integer_row', dict(key_format='i')), + ] + + in_memory_values = [ + ('no_inmem', dict(in_memory=False)), + ('inmem', dict(in_memory=True)) + ] + + scenarios = make_scenarios(key_format_values, in_memory_values) + + def conn_config(self): + if self.in_memory: + return 'in_memory=true' + else: + return 'in_memory=false' + + # Evict the page to force reconciliation. + def evict(self, uri, key, check_value): + evict_cursor = self.session.open_cursor(uri, None, "debug=(release_evict)") + self.session.begin_transaction() + v = evict_cursor[1] + self.assertEqual(v, check_value) + self.assertEqual(evict_cursor.reset(), 0) + self.session.rollback_transaction() + evict_cursor.close() + + def test_rollback_to_stable(self): + nrows = 10 + + # Create a table without logging. + uri = "table:rollback_to_stable27" + ds = SimpleDataSet( + self, uri, 0, key_format=self.key_format, value_format="S", config='log=(enabled=false)') + ds.populate() + + value_a = "aaaaa" * 10 + value_b = "bbbbb" * 10 + + # Pin oldest and stable to timestamp 10. + self.conn.set_timestamp('oldest_timestamp=' + self.timestamp_str(10) + + ',stable_timestamp=' + self.timestamp_str(10)) + + # Write aaaaaa to all the keys at time 20. + self.large_updates(uri, value_a, ds, nrows, False, 20) + + # Evict the page to force reconciliation. + self.evict(uri, 1, value_a) + + # Ideally here we'd check to make sure we actually have a single RLE cell, because + # if not the rest of the work isn't going to do much good. Maybe via stats...? + + cursor = self.session.open_cursor(uri) + self.session.begin_transaction() + cursor[7] = value_b + self.session.commit_transaction() + cursor.close() + + # Now roll back. + self.conn.set_timestamp('stable_timestamp=' + self.timestamp_str(15)) + self.conn.rollback_to_stable() + + # The only thing we should see (at any time) is value_b at key 7. + cursor = self.session.open_cursor(uri) + for ts in [10, 20, 30]: + self.session.begin_transaction('read_timestamp=' + self.timestamp_str(ts)) + for k, v in cursor: + self.assertEqual(k, 7) + self.assertEqual(v, value_b) + self.session.rollback_transaction() + cursor.close() + +if __name__ == '__main__': + wttest.run() -- cgit v1.2.1