summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndrew Morton <andrew.morton@mongodb.com>2023-05-16 22:21:28 +0000
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2023-05-17 00:50:02 +0000
commitec541cdf59f7dd96ee36082516ead214a23c7954 (patch)
tree7521ba0df7e6fcf5d382531e774fc0a485e52a4e
parentfa0e0ce777f0c3a438449a6bd1fb1b96fa5d8531 (diff)
downloadmongo-ec541cdf59f7dd96ee36082516ead214a23c7954.tar.gz
Import wiredtiger: 9318c8f5aed6dd7422f3c2a9c05a89ee40f804f8 from branch mongodb-master
ref: 8dc1c0b0f1..9318c8f5ae for: 7.1.0-rc0 WT-10820 Checkpoint to mark the tree dirty when the tree has more data
-rw-r--r--src/third_party/wiredtiger/import.data2
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_sync.c16
-rw-r--r--src/third_party/wiredtiger/src/reconcile/rec_visibility.c15
-rw-r--r--src/third_party/wiredtiger/src/reconcile/rec_write.c37
-rw-r--r--src/third_party/wiredtiger/test/suite/test_alter05.py2
-rw-r--r--src/third_party/wiredtiger/test/suite/test_checkpoint_snapshot06.py254
-rw-r--r--src/third_party/wiredtiger/test/suite/test_hs21.py3
7 files changed, 299 insertions, 30 deletions
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index d6619336b55..eda98191606 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -2,5 +2,5 @@
"vendor": "wiredtiger",
"github": "wiredtiger/wiredtiger.git",
"branch": "mongodb-master",
- "commit": "8dc1c0b0f1381773a0d13d5751ec724f23f77709"
+ "commit": "9318c8f5aed6dd7422f3c2a9c05a89ee40f804f8"
}
diff --git a/src/third_party/wiredtiger/src/btree/bt_sync.c b/src/third_party/wiredtiger/src/btree/bt_sync.c
index f052547b4a1..1e237009e5d 100644
--- a/src/third_party/wiredtiger/src/btree/bt_sync.c
+++ b/src/third_party/wiredtiger/src/btree/bt_sync.c
@@ -440,6 +440,22 @@ __wt_sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
__wt_checkpoint_progress(session, false);
}
}
+
+ /*
+ * During normal checkpoints, mark the tree dirty if the btree has modifications that are
+ * not visible to the checkpoint. There is a drawback in this approach as we compare the
+ * btree's maximum transaction id with the checkpoint snap_min and it is possible that this
+ * transaction may be visible to the checkpoint, but still, we mark the tree as dirty if
+ * there is a long-running transaction in the database.
+ *
+ * Do not mark the tree dirty if there is no change to stable timestamp compared to the last
+ * checkpoint.
+ */
+ if (!btree->modified && !F_ISSET(conn, WT_CONN_RECOVERING | WT_CONN_CLOSING_CHECKPOINT) &&
+ (btree->rec_max_txn >= txn->snap_min ||
+ (conn->txn_global.checkpoint_timestamp != conn->txn_global.last_ckpt_timestamp &&
+ btree->rec_max_timestamp > conn->txn_global.checkpoint_timestamp)))
+ __wt_tree_modify_set(session);
break;
case WT_SYNC_CLOSE:
case WT_SYNC_DISCARD:
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_visibility.c b/src/third_party/wiredtiger/src/reconcile/rec_visibility.c
index b2ae3a86048..129e286d43c 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_visibility.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_visibility.c
@@ -536,8 +536,6 @@ __rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_UPDATE *first_upd
*/
if (*first_txn_updp == NULL)
*first_txn_updp = upd;
- if (WT_TXNID_LT(max_txn, txnid))
- max_txn = txnid;
/*
* Special handling for application threads evicting their own updates.
@@ -596,8 +594,6 @@ __rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_UPDATE *first_upd
if (F_ISSET(r, WT_REC_CHECKPOINT)) {
*upd_memsizep += WT_UPDATE_MEMSIZE(upd);
*has_newer_updatesp = true;
- if (upd->start_ts > max_ts)
- max_ts = upd->start_ts;
seen_prepare = true;
continue;
} else {
@@ -617,14 +613,17 @@ __rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_UPDATE *first_upd
}
}
- /* Track the first update with non-zero timestamp. */
- if (upd->start_ts > max_ts)
- max_ts = upd->start_ts;
-
/* Always select the newest committed update to write to disk */
if (upd_select->upd == NULL)
upd_select->upd = upd;
+ /* Track the selected update transaction id and timestamp. */
+ if (WT_TXNID_LT(max_txn, txnid))
+ max_txn = txnid;
+
+ if (upd->start_ts > max_ts)
+ max_ts = upd->start_ts;
+
/*
* We only need to walk the whole update chain if we are evicting metadata as it is written
* with read uncommitted isolation and we may see a committed update followed by uncommitted
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c
index 5bad5df9d9b..f82b4d2cdaf 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_write.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c
@@ -381,6 +381,22 @@ __rec_write_page_status(WT_SESSION_IMPL *session, WT_RECONCILE *r)
mod = page->modify;
/*
+ * Track the page's maximum transaction ID (used to decide if we can evict a clean page and
+ * discard its history).
+ */
+ mod->rec_max_txn = r->max_txn;
+ mod->rec_max_timestamp = r->max_ts;
+
+ /*
+ * Track the tree's maximum transaction ID (used to decide if it's safe to discard the tree) and
+ * maximum timestamp.
+ */
+ if (WT_TXNID_LT(btree->rec_max_txn, r->max_txn))
+ btree->rec_max_txn = r->max_txn;
+ if (btree->rec_max_timestamp < r->max_ts)
+ btree->rec_max_timestamp = r->max_ts;
+
+ /*
* Set the page's status based on whether or not we cleaned the page.
*/
if (r->leave_dirty) {
@@ -407,27 +423,6 @@ __rec_write_page_status(WT_SESSION_IMPL *session, WT_RECONCILE *r)
(F_ISSET(r, WT_REC_HS | WT_REC_IN_MEMORY) || WT_IS_METADATA(btree->dhandle)));
} else {
/*
- * Track the page's maximum transaction ID (used to decide if we can evict a clean page and
- * discard its history).
- */
- mod->rec_max_txn = r->max_txn;
- mod->rec_max_timestamp = r->max_ts;
-
- /*
- * Track the tree's maximum transaction ID (used to decide if it's safe to discard the
- * tree). Reconciliation for eviction is multi-threaded, only update the tree's maximum
- * transaction ID when doing a checkpoint. That's sufficient, we only care about the maximum
- * transaction ID of current updates in the tree, and checkpoint visits every dirty page in
- * the tree.
- */
- if (!F_ISSET(r, WT_REC_EVICT)) {
- if (WT_TXNID_LT(btree->rec_max_txn, r->max_txn))
- btree->rec_max_txn = r->max_txn;
- if (btree->rec_max_timestamp < r->max_ts)
- btree->rec_max_timestamp = r->max_ts;
- }
-
- /*
* We set the page state to mark it as having been dirtied for the first time prior to
* reconciliation. A failed atomic cas indicates that an update has taken place during
* reconciliation.
diff --git a/src/third_party/wiredtiger/test/suite/test_alter05.py b/src/third_party/wiredtiger/test/suite/test_alter05.py
index 47436361e62..dad5ac8ea81 100644
--- a/src/third_party/wiredtiger/test/suite/test_alter05.py
+++ b/src/third_party/wiredtiger/test/suite/test_alter05.py
@@ -86,6 +86,7 @@ class test_alter05(TieredConfigMixin, wttest.WiredTigerTestCase):
self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(2))
c.close()
+ self.conn.set_timestamp('stable_timestamp=' + self.timestamp_str(2))
prev_alter_checkpoints = self.get_stat(wiredtiger.stat.conn.session_table_alter_trigger_checkpoint)
# Alter the table and verify.
@@ -103,6 +104,7 @@ class test_alter05(TieredConfigMixin, wttest.WiredTigerTestCase):
c[k+1] = 2
self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(3))
+ self.conn.set_timestamp('stable_timestamp=' + self.timestamp_str(3))
self.assertRaisesException(wiredtiger.WiredTigerError,
lambda: self.session.alter(uri, 'log=(enabled=true)'))
self.verify_metadata('log=(enabled=false)')
diff --git a/src/third_party/wiredtiger/test/suite/test_checkpoint_snapshot06.py b/src/third_party/wiredtiger/test/suite/test_checkpoint_snapshot06.py
new file mode 100644
index 00000000000..69c7ad483d9
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_checkpoint_snapshot06.py
@@ -0,0 +1,254 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-present MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import os, shutil, threading, time
+from wtthread import checkpoint_thread
+from helper import simulate_crash_restart
+import wiredtiger, wttest
+from wtdataset import SimpleDataSet
+from wtscenario import make_scenarios
+from helper import copy_wiredtiger_home
+from wiredtiger import stat
+
+# test_checkpoint_snapshot06.py
+# This test is to run checkpoint and truncate and insert followed by eviction
+# for one table in parallel with timing stress for checkpoint and let eviction
+# write more data than checkpoint.
+class test_checkpoint_snapshot06(wttest.WiredTigerTestCase):
+
+ # Create two tables.
+ uri_1 = "table:test_checkpoint_snapshot06_1"
+ uri_2 = "table:test_checkpoint_snapshot06_2"
+ backup_dir = "BACKUP"
+
+ format_values = [
+ ('column', dict(key_format='r', value_format='S')),
+ ('column_fix', dict(key_format='r', value_format='8t')),
+ ('row_integer', dict(key_format='i', value_format='S')),
+ ]
+
+ restart_values = [
+ ("crash_restart", dict(restart=True)),
+ ("backup", dict(restart=False)),
+ ]
+
+ scenarios = make_scenarios(format_values, restart_values)
+
+ def conn_config(self):
+ config = 'cache_size=10MB,statistics=(all),statistics_log=(json,on_close,wait=1),log=(enabled=true),debug_mode=(log_retention=10),timing_stress_for_test=[checkpoint_slow]'
+ return config
+
+ def moresetup(self):
+ if self.value_format == '8t':
+ # Rig to use more than one page; otherwise the inconsistent checkpoint assertions fail.
+ self.extraconfig = ',leaf_page_max=4096'
+ self.nrows = 5000
+ self.valuea = 97
+ self.valueb = 98
+ else:
+ self.extraconfig = ''
+ self.nrows = 1000
+ self.valuea = "aaaaa" * 100
+ self.valueb = "bbbbb" * 100
+
+ def take_full_backup(self, fromdir, todir):
+ # Open up the backup cursor, and copy the files. Do a full backup.
+ cursor = self.session.open_cursor('backup:', None, None)
+ self.pr('Full backup from '+ fromdir + ' to ' + todir + ': ')
+ os.mkdir(todir)
+ while True:
+ ret = cursor.next()
+ if ret != 0:
+ break
+ bkup_file = cursor.get_key()
+ copy_file = os.path.join(fromdir, bkup_file)
+ sz = os.path.getsize(copy_file)
+ self.pr('Copy from: ' + bkup_file + ' (' + str(sz) + ') to ' + todir)
+ shutil.copy(copy_file, todir)
+ self.assertEqual(ret, wiredtiger.WT_NOTFOUND)
+ cursor.close()
+
+ def evict_cursor(self, uri, ds, nrows):
+ # Configure debug behavior on a cursor to evict the page positioned on when the reset API is used.
+ evict_cursor = self.session.open_cursor(uri, None, "debug=(release_evict)")
+ self.session.begin_transaction("ignore_prepare=true")
+ for i in range (1, nrows + 1):
+ evict_cursor.set_key(ds.key(i))
+ evict_cursor.search()
+ if i % 10 == 0:
+ evict_cursor.reset()
+ evict_cursor.close()
+ self.session.rollback_transaction()
+
+ def large_updates(self, uri, value, ds, nrows, commit_ts):
+ # Update a large number of records.
+ session = self.session
+ cursor = session.open_cursor(uri)
+ for i in range(1, nrows+1):
+ session.begin_transaction()
+ cursor[ds.key(i)] = value
+ if commit_ts == 0:
+ session.commit_transaction()
+ else:
+ session.commit_transaction('commit_timestamp=' + self.timestamp_str(commit_ts))
+ cursor.close()
+
+ def check(self, check_value, uri, nrows, read_ts, more_invisible_rows_exist):
+ # In FLCS the existence of the invisible extra set of rows causes the table to
+ # extend under them. Until that's fixed, expect (not just allow) those rows to
+ # exist and demand that they read back as zero and not as check_value. When it
+ # is fixed (so the end of the table updates transactionally) the special-case
+ # logic can just be removed.
+ flcs_tolerance = more_invisible_rows_exist and self.value_format == '8t'
+
+ session = self.session
+ if read_ts == 0:
+ session.begin_transaction()
+ else:
+ session.begin_transaction('read_timestamp=' + self.timestamp_str(read_ts))
+ cursor = session.open_cursor(uri)
+ count = 0
+ for k, v in cursor:
+ if flcs_tolerance and count >= nrows:
+ self.assertEqual(v, 0)
+ else:
+ self.assertEqual(v, check_value)
+ count += 1
+ session.commit_transaction()
+ self.assertEqual(count, nrows * 2 if flcs_tolerance else nrows)
+
+ def perform_backup_or_crash_restart(self, fromdir, todir):
+ if self.restart == True:
+ #Simulate a crash by copying to a new directory(RESTART).
+ copy_wiredtiger_home(self, fromdir, todir + ".copy", True)
+ simulate_crash_restart(self, fromdir, todir)
+ else:
+ #Take a backup and restore it.
+ self.take_full_backup(fromdir, todir)
+ self.take_full_backup(fromdir, todir + ".copy")
+ self.reopen_conn(todir)
+
+ def test_checkpoint_snapshot(self):
+ self.moresetup()
+
+ ds_1 = SimpleDataSet(self, self.uri_1, 0, \
+ key_format=self.key_format, value_format=self.value_format, \
+ config="log=(enabled=true)" +self.extraconfig)
+ ds_1.populate()
+
+ ds_2 = SimpleDataSet(self, self.uri_2, 0, \
+ key_format=self.key_format, value_format=self.value_format, \
+ config="log=(enabled=true)" + self.extraconfig)
+ ds_2.populate()
+
+ # Insert number of records into both tables.
+ self.large_updates(self.uri_1, self.valuea, ds_1, self.nrows, 0)
+ self.check(self.valuea, self.uri_1, self.nrows, 0, False)
+
+ self.large_updates(self.uri_2, self.valuea, ds_2, self.nrows, 0)
+ self.check(self.valuea, self.uri_2, self.nrows, 0, False)
+
+ # Remove one key from both the tables.
+ cursor1 = self.session.open_cursor(self.uri_1)
+ cursor2 = self.session.open_cursor(self.uri_2)
+
+ cursor1.set_key(ds_1.key(50))
+ cursor2.set_key(ds_2.key(50))
+ self.assertEqual(cursor1.remove(), 0)
+ self.assertEqual(cursor2.remove(), 0)
+
+ # Truncate the range from 1-100 in both tables where key 50 doesn't exist.
+ # We only set a stop cursor for both tables and send in an empty start
+ # cursor to truncate from the beginning of the table.
+ session1 = self.conn.open_session()
+ session1.begin_transaction()
+ cursor11 = session1.open_cursor(self.uri_1)
+ cursor12 = session1.open_cursor(self.uri_2)
+
+ cursor11.set_key(ds_1.key(100))
+ cursor12.set_key(ds_2.key(100))
+ session1.truncate(None, None, cursor11, None)
+ session1.truncate(None, None, cursor12, None)
+
+ # Insert the key 50 that was already removed.
+ session2 = self.conn.open_session()
+ session2.begin_transaction()
+ cursor21 = session2.open_cursor(self.uri_1)
+ cursor22 = session2.open_cursor(self.uri_2)
+
+ cursor21.set_key(ds_1.key(50))
+ cursor21.set_value(self.valueb)
+ cursor22.set_key(ds_2.key(50))
+ cursor22.set_value(self.valueb)
+
+ self.assertEqual(cursor21.insert(), 0)
+ self.assertEqual(cursor22.insert(), 0)
+
+ # Create a checkpoint thread
+ done = threading.Event()
+ ckpt = checkpoint_thread(self.conn, done)
+ try:
+ ckpt.start()
+
+ # Wait for checkpoint to start and acquire its snapshot before committing.
+ ckpt_snapshot = 0
+ while not ckpt_snapshot:
+ time.sleep(1)
+ stat_cursor = self.session.open_cursor('statistics:', None, None)
+ ckpt_snapshot = stat_cursor[stat.conn.txn_checkpoint_snapshot_acquired][2]
+ stat_cursor.close()
+
+ # commit the operations in out of order. Insert followed by truncate.
+ session2.commit_transaction()
+ session1.commit_transaction()
+
+ # Evict all the modifications of the table1 before checkpoint gets into the table.
+ self.evict_cursor(self.uri_1, ds_1, self.nrows)
+
+ finally:
+ done.set()
+ ckpt.join()
+
+ # Perform an additional checkpoint to ensure table2 also has the latest data.
+ self.session.checkpoint()
+ self.perform_backup_or_crash_restart(".", self.backup_dir)
+
+ # Check that the both tables contains the last re-inserted value.
+ cursor11 = self.session.open_cursor(self.uri_1)
+ cursor12 = self.session.open_cursor(self.uri_2)
+
+ cursor11.set_key(ds_1.key(50))
+ self.assertEqual(cursor11.search(), 0)
+ self.assertEqual(cursor11.get_value(), self.valueb)
+
+ cursor12.set_key(ds_2.key(50))
+ self.assertEqual(cursor12.search(), 0)
+ self.assertEqual(cursor12.get_value(), self.valueb)
+
+if __name__ == '__main__':
+ wttest.run()
diff --git a/src/third_party/wiredtiger/test/suite/test_hs21.py b/src/third_party/wiredtiger/test/suite/test_hs21.py
index 638823d87d0..6479a6d64b0 100644
--- a/src/third_party/wiredtiger/test/suite/test_hs21.py
+++ b/src/third_party/wiredtiger/test/suite/test_hs21.py
@@ -154,6 +154,9 @@ class test_hs21(wttest.WiredTigerTestCase):
self.check(self.session, value1, ds.uri, self.nrows // 2, 2, flcs_nrows=self.nrows)
self.check(self.session, value2, ds.uri, self.nrows, 100)
+ # Set the stable timestamp to 100 to let checkpoint write all the stable data.
+ self.conn.set_timestamp('stable_timestamp=' + self.timestamp_str(100))
+
# Our sweep scan interval is every 1 second and the amount of idle time needed for a handle to be closed is 2 seconds.
# It should take roughly 3 seconds for the sweep server to close our file handles. Lets wait at least double
# that to be safe.