summary refs log tree commit diff
diff options
context:
space:
mode:
author Chenhao Qu <chenhao.qu@mongodb.com> 2022-04-13 01:46:10 +0000
committer Evergreen Agent <no-reply@evergreen.mongodb.com> 2022-04-13 02:12:41 +0000
commit 583df21242096af2467a4b94ff9cb2a86d55bbaf (patch)
tree 305d3c8b487e9fec752314d003bfea961eb8ebf8
parent 975b6cf999f3901fabd9f31ed7c2b0297fe852ec (diff)
download mongo-583df21242096af2467a4b94ff9cb2a86d55bbaf.tar.gz
Import wiredtiger: 07cee370d83fd1c90f4ecf6781331db020960323 from branch mongodb-master
ref: d071d4624a..07cee370d8 for: 6.0.0-rc0 WT-9041 checkpoint can miss transactions with commit times before stable
-rw-r--r-- src/third_party/wiredtiger/dist/stat_data.py 1
-rw-r--r-- src/third_party/wiredtiger/import.data 2
-rw-r--r-- src/third_party/wiredtiger/src/include/extern.h 2
-rw-r--r-- src/third_party/wiredtiger/src/include/stat.h 1
-rw-r--r-- src/third_party/wiredtiger/src/include/wiredtiger.in 44
-rw-r--r-- src/third_party/wiredtiger/src/support/stat.c 3
-rw-r--r-- src/third_party/wiredtiger/src/txn/txn_ckpt.c 37
-rw-r--r-- src/third_party/wiredtiger/src/txn/txn_timestamp.c 50
-rw-r--r-- src/third_party/wiredtiger/test/suite/test_checkpoint_snapshot02.py 27
9 files changed, 135 insertions, 32 deletions
diff --git a/src/third_party/wiredtiger/dist/stat_data.py b/src/third_party/wiredtiger/dist/stat_data.py
index da36da590a5..e5fbb183100 100644
--- a/src/third_party/wiredtiger/dist/stat_data.py
+++ b/src/third_party/wiredtiger/dist/stat_data.py
@@ -568,6 +568,7 @@ conn_stats = [
TxnStat('txn_checkpoint_prep_recent', 'transaction checkpoint prepare most recent time (msecs)', 'no_clear,no_scale'),
TxnStat('txn_checkpoint_prep_running', 'transaction checkpoint prepare currently running', 'no_clear,no_scale'),
TxnStat('txn_checkpoint_prep_total', 'transaction checkpoint prepare total time (msecs)', 'no_clear,no_scale'),
+ TxnStat('txn_checkpoint_prep_wait', 'transaction checkpoint prepare wait time (msecs)', 'no_clear,no_scale'),
TxnStat('txn_checkpoint_running', 'transaction checkpoint currently running', 'no_clear,no_scale'),
TxnStat('txn_checkpoint_running_hs', 'transaction checkpoint currently running for history store file', 'no_clear,no_scale'),
TxnStat('txn_checkpoint_scrub_target', 'transaction checkpoint scrub dirty target', 'no_clear,no_scale'),
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index 6b2d2e15fb5..ed796a51ed0 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -2,5 +2,5 @@
"vendor": "wiredtiger",
"github": "wiredtiger/wiredtiger.git",
"branch": "mongodb-master",
- "commit": "d071d4624a0bb5005d9968553dd493c15a3400da"
+ "commit": "07cee370d83fd1c90f4ecf6781331db020960323"
}
diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h
index 7ebf7ba83e1..6f35d92d0bb 100644
--- a/src/third_party/wiredtiger/src/include/extern.h
+++ b/src/third_party/wiredtiger/src/include/extern.h
@@ -37,6 +37,8 @@ extern bool __wt_rwlock_islocked(WT_SESSION_IMPL *session, WT_RWLOCK *l)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern bool __wt_txn_active(WT_SESSION_IMPL *session, uint64_t txnid)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern bool __wt_txn_checkpoint_cannot_start(WT_SESSION_IMPL *session)
+ WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern char *__wt_time_aggregate_to_string(WT_TIME_AGGREGATE *ta, char *ta_string)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern char *__wt_time_point_to_string(wt_timestamp_t ts, wt_timestamp_t durable_ts,
diff --git a/src/third_party/wiredtiger/src/include/stat.h b/src/third_party/wiredtiger/src/include/stat.h
index 2b80cca70c0..4657d2ebebe 100644
--- a/src/third_party/wiredtiger/src/include/stat.h
+++ b/src/third_party/wiredtiger/src/include/stat.h
@@ -827,6 +827,7 @@ struct __wt_connection_stats {
int64_t txn_checkpoint_prep_min;
int64_t txn_checkpoint_prep_recent;
int64_t txn_checkpoint_prep_total;
+ int64_t txn_checkpoint_prep_wait;
int64_t txn_checkpoint_scrub_target;
int64_t txn_checkpoint_scrub_time;
int64_t txn_checkpoint_time_total;
diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in
index 0418417ff1f..558f34ad0c3 100644
--- a/src/third_party/wiredtiger/src/include/wiredtiger.in
+++ b/src/third_party/wiredtiger/src/include/wiredtiger.in
@@ -6359,63 +6359,65 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection);
#define WT_STAT_CONN_TXN_CHECKPOINT_PREP_RECENT 1493
/*! transaction: transaction checkpoint prepare total time (msecs) */
#define WT_STAT_CONN_TXN_CHECKPOINT_PREP_TOTAL 1494
+/*! transaction: transaction checkpoint prepare wait time (msecs) */
+#define WT_STAT_CONN_TXN_CHECKPOINT_PREP_WAIT 1495
/*! transaction: transaction checkpoint scrub dirty target */
-#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1495
+#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1496
/*! transaction: transaction checkpoint scrub time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1496
+#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1497
/*! transaction: transaction checkpoint total time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1497
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1498
/*! transaction: transaction checkpoints */
-#define WT_STAT_CONN_TXN_CHECKPOINT 1498
+#define WT_STAT_CONN_TXN_CHECKPOINT 1499
/*! transaction: transaction checkpoints due to obsolete pages */
-#define WT_STAT_CONN_TXN_CHECKPOINT_OBSOLETE_APPLIED 1499
+#define WT_STAT_CONN_TXN_CHECKPOINT_OBSOLETE_APPLIED 1500
/*!
* transaction: transaction checkpoints skipped because database was
* clean
*/
-#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1500
+#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1501
/*! transaction: transaction failures due to history store */
-#define WT_STAT_CONN_TXN_FAIL_CACHE 1501
+#define WT_STAT_CONN_TXN_FAIL_CACHE 1502
/*!
* transaction: transaction fsync calls for checkpoint after allocating
* the transaction ID
*/
-#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1502
+#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1503
/*!
* transaction: transaction fsync duration for checkpoint after
* allocating the transaction ID (usecs)
*/
-#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1503
+#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1504
/*! transaction: transaction range of IDs currently pinned */
-#define WT_STAT_CONN_TXN_PINNED_RANGE 1504
+#define WT_STAT_CONN_TXN_PINNED_RANGE 1505
/*! transaction: transaction range of IDs currently pinned by a checkpoint */
-#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1505
+#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1506
/*! transaction: transaction range of timestamps currently pinned */
-#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP 1506
+#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP 1507
/*! transaction: transaction range of timestamps pinned by a checkpoint */
-#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_CHECKPOINT 1507
+#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_CHECKPOINT 1508
/*!
* transaction: transaction range of timestamps pinned by the oldest
* active read timestamp
*/
-#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_READER 1508
+#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_READER 1509
/*!
* transaction: transaction range of timestamps pinned by the oldest
* timestamp
*/
-#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_OLDEST 1509
+#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_OLDEST 1510
/*! transaction: transaction read timestamp of the oldest active reader */
-#define WT_STAT_CONN_TXN_TIMESTAMP_OLDEST_ACTIVE_READ 1510
+#define WT_STAT_CONN_TXN_TIMESTAMP_OLDEST_ACTIVE_READ 1511
/*! transaction: transaction rollback to stable currently running */
-#define WT_STAT_CONN_TXN_ROLLBACK_TO_STABLE_RUNNING 1511
+#define WT_STAT_CONN_TXN_ROLLBACK_TO_STABLE_RUNNING 1512
/*! transaction: transaction walk of concurrent sessions */
-#define WT_STAT_CONN_TXN_WALK_SESSIONS 1512
+#define WT_STAT_CONN_TXN_WALK_SESSIONS 1513
/*! transaction: transactions committed */
-#define WT_STAT_CONN_TXN_COMMIT 1513
+#define WT_STAT_CONN_TXN_COMMIT 1514
/*! transaction: transactions rolled back */
-#define WT_STAT_CONN_TXN_ROLLBACK 1514
+#define WT_STAT_CONN_TXN_ROLLBACK 1515
/*! transaction: update conflicts */
-#define WT_STAT_CONN_TXN_UPDATE_CONFLICT 1515
+#define WT_STAT_CONN_TXN_UPDATE_CONFLICT 1516
/*!
* @}
diff --git a/src/third_party/wiredtiger/src/support/stat.c b/src/third_party/wiredtiger/src/support/stat.c
index d26fab52cea..2b3c6645a9a 100644
--- a/src/third_party/wiredtiger/src/support/stat.c
+++ b/src/third_party/wiredtiger/src/support/stat.c
@@ -1559,6 +1559,7 @@ static const char *const __stats_connection_desc[] = {
"transaction: transaction checkpoint prepare min time (msecs)",
"transaction: transaction checkpoint prepare most recent time (msecs)",
"transaction: transaction checkpoint prepare total time (msecs)",
+ "transaction: transaction checkpoint prepare wait time (msecs)",
"transaction: transaction checkpoint scrub dirty target",
"transaction: transaction checkpoint scrub time (msecs)",
"transaction: transaction checkpoint total time (msecs)",
@@ -2116,6 +2117,7 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats)
/* not clearing txn_checkpoint_prep_min */
/* not clearing txn_checkpoint_prep_recent */
/* not clearing txn_checkpoint_prep_total */
+ /* not clearing txn_checkpoint_prep_wait */
/* not clearing txn_checkpoint_scrub_target */
/* not clearing txn_checkpoint_scrub_time */
/* not clearing txn_checkpoint_time_total */
@@ -2693,6 +2695,7 @@ __wt_stat_connection_aggregate(WT_CONNECTION_STATS **from, WT_CONNECTION_STATS *
to->txn_checkpoint_prep_min += WT_STAT_READ(from, txn_checkpoint_prep_min);
to->txn_checkpoint_prep_recent += WT_STAT_READ(from, txn_checkpoint_prep_recent);
to->txn_checkpoint_prep_total += WT_STAT_READ(from, txn_checkpoint_prep_total);
+ to->txn_checkpoint_prep_wait += WT_STAT_READ(from, txn_checkpoint_prep_wait);
to->txn_checkpoint_scrub_target += WT_STAT_READ(from, txn_checkpoint_scrub_target);
to->txn_checkpoint_scrub_time += WT_STAT_READ(from, txn_checkpoint_scrub_time);
to->txn_checkpoint_time_total += WT_STAT_READ(from, txn_checkpoint_time_total);
diff --git a/src/third_party/wiredtiger/src/txn/txn_ckpt.c b/src/third_party/wiredtiger/src/txn/txn_ckpt.c
index 84140d18c18..1e5de6e3391 100644
--- a/src/third_party/wiredtiger/src/txn/txn_ckpt.c
+++ b/src/third_party/wiredtiger/src/txn/txn_ckpt.c
@@ -524,6 +524,7 @@ __checkpoint_prepare(WT_SESSION_IMPL *session, bool *trackingp, const char *cfg[
WT_TXN_GLOBAL *txn_global;
WT_TXN_SHARED *txn_shared;
uint64_t original_snap_min;
+ u_int wait_time;
const char *txn_cfg[] = {
WT_CONFIG_BASE(session, WT_SESSION_begin_transaction), "isolation=snapshot", NULL};
bool use_timestamp;
@@ -532,6 +533,7 @@ __checkpoint_prepare(WT_SESSION_IMPL *session, bool *trackingp, const char *cfg[
txn = session->txn;
txn_global = &conn->txn_global;
txn_shared = WT_SESSION_TXN_SHARED(session);
+ wait_time = 0;
WT_RET(__wt_config_gets(session, cfg, "use_timestamp", &cval));
use_timestamp = (cval.val != 0);
@@ -604,7 +606,8 @@ __checkpoint_prepare(WT_SESSION_IMPL *session, bool *trackingp, const char *cfg[
* Set the checkpoint transaction's timestamp, if requested.
*
* We rely on having the global transaction data locked so the oldest timestamp can't move past
- * the stable timestamp.
+ * the stable timestamp we select until our read timestamp is in place. (Then it contributes to
+ * the pinned timestamp computation so our reads remain safe.)
*/
WT_ASSERT(session,
!F_ISSET(txn, WT_TXN_HAS_TS_COMMIT | WT_TXN_SHARED_TS_DURABLE | WT_TXN_SHARED_TS_READ));
@@ -658,10 +661,34 @@ __checkpoint_prepare(WT_SESSION_IMPL *session, bool *trackingp, const char *cfg[
}
/*
- * Refresh our snapshot here without publishing our shared ids to the world, doing so prevents
- * us from racing with the stable timestamp moving ahead of current snapshot. i.e. if the stable
- * timestamp moves after we begin the checkpoint transaction but before we set the checkpoint
- * timestamp we can end up missing updates in our checkpoint.
+ * Wait for any transactions that are supposed to be stable to finish committing. This prevents
+ * a race where a transaction can begin committing at a time past stable, and another thread
+ * moves stable past that transaction's commit time, and we start checkpointing before it
+ * finishes; we need it in this checkpoint, but it won't be unless we wait for it. If we do,
+ * then this race reduces to the race described in the next comment.
+ *
+ * Note that arguably the proper solution for this race is to not allow stable to advance past a
+ * transaction that hasn't finished committing (or to make it wait instead of us) but that is
+ * not currently feasible.
+ */
+ if (use_timestamp && txn_global->meta_ckpt_timestamp != WT_TS_NONE) {
+ while (__wt_txn_checkpoint_cannot_start(session)) {
+ __wt_sleep(0, 100 * WT_THOUSAND);
+ WT_STAT_CONN_INCRV(session, txn_checkpoint_prep_wait, 100);
+ wait_time++;
+ }
+ /* Grumble (with timing data) if we had to wait more than five seconds. */
+ if (wait_time > 50)
+ __checkpoint_verbose_track(
+ session, "Finished waiting for necessary transactions to commit");
+ }
+
+ /*
+ * Refresh our snapshot here without publishing our shared ids to the world. This prevents a
+ * race where the application finishes committing a transaction and moves stable up to include
+ * that transaction in between when we began the checkpoint transaction and when we fetched
+ * stable. We want that transaction in the checkpoint, but it won't be unless we get a new
+ * snapshot.
*/
__wt_txn_bump_snapshot(session);
diff --git a/src/third_party/wiredtiger/src/txn/txn_timestamp.c b/src/third_party/wiredtiger/src/txn/txn_timestamp.c
index 160f638a72c..86df5e4339d 100644
--- a/src/third_party/wiredtiger/src/txn/txn_timestamp.c
+++ b/src/third_party/wiredtiger/src/txn/txn_timestamp.c
@@ -1049,7 +1049,8 @@ __wt_txn_publish_durable_timestamp(WT_SESSION_IMPL *session)
* If we know for a fact that this is a prepared transaction and we only have a commit
* timestamp, don't add to the durable queue. If we poll all_durable after setting the
* commit timestamp of a prepared transaction, that prepared transaction should NOT be
- * visible.
+ * visible. Note: this only happens when the commit timestamp is set in advance with
+ * timestamp_transaction; at commit time a durable timestamp is required.
*/
if (F_ISSET(txn, WT_TXN_PREPARE))
return;
@@ -1104,3 +1105,50 @@ __wt_txn_clear_read_timestamp(WT_SESSION_IMPL *session)
}
txn_shared->read_timestamp = WT_TS_NONE;
}
+
+/*
+ * __wt_txn_checkpoint_cannot_start --
+ *     Return true if there's a transaction we need to wait for: one that is running and whose
+ *     durable timestamp is set and is at or before the checkpoint timestamp. The shared
+ *     transaction table is scanned while holding the global transaction read lock.
+ */
+bool
+__wt_txn_checkpoint_cannot_start(WT_SESSION_IMPL *session)
+{
+    WT_CONNECTION_IMPL *conn;
+    WT_TXN_GLOBAL *txn_global;
+    WT_TXN_SHARED *txn_shared;
+    wt_timestamp_t durable_ts;
+    uint32_t i, session_count;
+
+    conn = S2C(session);
+    txn_global = &conn->txn_global;
+
+    /* We're going to scan the table: wait for the lock. */
+    __wt_readlock(session, &txn_global->rwlock);
+
+    /* Walk the array of concurrent transactions. */
+    WT_ORDERED_READ(session_count, conn->session_cnt);
+    WT_STAT_CONN_INCR(session, txn_walk_sessions);
+    for (i = 0, txn_shared = txn_global->txn_shared_list; i < session_count; i++, txn_shared++) {
+        WT_STAT_CONN_INCR(session, txn_sessions_walked);
+
+        if (txn_shared->id == WT_TXN_NONE)
+            continue;
+
+        /*
+         * FUTURE: there is currently no way to tell if a transaction has started committing, or has
+         * only been assigned a durable or commit timestamp with timestamp_transaction(). It would
+         * be better not to wait for transactions that haven't actually started committing yet.
+         */
+        __txn_get_durable_timestamp(txn_shared, &durable_ts);
+
+        if (durable_ts != WT_TS_NONE && durable_ts <= txn_global->meta_ckpt_timestamp) {
+            __wt_readunlock(session, &txn_global->rwlock);
+            return (true);
+        }
+    }
+
+    __wt_readunlock(session, &txn_global->rwlock);
+    return (false);
+}
diff --git a/src/third_party/wiredtiger/test/suite/test_checkpoint_snapshot02.py b/src/third_party/wiredtiger/test/suite/test_checkpoint_snapshot02.py
index 89a7faddfc0..98335a00f9b 100644
--- a/src/third_party/wiredtiger/test/suite/test_checkpoint_snapshot02.py
+++ b/src/third_party/wiredtiger/test/suite/test_checkpoint_snapshot02.py
@@ -215,8 +215,18 @@ class test_checkpoint_snapshot02(wttest.WiredTigerTestCase):
ckpt = checkpoint_thread(self.conn, done)
try:
ckpt.start()
- # Sleep for sometime so that checkpoint starts before committing last transaction.
- time.sleep(2)
+
+ # Wait for checkpoint to start before committing last transaction.
+ # Note: because we assigned the transaction a commit timestamp before the
+ # checkpoint timestamp, the checkpoint will wait for us to finish committing.
+ # But that happens after it starts running according to the stat.
+ ckpt_started = 0
+ while not ckpt_started:
+ stat_cursor = self.session.open_cursor('statistics:', None, None)
+ ckpt_started = stat_cursor[stat.conn.txn_checkpoint_running][2]
+ stat_cursor.close()
+ time.sleep(1)
+
session1.commit_transaction()
finally:
@@ -262,6 +272,8 @@ class test_checkpoint_snapshot02(wttest.WiredTigerTestCase):
cursor2.set_key(ds.key(i))
cursor2.set_value(self.valuea)
self.assertEqual(cursor2.insert(), 0)
+
+ # Give the first transaction (which has no contents) a timestamp.
session1.timestamp_transaction('commit_timestamp=' + self.timestamp_str(30))
# Set stable timestamp to 40
@@ -272,8 +284,15 @@ class test_checkpoint_snapshot02(wttest.WiredTigerTestCase):
ckpt = checkpoint_thread(self.conn, done)
try:
ckpt.start()
- # Sleep for sometime so that checkpoint starts before committing last transaction.
- time.sleep(2)
+
+ # Wait for checkpoint to start before committing last transaction.
+ ckpt_started = 0
+ while not ckpt_started:
+ stat_cursor = self.session.open_cursor('statistics:', None, None)
+ ckpt_started = stat_cursor[stat.conn.txn_checkpoint_running][2]
+ stat_cursor.close()
+ time.sleep(1)
+
session2.commit_transaction()
finally: