diff options
author | Chenhao Qu <chenhao.qu@mongodb.com> | 2022-04-13 01:46:10 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2022-04-13 02:12:41 +0000 |
commit | 583df21242096af2467a4b94ff9cb2a86d55bbaf (patch) | |
tree | 305d3c8b487e9fec752314d003bfea961eb8ebf8 | |
parent | 975b6cf999f3901fabd9f31ed7c2b0297fe852ec (diff) | |
download | mongo-583df21242096af2467a4b94ff9cb2a86d55bbaf.tar.gz |
Import wiredtiger: 07cee370d83fd1c90f4ecf6781331db020960323 from branch mongodb-master
ref: d071d4624a..07cee370d8
for: 6.0.0-rc0
WT-9041 checkpoint can miss transactions with commit times before stable
-rw-r--r-- | src/third_party/wiredtiger/dist/stat_data.py | 1 |
-rw-r--r-- | src/third_party/wiredtiger/import.data | 2 |
-rw-r--r-- | src/third_party/wiredtiger/src/include/extern.h | 2 |
-rw-r--r-- | src/third_party/wiredtiger/src/include/stat.h | 1 |
-rw-r--r-- | src/third_party/wiredtiger/src/include/wiredtiger.in | 44 |
-rw-r--r-- | src/third_party/wiredtiger/src/support/stat.c | 3 |
-rw-r--r-- | src/third_party/wiredtiger/src/txn/txn_ckpt.c | 37 |
-rw-r--r-- | src/third_party/wiredtiger/src/txn/txn_timestamp.c | 50 |
-rw-r--r-- | src/third_party/wiredtiger/test/suite/test_checkpoint_snapshot02.py | 27 |
9 files changed, 135 insertions, 32 deletions
diff --git a/src/third_party/wiredtiger/dist/stat_data.py b/src/third_party/wiredtiger/dist/stat_data.py index da36da590a5..e5fbb183100 100644 --- a/src/third_party/wiredtiger/dist/stat_data.py +++ b/src/third_party/wiredtiger/dist/stat_data.py @@ -568,6 +568,7 @@ conn_stats = [ TxnStat('txn_checkpoint_prep_recent', 'transaction checkpoint prepare most recent time (msecs)', 'no_clear,no_scale'), TxnStat('txn_checkpoint_prep_running', 'transaction checkpoint prepare currently running', 'no_clear,no_scale'), TxnStat('txn_checkpoint_prep_total', 'transaction checkpoint prepare total time (msecs)', 'no_clear,no_scale'), + TxnStat('txn_checkpoint_prep_wait', 'transaction checkpoint prepare wait time (msecs)', 'no_clear,no_scale'), TxnStat('txn_checkpoint_running', 'transaction checkpoint currently running', 'no_clear,no_scale'), TxnStat('txn_checkpoint_running_hs', 'transaction checkpoint currently running for history store file', 'no_clear,no_scale'), TxnStat('txn_checkpoint_scrub_target', 'transaction checkpoint scrub dirty target', 'no_clear,no_scale'), diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index 6b2d2e15fb5..ed796a51ed0 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -2,5 +2,5 @@ "vendor": "wiredtiger", "github": "wiredtiger/wiredtiger.git", "branch": "mongodb-master", - "commit": "d071d4624a0bb5005d9968553dd493c15a3400da" + "commit": "07cee370d83fd1c90f4ecf6781331db020960323" } diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index 7ebf7ba83e1..6f35d92d0bb 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -37,6 +37,8 @@ extern bool __wt_rwlock_islocked(WT_SESSION_IMPL *session, WT_RWLOCK *l) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern bool __wt_txn_active(WT_SESSION_IMPL *session, uint64_t txnid) 
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern bool __wt_txn_checkpoint_cannot_start(WT_SESSION_IMPL *session) + WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern char *__wt_time_aggregate_to_string(WT_TIME_AGGREGATE *ta, char *ta_string) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern char *__wt_time_point_to_string(wt_timestamp_t ts, wt_timestamp_t durable_ts, diff --git a/src/third_party/wiredtiger/src/include/stat.h b/src/third_party/wiredtiger/src/include/stat.h index 2b80cca70c0..4657d2ebebe 100644 --- a/src/third_party/wiredtiger/src/include/stat.h +++ b/src/third_party/wiredtiger/src/include/stat.h @@ -827,6 +827,7 @@ struct __wt_connection_stats { int64_t txn_checkpoint_prep_min; int64_t txn_checkpoint_prep_recent; int64_t txn_checkpoint_prep_total; + int64_t txn_checkpoint_prep_wait; int64_t txn_checkpoint_scrub_target; int64_t txn_checkpoint_scrub_time; int64_t txn_checkpoint_time_total; diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in index 0418417ff1f..558f34ad0c3 100644 --- a/src/third_party/wiredtiger/src/include/wiredtiger.in +++ b/src/third_party/wiredtiger/src/include/wiredtiger.in @@ -6359,63 +6359,65 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); #define WT_STAT_CONN_TXN_CHECKPOINT_PREP_RECENT 1493 /*! transaction: transaction checkpoint prepare total time (msecs) */ #define WT_STAT_CONN_TXN_CHECKPOINT_PREP_TOTAL 1494 +/*! transaction: transaction checkpoint prepare wait time (msecs) */ +#define WT_STAT_CONN_TXN_CHECKPOINT_PREP_WAIT 1495 /*! transaction: transaction checkpoint scrub dirty target */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1495 +#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1496 /*! transaction: transaction checkpoint scrub time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1496 +#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1497 /*! 
transaction: transaction checkpoint total time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1497 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1498 /*! transaction: transaction checkpoints */ -#define WT_STAT_CONN_TXN_CHECKPOINT 1498 +#define WT_STAT_CONN_TXN_CHECKPOINT 1499 /*! transaction: transaction checkpoints due to obsolete pages */ -#define WT_STAT_CONN_TXN_CHECKPOINT_OBSOLETE_APPLIED 1499 +#define WT_STAT_CONN_TXN_CHECKPOINT_OBSOLETE_APPLIED 1500 /*! * transaction: transaction checkpoints skipped because database was * clean */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1500 +#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1501 /*! transaction: transaction failures due to history store */ -#define WT_STAT_CONN_TXN_FAIL_CACHE 1501 +#define WT_STAT_CONN_TXN_FAIL_CACHE 1502 /*! * transaction: transaction fsync calls for checkpoint after allocating * the transaction ID */ -#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1502 +#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1503 /*! * transaction: transaction fsync duration for checkpoint after * allocating the transaction ID (usecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1503 +#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1504 /*! transaction: transaction range of IDs currently pinned */ -#define WT_STAT_CONN_TXN_PINNED_RANGE 1504 +#define WT_STAT_CONN_TXN_PINNED_RANGE 1505 /*! transaction: transaction range of IDs currently pinned by a checkpoint */ -#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1505 +#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1506 /*! transaction: transaction range of timestamps currently pinned */ -#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP 1506 +#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP 1507 /*! transaction: transaction range of timestamps pinned by a checkpoint */ -#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_CHECKPOINT 1507 +#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_CHECKPOINT 1508 /*! 
* transaction: transaction range of timestamps pinned by the oldest * active read timestamp */ -#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_READER 1508 +#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_READER 1509 /*! * transaction: transaction range of timestamps pinned by the oldest * timestamp */ -#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_OLDEST 1509 +#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_OLDEST 1510 /*! transaction: transaction read timestamp of the oldest active reader */ -#define WT_STAT_CONN_TXN_TIMESTAMP_OLDEST_ACTIVE_READ 1510 +#define WT_STAT_CONN_TXN_TIMESTAMP_OLDEST_ACTIVE_READ 1511 /*! transaction: transaction rollback to stable currently running */ -#define WT_STAT_CONN_TXN_ROLLBACK_TO_STABLE_RUNNING 1511 +#define WT_STAT_CONN_TXN_ROLLBACK_TO_STABLE_RUNNING 1512 /*! transaction: transaction walk of concurrent sessions */ -#define WT_STAT_CONN_TXN_WALK_SESSIONS 1512 +#define WT_STAT_CONN_TXN_WALK_SESSIONS 1513 /*! transaction: transactions committed */ -#define WT_STAT_CONN_TXN_COMMIT 1513 +#define WT_STAT_CONN_TXN_COMMIT 1514 /*! transaction: transactions rolled back */ -#define WT_STAT_CONN_TXN_ROLLBACK 1514 +#define WT_STAT_CONN_TXN_ROLLBACK 1515 /*! transaction: update conflicts */ -#define WT_STAT_CONN_TXN_UPDATE_CONFLICT 1515 +#define WT_STAT_CONN_TXN_UPDATE_CONFLICT 1516 /*! 
* @} diff --git a/src/third_party/wiredtiger/src/support/stat.c b/src/third_party/wiredtiger/src/support/stat.c index d26fab52cea..2b3c6645a9a 100644 --- a/src/third_party/wiredtiger/src/support/stat.c +++ b/src/third_party/wiredtiger/src/support/stat.c @@ -1559,6 +1559,7 @@ static const char *const __stats_connection_desc[] = { "transaction: transaction checkpoint prepare min time (msecs)", "transaction: transaction checkpoint prepare most recent time (msecs)", "transaction: transaction checkpoint prepare total time (msecs)", + "transaction: transaction checkpoint prepare wait time (msecs)", "transaction: transaction checkpoint scrub dirty target", "transaction: transaction checkpoint scrub time (msecs)", "transaction: transaction checkpoint total time (msecs)", @@ -2116,6 +2117,7 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) /* not clearing txn_checkpoint_prep_min */ /* not clearing txn_checkpoint_prep_recent */ /* not clearing txn_checkpoint_prep_total */ + /* not clearing txn_checkpoint_prep_wait */ /* not clearing txn_checkpoint_scrub_target */ /* not clearing txn_checkpoint_scrub_time */ /* not clearing txn_checkpoint_time_total */ @@ -2693,6 +2695,7 @@ __wt_stat_connection_aggregate(WT_CONNECTION_STATS **from, WT_CONNECTION_STATS * to->txn_checkpoint_prep_min += WT_STAT_READ(from, txn_checkpoint_prep_min); to->txn_checkpoint_prep_recent += WT_STAT_READ(from, txn_checkpoint_prep_recent); to->txn_checkpoint_prep_total += WT_STAT_READ(from, txn_checkpoint_prep_total); + to->txn_checkpoint_prep_wait += WT_STAT_READ(from, txn_checkpoint_prep_wait); to->txn_checkpoint_scrub_target += WT_STAT_READ(from, txn_checkpoint_scrub_target); to->txn_checkpoint_scrub_time += WT_STAT_READ(from, txn_checkpoint_scrub_time); to->txn_checkpoint_time_total += WT_STAT_READ(from, txn_checkpoint_time_total); diff --git a/src/third_party/wiredtiger/src/txn/txn_ckpt.c b/src/third_party/wiredtiger/src/txn/txn_ckpt.c index 84140d18c18..1e5de6e3391 100644 --- 
a/src/third_party/wiredtiger/src/txn/txn_ckpt.c +++ b/src/third_party/wiredtiger/src/txn/txn_ckpt.c @@ -524,6 +524,7 @@ __checkpoint_prepare(WT_SESSION_IMPL *session, bool *trackingp, const char *cfg[ WT_TXN_GLOBAL *txn_global; WT_TXN_SHARED *txn_shared; uint64_t original_snap_min; + u_int wait_time; const char *txn_cfg[] = { WT_CONFIG_BASE(session, WT_SESSION_begin_transaction), "isolation=snapshot", NULL}; bool use_timestamp; @@ -532,6 +533,7 @@ __checkpoint_prepare(WT_SESSION_IMPL *session, bool *trackingp, const char *cfg[ txn = session->txn; txn_global = &conn->txn_global; txn_shared = WT_SESSION_TXN_SHARED(session); + wait_time = 0; WT_RET(__wt_config_gets(session, cfg, "use_timestamp", &cval)); use_timestamp = (cval.val != 0); @@ -604,7 +606,8 @@ __checkpoint_prepare(WT_SESSION_IMPL *session, bool *trackingp, const char *cfg[ * Set the checkpoint transaction's timestamp, if requested. * * We rely on having the global transaction data locked so the oldest timestamp can't move past - * the stable timestamp. + * the stable timestamp we select until our read timestamp is in place. (Then it contributes to + * the pinned timestamp computation so our reads remain safe.) */ WT_ASSERT(session, !F_ISSET(txn, WT_TXN_HAS_TS_COMMIT | WT_TXN_SHARED_TS_DURABLE | WT_TXN_SHARED_TS_READ)); @@ -658,10 +661,34 @@ __checkpoint_prepare(WT_SESSION_IMPL *session, bool *trackingp, const char *cfg[ } /* - * Refresh our snapshot here without publishing our shared ids to the world, doing so prevents - * us from racing with the stable timestamp moving ahead of current snapshot. i.e. if the stable - * timestamp moves after we begin the checkpoint transaction but before we set the checkpoint - * timestamp we can end up missing updates in our checkpoint. + * Wait for any transactions that are supposed to be stable to finish committing. 
This prevents + * a race where a transaction can begin committing at a time past stable, and another thread + * moves stable past that transaction's commit time, and we start checkpointing before it + * finishes; we need it in this checkpoint, but it won't be unless we wait for it. If we do, + * then this race reduces to the race described in the next comment. + * + * Note that arguably the proper solution for this race is to not allow stable to advance past a + * transaction that hasn't finished committing (or to make it wait instead of us) but that is + * not currently feasible. + */ + if (use_timestamp && txn_global->meta_ckpt_timestamp != WT_TS_NONE) { + while (__wt_txn_checkpoint_cannot_start(session)) { + __wt_sleep(0, 100 * WT_THOUSAND); + WT_STAT_CONN_INCRV(session, txn_checkpoint_prep_wait, 100); + wait_time++; + } + /* Grumble (with timing data) if we had to wait more than five seconds. */ + if (wait_time > 50) + __checkpoint_verbose_track( + session, "Finished waiting for necessary transactions to commit"); + } + + /* + * Refresh our snapshot here without publishing our shared ids to the world. This prevents a + * race where the application finishes committing a transaction and moves stable up to include + * that transaction in between when we began the checkpoint transaction and when we fetched + * stable. We want that transaction in the checkpoint, but it won't be unless we get a new + * snapshot. */ __wt_txn_bump_snapshot(session); diff --git a/src/third_party/wiredtiger/src/txn/txn_timestamp.c b/src/third_party/wiredtiger/src/txn/txn_timestamp.c index 160f638a72c..86df5e4339d 100644 --- a/src/third_party/wiredtiger/src/txn/txn_timestamp.c +++ b/src/third_party/wiredtiger/src/txn/txn_timestamp.c @@ -1049,7 +1049,8 @@ __wt_txn_publish_durable_timestamp(WT_SESSION_IMPL *session) * If we know for a fact that this is a prepared transaction and we only have a commit * timestamp, don't add to the durable queue. 
If we poll all_durable after setting the * commit timestamp of a prepared transaction, that prepared transaction should NOT be - * visible. + * visible. Note: this only happens when the commit timestamp is set in advance with + * timestamp_transaction; at commit time a durable timestamp is required. */ if (F_ISSET(txn, WT_TXN_PREPARE)) return; @@ -1104,3 +1105,50 @@ __wt_txn_clear_read_timestamp(WT_SESSION_IMPL *session) } txn_shared->read_timestamp = WT_TS_NONE; } + +/* + * __wt_txn_checkpoint_cannot_start -- + * Return true if there's a transaction we need to wait for. This means transactions that have + * begun committing with durable timestamp at or before the checkpoint timestamp, but have not + * yet finished. + */ +bool +__wt_txn_checkpoint_cannot_start(WT_SESSION_IMPL *session) +{ + WT_CONNECTION_IMPL *conn; + WT_TXN_GLOBAL *txn_global; + WT_TXN_SHARED *txn_shared; + wt_timestamp_t durable_ts; + uint32_t i, session_count; + + conn = S2C(session); + txn_global = &conn->txn_global; + + /* We're going to scan the table: wait for the lock. */ + __wt_readlock(session, &txn_global->rwlock); + + /* Walk the array of concurrent transactions. */ + WT_ORDERED_READ(session_count, conn->session_cnt); + WT_STAT_CONN_INCR(session, txn_walk_sessions); + for (i = 0, txn_shared = txn_global->txn_shared_list; i < session_count; i++, txn_shared++) { + WT_STAT_CONN_INCR(session, txn_sessions_walked); + + if (txn_shared->id == WT_TXN_NONE) + continue; + + /* + * FUTURE: there is currently no way to tell if a transaction has started committing, or has + * only been assigned a durable or commit timestamp with timestamp_transaction(). It would + * be better not to wait for transactions that haven't actually started committing yet. 
+ */ + __txn_get_durable_timestamp(txn_shared, &durable_ts); + + if (durable_ts != WT_TXN_NONE && durable_ts <= txn_global->meta_ckpt_timestamp) { + __wt_readunlock(session, &txn_global->rwlock); + return (true); + } + } + + __wt_readunlock(session, &txn_global->rwlock); + return (false); +} diff --git a/src/third_party/wiredtiger/test/suite/test_checkpoint_snapshot02.py b/src/third_party/wiredtiger/test/suite/test_checkpoint_snapshot02.py index 89a7faddfc0..98335a00f9b 100644 --- a/src/third_party/wiredtiger/test/suite/test_checkpoint_snapshot02.py +++ b/src/third_party/wiredtiger/test/suite/test_checkpoint_snapshot02.py @@ -215,8 +215,18 @@ class test_checkpoint_snapshot02(wttest.WiredTigerTestCase): ckpt = checkpoint_thread(self.conn, done) try: ckpt.start() - # Sleep for sometime so that checkpoint starts before committing last transaction. - time.sleep(2) + + # Wait for checkpoint to start before committing last transaction. + # Note: because we assigned the transaction a commit timestamp before the + # checkpoint timestamp, the checkpoint will wait for us to finish committing. + # But that happens after it starts running according to the stat. + ckpt_started = 0 + while not ckpt_started: + stat_cursor = self.session.open_cursor('statistics:', None, None) + ckpt_started = stat_cursor[stat.conn.txn_checkpoint_running][2] + stat_cursor.close() + time.sleep(1) + session1.commit_transaction() finally: @@ -262,6 +272,8 @@ class test_checkpoint_snapshot02(wttest.WiredTigerTestCase): cursor2.set_key(ds.key(i)) cursor2.set_value(self.valuea) self.assertEqual(cursor2.insert(), 0) + + # Give the first transaction (which has no contents) a timestamp. 
session1.timestamp_transaction('commit_timestamp=' + self.timestamp_str(30)) # Set stable timestamp to 40 @@ -272,8 +284,15 @@ class test_checkpoint_snapshot02(wttest.WiredTigerTestCase): ckpt = checkpoint_thread(self.conn, done) try: ckpt.start() - # Sleep for sometime so that checkpoint starts before committing last transaction. - time.sleep(2) + + # Wait for checkpoint to start before committing last transaction. + ckpt_started = 0 + while not ckpt_started: + stat_cursor = self.session.open_cursor('statistics:', None, None) + ckpt_started = stat_cursor[stat.conn.txn_checkpoint_running][2] + stat_cursor.close() + time.sleep(1) + session2.commit_transaction() finally: |