diff options
13 files changed, 60 insertions, 53 deletions
diff --git a/buildscripts/resmokeconfig/suites/concurrency_replication.yml b/buildscripts/resmokeconfig/suites/concurrency_replication.yml index 4a05e4a6a7f..a2270cc4c32 100644 --- a/buildscripts/resmokeconfig/suites/concurrency_replication.yml +++ b/buildscripts/resmokeconfig/suites/concurrency_replication.yml @@ -18,6 +18,7 @@ selector: executor: archive: hooks: + - CheckReplDBHashInBackground - CheckReplDBHash - ValidateCollections tests: true @@ -30,7 +31,7 @@ executor: # validating the entire contents of the collection. # # TODO SERVER-26466: Add CheckReplOplogs hook to the concurrency suite. - # TODO SERVER-39321: Re-enable and archive the CheckReplDBHashInBackground hook. + - class: CheckReplDBHashInBackground - class: CheckReplDBHash - class: ValidateCollections - class: CleanupConcurrencyWorkloads diff --git a/buildscripts/resmokeconfig/suites/concurrency_replication_multi_stmt_txn.yml b/buildscripts/resmokeconfig/suites/concurrency_replication_multi_stmt_txn.yml index cc33048a6e1..9291a7eb561 100644 --- a/buildscripts/resmokeconfig/suites/concurrency_replication_multi_stmt_txn.yml +++ b/buildscripts/resmokeconfig/suites/concurrency_replication_multi_stmt_txn.yml @@ -38,6 +38,7 @@ selector: executor: archive: hooks: + - CheckReplDBHashInBackground - CheckReplDBHash - ValidateCollections tests: true @@ -55,7 +56,7 @@ executor: # validating the entire contents of the collection. # # TODO SERVER-26466: Add CheckReplOplogs hook to the concurrency suite. - # TODO SERVER-39321: Re-enable and archive the CheckReplDBHashInBackground hook. + - class: CheckReplDBHashInBackground - class: CheckReplDBHash - class: ValidateCollections - class: CleanupConcurrencyWorkloads diff --git a/buildscripts/resmokeconfig/suites/concurrency_replication_multi_stmt_txn_ubsan.yml b/buildscripts/resmokeconfig/suites/concurrency_replication_multi_stmt_txn_ubsan.yml index 4a9883de4f3..0e5a2754774 100644 --- a/buildscripts/resmokeconfig/suites/concurrency_replication_multi_stmt_txn_ubsan.yml +++ b/buildscripts/resmokeconfig/suites/concurrency_replication_multi_stmt_txn_ubsan.yml @@ -38,6 +38,7 @@ selector: executor: archive: hooks: + - CheckReplDBHashInBackground - CheckReplDBHash - ValidateCollections tests: true @@ -55,7 +56,7 @@ executor: # validating the entire contents of the collection. # # TODO SERVER-26466: Add CheckReplOplogs hook to the concurrency suite. - # TODO SERVER-39321: Re-enable and archive the CheckReplDBHashInBackground hook. + - class: CheckReplDBHashInBackground - class: CheckReplDBHash - class: ValidateCollections - class: CleanEveryN diff --git a/buildscripts/resmokeconfig/suites/concurrency_replication_ubsan.yml b/buildscripts/resmokeconfig/suites/concurrency_replication_ubsan.yml index 07e6113f330..d535361c29f 100644 --- a/buildscripts/resmokeconfig/suites/concurrency_replication_ubsan.yml +++ b/buildscripts/resmokeconfig/suites/concurrency_replication_ubsan.yml @@ -20,6 +20,7 @@ executor: hooks: - CheckReplDBHash - ValidateCollections + - CheckReplDBHashInBackground tests: true config: shell_options: @@ -30,7 +31,7 @@ executor: # validating the entire contents of the collection. # # TODO SERVER-26466: Add CheckReplOplogs hook to the concurrency suite. - # TODO SERVER-39321: Re-enable and archive the CheckReplDBHashInBackground hook. + - class: CheckReplDBHashInBackground - class: CheckReplDBHash - class: ValidateCollections - class: CleanEveryN diff --git a/buildscripts/resmokeconfig/suites/concurrency_simultaneous_replication.yml b/buildscripts/resmokeconfig/suites/concurrency_simultaneous_replication.yml index 9983712e0b2..e073ab2a05d 100644 --- a/buildscripts/resmokeconfig/suites/concurrency_simultaneous_replication.yml +++ b/buildscripts/resmokeconfig/suites/concurrency_simultaneous_replication.yml @@ -36,6 +36,7 @@ selector: executor: archive: hooks: + - CheckReplDBHashInBackground - CheckReplDBHash - ValidateCollections tests: true @@ -51,7 +52,7 @@ executor: # validating the entire contents of the collection. # # TODO SERVER-26466: Add CheckReplOplogs hook to the concurrency suite. - # TODO SERVER-39321: Re-enable and archive the CheckReplDBHashInBackground hook. + - class: CheckReplDBHashInBackground - class: CheckReplDBHash - class: ValidateCollections - class: CleanupConcurrencyWorkloads diff --git a/buildscripts/resmokeconfig/suites/logical_session_cache_replication_100ms_refresh_jscore_passthrough.yml b/buildscripts/resmokeconfig/suites/logical_session_cache_replication_100ms_refresh_jscore_passthrough.yml index 81e6b26b006..0b3d1e5d440 100644 --- a/buildscripts/resmokeconfig/suites/logical_session_cache_replication_100ms_refresh_jscore_passthrough.yml +++ b/buildscripts/resmokeconfig/suites/logical_session_cache_replication_100ms_refresh_jscore_passthrough.yml @@ -12,12 +12,6 @@ selector: # These tests change the featureCompatibilityVersion which makes it so transactions aren't # supported. - jstests/core/txns/abort_transactions_on_FCV_downgrade.js - # TODO SERVER-39321: Remove the following block of blacklists. - # These tests change the transactionLifetimeLimitSeconds server parameter which conflicts with how - # the CheckReplDBHashInBackground hook doesn't want transactions to be reaped while it is running. - - jstests/core/txns/abort_expired_transaction.js - - jstests/core/txns/abort_transaction_thread_does_not_block_on_locks.js - - jstests/core/txns/kill_op_on_txn_expiry.js # The set_param1.js test attempts to compare the response from running the {getParameter: "*"} # command multiple times, which may observe the change to the "transactionLifetimeLimitSeconds" # server parameter. @@ -37,6 +31,7 @@ selector: executor: archive: hooks: + - CheckReplDBHashInBackground - CheckReplDBHash - CheckReplOplogs - ValidateCollections @@ -48,7 +43,7 @@ executor: # The CheckReplDBHash hook waits until all operations have replicated to and have been applied # on the secondaries, so we run the ValidateCollections hook after it to ensure we're # validating the entire contents of the collection. - # TODO SERVER-39321: Re-enable and archive the CheckReplDBHashInBackground hook. + - class: CheckReplDBHashInBackground - class: CheckReplOplogs - class: CheckReplDBHash - class: ValidateCollections diff --git a/buildscripts/resmokeconfig/suites/logical_session_cache_replication_10sec_refresh_jscore_passthrough.yml b/buildscripts/resmokeconfig/suites/logical_session_cache_replication_10sec_refresh_jscore_passthrough.yml index 93e35ee865d..b6b69c8e990 100644 --- a/buildscripts/resmokeconfig/suites/logical_session_cache_replication_10sec_refresh_jscore_passthrough.yml +++ b/buildscripts/resmokeconfig/suites/logical_session_cache_replication_10sec_refresh_jscore_passthrough.yml @@ -12,12 +12,6 @@ selector: # These tests change the featureCompatibilityVersion which makes it so transactions aren't # supported. - jstests/core/txns/abort_transactions_on_FCV_downgrade.js - # TODO SERVER-39321: Remove the following block of blacklists. - # These tests change the transactionLifetimeLimitSeconds server parameter which conflicts with how - # the CheckReplDBHashInBackground hook doesn't want transactions to be reaped while it is running. - - jstests/core/txns/abort_expired_transaction.js - - jstests/core/txns/abort_transaction_thread_does_not_block_on_locks.js - - jstests/core/txns/kill_op_on_txn_expiry.js # The set_param1.js test attempts to compare the response from running the {getParameter: "*"} # command multiple times, which may observe the change to the "transactionLifetimeLimitSeconds" # server parameter. @@ -37,6 +31,7 @@ selector: executor: archive: hooks: + - CheckReplDBHashInBackground - CheckReplDBHash - CheckReplOplogs - ValidateCollections @@ -48,7 +43,7 @@ executor: # The CheckReplDBHash hook waits until all operations have replicated to and have been applied # on the secondaries, so we run the ValidateCollections hook after it to ensure we're # validating the entire contents of the collection. - # TODO SERVER-39321: Re-enable and archive the CheckReplDBHashInBackground hook. + - class: CheckReplDBHashInBackground - class: CheckReplOplogs - class: CheckReplDBHash - class: ValidateCollections diff --git a/buildscripts/resmokeconfig/suites/logical_session_cache_replication_1sec_refresh_jscore_passthrough.yml b/buildscripts/resmokeconfig/suites/logical_session_cache_replication_1sec_refresh_jscore_passthrough.yml index c3859a94dd8..3ae31db3873 100644 --- a/buildscripts/resmokeconfig/suites/logical_session_cache_replication_1sec_refresh_jscore_passthrough.yml +++ b/buildscripts/resmokeconfig/suites/logical_session_cache_replication_1sec_refresh_jscore_passthrough.yml @@ -12,12 +12,6 @@ selector: # These tests change the featureCompatibilityVersion which makes it so transactions aren't # supported. - jstests/core/txns/abort_transactions_on_FCV_downgrade.js - # TODO SERVER-39321: Remove the following block of blacklists. - # These tests change the transactionLifetimeLimitSeconds server parameter which conflicts with how - # the CheckReplDBHashInBackground hook doesn't want transactions to be reaped while it is running. - - jstests/core/txns/abort_expired_transaction.js - - jstests/core/txns/abort_transaction_thread_does_not_block_on_locks.js - - jstests/core/txns/kill_op_on_txn_expiry.js # The set_param1.js test attempts to compare the response from running the {getParameter: "*"} # command multiple times, which may observe the change to the "transactionLifetimeLimitSeconds" # server parameter. @@ -37,6 +31,7 @@ selector: executor: archive: hooks: + - CheckReplDBHashInBackground - CheckReplDBHash - CheckReplOplogs - ValidateCollections @@ -48,7 +43,7 @@ executor: # The CheckReplDBHash hook waits until all operations have replicated to and have been applied # on the secondaries, so we run the ValidateCollections hook after it to ensure we're # validating the entire contents of the collection. - # TODO SERVER-39321: Re-enable and archive the CheckReplDBHashInBackground hook. + - class: CheckReplDBHashInBackground - class: CheckReplOplogs - class: CheckReplDBHash - class: ValidateCollections diff --git a/buildscripts/resmokeconfig/suites/logical_session_cache_replication_default_refresh_jscore_passthrough.yml b/buildscripts/resmokeconfig/suites/logical_session_cache_replication_default_refresh_jscore_passthrough.yml index 83c6b88f289..c4417ac733c 100644 --- a/buildscripts/resmokeconfig/suites/logical_session_cache_replication_default_refresh_jscore_passthrough.yml +++ b/buildscripts/resmokeconfig/suites/logical_session_cache_replication_default_refresh_jscore_passthrough.yml @@ -12,12 +12,6 @@ selector: # These tests change the featureCompatibilityVersion which makes it so transactions aren't # supported. - jstests/core/txns/abort_transactions_on_FCV_downgrade.js - # TODO SERVER-39321: Remove the following block of blacklists. - # These tests change the transactionLifetimeLimitSeconds server parameter which conflicts with how - # the CheckReplDBHashInBackground hook doesn't want transactions to be reaped while it is running. - - jstests/core/txns/abort_expired_transaction.js - - jstests/core/txns/abort_transaction_thread_does_not_block_on_locks.js - - jstests/core/txns/kill_op_on_txn_expiry.js # The set_param1.js test attempts to compare the response from running the {getParameter: "*"} # command multiple times, which may observe the change to the "transactionLifetimeLimitSeconds" # server parameter. @@ -37,6 +31,7 @@ selector: executor: archive: hooks: + - CheckReplDBHashInBackground - CheckReplDBHash - CheckReplOplogs - ValidateCollections @@ -48,7 +43,7 @@ executor: # The CheckReplDBHash hook waits until all operations have replicated to and have been applied # on the secondaries, so we run the ValidateCollections hook after it to ensure we're # validating the entire contents of the collection. - # TODO SERVER-39321: Re-enable and archive the CheckReplDBHashInBackground hook. + - class: CheckReplDBHashInBackground - class: CheckReplOplogs - class: CheckReplDBHash - class: ValidateCollections diff --git a/buildscripts/resmokeconfig/suites/replica_sets_jscore_passthrough.yml b/buildscripts/resmokeconfig/suites/replica_sets_jscore_passthrough.yml index 502ad225144..ef37c323fc5 100644 --- a/buildscripts/resmokeconfig/suites/replica_sets_jscore_passthrough.yml +++ b/buildscripts/resmokeconfig/suites/replica_sets_jscore_passthrough.yml @@ -25,6 +25,7 @@ selector: executor: archive: hooks: + - CheckReplDBHashInBackground - CheckReplDBHash - CheckReplOplogs - ValidateCollections @@ -36,7 +37,7 @@ executor: # The CheckReplDBHash hook waits until all operations have replicated to and have been applied # on the secondaries, so we run the ValidateCollections hook after it to ensure we're # validating the entire contents of the collection. - # TODO SERVER-39321: Re-enable and archive the CheckReplDBHashInBackground hook. + - class: CheckReplDBHashInBackground - class: CheckReplOplogs - class: CheckReplDBHash - class: ValidateCollections @@ -47,11 +48,4 @@ executor: mongod_options: set_parameters: enableTestCommands: 1 - # TODO SERVER-39321: Remove the following block now that SERVER-35377 has been resolved. - # - # When running tests that intentionally trigger a DuplicateKeyError, we somehow end up - # performing an atClusterTime read in the CheckReplDBHashInBackground hook based on an - # operationTime that is greater than anything in the oplog. The periodic no-op writer must - # be enabled to ensure that we eventually reach the clusterTime we are waiting for. - writePeriodicNoops: 1 num_nodes: 2 diff --git a/buildscripts/resmokeconfig/suites/replica_sets_multi_oplog_txns_jscore_passthrough.yml b/buildscripts/resmokeconfig/suites/replica_sets_multi_oplog_txns_jscore_passthrough.yml index cf0e6dd95cb..7b39c059c64 100644 --- a/buildscripts/resmokeconfig/suites/replica_sets_multi_oplog_txns_jscore_passthrough.yml +++ b/buildscripts/resmokeconfig/suites/replica_sets_multi_oplog_txns_jscore_passthrough.yml @@ -19,6 +19,7 @@ selector: executor: archive: hooks: + - CheckReplDBHashInBackground - CheckReplDBHash - CheckReplOplogs - ValidateCollections @@ -30,7 +31,7 @@ executor: # The CheckReplDBHash hook waits until all operations have replicated to and have been applied # on the secondaries, so we run the ValidateCollections hook after it to ensure we're # validating the entire contents of the collection. - # TODO SERVER-39321: Re-enable and archive the CheckReplDBHashInBackground hook. + - class: CheckReplDBHashInBackground - class: CheckReplOplogs - class: CheckReplDBHash - class: ValidateCollections @@ -42,11 +43,4 @@ executor: set_parameters: enableTestCommands: 1 useMultipleOplogEntryFormatForTransactions: true - # TODO SERVER-39321: Remove the following block now that SERVER-35377 has been resolved. - # - # When running tests that intentionally trigger a DuplicateKeyError, we somehow end up - # performing an atClusterTime read in the CheckReplDBHashInBackground hook based on an - # operationTime that is greater than anything in the oplog. The periodic no-op writer must - # be enabled to ensure that we eventually reach the clusterTime we are waiting for. - writePeriodicNoops: 1 num_nodes: 2 diff --git a/jstests/hooks/run_check_repl_dbhash_background.js b/jstests/hooks/run_check_repl_dbhash_background.js index 8cd338b2d32..712ea6013a7 100644 --- a/jstests/hooks/run_check_repl_dbhash_background.js +++ b/jstests/hooks/run_check_repl_dbhash_background.js @@ -251,7 +251,9 @@ for (let dbName of dbNames) { let result; let clusterTime; + let previousClusterTime; let hasTransientError; + let performNoopWrite; // The isTransientError() function is responsible for setting hasTransientError to true. const isTransientError = (e) => { @@ -265,18 +267,36 @@ // ReplSetTest#getCollectionDiffUsingSessions() upon detecting a dbHash mismatch. It is // presumed to still useful to know that a bug exists even if we cannot get more // diagnostics for it. - if (e.code === ErrorCodes.Interrupted || e.code === ErrorCodes.SnapshotUnavailable) { + if (e.code === ErrorCodes.Interrupted) { hasTransientError = true; - return true; } - return false; + // Perform a no-op write to the primary if the clusterTime between each call remain + // the same and if we encounter the SnapshotUnavailable error as the secondaries minimum + // timestamp can be greater than the primaries minimum timestamp. + if (e.code === ErrorCodes.SnapshotUnavailable) { + if (bsonBinaryEqual(clusterTime, previousClusterTime)) { + performNoopWrite = true; + } + hasTransientError = true; + } + + // InvalidOptions can be returned when $_internalReadAtClusterTime is greater than the + // all-committed timestamp. As the dbHash command is running in the background at + // varying times, it's possible that we may run dbHash while a prepared transactions + // has yet to commit or abort. + if (e.code === ErrorCodes.InvalidOptions) { + hasTransientError = true; + } + + return hasTransientError; }; do { // SERVER-38928: Due to races around advancing last applied, there's technically no // guarantee that a primary will report a later operation time than its // secondaries. Perform the snapshot read at the latest reported operation time. + previousClusterTime = clusterTime; clusterTime = sessions[0].getOperationTime(); let signedClusterTime = sessions[0].getClusterTime(); for (let sess of sessions.slice(1)) { @@ -297,12 +317,19 @@ } hasTransientError = false; + performNoopWrite = false; try { result = checkCollectionHashesForDB(dbName, clusterTime); } catch (e) { if (isTransientError(e)) { - debugInfo.push({"transientError": e}); + if (performNoopWrite) { + const primarySession = sessions[0]; + assert.commandWorked(primarySession.getDatabase(dbName).adminCommand( + {appendOplogNote: 1, data: {}})); + } + + debugInfo.push({"transientError": e, "performNoopWrite": performNoopWrite}); continue; } diff --git a/src/mongo/db/commands/dbhash.cpp b/src/mongo/db/commands/dbhash.cpp index 3706db1e133..6a6ab9cd1a3 100644 --- a/src/mongo/db/commands/dbhash.cpp +++ b/src/mongo/db/commands/dbhash.cpp @@ -181,11 +181,18 @@ public: // We lock the entire database in S-mode in order to ensure that the contents will not // change for the snapshot. auto lockMode = LockMode::MODE_S; + boost::optional<ShouldNotConflictWithSecondaryBatchApplicationBlock> shouldNotConflictBlock; if (opCtx->recoveryUnit()->getTimestampReadSource() == RecoveryUnit::ReadSource::kProvided) { // However, if we are performing a read at a timestamp, then we only need to lock the // database in intent mode to ensure that none of the collections get dropped. lockMode = LockMode::MODE_IS; + + // Additionally, if we are performing a read at a timestamp, then we allow oplog + // application to proceed concurrently with the dbHash command. This is done + // to ensure a prepare conflict is able to eventually be resolved by processing a + // later commitTransaction or abortTransaction oplog entry. + shouldNotConflictBlock.emplace(opCtx->lockState()); } AutoGetDb autoDb(opCtx, ns, lockMode); Database* db = autoDb.getDb(); |