diff options
-rw-r--r-- | buildscripts/resmokeconfig/suites/concurrency_replication.yml | 5 | ||||
-rw-r--r-- | buildscripts/resmokeconfig/suites/concurrency_sharded_multi_stmt_txn_with_balancer.yml | 4 | ||||
-rw-r--r-- | buildscripts/resmokeconfig/suites/concurrency_sharded_replication.yml | 4 | ||||
-rw-r--r-- | buildscripts/resmokelib/testing/hooks/dbcheck_background.py | 73 | ||||
-rw-r--r-- | jstests/hooks/run_dbcheck_background.js | 162 | ||||
-rw-r--r-- | src/mongo/db/catalog/health_log.cpp | 1 | ||||
-rw-r--r-- | src/mongo/db/commands/dbcheck.cpp | 38 | ||||
-rw-r--r-- | src/mongo/db/concurrency/deferred_writer.cpp | 4 | ||||
-rw-r--r-- | src/mongo/db/repl/SConscript | 1 | ||||
-rw-r--r-- | src/mongo/db/repl/dbcheck.cpp | 13 | ||||
-rw-r--r-- | src/mongo/db/repl/dbcheck.h | 4 |
11 files changed, 305 insertions, 4 deletions
diff --git a/buildscripts/resmokeconfig/suites/concurrency_replication.yml b/buildscripts/resmokeconfig/suites/concurrency_replication.yml index 286c35413a1..ef63c635757 100644 --- a/buildscripts/resmokeconfig/suites/concurrency_replication.yml +++ b/buildscripts/resmokeconfig/suites/concurrency_replication.yml @@ -26,6 +26,7 @@ selector: executor: archive: hooks: + - RunDBCheckInBackground - CheckReplDBHashInBackground - ValidateCollectionsInBackground - CheckReplDBHash @@ -40,6 +41,7 @@ executor: # validating the entire contents of the collection. # # TODO SERVER-26466: Add CheckReplOplogs hook to the concurrency suite. + - class: RunDBCheckInBackground - class: CheckReplDBHashInBackground - class: ValidateCollectionsInBackground - class: CheckReplDBHash @@ -52,4 +54,7 @@ executor: set_parameters: enableTestCommands: 1 roleGraphInvalidationIsFatal: 1 + # TODO (SERVER-61754): dbCheck takes strong locks which causes transactions to timeout + maxTransactionLockRequestTimeoutMillis: 3000 + num_nodes: 3 diff --git a/buildscripts/resmokeconfig/suites/concurrency_sharded_multi_stmt_txn_with_balancer.yml b/buildscripts/resmokeconfig/suites/concurrency_sharded_multi_stmt_txn_with_balancer.yml index 6d2d02219e5..3961bc6387d 100644 --- a/buildscripts/resmokeconfig/suites/concurrency_sharded_multi_stmt_txn_with_balancer.yml +++ b/buildscripts/resmokeconfig/suites/concurrency_sharded_multi_stmt_txn_with_balancer.yml @@ -150,6 +150,7 @@ selector: executor: archive: hooks: + - RunDBCheckInBackground - CheckReplDBHashInBackground - CheckReplDBHash - ValidateCollections @@ -163,6 +164,7 @@ executor: runningWithSessions: true traceExceptions: false hooks: + - class: RunDBCheckInBackground - class: CheckReplDBHashInBackground - class: CheckReplDBHash - class: ValidateCollections @@ -181,6 +183,8 @@ executor: enableTestCommands: 1 roleGraphInvalidationIsFatal: 1 receiveChunkWaitForRangeDeleterTimeoutMS: 90000 + # TODO (SERVER-61754): dbCheck takes strong locks which causes transactions to timeout + maxTransactionLockRequestTimeoutMillis: 3000 num_rs_nodes_per_shard: 3 num_shards: 2 num_mongos: 2 diff --git a/buildscripts/resmokeconfig/suites/concurrency_sharded_replication.yml b/buildscripts/resmokeconfig/suites/concurrency_sharded_replication.yml index 53658c6b9d2..bb2a2303a8b 100644 --- a/buildscripts/resmokeconfig/suites/concurrency_sharded_replication.yml +++ b/buildscripts/resmokeconfig/suites/concurrency_sharded_replication.yml @@ -111,6 +111,7 @@ selector: executor: archive: hooks: + - RunDBCheckInBackground - CheckReplDBHashInBackground - CheckReplDBHash - ValidateCollections @@ -123,6 +124,7 @@ executor: runningWithAutoSplit: false runningWithBalancer: false hooks: + - class: RunDBCheckInBackground - class: CheckReplDBHashInBackground - class: CheckReplDBHash - class: ValidateCollections @@ -143,6 +145,8 @@ executor: enableTestCommands: 1 roleGraphInvalidationIsFatal: 1 receiveChunkWaitForRangeDeleterTimeoutMS: 90000 + # TODO (SERVER-61754): dbCheck takes strong locks which causes transactions to timeout + maxTransactionLockRequestTimeoutMillis: 3000 num_rs_nodes_per_shard: 3 num_shards: 2 num_mongos: 2 diff --git a/buildscripts/resmokelib/testing/hooks/dbcheck_background.py b/buildscripts/resmokelib/testing/hooks/dbcheck_background.py new file mode 100644 index 00000000000..6e8fc267af4 --- /dev/null +++ b/buildscripts/resmokelib/testing/hooks/dbcheck_background.py @@ -0,0 +1,73 @@ +"""Test hook for verifying data consistency across a replica set using the dbCheck command. + +This hook runs continuously in a background thread while the test is running. +""" + +import os.path + +from buildscripts.resmokelib import errors +from buildscripts.resmokelib.testing.hooks import jsfile +from buildscripts.resmokelib.testing.hooks.background_job import _BackgroundJob, _ContinuousDynamicJSTestCase + + +class RunDBCheckInBackground(jsfile.JSHook): + """A hook for running dbCheck on a replica set while a test is running.""" + + IS_BACKGROUND = True + + def __init__(self, hook_logger, fixture, shell_options=None): + """Initialize RunDBCheckInBackground.""" + description = "Runs dbCheck on a replica set while a test is running" + js_filename = os.path.join("jstests", "hooks", "run_dbcheck_background.js") + jsfile.JSHook.__init__(self, hook_logger, fixture, js_filename, description, + shell_options=shell_options) + + self._background_job = None + + def before_suite(self, test_report): + """Start the background thread.""" + self._background_job = _BackgroundJob("RunDBCheckInBackground") + self.logger.info("Starting the background check repl dbCheck thread.") + self._background_job.start() + + def after_suite(self, test_report): + """Signal the background thread to exit, and wait until it does.""" + if self._background_job is None: + return + + self.logger.info("Stopping the background check repl dbCheck thread.") + self._background_job.stop() + + def before_test(self, test, test_report): + """Instruct the background thread to run the dbCheck check while 'test' is also running.""" + if self._background_job is None: + return + + hook_test_case = _ContinuousDynamicJSTestCase.create_before_test( + test.logger, test, self, self._js_filename, self._shell_options) + hook_test_case.configure(self.fixture) + + self.logger.info("Resuming the background check repl dbCheck thread.") + self._background_job.resume(hook_test_case, test_report) + + def after_test(self, test, test_report): # noqa: D205,D400 + """Instruct the background thread to stop running the dbCheck check now that 'test' has + finished running. + """ + if self._background_job is None: + return + + self.logger.info("Pausing the background check repl dbCheck thread.") + self._background_job.pause() + + if self._background_job.exc_info is not None: + if isinstance(self._background_job.exc_info[1], errors.TestFailure): + # If the mongo shell process running the JavaScript file exited with a non-zero + # return code, then we raise an errors.ServerFailure exception to cause resmoke.py's + # test execution to stop. + raise errors.ServerFailure(self._background_job.exc_info[1].args[0]) + else: + self.logger.error( + "Encountered an error inside the background check repl dbCheck thread.", + exc_info=self._background_job.exc_info) + raise self._background_job.exc_info[1] diff --git a/jstests/hooks/run_dbcheck_background.js b/jstests/hooks/run_dbcheck_background.js new file mode 100644 index 00000000000..4abfb414f7c --- /dev/null +++ b/jstests/hooks/run_dbcheck_background.js @@ -0,0 +1,162 @@ +/** + * Runs dbCheck in background. + */ +'use strict'; + +(function() { +load('jstests/libs/discover_topology.js'); // For Topology and DiscoverTopology. +load('jstests/libs/parallelTester.js'); // For Thread. + +if (typeof db === 'undefined') { + throw new Error( + "Expected mongo shell to be connected a server, but global 'db' object isn't defined"); +} + +TestData = TestData || {}; + +const conn = db.getMongo(); +const topology = DiscoverTopology.findConnectedNodes(conn); + +function runBackgroundDbCheck(hosts) { + const quietly = (func) => { + const printOriginal = print; + try { + print = Function.prototype; + func(); + } finally { + print = printOriginal; + } + }; + + let rst; + // We construct the ReplSetTest instance with the print() function overridden to be a no-op + // in order to suppress the log messages about the replica set configuration. The + // run_dbcheck_background.js hook is executed frequently by resmoke.py and would + // otherwise lead to generating an overwhelming amount of log messages. + quietly(() => { + rst = new ReplSetTest(hosts[0]); + }); + + const dbNames = new Set(); + const primary = rst.getPrimary(); + + const version = + assert + .commandWorked(primary.adminCommand({getParameter: 1, featureCompatibilityVersion: 1})) + .featureCompatibilityVersion.version; + if (version != latestFCV) { + print("Not running dbCheck in FCV " + version); + return {ok: 1}; + } + + print("Running dbCheck for: " + rst.getURL()); + + const adminDb = primary.getDB('admin'); + let res = assert.commandWorked(adminDb.runCommand({listDatabases: 1, nameOnly: true})); + for (let dbInfo of res.databases) { + dbNames.add(dbInfo.name); + } + + // Transactions cannot be run on the following databases so we don't attempt to read at a + // clusterTime on them either. (The "local" database is also not replicated.) + // The config.transactions collection is different between primaries and secondaries. + dbNames.delete('config'); + dbNames.delete('local'); + + dbNames.forEach((dbName) => { + try { + assert.commandWorked(primary.getDB(dbName).runCommand({dbCheck: 1})); + jsTestLog("dbCheck done on database " + dbName); + } catch (e) { + if (e.code === ErrorCodes.NamespaceNotFound || e.code === ErrorCodes.LockTimeout) { + jsTestLog("Skipping dbCheck on database " + dbName + + " due to transient error: " + tojson(e)); + return; + } else { + throw e; + } + } + + const dbCheckCompleted = (db) => { + return db.currentOp().inprog.filter(x => x["desc"] == "dbCheck")[0] === undefined; + }; + + assert.soon(() => dbCheckCompleted(adminDb), + "timed out waiting for dbCheck to finish on database: " + dbName); + }); + + // Wait for all secondaries to finish applying all dbcheck batches. + rst.awaitReplication(); + + const nodes = [ + rst.getPrimary(), + ...rst.getSecondaries().filter(conn => { + return !conn.adminCommand({isMaster: 1}).arbiterOnly; + }) + ]; + nodes.forEach((node) => { + // Assert no errors (i.e., found inconsistencies). Allow warnings. + const healthlog = node.getDB('local').system.healthlog; + let errs = healthlog.find({"severity": "error"}); + if (errs.hasNext()) { + const err = "dbCheck found inconsistency on " + node.host; + jsTestLog(err + ". Errors: "); + for (let count = 0; errs.hasNext() && count < 20; count++) { + jsTestLog(tojson(errs.next())); + } + assert(false, err); + } + jsTestLog("Checked health log on " + node.host); + }); + + return {ok: 1}; +} + +if (topology.type === Topology.kReplicaSet) { + let res = runBackgroundDbCheck(topology.nodes); + assert.commandWorked(res, () => 'dbCheck replication consistency check failed: ' + tojson(res)); +} else if (topology.type === Topology.kShardedCluster) { + const threads = []; + try { + if (topology.configsvr.type === Topology.kReplicaSet) { + const thread = new Thread(runBackgroundDbCheck, topology.configsvr.nodes); + threads.push(thread); + thread.start(); + } + + for (let shardName of Object.keys(topology.shards)) { + const shard = topology.shards[shardName]; + if (shard.type === Topology.kReplicaSet) { + const thread = new Thread(runBackgroundDbCheck, shard.nodes); + threads.push(thread); + thread.start(); + } else { + throw new Error('Unrecognized topology format: ' + tojson(topology)); + } + } + } finally { + // Wait for each thread to finish. Throw an error if any thread fails. + let exception; + const returnData = threads.map(thread => { + try { + thread.join(); + return thread.returnData(); + } catch (e) { + if (!exception) { + exception = e; + } + } + }); + if (exception) { + throw exception; + } + + returnData.forEach(res => { + assert.commandWorked( + res, () => 'dbCheck replication consistency check failed: ' + tojson(res)); + }); + } +} else { + throw new Error('Unsupported topology configuration: ' + tojson(topology)); +} +})(); diff --git a/src/mongo/db/catalog/health_log.cpp b/src/mongo/db/catalog/health_log.cpp index 2703dee4aa1..aa49302f169 100644 --- a/src/mongo/db/catalog/health_log.cpp +++ b/src/mongo/db/catalog/health_log.cpp @@ -46,6 +46,7 @@ CollectionOptions getOptions(void) { CollectionOptions options; options.capped = true; options.cappedSize = kDefaultHealthlogSize; + options.setNoIdIndex(); return options; } } // namespace diff --git a/src/mongo/db/commands/dbcheck.cpp b/src/mongo/db/commands/dbcheck.cpp index e60c01f67e1..70ef441902f 100644 --- a/src/mongo/db/commands/dbcheck.cpp +++ b/src/mongo/db/commands/dbcheck.cpp @@ -113,6 +113,9 @@ std::unique_ptr<DbCheckRun> fullDatabaseRun(OperationContext* opCtx, const auto rate = invocation.getMaxCountPerSecond(); auto result = std::make_unique<DbCheckRun>(); auto perCollectionWork = [&](const CollectionPtr& coll) { + if (!coll->ns().isReplicated() || coll->isClustered()) { + return true; + } DbCheckCollectionInfo info{coll->ns(), BSONKey::min(), BSONKey::max(), max, max, rate}; result->push_back(info); return true; @@ -169,6 +172,11 @@ protected: // Every dbCheck runs in its own client. ThreadClient tc(name(), getGlobalServiceContext()); + { + stdx::lock_guard<Client> lk(*tc.get()); + tc.get()->setSystemOperationKillableByStepdown(lk); + } + for (const auto& coll : *_run) { try { _doCollection(coll); @@ -226,8 +234,34 @@ private: std::unique_ptr<HealthLogEntry> entry; if (!result.isOK()) { - entry = dbCheckErrorHealthLogEntry( - info.nss, "dbCheck batch failed", OplogEntriesEnum::Batch, result.getStatus()); + auto code = result.getStatus().code(); + if (code == ErrorCodes::LockTimeout) { + entry = dbCheckWarningHealthLogEntry( + info.nss, + "retrying dbCheck batch after timeout due to lock unavailability", + OplogEntriesEnum::Batch, + result.getStatus()); + HealthLog::get(Client::getCurrent()->getServiceContext()).log(*entry); + continue; + } + if (code == ErrorCodes::NamespaceNotFound) { + entry = dbCheckWarningHealthLogEntry( + info.nss, + "abandoning dbCheck batch because collection no longer exists", + OplogEntriesEnum::Batch, + result.getStatus()); + } else if (ErrorCodes::isA<ErrorCategory::NotPrimaryError>(code)) { + entry = dbCheckWarningHealthLogEntry( + info.nss, + "stopping dbCheck because node is no longer primary", + OplogEntriesEnum::Batch, + result.getStatus()); + } else { + entry = dbCheckErrorHealthLogEntry(info.nss, + "dbCheck batch failed", + OplogEntriesEnum::Batch, + result.getStatus()); + } HealthLog::get(Client::getCurrent()->getServiceContext()).log(*entry); return; } else { diff --git a/src/mongo/db/concurrency/deferred_writer.cpp b/src/mongo/db/concurrency/deferred_writer.cpp index 6c0f0ed3c77..40f5422eb62 100644 --- a/src/mongo/db/concurrency/deferred_writer.cpp +++ b/src/mongo/db/concurrency/deferred_writer.cpp @@ -100,7 +100,7 @@ StatusWith<std::unique_ptr<AutoGetCollection>> DeferredWriter::_getCollection( return std::move(agc); } -void DeferredWriter::_worker(InsertStatement stmt) { +void DeferredWriter::_worker(InsertStatement stmt) try { auto uniqueOpCtx = Client::getCurrent()->makeOperationContext(); OperationContext* opCtx = uniqueOpCtx.get(); auto result = _getCollection(opCtx); @@ -133,6 +133,8 @@ void DeferredWriter::_worker(InsertStatement stmt) { if (!status.isOK()) { _logFailure(status); } +} catch (const DBException& e) { + _logFailure(e.toStatus()); } DeferredWriter::DeferredWriter(NamespaceString nss, CollectionOptions opts, int64_t maxSize) diff --git a/src/mongo/db/repl/SConscript b/src/mongo/db/repl/SConscript index 0ac1a970add..eb96c043122 100644 --- a/src/mongo/db/repl/SConscript +++ b/src/mongo/db/repl/SConscript @@ -180,6 +180,7 @@ env.Library( LIBDEPS_PRIVATE=[ '$BUILD_DIR/mongo/db/catalog/collection_options', '$BUILD_DIR/mongo/db/query_exec', + '$BUILD_DIR/mongo/db/record_id_helpers', '$BUILD_DIR/mongo/util/md5', ], ) diff --git a/src/mongo/db/repl/dbcheck.cpp b/src/mongo/db/repl/dbcheck.cpp index 58baed702e4..7cca8b6a1d2 100644 --- a/src/mongo/db/repl/dbcheck.cpp +++ b/src/mongo/db/repl/dbcheck.cpp @@ -39,6 +39,7 @@ #include "mongo/db/db_raii.h" #include "mongo/db/operation_context.h" #include "mongo/db/query/internal_plans.h" +#include "mongo/db/record_id_helpers.h" #include "mongo/db/repl/dbcheck.h" #include "mongo/db/repl/dbcheck_gen.h" #include "mongo/db/repl/oplog.h" @@ -146,6 +147,17 @@ std::unique_ptr<HealthLogEntry> dbCheckErrorHealthLogEntry(const NamespaceString BSON("success" << false << "error" << err.toString())); } +std::unique_ptr<HealthLogEntry> dbCheckWarningHealthLogEntry(const NamespaceString& nss, + const std::string& msg, + OplogEntriesEnum operation, + const Status& err) { + return dbCheckHealthLogEntry(nss, + SeverityEnum::Warning, + msg, + operation, + BSON("success" << false << "error" << err.toString())); +} + /** * Get a HealthLogEntry for a dbCheck batch. */ @@ -198,7 +210,6 @@ DbCheckHasher::DbCheckHasher(OperationContext* opCtx, // Get the _id index. const IndexDescriptor* desc = collection->getIndexCatalog()->findIdIndex(opCtx); - uassert(ErrorCodes::IndexNotFound, "dbCheck needs _id index", desc); // Set up a simple index scan on that. diff --git a/src/mongo/db/repl/dbcheck.h b/src/mongo/db/repl/dbcheck.h index 6dac7630bd7..67076d07191 100644 --- a/src/mongo/db/repl/dbcheck.h +++ b/src/mongo/db/repl/dbcheck.h @@ -65,6 +65,10 @@ std::unique_ptr<HealthLogEntry> dbCheckErrorHealthLogEntry(const NamespaceString OplogEntriesEnum operation, const Status& err); +std::unique_ptr<HealthLogEntry> dbCheckWarningHealthLogEntry(const NamespaceString& nss, + const std::string& msg, + OplogEntriesEnum operation, + const Status& err); /** * Get a HealthLogEntry for a dbCheck batch. */ |