-rw-r--r--  buildscripts/resmokeconfig/suites/concurrency_replication.yml                          |   5
-rw-r--r--  buildscripts/resmokeconfig/suites/concurrency_sharded_multi_stmt_txn_with_balancer.yml |   4
-rw-r--r--  buildscripts/resmokeconfig/suites/concurrency_sharded_replication.yml                  |   4
-rw-r--r--  buildscripts/resmokelib/testing/hooks/dbcheck_background.py                            |  73
-rw-r--r--  jstests/hooks/run_dbcheck_background.js                                                 | 162
-rw-r--r--  src/mongo/db/catalog/health_log.cpp                                                     |   1
-rw-r--r--  src/mongo/db/commands/dbcheck.cpp                                                       |  38
-rw-r--r--  src/mongo/db/concurrency/deferred_writer.cpp                                            |   4
-rw-r--r--  src/mongo/db/repl/SConscript                                                            |   1
-rw-r--r--  src/mongo/db/repl/dbcheck.cpp                                                           |  13
-rw-r--r--  src/mongo/db/repl/dbcheck.h                                                             |   4

11 files changed, 305 insertions(+), 4 deletions(-)
diff --git a/buildscripts/resmokeconfig/suites/concurrency_replication.yml b/buildscripts/resmokeconfig/suites/concurrency_replication.yml
index 286c35413a1..ef63c635757 100644
--- a/buildscripts/resmokeconfig/suites/concurrency_replication.yml
+++ b/buildscripts/resmokeconfig/suites/concurrency_replication.yml
@@ -26,6 +26,7 @@ selector:
 executor:
   archive:
     hooks:
+      - RunDBCheckInBackground
       - CheckReplDBHashInBackground
       - ValidateCollectionsInBackground
       - CheckReplDBHash
@@ -40,6 +41,7 @@ executor:
     # validating the entire contents of the collection.
     #
     # TODO SERVER-26466: Add CheckReplOplogs hook to the concurrency suite.
+    - class: RunDBCheckInBackground
     - class: CheckReplDBHashInBackground
     - class: ValidateCollectionsInBackground
     - class: CheckReplDBHash
@@ -52,4 +54,7 @@ executor:
       set_parameters:
         enableTestCommands: 1
         roleGraphInvalidationIsFatal: 1
+        # TODO (SERVER-61754): dbCheck takes strong locks, which can cause transactions to time out.
+        maxTransactionLockRequestTimeoutMillis: 3000
+
     num_nodes: 3
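
The maxTransactionLockRequestTimeoutMillis value wired into the suites here can also be applied to a live node, which is useful for reproducing the dbCheck/transaction lock interaction outside of resmoke.py. A minimal mongo-shell sketch; the 3000ms value simply mirrors the suite configuration above and is illustrative, not part of the change:

    // Raise the time a multi-document transaction waits to acquire collection
    // locks, so it can outlast dbCheck's strong locks instead of failing
    // immediately with LockTimeout.
    const admin = db.getSiblingDB("admin");
    assert.commandWorked(
        admin.runCommand({setParameter: 1, maxTransactionLockRequestTimeoutMillis: 3000}));
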
diff --git a/buildscripts/resmokeconfig/suites/concurrency_sharded_multi_stmt_txn_with_balancer.yml b/buildscripts/resmokeconfig/suites/concurrency_sharded_multi_stmt_txn_with_balancer.yml
index 6d2d02219e5..3961bc6387d 100644
--- a/buildscripts/resmokeconfig/suites/concurrency_sharded_multi_stmt_txn_with_balancer.yml
+++ b/buildscripts/resmokeconfig/suites/concurrency_sharded_multi_stmt_txn_with_balancer.yml
@@ -150,6 +150,7 @@ selector:
 executor:
   archive:
     hooks:
+      - RunDBCheckInBackground
       - CheckReplDBHashInBackground
       - CheckReplDBHash
       - ValidateCollections
@@ -163,6 +164,7 @@ executor:
           runningWithSessions: true
           traceExceptions: false
   hooks:
+    - class: RunDBCheckInBackground
     - class: CheckReplDBHashInBackground
     - class: CheckReplDBHash
     - class: ValidateCollections
@@ -181,6 +183,8 @@ executor:
         enableTestCommands: 1
         roleGraphInvalidationIsFatal: 1
         receiveChunkWaitForRangeDeleterTimeoutMS: 90000
+        # TODO (SERVER-61754): dbCheck takes strong locks, which can cause transactions to time out.
+        maxTransactionLockRequestTimeoutMillis: 3000
     num_rs_nodes_per_shard: 3
     num_shards: 2
     num_mongos: 2
diff --git a/buildscripts/resmokeconfig/suites/concurrency_sharded_replication.yml b/buildscripts/resmokeconfig/suites/concurrency_sharded_replication.yml
index 53658c6b9d2..bb2a2303a8b 100644
--- a/buildscripts/resmokeconfig/suites/concurrency_sharded_replication.yml
+++ b/buildscripts/resmokeconfig/suites/concurrency_sharded_replication.yml
@@ -111,6 +111,7 @@ selector:
 executor:
   archive:
     hooks:
+      - RunDBCheckInBackground
       - CheckReplDBHashInBackground
       - CheckReplDBHash
       - ValidateCollections
@@ -123,6 +124,7 @@ executor:
           runningWithAutoSplit: false
           runningWithBalancer: false
   hooks:
+    - class: RunDBCheckInBackground
     - class: CheckReplDBHashInBackground
     - class: CheckReplDBHash
     - class: ValidateCollections
@@ -143,6 +145,8 @@ executor:
         enableTestCommands: 1
         roleGraphInvalidationIsFatal: 1
         receiveChunkWaitForRangeDeleterTimeoutMS: 90000
+        # TODO (SERVER-61754): dbCheck takes strong locks, which can cause transactions to time out.
+        maxTransactionLockRequestTimeoutMillis: 3000
     num_rs_nodes_per_shard: 3
     num_shards: 2
     num_mongos: 2
diff --git a/buildscripts/resmokelib/testing/hooks/dbcheck_background.py b/buildscripts/resmokelib/testing/hooks/dbcheck_background.py
new file mode 100644
index 00000000000..6e8fc267af4
--- /dev/null
+++ b/buildscripts/resmokelib/testing/hooks/dbcheck_background.py
@@ -0,0 +1,73 @@
+"""Test hook for verifying data consistency across a replica set using the dbCheck command.
+
+This hook runs continuously in a background thread while the test is running.
+"""
+
+import os.path
+
+from buildscripts.resmokelib import errors
+from buildscripts.resmokelib.testing.hooks import jsfile
+from buildscripts.resmokelib.testing.hooks.background_job import _BackgroundJob, _ContinuousDynamicJSTestCase
+
+
+class RunDBCheckInBackground(jsfile.JSHook):
+ """A hook for running dbCheck on a replica set while a test is running."""
+
+ IS_BACKGROUND = True
+
+ def __init__(self, hook_logger, fixture, shell_options=None):
+ """Initialize RunDBCheckInBackground."""
+ description = "Runs dbCheck on a replica set while a test is running"
+ js_filename = os.path.join("jstests", "hooks", "run_dbcheck_background.js")
+ jsfile.JSHook.__init__(self, hook_logger, fixture, js_filename, description,
+ shell_options=shell_options)
+
+ self._background_job = None
+
+ def before_suite(self, test_report):
+ """Start the background thread."""
+ self._background_job = _BackgroundJob("RunDBCheckInBackground")
+ self.logger.info("Starting the background check repl dbCheck thread.")
+ self._background_job.start()
+
+ def after_suite(self, test_report):
+ """Signal the background thread to exit, and wait until it does."""
+ if self._background_job is None:
+ return
+
+ self.logger.info("Stopping the background check repl dbCheck thread.")
+ self._background_job.stop()
+
+ def before_test(self, test, test_report):
+ """Instruct the background thread to run the dbCheck check while 'test' is also running."""
+ if self._background_job is None:
+ return
+
+ hook_test_case = _ContinuousDynamicJSTestCase.create_before_test(
+ test.logger, test, self, self._js_filename, self._shell_options)
+ hook_test_case.configure(self.fixture)
+
+ self.logger.info("Resuming the background check repl dbCheck thread.")
+ self._background_job.resume(hook_test_case, test_report)
+
+ def after_test(self, test, test_report): # noqa: D205,D400
+ """Instruct the background thread to stop running the dbCheck check now that 'test' has
+ finished running.
+ """
+ if self._background_job is None:
+ return
+
+ self.logger.info("Pausing the background check repl dbCheck thread.")
+ self._background_job.pause()
+
+ if self._background_job.exc_info is not None:
+ if isinstance(self._background_job.exc_info[1], errors.TestFailure):
+ # If the mongo shell process running the JavaScript file exited with a non-zero
+ # return code, then we raise an errors.ServerFailure exception to cause resmoke.py's
+ # test execution to stop.
+ raise errors.ServerFailure(self._background_job.exc_info[1].args[0])
+ else:
+ self.logger.error(
+ "Encountered an error inside the background check repl dbCheck thread.",
+ exc_info=self._background_job.exc_info)
+ raise self._background_job.exc_info[1]
diff --git a/jstests/hooks/run_dbcheck_background.js b/jstests/hooks/run_dbcheck_background.js
new file mode 100644
index 00000000000..4abfb414f7c
--- /dev/null
+++ b/jstests/hooks/run_dbcheck_background.js
@@ -0,0 +1,162 @@
+/**
+ * Runs dbCheck in the background while other tests are running.
+ */
+'use strict';
+
+(function() {
+load('jstests/libs/discover_topology.js');  // For Topology and DiscoverTopology.
+load('jstests/libs/parallelTester.js');     // For Thread.
+
+if (typeof db === 'undefined') {
+    throw new Error(
+        "Expected mongo shell to be connected to a server, but global 'db' object isn't defined");
+}
+
+TestData = TestData || {};
+
+const conn = db.getMongo();
+const topology = DiscoverTopology.findConnectedNodes(conn);
+
+function runBackgroundDbCheck(hosts) {
+    const quietly = (func) => {
+        const printOriginal = print;
+        try {
+            print = Function.prototype;
+            func();
+        } finally {
+            print = printOriginal;
+        }
+    };
+
+    let rst;
+    // We construct the ReplSetTest instance with the print() function overridden to be a no-op
+    // in order to suppress the log messages about the replica set configuration. The
+    // run_dbcheck_background.js hook is executed frequently by resmoke.py and would
+    // otherwise generate an overwhelming amount of log messages.
+    quietly(() => {
+        rst = new ReplSetTest(hosts[0]);
+    });
+
+    const dbNames = new Set();
+    const primary = rst.getPrimary();
+
+    const version =
+        assert
+            .commandWorked(primary.adminCommand({getParameter: 1, featureCompatibilityVersion: 1}))
+            .featureCompatibilityVersion.version;
+    if (version != latestFCV) {
+        print("Not running dbCheck in FCV " + version);
+        return {ok: 1};
+    }
+
+    print("Running dbCheck for: " + rst.getURL());
+
+    const adminDb = primary.getDB('admin');
+    let res = assert.commandWorked(adminDb.runCommand({listDatabases: 1, nameOnly: true}));
+    for (let dbInfo of res.databases) {
+        dbNames.add(dbInfo.name);
+    }
+
+    // Skip the "config" database, whose config.transactions collection legitimately differs
+    // between primaries and secondaries, and the "local" database, which is not replicated,
+    // since dbCheck cannot produce meaningful results for either.
+    dbNames.delete('config');
+    dbNames.delete('local');
+
+    dbNames.forEach((dbName) => {
+        try {
+            assert.commandWorked(primary.getDB(dbName).runCommand({dbCheck: 1}));
+            jsTestLog("dbCheck done on database " + dbName);
+        } catch (e) {
+            if (e.code === ErrorCodes.NamespaceNotFound || e.code === ErrorCodes.LockTimeout) {
+                jsTestLog("Skipping dbCheck on database " + dbName +
+                          " due to transient error: " + tojson(e));
+                return;
+            } else {
+                throw e;
+            }
+        }
+
+        const dbCheckCompleted = (db) => {
+            return db.currentOp().inprog.filter(x => x["desc"] == "dbCheck")[0] === undefined;
+        };
+
+        assert.soon(() => dbCheckCompleted(adminDb),
+                    "timed out waiting for dbCheck to finish on database: " + dbName);
+    });
+
+    // Wait for all secondaries to finish applying all dbCheck batches.
+    rst.awaitReplication();
+
+    const nodes = [
+        rst.getPrimary(),
+        ...rst.getSecondaries().filter(conn => {
+            return !conn.adminCommand({isMaster: 1}).arbiterOnly;
+        })
+    ];
+    nodes.forEach((node) => {
+        // Assert that no errors (i.e., inconsistencies) were found; warnings are allowed.
+        const healthlog = node.getDB('local').system.healthlog;
+        let errs = healthlog.find({"severity": "error"});
+        if (errs.hasNext()) {
+            const err = "dbCheck found inconsistency on " + node.host;
+            jsTestLog(err + ". Errors: ");
+            for (let count = 0; errs.hasNext() && count < 20; count++) {
+                jsTestLog(tojson(errs.next()));
+            }
+            assert(false, err);
+        }
+        jsTestLog("Checked health log on " + node.host);
+    });
+
+    return {ok: 1};
+}
+
+if (topology.type === Topology.kReplicaSet) {
+    let res = runBackgroundDbCheck(topology.nodes);
+    assert.commandWorked(res, () => 'dbCheck replication consistency check failed: ' + tojson(res));
+} else if (topology.type === Topology.kShardedCluster) {
+    const threads = [];
+    try {
+        if (topology.configsvr.type === Topology.kReplicaSet) {
+            const thread = new Thread(runBackgroundDbCheck, topology.configsvr.nodes);
+            threads.push(thread);
+            thread.start();
+        }
+
+        for (let shardName of Object.keys(topology.shards)) {
+            const shard = topology.shards[shardName];
+            if (shard.type === Topology.kReplicaSet) {
+                const thread = new Thread(runBackgroundDbCheck, shard.nodes);
+                threads.push(thread);
+                thread.start();
+            } else {
+                throw new Error('Unrecognized topology format: ' + tojson(topology));
+            }
+        }
+    } finally {
+        // Wait for each thread to finish. Throw an error if any thread fails.
+        let exception;
+        const returnData = threads.map(thread => {
+            try {
+                thread.join();
+                return thread.returnData();
+            } catch (e) {
+                if (!exception) {
+                    exception = e;
+                }
+            }
+        });
+        if (exception) {
+            throw exception;
+        }
+
+        returnData.forEach(res => {
+            assert.commandWorked(
+                res, () => 'dbCheck replication consistency check failed: ' + tojson(res));
+        });
+    }
+} else {
+    throw new Error('Unsupported topology configuration: ' + tojson(topology));
+}
+})();
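
For interactive debugging, the core of the hook can be replayed by hand against a single replica set. A minimal sketch assuming a shell connected to the primary and an illustrative database name of "test"; it uses only commands that appear in the hook above:

    // Kick off dbCheck on one database, then wait for the background checker to
    // drain, mirroring the per-database loop in runBackgroundDbCheck().
    assert.commandWorked(db.getSiblingDB("test").runCommand({dbCheck: 1}));
    assert.soon(() => {
        // dbCheck runs asynchronously; it is done once no operation whose
        // desc is "dbCheck" remains in the currentOp() output.
        return db.getSiblingDB("admin").currentOp().inprog.filter(
                   op => op.desc === "dbCheck").length === 0;
    }, "timed out waiting for dbCheck to finish");
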
diff --git a/src/mongo/db/catalog/health_log.cpp b/src/mongo/db/catalog/health_log.cpp
index 2703dee4aa1..aa49302f169 100644
--- a/src/mongo/db/catalog/health_log.cpp
+++ b/src/mongo/db/catalog/health_log.cpp
@@ -46,6 +46,7 @@ CollectionOptions getOptions(void) {
     CollectionOptions options;
     options.capped = true;
     options.cappedSize = kDefaultHealthlogSize;
+    options.setNoIdIndex();
     return options;
 }
 }  // namespace
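
setNoIdIndex() creates the health log without an _id index. The collection is a capped, node-local diagnostic log in the "local" database, so it is never replicated and, with the isReplicated() filter added to dbcheck.cpp below, never scanned by dbCheck itself; dropping the index presumably just removes write overhead on a high-volume log. The effect can be observed from the shell, as in this sketch:

    // On a build with this change, a freshly created health log should report
    // no indexes at all; older builds would list an "_id_" index here.
    printjson(db.getSiblingDB("local").system.healthlog.getIndexes());
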
diff --git a/src/mongo/db/commands/dbcheck.cpp b/src/mongo/db/commands/dbcheck.cpp
index e60c01f67e1..70ef441902f 100644
--- a/src/mongo/db/commands/dbcheck.cpp
+++ b/src/mongo/db/commands/dbcheck.cpp
@@ -113,6 +113,9 @@ std::unique_ptr<DbCheckRun> fullDatabaseRun(OperationContext* opCtx,
     const auto rate = invocation.getMaxCountPerSecond();
     auto result = std::make_unique<DbCheckRun>();
     auto perCollectionWork = [&](const CollectionPtr& coll) {
+        if (!coll->ns().isReplicated() || coll->isClustered()) {
+            return true;
+        }
         DbCheckCollectionInfo info{coll->ns(), BSONKey::min(), BSONKey::max(), max, max, rate};
         result->push_back(info);
         return true;
@@ -169,6 +172,11 @@ protected:
         // Every dbCheck runs in its own client.
         ThreadClient tc(name(), getGlobalServiceContext());

+        {
+            stdx::lock_guard<Client> lk(*tc.get());
+            tc.get()->setSystemOperationKillableByStepdown(lk);
+        }
+
         for (const auto& coll : *_run) {
             try {
                 _doCollection(coll);
@@ -226,8 +234,34 @@ private:
             std::unique_ptr<HealthLogEntry> entry;

             if (!result.isOK()) {
-                entry = dbCheckErrorHealthLogEntry(
-                    info.nss, "dbCheck batch failed", OplogEntriesEnum::Batch, result.getStatus());
+                auto code = result.getStatus().code();
+                if (code == ErrorCodes::LockTimeout) {
+                    entry = dbCheckWarningHealthLogEntry(
+                        info.nss,
+                        "retrying dbCheck batch after timeout due to lock unavailability",
+                        OplogEntriesEnum::Batch,
+                        result.getStatus());
+                    HealthLog::get(Client::getCurrent()->getServiceContext()).log(*entry);
+                    continue;
+                }
+                if (code == ErrorCodes::NamespaceNotFound) {
+                    entry = dbCheckWarningHealthLogEntry(
+                        info.nss,
+                        "abandoning dbCheck batch because collection no longer exists",
+                        OplogEntriesEnum::Batch,
+                        result.getStatus());
+                } else if (ErrorCodes::isA<ErrorCategory::NotPrimaryError>(code)) {
+                    entry = dbCheckWarningHealthLogEntry(
+                        info.nss,
+                        "stopping dbCheck because node is no longer primary",
+                        OplogEntriesEnum::Batch,
+                        result.getStatus());
+                } else {
+                    entry = dbCheckErrorHealthLogEntry(info.nss,
+                                                       "dbCheck batch failed",
+                                                       OplogEntriesEnum::Batch,
+                                                       result.getStatus());
+                }
                 HealthLog::get(Client::getCurrent()->getServiceContext()).log(*entry);
                 return;
             } else {
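
With this triage in place, transient conditions (lock timeouts that get retried, collections dropped mid-run, stepdowns) are logged at severity "warning" rather than "error", which is what lets run_dbcheck_background.js above fail only on genuine inconsistencies. A query sketch for inspecting those entries on one node; per dbCheckWarningHealthLogEntry() below, the failing status text should land under the entry's data.error field (an assumption based on the BSON built there):

    // List the dbCheck warnings recorded in this node's local health log.
    db.getSiblingDB("local")
        .system.healthlog.find({severity: "warning"})
        .forEach(printjson);
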
diff --git a/src/mongo/db/concurrency/deferred_writer.cpp b/src/mongo/db/concurrency/deferred_writer.cpp
index 6c0f0ed3c77..40f5422eb62 100644
--- a/src/mongo/db/concurrency/deferred_writer.cpp
+++ b/src/mongo/db/concurrency/deferred_writer.cpp
@@ -100,7 +100,7 @@ StatusWith<std::unique_ptr<AutoGetCollection>> DeferredWriter::_getCollection(
     return std::move(agc);
 }

-void DeferredWriter::_worker(InsertStatement stmt) {
+void DeferredWriter::_worker(InsertStatement stmt) try {
     auto uniqueOpCtx = Client::getCurrent()->makeOperationContext();
     OperationContext* opCtx = uniqueOpCtx.get();
     auto result = _getCollection(opCtx);
@@ -133,6 +133,8 @@ void DeferredWriter::_worker(InsertStatement stmt) {
     if (!status.isOK()) {
         _logFailure(status);
     }
+} catch (const DBException& e) {
+    _logFailure(e.toStatus());
 }

 DeferredWriter::DeferredWriter(NamespaceString nss, CollectionOptions opts, int64_t maxSize)
diff --git a/src/mongo/db/repl/SConscript b/src/mongo/db/repl/SConscript
index 0ac1a970add..eb96c043122 100644
--- a/src/mongo/db/repl/SConscript
+++ b/src/mongo/db/repl/SConscript
@@ -180,6 +180,7 @@ env.Library(
     LIBDEPS_PRIVATE=[
         '$BUILD_DIR/mongo/db/catalog/collection_options',
         '$BUILD_DIR/mongo/db/query_exec',
+        '$BUILD_DIR/mongo/db/record_id_helpers',
         '$BUILD_DIR/mongo/util/md5',
     ],
 )
diff --git a/src/mongo/db/repl/dbcheck.cpp b/src/mongo/db/repl/dbcheck.cpp
index 58baed702e4..7cca8b6a1d2 100644
--- a/src/mongo/db/repl/dbcheck.cpp
+++ b/src/mongo/db/repl/dbcheck.cpp
@@ -39,6 +39,7 @@
#include "mongo/db/db_raii.h"
#include "mongo/db/operation_context.h"
#include "mongo/db/query/internal_plans.h"
+#include "mongo/db/record_id_helpers.h"
#include "mongo/db/repl/dbcheck.h"
#include "mongo/db/repl/dbcheck_gen.h"
#include "mongo/db/repl/oplog.h"
@@ -146,6 +147,17 @@ std::unique_ptr<HealthLogEntry> dbCheckErrorHealthLogEntry(const NamespaceString
BSON("success" << false << "error" << err.toString()));
}
+std::unique_ptr<HealthLogEntry> dbCheckWarningHealthLogEntry(const NamespaceString& nss,
+ const std::string& msg,
+ OplogEntriesEnum operation,
+ const Status& err) {
+ return dbCheckHealthLogEntry(nss,
+ SeverityEnum::Warning,
+ msg,
+ operation,
+ BSON("success" << false << "error" << err.toString()));
+}
+
/**
* Get a HealthLogEntry for a dbCheck batch.
*/
@@ -198,7 +210,6 @@ DbCheckHasher::DbCheckHasher(OperationContext* opCtx,
// Get the _id index.
const IndexDescriptor* desc = collection->getIndexCatalog()->findIdIndex(opCtx);
-
uassert(ErrorCodes::IndexNotFound, "dbCheck needs _id index", desc);
// Set up a simple index scan on that.
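
The uassert above is also why fullDatabaseRun() now skips clustered collections: a clustered collection keeps its documents ordered by _id in the record store itself and has no separate _id index for DbCheckHasher to scan, so findIdIndex() would come back empty. A sketch of creating such a collection, assuming a server build that supports user-created clustered collections; the collection name is illustrative:

    // Documents live in the clustered record store keyed by _id; no standalone
    // _id index table exists, so dbCheck's full-database run now skips the
    // collection instead of failing with IndexNotFound.
    assert.commandWorked(db.createCollection(
        "clusteredExample", {clusteredIndex: {key: {_id: 1}, unique: true}}));
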
diff --git a/src/mongo/db/repl/dbcheck.h b/src/mongo/db/repl/dbcheck.h
index 6dac7630bd7..67076d07191 100644
--- a/src/mongo/db/repl/dbcheck.h
+++ b/src/mongo/db/repl/dbcheck.h
@@ -65,6 +65,10 @@ std::unique_ptr<HealthLogEntry> dbCheckErrorHealthLogEntry(const NamespaceString
                                                            OplogEntriesEnum operation,
                                                            const Status& err);

+std::unique_ptr<HealthLogEntry> dbCheckWarningHealthLogEntry(const NamespaceString& nss,
+                                                             const std::string& msg,
+                                                             OplogEntriesEnum operation,
+                                                             const Status& err);
+
 /**
  * Get a HealthLogEntry for a dbCheck batch.
  */