summary | refs | log | tree | commit | diff
diff options (cgit diff-view controls):
context:
space:
mode:
 jstests/replsets/repl_startup_error_no_hang_on_shutdown.js (new, -rw-r--r--) | 50 ++++++++++++++++++++
 src/mongo/db/repl/replication_coordinator_impl.cpp (-rw-r--r--)             | 56 +++++++++++++-------
 2 files changed, 87 insertions(+), 19 deletions(-)
diff --git a/jstests/replsets/repl_startup_error_no_hang_on_shutdown.js b/jstests/replsets/repl_startup_error_no_hang_on_shutdown.js
new file mode 100644
index 00000000000..61160705195
--- /dev/null
+++ b/jstests/replsets/repl_startup_error_no_hang_on_shutdown.js
@@ -0,0 +1,50 @@
+/**
+ * Tests that errors generated as part of ReplicationCoordinatorImpl startup do not cause the server
+ * to hang during shutdown.
+ *
+ * @tags: [requires_persistence, requires_fcv_53]
+ */
+
+(function() {
+"use strict";
+
+load("jstests/libs/fail_point_util.js");
+
+const name = jsTestName();
+const rst = new ReplSetTest({
+ name: jsTestName(),
+ nodes: [{
+ setParameter: {
+ "failpoint.throwBeforeRecoveringTenantMigrationAccessBlockers":
+ tojson({mode: "alwaysOn"})
+ }
+ }],
+});
+rst.startSet();
+rst.initiate();
+
+let primary = rst.getPrimary();
+
+jsTestLog("Done initiating set. Restarting.");
+
+const exitCode = MongoRunner.EXIT_ABRUPT;
+let exceptionThrown = false;
+try {
+ rst.restart(0, {
+ startClean: false,
+ });
+} catch (e) {
+ assert(e.message.includes("MongoDB process stopped with exit code: " + exitCode),
+ () => tojson(e));
+ exceptionThrown = true;
+}
+
+assert.soon(
+ function() {
+ return rawMongoProgramOutput().search(/Fatal assertion.*6111701/) >= 0;
+ },
+ "Node should have fasserted upon encountering a fatal error during startup",
+ ReplSetTest.kDefaultTimeoutMS);
+
+assert(exceptionThrown, "Expected restart to fail.");
+})();
diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp
index 89fcbd5e291..36ac01c19e6 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl.cpp
@@ -150,6 +150,8 @@ MONGO_FAIL_POINT_DEFINE(skipBeforeFetchingConfig);
MONGO_FAIL_POINT_DEFINE(stepdownHangAfterGrabbingRSTL);
// Simulates returning a specified error in the hello response.
MONGO_FAIL_POINT_DEFINE(setCustomErrorInHelloResponseMongoD);
+// Throws right before the call into recoverTenantMigrationAccessBlockers.
+MONGO_FAIL_POINT_DEFINE(throwBeforeRecoveringTenantMigrationAccessBlockers);
// Number of times we tried to go live as a secondary.
Counter64 attemptsToBecomeSecondary;
@@ -521,6 +523,13 @@ bool ReplicationCoordinatorImpl::_startLoadLocalConfig(
_replicationProcess->getReplicationRecovery()->recoverFromOplog(opCtx, stableTimestamp);
LOGV2(4280505,
"Creating any necessary TenantMigrationAccessBlockers for unfinished migrations");
+
+ if (MONGO_unlikely(throwBeforeRecoveringTenantMigrationAccessBlockers.shouldFail())) {
+ uasserted(6111700,
+ "Failpoint 'throwBeforeRecoveringTenantMigrationAccessBlockers' triggered. "
+ "Throwing exception.");
+ }
+
tenant_migration_access_blocker::recoverTenantMigrationAccessBlockers(opCtx);
LOGV2(4280506, "Reconstructing prepared transactions");
reconstructPreparedTransactions(opCtx, OplogApplication::Mode::kRecovering);
@@ -914,29 +923,38 @@ void ReplicationCoordinatorImpl::startup(OperationContext* opCtx,
_storage->initializeStorageControlsForReplication(opCtx->getServiceContext());
- {
- stdx::lock_guard<Latch> lk(_mutex);
- fassert(18822, !_inShutdown);
- _setConfigState_inlock(kConfigStartingUp);
- _topCoord->setStorageEngineSupportsReadCommitted(
- _externalState->isReadCommittedSupportedByStorageEngine(opCtx));
- }
+ // We are expected to be able to transition out of the kConfigStartingUp state by the end
+ // of this function. Any uncaught exceptions here leave us in an invalid state and we will
+ // not be able to shut down by normal means, as clean shutdown assumes we can leave that state.
+ try {
+ {
+ stdx::lock_guard<Latch> lk(_mutex);
+ fassert(18822, !_inShutdown);
+ _setConfigState_inlock(kConfigStartingUp);
+ _topCoord->setStorageEngineSupportsReadCommitted(
+ _externalState->isReadCommittedSupportedByStorageEngine(opCtx));
+ }
- // Initialize the cached pointer to the oplog collection.
- acquireOplogCollectionForLogging(opCtx);
+ // Initialize the cached pointer to the oplog collection.
+ acquireOplogCollectionForLogging(opCtx);
- _replExecutor->startup();
+ _replExecutor->startup();
- LOGV2(6005300, "Starting up replica set aware services");
- ReplicaSetAwareServiceRegistry::get(_service).onStartup(opCtx);
+ LOGV2(6005300, "Starting up replica set aware services");
+ ReplicaSetAwareServiceRegistry::get(_service).onStartup(opCtx);
- bool doneLoadingConfig = _startLoadLocalConfig(opCtx, lastShutdownState);
- if (doneLoadingConfig) {
- // If we're not done loading the config, then the config state will be set by
- // _finishLoadLocalConfig.
- stdx::lock_guard<Latch> lk(_mutex);
- invariant(!_rsConfig.isInitialized());
- _setConfigState_inlock(kConfigUninitialized);
+ bool doneLoadingConfig = _startLoadLocalConfig(opCtx, lastShutdownState);
+ if (doneLoadingConfig) {
+ // If we're not done loading the config, then the config state will be set by
+ // _finishLoadLocalConfig.
+ stdx::lock_guard<Latch> lk(_mutex);
+ invariant(!_rsConfig.isInitialized());
+ _setConfigState_inlock(kConfigUninitialized);
+ }
+ } catch (DBException& e) {
+ auto status = e.toStatus();
+ LOGV2_FATAL_NOTRACE(
+ 6111701, "Failed to load local replica set config on startup", "status"_attr = status);
}
}