diff options
-rw-r--r-- | jstests/replsets/repl_startup_error_no_hang_on_shutdown.js | 50 | ||||
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_impl.cpp | 56 |
2 files changed, 87 insertions, 19 deletions
diff --git a/jstests/replsets/repl_startup_error_no_hang_on_shutdown.js b/jstests/replsets/repl_startup_error_no_hang_on_shutdown.js new file mode 100644 index 00000000000..61160705195 --- /dev/null +++ b/jstests/replsets/repl_startup_error_no_hang_on_shutdown.js @@ -0,0 +1,50 @@ +/** + * Tests that errors generated as part of ReplicationCoordinatorImpl startup do not cause the server + * to hang during shutdown. + * + * @tags: [requires_persistence, requires_fcv_53] + */ + +(function() { +"use strict"; + +load("jstests/libs/fail_point_util.js"); + +const name = jsTestName(); +const rst = new ReplSetTest({ + name: jsTestName(), + nodes: [{ + setParameter: { + "failpoint.throwBeforeRecoveringTenantMigrationAccessBlockers": + tojson({mode: "alwaysOn"}) + } + }], +}); +rst.startSet(); +rst.initiate(); + +let primary = rst.getPrimary(); + +jsTestLog("Done initiating set. Restarting."); + +const exitCode = MongoRunner.EXIT_ABRUPT; +let exceptionThrown = false; +try { + rst.restart(0, { + startClean: false, + }); +} catch (e) { + assert(e.message.includes("MongoDB process stopped with exit code: " + exitCode), + () => tojson(e)); + exceptionThrown = true; +} + +assert.soon( + function() { + return rawMongoProgramOutput().search(/Fatal assertion.*6111701/) >= 0; + }, + "Node should have fasserted upon encountering a fatal error during startup", + ReplSetTest.kDefaultTimeoutMS); + +assert(exceptionThrown, "Expected restart to fail."); +})(); diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp index 89fcbd5e291..36ac01c19e6 100644 --- a/src/mongo/db/repl/replication_coordinator_impl.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl.cpp @@ -150,6 +150,8 @@ MONGO_FAIL_POINT_DEFINE(skipBeforeFetchingConfig); MONGO_FAIL_POINT_DEFINE(stepdownHangAfterGrabbingRSTL); // Simulates returning a specified error in the hello response. MONGO_FAIL_POINT_DEFINE(setCustomErrorInHelloResponseMongoD); +// Throws right before the call into recoverTenantMigrationAccessBlockers. +MONGO_FAIL_POINT_DEFINE(throwBeforeRecoveringTenantMigrationAccessBlockers); // Number of times we tried to go live as a secondary. Counter64 attemptsToBecomeSecondary; @@ -521,6 +523,13 @@ bool ReplicationCoordinatorImpl::_startLoadLocalConfig( _replicationProcess->getReplicationRecovery()->recoverFromOplog(opCtx, stableTimestamp); LOGV2(4280505, "Creating any necessary TenantMigrationAccessBlockers for unfinished migrations"); + + if (MONGO_unlikely(throwBeforeRecoveringTenantMigrationAccessBlockers.shouldFail())) { + uasserted(6111700, + "Failpoint 'throwBeforeRecoveringTenantMigrationAccessBlockers' triggered. " + "Throwing exception."); + } + tenant_migration_access_blocker::recoverTenantMigrationAccessBlockers(opCtx); LOGV2(4280506, "Reconstructing prepared transactions"); reconstructPreparedTransactions(opCtx, OplogApplication::Mode::kRecovering); @@ -914,29 +923,38 @@ void ReplicationCoordinatorImpl::startup(OperationContext* opCtx, _storage->initializeStorageControlsForReplication(opCtx->getServiceContext()); - { - stdx::lock_guard<Latch> lk(_mutex); - fassert(18822, !_inShutdown); - _setConfigState_inlock(kConfigStartingUp); - _topCoord->setStorageEngineSupportsReadCommitted( - _externalState->isReadCommittedSupportedByStorageEngine(opCtx)); - } + // We are expected to be able to transition out of the kConfigStartingUp state by the end + // of this function. Any uncaught exceptions here leave us in an invalid state and we will + // not be able to shut down by normal means, as clean shutdown assumes we can leave that state. + try { + { + stdx::lock_guard<Latch> lk(_mutex); + fassert(18822, !_inShutdown); + _setConfigState_inlock(kConfigStartingUp); + _topCoord->setStorageEngineSupportsReadCommitted( + _externalState->isReadCommittedSupportedByStorageEngine(opCtx)); + } - // Initialize the cached pointer to the oplog collection. - acquireOplogCollectionForLogging(opCtx); + // Initialize the cached pointer to the oplog collection. + acquireOplogCollectionForLogging(opCtx); - _replExecutor->startup(); + _replExecutor->startup(); - LOGV2(6005300, "Starting up replica set aware services"); - ReplicaSetAwareServiceRegistry::get(_service).onStartup(opCtx); + LOGV2(6005300, "Starting up replica set aware services"); + ReplicaSetAwareServiceRegistry::get(_service).onStartup(opCtx); - bool doneLoadingConfig = _startLoadLocalConfig(opCtx, lastShutdownState); - if (doneLoadingConfig) { - // If we're not done loading the config, then the config state will be set by - // _finishLoadLocalConfig. - stdx::lock_guard<Latch> lk(_mutex); - invariant(!_rsConfig.isInitialized()); - _setConfigState_inlock(kConfigUninitialized); + bool doneLoadingConfig = _startLoadLocalConfig(opCtx, lastShutdownState); + if (doneLoadingConfig) { + // If we're not done loading the config, then the config state will be set by + // _finishLoadLocalConfig. + stdx::lock_guard<Latch> lk(_mutex); + invariant(!_rsConfig.isInitialized()); + _setConfigState_inlock(kConfigUninitialized); + } + } catch (DBException& e) { + auto status = e.toStatus(); + LOGV2_FATAL_NOTRACE( + 6111701, "Failed to load local replica set config on startup", "status"_attr = status); } } |