diff options
author | Kaloian Manassiev <kaloian.manassiev@mongodb.com> | 2018-09-24 14:31:22 -0400 |
---|---|---|
committer | Kaloian Manassiev <kaloian.manassiev@mongodb.com> | 2018-12-05 15:37:37 -0500 |
commit | 8bd1c5d455ae101d7522a5c2918738601f8c6317 (patch) | |
tree | 7776f795aaec1fd6079b561be77d65501656c312 | |
parent | 1a7d130a3adf89b98c6df92326a3174a77a9af7c (diff) | |
download | mongo-8bd1c5d455ae101d7522a5c2918738601f8c6317.tar.gz |
SERVER-30714 Handle 'not master' errors in ReplicationCoordinatorExternalStateImpl::_shardingOnTransitionToPrimaryHook
(cherry picked from commit a0ebd4bb3a30fdf574fd08ab473e7d6ce1b59619)
-rw-r--r-- | src/mongo/db/db.cpp | 4 | ||||
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_external_state_impl.cpp | 43 | ||||
-rw-r--r-- | src/mongo/db/s/sharding_state_recovery.cpp | 10 |
3 files changed, 27 insertions, 30 deletions
```diff
diff --git a/src/mongo/db/db.cpp b/src/mongo/db/db.cpp
index a7ab9db34a8..c53dfbb2e58 100644
--- a/src/mongo/db/db.cpp
+++ b/src/mongo/db/db.cpp
@@ -549,7 +549,9 @@ ExitCode _initAndListen(int listenPort) {
     if (serverGlobalParams.clusterRole == ClusterRole::ShardServer) {
         // Note: For replica sets, ShardingStateRecovery happens on transition to primary.
         if (!repl::ReplicationCoordinator::get(startupOpCtx.get())->isReplEnabled()) {
-            uassertStatusOK(ShardingStateRecovery::recover(startupOpCtx.get()));
+            if (ShardingState::get(startupOpCtx.get())->enabled()) {
+                uassertStatusOK(ShardingStateRecovery::recover(startupOpCtx.get()));
+            }
         }
     } else if (serverGlobalParams.clusterRole == ClusterRole::ConfigServer) {
         initializeGlobalShardingStateForMongoD(startupOpCtx.get(),
diff --git a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp
index ee60c992357..b1f312ab6a4 100644
--- a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp
@@ -700,7 +700,6 @@ void ReplicationCoordinatorExternalStateImpl::shardingOnStepDownHook() {
     if (serverGlobalParams.clusterRole == ClusterRole::ConfigServer) {
         Balancer::get(_service)->interruptBalancer();
     } else if (ShardingState::get(_service)->enabled()) {
-        invariant(serverGlobalParams.clusterRole == ClusterRole::ShardServer);
         ChunkSplitter::get(_service).onStepDown();
         CatalogCacheLoader::get(_service).onStepDown();
     }
@@ -719,24 +718,16 @@ void ReplicationCoordinatorExternalStateImpl::shardingOnStepDownHook() {
 
 void ReplicationCoordinatorExternalStateImpl::_shardingOnTransitionToPrimaryHook(
     OperationContext* opCtx) {
-    auto status = ShardingStateRecovery::recover(opCtx);
-
-    if (ErrorCodes::isShutdownError(status.code())) {
-        // Note: callers of this method don't expect exceptions, so throw only unexpected fatal
-        // errors.
-        return;
-    }
-
-    fassert(40107, status);
-
     if (serverGlobalParams.clusterRole == ClusterRole::ConfigServer) {
-        status = ShardingCatalogManager::get(opCtx)->initializeConfigDatabaseIfNeeded(opCtx);
+        Status status = ShardingCatalogManager::get(opCtx)->initializeConfigDatabaseIfNeeded(opCtx);
         if (!status.isOK() && status != ErrorCodes::AlreadyInitialized) {
-            if (ErrorCodes::isShutdownError(status.code())) {
-                // Don't fassert if we're mid-shutdown, let the shutdown happen gracefully.
+            // If the node is shutting down or it lost quorum just as it was becoming primary, don't
+            // run the sharding onStepUp machinery. The onStepDown counterpart to these methods is
+            // already idempotent, so the machinery will remain in the stepped down state.
+            if (ErrorCodes::isShutdownError(status.code()) ||
+                ErrorCodes::isNotMasterError(status.code())) {
                 return;
             }
-
             fassertFailedWithStatus(
                 40184,
                 status.withContext("Failed to initialize config database on config server's "
@@ -748,17 +739,16 @@ void ReplicationCoordinatorExternalStateImpl::_shardingOnTransitionToPrimaryHook
             // readConcern in drain mode because the global lock prevents replication. This is
             // safe, since if the clusterId write is rolled back, any writes that depend on it will
             // also be rolled back.
-            // Since we *just* wrote the cluster ID to the config.version document (via
-            // ShardingCatalogManager::initializeConfigDatabaseIfNeeded), this should always
-            // succeed.
+            //
+            // Since we *just* wrote the cluster ID to the config.version document (via the call to
+            // ShardingCatalogManager::initializeConfigDatabaseIfNeeded above), this read can only
+            // meaningfully fail if the node is shutting down.
             status = ClusterIdentityLoader::get(opCtx)->loadClusterId(
                 opCtx, repl::ReadConcernLevel::kLocalReadConcern);
 
             if (ErrorCodes::isShutdownError(status.code())) {
-                // Don't fassert if we're mid-shutdown, let the shutdown happen gracefully.
                 return;
             }
-
             fassert(40217, status);
         }
 
@@ -773,11 +763,20 @@ void ReplicationCoordinatorExternalStateImpl::_shardingOnTransitionToPrimaryHook
                 validator->enableKeyGenerator(opCtx, true);
             }
         }
     } else if (ShardingState::get(opCtx)->enabled()) {
-        invariant(serverGlobalParams.clusterRole == ClusterRole::ShardServer);
+        Status status = ShardingStateRecovery::recover(opCtx);
+
+        // If the node is shutting down or it lost quorum just as it was becoming primary, don't run
+        // the sharding onStepUp machinery. The onStepDown counterpart to these methods is already
+        // idempotent, so the machinery will remain in the stepped down state.
+        if (ErrorCodes::isShutdownError(status.code()) ||
+            ErrorCodes::isNotMasterError(status.code())) {
+            return;
+        }
+        fassert(40107, status);
         const auto configsvrConnStr =
             Grid::get(opCtx)->shardRegistry()->getConfigShard()->getConnString();
-        auto status = ShardingInitializationMongoD::get(opCtx)->updateShardIdentityConfigString(
+        status = ShardingInitializationMongoD::get(opCtx)->updateShardIdentityConfigString(
             opCtx, configsvrConnStr);
         if (!status.isOK()) {
             warning() << "error encountered while trying to update config connection string to "
diff --git a/src/mongo/db/s/sharding_state_recovery.cpp b/src/mongo/db/s/sharding_state_recovery.cpp
index f1e9cb5ae83..65274d025a4 100644
--- a/src/mongo/db/s/sharding_state_recovery.cpp
+++ b/src/mongo/db/s/sharding_state_recovery.cpp
@@ -220,9 +220,9 @@ void ShardingStateRecovery::endMetadataOp(OperationContext* opCtx) {
 }
 
 Status ShardingStateRecovery::recover(OperationContext* opCtx) {
-    if (serverGlobalParams.clusterRole != ClusterRole::ShardServer) {
-        return Status::OK();
-    }
+    Grid* const grid = Grid::get(opCtx);
+    ShardingState* const shardingState = ShardingState::get(opCtx);
+    invariant(shardingState->enabled());
 
     BSONObj recoveryDocBSON;
 
@@ -244,10 +244,6 @@ Status ShardingStateRecovery::recover(OperationContext* opCtx) {
     log() << "Sharding state recovery process found document "
           << redact(recoveryDoc.toBSON());
 
-    Grid* const grid = Grid::get(opCtx);
-    ShardingState* const shardingState = ShardingState::get(opCtx);
-    invariant(shardingState->enabled());
-
     if (!recoveryDoc.getMinOpTimeUpdaters()) {
         // Treat the minOpTime as up-to-date
         grid->advanceConfigOpTime(recoveryDoc.getMinOpTime());
```