author     Kaloian Manassiev <kaloian.manassiev@mongodb.com>  2018-09-24 14:31:22 -0400
committer  Kaloian Manassiev <kaloian.manassiev@mongodb.com>  2018-09-26 02:47:40 -0400
commit     a0ebd4bb3a30fdf574fd08ab473e7d6ce1b59619 (patch)
tree       ba437e69a942aa896bf856c02bb7c51eac80e407 /src/mongo/db
parent     dbc88bbcf51c3adc2f2a0ad3e4c22b98a2dc9388 (diff)
download   mongo-a0ebd4bb3a30fdf574fd08ab473e7d6ce1b59619.tar.gz
SERVER-30714 Handle 'not master' errors in ReplicationCoordinatorExternalStateImpl::_shardingOnTransitionToPrimaryHook
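In outline, the patch moves ShardingStateRecovery::recover() out of the common prologue of _shardingOnTransitionToPrimaryHook and into the shard-server branch, and widens the early-return test from shutdown errors alone to shutdown and 'not master' errors. The following is a condensed, illustrative sketch of the resulting step-up control flow, not the full hook; every identifier is taken from the patch below, and the remaining step-up work is elided:

    // Condensed sketch of the step-up hook's new error handling. If step-up
    // is interrupted by shutdown or by losing primacy, return early instead
    // of fassert()-ing: the onStepDown hooks are idempotent, so the node
    // simply remains in the stepped down state.
    void _shardingOnTransitionToPrimaryHook(OperationContext* opCtx) {
        if (ShardingState::get(opCtx)->enabled()) {
            Status status = ShardingStateRecovery::recover(opCtx);
            if (ErrorCodes::isShutdownError(status.code()) ||
                ErrorCodes::isNotMasterError(status.code())) {
                return;  // Benign interruption; stay stepped down.
            }
            fassert(40107, status);  // Any other failure is fatal.
            // ... rest of the shard-server step-up work ...
        }
    }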
Diffstat (limited to 'src/mongo/db')
-rw-r--r--  src/mongo/db/db.cpp                                                |  4
-rw-r--r--  src/mongo/db/repl/replication_coordinator_external_state_impl.cpp  | 43
-rw-r--r--  src/mongo/db/s/sharding_state_recovery.cpp                         | 10
3 files changed, 27 insertions, 30 deletions
diff --git a/src/mongo/db/db.cpp b/src/mongo/db/db.cpp
index f92ee6409e9..0b9c8105759 100644
--- a/src/mongo/db/db.cpp
+++ b/src/mongo/db/db.cpp
@@ -549,7 +549,9 @@ ExitCode _initAndListen(int listenPort) {
     if (serverGlobalParams.clusterRole == ClusterRole::ShardServer) {
         // Note: For replica sets, ShardingStateRecovery happens on transition to primary.
         if (!repl::ReplicationCoordinator::get(startupOpCtx.get())->isReplEnabled()) {
-            uassertStatusOK(ShardingStateRecovery::recover(startupOpCtx.get()));
+            if (ShardingState::get(startupOpCtx.get())->enabled()) {
+                uassertStatusOK(ShardingStateRecovery::recover(startupOpCtx.get()));
+            }
         }
     } else if (serverGlobalParams.clusterRole == ClusterRole::ConfigServer) {
         initializeGlobalShardingStateForMongoD(startupOpCtx.get(),
diff --git a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp
index 3ff7d24f4b9..4e7d3dd21f7 100644
--- a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp
@@ -686,7 +686,6 @@ void ReplicationCoordinatorExternalStateImpl::shardingOnStepDownHook() {
     if (serverGlobalParams.clusterRole == ClusterRole::ConfigServer) {
         Balancer::get(_service)->interruptBalancer();
     } else if (ShardingState::get(_service)->enabled()) {
-        invariant(serverGlobalParams.clusterRole == ClusterRole::ShardServer);
         ChunkSplitter::get(_service).onStepDown();
         CatalogCacheLoader::get(_service).onStepDown();
         PeriodicBalancerConfigRefresher::get(_service).onStepDown();
@@ -706,24 +705,16 @@ void ReplicationCoordinatorExternalStateImpl::shardingOnStepDownHook() {
 
 void ReplicationCoordinatorExternalStateImpl::_shardingOnTransitionToPrimaryHook(
     OperationContext* opCtx) {
-    auto status = ShardingStateRecovery::recover(opCtx);
-
-    if (ErrorCodes::isShutdownError(status.code())) {
-        // Note: callers of this method don't expect exceptions, so throw only unexpected fatal
-        // errors.
-        return;
-    }
-
-    fassert(40107, status);
-
     if (serverGlobalParams.clusterRole == ClusterRole::ConfigServer) {
-        status = ShardingCatalogManager::get(opCtx)->initializeConfigDatabaseIfNeeded(opCtx);
+        Status status = ShardingCatalogManager::get(opCtx)->initializeConfigDatabaseIfNeeded(opCtx);
         if (!status.isOK() && status != ErrorCodes::AlreadyInitialized) {
-            if (ErrorCodes::isShutdownError(status.code())) {
-                // Don't fassert if we're mid-shutdown, let the shutdown happen gracefully.
+            // If the node is shutting down or it lost quorum just as it was becoming primary, don't
+            // run the sharding onStepUp machinery. The onStepDown counterpart to these methods is
+            // already idempotent, so the machinery will remain in the stepped down state.
+            if (ErrorCodes::isShutdownError(status.code()) ||
+                ErrorCodes::isNotMasterError(status.code())) {
                 return;
             }
-
             fassertFailedWithStatus(
                 40184,
                 status.withContext("Failed to initialize config database on config server's "
@@ -735,17 +726,16 @@ void ReplicationCoordinatorExternalStateImpl::_shardingOnTransitionToPrimaryHook
         // readConcern in drain mode because the global lock prevents replication. This is
         // safe, since if the clusterId write is rolled back, any writes that depend on it will
         // also be rolled back.
-        // Since we *just* wrote the cluster ID to the config.version document (via
-        // ShardingCatalogManager::initializeConfigDatabaseIfNeeded), this should always
-        // succeed.
+        //
+        // Since we *just* wrote the cluster ID to the config.version document (via the call to
+        // ShardingCatalogManager::initializeConfigDatabaseIfNeeded above), this read can only
+        // meaningfully fail if the node is shutting down.
         status = ClusterIdentityLoader::get(opCtx)->loadClusterId(
             opCtx, repl::ReadConcernLevel::kLocalReadConcern);
 
         if (ErrorCodes::isShutdownError(status.code())) {
-            // Don't fassert if we're mid-shutdown, let the shutdown happen gracefully.
             return;
         }
-
         fassert(40217, status);
     }
 
@@ -760,11 +750,20 @@ void ReplicationCoordinatorExternalStateImpl::_shardingOnTransitionToPrimaryHook
             validator->enableKeyGenerator(opCtx, true);
         }
     } else if (ShardingState::get(opCtx)->enabled()) {
-        invariant(serverGlobalParams.clusterRole == ClusterRole::ShardServer);
+        Status status = ShardingStateRecovery::recover(opCtx);
+
+        // If the node is shutting down or it lost quorum just as it was becoming primary, don't run
+        // the sharding onStepUp machinery. The onStepDown counterpart to these methods is already
+        // idempotent, so the machinery will remain in the stepped down state.
+        if (ErrorCodes::isShutdownError(status.code()) ||
+            ErrorCodes::isNotMasterError(status.code())) {
+            return;
+        }
+        fassert(40107, status);
 
         const auto configsvrConnStr =
             Grid::get(opCtx)->shardRegistry()->getConfigShard()->getConnString();
-        auto status = ShardingInitializationMongoD::get(opCtx)->updateShardIdentityConfigString(
+        status = ShardingInitializationMongoD::get(opCtx)->updateShardIdentityConfigString(
             opCtx, configsvrConnStr);
         if (!status.isOK()) {
             warning() << "error encountered while trying to update config connection string to "
diff --git a/src/mongo/db/s/sharding_state_recovery.cpp b/src/mongo/db/s/sharding_state_recovery.cpp
index 0eecfbfe135..8914dc56254 100644
--- a/src/mongo/db/s/sharding_state_recovery.cpp
+++ b/src/mongo/db/s/sharding_state_recovery.cpp
@@ -218,9 +218,9 @@ void ShardingStateRecovery::endMetadataOp(OperationContext* opCtx) {
 }
 
 Status ShardingStateRecovery::recover(OperationContext* opCtx) {
-    if (serverGlobalParams.clusterRole != ClusterRole::ShardServer) {
-        return Status::OK();
-    }
+    Grid* const grid = Grid::get(opCtx);
+    ShardingState* const shardingState = ShardingState::get(opCtx);
+    invariant(shardingState->enabled());
 
     BSONObj recoveryDocBSON;
 
@@ -242,10 +242,6 @@ Status ShardingStateRecovery::recover(OperationContext* opCtx) {
 
     log() << "Sharding state recovery process found document " << redact(recoveryDoc.toBSON());
 
-    Grid* const grid = Grid::get(opCtx);
-    ShardingState* const shardingState = ShardingState::get(opCtx);
-    invariant(shardingState->enabled());
-
     if (!recoveryDoc.getMinOpTimeUpdaters()) {
         // Treat the minOpTime as up-to-date
         grid->advanceConfigOpTime(recoveryDoc.getMinOpTime());
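A side effect worth noting: recover() now asserts its precondition via invariant(shardingState->enabled()) instead of silently returning OK on non-shard-servers, so each call site must perform the enabled() check itself. The db.cpp hunk above shows this guard for shard servers started without replication; the same caller-side pattern in isolation (a minimal sketch using only identifiers from the patch):

    // Caller-side contract after this patch: check ShardingState::enabled()
    // before calling ShardingStateRecovery::recover(), which now invariants
    // that sharding is enabled rather than returning OK when it is not.
    if (ShardingState::get(opCtx)->enabled()) {
        uassertStatusOK(ShardingStateRecovery::recover(opCtx));
    }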