author     Kaloian Manassiev <kaloian.manassiev@mongodb.com>  2018-09-24 14:31:22 -0400
committer  Kaloian Manassiev <kaloian.manassiev@mongodb.com>  2018-09-26 02:47:40 -0400
commit     a0ebd4bb3a30fdf574fd08ab473e7d6ce1b59619 (patch)
tree       ba437e69a942aa896bf856c02bb7c51eac80e407 /src/mongo/db
parent     dbc88bbcf51c3adc2f2a0ad3e4c22b98a2dc9388 (diff)
download   mongo-a0ebd4bb3a30fdf574fd08ab473e7d6ce1b59619.tar.gz
SERVER-30714 Handle 'not master' errors in ReplicationCoordinatorExternalStateImpl::_shardingOnTransitionToPrimaryHook
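In outline, the patch moves ShardingStateRecovery::recover() out of the common prologue of _shardingOnTransitionToPrimaryHook and into the shard-server branch, and widens the early-return test from shutdown errors alone to shutdown and 'not master' errors. The following is a condensed, illustrative sketch of the resulting step-up control flow, not the full hook; every identifier is taken from the patch below, and the remaining step-up work is elided:

    // Condensed sketch of the step-up hook's new error handling. If step-up
    // is interrupted by shutdown or by losing primacy, return early instead
    // of fassert()-ing: the onStepDown hooks are idempotent, so the node
    // simply remains in the stepped down state.
    void _shardingOnTransitionToPrimaryHook(OperationContext* opCtx) {
        if (ShardingState::get(opCtx)->enabled()) {
            Status status = ShardingStateRecovery::recover(opCtx);
            if (ErrorCodes::isShutdownError(status.code()) ||
                ErrorCodes::isNotMasterError(status.code())) {
                return;  // Benign interruption; stay stepped down.
            }
            fassert(40107, status);  // Any other failure is fatal.
            // ... rest of the shard-server step-up work ...
        }
    }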
Diffstat (limited to 'src/mongo/db')
-rw-r--r--  src/mongo/db/db.cpp                                                |  4
-rw-r--r--  src/mongo/db/repl/replication_coordinator_external_state_impl.cpp  | 43
-rw-r--r--  src/mongo/db/s/sharding_state_recovery.cpp                         | 10
3 files changed, 27 insertions, 30 deletions
diff --git a/src/mongo/db/db.cpp b/src/mongo/db/db.cpp
index f92ee6409e9..0b9c8105759 100644
--- a/src/mongo/db/db.cpp
+++ b/src/mongo/db/db.cpp
@@ -549,7 +549,9 @@ ExitCode _initAndListen(int listenPort) {
     if (serverGlobalParams.clusterRole == ClusterRole::ShardServer) {
         // Note: For replica sets, ShardingStateRecovery happens on transition to primary.
         if (!repl::ReplicationCoordinator::get(startupOpCtx.get())->isReplEnabled()) {
-            uassertStatusOK(ShardingStateRecovery::recover(startupOpCtx.get()));
+            if (ShardingState::get(startupOpCtx.get())->enabled()) {
+                uassertStatusOK(ShardingStateRecovery::recover(startupOpCtx.get()));
+            }
         }
     } else if (serverGlobalParams.clusterRole == ClusterRole::ConfigServer) {
         initializeGlobalShardingStateForMongoD(startupOpCtx.get(),
diff --git a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp
index 3ff7d24f4b9..4e7d3dd21f7 100644
--- a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp
@@ -686,7 +686,6 @@ void ReplicationCoordinatorExternalStateImpl::shardingOnStepDownHook() {
     if (serverGlobalParams.clusterRole == ClusterRole::ConfigServer) {
         Balancer::get(_service)->interruptBalancer();
     } else if (ShardingState::get(_service)->enabled()) {
-        invariant(serverGlobalParams.clusterRole == ClusterRole::ShardServer);
         ChunkSplitter::get(_service).onStepDown();
         CatalogCacheLoader::get(_service).onStepDown();
         PeriodicBalancerConfigRefresher::get(_service).onStepDown();
@@ -706,24 +705,16 @@ void ReplicationCoordinatorExternalStateImpl::shardingOnStepDownHook() {
 
 void ReplicationCoordinatorExternalStateImpl::_shardingOnTransitionToPrimaryHook(
     OperationContext* opCtx) {
-    auto status = ShardingStateRecovery::recover(opCtx);
-
-    if (ErrorCodes::isShutdownError(status.code())) {
-        // Note: callers of this method don't expect exceptions, so throw only unexpected fatal
-        // errors.
-        return;
-    }
-
-    fassert(40107, status);
-
     if (serverGlobalParams.clusterRole == ClusterRole::ConfigServer) {
-        status = ShardingCatalogManager::get(opCtx)->initializeConfigDatabaseIfNeeded(opCtx);
+        Status status = ShardingCatalogManager::get(opCtx)->initializeConfigDatabaseIfNeeded(opCtx);
         if (!status.isOK() && status != ErrorCodes::AlreadyInitialized) {
-            if (ErrorCodes::isShutdownError(status.code())) {
-                // Don't fassert if we're mid-shutdown, let the shutdown happen gracefully.
+            // If the node is shutting down or it lost quorum just as it was becoming primary, don't
+            // run the sharding onStepUp machinery. The onStepDown counterpart to these methods is
+            // already idempotent, so the machinery will remain in the stepped down state.
+            if (ErrorCodes::isShutdownError(status.code()) ||
+                ErrorCodes::isNotMasterError(status.code())) {
                 return;
             }
-
             fassertFailedWithStatus(
                 40184,
                 status.withContext("Failed to initialize config database on config server's "
@@ -735,17 +726,16 @@ void ReplicationCoordinatorExternalStateImpl::_shardingOnTransitionToPrimaryHook
         // readConcern in drain mode because the global lock prevents replication. This is
         // safe, since if the clusterId write is rolled back, any writes that depend on it will
         // also be rolled back.
-        // Since we *just* wrote the cluster ID to the config.version document (via
-        // ShardingCatalogManager::initializeConfigDatabaseIfNeeded), this should always
-        // succeed.
+        //
+        // Since we *just* wrote the cluster ID to the config.version document (via the call to
+        // ShardingCatalogManager::initializeConfigDatabaseIfNeeded above), this read can only
+        // meaningfully fail if the node is shutting down.
         status = ClusterIdentityLoader::get(opCtx)->loadClusterId(
             opCtx, repl::ReadConcernLevel::kLocalReadConcern);
 
         if (ErrorCodes::isShutdownError(status.code())) {
-            // Don't fassert if we're mid-shutdown, let the shutdown happen gracefully.
             return;
         }
-
         fassert(40217, status);
     }
 
@@ -760,11 +750,20 @@ void ReplicationCoordinatorExternalStateImpl::_shardingOnTransitionToPrimaryHook
             validator->enableKeyGenerator(opCtx, true);
         }
     } else if (ShardingState::get(opCtx)->enabled()) {
-        invariant(serverGlobalParams.clusterRole == ClusterRole::ShardServer);
+        Status status = ShardingStateRecovery::recover(opCtx);
+
+        // If the node is shutting down or it lost quorum just as it was becoming primary, don't run
+        // the sharding onStepUp machinery. The onStepDown counterpart to these methods is already
+        // idempotent, so the machinery will remain in the stepped down state.
+        if (ErrorCodes::isShutdownError(status.code()) ||
+            ErrorCodes::isNotMasterError(status.code())) {
+            return;
+        }
+        fassert(40107, status);
 
         const auto configsvrConnStr =
             Grid::get(opCtx)->shardRegistry()->getConfigShard()->getConnString();
-        auto status = ShardingInitializationMongoD::get(opCtx)->updateShardIdentityConfigString(
+        status = ShardingInitializationMongoD::get(opCtx)->updateShardIdentityConfigString(
             opCtx, configsvrConnStr);
         if (!status.isOK()) {
             warning() << "error encountered while trying to update config connection string to "
diff --git a/src/mongo/db/s/sharding_state_recovery.cpp b/src/mongo/db/s/sharding_state_recovery.cpp
index 0eecfbfe135..8914dc56254 100644
--- a/src/mongo/db/s/sharding_state_recovery.cpp
+++ b/src/mongo/db/s/sharding_state_recovery.cpp
@@ -218,9 +218,9 @@ void ShardingStateRecovery::endMetadataOp(OperationContext* opCtx) {
 }
 
 Status ShardingStateRecovery::recover(OperationContext* opCtx) {
-    if (serverGlobalParams.clusterRole != ClusterRole::ShardServer) {
-        return Status::OK();
-    }
+    Grid* const grid = Grid::get(opCtx);
+    ShardingState* const shardingState = ShardingState::get(opCtx);
+    invariant(shardingState->enabled());
 
     BSONObj recoveryDocBSON;
 
@@ -242,10 +242,6 @@ Status ShardingStateRecovery::recover(OperationContext* opCtx) {
 
     log() << "Sharding state recovery process found document " << redact(recoveryDoc.toBSON());
 
-    Grid* const grid = Grid::get(opCtx);
-    ShardingState* const shardingState = ShardingState::get(opCtx);
-    invariant(shardingState->enabled());
-
     if (!recoveryDoc.getMinOpTimeUpdaters()) {
         // Treat the minOpTime as up-to-date
         grid->advanceConfigOpTime(recoveryDoc.getMinOpTime());
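A side effect worth noting: recover() now asserts its precondition via invariant(shardingState->enabled()) instead of silently returning OK on non-shard-servers, so each call site must perform the enabled() check itself. The db.cpp hunk above shows this guard for shard servers started without replication; the same caller-side pattern in isolation (a minimal sketch using only identifiers from the patch):

    // Caller-side contract after this patch: check ShardingState::enabled()
    // before calling ShardingStateRecovery::recover(), which now invariants
    // that sharding is enabled rather than returning OK when it is not.
    if (ShardingState::get(opCtx)->enabled()) {
        uassertStatusOK(ShardingStateRecovery::recover(opCtx));
    }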