author     Brett Nawrocki <brett.nawrocki@mongodb.com>       2023-05-05 18:50:17 +0000
committer  Evergreen Agent <no-reply@evergreen.mongodb.com>  2023-05-13 21:56:54 +0000
commit     7ab0d365afc71fbd43b3583fae5424cfb8f7e1a8
tree       a17ba92534ecf1dd60370642651f2fe270a195f0
parent     52ecfb60254c6b0627bb0ae4c69b1cb8c46adf59
download   mongo-7ab0d365afc71fbd43b3583fae5424cfb8f7e1a8.tar.gz
SERVER-76872 Prevent donor from outliving MovePrimaryCoordinator
 etc/backports_required_for_multiversion_tests.yml                                  |  4
 jstests/sharding/move_primary_donor_cleaned_up_if_coordinator_steps_up_aborted.js  | 66
 src/mongo/db/s/move_primary/move_primary_donor_service.cpp                         | 15
 src/mongo/db/s/move_primary/move_primary_donor_service_test.cpp                    |  4
 src/mongo/db/s/move_primary_coordinator.cpp                                        | 20
5 files changed, 100 insertions(+), 9 deletions(-)
diff --git a/etc/backports_required_for_multiversion_tests.yml b/etc/backports_required_for_multiversion_tests.yml
index 74d614bdfed..8d8cdad86af 100644
--- a/etc/backports_required_for_multiversion_tests.yml
+++ b/etc/backports_required_for_multiversion_tests.yml
@@ -383,6 +383,8 @@ last-continuous:
ticket: SERVER-74806
- test_file: jstests/sharding/resharding_with_multi_deletes_reduced_ticket_pool_size.js
ticket: SERVER-77097
+ - test_file: jstests/sharding/move_primary_donor_cleaned_up_if_coordinator_steps_up_aborted.js
+ ticket: SERVER-76872
suites: null
last-lts:
all:
@@ -850,4 +852,6 @@ last-lts:
ticket: SERVER-74806
- test_file: jstests/sharding/resharding_with_multi_deletes_reduced_ticket_pool_size.js
ticket: SERVER-77097
+ - test_file: jstests/sharding/move_primary_donor_cleaned_up_if_coordinator_steps_up_aborted.js
+ ticket: SERVER-76872
suites: null
diff --git a/jstests/sharding/move_primary_donor_cleaned_up_if_coordinator_steps_up_aborted.js b/jstests/sharding/move_primary_donor_cleaned_up_if_coordinator_steps_up_aborted.js
new file mode 100644
index 00000000000..c0bd7fbb364
--- /dev/null
+++ b/jstests/sharding/move_primary_donor_cleaned_up_if_coordinator_steps_up_aborted.js
@@ -0,0 +1,66 @@
+/**
+ * Tests that the MovePrimaryCoordinator recovers and cleans up the MovePrimaryDonor after a
+ * failover when the coordinator has already aborted.
+ *
+ * @tags: [
+ * requires_fcv_70,
+ * featureFlagOnlineMovePrimaryLifecycle
+ * ]
+ */
+(function() {
+'use strict';
+load("jstests/libs/fail_point_util.js");
+load("jstests/libs/parallel_shell_helpers.js");
+
+const st = new ShardingTest({mongos: 1, shards: 2, rs: {nodes: 3}});
+
+const mongos = st.s0;
+const shard0 = st.shard0;
+const oldDonorPrimary = st.rs0.getPrimary();
+const shard1 = st.shard1;
+
+const dbName = 'test_db';
+const collName = 'test_coll';
+const collNS = dbName + '.' + collName;
+
+assert.commandWorked(mongos.adminCommand({enableSharding: dbName, primaryShard: shard0.shardName}));
+assert.commandWorked(mongos.getCollection(collNS).insert({value: 1}));
+assert.commandWorked(mongos.getCollection(collNS).insert({value: 2}));
+
+const donorStartedCloningFp = configureFailPoint(oldDonorPrimary,
+ "pauseDuringMovePrimaryDonorStateTransition",
+ {progress: "after", state: "cloning"});
+
+// Run movePrimary and wait for MovePrimaryDonor to start.
+const joinMovePrimary = startParallelShell(
+ funWithArgs(function(dbName, toShard) {
+ assert.commandFailed(db.adminCommand({movePrimary: dbName, to: toShard}));
+ }, dbName, shard1.shardName), mongos.port);
+
+donorStartedCloningFp.wait();
+
+// Trigger a failover. The MovePrimaryCoordinator will abort on step up. Make sure it does not clean
+// up the donor yet.
+const pauseCoordinatorFps = new Map();
+st.rs0.nodes.forEach(node => pauseCoordinatorFps.set(
+    node, configureFailPoint(node, "movePrimaryCoordinatorHangBeforeCleaningUp")));
+st.rs0.getPrimary().adminCommand({replSetStepDown: ReplSetTest.kForeverSecs, force: 1});
+donorStartedCloningFp.off();
+st.rs0.awaitNodesAgreeOnPrimary();
+
+// TODO SERVER-77115: Investigate why test times out if this sleep is removed.
+sleep(5000);
+
+// Trigger another failover while (1) the MovePrimaryCoordinator has already aborted and (2) the
+// MovePrimaryDonor is still alive. This is the scenario this test is designed to exercise.
+pauseCoordinatorFps.get(st.rs0.getPrimary()).wait();
+st.rs0.getPrimary().adminCommand({replSetStepDown: ReplSetTest.kForeverSecs, force: 1});
+st.rs0.awaitNodesAgreeOnPrimary();
+pauseCoordinatorFps.forEach(fp => fp.off());
+joinMovePrimary();
+
+// Verify that the MovePrimaryCoordinator has cleaned up the MovePrimaryDonor.
+assert.eq([], shard0.getDB("config").movePrimaryDonors.find({}).toArray());
+
+st.stop();
+})();
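The key orchestration idiom in the test above: the coordinator failpoint is armed on every node of the donor shard before the forced step-down, because any node may win the subsequent election, and the failpoint belonging to whichever node becomes primary is the one that gets waited on. A minimal sketch of that idiom, assuming an already-running ShardingTest st with a three-node shard as in the test:

    load("jstests/libs/fail_point_util.js");

    // Arm the failpoint on every node: any of them may step up as the new primary.
    const fps = new Map();
    st.rs0.nodes.forEach(
        node => fps.set(node, configureFailPoint(node, "movePrimaryCoordinatorHangBeforeCleaningUp")));

    // Force a failover and wait until the replica set agrees on a new primary.
    st.rs0.getPrimary().adminCommand({replSetStepDown: ReplSetTest.kForeverSecs, force: true});
    st.rs0.awaitNodesAgreeOnPrimary();

    // Only the failpoint on the new primary is reached; wait on it, then release all of them.
    fps.get(st.rs0.getPrimary()).wait();
    fps.forEach(fp => fp.off());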
diff --git a/src/mongo/db/s/move_primary/move_primary_donor_service.cpp b/src/mongo/db/s/move_primary/move_primary_donor_service.cpp
index 8bf7ec7c41b..104fb451593 100644
--- a/src/mongo/db/s/move_primary/move_primary_donor_service.cpp
+++ b/src/mongo/db/s/move_primary/move_primary_donor_service.cpp
@@ -42,8 +42,8 @@ namespace {
// Both of these failpoints have the same implementation. A single failpoint can't be active
// multiple times with different arguments, but setting up more complex scenarios sometimes requires
// multiple failpoints.
-MONGO_FAIL_POINT_DEFINE(pauseDuringMovePrimaryDonorStateEnumTransition);
-MONGO_FAIL_POINT_DEFINE(pauseDuringMovePrimaryDonorStateEnumTransitionAlternate);
+MONGO_FAIL_POINT_DEFINE(pauseDuringMovePrimaryDonorStateTransition);
+MONGO_FAIL_POINT_DEFINE(pauseDuringMovePrimaryDonorStateTransitionAlternate);
MONGO_FAIL_POINT_DEFINE(pauseBeforeBeginningMovePrimaryDonorWorkflow);
MONGO_FAIL_POINT_DEFINE(pauseBeforeMovePrimaryDonorPersistsBlockTimestamp);
@@ -74,7 +74,7 @@ boost::optional<StateTransitionProgress> readProgressArgument(const BSONObj& dat
boost::optional<MovePrimaryDonorStateEnum> readStateArgument(const BSONObj& data) {
try {
auto arg = data.getStringField("state");
- IDLParserContext ectx("pauseDuringMovePrimaryDonorStateEnumTransition::readStateArgument");
+ IDLParserContext ectx("pauseDuringMovePrimaryDonorStateTransition::readStateArgument");
return MovePrimaryDonorState_parse(ectx, arg);
} catch (...) {
return boost::none;
@@ -91,7 +91,7 @@ void evaluatePauseDuringStateTransitionFailpoint(StateTransitionProgress progres
auto desiredState = readStateArgument(data);
if (!desiredProgress.has_value() || !desiredState.has_value()) {
LOGV2(7306200,
- "pauseDuringMovePrimaryDonorStateEnumTransition failpoint data must contain "
+ "pauseDuringMovePrimaryDonorStateTransition failpoint data must contain "
"progress and state arguments",
"failpoint"_attr = failpoint.getName(),
"data"_attr = data);
@@ -103,8 +103,8 @@ void evaluatePauseDuringStateTransitionFailpoint(StateTransitionProgress progres
void evaluatePauseDuringStateTransitionFailpoints(StateTransitionProgress progress,
MovePrimaryDonorStateEnum newState) {
- const auto fps = {std::ref(pauseDuringMovePrimaryDonorStateEnumTransition),
- std::ref(pauseDuringMovePrimaryDonorStateEnumTransitionAlternate)};
+ const auto fps = {std::ref(pauseDuringMovePrimaryDonorStateTransition),
+ std::ref(pauseDuringMovePrimaryDonorStateTransitionAlternate)};
for (auto& fp : fps) {
evaluatePauseDuringStateTransitionFailpoint(progress, newState, fp);
}
@@ -307,7 +307,8 @@ void MovePrimaryDonorExternalState::_runCommandOnRecipient(OperationContext* opC
DatabaseName::kAdmin.toString(),
command,
Shard::RetryPolicy::kNoRetry);
- uassertStatusOK(Shard::CommandResponse::getEffectiveStatus(response));
+ uassertStatusOKWithContext(Shard::CommandResponse::getEffectiveStatus(response),
+ "Received error from remote MovePrimaryRecipient");
}
MovePrimaryDonorExternalStateImpl::MovePrimaryDonorExternalStateImpl(
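As the comment preserved in the hunk above notes, a single failpoint cannot be active twice with different arguments, which is why the donor service defines pauseDuringMovePrimaryDonorStateTransition together with an identically implemented Alternate twin. A hedged sketch of how a jstest could use the pair to pause at two different points of the donor state machine; only the {progress: "after", state: "cloning"} combination appears in this commit, so the "before" progress value, the "blockingWrites" state name, and the donorPrimary connection are illustrative assumptions:

    load("jstests/libs/fail_point_util.js");

    // Pause once the donor has finished transitioning into the "cloning" state.
    const afterCloningFp = configureFailPoint(donorPrimary,
                                              "pauseDuringMovePrimaryDonorStateTransition",
                                              {progress: "after", state: "cloning"});

    // Independently pause around a later transition; the first failpoint cannot be armed a
    // second time with different arguments, hence the alternate failpoint.
    // ("before" and "blockingWrites" are assumed values for illustration.)
    const laterTransitionFp = configureFailPoint(donorPrimary,
                                                 "pauseDuringMovePrimaryDonorStateTransitionAlternate",
                                                 {progress: "before", state: "blockingWrites"});

    afterCloningFp.wait();
    // ... exercise the scenario that needs the donor held in "cloning" ...
    afterCloningFp.off();

    laterTransitionFp.wait();
    // ... exercise the later scenario ...
    laterTransitionFp.off();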
diff --git a/src/mongo/db/s/move_primary/move_primary_donor_service_test.cpp b/src/mongo/db/s/move_primary/move_primary_donor_service_test.cpp
index 56442402256..fe7ea75b7a2 100644
--- a/src/mongo/db/s/move_primary/move_primary_donor_service_test.cpp
+++ b/src/mongo/db/s/move_primary/move_primary_donor_service_test.cpp
@@ -216,13 +216,13 @@ protected:
auto pauseStateTransition(const std::string& progress, MovePrimaryDonorStateEnum state) {
return pauseStateTransitionImpl(
- progress, state, "pauseDuringMovePrimaryDonorStateEnumTransition");
+ progress, state, "pauseDuringMovePrimaryDonorStateTransition");
}
auto pauseStateTransitionAlternate(const std::string& progress,
MovePrimaryDonorStateEnum state) {
return pauseStateTransitionImpl(
- progress, state, "pauseDuringMovePrimaryDonorStateEnumTransitionAlternate");
+ progress, state, "pauseDuringMovePrimaryDonorStateTransitionAlternate");
}
auto failCrudOpsOn(NamespaceString nss, ErrorCodes::Error code) {
diff --git a/src/mongo/db/s/move_primary_coordinator.cpp b/src/mongo/db/s/move_primary_coordinator.cpp
index fbf4cbf3eb9..354e72e52aa 100644
--- a/src/mongo/db/s/move_primary_coordinator.cpp
+++ b/src/mongo/db/s/move_primary_coordinator.cpp
@@ -61,6 +61,7 @@ bool useOnlineCloner() {
} // namespace
MONGO_FAIL_POINT_DEFINE(hangBeforeCloningData);
+MONGO_FAIL_POINT_DEFINE(movePrimaryCoordinatorHangBeforeCleaningUp);
MovePrimaryCoordinator::MovePrimaryCoordinator(ShardingDDLCoordinatorService* service,
const BSONObj& initialState)
@@ -348,8 +349,16 @@ bool MovePrimaryCoordinator::onlineClonerAllowedToBeMissing() const {
}
void MovePrimaryCoordinator::recoverOnlineCloner(OperationContext* opCtx) {
+ if (_onlineCloner) {
+ return;
+ }
_onlineCloner = MovePrimaryDonor::get(opCtx, _dbName, _doc.getToShardId());
if (_onlineCloner) {
+ LOGV2(7687200,
+ "MovePrimaryCoordinator found existing online cloner",
+ "migrationId"_attr = _onlineCloner->getMetadata().getMigrationId(),
+ logAttrs(_dbName),
+ "to"_attr = _doc.getToShardId());
return;
}
invariant(onlineClonerAllowedToBeMissing());
@@ -358,6 +367,11 @@ void MovePrimaryCoordinator::recoverOnlineCloner(OperationContext* opCtx) {
void MovePrimaryCoordinator::createOnlineCloner(OperationContext* opCtx) {
invariant(onlineClonerPossiblyNeverCreated());
_onlineCloner = MovePrimaryDonor::create(opCtx, _dbName, _doc.getToShardId());
+ LOGV2(7687201,
+ "MovePrimaryCoordinator created new online cloner",
+ "migrationId"_attr = _onlineCloner->getMetadata().getMigrationId(),
+ logAttrs(_dbName),
+ "to"_attr = _doc.getToShardId());
}
void MovePrimaryCoordinator::cloneDataUntilReadyForCatchup(OperationContext* opCtx,
@@ -397,6 +411,11 @@ ExecutorFuture<void> MovePrimaryCoordinator::_cleanupOnAbort(
auto* opCtx = opCtxHolder.get();
getForwardableOpMetadata().setOn(opCtx);
+ if (MONGO_unlikely(movePrimaryCoordinatorHangBeforeCleaningUp.shouldFail())) {
+ LOGV2(7687202, "Hit movePrimaryCoordinatorHangBeforeCleaningUp");
+ movePrimaryCoordinatorHangBeforeCleaningUp.pauseWhileSet(opCtx);
+ }
+
_performNoopRetryableWriteOnAllShardsAndConfigsvr(
opCtx, getNewSession(opCtx), **executor);
@@ -464,6 +483,7 @@ void MovePrimaryCoordinator::cleanupOnAbortWithOnlineCloner(OperationContext* op
const CancellationToken& token,
const Status& status) {
unblockReadsAndWrites(opCtx);
+ recoverOnlineCloner(opCtx);
if (!_onlineCloner) {
return;
}