summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Matthew Russotto <matthew.russotto@10gen.com> 2020-06-09 15:35:13 -0400
committer Evergreen Agent <no-reply@evergreen.mongodb.com> 2020-06-09 19:49:57 +0000
commit 0ba63f264cc0be3bbc77e35ed94306c394ca95d9 (patch)
tree f8390e36e3daca2d33b764bea9e6394fb63ef907
parent 8b20e37d7164c8112b6173fc953031a39b59b3c0 (diff)
download mongo-0ba63f264cc0be3bbc77e35ed94306c394ca95d9.tar.gz
SERVER-48480 Abort initial sync upon transition to REMOVED state
-rw-r--r-- jstests/replsets/initial_sync_stops_when_syncing_node_removed.js 80
-rw-r--r-- src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp 19
2 files changed, 99 insertions, 0 deletions
diff --git a/jstests/replsets/initial_sync_stops_when_syncing_node_removed.js b/jstests/replsets/initial_sync_stops_when_syncing_node_removed.js
new file mode 100644
index 00000000000..2f1d9db4c95
--- /dev/null
+++ b/jstests/replsets/initial_sync_stops_when_syncing_node_removed.js
@@ -0,0 +1,80 @@
+/**
+ * Tests that a node removed from the config during initial sync cloning shuts down fatally.
+ * This test will time out if the initial syncing node's process never exits.
+ * @tags: [requires_fcv_44]
+ */
+(function() {
+"use strict";
+
+load("jstests/libs/fail_point_util.js");
+
+const testName = TestData.testName;
+const rst = new ReplSetTest({name: testName, nodes: 1});
+const nodes = rst.startSet();  // NOTE(review): 'nodes' is never used below.
+rst.initiateWithHighElectionTimeout();
+
+const primary = rst.getPrimary();
+const primaryDb = primary.getDB("test");
+const initialSyncSource = rst.getSecondary();  // NOTE(review): unused; set has one node here.
+
+// Insert a few documents so the initial syncing node has data to clone.
+assert.commandWorked(primaryDb.test.insert([{a: 1}, {b: 2}, {c: 3}]));
+rst.awaitReplication();
+
+jsTest.log("Adding the initial sync destination node to the replica set");
+const initialSyncNode = rst.add({
+ rsConfig: {priority: 0, votes: 0},
+ setParameter: {
+ 'failpoint.initialSyncHangBeforeCopyingDatabases': tojson({mode: 'alwaysOn'}),
+ 'numInitialSyncAttempts': 1
+ }
+});
+rst.reInitiate();
+rst.waitForState(initialSyncNode, ReplSetTest.State.STARTUP_2);
+
+// The code handling this case is common to all cloners, so exercise it only for the stage most
+// likely to see an error.
+const cloner = 'CollectionCloner';
+const stage = 'query';
+
+// Hang before initial sync finishes. NOTE(review): this fail point handle is never used below.
+const beforeFinishFailPoint = configureFailPoint(initialSyncNode, "initialSyncHangBeforeFinish");
+const initialSyncNodeDb = initialSyncNode.getDB("test");
+const failPointData = {
+ cloner: cloner,
+ stage: stage,
+ nss: 'test.test'
+};
+// Make the syncing node hang right before the chosen stage of the chosen cloner.
+const beforeStageFailPoint =
+ configureFailPoint(initialSyncNodeDb, "hangBeforeClonerStage", failPointData);
+// Turn off the startup fail point, then wait until the cloner stage fail point is hit.
+assert.commandWorked(initialSyncNodeDb.adminCommand(
+ {configureFailPoint: "initialSyncHangBeforeCopyingDatabases", mode: "off"}));
+beforeStageFailPoint.wait();
+
+jsTestLog("Testing removing syncing node in cloner " + cloner + " stage " + stage);
+// We can't use remove/reInitiate here because we still need to communicate with the removed
+// node, so reconfigure the primary directly with a config that omits it.
+let config = rst.getReplSetConfig();
+config.members.splice(1, 1); // Drops members[1]; presumably the initial sync node - confirm.
+config.version = rst.getReplSetConfigFromNode().version + 1;
+assert.commandWorked(primary.getDB("admin").adminCommand({replSetReconfig: config}));
+
+jsTestLog("Waiting for sync node to realize it is removed. It should fail as a result.");
+let res;
+assert.soon(function() {
+ res = checkProgram(initialSyncNode.pid);
+ return !res.alive;
+});
+
+const fassertProcessExitCode = _isWindows() ? MongoRunner.EXIT_ABRUPT : MongoRunner.EXIT_ABORT;
+assert.eq(fassertProcessExitCode, res.exitCode);
+assert(
+ rawMongoProgramOutput().match('Fatal assertion.*4848002'),
+ 'Initial syncing node should have crashed as a result of being removed from the configuration.');
+
+// We skip validation and dbhashes because the initial sync failed, so the initial sync node is
+// unreachable and invalid.
+rst.stopSet(null, null, {skipValidation: true});
+})();
diff --git a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
index 3c75ccf2843..c07dd8b3d6d 100644
--- a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
@@ -798,6 +798,25 @@ void ReplicationCoordinatorImpl::_heartbeatReconfigFinish(
// the data structures inside of the TopologyCoordinator.
const int myIndexValue = myIndex.getStatus().isOK() ? myIndex.getValue() : -1;
+ // If we were initial syncing and are now REMOVED, we must cancel the initial sync before
+ // installing the new config, to avoid triggering a storage invariant that has initial
+ // sync as a special case. We'll still fatally fail, but with a more meaningful error.
+ if (_initialSyncer && _memberState.startup2() && myIndexValue == -1) {
+ // The initial syncer may not be called inside the replication lock.
+ auto initialSyncerCopy = _initialSyncer;
+ LOGV2(4848000, "Canceling initial sync as this node is no longer in the configuration");
+ lk.unlock();
+ const auto status = initialSyncerCopy->shutdown();
+ if (!status.isOK()) {
+ LOGV2_WARNING(4848001, "InitialSyncer shutdown failed", "error"_attr = status);
+ }
+ initialSyncerCopy->join();
+ LOGV2_FATAL(4848002,
+ "Initial sync failed due to node being removed from the configuration. "
+ "Shutting down now. Restart the server to attempt a new initial sync");
+ lk.lock();
+ }
+
const PostMemberStateUpdateAction action =
_setCurrentRSConfig(lk, opCtx.get(), newConfig, myIndexValue);