diff options
author | Matthew Russotto <matthew.russotto@10gen.com> | 2020-06-09 15:35:13 -0400 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2020-06-09 19:49:57 +0000 |
commit | 0ba63f264cc0be3bbc77e35ed94306c394ca95d9 (patch) | |
tree | f8390e36e3daca2d33b764bea9e6394fb63ef907 | |
parent | 8b20e37d7164c8112b6173fc953031a39b59b3c0 (diff) | |
download | mongo-0ba63f264cc0be3bbc77e35ed94306c394ca95d9.tar.gz |
SERVER-48480 Abort initial sync upon transition to REMOVED state
-rw-r--r-- | jstests/replsets/initial_sync_stops_when_syncing_node_removed.js | 80 | ||||
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp | 19 |
2 files changed, 99 insertions, 0 deletions
diff --git a/jstests/replsets/initial_sync_stops_when_syncing_node_removed.js b/jstests/replsets/initial_sync_stops_when_syncing_node_removed.js new file mode 100644 index 00000000000..2f1d9db4c95 --- /dev/null +++ b/jstests/replsets/initial_sync_stops_when_syncing_node_removed.js @@ -0,0 +1,80 @@ +/** + * Tests that initial sync will abort an attempt if the syncing node is removed during cloning. + * This test will time out if the attempt is not aborted. + * @tags: [requires_fcv_44] + */ +(function() { +"use strict"; + +load("jstests/libs/fail_point_util.js"); + +const testName = TestData.testName; +const rst = new ReplSetTest({name: testName, nodes: 1}); +const nodes = rst.startSet(); +rst.initiateWithHighElectionTimeout(); + +const primary = rst.getPrimary(); +const primaryDb = primary.getDB("test"); +const initialSyncSource = rst.getSecondary(); + +// Add some data to be cloned. +assert.commandWorked(primaryDb.test.insert([{a: 1}, {b: 2}, {c: 3}])); +rst.awaitReplication(); + +jsTest.log("Adding the initial sync destination node to the replica set"); +const initialSyncNode = rst.add({ + rsConfig: {priority: 0, votes: 0}, + setParameter: { + 'failpoint.initialSyncHangBeforeCopyingDatabases': tojson({mode: 'alwaysOn'}), + 'numInitialSyncAttempts': 1 + } +}); +rst.reInitiate(); +rst.waitForState(initialSyncNode, ReplSetTest.State.STARTUP_2); + +// The code handling this case is common to all cloners, so run it only for the stage most likely +// to see an error. +const cloner = 'CollectionCloner'; +const stage = 'query'; + +// Set us up to hang before finish so we can check status. +const beforeFinishFailPoint = configureFailPoint(initialSyncNode, "initialSyncHangBeforeFinish"); +const initialSyncNodeDb = initialSyncNode.getDB("test"); +const failPointData = { + cloner: cloner, + stage: stage, + nss: 'test.test' +}; +// Set us up to stop right before the given stage. 
+const beforeStageFailPoint = + configureFailPoint(initialSyncNodeDb, "hangBeforeClonerStage", failPointData); +// Release the initial failpoint. +assert.commandWorked(initialSyncNodeDb.adminCommand( + {configureFailPoint: "initialSyncHangBeforeCopyingDatabases", mode: "off"})); +beforeStageFailPoint.wait(); + +jsTestLog("Testing removing syncing node in cloner " + cloner + " stage " + stage); +// We can't use remove/reInitiate here because we still need to communicate with the removed +// node. +let config = rst.getReplSetConfig(); +config.members.splice(1, 1); // Removes node[1] +config.version = rst.getReplSetConfigFromNode().version + 1; +assert.commandWorked(primary.getDB("admin").adminCommand({replSetReconfig: config})); + +jsTestLog("Waiting for sync node to realize it is removed. It should fail as a result."); +let res; +assert.soon(function() { + res = checkProgram(initialSyncNode.pid); + return !res.alive; +}); + +const fassertProcessExitCode = _isWindows() ? MongoRunner.EXIT_ABRUPT : MongoRunner.EXIT_ABORT; +assert.eq(fassertProcessExitCode, res.exitCode); +assert( + rawMongoProgramOutput().match('Fatal assertion.*4848002'), + 'Initial syncing node should have crashed as a result of being removed from the configuration.'); + +// We skip validation and dbhashes because the initial sync failed so the initial sync node is +// unreachable and invalid. +rst.stopSet(null, null, {skipValidation: true}); +})(); diff --git a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp index 3c75ccf2843..c07dd8b3d6d 100644 --- a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp @@ -798,6 +798,25 @@ void ReplicationCoordinatorImpl::_heartbeatReconfigFinish( // the data structures inside of the TopologyCoordinator. const int myIndexValue = myIndex.getStatus().isOK() ? 
myIndex.getValue() : -1; + // If we were initial syncing and are now REMOVED, we must cancel the initial sync before + // installing the new config, to avoid triggering a storage invariant that has initial + // sync as a special case. We'll still fatally fail, but with a more meaningful error. + if (_initialSyncer && _memberState.startup2() && myIndexValue == -1) { + // The initial syncer may not be called inside the replication lock. + auto initialSyncerCopy = _initialSyncer; + LOGV2(4848000, "Canceling initial sync as this node is no longer in the configuration"); + lk.unlock(); + const auto status = initialSyncerCopy->shutdown(); + if (!status.isOK()) { + LOGV2_WARNING(4848001, "InitialSyncer shutdown failed", "error"_attr = status); + } + initialSyncerCopy->join(); + LOGV2_FATAL(4848002, + "Initial sync failed due to node being removed from the configuration. " + "Shutting down now. Restart the server to attempt a new initial sync"); + lk.lock(); + } + const PostMemberStateUpdateAction action = _setCurrentRSConfig(lk, opCtx.get(), newConfig, myIndexValue); |