diff options
author | Samy Lanka <samy.lanka@10gen.com> | 2017-07-05 15:18:06 -0400 |
---|---|---|
committer | Samy Lanka <samy.lanka@10gen.com> | 2017-08-02 15:29:26 -0400 |
commit | 5f1ce8b6765a25d45ba5e35063db417b3069c8d6 (patch) | |
tree | de942dd067b10bb2ae70f3a5b0f5601cd7fa87ec | |
parent | 743119c879ab2e5d1d8ca05aadf6fe29b1526a94 (diff) | |
download | mongo-5f1ce8b6765a25d45ba5e35063db417b3069c8d6.tar.gz |
SERVER-29500 Call for catchup takeover election when catchup takeover timeout fires
-rw-r--r-- | buildscripts/resmokeconfig/suites/replica_sets_pv0.yml | 5 | ||||
-rw-r--r-- | jstests/libs/write_concern_util.js | 8 | ||||
-rw-r--r-- | jstests/replsets/catchup_takeover_one_high_priority.js | 84 | ||||
-rw-r--r-- | jstests/replsets/catchup_takeover_two_nodes_ahead.js | 56 | ||||
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_impl.cpp | 2 | ||||
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_impl.h | 6 | ||||
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_impl_elect.cpp | 4 | ||||
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_impl_elect_v1_test.cpp | 210 | ||||
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp | 38 | ||||
-rw-r--r-- | src/mongo/db/repl/topology_coordinator.h | 9 | ||||
-rw-r--r-- | src/mongo/db/repl/topology_coordinator_impl.cpp | 71 | ||||
-rw-r--r-- | src/mongo/db/repl/topology_coordinator_impl.h | 10 | ||||
-rw-r--r-- | src/mongo/db/repl/topology_coordinator_impl_v1_test.cpp | 232 | ||||
-rw-r--r-- | src/mongo/shell/replsettest.js | 5 |
14 files changed, 682 insertions, 58 deletions
diff --git a/buildscripts/resmokeconfig/suites/replica_sets_pv0.yml b/buildscripts/resmokeconfig/suites/replica_sets_pv0.yml index a5e8a37c1cd..b78e6fbb13e 100644 --- a/buildscripts/resmokeconfig/suites/replica_sets_pv0.yml +++ b/buildscripts/resmokeconfig/suites/replica_sets_pv0.yml @@ -4,7 +4,6 @@ selector: roots: - jstests/replsets/*.js exclude_files: - - jstests/replsets/catchup.js - jstests/replsets/config_server_checks.js - jstests/replsets/disallow_adding_initialized_node1.js - jstests/replsets/disallow_adding_initialized_node2.js @@ -32,6 +31,10 @@ selector: - jstests/replsets/command_response_operation_time.js # Majority read concern not supported in PV0. - jstests/replsets/operation_time_read_and_write_concern.js + # Catchup not supported in PV0. + - jstests/replsets/catchup.js + - jstests/replsets/catchup_takeover_one_high_priority.js + - jstests/replsets/catchup_takeover_two_nodes_ahead.js executor: config: diff --git a/jstests/libs/write_concern_util.js b/jstests/libs/write_concern_util.js index 8a846cd7f83..4bbf6ad4d29 100644 --- a/jstests/libs/write_concern_util.js +++ b/jstests/libs/write_concern_util.js @@ -35,8 +35,12 @@ function stopServerReplication(conn) { assert.commandWorked( conn.adminCommand({configureFailPoint: 'stopReplProducer', mode: 'alwaysOn'}), errMsg); - // Wait until the fail point is actually hit. - checkLog.contains(conn, 'bgsync - stopReplProducer fail point enabled'); + // Wait until the fail point is actually hit. Don't wait if the node is the primary, because + // the fail point won't be hit until the node transitions from being the primary. + if (assert.commandWorked(conn.adminCommand('replSetGetStatus')).myState != + ReplSetTest.State.PRIMARY) { + checkLog.contains(conn, 'bgsync - stopReplProducer fail point enabled'); + } } // Stops replication at all replicaset secondaries. diff --git a/jstests/replsets/catchup_takeover_one_high_priority.js b/jstests/replsets/catchup_takeover_one_high_priority.js new file mode 100644 index 00000000000..ed6d7faee50 --- /dev/null +++ b/jstests/replsets/catchup_takeover_one_high_priority.js @@ -0,0 +1,84 @@ +// Test to ensure that catchup takeover runs even if it isn't the highest +// priority node and that once the high priority node is caught up, +// it becomes primary again. + +// 3-node replica set with one high priority node. +// Start replica set. Make node 0 primary and stop the replication +// for the high priority node as well as isolate it. Have the +// primary write something so node 2 is more than 2 seconds behind. +// Write something else to ensure the third node is also lagged. +// Reconnect the high priority node to the other nodes and make +// the lagged node (node 1) the next primary. +// Confirm that the most up-to-date node becomes primary. +// Let the highest priority node catchup and then confirm +// that it becomes primary. + +(function() { + 'use strict'; + + load('jstests/replsets/rslib.js'); + + var name = 'catchup_takeover_one_high_priority'; + var replSet = + new ReplSetTest({name: name, nodes: [{}, {}, {rsConfig: {priority: 2}}], useBridge: true}); + var nodes = replSet.startSet(); + replSet.initiate(); + + // Wait until node 2 becomes primary. + replSet.waitForState(2, ReplSetTest.State.PRIMARY, replSet.kDefaultTimeoutMS); + jsTestLog('node 2 is now primary'); + + replSet.awaitReplication(); + + // Stop replication and disconnect node 2 so that it cannot do a priority takeover. + stopServerReplication(nodes[2]); + nodes[2].disconnect(nodes[1]); + nodes[2].disconnect(nodes[0]); + + // Ensure that node 0 becomes primary. + assert.commandWorked(nodes[0].adminCommand({replSetStepUp: 1})); + replSet.awaitNodesAgreeOnPrimary(replSet.kDefaultTimeoutMS, nodes.slice(0, 2)); + assert.eq(ReplSetTest.State.PRIMARY, + assert.commandWorked(nodes[0].adminCommand('replSetGetStatus')).myState, + nodes[0].host + " was not primary after step-up"); + jsTestLog('node 0 is now primary'); + + // Sleep for a few seconds to ensure that node 2's optime is more than 2 seconds behind. + // This will ensure it can't do a priority takeover until it catches up. + sleep(3000); + + var primary = replSet.getPrimary(); + var writeConcern = {writeConcern: {w: 2, wtimeout: replSet.kDefaultTimeoutMS}}; + assert.writeOK(primary.getDB(name).bar.insert({y: 100}, writeConcern)); + + // Write something so that node 0 is ahead of node 1. + stopServerReplication(nodes[1]); + writeConcern = {writeConcern: {w: 1, wtimeout: replSet.kDefaultTimeoutMS}}; + assert.writeOK(primary.getDB(name).bar.insert({x: 100}, writeConcern)); + + nodes[2].reconnect(nodes[0]); + nodes[2].reconnect(nodes[1]); + + // Step up a lagged node. + assert.commandWorked(nodes[1].adminCommand({replSetStepUp: 1})); + replSet.awaitNodesAgreeOnPrimary(replSet.kDefaultTimeoutMS, nodes); + assert.eq(ReplSetTest.State.PRIMARY, + assert.commandWorked(nodes[1].adminCommand('replSetGetStatus')).myState, + nodes[1].host + " was not primary after step-up"); + jsTestLog('node 1 is now primary, but cannot accept writes'); + + // Confirm that the most up-to-date node becomes primary + // after the default catchup delay. + replSet.waitForState(0, ReplSetTest.State.PRIMARY, 60 * 1000); + jsTestLog('node 0 performed catchup takeover and is now primary'); + + // Let the nodes catchup. + restartServerReplication(nodes[1]); + restartServerReplication(nodes[2]); + + // Confirm that the highest priority node becomes primary + // after catching up. + replSet.waitForState(2, ReplSetTest.State.PRIMARY, 30 * 1000); + jsTestLog('node 2 performed priority takeover and is now primary'); + +})();
\ No newline at end of file diff --git a/jstests/replsets/catchup_takeover_two_nodes_ahead.js b/jstests/replsets/catchup_takeover_two_nodes_ahead.js new file mode 100644 index 00000000000..4be8fee9007 --- /dev/null +++ b/jstests/replsets/catchup_takeover_two_nodes_ahead.js @@ -0,0 +1,56 @@ +// Test to ensure that a catchup takeover happens when the primary is lagged. +// Make sure that when two nodes are more caught up than the primary, +// the most up-to-date node becomes the primary. + +// 5-node replica set +// Start replica set. Ensure that node 0 becomes primary. +// Stop the replication for some nodes and have the primary write something. +// Stop replication for an up-to-date node and have the primary write something. +// Now the primary is most-up-to-date and another node is more up-to-date than others. +// Make a lagged node the next primary. +// Confirm that the most up-to-date node becomes primary. + +(function() { + 'use strict'; + + load('jstests/replsets/rslib.js'); + + var name = 'catchup_takeover_two_nodes_ahead'; + var replSet = new ReplSetTest({name: name, nodes: 5}); + var nodes = replSet.startSet(); + replSet.initiate(); + + // Wait until all nodes get the "no-op" of "new primary" after initial sync. + waitUntilAllNodesCaughtUp(nodes); + + // Write something so that nodes 0 and 1 are ahead. + stopServerReplication(nodes.slice(2, 5)); + var primary = replSet.getPrimary(); + var writeConcern = {writeConcern: {w: 2, wtimeout: replSet.kDefaultTimeoutMS}}; + assert.writeOK(primary.getDB(name).bar.insert({x: 100}, writeConcern)); + + // Write something so that node 0 is ahead of node 1. + stopServerReplication(nodes[1]); + writeConcern = {writeConcern: {w: 1, wtimeout: replSet.kDefaultTimeoutMS}}; + assert.writeOK(primary.getDB(name).bar.insert({y: 100}, writeConcern)); + + // Step up one of the lagged nodes. + assert.commandWorked(nodes[2].adminCommand({replSetStepUp: 1})); + replSet.awaitNodesAgreeOnPrimary(replSet.kDefaultTimeoutMS, nodes); + assert.eq(ReplSetTest.State.PRIMARY, + assert.commandWorked(nodes[2].adminCommand('replSetGetStatus')).myState, + nodes[2].host + " was not primary after step-up"); + jsTestLog('node 2 is now primary, but cannot accept writes'); + + // Make sure that node 2 cannot write anything. Because it is lagged and replication + // has been stopped, it shouldn't be able to become master. + assert.writeErrorWithCode(nodes[2].getDB(name).bar.insert({z: 100}, writeConcern), + ErrorCodes.NotMaster); + + // Confirm that the most up-to-date node becomes primary + // after the default catchup delay. + replSet.waitForState(0, ReplSetTest.State.PRIMARY, 60 * 1000); + + // Let the nodes catchup + restartServerReplication(nodes.slice(1, 5)); +})();
\ No newline at end of file diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp index e1f995f4d6f..84f5cb58031 100644 --- a/src/mongo/db/repl/replication_coordinator_impl.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl.cpp @@ -3375,7 +3375,7 @@ Status ReplicationCoordinatorImpl::stepUpIfEligible() { "Step-up command is only supported by Protocol Version 1"); } - _startElectSelfIfEligibleV1(StartElectionV1Reason::kStepUpRequest); + _startElectSelfIfEligibleV1(TopologyCoordinator::StartElectionReason::kStepUpRequest); EventHandle finishEvent; { stdx::lock_guard<stdx::mutex> lk(_mutex); diff --git a/src/mongo/db/repl/replication_coordinator_impl.h b/src/mongo/db/repl/replication_coordinator_impl.h index 1fbc2d059fc..27d698feb69 100644 --- a/src/mongo/db/repl/replication_coordinator_impl.h +++ b/src/mongo/db/repl/replication_coordinator_impl.h @@ -1056,12 +1056,8 @@ private: /** * Callback which starts an election if this node is electable and using protocolVersion 1. - * "isPriorityTakeover" is used to determine if the caller was a priority takeover or not and - * log messages accordingly. */ - enum StartElectionV1Reason { kElectionTimeout, kPriorityTakeover, kStepUpRequest }; - - void _startElectSelfIfEligibleV1(StartElectionV1Reason reason); + void _startElectSelfIfEligibleV1(TopologyCoordinator::StartElectionReason reason); /** * Resets the term of last vote to 0 to prevent any node from voting for term 0. diff --git a/src/mongo/db/repl/replication_coordinator_impl_elect.cpp b/src/mongo/db/repl/replication_coordinator_impl_elect.cpp index 91b1c5dfc15..727d652630a 100644 --- a/src/mongo/db/repl/replication_coordinator_impl_elect.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl_elect.cpp @@ -284,7 +284,9 @@ void ReplicationCoordinatorImpl::_recoverFromElectionTie( if (!status.isOK()) { LOG(2) << "ReplicationCoordinatorImpl::_recoverFromElectionTie -- " << status.reason(); } else { - fassertStatusOK(28817, _topCoord->becomeCandidateIfElectable(now, false)); + fassertStatusOK(28817, + _topCoord->becomeCandidateIfElectable( + now, TopologyCoordinator::StartElectionReason::kElectionTimeout)); _startElectSelf_inlock(); } } diff --git a/src/mongo/db/repl/replication_coordinator_impl_elect_v1_test.cpp b/src/mongo/db/repl/replication_coordinator_impl_elect_v1_test.cpp index 1f75f7c9a75..ca7ef2f5d86 100644 --- a/src/mongo/db/repl/replication_coordinator_impl_elect_v1_test.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl_elect_v1_test.cpp @@ -859,9 +859,11 @@ public: return net->now(); } - void performSuccessfulPriorityTakeover(Date_t priorityTakeoverTime) { + void performSuccessfulTakeover(Date_t takeoverTime, + TopologyCoordinator::StartElectionReason reason, + const LastVote& lastVoteExpected) { startCapturingLogMessages(); - simulateSuccessfulV1ElectionAt(priorityTakeoverTime); + simulateSuccessfulV1ElectionAt(takeoverTime); getReplCoord()->waitForElectionFinish_forTest(); stopCapturingLogMessages(); @@ -870,10 +872,13 @@ public: // Check last vote auto lastVote = getExternalState()->loadLocalLastVoteDocument(nullptr); ASSERT(lastVote.isOK()); - ASSERT_EQ(0, lastVote.getValue().getCandidateIndex()); - ASSERT_EQ(1, lastVote.getValue().getTerm()); + ASSERT_EQ(lastVoteExpected.getCandidateIndex(), lastVote.getValue().getCandidateIndex()); + ASSERT_EQ(lastVoteExpected.getTerm(), lastVote.getValue().getTerm()); - ASSERT_EQUALS(1, countLogLinesContaining("Starting an election for a priority takeover")); + if (reason == TopologyCoordinator::StartElectionReason::kPriorityTakeover) { + ASSERT_EQUALS(1, + countLogLinesContaining("Starting an election for a priority takeover")); + } ASSERT_EQUALS(1, countLogLinesContaining("election succeeded")); } @@ -894,11 +899,22 @@ private: // Process all requests queued at the present time. while (net->hasReadyRequests()) { - auto noi = net->getNextReadyRequest(); + + // If we see that the next request isn't for a heartbeat, exit the function. + // This allows us to mock heartbeat responses with whatever info we want + // right up until another event happens (like an election). This is + // particularly important for simulating a catchup takeover because + // we need to know specific info about the primary. + auto noi = net->getFrontOfUnscheduledQueue(); + auto&& nextRequest = noi->getRequest(); + if (nextRequest.cmdObj.firstElement().fieldNameStringData() != "replSetHeartbeat") { + return; + } + + noi = net->getNextReadyRequest(); auto&& request = noi->getRequest(); log() << request.target << " processing " << request.cmdObj; - ASSERT_EQUALS("replSetHeartbeat", request.cmdObj.firstElement().fieldNameStringData()); // Make sure the heartbeat request is valid. ReplSetHeartbeatArgsV1 hbArgs; @@ -1290,7 +1306,7 @@ TEST_F(TakeoverTest, CatchupTakeoverCanceledIfTransitionToRollback) { ASSERT_EQUALS(0, countLogLinesContaining("Starting an election for a catchup takeover")); } -TEST_F(TakeoverTest, CatchupTakeoverElectionIsANoop) { +TEST_F(TakeoverTest, SuccessfulCatchupTakeover) { BSONObj configObj = BSON("_id" << "mySet" << "version" @@ -1318,6 +1334,11 @@ TEST_F(TakeoverTest, CatchupTakeoverElectionIsANoop) { replCoord->setMyLastAppliedOpTime(currentOptime); replCoord->setMyLastDurableOpTime(currentOptime); + // Update the term so that the current term is ahead of the term of + // the last applied op time. This means that the primary is still in + // catchup mode since it hasn't written anything this term. + ASSERT_EQUALS(ErrorCodes::StaleTerm, replCoord->updateTerm(&opCtx, replCoord->getTerm() + 1)); + // Make sure we're secondary and that no takeover has been scheduled. ASSERT_OK(replCoord->setFollowerMode(MemberState::RS_SECONDARY)); ASSERT_FALSE(replCoord->getCatchupTakeover_forTest()); @@ -1333,12 +1354,164 @@ TEST_F(TakeoverTest, CatchupTakeoverElectionIsANoop) { ASSERT_EQUALS(config.getCatchUpTakeoverDelay(), catchupTakeoverDelay); startCapturingLogMessages(); - now = respondToHeartbeatsUntil(config, catchupTakeoverTime, primaryHostAndPort, behindOptime); + + // The catchup takeover will be scheduled at a time later than one election + // timeout after our initial heartbeat responses, so mock a few rounds of + // heartbeat responses to prevent a normal election timeout. + now = respondToHeartbeatsUntil( + config, catchupTakeoverTime, HostAndPort("node2", 12345), behindOptime); + stopCapturingLogMessages(); + + // Since the heartbeats go through the catchupTakeoverTimeout, this log + // message happens already (otherwise it would happen in performSuccessfulTakeover). + ASSERT_EQUALS(1, countLogLinesContaining("Starting an election for a catchup takeover")); + + LastVote lastVoteExpected = LastVote(replCoord->getTerm() + 1, 0); + performSuccessfulTakeover(catchupTakeoverTime, + TopologyCoordinator::StartElectionReason::kCatchupTakeover, + lastVoteExpected); +} + +TEST_F(TakeoverTest, PrimaryCatchesUpBeforeCatchupTakeover) { + BSONObj configObj = BSON("_id" + << "mySet" + << "version" + << 1 + << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "node1:12345") + << BSON("_id" << 2 << "host" + << "node2:12345") + << BSON("_id" << 3 << "host" + << "node3:12345")) + << "protocolVersion" + << 1); + assertStartSuccess(configObj, HostAndPort("node1", 12345)); + ReplSetConfig config = assertMakeRSConfig(configObj); + + auto replCoord = getReplCoord(); + auto now = getNet()->now(); + + OperationContextNoop opCtx; + OpTime currentOptime(Timestamp(200, 1), 0); + replCoord->setMyLastAppliedOpTime(currentOptime); + replCoord->setMyLastDurableOpTime(currentOptime); + OpTime behindOptime(Timestamp(100, 1), 0); + + // Update the term so that the current term is ahead of the term of + // the last applied op time. + ASSERT_EQUALS(ErrorCodes::StaleTerm, replCoord->updateTerm(&opCtx, replCoord->getTerm() + 1)); + + // Make sure we're secondary and that no catchup takeover has been scheduled. + ASSERT_OK(replCoord->setFollowerMode(MemberState::RS_SECONDARY)); + ASSERT_FALSE(replCoord->getCatchupTakeover_forTest()); + + startCapturingLogMessages(); + + // Mock a first round of heartbeat responses, which should give us enough information to know + // that we are fresher than the current primary, prompting the scheduling of a catchup + // takeover. + now = respondToHeartbeatsUntil(config, now, HostAndPort("node2", 12345), behindOptime); + + // Make sure that the catchup takeover has actually been scheduled and at the + // correct time. + ASSERT(replCoord->getCatchupTakeover_forTest()); + auto catchupTakeoverTime = replCoord->getCatchupTakeover_forTest().get(); + Milliseconds catchupTakeoverDelay = catchupTakeoverTime - now; + ASSERT_EQUALS(config.getCatchUpTakeoverDelay(), catchupTakeoverDelay); + + // Mock another heartbeat where the primary is now up to date + // and run time through when catchup takeover was supposed to happen. + now = respondToHeartbeatsUntil( + config, now + catchupTakeoverDelay, HostAndPort("node2", 12345), currentOptime); + + stopCapturingLogMessages(); + + // Make sure we're secondary and that no catchup takeover election happened. + ASSERT(replCoord->getMemberState().secondary()); + ASSERT_FALSE(replCoord->getCatchupTakeover_forTest()); + ASSERT_EQUALS(1, countLogLinesContaining("Not starting an election for a catchup takeover")); +} + +TEST_F(TakeoverTest, PrimaryCatchesUpBeforeHighPriorityNodeCatchupTakeover) { + BSONObj configObj = BSON("_id" + << "mySet" + << "version" + << 1 + << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "node1:12345" + << "priority" + << 2) + << BSON("_id" << 2 << "host" + << "node2:12345") + << BSON("_id" << 3 << "host" + << "node3:12345")) + << "protocolVersion" + << 1); + assertStartSuccess(configObj, HostAndPort("node1", 12345)); + ReplSetConfig config = assertMakeRSConfig(configObj); + + auto replCoord = getReplCoord(); + auto now = getNet()->now(); + + OperationContextNoop opCtx; + OpTime currentOptime(Timestamp(200, 1), 0); + replCoord->setMyLastAppliedOpTime(currentOptime); + replCoord->setMyLastDurableOpTime(currentOptime); + OpTime behindOptime(Timestamp(100, 1), 0); + + // Update the term so that the current term is ahead of the term of + // the last applied op time. + ASSERT_EQUALS(ErrorCodes::StaleTerm, replCoord->updateTerm(&opCtx, replCoord->getTerm() + 1)); + + // Make sure we're secondary and that no catchup takeover has been scheduled. + ASSERT_OK(replCoord->setFollowerMode(MemberState::RS_SECONDARY)); + ASSERT_FALSE(replCoord->getCatchupTakeover_forTest()); + + startCapturingLogMessages(); + + // Mock a first round of heartbeat responses, which should give us enough information to know + // that we are fresher than the current primary, prompting the scheduling of a catchup + // takeover. + now = respondToHeartbeatsUntil(config, now, HostAndPort("node2", 12345), behindOptime); + + // Make sure that the catchup takeover has actually been scheduled and at the + // correct time. + ASSERT(replCoord->getCatchupTakeover_forTest()); + auto catchupTakeoverTime = replCoord->getCatchupTakeover_forTest().get(); + Milliseconds catchupTakeoverDelay = catchupTakeoverTime - now; + ASSERT_EQUALS(config.getCatchUpTakeoverDelay(), catchupTakeoverDelay); + + // Mock another heartbeat where the primary is now up to date + // and run time through when catchup takeover was supposed to happen. + now = respondToHeartbeatsUntil( + config, now + catchupTakeoverDelay, HostAndPort("node2", 12345), currentOptime); + stopCapturingLogMessages(); - // Make sure that the catchup takeover fired as a NOOP. + // Make sure we're secondary and that no catchup takeover election happens. ASSERT(replCoord->getMemberState().secondary()); - ASSERT_EQUALS(1, countLogLinesContaining("Starting an election for a catchup takeover [NOOP]")); + ASSERT_FALSE(replCoord->getCatchupTakeover_forTest()); + ASSERT_EQUALS(1, countLogLinesContaining("Not starting an election for a catchup takeover")); + + // Make sure that the priority takeover has now been scheduled and at the + // correct time. + ASSERT(replCoord->getPriorityTakeover_forTest()); + auto priorityTakeoverTime = replCoord->getPriorityTakeover_forTest().get(); + assertValidPriorityTakeoverDelay(config, now, priorityTakeoverTime, 0); + + // The priority takeover might be scheduled at a time later than one election + // timeout after our initial heartbeat responses, so mock another round of + // heartbeat responses to prevent a normal election timeout. + Milliseconds halfElectionTimeout = config.getElectionTimeoutPeriod() / 2; + now = respondToHeartbeatsUntil( + config, now + halfElectionTimeout, HostAndPort("node2", 12345), currentOptime); + + LastVote lastVoteExpected = LastVote(replCoord->getTerm() + 1, 0); + performSuccessfulTakeover(priorityTakeoverTime, + TopologyCoordinator::StartElectionReason::kPriorityTakeover, + lastVoteExpected); } TEST_F(TakeoverTest, SchedulesPriorityTakeoverIfNodeHasHigherPriorityThanCurrentPrimary) { @@ -1437,7 +1610,10 @@ TEST_F(TakeoverTest, SuccessfulPriorityTakeover) { now = respondToHeartbeatsUntil( config, now + halfElectionTimeout, HostAndPort("node2", 12345), myOptime); - performSuccessfulPriorityTakeover(priorityTakeoverTime); + LastVote lastVoteExpected = LastVote(replCoord->getTerm() + 1, 0); + performSuccessfulTakeover(priorityTakeoverTime, + TopologyCoordinator::StartElectionReason::kPriorityTakeover, + lastVoteExpected); } TEST_F(TakeoverTest, DontCallForPriorityTakeoverWhenLaggedSameSecond) { @@ -1513,7 +1689,10 @@ TEST_F(TakeoverTest, DontCallForPriorityTakeoverWhenLaggedSameSecond) { replCoord->setMyLastAppliedOpTime(closeEnoughOpTime); replCoord->setMyLastDurableOpTime(closeEnoughOpTime); - performSuccessfulPriorityTakeover(priorityTakeoverTime); + LastVote lastVoteExpected = LastVote(replCoord->getTerm() + 1, 0); + performSuccessfulTakeover(priorityTakeoverTime, + TopologyCoordinator::StartElectionReason::kPriorityTakeover, + lastVoteExpected); } TEST_F(TakeoverTest, DontCallForPriorityTakeoverWhenLaggedDifferentSecond) { @@ -1588,7 +1767,10 @@ TEST_F(TakeoverTest, DontCallForPriorityTakeoverWhenLaggedDifferentSecond) { replCoord->setMyLastAppliedOpTime(closeEnoughOpTime); replCoord->setMyLastDurableOpTime(closeEnoughOpTime); - performSuccessfulPriorityTakeover(priorityTakeoverTime); + LastVote lastVoteExpected = LastVote(replCoord->getTerm() + 1, 0); + performSuccessfulTakeover(priorityTakeoverTime, + TopologyCoordinator::StartElectionReason::kPriorityTakeover, + lastVoteExpected); } TEST_F(ReplCoordTest, NodeCancelsElectionUponReceivingANewConfigDuringDryRun) { diff --git a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp index fbaeb9dca44..35199798605 100644 --- a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp @@ -286,7 +286,7 @@ stdx::unique_lock<stdx::mutex> ReplicationCoordinatorImpl::_handleHeartbeatRespo _priorityTakeoverWhen, stdx::bind(&ReplicationCoordinatorImpl::_startElectSelfIfEligibleV1, this, - StartElectionV1Reason::kPriorityTakeover)); + TopologyCoordinator::StartElectionReason::kPriorityTakeover)); } break; } @@ -297,11 +297,10 @@ stdx::unique_lock<stdx::mutex> ReplicationCoordinatorImpl::_handleHeartbeatRespo _catchupTakeoverWhen = _replExecutor->now() + catchupTakeoverDelay; log() << "Scheduling catchup takeover at " << _catchupTakeoverWhen; _catchupTakeoverCbh = _scheduleWorkAt( - _catchupTakeoverWhen, [this](const executor::TaskExecutor::CallbackArgs& args) { - stdx::lock_guard<stdx::mutex> lock(_mutex); - _cancelCatchupTakeover_inlock(); - log() << "Starting an election for a catchup takeover [NOOP]"; - }); + _catchupTakeoverWhen, + stdx::bind(&ReplicationCoordinatorImpl::_startElectSelfIfEligibleV1, + this, + TopologyCoordinator::StartElectionReason::kCatchupTakeover)); } break; } @@ -767,10 +766,11 @@ void ReplicationCoordinatorImpl::_cancelAndRescheduleElectionTimeout_inlock() { _scheduleWorkAt(when, stdx::bind(&ReplicationCoordinatorImpl::_startElectSelfIfEligibleV1, this, - StartElectionV1Reason::kElectionTimeout)); + TopologyCoordinator::StartElectionReason::kElectionTimeout)); } -void ReplicationCoordinatorImpl::_startElectSelfIfEligibleV1(StartElectionV1Reason reason) { +void ReplicationCoordinatorImpl::_startElectSelfIfEligibleV1( + TopologyCoordinator::StartElectionReason reason) { if (!isV1ElectionProtocol()) { return; } @@ -788,37 +788,43 @@ void ReplicationCoordinatorImpl::_startElectSelfIfEligibleV1(StartElectionV1Reas } } - const auto status = _topCoord->becomeCandidateIfElectable( - _replExecutor->now(), reason == StartElectionV1Reason::kPriorityTakeover); + const auto status = _topCoord->becomeCandidateIfElectable(_replExecutor->now(), reason); if (!status.isOK()) { switch (reason) { - case StartElectionV1Reason::kElectionTimeout: + case TopologyCoordinator::TopologyCoordinator::StartElectionReason::kElectionTimeout: log() << "Not starting an election, since we are not electable due to: " << status.reason(); break; - case StartElectionV1Reason::kPriorityTakeover: + case TopologyCoordinator::StartElectionReason::kPriorityTakeover: log() << "Not starting an election for a priority takeover, " << "since we are not electable due to: " << status.reason(); break; - case StartElectionV1Reason::kStepUpRequest: + case TopologyCoordinator::StartElectionReason::kStepUpRequest: log() << "Not starting an election for a replSetStepUp request, " << "since we are not electable due to: " << status.reason(); break; + case TopologyCoordinator::StartElectionReason::kCatchupTakeover: + log() << "Not starting an election for a catchup takeover, " + << "since we are not electable due to: " << status.reason(); + break; } return; } switch (reason) { - case StartElectionV1Reason::kElectionTimeout: + case TopologyCoordinator::StartElectionReason::kElectionTimeout: log() << "Starting an election, since we've seen no PRIMARY in the past " << _rsConfig.getElectionTimeoutPeriod(); break; - case StartElectionV1Reason::kPriorityTakeover: + case TopologyCoordinator::StartElectionReason::kPriorityTakeover: log() << "Starting an election for a priority takeover"; break; - case StartElectionV1Reason::kStepUpRequest: + case TopologyCoordinator::StartElectionReason::kStepUpRequest: log() << "Starting an election due to step up request"; break; + case TopologyCoordinator::StartElectionReason::kCatchupTakeover: + log() << "Starting an election for a catchup takeover"; + break; } _startElectSelfV1_inlock(); diff --git a/src/mongo/db/repl/topology_coordinator.h b/src/mongo/db/repl/topology_coordinator.h index 377959fc47e..3c2df705d93 100644 --- a/src/mongo/db/repl/topology_coordinator.h +++ b/src/mongo/db/repl/topology_coordinator.h @@ -587,10 +587,17 @@ public: */ virtual void setPrimaryIndex(long long primaryIndex) = 0; + enum StartElectionReason { + kElectionTimeout, + kPriorityTakeover, + kStepUpRequest, + kCatchupTakeover + }; + /** * Transitions to the candidate role if the node is electable. */ - virtual Status becomeCandidateIfElectable(const Date_t now, bool isPriorityTakeover) = 0; + virtual Status becomeCandidateIfElectable(const Date_t now, StartElectionReason reason) = 0; /** * Updates the storage engine read committed support in the TopologyCoordinator options after diff --git a/src/mongo/db/repl/topology_coordinator_impl.cpp b/src/mongo/db/repl/topology_coordinator_impl.cpp index 8d707362fda..2d17af00d7a 100644 --- a/src/mongo/db/repl/topology_coordinator_impl.cpp +++ b/src/mongo/db/repl/topology_coordinator_impl.cpp @@ -740,7 +740,7 @@ Status TopologyCoordinatorImpl::prepareHeartbeatResponse(Date_t now, const OpTime lastOpDurable = getMyLastDurableOpTime(); // Are we electable - response->setElectable(!_getMyUnelectableReason(now, false)); + response->setElectable(!_getMyUnelectableReason(now, StartElectionReason::kElectionTimeout)); // Heartbeat status message response->setHbMsg(_getHbmsg(now)); @@ -1182,7 +1182,7 @@ HeartbeatResponseAction TopologyCoordinatorImpl::setMemberAsDown(Date_t now, MemberData& hbData = _memberData.at(memberIndex); hbData.setDownValues(now, "no response within election timeout period"); - if (CannotSeeMajority & _getMyUnelectableReason(now, false)) { + if (CannotSeeMajority & _getMyUnelectableReason(now, StartElectionReason::kElectionTimeout)) { if (_stepDownPending) { return HeartbeatResponseAction::makeNoAction(); } @@ -1487,7 +1487,8 @@ HeartbeatResponseAction TopologyCoordinatorImpl::_updatePrimaryFromHBData( // If we are primary, check if we can still see majority of the set; // stepdown if we can't. if (_iAmPrimary()) { - if (CannotSeeMajority & _getMyUnelectableReason(now, false)) { + if (CannotSeeMajority & + _getMyUnelectableReason(now, StartElectionReason::kElectionTimeout)) { if (_stepDownPending) { return HeartbeatResponseAction::makeNoAction(); } @@ -1522,7 +1523,7 @@ HeartbeatResponseAction TopologyCoordinatorImpl::_updatePrimaryFromHBData( LOG(2) << "TopologyCoordinatorImpl::_updatePrimaryFromHBData - " << status.reason(); return HeartbeatResponseAction::makeNoAction(); } - fassertStatusOK(28816, becomeCandidateIfElectable(now, false)); + fassertStatusOK(28816, becomeCandidateIfElectable(now, StartElectionReason::kElectionTimeout)); return HeartbeatResponseAction::makeElectAction(); } @@ -1536,7 +1537,8 @@ Status TopologyCoordinatorImpl::checkShouldStandForElection(Date_t now) const { return {ErrorCodes::NodeNotElectable, "Not standing for election again; already candidate"}; } - const UnelectableReasonMask unelectableReason = _getMyUnelectableReason(now, false); + const UnelectableReasonMask unelectableReason = + _getMyUnelectableReason(now, StartElectionReason::kElectionTimeout); if (NotCloseEnoughToLatestOptime & unelectableReason) { return {ErrorCodes::NodeNotElectable, str::stream() << "Not standing for election because " @@ -1633,6 +1635,40 @@ bool TopologyCoordinatorImpl::_amIFreshEnoughForPriorityTakeover() const { } } +bool TopologyCoordinatorImpl::_amIFreshEnoughForCatchupTakeover() const { + + const OpTime latestKnownOpTime = _latestKnownOpTime(); + + // Rules are: + // - We must have the freshest optime of all the up nodes. + // - We must specifically have a fresher optime than the primary (can't be equal). + // - The term of our last applied op must be less than the current term. This ensures that no + // writes have happened since the most recent election and that the primary is still in + // catchup mode. + + // There is no point to a catchup takeover if we aren't the freshest node because + // another node would immediately perform another catchup takeover when we become primary. + const OpTime ourLastOpApplied = getMyLastAppliedOpTime(); + if (ourLastOpApplied < latestKnownOpTime) { + return false; + } + + if (_currentPrimaryIndex == -1) { + return false; + } + + // If we aren't ahead of the primary, there is no point to having a catchup takeover. + const OpTime primaryLastOpApplied = _memberData[_currentPrimaryIndex].getLastAppliedOpTime(); + + if (ourLastOpApplied <= primaryLastOpApplied) { + return false; + } + + // If the term of our last applied op is less than the current term, the primary didn't write + // anything and it is still in catchup mode. + return ourLastOpApplied.getTerm() < _term; +} + bool TopologyCoordinatorImpl::_iAmPrimary() const { if (_role == Role::leader) { invariant(_currentPrimaryIndex == _selfIndex); @@ -1685,7 +1721,7 @@ int TopologyCoordinatorImpl::_getHighestPriorityElectableIndex(Date_t now) const int maxIndex = -1; for (int currentIndex = 0; currentIndex < _rsConfig.getNumMembers(); currentIndex++) { UnelectableReasonMask reason = currentIndex == _selfIndex - ? _getMyUnelectableReason(now, false) + ? _getMyUnelectableReason(now, StartElectionReason::kElectionTimeout) : _getUnelectableReason(currentIndex); if (None == reason && _isMemberHigherPriority(currentIndex, maxIndex)) { maxIndex = currentIndex; @@ -2310,7 +2346,7 @@ TopologyCoordinatorImpl::UnelectableReasonMask TopologyCoordinatorImpl::_getUnel } TopologyCoordinatorImpl::UnelectableReasonMask TopologyCoordinatorImpl::_getMyUnelectableReason( - const Date_t now, bool isPriorityTakeover) const { + const Date_t now, StartElectionReason reason) const { UnelectableReasonMask result = None; const OpTime lastApplied = getMyLastAppliedOpTime(); if (lastApplied.isNull()) { @@ -2351,9 +2387,15 @@ TopologyCoordinatorImpl::UnelectableReasonMask TopologyCoordinatorImpl::_getMyUn } else { // Election rules only for protocol version 1. invariant(_rsConfig.getProtocolVersion() == 1); - if (isPriorityTakeover && !_amIFreshEnoughForPriorityTakeover()) { + if (reason == StartElectionReason::kPriorityTakeover && + !_amIFreshEnoughForPriorityTakeover()) { result |= NotCloseEnoughToLatestForPriorityTakeover; } + + if (reason == StartElectionReason::kCatchupTakeover && + !_amIFreshEnoughForCatchupTakeover()) { + result |= NotFreshEnoughForCatchupTakeover; + } } return result; } @@ -2426,6 +2468,14 @@ std::string TopologyCoordinatorImpl::_getUnelectableReasonString( "takeover - must be within " << priorityTakeoverFreshnessWindowSeconds << " seconds"; } + if (ur & NotFreshEnoughForCatchupTakeover) { + if (hasWrittenToStream) { + ss << "; "; + } + hasWrittenToStream = true; + ss << "member is either not the most up-to-date member or not ahead of the primary, and " + "therefore cannot call for catchup takeover"; + } if (ur & NotInitialized) { if (hasWrittenToStream) { ss << "; "; @@ -2986,7 +3036,7 @@ void TopologyCoordinatorImpl::setPrimaryIndex(long long primaryIndex) { } Status TopologyCoordinatorImpl::becomeCandidateIfElectable(const Date_t now, - bool isPriorityTakeover) { + StartElectionReason reason) { if (_role == Role::leader) { return {ErrorCodes::NodeNotElectable, "Not standing for election again; already primary"}; } @@ -2995,8 +3045,7 @@ Status TopologyCoordinatorImpl::becomeCandidateIfElectable(const Date_t now, return {ErrorCodes::NodeNotElectable, "Not standing for election again; already candidate"}; } - const UnelectableReasonMask unelectableReason = - _getMyUnelectableReason(now, isPriorityTakeover); + const UnelectableReasonMask unelectableReason = _getMyUnelectableReason(now, reason); if (unelectableReason) { return {ErrorCodes::NodeNotElectable, str::stream() << "Not standing for election because " diff --git a/src/mongo/db/repl/topology_coordinator_impl.h b/src/mongo/db/repl/topology_coordinator_impl.h index f9f50f62871..5114f012be2 100644 --- a/src/mongo/db/repl/topology_coordinator_impl.h +++ b/src/mongo/db/repl/topology_coordinator_impl.h @@ -247,7 +247,7 @@ public: virtual MemberData* findMemberDataByMemberId(const int memberId); virtual MemberData* findMemberDataByRid(const OID rid); virtual MemberData* addSlaveMemberData(const OID rid); - virtual Status becomeCandidateIfElectable(const Date_t now, bool isPriorityTakeover); + virtual Status becomeCandidateIfElectable(const Date_t now, StartElectionReason reason); virtual void setStorageEngineSupportsReadCommitted(bool supported); virtual void restartHeartbeats(); @@ -295,6 +295,7 @@ private: VotedTooRecently = 1 << 8, RefusesToStand = 1 << 9, NotCloseEnoughToLatestForPriorityTakeover = 1 << 10, + NotFreshEnoughForCatchupTakeover = 1 << 11, }; typedef int UnelectableReasonMask; @@ -327,8 +328,13 @@ private: // Is our optime close enough to the latest known optime to call for a priority takeover. bool _amIFreshEnoughForPriorityTakeover() const; + // Is the primary node still in catchup mode and is our optime the latest + // known optime of all the up nodes. + bool _amIFreshEnoughForCatchupTakeover() const; + // Returns reason why "self" member is unelectable - UnelectableReasonMask _getMyUnelectableReason(const Date_t now, bool isPriorityTakeover) const; + UnelectableReasonMask _getMyUnelectableReason(const Date_t now, + StartElectionReason reason) const; // Returns reason why memberIndex is unelectable UnelectableReasonMask _getUnelectableReason(int memberIndex) const; diff --git a/src/mongo/db/repl/topology_coordinator_impl_v1_test.cpp b/src/mongo/db/repl/topology_coordinator_impl_v1_test.cpp index 72dbefe0def..8370f3595de 100644 --- a/src/mongo/db/repl/topology_coordinator_impl_v1_test.cpp +++ b/src/mongo/db/repl/topology_coordinator_impl_v1_test.cpp @@ -3829,6 +3829,232 @@ TEST_F(HeartbeatResponseTestV1, UpdateHeartbeatDataTermPreventsPriorityTakeover) ASSERT_EQUALS(2, getCurrentPrimaryIndex()); } +TEST_F(TopoCoordTest, FreshestNodeDoesCatchupTakeover) { + updateConfig(BSON("_id" + << "rs0" + << "version" + << 5 + << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "host1:27017") + << BSON("_id" << 2 << "host" + << "host2:27017") + << BSON("_id" << 3 << "host" + << "host3:27017")) + << "protocolVersion" + << 1 + << "settings" + + << BSON("heartbeatTimeoutSecs" << 5)), + 0); + + setSelfMemberState(MemberState::RS_SECONDARY); + + OpTime currentOptime(Timestamp(200, 1), 0); + OpTime behindOptime(Timestamp(100, 1), 0); + + // Create a mock heartbeat response to be able to compare who is the freshest node. + // The latest heartbeat responses are looked at for determining the latest optime + // and therefore freshness for catchup takeover. + ReplSetHeartbeatResponse hbResp = ReplSetHeartbeatResponse(); + hbResp.setState(MemberState::RS_SECONDARY); + hbResp.setAppliedOpTime(currentOptime); + hbResp.setTerm(1); + + Date_t firstRequestDate = unittest::assertGet(dateFromISOString("2014-08-29T13:00Z")); + + getTopoCoord().prepareHeartbeatRequestV1(firstRequestDate, "rs0", HostAndPort("host2:27017")); + getTopoCoord().prepareHeartbeatRequestV1(firstRequestDate, "rs0", HostAndPort("host3:27017")); + + // Set optimes so that I am the freshest node and strictly ahead of the primary. + getTopoCoord().getMyMemberData()->setLastAppliedOpTime(currentOptime, Date_t()); + getTopoCoord().processHeartbeatResponse(firstRequestDate + Milliseconds(1000), + Milliseconds(999), + HostAndPort("host3:27017"), + StatusWith<ReplSetHeartbeatResponse>(hbResp)); + hbResp.setAppliedOpTime(behindOptime); + hbResp.setState(MemberState::RS_PRIMARY); + getTopoCoord().processHeartbeatResponse(firstRequestDate + Milliseconds(1000), + Milliseconds(999), + HostAndPort("host2:27017"), + StatusWith<ReplSetHeartbeatResponse>(hbResp)); + getTopoCoord().updateTerm(1, Date_t()); + + ASSERT_OK(getTopoCoord().becomeCandidateIfElectable( + Date_t(), TopologyCoordinator::StartElectionReason::kCatchupTakeover)); +} + +TEST_F(TopoCoordTest, StaleNodeDoesntDoCatchupTakeover) { + updateConfig(BSON("_id" + << "rs0" + << "version" + << 5 + << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "host1:27017") + << BSON("_id" << 2 << "host" + << "host2:27017") + << BSON("_id" << 3 << "host" + << "host3:27017")) + << "protocolVersion" + << 1 + << "settings" + + << BSON("heartbeatTimeoutSecs" << 5)), + 0); + + setSelfMemberState(MemberState::RS_SECONDARY); + + OpTime currentOptime(Timestamp(200, 1), 0); + OpTime behindOptime(Timestamp(100, 1), 0); + + // Create a mock heartbeat response to be able to compare who is the freshest node. + ReplSetHeartbeatResponse hbResp = ReplSetHeartbeatResponse(); + hbResp.setState(MemberState::RS_SECONDARY); + hbResp.setAppliedOpTime(currentOptime); + hbResp.setTerm(1); + + Date_t firstRequestDate = unittest::assertGet(dateFromISOString("2014-08-29T13:00Z")); + + getTopoCoord().prepareHeartbeatRequestV1(firstRequestDate, "rs0", HostAndPort("host2:27017")); + getTopoCoord().prepareHeartbeatRequestV1(firstRequestDate, "rs0", HostAndPort("host3:27017")); + + // Set optimes so that the other (non-primary) node is ahead of me. + getTopoCoord().getMyMemberData()->setLastAppliedOpTime(behindOptime, Date_t()); + getTopoCoord().processHeartbeatResponse(firstRequestDate + Milliseconds(1000), + Milliseconds(999), + HostAndPort("host3:27017"), + StatusWith<ReplSetHeartbeatResponse>(hbResp)); + hbResp.setAppliedOpTime(behindOptime); + hbResp.setState(MemberState::RS_PRIMARY); + getTopoCoord().processHeartbeatResponse(firstRequestDate + Milliseconds(1000), + Milliseconds(999), + HostAndPort("host2:27017"), + StatusWith<ReplSetHeartbeatResponse>(hbResp)); + getTopoCoord().updateTerm(1, Date_t()); + + Status result = getTopoCoord().becomeCandidateIfElectable( + Date_t(), TopologyCoordinator::StartElectionReason::kCatchupTakeover); + ASSERT_NOT_OK(result); + ASSERT_STRING_CONTAINS(result.reason(), + "member is either not the most up-to-date member or not ahead of the " + "primary, and therefore cannot call for catchup takeover"); +} + +TEST_F(TopoCoordTest, NodeDoesntDoCatchupTakeoverHeartbeatSaysPrimaryCaughtUp) { + updateConfig(BSON("_id" + << "rs0" + << "version" + << 5 + << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "host1:27017") + << BSON("_id" << 2 << "host" + << "host2:27017") + << BSON("_id" << 3 << "host" + << "host3:27017")) + << "protocolVersion" + << 1 + << "settings" + + << BSON("heartbeatTimeoutSecs" << 5)), + 0); + + setSelfMemberState(MemberState::RS_SECONDARY); + + OpTime currentOptime(Timestamp(200, 1), 0); + + // Create a mock heartbeat response to be able to compare who is the freshest node. + ReplSetHeartbeatResponse hbResp = ReplSetHeartbeatResponse(); + hbResp.setState(MemberState::RS_SECONDARY); + hbResp.setAppliedOpTime(currentOptime); + hbResp.setTerm(1); + + Date_t firstRequestDate = unittest::assertGet(dateFromISOString("2014-08-29T13:00Z")); + + getTopoCoord().prepareHeartbeatRequestV1(firstRequestDate, "rs0", HostAndPort("host2:27017")); + getTopoCoord().prepareHeartbeatRequestV1(firstRequestDate, "rs0", HostAndPort("host3:27017")); + + // Set optimes so that the primary node is caught up with me. + getTopoCoord().getMyMemberData()->setLastAppliedOpTime(currentOptime, Date_t()); + getTopoCoord().processHeartbeatResponse(firstRequestDate + Milliseconds(1000), + Milliseconds(999), + HostAndPort("host3:27017"), + StatusWith<ReplSetHeartbeatResponse>(hbResp)); + hbResp.setState(MemberState::RS_PRIMARY); + getTopoCoord().processHeartbeatResponse(firstRequestDate + Milliseconds(1000), + Milliseconds(999), + HostAndPort("host2:27017"), + StatusWith<ReplSetHeartbeatResponse>(hbResp)); + getTopoCoord().updateTerm(1, Date_t()); + + Status result = getTopoCoord().becomeCandidateIfElectable( + Date_t(), TopologyCoordinator::StartElectionReason::kCatchupTakeover); + ASSERT_NOT_OK(result); + ASSERT_STRING_CONTAINS(result.reason(), + "member is either not the most up-to-date member or not ahead of the " + "primary, and therefore cannot call for catchup takeover"); +} + +TEST_F(TopoCoordTest, NodeDoesntDoCatchupTakeoverIfTermNumbersSayPrimaryCaughtUp) { + updateConfig(BSON("_id" + << "rs0" + << "version" + << 5 + << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "host1:27017") + << BSON("_id" << 2 << "host" + << "host2:27017") + << BSON("_id" << 3 << "host" + << "host3:27017")) + << "protocolVersion" + << 1 + << "settings" + + << BSON("heartbeatTimeoutSecs" << 5)), + 0); + + setSelfMemberState(MemberState::RS_SECONDARY); + + OpTime currentOptime(Timestamp(200, 1), 1); + OpTime behindOptime(Timestamp(100, 1), 0); + + // Create a mock heartbeat response to be able to compare who is the freshest node. + ReplSetHeartbeatResponse hbResp = ReplSetHeartbeatResponse(); + hbResp.setState(MemberState::RS_SECONDARY); + hbResp.setAppliedOpTime(currentOptime); + hbResp.setTerm(1); + + Date_t firstRequestDate = unittest::assertGet(dateFromISOString("2014-08-29T13:00Z")); + + getTopoCoord().prepareHeartbeatRequestV1(firstRequestDate, "rs0", HostAndPort("host2:27017")); + getTopoCoord().prepareHeartbeatRequestV1(firstRequestDate, "rs0", HostAndPort("host3:27017")); + + // Simulates a scenario where the node hasn't received a heartbeat from the primary in a while + // but the primary is caught up and has written something. The node is aware of this change + // and as a result realizes the primary is caught up. + getTopoCoord().getMyMemberData()->setLastAppliedOpTime(currentOptime, Date_t()); + getTopoCoord().processHeartbeatResponse(firstRequestDate + Milliseconds(1000), + Milliseconds(999), + HostAndPort("host3:27017"), + StatusWith<ReplSetHeartbeatResponse>(hbResp)); + hbResp.setAppliedOpTime(behindOptime); + hbResp.setState(MemberState::RS_PRIMARY); + getTopoCoord().processHeartbeatResponse(firstRequestDate + Milliseconds(1000), + Milliseconds(999), + HostAndPort("host2:27017"), + StatusWith<ReplSetHeartbeatResponse>(hbResp)); + getTopoCoord().updateTerm(1, Date_t()); + + Status result = getTopoCoord().becomeCandidateIfElectable( + Date_t(), TopologyCoordinator::StartElectionReason::kCatchupTakeover); + ASSERT_NOT_OK(result); + ASSERT_STRING_CONTAINS(result.reason(), + "member is either not the most up-to-date member or not ahead of the " + "primary, and therefore cannot call for catchup takeover"); +} + TEST_F(HeartbeatResponseTestV1, ScheduleACatchupTakeoverWhenElectableAndReceiveHeartbeatFromPrimaryInCatchup) { updateConfig(BSON("_id" @@ -3973,7 +4199,8 @@ TEST_F(HeartbeatResponseTestV1, ASSERT_NO_ACTION(nextAction.getAction()); ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); // We are electable now. - ASSERT_OK(getTopoCoord().becomeCandidateIfElectable(now(), false)); + ASSERT_OK(getTopoCoord().becomeCandidateIfElectable( + now(), TopologyCoordinator::StartElectionReason::kElectionTimeout)); ASSERT_TRUE(TopologyCoordinator::Role::candidate == getTopoCoord().getRole()); } @@ -3998,7 +4225,8 @@ TEST_F(HeartbeatResponseTestV1, ScheduleElectionWhenPrimaryIsMarkedDownAndWeAreE ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); // We are electable now. - ASSERT_OK(getTopoCoord().becomeCandidateIfElectable(now(), false)); + ASSERT_OK(getTopoCoord().becomeCandidateIfElectable( + now(), TopologyCoordinator::StartElectionReason::kElectionTimeout)); ASSERT_TRUE(TopologyCoordinator::Role::candidate == getTopoCoord().getRole()); } diff --git a/src/mongo/shell/replsettest.js b/src/mongo/shell/replsettest.js index d0ceb2520d1..3666f6e1d21 100644 --- a/src/mongo/shell/replsettest.js +++ b/src/mongo/shell/replsettest.js @@ -602,8 +602,9 @@ var ReplSetTest = function(opts) { }; /** - * Blocking call, which will wait for a primary to be elected for some pre-defined timeout and - * if primary is available will return a connection to it. Otherwise throws an exception. + * Blocking call, which will wait for a primary to be elected and become master for some + * pre-defined timeout. If a primary is available it will return a connection to it. + * Otherwise throws an exception. */ this.getPrimary = function(timeout) { timeout = timeout || self.kDefaultTimeoutMS; |