summary | refs | log | tree | commit | diff
path: root/src/mongo
diff options
context:
space:
mode:
author Benety Goh <benety@mongodb.com> 2015-09-26 00:04:55 -0400
committer Benety Goh <benety@mongodb.com> 2015-09-30 12:24:37 -0400
commit c4e2be33524776da70d77ada71eaf03ecb8e7d44 (patch)
tree 068f0bde9438b22590ce18b55cafa162bea98e1a /src/mongo
parent e5fbc5fda5a0b65e994b17feed12cb6c00717acf (diff)
download mongo-c4e2be33524776da70d77ada71eaf03ecb8e7d44.tar.gz
SERVER-20671 step down restarts heartbeats before waiting for secondaries to catch up
This re-applies commit 3331d34e110f47b5ef27eff74c7c302483fcc8f9 and also fixes a race condition in the StepDownCatchUp test case by using the non-blocking version of stepDown.
Diffstat (limited to 'src/mongo')
-rw-r--r-- src/mongo/db/repl/replication_coordinator_impl.cpp 6
-rw-r--r-- src/mongo/db/repl/replication_coordinator_impl_test.cpp 23
2 files changed, 20 insertions, 9 deletions
diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp
index b12beab4de7..a7e69a2e8c8 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl.cpp
@@ -1377,6 +1377,12 @@ void ReplicationCoordinatorImpl::_stepDownContinue(
return;
}
allFinishedGuard.Dismiss();
+
+ // We send out a fresh round of heartbeats because stepping down successfully without
+ // {force: true} is dependent on timely heartbeat data.
+ stdx::lock_guard<stdx::mutex> lk(_mutex);
+ _cancelHeartbeats();
+ _startHeartbeats_inlock(cbData);
}
void ReplicationCoordinatorImpl::_handleTimePassing(
diff --git a/src/mongo/db/repl/replication_coordinator_impl_test.cpp b/src/mongo/db/repl/replication_coordinator_impl_test.cpp
index 75d66b2039b..8dd77720a2d 100644
--- a/src/mongo/db/repl/replication_coordinator_impl_test.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl_test.cpp
@@ -1492,19 +1492,23 @@ TEST_F(StepDownTest, StepDownCatchUp) {
ASSERT_OK(getReplCoord()->setLastOptime_forTest(1, 1, optime1));
ASSERT_OK(getReplCoord()->setLastOptime_forTest(1, 2, optime1));
- // stepDown where the secondary actually has to catch up before the stepDown can succeed
- StepDownRunner runner(getReplCoord());
- runner.setForce(false);
- runner.setWaitTime(Milliseconds(10000));
- runner.setStepDownTime(Milliseconds(60000));
-
simulateSuccessfulElection();
- runner.start(&txn);
+ // Step down where the secondary actually has to catch up before the stepDown can succeed.
+ // On entering the network, _stepDownContinue should cancel the heartbeats scheduled for
+ // T + 2 seconds and send out a new round of heartbeats immediately.
+ // This makes it unnecessary to advance the clock after entering the network to process
+ // the heartbeat requests.
+ auto repl = getReplCoord();
+ Status result(ErrorCodes::InternalError, "not mutated");
+ auto globalReadLockAndEventHandle =
+ repl->stepDown_nonBlocking(&txn, false, Milliseconds(10000), Milliseconds(60000), &result);
+ const auto& eventHandle = globalReadLockAndEventHandle.second;
+ ASSERT_TRUE(eventHandle);
+ ASSERT_TRUE(txn.lockState()->isReadLocked());
// Make a secondary actually catch up
enterNetwork();
- getNet()->runUntil(getNet()->now() + Milliseconds(2000));
ASSERT(getNet()->hasReadyRequests());
NetworkInterfaceMock::NetworkOperationIterator noi = getNet()->getNextReadyRequest();
RemoteCommandRequest request = noi->getRequest();
@@ -1527,7 +1531,8 @@ TEST_F(StepDownTest, StepDownCatchUp) {
getNet()->runReadyNetworkOperations();
exitNetwork();
- ASSERT_OK(runner.getResult());
+ getReplExec()->waitForEvent(eventHandle);
+ ASSERT_OK(result);
ASSERT_TRUE(getReplCoord()->getMemberState().secondary());
}