diff options
author | Judah Schvimer <judah@mongodb.com> | 2017-09-15 12:24:42 -0400 |
---|---|---|
committer | Judah Schvimer <judah@mongodb.com> | 2017-11-15 10:49:54 -0500 |
commit | e267cc9db06685424a3b8e074b5aeedc95746e87 (patch) | |
tree | 2521430f76f1226a5aacc9b1f069be690c574932 | |
parent | 2c285f7c60fb0fd78f7dcef28aad1641e58cc157 (diff) | |
download | mongo-e267cc9db06685424a3b8e074b5aeedc95746e87.tar.gz |
SERVER-29937 Make sure liveness timeouts cannot be missed
(cherry picked from commit f1bf0b33b4f1ce7bb50f208ef5e2d736ef5eba68)
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp | 28 | ||||
-rw-r--r-- | src/mongo/db/repl/replication_executor_test.cpp | 15 | ||||
-rw-r--r-- | src/mongo/executor/task_executor.h | 2 | ||||
-rw-r--r-- | src/mongo/executor/task_executor_test_common.cpp | 8 |
4 files changed, 42 insertions, 11 deletions
diff --git a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp index b72d7e7dbcc..66c9799029d 100644 --- a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp @@ -778,18 +778,24 @@ void ReplicationCoordinatorImpl::_scheduleNextLivenessUpdate_inlock() { } auto nextTimeout = earliestDate + _rsConfig.getElectionTimeoutPeriod(); - if (nextTimeout > _replExecutor.now()) { - LOG(3) << "scheduling next check at " << nextTimeout; - auto cbh = _scheduleWorkAt(nextTimeout, - stdx::bind(&ReplicationCoordinatorImpl::_handleLivenessTimeout, - this, - stdx::placeholders::_1)); - if (!cbh) { - return; - } - _handleLivenessTimeoutCbh = cbh; - _earliestMemberId = earliestMemberId; + LOG(3) << "scheduling next check at " << nextTimeout; + + // It is possible we will schedule the next timeout in the past. + // ReplicationExecutor::_scheduleWorkAt() schedules its work immediately if it's given a + // time <= now(). + // If we missed the timeout, it means that on our last check the earliest live member was + // just barely fresh and it has become stale since then. We must schedule another liveness + // check to continue conducting liveness checks and be able to step down from primary if we + // lose contact with a majority of nodes. + auto cbh = _scheduleWorkAt(nextTimeout, + stdx::bind(&ReplicationCoordinatorImpl::_handleLivenessTimeout, + this, + stdx::placeholders::_1)); + if (!cbh) { + return; } + _handleLivenessTimeoutCbh = cbh; + _earliestMemberId = earliestMemberId; } void ReplicationCoordinatorImpl::_cancelAndRescheduleLivenessUpdate_inlock(int updatedMemberId) { diff --git a/src/mongo/db/repl/replication_executor_test.cpp b/src/mongo/db/repl/replication_executor_test.cpp index ba41df46769..2e8a66465de 100644 --- a/src/mongo/db/repl/replication_executor_test.cpp +++ b/src/mongo/db/repl/replication_executor_test.cpp @@ -249,6 +249,21 @@ TEST_F(ReplicationExecutorTest, ScheduleCallbackAtNow) { executor.waitForEvent(finishEvent); } +TEST_F(ReplicationExecutorTest, ScheduleCallbackInPast) { + launchExecutorThread(); + getNet()->exitNetwork(); + + ReplicationExecutor& executor = getReplExecutor(); + auto finishEvent = assertGet(executor.makeEvent()); + auto fn = [&executor, finishEvent](const ReplicationExecutor::CallbackArgs& cbData) { + ASSERT_OK(cbData.status); + executor.signalEvent(finishEvent); + }; + + auto cb = executor.scheduleWorkAt(getNet()->now() - Milliseconds(1000), fn); + executor.waitForEvent(finishEvent); +} + TEST_F(ReplicationExecutorTest, ScheduleCallbackAtAFutureTime) { launchExecutorThread(); getNet()->exitNetwork(); diff --git a/src/mongo/executor/task_executor.h b/src/mongo/executor/task_executor.h index 2d558512f91..ebf7c447ef0 100644 --- a/src/mongo/executor/task_executor.h +++ b/src/mongo/executor/task_executor.h @@ -196,6 +196,8 @@ public: /** * Schedules "work" to be run by the executor no sooner than "when". * + * If "when" is <= now(), then it schedules the "work" to be run ASAP. + * * Returns a handle for waiting on or canceling the callback, or * ErrorCodes::ShutdownInProgress. * diff --git a/src/mongo/executor/task_executor_test_common.cpp b/src/mongo/executor/task_executor_test_common.cpp index 57c12813250..d7a384231b8 100644 --- a/src/mongo/executor/task_executor_test_common.cpp +++ b/src/mongo/executor/task_executor_test_common.cpp @@ -328,14 +328,22 @@ COMMON_EXECUTOR_TEST(ScheduleWorkAt) { Status status1 = getDetectableErrorStatus(); Status status2 = getDetectableErrorStatus(); Status status3 = getDetectableErrorStatus(); + Status status4 = getDetectableErrorStatus(); + const Date_t now = net->now(); const TaskExecutor::CallbackHandle cb1 = unittest::assertGet(executor.scheduleWorkAt( now + Milliseconds(100), stdx::bind(setStatus, stdx::placeholders::_1, &status1))); + const TaskExecutor::CallbackHandle cb4 = unittest::assertGet(executor.scheduleWorkAt( + now - Milliseconds(50), stdx::bind(setStatus, stdx::placeholders::_1, &status4))); unittest::assertGet(executor.scheduleWorkAt( now + Milliseconds(5000), stdx::bind(setStatus, stdx::placeholders::_1, &status3))); const TaskExecutor::CallbackHandle cb2 = unittest::assertGet(executor.scheduleWorkAt( now + Milliseconds(200), stdx::bind(setStatusAndShutdown, stdx::placeholders::_1, &status2))); + + executor.wait(cb4); + ASSERT_OK(status4); + const Date_t startTime = net->now(); net->enterNetwork(); net->runUntil(startTime + Milliseconds(200)); |