diff options
author | Judah Schvimer <judah@mongodb.com> | 2017-09-15 12:24:42 -0400 |
---|---|---|
committer | Judah Schvimer <judah@mongodb.com> | 2017-10-30 12:29:08 -0400 |
commit | 6fdbdf619aed482bbe24ac3c27f8d4a9700a5937 (patch) | |
tree | f46d56d39257d864bad660ae202ec404c5c66e65 | |
parent | 918524be973eb056909f035df141b0dd5765ead1 (diff) | |
download | mongo-6fdbdf619aed482bbe24ac3c27f8d4a9700a5937.tar.gz |
SERVER-29937 Make sure liveness timeouts cannot be missed
(cherry picked from commit f1bf0b33b4f1ce7bb50f208ef5e2d736ef5eba68)
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp | 30 | ||||
-rw-r--r-- | src/mongo/db/repl/replication_executor_test.cpp | 19 | ||||
-rw-r--r-- | src/mongo/executor/task_executor.h | 2 | ||||
-rw-r--r-- | src/mongo/executor/task_executor_test_common.cpp | 8 |
4 files changed, 45 insertions, 14 deletions
diff --git a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
index aa3548335c1..8a3d9532d1a 100644
--- a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
@@ -720,19 +720,25 @@ void ReplicationCoordinatorImpl::_scheduleNextLivenessUpdate_inlock(
     }
 
     auto nextTimeout = earliestDate + _rsConfig.getElectionTimeoutPeriod();
-    if (nextTimeout > _replExecutor.now()) {
-        LOG(3) << "scheduling next check at " << nextTimeout;
-        auto cbh = _replExecutor.scheduleWorkAt(
-            nextTimeout,
-            stdx::bind(
-                &ReplicationCoordinatorImpl::_handleLivenessTimeout, this, stdx::placeholders::_1));
-        if (cbh.getStatus() == ErrorCodes::ShutdownInProgress) {
-            return;
-        }
-        fassert(22002, cbh.getStatus());
-        _handleLivenessTimeoutCbh = cbh.getValue();
-        _earliestMemberId = earliestMemberId;
+    LOG(3) << "scheduling next check at " << nextTimeout;
+
+    // It is possible we will schedule the next timeout in the past.
+    // ReplicationExecutor::scheduleWorkAt() schedules its work immediately if it's given a
+    // time <= now().
+    // If we missed the timeout, it means that on our last check the earliest live member was
+    // just barely fresh and it has become stale since then. We must schedule another liveness
+    // check to continue conducting liveness checks and be able to step down from primary if we
+    // lose contact with a majority of nodes.
+    auto cbh = _replExecutor.scheduleWorkAt(
+        nextTimeout,
+        stdx::bind(
+            &ReplicationCoordinatorImpl::_handleLivenessTimeout, this, stdx::placeholders::_1));
+    if (cbh.getStatus() == ErrorCodes::ShutdownInProgress) {
+        return;
     }
+    fassert(22002, cbh.getStatus());
+    _handleLivenessTimeoutCbh = cbh.getValue();
+    _earliestMemberId = earliestMemberId;
 }
 
 void ReplicationCoordinatorImpl::_cancelAndRescheduleLivenessUpdate_inlock(int updatedMemberId) {
diff --git a/src/mongo/db/repl/replication_executor_test.cpp b/src/mongo/db/repl/replication_executor_test.cpp
index 60259be0b52..dbb413a155d 100644
--- a/src/mongo/db/repl/replication_executor_test.cpp
+++ b/src/mongo/db/repl/replication_executor_test.cpp
@@ -31,15 +31,15 @@
 #include <map>
 
 #include "mongo/base/init.h"
-#include "mongo/executor/task_executor_test_common.h"
 #include "mongo/db/namespace_string.h"
 #include "mongo/db/operation_context.h"
 #include "mongo/db/repl/replication_executor.h"
 #include "mongo/db/repl/replication_executor_test_fixture.h"
 #include "mongo/db/repl/storage_interface_mock.h"
 #include "mongo/executor/network_interface_mock.h"
-#include "mongo/stdx/memory.h"
+#include "mongo/executor/task_executor_test_common.h"
 #include "mongo/stdx/functional.h"
+#include "mongo/stdx/memory.h"
 #include "mongo/stdx/thread.h"
 #include "mongo/unittest/barrier.h"
 #include "mongo/unittest/unittest.h"
@@ -229,6 +229,21 @@ TEST_F(ReplicationExecutorTest, ScheduleCallbackAtNow) {
     executor.waitForEvent(finishEvent);
 }
 
+TEST_F(ReplicationExecutorTest, ScheduleCallbackInPast) {
+    launchExecutorThread();
+    getNet()->exitNetwork();
+
+    ReplicationExecutor& executor = getReplExecutor();
+    auto finishEvent = assertGet(executor.makeEvent());
+    auto fn = [&executor, finishEvent](const ReplicationExecutor::CallbackArgs& cbData) {
+        ASSERT_OK(cbData.status);
+        executor.signalEvent(finishEvent);
+    };
+
+    auto cb = executor.scheduleWorkAt(getNet()->now() - Milliseconds(1000), fn);
+    executor.waitForEvent(finishEvent);
+}
+
 TEST_F(ReplicationExecutorTest, ScheduleCallbackAtAFutureTime) {
     launchExecutorThread();
     getNet()->exitNetwork();
diff --git a/src/mongo/executor/task_executor.h b/src/mongo/executor/task_executor.h
index ec68a837c8e..c7adf8cefe3 100644
--- a/src/mongo/executor/task_executor.h
+++ b/src/mongo/executor/task_executor.h
@@ -195,6 +195,8 @@ public:
     /**
      * Schedules "work" to be run by the executor no sooner than "when".
      *
+     * If "when" is <= now(), then it schedules the "work" to be run ASAP.
+     *
      * Returns a handle for waiting on or canceling the callback, or
      * ErrorCodes::ShutdownInProgress.
      *
diff --git a/src/mongo/executor/task_executor_test_common.cpp b/src/mongo/executor/task_executor_test_common.cpp
index d57f2e15434..0d3ce8bbc69 100644
--- a/src/mongo/executor/task_executor_test_common.cpp
+++ b/src/mongo/executor/task_executor_test_common.cpp
@@ -335,14 +335,22 @@ COMMON_EXECUTOR_TEST(ScheduleWorkAt) {
     Status status1 = getDetectableErrorStatus();
     Status status2 = getDetectableErrorStatus();
     Status status3 = getDetectableErrorStatus();
+    Status status4 = getDetectableErrorStatus();
+
     const Date_t now = net->now();
     const TaskExecutor::CallbackHandle cb1 = unittest::assertGet(executor.scheduleWorkAt(
         now + Milliseconds(100), stdx::bind(setStatus, stdx::placeholders::_1, &status1)));
+    const TaskExecutor::CallbackHandle cb4 = unittest::assertGet(executor.scheduleWorkAt(
+        now - Milliseconds(50), stdx::bind(setStatus, stdx::placeholders::_1, &status4)));
     unittest::assertGet(executor.scheduleWorkAt(
         now + Milliseconds(5000), stdx::bind(setStatus, stdx::placeholders::_1, &status3)));
     const TaskExecutor::CallbackHandle cb2 = unittest::assertGet(executor.scheduleWorkAt(
         now + Milliseconds(200),
         stdx::bind(setStatusAndShutdown, stdx::placeholders::_1, &status2)));
+
+    executor.wait(cb4);
+    ASSERT_OK(status4);
+
     const Date_t startTime = net->now();
     net->enterNetwork();
     net->runUntil(startTime + Milliseconds(200));
|