diff options
author | Judah Schvimer <judah@mongodb.com> | 2017-09-15 12:24:42 -0400 |
---|---|---|
committer | Judah Schvimer <judah@mongodb.com> | 2017-09-15 12:24:42 -0400 |
commit | f1bf0b33b4f1ce7bb50f208ef5e2d736ef5eba68 (patch) | |
tree | 67db4cd6d74deec4f1527508217ac8af9ea922d3 | |
parent | 84cb3ecc1e249f2d96a6d12c2cf2d22516e59efa (diff) | |
download | mongo-f1bf0b33b4f1ce7bb50f208ef5e2d736ef5eba68.tar.gz |
SERVER-29937 Make sure liveness timeouts cannot be missed
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp | 28 | ||||
-rw-r--r-- | src/mongo/executor/task_executor.h | 2 | ||||
-rw-r--r-- | src/mongo/executor/task_executor_test_common.cpp | 8 |
3 files changed, 27 insertions, 11 deletions
diff --git a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp index e7b2e77b83e..66ffbb76192 100644 --- a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp @@ -688,18 +688,24 @@ void ReplicationCoordinatorImpl::_scheduleNextLivenessUpdate_inlock() { } auto nextTimeout = earliestDate + _rsConfig.getElectionTimeoutPeriod(); - if (nextTimeout > _replExecutor->now()) { - LOG(3) << "scheduling next check at " << nextTimeout; - auto cbh = _scheduleWorkAt(nextTimeout, - stdx::bind(&ReplicationCoordinatorImpl::_handleLivenessTimeout, - this, - stdx::placeholders::_1)); - if (!cbh) { - return; - } - _handleLivenessTimeoutCbh = cbh; - _earliestMemberId = earliestMemberId; + LOG(3) << "scheduling next check at " << nextTimeout; + + // It is possible we will schedule the next timeout in the past. + // ThreadPoolTaskExecutor::_scheduleWorkAt() schedules its work immediately if it's given a + // time <= now(). + // If we missed the timeout, it means that on our last check the earliest live member was + // just barely fresh and it has become stale since then. We must schedule another liveness + // check to continue conducting liveness checks and be able to step down from primary if we + // lose contact with a majority of nodes. + auto cbh = _scheduleWorkAt(nextTimeout, + stdx::bind(&ReplicationCoordinatorImpl::_handleLivenessTimeout, + this, + stdx::placeholders::_1)); + if (!cbh) { + return; } + _handleLivenessTimeoutCbh = cbh; + _earliestMemberId = earliestMemberId; } void ReplicationCoordinatorImpl::_cancelAndRescheduleLivenessUpdate_inlock(int updatedMemberId) { diff --git a/src/mongo/executor/task_executor.h b/src/mongo/executor/task_executor.h index 22146268788..8fd2149f524 100644 --- a/src/mongo/executor/task_executor.h +++ b/src/mongo/executor/task_executor.h @@ -200,6 +200,8 @@ public: /** * Schedules "work" to be run by the executor no sooner than "when". * + * If "when" is <= now(), then it schedules the "work" to be run ASAP. + * * Returns a handle for waiting on or canceling the callback, or * ErrorCodes::ShutdownInProgress. * diff --git a/src/mongo/executor/task_executor_test_common.cpp b/src/mongo/executor/task_executor_test_common.cpp index 7532dc09569..a3e0b784034 100644 --- a/src/mongo/executor/task_executor_test_common.cpp +++ b/src/mongo/executor/task_executor_test_common.cpp @@ -330,14 +330,22 @@ COMMON_EXECUTOR_TEST(ScheduleWorkAt) { Status status1 = getDetectableErrorStatus(); Status status2 = getDetectableErrorStatus(); Status status3 = getDetectableErrorStatus(); + Status status4 = getDetectableErrorStatus(); + const Date_t now = net->now(); const TaskExecutor::CallbackHandle cb1 = unittest::assertGet(executor.scheduleWorkAt( now + Milliseconds(100), stdx::bind(setStatus, stdx::placeholders::_1, &status1))); + const TaskExecutor::CallbackHandle cb4 = unittest::assertGet(executor.scheduleWorkAt( + now - Milliseconds(50), stdx::bind(setStatus, stdx::placeholders::_1, &status4))); unittest::assertGet(executor.scheduleWorkAt( now + Milliseconds(5000), stdx::bind(setStatus, stdx::placeholders::_1, &status3))); const TaskExecutor::CallbackHandle cb2 = unittest::assertGet(executor.scheduleWorkAt( now + Milliseconds(200), stdx::bind(setStatusAndShutdown, stdx::placeholders::_1, &status2))); + + executor.wait(cb4); + ASSERT_OK(status4); + const Date_t startTime = net->now(); net->enterNetwork(); net->runUntil(startTime + Milliseconds(200)); |