summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Judah Schvimer <judah@mongodb.com> 2017-09-15 12:24:42 -0400
committer Judah Schvimer <judah@mongodb.com> 2017-09-15 12:24:42 -0400
commit f1bf0b33b4f1ce7bb50f208ef5e2d736ef5eba68 (patch)
tree 67db4cd6d74deec4f1527508217ac8af9ea922d3
parent 84cb3ecc1e249f2d96a6d12c2cf2d22516e59efa (diff)
download mongo-f1bf0b33b4f1ce7bb50f208ef5e2d736ef5eba68.tar.gz
SERVER-29937 Make sure liveness timeouts cannot be missed
-rw-r--r-- src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp 28
-rw-r--r-- src/mongo/executor/task_executor.h 2
-rw-r--r-- src/mongo/executor/task_executor_test_common.cpp 8
3 files changed, 27 insertions, 11 deletions
diff --git a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
index e7b2e77b83e..66ffbb76192 100644
--- a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
@@ -688,18 +688,24 @@ void ReplicationCoordinatorImpl::_scheduleNextLivenessUpdate_inlock() {
}
auto nextTimeout = earliestDate + _rsConfig.getElectionTimeoutPeriod();
- if (nextTimeout > _replExecutor->now()) {
- LOG(3) << "scheduling next check at " << nextTimeout;
- auto cbh = _scheduleWorkAt(nextTimeout,
- stdx::bind(&ReplicationCoordinatorImpl::_handleLivenessTimeout,
- this,
- stdx::placeholders::_1));
- if (!cbh) {
- return;
- }
- _handleLivenessTimeoutCbh = cbh;
- _earliestMemberId = earliestMemberId;
+ LOG(3) << "scheduling next check at " << nextTimeout;
+
+ // It is possible we will schedule the next timeout in the past.
+ // ThreadPoolTaskExecutor::_scheduleWorkAt() schedules its work immediately if it's given a
+ // time <= now().
+ // If we missed the timeout, it means that on our last check the earliest live member was
+ // just barely fresh and it has become stale since then. We must schedule another liveness
+ // check to continue conducting liveness checks and be able to step down from primary if we
+ // lose contact with a majority of nodes.
+ auto cbh = _scheduleWorkAt(nextTimeout,
+ stdx::bind(&ReplicationCoordinatorImpl::_handleLivenessTimeout,
+ this,
+ stdx::placeholders::_1));
+ if (!cbh) {
+ return;
}
+ _handleLivenessTimeoutCbh = cbh;
+ _earliestMemberId = earliestMemberId;
}
void ReplicationCoordinatorImpl::_cancelAndRescheduleLivenessUpdate_inlock(int updatedMemberId) {
diff --git a/src/mongo/executor/task_executor.h b/src/mongo/executor/task_executor.h
index 22146268788..8fd2149f524 100644
--- a/src/mongo/executor/task_executor.h
+++ b/src/mongo/executor/task_executor.h
@@ -200,6 +200,8 @@ public:
/**
* Schedules "work" to be run by the executor no sooner than "when".
*
+ * If "when" is <= now(), then it schedules the "work" to be run ASAP.
+ *
* Returns a handle for waiting on or canceling the callback, or
* ErrorCodes::ShutdownInProgress.
*
diff --git a/src/mongo/executor/task_executor_test_common.cpp b/src/mongo/executor/task_executor_test_common.cpp
index 7532dc09569..a3e0b784034 100644
--- a/src/mongo/executor/task_executor_test_common.cpp
+++ b/src/mongo/executor/task_executor_test_common.cpp
@@ -330,14 +330,22 @@ COMMON_EXECUTOR_TEST(ScheduleWorkAt) {
Status status1 = getDetectableErrorStatus();
Status status2 = getDetectableErrorStatus();
Status status3 = getDetectableErrorStatus();
+ Status status4 = getDetectableErrorStatus();
+
const Date_t now = net->now();
const TaskExecutor::CallbackHandle cb1 = unittest::assertGet(executor.scheduleWorkAt(
now + Milliseconds(100), stdx::bind(setStatus, stdx::placeholders::_1, &status1)));
+ const TaskExecutor::CallbackHandle cb4 = unittest::assertGet(executor.scheduleWorkAt(
+ now - Milliseconds(50), stdx::bind(setStatus, stdx::placeholders::_1, &status4)));
unittest::assertGet(executor.scheduleWorkAt(
now + Milliseconds(5000), stdx::bind(setStatus, stdx::placeholders::_1, &status3)));
const TaskExecutor::CallbackHandle cb2 = unittest::assertGet(executor.scheduleWorkAt(
now + Milliseconds(200),
stdx::bind(setStatusAndShutdown, stdx::placeholders::_1, &status2)));
+
+ executor.wait(cb4);
+ ASSERT_OK(status4);
+
const Date_t startTime = net->now();
net->enterNetwork();
net->runUntil(startTime + Milliseconds(200));