summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Judah Schvimer <judah@mongodb.com> 2017-09-15 12:24:42 -0400
committer: Judah Schvimer <judah@mongodb.com> 2017-10-30 12:29:08 -0400
commit: 6fdbdf619aed482bbe24ac3c27f8d4a9700a5937 (patch)
tree: f46d56d39257d864bad660ae202ec404c5c66e65
parent: 918524be973eb056909f035df141b0dd5765ead1 (diff)
download: mongo-6fdbdf619aed482bbe24ac3c27f8d4a9700a5937.tar.gz
SERVER-29937 Make sure liveness timeouts cannot be missed
(cherry picked from commit f1bf0b33b4f1ce7bb50f208ef5e2d736ef5eba68)
-rw-r--r-- src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp | 30
-rw-r--r-- src/mongo/db/repl/replication_executor_test.cpp | 19
-rw-r--r-- src/mongo/executor/task_executor.h | 2
-rw-r--r-- src/mongo/executor/task_executor_test_common.cpp | 8
4 files changed, 45 insertions, 14 deletions
diff --git a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
index aa3548335c1..8a3d9532d1a 100644
--- a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
@@ -720,19 +720,25 @@ void ReplicationCoordinatorImpl::_scheduleNextLivenessUpdate_inlock(
}
auto nextTimeout = earliestDate + _rsConfig.getElectionTimeoutPeriod();
- if (nextTimeout > _replExecutor.now()) {
- LOG(3) << "scheduling next check at " << nextTimeout;
- auto cbh = _replExecutor.scheduleWorkAt(
- nextTimeout,
- stdx::bind(
- &ReplicationCoordinatorImpl::_handleLivenessTimeout, this, stdx::placeholders::_1));
- if (cbh.getStatus() == ErrorCodes::ShutdownInProgress) {
- return;
- }
- fassert(22002, cbh.getStatus());
- _handleLivenessTimeoutCbh = cbh.getValue();
- _earliestMemberId = earliestMemberId;
+ LOG(3) << "scheduling next check at " << nextTimeout;
+
+ // It is possible we will schedule the next timeout in the past.
+ // ReplicationExecutor::scheduleWorkAt() schedules its work immediately if it's given a
+ // time <= now().
+ // If we missed the timeout, it means that on our last check the earliest live member was
+ // just barely fresh and it has become stale since then. We must schedule another liveness
+ // check to continue conducting liveness checks and be able to step down from primary if we
+ // lose contact with a majority of nodes.
+ auto cbh = _replExecutor.scheduleWorkAt(
+ nextTimeout,
+ stdx::bind(
+ &ReplicationCoordinatorImpl::_handleLivenessTimeout, this, stdx::placeholders::_1));
+ if (cbh.getStatus() == ErrorCodes::ShutdownInProgress) {
+ return;
}
+ fassert(22002, cbh.getStatus());
+ _handleLivenessTimeoutCbh = cbh.getValue();
+ _earliestMemberId = earliestMemberId;
}
void ReplicationCoordinatorImpl::_cancelAndRescheduleLivenessUpdate_inlock(int updatedMemberId) {
diff --git a/src/mongo/db/repl/replication_executor_test.cpp b/src/mongo/db/repl/replication_executor_test.cpp
index 60259be0b52..dbb413a155d 100644
--- a/src/mongo/db/repl/replication_executor_test.cpp
+++ b/src/mongo/db/repl/replication_executor_test.cpp
@@ -31,15 +31,15 @@
#include <map>
#include "mongo/base/init.h"
-#include "mongo/executor/task_executor_test_common.h"
#include "mongo/db/namespace_string.h"
#include "mongo/db/operation_context.h"
#include "mongo/db/repl/replication_executor.h"
#include "mongo/db/repl/replication_executor_test_fixture.h"
#include "mongo/db/repl/storage_interface_mock.h"
#include "mongo/executor/network_interface_mock.h"
-#include "mongo/stdx/memory.h"
+#include "mongo/executor/task_executor_test_common.h"
#include "mongo/stdx/functional.h"
+#include "mongo/stdx/memory.h"
#include "mongo/stdx/thread.h"
#include "mongo/unittest/barrier.h"
#include "mongo/unittest/unittest.h"
@@ -229,6 +229,21 @@ TEST_F(ReplicationExecutorTest, ScheduleCallbackAtNow) {
executor.waitForEvent(finishEvent);
}
+TEST_F(ReplicationExecutorTest, ScheduleCallbackInPast) {
+ launchExecutorThread();
+ getNet()->exitNetwork();
+
+ ReplicationExecutor& executor = getReplExecutor();
+ auto finishEvent = assertGet(executor.makeEvent());
+ auto fn = [&executor, finishEvent](const ReplicationExecutor::CallbackArgs& cbData) {
+ ASSERT_OK(cbData.status);
+ executor.signalEvent(finishEvent);
+ };
+
+ auto cb = executor.scheduleWorkAt(getNet()->now() - Milliseconds(1000), fn);
+ executor.waitForEvent(finishEvent);
+}
+
TEST_F(ReplicationExecutorTest, ScheduleCallbackAtAFutureTime) {
launchExecutorThread();
getNet()->exitNetwork();
diff --git a/src/mongo/executor/task_executor.h b/src/mongo/executor/task_executor.h
index ec68a837c8e..c7adf8cefe3 100644
--- a/src/mongo/executor/task_executor.h
+++ b/src/mongo/executor/task_executor.h
@@ -195,6 +195,8 @@ public:
/**
* Schedules "work" to be run by the executor no sooner than "when".
*
+ * If "when" is <= now(), then it schedules the "work" to be run ASAP.
+ *
* Returns a handle for waiting on or canceling the callback, or
* ErrorCodes::ShutdownInProgress.
*
diff --git a/src/mongo/executor/task_executor_test_common.cpp b/src/mongo/executor/task_executor_test_common.cpp
index d57f2e15434..0d3ce8bbc69 100644
--- a/src/mongo/executor/task_executor_test_common.cpp
+++ b/src/mongo/executor/task_executor_test_common.cpp
@@ -335,14 +335,22 @@ COMMON_EXECUTOR_TEST(ScheduleWorkAt) {
Status status1 = getDetectableErrorStatus();
Status status2 = getDetectableErrorStatus();
Status status3 = getDetectableErrorStatus();
+ Status status4 = getDetectableErrorStatus();
+
const Date_t now = net->now();
const TaskExecutor::CallbackHandle cb1 = unittest::assertGet(executor.scheduleWorkAt(
now + Milliseconds(100), stdx::bind(setStatus, stdx::placeholders::_1, &status1)));
+ const TaskExecutor::CallbackHandle cb4 = unittest::assertGet(executor.scheduleWorkAt(
+ now - Milliseconds(50), stdx::bind(setStatus, stdx::placeholders::_1, &status4)));
unittest::assertGet(executor.scheduleWorkAt(
now + Milliseconds(5000), stdx::bind(setStatus, stdx::placeholders::_1, &status3)));
const TaskExecutor::CallbackHandle cb2 = unittest::assertGet(executor.scheduleWorkAt(
now + Milliseconds(200),
stdx::bind(setStatusAndShutdown, stdx::placeholders::_1, &status2)));
+
+ executor.wait(cb4);
+ ASSERT_OK(status4);
+
const Date_t startTime = net->now();
net->enterNetwork();
net->runUntil(startTime + Milliseconds(200));