author     Matthew Russotto <matthew.russotto@mongodb.com>   2019-10-23 14:48:52 +0000
committer  evergreen <evergreen@mongodb.com>                  2019-10-23 14:48:52 +0000
commit     2070d8ca09576b858c61677bc366a25e7ecfecad
tree       b49f2dc4a3b5107c366876c5152df1903b3cb7a0
parent     1eb3e9a9b1b9792e04351ab6fd72438071560975
SERVER-42385 Reduce log spam associated with election timeout resets.
-rw-r--r--  src/mongo/db/repl/replication_coordinator_impl.h             |  1
-rw-r--r--  src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp | 65
-rw-r--r--  src/mongo/db/repl/replication_coordinator_impl_test.cpp      | 85
-rw-r--r--  src/mongo/db/repl/replication_coordinator_test_fixture.cpp   |  2
4 files changed, 133 insertions, 20 deletions
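
The core of the change is a time-based throttle on the "cancel/reschedule election timeout" log lines: they are emitted at verbosity 5 on most calls and promoted to verbosity 4 only when the timeout will not be rescheduled, when no timeout was previously active, or otherwise at most once per second. The snippet below is a minimal, self-contained sketch of that throttling idea for illustration only; it is not MongoDB code, and the names, the plain std::chrono clock, and the std::cout logger are stand-ins for LOG_FOR_ELECTION and the replication executor's clock.

    #include <chrono>
    #include <iostream>
    #include <string>

    // Stand-in for the verbosity check done by LOG_FOR_ELECTION(level): print only
    // if 'level' is at or below the configured verbosity.
    int gVerbosity = 4;
    void logAt(int level, const std::string& msg) {
        if (level <= gVerbosity)
            std::cout << "D" << level << " " << msg << "\n";
    }

    // Throttled logging: default to level 5, but promote to level 4 when the event
    // is always interesting (e.g. we will not reschedule, or nothing was scheduled
    // before) or when at least one second has passed since the last level-4 line.
    void logCancelAndReschedule(bool alwaysPromote, const std::string& msg) {
        using namespace std::chrono;
        static steady_clock::time_point lastLevel4 = steady_clock::now();
        auto now = steady_clock::now();
        int level = 5;
        if (alwaysPromote || now - lastLevel4 >= seconds(1)) {
            level = 4;
            lastLevel4 = now;
        }
        logAt(level, msg);
    }
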
diff --git a/src/mongo/db/repl/replication_coordinator_impl.h b/src/mongo/db/repl/replication_coordinator_impl.h
index b8ca6a8a262..17a74cfcd7f 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.h
+++ b/src/mongo/db/repl/replication_coordinator_impl.h
@@ -1246,6 +1246,7 @@ private:
* Callback which starts an election if this node is electable and using protocolVersion 1.
*/
void _startElectSelfIfEligibleV1(StartElectionReasonEnum reason);
+ void _startElectSelfIfEligibleV1(WithLock, StartElectionReasonEnum reason);
/**
* Schedules work to be run no sooner than 'when' and returns handle to callback.
diff --git a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
index a337b04507b..bc733ba3459 100644
--- a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
@@ -825,43 +825,68 @@ void ReplicationCoordinatorImpl::_cancelCatchupTakeover_inlock() {
}
void ReplicationCoordinatorImpl::_cancelAndRescheduleElectionTimeout_inlock() {
- if (_handleElectionTimeoutCbh.isValid()) {
- LOG(4) << "Canceling election timeout callback at " << _handleElectionTimeoutWhen;
+ // We log at level 5 except when:
+ // * This is the first time we're scheduling after becoming an electable secondary.
+ // * We are not going to reschedule the election timeout because we are shutting down or
+ // no longer an electable secondary.
+ // * It has been at least a second since we last logged at level 4.
+ //
+ // In those instances we log at level 4. This routine is called on every replication batch,
+ // which would produce many log lines per second, so this logging strategy provides a
+ // compromise which allows us to see the election timeout being rescheduled without spamming
+ // the logs.
+ int cancelAndRescheduleLogLevel = 5;
+ static auto logThrottleTime = _replExecutor->now();
+ const bool wasActive = _handleElectionTimeoutCbh.isValid();
+ auto now = _replExecutor->now();
+ const bool doNotReschedule = _inShutdown || !_memberState.secondary() || _selfIndex < 0 ||
+ !_rsConfig.getMemberAt(_selfIndex).isElectable();
+
+ if (doNotReschedule || !wasActive || (now - logThrottleTime) >= Seconds(1)) {
+ cancelAndRescheduleLogLevel = 4;
+ logThrottleTime = now;
+ }
+ if (wasActive) {
+ LOG_FOR_ELECTION(cancelAndRescheduleLogLevel)
+ << "Canceling election timeout callback at " << _handleElectionTimeoutWhen;
_replExecutor->cancel(_handleElectionTimeoutCbh);
_handleElectionTimeoutCbh = CallbackHandle();
_handleElectionTimeoutWhen = Date_t();
}
- if (_inShutdown) {
+ if (doNotReschedule)
return;
- }
-
- if (!_memberState.secondary()) {
- return;
- }
-
- if (_selfIndex < 0) {
- return;
- }
-
- if (!_rsConfig.getMemberAt(_selfIndex).isElectable()) {
- return;
- }
Milliseconds randomOffset = _getRandomizedElectionOffset_inlock();
- auto now = _replExecutor->now();
auto when = now + _rsConfig.getElectionTimeoutPeriod() + randomOffset;
invariant(when > now);
- LOG_FOR_ELECTION(4) << "Scheduling election timeout callback at " << when;
+ if (wasActive) {
+ // The log level here is 4 at most once per second, otherwise 5.
+ LOG_FOR_ELECTION(cancelAndRescheduleLogLevel)
+ << "Rescheduling election timeout callback at " << when;
+ } else {
+ LOG_FOR_ELECTION(4) << "Scheduling election timeout callback at " << when;
+ }
_handleElectionTimeoutWhen = when;
_handleElectionTimeoutCbh =
- _scheduleWorkAt(when, [=](const mongo::executor::TaskExecutor::CallbackArgs&) {
- _startElectSelfIfEligibleV1(StartElectionReasonEnum::kElectionTimeout);
+ _scheduleWorkAt(when, [=](const mongo::executor::TaskExecutor::CallbackArgs& cbData) {
+ stdx::lock_guard<Latch> lk(_mutex);
+ if (_handleElectionTimeoutCbh == cbData.myHandle) {
+ // This lets _cancelAndRescheduleElectionTimeout_inlock know the callback
+ // has happened.
+ _handleElectionTimeoutCbh = CallbackHandle();
+ }
+ _startElectSelfIfEligibleV1(lk, StartElectionReasonEnum::kElectionTimeout);
});
}
void ReplicationCoordinatorImpl::_startElectSelfIfEligibleV1(StartElectionReasonEnum reason) {
stdx::lock_guard<Latch> lock(_mutex);
+ _startElectSelfIfEligibleV1(lock, reason);
+}
+
+void ReplicationCoordinatorImpl::_startElectSelfIfEligibleV1(WithLock,
+ StartElectionReasonEnum reason) {
// If it is not a single node replica set, no need to start an election after stepdown timeout.
if (reason == StartElectionReasonEnum::kSingleNodePromptElection &&
_rsConfig.getNumMembers() != 1) {
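
Besides the logging change, the rescheduled callback now already holds _mutex when it starts an election, so _startElectSelfIfEligibleV1 gains a WithLock overload: the original method takes the lock and delegates, while callers that already hold the mutex (such as the election-timeout lambda above) call the overload directly. The sketch below illustrates only that delegation pattern; WithLock here is a local tag type standing in for mongo::WithLock, and the class and method names are hypothetical.

    #include <mutex>

    // Tag type standing in for mongo::WithLock: accepting it as a parameter documents,
    // and loosely enforces, that the caller already holds the associated mutex.
    struct WithLock {
        WithLock(const std::lock_guard<std::mutex>&) {}
    };

    class Coordinator {
    public:
        // Entry point for callers that do not hold the mutex: acquire it, then delegate.
        void startElectionIfEligible() {
            std::lock_guard<std::mutex> lk(_mutex);
            _startElectionIfEligible(lk);
        }

        // A callback that already runs under _mutex calls the WithLock overload
        // directly, passing its lock_guard as proof of ownership.

    private:
        void _startElectionIfEligible(WithLock) {
            // ... eligibility checks and election start, all performed under _mutex ...
        }

        std::mutex _mutex;
    };
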
diff --git a/src/mongo/db/repl/replication_coordinator_impl_test.cpp b/src/mongo/db/repl/replication_coordinator_impl_test.cpp
index cfdd4b29a05..f4bb250ce1d 100644
--- a/src/mongo/db/repl/replication_coordinator_impl_test.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl_test.cpp
@@ -5159,6 +5159,91 @@ TEST_F(ReplCoordTest,
ASSERT_EQUALS(electionTimeoutWhen, replCoord->getElectionTimeout_forTest());
}
+TEST_F(ReplCoordTest, CancelAndRescheduleElectionTimeoutLogging) {
+ // Log all the election messages.
+ logger::globalLogDomain()->setMinimumLoggedSeverity(logger::LogComponent::kReplicationElection,
+ logger::LogSeverity::Debug(5));
+ startCapturingLogMessages();
+ // heartbeatTimeoutSecs is made large so we can advance the clock without worrying about
+ // additional heartbeats.
+ assertStartSuccess(BSON("_id"
+ << "mySet"
+ << "settings" << BSON("heartbeatTimeoutSecs" << 60000)
+ << "protocolVersion" << 1 << "version" << 2 << "members"
+ << BSON_ARRAY(BSON("host"
+ << "node1:12345"
+ << "_id" << 0)
+ << BSON("host"
+ << "node2:12345"
+ << "_id" << 1))),
+ HostAndPort("node1", 12345));
+
+ // Setting mode to secondary should schedule the election timeout.
+ ReplicationCoordinatorImpl* replCoord = getReplCoord();
+ ASSERT_OK(replCoord->setFollowerMode(MemberState::RS_SECONDARY));
+ ASSERT_EQ(1, countLogLinesContaining("Scheduling election timeout callback"));
+ ASSERT_EQ(0, countLogLinesContaining("Rescheduling election timeout callback"));
+ ASSERT_EQ(0, countLogLinesContaining("Canceling election timeout callback"));
+
+ // Scheduling again should produce the "rescheduled", not the "scheduled", message.
+ replCoord->cancelAndRescheduleElectionTimeout();
+ ASSERT_EQ(1, countLogLinesContaining("Scheduling election timeout callback"));
+ ASSERT_EQ(1, countLogLinesContaining("Rescheduling election timeout callback"));
+ ASSERT_EQ(1, countLogLinesContaining("Canceling election timeout callback"));
+
+ auto net = getNet();
+ net->enterNetwork();
+
+ // Black hole heartbeat request scheduled after transitioning to SECONDARY.
+ ASSERT_TRUE(net->hasReadyRequests());
+ auto noi = net->getNextReadyRequest();
+ const auto& request = noi->getRequest();
+ ASSERT_EQUALS(HostAndPort("node2", 12345), request.target);
+ ASSERT_EQUALS("replSetHeartbeat", request.cmdObj.firstElement().fieldNameStringData());
+ log() << "black holing " << noi->getRequest().cmdObj;
+ net->blackHole(noi);
+
+ // Advance simulator clock to some time after the first scheduled election.
+ auto electionTimeoutWhen = replCoord->getElectionTimeout_forTest();
+ log() << "Election initially scheduled at " << electionTimeoutWhen << " (simulator time)";
+ ASSERT_GREATER_THAN(electionTimeoutWhen, net->now());
+ auto until = electionTimeoutWhen + Milliseconds(1);
+ net->runUntil(until);
+ ASSERT_EQUALS(until, net->now());
+ net->exitNetwork();
+
+ // The election should have scheduled (not rescheduled) another timeout.
+ ASSERT_EQ(2, countLogLinesContaining("Scheduling election timeout callback"));
+ ASSERT_EQ(1, countLogLinesContaining("Rescheduling election timeout callback"));
+ ASSERT_EQ(1, countLogLinesContaining("Canceling election timeout callback"));
+
+ logger::globalLogDomain()->setMinimumLoggedSeverity(logger::LogComponent::kReplicationElection,
+ logger::LogSeverity::Debug(4));
+ net->enterNetwork();
+ until = electionTimeoutWhen + Milliseconds(500);
+ net->runUntil(until);
+ net->exitNetwork();
+ replCoord->cancelAndRescheduleElectionTimeout();
+
+ // We should not see this reschedule because it should be at log level 5.
+ ASSERT_EQ(2, countLogLinesContaining("Scheduling election timeout callback"));
+ ASSERT_EQ(1, countLogLinesContaining("Rescheduling election timeout callback"));
+ ASSERT_EQ(1, countLogLinesContaining("Canceling election timeout callback"));
+
+ net->enterNetwork();
+ until = electionTimeoutWhen + Milliseconds(1001);
+ net->runUntil(until);
+ net->exitNetwork();
+ replCoord->cancelAndRescheduleElectionTimeout();
+
+ stopCapturingLogMessages();
+ // We should see this reschedule at level 4 because it has been over 1 sec since we logged
+ // at level 4.
+ ASSERT_EQ(2, countLogLinesContaining("Scheduling election timeout callback"));
+ ASSERT_EQ(2, countLogLinesContaining("Rescheduling election timeout callback"));
+ ASSERT_EQ(2, countLogLinesContaining("Canceling election timeout callback"));
+}
+
TEST_F(ReplCoordTest, AdvanceCommittedSnapshotToMostRecentSnapshotPriorToOpTimeWhenOpTimeChanges) {
init("mySet");
diff --git a/src/mongo/db/repl/replication_coordinator_test_fixture.cpp b/src/mongo/db/repl/replication_coordinator_test_fixture.cpp
index f3a0b72e636..b5b3cd407ba 100644
--- a/src/mongo/db/repl/replication_coordinator_test_fixture.cpp
+++ b/src/mongo/db/repl/replication_coordinator_test_fixture.cpp
@@ -86,6 +86,8 @@ ReplCoordTest::ReplCoordTest() {
}
ReplCoordTest::~ReplCoordTest() {
+ logger::globalLogDomain()->clearMinimumLoggedSeverity(
+ logger::LogComponent::kReplicationElection);
globalFailPointRegistry().find("blockHeartbeatReconfigFinish")->setMode(FailPoint::off);
if (_callShutdown) {