From 2070d8ca09576b858c61677bc366a25e7ecfecad Mon Sep 17 00:00:00 2001
From: Matthew Russotto
Date: Wed, 23 Oct 2019 14:48:52 +0000
Subject: SERVER-42385 Reduce log spam associated with election timeout resets.

---
 src/mongo/db/repl/replication_coordinator_impl.h   |  1 +
 .../replication_coordinator_impl_heartbeat.cpp     | 65 ++++++++++++-----
 .../db/repl/replication_coordinator_impl_test.cpp  | 85 ++++++++++++++++++++++
 .../repl/replication_coordinator_test_fixture.cpp  |  2 +
 4 files changed, 133 insertions(+), 20 deletions(-)

diff --git a/src/mongo/db/repl/replication_coordinator_impl.h b/src/mongo/db/repl/replication_coordinator_impl.h
index b8ca6a8a262..17a74cfcd7f 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.h
+++ b/src/mongo/db/repl/replication_coordinator_impl.h
@@ -1246,6 +1246,7 @@ private:
      * Callback which starts an election if this node is electable and using protocolVersion 1.
      */
     void _startElectSelfIfEligibleV1(StartElectionReasonEnum reason);
+    void _startElectSelfIfEligibleV1(WithLock, StartElectionReasonEnum reason);
 
     /**
      * Schedules work to be run no sooner than 'when' and returns handle to callback.
diff --git a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
index a337b04507b..bc733ba3459 100644
--- a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
@@ -825,43 +825,68 @@ void ReplicationCoordinatorImpl::_cancelCatchupTakeover_inlock() {
 }
 
 void ReplicationCoordinatorImpl::_cancelAndRescheduleElectionTimeout_inlock() {
-    if (_handleElectionTimeoutCbh.isValid()) {
-        LOG(4) << "Canceling election timeout callback at " << _handleElectionTimeoutWhen;
+    // We log at level 5 except when:
+    // * This is the first time we're scheduling after becoming an electable secondary.
+    // * We are not going to reschedule the election timeout because we are shutting down or
+    //   no longer an electable secondary.
+    // * It has been at least a second since we last logged at level 4.
+    //
+    // In those instances we log at level 4. This routine is called on every replication batch,
+    // which would produce many log lines per second, so this logging strategy provides a
+    // compromise which allows us to see the election timeout being rescheduled without spamming
+    // the logs.
+    int cancelAndRescheduleLogLevel = 5;
+    static auto logThrottleTime = _replExecutor->now();
+    const bool wasActive = _handleElectionTimeoutCbh.isValid();
+    auto now = _replExecutor->now();
+    const bool doNotReschedule = _inShutdown || !_memberState.secondary() || _selfIndex < 0 ||
+        !_rsConfig.getMemberAt(_selfIndex).isElectable();
+
+    if (doNotReschedule || !wasActive || (now - logThrottleTime) >= Seconds(1)) {
+        cancelAndRescheduleLogLevel = 4;
+        logThrottleTime = now;
+    }
+    if (wasActive) {
+        LOG_FOR_ELECTION(cancelAndRescheduleLogLevel)
+            << "Canceling election timeout callback at " << _handleElectionTimeoutWhen;
         _replExecutor->cancel(_handleElectionTimeoutCbh);
         _handleElectionTimeoutCbh = CallbackHandle();
         _handleElectionTimeoutWhen = Date_t();
     }
 
-    if (_inShutdown) {
+    if (doNotReschedule)
         return;
-    }
-
-    if (!_memberState.secondary()) {
-        return;
-    }
-
-    if (_selfIndex < 0) {
-        return;
-    }
-
-    if (!_rsConfig.getMemberAt(_selfIndex).isElectable()) {
-        return;
-    }
 
     Milliseconds randomOffset = _getRandomizedElectionOffset_inlock();
-    auto now = _replExecutor->now();
     auto when = now + _rsConfig.getElectionTimeoutPeriod() + randomOffset;
     invariant(when > now);
-    LOG_FOR_ELECTION(4) << "Scheduling election timeout callback at " << when;
+    if (wasActive) {
+        // We log at level 4 here at most once per second, otherwise at level 5.
+        LOG_FOR_ELECTION(cancelAndRescheduleLogLevel)
+            << "Rescheduling election timeout callback at " << when;
+    } else {
+        LOG_FOR_ELECTION(4) << "Scheduling election timeout callback at " << when;
+    }
     _handleElectionTimeoutWhen = when;
     _handleElectionTimeoutCbh =
-        _scheduleWorkAt(when, [=](const mongo::executor::TaskExecutor::CallbackArgs&) {
-            _startElectSelfIfEligibleV1(StartElectionReasonEnum::kElectionTimeout);
+        _scheduleWorkAt(when, [=](const mongo::executor::TaskExecutor::CallbackArgs& cbData) {
+            stdx::lock_guard lk(_mutex);
+            if (_handleElectionTimeoutCbh == cbData.myHandle) {
+                // This lets _cancelAndRescheduleElectionTimeout_inlock know the callback
+                // has happened.
+                _handleElectionTimeoutCbh = CallbackHandle();
+            }
+            _startElectSelfIfEligibleV1(lk, StartElectionReasonEnum::kElectionTimeout);
         });
 }
 
 void ReplicationCoordinatorImpl::_startElectSelfIfEligibleV1(StartElectionReasonEnum reason) {
     stdx::lock_guard lock(_mutex);
+    _startElectSelfIfEligibleV1(lock, reason);
+}
+
+void ReplicationCoordinatorImpl::_startElectSelfIfEligibleV1(WithLock,
+                                                             StartElectionReasonEnum reason) {
     // If it is not a single node replica set, no need to start an election after stepdown timeout.
     if (reason == StartElectionReasonEnum::kSingleNodePromptElection &&
         _rsConfig.getNumMembers() != 1) {
diff --git a/src/mongo/db/repl/replication_coordinator_impl_test.cpp b/src/mongo/db/repl/replication_coordinator_impl_test.cpp
index cfdd4b29a05..f4bb250ce1d 100644
--- a/src/mongo/db/repl/replication_coordinator_impl_test.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl_test.cpp
@@ -5159,6 +5159,91 @@ TEST_F(ReplCoordTest,
     ASSERT_EQUALS(electionTimeoutWhen, replCoord->getElectionTimeout_forTest());
 }
 
+TEST_F(ReplCoordTest, CancelAndRescheduleElectionTimeoutLogging) {
+    // Log all the election messages.
+    logger::globalLogDomain()->setMinimumLoggedSeverity(logger::LogComponent::kReplicationElection,
+                                                        logger::LogSeverity::Debug(5));
+    startCapturingLogMessages();
+    // heartbeatTimeoutSecs is made large so we can advance the clock without worrying about
+    // additional heartbeats.
+    assertStartSuccess(BSON("_id"
+                            << "mySet"
+                            << "settings" << BSON("heartbeatTimeoutSecs" << 60000)
+                            << "protocolVersion" << 1 << "version" << 2 << "members"
+                            << BSON_ARRAY(BSON("host"
+                                               << "node1:12345"
+                                               << "_id" << 0)
+                                          << BSON("host"
+                                                  << "node2:12345"
+                                                  << "_id" << 1))),
+                       HostAndPort("node1", 12345));
+
+    // Setting mode to secondary should schedule the election timeout.
+    ReplicationCoordinatorImpl* replCoord = getReplCoord();
+    ASSERT_OK(replCoord->setFollowerMode(MemberState::RS_SECONDARY));
+    ASSERT_EQ(1, countLogLinesContaining("Scheduling election timeout callback"));
+    ASSERT_EQ(0, countLogLinesContaining("Rescheduling election timeout callback"));
+    ASSERT_EQ(0, countLogLinesContaining("Canceling election timeout callback"));
+
+    // Scheduling again should produce the "rescheduled", not the "scheduled", message.
+    replCoord->cancelAndRescheduleElectionTimeout();
+    ASSERT_EQ(1, countLogLinesContaining("Scheduling election timeout callback"));
+    ASSERT_EQ(1, countLogLinesContaining("Rescheduling election timeout callback"));
+    ASSERT_EQ(1, countLogLinesContaining("Canceling election timeout callback"));
+
+    auto net = getNet();
+    net->enterNetwork();
+
+    // Black hole the heartbeat request scheduled after transitioning to SECONDARY.
+    ASSERT_TRUE(net->hasReadyRequests());
+    auto noi = net->getNextReadyRequest();
+    const auto& request = noi->getRequest();
+    ASSERT_EQUALS(HostAndPort("node2", 12345), request.target);
+    ASSERT_EQUALS("replSetHeartbeat", request.cmdObj.firstElement().fieldNameStringData());
+    log() << "black holing " << noi->getRequest().cmdObj;
+    net->blackHole(noi);
+
+    // Advance simulator clock to some time after the first scheduled election.
+    auto electionTimeoutWhen = replCoord->getElectionTimeout_forTest();
+    log() << "Election initially scheduled at " << electionTimeoutWhen << " (simulator time)";
+    ASSERT_GREATER_THAN(electionTimeoutWhen, net->now());
+    auto until = electionTimeoutWhen + Milliseconds(1);
+    net->runUntil(until);
+    ASSERT_EQUALS(until, net->now());
+    net->exitNetwork();
+
+    // The election should have scheduled (not rescheduled) another timeout.
+    ASSERT_EQ(2, countLogLinesContaining("Scheduling election timeout callback"));
+    ASSERT_EQ(1, countLogLinesContaining("Rescheduling election timeout callback"));
+    ASSERT_EQ(1, countLogLinesContaining("Canceling election timeout callback"));
+
+    logger::globalLogDomain()->setMinimumLoggedSeverity(logger::LogComponent::kReplicationElection,
+                                                        logger::LogSeverity::Debug(4));
+    net->enterNetwork();
+    until = electionTimeoutWhen + Milliseconds(500);
+    net->runUntil(until);
+    net->exitNetwork();
+    replCoord->cancelAndRescheduleElectionTimeout();
+
+    // We should not see this reschedule because it should be at log level 5.
+    ASSERT_EQ(2, countLogLinesContaining("Scheduling election timeout callback"));
+    ASSERT_EQ(1, countLogLinesContaining("Rescheduling election timeout callback"));
+    ASSERT_EQ(1, countLogLinesContaining("Canceling election timeout callback"));
+
+    net->enterNetwork();
+    until = electionTimeoutWhen + Milliseconds(1001);
+    net->runUntil(until);
+    net->exitNetwork();
+    replCoord->cancelAndRescheduleElectionTimeout();
+
+    stopCapturingLogMessages();
+    // We should see this reschedule at level 4 because it has been over 1 second since we
+    // last logged at level 4.
+ ASSERT_EQ(2, countLogLinesContaining("Scheduling election timeout callback")); + ASSERT_EQ(2, countLogLinesContaining("Rescheduling election timeout callback")); + ASSERT_EQ(2, countLogLinesContaining("Canceling election timeout callback")); +} + TEST_F(ReplCoordTest, AdvanceCommittedSnapshotToMostRecentSnapshotPriorToOpTimeWhenOpTimeChanges) { init("mySet"); diff --git a/src/mongo/db/repl/replication_coordinator_test_fixture.cpp b/src/mongo/db/repl/replication_coordinator_test_fixture.cpp index f3a0b72e636..b5b3cd407ba 100644 --- a/src/mongo/db/repl/replication_coordinator_test_fixture.cpp +++ b/src/mongo/db/repl/replication_coordinator_test_fixture.cpp @@ -86,6 +86,8 @@ ReplCoordTest::ReplCoordTest() { } ReplCoordTest::~ReplCoordTest() { + logger::globalLogDomain()->clearMinimumLoggedSeverity( + logger::LogComponent::kReplicationElection); globalFailPointRegistry().find("blockHeartbeatReconfigFinish")->setMode(FailPoint::off); if (_callShutdown) { -- cgit v1.2.1
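
The throttling strategy the new comment describes is a general pattern: keep the
steady-state message at a quiet level (5), and promote it to a visible level (4) on
state changes, or otherwise at most once per second. What follows is a minimal
standalone C++ sketch of that pattern for readers of this patch; logAtLevel,
ElectionTimeoutScheduler, and the fixed verbosity of 4 are hypothetical stand-ins,
not MongoDB's LOG_FOR_ELECTION machinery.

#include <chrono>
#include <iostream>
#include <string>

using Clock = std::chrono::steady_clock;

// Hypothetical stand-in for LOG_FOR_ELECTION(level): prints only when 'level'
// is at or below the configured verbosity (fixed at 4 here, matching the
// default used in the new test).
static void logAtLevel(int level, const std::string& msg) {
    const int kConfiguredVerbosity = 4;
    if (level <= kConfiguredVerbosity) {
        std::cout << "D" << level << " " << msg << "\n";
    }
}

class ElectionTimeoutScheduler {
public:
    // Mirrors the patch's strategy: default to level 5, but promote to level 4
    // when (a) there was no active timeout to cancel, (b) we will not
    // reschedule, or (c) at least a second has passed since the last level-4 line.
    void cancelAndReschedule(bool wasActive, bool willReschedule) {
        const auto now = Clock::now();
        int level = 5;
        if (!willReschedule || !wasActive ||
            now - _lastLevel4Log >= std::chrono::seconds(1)) {
            level = 4;
            _lastLevel4Log = now;
        }
        if (wasActive) {
            logAtLevel(level, "Canceling election timeout callback");
        }
        if (!willReschedule) {
            return;
        }
        logAtLevel(level, wasActive ? "Rescheduling election timeout callback"
                                    : "Scheduling election timeout callback");
    }

private:
    Clock::time_point _lastLevel4Log = Clock::now() - std::chrono::seconds(2);
};

int main() {
    ElectionTimeoutScheduler scheduler;
    // The first schedule is always visible at level 4.
    scheduler.cancelAndReschedule(/*wasActive=*/false, /*willReschedule=*/true);
    // Rapid reschedules within the same second stay at level 5 (suppressed here).
    for (int i = 0; i < 1000; ++i) {
        scheduler.cancelAndReschedule(/*wasActive=*/true, /*willReschedule=*/true);
    }
    return 0;
}

Built with any C++17 compiler, this prints the initial level-4 "Scheduling" line and
then at most one level-4 line per second, which is the visibility compromise the
patch aims for on every replication batch.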
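The patch also makes the scheduled callback clear its own handle under the mutex,
comparing the stored handle against cbData.myHandle, so the next cancel-and-reschedule
pass can tell that the timeout already fired and need not log a spurious
"Canceling ..." line. Below is a minimal sketch of that self-clearing-handle idea,
again with hypothetical stand-ins: CallbackHandle is modeled as a shared_ptr, and
'pending' stands in for the executor's queued work.

#include <functional>
#include <iostream>
#include <memory>

// Hypothetical stand-in for an executor's CallbackHandle: two handles compare
// equal only when they refer to the same scheduled callback.
using CallbackHandle = std::shared_ptr<int>;

struct TimeoutScheduler {
    CallbackHandle activeHandle;    // analogous to _handleElectionTimeoutCbh
    std::function<void()> pending;  // the scheduled work, run by a fake executor

    void schedule() {
        auto handle = std::make_shared<int>(0);
        activeHandle = handle;
        pending = [this, handle] {
            // As in the patch's lambda: if this callback is still the
            // registered one, clear the stored handle so a later
            // cancel-and-reschedule pass knows the timeout already fired.
            if (activeHandle == handle) {
                activeHandle = nullptr;
            }
            std::cout << "election timeout fired\n";
        };
    }

    void cancelAndReschedule() {
        if (activeHandle) {
            std::cout << "canceling outstanding timeout\n";
            pending = nullptr;  // "cancel" the queued work
        } else {
            std::cout << "no outstanding timeout; scheduling fresh\n";
        }
        schedule();
    }
};

int main() {
    TimeoutScheduler s;
    s.schedule();
    s.pending();              // the timeout fires and clears its own handle
    s.cancelAndReschedule();  // sees no active handle: schedules, no cancel log
    s.cancelAndReschedule();  // sees an active handle: cancels, then reschedules
    return 0;
}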