author     Tess Avitabile <tess.avitabile@mongodb.com>   2018-07-25 16:59:49 -0400
committer  Tess Avitabile <tess.avitabile@mongodb.com>   2018-07-27 15:17:45 -0400
commit     b4549a63371f9b108e8d7042731c9f1eec622004 (patch)
tree       d3c7208aae0efe324b4301680c20bf872946eecb /src/mongo/db
parent     2f358ee8e4ca9f6b5a92e4face04246117dea8c4 (diff)
download   mongo-b4549a63371f9b108e8d7042731c9f1eec622004.tar.gz
SERVER-32906 Improve logging around elections
Diffstat (limited to 'src/mongo/db')
-rw-r--r--  src/mongo/db/repl/check_quorum_for_config_change.cpp           9
-rw-r--r--  src/mongo/db/repl/repl_set_request_votes_args.cpp             12
-rw-r--r--  src/mongo/db/repl/repl_set_request_votes_args.h                2
-rw-r--r--  src/mongo/db/repl/replication_coordinator_impl.cpp             9
-rw-r--r--  src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp    3
-rw-r--r--  src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp  46
-rw-r--r--  src/mongo/db/repl/scatter_gather_runner.cpp                   16
-rw-r--r--  src/mongo/db/repl/scatter_gather_runner.h                      9
-rw-r--r--  src/mongo/db/repl/scatter_gather_test.cpp                     22
-rw-r--r--  src/mongo/db/repl/topology_coordinator.cpp                    71
-rw-r--r--  src/mongo/db/repl/topology_coordinator.h                      10
-rw-r--r--  src/mongo/db/repl/vote_requester.cpp                           4
12 files changed, 156 insertions, 57 deletions
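
The change that recurs throughout the diff below is a per-file macro that routes election-related messages to the new kReplicationElection log component rather than the file's default component. A minimal sketch of the pattern (the two #define lines are verbatim from this commit; the usage line is illustrative and not compilable outside the server tree):

    #define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kReplication
    #define LOG_FOR_ELECTION(level) \
        MONGO_LOG_COMPONENT(level, ::mongo::logger::LogComponent::kReplicationElection)

    // log() and LOG(n) keep using the file's default component (kReplication);
    // LOG_FOR_ELECTION(n) tags the message with kReplicationElection, whose
    // verbosity can be raised or lowered independently of other components.
    LOG_FOR_ELECTION(0) << "Starting an election for a priority takeover";

Files that log elections exclusively (replication_coordinator_impl_elect_v1.cpp, vote_requester.cpp) instead switch their MONGO_LOG_DEFAULT_COMPONENT to kReplicationElection outright.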
diff --git a/src/mongo/db/repl/check_quorum_for_config_change.cpp b/src/mongo/db/repl/check_quorum_for_config_change.cpp
index 403f7f8303a..67204eed65c 100644
--- a/src/mongo/db/repl/check_quorum_for_config_change.cpp
+++ b/src/mongo/db/repl/check_quorum_for_config_change.cpp
@@ -280,9 +280,10 @@ bool QuorumChecker::hasReceivedSufficientResponses() const {
 Status checkQuorumGeneral(executor::TaskExecutor* executor,
                           const ReplSetConfig& rsConfig,
                           const int myIndex,
-                          long long term) {
+                          long long term,
+                          std::string logMessage) {
     auto checker = std::make_shared<QuorumChecker>(&rsConfig, myIndex, term);
-    ScatterGatherRunner runner(checker, executor);
+    ScatterGatherRunner runner(checker, executor, std::move(logMessage));
     Status status = runner.run();
     if (!status.isOK()) {
         return status;
@@ -296,7 +297,7 @@ Status checkQuorumForInitiate(executor::TaskExecutor* executor,
                               const int myIndex,
                               long long term) {
     invariant(rsConfig.getConfigVersion() == 1);
-    return checkQuorumGeneral(executor, rsConfig, myIndex, term);
+    return checkQuorumGeneral(executor, rsConfig, myIndex, term, "initiate quorum check");
 }
 
 Status checkQuorumForReconfig(executor::TaskExecutor* executor,
@@ -304,7 +305,7 @@ Status checkQuorumForReconfig(executor::TaskExecutor* executor,
                               const int myIndex,
                               long long term) {
     invariant(rsConfig.getConfigVersion() > 1);
-    return checkQuorumGeneral(executor, rsConfig, myIndex, term);
+    return checkQuorumGeneral(executor, rsConfig, myIndex, term, "reconfig quorum check");
 }
 
 }  // namespace repl
diff --git a/src/mongo/db/repl/repl_set_request_votes_args.cpp b/src/mongo/db/repl/repl_set_request_votes_args.cpp
index 7f170973483..691f7602dbf 100644
--- a/src/mongo/db/repl/repl_set_request_votes_args.cpp
+++ b/src/mongo/db/repl/repl_set_request_votes_args.cpp
@@ -132,6 +132,12 @@ void ReplSetRequestVotesArgs::addToBSON(BSONObjBuilder* builder) const {
     _lastDurableOpTime.append(builder, kLastDurableOpTimeFieldName);
 }
 
+std::string ReplSetRequestVotesArgs::toString() const {
+    BSONObjBuilder builder;
+    addToBSON(&builder);
+    return builder.done().toString();
+}
+
 Status ReplSetRequestVotesResponse::initialize(const BSONObj& argsObj) {
     auto status = bsonExtractIntegerField(argsObj, kTermFieldName, &_term);
     if (!status.isOK())
@@ -184,5 +190,11 @@ BSONObj ReplSetRequestVotesResponse::toBSON() const {
     return builder.obj();
 }
 
+std::string ReplSetRequestVotesResponse::toString() const {
+    BSONObjBuilder builder;
+    addToBSON(&builder);
+    return builder.done().toString();
+}
+
 }  // namespace repl
 }  // namespace mongo
diff --git a/src/mongo/db/repl/repl_set_request_votes_args.h b/src/mongo/db/repl/repl_set_request_votes_args.h
index 7f1cf3bc72e..cc8d3f9e270 100644
--- a/src/mongo/db/repl/repl_set_request_votes_args.h
+++ b/src/mongo/db/repl/repl_set_request_votes_args.h
@@ -50,6 +50,7 @@ public:
     bool isADryRun() const;
 
     void addToBSON(BSONObjBuilder* builder) const;
+    std::string toString() const;
 
 private:
     std::string _setName;  // Name of the replset.
@@ -75,6 +76,7 @@ public:
 
     void addToBSON(BSONObjBuilder* builder) const;
     BSONObj toBSON() const;
+    std::string toString() const;
 
 private:
     long long _term = -1;
diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp
index 7fa82e95e8b..51b7313efa9 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl.cpp
@@ -27,6 +27,8 @@
  */
 
 #define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kReplication
+#define LOG_FOR_ELECTION(level) \
+    MONGO_LOG_COMPONENT(level, ::mongo::logger::LogComponent::kReplicationElection)
 
 #include "mongo/platform/basic.h"
 
@@ -2703,6 +2705,13 @@ void ReplicationCoordinatorImpl::CatchupState::signalHeartbeatUpdate_inlock() {
     }
     log() << "Heartbeats updated catchup target optime to " << *targetOpTime;
+    log() << "Latest known optime per replica set member:";
+    auto opTimesPerMember = _repl->_topCoord->latestKnownOpTimeSinceHeartbeatRestartPerMember();
+    for (auto&& pair : opTimesPerMember) {
+        log() << "Member ID: " << pair.first
+              << ", latest known optime: " << (pair.second ? (*pair.second).toString() : "unknown");
+    }
+
     if (_waiter) {
         _repl->_opTimeWaiterList.remove_inlock(_waiter.get());
     }
diff --git a/src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp b/src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp
index cae89b0fe2a..f9d030458f2 100644
--- a/src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp
@@ -26,7 +26,7 @@
  * it in the license file.
  */
 
-#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kReplication
+#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kReplicationElection
 
 #include "mongo/platform/basic.h"
 
@@ -53,6 +53,7 @@ public:
         if (_dismissed) {
             return;
         }
+        log() << "Lost " << (_isDryRun ? "dry run " : "") << "election due to internal error";
        _replCoord->_topCoord->processLoseElection();
        _replCoord->_voteRequester.reset(nullptr);
        if (_isDryRun && _replCoord->_electionDryRunFinishedEvent.isValid()) {
diff --git a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
index a4bb3eae1ef..479881621eb 100644
--- a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
@@ -27,6 +27,8 @@
  */
 
 #define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kReplication
+#define LOG_FOR_ELECTION(level) \
+    MONGO_LOG_COMPONENT(level, ::mongo::logger::LogComponent::kReplicationElection)
 #define LOG_FOR_HEARTBEATS(level) \
     MONGO_LOG_COMPONENT(level, ::mongo::logger::LogComponent::kReplicationHeartbeats)
 
@@ -192,6 +194,7 @@ void ReplicationCoordinatorImpl::_handleHeartbeatResponse(
         // Postpone election timeout if we have a successful heartbeat response from the primary.
         if (hbResponse.hasState() && hbResponse.getState().primary() &&
             hbResponse.getTerm() == _topCoord->getTerm()) {
+            LOG_FOR_ELECTION(4) << "Postponing election timeout due to heartbeat from primary";
             _cancelAndRescheduleElectionTimeout_inlock();
         }
     } else {
@@ -270,7 +273,7 @@ stdx::unique_lock<stdx::mutex> ReplicationCoordinatorImpl::_handleHeartbeatRespo
             Milliseconds priorityTakeoverDelay = _rsConfig.getPriorityTakeoverDelay(_selfIndex);
             Milliseconds randomOffset = _getRandomizedElectionOffset_inlock();
             _priorityTakeoverWhen = _replExecutor->now() + priorityTakeoverDelay + randomOffset;
-            log() << "Scheduling priority takeover at " << _priorityTakeoverWhen;
+            LOG_FOR_ELECTION(0) << "Scheduling priority takeover at " << _priorityTakeoverWhen;
             _priorityTakeoverCbh = _scheduleWorkAt(
                 _priorityTakeoverWhen, [=](const mongo::executor::TaskExecutor::CallbackArgs&) {
                     _startElectSelfIfEligibleV1(
@@ -284,7 +287,7 @@ stdx::unique_lock<stdx::mutex> ReplicationCoordinatorImpl::_handleHeartbeatRespo
         if (!_catchupTakeoverCbh.isValid() && !_priorityTakeoverCbh.isValid()) {
             Milliseconds catchupTakeoverDelay = _rsConfig.getCatchUpTakeoverDelay();
             _catchupTakeoverWhen = _replExecutor->now() + catchupTakeoverDelay;
-            log() << "Scheduling catchup takeover at " << _catchupTakeoverWhen;
+            LOG_FOR_ELECTION(0) << "Scheduling catchup takeover at " << _catchupTakeoverWhen;
             _catchupTakeoverCbh = _scheduleWorkAt(
                 _catchupTakeoverWhen, [=](const mongo::executor::TaskExecutor::CallbackArgs&) {
                     _startElectSelfIfEligibleV1(
@@ -779,7 +782,7 @@ void ReplicationCoordinatorImpl::_cancelAndRescheduleElectionTimeout_inlock() {
     auto now = _replExecutor->now();
    auto when = now + _rsConfig.getElectionTimeoutPeriod() + randomOffset;
    invariant(when > now);
-    LOG(4) << "Scheduling election timeout callback at " << when;
+    LOG_FOR_ELECTION(4) << "Scheduling election timeout callback at " << when;
     _handleElectionTimeoutWhen = when;
     _handleElectionTimeoutCbh =
         _scheduleWorkAt(when, [=](const mongo::executor::TaskExecutor::CallbackArgs&) {
@@ -803,7 +806,7 @@ void ReplicationCoordinatorImpl::_startElectSelfIfEligibleV1(
         _cancelPriorityTakeover_inlock();
         _cancelAndRescheduleElectionTimeout_inlock();
         if (_inShutdown) {
-            log() << "Not starting an election, since we are shutting down";
+            LOG_FOR_ELECTION(0) << "Not starting an election, since we are shutting down";
             return;
         }
     }
@@ -812,24 +815,26 @@ void ReplicationCoordinatorImpl::_startElectSelfIfEligibleV1(
     if (!status.isOK()) {
         switch (reason) {
             case TopologyCoordinator::StartElectionReason::kElectionTimeout:
-                log() << "Not starting an election, since we are not electable due to: "
-                      << status.reason();
+                LOG_FOR_ELECTION(0)
+                    << "Not starting an election, since we are not electable due to: "
+                    << status.reason();
                 break;
             case TopologyCoordinator::StartElectionReason::kPriorityTakeover:
-                log() << "Not starting an election for a priority takeover, "
-                      << "since we are not electable due to: " << status.reason();
+                LOG_FOR_ELECTION(0) << "Not starting an election for a priority takeover, "
+                                    << "since we are not electable due to: " << status.reason();
                 break;
             case TopologyCoordinator::StartElectionReason::kStepUpRequest:
-                log() << "Not starting an election for a replSetStepUp request, "
-                      << "since we are not electable due to: " << status.reason();
+                LOG_FOR_ELECTION(0) << "Not starting an election for a replSetStepUp request, "
+                                    << "since we are not electable due to: " << status.reason();
                 break;
             case TopologyCoordinator::StartElectionReason::kCatchupTakeover:
-                log() << "Not starting an election for a catchup takeover, "
-                      << "since we are not electable due to: " << status.reason();
+                LOG_FOR_ELECTION(0) << "Not starting an election for a catchup takeover, "
+                                    << "since we are not electable due to: " << status.reason();
                 break;
             case TopologyCoordinator::StartElectionReason::kSingleNodePromptElection:
-                log() << "Not starting an election for a single node replica set prompt election, "
-                      << "since we are not electable due to: " << status.reason();
+                LOG_FOR_ELECTION(0)
+                    << "Not starting an election for a single node replica set prompt election, "
+                    << "since we are not electable due to: " << status.reason();
                 break;
         }
         return;
@@ -837,20 +842,21 @@ void ReplicationCoordinatorImpl::_startElectSelfIfEligibleV1(
 
     switch (reason) {
         case TopologyCoordinator::StartElectionReason::kElectionTimeout:
-            log() << "Starting an election, since we've seen no PRIMARY in the past "
-                  << _rsConfig.getElectionTimeoutPeriod();
+            LOG_FOR_ELECTION(0) << "Starting an election, since we've seen no PRIMARY in the past "
+                                << _rsConfig.getElectionTimeoutPeriod();
             break;
         case TopologyCoordinator::StartElectionReason::kPriorityTakeover:
-            log() << "Starting an election for a priority takeover";
+            LOG_FOR_ELECTION(0) << "Starting an election for a priority takeover";
             break;
        case TopologyCoordinator::StartElectionReason::kStepUpRequest:
-            log() << "Starting an election due to step up request";
+            LOG_FOR_ELECTION(0) << "Starting an election due to step up request";
             break;
        case TopologyCoordinator::StartElectionReason::kCatchupTakeover:
-            log() << "Starting an election for a catchup takeover";
+            LOG_FOR_ELECTION(0) << "Starting an election for a catchup takeover";
             break;
        case TopologyCoordinator::StartElectionReason::kSingleNodePromptElection:
-            log() << "Starting an election due to single node replica set prompt election";
+            LOG_FOR_ELECTION(0)
+                << "Starting an election due to single node replica set prompt election";
             break;
     }
diff --git a/src/mongo/db/repl/scatter_gather_runner.cpp b/src/mongo/db/repl/scatter_gather_runner.cpp
index 51430847968..bcb92216794 100644
--- a/src/mongo/db/repl/scatter_gather_runner.cpp
+++ b/src/mongo/db/repl/scatter_gather_runner.cpp
@@ -26,6 +26,8 @@
  * it in the license file.
  */
 
+#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kReplication
+
 #include "mongo/platform/basic.h"
 
 #include "mongo/db/repl/scatter_gather_runner.h"
@@ -36,6 +38,7 @@
 #include "mongo/db/repl/scatter_gather_algorithm.h"
 #include "mongo/stdx/functional.h"
 #include "mongo/util/assert_util.h"
+#include "mongo/util/log.h"
 #include "mongo/util/scopeguard.h"
 
 namespace mongo {
@@ -49,8 +52,10 @@ using RemoteCommandCallbackArgs = executor::TaskExecutor::RemoteCommandCallbackA
 using RemoteCommandCallbackFn = executor::TaskExecutor::RemoteCommandCallbackFn;
 
 ScatterGatherRunner::ScatterGatherRunner(std::shared_ptr<ScatterGatherAlgorithm> algorithm,
-                                         executor::TaskExecutor* executor)
-    : _executor(executor), _impl(std::make_shared<RunnerImpl>(std::move(algorithm), executor)) {}
+                                         executor::TaskExecutor* executor,
+                                         std::string logMessage)
+    : _executor(executor),
+      _impl(std::make_shared<RunnerImpl>(std::move(algorithm), executor, std::move(logMessage))) {}
 
 Status ScatterGatherRunner::run() {
     auto finishEvh = start();
@@ -80,8 +85,9 @@ void ScatterGatherRunner::cancel() {
  * Scatter gather runner implementation.
  */
 ScatterGatherRunner::RunnerImpl::RunnerImpl(std::shared_ptr<ScatterGatherAlgorithm> algorithm,
-                                            executor::TaskExecutor* executor)
-    : _executor(executor), _algorithm(std::move(algorithm)) {}
+                                            executor::TaskExecutor* executor,
+                                            std::string logMessage)
+    : _executor(executor), _algorithm(std::move(algorithm)), _logMessage(std::move(logMessage)) {}
 
 StatusWith<EventHandle> ScatterGatherRunner::RunnerImpl::start(
     const RemoteCommandCallbackFn processResponseCB) {
@@ -98,6 +104,8 @@ StatusWith<EventHandle> ScatterGatherRunner::RunnerImpl::start(
 
     std::vector<RemoteCommandRequest> requests = _algorithm->getRequests();
     for (size_t i = 0; i < requests.size(); ++i) {
+        log() << "Scheduling remote command request for " << _logMessage << ": "
+              << requests[i].toString();
         const StatusWith<CallbackHandle> cbh =
             _executor->scheduleRemoteCommand(requests[i], processResponseCB);
         if (cbh.getStatus() == ErrorCodes::ShutdownInProgress) {
diff --git a/src/mongo/db/repl/scatter_gather_runner.h b/src/mongo/db/repl/scatter_gather_runner.h
index ce12c99b1a8..8f98e673e72 100644
--- a/src/mongo/db/repl/scatter_gather_runner.h
+++ b/src/mongo/db/repl/scatter_gather_runner.h
@@ -56,9 +56,12 @@ public:
      *
      * "executor" must remain in scope until the runner's destructor completes.
      * "algorithm" is shared between the runner and the caller.
+     * "logMessage" is the process for which this ScatterGatherRunner is used. It will be included
+     * in log lines written by the ScatterGatherRunner for remote command requests.
      */
     explicit ScatterGatherRunner(std::shared_ptr<ScatterGatherAlgorithm> algorithm,
-                                 executor::TaskExecutor* executor);
+                                 executor::TaskExecutor* executor,
+                                 std::string logMessage);
 
     /**
      * Runs the scatter-gather process and blocks until it completes.
@@ -94,7 +97,8 @@ private:
     class RunnerImpl {
     public:
         explicit RunnerImpl(std::shared_ptr<ScatterGatherAlgorithm> algorithm,
-                            executor::TaskExecutor* executor);
+                            executor::TaskExecutor* executor,
+                            std::string logMessage);
 
         /**
          * On success, returns an event handle that will be signaled when the runner has
@@ -125,6 +129,7 @@ private:
 
         executor::TaskExecutor* _executor;  // Not owned here.
         std::shared_ptr<ScatterGatherAlgorithm> _algorithm;
+        std::string _logMessage;
         executor::TaskExecutor::EventHandle _sufficientResponsesReceived;
         std::vector<executor::TaskExecutor::CallbackHandle> _callbacks;
         bool _started = false;
diff --git a/src/mongo/db/repl/scatter_gather_test.cpp b/src/mongo/db/repl/scatter_gather_test.cpp
index 6b7896fd2bd..1d5a32b4895 100644
--- a/src/mongo/db/repl/scatter_gather_test.cpp
+++ b/src/mongo/db/repl/scatter_gather_test.cpp
@@ -153,7 +153,7 @@ executor::TaskExecutor::CallbackFn getOnCompletionTestFunction(bool* ran) {
 // completed.
 TEST_F(ScatterGatherTest, DeleteAlgorithmAfterItHasCompleted) {
     auto sga = std::make_shared<ScatterGatherTestAlgorithm>();
-    ScatterGatherRunner* sgr = new ScatterGatherRunner(sga, &getExecutor());
+    ScatterGatherRunner* sgr = new ScatterGatherRunner(sga, &getExecutor(), "test");
     bool ranCompletion = false;
     StatusWith<executor::TaskExecutor::EventHandle> status = sgr->start();
     ASSERT_OK(getExecutor()
@@ -195,7 +195,7 @@ TEST_F(ScatterGatherTest, DeleteAlgorithmAfterItHasCompleted) {
 
 TEST_F(ScatterGatherTest, DeleteAlgorithmBeforeItCompletes) {
     auto sga = std::make_shared<ScatterGatherTestAlgorithm>();
-    ScatterGatherRunner* sgr = new ScatterGatherRunner(sga, &getExecutor());
+    ScatterGatherRunner* sgr = new ScatterGatherRunner(sga, &getExecutor(), "test");
     bool ranCompletion = false;
     StatusWith<executor::TaskExecutor::EventHandle> status = sgr->start();
     ASSERT_OK(status.getStatus());
@@ -236,7 +236,7 @@ TEST_F(ScatterGatherTest, DeleteAlgorithmBeforeItCompletes) {
 
 TEST_F(ScatterGatherTest, DeleteAlgorithmAfterCancel) {
     auto sga = std::make_shared<ScatterGatherTestAlgorithm>();
-    ScatterGatherRunner* sgr = new ScatterGatherRunner(sga, &getExecutor());
+    ScatterGatherRunner* sgr = new ScatterGatherRunner(sga, &getExecutor(), "test");
     bool ranCompletion = false;
     StatusWith<executor::TaskExecutor::EventHandle> status = sgr->start();
     ASSERT_OK(status.getStatus());
@@ -274,7 +274,7 @@ TEST_F(ScatterGatherTest, DeleteAlgorithmAfterCancel) {
 // to return ErrorCodes::ShutdownInProgress.
 TEST_F(ScatterGatherTest, ShutdownExecutorBeforeRun) {
     auto sga = std::make_shared<ScatterGatherTestAlgorithm>();
-    ScatterGatherRunner sgr(sga, &getExecutor());
+    ScatterGatherRunner sgr(sga, &getExecutor(), "test");
     shutdownExecutorThread();
     sga->finish();
     Status status = sgr.run();
@@ -285,7 +285,7 @@ TEST_F(ScatterGatherTest, ShutdownExecutorBeforeRun) {
 // finishes will cause run() to return Status::OK().
 TEST_F(ScatterGatherTest, ShutdownExecutorAfterRun) {
     auto sga = std::make_shared<ScatterGatherTestAlgorithm>();
-    ScatterGatherRunner sgr(sga, &getExecutor());
+    ScatterGatherRunner sgr(sga, &getExecutor(), "test");
     ScatterGatherRunnerRunner sgrr(&sgr, &getExecutor());
     sgrr.run();
     // need to wait for the scatter-gather to be scheduled in the executor
@@ -307,7 +307,7 @@ TEST_F(ScatterGatherTest, ShutdownExecutorAfterRun) {
 // to return ErrorCodes::ShutdownInProgress and should not run onCompletion().
 TEST_F(ScatterGatherTest, ShutdownExecutorBeforeStart) {
     auto sga = std::make_shared<ScatterGatherTestAlgorithm>();
-    ScatterGatherRunner sgr(sga, &getExecutor());
+    ScatterGatherRunner sgr(sga, &getExecutor(), "test");
     shutdownExecutorThread();
     bool ranCompletion = false;
     StatusWith<executor::TaskExecutor::EventHandle> status = sgr.start();
@@ -320,7 +320,7 @@ TEST_F(ScatterGatherTest, ShutdownExecutorBeforeStart) {
 // to return Status::OK and should not run onCompletion().
 TEST_F(ScatterGatherTest, ShutdownExecutorAfterStart) {
     auto sga = std::make_shared<ScatterGatherTestAlgorithm>();
-    ScatterGatherRunner sgr(sga, &getExecutor());
+    ScatterGatherRunner sgr(sga, &getExecutor(), "test");
     bool ranCompletion = false;
     StatusWith<executor::TaskExecutor::EventHandle> status = sgr.start();
     ASSERT_OK(getExecutor()
@@ -335,7 +335,7 @@ TEST_F(ScatterGatherTest, ShutdownExecutorAfterStart) {
 // Confirm that responses are not processed once sufficient responses have been received.
 TEST_F(ScatterGatherTest, DoNotProcessMoreThanSufficientResponses) {
     auto sga = std::make_shared<ScatterGatherTestAlgorithm>();
-    ScatterGatherRunner sgr(sga, &getExecutor());
+    ScatterGatherRunner sgr(sga, &getExecutor(), "test");
     bool ranCompletion = false;
     StatusWith<executor::TaskExecutor::EventHandle> status = sgr.start();
     ASSERT_OK(getExecutor()
@@ -378,7 +378,7 @@ TEST_F(ScatterGatherTest, DoNotProcessMoreThanSufficientResponses) {
 // and that the algorithm processes the response correctly.
 TEST_F(ScatterGatherTest, AlgorithmProcessesCallbackCanceledResponse) {
     auto sga = std::make_shared<ScatterGatherTestAlgorithm>();
-    ScatterGatherRunner sgr(sga, &getExecutor());
+    ScatterGatherRunner sgr(sga, &getExecutor(), "test");
     bool ranCompletion = false;
     StatusWith<executor::TaskExecutor::EventHandle> status = sgr.start();
     ASSERT_OK(getExecutor()
@@ -422,7 +422,7 @@ TEST_F(ScatterGatherTest, DoNotCreateCallbacksIfHasSufficientResponsesReturnsTru
     auto sga = std::make_shared<ScatterGatherTestAlgorithm>();
     // set hasReceivedSufficientResponses to return true before the run starts
     sga->finish();
-    ScatterGatherRunner sgr(sga, &getExecutor());
+    ScatterGatherRunner sgr(sga, &getExecutor(), "test");
     bool ranCompletion = false;
     StatusWith<executor::TaskExecutor::EventHandle> status = sgr.start();
     ASSERT_OK(getExecutor()
@@ -485,7 +485,7 @@ TEST_F(ScatterGatherTest, DoNotCreateCallbacksIfHasSufficientResponsesReturnsTru
 // Confirm that running via run() will finish once sufficient responses have been received.
 TEST_F(ScatterGatherTest, SuccessfulScatterGatherViaRun) {
     auto sga = std::make_shared<ScatterGatherTestAlgorithm>();
-    ScatterGatherRunner sgr(sga, &getExecutor());
+    ScatterGatherRunner sgr(sga, &getExecutor(), "test");
     ScatterGatherRunnerRunner sgrr(&sgr, &getExecutor());
     sgrr.run();
diff --git a/src/mongo/db/repl/topology_coordinator.cpp b/src/mongo/db/repl/topology_coordinator.cpp
index 85f6c4be236..03a7df790ce 100644
--- a/src/mongo/db/repl/topology_coordinator.cpp
+++ b/src/mongo/db/repl/topology_coordinator.cpp
@@ -27,6 +27,8 @@
  */
 
 #define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kReplication
+#define LOG_FOR_ELECTION(level) \
+    MONGO_LOG_COMPONENT(level, ::mongo::logger::LogComponent::kReplicationElection)
 
 #include "mongo/platform/basic.h"
 
@@ -1096,21 +1098,24 @@ HeartbeatResponseAction TopologyCoordinator::_updatePrimaryFromHBDataV1(
         if (!catchupTakeoverDisabled &&
             (_memberData.at(primaryIndex).getLastAppliedOpTime() <
              _memberData.at(_selfIndex).getLastAppliedOpTime())) {
-            LOG(2) << "I can take over the primary due to fresher data."
-                   << " Current primary index: " << primaryIndex << " in term "
-                   << _memberData.at(primaryIndex).getTerm() << "."
-                   << " Current primary optime: "
-                   << _memberData.at(primaryIndex).getLastAppliedOpTime()
-                   << " My optime: " << _memberData.at(_selfIndex).getLastAppliedOpTime();
+            LOG_FOR_ELECTION(2) << "I can take over the primary due to fresher data."
+                                << " Current primary index: " << primaryIndex << " in term "
+                                << _memberData.at(primaryIndex).getTerm() << "."
+                                << " Current primary optime: "
+                                << _memberData.at(primaryIndex).getLastAppliedOpTime()
+                                << " My optime: "
+                                << _memberData.at(_selfIndex).getLastAppliedOpTime();
+            LOG_FOR_ELECTION(4) << _getReplSetStatusString();
 
             scheduleCatchupTakeover = true;
         }
         if (_rsConfig.getMemberAt(primaryIndex).getPriority() <
             _rsConfig.getMemberAt(_selfIndex).getPriority()) {
-            LOG(2) << "I can take over the primary due to higher priority."
-                   << " Current primary index: " << primaryIndex << " in term "
-                   << _memberData.at(primaryIndex).getTerm();
+            LOG_FOR_ELECTION(2) << "I can take over the primary due to higher priority."
+                                << " Current primary index: " << primaryIndex << " in term "
+                                << _memberData.at(primaryIndex).getTerm();
+            LOG_FOR_ELECTION(4) << _getReplSetStatusString();
 
             schedulePriorityTakeover = true;
         }
@@ -1124,10 +1129,11 @@ HeartbeatResponseAction TopologyCoordinator::_updatePrimaryFromHBDataV1(
         // Otherwise, prefer to schedule a catchup takeover over a priority takeover
         if (scheduleCatchupTakeover && schedulePriorityTakeover &&
             _rsConfig.calculatePriorityRank(currentNodePriority) == 0) {
-            LOG(2) << "I can take over the primary because I have a higher priority, the highest "
-                   << "priority in the replica set, and fresher data."
-                   << " Current primary index: " << primaryIndex << " in term "
-                   << _memberData.at(primaryIndex).getTerm();
+            LOG_FOR_ELECTION(2)
+                << "I can take over the primary because I have a higher priority, the highest "
+                << "priority in the replica set, and fresher data."
+                << " Current primary index: " << primaryIndex << " in term "
+                << _memberData.at(primaryIndex).getTerm();
             return HeartbeatResponseAction::makePriorityTakeoverAction();
         }
         if (scheduleCatchupTakeover) {
@@ -1368,6 +1374,19 @@ const MemberConfig* TopologyCoordinator::_currentPrimaryMember() const {
     return &(_rsConfig.getMemberAt(_currentPrimaryIndex));
 }
 
+std::string TopologyCoordinator::_getReplSetStatusString() {
+    // Construct a ReplSetStatusArgs using default parameters. Missing parameters will not be
+    // included in the status string.
+    ReplSetStatusArgs rsStatusArgs{Date_t::now(), 0U, OpTime(), BSONObj(), boost::none};
+    BSONObjBuilder builder;
+    Status result(ErrorCodes::InternalError, "didn't set status in prepareStatusResponse");
+    prepareStatusResponse(rsStatusArgs, &builder, &result);
+    if (!result.isOK()) {
+        return str::stream() << "Could not get replSetGetStatus output: " << result.toString();
+    }
+    return str::stream() << "Current replSetGetStatus output: " << builder.done().toString();
+}
+
 void TopologyCoordinator::prepareStatusResponse(const ReplSetStatusArgs& rsStatusArgs,
                                                 BSONObjBuilder* response,
                                                 Status* result) {
@@ -2652,6 +2671,10 @@ void TopologyCoordinator::processReplSetRequestVotes(const ReplSetRequestVotesAr
             response->setVoteGranted(true);
         }
     }
+
+    LOG_FOR_ELECTION(0) << "Received vote request: " << args.toString();
+    LOG_FOR_ELECTION(0) << "Sending vote response: " << response->toString();
+    LOG_FOR_ELECTION(4) << _getReplSetStatusString();
 }
 
 void TopologyCoordinator::loadLastVote(const LastVote& lastVote) {
@@ -2725,5 +2748,27 @@ boost::optional<OpTime> TopologyCoordinator::latestKnownOpTimeSinceHeartbeatRest
     return latest;
 }
 
+std::map<int, boost::optional<OpTime>>
+TopologyCoordinator::latestKnownOpTimeSinceHeartbeatRestartPerMember() const {
+    std::map<int, boost::optional<OpTime>> opTimesPerMember;
+    for (size_t i = 0; i < _memberData.size(); i++) {
+        auto& member = _memberData[i];
+        int memberId = _rsConfig.getMemberAt(i).getId();
+
+        if (!member.isUpdatedSinceRestart()) {
+            opTimesPerMember[memberId] = boost::none;
+            continue;
+        }
+
+        if (!member.up()) {
+            opTimesPerMember[memberId] = boost::none;
+            continue;
+        }
+
+        opTimesPerMember[memberId] = member.getHeartbeatAppliedOpTime();
+    }
+    return opTimesPerMember;
+}
+
 }  // namespace repl
 }  // namespace mongo
diff --git a/src/mongo/db/repl/topology_coordinator.h b/src/mongo/db/repl/topology_coordinator.h
index 361aa534ff2..184ba8ede52 100644
--- a/src/mongo/db/repl/topology_coordinator.h
+++ b/src/mongo/db/repl/topology_coordinator.h
@@ -691,6 +691,13 @@ public:
      */
     boost::optional<OpTime> latestKnownOpTimeSinceHeartbeatRestart() const;
 
+    /**
+     * Similar to latestKnownOpTimeSinceHeartbeatRestart(), but returns the latest known optime for
+     * each member in the config. If the member is not up or hasn't responded to a heartbeat since
+     * we last restarted, then its value will be boost::none.
+     */
+    std::map<int, boost::optional<OpTime>> latestKnownOpTimeSinceHeartbeatRestartPerMember() const;
+
     ////////////////////////////////////////////////////////////
     //
     // Test support methods
@@ -845,6 +852,9 @@ private:
      */
     bool _isElectableNodeInSingleNodeReplicaSet() const;
 
+    // Returns a string representation of the current replica set status for logging purposes.
+    std::string _getReplSetStatusString();
+
     // This node's role in the replication protocol.
     Role _role;
diff --git a/src/mongo/db/repl/vote_requester.cpp b/src/mongo/db/repl/vote_requester.cpp
index aed7cca6049..b22afc59068 100644
--- a/src/mongo/db/repl/vote_requester.cpp
+++ b/src/mongo/db/repl/vote_requester.cpp
@@ -26,7 +26,7 @@
  * it in the license file.
  */
 
-#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kReplication
+#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kReplicationElection
 
 #include "mongo/platform/basic.h"
 
@@ -188,7 +188,7 @@ StatusWith<executor::TaskExecutor::EventHandle> VoteRequester::start(
     int primaryIndex) {
     _algorithm = std::make_shared<Algorithm>(
         rsConfig, candidateIndex, term, dryRun, lastDurableOpTime, primaryIndex);
-    _runner = stdx::make_unique<ScatterGatherRunner>(_algorithm, executor);
+    _runner = stdx::make_unique<ScatterGatherRunner>(_algorithm, executor, "vote request");
     return _runner->start();
 }
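
For reference, a minimal sketch of how the new three-argument ScatterGatherRunner constructor is meant to be called, mirroring the vote_requester.cpp hunk above (`algorithm` and `executor` are stand-ins for whatever the caller already holds, not names from the tree):

    // The third argument labels this runner's remote command requests in the
    // log; each scheduled request produces a line of the form
    //   "Scheduling remote command request for vote request: <request>"
    auto runner = stdx::make_unique<ScatterGatherRunner>(algorithm, executor, "vote request");
    auto evh = runner->start();  // schedules one request per target, logging each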