Diffstat (limited to 'src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp')
-rw-r--r--  src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp  203
 1 file changed, 139 insertions(+), 64 deletions(-)
diff --git a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
index 3b928f80df0..4a8c1ce7721 100644
--- a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp
@@ -27,11 +27,13 @@
* it in the license file.
*/
-#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kReplication
-#define LOG_FOR_ELECTION(level) \
- MONGO_LOG_COMPONENT(level, ::mongo::logger::LogComponent::kReplicationElection)
-#define LOG_FOR_HEARTBEATS(level) \
- MONGO_LOG_COMPONENT(level, ::mongo::logger::LogComponent::kReplicationHeartbeats)
+#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kReplication
+#define LOGV2_FOR_ELECTION(ID, DLEVEL, MESSAGE, ...) \
+ LOGV2_DEBUG_OPTIONS( \
+ ID, DLEVEL, {logv2::LogComponent::kReplicationElection}, MESSAGE, ##__VA_ARGS__)
+#define LOGV2_FOR_HEARTBEATS(ID, DLEVEL, MESSAGE, ...) \
+ LOGV2_DEBUG_OPTIONS( \
+ ID, DLEVEL, {logv2::LogComponent::kReplicationHeartbeats}, MESSAGE, ##__VA_ARGS__)
#include "mongo/platform/basic.h"
@@ -60,7 +62,6 @@
#include "mongo/rpc/metadata/repl_set_metadata.h"
#include "mongo/util/assert_util.h"
#include "mongo/util/fail_point.h"
-#include "mongo/util/log.h"
#include "mongo/util/str.h"
#include "mongo/util/time_support.h"
@@ -117,16 +118,23 @@ void ReplicationCoordinatorImpl::_doMemberHeartbeat(executor::TaskExecutor::Call
return _handleHeartbeatResponse(cbData, targetIndex);
};
- LOG_FOR_HEARTBEATS(2) << "Sending heartbeat (requestId: " << request.id << ") to " << target
- << ", " << heartbeatObj;
+ LOGV2_FOR_HEARTBEATS(4615670,
+ 2,
+ "Sending heartbeat (requestId: {request_id}) to {target} {heartbeatObj}",
+ "request_id"_attr = request.id,
+ "target"_attr = target,
+ "heartbeatObj"_attr = heartbeatObj);
_trackHeartbeatHandle_inlock(_replExecutor->scheduleRemoteCommand(request, callback));
}
void ReplicationCoordinatorImpl::_scheduleHeartbeatToTarget_inlock(const HostAndPort& target,
int targetIndex,
Date_t when) {
- LOG_FOR_HEARTBEATS(2) << "Scheduling heartbeat to " << target << " at "
- << dateToISOStringUTC(when);
+ LOGV2_FOR_HEARTBEATS(4615618,
+ 2,
+ "Scheduling heartbeat to {target} at {dateToISOStringUTC_when}",
+ "target"_attr = target,
+ "dateToISOStringUTC_when"_attr = dateToISOStringUTC(when));
_trackHeartbeatHandle_inlock(_replExecutor->scheduleWorkAt(
when, [=](const executor::TaskExecutor::CallbackArgs& cbData) {
_doMemberHeartbeat(cbData, target, targetIndex);
@@ -146,8 +154,12 @@ void ReplicationCoordinatorImpl::_handleHeartbeatResponse(
const HostAndPort& target = cbData.request.target;
if (responseStatus == ErrorCodes::CallbackCanceled) {
- LOG_FOR_HEARTBEATS(2) << "Received response to heartbeat (requestId: " << cbData.request.id
- << ") from " << target << " but the heartbeat was cancelled.";
+ LOGV2_FOR_HEARTBEATS(4615619,
+ 2,
+ "Received response to heartbeat (requestId: {cbData_request_id}) from "
+ "{target} but the heartbeat was cancelled.",
+ "cbData_request_id"_attr = cbData.request.id,
+ "target"_attr = target);
return;
}
@@ -159,8 +171,13 @@ void ReplicationCoordinatorImpl::_handleHeartbeatResponse(
StatusWith<rpc::ReplSetMetadata> replMetadata =
rpc::ReplSetMetadata::readFromMetadata(cbData.response.data);
- LOG_FOR_HEARTBEATS(2) << "Received response to heartbeat (requestId: " << cbData.request.id
- << ") from " << target << ", " << resp;
+ LOGV2_FOR_HEARTBEATS(
+ 4615620,
+ 2,
+ "Received response to heartbeat (requestId: {cbData_request_id}) from {target}, {resp}",
+ "cbData_request_id"_attr = cbData.request.id,
+ "target"_attr = target,
+ "resp"_attr = resp);
// Reject heartbeat responses (and metadata) from nodes with mismatched replica set IDs.
// It is problematic to perform this check in the heartbeat reconfiguring logic because it
@@ -209,12 +226,18 @@ void ReplicationCoordinatorImpl::_handleHeartbeatResponse(
// Postpone election timeout if we have a successful heartbeat response from the primary.
if (hbResponse.hasState() && hbResponse.getState().primary() &&
hbResponse.getTerm() == _topCoord->getTerm()) {
- LOG_FOR_ELECTION(4) << "Postponing election timeout due to heartbeat from primary";
+ LOGV2_FOR_ELECTION(
+ 4615659, 4, "Postponing election timeout due to heartbeat from primary");
_cancelAndRescheduleElectionTimeout_inlock();
}
} else {
- LOG_FOR_HEARTBEATS(2) << "Error in heartbeat (requestId: " << cbData.request.id << ") to "
- << target << ", response status: " << responseStatus;
+ LOGV2_FOR_HEARTBEATS(4615621,
+ 2,
+ "Error in heartbeat (requestId: {cbData_request_id}) to {target}, "
+ "response status: {responseStatus}",
+ "cbData_request_id"_attr = cbData.request.id,
+ "target"_attr = target,
+ "responseStatus"_attr = responseStatus);
hbStatusResponse = StatusWith<ReplSetHeartbeatResponse>(responseStatus);
}
@@ -292,7 +315,10 @@ stdx::unique_lock<Latch> ReplicationCoordinatorImpl::_handleHeartbeatResponseAct
Milliseconds priorityTakeoverDelay = _rsConfig.getPriorityTakeoverDelay(_selfIndex);
Milliseconds randomOffset = _getRandomizedElectionOffset_inlock();
_priorityTakeoverWhen = _replExecutor->now() + priorityTakeoverDelay + randomOffset;
- LOG_FOR_ELECTION(0) << "Scheduling priority takeover at " << _priorityTakeoverWhen;
+ LOGV2_FOR_ELECTION(4615601,
+ 0,
+ "Scheduling priority takeover at {priorityTakeoverWhen}",
+ "priorityTakeoverWhen"_attr = _priorityTakeoverWhen);
_priorityTakeoverCbh = _scheduleWorkAt(
_priorityTakeoverWhen, [=](const mongo::executor::TaskExecutor::CallbackArgs&) {
_startElectSelfIfEligibleV1(StartElectionReasonEnum::kPriorityTakeover);
@@ -305,7 +331,10 @@ stdx::unique_lock<Latch> ReplicationCoordinatorImpl::_handleHeartbeatResponseAct
if (!_catchupTakeoverCbh.isValid() && !_priorityTakeoverCbh.isValid()) {
Milliseconds catchupTakeoverDelay = _rsConfig.getCatchUpTakeoverDelay();
_catchupTakeoverWhen = _replExecutor->now() + catchupTakeoverDelay;
- LOG_FOR_ELECTION(0) << "Scheduling catchup takeover at " << _catchupTakeoverWhen;
+ LOGV2_FOR_ELECTION(4615648,
+ 0,
+ "Scheduling catchup takeover at {catchupTakeoverWhen}",
+ "catchupTakeoverWhen"_attr = _catchupTakeoverWhen);
_catchupTakeoverCbh = _scheduleWorkAt(
_catchupTakeoverWhen, [=](const mongo::executor::TaskExecutor::CallbackArgs&) {
_startElectSelfIfEligibleV1(StartElectionReasonEnum::kCatchupTakeover);
@@ -456,15 +485,21 @@ void ReplicationCoordinatorImpl::_scheduleHeartbeatReconfig_inlock(const ReplSet
switch (_rsConfigState) {
case kConfigUninitialized:
case kConfigSteady:
- LOG_FOR_HEARTBEATS(1) << "Received new config via heartbeat with "
- << newConfig.getConfigVersionAndTerm();
+ LOGV2_FOR_HEARTBEATS(
+ 4615622,
+ 1,
+ "Received new config via heartbeat with {newConfig_getConfigVersionAndTerm}",
+ "newConfig_getConfigVersionAndTerm"_attr = newConfig.getConfigVersionAndTerm());
break;
case kConfigInitiating:
case kConfigReconfiguring:
case kConfigHBReconfiguring:
- LOG_FOR_HEARTBEATS(1) << "Ignoring new configuration with "
- << newConfig.getConfigVersionAndTerm()
- << " because already in the midst of a configuration process.";
+ LOGV2_FOR_HEARTBEATS(
+ 4615623,
+ 1,
+ "Ignoring new configuration with {newConfig_getConfigVersionAndTerm} because "
+ "already in the midst of a configuration process.",
+ "newConfig_getConfigVersionAndTerm"_attr = newConfig.getConfigVersionAndTerm());
return;
case kConfigPreStart:
case kConfigStartingUp:
@@ -479,9 +514,12 @@ void ReplicationCoordinatorImpl::_scheduleHeartbeatReconfig_inlock(const ReplSet
invariant(!_rsConfig.isInitialized() ||
_rsConfig.getConfigVersionAndTerm() < newConfig.getConfigVersionAndTerm());
if (auto electionFinishedEvent = _cancelElectionIfNeeded_inlock()) {
- LOG_FOR_HEARTBEATS(2) << "Rescheduling heartbeat reconfig to config with "
- << newConfig.getConfigVersionAndTerm()
- << " to be processed after election is cancelled.";
+ LOGV2_FOR_HEARTBEATS(
+ 4615624,
+ 2,
+ "Rescheduling heartbeat reconfig to config with {newConfig_getConfigVersionAndTerm} to "
+ "be processed after election is cancelled.",
+ "newConfig_getConfigVersionAndTerm"_attr = newConfig.getConfigVersionAndTerm());
_replExecutor
->onEvent(electionFinishedEvent,
@@ -519,8 +557,10 @@ void ReplicationCoordinatorImpl::_heartbeatReconfigStore(
// transitioning into the RS_REMOVED state. See SERVER-15740.
if (!_rsConfig.isInitialized()) {
invariant(_rsConfigState == kConfigHBReconfiguring);
- LOG_FOR_HEARTBEATS(1) << "Ignoring new configuration in heartbeat response because we "
- "are uninitialized and not a member of the new configuration";
+ LOGV2_FOR_HEARTBEATS(4615625,
+ 1,
+ "Ignoring new configuration in heartbeat response because we "
+ "are uninitialized and not a member of the new configuration");
_setConfigState_inlock(kConfigUninitialized);
return;
}
@@ -533,8 +573,12 @@ void ReplicationCoordinatorImpl::_heartbeatReconfigStore(
"it is invalid: {myIndex_getStatus}",
"myIndex_getStatus"_attr = myIndex.getStatus());
} else {
- LOG_FOR_HEARTBEATS(2) << "Config with " << newConfig.getConfigVersionAndTerm()
- << " validated for reconfig; persisting to disk.";
+ LOGV2_FOR_HEARTBEATS(4615626,
+ 2,
+ "Config with {newConfig_getConfigVersionAndTerm} validated for "
+ "reconfig; persisting to disk.",
+ "newConfig_getConfigVersionAndTerm"_attr =
+ newConfig.getConfigVersionAndTerm());
auto opCtx = cc().makeOperationContext();
auto status = _externalState->storeLocalConfigDocument(opCtx.get(), newConfig.toBSON());
@@ -571,8 +615,12 @@ void ReplicationCoordinatorImpl::_heartbeatReconfigStore(
shouldStartDataReplication = true;
}
- LOG_FOR_HEARTBEATS(2) << "New configuration with " << newConfig.getConfigVersionAndTerm()
- << " persisted to local storage; installing new config in memory";
+ LOGV2_FOR_HEARTBEATS(4615627,
+ 2,
+ "New configuration with {newConfig_getConfigVersionAndTerm} persisted "
+ "to local storage; installing new config in memory",
+ "newConfig_getConfigVersionAndTerm"_attr =
+ newConfig.getConfigVersionAndTerm());
}
_heartbeatReconfigFinish(cbd, newConfig, myIndex);
@@ -595,8 +643,10 @@ void ReplicationCoordinatorImpl::_heartbeatReconfigFinish(
}
if (MONGO_unlikely(blockHeartbeatReconfigFinish.shouldFail())) {
- LOG_FOR_HEARTBEATS(0) << "blockHeartbeatReconfigFinish fail point enabled. Rescheduling "
- "_heartbeatReconfigFinish until fail point is disabled.";
+ LOGV2_FOR_HEARTBEATS(4615628,
+ 0,
+ "blockHeartbeatReconfigFinish fail point enabled. Rescheduling "
+ "_heartbeatReconfigFinish until fail point is disabled.");
_replExecutor
->scheduleWorkAt(_replExecutor->now() + Milliseconds{10},
[=](const executor::TaskExecutor::CallbackArgs& cbData) {
@@ -613,9 +663,12 @@ void ReplicationCoordinatorImpl::_heartbeatReconfigFinish(
{
stdx::lock_guard<Latch> lk(_mutex);
if (auto electionFinishedEvent = _cancelElectionIfNeeded_inlock()) {
- LOG_FOR_HEARTBEATS(0)
- << "Waiting for election to complete before finishing reconfig to config with "
- << newConfig.getConfigVersionAndTerm();
+ LOGV2_FOR_HEARTBEATS(4615629,
+ 0,
+ "Waiting for election to complete before finishing reconfig to "
+ "config with {newConfig_getConfigVersionAndTerm}",
+ "newConfig_getConfigVersionAndTerm"_attr =
+ newConfig.getConfigVersionAndTerm());
// Wait for the election to complete and the node's Role to be set to follower.
_replExecutor
->onEvent(electionFinishedEvent,
@@ -728,7 +781,7 @@ void ReplicationCoordinatorImpl::_untrackHeartbeatHandle_inlock(
}
void ReplicationCoordinatorImpl::_cancelHeartbeats_inlock() {
- LOG_FOR_HEARTBEATS(2) << "Cancelling all heartbeats.";
+ LOGV2_FOR_HEARTBEATS(4615630, 2, "Cancelling all heartbeats.");
for (const auto& handle : _heartbeatHandles) {
_replExecutor->cancel(handle);
@@ -874,8 +927,10 @@ void ReplicationCoordinatorImpl::_cancelAndRescheduleElectionTimeout_inlock() {
logThrottleTime = now;
}
if (wasActive) {
- LOG_FOR_ELECTION(cancelAndRescheduleLogLevel)
- << "Canceling election timeout callback at " << _handleElectionTimeoutWhen;
+ LOGV2_FOR_ELECTION(4615649,
+ logSeverityV1toV2(cancelAndRescheduleLogLevel).toInt(),
+ "Canceling election timeout callback at {handleElectionTimeoutWhen}",
+ "handleElectionTimeoutWhen"_attr = _handleElectionTimeoutWhen);
_replExecutor->cancel(_handleElectionTimeoutCbh);
_handleElectionTimeoutCbh = CallbackHandle();
_handleElectionTimeoutWhen = Date_t();
@@ -889,10 +944,13 @@ void ReplicationCoordinatorImpl::_cancelAndRescheduleElectionTimeout_inlock() {
invariant(when > now);
if (wasActive) {
// The log level here is 4 once per second, otherwise 5.
- LOG_FOR_ELECTION(cancelAndRescheduleLogLevel)
- << "Rescheduling election timeout callback at " << when;
+ LOGV2_FOR_ELECTION(4615650,
+ logSeverityV1toV2(cancelAndRescheduleLogLevel).toInt(),
+ "Rescheduling election timeout callback at {when}",
+ "when"_attr = when);
} else {
- LOG_FOR_ELECTION(4) << "Scheduling election timeout callback at " << when;
+ LOGV2_FOR_ELECTION(
+ 4615651, 4, "Scheduling election timeout callback at {when}", "when"_attr = when);
}
_handleElectionTimeoutWhen = when;
_handleElectionTimeoutCbh =
@@ -927,7 +985,7 @@ void ReplicationCoordinatorImpl::_startElectSelfIfEligibleV1(WithLock,
_cancelPriorityTakeover_inlock();
_cancelAndRescheduleElectionTimeout_inlock();
if (_inShutdown) {
- LOG_FOR_ELECTION(0) << "Not starting an election, since we are shutting down";
+ LOGV2_FOR_ELECTION(4615654, 0, "Not starting an election, since we are shutting down");
return;
}
}
@@ -936,27 +994,40 @@ void ReplicationCoordinatorImpl::_startElectSelfIfEligibleV1(WithLock,
if (!status.isOK()) {
switch (reason) {
case StartElectionReasonEnum::kElectionTimeout:
- LOG_FOR_ELECTION(0)
- << "Not starting an election, since we are not electable due to: "
- << status.reason();
+ LOGV2_FOR_ELECTION(
+ 4615655,
+ 0,
+ "Not starting an election, since we are not electable due to: {status_reason}",
+ "status_reason"_attr = status.reason());
break;
case StartElectionReasonEnum::kPriorityTakeover:
- LOG_FOR_ELECTION(0) << "Not starting an election for a priority takeover, "
- << "since we are not electable due to: " << status.reason();
+ LOGV2_FOR_ELECTION(4615656,
+ 0,
+ "Not starting an election for a priority takeover, since we are "
+ "not electable due to: {status_reason}",
+ "status_reason"_attr = status.reason());
break;
case StartElectionReasonEnum::kStepUpRequest:
case StartElectionReasonEnum::kStepUpRequestSkipDryRun:
- LOG_FOR_ELECTION(0) << "Not starting an election for a replSetStepUp request, "
- << "since we are not electable due to: " << status.reason();
+ LOGV2_FOR_ELECTION(4615657,
+ 0,
+ "Not starting an election for a replSetStepUp request, since we "
+ "are not electable due to: {status_reason}",
+ "status_reason"_attr = status.reason());
break;
case StartElectionReasonEnum::kCatchupTakeover:
- LOG_FOR_ELECTION(0) << "Not starting an election for a catchup takeover, "
- << "since we are not electable due to: " << status.reason();
+ LOGV2_FOR_ELECTION(4615658,
+ 0,
+ "Not starting an election for a catchup takeover, since we are "
+ "not electable due to: {status_reason}",
+ "status_reason"_attr = status.reason());
break;
case StartElectionReasonEnum::kSingleNodePromptElection:
- LOG_FOR_ELECTION(0)
- << "Not starting an election for a single node replica set prompt election, "
- << "since we are not electable due to: " << status.reason();
+ LOGV2_FOR_ELECTION(4615653,
+ 0,
+ "Not starting an election for a single node replica set prompt "
+ "election, since we are not electable due to: {status_reason}",
+ "status_reason"_attr = status.reason());
break;
default:
MONGO_UNREACHABLE;
@@ -966,22 +1037,26 @@ void ReplicationCoordinatorImpl::_startElectSelfIfEligibleV1(WithLock,
switch (reason) {
case StartElectionReasonEnum::kElectionTimeout:
- LOG_FOR_ELECTION(0) << "Starting an election, since we've seen no PRIMARY in the past "
- << _rsConfig.getElectionTimeoutPeriod();
+ LOGV2_FOR_ELECTION(4615652,
+ 0,
+ "Starting an election, since we've seen no PRIMARY in the past "
+ "{rsConfig_getElectionTimeoutPeriod}",
+ "rsConfig_getElectionTimeoutPeriod"_attr =
+ _rsConfig.getElectionTimeoutPeriod());
break;
case StartElectionReasonEnum::kPriorityTakeover:
- LOG_FOR_ELECTION(0) << "Starting an election for a priority takeover";
+ LOGV2_FOR_ELECTION(4615660, 0, "Starting an election for a priority takeover");
break;
case StartElectionReasonEnum::kStepUpRequest:
case StartElectionReasonEnum::kStepUpRequestSkipDryRun:
- LOG_FOR_ELECTION(0) << "Starting an election due to step up request";
+ LOGV2_FOR_ELECTION(4615661, 0, "Starting an election due to step up request");
break;
case StartElectionReasonEnum::kCatchupTakeover:
- LOG_FOR_ELECTION(0) << "Starting an election for a catchup takeover";
+ LOGV2_FOR_ELECTION(4615662, 0, "Starting an election for a catchup takeover");
break;
case StartElectionReasonEnum::kSingleNodePromptElection:
- LOG_FOR_ELECTION(0)
- << "Starting an election due to single node replica set prompt election";
+ LOGV2_FOR_ELECTION(
+ 4615663, 0, "Starting an election due to single node replica set prompt election");
break;
default:
MONGO_UNREACHABLE;