author     Wenbin Zhu <wenbin.zhu@mongodb.com>  2021-10-04 21:51:44 +0000
committer  Evergreen Agent <no-reply@evergreen.mongodb.com>  2021-10-04 23:00:03 +0000
commit     e097735040d077adb8b03a061d65c315750277d3 (patch)
tree       c4e645a2f9e57da91b1d20fbf3dadcb19a645538
parent     217285dd2d3689aaaf850adefc28aa8edbd7a957 (diff)
download   mongo-e097735040d077adb8b03a061d65c315750277d3.tar.gz
SERVER-60153 Replenish INFO level logging for elections and repl startup.
-rw-r--r--  src/mongo/db/kill_sessions_local.cpp                                 2
-rw-r--r--  src/mongo/db/repl/replication_coordinator_external_state_impl.cpp    1
-rw-r--r--  src/mongo/db/repl/replication_coordinator_impl.cpp                  86
-rw-r--r--  src/mongo/db/repl/replication_coordinator_impl.h                     5
-rw-r--r--  src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp          4
5 files changed, 73 insertions, 25 deletions
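
This commit promotes a set of replication-startup and election log lines from LOGV2_DEBUG (emitted only when the component's debug verbosity is raised) to LOGV2 (always emitted at INFO severity), and adds new INFO lines at key stepup, stepdown, and reconfig points. As a minimal sketch of the difference, assuming the usual logv2 setup in the translation unit (MONGO_LOGV2_DEFAULT_COMPONENT defined before including the header), and using an ID and attribute taken from the hunks below:

#include "mongo/logv2/log.h"

// Debug severity: suppressed unless the component's verbosity is >= 1.
LOGV2_DEBUG(4280500, 1, "Attempting to create internal replication collections");

// INFO severity: emitted under default verbosity, so it appears in
// production logs during startup and elections.
LOGV2(4280500, "Attempting to create internal replication collections");

// Structured attributes attach via the "_attr" literal suffix; call sites in
// this file carry both a legacy format string and a short stable message.
LOGV2(21320, "Current term is now {term}", "Updated term", "term"_attr = term);
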
diff --git a/src/mongo/db/kill_sessions_local.cpp b/src/mongo/db/kill_sessions_local.cpp
index d1570e2f2e9..f5070365d4e 100644
--- a/src/mongo/db/kill_sessions_local.cpp
+++ b/src/mongo/db/kill_sessions_local.cpp
@@ -190,6 +190,7 @@ void yieldLocksForPreparedTransactions(OperationContext* opCtx) {
// Scan the sessions again to get the list of all sessions with prepared transaction
// to yield their locks.
+ LOGV2(6015318, "Yielding locks for prepared transactions.");
SessionKiller::Matcher matcherAllSessions(
KillAllSessionsByPatternSet{makeKillAllSessionsByPattern(newOpCtx.get())});
killSessionsAction(newOpCtx.get(),
@@ -222,6 +223,7 @@ void invalidateSessionsForStepdown(OperationContext* opCtx) {
// It is illegal to invalidate the sessions if the operation has a session checked out.
invariant(!OperationContextSession::get(opCtx));
+ LOGV2(6015319, "Invalidating sessions for stepdown.");
SessionKiller::Matcher matcherAllSessions(
KillAllSessionsByPatternSet{makeKillAllSessionsByPattern(opCtx)});
killSessionsAction(opCtx,
diff --git a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp
index 3f4ace4b665..605593200b0 100644
--- a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp
@@ -525,6 +525,7 @@ OpTime ReplicationCoordinatorExternalStateImpl::onTransitionToPrimary(OperationC
// exclusive mode.
_replicationProcess->getConsistencyMarkers()->clearAppliedThrough(opCtx, Timestamp());
+ LOGV2(6015309, "Logging transition to primary to oplog on stepup");
writeConflictRetry(opCtx, "logging transition to primary to oplog", "local.oplog.rs", [&] {
AutoGetOplog oplogWrite(opCtx, OplogAccessMode::kWrite);
WriteUnitOfWork wuow(opCtx);
diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp
index f9ea5fbc72a..6f42076d2ec 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl.cpp
@@ -421,7 +421,7 @@ void ReplicationCoordinatorImpl::appendConnectionStats(executor::ConnectionPoolS
bool ReplicationCoordinatorImpl::_startLoadLocalConfig(
OperationContext* opCtx, StorageEngine::LastShutdownState lastShutdownState) {
- LOGV2_DEBUG(4280500, 1, "Attempting to create internal replication collections");
+ LOGV2(4280500, "Attempting to create internal replication collections");
// Create necessary replication collections to guarantee that if a checkpoint sees data after
// initial sync has completed, it also sees these collections.
fassert(50708, _replicationProcess->getConsistencyMarkers()->createInternalCollections(opCtx));
@@ -434,7 +434,7 @@ bool ReplicationCoordinatorImpl::_startLoadLocalConfig(
fassert(51240, _externalState->createLocalLastVoteCollection(opCtx));
- LOGV2_DEBUG(4280501, 1, "Attempting to load local voted for document");
+ LOGV2(4280501, "Attempting to load local voted for document");
StatusWith<LastVote> lastVote = _externalState->loadLocalLastVoteDocument(opCtx);
if (!lastVote.isOK()) {
LOGV2_FATAL_NOTRACE(40367,
@@ -451,7 +451,7 @@ bool ReplicationCoordinatorImpl::_startLoadLocalConfig(
_topCoord->loadLastVote(lastVote.getValue());
}
- LOGV2_DEBUG(4280502, 1, "Searching for local Rollback ID document");
+ LOGV2(4280502, "Searching for local Rollback ID document");
// Check that we have a local Rollback ID. If we do not have one, create one.
auto status = _replicationProcess->refreshRollbackID(opCtx);
if (!status.isOK()) {
@@ -507,16 +507,14 @@ bool ReplicationCoordinatorImpl::_startLoadLocalConfig(
"config"_attr = cfg.getValue());
}
- LOGV2_DEBUG(
- 4280504, 1, "Cleaning up any partially applied oplog batches & reading last op from oplog");
+ LOGV2(4280504, "Cleaning up any partially applied oplog batches & reading last op from oplog");
// Read the last op from the oplog after cleaning up any partially applied batches.
const auto stableTimestamp = boost::none;
_replicationProcess->getReplicationRecovery()->recoverFromOplog(opCtx, stableTimestamp);
- LOGV2_DEBUG(4280505,
- 1,
- "Creating any necessary TenantMigrationAccessBlockers for unfinished migrations");
+ LOGV2(4280505,
+ "Creating any necessary TenantMigrationAccessBlockers for unfinished migrations");
tenant_migration_access_blocker::recoverTenantMigrationAccessBlockers(opCtx);
- LOGV2_DEBUG(4280506, 1, "Reconstructing prepared transactions");
+ LOGV2(4280506, "Reconstructing prepared transactions");
reconstructPreparedTransactions(opCtx, OplogApplication::Mode::kRecovering);
const auto lastOpTimeAndWallTimeResult = _externalState->loadLastOpTimeAndWallTime(opCtx);
@@ -535,7 +533,7 @@ bool ReplicationCoordinatorImpl::_startLoadLocalConfig(
stdx::lock_guard<Latch> lk(_mutex);
_finishLoadLocalConfigCbh = std::move(handle.getValue());
- LOGV2_DEBUG(4280507, 1, "Loaded replica set config, scheduled callback to set local config");
+ LOGV2(4280507, "Loaded replica set config, scheduled callback to set local config");
return false;
}
@@ -564,8 +562,7 @@ void ReplicationCoordinatorImpl::_finishLoadLocalConfig(
return;
}
- LOGV2_DEBUG(
- 4280508, 1, "Attempting to set local replica set config; validating config for startup");
+ LOGV2(4280508, "Attempting to set local replica set config; validating config for startup");
StatusWith<int> myIndex =
validateConfigForStartUp(_externalState.get(), localConfig, getServiceContext());
if (!myIndex.isOK()) {
@@ -606,7 +603,7 @@ void ReplicationCoordinatorImpl::_finishLoadLocalConfig(
"commandLineSetName"_attr = _settings.ourSetName());
myIndex = StatusWith<int>(-1);
}
- LOGV2_DEBUG(4280509, 1, "Local configuration validated for startup");
+ LOGV2(4280509, "Local configuration validated for startup");
// Do not check optime, if this node is an arbiter.
bool isArbiter =
@@ -689,14 +686,14 @@ void ReplicationCoordinatorImpl::_finishLoadLocalConfig(
// Step down is impossible, so we don't need to wait for the returned event.
_updateTerm_inlock(term);
}
- LOGV2_DEBUG(21320, 1, "Current term is now {term}", "Updated term", "term"_attr = term);
+ LOGV2(21320, "Current term is now {term}", "Updated term", "term"_attr = term);
_performPostMemberStateUpdateAction(action);
if (!isArbiter && myIndex.getValue() != -1) {
_externalState->startThreads();
_startDataReplication(opCtx.get());
}
- LOGV2_DEBUG(4280511, 1, "Set local replica set config");
+ LOGV2(4280511, "Set local replica set config");
}
void ReplicationCoordinatorImpl::_startInitialSync(
@@ -748,7 +745,7 @@ void ReplicationCoordinatorImpl::_startInitialSync(
// InitialSyncer::startup() must be called outside lock because it uses features (eg.
// setting the initial sync flag) which depend on the ReplicationCoordinatorImpl.
uassertStatusOK(initialSyncerCopy->startup(opCtx, numInitialSyncAttempts.load()));
- LOGV2_DEBUG(4280514, 1, "Initial sync started");
+ LOGV2(4280514, "Initial sync started");
} catch (const DBException& e) {
auto status = e.toStatus();
LOGV2(21327,
@@ -835,7 +832,7 @@ void ReplicationCoordinatorImpl::_startDataReplication(OperationContext* opCtx)
const auto needsInitialSync =
lastOpTime.isNull() || _externalState->isInitialSyncFlagSet(opCtx);
if (!needsInitialSync) {
- LOGV2_DEBUG(4280512, 1, "No initial sync required. Attempting to begin steady replication");
+ LOGV2(4280512, "No initial sync required. Attempting to begin steady replication");
// Start steady replication, since we already have data.
// ReplSetConfig has been installed, so it's either in STARTUP2 or REMOVED.
auto memberState = getMemberState();
@@ -848,7 +845,7 @@ void ReplicationCoordinatorImpl::_startDataReplication(OperationContext* opCtx)
return;
}
- LOGV2_DEBUG(4280513, 1, "Initial sync required. Attempting to start initial sync...");
+ LOGV2(4280513, "Initial sync required. Attempting to start initial sync...");
// Do initial sync.
if (!_externalState->getTaskExecutor()) {
LOGV2(21323, "Not running initial sync during test");
@@ -1173,6 +1170,7 @@ void ReplicationCoordinatorImpl::signalDrainComplete(OperationContext* opCtx,
stdx::unique_lock<Latch> lk(_mutex);
if (_applierState != ApplierState::Draining) {
+ LOGV2(6015306, "Applier already left draining state, exiting.");
return;
}
lk.unlock();
@@ -1197,6 +1195,8 @@ void ReplicationCoordinatorImpl::signalDrainComplete(OperationContext* opCtx,
// current term, and we're allowed to become the writable primary.
if (_applierState != ApplierState::Draining ||
!_topCoord->canCompleteTransitionToPrimary(termWhenBufferIsEmpty)) {
+ LOGV2(6015308,
+ "Applier left draining state or not allowed to become writeable primary, exiting");
return;
}
_applierState = ApplierState::Stopped;
@@ -1257,6 +1257,7 @@ void ReplicationCoordinatorImpl::signalDrainComplete(OperationContext* opCtx,
hangAfterReconfigOnDrainComplete.pauseWhileSet(opCtx);
}
+ LOGV2(6015310, "Starting to transition to primary.");
AllowNonLocalWritesBlock writesAllowed(opCtx);
OpTime firstOpTime = _externalState->onTransitionToPrimary(opCtx);
ReplicaSetAwareServiceRegistry::get(_service).onStepUpComplete(opCtx,
@@ -3458,6 +3459,7 @@ Status ReplicationCoordinatorImpl::_doReplSetReconfig(OperationContext* opCtx,
"_rsConfigState"_attr = int(_rsConfigState));
}
+ LOGV2(6015313, "Replication config state is Steady, starting reconfig");
invariant(_rsConfig.isInitialized());
if (!force && !_readWriteAbility->canAcceptNonLocalWrites(lk) && !skipSafetyChecks) {
@@ -3621,6 +3623,7 @@ Status ReplicationCoordinatorImpl::_doReplSetReconfig(OperationContext* opCtx,
// Always debug assert if we reach this point.
dassert(!skipSafetyChecks);
}
+ LOGV2(6015314, "Finding self in new config");
StatusWith<int> myIndexSw = force
? findSelfInConfig(_externalState.get(), newConfig, opCtx->getServiceContext())
: findSelfInConfigIfElectable(
@@ -3681,6 +3684,7 @@ Status ReplicationCoordinatorImpl::_doReplSetReconfig(OperationContext* opCtx,
}
// Wait for durability of the new config document.
JournalFlusher::get(opCtx)->waitForJournalFlush();
+ LOGV2(6015315, "Persisted new config to disk");
configStateGuard.dismiss();
_finishReplSetReconfig(opCtx, newConfig, force, myIndex);
@@ -3703,12 +3707,11 @@ void ReplicationCoordinatorImpl::_finishReplSetReconfig(OperationContext* opCtx,
// we have already set our ReplicationCoordinatorImpl::_rsConfigState state to
// "kConfigReconfiguring" which prevents new elections from happening.
if (electionFinishedEvent) {
- LOGV2_DEBUG(21354,
- 2,
- "Waiting for election to complete before finishing reconfig to config with "
- "{configVersionAndTerm}",
- "Waiting for election to complete before finishing reconfig",
- "configVersionAndTerm"_attr = newConfig.getConfigVersionAndTerm());
+ LOGV2(21354,
+ "Waiting for election to complete before finishing reconfig to config with "
+ "{configVersionAndTerm}",
+ "Waiting for election to complete before finishing reconfig",
+ "configVersionAndTerm"_attr = newConfig.getConfigVersionAndTerm());
// Wait for the election to complete and the node's Role to be set to follower.
_replExecutor->waitForEvent(electionFinishedEvent);
}
@@ -3780,7 +3783,6 @@ void ReplicationCoordinatorImpl::_finishReplSetReconfig(OperationContext* opCtx,
_clearCommittedSnapshot_inlock();
}
-
lk.unlock();
_performPostMemberStateUpdateAction(action);
}
@@ -4105,11 +4107,38 @@ void ReplicationCoordinatorImpl::_finishReplSetInitiate(OperationContext* opCtx,
void ReplicationCoordinatorImpl::_setConfigState_inlock(ConfigState newState) {
if (newState != _rsConfigState) {
+ LOGV2(6015317,
+ "Setting new configuration state",
+ "newState"_attr = getConfigStateString(newState),
+ "oldState"_attr = getConfigStateString(_rsConfigState));
_rsConfigState = newState;
_rsConfigStateChange.notify_all();
}
}
+std::string ReplicationCoordinatorImpl::getConfigStateString(ConfigState state) {
+ switch (state) {
+ case kConfigPreStart:
+ return "ConfigPreStart";
+ case kConfigStartingUp:
+ return "ConfigStartingUp";
+ case kConfigReplicationDisabled:
+ return "ConfigReplicationDisabled";
+ case kConfigUninitialized:
+ return "ConfigUninitialized";
+ case kConfigSteady:
+ return "ConfigSteady";
+ case kConfigInitiating:
+ return "ConfigInitiating";
+ case kConfigReconfiguring:
+ return "ConfigReconfiguring";
+ case kConfigHBReconfiguring:
+ return "ConfigHBReconfiguring";
+ default:
+ MONGO_UNREACHABLE;
+ }
+}
+
void ReplicationCoordinatorImpl::_errorOnPromisesIfHorizonChanged(WithLock lk,
OperationContext* opCtx,
const ReplSetConfig& oldConfig,
@@ -4378,6 +4407,7 @@ void ReplicationCoordinatorImpl::CatchupState::start_inlock() {
// No catchup in single node replica set.
if (_repl->_rsConfig.getNumMembers() == 1) {
+ LOGV2(6015304, "Skipping primary catchup since we are the only node in the replica set.");
abort_inlock(PrimaryCatchUpConclusionReason::kSkipped);
return;
}
@@ -4447,6 +4477,10 @@ void ReplicationCoordinatorImpl::CatchupState::signalHeartbeatUpdate_inlock() {
auto targetOpTime = _repl->_topCoord->latestKnownOpTimeSinceHeartbeatRestart();
// Haven't collected all heartbeat responses.
if (!targetOpTime) {
+ LOGV2_DEBUG(
+ 6015305,
+ 1,
+ "Not updating target optime for catchup, we haven't collected all heartbeat responses");
return;
}
@@ -5542,6 +5576,7 @@ Status ReplicationCoordinatorImpl::updateTerm(OperationContext* opCtx, long long
// Wait for potential stepdown to finish.
if (finishEvh.isValid()) {
+ LOGV2(6015302, "Waiting for potential stepdown to complete before finishing term update");
_replExecutor->waitForEvent(finishEvh);
}
if (updateTermResult == TopologyCoordinator::UpdateTermResult::kUpdatedTerm ||
@@ -5775,6 +5810,7 @@ Status ReplicationCoordinatorImpl::stepUpIfEligible(bool skipDryRun) {
}
}
if (finishEvent.isValid()) {
+ LOGV2(6015303, "Waiting for in-progress election to complete before finishing stepup");
_replExecutor->waitForEvent(finishEvent);
}
{
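
The getConfigStateString() helper added above exists so that the 6015317 state-transition message can log symbolic names rather than raw enum values. In mongod's structured (JSON) log output, a transition recorded by _setConfigState_inlock would render along these lines; the timestamp, context name, and state values here are illustrative only:

{"t":{"$date":"2021-10-04T21:51:44.123+00:00"},"s":"I","c":"REPL","id":6015317,"ctx":"ReplCoord-0","msg":"Setting new configuration state","attr":{"newState":"ConfigSteady","oldState":"ConfigReconfiguring"}}
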
diff --git a/src/mongo/db/repl/replication_coordinator_impl.h b/src/mongo/db/repl/replication_coordinator_impl.h
index 86aa18dd34e..b0e9fb0cba9 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.h
+++ b/src/mongo/db/repl/replication_coordinator_impl.h
@@ -1164,6 +1164,11 @@ private:
void _setConfigState_inlock(ConfigState newState);
/**
+ * Returns the string representation of the config state.
+ */
+ static std::string getConfigStateString(ConfigState state);
+
+ /**
* Returns true if the horizon mappings between the oldConfig and newConfig are different.
*/
void _errorOnPromisesIfHorizonChanged(WithLock lk,
diff --git a/src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp b/src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp
index 68c619af21e..0c05b9f4a0c 100644
--- a/src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp
@@ -335,6 +335,9 @@ void ReplicationCoordinatorImpl::ElectionState::_writeLastVoteForMyElection(
auto opCtx = cc().makeOperationContext();
// Any writes that occur as part of an election should not be subject to Flow Control.
opCtx->setShouldParticipateInFlowControl(false);
+ LOGV2(6015300,
+ "Storing last vote document in local storage for my election",
+ "lastVote"_attr = lastVote);
return _repl->_externalState->storeLocalLastVoteDocument(opCtx.get(), lastVote);
}();
@@ -345,6 +348,7 @@ void ReplicationCoordinatorImpl::ElectionState::_writeLastVoteForMyElection(
stdx::lock_guard<Latch> lk(_repl->_mutex);
LoseElectionDryRunGuardV1 lossGuard(_repl);
if (status == ErrorCodes::CallbackCanceled) {
+ LOGV2(6015301, "Callback for storing last vote got cancelled");
return;
}