author    Wenbin Zhu <wenbin.zhu@mongodb.com>    2021-10-06 17:41:26 +0000
committer Evergreen Agent <no-reply@evergreen.mongodb.com>    2021-10-06 18:07:14 +0000
commit    cf4ce26ca21a836aa336313a4a10676ca91c3131 (patch)
tree      7e2017d5dc13c3fef67cd06ca3426a28325bf34c
parent    28f46cb15f2ff5bbbd22b45006789bb3edb71872 (diff)
download  mongo-cf4ce26ca21a836aa336313a4a10676ca91c3131.tar.gz
SERVER-60153 Replenish INFO level logging for elections and repl startup.
-rw-r--r--  src/mongo/db/kill_sessions_local.cpp                                2
-rw-r--r--  src/mongo/db/repl/replication_coordinator_external_state_impl.cpp   1
-rw-r--r--  src/mongo/db/repl/replication_coordinator_impl.cpp                  64
-rw-r--r--  src/mongo/db/repl/replication_coordinator_impl.h                    5
-rw-r--r--  src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp         4
5 files changed, 69 insertions, 7 deletions
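
The changes below add and promote election and replication-startup log lines at default (INFO) severity so they appear without raising the replication component's log verbosity. A minimal sketch of the difference between the two macros used throughout the patch, assuming MongoDB's logv2 headers; the ID values, function name, and namespace here are illustrative and not part of this commit:

#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kReplication

#include "mongo/logv2/log.h"

namespace mongo::repl {

// Illustrative helper, not from this commit.
void logTermUpdate(long long term) {
    // LOGV2 logs at INFO severity and is always emitted at the default
    // verbosity; this is what the patch relies on for election and
    // startup progress messages.
    LOGV2(9999900, "Updated term", "term"_attr = term);

    // LOGV2_DEBUG is only emitted when the component's verbosity is raised
    // to at least the given debug level (1 here), so it is hidden by default.
    LOGV2_DEBUG(9999901, 1, "Updated term (debug detail)", "term"_attr = term);
}

}  // namespace mongo::repl
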
diff --git a/src/mongo/db/kill_sessions_local.cpp b/src/mongo/db/kill_sessions_local.cpp
index d1570e2f2e9..f5070365d4e 100644
--- a/src/mongo/db/kill_sessions_local.cpp
+++ b/src/mongo/db/kill_sessions_local.cpp
@@ -190,6 +190,7 @@ void yieldLocksForPreparedTransactions(OperationContext* opCtx) {
// Scan the sessions again to get the list of all sessions with prepared transaction
// to yield their locks.
+ LOGV2(6015318, "Yielding locks for prepared transactions.");
SessionKiller::Matcher matcherAllSessions(
KillAllSessionsByPatternSet{makeKillAllSessionsByPattern(newOpCtx.get())});
killSessionsAction(newOpCtx.get(),
@@ -222,6 +223,7 @@ void invalidateSessionsForStepdown(OperationContext* opCtx) {
// It is illegal to invalidate the sessions if the operation has a session checked out.
invariant(!OperationContextSession::get(opCtx));
+ LOGV2(6015319, "Invalidating sessions for stepdown.");
SessionKiller::Matcher matcherAllSessions(
KillAllSessionsByPatternSet{makeKillAllSessionsByPattern(opCtx)});
killSessionsAction(opCtx,
diff --git a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp
index f3193f6f178..cabdbe1eece 100644
--- a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp
@@ -518,6 +518,7 @@ OpTime ReplicationCoordinatorExternalStateImpl::onTransitionToPrimary(OperationC
_replicationProcess->getConsistencyMarkers()->clearAppliedThrough(
opCtx, lastAppliedOpTime.getTimestamp());
+ LOGV2(6015309, "Logging transition to primary to oplog on stepup");
writeConflictRetry(opCtx, "logging transition to primary to oplog", "local.oplog.rs", [&] {
AutoGetOplog oplogWrite(opCtx, OplogAccessMode::kWrite);
WriteUnitOfWork wuow(opCtx);
diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp
index 66c36a993dd..c2a46a1954b 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl.cpp
@@ -411,6 +411,7 @@ void ReplicationCoordinatorImpl::appendConnectionStats(executor::ConnectionPoolS
bool ReplicationCoordinatorImpl::_startLoadLocalConfig(
OperationContext* opCtx, LastStorageEngineShutdownState lastStorageEngineShutdownState) {
+ LOGV2(4280500, "Attempting to create internal replication collections");
// Create necessary replication collections to guarantee that if a checkpoint sees data after
// initial sync has completed, it also sees these collections.
fassert(50708, _replicationProcess->getConsistencyMarkers()->createInternalCollections(opCtx));
@@ -423,6 +424,7 @@ bool ReplicationCoordinatorImpl::_startLoadLocalConfig(
fassert(51240, _externalState->createLocalLastVoteCollection(opCtx));
+ LOGV2(4280501, "Attempting to load local voted for document");
StatusWith<LastVote> lastVote = _externalState->loadLocalLastVoteDocument(opCtx);
if (!lastVote.isOK()) {
LOGV2_FATAL_NOTRACE(40367,
@@ -439,6 +441,7 @@ bool ReplicationCoordinatorImpl::_startLoadLocalConfig(
_topCoord->loadLastVote(lastVote.getValue());
}
+ LOGV2(4280502, "Searching for local Rollback ID document");
// Check that we have a local Rollback ID. If we do not have one, create one.
auto status = _replicationProcess->refreshRollbackID(opCtx);
if (!status.isOK()) {
@@ -491,9 +494,11 @@ bool ReplicationCoordinatorImpl::_startLoadLocalConfig(
"config"_attr = cfg.getValue());
}
+ LOGV2(4280504, "Cleaning up any partially applied oplog batches & reading last op from oplog");
// Read the last op from the oplog after cleaning up any partially applied batches.
const auto stableTimestamp = boost::none;
_replicationProcess->getReplicationRecovery()->recoverFromOplog(opCtx, stableTimestamp);
+ LOGV2(4280506, "Reconstructing prepared transactions");
reconstructPreparedTransactions(opCtx, OplogApplication::Mode::kRecovering);
const auto lastOpTimeAndWallTimeResult = _externalState->loadLastOpTimeAndWallTime(opCtx);
@@ -512,6 +517,7 @@ bool ReplicationCoordinatorImpl::_startLoadLocalConfig(
stdx::lock_guard<Latch> lk(_mutex);
_finishLoadLocalConfigCbh = std::move(handle.getValue());
+ LOGV2(4280507, "Loaded replica set config, scheduled callback to set local config");
return false;
}
@@ -540,6 +546,7 @@ void ReplicationCoordinatorImpl::_finishLoadLocalConfig(
return;
}
+ LOGV2(4280508, "Attempting to set local replica set config; validating config for startup");
StatusWith<int> myIndex =
validateConfigForStartUp(_externalState.get(), localConfig, getServiceContext());
if (!myIndex.isOK()) {
@@ -598,6 +605,7 @@ void ReplicationCoordinatorImpl::_finishLoadLocalConfig(
"** https://dochub.mongodb.org/core/psa-disable-rc-majority");
LOGV2_OPTIONS(21319, {logv2::LogTag::kStartupWarnings}, "");
}
+ LOGV2(4280509, "Local configuration validated for startup");
// Do not check optime, if this node is an arbiter.
bool isArbiter =
@@ -672,7 +680,7 @@ void ReplicationCoordinatorImpl::_finishLoadLocalConfig(
// Step down is impossible, so we don't need to wait for the returned event.
_updateTerm_inlock(term);
}
- LOGV2_DEBUG(21320, 1, "Current term is now {term}", "Updated term", "term"_attr = term);
+ LOGV2(21320, "Current term is now {term}", "Updated term", "term"_attr = term);
_performPostMemberStateUpdateAction(action);
if (!isArbiter && myIndex.getValue() != -1) {
@@ -728,6 +736,7 @@ void ReplicationCoordinatorImpl::_startDataReplication(OperationContext* opCtx,
const auto needsInitialSync =
lastOpTime.isNull() || _externalState->isInitialSyncFlagSet(opCtx);
if (!needsInitialSync) {
+ LOGV2(4280512, "No initial sync required. Attempting to begin steady replication");
// Start steady replication, since we already have data.
// ReplSetConfig has been installed, so it's either in STARTUP2 or REMOVED.
auto memberState = _getMemberState_inlock();
@@ -742,6 +751,7 @@ void ReplicationCoordinatorImpl::_startDataReplication(OperationContext* opCtx,
return;
}
+ LOGV2(4280513, "Initial sync required. Attempting to start initial sync...");
// Do initial sync.
if (!_externalState->getTaskExecutor()) {
LOGV2(21323, "Not running initial sync during test");
@@ -1102,6 +1112,7 @@ void ReplicationCoordinatorImpl::signalDrainComplete(OperationContext* opCtx,
stdx::unique_lock<Latch> lk(_mutex);
if (_applierState != ApplierState::Draining) {
+ LOGV2(6015306, "Applier already left draining state, exiting.");
return;
}
lk.unlock();
@@ -1120,6 +1131,8 @@ void ReplicationCoordinatorImpl::signalDrainComplete(OperationContext* opCtx,
// current term, and we're allowed to become the write master.
if (_applierState != ApplierState::Draining ||
!_topCoord->canCompleteTransitionToPrimary(termWhenBufferIsEmpty)) {
+ LOGV2(6015308,
+ "Applier left draining state or not allowed to become writeable primary, exiting");
return;
}
_applierState = ApplierState::Stopped;
@@ -1178,6 +1191,7 @@ void ReplicationCoordinatorImpl::signalDrainComplete(OperationContext* opCtx,
hangAfterReconfigOnDrainComplete.pauseWhileSet(opCtx);
}
+ LOGV2(6015310, "Starting to transition to primary.");
AllowNonLocalWritesBlock writesAllowed(opCtx);
OpTime firstOpTime = _externalState->onTransitionToPrimary(opCtx);
lk.lock();
@@ -3252,6 +3266,7 @@ Status ReplicationCoordinatorImpl::doReplSetReconfig(OperationContext* opCtx,
"_rsConfigState"_attr = int(_rsConfigState));
}
+ LOGV2(6015313, "Replication config state is Steady, starting reconfig");
invariant(_rsConfig.isInitialized());
if (!force && !_readWriteAbility->canAcceptNonLocalWrites(lk)) {
@@ -3346,6 +3361,7 @@ Status ReplicationCoordinatorImpl::doReplSetReconfig(OperationContext* opCtx,
// we bypass the check for finding ourselves in the config, since we know it should already be
// satisfied.
if (!sameConfigContents(oldConfig, newConfig)) {
+ LOGV2(6015314, "Finding self in new config");
StatusWith<int> myIndexSw = force
? findSelfInConfig(_externalState.get(), newConfig, opCtx->getServiceContext())
: findSelfInConfigIfElectable(
@@ -3400,6 +3416,7 @@ Status ReplicationCoordinatorImpl::doReplSetReconfig(OperationContext* opCtx,
}
// Wait for durability of the new config document.
JournalFlusher::get(opCtx)->waitForJournalFlush();
+ LOGV2(6015315, "Persisted new config to disk");
configStateGuard.dismiss();
_finishReplSetReconfig(opCtx, newConfig, force, myIndex);
@@ -3422,12 +3439,11 @@ void ReplicationCoordinatorImpl::_finishReplSetReconfig(OperationContext* opCtx,
// we have already set our ReplicationCoordinatorImpl::_rsConfigState state to
// "kConfigReconfiguring" which prevents new elections from happening.
if (electionFinishedEvent) {
- LOGV2_DEBUG(21354,
- 2,
- "Waiting for election to complete before finishing reconfig to config with "
- "{configVersionAndTerm}",
- "Waiting for election to complete before finishing reconfig",
- "configVersionAndTerm"_attr = newConfig.getConfigVersionAndTerm());
+ LOGV2(21354,
+ "Waiting for election to complete before finishing reconfig to config with "
+ "{configVersionAndTerm}",
+ "Waiting for election to complete before finishing reconfig",
+ "configVersionAndTerm"_attr = newConfig.getConfigVersionAndTerm());
// Wait for the election to complete and the node's Role to be set to follower.
_replExecutor->waitForEvent(electionFinishedEvent);
}
@@ -3733,11 +3749,38 @@ void ReplicationCoordinatorImpl::_finishReplSetInitiate(OperationContext* opCtx,
void ReplicationCoordinatorImpl::_setConfigState_inlock(ConfigState newState) {
if (newState != _rsConfigState) {
+ LOGV2(6015317,
+ "Setting new configuration state",
+ "newState"_attr = getConfigStateString(newState),
+ "oldState"_attr = getConfigStateString(_rsConfigState));
_rsConfigState = newState;
_rsConfigStateChange.notify_all();
}
}
+std::string ReplicationCoordinatorImpl::getConfigStateString(ConfigState state) {
+ switch (state) {
+ case kConfigPreStart:
+ return "ConfigPreStart";
+ case kConfigStartingUp:
+ return "ConfigStartingUp";
+ case kConfigReplicationDisabled:
+ return "ConfigReplicationDisabled";
+ case kConfigUninitialized:
+ return "ConfigUninitialized";
+ case kConfigSteady:
+ return "ConfigSteady";
+ case kConfigInitiating:
+ return "ConfigInitiating";
+ case kConfigReconfiguring:
+ return "ConfigReconfiguring";
+ case kConfigHBReconfiguring:
+ return "ConfigHBReconfiguring";
+ default:
+ MONGO_UNREACHABLE;
+ }
+}
+
void ReplicationCoordinatorImpl::_errorOnPromisesIfHorizonChanged(WithLock lk,
OperationContext* opCtx,
const ReplSetConfig& oldConfig,
@@ -4038,6 +4081,7 @@ void ReplicationCoordinatorImpl::CatchupState::start_inlock() {
// No catchup in single node replica set.
if (_repl->_rsConfig.getNumMembers() == 1) {
+ LOGV2(6015304, "Skipping primary catchup since we are the only node in the replica set.");
abort_inlock(PrimaryCatchUpConclusionReason::kSkipped);
return;
}
@@ -4107,6 +4151,10 @@ void ReplicationCoordinatorImpl::CatchupState::signalHeartbeatUpdate_inlock() {
auto targetOpTime = _repl->_topCoord->latestKnownOpTimeSinceHeartbeatRestart();
// Haven't collected all heartbeat responses.
if (!targetOpTime) {
+ LOGV2_DEBUG(
+ 6015305,
+ 1,
+ "Not updating target optime for catchup, we haven't collected all heartbeat responses");
return;
}
@@ -5141,6 +5189,7 @@ Status ReplicationCoordinatorImpl::updateTerm(OperationContext* opCtx, long long
// Wait for potential stepdown to finish.
if (finishEvh.isValid()) {
+ LOGV2(6015302, "Waiting for potential stepdown to complete before finishing term update");
_replExecutor->waitForEvent(finishEvh);
}
if (updateTermResult == TopologyCoordinator::UpdateTermResult::kUpdatedTerm ||
@@ -5360,6 +5409,7 @@ Status ReplicationCoordinatorImpl::stepUpIfEligible(bool skipDryRun) {
finishEvent = _electionFinishedEvent;
}
if (finishEvent.isValid()) {
+ LOGV2(6015303, "Waiting for in-progress election to complete before finishing stepup");
_replExecutor->waitForEvent(finishEvent);
}
{
diff --git a/src/mongo/db/repl/replication_coordinator_impl.h b/src/mongo/db/repl/replication_coordinator_impl.h
index fd56e878ccc..730385c472b 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.h
+++ b/src/mongo/db/repl/replication_coordinator_impl.h
@@ -1024,6 +1024,11 @@ private:
void _setConfigState_inlock(ConfigState newState);
/**
+ * Returns the string representation of the config state.
+ */
+ static std::string getConfigStateString(ConfigState state);
+
+ /**
* Returns true if the horizon mappings between the oldConfig and newConfig are different.
*/
void _errorOnPromisesIfHorizonChanged(WithLock lk,
diff --git a/src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp b/src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp
index e0c14764474..ade56675c51 100644
--- a/src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp
@@ -296,12 +296,16 @@ void ReplicationCoordinatorImpl::_writeLastVoteForMyElection(
auto opCtx = cc().makeOperationContext();
// Any writes that occur as part of an election should not be subject to Flow Control.
opCtx->setShouldParticipateInFlowControl(false);
+ LOGV2(6015300,
+ "Storing last vote document in local storage for my election",
+ "lastVote"_attr = lastVote);
return _externalState->storeLocalLastVoteDocument(opCtx.get(), lastVote);
}();
stdx::lock_guard<Latch> lk(_mutex);
LoseElectionDryRunGuardV1 lossGuard(this);
if (status == ErrorCodes::CallbackCanceled) {
+ LOGV2(6015301, "Callback for storing last vote got cancelled");
return;
}