author    Wenbin Zhu <wenbin.zhu@mongodb.com>    2021-10-06 17:41:26 +0000
committer Evergreen Agent <no-reply@evergreen.mongodb.com>    2021-10-06 18:07:14 +0000
commit    cf4ce26ca21a836aa336313a4a10676ca91c3131 (patch)
tree      7e2017d5dc13c3fef67cd06ca3426a28325bf34c
parent    28f46cb15f2ff5bbbd22b45006789bb3edb71872 (diff)
download  mongo-cf4ce26ca21a836aa336313a4a10676ca91c3131.tar.gz
SERVER-60153 Replenish INFO level logging for elections and repl startup.
-rw-r--r--  src/mongo/db/kill_sessions_local.cpp                                2
-rw-r--r--  src/mongo/db/repl/replication_coordinator_external_state_impl.cpp   1
-rw-r--r--  src/mongo/db/repl/replication_coordinator_impl.cpp                  64
-rw-r--r--  src/mongo/db/repl/replication_coordinator_impl.h                    5
-rw-r--r--  src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp         4
5 files changed, 69 insertions, 7 deletions
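
The changes below add and promote election and replication-startup log lines at default (INFO) severity so they appear without raising the replication component's log verbosity. A minimal sketch of the difference between the two macros used throughout the patch, assuming MongoDB's logv2 headers; the ID values, function name, and namespace here are illustrative and not part of this commit:

#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kReplication

#include "mongo/logv2/log.h"

namespace mongo::repl {

// Illustrative helper, not from this commit.
void logTermUpdate(long long term) {
    // LOGV2 logs at INFO severity and is always emitted at the default
    // verbosity; this is what the patch relies on for election and
    // startup progress messages.
    LOGV2(9999900, "Updated term", "term"_attr = term);

    // LOGV2_DEBUG is only emitted when the component's verbosity is raised
    // to at least the given debug level (1 here), so it is hidden by default.
    LOGV2_DEBUG(9999901, 1, "Updated term (debug detail)", "term"_attr = term);
}

}  // namespace mongo::repl
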
diff --git a/src/mongo/db/kill_sessions_local.cpp b/src/mongo/db/kill_sessions_local.cpp
index d1570e2f2e9..f5070365d4e 100644
--- a/src/mongo/db/kill_sessions_local.cpp
+++ b/src/mongo/db/kill_sessions_local.cpp
@@ -190,6 +190,7 @@ void yieldLocksForPreparedTransactions(OperationContext* opCtx) {
// Scan the sessions again to get the list of all sessions with prepared transaction
// to yield their locks.
+ LOGV2(6015318, "Yielding locks for prepared transactions.");
SessionKiller::Matcher matcherAllSessions(
KillAllSessionsByPatternSet{makeKillAllSessionsByPattern(newOpCtx.get())});
killSessionsAction(newOpCtx.get(),
@@ -222,6 +223,7 @@ void invalidateSessionsForStepdown(OperationContext* opCtx) {
// It is illegal to invalidate the sessions if the operation has a session checked out.
invariant(!OperationContextSession::get(opCtx));
+ LOGV2(6015319, "Invalidating sessions for stepdown.");
SessionKiller::Matcher matcherAllSessions(
KillAllSessionsByPatternSet{makeKillAllSessionsByPattern(opCtx)});
killSessionsAction(opCtx,
diff --git a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp
index f3193f6f178..cabdbe1eece 100644
--- a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp
@@ -518,6 +518,7 @@ OpTime ReplicationCoordinatorExternalStateImpl::onTransitionToPrimary(OperationC
_replicationProcess->getConsistencyMarkers()->clearAppliedThrough(
opCtx, lastAppliedOpTime.getTimestamp());
+ LOGV2(6015309, "Logging transition to primary to oplog on stepup");
writeConflictRetry(opCtx, "logging transition to primary to oplog", "local.oplog.rs", [&] {
AutoGetOplog oplogWrite(opCtx, OplogAccessMode::kWrite);
WriteUnitOfWork wuow(opCtx);
diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp
index 66c36a993dd..c2a46a1954b 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl.cpp
@@ -411,6 +411,7 @@ void ReplicationCoordinatorImpl::appendConnectionStats(executor::ConnectionPoolS
bool ReplicationCoordinatorImpl::_startLoadLocalConfig(
OperationContext* opCtx, LastStorageEngineShutdownState lastStorageEngineShutdownState) {
+ LOGV2(4280500, "Attempting to create internal replication collections");
// Create necessary replication collections to guarantee that if a checkpoint sees data after
// initial sync has completed, it also sees these collections.
fassert(50708, _replicationProcess->getConsistencyMarkers()->createInternalCollections(opCtx));
@@ -423,6 +424,7 @@ bool ReplicationCoordinatorImpl::_startLoadLocalConfig(
fassert(51240, _externalState->createLocalLastVoteCollection(opCtx));
+ LOGV2(4280501, "Attempting to load local voted for document");
StatusWith<LastVote> lastVote = _externalState->loadLocalLastVoteDocument(opCtx);
if (!lastVote.isOK()) {
LOGV2_FATAL_NOTRACE(40367,
@@ -439,6 +441,7 @@ bool ReplicationCoordinatorImpl::_startLoadLocalConfig(
_topCoord->loadLastVote(lastVote.getValue());
}
+ LOGV2(4280502, "Searching for local Rollback ID document");
// Check that we have a local Rollback ID. If we do not have one, create one.
auto status = _replicationProcess->refreshRollbackID(opCtx);
if (!status.isOK()) {
@@ -491,9 +494,11 @@ bool ReplicationCoordinatorImpl::_startLoadLocalConfig(
"config"_attr = cfg.getValue());
}
+ LOGV2(4280504, "Cleaning up any partially applied oplog batches & reading last op from oplog");
// Read the last op from the oplog after cleaning up any partially applied batches.
const auto stableTimestamp = boost::none;
_replicationProcess->getReplicationRecovery()->recoverFromOplog(opCtx, stableTimestamp);
+ LOGV2(4280506, "Reconstructing prepared transactions");
reconstructPreparedTransactions(opCtx, OplogApplication::Mode::kRecovering);
const auto lastOpTimeAndWallTimeResult = _externalState->loadLastOpTimeAndWallTime(opCtx);
@@ -512,6 +517,7 @@ bool ReplicationCoordinatorImpl::_startLoadLocalConfig(
stdx::lock_guard<Latch> lk(_mutex);
_finishLoadLocalConfigCbh = std::move(handle.getValue());
+ LOGV2(4280507, "Loaded replica set config, scheduled callback to set local config");
return false;
}
@@ -540,6 +546,7 @@ void ReplicationCoordinatorImpl::_finishLoadLocalConfig(
return;
}
+ LOGV2(4280508, "Attempting to set local replica set config; validating config for startup");
StatusWith<int> myIndex =
validateConfigForStartUp(_externalState.get(), localConfig, getServiceContext());
if (!myIndex.isOK()) {
@@ -598,6 +605,7 @@ void ReplicationCoordinatorImpl::_finishLoadLocalConfig(
"** https://dochub.mongodb.org/core/psa-disable-rc-majority");
LOGV2_OPTIONS(21319, {logv2::LogTag::kStartupWarnings}, "");
}
+ LOGV2(4280509, "Local configuration validated for startup");
// Do not check optime, if this node is an arbiter.
bool isArbiter =
@@ -672,7 +680,7 @@ void ReplicationCoordinatorImpl::_finishLoadLocalConfig(
// Step down is impossible, so we don't need to wait for the returned event.
_updateTerm_inlock(term);
}
- LOGV2_DEBUG(21320, 1, "Current term is now {term}", "Updated term", "term"_attr = term);
+ LOGV2(21320, "Current term is now {term}", "Updated term", "term"_attr = term);
_performPostMemberStateUpdateAction(action);
if (!isArbiter && myIndex.getValue() != -1) {
@@ -728,6 +736,7 @@ void ReplicationCoordinatorImpl::_startDataReplication(OperationContext* opCtx,
const auto needsInitialSync =
lastOpTime.isNull() || _externalState->isInitialSyncFlagSet(opCtx);
if (!needsInitialSync) {
+ LOGV2(4280512, "No initial sync required. Attempting to begin steady replication");
// Start steady replication, since we already have data.
// ReplSetConfig has been installed, so it's either in STARTUP2 or REMOVED.
auto memberState = _getMemberState_inlock();
@@ -742,6 +751,7 @@ void ReplicationCoordinatorImpl::_startDataReplication(OperationContext* opCtx,
return;
}
+ LOGV2(4280513, "Initial sync required. Attempting to start initial sync...");
// Do initial sync.
if (!_externalState->getTaskExecutor()) {
LOGV2(21323, "Not running initial sync during test");
@@ -1102,6 +1112,7 @@ void ReplicationCoordinatorImpl::signalDrainComplete(OperationContext* opCtx,
stdx::unique_lock<Latch> lk(_mutex);
if (_applierState != ApplierState::Draining) {
+ LOGV2(6015306, "Applier already left draining state, exiting.");
return;
}
lk.unlock();
@@ -1120,6 +1131,8 @@ void ReplicationCoordinatorImpl::signalDrainComplete(OperationContext* opCtx,
// current term, and we're allowed to become the write master.
if (_applierState != ApplierState::Draining ||
!_topCoord->canCompleteTransitionToPrimary(termWhenBufferIsEmpty)) {
+ LOGV2(6015308,
+ "Applier left draining state or not allowed to become writeable primary, exiting");
return;
}
_applierState = ApplierState::Stopped;
@@ -1178,6 +1191,7 @@ void ReplicationCoordinatorImpl::signalDrainComplete(OperationContext* opCtx,
hangAfterReconfigOnDrainComplete.pauseWhileSet(opCtx);
}
+ LOGV2(6015310, "Starting to transition to primary.");
AllowNonLocalWritesBlock writesAllowed(opCtx);
OpTime firstOpTime = _externalState->onTransitionToPrimary(opCtx);
lk.lock();
@@ -3252,6 +3266,7 @@ Status ReplicationCoordinatorImpl::doReplSetReconfig(OperationContext* opCtx,
"_rsConfigState"_attr = int(_rsConfigState));
}
+ LOGV2(6015313, "Replication config state is Steady, starting reconfig");
invariant(_rsConfig.isInitialized());
if (!force && !_readWriteAbility->canAcceptNonLocalWrites(lk)) {
@@ -3346,6 +3361,7 @@ Status ReplicationCoordinatorImpl::doReplSetReconfig(OperationContext* opCtx,
// we bypass the check for finding ourselves in the config, since we know it should already be
// satisfied.
if (!sameConfigContents(oldConfig, newConfig)) {
+ LOGV2(6015314, "Finding self in new config");
StatusWith<int> myIndexSw = force
? findSelfInConfig(_externalState.get(), newConfig, opCtx->getServiceContext())
: findSelfInConfigIfElectable(
@@ -3400,6 +3416,7 @@ Status ReplicationCoordinatorImpl::doReplSetReconfig(OperationContext* opCtx,
}
// Wait for durability of the new config document.
JournalFlusher::get(opCtx)->waitForJournalFlush();
+ LOGV2(6015315, "Persisted new config to disk");
configStateGuard.dismiss();
_finishReplSetReconfig(opCtx, newConfig, force, myIndex);
@@ -3422,12 +3439,11 @@ void ReplicationCoordinatorImpl::_finishReplSetReconfig(OperationContext* opCtx,
// we have already set our ReplicationCoordinatorImpl::_rsConfigState state to
// "kConfigReconfiguring" which prevents new elections from happening.
if (electionFinishedEvent) {
- LOGV2_DEBUG(21354,
- 2,
- "Waiting for election to complete before finishing reconfig to config with "
- "{configVersionAndTerm}",
- "Waiting for election to complete before finishing reconfig",
- "configVersionAndTerm"_attr = newConfig.getConfigVersionAndTerm());
+ LOGV2(21354,
+ "Waiting for election to complete before finishing reconfig to config with "
+ "{configVersionAndTerm}",
+ "Waiting for election to complete before finishing reconfig",
+ "configVersionAndTerm"_attr = newConfig.getConfigVersionAndTerm());
// Wait for the election to complete and the node's Role to be set to follower.
_replExecutor->waitForEvent(electionFinishedEvent);
}
@@ -3733,11 +3749,38 @@ void ReplicationCoordinatorImpl::_finishReplSetInitiate(OperationContext* opCtx,
void ReplicationCoordinatorImpl::_setConfigState_inlock(ConfigState newState) {
if (newState != _rsConfigState) {
+ LOGV2(6015317,
+ "Setting new configuration state",
+ "newState"_attr = getConfigStateString(newState),
+ "oldState"_attr = getConfigStateString(_rsConfigState));
_rsConfigState = newState;
_rsConfigStateChange.notify_all();
}
}
+std::string ReplicationCoordinatorImpl::getConfigStateString(ConfigState state) {
+ switch (state) {
+ case kConfigPreStart:
+ return "ConfigPreStart";
+ case kConfigStartingUp:
+ return "ConfigStartingUp";
+ case kConfigReplicationDisabled:
+ return "ConfigReplicationDisabled";
+ case kConfigUninitialized:
+ return "ConfigUninitialized";
+ case kConfigSteady:
+ return "ConfigSteady";
+ case kConfigInitiating:
+ return "ConfigInitiating";
+ case kConfigReconfiguring:
+ return "ConfigReconfiguring";
+ case kConfigHBReconfiguring:
+ return "ConfigHBReconfiguring";
+ default:
+ MONGO_UNREACHABLE;
+ }
+}
+
void ReplicationCoordinatorImpl::_errorOnPromisesIfHorizonChanged(WithLock lk,
OperationContext* opCtx,
const ReplSetConfig& oldConfig,
@@ -4038,6 +4081,7 @@ void ReplicationCoordinatorImpl::CatchupState::start_inlock() {
// No catchup in single node replica set.
if (_repl->_rsConfig.getNumMembers() == 1) {
+ LOGV2(6015304, "Skipping primary catchup since we are the only node in the replica set.");
abort_inlock(PrimaryCatchUpConclusionReason::kSkipped);
return;
}
@@ -4107,6 +4151,10 @@ void ReplicationCoordinatorImpl::CatchupState::signalHeartbeatUpdate_inlock() {
auto targetOpTime = _repl->_topCoord->latestKnownOpTimeSinceHeartbeatRestart();
// Haven't collected all heartbeat responses.
if (!targetOpTime) {
+ LOGV2_DEBUG(
+ 6015305,
+ 1,
+ "Not updating target optime for catchup, we haven't collected all heartbeat responses");
return;
}
@@ -5141,6 +5189,7 @@ Status ReplicationCoordinatorImpl::updateTerm(OperationContext* opCtx, long long
// Wait for potential stepdown to finish.
if (finishEvh.isValid()) {
+ LOGV2(6015302, "Waiting for potential stepdown to complete before finishing term update");
_replExecutor->waitForEvent(finishEvh);
}
if (updateTermResult == TopologyCoordinator::UpdateTermResult::kUpdatedTerm ||
@@ -5360,6 +5409,7 @@ Status ReplicationCoordinatorImpl::stepUpIfEligible(bool skipDryRun) {
finishEvent = _electionFinishedEvent;
}
if (finishEvent.isValid()) {
+ LOGV2(6015303, "Waiting for in-progress election to complete before finishing stepup");
_replExecutor->waitForEvent(finishEvent);
}
{
diff --git a/src/mongo/db/repl/replication_coordinator_impl.h b/src/mongo/db/repl/replication_coordinator_impl.h
index fd56e878ccc..730385c472b 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.h
+++ b/src/mongo/db/repl/replication_coordinator_impl.h
@@ -1024,6 +1024,11 @@ private:
void _setConfigState_inlock(ConfigState newState);
/**
+ * Returns the string representation of the config state.
+ */
+ static std::string getConfigStateString(ConfigState state);
+
+ /**
* Returns true if the horizon mappings between the oldConfig and newConfig are different.
*/
void _errorOnPromisesIfHorizonChanged(WithLock lk,
diff --git a/src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp b/src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp
index e0c14764474..ade56675c51 100644
--- a/src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp
@@ -296,12 +296,16 @@ void ReplicationCoordinatorImpl::_writeLastVoteForMyElection(
auto opCtx = cc().makeOperationContext();
// Any writes that occur as part of an election should not be subject to Flow Control.
opCtx->setShouldParticipateInFlowControl(false);
+ LOGV2(6015300,
+ "Storing last vote document in local storage for my election",
+ "lastVote"_attr = lastVote);
return _externalState->storeLocalLastVoteDocument(opCtx.get(), lastVote);
}();
stdx::lock_guard<Latch> lk(_mutex);
LoseElectionDryRunGuardV1 lossGuard(this);
if (status == ErrorCodes::CallbackCanceled) {
+ LOGV2(6015301, "Callback for storing last vote got cancelled");
return;
}