commit metadata
author:    Wenbin Zhu <wenbin.zhu@mongodb.com>  2021-10-04 21:51:44 +0000
committer: Evergreen Agent <no-reply@evergreen.mongodb.com>  2021-10-04 23:00:03 +0000
commit:    e097735040d077adb8b03a061d65c315750277d3 (patch)
tree:      c4e645a2f9e57da91b1d20fbf3dadcb19a645538 /src
parent:    217285dd2d3689aaaf850adefc28aa8edbd7a957 (diff)
download:  mongo-e097735040d077adb8b03a061d65c315750277d3.tar.gz
SERVER-60153 Replenish INFO level logging for elections and repl startup.
Diffstat (limited to 'src')
5 files changed, 73 insertions, 25 deletions
diff --git a/src/mongo/db/kill_sessions_local.cpp b/src/mongo/db/kill_sessions_local.cpp index d1570e2f2e9..f5070365d4e 100644 --- a/src/mongo/db/kill_sessions_local.cpp +++ b/src/mongo/db/kill_sessions_local.cpp @@ -190,6 +190,7 @@ void yieldLocksForPreparedTransactions(OperationContext* opCtx) { // Scan the sessions again to get the list of all sessions with prepared transaction // to yield their locks. + LOGV2(6015318, "Yielding locks for prepared transactions."); SessionKiller::Matcher matcherAllSessions( KillAllSessionsByPatternSet{makeKillAllSessionsByPattern(newOpCtx.get())}); killSessionsAction(newOpCtx.get(), @@ -222,6 +223,7 @@ void invalidateSessionsForStepdown(OperationContext* opCtx) { // It is illegal to invalidate the sessions if the operation has a session checked out. invariant(!OperationContextSession::get(opCtx)); + LOGV2(6015319, "Invalidating sessions for stepdown."); SessionKiller::Matcher matcherAllSessions( KillAllSessionsByPatternSet{makeKillAllSessionsByPattern(opCtx)}); killSessionsAction(opCtx, diff --git a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp index 3f4ace4b665..605593200b0 100644 --- a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp +++ b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp @@ -525,6 +525,7 @@ OpTime ReplicationCoordinatorExternalStateImpl::onTransitionToPrimary(OperationC // exclusive mode. 
_replicationProcess->getConsistencyMarkers()->clearAppliedThrough(opCtx, Timestamp()); + LOGV2(6015309, "Logging transition to primary to oplog on stepup"); writeConflictRetry(opCtx, "logging transition to primary to oplog", "local.oplog.rs", [&] { AutoGetOplog oplogWrite(opCtx, OplogAccessMode::kWrite); WriteUnitOfWork wuow(opCtx); diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp index f9ea5fbc72a..6f42076d2ec 100644 --- a/src/mongo/db/repl/replication_coordinator_impl.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl.cpp @@ -421,7 +421,7 @@ void ReplicationCoordinatorImpl::appendConnectionStats(executor::ConnectionPoolS bool ReplicationCoordinatorImpl::_startLoadLocalConfig( OperationContext* opCtx, StorageEngine::LastShutdownState lastShutdownState) { - LOGV2_DEBUG(4280500, 1, "Attempting to create internal replication collections"); + LOGV2(4280500, "Attempting to create internal replication collections"); // Create necessary replication collections to guarantee that if a checkpoint sees data after // initial sync has completed, it also sees these collections. fassert(50708, _replicationProcess->getConsistencyMarkers()->createInternalCollections(opCtx)); @@ -434,7 +434,7 @@ bool ReplicationCoordinatorImpl::_startLoadLocalConfig( fassert(51240, _externalState->createLocalLastVoteCollection(opCtx)); - LOGV2_DEBUG(4280501, 1, "Attempting to load local voted for document"); + LOGV2(4280501, "Attempting to load local voted for document"); StatusWith<LastVote> lastVote = _externalState->loadLocalLastVoteDocument(opCtx); if (!lastVote.isOK()) { LOGV2_FATAL_NOTRACE(40367, @@ -451,7 +451,7 @@ bool ReplicationCoordinatorImpl::_startLoadLocalConfig( _topCoord->loadLastVote(lastVote.getValue()); } - LOGV2_DEBUG(4280502, 1, "Searching for local Rollback ID document"); + LOGV2(4280502, "Searching for local Rollback ID document"); // Check that we have a local Rollback ID. 
If we do not have one, create one. auto status = _replicationProcess->refreshRollbackID(opCtx); if (!status.isOK()) { @@ -507,16 +507,14 @@ bool ReplicationCoordinatorImpl::_startLoadLocalConfig( "config"_attr = cfg.getValue()); } - LOGV2_DEBUG( - 4280504, 1, "Cleaning up any partially applied oplog batches & reading last op from oplog"); + LOGV2(4280504, "Cleaning up any partially applied oplog batches & reading last op from oplog"); // Read the last op from the oplog after cleaning up any partially applied batches. const auto stableTimestamp = boost::none; _replicationProcess->getReplicationRecovery()->recoverFromOplog(opCtx, stableTimestamp); - LOGV2_DEBUG(4280505, - 1, - "Creating any necessary TenantMigrationAccessBlockers for unfinished migrations"); + LOGV2(4280505, + "Creating any necessary TenantMigrationAccessBlockers for unfinished migrations"); tenant_migration_access_blocker::recoverTenantMigrationAccessBlockers(opCtx); - LOGV2_DEBUG(4280506, 1, "Reconstructing prepared transactions"); + LOGV2(4280506, "Reconstructing prepared transactions"); reconstructPreparedTransactions(opCtx, OplogApplication::Mode::kRecovering); const auto lastOpTimeAndWallTimeResult = _externalState->loadLastOpTimeAndWallTime(opCtx); @@ -535,7 +533,7 @@ bool ReplicationCoordinatorImpl::_startLoadLocalConfig( stdx::lock_guard<Latch> lk(_mutex); _finishLoadLocalConfigCbh = std::move(handle.getValue()); - LOGV2_DEBUG(4280507, 1, "Loaded replica set config, scheduled callback to set local config"); + LOGV2(4280507, "Loaded replica set config, scheduled callback to set local config"); return false; } @@ -564,8 +562,7 @@ void ReplicationCoordinatorImpl::_finishLoadLocalConfig( return; } - LOGV2_DEBUG( - 4280508, 1, "Attempting to set local replica set config; validating config for startup"); + LOGV2(4280508, "Attempting to set local replica set config; validating config for startup"); StatusWith<int> myIndex = validateConfigForStartUp(_externalState.get(), localConfig, 
getServiceContext()); if (!myIndex.isOK()) { @@ -606,7 +603,7 @@ void ReplicationCoordinatorImpl::_finishLoadLocalConfig( "commandLineSetName"_attr = _settings.ourSetName()); myIndex = StatusWith<int>(-1); } - LOGV2_DEBUG(4280509, 1, "Local configuration validated for startup"); + LOGV2(4280509, "Local configuration validated for startup"); // Do not check optime, if this node is an arbiter. bool isArbiter = @@ -689,14 +686,14 @@ void ReplicationCoordinatorImpl::_finishLoadLocalConfig( // Step down is impossible, so we don't need to wait for the returned event. _updateTerm_inlock(term); } - LOGV2_DEBUG(21320, 1, "Current term is now {term}", "Updated term", "term"_attr = term); + LOGV2(21320, "Current term is now {term}", "Updated term", "term"_attr = term); _performPostMemberStateUpdateAction(action); if (!isArbiter && myIndex.getValue() != -1) { _externalState->startThreads(); _startDataReplication(opCtx.get()); } - LOGV2_DEBUG(4280511, 1, "Set local replica set config"); + LOGV2(4280511, "Set local replica set config"); } void ReplicationCoordinatorImpl::_startInitialSync( @@ -748,7 +745,7 @@ void ReplicationCoordinatorImpl::_startInitialSync( // InitialSyncer::startup() must be called outside lock because it uses features (eg. // setting the initial sync flag) which depend on the ReplicationCoordinatorImpl. uassertStatusOK(initialSyncerCopy->startup(opCtx, numInitialSyncAttempts.load())); - LOGV2_DEBUG(4280514, 1, "Initial sync started"); + LOGV2(4280514, "Initial sync started"); } catch (const DBException& e) { auto status = e.toStatus(); LOGV2(21327, @@ -835,7 +832,7 @@ void ReplicationCoordinatorImpl::_startDataReplication(OperationContext* opCtx) const auto needsInitialSync = lastOpTime.isNull() || _externalState->isInitialSyncFlagSet(opCtx); if (!needsInitialSync) { - LOGV2_DEBUG(4280512, 1, "No initial sync required. Attempting to begin steady replication"); + LOGV2(4280512, "No initial sync required. 
Attempting to begin steady replication"); // Start steady replication, since we already have data. // ReplSetConfig has been installed, so it's either in STARTUP2 or REMOVED. auto memberState = getMemberState(); @@ -848,7 +845,7 @@ void ReplicationCoordinatorImpl::_startDataReplication(OperationContext* opCtx) return; } - LOGV2_DEBUG(4280513, 1, "Initial sync required. Attempting to start initial sync..."); + LOGV2(4280513, "Initial sync required. Attempting to start initial sync..."); // Do initial sync. if (!_externalState->getTaskExecutor()) { LOGV2(21323, "Not running initial sync during test"); @@ -1173,6 +1170,7 @@ void ReplicationCoordinatorImpl::signalDrainComplete(OperationContext* opCtx, stdx::unique_lock<Latch> lk(_mutex); if (_applierState != ApplierState::Draining) { + LOGV2(6015306, "Applier already left draining state, exiting."); return; } lk.unlock(); @@ -1197,6 +1195,8 @@ void ReplicationCoordinatorImpl::signalDrainComplete(OperationContext* opCtx, // current term, and we're allowed to become the writable primary. 
if (_applierState != ApplierState::Draining || !_topCoord->canCompleteTransitionToPrimary(termWhenBufferIsEmpty)) { + LOGV2(6015308, + "Applier left draining state or not allowed to become writeable primary, exiting"); return; } _applierState = ApplierState::Stopped; @@ -1257,6 +1257,7 @@ void ReplicationCoordinatorImpl::signalDrainComplete(OperationContext* opCtx, hangAfterReconfigOnDrainComplete.pauseWhileSet(opCtx); } + LOGV2(6015310, "Starting to transition to primary."); AllowNonLocalWritesBlock writesAllowed(opCtx); OpTime firstOpTime = _externalState->onTransitionToPrimary(opCtx); ReplicaSetAwareServiceRegistry::get(_service).onStepUpComplete(opCtx, @@ -3458,6 +3459,7 @@ Status ReplicationCoordinatorImpl::_doReplSetReconfig(OperationContext* opCtx, "_rsConfigState"_attr = int(_rsConfigState)); } + LOGV2(6015313, "Replication config state is Steady, starting reconfig"); invariant(_rsConfig.isInitialized()); if (!force && !_readWriteAbility->canAcceptNonLocalWrites(lk) && !skipSafetyChecks) { @@ -3621,6 +3623,7 @@ Status ReplicationCoordinatorImpl::_doReplSetReconfig(OperationContext* opCtx, // Always debug assert if we reach this point. dassert(!skipSafetyChecks); } + LOGV2(6015314, "Finding self in new config"); StatusWith<int> myIndexSw = force ? findSelfInConfig(_externalState.get(), newConfig, opCtx->getServiceContext()) : findSelfInConfigIfElectable( @@ -3681,6 +3684,7 @@ Status ReplicationCoordinatorImpl::_doReplSetReconfig(OperationContext* opCtx, } // Wait for durability of the new config document. JournalFlusher::get(opCtx)->waitForJournalFlush(); + LOGV2(6015315, "Persisted new config to disk"); configStateGuard.dismiss(); _finishReplSetReconfig(opCtx, newConfig, force, myIndex); @@ -3703,12 +3707,11 @@ void ReplicationCoordinatorImpl::_finishReplSetReconfig(OperationContext* opCtx, // we have already set our ReplicationCoordinatorImpl::_rsConfigState state to // "kConfigReconfiguring" which prevents new elections from happening. 
if (electionFinishedEvent) { - LOGV2_DEBUG(21354, - 2, - "Waiting for election to complete before finishing reconfig to config with " - "{configVersionAndTerm}", - "Waiting for election to complete before finishing reconfig", - "configVersionAndTerm"_attr = newConfig.getConfigVersionAndTerm()); + LOGV2(21354, + "Waiting for election to complete before finishing reconfig to config with " + "{configVersionAndTerm}", + "Waiting for election to complete before finishing reconfig", + "configVersionAndTerm"_attr = newConfig.getConfigVersionAndTerm()); // Wait for the election to complete and the node's Role to be set to follower. _replExecutor->waitForEvent(electionFinishedEvent); } @@ -3780,7 +3783,6 @@ void ReplicationCoordinatorImpl::_finishReplSetReconfig(OperationContext* opCtx, _clearCommittedSnapshot_inlock(); } - lk.unlock(); _performPostMemberStateUpdateAction(action); } @@ -4105,11 +4107,38 @@ void ReplicationCoordinatorImpl::_finishReplSetInitiate(OperationContext* opCtx, void ReplicationCoordinatorImpl::_setConfigState_inlock(ConfigState newState) { if (newState != _rsConfigState) { + LOGV2(6015317, + "Setting new configuration state", + "newState"_attr = getConfigStateString(newState), + "oldState"_attr = getConfigStateString(_rsConfigState)); _rsConfigState = newState; _rsConfigStateChange.notify_all(); } } +std::string ReplicationCoordinatorImpl::getConfigStateString(ConfigState state) { + switch (state) { + case kConfigPreStart: + return "ConfigPreStart"; + case kConfigStartingUp: + return "ConfigStartingUp"; + case kConfigReplicationDisabled: + return "ConfigReplicationDisabled"; + case kConfigUninitialized: + return "ConfigUninitialized"; + case kConfigSteady: + return "ConfigSteady"; + case kConfigInitiating: + return "ConfigInitiating"; + case kConfigReconfiguring: + return "ConfigReconfiguring"; + case kConfigHBReconfiguring: + return "ConfigHBReconfiguring"; + default: + MONGO_UNREACHABLE; + } +} + void 
ReplicationCoordinatorImpl::_errorOnPromisesIfHorizonChanged(WithLock lk, OperationContext* opCtx, const ReplSetConfig& oldConfig, @@ -4378,6 +4407,7 @@ void ReplicationCoordinatorImpl::CatchupState::start_inlock() { // No catchup in single node replica set. if (_repl->_rsConfig.getNumMembers() == 1) { + LOGV2(6015304, "Skipping primary catchup since we are the only node in the replica set."); abort_inlock(PrimaryCatchUpConclusionReason::kSkipped); return; } @@ -4447,6 +4477,10 @@ void ReplicationCoordinatorImpl::CatchupState::signalHeartbeatUpdate_inlock() { auto targetOpTime = _repl->_topCoord->latestKnownOpTimeSinceHeartbeatRestart(); // Haven't collected all heartbeat responses. if (!targetOpTime) { + LOGV2_DEBUG( + 6015305, + 1, + "Not updating target optime for catchup, we haven't collected all heartbeat responses"); return; } @@ -5542,6 +5576,7 @@ Status ReplicationCoordinatorImpl::updateTerm(OperationContext* opCtx, long long // Wait for potential stepdown to finish. if (finishEvh.isValid()) { + LOGV2(6015302, "Waiting for potential stepdown to complete before finishing term update"); _replExecutor->waitForEvent(finishEvh); } if (updateTermResult == TopologyCoordinator::UpdateTermResult::kUpdatedTerm || @@ -5775,6 +5810,7 @@ Status ReplicationCoordinatorImpl::stepUpIfEligible(bool skipDryRun) { } } if (finishEvent.isValid()) { + LOGV2(6015303, "Waiting for in-progress election to complete before finishing stepup"); _replExecutor->waitForEvent(finishEvent); } { diff --git a/src/mongo/db/repl/replication_coordinator_impl.h b/src/mongo/db/repl/replication_coordinator_impl.h index 86aa18dd34e..b0e9fb0cba9 100644 --- a/src/mongo/db/repl/replication_coordinator_impl.h +++ b/src/mongo/db/repl/replication_coordinator_impl.h @@ -1164,6 +1164,11 @@ private: void _setConfigState_inlock(ConfigState newState); /** + * Returns the string representation of the config state. 
+ */ + static std::string getConfigStateString(ConfigState state); + + /** * Returns true if the horizon mappings between the oldConfig and newConfig are different. */ void _errorOnPromisesIfHorizonChanged(WithLock lk, diff --git a/src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp b/src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp index 68c619af21e..0c05b9f4a0c 100644 --- a/src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp @@ -335,6 +335,9 @@ void ReplicationCoordinatorImpl::ElectionState::_writeLastVoteForMyElection( auto opCtx = cc().makeOperationContext(); // Any writes that occur as part of an election should not be subject to Flow Control. opCtx->setShouldParticipateInFlowControl(false); + LOGV2(6015300, + "Storing last vote document in local storage for my election", + "lastVote"_attr = lastVote); return _repl->_externalState->storeLocalLastVoteDocument(opCtx.get(), lastVote); }(); @@ -345,6 +348,7 @@ void ReplicationCoordinatorImpl::ElectionState::_writeLastVoteForMyElection( stdx::lock_guard<Latch> lk(_repl->_mutex); LoseElectionDryRunGuardV1 lossGuard(_repl); if (status == ErrorCodes::CallbackCanceled) { + LOGV2(6015301, "Callback for storing last vote got cancelled"); return; } |