commit metadata
author:    Wenbin Zhu <wenbin.zhu@mongodb.com>  2021-10-04 21:51:44 +0000
committer: Evergreen Agent <no-reply@evergreen.mongodb.com>  2021-10-04 23:00:03 +0000
commit:    e097735040d077adb8b03a061d65c315750277d3 (patch)
tree:      c4e645a2f9e57da91b1d20fbf3dadcb19a645538 /src
parent:    217285dd2d3689aaaf850adefc28aa8edbd7a957 (diff)
download:  mongo-e097735040d077adb8b03a061d65c315750277d3.tar.gz
SERVER-60153 Replenish INFO level logging for elections and repl startup.
Diffstat (limited to 'src')
5 files changed, 73 insertions, 25 deletions
diff --git a/src/mongo/db/kill_sessions_local.cpp b/src/mongo/db/kill_sessions_local.cpp index d1570e2f2e9..f5070365d4e 100644 --- a/src/mongo/db/kill_sessions_local.cpp +++ b/src/mongo/db/kill_sessions_local.cpp @@ -190,6 +190,7 @@ void yieldLocksForPreparedTransactions(OperationContext* opCtx) { // Scan the sessions again to get the list of all sessions with prepared transaction // to yield their locks. + LOGV2(6015318, "Yielding locks for prepared transactions."); SessionKiller::Matcher matcherAllSessions( KillAllSessionsByPatternSet{makeKillAllSessionsByPattern(newOpCtx.get())}); killSessionsAction(newOpCtx.get(), @@ -222,6 +223,7 @@ void invalidateSessionsForStepdown(OperationContext* opCtx) { // It is illegal to invalidate the sessions if the operation has a session checked out. invariant(!OperationContextSession::get(opCtx)); + LOGV2(6015319, "Invalidating sessions for stepdown."); SessionKiller::Matcher matcherAllSessions( KillAllSessionsByPatternSet{makeKillAllSessionsByPattern(opCtx)}); killSessionsAction(opCtx, diff --git a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp index 3f4ace4b665..605593200b0 100644 --- a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp +++ b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp @@ -525,6 +525,7 @@ OpTime ReplicationCoordinatorExternalStateImpl::onTransitionToPrimary(OperationC // exclusive mode. 
_replicationProcess->getConsistencyMarkers()->clearAppliedThrough(opCtx, Timestamp()); + LOGV2(6015309, "Logging transition to primary to oplog on stepup"); writeConflictRetry(opCtx, "logging transition to primary to oplog", "local.oplog.rs", [&] { AutoGetOplog oplogWrite(opCtx, OplogAccessMode::kWrite); WriteUnitOfWork wuow(opCtx); diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp index f9ea5fbc72a..6f42076d2ec 100644 --- a/src/mongo/db/repl/replication_coordinator_impl.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl.cpp @@ -421,7 +421,7 @@ void ReplicationCoordinatorImpl::appendConnectionStats(executor::ConnectionPoolS bool ReplicationCoordinatorImpl::_startLoadLocalConfig( OperationContext* opCtx, StorageEngine::LastShutdownState lastShutdownState) { - LOGV2_DEBUG(4280500, 1, "Attempting to create internal replication collections"); + LOGV2(4280500, "Attempting to create internal replication collections"); // Create necessary replication collections to guarantee that if a checkpoint sees data after // initial sync has completed, it also sees these collections. fassert(50708, _replicationProcess->getConsistencyMarkers()->createInternalCollections(opCtx)); @@ -434,7 +434,7 @@ bool ReplicationCoordinatorImpl::_startLoadLocalConfig( fassert(51240, _externalState->createLocalLastVoteCollection(opCtx)); - LOGV2_DEBUG(4280501, 1, "Attempting to load local voted for document"); + LOGV2(4280501, "Attempting to load local voted for document"); StatusWith<LastVote> lastVote = _externalState->loadLocalLastVoteDocument(opCtx); if (!lastVote.isOK()) { LOGV2_FATAL_NOTRACE(40367, @@ -451,7 +451,7 @@ bool ReplicationCoordinatorImpl::_startLoadLocalConfig( _topCoord->loadLastVote(lastVote.getValue()); } - LOGV2_DEBUG(4280502, 1, "Searching for local Rollback ID document"); + LOGV2(4280502, "Searching for local Rollback ID document"); // Check that we have a local Rollback ID. 
If we do not have one, create one. auto status = _replicationProcess->refreshRollbackID(opCtx); if (!status.isOK()) { @@ -507,16 +507,14 @@ bool ReplicationCoordinatorImpl::_startLoadLocalConfig( "config"_attr = cfg.getValue()); } - LOGV2_DEBUG( - 4280504, 1, "Cleaning up any partially applied oplog batches & reading last op from oplog"); + LOGV2(4280504, "Cleaning up any partially applied oplog batches & reading last op from oplog"); // Read the last op from the oplog after cleaning up any partially applied batches. const auto stableTimestamp = boost::none; _replicationProcess->getReplicationRecovery()->recoverFromOplog(opCtx, stableTimestamp); - LOGV2_DEBUG(4280505, - 1, - "Creating any necessary TenantMigrationAccessBlockers for unfinished migrations"); + LOGV2(4280505, + "Creating any necessary TenantMigrationAccessBlockers for unfinished migrations"); tenant_migration_access_blocker::recoverTenantMigrationAccessBlockers(opCtx); - LOGV2_DEBUG(4280506, 1, "Reconstructing prepared transactions"); + LOGV2(4280506, "Reconstructing prepared transactions"); reconstructPreparedTransactions(opCtx, OplogApplication::Mode::kRecovering); const auto lastOpTimeAndWallTimeResult = _externalState->loadLastOpTimeAndWallTime(opCtx); @@ -535,7 +533,7 @@ bool ReplicationCoordinatorImpl::_startLoadLocalConfig( stdx::lock_guard<Latch> lk(_mutex); _finishLoadLocalConfigCbh = std::move(handle.getValue()); - LOGV2_DEBUG(4280507, 1, "Loaded replica set config, scheduled callback to set local config"); + LOGV2(4280507, "Loaded replica set config, scheduled callback to set local config"); return false; } @@ -564,8 +562,7 @@ void ReplicationCoordinatorImpl::_finishLoadLocalConfig( return; } - LOGV2_DEBUG( - 4280508, 1, "Attempting to set local replica set config; validating config for startup"); + LOGV2(4280508, "Attempting to set local replica set config; validating config for startup"); StatusWith<int> myIndex = validateConfigForStartUp(_externalState.get(), localConfig, 
getServiceContext()); if (!myIndex.isOK()) { @@ -606,7 +603,7 @@ void ReplicationCoordinatorImpl::_finishLoadLocalConfig( "commandLineSetName"_attr = _settings.ourSetName()); myIndex = StatusWith<int>(-1); } - LOGV2_DEBUG(4280509, 1, "Local configuration validated for startup"); + LOGV2(4280509, "Local configuration validated for startup"); // Do not check optime, if this node is an arbiter. bool isArbiter = @@ -689,14 +686,14 @@ void ReplicationCoordinatorImpl::_finishLoadLocalConfig( // Step down is impossible, so we don't need to wait for the returned event. _updateTerm_inlock(term); } - LOGV2_DEBUG(21320, 1, "Current term is now {term}", "Updated term", "term"_attr = term); + LOGV2(21320, "Current term is now {term}", "Updated term", "term"_attr = term); _performPostMemberStateUpdateAction(action); if (!isArbiter && myIndex.getValue() != -1) { _externalState->startThreads(); _startDataReplication(opCtx.get()); } - LOGV2_DEBUG(4280511, 1, "Set local replica set config"); + LOGV2(4280511, "Set local replica set config"); } void ReplicationCoordinatorImpl::_startInitialSync( @@ -748,7 +745,7 @@ void ReplicationCoordinatorImpl::_startInitialSync( // InitialSyncer::startup() must be called outside lock because it uses features (eg. // setting the initial sync flag) which depend on the ReplicationCoordinatorImpl. uassertStatusOK(initialSyncerCopy->startup(opCtx, numInitialSyncAttempts.load())); - LOGV2_DEBUG(4280514, 1, "Initial sync started"); + LOGV2(4280514, "Initial sync started"); } catch (const DBException& e) { auto status = e.toStatus(); LOGV2(21327, @@ -835,7 +832,7 @@ void ReplicationCoordinatorImpl::_startDataReplication(OperationContext* opCtx) const auto needsInitialSync = lastOpTime.isNull() || _externalState->isInitialSyncFlagSet(opCtx); if (!needsInitialSync) { - LOGV2_DEBUG(4280512, 1, "No initial sync required. Attempting to begin steady replication"); + LOGV2(4280512, "No initial sync required. 
Attempting to begin steady replication"); // Start steady replication, since we already have data. // ReplSetConfig has been installed, so it's either in STARTUP2 or REMOVED. auto memberState = getMemberState(); @@ -848,7 +845,7 @@ void ReplicationCoordinatorImpl::_startDataReplication(OperationContext* opCtx) return; } - LOGV2_DEBUG(4280513, 1, "Initial sync required. Attempting to start initial sync..."); + LOGV2(4280513, "Initial sync required. Attempting to start initial sync..."); // Do initial sync. if (!_externalState->getTaskExecutor()) { LOGV2(21323, "Not running initial sync during test"); @@ -1173,6 +1170,7 @@ void ReplicationCoordinatorImpl::signalDrainComplete(OperationContext* opCtx, stdx::unique_lock<Latch> lk(_mutex); if (_applierState != ApplierState::Draining) { + LOGV2(6015306, "Applier already left draining state, exiting."); return; } lk.unlock(); @@ -1197,6 +1195,8 @@ void ReplicationCoordinatorImpl::signalDrainComplete(OperationContext* opCtx, // current term, and we're allowed to become the writable primary. 
if (_applierState != ApplierState::Draining || !_topCoord->canCompleteTransitionToPrimary(termWhenBufferIsEmpty)) { + LOGV2(6015308, + "Applier left draining state or not allowed to become writeable primary, exiting"); return; } _applierState = ApplierState::Stopped; @@ -1257,6 +1257,7 @@ void ReplicationCoordinatorImpl::signalDrainComplete(OperationContext* opCtx, hangAfterReconfigOnDrainComplete.pauseWhileSet(opCtx); } + LOGV2(6015310, "Starting to transition to primary."); AllowNonLocalWritesBlock writesAllowed(opCtx); OpTime firstOpTime = _externalState->onTransitionToPrimary(opCtx); ReplicaSetAwareServiceRegistry::get(_service).onStepUpComplete(opCtx, @@ -3458,6 +3459,7 @@ Status ReplicationCoordinatorImpl::_doReplSetReconfig(OperationContext* opCtx, "_rsConfigState"_attr = int(_rsConfigState)); } + LOGV2(6015313, "Replication config state is Steady, starting reconfig"); invariant(_rsConfig.isInitialized()); if (!force && !_readWriteAbility->canAcceptNonLocalWrites(lk) && !skipSafetyChecks) { @@ -3621,6 +3623,7 @@ Status ReplicationCoordinatorImpl::_doReplSetReconfig(OperationContext* opCtx, // Always debug assert if we reach this point. dassert(!skipSafetyChecks); } + LOGV2(6015314, "Finding self in new config"); StatusWith<int> myIndexSw = force ? findSelfInConfig(_externalState.get(), newConfig, opCtx->getServiceContext()) : findSelfInConfigIfElectable( @@ -3681,6 +3684,7 @@ Status ReplicationCoordinatorImpl::_doReplSetReconfig(OperationContext* opCtx, } // Wait for durability of the new config document. JournalFlusher::get(opCtx)->waitForJournalFlush(); + LOGV2(6015315, "Persisted new config to disk"); configStateGuard.dismiss(); _finishReplSetReconfig(opCtx, newConfig, force, myIndex); @@ -3703,12 +3707,11 @@ void ReplicationCoordinatorImpl::_finishReplSetReconfig(OperationContext* opCtx, // we have already set our ReplicationCoordinatorImpl::_rsConfigState state to // "kConfigReconfiguring" which prevents new elections from happening. 
if (electionFinishedEvent) { - LOGV2_DEBUG(21354, - 2, - "Waiting for election to complete before finishing reconfig to config with " - "{configVersionAndTerm}", - "Waiting for election to complete before finishing reconfig", - "configVersionAndTerm"_attr = newConfig.getConfigVersionAndTerm()); + LOGV2(21354, + "Waiting for election to complete before finishing reconfig to config with " + "{configVersionAndTerm}", + "Waiting for election to complete before finishing reconfig", + "configVersionAndTerm"_attr = newConfig.getConfigVersionAndTerm()); // Wait for the election to complete and the node's Role to be set to follower. _replExecutor->waitForEvent(electionFinishedEvent); } @@ -3780,7 +3783,6 @@ void ReplicationCoordinatorImpl::_finishReplSetReconfig(OperationContext* opCtx, _clearCommittedSnapshot_inlock(); } - lk.unlock(); _performPostMemberStateUpdateAction(action); } @@ -4105,11 +4107,38 @@ void ReplicationCoordinatorImpl::_finishReplSetInitiate(OperationContext* opCtx, void ReplicationCoordinatorImpl::_setConfigState_inlock(ConfigState newState) { if (newState != _rsConfigState) { + LOGV2(6015317, + "Setting new configuration state", + "newState"_attr = getConfigStateString(newState), + "oldState"_attr = getConfigStateString(_rsConfigState)); _rsConfigState = newState; _rsConfigStateChange.notify_all(); } } +std::string ReplicationCoordinatorImpl::getConfigStateString(ConfigState state) { + switch (state) { + case kConfigPreStart: + return "ConfigPreStart"; + case kConfigStartingUp: + return "ConfigStartingUp"; + case kConfigReplicationDisabled: + return "ConfigReplicationDisabled"; + case kConfigUninitialized: + return "ConfigUninitialized"; + case kConfigSteady: + return "ConfigSteady"; + case kConfigInitiating: + return "ConfigInitiating"; + case kConfigReconfiguring: + return "ConfigReconfiguring"; + case kConfigHBReconfiguring: + return "ConfigHBReconfiguring"; + default: + MONGO_UNREACHABLE; + } +} + void 
ReplicationCoordinatorImpl::_errorOnPromisesIfHorizonChanged(WithLock lk, OperationContext* opCtx, const ReplSetConfig& oldConfig, @@ -4378,6 +4407,7 @@ void ReplicationCoordinatorImpl::CatchupState::start_inlock() { // No catchup in single node replica set. if (_repl->_rsConfig.getNumMembers() == 1) { + LOGV2(6015304, "Skipping primary catchup since we are the only node in the replica set."); abort_inlock(PrimaryCatchUpConclusionReason::kSkipped); return; } @@ -4447,6 +4477,10 @@ void ReplicationCoordinatorImpl::CatchupState::signalHeartbeatUpdate_inlock() { auto targetOpTime = _repl->_topCoord->latestKnownOpTimeSinceHeartbeatRestart(); // Haven't collected all heartbeat responses. if (!targetOpTime) { + LOGV2_DEBUG( + 6015305, + 1, + "Not updating target optime for catchup, we haven't collected all heartbeat responses"); return; } @@ -5542,6 +5576,7 @@ Status ReplicationCoordinatorImpl::updateTerm(OperationContext* opCtx, long long // Wait for potential stepdown to finish. if (finishEvh.isValid()) { + LOGV2(6015302, "Waiting for potential stepdown to complete before finishing term update"); _replExecutor->waitForEvent(finishEvh); } if (updateTermResult == TopologyCoordinator::UpdateTermResult::kUpdatedTerm || @@ -5775,6 +5810,7 @@ Status ReplicationCoordinatorImpl::stepUpIfEligible(bool skipDryRun) { } } if (finishEvent.isValid()) { + LOGV2(6015303, "Waiting for in-progress election to complete before finishing stepup"); _replExecutor->waitForEvent(finishEvent); } { diff --git a/src/mongo/db/repl/replication_coordinator_impl.h b/src/mongo/db/repl/replication_coordinator_impl.h index 86aa18dd34e..b0e9fb0cba9 100644 --- a/src/mongo/db/repl/replication_coordinator_impl.h +++ b/src/mongo/db/repl/replication_coordinator_impl.h @@ -1164,6 +1164,11 @@ private: void _setConfigState_inlock(ConfigState newState); /** + * Returns the string representation of the config state. 
+ */ + static std::string getConfigStateString(ConfigState state); + + /** * Returns true if the horizon mappings between the oldConfig and newConfig are different. */ void _errorOnPromisesIfHorizonChanged(WithLock lk, diff --git a/src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp b/src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp index 68c619af21e..0c05b9f4a0c 100644 --- a/src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp @@ -335,6 +335,9 @@ void ReplicationCoordinatorImpl::ElectionState::_writeLastVoteForMyElection( auto opCtx = cc().makeOperationContext(); // Any writes that occur as part of an election should not be subject to Flow Control. opCtx->setShouldParticipateInFlowControl(false); + LOGV2(6015300, + "Storing last vote document in local storage for my election", + "lastVote"_attr = lastVote); return _repl->_externalState->storeLocalLastVoteDocument(opCtx.get(), lastVote); }(); @@ -345,6 +348,7 @@ void ReplicationCoordinatorImpl::ElectionState::_writeLastVoteForMyElection( stdx::lock_guard<Latch> lk(_repl->_mutex); LoseElectionDryRunGuardV1 lossGuard(_repl); if (status == ErrorCodes::CallbackCanceled) { + LOGV2(6015301, "Callback for storing last vote got cancelled"); return; } |