From a574d23ec0b7d06b8d872bf64136308f541a796d Mon Sep 17 00:00:00 2001 From: "A. Jesse Jiryu Davis" Date: Wed, 2 Dec 2020 13:52:21 -0500 Subject: SERVER-53026 Fix "resync" command --- src/mongo/db/repl/replication_coordinator_impl.cpp | 20 ++++++++++++++------ src/mongo/db/repl/replication_coordinator_impl.h | 3 ++- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp index 22699a68a6f..fdc10af2565 100644 --- a/src/mongo/db/repl/replication_coordinator_impl.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl.cpp @@ -633,6 +633,11 @@ void ReplicationCoordinatorImpl::_stopDataReplication(OperationContext* opCtx) { std::shared_ptr initialSyncerCopy; { stdx::lock_guard lk(_mutex); + if (!_startedSteadyStateReplication) { + return; + } + + _startedSteadyStateReplication = false; _initialSyncer.swap(initialSyncerCopy); } if (initialSyncerCopy) { @@ -652,20 +657,24 @@ void ReplicationCoordinatorImpl::_stopDataReplication(OperationContext* opCtx) { void ReplicationCoordinatorImpl::_startDataReplication(OperationContext* opCtx, stdx::function startCompleted) { - if (_startedSteadyStateReplication.swap(true)) { - // This is not the first call. + stdx::unique_lock lk(_mutex); + if (_startedSteadyStateReplication) { return; } + _startedSteadyStateReplication = true; + // Check to see if we need to do an initial sync. - const auto lastOpTime = getMyLastAppliedOpTime(); + const auto lastOpTime = _getMyLastAppliedOpTime_inlock(); const auto needsInitialSync = lastOpTime.isNull() || _externalState->isInitialSyncFlagSet(opCtx); if (!needsInitialSync) { // Start steady replication, since we already have data. // ReplSetConfig has been installed, so it's either in STARTUP2 or REMOVED. - auto memberState = getMemberState(); + auto memberState = _getMemberState_inlock(); invariant(memberState.startup2() || memberState.removed()); + + lk.unlock(); invariantOK(setFollowerMode(MemberState::RS_RECOVERING)); _externalState->startSteadyStateReplication(opCtx, this); return; @@ -719,8 +728,6 @@ void ReplicationCoordinatorImpl::_startDataReplication(OperationContext* opCtx, std::shared_ptr initialSyncerCopy; try { { - // Must take the lock to set _initialSyncer, but not call it. - stdx::lock_guard lock(_mutex); initialSyncerCopy = std::make_shared( createInitialSyncerOptions(this, _externalState.get()), stdx::make_unique(this, @@ -732,6 +739,7 @@ void ReplicationCoordinatorImpl::_startDataReplication(OperationContext* opCtx, } // InitialSyncer::startup() must be called outside lock because it uses features (eg. // setting the initial sync flag) which depend on the ReplicationCoordinatorImpl. + lk.unlock(); uassertStatusOK(initialSyncerCopy->startup(opCtx, numInitialSyncAttempts.load())); } catch (...) { auto status = exceptionToStatus(); diff --git a/src/mongo/db/repl/replication_coordinator_impl.h b/src/mongo/db/repl/replication_coordinator_impl.h index 105269b8855..cd690f6d4df 100644 --- a/src/mongo/db/repl/replication_coordinator_impl.h +++ b/src/mongo/db/repl/replication_coordinator_impl.h @@ -1383,7 +1383,8 @@ private: // here so we can update our term to match as part of finishing stepdown. boost::optional _pendingTermUpdateDuringStepDown; // (M) - AtomicWord _startedSteadyStateReplication{false}; + // Whether data replication is active. + bool _startedSteadyStateReplication = false; // (M) // If we're in terminal shutdown. If true, we'll refuse to vote in elections. bool _inTerminalShutdown = false; // (M) -- cgit v1.2.1