diff options
author | Huayu Ouyang <huayu.ouyang@mongodb.com> | 2021-09-29 14:41:12 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2021-10-01 20:58:11 +0000 |
commit | 5aca1178efce73d99a33c753b22040f72259f022 (patch) | |
tree | d28b798ada078b7a7081f1892ede05466a8977e9 | |
parent | 16682cbb1eacfd5fc80d07582105938cd5cb5e91 (diff) | |
download | mongo-5aca1178efce73d99a33c753b22040f72259f022.tar.gz |
SERVER-57806 Handle fallback to logical initial sync if FileCopyBasedInitialSync fails for lack of sync sources
-rw-r--r-- | buildscripts/resmokeconfig/suites/replica_sets.yml | 1 | ||||
-rw-r--r-- | buildscripts/resmokeconfig/suites/replica_sets_auth.yml | 1 | ||||
-rw-r--r-- | buildscripts/resmokeconfig/suites/replica_sets_ese.yml | 1 | ||||
-rw-r--r-- | buildscripts/resmokeconfig/suites/replica_sets_ese_gcm.yml | 1 | ||||
-rw-r--r-- | src/mongo/db/repl/initial_syncer.cpp | 4 | ||||
-rw-r--r-- | src/mongo/db/repl/initial_syncer.h | 2 | ||||
-rw-r--r-- | src/mongo/db/repl/initial_syncer_interface.h | 5 | ||||
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_impl.cpp | 182 | ||||
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_impl.h | 18 |
9 files changed, 135 insertions, 80 deletions
diff --git a/buildscripts/resmokeconfig/suites/replica_sets.yml b/buildscripts/resmokeconfig/suites/replica_sets.yml index 0bceeaa4abf..cdaf2b828ef 100644 --- a/buildscripts/resmokeconfig/suites/replica_sets.yml +++ b/buildscripts/resmokeconfig/suites/replica_sets.yml @@ -3,6 +3,7 @@ test_kind: js_test selector: roots: - jstests/replsets/*.js + - src/mongo/db/modules/enterprise/jstests/fcbis/*.js exclude_files: executor: diff --git a/buildscripts/resmokeconfig/suites/replica_sets_auth.yml b/buildscripts/resmokeconfig/suites/replica_sets_auth.yml index 7f056891902..57971907916 100644 --- a/buildscripts/resmokeconfig/suites/replica_sets_auth.yml +++ b/buildscripts/resmokeconfig/suites/replica_sets_auth.yml @@ -8,6 +8,7 @@ test_kind: js_test selector: roots: - jstests/replsets/*.js + - src/mongo/db/modules/enterprise/jstests/fcbis/*.js exclude_files: # Skip any tests that run with auth explicitly. - jstests/replsets/*[aA]uth*.js diff --git a/buildscripts/resmokeconfig/suites/replica_sets_ese.yml b/buildscripts/resmokeconfig/suites/replica_sets_ese.yml index f816fc49e8f..ddd9853c958 100644 --- a/buildscripts/resmokeconfig/suites/replica_sets_ese.yml +++ b/buildscripts/resmokeconfig/suites/replica_sets_ese.yml @@ -7,6 +7,7 @@ test_kind: js_test selector: roots: - jstests/replsets/*.js + - src/mongo/db/modules/enterprise/jstests/fcbis/*.js executor: config: diff --git a/buildscripts/resmokeconfig/suites/replica_sets_ese_gcm.yml b/buildscripts/resmokeconfig/suites/replica_sets_ese_gcm.yml index edc637e393c..1b3180e9b8d 100644 --- a/buildscripts/resmokeconfig/suites/replica_sets_ese_gcm.yml +++ b/buildscripts/resmokeconfig/suites/replica_sets_ese_gcm.yml @@ -7,6 +7,7 @@ test_kind: js_test selector: roots: - jstests/replsets/*.js + - src/mongo/db/modules/enterprise/jstests/fcbis/*.js executor: config: diff --git a/src/mongo/db/repl/initial_syncer.cpp b/src/mongo/db/repl/initial_syncer.cpp index a404f7b6ccf..f4810d8ce3d 100644 --- a/src/mongo/db/repl/initial_syncer.cpp +++ b/src/mongo/db/repl/initial_syncer.cpp @@ -271,6 +271,10 @@ bool InitialSyncer::_isActive_inlock() const { return State::kRunning == _state || State::kShuttingDown == _state; } +std::string InitialSyncer::getInitialSyncMethod() const { + return "logical"; +} + Status InitialSyncer::startup(OperationContext* opCtx, std::uint32_t initialSyncMaxAttempts) noexcept { invariant(opCtx); diff --git a/src/mongo/db/repl/initial_syncer.h b/src/mongo/db/repl/initial_syncer.h index 7ac0f6d3b66..566bde4919f 100644 --- a/src/mongo/db/repl/initial_syncer.h +++ b/src/mongo/db/repl/initial_syncer.h @@ -160,6 +160,8 @@ public: */ bool isActive() const; + std::string getInitialSyncMethod() const final; + Status startup(OperationContext* opCtx, std::uint32_t maxAttempts) noexcept final; Status shutdown() final; diff --git a/src/mongo/db/repl/initial_syncer_interface.h b/src/mongo/db/repl/initial_syncer_interface.h index 1886296214d..9bf683e99b8 100644 --- a/src/mongo/db/repl/initial_syncer_interface.h +++ b/src/mongo/db/repl/initial_syncer_interface.h @@ -116,6 +116,11 @@ public: * Cancels the current initial sync attempt if the initial syncer is active. */ virtual void cancelCurrentAttempt() = 0; + + /** + * Returns the initial sync method that this initial syncer is using. + */ + virtual std::string getInitialSyncMethod() const = 0; }; } // namespace repl diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp index 7414e39a3bf..01d22fd8319 100644 --- a/src/mongo/db/repl/replication_coordinator_impl.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl.cpp @@ -699,83 +699,10 @@ void ReplicationCoordinatorImpl::_finishLoadLocalConfig( LOGV2_DEBUG(4280511, 1, "Set local replica set config"); } -void ReplicationCoordinatorImpl::_startDataReplication(OperationContext* opCtx, - std::function<void()> startCompleted) { - if (_startedSteadyStateReplication.swap(true)) { - // This is not the first call. - return; - } - - // Check to see if we need to do an initial sync. - const auto lastOpTime = getMyLastAppliedOpTime(); - const auto needsInitialSync = - lastOpTime.isNull() || _externalState->isInitialSyncFlagSet(opCtx); - if (!needsInitialSync) { - LOGV2_DEBUG(4280512, 1, "No initial sync required. Attempting to begin steady replication"); - // Start steady replication, since we already have data. - // ReplSetConfig has been installed, so it's either in STARTUP2 or REMOVED. - auto memberState = getMemberState(); - invariant(memberState.startup2() || memberState.removed()); - invariant(setFollowerMode(MemberState::RS_RECOVERING)); - // Set an initial sync ID, in case we were upgraded or restored from backup without doing - // an initial sync. - _replicationProcess->getConsistencyMarkers()->setInitialSyncIdIfNotSet(opCtx); - _externalState->startSteadyStateReplication(opCtx, this); - return; - } - - LOGV2_DEBUG(4280513, 1, "Initial sync required. Attempting to start initial sync..."); - // Do initial sync. - if (!_externalState->getTaskExecutor()) { - LOGV2(21323, "Not running initial sync during test"); - return; - } - - auto onCompletion = [this, startCompleted](const StatusWith<OpTimeAndWallTime>& opTimeStatus) { - { - stdx::lock_guard<Latch> lock(_mutex); - if (opTimeStatus == ErrorCodes::CallbackCanceled) { - LOGV2(21324, - "Initial Sync has been cancelled: {error}", - "Initial Sync has been cancelled", - "error"_attr = opTimeStatus.getStatus()); - return; - } else if (!opTimeStatus.isOK()) { - if (_inShutdown) { - LOGV2(21325, - "Initial Sync failed during shutdown due to {error}", - "Initial Sync failed during shutdown", - "error"_attr = opTimeStatus.getStatus()); - return; - } else { - LOGV2_ERROR(21416, - "Initial sync failed, shutting down now. Restart the server " - "to attempt a new initial sync"); - fassertFailedWithStatusNoTrace(40088, opTimeStatus.getStatus()); - } - } - - const auto lastApplied = opTimeStatus.getValue(); - _setMyLastAppliedOpTimeAndWallTime(lock, lastApplied, false); - - _topCoord->resetMaintenanceCount(); - } - - if (startCompleted) { - startCompleted(); - } - // Transition from STARTUP2 to RECOVERING and start the producer and the applier. - // If the member state is REMOVED, this will do nothing until we receive a config with - // ourself in it. - const auto memberState = getMemberState(); - invariant(memberState.startup2() || memberState.removed()); - invariant(setFollowerMode(MemberState::RS_RECOVERING)); - auto opCtxHolder = cc().makeOperationContext(); - _externalState->startSteadyStateReplication(opCtxHolder.get(), this); - // This log is used in tests to ensure we made it to this point. - LOGV2_DEBUG(4853000, 1, "initial sync complete."); - }; - +void ReplicationCoordinatorImpl::_startInitialSync( + OperationContext* opCtx, + InitialSyncerInterface::OnCompletionFn onCompletion, + bool fallbackToLogical) { std::shared_ptr<InitialSyncerInterface> initialSyncerCopy; try { { @@ -799,7 +726,8 @@ void ReplicationCoordinatorImpl::_startDataReplication(OperationContext* opCtx, onCompletion); }; - if (repl::feature_flags::gFileCopyBasedInitialSync.isEnabledAndIgnoreFCV()) { + if (repl::feature_flags::gFileCopyBasedInitialSync.isEnabledAndIgnoreFCV() && + !fallbackToLogical) { auto swInitialSyncer = createInitialSyncer(initialSyncMethod); if (swInitialSyncer.getStatus().code() == ErrorCodes::NotImplemented && initialSyncMethod != "logical") { @@ -834,6 +762,104 @@ void ReplicationCoordinatorImpl::_startDataReplication(OperationContext* opCtx, } } +void ReplicationCoordinatorImpl::_initialSyncerCompletionFunction( + const StatusWith<OpTimeAndWallTime>& opTimeStatus) { + { + stdx::unique_lock<Latch> lock(_mutex); + if (opTimeStatus == ErrorCodes::CallbackCanceled) { + LOGV2(21324, + "Initial Sync has been cancelled: {error}", + "Initial Sync has been cancelled", + "error"_attr = opTimeStatus.getStatus()); + return; + } else if (opTimeStatus == ErrorCodes::InvalidSyncSource && + _initialSyncer->getInitialSyncMethod() != "logical") { + LOGV2(5780600, + "Falling back to logical initial sync: {error}", + "Falling back to logical initial sync", + "error"_attr = opTimeStatus.getStatus()); + lock.unlock(); + clearSyncSourceDenylist(); + _scheduleWorkAt(_replExecutor->now(), + [=](const mongo::executor::TaskExecutor::CallbackArgs& cbData) { + _startInitialSync( + cc().makeOperationContext().get(), + [this](const StatusWith<OpTimeAndWallTime>& opTimeStatus) { + _initialSyncerCompletionFunction(opTimeStatus); + }, + true /* fallbackToLogical */); + }); + return; + } else if (!opTimeStatus.isOK()) { + if (_inShutdown) { + LOGV2(21325, + "Initial Sync failed during shutdown due to {error}", + "Initial Sync failed during shutdown", + "error"_attr = opTimeStatus.getStatus()); + return; + } else { + LOGV2_ERROR(21416, + "Initial sync failed, shutting down now. Restart the server " + "to attempt a new initial sync"); + fassertFailedWithStatusNoTrace(40088, opTimeStatus.getStatus()); + } + } + + + const auto lastApplied = opTimeStatus.getValue(); + _setMyLastAppliedOpTimeAndWallTime(lock, lastApplied, false); + + _topCoord->resetMaintenanceCount(); + } + + // Transition from STARTUP2 to RECOVERING and start the producer and the applier. + // If the member state is REMOVED, this will do nothing until we receive a config with + // ourself in it. + const auto memberState = getMemberState(); + invariant(memberState.startup2() || memberState.removed()); + invariant(setFollowerMode(MemberState::RS_RECOVERING)); + auto opCtxHolder = cc().makeOperationContext(); + _externalState->startSteadyStateReplication(opCtxHolder.get(), this); + // This log is used in tests to ensure we made it to this point. + LOGV2_DEBUG(4853000, 1, "initial sync complete."); +} + +void ReplicationCoordinatorImpl::_startDataReplication(OperationContext* opCtx) { + if (_startedSteadyStateReplication.swap(true)) { + // This is not the first call. + return; + } + + // Check to see if we need to do an initial sync. + const auto lastOpTime = getMyLastAppliedOpTime(); + const auto needsInitialSync = + lastOpTime.isNull() || _externalState->isInitialSyncFlagSet(opCtx); + if (!needsInitialSync) { + LOGV2_DEBUG(4280512, 1, "No initial sync required. Attempting to begin steady replication"); + // Start steady replication, since we already have data. + // ReplSetConfig has been installed, so it's either in STARTUP2 or REMOVED. + auto memberState = getMemberState(); + invariant(memberState.startup2() || memberState.removed()); + invariant(setFollowerMode(MemberState::RS_RECOVERING)); + // Set an initial sync ID, in case we were upgraded or restored from backup without doing + // an initial sync. + _replicationProcess->getConsistencyMarkers()->setInitialSyncIdIfNotSet(opCtx); + _externalState->startSteadyStateReplication(opCtx, this); + return; + } + + LOGV2_DEBUG(4280513, 1, "Initial sync required. Attempting to start initial sync..."); + // Do initial sync. + if (!_externalState->getTaskExecutor()) { + LOGV2(21323, "Not running initial sync during test"); + return; + } + + _startInitialSync(opCtx, [this](const StatusWith<OpTimeAndWallTime>& opTimeStatus) { + _initialSyncerCompletionFunction(opTimeStatus); + }); +} + void ReplicationCoordinatorImpl::startup(OperationContext* opCtx, StorageEngine::LastShutdownState lastShutdownState) { if (!isReplEnabled()) { diff --git a/src/mongo/db/repl/replication_coordinator_impl.h b/src/mongo/db/repl/replication_coordinator_impl.h index cd094f0c905..1e78979e8f8 100644 --- a/src/mongo/db/repl/replication_coordinator_impl.h +++ b/src/mongo/db/repl/replication_coordinator_impl.h @@ -1127,8 +1127,20 @@ private: /** * Start replicating data, and does an initial sync if needed first. */ - void _startDataReplication(OperationContext* opCtx, - std::function<void()> startCompleted = nullptr); + void _startDataReplication(OperationContext* opCtx); + + /** + * Start initial sync. + */ + void _startInitialSync(OperationContext* opCtx, + InitialSyncerInterface::OnCompletionFn onCompletionFn, + bool fallbackToLogical = false); + + + /** + * Function to be called on completion of initial sync. + */ + void _initialSyncerCompletionFunction(const StatusWith<OpTimeAndWallTime>& opTimeStatus); /** * Finishes the work of processReplSetInitiate() in the event of a successful quorum check. @@ -1720,6 +1732,8 @@ private: // This should be set during sharding initialization. boost::optional<bool> _wasCWWCSetOnConfigServerOnStartup; + + InitialSyncerInterface::OnCompletionFn _onCompletion; }; } // namespace repl |