summary refs log tree commit diff
diff options
context:
space:
mode:
author Huayu Ouyang <huayu.ouyang@mongodb.com> 2021-09-29 14:41:12 +0000
committer Evergreen Agent <no-reply@evergreen.mongodb.com> 2021-10-01 20:58:11 +0000
commit 5aca1178efce73d99a33c753b22040f72259f022 (patch)
tree d28b798ada078b7a7081f1892ede05466a8977e9
parent 16682cbb1eacfd5fc80d07582105938cd5cb5e91 (diff)
download mongo-5aca1178efce73d99a33c753b22040f72259f022.tar.gz
SERVER-57806 Handle fallback to logical initial sync if FileCopyBasedInitialSync fails for lack of sync sources
-rw-r--r-- buildscripts/resmokeconfig/suites/replica_sets.yml 1
-rw-r--r-- buildscripts/resmokeconfig/suites/replica_sets_auth.yml 1
-rw-r--r-- buildscripts/resmokeconfig/suites/replica_sets_ese.yml 1
-rw-r--r-- buildscripts/resmokeconfig/suites/replica_sets_ese_gcm.yml 1
-rw-r--r-- src/mongo/db/repl/initial_syncer.cpp 4
-rw-r--r-- src/mongo/db/repl/initial_syncer.h 2
-rw-r--r-- src/mongo/db/repl/initial_syncer_interface.h 5
-rw-r--r-- src/mongo/db/repl/replication_coordinator_impl.cpp 182
-rw-r--r-- src/mongo/db/repl/replication_coordinator_impl.h 18
9 files changed, 135 insertions, 80 deletions
diff --git a/buildscripts/resmokeconfig/suites/replica_sets.yml b/buildscripts/resmokeconfig/suites/replica_sets.yml
index 0bceeaa4abf..cdaf2b828ef 100644
--- a/buildscripts/resmokeconfig/suites/replica_sets.yml
+++ b/buildscripts/resmokeconfig/suites/replica_sets.yml
@@ -3,6 +3,7 @@ test_kind: js_test
selector:
roots:
- jstests/replsets/*.js
+ - src/mongo/db/modules/enterprise/jstests/fcbis/*.js
exclude_files:
executor:
diff --git a/buildscripts/resmokeconfig/suites/replica_sets_auth.yml b/buildscripts/resmokeconfig/suites/replica_sets_auth.yml
index 7f056891902..57971907916 100644
--- a/buildscripts/resmokeconfig/suites/replica_sets_auth.yml
+++ b/buildscripts/resmokeconfig/suites/replica_sets_auth.yml
@@ -8,6 +8,7 @@ test_kind: js_test
selector:
roots:
- jstests/replsets/*.js
+ - src/mongo/db/modules/enterprise/jstests/fcbis/*.js
exclude_files:
# Skip any tests that run with auth explicitly.
- jstests/replsets/*[aA]uth*.js
diff --git a/buildscripts/resmokeconfig/suites/replica_sets_ese.yml b/buildscripts/resmokeconfig/suites/replica_sets_ese.yml
index f816fc49e8f..ddd9853c958 100644
--- a/buildscripts/resmokeconfig/suites/replica_sets_ese.yml
+++ b/buildscripts/resmokeconfig/suites/replica_sets_ese.yml
@@ -7,6 +7,7 @@ test_kind: js_test
selector:
roots:
- jstests/replsets/*.js
+ - src/mongo/db/modules/enterprise/jstests/fcbis/*.js
executor:
config:
diff --git a/buildscripts/resmokeconfig/suites/replica_sets_ese_gcm.yml b/buildscripts/resmokeconfig/suites/replica_sets_ese_gcm.yml
index edc637e393c..1b3180e9b8d 100644
--- a/buildscripts/resmokeconfig/suites/replica_sets_ese_gcm.yml
+++ b/buildscripts/resmokeconfig/suites/replica_sets_ese_gcm.yml
@@ -7,6 +7,7 @@ test_kind: js_test
selector:
roots:
- jstests/replsets/*.js
+ - src/mongo/db/modules/enterprise/jstests/fcbis/*.js
executor:
config:
diff --git a/src/mongo/db/repl/initial_syncer.cpp b/src/mongo/db/repl/initial_syncer.cpp
index a404f7b6ccf..f4810d8ce3d 100644
--- a/src/mongo/db/repl/initial_syncer.cpp
+++ b/src/mongo/db/repl/initial_syncer.cpp
@@ -271,6 +271,10 @@ bool InitialSyncer::_isActive_inlock() const {
return State::kRunning == _state || State::kShuttingDown == _state;
}
+std::string InitialSyncer::getInitialSyncMethod() const {  // Reports the name of the sync method this syncer implements.
+ return "logical";  // Constant: this class always identifies itself as "logical".
+}
+
Status InitialSyncer::startup(OperationContext* opCtx,
std::uint32_t initialSyncMaxAttempts) noexcept {
invariant(opCtx);
diff --git a/src/mongo/db/repl/initial_syncer.h b/src/mongo/db/repl/initial_syncer.h
index 7ac0f6d3b66..566bde4919f 100644
--- a/src/mongo/db/repl/initial_syncer.h
+++ b/src/mongo/db/repl/initial_syncer.h
@@ -160,6 +160,8 @@ public:
*/
bool isActive() const;
+ std::string getInitialSyncMethod() const final;  // Returns "logical" for this class (see the definition in initial_syncer.cpp).
+
Status startup(OperationContext* opCtx, std::uint32_t maxAttempts) noexcept final;
Status shutdown() final;
diff --git a/src/mongo/db/repl/initial_syncer_interface.h b/src/mongo/db/repl/initial_syncer_interface.h
index 1886296214d..9bf683e99b8 100644
--- a/src/mongo/db/repl/initial_syncer_interface.h
+++ b/src/mongo/db/repl/initial_syncer_interface.h
@@ -116,6 +116,11 @@ public:
* Cancels the current initial sync attempt if the initial syncer is active.
*/
virtual void cancelCurrentAttempt() = 0;
+
+ /**
+ * Returns the initial sync method that this initial syncer is using.
+ */
+ virtual std::string getInitialSyncMethod() const = 0;
};
} // namespace repl
diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp
index 7414e39a3bf..01d22fd8319 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl.cpp
@@ -699,83 +699,10 @@ void ReplicationCoordinatorImpl::_finishLoadLocalConfig(
LOGV2_DEBUG(4280511, 1, "Set local replica set config");
}
-void ReplicationCoordinatorImpl::_startDataReplication(OperationContext* opCtx,
- std::function<void()> startCompleted) {
- if (_startedSteadyStateReplication.swap(true)) {
- // This is not the first call.
- return;
- }
-
- // Check to see if we need to do an initial sync.
- const auto lastOpTime = getMyLastAppliedOpTime();
- const auto needsInitialSync =
- lastOpTime.isNull() || _externalState->isInitialSyncFlagSet(opCtx);
- if (!needsInitialSync) {
- LOGV2_DEBUG(4280512, 1, "No initial sync required. Attempting to begin steady replication");
- // Start steady replication, since we already have data.
- // ReplSetConfig has been installed, so it's either in STARTUP2 or REMOVED.
- auto memberState = getMemberState();
- invariant(memberState.startup2() || memberState.removed());
- invariant(setFollowerMode(MemberState::RS_RECOVERING));
- // Set an initial sync ID, in case we were upgraded or restored from backup without doing
- // an initial sync.
- _replicationProcess->getConsistencyMarkers()->setInitialSyncIdIfNotSet(opCtx);
- _externalState->startSteadyStateReplication(opCtx, this);
- return;
- }
-
- LOGV2_DEBUG(4280513, 1, "Initial sync required. Attempting to start initial sync...");
- // Do initial sync.
- if (!_externalState->getTaskExecutor()) {
- LOGV2(21323, "Not running initial sync during test");
- return;
- }
-
- auto onCompletion = [this, startCompleted](const StatusWith<OpTimeAndWallTime>& opTimeStatus) {
- {
- stdx::lock_guard<Latch> lock(_mutex);
- if (opTimeStatus == ErrorCodes::CallbackCanceled) {
- LOGV2(21324,
- "Initial Sync has been cancelled: {error}",
- "Initial Sync has been cancelled",
- "error"_attr = opTimeStatus.getStatus());
- return;
- } else if (!opTimeStatus.isOK()) {
- if (_inShutdown) {
- LOGV2(21325,
- "Initial Sync failed during shutdown due to {error}",
- "Initial Sync failed during shutdown",
- "error"_attr = opTimeStatus.getStatus());
- return;
- } else {
- LOGV2_ERROR(21416,
- "Initial sync failed, shutting down now. Restart the server "
- "to attempt a new initial sync");
- fassertFailedWithStatusNoTrace(40088, opTimeStatus.getStatus());
- }
- }
-
- const auto lastApplied = opTimeStatus.getValue();
- _setMyLastAppliedOpTimeAndWallTime(lock, lastApplied, false);
-
- _topCoord->resetMaintenanceCount();
- }
-
- if (startCompleted) {
- startCompleted();
- }
- // Transition from STARTUP2 to RECOVERING and start the producer and the applier.
- // If the member state is REMOVED, this will do nothing until we receive a config with
- // ourself in it.
- const auto memberState = getMemberState();
- invariant(memberState.startup2() || memberState.removed());
- invariant(setFollowerMode(MemberState::RS_RECOVERING));
- auto opCtxHolder = cc().makeOperationContext();
- _externalState->startSteadyStateReplication(opCtxHolder.get(), this);
- // This log is used in tests to ensure we made it to this point.
- LOGV2_DEBUG(4853000, 1, "initial sync complete.");
- };
-
+void ReplicationCoordinatorImpl::_startInitialSync(
+ OperationContext* opCtx,
+ InitialSyncerInterface::OnCompletionFn onCompletion,
+ bool fallbackToLogical) {
std::shared_ptr<InitialSyncerInterface> initialSyncerCopy;
try {
{
@@ -799,7 +726,8 @@ void ReplicationCoordinatorImpl::_startDataReplication(OperationContext* opCtx,
onCompletion);
};
- if (repl::feature_flags::gFileCopyBasedInitialSync.isEnabledAndIgnoreFCV()) {
+ if (repl::feature_flags::gFileCopyBasedInitialSync.isEnabledAndIgnoreFCV() &&
+ !fallbackToLogical) {
auto swInitialSyncer = createInitialSyncer(initialSyncMethod);
if (swInitialSyncer.getStatus().code() == ErrorCodes::NotImplemented &&
initialSyncMethod != "logical") {
@@ -834,6 +762,104 @@ void ReplicationCoordinatorImpl::_startDataReplication(OperationContext* opCtx,
}
}
+void ReplicationCoordinatorImpl::_initialSyncerCompletionFunction(
+ const StatusWith<OpTimeAndWallTime>& opTimeStatus) {  // Handles the result of an initial sync attempt; opTimeStatus carries either the final optime/wall time or the failure status.
+ {
+ stdx::unique_lock<Latch> lock(_mutex);  // unique_lock so that the fallback branch below can release the mutex early.
+ if (opTimeStatus == ErrorCodes::CallbackCanceled) {  // Cancelled: log and return without touching any state.
+ LOGV2(21324,
+ "Initial Sync has been cancelled: {error}",
+ "Initial Sync has been cancelled",
+ "error"_attr = opTimeStatus.getStatus());
+ return;
+ } else if (opTimeStatus == ErrorCodes::InvalidSyncSource &&  // Fallback applies only to InvalidSyncSource failures ...
+ _initialSyncer->getInitialSyncMethod() != "logical") {  // ... reported by a syncer whose method is not already "logical".
+ LOGV2(5780600,
+ "Falling back to logical initial sync: {error}",
+ "Falling back to logical initial sync",
+ "error"_attr = opTimeStatus.getStatus());
+ lock.unlock();  // Release _mutex before the calls below. NOTE(review): presumably they acquire _mutex themselves; confirm.
+ clearSyncSourceDenylist();
+ _scheduleWorkAt(_replExecutor->now(),  // Schedule the retry for the executor's current time instead of running it inline.
+ [=](const mongo::executor::TaskExecutor::CallbackArgs& cbData) {  // NOTE(review): cbData is not inspected in this lambda.
+ _startInitialSync(
+ cc().makeOperationContext().get(),  // The temporary returned by makeOperationContext() lives only until this call expression ends.
+ [this](const StatusWith<OpTimeAndWallTime>& opTimeStatus) {  // Parameter shadows the outer opTimeStatus.
+ _initialSyncerCompletionFunction(opTimeStatus);  // The retry reports back through this same completion function.
+ },
+ true /* fallbackToLogical */);
+ });
+ return;  // Nothing more to do here; the scheduled retry takes over.
+ } else if (!opTimeStatus.isOK()) {  // Any other failure.
+ if (_inShutdown) {  // While shutting down the failure is only logged.
+ LOGV2(21325,
+ "Initial Sync failed during shutdown due to {error}",
+ "Initial Sync failed during shutdown",
+ "error"_attr = opTimeStatus.getStatus());
+ return;
+ } else {
+ LOGV2_ERROR(21416,
+ "Initial sync failed, shutting down now. Restart the server "
+ "to attempt a new initial sync");
+ fassertFailedWithStatusNoTrace(40088, opTimeStatus.getStatus());  // Hands the failure status to fassert with id 40088.
+ }
+ }
+
+
+ const auto lastApplied = opTimeStatus.getValue();  // Reached only when none of the branches above returned.
+ _setMyLastAppliedOpTimeAndWallTime(lock, lastApplied, false);  // Records the initial sync result as our last applied optime; the held lock is passed through.
+
+ _topCoord->resetMaintenanceCount();
+ }
+
+ // Transition from STARTUP2 to RECOVERING and start the producer and the applier.
+ // If the member state is REMOVED, this will do nothing until we receive a config with
+ // ourself in it.
+ const auto memberState = getMemberState();
+ invariant(memberState.startup2() || memberState.removed());
+ invariant(setFollowerMode(MemberState::RS_RECOVERING));
+ auto opCtxHolder = cc().makeOperationContext();  // New OperationContext obtained via cc(); opCtxHolder keeps it alive until this function returns.
+ _externalState->startSteadyStateReplication(opCtxHolder.get(), this);
+ // This log is used in tests to ensure we made it to this point.
+ LOGV2_DEBUG(4853000, 1, "initial sync complete.");
+}
+
+void ReplicationCoordinatorImpl::_startDataReplication(OperationContext* opCtx) {  // Starts steady state replication directly, or kicks off an initial sync first when one is needed.
+ if (_startedSteadyStateReplication.swap(true)) {  // The flag is set unconditionally; a true result means an earlier call already got here.
+ // This is not the first call.
+ return;
+ }
+
+ // Check to see if we need to do an initial sync.
+ const auto lastOpTime = getMyLastAppliedOpTime();
+ const auto needsInitialSync =
+ lastOpTime.isNull() || _externalState->isInitialSyncFlagSet(opCtx);  // Needed when there is no last applied optime or the initial sync flag is set.
+ if (!needsInitialSync) {
+ LOGV2_DEBUG(4280512, 1, "No initial sync required. Attempting to begin steady replication");
+ // Start steady replication, since we already have data.
+ // ReplSetConfig has been installed, so it's either in STARTUP2 or REMOVED.
+ auto memberState = getMemberState();
+ invariant(memberState.startup2() || memberState.removed());
+ invariant(setFollowerMode(MemberState::RS_RECOVERING));
+ // Set an initial sync ID, in case we were upgraded or restored from backup without doing
+ // an initial sync.
+ _replicationProcess->getConsistencyMarkers()->setInitialSyncIdIfNotSet(opCtx);
+ _externalState->startSteadyStateReplication(opCtx, this);
+ return;
+ }
+
+ LOGV2_DEBUG(4280513, 1, "Initial sync required. Attempting to start initial sync...");
+ // Do initial sync.
+ if (!_externalState->getTaskExecutor()) {  // Without a task executor the initial sync is skipped entirely.
+ LOGV2(21323, "Not running initial sync during test");
+ return;
+ }
+
+ _startInitialSync(opCtx, [this](const StatusWith<OpTimeAndWallTime>& opTimeStatus) {  // fallbackToLogical is left at its default (false in the header declaration).
+ _initialSyncerCompletionFunction(opTimeStatus);  // Result handling is delegated to the shared completion function.
+ });
+}
+
void ReplicationCoordinatorImpl::startup(OperationContext* opCtx,
StorageEngine::LastShutdownState lastShutdownState) {
if (!isReplEnabled()) {
diff --git a/src/mongo/db/repl/replication_coordinator_impl.h b/src/mongo/db/repl/replication_coordinator_impl.h
index cd094f0c905..1e78979e8f8 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.h
+++ b/src/mongo/db/repl/replication_coordinator_impl.h
@@ -1127,8 +1127,20 @@ private:
/**
* Start replicating data, and does an initial sync if needed first.
*/
- void _startDataReplication(OperationContext* opCtx,
- std::function<void()> startCompleted = nullptr);
+ void _startDataReplication(OperationContext* opCtx);
+
+ /**
+ * Start initial sync. When 'fallbackToLogical' is true, the branch guarded by the gFileCopyBasedInitialSync feature flag in the definition is skipped.
+ */
+ void _startInitialSync(OperationContext* opCtx,
+ InitialSyncerInterface::OnCompletionFn onCompletionFn,
+ bool fallbackToLogical = false);
+
+
+ /**
+ * Function to be called on completion of initial sync.
+ */
+ void _initialSyncerCompletionFunction(const StatusWith<OpTimeAndWallTime>& opTimeStatus);
/**
* Finishes the work of processReplSetInitiate() in the event of a successful quorum check.
@@ -1720,6 +1732,8 @@ private:
// This should be set during sharding initialization.
boost::optional<bool> _wasCWWCSetOnConfigServerOnStartup;
+
+ InitialSyncerInterface::OnCompletionFn _onCompletion;  // NOTE(review): no hunk shown in this diff reads or writes this member; confirm it is actually used.
};
} // namespace repl