-rw-r--r--  jstests/replsets/initial_sync_fails_when_source_removed.js | 84
-rw-r--r--  src/mongo/db/repl/base_cloner.cpp                          |  2
-rw-r--r--  src/mongo/db/repl/base_cloner.h                            |  2
-rw-r--r--  src/mongo/db/repl/initial_syncer.cpp                       | 18
-rw-r--r--  src/mongo/db/repl/initial_syncer.h                         | 14
5 files changed, 102 insertions, 18 deletions
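
The substance of the change is the swap from ErrorCodes::isNetworkError to ErrorCodes::isRetriableError throughout the initial sync retry paths, so errors that a removed or stepping-down sync source returns are funneled into the same bounded retry machinery as socket failures. Below is a standalone C++ model of the distinction; the category membership shown is an assumption for illustration, and the authoritative lists live in src/mongo/base/error_codes.err.

```cpp
// Standalone model (not the real mongo ErrorCodes machinery) of the category
// change. Membership below is an assumption for illustration; the
// authoritative lists live in src/mongo/base/error_codes.err.
#include <iostream>
#include <set>
#include <string>

static const std::set<std::string> kNetworkErrors = {
    "HostUnreachable", "HostNotFound", "NetworkTimeout", "SocketException"};

static const std::set<std::string> kRetriableErrors = [] {
    std::set<std::string> s = kNetworkErrors;  // superset of the network codes
    s.insert({"NotMaster", "NotMasterNoSlaveOk", "NotMasterOrSecondary",
              "PrimarySteppedDown", "InterruptedDueToReplStateChange",
              "InterruptedAtShutdown", "ShutdownInProgress"});
    return s;
}();

bool isNetworkError(const std::string& c) { return kNetworkErrors.count(c); }
bool isRetriableError(const std::string& c) { return kRetriableErrors.count(c); }

int main() {
    // A sync source removed from the set tends to answer with "not master"
    // style errors rather than socket errors, so only the broader check
    // routes them into the retry path.
    std::cout << isNetworkError("NotMasterOrSecondary") << "\n";    // 0
    std::cout << isRetriableError("NotMasterOrSecondary") << "\n";  // 1
}
```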
diff --git a/jstests/replsets/initial_sync_fails_when_source_removed.js b/jstests/replsets/initial_sync_fails_when_source_removed.js
new file mode 100644
index 00000000000..53cf3431bb8
--- /dev/null
+++ b/jstests/replsets/initial_sync_fails_when_source_removed.js
@@ -0,0 +1,84 @@
+/**
+ * Tests that initial sync will abort an attempt if the sync source is removed during cloning.
+ * This test will time out if the attempt is not aborted.
+ * @tags: [requires_fcv_44]
+ */
+(function() {
+"use strict";
+
+load("jstests/libs/fail_point_util.js");
+
+const testName = "initial_sync_fails_when_source_removed";
+const rst =
+ new ReplSetTest({name: testName, nodes: [{}, {rsConfig: {priority: 0}}], allowChaining: true});
+const nodes = rst.startSet();
+rst.initiate();
+
+const primary = rst.getPrimary();
+const primaryDb = primary.getDB("test");
+const initialSyncSource = rst.getSecondary();
+
+// Add some data to be cloned.
+assert.commandWorked(primaryDb.test.insert([{a: 1}, {b: 2}, {c: 3}]));
+rst.awaitReplication();
+
+jsTest.log("Adding the initial sync destination node to the replica set");
+const initialSyncNode = rst.add({
+ rsConfig: {priority: 0},
+ setParameter: {
+ 'failpoint.initialSyncHangBeforeCopyingDatabases': tojson({mode: 'alwaysOn'}),
+ 'numInitialSyncAttempts': 1,
+ 'failpoint.forceSyncSourceCandidate':
+ tojson({mode: 'alwaysOn', data: {hostAndPort: initialSyncSource.host}})
+ }
+});
+rst.reInitiate();
+rst.waitForState(initialSyncNode, ReplSetTest.State.STARTUP_2);
+
+// The code handling this case is common to all cloners, so run it only for the stage most likely
+// to see an error.
+const cloner = 'CollectionCloner';
+const stage = 'query';
+
+// Set us up to hang before finish so we can check status.
+const beforeFinishFailPoint = configureFailPoint(initialSyncNode, "initialSyncHangBeforeFinish");
+const initialSyncNodeDb = initialSyncNode.getDB("test");
+const failPointData = {
+ cloner: cloner,
+ stage: stage,
+ nss: 'test.test'
+};
+// Set us up to stop right before the given stage.
+const beforeStageFailPoint =
+ configureFailPoint(initialSyncNodeDb, "hangBeforeClonerStage", failPointData);
+// Release the initial failpoint.
+assert.commandWorked(initialSyncNodeDb.adminCommand(
+ {configureFailPoint: "initialSyncHangBeforeCopyingDatabases", mode: "off"}));
+beforeStageFailPoint.wait();
+
+jsTestLog("Testing removing sync source in cloner " + cloner + " stage " + stage);
+// We can't use remove/reInitiate here because that does not properly remove a node
+// in the middle of the config.
+let config = rst.getReplSetConfig();
+config.members.splice(1, 1); // Removes node[1]
+config.version = rst.getReplSetConfigFromNode().version + 1;
+assert.commandWorked(primary.getDB("admin").adminCommand({replSetReconfig: config}));
+
+jsTestLog("Waiting for source to realize it is removed.");
+assert.soonNoExcept(() => assert.commandFailedWithCode(
+ initialSyncSource.getDB("test").adminCommand({replSetGetStatus: 1}),
+ ErrorCodes.InvalidReplicaSetConfig));
+
+jsTestLog("Resuming the initial sync.");
+beforeStageFailPoint.off();
+beforeFinishFailPoint.wait();
+const res = assert.commandWorked(initialSyncNode.adminCommand({replSetGetStatus: 1}));
+// The initial sync should have failed.
+assert.eq(res.initialSyncStatus.failedInitialSyncAttempts, 1);
+beforeFinishFailPoint.off();
+
+// We skip validation and dbhashes because the initial sync failed, leaving the initial
+// sync node invalid and unreachable.
+TestData.skipCheckDBHashes = true;
+rst.stopSet(null, null, {skipValidation: true});
+})();
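
The test above is driven by two server-side failpoints: hangBeforeClonerStage, which takes a data document and pauses only the cloner stage matching it, and initialSyncHangBeforeFinish, which parks the node before the attempt's outcome is recorded so replSetGetStatus can still be inspected. Below is a toy model of the data-filtered pause; the types are stand-ins, not the real FailPoint API in src/mongo/util/fail_point.h.

```cpp
// Toy model of a data-filtered failpoint: the test enables
// "hangBeforeClonerStage" with {cloner, stage, nss}, and a cloner stage only
// pauses when its own identity matches that filter. Stand-in types only.
#include <atomic>
#include <chrono>
#include <string>
#include <thread>

struct StageId {
    std::string cloner, stage, nss;
    bool operator==(const StageId& o) const {
        return cloner == o.cloner && stage == o.stage && nss == o.nss;
    }
};

struct ToyFailPoint {
    std::atomic<bool> enabled{false};
    StageId filter;  // set by the test's configureFailPoint data document

    // Block the calling stage while the failpoint is on and the filter matches.
    void pauseWhileSetIf(const StageId& self) const {
        while (enabled.load() && self == filter) {
            std::this_thread::sleep_for(std::chrono::milliseconds(100));
        }
    }
};
```

On this model, beforeStageFailPoint.off() in the test corresponds to flipping enabled to false, after which the cloner runs into the now-removed sync source and takes the retriable-error path exercised by the C++ changes below.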
diff --git a/src/mongo/db/repl/base_cloner.cpp b/src/mongo/db/repl/base_cloner.cpp
index a9a51f891bf..40a17c02a0c 100644
--- a/src/mongo/db/repl/base_cloner.cpp
+++ b/src/mongo/db/repl/base_cloner.cpp
@@ -167,7 +167,7 @@ Status BaseCloner::checkRollBackIdIsUnchanged() {
try {
getClient()->simpleCommand("admin", &info, "replSetGetRBID");
} catch (DBException& e) {
- if (ErrorCodes::isNetworkError(e)) {
+ if (ErrorCodes::isRetriableError(e)) {
auto status = e.toStatus().withContext(
": failed while attempting to retrieve rollBackId after re-connect");
LOG(1) << status;
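
This hunk sits in the cloner's reconnect path: after a transient failure the cloner reconnects and re-reads the sync source's rollback id before retrying, and the patch widens which replSetGetRBID failures are themselves treated as transient. A standalone sketch of that verify-then-retry shape, with stand-in types in place of the real DBClient plumbing:

```cpp
// Standalone sketch of the verify-then-retry pattern (stand-in types): on a
// transient failure, reconnect and confirm the source's rollback id is
// unchanged before retrying; any other outcome aborts the attempt.
#include <exception>
#include <functional>

struct SyncSource {
    int rbid = 1;
    int rollbackId() const { return rbid; }  // stand-in for replSetGetRBID
    void reconnect() {}
};

bool runWithRetry(SyncSource& src, int initialRBID,
                  const std::function<void()>& op,
                  const std::function<bool(const std::exception&)>& isTransient) {
    for (;;) {
        try {
            op();
            return true;
        } catch (const std::exception& e) {
            if (!isTransient(e))
                return false;  // permanent error: fail the attempt
            src.reconnect();
            if (src.rollbackId() != initialRBID)
                return false;  // source rolled back: fail the attempt
            // otherwise loop and retry the operation
        }
    }
}
```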
diff --git a/src/mongo/db/repl/base_cloner.h b/src/mongo/db/repl/base_cloner.h
index 1961c57bd6f..7efdd9e640b 100644
--- a/src/mongo/db/repl/base_cloner.h
+++ b/src/mongo/db/repl/base_cloner.h
@@ -103,7 +103,7 @@ protected:
* Returns true if the Status represents an error which should be retried.
*/
virtual bool isTransientError(const Status& status) {
- return ErrorCodes::isNetworkError(status);
+ return ErrorCodes::isRetriableError(status);
}
/**
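
Since isTransientError is a virtual hook, a concrete cloner can widen or narrow what its retry loop treats as transient, while this default now picks up the whole retriable category. The sketch below shows such an override with stand-in types; the subclass and its CursorNotFound special case are hypothetical and not part of this patch.

```cpp
// Standalone sketch with stand-in types: a concrete cloner widening the
// default transient-error test via the virtual hook. Hypothetical; the patch
// itself only changes the default from network errors to retriable errors.
enum class Code { OK, HostUnreachable, CursorNotFound, NamespaceNotFound };

struct Status {
    Code code;
    bool isRetriable() const {  // stand-in for ErrorCodes::isRetriableError
        return code == Code::HostUnreachable;
    }
};

struct BaseCloner {
    virtual ~BaseCloner() = default;
protected:
    virtual bool isTransientError(const Status& s) { return s.isRetriable(); }
};

struct HypotheticalCollectionCloner : BaseCloner {
protected:
    bool isTransientError(const Status& s) override {
        // Also survive a killed cursor during the query stage (assumed
        // behavior for illustration only).
        return s.code == Code::CursorNotFound || BaseCloner::isTransientError(s);
    }
};
```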
diff --git a/src/mongo/db/repl/initial_syncer.cpp b/src/mongo/db/repl/initial_syncer.cpp
index 08c54f5738f..0c9bda496d3 100644
--- a/src/mongo/db/repl/initial_syncer.cpp
+++ b/src/mongo/db/repl/initial_syncer.cpp
@@ -299,7 +299,7 @@ void InitialSyncer::_cancelRemainingWork_inlock() {
_shutdownComponent_inlock(_oplogFetcher);
if (_sharedData) {
// We actually hold the required lock, but the lock object itself is not passed through.
- _clearNetworkError(WithLock::withoutLock());
+ _clearRetriableError(WithLock::withoutLock());
stdx::lock_guard<InitialSyncSharedData> lock(*_sharedData);
_sharedData->setInitialSyncStatusIfOK(
lock, Status{ErrorCodes::CallbackCanceled, "Initial sync attempt canceled"});
@@ -1192,7 +1192,7 @@ void InitialSyncer::_lastOplogEntryFetcherCallbackForStopTimestamp(
stdx::lock_guard<Latch> lock(_mutex);
auto status = _checkForShutdownAndConvertStatus_inlock(
result.getStatus(), "error fetching last oplog entry for stop timestamp");
- if (_shouldRetryNetworkError(lock, status)) {
+ if (_shouldRetryError(lock, status)) {
auto scheduleStatus = _exec->scheduleWork(
[this, onCompletionGuard](executor::TaskExecutor::CallbackArgs args) {
// It is not valid to schedule the retry from within this callback,
@@ -1417,7 +1417,7 @@ void InitialSyncer::_rollbackCheckerCheckForRollbackCallback(
stdx::lock_guard<Latch> lock(_mutex);
auto status = _checkForShutdownAndConvertStatus_inlock(result.getStatus(),
"error while getting last rollback ID");
- if (_shouldRetryNetworkError(lock, status)) {
+ if (_shouldRetryError(lock, status)) {
LOG(1) << "Retrying rollback checker because of network error " << status;
_scheduleRollbackCheckerCheckForRollback_inlock(lock, onCompletionGuard);
return;
@@ -1712,18 +1712,18 @@ void InitialSyncer::_scheduleRollbackCheckerCheckForRollback_inlock(
return;
}
-bool InitialSyncer::_shouldRetryNetworkError(WithLock lk, Status status) {
- if (ErrorCodes::isNetworkError(status)) {
+bool InitialSyncer::_shouldRetryError(WithLock lk, Status status) {
+ if (ErrorCodes::isRetriableError(status)) {
stdx::lock_guard<InitialSyncSharedData> sharedDataLock(*_sharedData);
return _sharedData->shouldRetryOperation(sharedDataLock, &_retryingOperation);
}
- // The status was OK or some error other than a network error, so clear the network error
+ // The status was OK or some error other than a retriable error, so clear the retriable error
// state and indicate that we should not retry.
- _clearNetworkError(lk);
+ _clearRetriableError(lk);
return false;
}
-void InitialSyncer::_clearNetworkError(WithLock lk) {
+void InitialSyncer::_clearRetriableError(WithLock lk) {
_retryingOperation = boost::none;
}
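
_shouldRetryError funnels every retriable error through InitialSyncSharedData::shouldRetryOperation, with _retryingOperation recording the one operation currently inside its retry window; the shared data is what bounds the retries, so a permanently gone sync source produces a failed attempt rather than an endless loop. A standalone sketch of that bookkeeping follows; the names and budget semantics are stand-ins, not the real InitialSyncSharedData API (the period is assumed to correspond to initialSyncTransientErrorRetryPeriodSeconds).

```cpp
// Standalone sketch of bounded retries (stand-in for the
// InitialSyncSharedData::shouldRetryOperation bookkeeping): the first failure
// of an operation opens a retry window; once the window closes, stop retrying.
#include <chrono>
#include <optional>

using Clock = std::chrono::steady_clock;

struct RetryBudget {
    Clock::duration maxOutage;  // assumed analogue of the retry period parameter

    // `retryingSince` plays the role of _retryingOperation: engaged while an
    // operation is being retried, cleared when it succeeds or finally fails.
    bool shouldRetryOperation(std::optional<Clock::time_point>& retryingSince) const {
        auto now = Clock::now();
        if (!retryingSince) {
            retryingSince = now;  // first transient failure: start the window
            return true;
        }
        return (now - *retryingSince) < maxOutage;
    }

    // Counterpart of _clearRetriableError below.
    void clear(std::optional<Clock::time_point>& retryingSince) const {
        retryingSince.reset();
    }
};
```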
@@ -1919,7 +1919,7 @@ void InitialSyncer::InitialSyncAttemptInfo::append(BSONObjBuilder* builder) cons
bool InitialSyncer::OplogFetcherRestartDecisionInitialSyncer::shouldContinue(
AbstractOplogFetcher* fetcher, Status status) {
- if (ErrorCodes::isNetworkError(status)) {
+ if (ErrorCodes::isRetriableError(status)) {
stdx::lock_guard<InitialSyncSharedData> lk(*_sharedData);
return _sharedData->shouldRetryOperation(lk, &_retryingOperation);
}
diff --git a/src/mongo/db/repl/initial_syncer.h b/src/mongo/db/repl/initial_syncer.h
index 399fdd57c54..54506e384fe 100644
--- a/src/mongo/db/repl/initial_syncer.h
+++ b/src/mongo/db/repl/initial_syncer.h
@@ -596,17 +596,17 @@ private:
const stdx::lock_guard<Latch>& lock, std::shared_ptr<OnCompletionGuard> onCompletionGuard);
/**
- * Check if a status is one which means there's a network error and we should retry the current
- * operation, and records whether an operation is currently being retried. Note this can only
- * handle one operation at a time (i.e. it should not be used in both parts of the "split"
- * section of Initial Sync)
+ * Checks if a status is one which means there's a retriable error and we should retry the
+ * current operation, and records whether an operation is currently being retried. Note this
+ * can only handle one operation at a time (i.e. it should not be used in both parts of the
+ * "split" section of Initial Sync).
*/
- bool _shouldRetryNetworkError(WithLock lk, Status status);
+ bool _shouldRetryError(WithLock lk, Status status);
/**
- * Indicates we are no longer handling a network error.
+ * Indicates we are no longer handling a retriable error.
*/
- void _clearNetworkError(WithLock lk);
+ void _clearRetriableError(WithLock lk);
/**
* Checks the given status (or embedded status inside the callback args) and current data