-rw-r--r--  jstests/replsets/initial_sync_fails_when_source_removed.js | 84
-rw-r--r--  src/mongo/db/repl/base_cloner.cpp                           |  2
-rw-r--r--  src/mongo/db/repl/base_cloner.h                             |  2
-rw-r--r--  src/mongo/db/repl/initial_syncer.cpp                        | 18
-rw-r--r--  src/mongo/db/repl/initial_syncer.h                          | 14
5 files changed, 102 insertions, 18 deletions
diff --git a/jstests/replsets/initial_sync_fails_when_source_removed.js b/jstests/replsets/initial_sync_fails_when_source_removed.js
new file mode 100644
index 00000000000..53cf3431bb8
--- /dev/null
+++ b/jstests/replsets/initial_sync_fails_when_source_removed.js
@@ -0,0 +1,84 @@
+/**
+ * Tests that initial sync will abort an attempt if the sync source is removed during cloning.
+ * This test will timeout if the attempt is not aborted.
+ * @tags: [requires_fcv_44]
+ */
+(function() {
+"use strict";
+
+load("jstests/libs/fail_point_util.js");
+
+const testName = "initial_sync_fails_when_source_removed";
+const rst =
+    new ReplSetTest({name: testName, nodes: [{}, {rsConfig: {priority: 0}}], allowChaining: true});
+const nodes = rst.startSet();
+rst.initiate();
+
+const primary = rst.getPrimary();
+const primaryDb = primary.getDB("test");
+const initialSyncSource = rst.getSecondary();
+
+// Add some data to be cloned.
+assert.commandWorked(primaryDb.test.insert([{a: 1}, {b: 2}, {c: 3}]));
+rst.awaitReplication();
+
+jsTest.log("Adding the initial sync destination node to the replica set");
+const initialSyncNode = rst.add({
+    rsConfig: {priority: 0},
+    setParameter: {
+        'failpoint.initialSyncHangBeforeCopyingDatabases': tojson({mode: 'alwaysOn'}),
+        'numInitialSyncAttempts': 1,
+        'failpoint.forceSyncSourceCandidate':
+            tojson({mode: 'alwaysOn', data: {hostAndPort: initialSyncSource.host}})
+    }
+});
+rst.reInitiate();
+rst.waitForState(initialSyncNode, ReplSetTest.State.STARTUP_2);
+
+// The code handling this case is common to all cloners, so run it only for the stage most likely
+// to see an error.
+const cloner = 'CollectionCloner';
+const stage = 'query';
+
+// Set us up to hang before finish so we can check status.
+const beforeFinishFailPoint = configureFailPoint(initialSyncNode, "initialSyncHangBeforeFinish");
+const initialSyncNodeDb = initialSyncNode.getDB("test");
+const failPointData = {
+    cloner: cloner,
+    stage: stage,
+    nss: 'test.test'
+};
+// Set us up to stop right before the given stage.
+const beforeStageFailPoint =
+    configureFailPoint(initialSyncNodeDb, "hangBeforeClonerStage", failPointData);
+// Release the initial failpoint.
+assert.commandWorked(initialSyncNodeDb.adminCommand(
+    {configureFailPoint: "initialSyncHangBeforeCopyingDatabases", mode: "off"}));
+beforeStageFailPoint.wait();
+
+jsTestLog("Testing removing sync source in cloner " + cloner + " stage " + stage);
+// We can't use remove/reInitiate here because that does not properly remove a node
+// in the middle of a config.
+let config = rst.getReplSetConfig();
+config.members.splice(1, 1);  // Removes node[1]
+config.version = rst.getReplSetConfigFromNode().version + 1;
+assert.commandWorked(primary.getDB("admin").adminCommand({replSetReconfig: config}));
+
+jsTestLog("Waiting for source to realize it is removed.");
+assert.soonNoExcept(() => assert.commandFailedWithCode(
+                        initialSyncSource.getDB("test").adminCommand({replSetGetStatus: 1}),
+                        ErrorCodes.InvalidReplicaSetConfig));
+
+jsTestLog("Resuming the initial sync.");
+beforeStageFailPoint.off();
+beforeFinishFailPoint.wait();
+const res = assert.commandWorked(initialSyncNode.adminCommand({replSetGetStatus: 1}));
+// The initial sync should have failed.
+assert.eq(res.initialSyncStatus.failedInitialSyncAttempts, 1);
+beforeFinishFailPoint.off();
+
+// We skip validation and dbhashes because the initial sync failed so the initial sync node is
+// invalid and unreachable.
+TestData.skipCheckDBHashes = true;
+rst.stopSet(null, null, {skipValidation: true});
+})();
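
A note on the mechanism the test relies on: hangBeforeClonerStage is a data-gated failpoint, so it only fires when the cloner, stage, and namespace supplied in failPointData match the stage about to run. The standalone C++ sketch below models that gating; all names are illustrative, and it is not the server's FailPoint class (the nss match is omitted for brevity).

// Standalone sketch of a data-gated failpoint (illustrative only; not
// mongod's FailPoint). The test's hangBeforeClonerStage behaves roughly
// like pauseIf("CollectionCloner", "query") here.
#include <chrono>
#include <mutex>
#include <string>
#include <thread>

class DataGatedFailPoint {
public:
    void enable(std::string cloner, std::string stage) {
        std::lock_guard<std::mutex> lk(_mutex);
        _cloner = std::move(cloner);
        _stage = std::move(stage);
        _enabled = true;
    }
    void disable() {
        std::lock_guard<std::mutex> lk(_mutex);
        _enabled = false;
    }
    // Called by a cloner at the top of each stage: blocks while the
    // failpoint is on and the configured data matches this stage.
    void pauseIf(const std::string& cloner, const std::string& stage) {
        while (matches(cloner, stage)) {
            std::this_thread::sleep_for(std::chrono::milliseconds(10));
        }
    }

private:
    bool matches(const std::string& cloner, const std::string& stage) {
        std::lock_guard<std::mutex> lk(_mutex);
        return _enabled && cloner == _cloner && stage == _stage;
    }
    std::mutex _mutex;
    bool _enabled = false;
    std::string _cloner;
    std::string _stage;
};

This is why the test can pin the CollectionCloner at its query stage, remove the sync source via reconfig, and only then let the cloner proceed into the failure.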
diff --git a/src/mongo/db/repl/base_cloner.cpp b/src/mongo/db/repl/base_cloner.cpp
index a9a51f891bf..40a17c02a0c 100644
--- a/src/mongo/db/repl/base_cloner.cpp
+++ b/src/mongo/db/repl/base_cloner.cpp
@@ -167,7 +167,7 @@ Status BaseCloner::checkRollBackIdIsUnchanged() {
     try {
         getClient()->simpleCommand("admin", &info, "replSetGetRBID");
     } catch (DBException& e) {
-        if (ErrorCodes::isNetworkError(e)) {
+        if (ErrorCodes::isRetriableError(e)) {
             auto status = e.toStatus().withContext(
                 ": failed while attempting to retrieve rollBackId after re-connect");
             LOG(1) << status;
diff --git a/src/mongo/db/repl/base_cloner.h b/src/mongo/db/repl/base_cloner.h
index 1961c57bd6f..7efdd9e640b 100644
--- a/src/mongo/db/repl/base_cloner.h
+++ b/src/mongo/db/repl/base_cloner.h
@@ -103,7 +103,7 @@ protected:
      * Returns true if the Status represents an error which should be retried.
      */
     virtual bool isTransientError(const Status& status) {
-        return ErrorCodes::isNetworkError(status);
+        return ErrorCodes::isRetriableError(status);
     }
 
     /**
diff --git a/src/mongo/db/repl/initial_syncer.cpp b/src/mongo/db/repl/initial_syncer.cpp
index 08c54f5738f..0c9bda496d3 100644
--- a/src/mongo/db/repl/initial_syncer.cpp
+++ b/src/mongo/db/repl/initial_syncer.cpp
@@ -299,7 +299,7 @@ void InitialSyncer::_cancelRemainingWork_inlock() {
     _shutdownComponent_inlock(_oplogFetcher);
     if (_sharedData) {
         // We actually hold the required lock, but the lock object itself is not passed through.
-        _clearNetworkError(WithLock::withoutLock());
+        _clearRetriableError(WithLock::withoutLock());
         stdx::lock_guard<InitialSyncSharedData> lock(*_sharedData);
         _sharedData->setInitialSyncStatusIfOK(
             lock, Status{ErrorCodes::CallbackCanceled, "Initial sync attempt canceled"});
@@ -1192,7 +1192,7 @@ void InitialSyncer::_lastOplogEntryFetcherCallbackForStopTimestamp(
         stdx::lock_guard<Latch> lock(_mutex);
         auto status = _checkForShutdownAndConvertStatus_inlock(
             result.getStatus(), "error fetching last oplog entry for stop timestamp");
-        if (_shouldRetryNetworkError(lock, status)) {
+        if (_shouldRetryError(lock, status)) {
            auto scheduleStatus = _exec->scheduleWork(
                 [this, onCompletionGuard](executor::TaskExecutor::CallbackArgs args) {
                     // It is not valid to schedule the retry from within this callback,
@@ -1417,7 +1417,7 @@ void InitialSyncer::_rollbackCheckerCheckForRollbackCallback(
     stdx::lock_guard<Latch> lock(_mutex);
     auto status = _checkForShutdownAndConvertStatus_inlock(result.getStatus(),
                                                            "error while getting last rollback ID");
-    if (_shouldRetryNetworkError(lock, status)) {
+    if (_shouldRetryError(lock, status)) {
         LOG(1) << "Retrying rollback checker because of network error " << status;
         _scheduleRollbackCheckerCheckForRollback_inlock(lock, onCompletionGuard);
         return;
@@ -1712,18 +1712,18 @@ void InitialSyncer::_scheduleRollbackCheckerCheckForRollback_inlock(
     return;
 }
 
-bool InitialSyncer::_shouldRetryNetworkError(WithLock lk, Status status) {
-    if (ErrorCodes::isNetworkError(status)) {
+bool InitialSyncer::_shouldRetryError(WithLock lk, Status status) {
+    if (ErrorCodes::isRetriableError(status)) {
         stdx::lock_guard<InitialSyncSharedData> sharedDataLock(*_sharedData);
         return _sharedData->shouldRetryOperation(sharedDataLock, &_retryingOperation);
     }
-    // The status was OK or some error other than a network error, so clear the network error
+    // The status was OK or some error other than a retriable error, so clear the retriable error
     // state and indicate that we should not retry.
-    _clearNetworkError(lk);
+    _clearRetriableError(lk);
     return false;
 }
 
-void InitialSyncer::_clearNetworkError(WithLock lk) {
+void InitialSyncer::_clearRetriableError(WithLock lk) {
     _retryingOperation = boost::none;
 }
 
@@ -1919,7 +1919,7 @@ void InitialSyncer::InitialSyncAttemptInfo::append(BSONObjBuilder* builder) const
 bool InitialSyncer::OplogFetcherRestartDecisionInitialSyncer::shouldContinue(
     AbstractOplogFetcher* fetcher, Status status) {
-    if (ErrorCodes::isNetworkError(status)) {
+    if (ErrorCodes::isRetriableError(status)) {
         stdx::lock_guard<InitialSyncSharedData> lk(*_sharedData);
         return _sharedData->shouldRetryOperation(lk, &_retryingOperation);
     }
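
The functional change in this patch is the predicate swap from ErrorCodes::isNetworkError() to ErrorCodes::isRetriableError(). Both are category predicates generated from the server's error-code definitions, and RetriableError is a superset of NetworkError, so the cloners, the initial syncer, and the oplog fetcher restart decision now retry a wider class of transient failures. The standalone sketch below models the relationship; the specific codes shown are illustrative, not the generated categories.

// Standalone model of the category widening (illustrative; not the
// generated mongo::ErrorCodes). RetriableError contains every
// NetworkError code plus state-change errors a sync source can return.
#include <cassert>
#include <initializer_list>

enum class Code {
    HostUnreachable,          // NetworkError and RetriableError
    NetworkTimeout,           // NetworkError and RetriableError
    ShutdownInProgress,       // RetriableError only
    PrimarySteppedDown,       // RetriableError only
    InvalidReplicaSetConfig,  // in neither category (in this sketch)
};

static bool in(std::initializer_list<Code> category, Code c) {
    for (Code e : category)
        if (e == c)
            return true;
    return false;
}

bool isNetworkError(Code c) {
    return in({Code::HostUnreachable, Code::NetworkTimeout}, c);
}

bool isRetriableError(Code c) {
    // Strict superset: every network error is retriable, and so are
    // errors like a stepdown or clean shutdown of the sync source.
    return isNetworkError(c) ||
        in({Code::ShutdownInProgress, Code::PrimarySteppedDown}, c);
}

int main() {
    // A stepdown now falls inside the retry predicate...
    assert(isRetriableError(Code::PrimarySteppedDown));
    assert(!isNetworkError(Code::PrimarySteppedDown));
    // ...while an error outside both categories still fails the attempt.
    assert(!isRetriableError(Code::InvalidReplicaSetConfig));
    return 0;
}

Because the new category strictly contains the old one, the swap can only widen what is retried: everything retried before is still retried, and the new jstest pins down that the removed-sync-source failure mode still aborts the attempt rather than retrying forever.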
diff --git a/src/mongo/db/repl/initial_syncer.h b/src/mongo/db/repl/initial_syncer.h
index 399fdd57c54..54506e384fe 100644
--- a/src/mongo/db/repl/initial_syncer.h
+++ b/src/mongo/db/repl/initial_syncer.h
@@ -596,17 +596,17 @@ private:
         const stdx::lock_guard<Latch>& lock, std::shared_ptr<OnCompletionGuard> onCompletionGuard);
 
     /**
-     * Check if a status is one which means there's a network error and we should retry the current
-     * operation, and records whether an operation is currently being retried. Note this can only
-     * handle one operation at a time (i.e. it should not be used in both parts of the "split"
-     * section of Initial Sync)
+     * Check if a status is one which means there's a retriable error and we should retry the
+     * current operation, and records whether an operation is currently being retried. Note this
+     * can only handle one operation at a time (i.e. it should not be used in both parts of the
+     * "split" section of Initial Sync)
      */
-    bool _shouldRetryNetworkError(WithLock lk, Status status);
+    bool _shouldRetryError(WithLock lk, Status status);
 
     /**
-     * Indicates we are no longer handling a network error.
+     * Indicates we are no longer handling a retriable error.
      */
-    void _clearNetworkError(WithLock lk);
+    void _clearRetriableError(WithLock lk);
 
     /**
      * Checks the given status (or embedded status inside the callback args) and current data
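
The updated header comments preserve an important caveat: the retry bookkeeping tracks only one operation at a time through the single _retryingOperation slot, shared with InitialSyncSharedData::shouldRetryOperation(). The standalone sketch below models that single-slot design; the wall-clock retry-budget policy shown is an assumption for illustration, not the InitialSyncSharedData implementation.

// Standalone sketch of single-slot retry bookkeeping (illustrative;
// not InitialSyncSharedData). One optional slot records when the
// current operation started failing; a second concurrently failing
// operation cannot be tracked, hence the "one operation at a time"
// caveat in the header above.
#include <chrono>
#include <optional>

using Clock = std::chrono::steady_clock;

class RetryState {
public:
    explicit RetryState(std::chrono::seconds budget) : _budget(budget) {}

    // Mirrors the _shouldRetryError() flow: on a retriable failure,
    // start (or continue) timing and allow retries until the budget
    // for this one operation is exhausted.
    bool shouldRetryOperation() {
        if (!_retryingSince) {
            _retryingSince = Clock::now();  // first transient failure
            return true;
        }
        return Clock::now() - *_retryingSince < _budget;
    }

    // Mirrors _clearRetriableError(): the operation succeeded or hit a
    // non-retriable error, so drop the slot.
    void clear() {
        _retryingSince = std::nullopt;
    }

private:
    std::chrono::seconds _budget;
    std::optional<Clock::time_point> _retryingSince;  // the single slot
};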