diff options
author | Jason Chan <jason.chan@mongodb.com> | 2021-07-12 21:28:34 +0000 |
---|---|---|
committer | Jason Chan <jason.chan@mongodb.com> | 2021-07-13 23:24:10 +0000 |
commit | 92662765968eff784a82adea2f57ee5d1125712d (patch) | |
tree | ffbca8f157682723dbfd429bce1f91b4349acb08 | |
parent | 1184f004a99660de6f5e745573419bda8a28c0e9 (diff) | |
download | mongo-92662765968eff784a82adea2f57ee5d1125712d.tar.gz |
SERVER-58398 TenantMigrationDonor will not retry recipientSyncData on non-retriable interruption errors
(cherry picked from commit bbd0b90085c06de2882e48d68812ac822a4412f9)
4 files changed, 86 insertions, 60 deletions
diff --git a/jstests/replsets/tenant_migration_donor_kill_op_retry.js b/jstests/replsets/tenant_migration_donor_kill_op_retry.js index a9e1f9bd44f..e922acd6edc 100644 --- a/jstests/replsets/tenant_migration_donor_kill_op_retry.js +++ b/jstests/replsets/tenant_migration_donor_kill_op_retry.js @@ -84,64 +84,6 @@ if (!tenantMigrationTest.isFeatureFlagEnabled()) { } { - jsTestLog( - "Test that killing the recipientSyncData command on the recipient will trigger the donor " + - "to retry sending recipientSyncData command."); - - const migrationOpts = { - migrationIdString: extractUUIDFromObject(UUID()), - recipientConnString: tenantMigrationTest.getRecipientConnString(), - tenantId: makeTenantId(), - - }; - - let fp = configureFailPoint(tenantMigrationTest.getRecipientPrimary(), "failCommand", { - failInternalCommands: true, - blockConnection: true, - blockTimeMS: kDelayMS, - failCommands: ["recipientSyncData"], - }); - - const donorRstArgs = TenantMigrationUtil.createRstArgs(tenantMigrationTest.getDonorRst()); - - const runMigrationThread = - new Thread(TenantMigrationUtil.runMigrationAsync, migrationOpts, donorRstArgs); - runMigrationThread.start(); - fp.wait(); - - const res = assert.commandWorked(tenantMigrationTest.getRecipientPrimary().adminCommand({ - currentOp: true, - $or: [ - {"command.$truncated": {$exists: true}}, - {"command.recipientSyncData": {$exists: true}} - ] - })); - - // If the recipientSyncData command has been truncated, we check if the truncated command - // contains "recipientSyncData". - let opid; - for (let op of res.inprog) { - if (op.command.recipientSyncData) { - opid = op.opid; - } else { - if (op.command.$truncated.includes("recipientSyncData")) { - opid = op.opid; - } - } - } - assert(opid); - - assert.commandWorked( - tenantMigrationTest.getRecipientPrimary().adminCommand({killOp: 1, op: opid})); - - fp.off(); - runMigrationThread.join(); - - TenantMigrationTest.assertCommitted(runMigrationThread.returnData()); - assert.commandWorked(tenantMigrationTest.forgetMigration(migrationOpts.migrationIdString)); -} - -{ // This section tests the behavior during TenantMigrationDonorService creation. let fpNames = [ "pauseTenantMigrationBeforeCreatingStateDocumentTTLIndex", diff --git a/jstests/replsets/tenant_migration_donor_retry.js b/jstests/replsets/tenant_migration_donor_retry.js index 9db52a3c621..02237198ac1 100644 --- a/jstests/replsets/tenant_migration_donor_retry.js +++ b/jstests/replsets/tenant_migration_donor_retry.js @@ -183,6 +183,15 @@ function testDonorRetryRecipientForgetMigrationCmdOnError(errorCode) { testDonorRetryRecipientForgetMigrationCmdOnError(ErrorCodes.ShutdownInProgress); })(); +(() => { + jsTest.log("Test that the donor retries recipientForgetMigration on interruption errors"); + // Test an error code that is in the 'Interruption' category but not in the 'isRetriable' + // category. + const interruptionErrorCode = ErrorCodes.MaxTimeMSExpired; + assert(ErrorCodes.isInterruption(interruptionErrorCode)); + testDonorRetryRecipientForgetMigrationCmdOnError(interruptionErrorCode); +})(); + // Each donor state doc is updated three times throughout the lifetime of a tenant migration: // - Set the "state" to "blocking" // - Set the "state" to "commit"/"abort" diff --git a/jstests/replsets/tenant_migration_donor_wont_retry_recipientsyncdata_on_non_retriable_interruption_errors.js b/jstests/replsets/tenant_migration_donor_wont_retry_recipientsyncdata_on_non_retriable_interruption_errors.js new file mode 100644 index 00000000000..3faeff4d607 --- /dev/null +++ b/jstests/replsets/tenant_migration_donor_wont_retry_recipientsyncdata_on_non_retriable_interruption_errors.js @@ -0,0 +1,57 @@ +/** + * Tests that a tenant migration will be aborted when the recipient returns a non-retriable + * 'interruption' error for the 'recipientSyncData' command. This is to avoid situations like + * SERVER-58398. + * + * @tags: [requires_fcv_50, requires_majority_read_concern, requires_persistence, + * incompatible_with_eft, incompatible_with_windows_tls, incompatible_with_macos] + */ + +(function() { +"use strict"; + +load("jstests/libs/fail_point_util.js"); +load("jstests/libs/parallelTester.js"); +load("jstests/libs/uuid_util.js"); +load("jstests/replsets/libs/tenant_migration_test.js"); +load("jstests/replsets/libs/tenant_migration_util.js"); + +const kTenantId = "testTenantId"; +const migrationX509Options = TenantMigrationUtil.makeX509OptionsForTest(); + +const tenantMigrationTest = new TenantMigrationTest({name: jsTestName()}); +if (!tenantMigrationTest.isFeatureFlagEnabled()) { + jsTestLog("Skipping test because the tenant migrations feature flag is disabled"); + return; +} +const donorRst = tenantMigrationTest.getDonorRst(); +let recipientPrimary = tenantMigrationTest.getRecipientPrimary(); + +const interruptionErrorCode = ErrorCodes.MaxTimeMSExpired; +assert(ErrorCodes.isInterruption(interruptionErrorCode)); +configureFailPoint(recipientPrimary, "failCommand", { + failInternalCommands: true, + errorCode: interruptionErrorCode, + failCommands: ["recipientSyncData"], +}); + +const migrationId = UUID(); +const migrationOpts = { + migrationIdString: extractUUIDFromObject(migrationId), + tenantId: kTenantId, + recipientConnString: tenantMigrationTest.getRecipientConnString(), +}; +const donorRstArgs = TenantMigrationUtil.createRstArgs(donorRst); +const runMigrationThread = new Thread(TenantMigrationUtil.runMigrationAsync, + migrationOpts, + donorRstArgs, + true /* retryOnRetryableErrors */); +runMigrationThread.start(); + +TenantMigrationTest.assertAborted(runMigrationThread.returnData()); +tenantMigrationTest.waitForDonorNodesToReachState( + donorRst.nodes, migrationId, migrationOpts.tenantId, TenantMigrationTest.DonorState.kAborted); +assert.commandWorked(tenantMigrationTest.forgetMigration(migrationOpts.migrationIdString)); + +tenantMigrationTest.stop(); +})(); diff --git a/src/mongo/db/repl/tenant_migration_donor_service.cpp b/src/mongo/db/repl/tenant_migration_donor_service.cpp index 0ac4d9c671e..6340b6a761b 100644 --- a/src/mongo/db/repl/tenant_migration_donor_service.cpp +++ b/src/mongo/db/repl/tenant_migration_donor_service.cpp @@ -88,7 +88,7 @@ bool shouldStopInsertingDonorStateDoc(Status status) { return status.isOK() || status == ErrorCodes::ConflictingOperationInProgress; } -bool shouldStopSendingRecipientCommand(Status status) { +bool shouldStopSendingRecipientForgetMigrationCommand(Status status) { return status.isOK() || !(ErrorCodes::isRetriableError(status) || // Returned if findHost() is unable to target the recipient in 15 seconds, which may @@ -97,6 +97,14 @@ bool shouldStopSendingRecipientCommand(Status status) { ErrorCodes::isInterruption(status)); } +bool shouldStopSendingRecipientSyncDataCommand(Status status) { + return status.isOK() || + !(ErrorCodes::isRetriableError(status) || + // Returned if findHost() is unable to target the recipient in 15 seconds, which may + // happen after a failover. + status == ErrorCodes::FailedToSatisfyReadPreference); +} + bool shouldStopFetchingRecipientClusterTimeKeyDocs(Status status) { return status.isOK() || !(ErrorCodes::isRetriableError(status) || ErrorCodes::isInterruption(status)); @@ -645,6 +653,7 @@ ExecutorFuture<void> TenantMigrationDonorService::Instance::_sendCommandToRecipi std::shared_ptr<RemoteCommandTargeter> recipientTargeterRS, const BSONObj& cmdObj, const CancellationToken& token) { + const bool isRecipientSyncDataCmd = cmdObj.hasField(RecipientSyncData::kCommandName); return AsyncTry( [this, self = shared_from_this(), executor, recipientTargeterRS, cmdObj, token] { return recipientTargeterRS->findHost(kPrimaryOnlyReadPreference, token) @@ -673,7 +682,16 @@ ExecutorFuture<void> TenantMigrationDonorService::Instance::_sendCommandToRecipi }); }); }) - .until([token](Status status) { return shouldStopSendingRecipientCommand(status); }) + .until([token, cmdObj, isRecipientSyncDataCmd](Status status) { + if (isRecipientSyncDataCmd) { + return shouldStopSendingRecipientSyncDataCommand(status); + } else { + // If the recipient command is not 'recipientSyncData', it must be + // 'recipientForgetMigration'. + invariant(cmdObj.hasField(RecipientForgetMigration::kCommandName)); + return shouldStopSendingRecipientForgetMigrationCommand(status); + } + }) .withBackoffBetweenIterations(kExponentialBackoff) .on(**executor, token); } |