summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJason Chan <jason.chan@mongodb.com>2021-07-12 21:28:34 +0000
committerJason Chan <jason.chan@mongodb.com>2021-07-13 23:24:10 +0000
commit92662765968eff784a82adea2f57ee5d1125712d (patch)
treeffbca8f157682723dbfd429bce1f91b4349acb08
parent1184f004a99660de6f5e745573419bda8a28c0e9 (diff)
downloadmongo-92662765968eff784a82adea2f57ee5d1125712d.tar.gz
SERVER-58398 TenantMigrationDonor will not retry recipientSyncData on non-retriable interruption errors
(cherry picked from commit bbd0b90085c06de2882e48d68812ac822a4412f9)
-rw-r--r--jstests/replsets/tenant_migration_donor_kill_op_retry.js58
-rw-r--r--jstests/replsets/tenant_migration_donor_retry.js9
-rw-r--r--jstests/replsets/tenant_migration_donor_wont_retry_recipientsyncdata_on_non_retriable_interruption_errors.js57
-rw-r--r--src/mongo/db/repl/tenant_migration_donor_service.cpp22
4 files changed, 86 insertions, 60 deletions
diff --git a/jstests/replsets/tenant_migration_donor_kill_op_retry.js b/jstests/replsets/tenant_migration_donor_kill_op_retry.js
index a9e1f9bd44f..e922acd6edc 100644
--- a/jstests/replsets/tenant_migration_donor_kill_op_retry.js
+++ b/jstests/replsets/tenant_migration_donor_kill_op_retry.js
@@ -84,64 +84,6 @@ if (!tenantMigrationTest.isFeatureFlagEnabled()) {
}
{
- jsTestLog(
- "Test that killing the recipientSyncData command on the recipient will trigger the donor " +
- "to retry sending recipientSyncData command.");
-
- const migrationOpts = {
- migrationIdString: extractUUIDFromObject(UUID()),
- recipientConnString: tenantMigrationTest.getRecipientConnString(),
- tenantId: makeTenantId(),
-
- };
-
- let fp = configureFailPoint(tenantMigrationTest.getRecipientPrimary(), "failCommand", {
- failInternalCommands: true,
- blockConnection: true,
- blockTimeMS: kDelayMS,
- failCommands: ["recipientSyncData"],
- });
-
- const donorRstArgs = TenantMigrationUtil.createRstArgs(tenantMigrationTest.getDonorRst());
-
- const runMigrationThread =
- new Thread(TenantMigrationUtil.runMigrationAsync, migrationOpts, donorRstArgs);
- runMigrationThread.start();
- fp.wait();
-
- const res = assert.commandWorked(tenantMigrationTest.getRecipientPrimary().adminCommand({
- currentOp: true,
- $or: [
- {"command.$truncated": {$exists: true}},
- {"command.recipientSyncData": {$exists: true}}
- ]
- }));
-
- // If the recipientSyncData command has been truncated, we check if the truncated command
- // contains "recipientSyncData".
- let opid;
- for (let op of res.inprog) {
- if (op.command.recipientSyncData) {
- opid = op.opid;
- } else {
- if (op.command.$truncated.includes("recipientSyncData")) {
- opid = op.opid;
- }
- }
- }
- assert(opid);
-
- assert.commandWorked(
- tenantMigrationTest.getRecipientPrimary().adminCommand({killOp: 1, op: opid}));
-
- fp.off();
- runMigrationThread.join();
-
- TenantMigrationTest.assertCommitted(runMigrationThread.returnData());
- assert.commandWorked(tenantMigrationTest.forgetMigration(migrationOpts.migrationIdString));
-}
-
-{
// This section tests the behavior during TenantMigrationDonorService creation.
let fpNames = [
"pauseTenantMigrationBeforeCreatingStateDocumentTTLIndex",
diff --git a/jstests/replsets/tenant_migration_donor_retry.js b/jstests/replsets/tenant_migration_donor_retry.js
index 9db52a3c621..02237198ac1 100644
--- a/jstests/replsets/tenant_migration_donor_retry.js
+++ b/jstests/replsets/tenant_migration_donor_retry.js
@@ -183,6 +183,15 @@ function testDonorRetryRecipientForgetMigrationCmdOnError(errorCode) {
testDonorRetryRecipientForgetMigrationCmdOnError(ErrorCodes.ShutdownInProgress);
})();
+(() => {
+ jsTest.log("Test that the donor retries recipientForgetMigration on interruption errors");
+ // Test an error code that is in the 'Interruption' category but not in the 'isRetriable'
+ // category.
+ const interruptionErrorCode = ErrorCodes.MaxTimeMSExpired;
+ assert(ErrorCodes.isInterruption(interruptionErrorCode));
+ testDonorRetryRecipientForgetMigrationCmdOnError(interruptionErrorCode);
+})();
+
// Each donor state doc is updated three times throughout the lifetime of a tenant migration:
// - Set the "state" to "blocking"
// - Set the "state" to "commit"/"abort"
diff --git a/jstests/replsets/tenant_migration_donor_wont_retry_recipientsyncdata_on_non_retriable_interruption_errors.js b/jstests/replsets/tenant_migration_donor_wont_retry_recipientsyncdata_on_non_retriable_interruption_errors.js
new file mode 100644
index 00000000000..3faeff4d607
--- /dev/null
+++ b/jstests/replsets/tenant_migration_donor_wont_retry_recipientsyncdata_on_non_retriable_interruption_errors.js
@@ -0,0 +1,57 @@
+/**
+ * Tests that a tenant migration will be aborted when the recipient returns a non-retriable
+ * 'interruption' error for the 'recipientSyncData' command. This is to avoid situations like
+ * SERVER-58398.
+ *
+ * @tags: [requires_fcv_50, requires_majority_read_concern, requires_persistence,
+ * incompatible_with_eft, incompatible_with_windows_tls, incompatible_with_macos]
+ */
+
+(function() {
+"use strict";
+
+load("jstests/libs/fail_point_util.js");
+load("jstests/libs/parallelTester.js");
+load("jstests/libs/uuid_util.js");
+load("jstests/replsets/libs/tenant_migration_test.js");
+load("jstests/replsets/libs/tenant_migration_util.js");
+
+const kTenantId = "testTenantId";
+const migrationX509Options = TenantMigrationUtil.makeX509OptionsForTest();
+
+const tenantMigrationTest = new TenantMigrationTest({name: jsTestName()});
+if (!tenantMigrationTest.isFeatureFlagEnabled()) {
+ jsTestLog("Skipping test because the tenant migrations feature flag is disabled");
+ return;
+}
+const donorRst = tenantMigrationTest.getDonorRst();
+let recipientPrimary = tenantMigrationTest.getRecipientPrimary();
+
+const interruptionErrorCode = ErrorCodes.MaxTimeMSExpired;
+assert(ErrorCodes.isInterruption(interruptionErrorCode));
+configureFailPoint(recipientPrimary, "failCommand", {
+ failInternalCommands: true,
+ errorCode: interruptionErrorCode,
+ failCommands: ["recipientSyncData"],
+});
+
+const migrationId = UUID();
+const migrationOpts = {
+ migrationIdString: extractUUIDFromObject(migrationId),
+ tenantId: kTenantId,
+ recipientConnString: tenantMigrationTest.getRecipientConnString(),
+};
+const donorRstArgs = TenantMigrationUtil.createRstArgs(donorRst);
+const runMigrationThread = new Thread(TenantMigrationUtil.runMigrationAsync,
+ migrationOpts,
+ donorRstArgs,
+ true /* retryOnRetryableErrors */);
+runMigrationThread.start();
+
+TenantMigrationTest.assertAborted(runMigrationThread.returnData());
+tenantMigrationTest.waitForDonorNodesToReachState(
+ donorRst.nodes, migrationId, migrationOpts.tenantId, TenantMigrationTest.DonorState.kAborted);
+assert.commandWorked(tenantMigrationTest.forgetMigration(migrationOpts.migrationIdString));
+
+tenantMigrationTest.stop();
+})();
diff --git a/src/mongo/db/repl/tenant_migration_donor_service.cpp b/src/mongo/db/repl/tenant_migration_donor_service.cpp
index 0ac4d9c671e..6340b6a761b 100644
--- a/src/mongo/db/repl/tenant_migration_donor_service.cpp
+++ b/src/mongo/db/repl/tenant_migration_donor_service.cpp
@@ -88,7 +88,7 @@ bool shouldStopInsertingDonorStateDoc(Status status) {
return status.isOK() || status == ErrorCodes::ConflictingOperationInProgress;
}
-bool shouldStopSendingRecipientCommand(Status status) {
+bool shouldStopSendingRecipientForgetMigrationCommand(Status status) {
return status.isOK() ||
!(ErrorCodes::isRetriableError(status) ||
// Returned if findHost() is unable to target the recipient in 15 seconds, which may
@@ -97,6 +97,14 @@ bool shouldStopSendingRecipientCommand(Status status) {
ErrorCodes::isInterruption(status));
}
+bool shouldStopSendingRecipientSyncDataCommand(Status status) {
+ return status.isOK() ||
+ !(ErrorCodes::isRetriableError(status) ||
+ // Returned if findHost() is unable to target the recipient in 15 seconds, which may
+ // happen after a failover.
+ status == ErrorCodes::FailedToSatisfyReadPreference);
+}
+
bool shouldStopFetchingRecipientClusterTimeKeyDocs(Status status) {
return status.isOK() ||
!(ErrorCodes::isRetriableError(status) || ErrorCodes::isInterruption(status));
@@ -645,6 +653,7 @@ ExecutorFuture<void> TenantMigrationDonorService::Instance::_sendCommandToRecipi
std::shared_ptr<RemoteCommandTargeter> recipientTargeterRS,
const BSONObj& cmdObj,
const CancellationToken& token) {
+ const bool isRecipientSyncDataCmd = cmdObj.hasField(RecipientSyncData::kCommandName);
return AsyncTry(
[this, self = shared_from_this(), executor, recipientTargeterRS, cmdObj, token] {
return recipientTargeterRS->findHost(kPrimaryOnlyReadPreference, token)
@@ -673,7 +682,16 @@ ExecutorFuture<void> TenantMigrationDonorService::Instance::_sendCommandToRecipi
});
});
})
- .until([token](Status status) { return shouldStopSendingRecipientCommand(status); })
+ .until([token, cmdObj, isRecipientSyncDataCmd](Status status) {
+ if (isRecipientSyncDataCmd) {
+ return shouldStopSendingRecipientSyncDataCommand(status);
+ } else {
+ // If the recipient command is not 'recipientSyncData', it must be
+ // 'recipientForgetMigration'.
+ invariant(cmdObj.hasField(RecipientForgetMigration::kCommandName));
+ return shouldStopSendingRecipientForgetMigrationCommand(status);
+ }
+ })
.withBackoffBetweenIterations(kExponentialBackoff)
.on(**executor, token);
}