diff options
author | Jason Zhang <jason.zhang@mongodb.com> | 2021-04-02 19:00:00 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2021-04-21 14:14:33 +0000 |
commit | 8d962a7ce64afc7a0a96da4212506f557792786c (patch) | |
tree | 62de7559868d203ed29c341cacbcee41437c91d7 | |
parent | 6e9b3bf294a08a1e2b78147d4aae34217cd5245a (diff) | |
download | mongo-8d962a7ce64afc7a0a96da4212506f557792786c.tar.gz |
SERVER-54207 Add test coverage for interrupting tenant migration donor operation contexts
(cherry picked from commit 3cb1bf74b1cf6271a7ea641eeb5f38e56d27535f)
-rw-r--r-- | jstests/replsets/libs/tenant_migration_test.js | 64 | ||||
-rw-r--r-- | jstests/replsets/libs/tenant_migration_util.js | 63 | ||||
-rw-r--r-- | jstests/replsets/tenant_migration_donor_kill_op_retry.js | 262 | ||||
-rw-r--r-- | src/mongo/db/repl/tenant_migration_donor_service.cpp | 28 | ||||
-rw-r--r-- | src/mongo/db/repl/tenant_migration_util.cpp | 3 |
5 files changed, 348 insertions, 72 deletions
diff --git a/jstests/replsets/libs/tenant_migration_test.js b/jstests/replsets/libs/tenant_migration_test.js index 8fd5409b13b..654b2f88dcf 100644 --- a/jstests/replsets/libs/tenant_migration_test.js +++ b/jstests/replsets/libs/tenant_migration_test.js @@ -42,13 +42,13 @@ function TenantMigrationTest({ donorRst.asCluster(donorRst.nodes, () => { donorRst.getPrimary(); donorRst.awaitReplication(); - createTenantMigrationRecipientRoleIfNotExist(donorRst); + TenantMigrationUtil.createTenantMigrationRecipientRoleIfNotExist(donorRst); }); recipientRst.asCluster(recipientRst.nodes, () => { recipientRst.getPrimary(); recipientRst.awaitReplication(); - createTenantMigrationDonorRoleIfNotExist(recipientRst); + TenantMigrationUtil.createTenantMigrationDonorRoleIfNotExist(recipientRst); }); /** @@ -77,71 +77,13 @@ function TenantMigrationTest({ } /** - * Returns true if the given database role already exists. - */ - function roleExists(db, roleName) { - const roles = db.getRoles({rolesInfo: 1, showPrivileges: false, showBuiltinRoles: false}); - const fullRoleName = `${db.getName()}.${roleName}`; - for (let role of roles) { - if (role._id == fullRoleName) { - return true; - } - } - return false; - } - - /** - * Creates a role for tenant migration donor if it doesn't exist. - */ - function createTenantMigrationDonorRoleIfNotExist(rst) { - const adminDB = rst.getPrimary().getDB("admin"); - - if (roleExists(adminDB, "tenantMigrationDonorRole")) { - return; - } - - assert.commandWorked(adminDB.runCommand({ - createRole: "tenantMigrationDonorRole", - privileges: [ - {resource: {cluster: true}, actions: ["runTenantMigration"]}, - {resource: {db: "admin", collection: "system.keys"}, actions: ["find"]} - ], - roles: [] - })); - } - - /** - * Creates a role for tenant migration recipient if it doesn't exist. - */ - function createTenantMigrationRecipientRoleIfNotExist(rst) { - const adminDB = rst.getPrimary().getDB("admin"); - - if (roleExists(adminDB, "tenantMigrationRecipientRole")) { - return; - } - - assert.commandWorked(adminDB.runCommand({ - createRole: "tenantMigrationRecipientRole", - privileges: [ - {resource: {cluster: true}, actions: ["listDatabases", "useUUID"]}, - {resource: {db: "", collection: ""}, actions: ["listCollections"]}, - { - resource: {anyResource: true}, - actions: ["dbStats", "collStats", "find", "listIndexes"] - } - ], - roles: [] - })); - } - - /** * Creates a role for running find command against config.external_validation_keys if it * doesn't exist. */ function createFindExternalClusterTimeKeysRoleIfNotExist(rst) { const adminDB = rst.getPrimary().getDB("admin"); - if (roleExists(adminDB, "findExternalClusterTimeKeysRole")) { + if (TenantMigrationUtil.roleExists(adminDB, "findExternalClusterTimeKeysRole")) { return; } diff --git a/jstests/replsets/libs/tenant_migration_util.js b/jstests/replsets/libs/tenant_migration_util.js index de4ca29b447..0ab2c45b09d 100644 --- a/jstests/replsets/libs/tenant_migration_util.js +++ b/jstests/replsets/libs/tenant_migration_util.js @@ -332,6 +332,64 @@ var TenantMigrationUtil = (function() { } } + /** + * Creates a role for tenant migration donor if it doesn't exist. + */ + function createTenantMigrationDonorRoleIfNotExist(rst) { + const adminDB = rst.getPrimary().getDB("admin"); + + if (roleExists(adminDB, "tenantMigrationDonorRole")) { + return; + } + + assert.commandWorked(adminDB.runCommand({ + createRole: "tenantMigrationDonorRole", + privileges: [ + {resource: {cluster: true}, actions: ["runTenantMigration"]}, + {resource: {db: "admin", collection: "system.keys"}, actions: ["find"]} + ], + roles: [] + })); + } + + /** + * Creates a role for tenant migration recipient if it doesn't exist. + */ + function createTenantMigrationRecipientRoleIfNotExist(rst) { + const adminDB = rst.getPrimary().getDB("admin"); + + if (roleExists(adminDB, "tenantMigrationRecipientRole")) { + return; + } + + assert.commandWorked(adminDB.runCommand({ + createRole: "tenantMigrationRecipientRole", + privileges: [ + {resource: {cluster: true}, actions: ["listDatabases", "useUUID"]}, + {resource: {db: "", collection: ""}, actions: ["listCollections"]}, + { + resource: {anyResource: true}, + actions: ["dbStats", "collStats", "find", "listIndexes"] + } + ], + roles: [] + })); + } + + /** + * Returns true if the given database role already exists. + */ + function roleExists(db, roleName) { + const roles = db.getRoles({rolesInfo: 1, showPrivileges: false, showBuiltinRoles: false}); + const fullRoleName = `${db.getName()}.${roleName}`; + for (let role of roles) { + if (role._id == fullRoleName) { + return true; + } + } + return false; + } + return { kExternalKeysNs, getExternalKeys, @@ -351,6 +409,9 @@ var TenantMigrationUtil = (function() { getNumBlockedReads, getNumBlockedWrites, isNamespaceForTenant, - checkTenantDBHashes + checkTenantDBHashes, + createTenantMigrationDonorRoleIfNotExist, + createTenantMigrationRecipientRoleIfNotExist, + roleExists }; })(); diff --git a/jstests/replsets/tenant_migration_donor_kill_op_retry.js b/jstests/replsets/tenant_migration_donor_kill_op_retry.js new file mode 100644 index 00000000000..06b8a853ec7 --- /dev/null +++ b/jstests/replsets/tenant_migration_donor_kill_op_retry.js @@ -0,0 +1,262 @@ +/** + * Tests that the donor will retry its steps if its OperationContext is interrupted by a killOp. + * + * @tags: [requires_fcv_47, requires_majority_read_concern, incompatible_with_eft, + * incompatible_with_windows_tls] + */ + +(function() { +"use strict"; + +load("jstests/libs/fail_point_util.js"); +load("jstests/libs/parallelTester.js"); +load("jstests/libs/uuid_util.js"); +load("jstests/replsets/libs/tenant_migration_test.js"); +load("jstests/replsets/libs/tenant_migration_util.js"); + +const kGarbageCollectionDelayMS = 5 * 1000; +const kDelayMS = 100000; // Set some arbitrarily large blockTimeMS to let recipientSyncData command + // hang until we use kill op to kill it. +const kTenantIdPrefix = "testTenantId"; +let testNum = 0; +const migrationX509Options = TenantMigrationUtil.makeX509OptionsForTest(); +const garbageCollectionOpts = { + // Set the delay before a donor state doc is garbage collected to be short to speed + // up the test. + tenantMigrationGarbageCollectionDelayMS: kGarbageCollectionDelayMS, + ttlMonitorSleepSecs: 1 +}; + +function makeTenantId() { + return kTenantIdPrefix + testNum++; +} + +const tenantMigrationTest = new TenantMigrationTest({name: jsTestName()}); +if (!tenantMigrationTest.isFeatureFlagEnabled()) { + jsTestLog("Skipping test because the tenant migrations feature flag is disabled"); + return; +} + +{ + // This section tests behavior in the middle of a tenant migration. + let fpNames = [ + "pauseTenantMigrationBeforeInsertingDonorStateDoc", + "pauseTenantMigrationDonorWhileUpdatingStateDoc", + "pauseTenantMigrationBeforeStoringExternalClusterTimeKeyDocs" + ]; + for (let fpName of fpNames) { + jsTestLog("Setting failpoint \"" + fpName + + "\" to test that the migration will retry the " + + "operation at the failpoint if a killOp is issued."); + + const migrationOpts = { + migrationIdString: extractUUIDFromObject(UUID()), + recipientConnString: tenantMigrationTest.getRecipientConnString(), + tenantId: makeTenantId(), + + }; + const donorPrimary = tenantMigrationTest.getDonorPrimary(); + let fp = configureFailPoint(donorPrimary, fpName); + + const donorRstArgs = TenantMigrationUtil.createRstArgs(tenantMigrationTest.getDonorRst()); + + const runMigrationThread = + new Thread(TenantMigrationUtil.runMigrationAsync, migrationOpts, donorRstArgs); + runMigrationThread.start(); + fp.wait(); + + const res = assert.commandWorked(donorPrimary.adminCommand({ + currentOp: true, + $all: true, + desc: {$regex: 'TenantMigrationDonorService'}, + opid: {$exists: true} + })); + + const opid = res.inprog[0].opid; + assert.commandWorked(donorPrimary.adminCommand({killOp: 1, op: opid})); + + fp.off(); + runMigrationThread.join(); + + const stateRes = assert.commandWorked(runMigrationThread.returnData()); + assert.eq(stateRes.state, TenantMigrationTest.DonorState.kCommitted); + assert.commandWorked(tenantMigrationTest.forgetMigration(migrationOpts.migrationIdString)); + } +} + +{ + jsTestLog( + "Test that killing the recipientSyncData command on the recipient will trigger the donor " + + "to retry sending recipientSyncData command."); + + const migrationOpts = { + migrationIdString: extractUUIDFromObject(UUID()), + recipientConnString: tenantMigrationTest.getRecipientConnString(), + tenantId: makeTenantId(), + + }; + + let fp = configureFailPoint(tenantMigrationTest.getRecipientPrimary(), "failCommand", { + failInternalCommands: true, + blockConnection: true, + blockTimeMS: kDelayMS, + failCommands: ["recipientSyncData"], + }); + + const donorRstArgs = TenantMigrationUtil.createRstArgs(tenantMigrationTest.getDonorRst()); + + const runMigrationThread = + new Thread(TenantMigrationUtil.runMigrationAsync, migrationOpts, donorRstArgs); + runMigrationThread.start(); + fp.wait(); + + const res = assert.commandWorked(tenantMigrationTest.getRecipientPrimary().adminCommand({ + currentOp: true, + $or: [ + {"command.$truncated": {$exists: true}}, + {"command.recipientSyncData": {$exists: true}} + ] + })); + + // If the recipientSyncData command has been truncated, we check if the truncated command + // contains "recipientSyncData". + let opid; + for (let op of res.inprog) { + if (op.command.recipientSyncData) { + opid = op.opid; + } else { + if (op.command.$truncated.includes("recipientSyncData")) { + opid = op.opid; + } + } + } + assert(opid); + + assert.commandWorked( + tenantMigrationTest.getRecipientPrimary().adminCommand({killOp: 1, op: opid})); + + fp.off(); + runMigrationThread.join(); + + const stateRes = assert.commandWorked(runMigrationThread.returnData()); + assert.eq(stateRes.state, TenantMigrationTest.DonorState.kCommitted); + assert.commandWorked(tenantMigrationTest.forgetMigration(migrationOpts.migrationIdString)); +} + +{ + // This section tests the behavior during TenantMigrationDonorService creation. + let fpNames = [ + "pauseTenantMigrationBeforeCreatingStateDocumentTTLIndex", + "pauseTenantMigrationBeforeCreatingExternalKeysTTLIndex" + ]; + for (let fpName of fpNames) { + tenantMigrationTest.getDonorRst().stopSet(); + tenantMigrationTest.getDonorRst().startSet( + Object.assign(migrationX509Options.donor, + {setParameter: {['failpoint.' + fpName]: tojson({mode: 'alwaysOn'})}})); + tenantMigrationTest.getDonorRst().initiate(); + TenantMigrationUtil.createTenantMigrationRecipientRoleIfNotExist( + tenantMigrationTest.getDonorRst()); + + jsTestLog( + "Setting failpoint \"" + fpName + + "\" during the creation of a ReplSetTest to test that the migration will retry the " + + "operation at the failpoint if a killOp is issued."); + + const migrationOpts = { + migrationIdString: extractUUIDFromObject(UUID()), + recipientConnString: tenantMigrationTest.getRecipientConnString(), + tenantId: makeTenantId(), + + }; + const donorPrimary = tenantMigrationTest.getDonorPrimary(); + const donorRstArgs = TenantMigrationUtil.createRstArgs(tenantMigrationTest.getDonorRst()); + const runMigrationThread = + new Thread(TenantMigrationUtil.runMigrationAsync, migrationOpts, donorRstArgs); + runMigrationThread.start(); + + const res = assert.commandWorked(donorPrimary.adminCommand({ + currentOp: true, + $all: true, + desc: {$regex: 'TenantMigrationDonorService'}, + opid: {$exists: true} + })); + const opid = res.inprog[0].opid; + assert.commandWorked(donorPrimary.adminCommand({killOp: 1, op: opid})); + + assert.commandWorked(donorPrimary.adminCommand({configureFailPoint: fpName, mode: "off"})); + + runMigrationThread.join(); + + const stateRes = assert.commandWorked(runMigrationThread.returnData()); + assert.eq(stateRes.state, TenantMigrationTest.DonorState.kCommitted); + assert.commandWorked(tenantMigrationTest.forgetMigration(migrationOpts.migrationIdString)); + } +} + +{ + // This section is testing behavior during garbage collection. + tenantMigrationTest.getDonorRst().stopSet(); + tenantMigrationTest.getDonorRst().startSet( + Object.assign(migrationX509Options.donor, {setParameter: garbageCollectionOpts})); + tenantMigrationTest.getDonorRst().initiate(); + TenantMigrationUtil.createTenantMigrationRecipientRoleIfNotExist( + tenantMigrationTest.getDonorRst()); + + tenantMigrationTest.getRecipientRst().stopSet(); + tenantMigrationTest.getRecipientRst().startSet( + Object.assign(migrationX509Options.recipient, {setParameter: garbageCollectionOpts})); + tenantMigrationTest.getRecipientRst().initiate(); + TenantMigrationUtil.createTenantMigrationDonorRoleIfNotExist( + tenantMigrationTest.getRecipientRst()); + + let fpNames = [ + "pauseTenantMigrationDonorBeforeMarkingStateGarbageCollectable", + "pauseTenantMigrationBeforeMarkingExternalKeysGarbageCollectable" + ]; + for (let fpName of fpNames) { + jsTestLog( + "Setting failpoint \"" + fpName + + "\" during migration garbage collection to test that the migration will retry the " + + "operation at the failpoint if a killOp is issued."); + const migrationId = UUID(); + const tenantId = makeTenantId(); + const migrationOpts = { + migrationIdString: extractUUIDFromObject(migrationId), + recipientConnString: tenantMigrationTest.getRecipientConnString(), + tenantId: tenantId, + }; + + let fp = configureFailPoint(tenantMigrationTest.getDonorPrimary(), fpName); + + const stateRes = assert.commandWorked( + tenantMigrationTest.runMigration(migrationOpts, + false /* retry on retriable errors */, + false /* Automatically forget migration */)); + assert.eq(stateRes.state, TenantMigrationTest.DonorState.kCommitted); + + const donorPrimary = tenantMigrationTest.getDonorPrimary(); + const donorRstArgs = TenantMigrationUtil.createRstArgs(tenantMigrationTest.getDonorRst()); + const forgetMigrationThread = new Thread(TenantMigrationUtil.forgetMigrationAsync, + migrationOpts.migrationIdString, + donorRstArgs); + forgetMigrationThread.start(); + + fp.wait(); + + const res = assert.commandWorked(donorPrimary.adminCommand({ + currentOp: true, + $all: true, + desc: {$regex: 'TenantMigrationDonorService'}, + opid: {$exists: true} + })); + const opid = res.inprog[0].opid; + assert.commandWorked(donorPrimary.adminCommand({killOp: 1, op: opid})); + + fp.off(); + forgetMigrationThread.join(); + tenantMigrationTest.waitForMigrationGarbageCollection(migrationId, tenantId); + } +} +tenantMigrationTest.stop(); +})(); diff --git a/src/mongo/db/repl/tenant_migration_donor_service.cpp b/src/mongo/db/repl/tenant_migration_donor_service.cpp index 47330bea141..0595a77f7a5 100644 --- a/src/mongo/db/repl/tenant_migration_donor_service.cpp +++ b/src/mongo/db/repl/tenant_migration_donor_service.cpp @@ -70,6 +70,10 @@ MONGO_FAIL_POINT_DEFINE(pauseTenantMigrationDonorBeforeWaitingForKeysToReplicate MONGO_FAIL_POINT_DEFINE(pauseTenantMigrationDonorBeforeMarkingStateGarbageCollectable); MONGO_FAIL_POINT_DEFINE(pauseTenantMigrationBeforeEnteringFutureChain); MONGO_FAIL_POINT_DEFINE(pauseTenantMigrationAfterFetchingAndStoringKeys); +MONGO_FAIL_POINT_DEFINE(pauseTenantMigrationDonorWhileUpdatingStateDoc); +MONGO_FAIL_POINT_DEFINE(pauseTenantMigrationBeforeInsertingDonorStateDoc); +MONGO_FAIL_POINT_DEFINE(pauseTenantMigrationBeforeCreatingStateDocumentTTLIndex); +MONGO_FAIL_POINT_DEFINE(pauseTenantMigrationBeforeCreatingExternalKeysTTLIndex); const std::string kTTLIndexName = "TenantMigrationDonorTTLIndex"; const std::string kExternalKeysTTLIndexName = "ExternalKeysTTLIndex"; @@ -88,13 +92,15 @@ bool shouldStopSendingRecipientCommand(Status status) { !(ErrorCodes::isRetriableError(status) || // Returned if findHost() is unable to target the recipient in 15 seconds, which may // happen after a failover. - status == ErrorCodes::FailedToSatisfyReadPreference); + status == ErrorCodes::FailedToSatisfyReadPreference || + ErrorCodes::isInterruption(status)); } bool shouldStopFetchingRecipientClusterTimeKeyDocs(Status status) { // TODO (SERVER-54926): Convert HostUnreachable error in // _fetchAndStoreRecipientClusterTimeKeyDocs to specific error. - return status.isOK() || !ErrorCodes::isRetriableError(status) || + return status.isOK() || + !(ErrorCodes::isRetriableError(status) || ErrorCodes::isInterruption(status)) || status.code() == ErrorCodes::HostUnreachable; } void checkForTokenInterrupt(const CancelationToken& token) { @@ -148,6 +154,8 @@ ExecutorFuture<void> TenantMigrationDonorService::createStateDocumentTTLIndex( auto opCtx = opCtxHolder.get(); DBDirectClient client(opCtx); + pauseTenantMigrationBeforeCreatingStateDocumentTTLIndex.pauseWhileSet(opCtx); + BSONObj result; client.runCommand( nss.db().toString(), @@ -173,6 +181,8 @@ ExecutorFuture<void> TenantMigrationDonorService::createExternalKeysTTLIndex( auto opCtx = opCtxHolder.get(); DBDirectClient client(opCtx); + pauseTenantMigrationBeforeCreatingExternalKeysTTLIndex.pauseWhileSet(opCtx); + BSONObj result; client.runCommand( nss.db().toString(), @@ -414,6 +424,8 @@ ExecutorFuture<repl::OpTime> TenantMigrationDonorService::Instance::_insertState auto opCtxHolder = cc().makeOperationContext(); auto opCtx = opCtxHolder.get(); + pauseTenantMigrationBeforeInsertingDonorStateDoc.pauseWhileSet(opCtx); + AutoGetCollection collection(opCtx, _stateDocumentsNS, MODE_IX); writeConflictRetry( @@ -455,6 +467,8 @@ ExecutorFuture<repl::OpTime> TenantMigrationDonorService::Instance::_updateState auto opCtxHolder = cc().makeOperationContext(); auto opCtx = opCtxHolder.get(); + pauseTenantMigrationDonorWhileUpdatingStateDoc.pauseWhileSet(opCtx); + AutoGetCollection collection(opCtx, _stateDocumentsNS, MODE_IX); uassert(ErrorCodes::NamespaceNotFound, @@ -658,12 +672,9 @@ ExecutorFuture<void> TenantMigrationDonorService::Instance::_sendRecipientSyncDa std::shared_ptr<RemoteCommandTargeter> recipientTargeterRS, const CancelationToken& token) { - auto opCtxHolder = cc().makeOperationContext(); - auto opCtx = opCtxHolder.get(); - const auto cmdObj = [&] { auto donorConnString = - repl::ReplicationCoordinator::get(opCtx)->getConfig().getConnectionString(); + repl::ReplicationCoordinator::get(_serviceContext)->getConfig().getConnectionString(); RecipientSyncData request; request.setDbName(NamespaceString::kAdminDb); @@ -689,11 +700,8 @@ ExecutorFuture<void> TenantMigrationDonorService::Instance::_sendRecipientForget std::shared_ptr<RemoteCommandTargeter> recipientTargeterRS, const CancelationToken& token) { - auto opCtxHolder = cc().makeOperationContext(); - auto opCtx = opCtxHolder.get(); - auto donorConnString = - repl::ReplicationCoordinator::get(opCtx)->getConfig().getConnectionString(); + repl::ReplicationCoordinator::get(_serviceContext)->getConfig().getConnectionString(); RecipientForgetMigration request; request.setDbName(NamespaceString::kAdminDb); diff --git a/src/mongo/db/repl/tenant_migration_util.cpp b/src/mongo/db/repl/tenant_migration_util.cpp index 2727ac524fa..9989cf4cb9d 100644 --- a/src/mongo/db/repl/tenant_migration_util.cpp +++ b/src/mongo/db/repl/tenant_migration_util.cpp @@ -61,6 +61,7 @@ const std::set<std::string> kSensitiveFieldNames{"donorCertificateForRecipient", "recipientCertificateForDonor"}; MONGO_FAIL_POINT_DEFINE(pauseTenantMigrationBeforeMarkingExternalKeysGarbageCollectable); +MONGO_FAIL_POINT_DEFINE(pauseTenantMigrationBeforeStoringExternalClusterTimeKeyDocs); } // namespace @@ -81,6 +82,8 @@ repl::OpTime storeExternalClusterTimeKeyDocs(std::vector<ExternalKeysCollectionD auto opCtx = opCtxHolder.get(); auto nss = NamespaceString::kExternalKeysCollectionNamespace; + pauseTenantMigrationBeforeStoringExternalClusterTimeKeyDocs.pauseWhileSet(opCtx); + for (auto& keyDoc : keyDocs) { AutoGetCollection collection(opCtx, nss, MODE_IX); |