From 3afa5f4fde94a47d8d3ca3743954216c8342f32d Mon Sep 17 00:00:00 2001 From: Lingzhi Deng Date: Wed, 17 Mar 2021 23:03:23 +0000 Subject: SERVER-54920: Move the tenant migration recipient MTAB lifetime earlier --- .../replsets/tenant_migration_collection_ttl.js | 95 +++++++++++++++++----- ...ant_migration_concurrent_writes_on_recipient.js | 4 +- ...igration_recipient_resumes_on_donor_failover.js | 13 ++- ...e_collection_cloner_after_recipient_failover.js | 13 ++- ...ration_resume_collection_cloner_after_rename.js | 13 ++- src/mongo/db/repl/SConscript | 12 ++- src/mongo/db/repl/tenant_collection_cloner.cpp | 7 ++ src/mongo/db/repl/tenant_database_cloner.cpp | 3 + .../tenant_migration_recipient_access_blocker.cpp | 11 +++ .../tenant_migration_recipient_access_blocker.h | 3 + .../tenant_migration_recipient_op_observer.cpp | 11 +-- 11 files changed, 143 insertions(+), 42 deletions(-) diff --git a/jstests/replsets/tenant_migration_collection_ttl.js b/jstests/replsets/tenant_migration_collection_ttl.js index e1fd6d1ec0e..3f5a56961df 100644 --- a/jstests/replsets/tenant_migration_collection_ttl.js +++ b/jstests/replsets/tenant_migration_collection_ttl.js @@ -20,7 +20,9 @@ const garbageCollectionOpts = { // up the test. tenantMigrationGarbageCollectionDelayMS: 5 * 1000, // Set the TTL interval large enough to decrease the probability of races. - ttlMonitorSleepSecs: 5 + ttlMonitorSleepSecs: 5, + // Allow reads on recipient before migration completes for testing. + 'failpoint.tenantMigrationRecipientNotRejectReads': tojson({mode: 'alwaysOn'}), }; const tenantMigrationTest = new TenantMigrationTest( @@ -30,8 +32,6 @@ if (!tenantMigrationTest.isFeatureFlagEnabled()) { return; } -const tenantId = "testTenantId"; -const dbName = tenantMigrationTest.tenantDB(tenantId, "testDB"); const collName = "testColl"; const donorRst = tenantMigrationTest.getDonorRst(); @@ -50,7 +50,7 @@ function prepareData() { }); } -function prepareDb(ttlTimeoutSeconds = 0) { +function prepareDb(dbName, ttlTimeoutSeconds = 0) { let db = donorPrimary.getDB(dbName); tenantMigrationTest.insertDonorDB(dbName, collName, prepareData()); // Create TTL index. @@ -72,29 +72,86 @@ function waitForOneTTLPassAtNode(node) { }, "TTLMonitor never did any passes."); } -function getDocumentCount(node) { +function getDocumentCount(dbName, node) { return node.getDB(dbName)[collName].count(); } -function assertTTLNotDeleteExpiredDocs(node) { - assert.eq(numDocs, getDocumentCount(node)); +function assertTTLNotDeleteExpiredDocs(dbName, node) { + assert.eq(numDocs, getDocumentCount(dbName, node)); } -function assertTTLDeleteExpiredDocs(node) { +function assertTTLDeleteExpiredDocs(dbName, node) { waitForOneTTLPassAtNode(node); assert.soon(() => { - let found = getDocumentCount(node); + let found = getDocumentCount(dbName, node); jsTest.log(`${found} documents in the ${node} collection`); return found == 0; }, `TTL doesn't clean the database at ${node}`); } // Tests that: -// 1. At the recipient, the TTL deletions are suspended until migration is forgotten. +// 1. At the recipient, the TTL deletions are suspended during the cloning phase. +// 2. At the donor, TTL deletions are not suspended before blocking state. 
+(() => {
+    jsTest.log("Test that the TTL does not delete documents on recipient during cloning");
+
+    const tenantId = "testTenantId_duringCloning";
+    const dbName = tenantMigrationTest.tenantDB(tenantId, "testDB");
+
+    const migrationId = UUID();
+    const migrationOpts = {
+        migrationIdString: extractUUIDFromObject(migrationId),
+        tenantId: tenantId,
+        recipientConnString: tenantMigrationTest.getRecipientConnString(),
+    };
+
+    // We start the test right after the donor TTL cycle.
+    waitForOneTTLPassAtNode(donorPrimary);
+    // The TTL timeout is intentionally shorter than the TTL interval so that the documents
+    // become subject to TTL in the first round.
+    prepareDb(dbName, 3);
+
+    const recipientDb = recipientPrimary.getDB(dbName);
+    let recipientColl = recipientDb.getCollection(collName);
+    const hangDuringCollectionClone = configureFailPoint(
+        recipientDb,
+        "hangAfterClonerStage",
+        {cloner: "TenantCollectionCloner", stage: "query", nss: recipientColl.getFullName()});
+
+    assert.commandWorked(tenantMigrationTest.startMigration(migrationOpts));
+
+    hangDuringCollectionClone.wait();
+
+    waitForOneTTLPassAtNode(donorPrimary);
+    waitForOneTTLPassAtNode(recipientPrimary);
+
+    // All documents should expire on the donor but not on the recipient.
+    assertTTLDeleteExpiredDocs(dbName, donorPrimary);
+    assertTTLNotDeleteExpiredDocs(dbName, recipientPrimary);
+
+    hangDuringCollectionClone.off();
+
+    const stateRes =
+        assert.commandWorked(tenantMigrationTest.waitForMigrationToComplete(migrationOpts));
+    assert.eq(stateRes.state, TenantMigrationTest.State.kCommitted);
+
+    // Data should be consistent after the migration commits.
+    assertTTLDeleteExpiredDocs(dbName, recipientPrimary);
+    assertTTLDeleteExpiredDocs(dbName, donorPrimary);
+
+    assert.commandWorked(tenantMigrationTest.forgetMigration(migrationOpts.migrationIdString));
+})();
+
+// Tests that:
+// 1. At the recipient, the TTL deletions are suspended after the cloning phase until migration is
+// forgotten.
 // 2. At the donor, TTL deletions are suspended during blocking state. This verifies that
 //    the TTL mechanism respects the same MTAB mechanism as normal updates.
 (() => {
-    jsTest.log("Test that the TTL does not delete documents during tenant migration");
+    jsTest.log("Test that the TTL does not delete documents on recipient after cloning");
+
+    const tenantId = "testTenantId_afterCloning";
+    const dbName = tenantMigrationTest.tenantDB(tenantId, "testDB");
 
     const migrationId = UUID();
     const migrationOpts = {
@@ -108,7 +165,7 @@
     // The TTL timeout is intentionally shorter than TTL interval to let the documents to be subject
     // of TTL in the first round. It also should be long enough to let the startMigration() finish
     // before the timeout expires.
-    prepareDb(3);
+    prepareDb(dbName, 3);
 
     let blockFp =
         configureFailPoint(donorPrimary, "pauseTenantMigrationBeforeLeavingBlockingState");
@@ -118,7 +175,7 @@
 
     // At a very slow machine, there is a chance that a TTL cycle happened at the donor
     // before it entered the blocking phase. This flag is set when there was a race.
-    const donorHadNoTTLCyclesBeforeBlocking = numDocs == getDocumentCount(donorPrimary);
+    const donorHadNoTTLCyclesBeforeBlocking = numDocs == getDocumentCount(dbName, donorPrimary);
     if (!donorHadNoTTLCyclesBeforeBlocking) {
         jsTestLog('A rare race when TTL cycle happened before donor entered its blocking phase');
         return;
@@ -128,8 +185,8 @@
     // 1.
TTL is suspended at the recipient // 2. As there was no race with TTL cycle at the donor, TTL is suspended as well. waitForOneTTLPassAtNode(recipientPrimary); - assertTTLNotDeleteExpiredDocs(recipientPrimary); - assertTTLNotDeleteExpiredDocs(donorPrimary); + assertTTLNotDeleteExpiredDocs(dbName, recipientPrimary); + assertTTLNotDeleteExpiredDocs(dbName, donorPrimary); blockFp.off(); @@ -140,14 +197,14 @@ function assertTTLDeleteExpiredDocs(node) { // Tests that the TTL cleanup was suspended during the tenant migration. waitForOneTTLPassAtNode(donorPrimary); waitForOneTTLPassAtNode(recipientPrimary); - assertTTLNotDeleteExpiredDocs(recipientPrimary); - assertTTLNotDeleteExpiredDocs(donorPrimary); + assertTTLNotDeleteExpiredDocs(dbName, recipientPrimary); + assertTTLNotDeleteExpiredDocs(dbName, donorPrimary); assert.commandWorked(tenantMigrationTest.forgetMigration(migrationOpts.migrationIdString)); // After the tenant migration is aborted, the TTL cleanup is restored. - assertTTLDeleteExpiredDocs(recipientPrimary); - assertTTLDeleteExpiredDocs(donorPrimary); + assertTTLDeleteExpiredDocs(dbName, recipientPrimary); + assertTTLDeleteExpiredDocs(dbName, donorPrimary); })(); tenantMigrationTest.stop(); diff --git a/jstests/replsets/tenant_migration_concurrent_writes_on_recipient.js b/jstests/replsets/tenant_migration_concurrent_writes_on_recipient.js index a398bb52d96..f2a1bc6824c 100644 --- a/jstests/replsets/tenant_migration_concurrent_writes_on_recipient.js +++ b/jstests/replsets/tenant_migration_concurrent_writes_on_recipient.js @@ -58,7 +58,7 @@ const kTenantId = "testTenantId"; startOplogFetcherFp.wait(); // Write before cloning is done. - assert.commandWorked(tenantCollOnRecipient.remove({_id: 1})); + assert.commandFailedWithCode(tenantCollOnRecipient.remove({_id: 1}), ErrorCodes.SnapshotTooOld); startOplogFetcherFp.off(); clonerDoneFp.wait(); @@ -152,4 +152,4 @@ const kTenantId = "testTenantId"; })(); tenantMigrationTest.stop(); -})(); \ No newline at end of file +})(); diff --git a/jstests/replsets/tenant_migration_recipient_resumes_on_donor_failover.js b/jstests/replsets/tenant_migration_recipient_resumes_on_donor_failover.js index b1fc37d8ecd..3a885ba2376 100644 --- a/jstests/replsets/tenant_migration_recipient_resumes_on_donor_failover.js +++ b/jstests/replsets/tenant_migration_recipient_resumes_on_donor_failover.js @@ -23,10 +23,15 @@ function runTest(failPoint) { const recipientRst = new ReplSetTest({ nodes: 2, name: jsTestName() + "_recipient", - // Use a batch size of 2 so that collection cloner requires more than a single batch to - // complete. - nodeOptions: Object.assign(TenantMigrationUtil.makeX509OptionsForTest().recipient, - {setParameter: {collectionClonerBatchSize: 2}}) + nodeOptions: Object.assign(TenantMigrationUtil.makeX509OptionsForTest().recipient, { + setParameter: { + // Use a batch size of 2 so that collection cloner requires more than a single batch + // to complete. + collectionClonerBatchSize: 2, + // Allow reads on recipient before migration completes for testing. 
+ 'failpoint.tenantMigrationRecipientNotRejectReads': tojson({mode: 'alwaysOn'}), + } + }) }); recipientRst.startSet(); recipientRst.initiateWithHighElectionTimeout(); diff --git a/jstests/replsets/tenant_migration_resume_collection_cloner_after_recipient_failover.js b/jstests/replsets/tenant_migration_resume_collection_cloner_after_recipient_failover.js index 3a2b0439ea9..71510fb02cd 100644 --- a/jstests/replsets/tenant_migration_resume_collection_cloner_after_recipient_failover.js +++ b/jstests/replsets/tenant_migration_resume_collection_cloner_after_recipient_failover.js @@ -16,10 +16,15 @@ load("jstests/replsets/libs/tenant_migration_util.js"); const recipientRst = new ReplSetTest({ nodes: 2, name: jsTestName() + "_recipient", - // Use a batch size of 2 so that collection cloner requires more than a single batch to - // complete. - nodeOptions: Object.assign(TenantMigrationUtil.makeX509OptionsForTest().recipient, - {setParameter: {collectionClonerBatchSize: 2}}) + nodeOptions: Object.assign(TenantMigrationUtil.makeX509OptionsForTest().recipient, { + setParameter: { + // Use a batch size of 2 so that collection cloner requires more than a single batch to + // complete. + collectionClonerBatchSize: 2, + // Allow reads on recipient before migration completes for testing. + 'failpoint.tenantMigrationRecipientNotRejectReads': tojson({mode: 'alwaysOn'}), + } + }) }); recipientRst.startSet(); diff --git a/jstests/replsets/tenant_migration_resume_collection_cloner_after_rename.js b/jstests/replsets/tenant_migration_resume_collection_cloner_after_rename.js index 9cde0e2cbf0..a949e25e08f 100644 --- a/jstests/replsets/tenant_migration_resume_collection_cloner_after_rename.js +++ b/jstests/replsets/tenant_migration_resume_collection_cloner_after_rename.js @@ -16,10 +16,15 @@ load("jstests/replsets/libs/tenant_migration_util.js"); const recipientRst = new ReplSetTest({ nodes: 2, name: jsTestName() + "_recipient", - // Use a batch size of 2 so that collection cloner requires more than a single batch to - // complete. - nodeOptions: Object.assign(TenantMigrationUtil.makeX509OptionsForTest().recipient, - {setParameter: {collectionClonerBatchSize: 2}}) + nodeOptions: Object.assign(TenantMigrationUtil.makeX509OptionsForTest().recipient, { + setParameter: { + // Use a batch size of 2 so that collection cloner requires more than a single batch to + // complete. + collectionClonerBatchSize: 2, + // Allow reads on recipient before migration completes for testing. 
+ 'failpoint.tenantMigrationRecipientNotRejectReads': tojson({mode: 'alwaysOn'}), + } + }) }); recipientRst.startSet(); diff --git a/src/mongo/db/repl/SConscript b/src/mongo/db/repl/SConscript index cbcc6c938de..c2bf8e36c53 100644 --- a/src/mongo/db/repl/SConscript +++ b/src/mongo/db/repl/SConscript @@ -32,16 +32,25 @@ env.Library( ], ) +env.Library( + target='tenant_migration_decoration', + source=[ + 'tenant_migration_decoration.cpp', + ], +) + env.Library( target='oplog', source=[ 'apply_ops.cpp', 'oplog.cpp', 'oplog_entry_or_grouped_inserts.cpp', - 'tenant_migration_decoration.cpp', 'transaction_oplog_application.cpp', 'apply_ops.idl', ], + LIBDEPS=[ + 'tenant_migration_decoration', + ], LIBDEPS_PRIVATE=[ '$BUILD_DIR/mongo/base', '$BUILD_DIR/mongo/db/catalog/catalog_helpers', @@ -1371,6 +1380,7 @@ env.Library( 'local_oplog_info', 'optime', 'repl_coordinator_interface', + 'tenant_migration_decoration', 'tenant_migration_errors', 'tenant_migration_state_machine_idl' ], diff --git a/src/mongo/db/repl/tenant_collection_cloner.cpp b/src/mongo/db/repl/tenant_collection_cloner.cpp index 5a16d7a4576..a1beb962892 100644 --- a/src/mongo/db/repl/tenant_collection_cloner.cpp +++ b/src/mongo/db/repl/tenant_collection_cloner.cpp @@ -323,6 +323,13 @@ BaseCloner::AfterStageBehavior TenantCollectionCloner::createCollectionStage() { // We are resuming and the collection already exists. DBDirectClient client(opCtx.get()); + // Set the recipient info on the opCtx to bypass the access blocker for local reads. + tenantMigrationRecipientInfo(opCtx.get()) = + boost::make_optional(getSharedData()->getMigrationId()); + // Reset the recipient info after local reads so oplog entries for future writes + // (createCollection/createIndex) don't get stamped with the fromTenantMigration field. 
+            ON_BLOCK_EXIT([&opCtx] { tenantMigrationRecipientInfo(opCtx.get()) = boost::none; });
+
         auto fieldsToReturn = BSON("_id" << 1);
         _lastDocId =
             client.findOne(_existingNss->ns(), Query().sort(BSON("_id" << -1)), &fieldsToReturn);
diff --git a/src/mongo/db/repl/tenant_database_cloner.cpp b/src/mongo/db/repl/tenant_database_cloner.cpp
index c4c21178f81..faeafb32516 100644
--- a/src/mongo/db/repl/tenant_database_cloner.cpp
+++ b/src/mongo/db/repl/tenant_database_cloner.cpp
@@ -38,6 +38,7 @@
 #include "mongo/db/repl/database_cloner_gen.h"
 #include "mongo/db/repl/tenant_collection_cloner.h"
 #include "mongo/db/repl/tenant_database_cloner.h"
+#include "mongo/db/repl/tenant_migration_decoration.h"
 #include "mongo/logv2/log.h"
 #include "mongo/rpc/get_status_from_command_result.h"
 #include "mongo/util/assert_util.h"
@@ -174,6 +175,8 @@ BaseCloner::AfterStageBehavior TenantDatabaseCloner::listCollectionsStage() {
 BaseCloner::AfterStageBehavior TenantDatabaseCloner::listExistingCollectionsStage() {
     auto opCtx = cc().makeOperationContext();
     DBDirectClient client(opCtx.get());
+    tenantMigrationRecipientInfo(opCtx.get()) =
+        boost::make_optional(getSharedData()->getMigrationId());
 
     std::vector<UUID> clonedCollectionUUIDs;
     auto collectionInfos =
diff --git a/src/mongo/db/repl/tenant_migration_recipient_access_blocker.cpp b/src/mongo/db/repl/tenant_migration_recipient_access_blocker.cpp
index 3a22934b09d..4ddf2bb1260 100644
--- a/src/mongo/db/repl/tenant_migration_recipient_access_blocker.cpp
+++ b/src/mongo/db/repl/tenant_migration_recipient_access_blocker.cpp
@@ -36,6 +36,7 @@
 #include "mongo/db/repl/replication_coordinator.h"
 #include "mongo/db/repl/storage_interface.h"
 #include "mongo/db/repl/tenant_migration_access_blocker_executor.h"
+#include "mongo/db/repl/tenant_migration_decoration.h"
 #include "mongo/db/repl/tenant_migration_recipient_access_blocker.h"
 #include "mongo/logv2/log.h"
 #include "mongo/util/cancelation.h"
@@ -82,6 +83,16 @@ SharedSemiFuture<void> TenantMigrationRecipientAccessBlocker::getCanReadFuture(
         return SharedSemiFuture<void>();
     }
 
+    // Exclude internal reads decorated with 'tenantMigrationRecipientInfo' from blocking.
+    if (repl::tenantMigrationRecipientInfo(opCtx).has_value()) {
+        LOGV2_DEBUG(5492000,
+                    1,
+                    "Internal tenant read got excluded from the MTAB filtering",
+                    "tenantId"_attr = _tenantId,
+                    "opId"_attr = opCtx->getOpID());
+        return SharedSemiFuture<void>();
+    }
+
     auto readConcernArgs = repl::ReadConcernArgs::get(opCtx);
     auto atClusterTime = [opCtx, &readConcernArgs]() -> boost::optional<Timestamp> {
         if (auto atClusterTime = readConcernArgs.getArgsAtClusterTime()) {
diff --git a/src/mongo/db/repl/tenant_migration_recipient_access_blocker.h b/src/mongo/db/repl/tenant_migration_recipient_access_blocker.h
index fa387afdca4..3f622d62039 100644
--- a/src/mongo/db/repl/tenant_migration_recipient_access_blocker.h
+++ b/src/mongo/db/repl/tenant_migration_recipient_access_blocker.h
@@ -64,6 +64,9 @@ namespace mongo {
  * To ensure atClusterTime and afterClusterTime reads are consistent, when the recipient receives a
  * recipientSyncData command with a returnAfterReachingTimestamp after the consistent point, the
  * `rejectBeforeTimestamp` will be advanced to the given returnAfterReachingTimestamp.
+ *
+ * The blocker excludes all operations that have the 'tenantMigrationRecipientInfo' decoration
+ * set, as they are internal.
 */
class TenantMigrationRecipientAccessBlocker
    : public std::enable_shared_from_this<TenantMigrationRecipientAccessBlocker>,
diff --git a/src/mongo/db/repl/tenant_migration_recipient_op_observer.cpp b/src/mongo/db/repl/tenant_migration_recipient_op_observer.cpp
index e57ba2aa7e2..6c01ec62935 100644
--- a/src/mongo/db/repl/tenant_migration_recipient_op_observer.cpp
+++ b/src/mongo/db/repl/tenant_migration_recipient_op_observer.cpp
@@ -49,11 +49,8 @@ const auto tenantIdToDeleteDecoration =
  * Initializes the TenantMigrationRecipientAccessBlocker for the tenant migration denoted by the
  * given state doc.
  */
-void onFinishCloning(OperationContext* opCtx,
-                     const TenantMigrationRecipientDocument& recipientStateDoc) {
-    invariant(recipientStateDoc.getState() == TenantMigrationRecipientStateEnum::kStarted);
-    invariant(recipientStateDoc.getDataConsistentStopDonorOpTime());
-
+void createAccessBlockerIfNeeded(OperationContext* opCtx,
+                                 const TenantMigrationRecipientDocument& recipientStateDoc) {
     if (tenant_migration_access_blocker::getTenantMigrationRecipientAccessBlocker(
             opCtx->getServiceContext(), recipientStateDoc.getTenantId())) {
         // The migration failed part-way on the recipient with a retryable error, and got retried
@@ -117,9 +114,7 @@ void TenantMigrationRecipientOpObserver::onUpdate(OperationContext* opCtx,
         case TenantMigrationRecipientStateEnum::kDone:
             break;
         case TenantMigrationRecipientStateEnum::kStarted:
-            if (recipientStateDoc.getDataConsistentStopDonorOpTime()) {
-                onFinishCloning(opCtx, recipientStateDoc);
-            }
+            createAccessBlockerIfNeeded(opCtx, recipientStateDoc);
             break;
         case TenantMigrationRecipientStateEnum::kConsistent:
             if (recipientStateDoc.getRejectReadsBeforeTimestamp()) {
--
cgit v1.2.1
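
The cloner and access-blocker hunks above all rely on one idea: an internal read is tagged by setting the 'tenantMigrationRecipientInfo' decoration on its OperationContext, getCanReadFuture() lets any tagged operation straight through, and the tag is cleared before the cloner issues writes so their oplog entries are not stamped with the fromTenantMigration field. The standalone C++ sketch below illustrates that set/bypass/reset pattern. It is not MongoDB code: OperationContext, canReadNow(), and RecipientInfoGuard here are simplified stand-ins for the real types (the actual patch uses the Decorable mechanism, SharedSemiFuture<void>, and ON_BLOCK_EXIT rather than a dedicated guard class).

#include <cassert>
#include <iostream>
#include <optional>

// Stand-in for mongo::OperationContext: the patch decorates the real one with
// a boost::optional migration id via tenantMigrationRecipientInfo().
struct OperationContext {
    std::optional<int> tenantMigrationRecipientInfo;
};

// Mirrors the new early-return in getCanReadFuture(): a read tagged as an
// internal recipient read bypasses the blocker; any other read stays blocked
// here (the reject-before-timestamp logic is elided).
bool canReadNow(const OperationContext& opCtx) {
    if (opCtx.tenantMigrationRecipientInfo.has_value()) {
        return true;  // internal cloner read: excluded from MTAB filtering
    }
    return false;  // external tenant read: still blocked mid-migration
}

// Stand-in for the set-then-ON_BLOCK_EXIT-reset pair in createCollectionStage(),
// expressed as an RAII guard so the decoration cannot outlive the local read.
class RecipientInfoGuard {
public:
    RecipientInfoGuard(OperationContext& opCtx, int migrationId) : _opCtx(opCtx) {
        _opCtx.tenantMigrationRecipientInfo = migrationId;
    }
    ~RecipientInfoGuard() {
        // Reset so later writes are not stamped as migration-internal.
        _opCtx.tenantMigrationRecipientInfo.reset();
    }

private:
    OperationContext& _opCtx;
};

int main() {
    OperationContext opCtx;
    assert(!canReadNow(opCtx));  // ordinary read is blocked
    {
        RecipientInfoGuard guard(opCtx, /*migrationId=*/42);
        assert(canReadNow(opCtx));  // cloner's local read passes through
    }
    assert(!canReadNow(opCtx));  // decoration was reset on scope exit
    std::cout << "decoration-based bypass behaves as expected" << std::endl;
    return 0;
}

Expressing the reset as RAII mirrors the choice the patch makes with ON_BLOCK_EXIT: the decoration is cleared on every exit path, so a failed clone cannot leave subsequent writes tagged as migration-internal.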