From 09c474af1cca7c54d3f93ac46d8fbe9549cd4689 Mon Sep 17 00:00:00 2001 From: Vishnu Kaushik Date: Thu, 11 Feb 2021 22:05:17 +0000 Subject: SERVER-54478 Fix issue where recipient primary failover after forget migration can lead to the oplog buffer collection not being dropped on new primary --- ...nt_migration_recipient_stepdown_after_forget.js | 71 ++++++++++++++++++++++ .../db/repl/tenant_migration_recipient_service.cpp | 11 ++-- 2 files changed, 77 insertions(+), 5 deletions(-) create mode 100644 jstests/replsets/tenant_migration_recipient_stepdown_after_forget.js diff --git a/jstests/replsets/tenant_migration_recipient_stepdown_after_forget.js b/jstests/replsets/tenant_migration_recipient_stepdown_after_forget.js new file mode 100644 index 00000000000..53b93f73d57 --- /dev/null +++ b/jstests/replsets/tenant_migration_recipient_stepdown_after_forget.js @@ -0,0 +1,71 @@ +/** + * Tests whether the new recipient primary properly processes a forgetMigration when the original + * primary is made to step down after marking the migration as garbage collectable. The oplog + * buffer collection must still be dropped on the new primary. + * + * @tags: [requires_fcv_49, requires_replication, incompatible_with_windows_tls] + */ + +(function() { + +"use strict"; +load("jstests/libs/uuid_util.js"); // For extractUUIDFromObject(). +load("jstests/libs/fail_point_util.js"); // For configureFailPoint(). +load("jstests/libs/parallelTester.js"); // For Thread(), used for async forgetMigration. 
+load("jstests/replsets/libs/tenant_migration_test.js"); +load("jstests/replsets/libs/tenant_migration_util.js"); + +const tenantMigrationTest = + new TenantMigrationTest({name: jsTestName(), sharedOptions: {nodes: 2}}); + +if (!tenantMigrationTest.isFeatureFlagEnabled()) { + jsTestLog("Skipping test because the tenant migrations feature flag is disabled"); + tenantMigrationTest.stop(); + return; +} + +const kMigrationId = UUID(); +const kTenantId = 'testTenantId'; +const kReadPreference = { + mode: "primary" +}; +const migrationOpts = { + migrationIdString: extractUUIDFromObject(kMigrationId), + tenantId: kTenantId, + readPreference: kReadPreference +}; + +tenantMigrationTest.runMigration( + migrationOpts, true /* retryOnRetryableErrors */, false /* automaticForgetMigration */); + +const fpBeforeDroppingOplogBufferCollection = + configureFailPoint(tenantMigrationTest.getRecipientPrimary(), + "fpBeforeDroppingOplogBufferCollection", + {action: "hang"}); + +jsTestLog("Issuing a forget migration command."); +const forgetMigrationThread = + new Thread(TenantMigrationUtil.forgetMigrationAsync, + migrationOpts.migrationIdString, + TenantMigrationUtil.createRstArgs(tenantMigrationTest.getDonorRst()), + true /* retryOnRetryableErrors */); +forgetMigrationThread.start(); + +fpBeforeDroppingOplogBufferCollection.wait(); + +jsTestLog("Step up a new recipient primary."); +assert.commandWorked(tenantMigrationTest.getRecipientRst().getSecondaries()[0].adminCommand( + {replSetStepUp: ReplSetTest.kForeverSecs, force: true})); + +fpBeforeDroppingOplogBufferCollection.off(); + +jsTestLog("Waiting for forget migration to complete."); +assert.commandWorked(forgetMigrationThread.returnData()); + +const configDBCollections = + tenantMigrationTest.getRecipientPrimary().getDB('config').getCollectionNames(); +assert(!configDBCollections.includes('repl.migration.oplog_' + migrationOpts.migrationIdString), + configDBCollections); + +tenantMigrationTest.stop(); +})(); \ No newline at end of 
file diff --git a/src/mongo/db/repl/tenant_migration_recipient_service.cpp b/src/mongo/db/repl/tenant_migration_recipient_service.cpp index d0780d91cd2..bf964c2e834 100644 --- a/src/mongo/db/repl/tenant_migration_recipient_service.cpp +++ b/src/mongo/db/repl/tenant_migration_recipient_service.cpp @@ -100,6 +100,7 @@ MONGO_FAIL_POINT_DEFINE(hangBeforeTaskCompletion); MONGO_FAIL_POINT_DEFINE(fpAfterReceivingRecipientForgetMigration); MONGO_FAIL_POINT_DEFINE(hangAfterCreatingRSM); MONGO_FAIL_POINT_DEFINE(skipRetriesWhenConnectingToDonorHost); +MONGO_FAIL_POINT_DEFINE(fpBeforeDroppingOplogBufferCollection); namespace { // We never restart just the oplog fetcher. If a failure occurs, we restart the whole state machine @@ -1377,15 +1378,14 @@ SemiFuture TenantMigrationRecipientService::Instance::run( pauseAfterRunTenantMigrationRecipientInstance.pauseWhileSet(); - uassert(ErrorCodes::TenantMigrationForgotten, - str::stream() << "Migration " << getMigrationUUID() - << " already marked for garbage collect", - !_stateDoc.getExpireAt()); - return _initializeStateDoc(lk); }) .then([this, self = shared_from_this()] { _stateDocPersistedPromise.emplaceValue(); + uassert(ErrorCodes::TenantMigrationForgotten, + str::stream() << "Migration " << getMigrationUUID() + << " already marked for garbage collect", + !_stateDoc.getExpireAt()); _stopOrHangOnFailPoint(&fpAfterPersistingTenantMigrationRecipientInstanceStateDoc); return _createAndConnectClients(); }) @@ -1604,6 +1604,7 @@ SemiFuture TenantMigrationRecipientService::Instance::run( }) .then([this, self = shared_from_this()] { return _markStateDocAsGarbageCollectable(); }) .then([this, self = shared_from_this()] { + _stopOrHangOnFailPoint(&fpBeforeDroppingOplogBufferCollection); auto opCtx = cc().makeOperationContext(); auto storageInterface = StorageInterface::get(opCtx.get()); -- cgit v1.2.1