diff options
author | Vesselina Ratcheva <vesselina.ratcheva@10gen.com> | 2021-02-02 03:15:44 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2021-02-09 01:56:05 +0000 |
commit | 6c3cf4a2640c0f08733df41330a8927b09744aaa (patch) | |
tree | 98b1cb9584d8b98dda03bed1027b4ac990e81583 | |
parent | 4d72470b050c348ad5fae2cf46c01e09943f5070 (diff) | |
download | mongo-6c3cf4a2640c0f08733df41330a8927b09744aaa.tar.gz |
SERVER-53823 Require matching FCV from donor when starting a tenant migration
5 files changed, 238 insertions, 0 deletions
diff --git a/jstests/replsets/tenant_migration_donor_recipient_fcv_mismatch.js b/jstests/replsets/tenant_migration_donor_recipient_fcv_mismatch.js new file mode 100644 index 00000000000..3696d6e99a6 --- /dev/null +++ b/jstests/replsets/tenant_migration_donor_recipient_fcv_mismatch.js @@ -0,0 +1,62 @@ +/** + * Tests that starting a migration fails if the donor and recipient do not share the same FCV. + * @tags: [requires_majority_read_concern, requires_fcv_49, incompatible_with_windows_tls] + */ + +(function() { +"use strict"; + +load("jstests/libs/fail_point_util.js"); +load("jstests/libs/uuid_util.js"); // for 'extractUUIDFromObject' +load("jstests/libs/parallelTester.js"); // for 'Thread' +load("jstests/replsets/libs/tenant_migration_test.js"); +load("jstests/replsets/libs/tenant_migration_util.js"); + +const tenantMigrationTest = new TenantMigrationTest({name: jsTestName()}); +if (!tenantMigrationTest.isFeatureFlagEnabled()) { + jsTestLog("Skipping test because the tenant migrations feature flag is disabled"); + return; +} + +const tenantId = "testTenantId"; +const dbName = tenantMigrationTest.tenantDB(tenantId, "testDB"); +const collName = "testColl"; + +const donorPrimary = tenantMigrationTest.getDonorPrimary(); +const recipientPrimary = tenantMigrationTest.getRecipientPrimary(); + +tenantMigrationTest.insertDonorDB(dbName, collName); + +const migrationId = UUID(); +const migrationOpts = { + migrationIdString: extractUUIDFromObject(migrationId), + recipientConnString: tenantMigrationTest.getRecipientConnString(), + tenantId: tenantId, +}; + +// Configure a failpoint to have the recipient primary hang after taking note of its FCV +// and before comparing it with that of the donor. +const recipientDB = recipientPrimary.getDB(dbName); +const hangAfterSavingFCV = configureFailPoint( + recipientDB, "fpAfterRecordingRecipientPrimaryStartingFCV", {action: "hang"}); + +// Start a migration and wait for recipient to hang at the failpoint. +assert.commandWorked(tenantMigrationTest.startMigration(migrationOpts)); +hangAfterSavingFCV.wait(); + +// Downgrade the FCV for the donor set and resume migration. +assert.commandWorked( + donorPrimary.adminCommand({setFeatureCompatibilityVersion: lastContinuousFCV})); +hangAfterSavingFCV.off(); + +// Make sure we see the FCV mismatch detection message on the recipient. +checkLog.containsJson(recipientPrimary, 5382300); + +// Upgrade again to check on the status of the migration from the donor's point of view. +assert.commandWorked(donorPrimary.adminCommand({setFeatureCompatibilityVersion: latestFCV})); +const stateRes = + assert.commandWorked(tenantMigrationTest.waitForMigrationToComplete(migrationOpts)); +assert.eq(stateRes.state, TenantMigrationTest.State.kAborted); + +tenantMigrationTest.stop(); +})(); diff --git a/jstests/replsets/tenant_migration_donor_recipient_fcv_mismatch_after_failover.js b/jstests/replsets/tenant_migration_donor_recipient_fcv_mismatch_after_failover.js new file mode 100644 index 00000000000..89e6a4bc387 --- /dev/null +++ b/jstests/replsets/tenant_migration_donor_recipient_fcv_mismatch_after_failover.js @@ -0,0 +1,70 @@ +/** + * Tests that restarting a migration attempt after a failover fails if the donor and recipient no + * longer share the same FCV. + * @tags: [requires_majority_read_concern, requires_fcv_49, incompatible_with_windows_tls] + */ + +(function() { +"use strict"; + +load("jstests/libs/fail_point_util.js"); +load("jstests/libs/uuid_util.js"); // for 'extractUUIDFromObject' +load("jstests/libs/parallelTester.js"); // for 'Thread' +load("jstests/replsets/libs/tenant_migration_test.js"); +load("jstests/replsets/libs/tenant_migration_util.js"); + +const tenantMigrationTest = new TenantMigrationTest({name: jsTestName()}); +if (!tenantMigrationTest.isFeatureFlagEnabled()) { + jsTestLog("Skipping test because the tenant migrations feature flag is disabled"); + return; +} + +const tenantId = "testTenantId"; +const dbName = tenantMigrationTest.tenantDB(tenantId, "testDB"); +const collName = "testColl"; + +const donorPrimary = tenantMigrationTest.getDonorPrimary(); +const recipientPrimary = tenantMigrationTest.getRecipientPrimary(); + +tenantMigrationTest.insertDonorDB(dbName, collName); + +const migrationId = UUID(); +const migrationOpts = { + migrationIdString: extractUUIDFromObject(migrationId), + recipientConnString: tenantMigrationTest.getRecipientConnString(), + tenantId: tenantId, +}; + +// Configure a failpoint to have the recipient primary hang after a successful initial comparison. +const recipientDB = recipientPrimary.getDB(dbName); +const hangAfterFirstFCVcheck = + configureFailPoint(recipientDB, "fpAfterComparingRecipientAndDonorFCV", {action: "hang"}); + +// Start a migration and wait for recipient to hang at the failpoint. +assert.commandWorked(tenantMigrationTest.startMigration(migrationOpts)); +hangAfterFirstFCVcheck.wait(); + +// Downgrade the FCV for the donor set. +assert.commandWorked( + donorPrimary.adminCommand({setFeatureCompatibilityVersion: lastContinuousFCV})); + +// Step up a new node in the recipient set and trigger a failover. The new primary should attempt to +// resume cloning, but fail upon re-checking the FCVs. +const recipientRst = tenantMigrationTest.getRecipientRst(); +const newRecipientPrimary = recipientRst.getSecondaries()[0]; +recipientRst.awaitLastOpCommitted(); +assert.commandWorked(newRecipientPrimary.adminCommand({replSetStepUp: 1})); +hangAfterFirstFCVcheck.off(); +recipientRst.getPrimary(); + +// Make sure we see the FCV mismatch detection message on the recipient regardless. +checkLog.containsJson(newRecipientPrimary, 5382300); + +// Upgrade again to check on the status of the migration from the donor's point of view. +assert.commandWorked(donorPrimary.adminCommand({setFeatureCompatibilityVersion: latestFCV})); +const stateRes = + assert.commandWorked(tenantMigrationTest.waitForMigrationToComplete(migrationOpts)); +assert.eq(stateRes.state, TenantMigrationTest.State.kAborted); + +tenantMigrationTest.stop(); +})(); diff --git a/src/mongo/db/repl/tenant_migration_recipient_service.cpp b/src/mongo/db/repl/tenant_migration_recipient_service.cpp index 514e73f7f21..8d569280538 100644 --- a/src/mongo/db/repl/tenant_migration_recipient_service.cpp +++ b/src/mongo/db/repl/tenant_migration_recipient_service.cpp @@ -79,6 +79,7 @@ NamespaceString getOplogBufferNs(const UUID& migrationUUID) { MONGO_FAIL_POINT_DEFINE(pauseBeforeRunTenantMigrationRecipientInstance); MONGO_FAIL_POINT_DEFINE(pauseAfterRunTenantMigrationRecipientInstance); MONGO_FAIL_POINT_DEFINE(skipTenantMigrationRecipientAuth); +MONGO_FAIL_POINT_DEFINE(skipComparingRecipientAndDonorFCV); MONGO_FAIL_POINT_DEFINE(autoRecipientForgetMigration); MONGO_FAIL_POINT_DEFINE(pauseAfterCreatingOplogBuffer); @@ -87,6 +88,7 @@ MONGO_FAIL_POINT_DEFINE(failWhilePersistingTenantMigrationRecipientInstanceState MONGO_FAIL_POINT_DEFINE(fpAfterPersistingTenantMigrationRecipientInstanceStateDoc); MONGO_FAIL_POINT_DEFINE(fpAfterConnectingTenantMigrationRecipientInstance); MONGO_FAIL_POINT_DEFINE(fpAfterRecordingRecipientPrimaryStartingFCV); +MONGO_FAIL_POINT_DEFINE(fpAfterComparingRecipientAndDonorFCV); MONGO_FAIL_POINT_DEFINE(fpAfterRetrievingStartOpTimesMigrationRecipientInstance); MONGO_FAIL_POINT_DEFINE(fpAfterStartingOplogFetcherMigrationRecipientInstance); MONGO_FAIL_POINT_DEFINE(setTenantMigrationRecipientInstanceHostTimeout); @@ -1263,6 +1265,39 @@ void TenantMigrationRecipientService::Instance::_fetchAndStoreDonorClusterTimeKe _scopedExecutor, std::move(keyDocs), token); } +void TenantMigrationRecipientService::Instance::_compareRecipientAndDonorFCV() const { + if (skipComparingRecipientAndDonorFCV.shouldFail()) { // Test-only. + return; + } + + auto donorFCVbson = + _client->findOne(NamespaceString::kServerConfigurationNamespace.ns(), + QUERY("_id" << FeatureCompatibilityVersionParser::kParameterName), + nullptr, + QueryOption_SecondaryOk, + ReadConcernArgs(ReadConcernLevel::kMajorityReadConcern).toBSONInner()); + + uassert(5382302, "FCV on donor not set", !donorFCVbson.isEmpty()); + + auto swDonorFCV = FeatureCompatibilityVersionParser::parse(donorFCVbson); + uassertStatusOK(swDonorFCV.getStatus()); + + stdx::lock_guard lk(_mutex); + auto donorFCV = swDonorFCV.getValue(); + auto recipientFCV = _stateDoc.getRecipientPrimaryStartingFCV(); + + if (donorFCV != recipientFCV) { + LOGV2_ERROR(5382300, + "Donor and recipient FCV mismatch", + "tenantId"_attr = getTenantId(), + "migrationId"_attr = getMigrationUUID(), + "donorConnString"_attr = _donorConnectionString, + "donorFCV"_attr = donorFCV, + "recipientFCV"_attr = recipientFCV); + uasserted(5382301, "Mismatch between donor and recipient FCV"); + } +} + SemiFuture<void> TenantMigrationRecipientService::Instance::run( std::shared_ptr<executor::ScopedTaskExecutor> executor, const CancelationToken& token) noexcept { @@ -1352,6 +1387,10 @@ SemiFuture<void> TenantMigrationRecipientService::Instance::run( }) .then([this, self = shared_from_this()] { _stopOrHangOnFailPoint(&fpAfterRecordingRecipientPrimaryStartingFCV); + _compareRecipientAndDonorFCV(); + }) + .then([this, self = shared_from_this()] { + _stopOrHangOnFailPoint(&fpAfterComparingRecipientAndDonorFCV); stdx::lock_guard lk(_mutex); _getStartOpTimesFromDonor(lk); return _updateStateDocForMajority(lk); diff --git a/src/mongo/db/repl/tenant_migration_recipient_service.h b/src/mongo/db/repl/tenant_migration_recipient_service.h index 21e4325b020..2fec9e6bf55 100644 --- a/src/mongo/db/repl/tenant_migration_recipient_service.h +++ b/src/mongo/db/repl/tenant_migration_recipient_service.h @@ -419,7 +419,12 @@ public: /* * Returns the majority OpTime on the donor node that 'client' is connected to. */ + OpTime _getDonorMajorityOpTime(std::unique_ptr<mongo::DBClientConnection>& client); + /** + * Enforces that the donor and recipient share the same featureCompatibilityVersion. + */ + void _compareRecipientAndDonorFCV() const; mutable Mutex _mutex = MONGO_MAKE_LATCH("TenantMigrationRecipientService::_mutex"); diff --git a/src/mongo/db/repl/tenant_migration_recipient_service_test.cpp b/src/mongo/db/repl/tenant_migration_recipient_service_test.cpp index f25250f2045..cae192be65a 100644 --- a/src/mongo/db/repl/tenant_migration_recipient_service_test.cpp +++ b/src/mongo/db/repl/tenant_migration_recipient_service_test.cpp @@ -38,6 +38,7 @@ #include "mongo/client/replica_set_monitor_protocol_test_util.h" #include "mongo/config.h" #include "mongo/db/client.h" +#include "mongo/db/commands/feature_compatibility_version_document_gen.h" #include "mongo/db/dbdirectclient.h" #include "mongo/db/op_observer_impl.h" #include "mongo/db/op_observer_registry.h" @@ -225,6 +226,10 @@ public: // Set the sslMode to allowSSL to avoid validation error. sslGlobalParams.sslMode.store(SSLParams::SSLMode_allowSSL); + // Skipped unless tested explicitly, as we will not receive an FCV document from the donor + // in these unittests without (unsightly) intervention. + auto compFp = globalFailPointRegistry().find("skipComparingRecipientAndDonorFCV"); + compFp->setMode(FailPoint::alwaysOn); // Timestamps of "0 seconds" are not allowed, so we must advance our clock mock to the first // real second. @@ -408,6 +413,18 @@ protected: initialStateDoc.setStartFetchingDonorOpTime(startFetchingDonorOptime); } + /** + * Sets the FCV on the donor so that it can respond to FCV requests appropriately. + * (Generic FCV reference): This FCV reference should exist across LTS binary versions. + */ + void setDonorFCV(const TenantMigrationRecipientService::Instance* instance, + ServerGlobalParams::FeatureCompatibility::Version version = + ServerGlobalParams::FeatureCompatibility::kLatest) { + auto fcvDoc = FeatureCompatibilityVersionDocument(version); + auto client = getClient(instance); + client->insert(NamespaceString::kServerConfigurationNamespace.ns(), fcvDoc.toBSON()); + } + private: std::shared_ptr<ClockSourceMock> _clkSource = std::make_shared<ClockSourceMock>(); @@ -2797,6 +2814,51 @@ TEST_F(TenantMigrationRecipientServiceTest, ASSERT_OK(instance->getCompletionFuture().getNoThrow()); } +TEST_F(TenantMigrationRecipientServiceTest, + TenantMigrationRecipientServiceDonorAndRecipientFCVMismatch) { + stopFailPointEnableBlock fp("fpAfterComparingRecipientAndDonorFCV"); + + // Tests skip this check by default but we are specifically testing it here. + auto compFp = globalFailPointRegistry().find("skipComparingRecipientAndDonorFCV"); + compFp->setMode(FailPoint::off); + + // Set to allow the donor to respond to FCV requests. + auto connFp = + globalFailPointRegistry().find("fpAfterConnectingTenantMigrationRecipientInstance"); + auto initialTimesEntered = connFp->setMode(FailPoint::alwaysOn, + 0, + BSON("action" + << "hang")); + + const UUID migrationUUID = UUID::gen(); + MockReplicaSet replSet("donorSet", 3, true /* hasPrimary */, true /* dollarPrefixHosts */); + + TenantMigrationRecipientDocument initialStateDocument( + migrationUUID, + replSet.getConnectionString(), + "tenantA", + ReadPreferenceSetting(ReadPreference::PrimaryOnly)); + initialStateDocument.setRecipientCertificateForDonor(kRecipientPEMPayload); + + // Create and start the instance. + auto opCtx = makeOperationContext(); + auto instance = TenantMigrationRecipientService::Instance::getOrCreate( + opCtx.get(), _service, initialStateDocument.toBSON()); + ASSERT(instance.get()); + + // Set the donor FCV to be different from 'latest'. + // (Generic FCV reference): This FCV reference should exist across LTS binary versions. + connFp->waitForTimesEntered(initialTimesEntered + 1); + setDonorFCV(instance.get(), ServerGlobalParams::FeatureCompatibility::kLastContinuous); + connFp->setMode(FailPoint::off); + + // Wait for task completion failure. + // The FCVs should differ so we expect to exit with an error. + std::int32_t expectedCode = 5382301; + ASSERT_EQ(expectedCode, instance->getDataSyncCompletionFuture().getNoThrow().code()); + ASSERT_OK(instance->getCompletionFuture().getNoThrow()); +} + TEST_F(TenantMigrationRecipientServiceTest, WaitUntilTimestampIsMajorityCommitted) { const UUID migrationUUID = UUID::gen(); const OpTime topOfOplogOpTime(Timestamp(5, 1), 1); |