diff options
6 files changed, 146 insertions, 0 deletions
diff --git a/jstests/multiVersion/genericSetFCVUsage/tenant_migration_donor_abort_on_fcv_change.js b/jstests/multiVersion/genericSetFCVUsage/tenant_migration_donor_abort_on_fcv_change.js new file mode 100644 index 00000000000..0e958568cc9 --- /dev/null +++ b/jstests/multiVersion/genericSetFCVUsage/tenant_migration_donor_abort_on_fcv_change.js @@ -0,0 +1,70 @@ +/** + * Tests that the donor cancels all migrations when its FCV changes. + * @tags: [requires_majority_read_concern, incompatible_with_windows_tls] + */ + +(function() { +"use strict"; + +load("jstests/libs/fail_point_util.js"); +load("jstests/libs/uuid_util.js"); // for 'extractUUIDFromObject' +load("jstests/libs/parallelTester.js"); // for 'Thread' +load("jstests/replsets/rslib.js"); // for 'setLogVerbosity' +load("jstests/replsets/libs/tenant_migration_test.js"); +load("jstests/replsets/libs/tenant_migration_util.js"); + +const tenantMigrationTest = new TenantMigrationTest({name: jsTestName()}); +if (!tenantMigrationTest.isFeatureFlagEnabled()) { + jsTestLog("Skipping test because the tenant migrations feature flag is disabled"); + return; +} + +const tenantId = "testTenantId"; +const dbName = tenantMigrationTest.tenantDB(tenantId, "testDB"); +const collName = "testColl"; + +const donorRst = tenantMigrationTest.getDonorRst(); +const donorPrimary = tenantMigrationTest.getDonorPrimary(); +const donorDB = donorPrimary.getDB(dbName); + +setLogVerbosity([donorPrimary], {"tenantMigration": {"verbosity": 3}}); + +tenantMigrationTest.insertDonorDB(dbName, collName); + +const migrationId = UUID(); +const migrationIdString = extractUUIDFromObject(migrationId); +const migrationOpts = { + migrationIdString: migrationIdString, + recipientConnString: tenantMigrationTest.getRecipientConnString(), + tenantId: tenantId, +}; + +const hangWhileMigratingFP = + configureFailPoint(donorDB, "pauseTenantMigrationBeforeLeavingAbortingIndexBuildsState"); + +// Start a migration and wait for donor to hang at the failpoint. +assert.commandWorked(tenantMigrationTest.startMigration(migrationOpts)); + +hangWhileMigratingFP.wait(); + +// Initiate a downgrade and let it complete. +assert.commandWorked( + donorPrimary.adminCommand({setFeatureCompatibilityVersion: lastContinuousFCV})); + +// Upgrade again and finish the test. +assert.commandWorked(donorPrimary.adminCommand({setFeatureCompatibilityVersion: latestFCV})); + +hangWhileMigratingFP.off(); + +const stateRes = + assert.commandWorked(tenantMigrationTest.waitForMigrationToComplete(migrationOpts)); +assert.eq(stateRes.state, TenantMigrationTest.DonorState.kAborted); +assert.eq(stateRes.abortReason.code, ErrorCodes.TenantMigrationAborted); + +tenantMigrationTest.waitForDonorNodesToReachState( + donorRst.nodes, migrationId, tenantId, TenantMigrationTest.DonorState.kAborted); + +assert.commandWorked(tenantMigrationTest.forgetMigration(migrationOpts.migrationIdString)); + +tenantMigrationTest.stop(); +})();
\ No newline at end of file diff --git a/src/mongo/db/commands/set_feature_compatibility_version_command.cpp b/src/mongo/db/commands/set_feature_compatibility_version_command.cpp index b66bb57dbbd..c515f60f036 100644 --- a/src/mongo/db/commands/set_feature_compatibility_version_command.cpp +++ b/src/mongo/db/commands/set_feature_compatibility_version_command.cpp @@ -49,10 +49,12 @@ #include "mongo/db/namespace_string.h" #include "mongo/db/ops/write_ops.h" #include "mongo/db/read_write_concern_defaults.h" +#include "mongo/db/repl/primary_only_service.h" #include "mongo/db/repl/repl_client_info.h" #include "mongo/db/repl/repl_server_parameters_gen.h" #include "mongo/db/repl/repl_set_config.h" #include "mongo/db/repl/replication_coordinator.h" +#include "mongo/db/repl/tenant_migration_donor_service.h" #include "mongo/db/s/config/sharding_catalog_manager.h" #include "mongo/db/s/sharding_ddl_coordinator_service.h" #include "mongo/db/server_options.h" @@ -411,6 +413,8 @@ private: opCtx, CommandHelpers::appendMajorityWriteConcern(requestPhase1.toBSON({})))); } + _cancelTenantMigrations(opCtx); + auto replCoord = repl::ReplicationCoordinator::get(opCtx); const bool isReplSet = replCoord->getReplicationMode() == repl::ReplicationCoordinator::modeReplSet; @@ -535,6 +539,8 @@ private: opCtx, CommandHelpers::appendMajorityWriteConcern(requestPhase1.toBSON({})))); } + _cancelTenantMigrations(opCtx); + auto replCoord = repl::ReplicationCoordinator::get(opCtx); const bool isReplSet = replCoord->getReplicationMode() == repl::ReplicationCoordinator::modeReplSet; @@ -642,6 +648,20 @@ private: "last_continuous_version"_attr = FCVP::kLastContinuous); } + /** + * Kills all tenant migrations active on this node, for both donors and recipients. + * Called after reaching an upgrading or downgrading state. + */ + void _cancelTenantMigrations(OperationContext* opCtx) { + invariant(serverGlobalParams.featureCompatibility.isUpgradingOrDowngrading()); + if (serverGlobalParams.clusterRole == ClusterRole::None) { + auto donorService = checked_cast<TenantMigrationDonorService*>( + repl::PrimaryOnlyServiceRegistry::get(opCtx->getServiceContext()) + ->lookupServiceByName(TenantMigrationDonorService::kServiceName)); + donorService->abortAllMigrations(opCtx); + } + } + } setFeatureCompatibilityVersionCommand; } // namespace diff --git a/src/mongo/db/repl/primary_only_service.cpp b/src/mongo/db/repl/primary_only_service.cpp index c743fc1f489..1a9e0f99705 100644 --- a/src/mongo/db/repl/primary_only_service.cpp +++ b/src/mongo/db/repl/primary_only_service.cpp @@ -535,6 +535,36 @@ boost::optional<std::shared_ptr<PrimaryOnlyService::Instance>> PrimaryOnlyServic return it->second.getInstance(); } +std::vector<std::shared_ptr<PrimaryOnlyService::Instance>> PrimaryOnlyService::getAllInstances( + OperationContext* opCtx) { + // If this operation is holding any database locks, then it must have opted into getting + // interrupted at stepdown to prevent deadlocks. + invariant(!opCtx->lockState()->isLocked() || opCtx->shouldAlwaysInterruptAtStepDownOrUp() || + opCtx->lockState()->wasGlobalLockTakenInModeConflictingWithWrites()); + + std::vector<std::shared_ptr<PrimaryOnlyService::Instance>> instances; + + stdx::unique_lock lk(_mutex); + opCtx->waitForConditionOrInterrupt( + _rebuildCV, lk, [this]() { return _state != State::kRebuilding; }); + + if (_state == State::kShutdown || _state == State::kPaused) { + invariant(_activeInstances.empty()); + return instances; + } + if (_state == State::kRebuildFailed) { + uassertStatusOK(_rebuildStatus); + return instances; + } + invariant(_state == State::kRunning); + + for (auto& [instanceId, instance] : _activeInstances) { + instances.emplace_back(instance.getInstance()); + } + + return instances; +} + void PrimaryOnlyService::releaseInstance(const InstanceID& id, Status status) { auto savedInstanceNodeHandle = [&]() { stdx::lock_guard lk(_mutex); diff --git a/src/mongo/db/repl/primary_only_service.h b/src/mongo/db/repl/primary_only_service.h index 946245f62b8..9daa31881c9 100644 --- a/src/mongo/db/repl/primary_only_service.h +++ b/src/mongo/db/repl/primary_only_service.h @@ -331,6 +331,11 @@ protected: */ std::shared_ptr<executor::TaskExecutor> getInstanceCleanupExecutor() const; + /** + * Returns shared pointers to all Instance objects that belong to this service. + */ + std::vector<std::shared_ptr<Instance>> getAllInstances(OperationContext* opCtx); + private: /** * Represents a PrimaryOnlyService::Instance that has already been scheduled to be run. diff --git a/src/mongo/db/repl/tenant_migration_donor_service.cpp b/src/mongo/db/repl/tenant_migration_donor_service.cpp index 77e6f66e34c..35023a2f7b7 100644 --- a/src/mongo/db/repl/tenant_migration_donor_service.cpp +++ b/src/mongo/db/repl/tenant_migration_donor_service.cpp @@ -141,6 +141,15 @@ void setPromiseOkIfNotReady(WithLock lk, Promise& promise) { } // namespace +void TenantMigrationDonorService::abortAllMigrations(OperationContext* opCtx) { + LOGV2(5356301, "Aborting all tenant migrations on donor"); + auto instances = getAllInstances(opCtx); + for (auto& instance : instances) { + auto typedInstance = checked_pointer_cast<TenantMigrationDonorService::Instance>(instance); + typedInstance->onReceiveDonorAbortMigration(); + } +} + // Note this index is required on both the donor and recipient in a tenant migration, since each // will copy cluster time keys from the other. The donor service is set up on all mongods on stepup // to primary, so this index will be created on both donors and recipients. @@ -740,6 +749,13 @@ SemiFuture<void> TenantMigrationDonorService::Instance::run( } } + // We must abort the migration if we try to start or resume while upgrading or downgrading. + // (Generic FCV reference): This FCV check should exist across LTS binary versions. + if (serverGlobalParams.featureCompatibility.isUpgradingOrDowngrading()) { + LOGV2(5356302, "Must abort tenant migration as donor is upgrading or downgrading"); + onReceiveDonorAbortMigration(); + } + auto abortToken = _initAbortMigrationSource(token); auto recipientTargeterRS = std::make_shared<RemoteCommandTargeterRS>( diff --git a/src/mongo/db/repl/tenant_migration_donor_service.h b/src/mongo/db/repl/tenant_migration_donor_service.h index 842d2e8c0d5..799f02af7f8 100644 --- a/src/mongo/db/repl/tenant_migration_donor_service.h +++ b/src/mongo/db/repl/tenant_migration_donor_service.h @@ -68,6 +68,11 @@ public: _serviceContext, this, initialState); } + /** + * Sends an abort to all tenant migration instances on this donor. + */ + void abortAllMigrations(OperationContext* opCtx); + class Instance final : public PrimaryOnlyService::TypedInstance<Instance> { public: struct DurableState { |