diff options
9 files changed, 176 insertions, 18 deletions
diff --git a/jstests/serverless/libs/basic_serverless_test.js b/jstests/serverless/libs/basic_serverless_test.js index 0f374cb192c..3c6ae92556b 100644 --- a/jstests/serverless/libs/basic_serverless_test.js +++ b/jstests/serverless/libs/basic_serverless_test.js @@ -544,6 +544,12 @@ class BasicServerlessTest { } BasicServerlessTest.kConfigSplitDonorsNS = "config.tenantSplitDonors"; +BasicServerlessTest.DonorState = { + kUninitialized: "uninitialized", + kBlocking: "blocking", + kCommitted: "committed", + kAborted: "aborted" +}; function findSplitOperation(primary, migrationId) { const donorsCollection = primary.getCollection(BasicServerlessTest.kConfigSplitDonorsNS); diff --git a/jstests/serverless/shard_split_abort_during_upgrade_downgrade.js b/jstests/serverless/shard_split_abort_during_upgrade_downgrade.js new file mode 100644 index 00000000000..32ddb38b339 --- /dev/null +++ b/jstests/serverless/shard_split_abort_during_upgrade_downgrade.js @@ -0,0 +1,65 @@ +/* + * Prove that shard splits are aborted during FCV upgrade/downgrade. + * + * @tags: [requires_fcv_52, featureFlagShardSplit, serverless] + */ + +(function() { +"use strict"; +load("jstests/libs/fail_point_util.js"); +load("jstests/serverless/libs/basic_serverless_test.js"); + +// Shard split commands are gated by a feature flag, which will not be supported when we +// downgrade versions. Eventually, we will run this test when we have two consecutive versions +// that support `commitShardSplit` without a feature flag. This check will be removed as part +// of SERVER-66965. +if (MongoRunner.compareBinVersions(latestFCV, "6.2") < 0) { + return; +} + +// Skip db hash check because secondary is left with a different config. +TestData.skipCheckDBHashes = true; +const test = new BasicServerlessTest({ + recipientTagName: "recipientNode", + recipientSetName: "recipient", + quickGarbageCollection: true +}); + +test.addRecipientNodes(); + +const donorPrimary = testFixture.donor.getPrimary(); +const tenantIds = ["tenant1", "tenant2"]; + +jsTestLog("Assert shard splits are aborted when downgrading."); +const downgradeFCV = lastContinuousFCV; +const hangWhileDowngradingFp = configureFailPoint(donorPrimary, "hangWhileDowngrading"); +const downgradeThread = new Thread((host, downgradeFCV) => { + const db = new Mongo(host); + assert.commandWorked(db.adminCommand({setFeatureCompatibilityVersion: downgradeFCV})); +}, donorPrimary.host, downgradeFCV); + +downgradeThread.start(); +hangWhileDowngradingFp.wait(); +const firstSplit = test.createSplitOperation(tenantIds); +assert.commandFailedWithCode(firstSplit.commit(), ErrorCodes.TenantMigrationAborted); +hangWhileDowngradingFp.off(); +downgradeThread.join(); +firstSplit.forget(); + +jsTestLog("Assert shard splits are aborted when upgrading."); +const hangWhileUpgradingFp = configureFailPoint(donorPrimary, "hangWhileUpgrading"); +const upgradeThread = new Thread((host) => { + const db = new Mongo(host); + assert.commandWorked(db.adminCommand({setFeatureCompatibilityVersion: latestFCV})); +}, donorPrimary.host); + +upgradeThread.start(); +hangWhileUpgradingFp.wait(); +const secondSplit = test.createSplitOperation(tenantIds); +assert.commandFailedWithCode(secondSplit.commit(), ErrorCodes.TenantMigrationAborted); +hangWhileUpgradingFp.off(); +upgradeThread.join(); +secondSplit.forget(); + +test.stop(); +})(); diff --git a/jstests/serverless/shard_split_abort_on_setfcv.js b/jstests/serverless/shard_split_abort_on_setfcv.js new file mode 100644 index 00000000000..5de893203ae --- /dev/null +++ b/jstests/serverless/shard_split_abort_on_setfcv.js @@ -0,0 +1,57 @@ +/* + * Prove that shard splits are eagerly aborted when the `setFeatureCompatibilityVersion` command is + * received for both upgrade and downgrade paths. + * + * @tags: [requires_fcv_52, featureFlagShardSplit, serverless] + */ + +(function() { +"use strict"; +load("jstests/libs/fail_point_util.js"); +load("jstests/serverless/libs/basic_serverless_test.js"); + +// Skip db hash check because secondary is left with a different config. +TestData.skipCheckDBHashes = true; +const test = new BasicServerlessTest({ + recipientTagName: "recipientNode", + recipientSetName: "recipient", + quickGarbageCollection: true +}); + +test.addRecipientNodes(); + +const donorPrimary = test.donor.getPrimary(); +const tenantIds = ["tenant1", "tenant2"]; +const pauseAfterBlockingFp = configureFailPoint(donorPrimary, "pauseShardSplitAfterBlocking"); + +jsTestLog("Test FCV Downgrade"); +const split = test.createSplitOperation(tenantIds); +const commitThread = split.commitAsync(); +pauseAfterBlockingFp.wait(); +assert.commandWorked( + donorPrimary.adminCommand({setFeatureCompatibilityVersion: lastContinuousFCV})); +pauseAfterBlockingFp.off(); +assert.commandFailedWithCode(commitThread.returnData(), ErrorCodes.TenantMigrationAborted); + +jsTestLog("Test FCV Upgrade"); +if (lastContinuousFCV == "6.0") { + const secondSplit = test.createSplitOperation(tenantIds); + assert.commandFailedWithCode(secondSplit.commit(), ErrorCodes.IllegalOperation); +} else { + // `forgetShardSplit` will not be available until the downgraded version also supports the + // 'shard split' feature. + split.forget(); + test.cleanupSuccesfulAborted(split.migrationId, tenantIds); + + const secondSplit = test.createSplitOperation(tenantIds); + const commitThread = secondSplit.commitAsync(); + pauseAfterBlockingFp.wait(); + assert.commandWorked(donorPrimary.adminCommand({setFeatureCompatibilityVersion: latestFCV})); + pauseAfterBlockingFp.off(); + assert.commandFailedWithCode(commitThread.returnData(), ErrorCodes.TenantMigrationAborted); + secondSplit.forget(); + test.cleanupSuccesfulAborted(secondSplit.migrationId, tenantIds); +} + +test.stop(); +})(); diff --git a/jstests/serverless/shard_split_enabled.js b/jstests/serverless/shard_split_enabled.js index 6cc27de9db7..a58aec52d1d 100644 --- a/jstests/serverless/shard_split_enabled.js +++ b/jstests/serverless/shard_split_enabled.js @@ -50,13 +50,13 @@ function makeShardSplitTest() { let commitUUID = UUID(); let res = adminDB.runCommand(test.makeCommitShardSplitCmd(commitUUID)); assert.neq(res.code, - 6057900, + ErrorCodes.IllegalOperation, `commitShardSplitCmd shouldn't reject when featureFlagShardSplit is enabled`); test.removeRecipientNodesFromDonor(); res = adminDB.runCommand(test.makeForgetShardSplitCmd(commitUUID)); assert.neq(res.code, - 6057900, + ErrorCodes.IllegalOperation, `forgetShardSplit shouldn't reject when featureFlagShardSplit is enabled`); test.waitForGarbageCollection(commitUUID, tenantIds); @@ -64,22 +64,22 @@ function makeShardSplitTest() { let abortUUID = UUID(); res = adminDB.runCommand(test.makeAbortShardSplitCmd(abortUUID)); assert.neq(res.code, - 6057902, + ErrorCodes.IllegalOperation, `abortShardSplitCmd shouldn't reject when featureFlagShardSplit is enabled`); assert.commandWorked(adminDB.adminCommand({setFeatureCompatibilityVersion: downgradeFCV})); assert.commandFailedWithCode( adminDB.runCommand(test.makeCommitShardSplitCmd(UUID())), - 6057900, + ErrorCodes.IllegalOperation, `commitShardSplitCmd should reject when featureFlagShardSplit is disabled`); assert.commandFailedWithCode( adminDB.runCommand(test.makeAbortShardSplitCmd(UUID())), - 6057902, + ErrorCodes.IllegalOperation, `abortShardSplitCmd should reject when featureFlagShardSplit is disabled`); assert.commandFailedWithCode( adminDB.runCommand(test.makeForgetShardSplitCmd(UUID())), - 6236600, + ErrorCodes.IllegalOperation, `forgetShardSplit should reject when featureFlagShardSplit is disabled`); // shut down replica set diff --git a/src/mongo/db/commands/SConscript b/src/mongo/db/commands/SConscript index 073190f124e..cf6f79671bc 100644 --- a/src/mongo/db/commands/SConscript +++ b/src/mongo/db/commands/SConscript @@ -582,6 +582,7 @@ env.Library( '$BUILD_DIR/mongo/db/s/user_writes_recoverable_critical_section', '$BUILD_DIR/mongo/db/server_feature_flags', '$BUILD_DIR/mongo/db/server_options_core', + '$BUILD_DIR/mongo/db/serverless/shard_split_donor_service', '$BUILD_DIR/mongo/db/tenant_id', '$BUILD_DIR/mongo/db/timeseries/timeseries_conversion_util', '$BUILD_DIR/mongo/db/transaction_api', diff --git a/src/mongo/db/commands/set_feature_compatibility_version_command.cpp b/src/mongo/db/commands/set_feature_compatibility_version_command.cpp index 1a045c0e806..ad2084e2ac7 100644 --- a/src/mongo/db/commands/set_feature_compatibility_version_command.cpp +++ b/src/mongo/db/commands/set_feature_compatibility_version_command.cpp @@ -51,6 +51,7 @@ #include "mongo/db/concurrency/d_concurrency.h" #include "mongo/db/db_raii.h" #include "mongo/db/dbdirectclient.h" +#include "mongo/db/global_settings.h" #include "mongo/db/index_builds_coordinator.h" #include "mongo/db/namespace_string.h" #include "mongo/db/ops/write_ops.h" @@ -77,6 +78,7 @@ #include "mongo/db/s/transaction_coordinator_service.h" #include "mongo/db/server_feature_flags_gen.h" #include "mongo/db/server_options.h" +#include "mongo/db/serverless/shard_split_donor_service.h" #include "mongo/db/session_catalog.h" #include "mongo/db/session_txn_record_gen.h" #include "mongo/db/timeseries/timeseries_index_schema_conversion_functions.h" @@ -532,7 +534,7 @@ private: opCtx, CommandHelpers::appendMajorityWriteConcern(requestPhase1.toBSON({})))); } - _cancelTenantMigrations(opCtx); + _cancelServerlessMigrations(opCtx); { // Take the FCV full transition lock in S mode to create a barrier for operations taking @@ -608,7 +610,7 @@ private: Balancer::get(opCtx)->applyLegacyChunkSizeConstraintsOnClusterData(opCtx); } - _cancelTenantMigrations(opCtx); + _cancelServerlessMigrations(opCtx); { // Take the FCV full transition lock in S mode to create a barrier for operations taking @@ -797,21 +799,29 @@ private: } /** - * Kills all tenant migrations active on this node, for both donors and recipients. + * Abort all serverless migrations active on this node, for both donors and recipients. * Called after reaching an upgrading or downgrading state. */ - void _cancelTenantMigrations(OperationContext* opCtx) { + void _cancelServerlessMigrations(OperationContext* opCtx) { invariant(serverGlobalParams.featureCompatibility.isUpgradingOrDowngrading()); if (serverGlobalParams.clusterRole == ClusterRole::None) { auto donorService = checked_cast<TenantMigrationDonorService*>( repl::PrimaryOnlyServiceRegistry::get(opCtx->getServiceContext()) ->lookupServiceByName(TenantMigrationDonorService::kServiceName)); donorService->abortAllMigrations(opCtx); + auto recipientService = checked_cast<repl::TenantMigrationRecipientService*>( repl::PrimaryOnlyServiceRegistry::get(opCtx->getServiceContext()) ->lookupServiceByName(repl::TenantMigrationRecipientService:: kTenantMigrationRecipientServiceName)); recipientService->abortAllMigrations(opCtx); + + if (getGlobalReplSettings().isServerless()) { + auto splitDonorService = checked_cast<ShardSplitDonorService*>( + repl::PrimaryOnlyServiceRegistry::get(opCtx->getServiceContext()) + ->lookupServiceByName(ShardSplitDonorService::kServiceName)); + splitDonorService->abortAllSplits(opCtx); + } } } diff --git a/src/mongo/db/serverless/shard_split_commands.cpp b/src/mongo/db/serverless/shard_split_commands.cpp index 71a2d64203e..9da6c9ce8b8 100644 --- a/src/mongo/db/serverless/shard_split_commands.cpp +++ b/src/mongo/db/serverless/shard_split_commands.cpp @@ -50,8 +50,8 @@ public: using InvocationBase::InvocationBase; Response typedRun(OperationContext* opCtx) { - uassert(6057900, - "Feature \"shard split\" not supported", + uassert(ErrorCodes::IllegalOperation, + "Feature 'shard split' not supported", repl::feature_flags::gShardSplit.isEnabled( serverGlobalParams.featureCompatibility)); uassert(ErrorCodes::IllegalOperation, @@ -138,8 +138,8 @@ public: using InvocationBase::InvocationBase; void typedRun(OperationContext* opCtx) { - uassert(6057902, - "Feature \"shard split\" not supported", + uassert(ErrorCodes::IllegalOperation, + "Feature 'shard split' not supported", repl::feature_flags::gShardSplit.isEnabled( serverGlobalParams.featureCompatibility)); uassert(ErrorCodes::CommandNotSupported, @@ -215,8 +215,8 @@ public: using InvocationBase::InvocationBase; void typedRun(OperationContext* opCtx) { - uassert(6236600, - "feature \"shard split\" not supported", + uassert(ErrorCodes::IllegalOperation, + "Feature 'shard split' not supported", repl::feature_flags::gShardSplit.isEnabled( serverGlobalParams.featureCompatibility)); uassert(ErrorCodes::CommandNotSupported, diff --git a/src/mongo/db/serverless/shard_split_donor_service.cpp b/src/mongo/db/serverless/shard_split_donor_service.cpp index f6f28ed8aaf..f37a9416f5e 100644 --- a/src/mongo/db/serverless/shard_split_donor_service.cpp +++ b/src/mongo/db/serverless/shard_split_donor_service.cpp @@ -226,8 +226,8 @@ void ShardSplitDonorService::checkIfConflictsWithOtherInstances( isGarbageCollectable; uassert(ErrorCodes::ConflictingOperationInProgress, - str::stream() << "Can't start a concurent shard split operation against" - << " migrationId:" << existingTypedInstance->getId(), + str::stream() << "Can't start a concurent shard split operation, currently running" + << " migrationId: " << existingTypedInstance->getId(), existingIsAborted); } } @@ -240,6 +240,16 @@ std::shared_ptr<repl::PrimaryOnlyService::Instance> ShardSplitDonorService::cons ShardSplitDonorDocument::parse(IDLParserErrorContext("donorStateDoc"), initialState)); } +void ShardSplitDonorService::abortAllSplits(OperationContext* opCtx) { + LOGV2(8423361, "Aborting all active shard split operations."); + auto instances = getAllInstances(opCtx); + for (auto& instance : instances) { + auto typedInstance = + checked_pointer_cast<ShardSplitDonorService::DonorStateMachine>(instance); + typedInstance->tryAbort(); + } +} + ExecutorFuture<void> ShardSplitDonorService::_createStateDocumentTTLIndex( std::shared_ptr<executor::ScopedTaskExecutor> executor, const CancellationToken& token) { return AsyncTry([this] { @@ -337,6 +347,13 @@ SemiFuture<void> ShardSplitDonorService::DonorStateMachine::run( _abortSource->cancel(); } + // We must abort the migration if we try to start or resume while upgrading or downgrading. + // (Generic FCV reference): This FCV check should exist across LTS binary versions. + if (serverGlobalParams.featureCompatibility.isUpgradingOrDowngrading()) { + LOGV2(8423360, "Aborting shard split since donor is upgrading or downgrading."); + _abortSource->cancel(); + } + return _abortSource->token(); }(); diff --git a/src/mongo/db/serverless/shard_split_donor_service.h b/src/mongo/db/serverless/shard_split_donor_service.h index 36ba21c2a77..9c6c3645de2 100644 --- a/src/mongo/db/serverless/shard_split_donor_service.h +++ b/src/mongo/db/serverless/shard_split_donor_service.h @@ -61,6 +61,8 @@ public: ThreadPool::Limits getThreadPoolLimits() const override; + void abortAllSplits(OperationContext* opCtx); + protected: // Instance conflict check not yet implemented. void checkIfConflictsWithOtherInstances( |