diff options
author | Sergi Mateo Bellido <sergi.mateo-bellido@mongodb.com> | 2021-11-01 11:49:32 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2021-11-01 12:13:26 +0000 |
commit | 30a0b6165d7060831dc9741dbb5047c607dd622a (patch) | |
tree | 580e1f207615e9621b0bba60f828ca4ce3ebf809 | |
parent | 02751b472c71026b601c1c7a5aeceacc135f0667 (diff) | |
download | mongo-30a0b6165d7060831dc9741dbb5047c607dd622a.tar.gz |
SERVER-60834 new migration protocol setFCV requirements
15 files changed, 108 insertions, 38 deletions
diff --git a/jstests/multiVersion/genericSetFCVUsage/migration_between_mixed_FCV_mixed_version_mongods.js b/jstests/multiVersion/genericSetFCVUsage/migration_between_mixed_FCV_mixed_version_mongods.js index fb9080e1f7e..e73efb4b208 100644 --- a/jstests/multiVersion/genericSetFCVUsage/migration_between_mixed_FCV_mixed_version_mongods.js +++ b/jstests/multiVersion/genericSetFCVUsage/migration_between_mixed_FCV_mixed_version_mongods.js @@ -35,6 +35,23 @@ function runTest(downgradeVersion) { } }); + const featureFlagMigrationRecipientCriticalSection = + assert.commandWorked(st.configRS.getPrimary().adminCommand( + {getParameter: 1, featureFlagMigrationRecipientCriticalSection: 1})); + const featureFlagMigrationRecipientCriticalSectionEnabled = + featureFlagMigrationRecipientCriticalSection.featureFlagMigrationRecipientCriticalSection + .value; + + // SERVER-61072: Reenable this test once 6.0 becomes last LTS. + // We have to skip this test because the initial state with mixed binaries and mixed FCVs + // doesn't honor the upgrade/downgrade procedure and breaks with the setFCV requirements of the + // new migration protocol. + if (featureFlagMigrationRecipientCriticalSectionEnabled) { + jsTest.log('Skipping test because featureFlagMigrationRecipientCriticalSection is enabled'); + st.stop(); + return; + } + const downgradeFCV = binVersionToFCV(downgradeVersion); checkFCV(st.configRS.getPrimary().getDB("admin"), downgradeFCV); checkFCV(st.shard0.getDB("admin"), downgradeFCV); diff --git a/jstests/multiVersion/migrations_with_mixed_fcv.js b/jstests/multiVersion/migrations_with_mixed_fcv.js index e45330b9a3f..4ad33779916 100644 --- a/jstests/multiVersion/migrations_with_mixed_fcv.js +++ b/jstests/multiVersion/migrations_with_mixed_fcv.js @@ -199,6 +199,22 @@ function testSetFCVDoesNotBlockWhileMigratingChunk() { jsTestLog("Testing that setFCV does not block while migrating a chunk"); let st = setup(); + const featureFlagMigrationRecipientCriticalSection = + assert.commandWorked(st.configRS.getPrimary().adminCommand( + {getParameter: 1, featureFlagMigrationRecipientCriticalSection: 1})); + const featureFlagMigrationRecipientCriticalSectionEnabled = + featureFlagMigrationRecipientCriticalSection.featureFlagMigrationRecipientCriticalSection + .value; + + // SERVER-61072: Reenable this test once 6.0 becomes last LTS. + // We have to skip this test because the current implementation of the setFCV might wait for the + // completion of moveChunk operations before starting to use a different migration protocol. + if (featureFlagMigrationRecipientCriticalSectionEnabled) { + jsTest.log('Skipping test because featureFlagMigrationRecipientCriticalSection is enabled'); + st.stop(); + return; + } + // Set config and shards to last-lts FCV assert.commandWorked( st.s.getDB("admin").runCommand({setFeatureCompatibilityVersion: lastLTSFCV})); diff --git a/src/mongo/db/commands/set_feature_compatibility_version_command.cpp b/src/mongo/db/commands/set_feature_compatibility_version_command.cpp index b4f13ea5352..f0b7ae9505b 100644 --- a/src/mongo/db/commands/set_feature_compatibility_version_command.cpp +++ b/src/mongo/db/commands/set_feature_compatibility_version_command.cpp @@ -62,6 +62,7 @@ #include "mongo/db/repl/replication_coordinator.h" #include "mongo/db/repl/tenant_migration_donor_service.h" #include "mongo/db/repl/tenant_migration_recipient_service.h" +#include "mongo/db/s/active_migrations_registry.h" #include "mongo/db/s/config/sharding_catalog_manager.h" #include "mongo/db/s/resharding/coordinator_document_gen.h" #include "mongo/db/s/resharding/resharding_coordinator_service.h" @@ -72,6 +73,7 @@ #include "mongo/db/views/view_catalog.h" #include "mongo/logv2/log.h" #include "mongo/rpc/get_status_from_command_result.h" +#include "mongo/s/pm2423_feature_flags_gen.h" #include "mongo/stdx/unordered_set.h" #include "mongo/util/exit.h" #include "mongo/util/fail_point.h" @@ -321,6 +323,15 @@ public: if (!request.getPhase() || request.getPhase() == SetFCVPhaseEnum::kStart) { { + boost::optional<MigrationBlockingGuard> drainNewMoveChunks; + // Downgrades from a version >= 5.2 to 5.1 or lower must drain new protocol + // moveChunks + if (feature_flags::gFeatureFlagMigrationRecipientCriticalSection + .isEnabledAndIgnoreFCV() && + actualVersion > multiversion::FeatureCompatibilityVersion::kVersion_5_1 && + requestedVersion <= multiversion::FeatureCompatibilityVersion::kVersion_5_1) + drainNewMoveChunks.emplace(opCtx, "setFeatureCompatibilityVersionUpgrade"); + // Start transition to 'requestedVersion' by updating the local FCV document to a // 'kUpgrading' or 'kDowngrading' state, respectively. const auto fcvChangeRegion( @@ -352,6 +363,14 @@ public: } { + boost::optional<MigrationBlockingGuard> drainOldMoveChunks; + // Upgrades from a version <= 5.1 to 5.2 or greater must drain old protocol moveChunks + if (feature_flags::gFeatureFlagMigrationRecipientCriticalSection + .isEnabledAndIgnoreFCV() && + actualVersion <= multiversion::FeatureCompatibilityVersion::kVersion_5_1 && + requestedVersion > multiversion::FeatureCompatibilityVersion::kVersion_5_1) + drainOldMoveChunks.emplace(opCtx, "setFeatureCompatibilityVersionUpgrade"); + // Complete transition by updating the local FCV document to the fully upgraded or // downgraded requestedVersion. const auto fcvChangeRegion(FeatureCompatibilityVersion::enterFCVChangeRegion(opCtx)); diff --git a/src/mongo/db/s/migration_chunk_cloner_source.h b/src/mongo/db/s/migration_chunk_cloner_source.h index 00e17e8fe3a..a911c1c5efe 100644 --- a/src/mongo/db/s/migration_chunk_cloner_source.h +++ b/src/mongo/db/s/migration_chunk_cloner_source.h @@ -105,7 +105,7 @@ public: * * NOTE: Must be called without any locks. */ - virtual StatusWith<BSONObj> commitClone(OperationContext* opCtx) = 0; + virtual StatusWith<BSONObj> commitClone(OperationContext* opCtx, bool acquireCSOnRecipient) = 0; /** * Tells the recipient to abort the clone and cleanup any unused data. This method's diff --git a/src/mongo/db/s/migration_chunk_cloner_source_legacy.cpp b/src/mongo/db/s/migration_chunk_cloner_source_legacy.cpp index 6df9f3e4b87..dc9791d7b3c 100644 --- a/src/mongo/db/s/migration_chunk_cloner_source_legacy.cpp +++ b/src/mongo/db/s/migration_chunk_cloner_source_legacy.cpp @@ -339,7 +339,8 @@ Status MigrationChunkClonerSourceLegacy::awaitUntilCriticalSectionIsAppropriate( return _checkRecipientCloningStatus(opCtx, maxTimeToWait); } -StatusWith<BSONObj> MigrationChunkClonerSourceLegacy::commitClone(OperationContext* opCtx) { +StatusWith<BSONObj> MigrationChunkClonerSourceLegacy::commitClone(OperationContext* opCtx, + bool acquireCSOnRecipient) { invariant(_state == kCloning); invariant(!opCtx->lockState()->isLocked()); if (_jumboChunkCloneState && _forceJumbo) { @@ -358,8 +359,14 @@ StatusWith<BSONObj> MigrationChunkClonerSourceLegacy::commitClone(OperationConte _sessionCatalogSource->onCommitCloneStarted(); } - auto responseStatus = _callRecipient( - opCtx, createRequestWithSessionId(kRecvChunkCommit, _args.getNss(), _sessionId)); + + auto responseStatus = _callRecipient(opCtx, [&] { + BSONObjBuilder builder; + builder.append(kRecvChunkCommit, _args.getNss().ns()); + builder.append("acquireCSOnRecipient", acquireCSOnRecipient); + _sessionId.append(&builder); + return builder.obj(); + }()); if (responseStatus.isOK()) { _cleanup(opCtx); diff --git a/src/mongo/db/s/migration_chunk_cloner_source_legacy.h b/src/mongo/db/s/migration_chunk_cloner_source_legacy.h index 0beed19779a..b29ba8b302e 100644 --- a/src/mongo/db/s/migration_chunk_cloner_source_legacy.h +++ b/src/mongo/db/s/migration_chunk_cloner_source_legacy.h @@ -102,7 +102,7 @@ public: Status awaitUntilCriticalSectionIsAppropriate(OperationContext* opCtx, Milliseconds maxTimeToWait) override; - StatusWith<BSONObj> commitClone(OperationContext* opCtx) override; + StatusWith<BSONObj> commitClone(OperationContext* opCtx, bool acquireCSOnRecipient) override; void cancelClone(OperationContext* opCtx) override; diff --git a/src/mongo/db/s/migration_chunk_cloner_source_legacy_test.cpp b/src/mongo/db/s/migration_chunk_cloner_source_legacy_test.cpp index 4f18136a7f6..3f01fe4d3a7 100644 --- a/src/mongo/db/s/migration_chunk_cloner_source_legacy_test.cpp +++ b/src/mongo/db/s/migration_chunk_cloner_source_legacy_test.cpp @@ -382,7 +382,7 @@ TEST_F(MigrationChunkClonerSourceLegacyTest, CorrectDocumentsFetched) { onCommand([&](const RemoteCommandRequest& request) { return BSON("ok" << true); }); }); - ASSERT_OK(cloner.commitClone(operationContext())); + ASSERT_OK(cloner.commitClone(operationContext(), true /* acquireCSOnRecipient */)); futureCommit.default_timed_get(); } @@ -475,7 +475,7 @@ TEST_F(MigrationChunkClonerSourceLegacyTest, RemoveDuplicateDocuments) { onCommand([&](const RemoteCommandRequest& request) { return BSON("ok" << true); }); }); - ASSERT_OK(cloner.commitClone(operationContext())); + ASSERT_OK(cloner.commitClone(operationContext(), true /* acquireCSOnRecipient */)); futureCommit.default_timed_get(); } @@ -536,7 +536,7 @@ TEST_F(MigrationChunkClonerSourceLegacyTest, OneLargeDocumentTransferMods) { onCommand([&](const RemoteCommandRequest& request) { return BSON("ok" << true); }); }); - ASSERT_OK(cloner.commitClone(operationContext())); + ASSERT_OK(cloner.commitClone(operationContext(), true /* acquireCSOnRecipient */)); futureCommit.default_timed_get(); } @@ -611,7 +611,7 @@ TEST_F(MigrationChunkClonerSourceLegacyTest, ManySmallDocumentsTransferMods) { onCommand([&](const RemoteCommandRequest& request) { return BSON("ok" << true); }); }); - ASSERT_OK(cloner.commitClone(operationContext())); + ASSERT_OK(cloner.commitClone(operationContext(), true /* acquireCSOnRecipient */)); futureCommit.default_timed_get(); } diff --git a/src/mongo/db/s/migration_coordinator.cpp b/src/mongo/db/s/migration_coordinator.cpp index d62e4b48c3f..d43bd82cc2f 100644 --- a/src/mongo/db/s/migration_coordinator.cpp +++ b/src/mongo/db/s/migration_coordinator.cpp @@ -140,7 +140,8 @@ void MigrationCoordinator::setMigrationDecision(DecisionEnum decision) { } -boost::optional<SemiFuture<void>> MigrationCoordinator::completeMigration(OperationContext* opCtx) { +boost::optional<SemiFuture<void>> MigrationCoordinator::completeMigration( + OperationContext* opCtx, bool acquireCSOnRecipient) { auto decision = _migrationInfo.getDecision(); if (!decision) { LOGV2( @@ -159,8 +160,7 @@ boost::optional<SemiFuture<void>> MigrationCoordinator::completeMigration(Operat "decision"_attr = (decision == DecisionEnum::kCommitted ? "committed" : "aborted"), "migrationId"_attr = _migrationInfo.getId()); - if (feature_flags::gFeatureFlagMigrationRecipientCriticalSection.isEnabled( - serverGlobalParams.featureCompatibility)) { + if (acquireCSOnRecipient) { if (!_releaseRecipientCriticalSectionFuture) { launchReleaseRecipientCriticalSection(opCtx); } diff --git a/src/mongo/db/s/migration_coordinator.h b/src/mongo/db/s/migration_coordinator.h index 7f3f1571355..9f8b9d34d67 100644 --- a/src/mongo/db/s/migration_coordinator.h +++ b/src/mongo/db/s/migration_coordinator.h @@ -87,7 +87,8 @@ public: * If the decision was to commit, returns a future that is set when range deletion for * the donated range completes. */ - boost::optional<SemiFuture<void>> completeMigration(OperationContext* opCtx); + boost::optional<SemiFuture<void>> completeMigration(OperationContext* opCtx, + bool acquireCSOnRecipient); /** * Deletes the persistent state for this migration from config.migrationCoordinators. diff --git a/src/mongo/db/s/migration_destination_manager.cpp b/src/mongo/db/s/migration_destination_manager.cpp index 3c0b51d80d6..4cf9b4b3904 100644 --- a/src/mongo/db/s/migration_destination_manager.cpp +++ b/src/mongo/db/s/migration_destination_manager.cpp @@ -489,6 +489,7 @@ Status MigrationDestinationManager::restoreRecoveredMigrationState( _lsid = recoveryDoc.getLsid(); _txnNumber = recoveryDoc.getTxnNumber(); _state = COMMIT_START; + _acquireCSOnRecipient = true; LOGV2(6064500, "Recovering migration recipient", "sessionId"_attr = *_sessionId); @@ -615,10 +616,13 @@ void MigrationDestinationManager::abortWithoutSessionIdCheck() { _errmsg = "aborted without session id check"; } -Status MigrationDestinationManager::startCommit(const MigrationSessionId& sessionId) { +Status MigrationDestinationManager::startCommit(const MigrationSessionId& sessionId, + bool acquireCSOnRecipient) { stdx::unique_lock<Latch> lock(_mutex); + _acquireCSOnRecipient = acquireCSOnRecipient; + const auto convergenceTimeout = Shard::kDefaultConfigCommandTimeout + Shard::kDefaultConfigCommandTimeout / 4; @@ -665,8 +669,7 @@ Status MigrationDestinationManager::startCommit(const MigrationSessionId& sessio // Assigning a timeout slightly higher than the one used for network requests to the config // server. Enough time to retry at least once in case of network failures (SERVER-51397). deadline = Date_t::now() + convergenceTimeout; - if (!feature_flags::gFeatureFlagMigrationRecipientCriticalSection.isEnabled( - serverGlobalParams.featureCompatibility)) { + if (!_acquireCSOnRecipient) { while (_sessionId) { if (stdx::cv_status::timeout == _isActiveCV.wait_until(lock, deadline.toSystemTimePoint())) { @@ -1545,8 +1548,7 @@ void MigrationDestinationManager::_migrateDriver(OperationContext* outerOpCtx, return; } - if (feature_flags::gFeatureFlagMigrationRecipientCriticalSection.isEnabled( - serverGlobalParams.featureCompatibility)) { + if (_acquireCSOnRecipient) { const auto critSecReason = criticalSectionReason(*_sessionId); runWithoutSession(outerOpCtx, [&] { @@ -1592,8 +1594,7 @@ void MigrationDestinationManager::_migrateDriver(OperationContext* outerOpCtx, } } - if (feature_flags::gFeatureFlagMigrationRecipientCriticalSection.isEnabled( - serverGlobalParams.featureCompatibility)) { + if (_acquireCSOnRecipient) { outerOpCtx->setAlwaysInterruptAtStepDownOrUp(); auto newClient = outerOpCtx->getServiceContext()->makeClient("MigrationCoordinator"); { diff --git a/src/mongo/db/s/migration_destination_manager.h b/src/mongo/db/s/migration_destination_manager.h index 5b6bee19bd1..37764e0e879 100644 --- a/src/mongo/db/s/migration_destination_manager.h +++ b/src/mongo/db/s/migration_destination_manager.h @@ -151,7 +151,7 @@ public: */ void abortWithoutSessionIdCheck(); - Status startCommit(const MigrationSessionId& sessionId); + Status startCommit(const MigrationSessionId& sessionId, bool acquireCSOnRecipient); /* * Refreshes the filtering metadata and releases the migration recipient critical section for @@ -299,6 +299,8 @@ private: // Condition variable, which is signalled every time the state of the migration changes. stdx::condition_variable _stateChangedCV; + + bool _acquireCSOnRecipient{false}; }; } // namespace mongo diff --git a/src/mongo/db/s/migration_destination_manager_legacy_commands.cpp b/src/mongo/db/s/migration_destination_manager_legacy_commands.cpp index 7f989dd1a03..419f2144c8e 100644 --- a/src/mongo/db/s/migration_destination_manager_legacy_commands.cpp +++ b/src/mongo/db/s/migration_destination_manager_legacy_commands.cpp @@ -226,7 +226,11 @@ public: BSONObjBuilder& result) override { auto const sessionId = uassertStatusOK(MigrationSessionId::extractFromBSON(cmdObj)); auto const mdm = MigrationDestinationManager::get(opCtx); - Status const status = mdm->startCommit(sessionId); + + const auto elem = cmdObj.getField("acquireCSOnRecipient"); + const auto acquireCSOnRecipient = elem ? elem.boolean() : false; + + Status const status = mdm->startCommit(sessionId, acquireCSOnRecipient); mdm->report(result, opCtx, false); if (!status.isOK()) { LOGV2(22014, diff --git a/src/mongo/db/s/migration_source_manager.cpp b/src/mongo/db/s/migration_source_manager.cpp index 03a57664103..cbf61928e9e 100644 --- a/src/mongo/db/s/migration_source_manager.cpp +++ b/src/mongo/db/s/migration_source_manager.cpp @@ -131,7 +131,9 @@ MigrationSourceManager::MigrationSourceManager(OperationContext* opCtx, _critSecReason(BSON("command" << "moveChunk" << "fromShard" << _args.getFromShardId() << "toShard" - << _args.getToShardId())) { + << _args.getToShardId())), + _acquireCSOnRecipient(feature_flags::gFeatureFlagMigrationRecipientCriticalSection.isEnabled( + serverGlobalParams.featureCompatibility)) { invariant(!_opCtx->lockState()->isLocked()); LOGV2(22016, @@ -376,7 +378,7 @@ Status MigrationSourceManager::commitChunkOnRecipient() { ScopeGuard scopedGuard([&] { cleanupOnError(); }); // Tell the recipient shard to fetch the latest changes. - auto commitCloneStatus = _cloneDriver->commitClone(_opCtx); + auto commitCloneStatus = _cloneDriver->commitClone(_opCtx, _acquireCSOnRecipient); if (MONGO_unlikely(failMigrationCommit.shouldFail()) && commitCloneStatus.isOK()) { commitCloneStatus = {ErrorCodes::InternalError, @@ -445,8 +447,7 @@ Status MigrationSourceManager::commitChunkMetadataOnConfig() { ErrorCodes::InternalError, "Failpoint 'migrationCommitNetworkError' generated error"); } - if (feature_flags::gFeatureFlagMigrationRecipientCriticalSection.isEnabled( - serverGlobalParams.featureCompatibility)) { + if (_acquireCSOnRecipient) { // Asynchronously tell the recipient to release its critical section _coordinator->launchReleaseRecipientCriticalSection(_opCtx); } @@ -542,9 +543,7 @@ Status MigrationSourceManager::commitChunkMetadataOnConfig() { const ChunkRange range(_args.getMinKey(), _args.getMaxKey()); - if (!feature_flags::gFeatureFlagMigrationRecipientCriticalSection.isEnabled( - serverGlobalParams.featureCompatibility) && - !MONGO_unlikely(doNotRefreshRecipientAfterCommit.shouldFail())) { + if (!_acquireCSOnRecipient && !MONGO_unlikely(doNotRefreshRecipientAfterCommit.shouldFail())) { // Best-effort make the recipient refresh its routing table to the new collection // version. refreshRecipientRoutingTable( @@ -747,7 +746,8 @@ void MigrationSourceManager::_cleanup(bool completeMigration) noexcept { // This can be called on an exception path after the OperationContext has been // interrupted, so use a new OperationContext. Note, it's valid to call // getServiceContext on an interrupted OperationContext. - _cleanupCompleteFuture = _coordinator->completeMigration(newOpCtx); + _cleanupCompleteFuture = + _coordinator->completeMigration(newOpCtx, _acquireCSOnRecipient); } } diff --git a/src/mongo/db/s/migration_source_manager.h b/src/mongo/db/s/migration_source_manager.h index fdd7fd2283f..0b09335a172 100644 --- a/src/mongo/db/s/migration_source_manager.h +++ b/src/mongo/db/s/migration_source_manager.h @@ -273,6 +273,9 @@ private: // Information about the moveChunk to be used in the critical section. const BSONObj _critSecReason; + + // It states whether the critical section has to be acquired on the recipient. + const bool _acquireCSOnRecipient; }; } // namespace mongo diff --git a/src/mongo/db/s/migration_util.cpp b/src/mongo/db/s/migration_util.cpp index fc74b4fbac2..7eb5fccf95d 100644 --- a/src/mongo/db/s/migration_util.cpp +++ b/src/mongo/db/s/migration_util.cpp @@ -952,12 +952,17 @@ void recoverMigrationCoordinations(OperationContext* opCtx, NamespaceString nss) LOGV2_DEBUG(4798501, 2, "Starting migration recovery", "namespace"_attr = nss); unsigned migrationRecoveryCount = 0; + + const auto acquireCSOnRecipient = + feature_flags::gFeatureFlagMigrationRecipientCriticalSection.isEnabled( + serverGlobalParams.featureCompatibility); PersistentTaskStore<MigrationCoordinatorDocument> store( NamespaceString::kMigrationCoordinatorsNamespace); store.forEach( opCtx, BSON(MigrationCoordinatorDocument::kNssFieldName << nss.toString()), - [&opCtx, &migrationRecoveryCount](const MigrationCoordinatorDocument& doc) { + [&opCtx, &migrationRecoveryCount, acquireCSOnRecipient]( + const MigrationCoordinatorDocument& doc) { LOGV2_DEBUG(4798502, 2, "Recovering migration", @@ -973,7 +978,7 @@ void recoverMigrationCoordinations(OperationContext* opCtx, NamespaceString nss) if (doc.getDecision()) { // The decision is already known. - coordinator.completeMigration(opCtx); + coordinator.completeMigration(opCtx, acquireCSOnRecipient); return true; } @@ -1044,7 +1049,7 @@ void recoverMigrationCoordinations(OperationContext* opCtx, NamespaceString nss) coordinator.setMigrationDecision(DecisionEnum::kCommitted); } - coordinator.completeMigration(opCtx); + coordinator.completeMigration(opCtx, acquireCSOnRecipient); setFilteringMetadata(); return true; }); @@ -1129,11 +1134,6 @@ void deleteMigrationRecipientRecoveryDocument(OperationContext* opCtx, const UUI } void resumeMigrationRecipientsOnStepUp(OperationContext* opCtx) { - if (!feature_flags::gFeatureFlagMigrationRecipientCriticalSection.isEnabled( - serverGlobalParams.featureCompatibility)) { - return; - } - LOGV2_DEBUG(6064504, 2, "Starting migration recipient step-up recovery"); unsigned long long ongoingMigrationRecipientsCount = 0; |