From b95494949b10daac02ae9c0da2d492d2e3294f6e Mon Sep 17 00:00:00 2001 From: Antonio Fuschetto Date: Wed, 7 Apr 2021 18:21:57 +0000 Subject: SERVER-55635 Make the setFCV document updates idempotent in a sharded cluster --- .../db/commands/feature_compatibility_version.cpp | 93 ++++++++++++++++++---- .../db/commands/feature_compatibility_version.h | 14 +++- .../feature_compatibility_version_document.idl | 6 ++ .../commands/set_feature_compatibility_version.idl | 7 ++ .../set_feature_compatibility_version_command.cpp | 59 +++++++++++--- 5 files changed, 148 insertions(+), 31 deletions(-) (limited to 'src') diff --git a/src/mongo/db/commands/feature_compatibility_version.cpp b/src/mongo/db/commands/feature_compatibility_version.cpp index 93fa3ac6de6..6ce5e94a728 100644 --- a/src/mongo/db/commands/feature_compatibility_version.cpp +++ b/src/mongo/db/commands/feature_compatibility_version.cpp @@ -200,20 +200,6 @@ bool isWriteableStorageEngine() { return !storageGlobalParams.readOnly && (storageGlobalParams.engine != "devnull"); } -// Returns the featureCompatibilityVersion document if it exists. -boost::optional findFcvDocument(OperationContext* opCtx) { - // Ensure database is opened and exists. - AutoGetOrCreateDb autoDb(opCtx, NamespaceString::kServerConfigurationNamespace.db(), MODE_IX); - - const auto query = BSON("_id" << FeatureCompatibilityVersionParser::kParameterName); - const auto swFcv = repl::StorageInterface::get(opCtx)->findById( - opCtx, NamespaceString::kServerConfigurationNamespace, query["_id"]); - if (!swFcv.isOK()) { - return boost::none; - } - return swFcv.getValue(); -} - /** * Build update command for featureCompatibilityVersion document updates. */ @@ -253,13 +239,74 @@ void runUpdateCommand(OperationContext* opCtx, const FeatureCompatibilityVersion } // namespace +boost::optional FeatureCompatibilityVersion::findFeatureCompatibilityVersionDocument( + OperationContext* opCtx) { + AutoGetOrCreateDb autoDb(opCtx, NamespaceString::kServerConfigurationNamespace.db(), MODE_IX); + + const auto query = BSON("_id" << FeatureCompatibilityVersionParser::kParameterName); + const auto swFcv = repl::StorageInterface::get(opCtx)->findById( + opCtx, NamespaceString::kServerConfigurationNamespace, query["_id"]); + if (!swFcv.isOK()) { + return boost::none; + } + return swFcv.getValue(); +} + void FeatureCompatibilityVersion::validateSetFeatureCompatibilityVersionRequest( - FCV fromVersion, FCV newVersion, bool isFromConfigServer) { + OperationContext* opCtx, const SetFeatureCompatibilityVersion& setFCVRequest, FCV fromVersion) { + + auto newVersion = setFCVRequest.getCommandParameter(); + auto isFromConfigServer = setFCVRequest.getFromConfigServer().value_or(false); + uassert( 5147403, "cannot set featureCompatibilityVersion to '{}' while featureCompatibilityVersion is '{}'"_format( FCVP::toString(newVersion), FCVP::toString(fromVersion)), fcvTransitions.permitsTransition(fromVersion, newVersion, isFromConfigServer)); + + auto setFCVPhase = setFCVRequest.getPhase(); + if (!isFromConfigServer || !setFCVPhase) { + return; + } + + auto changeTimestamp = setFCVRequest.getChangeTimestamp(); + invariant(changeTimestamp); + + auto fcvObj = findFeatureCompatibilityVersionDocument(opCtx); + auto fcvDoc = FeatureCompatibilityVersionDocument::parse( + IDLParserErrorContext("featureCompatibilityVersionDocument"), fcvObj.get()); + auto previousTimestamp = fcvDoc.getChangeTimestamp(); + + if (setFCVPhase == SetFCVPhaseEnum::kStart) { + uassert( + 5563501, + "Shard received a timestamp for phase 1 of the 'setFeatureCompatibilityVersion' " + "command which is too old, so the request is discarded. This may indicate that it is a " + "request related to a previous invocation of the 'setFeatureCompatibilityVersion' " + "command which, for example, was temporarily stuck on a router.", + previousTimestamp && previousTimestamp > changeTimestamp); + } else { + uassert(5563601, + "Cannot transition to fully upgraded or fully downgraded state if the shard is not " + "in kUpgrading or kDowngrading state", + serverGlobalParams.featureCompatibility.isUpgradingOrDowngrading()); + + tassert(5563502, + "Shard received a timestamp for phase-2 of the 'setFeatureCompatibilityVersion' " + "command that does not match the one received for phase-1, so the request is " + "discarded. This may indicate that it is a request related to a previous " + "invocation of the 'setFeatureCompatibilityVersion' command which, for example, " + "was temporarily stuck on a router.", + !previousTimestamp || previousTimestamp < changeTimestamp); + + uassert( + 5563503, + "Shard received a timestamp for phase-2 of the 'setFeatureCompatibilityVersion' " + "command which is too old, so the request is discarded. This may indicate that it is a " + "request related to a previous invocation of the 'setFeatureCompatibilityVersion' " + "command which, for example, was temporarily stuck on a router.", + previousTimestamp > changeTimestamp); + } } void FeatureCompatibilityVersion::updateFeatureCompatibilityVersionDocument( @@ -267,6 +314,7 @@ void FeatureCompatibilityVersion::updateFeatureCompatibilityVersionDocument( ServerGlobalParams::FeatureCompatibility::Version fromVersion, ServerGlobalParams::FeatureCompatibility::Version newVersion, bool isFromConfigServer, + boost::optional changeTimestamp, bool setTargetVersion) { // Only transition to fully upgraded or downgraded states when we @@ -277,6 +325,17 @@ void FeatureCompatibilityVersion::updateFeatureCompatibilityVersionDocument( : fcvTransitions.getTransitionalVersion(fromVersion, newVersion, isFromConfigServer); FeatureCompatibilityVersionDocument fcvDoc = fcvTransitions.getFCVDocument(transitioningVersion); + + // The timestamp must be removed when downgrading to version 4.9 or 4.4. This is necessary to + // avoid the presence of an unknown field that old binaries are unable to deserialize. + if (transitioningVersion == ServerGlobalParams::FeatureCompatibility::Version::kVersion49 || + transitioningVersion == + ServerGlobalParams::FeatureCompatibility::Version::kFullyDowngradedTo44) { + fcvDoc.setChangeTimestamp(boost::none); + } else { + fcvDoc.setChangeTimestamp(changeTimestamp); + } + runUpdateCommand(opCtx, fcvDoc); } @@ -360,7 +419,7 @@ void FeatureCompatibilityVersion::updateMinWireVersion() { void FeatureCompatibilityVersion::initializeForStartup(OperationContext* opCtx) { // Global write lock must be held. invariant(opCtx->lockState()->isW()); - auto featureCompatibilityVersion = findFcvDocument(opCtx); + auto featureCompatibilityVersion = findFeatureCompatibilityVersionDocument(opCtx); if (!featureCompatibilityVersion) { return; } @@ -414,7 +473,7 @@ void FeatureCompatibilityVersion::fassertInitializedAfterStartup(OperationContex return; } - auto fcvDocument = findFcvDocument(opCtx); + auto fcvDocument = findFeatureCompatibilityVersionDocument(opCtx); auto const storageEngine = opCtx->getServiceContext()->getStorageEngine(); auto dbNames = storageEngine->listDatabases(); diff --git a/src/mongo/db/commands/feature_compatibility_version.h b/src/mongo/db/commands/feature_compatibility_version.h index 41563a7f42c..dc7a2ca8ef3 100644 --- a/src/mongo/db/commands/feature_compatibility_version.h +++ b/src/mongo/db/commands/feature_compatibility_version.h @@ -33,6 +33,7 @@ #include "mongo/base/string_data.h" #include "mongo/bson/bsonobj.h" #include "mongo/db/commands/feature_compatibility_version_document_gen.h" +#include "mongo/db/commands/set_feature_compatibility_version_gen.h" #include "mongo/db/repl/storage_interface.h" #include "mongo/db/server_options.h" @@ -55,14 +56,20 @@ public: */ static void fassertInitializedAfterStartup(OperationContext* opCtx); + /** + * Returns the on-disk feature compatibility version document if it exists. + */ + static boost::optional findFeatureCompatibilityVersionDocument( + OperationContext* opCtx); + /** * uassert that a transition from fromVersion to newVersion is permitted. Different rules apply * if the request is from a config server. */ static void validateSetFeatureCompatibilityVersionRequest( - ServerGlobalParams::FeatureCompatibility::Version fromVersion, - ServerGlobalParams::FeatureCompatibility::Version newVersion, - bool isFromConfigServer); + OperationContext* opCtx, + const SetFeatureCompatibilityVersion& setFCVRequest, + ServerGlobalParams::FeatureCompatibility::Version fromVersion); /** * Updates the on-disk feature compatibility version document for the transition fromVersion -> @@ -73,6 +80,7 @@ public: ServerGlobalParams::FeatureCompatibility::Version fromVersion, ServerGlobalParams::FeatureCompatibility::Version newVersion, bool isFromConfigServer, + boost::optional timestamp, bool setTargetVersion); /** diff --git a/src/mongo/db/commands/feature_compatibility_version_document.idl b/src/mongo/db/commands/feature_compatibility_version_document.idl index fc24de54653..90322ed5043 100644 --- a/src/mongo/db/commands/feature_compatibility_version_document.idl +++ b/src/mongo/db/commands/feature_compatibility_version_document.idl @@ -58,3 +58,9 @@ structs: optional: true validator: callback: "FeatureCompatibilityVersionParser::validatePreviousVersionField" + changeTimestamp: + description: "Timestamp used to identify the 2-phase setFCV request. Both phases (kStart and + kComplete) must have the same timestamp for the entire sequence, and every new + sequence started must strictly have incrementing timestamp." + type: timestamp + optional: true diff --git a/src/mongo/db/commands/set_feature_compatibility_version.idl b/src/mongo/db/commands/set_feature_compatibility_version.idl index d6fb33b6193..87a36367e76 100644 --- a/src/mongo/db/commands/set_feature_compatibility_version.idl +++ b/src/mongo/db/commands/set_feature_compatibility_version.idl @@ -75,3 +75,10 @@ commands: shard, it will run the full setFCV sequence (both phases)." type: SetFCVPhase optional: true + changeTimestamp: + description: "Timestamp used to identify the 2-phase setFCV request. Both phases + (kStart and kComplete) must have the same timestamp for the entire + sequence, and every new sequence started must strictly have + incrementing timestamp." + type: timestamp + optional: true diff --git a/src/mongo/db/commands/set_feature_compatibility_version_command.cpp b/src/mongo/db/commands/set_feature_compatibility_version_command.cpp index cf77ab48685..4bb4bd9a4d8 100644 --- a/src/mongo/db/commands/set_feature_compatibility_version_command.cpp +++ b/src/mongo/db/commands/set_feature_compatibility_version_command.cpp @@ -56,6 +56,7 @@ #include "mongo/db/s/config/sharding_catalog_manager.h" #include "mongo/db/server_options.h" #include "mongo/db/transaction_history_iterator.h" +#include "mongo/db/vector_clock.h" #include "mongo/db/views/view_catalog.h" #include "mongo/logv2/log.h" #include "mongo/rpc/get_status_from_command_result.h" @@ -294,14 +295,44 @@ public: return true; } - auto isFromConfigServer = request.getFromConfigServer().value_or(false); + boost::optional changeTimestamp; + if (serverGlobalParams.clusterRole == ClusterRole::ConfigServer) { + // The Config Server creates a new ID (i.e., timestamp) when it receives an upgrade or + // downgrade request. Alternatively, the request refers to a previously aborted + // operation for which the local FCV document must contain the ID to be reused. + + if (!serverGlobalParams.featureCompatibility.isUpgradingOrDowngrading()) { + const auto now = VectorClock::get(opCtx)->getTime(); + changeTimestamp = now.clusterTime().asTimestamp(); + } else { + auto fcvObj = + FeatureCompatibilityVersion::findFeatureCompatibilityVersionDocument(opCtx); + auto fcvDoc = FeatureCompatibilityVersionDocument::parse( + IDLParserErrorContext("featureCompatibilityVersionDocument"), fcvObj.get()); + changeTimestamp = fcvDoc.getChangeTimestamp(); + invariant(changeTimestamp); + } + } else if (serverGlobalParams.clusterRole == ClusterRole::ShardServer && + request.getPhase()) { + // Shards receive the timestamp from the Config Server's request. + changeTimestamp = request.getChangeTimestamp(); + uassert(5563500, + "The 'timestamp' field is missing even though the node is running as a shard. " + "This may indicate that the 'setFeatureCompatibilityVersion' command was " + "invoked directly against the shard or that the config server has not been " + "upgraded to at least version 5.0.", + changeTimestamp); + } + FeatureCompatibilityVersion::validateSetFeatureCompatibilityVersionRequest( - actualVersion, requestedVersion, isFromConfigServer); + opCtx, request, actualVersion); uassert(5563600, "'phase' field is only valid to be specified on shards", !request.getPhase() || serverGlobalParams.clusterRole == ClusterRole::ShardServer); + auto isFromConfigServer = request.getFromConfigServer().value_or(false); + if (!request.getPhase() || request.getPhase() == SetFCVPhaseEnum::kStart) { { // Start transition to 'requestedVersion' by updating the local FCV document to a @@ -316,6 +347,7 @@ public: actualVersion, requestedVersion, isFromConfigServer, + changeTimestamp, true /* setTargetVersion */); } @@ -331,19 +363,15 @@ public: } } + invariant(serverGlobalParams.featureCompatibility.isUpgradingOrDowngrading()); invariant(!request.getPhase() || request.getPhase() == SetFCVPhaseEnum::kComplete); - uassert(5563601, - "Cannot transition to fully upgraded or fully downgraded state if we are not in " - "kUpgrading or kDowngrading state", - serverGlobalParams.featureCompatibility.isUpgradingOrDowngrading()); - hangAfterStartingFCVTransition.pauseWhileSet(opCtx); if (requestedVersion > actualVersion) { - _runUpgrade(opCtx, request); + _runUpgrade(opCtx, request, changeTimestamp); } else { - _runDowngrade(opCtx, request); + _runDowngrade(opCtx, request, changeTimestamp); } { @@ -356,6 +384,7 @@ public: serverGlobalParams.featureCompatibility.getVersion(), requestedVersion, isFromConfigServer, + changeTimestamp, false /* setTargetVersion */); } @@ -363,13 +392,16 @@ public: } private: - void _runUpgrade(OperationContext* opCtx, const SetFeatureCompatibilityVersion& request) { + void _runUpgrade(OperationContext* opCtx, + const SetFeatureCompatibilityVersion& request, + boost::optional changeTimestamp) { const auto requestedVersion = request.getCommandParameter(); if (serverGlobalParams.clusterRole == ClusterRole::ConfigServer) { // Tell the shards to enter phase-1 of setFCV auto requestPhase1 = request; requestPhase1.setPhase(SetFCVPhaseEnum::kStart); + requestPhase1.setChangeTimestamp(changeTimestamp); uassertStatusOK( ShardingCatalogManager::get(opCtx)->setFeatureCompatibilityVersionOnShards( opCtx, CommandHelpers::appendMajorityWriteConcern(requestPhase1.toBSON({})))); @@ -434,6 +466,7 @@ private: // Tell the shards to enter phase-2 of setFCV (fully upgraded) auto requestPhase2 = request; requestPhase2.setPhase(SetFCVPhaseEnum::kComplete); + requestPhase2.setChangeTimestamp(changeTimestamp); uassertStatusOK( ShardingCatalogManager::get(opCtx)->setFeatureCompatibilityVersionOnShards( opCtx, CommandHelpers::appendMajorityWriteConcern(requestPhase2.toBSON({})))); @@ -483,13 +516,16 @@ private: } } - void _runDowngrade(OperationContext* opCtx, const SetFeatureCompatibilityVersion& request) { + void _runDowngrade(OperationContext* opCtx, + const SetFeatureCompatibilityVersion& request, + boost::optional changeTimestamp) { const auto requestedVersion = request.getCommandParameter(); if (serverGlobalParams.clusterRole == ClusterRole::ConfigServer) { // Tell the shards to enter phase-1 of setFCV auto requestPhase1 = request; requestPhase1.setPhase(SetFCVPhaseEnum::kStart); + requestPhase1.setChangeTimestamp(changeTimestamp); uassertStatusOK( ShardingCatalogManager::get(opCtx)->setFeatureCompatibilityVersionOnShards( opCtx, CommandHelpers::appendMajorityWriteConcern(requestPhase1.toBSON({})))); @@ -572,6 +608,7 @@ private: // Tell the shards to enter phase-2 of setFCV (fully downgraded) auto requestPhase2 = request; requestPhase2.setPhase(SetFCVPhaseEnum::kComplete); + requestPhase2.setChangeTimestamp(changeTimestamp); uassertStatusOK( ShardingCatalogManager::get(opCtx)->setFeatureCompatibilityVersionOnShards( opCtx, CommandHelpers::appendMajorityWriteConcern(requestPhase2.toBSON({})))); -- cgit v1.2.1