diff options
author | Siyuan Zhou <siyuan.zhou@mongodb.com> | 2020-02-24 01:22:36 -0500 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2020-02-29 00:02:40 +0000 |
commit | 6c0645af5999ef52579c04f1b6fff422bd89da3c (patch) | |
tree | d8ce20701fef24f7ed9f2691e4972b792487f53e | |
parent | 2d8f351c2640f402027a2fc0c51628db8d4f85b1 (diff) | |
download | mongo-6c0645af5999ef52579c04f1b6fff422bd89da3c.tar.gz |
SERVER-46288 Reconfig in 4.2 style with the current config on FCV downgrade
- Refactoring doReplSetConfig to accept a callback.
- Use replset tag system to wait for config replication.
- Revert kConfigMajority write concern.
- Include all nodes including arbiters in config replication quorum.
- Reconfig with term -1 using the current config on downgrade.
27 files changed, 427 insertions, 247 deletions
diff --git a/jstests/multiVersion/genericSetFCVUsage/set_feature_compatibility_version.js b/jstests/multiVersion/genericSetFCVUsage/set_feature_compatibility_version.js index d7a1c8853f0..02e6fbb9d40 100644 --- a/jstests/multiVersion/genericSetFCVUsage/set_feature_compatibility_version.js +++ b/jstests/multiVersion/genericSetFCVUsage/set_feature_compatibility_version.js @@ -309,8 +309,9 @@ assert.commandFailed( st.rs0.getPrimary().discardMessagesFrom(st.configRS.getPrimary(), 1.0); jsTestLog( "EXPECTED TO FAIL: setFeatureCompatibilityVersion cannot be set because the shard primary is not reachable"); +// Downgrading from 4.4 needs to wait for all nodes to reconfig, so we specify a higher timeout. assert.commandFailed( - mongosAdminDB.runCommand({setFeatureCompatibilityVersion: lastStableFCV, maxTimeMS: 1000})); + mongosAdminDB.runCommand({setFeatureCompatibilityVersion: lastStableFCV, maxTimeMS: 10000})); checkFCV(configPrimaryAdminDB, lastStableFCV, lastStableFCV /* indicates downgrade in progress */); st.rs0.getPrimary().discardMessagesFrom(st.configRS.getPrimary(), 0.0); diff --git a/jstests/multiVersion/remove_config_term_on_replset_downgrade.js b/jstests/multiVersion/remove_config_term_on_replset_downgrade.js index 4315fdcc506..782c4e24fd8 100644 --- a/jstests/multiVersion/remove_config_term_on_replset_downgrade.js +++ b/jstests/multiVersion/remove_config_term_on_replset_downgrade.js @@ -52,7 +52,11 @@ rst.nodes.forEach(function(node) { jsTestLog("Checking the config term on node " + tojson(node.host) + " after FCV downgrade."); let config = node.getDB("local").getCollection("system.replset").findOne(); assert(!config.hasOwnProperty("term"), tojson(config)); + // The configs can only differ in config versions and terms. config.term = configInNewVersion.term; + // The versions differ because of the force reconfig on downgrade. + assert.eq(config.version, configInNewVersion.version + 1); + config.version = configInNewVersion.version; assert.docEq(configInNewVersion, config); }); @@ -66,7 +70,11 @@ rst.nodes.forEach(function(node) { jsTestLog("Checking the config term on node " + tojson(node.host) + " after binary downgrade."); let config = node.getDB("local").getCollection("system.replset").findOne(); assert(!config.hasOwnProperty("term"), tojson(config)); + // The configs can only differ in config versions and terms. config.term = configInNewVersion.term; + // The versions differ because of the force reconfig on downgrade. + assert.eq(config.version, configInNewVersion.version + 1); + config.version = configInNewVersion.version; assert.docEq(configInNewVersion, config); }); rst.stopSet(); diff --git a/jstests/replsets/awaitable_ismaster_fcv_change.js b/jstests/replsets/awaitable_ismaster_fcv_change.js index 872e9ff33e8..5099acf462d 100644 --- a/jstests/replsets/awaitable_ismaster_fcv_change.js +++ b/jstests/replsets/awaitable_ismaster_fcv_change.js @@ -40,7 +40,7 @@ assert(secondaryTopologyVersion.hasOwnProperty("counter"), tojson(secondaryTopol function runAwaitableIsMasterBeforeFCVChange( topologyVersionField, isUpgrade, isPrimary, prevMinWireVersion, serverMaxWireVersion) { db.getMongo().setSlaveOk(); - const firstResponse = assert.commandWorked(db.runCommand({ + let response = assert.commandWorked(db.runCommand({ isMaster: 1, topologyVersion: topologyVersionField, maxAwaitTimeMS: 99999999, @@ -48,26 +48,37 @@ function runAwaitableIsMasterBeforeFCVChange( {minWireVersion: NumberInt(0), maxWireVersion: NumberInt(serverMaxWireVersion)}, })); + // On downgrade from 4.4 to 4.2, the primary will reconfig the replset and signal isMaster. + if (prevMinWireVersion === response.minWireVersion) { + jsTestLog("Min wire version didn't change: " + prevMinWireVersion + ". Retry isMaster."); + topologyVersionField = response.topologyVersion; + response = assert.commandWorked(db.runCommand({ + isMaster: 1, + topologyVersion: topologyVersionField, + maxAwaitTimeMS: 99999999, + internalClient: + {minWireVersion: NumberInt(0), maxWireVersion: NumberInt(serverMaxWireVersion)}, + })); + } // We only expect to increment the server TopologyVersion when the minWireVersion has changed. // This can only happen in two scenarios: // 1. Setting featureCompatibilityVersion from downgrading to fullyDowngraded. // 2. Setting featureCompatibilityVersion from fullyDowngraded to upgrading. - assert.eq( - topologyVersionField.counter + 1, firstResponse.topologyVersion.counter, firstResponse); + assert.eq(topologyVersionField.counter + 1, response.topologyVersion.counter, response); const expectedIsMasterValue = isPrimary; const expectedSecondaryValue = !isPrimary; - assert.eq(expectedIsMasterValue, firstResponse.ismaster, firstResponse); - assert.eq(expectedSecondaryValue, firstResponse.secondary, firstResponse); + assert.eq(expectedIsMasterValue, response.ismaster, response); + assert.eq(expectedSecondaryValue, response.secondary, response); - const minWireVersion = firstResponse.minWireVersion; - const maxWireVersion = firstResponse.maxWireVersion; + const minWireVersion = response.minWireVersion; + const maxWireVersion = response.maxWireVersion; assert.neq(prevMinWireVersion, minWireVersion); if (isUpgrade) { // minWireVersion should always equal maxWireVersion if we have not fully downgraded FCV. - assert.eq(minWireVersion, maxWireVersion, firstResponse); + assert.eq(minWireVersion, maxWireVersion, response); } else { - assert.eq(minWireVersion + 1, maxWireVersion, firstResponse); + assert.eq(minWireVersion + 1, maxWireVersion, response); } } diff --git a/src/mongo/db/commands/feature_compatibility_version.cpp b/src/mongo/db/commands/feature_compatibility_version.cpp index cffd5882666..1e06fc65e88 100644 --- a/src/mongo/db/commands/feature_compatibility_version.cpp +++ b/src/mongo/db/commands/feature_compatibility_version.cpp @@ -165,17 +165,6 @@ void FeatureCompatibilityVersion::onInsertOrUpdate(OperationContext* opCtx, cons FeatureCompatibilityVersionParser::toString(newVersion)); } - // Remove term field of config document on downgrade. - if (newVersion == ServerGlobalParams::FeatureCompatibility::Version::kFullyDowngradedTo42 && - repl::ReplicationCoordinator::get(opCtx)->getReplicationMode() == - repl::ReplicationCoordinator::modeReplSet) { - auto storageInterface = repl::StorageInterface::get(opCtx); - repl::UnreplicatedWritesBlock uwb(opCtx); - repl::TimestampedBSONObj update{BSON("$unset" << BSON("term" << 1)), Timestamp()}; - uassertStatusOK(storageInterface->updateSingleton( - opCtx, NamespaceString::kSystemReplSetNamespace, {}, update)); - } - opCtx->recoveryUnit()->onCommit([opCtx, newVersion](boost::optional<Timestamp>) { serverGlobalParams.featureCompatibility.setVersion(newVersion); updateMinWireVersion(); diff --git a/src/mongo/db/commands/set_feature_compatibility_version_command.cpp b/src/mongo/db/commands/set_feature_compatibility_version_command.cpp index b8794e24eb2..f35c67c5321 100644 --- a/src/mongo/db/commands/set_feature_compatibility_version_command.cpp +++ b/src/mongo/db/commands/set_feature_compatibility_version_command.cpp @@ -161,8 +161,9 @@ public: auto waitForWCStatus = waitForWriteConcern( opCtx, repl::ReplClientInfo::forClient(opCtx->getClient()).getLastOp(), - WriteConcernOptions( - WriteConcernOptions::kMajority, WriteConcernOptions::SyncMode::UNSET, timeout), + WriteConcernOptions(repl::ReplSetConfig::kMajorityWriteConcernModeName, + WriteConcernOptions::SyncMode::UNSET, + timeout), &res); CommandHelpers::appendCommandWCStatus(result, waitForWCStatus, res); }); @@ -305,6 +306,44 @@ public: FeatureCompatibilityVersion::setTargetDowngrade(opCtx); + // Safe reconfig introduces a new "term" field in the config document. If the user tries + // to downgrade the replset to FCV42, the primary will initiate a reconfig without the + // term and wait for it to be replicated on all nodes. + auto replCoord = repl::ReplicationCoordinator::get(opCtx); + const bool isReplSet = + replCoord->getReplicationMode() == repl::ReplicationCoordinator::modeReplSet; + if (isReplSet && + replCoord->getConfig().getConfigTerm() != repl::OpTime::kUninitializedTerm) { + // Force reconfig with term -1 to remove the 4.2 incompatible "term" field. + auto getNewConfig = [&](const repl::ReplSetConfig& oldConfig, long long term) { + auto newConfig = oldConfig; + newConfig.setConfigTerm(repl::OpTime::kUninitializedTerm); + newConfig.setConfigVersion(newConfig.getConfigVersion() + 1); + return newConfig; + }; + + // "force" reconfig in order to skip safety checks. This is safe since the content + // of config is the same. + LOGV2(4628800, "Downgrading replica set config."); + auto status = replCoord->doReplSetReconfig(opCtx, getNewConfig, true /* force */); + uassertStatusOKWithContext(status, "Failed to downgrade the replica set config"); + + LOGV2(4628801, + "Waiting for the downgraded replica set config to propagate to all nodes"); + // If a write concern is given, we'll use its wTimeout. It's kNoTimeout by default. + WriteConcernOptions writeConcern(repl::ReplSetConfig::kConfigAllWriteConcernName, + WriteConcernOptions::SyncMode::NONE, + opCtx->getWriteConcern().wTimeout); + writeConcern.checkCondition = WriteConcernOptions::CheckCondition::Config; + repl::OpTime fakeOpTime(Timestamp(1, 1), replCoord->getTerm()); + uassertStatusOKWithContext( + replCoord->awaitReplication(opCtx, fakeOpTime, writeConcern).status, + "Failed to wait for the downgraded replica set config to propagate to all " + "nodes"); + LOGV2(4628802, + "The downgraded replica set config has been propagated to all nodes"); + } + { // Take the global lock in S mode to create a barrier for operations taking the // global IX or X locks. This ensures that either @@ -319,9 +358,6 @@ public: if (failDowngrading.shouldFail()) return false; - const bool isReplSet = repl::ReplicationCoordinator::get(opCtx)->getReplicationMode() == - repl::ReplicationCoordinator::modeReplSet; - if (serverGlobalParams.clusterRole == ClusterRole::ShardServer) { LOGV2(20502, "Downgrade: dropping config.rangeDeletions collection"); migrationutil::dropRangeDeletionsCollection(opCtx); diff --git a/src/mongo/db/repl/member_config.cpp b/src/mongo/db/repl/member_config.cpp index 5ab43763c8c..2a9e9f0c16c 100644 --- a/src/mongo/db/repl/member_config.cpp +++ b/src/mongo/db/repl/member_config.cpp @@ -54,6 +54,7 @@ const std::string MemberConfig::kHorizonsFieldName = "horizons"; const std::string MemberConfig::kInternalVoterTagName = "$voter"; const std::string MemberConfig::kInternalElectableTagName = "$electable"; const std::string MemberConfig::kInternalAllTagName = "$all"; +const std::string MemberConfig::kConfigAllTagName = "$configAll"; namespace { const std::string kLegalMemberConfigFieldNames[] = {MemberConfig::kIdFieldName, @@ -217,6 +218,9 @@ MemberConfig::MemberConfig(const BSONObj& mcfg, ReplSetTagConfig* tagConfig) { if (!_arbiterOnly) { _tags.push_back(tagConfig->makeTag(kInternalAllTagName, id)); } + + // Add a tag for every node, including arbiters. + _tags.push_back(tagConfig->makeTag(kConfigAllTagName, id)); } Status MemberConfig::validate() const { @@ -231,7 +235,8 @@ Status MemberConfig::validate() const { << " but must be 0 or 1"); } if (_arbiterOnly) { - if (!_tags.empty()) { + // Arbiters only have one internal tag. + if (_tags.size() != 1) { return Status(ErrorCodes::BadValue, "Cannot set tags on arbiters."); } if (!isVoter()) { diff --git a/src/mongo/db/repl/member_config.h b/src/mongo/db/repl/member_config.h index 29b6e95232a..1994bacd753 100644 --- a/src/mongo/db/repl/member_config.h +++ b/src/mongo/db/repl/member_config.h @@ -66,6 +66,7 @@ public: static const std::string kInternalVoterTagName; static const std::string kInternalElectableTagName; static const std::string kInternalAllTagName; + static const std::string kConfigAllTagName; /** * Construct a MemberConfig from the contents of "mcfg". diff --git a/src/mongo/db/repl/member_config_test.cpp b/src/mongo/db/repl/member_config_test.cpp index 30941d51544..74adeb64776 100644 --- a/src/mongo/db/repl/member_config_test.cpp +++ b/src/mongo/db/repl/member_config_test.cpp @@ -52,7 +52,7 @@ TEST(MemberConfig, ParseMinimalMemberConfigAndCheckDefaults) { ASSERT_FALSE(mc.isHidden()); ASSERT_FALSE(mc.isArbiter()); ASSERT_TRUE(mc.shouldBuildIndexes()); - ASSERT_EQUALS(3U, mc.getNumTags()); + ASSERT_EQUALS(4U, mc.getNumTags()); ASSERT_OK(mc.validate()); } @@ -289,14 +289,16 @@ TEST(MemberConfig, ParseTags) { << "k2" << "v2")), &tagConfig); - ASSERT_EQUALS(5U, mc.getNumTags()); - ASSERT_EQUALS(5, std::distance(mc.tagsBegin(), mc.tagsEnd())); + ASSERT_EQUALS(6U, mc.getNumTags()); + ASSERT_EQUALS(6, std::distance(mc.tagsBegin(), mc.tagsEnd())); ASSERT_EQUALS(1, std::count(mc.tagsBegin(), mc.tagsEnd(), tagConfig.findTag("k1", "v1"))); ASSERT_EQUALS(1, std::count(mc.tagsBegin(), mc.tagsEnd(), tagConfig.findTag("k2", "v2"))); ASSERT_EQUALS(1, std::count(mc.tagsBegin(), mc.tagsEnd(), tagConfig.findTag("$voter", "0"))); ASSERT_EQUALS(1, std::count(mc.tagsBegin(), mc.tagsEnd(), tagConfig.findTag("$electable", "0"))); ASSERT_EQUALS(1, std::count(mc.tagsBegin(), mc.tagsEnd(), tagConfig.findTag("$all", "0"))); + ASSERT_EQUALS(1, + std::count(mc.tagsBegin(), mc.tagsEnd(), tagConfig.findTag("$configAll", "0"))); } TEST(MemberConfig, ParseHorizonFields) { diff --git a/src/mongo/db/repl/repl_set_config.cpp b/src/mongo/db/repl/repl_set_config.cpp index a19bb6d2b5a..23add8494e8 100644 --- a/src/mongo/db/repl/repl_set_config.cpp +++ b/src/mongo/db/repl/repl_set_config.cpp @@ -55,7 +55,6 @@ const Milliseconds ReplSetConfig::kCatchUpTakeoverDisabled(-1); const std::string ReplSetConfig::kConfigServerFieldName = "configsvr"; const std::string ReplSetConfig::kVersionFieldName = "version"; const std::string ReplSetConfig::kTermFieldName = "term"; -const std::string ReplSetConfig::kMajorityWriteConcernModeName = "$majority"; const Milliseconds ReplSetConfig::kDefaultHeartbeatInterval(2000); const Seconds ReplSetConfig::kDefaultHeartbeatTimeoutPeriod(10); const Milliseconds ReplSetConfig::kDefaultElectionTimeoutPeriod(10000); @@ -686,8 +685,7 @@ Status ReplSetConfig::validate() const { Status ReplSetConfig::checkIfWriteConcernCanBeSatisfied( const WriteConcernOptions& writeConcern) const { - if (!writeConcern.wMode.empty() && writeConcern.wMode != WriteConcernOptions::kMajority && - writeConcern.wMode != WriteConcernOptions::kConfigMajority) { + if (!writeConcern.wMode.empty() && writeConcern.wMode != WriteConcernOptions::kMajority) { StatusWith<ReplSetTagPattern> tagPatternStatus = findCustomWriteMode(writeConcern.wMode); if (!tagPatternStatus.isOK()) { return tagPatternStatus.getStatus(); @@ -841,6 +839,30 @@ void ReplSetConfig::_addInternalWriteConcernModes() { // other errors are unexpected fassert(28694, status); } + + // $majorityConfig: the majority of all members including arbiters. + pattern = _tagConfig.makePattern(); + status = _tagConfig.addTagCountConstraintToPattern( + &pattern, MemberConfig::kConfigAllTagName, _members.size() / 2 + 1); + if (status.isOK()) { + _customWriteConcernModes[kConfigMajorityWriteConcernModeName] = pattern; + } else if (status != ErrorCodes::NoSuchKey) { + // NoSuchKey means we have no $configAll-tagged nodes in this config; + // other errors are unexpected + fassert(31472, status); + } + + // $configAll: all members including arbiters. + pattern = _tagConfig.makePattern(); + status = _tagConfig.addTagCountConstraintToPattern( + &pattern, MemberConfig::kConfigAllTagName, _members.size()); + if (status.isOK()) { + _customWriteConcernModes[kConfigAllWriteConcernName] = pattern; + } else if (status != ErrorCodes::NoSuchKey) { + // NoSuchKey means we have no $all-tagged nodes in this config; + // other errors are unexpected + fassert(31473, status); + } } void ReplSetConfig::_initializeConnectionString() { diff --git a/src/mongo/db/repl/repl_set_config.h b/src/mongo/db/repl/repl_set_config.h index 9f52c69e9d6..7f188555dc6 100644 --- a/src/mongo/db/repl/repl_set_config.h +++ b/src/mongo/db/repl/repl_set_config.h @@ -117,7 +117,9 @@ public: static const std::string kConfigServerFieldName; static const std::string kVersionFieldName; static const std::string kTermFieldName; - static const std::string kMajorityWriteConcernModeName; + static constexpr char kMajorityWriteConcernModeName[] = "$majority"; + static constexpr char kConfigMajorityWriteConcernModeName[] = "$configMajority"; + static constexpr char kConfigAllWriteConcernName[] = "$configAll"; // If this field is present, a repair operation potentially modified replicated data. This // should never be included in a valid configuration document. @@ -181,7 +183,7 @@ public: Status checkIfWriteConcernCanBeSatisfied(const WriteConcernOptions& writeConcern) const; /** - * Gets the version of this configuration. + * Gets and sets the version of this configuration. * * The version number sequences configurations of the replica set, so that * nodes may distinguish between "older" and "newer" configurations. @@ -190,8 +192,12 @@ public: return _version; } + void setConfigVersion(long long version) { + _version = version; + } + /** - * Gets the term of this configuration. + * Gets and sets the term of this configuration. * * The configuration term is the term of the primary that originally created this configuration. * Configurations in a replica set are totally ordered by their term and configuration version. @@ -200,6 +206,10 @@ public: return _term; } + void setConfigTerm(long long term) { + _term = term; + } + /** * Gets the (version, term) pair of this configuration. */ diff --git a/src/mongo/db/repl/repl_set_config_checks.cpp b/src/mongo/db/repl/repl_set_config_checks.cpp index 19648e8319a..9c7deaad980 100644 --- a/src/mongo/db/repl/repl_set_config_checks.cpp +++ b/src/mongo/db/repl/repl_set_config_checks.cpp @@ -195,19 +195,23 @@ Status validateSingleNodeChange(const ReplSetConfig& oldConfig, const ReplSetCon * primary under "oldConfig" and is electable under "newConfig". Such checks that * require knowledge of which node is executing the configuration are out of scope * for this function. + * + * When "force" is true, skips config version check, since the version is guaranteed + * to be valid either by "force" reconfig command or by internal use. */ Status validateOldAndNewConfigsCompatible(const ReplSetConfig& oldConfig, const ReplSetConfig& newConfig) { invariant(newConfig.isInitialized()); invariant(oldConfig.isInitialized()); - if (oldConfig.getConfigVersion() >= newConfig.getConfigVersion()) { - return Status(ErrorCodes::NewReplicaSetConfigurationIncompatible, - str::stream() - << "New replica set configuration version must be greater than old, but " - << newConfig.getConfigVersion() << " is not greater than " - << oldConfig.getConfigVersion() << " for replica set " - << newConfig.getReplSetName()); + if (oldConfig.getConfigVersionAndTerm() >= newConfig.getConfigVersionAndTerm()) { + return Status( + ErrorCodes::NewReplicaSetConfigurationIncompatible, + str::stream() + << "New replica set configuration version and term must be greater than old, but " + << newConfig.getConfigVersionAndTerm().toString() << " is not greater than " + << oldConfig.getConfigVersionAndTerm().toString() << " for replica set " + << newConfig.getReplSetName()); } if (oldConfig.getReplSetName() != newConfig.getReplSetName()) { diff --git a/src/mongo/db/repl/repl_set_config_checks_test.cpp b/src/mongo/db/repl/repl_set_config_checks_test.cpp index d48383c767a..22c17222c11 100644 --- a/src/mongo/db/repl/repl_set_config_checks_test.cpp +++ b/src/mongo/db/repl/repl_set_config_checks_test.cpp @@ -244,22 +244,12 @@ TEST_F(ServiceContextTest, ValidateConfigForReconfig_NewConfigVersionNumberMustB validateConfigForReconfig( &externalState, oldConfig, oldConfig, getGlobalServiceContext(), false) .getStatus()); - // Forced reconfigs also do not allow this. - ASSERT_EQUALS(ErrorCodes::NewReplicaSetConfigurationIncompatible, - validateConfigForReconfig( - &externalState, oldConfig, oldConfig, getGlobalServiceContext(), true) - .getStatus()); // Cannot reconfig from new to old (versions must increase). ASSERT_EQUALS(ErrorCodes::NewReplicaSetConfigurationIncompatible, validateConfigForReconfig( &externalState, newConfig, oldConfig, getGlobalServiceContext(), false) .getStatus()); - // Forced reconfigs also do not allow this. - ASSERT_EQUALS(ErrorCodes::NewReplicaSetConfigurationIncompatible, - validateConfigForReconfig( - &externalState, newConfig, oldConfig, getGlobalServiceContext(), true) - .getStatus()); } TEST_F(ServiceContextTest, ValidateConfigForReconfig_NewConfigMustNotChangeSetName) { diff --git a/src/mongo/db/repl/replication_coordinator.h b/src/mongo/db/repl/replication_coordinator.h index 9984d3803f3..6d6d1f37c88 100644 --- a/src/mongo/db/repl/replication_coordinator.h +++ b/src/mongo/db/repl/replication_coordinator.h @@ -688,6 +688,15 @@ public: const ReplSetReconfigArgs& args, BSONObjBuilder* resultObj) = 0; + /** + * Install the new config returned by the callback "getNewConfig". + */ + using GetNewConfigFn = std::function<StatusWith<ReplSetConfig>(const ReplSetConfig& oldConfig, + long long currentTerm)>; + virtual Status doReplSetReconfig(OperationContext* opCtx, + GetNewConfigFn getNewConfig, + bool force) = 0; + /* * Handles an incoming replSetInitiate command. If "configObj" is empty, generates a default * configuration to use. diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp index ee2a8bd820a..2531d08a5df 100644 --- a/src/mongo/db/repl/replication_coordinator_impl.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl.cpp @@ -186,26 +186,6 @@ void lockAndCall(stdx::unique_lock<Latch>* lk, const std::function<void()>& fn) fn(); } -/** - * Implements the force-reconfig behavior of incrementing config version by a large random - * number. - */ -BSONObj incrementConfigVersionByRandom(BSONObj config) { - BSONObjBuilder builder; - SecureRandom generator; - for (BSONObjIterator iter(config); iter.more(); iter.next()) { - BSONElement elem = *iter; - if (elem.fieldNameStringData() == ReplSetConfig::kVersionFieldName && elem.isNumber()) { - const int random = generator.nextInt32(100'000); - builder.appendIntOrLL(ReplSetConfig::kVersionFieldName, - elem.numberLong() + 10'000 + random); - } else { - builder.append(elem); - } - } - return builder.obj(); -} - template <typename T> StatusOrStatusWith<T> futureGetNoThrowWithDeadline(OperationContext* opCtx, SharedSemiFuture<T>& f, @@ -1695,22 +1675,11 @@ bool ReplicationCoordinatorImpl::_doneWaitingForReplication_inlock( // The syncMode cannot be unset. invariant(writeConcern.syncMode != WriteConcernOptions::SyncMode::UNSET); - // When waiting for the config to be replicated to a majority, we do not wait on a specific - // OpTime. We specifically populate waiters with a null OpTime. - invariant(writeConcern.wMode != WriteConcernOptions::kConfigMajority || opTime.isNull()); - const bool useDurableOpTime = writeConcern.syncMode == WriteConcernOptions::SyncMode::JOURNAL; - if (writeConcern.wMode.empty()) { return _topCoord->haveNumNodesReachedOpTime( opTime, writeConcern.wNumNodes, useDurableOpTime); } - - // Check that the nodes in the new config have replicated the config. - if (writeConcern.wMode == WriteConcernOptions::kConfigMajority) { - return _topCoord->haveMajorityReplicatedConfig(); - } - StringData patternName; if (writeConcern.wMode == WriteConcernOptions::kMajority) { if (_externalState->snapshotsEnabled() && !gTestingSnapshotBehaviorInIsolation) { @@ -1759,9 +1728,14 @@ bool ReplicationCoordinatorImpl::_doneWaitingForReplication_inlock( } else { patternName = writeConcern.wMode; } - auto tagPattern = uassertStatusOK(_rsConfig.findCustomWriteMode(patternName)); - return _topCoord->haveTaggedNodesReachedOpTime(opTime, tagPattern, useDurableOpTime); + if (writeConcern.checkCondition == WriteConcernOptions::CheckCondition::OpTime) { + return _topCoord->haveTaggedNodesReachedOpTime(opTime, tagPattern, useDurableOpTime); + } else { + invariant(writeConcern.checkCondition == WriteConcernOptions::CheckCondition::Config); + auto pred = _topCoord->makeConfigPredicate(); + return _topCoord->haveTaggedNodesSatisfiedCondition(pred, tagPattern); + } } ReplicationCoordinator::StatusAndDuration ReplicationCoordinatorImpl::awaitReplication( @@ -1839,22 +1813,16 @@ BSONObj ReplicationCoordinatorImpl::_getReplicationProgress(WithLock wl) const { SharedSemiFuture<void> ReplicationCoordinatorImpl::_startWaitingForReplication( WithLock wl, const OpTime& opTime, const WriteConcernOptions& writeConcern) { - // When waiting for the config to be replicated to a majority, we do not wait on a specific - // OpTime. We specifically populate waiters with a null OpTime. - invariant(writeConcern.wMode != WriteConcernOptions::kConfigMajority || opTime.isNull()); const Mode replMode = getReplicationMode(); if (replMode == modeNone) { // no replication check needed (validated above) return Future<void>::makeReady(); } - - if (opTime.isNull() && writeConcern.wMode != WriteConcernOptions::kConfigMajority) { - // If waiting for the empty optime, always say it's been replicated unless we're trying to - // replicate a new config. + if (opTime.isNull()) { + // If waiting for the empty optime, always say it's been replicated. return Future<void>::makeReady(); } - if (_inShutdown) { return Future<void>::makeReady( Status{ErrorCodes::ShutdownInProgress, "Replication is being shut down"}); @@ -1866,11 +1834,7 @@ SharedSemiFuture<void> ReplicationCoordinatorImpl::_startWaitingForReplication( "Primary stepped down while waiting for replication"}; } - // Checking for a config majority does not rely on waiting for a specific OpTime. So, we - // pass a null OpTime to awaitReplication, which means the term here will always be -1. - // Make sure we don't unnecessarily step down in this case. - if (writeConcern.wMode != WriteConcernOptions::kConfigMajority && - opTime.getTerm() != _topCoord->getTerm()) { + if (opTime.getTerm() != _topCoord->getTerm()) { return { ErrorCodes::PrimarySteppedDown, str::stream() << "Term changed from " << opTime.getTerm() << " to " @@ -2897,6 +2861,51 @@ Status ReplicationCoordinatorImpl::processReplSetReconfig(OperationContext* opCt "replSetReconfig admin command received from client; new config: {newConfig}", "newConfig"_attr = args.newConfigObj); + auto getNewConfig = [&](const ReplSetConfig& oldConfig, + long long currentTerm) -> StatusWith<ReplSetConfig> { + ReplSetConfig newConfig; + + // Only explicitly set configTerm for reconfig to this node's term if we're in FCV 4.4. + // Otherwise, use -1. + auto isUpgraded = serverGlobalParams.featureCompatibility.isVersion( + ServerGlobalParams::FeatureCompatibility::Version::kFullyUpgradedTo44); + auto term = (!args.force && isUpgraded) ? currentTerm : OpTime::kUninitializedTerm; + + // When initializing a new config through the replSetReconfig command, ignore the term + // field passed in through its args. Instead, use this node's term. + Status status = newConfig.initialize(args.newConfigObj, term, oldConfig.getReplicaSetId()); + if (!status.isOK()) { + LOGV2_ERROR(21418, + "replSetReconfig got {status} while parsing {newConfigObj}", + "status"_attr = status, + "newConfigObj"_attr = args.newConfigObj); + return Status(ErrorCodes::InvalidReplicaSetConfig, status.reason()); + } + if (newConfig.getReplSetName() != _settings.ourSetName()) { + str::stream errmsg; + errmsg << "Attempting to reconfigure a replica set with name " + << newConfig.getReplSetName() << ", but command line reports " + << _settings.ourSetName() << "; rejecting"; + LOGV2_ERROR( + 21419, "{std_string_errmsg}", "std_string_errmsg"_attr = std::string(errmsg)); + return Status(ErrorCodes::InvalidReplicaSetConfig, errmsg); + } + + // Increase the config version for force reconfig. + if (args.force) { + auto version = std::max(oldConfig.getConfigVersion(), newConfig.getConfigVersion()); + version += 10'000 + SecureRandom().nextInt32(100'000); + newConfig.setConfigVersion(version); + } + return newConfig; + }; + + return doReplSetReconfig(opCtx, getNewConfig, args.force); +} + +Status ReplicationCoordinatorImpl::doReplSetReconfig(OperationContext* opCtx, + GetNewConfigFn getNewConfig, + bool force) { stdx::unique_lock<Latch> lk(_mutex); while (_rsConfigState == kConfigPreStart || _rsConfigState == kConfigStartingUp) { @@ -2927,24 +2936,26 @@ Status ReplicationCoordinatorImpl::processReplSetReconfig(OperationContext* opCt invariant(_rsConfig.isInitialized()); - if (!args.force && !_getMemberState_inlock().primary()) { + if (!force && !_getMemberState_inlock().primary()) { return Status(ErrorCodes::NotMaster, str::stream() << "replSetReconfig should only be run on PRIMARY, but my state is " << _getMemberState_inlock().toString() << "; use the \"force\" argument to override"); } + auto topCoordTerm = _topCoord->getTerm(); + + WriteConcernOptions configWriteConcern(ReplSetConfig::kConfigMajorityWriteConcernModeName, + WriteConcernOptions::SyncMode::NONE, + WriteConcernOptions::kNoTimeout); + configWriteConcern.checkCondition = WriteConcernOptions::CheckCondition::Config; + // Construct a fake OpTime that can be accepted but isn't used. + OpTime fakeOpTime(Timestamp(1, 1), topCoordTerm); if (serverGlobalParams.featureCompatibility.isVersion( ServerGlobalParams::FeatureCompatibility::Version::kFullyUpgradedTo44) && - !args.force) { - WriteConcernOptions writeConcern( - WriteConcernOptions::kConfigMajority, - WriteConcernOptions::SyncMode::NONE, - // The timeout isn't used by _doneWaitingForReplication_inlock. - WriteConcernOptions::kNoTimeout); - - if (!_doneWaitingForReplication_inlock(OpTime(), writeConcern)) { + !force) { + if (!_doneWaitingForReplication_inlock(fakeOpTime, configWriteConcern)) { return Status(ErrorCodes::ConfigurationInProgress, str::stream() << "Cannot run replSetReconfig because the current config: " @@ -2986,46 +2997,21 @@ Status ReplicationCoordinatorImpl::processReplSetReconfig(OperationContext* opCt makeGuard([&] { lockAndCall(&lk, [=] { _setConfigState_inlock(kConfigSteady); }); }); ReplSetConfig oldConfig = _rsConfig; - // Only explicitly set configTerm for reconfig to this node's term if we're in FCV 4.4. - // Otherwise, use -1. - // Make sure we get the term from the topCoord under the replCoord mutex. - auto topCoordTerm = serverGlobalParams.featureCompatibility.isVersion( - ServerGlobalParams::FeatureCompatibility::Version::kFullyUpgradedTo44) - ? _topCoord->getTerm() - : OpTime::kUninitializedTerm; lk.unlock(); - ReplSetConfig newConfig; - BSONObj newConfigObj = args.newConfigObj; - if (args.force) { - newConfigObj = incrementConfigVersionByRandom(newConfigObj); - } + // Call the callback to get the new config given the old one. + auto newConfigStatus = getNewConfig(oldConfig, topCoordTerm); + Status status = newConfigStatus.getStatus(); + if (!status.isOK()) + return status; + ReplSetConfig newConfig = newConfigStatus.getValue(); BSONObj oldConfigObj = oldConfig.toBSON(); + BSONObj newConfigObj = newConfig.toBSON(); audit::logReplSetReconfig(opCtx->getClient(), &oldConfigObj, &newConfigObj); - // When initializing a new config through the replSetReconfig command, ignore the term - // field passed in through its args. Instead, use this node's term. - // If it is a force reconfig, explicitly set the term to -1. - auto term = !args.force ? topCoordTerm : OpTime::kUninitializedTerm; - Status status = newConfig.initialize(newConfigObj, term, oldConfig.getReplicaSetId()); - if (!status.isOK()) { - LOGV2_ERROR(21418, - "replSetReconfig got {status} while parsing {newConfigObj}", - "status"_attr = status, - "newConfigObj"_attr = newConfigObj); - return Status(ErrorCodes::InvalidReplicaSetConfig, status.reason()); - } - if (newConfig.getReplSetName() != _settings.ourSetName()) { - str::stream errmsg; - errmsg << "Attempting to reconfigure a replica set with name " << newConfig.getReplSetName() - << ", but command line reports " << _settings.ourSetName() << "; rejecting"; - LOGV2_ERROR(21419, "{errmsg}", "errmsg"_attr = std::string(errmsg)); - return Status(ErrorCodes::InvalidReplicaSetConfig, errmsg); - } - StatusWith<int> myIndex = validateConfigForReconfig( - _externalState.get(), oldConfig, newConfig, opCtx->getServiceContext(), args.force); + _externalState.get(), oldConfig, newConfig, opCtx->getServiceContext(), force); if (!myIndex.isOK()) { LOGV2_ERROR(21420, "replSetReconfig got {status} while validating {newConfigObj}", @@ -3039,7 +3025,7 @@ Status ReplicationCoordinatorImpl::processReplSetReconfig(OperationContext* opCt "replSetReconfig config object with {numMembers} members parses ok", "numMembers"_attr = newConfig.getNumMembers()); - if (!args.force && !MONGO_unlikely(omitConfigQuorumCheck.shouldFail())) { + if (!force && !MONGO_unlikely(omitConfigQuorumCheck.shouldFail())) { status = checkQuorumForReconfig( _replExecutor.get(), newConfig, myIndex.getValue(), _topCoord->getTerm()); if (!status.isOK()) { @@ -3058,17 +3044,15 @@ Status ReplicationCoordinatorImpl::processReplSetReconfig(OperationContext* opCt } configStateGuard.dismiss(); - _finishReplSetReconfig(opCtx, newConfig, args.force, myIndex.getValue()); + _finishReplSetReconfig(opCtx, newConfig, force, myIndex.getValue()); - if (!args.force && + if (!force && serverGlobalParams.featureCompatibility.isVersion( ServerGlobalParams::FeatureCompatibility::Version::kFullyUpgradedTo44)) { // Wait for the config document to be replicated to a majority of nodes in the new // config. - WriteConcernOptions writeConcern(WriteConcernOptions::kConfigMajority, - WriteConcernOptions::SyncMode::NONE, - WriteConcernOptions::kNoTimeout); - StatusAndDuration configAwaitStatus = awaitReplication(opCtx, OpTime(), writeConcern); + StatusAndDuration configAwaitStatus = + awaitReplication(opCtx, fakeOpTime, configWriteConcern); uassertStatusOK(configAwaitStatus.status); // Now that the new config has been persisted and installed in memory, wait for the latest diff --git a/src/mongo/db/repl/replication_coordinator_impl.h b/src/mongo/db/repl/replication_coordinator_impl.h index 39be77acb89..562417d99d9 100644 --- a/src/mongo/db/repl/replication_coordinator_impl.h +++ b/src/mongo/db/repl/replication_coordinator_impl.h @@ -241,6 +241,10 @@ public: const ReplSetReconfigArgs& args, BSONObjBuilder* resultObj) override; + virtual Status doReplSetReconfig(OperationContext* opCtx, + GetNewConfigFn getNewConfig, + bool force) override; + virtual Status processReplSetInitiate(OperationContext* opCtx, const BSONObj& configObj, BSONObjBuilder* resultObj) override; diff --git a/src/mongo/db/repl/replication_coordinator_mock.cpp b/src/mongo/db/repl/replication_coordinator_mock.cpp index b8ff01aefa4..10189e95718 100644 --- a/src/mongo/db/repl/replication_coordinator_mock.cpp +++ b/src/mongo/db/repl/replication_coordinator_mock.cpp @@ -379,6 +379,12 @@ Status ReplicationCoordinatorMock::processReplSetReconfig(OperationContext* opCt return Status::OK(); } +Status ReplicationCoordinatorMock::doReplSetReconfig(OperationContext* opCtx, + GetNewConfigFn getNewConfig, + bool force) { + return Status::OK(); +} + Status ReplicationCoordinatorMock::processReplSetInitiate(OperationContext* opCtx, const BSONObj& configObj, BSONObjBuilder* resultObj) { diff --git a/src/mongo/db/repl/replication_coordinator_mock.h b/src/mongo/db/repl/replication_coordinator_mock.h index 80a368d25a2..57f60ece3cd 100644 --- a/src/mongo/db/repl/replication_coordinator_mock.h +++ b/src/mongo/db/repl/replication_coordinator_mock.h @@ -209,6 +209,10 @@ public: const ReplSetReconfigArgs& args, BSONObjBuilder* resultObj); + virtual Status doReplSetReconfig(OperationContext* opCtx, + GetNewConfigFn getNewConfig, + bool force); + virtual Status processReplSetInitiate(OperationContext* opCtx, const BSONObj& configObj, BSONObjBuilder* resultObj); diff --git a/src/mongo/db/repl/replication_coordinator_noop.cpp b/src/mongo/db/repl/replication_coordinator_noop.cpp index 14089bcba5b..1605c42b131 100644 --- a/src/mongo/db/repl/replication_coordinator_noop.cpp +++ b/src/mongo/db/repl/replication_coordinator_noop.cpp @@ -321,6 +321,12 @@ Status ReplicationCoordinatorNoOp::processReplSetReconfig(OperationContext*, MONGO_UNREACHABLE; } +Status ReplicationCoordinatorNoOp::doReplSetReconfig(OperationContext* opCtx, + GetNewConfigFn getNewConfig, + bool force) { + MONGO_UNREACHABLE; +} + Status ReplicationCoordinatorNoOp::processReplSetInitiate(OperationContext*, const BSONObj&, BSONObjBuilder*) { diff --git a/src/mongo/db/repl/replication_coordinator_noop.h b/src/mongo/db/repl/replication_coordinator_noop.h index 4c63d307335..3bc608f6b54 100644 --- a/src/mongo/db/repl/replication_coordinator_noop.h +++ b/src/mongo/db/repl/replication_coordinator_noop.h @@ -183,6 +183,10 @@ public: const ReplSetReconfigArgs&, BSONObjBuilder*) final; + Status doReplSetReconfig(OperationContext* opCtx, + GetNewConfigFn getNewConfig, + bool force) final; + Status processReplSetInitiate(OperationContext*, const BSONObj&, BSONObjBuilder*) final; Status processReplSetUpdatePosition(const UpdatePositionArgs&, long long*) final; diff --git a/src/mongo/db/repl/topology_coordinator.cpp b/src/mongo/db/repl/topology_coordinator.cpp index 3ffbbe4a695..697a2136884 100644 --- a/src/mongo/db/repl/topology_coordinator.cpp +++ b/src/mongo/db/repl/topology_coordinator.cpp @@ -945,29 +945,6 @@ HeartbeatResponseAction TopologyCoordinator::processHeartbeatResponse( return nextAction; } -bool TopologyCoordinator::haveMajorityReplicatedConfig() { - auto configWriteMaj = _rsConfig.getWriteMajority(); - auto numNodesWithReplicatedConfig = 0; - for (auto&& memberData : _memberData) { - // If this member is not in our new config, do not count it in the majority - if (_rsConfig.findMemberByID(memberData.getMemberId().getData()) == NULL) { - continue; - } - - if (memberData.getConfigVersionAndTerm() == _rsConfig.getConfigVersionAndTerm()) { - // configVersionAndTerm comparison will compare config versions alone and ignore terms - // if the config term is -1, which makes this compatible with 4.2 heartbeat responses. - numNodesWithReplicatedConfig++; - } - - // Once we know a majority of the nodes have replicated the config return. - if (numNodesWithReplicatedConfig >= configWriteMaj) - return true; - } - - return false; -} - bool TopologyCoordinator::haveNumNodesReachedOpTime(const OpTime& targetOpTime, int numNodes, bool durablyWritten) { @@ -1016,16 +993,20 @@ bool TopologyCoordinator::haveNumNodesReachedOpTime(const OpTime& targetOpTime, bool TopologyCoordinator::haveTaggedNodesReachedOpTime(const OpTime& opTime, const ReplSetTagPattern& tagPattern, bool durablyWritten) { - ReplSetTagMatch matcher(tagPattern); + auto pred = makeOpTimePredicate(opTime, durablyWritten); + return haveTaggedNodesSatisfiedCondition(pred, tagPattern); +} +TopologyCoordinator::MemberPredicate TopologyCoordinator::makeOpTimePredicate(const OpTime& opTime, + bool durablyWritten) { // Invariants that we only wait for an OpTime in the term that this node is currently writing // to. In other words, we do not support waiting for an OpTime written by a previous primary // because comparing members' lastApplied/lastDurable alone is not sufficient to tell if the // OpTime has been replicated. invariant(opTime.getTerm() == getMyLastAppliedOpTime().getTerm()); - for (auto&& memberData : _memberData) { - const OpTime& memberOpTime = + return [=](const MemberData& memberData) { + auto memberOpTime = durablyWritten ? memberData.getLastDurableOpTime() : memberData.getLastAppliedOpTime(); // In addition to checking if a member has a greater/equal timestamp field we also need to @@ -1034,16 +1015,29 @@ bool TopologyCoordinator::haveTaggedNodesReachedOpTime(const OpTime& opTime, // thus we do not know if the target OpTime in our previous term has been replicated to the // member because the memberOpTime in a higher term could correspond to an operation in a // divergent branch of history regardless of its timestamp. - if (memberOpTime.getTerm() == opTime.getTerm() && - memberOpTime.getTimestamp() >= opTime.getTimestamp()) { - // This node has reached the desired optime, now we need to check if it is a part + return memberOpTime.getTerm() == opTime.getTerm() && + memberOpTime.getTimestamp() >= opTime.getTimestamp(); + }; +} + +TopologyCoordinator::MemberPredicate TopologyCoordinator::makeConfigPredicate() { + return [&](const MemberData& memberData) { + return memberData.getConfigVersionAndTerm() == _rsConfig.getConfigVersionAndTerm(); + }; +} + +bool TopologyCoordinator::haveTaggedNodesSatisfiedCondition( + std::function<bool(const MemberData&)> pred, const ReplSetTagPattern& tagPattern) { + ReplSetTagMatch matcher(tagPattern); + + for (auto&& memberData : _memberData) { + if (pred(memberData)) { + // This node has satisfied the predicate, now we need to check if it is a part // of the tagPattern. int memberIndex = memberData.getConfigIndex(); invariant(memberIndex >= 0); const MemberConfig& memberConfig = _rsConfig.getMemberAt(memberIndex); - for (MemberConfig::TagIterator it = memberConfig.tagsBegin(); - it != memberConfig.tagsEnd(); - ++it) { + for (auto&& it = memberConfig.tagsBegin(); it != memberConfig.tagsEnd(); ++it) { if (matcher.update(*it)) { return true; } diff --git a/src/mongo/db/repl/topology_coordinator.h b/src/mongo/db/repl/topology_coordinator.h index f60d9a92044..20c2bbe8cbf 100644 --- a/src/mongo/db/repl/topology_coordinator.h +++ b/src/mongo/db/repl/topology_coordinator.h @@ -413,15 +413,6 @@ public: const StatusWith<ReplSetHeartbeatResponse>& hbResponse); /** - * Returns whether or not at least a majority of voting, data-bearing nodes in the current - * replica set config have replicated the config. - * This loops through _memberData and checks if: - * (1) The member is in the current config - * (2) The member's config (term, version) match the current config's (term, version) - */ - bool haveMajorityReplicatedConfig(); - - /** * Returns whether or not at least 'numNodes' have reached the given opTime with the same term. * "durablyWritten" indicates whether the operation has to be durably applied. */ @@ -436,6 +427,25 @@ public: const ReplSetTagPattern& tagPattern, bool durablyWritten); + using MemberPredicate = std::function<bool(const MemberData&)>; + + /** + * Return the predicate that tests if a member has reached the target OpTime. + */ + MemberPredicate makeOpTimePredicate(const OpTime& opTime, bool durablyWritten); + + /** + * Return the predicate that tests if a member has replicated the given config. + */ + MemberPredicate makeConfigPredicate(); + + /** + * Returns whether or not at least one node matching the tagPattern has satisfied the given + * condition. + */ + bool haveTaggedNodesSatisfiedCondition(MemberPredicate pred, + const ReplSetTagPattern& tagPattern); + /** * Returns a vector of members that have applied the operation with OpTime 'op'. * "durablyWritten" indicates whether the operation has to be durably applied. diff --git a/src/mongo/db/repl/topology_coordinator_v1_test.cpp b/src/mongo/db/repl/topology_coordinator_v1_test.cpp index 8902ba19ea3..5675c52d10e 100644 --- a/src/mongo/db/repl/topology_coordinator_v1_test.cpp +++ b/src/mongo/db/repl/topology_coordinator_v1_test.cpp @@ -5114,7 +5114,31 @@ TEST_F(TopoCoordTest, TwoNodesEligibleForElectionHandoffEqualPriorityResolveByMe ASSERT_EQUALS(1, getTopoCoord().chooseElectionHandoffCandidate()); } -TEST_F(TopoCoordTest, MajorityReplicatedConfigChecks) { +class ConfigReplicationTest : public TopoCoordTest { +public: + void setUp() override { + TopoCoordTest::setUp(); + } + + void simulateHBWithConfigVersionAndTerm(size_t remoteIndex) { + // Simulate a heartbeat to the remote. + auto member = getCurrentConfig().getMemberAt(remoteIndex); + getTopoCoord().prepareHeartbeatRequestV1( + now(), getCurrentConfig().getReplSetName(), member.getHostAndPort()); + // Simulate heartbeat response from the remote. + // This will call setUpValues, which will update the memberData for the remote. + ReplSetHeartbeatResponse hb; + hb.setConfigVersion(getCurrentConfig().getConfigVersion()); + hb.setConfigTerm(getCurrentConfig().getConfigTerm()); + hb.setState(member.isArbiter() ? MemberState::RS_ARBITER : MemberState::RS_SECONDARY); + StatusWith<ReplSetHeartbeatResponse> hbResponse = StatusWith<ReplSetHeartbeatResponse>(hb); + + getTopoCoord().processHeartbeatResponse( + now(), Milliseconds(0), member.getHostAndPort(), hbResponse); + } +}; + +TEST_F(ConfigReplicationTest, MajorityReplicatedConfigChecks) { // Config with three nodes requires at least 2 nodes to replicate the config. updateConfig(BSON("_id" << "rs0" @@ -5130,27 +5154,21 @@ TEST_F(TopoCoordTest, MajorityReplicatedConfigChecks) { // makeSelfPrimary() doesn't actually conduct a true election, so the term will still be 0. makeSelfPrimary(); + auto tagPatternStatus = + getCurrentConfig().findCustomWriteMode(ReplSetConfig::kConfigMajorityWriteConcernModeName); + ASSERT_TRUE(tagPatternStatus.isOK()); + auto tagPattern = tagPatternStatus.getValue(); + auto pred = getTopoCoord().makeConfigPredicate(); + // Currently, only we have config 2 (the initial config). // This simulates a reconfig where only the primary has written down the configVersion and // configTerm. - ASSERT_FALSE(getTopoCoord().haveMajorityReplicatedConfig()); + ASSERT_FALSE(getTopoCoord().haveTaggedNodesSatisfiedCondition(pred, tagPattern)); // Simulate a heartbeat to one of the other nodes. - getTopoCoord().prepareHeartbeatRequestV1(now(), "rs0", HostAndPort("host1")); - // Simulate heartbeat response from host1. This will call setUpValues, which will update the - // memberData for host1. - ReplSetHeartbeatResponse hb; - hb.setConfigVersion(2); - hb.setConfigTerm(0); - hb.setState(MemberState::RS_SECONDARY); - StatusWith<ReplSetHeartbeatResponse> hbResponse = StatusWith<ReplSetHeartbeatResponse>(hb); - - now() += Milliseconds(1); - getTopoCoord().processHeartbeatResponse( - now(), Milliseconds(1), HostAndPort("host1"), hbResponse); - + simulateHBWithConfigVersionAndTerm(1); // Now, node 0 and node 1 should have the current config. - ASSERT_TRUE(getTopoCoord().haveMajorityReplicatedConfig()); + ASSERT_TRUE(getTopoCoord().haveTaggedNodesSatisfiedCondition(pred, tagPattern)); // Update _rsConfig with config(v: 3, t: 0). configTerms are the same but configVersion 3 is // higher than 2. @@ -5168,24 +5186,12 @@ TEST_F(TopoCoordTest, MajorityReplicatedConfigChecks) { // Currently, only we have config 3. // This simulates a reconfig where only the primary has written down the configVersion and // configTerm. - ASSERT_FALSE(getTopoCoord().haveMajorityReplicatedConfig()); + ASSERT_FALSE(getTopoCoord().haveTaggedNodesSatisfiedCondition(pred, tagPattern)); // Simulate a heartbeat to one of the other nodes. - getTopoCoord().prepareHeartbeatRequestV1(now(), "rs0", HostAndPort("host2")); - // Simulate heartbeat response from host2. This will call setUpValues, which will update the - // memberData for host2. - ReplSetHeartbeatResponse hb2; - hb2.setConfigVersion(3); - hb2.setConfigTerm(0); - hb2.setState(MemberState::RS_SECONDARY); - StatusWith<ReplSetHeartbeatResponse> hbResponse2 = StatusWith<ReplSetHeartbeatResponse>(hb2); - - now() += Milliseconds(1); - getTopoCoord().processHeartbeatResponse( - now(), Milliseconds(1), HostAndPort("host2"), hbResponse2); - + simulateHBWithConfigVersionAndTerm(2); // Now, node 0 and node 2 should have the current config. - ASSERT_TRUE(getTopoCoord().haveMajorityReplicatedConfig()); + ASSERT_TRUE(getTopoCoord().haveTaggedNodesSatisfiedCondition(pred, tagPattern)); // Update _rsConfig with config(v: 4, t: 1). configTerm 1 is higher than configTerm 0. updateConfig(BSON("_id" @@ -5200,30 +5206,14 @@ TEST_F(TopoCoordTest, MajorityReplicatedConfigChecks) { 0); // Currently, only we have config 4. - // This simulates a reconfig where only the primary has written down the configVersion and - // configTerm. - ASSERT_FALSE(getTopoCoord().haveMajorityReplicatedConfig()); - - // Simulate a heartbeat to one of the other nodes. - getTopoCoord().prepareHeartbeatRequestV1(now(), "rs0", HostAndPort("host1")); - // Simulate heartbeat response from host1. This will call setUpValues, which will update the - // memberData for host1. - ReplSetHeartbeatResponse hb3; - hb3.setConfigVersion(4); - hb3.setConfigTerm(1); - hb3.setState(MemberState::RS_SECONDARY); - StatusWith<ReplSetHeartbeatResponse> hbResponse3 = StatusWith<ReplSetHeartbeatResponse>(hb3); - - now() += Milliseconds(1); - getTopoCoord().processHeartbeatResponse( - now(), Milliseconds(1), HostAndPort("host1"), hbResponse3); + ASSERT_FALSE(getTopoCoord().haveTaggedNodesSatisfiedCondition(pred, tagPattern)); + simulateHBWithConfigVersionAndTerm(1); // Now, node 0 and node 1 should have the current config. - ASSERT_TRUE(getTopoCoord().haveMajorityReplicatedConfig()); + ASSERT_TRUE(getTopoCoord().haveTaggedNodesSatisfiedCondition(pred, tagPattern)); } -TEST_F(TopoCoordTest, ArbiterNotIncludedInMajorityReplicatedConfigChecks) { - // Config with only one data-bearing node only needs one node to majority replicate config. +TEST_F(ConfigReplicationTest, ArbiterIncludedInMajorityReplicatedConfigChecks) { updateConfig(BSON("_id" << "rs0" << "version" << 2 << "term" << 0 << "members" @@ -5231,15 +5221,89 @@ TEST_F(TopoCoordTest, ArbiterNotIncludedInMajorityReplicatedConfigChecks) { << "host0:27017") << BSON("_id" << 1 << "host" << "host1:27017" + << "arbiterOnly" << true) + << BSON("_id" << 2 << "host" + << "host2:27017"))), + 0); + + // makeSelfPrimary() doesn't actually conduct a true election, so the term will still be 0. + makeSelfPrimary(); + + // Currently, only the primary has config 2 (the initial config). + auto tagPattern = + getCurrentConfig().findCustomWriteMode(ReplSetConfig::kConfigMajorityWriteConcernModeName); + ASSERT_TRUE(tagPattern.isOK()); + auto pred = getTopoCoord().makeConfigPredicate(); + ASSERT_FALSE(getTopoCoord().haveTaggedNodesSatisfiedCondition(pred, tagPattern.getValue())); + + simulateHBWithConfigVersionAndTerm(1); + ASSERT_TRUE(getTopoCoord().haveTaggedNodesSatisfiedCondition(pred, tagPattern.getValue())); +} + +TEST_F(ConfigReplicationTest, ArbiterIncludedInAllReplicatedConfigChecks) { + updateConfig(BSON("_id" + << "rs0" + << "version" << 2 << "term" << 0 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "host0:27017") + << BSON("_id" << 1 << "host" + << "host1:27017") + << BSON("_id" << 2 << "host" + << "host2:27017" << "arbiterOnly" << true))), 0); // makeSelfPrimary() doesn't actually conduct a true election, so the term will still be 0. makeSelfPrimary(); - // Currently, only the primary has config 2 (the initial config). But, since arbiters do not - // count towards the config majority, haveMajorityReplicatedConfig() should still return true. - ASSERT_TRUE(getTopoCoord().haveMajorityReplicatedConfig()); + // Currently, only the primary has config 2 (the initial config). + auto tagPattern = + getCurrentConfig().findCustomWriteMode(ReplSetConfig::kConfigAllWriteConcernName); + ASSERT_TRUE(tagPattern.isOK()); + auto pred = getTopoCoord().makeConfigPredicate(); + ASSERT_FALSE(getTopoCoord().haveTaggedNodesSatisfiedCondition(pred, tagPattern.getValue())); + + simulateHBWithConfigVersionAndTerm(1); + ASSERT_FALSE(getTopoCoord().haveTaggedNodesSatisfiedCondition(pred, tagPattern.getValue())); + + simulateHBWithConfigVersionAndTerm(2); + ASSERT_TRUE(getTopoCoord().haveTaggedNodesSatisfiedCondition(pred, tagPattern.getValue())); +} + +TEST_F(ConfigReplicationTest, ArbiterAndNonVotingNodeIncludedInAllReplicatedConfigChecks) { + updateConfig(BSON("_id" + << "rs0" + << "version" << 2 << "term" << 0 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "host0:27017") + << BSON("_id" << 1 << "host" + << "host1:27017") + << BSON("_id" << 2 << "host" + << "host2:27017" + << "votes" << 0 << "priority" << 0) + << BSON("_id" << 3 << "host" + << "host3:27017" + << "arbiterOnly" << true))), + 0); + + // makeSelfPrimary() doesn't actually conduct a true election, so the term will still be 0. + makeSelfPrimary(); + + // Currently, only the primary has config 2 (the initial config). + auto tagPattern = + getCurrentConfig().findCustomWriteMode(ReplSetConfig::kConfigAllWriteConcernName); + ASSERT_TRUE(tagPattern.isOK()); + auto pred = getTopoCoord().makeConfigPredicate(); + ASSERT_FALSE(getTopoCoord().haveTaggedNodesSatisfiedCondition(pred, tagPattern.getValue())); + + simulateHBWithConfigVersionAndTerm(1); + ASSERT_FALSE(getTopoCoord().haveTaggedNodesSatisfiedCondition(pred, tagPattern.getValue())); + + simulateHBWithConfigVersionAndTerm(2); + ASSERT_FALSE(getTopoCoord().haveTaggedNodesSatisfiedCondition(pred, tagPattern.getValue())); + + simulateHBWithConfigVersionAndTerm(3); + ASSERT_TRUE(getTopoCoord().haveTaggedNodesSatisfiedCondition(pred, tagPattern.getValue())); } TEST_F(TopoCoordTest, ArbiterNotIncludedInW3WriteInPSSAReplSet) { diff --git a/src/mongo/db/write_concern_options.cpp b/src/mongo/db/write_concern_options.cpp index 8ffdd14baf7..298941fbdef 100644 --- a/src/mongo/db/write_concern_options.cpp +++ b/src/mongo/db/write_concern_options.cpp @@ -67,7 +67,6 @@ constexpr int WriteConcernOptions::kNoWaiting; constexpr StringData WriteConcernOptions::kWriteConcernField; const char WriteConcernOptions::kMajority[] = "majority"; -const char WriteConcernOptions::kConfigMajority[] = "configMajority"; const BSONObj WriteConcernOptions::Default = BSONObj(); const BSONObj WriteConcernOptions::Acknowledged(BSON("w" << W_NORMAL)); diff --git a/src/mongo/db/write_concern_options.h b/src/mongo/db/write_concern_options.h index fbd870b1e4a..07cd6a6e1d7 100644 --- a/src/mongo/db/write_concern_options.h +++ b/src/mongo/db/write_concern_options.h @@ -42,6 +42,10 @@ struct WriteConcernOptions { public: enum class SyncMode { UNSET, NONE, FSYNC, JOURNAL }; + // This specifies the condition to check to satisfy given tags. + // Users can only provide OpTime condition, the others are used internally. + enum class CheckCondition { OpTime, Config }; + static constexpr int kNoTimeout = 0; static constexpr int kNoWaiting = -1; @@ -52,8 +56,7 @@ public: static const BSONObj ConfigMajority; static constexpr StringData kWriteConcernField = "writeConcern"_sd; - static const char kMajority[]; // = "majority" - static const char kConfigMajority[]; // = "configMajority" + static const char kMajority[]; // = "majority" static constexpr Seconds kWriteConcernTimeoutSystem{15}; static constexpr Seconds kWriteConcernTimeoutMigration{30}; @@ -130,6 +133,8 @@ public: // True if the default 'w' value of w:1 was used. bool usedDefaultW = false; + CheckCondition checkCondition = CheckCondition::OpTime; + private: ReadWriteConcernProvenance _provenance; }; diff --git a/src/mongo/embedded/replication_coordinator_embedded.cpp b/src/mongo/embedded/replication_coordinator_embedded.cpp index fabaa6f64f5..0238e7ea547 100644 --- a/src/mongo/embedded/replication_coordinator_embedded.cpp +++ b/src/mongo/embedded/replication_coordinator_embedded.cpp @@ -346,6 +346,12 @@ Status ReplicationCoordinatorEmbedded::processReplSetReconfig(OperationContext*, UASSERT_NOT_IMPLEMENTED; } +Status ReplicationCoordinatorEmbedded::doReplSetReconfig(OperationContext* opCtx, + GetNewConfigFn getNewConfig, + bool force) { + UASSERT_NOT_IMPLEMENTED; +} + Status ReplicationCoordinatorEmbedded::processReplSetInitiate(OperationContext*, const BSONObj&, BSONObjBuilder*) { diff --git a/src/mongo/embedded/replication_coordinator_embedded.h b/src/mongo/embedded/replication_coordinator_embedded.h index a0028c7591b..9bff367083c 100644 --- a/src/mongo/embedded/replication_coordinator_embedded.h +++ b/src/mongo/embedded/replication_coordinator_embedded.h @@ -190,6 +190,10 @@ public: const ReplSetReconfigArgs&, BSONObjBuilder*) override; + Status doReplSetReconfig(OperationContext* opCtx, + GetNewConfigFn getNewConfig, + bool force) override; + Status processReplSetInitiate(OperationContext*, const BSONObj&, BSONObjBuilder*) override; Status processReplSetUpdatePosition(const repl::UpdatePositionArgs&, long long*) override; diff --git a/src/mongo/shell/replsettest.js b/src/mongo/shell/replsettest.js index e88d5fad36c..be2f5e64128 100644 --- a/src/mongo/shell/replsettest.js +++ b/src/mongo/shell/replsettest.js @@ -1199,6 +1199,8 @@ var ReplSetTest = function(opts) { assert.commandWorked( self.getPrimary().adminCommand({setFeatureCompatibilityVersion: fcv})); checkFCV(self.getPrimary().getDB("admin"), lastStableFCV); + print("Fetch the config version from primay since 4.4 downgrade runs a reconfig."); + config.version = self.getReplSetConfigFromNode().version; }); } @@ -1221,7 +1223,7 @@ var ReplSetTest = function(opts) { if (originalSettings) { config.settings = originalSettings; } - config.version = 2; + config.version = config.version ? config.version + 1 : 2; // Nodes started with the --configsvr flag must have configsvr = true in their config. if (this.nodes[0].hasOwnProperty("fullOptions") && |