diff options
author | Benety Goh <benety@mongodb.com> | 2017-06-16 15:35:06 -0400 |
---|---|---|
committer | Benety Goh <benety@mongodb.com> | 2017-07-12 22:44:33 -0400 |
commit | ee6d550e81773fafd2a981b100ab520b73970c5e (patch) | |
tree | ff99f40f5c38594ef3d4909fb3c3103cdb2e3b27 | |
parent | a889b0d79a17eeed1f548a227e13ad553d1b32a2 (diff) | |
download | mongo-ee6d550e81773fafd2a981b100ab520b73970c5e.tar.gz |
SERVER-29015 TopologyCoordinator should not transition to candidate role in a single node replica set if we are in maintenance mode
(cherry picked from commit 5dd64f88d2b66078c957eea5a7889076ee5956b6)
5 files changed, 157 insertions, 14 deletions
diff --git a/src/mongo/db/repl/replication_coordinator_impl_test.cpp b/src/mongo/db/repl/replication_coordinator_impl_test.cpp index 6ee1b399cdd..e586f1f0e9e 100644 --- a/src/mongo/db/repl/replication_coordinator_impl_test.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl_test.cpp @@ -1815,6 +1815,45 @@ TEST_F(ReplCoordTest, NodeBecomesPrimaryAgainWhenStepDownTimeoutExpiresInASingle ASSERT_TRUE(getReplCoord()->getMemberState().primary()); } +TEST_F( + ReplCoordTest, + NodeGoesIntoRecoveryAgainWhenStepDownTimeoutExpiresInASingleNodeSetAndWeAreInMaintenanceMode) { + init("mySet"); + + assertStartSuccess(BSON("_id" + << "mySet" + << "version" + << 1 + << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "test1:1234"))), + HostAndPort("test1", 1234)); + runSingleNodeElection(makeOperationContext(), getReplCoord(), getNet()); + const auto opCtx = makeOperationContext(); + + ASSERT_OK(getReplCoord()->stepDown(opCtx.get(), true, Milliseconds(0), Milliseconds(1000))); + getNet()->enterNetwork(); // Must do this before inspecting the topocoord + Date_t stepdownUntil = getNet()->now() + Seconds(1); + ASSERT_EQUALS(stepdownUntil, getTopoCoord().getStepDownTime()); + ASSERT_TRUE(getTopoCoord().getMemberState().secondary()); + ASSERT_TRUE(getReplCoord()->getMemberState().secondary()); + + // Go into maintenance mode. + ASSERT_EQUALS(0, getTopoCoord().getMaintenanceCount()); + ASSERT_FALSE(getReplCoord()->getMaintenanceMode()); + ASSERT_OK(getReplCoord()->setMaintenanceMode(true)); + ASSERT_EQUALS(1, getTopoCoord().getMaintenanceCount()); + ASSERT_TRUE(getReplCoord()->getMaintenanceMode()); + + // Now run time forward and make sure that the node goes into RECOVERING again when the stepdown + // period ends. 
+ getNet()->runUntil(stepdownUntil); + ASSERT_EQUALS(stepdownUntil, getNet()->now()); + ASSERT_EQUALS(MemberState(MemberState::RS_RECOVERING), getTopoCoord().getMemberState()); + getNet()->exitNetwork(); + ASSERT_EQUALS(MemberState(MemberState::RS_RECOVERING), getReplCoord()->getMemberState()); +} + TEST_F(StepDownTest, NodeReturnsExceededTimeLimitWhenNoSecondaryIsCaughtUpWithinStepDownsSecondaryCatchUpPeriod) { OpTimeWithTermOne optime1(100, 1); diff --git a/src/mongo/db/repl/topology_coordinator_impl.cpp b/src/mongo/db/repl/topology_coordinator_impl.cpp index 75231166588..88337b6a500 100644 --- a/src/mongo/db/repl/topology_coordinator_impl.cpp +++ b/src/mongo/db/repl/topology_coordinator_impl.cpp @@ -1852,11 +1852,10 @@ TopologyCoordinatorImpl::prepareFreezeResponse(Date_t now, int secs, BSONObjBuil log() << "'unfreezing'"; response->append("info", "unfreezing"); - if (_followerMode == MemberState::RS_SECONDARY && _rsConfig.getNumMembers() == 1 && - _selfIndex == 0 && _rsConfig.getMemberAt(_selfIndex).isElectable()) { + if (_isElectableNodeInSingleNodeReplicaSet()) { // If we are a one-node replica set, we're the one member, - // we're electable, and we are currently in followerMode SECONDARY, - // we must transition to candidate now that our stepdown period + // we're electable, we're not in maintenance mode, and we are currently in followerMode + // SECONDARY, we must transition to candidate now that our stepdown period // is no longer active, in leiu of heartbeats. 
_role = Role::candidate; return PrepareFreezeResponseResult::kElectSelf; @@ -1877,11 +1876,10 @@ bool TopologyCoordinatorImpl::becomeCandidateIfStepdownPeriodOverAndSingleNodeSe return false; } - if (_followerMode == MemberState::RS_SECONDARY && _rsConfig.getNumMembers() == 1 && - _selfIndex == 0 && _rsConfig.getMemberAt(_selfIndex).isElectable()) { + if (_isElectableNodeInSingleNodeReplicaSet()) { // If the new config describes a one-node replica set, we're the one member, - // we're electable, and we are currently in followerMode SECONDARY, - // we must transition to candidate, in leiu of heartbeats. + // we're electable, we're not in maintenance mode, and we are currently in followerMode + // SECONDARY, we must transition to candidate, in lieu of heartbeats. _role = Role::candidate; return true; } @@ -1984,11 +1982,10 @@ void TopologyCoordinatorImpl::updateConfig(const ReplSetConfig& newConfig, // By this point we know we are in Role::follower _currentPrimaryIndex = -1; // force secondaries to re-detect who the primary is - if (_followerMode == MemberState::RS_SECONDARY && _rsConfig.getNumMembers() == 1 && - _selfIndex == 0 && _rsConfig.getMemberAt(_selfIndex).isElectable()) { + if (_isElectableNodeInSingleNodeReplicaSet()) { // If the new config describes a one-node replica set, we're the one member, - // we're electable, and we are currently in followerMode SECONDARY, - // we must transition to candidate, in leiu of heartbeats. + // we're electable, we're not in maintenance mode, and we are currently in followerMode + // SECONDARY, we must transition to candidate, in lieu of heartbeats. _role = Role::candidate; } } @@ -2341,12 +2338,17 @@ void TopologyCoordinatorImpl::setFollowerMode(MemberState::MS newMode) { // be a candidate here. This is necessary because a single node replica set has no // heartbeats that would normally change the role to candidate. 
- if (_rsConfig.getNumMembers() == 1 && _selfIndex == 0 && - _rsConfig.getMemberAt(_selfIndex).isElectable()) { + if (_isElectableNodeInSingleNodeReplicaSet()) { _role = Role::candidate; } } +bool TopologyCoordinatorImpl::_isElectableNodeInSingleNodeReplicaSet() const { + return _followerMode == MemberState::RS_SECONDARY && _rsConfig.getNumMembers() == 1 && + _selfIndex == 0 && _rsConfig.getMemberAt(_selfIndex).isElectable() && + _maintenanceModeCalls == 0; +} + bool TopologyCoordinatorImpl::stepDownIfPending() { if (!_stepDownPending) { return false; diff --git a/src/mongo/db/repl/topology_coordinator_impl.h b/src/mongo/db/repl/topology_coordinator_impl.h index b30d16b4c60..d4b0476cde1 100644 --- a/src/mongo/db/repl/topology_coordinator_impl.h +++ b/src/mongo/db/repl/topology_coordinator_impl.h @@ -384,6 +384,15 @@ private: **/ bool _memberIsBlacklisted(const MemberConfig& memberConfig, Date_t now) const; + /** + * Returns true if we are a one-node replica set, we're the one member, + * we're electable, we're not in maintenance mode, and we are currently in followerMode + * SECONDARY. + * + * This is used to decide if we should transition to Role::candidate in a one-node replica set. + */ + bool _isElectableNodeInSingleNodeReplicaSet() const; + // This node's role in the replication protocol. Role _role; diff --git a/src/mongo/db/repl/topology_coordinator_impl_test.cpp b/src/mongo/db/repl/topology_coordinator_impl_test.cpp index bd617b2dfa6..e3708a9bf54 100644 --- a/src/mongo/db/repl/topology_coordinator_impl_test.cpp +++ b/src/mongo/db/repl/topology_coordinator_impl_test.cpp @@ -4617,6 +4617,34 @@ TEST_F(TopoCoordTest, "cannot freeze node when primary or running for election. 
state: Running-Election"); } +TEST_F(TopoCoordTest, DoNotBecomeCandidateOnUnfreezingInMaintenanceMode) { + updateConfig(BSON("_id" + << "rs0" + << "version" + << 5 + << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "host1:27017"))), + 0); + setSelfMemberState(MemberState::RS_SECONDARY); + + BSONObjBuilder response; + ASSERT_EQUALS( + TopologyCoordinator::PrepareFreezeResponseResult::kNoAction, + unittest::assertGet(getTopoCoord().prepareFreezeResponse(now()++, 20, &response))); + ASSERT(response.obj().isEmpty()); + BSONObjBuilder response2; + + // We should not transition to Role::candidate if we are in maintenance upon unfreezing. + getTopoCoord().adjustMaintenanceCountBy(1); + + ASSERT_EQUALS( + TopologyCoordinator::PrepareFreezeResponseResult::kNoAction, + unittest::assertGet(getTopoCoord().prepareFreezeResponse(now()++, 0, &response2))); + ASSERT_EQUALS("unfreezing", response2.obj()["info"].String()); + ASSERT(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); +} + class PrepareHeartbeatResponseTest : public TopoCoordTest { public: virtual void setUp() { diff --git a/src/mongo/db/repl/topology_coordinator_impl_v1_test.cpp b/src/mongo/db/repl/topology_coordinator_impl_v1_test.cpp index 72e81caba7f..8062208949b 100644 --- a/src/mongo/db/repl/topology_coordinator_impl_v1_test.cpp +++ b/src/mongo/db/repl/topology_coordinator_impl_v1_test.cpp @@ -2073,6 +2073,35 @@ TEST_F(TopoCoordTest, BecomeCandidateWhenBecomingSecondaryInSingleNodeSet) { ASSERT_EQUALS(MemberState::RS_SECONDARY, getTopoCoord().getMemberState().s); } +TEST_F(TopoCoordTest, DoNotBecomeCandidateWhenBecomingSecondaryInSingleNodeSetIfInMaintenanceMode) { + ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); + ASSERT_EQUALS(MemberState::RS_STARTUP, getTopoCoord().getMemberState().s); + updateConfig(BSON("_id" + << "rs0" + << "version" + << 1 + << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "hself"))), + 0); + 
ASSERT_EQUALS(MemberState::RS_STARTUP2, getTopoCoord().getMemberState().s); + + // If we are the only node and we are in maintenance mode, we should not become a candidate when + // we transition to SECONDARY. + ASSERT_FALSE(TopologyCoordinator::Role::candidate == getTopoCoord().getRole()); + getTopoCoord().adjustMaintenanceCountBy(1); + getTopoCoord().setFollowerMode(MemberState::RS_SECONDARY); + ASSERT_FALSE(TopologyCoordinator::Role::candidate == getTopoCoord().getRole()); + + // getMemberState() returns RS_RECOVERING while we are in maintenance mode even though + // _memberState is set to RS_SECONDARY. + ASSERT_EQUALS(MemberState::RS_RECOVERING, getTopoCoord().getMemberState().s); + + // Once we are no longer in maintenance mode, getMemberState() should return RS_SECONDARY. + getTopoCoord().adjustMaintenanceCountBy(-1); + ASSERT_EQUALS(MemberState::RS_SECONDARY, getTopoCoord().getMemberState().s); +} + TEST_F(TopoCoordTest, BecomeCandidateWhenReconfigToBeElectableInSingleNodeSet) { ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); ASSERT_EQUALS(MemberState::RS_STARTUP, getTopoCoord().getMemberState().s); @@ -2107,6 +2136,42 @@ TEST_F(TopoCoordTest, BecomeCandidateWhenReconfigToBeElectableInSingleNodeSet) { ASSERT_TRUE(TopologyCoordinator::Role::candidate == getTopoCoord().getRole()); } +TEST_F(TopoCoordTest, + DoNotBecomeCandidateWhenReconfigToBeElectableInSingleNodeSetIfInMaintenanceMode) { + ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); + ASSERT_EQUALS(MemberState::RS_STARTUP, getTopoCoord().getMemberState().s); + ReplSetConfig cfg; + ASSERT_OK(cfg.initialize(BSON("_id" + << "rs0" + << "version" + << 1 + << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "hself" + << "priority" + << 0))))); + getTopoCoord().updateConfig(cfg, 0, now()++, OpTime()); + ASSERT_EQUALS(MemberState::RS_STARTUP2, getTopoCoord().getMemberState().s); + + ASSERT_FALSE(TopologyCoordinator::Role::candidate == 
getTopoCoord().getRole()); + getTopoCoord().setFollowerMode(MemberState::RS_SECONDARY); + ASSERT_FALSE(TopologyCoordinator::Role::candidate == getTopoCoord().getRole()); + ASSERT_EQUALS(MemberState::RS_SECONDARY, getTopoCoord().getMemberState().s); + + // We should not become a candidate when we reconfig to become electable if we are currently in + // maintenance mode. + getTopoCoord().adjustMaintenanceCountBy(1); + updateConfig(BSON("_id" + << "rs0" + << "version" + << 1 + << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "hself"))), + 0); + ASSERT_FALSE(TopologyCoordinator::Role::candidate == getTopoCoord().getRole()); +} + TEST_F(TopoCoordTest, NodeDoesNotBecomeCandidateWhenBecomingSecondaryInSingleNodeSetIfUnelectable) { ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); ASSERT_EQUALS(MemberState::RS_STARTUP, getTopoCoord().getMemberState().s); |