diff options
author | Spencer T Brody <spencer@mongodb.com> | 2014-08-26 18:10:05 -0400 |
---|---|---|
committer | Spencer T Brody <spencer@mongodb.com> | 2014-09-05 11:25:05 -0400 |
commit | c175aede88469969320d4935f3b2afb5a07cd4fc (patch) | |
tree | 3db49eeb58f45da8229de3dd5e79b088691eccc7 | |
parent | a6b9d6deea3cebc29603f22d29a2e79807ca4dc0 (diff) | |
download | mongo-c175aede88469969320d4935f3b2afb5a07cd4fc.tar.gz |
SERVER-14450 Implement maintenance mode setting in new ReplicationCoordinator
-rw-r--r-- | src/mongo/db/dbcommands.cpp | 6 | ||||
-rw-r--r-- | src/mongo/db/repl/repl_coordinator.h | 15 | ||||
-rw-r--r-- | src/mongo/db/repl/repl_coordinator_hybrid.cpp | 13 | ||||
-rw-r--r-- | src/mongo/db/repl/repl_coordinator_hybrid.h | 6 | ||||
-rw-r--r-- | src/mongo/db/repl/repl_coordinator_impl.cpp | 70 | ||||
-rw-r--r-- | src/mongo/db/repl/repl_coordinator_impl.h | 20 | ||||
-rw-r--r-- | src/mongo/db/repl/repl_coordinator_impl_test.cpp | 58 | ||||
-rw-r--r-- | src/mongo/db/repl/repl_coordinator_legacy.cpp | 27 | ||||
-rw-r--r-- | src/mongo/db/repl/repl_coordinator_legacy.h | 6 | ||||
-rw-r--r-- | src/mongo/db/repl/repl_coordinator_mock.cpp | 12 | ||||
-rw-r--r-- | src/mongo/db/repl/repl_coordinator_mock.h | 6 | ||||
-rw-r--r-- | src/mongo/db/repl/replset_commands.cpp | 5 | ||||
-rw-r--r-- | src/mongo/db/repl/topology_coordinator.h | 11 | ||||
-rw-r--r-- | src/mongo/db/repl/topology_coordinator_impl.cpp | 9 | ||||
-rw-r--r-- | src/mongo/db/repl/topology_coordinator_impl.h | 4 |
15 files changed, 182 insertions, 86 deletions
diff --git a/src/mongo/db/dbcommands.cpp b/src/mongo/db/dbcommands.cpp index 4b65ba7b6c1..6f6de870f1b 100644 --- a/src/mongo/db/dbcommands.cpp +++ b/src/mongo/db/dbcommands.cpp @@ -1172,15 +1172,15 @@ namespace mongo { } /* Sometimes we cannot set maintenance mode, in which case the call to setMaintenanceMode will - return false. This class does not treat that case as an error which means that anybody - using it is assuming it is ok to continue execution without maintenance mode. This + return a non-OK status. This class does not treat that case as an error which means that + anybody using it is assuming it is ok to continue execution without maintenance mode. This assumption needs to be audited and documented. */ class MaintenanceModeSetter { public: MaintenanceModeSetter(OperationContext* txn) : _txn(txn), maintenanceModeSet( - repl::getGlobalReplicationCoordinator()->setMaintenanceMode(txn, true)) + repl::getGlobalReplicationCoordinator()->setMaintenanceMode(txn, true).isOK()) {} ~MaintenanceModeSetter() { if (maintenanceModeSet) diff --git a/src/mongo/db/repl/repl_coordinator.h b/src/mongo/db/repl/repl_coordinator.h index 3425f5e4c5d..cabacb43c05 100644 --- a/src/mongo/db/repl/repl_coordinator.h +++ b/src/mongo/db/repl/repl_coordinator.h @@ -292,9 +292,10 @@ namespace repl { /** * Toggles maintenanceMode to the value expressed by 'activate' - * return true, if the change worked and false otherwise + * return Status::OK if the change worked, NotSecondary if it failed because we are + * PRIMARY, and OperationFailed if we are not currently in maintenance mode */ - virtual bool setMaintenanceMode(OperationContext* txn, bool activate) = 0; + virtual Status setMaintenanceMode(OperationContext* txn, bool activate) = 0; /** * Handles an incoming replSetSyncFrom command. Adds BSON to 'result' @@ -305,16 +306,6 @@ namespace repl { BSONObjBuilder* resultObj) = 0; /** - * Handles an incoming replSetMaintenance command. 'activate' indicates whether to activate - * or deactivate maintenanceMode. - * returns Status::OK() if maintenanceMode is successfully changed, otherwise returns a - * Status containing an error message about the failure - */ - virtual Status processReplSetMaintenance(OperationContext* txn, - bool activate, - BSONObjBuilder* resultObj) = 0; - - /** * Handles an incoming replSetFreeze command. Adds BSON to 'resultObj' * returns Status::OK() if the node is a member of a replica set with a config and an * error Status otherwise diff --git a/src/mongo/db/repl/repl_coordinator_hybrid.cpp b/src/mongo/db/repl/repl_coordinator_hybrid.cpp index 0d5bbcdfb3c..8813fbe9f87 100644 --- a/src/mongo/db/repl/repl_coordinator_hybrid.cpp +++ b/src/mongo/db/repl/repl_coordinator_hybrid.cpp @@ -239,8 +239,8 @@ namespace repl { _impl.processReplSetGetConfig(&implResult); } - bool HybridReplicationCoordinator::setMaintenanceMode(OperationContext* txn, bool activate) { - bool legacyResponse = _legacy.setMaintenanceMode(txn, activate); + Status HybridReplicationCoordinator::setMaintenanceMode(OperationContext* txn, bool activate) { + Status legacyResponse = _legacy.setMaintenanceMode(txn, activate); _impl.setMaintenanceMode(txn, activate); return legacyResponse; } @@ -308,15 +308,6 @@ namespace repl { return legacyStatus; } - Status HybridReplicationCoordinator::processReplSetMaintenance(OperationContext* txn, - bool activate, - BSONObjBuilder* resultObj) { - Status legacyStatus = _legacy.processReplSetMaintenance(txn, activate, resultObj); - BSONObjBuilder implResult; - Status implStatus = _impl.processReplSetMaintenance(txn, activate, &implResult); - return legacyStatus; - } - Status HybridReplicationCoordinator::processReplSetSyncFrom(const HostAndPort& target, BSONObjBuilder* resultObj) { Status legacyStatus = _legacy.processReplSetSyncFrom(target, resultObj); diff --git a/src/mongo/db/repl/repl_coordinator_hybrid.h b/src/mongo/db/repl/repl_coordinator_hybrid.h index d28ccfed80a..0ae14a213a6 100644 --- a/src/mongo/db/repl/repl_coordinator_hybrid.h +++ b/src/mongo/db/repl/repl_coordinator_hybrid.h @@ -112,11 +112,7 @@ namespace repl { virtual void processReplSetGetConfig(BSONObjBuilder* result); - virtual bool setMaintenanceMode(OperationContext* txn, bool activate); - - virtual Status processReplSetMaintenance(OperationContext* txn, - bool activate, - BSONObjBuilder* resultObj); + virtual Status setMaintenanceMode(OperationContext* txn, bool activate); virtual Status processReplSetSyncFrom(const HostAndPort& target, BSONObjBuilder* resultObj); diff --git a/src/mongo/db/repl/repl_coordinator_impl.cpp b/src/mongo/db/repl/repl_coordinator_impl.cpp index a1b48589da1..31c6490ad30 100644 --- a/src/mongo/db/repl/repl_coordinator_impl.cpp +++ b/src/mongo/db/repl/repl_coordinator_impl.cpp @@ -295,6 +295,11 @@ namespace { return _currentState; } + void ReplicationCoordinatorImpl::_setCurrentMemberState_forTest(const MemberState& newState) { + boost::lock_guard<boost::mutex> lk(_mutex); + _currentState = newState; + } + Status ReplicationCoordinatorImpl::setMyLastOptime(OperationContext* txn, const OpTime& ts) { boost::unique_lock<boost::mutex> lock(_mutex); return _setLastOptime_inlock(&lock, _getMyRID_inlock(), ts); @@ -730,9 +735,61 @@ namespace { result->append("config", _rsConfig.toBSON()); } - bool ReplicationCoordinatorImpl::setMaintenanceMode(OperationContext* txn, bool activate) { - // TODO - return false; + Status ReplicationCoordinatorImpl::setMaintenanceMode(OperationContext* txn, bool activate) { + Status result(ErrorCodes::InternalError, "didn't set status in _setMaintenanceMode_helper"); + CBHStatus cbh = _replExecutor.scheduleWorkWithGlobalExclusiveLock( + stdx::bind(&ReplicationCoordinatorImpl::_setMaintenanceMode_helper, + this, + stdx::placeholders::_1, + activate, + &result)); + if (cbh.getStatus() == ErrorCodes::ShutdownInProgress) { + return cbh.getStatus(); + } + fassert(18689, cbh.getStatus()); + _replExecutor.wait(cbh.getValue()); + return result; + } + + void ReplicationCoordinatorImpl::_setMaintenanceMode_helper( + const ReplicationExecutor::CallbackData& cbData, + bool activate, + Status* result) { + if (cbData.status == ErrorCodes::CallbackCanceled) { + *result = Status(ErrorCodes::ShutdownInProgress, "replication system is shutting down"); + return; + } + + boost::lock_guard<boost::mutex> lk(_mutex); + if (_getCurrentMemberState_inlock().primary()) { + *result = Status(ErrorCodes::NotSecondary, "primaries can't modify maintenance mode"); + return; + } + + int curMaintenanceCalls = _topCoord->getMaintenanceModeCalls(); + if (activate) { + log() << "replSet going into maintenance mode with " << curMaintenanceCalls + << " other maintenance mode tasks in progress" << rsLog; + _topCoord->adjustMaintenanceModeCallsBy(1); + } + else if (curMaintenanceCalls > 0) { + invariant(_currentState.recovering()); + + _topCoord->adjustMaintenanceModeCallsBy(-1); + // no need to change state, syncTail will try to go live as a secondary soon + + log() << "leaving maintenance mode (" << curMaintenanceCalls-1 << " other maintenance " + "mode tasks ongoing)" << rsLog; + *result = Status::OK(); + return; + } else { + warning() << "Attempted to leave maintenance mode but it is not currently active"; + *result = Status(ErrorCodes::OperationFailed, "already out of maintenance mode"); + return; + } + + _currentState = MemberState::RS_RECOVERING; + *result = Status::OK(); } Status ReplicationCoordinatorImpl::processReplSetSyncFrom(const HostAndPort& target, @@ -754,13 +811,6 @@ namespace { return result; } - Status ReplicationCoordinatorImpl::processReplSetMaintenance(OperationContext* txn, - bool activate, - BSONObjBuilder* resultObj) { - // TODO - return Status::OK(); - } - Status ReplicationCoordinatorImpl::processReplSetFreeze(int secs, BSONObjBuilder* resultObj) { Status result(ErrorCodes::InternalError, "didn't set status in prepareFreezeResponse"); CBHStatus cbh = _replExecutor.scheduleWork( diff --git a/src/mongo/db/repl/repl_coordinator_impl.h b/src/mongo/db/repl/repl_coordinator_impl.h index b852def92fd..2b2e515d0da 100644 --- a/src/mongo/db/repl/repl_coordinator_impl.h +++ b/src/mongo/db/repl/repl_coordinator_impl.h @@ -142,11 +142,7 @@ namespace repl { virtual void processReplSetGetConfig(BSONObjBuilder* result); - virtual bool setMaintenanceMode(OperationContext* txn, bool activate); - - virtual Status processReplSetMaintenance(OperationContext* txn, - bool activate, - BSONObjBuilder* resultObj); + virtual Status setMaintenanceMode(OperationContext* txn, bool activate); virtual Status processReplSetSyncFrom(const HostAndPort& target, BSONObjBuilder* resultObj); @@ -229,6 +225,12 @@ namespace repl { */ void testElection(); + /** + * Used to set the current member state of this node. + * Should only be used in unit tests. + */ + void _setCurrentMemberState_forTest(const MemberState& newState); + private: /** @@ -289,6 +291,14 @@ namespace repl { const ReplicaSetConfig& newConfig, int myIndex); + /** + * Helper method for setting/unsetting maintenance mode. Scheduled by setMaintenanceMode() + * to run in a global write lock in the replication executor thread. + */ + void _setMaintenanceMode_helper(const ReplicationExecutor::CallbackData& cbData, + bool activate, + Status* result); + /* * Returns the OpTime of the last applied operation on this node. */ diff --git a/src/mongo/db/repl/repl_coordinator_impl_test.cpp b/src/mongo/db/repl/repl_coordinator_impl_test.cpp index dd087bef220..5ffd801ce84 100644 --- a/src/mongo/db/repl/repl_coordinator_impl_test.cpp +++ b/src/mongo/db/repl/repl_coordinator_impl_test.cpp @@ -982,6 +982,64 @@ namespace { ASSERT_EQUALS(3U, rids.size()); // Make sure we saw all 3 nodes } + TEST_F(ReplCoordTest, SetMaintenanceMode) { + init("mySet/test1:1234,test2:1234,test3:1234"); + assertStartSuccess( + BSON("_id" << "mySet" << + "version" << 1 << + "members" << BSON_ARRAY(BSON("_id" << 0 << "host" << "test1:1234") << + BSON("_id" << 1 << "host" << "test2:1234") << + BSON("_id" << 2 << "host" << "test3:1234"))), + HostAndPort("test2", 1234)); + OperationContextNoop txn; + + getReplCoord()->_setCurrentMemberState_forTest(MemberState::RS_SECONDARY); + + // Can't unset maintenance mode if it was never set to begin with. + Status status = getReplCoord()->setMaintenanceMode(&txn, false); + ASSERT_EQUALS(ErrorCodes::OperationFailed, status); + ASSERT_TRUE(getReplCoord()->getCurrentMemberState().secondary()); + + // valid set + ASSERT_OK(getReplCoord()->setMaintenanceMode(&txn, true)); + ASSERT_TRUE(getReplCoord()->getCurrentMemberState().recovering()); + + // Can set multiple times + ASSERT_OK(getReplCoord()->setMaintenanceMode(&txn, true)); + ASSERT_OK(getReplCoord()->setMaintenanceMode(&txn, true)); + + // Need to unset the number of times you set + ASSERT_OK(getReplCoord()->setMaintenanceMode(&txn, false)); + ASSERT_OK(getReplCoord()->setMaintenanceMode(&txn, false)); + ASSERT_OK(getReplCoord()->setMaintenanceMode(&txn, false)); + status = getReplCoord()->setMaintenanceMode(&txn, false); + // fourth one fails b/c we only set three times + ASSERT_EQUALS(ErrorCodes::OperationFailed, status); + // Unsetting maintenance mode doesn't actually change our state. + ASSERT_TRUE(getReplCoord()->getCurrentMemberState().recovering()); + + // Can't modify maintenance mode when PRIMARY + getReplCoord()->_setCurrentMemberState_forTest(MemberState::RS_PRIMARY); + status = getReplCoord()->setMaintenanceMode(&txn, true); + ASSERT_EQUALS(ErrorCodes::NotSecondary, status); + ASSERT_TRUE(getReplCoord()->getCurrentMemberState().primary()); + getReplCoord()->_setCurrentMemberState_forTest(MemberState::RS_SECONDARY); + status = getReplCoord()->setMaintenanceMode(&txn, false); + ASSERT_EQUALS(ErrorCodes::OperationFailed, status); + ASSERT_OK(getReplCoord()->setMaintenanceMode(&txn, true)); + ASSERT_OK(getReplCoord()->setMaintenanceMode(&txn, false)); + + // Setting maintenance mode from any RS state other than primary brings us to RECOVERING + // TODO(spencer): Is this actually the desired behavior? + getReplCoord()->_setCurrentMemberState_forTest(MemberState::RS_ROLLBACK); + ASSERT_TRUE(getReplCoord()->getCurrentMemberState().rollback()); + ASSERT_OK(getReplCoord()->setMaintenanceMode(&txn, true)); + ASSERT_TRUE(getReplCoord()->getCurrentMemberState().recovering()); + + // TODO(spencer): test that the applier won't put us into secondary state when maintenance + // mode is set, then does again once it is unset. + } + // TODO(spencer): Unit test replSetFreeze // TODO(schwerin): Unit test election id updating diff --git a/src/mongo/db/repl/repl_coordinator_legacy.cpp b/src/mongo/db/repl/repl_coordinator_legacy.cpp index 055286d9757..93518619450 100644 --- a/src/mongo/db/repl/repl_coordinator_legacy.cpp +++ b/src/mongo/db/repl/repl_coordinator_legacy.cpp @@ -515,8 +515,16 @@ namespace { result->append("config", theReplSet->config().asBson()); } - bool LegacyReplicationCoordinator::setMaintenanceMode(OperationContext* txn, bool activate) { - return theReplSet->setMaintenanceMode(txn, activate); + Status LegacyReplicationCoordinator::setMaintenanceMode(OperationContext* txn, bool activate) { + if (!theReplSet->setMaintenanceMode(txn, activate)) { + if (theReplSet->isPrimary()) { + return Status(ErrorCodes::NotSecondary, "primaries can't modify maintenance mode"); + } + else { + return Status(ErrorCodes::OperationFailed, "already out of maintenance mode"); + } + } + return Status::OK(); } Status LegacyReplicationCoordinator::processHeartbeat(const ReplSetHeartbeatArgs& args, @@ -883,21 +891,6 @@ namespace { return Status::OK(); } - Status LegacyReplicationCoordinator::processReplSetMaintenance(OperationContext* txn, - bool activate, - BSONObjBuilder* resultObj) { - if (!setMaintenanceMode(txn, activate)) { - if (theReplSet->isPrimary()) { - return Status(ErrorCodes::NotSecondary, "primaries can't modify maintenance mode"); - } - else { - return Status(ErrorCodes::OperationFailed, "already out of maintenance mode"); - } - } - - return Status::OK(); - } - Status LegacyReplicationCoordinator::processReplSetSyncFrom(const HostAndPort& target, BSONObjBuilder* resultObj) { resultObj->append("syncFromRequested", target.toString()); diff --git a/src/mongo/db/repl/repl_coordinator_legacy.h b/src/mongo/db/repl/repl_coordinator_legacy.h index 637f453b4a4..84a0b617b95 100644 --- a/src/mongo/db/repl/repl_coordinator_legacy.h +++ b/src/mongo/db/repl/repl_coordinator_legacy.h @@ -108,11 +108,7 @@ namespace repl { virtual void processReplSetGetConfig(BSONObjBuilder* result); - virtual bool setMaintenanceMode(OperationContext* txn, bool activate); - - virtual Status processReplSetMaintenance(OperationContext* txn, - bool activate, - BSONObjBuilder* resultObj); + virtual Status setMaintenanceMode(OperationContext* txn, bool activate); virtual Status processReplSetSyncFrom(const HostAndPort& target, BSONObjBuilder* resultObj); diff --git a/src/mongo/db/repl/repl_coordinator_mock.cpp b/src/mongo/db/repl/repl_coordinator_mock.cpp index 7af7638e029..ce721e1fdf1 100644 --- a/src/mongo/db/repl/repl_coordinator_mock.cpp +++ b/src/mongo/db/repl/repl_coordinator_mock.cpp @@ -152,9 +152,8 @@ namespace repl { return Status::OK(); } - bool ReplicationCoordinatorMock::setMaintenanceMode(OperationContext* txn, bool activate) { - // TODO - return false; + Status ReplicationCoordinatorMock::setMaintenanceMode(OperationContext* txn, bool activate) { + return Status::OK(); } Status ReplicationCoordinatorMock::processReplSetSyncFrom(const HostAndPort& target, @@ -163,13 +162,6 @@ namespace repl { return Status::OK(); } - Status ReplicationCoordinatorMock::processReplSetMaintenance(OperationContext* txn, - bool activate, - BSONObjBuilder* resultObj) { - // TODO - return Status::OK(); - } - Status ReplicationCoordinatorMock::processReplSetFreeze(int secs, BSONObjBuilder* resultObj) { // TODO return Status::OK(); diff --git a/src/mongo/db/repl/repl_coordinator_mock.h b/src/mongo/db/repl/repl_coordinator_mock.h index 2b0d1cb3bef..62835a004ec 100644 --- a/src/mongo/db/repl/repl_coordinator_mock.h +++ b/src/mongo/db/repl/repl_coordinator_mock.h @@ -109,11 +109,7 @@ namespace repl { virtual void processReplSetGetConfig(BSONObjBuilder* result); - virtual bool setMaintenanceMode(OperationContext* txn, bool activate); - - virtual Status processReplSetMaintenance(OperationContext* txn, - bool activate, - BSONObjBuilder* resultObj); + virtual Status setMaintenanceMode(OperationContext* txn, bool activate); virtual Status processReplSetSyncFrom(const HostAndPort& target, BSONObjBuilder* resultObj); diff --git a/src/mongo/db/repl/replset_commands.cpp b/src/mongo/db/repl/replset_commands.cpp index b8ec8822996..ce1c4296942 100644 --- a/src/mongo/db/repl/replset_commands.cpp +++ b/src/mongo/db/repl/replset_commands.cpp @@ -310,10 +310,9 @@ namespace repl { return appendCommandStatus( result, - getGlobalReplicationCoordinator()->processReplSetMaintenance( + getGlobalReplicationCoordinator()->setMaintenanceMode( txn, - cmdObj["replSetMaintenance"].trueValue(), - &result)); + cmdObj["replSetMaintenance"].trueValue())); } } cmdReplSetMaintenance; diff --git a/src/mongo/db/repl/topology_coordinator.h b/src/mongo/db/repl/topology_coordinator.h index 4cc77179503..836bc8b5b2c 100644 --- a/src/mongo/db/repl/topology_coordinator.h +++ b/src/mongo/db/repl/topology_coordinator.h @@ -250,6 +250,17 @@ namespace repl { Date_t now, const OpTime& lastOpApplied) = 0; + /** + * Adds "value" to the number of currently active calls to maintenance mode. Currently + * 1 and -1 are the only valid inputs. + */ + virtual void adjustMaintenanceModeCallsBy(int value) = 0; + + /** + * Returns the number of current calls into maintenance mode. + */ + virtual int getMaintenanceModeCalls() = 0; + // Record a "ping" based on the round-trip time of the heartbeat for the member virtual void recordPing(const HostAndPort& host, const Milliseconds elapsedMillis) = 0; diff --git a/src/mongo/db/repl/topology_coordinator_impl.cpp b/src/mongo/db/repl/topology_coordinator_impl.cpp index 0e729e35bd8..419856c571c 100644 --- a/src/mongo/db/repl/topology_coordinator_impl.cpp +++ b/src/mongo/db/repl/topology_coordinator_impl.cpp @@ -1042,6 +1042,15 @@ namespace { } } + void TopologyCoordinatorImpl::adjustMaintenanceModeCallsBy(int value) { + invariant(value == 1 || value == -1); + _maintenanceModeCalls += value; + } + + int TopologyCoordinatorImpl::getMaintenanceModeCalls() { + return _maintenanceModeCalls; + } + void TopologyCoordinatorImpl::_setCurrentPrimaryForTest(int primaryIndex) { _currentPrimaryIndex = primaryIndex; if (primaryIndex == _selfIndex) { diff --git a/src/mongo/db/repl/topology_coordinator_impl.h b/src/mongo/db/repl/topology_coordinator_impl.h index 8f98ce88946..bfeac78d465 100644 --- a/src/mongo/db/repl/topology_coordinator_impl.h +++ b/src/mongo/db/repl/topology_coordinator_impl.h @@ -207,6 +207,10 @@ namespace repl { Date_t now, const OpTime& lastOpApplied); + virtual void adjustMaintenanceModeCallsBy(int value); + + virtual int getMaintenanceModeCalls(); + // Record a ping in millis based on the round-trip time of the heartbeat for the member virtual void recordPing(const HostAndPort& host, const Milliseconds elapsedMillis); |