author      Mark Benvenuto <mark.benvenuto@mongodb.com>    2015-07-28 18:16:39 -0400
committer   Mark Benvenuto <mark.benvenuto@mongodb.com>    2015-07-28 18:27:27 -0400
commit      b66e993f1c742518d9b5e93b0d8a5f8255a4127c
tree        55e6fed05333d2d37f34586726a342ed7f7dbc29 /src/mongo/db/repl
parent      314a22e93f283ab80e650618cbd3ed8babb8510f
download    mongo-b66e993f1c742518d9b5e93b0d8a5f8255a4127c.tar.gz
SERVER-18579: Clang-Format - reformat code, no comment reflow
Diffstat (limited to 'src/mongo/db/repl')
125 files changed, 33931 insertions(+), 33882 deletions(-)
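The churn here is mechanical: insertions and deletions pair off almost one for one, and most of the diff comes from the indentation change (namespace bodies are no longer indented). The .clang-format file itself is not part of this commit, so the options below are an inferred sketch, not MongoDB's confirmed configuration; they are simply settings consistent with the visible output (4-space indents, unindented namespace bodies, a roughly 100-column limit), which would be applied in place with the standard clang-format -i --style=file invocation.

    # Hypothetical .clang-format sketch; every value here is inferred from
    # the reformatted output in this diff, not taken from the real config.
    BasedOnStyle: Google
    IndentWidth: 4
    ColumnLimit: 100
    NamespaceIndentation: None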
diff --git a/src/mongo/db/repl/bgsync.cpp b/src/mongo/db/repl/bgsync.cpp index 49ba4ac5668..f1a2b36ef33 100644 --- a/src/mongo/db/repl/bgsync.cpp +++ b/src/mongo/db/repl/bgsync.cpp @@ -52,503 +52,493 @@ namespace mongo { - using std::string; +using std::string; namespace repl { namespace { - const char hashFieldName[] = "h"; - int SleepToAllowBatchingMillis = 2; - const int BatchIsSmallish = 40000; // bytes -} // namespace - - MONGO_FP_DECLARE(rsBgSyncProduce); - MONGO_FP_DECLARE(stepDownWhileDrainingFailPoint); - - BackgroundSync* BackgroundSync::s_instance = 0; - boost::mutex BackgroundSync::s_mutex; - - //The number and time spent reading batches off the network - static TimerStats getmoreReplStats; - static ServerStatusMetricField<TimerStats> displayBatchesRecieved( - "repl.network.getmores", - &getmoreReplStats ); - //The oplog entries read via the oplog reader - static Counter64 opsReadStats; - static ServerStatusMetricField<Counter64> displayOpsRead( "repl.network.ops", - &opsReadStats ); - //The bytes read via the oplog reader - static Counter64 networkByteStats; - static ServerStatusMetricField<Counter64> displayBytesRead( "repl.network.bytes", - &networkByteStats ); - - //The count of items in the buffer - static Counter64 bufferCountGauge; - static ServerStatusMetricField<Counter64> displayBufferCount( "repl.buffer.count", - &bufferCountGauge ); - //The size (bytes) of items in the buffer - static Counter64 bufferSizeGauge; - static ServerStatusMetricField<Counter64> displayBufferSize( "repl.buffer.sizeBytes", - &bufferSizeGauge ); - //The max size (bytes) of the buffer - static int bufferMaxSizeGauge = 256*1024*1024; - static ServerStatusMetricField<int> displayBufferMaxSize( "repl.buffer.maxSizeBytes", - &bufferMaxSizeGauge ); - - - BackgroundSyncInterface::~BackgroundSyncInterface() {} - - size_t getSize(const BSONObj& o) { - // SERVER-9808 Avoid Fortify complaint about implicit signed->unsigned conversion - return static_cast<size_t>(o.objsize()); +const char hashFieldName[] = "h"; +int SleepToAllowBatchingMillis = 2; +const int BatchIsSmallish = 40000; // bytes +} // namespace + +MONGO_FP_DECLARE(rsBgSyncProduce); +MONGO_FP_DECLARE(stepDownWhileDrainingFailPoint); + +BackgroundSync* BackgroundSync::s_instance = 0; +boost::mutex BackgroundSync::s_mutex; + +// The number and time spent reading batches off the network +static TimerStats getmoreReplStats; +static ServerStatusMetricField<TimerStats> displayBatchesRecieved("repl.network.getmores", + &getmoreReplStats); +// The oplog entries read via the oplog reader +static Counter64 opsReadStats; +static ServerStatusMetricField<Counter64> displayOpsRead("repl.network.ops", &opsReadStats); +// The bytes read via the oplog reader +static Counter64 networkByteStats; +static ServerStatusMetricField<Counter64> displayBytesRead("repl.network.bytes", &networkByteStats); + +// The count of items in the buffer +static Counter64 bufferCountGauge; +static ServerStatusMetricField<Counter64> displayBufferCount("repl.buffer.count", + &bufferCountGauge); +// The size (bytes) of items in the buffer +static Counter64 bufferSizeGauge; +static ServerStatusMetricField<Counter64> displayBufferSize("repl.buffer.sizeBytes", + &bufferSizeGauge); +// The max size (bytes) of the buffer +static int bufferMaxSizeGauge = 256 * 1024 * 1024; +static ServerStatusMetricField<int> displayBufferMaxSize("repl.buffer.maxSizeBytes", + &bufferMaxSizeGauge); + + +BackgroundSyncInterface::~BackgroundSyncInterface() {} + +size_t getSize(const BSONObj& o) { + // 
SERVER-9808 Avoid Fortify complaint about implicit signed->unsigned conversion + return static_cast<size_t>(o.objsize()); +} + +BackgroundSync::BackgroundSync() + : _buffer(bufferMaxSizeGauge, &getSize), + _lastOpTimeFetched(std::numeric_limits<int>::max(), 0), + _lastAppliedHash(0), + _lastFetchedHash(0), + _pause(true), + _appliedBuffer(true), + _replCoord(getGlobalReplicationCoordinator()), + _initialSyncRequestedFlag(false), + _indexPrefetchConfig(PREFETCH_ALL) {} + +BackgroundSync* BackgroundSync::get() { + boost::unique_lock<boost::mutex> lock(s_mutex); + if (s_instance == NULL && !inShutdown()) { + s_instance = new BackgroundSync(); } + return s_instance; +} - BackgroundSync::BackgroundSync() : _buffer(bufferMaxSizeGauge, &getSize), - _lastOpTimeFetched(std::numeric_limits<int>::max(), - 0), - _lastAppliedHash(0), - _lastFetchedHash(0), - _pause(true), - _appliedBuffer(true), - _replCoord(getGlobalReplicationCoordinator()), - _initialSyncRequestedFlag(false), - _indexPrefetchConfig(PREFETCH_ALL) { - } +void BackgroundSync::shutdown() { + boost::lock_guard<boost::mutex> lock(_mutex); - BackgroundSync* BackgroundSync::get() { - boost::unique_lock<boost::mutex> lock(s_mutex); - if (s_instance == NULL && !inShutdown()) { - s_instance = new BackgroundSync(); - } - return s_instance; - } + // Clear the buffer in case the producerThread is waiting in push() due to a full queue. + invariant(inShutdown()); + _buffer.clear(); + _pause = true; - void BackgroundSync::shutdown() { - boost::lock_guard<boost::mutex> lock(_mutex); + // Wake up producerThread so it notices that we're in shutdown + _appliedBufferCondition.notify_all(); + _pausedCondition.notify_all(); +} - // Clear the buffer in case the producerThread is waiting in push() due to a full queue. 
- invariant(inShutdown()); - _buffer.clear(); - _pause = true; +void BackgroundSync::notify(OperationContext* txn) { + boost::lock_guard<boost::mutex> lock(_mutex); - // Wake up producerThread so it notices that we're in shutdown + // If all ops in the buffer have been applied, unblock waitForRepl (if it's waiting) + if (_buffer.empty()) { + _appliedBuffer = true; _appliedBufferCondition.notify_all(); - _pausedCondition.notify_all(); } +} - void BackgroundSync::notify(OperationContext* txn) { - boost::lock_guard<boost::mutex> lock(_mutex); +void BackgroundSync::producerThread() { + Client::initThread("rsBackgroundSync"); + cc().getAuthorizationSession()->grantInternalAuthorization(); - // If all ops in the buffer have been applied, unblock waitForRepl (if it's waiting) - if (_buffer.empty()) { - _appliedBuffer = true; - _appliedBufferCondition.notify_all(); + while (!inShutdown()) { + try { + _producerThread(); + } catch (const DBException& e) { + std::string msg(str::stream() << "sync producer problem: " << e.toString()); + error() << msg; + _replCoord->setMyHeartbeatMessage(msg); + } catch (const std::exception& e2) { + severe() << "sync producer exception: " << e2.what(); + fassertFailed(28546); } } - void BackgroundSync::producerThread() { - Client::initThread("rsBackgroundSync"); - cc().getAuthorizationSession()->grantInternalAuthorization(); + cc().shutdown(); +} - while (!inShutdown()) { - try { - _producerThread(); - } - catch (const DBException& e) { - std::string msg(str::stream() << "sync producer problem: " << e.toString()); - error() << msg; - _replCoord->setMyHeartbeatMessage(msg); - } - catch (const std::exception& e2) { - severe() << "sync producer exception: " << e2.what(); - fassertFailed(28546); - } +void BackgroundSync::_producerThread() { + const MemberState state = _replCoord->getMemberState(); + // we want to pause when the state changes to primary + if (_replCoord->isWaitingForApplierToDrain() || state.primary()) { + if (!_pause) { + stop(); } + sleepsecs(1); + return; + } - cc().shutdown(); + // TODO(spencer): Use a condition variable to await loading a config. + if (state.startup()) { + // Wait for a config to be loaded + sleepsecs(1); + return; } - void BackgroundSync::_producerThread() { - const MemberState state = _replCoord->getMemberState(); - // we want to pause when the state changes to primary - if (_replCoord->isWaitingForApplierToDrain() || state.primary()) { - if (!_pause) { - stop(); - } - sleepsecs(1); - return; - } + OperationContextImpl txn; - // TODO(spencer): Use a condition variable to await loading a config. - if (state.startup()) { - // Wait for a config to be loaded - sleepsecs(1); - return; - } + // We need to wait until initial sync has started. + if (_replCoord->getMyLastOptime().isNull()) { + sleepsecs(1); + return; + } + // we want to unpause when we're no longer primary + // start() also loads _lastOpTimeFetched, which we know is set from the "if" + else if (_pause) { + start(&txn); + } - OperationContextImpl txn; + produce(&txn); +} - // We need to wait until initial sync has started. 
- if (_replCoord->getMyLastOptime().isNull()) { +void BackgroundSync::produce(OperationContext* txn) { + // this oplog reader does not do a handshake because we don't want the server it's syncing + // from to track how far it has synced + { + boost::unique_lock<boost::mutex> lock(_mutex); + if (_lastOpTimeFetched.isNull()) { + // then we're initial syncing and we're still waiting for this to be set + lock.unlock(); sleepsecs(1); + // if there is no one to sync from return; } - // we want to unpause when we're no longer primary - // start() also loads _lastOpTimeFetched, which we know is set from the "if" - else if (_pause) { - start(&txn); - } - - produce(&txn); - } - - void BackgroundSync::produce(OperationContext* txn) { - // this oplog reader does not do a handshake because we don't want the server it's syncing - // from to track how far it has synced - { - boost::unique_lock<boost::mutex> lock(_mutex); - if (_lastOpTimeFetched.isNull()) { - // then we're initial syncing and we're still waiting for this to be set - lock.unlock(); - sleepsecs(1); - // if there is no one to sync from - return; - } - - if (_replCoord->isWaitingForApplierToDrain() || - _replCoord->getMemberState().primary() || - inShutdownStrict()) { - return; - } - } - - while (MONGO_FAIL_POINT(rsBgSyncProduce)) { - sleepmillis(0); - } - - // find a target to sync from the last optime fetched - OpTime lastOpTimeFetched; - { - boost::unique_lock<boost::mutex> lock(_mutex); - lastOpTimeFetched = _lastOpTimeFetched; - _syncSourceHost = HostAndPort(); + if (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary() || + inShutdownStrict()) { + return; } - _syncSourceReader.resetConnection(); - _syncSourceReader.connectToSyncSource(txn, lastOpTimeFetched, _replCoord); + } - { - boost::unique_lock<boost::mutex> lock(_mutex); - // no server found - if (_syncSourceReader.getHost().empty()) { - lock.unlock(); - sleepsecs(1); - // if there is no one to sync from - return; - } - lastOpTimeFetched = _lastOpTimeFetched; - _syncSourceHost = _syncSourceReader.getHost(); - _replCoord->signalUpstreamUpdater(); - } + while (MONGO_FAIL_POINT(rsBgSyncProduce)) { + sleepmillis(0); + } - _syncSourceReader.tailingQueryGTE(rsoplog, lastOpTimeFetched); - // if target cut connections between connecting and querying (for - // example, because it stepped down) we might not have a cursor - if (!_syncSourceReader.haveCursor()) { - return; - } + // find a target to sync from the last optime fetched + OpTime lastOpTimeFetched; + { + boost::unique_lock<boost::mutex> lock(_mutex); + lastOpTimeFetched = _lastOpTimeFetched; + _syncSourceHost = HostAndPort(); + } + _syncSourceReader.resetConnection(); + _syncSourceReader.connectToSyncSource(txn, lastOpTimeFetched, _replCoord); - if (_rollbackIfNeeded(txn, _syncSourceReader)) { - stop(); + { + boost::unique_lock<boost::mutex> lock(_mutex); + // no server found + if (_syncSourceReader.getHost().empty()) { + lock.unlock(); + sleepsecs(1); + // if there is no one to sync from return; } + lastOpTimeFetched = _lastOpTimeFetched; + _syncSourceHost = _syncSourceReader.getHost(); + _replCoord->signalUpstreamUpdater(); + } - while (!inShutdown()) { - if (!_syncSourceReader.moreInCurrentBatch()) { - // Check some things periodically - // (whenever we run out of items in the - // current cursor batch) - - int bs = _syncSourceReader.currentBatchMessageSize(); - if( bs > 0 && bs < BatchIsSmallish ) { - // on a very low latency network, if we don't wait a little, we'll be - // getting ops to write 
almost one at a time. this will both be expensive - // for the upstream server as well as potentially defeating our parallel - // application of batches on the secondary. - // - // the inference here is basically if the batch is really small, we are - // "caught up". - // - sleepmillis(SleepToAllowBatchingMillis); - } - - // If we are transitioning to primary state, we need to leave - // this loop in order to go into bgsync-pause mode. - if (_replCoord->isWaitingForApplierToDrain() || - _replCoord->getMemberState().primary()) { - return; - } - - // re-evaluate quality of sync target - if (shouldChangeSyncSource()) { - return; - } + _syncSourceReader.tailingQueryGTE(rsoplog, lastOpTimeFetched); - { - //record time for each getmore - TimerHolder batchTimer(&getmoreReplStats); - - // This calls receiveMore() on the oplogreader cursor. - // It can wait up to five seconds for more data. - _syncSourceReader.more(); - } - networkByteStats.increment(_syncSourceReader.currentBatchMessageSize()); - - if (!_syncSourceReader.moreInCurrentBatch()) { - // If there is still no data from upstream, check a few more things - // and then loop back for another pass at getting more data - { - boost::unique_lock<boost::mutex> lock(_mutex); - if (_pause) { - return; - } - } + // if target cut connections between connecting and querying (for + // example, because it stepped down) we might not have a cursor + if (!_syncSourceReader.haveCursor()) { + return; + } - _syncSourceReader.tailCheck(); - if( !_syncSourceReader.haveCursor() ) { - LOG(1) << "replSet end syncTail pass"; - return; - } + if (_rollbackIfNeeded(txn, _syncSourceReader)) { + stop(); + return; + } - continue; - } + while (!inShutdown()) { + if (!_syncSourceReader.moreInCurrentBatch()) { + // Check some things periodically + // (whenever we run out of items in the + // current cursor batch) + + int bs = _syncSourceReader.currentBatchMessageSize(); + if (bs > 0 && bs < BatchIsSmallish) { + // on a very low latency network, if we don't wait a little, we'll be + // getting ops to write almost one at a time. this will both be expensive + // for the upstream server as well as potentially defeating our parallel + // application of batches on the secondary. + // + // the inference here is basically if the batch is really small, we are + // "caught up". + // + sleepmillis(SleepToAllowBatchingMillis); } // If we are transitioning to primary state, we need to leave // this loop in order to go into bgsync-pause mode. if (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary()) { - LOG(1) << "waiting for draining or we are primary, not adding more ops to buffer"; return; } - // At this point, we are guaranteed to have at least one thing to read out - // of the oplogreader cursor. - BSONObj o = _syncSourceReader.nextSafe().getOwned(); - opsReadStats.increment(); - - - if (MONGO_FAIL_POINT(stepDownWhileDrainingFailPoint)) { - sleepsecs(20); - } + // re-evaluate quality of sync target + if (shouldChangeSyncSource()) { + return; + } { - boost::unique_lock<boost::mutex> lock(_mutex); - _appliedBuffer = false; - } + // record time for each getmore + TimerHolder batchTimer(&getmoreReplStats); - OCCASIONALLY { - LOG(2) << "bgsync buffer has " << _buffer.size() << " bytes"; + // This calls receiveMore() on the oplogreader cursor. + // It can wait up to five seconds for more data. 
+ _syncSourceReader.more(); } + networkByteStats.increment(_syncSourceReader.currentBatchMessageSize()); - bufferCountGauge.increment(); - bufferSizeGauge.increment(getSize(o)); - _buffer.push(o); + if (!_syncSourceReader.moreInCurrentBatch()) { + // If there is still no data from upstream, check a few more things + // and then loop back for another pass at getting more data + { + boost::unique_lock<boost::mutex> lock(_mutex); + if (_pause) { + return; + } + } - { - boost::unique_lock<boost::mutex> lock(_mutex); - _lastFetchedHash = o["h"].numberLong(); - _lastOpTimeFetched = o["ts"]._opTime(); - LOG(3) << "replSet lastOpTimeFetched: " << _lastOpTimeFetched.toStringPretty(); + _syncSourceReader.tailCheck(); + if (!_syncSourceReader.haveCursor()) { + LOG(1) << "replSet end syncTail pass"; + return; + } + + continue; } } - } - bool BackgroundSync::shouldChangeSyncSource() { - // is it even still around? - if (getSyncTarget().empty() || _syncSourceReader.getHost().empty()) { - return true; + // If we are transitioning to primary state, we need to leave + // this loop in order to go into bgsync-pause mode. + if (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary()) { + LOG(1) << "waiting for draining or we are primary, not adding more ops to buffer"; + return; } - // check other members: is any member's optime more than MaxSyncSourceLag seconds - // ahead of the current sync source? - return _replCoord->shouldChangeSyncSource(_syncSourceReader.getHost()); - } - - - bool BackgroundSync::peek(BSONObj* op) { - return _buffer.peek(*op); - } - - void BackgroundSync::waitForMore() { - BSONObj op; - // Block for one second before timing out. - // Ignore the value of the op we peeked at. - _buffer.blockingPeek(op, 1); - } + // At this point, we are guaranteed to have at least one thing to read out + // of the oplogreader cursor. + BSONObj o = _syncSourceReader.nextSafe().getOwned(); + opsReadStats.increment(); - void BackgroundSync::consume() { - // this is just to get the op off the queue, it's been peeked at - // and queued for application already - BSONObj op = _buffer.blockingPop(); - bufferCountGauge.decrement(1); - bufferSizeGauge.decrement(getSize(op)); - } - bool BackgroundSync::_rollbackIfNeeded(OperationContext* txn, OplogReader& r) { - string hn = r.conn()->getServerAddress(); + if (MONGO_FAIL_POINT(stepDownWhileDrainingFailPoint)) { + sleepsecs(20); + } - if (!r.more()) { - try { - BSONObj theirLastOp = r.getLastOp(rsoplog); - if (theirLastOp.isEmpty()) { - log() << "replSet error empty query result from " << hn << " oplog"; - sleepsecs(2); - return true; - } - OpTime theirTS = theirLastOp["ts"]._opTime(); - if (theirTS < _lastOpTimeFetched) { - log() << "replSet we are ahead of the sync source, will try to roll back"; - syncRollback(txn, _replCoord->getMyLastOptime(), &r, _replCoord); - return true; - } - /* we're not ahead? maybe our new query got fresher data. 
best to come back and try again */ - log() << "replSet syncTail condition 1"; - sleepsecs(1); - } - catch(DBException& e) { - log() << "replSet error querying " << hn << ' ' << e.toString(); - sleepsecs(2); - } - return true; + { + boost::unique_lock<boost::mutex> lock(_mutex); + _appliedBuffer = false; } - BSONObj o = r.nextSafe(); - OpTime ts = o["ts"]._opTime(); - long long hash = o["h"].numberLong(); - if( ts != _lastOpTimeFetched || hash != _lastFetchedHash ) { - log() << "replSet our last op time fetched: " << _lastOpTimeFetched.toStringPretty(); - log() << "replset source's GTE: " << ts.toStringPretty(); - syncRollback(txn, _replCoord->getMyLastOptime(), &r, _replCoord); - return true; + OCCASIONALLY { + LOG(2) << "bgsync buffer has " << _buffer.size() << " bytes"; } - return false; - } + bufferCountGauge.increment(); + bufferSizeGauge.increment(getSize(o)); + _buffer.push(o); - HostAndPort BackgroundSync::getSyncTarget() { - boost::unique_lock<boost::mutex> lock(_mutex); - return _syncSourceHost; + { + boost::unique_lock<boost::mutex> lock(_mutex); + _lastFetchedHash = o["h"].numberLong(); + _lastOpTimeFetched = o["ts"]._opTime(); + LOG(3) << "replSet lastOpTimeFetched: " << _lastOpTimeFetched.toStringPretty(); + } } +} - void BackgroundSync::clearSyncTarget() { - boost::unique_lock<boost::mutex> lock(_mutex); - _syncSourceHost = HostAndPort(); +bool BackgroundSync::shouldChangeSyncSource() { + // is it even still around? + if (getSyncTarget().empty() || _syncSourceReader.getHost().empty()) { + return true; } - void BackgroundSync::stop() { - boost::lock_guard<boost::mutex> lock(_mutex); + // check other members: is any member's optime more than MaxSyncSourceLag seconds + // ahead of the current sync source? + return _replCoord->shouldChangeSyncSource(_syncSourceReader.getHost()); +} - _pause = true; - _syncSourceHost = HostAndPort(); - _lastOpTimeFetched = OpTime(0,0); - _lastFetchedHash = 0; - _appliedBufferCondition.notify_all(); - _pausedCondition.notify_all(); - } - void BackgroundSync::start(OperationContext* txn) { - massert(16235, "going to start syncing, but buffer is not empty", _buffer.empty()); +bool BackgroundSync::peek(BSONObj* op) { + return _buffer.peek(*op); +} - long long updatedLastAppliedHash = _readLastAppliedHash(txn); - boost::lock_guard<boost::mutex> lk(_mutex); - _pause = false; +void BackgroundSync::waitForMore() { + BSONObj op; + // Block for one second before timing out. + // Ignore the value of the op we peeked at. 
+ _buffer.blockingPeek(op, 1); +} - // reset _last fields with current oplog data - _lastAppliedHash = updatedLastAppliedHash; - _lastOpTimeFetched = _replCoord->getMyLastOptime(); - _lastFetchedHash = _lastAppliedHash; +void BackgroundSync::consume() { + // this is just to get the op off the queue, it's been peeked at + // and queued for application already + BSONObj op = _buffer.blockingPop(); + bufferCountGauge.decrement(1); + bufferSizeGauge.decrement(getSize(op)); +} - LOG(1) << "replset bgsync fetch queue set to: " << _lastOpTimeFetched << - " " << _lastFetchedHash; - } +bool BackgroundSync::_rollbackIfNeeded(OperationContext* txn, OplogReader& r) { + string hn = r.conn()->getServerAddress(); - void BackgroundSync::waitUntilPaused() { - boost::unique_lock<boost::mutex> lock(_mutex); - while (!_pause) { - _pausedCondition.wait(lock); + if (!r.more()) { + try { + BSONObj theirLastOp = r.getLastOp(rsoplog); + if (theirLastOp.isEmpty()) { + log() << "replSet error empty query result from " << hn << " oplog"; + sleepsecs(2); + return true; + } + OpTime theirTS = theirLastOp["ts"]._opTime(); + if (theirTS < _lastOpTimeFetched) { + log() << "replSet we are ahead of the sync source, will try to roll back"; + syncRollback(txn, _replCoord->getMyLastOptime(), &r, _replCoord); + return true; + } + /* we're not ahead? maybe our new query got fresher data. best to come back and try again */ + log() << "replSet syncTail condition 1"; + sleepsecs(1); + } catch (DBException& e) { + log() << "replSet error querying " << hn << ' ' << e.toString(); + sleepsecs(2); } + return true; } - long long BackgroundSync::getLastAppliedHash() const { - boost::lock_guard<boost::mutex> lck(_mutex); - return _lastAppliedHash; + BSONObj o = r.nextSafe(); + OpTime ts = o["ts"]._opTime(); + long long hash = o["h"].numberLong(); + if (ts != _lastOpTimeFetched || hash != _lastFetchedHash) { + log() << "replSet our last op time fetched: " << _lastOpTimeFetched.toStringPretty(); + log() << "replset source's GTE: " << ts.toStringPretty(); + syncRollback(txn, _replCoord->getMyLastOptime(), &r, _replCoord); + return true; } - void BackgroundSync::clearBuffer() { - _buffer.clear(); + return false; +} + +HostAndPort BackgroundSync::getSyncTarget() { + boost::unique_lock<boost::mutex> lock(_mutex); + return _syncSourceHost; +} + +void BackgroundSync::clearSyncTarget() { + boost::unique_lock<boost::mutex> lock(_mutex); + _syncSourceHost = HostAndPort(); +} + +void BackgroundSync::stop() { + boost::lock_guard<boost::mutex> lock(_mutex); + + _pause = true; + _syncSourceHost = HostAndPort(); + _lastOpTimeFetched = OpTime(0, 0); + _lastFetchedHash = 0; + _appliedBufferCondition.notify_all(); + _pausedCondition.notify_all(); +} + +void BackgroundSync::start(OperationContext* txn) { + massert(16235, "going to start syncing, but buffer is not empty", _buffer.empty()); + + long long updatedLastAppliedHash = _readLastAppliedHash(txn); + boost::lock_guard<boost::mutex> lk(_mutex); + _pause = false; + + // reset _last fields with current oplog data + _lastAppliedHash = updatedLastAppliedHash; + _lastOpTimeFetched = _replCoord->getMyLastOptime(); + _lastFetchedHash = _lastAppliedHash; + + LOG(1) << "replset bgsync fetch queue set to: " << _lastOpTimeFetched << " " + << _lastFetchedHash; +} + +void BackgroundSync::waitUntilPaused() { + boost::unique_lock<boost::mutex> lock(_mutex); + while (!_pause) { + _pausedCondition.wait(lock); } - - void BackgroundSync::setLastAppliedHash(long long newHash) { - boost::lock_guard<boost::mutex> lck(_mutex); 
- _lastAppliedHash = newHash; +} + +long long BackgroundSync::getLastAppliedHash() const { + boost::lock_guard<boost::mutex> lck(_mutex); + return _lastAppliedHash; +} + +void BackgroundSync::clearBuffer() { + _buffer.clear(); +} + +void BackgroundSync::setLastAppliedHash(long long newHash) { + boost::lock_guard<boost::mutex> lck(_mutex); + _lastAppliedHash = newHash; +} + +void BackgroundSync::loadLastAppliedHash(OperationContext* txn) { + long long result = _readLastAppliedHash(txn); + boost::lock_guard<boost::mutex> lk(_mutex); + _lastAppliedHash = result; +} + +long long BackgroundSync::_readLastAppliedHash(OperationContext* txn) { + BSONObj oplogEntry; + try { + MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN { + ScopedTransaction transaction(txn, MODE_IX); + Lock::DBLock lk(txn->lockState(), "local", MODE_X); + bool success = Helpers::getLast(txn, rsoplog, oplogEntry); + if (!success) { + // This can happen when we are to do an initial sync. lastHash will be set + // after the initial sync is complete. + return 0; + } + } + MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "readLastAppliedHash", rsoplog); + } catch (const DBException& ex) { + severe() << "Problem reading " << rsoplog << ": " << ex.toStatus(); + fassertFailed(18904); } - - void BackgroundSync::loadLastAppliedHash(OperationContext* txn) { - long long result = _readLastAppliedHash(txn); - boost::lock_guard<boost::mutex> lk(_mutex); - _lastAppliedHash = result; + BSONElement hashElement = oplogEntry[hashFieldName]; + if (hashElement.eoo()) { + severe() << "Most recent entry in " << rsoplog << " missing \"" << hashFieldName + << "\" field"; + fassertFailed(18902); } - - long long BackgroundSync::_readLastAppliedHash(OperationContext* txn) { - BSONObj oplogEntry; - try { - MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN { - ScopedTransaction transaction(txn, MODE_IX); - Lock::DBLock lk(txn->lockState(), "local", MODE_X); - bool success = Helpers::getLast(txn, rsoplog, oplogEntry); - if (!success) { - // This can happen when we are to do an initial sync. lastHash will be set - // after the initial sync is complete. 
- return 0; - } - } MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "readLastAppliedHash", rsoplog); - } - catch (const DBException& ex) { - severe() << "Problem reading " << rsoplog << ": " << ex.toStatus(); - fassertFailed(18904); - } - BSONElement hashElement = oplogEntry[hashFieldName]; - if (hashElement.eoo()) { - severe() << "Most recent entry in " << rsoplog << " missing \"" << hashFieldName << - "\" field"; - fassertFailed(18902); - } - if (hashElement.type() != NumberLong) { - severe() << "Expected type of \"" << hashFieldName << "\" in most recent " << - rsoplog << " entry to have type NumberLong, but found " << - typeName(hashElement.type()); - fassertFailed(18903); - } - return hashElement.safeNumberLong(); + if (hashElement.type() != NumberLong) { + severe() << "Expected type of \"" << hashFieldName << "\" in most recent " << rsoplog + << " entry to have type NumberLong, but found " << typeName(hashElement.type()); + fassertFailed(18903); } + return hashElement.safeNumberLong(); +} - bool BackgroundSync::getInitialSyncRequestedFlag() { - boost::lock_guard<boost::mutex> lock(_initialSyncMutex); - return _initialSyncRequestedFlag; - } +bool BackgroundSync::getInitialSyncRequestedFlag() { + boost::lock_guard<boost::mutex> lock(_initialSyncMutex); + return _initialSyncRequestedFlag; +} - void BackgroundSync::setInitialSyncRequestedFlag(bool value) { - boost::lock_guard<boost::mutex> lock(_initialSyncMutex); - _initialSyncRequestedFlag = value; - } +void BackgroundSync::setInitialSyncRequestedFlag(bool value) { + boost::lock_guard<boost::mutex> lock(_initialSyncMutex); + _initialSyncRequestedFlag = value; +} - void BackgroundSync::pushTestOpToBuffer(const BSONObj& op) { - boost::lock_guard<boost::mutex> lock(_mutex); - _buffer.push(op); - } +void BackgroundSync::pushTestOpToBuffer(const BSONObj& op) { + boost::lock_guard<boost::mutex> lock(_mutex); + _buffer.push(op); +} -} // namespace repl -} // namespace mongo +} // namespace repl +} // namespace mongo diff --git a/src/mongo/db/repl/bgsync.h b/src/mongo/db/repl/bgsync.h index 2952879f246..c4d7212413d 100644 --- a/src/mongo/db/repl/bgsync.h +++ b/src/mongo/db/repl/bgsync.h @@ -37,154 +37,152 @@ namespace mongo { namespace repl { - class Member; - class ReplicationCoordinator; +class Member; +class ReplicationCoordinator; - // This interface exists to facilitate easier testing; - // the test infrastructure implements these functions with stubs. - class BackgroundSyncInterface { - public: - virtual ~BackgroundSyncInterface(); +// This interface exists to facilitate easier testing; +// the test infrastructure implements these functions with stubs. +class BackgroundSyncInterface { +public: + virtual ~BackgroundSyncInterface(); - // Gets the head of the buffer, but does not remove it. - // Returns true if an element was present at the head; - // false if the queue was empty. - virtual bool peek(BSONObj* op) = 0; + // Gets the head of the buffer, but does not remove it. + // Returns true if an element was present at the head; + // false if the queue was empty. + virtual bool peek(BSONObj* op) = 0; - // Deletes objects in the queue; - // called by sync thread after it has applied an op - virtual void consume() = 0; + // Deletes objects in the queue; + // called by sync thread after it has applied an op + virtual void consume() = 0; - // wait up to 1 second for more ops to appear - virtual void waitForMore() = 0; - }; + // wait up to 1 second for more ops to appear + virtual void waitForMore() = 0; +}; - /** - * Lock order: - * 1. rslock - * 2. 
rwlock - * 3. BackgroundSync::_mutex - */ - class BackgroundSync : public BackgroundSyncInterface { - public: - // Allow index prefetching to be turned on/off - enum IndexPrefetchConfig { - PREFETCH_NONE=0, PREFETCH_ID_ONLY=1, PREFETCH_ALL=2 - }; +/** + * Lock order: + * 1. rslock + * 2. rwlock + * 3. BackgroundSync::_mutex + */ +class BackgroundSync : public BackgroundSyncInterface { +public: + // Allow index prefetching to be turned on/off + enum IndexPrefetchConfig { PREFETCH_NONE = 0, PREFETCH_ID_ONLY = 1, PREFETCH_ALL = 2 }; - static BackgroundSync* get(); + static BackgroundSync* get(); - // stop syncing (when this node becomes a primary, e.g.) - void stop(); + // stop syncing (when this node becomes a primary, e.g.) + void stop(); - void shutdown(); - void notify(OperationContext* txn); + void shutdown(); + void notify(OperationContext* txn); - // Blocks until _pause becomes true from a call to stop() or shutdown() - void waitUntilPaused(); + // Blocks until _pause becomes true from a call to stop() or shutdown() + void waitUntilPaused(); - virtual ~BackgroundSync() {} + virtual ~BackgroundSync() {} - // starts the producer thread - void producerThread(); - // starts the sync target notifying thread - void notifierThread(); + // starts the producer thread + void producerThread(); + // starts the sync target notifying thread + void notifierThread(); - HostAndPort getSyncTarget(); + HostAndPort getSyncTarget(); - // Interface implementation + // Interface implementation - virtual bool peek(BSONObj* op); - virtual void consume(); - virtual void clearSyncTarget(); - virtual void waitForMore(); + virtual bool peek(BSONObj* op); + virtual void consume(); + virtual void clearSyncTarget(); + virtual void waitForMore(); - // For monitoring - BSONObj getCounters(); + // For monitoring + BSONObj getCounters(); - long long getLastAppliedHash() const; - void setLastAppliedHash(long long oldH); - void loadLastAppliedHash(OperationContext* txn); + long long getLastAppliedHash() const; + void setLastAppliedHash(long long oldH); + void loadLastAppliedHash(OperationContext* txn); - // Clears any fetched and buffered oplog entries. - void clearBuffer(); + // Clears any fetched and buffered oplog entries. 
+ void clearBuffer(); - bool getInitialSyncRequestedFlag(); - void setInitialSyncRequestedFlag(bool value); + bool getInitialSyncRequestedFlag(); + void setInitialSyncRequestedFlag(bool value); - void setIndexPrefetchConfig(const IndexPrefetchConfig cfg) { - _indexPrefetchConfig = cfg; - } + void setIndexPrefetchConfig(const IndexPrefetchConfig cfg) { + _indexPrefetchConfig = cfg; + } - IndexPrefetchConfig getIndexPrefetchConfig() { - return _indexPrefetchConfig; - } + IndexPrefetchConfig getIndexPrefetchConfig() { + return _indexPrefetchConfig; + } - // Testing related stuff - void pushTestOpToBuffer(const BSONObj& op); - private: - static BackgroundSync *s_instance; - // protects creation of s_instance - static boost::mutex s_mutex; + // Testing related stuff + void pushTestOpToBuffer(const BSONObj& op); - // Production thread - BlockingQueue<BSONObj> _buffer; - OplogReader _syncSourceReader; +private: + static BackgroundSync* s_instance; + // protects creation of s_instance + static boost::mutex s_mutex; - // _mutex protects all of the class variables except _syncSourceReader and _buffer - mutable boost::mutex _mutex; + // Production thread + BlockingQueue<BSONObj> _buffer; + OplogReader _syncSourceReader; - OpTime _lastOpTimeFetched; + // _mutex protects all of the class variables except _syncSourceReader and _buffer + mutable boost::mutex _mutex; - // lastAppliedHash is used to generate a new hash for the following op, when primary. - long long _lastAppliedHash; - // lastFetchedHash is used to match ops to determine if we need to rollback, when - // a secondary. - long long _lastFetchedHash; + OpTime _lastOpTimeFetched; - // if produce thread should be running - bool _pause; - boost::condition _pausedCondition; - bool _appliedBuffer; - boost::condition _appliedBufferCondition; + // lastAppliedHash is used to generate a new hash for the following op, when primary. + long long _lastAppliedHash; + // lastFetchedHash is used to match ops to determine if we need to rollback, when + // a secondary. + long long _lastFetchedHash; - HostAndPort _syncSourceHost; + // if produce thread should be running + bool _pause; + boost::condition _pausedCondition; + bool _appliedBuffer; + boost::condition _appliedBufferCondition; - BackgroundSync(); - BackgroundSync(const BackgroundSync& s); - BackgroundSync operator=(const BackgroundSync& s); + HostAndPort _syncSourceHost; - // Production thread - void _producerThread(); - // Adds elements to the list, up to maxSize. - void produce(OperationContext* txn); - // Checks the criteria for rolling back and executes a rollback if warranted. - bool _rollbackIfNeeded(OperationContext* txn, OplogReader& r); + BackgroundSync(); + BackgroundSync(const BackgroundSync& s); + BackgroundSync operator=(const BackgroundSync& s); - // Evaluate if the current sync target is still good - bool shouldChangeSyncSource(); + // Production thread + void _producerThread(); + // Adds elements to the list, up to maxSize. + void produce(OperationContext* txn); + // Checks the criteria for rolling back and executes a rollback if warranted. + bool _rollbackIfNeeded(OperationContext* txn, OplogReader& r); - // restart syncing - void start(OperationContext* txn); + // Evaluate if the current sync target is still good + bool shouldChangeSyncSource(); - long long _readLastAppliedHash(OperationContext* txn); + // restart syncing + void start(OperationContext* txn); - // A pointer to the replication coordinator running the show. 
- ReplicationCoordinator* _replCoord; + long long _readLastAppliedHash(OperationContext* txn); - // bool for indicating resync need on this node and the mutex that protects it - // The resync command sets this flag; the Applier thread observes and clears it. - bool _initialSyncRequestedFlag; - boost::mutex _initialSyncMutex; + // A pointer to the replication coordinator running the show. + ReplicationCoordinator* _replCoord; - // This setting affects the Applier prefetcher behavior. - IndexPrefetchConfig _indexPrefetchConfig; + // bool for indicating resync need on this node and the mutex that protects it + // The resync command sets this flag; the Applier thread observes and clears it. + bool _initialSyncRequestedFlag; + boost::mutex _initialSyncMutex; - }; + // This setting affects the Applier prefetcher behavior. + IndexPrefetchConfig _indexPrefetchConfig; +}; -} // namespace repl -} // namespace mongo +} // namespace repl +} // namespace mongo diff --git a/src/mongo/db/repl/check_quorum_for_config_change.cpp b/src/mongo/db/repl/check_quorum_for_config_change.cpp index 7064b9a473d..6a9bbdf510a 100644 --- a/src/mongo/db/repl/check_quorum_for_config_change.cpp +++ b/src/mongo/db/repl/check_quorum_for_config_change.cpp @@ -45,256 +45,248 @@ namespace mongo { namespace repl { - QuorumChecker::QuorumChecker(const ReplicaSetConfig* rsConfig, int myIndex) - : _rsConfig(rsConfig), - _myIndex(myIndex), - _numResponses(1), // We "responded" to ourself already. - _numElectable(0), - _vetoStatus(Status::OK()), - _finalStatus(ErrorCodes::CallbackCanceled, "Quorum check canceled") { - - invariant(myIndex < _rsConfig->getNumMembers()); - const MemberConfig& myConfig = _rsConfig->getMemberAt(_myIndex); - - if (myConfig.isVoter()) { - _voters.push_back(myConfig.getHostAndPort()); - } - if (myConfig.isElectable()) { - _numElectable = 1; - } - - if (hasReceivedSufficientResponses()) { - _onQuorumCheckComplete(); - } +QuorumChecker::QuorumChecker(const ReplicaSetConfig* rsConfig, int myIndex) + : _rsConfig(rsConfig), + _myIndex(myIndex), + _numResponses(1), // We "responded" to ourself already. 
+ _numElectable(0), + _vetoStatus(Status::OK()), + _finalStatus(ErrorCodes::CallbackCanceled, "Quorum check canceled") { + invariant(myIndex < _rsConfig->getNumMembers()); + const MemberConfig& myConfig = _rsConfig->getMemberAt(_myIndex); + + if (myConfig.isVoter()) { + _voters.push_back(myConfig.getHostAndPort()); + } + if (myConfig.isElectable()) { + _numElectable = 1; } - QuorumChecker::~QuorumChecker() {} + if (hasReceivedSufficientResponses()) { + _onQuorumCheckComplete(); + } +} - std::vector<ReplicationExecutor::RemoteCommandRequest> QuorumChecker::getRequests() const { - const bool isInitialConfig = _rsConfig->getConfigVersion() == 1; - const MemberConfig& myConfig = _rsConfig->getMemberAt(_myIndex); +QuorumChecker::~QuorumChecker() {} - std::vector<ReplicationExecutor::RemoteCommandRequest> requests; - if (hasReceivedSufficientResponses()) { - return requests; - } +std::vector<ReplicationExecutor::RemoteCommandRequest> QuorumChecker::getRequests() const { + const bool isInitialConfig = _rsConfig->getConfigVersion() == 1; + const MemberConfig& myConfig = _rsConfig->getMemberAt(_myIndex); - ReplSetHeartbeatArgs hbArgs; - hbArgs.setSetName(_rsConfig->getReplSetName()); - hbArgs.setProtocolVersion(1); - hbArgs.setConfigVersion(_rsConfig->getConfigVersion()); - hbArgs.setCheckEmpty(isInitialConfig); - hbArgs.setSenderHost(myConfig.getHostAndPort()); - hbArgs.setSenderId(myConfig.getId()); - const BSONObj hbRequest = hbArgs.toBSON(); + std::vector<ReplicationExecutor::RemoteCommandRequest> requests; + if (hasReceivedSufficientResponses()) { + return requests; + } - // Send a bunch of heartbeat requests. - // Schedule an operation when a "sufficient" number of them have completed, and use that - // to compute the quorum check results. - // Wait for the "completion" callback to finish, and then it's OK to return the results. - for (int i = 0; i < _rsConfig->getNumMembers(); ++i) { - if (_myIndex == i) { - // No need to check self for liveness or unreadiness. - continue; - } - requests.push_back(ReplicationExecutor::RemoteCommandRequest( - _rsConfig->getMemberAt(i).getHostAndPort(), - "admin", - hbRequest, - _rsConfig->getHeartbeatTimeoutPeriodMillis())); + ReplSetHeartbeatArgs hbArgs; + hbArgs.setSetName(_rsConfig->getReplSetName()); + hbArgs.setProtocolVersion(1); + hbArgs.setConfigVersion(_rsConfig->getConfigVersion()); + hbArgs.setCheckEmpty(isInitialConfig); + hbArgs.setSenderHost(myConfig.getHostAndPort()); + hbArgs.setSenderId(myConfig.getId()); + const BSONObj hbRequest = hbArgs.toBSON(); + + // Send a bunch of heartbeat requests. + // Schedule an operation when a "sufficient" number of them have completed, and use that + // to compute the quorum check results. + // Wait for the "completion" callback to finish, and then it's OK to return the results. + for (int i = 0; i < _rsConfig->getNumMembers(); ++i) { + if (_myIndex == i) { + // No need to check self for liveness or unreadiness. 
+ continue; } - - return requests; + requests.push_back(ReplicationExecutor::RemoteCommandRequest( + _rsConfig->getMemberAt(i).getHostAndPort(), + "admin", + hbRequest, + _rsConfig->getHeartbeatTimeoutPeriodMillis())); } - void QuorumChecker::processResponse( - const ReplicationExecutor::RemoteCommandRequest& request, - const ResponseStatus& response) { + return requests; +} - _tabulateHeartbeatResponse(request, response); - if (hasReceivedSufficientResponses()) { - _onQuorumCheckComplete(); - } +void QuorumChecker::processResponse(const ReplicationExecutor::RemoteCommandRequest& request, + const ResponseStatus& response) { + _tabulateHeartbeatResponse(request, response); + if (hasReceivedSufficientResponses()) { + _onQuorumCheckComplete(); } +} - void QuorumChecker::_onQuorumCheckComplete() { - if (!_vetoStatus.isOK()) { - _finalStatus = _vetoStatus; - return; +void QuorumChecker::_onQuorumCheckComplete() { + if (!_vetoStatus.isOK()) { + _finalStatus = _vetoStatus; + return; + } + if (_rsConfig->getConfigVersion() == 1 && !_badResponses.empty()) { + str::stream message; + message << "replSetInitiate quorum check failed because not all proposed set members " + "responded affirmatively: "; + for (std::vector<std::pair<HostAndPort, Status>>::const_iterator it = _badResponses.begin(); + it != _badResponses.end(); + ++it) { + if (it != _badResponses.begin()) { + message << ", "; + } + message << it->first.toString() << " failed with " << it->second.reason(); } - if (_rsConfig->getConfigVersion() == 1 && !_badResponses.empty()) { - str::stream message; - message << "replSetInitiate quorum check failed because not all proposed set members " - "responded affirmatively: "; - for (std::vector<std::pair<HostAndPort, Status> >::const_iterator it = - _badResponses.begin(); - it != _badResponses.end(); - ++it) { + _finalStatus = Status(ErrorCodes::NodeNotFound, message); + return; + } + if (_numElectable == 0) { + _finalStatus = Status(ErrorCodes::NodeNotFound, + "Quorum check failed because no " + "electable nodes responded; at least one required for config"); + return; + } + if (int(_voters.size()) < _rsConfig->getMajorityVoteCount()) { + str::stream message; + message << "Quorum check failed because not enough voting nodes responded; required " + << _rsConfig->getMajorityVoteCount() << " but "; + + if (_voters.size() == 0) { + message << "none responded"; + } else { + message << "only the following " << _voters.size() + << " voting nodes responded: " << _voters.front().toString(); + for (size_t i = 1; i < _voters.size(); ++i) { + message << ", " << _voters[i].toString(); + } + } + if (!_badResponses.empty()) { + message << "; the following nodes did not respond affirmatively: "; + for (std::vector<std::pair<HostAndPort, Status>>::const_iterator it = + _badResponses.begin(); + it != _badResponses.end(); + ++it) { if (it != _badResponses.begin()) { message << ", "; } message << it->first.toString() << " failed with " << it->second.reason(); } - _finalStatus = Status(ErrorCodes::NodeNotFound, message); - return; } - if (_numElectable == 0) { - _finalStatus = Status( - ErrorCodes::NodeNotFound, "Quorum check failed because no " - "electable nodes responded; at least one required for config"); - return; - } - if (int(_voters.size()) < _rsConfig->getMajorityVoteCount()) { - str::stream message; - message << "Quorum check failed because not enough voting nodes responded; required " << - _rsConfig->getMajorityVoteCount() << " but "; - - if (_voters.size() == 0) { - message << "none responded"; - } - else { 
- message << "only the following " << _voters.size() << - " voting nodes responded: " << _voters.front().toString(); - for (size_t i = 1; i < _voters.size(); ++i) { - message << ", " << _voters[i].toString(); - } - } - if (!_badResponses.empty()) { - message << "; the following nodes did not respond affirmatively: "; - for (std::vector<std::pair<HostAndPort, Status> >::const_iterator it = - _badResponses.begin(); - it != _badResponses.end(); - ++it) { - if (it != _badResponses.begin()) { - message << ", "; - } - message << it->first.toString() << " failed with " << it->second.reason(); - } - } - _finalStatus = Status(ErrorCodes::NodeNotFound, message); - return; - } - _finalStatus = Status::OK(); + _finalStatus = Status(ErrorCodes::NodeNotFound, message); + return; + } + _finalStatus = Status::OK(); +} + +void QuorumChecker::_tabulateHeartbeatResponse( + const ReplicationExecutor::RemoteCommandRequest& request, const ResponseStatus& response) { + ++_numResponses; + if (!response.isOK()) { + warning() << "Failed to complete heartbeat request to " << request.target << "; " + << response.getStatus(); + _badResponses.push_back(std::make_pair(request.target, response.getStatus())); + return; } - void QuorumChecker::_tabulateHeartbeatResponse( - const ReplicationExecutor::RemoteCommandRequest& request, - const ResponseStatus& response) { + BSONObj resBSON = response.getValue().data; + ReplSetHeartbeatResponse hbResp; + Status hbStatus = hbResp.initialize(resBSON); - ++_numResponses; - if (!response.isOK()) { - warning() << "Failed to complete heartbeat request to " << request.target << - "; " << response.getStatus(); - _badResponses.push_back(std::make_pair(request.target, response.getStatus())); - return; - } + if (hbStatus.code() == ErrorCodes::InconsistentReplicaSetNames) { + std::string message = str::stream() << "Our set name did not match that of " + << request.target.toString(); + _vetoStatus = Status(ErrorCodes::NewReplicaSetConfigurationIncompatible, message); + warning() << message; + return; + } - BSONObj resBSON = response.getValue().data; - ReplSetHeartbeatResponse hbResp; - Status hbStatus = hbResp.initialize(resBSON); + if (!hbStatus.isOK() && hbStatus != ErrorCodes::InvalidReplicaSetConfig) { + warning() << "Got error (" << hbStatus << ") response on heartbeat request to " + << request.target << "; " << hbResp; + _badResponses.push_back(std::make_pair(request.target, hbStatus)); + return; + } - if (hbStatus.code() == ErrorCodes::InconsistentReplicaSetNames) { - std::string message = str::stream() << "Our set name did not match that of " << - request.target.toString(); + if (!hbResp.getReplicaSetName().empty()) { + if (hbResp.getVersion() >= _rsConfig->getConfigVersion()) { + std::string message = str::stream() + << "Our config version of " << _rsConfig->getConfigVersion() + << " is no larger than the version on " << request.target.toString() + << ", which is " << hbResp.getVersion(); _vetoStatus = Status(ErrorCodes::NewReplicaSetConfigurationIncompatible, message); warning() << message; return; } + } - if (!hbStatus.isOK() && hbStatus != ErrorCodes::InvalidReplicaSetConfig) { - warning() << "Got error (" << hbStatus - << ") response on heartbeat request to " << request.target - << "; " << hbResp; - _badResponses.push_back(std::make_pair(request.target, hbStatus)); - return; - } - - if (!hbResp.getReplicaSetName().empty()) { - if (hbResp.getVersion() >= _rsConfig->getConfigVersion()) { - std::string message = str::stream() << "Our config version of " << - 
_rsConfig->getConfigVersion() << - " is no larger than the version on " << request.target.toString() << - ", which is " << hbResp.getVersion(); - _vetoStatus = Status(ErrorCodes::NewReplicaSetConfigurationIncompatible, message); - warning() << message; - return; - } - } - - const bool isInitialConfig = _rsConfig->getConfigVersion() == 1; - if (isInitialConfig && hbResp.hasData()) { - std::string message = str::stream() << "'" << request.target.toString() - << "' has data already, cannot initiate set."; - _vetoStatus = Status(ErrorCodes::CannotInitializeNodeWithData, message); - warning() << message; - return; - } - - for (int i = 0; i < _rsConfig->getNumMembers(); ++i) { - const MemberConfig& memberConfig = _rsConfig->getMemberAt(i); - if (memberConfig.getHostAndPort() != request.target) { - continue; - } - if (memberConfig.isElectable()) { - ++_numElectable; - } - if (memberConfig.isVoter()) { - _voters.push_back(request.target); - } - return; - } - invariant(false); + const bool isInitialConfig = _rsConfig->getConfigVersion() == 1; + if (isInitialConfig && hbResp.hasData()) { + std::string message = str::stream() << "'" << request.target.toString() + << "' has data already, cannot initiate set."; + _vetoStatus = Status(ErrorCodes::CannotInitializeNodeWithData, message); + warning() << message; + return; } - bool QuorumChecker::hasReceivedSufficientResponses() const { - if (!_vetoStatus.isOK() || _numResponses == _rsConfig->getNumMembers()) { - // Vetoed or everybody has responded. All done. - return true; - } - if (_rsConfig->getConfigVersion() == 1) { - // Have not received responses from every member, and the proposed config - // version is 1 (initial configuration). Keep waiting. - return false; + for (int i = 0; i < _rsConfig->getNumMembers(); ++i) { + const MemberConfig& memberConfig = _rsConfig->getMemberAt(i); + if (memberConfig.getHostAndPort() != request.target) { + continue; } - if (_numElectable == 0) { - // Have not heard from at least one electable node. Keep waiting. - return false; + if (memberConfig.isElectable()) { + ++_numElectable; } - if (int(_voters.size()) < _rsConfig->getMajorityVoteCount()) { - // Have not heard from a majority of voters. Keep waiting. - return false; + if (memberConfig.isVoter()) { + _voters.push_back(request.target); } + return; + } + invariant(false); +} - // Have heard from a majority of voters and one electable node. All done. +bool QuorumChecker::hasReceivedSufficientResponses() const { + if (!_vetoStatus.isOK() || _numResponses == _rsConfig->getNumMembers()) { + // Vetoed or everybody has responded. All done. return true; } - - Status checkQuorumGeneral(ReplicationExecutor* executor, - const ReplicaSetConfig& rsConfig, - const int myIndex) { - QuorumChecker checker(&rsConfig, myIndex); - ScatterGatherRunner runner(&checker); - Status status = runner.run(executor); - if (!status.isOK()) { - return status; - } - - return checker.getFinalStatus(); + if (_rsConfig->getConfigVersion() == 1) { + // Have not received responses from every member, and the proposed config + // version is 1 (initial configuration). Keep waiting. + return false; } - - Status checkQuorumForInitiate(ReplicationExecutor* executor, - const ReplicaSetConfig& rsConfig, - const int myIndex) { - invariant(rsConfig.getConfigVersion() == 1); - return checkQuorumGeneral(executor, rsConfig, myIndex); + if (_numElectable == 0) { + // Have not heard from at least one electable node. Keep waiting. 
+ return false; + } + if (int(_voters.size()) < _rsConfig->getMajorityVoteCount()) { + // Have not heard from a majority of voters. Keep waiting. + return false; } - Status checkQuorumForReconfig(ReplicationExecutor* executor, - const ReplicaSetConfig& rsConfig, - const int myIndex) { - invariant(rsConfig.getConfigVersion() > 1); - return checkQuorumGeneral(executor, rsConfig, myIndex); + // Have heard from a majority of voters and one electable node. All done. + return true; +} + +Status checkQuorumGeneral(ReplicationExecutor* executor, + const ReplicaSetConfig& rsConfig, + const int myIndex) { + QuorumChecker checker(&rsConfig, myIndex); + ScatterGatherRunner runner(&checker); + Status status = runner.run(executor); + if (!status.isOK()) { + return status; } + return checker.getFinalStatus(); +} + +Status checkQuorumForInitiate(ReplicationExecutor* executor, + const ReplicaSetConfig& rsConfig, + const int myIndex) { + invariant(rsConfig.getConfigVersion() == 1); + return checkQuorumGeneral(executor, rsConfig, myIndex); +} + +Status checkQuorumForReconfig(ReplicationExecutor* executor, + const ReplicaSetConfig& rsConfig, + const int myIndex) { + invariant(rsConfig.getConfigVersion() > 1); + return checkQuorumGeneral(executor, rsConfig, myIndex); +} + } // namespace repl } // namespace mongo diff --git a/src/mongo/db/repl/check_quorum_for_config_change.h b/src/mongo/db/repl/check_quorum_for_config_change.h index 396ac4dea39..96e8a9aad46 100644 --- a/src/mongo/db/repl/check_quorum_for_config_change.h +++ b/src/mongo/db/repl/check_quorum_for_config_change.h @@ -35,117 +35,118 @@ namespace mongo { namespace repl { - class ReplicaSetConfig; +class ReplicaSetConfig; +/** + * Quorum checking state machine. + * + * Usage: Construct a QuorumChecker, pass in a pointer to the configuration for which you're + * checking quorum, and the integer index of the member config representing the "executing" + * node. Use ScatterGatherRunner or otherwise execute a scatter-gather procedure as described + * in the class comment for the ScatterGatherAlgorithm class. After + * hasReceivedSufficientResponses() returns true, you may call getFinalStatus() to get the + * result of the quorum check. + */ +class QuorumChecker : public ScatterGatherAlgorithm { + MONGO_DISALLOW_COPYING(QuorumChecker); + +public: /** - * Quorum checking state machine. + * Constructs a QuorumChecker that is used to confirm that sufficient nodes are up to accept + * "rsConfig". "myIndex" is the index of the local node, which is assumed to be up. * - * Usage: Construct a QuorumChecker, pass in a pointer to the configuration for which you're - * checking quorum, and the integer index of the member config representing the "executing" - * node. Use ScatterGatherRunner or otherwise execute a scatter-gather procedure as described - * in the class comment for the ScatterGatherAlgorithm class. After - * hasReceivedSufficientResponses() returns true, you may call getFinalStatus() to get the - * result of the quorum check. + * "rsConfig" must stay in scope until QuorumChecker's destructor completes. */ - class QuorumChecker : public ScatterGatherAlgorithm { - MONGO_DISALLOW_COPYING(QuorumChecker); - public: - /** - * Constructs a QuorumChecker that is used to confirm that sufficient nodes are up to accept - * "rsConfig". "myIndex" is the index of the local node, which is assumed to be up. - * - * "rsConfig" must stay in scope until QuorumChecker's destructor completes. 
- */ - QuorumChecker(const ReplicaSetConfig* rsConfig, int myIndex); - virtual ~QuorumChecker(); - - virtual std::vector<ReplicationExecutor::RemoteCommandRequest> getRequests() const; - virtual void processResponse( - const ReplicationExecutor::RemoteCommandRequest& request, - const ResponseStatus& response); - - virtual bool hasReceivedSufficientResponses() const; - - Status getFinalStatus() const { return _finalStatus; } - - private: - /** - * Callback that executes after _haveReceivedSufficientReplies() becomes true. - * - * Computes the quorum result based on responses received so far, stores it into - * _finalStatus, and enables QuorumChecker::run() to return. - */ - void _onQuorumCheckComplete(); - - /** - * Updates the QuorumChecker state based on the data from a single heartbeat response. - */ - void _tabulateHeartbeatResponse( - const ReplicationExecutor::RemoteCommandRequest& request, - const ResponseStatus& response); - - // Pointer to the replica set configuration for which we're checking quorum. - const ReplicaSetConfig* const _rsConfig; - - // Index of the local node's member configuration in _rsConfig. - const int _myIndex; - - // List of voting nodes that have responded affirmatively. - std::vector<HostAndPort> _voters; - - // List of nodes with bad responses and the bad response status they returned. - std::vector<std::pair<HostAndPort, Status> > _badResponses; - - // Total number of responses and timeouts processed. - int _numResponses; - - // Number of electable nodes that have responded affirmatively. - int _numElectable; - - // Set to a non-OK status if a response from a remote node indicates - // that the quorum check should definitely fail, such as because of - // a replica set name mismatch. - Status _vetoStatus; - - // Final status of the quorum check, returned by run(). - Status _finalStatus; - }; + QuorumChecker(const ReplicaSetConfig* rsConfig, int myIndex); + virtual ~QuorumChecker(); + + virtual std::vector<ReplicationExecutor::RemoteCommandRequest> getRequests() const; + virtual void processResponse(const ReplicationExecutor::RemoteCommandRequest& request, + const ResponseStatus& response); + + virtual bool hasReceivedSufficientResponses() const; + + Status getFinalStatus() const { + return _finalStatus; + } +private: /** - * Performs a quorum call to determine if a sufficient number of nodes are up - * to initiate a replica set with configuration "rsConfig". + * Callback that executes after _haveReceivedSufficientReplies() becomes true. * - * "myIndex" is the index of this node's member configuration in "rsConfig". - * "executor" is the event loop in which to schedule network/aysnchronous processing. - * - * For purposes of initiate, a quorum is only met if all of the following conditions - * are met: - * - All nodes respond. - * - No nodes other than the node running the quorum check have data. - * - No nodes are already joined to a replica set. - * - No node reports a replica set name other than the one in "rsConfig". + * Computes the quorum result based on responses received so far, stores it into + * _finalStatus, and enables QuorumChecker::run() to return. */ - Status checkQuorumForInitiate(ReplicationExecutor* executor, - const ReplicaSetConfig& rsConfig, - const int myIndex); + void _onQuorumCheckComplete(); /** - * Performs a quorum call to determine if a sufficient number of nodes are up - * to replace the current replica set configuration with "rsConfig". - * - * "myIndex" is the index of this node's member configuration in "rsConfig". 
-     * "executor" is the event loop in which to schedule network/asynchronous processing.
-     *
-     * For purposes of reconfig, a quorum is only met if all of the following conditions
-     * are met:
-     * - A majority of voting nodes respond.
-     * - At least one electable node responds.
-     * - No responding node reports a replica set name other than the one in "rsConfig".
-     * - All responding nodes report a config version less than the one in "rsConfig".
+     * Updates the QuorumChecker state based on the data from a single heartbeat response.
      */
-    Status checkQuorumForReconfig(ReplicationExecutor* executor,
-                                  const ReplicaSetConfig& rsConfig,
-                                  const int myIndex);
+    void _tabulateHeartbeatResponse(const ReplicationExecutor::RemoteCommandRequest& request,
+                                    const ResponseStatus& response);
+
+    // Pointer to the replica set configuration for which we're checking quorum.
+    const ReplicaSetConfig* const _rsConfig;
+
+    // Index of the local node's member configuration in _rsConfig.
+    const int _myIndex;
+
+    // List of voting nodes that have responded affirmatively.
+    std::vector<HostAndPort> _voters;
+
+    // List of nodes with bad responses and the bad response status they returned.
+    std::vector<std::pair<HostAndPort, Status>> _badResponses;
+
+    // Total number of responses and timeouts processed.
+    int _numResponses;
+
+    // Number of electable nodes that have responded affirmatively.
+    int _numElectable;
+
+    // Set to a non-OK status if a response from a remote node indicates
+    // that the quorum check should definitely fail, such as because of
+    // a replica set name mismatch.
+    Status _vetoStatus;
+
+    // Final status of the quorum check, returned by run().
+    Status _finalStatus;
+};
+
+/**
+ * Performs a quorum call to determine if a sufficient number of nodes are up
+ * to initiate a replica set with configuration "rsConfig".
+ *
+ * "myIndex" is the index of this node's member configuration in "rsConfig".
+ * "executor" is the event loop in which to schedule network/asynchronous processing.
+ *
+ * For purposes of initiate, a quorum is only met if all of the following conditions
+ * are met:
+ * - All nodes respond.
+ * - No nodes other than the node running the quorum check have data.
+ * - No nodes are already joined to a replica set.
+ * - No node reports a replica set name other than the one in "rsConfig".
+ */
+Status checkQuorumForInitiate(ReplicationExecutor* executor,
+                              const ReplicaSetConfig& rsConfig,
+                              const int myIndex);
+
+/**
+ * Performs a quorum call to determine if a sufficient number of nodes are up
+ * to replace the current replica set configuration with "rsConfig".
+ *
+ * "myIndex" is the index of this node's member configuration in "rsConfig".
+ * "executor" is the event loop in which to schedule network/asynchronous processing.
+ *
+ * For purposes of reconfig, a quorum is only met if all of the following conditions
+ * are met:
+ * - A majority of voting nodes respond.
+ * - At least one electable node responds.
+ * - No responding node reports a replica set name other than the one in "rsConfig".
+ * - All responding nodes report a config version less than the one in "rsConfig".
+ */ +Status checkQuorumForReconfig(ReplicationExecutor* executor, + const ReplicaSetConfig& rsConfig, + const int myIndex); } // namespace repl } // namespace mongo diff --git a/src/mongo/db/repl/check_quorum_for_config_change_test.cpp b/src/mongo/db/repl/check_quorum_for_config_change_test.cpp index 49d87e9cf4f..064f133e3d2 100644 --- a/src/mongo/db/repl/check_quorum_for_config_change_test.cpp +++ b/src/mongo/db/repl/check_quorum_for_config_change_test.cpp @@ -46,754 +46,768 @@ #include "mongo/unittest/unittest.h" #include "mongo/util/net/hostandport.h" -#define ASSERT_REASON_CONTAINS(STATUS, PATTERN) do { \ - const mongo::Status s_ = (STATUS); \ - ASSERT_FALSE(s_.reason().find(PATTERN) == std::string::npos) << \ - #STATUS ".reason() == " << s_.reason(); \ +#define ASSERT_REASON_CONTAINS(STATUS, PATTERN) \ + do { \ + const mongo::Status s_ = (STATUS); \ + ASSERT_FALSE(s_.reason().find(PATTERN) == std::string::npos) \ + << #STATUS ".reason() == " << s_.reason(); \ } while (false) -#define ASSERT_NOT_REASON_CONTAINS(STATUS, PATTERN) do { \ - const mongo::Status s_ = (STATUS); \ - ASSERT_TRUE(s_.reason().find(PATTERN) == std::string::npos) << \ - #STATUS ".reason() == " << s_.reason(); \ +#define ASSERT_NOT_REASON_CONTAINS(STATUS, PATTERN) \ + do { \ + const mongo::Status s_ = (STATUS); \ + ASSERT_TRUE(s_.reason().find(PATTERN) == std::string::npos) \ + << #STATUS ".reason() == " << s_.reason(); \ } while (false) namespace mongo { namespace repl { namespace { - typedef ReplicationExecutor::RemoteCommandRequest RemoteCommandRequest; - - class CheckQuorumTest : public mongo::unittest::Test { - protected: - CheckQuorumTest(); - - void startQuorumCheck(const ReplicaSetConfig& config, int myIndex); - Status waitForQuorumCheck(); - bool isQuorumCheckDone(); - - NetworkInterfaceMock* _net; - boost::scoped_ptr<ReplicationExecutor> _executor; - - private: - void setUp(); - void tearDown(); - - void _runQuorumCheck(const ReplicaSetConfig& config, int myIndex); - virtual Status _runQuorumCheckImpl(const ReplicaSetConfig& config, int myIndex) = 0; - - boost::scoped_ptr<boost::thread> _executorThread; - boost::scoped_ptr<boost::thread> _quorumCheckThread; - Status _quorumCheckStatus; - boost::mutex _mutex; - bool _isQuorumCheckDone; - }; - - CheckQuorumTest::CheckQuorumTest() : - _quorumCheckStatus(ErrorCodes::InternalError, "Not executed") { +typedef ReplicationExecutor::RemoteCommandRequest RemoteCommandRequest; + +class CheckQuorumTest : public mongo::unittest::Test { +protected: + CheckQuorumTest(); + + void startQuorumCheck(const ReplicaSetConfig& config, int myIndex); + Status waitForQuorumCheck(); + bool isQuorumCheckDone(); + + NetworkInterfaceMock* _net; + boost::scoped_ptr<ReplicationExecutor> _executor; + +private: + void setUp(); + void tearDown(); + + void _runQuorumCheck(const ReplicaSetConfig& config, int myIndex); + virtual Status _runQuorumCheckImpl(const ReplicaSetConfig& config, int myIndex) = 0; + + boost::scoped_ptr<boost::thread> _executorThread; + boost::scoped_ptr<boost::thread> _quorumCheckThread; + Status _quorumCheckStatus; + boost::mutex _mutex; + bool _isQuorumCheckDone; +}; + +CheckQuorumTest::CheckQuorumTest() + : _quorumCheckStatus(ErrorCodes::InternalError, "Not executed") {} + +void CheckQuorumTest::setUp() { + _net = new NetworkInterfaceMock; + _executor.reset(new ReplicationExecutor(_net, 1 /* prng */)); + _executorThread.reset( + new boost::thread(stdx::bind(&ReplicationExecutor::run, _executor.get()))); +} + +void CheckQuorumTest::tearDown() { + 
_executor->shutdown(); + _executorThread->join(); +} + +void CheckQuorumTest::startQuorumCheck(const ReplicaSetConfig& config, int myIndex) { + ASSERT_FALSE(_quorumCheckThread); + _isQuorumCheckDone = false; + _quorumCheckThread.reset( + new boost::thread(stdx::bind(&CheckQuorumTest::_runQuorumCheck, this, config, myIndex))); +} + +Status CheckQuorumTest::waitForQuorumCheck() { + ASSERT_TRUE(_quorumCheckThread); + _quorumCheckThread->join(); + return _quorumCheckStatus; +} + +bool CheckQuorumTest::isQuorumCheckDone() { + boost::lock_guard<boost::mutex> lk(_mutex); + return _isQuorumCheckDone; +} + +void CheckQuorumTest::_runQuorumCheck(const ReplicaSetConfig& config, int myIndex) { + _quorumCheckStatus = _runQuorumCheckImpl(config, myIndex); + boost::lock_guard<boost::mutex> lk(_mutex); + _isQuorumCheckDone = true; +} + +class CheckQuorumForInitiate : public CheckQuorumTest { +private: + virtual Status _runQuorumCheckImpl(const ReplicaSetConfig& config, int myIndex) { + return checkQuorumForInitiate(_executor.get(), config, myIndex); } +}; - void CheckQuorumTest::setUp() { - _net = new NetworkInterfaceMock; - _executor.reset(new ReplicationExecutor(_net, 1 /* prng */ )); - _executorThread.reset(new boost::thread(stdx::bind(&ReplicationExecutor::run, - _executor.get()))); +class CheckQuorumForReconfig : public CheckQuorumTest { +protected: + virtual Status _runQuorumCheckImpl(const ReplicaSetConfig& config, int myIndex) { + return checkQuorumForReconfig(_executor.get(), config, myIndex); } - - void CheckQuorumTest::tearDown() { - _executor->shutdown(); - _executorThread->join(); +}; + +ReplicaSetConfig assertMakeRSConfig(const BSONObj& configBson) { + ReplicaSetConfig config; + ASSERT_OK(config.initialize(configBson)); + ASSERT_OK(config.validate()); + return config; +} + +TEST_F(CheckQuorumForInitiate, ValidSingleNodeSet) { + ReplicaSetConfig config = assertMakeRSConfig(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "h1")))); + startQuorumCheck(config, 0); + ASSERT_OK(waitForQuorumCheck()); +} + +TEST_F(CheckQuorumForInitiate, QuorumCheckCanceledByShutdown) { + _executor->shutdown(); + ReplicaSetConfig config = assertMakeRSConfig(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "h1")))); + startQuorumCheck(config, 0); + ASSERT_EQUALS(ErrorCodes::ShutdownInProgress, waitForQuorumCheck()); +} + +TEST_F(CheckQuorumForInitiate, QuorumCheckFailedDueToSeveralDownNodes) { + // In this test, "we" are host "h3:1". All other nodes time out on + // their heartbeat request, and so the quorum check for initiate + // will fail because some members were unavailable. 
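
The outage is simulated entirely on the mock network: each expected heartbeat is pulled off the queue, an error (or a canned reply) is scheduled at a future virtual time, and runUntil() advances the clock to deliver it. The same pattern, using only calls that appear in this file, drives every test below:

    _net->enterNetwork();
    _net->scheduleResponse(_net->getNextReadyRequest(),
                           startDate + 10,
                           ResponseStatus(ErrorCodes::NoSuchKey, "No reply"));
    _net->runUntil(startDate + 10);  // advance the virtual clock and deliver responses
    _net->exitNetwork();
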
+ ReplicaSetConfig config = + assertMakeRSConfig(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "h1:1") + << BSON("_id" << 2 << "host" + << "h2:1") << BSON("_id" << 3 << "host" + << "h3:1") + << BSON("_id" << 4 << "host" + << "h4:1") << BSON("_id" << 5 << "host" + << "h5:1")))); + startQuorumCheck(config, 2); + _net->enterNetwork(); + const Date_t startDate = _net->now(); + const int numCommandsExpected = config.getNumMembers() - 1; + for (int i = 0; i < numCommandsExpected; ++i) { + _net->scheduleResponse(_net->getNextReadyRequest(), + startDate + 10, + ResponseStatus(ErrorCodes::NoSuchKey, "No reply")); } - - void CheckQuorumTest::startQuorumCheck(const ReplicaSetConfig& config, int myIndex) { - ASSERT_FALSE(_quorumCheckThread); - _isQuorumCheckDone = false; - _quorumCheckThread.reset(new boost::thread(stdx::bind(&CheckQuorumTest::_runQuorumCheck, - this, - config, - myIndex))); + _net->runUntil(startDate + 10); + _net->exitNetwork(); + ASSERT_EQUALS(startDate + 10, _net->now()); + Status status = waitForQuorumCheck(); + ASSERT_EQUALS(ErrorCodes::NodeNotFound, status); + ASSERT_REASON_CONTAINS( + status, "replSetInitiate quorum check failed because not all proposed set members"); + ASSERT_REASON_CONTAINS(status, "h1:1"); + ASSERT_REASON_CONTAINS(status, "h2:1"); + ASSERT_NOT_REASON_CONTAINS(status, "h3:1"); + ASSERT_REASON_CONTAINS(status, "h4:1"); + ASSERT_REASON_CONTAINS(status, "h5:1"); +} + +const BSONObj makeHeartbeatRequest(const ReplicaSetConfig& rsConfig, int myConfigIndex) { + const MemberConfig& myConfig = rsConfig.getMemberAt(myConfigIndex); + ReplSetHeartbeatArgs hbArgs; + hbArgs.setSetName(rsConfig.getReplSetName()); + hbArgs.setProtocolVersion(1); + hbArgs.setConfigVersion(rsConfig.getConfigVersion()); + hbArgs.setCheckEmpty(rsConfig.getConfigVersion() == 1); + hbArgs.setSenderHost(myConfig.getHostAndPort()); + hbArgs.setSenderId(myConfig.getId()); + return hbArgs.toBSON(); +} + +TEST_F(CheckQuorumForInitiate, QuorumCheckSuccessForFiveNodes) { + // In this test, "we" are host "h3:1". All nodes respond successfully to their heartbeat + // requests, and the quorum check succeeds. 
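
Each test compares the outgoing request against makeHeartbeatRequest(), defined above. For the configs used here (set "rs0", version 1, sender "h3:1" with id 3) the serialized command looks roughly like this; the key names are an assumption about ReplSetHeartbeatArgs::toBSON() and are shown for orientation only:

    // Illustrative shape only; exact field names come from ReplSetHeartbeatArgs.
    BSONObj hbRequest = BSON("replSetHeartbeat"
                             << "rs0"
                             << "pv" << 1 << "v" << 1 << "checkEmpty" << true << "from"
                             << "h3:1"
                             << "fromId" << 3);
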
+ + const ReplicaSetConfig rsConfig = + assertMakeRSConfig(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "h1:1") + << BSON("_id" << 2 << "host" + << "h2:1") << BSON("_id" << 3 << "host" + << "h3:1") + << BSON("_id" << 4 << "host" + << "h4:1") << BSON("_id" << 5 << "host" + << "h5:1")))); + const int myConfigIndex = 2; + const BSONObj hbRequest = makeHeartbeatRequest(rsConfig, myConfigIndex); + + startQuorumCheck(rsConfig, myConfigIndex); + const Date_t startDate = _net->now(); + const int numCommandsExpected = rsConfig.getNumMembers() - 1; + unordered_set<HostAndPort> seenHosts; + _net->enterNetwork(); + for (int i = 0; i < numCommandsExpected; ++i) { + const NetworkInterfaceMock::NetworkOperationIterator noi = _net->getNextReadyRequest(); + const ReplicationExecutor::RemoteCommandRequest& request = noi->getRequest(); + ASSERT_EQUALS("admin", request.dbname); + ASSERT_EQUALS(hbRequest, request.cmdObj); + ASSERT(seenHosts.insert(request.target).second) << "Already saw " + << request.target.toString(); + _net->scheduleResponse(noi, + startDate + 10, + ResponseStatus(ReplicationExecutor::RemoteCommandResponse( + BSON("ok" << 1), Milliseconds(8)))); } - - Status CheckQuorumTest::waitForQuorumCheck() { - ASSERT_TRUE(_quorumCheckThread); - _quorumCheckThread->join(); - return _quorumCheckStatus; - } - - bool CheckQuorumTest::isQuorumCheckDone() { - boost::lock_guard<boost::mutex> lk(_mutex); - return _isQuorumCheckDone; - } - - void CheckQuorumTest::_runQuorumCheck(const ReplicaSetConfig& config, int myIndex) { - _quorumCheckStatus = _runQuorumCheckImpl(config, myIndex); - boost::lock_guard<boost::mutex> lk(_mutex); - _isQuorumCheckDone = true; - } - - class CheckQuorumForInitiate : public CheckQuorumTest { - private: - virtual Status _runQuorumCheckImpl(const ReplicaSetConfig& config, int myIndex) { - return checkQuorumForInitiate(_executor.get(), config, myIndex); - } - }; - - class CheckQuorumForReconfig : public CheckQuorumTest { - protected: - virtual Status _runQuorumCheckImpl(const ReplicaSetConfig& config, int myIndex) { - return checkQuorumForReconfig(_executor.get(), config, myIndex); - } - }; - - ReplicaSetConfig assertMakeRSConfig(const BSONObj& configBson) { - ReplicaSetConfig config; - ASSERT_OK(config.initialize(configBson)); - ASSERT_OK(config.validate()); - return config; - } - - TEST_F(CheckQuorumForInitiate, ValidSingleNodeSet) { - ReplicaSetConfig config = assertMakeRSConfig( - BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 1 << "host" << "h1")))); - startQuorumCheck(config, 0); - ASSERT_OK(waitForQuorumCheck()); - } - - TEST_F(CheckQuorumForInitiate, QuorumCheckCanceledByShutdown) { - _executor->shutdown(); - ReplicaSetConfig config = assertMakeRSConfig( - BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 1 << "host" << "h1")))); - startQuorumCheck(config, 0); - ASSERT_EQUALS(ErrorCodes::ShutdownInProgress, waitForQuorumCheck()); - } - - TEST_F(CheckQuorumForInitiate, QuorumCheckFailedDueToSeveralDownNodes) { - // In this test, "we" are host "h3:1". All other nodes time out on - // their heartbeat request, and so the quorum check for initiate - // will fail because some members were unavailable. 
- ReplicaSetConfig config = assertMakeRSConfig( - BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 1 << "host" << "h1:1") << - BSON("_id" << 2 << "host" << "h2:1") << - BSON("_id" << 3 << "host" << "h3:1") << - BSON("_id" << 4 << "host" << "h4:1") << - BSON("_id" << 5 << "host" << "h5:1")))); - startQuorumCheck(config, 2); - _net->enterNetwork(); - const Date_t startDate = _net->now(); - const int numCommandsExpected = config.getNumMembers() - 1; - for (int i = 0; i < numCommandsExpected; ++i) { - _net->scheduleResponse(_net->getNextReadyRequest(), + _net->runUntil(startDate + 10); + _net->exitNetwork(); + ASSERT_OK(waitForQuorumCheck()); +} + +TEST_F(CheckQuorumForInitiate, QuorumCheckFailedDueToOneDownNode) { + // In this test, "we" are host "h3:1". All nodes except "h2:1" respond + // successfully to their heartbeat requests, but quorum check fails because + // all nodes must be available for initiate. This is so even though "h2" + // is neither voting nor electable. + + const ReplicaSetConfig rsConfig = + assertMakeRSConfig(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "h1:1") + << BSON("_id" << 2 << "host" + << "h2:1" + << "priority" << 0 << "votes" << 0) + << BSON("_id" << 3 << "host" + << "h3:1") << BSON("_id" << 4 << "host" + << "h4:1") + << BSON("_id" << 5 << "host" + << "h5:1") << BSON("_id" << 6 << "host" + << "h6:1")))); + const int myConfigIndex = 2; + const BSONObj hbRequest = makeHeartbeatRequest(rsConfig, myConfigIndex); + + startQuorumCheck(rsConfig, myConfigIndex); + const Date_t startDate = _net->now(); + const int numCommandsExpected = rsConfig.getNumMembers() - 1; + unordered_set<HostAndPort> seenHosts; + _net->enterNetwork(); + for (int i = 0; i < numCommandsExpected; ++i) { + const NetworkInterfaceMock::NetworkOperationIterator noi = _net->getNextReadyRequest(); + const ReplicationExecutor::RemoteCommandRequest& request = noi->getRequest(); + ASSERT_EQUALS("admin", request.dbname); + ASSERT_EQUALS(hbRequest, request.cmdObj); + ASSERT(seenHosts.insert(request.target).second) << "Already saw " + << request.target.toString(); + if (request.target == HostAndPort("h2", 1)) { + _net->scheduleResponse( + noi, startDate + 10, ResponseStatus(ErrorCodes::NoSuchKey, "No response")); + } else { + _net->scheduleResponse(noi, startDate + 10, - ResponseStatus(ErrorCodes::NoSuchKey, "No reply")); + ResponseStatus(ReplicationExecutor::RemoteCommandResponse( + BSON("ok" << 1), Milliseconds(8)))); } - _net->runUntil(startDate + 10); - _net->exitNetwork(); - ASSERT_EQUALS(startDate + 10, _net->now()); - Status status = waitForQuorumCheck(); - ASSERT_EQUALS(ErrorCodes::NodeNotFound, status); - ASSERT_REASON_CONTAINS( - status, "replSetInitiate quorum check failed because not all proposed set members"); - ASSERT_REASON_CONTAINS(status, "h1:1"); - ASSERT_REASON_CONTAINS(status, "h2:1"); - ASSERT_NOT_REASON_CONTAINS(status, "h3:1"); - ASSERT_REASON_CONTAINS(status, "h4:1"); - ASSERT_REASON_CONTAINS(status, "h5:1"); - } - - const BSONObj makeHeartbeatRequest(const ReplicaSetConfig& rsConfig, int myConfigIndex) { - const MemberConfig& myConfig = rsConfig.getMemberAt(myConfigIndex); - ReplSetHeartbeatArgs hbArgs; - hbArgs.setSetName(rsConfig.getReplSetName()); - hbArgs.setProtocolVersion(1); - hbArgs.setConfigVersion(rsConfig.getConfigVersion()); - hbArgs.setCheckEmpty(rsConfig.getConfigVersion() == 1); - hbArgs.setSenderHost(myConfig.getHostAndPort()); - hbArgs.setSenderId(myConfig.getId()); - return 
hbArgs.toBSON(); } - - TEST_F(CheckQuorumForInitiate, QuorumCheckSuccessForFiveNodes) { - // In this test, "we" are host "h3:1". All nodes respond successfully to their heartbeat - // requests, and the quorum check succeeds. - - const ReplicaSetConfig rsConfig = assertMakeRSConfig( - BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 1 << "host" << "h1:1") << - BSON("_id" << 2 << "host" << "h2:1") << - BSON("_id" << 3 << "host" << "h3:1") << - BSON("_id" << 4 << "host" << "h4:1") << - BSON("_id" << 5 << "host" << "h5:1")))); - const int myConfigIndex = 2; - const BSONObj hbRequest = makeHeartbeatRequest(rsConfig, myConfigIndex); - - startQuorumCheck(rsConfig, myConfigIndex); - const Date_t startDate = _net->now(); - const int numCommandsExpected = rsConfig.getNumMembers() - 1; - unordered_set<HostAndPort> seenHosts; - _net->enterNetwork(); - for (int i = 0; i < numCommandsExpected; ++i) { - const NetworkInterfaceMock::NetworkOperationIterator noi = _net->getNextReadyRequest(); - const ReplicationExecutor::RemoteCommandRequest& request = noi->getRequest(); - ASSERT_EQUALS("admin", request.dbname); - ASSERT_EQUALS(hbRequest, request.cmdObj); - ASSERT(seenHosts.insert(request.target).second) << - "Already saw " << request.target.toString(); + _net->runUntil(startDate + 10); + _net->exitNetwork(); + Status status = waitForQuorumCheck(); + ASSERT_EQUALS(ErrorCodes::NodeNotFound, status); + ASSERT_REASON_CONTAINS( + status, "replSetInitiate quorum check failed because not all proposed set members"); + ASSERT_NOT_REASON_CONTAINS(status, "h1:1"); + ASSERT_REASON_CONTAINS(status, "h2:1"); + ASSERT_NOT_REASON_CONTAINS(status, "h3:1"); + ASSERT_NOT_REASON_CONTAINS(status, "h4:1"); + ASSERT_NOT_REASON_CONTAINS(status, "h5:1"); + ASSERT_NOT_REASON_CONTAINS(status, "h6:1"); +} + +TEST_F(CheckQuorumForInitiate, QuorumCheckFailedDueToSetNameMismatch) { + // In this test, "we" are host "h3:1". All nodes respond + // successfully to their heartbeat requests, but quorum check fails because + // "h4" declares that the requested replica set name was not what it expected. 
+ + const ReplicaSetConfig rsConfig = + assertMakeRSConfig(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "h1:1") + << BSON("_id" << 2 << "host" + << "h2:1") << BSON("_id" << 3 << "host" + << "h3:1") + << BSON("_id" << 4 << "host" + << "h4:1") << BSON("_id" << 5 << "host" + << "h5:1")))); + const int myConfigIndex = 2; + const BSONObj hbRequest = makeHeartbeatRequest(rsConfig, myConfigIndex); + + startQuorumCheck(rsConfig, myConfigIndex); + const Date_t startDate = _net->now(); + const int numCommandsExpected = rsConfig.getNumMembers() - 1; + unordered_set<HostAndPort> seenHosts; + _net->enterNetwork(); + for (int i = 0; i < numCommandsExpected; ++i) { + const NetworkInterfaceMock::NetworkOperationIterator noi = _net->getNextReadyRequest(); + const ReplicationExecutor::RemoteCommandRequest& request = noi->getRequest(); + ASSERT_EQUALS("admin", request.dbname); + ASSERT_EQUALS(hbRequest, request.cmdObj); + ASSERT(seenHosts.insert(request.target).second) << "Already saw " + << request.target.toString(); + if (request.target == HostAndPort("h4", 1)) { _net->scheduleResponse(noi, startDate + 10, ResponseStatus(ReplicationExecutor::RemoteCommandResponse( - BSON("ok" << 1), Milliseconds(8)))); - } - _net->runUntil(startDate + 10); - _net->exitNetwork(); - ASSERT_OK(waitForQuorumCheck()); - } - - TEST_F(CheckQuorumForInitiate, QuorumCheckFailedDueToOneDownNode) { - // In this test, "we" are host "h3:1". All nodes except "h2:1" respond - // successfully to their heartbeat requests, but quorum check fails because - // all nodes must be available for initiate. This is so even though "h2" - // is neither voting nor electable. - - const ReplicaSetConfig rsConfig = assertMakeRSConfig( - BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 1 << "host" << "h1:1") << - BSON("_id" << 2 << "host" << "h2:1" << - "priority" << 0 << "votes" << 0) << - BSON("_id" << 3 << "host" << "h3:1") << - BSON("_id" << 4 << "host" << "h4:1") << - BSON("_id" << 5 << "host" << "h5:1") << - BSON("_id" << 6 << "host" << "h6:1")))); - const int myConfigIndex = 2; - const BSONObj hbRequest = makeHeartbeatRequest(rsConfig, myConfigIndex); - - startQuorumCheck(rsConfig, myConfigIndex); - const Date_t startDate = _net->now(); - const int numCommandsExpected = rsConfig.getNumMembers() - 1; - unordered_set<HostAndPort> seenHosts; - _net->enterNetwork(); - for (int i = 0; i < numCommandsExpected; ++i) { - const NetworkInterfaceMock::NetworkOperationIterator noi = _net->getNextReadyRequest(); - const ReplicationExecutor::RemoteCommandRequest& request = noi->getRequest(); - ASSERT_EQUALS("admin", request.dbname); - ASSERT_EQUALS(hbRequest, request.cmdObj); - ASSERT(seenHosts.insert(request.target).second) << - "Already saw " << request.target.toString(); - if (request.target == HostAndPort("h2", 1)) { - _net->scheduleResponse(noi, - startDate + 10, - ResponseStatus(ErrorCodes::NoSuchKey, "No response")); - } - else { - _net->scheduleResponse(noi, - startDate + 10, - ResponseStatus(ReplicationExecutor::RemoteCommandResponse( - BSON("ok" << 1), Milliseconds(8)))); - } - } - _net->runUntil(startDate + 10); - _net->exitNetwork(); - Status status = waitForQuorumCheck(); - ASSERT_EQUALS(ErrorCodes::NodeNotFound, status); - ASSERT_REASON_CONTAINS( - status, "replSetInitiate quorum check failed because not all proposed set members"); - ASSERT_NOT_REASON_CONTAINS(status, "h1:1"); - ASSERT_REASON_CONTAINS(status, "h2:1"); - ASSERT_NOT_REASON_CONTAINS(status, 
"h3:1"); - ASSERT_NOT_REASON_CONTAINS(status, "h4:1"); - ASSERT_NOT_REASON_CONTAINS(status, "h5:1"); - ASSERT_NOT_REASON_CONTAINS(status, "h6:1"); - } - - TEST_F(CheckQuorumForInitiate, QuorumCheckFailedDueToSetNameMismatch) { - // In this test, "we" are host "h3:1". All nodes respond - // successfully to their heartbeat requests, but quorum check fails because - // "h4" declares that the requested replica set name was not what it expected. - - const ReplicaSetConfig rsConfig = assertMakeRSConfig( - BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 1 << "host" << "h1:1") << - BSON("_id" << 2 << "host" << "h2:1") << - BSON("_id" << 3 << "host" << "h3:1") << - BSON("_id" << 4 << "host" << "h4:1") << - BSON("_id" << 5 << "host" << "h5:1")))); - const int myConfigIndex = 2; - const BSONObj hbRequest = makeHeartbeatRequest(rsConfig, myConfigIndex); - - startQuorumCheck(rsConfig, myConfigIndex); - const Date_t startDate = _net->now(); - const int numCommandsExpected = rsConfig.getNumMembers() - 1; - unordered_set<HostAndPort> seenHosts; - _net->enterNetwork(); - for (int i = 0; i < numCommandsExpected; ++i) { - const NetworkInterfaceMock::NetworkOperationIterator noi = _net->getNextReadyRequest(); - const ReplicationExecutor::RemoteCommandRequest& request = noi->getRequest(); - ASSERT_EQUALS("admin", request.dbname); - ASSERT_EQUALS(hbRequest, request.cmdObj); - ASSERT(seenHosts.insert(request.target).second) << - "Already saw " << request.target.toString(); - if (request.target == HostAndPort("h4", 1)) { - _net->scheduleResponse(noi, - startDate + 10, - ResponseStatus(ReplicationExecutor::RemoteCommandResponse( - BSON("ok" << 0 << "mismatch" << true), - Milliseconds(8)))); - } - else { - _net->scheduleResponse(noi, - startDate + 10, - ResponseStatus(ReplicationExecutor::RemoteCommandResponse( - BSON("ok" << 1), Milliseconds(8)))); - } + BSON("ok" << 0 << "mismatch" << true), Milliseconds(8)))); + } else { + _net->scheduleResponse(noi, + startDate + 10, + ResponseStatus(ReplicationExecutor::RemoteCommandResponse( + BSON("ok" << 1), Milliseconds(8)))); } - _net->runUntil(startDate + 10); - _net->exitNetwork(); - Status status = waitForQuorumCheck(); - ASSERT_EQUALS(ErrorCodes::NewReplicaSetConfigurationIncompatible, status); - ASSERT_REASON_CONTAINS( - status, "Our set name did not match"); - ASSERT_NOT_REASON_CONTAINS(status, "h1:1"); - ASSERT_NOT_REASON_CONTAINS(status, "h2:1"); - ASSERT_NOT_REASON_CONTAINS(status, "h3:1"); - ASSERT_REASON_CONTAINS(status, "h4:1"); - ASSERT_NOT_REASON_CONTAINS(status, "h5:1"); } - - TEST_F(CheckQuorumForInitiate, QuorumCheckFailedDueToInitializedNode) { - // In this test, "we" are host "h3:1". All nodes respond - // successfully to their heartbeat requests, but quorum check fails because - // "h5" declares that it is already initialized. 
- - const ReplicaSetConfig rsConfig = assertMakeRSConfig( - BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 1 << "host" << "h1:1") << - BSON("_id" << 2 << "host" << "h2:1") << - BSON("_id" << 3 << "host" << "h3:1") << - BSON("_id" << 4 << "host" << "h4:1") << - BSON("_id" << 5 << "host" << "h5:1")))); - const int myConfigIndex = 2; - const BSONObj hbRequest = makeHeartbeatRequest(rsConfig, myConfigIndex); - - startQuorumCheck(rsConfig, myConfigIndex); - const Date_t startDate = _net->now(); - const int numCommandsExpected = rsConfig.getNumMembers() - 1; - unordered_set<HostAndPort> seenHosts; - _net->enterNetwork(); - for (int i = 0; i < numCommandsExpected; ++i) { - const NetworkInterfaceMock::NetworkOperationIterator noi = _net->getNextReadyRequest(); - const ReplicationExecutor::RemoteCommandRequest& request = noi->getRequest(); - ASSERT_EQUALS("admin", request.dbname); - ASSERT_EQUALS(hbRequest, request.cmdObj); - ASSERT(seenHosts.insert(request.target).second) << - "Already saw " << request.target.toString(); - if (request.target == HostAndPort("h5", 1)) { - _net->scheduleResponse(noi, - startDate + 10, - ResponseStatus(ReplicationExecutor::RemoteCommandResponse( - BSON("ok" << 0 << - "set" << "rs0" << - "v" << 1), - Milliseconds(8)))); - } - else { - _net->scheduleResponse(noi, - startDate + 10, - ResponseStatus(ReplicationExecutor::RemoteCommandResponse( - BSON("ok" << 1), Milliseconds(8)))); - } + _net->runUntil(startDate + 10); + _net->exitNetwork(); + Status status = waitForQuorumCheck(); + ASSERT_EQUALS(ErrorCodes::NewReplicaSetConfigurationIncompatible, status); + ASSERT_REASON_CONTAINS(status, "Our set name did not match"); + ASSERT_NOT_REASON_CONTAINS(status, "h1:1"); + ASSERT_NOT_REASON_CONTAINS(status, "h2:1"); + ASSERT_NOT_REASON_CONTAINS(status, "h3:1"); + ASSERT_REASON_CONTAINS(status, "h4:1"); + ASSERT_NOT_REASON_CONTAINS(status, "h5:1"); +} + +TEST_F(CheckQuorumForInitiate, QuorumCheckFailedDueToInitializedNode) { + // In this test, "we" are host "h3:1". All nodes respond + // successfully to their heartbeat requests, but quorum check fails because + // "h5" declares that it is already initialized. 
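
The fatal reply below carries the responder's existing replica set state (the "set" and "v" fields). The rule it trips is the config-version veto; a sketch with an assumed local variable name (the real handling lives in QuorumChecker::_tabulateHeartbeatResponse(), and the reconfig test further down exercises the same path):

    // remoteConfigVersion stands in for the "v" field of the peer's reply.
    if (remoteConfigVersion >= _rsConfig->getConfigVersion()) {
        _vetoStatus = Status(ErrorCodes::NewReplicaSetConfigurationIncompatible,
                             "Our config version of ... is no larger than the version ...");
    }
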
+ + const ReplicaSetConfig rsConfig = + assertMakeRSConfig(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "h1:1") + << BSON("_id" << 2 << "host" + << "h2:1") << BSON("_id" << 3 << "host" + << "h3:1") + << BSON("_id" << 4 << "host" + << "h4:1") << BSON("_id" << 5 << "host" + << "h5:1")))); + const int myConfigIndex = 2; + const BSONObj hbRequest = makeHeartbeatRequest(rsConfig, myConfigIndex); + + startQuorumCheck(rsConfig, myConfigIndex); + const Date_t startDate = _net->now(); + const int numCommandsExpected = rsConfig.getNumMembers() - 1; + unordered_set<HostAndPort> seenHosts; + _net->enterNetwork(); + for (int i = 0; i < numCommandsExpected; ++i) { + const NetworkInterfaceMock::NetworkOperationIterator noi = _net->getNextReadyRequest(); + const ReplicationExecutor::RemoteCommandRequest& request = noi->getRequest(); + ASSERT_EQUALS("admin", request.dbname); + ASSERT_EQUALS(hbRequest, request.cmdObj); + ASSERT(seenHosts.insert(request.target).second) << "Already saw " + << request.target.toString(); + if (request.target == HostAndPort("h5", 1)) { + _net->scheduleResponse( + noi, + startDate + 10, + ResponseStatus(ReplicationExecutor::RemoteCommandResponse(BSON("ok" << 0 << "set" + << "rs0" + << "v" << 1), + Milliseconds(8)))); + } else { + _net->scheduleResponse(noi, + startDate + 10, + ResponseStatus(ReplicationExecutor::RemoteCommandResponse( + BSON("ok" << 1), Milliseconds(8)))); } - _net->runUntil(startDate + 10); - _net->exitNetwork(); - Status status = waitForQuorumCheck(); - ASSERT_EQUALS(ErrorCodes::NewReplicaSetConfigurationIncompatible, status); - ASSERT_REASON_CONTAINS( - status, "Our config version of"); - ASSERT_REASON_CONTAINS( - status, "is no larger than the version"); - ASSERT_NOT_REASON_CONTAINS(status, "h1:1"); - ASSERT_NOT_REASON_CONTAINS(status, "h2:1"); - ASSERT_NOT_REASON_CONTAINS(status, "h3:1"); - ASSERT_NOT_REASON_CONTAINS(status, "h4:1"); - ASSERT_REASON_CONTAINS(status, "h5:1"); } - - TEST_F(CheckQuorumForInitiate, QuorumCheckFailedDueToInitializedNodeOnlyOneRespondent) { - // In this test, "we" are host "h3:1". Only node "h5" responds before the test completes, - // and quorum check fails because "h5" declares that it is already initialized. - // - // Compare to QuorumCheckFailedDueToInitializedNode, above. 
- - const ReplicaSetConfig rsConfig = assertMakeRSConfig( - BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 1 << "host" << "h1:1") << - BSON("_id" << 2 << "host" << "h2:1") << - BSON("_id" << 3 << "host" << "h3:1") << - BSON("_id" << 4 << "host" << "h4:1") << - BSON("_id" << 5 << "host" << "h5:1")))); - const int myConfigIndex = 2; - const BSONObj hbRequest = makeHeartbeatRequest(rsConfig, myConfigIndex); - - startQuorumCheck(rsConfig, myConfigIndex); - const Date_t startDate = _net->now(); - const int numCommandsExpected = rsConfig.getNumMembers() - 1; - unordered_set<HostAndPort> seenHosts; - _net->enterNetwork(); - for (int i = 0; i < numCommandsExpected; ++i) { - const NetworkInterfaceMock::NetworkOperationIterator noi = _net->getNextReadyRequest(); - const ReplicationExecutor::RemoteCommandRequest& request = noi->getRequest(); - ASSERT_EQUALS("admin", request.dbname); - ASSERT_EQUALS(hbRequest, request.cmdObj); - ASSERT(seenHosts.insert(request.target).second) << - "Already saw " << request.target.toString(); - if (request.target == HostAndPort("h5", 1)) { - _net->scheduleResponse(noi, - startDate + 10, - ResponseStatus(ReplicationExecutor::RemoteCommandResponse( - BSON("ok" << 0 << - "set" << "rs0" << - "v" << 1), - Milliseconds(8)))); - } - else { - _net->blackHole(noi); - } + _net->runUntil(startDate + 10); + _net->exitNetwork(); + Status status = waitForQuorumCheck(); + ASSERT_EQUALS(ErrorCodes::NewReplicaSetConfigurationIncompatible, status); + ASSERT_REASON_CONTAINS(status, "Our config version of"); + ASSERT_REASON_CONTAINS(status, "is no larger than the version"); + ASSERT_NOT_REASON_CONTAINS(status, "h1:1"); + ASSERT_NOT_REASON_CONTAINS(status, "h2:1"); + ASSERT_NOT_REASON_CONTAINS(status, "h3:1"); + ASSERT_NOT_REASON_CONTAINS(status, "h4:1"); + ASSERT_REASON_CONTAINS(status, "h5:1"); +} + +TEST_F(CheckQuorumForInitiate, QuorumCheckFailedDueToInitializedNodeOnlyOneRespondent) { + // In this test, "we" are host "h3:1". Only node "h5" responds before the test completes, + // and quorum check fails because "h5" declares that it is already initialized. + // + // Compare to QuorumCheckFailedDueToInitializedNode, above. 
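
The reason a single reply is enough: a veto is decisive. hasReceivedSufficientResponses() is consulted after every response, so once _vetoStatus is non-OK the scatter-gather stops without waiting on the black-holed requests. A sketch of that early exit, assuming the veto check precedes the counting logic quoted at the top of this diff:

    bool QuorumChecker::hasReceivedSufficientResponses() const {
        if (!_vetoStatus.isOK()) {
            return true;  // decisive failure; do not wait for stragglers
        }
        // ... otherwise fall through to the response-count, voter-majority and
        // electability checks.
        return false;
    }
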
+ + const ReplicaSetConfig rsConfig = + assertMakeRSConfig(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "h1:1") + << BSON("_id" << 2 << "host" + << "h2:1") << BSON("_id" << 3 << "host" + << "h3:1") + << BSON("_id" << 4 << "host" + << "h4:1") << BSON("_id" << 5 << "host" + << "h5:1")))); + const int myConfigIndex = 2; + const BSONObj hbRequest = makeHeartbeatRequest(rsConfig, myConfigIndex); + + startQuorumCheck(rsConfig, myConfigIndex); + const Date_t startDate = _net->now(); + const int numCommandsExpected = rsConfig.getNumMembers() - 1; + unordered_set<HostAndPort> seenHosts; + _net->enterNetwork(); + for (int i = 0; i < numCommandsExpected; ++i) { + const NetworkInterfaceMock::NetworkOperationIterator noi = _net->getNextReadyRequest(); + const ReplicationExecutor::RemoteCommandRequest& request = noi->getRequest(); + ASSERT_EQUALS("admin", request.dbname); + ASSERT_EQUALS(hbRequest, request.cmdObj); + ASSERT(seenHosts.insert(request.target).second) << "Already saw " + << request.target.toString(); + if (request.target == HostAndPort("h5", 1)) { + _net->scheduleResponse( + noi, + startDate + 10, + ResponseStatus(ReplicationExecutor::RemoteCommandResponse(BSON("ok" << 0 << "set" + << "rs0" + << "v" << 1), + Milliseconds(8)))); + } else { + _net->blackHole(noi); } - _net->runUntil(startDate + 10); - _net->exitNetwork(); - Status status = waitForQuorumCheck(); - ASSERT_EQUALS(ErrorCodes::NewReplicaSetConfigurationIncompatible, status); - ASSERT_REASON_CONTAINS( - status, "Our config version of"); - ASSERT_REASON_CONTAINS( - status, "is no larger than the version"); - ASSERT_NOT_REASON_CONTAINS(status, "h1:1"); - ASSERT_NOT_REASON_CONTAINS(status, "h2:1"); - ASSERT_NOT_REASON_CONTAINS(status, "h3:1"); - ASSERT_NOT_REASON_CONTAINS(status, "h4:1"); - ASSERT_REASON_CONTAINS(status, "h5:1"); } - - TEST_F(CheckQuorumForInitiate, QuorumCheckFailedDueToNodeWithData) { - // In this test, "we" are host "h3:1". Only node "h5" responds before the test completes, - // and quorum check fails because "h5" declares that it has data already. 
- - const ReplicaSetConfig rsConfig = assertMakeRSConfig( - BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 1 << "host" << "h1:1") << - BSON("_id" << 2 << "host" << "h2:1") << - BSON("_id" << 3 << "host" << "h3:1") << - BSON("_id" << 4 << "host" << "h4:1") << - BSON("_id" << 5 << "host" << "h5:1")))); - const int myConfigIndex = 2; - const BSONObj hbRequest = makeHeartbeatRequest(rsConfig, myConfigIndex); - - startQuorumCheck(rsConfig, myConfigIndex); - const Date_t startDate = _net->now(); - const int numCommandsExpected = rsConfig.getNumMembers() - 1; - unordered_set<HostAndPort> seenHosts; - _net->enterNetwork(); - for (int i = 0; i < numCommandsExpected; ++i) { - const NetworkInterfaceMock::NetworkOperationIterator noi = _net->getNextReadyRequest(); - const ReplicationExecutor::RemoteCommandRequest& request = noi->getRequest(); - ASSERT_EQUALS("admin", request.dbname); - ASSERT_EQUALS(hbRequest, request.cmdObj); - ASSERT(seenHosts.insert(request.target).second) << - "Already saw " << request.target.toString(); - ReplSetHeartbeatResponse hbResp; - hbResp.setVersion(0); - hbResp.noteHasData(); - if (request.target == HostAndPort("h5", 1)) { - _net->scheduleResponse(noi, - startDate + 10, - ResponseStatus(ReplicationExecutor::RemoteCommandResponse( - hbResp.toBSON(), - Milliseconds(8)))); - } - else { - _net->blackHole(noi); - } + _net->runUntil(startDate + 10); + _net->exitNetwork(); + Status status = waitForQuorumCheck(); + ASSERT_EQUALS(ErrorCodes::NewReplicaSetConfigurationIncompatible, status); + ASSERT_REASON_CONTAINS(status, "Our config version of"); + ASSERT_REASON_CONTAINS(status, "is no larger than the version"); + ASSERT_NOT_REASON_CONTAINS(status, "h1:1"); + ASSERT_NOT_REASON_CONTAINS(status, "h2:1"); + ASSERT_NOT_REASON_CONTAINS(status, "h3:1"); + ASSERT_NOT_REASON_CONTAINS(status, "h4:1"); + ASSERT_REASON_CONTAINS(status, "h5:1"); +} + +TEST_F(CheckQuorumForInitiate, QuorumCheckFailedDueToNodeWithData) { + // In this test, "we" are host "h3:1". Only node "h5" responds before the test completes, + // and quorum check fails because "h5" declares that it has data already. 
+ + const ReplicaSetConfig rsConfig = + assertMakeRSConfig(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "h1:1") + << BSON("_id" << 2 << "host" + << "h2:1") << BSON("_id" << 3 << "host" + << "h3:1") + << BSON("_id" << 4 << "host" + << "h4:1") << BSON("_id" << 5 << "host" + << "h5:1")))); + const int myConfigIndex = 2; + const BSONObj hbRequest = makeHeartbeatRequest(rsConfig, myConfigIndex); + + startQuorumCheck(rsConfig, myConfigIndex); + const Date_t startDate = _net->now(); + const int numCommandsExpected = rsConfig.getNumMembers() - 1; + unordered_set<HostAndPort> seenHosts; + _net->enterNetwork(); + for (int i = 0; i < numCommandsExpected; ++i) { + const NetworkInterfaceMock::NetworkOperationIterator noi = _net->getNextReadyRequest(); + const ReplicationExecutor::RemoteCommandRequest& request = noi->getRequest(); + ASSERT_EQUALS("admin", request.dbname); + ASSERT_EQUALS(hbRequest, request.cmdObj); + ASSERT(seenHosts.insert(request.target).second) << "Already saw " + << request.target.toString(); + ReplSetHeartbeatResponse hbResp; + hbResp.setVersion(0); + hbResp.noteHasData(); + if (request.target == HostAndPort("h5", 1)) { + _net->scheduleResponse(noi, + startDate + 10, + ResponseStatus(ReplicationExecutor::RemoteCommandResponse( + hbResp.toBSON(), Milliseconds(8)))); + } else { + _net->blackHole(noi); } - _net->runUntil(startDate + 10); - _net->exitNetwork(); - Status status = waitForQuorumCheck(); - ASSERT_EQUALS(ErrorCodes::CannotInitializeNodeWithData, status); - ASSERT_REASON_CONTAINS( - status, "has data already"); - ASSERT_NOT_REASON_CONTAINS(status, "h1:1"); - ASSERT_NOT_REASON_CONTAINS(status, "h2:1"); - ASSERT_NOT_REASON_CONTAINS(status, "h3:1"); - ASSERT_NOT_REASON_CONTAINS(status, "h4:1"); - ASSERT_REASON_CONTAINS(status, "h5:1"); } - TEST_F(CheckQuorumForReconfig, QuorumCheckVetoedDueToHigherConfigVersion) { - // In this test, "we" are host "h3:1". The request to "h2" does not arrive before the end - // of the test, and the request to "h1" comes back indicating a higher config version. 
- - const ReplicaSetConfig rsConfig = assertMakeRSConfig( - BSON("_id" << "rs0" << - "version" << 2 << - "members" << BSON_ARRAY( - BSON("_id" << 1 << "host" << "h1:1") << - BSON("_id" << 2 << "host" << "h2:1") << - BSON("_id" << 3 << "host" << "h3:1")))); - const int myConfigIndex = 2; - const BSONObj hbRequest = makeHeartbeatRequest(rsConfig, myConfigIndex); - - startQuorumCheck(rsConfig, myConfigIndex); - const Date_t startDate = _net->now(); - const int numCommandsExpected = rsConfig.getNumMembers() - 1; - unordered_set<HostAndPort> seenHosts; - _net->enterNetwork(); - for (int i = 0; i < numCommandsExpected; ++i) { - const NetworkInterfaceMock::NetworkOperationIterator noi = _net->getNextReadyRequest(); - const ReplicationExecutor::RemoteCommandRequest& request = noi->getRequest(); - ASSERT_EQUALS("admin", request.dbname); - ASSERT_EQUALS(hbRequest, request.cmdObj); - ASSERT(seenHosts.insert(request.target).second) << - "Already saw " << request.target.toString(); - if (request.target == HostAndPort("h1", 1)) { - _net->scheduleResponse(noi, - startDate + 10, - ResponseStatus(ReplicationExecutor::RemoteCommandResponse( - BSON("ok" << 0 << - "set" << "rs0" << - "v" << 5), - Milliseconds(8)))); - } - else { - _net->blackHole(noi); - } + _net->runUntil(startDate + 10); + _net->exitNetwork(); + Status status = waitForQuorumCheck(); + ASSERT_EQUALS(ErrorCodes::CannotInitializeNodeWithData, status); + ASSERT_REASON_CONTAINS(status, "has data already"); + ASSERT_NOT_REASON_CONTAINS(status, "h1:1"); + ASSERT_NOT_REASON_CONTAINS(status, "h2:1"); + ASSERT_NOT_REASON_CONTAINS(status, "h3:1"); + ASSERT_NOT_REASON_CONTAINS(status, "h4:1"); + ASSERT_REASON_CONTAINS(status, "h5:1"); +} +TEST_F(CheckQuorumForReconfig, QuorumCheckVetoedDueToHigherConfigVersion) { + // In this test, "we" are host "h3:1". The request to "h2" does not arrive before the end + // of the test, and the request to "h1" comes back indicating a higher config version. 
+ + const ReplicaSetConfig rsConfig = + assertMakeRSConfig(BSON("_id" + << "rs0" + << "version" << 2 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "h1:1") + << BSON("_id" << 2 << "host" + << "h2:1") << BSON("_id" << 3 << "host" + << "h3:1")))); + const int myConfigIndex = 2; + const BSONObj hbRequest = makeHeartbeatRequest(rsConfig, myConfigIndex); + + startQuorumCheck(rsConfig, myConfigIndex); + const Date_t startDate = _net->now(); + const int numCommandsExpected = rsConfig.getNumMembers() - 1; + unordered_set<HostAndPort> seenHosts; + _net->enterNetwork(); + for (int i = 0; i < numCommandsExpected; ++i) { + const NetworkInterfaceMock::NetworkOperationIterator noi = _net->getNextReadyRequest(); + const ReplicationExecutor::RemoteCommandRequest& request = noi->getRequest(); + ASSERT_EQUALS("admin", request.dbname); + ASSERT_EQUALS(hbRequest, request.cmdObj); + ASSERT(seenHosts.insert(request.target).second) << "Already saw " + << request.target.toString(); + if (request.target == HostAndPort("h1", 1)) { + _net->scheduleResponse( + noi, + startDate + 10, + ResponseStatus(ReplicationExecutor::RemoteCommandResponse(BSON("ok" << 0 << "set" + << "rs0" + << "v" << 5), + Milliseconds(8)))); + } else { + _net->blackHole(noi); } - _net->runUntil(startDate + 10); - _net->exitNetwork(); - Status status = waitForQuorumCheck(); - ASSERT_EQUALS(ErrorCodes::NewReplicaSetConfigurationIncompatible, status); - ASSERT_REASON_CONTAINS( - status, "Our config version of"); - ASSERT_REASON_CONTAINS( - status, "is no larger than the version"); - ASSERT_REASON_CONTAINS(status, "h1:1"); - ASSERT_NOT_REASON_CONTAINS(status, "h2:1"); - ASSERT_NOT_REASON_CONTAINS(status, "h3:1"); } - - TEST_F(CheckQuorumForReconfig, QuorumCheckVetoedDueToIncompatibleSetName) { - // In this test, "we" are host "h3:1". The request to "h1" times out, - // and the request to "h2" comes back indicating an incompatible set name. 
- - const ReplicaSetConfig rsConfig = assertMakeRSConfig( - BSON("_id" << "rs0" << - "version" << 2 << - "members" << BSON_ARRAY( - BSON("_id" << 1 << "host" << "h1:1") << - BSON("_id" << 2 << "host" << "h2:1") << - BSON("_id" << 3 << "host" << "h3:1")))); - const int myConfigIndex = 2; - const BSONObj hbRequest = makeHeartbeatRequest(rsConfig, myConfigIndex); - - startQuorumCheck(rsConfig, myConfigIndex); - const Date_t startDate = _net->now(); - const int numCommandsExpected = rsConfig.getNumMembers() - 1; - unordered_set<HostAndPort> seenHosts; - _net->enterNetwork(); - for (int i = 0; i < numCommandsExpected; ++i) { - const NetworkInterfaceMock::NetworkOperationIterator noi = _net->getNextReadyRequest(); - const ReplicationExecutor::RemoteCommandRequest& request = noi->getRequest(); - ASSERT_EQUALS("admin", request.dbname); - ASSERT_EQUALS(hbRequest, request.cmdObj); - ASSERT(seenHosts.insert(request.target).second) << - "Already saw " << request.target.toString(); - if (request.target == HostAndPort("h2", 1)) { - _net->scheduleResponse(noi, - startDate + 10, - ResponseStatus(ReplicationExecutor::RemoteCommandResponse( - BSON("ok" << 0 << "mismatch" << true), - Milliseconds(8)))); - } - else { - _net->scheduleResponse(noi, - startDate + 10, - ResponseStatus(ErrorCodes::NoSuchKey, "No response")); - } + _net->runUntil(startDate + 10); + _net->exitNetwork(); + Status status = waitForQuorumCheck(); + ASSERT_EQUALS(ErrorCodes::NewReplicaSetConfigurationIncompatible, status); + ASSERT_REASON_CONTAINS(status, "Our config version of"); + ASSERT_REASON_CONTAINS(status, "is no larger than the version"); + ASSERT_REASON_CONTAINS(status, "h1:1"); + ASSERT_NOT_REASON_CONTAINS(status, "h2:1"); + ASSERT_NOT_REASON_CONTAINS(status, "h3:1"); +} + +TEST_F(CheckQuorumForReconfig, QuorumCheckVetoedDueToIncompatibleSetName) { + // In this test, "we" are host "h3:1". The request to "h1" times out, + // and the request to "h2" comes back indicating an incompatible set name. 
+ + const ReplicaSetConfig rsConfig = + assertMakeRSConfig(BSON("_id" + << "rs0" + << "version" << 2 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "h1:1") + << BSON("_id" << 2 << "host" + << "h2:1") << BSON("_id" << 3 << "host" + << "h3:1")))); + const int myConfigIndex = 2; + const BSONObj hbRequest = makeHeartbeatRequest(rsConfig, myConfigIndex); + + startQuorumCheck(rsConfig, myConfigIndex); + const Date_t startDate = _net->now(); + const int numCommandsExpected = rsConfig.getNumMembers() - 1; + unordered_set<HostAndPort> seenHosts; + _net->enterNetwork(); + for (int i = 0; i < numCommandsExpected; ++i) { + const NetworkInterfaceMock::NetworkOperationIterator noi = _net->getNextReadyRequest(); + const ReplicationExecutor::RemoteCommandRequest& request = noi->getRequest(); + ASSERT_EQUALS("admin", request.dbname); + ASSERT_EQUALS(hbRequest, request.cmdObj); + ASSERT(seenHosts.insert(request.target).second) << "Already saw " + << request.target.toString(); + if (request.target == HostAndPort("h2", 1)) { + _net->scheduleResponse(noi, + startDate + 10, + ResponseStatus(ReplicationExecutor::RemoteCommandResponse( + BSON("ok" << 0 << "mismatch" << true), Milliseconds(8)))); + } else { + _net->scheduleResponse( + noi, startDate + 10, ResponseStatus(ErrorCodes::NoSuchKey, "No response")); } - _net->runUntil(startDate + 10); - _net->exitNetwork(); - Status status = waitForQuorumCheck(); - ASSERT_EQUALS(ErrorCodes::NewReplicaSetConfigurationIncompatible, status); - ASSERT_REASON_CONTAINS(status, "Our set name did not match"); - ASSERT_NOT_REASON_CONTAINS(status, "h1:1"); - ASSERT_REASON_CONTAINS(status, "h2:1"); - ASSERT_NOT_REASON_CONTAINS(status, "h3:1"); - } - - TEST_F(CheckQuorumForReconfig, QuorumCheckFailsDueToInsufficientVoters) { - // In this test, "we" are host "h4". Only "h1", "h2" and "h3" are voters, - // and of the voters, only "h1" responds. As a result, quorum check fails. - // "h5" also responds, but because it cannot vote, is irrelevant for the reconfig - // quorum check. 
- - const ReplicaSetConfig rsConfig = assertMakeRSConfig( - BSON("_id" << "rs0" << - "version" << 2 << - "members" << BSON_ARRAY( - BSON("_id" << 1 << "host" << "h1:1") << - BSON("_id" << 2 << "host" << "h2:1") << - BSON("_id" << 3 << "host" << "h3:1") << - BSON("_id" << 4 << "host" << "h4:1" << "votes" << 0) << - BSON("_id" << 5 << "host" << "h5:1" << "votes" << 0)))); - const int myConfigIndex = 3; - const BSONObj hbRequest = makeHeartbeatRequest(rsConfig, myConfigIndex); - - startQuorumCheck(rsConfig, myConfigIndex); - const Date_t startDate = _net->now(); - const int numCommandsExpected = rsConfig.getNumMembers() - 1; - unordered_set<HostAndPort> seenHosts; - _net->enterNetwork(); - for (int i = 0; i < numCommandsExpected; ++i) { - const NetworkInterfaceMock::NetworkOperationIterator noi = _net->getNextReadyRequest(); - const ReplicationExecutor::RemoteCommandRequest& request = noi->getRequest(); - ASSERT_EQUALS("admin", request.dbname); - ASSERT_EQUALS(hbRequest, request.cmdObj); - ASSERT(seenHosts.insert(request.target).second) << - "Already saw " << request.target.toString(); - if (request.target == HostAndPort("h1", 1) || request.target == HostAndPort("h5", 1)) { - _net->scheduleResponse(noi, - startDate + 10, - ResponseStatus(ReplicationExecutor::RemoteCommandResponse( - BSON("ok" << 1), - Milliseconds(8)))); - } - else { - _net->scheduleResponse(noi, - startDate + 10, - ResponseStatus(ErrorCodes::NoSuchKey, "No response")); - } + _net->runUntil(startDate + 10); + _net->exitNetwork(); + Status status = waitForQuorumCheck(); + ASSERT_EQUALS(ErrorCodes::NewReplicaSetConfigurationIncompatible, status); + ASSERT_REASON_CONTAINS(status, "Our set name did not match"); + ASSERT_NOT_REASON_CONTAINS(status, "h1:1"); + ASSERT_REASON_CONTAINS(status, "h2:1"); + ASSERT_NOT_REASON_CONTAINS(status, "h3:1"); +} + +TEST_F(CheckQuorumForReconfig, QuorumCheckFailsDueToInsufficientVoters) { + // In this test, "we" are host "h4". Only "h1", "h2" and "h3" are voters, + // and of the voters, only "h1" responds. As a result, quorum check fails. + // "h5" also responds, but because it cannot vote, is irrelevant for the reconfig + // quorum check. 
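
The arithmetic behind the "required 2" in the asserted reason string: three members carry votes, the executing node h4 does not, and only h1 answers. Assuming the usual majority formula of floor(n/2) + 1:

    int votingMembers = 3;                          // h1, h2, h3 (h4 and h5 have votes: 0)
    int majorityVoteCount = votingMembers / 2 + 1;  // == 2
    int votersHeardFrom = 1;                        // only h1 responds in time
    bool quorumMet = votersHeardFrom >= majorityVoteCount;  // false -> NodeNotFound
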
+ + const ReplicaSetConfig rsConfig = + assertMakeRSConfig(BSON("_id" + << "rs0" + << "version" << 2 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "h1:1") + << BSON("_id" << 2 << "host" + << "h2:1") << BSON("_id" << 3 << "host" + << "h3:1") + << BSON("_id" << 4 << "host" + << "h4:1" + << "votes" << 0) + << BSON("_id" << 5 << "host" + << "h5:1" + << "votes" << 0)))); + const int myConfigIndex = 3; + const BSONObj hbRequest = makeHeartbeatRequest(rsConfig, myConfigIndex); + + startQuorumCheck(rsConfig, myConfigIndex); + const Date_t startDate = _net->now(); + const int numCommandsExpected = rsConfig.getNumMembers() - 1; + unordered_set<HostAndPort> seenHosts; + _net->enterNetwork(); + for (int i = 0; i < numCommandsExpected; ++i) { + const NetworkInterfaceMock::NetworkOperationIterator noi = _net->getNextReadyRequest(); + const ReplicationExecutor::RemoteCommandRequest& request = noi->getRequest(); + ASSERT_EQUALS("admin", request.dbname); + ASSERT_EQUALS(hbRequest, request.cmdObj); + ASSERT(seenHosts.insert(request.target).second) << "Already saw " + << request.target.toString(); + if (request.target == HostAndPort("h1", 1) || request.target == HostAndPort("h5", 1)) { + _net->scheduleResponse(noi, + startDate + 10, + ResponseStatus(ReplicationExecutor::RemoteCommandResponse( + BSON("ok" << 1), Milliseconds(8)))); + } else { + _net->scheduleResponse( + noi, startDate + 10, ResponseStatus(ErrorCodes::NoSuchKey, "No response")); } - _net->runUntil(startDate + 10); - _net->exitNetwork(); - Status status = waitForQuorumCheck(); - ASSERT_EQUALS(ErrorCodes::NodeNotFound, status); - ASSERT_REASON_CONTAINS(status, "not enough voting nodes responded; required 2 but only"); - ASSERT_REASON_CONTAINS(status, "h1:1"); - ASSERT_REASON_CONTAINS(status, "h2:1 failed with"); - ASSERT_REASON_CONTAINS(status, "h3:1 failed with"); - ASSERT_NOT_REASON_CONTAINS(status, "h4:1"); - ASSERT_NOT_REASON_CONTAINS(status, "h5:1"); } - - TEST_F(CheckQuorumForReconfig, QuorumCheckFailsDueToNoElectableNodeResponding) { - // In this test, "we" are host "h4". Only "h1", "h2" and "h3" are electable, - // and none of them respond. 
- - const ReplicaSetConfig rsConfig = assertMakeRSConfig( - BSON("_id" << "rs0" << - "version" << 2 << - "members" << BSON_ARRAY( - BSON("_id" << 1 << "host" << "h1:1") << - BSON("_id" << 2 << "host" << "h2:1") << - BSON("_id" << 3 << "host" << "h3:1") << - BSON("_id" << 4 << "host" << "h4:1" << "priority" << 0) << - BSON("_id" << 5 << "host" << "h5:1" << "priority" << 0)))); - const int myConfigIndex = 3; - const BSONObj hbRequest = makeHeartbeatRequest(rsConfig, myConfigIndex); - - startQuorumCheck(rsConfig, myConfigIndex); - const Date_t startDate = _net->now(); - const int numCommandsExpected = rsConfig.getNumMembers() - 1; - unordered_set<HostAndPort> seenHosts; - _net->enterNetwork(); - for (int i = 0; i < numCommandsExpected; ++i) { - const NetworkInterfaceMock::NetworkOperationIterator noi = _net->getNextReadyRequest(); - const ReplicationExecutor::RemoteCommandRequest& request = noi->getRequest(); - ASSERT_EQUALS("admin", request.dbname); - ASSERT_EQUALS(hbRequest, request.cmdObj); - ASSERT(seenHosts.insert(request.target).second) << - "Already saw " << request.target.toString(); - if (request.target == HostAndPort("h5", 1)) { - _net->scheduleResponse(noi, - startDate + 10, - ResponseStatus(ReplicationExecutor::RemoteCommandResponse( - BSON("ok" << 1), - Milliseconds(8)))); - } - else { - _net->scheduleResponse(noi, - startDate + 10, - ResponseStatus(ErrorCodes::NoSuchKey, "No response")); - } + _net->runUntil(startDate + 10); + _net->exitNetwork(); + Status status = waitForQuorumCheck(); + ASSERT_EQUALS(ErrorCodes::NodeNotFound, status); + ASSERT_REASON_CONTAINS(status, "not enough voting nodes responded; required 2 but only"); + ASSERT_REASON_CONTAINS(status, "h1:1"); + ASSERT_REASON_CONTAINS(status, "h2:1 failed with"); + ASSERT_REASON_CONTAINS(status, "h3:1 failed with"); + ASSERT_NOT_REASON_CONTAINS(status, "h4:1"); + ASSERT_NOT_REASON_CONTAINS(status, "h5:1"); +} + +TEST_F(CheckQuorumForReconfig, QuorumCheckFailsDueToNoElectableNodeResponding) { + // In this test, "we" are host "h4". Only "h1", "h2" and "h3" are electable, + // and none of them respond. 
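
Electability here combines voting rights and priority: h4 and h5 are configured with priority 0, so even a successful reply from h5 cannot satisfy the at-least-one-electable-responder condition. A sketch of the assumed predicate (getNumVotes() appears elsewhere in this diff; getPriority() is an assumed MemberConfig accessor):

    bool isElectable(const MemberConfig& m) {
        return m.getNumVotes() > 0 && m.getPriority() > 0;  // priority 0 = passive
    }
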
+ + const ReplicaSetConfig rsConfig = + assertMakeRSConfig(BSON("_id" + << "rs0" + << "version" << 2 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "h1:1") + << BSON("_id" << 2 << "host" + << "h2:1") << BSON("_id" << 3 << "host" + << "h3:1") + << BSON("_id" << 4 << "host" + << "h4:1" + << "priority" << 0) + << BSON("_id" << 5 << "host" + << "h5:1" + << "priority" << 0)))); + const int myConfigIndex = 3; + const BSONObj hbRequest = makeHeartbeatRequest(rsConfig, myConfigIndex); + + startQuorumCheck(rsConfig, myConfigIndex); + const Date_t startDate = _net->now(); + const int numCommandsExpected = rsConfig.getNumMembers() - 1; + unordered_set<HostAndPort> seenHosts; + _net->enterNetwork(); + for (int i = 0; i < numCommandsExpected; ++i) { + const NetworkInterfaceMock::NetworkOperationIterator noi = _net->getNextReadyRequest(); + const ReplicationExecutor::RemoteCommandRequest& request = noi->getRequest(); + ASSERT_EQUALS("admin", request.dbname); + ASSERT_EQUALS(hbRequest, request.cmdObj); + ASSERT(seenHosts.insert(request.target).second) << "Already saw " + << request.target.toString(); + if (request.target == HostAndPort("h5", 1)) { + _net->scheduleResponse(noi, + startDate + 10, + ResponseStatus(ReplicationExecutor::RemoteCommandResponse( + BSON("ok" << 1), Milliseconds(8)))); + } else { + _net->scheduleResponse( + noi, startDate + 10, ResponseStatus(ErrorCodes::NoSuchKey, "No response")); } - _net->runUntil(startDate + 10); - _net->exitNetwork(); - Status status = waitForQuorumCheck(); - ASSERT_EQUALS(ErrorCodes::NodeNotFound, status); - ASSERT_REASON_CONTAINS(status, "no electable nodes responded"); } - - TEST_F(CheckQuorumForReconfig, QuorumCheckSucceedsWithAsSoonAsPossible) { - // In this test, "we" are host "h4". Only "h1", "h2" and "h3" can vote. - // This test should succeed as soon as h1 and h2 respond, so we block - // h3 and h5 from responding or timing out until the test completes. 
- - const ReplicaSetConfig rsConfig = assertMakeRSConfig( - BSON("_id" << "rs0" << - "version" << 2 << - "members" << BSON_ARRAY( - BSON("_id" << 1 << "host" << "h1:1") << - BSON("_id" << 2 << "host" << "h2:1") << - BSON("_id" << 3 << "host" << "h3:1") << - BSON("_id" << 4 << "host" << "h4:1" << "votes" << 0) << - BSON("_id" << 5 << "host" << "h5:1" << "votes" << 0)))); - const int myConfigIndex = 3; - const BSONObj hbRequest = makeHeartbeatRequest(rsConfig, myConfigIndex); - - startQuorumCheck(rsConfig, myConfigIndex); - const Date_t startDate = _net->now(); - const int numCommandsExpected = rsConfig.getNumMembers() - 1; - unordered_set<HostAndPort> seenHosts; - _net->enterNetwork(); - for (int i = 0; i < numCommandsExpected; ++i) { - const NetworkInterfaceMock::NetworkOperationIterator noi = _net->getNextReadyRequest(); - const ReplicationExecutor::RemoteCommandRequest& request = noi->getRequest(); - ASSERT_EQUALS("admin", request.dbname); - ASSERT_EQUALS(hbRequest, request.cmdObj); - ASSERT(seenHosts.insert(request.target).second) << - "Already saw " << request.target.toString(); - if (request.target == HostAndPort("h1", 1) || request.target == HostAndPort("h2", 1)) { - _net->scheduleResponse(noi, - startDate + 10, - ResponseStatus(ReplicationExecutor::RemoteCommandResponse( - BSON("ok" << 1), - Milliseconds(8)))); - } - else { - _net->blackHole(noi); - } + _net->runUntil(startDate + 10); + _net->exitNetwork(); + Status status = waitForQuorumCheck(); + ASSERT_EQUALS(ErrorCodes::NodeNotFound, status); + ASSERT_REASON_CONTAINS(status, "no electable nodes responded"); +} + +TEST_F(CheckQuorumForReconfig, QuorumCheckSucceedsWithAsSoonAsPossible) { + // In this test, "we" are host "h4". Only "h1", "h2" and "h3" can vote. + // This test should succeed as soon as h1 and h2 respond, so we block + // h3 and h5 from responding or timing out until the test completes. 
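
This is the success-side counterpart of the veto short-circuit: per the predicate quoted at the top of this diff, the check completes as soon as a voter majority plus at least one electable node have answered, so the black-holed h3 and h5 are never needed. In sketch form (the exact placement of the electable check is assumed):

    if (_numElectable > 0 && int(_voters.size()) >= _rsConfig->getMajorityVoteCount()) {
        return true;  // majority of voters plus an electable responder: finish early
    }
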
+ + const ReplicaSetConfig rsConfig = + assertMakeRSConfig(BSON("_id" + << "rs0" + << "version" << 2 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "h1:1") + << BSON("_id" << 2 << "host" + << "h2:1") << BSON("_id" << 3 << "host" + << "h3:1") + << BSON("_id" << 4 << "host" + << "h4:1" + << "votes" << 0) + << BSON("_id" << 5 << "host" + << "h5:1" + << "votes" << 0)))); + const int myConfigIndex = 3; + const BSONObj hbRequest = makeHeartbeatRequest(rsConfig, myConfigIndex); + + startQuorumCheck(rsConfig, myConfigIndex); + const Date_t startDate = _net->now(); + const int numCommandsExpected = rsConfig.getNumMembers() - 1; + unordered_set<HostAndPort> seenHosts; + _net->enterNetwork(); + for (int i = 0; i < numCommandsExpected; ++i) { + const NetworkInterfaceMock::NetworkOperationIterator noi = _net->getNextReadyRequest(); + const ReplicationExecutor::RemoteCommandRequest& request = noi->getRequest(); + ASSERT_EQUALS("admin", request.dbname); + ASSERT_EQUALS(hbRequest, request.cmdObj); + ASSERT(seenHosts.insert(request.target).second) << "Already saw " + << request.target.toString(); + if (request.target == HostAndPort("h1", 1) || request.target == HostAndPort("h2", 1)) { + _net->scheduleResponse(noi, + startDate + 10, + ResponseStatus(ReplicationExecutor::RemoteCommandResponse( + BSON("ok" << 1), Milliseconds(8)))); + } else { + _net->blackHole(noi); } - _net->runUntil(startDate + 10); - _net->exitNetwork(); - ASSERT_OK(waitForQuorumCheck()); } + _net->runUntil(startDate + 10); + _net->exitNetwork(); + ASSERT_OK(waitForQuorumCheck()); +} } // namespace } // namespace repl diff --git a/src/mongo/db/repl/elect_cmd_runner.cpp b/src/mongo/db/repl/elect_cmd_runner.cpp index 2eb8599c7a4..d423a74dc64 100644 --- a/src/mongo/db/repl/elect_cmd_runner.cpp +++ b/src/mongo/db/repl/elect_cmd_runner.cpp @@ -42,119 +42,109 @@ namespace mongo { namespace repl { - ElectCmdRunner::Algorithm::Algorithm( - const ReplicaSetConfig& rsConfig, - int selfIndex, - const std::vector<HostAndPort>& targets, - OID round) - : _actualResponses(0), - _sufficientResponsesReceived(false), - _rsConfig(rsConfig), - _selfIndex(selfIndex), - _targets(targets), - _round(round) { - - // Vote for ourselves, first. - _receivedVotes = _rsConfig.getMemberAt(_selfIndex).getNumVotes(); +ElectCmdRunner::Algorithm::Algorithm(const ReplicaSetConfig& rsConfig, + int selfIndex, + const std::vector<HostAndPort>& targets, + OID round) + : _actualResponses(0), + _sufficientResponsesReceived(false), + _rsConfig(rsConfig), + _selfIndex(selfIndex), + _targets(targets), + _round(round) { + // Vote for ourselves, first. 
+ _receivedVotes = _rsConfig.getMemberAt(_selfIndex).getNumVotes(); +} + +ElectCmdRunner::Algorithm::~Algorithm() {} + +std::vector<ReplicationExecutor::RemoteCommandRequest> ElectCmdRunner::Algorithm::getRequests() + const { + const MemberConfig& selfConfig = _rsConfig.getMemberAt(_selfIndex); + std::vector<ReplicationExecutor::RemoteCommandRequest> requests; + BSONObjBuilder electCmdBuilder; + electCmdBuilder.append("replSetElect", 1); + electCmdBuilder.append("set", _rsConfig.getReplSetName()); + electCmdBuilder.append("who", selfConfig.getHostAndPort().toString()); + electCmdBuilder.append("whoid", selfConfig.getId()); + electCmdBuilder.appendIntOrLL("cfgver", _rsConfig.getConfigVersion()); + electCmdBuilder.append("round", _round); + const BSONObj replSetElectCmd = electCmdBuilder.obj(); + + // Schedule a RemoteCommandRequest for each non-DOWN node + for (std::vector<HostAndPort>::const_iterator it = _targets.begin(); it != _targets.end(); + ++it) { + invariant(*it != selfConfig.getHostAndPort()); + requests.push_back(ReplicationExecutor::RemoteCommandRequest( + *it, + "admin", + replSetElectCmd, + Milliseconds(30 * 1000))); // trying to match current Socket timeout } - ElectCmdRunner::Algorithm::~Algorithm() {} - - std::vector<ReplicationExecutor::RemoteCommandRequest> - ElectCmdRunner::Algorithm::getRequests() const { - - const MemberConfig& selfConfig = _rsConfig.getMemberAt(_selfIndex); - std::vector<ReplicationExecutor::RemoteCommandRequest> requests; - BSONObjBuilder electCmdBuilder; - electCmdBuilder.append("replSetElect", 1); - electCmdBuilder.append("set", _rsConfig.getReplSetName()); - electCmdBuilder.append("who", selfConfig.getHostAndPort().toString()); - electCmdBuilder.append("whoid", selfConfig.getId()); - electCmdBuilder.appendIntOrLL("cfgver", _rsConfig.getConfigVersion()); - electCmdBuilder.append("round", _round); - const BSONObj replSetElectCmd = electCmdBuilder.obj(); - - // Schedule a RemoteCommandRequest for each non-DOWN node - for (std::vector<HostAndPort>::const_iterator it = _targets.begin(); - it != _targets.end(); - ++it) { - - invariant(*it != selfConfig.getHostAndPort()); - requests.push_back(ReplicationExecutor::RemoteCommandRequest( - *it, - "admin", - replSetElectCmd, - Milliseconds(30*1000))); // trying to match current Socket timeout - } - - return requests; - } + return requests; +} - bool ElectCmdRunner::Algorithm::hasReceivedSufficientResponses() const { - if (_sufficientResponsesReceived) { - return true; - } - if (_receivedVotes >= _rsConfig.getMajorityVoteCount()) { - return true; - } - if (_receivedVotes < 0) { - return true; - } - if (_actualResponses == _targets.size()) { - return true; - } - return false; +bool ElectCmdRunner::Algorithm::hasReceivedSufficientResponses() const { + if (_sufficientResponsesReceived) { + return true; } - - void ElectCmdRunner::Algorithm::processResponse( - const ReplicationExecutor::RemoteCommandRequest& request, - const ResponseStatus& response) { - - ++_actualResponses; - - if (response.isOK()) { - BSONObj res = response.getValue().data; - log() << "received " << res["vote"] << " votes from " << request.target; - LOG(1) << "full elect res: " << res.toString(); - BSONElement vote(res["vote"]); - if (vote.type() != mongo::NumberInt) { - error() << "wrong type for vote argument in replSetElect command: " << - typeName(vote.type()); - _sufficientResponsesReceived = true; - return; - } - - _receivedVotes += vote._numberInt(); - } - else { - warning() << "elect command to " << request.target << " failed: " << - 
response.getStatus(); - } + if (_receivedVotes >= _rsConfig.getMajorityVoteCount()) { + return true; } - - ElectCmdRunner::ElectCmdRunner() : _isCanceled(false) {} - ElectCmdRunner::~ElectCmdRunner() {} - - StatusWith<ReplicationExecutor::EventHandle> ElectCmdRunner::start( - ReplicationExecutor* executor, - const ReplicaSetConfig& currentConfig, - int selfIndex, - const std::vector<HostAndPort>& targets, - const stdx::function<void ()>& onCompletion) { - - _algorithm.reset(new Algorithm(currentConfig, selfIndex, targets, OID::gen())); - _runner.reset(new ScatterGatherRunner(_algorithm.get())); - return _runner->start(executor, onCompletion); + if (_receivedVotes < 0) { + return true; } - - void ElectCmdRunner::cancel(ReplicationExecutor* executor) { - _isCanceled = true; - _runner->cancel(executor); + if (_actualResponses == _targets.size()) { + return true; } + return false; +} + +void ElectCmdRunner::Algorithm::processResponse( + const ReplicationExecutor::RemoteCommandRequest& request, const ResponseStatus& response) { + ++_actualResponses; + + if (response.isOK()) { + BSONObj res = response.getValue().data; + log() << "received " << res["vote"] << " votes from " << request.target; + LOG(1) << "full elect res: " << res.toString(); + BSONElement vote(res["vote"]); + if (vote.type() != mongo::NumberInt) { + error() << "wrong type for vote argument in replSetElect command: " + << typeName(vote.type()); + _sufficientResponsesReceived = true; + return; + } - int ElectCmdRunner::getReceivedVotes() const { - return _algorithm->getReceivedVotes(); + _receivedVotes += vote._numberInt(); + } else { + warning() << "elect command to " << request.target << " failed: " << response.getStatus(); } - -} // namespace repl -} // namespace mongo +} + +ElectCmdRunner::ElectCmdRunner() : _isCanceled(false) {} +ElectCmdRunner::~ElectCmdRunner() {} + +StatusWith<ReplicationExecutor::EventHandle> ElectCmdRunner::start( + ReplicationExecutor* executor, + const ReplicaSetConfig& currentConfig, + int selfIndex, + const std::vector<HostAndPort>& targets, + const stdx::function<void()>& onCompletion) { + _algorithm.reset(new Algorithm(currentConfig, selfIndex, targets, OID::gen())); + _runner.reset(new ScatterGatherRunner(_algorithm.get())); + return _runner->start(executor, onCompletion); +} + +void ElectCmdRunner::cancel(ReplicationExecutor* executor) { + _isCanceled = true; + _runner->cancel(executor); +} + +int ElectCmdRunner::getReceivedVotes() const { + return _algorithm->getReceivedVotes(); +} + +} // namespace repl +} // namespace mongo diff --git a/src/mongo/db/repl/elect_cmd_runner.h b/src/mongo/db/repl/elect_cmd_runner.h index 3007f5dc2b5..711445a8025 100644 --- a/src/mongo/db/repl/elect_cmd_runner.h +++ b/src/mongo/db/repl/elect_cmd_runner.h @@ -39,88 +39,91 @@ namespace mongo { - class Status; +class Status; namespace repl { - class ReplicaSetConfig; - class ScatterGatherRunner; +class ReplicaSetConfig; +class ScatterGatherRunner; - class ElectCmdRunner { - MONGO_DISALLOW_COPYING(ElectCmdRunner); +class ElectCmdRunner { + MONGO_DISALLOW_COPYING(ElectCmdRunner); + +public: + class Algorithm : public ScatterGatherAlgorithm { public: - class Algorithm : public ScatterGatherAlgorithm { - public: - Algorithm(const ReplicaSetConfig& rsConfig, - int selfIndex, - const std::vector<HostAndPort>& targets, - OID round); - - virtual ~Algorithm(); - virtual std::vector<ReplicationExecutor::RemoteCommandRequest> getRequests() const; - virtual void processResponse( - const ReplicationExecutor::RemoteCommandRequest& 
request, - const ResponseStatus& response); - virtual bool hasReceivedSufficientResponses() const; - - int getReceivedVotes() const { return _receivedVotes; } - - private: - // Tally of the number of received votes for this election. - int _receivedVotes; - - // Number of responses received so far. - size_t _actualResponses; - - bool _sufficientResponsesReceived; - - const ReplicaSetConfig _rsConfig; - const int _selfIndex; - const std::vector<HostAndPort> _targets; - const OID _round; - }; - - ElectCmdRunner(); - ~ElectCmdRunner(); - - /** - * Begins the process of sending replSetElect commands to all non-DOWN nodes - * in currentConfig. - * - * Returned handle can be used to schedule a callback when the process is complete. - */ - StatusWith<ReplicationExecutor::EventHandle> start( - ReplicationExecutor* executor, - const ReplicaSetConfig& currentConfig, - int selfIndex, - const std::vector<HostAndPort>& targets, - const stdx::function<void ()>& onCompletion = stdx::function<void ()>()); - - /** - * Informs the ElectCmdRunner to cancel further processing. The "executor" - * argument must point to the same executor passed to "start()". - * - * Like start(), this method must run in the executor context. - */ - void cancel(ReplicationExecutor* executor); - - /** - * Returns the number of received votes. Only valid to call after - * the event handle returned from start() has been signaled, which guarantees that - * the vote count will no longer be touched by callbacks. - */ - int getReceivedVotes() const; - - /** - * Returns true if cancel() was called on this instance. - */ - bool isCanceled() const { return _isCanceled; } + Algorithm(const ReplicaSetConfig& rsConfig, + int selfIndex, + const std::vector<HostAndPort>& targets, + OID round); + + virtual ~Algorithm(); + virtual std::vector<ReplicationExecutor::RemoteCommandRequest> getRequests() const; + virtual void processResponse(const ReplicationExecutor::RemoteCommandRequest& request, + const ResponseStatus& response); + virtual bool hasReceivedSufficientResponses() const; + + int getReceivedVotes() const { + return _receivedVotes; + } private: - boost::scoped_ptr<Algorithm> _algorithm; - boost::scoped_ptr<ScatterGatherRunner> _runner; - bool _isCanceled; + // Tally of the number of received votes for this election. + int _receivedVotes; + + // Number of responses received so far. + size_t _actualResponses; + + bool _sufficientResponsesReceived; + + const ReplicaSetConfig _rsConfig; + const int _selfIndex; + const std::vector<HostAndPort> _targets; + const OID _round; }; + ElectCmdRunner(); + ~ElectCmdRunner(); + + /** + * Begins the process of sending replSetElect commands to all non-DOWN nodes + * in currentConfig. + * + * Returned handle can be used to schedule a callback when the process is complete. + */ + StatusWith<ReplicationExecutor::EventHandle> start( + ReplicationExecutor* executor, + const ReplicaSetConfig& currentConfig, + int selfIndex, + const std::vector<HostAndPort>& targets, + const stdx::function<void()>& onCompletion = stdx::function<void()>()); + + /** + * Informs the ElectCmdRunner to cancel further processing. The "executor" + * argument must point to the same executor passed to "start()". + * + * Like start(), this method must run in the executor context. + */ + void cancel(ReplicationExecutor* executor); + + /** + * Returns the number of received votes. 
Only valid to call after + * the event handle returned from start() has been signaled, which guarantees that + * the vote count will no longer be touched by callbacks. + */ + int getReceivedVotes() const; + + /** + * Returns true if cancel() was called on this instance. + */ + bool isCanceled() const { + return _isCanceled; + } + +private: + boost::scoped_ptr<Algorithm> _algorithm; + boost::scoped_ptr<ScatterGatherRunner> _runner; + bool _isCanceled; +}; } } diff --git a/src/mongo/db/repl/elect_cmd_runner_test.cpp b/src/mongo/db/repl/elect_cmd_runner_test.cpp index 983e39a3b1b..ae8118d4c92 100644 --- a/src/mongo/db/repl/elect_cmd_runner_test.cpp +++ b/src/mongo/db/repl/elect_cmd_runner_test.cpp @@ -48,375 +48,366 @@ namespace mongo { namespace repl { namespace { - typedef ReplicationExecutor::RemoteCommandRequest RemoteCommandRequest; - - class ElectCmdRunnerTest : public mongo::unittest::Test { - public: - void startTest(ElectCmdRunner* electCmdRunner, - const ReplicaSetConfig& currentConfig, - int selfIndex, - const std::vector<HostAndPort>& hosts); - - void waitForTest(); - - void electCmdRunnerRunner(const ReplicationExecutor::CallbackData& data, - ElectCmdRunner* electCmdRunner, - StatusWith<ReplicationExecutor::EventHandle>* evh, - const ReplicaSetConfig& currentConfig, - int selfIndex, - const std::vector<HostAndPort>& hosts); - - NetworkInterfaceMock* _net; - boost::scoped_ptr<ReplicationExecutor> _executor; - boost::scoped_ptr<boost::thread> _executorThread; - - private: - void setUp(); - void tearDown(); - - ReplicationExecutor::EventHandle _allDoneEvent; - }; - - void ElectCmdRunnerTest::setUp() { - _net = new NetworkInterfaceMock; - _executor.reset(new ReplicationExecutor(_net, 1 /* prng seed */)); - _executorThread.reset(new boost::thread(stdx::bind(&ReplicationExecutor::run, - _executor.get()))); - } - - void ElectCmdRunnerTest::tearDown() { - _executor->shutdown(); - _executorThread->join(); +typedef ReplicationExecutor::RemoteCommandRequest RemoteCommandRequest; + +class ElectCmdRunnerTest : public mongo::unittest::Test { +public: + void startTest(ElectCmdRunner* electCmdRunner, + const ReplicaSetConfig& currentConfig, + int selfIndex, + const std::vector<HostAndPort>& hosts); + + void waitForTest(); + + void electCmdRunnerRunner(const ReplicationExecutor::CallbackData& data, + ElectCmdRunner* electCmdRunner, + StatusWith<ReplicationExecutor::EventHandle>* evh, + const ReplicaSetConfig& currentConfig, + int selfIndex, + const std::vector<HostAndPort>& hosts); + + NetworkInterfaceMock* _net; + boost::scoped_ptr<ReplicationExecutor> _executor; + boost::scoped_ptr<boost::thread> _executorThread; + +private: + void setUp(); + void tearDown(); + + ReplicationExecutor::EventHandle _allDoneEvent; +}; + +void ElectCmdRunnerTest::setUp() { + _net = new NetworkInterfaceMock; + _executor.reset(new ReplicationExecutor(_net, 1 /* prng seed */)); + _executorThread.reset( + new boost::thread(stdx::bind(&ReplicationExecutor::run, _executor.get()))); +} + +void ElectCmdRunnerTest::tearDown() { + _executor->shutdown(); + _executorThread->join(); +} + +ReplicaSetConfig assertMakeRSConfig(const BSONObj& configBson) { + ReplicaSetConfig config; + ASSERT_OK(config.initialize(configBson)); + ASSERT_OK(config.validate()); + return config; +} + +const BSONObj makeElectRequest(const ReplicaSetConfig& rsConfig, int selfIndex) { + const MemberConfig& myConfig = rsConfig.getMemberAt(selfIndex); + return BSON("replSetElect" << 1 << "set" << rsConfig.getReplSetName() << "who" + << 
myConfig.getHostAndPort().toString() << "whoid" + << myConfig.getId() << "cfgver" << rsConfig.getConfigVersion() + << "round" << 380865962699346850ll); +} + +BSONObj stripRound(const BSONObj& orig) { + BSONObjBuilder builder; + for (BSONObjIterator iter(orig); iter.more(); iter.next()) { + BSONElement e = *iter; + if (e.fieldNameStringData() == "round") { + continue; + } + builder.append(e); } + return builder.obj(); +} + +// This is necessary because the run method must be scheduled in the Replication Executor +// for correct concurrency operation. +void ElectCmdRunnerTest::electCmdRunnerRunner(const ReplicationExecutor::CallbackData& data, + ElectCmdRunner* electCmdRunner, + StatusWith<ReplicationExecutor::EventHandle>* evh, + const ReplicaSetConfig& currentConfig, + int selfIndex, + const std::vector<HostAndPort>& hosts) { + invariant(data.status.isOK()); + *evh = electCmdRunner->start(data.executor, currentConfig, selfIndex, hosts); +} + +void ElectCmdRunnerTest::startTest(ElectCmdRunner* electCmdRunner, + const ReplicaSetConfig& currentConfig, + int selfIndex, + const std::vector<HostAndPort>& hosts) { + StatusWith<ReplicationExecutor::EventHandle> evh(ErrorCodes::InternalError, "Not set"); + StatusWith<ReplicationExecutor::CallbackHandle> cbh = + _executor->scheduleWork(stdx::bind(&ElectCmdRunnerTest::electCmdRunnerRunner, + this, + stdx::placeholders::_1, + electCmdRunner, + &evh, + currentConfig, + selfIndex, + hosts)); + ASSERT_OK(cbh.getStatus()); + _executor->wait(cbh.getValue()); + ASSERT_OK(evh.getStatus()); + _allDoneEvent = evh.getValue(); +} + +void ElectCmdRunnerTest::waitForTest() { + _executor->waitForEvent(_allDoneEvent); +} + +TEST_F(ElectCmdRunnerTest, OneNode) { + // Only one node in the config. + const ReplicaSetConfig config = assertMakeRSConfig(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "h1")))); + + std::vector<HostAndPort> hosts; + ElectCmdRunner electCmdRunner; + startTest(&electCmdRunner, config, 0, hosts); + waitForTest(); + ASSERT_EQUALS(electCmdRunner.getReceivedVotes(), 1); +} + +TEST_F(ElectCmdRunnerTest, TwoNodes) { + // Two nodes, we are node h1. + const ReplicaSetConfig config = + assertMakeRSConfig(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "h0") + << BSON("_id" << 2 << "host" + << "h1")))); + + std::vector<HostAndPort> hosts; + hosts.push_back(config.getMemberAt(1).getHostAndPort()); + + const BSONObj electRequest = makeElectRequest(config, 0); + + ElectCmdRunner electCmdRunner; + startTest(&electCmdRunner, config, 0, hosts); + const Date_t startDate = _net->now(); + _net->enterNetwork(); + const NetworkInterfaceMock::NetworkOperationIterator noi = _net->getNextReadyRequest(); + ASSERT_EQUALS("admin", noi->getRequest().dbname); + ASSERT_EQUALS(stripRound(electRequest), stripRound(noi->getRequest().cmdObj)); + ASSERT_EQUALS(HostAndPort("h1"), noi->getRequest().target); + _net->scheduleResponse( + noi, + startDate + 10, + ResponseStatus(ReplicationExecutor::RemoteCommandResponse( + BSON("ok" << 1 << "vote" << 1 << "round" << 380865962699346850ll), Milliseconds(8)))); + _net->runUntil(startDate + 10); + _net->exitNetwork(); + ASSERT_EQUALS(startDate + 10, _net->now()); + waitForTest(); + ASSERT_EQUALS(electCmdRunner.getReceivedVotes(), 2); +} + +TEST_F(ElectCmdRunnerTest, ShuttingDown) { + // Two nodes, we are node h1. Shutdown happens while we're scheduling remote commands. 
+ ReplicaSetConfig config = assertMakeRSConfig(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "h0") + << BSON("_id" << 2 << "host" + << "h1")))); + + std::vector<HostAndPort> hosts; + hosts.push_back(config.getMemberAt(1).getHostAndPort()); + + ElectCmdRunner electCmdRunner; + StatusWith<ReplicationExecutor::EventHandle> evh(ErrorCodes::InternalError, "Not set"); + StatusWith<ReplicationExecutor::CallbackHandle> cbh = + _executor->scheduleWork(stdx::bind(&ElectCmdRunnerTest::electCmdRunnerRunner, + this, + stdx::placeholders::_1, + &electCmdRunner, + &evh, + config, + 0, + hosts)); + ASSERT_OK(cbh.getStatus()); + _executor->wait(cbh.getValue()); + ASSERT_OK(evh.getStatus()); + _executor->shutdown(); + _executor->waitForEvent(evh.getValue()); + ASSERT_EQUALS(electCmdRunner.getReceivedVotes(), 1); +} + +class ElectScatterGatherTest : public mongo::unittest::Test { +public: + virtual void start(const BSONObj& configObj) { + int selfConfigIndex = 0; - ReplicaSetConfig assertMakeRSConfig(const BSONObj& configBson) { ReplicaSetConfig config; - ASSERT_OK(config.initialize(configBson)); - ASSERT_OK(config.validate()); - return config; - } + config.initialize(configObj); - const BSONObj makeElectRequest(const ReplicaSetConfig& rsConfig, - int selfIndex) { - const MemberConfig& myConfig = rsConfig.getMemberAt(selfIndex); - return BSON("replSetElect" << 1 << - "set" << rsConfig.getReplSetName() << - "who" << myConfig.getHostAndPort().toString() << - "whoid" << myConfig.getId() << - "cfgver" << rsConfig.getConfigVersion() << - "round" << 380865962699346850ll); - } - - BSONObj stripRound(const BSONObj& orig) { - BSONObjBuilder builder; - for (BSONObjIterator iter(orig); iter.more(); iter.next()) { - BSONElement e = *iter; - if (e.fieldNameStringData() == "round") { - continue; - } - builder.append(e); + std::vector<HostAndPort> hosts; + for (ReplicaSetConfig::MemberIterator mem = ++config.membersBegin(); + mem != config.membersEnd(); + ++mem) { + hosts.push_back(mem->getHostAndPort()); } - return builder.obj(); - } - // This is necessary because the run method must be scheduled in the Replication Executor - // for correct concurrency operation. 
- void ElectCmdRunnerTest::electCmdRunnerRunner( - const ReplicationExecutor::CallbackData& data, - ElectCmdRunner* electCmdRunner, - StatusWith<ReplicationExecutor::EventHandle>* evh, - const ReplicaSetConfig& currentConfig, - int selfIndex, - const std::vector<HostAndPort>& hosts) { - - invariant(data.status.isOK()); - *evh = electCmdRunner->start( - data.executor, - currentConfig, - selfIndex, - hosts); + _checker.reset(new ElectCmdRunner::Algorithm(config, selfConfigIndex, hosts, OID())); } - void ElectCmdRunnerTest::startTest(ElectCmdRunner* electCmdRunner, - const ReplicaSetConfig& currentConfig, - int selfIndex, - const std::vector<HostAndPort>& hosts) { - - StatusWith<ReplicationExecutor::EventHandle> evh(ErrorCodes::InternalError, "Not set"); - StatusWith<ReplicationExecutor::CallbackHandle> cbh = - _executor->scheduleWork( - stdx::bind(&ElectCmdRunnerTest::electCmdRunnerRunner, - this, - stdx::placeholders::_1, - electCmdRunner, - &evh, - currentConfig, - selfIndex, - hosts)); - ASSERT_OK(cbh.getStatus()); - _executor->wait(cbh.getValue()); - ASSERT_OK(evh.getStatus()); - _allDoneEvent = evh.getValue(); + virtual void tearDown() { + _checker.reset(NULL); } - void ElectCmdRunnerTest::waitForTest() { - _executor->waitForEvent(_allDoneEvent); +protected: + bool hasReceivedSufficientResponses() { + return _checker->hasReceivedSufficientResponses(); } - TEST_F(ElectCmdRunnerTest, OneNode) { - // Only one node in the config. - const ReplicaSetConfig config = assertMakeRSConfig( - BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 1 << "host" << "h1")))); - - std::vector<HostAndPort> hosts; - ElectCmdRunner electCmdRunner; - startTest(&electCmdRunner, config, 0, hosts); - waitForTest(); - ASSERT_EQUALS(electCmdRunner.getReceivedVotes(), 1); + int getReceivedVotes() { + return _checker->getReceivedVotes(); } - TEST_F(ElectCmdRunnerTest, TwoNodes) { - // Two nodes, we are node h1. - const ReplicaSetConfig config = assertMakeRSConfig( - BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 1 << "host" << "h0") << - BSON("_id" << 2 << "host" << "h1")))); - - std::vector<HostAndPort> hosts; - hosts.push_back(config.getMemberAt(1).getHostAndPort()); - - const BSONObj electRequest = makeElectRequest(config, 0); - - ElectCmdRunner electCmdRunner; - startTest(&electCmdRunner, config, 0, hosts); - const Date_t startDate = _net->now(); - _net->enterNetwork(); - const NetworkInterfaceMock::NetworkOperationIterator noi = _net->getNextReadyRequest(); - ASSERT_EQUALS("admin", noi->getRequest().dbname); - ASSERT_EQUALS(stripRound(electRequest), stripRound(noi->getRequest().cmdObj)); - ASSERT_EQUALS(HostAndPort("h1"), noi->getRequest().target); - _net->scheduleResponse(noi, - startDate + 10, - ResponseStatus(ReplicationExecutor::RemoteCommandResponse( - BSON("ok" << 1 << - "vote" << 1 << - "round" << 380865962699346850ll), - Milliseconds(8)))); - _net->runUntil(startDate + 10); - _net->exitNetwork(); - ASSERT_EQUALS(startDate + 10, _net->now()); - waitForTest(); - ASSERT_EQUALS(electCmdRunner.getReceivedVotes(), 2); + void processResponse(const RemoteCommandRequest& request, const ResponseStatus& response) { + _checker->processResponse(request, response); } - TEST_F(ElectCmdRunnerTest, ShuttingDown) { - // Two nodes, we are node h1. Shutdown happens while we're scheduling remote commands. 
- ReplicaSetConfig config = assertMakeRSConfig( - BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 1 << "host" << "h0") << - BSON("_id" << 2 << "host" << "h1")))); - - std::vector<HostAndPort> hosts; - hosts.push_back(config.getMemberAt(1).getHostAndPort()); - - ElectCmdRunner electCmdRunner; - StatusWith<ReplicationExecutor::EventHandle> evh(ErrorCodes::InternalError, "Not set"); - StatusWith<ReplicationExecutor::CallbackHandle> cbh = - _executor->scheduleWork( - stdx::bind(&ElectCmdRunnerTest::electCmdRunnerRunner, - this, - stdx::placeholders::_1, - &electCmdRunner, - &evh, - config, - 0, - hosts)); - ASSERT_OK(cbh.getStatus()); - _executor->wait(cbh.getValue()); - ASSERT_OK(evh.getStatus()); - _executor->shutdown(); - _executor->waitForEvent(evh.getValue()); - ASSERT_EQUALS(electCmdRunner.getReceivedVotes(), 1); + RemoteCommandRequest requestFrom(std::string hostname) { + return RemoteCommandRequest(HostAndPort(hostname), + "", // the non-hostname fields do not matter for Elect + BSONObj(), + Milliseconds(0)); } - class ElectScatterGatherTest : public mongo::unittest::Test { - public: - virtual void start(const BSONObj& configObj) { - int selfConfigIndex = 0; - - ReplicaSetConfig config; - config.initialize(configObj); - - std::vector<HostAndPort> hosts; - for (ReplicaSetConfig::MemberIterator mem = ++config.membersBegin(); - mem != config.membersEnd(); - ++mem) { - hosts.push_back(mem->getHostAndPort()); - } - - _checker.reset(new ElectCmdRunner::Algorithm(config, - selfConfigIndex, - hosts, - OID())); - } - - virtual void tearDown() { - _checker.reset(NULL); - } - - protected: - bool hasReceivedSufficientResponses() { - return _checker->hasReceivedSufficientResponses(); - } - - int getReceivedVotes() { - return _checker->getReceivedVotes(); - } - - void processResponse(const RemoteCommandRequest& request, const ResponseStatus& response) { - _checker->processResponse(request, response); - } - - RemoteCommandRequest requestFrom(std::string hostname) { - return RemoteCommandRequest(HostAndPort(hostname), - "", // the non-hostname fields do not matter for Elect - BSONObj(), - Milliseconds(0)); - } - - ResponseStatus badResponseStatus() { - return ResponseStatus(ErrorCodes::NodeNotFound, "not on my watch"); - } - - ResponseStatus wrongTypeForVoteField() { - return ResponseStatus(NetworkInterfaceMock::Response(BSON("vote" << std::string("yea")), - Milliseconds(10))); - } - - ResponseStatus voteYea() { - return ResponseStatus(NetworkInterfaceMock::Response(BSON("vote" << 1), - Milliseconds(10))); - } - - ResponseStatus voteNay() { - return ResponseStatus(NetworkInterfaceMock::Response(BSON("vote" << -10000), - Milliseconds(10))); - } - - ResponseStatus abstainFromVoting() { - return ResponseStatus(NetworkInterfaceMock::Response(BSON("vote" << 0), - Milliseconds(10))); - } - - BSONObj threeNodesTwoArbitersConfig() { - return BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 0 << "host" << "host0") << - BSON("_id" << 1 << "host" << "host1" << "arbiterOnly" << true) << - BSON("_id" << 2 << "host" << "host2" << "arbiterOnly" << true))); - } - - BSONObj basicThreeNodeConfig() { - return BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 0 << "host" << "host0") << - BSON("_id" << 1 << "host" << "host1") << - BSON("_id" << 2 << "host" << "host2"))); - } - - private: - scoped_ptr<ElectCmdRunner::Algorithm> _checker; - }; - - TEST_F(ElectScatterGatherTest, NodeRespondsWithBadVoteType) { - 
start(basicThreeNodeConfig());
-        ASSERT_FALSE(hasReceivedSufficientResponses());
-
-        processResponse(requestFrom("host2"), wrongTypeForVoteField());
-        ASSERT_TRUE(hasReceivedSufficientResponses());
-        ASSERT_EQUALS(1, getReceivedVotes()); // 1 because we have 1 vote and voted for ourselves
+    ResponseStatus badResponseStatus() {
+        return ResponseStatus(ErrorCodes::NodeNotFound, "not on my watch");
    }

-    TEST_F(ElectScatterGatherTest, NodeRespondsWithBadStatus) {
-        start(basicThreeNodeConfig());
-        ASSERT_FALSE(hasReceivedSufficientResponses());
-
-        processResponse(requestFrom("host2"), badResponseStatus());
-        ASSERT_FALSE(hasReceivedSufficientResponses());
-
-        processResponse(requestFrom("host3"), abstainFromVoting());
-        ASSERT_TRUE(hasReceivedSufficientResponses());
-        ASSERT_EQUALS(1, getReceivedVotes()); // 1 because we have 1 vote and voted for ourselves
+    ResponseStatus wrongTypeForVoteField() {
+        return ResponseStatus(
+            NetworkInterfaceMock::Response(BSON("vote" << std::string("yea")), Milliseconds(10)));
    }

-    TEST_F(ElectScatterGatherTest, FirstNodeRespondsWithYea) {
-        start(basicThreeNodeConfig());
-        ASSERT_FALSE(hasReceivedSufficientResponses());
-
-        processResponse(requestFrom("host2"), voteYea());
-        ASSERT_TRUE(hasReceivedSufficientResponses());
-        ASSERT_EQUALS(2, getReceivedVotes());
+    ResponseStatus voteYea() {
+        return ResponseStatus(NetworkInterfaceMock::Response(BSON("vote" << 1), Milliseconds(10)));
    }

-    TEST_F(ElectScatterGatherTest, FirstNodeRespondsWithNaySecondWithYea) {
-        start(basicThreeNodeConfig());
-        ASSERT_FALSE(hasReceivedSufficientResponses());
-
-        processResponse(requestFrom("host2"), voteNay());
-        ASSERT_TRUE(hasReceivedSufficientResponses());
-        ASSERT_EQUALS(-9999, getReceivedVotes());
+    ResponseStatus voteNay() {
+        return ResponseStatus(
+            NetworkInterfaceMock::Response(BSON("vote" << -10000), Milliseconds(10)));
    }

-    TEST_F(ElectScatterGatherTest, BothNodesAbstainFromVoting) {
-        start(basicThreeNodeConfig());
-        ASSERT_FALSE(hasReceivedSufficientResponses());
-
-        processResponse(requestFrom("host2"), abstainFromVoting());
-        ASSERT_FALSE(hasReceivedSufficientResponses());
-
-        processResponse(requestFrom("host3"), abstainFromVoting());
-        ASSERT_TRUE(hasReceivedSufficientResponses());
-        ASSERT_EQUALS(1, getReceivedVotes());
+    ResponseStatus abstainFromVoting() {
+        return ResponseStatus(NetworkInterfaceMock::Response(BSON("vote" << 0), Milliseconds(10)));
    }

-    TEST_F(ElectScatterGatherTest, NodeRespondsWithBadStatusArbiters) {
-        start(threeNodesTwoArbitersConfig());
-        ASSERT_FALSE(hasReceivedSufficientResponses());
-
-        processResponse(requestFrom("host2"), badResponseStatus());
-        ASSERT_FALSE(hasReceivedSufficientResponses());
-
-        processResponse(requestFrom("host3"), abstainFromVoting());
-        ASSERT_TRUE(hasReceivedSufficientResponses());
-        ASSERT_EQUALS(1, getReceivedVotes()); // 1 because we have 1 vote and voted for ourselves
+    BSONObj threeNodesTwoArbitersConfig() {
+        return BSON("_id"
+                    << "rs0"
+                    << "version" << 1 << "members"
+                    << BSON_ARRAY(BSON("_id" << 0 << "host"
+                                             << "host0")
+                                  << BSON("_id" << 1 << "host"
+                                                << "host1"
+                                                << "arbiterOnly" << true)
+                                  << BSON("_id" << 2 << "host"
+                                                << "host2"
+                                                << "arbiterOnly" << true)));
    }

-    TEST_F(ElectScatterGatherTest, FirstNodeRespondsWithYeaArbiters) {
-        start(threeNodesTwoArbitersConfig());
-        ASSERT_FALSE(hasReceivedSufficientResponses());
-
-        processResponse(requestFrom("host2"), voteYea());
-        ASSERT_TRUE(hasReceivedSufficientResponses());
-        ASSERT_EQUALS(2, getReceivedVotes());
+    BSONObj basicThreeNodeConfig() {
+        return 
BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "host0") + << BSON("_id" << 1 << "host" + << "host1") << BSON("_id" << 2 << "host" + << "host2"))); } - TEST_F(ElectScatterGatherTest, FirstNodeRespondsWithNaySecondWithYeaArbiters) { - start(threeNodesTwoArbitersConfig()); - ASSERT_FALSE(hasReceivedSufficientResponses()); - - processResponse(requestFrom("host2"), voteNay()); - ASSERT_TRUE(hasReceivedSufficientResponses()); - ASSERT_EQUALS(-9999, getReceivedVotes()); - } +private: + scoped_ptr<ElectCmdRunner::Algorithm> _checker; +}; + +TEST_F(ElectScatterGatherTest, NodeRespondsWithBadVoteType) { + start(basicThreeNodeConfig()); + ASSERT_FALSE(hasReceivedSufficientResponses()); + + processResponse(requestFrom("host2"), wrongTypeForVoteField()); + ASSERT_TRUE(hasReceivedSufficientResponses()); + ASSERT_EQUALS(1, getReceivedVotes()); // 1 because we have 1 vote and voted for ourself +} + +TEST_F(ElectScatterGatherTest, NodeRespondsWithBadStatus) { + start(basicThreeNodeConfig()); + ASSERT_FALSE(hasReceivedSufficientResponses()); + + processResponse(requestFrom("host2"), badResponseStatus()); + ASSERT_FALSE(hasReceivedSufficientResponses()); + + processResponse(requestFrom("host3"), abstainFromVoting()); + ASSERT_TRUE(hasReceivedSufficientResponses()); + ASSERT_EQUALS(1, getReceivedVotes()); // 1 because we have 1 vote and voted for ourself +} + +TEST_F(ElectScatterGatherTest, FirstNodeRespondsWithYea) { + start(basicThreeNodeConfig()); + ASSERT_FALSE(hasReceivedSufficientResponses()); + + processResponse(requestFrom("host2"), voteYea()); + ASSERT_TRUE(hasReceivedSufficientResponses()); + ASSERT_EQUALS(2, getReceivedVotes()); +} + +TEST_F(ElectScatterGatherTest, FirstNodeRespondsWithNaySecondWithYea) { + start(basicThreeNodeConfig()); + ASSERT_FALSE(hasReceivedSufficientResponses()); + + processResponse(requestFrom("host2"), voteNay()); + ASSERT_TRUE(hasReceivedSufficientResponses()); + ASSERT_EQUALS(-9999, getReceivedVotes()); +} + +TEST_F(ElectScatterGatherTest, BothNodesAbstainFromVoting) { + start(basicThreeNodeConfig()); + ASSERT_FALSE(hasReceivedSufficientResponses()); + + processResponse(requestFrom("host2"), abstainFromVoting()); + ASSERT_FALSE(hasReceivedSufficientResponses()); + + processResponse(requestFrom("host3"), abstainFromVoting()); + ASSERT_TRUE(hasReceivedSufficientResponses()); + ASSERT_EQUALS(1, getReceivedVotes()); +} + +TEST_F(ElectScatterGatherTest, NodeRespondsWithBadStatusArbiters) { + start(threeNodesTwoArbitersConfig()); + ASSERT_FALSE(hasReceivedSufficientResponses()); + + processResponse(requestFrom("host2"), badResponseStatus()); + ASSERT_FALSE(hasReceivedSufficientResponses()); + + processResponse(requestFrom("host3"), abstainFromVoting()); + ASSERT_TRUE(hasReceivedSufficientResponses()); + ASSERT_EQUALS(1, getReceivedVotes()); // 1 because we have 1 vote and voted for ourself +} + +TEST_F(ElectScatterGatherTest, FirstNodeRespondsWithYeaArbiters) { + start(threeNodesTwoArbitersConfig()); + ASSERT_FALSE(hasReceivedSufficientResponses()); + + processResponse(requestFrom("host2"), voteYea()); + ASSERT_TRUE(hasReceivedSufficientResponses()); + ASSERT_EQUALS(2, getReceivedVotes()); +} + +TEST_F(ElectScatterGatherTest, FirstNodeRespondsWithNaySecondWithYeaArbiters) { + start(threeNodesTwoArbitersConfig()); + ASSERT_FALSE(hasReceivedSufficientResponses()); + + processResponse(requestFrom("host2"), voteNay()); + ASSERT_TRUE(hasReceivedSufficientResponses()); + ASSERT_EQUALS(-9999, getReceivedVotes()); +} } // 
namespace
} // namespace repl
diff --git a/src/mongo/db/repl/freshness_checker.cpp b/src/mongo/db/repl/freshness_checker.cpp
index 62e514c6793..54a020280e9 100644
--- a/src/mongo/db/repl/freshness_checker.cpp
+++ b/src/mongo/db/repl/freshness_checker.cpp
@@ -46,192 +46,179 @@
 namespace mongo {
 namespace repl {
-    FreshnessChecker::Algorithm::Algorithm(
-        OpTime lastOpTimeApplied,
-        const ReplicaSetConfig& rsConfig,
-        int selfIndex,
-        const std::vector<HostAndPort>& targets) :
-        _responsesProcessed(0),
-        _failedVoterResponses(0),
-        _lastOpTimeApplied(lastOpTimeApplied),
-        _rsConfig(rsConfig),
-        _selfIndex(selfIndex),
-        _targets(targets),
-        _votingTargets(0),
-        _losableVoters(0),
-        _myVote(0),
-        _abortReason(None) {
-
-        // Count voting targets (since the targets could be a subset of members).
-        for (std::vector<HostAndPort>::const_iterator it = _targets.begin();
-             it != _targets.end();
-             ++it) {
-            const MemberConfig* member = _rsConfig.findMemberByHostAndPort(*it);
-            if (member && member->isVoter())
-                ++_votingTargets;
-        }
-
-        _myVote = _rsConfig.getMemberAt(_selfIndex).isVoter() ? 1 : 0;
-        _losableVoters = std::max(0,
-                                  ((_votingTargets + _myVote) - _rsConfig.getMajorityVoteCount()));
-
-    }
-
-    FreshnessChecker::Algorithm::~Algorithm() {}
-
-    std::vector<ReplicationExecutor::RemoteCommandRequest>
-    FreshnessChecker::Algorithm::getRequests() const {
-        const MemberConfig& selfConfig = _rsConfig.getMemberAt(_selfIndex);
-
-        // gather all not-down nodes, get their full names (or HostAndPort's)
-        // schedule fresh command for each node
-        BSONObjBuilder freshCmdBuilder;
-        freshCmdBuilder.append("replSetFresh", 1);
-        freshCmdBuilder.append("set", _rsConfig.getReplSetName());
-        freshCmdBuilder.append("opTime", Date_t(_lastOpTimeApplied.asDate()));
-        freshCmdBuilder.append("who", selfConfig.getHostAndPort().toString());
-        freshCmdBuilder.appendIntOrLL("cfgver", _rsConfig.getConfigVersion());
-        freshCmdBuilder.append("id", selfConfig.getId());
-        const BSONObj replSetFreshCmd = freshCmdBuilder.obj();
-
-        std::vector<ReplicationExecutor::RemoteCommandRequest> requests;
-        for (std::vector<HostAndPort>::const_iterator it = _targets.begin();
-             it != _targets.end();
-             ++it) {
-            invariant(*it != selfConfig.getHostAndPort());
-            requests.push_back(ReplicationExecutor::RemoteCommandRequest(
-                *it,
-                "admin",
-                replSetFreshCmd,
-                Milliseconds(30*1000))); // trying to match current Socket timeout
-        }
-
-        return requests;
+FreshnessChecker::Algorithm::Algorithm(OpTime lastOpTimeApplied,
+                                       const ReplicaSetConfig& rsConfig,
+                                       int selfIndex,
+                                       const std::vector<HostAndPort>& targets)
+    : _responsesProcessed(0),
+      _failedVoterResponses(0),
+      _lastOpTimeApplied(lastOpTimeApplied),
+      _rsConfig(rsConfig),
+      _selfIndex(selfIndex),
+      _targets(targets),
+      _votingTargets(0),
+      _losableVoters(0),
+      _myVote(0),
+      _abortReason(None) {
+    // Count voting targets (since the targets could be a subset of members). 
+    for (std::vector<HostAndPort>::const_iterator it = _targets.begin(); it != _targets.end();
+         ++it) {
+        const MemberConfig* member = _rsConfig.findMemberByHostAndPort(*it);
+        if (member && member->isVoter())
+            ++_votingTargets;
    }

-    bool FreshnessChecker::Algorithm::hadTooManyFailedVoterResponses() const {
-        const bool tooManyLostVoters = (_failedVoterResponses > _losableVoters);
-
-        LOG(3) << "hadTooManyFailedVoterResponses(" << tooManyLostVoters << ") = "
-               << _failedVoterResponses << " failed responses <"
-               << " (" << _votingTargets << " total voters - "
-               << _rsConfig.getMajorityVoteCount() << " majority voters - me ("
-               << _myVote << ")) -- losableVotes: " << _losableVoters;
-        return tooManyLostVoters;
+    _myVote = _rsConfig.getMemberAt(_selfIndex).isVoter() ? 1 : 0;
+    _losableVoters = std::max(0, ((_votingTargets + _myVote) - _rsConfig.getMajorityVoteCount()));
+}
+
+FreshnessChecker::Algorithm::~Algorithm() {}
+
+std::vector<ReplicationExecutor::RemoteCommandRequest> FreshnessChecker::Algorithm::getRequests()
+    const {
+    const MemberConfig& selfConfig = _rsConfig.getMemberAt(_selfIndex);
+
+    // gather all not-down nodes, get their full names (or HostAndPort's)
+    // schedule fresh command for each node
+    BSONObjBuilder freshCmdBuilder;
+    freshCmdBuilder.append("replSetFresh", 1);
+    freshCmdBuilder.append("set", _rsConfig.getReplSetName());
+    freshCmdBuilder.append("opTime", Date_t(_lastOpTimeApplied.asDate()));
+    freshCmdBuilder.append("who", selfConfig.getHostAndPort().toString());
+    freshCmdBuilder.appendIntOrLL("cfgver", _rsConfig.getConfigVersion());
+    freshCmdBuilder.append("id", selfConfig.getId());
+    const BSONObj replSetFreshCmd = freshCmdBuilder.obj();
+
+    std::vector<ReplicationExecutor::RemoteCommandRequest> requests;
+    for (std::vector<HostAndPort>::const_iterator it = _targets.begin(); it != _targets.end();
+         ++it) {
+        invariant(*it != selfConfig.getHostAndPort());
+        requests.push_back(ReplicationExecutor::RemoteCommandRequest(
+            *it,
+            "admin",
+            replSetFreshCmd,
+            Milliseconds(30 * 1000)));  // trying to match current Socket timeout
    }

-    bool FreshnessChecker::Algorithm::_isVotingMember(const HostAndPort hap) const {
-        const MemberConfig* member = _rsConfig.findMemberByHostAndPort(hap);
-        invariant(member);
-        return member->isVoter();
-    }
-
-    void FreshnessChecker::Algorithm::processResponse(
-        const ReplicationExecutor::RemoteCommandRequest& request,
-        const ResponseStatus& response) {
-        ++_responsesProcessed;
-        bool votingMember = _isVotingMember(request.target);
-
-        Status status = Status::OK();
-
-        if (!response.isOK() ||
-            !((status = getStatusFromCommandResult(response.getValue().data)).isOK())) {
-            if (votingMember) {
-                ++_failedVoterResponses;
-                if (hadTooManyFailedVoterResponses()) {
-                    _abortReason = QuorumUnreachable;
-                }
-            }
-            if (!response.isOK()) { // network/executor error
-                LOG(2) << "FreshnessChecker: Got failed response from " << request.target;
-            }
-            else { // command error, like unauth
-                LOG(2) << "FreshnessChecker: Got error response from " << request.target
-                       << " :" << status;
+    return requests;
+}
+
+bool FreshnessChecker::Algorithm::hadTooManyFailedVoterResponses() const {
+    const bool tooManyLostVoters = (_failedVoterResponses > _losableVoters);
+
+    LOG(3) << "hadTooManyFailedVoterResponses(" << tooManyLostVoters
+           << ") = " << _failedVoterResponses << " failed responses <"
+           << " (" << _votingTargets << " total voters - " << _rsConfig.getMajorityVoteCount()
+           << " majority voters - me (" << _myVote << ")) -- losableVotes: " << _losableVoters; 
return tooManyLostVoters; +} + +bool FreshnessChecker::Algorithm::_isVotingMember(const HostAndPort hap) const { + const MemberConfig* member = _rsConfig.findMemberByHostAndPort(hap); + invariant(member); + return member->isVoter(); +} + +void FreshnessChecker::Algorithm::processResponse( + const ReplicationExecutor::RemoteCommandRequest& request, const ResponseStatus& response) { + ++_responsesProcessed; + bool votingMember = _isVotingMember(request.target); + + Status status = Status::OK(); + + if (!response.isOK() || + !((status = getStatusFromCommandResult(response.getValue().data)).isOK())) { + if (votingMember) { + ++_failedVoterResponses; + if (hadTooManyFailedVoterResponses()) { + _abortReason = QuorumUnreachable; } - return; } - - const BSONObj res = response.getValue().data; - - LOG(2) << "FreshnessChecker: Got response from " << request.target - << " of " << res; - - if (res["fresher"].trueValue()) { - log() << "not electing self, we are not freshest"; - _abortReason = FresherNodeFound; - return; - } - - if (res["opTime"].type() != mongo::Date) { - error() << "wrong type for opTime argument in replSetFresh response: " << - typeName(res["opTime"].type()); - _abortReason = FresherNodeFound; - return; - } - OpTime remoteTime(res["opTime"].date()); - if (remoteTime == _lastOpTimeApplied) { - _abortReason = FreshnessTie; - } - if (remoteTime > _lastOpTimeApplied) { - // something really wrong (rogue command?) - _abortReason = FresherNodeFound; - return; - } - - if (res["veto"].trueValue()) { - BSONElement msg = res["errmsg"]; - if (msg.type() == String) { - log() << "not electing self, " << request.target.toString() << - " would veto with '" << msg.String() << "'"; - } - else { - log() << "not electing self, " << request.target.toString() << - " would veto"; - } - _abortReason = FresherNodeFound; - return; + if (!response.isOK()) { // network/executor error + LOG(2) << "FreshnessChecker: Got failed response from " << request.target; + } else { // command error, like unauth + LOG(2) << "FreshnessChecker: Got error response from " << request.target << " :" + << status; } + return; } - bool FreshnessChecker::Algorithm::hasReceivedSufficientResponses() const { - return (_abortReason != None && _abortReason != FreshnessTie) || - (_responsesProcessed == static_cast<int>(_targets.size())); - } + const BSONObj res = response.getValue().data; - FreshnessChecker::ElectionAbortReason FreshnessChecker::Algorithm::shouldAbortElection() const { - return _abortReason; - } + LOG(2) << "FreshnessChecker: Got response from " << request.target << " of " << res; - FreshnessChecker::ElectionAbortReason FreshnessChecker::shouldAbortElection() const { - return _algorithm->shouldAbortElection(); + if (res["fresher"].trueValue()) { + log() << "not electing self, we are not freshest"; + _abortReason = FresherNodeFound; + return; } - long long FreshnessChecker::getOriginalConfigVersion() const { - return _originalConfigVersion; + if (res["opTime"].type() != mongo::Date) { + error() << "wrong type for opTime argument in replSetFresh response: " + << typeName(res["opTime"].type()); + _abortReason = FresherNodeFound; + return; } - - FreshnessChecker::FreshnessChecker() : _isCanceled(false) {} - FreshnessChecker::~FreshnessChecker() {} - - StatusWith<ReplicationExecutor::EventHandle> FreshnessChecker::start( - ReplicationExecutor* executor, - const OpTime& lastOpTimeApplied, - const ReplicaSetConfig& currentConfig, - int selfIndex, - const std::vector<HostAndPort>& targets, - const stdx::function<void ()>& 
onCompletion) { - - _originalConfigVersion = currentConfig.getConfigVersion(); - _algorithm.reset(new Algorithm(lastOpTimeApplied, currentConfig, selfIndex, targets)); - _runner.reset(new ScatterGatherRunner(_algorithm.get())); - return _runner->start(executor, onCompletion); + OpTime remoteTime(res["opTime"].date()); + if (remoteTime == _lastOpTimeApplied) { + _abortReason = FreshnessTie; } - - void FreshnessChecker::cancel(ReplicationExecutor* executor) { - _isCanceled = true; - _runner->cancel(executor); + if (remoteTime > _lastOpTimeApplied) { + // something really wrong (rogue command?) + _abortReason = FresherNodeFound; + return; } -} // namespace repl -} // namespace mongo + if (res["veto"].trueValue()) { + BSONElement msg = res["errmsg"]; + if (msg.type() == String) { + log() << "not electing self, " << request.target.toString() << " would veto with '" + << msg.String() << "'"; + } else { + log() << "not electing self, " << request.target.toString() << " would veto"; + } + _abortReason = FresherNodeFound; + return; + } +} + +bool FreshnessChecker::Algorithm::hasReceivedSufficientResponses() const { + return (_abortReason != None && _abortReason != FreshnessTie) || + (_responsesProcessed == static_cast<int>(_targets.size())); +} + +FreshnessChecker::ElectionAbortReason FreshnessChecker::Algorithm::shouldAbortElection() const { + return _abortReason; +} + +FreshnessChecker::ElectionAbortReason FreshnessChecker::shouldAbortElection() const { + return _algorithm->shouldAbortElection(); +} + +long long FreshnessChecker::getOriginalConfigVersion() const { + return _originalConfigVersion; +} + +FreshnessChecker::FreshnessChecker() : _isCanceled(false) {} +FreshnessChecker::~FreshnessChecker() {} + +StatusWith<ReplicationExecutor::EventHandle> FreshnessChecker::start( + ReplicationExecutor* executor, + const OpTime& lastOpTimeApplied, + const ReplicaSetConfig& currentConfig, + int selfIndex, + const std::vector<HostAndPort>& targets, + const stdx::function<void()>& onCompletion) { + _originalConfigVersion = currentConfig.getConfigVersion(); + _algorithm.reset(new Algorithm(lastOpTimeApplied, currentConfig, selfIndex, targets)); + _runner.reset(new ScatterGatherRunner(_algorithm.get())); + return _runner->start(executor, onCompletion); +} + +void FreshnessChecker::cancel(ReplicationExecutor* executor) { + _isCanceled = true; + _runner->cancel(executor); +} + +} // namespace repl +} // namespace mongo diff --git a/src/mongo/db/repl/freshness_checker.h b/src/mongo/db/repl/freshness_checker.h index 71f82cb86d3..fdb95765959 100644 --- a/src/mongo/db/repl/freshness_checker.h +++ b/src/mongo/db/repl/freshness_checker.h @@ -39,127 +39,128 @@ namespace mongo { - class Status; +class Status; namespace repl { - class ReplicaSetConfig; - class ScatterGatherRunner; +class ReplicaSetConfig; +class ScatterGatherRunner; - class FreshnessChecker { - MONGO_DISALLOW_COPYING(FreshnessChecker); +class FreshnessChecker { + MONGO_DISALLOW_COPYING(FreshnessChecker); + +public: + enum ElectionAbortReason { + None = 0, + FresherNodeFound, // Freshness check found fresher node + FreshnessTie, // Freshness check resulted in one or more nodes with our lastAppliedOpTime + QuorumUnavailable, // Not enough up voters + QuorumUnreachable // Too many failed voter responses + }; + + class Algorithm : public ScatterGatherAlgorithm { public: - enum ElectionAbortReason { - None = 0, - FresherNodeFound, // Freshness check found fresher node - FreshnessTie, // Freshness check resulted in one or more nodes with our 
lastAppliedOpTime - QuorumUnavailable, // Not enough up voters - QuorumUnreachable // Too many failed voter responses - }; - - class Algorithm : public ScatterGatherAlgorithm { - public: - Algorithm(OpTime lastOpTimeApplied, - const ReplicaSetConfig& rsConfig, - int selfIndex, - const std::vector<HostAndPort>& targets); - virtual ~Algorithm(); - virtual std::vector<ReplicationExecutor::RemoteCommandRequest> getRequests() const; - virtual void processResponse( - const ReplicationExecutor::RemoteCommandRequest& request, - const ResponseStatus& response); - virtual bool hasReceivedSufficientResponses() const; - ElectionAbortReason shouldAbortElection() const; - - private: - // Returns true if the number of failed votes is over _losableVotes() - bool hadTooManyFailedVoterResponses() const; - - // Returns true if the member, by host and port, has a vote. - bool _isVotingMember(const HostAndPort host) const; - - // Number of responses received so far. - int _responsesProcessed; - - // Number of failed voter responses so far. - int _failedVoterResponses; - - // Last OpTime applied by the caller; used in the Fresh command - const OpTime _lastOpTimeApplied; - - // Config to use for this check - const ReplicaSetConfig _rsConfig; - - // Our index position in _rsConfig - const int _selfIndex; - - // The UP members we are checking - const std::vector<HostAndPort> _targets; - - // Number of voting targets - int _votingTargets; - - // Number of voting nodes which can error - int _losableVoters; - - // 1 if I have a vote, otherwise 0 - int _myVote; - - // Reason to abort, start with None - ElectionAbortReason _abortReason; - - }; - - FreshnessChecker(); - virtual ~FreshnessChecker(); - - /** - * Begins the process of sending replSetFresh commands to all non-DOWN nodes - * in currentConfig, with the intention of determining whether the current node - * is freshest. - * evh can be used to schedule a callback when the process is complete. - * This function must be run in the executor, as it must be synchronous with the command - * callbacks that it schedules. - * If this function returns Status::OK(), evh is then guaranteed to be signaled. - **/ - StatusWith<ReplicationExecutor::EventHandle> start( - ReplicationExecutor* executor, - const OpTime& lastOpTimeApplied, - const ReplicaSetConfig& currentConfig, - int selfIndex, - const std::vector<HostAndPort>& targets, - const stdx::function<void ()>& onCompletion = stdx::function<void ()>()); - - /** - * Informs the freshness checker to cancel further processing. The "executor" - * argument must point to the same executor passed to "start()". - * - * Like start(), this method must run in the executor context. - */ - void cancel(ReplicationExecutor* executor); - - /** - * Returns true if cancel() was called on this instance. - */ - bool isCanceled() const { return _isCanceled; } - - /** - * 'None' if the election should continue, otherwise the reason to abort - */ + Algorithm(OpTime lastOpTimeApplied, + const ReplicaSetConfig& rsConfig, + int selfIndex, + const std::vector<HostAndPort>& targets); + virtual ~Algorithm(); + virtual std::vector<ReplicationExecutor::RemoteCommandRequest> getRequests() const; + virtual void processResponse(const ReplicationExecutor::RemoteCommandRequest& request, + const ResponseStatus& response); + virtual bool hasReceivedSufficientResponses() const; ElectionAbortReason shouldAbortElection() const; - /** - * Returns the config version supplied in the config when start() was called. 
-         * Useful for determining if the config version has changed.
-         */
-        long long getOriginalConfigVersion() const;
-    private:
-        boost::scoped_ptr<Algorithm> _algorithm;
-        boost::scoped_ptr<ScatterGatherRunner> _runner;
-        long long _originalConfigVersion;
-        bool _isCanceled;
+        // Returns true if the number of failed votes is over _losableVotes()
+        bool hadTooManyFailedVoterResponses() const;
+
+        // Returns true if the member, by host and port, has a vote.
+        bool _isVotingMember(const HostAndPort host) const;
+
+        // Number of responses received so far.
+        int _responsesProcessed;
+
+        // Number of failed voter responses so far.
+        int _failedVoterResponses;
+
+        // Last OpTime applied by the caller; used in the Fresh command
+        const OpTime _lastOpTimeApplied;
+
+        // Config to use for this check
+        const ReplicaSetConfig _rsConfig;
+
+        // Our index position in _rsConfig
+        const int _selfIndex;
+
+        // The UP members we are checking
+        const std::vector<HostAndPort> _targets;
+
+        // Number of voting targets
+        int _votingTargets;
+
+        // Number of voting nodes which can error
+        int _losableVoters;
+
+        // 1 if I have a vote, otherwise 0
+        int _myVote;
+
+        // Reason to abort, start with None
+        ElectionAbortReason _abortReason;
    };

+    FreshnessChecker();
+    virtual ~FreshnessChecker();
+
+    /**
+     * Begins the process of sending replSetFresh commands to all non-DOWN nodes
+     * in currentConfig, with the intention of determining whether the current node
+     * is freshest.
+     * evh can be used to schedule a callback when the process is complete.
+     * This function must be run in the executor, as it must be synchronous with the command
+     * callbacks that it schedules.
+     * If this function returns Status::OK(), evh is then guaranteed to be signaled.
+     **/
+    StatusWith<ReplicationExecutor::EventHandle> start(
+        ReplicationExecutor* executor,
+        const OpTime& lastOpTimeApplied,
+        const ReplicaSetConfig& currentConfig,
+        int selfIndex,
+        const std::vector<HostAndPort>& targets,
+        const stdx::function<void()>& onCompletion = stdx::function<void()>());
+
+    /**
+     * Informs the freshness checker to cancel further processing. The "executor"
+     * argument must point to the same executor passed to "start()".
+     *
+     * Like start(), this method must run in the executor context.
+     */
+    void cancel(ReplicationExecutor* executor);
+
+    /**
+     * Returns true if cancel() was called on this instance.
+     */
+    bool isCanceled() const {
+        return _isCanceled;
+    }
+
+    /**
+     * 'None' if the election should continue, otherwise the reason to abort
+     */
+    ElectionAbortReason shouldAbortElection() const;
+
+    /**
+     * Returns the config version supplied in the config when start() was called.
+     * Useful for determining if the config version has changed. 
+ */ + long long getOriginalConfigVersion() const; + +private: + boost::scoped_ptr<Algorithm> _algorithm; + boost::scoped_ptr<ScatterGatherRunner> _runner; + long long _originalConfigVersion; + bool _isCanceled; +}; + } // namespace repl } // namespace mongo diff --git a/src/mongo/db/repl/freshness_checker_test.cpp b/src/mongo/db/repl/freshness_checker_test.cpp index 362b4746606..36d8f00ca1f 100644 --- a/src/mongo/db/repl/freshness_checker_test.cpp +++ b/src/mongo/db/repl/freshness_checker_test.cpp @@ -49,1029 +49,987 @@ namespace mongo { namespace repl { namespace { - using unittest::assertGet; - - typedef ReplicationExecutor::RemoteCommandRequest RemoteCommandRequest; - - bool stringContains(const std::string &haystack, const std::string& needle) { - return haystack.find(needle) != std::string::npos; +using unittest::assertGet; + +typedef ReplicationExecutor::RemoteCommandRequest RemoteCommandRequest; + +bool stringContains(const std::string& haystack, const std::string& needle) { + return haystack.find(needle) != std::string::npos; +} + +class FreshnessCheckerTest : public mongo::unittest::Test { +protected: + void startTest(const OpTime& lastOpTimeApplied, + const ReplicaSetConfig& currentConfig, + int selfIndex, + const std::vector<HostAndPort>& hosts); + void waitOnChecker(); + FreshnessChecker::ElectionAbortReason shouldAbortElection() const; + + int64_t countLogLinesContaining(const std::string& needle) { + return std::count_if(getCapturedLogMessages().begin(), + getCapturedLogMessages().end(), + stdx::bind(stringContains, stdx::placeholders::_1, needle)); } - class FreshnessCheckerTest : public mongo::unittest::Test { - protected: - void startTest(const OpTime& lastOpTimeApplied, - const ReplicaSetConfig& currentConfig, - int selfIndex, - const std::vector<HostAndPort>& hosts); - void waitOnChecker(); - FreshnessChecker::ElectionAbortReason shouldAbortElection() const; - - int64_t countLogLinesContaining(const std::string& needle) { - return std::count_if(getCapturedLogMessages().begin(), - getCapturedLogMessages().end(), - stdx::bind(stringContains, - stdx::placeholders::_1, - needle)); - } - - NetworkInterfaceMock* _net; - boost::scoped_ptr<ReplicationExecutor> _executor; - boost::scoped_ptr<boost::thread> _executorThread; - - private: - void freshnessCheckerRunner(const ReplicationExecutor::CallbackData& data, - const OpTime& lastOpTimeApplied, - const ReplicaSetConfig& currentConfig, - int selfIndex, - const std::vector<HostAndPort>& hosts); - void setUp(); - void tearDown(); - - boost::scoped_ptr<FreshnessChecker> _checker; - ReplicationExecutor::EventHandle _checkerDoneEvent; - }; - - void FreshnessCheckerTest::setUp() { - _net = new NetworkInterfaceMock; - _executor.reset(new ReplicationExecutor(_net, 1 /* prng seed */)); - _executorThread.reset(new boost::thread(stdx::bind(&ReplicationExecutor::run, - _executor.get()))); - _checker.reset(new FreshnessChecker); + NetworkInterfaceMock* _net; + boost::scoped_ptr<ReplicationExecutor> _executor; + boost::scoped_ptr<boost::thread> _executorThread; + +private: + void freshnessCheckerRunner(const ReplicationExecutor::CallbackData& data, + const OpTime& lastOpTimeApplied, + const ReplicaSetConfig& currentConfig, + int selfIndex, + const std::vector<HostAndPort>& hosts); + void setUp(); + void tearDown(); + + boost::scoped_ptr<FreshnessChecker> _checker; + ReplicationExecutor::EventHandle _checkerDoneEvent; +}; + +void FreshnessCheckerTest::setUp() { + _net = new NetworkInterfaceMock; + _executor.reset(new ReplicationExecutor(_net, 
1 /* prng seed */)); + _executorThread.reset( + new boost::thread(stdx::bind(&ReplicationExecutor::run, _executor.get()))); + _checker.reset(new FreshnessChecker); +} + +void FreshnessCheckerTest::tearDown() { + _executor->shutdown(); + _executorThread->join(); +} + +void FreshnessCheckerTest::waitOnChecker() { + _executor->waitForEvent(_checkerDoneEvent); +} + +FreshnessChecker::ElectionAbortReason FreshnessCheckerTest::shouldAbortElection() const { + return _checker->shouldAbortElection(); +} + +ReplicaSetConfig assertMakeRSConfig(const BSONObj& configBson) { + ReplicaSetConfig config; + ASSERT_OK(config.initialize(configBson)); + ASSERT_OK(config.validate()); + return config; +} + +const BSONObj makeFreshRequest(const ReplicaSetConfig& rsConfig, + OpTime lastOpTimeApplied, + int selfIndex) { + const MemberConfig& myConfig = rsConfig.getMemberAt(selfIndex); + return BSON("replSetFresh" << 1 << "set" << rsConfig.getReplSetName() << "opTime" + << Date_t(lastOpTimeApplied.asDate()) << "who" + << myConfig.getHostAndPort().toString() << "cfgver" + << rsConfig.getConfigVersion() << "id" << myConfig.getId()); +} + +// This is necessary because the run method must be scheduled in the Replication Executor +// for correct concurrency operation. +void FreshnessCheckerTest::freshnessCheckerRunner(const ReplicationExecutor::CallbackData& data, + const OpTime& lastOpTimeApplied, + const ReplicaSetConfig& currentConfig, + int selfIndex, + const std::vector<HostAndPort>& hosts) { + invariant(data.status.isOK()); + StatusWith<ReplicationExecutor::EventHandle> evh = + _checker->start(data.executor, lastOpTimeApplied, currentConfig, selfIndex, hosts); + _checkerDoneEvent = assertGet(evh); +} + +void FreshnessCheckerTest::startTest(const OpTime& lastOpTimeApplied, + const ReplicaSetConfig& currentConfig, + int selfIndex, + const std::vector<HostAndPort>& hosts) { + _executor->wait( + assertGet(_executor->scheduleWork(stdx::bind(&FreshnessCheckerTest::freshnessCheckerRunner, + this, + stdx::placeholders::_1, + lastOpTimeApplied, + currentConfig, + selfIndex, + hosts)))); +} + +TEST_F(FreshnessCheckerTest, TwoNodes) { + // Two nodes, we are node h1. We are freshest, but we tie with h2. 
+ ReplicaSetConfig config = assertMakeRSConfig(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "h0") + << BSON("_id" << 2 << "host" + << "h1")))); + + std::vector<HostAndPort> hosts; + hosts.push_back(config.getMemberAt(1).getHostAndPort()); + const BSONObj freshRequest = makeFreshRequest(config, OpTime(0, 0), 0); + + startTest(OpTime(0, 0), config, 0, hosts); + const Date_t startDate = _net->now(); + _net->enterNetwork(); + for (size_t i = 0; i < hosts.size(); ++i) { + const NetworkInterfaceMock::NetworkOperationIterator noi = _net->getNextReadyRequest(); + ASSERT_EQUALS("admin", noi->getRequest().dbname); + ASSERT_EQUALS(freshRequest, noi->getRequest().cmdObj); + ASSERT_EQUALS(HostAndPort("h1"), noi->getRequest().target); + _net->scheduleResponse( + noi, + startDate + 10, + ResponseStatus(ReplicationExecutor::RemoteCommandResponse( + BSON("ok" << 1 << "id" << 2 << "set" + << "rs0" + << "who" + << "h1" + << "cfgver" << 1 << "opTime" << Date_t(OpTime(0, 0).asDate())), + Milliseconds(8)))); } - - void FreshnessCheckerTest::tearDown() { - _executor->shutdown(); - _executorThread->join(); + _net->runUntil(startDate + 10); + _net->exitNetwork(); + ASSERT_EQUALS(startDate + 10, _net->now()); + waitOnChecker(); + ASSERT_EQUALS(shouldAbortElection(), FreshnessChecker::FreshnessTie); +} + +TEST_F(FreshnessCheckerTest, ShuttingDown) { + // Two nodes, we are node h1. Shutdown happens while we're scheduling remote commands. + ReplicaSetConfig config = assertMakeRSConfig(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "h0") + << BSON("_id" << 2 << "host" + << "h1")))); + + std::vector<HostAndPort> hosts; + hosts.push_back(config.getMemberAt(1).getHostAndPort()); + + startTest(OpTime(0, 0), config, 0, hosts); + _executor->shutdown(); + waitOnChecker(); + + // This seems less than ideal, but if we are shutting down, the next phase of election + // cannot proceed anyway. 
+ ASSERT_EQUALS(shouldAbortElection(), FreshnessChecker::None); +} + +TEST_F(FreshnessCheckerTest, ElectNotElectingSelfWeAreNotFreshest) { + // other responds as fresher than us + startCapturingLogMessages(); + ReplicaSetConfig config = assertMakeRSConfig(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "h0") + << BSON("_id" << 2 << "host" + << "h1")))); + + std::vector<HostAndPort> hosts; + hosts.push_back(config.getMemberAt(1).getHostAndPort()); + + const BSONObj freshRequest = makeFreshRequest(config, OpTime(10, 0), 0); + + startTest(OpTime(10, 0), config, 0, hosts); + const Date_t startDate = _net->now(); + _net->enterNetwork(); + for (size_t i = 0; i < hosts.size(); ++i) { + const NetworkInterfaceMock::NetworkOperationIterator noi = _net->getNextReadyRequest(); + ASSERT_EQUALS("admin", noi->getRequest().dbname); + ASSERT_EQUALS(freshRequest, noi->getRequest().cmdObj); + ASSERT_EQUALS(HostAndPort("h1"), noi->getRequest().target); + _net->scheduleResponse(noi, + startDate + 10, + ResponseStatus(ReplicationExecutor::RemoteCommandResponse( + BSON("ok" << 1 << "id" << 2 << "set" + << "rs0" + << "who" + << "h1" + << "cfgver" << 1 << "fresher" << true << "opTime" + << Date_t(OpTime(0, 0).asDate())), + Milliseconds(8)))); } - - void FreshnessCheckerTest::waitOnChecker() { - _executor->waitForEvent(_checkerDoneEvent); + _net->runUntil(startDate + 10); + _net->exitNetwork(); + ASSERT_EQUALS(startDate + 10, _net->now()); + waitOnChecker(); + + stopCapturingLogMessages(); + ASSERT_EQUALS(shouldAbortElection(), FreshnessChecker::FresherNodeFound); + ASSERT_EQUALS(1, countLogLinesContaining("not electing self, we are not freshest")); +} + +TEST_F(FreshnessCheckerTest, ElectNotElectingSelfWeAreNotFreshestOpTime) { + // other responds with a later optime than ours + startCapturingLogMessages(); + ReplicaSetConfig config = assertMakeRSConfig(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "h0") + << BSON("_id" << 2 << "host" + << "h1")))); + + std::vector<HostAndPort> hosts; + hosts.push_back(config.getMemberAt(1).getHostAndPort()); + + const BSONObj freshRequest = makeFreshRequest(config, OpTime(0, 0), 0); + + startTest(OpTime(0, 0), config, 0, hosts); + const Date_t startDate = _net->now(); + _net->enterNetwork(); + for (size_t i = 0; i < hosts.size(); ++i) { + const NetworkInterfaceMock::NetworkOperationIterator noi = _net->getNextReadyRequest(); + ASSERT_EQUALS("admin", noi->getRequest().dbname); + ASSERT_EQUALS(freshRequest, noi->getRequest().cmdObj); + ASSERT_EQUALS(HostAndPort("h1"), noi->getRequest().target); + _net->scheduleResponse( + noi, + startDate + 10, + ResponseStatus(ReplicationExecutor::RemoteCommandResponse( + BSON("ok" << 1 << "id" << 2 << "set" + << "rs0" + << "who" + << "h1" + << "cfgver" << 1 << "opTime" << Date_t(OpTime(10, 0).asDate())), + Milliseconds(8)))); } - - FreshnessChecker::ElectionAbortReason FreshnessCheckerTest::shouldAbortElection() const { - return _checker->shouldAbortElection(); + _net->runUntil(startDate + 10); + _net->exitNetwork(); + ASSERT_EQUALS(startDate + 10, _net->now()); + waitOnChecker(); + + stopCapturingLogMessages(); + ASSERT_EQUALS(shouldAbortElection(), FreshnessChecker::FresherNodeFound); +} + +TEST_F(FreshnessCheckerTest, ElectWrongTypeInFreshnessResponse) { + // other responds with "opTime" field of non-Date value, causing not freshest + startCapturingLogMessages(); + ReplicaSetConfig config = assertMakeRSConfig(BSON("_id" + << "rs0" + << 
"version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "h0") + << BSON("_id" << 2 << "host" + << "h1")))); + + std::vector<HostAndPort> hosts; + hosts.push_back(config.getMemberAt(1).getHostAndPort()); + + const BSONObj freshRequest = makeFreshRequest(config, OpTime(10, 0), 0); + + startTest(OpTime(10, 0), config, 0, hosts); + const Date_t startDate = _net->now(); + _net->enterNetwork(); + for (size_t i = 0; i < hosts.size(); ++i) { + const NetworkInterfaceMock::NetworkOperationIterator noi = _net->getNextReadyRequest(); + ASSERT_EQUALS("admin", noi->getRequest().dbname); + ASSERT_EQUALS(freshRequest, noi->getRequest().cmdObj); + ASSERT_EQUALS(HostAndPort("h1"), noi->getRequest().target); + _net->scheduleResponse(noi, + startDate + 10, + ResponseStatus(ReplicationExecutor::RemoteCommandResponse( + BSON("ok" << 1 << "id" << 2 << "set" + << "rs0" + << "who" + << "h1" + << "cfgver" << 1 << "opTime" << 3), + Milliseconds(8)))); } - - ReplicaSetConfig assertMakeRSConfig(const BSONObj& configBson) { - ReplicaSetConfig config; - ASSERT_OK(config.initialize(configBson)); - ASSERT_OK(config.validate()); - return config; + _net->runUntil(startDate + 10); + _net->exitNetwork(); + ASSERT_EQUALS(startDate + 10, _net->now()); + waitOnChecker(); + + stopCapturingLogMessages(); + + ASSERT_EQUALS(shouldAbortElection(), FreshnessChecker::FresherNodeFound); + ASSERT_EQUALS(1, + countLogLinesContaining( + "wrong type for opTime argument in replSetFresh " + "response: NumberInt32")); +} + +TEST_F(FreshnessCheckerTest, ElectVetoed) { + // other responds with veto + startCapturingLogMessages(); + ReplicaSetConfig config = assertMakeRSConfig(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "h0") + << BSON("_id" << 2 << "host" + << "h1")))); + + std::vector<HostAndPort> hosts; + hosts.push_back(config.getMemberAt(1).getHostAndPort()); + + const BSONObj freshRequest = makeFreshRequest(config, OpTime(10, 0), 0); + + startTest(OpTime(10, 0), config, 0, hosts); + const Date_t startDate = _net->now(); + _net->enterNetwork(); + for (size_t i = 0; i < hosts.size(); ++i) { + const NetworkInterfaceMock::NetworkOperationIterator noi = _net->getNextReadyRequest(); + ASSERT_EQUALS("admin", noi->getRequest().dbname); + ASSERT_EQUALS(freshRequest, noi->getRequest().cmdObj); + ASSERT_EQUALS(HostAndPort("h1"), noi->getRequest().target); + _net->scheduleResponse(noi, + startDate + 10, + ResponseStatus(ReplicationExecutor::RemoteCommandResponse( + BSON("ok" << 1 << "id" << 2 << "set" + << "rs0" + << "who" + << "h1" + << "cfgver" << 1 << "veto" << true << "errmsg" + << "I'd rather you didn't" + << "opTime" << Date_t(OpTime(0, 0).asDate())), + Milliseconds(8)))); } - - const BSONObj makeFreshRequest(const ReplicaSetConfig& rsConfig, - OpTime lastOpTimeApplied, - int selfIndex) { - const MemberConfig& myConfig = rsConfig.getMemberAt(selfIndex); - return BSON("replSetFresh" << 1 << - "set" << rsConfig.getReplSetName() << - "opTime" << Date_t(lastOpTimeApplied.asDate()) << - "who" << myConfig.getHostAndPort().toString() << - "cfgver" << rsConfig.getConfigVersion() << - "id" << myConfig.getId()); + _net->runUntil(startDate + 10); + _net->exitNetwork(); + ASSERT_EQUALS(startDate + 10, _net->now()); + waitOnChecker(); + + stopCapturingLogMessages(); + + ASSERT_EQUALS(shouldAbortElection(), FreshnessChecker::FresherNodeFound); + ASSERT_EQUALS(1, + countLogLinesContaining( + "not electing self, h1:27017 would veto with " + "'I'd rather you didn't'")); +} + +int 
findIdForMember(const ReplicaSetConfig& rsConfig, const HostAndPort& host) { + const MemberConfig* member = rsConfig.findMemberByHostAndPort(host); + ASSERT_TRUE(member != NULL) << "No host named " << host.toString() << " in config"; + return member->getId(); +} + +TEST_F(FreshnessCheckerTest, ElectNotElectingSelfWeAreNotFreshestManyNodes) { + // one other responds as fresher than us + startCapturingLogMessages(); + ReplicaSetConfig config = + assertMakeRSConfig(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "h0") + << BSON("_id" << 2 << "host" + << "h1") << BSON("_id" << 3 << "host" + << "h2") + << BSON("_id" << 4 << "host" + << "h3") << BSON("_id" << 5 << "host" + << "h4")))); + + std::vector<HostAndPort> hosts; + for (ReplicaSetConfig::MemberIterator mem = ++config.membersBegin(); mem != config.membersEnd(); + ++mem) { + hosts.push_back(mem->getHostAndPort()); } - // This is necessary because the run method must be scheduled in the Replication Executor - // for correct concurrency operation. - void FreshnessCheckerTest::freshnessCheckerRunner( - const ReplicationExecutor::CallbackData& data, - const OpTime& lastOpTimeApplied, - const ReplicaSetConfig& currentConfig, - int selfIndex, - const std::vector<HostAndPort>& hosts) { - - invariant(data.status.isOK()); - StatusWith<ReplicationExecutor::EventHandle> evh = _checker->start(data.executor, - lastOpTimeApplied, - currentConfig, - selfIndex, - hosts); - _checkerDoneEvent = assertGet(evh); + const BSONObj freshRequest = makeFreshRequest(config, OpTime(10, 0), 0); + + startTest(OpTime(10, 0), config, 0, hosts); + const Date_t startDate = _net->now(); + unordered_set<HostAndPort> seen; + _net->enterNetwork(); + for (size_t i = 0; i < hosts.size(); ++i) { + const NetworkInterfaceMock::NetworkOperationIterator noi = _net->getNextReadyRequest(); + const HostAndPort target = noi->getRequest().target; + ASSERT_EQUALS("admin", noi->getRequest().dbname); + ASSERT_EQUALS(freshRequest, noi->getRequest().cmdObj); + ASSERT(seen.insert(target).second) << "Already saw " << target; + BSONObjBuilder responseBuilder; + responseBuilder << "ok" << 1 << "id" << findIdForMember(config, target) << "set" + << "rs0" + << "who" << target.toString() << "cfgver" << 1 << "opTime" + << Date_t(OpTime(0, 0).asDate()); + if (target.host() == "h1") { + responseBuilder << "fresher" << true; + } + _net->scheduleResponse(noi, + startDate + 10, + ResponseStatus(ReplicationExecutor::RemoteCommandResponse( + responseBuilder.obj(), Milliseconds(8)))); } - - void FreshnessCheckerTest::startTest(const OpTime& lastOpTimeApplied, - const ReplicaSetConfig& currentConfig, - int selfIndex, - const std::vector<HostAndPort>& hosts) { - _executor->wait( - assertGet( - _executor->scheduleWork( - stdx::bind(&FreshnessCheckerTest::freshnessCheckerRunner, - this, - stdx::placeholders::_1, - lastOpTimeApplied, - currentConfig, - selfIndex, - hosts)))); + _net->runUntil(startDate + 10); + _net->exitNetwork(); + ASSERT_EQUALS(startDate + 10, _net->now()); + waitOnChecker(); + stopCapturingLogMessages(); + ASSERT_EQUALS(shouldAbortElection(), FreshnessChecker::FresherNodeFound); + ASSERT_EQUALS(1, countLogLinesContaining("not electing self, we are not freshest")); +} + +TEST_F(FreshnessCheckerTest, ElectNotElectingSelfWeAreNotFreshestOpTimeManyNodes) { + // one other responds with a later optime than ours + startCapturingLogMessages(); + ReplicaSetConfig config = + assertMakeRSConfig(BSON("_id" + << "rs0" + << "version" << 1 << "members" + 
<< BSON_ARRAY(BSON("_id" << 1 << "host" + << "h0") + << BSON("_id" << 2 << "host" + << "h1") << BSON("_id" << 3 << "host" + << "h2") + << BSON("_id" << 4 << "host" + << "h3") << BSON("_id" << 5 << "host" + << "h4")))); + + std::vector<HostAndPort> hosts; + for (ReplicaSetConfig::MemberIterator mem = config.membersBegin(); mem != config.membersEnd(); + ++mem) { + if (HostAndPort("h0") == mem->getHostAndPort()) { + continue; + } + hosts.push_back(mem->getHostAndPort()); } - TEST_F(FreshnessCheckerTest, TwoNodes) { - // Two nodes, we are node h1. We are freshest, but we tie with h2. - ReplicaSetConfig config = assertMakeRSConfig( - BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 1 << "host" << "h0") << - BSON("_id" << 2 << "host" << "h1")))); - - std::vector<HostAndPort> hosts; - hosts.push_back(config.getMemberAt(1).getHostAndPort()); - const BSONObj freshRequest = makeFreshRequest(config, OpTime(0,0), 0); - - startTest(OpTime(0, 0), config, 0, hosts); - const Date_t startDate = _net->now(); - _net->enterNetwork(); - for (size_t i = 0; i < hosts.size(); ++i) { - const NetworkInterfaceMock::NetworkOperationIterator noi = _net->getNextReadyRequest(); - ASSERT_EQUALS("admin", noi->getRequest().dbname); - ASSERT_EQUALS(freshRequest, noi->getRequest().cmdObj); - ASSERT_EQUALS(HostAndPort("h1"), noi->getRequest().target); - _net->scheduleResponse( - noi, - startDate + 10, - ResponseStatus(ReplicationExecutor::RemoteCommandResponse( - BSON("ok" << 1 << - "id" << 2 << - "set" << "rs0" << - "who" << "h1" << - "cfgver" << 1 << - "opTime" << Date_t(OpTime(0,0).asDate())), - Milliseconds(8)))); + const BSONObj freshRequest = makeFreshRequest(config, OpTime(10, 0), 0); + + startTest(OpTime(10, 0), config, 0, hosts); + const Date_t startDate = _net->now(); + unordered_set<HostAndPort> seen; + _net->enterNetwork(); + + for (size_t i = 0; i < hosts.size(); ++i) { + const NetworkInterfaceMock::NetworkOperationIterator noi = _net->getNextReadyRequest(); + const HostAndPort target = noi->getRequest().target; + ASSERT_EQUALS("admin", noi->getRequest().dbname); + ASSERT_EQUALS(freshRequest, noi->getRequest().cmdObj); + ASSERT(seen.insert(target).second) << "Already saw " << target; + BSONObjBuilder responseBuilder; + if (target.host() == "h4") { + responseBuilder << "ok" << 1 << "id" << findIdForMember(config, target) << "set" + << "rs0" + << "who" << target.toString() << "cfgver" << 1 << "opTime" + << Date_t(OpTime(20, 0).asDate()); + _net->scheduleResponse(noi, + startDate + 20, + ResponseStatus(ReplicationExecutor::RemoteCommandResponse( + responseBuilder.obj(), Milliseconds(8)))); + } else { + responseBuilder << "ok" << 1 << "id" << findIdForMember(config, target) << "set" + << "rs0" + << "who" << target.toString() << "cfgver" << 1 << "opTime" + << Date_t(OpTime(10, 0).asDate()); + _net->scheduleResponse(noi, + startDate + 10, + ResponseStatus(ReplicationExecutor::RemoteCommandResponse( + responseBuilder.obj(), Milliseconds(8)))); } - _net->runUntil(startDate + 10); - _net->exitNetwork(); - ASSERT_EQUALS(startDate + 10, _net->now()); - waitOnChecker(); - ASSERT_EQUALS(shouldAbortElection(), FreshnessChecker::FreshnessTie); } - - TEST_F(FreshnessCheckerTest, ShuttingDown) { - // Two nodes, we are node h1. Shutdown happens while we're scheduling remote commands. 
- ReplicaSetConfig config = assertMakeRSConfig( - BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 1 << "host" << "h0") << - BSON("_id" << 2 << "host" << "h1")))); - - std::vector<HostAndPort> hosts; - hosts.push_back(config.getMemberAt(1).getHostAndPort()); - - startTest( - OpTime(0, 0), - config, - 0, - hosts); - _executor->shutdown(); - waitOnChecker(); - - // This seems less than ideal, but if we are shutting down, the next phase of election - // cannot proceed anyway. - ASSERT_EQUALS(shouldAbortElection(),FreshnessChecker::None); - + _net->runUntil(startDate + 10); + ASSERT_EQUALS(startDate + 10, _net->now()); + ASSERT_EQUALS(0, countLogLinesContaining("not electing self, we are not freshest")); + _net->runUntil(startDate + 20); + ASSERT_EQUALS(startDate + 20, _net->now()); + _net->exitNetwork(); + waitOnChecker(); + stopCapturingLogMessages(); + ASSERT_EQUALS(shouldAbortElection(), FreshnessChecker::FresherNodeFound); +} + +TEST_F(FreshnessCheckerTest, ElectWrongTypeInFreshnessResponseManyNodes) { + // one other responds with "opTime" field of non-Date value, causing not freshest + startCapturingLogMessages(); + ReplicaSetConfig config = + assertMakeRSConfig(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "h0") + << BSON("_id" << 2 << "host" + << "h1") << BSON("_id" << 3 << "host" + << "h2") + << BSON("_id" << 4 << "host" + << "h3") << BSON("_id" << 5 << "host" + << "h4")))); + + std::vector<HostAndPort> hosts; + for (ReplicaSetConfig::MemberIterator mem = ++config.membersBegin(); mem != config.membersEnd(); + ++mem) { + hosts.push_back(mem->getHostAndPort()); } - TEST_F(FreshnessCheckerTest, ElectNotElectingSelfWeAreNotFreshest) { - // other responds as fresher than us - startCapturingLogMessages(); - ReplicaSetConfig config = assertMakeRSConfig( - BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 1 << "host" << "h0") << - BSON("_id" << 2 << "host" << "h1")))); - - std::vector<HostAndPort> hosts; - hosts.push_back(config.getMemberAt(1).getHostAndPort()); - - const BSONObj freshRequest = makeFreshRequest(config, OpTime(10,0), 0); - - startTest(OpTime(10, 0), config, 0, hosts); - const Date_t startDate = _net->now(); - _net->enterNetwork(); - for (size_t i = 0; i < hosts.size(); ++i) { - const NetworkInterfaceMock::NetworkOperationIterator noi = _net->getNextReadyRequest(); - ASSERT_EQUALS("admin", noi->getRequest().dbname); - ASSERT_EQUALS(freshRequest, noi->getRequest().cmdObj); - ASSERT_EQUALS(HostAndPort("h1"), noi->getRequest().target); - _net->scheduleResponse( - noi, - startDate + 10, - ResponseStatus(ReplicationExecutor::RemoteCommandResponse( - BSON("ok" << 1 << - "id" << 2 << - "set" << "rs0" << - "who" << "h1" << - "cfgver" << 1 << - "fresher" << true << - "opTime" << Date_t(OpTime(0,0).asDate())), - Milliseconds(8)))); + const BSONObj freshRequest = makeFreshRequest(config, OpTime(10, 0), 0); + + startTest(OpTime(10, 0), config, 0, hosts); + const Date_t startDate = _net->now(); + unordered_set<HostAndPort> seen; + _net->enterNetwork(); + for (size_t i = 0; i < hosts.size(); ++i) { + const NetworkInterfaceMock::NetworkOperationIterator noi = _net->getNextReadyRequest(); + const HostAndPort target = noi->getRequest().target; + ASSERT_EQUALS("admin", noi->getRequest().dbname); + ASSERT_EQUALS(freshRequest, noi->getRequest().cmdObj); + ASSERT(seen.insert(target).second) << "Already saw " << target; + BSONObjBuilder responseBuilder; + responseBuilder 
<< "ok" << 1 << "id" << findIdForMember(config, target) << "set" + << "rs0" + << "who" << target.toString() << "cfgver" << 1; + if (target.host() == "h1") { + responseBuilder << "opTime" << 3; + } else { + responseBuilder << "opTime" << Date_t(OpTime(0, 0).asDate()); } - _net->runUntil(startDate + 10); - _net->exitNetwork(); - ASSERT_EQUALS(startDate + 10, _net->now()); - waitOnChecker(); - - stopCapturingLogMessages(); - ASSERT_EQUALS(shouldAbortElection(), FreshnessChecker::FresherNodeFound); - ASSERT_EQUALS(1, countLogLinesContaining("not electing self, we are not freshest")); + _net->scheduleResponse(noi, + startDate + 10, + ResponseStatus(ReplicationExecutor::RemoteCommandResponse( + responseBuilder.obj(), Milliseconds(8)))); + } + _net->runUntil(startDate + 10); + _net->exitNetwork(); + ASSERT_EQUALS(startDate + 10, _net->now()); + waitOnChecker(); + stopCapturingLogMessages(); + ASSERT_EQUALS(shouldAbortElection(), FreshnessChecker::FresherNodeFound); + ASSERT_EQUALS(1, + countLogLinesContaining( + "wrong type for opTime argument in replSetFresh " + "response: NumberInt32")); +} + +TEST_F(FreshnessCheckerTest, ElectVetoedManyNodes) { + // one other responds with veto + startCapturingLogMessages(); + ReplicaSetConfig config = + assertMakeRSConfig(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "h0") + << BSON("_id" << 2 << "host" + << "h1") << BSON("_id" << 3 << "host" + << "h2") + << BSON("_id" << 4 << "host" + << "h3") << BSON("_id" << 5 << "host" + << "h4")))); + + std::vector<HostAndPort> hosts; + for (ReplicaSetConfig::MemberIterator mem = ++config.membersBegin(); mem != config.membersEnd(); + ++mem) { + hosts.push_back(mem->getHostAndPort()); } - TEST_F(FreshnessCheckerTest, ElectNotElectingSelfWeAreNotFreshestOpTime) { - // other responds with a later optime than ours - startCapturingLogMessages(); - ReplicaSetConfig config = assertMakeRSConfig( - BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 1 << "host" << "h0") << - BSON("_id" << 2 << "host" << "h1")))); - - std::vector<HostAndPort> hosts; - hosts.push_back(config.getMemberAt(1).getHostAndPort()); - - const BSONObj freshRequest = makeFreshRequest(config, OpTime(0,0), 0); - - startTest(OpTime(0, 0), config, 0, hosts); - const Date_t startDate = _net->now(); - _net->enterNetwork(); - for (size_t i = 0; i < hosts.size(); ++i) { - const NetworkInterfaceMock::NetworkOperationIterator noi = _net->getNextReadyRequest(); - ASSERT_EQUALS("admin", noi->getRequest().dbname); - ASSERT_EQUALS(freshRequest, noi->getRequest().cmdObj); - ASSERT_EQUALS(HostAndPort("h1"), noi->getRequest().target); - _net->scheduleResponse( - noi, - startDate + 10, - ResponseStatus(ReplicationExecutor::RemoteCommandResponse( - BSON("ok" << 1 << - "id" << 2 << - "set" << "rs0" << - "who" << "h1" << - "cfgver" << 1 << - "opTime" << Date_t(OpTime(10,0).asDate())), - Milliseconds(8)))); + const BSONObj freshRequest = makeFreshRequest(config, OpTime(10, 0), 0); + + startTest(OpTime(10, 0), config, 0, hosts); + const Date_t startDate = _net->now(); + unordered_set<HostAndPort> seen; + _net->enterNetwork(); + for (size_t i = 0; i < hosts.size(); ++i) { + const NetworkInterfaceMock::NetworkOperationIterator noi = _net->getNextReadyRequest(); + const HostAndPort target = noi->getRequest().target; + ASSERT_EQUALS("admin", noi->getRequest().dbname); + ASSERT_EQUALS(freshRequest, noi->getRequest().cmdObj); + ASSERT(seen.insert(target).second) << "Already saw " << target; + 
BSONObjBuilder responseBuilder; + responseBuilder << "ok" << 1 << "id" << findIdForMember(config, target) << "set" + << "rs0" + << "who" << target.toString() << "cfgver" << 1 << "opTime" + << Date_t(OpTime(0, 0).asDate()); + if (target.host() == "h1") { + responseBuilder << "veto" << true << "errmsg" + << "I'd rather you didn't"; } - _net->runUntil(startDate + 10); - _net->exitNetwork(); - ASSERT_EQUALS(startDate + 10, _net->now()); - waitOnChecker(); - - stopCapturingLogMessages(); - ASSERT_EQUALS(shouldAbortElection(), FreshnessChecker::FresherNodeFound); + _net->scheduleResponse(noi, + startDate + 10, + ResponseStatus(ReplicationExecutor::RemoteCommandResponse( + responseBuilder.obj(), Milliseconds(8)))); } - - TEST_F(FreshnessCheckerTest, ElectWrongTypeInFreshnessResponse) { - // other responds with "opTime" field of non-Date value, causing not freshest - startCapturingLogMessages(); - ReplicaSetConfig config = assertMakeRSConfig( - BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 1 << "host" << "h0") << - BSON("_id" << 2 << "host" << "h1")))); - - std::vector<HostAndPort> hosts; - hosts.push_back(config.getMemberAt(1).getHostAndPort()); - - const BSONObj freshRequest = makeFreshRequest(config, OpTime(10,0), 0); - - startTest(OpTime(10, 0), config, 0, hosts); - const Date_t startDate = _net->now(); - _net->enterNetwork(); - for (size_t i = 0; i < hosts.size(); ++i) { - const NetworkInterfaceMock::NetworkOperationIterator noi = _net->getNextReadyRequest(); - ASSERT_EQUALS("admin", noi->getRequest().dbname); - ASSERT_EQUALS(freshRequest, noi->getRequest().cmdObj); - ASSERT_EQUALS(HostAndPort("h1"), noi->getRequest().target); - _net->scheduleResponse( - noi, - startDate + 10, - ResponseStatus(ReplicationExecutor::RemoteCommandResponse( - BSON("ok" << 1 << - "id" << 2 << - "set" << "rs0" << - "who" << "h1" << - "cfgver" << 1 << - "opTime" << 3), - Milliseconds(8)))); + _net->runUntil(startDate + 10); + _net->exitNetwork(); + ASSERT_EQUALS(startDate + 10, _net->now()); + waitOnChecker(); + stopCapturingLogMessages(); + ASSERT_EQUALS(shouldAbortElection(), FreshnessChecker::FresherNodeFound); + ASSERT_EQUALS(1, + countLogLinesContaining( + "not electing self, h1:27017 would veto with " + "'I'd rather you didn't'")); +} + +TEST_F(FreshnessCheckerTest, ElectVetoedAndTiedFreshnessManyNodes) { + // one other responds with veto and another responds with tie + startCapturingLogMessages(); + ReplicaSetConfig config = + assertMakeRSConfig(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "h0") + << BSON("_id" << 2 << "host" + << "h1") << BSON("_id" << 3 << "host" + << "h2") + << BSON("_id" << 4 << "host" + << "h3") << BSON("_id" << 5 << "host" + << "h4")))); + + std::vector<HostAndPort> hosts; + for (ReplicaSetConfig::MemberIterator mem = config.membersBegin(); mem != config.membersEnd(); + ++mem) { + if (HostAndPort("h0") == mem->getHostAndPort()) { + continue; } - _net->runUntil(startDate + 10); - _net->exitNetwork(); - ASSERT_EQUALS(startDate + 10, _net->now()); - waitOnChecker(); - - stopCapturingLogMessages(); - - ASSERT_EQUALS(shouldAbortElection(), FreshnessChecker::FresherNodeFound); - ASSERT_EQUALS(1, countLogLinesContaining("wrong type for opTime argument in replSetFresh " - "response: NumberInt32")); + hosts.push_back(mem->getHostAndPort()); } - TEST_F(FreshnessCheckerTest, ElectVetoed) { - // other responds with veto - startCapturingLogMessages(); - ReplicaSetConfig config = assertMakeRSConfig( - 
BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 1 << "host" << "h0") << - BSON("_id" << 2 << "host" << "h1")))); - - std::vector<HostAndPort> hosts; - hosts.push_back(config.getMemberAt(1).getHostAndPort()); - - const BSONObj freshRequest = makeFreshRequest(config, OpTime(10,0), 0); - - startTest(OpTime(10, 0), config, 0, hosts); - const Date_t startDate = _net->now(); - _net->enterNetwork(); - for (size_t i = 0; i < hosts.size(); ++i) { - const NetworkInterfaceMock::NetworkOperationIterator noi = _net->getNextReadyRequest(); - ASSERT_EQUALS("admin", noi->getRequest().dbname); - ASSERT_EQUALS(freshRequest, noi->getRequest().cmdObj); - ASSERT_EQUALS(HostAndPort("h1"), noi->getRequest().target); - _net->scheduleResponse( - noi, - startDate + 10, - ResponseStatus(ReplicationExecutor::RemoteCommandResponse( - BSON("ok" << 1 << - "id" << 2 << - "set" << "rs0" << - "who" << "h1" << - "cfgver" << 1 << - "veto" << true << - "errmsg" << "I'd rather you didn't" << - "opTime" << Date_t(OpTime(0,0).asDate())), - Milliseconds(8)))); + const BSONObj freshRequest = makeFreshRequest(config, OpTime(10, 0), 0); + + startTest(OpTime(10, 0), config, 0, hosts); + const Date_t startDate = _net->now(); + unordered_set<HostAndPort> seen; + _net->enterNetwork(); + + for (size_t i = 0; i < hosts.size(); ++i) { + const NetworkInterfaceMock::NetworkOperationIterator noi = _net->getNextReadyRequest(); + const HostAndPort target = noi->getRequest().target; + ASSERT_EQUALS("admin", noi->getRequest().dbname); + ASSERT_EQUALS(freshRequest, noi->getRequest().cmdObj); + ASSERT(seen.insert(target).second) << "Already saw " << target; + BSONObjBuilder responseBuilder; + if (target.host() == "h4") { + responseBuilder << "ok" << 1 << "id" << findIdForMember(config, target) << "set" + << "rs0" + << "who" << target.toString() << "cfgver" << 1 << "veto" << true + << "errmsg" + << "I'd rather you didn't" + << "opTime" << Date_t(OpTime(10, 0).asDate()); + _net->scheduleResponse(noi, + startDate + 20, + ResponseStatus(ReplicationExecutor::RemoteCommandResponse( + responseBuilder.obj(), Milliseconds(8)))); + } else { + responseBuilder << "ok" << 1 << "id" << findIdForMember(config, target) << "set" + << "rs0" + << "who" << target.toString() << "cfgver" << 1 << "opTime" + << Date_t(OpTime(10, 0).asDate()); + _net->scheduleResponse(noi, + startDate + 10, + ResponseStatus(ReplicationExecutor::RemoteCommandResponse( + responseBuilder.obj(), Milliseconds(8)))); } - _net->runUntil(startDate + 10); - _net->exitNetwork(); - ASSERT_EQUALS(startDate + 10, _net->now()); - waitOnChecker(); - - stopCapturingLogMessages(); - - ASSERT_EQUALS(shouldAbortElection(), FreshnessChecker::FresherNodeFound); - ASSERT_EQUALS(1, countLogLinesContaining("not electing self, h1:27017 would veto with " - "'I'd rather you didn't'")); } - - int findIdForMember(const ReplicaSetConfig& rsConfig, const HostAndPort& host) { - const MemberConfig* member = rsConfig.findMemberByHostAndPort(host); - ASSERT_TRUE(member != NULL) << "No host named " << host.toString() << " in config"; - return member->getId(); + _net->runUntil(startDate + 10); + ASSERT_EQUALS(startDate + 10, _net->now()); + ASSERT_EQUALS(0, + countLogLinesContaining( + "not electing self, h4:27017 would veto with '" + "errmsg: \"I'd rather you didn't\"'")); + _net->runUntil(startDate + 20); + ASSERT_EQUALS(startDate + 20, _net->now()); + _net->exitNetwork(); + waitOnChecker(); + stopCapturingLogMessages(); + ASSERT_EQUALS(shouldAbortElection(), 
FreshnessChecker::FresherNodeFound); + ASSERT_EQUALS(1, + countLogLinesContaining( + "not electing self, h4:27017 would veto with " + "'I'd rather you didn't'")); +} + +TEST_F(FreshnessCheckerTest, ElectManyNodesNotAllRespond) { + ReplicaSetConfig config = + assertMakeRSConfig(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "h0") + << BSON("_id" << 2 << "host" + << "h1") << BSON("_id" << 3 << "host" + << "h2") + << BSON("_id" << 4 << "host" + << "h3") << BSON("_id" << 5 << "host" + << "h4")))); + + std::vector<HostAndPort> hosts; + for (ReplicaSetConfig::MemberIterator mem = ++config.membersBegin(); mem != config.membersEnd(); + ++mem) { + hosts.push_back(mem->getHostAndPort()); } - TEST_F(FreshnessCheckerTest, ElectNotElectingSelfWeAreNotFreshestManyNodes) { - // one other responds as fresher than us - startCapturingLogMessages(); - ReplicaSetConfig config = assertMakeRSConfig( - BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 1 << "host" << "h0") << - BSON("_id" << 2 << "host" << "h1") << - BSON("_id" << 3 << "host" << "h2") << - BSON("_id" << 4 << "host" << "h3") << - BSON("_id" << 5 << "host" << "h4")))); - - std::vector<HostAndPort> hosts; - for (ReplicaSetConfig::MemberIterator mem = ++config.membersBegin(); - mem != config.membersEnd(); - ++mem) { - hosts.push_back(mem->getHostAndPort()); - } - - const BSONObj freshRequest = makeFreshRequest(config, OpTime(10,0), 0); - - startTest(OpTime(10, 0), config, 0, hosts); - const Date_t startDate = _net->now(); - unordered_set<HostAndPort> seen; - _net->enterNetwork(); - for (size_t i = 0; i < hosts.size(); ++i) { - const NetworkInterfaceMock::NetworkOperationIterator noi = _net->getNextReadyRequest(); - const HostAndPort target = noi->getRequest().target; - ASSERT_EQUALS("admin", noi->getRequest().dbname); - ASSERT_EQUALS(freshRequest, noi->getRequest().cmdObj); - ASSERT(seen.insert(target).second) << "Already saw " << target; - BSONObjBuilder responseBuilder; - responseBuilder << - "ok" << 1 << - "id" << findIdForMember(config, target) << - "set" << "rs0" << - "who" << target.toString() << - "cfgver" << 1 << - "opTime" << Date_t(OpTime(0,0).asDate()); - if (target.host() == "h1") { - responseBuilder << "fresher" << true; - } + const OpTime lastOpTimeApplied(10, 0); + const BSONObj freshRequest = makeFreshRequest(config, lastOpTimeApplied, 0); + + startTest(OpTime(10, 0), config, 0, hosts); + const Date_t startDate = _net->now(); + unordered_set<HostAndPort> seen; + _net->enterNetwork(); + for (size_t i = 0; i < hosts.size(); ++i) { + const NetworkInterfaceMock::NetworkOperationIterator noi = _net->getNextReadyRequest(); + const HostAndPort target = noi->getRequest().target; + ASSERT_EQUALS("admin", noi->getRequest().dbname); + ASSERT_EQUALS(freshRequest, noi->getRequest().cmdObj); + ASSERT(seen.insert(target).second) << "Already saw " << target; + if (target.host() == "h2" || target.host() == "h3") { _net->scheduleResponse( - noi, - startDate + 10, - ResponseStatus(ReplicationExecutor::RemoteCommandResponse( - responseBuilder.obj(), - Milliseconds(8)))); - } - _net->runUntil(startDate + 10); - _net->exitNetwork(); - ASSERT_EQUALS(startDate + 10, _net->now()); - waitOnChecker(); - stopCapturingLogMessages(); - ASSERT_EQUALS(shouldAbortElection(), FreshnessChecker::FresherNodeFound); - ASSERT_EQUALS(1, countLogLinesContaining("not electing self, we are not freshest")); - } - - TEST_F(FreshnessCheckerTest, 
ElectNotElectingSelfWeAreNotFreshestOpTimeManyNodes) { - // one other responds with a later optime than ours - startCapturingLogMessages(); - ReplicaSetConfig config = assertMakeRSConfig( - BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 1 << "host" << "h0") << - BSON("_id" << 2 << "host" << "h1") << - BSON("_id" << 3 << "host" << "h2") << - BSON("_id" << 4 << "host" << "h3") << - BSON("_id" << 5 << "host" << "h4")))); - - std::vector<HostAndPort> hosts; - for (ReplicaSetConfig::MemberIterator mem = config.membersBegin(); - mem != config.membersEnd(); - ++mem) { - if (HostAndPort("h0") == mem->getHostAndPort()) { - continue; - } - hosts.push_back(mem->getHostAndPort()); - } - - const BSONObj freshRequest = makeFreshRequest(config, OpTime(10,0), 0); - - startTest(OpTime(10, 0), config, 0, hosts); - const Date_t startDate = _net->now(); - unordered_set<HostAndPort> seen; - _net->enterNetwork(); - - for (size_t i = 0; i < hosts.size(); ++i) { - const NetworkInterfaceMock::NetworkOperationIterator noi = _net->getNextReadyRequest(); - const HostAndPort target = noi->getRequest().target; - ASSERT_EQUALS("admin", noi->getRequest().dbname); - ASSERT_EQUALS(freshRequest, noi->getRequest().cmdObj); - ASSERT(seen.insert(target).second) << "Already saw " << target; + noi, startDate + 10, ResponseStatus(ErrorCodes::NoSuchKey, "No response")); + } else { BSONObjBuilder responseBuilder; - if (target.host() == "h4") { - responseBuilder << - "ok" << 1 << - "id" << findIdForMember(config, target) << - "set" << "rs0" << - "who" << target.toString() << - "cfgver" << 1 << - "opTime" << Date_t(OpTime(20,0).asDate()); - _net->scheduleResponse( - noi, - startDate + 20, - ResponseStatus(ReplicationExecutor::RemoteCommandResponse( - responseBuilder.obj(), - Milliseconds(8)))); - } - else { - responseBuilder << - "ok" << 1 << - "id" << findIdForMember(config, target) << - "set" << "rs0" << - "who" << target.toString() << - "cfgver" << 1 << - "opTime" << Date_t(OpTime(10,0).asDate()); - _net->scheduleResponse( - noi, - startDate + 10, - ResponseStatus(ReplicationExecutor::RemoteCommandResponse( - responseBuilder.obj(), - Milliseconds(8)))); - } + responseBuilder << "ok" << 1 << "id" << findIdForMember(config, target) << "set" + << "rs0" + << "who" << target.toString() << "cfgver" << 1 << "opTime" + << Date_t(OpTime(0, 0).asDate()); + _net->scheduleResponse(noi, + startDate + 10, + ResponseStatus(ReplicationExecutor::RemoteCommandResponse( + responseBuilder.obj(), Milliseconds(8)))); } - _net->runUntil(startDate + 10); - ASSERT_EQUALS(startDate + 10, _net->now()); - ASSERT_EQUALS(0, countLogLinesContaining("not electing self, we are not freshest")); - _net->runUntil(startDate + 20); - ASSERT_EQUALS(startDate + 20, _net->now()); - _net->exitNetwork(); - waitOnChecker(); - stopCapturingLogMessages(); - ASSERT_EQUALS(shouldAbortElection(), FreshnessChecker::FresherNodeFound); } + _net->runUntil(startDate + 10); + _net->exitNetwork(); + ASSERT_EQUALS(startDate + 10, _net->now()); + waitOnChecker(); + ASSERT_EQUALS(shouldAbortElection(), FreshnessChecker::None); +} + +class FreshnessScatterGatherTest : public mongo::unittest::Test { +public: + virtual void setUp() { + int selfConfigIndex = 0; + OpTime lastOpTimeApplied(100, 0); - TEST_F(FreshnessCheckerTest, ElectWrongTypeInFreshnessResponseManyNodes) { - // one other responds with "opTime" field of non-Date value, causing not freshest - startCapturingLogMessages(); - ReplicaSetConfig config = assertMakeRSConfig( - BSON("_id" << "rs0" 
<< - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 1 << "host" << "h0") << - BSON("_id" << 2 << "host" << "h1") << - BSON("_id" << 3 << "host" << "h2") << - BSON("_id" << 4 << "host" << "h3") << - BSON("_id" << 5 << "host" << "h4")))); + ReplicaSetConfig config; + config.initialize(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "host0") + << BSON("_id" << 1 << "host" + << "host1") << BSON("_id" << 2 << "host" + << "host2")))); std::vector<HostAndPort> hosts; for (ReplicaSetConfig::MemberIterator mem = ++config.membersBegin(); - mem != config.membersEnd(); - ++mem) { + mem != config.membersEnd(); + ++mem) { hosts.push_back(mem->getHostAndPort()); } - const BSONObj freshRequest = makeFreshRequest(config, OpTime(10,0), 0); - - startTest(OpTime(10, 0), config, 0, hosts); - const Date_t startDate = _net->now(); - unordered_set<HostAndPort> seen; - _net->enterNetwork(); - for (size_t i = 0; i < hosts.size(); ++i) { - const NetworkInterfaceMock::NetworkOperationIterator noi = _net->getNextReadyRequest(); - const HostAndPort target = noi->getRequest().target; - ASSERT_EQUALS("admin", noi->getRequest().dbname); - ASSERT_EQUALS(freshRequest, noi->getRequest().cmdObj); - ASSERT(seen.insert(target).second) << "Already saw " << target; - BSONObjBuilder responseBuilder; - responseBuilder << - "ok" << 1 << - "id" << findIdForMember(config, target) << - "set" << "rs0" << - "who" << target.toString() << - "cfgver" << 1; - if (target.host() == "h1") { - responseBuilder << "opTime" << 3; - } - else { - responseBuilder << "opTime" << Date_t(OpTime(0,0).asDate()); - } - _net->scheduleResponse( - noi, - startDate + 10, - ResponseStatus(ReplicationExecutor::RemoteCommandResponse( - responseBuilder.obj(), - Milliseconds(8)))); - } - _net->runUntil(startDate + 10); - _net->exitNetwork(); - ASSERT_EQUALS(startDate + 10, _net->now()); - waitOnChecker(); - stopCapturingLogMessages(); - ASSERT_EQUALS(shouldAbortElection(), FreshnessChecker::FresherNodeFound); - ASSERT_EQUALS(1, countLogLinesContaining("wrong type for opTime argument in replSetFresh " - "response: NumberInt32")); + _checker.reset( + new FreshnessChecker::Algorithm(lastOpTimeApplied, config, selfConfigIndex, hosts)); } - TEST_F(FreshnessCheckerTest, ElectVetoedManyNodes) { - // one other responds with veto - startCapturingLogMessages(); - ReplicaSetConfig config = assertMakeRSConfig( - BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 1 << "host" << "h0") << - BSON("_id" << 2 << "host" << "h1") << - BSON("_id" << 3 << "host" << "h2") << - BSON("_id" << 4 << "host" << "h3") << - BSON("_id" << 5 << "host" << "h4")))); + virtual void tearDown() { + _checker.reset(NULL); + } - std::vector<HostAndPort> hosts; - for (ReplicaSetConfig::MemberIterator mem = ++config.membersBegin(); - mem != config.membersEnd(); - ++mem) { - hosts.push_back(mem->getHostAndPort()); - } +protected: + bool hasReceivedSufficientResponses() { + return _checker->hasReceivedSufficientResponses(); + } - const BSONObj freshRequest = makeFreshRequest(config, OpTime(10,0), 0); - - startTest(OpTime(10, 0), config, 0, hosts); - const Date_t startDate = _net->now(); - unordered_set<HostAndPort> seen; - _net->enterNetwork(); - for (size_t i = 0; i < hosts.size(); ++i) { - const NetworkInterfaceMock::NetworkOperationIterator noi = _net->getNextReadyRequest(); - const HostAndPort target = noi->getRequest().target; - ASSERT_EQUALS("admin", noi->getRequest().dbname); - 
ASSERT_EQUALS(freshRequest, noi->getRequest().cmdObj); - ASSERT(seen.insert(target).second) << "Already saw " << target; - BSONObjBuilder responseBuilder; - responseBuilder << - "ok" << 1 << - "id" << findIdForMember(config, target) << - "set" << "rs0" << - "who" << target.toString() << - "cfgver" << 1 << - "opTime" << Date_t(OpTime(0,0).asDate()); - if (target.host() == "h1") { - responseBuilder << "veto" << true << "errmsg" << "I'd rather you didn't"; - } - _net->scheduleResponse( - noi, - startDate + 10, - ResponseStatus(ReplicationExecutor::RemoteCommandResponse( - responseBuilder.obj(), - Milliseconds(8)))); - } - _net->runUntil(startDate + 10); - _net->exitNetwork(); - ASSERT_EQUALS(startDate + 10, _net->now()); - waitOnChecker(); - stopCapturingLogMessages(); - ASSERT_EQUALS(shouldAbortElection(), FreshnessChecker::FresherNodeFound); - ASSERT_EQUALS(1, countLogLinesContaining("not electing self, h1:27017 would veto with " - "'I'd rather you didn't'")); + void processResponse(const RemoteCommandRequest& request, const ResponseStatus& response) { + _checker->processResponse(request, response); } - TEST_F(FreshnessCheckerTest, ElectVetoedAndTiedFreshnessManyNodes) { - // one other responds with veto and another responds with tie - startCapturingLogMessages(); - ReplicaSetConfig config = assertMakeRSConfig( - BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 1 << "host" << "h0") << - BSON("_id" << 2 << "host" << "h1") << - BSON("_id" << 3 << "host" << "h2") << - BSON("_id" << 4 << "host" << "h3") << - BSON("_id" << 5 << "host" << "h4")))); - - std::vector<HostAndPort> hosts; - for (ReplicaSetConfig::MemberIterator mem = config.membersBegin(); - mem != config.membersEnd(); - ++mem) { - if (HostAndPort("h0") == mem->getHostAndPort()) { - continue; - } - hosts.push_back(mem->getHostAndPort()); - } + FreshnessChecker::ElectionAbortReason shouldAbortElection() const { + return _checker->shouldAbortElection(); + } - const BSONObj freshRequest = makeFreshRequest(config, OpTime(10,0), 0); + ResponseStatus lessFresh() { + BSONObjBuilder bb; + bb.append("ok", 1.0); + bb.appendDate("opTime", OpTime(10, 0).asDate()); + return ResponseStatus(NetworkInterfaceMock::Response(bb.obj(), Milliseconds(10))); + } - startTest(OpTime(10, 0), config, 0, hosts); - const Date_t startDate = _net->now(); - unordered_set<HostAndPort> seen; - _net->enterNetwork(); + ResponseStatus moreFreshViaOpTime() { + BSONObjBuilder bb; + bb.append("ok", 1.0); + bb.appendDate("opTime", OpTime(110, 0).asDate()); + return ResponseStatus(NetworkInterfaceMock::Response(bb.obj(), Milliseconds(10))); + } - for (size_t i = 0; i < hosts.size(); ++i) { - const NetworkInterfaceMock::NetworkOperationIterator noi = _net->getNextReadyRequest(); - const HostAndPort target = noi->getRequest().target; - ASSERT_EQUALS("admin", noi->getRequest().dbname); - ASSERT_EQUALS(freshRequest, noi->getRequest().cmdObj); - ASSERT(seen.insert(target).second) << "Already saw " << target; - BSONObjBuilder responseBuilder; - if (target.host() == "h4") { - responseBuilder << - "ok" << 1 << - "id" << findIdForMember(config, target) << - "set" << "rs0" << - "who" << target.toString() << - "cfgver" << 1 << - "veto" << true << - "errmsg" << "I'd rather you didn't" << - "opTime" << Date_t(OpTime(10,0).asDate()); - _net->scheduleResponse( - noi, - startDate + 20, - ResponseStatus(ReplicationExecutor::RemoteCommandResponse( - responseBuilder.obj(), - Milliseconds(8)))); - } - else { - responseBuilder << - "ok" << 1 << - "id" << 
findIdForMember(config, target) << - "set" << "rs0" << - "who" << target.toString() << - "cfgver" << 1 << - "opTime" << Date_t(OpTime(10,0).asDate()); - _net->scheduleResponse( - noi, - startDate + 10, - ResponseStatus(ReplicationExecutor::RemoteCommandResponse( - responseBuilder.obj(), - Milliseconds(8)))); - } - } - _net->runUntil(startDate + 10); - ASSERT_EQUALS(startDate + 10, _net->now()); - ASSERT_EQUALS(0, countLogLinesContaining("not electing self, h4:27017 would veto with '" - "errmsg: \"I'd rather you didn't\"'")); - _net->runUntil(startDate + 20); - ASSERT_EQUALS(startDate + 20, _net->now()); - _net->exitNetwork(); - waitOnChecker(); - stopCapturingLogMessages(); - ASSERT_EQUALS(shouldAbortElection(), FreshnessChecker::FresherNodeFound); - ASSERT_EQUALS(1, countLogLinesContaining("not electing self, h4:27017 would veto with " - "'I'd rather you didn't'")); + ResponseStatus wrongTypeForOpTime() { + BSONObjBuilder bb; + bb.append("ok", 1.0); + bb.append("opTime", std::string("several minutes ago")); + return ResponseStatus(NetworkInterfaceMock::Response(bb.obj(), Milliseconds(10))); } - TEST_F(FreshnessCheckerTest, ElectManyNodesNotAllRespond) { - ReplicaSetConfig config = assertMakeRSConfig( - BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 1 << "host" << "h0") << - BSON("_id" << 2 << "host" << "h1") << - BSON("_id" << 3 << "host" << "h2") << - BSON("_id" << 4 << "host" << "h3") << - BSON("_id" << 5 << "host" << "h4")))); + ResponseStatus unauthorized() { + BSONObjBuilder bb; + bb.append("ok", 0.0); + bb.append("code", ErrorCodes::Unauthorized); + bb.append("errmsg", "Unauthorized"); + return ResponseStatus(NetworkInterfaceMock::Response(bb.obj(), Milliseconds(10))); + } - std::vector<HostAndPort> hosts; - for (ReplicaSetConfig::MemberIterator mem = ++config.membersBegin(); - mem != config.membersEnd(); - ++mem) { - hosts.push_back(mem->getHostAndPort()); - } + ResponseStatus tiedForFreshness() { + BSONObjBuilder bb; + bb.append("ok", 1.0); + bb.appendDate("opTime", OpTime(100, 0).asDate()); + return ResponseStatus(NetworkInterfaceMock::Response(bb.obj(), Milliseconds(10))); + } - const OpTime lastOpTimeApplied(10,0); - const BSONObj freshRequest = makeFreshRequest(config, lastOpTimeApplied, 0); - - startTest(OpTime(10, 0), config, 0, hosts); - const Date_t startDate = _net->now(); - unordered_set<HostAndPort> seen; - _net->enterNetwork(); - for (size_t i = 0; i < hosts.size(); ++i) { - const NetworkInterfaceMock::NetworkOperationIterator noi = _net->getNextReadyRequest(); - const HostAndPort target = noi->getRequest().target; - ASSERT_EQUALS("admin", noi->getRequest().dbname); - ASSERT_EQUALS(freshRequest, noi->getRequest().cmdObj); - ASSERT(seen.insert(target).second) << "Already saw " << target; - if (target.host() == "h2" || target.host() == "h3") { - _net->scheduleResponse( - noi, - startDate + 10, - ResponseStatus(ErrorCodes::NoSuchKey, "No response")); - } - else { - BSONObjBuilder responseBuilder; - responseBuilder << - "ok" << 1 << - "id" << findIdForMember(config, target) << - "set" << "rs0" << - "who" << target.toString() << - "cfgver" << 1 << - "opTime" << Date_t(OpTime(0,0).asDate()); - _net->scheduleResponse( - noi, - startDate + 10, - ResponseStatus(ReplicationExecutor::RemoteCommandResponse( - responseBuilder.obj(), - Milliseconds(8)))); - } - } - _net->runUntil(startDate + 10); - _net->exitNetwork(); - ASSERT_EQUALS(startDate + 10, _net->now()); - waitOnChecker(); - ASSERT_EQUALS(shouldAbortElection(),FreshnessChecker::None); 
+ ResponseStatus moreFresh() { + return ResponseStatus(NetworkInterfaceMock::Response(BSON("ok" << 1.0 << "fresher" << true), + Milliseconds(10))); } - class FreshnessScatterGatherTest : public mongo::unittest::Test { - public: - virtual void setUp() { - int selfConfigIndex = 0; - OpTime lastOpTimeApplied(100, 0); - - ReplicaSetConfig config; - config.initialize(BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 0 << "host" << "host0") << - BSON("_id" << 1 << "host" << "host1") << - BSON("_id" << 2 << "host" << "host2")))); - - std::vector<HostAndPort> hosts; - for (ReplicaSetConfig::MemberIterator mem = ++config.membersBegin(); - mem != config.membersEnd(); - ++mem) { - hosts.push_back(mem->getHostAndPort()); - } - - _checker.reset(new FreshnessChecker::Algorithm(lastOpTimeApplied, - config, - selfConfigIndex, - hosts)); + ResponseStatus veto() { + return ResponseStatus( + NetworkInterfaceMock::Response(BSON("ok" << 1.0 << "veto" << true << "errmsg" + << "vetoed!"), + Milliseconds(10))); + } - } + RemoteCommandRequest requestFrom(std::string hostname) { + return RemoteCommandRequest(HostAndPort(hostname), + "", // the non-hostname fields do not matter in Freshness + BSONObj(), + Milliseconds(0)); + } - virtual void tearDown() { - _checker.reset(NULL); - } +private: + scoped_ptr<FreshnessChecker::Algorithm> _checker; +}; - protected: - bool hasReceivedSufficientResponses() { - return _checker->hasReceivedSufficientResponses(); - } +TEST_F(FreshnessScatterGatherTest, BothNodesLessFresh) { + ASSERT_FALSE(hasReceivedSufficientResponses()); - void processResponse(const RemoteCommandRequest& request, const ResponseStatus& response) { - _checker->processResponse(request, response); - } + processResponse(requestFrom("host1"), lessFresh()); + ASSERT_FALSE(hasReceivedSufficientResponses()); - FreshnessChecker::ElectionAbortReason shouldAbortElection() const { - return _checker->shouldAbortElection(); - } + processResponse(requestFrom("host2"), lessFresh()); + ASSERT_TRUE(hasReceivedSufficientResponses()); + ASSERT_EQUALS(shouldAbortElection(), FreshnessChecker::None); +} - ResponseStatus lessFresh() { - BSONObjBuilder bb; - bb.append("ok", 1.0); - bb.appendDate("opTime", OpTime(10, 0).asDate()); - return ResponseStatus(NetworkInterfaceMock::Response(bb.obj(), Milliseconds(10))); - } +TEST_F(FreshnessScatterGatherTest, FirstNodeFresher) { + ASSERT_FALSE(hasReceivedSufficientResponses()); - ResponseStatus moreFreshViaOpTime() { - BSONObjBuilder bb; - bb.append("ok", 1.0); - bb.appendDate("opTime", OpTime(110, 0).asDate()); - return ResponseStatus(NetworkInterfaceMock::Response(bb.obj(), Milliseconds(10))); - } + processResponse(requestFrom("host1"), moreFresh()); + ASSERT_TRUE(hasReceivedSufficientResponses()); + ASSERT_EQUALS(shouldAbortElection(), FreshnessChecker::FresherNodeFound); +} - ResponseStatus wrongTypeForOpTime() { - BSONObjBuilder bb; - bb.append("ok", 1.0); - bb.append("opTime", std::string("several minutes ago")); - return ResponseStatus(NetworkInterfaceMock::Response(bb.obj(), Milliseconds(10))); - } +TEST_F(FreshnessScatterGatherTest, FirstNodeFresherViaOpTime) { + ASSERT_FALSE(hasReceivedSufficientResponses()); - ResponseStatus unauthorized() { - BSONObjBuilder bb; - bb.append("ok", 0.0); - bb.append("code", ErrorCodes::Unauthorized); - bb.append("errmsg", "Unauthorized"); - return ResponseStatus(NetworkInterfaceMock::Response(bb.obj(), Milliseconds(10))); - } + processResponse(requestFrom("host1"), moreFreshViaOpTime()); + 
ASSERT_TRUE(hasReceivedSufficientResponses()); + ASSERT_EQUALS(shouldAbortElection(), FreshnessChecker::FresherNodeFound); +} - ResponseStatus tiedForFreshness() { - BSONObjBuilder bb; - bb.append("ok", 1.0); - bb.appendDate("opTime", OpTime(100, 0).asDate()); - return ResponseStatus(NetworkInterfaceMock::Response(bb.obj(), Milliseconds(10))); - } +TEST_F(FreshnessScatterGatherTest, FirstNodeVetoes) { + ASSERT_FALSE(hasReceivedSufficientResponses()); - ResponseStatus moreFresh() { - return ResponseStatus(NetworkInterfaceMock::Response(BSON("ok" << 1.0 << - "fresher" << true), - Milliseconds(10))); - } + processResponse(requestFrom("host1"), veto()); + ASSERT_TRUE(hasReceivedSufficientResponses()); + ASSERT_EQUALS(shouldAbortElection(), FreshnessChecker::FresherNodeFound); +} - ResponseStatus veto() { - return ResponseStatus(NetworkInterfaceMock::Response(BSON("ok" << 1.0 << - "veto" << true << - "errmsg" << "vetoed!"), - Milliseconds(10))); - } +TEST_F(FreshnessScatterGatherTest, FirstNodeWrongTypeForOpTime) { + ASSERT_FALSE(hasReceivedSufficientResponses()); - RemoteCommandRequest requestFrom(std::string hostname) { - return RemoteCommandRequest(HostAndPort(hostname), - "", // the non-hostname fields do not matter in Freshness - BSONObj(), - Milliseconds(0)); - } - private: - scoped_ptr<FreshnessChecker::Algorithm> _checker; - }; - - TEST_F(FreshnessScatterGatherTest, BothNodesLessFresh) { - ASSERT_FALSE(hasReceivedSufficientResponses()); - - processResponse(requestFrom("host1"), lessFresh()); - ASSERT_FALSE(hasReceivedSufficientResponses()); - - processResponse(requestFrom("host2"), lessFresh()); - ASSERT_TRUE(hasReceivedSufficientResponses()); - ASSERT_EQUALS(shouldAbortElection(),FreshnessChecker::None); - } + processResponse(requestFrom("host1"), wrongTypeForOpTime()); + ASSERT_TRUE(hasReceivedSufficientResponses()); + ASSERT_EQUALS(shouldAbortElection(), FreshnessChecker::FresherNodeFound); +} - TEST_F(FreshnessScatterGatherTest, FirstNodeFresher) { - ASSERT_FALSE(hasReceivedSufficientResponses()); - - processResponse(requestFrom("host1"), moreFresh()); - ASSERT_TRUE(hasReceivedSufficientResponses()); - ASSERT_EQUALS(shouldAbortElection(), FreshnessChecker::FresherNodeFound); - } +TEST_F(FreshnessScatterGatherTest, FirstNodeTiedForFreshness) { + ASSERT_FALSE(hasReceivedSufficientResponses()); - TEST_F(FreshnessScatterGatherTest, FirstNodeFresherViaOpTime) { - ASSERT_FALSE(hasReceivedSufficientResponses()); - - processResponse(requestFrom("host1"), moreFreshViaOpTime()); - ASSERT_TRUE(hasReceivedSufficientResponses()); - ASSERT_EQUALS(shouldAbortElection(), FreshnessChecker::FresherNodeFound); - } + processResponse(requestFrom("host1"), tiedForFreshness()); + ASSERT_FALSE(hasReceivedSufficientResponses()); - TEST_F(FreshnessScatterGatherTest, FirstNodeVetoes) { - ASSERT_FALSE(hasReceivedSufficientResponses()); - - processResponse(requestFrom("host1"), veto()); - ASSERT_TRUE(hasReceivedSufficientResponses()); - ASSERT_EQUALS(shouldAbortElection(), FreshnessChecker::FresherNodeFound); - } + processResponse(requestFrom("host2"), lessFresh()); + ASSERT_TRUE(hasReceivedSufficientResponses()); + ASSERT_EQUALS(shouldAbortElection(), FreshnessChecker::FreshnessTie); +} - TEST_F(FreshnessScatterGatherTest, FirstNodeWrongTypeForOpTime) { - ASSERT_FALSE(hasReceivedSufficientResponses()); - - processResponse(requestFrom("host1"), wrongTypeForOpTime()); - ASSERT_TRUE(hasReceivedSufficientResponses()); - ASSERT_EQUALS(shouldAbortElection(), FreshnessChecker::FresherNodeFound); - } 
+TEST_F(FreshnessScatterGatherTest, FirstNodeTiedAndSecondFresher) { + ASSERT_FALSE(hasReceivedSufficientResponses()); - TEST_F(FreshnessScatterGatherTest, FirstNodeTiedForFreshness) { - ASSERT_FALSE(hasReceivedSufficientResponses()); - - processResponse(requestFrom("host1"), tiedForFreshness()); - ASSERT_FALSE(hasReceivedSufficientResponses()); + processResponse(requestFrom("host1"), tiedForFreshness()); + ASSERT_FALSE(hasReceivedSufficientResponses()); - processResponse(requestFrom("host2"), lessFresh()); - ASSERT_TRUE(hasReceivedSufficientResponses()); - ASSERT_EQUALS(shouldAbortElection(), FreshnessChecker::FreshnessTie); - } + processResponse(requestFrom("host2"), moreFresh()); + ASSERT_TRUE(hasReceivedSufficientResponses()); + ASSERT_EQUALS(shouldAbortElection(), FreshnessChecker::FresherNodeFound); +} - TEST_F(FreshnessScatterGatherTest, FirstNodeTiedAndSecondFresher) { - ASSERT_FALSE(hasReceivedSufficientResponses()); - - processResponse(requestFrom("host1"), tiedForFreshness()); - ASSERT_FALSE(hasReceivedSufficientResponses()); +TEST_F(FreshnessScatterGatherTest, FirstNodeTiedAndSecondFresherViaOpTime) { + ASSERT_FALSE(hasReceivedSufficientResponses()); - processResponse(requestFrom("host2"), moreFresh()); - ASSERT_TRUE(hasReceivedSufficientResponses()); - ASSERT_EQUALS(shouldAbortElection(), FreshnessChecker::FresherNodeFound); - } + processResponse(requestFrom("host1"), tiedForFreshness()); + ASSERT_FALSE(hasReceivedSufficientResponses()); - TEST_F(FreshnessScatterGatherTest, FirstNodeTiedAndSecondFresherViaOpTime) { - ASSERT_FALSE(hasReceivedSufficientResponses()); - - processResponse(requestFrom("host1"), tiedForFreshness()); - ASSERT_FALSE(hasReceivedSufficientResponses()); + processResponse(requestFrom("host2"), moreFreshViaOpTime()); + ASSERT_TRUE(hasReceivedSufficientResponses()); + ASSERT_EQUALS(shouldAbortElection(), FreshnessChecker::FresherNodeFound); +} - processResponse(requestFrom("host2"), moreFreshViaOpTime()); - ASSERT_TRUE(hasReceivedSufficientResponses()); - ASSERT_EQUALS(shouldAbortElection(), FreshnessChecker::FresherNodeFound); - } +TEST_F(FreshnessScatterGatherTest, FirstNodeTiedAndSecondVetoes) { + ASSERT_FALSE(hasReceivedSufficientResponses()); - TEST_F(FreshnessScatterGatherTest, FirstNodeTiedAndSecondVetoes) { - ASSERT_FALSE(hasReceivedSufficientResponses()); - - processResponse(requestFrom("host1"), tiedForFreshness()); - ASSERT_FALSE(hasReceivedSufficientResponses()); + processResponse(requestFrom("host1"), tiedForFreshness()); + ASSERT_FALSE(hasReceivedSufficientResponses()); - processResponse(requestFrom("host2"), veto()); - ASSERT_TRUE(hasReceivedSufficientResponses()); - ASSERT_EQUALS(shouldAbortElection(), FreshnessChecker::FresherNodeFound); - } + processResponse(requestFrom("host2"), veto()); + ASSERT_TRUE(hasReceivedSufficientResponses()); + ASSERT_EQUALS(shouldAbortElection(), FreshnessChecker::FresherNodeFound); +} - TEST_F(FreshnessScatterGatherTest, FirstNodeTiedAndSecondWrongTypeForOpTime) { - ASSERT_FALSE(hasReceivedSufficientResponses()); - - processResponse(requestFrom("host1"), tiedForFreshness()); - ASSERT_FALSE(hasReceivedSufficientResponses()); +TEST_F(FreshnessScatterGatherTest, FirstNodeTiedAndSecondWrongTypeForOpTime) { + ASSERT_FALSE(hasReceivedSufficientResponses()); - processResponse(requestFrom("host2"), wrongTypeForOpTime()); - ASSERT_TRUE(hasReceivedSufficientResponses()); - ASSERT_EQUALS(shouldAbortElection(), FreshnessChecker::FresherNodeFound); - } + processResponse(requestFrom("host1"), tiedForFreshness()); + 
ASSERT_FALSE(hasReceivedSufficientResponses()); - TEST_F(FreshnessScatterGatherTest, FirstNodeLessFreshAndSecondWrongTypeForOpTime) { - ASSERT_FALSE(hasReceivedSufficientResponses()); + processResponse(requestFrom("host2"), wrongTypeForOpTime()); + ASSERT_TRUE(hasReceivedSufficientResponses()); + ASSERT_EQUALS(shouldAbortElection(), FreshnessChecker::FresherNodeFound); +} - processResponse(requestFrom("host1"), lessFresh()); - ASSERT_FALSE(hasReceivedSufficientResponses()); +TEST_F(FreshnessScatterGatherTest, FirstNodeLessFreshAndSecondWrongTypeForOpTime) { + ASSERT_FALSE(hasReceivedSufficientResponses()); - processResponse(requestFrom("host2"), wrongTypeForOpTime()); - ASSERT_TRUE(hasReceivedSufficientResponses()); - ASSERT_EQUALS(shouldAbortElection(), FreshnessChecker::FresherNodeFound); - } + processResponse(requestFrom("host1"), lessFresh()); + ASSERT_FALSE(hasReceivedSufficientResponses()); - TEST_F(FreshnessScatterGatherTest, SecondNodeTiedAndFirstWrongTypeForOpTime) { - ASSERT_FALSE(hasReceivedSufficientResponses()); + processResponse(requestFrom("host2"), wrongTypeForOpTime()); + ASSERT_TRUE(hasReceivedSufficientResponses()); + ASSERT_EQUALS(shouldAbortElection(), FreshnessChecker::FresherNodeFound); +} - processResponse(requestFrom("host2"), wrongTypeForOpTime()); - ASSERT_TRUE(hasReceivedSufficientResponses()); - ASSERT_EQUALS(shouldAbortElection(), FreshnessChecker::FresherNodeFound); - } +TEST_F(FreshnessScatterGatherTest, SecondNodeTiedAndFirstWrongTypeForOpTime) { + ASSERT_FALSE(hasReceivedSufficientResponses()); - TEST_F(FreshnessScatterGatherTest, NotEnoughVotersDueNetworkErrors) { - ASSERT_FALSE(hasReceivedSufficientResponses()); + processResponse(requestFrom("host2"), wrongTypeForOpTime()); + ASSERT_TRUE(hasReceivedSufficientResponses()); + ASSERT_EQUALS(shouldAbortElection(), FreshnessChecker::FresherNodeFound); +} - processResponse(requestFrom("host1"), - ResponseStatus(Status(ErrorCodes::NetworkTimeout, ""))); - ASSERT_FALSE(hasReceivedSufficientResponses()); +TEST_F(FreshnessScatterGatherTest, NotEnoughVotersDueNetworkErrors) { + ASSERT_FALSE(hasReceivedSufficientResponses()); - processResponse(requestFrom("host2"), - ResponseStatus(Status(ErrorCodes::NetworkTimeout, ""))); - ASSERT_TRUE(hasReceivedSufficientResponses()); - ASSERT_EQUALS(shouldAbortElection(), FreshnessChecker::QuorumUnreachable); - } + processResponse(requestFrom("host1"), ResponseStatus(Status(ErrorCodes::NetworkTimeout, ""))); + ASSERT_FALSE(hasReceivedSufficientResponses()); - TEST_F(FreshnessScatterGatherTest, NotEnoughVotersDueToUnauthorized) { - ASSERT_FALSE(hasReceivedSufficientResponses()); + processResponse(requestFrom("host2"), ResponseStatus(Status(ErrorCodes::NetworkTimeout, ""))); + ASSERT_TRUE(hasReceivedSufficientResponses()); + ASSERT_EQUALS(shouldAbortElection(), FreshnessChecker::QuorumUnreachable); +} - processResponse(requestFrom("host1"), unauthorized()); - ASSERT_FALSE(hasReceivedSufficientResponses()); +TEST_F(FreshnessScatterGatherTest, NotEnoughVotersDueToUnauthorized) { + ASSERT_FALSE(hasReceivedSufficientResponses()); - processResponse(requestFrom("host2"), unauthorized()); - ASSERT_TRUE(hasReceivedSufficientResponses()); - ASSERT_EQUALS(shouldAbortElection(), FreshnessChecker::QuorumUnreachable); - } + processResponse(requestFrom("host1"), unauthorized()); + ASSERT_FALSE(hasReceivedSufficientResponses()); + + processResponse(requestFrom("host2"), unauthorized()); + ASSERT_TRUE(hasReceivedSufficientResponses()); + ASSERT_EQUALS(shouldAbortElection(), 
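    // Network timeouts and Unauthorized replies land in the same bucket: the
    // responder cannot be counted toward the quorum. One plausible tally,
    // sketched freestanding (the real accounting lives inside
    // FreshnessChecker::Algorithm):
    //
    //     int respondersNeeded = voters / 2 + 1;  // majority, counting ourselves
    //     int respondersLeft = voters - failedResponses;
    //     if (respondersLeft < respondersNeeded)
    //         return FreshnessChecker::QuorumUnreachable;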
FreshnessChecker::QuorumUnreachable); +} } // namespace } // namespace repl diff --git a/src/mongo/db/repl/handshake_args.cpp b/src/mongo/db/repl/handshake_args.cpp index db815ee3aa2..2ceae3df86e 100644 --- a/src/mongo/db/repl/handshake_args.cpp +++ b/src/mongo/db/repl/handshake_args.cpp @@ -40,75 +40,65 @@ namespace repl { namespace { - const std::string kRIDFieldName = "handshake"; - // TODO(danneberg) remove after 3.0 since this field is only allowed for backwards compatibility - const std::string kOldMemberConfigFieldName = "config"; - const std::string kMemberIdFieldName = "member"; - - const std::string kLegalHandshakeFieldNames[] = { - kRIDFieldName, - kOldMemberConfigFieldName, - kMemberIdFieldName - }; - -} // namespace - - HandshakeArgs::HandshakeArgs() : - _hasRid(false), - _hasMemberId(false), - _rid(OID()), - _memberId(-1) {} - - Status HandshakeArgs::initialize(const BSONObj& argsObj) { - Status status = bsonCheckOnlyHasFields("HandshakeArgs", - argsObj, - kLegalHandshakeFieldNames); - if (!status.isOK()) +const std::string kRIDFieldName = "handshake"; +// TODO(danneberg) remove after 3.0 since this field is only allowed for backwards compatibility +const std::string kOldMemberConfigFieldName = "config"; +const std::string kMemberIdFieldName = "member"; + +const std::string kLegalHandshakeFieldNames[] = { + kRIDFieldName, kOldMemberConfigFieldName, kMemberIdFieldName}; + +} // namespace + +HandshakeArgs::HandshakeArgs() : _hasRid(false), _hasMemberId(false), _rid(OID()), _memberId(-1) {} + +Status HandshakeArgs::initialize(const BSONObj& argsObj) { + Status status = bsonCheckOnlyHasFields("HandshakeArgs", argsObj, kLegalHandshakeFieldNames); + if (!status.isOK()) + return status; + + BSONElement oid; + status = bsonExtractTypedField(argsObj, kRIDFieldName, jstOID, &oid); + if (!status.isOK()) + return status; + _rid = oid.OID(); + _hasRid = true; + + status = bsonExtractIntegerField(argsObj, kMemberIdFieldName, &_memberId); + if (!status.isOK()) { + // field not necessary for master slave, do not return NoSuchKey Error + if (status != ErrorCodes::NoSuchKey) { return status; - - BSONElement oid; - status = bsonExtractTypedField(argsObj, kRIDFieldName, jstOID, &oid); - if (!status.isOK()) - return status; - _rid = oid.OID(); - _hasRid = true; - - status = bsonExtractIntegerField(argsObj, kMemberIdFieldName, &_memberId); - if (!status.isOK()) { - // field not necessary for master slave, do not return NoSuchKey Error - if (status != ErrorCodes::NoSuchKey) { - return status; - } - _memberId = -1; - } - else { - _hasMemberId = true; } - - return Status::OK(); - } - - bool HandshakeArgs::isInitialized() const { - return _hasRid; - } - - void HandshakeArgs::setRid(const OID& newVal) { - _rid = newVal; - _hasRid = true; - } - - void HandshakeArgs::setMemberId(long long newVal) { - _memberId = newVal; + _memberId = -1; + } else { _hasMemberId = true; } - BSONObj HandshakeArgs::toBSON() const { - invariant(isInitialized()); - BSONObjBuilder builder; - builder.append(kRIDFieldName, _rid); - builder.append(kMemberIdFieldName, _memberId); - return builder.obj(); - } + return Status::OK(); +} + +bool HandshakeArgs::isInitialized() const { + return _hasRid; +} + +void HandshakeArgs::setRid(const OID& newVal) { + _rid = newVal; + _hasRid = true; +} + +void HandshakeArgs::setMemberId(long long newVal) { + _memberId = newVal; + _hasMemberId = true; +} + +BSONObj HandshakeArgs::toBSON() const { + invariant(isInitialized()); + BSONObjBuilder builder; + builder.append(kRIDFieldName, _rid); + 
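    // initialize() above is deliberately lenient about the "member" field: a
    // NoSuchKey result is swallowed because master/slave handshakes never send
    // it, so only the rid is mandatory. Round-trip sketch using the field
    // names defined in this file (illustrative):
    //
    //     HandshakeArgs args;
    //     Status s = args.initialize(BSON("handshake" << OID::gen() << "member" << 1));
    //     invariant(s.isOK() && args.isInitialized());
    //     BSONObj wire = args.toBSON();  // {handshake: <rid>, member: 1}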
builder.append(kMemberIdFieldName, _memberId); + return builder.obj(); +} } // namespace repl } // namespace mongo diff --git a/src/mongo/db/repl/handshake_args.h b/src/mongo/db/repl/handshake_args.h index b0d442aaaf6..b83bef87842 100644 --- a/src/mongo/db/repl/handshake_args.h +++ b/src/mongo/db/repl/handshake_args.h @@ -32,64 +32,72 @@ namespace mongo { - class Status; +class Status; namespace repl { +/** + * Arguments to the handshake command. + */ +class HandshakeArgs { +public: + HandshakeArgs(); + + /** + * Initializes this HandshakeArgs from the contents of args. + */ + Status initialize(const BSONObj& argsObj); + + /** + * Returns true if all required fields have been initialized. + */ + bool isInitialized() const; + + /** + * Gets the _id of the sender in their ReplSetConfig. + */ + long long getMemberId() const { + return _memberId; + } + /** - * Arguments to the handshake command. + * Gets the unique identifier of the sender, which is used to track replication progress. */ - class HandshakeArgs { - public: - HandshakeArgs(); - - /** - * Initializes this HandshakeArgs from the contents of args. - */ - Status initialize(const BSONObj& argsObj); - - /** - * Returns true if all required fields have been initialized. - */ - bool isInitialized() const; - - /** - * Gets the _id of the sender in their ReplSetConfig. - */ - long long getMemberId() const { return _memberId; } - - /** - * Gets the unique identifier of the sender, which is used to track replication progress. - */ - OID getRid() const { return _rid; } - - /** - * The below methods check whether or not value in the method name has been set. - */ - bool hasRid() { return _hasRid; }; - bool hasMemberId() { return _hasMemberId; }; - - /** - * The below methods set the value in the method name to 'newVal'. - */ - void setRid(const OID& newVal); - void setMemberId(long long newVal); - - /** - * Returns a BSONified version of the object. - * Should only be called if the mandatory fields have been set. - * Optional fields are only included if they have been set. - */ - BSONObj toBSON() const; - - private: - bool _hasRid; - bool _hasMemberId; - - // look at the body of the isInitialized() function to see which fields are mandatory - OID _rid; - long long _memberId; + OID getRid() const { + return _rid; + } + + /** + * The below methods check whether or not value in the method name has been set. + */ + bool hasRid() { + return _hasRid; }; + bool hasMemberId() { + return _hasMemberId; + }; + + /** + * The below methods set the value in the method name to 'newVal'. + */ + void setRid(const OID& newVal); + void setMemberId(long long newVal); + + /** + * Returns a BSONified version of the object. + * Should only be called if the mandatory fields have been set. + * Optional fields are only included if they have been set. 
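 *
 * Producer-side sketch (illustrative):
 *
 *     HandshakeArgs args;
 *     args.setRid(OID::gen());
 *     args.setMemberId(3);
 *     BSONObj cmd = args.toBSON();  // {handshake: <rid>, member: 3}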
+ */ + BSONObj toBSON() const; + +private: + bool _hasRid; + bool _hasMemberId; + + // look at the body of the isInitialized() function to see which fields are mandatory + OID _rid; + long long _memberId; +}; -} // namespace repl -} // namespace mongo +} // namespace repl +} // namespace mongo diff --git a/src/mongo/db/repl/heartbeat_response_action.cpp b/src/mongo/db/repl/heartbeat_response_action.cpp index 4f26bc2953e..8ea8a1c4819 100644 --- a/src/mongo/db/repl/heartbeat_response_action.cpp +++ b/src/mongo/db/repl/heartbeat_response_action.cpp @@ -33,45 +33,42 @@ namespace mongo { namespace repl { - HeartbeatResponseAction HeartbeatResponseAction::makeNoAction() { - return HeartbeatResponseAction(); - } +HeartbeatResponseAction HeartbeatResponseAction::makeNoAction() { + return HeartbeatResponseAction(); +} - HeartbeatResponseAction HeartbeatResponseAction::makeReconfigAction() { - HeartbeatResponseAction result; - result._action = Reconfig; - return result; - } +HeartbeatResponseAction HeartbeatResponseAction::makeReconfigAction() { + HeartbeatResponseAction result; + result._action = Reconfig; + return result; +} - HeartbeatResponseAction HeartbeatResponseAction::makeElectAction() { - HeartbeatResponseAction result; - result._action = StartElection; - return result; - } +HeartbeatResponseAction HeartbeatResponseAction::makeElectAction() { + HeartbeatResponseAction result; + result._action = StartElection; + return result; +} - HeartbeatResponseAction HeartbeatResponseAction::makeStepDownSelfAction(int primaryIndex) { - HeartbeatResponseAction result; - result._action = StepDownSelf; - result._primaryIndex = primaryIndex; - return result; - } +HeartbeatResponseAction HeartbeatResponseAction::makeStepDownSelfAction(int primaryIndex) { + HeartbeatResponseAction result; + result._action = StepDownSelf; + result._primaryIndex = primaryIndex; + return result; +} - HeartbeatResponseAction HeartbeatResponseAction::makeStepDownRemoteAction(int primaryIndex) { - HeartbeatResponseAction result; - result._action = StepDownRemotePrimary; - result._primaryIndex = primaryIndex; - return result; - } +HeartbeatResponseAction HeartbeatResponseAction::makeStepDownRemoteAction(int primaryIndex) { + HeartbeatResponseAction result; + result._action = StepDownRemotePrimary; + result._primaryIndex = primaryIndex; + return result; +} - HeartbeatResponseAction::HeartbeatResponseAction() : - _action(NoAction), - _primaryIndex(-1), - _nextHeartbeatStartDate(0) { - } +HeartbeatResponseAction::HeartbeatResponseAction() + : _action(NoAction), _primaryIndex(-1), _nextHeartbeatStartDate(0) {} - void HeartbeatResponseAction::setNextHeartbeatStartDate(Date_t when) { - _nextHeartbeatStartDate = when; - } +void HeartbeatResponseAction::setNextHeartbeatStartDate(Date_t when) { + _nextHeartbeatStartDate = when; +} } // namespace repl } // namespace mongo diff --git a/src/mongo/db/repl/heartbeat_response_action.h b/src/mongo/db/repl/heartbeat_response_action.h index 55c2d459920..f45b3668a91 100644 --- a/src/mongo/db/repl/heartbeat_response_action.h +++ b/src/mongo/db/repl/heartbeat_response_action.h @@ -33,88 +33,88 @@ namespace mongo { namespace repl { +/** + * Description of actions taken in response to a heartbeat. + * + * This includes when to schedule the next heartbeat to a target, and any other actions to + * take, such as scheduling an election or stepping down as primary. 
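 *
 * A typical consumer dispatches on getAction(); sketch (stepDown() below is a
 * stand-in for whatever the caller does, not an API in this class):
 *
 *     switch (action.getAction()) {
 *     case HeartbeatResponseAction::NoAction:
 *         break;
 *     case HeartbeatResponseAction::StepDownSelf:
 *         stepDown(action.getPrimaryConfigIndex());  // our own config index
 *         break;
 *     // Reconfig, StartElection and StepDownRemotePrimary are handled
 *     // analogously.
 *     }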
+ */ +class HeartbeatResponseAction { +public: + /** + * Actions taken based on heartbeat responses + */ + enum Action { NoAction, Reconfig, StartElection, StepDownSelf, StepDownRemotePrimary }; + + /** + * Makes a new action representing doing nothing. + */ + static HeartbeatResponseAction makeNoAction(); + + /** + * Makes a new action representing the instruction to reconfigure the current node. + */ + static HeartbeatResponseAction makeReconfigAction(); + + /** + * Makes a new action telling the current node to attempt to elect itself primary. + */ + static HeartbeatResponseAction makeElectAction(); + + /** + * Makes a new action telling the current node to step down as primary. + * + * It is an error to call this with primaryIndex != the index of the current node. + */ + static HeartbeatResponseAction makeStepDownSelfAction(int primaryIndex); + /** - * Description of actions taken in response to a heartbeat. + * Makes a new action telling the current node to ask the specified remote node to step + * down as primary. * - * This includes when to schedule the next heartbeat to a target, and any other actions to - * take, such as scheduling an election or stepping down as primary. + * It is an error to call this with primaryIndex == the index of the current node. + */ + static HeartbeatResponseAction makeStepDownRemoteAction(int primaryIndex); + + /** + * Construct an action with unspecified action and a next heartbeat start date in the + * past. + */ + HeartbeatResponseAction(); + + /** + * Sets the date at which the next heartbeat should be scheduled. + */ + void setNextHeartbeatStartDate(Date_t when); + + /** + * Gets the action type of this action. + */ + Action getAction() const { + return _action; + } + + /** + * Gets the time at which the next heartbeat should be scheduled. If the + * time is not in the future, the next heartbeat should be scheduled immediately. + */ + Date_t getNextHeartbeatStartDate() const { + return _nextHeartbeatStartDate; + } + + /** + * If getAction() returns StepDownSelf or StepDownPrimary, this is the index + * in the current replica set config of the node that ought to step down. */ - class HeartbeatResponseAction { - public: - /** - * Actions taken based on heartbeat responses - */ - enum Action { - NoAction, - Reconfig, - StartElection, - StepDownSelf, - StepDownRemotePrimary - }; - - /** - * Makes a new action representing doing nothing. - */ - static HeartbeatResponseAction makeNoAction(); - - /** - * Makes a new action representing the instruction to reconfigure the current node. - */ - static HeartbeatResponseAction makeReconfigAction(); - - /** - * Makes a new action telling the current node to attempt to elect itself primary. - */ - static HeartbeatResponseAction makeElectAction(); - - /** - * Makes a new action telling the current node to step down as primary. - * - * It is an error to call this with primaryIndex != the index of the current node. - */ - static HeartbeatResponseAction makeStepDownSelfAction(int primaryIndex); - - /** - * Makes a new action telling the current node to ask the specified remote node to step - * down as primary. - * - * It is an error to call this with primaryIndex == the index of the current node. - */ - static HeartbeatResponseAction makeStepDownRemoteAction(int primaryIndex); - - /** - * Construct an action with unspecified action and a next heartbeat start date in the - * past. - */ - HeartbeatResponseAction(); - - /** - * Sets the date at which the next heartbeat should be scheduled. 
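 *
 * A date that is not in the future means "heartbeat immediately" (see
 * getNextHeartbeatStartDate()). Illustrative caller, where "now" and
 * "heartbeatInterval" are stand-ins for the scheduler's clock and configured
 * interval:
 *
 *     action.setNextHeartbeatStartDate(now + heartbeatInterval);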
- */ - void setNextHeartbeatStartDate(Date_t when); - - /** - * Gets the action type of this action. - */ - Action getAction() const { return _action; } - - /** - * Gets the time at which the next heartbeat should be scheduled. If the - * time is not in the future, the next heartbeat should be scheduled immediately. - */ - Date_t getNextHeartbeatStartDate() const { return _nextHeartbeatStartDate; } - - /** - * If getAction() returns StepDownSelf or StepDownPrimary, this is the index - * in the current replica set config of the node that ought to step down. - */ - int getPrimaryConfigIndex() const { return _primaryIndex; } - - private: - Action _action; - int _primaryIndex; - Date_t _nextHeartbeatStartDate; - }; + int getPrimaryConfigIndex() const { + return _primaryIndex; + } + +private: + Action _action; + int _primaryIndex; + Date_t _nextHeartbeatStartDate; +}; } // namespace repl } // namespace mongo diff --git a/src/mongo/db/repl/initial_sync.cpp b/src/mongo/db/repl/initial_sync.cpp index 61729f92139..91978f9a34d 100644 --- a/src/mongo/db/repl/initial_sync.cpp +++ b/src/mongo/db/repl/initial_sync.cpp @@ -41,22 +41,21 @@ namespace mongo { namespace repl { - InitialSync::InitialSync(BackgroundSyncInterface *q) : - SyncTail(q, multiInitialSyncApply) {} - - InitialSync::~InitialSync() {} - - /* initial oplog application, during initial sync, after cloning. - */ - void InitialSync::oplogApplication(OperationContext* txn, const OpTime& endOpTime) { - if (replSetForceInitialSyncFailure > 0) { - log() << "replSet test code invoked, forced InitialSync failure: " - << replSetForceInitialSyncFailure; - replSetForceInitialSyncFailure--; - throw DBException("forced error",0); - } - _applyOplogUntil(txn, endOpTime); +InitialSync::InitialSync(BackgroundSyncInterface* q) : SyncTail(q, multiInitialSyncApply) {} + +InitialSync::~InitialSync() {} + +/* initial oplog application, during initial sync, after cloning. +*/ +void InitialSync::oplogApplication(OperationContext* txn, const OpTime& endOpTime) { + if (replSetForceInitialSyncFailure > 0) { + log() << "replSet test code invoked, forced InitialSync failure: " + << replSetForceInitialSyncFailure; + replSetForceInitialSyncFailure--; + throw DBException("forced error", 0); } + _applyOplogUntil(txn, endOpTime); +} -} // namespace repl -} // namespace mongo +} // namespace repl +} // namespace mongo diff --git a/src/mongo/db/repl/initial_sync.h b/src/mongo/db/repl/initial_sync.h index 0840a9a261a..3672ceb82e9 100644 --- a/src/mongo/db/repl/initial_sync.h +++ b/src/mongo/db/repl/initial_sync.h @@ -33,21 +33,21 @@ namespace mongo { namespace repl { - class BackgroundSyncInterface; +class BackgroundSyncInterface; + +/** + * Initial clone and sync + */ +class InitialSync : public SyncTail { +public: + virtual ~InitialSync(); + InitialSync(BackgroundSyncInterface* q); /** - * Initial clone and sync + * applies up to endOpTime, fetching missing documents as needed. */ - class InitialSync : public SyncTail { - public: - virtual ~InitialSync(); - InitialSync(BackgroundSyncInterface *q); - - /** - * applies up to endOpTime, fetching missing documents as needed. 
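 *
 * Illustrative call sequence during initial sync, after data cloning
 * ("bgsyncQueue" is a stand-in name for the BackgroundSyncInterface in use):
 *
 *     InitialSync initialSync(bgsyncQueue);
 *     initialSync.oplogApplication(txn, endOpTime);
 *
 * Note the test-only counter replSetForceInitialSyncFailure (see the .cpp
 * above), which makes this throw a DBException a configured number of times.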
- */ - void oplogApplication(OperationContext* txn, const OpTime& endOpTime); - }; + void oplogApplication(OperationContext* txn, const OpTime& endOpTime); +}; -} // namespace repl -} // namespace mongo +} // namespace repl +} // namespace mongo diff --git a/src/mongo/db/repl/is_master_response.cpp b/src/mongo/db/repl/is_master_response.cpp index a789fd7b6dd..392ba515e42 100644 --- a/src/mongo/db/repl/is_master_response.cpp +++ b/src/mongo/db/repl/is_master_response.cpp @@ -42,415 +42,417 @@ namespace mongo { namespace repl { namespace { - const std::string kIsMasterFieldName = "ismaster"; - const std::string kSecondaryFieldName = "secondary"; - const std::string kSetNameFieldName = "setName"; - const std::string kSetVersionFieldName = "setVersion"; - const std::string kHostsFieldName = "hosts"; - const std::string kPassivesFieldName = "passives"; - const std::string kArbitersFieldName = "arbiters"; - const std::string kPrimaryFieldName = "primary"; - const std::string kArbiterOnlyFieldName = "arbiterOnly"; - const std::string kPassiveFieldName = "passive"; - const std::string kHiddenFieldName = "hidden"; - const std::string kBuildIndexesFieldName = "buildIndexes"; - const std::string kSlaveDelayFieldName = "slaveDelay"; - const std::string kTagsFieldName = "tags"; - const std::string kMeFieldName = "me"; - const std::string kElectionIdFieldName = "electionId"; - - // field name constants that don't directly correspond to member variables - const std::string kInfoFieldName = "info"; - const std::string kIsReplicaSetFieldName = "isreplicaset"; - const std::string kErrmsgFieldName = "errmsg"; - const std::string kCodeFieldName = "code"; +const std::string kIsMasterFieldName = "ismaster"; +const std::string kSecondaryFieldName = "secondary"; +const std::string kSetNameFieldName = "setName"; +const std::string kSetVersionFieldName = "setVersion"; +const std::string kHostsFieldName = "hosts"; +const std::string kPassivesFieldName = "passives"; +const std::string kArbitersFieldName = "arbiters"; +const std::string kPrimaryFieldName = "primary"; +const std::string kArbiterOnlyFieldName = "arbiterOnly"; +const std::string kPassiveFieldName = "passive"; +const std::string kHiddenFieldName = "hidden"; +const std::string kBuildIndexesFieldName = "buildIndexes"; +const std::string kSlaveDelayFieldName = "slaveDelay"; +const std::string kTagsFieldName = "tags"; +const std::string kMeFieldName = "me"; +const std::string kElectionIdFieldName = "electionId"; + +// field name constants that don't directly correspond to member variables +const std::string kInfoFieldName = "info"; +const std::string kIsReplicaSetFieldName = "isreplicaset"; +const std::string kErrmsgFieldName = "errmsg"; +const std::string kCodeFieldName = "code"; } // namespace - IsMasterResponse::IsMasterResponse() : - _isMaster(false), - _isMasterSet(false), - _secondary(false), - _isSecondarySet(false), - _setNameSet(false), - _setVersion(0), - _setVersionSet(false), - _hostsSet(false), - _passivesSet(false), - _arbitersSet(false), - _primarySet(false), - _arbiterOnly(false), - _arbiterOnlySet(false), - _passive(false), - _passiveSet(false), - _hidden(false), - _hiddenSet(false), - _buildIndexes(true), - _buildIndexesSet(false), - _slaveDelay(0), - _slaveDelaySet(false), - _tagsSet(false), - _meSet(false), - _electionId(OID()), - _configSet(true), - _shutdownInProgress(false) - {} - - void IsMasterResponse::addToBSON(BSONObjBuilder* builder) const { - if (_shutdownInProgress) { - builder->append(kCodeFieldName, 
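    // addToBSON() short-circuits before emitting any replica-set fields in two
    // cases: shutdown in progress (code + errmsg only, as here) and no valid
    // config (ismaster:false, secondary:false, info, isreplicaset:true).
    // Illustrative sketch of the first case:
    //
    //     IsMasterResponse resp;
    //     resp.markAsShutdownInProgress();
    //     BSONObj doc = resp.toBSON();  // {code: ShutdownInProgress, errmsg: "..."}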
ErrorCodes::ShutdownInProgress); - builder->append(kErrmsgFieldName, "replication shutdown in progress"); - return; - } +IsMasterResponse::IsMasterResponse() + : _isMaster(false), + _isMasterSet(false), + _secondary(false), + _isSecondarySet(false), + _setNameSet(false), + _setVersion(0), + _setVersionSet(false), + _hostsSet(false), + _passivesSet(false), + _arbitersSet(false), + _primarySet(false), + _arbiterOnly(false), + _arbiterOnlySet(false), + _passive(false), + _passiveSet(false), + _hidden(false), + _hiddenSet(false), + _buildIndexes(true), + _buildIndexesSet(false), + _slaveDelay(0), + _slaveDelaySet(false), + _tagsSet(false), + _meSet(false), + _electionId(OID()), + _configSet(true), + _shutdownInProgress(false) {} + +void IsMasterResponse::addToBSON(BSONObjBuilder* builder) const { + if (_shutdownInProgress) { + builder->append(kCodeFieldName, ErrorCodes::ShutdownInProgress); + builder->append(kErrmsgFieldName, "replication shutdown in progress"); + return; + } - if (!_configSet) { - builder->append(kIsMasterFieldName, false); - builder->append(kSecondaryFieldName, false); - builder->append(kInfoFieldName, "Does not have a valid replica set config"); - builder->append(kIsReplicaSetFieldName , true); - return; - } + if (!_configSet) { + builder->append(kIsMasterFieldName, false); + builder->append(kSecondaryFieldName, false); + builder->append(kInfoFieldName, "Does not have a valid replica set config"); + builder->append(kIsReplicaSetFieldName, true); + return; + } - invariant(_setNameSet); - builder->append(kSetNameFieldName, _setName); - invariant(_setVersionSet); - builder->append(kSetVersionFieldName, static_cast<int>(_setVersion)); - invariant(_isMasterSet); - builder->append(kIsMasterFieldName, _isMaster); - invariant(_isSecondarySet); - builder->append(kSecondaryFieldName, _secondary); - - if (_hostsSet) { - std::vector<std::string> hosts; - for (size_t i = 0; i < _hosts.size(); ++i) { - hosts.push_back(_hosts[i].toString()); - } - builder->append(kHostsFieldName, hosts); + invariant(_setNameSet); + builder->append(kSetNameFieldName, _setName); + invariant(_setVersionSet); + builder->append(kSetVersionFieldName, static_cast<int>(_setVersion)); + invariant(_isMasterSet); + builder->append(kIsMasterFieldName, _isMaster); + invariant(_isSecondarySet); + builder->append(kSecondaryFieldName, _secondary); + + if (_hostsSet) { + std::vector<std::string> hosts; + for (size_t i = 0; i < _hosts.size(); ++i) { + hosts.push_back(_hosts[i].toString()); } - if (_passivesSet) { - std::vector<std::string> passives; - for (size_t i = 0; i < _passives.size(); ++i) { - passives.push_back(_passives[i].toString()); - } - builder->append(kPassivesFieldName, passives); + builder->append(kHostsFieldName, hosts); + } + if (_passivesSet) { + std::vector<std::string> passives; + for (size_t i = 0; i < _passives.size(); ++i) { + passives.push_back(_passives[i].toString()); } - if (_arbitersSet) { - std::vector<std::string> arbiters; - for (size_t i = 0; i < _arbiters.size(); ++i) { - arbiters.push_back(_arbiters[i].toString()); - } - builder->append(kArbitersFieldName, arbiters); + builder->append(kPassivesFieldName, passives); + } + if (_arbitersSet) { + std::vector<std::string> arbiters; + for (size_t i = 0; i < _arbiters.size(); ++i) { + arbiters.push_back(_arbiters[i].toString()); } - if (_primarySet) - builder->append(kPrimaryFieldName, _primary.toString()); - if (_arbiterOnlySet) - builder->append(kArbiterOnlyFieldName, _arbiterOnly); - if (_passiveSet) - builder->append(kPassiveFieldName, 
_passive); - if (_hiddenSet) - builder->append(kHiddenFieldName, _hidden); - if (_buildIndexesSet) - builder->append(kBuildIndexesFieldName, _buildIndexes); - if (_slaveDelaySet) - builder->append(kSlaveDelayFieldName, _slaveDelay.total_seconds()); - if (_tagsSet) { - BSONObjBuilder tags(builder->subobjStart(kTagsFieldName)); - for (unordered_map<std::string, std::string>::const_iterator it = _tags.begin(); - it != _tags.end(); ++it) { - tags.append(it->first, it->second); - } + builder->append(kArbitersFieldName, arbiters); + } + if (_primarySet) + builder->append(kPrimaryFieldName, _primary.toString()); + if (_arbiterOnlySet) + builder->append(kArbiterOnlyFieldName, _arbiterOnly); + if (_passiveSet) + builder->append(kPassiveFieldName, _passive); + if (_hiddenSet) + builder->append(kHiddenFieldName, _hidden); + if (_buildIndexesSet) + builder->append(kBuildIndexesFieldName, _buildIndexes); + if (_slaveDelaySet) + builder->append(kSlaveDelayFieldName, _slaveDelay.total_seconds()); + if (_tagsSet) { + BSONObjBuilder tags(builder->subobjStart(kTagsFieldName)); + for (unordered_map<std::string, std::string>::const_iterator it = _tags.begin(); + it != _tags.end(); + ++it) { + tags.append(it->first, it->second); + } + } + invariant(_meSet); + builder->append(kMeFieldName, _me.toString()); + if (_electionId.isSet()) + builder->append(kElectionIdFieldName, _electionId); +} + +BSONObj IsMasterResponse::toBSON() const { + BSONObjBuilder builder; + addToBSON(&builder); + return builder.obj(); +} + +Status IsMasterResponse::initialize(const BSONObj& doc) { + Status status = bsonExtractBooleanField(doc, kIsMasterFieldName, &_isMaster); + if (!status.isOK()) { + return status; + } + _isMasterSet = true; + status = bsonExtractBooleanField(doc, kSecondaryFieldName, &_secondary); + if (!status.isOK()) { + return status; + } + _isSecondarySet = true; + if (doc.hasField(kInfoFieldName)) { + if (_isMaster || _secondary || !doc.hasField(kIsReplicaSetFieldName) || + !doc[kIsReplicaSetFieldName].booleanSafe()) { + return Status(ErrorCodes::FailedToParse, + str::stream() << "Expected presence of \"" << kInfoFieldName + << "\" field to indicate no valid config loaded, but other " + "fields weren't as we expected"); + } + _configSet = false; + return Status::OK(); + } else { + if (doc.hasField(kIsReplicaSetFieldName)) { + return Status(ErrorCodes::FailedToParse, + str::stream() << "Found \"" << kIsReplicaSetFieldName + << "\" field which should indicate that no valid config " + "is loaded, but we didn't also have an \"" + << kInfoFieldName << "\" field as we expected"); } - invariant(_meSet); - builder->append(kMeFieldName, _me.toString()); - if (_electionId.isSet()) - builder->append(kElectionIdFieldName, _electionId); } - BSONObj IsMasterResponse::toBSON() const { - BSONObjBuilder builder; - addToBSON(&builder); - return builder.obj(); + status = bsonExtractStringField(doc, kSetNameFieldName, &_setName); + if (!status.isOK()) { + return status; + } + _setNameSet = true; + status = bsonExtractIntegerField(doc, kSetVersionFieldName, &_setVersion); + if (!status.isOK()) { + return status; } + _setVersionSet = true; - Status IsMasterResponse::initialize(const BSONObj& doc) { - Status status = bsonExtractBooleanField(doc, kIsMasterFieldName, &_isMaster); - if (!status.isOK()) { - return status; - } - _isMasterSet = true; - status = bsonExtractBooleanField(doc, kSecondaryFieldName, &_secondary); + if (doc.hasField(kHostsFieldName)) { + BSONElement hostsElement; + status = bsonExtractTypedField(doc, kHostsFieldName, 
Array, &hostsElement); if (!status.isOK()) { return status; } - _isSecondarySet = true; - if (doc.hasField(kInfoFieldName)) { - if (_isMaster || - _secondary || - !doc.hasField(kIsReplicaSetFieldName) || - !doc[kIsReplicaSetFieldName].booleanSafe()) { - return Status(ErrorCodes::FailedToParse, - str::stream() << "Expected presence of \"" << kInfoFieldName << - "\" field to indicate no valid config loaded, but other " - "fields weren't as we expected"); - } - _configSet = false; - return Status::OK(); - } - else { - if (doc.hasField(kIsReplicaSetFieldName)) { - return Status(ErrorCodes::FailedToParse, - str::stream() << "Found \"" << kIsReplicaSetFieldName << - "\" field which should indicate that no valid config " - "is loaded, but we didn't also have an \"" << - kInfoFieldName << "\" field as we expected"); + for (BSONObjIterator it(hostsElement.Obj()); it.more();) { + BSONElement hostElement = it.next(); + if (hostElement.type() != String) { + return Status(ErrorCodes::TypeMismatch, + str::stream() << "Elements in \"" << kHostsFieldName + << "\" array of isMaster response must be of type " + << typeName(String) << " but found type " + << typeName(hostElement.type())); } + _hosts.push_back(HostAndPort(hostElement.String())); } + _hostsSet = true; + } - status = bsonExtractStringField(doc, kSetNameFieldName, &_setName); - if (!status.isOK()) { - return status; - } - _setNameSet = true; - status = bsonExtractIntegerField(doc, kSetVersionFieldName, &_setVersion); + if (doc.hasField(kPassivesFieldName)) { + BSONElement passivesElement; + status = bsonExtractTypedField(doc, kPassivesFieldName, Array, &passivesElement); if (!status.isOK()) { return status; } - _setVersionSet = true; - - if (doc.hasField(kHostsFieldName)) { - BSONElement hostsElement; - status = bsonExtractTypedField(doc, kHostsFieldName, Array, &hostsElement); - if (!status.isOK()) { - return status; - } - for (BSONObjIterator it(hostsElement.Obj()); it.more();) { - BSONElement hostElement = it.next(); - if (hostElement.type() != String) { - return Status(ErrorCodes::TypeMismatch, - str::stream() << "Elements in \"" << kHostsFieldName << - "\" array of isMaster response must be of type " << - typeName(String) << " but found type " << - typeName(hostElement.type())); - } - _hosts.push_back(HostAndPort(hostElement.String())); - } - _hostsSet = true; - } - - if (doc.hasField(kPassivesFieldName)) { - BSONElement passivesElement; - status = bsonExtractTypedField(doc, kPassivesFieldName, Array, &passivesElement); - if (!status.isOK()) { - return status; - } - for (BSONObjIterator it(passivesElement.Obj()); it.more();) { - BSONElement passiveElement = it.next(); - if (passiveElement.type() != String) { - return Status(ErrorCodes::TypeMismatch, - str::stream() << "Elements in \"" << kPassivesFieldName << - "\" array of isMaster response must be of type " << - typeName(String) << " but found type " << - typeName(passiveElement.type())); - } - _passives.push_back(HostAndPort(passiveElement.String())); - } - _passivesSet = true; - } - - if (doc.hasField(kArbitersFieldName)) { - BSONElement arbitersElement; - status = bsonExtractTypedField(doc, kArbitersFieldName, Array, &arbitersElement); - if (!status.isOK()) { - return status; - } - for (BSONObjIterator it(arbitersElement.Obj()); it.more();) { - BSONElement arbiterElement = it.next(); - if (arbiterElement.type() != String) { - return Status(ErrorCodes::TypeMismatch, - str::stream() << "Elements in \"" << kArbitersFieldName << - "\" array of isMaster response must be of type " << - 
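    // hosts, passives and arbiters all repeat one defensive pattern: extract
    // the field as an Array, then require every element to be a String before
    // building HostAndPorts. A helper one could factor out (illustrative only;
    // the tree keeps the three loops inline):
    //
    //     static Status extractHostArray(const BSONObj& doc,
    //                                    const std::string& fieldName,
    //                                    std::vector<HostAndPort>* out) {
    //         BSONElement arr;
    //         Status status = bsonExtractTypedField(doc, fieldName, Array, &arr);
    //         if (!status.isOK())
    //             return status;
    //         for (BSONObjIterator it(arr.Obj()); it.more();) {
    //             BSONElement e = it.next();
    //             if (e.type() != String)
    //                 return Status(ErrorCodes::TypeMismatch, "non-string host entry");
    //             out->push_back(HostAndPort(e.String()));
    //         }
    //         return Status::OK();
    //     }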
typeName(String) << " but found type " << - typeName(arbiterElement.type())); - } - _arbiters.push_back(HostAndPort(arbiterElement.String())); - } - _arbitersSet = true; - } - - if (doc.hasField(kPrimaryFieldName)) { - std::string primaryString; - status = bsonExtractStringField(doc, kPrimaryFieldName, &primaryString); - if (!status.isOK()) { - return status; - } - _primary = HostAndPort(primaryString); - _primarySet = true; - } - - if (doc.hasField(kArbiterOnlyFieldName)) { - status = bsonExtractBooleanField(doc, kArbiterOnlyFieldName, &_arbiterOnly); - if (!status.isOK()) { - return status; - } - _arbiterOnlySet = true; - } - - if (doc.hasField(kPassiveFieldName)) { - status = bsonExtractBooleanField(doc, kPassiveFieldName, &_passive); - if (!status.isOK()) { - return status; - } - _passiveSet = true; - } - - if (doc.hasField(kHiddenFieldName)) { - status = bsonExtractBooleanField(doc, kHiddenFieldName, &_hidden); - if (!status.isOK()) { - return status; - } - _hiddenSet = true; - } - - if (doc.hasField(kBuildIndexesFieldName)) { - status = bsonExtractBooleanField(doc, kBuildIndexesFieldName, &_buildIndexes); - if (!status.isOK()) { - return status; - } - _buildIndexesSet = true; - } - - if (doc.hasField(kSlaveDelayFieldName)) { - long long slaveDelaySecs; - status = bsonExtractIntegerField(doc, kSlaveDelayFieldName, &slaveDelaySecs); - if (!status.isOK()) { - return status; + for (BSONObjIterator it(passivesElement.Obj()); it.more();) { + BSONElement passiveElement = it.next(); + if (passiveElement.type() != String) { + return Status(ErrorCodes::TypeMismatch, + str::stream() << "Elements in \"" << kPassivesFieldName + << "\" array of isMaster response must be of type " + << typeName(String) << " but found type " + << typeName(passiveElement.type())); } - _slaveDelaySet = true; - _slaveDelay = Seconds(slaveDelaySecs); + _passives.push_back(HostAndPort(passiveElement.String())); } + _passivesSet = true; + } - if (doc.hasField(kTagsFieldName)) { - BSONElement tagsElement; - status = bsonExtractTypedField(doc, kTagsFieldName, Object, &tagsElement); - if (!status.isOK()) { - return status; - } - for (BSONObjIterator it(tagsElement.Obj()); it.more();) { - BSONElement tagElement = it.next(); - if (tagElement.type() != String) { - return Status(ErrorCodes::TypeMismatch, - str::stream() << "Elements in \"" << kTagsFieldName << "\" obj " - "of isMaster response must be of type " << - typeName(String) << " but found type " << - typeName(tagsElement.type())); - } - _tags[tagElement.fieldNameStringData().toString()] = tagElement.String(); - } - _tagsSet = true; + if (doc.hasField(kArbitersFieldName)) { + BSONElement arbitersElement; + status = bsonExtractTypedField(doc, kArbitersFieldName, Array, &arbitersElement); + if (!status.isOK()) { + return status; } - - if (doc.hasField(kElectionIdFieldName)) { - BSONElement electionIdElem; - status = bsonExtractTypedField(doc, kElectionIdFieldName, jstOID, &electionIdElem); - if (!status.isOK()) { - return status; + for (BSONObjIterator it(arbitersElement.Obj()); it.more();) { + BSONElement arbiterElement = it.next(); + if (arbiterElement.type() != String) { + return Status(ErrorCodes::TypeMismatch, + str::stream() << "Elements in \"" << kArbitersFieldName + << "\" array of isMaster response must be of type " + << typeName(String) << " but found type " + << typeName(arbiterElement.type())); } - _electionId = electionIdElem.OID(); + _arbiters.push_back(HostAndPort(arbiterElement.String())); } + _arbitersSet = true; + } - std::string meString; - status = 
bsonExtractStringField(doc, kMeFieldName, &meString); + if (doc.hasField(kPrimaryFieldName)) { + std::string primaryString; + status = bsonExtractStringField(doc, kPrimaryFieldName, &primaryString); if (!status.isOK()) { return status; } - _me = HostAndPort(meString); - _meSet = true; - - return Status::OK(); - } - - void IsMasterResponse::setIsMaster(bool isMaster) { - _isMasterSet = true; - _isMaster = isMaster; - } - - void IsMasterResponse::setIsSecondary(bool secondary) { - _isSecondarySet = true; - _secondary = secondary; - } - - void IsMasterResponse::setReplSetName(const std::string& setName) { - _setNameSet = true; - _setName = setName; - } - - void IsMasterResponse::setReplSetVersion(long long version) { - _setVersionSet = true; - _setVersion = version; - } - - void IsMasterResponse::addHost(const HostAndPort& host) { - _hostsSet = true; - _hosts.push_back(host); - } - - void IsMasterResponse::addPassive(const HostAndPort& passive) { - _passivesSet = true; - _passives.push_back(passive); - } - - void IsMasterResponse::addArbiter(const HostAndPort& arbiter) { - _arbitersSet = true; - _arbiters.push_back(arbiter); - } - - void IsMasterResponse::setPrimary(const HostAndPort& primary) { + _primary = HostAndPort(primaryString); _primarySet = true; - _primary = primary; } - void IsMasterResponse::setIsArbiterOnly(bool arbiterOnly) { + if (doc.hasField(kArbiterOnlyFieldName)) { + status = bsonExtractBooleanField(doc, kArbiterOnlyFieldName, &_arbiterOnly); + if (!status.isOK()) { + return status; + } _arbiterOnlySet = true; - _arbiterOnly = arbiterOnly; } - void IsMasterResponse::setIsPassive(bool passive) { + if (doc.hasField(kPassiveFieldName)) { + status = bsonExtractBooleanField(doc, kPassiveFieldName, &_passive); + if (!status.isOK()) { + return status; + } _passiveSet = true; - _passive = passive; } - void IsMasterResponse::setIsHidden(bool hidden) { + if (doc.hasField(kHiddenFieldName)) { + status = bsonExtractBooleanField(doc, kHiddenFieldName, &_hidden); + if (!status.isOK()) { + return status; + } _hiddenSet = true; - _hidden = hidden; } - void IsMasterResponse::setShouldBuildIndexes(bool buildIndexes) { + if (doc.hasField(kBuildIndexesFieldName)) { + status = bsonExtractBooleanField(doc, kBuildIndexesFieldName, &_buildIndexes); + if (!status.isOK()) { + return status; + } _buildIndexesSet = true; - _buildIndexes = buildIndexes; } - void IsMasterResponse::setSlaveDelay(Seconds slaveDelay) { + if (doc.hasField(kSlaveDelayFieldName)) { + long long slaveDelaySecs; + status = bsonExtractIntegerField(doc, kSlaveDelayFieldName, &slaveDelaySecs); + if (!status.isOK()) { + return status; + } _slaveDelaySet = true; - _slaveDelay = slaveDelay; + _slaveDelay = Seconds(slaveDelaySecs); } - void IsMasterResponse::addTag(const std::string& tagKey, const std::string& tagValue) { + if (doc.hasField(kTagsFieldName)) { + BSONElement tagsElement; + status = bsonExtractTypedField(doc, kTagsFieldName, Object, &tagsElement); + if (!status.isOK()) { + return status; + } + for (BSONObjIterator it(tagsElement.Obj()); it.more();) { + BSONElement tagElement = it.next(); + if (tagElement.type() != String) { + return Status(ErrorCodes::TypeMismatch, + str::stream() << "Elements in \"" << kTagsFieldName + << "\" obj " + "of isMaster response must be of type " + << typeName(String) << " but found type " + << typeName(tagsElement.type())); + } + _tags[tagElement.fieldNameStringData().toString()] = tagElement.String(); + } _tagsSet = true; - _tags[tagKey] = tagValue; } - void IsMasterResponse::setMe(const 
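// Every setter pairs the new value with a _fooSet flag so that addToBSON()
// emits only fields that were set explicitly (and can invariant() on the
// mandatory ones). Minimal producer-side sketch (illustrative):
//
//     IsMasterResponse resp;
//     resp.setReplSetName("rs0");
//     resp.setReplSetVersion(1);
//     resp.setIsMaster(true);
//     resp.setIsSecondary(false);
//     resp.setMe(HostAndPort("host0:27017"));
//     BSONObj doc = resp.toBSON();  // satisfies the invariants in addToBSON()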
HostAndPort& me) { - _meSet = true; - _me = me; + if (doc.hasField(kElectionIdFieldName)) { + BSONElement electionIdElem; + status = bsonExtractTypedField(doc, kElectionIdFieldName, jstOID, &electionIdElem); + if (!status.isOK()) { + return status; + } + _electionId = electionIdElem.OID(); } - void IsMasterResponse::setElectionId(const OID& electionId) { - _electionId = electionId; + std::string meString; + status = bsonExtractStringField(doc, kMeFieldName, &meString); + if (!status.isOK()) { + return status; } - - void IsMasterResponse::markAsNoConfig() { _configSet = false; } - - void IsMasterResponse::markAsShutdownInProgress() { _shutdownInProgress = true; } - -} // namespace repl -} // namespace mongo + _me = HostAndPort(meString); + _meSet = true; + + return Status::OK(); +} + +void IsMasterResponse::setIsMaster(bool isMaster) { + _isMasterSet = true; + _isMaster = isMaster; +} + +void IsMasterResponse::setIsSecondary(bool secondary) { + _isSecondarySet = true; + _secondary = secondary; +} + +void IsMasterResponse::setReplSetName(const std::string& setName) { + _setNameSet = true; + _setName = setName; +} + +void IsMasterResponse::setReplSetVersion(long long version) { + _setVersionSet = true; + _setVersion = version; +} + +void IsMasterResponse::addHost(const HostAndPort& host) { + _hostsSet = true; + _hosts.push_back(host); +} + +void IsMasterResponse::addPassive(const HostAndPort& passive) { + _passivesSet = true; + _passives.push_back(passive); +} + +void IsMasterResponse::addArbiter(const HostAndPort& arbiter) { + _arbitersSet = true; + _arbiters.push_back(arbiter); +} + +void IsMasterResponse::setPrimary(const HostAndPort& primary) { + _primarySet = true; + _primary = primary; +} + +void IsMasterResponse::setIsArbiterOnly(bool arbiterOnly) { + _arbiterOnlySet = true; + _arbiterOnly = arbiterOnly; +} + +void IsMasterResponse::setIsPassive(bool passive) { + _passiveSet = true; + _passive = passive; +} + +void IsMasterResponse::setIsHidden(bool hidden) { + _hiddenSet = true; + _hidden = hidden; +} + +void IsMasterResponse::setShouldBuildIndexes(bool buildIndexes) { + _buildIndexesSet = true; + _buildIndexes = buildIndexes; +} + +void IsMasterResponse::setSlaveDelay(Seconds slaveDelay) { + _slaveDelaySet = true; + _slaveDelay = slaveDelay; +} + +void IsMasterResponse::addTag(const std::string& tagKey, const std::string& tagValue) { + _tagsSet = true; + _tags[tagKey] = tagValue; +} + +void IsMasterResponse::setMe(const HostAndPort& me) { + _meSet = true; + _me = me; +} + +void IsMasterResponse::setElectionId(const OID& electionId) { + _electionId = electionId; +} + +void IsMasterResponse::markAsNoConfig() { + _configSet = false; +} + +void IsMasterResponse::markAsShutdownInProgress() { + _shutdownInProgress = true; +} + +} // namespace repl +} // namespace mongo diff --git a/src/mongo/db/repl/is_master_response.h b/src/mongo/db/repl/is_master_response.h index dd0eda70e2b..7bfaf1ac0b2 100644 --- a/src/mongo/db/repl/is_master_response.h +++ b/src/mongo/db/repl/is_master_response.h @@ -38,179 +38,216 @@ namespace mongo { - class BSONObj; - class BSONObjBuilder; - class Status; +class BSONObj; +class BSONObjBuilder; +class Status; namespace repl { +/** + * Response structure for the ismaster command. Only handles responses from nodes + * that are in replset mode. + */ +class IsMasterResponse { +public: + IsMasterResponse(); + /** - * Response structure for the ismaster command. Only handles responses from nodes - * that are in replset mode. 
+ * Initializes this IsMasterResponse from the contents of "doc". */ - class IsMasterResponse { - public: - IsMasterResponse(); - - /** - * Initializes this IsMasterResponse from the contents of "doc". - */ - Status initialize(const BSONObj& doc); - - /** - * Appends all non-default values to "builder". - * There are two values that are handled specially: if _inShutdown is true or _configSet - * is false, we will add a standard response to "builder" indicating either that we are - * in the middle of shutting down or we do not have a valid replica set config, ignoring - * the values of all other member variables. - */ - void addToBSON(BSONObjBuilder* builder) const; + Status initialize(const BSONObj& doc); - /** - * Returns a BSONObj consisting the results of calling addToBSON on an otherwise empty - * BSONObjBuilder. - */ - BSONObj toBSON() const; - - - // ===================== Accessors for member variables ================================= // - - bool isMaster() const { return _isMaster; } + /** + * Appends all non-default values to "builder". + * There are two values that are handled specially: if _inShutdown is true or _configSet + * is false, we will add a standard response to "builder" indicating either that we are + * in the middle of shutting down or we do not have a valid replica set config, ignoring + * the values of all other member variables. + */ + void addToBSON(BSONObjBuilder* builder) const; - bool isSecondary() const { return _secondary; } + /** + * Returns a BSONObj consisting the results of calling addToBSON on an otherwise empty + * BSONObjBuilder. + */ + BSONObj toBSON() const; - const std::string& getReplSetName() const { return _setName; } - long long getReplSetVersion() const { return _setVersion; } + // ===================== Accessors for member variables ================================= // - const std::vector<HostAndPort>& getHosts() const { return _hosts; } + bool isMaster() const { + return _isMaster; + } - const std::vector<HostAndPort>& getPassives() const { return _passives; } + bool isSecondary() const { + return _secondary; + } - const std::vector<HostAndPort>& getArbiters() const { return _arbiters; } + const std::string& getReplSetName() const { + return _setName; + } - const HostAndPort& getPrimary() const { return _primary; } + long long getReplSetVersion() const { + return _setVersion; + } - bool hasPrimary() const { return _primarySet; } + const std::vector<HostAndPort>& getHosts() const { + return _hosts; + } - bool isArbiterOnly() const { return _arbiterOnly; } + const std::vector<HostAndPort>& getPassives() const { + return _passives; + } - bool isPassive() const { return _passive; } + const std::vector<HostAndPort>& getArbiters() const { + return _arbiters; + } - bool isHidden() const { return _hidden; } + const HostAndPort& getPrimary() const { + return _primary; + } - bool shouldBuildIndexes() const { return _buildIndexes; } + bool hasPrimary() const { + return _primarySet; + } - Seconds getSlaveDelay() const { return _slaveDelay; } + bool isArbiterOnly() const { + return _arbiterOnly; + } - const unordered_map<std::string, std::string> getTags() const { return _tags; } + bool isPassive() const { + return _passive; + } - const HostAndPort& getMe() const { return _me; } + bool isHidden() const { + return _hidden; + } - const OID& getElectionId() const { return _electionId; } + bool shouldBuildIndexes() const { + return _buildIndexes; + } - /** - * If false, calls to toBSON/addToBSON will ignore all other fields and add a specific - * message to 
indicate that we have no replica set config. - */ - bool isConfigSet() const { return _configSet; } + Seconds getSlaveDelay() const { + return _slaveDelay; + } - /** - * If false, calls to toBSON/addToBSON will ignore all other fields and add a specific - * message to indicate that we are in the middle of shutting down. - */ - bool isShutdownInProgress() const { return _shutdownInProgress; } + const unordered_map<std::string, std::string> getTags() const { + return _tags; + } + const HostAndPort& getMe() const { + return _me; + } - // ===================== Mutators for member variables ================================= // + const OID& getElectionId() const { + return _electionId; + } - void setIsMaster(bool isMaster); + /** + * If false, calls to toBSON/addToBSON will ignore all other fields and add a specific + * message to indicate that we have no replica set config. + */ + bool isConfigSet() const { + return _configSet; + } - void setIsSecondary(bool secondary); + /** + * If false, calls to toBSON/addToBSON will ignore all other fields and add a specific + * message to indicate that we are in the middle of shutting down. + */ + bool isShutdownInProgress() const { + return _shutdownInProgress; + } - void setReplSetName(const std::string& setName); - void setReplSetVersion(long long version); + // ===================== Mutators for member variables ================================= // - void addHost(const HostAndPort& host); + void setIsMaster(bool isMaster); - void addPassive(const HostAndPort& passive); + void setIsSecondary(bool secondary); - void addArbiter(const HostAndPort& arbiter); + void setReplSetName(const std::string& setName); - void setPrimary(const HostAndPort& primary); + void setReplSetVersion(long long version); - void setIsArbiterOnly(bool arbiterOnly); + void addHost(const HostAndPort& host); - void setIsPassive(bool passive); + void addPassive(const HostAndPort& passive); - void setIsHidden(bool hidden); + void addArbiter(const HostAndPort& arbiter); - void setShouldBuildIndexes(bool buildIndexes); + void setPrimary(const HostAndPort& primary); - void setSlaveDelay(Seconds slaveDelay); + void setIsArbiterOnly(bool arbiterOnly); - void addTag(const std::string& tagKey, const std::string& tagValue); + void setIsPassive(bool passive); - void setMe(const HostAndPort& me); + void setIsHidden(bool hidden); - void setElectionId(const OID& electionId); + void setShouldBuildIndexes(bool buildIndexes); - /** - * Marks _configSet as false, which will cause future calls to toBSON/addToBSON to ignore - * all other member variables and output a hardcoded response indicating that we have no - * valid replica set config. - */ - void markAsNoConfig(); + void setSlaveDelay(Seconds slaveDelay); - /** - * Marks _shutdownInProgress as true, which will cause future calls to toBSON/addToBSON to - * ignore all other member variables and output a hardcoded response indicating that we are - * in the middle of shutting down. 
- */ - void markAsShutdownInProgress(); + void addTag(const std::string& tagKey, const std::string& tagValue); - private: + void setMe(const HostAndPort& me); - bool _isMaster; - bool _isMasterSet; - bool _secondary; - bool _isSecondarySet; - std::string _setName; - bool _setNameSet; - long long _setVersion; - bool _setVersionSet; - std::vector<HostAndPort> _hosts; - bool _hostsSet; - std::vector<HostAndPort> _passives; - bool _passivesSet; - std::vector<HostAndPort> _arbiters; - bool _arbitersSet; - HostAndPort _primary; - bool _primarySet; - bool _arbiterOnly; - bool _arbiterOnlySet; - bool _passive; - bool _passiveSet; - bool _hidden; - bool _hiddenSet; - bool _buildIndexes; - bool _buildIndexesSet; - Seconds _slaveDelay; - bool _slaveDelaySet; - unordered_map<std::string, std::string> _tags; - bool _tagsSet; - HostAndPort _me; - bool _meSet; - OID _electionId; + void setElectionId(const OID& electionId); - // If _configSet is false this means we don't have a valid repl set config, so toBSON - // will return a set of hardcoded values that indicate this. - bool _configSet; - // If _shutdownInProgress is true toBSON will return a set of hardcoded values to indicate - // that we are mid shutdown - bool _shutdownInProgress; - }; + /** + * Marks _configSet as false, which will cause future calls to toBSON/addToBSON to ignore + * all other member variables and output a hardcoded response indicating that we have no + * valid replica set config. + */ + void markAsNoConfig(); -} // namespace repl -} // namespace mongo + /** + * Marks _shutdownInProgress as true, which will cause future calls to toBSON/addToBSON to + * ignore all other member variables and output a hardcoded response indicating that we are + * in the middle of shutting down. + */ + void markAsShutdownInProgress(); + +private: + bool _isMaster; + bool _isMasterSet; + bool _secondary; + bool _isSecondarySet; + std::string _setName; + bool _setNameSet; + long long _setVersion; + bool _setVersionSet; + std::vector<HostAndPort> _hosts; + bool _hostsSet; + std::vector<HostAndPort> _passives; + bool _passivesSet; + std::vector<HostAndPort> _arbiters; + bool _arbitersSet; + HostAndPort _primary; + bool _primarySet; + bool _arbiterOnly; + bool _arbiterOnlySet; + bool _passive; + bool _passiveSet; + bool _hidden; + bool _hiddenSet; + bool _buildIndexes; + bool _buildIndexesSet; + Seconds _slaveDelay; + bool _slaveDelaySet; + unordered_map<std::string, std::string> _tags; + bool _tagsSet; + HostAndPort _me; + bool _meSet; + OID _electionId; + + // If _configSet is false this means we don't have a valid repl set config, so toBSON + // will return a set of hardcoded values that indicate this. 
+ bool _configSet; + // If _shutdownInProgress is true toBSON will return a set of hardcoded values to indicate + // that we are mid shutdown + bool _shutdownInProgress; +}; + +} // namespace repl +} // namespace mongo diff --git a/src/mongo/db/repl/isself.cpp b/src/mongo/db/repl/isself.cpp index 423d3025a1e..57e4720375f 100644 --- a/src/mongo/db/repl/isself.cpp +++ b/src/mongo/db/repl/isself.cpp @@ -47,7 +47,8 @@ #include "mongo/util/scopeguard.h" #include "mongo/util/log.h" -#if defined(__linux__) || defined(__APPLE__) || defined(__freebsd__) || defined(__sunos__) || defined(__openbsd__) +#if defined(__linux__) || defined(__APPLE__) || defined(__freebsd__) || defined(__sunos__) || \ + defined(__openbsd__) #define FASTPATH_UNIX 1 #endif @@ -76,281 +77,269 @@ namespace mongo { namespace repl { - OID instanceId; +OID instanceId; - MONGO_INITIALIZER(GenerateInstanceId)(InitializerContext*) { - instanceId = OID::gen(); - return Status::OK(); - } +MONGO_INITIALIZER(GenerateInstanceId)(InitializerContext*) { + instanceId = OID::gen(); + return Status::OK(); +} namespace { - /** - * Helper to convert a message from a networking function to a string. - * Needed because errnoWithDescription uses strerror on linux, when - * we need gai_strerror. - */ - std::string stringifyError(int code) { +/** + * Helper to convert a message from a networking function to a string. + * Needed because errnoWithDescription uses strerror on linux, when + * we need gai_strerror. + */ +std::string stringifyError(int code) { #if FASTPATH_UNIX - return gai_strerror(code); + return gai_strerror(code); #elif defined(_WIN32) - // FormatMessage in errnoWithDescription works here on windows - return errnoWithDescription(code); + // FormatMessage in errnoWithDescription works here on windows + return errnoWithDescription(code); #endif - } - - /** - * Resolves a host and port to a list of IP addresses. This requires a syscall. If the - * ipv6enabled parameter is true, both IPv6 and IPv4 addresses will be returned. - */ - std::vector<std::string> getAddrsForHost(const std::string& iporhost, - const int port, - const bool ipv6enabled) { - addrinfo* addrs = NULL; - addrinfo hints = {0}; - hints.ai_socktype = SOCK_STREAM; - hints.ai_family = (ipv6enabled ? AF_UNSPEC : AF_INET); - - const std::string portNum = BSONObjBuilder::numStr(port); - - std::vector<std::string> out; - - int err = getaddrinfo(iporhost.c_str(), portNum.c_str(), &hints, &addrs); +} - if (err) { - warning() << "getaddrinfo(\"" << iporhost << "\") failed: " - << stringifyError(err) << std::endl; - return out; - } +/** + * Resolves a host and port to a list of IP addresses. This requires a syscall. If the + * ipv6enabled parameter is true, both IPv6 and IPv4 addresses will be returned. + */ +std::vector<std::string> getAddrsForHost(const std::string& iporhost, + const int port, + const bool ipv6enabled) { + addrinfo* addrs = NULL; + addrinfo hints = {0}; + hints.ai_socktype = SOCK_STREAM; + hints.ai_family = (ipv6enabled ? 
AF_UNSPEC : AF_INET); + + const std::string portNum = BSONObjBuilder::numStr(port); + + std::vector<std::string> out; + + int err = getaddrinfo(iporhost.c_str(), portNum.c_str(), &hints, &addrs); + + if (err) { + warning() << "getaddrinfo(\"" << iporhost << "\") failed: " << stringifyError(err) + << std::endl; + return out; + } - ON_BLOCK_EXIT(freeaddrinfo, addrs); + ON_BLOCK_EXIT(freeaddrinfo, addrs); - for (addrinfo* addr = addrs; addr != NULL; addr = addr->ai_next) { - int family = addr->ai_family; - char host[NI_MAXHOST]; + for (addrinfo* addr = addrs; addr != NULL; addr = addr->ai_next) { + int family = addr->ai_family; + char host[NI_MAXHOST]; - if (family == AF_INET || family == AF_INET6) { - err = getnameinfo(addr->ai_addr, addr->ai_addrlen, host, - NI_MAXHOST, NULL, 0, NI_NUMERICHOST); - if (err) { - warning() << "getnameinfo() failed: " << stringifyError(err) << std::endl; - continue; - } - out.push_back(host); + if (family == AF_INET || family == AF_INET6) { + err = getnameinfo( + addr->ai_addr, addr->ai_addrlen, host, NI_MAXHOST, NULL, 0, NI_NUMERICHOST); + if (err) { + warning() << "getnameinfo() failed: " << stringifyError(err) << std::endl; + continue; } - + out.push_back(host); } + } - if (shouldLog(logger::LogSeverity::Debug(2))) { - StringBuilder builder; - builder << "getAddrsForHost(\"" << iporhost << ":" << port << "\"):"; - for (std::vector<std::string>::const_iterator o = out.begin(); o != out.end(); ++o) { - builder << " [ " << *o << "]"; - } - LOG(2) << builder.str(); + if (shouldLog(logger::LogSeverity::Debug(2))) { + StringBuilder builder; + builder << "getAddrsForHost(\"" << iporhost << ":" << port << "\"):"; + for (std::vector<std::string>::const_iterator o = out.begin(); o != out.end(); ++o) { + builder << " [ " << *o << "]"; } - - return out; + LOG(2) << builder.str(); } -} // namespace + return out; +} - bool isSelf(const HostAndPort& hostAndPort) { +} // namespace - // Fastpath: check if the host&port in question is bound to one - // of the interfaces on this machine. - // No need for ip match if the ports do not match - if (hostAndPort.port() == serverGlobalParams.port) { - std::vector<std::string> myAddrs = serverGlobalParams.bind_ip.empty() ? - getBoundAddrs(IPv6Enabled()) : - std::vector<std::string>(); +bool isSelf(const HostAndPort& hostAndPort) { + // Fastpath: check if the host&port in question is bound to one + // of the interfaces on this machine. + // No need for ip match if the ports do not match + if (hostAndPort.port() == serverGlobalParams.port) { + std::vector<std::string> myAddrs = serverGlobalParams.bind_ip.empty() + ? 
getBoundAddrs(IPv6Enabled()) + : std::vector<std::string>(); + + if (!serverGlobalParams.bind_ip.empty()) { + boost::split(myAddrs, serverGlobalParams.bind_ip, boost::is_any_of(", ")); + } - if (!serverGlobalParams.bind_ip.empty()) { - boost::split(myAddrs, serverGlobalParams.bind_ip, boost::is_any_of(", ")); - } + const std::vector<std::string> hostAddrs = + getAddrsForHost(hostAndPort.host(), hostAndPort.port(), IPv6Enabled()); - const std::vector<std::string> hostAddrs = getAddrsForHost(hostAndPort.host(), - hostAndPort.port(), - IPv6Enabled()); - - for (std::vector<std::string>::const_iterator i = myAddrs.begin(); - i != myAddrs.end(); ++i) { - for (std::vector<std::string>::const_iterator j = hostAddrs.begin(); - j != hostAddrs.end(); ++j) { - if (*i == *j) { - return true; - } + for (std::vector<std::string>::const_iterator i = myAddrs.begin(); i != myAddrs.end(); + ++i) { + for (std::vector<std::string>::const_iterator j = hostAddrs.begin(); + j != hostAddrs.end(); + ++j) { + if (*i == *j) { + return true; } } } + } + + // Ensure that the server is up and ready to accept incoming network requests. + const Listener* listener = Listener::getTimeTracker(); + if (!listener) { + return false; + } + listener->waitUntilListening(); - // Ensure that the server is up and ready to accept incoming network requests. - const Listener* listener = Listener::getTimeTracker(); - if (!listener) { + try { + DBClientConnection conn; + std::string errmsg; + conn.setSoTimeout(30); // 30 second timeout + if (!conn.connect(hostAndPort, errmsg)) { return false; } - listener->waitUntilListening(); - try { - DBClientConnection conn; - std::string errmsg; - conn.setSoTimeout(30); // 30 second timeout - if (!conn.connect(hostAndPort, errmsg)) { + if (getGlobalAuthorizationManager()->isAuthEnabled() && isInternalAuthSet()) { + if (!authenticateInternalUser(&conn)) { return false; } - - if (getGlobalAuthorizationManager()->isAuthEnabled() && isInternalAuthSet()) { - if (!authenticateInternalUser(&conn)) { - return false; - } - } - BSONObj out; - bool ok = conn.simpleCommand("admin" , &out, "_isSelf"); - bool me = ok && out["id"].type() == jstOID && instanceId == out["id"].OID(); - - return me; - } - catch (const std::exception& e) { - warning() << "could't check isSelf (" << hostAndPort << ") " << e.what() << std::endl; } + BSONObj out; + bool ok = conn.simpleCommand("admin", &out, "_isSelf"); + bool me = ok && out["id"].type() == jstOID && instanceId == out["id"].OID(); - return false; + return me; + } catch (const std::exception& e) { + warning() << "could't check isSelf (" << hostAndPort << ") " << e.what() << std::endl; } - /** - * Returns all the IP addresses bound to the network interfaces of this machine. - * This requires a syscall. If the ipv6enabled parameter is true, both IPv6 AND IPv4 - * addresses will be returned. - */ - std::vector<std::string> getBoundAddrs(const bool ipv6enabled) { - std::vector<std::string> out; + return false; +} + +/** + * Returns all the IP addresses bound to the network interfaces of this machine. + * This requires a syscall. If the ipv6enabled parameter is true, both IPv6 AND IPv4 + * addresses will be returned. 
+ */ +std::vector<std::string> getBoundAddrs(const bool ipv6enabled) { + std::vector<std::string> out; #ifdef FASTPATH_UNIX - ifaddrs* addrs; + ifaddrs* addrs; - int err = getifaddrs(&addrs); - if (err) { - warning() << "getifaddrs failure: " << errnoWithDescription(err) << std::endl; - return out; - } - ON_BLOCK_EXIT(freeifaddrs, addrs); - - // based on example code from linux getifaddrs manpage - for (ifaddrs* addr = addrs; addr != NULL; addr = addr->ifa_next) { - if (addr->ifa_addr == NULL) continue; - int family = addr->ifa_addr->sa_family; - char host[NI_MAXHOST]; - - if (family == AF_INET || (ipv6enabled && (family == AF_INET6))) { - err = getnameinfo(addr->ifa_addr, - (family == AF_INET ? sizeof(struct sockaddr_in) - : sizeof(struct sockaddr_in6)), - host, NI_MAXHOST, NULL, 0, NI_NUMERICHOST); - if (err) { - warning() << "getnameinfo() failed: " << gai_strerror(err) << std::endl; - continue; - } - out.push_back(host); + int err = getifaddrs(&addrs); + if (err) { + warning() << "getifaddrs failure: " << errnoWithDescription(err) << std::endl; + return out; + } + ON_BLOCK_EXIT(freeifaddrs, addrs); + + // based on example code from linux getifaddrs manpage + for (ifaddrs* addr = addrs; addr != NULL; addr = addr->ifa_next) { + if (addr->ifa_addr == NULL) + continue; + int family = addr->ifa_addr->sa_family; + char host[NI_MAXHOST]; + + if (family == AF_INET || (ipv6enabled && (family == AF_INET6))) { + err = getnameinfo( + addr->ifa_addr, + (family == AF_INET ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6)), + host, + NI_MAXHOST, + NULL, + 0, + NI_NUMERICHOST); + if (err) { + warning() << "getnameinfo() failed: " << gai_strerror(err) << std::endl; + continue; } + out.push_back(host); } + } #elif defined(_WIN32) - // Start with the MS recommended 15KB buffer. Use multiple attempts - // for the rare case that the adapter config changes between calls + // Start with the MS recommended 15KB buffer. Use multiple attempts + // for the rare case that the adapter config changes between calls - ULONG adaptersLen = 15 * 1024; - boost::scoped_array<char> buf(new char[adaptersLen]); - IP_ADAPTER_ADDRESSES* adapters = reinterpret_cast<IP_ADAPTER_ADDRESSES*>(buf.get()); - DWORD err; + ULONG adaptersLen = 15 * 1024; + boost::scoped_array<char> buf(new char[adaptersLen]); + IP_ADAPTER_ADDRESSES* adapters = reinterpret_cast<IP_ADAPTER_ADDRESSES*>(buf.get()); + DWORD err; - ULONG family = ipv6enabled ? AF_UNSPEC : AF_INET; + ULONG family = ipv6enabled ? 
AF_UNSPEC : AF_INET; - for (int tries = 0; tries < 3; ++tries) { - err = GetAdaptersAddresses(family, - GAA_FLAG_SKIP_ANYCAST | // only want unicast addrs + for (int tries = 0; tries < 3; ++tries) { + err = GetAdaptersAddresses(family, + GAA_FLAG_SKIP_ANYCAST | // only want unicast addrs GAA_FLAG_SKIP_MULTICAST | GAA_FLAG_SKIP_DNS_SERVER, - NULL, - adapters, - &adaptersLen); - - if (err == ERROR_BUFFER_OVERFLOW) { - // in this case, adaptersLen will be set to the size we need to allocate - buf.reset(new char[adaptersLen]); - adapters = reinterpret_cast<IP_ADAPTER_ADDRESSES*>(buf.get()); - } - else { - break; // only retry for incorrectly sized buffer - } + NULL, + adapters, + &adaptersLen); + + if (err == ERROR_BUFFER_OVERFLOW) { + // in this case, adaptersLen will be set to the size we need to allocate + buf.reset(new char[adaptersLen]); + adapters = reinterpret_cast<IP_ADAPTER_ADDRESSES*>(buf.get()); + } else { + break; // only retry for incorrectly sized buffer } + } - if (err != NO_ERROR) { - warning() << "GetAdaptersAddresses() failed: " << errnoWithDescription(err) - << std::endl; - return out; - } + if (err != NO_ERROR) { + warning() << "GetAdaptersAddresses() failed: " << errnoWithDescription(err) << std::endl; + return out; + } - for (IP_ADAPTER_ADDRESSES* adapter = adapters; - adapter != NULL; adapter = adapter->Next) { - for (IP_ADAPTER_UNICAST_ADDRESS* addr = adapter->FirstUnicastAddress; - addr != NULL; addr = addr->Next) { - - short family = - reinterpret_cast<SOCKADDR_STORAGE*>(addr->Address.lpSockaddr)->ss_family; - - if (family == AF_INET) { - // IPv4 - SOCKADDR_IN* sock = reinterpret_cast<SOCKADDR_IN*>(addr->Address.lpSockaddr); - char addrstr[INET_ADDRSTRLEN] = {0}; - boost::system::error_code ec; - // Not all windows versions have inet_ntop - boost::asio::detail::socket_ops::inet_ntop(AF_INET, - &(sock->sin_addr), - addrstr, - INET_ADDRSTRLEN, - 0, - ec); - if (ec) { - warning() << "inet_ntop failed during IPv4 address conversion: " - << ec.message() << std::endl; - continue; - } - out.push_back(addrstr); + for (IP_ADAPTER_ADDRESSES* adapter = adapters; adapter != NULL; adapter = adapter->Next) { + for (IP_ADAPTER_UNICAST_ADDRESS* addr = adapter->FirstUnicastAddress; addr != NULL; + addr = addr->Next) { + short family = reinterpret_cast<SOCKADDR_STORAGE*>(addr->Address.lpSockaddr)->ss_family; + + if (family == AF_INET) { + // IPv4 + SOCKADDR_IN* sock = reinterpret_cast<SOCKADDR_IN*>(addr->Address.lpSockaddr); + char addrstr[INET_ADDRSTRLEN] = {0}; + boost::system::error_code ec; + // Not all windows versions have inet_ntop + boost::asio::detail::socket_ops::inet_ntop( + AF_INET, &(sock->sin_addr), addrstr, INET_ADDRSTRLEN, 0, ec); + if (ec) { + warning() << "inet_ntop failed during IPv4 address conversion: " << ec.message() + << std::endl; + continue; } - else if (family == AF_INET6) { - // IPv6 - SOCKADDR_IN6* sock = reinterpret_cast<SOCKADDR_IN6*>(addr->Address.lpSockaddr); - char addrstr[INET6_ADDRSTRLEN] = {0}; - boost::system::error_code ec; - boost::asio::detail::socket_ops::inet_ntop(AF_INET6, - &(sock->sin6_addr), - addrstr, - INET6_ADDRSTRLEN, - 0, - ec); - if (ec) { - warning() << "inet_ntop failed during IPv6 address conversion: " - << ec.message() << std::endl; - continue; - } - out.push_back(addrstr); + out.push_back(addrstr); + } else if (family == AF_INET6) { + // IPv6 + SOCKADDR_IN6* sock = reinterpret_cast<SOCKADDR_IN6*>(addr->Address.lpSockaddr); + char addrstr[INET6_ADDRSTRLEN] = {0}; + boost::system::error_code ec; + 
boost::asio::detail::socket_ops::inet_ntop( + AF_INET6, &(sock->sin6_addr), addrstr, INET6_ADDRSTRLEN, 0, ec); + if (ec) { + warning() << "inet_ntop failed during IPv6 address conversion: " << ec.message() + << std::endl; + continue; } + out.push_back(addrstr); } } + } #endif // defined(_WIN32) - if (shouldLog(logger::LogSeverity::Debug(2))) { - StringBuilder builder; - builder << "getBoundAddrs():"; - for (std::vector<std::string>::const_iterator o = out.begin(); o != out.end(); ++o) { - builder << " [ " << *o << "]"; - } - LOG(2) << builder.str(); + if (shouldLog(logger::LogSeverity::Debug(2))) { + StringBuilder builder; + builder << "getBoundAddrs():"; + for (std::vector<std::string>::const_iterator o = out.begin(); o != out.end(); ++o) { + builder << " [ " << *o << "]"; } - return out; + LOG(2) << builder.str(); } + return out; +} } // namespace repl } // namespace mongo diff --git a/src/mongo/db/repl/isself.h b/src/mongo/db/repl/isself.h index cbcbbd9f031..19b61bf47bd 100644 --- a/src/mongo/db/repl/isself.h +++ b/src/mongo/db/repl/isself.h @@ -34,30 +34,30 @@ #include "mongo/bson/oid.h" namespace mongo { - struct HostAndPort; +struct HostAndPort; namespace repl { - /** - * An identifier unique to this instance. Used by isSelf to see if we are talking - * to ourself or someone else. - */ - extern OID instanceId; - - /** - * Returns true if "hostAndPort" identifies this instance. - */ - bool isSelf(const HostAndPort& hostAndPort); - - /** - * Returns all the IP addresses bound to the network interfaces of this machine. - * This requires a syscall. If the ipv6enabled parameter is true, both IPv6 AND IPv4 - * addresses will be returned. - * - * Note: this only works on Linux and Windows. All calls should be properly ifdef'd, - * otherwise an invariant will be triggered. - */ - std::vector<std::string> getBoundAddrs(const bool ipv6enabled); - -} // namespace repl -} // namespace mongo +/** + * An identifier unique to this instance. Used by isSelf to see if we are talking + * to ourself or someone else. + */ +extern OID instanceId; + +/** + * Returns true if "hostAndPort" identifies this instance. + */ +bool isSelf(const HostAndPort& hostAndPort); + +/** + * Returns all the IP addresses bound to the network interfaces of this machine. + * This requires a syscall. If the ipv6enabled parameter is true, both IPv6 AND IPv4 + * addresses will be returned. + * + * Note: this only works on Linux and Windows. All calls should be properly ifdef'd, + * otherwise an invariant will be triggered. + */ +std::vector<std::string> getBoundAddrs(const bool ipv6enabled); + +} // namespace repl +} // namespace mongo diff --git a/src/mongo/db/repl/isself_test.cpp b/src/mongo/db/repl/isself_test.cpp index b3af1721e62..b6a3e26f8e6 100644 --- a/src/mongo/db/repl/isself_test.cpp +++ b/src/mongo/db/repl/isself_test.cpp @@ -40,45 +40,41 @@ namespace repl { namespace { - using std::string; +using std::string; - TEST(IsSelf, DetectsSameHostIPv4) { +TEST(IsSelf, DetectsSameHostIPv4) { #if defined(_WIN32) || defined(__linux__) || defined(__APPLE__) - bool wasEnabled = IPv6Enabled(); - enableIPv6(false); - ON_BLOCK_EXIT(enableIPv6, wasEnabled); - // first we get the addrs bound on this host - const std::vector<std::string> addrs = getBoundAddrs(false); - // Fastpath should agree with the result of getBoundAddrs - // since it uses it... 
- for (std::vector<string>::const_iterator it = addrs.begin(); - it != addrs.end(); ++it) { - - ASSERT(isSelf(HostAndPort(*it, serverGlobalParams.port))); - } + bool wasEnabled = IPv6Enabled(); + enableIPv6(false); + ON_BLOCK_EXIT(enableIPv6, wasEnabled); + // first we get the addrs bound on this host + const std::vector<std::string> addrs = getBoundAddrs(false); + // Fastpath should agree with the result of getBoundAddrs + // since it uses it... + for (std::vector<string>::const_iterator it = addrs.begin(); it != addrs.end(); ++it) { + ASSERT(isSelf(HostAndPort(*it, serverGlobalParams.port))); + } #else - ASSERT(true); + ASSERT(true); #endif - } +} - TEST(IsSelf, DetectsSameHostIPv6) { +TEST(IsSelf, DetectsSameHostIPv6) { #if defined(_WIN32) || defined(__linux__) || defined(__APPLE__) - bool wasEnabled = IPv6Enabled(); - enableIPv6(true); - ON_BLOCK_EXIT(enableIPv6, wasEnabled); - // first we get the addrs bound on this host - const std::vector<std::string> addrs = getBoundAddrs(true); - // Fastpath should agree with the result of getBoundAddrs - // since it uses it... - for (std::vector<string>::const_iterator it = addrs.begin(); - it != addrs.end(); ++it) { - - ASSERT(isSelf(HostAndPort(*it, serverGlobalParams.port))); - } + bool wasEnabled = IPv6Enabled(); + enableIPv6(true); + ON_BLOCK_EXIT(enableIPv6, wasEnabled); + // first we get the addrs bound on this host + const std::vector<std::string> addrs = getBoundAddrs(true); + // Fastpath should agree with the result of getBoundAddrs + // since it uses it... + for (std::vector<string>::const_iterator it = addrs.begin(); it != addrs.end(); ++it) { + ASSERT(isSelf(HostAndPort(*it, serverGlobalParams.port))); + } #else - ASSERT(true); + ASSERT(true); #endif - } +} } // namespace diff --git a/src/mongo/db/repl/master_slave.cpp b/src/mongo/db/repl/master_slave.cpp index 7cc44b022f1..8844ae7828d 100644 --- a/src/mongo/db/repl/master_slave.cpp +++ b/src/mongo/db/repl/master_slave.cpp @@ -81,1319 +81,1292 @@ using std::vector; namespace mongo { namespace repl { - void pretouchOperation(OperationContext* txn, const BSONObj& op); - void pretouchN(vector<BSONObj>&, unsigned a, unsigned b); +void pretouchOperation(OperationContext* txn, const BSONObj& op); +void pretouchN(vector<BSONObj>&, unsigned a, unsigned b); - /* if 1 sync() is running */ - volatile int syncing = 0; - volatile int relinquishSyncingSome = 0; +/* if 1 sync() is running */ +volatile int syncing = 0; +volatile int relinquishSyncingSome = 0; - static time_t lastForcedResync = 0; +static time_t lastForcedResync = 0; - /* output by the web console */ - const char *replInfo = ""; - struct ReplInfo { - ReplInfo(const char *msg) { - replInfo = msg; - } - ~ReplInfo() { - replInfo = "?"; - } - }; - - - ReplSource::ReplSource(OperationContext* txn) { - nClonedThisPass = 0; - ensureMe(txn); +/* output by the web console */ +const char* replInfo = ""; +struct ReplInfo { + ReplInfo(const char* msg) { + replInfo = msg; } - - ReplSource::ReplSource(OperationContext* txn, BSONObj o) : nClonedThisPass(0) { - only = o.getStringField("only"); - hostName = o.getStringField("host"); - _sourceName = o.getStringField("source"); - uassert( 10118 , "'host' field not set in sources collection object", !hostName.empty() ); - uassert( 10119 , "only source='main' allowed for now with replication", sourceName() == "main" ); - BSONElement e = o.getField("syncedTo"); - if ( !e.eoo() ) { - uassert( 10120 , "bad sources 'syncedTo' field value", e.type() == Date || e.type() == Timestamp ); - OpTime tmp( e.date() 
); - syncedTo = tmp; - } - - BSONObj dbsObj = o.getObjectField("dbsNextPass"); - if ( !dbsObj.isEmpty() ) { - BSONObjIterator i(dbsObj); - while ( 1 ) { - BSONElement e = i.next(); - if ( e.eoo() ) - break; - addDbNextPass.insert( e.fieldName() ); - } - } - - dbsObj = o.getObjectField("incompleteCloneDbs"); - if ( !dbsObj.isEmpty() ) { - BSONObjIterator i(dbsObj); - while ( 1 ) { - BSONElement e = i.next(); - if ( e.eoo() ) - break; - incompleteCloneDbs.insert( e.fieldName() ); - } - } - ensureMe(txn); + ~ReplInfo() { + replInfo = "?"; + } +}; + + +ReplSource::ReplSource(OperationContext* txn) { + nClonedThisPass = 0; + ensureMe(txn); +} + +ReplSource::ReplSource(OperationContext* txn, BSONObj o) : nClonedThisPass(0) { + only = o.getStringField("only"); + hostName = o.getStringField("host"); + _sourceName = o.getStringField("source"); + uassert(10118, "'host' field not set in sources collection object", !hostName.empty()); + uassert(10119, "only source='main' allowed for now with replication", sourceName() == "main"); + BSONElement e = o.getField("syncedTo"); + if (!e.eoo()) { + uassert( + 10120, "bad sources 'syncedTo' field value", e.type() == Date || e.type() == Timestamp); + OpTime tmp(e.date()); + syncedTo = tmp; } - /* Turn our C++ Source object into a BSONObj */ - BSONObj ReplSource::jsobj() { - BSONObjBuilder b; - b.append("host", hostName); - b.append("source", sourceName()); - if ( !only.empty() ) - b.append("only", only); - if ( !syncedTo.isNull() ) - b.appendTimestamp("syncedTo", syncedTo.asDate()); - - BSONObjBuilder dbsNextPassBuilder; - int n = 0; - for ( set<string>::iterator i = addDbNextPass.begin(); i != addDbNextPass.end(); i++ ) { - n++; - dbsNextPassBuilder.appendBool(*i, 1); + BSONObj dbsObj = o.getObjectField("dbsNextPass"); + if (!dbsObj.isEmpty()) { + BSONObjIterator i(dbsObj); + while (1) { + BSONElement e = i.next(); + if (e.eoo()) + break; + addDbNextPass.insert(e.fieldName()); } - if ( n ) - b.append("dbsNextPass", dbsNextPassBuilder.done()); + } - BSONObjBuilder incompleteCloneDbsBuilder; - n = 0; - for ( set<string>::iterator i = incompleteCloneDbs.begin(); i != incompleteCloneDbs.end(); i++ ) { - n++; - incompleteCloneDbsBuilder.appendBool(*i, 1); + dbsObj = o.getObjectField("incompleteCloneDbs"); + if (!dbsObj.isEmpty()) { + BSONObjIterator i(dbsObj); + while (1) { + BSONElement e = i.next(); + if (e.eoo()) + break; + incompleteCloneDbs.insert(e.fieldName()); } - if ( n ) - b.append("incompleteCloneDbs", incompleteCloneDbsBuilder.done()); - - return b.obj(); } + ensureMe(txn); +} + +/* Turn our C++ Source object into a BSONObj */ +BSONObj ReplSource::jsobj() { + BSONObjBuilder b; + b.append("host", hostName); + b.append("source", sourceName()); + if (!only.empty()) + b.append("only", only); + if (!syncedTo.isNull()) + b.appendTimestamp("syncedTo", syncedTo.asDate()); + + BSONObjBuilder dbsNextPassBuilder; + int n = 0; + for (set<string>::iterator i = addDbNextPass.begin(); i != addDbNextPass.end(); i++) { + n++; + dbsNextPassBuilder.appendBool(*i, 1); + } + if (n) + b.append("dbsNextPass", dbsNextPassBuilder.done()); + + BSONObjBuilder incompleteCloneDbsBuilder; + n = 0; + for (set<string>::iterator i = incompleteCloneDbs.begin(); i != incompleteCloneDbs.end(); i++) { + n++; + incompleteCloneDbsBuilder.appendBool(*i, 1); + } + if (n) + b.append("incompleteCloneDbs", incompleteCloneDbsBuilder.done()); - void ReplSource::ensureMe(OperationContext* txn) { - string myname = getHostName(); + return b.obj(); +} - // local.me is an identifier for a server for 
getLastError w:2+ - bool exists = Helpers::getSingleton(txn, "local.me", _me); +void ReplSource::ensureMe(OperationContext* txn) { + string myname = getHostName(); - if (!exists || !_me.hasField("host") || _me["host"].String() != myname) { - ScopedTransaction transaction(txn, MODE_IX); - Lock::DBLock dblk(txn->lockState(), "local", MODE_X); - WriteUnitOfWork wunit(txn); - // clean out local.me - Helpers::emptyCollection(txn, "local.me"); + // local.me is an identifier for a server for getLastError w:2+ + bool exists = Helpers::getSingleton(txn, "local.me", _me); - // repopulate - BSONObjBuilder b; - b.appendOID("_id", 0, true); - b.append("host", myname); - _me = b.obj(); - Helpers::putSingleton(txn, "local.me", _me); - wunit.commit(); - } - _me = _me.getOwned(); - } + if (!exists || !_me.hasField("host") || _me["host"].String() != myname) { + ScopedTransaction transaction(txn, MODE_IX); + Lock::DBLock dblk(txn->lockState(), "local", MODE_X); + WriteUnitOfWork wunit(txn); + // clean out local.me + Helpers::emptyCollection(txn, "local.me"); - void ReplSource::save(OperationContext* txn) { + // repopulate BSONObjBuilder b; - verify( !hostName.empty() ); - b.append("host", hostName); - // todo: finish allowing multiple source configs. - // this line doesn't work right when source is null, if that is allowed as it is now: - //b.append("source", _sourceName); - BSONObj pattern = b.done(); + b.appendOID("_id", 0, true); + b.append("host", myname); + _me = b.obj(); + Helpers::putSingleton(txn, "local.me", _me); + wunit.commit(); + } + _me = _me.getOwned(); +} - BSONObj o = jsobj(); - LOG( 1 ) << "Saving repl source: " << o << endl; +void ReplSource::save(OperationContext* txn) { + BSONObjBuilder b; + verify(!hostName.empty()); + b.append("host", hostName); + // todo: finish allowing multiple source configs. + // this line doesn't work right when source is null, if that is allowed as it is now: + // b.append("source", _sourceName); + BSONObj pattern = b.done(); - { - OpDebug debug; + BSONObj o = jsobj(); + LOG(1) << "Saving repl source: " << o << endl; - Client::Context ctx(txn, "local.sources"); + { + OpDebug debug; - const NamespaceString requestNs("local.sources"); - UpdateRequest request(requestNs); + Client::Context ctx(txn, "local.sources"); - request.setQuery(pattern); - request.setUpdates(o); - request.setUpsert(); + const NamespaceString requestNs("local.sources"); + UpdateRequest request(requestNs); - UpdateResult res = update(txn, ctx.db(), request, &debug); + request.setQuery(pattern); + request.setUpdates(o); + request.setUpsert(); - verify( ! res.modifiers ); - verify( res.numMatched == 1 ); - } - } + UpdateResult res = update(txn, ctx.db(), request, &debug); - static void addSourceToList(OperationContext* txn, - ReplSource::SourceVector &v, - ReplSource& s, - ReplSource::SourceVector &old) { - if ( !s.syncedTo.isNull() ) { // Don't reuse old ReplSource if there was a forced resync. - for ( ReplSource::SourceVector::iterator i = old.begin(); i != old.end(); ) { - if ( s == **i ) { - v.push_back(*i); - old.erase(i); - return; - } - i++; + verify(!res.modifiers); + verify(res.numMatched == 1); + } +} + +static void addSourceToList(OperationContext* txn, + ReplSource::SourceVector& v, + ReplSource& s, + ReplSource::SourceVector& old) { + if (!s.syncedTo.isNull()) { // Don't reuse old ReplSource if there was a forced resync. 
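// A null syncedTo indicates this source was reset by a forced resync, so the
// old ReplSource object (and the connection/cursor it caches) must not be
// carried over; a brand-new ReplSource is constructed below instead.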
+ for (ReplSource::SourceVector::iterator i = old.begin(); i != old.end();) { + if (s == **i) { + v.push_back(*i); + old.erase(i); + return; } + i++; } - - v.push_back( boost::shared_ptr< ReplSource >( new ReplSource( s ) ) ); } - /* we reuse our existing objects so that we can keep our existing connection - and cursor in effect. - */ - void ReplSource::loadAll(OperationContext* txn, SourceVector &v) { - const char* localSources = "local.sources"; - Client::Context ctx(txn, localSources); - SourceVector old = v; - v.clear(); - - const ReplSettings& replSettings = getGlobalReplicationCoordinator()->getSettings(); - if (!replSettings.source.empty()) { - // --source <host> specified. - // check that no items are in sources other than that - // add if missing - int n = 0; - auto_ptr<PlanExecutor> exec( - InternalPlanner::collectionScan(txn, - localSources, - ctx.db()->getCollection(localSources))); - BSONObj obj; - PlanExecutor::ExecState state; - while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) { - n++; - ReplSource tmp(txn, obj); - if (tmp.hostName != replSettings.source) { - log() << "repl: --source " << replSettings.source << " != " << tmp.hostName - << " from local.sources collection" << endl; - log() << "repl: for instructions on changing this slave's source, see:" << endl; - log() << "http://dochub.mongodb.org/core/masterslave" << endl; - log() << "repl: terminating mongod after 30 seconds" << endl; - sleepsecs(30); - dbexit( EXIT_REPLICATION_ERROR ); - } - if (tmp.only != replSettings.only) { - log() << "--only " << replSettings.only << " != " << tmp.only - << " from local.sources collection" << endl; - log() << "terminating after 30 seconds" << endl; - sleepsecs(30); - dbexit( EXIT_REPLICATION_ERROR ); - } - } - uassert(17065, "Internal error reading from local.sources", PlanExecutor::IS_EOF == state); - uassert( 10002 , "local.sources collection corrupt?", n<2 ); - if ( n == 0 ) { - // source missing. add. - ReplSource s(txn); - s.hostName = replSettings.source; - s.only = replSettings.only; - s.save(txn); - } - } - else { - try { - massert(10384 , "--only requires use of --source", replSettings.only.empty()); - } - catch ( ... ) { - dbexit( EXIT_BADOPTIONS ); - } - } + v.push_back(boost::shared_ptr<ReplSource>(new ReplSource(s))); +} - auto_ptr<PlanExecutor> exec( - InternalPlanner::collectionScan(txn, - localSources, - ctx.db()->getCollection(localSources))); +/* we reuse our existing objects so that we can keep our existing connection + and cursor in effect. +*/ +void ReplSource::loadAll(OperationContext* txn, SourceVector& v) { + const char* localSources = "local.sources"; + Client::Context ctx(txn, localSources); + SourceVector old = v; + v.clear(); + + const ReplSettings& replSettings = getGlobalReplicationCoordinator()->getSettings(); + if (!replSettings.source.empty()) { + // --source <host> specified. 
+ // check that no items are in sources other than that + // add if missing + int n = 0; + auto_ptr<PlanExecutor> exec(InternalPlanner::collectionScan( + txn, localSources, ctx.db()->getCollection(localSources))); BSONObj obj; PlanExecutor::ExecState state; while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) { + n++; ReplSource tmp(txn, obj); - if ( tmp.syncedTo.isNull() ) { - DBDirectClient c(txn); - BSONObj op = c.findOne( "local.oplog.$main", QUERY( "op" << NE << "n" ).sort( BSON( "$natural" << -1 ) ) ); - if ( !op.isEmpty() ) { - tmp.syncedTo = op[ "ts" ].date(); - } + if (tmp.hostName != replSettings.source) { + log() << "repl: --source " << replSettings.source << " != " << tmp.hostName + << " from local.sources collection" << endl; + log() << "repl: for instructions on changing this slave's source, see:" << endl; + log() << "http://dochub.mongodb.org/core/masterslave" << endl; + log() << "repl: terminating mongod after 30 seconds" << endl; + sleepsecs(30); + dbexit(EXIT_REPLICATION_ERROR); + } + if (tmp.only != replSettings.only) { + log() << "--only " << replSettings.only << " != " << tmp.only + << " from local.sources collection" << endl; + log() << "terminating after 30 seconds" << endl; + sleepsecs(30); + dbexit(EXIT_REPLICATION_ERROR); } - addSourceToList(txn, v, tmp, old); } - uassert(17066, "Internal error reading from local.sources", PlanExecutor::IS_EOF == state); - } - - bool ReplSource::throttledForceResyncDead( OperationContext* txn, const char *requester ) { - if ( time( 0 ) - lastForcedResync > 600 ) { - forceResyncDead( txn, requester ); - lastForcedResync = time( 0 ); - return true; + uassert(17065, "Internal error reading from local.sources", PlanExecutor::IS_EOF == state); + uassert(10002, "local.sources collection corrupt?", n < 2); + if (n == 0) { + // source missing. add. + ReplSource s(txn); + s.hostName = replSettings.source; + s.only = replSettings.only; + s.save(txn); + } + } else { + try { + massert(10384, "--only requires use of --source", replSettings.only.empty()); + } catch (...) { + dbexit(EXIT_BADOPTIONS); } - return false; } - void ReplSource::forceResyncDead( OperationContext* txn, const char *requester ) { - if ( !replAllDead ) - return; - SourceVector sources; - ReplSource::loadAll(txn, sources); - for( SourceVector::iterator i = sources.begin(); i != sources.end(); ++i ) { - log() << requester << " forcing resync from " << (*i)->hostName << endl; - (*i)->forceResync( txn, requester ); + auto_ptr<PlanExecutor> exec( + InternalPlanner::collectionScan(txn, localSources, ctx.db()->getCollection(localSources))); + BSONObj obj; + PlanExecutor::ExecState state; + while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) { + ReplSource tmp(txn, obj); + if (tmp.syncedTo.isNull()) { + DBDirectClient c(txn); + BSONObj op = c.findOne("local.oplog.$main", + QUERY("op" << NE << "n").sort(BSON("$natural" << -1))); + if (!op.isEmpty()) { + tmp.syncedTo = op["ts"].date(); + } } - replAllDead = 0; + addSourceToList(txn, v, tmp, old); } + uassert(17066, "Internal error reading from local.sources", PlanExecutor::IS_EOF == state); +} - bool replHandshake(DBClientConnection *conn, const OID& myRID) { - string myname = getHostName(); - - BSONObjBuilder cmd; - cmd.append("handshake", myRID); - - BSONObj res; - bool ok = conn->runCommand( "admin" , cmd.obj() , res ); - // ignoring for now on purpose for older versions - LOG( ok ? 
1 : 0 ) << "replHandshake res not: " << ok << " res: " << res << endl; +bool ReplSource::throttledForceResyncDead(OperationContext* txn, const char* requester) { + if (time(0) - lastForcedResync > 600) { + forceResyncDead(txn, requester); + lastForcedResync = time(0); return true; } + return false; +} - bool ReplSource::_connect(OplogReader* reader, const HostAndPort& host, const OID& myRID) { - if (reader->conn()) { - return true; - } +void ReplSource::forceResyncDead(OperationContext* txn, const char* requester) { + if (!replAllDead) + return; + SourceVector sources; + ReplSource::loadAll(txn, sources); + for (SourceVector::iterator i = sources.begin(); i != sources.end(); ++i) { + log() << requester << " forcing resync from " << (*i)->hostName << endl; + (*i)->forceResync(txn, requester); + } + replAllDead = 0; +} - if (!reader->connect(host)) { - return false; - } +bool replHandshake(DBClientConnection* conn, const OID& myRID) { + string myname = getHostName(); - if (!replHandshake(reader->conn(), myRID)) { - return false; - } + BSONObjBuilder cmd; + cmd.append("handshake", myRID); + + BSONObj res; + bool ok = conn->runCommand("admin", cmd.obj(), res); + // ignoring for now on purpose for older versions + LOG(ok ? 1 : 0) << "replHandshake res not: " << ok << " res: " << res << endl; + return true; +} +bool ReplSource::_connect(OplogReader* reader, const HostAndPort& host, const OID& myRID) { + if (reader->conn()) { return true; } + if (!reader->connect(host)) { + return false; + } - void ReplSource::forceResync( OperationContext* txn, const char *requester ) { - BSONObj info; - { - // This is always a GlobalWrite lock (so no ns/db used from the context) - invariant(txn->lockState()->isW()); - Lock::TempRelease tempRelease(txn->lockState()); + if (!replHandshake(reader->conn(), myRID)) { + return false; + } - if (!_connect(&oplogReader, HostAndPort(hostName), - getGlobalReplicationCoordinator()->getMyRID())) { - msgassertedNoTrace( 14051 , "unable to connect to resync"); - } - /* todo use getDatabaseNames() method here */ - bool ok = oplogReader.conn()->runCommand("admin", - BSON("listDatabases" << 1), - info, - QueryOption_SlaveOk); - massert( 10385 , "Unable to get database list", ok ); + return true; +} + + +void ReplSource::forceResync(OperationContext* txn, const char* requester) { + BSONObj info; + { + // This is always a GlobalWrite lock (so no ns/db used from the context) + invariant(txn->lockState()->isW()); + Lock::TempRelease tempRelease(txn->lockState()); + + if (!_connect(&oplogReader, + HostAndPort(hostName), + getGlobalReplicationCoordinator()->getMyRID())) { + msgassertedNoTrace(14051, "unable to connect to resync"); } + /* todo use getDatabaseNames() method here */ + bool ok = oplogReader.conn()->runCommand( + "admin", BSON("listDatabases" << 1), info, QueryOption_SlaveOk); + massert(10385, "Unable to get database list", ok); + } - BSONObjIterator i( info.getField( "databases" ).embeddedObject() ); - while( i.moreWithEOO() ) { - BSONElement e = i.next(); - if ( e.eoo() ) - break; - string name = e.embeddedObject().getField( "name" ).valuestr(); - if ( !e.embeddedObject().getBoolField( "empty" ) ) { - if ( name != "local" ) { - if ( only.empty() || only == name ) { - resyncDrop( txn, name ); - } + BSONObjIterator i(info.getField("databases").embeddedObject()); + while (i.moreWithEOO()) { + BSONElement e = i.next(); + if (e.eoo()) + break; + string name = e.embeddedObject().getField("name").valuestr(); + if (!e.embeddedObject().getBoolField("empty")) { + if (name != 
"local") { + if (only.empty() || only == name) { + resyncDrop(txn, name); } } } - syncedTo = OpTime(); - addDbNextPass.clear(); - save(txn); } - - void ReplSource::resyncDrop( OperationContext* txn, const string& db ) { - log() << "resync: dropping database " << db; - Client::Context ctx(txn, db); - dropDatabase(txn, ctx.db()); + syncedTo = OpTime(); + addDbNextPass.clear(); + save(txn); +} + +void ReplSource::resyncDrop(OperationContext* txn, const string& db) { + log() << "resync: dropping database " << db; + Client::Context ctx(txn, db); + dropDatabase(txn, ctx.db()); +} + +/* grab initial copy of a database from the master */ +void ReplSource::resync(OperationContext* txn, const std::string& dbName) { + const std::string db(dbName); // need local copy of the name, we're dropping the original + resyncDrop(txn, db); + + { + log() << "resync: cloning database " << db << " to get an initial copy" << endl; + ReplInfo r("resync: cloning a database"); + string errmsg; + int errCode = 0; + CloneOptions cloneOptions; + cloneOptions.fromDB = db; + cloneOptions.logForRepl = false; + cloneOptions.slaveOk = true; + cloneOptions.useReplAuth = true; + cloneOptions.snapshot = true; + cloneOptions.mayYield = true; + cloneOptions.mayBeInterrupted = false; + + Cloner cloner; + bool ok = cloner.go(txn, db, hostName.c_str(), cloneOptions, NULL, errmsg, &errCode); + + if (!ok) { + if (errCode == DatabaseDifferCaseCode) { + resyncDrop(txn, db); + log() << "resync: database " << db + << " not valid on the master due to a name conflict, dropping." << endl; + return; + } else { + log() << "resync of " << db << " from " << hostName << " failed " << errmsg << endl; + throw SyncException(); + } + } } - /* grab initial copy of a database from the master */ - void ReplSource::resync(OperationContext* txn, const std::string& dbName) { - const std::string db(dbName); // need local copy of the name, we're dropping the original - resyncDrop( txn, db ); + log() << "resync: done with initial clone for db: " << db << endl; - { - log() << "resync: cloning database " << db << " to get an initial copy" << endl; - ReplInfo r("resync: cloning a database"); - string errmsg; - int errCode = 0; - CloneOptions cloneOptions; - cloneOptions.fromDB = db; - cloneOptions.logForRepl = false; - cloneOptions.slaveOk = true; - cloneOptions.useReplAuth = true; - cloneOptions.snapshot = true; - cloneOptions.mayYield = true; - cloneOptions.mayBeInterrupted = false; - - Cloner cloner; - bool ok = cloner.go(txn, - db, - hostName.c_str(), - cloneOptions, - NULL, - errmsg, - &errCode); - - if ( !ok ) { - if ( errCode == DatabaseDifferCaseCode ) { - resyncDrop( txn, db ); - log() << "resync: database " << db << " not valid on the master due to a name conflict, dropping." 
<< endl; - return; - } - else { - log() << "resync of " << db << " from " << hostName << " failed " << errmsg << endl; - throw SyncException(); - } - } - } + return; +} - log() << "resync: done with initial clone for db: " << db << endl; +static DatabaseIgnorer ___databaseIgnorer; - return; - } - - static DatabaseIgnorer ___databaseIgnorer; - - void DatabaseIgnorer::doIgnoreUntilAfter( const string &db, const OpTime &futureOplogTime ) { - if ( futureOplogTime > _ignores[ db ] ) { - _ignores[ db ] = futureOplogTime; - } +void DatabaseIgnorer::doIgnoreUntilAfter(const string& db, const OpTime& futureOplogTime) { + if (futureOplogTime > _ignores[db]) { + _ignores[db] = futureOplogTime; } +} - bool DatabaseIgnorer::ignoreAt( const string &db, const OpTime ¤tOplogTime ) { - if ( _ignores[ db ].isNull() ) { - return false; - } - if ( _ignores[ db ] >= currentOplogTime ) { - return true; - } else { - // The ignore state has expired, so clear it. - _ignores.erase( db ); - return false; - } +bool DatabaseIgnorer::ignoreAt(const string& db, const OpTime& currentOplogTime) { + if (_ignores[db].isNull()) { + return false; + } + if (_ignores[db] >= currentOplogTime) { + return true; + } else { + // The ignore state has expired, so clear it. + _ignores.erase(db); + return false; + } +} + +bool ReplSource::handleDuplicateDbName(OperationContext* txn, + const BSONObj& op, + const char* ns, + const char* db) { + // We are already locked at this point + if (dbHolder().get(txn, ns) != NULL) { + // Database is already present. + return true; + } + BSONElement ts = op.getField("ts"); + if ((ts.type() == Date || ts.type() == Timestamp) && + ___databaseIgnorer.ignoreAt(db, ts.date())) { + // Database is ignored due to a previous indication that it is + // missing from master after optime "ts". + return false; + } + if (Database::duplicateUncasedName(db).empty()) { + // No duplicate database names are present. + return true; } - bool ReplSource::handleDuplicateDbName( OperationContext* txn, - const BSONObj &op, - const char* ns, - const char* db ) { - // We are already locked at this point - if (dbHolder().get(txn, ns) != NULL) { - // Database is already present. - return true; - } - BSONElement ts = op.getField( "ts" ); - if ( ( ts.type() == Date || ts.type() == Timestamp ) && ___databaseIgnorer.ignoreAt( db, ts.date() ) ) { - // Database is ignored due to a previous indication that it is - // missing from master after optime "ts". - return false; - } - if (Database::duplicateUncasedName(db).empty()) { - // No duplicate database names are present. - return true; + OpTime lastTime; + bool dbOk = false; + { + // This is always a GlobalWrite lock (so no ns/db used from the context) + invariant(txn->lockState()->isW()); + Lock::TempRelease(txn->lockState()); + + // We always log an operation after executing it (never before), so + // a database list will always be valid as of an oplog entry generated + // before it was retrieved. 
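// A reverse natural-order scan visits the capped oplog newest-first, so
// e.g. findOne(ns, Query().sort(BSON("$natural" << -1))) returns the most
// recently logged entry (and thus the latest optime) without any index.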
+ + BSONObj last = + oplogReader.findOne(this->ns().c_str(), Query().sort(BSON("$natural" << -1))); + if (!last.isEmpty()) { + BSONElement ts = last.getField("ts"); + massert( + 14032, "Invalid 'ts' in remote log", ts.type() == Date || ts.type() == Timestamp); + lastTime = OpTime(ts.date()); } - - OpTime lastTime; - bool dbOk = false; - { - // This is always a GlobalWrite lock (so no ns/db used from the context) - invariant(txn->lockState()->isW()); - Lock::TempRelease(txn->lockState()); - - // We always log an operation after executing it (never before), so - // a database list will always be valid as of an oplog entry generated - // before it was retrieved. - - BSONObj last = oplogReader.findOne( this->ns().c_str(), Query().sort( BSON( "$natural" << -1 ) ) ); - if ( !last.isEmpty() ) { - BSONElement ts = last.getField( "ts" ); - massert( 14032, "Invalid 'ts' in remote log", ts.type() == Date || ts.type() == Timestamp ); - lastTime = OpTime( ts.date() ); - } - BSONObj info; - bool ok = oplogReader.conn()->runCommand( "admin", BSON( "listDatabases" << 1 ), info ); - massert( 14033, "Unable to get database list", ok ); - BSONObjIterator i( info.getField( "databases" ).embeddedObject() ); - while( i.more() ) { - BSONElement e = i.next(); - - const char * name = e.embeddedObject().getField( "name" ).valuestr(); - if ( strcasecmp( name, db ) != 0 ) - continue; - - if ( strcmp( name, db ) == 0 ) { - // The db exists on master, still need to check that no conflicts exist there. - dbOk = true; - continue; - } - - // The master has a db name that conflicts with the requested name. - dbOk = false; - break; + BSONObj info; + bool ok = oplogReader.conn()->runCommand("admin", BSON("listDatabases" << 1), info); + massert(14033, "Unable to get database list", ok); + BSONObjIterator i(info.getField("databases").embeddedObject()); + while (i.more()) { + BSONElement e = i.next(); + + const char* name = e.embeddedObject().getField("name").valuestr(); + if (strcasecmp(name, db) != 0) + continue; + + if (strcmp(name, db) == 0) { + // The db exists on master, still need to check that no conflicts exist there. + dbOk = true; + continue; } + + // The master has a db name that conflicts with the requested name. + dbOk = false; + break; } - - if ( !dbOk ) { - ___databaseIgnorer.doIgnoreUntilAfter( db, lastTime ); - incompleteCloneDbs.erase(db); - addDbNextPass.erase(db); - return false; - } - - // Check for duplicates again, since we released the lock above. - set< string > duplicates; - Database::duplicateUncasedName(db, &duplicates); - - // The database is present on the master and no conflicting databases - // are present on the master. Drop any local conflicts. 
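// e.g. with a local database "FOO" conflicting with a requested db "foo",
// duplicateUncasedName("foo", &duplicates) would (presumably) report "FOO";
// each such name is ignored up to the master's latest optime, then dropped.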
- for( set< string >::const_iterator i = duplicates.begin(); i != duplicates.end(); ++i ) { - ___databaseIgnorer.doIgnoreUntilAfter( *i, lastTime ); - incompleteCloneDbs.erase(*i); - addDbNextPass.erase(*i); - - Client::Context ctx(txn, *i); - dropDatabase(txn, ctx.db()); - } - - massert(14034, "Duplicate database names present after attempting to delete duplicates", - Database::duplicateUncasedName(db).empty()); - return true; } - void ReplSource::applyOperation(OperationContext* txn, Database* db, const BSONObj& op) { - try { - bool failedUpdate = applyOperation_inlock( txn, db, op ); - if (failedUpdate) { - Sync sync(hostName); - if (sync.shouldRetry(txn, op)) { - uassert(15914, - "Failure retrying initial sync update", - !applyOperation_inlock(txn, db, op)); - } + if (!dbOk) { + ___databaseIgnorer.doIgnoreUntilAfter(db, lastTime); + incompleteCloneDbs.erase(db); + addDbNextPass.erase(db); + return false; + } + + // Check for duplicates again, since we released the lock above. + set<string> duplicates; + Database::duplicateUncasedName(db, &duplicates); + + // The database is present on the master and no conflicting databases + // are present on the master. Drop any local conflicts. + for (set<string>::const_iterator i = duplicates.begin(); i != duplicates.end(); ++i) { + ___databaseIgnorer.doIgnoreUntilAfter(*i, lastTime); + incompleteCloneDbs.erase(*i); + addDbNextPass.erase(*i); + + Client::Context ctx(txn, *i); + dropDatabase(txn, ctx.db()); + } + + massert(14034, + "Duplicate database names present after attempting to delete duplicates", + Database::duplicateUncasedName(db).empty()); + return true; +} + +void ReplSource::applyOperation(OperationContext* txn, Database* db, const BSONObj& op) { + try { + bool failedUpdate = applyOperation_inlock(txn, db, op); + if (failedUpdate) { + Sync sync(hostName); + if (sync.shouldRetry(txn, op)) { + uassert(15914, + "Failure retrying initial sync update", + !applyOperation_inlock(txn, db, op)); } } - catch ( UserException& e ) { - log() << "sync: caught user assertion " << e << " while applying op: " << op << endl;; - } - catch ( DBException& e ) { - log() << "sync: caught db exception " << e << " while applying op: " << op << endl;; - } - + } catch (UserException& e) { + log() << "sync: caught user assertion " << e << " while applying op: " << op << endl; + ; + } catch (DBException& e) { + log() << "sync: caught db exception " << e << " while applying op: " << op << endl; + ; } +} - /* local.$oplog.main is of the form: - { ts: ..., op: <optype>, ns: ..., o: <obj> , o2: <extraobj>, b: <boolflag> } - ... - see logOp() comments. +/* local.$oplog.main is of the form: + { ts: ..., op: <optype>, ns: ..., o: <obj> , o2: <extraobj>, b: <boolflag> } + ... + see logOp() comments. 
- @param alreadyLocked caller already put us in write lock if true - */ - void ReplSource::_sync_pullOpLog_applyOperation(OperationContext* txn, BSONObj& op, bool alreadyLocked) { - LOG(6) << "processing op: " << op << endl; + @param alreadyLocked caller already put us in write lock if true +*/ +void ReplSource::_sync_pullOpLog_applyOperation(OperationContext* txn, + BSONObj& op, + bool alreadyLocked) { + LOG(6) << "processing op: " << op << endl; - if( op.getStringField("op")[0] == 'n' ) - return; + if (op.getStringField("op")[0] == 'n') + return; - char clientName[MaxDatabaseNameLen]; - const char *ns = op.getStringField("ns"); - nsToDatabase(ns, clientName); + char clientName[MaxDatabaseNameLen]; + const char* ns = op.getStringField("ns"); + nsToDatabase(ns, clientName); - if ( *ns == '.' ) { - log() << "skipping bad op in oplog: " << op.toString() << endl; - return; - } - else if ( *ns == 0 ) { - /*if( op.getStringField("op")[0] != 'n' )*/ { - log() << "halting replication, bad op in oplog:\n " << op.toString() << endl; - replAllDead = "bad object in oplog"; - throw SyncException(); - } - //ns = "local.system.x"; - //nsToDatabase(ns, clientName); + if (*ns == '.') { + log() << "skipping bad op in oplog: " << op.toString() << endl; + return; + } else if (*ns == 0) { + /*if( op.getStringField("op")[0] != 'n' )*/ { + log() << "halting replication, bad op in oplog:\n " << op.toString() << endl; + replAllDead = "bad object in oplog"; + throw SyncException(); } + // ns = "local.system.x"; + // nsToDatabase(ns, clientName); + } - if ( !only.empty() && only != clientName ) - return; + if (!only.empty() && only != clientName) + return; - const ReplSettings& replSettings = getGlobalReplicationCoordinator()->getSettings(); - if (replSettings.pretouch && - !alreadyLocked/*doesn't make sense if in write lock already*/) { - if (replSettings.pretouch > 1) { - /* note: this is bad - should be put in ReplSource. but this is first test... */ - static int countdown; - verify( countdown >= 0 ); - if( countdown > 0 ) { - countdown--; // was pretouched on a prev pass + const ReplSettings& replSettings = getGlobalReplicationCoordinator()->getSettings(); + if (replSettings.pretouch && !alreadyLocked /*doesn't make sense if in write lock already*/) { + if (replSettings.pretouch > 1) { + /* note: this is bad - should be put in ReplSource. but this is first test... */ + static int countdown; + verify(countdown >= 0); + if (countdown > 0) { + countdown--; // was pretouched on a prev pass + } else { + const int m = 4; + if (tp.get() == 0) { + int nthr = min(8, replSettings.pretouch); + nthr = max(nthr, 1); + tp.reset(new ThreadPool(nthr)); } - else { - const int m = 4; - if( tp.get() == 0 ) { - int nthr = min(8, replSettings.pretouch); - nthr = max(nthr, 1); - tp.reset( new ThreadPool(nthr) ); - } - vector<BSONObj> v; - oplogReader.peek(v, replSettings.pretouch); - unsigned a = 0; - while( 1 ) { - if( a >= v.size() ) break; - unsigned b = a + m - 1; // v[a..b] - if( b >= v.size() ) b = v.size() - 1; - tp->schedule(pretouchN, v, a, b); - DEV cout << "pretouch task: " << a << ".." << b << endl; - a += m; - } - // we do one too... - pretouchOperation(txn, op); - tp->join(); - countdown = v.size(); + vector<BSONObj> v; + oplogReader.peek(v, replSettings.pretouch); + unsigned a = 0; + while (1) { + if (a >= v.size()) + break; + unsigned b = a + m - 1; // v[a..b] + if (b >= v.size()) + b = v.size() - 1; + tp->schedule(pretouchN, v, a, b); + DEV cout << "pretouch task: " << a << ".." 
<< b << endl; + a += m; } - } - else { + // we do one too... pretouchOperation(txn, op); + tp->join(); + countdown = v.size(); } + } else { + pretouchOperation(txn, op); } + } - scoped_ptr<Lock::GlobalWrite> lk(alreadyLocked ? 0 : new Lock::GlobalWrite(txn->lockState())); + scoped_ptr<Lock::GlobalWrite> lk(alreadyLocked ? 0 : new Lock::GlobalWrite(txn->lockState())); - if ( replAllDead ) { - // hmmm why is this check here and not at top of this function? does it get set between top and here? - log() << "replAllDead, throwing SyncException: " << replAllDead << endl; - throw SyncException(); - } + if (replAllDead) { + // hmmm why is this check here and not at top of this function? does it get set between top and here? + log() << "replAllDead, throwing SyncException: " << replAllDead << endl; + throw SyncException(); + } - if (!handleDuplicateDbName(txn, op, ns, clientName)) { - return; - } + if (!handleDuplicateDbName(txn, op, ns, clientName)) { + return; + } - // This code executes on the slaves only, so it doesn't need to be sharding-aware since - // mongos will not send requests there. That's why the last argument is false (do not do - // version checking). - Client::Context ctx(txn, ns, false); - ctx.getClient()->curop()->reset(); + // This code executes on the slaves only, so it doesn't need to be sharding-aware since + // mongos will not send requests there. That's why the last argument is false (do not do + // version checking). + Client::Context ctx(txn, ns, false); + ctx.getClient()->curop()->reset(); - bool empty = !ctx.db()->getDatabaseCatalogEntry()->hasUserData(); - bool incompleteClone = incompleteCloneDbs.count( clientName ) != 0; + bool empty = !ctx.db()->getDatabaseCatalogEntry()->hasUserData(); + bool incompleteClone = incompleteCloneDbs.count(clientName) != 0; - LOG(6) << "ns: " << ns << ", justCreated: " << ctx.justCreated() << ", empty: " << empty << ", incompleteClone: " << incompleteClone << endl; + LOG(6) << "ns: " << ns << ", justCreated: " << ctx.justCreated() << ", empty: " << empty + << ", incompleteClone: " << incompleteClone << endl; - // always apply admin command command - // this is a bit hacky -- the semantics of replication/commands aren't well specified - if ( strcmp( clientName, "admin" ) == 0 && *op.getStringField( "op" ) == 'c' ) { - applyOperation(txn, ctx.db(), op); - return; - } + // always apply admin command command + // this is a bit hacky -- the semantics of replication/commands aren't well specified + if (strcmp(clientName, "admin") == 0 && *op.getStringField("op") == 'c') { + applyOperation(txn, ctx.db(), op); + return; + } - if ( ctx.justCreated() || empty || incompleteClone ) { - // we must add to incomplete list now that setClient has been called - incompleteCloneDbs.insert( clientName ); - if ( nClonedThisPass ) { - /* we only clone one database per pass, even if a lot need done. This helps us - avoid overflowing the master's transaction log by doing too much work before going - back to read more transactions. (Imagine a scenario of slave startup where we try to - clone 100 databases in one pass.) - */ - addDbNextPass.insert( clientName ); - } - else { - if ( incompleteClone ) { - log() << "An earlier initial clone of '" << clientName << "' did not complete, now resyncing." 
<< endl; - } - save(txn); - Client::Context ctx(txn, ns); - nClonedThisPass++; - resync(txn, ctx.db()->name()); - addDbNextPass.erase(clientName); - incompleteCloneDbs.erase( clientName ); + if (ctx.justCreated() || empty || incompleteClone) { + // we must add to incomplete list now that setClient has been called + incompleteCloneDbs.insert(clientName); + if (nClonedThisPass) { + /* we only clone one database per pass, even if a lot need done. This helps us + avoid overflowing the master's transaction log by doing too much work before going + back to read more transactions. (Imagine a scenario of slave startup where we try to + clone 100 databases in one pass.) + */ + addDbNextPass.insert(clientName); + } else { + if (incompleteClone) { + log() << "An earlier initial clone of '" << clientName + << "' did not complete, now resyncing." << endl; } save(txn); + Client::Context ctx(txn, ns); + nClonedThisPass++; + resync(txn, ctx.db()->name()); + addDbNextPass.erase(clientName); + incompleteCloneDbs.erase(clientName); } - else { - applyOperation(txn, ctx.db(), op); - addDbNextPass.erase( clientName ); - } + save(txn); + } else { + applyOperation(txn, ctx.db(), op); + addDbNextPass.erase(clientName); } +} - void ReplSource::syncToTailOfRemoteLog() { - string _ns = ns(); - BSONObjBuilder b; - if ( !only.empty() ) { - b.appendRegex("ns", string("^") + pcrecpp::RE::QuoteMeta( only )); - } - BSONObj last = oplogReader.findOne( _ns.c_str(), Query( b.done() ).sort( BSON( "$natural" << -1 ) ) ); - if ( !last.isEmpty() ) { - BSONElement ts = last.getField( "ts" ); - massert( 10386 , "non Date ts found: " + last.toString(), ts.type() == Date || ts.type() == Timestamp ); - syncedTo = OpTime( ts.date() ); - } +void ReplSource::syncToTailOfRemoteLog() { + string _ns = ns(); + BSONObjBuilder b; + if (!only.empty()) { + b.appendRegex("ns", string("^") + pcrecpp::RE::QuoteMeta(only)); } + BSONObj last = oplogReader.findOne(_ns.c_str(), Query(b.done()).sort(BSON("$natural" << -1))); + if (!last.isEmpty()) { + BSONElement ts = last.getField("ts"); + massert(10386, + "non Date ts found: " + last.toString(), + ts.type() == Date || ts.type() == Timestamp); + syncedTo = OpTime(ts.date()); + } +} - class ReplApplyBatchSize : public ServerParameter { - public: - ReplApplyBatchSize() - : ServerParameter( ServerParameterSet::getGlobal(), "replApplyBatchSize" ), - _value( 1 ) { - } +class ReplApplyBatchSize : public ServerParameter { +public: + ReplApplyBatchSize() + : ServerParameter(ServerParameterSet::getGlobal(), "replApplyBatchSize"), _value(1) {} + + int get() const { + return _value; + } + + virtual void append(OperationContext* txn, BSONObjBuilder& b, const string& name) { + b.append(name, _value); + } - int get() const { return _value; } + virtual Status set(const BSONElement& newValuElement) { + return set(newValuElement.numberInt()); + } - virtual void append(OperationContext* txn, BSONObjBuilder& b, const string& name) { - b.append( name, _value ); + virtual Status set(int b) { + if (b < 1 || b > 1024) { + return Status(ErrorCodes::BadValue, "replApplyBatchSize has to be >= 1 and < 1024"); } - virtual Status set( const BSONElement& newValuElement ) { - return set( newValuElement.numberInt() ); + const ReplSettings& replSettings = getGlobalReplicationCoordinator()->getSettings(); + if (replSettings.slavedelay != 0 && b > 1) { + return Status(ErrorCodes::BadValue, "can't use a batch size > 1 with slavedelay"); + } + if (!replSettings.slave) { + return Status(ErrorCodes::BadValue, + "can't set replApplyBatchSize 
on a non-slave machine"); } - virtual Status set( int b ) { - if( b < 1 || b > 1024 ) { - return Status( ErrorCodes::BadValue, - "replApplyBatchSize has to be >= 1 and < 1024" ); - } + _value = b; + return Status::OK(); + } - const ReplSettings& replSettings = getGlobalReplicationCoordinator()->getSettings(); - if ( replSettings.slavedelay != 0 && b > 1 ) { - return Status( ErrorCodes::BadValue, - "can't use a batch size > 1 with slavedelay" ); - } - if ( ! replSettings.slave ) { - return Status( ErrorCodes::BadValue, - "can't set replApplyBatchSize on a non-slave machine" ); - } + virtual Status setFromString(const string& str) { + return set(atoi(str.c_str())); + } - _value = b; - return Status::OK(); - } + int _value; - virtual Status setFromString( const string& str ) { - return set( atoi( str.c_str() ) ); - } +} replApplyBatchSize; - int _value; - - } replApplyBatchSize; - - /* slave: pull some data from the master's oplog - note: not yet in db mutex at this point. - @return -1 error - 0 ok, don't sleep - 1 ok, sleep - */ - int ReplSource::_sync_pullOpLog(OperationContext* txn, int& nApplied) { - int okResultCode = 1; - string ns = string("local.oplog.$") + sourceName(); - LOG(2) << "repl: sync_pullOpLog " << ns << " syncedTo:" << syncedTo.toStringLong() << '\n'; - - bool tailing = true; - oplogReader.tailCheck(); - - bool initial = syncedTo.isNull(); - - if ( !oplogReader.haveCursor() || initial ) { - if ( initial ) { - // Important to grab last oplog timestamp before listing databases. - syncToTailOfRemoteLog(); - BSONObj info; - bool ok = oplogReader.conn()->runCommand( "admin", BSON( "listDatabases" << 1 ), info ); - massert( 10389 , "Unable to get database list", ok ); - BSONObjIterator i( info.getField( "databases" ).embeddedObject() ); - while( i.moreWithEOO() ) { - BSONElement e = i.next(); - if ( e.eoo() ) - break; - string name = e.embeddedObject().getField( "name" ).valuestr(); - if ( !e.embeddedObject().getBoolField( "empty" ) ) { - if ( name != "local" ) { - if ( only.empty() || only == name ) { - LOG( 2 ) << "adding to 'addDbNextPass': " << name << endl; - addDbNextPass.insert( name ); - } +/* slave: pull some data from the master's oplog + note: not yet in db mutex at this point. + @return -1 error + 0 ok, don't sleep + 1 ok, sleep +*/ +int ReplSource::_sync_pullOpLog(OperationContext* txn, int& nApplied) { + int okResultCode = 1; + string ns = string("local.oplog.$") + sourceName(); + LOG(2) << "repl: sync_pullOpLog " << ns << " syncedTo:" << syncedTo.toStringLong() << '\n'; + + bool tailing = true; + oplogReader.tailCheck(); + + bool initial = syncedTo.isNull(); + + if (!oplogReader.haveCursor() || initial) { + if (initial) { + // Important to grab last oplog timestamp before listing databases. 
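            // (Why this order: syncedTo is pinned to the master's current tail
            // before the databases are enumerated, so a database created or a
            // write applied while the clone passes below are running has an
            // optime at or after syncedTo and is replayed by the tailing query;
            // enumerating first could silently miss such operations.)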
+ syncToTailOfRemoteLog(); + BSONObj info; + bool ok = oplogReader.conn()->runCommand("admin", BSON("listDatabases" << 1), info); + massert(10389, "Unable to get database list", ok); + BSONObjIterator i(info.getField("databases").embeddedObject()); + while (i.moreWithEOO()) { + BSONElement e = i.next(); + if (e.eoo()) + break; + string name = e.embeddedObject().getField("name").valuestr(); + if (!e.embeddedObject().getBoolField("empty")) { + if (name != "local") { + if (only.empty() || only == name) { + LOG(2) << "adding to 'addDbNextPass': " << name << endl; + addDbNextPass.insert(name); } } } - // obviously global isn't ideal, but non-repl set is old so - // keeping it simple - ScopedTransaction transaction(txn, MODE_X); - Lock::GlobalWrite lk(txn->lockState()); - save(txn); } + // obviously global isn't ideal, but non-repl set is old so + // keeping it simple + ScopedTransaction transaction(txn, MODE_X); + Lock::GlobalWrite lk(txn->lockState()); + save(txn); + } - BSONObjBuilder gte; - gte.appendTimestamp("$gte", syncedTo.asDate()); - BSONObjBuilder query; - query.append("ts", gte.done()); - if ( !only.empty() ) { - // note we may here skip a LOT of data table scanning, a lot of work for the master. - // maybe append "\\." here? - query.appendRegex("ns", string("^") + pcrecpp::RE::QuoteMeta( only )); - } - BSONObj queryObj = query.done(); - // e.g. queryObj = { ts: { $gte: syncedTo } } + BSONObjBuilder gte; + gte.appendTimestamp("$gte", syncedTo.asDate()); + BSONObjBuilder query; + query.append("ts", gte.done()); + if (!only.empty()) { + // note we may here skip a LOT of data table scanning, a lot of work for the master. + // maybe append "\\." here? + query.appendRegex("ns", string("^") + pcrecpp::RE::QuoteMeta(only)); + } + BSONObj queryObj = query.done(); + // e.g. 
queryObj = { ts: { $gte: syncedTo } } + + oplogReader.tailingQuery(ns.c_str(), queryObj); + tailing = false; + } else { + LOG(2) << "repl: tailing=true\n"; + } + + if (!oplogReader.haveCursor()) { + log() << "repl: dbclient::query returns null (conn closed?)" << endl; + oplogReader.resetConnection(); + return -1; + } - oplogReader.tailingQuery(ns.c_str(), queryObj); - tailing = false; + // show any deferred database creates from a previous pass + { + set<string>::iterator i = addDbNextPass.begin(); + if (i != addDbNextPass.end()) { + BSONObjBuilder b; + b.append("ns", *i + '.'); + b.append("op", "db"); + BSONObj op = b.done(); + _sync_pullOpLog_applyOperation(txn, op, false); } - else { - LOG(2) << "repl: tailing=true\n"; + } + + if (!oplogReader.more()) { + if (tailing) { + LOG(2) << "repl: tailing & no new activity\n"; + okResultCode = 0; // don't sleep + + } else { + log() << "repl: " << ns << " oplog is empty" << endl; + } + { + ScopedTransaction transaction(txn, MODE_X); + Lock::GlobalWrite lk(txn->lockState()); + save(txn); } + return okResultCode; + } - if( !oplogReader.haveCursor() ) { - log() << "repl: dbclient::query returns null (conn closed?)" << endl; - oplogReader.resetConnection(); - return -1; + OpTime nextOpTime; + { + BSONObj op = oplogReader.next(); + BSONElement ts = op.getField("ts"); + if (ts.type() != Date && ts.type() != Timestamp) { + string err = op.getStringField("$err"); + if (!err.empty()) { + // 13051 is "tailable cursor requested on non capped collection" + if (op.getIntField("code") == 13051) { + log() << "trying to slave off of a non-master" << '\n'; + massert(13344, "trying to slave off of a non-master", false); + } else { + log() << "repl: $err reading remote oplog: " + err << '\n'; + massert(10390, "got $err reading remote oplog", false); + } + } else { + log() << "repl: bad object read from remote oplog: " << op.toString() << '\n'; + massert(10391, "repl: bad object read from remote oplog", false); + } } - // show any deferred database creates from a previous pass - { - set<string>::iterator i = addDbNextPass.begin(); - if ( i != addDbNextPass.end() ) { - BSONObjBuilder b; - b.append("ns", *i + '.'); - b.append("op", "db"); - BSONObj op = b.done(); - _sync_pullOpLog_applyOperation(txn, op, false); + nextOpTime = OpTime(ts.date()); + LOG(2) << "repl: first op time received: " << nextOpTime.toString() << '\n'; + if (initial) { + LOG(1) << "repl: initial run\n"; + } + if (tailing) { + if (!(syncedTo < nextOpTime)) { + log() << "repl ASSERTION failed : syncedTo < nextOpTime" << endl; + log() << "repl syncTo: " << syncedTo.toStringLong() << endl; + log() << "repl nextOpTime: " << nextOpTime.toStringLong() << endl; + verify(false); } + oplogReader.putBack(op); // op will be processed in the loop below + nextOpTime = OpTime(); // will reread the op below + } else if (nextOpTime != syncedTo) { // didn't get what we queried for - error + log() << "repl: nextOpTime " << nextOpTime.toStringLong() << ' ' + << ((nextOpTime < syncedTo) ? "<??" : ">") << " syncedTo " + << syncedTo.toStringLong() << '\n' + << "repl: time diff: " << (nextOpTime.getSecs() - syncedTo.getSecs()) << "sec\n" + << "repl: tailing: " << tailing << '\n' + << "repl: data too stale, halting replication" << endl; + replInfo = replAllDead = "data too stale halted replication"; + verify(syncedTo < nextOpTime); + throw SyncException(); + } else { + /* t == syncedTo, so the first op was applied previously or it is the first op of initial query and need not be applied. 
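               In this branch the op is intentionally not put back: the reader
               has already consumed it, so the apply loop below starts at the
               next operation and the duplicate at syncedTo is skipped rather
               than re-applied.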
*/ } + } - if ( !oplogReader.more() ) { - if ( tailing ) { - LOG(2) << "repl: tailing & no new activity\n"; - okResultCode = 0; // don't sleep + // apply operations + { + int n = 0; + time_t saveLast = time(0); + while (1) { + // we need "&& n" to assure we actually process at least one op to get a sync + // point recorded in the first place. + const bool moreInitialSyncsPending = !addDbNextPass.empty() && n; - } - else { - log() << "repl: " << ns << " oplog is empty" << endl; - } - { + if (moreInitialSyncsPending || !oplogReader.more()) { ScopedTransaction transaction(txn, MODE_X); Lock::GlobalWrite lk(txn->lockState()); - save(txn); - } - return okResultCode; - } - OpTime nextOpTime; - { - BSONObj op = oplogReader.next(); - BSONElement ts = op.getField("ts"); - if ( ts.type() != Date && ts.type() != Timestamp ) { - string err = op.getStringField("$err"); - if ( !err.empty() ) { - // 13051 is "tailable cursor requested on non capped collection" - if (op.getIntField("code") == 13051) { - log() << "trying to slave off of a non-master" << '\n'; - massert( 13344 , "trying to slave off of a non-master", false ); - } - else { - log() << "repl: $err reading remote oplog: " + err << '\n'; - massert( 10390 , "got $err reading remote oplog", false ); - } + if (tailing) { + okResultCode = 0; // don't sleep } - else { - log() << "repl: bad object read from remote oplog: " << op.toString() << '\n'; - massert( 10391 , "repl: bad object read from remote oplog", false); - } - } - nextOpTime = OpTime( ts.date() ); - LOG(2) << "repl: first op time received: " << nextOpTime.toString() << '\n'; - if ( initial ) { - LOG(1) << "repl: initial run\n"; - } - if( tailing ) { - if( !( syncedTo < nextOpTime ) ) { - log() << "repl ASSERTION failed : syncedTo < nextOpTime" << endl; - log() << "repl syncTo: " << syncedTo.toStringLong() << endl; - log() << "repl nextOpTime: " << nextOpTime.toStringLong() << endl; - verify(false); - } - oplogReader.putBack( op ); // op will be processed in the loop below - nextOpTime = OpTime(); // will reread the op below - } - else if ( nextOpTime != syncedTo ) { // didn't get what we queried for - error - log() - << "repl: nextOpTime " << nextOpTime.toStringLong() << ' ' - << ((nextOpTime < syncedTo) ? "<??" : ">") - << " syncedTo " << syncedTo.toStringLong() << '\n' - << "repl: time diff: " << (nextOpTime.getSecs() - syncedTo.getSecs()) - << "sec\n" - << "repl: tailing: " << tailing << '\n' - << "repl: data too stale, halting replication" << endl; - replInfo = replAllDead = "data too stale halted replication"; - verify( syncedTo < nextOpTime ); - throw SyncException(); + syncedTo = nextOpTime; + save(txn); // note how far we are synced up to now + nApplied = n; + break; } - else { - /* t == syncedTo, so the first op was applied previously or it is the first op of initial query and need not be applied. 
*/ + + OCCASIONALLY if (n > 0 && (n > 100000 || time(0) - saveLast > 60)) { + // periodically note our progress, in case we are doing a lot of work and crash + ScopedTransaction transaction(txn, MODE_X); + Lock::GlobalWrite lk(txn->lockState()); + syncedTo = nextOpTime; + // can't update local log ts since there are pending operations from our peer + save(txn); + log() << "repl: checkpoint applied " << n << " operations" << endl; + log() << "repl: syncedTo: " << syncedTo.toStringLong() << endl; + saveLast = time(0); + n = 0; } - } - // apply operations - { - int n = 0; - time_t saveLast = time(0); - while ( 1 ) { - // we need "&& n" to assure we actually process at least one op to get a sync - // point recorded in the first place. - const bool moreInitialSyncsPending = !addDbNextPass.empty() && n; - - if ( moreInitialSyncsPending || !oplogReader.more() ) { - ScopedTransaction transaction(txn, MODE_X); - Lock::GlobalWrite lk(txn->lockState()); - - if (tailing) { - okResultCode = 0; // don't sleep - } + BSONObj op = oplogReader.next(); - syncedTo = nextOpTime; - save(txn); // note how far we are synced up to now - nApplied = n; - break; + int b = replApplyBatchSize.get(); + bool justOne = b == 1; + scoped_ptr<Lock::GlobalWrite> lk(justOne ? 0 : new Lock::GlobalWrite(txn->lockState())); + while (1) { + BSONElement ts = op.getField("ts"); + if (!(ts.type() == Date || ts.type() == Timestamp)) { + log() << "sync error: problem querying remote oplog record" << endl; + log() << "op: " << op.toString() << endl; + log() << "halting replication" << endl; + replInfo = replAllDead = "sync error: no ts found querying remote oplog record"; + throw SyncException(); } - - OCCASIONALLY if( n > 0 && ( n > 100000 || time(0) - saveLast > 60 ) ) { - // periodically note our progress, in case we are doing a lot of work and crash + OpTime last = nextOpTime; + nextOpTime = OpTime(ts.date()); + if (!(last < nextOpTime)) { + log() << "sync error: last applied optime at slave >= nextOpTime from master" + << endl; + log() << " last: " << last.toStringLong() << endl; + log() << " nextOpTime: " << nextOpTime.toStringLong() << endl; + log() << " halting replication" << endl; + replInfo = replAllDead = "sync error last >= nextOpTime"; + uassert( + 10123, + "replication error last applied optime at slave >= nextOpTime from master", + false); + } + const ReplSettings& replSettings = getGlobalReplicationCoordinator()->getSettings(); + if (replSettings.slavedelay && + (unsigned(time(0)) < nextOpTime.getSecs() + replSettings.slavedelay)) { + verify(justOne); + oplogReader.putBack(op); + _sleepAdviceTime = nextOpTime.getSecs() + replSettings.slavedelay + 1; ScopedTransaction transaction(txn, MODE_X); Lock::GlobalWrite lk(txn->lockState()); - syncedTo = nextOpTime; - // can't update local log ts since there are pending operations from our peer - save(txn); - log() << "repl: checkpoint applied " << n << " operations" << endl; + if (n > 0) { + syncedTo = last; + save(txn); + } + log() << "repl: applied " << n << " operations" << endl; log() << "repl: syncedTo: " << syncedTo.toStringLong() << endl; - saveLast = time(0); - n = 0; + log() << "waiting until: " << _sleepAdviceTime << " to continue" << endl; + return okResultCode; } - BSONObj op = oplogReader.next(); - - int b = replApplyBatchSize.get(); - bool justOne = b == 1; - scoped_ptr<Lock::GlobalWrite> lk(justOne ? 
0 : new Lock::GlobalWrite(txn->lockState())); - while( 1 ) { - - BSONElement ts = op.getField("ts"); - if( !( ts.type() == Date || ts.type() == Timestamp ) ) { - log() << "sync error: problem querying remote oplog record" << endl; - log() << "op: " << op.toString() << endl; - log() << "halting replication" << endl; - replInfo = replAllDead = "sync error: no ts found querying remote oplog record"; - throw SyncException(); - } - OpTime last = nextOpTime; - nextOpTime = OpTime( ts.date() ); - if ( !( last < nextOpTime ) ) { - log() << "sync error: last applied optime at slave >= nextOpTime from master" << endl; - log() << " last: " << last.toStringLong() << endl; - log() << " nextOpTime: " << nextOpTime.toStringLong() << endl; - log() << " halting replication" << endl; - replInfo = replAllDead = "sync error last >= nextOpTime"; - uassert( 10123 , "replication error last applied optime at slave >= nextOpTime from master", false); - } - const ReplSettings& replSettings = - getGlobalReplicationCoordinator()->getSettings(); - if ( replSettings.slavedelay && ( unsigned( time( 0 ) ) < nextOpTime.getSecs() + replSettings.slavedelay ) ) { - verify( justOne ); - oplogReader.putBack( op ); - _sleepAdviceTime = nextOpTime.getSecs() + replSettings.slavedelay + 1; - ScopedTransaction transaction(txn, MODE_X); - Lock::GlobalWrite lk(txn->lockState()); - if ( n > 0 ) { - syncedTo = last; - save(txn); - } - log() << "repl: applied " << n << " operations" << endl; - log() << "repl: syncedTo: " << syncedTo.toStringLong() << endl; - log() << "waiting until: " << _sleepAdviceTime << " to continue" << endl; - return okResultCode; - } - - _sync_pullOpLog_applyOperation(txn, op, !justOne); - n++; + _sync_pullOpLog_applyOperation(txn, op, !justOne); + n++; - if( --b == 0 ) - break; - // if to here, we are doing mulpile applications in a singel write lock acquisition - if( !oplogReader.moreInCurrentBatch() ) { - // break if no more in batch so we release lock while reading from the master - break; - } - op = oplogReader.next(); + if (--b == 0) + break; + // if to here, we are doing mulpile applications in a singel write lock acquisition + if (!oplogReader.moreInCurrentBatch()) { + // break if no more in batch so we release lock while reading from the master + break; } + op = oplogReader.next(); } } - - return okResultCode; } + return okResultCode; +} - /* note: not yet in mutex at this point. - returns >= 0 if ok. return -1 if you want to reconnect. - return value of zero indicates no sleep necessary before next call - */ - int ReplSource::sync(OperationContext* txn, int& nApplied) { - _sleepAdviceTime = 0; - ReplInfo r("sync"); - if (!serverGlobalParams.quiet) { - LogstreamBuilder l = log(); - l << "repl: syncing from "; - if( sourceName() != "main" ) { - l << "source:" << sourceName() << ' '; - } - l << "host:" << hostName << endl; - } - nClonedThisPass = 0; - - // FIXME Handle cases where this db isn't on default port, or default port is spec'd in hostName. - if ((string("localhost") == hostName || string("127.0.0.1") == hostName) && - serverGlobalParams.port == ServerGlobalParams::DefaultDBPort) { - log() << "repl: can't sync from self (localhost). sources configuration may be wrong." << endl; - sleepsecs(5); - return -1; - } - if ( !_connect(&oplogReader, - HostAndPort(hostName), - getGlobalReplicationCoordinator()->getMyRID()) ) { - LOG(4) << "repl: can't connect to sync source" << endl; - return -1; - } +/* note: not yet in mutex at this point. + returns >= 0 if ok. return -1 if you want to reconnect. 
+ return value of zero indicates no sleep necessary before next call +*/ +int ReplSource::sync(OperationContext* txn, int& nApplied) { + _sleepAdviceTime = 0; + ReplInfo r("sync"); + if (!serverGlobalParams.quiet) { + LogstreamBuilder l = log(); + l << "repl: syncing from "; + if (sourceName() != "main") { + l << "source:" << sourceName() << ' '; + } + l << "host:" << hostName << endl; + } + nClonedThisPass = 0; + + // FIXME Handle cases where this db isn't on default port, or default port is spec'd in hostName. + if ((string("localhost") == hostName || string("127.0.0.1") == hostName) && + serverGlobalParams.port == ServerGlobalParams::DefaultDBPort) { + log() << "repl: can't sync from self (localhost). sources configuration may be wrong." + << endl; + sleepsecs(5); + return -1; + } - return _sync_pullOpLog(txn, nApplied); + if (!_connect( + &oplogReader, HostAndPort(hostName), getGlobalReplicationCoordinator()->getMyRID())) { + LOG(4) << "repl: can't connect to sync source" << endl; + return -1; } - /* --------------------------------------------------------------*/ + return _sync_pullOpLog(txn, nApplied); +} - static bool _replMainStarted = false; +/* --------------------------------------------------------------*/ - /* - TODO: - _ source has autoptr to the cursor - _ reuse that cursor when we can - */ +static bool _replMainStarted = false; - /* returns: # of seconds to sleep before next pass - 0 = no sleep recommended - 1 = special sentinel indicating adaptive sleep recommended - */ - int _replMain(OperationContext* txn, ReplSource::SourceVector& sources, int& nApplied) { - { - ReplInfo r("replMain load sources"); - ScopedTransaction transaction(txn, MODE_X); - Lock::GlobalWrite lk(txn->lockState()); - ReplSource::loadAll(txn, sources); +/* +TODO: +_ source has autoptr to the cursor +_ reuse that cursor when we can +*/ - // only need this param for initial reset - _replMainStarted = true; - } +/* returns: # of seconds to sleep before next pass + 0 = no sleep recommended + 1 = special sentinel indicating adaptive sleep recommended +*/ +int _replMain(OperationContext* txn, ReplSource::SourceVector& sources, int& nApplied) { + { + ReplInfo r("replMain load sources"); + ScopedTransaction transaction(txn, MODE_X); + Lock::GlobalWrite lk(txn->lockState()); + ReplSource::loadAll(txn, sources); - if ( sources.empty() ) { - /* replication is not configured yet (for --slave) in local.sources. Poll for config it - every 20 seconds. 
- */ - log() << "no source given, add a master to local.sources to start replication" << endl; - return 20; - } + // only need this param for initial reset + _replMainStarted = true; + } - int sleepAdvice = 1; - for ( ReplSource::SourceVector::iterator i = sources.begin(); i != sources.end(); i++ ) { - ReplSource *s = i->get(); - int res = -1; - try { - res = s->sync(txn, nApplied); - bool moreToSync = s->haveMoreDbsToSync(); - if( res < 0 ) { - sleepAdvice = 3; - } - else if( moreToSync ) { - sleepAdvice = 0; - } - else if ( s->sleepAdvice() ) { - sleepAdvice = s->sleepAdvice(); - } - else - sleepAdvice = res; - } - catch ( const SyncException& ) { - log() << "caught SyncException" << endl; - return 10; - } - catch ( AssertionException& e ) { - if ( e.severe() ) { - log() << "replMain AssertionException " << e.what() << endl; - return 60; - } - else { - log() << "repl: AssertionException " << e.what() << endl; - } - replInfo = "replMain caught AssertionException"; - } - catch ( const DBException& e ) { - log() << "repl: DBException " << e.what() << endl; - replInfo = "replMain caught DBException"; - } - catch ( const std::exception &e ) { - log() << "repl: std::exception " << e.what() << endl; - replInfo = "replMain caught std::exception"; - } - catch ( ... ) { - log() << "unexpected exception during replication. replication will halt" << endl; - replAllDead = "caught unexpected exception during replication"; - } - if ( res < 0 ) - s->oplogReader.resetConnection(); - } - return sleepAdvice; + if (sources.empty()) { + /* replication is not configured yet (for --slave) in local.sources. Poll for config it + every 20 seconds. + */ + log() << "no source given, add a master to local.sources to start replication" << endl; + return 20; } - static void replMain(OperationContext* txn) { - ReplSource::SourceVector sources; - while ( 1 ) { - int s = 0; - { - ScopedTransaction transaction(txn, MODE_X); - Lock::GlobalWrite lk(txn->lockState()); - if ( replAllDead ) { - // throttledForceResyncDead can throw - if ( !getGlobalReplicationCoordinator()->getSettings().autoresync || - !ReplSource::throttledForceResyncDead( txn, "auto" ) ) { - log() << "all sources dead: " << replAllDead << ", sleeping for 5 seconds" << endl; - break; - } - } - verify( syncing == 0 ); // i.e., there is only one sync thread running. we will want to change/fix this. - syncing++; + int sleepAdvice = 1; + for (ReplSource::SourceVector::iterator i = sources.begin(); i != sources.end(); i++) { + ReplSource* s = i->get(); + int res = -1; + try { + res = s->sync(txn, nApplied); + bool moreToSync = s->haveMoreDbsToSync(); + if (res < 0) { + sleepAdvice = 3; + } else if (moreToSync) { + sleepAdvice = 0; + } else if (s->sleepAdvice()) { + sleepAdvice = s->sleepAdvice(); + } else + sleepAdvice = res; + } catch (const SyncException&) { + log() << "caught SyncException" << endl; + return 10; + } catch (AssertionException& e) { + if (e.severe()) { + log() << "replMain AssertionException " << e.what() << endl; + return 60; + } else { + log() << "repl: AssertionException " << e.what() << endl; } + replInfo = "replMain caught AssertionException"; + } catch (const DBException& e) { + log() << "repl: DBException " << e.what() << endl; + replInfo = "replMain caught DBException"; + } catch (const std::exception& e) { + log() << "repl: std::exception " << e.what() << endl; + replInfo = "replMain caught std::exception"; + } catch (...) { + log() << "unexpected exception during replication. 
replication will halt" << endl; + replAllDead = "caught unexpected exception during replication"; + } + if (res < 0) + s->oplogReader.resetConnection(); + } + return sleepAdvice; +} - try { - int nApplied = 0; - s = _replMain(txn, sources, nApplied); - if( s == 1 ) { - if( nApplied == 0 ) s = 2; - else if( nApplied > 100 ) { - // sleep very little - just enough that we aren't truly hammering master - sleepmillis(75); - s = 0; - } +static void replMain(OperationContext* txn) { + ReplSource::SourceVector sources; + while (1) { + int s = 0; + { + ScopedTransaction transaction(txn, MODE_X); + Lock::GlobalWrite lk(txn->lockState()); + if (replAllDead) { + // throttledForceResyncDead can throw + if (!getGlobalReplicationCoordinator()->getSettings().autoresync || + !ReplSource::throttledForceResyncDead(txn, "auto")) { + log() << "all sources dead: " << replAllDead << ", sleeping for 5 seconds" + << endl; + break; } } - catch (...) { - log() << "caught exception in _replMain" << endl; - s = 4; - } - - { - ScopedTransaction transaction(txn, MODE_X); - Lock::GlobalWrite lk(txn->lockState()); - verify( syncing == 1 ); - syncing--; - } - - if( relinquishSyncingSome ) { - relinquishSyncingSome = 0; - s = 1; // sleep before going back in to syncing=1 - } + verify( + syncing == + 0); // i.e., there is only one sync thread running. we will want to change/fix this. + syncing++; + } - if ( s ) { - stringstream ss; - ss << "repl: sleep " << s << " sec before next pass"; - string msg = ss.str(); - if (!serverGlobalParams.quiet) - log() << msg << endl; - ReplInfo r(msg.c_str()); - sleepsecs(s); + try { + int nApplied = 0; + s = _replMain(txn, sources, nApplied); + if (s == 1) { + if (nApplied == 0) + s = 2; + else if (nApplied > 100) { + // sleep very little - just enough that we aren't truly hammering master + sleepmillis(75); + s = 0; + } } + } catch (...) { + log() << "caught exception in _replMain" << endl; + s = 4; } - } - - static void replMasterThread() { - sleepsecs(4); - Client::initThread("replmaster"); - int toSleep = 10; - while( 1 ) { - sleepsecs(toSleep); - // Write a keep-alive like entry to the log. This will make things like - // printReplicationStatus() and printSlaveReplicationStatus() stay up-to-date even - // when things are idle. - OperationContextImpl txn; - txn.getClient()->getAuthorizationSession()->grantInternalAuthorization(); + { + ScopedTransaction transaction(txn, MODE_X); + Lock::GlobalWrite lk(txn->lockState()); + verify(syncing == 1); + syncing--; + } - Lock::GlobalWrite globalWrite(txn.lockState(), 1); - if (globalWrite.isLocked()) { - toSleep = 10; + if (relinquishSyncingSome) { + relinquishSyncingSome = 0; + s = 1; // sleep before going back in to syncing=1 + } - try { - WriteUnitOfWork wuow(&txn); - logKeepalive(&txn); - wuow.commit(); - } - catch (...) { - log() << "caught exception in replMasterThread()" << endl; - } - } - else { - LOG(5) << "couldn't logKeepalive" << endl; - toSleep = 1; - } + if (s) { + stringstream ss; + ss << "repl: sleep " << s << " sec before next pass"; + string msg = ss.str(); + if (!serverGlobalParams.quiet) + log() << msg << endl; + ReplInfo r(msg.c_str()); + sleepsecs(s); } } - - static void replSlaveThread() { - sleepsecs(1); - Client::initThread("replslave"); - +} + +static void replMasterThread() { + sleepsecs(4); + Client::initThread("replmaster"); + int toSleep = 10; + while (1) { + sleepsecs(toSleep); + + // Write a keep-alive like entry to the log. 
This will make things like + // printReplicationStatus() and printSlaveReplicationStatus() stay up-to-date even + // when things are idle. OperationContextImpl txn; txn.getClient()->getAuthorizationSession()->grantInternalAuthorization(); - while ( 1 ) { + Lock::GlobalWrite globalWrite(txn.lockState(), 1); + if (globalWrite.isLocked()) { + toSleep = 10; + try { - replMain(&txn); - sleepsecs(5); - } - catch ( AssertionException& ) { - ReplInfo r("Assertion in replSlaveThread(): sleeping 5 minutes before retry"); - log() << "Assertion in replSlaveThread(): sleeping 5 minutes before retry" << endl; - sleepsecs(300); - } - catch ( DBException& e ) { - log() << "exception in replSlaveThread(): " << e.what() - << ", sleeping 5 minutes before retry" << endl; - sleepsecs(300); - } - catch ( ... ) { - log() << "error in replSlaveThread(): sleeping 5 minutes before retry" << endl; - sleepsecs(300); + WriteUnitOfWork wuow(&txn); + logKeepalive(&txn); + wuow.commit(); + } catch (...) { + log() << "caught exception in replMasterThread()" << endl; } + } else { + LOG(5) << "couldn't logKeepalive" << endl; + toSleep = 1; } } +} - void startMasterSlave(OperationContext* txn) { - - oldRepl(); - - const ReplSettings& replSettings = getGlobalReplicationCoordinator()->getSettings(); - if( !replSettings.slave && !replSettings.master ) - return; +static void replSlaveThread() { + sleepsecs(1); + Client::initThread("replslave"); - txn->getClient()->getAuthorizationSession()->grantInternalAuthorization(); + OperationContextImpl txn; + txn.getClient()->getAuthorizationSession()->grantInternalAuthorization(); - { - ReplSource temp(txn); // Ensures local.me is populated + while (1) { + try { + replMain(&txn); + sleepsecs(5); + } catch (AssertionException&) { + ReplInfo r("Assertion in replSlaveThread(): sleeping 5 minutes before retry"); + log() << "Assertion in replSlaveThread(): sleeping 5 minutes before retry" << endl; + sleepsecs(300); + } catch (DBException& e) { + log() << "exception in replSlaveThread(): " << e.what() + << ", sleeping 5 minutes before retry" << endl; + sleepsecs(300); + } catch (...) 
{ + log() << "error in replSlaveThread(): sleeping 5 minutes before retry" << endl; + sleepsecs(300); } + } +} - if ( replSettings.slave ) { - verify( replSettings.slave == SimpleSlave ); - LOG(1) << "slave=true" << endl; - boost::thread repl_thread(replSlaveThread); - } +void startMasterSlave(OperationContext* txn) { + oldRepl(); - if ( replSettings.master ) { - LOG(1) << "master=true" << endl; - createOplog(txn); - boost::thread t(replMasterThread); - } + const ReplSettings& replSettings = getGlobalReplicationCoordinator()->getSettings(); + if (!replSettings.slave && !replSettings.master) + return; - if (replSettings.fastsync) { - while(!_replMainStarted) // don't allow writes until we've set up from log - sleepmillis( 50 ); - } - } - int _dummy_z; + txn->getClient()->getAuthorizationSession()->grantInternalAuthorization(); - void pretouchN(vector<BSONObj>& v, unsigned a, unsigned b) { - Client *c = currentClient.get(); - if( c == 0 ) { - Client::initThread("pretouchN"); - c = &cc(); - } + { + ReplSource temp(txn); // Ensures local.me is populated + } - OperationContextImpl txn; // XXX - ScopedTransaction transaction(&txn, MODE_S); - Lock::GlobalRead lk(txn.lockState()); - - for( unsigned i = a; i <= b; i++ ) { - const BSONObj& op = v[i]; - const char *which = "o"; - const char *opType = op.getStringField("op"); - if ( *opType == 'i' ) - ; - else if( *opType == 'u' ) - which = "o2"; - else - continue; - /* todo : other operations */ + if (replSettings.slave) { + verify(replSettings.slave == SimpleSlave); + LOG(1) << "slave=true" << endl; + boost::thread repl_thread(replSlaveThread); + } - try { - BSONObj o = op.getObjectField(which); - BSONElement _id; - if( o.getObjectID(_id) ) { - const char *ns = op.getStringField("ns"); - BSONObjBuilder b; - b.append(_id); - BSONObj result; - Client::Context ctx(&txn, ns); - if( Helpers::findById(&txn, ctx.db(), ns, b.done(), result) ) - _dummy_z += result.objsize(); // touch - } - } - catch( DBException& e ) { - log() << "ignoring assertion in pretouchN() " << a << ' ' << b << ' ' << i << ' ' << e.toString() << endl; - } - } + if (replSettings.master) { + LOG(1) << "master=true" << endl; + createOplog(txn); + boost::thread t(replMasterThread); } - void pretouchOperation(OperationContext* txn, const BSONObj& op) { + if (replSettings.fastsync) { + while (!_replMainStarted) // don't allow writes until we've set up from log + sleepmillis(50); + } +} +int _dummy_z; + +void pretouchN(vector<BSONObj>& v, unsigned a, unsigned b) { + Client* c = currentClient.get(); + if (c == 0) { + Client::initThread("pretouchN"); + c = &cc(); + } - if (txn->lockState()->isWriteLocked()) { - return; // no point pretouching if write locked. not sure if this will ever fire, but just in case. 
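            // (Pretouching exists to fault in, by _id and outside the write
            // lock, the document an oplog op is about to modify; once the write
            // lock is already held there is nothing left to overlap, hence the
            // early return.)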
- } + OperationContextImpl txn; // XXX + ScopedTransaction transaction(&txn, MODE_S); + Lock::GlobalRead lk(txn.lockState()); - const char *which = "o"; - const char *opType = op.getStringField("op"); - if ( *opType == 'i' ) + for (unsigned i = a; i <= b; i++) { + const BSONObj& op = v[i]; + const char* which = "o"; + const char* opType = op.getStringField("op"); + if (*opType == 'i') ; - else if( *opType == 'u' ) + else if (*opType == 'u') which = "o2"; else - return; + continue; /* todo : other operations */ try { BSONObj o = op.getObjectField(which); BSONElement _id; - if( o.getObjectID(_id) ) { - const char *ns = op.getStringField("ns"); + if (o.getObjectID(_id)) { + const char* ns = op.getStringField("ns"); BSONObjBuilder b; b.append(_id); BSONObj result; - AutoGetCollectionForRead ctx(txn, ns ); - if (Helpers::findById(txn, ctx.getDb(), ns, b.done(), result)) { - _dummy_z += result.objsize(); // touch - } + Client::Context ctx(&txn, ns); + if (Helpers::findById(&txn, ctx.db(), ns, b.done(), result)) + _dummy_z += result.objsize(); // touch } + } catch (DBException& e) { + log() << "ignoring assertion in pretouchN() " << a << ' ' << b << ' ' << i << ' ' + << e.toString() << endl; } - catch( DBException& ) { - log() << "ignoring assertion in pretouchOperation()" << endl; + } +} + +void pretouchOperation(OperationContext* txn, const BSONObj& op) { + if (txn->lockState()->isWriteLocked()) { + return; // no point pretouching if write locked. not sure if this will ever fire, but just in case. + } + + const char* which = "o"; + const char* opType = op.getStringField("op"); + if (*opType == 'i') + ; + else if (*opType == 'u') + which = "o2"; + else + return; + /* todo : other operations */ + + try { + BSONObj o = op.getObjectField(which); + BSONElement _id; + if (o.getObjectID(_id)) { + const char* ns = op.getStringField("ns"); + BSONObjBuilder b; + b.append(_id); + BSONObj result; + AutoGetCollectionForRead ctx(txn, ns); + if (Helpers::findById(txn, ctx.getDb(), ns, b.done(), result)) { + _dummy_z += result.objsize(); // touch + } } + } catch (DBException&) { + log() << "ignoring assertion in pretouchOperation()" << endl; } +} -} // namespace repl -} // namespace mongo +} // namespace repl +} // namespace mongo diff --git a/src/mongo/db/repl/master_slave.h b/src/mongo/db/repl/master_slave.h index 117e26d08a1..74e509302f7 100644 --- a/src/mongo/db/repl/master_slave.h +++ b/src/mongo/db/repl/master_slave.h @@ -42,154 +42,165 @@ */ namespace mongo { - namespace threadpool { - class ThreadPool; - } +namespace threadpool { +class ThreadPool; +} - class Database; - class OperationContext; +class Database; +class OperationContext; namespace repl { - // Main entry point for master/slave at startup time. - void startMasterSlave(OperationContext* txn); +// Main entry point for master/slave at startup time. +void startMasterSlave(OperationContext* txn); + +// externed for use with resync.cpp +extern volatile int relinquishSyncingSome; +extern volatile int syncing; + +extern const char* replInfo; + +/* A replication exception */ +class SyncException : public DBException { +public: + SyncException() : DBException("sync exception", 10001) {} +}; + +/* A Source is a source from which we can pull (replicate) data. + stored in collection local.sources. - // externed for use with resync.cpp - extern volatile int relinquishSyncingSome; - extern volatile int syncing; + Can be a group of things to replicate for several databases. 
- extern const char *replInfo; + { host: ..., source: ..., only: ..., syncedTo: ..., dbsNextPass: { ... }, incompleteCloneDbs: { ... } } - /* A replication exception */ - class SyncException : public DBException { - public: - SyncException() : DBException( "sync exception" , 10001 ) {} - }; + 'source' defaults to 'main'; support for multiple source names is + not done (always use main for now). +*/ +class ReplSource { + boost::shared_ptr<threadpool::ThreadPool> tp; - /* A Source is a source from which we can pull (replicate) data. - stored in collection local.sources. + void resync(OperationContext* txn, const std::string& dbName); - Can be a group of things to replicate for several databases. + /** @param alreadyLocked caller already put us in write lock if true */ + void _sync_pullOpLog_applyOperation(OperationContext* txn, BSONObj& op, bool alreadyLocked); - { host: ..., source: ..., only: ..., syncedTo: ..., dbsNextPass: { ... }, incompleteCloneDbs: { ... } } + /* pull some operations from the master's oplog, and apply them. + calls sync_pullOpLog_applyOperation + */ + int _sync_pullOpLog(OperationContext* txn, int& nApplied); - 'source' defaults to 'main'; support for multiple source names is - not done (always use main for now). + /* we only clone one database per pass, even if a lot need done. This helps us + avoid overflowing the master's transaction log by doing too much work before going + back to read more transactions. (Imagine a scenario of slave startup where we try to + clone 100 databases in one pass.) */ - class ReplSource { - boost::shared_ptr<threadpool::ThreadPool> tp; - - void resync(OperationContext* txn, const std::string& dbName); - - /** @param alreadyLocked caller already put us in write lock if true */ - void _sync_pullOpLog_applyOperation(OperationContext* txn, BSONObj& op, bool alreadyLocked); - - /* pull some operations from the master's oplog, and apply them. - calls sync_pullOpLog_applyOperation - */ - int _sync_pullOpLog(OperationContext* txn, int& nApplied); - - /* we only clone one database per pass, even if a lot need done. This helps us - avoid overflowing the master's transaction log by doing too much work before going - back to read more transactions. (Imagine a scenario of slave startup where we try to - clone 100 databases in one pass.) - */ - std::set<std::string> addDbNextPass; - - std::set<std::string> incompleteCloneDbs; - - /// TODO(spencer): Remove this once the LegacyReplicationCoordinator is gone. - BSONObj _me; - - void resyncDrop( OperationContext* txn, const std::string& db ); - // call without the db mutex - void syncToTailOfRemoteLog(); - std::string ns() const { return std::string( "local.oplog.$" ) + sourceName(); } - unsigned _sleepAdviceTime; - - /** - * If 'db' is a new database and its name would conflict with that of - * an existing database, synchronize these database names with the - * master. - * @return true iff an op with the specified ns may be applied. - */ - bool handleDuplicateDbName( OperationContext* txn, - const BSONObj &op, - const char* ns, - const char* db ); - - // populates _me so that it can be passed to oplogreader for handshakes - /// TODO(spencer): Remove this function once the LegacyReplicationCoordinator is gone. 
- void ensureMe(OperationContext* txn); - - void forceResync(OperationContext* txn, const char *requester); - - bool _connect(OplogReader* reader, const HostAndPort& host, const OID& myRID); - public: - OplogReader oplogReader; - - void applyOperation(OperationContext* txn, Database* db, const BSONObj& op); - std::string hostName; // ip addr or hostname plus optionally, ":<port>" - std::string _sourceName; // a logical source name. - std::string sourceName() const { return _sourceName.empty() ? "main" : _sourceName; } - std::string only; // only a certain db. note that in the sources collection, this may not be changed once you start replicating. - - /* the last time point we have already synced up to (in the remote/master's oplog). */ - OpTime syncedTo; - - int nClonedThisPass; - - typedef std::vector< boost::shared_ptr< ReplSource > > SourceVector; - static void loadAll(OperationContext* txn, SourceVector&); - - explicit ReplSource(OperationContext* txn, BSONObj); - // This is not the constructor you are looking for. Always prefer the version that takes - // a BSONObj. This is public only as a hack so that the ReplicationCoordinator can find - // out the process's RID in master/slave setups. - ReplSource(OperationContext* txn); - - /* -1 = error */ - int sync(OperationContext* txn, int& nApplied); - - void save(OperationContext* txn); // write ourself to local.sources - - // make a jsobj from our member fields of the form - // { host: ..., source: ..., syncedTo: ... } - BSONObj jsobj(); - - bool operator==(const ReplSource&r) const { - return hostName == r.hostName && sourceName() == r.sourceName(); - } - std::string toString() const { return sourceName() + "@" + hostName; } - - bool haveMoreDbsToSync() const { return !addDbNextPass.empty(); } - int sleepAdvice() const { - if ( !_sleepAdviceTime ) - return 0; - int wait = _sleepAdviceTime - unsigned( time( 0 ) ); - return wait > 0 ? wait : 0; - } - - static bool throttledForceResyncDead( OperationContext* txn, const char *requester ); - static void forceResyncDead( OperationContext* txn, const char *requester ); - }; + std::set<std::string> addDbNextPass; + + std::set<std::string> incompleteCloneDbs; + + /// TODO(spencer): Remove this once the LegacyReplicationCoordinator is gone. + BSONObj _me; + + void resyncDrop(OperationContext* txn, const std::string& db); + // call without the db mutex + void syncToTailOfRemoteLog(); + std::string ns() const { + return std::string("local.oplog.$") + sourceName(); + } + unsigned _sleepAdviceTime; /** - * Helper class used to set and query an ignore state for a named database. - * The ignore state will expire after a specified OpTime. + * If 'db' is a new database and its name would conflict with that of + * an existing database, synchronize these database names with the + * master. + * @return true iff an op with the specified ns may be applied. */ - class DatabaseIgnorer { - public: - /** Indicate that operations for 'db' should be ignored until after 'futureOplogTime' */ - void doIgnoreUntilAfter( const std::string &db, const OpTime &futureOplogTime ); - /** - * Query ignore state of 'db'; if 'currentOplogTime' is after the ignore - * limit, the ignore state will be cleared. 
- */ - bool ignoreAt( const std::string &db, const OpTime ¤tOplogTime ); - private: - std::map< std::string, OpTime > _ignores; - }; - -} // namespace repl -} // namespace mongo + bool handleDuplicateDbName(OperationContext* txn, + const BSONObj& op, + const char* ns, + const char* db); + + // populates _me so that it can be passed to oplogreader for handshakes + /// TODO(spencer): Remove this function once the LegacyReplicationCoordinator is gone. + void ensureMe(OperationContext* txn); + + void forceResync(OperationContext* txn, const char* requester); + + bool _connect(OplogReader* reader, const HostAndPort& host, const OID& myRID); + +public: + OplogReader oplogReader; + + void applyOperation(OperationContext* txn, Database* db, const BSONObj& op); + std::string hostName; // ip addr or hostname plus optionally, ":<port>" + std::string _sourceName; // a logical source name. + std::string sourceName() const { + return _sourceName.empty() ? "main" : _sourceName; + } + std::string + only; // only a certain db. note that in the sources collection, this may not be changed once you start replicating. + + /* the last time point we have already synced up to (in the remote/master's oplog). */ + OpTime syncedTo; + + int nClonedThisPass; + + typedef std::vector<boost::shared_ptr<ReplSource>> SourceVector; + static void loadAll(OperationContext* txn, SourceVector&); + + explicit ReplSource(OperationContext* txn, BSONObj); + // This is not the constructor you are looking for. Always prefer the version that takes + // a BSONObj. This is public only as a hack so that the ReplicationCoordinator can find + // out the process's RID in master/slave setups. + ReplSource(OperationContext* txn); + + /* -1 = error */ + int sync(OperationContext* txn, int& nApplied); + + void save(OperationContext* txn); // write ourself to local.sources + + // make a jsobj from our member fields of the form + // { host: ..., source: ..., syncedTo: ... } + BSONObj jsobj(); + + bool operator==(const ReplSource& r) const { + return hostName == r.hostName && sourceName() == r.sourceName(); + } + std::string toString() const { + return sourceName() + "@" + hostName; + } + + bool haveMoreDbsToSync() const { + return !addDbNextPass.empty(); + } + int sleepAdvice() const { + if (!_sleepAdviceTime) + return 0; + int wait = _sleepAdviceTime - unsigned(time(0)); + return wait > 0 ? wait : 0; + } + + static bool throttledForceResyncDead(OperationContext* txn, const char* requester); + static void forceResyncDead(OperationContext* txn, const char* requester); +}; + +/** + * Helper class used to set and query an ignore state for a named database. + * The ignore state will expire after a specified OpTime. + */ +class DatabaseIgnorer { +public: + /** Indicate that operations for 'db' should be ignored until after 'futureOplogTime' */ + void doIgnoreUntilAfter(const std::string& db, const OpTime& futureOplogTime); + /** + * Query ignore state of 'db'; if 'currentOplogTime' is after the ignore + * limit, the ignore state will be cleared. 
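     * A return of true means operations for 'db' should still be skipped at
     * 'currentOplogTime'.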
+ */ + bool ignoreAt(const std::string& db, const OpTime& currentOplogTime); + +private: + std::map<std::string, OpTime> _ignores; +}; + +} // namespace repl +} // namespace mongo diff --git a/src/mongo/db/repl/member_config.cpp b/src/mongo/db/repl/member_config.cpp index 6f3bcf40501..3828185094a 100644 --- a/src/mongo/db/repl/member_config.cpp +++ b/src/mongo/db/repl/member_config.cpp @@ -40,280 +40,267 @@ namespace mongo { namespace repl { - const std::string MemberConfig::kIdFieldName = "_id"; - const std::string MemberConfig::kVotesFieldName = "votes"; - const std::string MemberConfig::kPriorityFieldName = "priority"; - const std::string MemberConfig::kHostFieldName = "host"; - const std::string MemberConfig::kHiddenFieldName = "hidden"; - const std::string MemberConfig::kSlaveDelayFieldName = "slaveDelay"; - const std::string MemberConfig::kArbiterOnlyFieldName = "arbiterOnly"; - const std::string MemberConfig::kBuildIndexesFieldName = "buildIndexes"; - const std::string MemberConfig::kTagsFieldName = "tags"; - const std::string MemberConfig::kInternalVoterTagName = "$voter"; - const std::string MemberConfig::kInternalElectableTagName = "$electable"; - const std::string MemberConfig::kInternalAllTagName = "$all"; +const std::string MemberConfig::kIdFieldName = "_id"; +const std::string MemberConfig::kVotesFieldName = "votes"; +const std::string MemberConfig::kPriorityFieldName = "priority"; +const std::string MemberConfig::kHostFieldName = "host"; +const std::string MemberConfig::kHiddenFieldName = "hidden"; +const std::string MemberConfig::kSlaveDelayFieldName = "slaveDelay"; +const std::string MemberConfig::kArbiterOnlyFieldName = "arbiterOnly"; +const std::string MemberConfig::kBuildIndexesFieldName = "buildIndexes"; +const std::string MemberConfig::kTagsFieldName = "tags"; +const std::string MemberConfig::kInternalVoterTagName = "$voter"; +const std::string MemberConfig::kInternalElectableTagName = "$electable"; +const std::string MemberConfig::kInternalAllTagName = "$all"; namespace { - const std::string kLegalMemberConfigFieldNames[] = { - MemberConfig::kIdFieldName, - MemberConfig::kVotesFieldName, - MemberConfig::kPriorityFieldName, - MemberConfig::kHostFieldName, - MemberConfig::kHiddenFieldName, - MemberConfig::kSlaveDelayFieldName, - MemberConfig::kArbiterOnlyFieldName, - MemberConfig::kBuildIndexesFieldName, - MemberConfig::kTagsFieldName - }; - - const int kVotesFieldDefault = 1; - const double kPriorityFieldDefault = 1.0; - const Seconds kSlaveDelayFieldDefault(0); - const bool kArbiterOnlyFieldDefault = false; - const bool kHiddenFieldDefault = false; - const bool kBuildIndexesFieldDefault = true; - - const Seconds kMaxSlaveDelay(3600 * 24 * 366); +const std::string kLegalMemberConfigFieldNames[] = {MemberConfig::kIdFieldName, + MemberConfig::kVotesFieldName, + MemberConfig::kPriorityFieldName, + MemberConfig::kHostFieldName, + MemberConfig::kHiddenFieldName, + MemberConfig::kSlaveDelayFieldName, + MemberConfig::kArbiterOnlyFieldName, + MemberConfig::kBuildIndexesFieldName, + MemberConfig::kTagsFieldName}; + +const int kVotesFieldDefault = 1; +const double kPriorityFieldDefault = 1.0; +const Seconds kSlaveDelayFieldDefault(0); +const bool kArbiterOnlyFieldDefault = false; +const bool kHiddenFieldDefault = false; +const bool kBuildIndexesFieldDefault = true; + +const Seconds kMaxSlaveDelay(3600 * 24 * 366); } // namespace - Status MemberConfig::initialize(const BSONObj& mcfg, ReplicaSetTagConfig* tagConfig) { - Status status = bsonCheckOnlyHasFields( - "replica set 
member configuration", mcfg, kLegalMemberConfigFieldNames); - if (!status.isOK()) - return status; - - // - // Parse _id field. - // - BSONElement idElement = mcfg[kIdFieldName]; - if (idElement.eoo()) { - return Status(ErrorCodes::NoSuchKey, str::stream() << kIdFieldName << - " field is missing"); - } - if (!idElement.isNumber()) { - return Status(ErrorCodes::TypeMismatch, str::stream() << kIdFieldName << - " field has non-numeric type " << typeName(idElement.type())); - } - _id = idElement.numberInt(); - - // - // Parse h field. - // - std::string hostAndPortString; - status = bsonExtractStringField(mcfg, kHostFieldName, &hostAndPortString); - if (!status.isOK()) - return status; - boost::trim(hostAndPortString); - status = _host.initialize(hostAndPortString); - if (!status.isOK()) - return status; - if (!_host.hasPort()) { - // make port explicit even if default. - _host = HostAndPort(_host.host(), _host.port()); - } +Status MemberConfig::initialize(const BSONObj& mcfg, ReplicaSetTagConfig* tagConfig) { + Status status = bsonCheckOnlyHasFields( + "replica set member configuration", mcfg, kLegalMemberConfigFieldNames); + if (!status.isOK()) + return status; - // - // Parse votes field. - // - BSONElement votesElement = mcfg[kVotesFieldName]; - if (votesElement.eoo()) { - _votes = kVotesFieldDefault; - } - else if (votesElement.isNumber()) { - _votes = votesElement.numberInt(); - } - else { - return Status(ErrorCodes::TypeMismatch, str::stream() << kVotesFieldName << - " field value has non-numeric type " << - typeName(votesElement.type())); - } + // + // Parse _id field. + // + BSONElement idElement = mcfg[kIdFieldName]; + if (idElement.eoo()) { + return Status(ErrorCodes::NoSuchKey, str::stream() << kIdFieldName << " field is missing"); + } + if (!idElement.isNumber()) { + return Status(ErrorCodes::TypeMismatch, + str::stream() << kIdFieldName << " field has non-numeric type " + << typeName(idElement.type())); + } + _id = idElement.numberInt(); - // - // Parse priority field. - // - BSONElement priorityElement = mcfg[kPriorityFieldName]; - if (priorityElement.eoo()) { - _priority = kPriorityFieldDefault; - } - else if (priorityElement.isNumber()) { - _priority = priorityElement.numberDouble(); - } - else { - return Status(ErrorCodes::TypeMismatch, str::stream() << kPriorityFieldName << - " field has non-numeric type " << typeName(priorityElement.type())); - } + // + // Parse h field. + // + std::string hostAndPortString; + status = bsonExtractStringField(mcfg, kHostFieldName, &hostAndPortString); + if (!status.isOK()) + return status; + boost::trim(hostAndPortString); + status = _host.initialize(hostAndPortString); + if (!status.isOK()) + return status; + if (!_host.hasPort()) { + // make port explicit even if default. + _host = HostAndPort(_host.host(), _host.port()); + } - // - // Parse arbiterOnly field. - // - status = bsonExtractBooleanFieldWithDefault(mcfg, - kArbiterOnlyFieldName, - kArbiterOnlyFieldDefault, - &_arbiterOnly); - if (!status.isOK()) - return status; - - // - // Parse slaveDelay field. - // - BSONElement slaveDelayElement = mcfg[kSlaveDelayFieldName]; - if (slaveDelayElement.eoo()) { - _slaveDelay = kSlaveDelayFieldDefault; - } - else if (slaveDelayElement.isNumber()) { - _slaveDelay = Seconds(slaveDelayElement.numberInt()); - } - else { - return Status(ErrorCodes::TypeMismatch, str::stream() << kSlaveDelayFieldName << - " field value has non-numeric type " << - typeName(slaveDelayElement.type())); - } + // + // Parse votes field. 
+ // + BSONElement votesElement = mcfg[kVotesFieldName]; + if (votesElement.eoo()) { + _votes = kVotesFieldDefault; + } else if (votesElement.isNumber()) { + _votes = votesElement.numberInt(); + } else { + return Status(ErrorCodes::TypeMismatch, + str::stream() << kVotesFieldName << " field value has non-numeric type " + << typeName(votesElement.type())); + } + + // + // Parse priority field. + // + BSONElement priorityElement = mcfg[kPriorityFieldName]; + if (priorityElement.eoo()) { + _priority = kPriorityFieldDefault; + } else if (priorityElement.isNumber()) { + _priority = priorityElement.numberDouble(); + } else { + return Status(ErrorCodes::TypeMismatch, + str::stream() << kPriorityFieldName << " field has non-numeric type " + << typeName(priorityElement.type())); + } + + // + // Parse arbiterOnly field. + // + status = bsonExtractBooleanFieldWithDefault( + mcfg, kArbiterOnlyFieldName, kArbiterOnlyFieldDefault, &_arbiterOnly); + if (!status.isOK()) + return status; - // - // Parse hidden field. - // - status = bsonExtractBooleanFieldWithDefault(mcfg, - kHiddenFieldName, - kHiddenFieldDefault, - &_hidden); - if (!status.isOK()) - return status; - - // - // Parse buildIndexes field. - // - status = bsonExtractBooleanFieldWithDefault(mcfg, - kBuildIndexesFieldName, - kBuildIndexesFieldDefault, - &_buildIndexes); - if (!status.isOK()) - return status; - - // - // Parse "tags" field. - // - _tags.clear(); - BSONElement tagsElement; - status = bsonExtractTypedField(mcfg, kTagsFieldName, Object, &tagsElement); - if (status.isOK()) { - for (BSONObj::iterator tagIter(tagsElement.Obj()); tagIter.more();) { - const BSONElement& tag = tagIter.next(); - if (tag.type() != String) { - return Status(ErrorCodes::TypeMismatch, str::stream() << "tags." << - tag.fieldName() << " field has non-string value of type " << - typeName(tag.type())); - } - _tags.push_back(tagConfig->makeTag(tag.fieldNameStringData(), - tag.valueStringData())); + // + // Parse slaveDelay field. + // + BSONElement slaveDelayElement = mcfg[kSlaveDelayFieldName]; + if (slaveDelayElement.eoo()) { + _slaveDelay = kSlaveDelayFieldDefault; + } else if (slaveDelayElement.isNumber()) { + _slaveDelay = Seconds(slaveDelayElement.numberInt()); + } else { + return Status(ErrorCodes::TypeMismatch, + str::stream() << kSlaveDelayFieldName << " field value has non-numeric type " + << typeName(slaveDelayElement.type())); + } + + // + // Parse hidden field. + // + status = + bsonExtractBooleanFieldWithDefault(mcfg, kHiddenFieldName, kHiddenFieldDefault, &_hidden); + if (!status.isOK()) + return status; + + // + // Parse buildIndexes field. + // + status = bsonExtractBooleanFieldWithDefault( + mcfg, kBuildIndexesFieldName, kBuildIndexesFieldDefault, &_buildIndexes); + if (!status.isOK()) + return status; + + // + // Parse "tags" field. + // + _tags.clear(); + BSONElement tagsElement; + status = bsonExtractTypedField(mcfg, kTagsFieldName, Object, &tagsElement); + if (status.isOK()) { + for (BSONObj::iterator tagIter(tagsElement.Obj()); tagIter.more();) { + const BSONElement& tag = tagIter.next(); + if (tag.type() != String) { + return Status(ErrorCodes::TypeMismatch, + str::stream() << "tags." 
<< tag.fieldName() + << " field has non-string value of type " + << typeName(tag.type())); } + _tags.push_back(tagConfig->makeTag(tag.fieldNameStringData(), tag.valueStringData())); } - else if (ErrorCodes::NoSuchKey != status) { - return status; - } + } else if (ErrorCodes::NoSuchKey != status) { + return status; + } - // - // Add internal tags based on other member properties. - // - - // Add a voter tag if this non-arbiter member votes; use _id for uniquity. - const std::string id = str::stream() << _id; - if (isVoter() && !_arbiterOnly) { - _tags.push_back(tagConfig->makeTag(kInternalVoterTagName, id)); - } + // + // Add internal tags based on other member properties. + // - // Add an electable tag if this member is electable. - if (isElectable()) { - _tags.push_back(tagConfig->makeTag(kInternalElectableTagName, id)); - } + // Add a voter tag if this non-arbiter member votes; use _id for uniquity. + const std::string id = str::stream() << _id; + if (isVoter() && !_arbiterOnly) { + _tags.push_back(tagConfig->makeTag(kInternalVoterTagName, id)); + } - // Add a tag for generic counting of this node. - if (!_arbiterOnly) { - _tags.push_back(tagConfig->makeTag(kInternalAllTagName, id)); - } + // Add an electable tag if this member is electable. + if (isElectable()) { + _tags.push_back(tagConfig->makeTag(kInternalElectableTagName, id)); + } - return Status::OK(); + // Add a tag for generic counting of this node. + if (!_arbiterOnly) { + _tags.push_back(tagConfig->makeTag(kInternalAllTagName, id)); } - Status MemberConfig::validate() const { - if (_id < 0 || _id > 255) { - return Status(ErrorCodes::BadValue, str::stream() << kIdFieldName << - " field value of " << _id << " is out of range."); - } + return Status::OK(); +} - if (_priority < 0 || _priority > 1000) { - return Status(ErrorCodes::BadValue, str::stream() << kPriorityFieldName << - " field value of " << _priority << " is out of range"); - } - if (_votes != 0 && _votes != 1) { - return Status(ErrorCodes::BadValue, str::stream() << kVotesFieldName << - " field value is " << _votes << " but must be 0 or 1"); - } - if (_arbiterOnly) { - if (!_tags.empty()) { - return Status(ErrorCodes::BadValue, "Cannot set tags on arbiters."); - } - if (!isVoter()) { - return Status(ErrorCodes::BadValue, "Arbiter must vote (cannot have 0 votes)"); - } - } - if (_slaveDelay < Seconds(0) || _slaveDelay > kMaxSlaveDelay) { - return Status(ErrorCodes::BadValue, str::stream() << kSlaveDelayFieldName << - " field value of " << _slaveDelay.total_seconds() << - " seconds is out of range"); - } - if (_slaveDelay > Seconds(0) && _priority != 0) { - return Status(ErrorCodes::BadValue, "slaveDelay requires priority be zero"); - } - if (_hidden && _priority != 0) { - return Status(ErrorCodes::BadValue, "priority must be 0 when hidden=true"); +Status MemberConfig::validate() const { + if (_id < 0 || _id > 255) { + return Status(ErrorCodes::BadValue, + str::stream() << kIdFieldName << " field value of " << _id + << " is out of range."); + } + + if (_priority < 0 || _priority > 1000) { + return Status(ErrorCodes::BadValue, + str::stream() << kPriorityFieldName << " field value of " << _priority + << " is out of range"); + } + if (_votes != 0 && _votes != 1) { + return Status(ErrorCodes::BadValue, + str::stream() << kVotesFieldName << " field value is " << _votes + << " but must be 0 or 1"); + } + if (_arbiterOnly) { + if (!_tags.empty()) { + return Status(ErrorCodes::BadValue, "Cannot set tags on arbiters."); } - if (!_buildIndexes && _priority != 0) { - return 
Status(ErrorCodes::BadValue, "priority must be 0 when buildIndexes=false"); + if (!isVoter()) { + return Status(ErrorCodes::BadValue, "Arbiter must vote (cannot have 0 votes)"); } - return Status::OK(); } + if (_slaveDelay < Seconds(0) || _slaveDelay > kMaxSlaveDelay) { + return Status(ErrorCodes::BadValue, + str::stream() << kSlaveDelayFieldName << " field value of " + << _slaveDelay.total_seconds() << " seconds is out of range"); + } + if (_slaveDelay > Seconds(0) && _priority != 0) { + return Status(ErrorCodes::BadValue, "slaveDelay requires priority be zero"); + } + if (_hidden && _priority != 0) { + return Status(ErrorCodes::BadValue, "priority must be 0 when hidden=true"); + } + if (!_buildIndexes && _priority != 0) { + return Status(ErrorCodes::BadValue, "priority must be 0 when buildIndexes=false"); + } + return Status::OK(); +} - bool MemberConfig::hasTags(const ReplicaSetTagConfig& tagConfig) const { - for (std::vector<ReplicaSetTag>::const_iterator tag = _tags.begin(); - tag != _tags.end(); - tag++) { - std::string tagKey = tagConfig.getTagKey(*tag); - if (tagKey[0] == '$') { - // Filter out internal tags - continue; - } - return true; +bool MemberConfig::hasTags(const ReplicaSetTagConfig& tagConfig) const { + for (std::vector<ReplicaSetTag>::const_iterator tag = _tags.begin(); tag != _tags.end(); + tag++) { + std::string tagKey = tagConfig.getTagKey(*tag); + if (tagKey[0] == '$') { + // Filter out internal tags + continue; } - return false; + return true; } + return false; +} - BSONObj MemberConfig::toBSON(const ReplicaSetTagConfig& tagConfig) const { - BSONObjBuilder configBuilder; - configBuilder.append("_id", _id); - configBuilder.append("host", _host.toString()); - configBuilder.append("arbiterOnly", _arbiterOnly); - configBuilder.append("buildIndexes", _buildIndexes); - configBuilder.append("hidden", _hidden); - configBuilder.append("priority", _priority); - - BSONObjBuilder tags(configBuilder.subobjStart("tags")); - for (std::vector<ReplicaSetTag>::const_iterator tag = _tags.begin(); - tag != _tags.end(); - tag++) { - std::string tagKey = tagConfig.getTagKey(*tag); - if (tagKey[0] == '$') { - // Filter out internal tags - continue; - } - tags.append(tagKey, tagConfig.getTagValue(*tag)); - } - tags.done(); +BSONObj MemberConfig::toBSON(const ReplicaSetTagConfig& tagConfig) const { + BSONObjBuilder configBuilder; + configBuilder.append("_id", _id); + configBuilder.append("host", _host.toString()); + configBuilder.append("arbiterOnly", _arbiterOnly); + configBuilder.append("buildIndexes", _buildIndexes); + configBuilder.append("hidden", _hidden); + configBuilder.append("priority", _priority); - configBuilder.append("slaveDelay", _slaveDelay.total_seconds()); - configBuilder.append("votes", getNumVotes()); - return configBuilder.obj(); + BSONObjBuilder tags(configBuilder.subobjStart("tags")); + for (std::vector<ReplicaSetTag>::const_iterator tag = _tags.begin(); tag != _tags.end(); + tag++) { + std::string tagKey = tagConfig.getTagKey(*tag); + if (tagKey[0] == '$') { + // Filter out internal tags + continue; + } + tags.append(tagKey, tagConfig.getTagValue(*tag)); } + tags.done(); + + configBuilder.append("slaveDelay", _slaveDelay.total_seconds()); + configBuilder.append("votes", getNumVotes()); + return configBuilder.obj(); +} } // namespace repl } // namespace mongo diff --git a/src/mongo/db/repl/member_config.h b/src/mongo/db/repl/member_config.h index f980a8e2bc8..694a8941f8e 100644 --- a/src/mongo/db/repl/member_config.h +++ b/src/mongo/db/repl/member_config.h @@ -38,143 
+38,168 @@
 namespace mongo {

-    class BSONObj;
+class BSONObj;

 namespace repl {

+/**
+ * Representation of the configuration information about a particular member of a replica set.
+ */
+class MemberConfig {
+public:
+    typedef std::vector<ReplicaSetTag>::const_iterator TagIterator;
+
+    static const std::string kIdFieldName;
+    static const std::string kVotesFieldName;
+    static const std::string kPriorityFieldName;
+    static const std::string kHostFieldName;
+    static const std::string kHiddenFieldName;
+    static const std::string kSlaveDelayFieldName;
+    static const std::string kArbiterOnlyFieldName;
+    static const std::string kBuildIndexesFieldName;
+    static const std::string kTagsFieldName;
+    static const std::string kInternalVoterTagName;
+    static const std::string kInternalElectableTagName;
+    static const std::string kInternalAllTagName;
+
+    /**
+     * Default constructor, produces a MemberConfig in an undefined state.
+     * Must successfully call initialize() before calling validate() or the
+     * accessors.
+     */
+    MemberConfig() : _slaveDelay(0) {}
+
+    /**
+     * Initializes this MemberConfig from the contents of "mcfg".
+     *
+     * If "mcfg" describes any tags, builds ReplicaSetTags for this
+     * configuration using "tagConfig" as the tag's namespace. This may
+     * have the effect of altering "tagConfig" when "mcfg" describes a
+     * tag not previously added to "tagConfig".
+     */
+    Status initialize(const BSONObj& mcfg, ReplicaSetTagConfig* tagConfig);
+
+    /**
+     * Performs basic consistency checks on the member configuration.
+     */
+    Status validate() const;
+
+    /**
+     * Gets the identifier for this member, unique within a ReplicaSetConfig.
+     */
+    int getId() const {
+        return _id;
+    }
+
+    /**
+     * Gets the canonical name of this member, by which other members and clients
+     * will contact it.
+     */
+    const HostAndPort& getHostAndPort() const {
+        return _host;
+    }
+
+    /**
+     * Gets this member's priority. Higher means more likely to be elected
+     * primary.
+     */
+    double getPriority() const {
+        return _priority;
+    }
+
     /**
-     * Representation of the configuration information about a particular member of a replica set.
-     */
-    class MemberConfig {
-    public:
-        typedef std::vector<ReplicaSetTag>::const_iterator TagIterator;
-
-        static const std::string kIdFieldName;
-        static const std::string kVotesFieldName;
-        static const std::string kPriorityFieldName;
-        static const std::string kHostFieldName;
-        static const std::string kHiddenFieldName;
-        static const std::string kSlaveDelayFieldName;
-        static const std::string kArbiterOnlyFieldName;
-        static const std::string kBuildIndexesFieldName;
-        static const std::string kTagsFieldName;
-        static const std::string kInternalVoterTagName;
-        static const std::string kInternalElectableTagName;
-        static const std::string kInternalAllTagName;
-
-        /**
-         * Default constructor, produces a MemberConfig in an undefined state.
-         * Must successfully call initialize() before calling validate() or the
-         * accessors.
-         */
-        MemberConfig() : _slaveDelay(0) {}
-
-        /**
-         * Initializes this MemberConfig from the contents of "mcfg".
-         *
-         * If "mcfg" describes any tags, builds ReplicaSetTags for this
-         * configuration using "tagConfig" as the tag's namespace. This may
-         * have the effect of altering "tagConfig" when "mcfg" describes a
-         * tag not previously added to "tagConfig".
-         */
-        Status initialize(const BSONObj& mcfg, ReplicaSetTagConfig* tagConfig);
-
-        /**
-         * Performs basic consistency checks on the member configuration.
-         */
-        Status validate() const;
-
-        /**
-         * Gets the identifier for this member, unique within a ReplicaSetConfig.
-         */
-        int getId() const { return _id; }
-
-        /**
-         * Gets the canonical name of this member, by which other members and clients
-         * will contact it.
-         */
-        const HostAndPort& getHostAndPort() const { return _host; }
-
-        /**
-         * Gets this member's priority. Higher means more likely to be elected
-         * primary.
-         */
-        double getPriority() const { return _priority; }
-
-        /**
-         * Gets the amount of time behind the primary that this member will attempt to
-         * remain. Zero seconds means stay as caught up as possible.
-         */
-        Seconds getSlaveDelay() const { return _slaveDelay; }
-
-        /**
-         * Returns true if this member may vote in elections.
-         */
-        bool isVoter() const { return _votes != 0; }
-
-        /**
-         * Returns the number of votes that this member gets.
-         */
-        int getNumVotes() const { return isVoter() ? 1 : 0; }
-
-        /**
-         * Returns true if this member is an arbiter (is not data-bearing).
-         */
-        bool isArbiter() const { return _arbiterOnly; }
-
-        /**
-         * Returns true if this member is hidden (not reported by isMaster, not electable).
-         */
-        bool isHidden() const { return _hidden; }
-
-        /**
-         * Returns true if this member should build secondary indexes.
-         */
-        bool shouldBuildIndexes() const { return _buildIndexes; }
-
-        /**
-         * Gets the number of replica set tags, including internal '$' tags, for this member.
-         */
-        size_t getNumTags() const { return _tags.size(); }
-
-        /**
-         * Returns true if this MemberConfig has any non-internal tags, using "tagConfig" to
-         * determine the internal property of the tags.
-         */
-        bool hasTags(const ReplicaSetTagConfig& tagConfig) const;
-
-        /**
-         * Gets a begin iterator over the tags for this member.
-         */
-        TagIterator tagsBegin() const { return _tags.begin(); }
-
-        /**
-         * Gets an end iterator over the tags for this member.
-         */
-        TagIterator tagsEnd() const { return _tags.end(); }
-
-        /**
-         * Returns true if this represents the configuration of an electable member.
-         */
-        bool isElectable() const { return !isArbiter() && getPriority() > 0; }
-
-        /**
-         * Returns the member config as a BSONObj, using "tagConfig" to generate the tag subdoc.
-         */
-        BSONObj toBSON(const ReplicaSetTagConfig& tagConfig) const;
-
-    private:
-
-        int _id;
-        HostAndPort _host;
-        double _priority; // 0 means can never be primary
-        int _votes; // Can this member vote? Only 0 and 1 are valid. Default 1.
-        bool _arbiterOnly;
-        Seconds _slaveDelay;
-        bool _hidden; // if set, don't advertise to drivers in isMaster.
-        bool _buildIndexes; // if false, do not create any non-_id indexes
-        std::vector<ReplicaSetTag> _tags; // tagging for data center, rack, etc.
-    };
+     * Gets the amount of time behind the primary that this member will attempt to
+     * remain. Zero seconds means stay as caught up as possible.
+     */
+    Seconds getSlaveDelay() const {
+        return _slaveDelay;
+    }
+
+    /**
+     * Returns true if this member may vote in elections.
+     */
+    bool isVoter() const {
+        return _votes != 0;
+    }
+
+    /**
+     * Returns the number of votes that this member gets.
+     */
+    int getNumVotes() const {
+        return isVoter() ? 1 : 0;
+    }
+
+    /**
+     * Returns true if this member is an arbiter (is not data-bearing).
+     */
+    bool isArbiter() const {
+        return _arbiterOnly;
+    }
+
+    /**
+     * Returns true if this member is hidden (not reported by isMaster, not electable).
+     */
+    bool isHidden() const {
+        return _hidden;
+    }
+
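[Editor's aside (illustrative only, not part of this commit): typical driver
code for this class, using the declarations above; the host string is invented,
and per the constructor comment the accessors are only meaningful once
initialize() has succeeded:

    ReplicaSetTagConfig tagConfig;
    MemberConfig mc;
    Status status = mc.initialize(BSON("_id" << 2 << "host"
                                             << "node2.example.com:27017"),
                                  &tagConfig);
    if (status.isOK()) {
        status = mc.validate();
    }
    if (status.isOK() && mc.isVoter()) {
        // mc.getNumVotes() == 1 here, per getNumVotes() above.
    }
End of aside.]
+    /**
+     * Returns true if this member should build secondary indexes.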
+ */ + bool shouldBuildIndexes() const { + return _buildIndexes; + } + + /** + * Gets the number of replica set tags, including internal '$' tags, for this member. + */ + size_t getNumTags() const { + return _tags.size(); + } + + /** + * Returns true if this MemberConfig has any non-internal tags, using "tagConfig" to + * determine the internal property of the tags. + */ + bool hasTags(const ReplicaSetTagConfig& tagConfig) const; + + /** + * Gets a begin iterator over the tags for this member. + */ + TagIterator tagsBegin() const { + return _tags.begin(); + } + + /** + * Gets an end iterator over the tags for this member. + */ + TagIterator tagsEnd() const { + return _tags.end(); + } + + /** + * Returns true if this represents the configuration of an electable member. + */ + bool isElectable() const { + return !isArbiter() && getPriority() > 0; + } + + /** + * Returns the member config as a BSONObj, using "tagConfig" to generate the tag subdoc. + */ + BSONObj toBSON(const ReplicaSetTagConfig& tagConfig) const; + +private: + int _id; + HostAndPort _host; + double _priority; // 0 means can never be primary + int _votes; // Can this member vote? Only 0 and 1 are valid. Default 1. + bool _arbiterOnly; + Seconds _slaveDelay; + bool _hidden; // if set, don't advertise to drivers in isMaster. + bool _buildIndexes; // if false, do not create any non-_id indexes + std::vector<ReplicaSetTag> _tags; // tagging for data center, rack, etc. +}; } // namespace repl } // namespace mongo diff --git a/src/mongo/db/repl/member_config_test.cpp b/src/mongo/db/repl/member_config_test.cpp index 9d3a0f7b276..98e57e7d998 100644 --- a/src/mongo/db/repl/member_config_test.cpp +++ b/src/mongo/db/repl/member_config_test.cpp @@ -38,324 +38,417 @@ namespace mongo { namespace repl { namespace { - TEST(MemberConfig, ParseMinimalMemberConfigAndCheckDefaults) { - ReplicaSetTagConfig tagConfig; - MemberConfig mc; - ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" << "localhost:12345"), - &tagConfig)); - ASSERT_EQUALS(0, mc.getId()); - ASSERT_EQUALS(HostAndPort("localhost", 12345), mc.getHostAndPort()); - ASSERT_EQUALS(1.0, mc.getPriority()); - ASSERT_EQUALS(0, mc.getSlaveDelay().total_seconds()); - ASSERT_TRUE(mc.isVoter()); - ASSERT_FALSE(mc.isHidden()); - ASSERT_FALSE(mc.isArbiter()); - ASSERT_TRUE(mc.shouldBuildIndexes()); - ASSERT_EQUALS(3U, mc.getNumTags()); - ASSERT_OK(mc.validate()); - } - - TEST(MemberConfig, ParseFailsWithIllegalFieldName) { - ReplicaSetTagConfig tagConfig; - MemberConfig mc; - ASSERT_EQUALS(ErrorCodes::BadValue, - mc.initialize(BSON("_id" << 0 << "host" << "localhost" << "frim" << 1), - &tagConfig)); - } - - TEST(MemberConfig, ParseFailsWithMissingIdField) { - ReplicaSetTagConfig tagConfig; - MemberConfig mc; - ASSERT_EQUALS(ErrorCodes::NoSuchKey, mc.initialize(BSON("host" << "localhost:12345"), - &tagConfig)); - } - - TEST(MemberConfig, ParseFailsWithBadIdField) { - ReplicaSetTagConfig tagConfig; - MemberConfig mc; - ASSERT_EQUALS(ErrorCodes::NoSuchKey, mc.initialize(BSON("host" << "localhost:12345"), - &tagConfig)); - ASSERT_EQUALS(ErrorCodes::TypeMismatch, - mc.initialize(BSON("_id" << "0" << "host" << "localhost:12345"), - &tagConfig)); - ASSERT_EQUALS(ErrorCodes::TypeMismatch, - mc.initialize(BSON("_id" << Date_t(0) << "host" << "localhost:12345"), - &tagConfig)); - } - - TEST(MemberConfig, ParseFailsWithMissingHostField) { - ReplicaSetTagConfig tagConfig; - MemberConfig mc; - ASSERT_EQUALS(ErrorCodes::NoSuchKey, mc.initialize(BSON("_id" << 0), &tagConfig)); - } - - - TEST(MemberConfig, 
ParseFailsWithBadHostField) { - ReplicaSetTagConfig tagConfig; - MemberConfig mc; - ASSERT_EQUALS(ErrorCodes::TypeMismatch, mc.initialize(BSON("_id" << 0 << "host" << 0), - &tagConfig)); - ASSERT_EQUALS(ErrorCodes::FailedToParse, mc.initialize(BSON("_id" << 0 << "host" << ""), - &tagConfig)); - ASSERT_EQUALS(ErrorCodes::FailedToParse, - mc.initialize(BSON("_id" << 0 << "host" << "myhost:zabc"), &tagConfig)); - } - - TEST(MemberConfig, ParseArbiterOnly) { - ReplicaSetTagConfig tagConfig; - MemberConfig mc; - ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" << "h" << "arbiterOnly" << 1.0), - &tagConfig)); - ASSERT_TRUE(mc.isArbiter()); - ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" << "h" << "arbiterOnly" << false), - &tagConfig)); - ASSERT_TRUE(!mc.isArbiter()); - } - - TEST(MemberConfig, ParseHidden) { - ReplicaSetTagConfig tagConfig; - MemberConfig mc; - ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" << "h" << "hidden" << 1.0), - &tagConfig)); - ASSERT_TRUE(mc.isHidden()); - ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" << "h" << "hidden" << false), - &tagConfig)); - ASSERT_TRUE(!mc.isHidden()); - ASSERT_EQUALS(ErrorCodes::TypeMismatch, - mc.initialize(BSON("_id" << 0 << "host" << "h" << "hidden" << "1.0"), - &tagConfig)); - } - - TEST(MemberConfig, ParseBuildIndexes) { - ReplicaSetTagConfig tagConfig; - MemberConfig mc; - ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" << "h" << "buildIndexes" << 1.0), - &tagConfig)); - ASSERT_TRUE(mc.shouldBuildIndexes()); - ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" << "h" << "buildIndexes" << false), - &tagConfig)); - ASSERT_TRUE(!mc.shouldBuildIndexes()); - } - - TEST(MemberConfig, ParseVotes) { - ReplicaSetTagConfig tagConfig; - MemberConfig mc; - ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" << "h" << "votes" << 1.0), - &tagConfig)); - ASSERT_TRUE(mc.isVoter()); - ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" << "h" << "votes" << 0), - &tagConfig)); - ASSERT_FALSE(mc.isVoter()); - - // For backwards compatibility, truncate 1.X to 1, and 0.X to 0 (and -0.X to 0). 
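[Editor's aside (illustrative only, not part of this commit): the truncation
described in the comment above comes from initialize() reading "votes" with
votesElement.numberInt(), which drops the fractional part, truncating toward
zero; the assertions below exercise exactly that. A minimal sketch, assuming
BSONElement::numberInt()'s truncating conversion:

    BSONObj b = BSON("votes" << 1.5);
    invariant(b["votes"].numberInt() == 1);                   // 1.5  -> 1, a voter
    invariant(BSON("votes" << -0.5)["votes"].numberInt() == 0);  // -0.5 -> 0, a non-voter
End of aside.]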
- ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" << "h" << "votes" << 1.5), - &tagConfig)); - ASSERT_TRUE(mc.isVoter()); - ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" << "h" << "votes" << 0.5), - &tagConfig)); - ASSERT_FALSE(mc.isVoter()); - ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" << "h" << "votes" << -0.5), - &tagConfig)); - ASSERT_FALSE(mc.isVoter()); - ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" << "h" << "votes" << 2), - &tagConfig)); - - ASSERT_EQUALS(ErrorCodes::TypeMismatch, - mc.initialize(BSON("_id" << 0 << "host" << "h" << "votes" << Date_t(2)), - &tagConfig)); - } - - TEST(MemberConfig, ParsePriority) { - ReplicaSetTagConfig tagConfig; - MemberConfig mc; - ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" << "h" << "priority" << 1), - &tagConfig)); - ASSERT_EQUALS(1.0, mc.getPriority()); - ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" << "h" << "priority" << 0), - &tagConfig)); - ASSERT_EQUALS(0.0, mc.getPriority()); - ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" << "h" << "priority" << 100.8), - &tagConfig)); - ASSERT_EQUALS(100.8, mc.getPriority()); - - ASSERT_EQUALS(ErrorCodes::TypeMismatch, - mc.initialize(BSON("_id" << 0 << "host" << "h" << "priority" << Date_t(2)), - &tagConfig)); - } - - TEST(MemberConfig, ParseSlaveDelay) { - ReplicaSetTagConfig tagConfig; - MemberConfig mc; - ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" << "h" << "slaveDelay" << 100), - &tagConfig)); - ASSERT_EQUALS(100, mc.getSlaveDelay().total_seconds()); - } - - TEST(MemberConfig, ParseTags) { - ReplicaSetTagConfig tagConfig; - MemberConfig mc; - ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" << "h" << - "tags" << BSON("k1" << "v1" << "k2" << "v2")), - &tagConfig)); - ASSERT_EQUALS(5U, mc.getNumTags()); - ASSERT_EQUALS(5, std::distance(mc.tagsBegin(), mc.tagsEnd())); - ASSERT_EQUALS(1, std::count(mc.tagsBegin(), mc.tagsEnd(), tagConfig.findTag("k1", "v1"))); - ASSERT_EQUALS(1, std::count(mc.tagsBegin(), mc.tagsEnd(), tagConfig.findTag("k2", "v2"))); - ASSERT_EQUALS(1, std::count(mc.tagsBegin(), mc.tagsEnd(), tagConfig.findTag("$voter", - "0"))); - ASSERT_EQUALS(1, std::count(mc.tagsBegin(), mc.tagsEnd(), tagConfig.findTag("$electable", - "0"))); - ASSERT_EQUALS(1, std::count(mc.tagsBegin(), mc.tagsEnd(), tagConfig.findTag("$all", - "0"))); - } - - TEST(MemberConfig, ValidateFailsWithIdOutOfRange) { - ReplicaSetTagConfig tagConfig; - MemberConfig mc; - ASSERT_OK(mc.initialize(BSON("_id" << -1 << "host" << "localhost:12345"), - &tagConfig)); - ASSERT_EQUALS(ErrorCodes::BadValue, mc.validate()); - ASSERT_OK(mc.initialize(BSON("_id" << 256 << "host" << "localhost:12345"), - &tagConfig)); - ASSERT_EQUALS(ErrorCodes::BadValue, mc.validate()); - } - - TEST(MemberConfig, ValidateVotes) { - ReplicaSetTagConfig tagConfig; - MemberConfig mc; - - ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" << "h" << "votes" << 1.0), - &tagConfig)); - ASSERT_OK(mc.validate()); - ASSERT_TRUE(mc.isVoter()); - - ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" << "h" << "votes" << 0), - &tagConfig)); - ASSERT_OK(mc.validate()); - ASSERT_FALSE(mc.isVoter()); - - // For backwards compatibility, truncate 1.X to 1, and 0.X to 0 (and -0.X to 0). 
- ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" << "h" << "votes" << 1.5), - &tagConfig)); - ASSERT_OK(mc.validate()); - ASSERT_TRUE(mc.isVoter()); - - ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" << "h" << "votes" << 0.5), - &tagConfig)); - ASSERT_OK(mc.validate()); - ASSERT_FALSE(mc.isVoter()); - - ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" << "h" << "votes" << -0.5), - &tagConfig)); - ASSERT_OK(mc.validate()); - ASSERT_FALSE(mc.isVoter()); - - // Invalid values - ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" << "h" << "votes" << 2), - &tagConfig)); - ASSERT_EQUALS(ErrorCodes::BadValue, mc.validate()); - - ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" << "h" << "votes" << -1), - &tagConfig)); - ASSERT_EQUALS(ErrorCodes::BadValue, mc.validate()); - } - - TEST(MemberConfig, ValidatePriorityRanges) { - ReplicaSetTagConfig tagConfig; - MemberConfig mc; - ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" << "h" << "priority" << 0), - &tagConfig)); - ASSERT_OK(mc.validate()); - ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" << "h" << "priority" << 1000), - &tagConfig)); - ASSERT_OK(mc.validate()); - ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" << "h" << "priority" << -1), - &tagConfig)); - ASSERT_EQUALS(ErrorCodes::BadValue, mc.validate()); - ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" << "h" << "priority" << 1001), - &tagConfig)); - ASSERT_EQUALS(ErrorCodes::BadValue, mc.validate()); - } - - TEST(MemberConfig, ValidateSlaveDelays) { - ReplicaSetTagConfig tagConfig; - MemberConfig mc; - ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" << "h" << "priority" << 0 << - "slaveDelay" << 0), - &tagConfig)); - ASSERT_OK(mc.validate()); - ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" << "h" << "priority" << 0 << - "slaveDelay" << 3600 * 10), - &tagConfig)); - ASSERT_OK(mc.validate()); - ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" << "h" << "priority" << 0 << - "slaveDelay" << -1), - &tagConfig)); - ASSERT_EQUALS(ErrorCodes::BadValue, mc.validate()); - ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" << "h" << "priority" << 0 << - "slaveDelay" << 3600 * 24 * 400), - &tagConfig)); - ASSERT_EQUALS(ErrorCodes::BadValue, mc.validate()); - } - - TEST(MemberConfig, ValidatePriorityAndSlaveDelayRelationship) { - ReplicaSetTagConfig tagConfig; - MemberConfig mc; - ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" << "h" << "priority" << 1 << - "slaveDelay" << 60), - &tagConfig)); - ASSERT_EQUALS(ErrorCodes::BadValue, mc.validate()); - } - - TEST(MemberConfig, ValidatePriorityAndHiddenRelationship) { - ReplicaSetTagConfig tagConfig; - MemberConfig mc; - ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" << "h" << "priority" << 1 << - "hidden" << true), - &tagConfig)); - ASSERT_EQUALS(ErrorCodes::BadValue, mc.validate()); - ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" << "h" << "priority" << 1 << - "hidden" << false), - &tagConfig)); - ASSERT_OK(mc.validate()); - } - - TEST(MemberConfig, ValidatePriorityAndBuildIndexesRelationship) { - ReplicaSetTagConfig tagConfig; - MemberConfig mc; - ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" << "h" << "priority" << 1 << - "buildIndexes" << false), - &tagConfig)); - - ASSERT_EQUALS(ErrorCodes::BadValue, mc.validate()); - ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" << "h" << "priority" << 1 << - "buildIndexes" << true), - &tagConfig)); - ASSERT_OK(mc.validate()); - } - - TEST(MemberConfig, ValidateArbiterVotesRelationship) { - ReplicaSetTagConfig tagConfig; - MemberConfig mc; - ASSERT_OK(mc.initialize(BSON("_id" 
<< 0 << "host" << "h" << - "votes" << 1 << "arbiterOnly" << true), - &tagConfig)); - ASSERT_OK(mc.validate()); - - ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" << "h" << - "votes" << 0 << "arbiterOnly" << false), - &tagConfig)); - ASSERT_OK(mc.validate()); - ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" << "h" << - "votes" << 1 << "arbiterOnly" << false), - &tagConfig)); - ASSERT_OK(mc.validate()); - - ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" << "h" << - "votes" << 0 << "arbiterOnly" << true), - &tagConfig)); - ASSERT_EQUALS(ErrorCodes::BadValue, mc.validate()); - } +TEST(MemberConfig, ParseMinimalMemberConfigAndCheckDefaults) { + ReplicaSetTagConfig tagConfig; + MemberConfig mc; + ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" + << "localhost:12345"), + &tagConfig)); + ASSERT_EQUALS(0, mc.getId()); + ASSERT_EQUALS(HostAndPort("localhost", 12345), mc.getHostAndPort()); + ASSERT_EQUALS(1.0, mc.getPriority()); + ASSERT_EQUALS(0, mc.getSlaveDelay().total_seconds()); + ASSERT_TRUE(mc.isVoter()); + ASSERT_FALSE(mc.isHidden()); + ASSERT_FALSE(mc.isArbiter()); + ASSERT_TRUE(mc.shouldBuildIndexes()); + ASSERT_EQUALS(3U, mc.getNumTags()); + ASSERT_OK(mc.validate()); +} + +TEST(MemberConfig, ParseFailsWithIllegalFieldName) { + ReplicaSetTagConfig tagConfig; + MemberConfig mc; + ASSERT_EQUALS(ErrorCodes::BadValue, + mc.initialize(BSON("_id" << 0 << "host" + << "localhost" + << "frim" << 1), + &tagConfig)); +} + +TEST(MemberConfig, ParseFailsWithMissingIdField) { + ReplicaSetTagConfig tagConfig; + MemberConfig mc; + ASSERT_EQUALS(ErrorCodes::NoSuchKey, + mc.initialize(BSON("host" + << "localhost:12345"), + &tagConfig)); +} + +TEST(MemberConfig, ParseFailsWithBadIdField) { + ReplicaSetTagConfig tagConfig; + MemberConfig mc; + ASSERT_EQUALS(ErrorCodes::NoSuchKey, + mc.initialize(BSON("host" + << "localhost:12345"), + &tagConfig)); + ASSERT_EQUALS(ErrorCodes::TypeMismatch, + mc.initialize(BSON("_id" + << "0" + << "host" + << "localhost:12345"), + &tagConfig)); + ASSERT_EQUALS(ErrorCodes::TypeMismatch, + mc.initialize(BSON("_id" << Date_t(0) << "host" + << "localhost:12345"), + &tagConfig)); +} + +TEST(MemberConfig, ParseFailsWithMissingHostField) { + ReplicaSetTagConfig tagConfig; + MemberConfig mc; + ASSERT_EQUALS(ErrorCodes::NoSuchKey, mc.initialize(BSON("_id" << 0), &tagConfig)); +} + + +TEST(MemberConfig, ParseFailsWithBadHostField) { + ReplicaSetTagConfig tagConfig; + MemberConfig mc; + ASSERT_EQUALS(ErrorCodes::TypeMismatch, + mc.initialize(BSON("_id" << 0 << "host" << 0), &tagConfig)); + ASSERT_EQUALS(ErrorCodes::FailedToParse, + mc.initialize(BSON("_id" << 0 << "host" + << ""), + &tagConfig)); + ASSERT_EQUALS(ErrorCodes::FailedToParse, + mc.initialize(BSON("_id" << 0 << "host" + << "myhost:zabc"), + &tagConfig)); +} + +TEST(MemberConfig, ParseArbiterOnly) { + ReplicaSetTagConfig tagConfig; + MemberConfig mc; + ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" + << "h" + << "arbiterOnly" << 1.0), + &tagConfig)); + ASSERT_TRUE(mc.isArbiter()); + ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" + << "h" + << "arbiterOnly" << false), + &tagConfig)); + ASSERT_TRUE(!mc.isArbiter()); +} + +TEST(MemberConfig, ParseHidden) { + ReplicaSetTagConfig tagConfig; + MemberConfig mc; + ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" + << "h" + << "hidden" << 1.0), + &tagConfig)); + ASSERT_TRUE(mc.isHidden()); + ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" + << "h" + << "hidden" << false), + &tagConfig)); + ASSERT_TRUE(!mc.isHidden()); + ASSERT_EQUALS(ErrorCodes::TypeMismatch, + 
mc.initialize(BSON("_id" << 0 << "host" + << "h" + << "hidden" + << "1.0"), + &tagConfig)); +} + +TEST(MemberConfig, ParseBuildIndexes) { + ReplicaSetTagConfig tagConfig; + MemberConfig mc; + ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" + << "h" + << "buildIndexes" << 1.0), + &tagConfig)); + ASSERT_TRUE(mc.shouldBuildIndexes()); + ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" + << "h" + << "buildIndexes" << false), + &tagConfig)); + ASSERT_TRUE(!mc.shouldBuildIndexes()); +} + +TEST(MemberConfig, ParseVotes) { + ReplicaSetTagConfig tagConfig; + MemberConfig mc; + ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" + << "h" + << "votes" << 1.0), + &tagConfig)); + ASSERT_TRUE(mc.isVoter()); + ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" + << "h" + << "votes" << 0), + &tagConfig)); + ASSERT_FALSE(mc.isVoter()); + + // For backwards compatibility, truncate 1.X to 1, and 0.X to 0 (and -0.X to 0). + ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" + << "h" + << "votes" << 1.5), + &tagConfig)); + ASSERT_TRUE(mc.isVoter()); + ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" + << "h" + << "votes" << 0.5), + &tagConfig)); + ASSERT_FALSE(mc.isVoter()); + ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" + << "h" + << "votes" << -0.5), + &tagConfig)); + ASSERT_FALSE(mc.isVoter()); + ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" + << "h" + << "votes" << 2), + &tagConfig)); + + ASSERT_EQUALS(ErrorCodes::TypeMismatch, + mc.initialize(BSON("_id" << 0 << "host" + << "h" + << "votes" << Date_t(2)), + &tagConfig)); +} + +TEST(MemberConfig, ParsePriority) { + ReplicaSetTagConfig tagConfig; + MemberConfig mc; + ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" + << "h" + << "priority" << 1), + &tagConfig)); + ASSERT_EQUALS(1.0, mc.getPriority()); + ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" + << "h" + << "priority" << 0), + &tagConfig)); + ASSERT_EQUALS(0.0, mc.getPriority()); + ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" + << "h" + << "priority" << 100.8), + &tagConfig)); + ASSERT_EQUALS(100.8, mc.getPriority()); + + ASSERT_EQUALS(ErrorCodes::TypeMismatch, + mc.initialize(BSON("_id" << 0 << "host" + << "h" + << "priority" << Date_t(2)), + &tagConfig)); +} + +TEST(MemberConfig, ParseSlaveDelay) { + ReplicaSetTagConfig tagConfig; + MemberConfig mc; + ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" + << "h" + << "slaveDelay" << 100), + &tagConfig)); + ASSERT_EQUALS(100, mc.getSlaveDelay().total_seconds()); +} + +TEST(MemberConfig, ParseTags) { + ReplicaSetTagConfig tagConfig; + MemberConfig mc; + ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" + << "h" + << "tags" << BSON("k1" + << "v1" + << "k2" + << "v2")), + &tagConfig)); + ASSERT_EQUALS(5U, mc.getNumTags()); + ASSERT_EQUALS(5, std::distance(mc.tagsBegin(), mc.tagsEnd())); + ASSERT_EQUALS(1, std::count(mc.tagsBegin(), mc.tagsEnd(), tagConfig.findTag("k1", "v1"))); + ASSERT_EQUALS(1, std::count(mc.tagsBegin(), mc.tagsEnd(), tagConfig.findTag("k2", "v2"))); + ASSERT_EQUALS(1, std::count(mc.tagsBegin(), mc.tagsEnd(), tagConfig.findTag("$voter", "0"))); + ASSERT_EQUALS(1, + std::count(mc.tagsBegin(), mc.tagsEnd(), tagConfig.findTag("$electable", "0"))); + ASSERT_EQUALS(1, std::count(mc.tagsBegin(), mc.tagsEnd(), tagConfig.findTag("$all", "0"))); +} + +TEST(MemberConfig, ValidateFailsWithIdOutOfRange) { + ReplicaSetTagConfig tagConfig; + MemberConfig mc; + ASSERT_OK(mc.initialize(BSON("_id" << -1 << "host" + << "localhost:12345"), + &tagConfig)); + ASSERT_EQUALS(ErrorCodes::BadValue, mc.validate()); + 
ASSERT_OK(mc.initialize(BSON("_id" << 256 << "host" + << "localhost:12345"), + &tagConfig)); + ASSERT_EQUALS(ErrorCodes::BadValue, mc.validate()); +} + +TEST(MemberConfig, ValidateVotes) { + ReplicaSetTagConfig tagConfig; + MemberConfig mc; + + ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" + << "h" + << "votes" << 1.0), + &tagConfig)); + ASSERT_OK(mc.validate()); + ASSERT_TRUE(mc.isVoter()); + + ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" + << "h" + << "votes" << 0), + &tagConfig)); + ASSERT_OK(mc.validate()); + ASSERT_FALSE(mc.isVoter()); + + // For backwards compatibility, truncate 1.X to 1, and 0.X to 0 (and -0.X to 0). + ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" + << "h" + << "votes" << 1.5), + &tagConfig)); + ASSERT_OK(mc.validate()); + ASSERT_TRUE(mc.isVoter()); + + ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" + << "h" + << "votes" << 0.5), + &tagConfig)); + ASSERT_OK(mc.validate()); + ASSERT_FALSE(mc.isVoter()); + + ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" + << "h" + << "votes" << -0.5), + &tagConfig)); + ASSERT_OK(mc.validate()); + ASSERT_FALSE(mc.isVoter()); + + // Invalid values + ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" + << "h" + << "votes" << 2), + &tagConfig)); + ASSERT_EQUALS(ErrorCodes::BadValue, mc.validate()); + + ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" + << "h" + << "votes" << -1), + &tagConfig)); + ASSERT_EQUALS(ErrorCodes::BadValue, mc.validate()); +} + +TEST(MemberConfig, ValidatePriorityRanges) { + ReplicaSetTagConfig tagConfig; + MemberConfig mc; + ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" + << "h" + << "priority" << 0), + &tagConfig)); + ASSERT_OK(mc.validate()); + ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" + << "h" + << "priority" << 1000), + &tagConfig)); + ASSERT_OK(mc.validate()); + ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" + << "h" + << "priority" << -1), + &tagConfig)); + ASSERT_EQUALS(ErrorCodes::BadValue, mc.validate()); + ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" + << "h" + << "priority" << 1001), + &tagConfig)); + ASSERT_EQUALS(ErrorCodes::BadValue, mc.validate()); +} + +TEST(MemberConfig, ValidateSlaveDelays) { + ReplicaSetTagConfig tagConfig; + MemberConfig mc; + ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" + << "h" + << "priority" << 0 << "slaveDelay" << 0), + &tagConfig)); + ASSERT_OK(mc.validate()); + ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" + << "h" + << "priority" << 0 << "slaveDelay" << 3600 * 10), + &tagConfig)); + ASSERT_OK(mc.validate()); + ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" + << "h" + << "priority" << 0 << "slaveDelay" << -1), + &tagConfig)); + ASSERT_EQUALS(ErrorCodes::BadValue, mc.validate()); + ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" + << "h" + << "priority" << 0 << "slaveDelay" << 3600 * 24 * 400), + &tagConfig)); + ASSERT_EQUALS(ErrorCodes::BadValue, mc.validate()); +} + +TEST(MemberConfig, ValidatePriorityAndSlaveDelayRelationship) { + ReplicaSetTagConfig tagConfig; + MemberConfig mc; + ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" + << "h" + << "priority" << 1 << "slaveDelay" << 60), + &tagConfig)); + ASSERT_EQUALS(ErrorCodes::BadValue, mc.validate()); +} + +TEST(MemberConfig, ValidatePriorityAndHiddenRelationship) { + ReplicaSetTagConfig tagConfig; + MemberConfig mc; + ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" + << "h" + << "priority" << 1 << "hidden" << true), + &tagConfig)); + ASSERT_EQUALS(ErrorCodes::BadValue, mc.validate()); + ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" + << "h" + << "priority" 
<< 1 << "hidden" << false), + &tagConfig)); + ASSERT_OK(mc.validate()); +} + +TEST(MemberConfig, ValidatePriorityAndBuildIndexesRelationship) { + ReplicaSetTagConfig tagConfig; + MemberConfig mc; + ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" + << "h" + << "priority" << 1 << "buildIndexes" << false), + &tagConfig)); + + ASSERT_EQUALS(ErrorCodes::BadValue, mc.validate()); + ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" + << "h" + << "priority" << 1 << "buildIndexes" << true), + &tagConfig)); + ASSERT_OK(mc.validate()); +} + +TEST(MemberConfig, ValidateArbiterVotesRelationship) { + ReplicaSetTagConfig tagConfig; + MemberConfig mc; + ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" + << "h" + << "votes" << 1 << "arbiterOnly" << true), + &tagConfig)); + ASSERT_OK(mc.validate()); + + ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" + << "h" + << "votes" << 0 << "arbiterOnly" << false), + &tagConfig)); + ASSERT_OK(mc.validate()); + ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" + << "h" + << "votes" << 1 << "arbiterOnly" << false), + &tagConfig)); + ASSERT_OK(mc.validate()); + + ASSERT_OK(mc.initialize(BSON("_id" << 0 << "host" + << "h" + << "votes" << 0 << "arbiterOnly" << true), + &tagConfig)); + ASSERT_EQUALS(ErrorCodes::BadValue, mc.validate()); +} } // namespace } // namespace repl diff --git a/src/mongo/db/repl/member_heartbeat_data.cpp b/src/mongo/db/repl/member_heartbeat_data.cpp index 8ca22c40649..ff1d5882be6 100644 --- a/src/mongo/db/repl/member_heartbeat_data.cpp +++ b/src/mongo/db/repl/member_heartbeat_data.cpp @@ -39,74 +39,68 @@ namespace mongo { namespace repl { - MemberHeartbeatData::MemberHeartbeatData() : - _health(-1), - _upSince(0), - _lastHeartbeat(0), - _lastHeartbeatRecv(0), - _authIssue(false) { - - _lastResponse.setState(MemberState::RS_UNKNOWN); - _lastResponse.setElectionTime(OpTime()); - _lastResponse.setOpTime(OpTime()); +MemberHeartbeatData::MemberHeartbeatData() + : _health(-1), _upSince(0), _lastHeartbeat(0), _lastHeartbeatRecv(0), _authIssue(false) { + _lastResponse.setState(MemberState::RS_UNKNOWN); + _lastResponse.setElectionTime(OpTime()); + _lastResponse.setOpTime(OpTime()); +} + +void MemberHeartbeatData::setUpValues(Date_t now, + const HostAndPort& host, + ReplSetHeartbeatResponse hbResponse) { + _health = 1; + if (_upSince == 0) { + _upSince = now; } - - void MemberHeartbeatData::setUpValues(Date_t now, - const HostAndPort& host, - ReplSetHeartbeatResponse hbResponse) { - _health = 1; - if (_upSince == 0) { - _upSince = now; - } - _authIssue = false; - _lastHeartbeat = now; - if (!hbResponse.hasState()) { - hbResponse.setState(MemberState::RS_UNKNOWN); - } - if (!hbResponse.hasElectionTime()) { - hbResponse.setElectionTime(_lastResponse.getElectionTime()); - } - if (!hbResponse.hasOpTime()) { - hbResponse.setOpTime(_lastResponse.getOpTime()); - } - - // Log if the state changes - if (_lastResponse.getState() != hbResponse.getState()){ - log() << "Member " << host.toString() << " is now in state " - << hbResponse.getState().toString() << rsLog; - } - - _lastResponse = hbResponse; + _authIssue = false; + _lastHeartbeat = now; + if (!hbResponse.hasState()) { + hbResponse.setState(MemberState::RS_UNKNOWN); } - - void MemberHeartbeatData::setDownValues(Date_t now, const std::string& heartbeatMessage) { - - _health = 0; - _upSince = 0; - _lastHeartbeat = now; - _authIssue = false; - - _lastResponse = ReplSetHeartbeatResponse(); - _lastResponse.setState(MemberState::RS_DOWN); - _lastResponse.setElectionTime(OpTime()); - 
_lastResponse.setOpTime(OpTime());
-        _lastResponse.setHbMsg(heartbeatMessage);
-        _lastResponse.setSyncingTo("");
+    if (!hbResponse.hasElectionTime()) {
+        hbResponse.setElectionTime(_lastResponse.getElectionTime());
+    }
+    if (!hbResponse.hasOpTime()) {
+        hbResponse.setOpTime(_lastResponse.getOpTime());
    }

-    void MemberHeartbeatData::setAuthIssue(Date_t now) {
-        _health = 0; // set health to 0 so that this doesn't count towards majority.
-        _upSince = 0;
-        _lastHeartbeat = now;
-        _authIssue = true;
-
-        _lastResponse = ReplSetHeartbeatResponse();
-        _lastResponse.setState(MemberState::RS_UNKNOWN);
-        _lastResponse.setElectionTime(OpTime());
-        _lastResponse.setOpTime(OpTime());
-        _lastResponse.setHbMsg("");
-        _lastResponse.setSyncingTo("");
+    // Log if the state changes
+    if (_lastResponse.getState() != hbResponse.getState()) {
+        log() << "Member " << host.toString() << " is now in state "
+              << hbResponse.getState().toString() << rsLog;
    }
-} // namespace repl
-} // namespace mongo

+    _lastResponse = hbResponse;
+}
+
+void MemberHeartbeatData::setDownValues(Date_t now, const std::string& heartbeatMessage) {
+    _health = 0;
+    _upSince = 0;
+    _lastHeartbeat = now;
+    _authIssue = false;
+
+    _lastResponse = ReplSetHeartbeatResponse();
+    _lastResponse.setState(MemberState::RS_DOWN);
+    _lastResponse.setElectionTime(OpTime());
+    _lastResponse.setOpTime(OpTime());
+    _lastResponse.setHbMsg(heartbeatMessage);
+    _lastResponse.setSyncingTo("");
+}
+
+void MemberHeartbeatData::setAuthIssue(Date_t now) {
+    _health = 0;  // set health to 0 so that this doesn't count towards majority.
+    _upSince = 0;
+    _lastHeartbeat = now;
+    _authIssue = true;
+
+    _lastResponse = ReplSetHeartbeatResponse();
+    _lastResponse.setState(MemberState::RS_UNKNOWN);
+    _lastResponse.setElectionTime(OpTime());
+    _lastResponse.setOpTime(OpTime());
+    _lastResponse.setHbMsg("");
+    _lastResponse.setSyncingTo("");
+}
+
+}  // namespace repl
+}  // namespace mongo
diff --git a/src/mongo/db/repl/member_heartbeat_data.h b/src/mongo/db/repl/member_heartbeat_data.h
index 624c572d33f..c54ffb02385 100644
--- a/src/mongo/db/repl/member_heartbeat_data.h
+++ b/src/mongo/db/repl/member_heartbeat_data.h
@@ -36,77 +36,103 @@
 namespace mongo {
 namespace repl {

+/**
+ * This class contains the data returned from a heartbeat command for one member
+ * of a replica set.
+ **/
+class MemberHeartbeatData {
+public:
+    MemberHeartbeatData();
+
+    MemberState getState() const {
+        return _lastResponse.getState();
+    }
+    int getHealth() const {
+        return _health;
+    }
+    Date_t getUpSince() const {
+        return _upSince;
+    }
+    Date_t getLastHeartbeat() const {
+        return _lastHeartbeat;
+    }
+    Date_t getLastHeartbeatRecv() const {
+        return _lastHeartbeatRecv;
+    }
+    void setLastHeartbeatRecv(Date_t newHeartbeatRecvTime) {
+        _lastHeartbeatRecv = newHeartbeatRecvTime;
+    }
+    const std::string& getLastHeartbeatMsg() const {
+        return _lastResponse.getHbMsg();
+    }
+    const std::string& getSyncSource() const {
+        return _lastResponse.getSyncingTo();
+    }
+    OpTime getOpTime() const {
+        return _lastResponse.getOpTime();
+    }
+    int getConfigVersion() const {
+        return _lastResponse.getVersion();
+    }
+    bool hasAuthIssue() const {
+        return _authIssue;
+    }
+
+    OpTime getElectionTime() const {
+        return _lastResponse.getElectionTime();
+    }
+
+    // Returns true if the last heartbeat data explicitly stated that the node
+    // is not electable.
+    bool isUnelectable() const {
+        return _lastResponse.hasIsElectable() && !_lastResponse.isElectable();
+    }
+
+    // Was this member up for the last heartbeat?
+    bool up() const {
+        return _health > 0;
+    }
+    // Was this member up for the last heartbeat
+    // (or we haven't received the first heartbeat yet)
+    bool maybeUp() const {
+        return _health != 0;
+    }
+
+    /**
+     * Sets values in this object from the results of a successful heartbeat command.
+     */
+    void setUpValues(Date_t now, const HostAndPort& host, ReplSetHeartbeatResponse hbResponse);
+
+    /**
+     * Sets values in this object from the results of an erroring/failed heartbeat command.
+     * _authIssue is set to false, _health is set to 0, _state is set to RS_DOWN, and
+     * other values are set as specified.
+     */
+    void setDownValues(Date_t now, const std::string& heartbeatMessage);
+
     /**
-     * This class contains the data returned from a heartbeat command for one member
-     * of a replica set.
-     **/
-    class MemberHeartbeatData {
-    public:
-        MemberHeartbeatData();
-
-        MemberState getState() const { return _lastResponse.getState(); }
-        int getHealth() const { return _health; }
-        Date_t getUpSince() const { return _upSince; }
-        Date_t getLastHeartbeat() const { return _lastHeartbeat; }
-        Date_t getLastHeartbeatRecv() const { return _lastHeartbeatRecv; }
-        void setLastHeartbeatRecv(Date_t newHeartbeatRecvTime) {
-            _lastHeartbeatRecv = newHeartbeatRecvTime;
-        }
-        const std::string& getLastHeartbeatMsg() const { return _lastResponse.getHbMsg(); }
-        const std::string& getSyncSource() const { return _lastResponse.getSyncingTo(); }
-        OpTime getOpTime() const { return _lastResponse.getOpTime(); }
-        int getConfigVersion() const { return _lastResponse.getVersion(); }
-        bool hasAuthIssue() const { return _authIssue; }
-
-        OpTime getElectionTime() const { return _lastResponse.getElectionTime(); }
-
-        // Returns true if the last heartbeat data explicitly stated that the node
-        // is not electable.
-        bool isUnelectable() const {
-            return _lastResponse.hasIsElectable() && !_lastResponse.isElectable();
-        }
-
-        // Was this member up for the last heartbeat?
-        bool up() const { return _health > 0; }
-        // Was this member up for the last heartbeat
-        // (or we haven't received the first heartbeat yet)
-        bool maybeUp() const { return _health != 0; }
-
-        /**
-         * Sets values in this object from the results of a successful heartbeat command.
-         */
-        void setUpValues(Date_t now, const HostAndPort& host, ReplSetHeartbeatResponse hbResponse);
-
-        /**
-         * Sets values in this object from the results of an erroring/failed heartbeat command.
-         * _authIssue is set to false, _health is set to 0, _state is set to RS_DOWN, and
-         * other values are set as specified.
-         */
-        void setDownValues(Date_t now, const std::string& heartbeatMessage);
-
-        /**
-         * Sets values in this object that indicate there was an auth issue on the last heartbeat
-         * command.
-         */
-        void setAuthIssue(Date_t now);
-
-    private:
-        // -1 = not checked yet, 0 = member is down/unreachable, 1 = member is up
-        int _health;
-
-        // Time of first successful heartbeat, if currently still up
-        Date_t _upSince;
-        // This is the last time we got a response from a heartbeat request to a given member.
-        Date_t _lastHeartbeat;
-        // This is the last time we got a heartbeat request from a given member.
-        Date_t _lastHeartbeatRecv;
-
-        // Did the last heartbeat show a failure to authenticate?
-        bool _authIssue;
-
-        // The last heartbeat response we received.
-        ReplSetHeartbeatResponse _lastResponse;
-    };
-
-} // namespace repl
-} // namespace mongo
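[Editor's aside (illustrative only, not part of this commit): a sketch of how a
caller typically drives the setters declared above; the variable names are
invented and this is not the actual heartbeat scheduling code:

    MemberHeartbeatData hbData;
    if (hbStatus.isOK()) {
        hbData.setUpValues(now, target, hbResponse);   // _health becomes 1
    } else {
        hbData.setDownValues(now, hbStatus.reason());  // _health becomes 0
    }
    // Before any response arrives, _health is still -1 (see the constructor in
    // member_heartbeat_data.cpp above), so maybeUp() is true but up() is not.
End of aside.]
+    /**
+     * Sets values in this object that indicate there was an auth issue on the last heartbeat
+     * command.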
+ */ + void setAuthIssue(Date_t now); + +private: + // -1 = not checked yet, 0 = member is down/unreachable, 1 = member is up + int _health; + + // Time of first successful heartbeat, if currently still up + Date_t _upSince; + // This is the last time we got a response from a heartbeat request to a given member. + Date_t _lastHeartbeat; + // This is the last time we got a heartbeat request from a given member. + Date_t _lastHeartbeatRecv; + + // Did the last heartbeat show a failure to authenticate? + bool _authIssue; + + // The last heartbeat response we received. + ReplSetHeartbeatResponse _lastResponse; +}; + +} // namespace repl +} // namespace mongo diff --git a/src/mongo/db/repl/member_state.h b/src/mongo/db/repl/member_state.h index c3e3ffd292b..4adf7516845 100644 --- a/src/mongo/db/repl/member_state.h +++ b/src/mongo/db/repl/member_state.h @@ -36,65 +36,97 @@ namespace mongo { namespace repl { - /* - RS_STARTUP serving still starting up, or still trying to initiate the set - RS_PRIMARY this server thinks it is primary - RS_SECONDARY this server thinks it is a secondary (slave mode) - RS_RECOVERING recovering/resyncing; after recovery usually auto-transitions to secondary - RS_STARTUP2 loaded config, still determining who is primary +/* + RS_STARTUP serving still starting up, or still trying to initiate the set + RS_PRIMARY this server thinks it is primary + RS_SECONDARY this server thinks it is a secondary (slave mode) + RS_RECOVERING recovering/resyncing; after recovery usually auto-transitions to secondary + RS_STARTUP2 loaded config, still determining who is primary - State -> integer mappings are reserved forever. Do not change them or delete them, except - to update RS_MAX when introducing new states. - */ - struct MemberState { - enum MS { - RS_STARTUP = 0, - RS_PRIMARY = 1, - RS_SECONDARY = 2, - RS_RECOVERING = 3, - RS_STARTUP2 = 5, - RS_UNKNOWN = 6, /* remote node not yet reached */ - RS_ARBITER = 7, - RS_DOWN = 8, /* node not reachable for a report */ - RS_ROLLBACK = 9, - RS_REMOVED = 10, /* node removed from replica set */ - RS_MAX = 10 - } s; + State -> integer mappings are reserved forever. Do not change them or delete them, except + to update RS_MAX when introducing new states. 
+*/ +struct MemberState { + enum MS { + RS_STARTUP = 0, + RS_PRIMARY = 1, + RS_SECONDARY = 2, + RS_RECOVERING = 3, + RS_STARTUP2 = 5, + RS_UNKNOWN = 6, /* remote node not yet reached */ + RS_ARBITER = 7, + RS_DOWN = 8, /* node not reachable for a report */ + RS_ROLLBACK = 9, + RS_REMOVED = 10, /* node removed from replica set */ + RS_MAX = 10 + } s; - MemberState(MS ms = RS_UNKNOWN) : s(ms) { } - explicit MemberState(int ms) : s((MS) ms) { } + MemberState(MS ms = RS_UNKNOWN) : s(ms) {} + explicit MemberState(int ms) : s((MS)ms) {} - bool startup() const { return s == RS_STARTUP; } - bool primary() const { return s == RS_PRIMARY; } - bool secondary() const { return s == RS_SECONDARY; } - bool recovering() const { return s == RS_RECOVERING; } - bool startup2() const { return s == RS_STARTUP2; } - bool rollback() const { return s == RS_ROLLBACK; } - bool readable() const { return s == RS_PRIMARY || s == RS_SECONDARY; } - bool removed() const { return s == RS_REMOVED; } - bool arbiter() const { return s == RS_ARBITER; } + bool startup() const { + return s == RS_STARTUP; + } + bool primary() const { + return s == RS_PRIMARY; + } + bool secondary() const { + return s == RS_SECONDARY; + } + bool recovering() const { + return s == RS_RECOVERING; + } + bool startup2() const { + return s == RS_STARTUP2; + } + bool rollback() const { + return s == RS_ROLLBACK; + } + bool readable() const { + return s == RS_PRIMARY || s == RS_SECONDARY; + } + bool removed() const { + return s == RS_REMOVED; + } + bool arbiter() const { + return s == RS_ARBITER; + } - std::string toString() const; + std::string toString() const; - bool operator==(const MemberState& r) const { return s == r.s; } - bool operator!=(const MemberState& r) const { return s != r.s; } - }; + bool operator==(const MemberState& r) const { + return s == r.s; + } + bool operator!=(const MemberState& r) const { + return s != r.s; + } +}; - inline std::string MemberState::toString() const { - switch ( s ) { - case RS_STARTUP: return "STARTUP"; - case RS_PRIMARY: return "PRIMARY"; - case RS_SECONDARY: return "SECONDARY"; - case RS_RECOVERING: return "RECOVERING"; - case RS_STARTUP2: return "STARTUP2"; - case RS_ARBITER: return "ARBITER"; - case RS_DOWN: return "DOWN"; - case RS_ROLLBACK: return "ROLLBACK"; - case RS_UNKNOWN: return "UNKNOWN"; - case RS_REMOVED: return "REMOVED"; - } - return ""; +inline std::string MemberState::toString() const { + switch (s) { + case RS_STARTUP: + return "STARTUP"; + case RS_PRIMARY: + return "PRIMARY"; + case RS_SECONDARY: + return "SECONDARY"; + case RS_RECOVERING: + return "RECOVERING"; + case RS_STARTUP2: + return "STARTUP2"; + case RS_ARBITER: + return "ARBITER"; + case RS_DOWN: + return "DOWN"; + case RS_ROLLBACK: + return "ROLLBACK"; + case RS_UNKNOWN: + return "UNKNOWN"; + case RS_REMOVED: + return "REMOVED"; } + return ""; +} -} // namespace repl -} // namespace mongo +} // namespace repl +} // namespace mongo diff --git a/src/mongo/db/repl/minvalid.cpp b/src/mongo/db/repl/minvalid.cpp index 18235cc178c..ec39364ea13 100644 --- a/src/mongo/db/repl/minvalid.cpp +++ b/src/mongo/db/repl/minvalid.cpp @@ -45,61 +45,65 @@ namespace mongo { namespace repl { namespace { - const char* initialSyncFlagString = "doingInitialSync"; - const BSONObj initialSyncFlag(BSON(initialSyncFlagString << true)); - const char* minvalidNS = "local.replset.minvalid"; -} // namespace +const char* initialSyncFlagString = "doingInitialSync"; +const BSONObj initialSyncFlag(BSON(initialSyncFlagString << true)); +const char* minvalidNS = 
"local.replset.minvalid"; +} // namespace - void clearInitialSyncFlag(OperationContext* txn) { - MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN { - ScopedTransaction transaction(txn, MODE_IX); - Lock::DBLock lk(txn->lockState(), "local", MODE_X); - Helpers::putSingleton(txn, minvalidNS, BSON("$unset" << initialSyncFlag)); - } MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "clearInitialSyncFlags", minvalidNS); +void clearInitialSyncFlag(OperationContext* txn) { + MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN { + ScopedTransaction transaction(txn, MODE_IX); + Lock::DBLock lk(txn->lockState(), "local", MODE_X); + Helpers::putSingleton(txn, minvalidNS, BSON("$unset" << initialSyncFlag)); } + MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "clearInitialSyncFlags", minvalidNS); +} - void setInitialSyncFlag(OperationContext* txn) { - MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN { - ScopedTransaction transaction(txn, MODE_IX); - Lock::DBLock lk(txn->lockState(), "local", MODE_X); - Helpers::putSingleton(txn, minvalidNS, BSON("$set" << initialSyncFlag)); - } MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "setInitialSyncFlags", minvalidNS); +void setInitialSyncFlag(OperationContext* txn) { + MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN { + ScopedTransaction transaction(txn, MODE_IX); + Lock::DBLock lk(txn->lockState(), "local", MODE_X); + Helpers::putSingleton(txn, minvalidNS, BSON("$set" << initialSyncFlag)); } + MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "setInitialSyncFlags", minvalidNS); +} - bool getInitialSyncFlag() { - OperationContextImpl txn; - MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN { - ScopedTransaction transaction(&txn, MODE_IX); - Lock::DBLock lk(txn.lockState(), "local", MODE_X); - BSONObj mv; - bool found = Helpers::getSingleton( &txn, minvalidNS, mv); - if (found) { - return mv[initialSyncFlagString].trueValue(); - } - return false; - } MONGO_WRITE_CONFLICT_RETRY_LOOP_END(&txn, "getInitialSyncFlags", minvalidNS); +bool getInitialSyncFlag() { + OperationContextImpl txn; + MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN { + ScopedTransaction transaction(&txn, MODE_IX); + Lock::DBLock lk(txn.lockState(), "local", MODE_X); + BSONObj mv; + bool found = Helpers::getSingleton(&txn, minvalidNS, mv); + if (found) { + return mv[initialSyncFlagString].trueValue(); + } + return false; } + MONGO_WRITE_CONFLICT_RETRY_LOOP_END(&txn, "getInitialSyncFlags", minvalidNS); +} - void setMinValid(OperationContext* ctx, OpTime ts) { - MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN { - ScopedTransaction transaction(ctx, MODE_IX); - Lock::DBLock lk(ctx->lockState(), "local", MODE_X); - Helpers::putSingleton(ctx, minvalidNS, BSON("$set" << BSON("ts" << ts))); - } MONGO_WRITE_CONFLICT_RETRY_LOOP_END(ctx, "setMinValid", minvalidNS); +void setMinValid(OperationContext* ctx, OpTime ts) { + MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN { + ScopedTransaction transaction(ctx, MODE_IX); + Lock::DBLock lk(ctx->lockState(), "local", MODE_X); + Helpers::putSingleton(ctx, minvalidNS, BSON("$set" << BSON("ts" << ts))); } + MONGO_WRITE_CONFLICT_RETRY_LOOP_END(ctx, "setMinValid", minvalidNS); +} - OpTime getMinValid(OperationContext* txn) { - MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN { - ScopedTransaction transaction(txn, MODE_IS); - Lock::DBLock lk(txn->lockState(), "local", MODE_S); - BSONObj mv; - bool found = Helpers::getSingleton(txn, minvalidNS, mv); - if (found) { - return mv["ts"]._opTime(); - } - return OpTime(); - } MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "getMinValid", minvalidNS); +OpTime getMinValid(OperationContext* txn) { + MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN { + ScopedTransaction 
transaction(txn, MODE_IS); + Lock::DBLock lk(txn->lockState(), "local", MODE_S); + BSONObj mv; + bool found = Helpers::getSingleton(txn, minvalidNS, mv); + if (found) { + return mv["ts"]._opTime(); + } + return OpTime(); } - + MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "getMinValid", minvalidNS); +} } } diff --git a/src/mongo/db/repl/minvalid.h b/src/mongo/db/repl/minvalid.h index 7bbe7c39c69..36edeafad6b 100644 --- a/src/mongo/db/repl/minvalid.h +++ b/src/mongo/db/repl/minvalid.h @@ -29,39 +29,39 @@ #pragma once namespace mongo { - class BSONObj; - class OperationContext; - class OpTime; +class BSONObj; +class OperationContext; +class OpTime; namespace repl { - /** - * Helper functions for maintaining local.replset.minvalid collection contents. - * - * When a member reaches its minValid optime it is in a consistent state. Thus, minValid is - * set as the last step in initial sync. At the beginning of initial sync, _initialSyncFlag - * is appended onto minValid to indicate that initial sync was started but has not yet - * completed. - * minValid is also used during "normal" sync: the last op in each batch is used to set - * minValid, to indicate that we are in a consistent state when the batch has been fully - * applied. - */ +/** + * Helper functions for maintaining local.replset.minvalid collection contents. + * + * When a member reaches its minValid optime it is in a consistent state. Thus, minValid is + * set as the last step in initial sync. At the beginning of initial sync, _initialSyncFlag + * is appended onto minValid to indicate that initial sync was started but has not yet + * completed. + * minValid is also used during "normal" sync: the last op in each batch is used to set + * minValid, to indicate that we are in a consistent state when the batch has been fully + * applied. + */ - /** - * The initial sync flag is used to durably record the state of an initial sync; its boolean - * value is true when an initial sync is in progress and hasn't yet completed. The flag - * is stored as part of the local.replset.minvalid collection. - */ - void clearInitialSyncFlag(OperationContext* txn); - void setInitialSyncFlag(OperationContext* txn); - bool getInitialSyncFlag(); +/** + * The initial sync flag is used to durably record the state of an initial sync; its boolean + * value is true when an initial sync is in progress and hasn't yet completed. The flag + * is stored as part of the local.replset.minvalid collection. + */ +void clearInitialSyncFlag(OperationContext* txn); +void setInitialSyncFlag(OperationContext* txn); +bool getInitialSyncFlag(); - /** - * The minValid optime value is the earliest (minimum) OpTime that must be applied in order to - * consider the dataset consistent. Do not allow client reads if our last applied operation is - * before the minValid time. - */ - void setMinValid(OperationContext* ctx, OpTime ts); - OpTime getMinValid(OperationContext* txn); +/** + * The minValid optime value is the earliest (minimum) OpTime that must be applied in order to + * consider the dataset consistent. Do not allow client reads if our last applied operation is + * before the minValid time. 
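[Editor's aside (illustrative only, not part of this commit): taken together,
the helpers declared in this header bracket an initial sync roughly as follows;
the variable names are invented and the exact ordering in the sync code may
differ:

    setInitialSyncFlag(txn);          // durably mark: data not yet consistent
    // ... clone databases and apply oplog entries ...
    setMinValid(txn, lastOpApplied);  // consistent once applied through here
    clearInitialSyncFlag(txn);        // initial sync complete
End of aside.]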
+ */ +void setMinValid(OperationContext* ctx, OpTime ts); +OpTime getMinValid(OperationContext* txn); } } diff --git a/src/mongo/db/repl/multicmd.cpp b/src/mongo/db/repl/multicmd.cpp index cb0a41345f6..706645d9796 100644 --- a/src/mongo/db/repl/multicmd.cpp +++ b/src/mongo/db/repl/multicmd.cpp @@ -39,17 +39,16 @@ namespace mongo { namespace repl { - void _MultiCommandJob::run() { - try { - ScopedConn c(d.toHost); - LOG(1) << "multiCommand running on host " << d.toHost; - d.ok = c.runCommand("admin", cmd, d.result); - LOG(1) << "multiCommand response: " << d.result; - } - catch (const DBException& e) { - LOG(1) << "dev caught " << e.what() << " on multiCommand to " << d.toHost; - } +void _MultiCommandJob::run() { + try { + ScopedConn c(d.toHost); + LOG(1) << "multiCommand running on host " << d.toHost; + d.ok = c.runCommand("admin", cmd, d.result); + LOG(1) << "multiCommand response: " << d.result; + } catch (const DBException& e) { + LOG(1) << "dev caught " << e.what() << " on multiCommand to " << d.toHost; } +} } // namespace repl } // namespace mongo diff --git a/src/mongo/db/repl/multicmd.h b/src/mongo/db/repl/multicmd.h index fa4519d4f68..677f3da481b 100644 --- a/src/mongo/db/repl/multicmd.h +++ b/src/mongo/db/repl/multicmd.h @@ -39,46 +39,49 @@ namespace mongo { namespace repl { - struct Target { - Target(std::string hostport) : toHost(hostport), ok(false) { } - //Target() : ok(false) { } - const std::string toHost; - bool ok; - BSONObj result; - }; +struct Target { + Target(std::string hostport) : toHost(hostport), ok(false) {} + // Target() : ok(false) { } + const std::string toHost; + bool ok; + BSONObj result; +}; - /** send a command to several servers in parallel. waits for all to complete before - returning. - - in: Target::toHost - out: Target::result and Target::ok - */ - void multiCommand(BSONObj cmd, std::list<Target>& L); +/** send a command to several servers in parallel. waits for all to complete before + returning. 
- class _MultiCommandJob : public BackgroundJob { - public: - BSONObj& cmd; - Target& d; - _MultiCommandJob(BSONObj& _cmd, Target& _d) : cmd(_cmd), d(_d) { } + in: Target::toHost + out: Target::result and Target::ok +*/ +void multiCommand(BSONObj cmd, std::list<Target>& L); + +class _MultiCommandJob : public BackgroundJob { +public: + BSONObj& cmd; + Target& d; + _MultiCommandJob(BSONObj& _cmd, Target& _d) : cmd(_cmd), d(_d) {} - private: - std::string name() const { return "MultiCommandJob"; } - void run(); - }; +private: + std::string name() const { + return "MultiCommandJob"; + } + void run(); +}; - inline void multiCommand(BSONObj cmd, std::list<Target>& L) { - std::list< boost::shared_ptr<BackgroundJob> > jobs; +inline void multiCommand(BSONObj cmd, std::list<Target>& L) { + std::list<boost::shared_ptr<BackgroundJob>> jobs; - for( std::list<Target>::iterator i = L.begin(); i != L.end(); i++ ) { - Target& d = *i; - _MultiCommandJob *j = new _MultiCommandJob(cmd, d); - jobs.push_back( boost::shared_ptr<BackgroundJob>(j) ); - j->go(); - } + for (std::list<Target>::iterator i = L.begin(); i != L.end(); i++) { + Target& d = *i; + _MultiCommandJob* j = new _MultiCommandJob(cmd, d); + jobs.push_back(boost::shared_ptr<BackgroundJob>(j)); + j->go(); + } - for( std::list< boost::shared_ptr<BackgroundJob> >::iterator i = jobs.begin(); i != jobs.end(); i++ ) { - (*i)->wait(); - } + for (std::list<boost::shared_ptr<BackgroundJob>>::iterator i = jobs.begin(); i != jobs.end(); + i++) { + (*i)->wait(); } -} // namespace repl -} // namespace mongo +} +} // namespace repl +} // namespace mongo diff --git a/src/mongo/db/repl/network_interface_impl.cpp b/src/mongo/db/repl/network_interface_impl.cpp index fa758c7606f..754714c4614 100644 --- a/src/mongo/db/repl/network_interface_impl.cpp +++ b/src/mongo/db/repl/network_interface_impl.cpp @@ -56,594 +56,573 @@ namespace repl { namespace { - const size_t kMinThreads = 1; - const size_t kMaxThreads = 51; // Set to 1 + max repl set size, for heartbeat + wiggle room. - const Seconds kMaxIdleThreadAge(30); - const Seconds kMaxConnectionAge(30); +const size_t kMinThreads = 1; +const size_t kMaxThreads = 51; // Set to 1 + max repl set size, for heartbeat + wiggle room. +const Seconds kMaxIdleThreadAge(30); +const Seconds kMaxConnectionAge(30); } // namespace - /** - * Private pool of connections used by the network interface. - * - * Methods of the pool may be called from any thread, as they are synchronized internally. - */ - class NetworkInterfaceImpl::ConnectionPool { - MONGO_DISALLOW_COPYING(ConnectionPool); - public: - struct ConnectionInfo; +/** + * Private pool of connections used by the network interface. + * + * Methods of the pool may be called from any thread, as they are synchronized internally. + */ +class NetworkInterfaceImpl::ConnectionPool { + MONGO_DISALLOW_COPYING(ConnectionPool); - typedef stdx::list<ConnectionInfo> ConnectionList; - typedef unordered_map<HostAndPort, ConnectionList> HostConnectionMap; +public: + struct ConnectionInfo; - /** - * RAII class for connections from the pool. To use the connection pool, instantiate one of - * these with a pointer to the pool, the identity of the target node and the timeout for - * network operations, use it like a pointer to a connection, and then call done() on - * successful completion. Failure to call done() will lead to the connection being reaped - * when the holder goes out of scope. 
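-         *
-         * A sketch of the intended pattern (hypothetical caller; _runCommand in this file
-         * follows exactly this shape):
-         *
-         *     ConnectionPtr conn(_connPool.get(), target, now(), Milliseconds(5000));
-         *     BSONObj reply;
-         *     conn->runCommand("admin", cmdObj, reply);  // use like a connection pointer
-         *     conn.done(now());  // return the connection to the pool; skipped on a throw,
-         *                        // so the destructor reaps the connection instead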
- */ - class ConnectionPtr { - MONGO_DISALLOW_COPYING(ConnectionPtr); - public: - /** - * Constructs a ConnectionPtr referring to a connection to "target" drawn from "pool", - * with the network timeout set to "timeout". - * - * Throws DBExceptions if the connection cannot be established. - */ - ConnectionPtr(ConnectionPool* pool, - const HostAndPort& target, - Date_t now, - Milliseconds timeout) : - _pool(pool), _connInfo(pool->acquireConnection(target, now, timeout)) {} - - /** - * Destructor reaps the connection if it wasn't already returned to the pool by calling - * done(). - */ - ~ConnectionPtr() { if (_pool) _pool->destroyConnection(_connInfo); } - - /** - * Releases the connection back to the pool from which it was drawn. - */ - void done(Date_t now) { _pool->releaseConnection(_connInfo, now); _pool = NULL; } - - DBClientConnection& operator*(); - DBClientConnection* operator->(); - - private: - ConnectionPool* _pool; - const ConnectionList::iterator _connInfo; - }; - - ConnectionPool(); - ~ConnectionPool(); + typedef stdx::list<ConnectionInfo> ConnectionList; + typedef unordered_map<HostAndPort, ConnectionList> HostConnectionMap; - /** - * Acquires a connection to "target" with the given "timeout", or throws a DBException. - * Intended for use by ConnectionPtr. - */ - ConnectionList::iterator acquireConnection( - const HostAndPort& target, Date_t now, Milliseconds timeout); + /** + * RAII class for connections from the pool. To use the connection pool, instantiate one of + * these with a pointer to the pool, the identity of the target node and the timeout for + * network operations, use it like a pointer to a connection, and then call done() on + * successful completion. Failure to call done() will lead to the connection being reaped + * when the holder goes out of scope. + */ + class ConnectionPtr { + MONGO_DISALLOW_COPYING(ConnectionPtr); + public: /** - * Releases a connection back into the pool. - * Intended for use by ConnectionPtr. - * Call this for connections that can safely be reused. + * Constructs a ConnectionPtr referring to a connection to "target" drawn from "pool", + * with the network timeout set to "timeout". + * + * Throws DBExceptions if the connection cannot be established. */ - void releaseConnection(ConnectionList::iterator iter, Date_t now); + ConnectionPtr(ConnectionPool* pool, + const HostAndPort& target, + Date_t now, + Milliseconds timeout) + : _pool(pool), _connInfo(pool->acquireConnection(target, now, timeout)) {} /** - * Destroys a connection previously acquired from the pool. - * Intended for use by ConnectionPtr. - * Call this for connections that cannot be reused. + * Destructor reaps the connection if it wasn't already returned to the pool by calling + * done(). */ - void destroyConnection(ConnectionList::iterator); + ~ConnectionPtr() { + if (_pool) + _pool->destroyConnection(_connInfo); + } /** - * Closes all connections currently in use, to ensure that the network threads - * terminate promptly during shutdown. + * Releases the connection back to the pool from which it was drawn. */ - void closeAllInUseConnections(); + void done(Date_t now) { + _pool->releaseConnection(_connInfo, now); + _pool = NULL; + } - /** - * Reaps all connections in the pool that are too old as of "now". - */ - void cleanUpOlderThan(Date_t now); + DBClientConnection& operator*(); + DBClientConnection* operator->(); private: - /** - * Returns true if the given connection is young enough to keep in the pool. 
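-         * For example, with kMaxConnectionAge = 30 seconds, a connection created at time t
-         * is kept while "now" is earlier than t + 30000 ms and reaped from then on.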
- */
-        bool shouldKeepConnection(Date_t now, const ConnectionInfo& connInfo) const;
-
-        /**
-         * Implementation of cleanUpOlderThan which assumes that _mutex is already held.
-         */
-        void cleanUpOlderThan_inlock(Date_t now);
+        ConnectionPool* _pool;
+        const ConnectionList::iterator _connInfo;
+    };

-        /**
-         * Reaps connections in "hostConns" that are too old or have been in the pool too long as of
-         * "now". Expects _mutex to be held.
-         */
-        void cleanUpOlderThan_inlock(Date_t now, ConnectionList* hostConns);
+    ConnectionPool();
+    ~ConnectionPool();

-        /**
-         * Destroys the connection associated with "iter" and removes "iter" from connList.
-         */
-        static void destroyConnection_inlock(ConnectionList* connList,
-                                             ConnectionList::iterator iter);
+    /**
+     * Acquires a connection to "target" with the given "timeout", or throws a DBException.
+     * Intended for use by ConnectionPtr.
+     */
+    ConnectionList::iterator acquireConnection(const HostAndPort& target,
+                                               Date_t now,
+                                               Milliseconds timeout);

-        // Mutex guarding members of the connection pool
-        boost::mutex _mutex;
+    /**
+     * Releases a connection back into the pool.
+     * Intended for use by ConnectionPtr.
+     * Call this for connections that can safely be reused.
+     */
+    void releaseConnection(ConnectionList::iterator iter, Date_t now);

-        // Map from HostAndPort to idle connections.
-        HostConnectionMap _connections;
+    /**
+     * Destroys a connection previously acquired from the pool.
+     * Intended for use by ConnectionPtr.
+     * Call this for connections that cannot be reused.
+     */
+    void destroyConnection(ConnectionList::iterator);

-        // List of non-idle connections.
-        ConnectionList _inUseConnections;
-    };
+    /**
+     * Closes all connections currently in use, to ensure that the network threads
+     * terminate promptly during shutdown.
+     */
+    void closeAllInUseConnections();

    /**
-     * Information about a connection in the pool.
+     * Reaps all connections in the pool that are too old as of "now".
     */
-    struct NetworkInterfaceImpl::ConnectionPool::ConnectionInfo {
-        ConnectionInfo() : conn(NULL), creationDate(0ULL) {}
-        ConnectionInfo(DBClientConnection* theConn, Date_t date) :
-            conn(theConn),
-            creationDate(date) {}
+    void cleanUpOlderThan(Date_t now);

-        // A connection in the pool.
-        DBClientConnection* conn;
+private:
+    /**
+     * Returns true if the given connection is young enough to keep in the pool.
+     */
+    bool shouldKeepConnection(Date_t now, const ConnectionInfo& connInfo) const;

-        // The date at which the connection was created.
-        Date_t creationDate;
-    };
+    /**
+     * Implementation of cleanUpOlderThan which assumes that _mutex is already held.
+     */
+    void cleanUpOlderThan_inlock(Date_t now);

-    DBClientConnection& NetworkInterfaceImpl::ConnectionPool::ConnectionPtr::operator*() {
-        return *_connInfo->conn;
-    }
+    /**
+     * Reaps connections in "hostConns" that are too old or have been in the pool too long as of
+     * "now". Expects _mutex to be held.
+     */
+    void cleanUpOlderThan_inlock(Date_t now, ConnectionList* hostConns);

-    DBClientConnection* NetworkInterfaceImpl::ConnectionPool::ConnectionPtr::operator->() {
-        return _connInfo->conn;
-    }
+    /**
+     * Destroys the connection associated with "iter" and removes "iter" from connList. 
+ */ + static void destroyConnection_inlock(ConnectionList* connList, ConnectionList::iterator iter); - NetworkInterfaceImpl::ConnectionPool::ConnectionPool() {} + // Mutex guarding members of the connection pool + boost::mutex _mutex; - NetworkInterfaceImpl::ConnectionPool::~ConnectionPool() { - cleanUpOlderThan(Date_t(~0ULL)); - invariant(_connections.empty()); - invariant(_inUseConnections.empty()); - } + // Map from HostAndPort to idle connections. + HostConnectionMap _connections; - void NetworkInterfaceImpl::ConnectionPool::cleanUpOlderThan(Date_t now) { - boost::lock_guard<boost::mutex> lk(_mutex); - cleanUpOlderThan_inlock(now); - } + // List of non-idle connections. + ConnectionList _inUseConnections; +}; - void NetworkInterfaceImpl::ConnectionPool::cleanUpOlderThan_inlock(Date_t now) { - HostConnectionMap::iterator hostConns = _connections.begin(); - while (hostConns != _connections.end()) { - cleanUpOlderThan_inlock(now, &hostConns->second); - if (hostConns->second.empty()) { - _connections.erase(hostConns++); - } - else { - ++hostConns; - } +/** + * Information about a connection in the pool. + */ +struct NetworkInterfaceImpl::ConnectionPool::ConnectionInfo { + ConnectionInfo() : conn(NULL), creationDate(0ULL) {} + ConnectionInfo(DBClientConnection* theConn, Date_t date) : conn(theConn), creationDate(date) {} + + // A connection in the pool. + DBClientConnection* conn; + + // The date at which the connection was created. + Date_t creationDate; +}; + +DBClientConnection& NetworkInterfaceImpl::ConnectionPool::ConnectionPtr::operator*() { + return *_connInfo->conn; +} + +DBClientConnection* NetworkInterfaceImpl::ConnectionPool::ConnectionPtr::operator->() { + return _connInfo->conn; +} + +NetworkInterfaceImpl::ConnectionPool::ConnectionPool() {} + +NetworkInterfaceImpl::ConnectionPool::~ConnectionPool() { + cleanUpOlderThan(Date_t(~0ULL)); + invariant(_connections.empty()); + invariant(_inUseConnections.empty()); +} + +void NetworkInterfaceImpl::ConnectionPool::cleanUpOlderThan(Date_t now) { + boost::lock_guard<boost::mutex> lk(_mutex); + cleanUpOlderThan_inlock(now); +} + +void NetworkInterfaceImpl::ConnectionPool::cleanUpOlderThan_inlock(Date_t now) { + HostConnectionMap::iterator hostConns = _connections.begin(); + while (hostConns != _connections.end()) { + cleanUpOlderThan_inlock(now, &hostConns->second); + if (hostConns->second.empty()) { + _connections.erase(hostConns++); + } else { + ++hostConns; } } - - void NetworkInterfaceImpl::ConnectionPool::cleanUpOlderThan_inlock( - Date_t now, - ConnectionList* hostConns) { - ConnectionList::iterator iter = hostConns->begin(); - while (iter != hostConns->end()) { - if (shouldKeepConnection(now, *iter)) { - ++iter; - } - else { - destroyConnection_inlock(hostConns, iter++); - } +} + +void NetworkInterfaceImpl::ConnectionPool::cleanUpOlderThan_inlock(Date_t now, + ConnectionList* hostConns) { + ConnectionList::iterator iter = hostConns->begin(); + while (iter != hostConns->end()) { + if (shouldKeepConnection(now, *iter)) { + ++iter; + } else { + destroyConnection_inlock(hostConns, iter++); } } +} - bool NetworkInterfaceImpl::ConnectionPool::shouldKeepConnection( - const Date_t now, - const ConnectionInfo& connInfo) const { - - const Date_t expirationDate = - connInfo.creationDate + kMaxConnectionAge.total_milliseconds(); - if (expirationDate <= now) { - return false; - } - return true; +bool NetworkInterfaceImpl::ConnectionPool::shouldKeepConnection( + const Date_t now, const ConnectionInfo& connInfo) const { + const Date_t 
expirationDate = connInfo.creationDate + kMaxConnectionAge.total_milliseconds(); + if (expirationDate <= now) { + return false; } - - void NetworkInterfaceImpl::ConnectionPool::closeAllInUseConnections() { - boost::lock_guard<boost::mutex> lk(_mutex); - for (ConnectionList::iterator iter = _inUseConnections.begin(); - iter != _inUseConnections.end(); - ++iter) { - - iter->conn->port().shutdown(); - } + return true; +} + +void NetworkInterfaceImpl::ConnectionPool::closeAllInUseConnections() { + boost::lock_guard<boost::mutex> lk(_mutex); + for (ConnectionList::iterator iter = _inUseConnections.begin(); iter != _inUseConnections.end(); + ++iter) { + iter->conn->port().shutdown(); } - - NetworkInterfaceImpl::ConnectionPool::ConnectionList::iterator - NetworkInterfaceImpl::ConnectionPool::acquireConnection( - const HostAndPort& target, - Date_t now, - Milliseconds timeout) { - boost::unique_lock<boost::mutex> lk(_mutex); - for (HostConnectionMap::iterator hostConns; - ((hostConns = _connections.find(target)) != _connections.end());) { - - cleanUpOlderThan_inlock(now, &hostConns->second); - if (hostConns->second.empty()) { - break; - } - _inUseConnections.splice(_inUseConnections.begin(), - hostConns->second, - hostConns->second.begin()); - const ConnectionList::iterator candidate = _inUseConnections.begin(); - lk.unlock(); - try { - if (candidate->conn->isStillConnected()) { - // setSoTimeout takes a double representing the number of seconds for send and - // receive timeouts. Thus, we must take total_milliseconds() and divide by - // 1000.0 to get the number of seconds with a fractional part. - candidate->conn->setSoTimeout(timeout.total_milliseconds() / 1000.0); - return candidate; - } - } - catch (...) { - lk.lock(); - destroyConnection_inlock(&_inUseConnections, candidate); - throw; +} + +NetworkInterfaceImpl::ConnectionPool::ConnectionList::iterator +NetworkInterfaceImpl::ConnectionPool::acquireConnection(const HostAndPort& target, + Date_t now, + Milliseconds timeout) { + boost::unique_lock<boost::mutex> lk(_mutex); + for (HostConnectionMap::iterator hostConns; + ((hostConns = _connections.find(target)) != _connections.end());) { + cleanUpOlderThan_inlock(now, &hostConns->second); + if (hostConns->second.empty()) { + break; + } + _inUseConnections.splice( + _inUseConnections.begin(), hostConns->second, hostConns->second.begin()); + const ConnectionList::iterator candidate = _inUseConnections.begin(); + lk.unlock(); + try { + if (candidate->conn->isStillConnected()) { + // setSoTimeout takes a double representing the number of seconds for send and + // receive timeouts. Thus, we must take total_milliseconds() and divide by + // 1000.0 to get the number of seconds with a fractional part. + candidate->conn->setSoTimeout(timeout.total_milliseconds() / 1000.0); + return candidate; } + } catch (...) { lk.lock(); destroyConnection_inlock(&_inUseConnections, candidate); - } - - // No idle connection in the pool; make a new one. - lk.unlock(); - std::auto_ptr<DBClientConnection> conn(new DBClientConnection); - // setSoTimeout takes a double representing the number of seconds for send and receive - // timeouts. Thus, we must take total_milliseconds() and divide by 1000.0 to get the number - // of seconds with a fractional part. 
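-        // (illustration: a timeout of Milliseconds(1500) becomes setSoTimeout(1.5))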
- conn->setSoTimeout(timeout.total_milliseconds() / 1000.0); - std::string errmsg; - uassert(18915, - str::stream() << "Failed attempt to connect to " << target.toString() << "; " << - errmsg, - conn->connect(target, errmsg)); - conn->port().tag |= ScopedConn::keepOpen; - if (getGlobalAuthorizationManager()->isAuthEnabled()) { - uassert(ErrorCodes::AuthenticationFailed, - "Missing credentials for authenticating as internal user", - isInternalAuthSet()); - conn->auth(getInternalUserAuthParamsWithFallback()); + throw; } lk.lock(); - return _inUseConnections.insert(_inUseConnections.begin(), - ConnectionInfo(conn.release(), now)); + destroyConnection_inlock(&_inUseConnections, candidate); } - void NetworkInterfaceImpl::ConnectionPool::releaseConnection(ConnectionList::iterator iter, - const Date_t now) { - boost::lock_guard<boost::mutex> lk(_mutex); - if (!shouldKeepConnection(now, *iter)) { - destroyConnection_inlock(&_inUseConnections, iter); - return; - } - ConnectionList& hostConns = _connections[iter->conn->getServerHostAndPort()]; - cleanUpOlderThan_inlock(now, &hostConns); - hostConns.splice(hostConns.begin(), _inUseConnections, iter); + // No idle connection in the pool; make a new one. + lk.unlock(); + std::auto_ptr<DBClientConnection> conn(new DBClientConnection); + // setSoTimeout takes a double representing the number of seconds for send and receive + // timeouts. Thus, we must take total_milliseconds() and divide by 1000.0 to get the number + // of seconds with a fractional part. + conn->setSoTimeout(timeout.total_milliseconds() / 1000.0); + std::string errmsg; + uassert(18915, + str::stream() << "Failed attempt to connect to " << target.toString() << "; " << errmsg, + conn->connect(target, errmsg)); + conn->port().tag |= ScopedConn::keepOpen; + if (getGlobalAuthorizationManager()->isAuthEnabled()) { + uassert(ErrorCodes::AuthenticationFailed, + "Missing credentials for authenticating as internal user", + isInternalAuthSet()); + conn->auth(getInternalUserAuthParamsWithFallback()); } - - void NetworkInterfaceImpl::ConnectionPool::destroyConnection(ConnectionList::iterator iter) { - boost::lock_guard<boost::mutex> lk(_mutex); + lk.lock(); + return _inUseConnections.insert(_inUseConnections.begin(), ConnectionInfo(conn.release(), now)); +} + +void NetworkInterfaceImpl::ConnectionPool::releaseConnection(ConnectionList::iterator iter, + const Date_t now) { + boost::lock_guard<boost::mutex> lk(_mutex); + if (!shouldKeepConnection(now, *iter)) { destroyConnection_inlock(&_inUseConnections, iter); + return; } - - void NetworkInterfaceImpl::ConnectionPool::destroyConnection_inlock( - ConnectionList* connList, ConnectionList::iterator iter) { - delete iter->conn; - connList->erase(iter); - } - - NetworkInterfaceImpl::NetworkInterfaceImpl() : - _numIdleThreads(0), - _nextThreadId(0), - _lastFullUtilizationDate(), - _isExecutorRunnable(false), - _inShutdown(false), - _numActiveNetworkRequests(0) { - _connPool.reset(new ConnectionPool()); - } - - NetworkInterfaceImpl::~NetworkInterfaceImpl() { } - - std::string NetworkInterfaceImpl::getDiagnosticString() { - boost::lock_guard<boost::mutex> lk(_mutex); - str::stream output; - output << "NetworkImpl"; - output << " threads:" << _threads.size(); - output << " inShutdown:" << _inShutdown; - output << " active:" << _numActiveNetworkRequests; - output << " pending:" << _pending.size(); - output << " execRunable:" << _isExecutorRunnable; - return output; - + ConnectionList& hostConns = _connections[iter->conn->getServerHostAndPort()]; + 
cleanUpOlderThan_inlock(now, &hostConns); + hostConns.splice(hostConns.begin(), _inUseConnections, iter); +} + +void NetworkInterfaceImpl::ConnectionPool::destroyConnection(ConnectionList::iterator iter) { + boost::lock_guard<boost::mutex> lk(_mutex); + destroyConnection_inlock(&_inUseConnections, iter); +} + +void NetworkInterfaceImpl::ConnectionPool::destroyConnection_inlock(ConnectionList* connList, + ConnectionList::iterator iter) { + delete iter->conn; + connList->erase(iter); +} + +NetworkInterfaceImpl::NetworkInterfaceImpl() + : _numIdleThreads(0), + _nextThreadId(0), + _lastFullUtilizationDate(), + _isExecutorRunnable(false), + _inShutdown(false), + _numActiveNetworkRequests(0) { + _connPool.reset(new ConnectionPool()); +} + +NetworkInterfaceImpl::~NetworkInterfaceImpl() {} + +std::string NetworkInterfaceImpl::getDiagnosticString() { + boost::lock_guard<boost::mutex> lk(_mutex); + str::stream output; + output << "NetworkImpl"; + output << " threads:" << _threads.size(); + output << " inShutdown:" << _inShutdown; + output << " active:" << _numActiveNetworkRequests; + output << " pending:" << _pending.size(); + output << " execRunable:" << _isExecutorRunnable; + return output; +} + +void NetworkInterfaceImpl::_startNewNetworkThread_inlock() { + if (_inShutdown) { + LOG(1) << "Not starting new replication networking thread while shutting down replication."; + return; } - - void NetworkInterfaceImpl::_startNewNetworkThread_inlock() { - if (_inShutdown) { - LOG(1) << - "Not starting new replication networking thread while shutting down replication."; - return; - } - if (_threads.size() >= kMaxThreads) { - LOG(1) << "Not starting new replication networking thread because " << kMaxThreads << - " are already running; " << _numIdleThreads << " threads are idle and " << - _pending.size() << " network requests are waiting for a thread to serve them."; - return; - } - const std::string threadName(str::stream() << "ReplExecNetThread-" << _nextThreadId++); - try { - _threads.push_back( - boost::make_shared<boost::thread>( - stdx::bind(&NetworkInterfaceImpl::_requestProcessorThreadBody, - this, - threadName))); - ++_numIdleThreads; - } - catch (const std::exception& ex) { - error() << "Failed to start " << threadName << "; " << _threads.size() << - " other network threads still running; caught exception: " << ex.what(); - } + if (_threads.size() >= kMaxThreads) { + LOG(1) << "Not starting new replication networking thread because " << kMaxThreads + << " are already running; " << _numIdleThreads << " threads are idle and " + << _pending.size() << " network requests are waiting for a thread to serve them."; + return; } - - void NetworkInterfaceImpl::startup() { - boost::lock_guard<boost::mutex> lk(_mutex); - invariant(!_inShutdown); - if (!_threads.empty()) { - return; - } - for (size_t i = 0; i < kMinThreads; ++i) { - _startNewNetworkThread_inlock(); - } + const std::string threadName(str::stream() << "ReplExecNetThread-" << _nextThreadId++); + try { + _threads.push_back(boost::make_shared<boost::thread>( + stdx::bind(&NetworkInterfaceImpl::_requestProcessorThreadBody, this, threadName))); + ++_numIdleThreads; + } catch (const std::exception& ex) { + error() << "Failed to start " << threadName << "; " << _threads.size() + << " other network threads still running; caught exception: " << ex.what(); } +} - void NetworkInterfaceImpl::shutdown() { - using std::swap; - boost::unique_lock<boost::mutex> lk(_mutex); - _inShutdown = true; - _hasPending.notify_all(); - ThreadList threadsToJoin; - 
swap(threadsToJoin, _threads); - lk.unlock(); - _connPool->closeAllInUseConnections(); - std::for_each(threadsToJoin.begin(), - threadsToJoin.end(), - stdx::bind(&boost::thread::join, stdx::placeholders::_1)); +void NetworkInterfaceImpl::startup() { + boost::lock_guard<boost::mutex> lk(_mutex); + invariant(!_inShutdown); + if (!_threads.empty()) { + return; } - - void NetworkInterfaceImpl::signalWorkAvailable() { - boost::lock_guard<boost::mutex> lk(_mutex); - _signalWorkAvailable_inlock(); + for (size_t i = 0; i < kMinThreads; ++i) { + _startNewNetworkThread_inlock(); } - - void NetworkInterfaceImpl::_signalWorkAvailable_inlock() { - if (!_isExecutorRunnable) { - _isExecutorRunnable = true; - _isExecutorRunnableCondition.notify_one(); - } +} + +void NetworkInterfaceImpl::shutdown() { + using std::swap; + boost::unique_lock<boost::mutex> lk(_mutex); + _inShutdown = true; + _hasPending.notify_all(); + ThreadList threadsToJoin; + swap(threadsToJoin, _threads); + lk.unlock(); + _connPool->closeAllInUseConnections(); + std::for_each(threadsToJoin.begin(), + threadsToJoin.end(), + stdx::bind(&boost::thread::join, stdx::placeholders::_1)); +} + +void NetworkInterfaceImpl::signalWorkAvailable() { + boost::lock_guard<boost::mutex> lk(_mutex); + _signalWorkAvailable_inlock(); +} + +void NetworkInterfaceImpl::_signalWorkAvailable_inlock() { + if (!_isExecutorRunnable) { + _isExecutorRunnable = true; + _isExecutorRunnableCondition.notify_one(); } +} - void NetworkInterfaceImpl::waitForWork() { - boost::unique_lock<boost::mutex> lk(_mutex); - while (!_isExecutorRunnable) { - _isExecutorRunnableCondition.wait(lk); - } - _isExecutorRunnable = false; +void NetworkInterfaceImpl::waitForWork() { + boost::unique_lock<boost::mutex> lk(_mutex); + while (!_isExecutorRunnable) { + _isExecutorRunnableCondition.wait(lk); } - - void NetworkInterfaceImpl::waitForWorkUntil(Date_t when) { - boost::unique_lock<boost::mutex> lk(_mutex); - while (!_isExecutorRunnable) { - const Milliseconds waitTime(when - now()); - if (waitTime <= Milliseconds(0)) { - break; - } - _isExecutorRunnableCondition.timed_wait(lk, waitTime); + _isExecutorRunnable = false; +} + +void NetworkInterfaceImpl::waitForWorkUntil(Date_t when) { + boost::unique_lock<boost::mutex> lk(_mutex); + while (!_isExecutorRunnable) { + const Milliseconds waitTime(when - now()); + if (waitTime <= Milliseconds(0)) { + break; } - _isExecutorRunnable = false; - } - - void NetworkInterfaceImpl::_requestProcessorThreadBody( - NetworkInterfaceImpl* net, - const std::string& threadName) { - setThreadName(threadName); - LOG(1) << "thread starting"; - net->_consumeNetworkRequests(); - - // At this point, another thread may have destroyed "net", if this thread chose to detach - // itself and remove itself from net->_threads before releasing net->_mutex. Do not access - // member variables of "net" from here, on. 
- LOG(1) << "thread shutting down"; + _isExecutorRunnableCondition.timed_wait(lk, waitTime); } - - void NetworkInterfaceImpl::_consumeNetworkRequests() { - boost::unique_lock<boost::mutex> lk(_mutex); - while (!_inShutdown) { - if (_pending.empty()) { - if (_threads.size() > kMinThreads) { - const Date_t nowDate = now(); - const Date_t nextThreadRetirementDate = - _lastFullUtilizationDate + kMaxIdleThreadAge.total_milliseconds(); - if (nowDate > nextThreadRetirementDate) { - _lastFullUtilizationDate = nowDate; - break; - } + _isExecutorRunnable = false; +} + +void NetworkInterfaceImpl::_requestProcessorThreadBody(NetworkInterfaceImpl* net, + const std::string& threadName) { + setThreadName(threadName); + LOG(1) << "thread starting"; + net->_consumeNetworkRequests(); + + // At this point, another thread may have destroyed "net", if this thread chose to detach + // itself and remove itself from net->_threads before releasing net->_mutex. Do not access + // member variables of "net" from here, on. + LOG(1) << "thread shutting down"; +} + +void NetworkInterfaceImpl::_consumeNetworkRequests() { + boost::unique_lock<boost::mutex> lk(_mutex); + while (!_inShutdown) { + if (_pending.empty()) { + if (_threads.size() > kMinThreads) { + const Date_t nowDate = now(); + const Date_t nextThreadRetirementDate = + _lastFullUtilizationDate + kMaxIdleThreadAge.total_milliseconds(); + if (nowDate > nextThreadRetirementDate) { + _lastFullUtilizationDate = nowDate; + break; } - _hasPending.timed_wait(lk, kMaxIdleThreadAge); - continue; } - CommandData todo = _pending.front(); - _pending.pop_front(); - ++_numActiveNetworkRequests; - --_numIdleThreads; - lk.unlock(); - ResponseStatus result = _runCommand(todo.request); - LOG(2) << "Network status of sending " << todo.request.cmdObj.firstElementFieldName() << - " to " << todo.request.target << " was " << result.getStatus(); - todo.onFinish(result); - lk.lock(); - --_numActiveNetworkRequests; - ++_numIdleThreads; - _signalWorkAvailable_inlock(); + _hasPending.timed_wait(lk, kMaxIdleThreadAge); + continue; } + CommandData todo = _pending.front(); + _pending.pop_front(); + ++_numActiveNetworkRequests; --_numIdleThreads; - if (_inShutdown) { - return; - } - // This thread is ending because it was idle for too long. - // Find self in _threads, remove self from _threads, detach self. 
- for (size_t i = 0; i < _threads.size(); ++i) { - if (_threads[i]->get_id() != boost::this_thread::get_id()) { - continue; - } - _threads[i]->detach(); - _threads[i].swap(_threads.back()); - _threads.pop_back(); - return; - } - severe().stream() << "Could not find this thread, with id " << - boost::this_thread::get_id() << " in the replication networking thread pool"; - fassertFailedNoTrace(28581); - } - - void NetworkInterfaceImpl::startCommand( - const ReplicationExecutor::CallbackHandle& cbHandle, - const ReplicationExecutor::RemoteCommandRequest& request, - const RemoteCommandCompletionFn& onFinish) { - LOG(2) << "Scheduling " << request.cmdObj.firstElementFieldName() << " to " << - request.target; - boost::lock_guard<boost::mutex> lk(_mutex); - _pending.push_back(CommandData()); - CommandData& cd = _pending.back(); - cd.cbHandle = cbHandle; - cd.request = request; - cd.onFinish = onFinish; - if (_numIdleThreads < _pending.size()) { - _startNewNetworkThread_inlock(); - } - if (_numIdleThreads <= _pending.size()) { - _lastFullUtilizationDate = curTimeMillis64(); - } - _hasPending.notify_one(); - } - - void NetworkInterfaceImpl::cancelCommand(const ReplicationExecutor::CallbackHandle& cbHandle) { - boost::unique_lock<boost::mutex> lk(_mutex); - CommandDataList::iterator iter; - for (iter = _pending.begin(); iter != _pending.end(); ++iter) { - if (iter->cbHandle == cbHandle) { - break; - } - } - if (iter == _pending.end()) { - return; - } - const RemoteCommandCompletionFn onFinish = iter->onFinish; - LOG(2) << "Canceled sending " << iter->request.cmdObj.firstElementFieldName() << " to " << - iter->request.target; - _pending.erase(iter); lk.unlock(); - onFinish(ResponseStatus(ErrorCodes::CallbackCanceled, "Callback canceled")); + ResponseStatus result = _runCommand(todo.request); + LOG(2) << "Network status of sending " << todo.request.cmdObj.firstElementFieldName() + << " to " << todo.request.target << " was " << result.getStatus(); + todo.onFinish(result); lk.lock(); + --_numActiveNetworkRequests; + ++_numIdleThreads; _signalWorkAvailable_inlock(); } - - Date_t NetworkInterfaceImpl::now() { - return curTimeMillis64(); + --_numIdleThreads; + if (_inShutdown) { + return; } - - namespace { - - /** - * Calculates the timeout for a network operation expiring at "expDate", given - * that it is now "nowDate". - * - * Returns 0 to indicate no expiration date, a number of milliseconds until "expDate", or - * ErrorCodes::ExceededTimeLimit if "expDate" is not later than "nowDate". - * - * TODO: Change return type to StatusWith<Milliseconds> once Milliseconds supports default - * construction or StatusWith<T> supports not constructing T when the result is a non-OK - * status. - */ - StatusWith<int64_t> getTimeoutMillis(const Date_t expDate, const Date_t nowDate) { - if (expDate == ReplicationExecutor::kNoExpirationDate) { - return StatusWith<int64_t>(0); - } - if (expDate <= nowDate) { - return StatusWith<int64_t>( - ErrorCodes::ExceededTimeLimit, - str::stream() << "Went to run command, but it was too late. " - "Expiration was set to " << dateToISOStringUTC(expDate)); - } - return StatusWith<int64_t>(expDate.asInt64() - nowDate.asInt64()); + // This thread is ending because it was idle for too long. + // Find self in _threads, remove self from _threads, detach self. 
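+    // (illustration: the loop below removes entry i with the swap-with-back/pop_back idiom,
+    // an O(1) removal that is allowed to reorder _threads)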
+ for (size_t i = 0; i < _threads.size(); ++i) { + if (_threads[i]->get_id() != boost::this_thread::get_id()) { + continue; } + _threads[i]->detach(); + _threads[i].swap(_threads.back()); + _threads.pop_back(); + return; + } + severe().stream() << "Could not find this thread, with id " << boost::this_thread::get_id() + << " in the replication networking thread pool"; + fassertFailedNoTrace(28581); +} + +void NetworkInterfaceImpl::startCommand(const ReplicationExecutor::CallbackHandle& cbHandle, + const ReplicationExecutor::RemoteCommandRequest& request, + const RemoteCommandCompletionFn& onFinish) { + LOG(2) << "Scheduling " << request.cmdObj.firstElementFieldName() << " to " << request.target; + boost::lock_guard<boost::mutex> lk(_mutex); + _pending.push_back(CommandData()); + CommandData& cd = _pending.back(); + cd.cbHandle = cbHandle; + cd.request = request; + cd.onFinish = onFinish; + if (_numIdleThreads < _pending.size()) { + _startNewNetworkThread_inlock(); + } + if (_numIdleThreads <= _pending.size()) { + _lastFullUtilizationDate = curTimeMillis64(); + } + _hasPending.notify_one(); +} + +void NetworkInterfaceImpl::cancelCommand(const ReplicationExecutor::CallbackHandle& cbHandle) { + boost::unique_lock<boost::mutex> lk(_mutex); + CommandDataList::iterator iter; + for (iter = _pending.begin(); iter != _pending.end(); ++iter) { + if (iter->cbHandle == cbHandle) { + break; + } + } + if (iter == _pending.end()) { + return; + } + const RemoteCommandCompletionFn onFinish = iter->onFinish; + LOG(2) << "Canceled sending " << iter->request.cmdObj.firstElementFieldName() << " to " + << iter->request.target; + _pending.erase(iter); + lk.unlock(); + onFinish(ResponseStatus(ErrorCodes::CallbackCanceled, "Callback canceled")); + lk.lock(); + _signalWorkAvailable_inlock(); +} + +Date_t NetworkInterfaceImpl::now() { + return curTimeMillis64(); +} - } //namespace +namespace { - ResponseStatus NetworkInterfaceImpl::_runCommand( - const ReplicationExecutor::RemoteCommandRequest& request) { +/** + * Calculates the timeout for a network operation expiring at "expDate", given + * that it is now "nowDate". + * + * Returns 0 to indicate no expiration date, a number of milliseconds until "expDate", or + * ErrorCodes::ExceededTimeLimit if "expDate" is not later than "nowDate". + * + * TODO: Change return type to StatusWith<Milliseconds> once Milliseconds supports default + * construction or StatusWith<T> supports not constructing T when the result is a non-OK + * status. + */ +StatusWith<int64_t> getTimeoutMillis(const Date_t expDate, const Date_t nowDate) { + if (expDate == ReplicationExecutor::kNoExpirationDate) { + return StatusWith<int64_t>(0); + } + if (expDate <= nowDate) { + return StatusWith<int64_t>(ErrorCodes::ExceededTimeLimit, + str::stream() << "Went to run command, but it was too late. 
" + "Expiration was set to " + << dateToISOStringUTC(expDate)); + } + return StatusWith<int64_t>(expDate.asInt64() - nowDate.asInt64()); +} - try { - BSONObj output; +} // namespace - const Date_t requestStartDate = now(); - StatusWith<int64_t> timeoutMillis = getTimeoutMillis(request.expirationDate, - requestStartDate); - if (!timeoutMillis.isOK()) { - return ResponseStatus(timeoutMillis.getStatus()); - } +ResponseStatus NetworkInterfaceImpl::_runCommand( + const ReplicationExecutor::RemoteCommandRequest& request) { + try { + BSONObj output; - ConnectionPool::ConnectionPtr conn(_connPool.get(), - request.target, - requestStartDate, - Milliseconds(timeoutMillis.getValue())); - conn->runCommand(request.dbname, request.cmdObj, output); - const Date_t requestFinishDate = now(); - conn.done(requestFinishDate); - return ResponseStatus(Response(output, - Milliseconds(requestFinishDate - requestStartDate))); + const Date_t requestStartDate = now(); + StatusWith<int64_t> timeoutMillis = + getTimeoutMillis(request.expirationDate, requestStartDate); + if (!timeoutMillis.isOK()) { + return ResponseStatus(timeoutMillis.getStatus()); } - catch (const DBException& ex) { - return ResponseStatus(ex.toStatus()); - } - catch (const std::exception& ex) { - return ResponseStatus( - ErrorCodes::UnknownError, - mongoutils::str::stream() << - "Sending command " << request.cmdObj << " on database " << request.dbname << - " over network to " << request.target.toString() << " received exception " << - ex.what()); - } - } - void NetworkInterfaceImpl::runCallbackWithGlobalExclusiveLock( - const stdx::function<void (OperationContext*)>& callback) { - Client::initThreadIfNotAlready(); - OperationContextImpl txn; - ScopedTransaction transaction(&txn, MODE_X); - Lock::GlobalWrite lk(txn.lockState()); - callback(&txn); + ConnectionPool::ConnectionPtr conn(_connPool.get(), + request.target, + requestStartDate, + Milliseconds(timeoutMillis.getValue())); + conn->runCommand(request.dbname, request.cmdObj, output); + const Date_t requestFinishDate = now(); + conn.done(requestFinishDate); + return ResponseStatus(Response(output, Milliseconds(requestFinishDate - requestStartDate))); + } catch (const DBException& ex) { + return ResponseStatus(ex.toStatus()); + } catch (const std::exception& ex) { + return ResponseStatus(ErrorCodes::UnknownError, + mongoutils::str::stream() + << "Sending command " << request.cmdObj << " on database " + << request.dbname << " over network to " + << request.target.toString() << " received exception " + << ex.what()); } +} + +void NetworkInterfaceImpl::runCallbackWithGlobalExclusiveLock( + const stdx::function<void(OperationContext*)>& callback) { + Client::initThreadIfNotAlready(); + OperationContextImpl txn; + ScopedTransaction transaction(&txn, MODE_X); + Lock::GlobalWrite lk(txn.lockState()); + callback(&txn); +} } // namespace repl -} // namespace mongo +} // namespace mongo diff --git a/src/mongo/db/repl/network_interface_impl.h b/src/mongo/db/repl/network_interface_impl.h index 29011f32762..41d9a0003e7 100644 --- a/src/mongo/db/repl/network_interface_impl.h +++ b/src/mongo/db/repl/network_interface_impl.h @@ -42,132 +42,131 @@ namespace mongo { namespace repl { +/** + * Implementation of the network interface used by the ReplicationExecutor inside mongod. + * + * This implementation manages a dynamically sized group of worker threads for performing + * network operations. 
The minimum and maximum number of threads is set at compile time, and
+ * the exact number of threads is adjusted dynamically, using the following two rules.
+ *
+ * 1.) If the number of worker threads is less than the maximum, there are no idle worker
+ * threads, and the client enqueues a new network operation via startCommand(), the network
+ * interface spins up a new worker thread. This decision is made on the assumption that
+ * spinning up a new thread is faster than the round-trip time for processing a remote command,
+ * and so this will minimize wait time.
+ *
+ * 2.) If the number of worker threads has exceeded the peak number of scheduled outstanding
+ * network commands continuously for a period of time (kMaxIdleThreadAge), one thread is retired
+ * from the pool and the monitoring of idle threads is reset. This means that at most one
+ * thread retires every kMaxIdleThreadAge units of time. The value of kMaxIdleThreadAge is set
+ * to be much larger than the expected frequency of new requests, averaging out short-duration
+ * periods of idleness, as occur between heartbeats.
+ *
+ * The implementation also manages a pool of network connections to recently contacted remote
+ * nodes. The size of this pool is not bounded, but connections are retired unconditionally
+ * after they have been connected for a certain maximum period.
+ */
+class NetworkInterfaceImpl : public ReplicationExecutor::NetworkInterface {
+public:
+    explicit NetworkInterfaceImpl();
+    virtual ~NetworkInterfaceImpl();
+    virtual std::string getDiagnosticString();
+    virtual void startup();
+    virtual void shutdown();
+    virtual void waitForWork();
+    virtual void waitForWorkUntil(Date_t when);
+    virtual void signalWorkAvailable();
+    virtual Date_t now();
+    virtual void startCommand(const ReplicationExecutor::CallbackHandle& cbHandle,
+                              const ReplicationExecutor::RemoteCommandRequest& request,
+                              const RemoteCommandCompletionFn& onFinish);
+    virtual void cancelCommand(const ReplicationExecutor::CallbackHandle& cbHandle);
+    virtual void runCallbackWithGlobalExclusiveLock(
+        const stdx::function<void(OperationContext*)>& callback);
+
+    std::string getNextCallbackWithGlobalLockThreadName();
+
+private:
+    class ConnectionPool;
+
    /**
-     * Implementation of the network interface used by the ReplicationExecutor inside mongod.
-     *
-     * This implementation manages a dynamically sized group of worker threads for performing
-     * network operations. The minimum and maximum number of threads is set at compile time, and
-     * the exact number of threads is adjusted dynamically, using the following two rules.
-     *
-     * 1.) If the number of worker threads is less than the maximum, there are no idle worker
-     * threads, and the client enqueues a new network operation via startCommand(), the network
-     * interface spins up a new worker thread. This decision is made on the assumption that
-     * spinning up a new thread is faster than the round-trip time for processing a remote command,
-     * and so this will minimize wait time.
-     *
-     * 2.) If the number of worker threads has exceeded the peak number of scheduled outstanding
-     * network commands continuously for a period of time (kMaxIdleThreadAge), one thread is retired
-     * from the pool and the monitoring of idle threads is reset. This means that at most one
-     * thread retires every kMaxIdleThreadAge units of time. 
The value of kMaxIdleThreadAge is set - * to be much larger than the expected frequency of new requests, averaging out short-duration - * periods of idleness, as occur between heartbeats. - * - * The implementation also manages a pool of network connections to recently contacted remote - * nodes. The size of this pool is not bounded, but connections are retired unconditionally - * after they have been connected for a certain maximum period. + * Information describing an in-flight command. */ - class NetworkInterfaceImpl : public ReplicationExecutor::NetworkInterface { - public: - explicit NetworkInterfaceImpl(); - virtual ~NetworkInterfaceImpl(); - virtual std::string getDiagnosticString(); - virtual void startup(); - virtual void shutdown(); - virtual void waitForWork(); - virtual void waitForWorkUntil(Date_t when); - virtual void signalWorkAvailable(); - virtual Date_t now(); - virtual void startCommand( - const ReplicationExecutor::CallbackHandle& cbHandle, - const ReplicationExecutor::RemoteCommandRequest& request, - const RemoteCommandCompletionFn& onFinish); - virtual void cancelCommand(const ReplicationExecutor::CallbackHandle& cbHandle); - virtual void runCallbackWithGlobalExclusiveLock( - const stdx::function<void (OperationContext*)>& callback); - - std::string getNextCallbackWithGlobalLockThreadName(); - - private: - class ConnectionPool; - - /** - * Information describing an in-flight command. - */ - struct CommandData { - ReplicationExecutor::CallbackHandle cbHandle; - ReplicationExecutor::RemoteCommandRequest request; - RemoteCommandCompletionFn onFinish; - }; - typedef stdx::list<CommandData> CommandDataList; - typedef std::vector<boost::shared_ptr<boost::thread> > ThreadList; - - /** - * Thread body for threads that synchronously perform network requests from - * the _pending list. - */ - static void _requestProcessorThreadBody(NetworkInterfaceImpl* net, - const std::string& threadName); - - /** - * Run loop that iteratively consumes network requests in a request processor thread. - */ - void _consumeNetworkRequests(); - - /** - * Synchronously invokes the command described by "request". - */ - ResponseStatus _runCommand(const ReplicationExecutor::RemoteCommandRequest& request); - - /** - * Notifies the network threads that there is work available. - */ - void _signalWorkAvailable_inlock(); - - /** - * Starts a new network thread. - */ - void _startNewNetworkThread_inlock(); - - // Mutex guarding the state of the network interface, except for the pool pointed to by - // _connPool. - boost::mutex _mutex; - - // Condition signaled to indicate that there is work in the _pending queue. - boost::condition_variable _hasPending; - - // Queue of yet-to-be-executed network operations. - CommandDataList _pending; - - // List of threads serving as the worker pool. - ThreadList _threads; - - // Count of idle threads. - size_t _numIdleThreads; - - // Id counter for assigning thread names - size_t _nextThreadId; - - // The last time that _pending.size() + _numActiveNetworkRequests grew to be at least - // _threads.size(). - Date_t _lastFullUtilizationDate; - - // Condition signaled to indicate that the executor, blocked in waitForWorkUntil or - // waitForWork, should wake up. - boost::condition_variable _isExecutorRunnableCondition; - - // Flag indicating whether or not the executor associated with this interface is runnable. - bool _isExecutorRunnable; - - // Flag indicating when this interface is being shut down (because shutdown() has executed). 
- bool _inShutdown; - - // Pool of connections to remote nodes, used by the worker threads to execute network - // requests. - boost::scoped_ptr<ConnectionPool> _connPool; // (R) - - // Number of active network requests - size_t _numActiveNetworkRequests; + struct CommandData { + ReplicationExecutor::CallbackHandle cbHandle; + ReplicationExecutor::RemoteCommandRequest request; + RemoteCommandCompletionFn onFinish; }; + typedef stdx::list<CommandData> CommandDataList; + typedef std::vector<boost::shared_ptr<boost::thread>> ThreadList; + + /** + * Thread body for threads that synchronously perform network requests from + * the _pending list. + */ + static void _requestProcessorThreadBody(NetworkInterfaceImpl* net, + const std::string& threadName); + + /** + * Run loop that iteratively consumes network requests in a request processor thread. + */ + void _consumeNetworkRequests(); + + /** + * Synchronously invokes the command described by "request". + */ + ResponseStatus _runCommand(const ReplicationExecutor::RemoteCommandRequest& request); + + /** + * Notifies the network threads that there is work available. + */ + void _signalWorkAvailable_inlock(); + + /** + * Starts a new network thread. + */ + void _startNewNetworkThread_inlock(); + + // Mutex guarding the state of the network interface, except for the pool pointed to by + // _connPool. + boost::mutex _mutex; + + // Condition signaled to indicate that there is work in the _pending queue. + boost::condition_variable _hasPending; + + // Queue of yet-to-be-executed network operations. + CommandDataList _pending; + + // List of threads serving as the worker pool. + ThreadList _threads; + + // Count of idle threads. + size_t _numIdleThreads; + + // Id counter for assigning thread names + size_t _nextThreadId; + + // The last time that _pending.size() + _numActiveNetworkRequests grew to be at least + // _threads.size(). + Date_t _lastFullUtilizationDate; + + // Condition signaled to indicate that the executor, blocked in waitForWorkUntil or + // waitForWork, should wake up. + boost::condition_variable _isExecutorRunnableCondition; + + // Flag indicating whether or not the executor associated with this interface is runnable. + bool _isExecutorRunnable; + + // Flag indicating when this interface is being shut down (because shutdown() has executed). + bool _inShutdown; + + // Pool of connections to remote nodes, used by the worker threads to execute network + // requests. 
+ boost::scoped_ptr<ConnectionPool> _connPool; // (R) + + // Number of active network requests + size_t _numActiveNetworkRequests; +}; } // namespace repl -} // namespace mongo +} // namespace mongo diff --git a/src/mongo/db/repl/network_interface_mock.cpp b/src/mongo/db/repl/network_interface_mock.cpp index f0fd79fe667..a996340af50 100644 --- a/src/mongo/db/repl/network_interface_mock.cpp +++ b/src/mongo/db/repl/network_interface_mock.cpp @@ -38,374 +38,357 @@ namespace mongo { namespace repl { - NetworkInterfaceMock::NetworkInterfaceMock() - : _waitingToRunMask(0), - _currentlyRunning(kNoThread), - _hasStarted(false), - _inShutdown(false), - _executorNextWakeupDate(~0ULL) { - - StatusWith<Date_t> initialNow = dateFromISOString("2014-08-01T00:00:00Z"); - fassert(18653, initialNow.getStatus()); - _now = initialNow.getValue(); - } - - NetworkInterfaceMock::~NetworkInterfaceMock() { - boost::unique_lock<boost::mutex> lk(_mutex); - invariant(!_hasStarted || _inShutdown); - invariant(_scheduled.empty()); - invariant(_blackHoled.empty()); - } - - std::string NetworkInterfaceMock::getDiagnosticString() { - // TODO something better. - return "NetworkInterfaceMock diagnostics here"; - } - - Date_t NetworkInterfaceMock::now() { - boost::lock_guard<boost::mutex> lk(_mutex); - return _now_inlock(); - } - - void NetworkInterfaceMock::runCallbackWithGlobalExclusiveLock( - const stdx::function<void (OperationContext* txn)>& callback) { - - OperationContextNoop txn; - callback(&txn); - } - - void NetworkInterfaceMock::startCommand( - const ReplicationExecutor::CallbackHandle& cbHandle, - const ReplicationExecutor::RemoteCommandRequest& request, - const RemoteCommandCompletionFn& onFinish) { - - boost::lock_guard<boost::mutex> lk(_mutex); - invariant(!_inShutdown); - const Date_t now = _now_inlock(); - NetworkOperationIterator insertBefore = _unscheduled.begin(); - while ((insertBefore != _unscheduled.end()) && - (insertBefore->getNextConsiderationDate() <= now)) { - - ++insertBefore; - } - _unscheduled.insert(insertBefore, NetworkOperation(cbHandle, request, now, onFinish)); - } - - static bool findAndCancelIf( - const stdx::function<bool (const NetworkInterfaceMock::NetworkOperation&)>& matchFn, - NetworkInterfaceMock::NetworkOperationList* other, - NetworkInterfaceMock::NetworkOperationList* scheduled, - const Date_t now) { - const NetworkInterfaceMock::NetworkOperationIterator noi = - std::find_if(other->begin(), other->end(), matchFn); - if (noi == other->end()) { - return false; - } - scheduled->splice(scheduled->begin(), *other, noi); - noi->setResponse(now, ResponseStatus(ErrorCodes::CallbackCanceled, - "Network operation canceled")); - return true; - } - - void NetworkInterfaceMock::cancelCommand( - const ReplicationExecutor::CallbackHandle& cbHandle) { - boost::lock_guard<boost::mutex> lk(_mutex); - invariant(!_inShutdown); - stdx::function<bool (const NetworkOperation&)> matchesHandle = stdx::bind( - &NetworkOperation::isForCallback, - stdx::placeholders::_1, - cbHandle); - const Date_t now = _now_inlock(); - if (findAndCancelIf(matchesHandle, &_unscheduled, &_scheduled, now)) { - return; - } - if (findAndCancelIf(matchesHandle, &_blackHoled, &_scheduled, now)) { - return; - } - if (findAndCancelIf(matchesHandle, &_scheduled, &_scheduled, now)) { - return; - } - // No not-in-progress network command matched cbHandle. Oh, well. 
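-        // (note: an operation that has already moved to _processing is deliberately not
-        // searched here; it runs to completion and its callback sees the real response
-        // rather than CallbackCanceled)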
- } - - void NetworkInterfaceMock::startup() { - boost::lock_guard<boost::mutex> lk(_mutex); - invariant(!_hasStarted); - _hasStarted = true; - _inShutdown = false; - invariant(_currentlyRunning == kNoThread); - _currentlyRunning = kExecutorThread; - } - - void NetworkInterfaceMock::shutdown() { - boost::unique_lock<boost::mutex> lk(_mutex); - invariant(_hasStarted); - invariant(!_inShutdown); - _inShutdown = true; - NetworkOperationList todo; - todo.splice(todo.end(), _scheduled); - todo.splice(todo.end(), _unscheduled); - todo.splice(todo.end(), _processing); - todo.splice(todo.end(), _blackHoled); - - const Date_t now = _now_inlock(); - _waitingToRunMask |= kExecutorThread; // Prevents network thread from scheduling. - lk.unlock(); - for (NetworkOperationIterator iter = todo.begin(); iter != todo.end(); ++iter) { - iter->setResponse(now, ResponseStatus(ErrorCodes::ShutdownInProgress, - "Shutting down mock network")); - iter->finishResponse(); - } - lk.lock(); - invariant(_currentlyRunning == kExecutorThread); - _currentlyRunning = kNoThread; - _waitingToRunMask = kNetworkThread; - _shouldWakeNetworkCondition.notify_one(); - } - - void NetworkInterfaceMock::enterNetwork() { - boost::unique_lock<boost::mutex> lk(_mutex); - while (!_isNetworkThreadRunnable_inlock()) { - _shouldWakeNetworkCondition.wait(lk); - } - _currentlyRunning = kNetworkThread; - _waitingToRunMask &= ~kNetworkThread; - } - - void NetworkInterfaceMock::exitNetwork() { - boost::lock_guard<boost::mutex> lk(_mutex); - if (_currentlyRunning != kNetworkThread) { - return; - } - _currentlyRunning = kNoThread; - if (_isExecutorThreadRunnable_inlock()) { - _shouldWakeExecutorCondition.notify_one(); - } - _waitingToRunMask |= kNetworkThread; +NetworkInterfaceMock::NetworkInterfaceMock() + : _waitingToRunMask(0), + _currentlyRunning(kNoThread), + _hasStarted(false), + _inShutdown(false), + _executorNextWakeupDate(~0ULL) { + StatusWith<Date_t> initialNow = dateFromISOString("2014-08-01T00:00:00Z"); + fassert(18653, initialNow.getStatus()); + _now = initialNow.getValue(); +} + +NetworkInterfaceMock::~NetworkInterfaceMock() { + boost::unique_lock<boost::mutex> lk(_mutex); + invariant(!_hasStarted || _inShutdown); + invariant(_scheduled.empty()); + invariant(_blackHoled.empty()); +} + +std::string NetworkInterfaceMock::getDiagnosticString() { + // TODO something better. 
+ return "NetworkInterfaceMock diagnostics here"; +} + +Date_t NetworkInterfaceMock::now() { + boost::lock_guard<boost::mutex> lk(_mutex); + return _now_inlock(); +} + +void NetworkInterfaceMock::runCallbackWithGlobalExclusiveLock( + const stdx::function<void(OperationContext* txn)>& callback) { + OperationContextNoop txn; + callback(&txn); +} + +void NetworkInterfaceMock::startCommand(const ReplicationExecutor::CallbackHandle& cbHandle, + const ReplicationExecutor::RemoteCommandRequest& request, + const RemoteCommandCompletionFn& onFinish) { + boost::lock_guard<boost::mutex> lk(_mutex); + invariant(!_inShutdown); + const Date_t now = _now_inlock(); + NetworkOperationIterator insertBefore = _unscheduled.begin(); + while ((insertBefore != _unscheduled.end()) && + (insertBefore->getNextConsiderationDate() <= now)) { + ++insertBefore; + } + _unscheduled.insert(insertBefore, NetworkOperation(cbHandle, request, now, onFinish)); +} + +static bool findAndCancelIf( + const stdx::function<bool(const NetworkInterfaceMock::NetworkOperation&)>& matchFn, + NetworkInterfaceMock::NetworkOperationList* other, + NetworkInterfaceMock::NetworkOperationList* scheduled, + const Date_t now) { + const NetworkInterfaceMock::NetworkOperationIterator noi = + std::find_if(other->begin(), other->end(), matchFn); + if (noi == other->end()) { + return false; + } + scheduled->splice(scheduled->begin(), *other, noi); + noi->setResponse(now, + ResponseStatus(ErrorCodes::CallbackCanceled, "Network operation canceled")); + return true; +} + +void NetworkInterfaceMock::cancelCommand(const ReplicationExecutor::CallbackHandle& cbHandle) { + boost::lock_guard<boost::mutex> lk(_mutex); + invariant(!_inShutdown); + stdx::function<bool(const NetworkOperation&)> matchesHandle = + stdx::bind(&NetworkOperation::isForCallback, stdx::placeholders::_1, cbHandle); + const Date_t now = _now_inlock(); + if (findAndCancelIf(matchesHandle, &_unscheduled, &_scheduled, now)) { + return; + } + if (findAndCancelIf(matchesHandle, &_blackHoled, &_scheduled, now)) { + return; + } + if (findAndCancelIf(matchesHandle, &_scheduled, &_scheduled, now)) { + return; + } + // No not-in-progress network command matched cbHandle. Oh, well. +} + +void NetworkInterfaceMock::startup() { + boost::lock_guard<boost::mutex> lk(_mutex); + invariant(!_hasStarted); + _hasStarted = true; + _inShutdown = false; + invariant(_currentlyRunning == kNoThread); + _currentlyRunning = kExecutorThread; +} + +void NetworkInterfaceMock::shutdown() { + boost::unique_lock<boost::mutex> lk(_mutex); + invariant(_hasStarted); + invariant(!_inShutdown); + _inShutdown = true; + NetworkOperationList todo; + todo.splice(todo.end(), _scheduled); + todo.splice(todo.end(), _unscheduled); + todo.splice(todo.end(), _processing); + todo.splice(todo.end(), _blackHoled); + + const Date_t now = _now_inlock(); + _waitingToRunMask |= kExecutorThread; // Prevents network thread from scheduling. 
+ lk.unlock(); + for (NetworkOperationIterator iter = todo.begin(); iter != todo.end(); ++iter) { + iter->setResponse( + now, ResponseStatus(ErrorCodes::ShutdownInProgress, "Shutting down mock network")); + iter->finishResponse(); + } + lk.lock(); + invariant(_currentlyRunning == kExecutorThread); + _currentlyRunning = kNoThread; + _waitingToRunMask = kNetworkThread; + _shouldWakeNetworkCondition.notify_one(); +} + +void NetworkInterfaceMock::enterNetwork() { + boost::unique_lock<boost::mutex> lk(_mutex); + while (!_isNetworkThreadRunnable_inlock()) { + _shouldWakeNetworkCondition.wait(lk); + } + _currentlyRunning = kNetworkThread; + _waitingToRunMask &= ~kNetworkThread; +} + +void NetworkInterfaceMock::exitNetwork() { + boost::lock_guard<boost::mutex> lk(_mutex); + if (_currentlyRunning != kNetworkThread) { + return; + } + _currentlyRunning = kNoThread; + if (_isExecutorThreadRunnable_inlock()) { + _shouldWakeExecutorCondition.notify_one(); } + _waitingToRunMask |= kNetworkThread; +} - bool NetworkInterfaceMock::hasReadyRequests() { - boost::lock_guard<boost::mutex> lk(_mutex); - invariant(_currentlyRunning == kNetworkThread); - return _hasReadyRequests_inlock(); - } +bool NetworkInterfaceMock::hasReadyRequests() { + boost::lock_guard<boost::mutex> lk(_mutex); + invariant(_currentlyRunning == kNetworkThread); + return _hasReadyRequests_inlock(); +} - bool NetworkInterfaceMock::_hasReadyRequests_inlock() { - if (_unscheduled.empty()) - return false; - if (_unscheduled.front().getNextConsiderationDate() > _now_inlock()) { - return false; - } - return true; +bool NetworkInterfaceMock::_hasReadyRequests_inlock() { + if (_unscheduled.empty()) + return false; + if (_unscheduled.front().getNextConsiderationDate() > _now_inlock()) { + return false; } + return true; +} - NetworkInterfaceMock::NetworkOperationIterator NetworkInterfaceMock::getNextReadyRequest() { - boost::unique_lock<boost::mutex> lk(_mutex); - invariant(_currentlyRunning == kNetworkThread); - while (!_hasReadyRequests_inlock()) { - _waitingToRunMask |= kExecutorThread; - _runReadyNetworkOperations_inlock(&lk); - } - invariant(_hasReadyRequests_inlock()); - _processing.splice(_processing.begin(), _unscheduled, _unscheduled.begin()); - return _processing.begin(); +NetworkInterfaceMock::NetworkOperationIterator NetworkInterfaceMock::getNextReadyRequest() { + boost::unique_lock<boost::mutex> lk(_mutex); + invariant(_currentlyRunning == kNetworkThread); + while (!_hasReadyRequests_inlock()) { + _waitingToRunMask |= kExecutorThread; + _runReadyNetworkOperations_inlock(&lk); } - - void NetworkInterfaceMock::scheduleResponse( - NetworkOperationIterator noi, - Date_t when, - const ResponseStatus& response) { - - boost::lock_guard<boost::mutex> lk(_mutex); - invariant(_currentlyRunning == kNetworkThread); - NetworkOperationIterator insertBefore = _scheduled.begin(); - while ((insertBefore != _scheduled.end()) && (insertBefore->getResponseDate() <= when)) { - ++insertBefore; + invariant(_hasReadyRequests_inlock()); + _processing.splice(_processing.begin(), _unscheduled, _unscheduled.begin()); + return _processing.begin(); +} + +void NetworkInterfaceMock::scheduleResponse(NetworkOperationIterator noi, + Date_t when, + const ResponseStatus& response) { + boost::lock_guard<boost::mutex> lk(_mutex); + invariant(_currentlyRunning == kNetworkThread); + NetworkOperationIterator insertBefore = _scheduled.begin(); + while ((insertBefore != _scheduled.end()) && (insertBefore->getResponseDate() <= when)) { + ++insertBefore; + } + noi->setResponse(when, 
response); + _scheduled.splice(insertBefore, _processing, noi); +} + +void NetworkInterfaceMock::blackHole(NetworkOperationIterator noi) { + boost::lock_guard<boost::mutex> lk(_mutex); + invariant(_currentlyRunning == kNetworkThread); + _blackHoled.splice(_blackHoled.end(), _processing, noi); +} + +void NetworkInterfaceMock::requeueAt(NetworkOperationIterator noi, Date_t dontAskUntil) { + boost::lock_guard<boost::mutex> lk(_mutex); + invariant(_currentlyRunning == kNetworkThread); + invariant(noi->getNextConsiderationDate() < dontAskUntil); + invariant(_now_inlock() < dontAskUntil); + NetworkOperationIterator insertBefore = _unscheduled.begin(); + for (; insertBefore != _unscheduled.end(); ++insertBefore) { + if (insertBefore->getNextConsiderationDate() >= dontAskUntil) { + break; } - noi->setResponse(when, response); - _scheduled.splice(insertBefore, _processing, noi); - } - - void NetworkInterfaceMock::blackHole(NetworkOperationIterator noi) { - boost::lock_guard<boost::mutex> lk(_mutex); - invariant(_currentlyRunning == kNetworkThread); - _blackHoled.splice(_blackHoled.end(), _processing, noi); } + noi->setNextConsiderationDate(dontAskUntil); + _unscheduled.splice(insertBefore, _processing, noi); +} - void NetworkInterfaceMock::requeueAt(NetworkOperationIterator noi, Date_t dontAskUntil) { - boost::lock_guard<boost::mutex> lk(_mutex); - invariant(_currentlyRunning == kNetworkThread); - invariant(noi->getNextConsiderationDate() < dontAskUntil); - invariant(_now_inlock() < dontAskUntil); - NetworkOperationIterator insertBefore = _unscheduled.begin(); - for (; insertBefore != _unscheduled.end(); ++insertBefore) { - if (insertBefore->getNextConsiderationDate() >= dontAskUntil) { - break; - } +void NetworkInterfaceMock::runUntil(Date_t until) { + boost::unique_lock<boost::mutex> lk(_mutex); + invariant(_currentlyRunning == kNetworkThread); + invariant(until > _now_inlock()); + while (until > _now_inlock()) { + _runReadyNetworkOperations_inlock(&lk); + if (_hasReadyRequests_inlock()) { + break; } - noi->setNextConsiderationDate(dontAskUntil); - _unscheduled.splice(insertBefore, _processing, noi); - } - - void NetworkInterfaceMock::runUntil(Date_t until) { - boost::unique_lock<boost::mutex> lk(_mutex); - invariant(_currentlyRunning == kNetworkThread); - invariant(until > _now_inlock()); - while (until > _now_inlock()) { - _runReadyNetworkOperations_inlock(&lk); - if (_hasReadyRequests_inlock()) { - break; - } - Date_t newNow = _executorNextWakeupDate; - if (!_scheduled.empty() && _scheduled.front().getResponseDate() < newNow) { - newNow = _scheduled.front().getResponseDate(); - } - if (until < newNow) { - newNow = until; - } - invariant(_now_inlock() <= newNow); - _now = newNow; - _waitingToRunMask |= kExecutorThread; + Date_t newNow = _executorNextWakeupDate; + if (!_scheduled.empty() && _scheduled.front().getResponseDate() < newNow) { + newNow = _scheduled.front().getResponseDate(); } - _runReadyNetworkOperations_inlock(&lk); - } - - void NetworkInterfaceMock::runReadyNetworkOperations() { - boost::unique_lock<boost::mutex> lk(_mutex); - invariant(_currentlyRunning == kNetworkThread); - _runReadyNetworkOperations_inlock(&lk); - } - - void NetworkInterfaceMock::waitForWork() { - boost::unique_lock<boost::mutex> lk(_mutex); - invariant(_currentlyRunning == kExecutorThread); - _waitForWork_inlock(&lk); - } - - void NetworkInterfaceMock::waitForWorkUntil(Date_t when) { - boost::unique_lock<boost::mutex> lk(_mutex); - invariant(_currentlyRunning == kExecutorThread); - _executorNextWakeupDate = 
when; - if (_executorNextWakeupDate <= _now_inlock()) { - return; + if (until < newNow) { + newNow = until; } - _waitForWork_inlock(&lk); - } - - void NetworkInterfaceMock::signalWorkAvailable() { - boost::lock_guard<boost::mutex> lk(_mutex); + invariant(_now_inlock() <= newNow); + _now = newNow; _waitingToRunMask |= kExecutorThread; - if (_currentlyRunning == kNoThread) { - _shouldWakeExecutorCondition.notify_one(); - } } - - void NetworkInterfaceMock::_runReadyNetworkOperations_inlock( - boost::unique_lock<boost::mutex>* lk) { - while (!_scheduled.empty() && _scheduled.front().getResponseDate() <= _now_inlock()) { - invariant(_currentlyRunning == kNetworkThread); - NetworkOperation op = _scheduled.front(); - _scheduled.pop_front(); - _waitingToRunMask |= kExecutorThread; - lk->unlock(); - op.finishResponse(); - lk->lock(); - } - invariant(_currentlyRunning == kNetworkThread); - if (!(_waitingToRunMask & kExecutorThread)) { - return; - } + _runReadyNetworkOperations_inlock(&lk); +} + +void NetworkInterfaceMock::runReadyNetworkOperations() { + boost::unique_lock<boost::mutex> lk(_mutex); + invariant(_currentlyRunning == kNetworkThread); + _runReadyNetworkOperations_inlock(&lk); +} + +void NetworkInterfaceMock::waitForWork() { + boost::unique_lock<boost::mutex> lk(_mutex); + invariant(_currentlyRunning == kExecutorThread); + _waitForWork_inlock(&lk); +} + +void NetworkInterfaceMock::waitForWorkUntil(Date_t when) { + boost::unique_lock<boost::mutex> lk(_mutex); + invariant(_currentlyRunning == kExecutorThread); + _executorNextWakeupDate = when; + if (_executorNextWakeupDate <= _now_inlock()) { + return; + } + _waitForWork_inlock(&lk); +} + +void NetworkInterfaceMock::signalWorkAvailable() { + boost::lock_guard<boost::mutex> lk(_mutex); + _waitingToRunMask |= kExecutorThread; + if (_currentlyRunning == kNoThread) { _shouldWakeExecutorCondition.notify_one(); - _currentlyRunning = kNoThread; - while (!_isNetworkThreadRunnable_inlock()) { - _shouldWakeNetworkCondition.wait(*lk); - } - _currentlyRunning = kNetworkThread; - _waitingToRunMask &= ~kNetworkThread; } +} - void NetworkInterfaceMock::_waitForWork_inlock(boost::unique_lock<boost::mutex>* lk) { - if (_waitingToRunMask & kExecutorThread) { - _waitingToRunMask &= ~kExecutorThread; - return; - } - _currentlyRunning = kNoThread; - while (!_isExecutorThreadRunnable_inlock()) { - _waitingToRunMask |= kNetworkThread; - _shouldWakeNetworkCondition.notify_one(); - _shouldWakeExecutorCondition.wait(*lk); - } - _currentlyRunning = kExecutorThread; - _waitingToRunMask &= ~kExecutorThread; - } - - bool NetworkInterfaceMock::_isNetworkThreadRunnable_inlock() { - if (_currentlyRunning != kNoThread) { - return false; - } - if (_waitingToRunMask != kNetworkThread) { - return false; - } - return true; - } - - bool NetworkInterfaceMock::_isExecutorThreadRunnable_inlock() { - if (_currentlyRunning != kNoThread) { - return false; - } - return _waitingToRunMask & kExecutorThread; - } - - static const StatusWith<ReplicationExecutor::RemoteCommandResponse> kUnsetResponse( - ErrorCodes::InternalError, - "NetworkOperation::_response never set"); - - NetworkInterfaceMock::NetworkOperation::NetworkOperation() - : _requestDate(), - _nextConsiderationDate(), - _responseDate(), - _request(), - _response(kUnsetResponse), - _onFinish() { - } - - NetworkInterfaceMock::NetworkOperation::NetworkOperation( - const ReplicationExecutor::CallbackHandle& cbHandle, - const ReplicationExecutor::RemoteCommandRequest& theRequest, - Date_t theRequestDate, - const 
RemoteCommandCompletionFn& onFinish) - : _requestDate(theRequestDate), - _nextConsiderationDate(theRequestDate), - _responseDate(), - _cbHandle(cbHandle), - _request(theRequest), - _response(kUnsetResponse), - _onFinish(onFinish) { +void NetworkInterfaceMock::_runReadyNetworkOperations_inlock(boost::unique_lock<boost::mutex>* lk) { + while (!_scheduled.empty() && _scheduled.front().getResponseDate() <= _now_inlock()) { + invariant(_currentlyRunning == kNetworkThread); + NetworkOperation op = _scheduled.front(); + _scheduled.pop_front(); + _waitingToRunMask |= kExecutorThread; + lk->unlock(); + op.finishResponse(); + lk->lock(); } - - NetworkInterfaceMock::NetworkOperation::~NetworkOperation() {} - - void NetworkInterfaceMock::NetworkOperation::setNextConsiderationDate( - Date_t nextConsiderationDate) { - - invariant(nextConsiderationDate > _nextConsiderationDate); - _nextConsiderationDate = nextConsiderationDate; + invariant(_currentlyRunning == kNetworkThread); + if (!(_waitingToRunMask & kExecutorThread)) { + return; } - - void NetworkInterfaceMock::NetworkOperation::setResponse( - Date_t responseDate, - const ResponseStatus& response) { - - invariant(responseDate >= _requestDate); - _responseDate = responseDate; - _response = response; + _shouldWakeExecutorCondition.notify_one(); + _currentlyRunning = kNoThread; + while (!_isNetworkThreadRunnable_inlock()) { + _shouldWakeNetworkCondition.wait(*lk); } + _currentlyRunning = kNetworkThread; + _waitingToRunMask &= ~kNetworkThread; +} - void NetworkInterfaceMock::NetworkOperation::finishResponse() { - invariant(_onFinish); - _onFinish(_response); - _onFinish = RemoteCommandCompletionFn(); +void NetworkInterfaceMock::_waitForWork_inlock(boost::unique_lock<boost::mutex>* lk) { + if (_waitingToRunMask & kExecutorThread) { + _waitingToRunMask &= ~kExecutorThread; + return; } + _currentlyRunning = kNoThread; + while (!_isExecutorThreadRunnable_inlock()) { + _waitingToRunMask |= kNetworkThread; + _shouldWakeNetworkCondition.notify_one(); + _shouldWakeExecutorCondition.wait(*lk); + } + _currentlyRunning = kExecutorThread; + _waitingToRunMask &= ~kExecutorThread; +} + +bool NetworkInterfaceMock::_isNetworkThreadRunnable_inlock() { + if (_currentlyRunning != kNoThread) { + return false; + } + if (_waitingToRunMask != kNetworkThread) { + return false; + } + return true; +} + +bool NetworkInterfaceMock::_isExecutorThreadRunnable_inlock() { + if (_currentlyRunning != kNoThread) { + return false; + } + return _waitingToRunMask & kExecutorThread; +} + +static const StatusWith<ReplicationExecutor::RemoteCommandResponse> kUnsetResponse( + ErrorCodes::InternalError, "NetworkOperation::_response never set"); + +NetworkInterfaceMock::NetworkOperation::NetworkOperation() + : _requestDate(), + _nextConsiderationDate(), + _responseDate(), + _request(), + _response(kUnsetResponse), + _onFinish() {} + +NetworkInterfaceMock::NetworkOperation::NetworkOperation( + const ReplicationExecutor::CallbackHandle& cbHandle, + const ReplicationExecutor::RemoteCommandRequest& theRequest, + Date_t theRequestDate, + const RemoteCommandCompletionFn& onFinish) + : _requestDate(theRequestDate), + _nextConsiderationDate(theRequestDate), + _responseDate(), + _cbHandle(cbHandle), + _request(theRequest), + _response(kUnsetResponse), + _onFinish(onFinish) {} + +NetworkInterfaceMock::NetworkOperation::~NetworkOperation() {} + +void NetworkInterfaceMock::NetworkOperation::setNextConsiderationDate( + Date_t nextConsiderationDate) { + invariant(nextConsiderationDate > 
_nextConsiderationDate);
+ _nextConsiderationDate = nextConsiderationDate;
+}
+
+void NetworkInterfaceMock::NetworkOperation::setResponse(Date_t responseDate,
+ const ResponseStatus& response) {
+ invariant(responseDate >= _requestDate);
+ _responseDate = responseDate;
+ _response = response;
+}
+
+void NetworkInterfaceMock::NetworkOperation::finishResponse() {
+ invariant(_onFinish);
+ _onFinish(_response);
+ _onFinish = RemoteCommandCompletionFn();
+}

} // namespace repl
} // namespace mongo
diff --git a/src/mongo/db/repl/network_interface_mock.h b/src/mongo/db/repl/network_interface_mock.h
index fd2ad52746d..269eec7fcfc 100644
--- a/src/mongo/db/repl/network_interface_mock.h
+++ b/src/mongo/db/repl/network_interface_mock.h
@@ -38,292 +38,297 @@ namespace mongo {

namespace repl {

+/**
+ * Mock network implementation for use in unit tests.
+ *
+ * To use, construct a new instance on the heap, and keep a pointer to it. Pass
+ * the pointer to the instance into the ReplicationExecutor constructor, transferring
+ * ownership. Start the executor's run() method in a separate thread, schedule the
+ * work you want to test into the executor, then while the test is still going, iterate
+ * through the ready network requests, servicing them and advancing time as needed.
+ *
+ * The mock has a fully virtualized notion of time and the network. When the
+ * replication executor under test schedules a network operation, the startCommand
+ * method of this class adds an entry to the _unscheduled queue for immediate consideration.
+ * The test driver loop, when it examines the request, may schedule a response, ask the
+ * interface to redeliver the request at a later virtual time, or to swallow the virtual
+ * request until the end of the simulation. The test driver loop can also instruct the
+ * interface to run forward through virtual time until there are operations ready to
+ * consider, via runUntil.
+ *
+ * The thread acting as the "network" and the executor run thread are highly synchronized
+ * by this code, allowing for deterministic control of operation interleaving.
+ */
+class NetworkInterfaceMock : public ReplicationExecutor::NetworkInterface {
+public:
+ class NetworkOperation;
+ typedef stdx::list<NetworkOperation> NetworkOperationList;
+ typedef NetworkOperationList::iterator NetworkOperationIterator;
+
+ NetworkInterfaceMock();
+ virtual ~NetworkInterfaceMock();
+ virtual std::string getDiagnosticString();
+
+ ////////////////////////////////////////////////////////////////////////////////
+ //
+ // ReplicationExecutor::NetworkInterface methods
+ //
+ ////////////////////////////////////////////////////////////////////////////////
+
+ virtual void startup();
+ virtual void shutdown();
+ virtual void waitForWork();
+ virtual void waitForWorkUntil(Date_t when);
+ virtual void signalWorkAvailable();
+ virtual Date_t now();
+ virtual void startCommand(const ReplicationExecutor::CallbackHandle& cbHandle,
+ const ReplicationExecutor::RemoteCommandRequest& request,
+ const RemoteCommandCompletionFn& onFinish);
+ virtual void cancelCommand(const ReplicationExecutor::CallbackHandle& cbHandle);
+ virtual void runCallbackWithGlobalExclusiveLock(
+ const stdx::function<void(OperationContext*)>& callback);
+
+
+ ////////////////////////////////////////////////////////////////////////////////
+ //
+ // Methods for simulating network operations and the passage of time.
+ //
+ // Methods in this section are to be called by the thread currently simulating
+ // the network.
+ //
+ ////////////////////////////////////////////////////////////////////////////////
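The class comment above describes an ownership handoff: the mock is constructed on the heap, the test keeps the raw pointer to drive the simulation, and the executor deletes the instance when it is destroyed. A minimal, self-contained sketch of that contract (illustrative types; the real ReplicationExecutor wiring is richer than this):

#include <iostream>

struct NetworkInterface {
    virtual ~NetworkInterface() {
        std::cout << "interface destroyed by its owner\n";
    }
};

class Executor {
public:
    explicit Executor(NetworkInterface* net) : _net(net) {}  // takes ownership
    ~Executor() {
        delete _net;  // the owner, not the test, frees the mock
    }

private:
    NetworkInterface* _net;
};

int main() {
    NetworkInterface* net = new NetworkInterface;  // keep this pointer to drive the test
    Executor executor(net);                        // ownership transferred here
    return 0;                                      // executor's destructor deletes net
}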
+
+ /**
+ * Causes the currently running (non-executor) thread to assume the mantle of the network
+ * simulation thread.
+ *
+ * Call this before calling any of the other methods in this section.
+ */
+ void enterNetwork();
+
+ /**
+ * Causes the currently running thread to drop the mantle of "network simulation thread".
+ *
+ * Call this before calling any methods that might block waiting for the replication
+ * executor thread.
+ */
+ void exitNetwork();
+
+ /**
+ * Returns true if there are unscheduled network requests to be processed.
+ */
+ bool hasReadyRequests();
+
+ /**
+ * Gets the next unscheduled request to process, blocking until one is available.
+ *
+ * Will not return until the executor thread is blocked in waitForWorkUntil or waitForWork.
+ */
+ NetworkOperationIterator getNextReadyRequest();
+
+ /**
+ * Schedules "response" in response to "noi" at virtual time "when".
+ */
+ void scheduleResponse(NetworkOperationIterator noi,
+ Date_t when,
+ const ResponseStatus& response);
+
+ /**
+ * Swallows "noi", causing the network interface to not respond to it until
+ * shutdown() is called.
+ */
+ void blackHole(NetworkOperationIterator noi);
+
+ /**
+ * Defers decision making on "noi" until virtual time "dontAskUntil". Use
+ * this when getNextReadyRequest() returns a request you want to deal with
+ * after looking at other requests.
+ */
+ void requeueAt(NetworkOperationIterator noi, Date_t dontAskUntil);
+
 /**
- * Mock network implementation for use in unit tests.
+ * Runs the simulator forward until now() == until or hasReadyRequests() is true.
 *
- * To use, construct a new instance on the heap, and keep a pointer to it. Pass
- * the pointer to the instance into the ReplicationExecutor constructor, transfering
- * ownership. Start the executor's run() method in a separate thread, schedule the
- * work you want to test into the executor, then while the test is still going, iterate
- * through the ready network requests, servicing them and advancing time as needed.
+ * Will not return until the executor thread is blocked in waitForWorkUntil or waitForWork.
+ */
+ void runUntil(Date_t until);
+
+ /**
+ * Processes all ready, scheduled network operations.
 *
- * The mock has a fully virtualized notion of time and the the network. When the
- * replication executor under test schedules a network operation, the startCommand
- * method of this class adds an entry to the _unscheduled queue for immediate consideration.
- * The test driver loop, when it examines the request, may schedule a response, ask the
- * interface to redeliver the request at a later virtual time, or to swallow the virtual
- * request until the end of the simulation. The test driver loop can also instruct the
- * interface to run forward through virtual time until there are operations ready to
- * consider, via runUntil.
+ * Will not return until the executor thread is blocked in waitForWorkUntil or waitForWork.
+ */
+ void runReadyNetworkOperations();
+
+private:
+ /**
+ * Type used to identify which thread (network mock or executor) is currently executing.
 *
- * The thread acting as the "network" and the executor run thread are highly synchronized
- * by this code, allowing for deterministic control of operation interleaving.
+ * Values are used in a bitmask, as well.
+ */
+ enum ThreadType { kNoThread = 0, kExecutorThread = 1, kNetworkThread = 2 };
+
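Taken together, the methods declared above form the driver loop the class comment describes. A hedged usage sketch written against only the documented signatures; makeResponse() is an assumed stand-in for building a ResponseStatus, and the arithmetic assumes Date_t counts milliseconds:

// Assumes an executor thread is already running work that issued one remote command.
net->enterNetwork();
NetworkInterfaceMock::NetworkOperationIterator noi = net->getNextReadyRequest();
Date_t replyAt = net->now() + 10 * 1000;              // ten virtual seconds from now
net->scheduleResponse(noi, replyAt, makeResponse());  // makeResponse() is hypothetical
net->runUntil(replyAt);                               // advance virtual time; the reply is delivered
net->exitNetwork();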
+ /**
+ * Returns the current virtualized time.
+ */
+ Date_t _now_inlock() const {
+ return _now;
+ }
+
+ /**
+ * Implementation of waitForWork*.
+ */
+ void _waitForWork_inlock(boost::unique_lock<boost::mutex>* lk);
+
+ /**
+ * Returns true if there are ready requests for the network thread to service.
+ */
+ bool _hasReadyRequests_inlock();
+
+ /**
+ * Returns true if the network thread could run right now.
+ */
+ bool _isNetworkThreadRunnable_inlock();
+
+ /**
+ * Returns true if the executor thread could run right now.
+ */
+ bool _isExecutorThreadRunnable_inlock();
+
+ /**
+ * Runs all ready network operations, called while holding "lk". May drop and
+ * reacquire "lk" several times, but will not return until the executor has blocked
+ * in waitFor*.
+ */
+ void _runReadyNetworkOperations_inlock(boost::unique_lock<boost::mutex>* lk);
+
+ // Mutex that synchronizes access to mutable data in this class and its subclasses.
+ // Fields guarded by the mutex are labeled (M), below, and those that are read-only
+ // in multi-threaded execution, and so unsynchronized, are labeled (R).
+ boost::mutex _mutex;
+
+ // Condition signaled to indicate that the network processing thread should wake up.
+ boost::condition_variable _shouldWakeNetworkCondition; // (M)
+
+ // Condition signaled to indicate that the executor run thread should wake up.
+ boost::condition_variable _shouldWakeExecutorCondition; // (M)
+
+ // Bitmask indicating which threads are runnable.
+ int _waitingToRunMask; // (M)
+
+ // Indicator of which thread, if any, is currently running.
+ ThreadType _currentlyRunning; // (M)
+
+ // The current time reported by this instance of NetworkInterfaceMock.
+ Date_t _now; // (M)
+
+ // Set to true by "startUp()"
+ bool _hasStarted; // (M)
+
+ // Set to true by "shutDown()".
+ bool _inShutdown; // (M)
+
+ // Next date that the executor expects to wake up at (due to a scheduleWorkAt() call).
+ Date_t _executorNextWakeupDate; // (M)
+
+ // List of network operations whose responses haven't been scheduled or blackholed. This is
+ // where network requests are first queued. It is sorted by
+ // NetworkOperation::_nextConsiderationDate, which is set to now() when startCommand() is
+ // called, and adjusted by requeueAt().
+ NetworkOperationList _unscheduled; // (M)
+
+ // List of network operations that have been returned by getNextReadyRequest() but not
+ // yet scheduled, black-holed or requeued.
+ NetworkOperationList _processing; // (M)
+
+ // List of network operations whose responses have been scheduled but not delivered, sorted
+ // by NetworkOperation::_responseDate. These operations will have their responses delivered
+ // when now() == getResponseDate().
+ NetworkOperationList _scheduled; // (M)
+
+ // List of network operations that will not be responded to until shutdown() is called.
+ NetworkOperationList _blackHoled; // (M)
+
+ // Pointer to the executor into which this mock is installed. Used to signal the executor
+ // when the clock changes.
+ ReplicationExecutor* _executor; // (R)
+};
+
+/**
+ * Representation of an in-progress network operation.
+ */
+class NetworkInterfaceMock::NetworkOperation {
+public:
+ NetworkOperation();
+ NetworkOperation(const ReplicationExecutor::CallbackHandle& cbHandle,
+ const ReplicationExecutor::RemoteCommandRequest& theRequest,
+ Date_t theRequestDate,
+ const RemoteCommandCompletionFn& onFinish);
+ ~NetworkOperation();
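The private helpers and (M)-guarded members above implement a baton-passing protocol: one mutex, two condition variables, and a runnable mask guarantee that at most one of the network and executor threads runs at a time. A self-contained sketch of that handoff pattern using std primitives (the class itself uses the boost equivalents):

#include <condition_variable>
#include <iostream>
#include <mutex>
#include <thread>

std::mutex mtx;
std::condition_variable wakeA, wakeB;
int turn = 0;  // 0 -> thread A may run, 1 -> thread B may run

void runAs(int me, std::condition_variable& mine, std::condition_variable& other) {
    for (int i = 0; i < 3; ++i) {
        std::unique_lock<std::mutex> lk(mtx);
        mine.wait(lk, [me] { return turn == me; });  // block until it is our turn
        std::cout << "thread " << me << " step " << i << std::endl;
        turn = 1 - me;       // hand the baton to the peer...
        other.notify_one();  // ...and wake it
    }
}

int main() {
    std::thread a(runAs, 0, std::ref(wakeA), std::ref(wakeB));
    std::thread b(runAs, 1, std::ref(wakeB), std::ref(wakeA));
    a.join();
    b.join();
    return 0;
}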
+
+ /**
+ * Adjusts the stored virtual time at which this entry will be subject to consideration
+ * by the test harness.
+ */
+ void setNextConsiderationDate(Date_t nextConsiderationDate);
+
+ /**
+ * Sets the response and the virtual time at which it will be delivered.
+ */
+ void setResponse(Date_t responseDate, const ResponseStatus& response);
+
+ /**
+ * Predicate that returns true if cbHandle equals the executor's handle for this network
+ * operation. Used for searching lists of NetworkOperations.
+ */
+ bool isForCallback(const ReplicationExecutor::CallbackHandle& cbHandle) const {
+ return cbHandle == _cbHandle;
+ }
+
+ /**
+ * Gets the request that initiated this operation.
+ */
+ const ReplicationExecutor::RemoteCommandRequest& getRequest() const {
+ return _request;
+ }
+
+ /**
+ * Gets the virtual time at which the operation was started.
+ */
+ Date_t getRequestDate() const {
+ return _requestDate;
+ }
+
+ /**
+ * Gets the virtual time at which the test harness should next consider what to do
+ * with this request.
+ */
+ Date_t getNextConsiderationDate() const {
+ return _nextConsiderationDate;
+ }
+
+ /**
+ * After setResponse() has been called, returns the virtual time at which
+ * the response should be delivered.
- */ - void scheduleResponse( - NetworkOperationIterator noi, - Date_t when, - const ResponseStatus& response); - - /** - * Swallows "noi", causing the network interface to not respond to it until - * shutdown() is called. - */ - void blackHole(NetworkOperationIterator noi); - - /** - * Defers decision making on "noi" until virtual time "dontAskUntil". Use - * this when getNextReadyRequest() returns a request you want to deal with - * after looking at other requests. - */ - void requeueAt(NetworkOperationIterator noi, Date_t dontAskUntil); - - /** - * Runs the simulator forward until now() == until or hasReadyRequests() is true. - * - * Will not return until the executor thread is blocked in waitForWorkUntil or waitForWork. - */ - void runUntil(Date_t until); - - /** - * Processes all ready, scheduled network operations. - * - * Will not return until the executor thread is blocked in waitForWorkUntil or waitForWork. - */ - void runReadyNetworkOperations(); - - private: - /** - * Type used to identify which thread (network mock or executor) is currently executing. - * - * Values are used in a bitmask, as well. - */ - enum ThreadType { - kNoThread = 0, - kExecutorThread = 1, - kNetworkThread = 2 - }; - - /** - * Returns the current virtualized time. - */ - Date_t _now_inlock() const { return _now; } - - /** - * Implementation of waitForWork*. - */ - void _waitForWork_inlock(boost::unique_lock<boost::mutex>* lk); - - /** - * Returns true if there are ready requests for the network thread to service. - */ - bool _hasReadyRequests_inlock(); - - /** - * Returns true if the network thread could run right now. - */ - bool _isNetworkThreadRunnable_inlock(); - - /** - * Returns true if the executor thread could run right now. - */ - bool _isExecutorThreadRunnable_inlock(); - - /** - * Runs all ready network operations, called while holding "lk". May drop and - * reaquire "lk" several times, but will not return until the executor has blocked - * in waitFor*. - */ - void _runReadyNetworkOperations_inlock(boost::unique_lock<boost::mutex>* lk); - - // Mutex that synchronizes access to mutable data in this class and its subclasses. - // Fields guarded by the mutex are labled (M), below, and those that are read-only - // in multi-threaded execution, and so unsynchronized, are labeled (R). - boost::mutex _mutex; - - // Condition signaled to indicate that the network processing thread should wake up. - boost::condition_variable _shouldWakeNetworkCondition; // (M) - - // Condition signaled to indicate that the executor run thread should wake up. - boost::condition_variable _shouldWakeExecutorCondition; // (M) - - // Bitmask indicating which threads are runnable. - int _waitingToRunMask; // (M) - - // Indicator of which thread, if any, is currently running. - ThreadType _currentlyRunning; // (M) - - // The current time reported by this instance of NetworkInterfaceMock. - Date_t _now; // (M) - - // Set to true by "startUp()" - bool _hasStarted; // (M) - - // Set to true by "shutDown()". - bool _inShutdown; // (M) - - // Next date that the executor expects to wake up at (due to a scheduleWorkAt() call). - Date_t _executorNextWakeupDate; // (M) - - // List of network operations whose responses haven't been scheduled or blackholed. This is - // where network requests are first queued. It is sorted by - // NetworkOperation::_nextConsiderationDate, which is set to now() when startCommand() is - // called, and adjusted by requeueAt(). 
- NetworkOperationList _unscheduled; // (M) - - // List of network operations that have been returned by getNextReadyRequest() but not - // yet scheudled, black-holed or requeued. - NetworkOperationList _processing; // (M) - - // List of network operations whose responses have been scheduled but not delivered, sorted - // by NetworkOperation::_responseDate. These operations will have their responses delivered - // when now() == getResponseDate(). - NetworkOperationList _scheduled; // (M) - - // List of network operations that will not be responded to until shutdown() is called. - NetworkOperationList _blackHoled; // (M) - - // Pointer to the executor into which this mock is installed. Used to signal the executor - // when the clock changes. - ReplicationExecutor* _executor; // (R) - }; + Date_t getResponseDate() const { + return _responseDate; + } /** - * Representation of an in-progress network operation. + * Delivers the response, by invoking the onFinish callback passed into the constructor. */ - class NetworkInterfaceMock::NetworkOperation { - public: - NetworkOperation(); - NetworkOperation(const ReplicationExecutor::CallbackHandle& cbHandle, - const ReplicationExecutor::RemoteCommandRequest& theRequest, - Date_t theRequestDate, - const RemoteCommandCompletionFn& onFinish); - ~NetworkOperation(); - - /** - * Adjusts the stored virtual time at which this entry will be subject to consideration - * by the test harness. - */ - void setNextConsiderationDate(Date_t nextConsiderationDate); - - /** - * Sets the response and thet virtual time at which it will be delivered. - */ - void setResponse(Date_t responseDate, const ResponseStatus& response); - - /** - * Predicate that returns true if cbHandle equals the executor's handle for this network - * operation. Used for searching lists of NetworkOperations. - */ - bool isForCallback(const ReplicationExecutor::CallbackHandle& cbHandle) const { - return cbHandle == _cbHandle; - } - - /** - * Gets the request that initiated this operation. - */ - const ReplicationExecutor::RemoteCommandRequest& getRequest() const { return _request; } - - /** - * Gets the virtual time at which the operation was started. - */ - Date_t getRequestDate() const { return _requestDate; } - - /** - * Gets the virtual time at which the test harness should next consider what to do - * with this request. - */ - Date_t getNextConsiderationDate() const { return _nextConsiderationDate; } - - /** - * After setResponse() has been called, returns the virtual time at which - * the response should be delivered. - */ - Date_t getResponseDate() const { return _responseDate; } - - /** - * Delivers the response, by invoking the onFinish callback passed into the constructor. 
- */ - void finishResponse(); - - private: - Date_t _requestDate; - Date_t _nextConsiderationDate; - Date_t _responseDate; - ReplicationExecutor::CallbackHandle _cbHandle; - ReplicationExecutor::RemoteCommandRequest _request; - ResponseStatus _response; - RemoteCommandCompletionFn _onFinish; - }; + void finishResponse(); + +private: + Date_t _requestDate; + Date_t _nextConsiderationDate; + Date_t _responseDate; + ReplicationExecutor::CallbackHandle _cbHandle; + ReplicationExecutor::RemoteCommandRequest _request; + ResponseStatus _response; + RemoteCommandCompletionFn _onFinish; +}; } // namespace repl } // namespace mongo diff --git a/src/mongo/db/repl/operation_context_repl_mock.cpp b/src/mongo/db/repl/operation_context_repl_mock.cpp index b5c44316261..f5ac1a7e5ce 100644 --- a/src/mongo/db/repl/operation_context_repl_mock.cpp +++ b/src/mongo/db/repl/operation_context_repl_mock.cpp @@ -36,10 +36,9 @@ namespace mongo { namespace repl { - OperationContextReplMock::OperationContextReplMock() - : _lockState(new MMAPV1LockerImpl()) { } +OperationContextReplMock::OperationContextReplMock() : _lockState(new MMAPV1LockerImpl()) {} - OperationContextReplMock::~OperationContextReplMock() {} +OperationContextReplMock::~OperationContextReplMock() {} } // namespace repl } // namespace mongo diff --git a/src/mongo/db/repl/operation_context_repl_mock.h b/src/mongo/db/repl/operation_context_repl_mock.h index 660fbdfc5db..4ba3a4b4bf3 100644 --- a/src/mongo/db/repl/operation_context_repl_mock.h +++ b/src/mongo/db/repl/operation_context_repl_mock.h @@ -34,23 +34,25 @@ namespace mongo { - class Locker; +class Locker; namespace repl { - /** - * Mock implementation of OperationContext that can be used with real instances of LockManager. - */ - class OperationContextReplMock : public OperationContextNoop { - public: - OperationContextReplMock(); - virtual ~OperationContextReplMock(); - - virtual Locker* lockState() const { return _lockState.get(); } - - private: - boost::scoped_ptr<Locker> _lockState; - }; +/** + * Mock implementation of OperationContext that can be used with real instances of LockManager. + */ +class OperationContextReplMock : public OperationContextNoop { +public: + OperationContextReplMock(); + virtual ~OperationContextReplMock(); + + virtual Locker* lockState() const { + return _lockState.get(); + } + +private: + boost::scoped_ptr<Locker> _lockState; +}; } // namespace repl } // namespace mongo diff --git a/src/mongo/db/repl/oplog.cpp b/src/mongo/db/repl/oplog.cpp index 53757442e49..df2c04b8e22 100644 --- a/src/mongo/db/repl/oplog.cpp +++ b/src/mongo/db/repl/oplog.cpp @@ -73,645 +73,629 @@ namespace mongo { - using std::endl; - using std::stringstream; +using std::endl; +using std::stringstream; namespace repl { namespace { - // cached copies of these...so don't rename them, drop them, etc.!!! - Database* localDB = NULL; - Collection* localOplogMainCollection = 0; - Collection* localOplogRSCollection = 0; - - // Synchronizes the section where a new OpTime is generated and when it actually - // appears in the oplog. - mongo::mutex newOpMutex("oplogNewOp"); - boost::condition newOptimeNotifier; - - // so we can fail the same way - void checkOplogInsert( StatusWith<RecordId> result ) { - massert( 17322, - str::stream() << "write to oplog failed: " << result.getStatus().toString(), - result.isOK() ); - } - - - /** - * Allocates an optime for a new entry in the oplog, and updates the replication coordinator to - * reflect that new optime. 
Returns the new optime and the correct value of the "h" field for - * the new oplog entry. - * - * NOTE: From the time this function returns to the time that the new oplog entry is written - * to the storage system, all errors must be considered fatal. This is because the this - * function registers the new optime with the storage system and the replication coordinator, - * and provides no facility to revert those registrations on rollback. - */ - std::pair<OpTime, long long> getNextOpTime(OperationContext* txn, - Collection* oplog, - const char* ns, - ReplicationCoordinator* replCoord, - const char* opstr) { - mutex::scoped_lock lk(newOpMutex); - OpTime ts = getNextGlobalOptime(); - newOptimeNotifier.notify_all(); - - fassert(28560, oplog->getRecordStore()->oplogDiskLocRegister(txn, ts)); +// cached copies of these...so don't rename them, drop them, etc.!!! +Database* localDB = NULL; +Collection* localOplogMainCollection = 0; +Collection* localOplogRSCollection = 0; - long long hashNew; +// Synchronizes the section where a new OpTime is generated and when it actually +// appears in the oplog. +mongo::mutex newOpMutex("oplogNewOp"); +boost::condition newOptimeNotifier; - if (replCoord->getReplicationMode() == ReplicationCoordinator::modeReplSet) { +// so we can fail the same way +void checkOplogInsert(StatusWith<RecordId> result) { + massert(17322, + str::stream() << "write to oplog failed: " << result.getStatus().toString(), + result.isOK()); +} - hashNew = BackgroundSync::get()->getLastAppliedHash(); - // Check to make sure logOp() is legal at this point. - if (*opstr == 'n') { - // 'n' operations are always logged - invariant(*ns == '\0'); - - // 'n' operations do not advance the hash, since they are not rolled back - } - else { - // Advance the hash - hashNew = (hashNew * 131 + ts.asLL()) * 17 + replCoord->getMyId(); - - BackgroundSync::get()->setLastAppliedHash(hashNew); - } - } - else { - hashNew = 0; +/** + * Allocates an optime for a new entry in the oplog, and updates the replication coordinator to + * reflect that new optime. Returns the new optime and the correct value of the "h" field for + * the new oplog entry. + * + * NOTE: From the time this function returns to the time that the new oplog entry is written + * to the storage system, all errors must be considered fatal. This is because the this + * function registers the new optime with the storage system and the replication coordinator, + * and provides no facility to revert those registrations on rollback. + */ +std::pair<OpTime, long long> getNextOpTime(OperationContext* txn, + Collection* oplog, + const char* ns, + ReplicationCoordinator* replCoord, + const char* opstr) { + mutex::scoped_lock lk(newOpMutex); + OpTime ts = getNextGlobalOptime(); + newOptimeNotifier.notify_all(); + + fassert(28560, oplog->getRecordStore()->oplogDiskLocRegister(txn, ts)); + + long long hashNew; + + if (replCoord->getReplicationMode() == ReplicationCoordinator::modeReplSet) { + hashNew = BackgroundSync::get()->getLastAppliedHash(); + + // Check to make sure logOp() is legal at this point. 
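A few lines below, the hash for the new oplog entry is advanced as hashNew = (hashNew * 131 + ts.asLL()) * 17 + replCoord->getMyId(), chaining the previous hash, the new optime, and the member id so that diverging nodes produce different hash streams. A self-contained sketch of that chain (the optime values and member id are illustrative):

#include <iostream>

long long advanceOplogHash(long long prevHash, long long optimeAsLL, int memberId) {
    // Mirrors the update applied for every non-noop oplog entry.
    return (prevHash * 131 + optimeAsLL) * 17 + memberId;
}

int main() {
    long long h = 0;  // hash carried over from the previous entry
    h = advanceOplogHash(h, 5742191822LL, 1);  // illustrative optime, member id 1
    h = advanceOplogHash(h, 5742191823LL, 1);
    std::cout << "h: " << h << std::endl;
    return 0;
}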
+ if (*opstr == 'n') { + // 'n' operations are always logged + invariant(*ns == '\0'); + + // 'n' operations do not advance the hash, since they are not rolled back + } else { + // Advance the hash + hashNew = (hashNew * 131 + ts.asLL()) * 17 + replCoord->getMyId(); + + BackgroundSync::get()->setLastAppliedHash(hashNew); } - - replCoord->setMyLastOptime(ts); - return std::pair<OpTime,long long>(ts, hashNew); + } else { + hashNew = 0; } - /** - * This allows us to stream the oplog entry directly into data region - * main goal is to avoid copying the o portion - * which can be very large - * TODO: can have this build the entire doc - */ - class OplogDocWriter : public DocWriter { - public: - OplogDocWriter( const BSONObj& frame, const BSONObj& oField ) - : _frame( frame ), _oField( oField ) { - } + replCoord->setMyLastOptime(ts); + return std::pair<OpTime, long long>(ts, hashNew); +} - ~OplogDocWriter(){} +/** + * This allows us to stream the oplog entry directly into data region + * main goal is to avoid copying the o portion + * which can be very large + * TODO: can have this build the entire doc + */ +class OplogDocWriter : public DocWriter { +public: + OplogDocWriter(const BSONObj& frame, const BSONObj& oField) : _frame(frame), _oField(oField) {} - void writeDocument( char* start ) const { - char* buf = start; + ~OplogDocWriter() {} - memcpy( buf, _frame.objdata(), _frame.objsize() - 1 ); // don't copy final EOO + void writeDocument(char* start) const { + char* buf = start; - reinterpret_cast<int*>( buf )[0] = documentSize(); + memcpy(buf, _frame.objdata(), _frame.objsize() - 1); // don't copy final EOO - buf += ( _frame.objsize() - 1 ); - buf[0] = (char)Object; - buf[1] = 'o'; - buf[2] = 0; - memcpy( buf+3, _oField.objdata(), _oField.objsize() ); - buf += 3 + _oField.objsize(); - buf[0] = EOO; + reinterpret_cast<int*>(buf)[0] = documentSize(); - verify( static_cast<size_t>( ( buf + 1 ) - start ) == documentSize() ); // DEV? - } + buf += (_frame.objsize() - 1); + buf[0] = (char)Object; + buf[1] = 'o'; + buf[2] = 0; + memcpy(buf + 3, _oField.objdata(), _oField.objsize()); + buf += 3 + _oField.objsize(); + buf[0] = EOO; - size_t documentSize() const { - return _frame.objsize() + _oField.objsize() + 1 /* type */ + 2 /* "o" */; - } + verify(static_cast<size_t>((buf + 1) - start) == documentSize()); // DEV? + } - private: - BSONObj _frame; - BSONObj _oField; - }; - - /* we write to local.oplog.rs: - { ts : ..., h: ..., v: ..., op: ..., etc } - ts: an OpTime timestamp - h: hash - v: version - op: - "i" insert - "u" update - "d" delete - "c" db cmd - "db" declares presence of a database (ns is set to the db name + '.') - "n" no op - - bb param: - if not null, specifies a boolean to pass along to the other side as b: param. - used for "justOne" or "upsert" flags on 'd', 'u' + size_t documentSize() const { + return _frame.objsize() + _oField.objsize() + 1 /* type */ + 2 /* "o" */; + } - */ +private: + BSONObj _frame; + BSONObj _oField; +}; + +/* we write to local.oplog.rs: + { ts : ..., h: ..., v: ..., op: ..., etc } + ts: an OpTime timestamp + h: hash + v: version + op: + "i" insert + "u" update + "d" delete + "c" db cmd + "db" declares presence of a database (ns is set to the db name + '.') + "n" no op + + bb param: + if not null, specifies a boolean to pass along to the other side as b: param. 
+ used for "justOne" or "upsert" flags on 'd', 'u' - void _logOpRS(OperationContext* txn, - const char *opstr, - const char *ns, - const char *logNS, - const BSONObj& obj, - BSONObj *o2, - bool *bb, - bool fromMigrate ) { - if ( strncmp(ns, "local.", 6) == 0 ) { - return; - } - - Lock::DBLock lk(txn->lockState(), "local", MODE_IX); - Lock::OplogIntentWriteLock oplogLk(txn->lockState()); +*/ - DEV verify( logNS == 0 ); // check this was never a master/slave master +void _logOpRS(OperationContext* txn, + const char* opstr, + const char* ns, + const char* logNS, + const BSONObj& obj, + BSONObj* o2, + bool* bb, + bool fromMigrate) { + if (strncmp(ns, "local.", 6) == 0) { + return; + } - if ( localOplogRSCollection == 0 ) { - Client::Context ctx(txn, rsoplog); - localDB = ctx.db(); - invariant( localDB ); - localOplogRSCollection = localDB->getCollection( rsoplog ); - massert(13347, "local.oplog.rs missing. did you drop it? if so restart server", localOplogRSCollection); - } + Lock::DBLock lk(txn->lockState(), "local", MODE_IX); + Lock::OplogIntentWriteLock oplogLk(txn->lockState()); - ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator(); - if (ns[0] && !replCoord->canAcceptWritesForDatabase(nsToDatabaseSubstring(ns))) { - severe() << "replSet error : logOp() but can't accept write to collection " << ns; - fassertFailed(17405); - } + DEV verify(logNS == 0); // check this was never a master/slave master - oplogLk.serializeIfNeeded(); - std::pair<OpTime, long long> slot = getNextOpTime(txn, - localOplogRSCollection, - ns, - replCoord, - opstr); - - /* we jump through a bunch of hoops here to avoid copying the obj buffer twice -- - instead we do a single copy to the destination position in the memory mapped file. - */ - - BSONObjBuilder b(256); - b.appendTimestamp("ts", slot.first.asDate()); - b.append("h", slot.second); - b.append("v", OPLOG_VERSION); - b.append("op", opstr); - b.append("ns", ns); - if (fromMigrate) - b.appendBool("fromMigrate", true); - if ( bb ) - b.appendBool("b", *bb); - if ( o2 ) - b.append("o2", *o2); - BSONObj partial = b.done(); - - OplogDocWriter writer( partial, obj ); - checkOplogInsert( localOplogRSCollection->insertDocument( txn, &writer, false ) ); - - txn->getClient()->setLastOp(slot.first); + if (localOplogRSCollection == 0) { + Client::Context ctx(txn, rsoplog); + localDB = ctx.db(); + invariant(localDB); + localOplogRSCollection = localDB->getCollection(rsoplog); + massert(13347, + "local.oplog.rs missing. did you drop it? if so restart server", + localOplogRSCollection); } - void _logOpOld(OperationContext* txn, - const char *opstr, - const char *ns, - const char *logNS, - const BSONObj& obj, - BSONObj *o2, - bool *bb, - bool fromMigrate ) { - - - if ( strncmp(ns, "local.", 6) == 0 ) { - return; - } + ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator(); + if (ns[0] && !replCoord->canAcceptWritesForDatabase(nsToDatabaseSubstring(ns))) { + severe() << "replSet error : logOp() but can't accept write to collection " << ns; + fassertFailed(17405); + } - Lock::DBLock lk(txn->lockState(), "local", MODE_IX); + oplogLk.serializeIfNeeded(); + std::pair<OpTime, long long> slot = + getNextOpTime(txn, localOplogRSCollection, ns, replCoord, opstr); - if( logNS == 0 ) { - logNS = "local.oplog.$main"; - } + /* we jump through a bunch of hoops here to avoid copying the obj buffer twice -- + instead we do a single copy to the destination position in the memory mapped file. 
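The comment above is about a single-copy framing trick: OplogDocWriter emits the partial frame minus its terminator, splices in the "o" field header and payload, appends a fresh terminator, and patches the total size into the first four bytes. A self-contained sketch of that framing over plain bytes (simplified stand-in data, not real BSON):

#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

int main() {
    // Pretend "frame" is the partial document (everything except the o field),
    // whose last byte is the EOO terminator, and "payload" is the o subobject.
    std::vector<char> frame = {11, 0, 0, 0, 't', 's', ':', '.', '.', '.', 0};
    std::vector<char> payload = {5, 0, 0, 0, 0};  // minimal empty subobject

    // Same accounting as documentSize(): frame + payload + 1 type byte + "o" and NUL.
    const std::size_t total = frame.size() + payload.size() + 1 + 2;

    std::vector<char> doc(frame.begin(), frame.end() - 1);  // drop the frame's EOO
    doc.push_back(0x03);                                    // type byte for an embedded object
    doc.push_back('o');
    doc.push_back('\0');
    doc.insert(doc.end(), payload.begin(), payload.end());
    doc.push_back(0x00);                                    // new final EOO

    const std::int32_t len = static_cast<std::int32_t>(total);
    std::memcpy(&doc[0], &len, sizeof(len));  // patch the total size up front
    assert(doc.size() == total);
    return 0;
}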
+ */ - Lock::CollectionLock lk2(txn->lockState(), logNS, MODE_IX); + BSONObjBuilder b(256); + b.appendTimestamp("ts", slot.first.asDate()); + b.append("h", slot.second); + b.append("v", OPLOG_VERSION); + b.append("op", opstr); + b.append("ns", ns); + if (fromMigrate) + b.appendBool("fromMigrate", true); + if (bb) + b.appendBool("b", *bb); + if (o2) + b.append("o2", *o2); + BSONObj partial = b.done(); + + OplogDocWriter writer(partial, obj); + checkOplogInsert(localOplogRSCollection->insertDocument(txn, &writer, false)); + + txn->getClient()->setLastOp(slot.first); +} + +void _logOpOld(OperationContext* txn, + const char* opstr, + const char* ns, + const char* logNS, + const BSONObj& obj, + BSONObj* o2, + bool* bb, + bool fromMigrate) { + if (strncmp(ns, "local.", 6) == 0) { + return; + } - if (localOplogMainCollection == 0) { - Client::Context ctx(txn, logNS); - localDB = ctx.db(); - invariant(localDB); - localOplogMainCollection = localDB->getCollection(logNS); - invariant(localOplogMainCollection); - } + Lock::DBLock lk(txn->lockState(), "local", MODE_IX); - ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator(); - std::pair<OpTime,long long> slot = getNextOpTime(txn, - localOplogMainCollection, - ns, - replCoord, - opstr); - - /* we jump through a bunch of hoops here to avoid copying the obj buffer twice -- - instead we do a single copy to the destination position in the memory mapped file. - */ - - BSONObjBuilder b(256); - b.appendTimestamp("ts", slot.first.asDate()); - b.append("op", opstr); - b.append("ns", ns); - if (fromMigrate) - b.appendBool("fromMigrate", true); - if ( bb ) - b.appendBool("b", *bb); - if ( o2 ) - b.append("o2", *o2); - BSONObj partial = b.done(); // partial is everything except the o:... part. - - OplogDocWriter writer( partial, obj ); - checkOplogInsert( localOplogMainCollection->insertDocument( txn, &writer, false ) ); - - txn->getClient()->setLastOp(slot.first); + if (logNS == 0) { + logNS = "local.oplog.$main"; } - void (*_logOp)(OperationContext* txn, - const char *opstr, - const char *ns, - const char *logNS, - const BSONObj& obj, - BSONObj *o2, - bool *bb, - bool fromMigrate ) = _logOpRS; -} // namespace - - void oldRepl() { _logOp = _logOpOld; } + Lock::CollectionLock lk2(txn->lockState(), logNS, MODE_IX); - void logKeepalive(OperationContext* txn) { - _logOp(txn, "n", "", 0, BSONObj(), 0, 0, false); - } - void logOpComment(OperationContext* txn, const BSONObj& obj) { - _logOp(txn, "n", "", 0, obj, 0, 0, false); - } - void logOpInitiate(OperationContext* txn, const BSONObj& obj) { - _logOpRS(txn, "n", "", 0, obj, 0, 0, false); + if (localOplogMainCollection == 0) { + Client::Context ctx(txn, logNS); + localDB = ctx.db(); + invariant(localDB); + localOplogMainCollection = localDB->getCollection(logNS); + invariant(localOplogMainCollection); } - /*@ @param opstr: - c userCreateNS - i insert - n no-op / keepalive - d delete / remove - u update + ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator(); + std::pair<OpTime, long long> slot = + getNextOpTime(txn, localOplogMainCollection, ns, replCoord, opstr); + + /* we jump through a bunch of hoops here to avoid copying the obj buffer twice -- + instead we do a single copy to the destination position in the memory mapped file. 
*/ - void logOp(OperationContext* txn, + + BSONObjBuilder b(256); + b.appendTimestamp("ts", slot.first.asDate()); + b.append("op", opstr); + b.append("ns", ns); + if (fromMigrate) + b.appendBool("fromMigrate", true); + if (bb) + b.appendBool("b", *bb); + if (o2) + b.append("o2", *o2); + BSONObj partial = b.done(); // partial is everything except the o:... part. + + OplogDocWriter writer(partial, obj); + checkOplogInsert(localOplogMainCollection->insertDocument(txn, &writer, false)); + + txn->getClient()->setLastOp(slot.first); +} + +void (*_logOp)(OperationContext* txn, const char* opstr, const char* ns, + const char* logNS, const BSONObj& obj, - BSONObj* patt, - bool* b, - bool fromMigrate) { + BSONObj* o2, + bool* bb, + bool fromMigrate) = _logOpRS; +} // namespace - if ( getGlobalReplicationCoordinator()->isReplEnabled() ) { - _logOp(txn, opstr, ns, 0, obj, patt, b, fromMigrate); - } - ensureShardVersionOKOrThrow(ns); - - // - // rollback-safe logOp listeners - // - getGlobalAuthorizationManager()->logOp(txn, opstr, ns, obj, patt, b); - logOpForSharding(txn, opstr, ns, obj, patt, fromMigrate); - logOpForDbHash(txn, ns); - if ( strstr( ns, ".system.js" ) ) { - Scope::storedFuncMod(txn); - } +void oldRepl() { + _logOp = _logOpOld; +} + +void logKeepalive(OperationContext* txn) { + _logOp(txn, "n", "", 0, BSONObj(), 0, 0, false); +} +void logOpComment(OperationContext* txn, const BSONObj& obj) { + _logOp(txn, "n", "", 0, obj, 0, 0, false); +} +void logOpInitiate(OperationContext* txn, const BSONObj& obj) { + _logOpRS(txn, "n", "", 0, obj, 0, 0, false); +} + +/*@ @param opstr: + c userCreateNS + i insert + n no-op / keepalive + d delete / remove + u update +*/ +void logOp(OperationContext* txn, + const char* opstr, + const char* ns, + const BSONObj& obj, + BSONObj* patt, + bool* b, + bool fromMigrate) { + if (getGlobalReplicationCoordinator()->isReplEnabled()) { + _logOp(txn, opstr, ns, 0, obj, patt, b, fromMigrate); } + ensureShardVersionOKOrThrow(ns); + + // + // rollback-safe logOp listeners + // + getGlobalAuthorizationManager()->logOp(txn, opstr, ns, obj, patt, b); + logOpForSharding(txn, opstr, ns, obj, patt, fromMigrate); + logOpForDbHash(txn, ns); + if (strstr(ns, ".system.js")) { + Scope::storedFuncMod(txn); + } +} + +OpTime writeOpsToOplog(OperationContext* txn, const std::deque<BSONObj>& ops) { + ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator(); + OpTime lastOptime; + MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN { + lastOptime = replCoord->getMyLastOptime(); + invariant(!ops.empty()); + ScopedTransaction transaction(txn, MODE_IX); + Lock::DBLock lk(txn->lockState(), "local", MODE_X); + + if (localOplogRSCollection == 0) { + Client::Context ctx(txn, rsoplog); - OpTime writeOpsToOplog(OperationContext* txn, const std::deque<BSONObj>& ops) { - ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator(); - OpTime lastOptime; - MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN { - lastOptime = replCoord->getMyLastOptime(); - invariant(!ops.empty()); - ScopedTransaction transaction(txn, MODE_IX); - Lock::DBLock lk(txn->lockState(), "local", MODE_X); - - if ( localOplogRSCollection == 0 ) { - Client::Context ctx(txn, rsoplog); - - localDB = ctx.db(); - verify( localDB ); - localOplogRSCollection = localDB->getCollection(rsoplog); - massert(13389, - "local.oplog.rs missing. did you drop it? 
if so restart server", - localOplogRSCollection); - } + localDB = ctx.db(); + verify(localDB); + localOplogRSCollection = localDB->getCollection(rsoplog); + massert(13389, + "local.oplog.rs missing. did you drop it? if so restart server", + localOplogRSCollection); + } - Client::Context ctx(txn, rsoplog, localDB); - WriteUnitOfWork wunit(txn); + Client::Context ctx(txn, rsoplog, localDB); + WriteUnitOfWork wunit(txn); - for (std::deque<BSONObj>::const_iterator it = ops.begin(); - it != ops.end(); - ++it) { - const BSONObj& op = *it; - const OpTime ts = op["ts"]._opTime(); + for (std::deque<BSONObj>::const_iterator it = ops.begin(); it != ops.end(); ++it) { + const BSONObj& op = *it; + const OpTime ts = op["ts"]._opTime(); - checkOplogInsert(localOplogRSCollection->insertDocument(txn, op, false)); + checkOplogInsert(localOplogRSCollection->insertDocument(txn, op, false)); - if (!(lastOptime < ts)) { - severe() << "replication oplog stream went back in time. " - "previous timestamp: " << lastOptime << " newest timestamp: " << ts - << ". Op being applied: " << op; - fassertFailedNoTrace(18905); - } - lastOptime = ts; + if (!(lastOptime < ts)) { + severe() << "replication oplog stream went back in time. " + "previous timestamp: " << lastOptime << " newest timestamp: " << ts + << ". Op being applied: " << op; + fassertFailedNoTrace(18905); } - wunit.commit(); + lastOptime = ts; + } + wunit.commit(); - BackgroundSync* bgsync = BackgroundSync::get(); - // Keep this up-to-date, in case we step up to primary. - long long hash = ops.back()["h"].numberLong(); - bgsync->setLastAppliedHash(hash); + BackgroundSync* bgsync = BackgroundSync::get(); + // Keep this up-to-date, in case we step up to primary. + long long hash = ops.back()["h"].numberLong(); + bgsync->setLastAppliedHash(hash); - ctx.getClient()->setLastOp(lastOptime); + ctx.getClient()->setLastOp(lastOptime); - replCoord->setMyLastOptime(lastOptime); - setNewOptime(lastOptime); + replCoord->setMyLastOptime(lastOptime); + setNewOptime(lastOptime); - return lastOptime; - } MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "write oplog entry", rsoplog); + return lastOptime; } - - void createOplog(OperationContext* txn) { - ScopedTransaction transaction(txn, MODE_X); - Lock::GlobalWrite lk(txn->lockState()); - - const char * ns = "local.oplog.$main"; - - const ReplSettings& replSettings = getGlobalReplicationCoordinator()->getSettings(); - bool rs = !replSettings.replSet.empty(); - if( rs ) - ns = rsoplog; - - Client::Context ctx(txn, ns); - Collection* collection = ctx.db()->getCollection( ns ); - - if ( collection ) { - - if (replSettings.oplogSize != 0) { - const CollectionOptions oplogOpts = - collection->getCatalogEntry()->getCollectionOptions(txn); - - int o = (int)(oplogOpts.cappedSize / ( 1024 * 1024 ) ); - int n = (int)(replSettings.oplogSize / (1024 * 1024)); - if ( n != o ) { - stringstream ss; - ss << "cmdline oplogsize (" << n << ") different than existing (" << o << ") see: http://dochub.mongodb.org/core/increase-oplog"; - log() << ss.str() << endl; - throw UserException( 13257 , ss.str() ); - } + MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "write oplog entry", rsoplog); +} + +void createOplog(OperationContext* txn) { + ScopedTransaction transaction(txn, MODE_X); + Lock::GlobalWrite lk(txn->lockState()); + + const char* ns = "local.oplog.$main"; + + const ReplSettings& replSettings = getGlobalReplicationCoordinator()->getSettings(); + bool rs = !replSettings.replSet.empty(); + if (rs) + ns = rsoplog; + + Client::Context ctx(txn, ns); + 
Collection* collection = ctx.db()->getCollection(ns); + + if (collection) { + if (replSettings.oplogSize != 0) { + const CollectionOptions oplogOpts = + collection->getCatalogEntry()->getCollectionOptions(txn); + + int o = (int)(oplogOpts.cappedSize / (1024 * 1024)); + int n = (int)(replSettings.oplogSize / (1024 * 1024)); + if (n != o) { + stringstream ss; + ss << "cmdline oplogsize (" << n << ") different than existing (" << o + << ") see: http://dochub.mongodb.org/core/increase-oplog"; + log() << ss.str() << endl; + throw UserException(13257, ss.str()); } - - if ( !rs ) - initOpTimeFromOplog(txn, ns); - return; } - /* create an oplog collection, if it doesn't yet exist. */ - long long sz = 0; - if ( replSettings.oplogSize != 0 ) { - sz = replSettings.oplogSize; - } - else { - /* not specified. pick a default size */ - sz = 50LL * 1024LL * 1024LL; - if ( sizeof(int *) >= 8 ) { + if (!rs) + initOpTimeFromOplog(txn, ns); + return; + } + + /* create an oplog collection, if it doesn't yet exist. */ + long long sz = 0; + if (replSettings.oplogSize != 0) { + sz = replSettings.oplogSize; + } else { + /* not specified. pick a default size */ + sz = 50LL * 1024LL * 1024LL; + if (sizeof(int*) >= 8) { #if defined(__APPLE__) - // typically these are desktops (dev machines), so keep it smallish - sz = (256-64) * 1024 * 1024; + // typically these are desktops (dev machines), so keep it smallish + sz = (256 - 64) * 1024 * 1024; #else - sz = 990LL * 1024 * 1024; - double free = - File::freeSpace(storageGlobalParams.dbpath); //-1 if call not supported. - long long fivePct = static_cast<long long>( free * 0.05 ); - if ( fivePct > sz ) - sz = fivePct; - // we use 5% of free space up to 50GB (1TB free) - static long long upperBound = 50LL * 1024 * 1024 * 1024; - if (fivePct > upperBound) - sz = upperBound; + sz = 990LL * 1024 * 1024; + double free = File::freeSpace(storageGlobalParams.dbpath); //-1 if call not supported. + long long fivePct = static_cast<long long>(free * 0.05); + if (fivePct > sz) + sz = fivePct; + // we use 5% of free space up to 50GB (1TB free) + static long long upperBound = 50LL * 1024 * 1024 * 1024; + if (fivePct > upperBound) + sz = upperBound; #endif - } } - - log() << "******" << endl; - log() << "creating replication oplog of size: " << (int)( sz / ( 1024 * 1024 ) ) << "MB..." << endl; - - CollectionOptions options; - options.capped = true; - options.cappedSize = sz; - options.autoIndexId = CollectionOptions::NO; - - MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN { - WriteUnitOfWork uow( txn ); - invariant(ctx.db()->createCollection(txn, ns, options)); - if( !rs ) - logOp(txn, "n", "", BSONObj() ); - uow.commit(); - } MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "createCollection", ns); - - /* sync here so we don't get any surprising lag later when we try to sync */ - StorageEngine* storageEngine = getGlobalEnvironment()->getGlobalStorageEngine(); - storageEngine->flushAllFiles(true); - log() << "******" << endl; } - // ------------------------------------- - - /** @param fromRepl false if from ApplyOpsCmd - @return true if was and update should have happened and the document DNE. see replset initial sync code. - */ - bool applyOperation_inlock(OperationContext* txn, - Database* db, - const BSONObj& op, - bool fromRepl, - bool convertUpdateToUpsert) { - LOG(3) << "applying op: " << op << endl; - bool failedUpdate = false; - - OpCounters * opCounters = fromRepl ? 
&replOpCounters : &globalOpCounters; - - const char *names[] = { "o", "ns", "op", "b", "o2" }; - BSONElement fields[5]; - op.getFields(5, names, fields); - BSONElement& fieldO = fields[0]; - BSONElement& fieldNs = fields[1]; - BSONElement& fieldOp = fields[2]; - BSONElement& fieldB = fields[3]; - BSONElement& fieldO2 = fields[4]; - - BSONObj o; - if( fieldO.isABSONObj() ) - o = fieldO.embeddedObject(); - - const char *ns = fieldNs.valuestrsafe(); - - BSONObj o2; - if (fieldO2.isABSONObj()) - o2 = fieldO2.Obj(); - - bool valueB = fieldB.booleanSafe(); - - if (nsIsFull(ns)) { - if (supportsDocLocking()) { - // WiredTiger, and others requires MODE_IX since the applier threads driving - // this allow writes to the same collection on any thread. - invariant(txn->lockState()->isCollectionLockedForMode(ns, MODE_IX)); + log() << "******" << endl; + log() << "creating replication oplog of size: " << (int)(sz / (1024 * 1024)) << "MB..." << endl; + + CollectionOptions options; + options.capped = true; + options.cappedSize = sz; + options.autoIndexId = CollectionOptions::NO; + + MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN { + WriteUnitOfWork uow(txn); + invariant(ctx.db()->createCollection(txn, ns, options)); + if (!rs) + logOp(txn, "n", "", BSONObj()); + uow.commit(); + } + MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "createCollection", ns); + + /* sync here so we don't get any surprising lag later when we try to sync */ + StorageEngine* storageEngine = getGlobalEnvironment()->getGlobalStorageEngine(); + storageEngine->flushAllFiles(true); + log() << "******" << endl; +} + +// ------------------------------------- + +/** @param fromRepl false if from ApplyOpsCmd + @return true if was and update should have happened and the document DNE. see replset initial sync code. + */ +bool applyOperation_inlock(OperationContext* txn, + Database* db, + const BSONObj& op, + bool fromRepl, + bool convertUpdateToUpsert) { + LOG(3) << "applying op: " << op << endl; + bool failedUpdate = false; + + OpCounters* opCounters = fromRepl ? &replOpCounters : &globalOpCounters; + + const char* names[] = {"o", "ns", "op", "b", "o2"}; + BSONElement fields[5]; + op.getFields(5, names, fields); + BSONElement& fieldO = fields[0]; + BSONElement& fieldNs = fields[1]; + BSONElement& fieldOp = fields[2]; + BSONElement& fieldB = fields[3]; + BSONElement& fieldO2 = fields[4]; + + BSONObj o; + if (fieldO.isABSONObj()) + o = fieldO.embeddedObject(); + + const char* ns = fieldNs.valuestrsafe(); + + BSONObj o2; + if (fieldO2.isABSONObj()) + o2 = fieldO2.Obj(); + + bool valueB = fieldB.booleanSafe(); + + if (nsIsFull(ns)) { + if (supportsDocLocking()) { + // WiredTiger, and others requires MODE_IX since the applier threads driving + // this allow writes to the same collection on any thread. + invariant(txn->lockState()->isCollectionLockedForMode(ns, MODE_IX)); + } else { + // mmapV1 ensures that all operations to the same collection are executed from + // the same worker thread, so it takes an exclusive lock (MODE_X) + invariant(txn->lockState()->isCollectionLockedForMode(ns, MODE_X)); + } + } + Collection* collection = db->getCollection(ns); + IndexCatalog* indexCatalog = collection == NULL ? 
NULL : collection->getIndexCatalog(); + + // operation type -- see logOp() comments for types + const char* opType = fieldOp.valuestrsafe(); + + if (*opType == 'i') { + opCounters->gotInsert(); + + const char* p = strchr(ns, '.'); + if (p && nsToCollectionSubstring(p) == "system.indexes") { + if (o["background"].trueValue()) { + IndexBuilder* builder = new IndexBuilder(o); + // This spawns a new thread and returns immediately. + builder->go(); + // Wait for thread to start and register itself + Lock::TempRelease release(txn->lockState()); + IndexBuilder::waitForBgIndexStarting(); } else { - // mmapV1 ensures that all operations to the same collection are executed from - // the same worker thread, so it takes an exclusive lock (MODE_X) - invariant(txn->lockState()->isCollectionLockedForMode(ns, MODE_X)); + IndexBuilder builder(o); + Status status = builder.buildInForeground(txn, db); + uassertStatusOK(status); } - } - Collection* collection = db->getCollection( ns ); - IndexCatalog* indexCatalog = collection == NULL ? NULL : collection->getIndexCatalog(); - - // operation type -- see logOp() comments for types - const char *opType = fieldOp.valuestrsafe(); - - if ( *opType == 'i' ) { - opCounters->gotInsert(); - - const char *p = strchr(ns, '.'); - if ( p && nsToCollectionSubstring( p ) == "system.indexes" ) { - if (o["background"].trueValue()) { - IndexBuilder* builder = new IndexBuilder(o); - // This spawns a new thread and returns immediately. - builder->go(); - // Wait for thread to start and register itself - Lock::TempRelease release(txn->lockState()); - IndexBuilder::waitForBgIndexStarting(); - } - else { - IndexBuilder builder(o); - Status status = builder.buildInForeground(txn, db); - uassertStatusOK(status); + } else { + // do upserts for inserts as we might get replayed more than once + OpDebug debug; + BSONElement _id; + if (!o.getObjectID(_id)) { + /* No _id. This will be very slow. */ + Timer t; + + const NamespaceString requestNs(ns); + UpdateRequest request(requestNs); + + request.setQuery(o); + request.setUpdates(o); + request.setUpsert(); + request.setFromReplication(); + UpdateLifecycleImpl updateLifecycle(true, requestNs); + request.setLifecycle(&updateLifecycle); + + update(txn, db, request, &debug); + + if (t.millis() >= 2) { + RARELY OCCASIONALLY log() + << "warning, repl doing slow updates (no _id field) for " << ns << endl; } + } else { + /* todo : it may be better to do an insert here, and then catch the dup key exception and do update + then. very few upserts will not be inserts... + */ + BSONObjBuilder b; + b.append(_id); + + const NamespaceString requestNs(ns); + UpdateRequest request(requestNs); + + request.setQuery(b.done()); + request.setUpdates(o); + request.setUpsert(); + request.setFromReplication(); + UpdateLifecycleImpl updateLifecycle(true, requestNs); + request.setLifecycle(&updateLifecycle); + + update(txn, db, request, &debug); } - else { - // do upserts for inserts as we might get replayed more than once - OpDebug debug; - BSONElement _id; - if( !o.getObjectID(_id) ) { - /* No _id. This will be very slow. 
*/ - Timer t; - - const NamespaceString requestNs(ns); - UpdateRequest request(requestNs); - - request.setQuery(o); - request.setUpdates(o); - request.setUpsert(); - request.setFromReplication(); - UpdateLifecycleImpl updateLifecycle(true, requestNs); - request.setLifecycle(&updateLifecycle); - - update(txn, db, request, &debug); - - if( t.millis() >= 2 ) { - RARELY OCCASIONALLY log() << "warning, repl doing slow updates (no _id field) for " << ns << endl; - } + } + } else if (*opType == 'u') { + opCounters->gotUpdate(); + + OpDebug debug; + BSONObj updateCriteria = o2; + const bool upsert = valueB || convertUpdateToUpsert; + + const NamespaceString requestNs(ns); + UpdateRequest request(requestNs); + + request.setQuery(updateCriteria); + request.setUpdates(o); + request.setUpsert(upsert); + request.setFromReplication(); + UpdateLifecycleImpl updateLifecycle(true, requestNs); + request.setLifecycle(&updateLifecycle); + + UpdateResult ur = update(txn, db, request, &debug); + + if (ur.numMatched == 0) { + if (ur.modifiers) { + if (updateCriteria.nFields() == 1) { + // was a simple { _id : ... } update criteria + failedUpdate = true; + log() << "replication failed to apply update: " << op.toString() << endl; } + // need to check to see if it isn't present so we can set failedUpdate correctly. + // note that adds some overhead for this extra check in some cases, such as an updateCriteria + // of the form + // { _id:..., { x : {$size:...} } + // thus this is not ideal. else { - /* todo : it may be better to do an insert here, and then catch the dup key exception and do update - then. very few upserts will not be inserts... - */ - BSONObjBuilder b; - b.append(_id); - - const NamespaceString requestNs(ns); - UpdateRequest request(requestNs); - - request.setQuery(b.done()); - request.setUpdates(o); - request.setUpsert(); - request.setFromReplication(); - UpdateLifecycleImpl updateLifecycle(true, requestNs); - request.setLifecycle(&updateLifecycle); - - update(txn, db, request, &debug); - } - } - } - else if ( *opType == 'u' ) { - opCounters->gotUpdate(); - - OpDebug debug; - BSONObj updateCriteria = o2; - const bool upsert = valueB || convertUpdateToUpsert; - - const NamespaceString requestNs(ns); - UpdateRequest request(requestNs); - - request.setQuery(updateCriteria); - request.setUpdates(o); - request.setUpsert(upsert); - request.setFromReplication(); - UpdateLifecycleImpl updateLifecycle(true, requestNs); - request.setLifecycle(&updateLifecycle); - - UpdateResult ur = update(txn, db, request, &debug); - - if( ur.numMatched == 0 ) { - if( ur.modifiers ) { - if( updateCriteria.nFields() == 1 ) { - // was a simple { _id : ... } update criteria + if (collection == NULL || + (indexCatalog->haveIdIndex(txn) && + Helpers::findById(txn, collection, updateCriteria).isNull()) || + // capped collections won't have an _id index + (!indexCatalog->haveIdIndex(txn) && + Helpers::findOne(txn, collection, updateCriteria, false).isNull())) { failedUpdate = true; - log() << "replication failed to apply update: " << op.toString() << endl; - } - // need to check to see if it isn't present so we can set failedUpdate correctly. - // note that adds some overhead for this extra check in some cases, such as an updateCriteria - // of the form - // { _id:..., { x : {$size:...} } - // thus this is not ideal. 
- else { - if (collection == NULL || - (indexCatalog->haveIdIndex(txn) && Helpers::findById(txn, collection, updateCriteria).isNull()) || - // capped collections won't have an _id index - (!indexCatalog->haveIdIndex(txn) && Helpers::findOne(txn, collection, updateCriteria, false).isNull())) { - failedUpdate = true; - log() << "replication couldn't find doc: " << op.toString() << endl; - } - - // Otherwise, it's present; zero objects were updated because of additional specifiers - // in the query for idempotence + log() << "replication couldn't find doc: " << op.toString() << endl; } + + // Otherwise, it's present; zero objects were updated because of additional specifiers + // in the query for idempotence } - else { - // this could happen benignly on an oplog duplicate replay of an upsert - // (because we are idempotent), - // if an regular non-mod update fails the item is (presumably) missing. - if( !upsert ) { - failedUpdate = true; - log() << "replication update of non-mod failed: " << op.toString() << endl; - } + } else { + // this could happen benignly on an oplog duplicate replay of an upsert + // (because we are idempotent), + // if an regular non-mod update fails the item is (presumably) missing. + if (!upsert) { + failedUpdate = true; + log() << "replication update of non-mod failed: " << op.toString() << endl; } } } - else if ( *opType == 'd' ) { - opCounters->gotDelete(); - if ( opType[1] == 0 ) - deleteObjects(txn, db, ns, o, PlanExecutor::YIELD_MANUAL, /*justOne*/ valueB); - else - verify( opType[1] == 'b' ); // "db" advertisement - } - else if ( *opType == 'c' ) { - bool done = false; - while (!done) { - BufBuilder bb; - BSONObjBuilder runCommandResult; - - // Applying commands in repl is done under Global W-lock, so it is safe to not - // perform the current DB checks after reacquiring the lock. - invariant(txn->lockState()->isW()); - - _runCommands(txn, ns, o, bb, runCommandResult, true, 0); - // _runCommands takes care of adjusting opcounters for command counting. - Status status = Command::getStatusFromCommandResult(runCommandResult.done()); - switch (status.code()) { + } else if (*opType == 'd') { + opCounters->gotDelete(); + if (opType[1] == 0) + deleteObjects(txn, db, ns, o, PlanExecutor::YIELD_MANUAL, /*justOne*/ valueB); + else + verify(opType[1] == 'b'); // "db" advertisement + } else if (*opType == 'c') { + bool done = false; + while (!done) { + BufBuilder bb; + BSONObjBuilder runCommandResult; + + // Applying commands in repl is done under Global W-lock, so it is safe to not + // perform the current DB checks after reacquiring the lock. + invariant(txn->lockState()->isW()); + + _runCommands(txn, ns, o, bb, runCommandResult, true, 0); + // _runCommands takes care of adjusting opcounters for command counting. + Status status = Command::getStatusFromCommandResult(runCommandResult.done()); + switch (status.code()) { case ErrorCodes::WriteConflict: { // Need to throw this up to a higher level where it will be caught and the // operation retried. 
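Aside on the 'c' (command) branch above: a WriteConflict status is deliberately rethrown so that a retry loop at a higher level, the same MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN/END pattern that wraps writeOpsToOplog() and createOplog() earlier in this file, can catch it and re-run the whole unit of work. A minimal, self-contained sketch of that retry shape (the exception type and helper below are stand-ins, not MongoDB's):

    #include <iostream>
    #include <stdexcept>

    // Toy stand-in for WriteConflictException.
    struct WriteConflict : std::runtime_error {
        WriteConflict() : std::runtime_error("write conflict") {}
    };

    // Re-run a unit of work until it commits, mirroring the shape of the
    // MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN/END macros: a conflict aborts
    // the attempt and the body is simply run again.
    template <typename Work>
    void writeConflictRetry(const char* opName, Work work) {
        for (int attempt = 1;; ++attempt) {
            try {
                work();
                return;  // committed
            } catch (const WriteConflict&) {
                std::cerr << opName << ": write conflict, retry " << attempt << "\n";
            }
        }
    }

    int main() {
        int conflictsLeft = 2;
        writeConflictRetry("write oplog entry", [&] {
            if (conflictsLeft-- > 0)
                throw WriteConflict();  // simulate two conflicting attempts
            std::cout << "committed\n";
        });
    }

Note that in the real loop the WriteUnitOfWork is constructed inside the body (as in the writeOpsToOplog() hunk above), so every retry starts from a fresh unit of work.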
@@ -734,74 +718,63 @@ namespace { break; } default: - warning() << "repl Failed command " << o << " on " << - nsToDatabaseSubstring(ns) << " with status " << status << - " during oplog application"; - // fallthrough + warning() << "repl Failed command " << o << " on " << nsToDatabaseSubstring(ns) + << " with status " << status << " during oplog application"; + // fallthrough case ErrorCodes::OK: done = true; break; - } } } - else if ( *opType == 'n' ) { - // no op - } - else { - throw MsgAssertionException( 14825 , ErrorMsg("error in applyOperation : unknown opType ", *opType) ); - } - - // AuthorizationManager's logOp method registers a RecoveryUnit::Change - // and to do so we need to have begun a UnitOfWork - WriteUnitOfWork wuow(txn); - getGlobalAuthorizationManager()->logOp( - txn, - opType, - ns, - o, - fieldO2.isABSONObj() ? &o2 : NULL, - !fieldB.eoo() ? &valueB : NULL ); - wuow.commit(); - - return failedUpdate; + } else if (*opType == 'n') { + // no op + } else { + throw MsgAssertionException(14825, + ErrorMsg("error in applyOperation : unknown opType ", *opType)); } - void waitUpToOneSecondForOptimeChange(const OpTime& referenceTime) { - mutex::scoped_lock lk(newOpMutex); + // AuthorizationManager's logOp method registers a RecoveryUnit::Change + // and to do so we need to have begun a UnitOfWork + WriteUnitOfWork wuow(txn); + getGlobalAuthorizationManager()->logOp( + txn, opType, ns, o, fieldO2.isABSONObj() ? &o2 : NULL, !fieldB.eoo() ? &valueB : NULL); + wuow.commit(); - while (referenceTime == getLastSetOptime()) { - if (!newOptimeNotifier.timed_wait(lk.boost(), - boost::posix_time::seconds(1))) - return; - } - } + return failedUpdate; +} - void setNewOptime(const OpTime& newTime) { - mutex::scoped_lock lk(newOpMutex); - setGlobalOptime(newTime); - newOptimeNotifier.notify_all(); +void waitUpToOneSecondForOptimeChange(const OpTime& referenceTime) { + mutex::scoped_lock lk(newOpMutex); + + while (referenceTime == getLastSetOptime()) { + if (!newOptimeNotifier.timed_wait(lk.boost(), boost::posix_time::seconds(1))) + return; } +} - void initOpTimeFromOplog(OperationContext* txn, const std::string& oplogNS) { - DBDirectClient c(txn); - BSONObj lastOp = c.findOne(oplogNS, - Query().sort(reverseNaturalObj), - NULL, - QueryOption_SlaveOk); +void setNewOptime(const OpTime& newTime) { + mutex::scoped_lock lk(newOpMutex); + setGlobalOptime(newTime); + newOptimeNotifier.notify_all(); +} - if (!lastOp.isEmpty()) { - LOG(1) << "replSet setting last OpTime"; - setNewOptime(lastOp[ "ts" ].date()); - } +void initOpTimeFromOplog(OperationContext* txn, const std::string& oplogNS) { + DBDirectClient c(txn); + BSONObj lastOp = c.findOne(oplogNS, Query().sort(reverseNaturalObj), NULL, QueryOption_SlaveOk); + + if (!lastOp.isEmpty()) { + LOG(1) << "replSet setting last OpTime"; + setNewOptime(lastOp["ts"].date()); } +} - void oplogCheckCloseDatabase(OperationContext* txn, Database* db) { - invariant(txn->lockState()->isW()); +void oplogCheckCloseDatabase(OperationContext* txn, Database* db) { + invariant(txn->lockState()->isW()); - localDB = NULL; - localOplogMainCollection = NULL; - localOplogRSCollection = NULL; - } + localDB = NULL; + localOplogMainCollection = NULL; + localOplogRSCollection = NULL; +} -} // namespace repl -} // namespace mongo +} // namespace repl +} // namespace mongo diff --git a/src/mongo/db/repl/oplog.h b/src/mongo/db/repl/oplog.h index 07b0723417a..f6f7bc3c82a 100644 --- a/src/mongo/db/repl/oplog.h +++ b/src/mongo/db/repl/oplog.h @@ -33,96 +33,96 @@ #include <string> 
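// [Editor's sketch, not part of this header] applyOperation_inlock(), declared
// below, is written so that replaying an op is harmless: inserts are applied
// as upserts, and convertUpdateToUpsert widens updates for the same reason.
// The idempotency property in miniature, with a std::map standing in for a
// collection keyed by _id:
//
//     #include <map>
//     #include <string>
//     int main() {
//         std::map<int, std::string> coll;
//         auto apply = [&](char op, int id, const std::string& v) {
//             if (op == 'i') coll[id] = v;         // insert applied as upsert
//             else if (op == 'd') coll.erase(id);  // missing doc: no-op
//         };
//         apply('i', 1, "a");
//         apply('i', 1, "a");  // replaying the op leaves the state unchanged
//         return coll.size() == 1 ? 0 : 1;
//     }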
namespace mongo { - class BSONObj; - class Database; - class OperationContext; - class OpTime; +class BSONObj; +class Database; +class OperationContext; +class OpTime; namespace repl { - // Redefines the function for logOp() to master/slave. - void oldRepl(); // master-slave - - // Create a new capped collection for the oplog if it doesn't yet exist. - // This will be either local.oplog.rs (replica sets) or local.oplog.$main (master/slave) - // If the collection already exists, set the 'last' OpTime if master/slave (side effect!) - void createOplog(OperationContext* txn); - - // This function writes ops into the replica-set oplog; - // used internally by replication secondaries after they have applied ops. Updates the global - // optime. - // Returns the optime for the last op inserted. - OpTime writeOpsToOplog(OperationContext* txn, const std::deque<BSONObj>& ops); - - const char rsoplog[] = "local.oplog.rs"; - static const int OPLOG_VERSION = 2; - - /** Log an operation to the local oplog - - @param opstr - "i" insert - "u" update - "d" delete - "c" db cmd - "n" no-op - "db" declares presence of a database (ns is set to the db name + '.') - - For 'u' records, 'obj' captures the mutation made to the object but not - the object itself. In that case, we provide also 'fullObj' which is the - image of the object _after_ the mutation logged here was applied. - - See _logOp() in oplog.cpp for more details. - */ - void logOp( OperationContext* txn, - const char *opstr, - const char *ns, - const BSONObj& obj, - BSONObj *patt = NULL, - bool *b = NULL, - bool fromMigrate = false); - - // Log an empty no-op operation to the local oplog - void logKeepalive(OperationContext* txn); - - /** puts obj in the oplog as a comment (a no-op). Just for diags. - convention is - { msg : "text", ... } - */ - void logOpComment(OperationContext* txn, const BSONObj& obj); - - // Same as logOpComment, except only works for replsets - void logOpInitiate(OperationContext* txn, const BSONObj& obj); - - // Flush out the cached pointers to the local database and oplog. - // Used by the closeDatabase command to ensure we don't cache closed things. - void oplogCheckCloseDatabase(OperationContext* txn, Database * db); - - /** - * take an op and apply locally - * used for applying from an oplog - * @param fromRepl really from replication or for testing/internal/command/etc... - * @param convertUpdateToUpsert convert some updates to upserts for idempotency reasons - * Returns if the op was an update that could not be applied (true on failure) - */ - bool applyOperation_inlock(OperationContext* txn, - Database* db, - const BSONObj& op, - bool fromRepl = true, - bool convertUpdateToUpsert = false); - - /** - * Waits one second for the OpTime from the oplog to change. - */ - void waitUpToOneSecondForOptimeChange(const OpTime& referenceTime); - - /** - * Initializes the global OpTime with the value from the timestamp of the last oplog entry. - */ - void initOpTimeFromOplog(OperationContext* txn, const std::string& oplogNS); - - /** - * Sets the global OpTime to be 'newTime'. - */ - void setNewOptime(const OpTime& newTime); -} // namespace repl -} // namespace mongo +// Redefines the function for logOp() to master/slave. +void oldRepl(); // master-slave + +// Create a new capped collection for the oplog if it doesn't yet exist. +// This will be either local.oplog.rs (replica sets) or local.oplog.$main (master/slave) +// If the collection already exists, set the 'last' OpTime if master/slave (side effect!) 
+void createOplog(OperationContext* txn); + +// This function writes ops into the replica-set oplog; +// used internally by replication secondaries after they have applied ops. Updates the global +// optime. +// Returns the optime for the last op inserted. +OpTime writeOpsToOplog(OperationContext* txn, const std::deque<BSONObj>& ops); + +const char rsoplog[] = "local.oplog.rs"; +static const int OPLOG_VERSION = 2; + +/** Log an operation to the local oplog + + @param opstr + "i" insert + "u" update + "d" delete + "c" db cmd + "n" no-op + "db" declares presence of a database (ns is set to the db name + '.') + + For 'u' records, 'obj' captures the mutation made to the object but not + the object itself. In that case, we provide also 'fullObj' which is the + image of the object _after_ the mutation logged here was applied. + + See _logOp() in oplog.cpp for more details. +*/ +void logOp(OperationContext* txn, + const char* opstr, + const char* ns, + const BSONObj& obj, + BSONObj* patt = NULL, + bool* b = NULL, + bool fromMigrate = false); + +// Log an empty no-op operation to the local oplog +void logKeepalive(OperationContext* txn); + +/** puts obj in the oplog as a comment (a no-op). Just for diags. + convention is + { msg : "text", ... } +*/ +void logOpComment(OperationContext* txn, const BSONObj& obj); + +// Same as logOpComment, except only works for replsets +void logOpInitiate(OperationContext* txn, const BSONObj& obj); + +// Flush out the cached pointers to the local database and oplog. +// Used by the closeDatabase command to ensure we don't cache closed things. +void oplogCheckCloseDatabase(OperationContext* txn, Database* db); + +/** + * take an op and apply locally + * used for applying from an oplog + * @param fromRepl really from replication or for testing/internal/command/etc... + * @param convertUpdateToUpsert convert some updates to upserts for idempotency reasons + * Returns if the op was an update that could not be applied (true on failure) + */ +bool applyOperation_inlock(OperationContext* txn, + Database* db, + const BSONObj& op, + bool fromRepl = true, + bool convertUpdateToUpsert = false); + +/** + * Waits one second for the OpTime from the oplog to change. + */ +void waitUpToOneSecondForOptimeChange(const OpTime& referenceTime); + +/** + * Initializes the global OpTime with the value from the timestamp of the last oplog entry. + */ +void initOpTimeFromOplog(OperationContext* txn, const std::string& oplogNS); + +/** + * Sets the global OpTime to be 'newTime'. 
+ */ +void setNewOptime(const OpTime& newTime); +} // namespace repl +} // namespace mongo diff --git a/src/mongo/db/repl/oplogreader.cpp b/src/mongo/db/repl/oplogreader.cpp index 0370d2ae098..ef10b40b2fe 100644 --- a/src/mongo/db/repl/oplogreader.cpp +++ b/src/mongo/db/repl/oplogreader.cpp @@ -52,171 +52,159 @@ namespace mongo { - using boost::shared_ptr; - using std::endl; - using std::string; +using boost::shared_ptr; +using std::endl; +using std::string; namespace repl { - //number of readers created; - // this happens when the source source changes, a reconfig/network-error or the cursor dies - static Counter64 readersCreatedStats; - static ServerStatusMetricField<Counter64> displayReadersCreated( - "repl.network.readersCreated", - &readersCreatedStats ); +// number of readers created; +// this happens when the source source changes, a reconfig/network-error or the cursor dies +static Counter64 readersCreatedStats; +static ServerStatusMetricField<Counter64> displayReadersCreated("repl.network.readersCreated", + &readersCreatedStats); - static const BSONObj userReplQuery = fromjson("{\"user\":\"repl\"}"); +static const BSONObj userReplQuery = fromjson("{\"user\":\"repl\"}"); - bool replAuthenticate(DBClientBase *conn) { - if (!getGlobalAuthorizationManager()->isAuthEnabled()) - return true; +bool replAuthenticate(DBClientBase* conn) { + if (!getGlobalAuthorizationManager()->isAuthEnabled()) + return true; - if (!isInternalAuthSet()) - return false; - return authenticateInternalUser(conn); - } + if (!isInternalAuthSet()) + return false; + return authenticateInternalUser(conn); +} - OplogReader::OplogReader() { - _tailingQueryOptions = QueryOption_SlaveOk; - _tailingQueryOptions |= QueryOption_CursorTailable | QueryOption_OplogReplay; - - /* TODO: slaveOk maybe shouldn't use? */ - _tailingQueryOptions |= QueryOption_AwaitData; +OplogReader::OplogReader() { + _tailingQueryOptions = QueryOption_SlaveOk; + _tailingQueryOptions |= QueryOption_CursorTailable | QueryOption_OplogReplay; - readersCreatedStats.increment(); - } + /* TODO: slaveOk maybe shouldn't use? 
*/ + _tailingQueryOptions |= QueryOption_AwaitData; - bool OplogReader::connect(const HostAndPort& host) { - if (conn() == NULL || _host != host) { - resetConnection(); - _conn = shared_ptr<DBClientConnection>(new DBClientConnection(false, - tcp_timeout)); - string errmsg; - if ( !_conn->connect(host, errmsg) || - (getGlobalAuthorizationManager()->isAuthEnabled() && - !replAuthenticate(_conn.get())) ) { - - resetConnection(); - log() << "repl: " << errmsg << endl; - return false; - } - _host = host; - } - return true; - } + readersCreatedStats.increment(); +} - void OplogReader::tailCheck() { - if( cursor.get() && cursor->isDead() ) { - log() << "repl: old cursor isDead, will initiate a new one" << std::endl; - resetCursor(); +bool OplogReader::connect(const HostAndPort& host) { + if (conn() == NULL || _host != host) { + resetConnection(); + _conn = shared_ptr<DBClientConnection>(new DBClientConnection(false, tcp_timeout)); + string errmsg; + if (!_conn->connect(host, errmsg) || + (getGlobalAuthorizationManager()->isAuthEnabled() && !replAuthenticate(_conn.get()))) { + resetConnection(); + log() << "repl: " << errmsg << endl; + return false; } + _host = host; } + return true; +} - void OplogReader::query(const char *ns, - Query query, - int nToReturn, - int nToSkip, - const BSONObj* fields) { - cursor.reset( - _conn->query(ns, query, nToReturn, nToSkip, fields, QueryOption_SlaveOk).release() - ); - } - - void OplogReader::tailingQuery(const char *ns, const BSONObj& query, const BSONObj* fields ) { - verify( !haveCursor() ); - LOG(2) << "repl: " << ns << ".find(" << query.toString() << ')' << endl; - cursor.reset( _conn->query( ns, query, 0, 0, fields, _tailingQueryOptions ).release() ); - } - - void OplogReader::tailingQueryGTE(const char *ns, OpTime optime, const BSONObj* fields ) { - BSONObjBuilder gte; - gte.appendTimestamp("$gte", optime.asDate()); - BSONObjBuilder query; - query.append("ts", gte.done()); - tailingQuery(ns, query.done(), fields); +void OplogReader::tailCheck() { + if (cursor.get() && cursor->isDead()) { + log() << "repl: old cursor isDead, will initiate a new one" << std::endl; + resetCursor(); } - - HostAndPort OplogReader::getHost() const { - return _host; - } - - void OplogReader::connectToSyncSource(OperationContext* txn, - OpTime lastOpTimeFetched, - ReplicationCoordinator* replCoord) { - const OpTime sentinel(Milliseconds(curTimeMillis64()).total_seconds(), 0); - OpTime oldestOpTimeSeen = sentinel; - - invariant(conn() == NULL); - - while (true) { - HostAndPort candidate = replCoord->chooseNewSyncSource(lastOpTimeFetched); - - if (candidate.empty()) { - if (oldestOpTimeSeen == sentinel) { - // If, in this invocation of connectToSyncSource(), we did not successfully - // connect to any node ahead of us, - // we apparently have no sync sources to connect to. - // This situation is common; e.g. if there are no writes to the primary at - // the moment. - return; - } - - // Connected to at least one member, but in all cases we were too stale to use them - // as a sync source. 
- log() << "replSet error RS102 too stale to catch up"; - log() << "replSet our last optime : " << lastOpTimeFetched.toStringLong(); - log() << "replSet oldest available is " << oldestOpTimeSeen.toStringLong(); - log() << "replSet " - "See http://dochub.mongodb.org/core/resyncingaverystalereplicasetmember"; - setMinValid(txn, oldestOpTimeSeen); - bool worked = replCoord->setFollowerMode(MemberState::RS_RECOVERING); - if (!worked) { - warning() << "Failed to transition into " - << MemberState(MemberState::RS_RECOVERING) - << ". Current state: " << replCoord->getMemberState(); - } +} + +void OplogReader::query( + const char* ns, Query query, int nToReturn, int nToSkip, const BSONObj* fields) { + cursor.reset( + _conn->query(ns, query, nToReturn, nToSkip, fields, QueryOption_SlaveOk).release()); +} + +void OplogReader::tailingQuery(const char* ns, const BSONObj& query, const BSONObj* fields) { + verify(!haveCursor()); + LOG(2) << "repl: " << ns << ".find(" << query.toString() << ')' << endl; + cursor.reset(_conn->query(ns, query, 0, 0, fields, _tailingQueryOptions).release()); +} + +void OplogReader::tailingQueryGTE(const char* ns, OpTime optime, const BSONObj* fields) { + BSONObjBuilder gte; + gte.appendTimestamp("$gte", optime.asDate()); + BSONObjBuilder query; + query.append("ts", gte.done()); + tailingQuery(ns, query.done(), fields); +} + +HostAndPort OplogReader::getHost() const { + return _host; +} + +void OplogReader::connectToSyncSource(OperationContext* txn, + OpTime lastOpTimeFetched, + ReplicationCoordinator* replCoord) { + const OpTime sentinel(Milliseconds(curTimeMillis64()).total_seconds(), 0); + OpTime oldestOpTimeSeen = sentinel; + + invariant(conn() == NULL); + + while (true) { + HostAndPort candidate = replCoord->chooseNewSyncSource(lastOpTimeFetched); + + if (candidate.empty()) { + if (oldestOpTimeSeen == sentinel) { + // If, in this invocation of connectToSyncSource(), we did not successfully + // connect to any node ahead of us, + // we apparently have no sync sources to connect to. + // This situation is common; e.g. if there are no writes to the primary at + // the moment. return; } - if (!connect(candidate)) { - LOG(2) << "replSet can't connect to " << candidate.toString() << - " to read operations"; - resetConnection(); - replCoord->blacklistSyncSource(candidate, Date_t(curTimeMillis64() + 10*1000)); - continue; + // Connected to at least one member, but in all cases we were too stale to use them + // as a sync source. + log() << "replSet error RS102 too stale to catch up"; + log() << "replSet our last optime : " << lastOpTimeFetched.toStringLong(); + log() << "replSet oldest available is " << oldestOpTimeSeen.toStringLong(); + log() << "replSet " + "See http://dochub.mongodb.org/core/resyncingaverystalereplicasetmember"; + setMinValid(txn, oldestOpTimeSeen); + bool worked = replCoord->setFollowerMode(MemberState::RS_RECOVERING); + if (!worked) { + warning() << "Failed to transition into " << MemberState(MemberState::RS_RECOVERING) + << ". Current state: " << replCoord->getMemberState(); } - // Read the first (oldest) op and confirm that it's not newer than our last - // fetched op. Otherwise, we have fallen off the back of that source's oplog. - BSONObj remoteOldestOp(findOne(rsoplog, Query())); - BSONElement tsElem(remoteOldestOp["ts"]); - if (tsElem.type() != Timestamp) { - // This member's got a bad op in its oplog. 
- warning() << "oplog invalid format on node " << candidate.toString(); - resetConnection(); - replCoord->blacklistSyncSource(candidate, - Date_t(curTimeMillis64() + 600*1000)); - continue; - } - OpTime remoteOldOpTime = tsElem._opTime(); - - if (!lastOpTimeFetched.isNull() && lastOpTimeFetched < remoteOldOpTime) { - // We're too stale to use this sync source. - resetConnection(); - replCoord->blacklistSyncSource(candidate, - Date_t(curTimeMillis64() + 600*1000)); - if (oldestOpTimeSeen > remoteOldOpTime) { - warning() << "we are too stale to use " << candidate.toString() << - " as a sync source"; - oldestOpTimeSeen = remoteOldOpTime; - } - continue; + return; + } + + if (!connect(candidate)) { + LOG(2) << "replSet can't connect to " << candidate.toString() << " to read operations"; + resetConnection(); + replCoord->blacklistSyncSource(candidate, Date_t(curTimeMillis64() + 10 * 1000)); + continue; + } + // Read the first (oldest) op and confirm that it's not newer than our last + // fetched op. Otherwise, we have fallen off the back of that source's oplog. + BSONObj remoteOldestOp(findOne(rsoplog, Query())); + BSONElement tsElem(remoteOldestOp["ts"]); + if (tsElem.type() != Timestamp) { + // This member's got a bad op in its oplog. + warning() << "oplog invalid format on node " << candidate.toString(); + resetConnection(); + replCoord->blacklistSyncSource(candidate, Date_t(curTimeMillis64() + 600 * 1000)); + continue; + } + OpTime remoteOldOpTime = tsElem._opTime(); + + if (!lastOpTimeFetched.isNull() && lastOpTimeFetched < remoteOldOpTime) { + // We're too stale to use this sync source. + resetConnection(); + replCoord->blacklistSyncSource(candidate, Date_t(curTimeMillis64() + 600 * 1000)); + if (oldestOpTimeSeen > remoteOldOpTime) { + warning() << "we are too stale to use " << candidate.toString() + << " as a sync source"; + oldestOpTimeSeen = remoteOldOpTime; } + continue; + } - // Got a valid sync source. - return; - } // while (true) - } + // Got a valid sync source. + return; + } // while (true) +} -} // namespace repl -} // namespace mongo +} // namespace repl +} // namespace mongo diff --git a/src/mongo/db/repl/oplogreader.h b/src/mongo/db/repl/oplogreader.h index 322559dba87..435ce408012 100644 --- a/src/mongo/db/repl/oplogreader.h +++ b/src/mongo/db/repl/oplogreader.h @@ -39,118 +39,131 @@ namespace mongo { - extern const BSONObj reverseNaturalObj; // { $natural : -1 } +extern const BSONObj reverseNaturalObj; // { $natural : -1 } namespace repl { - class ReplicationCoordinator; +class ReplicationCoordinator; + +/** + * Authenticates conn using the server's cluster-membership credentials. + * + * Returns true on successful authentication. + */ +bool replAuthenticate(DBClientBase* conn); + +/* started abstracting out the querying of the primary/master's oplog + still fairly awkward but a start. +*/ + +class OplogReader { +private: + boost::shared_ptr<DBClientConnection> _conn; + boost::shared_ptr<DBClientCursor> cursor; + int _tailingQueryOptions; + + // If _conn was actively connected, _host represents the current HostAndPort of the + // connection. 
+ HostAndPort _host; + +public: + OplogReader(); + ~OplogReader() {} + void resetCursor() { + cursor.reset(); + } + void resetConnection() { + cursor.reset(); + _conn.reset(); + _host = HostAndPort(); + } + DBClientConnection* conn() { + return _conn.get(); + } + BSONObj findOne(const char* ns, const Query& q) { + return conn()->findOne(ns, q, 0, QueryOption_SlaveOk); + } + BSONObj getLastOp(const char* ns) { + return findOne(ns, Query().sort(reverseNaturalObj)); + } + + /* SO_TIMEOUT (send/recv time out) for our DBClientConnections */ + static const int tcp_timeout = 30; + + /* ok to call if already connected */ + bool connect(const HostAndPort& host); + + void tailCheck(); + + bool haveCursor() { + return cursor.get() != 0; + } + + void query(const char* ns, Query query, int nToReturn, int nToSkip, const BSONObj* fields = 0); + + void tailingQuery(const char* ns, const BSONObj& query, const BSONObj* fields = 0); + + void tailingQueryGTE(const char* ns, OpTime t, const BSONObj* fields = 0); + + /* Do a tailing query, but only send the ts field back. */ + void ghostQueryGTE(const char* ns, OpTime t) { + const BSONObj fields = BSON("ts" << 1 << "_id" << 0); + return tailingQueryGTE(ns, t, &fields); + } + + bool more() { + uassert(15910, "Doesn't have cursor for reading oplog", cursor.get()); + return cursor->more(); + } + + bool moreInCurrentBatch() { + uassert(15911, "Doesn't have cursor for reading oplog", cursor.get()); + return cursor->moreInCurrentBatch(); + } + + int currentBatchMessageSize() { + if (NULL == cursor->getMessage()) + return 0; + return cursor->getMessage()->size(); + } + + int getTailingQueryOptions() const { + return _tailingQueryOptions; + } + void setTailingQueryOptions(int tailingQueryOptions) { + _tailingQueryOptions = tailingQueryOptions; + } + + void peek(std::vector<BSONObj>& v, int n) { + if (cursor.get()) + cursor->peek(v, n); + } + BSONObj nextSafe() { + return cursor->nextSafe(); + } + BSONObj next() { + return cursor->next(); + } + void putBack(BSONObj op) { + cursor->putBack(op); + } + + HostAndPort getHost() const; /** - * Authenticates conn using the server's cluster-membership credentials. - * - * Returns true on successful authentication. + * Connects this OplogReader to a valid sync source, using the provided lastOpTimeFetched + * and ReplicationCoordinator objects. + * If this function fails to connect to a sync source that is viable, this OplogReader + * is left unconnected, where this->conn() equals NULL. + * In the process of connecting, this function may add items to the repl coordinator's + * sync source blacklist. + * This function may throw DB exceptions. + * If "lastOpTimeFetched" is (0, 0), we do not check staleness as this indicates an initial + * sync. */ - bool replAuthenticate(DBClientBase* conn); - - /* started abstracting out the querying of the primary/master's oplog - still fairly awkward but a start. - */ - - class OplogReader { - private: - boost::shared_ptr<DBClientConnection> _conn; - boost::shared_ptr<DBClientCursor> cursor; - int _tailingQueryOptions; - - // If _conn was actively connected, _host represents the current HostAndPort of the - // connection. 
- HostAndPort _host; - public: - OplogReader(); - ~OplogReader() { } - void resetCursor() { cursor.reset(); } - void resetConnection() { - cursor.reset(); - _conn.reset(); - _host = HostAndPort(); - } - DBClientConnection* conn() { return _conn.get(); } - BSONObj findOne(const char *ns, const Query& q) { - return conn()->findOne(ns, q, 0, QueryOption_SlaveOk); - } - BSONObj getLastOp(const char *ns) { - return findOne(ns, Query().sort(reverseNaturalObj)); - } - - /* SO_TIMEOUT (send/recv time out) for our DBClientConnections */ - static const int tcp_timeout = 30; - - /* ok to call if already connected */ - bool connect(const HostAndPort& host); - - void tailCheck(); - - bool haveCursor() { return cursor.get() != 0; } - - void query(const char *ns, - Query query, - int nToReturn, - int nToSkip, - const BSONObj* fields=0); - - void tailingQuery(const char *ns, const BSONObj& query, const BSONObj* fields=0); - - void tailingQueryGTE(const char *ns, OpTime t, const BSONObj* fields=0); - - /* Do a tailing query, but only send the ts field back. */ - void ghostQueryGTE(const char *ns, OpTime t) { - const BSONObj fields = BSON("ts" << 1 << "_id" << 0); - return tailingQueryGTE(ns, t, &fields); - } - - bool more() { - uassert( 15910, "Doesn't have cursor for reading oplog", cursor.get() ); - return cursor->more(); - } - - bool moreInCurrentBatch() { - uassert( 15911, "Doesn't have cursor for reading oplog", cursor.get() ); - return cursor->moreInCurrentBatch(); - } - - int currentBatchMessageSize() { - if( NULL == cursor->getMessage() ) - return 0; - return cursor->getMessage()->size(); - } - - int getTailingQueryOptions() const { return _tailingQueryOptions; } - void setTailingQueryOptions( int tailingQueryOptions ) { _tailingQueryOptions = tailingQueryOptions; } - - void peek(std::vector<BSONObj>& v, int n) { - if( cursor.get() ) - cursor->peek(v,n); - } - BSONObj nextSafe() { return cursor->nextSafe(); } - BSONObj next() { return cursor->next(); } - void putBack(BSONObj op) { cursor->putBack(op); } - - HostAndPort getHost() const; - - /** - * Connects this OplogReader to a valid sync source, using the provided lastOpTimeFetched - * and ReplicationCoordinator objects. - * If this function fails to connect to a sync source that is viable, this OplogReader - * is left unconnected, where this->conn() equals NULL. - * In the process of connecting, this function may add items to the repl coordinator's - * sync source blacklist. - * This function may throw DB exceptions. - * If "lastOpTimeFetched" is (0, 0), we do not check staleness as this indicates an initial - * sync. 
- */ - void connectToSyncSource(OperationContext* txn, - OpTime lastOpTimeFetched, - ReplicationCoordinator* replCoord); - }; - -} // namespace repl -} // namespace mongo + void connectToSyncSource(OperationContext* txn, + OpTime lastOpTimeFetched, + ReplicationCoordinator* replCoord); +}; + +} // namespace repl +} // namespace mongo diff --git a/src/mongo/db/repl/repl_set_heartbeat_args.cpp b/src/mongo/db/repl/repl_set_heartbeat_args.cpp index 75eee68348f..babca5a0dfa 100644 --- a/src/mongo/db/repl/repl_set_heartbeat_args.cpp +++ b/src/mongo/db/repl/repl_set_heartbeat_args.cpp @@ -39,142 +39,133 @@ namespace repl { namespace { - const std::string kCheckEmptyFieldName = "checkEmpty"; - const std::string kProtocolVersionFieldName = "pv"; - const std::string kConfigVersionFieldName = "v"; - const std::string kSenderIdFieldName = "fromId"; - const std::string kSetNameFieldName = "replSetHeartbeat"; - const std::string kSenderHostFieldName = "from"; - - const std::string kLegalHeartbeatFieldNames[] = { - kCheckEmptyFieldName, - kProtocolVersionFieldName, - kConfigVersionFieldName, - kSenderIdFieldName, - kSetNameFieldName, - kSenderHostFieldName - }; - -} // namespace - - ReplSetHeartbeatArgs::ReplSetHeartbeatArgs() : - _hasCheckEmpty(false), - _hasProtocolVersion(false), - _hasConfigVersion(false), - _hasSenderId(false), - _hasSetName(false), - _hasSenderHost(false), - _checkEmpty(false), - _protocolVersion(-1), - _configVersion(-1), - _senderId(-1), - _setName(""), - _senderHost(HostAndPort()) {} - - Status ReplSetHeartbeatArgs::initialize(const BSONObj& argsObj) { - Status status = bsonCheckOnlyHasFields("ReplSetHeartbeatArgs", - argsObj, - kLegalHeartbeatFieldNames); +const std::string kCheckEmptyFieldName = "checkEmpty"; +const std::string kProtocolVersionFieldName = "pv"; +const std::string kConfigVersionFieldName = "v"; +const std::string kSenderIdFieldName = "fromId"; +const std::string kSetNameFieldName = "replSetHeartbeat"; +const std::string kSenderHostFieldName = "from"; + +const std::string kLegalHeartbeatFieldNames[] = {kCheckEmptyFieldName, + kProtocolVersionFieldName, + kConfigVersionFieldName, + kSenderIdFieldName, + kSetNameFieldName, + kSenderHostFieldName}; + +} // namespace + +ReplSetHeartbeatArgs::ReplSetHeartbeatArgs() + : _hasCheckEmpty(false), + _hasProtocolVersion(false), + _hasConfigVersion(false), + _hasSenderId(false), + _hasSetName(false), + _hasSenderHost(false), + _checkEmpty(false), + _protocolVersion(-1), + _configVersion(-1), + _senderId(-1), + _setName(""), + _senderHost(HostAndPort()) {} + +Status ReplSetHeartbeatArgs::initialize(const BSONObj& argsObj) { + Status status = + bsonCheckOnlyHasFields("ReplSetHeartbeatArgs", argsObj, kLegalHeartbeatFieldNames); + if (!status.isOK()) + return status; + + status = bsonExtractBooleanFieldWithDefault(argsObj, kCheckEmptyFieldName, false, &_checkEmpty); + if (!status.isOK()) + return status; + _hasCheckEmpty = true; + + status = bsonExtractIntegerField(argsObj, kProtocolVersionFieldName, &_protocolVersion); + if (!status.isOK()) + return status; + _hasProtocolVersion = true; + + status = bsonExtractIntegerField(argsObj, kConfigVersionFieldName, &_configVersion); + if (!status.isOK()) + return status; + _hasConfigVersion = true; + + status = bsonExtractIntegerFieldWithDefault(argsObj, kSenderIdFieldName, -1, &_senderId); + if (!status.isOK()) + return status; + _hasSenderId = true; + + status = bsonExtractStringField(argsObj, kSetNameFieldName, &_setName); + if (!status.isOK()) + return status; + _hasSetName = true; 
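+    // [Editor's note, not part of this commit: every field above follows the
+    // same extract / check-status / record-presence pattern. A hypothetical
+    // new integer field "someField" with members _someField and _hasSomeField
+    // would be wired in identically:
+    //
+    //     status = bsonExtractIntegerFieldWithDefault(argsObj, "someField",
+    //                                                 -1, &_someField);
+    //     if (!status.isOK())
+    //         return status;
+    //     _hasSomeField = true;
+    // ]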
+ + std::string hostAndPortString; + status = + bsonExtractStringFieldWithDefault(argsObj, kSenderHostFieldName, "", &hostAndPortString); + if (!status.isOK()) + return status; + + if (!hostAndPortString.empty()) { + status = _senderHost.initialize(hostAndPortString); if (!status.isOK()) return status; - - status = bsonExtractBooleanFieldWithDefault(argsObj, - kCheckEmptyFieldName, - false, - &_checkEmpty); - if (!status.isOK()) - return status; - _hasCheckEmpty = true; - - status = bsonExtractIntegerField(argsObj, kProtocolVersionFieldName, &_protocolVersion); - if (!status.isOK()) - return status; - _hasProtocolVersion = true; - - status = bsonExtractIntegerField(argsObj, kConfigVersionFieldName, &_configVersion); - if (!status.isOK()) - return status; - _hasConfigVersion = true; - - status = bsonExtractIntegerFieldWithDefault(argsObj, kSenderIdFieldName, -1, &_senderId); - if (!status.isOK()) - return status; - _hasSenderId = true; - - status = bsonExtractStringField(argsObj, kSetNameFieldName, &_setName); - if (!status.isOK()) - return status; - _hasSetName = true; - - std::string hostAndPortString; - status = bsonExtractStringFieldWithDefault( - argsObj, - kSenderHostFieldName, - "", - &hostAndPortString); - if (!status.isOK()) - return status; - - if (!hostAndPortString.empty()) { - status = _senderHost.initialize(hostAndPortString); - if (!status.isOK()) - return status; - _hasSenderHost = true; - } - - return Status::OK(); - } - - bool ReplSetHeartbeatArgs::isInitialized() const { - return _hasProtocolVersion && _hasConfigVersion && _hasSetName; - } - - BSONObj ReplSetHeartbeatArgs::toBSON() const { - invariant(isInitialized()); - BSONObjBuilder builder; - builder.append("replSetHeartbeat", _setName); - builder.appendIntOrLL("pv", _protocolVersion); - builder.appendIntOrLL("v", _configVersion); - builder.append("from", _hasSenderHost ? _senderHost.toString() : ""); - - if (_hasSenderId) { - builder.appendIntOrLL("fromId", _senderId); - } - if (_hasCheckEmpty) { - builder.append("checkEmpty", _checkEmpty); - } - return builder.obj(); - } - - void ReplSetHeartbeatArgs::setCheckEmpty(bool newVal) { - _checkEmpty = newVal; - _hasCheckEmpty = true; + _hasSenderHost = true; } - void ReplSetHeartbeatArgs::setProtocolVersion(long long newVal) { - _protocolVersion = newVal; - _hasProtocolVersion = true; - } + return Status::OK(); +} - void ReplSetHeartbeatArgs::setConfigVersion(long long newVal) { - _configVersion = newVal; - _hasConfigVersion = true; - } +bool ReplSetHeartbeatArgs::isInitialized() const { + return _hasProtocolVersion && _hasConfigVersion && _hasSetName; +} - void ReplSetHeartbeatArgs::setSenderId(long long newVal) { - _senderId = newVal; - _hasSenderId = true; - } +BSONObj ReplSetHeartbeatArgs::toBSON() const { + invariant(isInitialized()); + BSONObjBuilder builder; + builder.append("replSetHeartbeat", _setName); + builder.appendIntOrLL("pv", _protocolVersion); + builder.appendIntOrLL("v", _configVersion); + builder.append("from", _hasSenderHost ? 
_senderHost.toString() : ""); - void ReplSetHeartbeatArgs::setSetName(std::string newVal) { - _setName = newVal; - _hasSetName = true; + if (_hasSenderId) { + builder.appendIntOrLL("fromId", _senderId); } - - void ReplSetHeartbeatArgs::setSenderHost(HostAndPort newVal) { - _senderHost = newVal; - _hasSenderHost = true; + if (_hasCheckEmpty) { + builder.append("checkEmpty", _checkEmpty); } + return builder.obj(); +} + +void ReplSetHeartbeatArgs::setCheckEmpty(bool newVal) { + _checkEmpty = newVal; + _hasCheckEmpty = true; +} + +void ReplSetHeartbeatArgs::setProtocolVersion(long long newVal) { + _protocolVersion = newVal; + _hasProtocolVersion = true; +} + +void ReplSetHeartbeatArgs::setConfigVersion(long long newVal) { + _configVersion = newVal; + _hasConfigVersion = true; +} + +void ReplSetHeartbeatArgs::setSenderId(long long newVal) { + _senderId = newVal; + _hasSenderId = true; +} + +void ReplSetHeartbeatArgs::setSetName(std::string newVal) { + _setName = newVal; + _hasSetName = true; +} + +void ReplSetHeartbeatArgs::setSenderHost(HostAndPort newVal) { + _senderHost = newVal; + _hasSenderHost = true; +} } // namespace repl } // namespace mongo diff --git a/src/mongo/db/repl/repl_set_heartbeat_args.h b/src/mongo/db/repl/repl_set_heartbeat_args.h index 487be758524..f03e3260a04 100644 --- a/src/mongo/db/repl/repl_set_heartbeat_args.h +++ b/src/mongo/db/repl/repl_set_heartbeat_args.h @@ -34,101 +34,125 @@ namespace mongo { - class BSONObj; - class Status; +class BSONObj; +class Status; namespace repl { +/** + * Arguments to the replSetHeartbeat command. + */ +class ReplSetHeartbeatArgs { +public: + ReplSetHeartbeatArgs(); + + /** + * Initializes this ReplSetHeartbeatArgs from the contents of args. + */ + Status initialize(const BSONObj& argsObj); + + /** + * Returns true if all required fields have been initialized. + */ + bool isInitialized() const; + + /** + * Returns whether the sender would like to know whether the node is empty or not. + */ + bool getCheckEmpty() const { + return _checkEmpty; + } + + /** + * Gets the version of the Heartbeat protocol being used by the sender. + */ + long long getProtocolVersion() const { + return _protocolVersion; + } + + /** + * Gets the ReplSetConfig version number of the sender. + */ + long long getConfigVersion() const { + return _configVersion; + } + + /** + * Gets the _id of the sender in their ReplSetConfig. + */ + long long getSenderId() const { + return _senderId; + } + + /** + * Gets the replSet name of the sender's replica set. + */ + std::string getSetName() const { + return _setName; + } + + /** + * Gets the HostAndPort of the sender. + */ + HostAndPort getSenderHost() const { + return _senderHost; + } + + /** + * The below methods check whether or not value in the method name has been set. + */ + bool hasCheckEmpty() { + return _hasCheckEmpty; + } + bool hasProtocolVersion() { + return _hasProtocolVersion; + } + bool hasConfigVersion() { + return _hasConfigVersion; + } + bool hasSenderId() { + return _hasSenderId; + } + bool hasSetName() { + return _hasSetName; + } + bool hasSenderHost() { + return _hasSenderHost; + } + + /** + * The below methods set the value in the method name to 'newVal'. + */ + void setCheckEmpty(bool newVal); + void setProtocolVersion(long long newVal); + void setConfigVersion(long long newVal); + void setSenderId(long long newVal); + void setSetName(std::string newVal); + void setSenderHost(HostAndPort newVal); + /** - * Arguments to the replSetHeartbeat command. + * Returns a BSONified version of the object. 
+ * Should only be called if the mandatory fields have been set. + * Optional fields are only included if they have been set. */ - class ReplSetHeartbeatArgs { - public: - ReplSetHeartbeatArgs(); - - /** - * Initializes this ReplSetHeartbeatArgs from the contents of args. - */ - Status initialize(const BSONObj& argsObj); - - /** - * Returns true if all required fields have been initialized. - */ - bool isInitialized() const; - - /** - * Returns whether the sender would like to know whether the node is empty or not. - */ - bool getCheckEmpty() const { return _checkEmpty; } - - /** - * Gets the version of the Heartbeat protocol being used by the sender. - */ - long long getProtocolVersion() const { return _protocolVersion; } - - /** - * Gets the ReplSetConfig version number of the sender. - */ - long long getConfigVersion() const { return _configVersion; } - - /** - * Gets the _id of the sender in their ReplSetConfig. - */ - long long getSenderId() const { return _senderId; } - - /** - * Gets the replSet name of the sender's replica set. - */ - std::string getSetName() const { return _setName; } - - /** - * Gets the HostAndPort of the sender. - */ - HostAndPort getSenderHost() const { return _senderHost; } - - /** - * The below methods check whether or not value in the method name has been set. - */ - bool hasCheckEmpty() { return _hasCheckEmpty; } - bool hasProtocolVersion() { return _hasProtocolVersion; } - bool hasConfigVersion() { return _hasConfigVersion; } - bool hasSenderId() { return _hasSenderId; } - bool hasSetName() { return _hasSetName; } - bool hasSenderHost() { return _hasSenderHost; } - - /** - * The below methods set the value in the method name to 'newVal'. - */ - void setCheckEmpty(bool newVal); - void setProtocolVersion(long long newVal); - void setConfigVersion(long long newVal); - void setSenderId(long long newVal); - void setSetName(std::string newVal); - void setSenderHost(HostAndPort newVal); - - /** - * Returns a BSONified version of the object. - * Should only be called if the mandatory fields have been set. - * Optional fields are only included if they have been set. 
- */ - BSONObj toBSON() const; - - private: - bool _hasCheckEmpty; - bool _hasProtocolVersion; - bool _hasConfigVersion; - bool _hasSenderId; - bool _hasSetName; - bool _hasSenderHost; - - // look at the body of the isInitialized() function to see which fields are mandatory - bool _checkEmpty; - long long _protocolVersion; - long long _configVersion; - long long _senderId; - std::string _setName; - HostAndPort _senderHost; - }; - -} // namespace repl -} // namespace mongo + BSONObj toBSON() const; + +private: + bool _hasCheckEmpty; + bool _hasProtocolVersion; + bool _hasConfigVersion; + bool _hasSenderId; + bool _hasSetName; + bool _hasSenderHost; + + // look at the body of the isInitialized() function to see which fields are mandatory + bool _checkEmpty; + long long _protocolVersion; + long long _configVersion; + long long _senderId; + std::string _setName; + HostAndPort _senderHost; +}; + +} // namespace repl +} // namespace mongo diff --git a/src/mongo/db/repl/repl_set_heartbeat_response.cpp b/src/mongo/db/repl/repl_set_heartbeat_response.cpp index 4e968ef0175..5d27f8b222d 100644 --- a/src/mongo/db/repl/repl_set_heartbeat_response.cpp +++ b/src/mongo/db/repl/repl_set_heartbeat_response.cpp @@ -44,315 +44,308 @@ namespace mongo { namespace repl { namespace { - const std::string kOkFieldName = "ok"; - const std::string kErrMsgFieldName = "errmsg"; - const std::string kErrorCodeFieldName = "code"; - const std::string kOpTimeFieldName = "opTime"; - const std::string kTimeFieldName = "time"; - const std::string kElectionTimeFieldName = "electionTime"; - const std::string kConfigFieldName = "config"; - const std::string kIsElectableFieldName = "e"; - const std::string kMismatchFieldName = "mismatch"; - const std::string kIsReplSetFieldName = "rs"; - const std::string kHasStateDisagreementFieldName = "stateDisagreement"; - const std::string kMemberStateFieldName = "state"; - const std::string kConfigVersionFieldName = "v"; - const std::string kHbMessageFieldName = "hbmsg"; - const std::string kReplSetFieldName = "set"; - const std::string kSyncSourceFieldName = "syncingTo"; - const std::string kHasDataFieldName = "hasData"; +const std::string kOkFieldName = "ok"; +const std::string kErrMsgFieldName = "errmsg"; +const std::string kErrorCodeFieldName = "code"; +const std::string kOpTimeFieldName = "opTime"; +const std::string kTimeFieldName = "time"; +const std::string kElectionTimeFieldName = "electionTime"; +const std::string kConfigFieldName = "config"; +const std::string kIsElectableFieldName = "e"; +const std::string kMismatchFieldName = "mismatch"; +const std::string kIsReplSetFieldName = "rs"; +const std::string kHasStateDisagreementFieldName = "stateDisagreement"; +const std::string kMemberStateFieldName = "state"; +const std::string kConfigVersionFieldName = "v"; +const std::string kHbMessageFieldName = "hbmsg"; +const std::string kReplSetFieldName = "set"; +const std::string kSyncSourceFieldName = "syncingTo"; +const std::string kHasDataFieldName = "hasData"; } // namespace - ReplSetHeartbeatResponse::ReplSetHeartbeatResponse() : - _electionTimeSet(false), - _timeSet(false), - _time(0), - _opTimeSet(false), - _electableSet(false), - _electable(false), - _hasDataSet(false), - _hasData(false), - _mismatch(false), - _isReplSet(false), - _stateDisagreement(false), - _stateSet(false), - _version(-1), - _configSet(false) - {} - - void ReplSetHeartbeatResponse::addToBSON(BSONObjBuilder* builder) const { - if (_mismatch) { - *builder << kOkFieldName << 0.0; - *builder << kMismatchFieldName << 
_mismatch; - return; - } - - builder->append(kOkFieldName, 1.0); - if (_opTimeSet) { - builder->appendDate(kOpTimeFieldName, _opTime.asDate()); - } - if (_timeSet) { - *builder << kTimeFieldName << _time.total_seconds(); - } - if (_electionTimeSet) { - builder->appendDate(kElectionTimeFieldName, _electionTime.asDate()); - } - if (_configSet) { - *builder << kConfigFieldName << _config.toBSON(); - } - if (_electableSet) { - *builder << kIsElectableFieldName << _electable; - } - if (_isReplSet) { - *builder << "rs" << _isReplSet; - } - if (_stateDisagreement) { - *builder << kHasStateDisagreementFieldName << _stateDisagreement; - } - if (_stateSet) { - builder->appendIntOrLL(kMemberStateFieldName, _state.s); - } - if (_version != -1) { - *builder << kConfigVersionFieldName << _version; - } - *builder << kHbMessageFieldName << _hbmsg; - if (!_setName.empty()) { - *builder << kReplSetFieldName << _setName; - } - if (!_syncingTo.empty()) { - *builder << kSyncSourceFieldName << _syncingTo; - } - if (_hasDataSet) { - builder->append(kHasDataFieldName, _hasData); - } +ReplSetHeartbeatResponse::ReplSetHeartbeatResponse() + : _electionTimeSet(false), + _timeSet(false), + _time(0), + _opTimeSet(false), + _electableSet(false), + _electable(false), + _hasDataSet(false), + _hasData(false), + _mismatch(false), + _isReplSet(false), + _stateDisagreement(false), + _stateSet(false), + _version(-1), + _configSet(false) {} + +void ReplSetHeartbeatResponse::addToBSON(BSONObjBuilder* builder) const { + if (_mismatch) { + *builder << kOkFieldName << 0.0; + *builder << kMismatchFieldName << _mismatch; + return; } - BSONObj ReplSetHeartbeatResponse::toBSON() const { - BSONObjBuilder builder; - addToBSON(&builder); - return builder.obj(); + builder->append(kOkFieldName, 1.0); + if (_opTimeSet) { + builder->appendDate(kOpTimeFieldName, _opTime.asDate()); + } + if (_timeSet) { + *builder << kTimeFieldName << _time.total_seconds(); + } + if (_electionTimeSet) { + builder->appendDate(kElectionTimeFieldName, _electionTime.asDate()); + } + if (_configSet) { + *builder << kConfigFieldName << _config.toBSON(); + } + if (_electableSet) { + *builder << kIsElectableFieldName << _electable; + } + if (_isReplSet) { + *builder << "rs" << _isReplSet; + } + if (_stateDisagreement) { + *builder << kHasStateDisagreementFieldName << _stateDisagreement; + } + if (_stateSet) { + builder->appendIntOrLL(kMemberStateFieldName, _state.s); + } + if (_version != -1) { + *builder << kConfigVersionFieldName << _version; + } + *builder << kHbMessageFieldName << _hbmsg; + if (!_setName.empty()) { + *builder << kReplSetFieldName << _setName; + } + if (!_syncingTo.empty()) { + *builder << kSyncSourceFieldName << _syncingTo; + } + if (_hasDataSet) { + builder->append(kHasDataFieldName, _hasData); + } +} + +BSONObj ReplSetHeartbeatResponse::toBSON() const { + BSONObjBuilder builder; + addToBSON(&builder); + return builder.obj(); +} + +Status ReplSetHeartbeatResponse::initialize(const BSONObj& doc) { + // Old versions set this even though they returned not "ok" + _mismatch = doc[kMismatchFieldName].trueValue(); + if (_mismatch) + return Status(ErrorCodes::InconsistentReplicaSetNames, "replica set name doesn't match."); + + // Old versions sometimes set the replica set name ("set") but ok:0 + const BSONElement replSetNameElement = doc[kReplSetFieldName]; + if (replSetNameElement.eoo()) { + _setName.clear(); + } else if (replSetNameElement.type() != String) { + return Status(ErrorCodes::TypeMismatch, + str::stream() << "Expected \"" << kReplSetFieldName 
+ << "\" field in response to replSetHeartbeat to have " + "type String, but found " + << typeName(replSetNameElement.type())); + } else { + _setName = replSetNameElement.String(); } - Status ReplSetHeartbeatResponse::initialize(const BSONObj& doc) { - - // Old versions set this even though they returned not "ok" - _mismatch = doc[kMismatchFieldName].trueValue(); - if (_mismatch) - return Status(ErrorCodes::InconsistentReplicaSetNames, - "replica set name doesn't match."); - - // Old versions sometimes set the replica set name ("set") but ok:0 - const BSONElement replSetNameElement = doc[kReplSetFieldName]; - if (replSetNameElement.eoo()) { - _setName.clear(); - } - else if (replSetNameElement.type() != String) { - return Status(ErrorCodes::TypeMismatch, str::stream() << "Expected \"" << - kReplSetFieldName << "\" field in response to replSetHeartbeat to have " - "type String, but found " << typeName(replSetNameElement.type())); - } - else { - _setName = replSetNameElement.String(); - } - - if (_setName.empty() && !doc[kOkFieldName].trueValue()) { - std::string errMsg = doc[kErrMsgFieldName].str(); - - BSONElement errCodeElem = doc[kErrorCodeFieldName]; - if (errCodeElem.ok()) { - if (!errCodeElem.isNumber()) - return Status(ErrorCodes::BadValue, "Error code is not a number!"); - - int errorCode = errCodeElem.numberInt(); - return Status(ErrorCodes::Error(errorCode), errMsg); - } - return Status(ErrorCodes::UnknownError, errMsg); - } - - const BSONElement hasDataElement = doc[kHasDataFieldName]; - _hasDataSet = !hasDataElement.eoo(); - _hasData = hasDataElement.trueValue(); - - const BSONElement electionTimeElement = doc[kElectionTimeFieldName]; - if (electionTimeElement.eoo()) { - _electionTimeSet = false; - } - else if (electionTimeElement.type() == Timestamp) { - _electionTimeSet = true; - _electionTime = electionTimeElement._opTime(); - } - else if (electionTimeElement.type() == Date) { - _electionTimeSet = true; - _electionTime = OpTime(electionTimeElement.date()); - } - else { - return Status(ErrorCodes::TypeMismatch, str::stream() << "Expected \"" << - kElectionTimeFieldName << "\" field in response to replSetHeartbeat " - "command to have type Date or Timestamp, but found type " << - typeName(electionTimeElement.type())); - } - - const BSONElement timeElement = doc[kTimeFieldName]; - if (timeElement.eoo()) { - _timeSet = false; - } - else if (timeElement.isNumber()) { - _timeSet = true; - _time = Seconds(timeElement.numberLong()); - } - else { - return Status(ErrorCodes::TypeMismatch, str::stream() << "Expected \"" << - kTimeFieldName << "\" field in response to replSetHeartbeat " - "command to have a numeric type, but found type " << - typeName(timeElement.type())); - } - - const BSONElement opTimeElement = doc[kOpTimeFieldName]; - if (opTimeElement.eoo()) { - _opTimeSet = false; - } - else if (opTimeElement.type() == Timestamp) { - _opTimeSet = true; - _opTime = opTimeElement._opTime(); - } - else if (opTimeElement.type() == Date) { - _opTimeSet = true; - _opTime = OpTime(opTimeElement.date()); - } - else { - return Status(ErrorCodes::TypeMismatch, str::stream() << "Expected \"" << - kOpTimeFieldName << "\" field in response to replSetHeartbeat " - "command to have type Date or Timestamp, but found type " << - typeName(opTimeElement.type())); - } - - const BSONElement electableElement = doc[kIsElectableFieldName]; - if (electableElement.eoo()) { - _electableSet = false; - } - else { - _electableSet = true; - _electable = electableElement.trueValue(); - } + if (_setName.empty() && 
!doc[kOkFieldName].trueValue()) { + std::string errMsg = doc[kErrMsgFieldName].str(); - _isReplSet = doc[kIsReplSetFieldName].trueValue(); + BSONElement errCodeElem = doc[kErrorCodeFieldName]; + if (errCodeElem.ok()) { + if (!errCodeElem.isNumber()) + return Status(ErrorCodes::BadValue, "Error code is not a number!"); - const BSONElement memberStateElement = doc[kMemberStateFieldName]; - if (memberStateElement.eoo()) { - _stateSet = false; + int errorCode = errCodeElem.numberInt(); + return Status(ErrorCodes::Error(errorCode), errMsg); } - else if (memberStateElement.type() != NumberInt && - memberStateElement.type() != NumberLong) { - return Status(ErrorCodes::TypeMismatch, str::stream() << "Expected \"" << - kMemberStateFieldName << "\" field in response to replSetHeartbeat " - "command to have type NumberInt or NumberLong, but found type " << - typeName(memberStateElement.type())); - } - else { - long long stateInt = memberStateElement.numberLong(); - if (stateInt < 0 || stateInt > MemberState::RS_MAX) { - return Status(ErrorCodes::BadValue, str::stream() << "Value for \"" << - kMemberStateFieldName << "\" in response to replSetHeartbeat is " - "out of range; legal values are non-negative and no more than " << - MemberState::RS_MAX); - } - _stateSet = true; - _state = MemberState(static_cast<int>(stateInt)); - } - - _stateDisagreement = doc[kHasStateDisagreementFieldName].trueValue(); + return Status(ErrorCodes::UnknownError, errMsg); + } + const BSONElement hasDataElement = doc[kHasDataFieldName]; + _hasDataSet = !hasDataElement.eoo(); + _hasData = hasDataElement.trueValue(); + + const BSONElement electionTimeElement = doc[kElectionTimeFieldName]; + if (electionTimeElement.eoo()) { + _electionTimeSet = false; + } else if (electionTimeElement.type() == Timestamp) { + _electionTimeSet = true; + _electionTime = electionTimeElement._opTime(); + } else if (electionTimeElement.type() == Date) { + _electionTimeSet = true; + _electionTime = OpTime(electionTimeElement.date()); + } else { + return Status(ErrorCodes::TypeMismatch, + str::stream() << "Expected \"" << kElectionTimeFieldName + << "\" field in response to replSetHeartbeat " + "command to have type Date or Timestamp, but found type " + << typeName(electionTimeElement.type())); + } - // Not required for the case of uninitialized members -- they have no config - const BSONElement versionElement = doc[kConfigVersionFieldName]; + const BSONElement timeElement = doc[kTimeFieldName]; + if (timeElement.eoo()) { + _timeSet = false; + } else if (timeElement.isNumber()) { + _timeSet = true; + _time = Seconds(timeElement.numberLong()); + } else { + return Status(ErrorCodes::TypeMismatch, + str::stream() << "Expected \"" << kTimeFieldName + << "\" field in response to replSetHeartbeat " + "command to have a numeric type, but found type " + << typeName(timeElement.type())); + } - // If we have an optime then we must have a version - if (_opTimeSet && versionElement.eoo()) { - return Status(ErrorCodes::NoSuchKey, str::stream() << - "Response to replSetHeartbeat missing required \"" << - kConfigVersionFieldName << "\" field even though initialized"); - } + const BSONElement opTimeElement = doc[kOpTimeFieldName]; + if (opTimeElement.eoo()) { + _opTimeSet = false; + } else if (opTimeElement.type() == Timestamp) { + _opTimeSet = true; + _opTime = opTimeElement._opTime(); + } else if (opTimeElement.type() == Date) { + _opTimeSet = true; + _opTime = OpTime(opTimeElement.date()); + } else { + return Status(ErrorCodes::TypeMismatch, + str::stream() << 
"Expected \"" << kOpTimeFieldName + << "\" field in response to replSetHeartbeat " + "command to have type Date or Timestamp, but found type " + << typeName(opTimeElement.type())); + } - // If there is a "v" (config version) then it must be an int. - if (!versionElement.eoo() && versionElement.type() != NumberInt) { - return Status(ErrorCodes::TypeMismatch, str::stream() << "Expected \"" << - kConfigVersionFieldName << - "\" field in response to replSetHeartbeat to have " - "type NumberInt, but found " << typeName(versionElement.type())); - } - _version = versionElement.numberInt(); + const BSONElement electableElement = doc[kIsElectableFieldName]; + if (electableElement.eoo()) { + _electableSet = false; + } else { + _electableSet = true; + _electable = electableElement.trueValue(); + } - const BSONElement hbMsgElement = doc[kHbMessageFieldName]; - if (hbMsgElement.eoo()) { - _hbmsg.clear(); - } - else if (hbMsgElement.type() != String) { - return Status(ErrorCodes::TypeMismatch, str::stream() << "Expected \"" << - kHbMessageFieldName << "\" field in response to replSetHeartbeat to have " - "type String, but found " << typeName(hbMsgElement.type())); - } - else { - _hbmsg = hbMsgElement.String(); - } + _isReplSet = doc[kIsReplSetFieldName].trueValue(); + + const BSONElement memberStateElement = doc[kMemberStateFieldName]; + if (memberStateElement.eoo()) { + _stateSet = false; + } else if (memberStateElement.type() != NumberInt && memberStateElement.type() != NumberLong) { + return Status(ErrorCodes::TypeMismatch, + str::stream() + << "Expected \"" << kMemberStateFieldName + << "\" field in response to replSetHeartbeat " + "command to have type NumberInt or NumberLong, but found type " + << typeName(memberStateElement.type())); + } else { + long long stateInt = memberStateElement.numberLong(); + if (stateInt < 0 || stateInt > MemberState::RS_MAX) { + return Status(ErrorCodes::BadValue, + str::stream() + << "Value for \"" << kMemberStateFieldName + << "\" in response to replSetHeartbeat is " + "out of range; legal values are non-negative and no more than " + << MemberState::RS_MAX); + } + _stateSet = true; + _state = MemberState(static_cast<int>(stateInt)); + } - const BSONElement syncingToElement = doc[kSyncSourceFieldName]; - if (syncingToElement.eoo()) { - _syncingTo.clear(); - } - else if (syncingToElement.type() != String) { - return Status(ErrorCodes::TypeMismatch, str::stream() << "Expected \"" << - kSyncSourceFieldName << "\" field in response to replSetHeartbeat to " - "have type String, but found " << typeName(syncingToElement.type())); - } - else { - _syncingTo = syncingToElement.String(); - } + _stateDisagreement = doc[kHasStateDisagreementFieldName].trueValue(); - const BSONElement rsConfigElement = doc[kConfigFieldName]; - if (rsConfigElement.eoo()) { - _configSet = false; - _config = ReplicaSetConfig(); - return Status::OK(); - } - else if (rsConfigElement.type() != Object) { - return Status(ErrorCodes::TypeMismatch, str::stream() << "Expected \"" << - kConfigFieldName << "\" in response to replSetHeartbeat to have type " - "Object, but found " << typeName(rsConfigElement.type())); - } - _configSet = true; - return _config.initialize(rsConfigElement.Obj()); - } - MemberState ReplSetHeartbeatResponse::getState() const { - invariant(_stateSet); - return _state; - } + // Not required for the case of uninitialized members -- they have no config + const BSONElement versionElement = doc[kConfigVersionFieldName]; - OpTime ReplSetHeartbeatResponse::getElectionTime() const { - 
invariant(_electionTimeSet); - return _electionTime; + // If we have an optime then we must have a version + if (_opTimeSet && versionElement.eoo()) { + return Status(ErrorCodes::NoSuchKey, + str::stream() << "Response to replSetHeartbeat missing required \"" + << kConfigVersionFieldName + << "\" field even though initialized"); } - bool ReplSetHeartbeatResponse::isElectable() const { - invariant(_electableSet); - return _electable; + // If there is a "v" (config version) then it must be an int. + if (!versionElement.eoo() && versionElement.type() != NumberInt) { + return Status(ErrorCodes::TypeMismatch, + str::stream() << "Expected \"" << kConfigVersionFieldName + << "\" field in response to replSetHeartbeat to have " + "type NumberInt, but found " + << typeName(versionElement.type())); } - - Seconds ReplSetHeartbeatResponse::getTime() const { - invariant(_timeSet); - return _time; + _version = versionElement.numberInt(); + + const BSONElement hbMsgElement = doc[kHbMessageFieldName]; + if (hbMsgElement.eoo()) { + _hbmsg.clear(); + } else if (hbMsgElement.type() != String) { + return Status(ErrorCodes::TypeMismatch, + str::stream() << "Expected \"" << kHbMessageFieldName + << "\" field in response to replSetHeartbeat to have " + "type String, but found " << typeName(hbMsgElement.type())); + } else { + _hbmsg = hbMsgElement.String(); } - OpTime ReplSetHeartbeatResponse::getOpTime() const { - invariant(_opTimeSet); - return _opTime; + const BSONElement syncingToElement = doc[kSyncSourceFieldName]; + if (syncingToElement.eoo()) { + _syncingTo.clear(); + } else if (syncingToElement.type() != String) { + return Status(ErrorCodes::TypeMismatch, + str::stream() << "Expected \"" << kSyncSourceFieldName + << "\" field in response to replSetHeartbeat to " + "have type String, but found " + << typeName(syncingToElement.type())); + } else { + _syncingTo = syncingToElement.String(); } - const ReplicaSetConfig& ReplSetHeartbeatResponse::getConfig() const { - invariant(_configSet); - return _config; + const BSONElement rsConfigElement = doc[kConfigFieldName]; + if (rsConfigElement.eoo()) { + _configSet = false; + _config = ReplicaSetConfig(); + return Status::OK(); + } else if (rsConfigElement.type() != Object) { + return Status(ErrorCodes::TypeMismatch, + str::stream() << "Expected \"" << kConfigFieldName + << "\" in response to replSetHeartbeat to have type " + "Object, but found " << typeName(rsConfigElement.type())); } - -} // namespace repl -} // namespace mongo + _configSet = true; + return _config.initialize(rsConfigElement.Obj()); +} + +MemberState ReplSetHeartbeatResponse::getState() const { + invariant(_stateSet); + return _state; +} + +OpTime ReplSetHeartbeatResponse::getElectionTime() const { + invariant(_electionTimeSet); + return _electionTime; +} + +bool ReplSetHeartbeatResponse::isElectable() const { + invariant(_electableSet); + return _electable; +} + +Seconds ReplSetHeartbeatResponse::getTime() const { + invariant(_timeSet); + return _time; +} + +OpTime ReplSetHeartbeatResponse::getOpTime() const { + invariant(_opTimeSet); + return _opTime; +} + +const ReplicaSetConfig& ReplSetHeartbeatResponse::getConfig() const { + invariant(_configSet); + return _config; +} + +} // namespace repl +} // namespace mongo diff --git a/src/mongo/db/repl/repl_set_heartbeat_response.h b/src/mongo/db/repl/repl_set_heartbeat_response.h index a5629fbc3bf..cb555cca825 100644 --- a/src/mongo/db/repl/repl_set_heartbeat_response.h +++ b/src/mongo/db/repl/repl_set_heartbeat_response.h @@ -36,165 +36,231 @@ 
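To make the serializer/parser pair above concrete before the header diff below, here is a minimal round-trip sketch. All names come from this file; the scenario and values are hypothetical, not part of the commit.

    // Sketch only: exercises the toBSON()/initialize() round trip shown above.
    ReplSetHeartbeatResponse original;
    original.setSetName("rs0");                                 // serialized under "set"
    original.setVersion(2);                                     // serialized under "v"
    original.setState(MemberState(MemberState::RS_SECONDARY));  // serialized under "state"
    BSONObj wire = original.toBSON();  // addToBSON() appends only the fields set above

    ReplSetHeartbeatResponse parsed;
    Status status = parsed.initialize(wire);  // re-checks each field's BSON type
    invariant(status.isOK());
    invariant(parsed.getReplicaSetName() == "rs0");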
namespace mongo { - class BSONObj; - class BSONObjBuilder; - class Status; +class BSONObj; +class BSONObjBuilder; +class Status; namespace repl { +/** + * Response structure for the replSetHeartbeat command. + */ +class ReplSetHeartbeatResponse { +public: + ReplSetHeartbeatResponse(); + + /** + * Initializes this ReplSetHeartbeatResponse from the contents of "doc". + */ + Status initialize(const BSONObj& doc); + + /** + * Appends all non-default values to "builder". + */ + void addToBSON(BSONObjBuilder* builder) const; + + /** + * Returns a BSONObj consisting of all non-default values to "builder". + */ + BSONObj toBSON() const; + + /** + * Returns toBSON().toString() + */ + const std::string toString() const { + return toBSON().toString(); + } + + bool hasDataSet() const { + return _hasDataSet; + } + bool hasData() const { + return _hasData; + } + bool isMismatched() const { + return _mismatch; + } + bool isReplSet() const { + return _isReplSet; + } + bool isStateDisagreement() const { + return _stateDisagreement; + } + const std::string& getReplicaSetName() const { + return _setName; + } + bool hasState() const { + return _stateSet; + } + MemberState getState() const; + bool hasElectionTime() const { + return _electionTimeSet; + } + OpTime getElectionTime() const; + bool hasIsElectable() const { + return _electableSet; + } + bool isElectable() const; + const std::string& getHbMsg() const { + return _hbmsg; + } + bool hasTime() const { + return _timeSet; + } + Seconds getTime() const; + bool hasOpTime() const { + return _opTimeSet; + } + OpTime getOpTime() const; + const std::string& getSyncingTo() const { + return _syncingTo; + } + int getVersion() const { + return _version; + } + bool hasConfig() const { + return _configSet; + } + const ReplicaSetConfig& getConfig() const; + + /** + * Sets _mismatch to true. + */ + void noteMismatched() { + _mismatch = true; + } + + /** + * Sets _isReplSet to true. + */ + void noteReplSet() { + _isReplSet = true; + } + + /** + * Sets _stateDisagreement to true. + */ + void noteStateDisagreement() { + _stateDisagreement = true; + } + + /** + * Sets _hasData to true, and _hasDataSet to true to indicate _hasData has been modified + */ + void noteHasData() { + _hasDataSet = _hasData = true; + } + + /** + * Sets _setName to "name". + */ + void setSetName(std::string name) { + _setName = name; + } + + /** + * Sets _state to "state". + */ + void setState(MemberState state) { + _stateSet = true; + _state = state; + } + /** - * Response structure for the replSetHeartbeat command. - */ - class ReplSetHeartbeatResponse { - public: - ReplSetHeartbeatResponse(); - - /** - * Initializes this ReplSetHeartbeatResponse from the contents of "doc". - */ - Status initialize(const BSONObj& doc); - - /** - * Appends all non-default values to "builder". - */ - void addToBSON(BSONObjBuilder* builder) const; - - /** - * Returns a BSONObj consisting of all non-default values to "builder". 
- */ - BSONObj toBSON() const; - - /** - * Returns toBSON().toString() - */ - const std::string toString() const { return toBSON().toString(); } - - bool hasDataSet() const { return _hasDataSet; } - bool hasData() const { return _hasData; } - bool isMismatched() const { return _mismatch; } - bool isReplSet() const { return _isReplSet; } - bool isStateDisagreement() const { return _stateDisagreement; } - const std::string& getReplicaSetName() const { return _setName; } - bool hasState() const { return _stateSet; } - MemberState getState() const; - bool hasElectionTime() const { return _electionTimeSet; } - OpTime getElectionTime() const; - bool hasIsElectable() const { return _electableSet; } - bool isElectable() const; - const std::string& getHbMsg() const { return _hbmsg; } - bool hasTime() const { return _timeSet; } - Seconds getTime() const; - bool hasOpTime() const { return _opTimeSet; } - OpTime getOpTime() const; - const std::string& getSyncingTo() const { return _syncingTo; } - int getVersion() const { return _version; } - bool hasConfig() const { return _configSet; } - const ReplicaSetConfig& getConfig() const; - - /** - * Sets _mismatch to true. - */ - void noteMismatched() { _mismatch = true; } - - /** - * Sets _isReplSet to true. - */ - void noteReplSet() { _isReplSet = true; } - - /** - * Sets _stateDisagreement to true. - */ - void noteStateDisagreement() { _stateDisagreement = true; } - - /** - * Sets _hasData to true, and _hasDataSet to true to indicate _hasData has been modified - */ - void noteHasData() { _hasDataSet = _hasData = true;} - - /** - * Sets _setName to "name". - */ - void setSetName(std::string name) { _setName = name; } - - /** - * Sets _state to "state". - */ - void setState(MemberState state) { _stateSet = true; _state = state; } - - /** - * Sets the optional "electionTime" field to the given OpTime. - */ - void setElectionTime(OpTime time) { _electionTimeSet = true; _electionTime = time; } - - /** - * Sets _electable to "electable" and sets _electableSet to true to indicate - * that the value of _electable has been modified. - */ - void setElectable(bool electable) { _electableSet = true; _electable = electable; } - - /** - * Sets _hbmsg to "hbmsg". - */ - void setHbMsg(std::string hbmsg) { _hbmsg = hbmsg; } - - /** - * Sets the optional "time" field of the response to "theTime", which is - * a count of seconds since the UNIX epoch. - */ - void setTime(Seconds theTime) { _timeSet = true; _time = theTime; } - - /** - * Sets _opTime to "time" and sets _opTimeSet to true to indicate that the value - * of _opTime has been modified. - */ - void setOpTime(OpTime time) { _opTimeSet = true; _opTime = time; } - - /** - * Sets _syncingTo to "syncingTo". - */ - void setSyncingTo(std::string syncingTo) { _syncingTo = syncingTo; } - - /** - * Sets _version to "version". - */ - void setVersion(int version) { _version = version; } - - /** - * Initializes _config with "config". - */ - void setConfig(const ReplicaSetConfig& config) { _configSet = true; _config = config; } - - private: - bool _electionTimeSet; - OpTime _electionTime; - - bool _timeSet; - Seconds _time; // Seconds since UNIX epoch. 
- - bool _opTimeSet; - OpTime _opTime; - - bool _electableSet; - bool _electable; - - bool _hasDataSet; - bool _hasData; - - bool _mismatch; - bool _isReplSet; - bool _stateDisagreement; - - bool _stateSet; - MemberState _state; - - int _version; - std::string _setName; - std::string _hbmsg; - std::string _syncingTo; - - bool _configSet; - ReplicaSetConfig _config; - }; - -} // namespace repl -} // namespace mongo + * Sets the optional "electionTime" field to the given OpTime. + */ + void setElectionTime(OpTime time) { + _electionTimeSet = true; + _electionTime = time; + } + + /** + * Sets _electable to "electable" and sets _electableSet to true to indicate + * that the value of _electable has been modified. + */ + void setElectable(bool electable) { + _electableSet = true; + _electable = electable; + } + + /** + * Sets _hbmsg to "hbmsg". + */ + void setHbMsg(std::string hbmsg) { + _hbmsg = hbmsg; + } + + /** + * Sets the optional "time" field of the response to "theTime", which is + * a count of seconds since the UNIX epoch. + */ + void setTime(Seconds theTime) { + _timeSet = true; + _time = theTime; + } + + /** + * Sets _opTime to "time" and sets _opTimeSet to true to indicate that the value + * of _opTime has been modified. + */ + void setOpTime(OpTime time) { + _opTimeSet = true; + _opTime = time; + } + + /** + * Sets _syncingTo to "syncingTo". + */ + void setSyncingTo(std::string syncingTo) { + _syncingTo = syncingTo; + } + + /** + * Sets _version to "version". + */ + void setVersion(int version) { + _version = version; + } + + /** + * Initializes _config with "config". + */ + void setConfig(const ReplicaSetConfig& config) { + _configSet = true; + _config = config; + } + +private: + bool _electionTimeSet; + OpTime _electionTime; + + bool _timeSet; + Seconds _time; // Seconds since UNIX epoch. 
+ + bool _opTimeSet; + OpTime _opTime; + + bool _electableSet; + bool _electable; + + bool _hasDataSet; + bool _hasData; + + bool _mismatch; + bool _isReplSet; + bool _stateDisagreement; + + bool _stateSet; + MemberState _state; + + int _version; + std::string _setName; + std::string _hbmsg; + std::string _syncingTo; + + bool _configSet; + ReplicaSetConfig _config; +}; + +} // namespace repl +} // namespace mongo diff --git a/src/mongo/db/repl/repl_set_heartbeat_response_test.cpp b/src/mongo/db/repl/repl_set_heartbeat_response_test.cpp index 71003ab4fec..a2791084d00 100644 --- a/src/mongo/db/repl/repl_set_heartbeat_response_test.cpp +++ b/src/mongo/db/repl/repl_set_heartbeat_response_test.cpp @@ -38,722 +38,733 @@ namespace mongo { namespace repl { namespace { - using boost::scoped_ptr; - using std::auto_ptr; - - bool stringContains(const std::string &haystack, const std::string& needle) { - return haystack.find(needle) != std::string::npos; - } - - TEST(ReplSetHeartbeatResponse, DefaultConstructThenSlowlyBuildToFullObj) { - int fieldsSet = 2; - ReplSetHeartbeatResponse hbResponse; - ReplSetHeartbeatResponse hbResponseObjRoundTripChecker; - ASSERT_EQUALS(false, hbResponse.hasState()); - ASSERT_EQUALS(false, hbResponse.hasElectionTime()); - ASSERT_EQUALS(false, hbResponse.hasIsElectable()); - ASSERT_EQUALS(false, hbResponse.hasTime()); - ASSERT_EQUALS(false, hbResponse.hasOpTime()); - ASSERT_EQUALS(false, hbResponse.hasConfig()); - ASSERT_EQUALS(false, hbResponse.isMismatched()); - ASSERT_EQUALS(false, hbResponse.isReplSet()); - ASSERT_EQUALS(false, hbResponse.isStateDisagreement()); - ASSERT_EQUALS("", hbResponse.getReplicaSetName()); - ASSERT_EQUALS("", hbResponse.getHbMsg()); - ASSERT_EQUALS("", hbResponse.getSyncingTo()); - ASSERT_EQUALS(-1, hbResponse.getVersion()); - - BSONObj hbResponseObj = hbResponse.toBSON(); - ASSERT_EQUALS(fieldsSet, hbResponseObj.nFields()); - ASSERT_EQUALS("", hbResponseObj["hbmsg"].String()); - - Status initializeResult = Status::OK(); - ASSERT_EQUALS(hbResponseObj.toString(), hbResponseObjRoundTripChecker.toBSON().toString()); - - // set version - hbResponse.setVersion(1); - ++fieldsSet; - ASSERT_EQUALS(false, hbResponse.hasState()); - ASSERT_EQUALS(false, hbResponse.hasElectionTime()); - ASSERT_EQUALS(false, hbResponse.hasIsElectable()); - ASSERT_EQUALS(false, hbResponse.hasTime()); - ASSERT_EQUALS(false, hbResponse.hasOpTime()); - ASSERT_EQUALS(false, hbResponse.hasConfig()); - ASSERT_EQUALS(false, hbResponse.isMismatched()); - ASSERT_EQUALS(false, hbResponse.isReplSet()); - ASSERT_EQUALS(false, hbResponse.isStateDisagreement()); - ASSERT_EQUALS("", hbResponse.getReplicaSetName()); - ASSERT_EQUALS("", hbResponse.getHbMsg()); - ASSERT_EQUALS("", hbResponse.getSyncingTo()); - ASSERT_EQUALS(1, hbResponse.getVersion()); - - hbResponseObj = hbResponse.toBSON(); - ASSERT_EQUALS(fieldsSet, hbResponseObj.nFields()); - ASSERT_EQUALS("", hbResponseObj["hbmsg"].String()); - ASSERT_EQUALS(1, hbResponseObj["v"].Number()); - - initializeResult = hbResponseObjRoundTripChecker.initialize(hbResponseObj); - ASSERT_EQUALS(Status::OK(), initializeResult); - ASSERT_EQUALS(hbResponseObj.toString(), hbResponseObjRoundTripChecker.toBSON().toString()); - - // set setname - hbResponse.setSetName("rs0"); - ++fieldsSet; - ASSERT_EQUALS(false, hbResponse.hasState()); - ASSERT_EQUALS(false, hbResponse.hasElectionTime()); - ASSERT_EQUALS(false, hbResponse.hasIsElectable()); - ASSERT_EQUALS(false, hbResponse.hasTime()); - ASSERT_EQUALS(false, hbResponse.hasOpTime()); - 
ASSERT_EQUALS(false, hbResponse.hasConfig()); - ASSERT_EQUALS(false, hbResponse.isMismatched()); - ASSERT_EQUALS(false, hbResponse.isReplSet()); - ASSERT_EQUALS(false, hbResponse.isStateDisagreement()); - ASSERT_EQUALS("rs0", hbResponse.getReplicaSetName()); - ASSERT_EQUALS("", hbResponse.getHbMsg()); - ASSERT_EQUALS("", hbResponse.getSyncingTo()); - ASSERT_EQUALS(1, hbResponse.getVersion()); - - hbResponseObj = hbResponse.toBSON(); - ASSERT_EQUALS(fieldsSet, hbResponseObj.nFields()); - ASSERT_EQUALS("rs0", hbResponseObj["set"].String()); - ASSERT_EQUALS("", hbResponseObj["hbmsg"].String()); - ASSERT_EQUALS(1, hbResponseObj["v"].Number()); - - initializeResult = hbResponseObjRoundTripChecker.initialize(hbResponseObj); - ASSERT_EQUALS(Status::OK(), initializeResult); - ASSERT_EQUALS(hbResponseObj.toString(), hbResponseObjRoundTripChecker.toBSON().toString()); - - // set electionTime - hbResponse.setElectionTime(OpTime(10,0)); - ++fieldsSet; - ASSERT_EQUALS(false, hbResponse.hasState()); - ASSERT_EQUALS(true, hbResponse.hasElectionTime()); - ASSERT_EQUALS(false, hbResponse.hasIsElectable()); - ASSERT_EQUALS(false, hbResponse.hasTime()); - ASSERT_EQUALS(false, hbResponse.hasOpTime()); - ASSERT_EQUALS(false, hbResponse.hasConfig()); - ASSERT_EQUALS(false, hbResponse.isMismatched()); - ASSERT_EQUALS(false, hbResponse.isReplSet()); - ASSERT_EQUALS(false, hbResponse.isStateDisagreement()); - ASSERT_EQUALS("rs0", hbResponse.getReplicaSetName()); - ASSERT_EQUALS("", hbResponse.getHbMsg()); - ASSERT_EQUALS("", hbResponse.getSyncingTo()); - ASSERT_EQUALS(1, hbResponse.getVersion()); - ASSERT_EQUALS(OpTime(10,0), hbResponse.getElectionTime()); - - hbResponseObj = hbResponse.toBSON(); - ASSERT_EQUALS(fieldsSet, hbResponseObj.nFields()); - ASSERT_EQUALS("rs0", hbResponseObj["set"].String()); - ASSERT_EQUALS("", hbResponseObj["hbmsg"].String()); - ASSERT_EQUALS(1, hbResponseObj["v"].Number()); - ASSERT_EQUALS(OpTime(10,0), hbResponseObj["electionTime"]._opTime()); - - initializeResult = hbResponseObjRoundTripChecker.initialize(hbResponseObj); - ASSERT_EQUALS(Status::OK(), initializeResult); - ASSERT_EQUALS(hbResponseObj.toString(), hbResponseObjRoundTripChecker.toBSON().toString()); - - // set opTime - hbResponse.setOpTime(Date_t(10)); - ++fieldsSet; - ASSERT_EQUALS(false, hbResponse.hasState()); - ASSERT_EQUALS(true, hbResponse.hasElectionTime()); - ASSERT_EQUALS(false, hbResponse.hasIsElectable()); - ASSERT_EQUALS(false, hbResponse.hasTime()); - ASSERT_EQUALS(true, hbResponse.hasOpTime()); - ASSERT_EQUALS(false, hbResponse.hasConfig()); - ASSERT_EQUALS(false, hbResponse.isMismatched()); - ASSERT_EQUALS(false, hbResponse.isReplSet()); - ASSERT_EQUALS(false, hbResponse.isStateDisagreement()); - ASSERT_EQUALS("rs0", hbResponse.getReplicaSetName()); - ASSERT_EQUALS("", hbResponse.getHbMsg()); - ASSERT_EQUALS("", hbResponse.getSyncingTo()); - ASSERT_EQUALS(1, hbResponse.getVersion()); - ASSERT_EQUALS(OpTime(10,0), hbResponse.getElectionTime()); - ASSERT_EQUALS(OpTime(0,10), hbResponse.getOpTime()); - - hbResponseObj = hbResponse.toBSON(); - ASSERT_EQUALS(fieldsSet, hbResponseObj.nFields()); - ASSERT_EQUALS("rs0", hbResponseObj["set"].String()); - ASSERT_EQUALS("", hbResponseObj["hbmsg"].String()); - ASSERT_EQUALS(1, hbResponseObj["v"].Number()); - ASSERT_EQUALS(OpTime(10,0), hbResponseObj["electionTime"]._opTime()); - ASSERT_EQUALS(OpTime(0,10), hbResponseObj["opTime"]._opTime()); - - initializeResult = hbResponseObjRoundTripChecker.initialize(hbResponseObj); - ASSERT_EQUALS(Status::OK(), initializeResult); 
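- // (Round-trip pattern used throughout this test: after each setter the
- // response is re-serialized with toBSON(), re-parsed into
- // hbResponseObjRoundTripChecker, and the printed forms are compared to
- // verify that toBSON() and initialize() remain inverses as fields accumulate.)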
- ASSERT_EQUALS(hbResponseObj.toString(), hbResponseObjRoundTripChecker.toBSON().toString()); - - // set time - hbResponse.setTime(Seconds(10)); - ++fieldsSet; - ASSERT_EQUALS(false, hbResponse.hasState()); - ASSERT_EQUALS(true, hbResponse.hasElectionTime()); - ASSERT_EQUALS(false, hbResponse.hasIsElectable()); - ASSERT_EQUALS(true, hbResponse.hasTime()); - ASSERT_EQUALS(true, hbResponse.hasOpTime()); - ASSERT_EQUALS(false, hbResponse.hasConfig()); - ASSERT_EQUALS(false, hbResponse.isMismatched()); - ASSERT_EQUALS(false, hbResponse.isReplSet()); - ASSERT_EQUALS(false, hbResponse.isStateDisagreement()); - ASSERT_EQUALS("rs0", hbResponse.getReplicaSetName()); - ASSERT_EQUALS("", hbResponse.getHbMsg()); - ASSERT_EQUALS("", hbResponse.getSyncingTo()); - ASSERT_EQUALS(1, hbResponse.getVersion()); - ASSERT_EQUALS(OpTime(10,0), hbResponse.getElectionTime()); - ASSERT_EQUALS(OpTime(0,10), hbResponse.getOpTime()); - ASSERT_EQUALS(10, hbResponse.getTime().total_seconds()); - - hbResponseObj = hbResponse.toBSON(); - ASSERT_EQUALS(fieldsSet, hbResponseObj.nFields()); - ASSERT_EQUALS("rs0", hbResponseObj["set"].String()); - ASSERT_EQUALS("", hbResponseObj["hbmsg"].String()); - ASSERT_EQUALS(1, hbResponseObj["v"].Number()); - ASSERT_EQUALS(OpTime(10,0), hbResponseObj["electionTime"]._opTime()); - ASSERT_EQUALS(OpTime(0,10), hbResponseObj["opTime"]._opTime()); - ASSERT_EQUALS(10, hbResponseObj["time"].numberLong()); - - initializeResult = hbResponseObjRoundTripChecker.initialize(hbResponseObj); - ASSERT_EQUALS(Status::OK(), initializeResult); - ASSERT_EQUALS(hbResponseObj.toString(), hbResponseObjRoundTripChecker.toBSON().toString()); - - // set electable - hbResponse.setElectable(true); - ++fieldsSet; - ASSERT_EQUALS(false, hbResponse.hasState()); - ASSERT_EQUALS(true, hbResponse.hasElectionTime()); - ASSERT_EQUALS(true, hbResponse.hasIsElectable()); - ASSERT_EQUALS(true, hbResponse.hasTime()); - ASSERT_EQUALS(true, hbResponse.hasOpTime()); - ASSERT_EQUALS(false, hbResponse.hasConfig()); - ASSERT_EQUALS(false, hbResponse.isMismatched()); - ASSERT_EQUALS(false, hbResponse.isReplSet()); - ASSERT_EQUALS(false, hbResponse.isStateDisagreement()); - ASSERT_EQUALS("rs0", hbResponse.getReplicaSetName()); - ASSERT_EQUALS("", hbResponse.getHbMsg()); - ASSERT_EQUALS("", hbResponse.getSyncingTo()); - ASSERT_EQUALS(1, hbResponse.getVersion()); - ASSERT_EQUALS(OpTime(10,0), hbResponse.getElectionTime()); - ASSERT_EQUALS(OpTime(0,10), hbResponse.getOpTime()); - ASSERT_EQUALS(10, hbResponse.getTime().total_seconds()); - ASSERT_EQUALS(true, hbResponse.isElectable()); - - hbResponseObj = hbResponse.toBSON(); - ASSERT_EQUALS(fieldsSet, hbResponseObj.nFields()); - ASSERT_EQUALS("rs0", hbResponseObj["set"].String()); - ASSERT_EQUALS("", hbResponseObj["hbmsg"].String()); - ASSERT_EQUALS(1, hbResponseObj["v"].Number()); - ASSERT_EQUALS(OpTime(10,0), hbResponseObj["electionTime"]._opTime()); - ASSERT_EQUALS(OpTime(0,10), hbResponseObj["opTime"]._opTime()); - ASSERT_EQUALS(10, hbResponseObj["time"].numberLong()); - ASSERT_EQUALS(true, hbResponseObj["e"].trueValue()); - - initializeResult = hbResponseObjRoundTripChecker.initialize(hbResponseObj); - ASSERT_EQUALS(Status::OK(), initializeResult); - ASSERT_EQUALS(hbResponseObj.toString(), hbResponseObjRoundTripChecker.toBSON().toString()); - - // set config - ReplicaSetConfig config; - hbResponse.setConfig(config); - ++fieldsSet; - ASSERT_EQUALS(false, hbResponse.hasState()); - ASSERT_EQUALS(true, hbResponse.hasElectionTime()); - ASSERT_EQUALS(true, hbResponse.hasIsElectable()); - 
ASSERT_EQUALS(true, hbResponse.hasTime()); - ASSERT_EQUALS(true, hbResponse.hasOpTime()); - ASSERT_EQUALS(true, hbResponse.hasConfig()); - ASSERT_EQUALS(false, hbResponse.isMismatched()); - ASSERT_EQUALS(false, hbResponse.isReplSet()); - ASSERT_EQUALS(false, hbResponse.isStateDisagreement()); - ASSERT_EQUALS("rs0", hbResponse.getReplicaSetName()); - ASSERT_EQUALS("", hbResponse.getHbMsg()); - ASSERT_EQUALS("", hbResponse.getSyncingTo()); - ASSERT_EQUALS(1, hbResponse.getVersion()); - ASSERT_EQUALS(OpTime(10,0), hbResponse.getElectionTime()); - ASSERT_EQUALS(OpTime(0,10), hbResponse.getOpTime()); - ASSERT_EQUALS(10, hbResponse.getTime().total_seconds()); - ASSERT_EQUALS(true, hbResponse.isElectable()); - ASSERT_EQUALS(config.toBSON().toString(), hbResponse.getConfig().toBSON().toString()); - - hbResponseObj = hbResponse.toBSON(); - ASSERT_EQUALS(fieldsSet, hbResponseObj.nFields()); - ASSERT_EQUALS("rs0", hbResponseObj["set"].String()); - ASSERT_EQUALS("", hbResponseObj["hbmsg"].String()); - ASSERT_EQUALS(1, hbResponseObj["v"].Number()); - ASSERT_EQUALS(OpTime(10,0), hbResponseObj["electionTime"]._opTime()); - ASSERT_EQUALS(OpTime(0,10), hbResponseObj["opTime"]._opTime()); - ASSERT_EQUALS(10, hbResponseObj["time"].numberLong()); - ASSERT_EQUALS(true, hbResponseObj["e"].trueValue()); - ASSERT_EQUALS(config.toBSON().toString(), hbResponseObj["config"].Obj().toString()); - - initializeResult = hbResponseObjRoundTripChecker.initialize(hbResponseObj); - ASSERT_EQUALS(Status::OK(), initializeResult); - ASSERT_EQUALS(hbResponseObj.toString(), hbResponseObjRoundTripChecker.toBSON().toString()); - - // set state - hbResponse.setState(MemberState(MemberState::RS_SECONDARY)); - ++fieldsSet; - ASSERT_EQUALS(true, hbResponse.hasState()); - ASSERT_EQUALS(true, hbResponse.hasElectionTime()); - ASSERT_EQUALS(true, hbResponse.hasIsElectable()); - ASSERT_EQUALS(true, hbResponse.hasTime()); - ASSERT_EQUALS(true, hbResponse.hasOpTime()); - ASSERT_EQUALS(true, hbResponse.hasConfig()); - ASSERT_EQUALS(false, hbResponse.isMismatched()); - ASSERT_EQUALS(false, hbResponse.isReplSet()); - ASSERT_EQUALS(false, hbResponse.isStateDisagreement()); - ASSERT_EQUALS("rs0", hbResponse.getReplicaSetName()); - ASSERT_EQUALS(MemberState(MemberState::RS_SECONDARY).toString(), - hbResponse.getState().toString()); - ASSERT_EQUALS("", hbResponse.getHbMsg()); - ASSERT_EQUALS("", hbResponse.getSyncingTo()); - ASSERT_EQUALS(1, hbResponse.getVersion()); - ASSERT_EQUALS(OpTime(10,0), hbResponse.getElectionTime()); - ASSERT_EQUALS(OpTime(0,10), hbResponse.getOpTime()); - ASSERT_EQUALS(10, hbResponse.getTime().total_seconds()); - ASSERT_EQUALS(true, hbResponse.isElectable()); - ASSERT_EQUALS(config.toBSON().toString(), hbResponse.getConfig().toBSON().toString()); - - hbResponseObj = hbResponse.toBSON(); - ASSERT_EQUALS(fieldsSet, hbResponseObj.nFields()); - ASSERT_EQUALS("rs0", hbResponseObj["set"].String()); - ASSERT_EQUALS("", hbResponseObj["hbmsg"].String()); - ASSERT_EQUALS(1, hbResponseObj["v"].Number()); - ASSERT_EQUALS(OpTime(10,0), hbResponseObj["electionTime"]._opTime()); - ASSERT_EQUALS(OpTime(0,10), hbResponseObj["opTime"]._opTime()); - ASSERT_EQUALS(10, hbResponseObj["time"].numberLong()); - ASSERT_EQUALS(true, hbResponseObj["e"].trueValue()); - ASSERT_EQUALS(config.toBSON().toString(), hbResponseObj["config"].Obj().toString()); - ASSERT_EQUALS(2, hbResponseObj["state"].numberLong()); - - initializeResult = hbResponseObjRoundTripChecker.initialize(hbResponseObj); - ASSERT_EQUALS(Status::OK(), initializeResult); - 
ASSERT_EQUALS(hbResponseObj.toString(), hbResponseObjRoundTripChecker.toBSON().toString()); - - // set stateDisagreement - hbResponse.noteStateDisagreement(); - ++fieldsSet; - ASSERT_EQUALS(true, hbResponse.hasState()); - ASSERT_EQUALS(true, hbResponse.hasElectionTime()); - ASSERT_EQUALS(true, hbResponse.hasIsElectable()); - ASSERT_EQUALS(true, hbResponse.hasTime()); - ASSERT_EQUALS(true, hbResponse.hasOpTime()); - ASSERT_EQUALS(true, hbResponse.hasConfig()); - ASSERT_EQUALS(false, hbResponse.isMismatched()); - ASSERT_EQUALS(false, hbResponse.isReplSet()); - ASSERT_EQUALS(true, hbResponse.isStateDisagreement()); - ASSERT_EQUALS("rs0", hbResponse.getReplicaSetName()); - ASSERT_EQUALS(MemberState(MemberState::RS_SECONDARY).toString(), - hbResponse.getState().toString()); - ASSERT_EQUALS("", hbResponse.getHbMsg()); - ASSERT_EQUALS("", hbResponse.getSyncingTo()); - ASSERT_EQUALS(1, hbResponse.getVersion()); - ASSERT_EQUALS(OpTime(10,0), hbResponse.getElectionTime()); - ASSERT_EQUALS(OpTime(0,10), hbResponse.getOpTime()); - ASSERT_EQUALS(10, hbResponse.getTime().total_seconds()); - ASSERT_EQUALS(true, hbResponse.isElectable()); - ASSERT_EQUALS(config.toBSON().toString(), hbResponse.getConfig().toBSON().toString()); - - hbResponseObj = hbResponse.toBSON(); - ASSERT_EQUALS(fieldsSet, hbResponseObj.nFields()); - ASSERT_EQUALS("rs0", hbResponseObj["set"].String()); - ASSERT_EQUALS("", hbResponseObj["hbmsg"].String()); - ASSERT_EQUALS(1, hbResponseObj["v"].Number()); - ASSERT_EQUALS(OpTime(10,0), hbResponseObj["electionTime"]._opTime()); - ASSERT_EQUALS(OpTime(0,10), hbResponseObj["opTime"]._opTime()); - ASSERT_EQUALS(10, hbResponseObj["time"].numberLong()); - ASSERT_EQUALS(true, hbResponseObj["e"].trueValue()); - ASSERT_EQUALS(config.toBSON().toString(), hbResponseObj["config"].Obj().toString()); - ASSERT_EQUALS(2, hbResponseObj["state"].numberLong()); - ASSERT_EQUALS(false, hbResponseObj["mismatch"].trueValue()); - ASSERT_EQUALS(true, hbResponseObj["stateDisagreement"].trueValue()); - - initializeResult = hbResponseObjRoundTripChecker.initialize(hbResponseObj); - ASSERT_EQUALS(Status::OK(), initializeResult); - ASSERT_EQUALS(hbResponseObj.toString(), hbResponseObjRoundTripChecker.toBSON().toString()); - - // set replSet - hbResponse.noteReplSet(); - ++fieldsSet; - ASSERT_EQUALS(true, hbResponse.hasState()); - ASSERT_EQUALS(true, hbResponse.hasElectionTime()); - ASSERT_EQUALS(true, hbResponse.hasIsElectable()); - ASSERT_EQUALS(true, hbResponse.hasTime()); - ASSERT_EQUALS(true, hbResponse.hasOpTime()); - ASSERT_EQUALS(true, hbResponse.hasConfig()); - ASSERT_EQUALS(false, hbResponse.isMismatched()); - ASSERT_EQUALS(true, hbResponse.isReplSet()); - ASSERT_EQUALS(true, hbResponse.isStateDisagreement()); - ASSERT_EQUALS("rs0", hbResponse.getReplicaSetName()); - ASSERT_EQUALS(MemberState(MemberState::RS_SECONDARY).toString(), - hbResponse.getState().toString()); - ASSERT_EQUALS("", hbResponse.getHbMsg()); - ASSERT_EQUALS("", hbResponse.getSyncingTo()); - ASSERT_EQUALS(1, hbResponse.getVersion()); - ASSERT_EQUALS(OpTime(10,0), hbResponse.getElectionTime()); - ASSERT_EQUALS(OpTime(0,10), hbResponse.getOpTime()); - ASSERT_EQUALS(10, hbResponse.getTime().total_seconds()); - ASSERT_EQUALS(true, hbResponse.isElectable()); - ASSERT_EQUALS(config.toBSON().toString(), hbResponse.getConfig().toBSON().toString()); - - hbResponseObj = hbResponse.toBSON(); - ASSERT_EQUALS(fieldsSet, hbResponseObj.nFields()); - ASSERT_EQUALS("rs0", hbResponseObj["set"].String()); - ASSERT_EQUALS("", hbResponseObj["hbmsg"].String()); - 
ASSERT_EQUALS(1, hbResponseObj["v"].Number()); - ASSERT_EQUALS(OpTime(10,0), hbResponseObj["electionTime"]._opTime()); - ASSERT_EQUALS(OpTime(0,10), hbResponseObj["opTime"]._opTime()); - ASSERT_EQUALS(10, hbResponseObj["time"].numberLong()); - ASSERT_EQUALS(true, hbResponseObj["e"].trueValue()); - ASSERT_EQUALS(config.toBSON().toString(), hbResponseObj["config"].Obj().toString()); - ASSERT_EQUALS(2, hbResponseObj["state"].numberLong()); - ASSERT_EQUALS(false, hbResponseObj["mismatch"].trueValue()); - ASSERT_EQUALS(true, hbResponseObj["stateDisagreement"].trueValue()); - ASSERT_EQUALS(true, hbResponseObj["rs"].trueValue()); - - initializeResult = hbResponseObjRoundTripChecker.initialize(hbResponseObj); - ASSERT_EQUALS(Status::OK(), initializeResult); - ASSERT_EQUALS(hbResponseObj.toString(), hbResponseObjRoundTripChecker.toBSON().toString()); - - // set syncingTo - hbResponse.setSyncingTo("syncTarget"); - ++fieldsSet; - ASSERT_EQUALS(true, hbResponse.hasState()); - ASSERT_EQUALS(true, hbResponse.hasElectionTime()); - ASSERT_EQUALS(true, hbResponse.hasIsElectable()); - ASSERT_EQUALS(true, hbResponse.hasTime()); - ASSERT_EQUALS(true, hbResponse.hasOpTime()); - ASSERT_EQUALS(true, hbResponse.hasConfig()); - ASSERT_EQUALS(false, hbResponse.isMismatched()); - ASSERT_EQUALS(true, hbResponse.isReplSet()); - ASSERT_EQUALS(true, hbResponse.isStateDisagreement()); - ASSERT_EQUALS("rs0", hbResponse.getReplicaSetName()); - ASSERT_EQUALS(MemberState(MemberState::RS_SECONDARY).toString(), - hbResponse.getState().toString()); - ASSERT_EQUALS("", hbResponse.getHbMsg()); - ASSERT_EQUALS("syncTarget", hbResponse.getSyncingTo()); - ASSERT_EQUALS(1, hbResponse.getVersion()); - ASSERT_EQUALS(OpTime(10,0), hbResponse.getElectionTime()); - ASSERT_EQUALS(OpTime(0,10), hbResponse.getOpTime()); - ASSERT_EQUALS(10, hbResponse.getTime().total_seconds()); - ASSERT_EQUALS(true, hbResponse.isElectable()); - ASSERT_EQUALS(config.toBSON().toString(), hbResponse.getConfig().toBSON().toString()); - - hbResponseObj = hbResponse.toBSON(); - ASSERT_EQUALS(fieldsSet, hbResponseObj.nFields()); - ASSERT_EQUALS("rs0", hbResponseObj["set"].String()); - ASSERT_EQUALS("", hbResponseObj["hbmsg"].String()); - ASSERT_EQUALS(1, hbResponseObj["v"].Number()); - ASSERT_EQUALS(OpTime(10,0), hbResponseObj["electionTime"]._opTime()); - ASSERT_EQUALS(OpTime(0,10), hbResponseObj["opTime"]._opTime()); - ASSERT_EQUALS(10, hbResponseObj["time"].numberLong()); - ASSERT_EQUALS(true, hbResponseObj["e"].trueValue()); - ASSERT_EQUALS(config.toBSON().toString(), hbResponseObj["config"].Obj().toString()); - ASSERT_EQUALS(2, hbResponseObj["state"].numberLong()); - ASSERT_EQUALS(false, hbResponseObj["mismatch"].trueValue()); - ASSERT_EQUALS(true, hbResponseObj["stateDisagreement"].trueValue()); - ASSERT_EQUALS(true, hbResponseObj["rs"].trueValue()); - ASSERT_EQUALS("syncTarget", hbResponseObj["syncingTo"].String()); - - initializeResult = hbResponseObjRoundTripChecker.initialize(hbResponseObj); - ASSERT_EQUALS(Status::OK(), initializeResult); - ASSERT_EQUALS(hbResponseObj.toString(), hbResponseObjRoundTripChecker.toBSON().toString()); - - // set hbmsg - hbResponse.setHbMsg("lub dub"); - ASSERT_EQUALS(true, hbResponse.hasState()); - ASSERT_EQUALS(true, hbResponse.hasElectionTime()); - ASSERT_EQUALS(true, hbResponse.hasIsElectable()); - ASSERT_EQUALS(true, hbResponse.hasTime()); - ASSERT_EQUALS(true, hbResponse.hasOpTime()); - ASSERT_EQUALS(true, hbResponse.hasConfig()); - ASSERT_EQUALS(false, hbResponse.isMismatched()); - ASSERT_EQUALS(true, 
hbResponse.isReplSet());
- ASSERT_EQUALS(true, hbResponse.isStateDisagreement());
- ASSERT_EQUALS("rs0", hbResponse.getReplicaSetName());
- ASSERT_EQUALS(MemberState(MemberState::RS_SECONDARY).toString(),
- hbResponse.getState().toString());
- ASSERT_EQUALS("lub dub", hbResponse.getHbMsg());
- ASSERT_EQUALS("syncTarget", hbResponse.getSyncingTo());
- ASSERT_EQUALS(1, hbResponse.getVersion());
- ASSERT_EQUALS(OpTime(10,0), hbResponse.getElectionTime());
- ASSERT_EQUALS(OpTime(0,10), hbResponse.getOpTime());
- ASSERT_EQUALS(10, hbResponse.getTime().total_seconds());
- ASSERT_EQUALS(true, hbResponse.isElectable());
- ASSERT_EQUALS(config.toBSON().toString(), hbResponse.getConfig().toBSON().toString());
-
- hbResponseObj = hbResponse.toBSON();
- ASSERT_EQUALS(fieldsSet, hbResponseObj.nFields());
- ASSERT_EQUALS("rs0", hbResponseObj["set"].String());
- ASSERT_EQUALS("lub dub", hbResponseObj["hbmsg"].String());
- ASSERT_EQUALS(1, hbResponseObj["v"].Number());
- ASSERT_EQUALS(OpTime(10,0), hbResponseObj["electionTime"]._opTime());
- ASSERT_EQUALS(OpTime(0,10), hbResponseObj["opTime"]._opTime());
- ASSERT_EQUALS(10, hbResponseObj["time"].numberLong());
- ASSERT_EQUALS(true, hbResponseObj["e"].trueValue());
- ASSERT_EQUALS(config.toBSON().toString(), hbResponseObj["config"].Obj().toString());
- ASSERT_EQUALS(2, hbResponseObj["state"].numberLong());
- ASSERT_EQUALS(false, hbResponseObj["mismatch"].trueValue());
- ASSERT_EQUALS(true, hbResponseObj["stateDisagreement"].trueValue());
- ASSERT_EQUALS(true, hbResponseObj["rs"].trueValue());
- ASSERT_EQUALS("syncTarget", hbResponseObj["syncingTo"].String());
-
- initializeResult = hbResponseObjRoundTripChecker.initialize(hbResponseObj);
- ASSERT_EQUALS(Status::OK(), initializeResult);
- ASSERT_EQUALS(hbResponseObj.toString(), hbResponseObjRoundTripChecker.toBSON().toString());
-
- // set mismatched
- hbResponse.noteMismatched();
- ASSERT_EQUALS(true, hbResponse.hasState());
- ASSERT_EQUALS(true, hbResponse.hasElectionTime());
- ASSERT_EQUALS(true, hbResponse.hasIsElectable());
- ASSERT_EQUALS(true, hbResponse.hasTime());
- ASSERT_EQUALS(true, hbResponse.hasOpTime());
- ASSERT_EQUALS(true, hbResponse.hasConfig());
- ASSERT_EQUALS(true, hbResponse.isMismatched());
- ASSERT_EQUALS(true, hbResponse.isReplSet());
- ASSERT_EQUALS(true, hbResponse.isStateDisagreement());
- ASSERT_EQUALS("rs0", hbResponse.getReplicaSetName());
- ASSERT_EQUALS(MemberState(MemberState::RS_SECONDARY).toString(),
- hbResponse.getState().toString());
- ASSERT_EQUALS("lub dub", hbResponse.getHbMsg());
- ASSERT_EQUALS("syncTarget", hbResponse.getSyncingTo());
- ASSERT_EQUALS(1, hbResponse.getVersion());
- ASSERT_EQUALS(OpTime(10,0), hbResponse.getElectionTime());
- ASSERT_EQUALS(OpTime(0,10), hbResponse.getOpTime());
- ASSERT_EQUALS(10, hbResponse.getTime().total_seconds());
- ASSERT_EQUALS(true, hbResponse.isElectable());
- ASSERT_EQUALS(config.toBSON().toString(), hbResponse.getConfig().toBSON().toString());
-
- hbResponseObj = hbResponse.toBSON();
- ASSERT_EQUALS(2, hbResponseObj.nFields());
- ASSERT_EQUALS(true, hbResponseObj["mismatch"].trueValue());
-
- // NOTE: Does not check round-trip. Once noteMismatched is set the bson will return an error
- // from initialize parsing.
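The behavior that NOTE describes is the mismatch short-circuit implemented above: addToBSON() serializes a mismatched response as just { ok: 0.0, mismatch: true }, and initialize() stops early on that document with InconsistentReplicaSetNames instead of repopulating the fields. A minimal sketch of this one-way trip; the names are from this diff, the scenario is hypothetical:

    ReplSetHeartbeatResponse resp;
    resp.noteMismatched();
    BSONObj wire = resp.toBSON();  // yields { ok: 0.0, mismatch: true } only

    ReplSetHeartbeatResponse parsed;
    Status status = parsed.initialize(wire);  // short-circuits on "mismatch"
    invariant(status.code() == ErrorCodes::InconsistentReplicaSetNames);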
- initializeResult = hbResponseObjRoundTripChecker.initialize(hbResponseObj); - ASSERT_NOT_EQUALS(Status::OK(), initializeResult); - ASSERT_EQUALS(ErrorCodes::InconsistentReplicaSetNames, initializeResult.code()); - } - - TEST(ReplSetHeartbeatResponse, InitializeWrongElectionTimeType) { - ReplSetHeartbeatResponse hbResponse; - BSONObj initializerObj = BSON("ok" << 1.0 << "electionTime" << "hello"); - Status result = hbResponse.initialize(initializerObj); - ASSERT_EQUALS(ErrorCodes::TypeMismatch, result); - ASSERT_EQUALS("Expected \"electionTime\" field in response to replSetHeartbeat command to " - "have type Date or Timestamp, but found type String", - result.reason()); - } - - TEST(ReplSetHeartbeatResponse, InitializeWrongTimeType) { - ReplSetHeartbeatResponse hbResponse; - BSONObj initializerObj = BSON("ok" << 1.0 << "time" << "hello"); - Status result = hbResponse.initialize(initializerObj); - ASSERT_EQUALS(ErrorCodes::TypeMismatch, result); - ASSERT_EQUALS("Expected \"time\" field in response to replSetHeartbeat command to " - "have a numeric type, but found type String", - result.reason()); - } - - TEST(ReplSetHeartbeatResponse, InitializeWrongOpTimeType) { - ReplSetHeartbeatResponse hbResponse; - BSONObj initializerObj = BSON("ok" << 1.0 << "opTime" << "hello"); - Status result = hbResponse.initialize(initializerObj); - ASSERT_EQUALS(ErrorCodes::TypeMismatch, result); - ASSERT_EQUALS("Expected \"opTime\" field in response to replSetHeartbeat command to " - "have type Date or Timestamp, but found type String", - result.reason()); - } - - TEST(ReplSetHeartbeatResponse, InitializeMemberStateWrongType) { - ReplSetHeartbeatResponse hbResponse; - BSONObj initializerObj = BSON("ok" << 1.0 << "state" << "hello"); - Status result = hbResponse.initialize(initializerObj); - ASSERT_EQUALS(ErrorCodes::TypeMismatch, result); - ASSERT_EQUALS("Expected \"state\" field in response to replSetHeartbeat command to " - "have type NumberInt or NumberLong, but found type String", - result.reason()); - } - - TEST(ReplSetHeartbeatResponse, InitializeMemberStateTooLow) { - ReplSetHeartbeatResponse hbResponse; - BSONObj initializerObj = BSON("ok" << 1.0 << "state" << -1); - Status result = hbResponse.initialize(initializerObj); - ASSERT_EQUALS(ErrorCodes::BadValue, result); - ASSERT_EQUALS("Value for \"state\" in response to replSetHeartbeat is out of range; " - "legal values are non-negative and no more than 10", - result.reason()); - } - - TEST(ReplSetHeartbeatResponse, InitializeMemberStateTooHigh) { - ReplSetHeartbeatResponse hbResponse; - BSONObj initializerObj = BSON("ok" << 1.0 << "state" << 11); - Status result = hbResponse.initialize(initializerObj); - ASSERT_EQUALS(ErrorCodes::BadValue, result); - ASSERT_EQUALS("Value for \"state\" in response to replSetHeartbeat is out of range; " - "legal values are non-negative and no more than 10", - result.reason()); - } - - TEST(ReplSetHeartbeatResponse, InitializeVersionWrongType) { - ReplSetHeartbeatResponse hbResponse; - BSONObj initializerObj = BSON("ok" << 1.0 << - "v" << "hello"); - Status result = hbResponse.initialize(initializerObj); - ASSERT_EQUALS(ErrorCodes::TypeMismatch, result); - ASSERT_EQUALS("Expected \"v\" field in response to replSetHeartbeat to " - "have type NumberInt, but found String", - result.reason()); - } - - TEST(ReplSetHeartbeatResponse, InitializeReplSetNameWrongType) { - ReplSetHeartbeatResponse hbResponse; - BSONObj initializerObj = BSON("ok" << 1.0 << - "v" << 2 << // needs a version to get this far in initialize() - "set" << 
4);
- Status result = hbResponse.initialize(initializerObj);
- ASSERT_EQUALS(ErrorCodes::TypeMismatch, result);
- ASSERT_EQUALS("Expected \"set\" field in response to replSetHeartbeat to "
- "have type String, but found NumberInt32",
- result.reason());
- }
-
- TEST(ReplSetHeartbeatResponse, InitializeHeartbeatMessageWrongType) {
- ReplSetHeartbeatResponse hbResponse;
- BSONObj initializerObj = BSON("ok" << 1.0 <<
- "v" << 2 << // needs a version to get this far in initialize()
- "hbmsg" << 4);
- Status result = hbResponse.initialize(initializerObj);
- ASSERT_EQUALS(ErrorCodes::TypeMismatch, result);
- ASSERT_EQUALS("Expected \"hbmsg\" field in response to replSetHeartbeat to "
- "have type String, but found NumberInt32",
- result.reason());
- }
-
- TEST(ReplSetHeartbeatResponse, InitializeSyncingToWrongType) {
- ReplSetHeartbeatResponse hbResponse;
- BSONObj initializerObj = BSON("ok" << 1.0 <<
- "v" << 2 << // needs a version to get this far in initialize()
- "syncingTo" << 4);
- Status result = hbResponse.initialize(initializerObj);
- ASSERT_EQUALS(ErrorCodes::TypeMismatch, result);
- ASSERT_EQUALS("Expected \"syncingTo\" field in response to replSetHeartbeat to "
- "have type String, but found NumberInt32",
- result.reason());
- }
-
- TEST(ReplSetHeartbeatResponse, InitializeConfigWrongType) {
- ReplSetHeartbeatResponse hbResponse;
- BSONObj initializerObj = BSON("ok" << 1.0 <<
- "v" << 2 << // needs a version to get this far in initialize()
- "config" << 4);
- Status result = hbResponse.initialize(initializerObj);
- ASSERT_EQUALS(ErrorCodes::TypeMismatch, result);
- ASSERT_EQUALS("Expected \"config\" in response to replSetHeartbeat to "
- "have type Object, but found NumberInt32",
- result.reason());
- }
-
- TEST(ReplSetHeartbeatResponse, InitializeBadConfig) {
- ReplSetHeartbeatResponse hbResponse;
- BSONObj initializerObj = BSON("ok" << 1.0 <<
- "v" << 2 << // needs a version to get this far in initialize()
- "config" << BSON("illegalFieldName" << 2));
- Status result = hbResponse.initialize(initializerObj);
- ASSERT_EQUALS(ErrorCodes::BadValue, result);
- ASSERT_EQUALS("Unexpected field illegalFieldName in replica set configuration",
- result.reason());
- }
-
- TEST(ReplSetHeartbeatResponse, InitializeBothElectionTimeTypesSameResult) {
- ReplSetHeartbeatResponse hbResponseDate;
- ReplSetHeartbeatResponse hbResponseTimestamp;
- BSONObjBuilder initializerDate;
- BSONObjBuilder initializerTimestamp;
- Date_t electionTime = Date_t(974132);
-
- initializerDate.append("ok", 1.0);
- initializerDate.append("v", 1);
- initializerDate.appendDate("electionTime", electionTime);
- Status result = hbResponseDate.initialize(initializerDate.obj());
- ASSERT_EQUALS(Status::OK(), result);
-
- initializerTimestamp.append("ok", 1.0);
- initializerTimestamp.append("v", 1);
- initializerTimestamp.appendTimestamp("electionTime", electionTime);
- result = hbResponseTimestamp.initialize(initializerTimestamp.obj());
- ASSERT_EQUALS(Status::OK(), result);
-
- ASSERT_EQUALS(hbResponseDate.getElectionTime(), hbResponseTimestamp.getElectionTime());
- }
-
- TEST(ReplSetHeartbeatResponse, InitializeBothOpTimeTypesSameResult) {
- ReplSetHeartbeatResponse hbResponseDate;
- ReplSetHeartbeatResponse hbResponseTimestamp;
- BSONObjBuilder initializerDate;
- BSONObjBuilder initializerTimestamp;
- Date_t opTime = Date_t(974132);
-
- initializerDate.append("ok", 1.0);
- initializerDate.append("v", 1);
- initializerDate.appendDate("opTime", opTime);
- Status result = hbResponseDate.initialize(initializerDate.obj());
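- // (As with "electionTime" above, initialize() accepts either a BSON Date or
- // a BSON Timestamp for "opTime"; both decode to the same OpTime.)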
- ASSERT_EQUALS(Status::OK(), result); - - initializerTimestamp.append("ok", 1.0); - initializerTimestamp.append("v", 1); - initializerTimestamp.appendTimestamp("opTime", opTime); - result = hbResponseTimestamp.initialize(initializerTimestamp.obj()); - ASSERT_EQUALS(Status::OK(), result); - - ASSERT_EQUALS(hbResponseTimestamp.getOpTime(), hbResponseTimestamp.getOpTime()); - } - - TEST(ReplSetHeartbeatResponse, NoConfigStillInitializing) { - ReplSetHeartbeatResponse hbResp; - std::string msg = "still initializing"; - Status result = hbResp.initialize(BSON("ok" << 1.0 << - "rs" << true << - "hbmsg" << msg)); - ASSERT_EQUALS(Status::OK(), result); - ASSERT_EQUALS(true, hbResp.isReplSet()); - ASSERT_EQUALS(msg, hbResp.getHbMsg()); - } - - TEST(ReplSetHeartbeatResponse, InvalidResponseOpTimeMissesConfigVersion) { - ReplSetHeartbeatResponse hbResp; - std::string msg = "still initializing"; - Status result = hbResp.initialize(BSON("ok" << 1.0 << - "opTime" << OpTime())); - ASSERT_EQUALS(ErrorCodes::NoSuchKey, result.code()); - ASSERT_TRUE(stringContains(result.reason(), "\"v\"")) - << result.reason() << " doesn't contain 'v' field required error msg"; - } - - TEST(ReplSetHeartbeatResponse, MismatchedRepliSetNames) { - ReplSetHeartbeatResponse hbResponse; - BSONObj initializerObj = BSON("ok" << 0.0 << "mismatch" << true); - Status result = hbResponse.initialize(initializerObj); - ASSERT_EQUALS(ErrorCodes::InconsistentReplicaSetNames, result.code()); - } - - TEST(ReplSetHeartbeatResponse, AuthFailure) { - ReplSetHeartbeatResponse hbResp; - std::string errMsg = "Unauthorized"; - Status result = hbResp.initialize(BSON("ok" << 0.0 << - "errmsg" << errMsg << - "code" << ErrorCodes::Unauthorized)); - ASSERT_EQUALS(ErrorCodes::Unauthorized, result.code()); - ASSERT_EQUALS(errMsg, result.reason()); - } - - TEST(ReplSetHeartbeatResponse, ServerError) { - ReplSetHeartbeatResponse hbResp; - std::string errMsg = "Random Error"; - Status result = hbResp.initialize(BSON("ok" << 0.0 << "errmsg" << errMsg )); - ASSERT_EQUALS(ErrorCodes::UnknownError, result.code()); - ASSERT_EQUALS(errMsg, result.reason()); - } +using boost::scoped_ptr; +using std::auto_ptr; + +bool stringContains(const std::string& haystack, const std::string& needle) { + return haystack.find(needle) != std::string::npos; +} + +TEST(ReplSetHeartbeatResponse, DefaultConstructThenSlowlyBuildToFullObj) { + int fieldsSet = 2; + ReplSetHeartbeatResponse hbResponse; + ReplSetHeartbeatResponse hbResponseObjRoundTripChecker; + ASSERT_EQUALS(false, hbResponse.hasState()); + ASSERT_EQUALS(false, hbResponse.hasElectionTime()); + ASSERT_EQUALS(false, hbResponse.hasIsElectable()); + ASSERT_EQUALS(false, hbResponse.hasTime()); + ASSERT_EQUALS(false, hbResponse.hasOpTime()); + ASSERT_EQUALS(false, hbResponse.hasConfig()); + ASSERT_EQUALS(false, hbResponse.isMismatched()); + ASSERT_EQUALS(false, hbResponse.isReplSet()); + ASSERT_EQUALS(false, hbResponse.isStateDisagreement()); + ASSERT_EQUALS("", hbResponse.getReplicaSetName()); + ASSERT_EQUALS("", hbResponse.getHbMsg()); + ASSERT_EQUALS("", hbResponse.getSyncingTo()); + ASSERT_EQUALS(-1, hbResponse.getVersion()); + + BSONObj hbResponseObj = hbResponse.toBSON(); + ASSERT_EQUALS(fieldsSet, hbResponseObj.nFields()); + ASSERT_EQUALS("", hbResponseObj["hbmsg"].String()); + + Status initializeResult = Status::OK(); + ASSERT_EQUALS(hbResponseObj.toString(), hbResponseObjRoundTripChecker.toBSON().toString()); + + // set version + hbResponse.setVersion(1); + ++fieldsSet; + ASSERT_EQUALS(false, hbResponse.hasState()); + 
ASSERT_EQUALS(false, hbResponse.hasElectionTime()); + ASSERT_EQUALS(false, hbResponse.hasIsElectable()); + ASSERT_EQUALS(false, hbResponse.hasTime()); + ASSERT_EQUALS(false, hbResponse.hasOpTime()); + ASSERT_EQUALS(false, hbResponse.hasConfig()); + ASSERT_EQUALS(false, hbResponse.isMismatched()); + ASSERT_EQUALS(false, hbResponse.isReplSet()); + ASSERT_EQUALS(false, hbResponse.isStateDisagreement()); + ASSERT_EQUALS("", hbResponse.getReplicaSetName()); + ASSERT_EQUALS("", hbResponse.getHbMsg()); + ASSERT_EQUALS("", hbResponse.getSyncingTo()); + ASSERT_EQUALS(1, hbResponse.getVersion()); + + hbResponseObj = hbResponse.toBSON(); + ASSERT_EQUALS(fieldsSet, hbResponseObj.nFields()); + ASSERT_EQUALS("", hbResponseObj["hbmsg"].String()); + ASSERT_EQUALS(1, hbResponseObj["v"].Number()); + + initializeResult = hbResponseObjRoundTripChecker.initialize(hbResponseObj); + ASSERT_EQUALS(Status::OK(), initializeResult); + ASSERT_EQUALS(hbResponseObj.toString(), hbResponseObjRoundTripChecker.toBSON().toString()); + + // set setname + hbResponse.setSetName("rs0"); + ++fieldsSet; + ASSERT_EQUALS(false, hbResponse.hasState()); + ASSERT_EQUALS(false, hbResponse.hasElectionTime()); + ASSERT_EQUALS(false, hbResponse.hasIsElectable()); + ASSERT_EQUALS(false, hbResponse.hasTime()); + ASSERT_EQUALS(false, hbResponse.hasOpTime()); + ASSERT_EQUALS(false, hbResponse.hasConfig()); + ASSERT_EQUALS(false, hbResponse.isMismatched()); + ASSERT_EQUALS(false, hbResponse.isReplSet()); + ASSERT_EQUALS(false, hbResponse.isStateDisagreement()); + ASSERT_EQUALS("rs0", hbResponse.getReplicaSetName()); + ASSERT_EQUALS("", hbResponse.getHbMsg()); + ASSERT_EQUALS("", hbResponse.getSyncingTo()); + ASSERT_EQUALS(1, hbResponse.getVersion()); + + hbResponseObj = hbResponse.toBSON(); + ASSERT_EQUALS(fieldsSet, hbResponseObj.nFields()); + ASSERT_EQUALS("rs0", hbResponseObj["set"].String()); + ASSERT_EQUALS("", hbResponseObj["hbmsg"].String()); + ASSERT_EQUALS(1, hbResponseObj["v"].Number()); + + initializeResult = hbResponseObjRoundTripChecker.initialize(hbResponseObj); + ASSERT_EQUALS(Status::OK(), initializeResult); + ASSERT_EQUALS(hbResponseObj.toString(), hbResponseObjRoundTripChecker.toBSON().toString()); + + // set electionTime + hbResponse.setElectionTime(OpTime(10, 0)); + ++fieldsSet; + ASSERT_EQUALS(false, hbResponse.hasState()); + ASSERT_EQUALS(true, hbResponse.hasElectionTime()); + ASSERT_EQUALS(false, hbResponse.hasIsElectable()); + ASSERT_EQUALS(false, hbResponse.hasTime()); + ASSERT_EQUALS(false, hbResponse.hasOpTime()); + ASSERT_EQUALS(false, hbResponse.hasConfig()); + ASSERT_EQUALS(false, hbResponse.isMismatched()); + ASSERT_EQUALS(false, hbResponse.isReplSet()); + ASSERT_EQUALS(false, hbResponse.isStateDisagreement()); + ASSERT_EQUALS("rs0", hbResponse.getReplicaSetName()); + ASSERT_EQUALS("", hbResponse.getHbMsg()); + ASSERT_EQUALS("", hbResponse.getSyncingTo()); + ASSERT_EQUALS(1, hbResponse.getVersion()); + ASSERT_EQUALS(OpTime(10, 0), hbResponse.getElectionTime()); + + hbResponseObj = hbResponse.toBSON(); + ASSERT_EQUALS(fieldsSet, hbResponseObj.nFields()); + ASSERT_EQUALS("rs0", hbResponseObj["set"].String()); + ASSERT_EQUALS("", hbResponseObj["hbmsg"].String()); + ASSERT_EQUALS(1, hbResponseObj["v"].Number()); + ASSERT_EQUALS(OpTime(10, 0), hbResponseObj["electionTime"]._opTime()); + + initializeResult = hbResponseObjRoundTripChecker.initialize(hbResponseObj); + ASSERT_EQUALS(Status::OK(), initializeResult); + ASSERT_EQUALS(hbResponseObj.toString(), hbResponseObjRoundTripChecker.toBSON().toString()); + + // set 
opTime + hbResponse.setOpTime(Date_t(10)); + ++fieldsSet; + ASSERT_EQUALS(false, hbResponse.hasState()); + ASSERT_EQUALS(true, hbResponse.hasElectionTime()); + ASSERT_EQUALS(false, hbResponse.hasIsElectable()); + ASSERT_EQUALS(false, hbResponse.hasTime()); + ASSERT_EQUALS(true, hbResponse.hasOpTime()); + ASSERT_EQUALS(false, hbResponse.hasConfig()); + ASSERT_EQUALS(false, hbResponse.isMismatched()); + ASSERT_EQUALS(false, hbResponse.isReplSet()); + ASSERT_EQUALS(false, hbResponse.isStateDisagreement()); + ASSERT_EQUALS("rs0", hbResponse.getReplicaSetName()); + ASSERT_EQUALS("", hbResponse.getHbMsg()); + ASSERT_EQUALS("", hbResponse.getSyncingTo()); + ASSERT_EQUALS(1, hbResponse.getVersion()); + ASSERT_EQUALS(OpTime(10, 0), hbResponse.getElectionTime()); + ASSERT_EQUALS(OpTime(0, 10), hbResponse.getOpTime()); + + hbResponseObj = hbResponse.toBSON(); + ASSERT_EQUALS(fieldsSet, hbResponseObj.nFields()); + ASSERT_EQUALS("rs0", hbResponseObj["set"].String()); + ASSERT_EQUALS("", hbResponseObj["hbmsg"].String()); + ASSERT_EQUALS(1, hbResponseObj["v"].Number()); + ASSERT_EQUALS(OpTime(10, 0), hbResponseObj["electionTime"]._opTime()); + ASSERT_EQUALS(OpTime(0, 10), hbResponseObj["opTime"]._opTime()); + + initializeResult = hbResponseObjRoundTripChecker.initialize(hbResponseObj); + ASSERT_EQUALS(Status::OK(), initializeResult); + ASSERT_EQUALS(hbResponseObj.toString(), hbResponseObjRoundTripChecker.toBSON().toString()); + + // set time + hbResponse.setTime(Seconds(10)); + ++fieldsSet; + ASSERT_EQUALS(false, hbResponse.hasState()); + ASSERT_EQUALS(true, hbResponse.hasElectionTime()); + ASSERT_EQUALS(false, hbResponse.hasIsElectable()); + ASSERT_EQUALS(true, hbResponse.hasTime()); + ASSERT_EQUALS(true, hbResponse.hasOpTime()); + ASSERT_EQUALS(false, hbResponse.hasConfig()); + ASSERT_EQUALS(false, hbResponse.isMismatched()); + ASSERT_EQUALS(false, hbResponse.isReplSet()); + ASSERT_EQUALS(false, hbResponse.isStateDisagreement()); + ASSERT_EQUALS("rs0", hbResponse.getReplicaSetName()); + ASSERT_EQUALS("", hbResponse.getHbMsg()); + ASSERT_EQUALS("", hbResponse.getSyncingTo()); + ASSERT_EQUALS(1, hbResponse.getVersion()); + ASSERT_EQUALS(OpTime(10, 0), hbResponse.getElectionTime()); + ASSERT_EQUALS(OpTime(0, 10), hbResponse.getOpTime()); + ASSERT_EQUALS(10, hbResponse.getTime().total_seconds()); + + hbResponseObj = hbResponse.toBSON(); + ASSERT_EQUALS(fieldsSet, hbResponseObj.nFields()); + ASSERT_EQUALS("rs0", hbResponseObj["set"].String()); + ASSERT_EQUALS("", hbResponseObj["hbmsg"].String()); + ASSERT_EQUALS(1, hbResponseObj["v"].Number()); + ASSERT_EQUALS(OpTime(10, 0), hbResponseObj["electionTime"]._opTime()); + ASSERT_EQUALS(OpTime(0, 10), hbResponseObj["opTime"]._opTime()); + ASSERT_EQUALS(10, hbResponseObj["time"].numberLong()); + + initializeResult = hbResponseObjRoundTripChecker.initialize(hbResponseObj); + ASSERT_EQUALS(Status::OK(), initializeResult); + ASSERT_EQUALS(hbResponseObj.toString(), hbResponseObjRoundTripChecker.toBSON().toString()); + + // set electable + hbResponse.setElectable(true); + ++fieldsSet; + ASSERT_EQUALS(false, hbResponse.hasState()); + ASSERT_EQUALS(true, hbResponse.hasElectionTime()); + ASSERT_EQUALS(true, hbResponse.hasIsElectable()); + ASSERT_EQUALS(true, hbResponse.hasTime()); + ASSERT_EQUALS(true, hbResponse.hasOpTime()); + ASSERT_EQUALS(false, hbResponse.hasConfig()); + ASSERT_EQUALS(false, hbResponse.isMismatched()); + ASSERT_EQUALS(false, hbResponse.isReplSet()); + ASSERT_EQUALS(false, hbResponse.isStateDisagreement()); + ASSERT_EQUALS("rs0", 
hbResponse.getReplicaSetName()); + ASSERT_EQUALS("", hbResponse.getHbMsg()); + ASSERT_EQUALS("", hbResponse.getSyncingTo()); + ASSERT_EQUALS(1, hbResponse.getVersion()); + ASSERT_EQUALS(OpTime(10, 0), hbResponse.getElectionTime()); + ASSERT_EQUALS(OpTime(0, 10), hbResponse.getOpTime()); + ASSERT_EQUALS(10, hbResponse.getTime().total_seconds()); + ASSERT_EQUALS(true, hbResponse.isElectable()); + + hbResponseObj = hbResponse.toBSON(); + ASSERT_EQUALS(fieldsSet, hbResponseObj.nFields()); + ASSERT_EQUALS("rs0", hbResponseObj["set"].String()); + ASSERT_EQUALS("", hbResponseObj["hbmsg"].String()); + ASSERT_EQUALS(1, hbResponseObj["v"].Number()); + ASSERT_EQUALS(OpTime(10, 0), hbResponseObj["electionTime"]._opTime()); + ASSERT_EQUALS(OpTime(0, 10), hbResponseObj["opTime"]._opTime()); + ASSERT_EQUALS(10, hbResponseObj["time"].numberLong()); + ASSERT_EQUALS(true, hbResponseObj["e"].trueValue()); + + initializeResult = hbResponseObjRoundTripChecker.initialize(hbResponseObj); + ASSERT_EQUALS(Status::OK(), initializeResult); + ASSERT_EQUALS(hbResponseObj.toString(), hbResponseObjRoundTripChecker.toBSON().toString()); + + // set config + ReplicaSetConfig config; + hbResponse.setConfig(config); + ++fieldsSet; + ASSERT_EQUALS(false, hbResponse.hasState()); + ASSERT_EQUALS(true, hbResponse.hasElectionTime()); + ASSERT_EQUALS(true, hbResponse.hasIsElectable()); + ASSERT_EQUALS(true, hbResponse.hasTime()); + ASSERT_EQUALS(true, hbResponse.hasOpTime()); + ASSERT_EQUALS(true, hbResponse.hasConfig()); + ASSERT_EQUALS(false, hbResponse.isMismatched()); + ASSERT_EQUALS(false, hbResponse.isReplSet()); + ASSERT_EQUALS(false, hbResponse.isStateDisagreement()); + ASSERT_EQUALS("rs0", hbResponse.getReplicaSetName()); + ASSERT_EQUALS("", hbResponse.getHbMsg()); + ASSERT_EQUALS("", hbResponse.getSyncingTo()); + ASSERT_EQUALS(1, hbResponse.getVersion()); + ASSERT_EQUALS(OpTime(10, 0), hbResponse.getElectionTime()); + ASSERT_EQUALS(OpTime(0, 10), hbResponse.getOpTime()); + ASSERT_EQUALS(10, hbResponse.getTime().total_seconds()); + ASSERT_EQUALS(true, hbResponse.isElectable()); + ASSERT_EQUALS(config.toBSON().toString(), hbResponse.getConfig().toBSON().toString()); + + hbResponseObj = hbResponse.toBSON(); + ASSERT_EQUALS(fieldsSet, hbResponseObj.nFields()); + ASSERT_EQUALS("rs0", hbResponseObj["set"].String()); + ASSERT_EQUALS("", hbResponseObj["hbmsg"].String()); + ASSERT_EQUALS(1, hbResponseObj["v"].Number()); + ASSERT_EQUALS(OpTime(10, 0), hbResponseObj["electionTime"]._opTime()); + ASSERT_EQUALS(OpTime(0, 10), hbResponseObj["opTime"]._opTime()); + ASSERT_EQUALS(10, hbResponseObj["time"].numberLong()); + ASSERT_EQUALS(true, hbResponseObj["e"].trueValue()); + ASSERT_EQUALS(config.toBSON().toString(), hbResponseObj["config"].Obj().toString()); + + initializeResult = hbResponseObjRoundTripChecker.initialize(hbResponseObj); + ASSERT_EQUALS(Status::OK(), initializeResult); + ASSERT_EQUALS(hbResponseObj.toString(), hbResponseObjRoundTripChecker.toBSON().toString()); + + // set state + hbResponse.setState(MemberState(MemberState::RS_SECONDARY)); + ++fieldsSet; + ASSERT_EQUALS(true, hbResponse.hasState()); + ASSERT_EQUALS(true, hbResponse.hasElectionTime()); + ASSERT_EQUALS(true, hbResponse.hasIsElectable()); + ASSERT_EQUALS(true, hbResponse.hasTime()); + ASSERT_EQUALS(true, hbResponse.hasOpTime()); + ASSERT_EQUALS(true, hbResponse.hasConfig()); + ASSERT_EQUALS(false, hbResponse.isMismatched()); + ASSERT_EQUALS(false, hbResponse.isReplSet()); + ASSERT_EQUALS(false, hbResponse.isStateDisagreement()); + ASSERT_EQUALS("rs0", 
hbResponse.getReplicaSetName()); + ASSERT_EQUALS(MemberState(MemberState::RS_SECONDARY).toString(), + hbResponse.getState().toString()); + ASSERT_EQUALS("", hbResponse.getHbMsg()); + ASSERT_EQUALS("", hbResponse.getSyncingTo()); + ASSERT_EQUALS(1, hbResponse.getVersion()); + ASSERT_EQUALS(OpTime(10, 0), hbResponse.getElectionTime()); + ASSERT_EQUALS(OpTime(0, 10), hbResponse.getOpTime()); + ASSERT_EQUALS(10, hbResponse.getTime().total_seconds()); + ASSERT_EQUALS(true, hbResponse.isElectable()); + ASSERT_EQUALS(config.toBSON().toString(), hbResponse.getConfig().toBSON().toString()); + + hbResponseObj = hbResponse.toBSON(); + ASSERT_EQUALS(fieldsSet, hbResponseObj.nFields()); + ASSERT_EQUALS("rs0", hbResponseObj["set"].String()); + ASSERT_EQUALS("", hbResponseObj["hbmsg"].String()); + ASSERT_EQUALS(1, hbResponseObj["v"].Number()); + ASSERT_EQUALS(OpTime(10, 0), hbResponseObj["electionTime"]._opTime()); + ASSERT_EQUALS(OpTime(0, 10), hbResponseObj["opTime"]._opTime()); + ASSERT_EQUALS(10, hbResponseObj["time"].numberLong()); + ASSERT_EQUALS(true, hbResponseObj["e"].trueValue()); + ASSERT_EQUALS(config.toBSON().toString(), hbResponseObj["config"].Obj().toString()); + ASSERT_EQUALS(2, hbResponseObj["state"].numberLong()); + + initializeResult = hbResponseObjRoundTripChecker.initialize(hbResponseObj); + ASSERT_EQUALS(Status::OK(), initializeResult); + ASSERT_EQUALS(hbResponseObj.toString(), hbResponseObjRoundTripChecker.toBSON().toString()); + + // set stateDisagreement + hbResponse.noteStateDisagreement(); + ++fieldsSet; + ASSERT_EQUALS(true, hbResponse.hasState()); + ASSERT_EQUALS(true, hbResponse.hasElectionTime()); + ASSERT_EQUALS(true, hbResponse.hasIsElectable()); + ASSERT_EQUALS(true, hbResponse.hasTime()); + ASSERT_EQUALS(true, hbResponse.hasOpTime()); + ASSERT_EQUALS(true, hbResponse.hasConfig()); + ASSERT_EQUALS(false, hbResponse.isMismatched()); + ASSERT_EQUALS(false, hbResponse.isReplSet()); + ASSERT_EQUALS(true, hbResponse.isStateDisagreement()); + ASSERT_EQUALS("rs0", hbResponse.getReplicaSetName()); + ASSERT_EQUALS(MemberState(MemberState::RS_SECONDARY).toString(), + hbResponse.getState().toString()); + ASSERT_EQUALS("", hbResponse.getHbMsg()); + ASSERT_EQUALS("", hbResponse.getSyncingTo()); + ASSERT_EQUALS(1, hbResponse.getVersion()); + ASSERT_EQUALS(OpTime(10, 0), hbResponse.getElectionTime()); + ASSERT_EQUALS(OpTime(0, 10), hbResponse.getOpTime()); + ASSERT_EQUALS(10, hbResponse.getTime().total_seconds()); + ASSERT_EQUALS(true, hbResponse.isElectable()); + ASSERT_EQUALS(config.toBSON().toString(), hbResponse.getConfig().toBSON().toString()); + + hbResponseObj = hbResponse.toBSON(); + ASSERT_EQUALS(fieldsSet, hbResponseObj.nFields()); + ASSERT_EQUALS("rs0", hbResponseObj["set"].String()); + ASSERT_EQUALS("", hbResponseObj["hbmsg"].String()); + ASSERT_EQUALS(1, hbResponseObj["v"].Number()); + ASSERT_EQUALS(OpTime(10, 0), hbResponseObj["electionTime"]._opTime()); + ASSERT_EQUALS(OpTime(0, 10), hbResponseObj["opTime"]._opTime()); + ASSERT_EQUALS(10, hbResponseObj["time"].numberLong()); + ASSERT_EQUALS(true, hbResponseObj["e"].trueValue()); + ASSERT_EQUALS(config.toBSON().toString(), hbResponseObj["config"].Obj().toString()); + ASSERT_EQUALS(2, hbResponseObj["state"].numberLong()); + ASSERT_EQUALS(false, hbResponseObj["mismatch"].trueValue()); + ASSERT_EQUALS(true, hbResponseObj["stateDisagreement"].trueValue()); + + initializeResult = hbResponseObjRoundTripChecker.initialize(hbResponseObj); + ASSERT_EQUALS(Status::OK(), initializeResult); + ASSERT_EQUALS(hbResponseObj.toString(), 
hbResponseObjRoundTripChecker.toBSON().toString()); + + // set replSet + hbResponse.noteReplSet(); + ++fieldsSet; + ASSERT_EQUALS(true, hbResponse.hasState()); + ASSERT_EQUALS(true, hbResponse.hasElectionTime()); + ASSERT_EQUALS(true, hbResponse.hasIsElectable()); + ASSERT_EQUALS(true, hbResponse.hasTime()); + ASSERT_EQUALS(true, hbResponse.hasOpTime()); + ASSERT_EQUALS(true, hbResponse.hasConfig()); + ASSERT_EQUALS(false, hbResponse.isMismatched()); + ASSERT_EQUALS(true, hbResponse.isReplSet()); + ASSERT_EQUALS(true, hbResponse.isStateDisagreement()); + ASSERT_EQUALS("rs0", hbResponse.getReplicaSetName()); + ASSERT_EQUALS(MemberState(MemberState::RS_SECONDARY).toString(), + hbResponse.getState().toString()); + ASSERT_EQUALS("", hbResponse.getHbMsg()); + ASSERT_EQUALS("", hbResponse.getSyncingTo()); + ASSERT_EQUALS(1, hbResponse.getVersion()); + ASSERT_EQUALS(OpTime(10, 0), hbResponse.getElectionTime()); + ASSERT_EQUALS(OpTime(0, 10), hbResponse.getOpTime()); + ASSERT_EQUALS(10, hbResponse.getTime().total_seconds()); + ASSERT_EQUALS(true, hbResponse.isElectable()); + ASSERT_EQUALS(config.toBSON().toString(), hbResponse.getConfig().toBSON().toString()); + + hbResponseObj = hbResponse.toBSON(); + ASSERT_EQUALS(fieldsSet, hbResponseObj.nFields()); + ASSERT_EQUALS("rs0", hbResponseObj["set"].String()); + ASSERT_EQUALS("", hbResponseObj["hbmsg"].String()); + ASSERT_EQUALS(1, hbResponseObj["v"].Number()); + ASSERT_EQUALS(OpTime(10, 0), hbResponseObj["electionTime"]._opTime()); + ASSERT_EQUALS(OpTime(0, 10), hbResponseObj["opTime"]._opTime()); + ASSERT_EQUALS(10, hbResponseObj["time"].numberLong()); + ASSERT_EQUALS(true, hbResponseObj["e"].trueValue()); + ASSERT_EQUALS(config.toBSON().toString(), hbResponseObj["config"].Obj().toString()); + ASSERT_EQUALS(2, hbResponseObj["state"].numberLong()); + ASSERT_EQUALS(false, hbResponseObj["mismatch"].trueValue()); + ASSERT_EQUALS(true, hbResponseObj["stateDisagreement"].trueValue()); + ASSERT_EQUALS(true, hbResponseObj["rs"].trueValue()); + + initializeResult = hbResponseObjRoundTripChecker.initialize(hbResponseObj); + ASSERT_EQUALS(Status::OK(), initializeResult); + ASSERT_EQUALS(hbResponseObj.toString(), hbResponseObjRoundTripChecker.toBSON().toString()); + + // set syncingTo + hbResponse.setSyncingTo("syncTarget"); + ++fieldsSet; + ASSERT_EQUALS(true, hbResponse.hasState()); + ASSERT_EQUALS(true, hbResponse.hasElectionTime()); + ASSERT_EQUALS(true, hbResponse.hasIsElectable()); + ASSERT_EQUALS(true, hbResponse.hasTime()); + ASSERT_EQUALS(true, hbResponse.hasOpTime()); + ASSERT_EQUALS(true, hbResponse.hasConfig()); + ASSERT_EQUALS(false, hbResponse.isMismatched()); + ASSERT_EQUALS(true, hbResponse.isReplSet()); + ASSERT_EQUALS(true, hbResponse.isStateDisagreement()); + ASSERT_EQUALS("rs0", hbResponse.getReplicaSetName()); + ASSERT_EQUALS(MemberState(MemberState::RS_SECONDARY).toString(), + hbResponse.getState().toString()); + ASSERT_EQUALS("", hbResponse.getHbMsg()); + ASSERT_EQUALS("syncTarget", hbResponse.getSyncingTo()); + ASSERT_EQUALS(1, hbResponse.getVersion()); + ASSERT_EQUALS(OpTime(10, 0), hbResponse.getElectionTime()); + ASSERT_EQUALS(OpTime(0, 10), hbResponse.getOpTime()); + ASSERT_EQUALS(10, hbResponse.getTime().total_seconds()); + ASSERT_EQUALS(true, hbResponse.isElectable()); + ASSERT_EQUALS(config.toBSON().toString(), hbResponse.getConfig().toBSON().toString()); + + hbResponseObj = hbResponse.toBSON(); + ASSERT_EQUALS(fieldsSet, hbResponseObj.nFields()); + ASSERT_EQUALS("rs0", hbResponseObj["set"].String()); + ASSERT_EQUALS("", 
hbResponseObj["hbmsg"].String()); + ASSERT_EQUALS(1, hbResponseObj["v"].Number()); + ASSERT_EQUALS(OpTime(10, 0), hbResponseObj["electionTime"]._opTime()); + ASSERT_EQUALS(OpTime(0, 10), hbResponseObj["opTime"]._opTime()); + ASSERT_EQUALS(10, hbResponseObj["time"].numberLong()); + ASSERT_EQUALS(true, hbResponseObj["e"].trueValue()); + ASSERT_EQUALS(config.toBSON().toString(), hbResponseObj["config"].Obj().toString()); + ASSERT_EQUALS(2, hbResponseObj["state"].numberLong()); + ASSERT_EQUALS(false, hbResponseObj["mismatch"].trueValue()); + ASSERT_EQUALS(true, hbResponseObj["stateDisagreement"].trueValue()); + ASSERT_EQUALS(true, hbResponseObj["rs"].trueValue()); + ASSERT_EQUALS("syncTarget", hbResponseObj["syncingTo"].String()); + + initializeResult = hbResponseObjRoundTripChecker.initialize(hbResponseObj); + ASSERT_EQUALS(Status::OK(), initializeResult); + ASSERT_EQUALS(hbResponseObj.toString(), hbResponseObjRoundTripChecker.toBSON().toString()); + + // set hbmsg + hbResponse.setHbMsg("lub dub"); + ASSERT_EQUALS(true, hbResponse.hasState()); + ASSERT_EQUALS(true, hbResponse.hasElectionTime()); + ASSERT_EQUALS(true, hbResponse.hasIsElectable()); + ASSERT_EQUALS(true, hbResponse.hasTime()); + ASSERT_EQUALS(true, hbResponse.hasOpTime()); + ASSERT_EQUALS(true, hbResponse.hasConfig()); + ASSERT_EQUALS(false, hbResponse.isMismatched()); + ASSERT_EQUALS(true, hbResponse.isReplSet()); + ASSERT_EQUALS(true, hbResponse.isStateDisagreement()); + ASSERT_EQUALS("rs0", hbResponse.getReplicaSetName()); + ASSERT_EQUALS(MemberState(MemberState::RS_SECONDARY).toString(), + hbResponse.getState().toString()); + ASSERT_EQUALS("lub dub", hbResponse.getHbMsg()); + ASSERT_EQUALS("syncTarget", hbResponse.getSyncingTo()); + ASSERT_EQUALS(1, hbResponse.getVersion()); + ASSERT_EQUALS(OpTime(10, 0), hbResponse.getElectionTime()); + ASSERT_EQUALS(OpTime(0, 10), hbResponse.getOpTime()); + ASSERT_EQUALS(10, hbResponse.getTime().total_seconds()); + ASSERT_EQUALS(true, hbResponse.isElectable()); + ASSERT_EQUALS(config.toBSON().toString(), hbResponse.getConfig().toBSON().toString()); + + hbResponseObj = hbResponse.toBSON(); + ASSERT_EQUALS(fieldsSet, hbResponseObj.nFields()); + ASSERT_EQUALS("rs0", hbResponseObj["set"].String()); + ASSERT_EQUALS("lub dub", hbResponseObj["hbmsg"].String()); + ASSERT_EQUALS(1, hbResponseObj["v"].Number()); + ASSERT_EQUALS(OpTime(10, 0), hbResponseObj["electionTime"]._opTime()); + ASSERT_EQUALS(OpTime(0, 10), hbResponseObj["opTime"]._opTime()); + ASSERT_EQUALS(10, hbResponseObj["time"].numberLong()); + ASSERT_EQUALS(true, hbResponseObj["e"].trueValue()); + ASSERT_EQUALS(config.toBSON().toString(), hbResponseObj["config"].Obj().toString()); + ASSERT_EQUALS(2, hbResponseObj["state"].numberLong()); + ASSERT_EQUALS(false, hbResponseObj["mismatch"].trueValue()); + ASSERT_EQUALS(true, hbResponseObj["stateDisagreement"].trueValue()); + ASSERT_EQUALS(true, hbResponseObj["rs"].trueValue()); + ASSERT_EQUALS("syncTarget", hbResponseObj["syncingTo"].String()); + + initializeResult = hbResponseObjRoundTripChecker.initialize(hbResponseObj); + ASSERT_EQUALS(Status::OK(), initializeResult); + ASSERT_EQUALS(hbResponseObj.toString(), hbResponseObjRoundTripChecker.toBSON().toString()); + + // set mismatched + hbResponse.noteMismatched(); + ASSERT_EQUALS(true, hbResponse.hasState()); + ASSERT_EQUALS(true, hbResponse.hasElectionTime()); + ASSERT_EQUALS(true, hbResponse.hasIsElectable()); + ASSERT_EQUALS(true, hbResponse.hasTime()); + ASSERT_EQUALS(true, hbResponse.hasOpTime()); + ASSERT_EQUALS(true, 
hbResponse.hasConfig());
+    ASSERT_EQUALS(true, hbResponse.isMismatched());
+    ASSERT_EQUALS(true, hbResponse.isReplSet());
+    ASSERT_EQUALS(true, hbResponse.isStateDisagreement());
+    ASSERT_EQUALS("rs0", hbResponse.getReplicaSetName());
+    ASSERT_EQUALS(MemberState(MemberState::RS_SECONDARY).toString(),
+                  hbResponse.getState().toString());
+    ASSERT_EQUALS("lub dub", hbResponse.getHbMsg());
+    ASSERT_EQUALS("syncTarget", hbResponse.getSyncingTo());
+    ASSERT_EQUALS(1, hbResponse.getVersion());
+    ASSERT_EQUALS(OpTime(10, 0), hbResponse.getElectionTime());
+    ASSERT_EQUALS(OpTime(0, 10), hbResponse.getOpTime());
+    ASSERT_EQUALS(10, hbResponse.getTime().total_seconds());
+    ASSERT_EQUALS(true, hbResponse.isElectable());
+    ASSERT_EQUALS(config.toBSON().toString(), hbResponse.getConfig().toBSON().toString());
+
+    hbResponseObj = hbResponse.toBSON();
+    ASSERT_EQUALS(2, hbResponseObj.nFields());
+    ASSERT_EQUALS(true, hbResponseObj["mismatch"].trueValue());
+
+    // NOTE: Does not check round-trip. Once noteMismatched is set the bson will return an error
+    // from initialize parsing.
+    initializeResult = hbResponseObjRoundTripChecker.initialize(hbResponseObj);
+    ASSERT_NOT_EQUALS(Status::OK(), initializeResult);
+    ASSERT_EQUALS(ErrorCodes::InconsistentReplicaSetNames, initializeResult.code());
+}
+
+TEST(ReplSetHeartbeatResponse, InitializeWrongElectionTimeType) {
+    ReplSetHeartbeatResponse hbResponse;
+    BSONObj initializerObj = BSON("ok" << 1.0 << "electionTime"
+                                       << "hello");
+    Status result = hbResponse.initialize(initializerObj);
+    ASSERT_EQUALS(ErrorCodes::TypeMismatch, result);
+    ASSERT_EQUALS(
+        "Expected \"electionTime\" field in response to replSetHeartbeat command to "
+        "have type Date or Timestamp, but found type String",
+        result.reason());
+}
+
+TEST(ReplSetHeartbeatResponse, InitializeWrongTimeType) {
+    ReplSetHeartbeatResponse hbResponse;
+    BSONObj initializerObj = BSON("ok" << 1.0 << "time"
+                                       << "hello");
+    Status result = hbResponse.initialize(initializerObj);
+    ASSERT_EQUALS(ErrorCodes::TypeMismatch, result);
+    ASSERT_EQUALS(
+        "Expected \"time\" field in response to replSetHeartbeat command to "
+        "have a numeric type, but found type String",
+        result.reason());
+}
+
+TEST(ReplSetHeartbeatResponse, InitializeWrongOpTimeType) {
+    ReplSetHeartbeatResponse hbResponse;
+    BSONObj initializerObj = BSON("ok" << 1.0 << "opTime"
+                                       << "hello");
+    Status result = hbResponse.initialize(initializerObj);
+    ASSERT_EQUALS(ErrorCodes::TypeMismatch, result);
+    ASSERT_EQUALS(
+        "Expected \"opTime\" field in response to replSetHeartbeat command to "
+        "have type Date or Timestamp, but found type String",
+        result.reason());
+}
+
+TEST(ReplSetHeartbeatResponse, InitializeMemberStateWrongType) {
+    ReplSetHeartbeatResponse hbResponse;
+    BSONObj initializerObj = BSON("ok" << 1.0 << "state"
+                                       << "hello");
+    Status result = hbResponse.initialize(initializerObj);
+    ASSERT_EQUALS(ErrorCodes::TypeMismatch, result);
+    ASSERT_EQUALS(
+        "Expected \"state\" field in response to replSetHeartbeat command to "
+        "have type NumberInt or NumberLong, but found type String",
+        result.reason());
+}
+
+TEST(ReplSetHeartbeatResponse, InitializeMemberStateTooLow) {
+    ReplSetHeartbeatResponse hbResponse;
+    BSONObj initializerObj = BSON("ok" << 1.0 << "state" << -1);
+    Status result = hbResponse.initialize(initializerObj);
+    ASSERT_EQUALS(ErrorCodes::BadValue, result);
+    ASSERT_EQUALS(
+        "Value for \"state\" in response to replSetHeartbeat is out of range; "
+        "legal values are non-negative and no more than 10",
+        result.reason());
+}
+
+TEST(ReplSetHeartbeatResponse, InitializeMemberStateTooHigh) {
+    ReplSetHeartbeatResponse hbResponse;
+    BSONObj initializerObj = BSON("ok" << 1.0 << "state" << 11);
+    Status result = hbResponse.initialize(initializerObj);
+    ASSERT_EQUALS(ErrorCodes::BadValue, result);
+    ASSERT_EQUALS(
+        "Value for \"state\" in response to replSetHeartbeat is out of range; "
+        "legal values are non-negative and no more than 10",
+        result.reason());
+}
+
+TEST(ReplSetHeartbeatResponse, InitializeVersionWrongType) {
+    ReplSetHeartbeatResponse hbResponse;
+    BSONObj initializerObj = BSON("ok" << 1.0 << "v"
+                                       << "hello");
+    Status result = hbResponse.initialize(initializerObj);
+    ASSERT_EQUALS(ErrorCodes::TypeMismatch, result);
+    ASSERT_EQUALS(
+        "Expected \"v\" field in response to replSetHeartbeat to "
+        "have type NumberInt, but found String",
+        result.reason());
+}
+
+TEST(ReplSetHeartbeatResponse, InitializeReplSetNameWrongType) {
+    ReplSetHeartbeatResponse hbResponse;
+    BSONObj initializerObj =
+        BSON("ok" << 1.0 << "v" << 2 <<  // needs a version to get this far in initialize()
+             "set" << 4);
+    Status result = hbResponse.initialize(initializerObj);
+    ASSERT_EQUALS(ErrorCodes::TypeMismatch, result);
+    ASSERT_EQUALS(
+        "Expected \"set\" field in response to replSetHeartbeat to "
+        "have type String, but found NumberInt32",
+        result.reason());
+}
+
+TEST(ReplSetHeartbeatResponse, InitializeHeartbeatMessageWrongType) {
+    ReplSetHeartbeatResponse hbResponse;
+    BSONObj initializerObj =
+        BSON("ok" << 1.0 << "v" << 2 <<  // needs a version to get this far in initialize()
+             "hbmsg" << 4);
+    Status result = hbResponse.initialize(initializerObj);
+    ASSERT_EQUALS(ErrorCodes::TypeMismatch, result);
+    ASSERT_EQUALS(
+        "Expected \"hbmsg\" field in response to replSetHeartbeat to "
+        "have type String, but found NumberInt32",
+        result.reason());
+}
+
+TEST(ReplSetHeartbeatResponse, InitializeSyncingToWrongType) {
+    ReplSetHeartbeatResponse hbResponse;
+    BSONObj initializerObj =
+        BSON("ok" << 1.0 << "v" << 2 <<  // needs a version to get this far in initialize()
+             "syncingTo" << 4);
+    Status result = hbResponse.initialize(initializerObj);
+    ASSERT_EQUALS(ErrorCodes::TypeMismatch, result);
+    ASSERT_EQUALS(
+        "Expected \"syncingTo\" field in response to replSetHeartbeat to "
+        "have type String, but found NumberInt32",
+        result.reason());
+}
+
+TEST(ReplSetHeartbeatResponse, InitializeConfigWrongType) {
+    ReplSetHeartbeatResponse hbResponse;
+    BSONObj initializerObj =
+        BSON("ok" << 1.0 << "v" << 2 <<  // needs a version to get this far in initialize()
+             "config" << 4);
+    Status result = hbResponse.initialize(initializerObj);
+    ASSERT_EQUALS(ErrorCodes::TypeMismatch, result);
+    ASSERT_EQUALS(
+        "Expected \"config\" in response to replSetHeartbeat to "
+        "have type Object, but found NumberInt32",
+        result.reason());
+}
+
+TEST(ReplSetHeartbeatResponse, InitializeBadConfig) {
+    ReplSetHeartbeatResponse hbResponse;
+    BSONObj initializerObj =
+        BSON("ok" << 1.0 << "v" << 2 <<  // needs a version to get this far in initialize()
+             "config" << BSON("illegalFieldName" << 2));
+    Status result = hbResponse.initialize(initializerObj);
+    ASSERT_EQUALS(ErrorCodes::BadValue, result);
+    ASSERT_EQUALS("Unexpected field illegalFieldName in replica set configuration",
+                  result.reason());
+}
+
+TEST(ReplSetHeartbeatResponse, InitializeBothElectionTimeTypesSameResult) {
+    ReplSetHeartbeatResponse hbResponseDate;
+    ReplSetHeartbeatResponse hbResponseTimestamp;
+    BSONObjBuilder initializerDate;
+    BSONObjBuilder initializerTimestamp;
+    Date_t electionTime = Date_t(974132);
+
+    initializerDate.append("ok", 1.0);
+    initializerDate.append("v", 1);
+    initializerDate.appendDate("electionTime", electionTime);
+    Status result = hbResponseDate.initialize(initializerDate.obj());
+    ASSERT_EQUALS(Status::OK(), result);
+
+    initializerTimestamp.append("ok", 1.0);
+    initializerTimestamp.append("v", 1);
+    initializerTimestamp.appendTimestamp("electionTime", electionTime);
+    result = hbResponseTimestamp.initialize(initializerTimestamp.obj());
+    ASSERT_EQUALS(Status::OK(), result);
+
+    ASSERT_EQUALS(hbResponseDate.getElectionTime(), hbResponseTimestamp.getElectionTime());
+}
+
+TEST(ReplSetHeartbeatResponse, InitializeBothOpTimeTypesSameResult) {
+    ReplSetHeartbeatResponse hbResponseDate;
+    ReplSetHeartbeatResponse hbResponseTimestamp;
+    BSONObjBuilder initializerDate;
+    BSONObjBuilder initializerTimestamp;
+    Date_t opTime = Date_t(974132);
+
+    initializerDate.append("ok", 1.0);
+    initializerDate.append("v", 1);
+    initializerDate.appendDate("opTime", opTime);
+    Status result = hbResponseDate.initialize(initializerDate.obj());
+    ASSERT_EQUALS(Status::OK(), result);
+
+    initializerTimestamp.append("ok", 1.0);
+    initializerTimestamp.append("v", 1);
+    initializerTimestamp.appendTimestamp("opTime", opTime);
+    result = hbResponseTimestamp.initialize(initializerTimestamp.obj());
+    ASSERT_EQUALS(Status::OK(), result);
+
+    ASSERT_EQUALS(hbResponseDate.getOpTime(), hbResponseTimestamp.getOpTime());
+}
+
+TEST(ReplSetHeartbeatResponse, NoConfigStillInitializing) {
+    ReplSetHeartbeatResponse hbResp;
+    std::string msg = "still initializing";
+    Status result = hbResp.initialize(BSON("ok" << 1.0 << "rs" << true << "hbmsg" << msg));
+    ASSERT_EQUALS(Status::OK(), result);
+    ASSERT_EQUALS(true, hbResp.isReplSet());
+    ASSERT_EQUALS(msg, hbResp.getHbMsg());
+}
+
+TEST(ReplSetHeartbeatResponse, InvalidResponseOpTimeMissesConfigVersion) {
+    ReplSetHeartbeatResponse hbResp;
+    std::string msg = "still initializing";
+    Status result = hbResp.initialize(BSON("ok" << 1.0 << "opTime" << OpTime()));
+    ASSERT_EQUALS(ErrorCodes::NoSuchKey, result.code());
+    ASSERT_TRUE(stringContains(result.reason(), "\"v\""))
+        << result.reason() << " doesn't contain 'v' field required error msg";
+}
+
+TEST(ReplSetHeartbeatResponse, MismatchedReplicaSetNames) {
+    ReplSetHeartbeatResponse hbResponse;
+    BSONObj initializerObj = BSON("ok" << 0.0 << "mismatch" << true);
+    Status result = hbResponse.initialize(initializerObj);
+    ASSERT_EQUALS(ErrorCodes::InconsistentReplicaSetNames, result.code());
+}
+
+TEST(ReplSetHeartbeatResponse, AuthFailure) {
+    ReplSetHeartbeatResponse hbResp;
+    std::string errMsg = "Unauthorized";
+    Status result = hbResp.initialize(
+        BSON("ok" << 0.0 << "errmsg" << errMsg << "code" << ErrorCodes::Unauthorized));
+    ASSERT_EQUALS(ErrorCodes::Unauthorized, result.code());
+    ASSERT_EQUALS(errMsg, result.reason());
+}
+
+TEST(ReplSetHeartbeatResponse, ServerError) {
+    ReplSetHeartbeatResponse hbResp;
+    std::string errMsg = "Random Error";
+    Status result = hbResp.initialize(BSON("ok" << 0.0 << "errmsg" << errMsg));
+    ASSERT_EQUALS(ErrorCodes::UnknownError, result.code());
+    ASSERT_EQUALS(errMsg, result.reason());
+}

}  // namespace
}  // namespace repl
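A note for readers working through the test file above: every "set X" block in DefaultConstructThenSlowlyBuildToFullObj repeats one round-trip pattern. Reduced to a minimal sketch that reuses only calls appearing in this file:

    ReplSetHeartbeatResponse original;
    original.setVersion(1);
    original.setSetName("rs0");
    original.setElectionTime(OpTime(10, 0));

    BSONObj wire = original.toBSON();         // serialize, as if sent over the network
    ReplSetHeartbeatResponse parsed;
    Status status = parsed.initialize(wire);  // parse, as the receiving node would
    ASSERT_EQUALS(Status::OK(), status);
    ASSERT_EQUALS(wire.toString(), parsed.toBSON().toString());

The deliberate exception is the final mismatch step: once noteMismatched() is called, initialize() rejects the resulting BSON with InconsistentReplicaSetNames, so that state cannot round-trip.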
diff --git a/src/mongo/db/repl/repl_set_html_summary.cpp b/src/mongo/db/repl/repl_set_html_summary.cpp
index ec7d54e9748..227f96228a8 100644
--- a/src/mongo/db/repl/repl_set_html_summary.cpp
+++ b/src/mongo/db/repl/repl_set_html_summary.cpp
@@ -42,187 +42,180 @@ namespace mongo {
 namespace repl {
-    ReplSetHtmlSummary::ReplSetHtmlSummary() : _selfIndex(-1), _primaryIndex(-1), _selfUptime(0) {}
+ReplSetHtmlSummary::ReplSetHtmlSummary() : _selfIndex(-1), _primaryIndex(-1), _selfUptime(0) {}

 namespace {
-    template<class T>
-    std::string ToString(const T& t) {
-        str::stream s;
-        s << t;
-        return s;
-    }
+template <class T>
+std::string ToString(const T& t) {
+    str::stream s;
+    s << t;
+    return s;
+}

-    /**
-     * Turns an unsigned int representing a duration of time in milliseconds and turns it into
-     * a human readable time string representation.
-     */
-    std::string ago(unsigned int duration) {
-        std::stringstream s;
-        if( duration < 180 ) {
-            s << duration << " sec";
-            if( duration != 1 ) s << 's';
-        }
-        else if( duration < 3600 ) {
-            s.precision(2);
-            s << duration / 60.0 << " mins";
-        }
-        else {
-            s.precision(2);
-            s << duration / 3600.0 << " hrs";
-        }
-        return s.str();
-    }
+/**
+ * Takes an unsigned int representing a duration of time in seconds and turns it into
+ * a human readable time string representation.
+ */
+std::string ago(unsigned int duration) {
+    std::stringstream s;
+    if (duration < 180) {
+        s << duration << " sec";
+        if (duration != 1)
+            s << 's';
+    } else if (duration < 3600) {
+        s.precision(2);
+        s << duration / 60.0 << " mins";
+    } else {
+        s.precision(2);
+        s << duration / 3600.0 << " hrs";
+    }
+    return s.str();
+}
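// For reference: ago() takes a duration already converted to seconds (timeDifference() below
// does the millis-to-secs division) and renders it for the status page, for example:
//   ago(1)    -> "1 sec"
//   ago(45)   -> "45 secs"
//   ago(600)  -> "10 mins"   (precision(2), i.e. two significant digits)
//   ago(7200) -> "2 hrs"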
-    unsigned int timeDifference(Date_t now, Date_t past) {
-        return static_cast<unsigned int> ((past ?
-                (now - past) / 1000 /* convert millis to secs */ : 0));
-    }
+unsigned int timeDifference(Date_t now, Date_t past) {
+    return static_cast<unsigned int>((past ? (now - past) / 1000 /* convert millis to secs */ : 0));
+}

-    std::string stateAsHtml(const MemberState& s) {
-        using namespace html;
-
-        if( s.s == MemberState::RS_STARTUP )
-            return a("",
-                     "server still starting up, or still trying to initiate the set",
-                     "STARTUP");
-        if( s.s == MemberState::RS_PRIMARY )
-            return a("", "this server thinks it is primary", "PRIMARY");
-        if( s.s == MemberState::RS_SECONDARY )
-            return a("", "this server thinks it is a secondary (slave mode)", "SECONDARY");
-        if( s.s == MemberState::RS_RECOVERING )
-            return a("",
-                     "recovering/resyncing; after recovery usually auto-transitions to secondary",
-                     "RECOVERING");
-        if( s.s == MemberState::RS_STARTUP2 )
-            return a("", "loaded config, still determining who is primary", "STARTUP2");
-        if( s.s == MemberState::RS_ARBITER )
-            return a("", "this server is an arbiter only", "ARBITER");
-        if( s.s == MemberState::RS_DOWN )
-            return a("", "member is down, slow, or unreachable", "DOWN");
-        if( s.s == MemberState::RS_ROLLBACK )
-            return a("", "rolling back operations to get in sync", "ROLLBACK");
-        if( s.s == MemberState::RS_UNKNOWN)
-            return a("", "we do not know what state this node is in", "UNKNOWN");
-        if( s.s == MemberState::RS_REMOVED)
-            return a("", "this server has been removed from the replica set config", "ROLLBACK");
-        return "";
-    }
+std::string stateAsHtml(const MemberState& s) {
+    using namespace html;
+
+    if (s.s == MemberState::RS_STARTUP)
+        return a("", "server still starting up, or still trying to initiate the set", "STARTUP");
+    if (s.s == MemberState::RS_PRIMARY)
+        return a("", "this server thinks it is primary", "PRIMARY");
+    if (s.s == MemberState::RS_SECONDARY)
+        return a("", "this server thinks it is a secondary (slave mode)", "SECONDARY");
+    if (s.s == MemberState::RS_RECOVERING)
+        return a("",
+                 "recovering/resyncing; after recovery usually auto-transitions to secondary",
+                 "RECOVERING");
+    if (s.s == MemberState::RS_STARTUP2)
+        return a("", "loaded config, still determining who is primary", "STARTUP2");
+    if (s.s == MemberState::RS_ARBITER)
+        return a("", "this server is an arbiter only", "ARBITER");
+    if (s.s == MemberState::RS_DOWN)
+        return a("", "member is down, slow, or unreachable", "DOWN");
+    if (s.s == MemberState::RS_ROLLBACK)
+        return a("", "rolling back operations to get in sync", "ROLLBACK");
+    if (s.s == MemberState::RS_UNKNOWN)
+        return a("", "we do not know what state this node is in", "UNKNOWN");
+    if (s.s == MemberState::RS_REMOVED)
+        return a("", "this server has been removed from the replica set config", "REMOVED");
+    return "";
+}
 }
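// For reference: every branch of stateAsHtml() renders a tooltip link through html::a(),
// whose argument order (href, title, text) is inferred from the calls above, e.g.
//   stateAsHtml(MemberState(MemberState::RS_PRIMARY))
// produces roughly <a title="this server thinks it is primary">PRIMARY</a>.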
-    const std::string ReplSetHtmlSummary::toHtmlString() const {
-        using namespace html;
-
-        std::stringstream s;
-
-        if (!_config.isInitialized()) {
-            s << p("Still starting up, or else replset is not yet initiated.");
-            return s.str();
-        }
-        if (_selfIndex < 0) {
-            s << p("This node is not a member of its replica set configuration, it most likely was"
-                   " removed recently");
-            return s.str();
-        }
+const std::string ReplSetHtmlSummary::toHtmlString() const {
+    using namespace html;
+
+    std::stringstream s;
+
+    if (!_config.isInitialized()) {
+        s << p("Still starting up, or else replset is not yet initiated.");
+        return s.str();
+    }
+    if (_selfIndex < 0) {
+        s << p(
+            "This node is not a member of its replica set configuration, it most likely was"
+            " removed recently");
+        return s.str();
+    }

-        int votesUp = 0;
-        int totalVotes = 0;
-        // Build table of node information.
-        std::stringstream memberTable;
-        const char *h[] =
-            {"Member",
-             "<a title=\"member id in the replset config\">id</a>",
-             "Up",
-             "<a title=\"length of time we have been continuously connected to the other member "
-             "with no reconnects (for self, shows uptime)\">cctime</a>",
-             "<a title=\"when this server last received a heartbeat response - includes error code "
-             "responses\">Last heartbeat</a>",
-             "Votes",
-             "Priority",
-             "State",
-             "Messages",
-             "<a title=\"how up to date this server is. this value polled every few seconds so "
-             "actually lag is typically lower than value shown here.\">optime</a>",
-             0
-            };
-        memberTable << table(h);
-
-        for (int i = 0; i < _config.getNumMembers(); ++i) {
-            const MemberConfig& memberConfig = _config.getMemberAt(i);
-            const MemberHeartbeatData& memberHB = _hbData[i];
-            bool isSelf = _selfIndex == i;
-            bool up = memberHB.getHealth() > 0;
-
-            totalVotes += memberConfig.getNumVotes();
-            if (up || isSelf) {
-                votesUp += memberConfig.getNumVotes();
-            }
+    int votesUp = 0;
+    int totalVotes = 0;
+    // Build table of node information.
+    std::stringstream memberTable;
+    const char* h[] = {
+        "Member",
+        "<a title=\"member id in the replset config\">id</a>",
+        "Up",
+        "<a title=\"length of time we have been continuously connected to the other member "
+        "with no reconnects (for self, shows uptime)\">cctime</a>",
+        "<a title=\"when this server last received a heartbeat response - includes error code "
+        "responses\">Last heartbeat</a>",
+        "Votes",
+        "Priority",
+        "State",
+        "Messages",
+        "<a title=\"how up to date this server is. this value is polled every few seconds so "
+        "actual lag is typically lower than the value shown here.\">optime</a>",
+        0};
+    memberTable << table(h);
+
+    for (int i = 0; i < _config.getNumMembers(); ++i) {
+        const MemberConfig& memberConfig = _config.getMemberAt(i);
+        const MemberHeartbeatData& memberHB = _hbData[i];
+        bool isSelf = _selfIndex == i;
+        bool up = memberHB.getHealth() > 0;
+
+        totalVotes += memberConfig.getNumVotes();
+        if (up || isSelf) {
+            votesUp += memberConfig.getNumVotes();
+        }

-            memberTable << tr();
-            if (isSelf) {
-                memberTable << td(memberConfig.getHostAndPort().toString() + " (me)");
-                memberTable << td(memberConfig.getId());
-                memberTable << td("1"); // up
-                memberTable << td(ago(_selfUptime));
-                memberTable << td(""); // last heartbeat
-                memberTable << td(ToString(memberConfig.getNumVotes()));
-                memberTable << td(ToString(memberConfig.getPriority()));
-                memberTable << td(stateAsHtml(_selfState) +
-                                  (memberConfig.isHidden() ? " (hidden)" : ""));
-                memberTable << td(_selfHeartbeatMessage);
-                memberTable << td(_selfOptime.toString());
+        memberTable << tr();
+        if (isSelf) {
+            memberTable << td(memberConfig.getHostAndPort().toString() + " (me)");
+            memberTable << td(memberConfig.getId());
+            memberTable << td("1");  // up
+            memberTable << td(ago(_selfUptime));
+            memberTable << td("");  // last heartbeat
+            memberTable << td(ToString(memberConfig.getNumVotes()));
+            memberTable << td(ToString(memberConfig.getPriority()));
+            memberTable << td(stateAsHtml(_selfState) +
+                              (memberConfig.isHidden() ? " (hidden)" : ""));
+            memberTable << td(_selfHeartbeatMessage);
+            memberTable << td(_selfOptime.toString());
+        } else {
+            std::stringstream link;
+            link << "http://" << memberConfig.getHostAndPort().host() << ':'
+                 << (memberConfig.getHostAndPort().port() + 1000) << "/_replSet";
+            memberTable << td(a(link.str(), "", memberConfig.getHostAndPort().toString()));
+            memberTable << td(memberConfig.getId());
+            memberTable << td(red(str::stream() << memberHB.getHealth(), !up));
+            const unsigned int uptime = timeDifference(_now, memberHB.getUpSince());
+            memberTable << td(ago(uptime));
+            if (memberHB.getLastHeartbeat() == 0) {
+                memberTable << td("never");
+            } else {
+                memberTable << td(ago(timeDifference(_now, memberHB.getLastHeartbeat())));
+            }
-            else {
-                std::stringstream link;
-                link << "http://" << memberConfig.getHostAndPort().host() << ':' <<
-                    (memberConfig.getHostAndPort().port() + 1000) << "/_replSet";
-                memberTable << td( a(link.str(), "", memberConfig.getHostAndPort().toString()) );
-                memberTable << td(memberConfig.getId());
-                memberTable << td(red(str::stream() << memberHB.getHealth(), !up));
-                const unsigned int uptime = timeDifference(_now, memberHB.getUpSince());
-                memberTable << td(ago(uptime));
-                if (memberHB.getLastHeartbeat() == 0) {
-                    memberTable << td("never");
-                }
-                else {
-                    memberTable << td(ago(timeDifference(_now, memberHB.getLastHeartbeat())));
-                }
-                memberTable << td(ToString(memberConfig.getNumVotes()));
-                memberTable << td(ToString(memberConfig.getPriority()));
-                std::string state = memberHB.getState().toString() +
-                    (memberConfig.isHidden() ? " (hidden)" : "");
-                if (up) {
-                    memberTable << td(state);
-                }
-                else {
-                    memberTable << td( grey(str::stream() << "(was " << state << ')', true) );
-                }
-                memberTable << td(grey(memberHB.getLastHeartbeatMsg(), !up));
-                memberTable << td(memberHB.getLastHeartbeat() == 0 ?
-                                  "?"
: memberHB.getOpTime().toString()); + memberTable << td(ToString(memberConfig.getNumVotes())); + memberTable << td(ToString(memberConfig.getPriority())); + std::string state = + memberHB.getState().toString() + (memberConfig.isHidden() ? " (hidden)" : ""); + if (up) { + memberTable << td(state); + } else { + memberTable << td(grey(str::stream() << "(was " << state << ')', true)); } - memberTable << _tr(); + memberTable << td(grey(memberHB.getLastHeartbeatMsg(), !up)); + memberTable << td(memberHB.getLastHeartbeat() == 0 ? "?" + : memberHB.getOpTime().toString()); } - memberTable << _table(); + memberTable << _tr(); + } + memberTable << _table(); - s << table(0, false); - s << tr("Set name:", _config.getReplSetName()); - bool majorityUp = votesUp * 2 > totalVotes; - s << tr("Majority up:", majorityUp ? "yes" : "no" ); + s << table(0, false); + s << tr("Set name:", _config.getReplSetName()); + bool majorityUp = votesUp * 2 > totalVotes; + s << tr("Majority up:", majorityUp ? "yes" : "no"); - const MemberConfig& selfConfig = _config.getMemberAt(_selfIndex); + const MemberConfig& selfConfig = _config.getMemberAt(_selfIndex); - if (_primaryIndex >= 0 && _primaryIndex != _selfIndex && !selfConfig.isArbiter()) { - int lag = _hbData[_primaryIndex].getOpTime().getSecs() - _selfOptime.getSecs(); - s << tr("Lag: ", str::stream() << lag << " secs"); - } + if (_primaryIndex >= 0 && _primaryIndex != _selfIndex && !selfConfig.isArbiter()) { + int lag = _hbData[_primaryIndex].getOpTime().getSecs() - _selfOptime.getSecs(); + s << tr("Lag: ", str::stream() << lag << " secs"); + } - s << _table(); + s << _table(); - s << memberTable.str(); + s << memberTable.str(); - return s.str(); - } + return s.str(); +} -} // namespace repl -} // namespace mongo +} // namespace repl +} // namespace mongo diff --git a/src/mongo/db/repl/repl_set_html_summary.h b/src/mongo/db/repl/repl_set_html_summary.h index 278f1ca35b6..98709ae4379 100644 --- a/src/mongo/db/repl/repl_set_html_summary.h +++ b/src/mongo/db/repl/repl_set_html_summary.h @@ -38,64 +38,63 @@ namespace mongo { namespace repl { - /** - * Class containing all the information needed to build the replSet page on http interface, - * and the logic to generate that page. - */ - class ReplSetHtmlSummary { - public: - ReplSetHtmlSummary(); - - const std::string toHtmlString() const; - - void setConfig(const ReplicaSetConfig& config) { - _config = config; - } - - void setHBData(const std::vector<MemberHeartbeatData>& hbData) { - _hbData = hbData; - } - - void setSelfIndex(int index) { - _selfIndex = index; - } - - void setPrimaryIndex(int index) { - _primaryIndex = index; - } - - void setSelfOptime(const OpTime& ts) { - _selfOptime = ts; - } - - void setSelfUptime(unsigned int time) { - _selfUptime = time; - } - - void setNow(Date_t now) { - _now = now; - } - - void setSelfState(const MemberState& state) { - _selfState = state; - } - - void setSelfHeartbeatMessage(StringData msg) { - _selfHeartbeatMessage = msg.toString(); - } - - private: - - ReplicaSetConfig _config; - std::vector<MemberHeartbeatData> _hbData; - Date_t _now; - int _selfIndex; - int _primaryIndex; - OpTime _selfOptime; - unsigned int _selfUptime; - MemberState _selfState; - std::string _selfHeartbeatMessage; - }; - -} // namespace repl -} // namespace mongo +/** + * Class containing all the information needed to build the replSet page on http interface, + * and the logic to generate that page. 
+ */
+class ReplSetHtmlSummary {
+public:
+    ReplSetHtmlSummary();
+
+    const std::string toHtmlString() const;
+
+    void setConfig(const ReplicaSetConfig& config) {
+        _config = config;
+    }
+
+    void setHBData(const std::vector<MemberHeartbeatData>& hbData) {
+        _hbData = hbData;
+    }
+
+    void setSelfIndex(int index) {
+        _selfIndex = index;
+    }
+
+    void setPrimaryIndex(int index) {
+        _primaryIndex = index;
+    }
+
+    void setSelfOptime(const OpTime& ts) {
+        _selfOptime = ts;
+    }
+
+    void setSelfUptime(unsigned int time) {
+        _selfUptime = time;
+    }
+
+    void setNow(Date_t now) {
+        _now = now;
+    }
+
+    void setSelfState(const MemberState& state) {
+        _selfState = state;
+    }
+
+    void setSelfHeartbeatMessage(StringData msg) {
+        _selfHeartbeatMessage = msg.toString();
+    }
+
+private:
+    ReplicaSetConfig _config;
+    std::vector<MemberHeartbeatData> _hbData;
+    Date_t _now;
+    int _selfIndex;
+    int _primaryIndex;
+    OpTime _selfOptime;
+    unsigned int _selfUptime;
+    MemberState _selfState;
+    std::string _selfHeartbeatMessage;
+};
+
+}  // namespace repl
+}  // namespace mongo
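Read together with the .cpp above, the intended call pattern for this class is to populate the setters and then render. A minimal sketch (the local variable names here are illustrative placeholders, not identifiers from the tree):

    ReplSetHtmlSummary summary;
    summary.setConfig(config);             // the node's current ReplicaSetConfig
    summary.setHBData(hbData);             // std::vector<MemberHeartbeatData>, one entry per member
    summary.setSelfIndex(selfIndex);
    summary.setPrimaryIndex(primaryIndex);
    summary.setSelfOptime(lastOpApplied);  // an OpTime
    summary.setSelfUptime(uptimeSecs);     // seconds; fed to ago() when rendering
    summary.setNow(now);                   // a Date_t
    summary.setSelfState(selfState);       // a MemberState
    summary.setSelfHeartbeatMessage(hbMsg);
    std::string html = summary.toHtmlString();  // body of the /_replSet status page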
diff --git a/src/mongo/db/repl/repl_set_seed_list.cpp b/src/mongo/db/repl/repl_set_seed_list.cpp
index 861a2398711..53db5cf555d 100644
--- a/src/mongo/db/repl/repl_set_seed_list.cpp
+++ b/src/mongo/db/repl/repl_set_seed_list.cpp
@@ -40,56 +40,55 @@ namespace mongo {
 namespace repl {
-    using std::string;
+using std::string;

-    /** @param cfgString <setname>/<seedhost1>,<seedhost2> */
-    void parseReplSetSeedList(ReplicationCoordinatorExternalState* externalState,
-                              const std::string& cfgString,
-                              std::string& setname,
-                              std::vector<HostAndPort>& seeds,
-                              std::set<HostAndPort>& seedSet) {
-        const char *p = cfgString.c_str();
-        const char *slash = strchr(p, '/');
-        if( slash )
-            setname = string(p, slash-p);
-        else
-            setname = p;
-        uassert(13093,
-                "bad --replSet config string format is: <setname>[/<seedhost1>,<seedhost2>,...]",
-                !setname.empty());
-
-        if( slash == 0 )
-            return;
-
-        p = slash + 1;
-        while( 1 ) {
-            const char *comma = strchr(p, ',');
-            if( comma == 0 ) comma = strchr(p,0);
-            if( p == comma )
-                break;
-            {
-                HostAndPort m;
-                try {
-                    m = HostAndPort( string(p, comma-p) );
-                }
-                catch(...) {
-                    uassert(13114, "bad --replSet seed hostname", false);
-                }
-                uassert(13096, "bad --replSet command line config string - dups?",
-                        seedSet.count(m) == 0);
-                seedSet.insert(m);
-                //uassert(13101, "can't use localhost in replset host list", !m.isLocalHost());
-                if (externalState->isSelf(m)) {
-                    LOG(1) << "replSet ignoring seed " << m.toString() << " (=self)";
-                }
-                else
-                    seeds.push_back(m);
-                if( *comma == 0 )
-                    break;
-                p = comma + 1;
+/** @param cfgString <setname>/<seedhost1>,<seedhost2> */
+void parseReplSetSeedList(ReplicationCoordinatorExternalState* externalState,
+                          const std::string& cfgString,
+                          std::string& setname,
+                          std::vector<HostAndPort>& seeds,
+                          std::set<HostAndPort>& seedSet) {
+    const char* p = cfgString.c_str();
+    const char* slash = strchr(p, '/');
+    if (slash)
+        setname = string(p, slash - p);
+    else
+        setname = p;
+    uassert(13093,
+            "bad --replSet config string format is: <setname>[/<seedhost1>,<seedhost2>,...]",
+            !setname.empty());
+
+    if (slash == 0)
+        return;
+
+    p = slash + 1;
+    while (1) {
+        const char* comma = strchr(p, ',');
+        if (comma == 0)
+            comma = strchr(p, 0);
+        if (p == comma)
+            break;
+        {
+            HostAndPort m;
+            try {
+                m = HostAndPort(string(p, comma - p));
+            } catch (...) {
+                uassert(13114, "bad --replSet seed hostname", false);
+            }
+            uassert(
+                13096, "bad --replSet command line config string - dups?", seedSet.count(m) == 0);
+            seedSet.insert(m);
+            // uassert(13101, "can't use localhost in replset host list", !m.isLocalHost());
+            if (externalState->isSelf(m)) {
+                LOG(1) << "replSet ignoring seed " << m.toString() << " (=self)";
+            } else
+                seeds.push_back(m);
+            if (*comma == 0)
+                break;
+            p = comma + 1;
        }
    }
+}

-}  // namespace repl
-}  // namespace mongo
+}  // namespace repl
+}  // namespace mongo
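As the uassert message above spells out, the accepted --replSet string is <setname>[/<seedhost1>,<seedhost2>,...]. A sketch of a call (externalState stands for whatever ReplicationCoordinatorExternalState the caller already owns; the host names are made up):

    std::string setname;
    std::vector<HostAndPort> seeds;
    std::set<HostAndPort> seedSet;
    parseReplSetSeedList(externalState, "rs0/alice:27017,bob:27017", setname, seeds, seedSet);
    // Afterwards setname == "rs0" and seedSet holds both hosts; seeds holds them too,
    // except that a host recognized as this node itself is logged at level 1 and skipped.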
diff --git a/src/mongo/db/repl/repl_set_seed_list.h b/src/mongo/db/repl/repl_set_seed_list.h
index 761928d2a7a..ca48c6604de 100644
--- a/src/mongo/db/repl/repl_set_seed_list.h
+++ b/src/mongo/db/repl/repl_set_seed_list.h
@@ -37,27 +37,27 @@ namespace mongo {
 namespace repl {
-    class ReplicationCoordinatorExternalState;
-
-    void parseReplSetSeedList(ReplicationCoordinatorExternalState* externalState,
-                              const std::string& cfgString,
-                              std::string& setname,
-                              std::vector<HostAndPort>& seeds,
-                              std::set<HostAndPort>& seedSet);
-
-    /** Parameter given to the --replSet command line option (parsed).
-        Syntax is "<setname>/<seedhost1>,<seedhost2>"
-        where setname is a name and seedhost is "<host>[:<port>]" */
-    class ReplSetSeedList {
-    public:
-        ReplSetSeedList(ReplicationCoordinatorExternalState* externalState,
-                        const std::string& cfgString) {
-            parseReplSetSeedList(externalState, cfgString, setname, seeds, seedSet);
-        }
-        std::string setname;
-        std::vector<HostAndPort> seeds;
-        std::set<HostAndPort> seedSet;
-    };
-
-}  // namespace repl
-}  // namespace mongo
+class ReplicationCoordinatorExternalState;
+
+void parseReplSetSeedList(ReplicationCoordinatorExternalState* externalState,
+                          const std::string& cfgString,
+                          std::string& setname,
+                          std::vector<HostAndPort>& seeds,
+                          std::set<HostAndPort>& seedSet);
+
+/** Parameter given to the --replSet command line option (parsed).
+    Syntax is "<setname>/<seedhost1>,<seedhost2>"
+    where setname is a name and seedhost is "<host>[:<port>]" */
+class ReplSetSeedList {
+public:
+    ReplSetSeedList(ReplicationCoordinatorExternalState* externalState,
+                    const std::string& cfgString) {
+        parseReplSetSeedList(externalState, cfgString, setname, seeds, seedSet);
+    }
+    std::string setname;
+    std::vector<HostAndPort> seeds;
+    std::set<HostAndPort> seedSet;
+};
+
+}  // namespace repl
+}  // namespace mongo
diff --git a/src/mongo/db/repl/repl_settings.cpp b/src/mongo/db/repl/repl_settings.cpp
index 3b22a3203eb..a385d89c55b 100644
--- a/src/mongo/db/repl/repl_settings.cpp
+++ b/src/mongo/db/repl/repl_settings.cpp
@@ -35,13 +35,12 @@ namespace mongo {
 namespace repl {
-    MONGO_EXPORT_STARTUP_SERVER_PARAMETER(maxSyncSourceLagSecs, int, 30);
-    MONGO_INITIALIZER(maxSyncSourceLagSecsCheck) (InitializerContext*) {
-        if (maxSyncSourceLagSecs < 1) {
-            return Status(ErrorCodes::BadValue, "maxSyncSourceLagSecs must be > 0");
-        }
-        return Status::OK();
+MONGO_EXPORT_STARTUP_SERVER_PARAMETER(maxSyncSourceLagSecs, int, 30);
+MONGO_INITIALIZER(maxSyncSourceLagSecsCheck)(InitializerContext*) {
+    if (maxSyncSourceLagSecs < 1) {
+        return Status(ErrorCodes::BadValue, "maxSyncSourceLagSecs must be > 0");
    }
-
+    return Status::OK();
+}
 }
 }
diff --git a/src/mongo/db/repl/repl_settings.h b/src/mongo/db/repl/repl_settings.h
index cec0b90040f..5c1e6032acc 100644
--- a/src/mongo/db/repl/repl_settings.h
+++ b/src/mongo/db/repl/repl_settings.h
@@ -38,90 +38,91 @@ namespace mongo {
 namespace repl {
-    extern int maxSyncSourceLagSecs;
-
-    bool anyReplEnabled();
-
-    /* replication slave? (possibly with slave)
-       --slave cmd line setting -> SimpleSlave
-    */
-    typedef enum { NotSlave=0, SimpleSlave } SlaveTypes;
-
-    class ReplSettings {
-    public:
-        SlaveTypes slave;
-
-        /** true means we are master and doing replication. if we are not writing to oplog, this won't be true. */
-        bool master;
-
-        bool fastsync;
-
-        bool autoresync;
-
-        int slavedelay;
-
-        long long oplogSize;    // --oplogSize
-
-        // for master/slave replication
-        std::string source;     // --source
-        std::string only;       // --only
-        int pretouch;           // --pretouch for replication application (experimental)
-
-        std::string replSet;    // --replSet[/<seedlist>]
-        std::string ourSetName() const {
-            std::string setname;
-            size_t sl = replSet.find('/');
-            if( sl == std::string::npos )
-                return replSet;
-            return replSet.substr(0, sl);
-        }
-        bool usingReplSets() const { return !replSet.empty(); }
-
-        std::string rsIndexPrefetch;// --indexPrefetch
-
-        ReplSettings()
-            : slave(NotSlave),
-            master(false),
-            fastsync(),
-            autoresync(false),
-            slavedelay(),
-            oplogSize(0),
-            pretouch(0) {
-        }
-
-        // TODO(spencer): Remove explicit copy constructor after we no longer have mutable state
-        // in ReplSettings.
- ReplSettings(const ReplSettings& other) : - slave(other.slave), - master(other.master), - fastsync(other.fastsync), - autoresync(other.autoresync), - slavedelay(other.slavedelay), - oplogSize(other.oplogSize), - source(other.source), - only(other.only), - pretouch(other.pretouch), - replSet(other.replSet), - rsIndexPrefetch(other.rsIndexPrefetch) {} - - ReplSettings& operator=(const ReplSettings& other) { - if (this == &other) return *this; - - slave = other.slave; - master = other.master; - fastsync = other.fastsync; - autoresync = other.autoresync; - slavedelay = other.slavedelay; - oplogSize = other.oplogSize; - source = other.source; - only = other.only; - pretouch = other.pretouch; - replSet = other.replSet; - rsIndexPrefetch = other.rsIndexPrefetch; - return *this; - } +extern int maxSyncSourceLagSecs; + +bool anyReplEnabled(); - }; +/* replication slave? (possibly with slave) + --slave cmd line setting -> SimpleSlave +*/ +typedef enum { NotSlave = 0, SimpleSlave } SlaveTypes; + +class ReplSettings { +public: + SlaveTypes slave; + + /** true means we are master and doing replication. if we are not writing to oplog, this won't be true. */ + bool master; + + bool fastsync; + + bool autoresync; + + int slavedelay; + + long long oplogSize; // --oplogSize + + // for master/slave replication + std::string source; // --source + std::string only; // --only + int pretouch; // --pretouch for replication application (experimental) + + std::string replSet; // --replSet[/<seedlist>] + std::string ourSetName() const { + std::string setname; + size_t sl = replSet.find('/'); + if (sl == std::string::npos) + return replSet; + return replSet.substr(0, sl); + } + bool usingReplSets() const { + return !replSet.empty(); + } + + std::string rsIndexPrefetch; // --indexPrefetch + + ReplSettings() + : slave(NotSlave), + master(false), + fastsync(), + autoresync(false), + slavedelay(), + oplogSize(0), + pretouch(0) {} + + // TODO(spencer): Remove explicit copy constructor after we no longer have mutable state + // in ReplSettings. 
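+ // Until then, any new field added to ReplSettings must also be copied by
+ // hand in the copy constructor and operator= below.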
+ ReplSettings(const ReplSettings& other) + : slave(other.slave), + master(other.master), + fastsync(other.fastsync), + autoresync(other.autoresync), + slavedelay(other.slavedelay), + oplogSize(other.oplogSize), + source(other.source), + only(other.only), + pretouch(other.pretouch), + replSet(other.replSet), + rsIndexPrefetch(other.rsIndexPrefetch) {} + + ReplSettings& operator=(const ReplSettings& other) { + if (this == &other) + return *this; -} // namespace repl -} // namespace mongo + slave = other.slave; + master = other.master; + fastsync = other.fastsync; + autoresync = other.autoresync; + slavedelay = other.slavedelay; + oplogSize = other.oplogSize; + source = other.source; + only = other.only; + pretouch = other.pretouch; + replSet = other.replSet; + rsIndexPrefetch = other.rsIndexPrefetch; + return *this; + } +}; + +} // namespace repl +} // namespace mongo diff --git a/src/mongo/db/repl/replica_set_config.cpp b/src/mongo/db/repl/replica_set_config.cpp index 6b4aedd1601..3d7f6d1268e 100644 --- a/src/mongo/db/repl/replica_set_config.cpp +++ b/src/mongo/db/repl/replica_set_config.cpp @@ -41,510 +41,490 @@ namespace mongo { namespace repl { #ifndef _MSC_VER - const size_t ReplicaSetConfig::kMaxMembers; - const size_t ReplicaSetConfig::kMaxVotingMembers; +const size_t ReplicaSetConfig::kMaxMembers; +const size_t ReplicaSetConfig::kMaxVotingMembers; #endif - const Seconds ReplicaSetConfig::kDefaultHeartbeatTimeoutPeriod(10); - const std::string ReplicaSetConfig::kIdFieldName = "_id"; - const std::string ReplicaSetConfig::kVersionFieldName = "version"; - const std::string ReplicaSetConfig::kMembersFieldName = "members"; - const std::string ReplicaSetConfig::kSettingsFieldName = "settings"; - const std::string ReplicaSetConfig::kMajorityWriteConcernModeName = "$majority"; - const std::string ReplicaSetConfig::kStepDownCheckWriteConcernModeName = "$stepDownCheck"; +const Seconds ReplicaSetConfig::kDefaultHeartbeatTimeoutPeriod(10); +const std::string ReplicaSetConfig::kIdFieldName = "_id"; +const std::string ReplicaSetConfig::kVersionFieldName = "version"; +const std::string ReplicaSetConfig::kMembersFieldName = "members"; +const std::string ReplicaSetConfig::kSettingsFieldName = "settings"; +const std::string ReplicaSetConfig::kMajorityWriteConcernModeName = "$majority"; +const std::string ReplicaSetConfig::kStepDownCheckWriteConcernModeName = "$stepDownCheck"; namespace { - const std::string kLegalConfigTopFieldNames[] = { - ReplicaSetConfig::kIdFieldName, - ReplicaSetConfig::kVersionFieldName, - ReplicaSetConfig::kMembersFieldName, - ReplicaSetConfig::kSettingsFieldName - }; +const std::string kLegalConfigTopFieldNames[] = {ReplicaSetConfig::kIdFieldName, + ReplicaSetConfig::kVersionFieldName, + ReplicaSetConfig::kMembersFieldName, + ReplicaSetConfig::kSettingsFieldName}; - const std::string kHeartbeatTimeoutFieldName = "heartbeatTimeoutSecs"; - const std::string kChainingAllowedFieldName = "chainingAllowed"; - const std::string kGetLastErrorDefaultsFieldName = "getLastErrorDefaults"; - const std::string kGetLastErrorModesFieldName = "getLastErrorModes"; +const std::string kHeartbeatTimeoutFieldName = "heartbeatTimeoutSecs"; +const std::string kChainingAllowedFieldName = "chainingAllowed"; +const std::string kGetLastErrorDefaultsFieldName = "getLastErrorDefaults"; +const std::string kGetLastErrorModesFieldName = "getLastErrorModes"; } // namespace - ReplicaSetConfig::ReplicaSetConfig() : _isInitialized(false), _heartbeatTimeoutPeriod(0) {} - - Status ReplicaSetConfig::initialize(const 
BSONObj& cfg) { - _isInitialized = false; - _members.clear(); - Status status = bsonCheckOnlyHasFields( - "replica set configuration", cfg, kLegalConfigTopFieldNames); - if (!status.isOK()) - return status; - - // - // Parse replSetName - // - status = bsonExtractStringField(cfg, kIdFieldName, &_replSetName); - if (!status.isOK()) - return status; - - // - // Parse version - // - status = bsonExtractIntegerField(cfg, kVersionFieldName, &_version); +ReplicaSetConfig::ReplicaSetConfig() : _isInitialized(false), _heartbeatTimeoutPeriod(0) {} + +Status ReplicaSetConfig::initialize(const BSONObj& cfg) { + _isInitialized = false; + _members.clear(); + Status status = + bsonCheckOnlyHasFields("replica set configuration", cfg, kLegalConfigTopFieldNames); + if (!status.isOK()) + return status; + + // + // Parse replSetName + // + status = bsonExtractStringField(cfg, kIdFieldName, &_replSetName); + if (!status.isOK()) + return status; + + // + // Parse version + // + status = bsonExtractIntegerField(cfg, kVersionFieldName, &_version); + if (!status.isOK()) + return status; + + // + // Parse members + // + BSONElement membersElement; + status = bsonExtractTypedField(cfg, kMembersFieldName, Array, &membersElement); + if (!status.isOK()) + return status; + + for (BSONObj::iterator membersIterator(membersElement.Obj()); membersIterator.more();) { + BSONElement memberElement = membersIterator.next(); + if (memberElement.type() != Object) { + return Status(ErrorCodes::TypeMismatch, + str::stream() << "Expected type of " << kMembersFieldName << "." + << memberElement.fieldName() << " to be Object, but found " + << typeName(memberElement.type())); + } + _members.resize(_members.size() + 1); + status = _members.back().initialize(memberElement.Obj(), &_tagConfig); if (!status.isOK()) return status; - - // - // Parse members - // - BSONElement membersElement; - status = bsonExtractTypedField(cfg, kMembersFieldName, Array, &membersElement); - if (!status.isOK()) - return status; - - for (BSONObj::iterator membersIterator(membersElement.Obj()); membersIterator.more();) { - BSONElement memberElement = membersIterator.next(); - if (memberElement.type() != Object) { - return Status(ErrorCodes::TypeMismatch, str::stream() << - "Expected type of " << kMembersFieldName << "." 
<< - memberElement.fieldName() << " to be Object, but found " << - typeName(memberElement.type())); - } - _members.resize(_members.size() + 1); - status = _members.back().initialize(memberElement.Obj(), &_tagConfig); - if (!status.isOK()) - return status; - } - - // - // Parse settings - // - BSONElement settingsElement; - status = bsonExtractTypedField(cfg, kSettingsFieldName, Object, &settingsElement); - BSONObj settings; - if (status.isOK()) { - settings = settingsElement.Obj(); - } - else if (status != ErrorCodes::NoSuchKey) { - return status; - } - status = _parseSettingsSubdocument(settings); - if (!status.isOK()) - return status; - - _calculateMajorities(); - _addInternalWriteConcernModes(); - _isInitialized = true; - return Status::OK(); } - Status ReplicaSetConfig::_parseSettingsSubdocument(const BSONObj& settings) { - // - // Parse heartbeatTimeoutSecs - // - BSONElement hbTimeoutSecsElement = settings[kHeartbeatTimeoutFieldName]; - if (hbTimeoutSecsElement.eoo()) { - _heartbeatTimeoutPeriod = Seconds(kDefaultHeartbeatTimeoutPeriod); - } - else if (hbTimeoutSecsElement.isNumber()) { - _heartbeatTimeoutPeriod = Seconds(hbTimeoutSecsElement.numberInt()); - } - else { - return Status(ErrorCodes::TypeMismatch, str::stream() << "Expected type of " << - kSettingsFieldName << "." << kHeartbeatTimeoutFieldName << - " to be a number, but found a value of type " << - typeName(hbTimeoutSecsElement.type())); - } + // + // Parse settings + // + BSONElement settingsElement; + status = bsonExtractTypedField(cfg, kSettingsFieldName, Object, &settingsElement); + BSONObj settings; + if (status.isOK()) { + settings = settingsElement.Obj(); + } else if (status != ErrorCodes::NoSuchKey) { + return status; + } + status = _parseSettingsSubdocument(settings); + if (!status.isOK()) + return status; + + _calculateMajorities(); + _addInternalWriteConcernModes(); + _isInitialized = true; + return Status::OK(); +} + +Status ReplicaSetConfig::_parseSettingsSubdocument(const BSONObj& settings) { + // + // Parse heartbeatTimeoutSecs + // + BSONElement hbTimeoutSecsElement = settings[kHeartbeatTimeoutFieldName]; + if (hbTimeoutSecsElement.eoo()) { + _heartbeatTimeoutPeriod = Seconds(kDefaultHeartbeatTimeoutPeriod); + } else if (hbTimeoutSecsElement.isNumber()) { + _heartbeatTimeoutPeriod = Seconds(hbTimeoutSecsElement.numberInt()); + } else { + return Status(ErrorCodes::TypeMismatch, + str::stream() << "Expected type of " << kSettingsFieldName << "." + << kHeartbeatTimeoutFieldName + << " to be a number, but found a value of type " + << typeName(hbTimeoutSecsElement.type())); + } - // - // Parse chainingAllowed - // - Status status = bsonExtractBooleanFieldWithDefault(settings, - kChainingAllowedFieldName, - true, - &_chainingAllowed); + // + // Parse chainingAllowed + // + Status status = bsonExtractBooleanFieldWithDefault( + settings, kChainingAllowedFieldName, true, &_chainingAllowed); + if (!status.isOK()) + return status; + + // + // Parse getLastErrorDefaults + // + BSONElement gleDefaultsElement; + status = bsonExtractTypedField( + settings, kGetLastErrorDefaultsFieldName, Object, &gleDefaultsElement); + if (status.isOK()) { + status = _defaultWriteConcern.parse(gleDefaultsElement.Obj()); if (!status.isOK()) return status; + } else if (status == ErrorCodes::NoSuchKey) { + // Default write concern is w: 1. 
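+ // reset() clears any previously parsed options before the default of
+ // waiting for one node (w: 1) is applied explicitly.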
+ _defaultWriteConcern.reset(); + _defaultWriteConcern.wNumNodes = 1; + } else { + return status; + } - // - // Parse getLastErrorDefaults - // - BSONElement gleDefaultsElement; - status = bsonExtractTypedField(settings, - kGetLastErrorDefaultsFieldName, - Object, - &gleDefaultsElement); - if (status.isOK()) { - status = _defaultWriteConcern.parse(gleDefaultsElement.Obj()); - if (!status.isOK()) - return status; - } - else if (status == ErrorCodes::NoSuchKey) { - // Default write concern is w: 1. - _defaultWriteConcern.reset(); - _defaultWriteConcern.wNumNodes = 1; - } - else { - return status; - } + // + // Parse getLastErrorModes + // + BSONElement gleModesElement; + status = bsonExtractTypedField(settings, kGetLastErrorModesFieldName, Object, &gleModesElement); + BSONObj gleModes; + if (status.isOK()) { + gleModes = gleModesElement.Obj(); + } else if (status != ErrorCodes::NoSuchKey) { + return status; + } - // - // Parse getLastErrorModes - // - BSONElement gleModesElement; - status = bsonExtractTypedField(settings, - kGetLastErrorModesFieldName, - Object, - &gleModesElement); - BSONObj gleModes; - if (status.isOK()) { - gleModes = gleModesElement.Obj(); - } - else if (status != ErrorCodes::NoSuchKey) { - return status; + for (BSONObj::iterator gleModeIter(gleModes); gleModeIter.more();) { + const BSONElement modeElement = gleModeIter.next(); + if (_customWriteConcernModes.find(modeElement.fieldNameStringData()) != + _customWriteConcernModes.end()) { + return Status(ErrorCodes::DuplicateKey, + str::stream() << kSettingsFieldName << '.' << kGetLastErrorModesFieldName + << " contains multiple fields named " + << modeElement.fieldName()); + } + if (modeElement.type() != Object) { + return Status(ErrorCodes::TypeMismatch, + str::stream() << "Expected " << kSettingsFieldName << '.' + << kGetLastErrorModesFieldName << '.' + << modeElement.fieldName() << " to be an Object, not " + << typeName(modeElement.type())); } - - for (BSONObj::iterator gleModeIter(gleModes); gleModeIter.more();) { - const BSONElement modeElement = gleModeIter.next(); - if (_customWriteConcernModes.find(modeElement.fieldNameStringData()) != - _customWriteConcernModes.end()) { - - return Status(ErrorCodes::DuplicateKey, str::stream() << kSettingsFieldName << - '.' << kGetLastErrorModesFieldName << - " contains multiple fields named " << modeElement.fieldName()); + ReplicaSetTagPattern pattern = _tagConfig.makePattern(); + for (BSONObj::iterator constraintIter(modeElement.Obj()); constraintIter.more();) { + const BSONElement constraintElement = constraintIter.next(); + if (!constraintElement.isNumber()) { + return Status(ErrorCodes::TypeMismatch, + str::stream() + << "Expected " << kSettingsFieldName << '.' + << kGetLastErrorModesFieldName << '.' << modeElement.fieldName() + << '.' << constraintElement.fieldName() << " to be a number, not " + << typeName(constraintElement.type())); } - if (modeElement.type() != Object) { - return Status(ErrorCodes::TypeMismatch, str::stream() << "Expected " << - kSettingsFieldName << '.' << kGetLastErrorModesFieldName << '.' << - modeElement.fieldName() << " to be an Object, not " << - typeName(modeElement.type())); + const int minCount = constraintElement.numberInt(); + if (minCount <= 0) { + return Status(ErrorCodes::BadValue, + str::stream() << "Value of " << kSettingsFieldName << '.' + << kGetLastErrorModesFieldName << '.' + << modeElement.fieldName() << '.' 
+ << constraintElement.fieldName() + << " must be positive, but found " << minCount); } - ReplicaSetTagPattern pattern = _tagConfig.makePattern(); - for (BSONObj::iterator constraintIter(modeElement.Obj()); constraintIter.more();) { - const BSONElement constraintElement = constraintIter.next(); - if (!constraintElement.isNumber()) { - return Status(ErrorCodes::TypeMismatch, str::stream() << "Expected " << - kSettingsFieldName << '.' << kGetLastErrorModesFieldName << '.' << - modeElement.fieldName() << '.' << constraintElement.fieldName() << - " to be a number, not " << typeName(constraintElement.type())); - } - const int minCount = constraintElement.numberInt(); - if (minCount <= 0) { - return Status(ErrorCodes::BadValue, str::stream() << "Value of " << - kSettingsFieldName << '.' << kGetLastErrorModesFieldName << '.' << - modeElement.fieldName() << '.' << constraintElement.fieldName() << - " must be positive, but found " << minCount); - } - status = _tagConfig.addTagCountConstraintToPattern( - &pattern, - constraintElement.fieldNameStringData(), - minCount); - if (!status.isOK()) { - return status; - } + status = _tagConfig.addTagCountConstraintToPattern( + &pattern, constraintElement.fieldNameStringData(), minCount); + if (!status.isOK()) { + return status; } - _customWriteConcernModes[modeElement.fieldNameStringData()] = pattern; } - return Status::OK(); + _customWriteConcernModes[modeElement.fieldNameStringData()] = pattern; + } + return Status::OK(); +} + +Status ReplicaSetConfig::validate() const { + if (_version <= 0 || _version > std::numeric_limits<int>::max()) { + return Status(ErrorCodes::BadValue, + str::stream() << kVersionFieldName << " field value of " << _version + << " is out of range"); + } + if (_replSetName.empty()) { + return Status(ErrorCodes::BadValue, + str::stream() << "Replica set configuration must have non-empty " + << kIdFieldName << " field"); + } + if (_heartbeatTimeoutPeriod < Seconds(0)) { + return Status(ErrorCodes::BadValue, + str::stream() << kSettingsFieldName << '.' << kHeartbeatTimeoutFieldName + << " field value must be non-negative, " + "but found " << _heartbeatTimeoutPeriod.total_seconds()); + } + if (_members.size() > kMaxMembers || _members.empty()) { + return Status(ErrorCodes::BadValue, + str::stream() << "Replica set configuration contains " << _members.size() + << " members, but must have at least 1 and no more than " + << kMaxMembers); } - Status ReplicaSetConfig::validate() const { - if (_version <= 0 || _version > std::numeric_limits<int>::max()) { - return Status(ErrorCodes::BadValue, str::stream() << kVersionFieldName << - " field value of " << _version << " is out of range"); - } - if (_replSetName.empty()) { - return Status(ErrorCodes::BadValue, str::stream() << - "Replica set configuration must have non-empty " << kIdFieldName << - " field"); + size_t localhostCount = 0; + size_t voterCount = 0; + size_t arbiterCount = 0; + size_t electableCount = 0; + for (size_t i = 0; i < _members.size(); ++i) { + const MemberConfig& memberI = _members[i]; + Status status = memberI.validate(); + if (!status.isOK()) + return status; + if (memberI.getHostAndPort().isLocalHost()) { + ++localhostCount; } - if (_heartbeatTimeoutPeriod < Seconds(0)) { - return Status(ErrorCodes::BadValue, str::stream() << kSettingsFieldName << '.' 
<< - kHeartbeatTimeoutFieldName << " field value must be non-negative, " - "but found " << _heartbeatTimeoutPeriod.total_seconds()); + if (memberI.isVoter()) { + ++voterCount; } - if (_members.size() > kMaxMembers || _members.empty()) { - return Status(ErrorCodes::BadValue, str::stream() << - "Replica set configuration contains " << _members.size() << - " members, but must have at least 1 and no more than " << kMaxMembers); + // Nodes may be arbiters or electable, or neither, but never both. + if (memberI.isArbiter()) { + ++arbiterCount; + } else if (memberI.getPriority() > 0) { + ++electableCount; } - - size_t localhostCount = 0; - size_t voterCount = 0; - size_t arbiterCount = 0; - size_t electableCount = 0; - for (size_t i = 0; i < _members.size(); ++i) { - const MemberConfig& memberI = _members[i]; - Status status = memberI.validate(); - if (!status.isOK()) - return status; - if (memberI.getHostAndPort().isLocalHost()) { - ++localhostCount; - } - if (memberI.isVoter()) { - ++voterCount; - } - // Nodes may be arbiters or electable, or neither, but never both. - if (memberI.isArbiter()) { - ++arbiterCount; - } - else if (memberI.getPriority() > 0) { - ++electableCount; + for (size_t j = 0; j < _members.size(); ++j) { + if (i == j) + continue; + const MemberConfig& memberJ = _members[j]; + if (memberI.getId() == memberJ.getId()) { + return Status(ErrorCodes::BadValue, + str::stream() + << "Found two member configurations with same " + << MemberConfig::kIdFieldName << " field, " << kMembersFieldName + << "." << i << "." << MemberConfig::kIdFieldName + << " == " << kMembersFieldName << "." << j << "." + << MemberConfig::kIdFieldName << " == " << memberI.getId()); } - for (size_t j = 0; j < _members.size(); ++j) { - if (i == j) - continue; - const MemberConfig& memberJ = _members[j]; - if (memberI.getId() == memberJ.getId()) { - return Status( - ErrorCodes::BadValue, str::stream() << - "Found two member configurations with same " << - MemberConfig::kIdFieldName << " field, " << - kMembersFieldName << "." << i << "." << MemberConfig::kIdFieldName << - " == " << - kMembersFieldName << "." << j << "." << MemberConfig::kIdFieldName << - " == " << memberI.getId()); - } - if (memberI.getHostAndPort() == memberJ.getHostAndPort()) { - return Status( - ErrorCodes::BadValue, str::stream() << - "Found two member configurations with same " << - MemberConfig::kHostFieldName << " field, " << - kMembersFieldName << "." << i << "." << MemberConfig::kHostFieldName << - " == " << - kMembersFieldName << "." << j << "." << MemberConfig::kHostFieldName << - " == " << memberI.getHostAndPort().toString()); - } + if (memberI.getHostAndPort() == memberJ.getHostAndPort()) { + return Status(ErrorCodes::BadValue, + str::stream() << "Found two member configurations with same " + << MemberConfig::kHostFieldName << " field, " + << kMembersFieldName << "." << i << "." + << MemberConfig::kHostFieldName + << " == " << kMembersFieldName << "." << j << "." 
+ << MemberConfig::kHostFieldName + << " == " << memberI.getHostAndPort().toString()); } } + } - if (localhostCount != 0 && localhostCount != _members.size()) { - return Status(ErrorCodes::BadValue, str::stream() << - "Either all host names in a replica set configuration must be localhost " - "references, or none must be; found " << localhostCount << " out of " << - _members.size()); - } + if (localhostCount != 0 && localhostCount != _members.size()) { + return Status( + ErrorCodes::BadValue, + str::stream() + << "Either all host names in a replica set configuration must be localhost " + "references, or none must be; found " << localhostCount << " out of " + << _members.size()); + } - if (voterCount > kMaxVotingMembers || voterCount == 0) { - return Status(ErrorCodes::BadValue, str::stream() << - "Replica set configuration contains " << voterCount << - " voting members, but must be at least 1 and no more than " << - kMaxVotingMembers); - } + if (voterCount > kMaxVotingMembers || voterCount == 0) { + return Status(ErrorCodes::BadValue, + str::stream() << "Replica set configuration contains " << voterCount + << " voting members, but must be at least 1 and no more than " + << kMaxVotingMembers); + } - if (electableCount == 0) { - return Status(ErrorCodes::BadValue, "Replica set configuration must contain at least " - "one non-arbiter member with priority > 0"); - } + if (electableCount == 0) { + return Status(ErrorCodes::BadValue, + "Replica set configuration must contain at least " + "one non-arbiter member with priority > 0"); + } - // TODO(schwerin): Validate satisfiability of write modes? Omitting for backwards - // compatibility. - if (_defaultWriteConcern.wMode.empty()) { - if (_defaultWriteConcern.wNumNodes == 0) { - return Status(ErrorCodes::BadValue, - "Default write concern mode must wait for at least 1 member"); - } + // TODO(schwerin): Validate satisfiability of write modes? Omitting for backwards + // compatibility. 
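+ // A numeric w (empty wMode) must wait for at least one node; a named mode
+ // must be "majority" or a mode defined under settings.getLastErrorModes.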
+ if (_defaultWriteConcern.wMode.empty()) { + if (_defaultWriteConcern.wNumNodes == 0) { + return Status(ErrorCodes::BadValue, + "Default write concern mode must wait for at least 1 member"); } - else { - if ("majority" != _defaultWriteConcern.wMode && - !findCustomWriteMode(_defaultWriteConcern.wMode).isOK()) { - return Status(ErrorCodes::BadValue, str::stream() << - "Default write concern requires undefined write mode " << - _defaultWriteConcern.wMode); - } + } else { + if ("majority" != _defaultWriteConcern.wMode && + !findCustomWriteMode(_defaultWriteConcern.wMode).isOK()) { + return Status(ErrorCodes::BadValue, + str::stream() << "Default write concern requires undefined write mode " + << _defaultWriteConcern.wMode); } - - return Status::OK(); } - Status ReplicaSetConfig::checkIfWriteConcernCanBeSatisfied( - const WriteConcernOptions& writeConcern) const { - if (!writeConcern.wMode.empty() && writeConcern.wMode != "majority") { - StatusWith<ReplicaSetTagPattern> tagPatternStatus = - findCustomWriteMode(writeConcern.wMode); - if (!tagPatternStatus.isOK()) { - return tagPatternStatus.getStatus(); - } - - ReplicaSetTagMatch matcher(tagPatternStatus.getValue()); - for (size_t j = 0; j < _members.size(); ++j) { - const MemberConfig& memberConfig = _members[j]; - for (MemberConfig::TagIterator it = memberConfig.tagsBegin(); - it != memberConfig.tagsEnd(); ++it) { - if (matcher.update(*it)) { - return Status::OK(); - } + return Status::OK(); +} + +Status ReplicaSetConfig::checkIfWriteConcernCanBeSatisfied( + const WriteConcernOptions& writeConcern) const { + if (!writeConcern.wMode.empty() && writeConcern.wMode != "majority") { + StatusWith<ReplicaSetTagPattern> tagPatternStatus = findCustomWriteMode(writeConcern.wMode); + if (!tagPatternStatus.isOK()) { + return tagPatternStatus.getStatus(); + } + + ReplicaSetTagMatch matcher(tagPatternStatus.getValue()); + for (size_t j = 0; j < _members.size(); ++j) { + const MemberConfig& memberConfig = _members[j]; + for (MemberConfig::TagIterator it = memberConfig.tagsBegin(); + it != memberConfig.tagsEnd(); + ++it) { + if (matcher.update(*it)) { + return Status::OK(); } } - // Even if all the nodes in the set had a given write it still would not satisfy this - // write concern mode. - return Status(ErrorCodes::CannotSatisfyWriteConcern, - str::stream() << "Not enough nodes match write concern mode \"" - << writeConcern.wMode << "\""); } - else { - int nodesRemaining = writeConcern.wNumNodes; - for (size_t j = 0; j < _members.size(); ++j) { - if (!_members[j].isArbiter()) { // Only count data-bearing nodes - --nodesRemaining; - if (nodesRemaining <= 0) { - return Status::OK(); - } + // Even if all the nodes in the set had a given write it still would not satisfy this + // write concern mode. 
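+ // (For example, a mode that requires more distinct tag values than the
+ // members of this set can ever supply.)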
+ return Status(ErrorCodes::CannotSatisfyWriteConcern, + str::stream() << "Not enough nodes match write concern mode \"" + << writeConcern.wMode << "\""); + } else { + int nodesRemaining = writeConcern.wNumNodes; + for (size_t j = 0; j < _members.size(); ++j) { + if (!_members[j].isArbiter()) { // Only count data-bearing nodes + --nodesRemaining; + if (nodesRemaining <= 0) { + return Status::OK(); } } - return Status(ErrorCodes::CannotSatisfyWriteConcern, "Not enough data-bearing nodes"); } + return Status(ErrorCodes::CannotSatisfyWriteConcern, "Not enough data-bearing nodes"); } +} - const MemberConfig& ReplicaSetConfig::getMemberAt(size_t i) const { - invariant(i < _members.size()); - return _members[i]; - } +const MemberConfig& ReplicaSetConfig::getMemberAt(size_t i) const { + invariant(i < _members.size()); + return _members[i]; +} - const MemberConfig* ReplicaSetConfig::findMemberByID(int id) const { - for (std::vector<MemberConfig>::const_iterator it = _members.begin(); - it != _members.end(); ++it) { - if (it->getId() == id) { - return &(*it); - } +const MemberConfig* ReplicaSetConfig::findMemberByID(int id) const { + for (std::vector<MemberConfig>::const_iterator it = _members.begin(); it != _members.end(); + ++it) { + if (it->getId() == id) { + return &(*it); } - return NULL; } + return NULL; +} - const int ReplicaSetConfig::findMemberIndexByHostAndPort(const HostAndPort& hap) const { - int x = 0; - for (std::vector<MemberConfig>::const_iterator it = _members.begin(); - it != _members.end(); ++it) { - - if (it->getHostAndPort() == hap) { - return x; - } - ++x; +const int ReplicaSetConfig::findMemberIndexByHostAndPort(const HostAndPort& hap) const { + int x = 0; + for (std::vector<MemberConfig>::const_iterator it = _members.begin(); it != _members.end(); + ++it) { + if (it->getHostAndPort() == hap) { + return x; } - return -1; + ++x; } - - const MemberConfig* ReplicaSetConfig::findMemberByHostAndPort(const HostAndPort& hap) const { - int idx = findMemberIndexByHostAndPort(hap); - return idx != -1 ? &getMemberAt(idx) : NULL; + return -1; +} + +const MemberConfig* ReplicaSetConfig::findMemberByHostAndPort(const HostAndPort& hap) const { + int idx = findMemberIndexByHostAndPort(hap); + return idx != -1 ? 
&getMemberAt(idx) : NULL; +} + +ReplicaSetTag ReplicaSetConfig::findTag(const StringData& key, const StringData& value) const { + return _tagConfig.findTag(key, value); +} + +StatusWith<ReplicaSetTagPattern> ReplicaSetConfig::findCustomWriteMode( + const StringData& patternName) const { + const StringMap<ReplicaSetTagPattern>::const_iterator iter = + _customWriteConcernModes.find(patternName); + if (iter == _customWriteConcernModes.end()) { + return StatusWith<ReplicaSetTagPattern>( + ErrorCodes::UnknownReplWriteConcern, + str::stream() << "No write concern mode named '" << escape(patternName.toString()) + << "' found in replica set configuration"); } - - ReplicaSetTag ReplicaSetConfig::findTag(const StringData& key, const StringData& value) const { - return _tagConfig.findTag(key, value); + return StatusWith<ReplicaSetTagPattern>(iter->second); +} + +void ReplicaSetConfig::_calculateMajorities() { + const int voters = std::count_if(_members.begin(), + _members.end(), + stdx::bind(&MemberConfig::isVoter, stdx::placeholders::_1)); + const int arbiters = + std::count_if(_members.begin(), + _members.end(), + stdx::bind(&MemberConfig::isArbiter, stdx::placeholders::_1)); + _totalVotingMembers = voters; + _majorityVoteCount = voters / 2 + 1; + _writeMajority = std::min(_majorityVoteCount, voters - arbiters); +} + +void ReplicaSetConfig::_addInternalWriteConcernModes() { + // $majority: the majority of voting nodes or all non-arbiter voting nodes if + // the majority of voting nodes are arbiters. + ReplicaSetTagPattern pattern = _tagConfig.makePattern(); + + Status status = _tagConfig.addTagCountConstraintToPattern( + &pattern, MemberConfig::kInternalVoterTagName, _writeMajority); + + if (status.isOK()) { + _customWriteConcernModes[kMajorityWriteConcernModeName] = pattern; + } else if (status != ErrorCodes::NoSuchKey) { + // NoSuchKey means we have no $voter-tagged nodes in this config; + // other errors are unexpected. 
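+ // fassert is a fatal assertion: it logs the failed status and terminates
+ // the process; 28693 is the unique assertion id.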
+ fassert(28693, status); } - StatusWith<ReplicaSetTagPattern> ReplicaSetConfig::findCustomWriteMode( - const StringData& patternName) const { - - const StringMap<ReplicaSetTagPattern>::const_iterator iter = _customWriteConcernModes.find( - patternName); - if (iter == _customWriteConcernModes.end()) { - return StatusWith<ReplicaSetTagPattern>( - ErrorCodes::UnknownReplWriteConcern, - str::stream() << - "No write concern mode named '" << escape(patternName.toString()) << - "' found in replica set configuration"); - } - return StatusWith<ReplicaSetTagPattern>(iter->second); + // $stepDownCheck: one electable node plus ourselves + pattern = _tagConfig.makePattern(); + status = _tagConfig.addTagCountConstraintToPattern( + &pattern, MemberConfig::kInternalElectableTagName, 2); + if (status.isOK()) { + _customWriteConcernModes[kStepDownCheckWriteConcernModeName] = pattern; + } else if (status != ErrorCodes::NoSuchKey) { + // NoSuchKey means we have no $electable-tagged nodes in this config; + // other errors are unexpected + fassert(28694, status); } +} - void ReplicaSetConfig::_calculateMajorities() { - const int voters = std::count_if( - _members.begin(), - _members.end(), - stdx::bind(&MemberConfig::isVoter, stdx::placeholders::_1)); - const int arbiters = std::count_if( - _members.begin(), - _members.end(), - stdx::bind(&MemberConfig::isArbiter, stdx::placeholders::_1)); - _totalVotingMembers = voters; - _majorityVoteCount = voters / 2 + 1; - _writeMajority = std::min(_majorityVoteCount, voters - arbiters); - } +BSONObj ReplicaSetConfig::toBSON() const { + BSONObjBuilder configBuilder; + configBuilder.append("_id", _replSetName); + configBuilder.appendIntOrLL("version", _version); - void ReplicaSetConfig::_addInternalWriteConcernModes() { - // $majority: the majority of voting nodes or all non-arbiter voting nodes if - // the majority of voting nodes are arbiters. - ReplicaSetTagPattern pattern = _tagConfig.makePattern(); - - Status status = _tagConfig.addTagCountConstraintToPattern( - &pattern, - MemberConfig::kInternalVoterTagName, - _writeMajority); - - if (status.isOK()) { - _customWriteConcernModes[kMajorityWriteConcernModeName] = pattern; - } - else if (status != ErrorCodes::NoSuchKey) { - // NoSuchKey means we have no $voter-tagged nodes in this config; - // other errors are unexpected. 
- fassert(28693, status); - } - - // $stepDownCheck: one electable node plus ourselves - pattern = _tagConfig.makePattern(); - status = _tagConfig.addTagCountConstraintToPattern(&pattern, - MemberConfig::kInternalElectableTagName, - 2); - if (status.isOK()) { - _customWriteConcernModes[kStepDownCheckWriteConcernModeName] = pattern; - } - else if (status != ErrorCodes::NoSuchKey) { - // NoSuchKey means we have no $electable-tagged nodes in this config; - // other errors are unexpected - fassert(28694, status); - } + BSONArrayBuilder members(configBuilder.subarrayStart("members")); + for (MemberIterator mem = membersBegin(); mem != membersEnd(); mem++) { + members.append(mem->toBSON(getTagConfig())); } - - BSONObj ReplicaSetConfig::toBSON() const { - BSONObjBuilder configBuilder; - configBuilder.append("_id", _replSetName); - configBuilder.appendIntOrLL("version", _version); - - BSONArrayBuilder members(configBuilder.subarrayStart("members")); - for (MemberIterator mem = membersBegin(); mem != membersEnd(); mem++) { - members.append(mem->toBSON(getTagConfig())); - } - members.done(); - - BSONObjBuilder settingsBuilder(configBuilder.subobjStart("settings")); - settingsBuilder.append("chainingAllowed", _chainingAllowed); - settingsBuilder.append("heartbeatTimeoutSecs", _heartbeatTimeoutPeriod.total_seconds()); - - BSONObjBuilder gleModes(settingsBuilder.subobjStart("getLastErrorModes")); - for (StringMap<ReplicaSetTagPattern>::const_iterator mode = - _customWriteConcernModes.begin(); - mode != _customWriteConcernModes.end(); - ++mode) { - if (mode->first[0] == '$') { - // Filter out internal modes - continue; - } - BSONObjBuilder modeBuilder(gleModes.subobjStart(mode->first)); - for (ReplicaSetTagPattern::ConstraintIterator itr = mode->second.constraintsBegin(); - itr != mode->second.constraintsEnd(); - itr++) { - modeBuilder.append(_tagConfig.getTagKey(ReplicaSetTag(itr->getKeyIndex(), 0)), - itr->getMinCount()); - } - modeBuilder.done(); - } - gleModes.done(); - - settingsBuilder.append("getLastErrorDefaults", _defaultWriteConcern.toBSON()); - settingsBuilder.done(); - return configBuilder.obj(); + members.done(); + + BSONObjBuilder settingsBuilder(configBuilder.subobjStart("settings")); + settingsBuilder.append("chainingAllowed", _chainingAllowed); + settingsBuilder.append("heartbeatTimeoutSecs", _heartbeatTimeoutPeriod.total_seconds()); + + BSONObjBuilder gleModes(settingsBuilder.subobjStart("getLastErrorModes")); + for (StringMap<ReplicaSetTagPattern>::const_iterator mode = _customWriteConcernModes.begin(); + mode != _customWriteConcernModes.end(); + ++mode) { + if (mode->first[0] == '$') { + // Filter out internal modes + continue; + } + BSONObjBuilder modeBuilder(gleModes.subobjStart(mode->first)); + for (ReplicaSetTagPattern::ConstraintIterator itr = mode->second.constraintsBegin(); + itr != mode->second.constraintsEnd(); + itr++) { + modeBuilder.append(_tagConfig.getTagKey(ReplicaSetTag(itr->getKeyIndex(), 0)), + itr->getMinCount()); + } + modeBuilder.done(); } - - std::vector<std::string> ReplicaSetConfig::getWriteConcernNames() const { - std::vector<std::string> names; - for (StringMap<ReplicaSetTagPattern>::const_iterator mode = - _customWriteConcernModes.begin(); - mode != _customWriteConcernModes.end(); - ++mode) { - names.push_back(mode->first); - } - return names; - } + gleModes.done(); + + settingsBuilder.append("getLastErrorDefaults", _defaultWriteConcern.toBSON()); + settingsBuilder.done(); + return configBuilder.obj(); +} + +std::vector<std::string> 
ReplicaSetConfig::getWriteConcernNames() const { + std::vector<std::string> names; + for (StringMap<ReplicaSetTagPattern>::const_iterator mode = _customWriteConcernModes.begin(); + mode != _customWriteConcernModes.end(); + ++mode) { + names.push_back(mode->first); + } + return names; +} } // namespace repl } // namespace mongo diff --git a/src/mongo/db/repl/replica_set_config.h b/src/mongo/db/repl/replica_set_config.h index 4baa96adbcf..2ff35b971db 100644 --- a/src/mongo/db/repl/replica_set_config.h +++ b/src/mongo/db/repl/replica_set_config.h @@ -41,208 +41,236 @@ namespace mongo { - class BSONObj; +class BSONObj; namespace repl { +/** + * Representation of the configuration information about a particular replica set. + */ +class ReplicaSetConfig { +public: + typedef std::vector<MemberConfig>::const_iterator MemberIterator; + + static const std::string kIdFieldName; + static const std::string kVersionFieldName; + static const std::string kMembersFieldName; + static const std::string kSettingsFieldName; + static const std::string kMajorityWriteConcernModeName; + static const std::string kStepDownCheckWriteConcernModeName; + + static const size_t kMaxMembers = 50; + static const size_t kMaxVotingMembers = 7; + static const Seconds kDefaultHeartbeatTimeoutPeriod; + + ReplicaSetConfig(); + std::string asBson() { + return ""; + } + /** + * Initializes this ReplicaSetConfig from the contents of "cfg". + */ + Status initialize(const BSONObj& cfg); + + /** + * Returns true if this object has been successfully initialized or copied from + * an initialized object. + */ + bool isInitialized() const { + return _isInitialized; + } + + /** + * Performs basic consistency checks on the replica set configuration. + */ + Status validate() const; + + /** + * Checks if this configuration can satisfy the given write concern. + * + * Things that are taken into consideration include: + * 1. If the set has enough data-bearing members. + * 2. If the write concern mode exists. + * 3. If there are enough members for the write concern mode specified. + */ + Status checkIfWriteConcernCanBeSatisfied(const WriteConcernOptions& writeConcern) const; + + /** + * Gets the version of this configuration. + * + * The version number sequences configurations of the replica set, so that + * nodes may distinguish between "older" and "newer" configurations. + */ + long long getConfigVersion() const { + return _version; + } + + /** + * Gets the name (_id field value) of the replica set described by this configuration. + */ + const std::string& getReplSetName() const { + return _replSetName; + } + + /** + * Gets the number of members in this configuration. + */ + int getNumMembers() const { + return _members.size(); + } + + /** + * Gets a begin iterator over the MemberConfigs stored in this ReplicaSetConfig. + */ + MemberIterator membersBegin() const { + return _members.begin(); + } + + /** + * Gets an end iterator over the MemberConfigs stored in this ReplicaSetConfig. + */ + MemberIterator membersEnd() const { + return _members.end(); + } + + /** + * Access a MemberConfig element by index. + */ + const MemberConfig& getMemberAt(size_t i) const; + + /** + * Returns a pointer to the MemberConfig corresponding to the member with the given _id in + * the config, or NULL if there is no member with that ID. + */ + const MemberConfig* findMemberByID(int id) const; + + /** + * Returns a pointer to the MemberConfig corresponding to the member with the given + * HostAndPort in the config, or NULL if there is no member with that address. 
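+ *
+ * Illustrative use (hypothetical host name): passing
+ * HostAndPort("h1:27017") returns NULL unless some member's host field
+ * is exactly "h1:27017".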
+ */ + const MemberConfig* findMemberByHostAndPort(const HostAndPort& hap) const; + + /** + * Returns a MemberConfig index position corresponding to the member with the given + * HostAndPort in the config, or -1 if there is no member with that address. + */ + const int findMemberIndexByHostAndPort(const HostAndPort& hap) const; + + /** + * Gets the default write concern for the replica set described by this configuration. + */ + const WriteConcernOptions& getDefaultWriteConcern() const { + return _defaultWriteConcern; + } + + /** + * Gets the amount of time to wait for a response to hearbeats sent to other + * nodes in the replica set. + */ + Seconds getHeartbeatTimeoutPeriod() const { + return _heartbeatTimeoutPeriod; + } + + /** + * Gets the amount of time to wait for a response to hearbeats sent to other + * nodes in the replica set, as above, but returns a Milliseconds instead of + * Seconds object. + */ + Milliseconds getHeartbeatTimeoutPeriodMillis() const { + return Milliseconds(_heartbeatTimeoutPeriod.total_milliseconds()); + } + + /** + * Gets the number of votes required to win an election. + */ + int getMajorityVoteCount() const { + return _majorityVoteCount; + } + + /** + * Gets the number of voters. + */ + int getTotalVotingMembers() const { + return _totalVotingMembers; + } + + /** + * Returns true if automatic (not explicitly set) chaining is allowed. + */ + bool isChainingAllowed() const { + return _chainingAllowed; + } + + /** + * Returns a ReplicaSetTag with the given "key" and "value", or an invalid + * tag if the configuration describes no such tag. + */ + ReplicaSetTag findTag(const StringData& key, const StringData& value) const; + + /** + * Returns the pattern corresponding to "patternName" in this configuration. + * If "patternName" is not a valid pattern in this configuration, returns + * ErrorCodes::NoSuchKey. + */ + StatusWith<ReplicaSetTagPattern> findCustomWriteMode(const StringData& patternName) const; + + /** + * Returns the "tags configuration" for this replicaset. + * + * NOTE(schwerin): Not clear if this should be used other than for reporting/debugging. + */ + const ReplicaSetTagConfig& getTagConfig() const { + return _tagConfig; + } + + /** + * Returns the config as a BSONObj. + */ + BSONObj toBSON() const; + + /** + * Returns a vector of strings which are the names of the WriteConcernModes. + * Currently used in unit tests to compare two configs. + */ + std::vector<std::string> getWriteConcernNames() const; + /** - * Representation of the configuration information about a particular replica set. - */ - class ReplicaSetConfig { - public: - typedef std::vector<MemberConfig>::const_iterator MemberIterator; - - static const std::string kIdFieldName; - static const std::string kVersionFieldName; - static const std::string kMembersFieldName; - static const std::string kSettingsFieldName; - static const std::string kMajorityWriteConcernModeName; - static const std::string kStepDownCheckWriteConcernModeName; - - static const size_t kMaxMembers = 50; - static const size_t kMaxVotingMembers = 7; - static const Seconds kDefaultHeartbeatTimeoutPeriod; - - ReplicaSetConfig(); - std::string asBson() { return ""; } - /** - * Initializes this ReplicaSetConfig from the contents of "cfg". - */ - Status initialize(const BSONObj& cfg); - - /** - * Returns true if this object has been successfully initialized or copied from - * an initialized object. 
- */ - bool isInitialized() const { return _isInitialized; } - - /** - * Performs basic consistency checks on the replica set configuration. - */ - Status validate() const; - - /** - * Checks if this configuration can satisfy the given write concern. - * - * Things that are taken into consideration include: - * 1. If the set has enough data-bearing members. - * 2. If the write concern mode exists. - * 3. If there are enough members for the write concern mode specified. - */ - Status checkIfWriteConcernCanBeSatisfied(const WriteConcernOptions& writeConcern) const; - - /** - * Gets the version of this configuration. - * - * The version number sequences configurations of the replica set, so that - * nodes may distinguish between "older" and "newer" configurations. - */ - long long getConfigVersion() const { return _version; } - - /** - * Gets the name (_id field value) of the replica set described by this configuration. - */ - const std::string& getReplSetName() const { return _replSetName; } - - /** - * Gets the number of members in this configuration. - */ - int getNumMembers() const { return _members.size(); } - - /** - * Gets a begin iterator over the MemberConfigs stored in this ReplicaSetConfig. - */ - MemberIterator membersBegin() const { return _members.begin(); } - - /** - * Gets an end iterator over the MemberConfigs stored in this ReplicaSetConfig. - */ - MemberIterator membersEnd() const { return _members.end(); } - - /** - * Access a MemberConfig element by index. - */ - const MemberConfig& getMemberAt(size_t i) const; - - /** - * Returns a pointer to the MemberConfig corresponding to the member with the given _id in - * the config, or NULL if there is no member with that ID. - */ - const MemberConfig* findMemberByID(int id) const; - - /** - * Returns a pointer to the MemberConfig corresponding to the member with the given - * HostAndPort in the config, or NULL if there is no member with that address. - */ - const MemberConfig* findMemberByHostAndPort(const HostAndPort& hap) const; - - /** - * Returns a MemberConfig index position corresponding to the member with the given - * HostAndPort in the config, or -1 if there is no member with that address. - */ - const int findMemberIndexByHostAndPort(const HostAndPort& hap) const; - - /** - * Gets the default write concern for the replica set described by this configuration. - */ - const WriteConcernOptions& getDefaultWriteConcern() const { return _defaultWriteConcern; } - - /** - * Gets the amount of time to wait for a response to hearbeats sent to other - * nodes in the replica set. - */ - Seconds getHeartbeatTimeoutPeriod() const { return _heartbeatTimeoutPeriod; } - - /** - * Gets the amount of time to wait for a response to hearbeats sent to other - * nodes in the replica set, as above, but returns a Milliseconds instead of - * Seconds object. - */ - Milliseconds getHeartbeatTimeoutPeriodMillis() const { - return Milliseconds(_heartbeatTimeoutPeriod.total_milliseconds()); - } - - /** - * Gets the number of votes required to win an election. - */ - int getMajorityVoteCount() const { return _majorityVoteCount; } - - /** - * Gets the number of voters. - */ - int getTotalVotingMembers() const { return _totalVotingMembers; } - - /** - * Returns true if automatic (not explicitly set) chaining is allowed. - */ - bool isChainingAllowed() const { return _chainingAllowed; } - - /** - * Returns a ReplicaSetTag with the given "key" and "value", or an invalid - * tag if the configuration describes no such tag. 
- */ - ReplicaSetTag findTag(const StringData& key, const StringData& value) const; - - /** - * Returns the pattern corresponding to "patternName" in this configuration. - * If "patternName" is not a valid pattern in this configuration, returns - * ErrorCodes::NoSuchKey. - */ - StatusWith<ReplicaSetTagPattern> findCustomWriteMode(const StringData& patternName) const; - - /** - * Returns the "tags configuration" for this replicaset. - * - * NOTE(schwerin): Not clear if this should be used other than for reporting/debugging. - */ - const ReplicaSetTagConfig& getTagConfig() const { return _tagConfig; } - - /** - * Returns the config as a BSONObj. - */ - BSONObj toBSON() const; - - /** - * Returns a vector of strings which are the names of the WriteConcernModes. - * Currently used in unit tests to compare two configs. - */ - std::vector<std::string> getWriteConcernNames() const; - - /** - * Returns the number of voting data-bearing members that must acknowledge a write - * in order to satisfy a write concern of {w: "majority"}. - */ - int getWriteMajority() const { return _writeMajority; } - - private: - /** - * Parses the "settings" subdocument of a replica set configuration. - */ - Status _parseSettingsSubdocument(const BSONObj& settings); - - /** - * Calculates and stores the majority for electing a primary (_majorityVoteCount). - */ - void _calculateMajorities(); - - /** - * Adds internal write concern modes to the getLastErrorModes list. - */ - void _addInternalWriteConcernModes(); - - bool _isInitialized; - long long _version; - std::string _replSetName; - std::vector<MemberConfig> _members; - WriteConcernOptions _defaultWriteConcern; - Seconds _heartbeatTimeoutPeriod; - bool _chainingAllowed; - int _majorityVoteCount; - int _writeMajority; - int _totalVotingMembers; - ReplicaSetTagConfig _tagConfig; - StringMap<ReplicaSetTagPattern> _customWriteConcernModes; - }; + * Returns the number of voting data-bearing members that must acknowledge a write + * in order to satisfy a write concern of {w: "majority"}. + */ + int getWriteMajority() const { + return _writeMajority; + } + +private: + /** + * Parses the "settings" subdocument of a replica set configuration. + */ + Status _parseSettingsSubdocument(const BSONObj& settings); + + /** + * Calculates and stores the majority for electing a primary (_majorityVoteCount). + */ + void _calculateMajorities(); + + /** + * Adds internal write concern modes to the getLastErrorModes list. + */ + void _addInternalWriteConcernModes(); + + bool _isInitialized; + long long _version; + std::string _replSetName; + std::vector<MemberConfig> _members; + WriteConcernOptions _defaultWriteConcern; + Seconds _heartbeatTimeoutPeriod; + bool _chainingAllowed; + int _majorityVoteCount; + int _writeMajority; + int _totalVotingMembers; + ReplicaSetTagConfig _tagConfig; + StringMap<ReplicaSetTagPattern> _customWriteConcernModes; +}; } // namespace repl diff --git a/src/mongo/db/repl/replica_set_config_checks.cpp b/src/mongo/db/repl/replica_set_config_checks.cpp index 7b97d3679a3..6b972063c6a 100644 --- a/src/mongo/db/repl/replica_set_config_checks.cpp +++ b/src/mongo/db/repl/replica_set_config_checks.cpp @@ -40,247 +40,234 @@ namespace mongo { namespace repl { namespace { - /** - * Finds the index of the one member configuration in "newConfig" that corresponds - * to the current node (as identified by "externalState"). - * - * Returns an error if the current node does not appear or appears multiple times in - * "newConfig". 
- */ - StatusWith<int> findSelfInConfig( - ReplicationCoordinatorExternalState* externalState, - const ReplicaSetConfig& newConfig) { - - std::vector<ReplicaSetConfig::MemberIterator> meConfigs; - for (ReplicaSetConfig::MemberIterator iter = newConfig.membersBegin(); - iter != newConfig.membersEnd(); - ++iter) { - if (externalState->isSelf(iter->getHostAndPort())) { - meConfigs.push_back(iter); - } - } - if (meConfigs.empty()) { - return StatusWith<int>(ErrorCodes::NodeNotFound, str::stream() << - "No host described in new configuration " << - newConfig.getConfigVersion() << " for replica set " << - newConfig.getReplSetName() << " maps to this node"); - } - if (meConfigs.size() > 1) { - str::stream message; - message << "The hosts " << meConfigs.front()->getHostAndPort().toString(); - for (size_t i = 1; i < meConfigs.size() - 1; ++i) { - message << ", " << meConfigs[i]->getHostAndPort().toString(); - } - message << " and " << meConfigs.back()->getHostAndPort().toString() << - " all map to this node in new configuration version " << - newConfig.getConfigVersion() << " for replica set " << newConfig.getReplSetName(); - return StatusWith<int>(ErrorCodes::DuplicateKey, message); +/** + * Finds the index of the one member configuration in "newConfig" that corresponds + * to the current node (as identified by "externalState"). + * + * Returns an error if the current node does not appear or appears multiple times in + * "newConfig". + */ +StatusWith<int> findSelfInConfig(ReplicationCoordinatorExternalState* externalState, + const ReplicaSetConfig& newConfig) { + std::vector<ReplicaSetConfig::MemberIterator> meConfigs; + for (ReplicaSetConfig::MemberIterator iter = newConfig.membersBegin(); + iter != newConfig.membersEnd(); + ++iter) { + if (externalState->isSelf(iter->getHostAndPort())) { + meConfigs.push_back(iter); } - - int myIndex = std::distance(newConfig.membersBegin(), meConfigs.front()); - invariant(myIndex >= 0 && myIndex < newConfig.getNumMembers()); - return StatusWith<int>(myIndex); } - - /** - * Checks if the node with the given config index is electable, returning a useful - * status message if not. - */ - Status checkElectable(const ReplicaSetConfig& newConfig, int configIndex) { - const MemberConfig& myConfig = newConfig.getMemberAt(configIndex); - if (!myConfig.isElectable()) { - return Status( - ErrorCodes::NodeNotElectable, str::stream() << - "This node, " << myConfig.getHostAndPort().toString() << ", with _id " << - myConfig.getId() << " is not electable under the new configuration version " << - newConfig.getConfigVersion() << " for replica set " << - newConfig.getReplSetName()); - } - return Status::OK(); + if (meConfigs.empty()) { + return StatusWith<int>(ErrorCodes::NodeNotFound, + str::stream() << "No host described in new configuration " + << newConfig.getConfigVersion() << " for replica set " + << newConfig.getReplSetName() << " maps to this node"); } - - /** - * Like findSelfInConfig, above, but also returns an error if the member configuration - * for this node is not electable, as this is a requirement for nodes accepting - * reconfig or initiate commands. 
- */ - StatusWith<int> findSelfInConfigIfElectable( - ReplicationCoordinatorExternalState* externalState, - const ReplicaSetConfig& newConfig) { - StatusWith<int> result = findSelfInConfig(externalState, newConfig); - if (result.isOK()) { - Status status = checkElectable(newConfig, result.getValue()); - if (!status.isOK()) { - return StatusWith<int>(status); - } + if (meConfigs.size() > 1) { + str::stream message; + message << "The hosts " << meConfigs.front()->getHostAndPort().toString(); + for (size_t i = 1; i < meConfigs.size() - 1; ++i) { + message << ", " << meConfigs[i]->getHostAndPort().toString(); } - return result; + message << " and " << meConfigs.back()->getHostAndPort().toString() + << " all map to this node in new configuration version " + << newConfig.getConfigVersion() << " for replica set " + << newConfig.getReplSetName(); + return StatusWith<int>(ErrorCodes::DuplicateKey, message); } - /** - * Compares two initialized and validated replica set configurations, and checks to - * see if "newConfig" is a legal successor configuration to "oldConfig". - * - * Returns Status::OK() if "newConfig" may replace "oldConfig", or an indicative error - * otherwise. - * - * The checks performed by this test are necessary, but may not be sufficient for - * ensuring that "newConfig" is a legal successor to "oldConfig". For example, - * a legal reconfiguration must typically be executed on a node that is currently - * primary under "oldConfig" and is electable under "newConfig". Such checks that - * require knowledge of which node is executing the configuration are out of scope - * for this function. - */ - Status validateOldAndNewConfigsCompatible( - const ReplicaSetConfig& oldConfig, - const ReplicaSetConfig& newConfig) { - invariant(newConfig.isInitialized()); - invariant(oldConfig.isInitialized()); - - if (oldConfig.getConfigVersion() >= newConfig.getConfigVersion()) { - return Status(ErrorCodes::NewReplicaSetConfigurationIncompatible, - str::stream() << - "New replica set configuration version must be greater than old, but " << - newConfig.getConfigVersion() << " is not greater than " << - oldConfig.getConfigVersion() << " for replica set " << - newConfig.getReplSetName()); - } + int myIndex = std::distance(newConfig.membersBegin(), meConfigs.front()); + invariant(myIndex >= 0 && myIndex < newConfig.getNumMembers()); + return StatusWith<int>(myIndex); +} - if (oldConfig.getReplSetName() != newConfig.getReplSetName()) { - return Status(ErrorCodes::NewReplicaSetConfigurationIncompatible, - str::stream() << - "New and old configurations differ in replica set name; " - "old was " << oldConfig.getReplSetName() << ", and new is " << - newConfig.getReplSetName()); - } - - // - // For every member config mNew in newConfig, if there exists member config mOld - // in oldConfig such that mNew.getHostAndPort() == mOld.getHostAndPort(), it is required - // that mNew.getId() == mOld.getId(). - // - // Also, one may not use reconfig to change the value of the buildIndexes or - // arbiterOnly flags. 
- // - for (ReplicaSetConfig::MemberIterator mNew = newConfig.membersBegin(); - mNew != newConfig.membersEnd(); - ++mNew) { - for (ReplicaSetConfig::MemberIterator mOld = oldConfig.membersBegin(); - mOld != oldConfig.membersEnd(); - ++mOld) { - - const bool idsEqual = mOld->getId() == mNew->getId(); - const bool hostsEqual = mOld->getHostAndPort() == mNew->getHostAndPort(); - if (!idsEqual && !hostsEqual) { - continue; - } - if (hostsEqual && !idsEqual) { - return Status(ErrorCodes::NewReplicaSetConfigurationIncompatible, - str::stream() << - "New and old configurations both have members with " << - MemberConfig::kHostFieldName << " of " << - mOld->getHostAndPort().toString() << - " but in the new configuration the " << - MemberConfig::kIdFieldName << " field is " << - mNew->getId() << " and in the old configuration it is " << - mOld->getId() << - " for replica set " << newConfig.getReplSetName()); - } - // At this point, the _id and host fields are equal, so we're looking at the old and - // new configurations for the same member node. - const bool buildIndexesFlagsEqual = - mOld->shouldBuildIndexes() == mNew->shouldBuildIndexes(); - if (!buildIndexesFlagsEqual) { - return Status(ErrorCodes::NewReplicaSetConfigurationIncompatible, - str::stream() << - "New and old configurations differ in the setting of the " - "buildIndexes field for member " << - mOld->getHostAndPort().toString() << - "; to make this change, remove then re-add the member"); - } - const bool arbiterFlagsEqual = mOld->isArbiter() == mNew->isArbiter(); - if (!arbiterFlagsEqual) { - return Status(ErrorCodes::NewReplicaSetConfigurationIncompatible, - str::stream() << - "New and old configurations differ in the setting of the " - "arbiterOnly field for member " << - mOld->getHostAndPort().toString() << - "; to make this change, remove then re-add the member"); - - } - } - } - return Status::OK(); +/** + * Checks if the node with the given config index is electable, returning a useful + * status message if not. + */ +Status checkElectable(const ReplicaSetConfig& newConfig, int configIndex) { + const MemberConfig& myConfig = newConfig.getMemberAt(configIndex); + if (!myConfig.isElectable()) { + return Status(ErrorCodes::NodeNotElectable, + str::stream() << "This node, " << myConfig.getHostAndPort().toString() + << ", with _id " << myConfig.getId() + << " is not electable under the new configuration version " + << newConfig.getConfigVersion() << " for replica set " + << newConfig.getReplSetName()); } -} // namespace + return Status::OK(); +} - StatusWith<int> validateConfigForStartUp( - ReplicationCoordinatorExternalState* externalState, - const ReplicaSetConfig& oldConfig, - const ReplicaSetConfig& newConfig) { - Status status = newConfig.validate(); +/** + * Like findSelfInConfig, above, but also returns an error if the member configuration + * for this node is not electable, as this is a requirement for nodes accepting + * reconfig or initiate commands. 
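+ * (A node is electable only when it is not an arbiter and has a
+ * priority greater than 0.)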
+ */ +StatusWith<int> findSelfInConfigIfElectable(ReplicationCoordinatorExternalState* externalState, + const ReplicaSetConfig& newConfig) { + StatusWith<int> result = findSelfInConfig(externalState, newConfig); + if (result.isOK()) { + Status status = checkElectable(newConfig, result.getValue()); if (!status.isOK()) { return StatusWith<int>(status); } - if (oldConfig.isInitialized()) { - status = validateOldAndNewConfigsCompatible(oldConfig, newConfig); - if (!status.isOK()) { - return StatusWith<int>(status); - } - } - return findSelfInConfig(externalState, newConfig); } + return result; +} - StatusWith<int> validateConfigForInitiate( - ReplicationCoordinatorExternalState* externalState, - const ReplicaSetConfig& newConfig) { - Status status = newConfig.validate(); - if (!status.isOK()) { - return StatusWith<int>(status); - } - if (newConfig.getConfigVersion() != 1) { - return StatusWith<int>( - ErrorCodes::NewReplicaSetConfigurationIncompatible, - str::stream() << "Configuration used to initiate a replica set must " << - " have version 1, but found " << newConfig.getConfigVersion()); - } - return findSelfInConfigIfElectable(externalState, newConfig); +/** + * Compares two initialized and validated replica set configurations, and checks to + * see if "newConfig" is a legal successor configuration to "oldConfig". + * + * Returns Status::OK() if "newConfig" may replace "oldConfig", or an indicative error + * otherwise. + * + * The checks performed by this test are necessary, but may not be sufficient for + * ensuring that "newConfig" is a legal successor to "oldConfig". For example, + * a legal reconfiguration must typically be executed on a node that is currently + * primary under "oldConfig" and is electable under "newConfig". Such checks that + * require knowledge of which node is executing the configuration are out of scope + * for this function. + */ +Status validateOldAndNewConfigsCompatible(const ReplicaSetConfig& oldConfig, + const ReplicaSetConfig& newConfig) { + invariant(newConfig.isInitialized()); + invariant(oldConfig.isInitialized()); + + if (oldConfig.getConfigVersion() >= newConfig.getConfigVersion()) { + return Status(ErrorCodes::NewReplicaSetConfigurationIncompatible, + str::stream() + << "New replica set configuration version must be greater than old, but " + << newConfig.getConfigVersion() << " is not greater than " + << oldConfig.getConfigVersion() << " for replica set " + << newConfig.getReplSetName()); } - StatusWith<int> validateConfigForReconfig( - ReplicationCoordinatorExternalState* externalState, - const ReplicaSetConfig& oldConfig, - const ReplicaSetConfig& newConfig, - bool force) { + if (oldConfig.getReplSetName() != newConfig.getReplSetName()) { + return Status(ErrorCodes::NewReplicaSetConfigurationIncompatible, + str::stream() << "New and old configurations differ in replica set name; " + "old was " << oldConfig.getReplSetName() << ", and new is " + << newConfig.getReplSetName()); + } - Status status = newConfig.validate(); - if (!status.isOK()) { - return StatusWith<int>(status); + // + // For every member config mNew in newConfig, if there exists member config mOld + // in oldConfig such that mNew.getHostAndPort() == mOld.getHostAndPort(), it is required + // that mNew.getId() == mOld.getId(). + // + // Also, one may not use reconfig to change the value of the buildIndexes or + // arbiterOnly flags. 
+ // + for (ReplicaSetConfig::MemberIterator mNew = newConfig.membersBegin(); + mNew != newConfig.membersEnd(); + ++mNew) { + for (ReplicaSetConfig::MemberIterator mOld = oldConfig.membersBegin(); + mOld != oldConfig.membersEnd(); + ++mOld) { + const bool idsEqual = mOld->getId() == mNew->getId(); + const bool hostsEqual = mOld->getHostAndPort() == mNew->getHostAndPort(); + if (!idsEqual && !hostsEqual) { + continue; + } + if (hostsEqual && !idsEqual) { + return Status(ErrorCodes::NewReplicaSetConfigurationIncompatible, + str::stream() + << "New and old configurations both have members with " + << MemberConfig::kHostFieldName << " of " + << mOld->getHostAndPort().toString() + << " but in the new configuration the " + << MemberConfig::kIdFieldName << " field is " << mNew->getId() + << " and in the old configuration it is " << mOld->getId() + << " for replica set " << newConfig.getReplSetName()); + } + // At this point, the _id and host fields are equal, so we're looking at the old and + // new configurations for the same member node. + const bool buildIndexesFlagsEqual = + mOld->shouldBuildIndexes() == mNew->shouldBuildIndexes(); + if (!buildIndexesFlagsEqual) { + return Status(ErrorCodes::NewReplicaSetConfigurationIncompatible, + str::stream() + << "New and old configurations differ in the setting of the " + "buildIndexes field for member " + << mOld->getHostAndPort().toString() + << "; to make this change, remove then re-add the member"); + } + const bool arbiterFlagsEqual = mOld->isArbiter() == mNew->isArbiter(); + if (!arbiterFlagsEqual) { + return Status(ErrorCodes::NewReplicaSetConfigurationIncompatible, + str::stream() + << "New and old configurations differ in the setting of the " + "arbiterOnly field for member " + << mOld->getHostAndPort().toString() + << "; to make this change, remove then re-add the member"); + } } + } + return Status::OK(); +} +} // namespace +StatusWith<int> validateConfigForStartUp(ReplicationCoordinatorExternalState* externalState, + const ReplicaSetConfig& oldConfig, + const ReplicaSetConfig& newConfig) { + Status status = newConfig.validate(); + if (!status.isOK()) { + return StatusWith<int>(status); + } + if (oldConfig.isInitialized()) { status = validateOldAndNewConfigsCompatible(oldConfig, newConfig); if (!status.isOK()) { return StatusWith<int>(status); } + } + return findSelfInConfig(externalState, newConfig); +} - if (force) { - return findSelfInConfig(externalState, newConfig); - } - - return findSelfInConfigIfElectable(externalState, newConfig); +StatusWith<int> validateConfigForInitiate(ReplicationCoordinatorExternalState* externalState, + const ReplicaSetConfig& newConfig) { + Status status = newConfig.validate(); + if (!status.isOK()) { + return StatusWith<int>(status); + } + if (newConfig.getConfigVersion() != 1) { + return StatusWith<int>(ErrorCodes::NewReplicaSetConfigurationIncompatible, + str::stream() << "Configuration used to initiate a replica set must " + << " have version 1, but found " + << newConfig.getConfigVersion()); } + return findSelfInConfigIfElectable(externalState, newConfig); +} - StatusWith<int> validateConfigForHeartbeatReconfig( - ReplicationCoordinatorExternalState* externalState, - const ReplicaSetConfig& newConfig) { +StatusWith<int> validateConfigForReconfig(ReplicationCoordinatorExternalState* externalState, + const ReplicaSetConfig& oldConfig, + const ReplicaSetConfig& newConfig, + bool force) { + Status status = newConfig.validate(); + if (!status.isOK()) { + return StatusWith<int>(status); + } - Status status = 
newConfig.validate();
-        if (!status.isOK()) {
-            return StatusWith<int>(status);
-        }
+    status = validateOldAndNewConfigsCompatible(oldConfig, newConfig);
+    if (!status.isOK()) {
+        return StatusWith<int>(status);
+    }
+    if (force) {
         return findSelfInConfig(externalState, newConfig);
     }
+    return findSelfInConfigIfElectable(externalState, newConfig);
+}
+
+StatusWith<int> validateConfigForHeartbeatReconfig(
+    ReplicationCoordinatorExternalState* externalState, const ReplicaSetConfig& newConfig) {
+    Status status = newConfig.validate();
+    if (!status.isOK()) {
+        return StatusWith<int>(status);
+    }
+
+    return findSelfInConfig(externalState, newConfig);
+}
+
 }  // namespace repl
 }  // namespace mongo
diff --git a/src/mongo/db/repl/replica_set_config_checks.h b/src/mongo/db/repl/replica_set_config_checks.h
index ba7ad90f3fc..adeb4758093 100644
--- a/src/mongo/db/repl/replica_set_config_checks.h
+++ b/src/mongo/db/repl/replica_set_config_checks.h
@@ -33,61 +33,57 @@
 namespace mongo {
 namespace repl {
-    class ReplicationCoordinatorExternalState;
-    class ReplicaSetConfig;
+class ReplicationCoordinatorExternalState;
+class ReplicaSetConfig;
-    /**
-     * Validates that "newConfig" is a legal configuration that the current
-     * node can accept from its local storage during startup.
-     *
-     * Returns the index of the current node's member configuration in "newConfig",
-     * on success, and an indicative error on failure.
-     *
-     * If "oldConfig" is valid, this method only succeds if "newConfig" is a legal
-     * successor configuration.
-     */
-    StatusWith<int> validateConfigForStartUp(
-            ReplicationCoordinatorExternalState* externalState,
-            const ReplicaSetConfig& oldConfig,
-            const ReplicaSetConfig& newConfig);
+/**
+ * Validates that "newConfig" is a legal configuration that the current
+ * node can accept from its local storage during startup.
+ *
+ * Returns the index of the current node's member configuration in "newConfig",
+ * on success, and an indicative error on failure.
+ *
+ * If "oldConfig" is valid, this method only succeeds if "newConfig" is a legal
+ * successor configuration.
+ */
+StatusWith<int> validateConfigForStartUp(ReplicationCoordinatorExternalState* externalState,
+                                         const ReplicaSetConfig& oldConfig,
+                                         const ReplicaSetConfig& newConfig);
-    /**
-     * Validates that "newConfig" is a legal initial configuration that can be
-     * initiated by the current node (identified via "externalState").
-     *
-     * Returns the index of the current node's member configuration in "newConfig",
-     * on success, and an indicative error on failure.
-     */
-    StatusWith<int> validateConfigForInitiate(
-            ReplicationCoordinatorExternalState* externalState,
-            const ReplicaSetConfig& newConfig);
+/**
+ * Validates that "newConfig" is a legal initial configuration that can be
+ * initiated by the current node (identified via "externalState").
+ *
+ * Returns the index of the current node's member configuration in "newConfig",
+ * on success, and an indicative error on failure.
+ */
+StatusWith<int> validateConfigForInitiate(ReplicationCoordinatorExternalState* externalState,
+                                          const ReplicaSetConfig& newConfig);
-    /**
-     * Validates that "newConfig" is a legal successor configuration to "oldConfig" that can be
-     * initiated by the current node (identified via "externalState").
-     *
-     * If "force" is set to true, then compatibility with the old configuration and electability of
-     * the current node in "newConfig" are not considered when determining if the reconfig is valid.
-     *
-     * Returns the index of the current node's member configuration in "newConfig",
-     * on success, and an indicative error on failure.
-     */
-    StatusWith<int> validateConfigForReconfig(
-            ReplicationCoordinatorExternalState* externalState,
-            const ReplicaSetConfig& oldConfig,
-            const ReplicaSetConfig& newConfig,
-            bool force);
+/**
+ * Validates that "newConfig" is a legal successor configuration to "oldConfig" that can be
+ * initiated by the current node (identified via "externalState").
+ *
+ * If "force" is set to true, then the electability of the current node in "newConfig" is not
+ * considered when determining if the reconfig is valid; "newConfig" must still be valid in
+ * itself and a compatible successor to "oldConfig".
+ *
+ * Returns the index of the current node's member configuration in "newConfig",
+ * on success, and an indicative error on failure.
+ */
+StatusWith<int> validateConfigForReconfig(ReplicationCoordinatorExternalState* externalState,
+                                          const ReplicaSetConfig& oldConfig,
+                                          const ReplicaSetConfig& newConfig,
+                                          bool force);
-    /**
-     * Validates that "newConfig" is an acceptable configuration when received in a heartbeat
-     * reasponse.
-     *
-     * If the new configuration omits the current node, but is otherwise valid, returns
-     * ErrorCodes::NodeNotFound. If the configuration is wholly valid, returns Status::OK().
-     * Otherwise, returns some other error status.
-     */
-    StatusWith<int> validateConfigForHeartbeatReconfig(
-            ReplicationCoordinatorExternalState* externalState,
-            const ReplicaSetConfig& newConfig);
+/**
+ * Validates that "newConfig" is an acceptable configuration when received in a heartbeat
+ * response.
+ *
+ * If the new configuration omits the current node, but is otherwise valid, returns
+ * ErrorCodes::NodeNotFound. If the configuration is wholly valid, returns Status::OK().
+ * Otherwise, returns some other error status.
+ */ +StatusWith<int> validateConfigForHeartbeatReconfig( + ReplicationCoordinatorExternalState* externalState, const ReplicaSetConfig& newConfig); } // namespace repl } // namespace mongo diff --git a/src/mongo/db/repl/replica_set_config_checks_test.cpp b/src/mongo/db/repl/replica_set_config_checks_test.cpp index efb39f5e0fa..d495421689d 100644 --- a/src/mongo/db/repl/replica_set_config_checks_test.cpp +++ b/src/mongo/db/repl/replica_set_config_checks_test.cpp @@ -40,660 +40,679 @@ namespace mongo { namespace repl { namespace { - TEST(ValidateConfigForInitiate, VersionMustBe1) { - ReplicationCoordinatorExternalStateMock rses; - rses.addSelf(HostAndPort("h1")); - - ReplicaSetConfig config; - ASSERT_OK(config.initialize(BSON("_id" << "rs0" << - "version" << 2 << - "members" << BSON_ARRAY( - BSON("_id" << 1 << "host" << "h1"))))); - ASSERT_EQUALS( - ErrorCodes::NewReplicaSetConfigurationIncompatible, - validateConfigForInitiate(&rses, config).getStatus()); - } - - TEST(ValidateConfigForInitiate, MustFindSelf) { - ReplicaSetConfig config; - ASSERT_OK(config.initialize(BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 1 << "host" << "h1") << - BSON("_id" << 2 << "host" << "h2") << - BSON("_id" << 3 << "host" << "h3"))))); - ReplicationCoordinatorExternalStateMock notPresentExternalState; - ReplicationCoordinatorExternalStateMock presentOnceExternalState; - presentOnceExternalState.addSelf(HostAndPort("h2")); - ReplicationCoordinatorExternalStateMock presentTwiceExternalState; - presentTwiceExternalState.addSelf(HostAndPort("h3")); - presentTwiceExternalState.addSelf(HostAndPort("h1")); - - ASSERT_EQUALS(ErrorCodes::NodeNotFound, - validateConfigForInitiate(¬PresentExternalState, config).getStatus()); - ASSERT_EQUALS(ErrorCodes::DuplicateKey, - validateConfigForInitiate(&presentTwiceExternalState, config).getStatus()); - ASSERT_EQUALS(1, unittest::assertGet(validateConfigForInitiate(&presentOnceExternalState, - config))); - } - - TEST(ValidateConfigForInitiate, SelfMustBeElectable) { - ReplicaSetConfig config; - ASSERT_OK(config.initialize(BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 1 << "host" << "h1") << - BSON("_id" << 2 << "host" << "h2" << - "priority" << 0) << - BSON("_id" << 3 << "host" << "h3"))))); - ReplicationCoordinatorExternalStateMock presentOnceExternalState; - presentOnceExternalState.addSelf(HostAndPort("h2")); - - ASSERT_EQUALS(ErrorCodes::NodeNotElectable, - validateConfigForInitiate(&presentOnceExternalState, config).getStatus()); - } - - TEST(ValidateConfigForReconfig, NewConfigVersionNumberMustBeHigherThanOld) { - ReplicationCoordinatorExternalStateMock externalState; - externalState.addSelf(HostAndPort("h1")); - - ReplicaSetConfig oldConfig; - ReplicaSetConfig newConfig; - - // Two configurations, identical except for version. - ASSERT_OK(oldConfig.initialize( - BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 1 << "host" << "h1") << - BSON("_id" << 2 << "host" << "h2") << - BSON("_id" << 3 << "host" << "h3"))))); - - ASSERT_OK(newConfig.initialize( - BSON("_id" << "rs0" << - "version" << 3 << - "members" << BSON_ARRAY( - BSON("_id" << 1 << "host" << "h1") << - BSON("_id" << 2 << "host" << "h2") << - BSON("_id" << 3 << "host" << "h3"))))); - - ASSERT_OK(oldConfig.validate()); - ASSERT_OK(newConfig.validate()); - - // Can reconfig from old to new. 
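The assertions just below pin down the version rule that ordinary and forced reconfigs both obey: the new configuration version must be strictly greater than the old one. A minimal sketch of that guard, assuming only the ReplicaSetConfig::getConfigVersion() accessor used throughout this diff:

    // Sketch of the version precondition exercised by these tests; the full
    // check lives in validateOldAndNewConfigsCompatible() above.
    bool versionIncreases(const ReplicaSetConfig& oldConfig, const ReplicaSetConfig& newConfig) {
        // Equal versions fail too: strictly greater, not greater-or-equal.
        return newConfig.getConfigVersion() > oldConfig.getConfigVersion();
    }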
- ASSERT_OK(validateConfigForReconfig(&externalState, - oldConfig, - newConfig, - false).getStatus()); - - - // Cannot reconfig from old to old (versions must be different). - ASSERT_EQUALS(ErrorCodes::NewReplicaSetConfigurationIncompatible, - validateConfigForReconfig(&externalState, - oldConfig, - oldConfig, - false).getStatus()); - // Forced reconfigs also do not allow this. - ASSERT_EQUALS(ErrorCodes::NewReplicaSetConfigurationIncompatible, - validateConfigForReconfig(&externalState, - oldConfig, - oldConfig, - true).getStatus()); - - // Cannot reconfig from new to old (versions must increase). - ASSERT_EQUALS(ErrorCodes::NewReplicaSetConfigurationIncompatible, - validateConfigForReconfig(&externalState, - newConfig, - oldConfig, - false).getStatus()); - // Forced reconfigs also do not allow this. - ASSERT_EQUALS(ErrorCodes::NewReplicaSetConfigurationIncompatible, - validateConfigForReconfig(&externalState, - newConfig, - oldConfig, - true).getStatus()); - } - - TEST(ValidateConfigForReconfig, NewConfigMustNotChangeSetName) { - ReplicationCoordinatorExternalStateMock externalState; - externalState.addSelf(HostAndPort("h1")); - - ReplicaSetConfig oldConfig; - ReplicaSetConfig newConfig; - - // Two configurations, compatible except for set name. - ASSERT_OK(oldConfig.initialize( - BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 1 << "host" << "h1") << - BSON("_id" << 2 << "host" << "h2") << - BSON("_id" << 3 << "host" << "h3"))))); - - ASSERT_OK(newConfig.initialize( - BSON("_id" << "rs1" << - "version" << 3 << - "members" << BSON_ARRAY( - BSON("_id" << 1 << "host" << "h1") << - BSON("_id" << 2 << "host" << "h2") << - BSON("_id" << 3 << "host" << "h3"))))); - - ASSERT_OK(oldConfig.validate()); - ASSERT_OK(newConfig.validate()); - ASSERT_EQUALS(ErrorCodes::NewReplicaSetConfigurationIncompatible, - validateConfigForReconfig(&externalState, - oldConfig, - newConfig, - false).getStatus()); - // Forced reconfigs also do not allow this. - ASSERT_EQUALS(ErrorCodes::NewReplicaSetConfigurationIncompatible, - validateConfigForReconfig(&externalState, - newConfig, - oldConfig, - true).getStatus()); - } - - TEST(ValidateConfigForReconfig, NewConfigMustNotFlipBuildIndexesFlag) { - ReplicationCoordinatorExternalStateMock externalState; - externalState.addSelf(HostAndPort("h1")); - - ReplicaSetConfig oldConfig; - ReplicaSetConfig newConfig; - ReplicaSetConfig oldConfigRefresh; - - // Three configurations, two compatible except that h2 flips the buildIndex flag. - // The third, compatible with the first. 
- ASSERT_OK(oldConfig.initialize( - BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 1 << "host" << "h1") << - BSON("_id" << 2 << "host" << "h2" << - "buildIndexes" << false << - "priority" << 0) << - BSON("_id" << 3 << "host" << "h3"))))); - - ASSERT_OK(newConfig.initialize( - BSON("_id" << "rs0" << - "version" << 3 << - "members" << BSON_ARRAY( - BSON("_id" << 1 << "host" << "h1") << - BSON("_id" << 2 << "host" << "h2" << - "buildIndexes" << true << - "priority" << 0) << - BSON("_id" << 3 << "host" << "h3"))))); - - ASSERT_OK(oldConfigRefresh.initialize( - BSON("_id" << "rs0" << - "version" << 2 << - "members" << BSON_ARRAY( - BSON("_id" << 1 << "host" << "h1") << - BSON("_id" << 2 << "host" << "h2" << - "buildIndexes" << false << - "priority" << 0) << - BSON("_id" << 3 << "host" << "h3"))))); - - ASSERT_OK(oldConfig.validate()); - ASSERT_OK(newConfig.validate()); - ASSERT_OK(oldConfigRefresh.validate()); - ASSERT_OK(validateConfigForReconfig(&externalState, - oldConfig, - oldConfigRefresh, - false).getStatus()); - ASSERT_EQUALS(ErrorCodes::NewReplicaSetConfigurationIncompatible, - validateConfigForReconfig(&externalState, - oldConfig, - newConfig, - false).getStatus()); - - // Forced reconfigs also do not allow this. - ASSERT_EQUALS(ErrorCodes::NewReplicaSetConfigurationIncompatible, - validateConfigForReconfig(&externalState, - oldConfig, - newConfig, - true).getStatus()); - } - - TEST(ValidateConfigForReconfig, NewConfigMustNotFlipArbiterFlag) { - ReplicationCoordinatorExternalStateMock externalState; - externalState.addSelf(HostAndPort("h1")); - - ReplicaSetConfig oldConfig; - ReplicaSetConfig newConfig; - ReplicaSetConfig oldConfigRefresh; - - // Three configurations, two compatible except that h2 flips the arbiterOnly flag. - // The third, compatible with the first. - ASSERT_OK(oldConfig.initialize( - BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 1 << "host" << "h1") << - BSON("_id" << 2 << "host" << "h2" << - "arbiterOnly" << false) << - BSON("_id" << 3 << "host" << "h3"))))); - - ASSERT_OK(newConfig.initialize( - BSON("_id" << "rs0" << - "version" << 3 << - "members" << BSON_ARRAY( - BSON("_id" << 1 << "host" << "h1") << - BSON("_id" << 2 << "host" << "h2" << - "arbiterOnly" << true) << - BSON("_id" << 3 << "host" << "h3"))))); - - ASSERT_OK(oldConfigRefresh.initialize( - BSON("_id" << "rs0" << - "version" << 2 << - "members" << BSON_ARRAY( - BSON("_id" << 1 << "host" << "h1") << - BSON("_id" << 2 << "host" << "h2" << - "arbiterOnly" << false) << - BSON("_id" << 3 << "host" << "h3"))))); - - ASSERT_OK(oldConfig.validate()); - ASSERT_OK(newConfig.validate()); - ASSERT_OK(oldConfigRefresh.validate()); - ASSERT_OK(validateConfigForReconfig(&externalState, - oldConfig, - oldConfigRefresh, - false).getStatus()); - ASSERT_EQUALS(ErrorCodes::NewReplicaSetConfigurationIncompatible, - validateConfigForReconfig(&externalState, - oldConfig, - newConfig, - false).getStatus()); - // Forced reconfigs also do not allow this. - ASSERT_EQUALS(ErrorCodes::NewReplicaSetConfigurationIncompatible, - validateConfigForReconfig(&externalState, - oldConfig, - newConfig, - true).getStatus()); - } - - TEST(ValidateConfigForReconfig, HostAndIdRemappingRestricted) { - // When reconfiguring a replica set, it is allowed to introduce (host, id) pairs - // absent from the old config only when the hosts and ids were both individually - // absent in the old config. 
- - ReplicationCoordinatorExternalStateMock externalState; - externalState.addSelf(HostAndPort("h1")); - - ReplicaSetConfig oldConfig; - ReplicaSetConfig legalNewConfigWithNewHostAndId; - ReplicaSetConfig illegalNewConfigReusingHost; - ReplicaSetConfig illegalNewConfigReusingId; - - ASSERT_OK(oldConfig.initialize( - BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 1 << "host" << "h1") << - BSON("_id" << 2 << "host" << "h2") << - BSON("_id" << 3 << "host" << "h3"))))); - ASSERT_OK(oldConfig.validate()); - - // - // Here, the new config is valid because we've replaced (2, "h2") with - // (4, "h4"), so neither the member _id or host name were reused. - // - ASSERT_OK(legalNewConfigWithNewHostAndId.initialize( - BSON("_id" << "rs0" << - "version" << 2 << - "members" << BSON_ARRAY( - BSON("_id" << 1 << "host" << "h1") << - BSON("_id" << 4 << "host" << "h4") << - BSON("_id" << 3 << "host" << "h3"))))); - ASSERT_OK(legalNewConfigWithNewHostAndId.validate()); - ASSERT_OK(validateConfigForReconfig(&externalState, - oldConfig, - legalNewConfigWithNewHostAndId, - false).getStatus()); - - // - // Here, the new config is invalid because we've reused host name "h2" with - // new _id 4. - // - ASSERT_OK(illegalNewConfigReusingHost.initialize( - BSON("_id" << "rs0" << - "version" << 2 << - "members" << BSON_ARRAY( - BSON("_id" << 1 << "host" << "h1") << - BSON("_id" << 4 << "host" << "h2") << - BSON("_id" << 3 << "host" << "h3"))))); - ASSERT_OK(illegalNewConfigReusingHost.validate()); - ASSERT_EQUALS(ErrorCodes::NewReplicaSetConfigurationIncompatible, - validateConfigForReconfig(&externalState, - oldConfig, - illegalNewConfigReusingHost, - false).getStatus()); - // Forced reconfigs also do not allow this. - ASSERT_EQUALS(ErrorCodes::NewReplicaSetConfigurationIncompatible, - validateConfigForReconfig(&externalState, - oldConfig, - illegalNewConfigReusingHost, - true).getStatus()); - // - // Here, the new config is valid, because all we've changed is the name of - // the host representing _id 2. - // - ASSERT_OK(illegalNewConfigReusingId.initialize( - BSON("_id" << "rs0" << - "version" << 2 << - "members" << BSON_ARRAY( - BSON("_id" << 1 << "host" << "h1") << - BSON("_id" << 2 << "host" << "h4") << - BSON("_id" << 3 << "host" << "h3"))))); - ASSERT_OK(illegalNewConfigReusingId.validate()); - ASSERT_OK(validateConfigForReconfig(&externalState, - oldConfig, - illegalNewConfigReusingId, - false).getStatus()); - } - - TEST(ValidateConfigForReconfig, MustFindSelf) { - // Old and new config are same except for version change; this is just testing that we can - // find ourself in the new config. 
- ReplicaSetConfig oldConfig; - ASSERT_OK(oldConfig.initialize(BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 1 << "host" << "h1") << - BSON("_id" << 2 << "host" << "h2") << - BSON("_id" << 3 << "host" << "h3"))))); - - ReplicaSetConfig newConfig; - ASSERT_OK(newConfig.initialize(BSON("_id" << "rs0" << - "version" << 2 << - "members" << BSON_ARRAY( - BSON("_id" << 1 << "host" << "h1") << - BSON("_id" << 2 << "host" << "h2") << - BSON("_id" << 3 << "host" << "h3"))))); - ReplicationCoordinatorExternalStateMock notPresentExternalState; - ReplicationCoordinatorExternalStateMock presentOnceExternalState; - presentOnceExternalState.addSelf(HostAndPort("h2")); - ReplicationCoordinatorExternalStateMock presentThriceExternalState; - presentThriceExternalState.addSelf(HostAndPort("h3")); - presentThriceExternalState.addSelf(HostAndPort("h2")); - presentThriceExternalState.addSelf(HostAndPort("h1")); - - ASSERT_EQUALS(ErrorCodes::NodeNotFound, - validateConfigForReconfig(¬PresentExternalState, - oldConfig, - newConfig, - false).getStatus()); - ASSERT_EQUALS(ErrorCodes::DuplicateKey, - validateConfigForReconfig(&presentThriceExternalState, - oldConfig, - newConfig, - false).getStatus()); - ASSERT_EQUALS(1, unittest::assertGet(validateConfigForReconfig(&presentOnceExternalState, - oldConfig, - newConfig, - false))); - // Forced reconfigs also do not allow this. - ASSERT_EQUALS(ErrorCodes::NodeNotFound, - validateConfigForReconfig(¬PresentExternalState, - oldConfig, - newConfig, - true).getStatus()); - ASSERT_EQUALS(ErrorCodes::DuplicateKey, - validateConfigForReconfig(&presentThriceExternalState, - oldConfig, - newConfig, - true).getStatus()); - ASSERT_EQUALS(1, unittest::assertGet(validateConfigForReconfig(&presentOnceExternalState, - oldConfig, - newConfig, - true))); - } - - TEST(ValidateConfigForReconfig, SelfMustEndElectable) { - // Old and new config are same except for version change and the electability of one node; - // this is just testing that we must be electable in the new config. - ReplicaSetConfig oldConfig; - ASSERT_OK(oldConfig.initialize(BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 1 << "host" << "h1") << - BSON("_id" << 2 << "host" << "h2") << - BSON("_id" << 3 << "host" << "h3"))))); - - ReplicaSetConfig newConfig; - ASSERT_OK(newConfig.initialize(BSON("_id" << "rs0" << - "version" << 2 << - "members" << BSON_ARRAY( - BSON("_id" << 1 << "host" << "h1") << - BSON("_id" << 2 << "host" << "h2" << - "priority" << 0) << - BSON("_id" << 3 << "host" << "h3"))))); - ReplicationCoordinatorExternalStateMock presentOnceExternalState; - presentOnceExternalState.addSelf(HostAndPort("h2")); - - ASSERT_EQUALS(ErrorCodes::NodeNotElectable, - validateConfigForReconfig(&presentOnceExternalState, - oldConfig, - newConfig, - false).getStatus()); - // Forced reconfig does not require electability. - ASSERT_OK(validateConfigForReconfig(&presentOnceExternalState, - oldConfig, - newConfig, - true).getStatus()); - } - - TEST(ValidateConfigForInitiate, NewConfigInvalid) { - // The new config is not valid due to a duplicate _id value. This tests that if the new - // config is invalid, validateConfigForInitiate will return a status indicating what is - // wrong with the new config. 
- ReplicaSetConfig newConfig; - ASSERT_OK(newConfig.initialize(BSON("_id" << "rs0" << - "version" << 2 << - "members" << BSON_ARRAY( - BSON("_id" << 0 << "host" << "h2") << - BSON("_id" << 0 << "host" << "h3"))))); - - ReplicationCoordinatorExternalStateMock presentOnceExternalState; - presentOnceExternalState.addSelf(HostAndPort("h2")); - ASSERT_EQUALS(ErrorCodes::BadValue, validateConfigForInitiate(&presentOnceExternalState, - newConfig).getStatus()); - } - - TEST(ValidateConfigForReconfig, NewConfigInvalid) { - // The new config is not valid due to a duplicate _id value. This tests that if the new - // config is invalid, validateConfigForReconfig will return a status indicating what is - // wrong with the new config. - ReplicaSetConfig oldConfig; - ASSERT_OK(oldConfig.initialize(BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 0 << "host" << "h2"))))); - - ReplicaSetConfig newConfig; - ASSERT_OK(newConfig.initialize(BSON("_id" << "rs0" << - "version" << 2 << - "members" << BSON_ARRAY( - BSON("_id" << 0 << "host" << "h2") << - BSON("_id" << 0 << "host" << "h3"))))); - - ReplicationCoordinatorExternalStateMock presentOnceExternalState; - presentOnceExternalState.addSelf(HostAndPort("h2")); - ASSERT_EQUALS(ErrorCodes::BadValue, validateConfigForReconfig(&presentOnceExternalState, - oldConfig, - newConfig, - false).getStatus()); - // Forced reconfigs also do not allow this. - ASSERT_EQUALS(ErrorCodes::BadValue, validateConfigForReconfig(&presentOnceExternalState, - oldConfig, - newConfig, - true).getStatus()); - } - - TEST(ValidateConfigForStartUp, NewConfigInvalid) { - // The new config is not valid due to a duplicate _id value. This tests that if the new - // config is invalid, validateConfigForStartUp will return a status indicating what is wrong - // with the new config. - ReplicaSetConfig oldConfig; - ASSERT_OK(oldConfig.initialize(BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 0 << "host" << "h2"))))); - - ReplicaSetConfig newConfig; - ASSERT_OK(newConfig.initialize(BSON("_id" << "rs0" << - "version" << 2 << - "members" << BSON_ARRAY( - BSON("_id" << 0 << "host" << "h2") << - BSON("_id" << 0 << "host" << "h3"))))); - - ReplicationCoordinatorExternalStateMock presentOnceExternalState; - presentOnceExternalState.addSelf(HostAndPort("h2")); - ASSERT_EQUALS(ErrorCodes::BadValue, validateConfigForStartUp(&presentOnceExternalState, - oldConfig, - newConfig).getStatus()); - } - - TEST(ValidateConfigForStartUp, OldAndNewConfigIncompatible) { - // The new config is not compatible with the old config due to a member changing _ids. This - // tests that validateConfigForStartUp will return a status indicating the incompatiblilty - // between the old and new config. 
- ReplicaSetConfig oldConfig; - ASSERT_OK(oldConfig.initialize(BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 0 << "host" << "h2") << - BSON("_id" << 1 << "host" << "h3"))))); - - - ReplicaSetConfig newConfig; - ASSERT_OK(newConfig.initialize(BSON("_id" << "rs0" << - "version" << 2 << - "members" << BSON_ARRAY( - BSON("_id" << 2 << "host" << "h2") << - BSON("_id" << 1 << "host" << "h3"))))); - - ReplicationCoordinatorExternalStateMock presentOnceExternalState; - presentOnceExternalState.addSelf(HostAndPort("h2")); - ASSERT_EQUALS(ErrorCodes::NewReplicaSetConfigurationIncompatible, - validateConfigForStartUp(&presentOnceExternalState, - oldConfig, - newConfig).getStatus()); - } - - TEST(ValidateConfigForStartUp, OldAndNewConfigCompatible) { - // The new config is compatible with the old config. This tests that - // validateConfigForStartUp will return a Status::OK() indicating the validity of this - // config change. - ReplicaSetConfig oldConfig; - ASSERT_OK(oldConfig.initialize(BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 0 << "host" << "h2") << - BSON("_id" << 1 << "host" << "h3"))))); - - - ReplicaSetConfig newConfig; - ASSERT_OK(newConfig.initialize(BSON("_id" << "rs0" << - "version" << 2 << - "members" << BSON_ARRAY( - BSON("_id" << 0 << "host" << "h2" << - "priority" << 3) << - BSON("_id" << 1 << "host" << "h3"))))); - - ReplicationCoordinatorExternalStateMock presentOnceExternalState; - presentOnceExternalState.addSelf(HostAndPort("h2")); - ASSERT_OK(validateConfigForStartUp(&presentOnceExternalState, - oldConfig, - newConfig).getStatus()); - } - - TEST(ValidateConfigForHeartbeatReconfig, NewConfigInvalid) { - // The new config is not valid due to a duplicate _id value. This tests that if the new - // config is invalid, validateConfigForHeartbeatReconfig will return a status indicating - // what is wrong with the new config. - ReplicaSetConfig newConfig; - ASSERT_OK(newConfig.initialize(BSON("_id" << "rs0" << - "version" << 2 << - "members" << BSON_ARRAY( - BSON("_id" << 0 << "host" << "h2") << - BSON("_id" << 0 << "host" << "h3"))))); - - ReplicationCoordinatorExternalStateMock presentOnceExternalState; - presentOnceExternalState.addSelf(HostAndPort("h2")); - ASSERT_EQUALS(ErrorCodes::BadValue, - validateConfigForHeartbeatReconfig(&presentOnceExternalState, - newConfig).getStatus()); - } - - TEST(ValidateConfigForHeartbeatReconfig, NewConfigValid) { - // The new config is valid. This tests that validateConfigForHeartbeatReconfig will return - // a Status::OK() indicating the validity of this config change. - ReplicaSetConfig newConfig; - ASSERT_OK(newConfig.initialize(BSON("_id" << "rs0" << - "version" << 2 << - "members" << BSON_ARRAY( - BSON("_id" << 0 << "host" << "h2") << - BSON("_id" << 1 << "host" << "h3"))))); - - ReplicationCoordinatorExternalStateMock presentOnceExternalState; - presentOnceExternalState.addSelf(HostAndPort("h2")); - ASSERT_OK(validateConfigForHeartbeatReconfig(&presentOnceExternalState, - newConfig).getStatus()); - } - - TEST(ValidateForReconfig, ForceStillNeedsValidConfig) { - // The new config is invalid due to two nodes with the same _id value. This tests that - // ValidateForReconfig fails with an invalid config, even if force is true. 
- ReplicaSetConfig oldConfig; - ASSERT_OK(oldConfig.initialize(BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 0 << "host" << "h2") << - BSON("_id" << 1 << "host" << "h3"))))); - - - ReplicaSetConfig newConfig; - ASSERT_OK(newConfig.initialize(BSON("_id" << "rs0" << - "version" << 2 << - "members" << BSON_ARRAY( - BSON("_id" << 0 << "host" << "h2") << - BSON("_id" << 0 << "host" << "h3"))))); - - ReplicationCoordinatorExternalStateMock presentOnceExternalState; - presentOnceExternalState.addSelf(HostAndPort("h2")); - ASSERT_EQUALS(ErrorCodes::BadValue, - validateConfigForReconfig(&presentOnceExternalState, - oldConfig, - newConfig, - true).getStatus()); - } - - TEST(ValidateForReconfig, ForceStillNeedsSelfPresent) { - // The new config does not contain self. This tests that ValidateForReconfig fails - // if the member receiving it is absent from the config, even if force is true. - ReplicaSetConfig oldConfig; - ASSERT_OK(oldConfig.initialize(BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 0 << "host" << "h2") << - BSON("_id" << 1 << "host" << "h3"))))); - - - ReplicaSetConfig newConfig; - ASSERT_OK(newConfig.initialize(BSON("_id" << "rs0" << - "version" << 2 << - "members" << BSON_ARRAY( - BSON("_id" << 1 << "host" << "h3") << - BSON("_id" << 2 << "host" << "h4"))))); - - ReplicationCoordinatorExternalStateMock presentOnceExternalState; - presentOnceExternalState.addSelf(HostAndPort("h2")); - ASSERT_EQUALS(ErrorCodes::NodeNotFound, - validateConfigForReconfig(&presentOnceExternalState, - oldConfig, - newConfig, - true).getStatus()); - } +TEST(ValidateConfigForInitiate, VersionMustBe1) { + ReplicationCoordinatorExternalStateMock rses; + rses.addSelf(HostAndPort("h1")); + + ReplicaSetConfig config; + ASSERT_OK(config.initialize(BSON("_id" + << "rs0" + << "version" << 2 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "h1"))))); + ASSERT_EQUALS(ErrorCodes::NewReplicaSetConfigurationIncompatible, + validateConfigForInitiate(&rses, config).getStatus()); +} + +TEST(ValidateConfigForInitiate, MustFindSelf) { + ReplicaSetConfig config; + ASSERT_OK( + config.initialize(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "h1") + << BSON("_id" << 2 << "host" + << "h2") << BSON("_id" << 3 << "host" + << "h3"))))); + ReplicationCoordinatorExternalStateMock notPresentExternalState; + ReplicationCoordinatorExternalStateMock presentOnceExternalState; + presentOnceExternalState.addSelf(HostAndPort("h2")); + ReplicationCoordinatorExternalStateMock presentTwiceExternalState; + presentTwiceExternalState.addSelf(HostAndPort("h3")); + presentTwiceExternalState.addSelf(HostAndPort("h1")); + + ASSERT_EQUALS(ErrorCodes::NodeNotFound, + validateConfigForInitiate(¬PresentExternalState, config).getStatus()); + ASSERT_EQUALS(ErrorCodes::DuplicateKey, + validateConfigForInitiate(&presentTwiceExternalState, config).getStatus()); + ASSERT_EQUALS( + 1, unittest::assertGet(validateConfigForInitiate(&presentOnceExternalState, config))); +} + +TEST(ValidateConfigForInitiate, SelfMustBeElectable) { + ReplicaSetConfig config; + ASSERT_OK(config.initialize(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "h1") + << BSON("_id" << 2 << "host" + << "h2" + << "priority" << 0) + << BSON("_id" << 3 << "host" + << "h3"))))); + ReplicationCoordinatorExternalStateMock presentOnceExternalState; + presentOnceExternalState.addSelf(HostAndPort("h2")); + 
+ ASSERT_EQUALS(ErrorCodes::NodeNotElectable, + validateConfigForInitiate(&presentOnceExternalState, config).getStatus()); +} + +TEST(ValidateConfigForReconfig, NewConfigVersionNumberMustBeHigherThanOld) { + ReplicationCoordinatorExternalStateMock externalState; + externalState.addSelf(HostAndPort("h1")); + + ReplicaSetConfig oldConfig; + ReplicaSetConfig newConfig; + + // Two configurations, identical except for version. + ASSERT_OK( + oldConfig.initialize(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "h1") + << BSON("_id" << 2 << "host" + << "h2") << BSON("_id" << 3 << "host" + << "h3"))))); + + ASSERT_OK( + newConfig.initialize(BSON("_id" + << "rs0" + << "version" << 3 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "h1") + << BSON("_id" << 2 << "host" + << "h2") << BSON("_id" << 3 << "host" + << "h3"))))); + + ASSERT_OK(oldConfig.validate()); + ASSERT_OK(newConfig.validate()); + + // Can reconfig from old to new. + ASSERT_OK(validateConfigForReconfig(&externalState, oldConfig, newConfig, false).getStatus()); + + + // Cannot reconfig from old to old (versions must be different). + ASSERT_EQUALS( + ErrorCodes::NewReplicaSetConfigurationIncompatible, + validateConfigForReconfig(&externalState, oldConfig, oldConfig, false).getStatus()); + // Forced reconfigs also do not allow this. + ASSERT_EQUALS( + ErrorCodes::NewReplicaSetConfigurationIncompatible, + validateConfigForReconfig(&externalState, oldConfig, oldConfig, true).getStatus()); + + // Cannot reconfig from new to old (versions must increase). + ASSERT_EQUALS( + ErrorCodes::NewReplicaSetConfigurationIncompatible, + validateConfigForReconfig(&externalState, newConfig, oldConfig, false).getStatus()); + // Forced reconfigs also do not allow this. + ASSERT_EQUALS( + ErrorCodes::NewReplicaSetConfigurationIncompatible, + validateConfigForReconfig(&externalState, newConfig, oldConfig, true).getStatus()); +} + +TEST(ValidateConfigForReconfig, NewConfigMustNotChangeSetName) { + ReplicationCoordinatorExternalStateMock externalState; + externalState.addSelf(HostAndPort("h1")); + + ReplicaSetConfig oldConfig; + ReplicaSetConfig newConfig; + + // Two configurations, compatible except for set name. + ASSERT_OK( + oldConfig.initialize(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "h1") + << BSON("_id" << 2 << "host" + << "h2") << BSON("_id" << 3 << "host" + << "h3"))))); + + ASSERT_OK( + newConfig.initialize(BSON("_id" + << "rs1" + << "version" << 3 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "h1") + << BSON("_id" << 2 << "host" + << "h2") << BSON("_id" << 3 << "host" + << "h3"))))); + + ASSERT_OK(oldConfig.validate()); + ASSERT_OK(newConfig.validate()); + ASSERT_EQUALS( + ErrorCodes::NewReplicaSetConfigurationIncompatible, + validateConfigForReconfig(&externalState, oldConfig, newConfig, false).getStatus()); + // Forced reconfigs also do not allow this. + ASSERT_EQUALS( + ErrorCodes::NewReplicaSetConfigurationIncompatible, + validateConfigForReconfig(&externalState, newConfig, oldConfig, true).getStatus()); +} + +TEST(ValidateConfigForReconfig, NewConfigMustNotFlipBuildIndexesFlag) { + ReplicationCoordinatorExternalStateMock externalState; + externalState.addSelf(HostAndPort("h1")); + + ReplicaSetConfig oldConfig; + ReplicaSetConfig newConfig; + ReplicaSetConfig oldConfigRefresh; + + // Three configurations, two compatible except that h2 flips the buildIndex flag. + // The third, compatible with the first. 
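The two flag-flip tests exercise the immutability rule from validateOldAndNewConfigsCompatible: for a member present in both configurations, the buildIndexes and arbiterOnly settings may not change, and the supported route is to remove the member and re-add it with the new setting. A condensed sketch of the per-member comparison, using the MemberConfig accessors that appear earlier in this diff:

    // Sketch: the immutable member flags compared for a member present in
    // both the old and the new config.
    bool memberFlagsUnchanged(const MemberConfig& mOld, const MemberConfig& mNew) {
        return mOld.shouldBuildIndexes() == mNew.shouldBuildIndexes() &&
               mOld.isArbiter() == mNew.isArbiter();
    }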
+ ASSERT_OK(oldConfig.initialize(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "h1") + << BSON("_id" << 2 << "host" + << "h2" + << "buildIndexes" << false + << "priority" << 0) + << BSON("_id" << 3 << "host" + << "h3"))))); + + ASSERT_OK(newConfig.initialize(BSON("_id" + << "rs0" + << "version" << 3 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "h1") + << BSON("_id" << 2 << "host" + << "h2" + << "buildIndexes" << true + << "priority" << 0) + << BSON("_id" << 3 << "host" + << "h3"))))); + + ASSERT_OK( + oldConfigRefresh.initialize(BSON("_id" + << "rs0" + << "version" << 2 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "h1") + << BSON("_id" << 2 << "host" + << "h2" + << "buildIndexes" << false + << "priority" << 0) + << BSON("_id" << 3 << "host" + << "h3"))))); + + ASSERT_OK(oldConfig.validate()); + ASSERT_OK(newConfig.validate()); + ASSERT_OK(oldConfigRefresh.validate()); + ASSERT_OK( + validateConfigForReconfig(&externalState, oldConfig, oldConfigRefresh, false).getStatus()); + ASSERT_EQUALS( + ErrorCodes::NewReplicaSetConfigurationIncompatible, + validateConfigForReconfig(&externalState, oldConfig, newConfig, false).getStatus()); + + // Forced reconfigs also do not allow this. + ASSERT_EQUALS( + ErrorCodes::NewReplicaSetConfigurationIncompatible, + validateConfigForReconfig(&externalState, oldConfig, newConfig, true).getStatus()); +} + +TEST(ValidateConfigForReconfig, NewConfigMustNotFlipArbiterFlag) { + ReplicationCoordinatorExternalStateMock externalState; + externalState.addSelf(HostAndPort("h1")); + + ReplicaSetConfig oldConfig; + ReplicaSetConfig newConfig; + ReplicaSetConfig oldConfigRefresh; + + // Three configurations, two compatible except that h2 flips the arbiterOnly flag. + // The third, compatible with the first. + ASSERT_OK(oldConfig.initialize(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "h1") + << BSON("_id" << 2 << "host" + << "h2" + << "arbiterOnly" << false) + << BSON("_id" << 3 << "host" + << "h3"))))); + + ASSERT_OK(newConfig.initialize(BSON("_id" + << "rs0" + << "version" << 3 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "h1") + << BSON("_id" << 2 << "host" + << "h2" + << "arbiterOnly" << true) + << BSON("_id" << 3 << "host" + << "h3"))))); + + ASSERT_OK( + oldConfigRefresh.initialize(BSON("_id" + << "rs0" + << "version" << 2 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "h1") + << BSON("_id" << 2 << "host" + << "h2" + << "arbiterOnly" << false) + << BSON("_id" << 3 << "host" + << "h3"))))); + + ASSERT_OK(oldConfig.validate()); + ASSERT_OK(newConfig.validate()); + ASSERT_OK(oldConfigRefresh.validate()); + ASSERT_OK( + validateConfigForReconfig(&externalState, oldConfig, oldConfigRefresh, false).getStatus()); + ASSERT_EQUALS( + ErrorCodes::NewReplicaSetConfigurationIncompatible, + validateConfigForReconfig(&externalState, oldConfig, newConfig, false).getStatus()); + // Forced reconfigs also do not allow this. + ASSERT_EQUALS( + ErrorCodes::NewReplicaSetConfigurationIncompatible, + validateConfigForReconfig(&externalState, oldConfig, newConfig, true).getStatus()); +} + +TEST(ValidateConfigForReconfig, HostAndIdRemappingRestricted) { + // When reconfiguring a replica set, it is allowed to introduce (host, id) pairs + // absent from the old config only when the hosts and ids were both individually + // absent in the old config. 
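The pairing rule this test exercises: a member of the new config may reuse a host name from the old config only if it also keeps that host's old _id, while an old _id may move to a brand-new host (a rename). Stated as a predicate over the MemberConfig accessors used in the validation loop above:

    // Sketch: the one forbidden combination is an old host under a new _id.
    bool pairCompatible(const MemberConfig& mOld, const MemberConfig& mNew) {
        const bool idsEqual = mOld.getId() == mNew.getId();
        const bool hostsEqual = mOld.getHostAndPort() == mNew.getHostAndPort();
        return !(hostsEqual && !idsEqual);
    }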
+
+    ReplicationCoordinatorExternalStateMock externalState;
+    externalState.addSelf(HostAndPort("h1"));
+
+    ReplicaSetConfig oldConfig;
+    ReplicaSetConfig legalNewConfigWithNewHostAndId;
+    ReplicaSetConfig illegalNewConfigReusingHost;
+    ReplicaSetConfig illegalNewConfigReusingId;
+
+    ASSERT_OK(
+        oldConfig.initialize(BSON("_id"
+                                  << "rs0"
+                                  << "version" << 1 << "members"
+                                  << BSON_ARRAY(BSON("_id" << 1 << "host"
+                                                           << "h1")
+                                                << BSON("_id" << 2 << "host"
+                                                              << "h2") << BSON("_id" << 3 << "host"
+                                                                                     << "h3")))));
+    ASSERT_OK(oldConfig.validate());
+
+    //
+    // Here, the new config is valid because we've replaced (2, "h2") with
+    // (4, "h4"), so neither the member _id nor the host name was reused.
+    //
+    ASSERT_OK(
+        legalNewConfigWithNewHostAndId.initialize(BSON("_id"
+                                                       << "rs0"
+                                                       << "version" << 2 << "members"
+                                                       << BSON_ARRAY(BSON("_id" << 1 << "host"
+                                                                                << "h1")
+                                                                     << BSON("_id" << 4 << "host"
+                                                                                   << "h4")
+                                                                     << BSON("_id" << 3 << "host"
+                                                                                   << "h3")))));
+    ASSERT_OK(legalNewConfigWithNewHostAndId.validate());
+    ASSERT_OK(validateConfigForReconfig(
+                  &externalState, oldConfig, legalNewConfigWithNewHostAndId, false).getStatus());
+
+    //
+    // Here, the new config is invalid because we've reused host name "h2" with
+    // new _id 4.
+    //
+    ASSERT_OK(illegalNewConfigReusingHost.initialize(BSON("_id"
+                                                          << "rs0"
+                                                          << "version" << 2 << "members"
+                                                          << BSON_ARRAY(BSON("_id" << 1 << "host"
+                                                                                   << "h1")
+                                                                        << BSON("_id" << 4 << "host"
+                                                                                      << "h2")
+                                                                        << BSON("_id" << 3 << "host"
+                                                                                      << "h3")))));
+    ASSERT_OK(illegalNewConfigReusingHost.validate());
+    ASSERT_EQUALS(ErrorCodes::NewReplicaSetConfigurationIncompatible,
+                  validateConfigForReconfig(
+                      &externalState, oldConfig, illegalNewConfigReusingHost, false).getStatus());
+    // Forced reconfigs also do not allow this.
+    ASSERT_EQUALS(ErrorCodes::NewReplicaSetConfigurationIncompatible,
+                  validateConfigForReconfig(
+                      &externalState, oldConfig, illegalNewConfigReusingHost, true).getStatus());
+    //
+    // Here, the new config is valid because all we've changed is the name of
+    // the host representing _id 2.
+    //
+    ASSERT_OK(illegalNewConfigReusingId.initialize(BSON("_id"
+                                                        << "rs0"
+                                                        << "version" << 2 << "members"
+                                                        << BSON_ARRAY(BSON("_id" << 1 << "host"
+                                                                                 << "h1")
+                                                                      << BSON("_id" << 2 << "host"
+                                                                                    << "h4")
+                                                                      << BSON("_id" << 3 << "host"
+                                                                                    << "h3")))));
+    ASSERT_OK(illegalNewConfigReusingId.validate());
+    ASSERT_OK(validateConfigForReconfig(&externalState, oldConfig, illegalNewConfigReusingId, false)
+                  .getStatus());
+}
+
+TEST(ValidateConfigForReconfig, MustFindSelf) {
+    // Old and new config are same except for version change; this is just testing that we can
+    // find ourselves in the new config.
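The find-self tests drive node identity through the mock's addSelf() hook: the validators ask the external state which hosts name the current process and require that exactly one member of the new config maps to it. A hypothetical call sequence, using only the mock API visible in these tests:

    // Hypothetical usage sketch of the mock seen in this file.
    ReplicationCoordinatorExternalStateMock externalState;
    externalState.addSelf(HostAndPort("h2"));  // this process answers only as "h2"
    // With "h2" at member index 1 of newConfig, validation returns 1; no match
    // yields ErrorCodes::NodeNotFound, several matches ErrorCodes::DuplicateKey.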
+ ReplicaSetConfig oldConfig; + ASSERT_OK( + oldConfig.initialize(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "h1") + << BSON("_id" << 2 << "host" + << "h2") << BSON("_id" << 3 << "host" + << "h3"))))); + + ReplicaSetConfig newConfig; + ASSERT_OK( + newConfig.initialize(BSON("_id" + << "rs0" + << "version" << 2 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "h1") + << BSON("_id" << 2 << "host" + << "h2") << BSON("_id" << 3 << "host" + << "h3"))))); + ReplicationCoordinatorExternalStateMock notPresentExternalState; + ReplicationCoordinatorExternalStateMock presentOnceExternalState; + presentOnceExternalState.addSelf(HostAndPort("h2")); + ReplicationCoordinatorExternalStateMock presentThriceExternalState; + presentThriceExternalState.addSelf(HostAndPort("h3")); + presentThriceExternalState.addSelf(HostAndPort("h2")); + presentThriceExternalState.addSelf(HostAndPort("h1")); + + ASSERT_EQUALS(ErrorCodes::NodeNotFound, + validateConfigForReconfig(¬PresentExternalState, oldConfig, newConfig, false) + .getStatus()); + ASSERT_EQUALS(ErrorCodes::DuplicateKey, + validateConfigForReconfig( + &presentThriceExternalState, oldConfig, newConfig, false).getStatus()); + ASSERT_EQUALS(1, + unittest::assertGet(validateConfigForReconfig( + &presentOnceExternalState, oldConfig, newConfig, false))); + // Forced reconfigs also do not allow this. + ASSERT_EQUALS(ErrorCodes::NodeNotFound, + validateConfigForReconfig(¬PresentExternalState, oldConfig, newConfig, true) + .getStatus()); + ASSERT_EQUALS(ErrorCodes::DuplicateKey, + validateConfigForReconfig(&presentThriceExternalState, oldConfig, newConfig, true) + .getStatus()); + ASSERT_EQUALS(1, + unittest::assertGet(validateConfigForReconfig( + &presentOnceExternalState, oldConfig, newConfig, true))); +} + +TEST(ValidateConfigForReconfig, SelfMustEndElectable) { + // Old and new config are same except for version change and the electability of one node; + // this is just testing that we must be electable in the new config. + ReplicaSetConfig oldConfig; + ASSERT_OK( + oldConfig.initialize(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "h1") + << BSON("_id" << 2 << "host" + << "h2") << BSON("_id" << 3 << "host" + << "h3"))))); + + ReplicaSetConfig newConfig; + ASSERT_OK(newConfig.initialize(BSON("_id" + << "rs0" + << "version" << 2 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "h1") + << BSON("_id" << 2 << "host" + << "h2" + << "priority" << 0) + << BSON("_id" << 3 << "host" + << "h3"))))); + ReplicationCoordinatorExternalStateMock presentOnceExternalState; + presentOnceExternalState.addSelf(HostAndPort("h2")); + + ASSERT_EQUALS(ErrorCodes::NodeNotElectable, + validateConfigForReconfig(&presentOnceExternalState, oldConfig, newConfig, false) + .getStatus()); + // Forced reconfig does not require electability. + ASSERT_OK(validateConfigForReconfig(&presentOnceExternalState, oldConfig, newConfig, true) + .getStatus()); +} + +TEST(ValidateConfigForInitiate, NewConfigInvalid) { + // The new config is not valid due to a duplicate _id value. This tests that if the new + // config is invalid, validateConfigForInitiate will return a status indicating what is + // wrong with the new config. 
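The invalid-config tests rely on the split between parsing and validation: initialize() accepts the BSON document even though two members share _id 0, and the duplicate is only rejected when the validator runs newConfig.validate(), which surfaces as ErrorCodes::BadValue. The offending shape, restated for reference:

    // The config these tests build, in shell-style notation (duplicate _id 0):
    // { _id: "rs0", version: 2,
    //   members: [ { _id: 0, host: "h2" }, { _id: 0, host: "h3" } ] }
    // initialize() parses it; every validateConfigFor* entry point then
    // returns BadValue.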
+    ReplicaSetConfig newConfig;
+    ASSERT_OK(newConfig.initialize(BSON("_id"
+                                        << "rs0"
+                                        << "version" << 2 << "members"
+                                        << BSON_ARRAY(BSON("_id" << 0 << "host"
+                                                                 << "h2")
+                                                      << BSON("_id" << 0 << "host"
+                                                                    << "h3")))));
+
+    ReplicationCoordinatorExternalStateMock presentOnceExternalState;
+    presentOnceExternalState.addSelf(HostAndPort("h2"));
+    ASSERT_EQUALS(ErrorCodes::BadValue,
+                  validateConfigForInitiate(&presentOnceExternalState, newConfig).getStatus());
+}
+
+TEST(ValidateConfigForReconfig, NewConfigInvalid) {
+    // The new config is not valid due to a duplicate _id value. This tests that if the new
+    // config is invalid, validateConfigForReconfig will return a status indicating what is
+    // wrong with the new config.
+    ReplicaSetConfig oldConfig;
+    ASSERT_OK(oldConfig.initialize(BSON("_id"
+                                        << "rs0"
+                                        << "version" << 1 << "members"
+                                        << BSON_ARRAY(BSON("_id" << 0 << "host"
+                                                                 << "h2")))));
+
+    ReplicaSetConfig newConfig;
+    ASSERT_OK(newConfig.initialize(BSON("_id"
+                                        << "rs0"
+                                        << "version" << 2 << "members"
+                                        << BSON_ARRAY(BSON("_id" << 0 << "host"
+                                                                 << "h2")
+                                                      << BSON("_id" << 0 << "host"
+                                                                    << "h3")))));
+
+    ReplicationCoordinatorExternalStateMock presentOnceExternalState;
+    presentOnceExternalState.addSelf(HostAndPort("h2"));
+    ASSERT_EQUALS(ErrorCodes::BadValue,
+                  validateConfigForReconfig(&presentOnceExternalState, oldConfig, newConfig, false)
+                      .getStatus());
+    // Forced reconfigs also do not allow this.
+    ASSERT_EQUALS(ErrorCodes::BadValue,
+                  validateConfigForReconfig(&presentOnceExternalState, oldConfig, newConfig, true)
+                      .getStatus());
+}
+
+TEST(ValidateConfigForStartUp, NewConfigInvalid) {
+    // The new config is not valid due to a duplicate _id value. This tests that if the new
+    // config is invalid, validateConfigForStartUp will return a status indicating what is wrong
+    // with the new config.
+    ReplicaSetConfig oldConfig;
+    ASSERT_OK(oldConfig.initialize(BSON("_id"
+                                        << "rs0"
+                                        << "version" << 1 << "members"
+                                        << BSON_ARRAY(BSON("_id" << 0 << "host"
+                                                                 << "h2")))));
+
+    ReplicaSetConfig newConfig;
+    ASSERT_OK(newConfig.initialize(BSON("_id"
+                                        << "rs0"
+                                        << "version" << 2 << "members"
+                                        << BSON_ARRAY(BSON("_id" << 0 << "host"
+                                                                 << "h2")
+                                                      << BSON("_id" << 0 << "host"
+                                                                    << "h3")))));
+
+    ReplicationCoordinatorExternalStateMock presentOnceExternalState;
+    presentOnceExternalState.addSelf(HostAndPort("h2"));
+    ASSERT_EQUALS(
+        ErrorCodes::BadValue,
+        validateConfigForStartUp(&presentOnceExternalState, oldConfig, newConfig).getStatus());
+}
+
+TEST(ValidateConfigForStartUp, OldAndNewConfigIncompatible) {
+    // The new config is not compatible with the old config due to a member changing _ids. This
+    // tests that validateConfigForStartUp will return a status indicating the incompatibility
+    // between the old and new config.
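What makes the startup pair below incompatible is a single member record: host "h2" carries _id 0 in the old config but _id 2 in the new one. Under the pairing rule sketched earlier, an unchanged host must keep its _id, so the remapping is reported as incompatible rather than treated as a new member:

    // old member: { _id: 0, host: "h2" }    new member: { _id: 2, host: "h2" }
    // Same host under a new _id -> ErrorCodes::NewReplicaSetConfigurationIncompatible.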
+ ReplicaSetConfig oldConfig; + ASSERT_OK(oldConfig.initialize(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "h2") + << BSON("_id" << 1 << "host" + << "h3"))))); + + + ReplicaSetConfig newConfig; + ASSERT_OK(newConfig.initialize(BSON("_id" + << "rs0" + << "version" << 2 << "members" + << BSON_ARRAY(BSON("_id" << 2 << "host" + << "h2") + << BSON("_id" << 1 << "host" + << "h3"))))); + + ReplicationCoordinatorExternalStateMock presentOnceExternalState; + presentOnceExternalState.addSelf(HostAndPort("h2")); + ASSERT_EQUALS( + ErrorCodes::NewReplicaSetConfigurationIncompatible, + validateConfigForStartUp(&presentOnceExternalState, oldConfig, newConfig).getStatus()); +} + +TEST(ValidateConfigForStartUp, OldAndNewConfigCompatible) { + // The new config is compatible with the old config. This tests that + // validateConfigForStartUp will return a Status::OK() indicating the validity of this + // config change. + ReplicaSetConfig oldConfig; + ASSERT_OK(oldConfig.initialize(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "h2") + << BSON("_id" << 1 << "host" + << "h3"))))); + + + ReplicaSetConfig newConfig; + ASSERT_OK(newConfig.initialize(BSON("_id" + << "rs0" + << "version" << 2 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "h2" + << "priority" << 3) + << BSON("_id" << 1 << "host" + << "h3"))))); + + ReplicationCoordinatorExternalStateMock presentOnceExternalState; + presentOnceExternalState.addSelf(HostAndPort("h2")); + ASSERT_OK( + validateConfigForStartUp(&presentOnceExternalState, oldConfig, newConfig).getStatus()); +} + +TEST(ValidateConfigForHeartbeatReconfig, NewConfigInvalid) { + // The new config is not valid due to a duplicate _id value. This tests that if the new + // config is invalid, validateConfigForHeartbeatReconfig will return a status indicating + // what is wrong with the new config. + ReplicaSetConfig newConfig; + ASSERT_OK(newConfig.initialize(BSON("_id" + << "rs0" + << "version" << 2 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "h2") + << BSON("_id" << 0 << "host" + << "h3"))))); + + ReplicationCoordinatorExternalStateMock presentOnceExternalState; + presentOnceExternalState.addSelf(HostAndPort("h2")); + ASSERT_EQUALS( + ErrorCodes::BadValue, + validateConfigForHeartbeatReconfig(&presentOnceExternalState, newConfig).getStatus()); +} + +TEST(ValidateConfigForHeartbeatReconfig, NewConfigValid) { + // The new config is valid. This tests that validateConfigForHeartbeatReconfig will return + // a Status::OK() indicating the validity of this config change. + ReplicaSetConfig newConfig; + ASSERT_OK(newConfig.initialize(BSON("_id" + << "rs0" + << "version" << 2 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "h2") + << BSON("_id" << 1 << "host" + << "h3"))))); + + ReplicationCoordinatorExternalStateMock presentOnceExternalState; + presentOnceExternalState.addSelf(HostAndPort("h2")); + ASSERT_OK(validateConfigForHeartbeatReconfig(&presentOnceExternalState, newConfig).getStatus()); +} + +TEST(ValidateForReconfig, ForceStillNeedsValidConfig) { + // The new config is invalid due to two nodes with the same _id value. This tests that + // ValidateForReconfig fails with an invalid config, even if force is true. 
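Taken together, ForceStillNeedsValidConfig and ForceStillNeedsSelfPresent delimit what a forced reconfig may skip: only the electability requirement on the local node is waived, while structural validity of the new config, compatibility with the old config, and the find-self requirement all still apply. The tail of validateConfigForReconfig shown earlier in this diff reduces to this dispatch:

    // Restatement of the existing dispatch, not new behavior: force waives
    // only the electability check.
    return force ? findSelfInConfig(externalState, newConfig)
                 : findSelfInConfigIfElectable(externalState, newConfig);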
+ ReplicaSetConfig oldConfig; + ASSERT_OK(oldConfig.initialize(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "h2") + << BSON("_id" << 1 << "host" + << "h3"))))); + + + ReplicaSetConfig newConfig; + ASSERT_OK(newConfig.initialize(BSON("_id" + << "rs0" + << "version" << 2 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "h2") + << BSON("_id" << 0 << "host" + << "h3"))))); + + ReplicationCoordinatorExternalStateMock presentOnceExternalState; + presentOnceExternalState.addSelf(HostAndPort("h2")); + ASSERT_EQUALS(ErrorCodes::BadValue, + validateConfigForReconfig(&presentOnceExternalState, oldConfig, newConfig, true) + .getStatus()); +} + +TEST(ValidateForReconfig, ForceStillNeedsSelfPresent) { + // The new config does not contain self. This tests that ValidateForReconfig fails + // if the member receiving it is absent from the config, even if force is true. + ReplicaSetConfig oldConfig; + ASSERT_OK(oldConfig.initialize(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "h2") + << BSON("_id" << 1 << "host" + << "h3"))))); + + + ReplicaSetConfig newConfig; + ASSERT_OK(newConfig.initialize(BSON("_id" + << "rs0" + << "version" << 2 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "h3") + << BSON("_id" << 2 << "host" + << "h4"))))); + + ReplicationCoordinatorExternalStateMock presentOnceExternalState; + presentOnceExternalState.addSelf(HostAndPort("h2")); + ASSERT_EQUALS(ErrorCodes::NodeNotFound, + validateConfigForReconfig(&presentOnceExternalState, oldConfig, newConfig, true) + .getStatus()); +} } // namespace } // namespace repl diff --git a/src/mongo/db/repl/replica_set_config_test.cpp b/src/mongo/db/repl/replica_set_config_test.cpp index b77a8844079..73e1b42d305 100644 --- a/src/mongo/db/repl/replica_set_config_test.cpp +++ b/src/mongo/db/repl/replica_set_config_test.cpp @@ -36,654 +36,704 @@ namespace mongo { namespace repl { namespace { - TEST(ReplicaSetConfig, ParseMinimalConfigAndCheckDefaults) { - ReplicaSetConfig config; - ASSERT_OK(config.initialize( - BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY(BSON("_id" << 0 << - "host" << "localhost:12345"))))); - ASSERT_OK(config.validate()); - ASSERT_EQUALS("rs0", config.getReplSetName()); - ASSERT_EQUALS(1, config.getConfigVersion()); - ASSERT_EQUALS(1, config.getNumMembers()); - ASSERT_EQUALS(0, config.membersBegin()->getId()); - ASSERT_EQUALS(1, config.getDefaultWriteConcern().wNumNodes); - ASSERT_EQUALS("", config.getDefaultWriteConcern().wMode); - ASSERT_EQUALS(10, config.getHeartbeatTimeoutPeriod().total_seconds()); - ASSERT_TRUE(config.isChainingAllowed()); - } - - TEST(ReplicaSetConfig, MajorityCalculationThreeVotersNoArbiters) { - ReplicaSetConfig config; - ASSERT_OK(config.initialize( - BSON("_id" << "rs0" << - "version" << 2 << - "members" << BSON_ARRAY( - BSON("_id" << 1 << "host" << "h1:1") << - BSON("_id" << 2 << "host" << "h2:1") << - BSON("_id" << 3 << "host" << "h3:1") << - BSON("_id" << 4 << "host" << "h4:1" << "votes" << 0) << - BSON("_id" << 5 << "host" << "h5:1" << "votes" << 0))))); - ASSERT_OK(config.validate()); - - ASSERT_EQUALS(2, config.getWriteMajority()); - } - - TEST(ReplicaSetConfig, MajorityCalculationNearlyHalfArbiters) { - ReplicaSetConfig config; - ASSERT_OK(config.initialize( - BSON("_id" << "mySet" << - "version" << 2 << - "members" << BSON_ARRAY(BSON("host" << "node1:12345" << "_id" << 0) << - BSON("host" << "node2:12345" << "_id" << 1) << - BSON("host" << 
"node3:12345" << "_id" << 2) << - BSON("host" << "node4:12345" << - "_id" << 3 << - "arbiterOnly" << true) << - BSON("host" << "node5:12345" << - "_id" << 4 << - "arbiterOnly" << true))))); - ASSERT_OK(config.validate()); - ASSERT_EQUALS(3, config.getWriteMajority()); - } - - TEST(ReplicaSetConfig, MajorityCalculationNearlyHalfArbitersOthersNoVote) { - ReplicaSetConfig config; - ASSERT_OK(config.initialize( - BSON("_id" << "mySet" << - "version" << 2 << - "members" << BSON_ARRAY(BSON("host" << "node1:12345" << - "_id" << 0 << - "votes" << 0) << - BSON("host" << "node2:12345" << - "_id" << 1 << - "votes" << 0) << - BSON("host" << "node3:12345" << - "_id" << 2 << - "votes" << 0) << - BSON("host" << "node4:12345" << - "_id" << 3 << - "arbiterOnly" << true) << - BSON("host" << "node5:12345" << - "_id" << 4 << - "arbiterOnly" << true))))); - ASSERT_OK(config.validate()); - ASSERT_EQUALS(0, config.getWriteMajority()); - } - - TEST(ReplicaSetConfig, MajorityCalculationEvenNumberOfMembers) { - ReplicaSetConfig config; - ASSERT_OK(config.initialize( - BSON("_id" << "mySet" << - "version" << 2 << - "members" << BSON_ARRAY(BSON("host" << "node1:12345" << "_id" << 0) << - BSON("host" << "node2:12345" << "_id" << 1) << - BSON("host" << "node3:12345" << "_id" << 2) << - BSON("host" << "node4:12345" << "_id" << 3))))); - ASSERT_OK(config.validate()); - ASSERT_EQUALS(3, config.getWriteMajority()); - } - - TEST(ReplicaSetConfig, MajorityCalculationNearlyHalfSecondariesNoVotes) { - ReplicaSetConfig config; - ASSERT_OK(config.initialize( - BSON("_id" << "mySet" << - "version" << 2 << - "members" << BSON_ARRAY(BSON("host" << "node1:12345" << "_id" << 0) << - BSON("host" << "node2:12345" << - "_id" << 1 << - "votes" << 0) << - BSON("host" << "node3:12345" << - "_id" << 2 << - "votes" << 0) << - BSON("host" << "node4:12345" << "_id" << 3) << - BSON("host" << "node5:12345" << "_id" << 4))))); - ASSERT_OK(config.validate()); - ASSERT_EQUALS(2, config.getWriteMajority()); - } - - TEST(ReplicaSetConfig, ParseFailsWithBadOrMissingIdField) { - ReplicaSetConfig config; - // Replica set name must be a string. - ASSERT_EQUALS( - ErrorCodes::TypeMismatch, - config.initialize( - BSON("_id" << 1 << - "version" << 1 << - "members" << BSON_ARRAY(BSON("_id" << 0 << - "host" << "localhost:12345"))))); - - // Replica set name must be present. - ASSERT_EQUALS( - ErrorCodes::NoSuchKey, - config.initialize( - BSON("version" << 1 << - "members" << BSON_ARRAY(BSON("_id" << 0 << - "host" << "localhost:12345"))))); - - // Empty repl set name parses, but does not validate. - ASSERT_OK(config.initialize( - BSON("_id" << "" << - "version" << 1 << - "members" << BSON_ARRAY(BSON("_id" << 0 << - "host" << "localhost:12345"))))); - - ASSERT_EQUALS(ErrorCodes::BadValue, config.validate()); - } - - TEST(ReplicaSetConfig, ParseFailsWithBadOrMissingVersionField) { - ReplicaSetConfig config; - // Config version field must be present. 
- ASSERT_EQUALS( - ErrorCodes::NoSuchKey, - config.initialize( - BSON("_id" << "rs0" << - "members" << BSON_ARRAY(BSON("_id" << 0 << - "host" << "localhost:12345"))))); - ASSERT_EQUALS( - ErrorCodes::TypeMismatch, - config.initialize( - BSON("_id" << "rs0" << - "version" << "1" << - "members" << BSON_ARRAY(BSON("_id" << 0 << - "host" << "localhost:12345"))))); - - ASSERT_OK(config.initialize( - BSON("_id" << "rs0" << - "version" << 1.0 << - "members" << BSON_ARRAY(BSON("_id" << 0 << - "host" << "localhost:12345"))))); - ASSERT_OK(config.validate()); - ASSERT_OK(config.initialize( - BSON("_id" << "rs0" << - "version" << 0.0 << - "members" << BSON_ARRAY(BSON("_id" << 0 << - "host" << "localhost:12345"))))); - ASSERT_EQUALS(ErrorCodes::BadValue, config.validate()); - ASSERT_OK(config.initialize( - BSON("_id" << "rs0" << - "version" << - static_cast<long long>(std::numeric_limits<int>::max()) + 1 << - "members" << BSON_ARRAY(BSON("_id" << 0 << - "host" << "localhost:12345"))))); - ASSERT_EQUALS(ErrorCodes::BadValue, config.validate()); - } - - TEST(ReplicaSetConfig, ParseFailsWithBadMembers) { - ReplicaSetConfig config; - ASSERT_EQUALS(ErrorCodes::TypeMismatch, - config.initialize( - BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY(BSON("_id" << 0 << - "host" << "localhost:12345") << - "localhost:23456")))); - ASSERT_EQUALS(ErrorCodes::NoSuchKey, - config.initialize( - BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY(BSON("host" << "localhost:12345"))))); - } - - TEST(ReplicaSetConfig, ParseFailsWithLocalNonLocalHostMix) { - ReplicaSetConfig config; - ASSERT_OK(config.initialize(BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY(BSON("_id" << 0 << - "host" << "localhost") << - BSON("_id" << 1 << - "host" << "otherhost"))))); - ASSERT_EQUALS(ErrorCodes::BadValue, config.validate()); - } - - TEST(ReplicaSetConfig, ParseFailsWithNoElectableNodes) { - ReplicaSetConfig config; - const BSONObj configBsonNoElectableNodes = BSON( - "_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 0 << "host" << "localhost:1" << "priority" << 0) << - BSON("_id" << 1 << "host" << "localhost:2" << "priority" << 0))); - - ASSERT_OK(config.initialize(configBsonNoElectableNodes)); - ASSERT_EQUALS(ErrorCodes::BadValue, config.validate()); - - const BSONObj configBsonNoElectableNodesOneArbiter = BSON( - "_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 0 << "host" << "localhost:1" << "arbiterOnly" << 1) << - BSON("_id" << 1 << "host" << "localhost:2" << "priority" << 0))); - - ASSERT_OK(config.initialize(configBsonNoElectableNodesOneArbiter)); - ASSERT_EQUALS(ErrorCodes::BadValue, config.validate()); - - const BSONObj configBsonNoElectableNodesTwoArbiters = BSON( - "_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 0 << "host" << "localhost:1" << "arbiterOnly" << 1) << - BSON("_id" << 1 << "host" << "localhost:2" << "arbiterOnly" << 1))); - - ASSERT_OK(config.initialize(configBsonNoElectableNodesOneArbiter)); - ASSERT_EQUALS(ErrorCodes::BadValue, config.validate()); - - const BSONObj configBsonOneElectableNode = BSON( - "_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 0 << "host" << "localhost:1" << "priority" << 0) << - BSON("_id" << 1 << "host" << "localhost:2" << "priority" << 1))); - ASSERT_OK(config.initialize(configBsonOneElectableNode)); - ASSERT_OK(config.validate()); - } - - TEST(ReplicaSetConfig, ParseFailsWithTooFewVoters) { - 
ReplicaSetConfig config; - const BSONObj configBsonNoVoters = BSON( - "_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 0 << "host" << "localhost:1" << "votes" << 0) << - BSON("_id" << 1 << "host" << "localhost:2" << "votes" << 0))); - - ASSERT_OK(config.initialize(configBsonNoVoters)); - ASSERT_EQUALS(ErrorCodes::BadValue, config.validate()); - - const BSONObj configBsonOneVoter = BSON( - "_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 0 << "host" << "localhost:1" << "votes" << 0) << - BSON("_id" << 1 << "host" << "localhost:2" << "votes" << 1))); - ASSERT_OK(config.initialize(configBsonOneVoter)); - ASSERT_OK(config.validate()); - } - - TEST(ReplicaSetConfig, ParseFailsWithTooManyVoters) { - ReplicaSetConfig config; - namespace mmb = mutablebson; - mmb::Document configDoc; - mmb::Element configDocRoot = configDoc.root(); - ASSERT_OK(configDocRoot.appendString("_id", "rs0")); - ASSERT_OK(configDocRoot.appendInt("version", 1)); - mmb::Element membersArray = configDoc.makeElementArray("members"); - ASSERT_OK(configDocRoot.pushBack(membersArray)); - for (size_t i = 0; i < ReplicaSetConfig::kMaxVotingMembers + 1; ++i) { - mmb::Element memberElement = configDoc.makeElementObject(""); - ASSERT_OK(membersArray.pushBack(memberElement)); - ASSERT_OK(memberElement.appendInt("_id", i)); - ASSERT_OK(memberElement.appendString( - "host", std::string(str::stream() << "localhost" << i + 1))); - ASSERT_OK(memberElement.appendInt("votes", 1)); - } - - const BSONObj configBsonTooManyVoters = configDoc.getObject(); - - membersArray.leftChild().findFirstChildNamed("votes").setValueInt(0); - const BSONObj configBsonMaxVoters = configDoc.getObject(); - - - ASSERT_OK(config.initialize(configBsonMaxVoters)); - ASSERT_OK(config.validate()); - ASSERT_OK(config.initialize(configBsonTooManyVoters)); - ASSERT_EQUALS(ErrorCodes::BadValue, config.validate()); - } - - TEST(ReplicaSetConfig, ParseFailsWithDuplicateHost) { - ReplicaSetConfig config; - const BSONObj configBson = BSON( - "_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 0 << "host" << "localhost:1") << - BSON("_id" << 1 << "host" << "localhost:1"))); - ASSERT_OK(config.initialize(configBson)); - ASSERT_EQUALS(ErrorCodes::BadValue, config.validate()); - } - - TEST(ReplicaSetConfig, ParseFailsWithTooManyNodes) { - ReplicaSetConfig config; - namespace mmb = mutablebson; - mmb::Document configDoc; - mmb::Element configDocRoot = configDoc.root(); - ASSERT_OK(configDocRoot.appendString("_id", "rs0")); - ASSERT_OK(configDocRoot.appendInt("version", 1)); - mmb::Element membersArray = configDoc.makeElementArray("members"); - ASSERT_OK(configDocRoot.pushBack(membersArray)); - for (size_t i = 0; i < ReplicaSetConfig::kMaxMembers; ++i) { - mmb::Element memberElement = configDoc.makeElementObject(""); - ASSERT_OK(membersArray.pushBack(memberElement)); - ASSERT_OK(memberElement.appendInt("_id", i)); - ASSERT_OK(memberElement.appendString( - "host", std::string(str::stream() << "localhost" << i + 1))); - if (i >= ReplicaSetConfig::kMaxVotingMembers) { - ASSERT_OK(memberElement.appendInt("votes", 0)); - } - } - const BSONObj configBsonMaxNodes = configDoc.getObject(); - +TEST(ReplicaSetConfig, ParseMinimalConfigAndCheckDefaults) { + ReplicaSetConfig config; + ASSERT_OK(config.initialize(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "localhost:12345"))))); + ASSERT_OK(config.validate()); + ASSERT_EQUALS("rs0", 
config.getReplSetName()); + ASSERT_EQUALS(1, config.getConfigVersion()); + ASSERT_EQUALS(1, config.getNumMembers()); + ASSERT_EQUALS(0, config.membersBegin()->getId()); + ASSERT_EQUALS(1, config.getDefaultWriteConcern().wNumNodes); + ASSERT_EQUALS("", config.getDefaultWriteConcern().wMode); + ASSERT_EQUALS(10, config.getHeartbeatTimeoutPeriod().total_seconds()); + ASSERT_TRUE(config.isChainingAllowed()); +} + +TEST(ReplicaSetConfig, MajorityCalculationThreeVotersNoArbiters) { + ReplicaSetConfig config; + ASSERT_OK( + config.initialize(BSON("_id" + << "rs0" + << "version" << 2 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "h1:1") + << BSON("_id" << 2 << "host" + << "h2:1") << BSON("_id" << 3 << "host" + << "h3:1") + << BSON("_id" << 4 << "host" + << "h4:1" + << "votes" << 0) + << BSON("_id" << 5 << "host" + << "h5:1" + << "votes" << 0))))); + ASSERT_OK(config.validate()); + + ASSERT_EQUALS(2, config.getWriteMajority()); +} + +TEST(ReplicaSetConfig, MajorityCalculationNearlyHalfArbiters) { + ReplicaSetConfig config; + ASSERT_OK( + config.initialize(BSON("_id" + << "mySet" + << "version" << 2 << "members" + << BSON_ARRAY(BSON("host" + << "node1:12345" + << "_id" << 0) + << BSON("host" + << "node2:12345" + << "_id" << 1) << BSON("host" + << "node3:12345" + << "_id" << 2) + << BSON("host" + << "node4:12345" + << "_id" << 3 << "arbiterOnly" << true) + << BSON("host" + << "node5:12345" + << "_id" << 4 << "arbiterOnly" << true))))); + ASSERT_OK(config.validate()); + ASSERT_EQUALS(3, config.getWriteMajority()); +} + +TEST(ReplicaSetConfig, MajorityCalculationNearlyHalfArbitersOthersNoVote) { + ReplicaSetConfig config; + ASSERT_OK( + config.initialize(BSON("_id" + << "mySet" + << "version" << 2 << "members" + << BSON_ARRAY(BSON("host" + << "node1:12345" + << "_id" << 0 << "votes" << 0) + << BSON("host" + << "node2:12345" + << "_id" << 1 << "votes" << 0) + << BSON("host" + << "node3:12345" + << "_id" << 2 << "votes" << 0) + << BSON("host" + << "node4:12345" + << "_id" << 3 << "arbiterOnly" << true) + << BSON("host" + << "node5:12345" + << "_id" << 4 << "arbiterOnly" << true))))); + ASSERT_OK(config.validate()); + ASSERT_EQUALS(0, config.getWriteMajority()); +} + +TEST(ReplicaSetConfig, MajorityCalculationEvenNumberOfMembers) { + ReplicaSetConfig config; + ASSERT_OK(config.initialize(BSON("_id" + << "mySet" + << "version" << 2 << "members" + << BSON_ARRAY(BSON("host" + << "node1:12345" + << "_id" << 0) + << BSON("host" + << "node2:12345" + << "_id" << 1) << BSON("host" + << "node3:12345" + << "_id" << 2) + << BSON("host" + << "node4:12345" + << "_id" << 3))))); + ASSERT_OK(config.validate()); + ASSERT_EQUALS(3, config.getWriteMajority()); +} + +TEST(ReplicaSetConfig, MajorityCalculationNearlyHalfSecondariesNoVotes) { + ReplicaSetConfig config; + ASSERT_OK( + config.initialize(BSON("_id" + << "mySet" + << "version" << 2 << "members" + << BSON_ARRAY(BSON("host" + << "node1:12345" + << "_id" << 0) + << BSON("host" + << "node2:12345" + << "_id" << 1 << "votes" << 0) + << BSON("host" + << "node3:12345" + << "_id" << 2 << "votes" << 0) + << BSON("host" + << "node4:12345" + << "_id" << 3) << BSON("host" + << "node5:12345" + << "_id" << 4))))); + ASSERT_OK(config.validate()); + ASSERT_EQUALS(2, config.getWriteMajority()); +} + +TEST(ReplicaSetConfig, ParseFailsWithBadOrMissingIdField) { + ReplicaSetConfig config; + // Replica set name must be a string. 
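+    // (Below, a non-string name fails initialize() with TypeMismatch, a missing name fails
+    // with NoSuchKey, and an empty name parses but is then rejected by validate() with
+    // BadValue.)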
+ ASSERT_EQUALS(ErrorCodes::TypeMismatch, + config.initialize(BSON("_id" << 1 << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "localhost:12345"))))); + + // Replica set name must be present. + ASSERT_EQUALS( + ErrorCodes::NoSuchKey, + config.initialize( + BSON("version" << 1 << "members" << BSON_ARRAY(BSON("_id" << 0 << "host" + << "localhost:12345"))))); + + // Empty repl set name parses, but does not validate. + ASSERT_OK(config.initialize(BSON("_id" + << "" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "localhost:12345"))))); + + ASSERT_EQUALS(ErrorCodes::BadValue, config.validate()); +} + +TEST(ReplicaSetConfig, ParseFailsWithBadOrMissingVersionField) { + ReplicaSetConfig config; + // Config version field must be present. + ASSERT_EQUALS( + ErrorCodes::NoSuchKey, + config.initialize(BSON("_id" + << "rs0" + << "members" << BSON_ARRAY(BSON("_id" << 0 << "host" + << "localhost:12345"))))); + ASSERT_EQUALS( + ErrorCodes::TypeMismatch, + config.initialize(BSON("_id" + << "rs0" + << "version" + << "1" + << "members" << BSON_ARRAY(BSON("_id" << 0 << "host" + << "localhost:12345"))))); + + ASSERT_OK(config.initialize(BSON("_id" + << "rs0" + << "version" << 1.0 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "localhost:12345"))))); + ASSERT_OK(config.validate()); + ASSERT_OK(config.initialize(BSON("_id" + << "rs0" + << "version" << 0.0 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "localhost:12345"))))); + ASSERT_EQUALS(ErrorCodes::BadValue, config.validate()); + ASSERT_OK( + config.initialize(BSON("_id" + << "rs0" + << "version" + << static_cast<long long>(std::numeric_limits<int>::max()) + 1 + << "members" << BSON_ARRAY(BSON("_id" << 0 << "host" + << "localhost:12345"))))); + ASSERT_EQUALS(ErrorCodes::BadValue, config.validate()); +} + +TEST(ReplicaSetConfig, ParseFailsWithBadMembers) { + ReplicaSetConfig config; + ASSERT_EQUALS(ErrorCodes::TypeMismatch, + config.initialize(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "localhost:12345") + << "localhost:23456")))); + ASSERT_EQUALS(ErrorCodes::NoSuchKey, + config.initialize(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("host" + << "localhost:12345"))))); +} + +TEST(ReplicaSetConfig, ParseFailsWithLocalNonLocalHostMix) { + ReplicaSetConfig config; + ASSERT_OK(config.initialize(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "localhost") + << BSON("_id" << 1 << "host" + << "otherhost"))))); + ASSERT_EQUALS(ErrorCodes::BadValue, config.validate()); +} + +TEST(ReplicaSetConfig, ParseFailsWithNoElectableNodes) { + ReplicaSetConfig config; + const BSONObj configBsonNoElectableNodes = BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "localhost:1" + << "priority" << 0) + << BSON("_id" << 1 << "host" + << "localhost:2" + << "priority" + << 0))); + + ASSERT_OK(config.initialize(configBsonNoElectableNodes)); + ASSERT_EQUALS(ErrorCodes::BadValue, config.validate()); + + const BSONObj configBsonNoElectableNodesOneArbiter = + BSON("_id" + << "rs0" + << "version" << 1 << "members" << BSON_ARRAY(BSON("_id" << 0 << "host" + << "localhost:1" + << "arbiterOnly" << 1) + << BSON("_id" << 1 << "host" + << "localhost:2" + << "priority" << 0))); + + ASSERT_OK(config.initialize(configBsonNoElectableNodesOneArbiter)); + ASSERT_EQUALS(ErrorCodes::BadValue, config.validate()); + + const 
BSONObj configBsonNoElectableNodesTwoArbiters = + BSON("_id" + << "rs0" + << "version" << 1 << "members" << BSON_ARRAY(BSON("_id" << 0 << "host" + << "localhost:1" + << "arbiterOnly" << 1) + << BSON("_id" << 1 << "host" + << "localhost:2" + << "arbiterOnly" << 1))); + + ASSERT_OK(config.initialize(configBsonNoElectableNodesOneArbiter)); + ASSERT_EQUALS(ErrorCodes::BadValue, config.validate()); + + const BSONObj configBsonOneElectableNode = BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "localhost:1" + << "priority" << 0) + << BSON("_id" << 1 << "host" + << "localhost:2" + << "priority" + << 1))); + ASSERT_OK(config.initialize(configBsonOneElectableNode)); + ASSERT_OK(config.validate()); +} + +TEST(ReplicaSetConfig, ParseFailsWithTooFewVoters) { + ReplicaSetConfig config; + const BSONObj configBsonNoVoters = BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "localhost:1" + << "votes" << 0) + << BSON("_id" << 1 << "host" + << "localhost:2" + << "votes" << 0))); + + ASSERT_OK(config.initialize(configBsonNoVoters)); + ASSERT_EQUALS(ErrorCodes::BadValue, config.validate()); + + const BSONObj configBsonOneVoter = BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "localhost:1" + << "votes" << 0) + << BSON("_id" << 1 << "host" + << "localhost:2" + << "votes" << 1))); + ASSERT_OK(config.initialize(configBsonOneVoter)); + ASSERT_OK(config.validate()); +} + +TEST(ReplicaSetConfig, ParseFailsWithTooManyVoters) { + ReplicaSetConfig config; + namespace mmb = mutablebson; + mmb::Document configDoc; + mmb::Element configDocRoot = configDoc.root(); + ASSERT_OK(configDocRoot.appendString("_id", "rs0")); + ASSERT_OK(configDocRoot.appendInt("version", 1)); + mmb::Element membersArray = configDoc.makeElementArray("members"); + ASSERT_OK(configDocRoot.pushBack(membersArray)); + for (size_t i = 0; i < ReplicaSetConfig::kMaxVotingMembers + 1; ++i) { mmb::Element memberElement = configDoc.makeElementObject(""); ASSERT_OK(membersArray.pushBack(memberElement)); - ASSERT_OK(memberElement.appendInt("_id", ReplicaSetConfig::kMaxMembers)); - ASSERT_OK(memberElement.appendString( - "host", std::string(str::stream() << - "localhost" << ReplicaSetConfig::kMaxMembers + 1))); - ASSERT_OK(memberElement.appendInt("votes", 0)); - const BSONObj configBsonTooManyNodes = configDoc.getObject(); - - - ASSERT_OK(config.initialize(configBsonMaxNodes)); - ASSERT_OK(config.validate()); - ASSERT_OK(config.initialize(configBsonTooManyNodes)); - ASSERT_EQUALS(ErrorCodes::BadValue, config.validate()); - } - - TEST(ReplicaSetConfig, ParseFailsWithUnexpectedField) { - ReplicaSetConfig config; - Status status = config.initialize(BSON("_id" << "rs0" << - "version" << 1 << - "unexpectedfield" << "value")); - ASSERT_EQUALS(ErrorCodes::BadValue, status); - } - - TEST(ReplicaSetConfig, ParseFailsWithNonArrayMembersField) { - ReplicaSetConfig config; - Status status = config.initialize(BSON("_id" << "rs0" << - "version" << 1 << - "members" << "value")); - ASSERT_EQUALS(ErrorCodes::TypeMismatch, status); - } - - TEST(ReplicaSetConfig, ParseFailsWithNonNumericHeartbeatTimeoutSecsField) { - ReplicaSetConfig config; - Status status = config.initialize(BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY(BSON("_id" << 0 << - "host" << "localhost:12345")) << - "settings" << BSON("heartbeatTimeoutSecs" << "no"))); - ASSERT_EQUALS(ErrorCodes::TypeMismatch, status); - } - - 
TEST(ReplicaSetConfig, ParseFailsWithNonBoolChainingAllowedField) { - ReplicaSetConfig config; - Status status = config.initialize(BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY(BSON("_id" << 0 << - "host" << "localhost:12345")) << - "settings" << BSON("chainingAllowed" << "no"))); - ASSERT_EQUALS(ErrorCodes::TypeMismatch, status); - } - - TEST(ReplicaSetConfig, ParseFailsWithNonObjectSettingsField) { - ReplicaSetConfig config; - Status status = config.initialize(BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY(BSON("_id" << 0 << - "host" << "localhost:12345")) << - "settings" << "none")); - ASSERT_EQUALS(ErrorCodes::TypeMismatch, status); - } - - TEST(ReplicaSetConfig, ParseFailsWithGetLastErrorDefaultsFieldUnparseable) { - ReplicaSetConfig config; - Status status = config.initialize(BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY(BSON("_id" << 0 << - "host" << "localhost:12345")) << - "settings" << BSON("getLastErrorDefaults" << BSON( - "fsync" << "seven")))); - ASSERT_EQUALS(ErrorCodes::FailedToParse, status); + ASSERT_OK(memberElement.appendInt("_id", i)); + ASSERT_OK( + memberElement.appendString("host", std::string(str::stream() << "localhost" << i + 1))); + ASSERT_OK(memberElement.appendInt("votes", 1)); } - TEST(ReplicaSetConfig, ParseFailsWithNonObjectGetLastErrorDefaultsField) { - ReplicaSetConfig config; - Status status = config.initialize(BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY(BSON("_id" << 0 << - "host" << "localhost:12345")) << - "settings" << BSON("getLastErrorDefaults" << "no"))); - ASSERT_EQUALS(ErrorCodes::TypeMismatch, status); - } - - TEST(ReplicaSetConfig, ParseFailsWithNonObjectGetLastErrorModesField) { - ReplicaSetConfig config; - Status status = config.initialize(BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY(BSON("_id" << 0 << - "host" << "localhost:12345")) << - "settings" << BSON("getLastErrorModes" << "no"))); - ASSERT_EQUALS(ErrorCodes::TypeMismatch, status); - } - - TEST(ReplicaSetConfig, ParseFailsWithDuplicateGetLastErrorModesField) { - ReplicaSetConfig config; - Status status = config.initialize(BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY(BSON("_id" << 0 << - "host" << "localhost:12345" << - "tags" << BSON("tag" << "yes"))) << - "settings" << BSON("getLastErrorModes" << BSON( - "one" << BSON("tag" << 1) << - "one" << BSON("tag" << 1))))); - ASSERT_EQUALS(ErrorCodes::DuplicateKey, status); - } - - TEST(ReplicaSetConfig, ParseFailsWithNonObjectGetLastErrorModesEntryField) { - ReplicaSetConfig config; - Status status = config.initialize(BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY(BSON("_id" << 0 << - "host" << "localhost:12345" << - "tags" << BSON("tag" << "yes"))) << - "settings" << BSON("getLastErrorModes" << BSON( - "one" << 1)))); - ASSERT_EQUALS(ErrorCodes::TypeMismatch, status); - } - - TEST(ReplicaSetConfig, ParseFailsWithNonNumericGetLastErrorModesConstraintValue) { - ReplicaSetConfig config; - Status status = config.initialize(BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY(BSON("_id" << 0 << - "host" << "localhost:12345" << - "tags" << BSON("tag" << "yes"))) << - "settings" << BSON("getLastErrorModes" << BSON( - "one" << BSON("tag" << "no"))))); - ASSERT_EQUALS(ErrorCodes::TypeMismatch, status); - } - - TEST(ReplicaSetConfig, ParseFailsWithNegativeGetLastErrorModesConstraintValue) { - ReplicaSetConfig config; - Status status = config.initialize(BSON("_id" << "rs0" << 
- "version" << 1 << - "members" << BSON_ARRAY(BSON("_id" << 0 << - "host" << "localhost:12345" << - "tags" << BSON("tag" << "yes"))) << - "settings" << BSON("getLastErrorModes" << BSON( - "one" << BSON("tag" << -1))))); - ASSERT_EQUALS(ErrorCodes::BadValue, status); - } - - TEST(ReplicaSetConfig, ParseFailsWithNonExistentGetLastErrorModesConstraintTag) { - ReplicaSetConfig config; - Status status = config.initialize(BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY(BSON("_id" << 0 << - "host" << "localhost:12345" << - "tags" << BSON("tag" << "yes"))) << - "settings" << BSON("getLastErrorModes" << BSON( - "one" << BSON("tag2" << 1))))); - ASSERT_EQUALS(ErrorCodes::NoSuchKey, status); - } - - TEST(ReplicaSetConfig, ValidateFailsWithDuplicateMemberId) { - ReplicaSetConfig config; - Status status = config.initialize(BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY(BSON("_id" << 0 << - "host" << "localhost:12345") << - BSON("_id" << 0 << - "host" << "someoneelse:12345")))); - ASSERT_OK(status); - - status = config.validate(); - ASSERT_EQUALS(ErrorCodes::BadValue, status); - } - - TEST(ReplicaSetConfig, ValidateFailsWithInvalidMember) { - ReplicaSetConfig config; - Status status = config.initialize(BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY(BSON("_id" << 0 << - "host" << "localhost:12345" << - "hidden" << true)))); - ASSERT_OK(status); - - status = config.validate(); - ASSERT_EQUALS(ErrorCodes::BadValue, status); - } - - TEST(ReplicaSetConfig, ChainingAllowedField) { - ReplicaSetConfig config; - ASSERT_OK(config.initialize( - BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY(BSON("_id" << 0 << - "host" << "localhost:12345")) << - "settings" << BSON("chainingAllowed" << true)))); - ASSERT_OK(config.validate()); - ASSERT_TRUE(config.isChainingAllowed()); - - ASSERT_OK(config.initialize( - BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY(BSON("_id" << 0 << - "host" << "localhost:12345")) << - "settings" << BSON("chainingAllowed" << false)))); - ASSERT_OK(config.validate()); - ASSERT_FALSE(config.isChainingAllowed()); - } - - TEST(ReplicaSetConfig, HeartbeatTimeoutField) { - ReplicaSetConfig config; - ASSERT_OK(config.initialize( - BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY(BSON("_id" << 0 << - "host" << "localhost:12345")) << - "settings" << BSON("heartbeatTimeoutSecs" << 20)))); - ASSERT_OK(config.validate()); - ASSERT_EQUALS(20, config.getHeartbeatTimeoutPeriod().total_seconds()); - - ASSERT_OK(config.initialize( - BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY(BSON("_id" << 0 << - "host" << "localhost:12345")) << - "settings" << BSON("heartbeatTimeoutSecs" << -20)))); - ASSERT_EQUALS(ErrorCodes::BadValue, config.validate()); - } - - TEST(ReplicaSetConfig, GleDefaultField) { - ReplicaSetConfig config; - ASSERT_OK(config.initialize( - BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY(BSON("_id" << 0 << - "host" << "localhost:12345")) << - "settings" << BSON( - "getLastErrorDefaults" << BSON("w" << "majority"))))); - ASSERT_OK(config.validate()); - ASSERT_EQUALS("majority", config.getDefaultWriteConcern().wMode); - - ASSERT_OK(config.initialize( - BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY(BSON("_id" << 0 << - "host" << "localhost:12345")) << - "settings" << BSON( - "getLastErrorDefaults" << BSON("w" << "frim"))))); - ASSERT_EQUALS(ErrorCodes::BadValue, config.validate()); - - ASSERT_OK(config.initialize( 
- BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY(BSON("_id" << 0 << - "host" << "localhost:12345")) << - "settings" << BSON( - "getLastErrorDefaults" << BSON("w" << 0))))); - ASSERT_EQUALS(ErrorCodes::BadValue, config.validate()); - - ASSERT_OK(config.initialize( - BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY(BSON("_id" << 0 << - "host" << "localhost:12345" << - "tags" << BSON("a" << "v"))) << - "settings" << BSON( - "getLastErrorDefaults" << BSON("w" << "frim") << - "getLastErrorModes" << BSON("frim" << BSON("a" << 1)))))); - ASSERT_OK(config.validate()); - ASSERT_EQUALS("frim", config.getDefaultWriteConcern().wMode); - ASSERT_OK(config.findCustomWriteMode("frim").getStatus()); - } - - bool operator==(const MemberConfig& a, const MemberConfig& b) { - // do tag comparisons - for (MemberConfig::TagIterator itrA = a.tagsBegin(); itrA != a.tagsEnd(); ++itrA) { - if (std::find(b.tagsBegin(), b.tagsEnd(), *itrA) == b.tagsEnd()) { - return false; - } + const BSONObj configBsonTooManyVoters = configDoc.getObject(); + + membersArray.leftChild().findFirstChildNamed("votes").setValueInt(0); + const BSONObj configBsonMaxVoters = configDoc.getObject(); + + + ASSERT_OK(config.initialize(configBsonMaxVoters)); + ASSERT_OK(config.validate()); + ASSERT_OK(config.initialize(configBsonTooManyVoters)); + ASSERT_EQUALS(ErrorCodes::BadValue, config.validate()); +} + +TEST(ReplicaSetConfig, ParseFailsWithDuplicateHost) { + ReplicaSetConfig config; + const BSONObj configBson = BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "localhost:1") + << BSON("_id" << 1 << "host" + << "localhost:1"))); + ASSERT_OK(config.initialize(configBson)); + ASSERT_EQUALS(ErrorCodes::BadValue, config.validate()); +} + +TEST(ReplicaSetConfig, ParseFailsWithTooManyNodes) { + ReplicaSetConfig config; + namespace mmb = mutablebson; + mmb::Document configDoc; + mmb::Element configDocRoot = configDoc.root(); + ASSERT_OK(configDocRoot.appendString("_id", "rs0")); + ASSERT_OK(configDocRoot.appendInt("version", 1)); + mmb::Element membersArray = configDoc.makeElementArray("members"); + ASSERT_OK(configDocRoot.pushBack(membersArray)); + for (size_t i = 0; i < ReplicaSetConfig::kMaxMembers; ++i) { + mmb::Element memberElement = configDoc.makeElementObject(""); + ASSERT_OK(membersArray.pushBack(memberElement)); + ASSERT_OK(memberElement.appendInt("_id", i)); + ASSERT_OK( + memberElement.appendString("host", std::string(str::stream() << "localhost" << i + 1))); + if (i >= ReplicaSetConfig::kMaxVotingMembers) { + ASSERT_OK(memberElement.appendInt("votes", 0)); } - return a.getId() == b.getId() && - a.getHostAndPort() == b.getHostAndPort() && - a.getPriority() == b.getPriority() && - a.getSlaveDelay() == b.getSlaveDelay() && - a.isVoter() == b.isVoter() && - a.isArbiter() == b.isArbiter() && - a.isHidden() == b.isHidden() && - a.shouldBuildIndexes() == b.shouldBuildIndexes() && - a.getNumTags() == b.getNumTags(); } - - bool operator==(const ReplicaSetConfig& a, const ReplicaSetConfig& b) { - // compare WriteConcernModes - std::vector<std::string> modeNames = a.getWriteConcernNames(); - for (std::vector<std::string>::iterator it = modeNames.begin(); - it != modeNames.end(); - it++) { - ReplicaSetTagPattern patternA = a.findCustomWriteMode(*it).getValue(); - ReplicaSetTagPattern patternB = b.findCustomWriteMode(*it).getValue(); - for (ReplicaSetTagPattern::ConstraintIterator itrA = patternA.constraintsBegin(); - itrA != patternA.constraintsEnd(); - 
itrA++) { - bool same = false; - for (ReplicaSetTagPattern::ConstraintIterator itrB = patternB.constraintsBegin(); - itrB != patternB.constraintsEnd(); - itrB++) { - if (itrA->getKeyIndex() == itrB->getKeyIndex() && - itrA->getMinCount() == itrB->getMinCount()) { - same = true; - break; - } - } - if (!same) { - return false; - } - } + const BSONObj configBsonMaxNodes = configDoc.getObject(); + + mmb::Element memberElement = configDoc.makeElementObject(""); + ASSERT_OK(membersArray.pushBack(memberElement)); + ASSERT_OK(memberElement.appendInt("_id", ReplicaSetConfig::kMaxMembers)); + ASSERT_OK(memberElement.appendString( + "host", std::string(str::stream() << "localhost" << ReplicaSetConfig::kMaxMembers + 1))); + ASSERT_OK(memberElement.appendInt("votes", 0)); + const BSONObj configBsonTooManyNodes = configDoc.getObject(); + + + ASSERT_OK(config.initialize(configBsonMaxNodes)); + ASSERT_OK(config.validate()); + ASSERT_OK(config.initialize(configBsonTooManyNodes)); + ASSERT_EQUALS(ErrorCodes::BadValue, config.validate()); +} + +TEST(ReplicaSetConfig, ParseFailsWithUnexpectedField) { + ReplicaSetConfig config; + Status status = config.initialize(BSON("_id" + << "rs0" + << "version" << 1 << "unexpectedfield" + << "value")); + ASSERT_EQUALS(ErrorCodes::BadValue, status); +} + +TEST(ReplicaSetConfig, ParseFailsWithNonArrayMembersField) { + ReplicaSetConfig config; + Status status = config.initialize(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << "value")); + ASSERT_EQUALS(ErrorCodes::TypeMismatch, status); +} + +TEST(ReplicaSetConfig, ParseFailsWithNonNumericHeartbeatTimeoutSecsField) { + ReplicaSetConfig config; + Status status = config.initialize(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "localhost:12345")) + << "settings" << BSON("heartbeatTimeoutSecs" + << "no"))); + ASSERT_EQUALS(ErrorCodes::TypeMismatch, status); +} + +TEST(ReplicaSetConfig, ParseFailsWithNonBoolChainingAllowedField) { + ReplicaSetConfig config; + Status status = config.initialize(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "localhost:12345")) + << "settings" << BSON("chainingAllowed" + << "no"))); + ASSERT_EQUALS(ErrorCodes::TypeMismatch, status); +} + +TEST(ReplicaSetConfig, ParseFailsWithNonObjectSettingsField) { + ReplicaSetConfig config; + Status status = + config.initialize(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "localhost:12345")) << "settings" + << "none")); + ASSERT_EQUALS(ErrorCodes::TypeMismatch, status); +} + +TEST(ReplicaSetConfig, ParseFailsWithGetLastErrorDefaultsFieldUnparseable) { + ReplicaSetConfig config; + Status status = + config.initialize(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "localhost:12345")) << "settings" + << BSON("getLastErrorDefaults" << BSON("fsync" + << "seven")))); + ASSERT_EQUALS(ErrorCodes::FailedToParse, status); +} + +TEST(ReplicaSetConfig, ParseFailsWithNonObjectGetLastErrorDefaultsField) { + ReplicaSetConfig config; + Status status = config.initialize(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "localhost:12345")) + << "settings" << BSON("getLastErrorDefaults" + << "no"))); + ASSERT_EQUALS(ErrorCodes::TypeMismatch, status); +} + +TEST(ReplicaSetConfig, ParseFailsWithNonObjectGetLastErrorModesField) { + ReplicaSetConfig config; + Status status = config.initialize(BSON("_id" + 
<< "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "localhost:12345")) + << "settings" << BSON("getLastErrorModes" + << "no"))); + ASSERT_EQUALS(ErrorCodes::TypeMismatch, status); +} + +TEST(ReplicaSetConfig, ParseFailsWithDuplicateGetLastErrorModesField) { + ReplicaSetConfig config; + Status status = + config.initialize(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "localhost:12345" + << "tags" << BSON("tag" + << "yes"))) << "settings" + << BSON("getLastErrorModes" + << BSON("one" << BSON("tag" << 1) << "one" + << BSON("tag" << 1))))); + ASSERT_EQUALS(ErrorCodes::DuplicateKey, status); +} + +TEST(ReplicaSetConfig, ParseFailsWithNonObjectGetLastErrorModesEntryField) { + ReplicaSetConfig config; + Status status = + config.initialize(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "localhost:12345" + << "tags" << BSON("tag" + << "yes"))) << "settings" + << BSON("getLastErrorModes" << BSON("one" << 1)))); + ASSERT_EQUALS(ErrorCodes::TypeMismatch, status); +} + +TEST(ReplicaSetConfig, ParseFailsWithNonNumericGetLastErrorModesConstraintValue) { + ReplicaSetConfig config; + Status status = + config.initialize(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "localhost:12345" + << "tags" << BSON("tag" + << "yes"))) << "settings" + << BSON("getLastErrorModes" << BSON("one" << BSON("tag" + << "no"))))); + ASSERT_EQUALS(ErrorCodes::TypeMismatch, status); +} + +TEST(ReplicaSetConfig, ParseFailsWithNegativeGetLastErrorModesConstraintValue) { + ReplicaSetConfig config; + Status status = + config.initialize(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "localhost:12345" + << "tags" << BSON("tag" + << "yes"))) << "settings" + << BSON("getLastErrorModes" << BSON("one" << BSON("tag" << -1))))); + ASSERT_EQUALS(ErrorCodes::BadValue, status); +} + +TEST(ReplicaSetConfig, ParseFailsWithNonExistentGetLastErrorModesConstraintTag) { + ReplicaSetConfig config; + Status status = + config.initialize(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "localhost:12345" + << "tags" << BSON("tag" + << "yes"))) << "settings" + << BSON("getLastErrorModes" << BSON("one" << BSON("tag2" << 1))))); + ASSERT_EQUALS(ErrorCodes::NoSuchKey, status); +} + +TEST(ReplicaSetConfig, ValidateFailsWithDuplicateMemberId) { + ReplicaSetConfig config; + Status status = config.initialize(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "localhost:12345") + << BSON("_id" << 0 << "host" + << "someoneelse:12345")))); + ASSERT_OK(status); + + status = config.validate(); + ASSERT_EQUALS(ErrorCodes::BadValue, status); +} + +TEST(ReplicaSetConfig, ValidateFailsWithInvalidMember) { + ReplicaSetConfig config; + Status status = config.initialize(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "localhost:12345" + << "hidden" << true)))); + ASSERT_OK(status); + + status = config.validate(); + ASSERT_EQUALS(ErrorCodes::BadValue, status); +} + +TEST(ReplicaSetConfig, ChainingAllowedField) { + ReplicaSetConfig config; + ASSERT_OK(config.initialize(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "localhost:12345")) << "settings" + << BSON("chainingAllowed" << true)))); + 
ASSERT_OK(config.validate()); + ASSERT_TRUE(config.isChainingAllowed()); + + ASSERT_OK(config.initialize(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "localhost:12345")) << "settings" + << BSON("chainingAllowed" << false)))); + ASSERT_OK(config.validate()); + ASSERT_FALSE(config.isChainingAllowed()); +} + +TEST(ReplicaSetConfig, HeartbeatTimeoutField) { + ReplicaSetConfig config; + ASSERT_OK(config.initialize(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "localhost:12345")) << "settings" + << BSON("heartbeatTimeoutSecs" << 20)))); + ASSERT_OK(config.validate()); + ASSERT_EQUALS(20, config.getHeartbeatTimeoutPeriod().total_seconds()); + + ASSERT_OK(config.initialize(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "localhost:12345")) << "settings" + << BSON("heartbeatTimeoutSecs" << -20)))); + ASSERT_EQUALS(ErrorCodes::BadValue, config.validate()); +} + +TEST(ReplicaSetConfig, GleDefaultField) { + ReplicaSetConfig config; + ASSERT_OK(config.initialize(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "localhost:12345")) << "settings" + << BSON("getLastErrorDefaults" << BSON("w" + << "majority"))))); + ASSERT_OK(config.validate()); + ASSERT_EQUALS("majority", config.getDefaultWriteConcern().wMode); + + ASSERT_OK(config.initialize(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "localhost:12345")) << "settings" + << BSON("getLastErrorDefaults" << BSON("w" + << "frim"))))); + ASSERT_EQUALS(ErrorCodes::BadValue, config.validate()); + + ASSERT_OK(config.initialize(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "localhost:12345")) << "settings" + << BSON("getLastErrorDefaults" << BSON("w" << 0))))); + ASSERT_EQUALS(ErrorCodes::BadValue, config.validate()); + + ASSERT_OK(config.initialize(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "localhost:12345" + << "tags" << BSON("a" + << "v"))) + << "settings" << BSON("getLastErrorDefaults" + << BSON("w" + << "frim") << "getLastErrorModes" + << BSON("frim" << BSON("a" << 1)))))); + ASSERT_OK(config.validate()); + ASSERT_EQUALS("frim", config.getDefaultWriteConcern().wMode); + ASSERT_OK(config.findCustomWriteMode("frim").getStatus()); +} + +bool operator==(const MemberConfig& a, const MemberConfig& b) { + // do tag comparisons + for (MemberConfig::TagIterator itrA = a.tagsBegin(); itrA != a.tagsEnd(); ++itrA) { + if (std::find(b.tagsBegin(), b.tagsEnd(), *itrA) == b.tagsEnd()) { + return false; } - - // compare the members - for (ReplicaSetConfig::MemberIterator memA = a.membersBegin(); - memA != a.membersEnd(); - memA++) { + } + return a.getId() == b.getId() && a.getHostAndPort() == b.getHostAndPort() && + a.getPriority() == b.getPriority() && a.getSlaveDelay() == b.getSlaveDelay() && + a.isVoter() == b.isVoter() && a.isArbiter() == b.isArbiter() && + a.isHidden() == b.isHidden() && a.shouldBuildIndexes() == b.shouldBuildIndexes() && + a.getNumTags() == b.getNumTags(); +} + +bool operator==(const ReplicaSetConfig& a, const ReplicaSetConfig& b) { + // compare WriteConcernModes + std::vector<std::string> modeNames = a.getWriteConcernNames(); + for (std::vector<std::string>::iterator it = modeNames.begin(); it != modeNames.end(); it++) { + ReplicaSetTagPattern patternA = 
a.findCustomWriteMode(*it).getValue(); + ReplicaSetTagPattern patternB = b.findCustomWriteMode(*it).getValue(); + for (ReplicaSetTagPattern::ConstraintIterator itrA = patternA.constraintsBegin(); + itrA != patternA.constraintsEnd(); + itrA++) { bool same = false; - for (ReplicaSetConfig::MemberIterator memB = b.membersBegin(); - memB != b.membersEnd(); - memB++) { - if (*memA == *memB) { + for (ReplicaSetTagPattern::ConstraintIterator itrB = patternB.constraintsBegin(); + itrB != patternB.constraintsEnd(); + itrB++) { + if (itrA->getKeyIndex() == itrB->getKeyIndex() && + itrA->getMinCount() == itrB->getMinCount()) { same = true; break; } @@ -692,291 +742,445 @@ namespace { return false; } } - - // simple comparisons - return a.getReplSetName() == b.getReplSetName() && - a.getConfigVersion() == b.getConfigVersion() && - a.getNumMembers() == b.getNumMembers() && - a.getHeartbeatTimeoutPeriod() == b.getHeartbeatTimeoutPeriod() && - a.isChainingAllowed() == b.isChainingAllowed() && - a.getDefaultWriteConcern().wNumNodes == b.getDefaultWriteConcern().wNumNodes && - a.getDefaultWriteConcern().wMode == b.getDefaultWriteConcern().wMode; } - TEST(ReplicaSetConfig, toBSONRoundTripAbility) { - ReplicaSetConfig configA; - ReplicaSetConfig configB; - ASSERT_OK(configA.initialize( - BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY(BSON("_id" << 0 << - "host" << "localhost:12345")) << - "settings" << BSON("heartbeatTimeoutSecs" << 20)))); - ASSERT_OK(configB.initialize(configA.toBSON())); - ASSERT_TRUE(configA == configB); - } - - TEST(ReplicaSetConfig, toBSONRoundTripAbilityLarge) { - ReplicaSetConfig configA; - ReplicaSetConfig configB; - ASSERT_OK(configA.initialize( - BSON("_id" << "asdf" - << "version" << 9 - << "members" << BSON_ARRAY( - BSON("_id" << 0 - << "host" << "localhost:12345" - << "arbiterOnly" << true - << "votes" << 1 - ) << - BSON("_id" << 3 - << "host" << "localhost:3828" - << "arbiterOnly" << false - << "hidden" << true - << "buildIndexes" << false - << "priority" << 0 - << "slaveDelay" << 17 - << "votes" << 0 - << "tags" << BSON("coast" << "east" << "ssd" << "true") - ) << - BSON("_id" << 2 - << "host" << "foo.com:3828" - << "priority" << 9 - << "votes" << 0 - << "tags" << BSON("coast" << "west" << "hdd" << "true") - )) - << "settings" << BSON("heartbeatTimeoutSecs" << 20 - << "chainingAllowd" << true - << "getLastErrorDefaults" << BSON("w" << "majority") - << "getLastErrorModes" << BSON( - "disks" << BSON("ssd" << 1 << "hdd" << 1) - << "coasts" << BSON("coast" << 2))) - ))); - ASSERT_OK(configB.initialize(configA.toBSON())); - ASSERT_TRUE(configA == configB); - } - - TEST(ReplicaSetConfig, toBSONRoundTripAbilityInvalid) { - ReplicaSetConfig configA; - ReplicaSetConfig configB; - ASSERT_OK(configA.initialize( - BSON("_id" << "" - << "version" << -3 - << "members" << BSON_ARRAY( - BSON("_id" << 0 - << "host" << "localhost:12345" - << "arbiterOnly" << true - << "votes" << 0 - ) << - BSON("_id" << 0 - << "host" << "localhost:3828" - << "arbiterOnly" << false - << "buildIndexes" << false - << "priority" << 2 - ) << - BSON("_id" << 2 - << "host" << "localhost:3828" - << "priority" << 9 - << "votes" << 0 - )) - << "settings" << BSON("heartbeatTimeoutSecs" << -20)))); - ASSERT_OK(configB.initialize(configA.toBSON())); - ASSERT_NOT_OK(configA.validate()); - ASSERT_NOT_OK(configB.validate()); - ASSERT_TRUE(configA == configB); - } - - TEST(ReplicaSetConfig, CheckIfWriteConcernCanBeSatisfied) { - ReplicaSetConfig configA; - ASSERT_OK(configA.initialize( - BSON("_id" << 
"rs0" << - "version" << 1 << - "members" << BSON_ARRAY(BSON("_id" << 0 << - "host" << "node0" << - "tags" << BSON("dc" << "NA" << - "rack" << "rackNA1")) << - BSON("_id" << 1 << - "host" << "node1" << - "tags" << BSON("dc" << "NA" << - "rack" << "rackNA2")) << - BSON("_id" << 2 << - "host" << "node2" << - "tags" << BSON("dc" << "NA" << - "rack" << "rackNA3")) << - BSON("_id" << 3 << - "host" << "node3" << - "tags" << BSON("dc" << "EU" << - "rack" << "rackEU1")) << - BSON("_id" << 4 << - "host" << "node4" << - "tags" << BSON("dc" << "EU" << - "rack" << "rackEU2")) << - BSON("_id" << 5 << - "host" << "node5" << - "arbiterOnly" << true)) << - "settings" << BSON("getLastErrorModes" << - BSON("valid" << BSON("dc" << 2 << "rack" << 3) << - "invalidNotEnoughValues" << BSON("dc" << 3) << - "invalidNotEnoughNodes" << BSON("rack" << 6)))))); - - WriteConcernOptions validNumberWC; - validNumberWC.wNumNodes = 5; - ASSERT_OK(configA.checkIfWriteConcernCanBeSatisfied(validNumberWC)); - - WriteConcernOptions invalidNumberWC; - invalidNumberWC.wNumNodes = 6; - ASSERT_EQUALS(ErrorCodes::CannotSatisfyWriteConcern, - configA.checkIfWriteConcernCanBeSatisfied(invalidNumberWC)); - - WriteConcernOptions majorityWC; - majorityWC.wMode = "majority"; - ASSERT_OK(configA.checkIfWriteConcernCanBeSatisfied(majorityWC)); - - WriteConcernOptions validModeWC; - validModeWC.wMode = "valid"; - ASSERT_OK(configA.checkIfWriteConcernCanBeSatisfied(validModeWC)); - - WriteConcernOptions fakeModeWC; - fakeModeWC.wMode = "fake"; - ASSERT_EQUALS(ErrorCodes::UnknownReplWriteConcern, - configA.checkIfWriteConcernCanBeSatisfied(fakeModeWC)); - - WriteConcernOptions invalidModeNotEnoughValuesWC; - invalidModeNotEnoughValuesWC.wMode = "invalidNotEnoughValues"; - ASSERT_EQUALS(ErrorCodes::CannotSatisfyWriteConcern, - configA.checkIfWriteConcernCanBeSatisfied(invalidModeNotEnoughValuesWC)); - - WriteConcernOptions invalidModeNotEnoughNodesWC; - invalidModeNotEnoughNodesWC.wMode = "invalidNotEnoughNodes"; - ASSERT_EQUALS(ErrorCodes::CannotSatisfyWriteConcern, - configA.checkIfWriteConcernCanBeSatisfied(invalidModeNotEnoughNodesWC)); - } - - TEST(ReplicaSetConfig, CheckMaximumNodesOkay) { - ReplicaSetConfig configA; - ReplicaSetConfig configB; - ASSERT_OK(configA.initialize( - BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 0 << "host" << "node0") << - BSON("_id" << 1 << "host" << "node1") << - BSON("_id" << 2 << "host" << "node2") << - BSON("_id" << 3 << "host" << "node3") << - BSON("_id" << 4 << "host" << "node4") << - BSON("_id" << 5 << "host" << "node5") << - BSON("_id" << 6 << "host" << "node6") << - BSON("_id" << 7 << "host" << "node7" << "votes" << 0) << - BSON("_id" << 8 << "host" << "node8" << "votes" << 0) << - BSON("_id" << 9 << "host" << "node9" << "votes" << 0) << - BSON("_id" << 10 << "host" << "node10" << "votes" << 0) << - BSON("_id" << 11 << "host" << "node11" << "votes" << 0) << - BSON("_id" << 12 << "host" << "node12" << "votes" << 0) << - BSON("_id" << 13 << "host" << "node13" << "votes" << 0) << - BSON("_id" << 14 << "host" << "node14" << "votes" << 0) << - BSON("_id" << 15 << "host" << "node15" << "votes" << 0) << - BSON("_id" << 16 << "host" << "node16" << "votes" << 0) << - BSON("_id" << 17 << "host" << "node17" << "votes" << 0) << - BSON("_id" << 18 << "host" << "node18" << "votes" << 0) << - BSON("_id" << 19 << "host" << "node19" << "votes" << 0) << - BSON("_id" << 20 << "host" << "node20" << "votes" << 0) << - BSON("_id" << 21 << "host" << "node21" << "votes" << 0) << - 
BSON("_id" << 22 << "host" << "node22" << "votes" << 0) << - BSON("_id" << 23 << "host" << "node23" << "votes" << 0) << - BSON("_id" << 24 << "host" << "node24" << "votes" << 0) << - BSON("_id" << 25 << "host" << "node25" << "votes" << 0) << - BSON("_id" << 26 << "host" << "node26" << "votes" << 0) << - BSON("_id" << 27 << "host" << "node27" << "votes" << 0) << - BSON("_id" << 28 << "host" << "node28" << "votes" << 0) << - BSON("_id" << 29 << "host" << "node29" << "votes" << 0) << - BSON("_id" << 30 << "host" << "node30" << "votes" << 0) << - BSON("_id" << 31 << "host" << "node31" << "votes" << 0) << - BSON("_id" << 32 << "host" << "node32" << "votes" << 0) << - BSON("_id" << 33 << "host" << "node33" << "votes" << 0) << - BSON("_id" << 34 << "host" << "node34" << "votes" << 0) << - BSON("_id" << 35 << "host" << "node35" << "votes" << 0) << - BSON("_id" << 36 << "host" << "node36" << "votes" << 0) << - BSON("_id" << 37 << "host" << "node37" << "votes" << 0) << - BSON("_id" << 38 << "host" << "node38" << "votes" << 0) << - BSON("_id" << 39 << "host" << "node39" << "votes" << 0) << - BSON("_id" << 40 << "host" << "node40" << "votes" << 0) << - BSON("_id" << 41 << "host" << "node41" << "votes" << 0) << - BSON("_id" << 42 << "host" << "node42" << "votes" << 0) << - BSON("_id" << 43 << "host" << "node43" << "votes" << 0) << - BSON("_id" << 44 << "host" << "node44" << "votes" << 0) << - BSON("_id" << 45 << "host" << "node45" << "votes" << 0) << - BSON("_id" << 46 << "host" << "node46" << "votes" << 0) << - BSON("_id" << 47 << "host" << "node47" << "votes" << 0) << - BSON("_id" << 48 << "host" << "node48" << "votes" << 0) << - BSON("_id" << 49 << "host" << "node49" << "votes" << 0))))); - ASSERT_OK(configB.initialize(configA.toBSON())); - ASSERT_OK(configA.validate()); - ASSERT_OK(configB.validate()); - ASSERT_TRUE(configA == configB); + // compare the members + for (ReplicaSetConfig::MemberIterator memA = a.membersBegin(); memA != a.membersEnd(); memA++) { + bool same = false; + for (ReplicaSetConfig::MemberIterator memB = b.membersBegin(); memB != b.membersEnd(); + memB++) { + if (*memA == *memB) { + same = true; + break; + } + } + if (!same) { + return false; + } } - TEST(ReplicaSetConfig, CheckBeyondMaximumNodesFailsValidate) { - ReplicaSetConfig configA; - ReplicaSetConfig configB; - ASSERT_OK(configA.initialize( - BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 0 << "host" << "node0") << - BSON("_id" << 1 << "host" << "node1") << - BSON("_id" << 2 << "host" << "node2") << - BSON("_id" << 3 << "host" << "node3") << - BSON("_id" << 4 << "host" << "node4") << - BSON("_id" << 5 << "host" << "node5") << - BSON("_id" << 6 << "host" << "node6") << - BSON("_id" << 7 << "host" << "node7" << "votes" << 0) << - BSON("_id" << 8 << "host" << "node8" << "votes" << 0) << - BSON("_id" << 9 << "host" << "node9" << "votes" << 0) << - BSON("_id" << 10 << "host" << "node10" << "votes" << 0) << - BSON("_id" << 11 << "host" << "node11" << "votes" << 0) << - BSON("_id" << 12 << "host" << "node12" << "votes" << 0) << - BSON("_id" << 13 << "host" << "node13" << "votes" << 0) << - BSON("_id" << 14 << "host" << "node14" << "votes" << 0) << - BSON("_id" << 15 << "host" << "node15" << "votes" << 0) << - BSON("_id" << 16 << "host" << "node16" << "votes" << 0) << - BSON("_id" << 17 << "host" << "node17" << "votes" << 0) << - BSON("_id" << 18 << "host" << "node18" << "votes" << 0) << - BSON("_id" << 19 << "host" << "node19" << "votes" << 0) << - BSON("_id" << 20 << "host" << "node20" 
<< "votes" << 0) << - BSON("_id" << 21 << "host" << "node21" << "votes" << 0) << - BSON("_id" << 22 << "host" << "node22" << "votes" << 0) << - BSON("_id" << 23 << "host" << "node23" << "votes" << 0) << - BSON("_id" << 24 << "host" << "node24" << "votes" << 0) << - BSON("_id" << 25 << "host" << "node25" << "votes" << 0) << - BSON("_id" << 26 << "host" << "node26" << "votes" << 0) << - BSON("_id" << 27 << "host" << "node27" << "votes" << 0) << - BSON("_id" << 28 << "host" << "node28" << "votes" << 0) << - BSON("_id" << 29 << "host" << "node29" << "votes" << 0) << - BSON("_id" << 30 << "host" << "node30" << "votes" << 0) << - BSON("_id" << 31 << "host" << "node31" << "votes" << 0) << - BSON("_id" << 32 << "host" << "node32" << "votes" << 0) << - BSON("_id" << 33 << "host" << "node33" << "votes" << 0) << - BSON("_id" << 34 << "host" << "node34" << "votes" << 0) << - BSON("_id" << 35 << "host" << "node35" << "votes" << 0) << - BSON("_id" << 36 << "host" << "node36" << "votes" << 0) << - BSON("_id" << 37 << "host" << "node37" << "votes" << 0) << - BSON("_id" << 38 << "host" << "node38" << "votes" << 0) << - BSON("_id" << 39 << "host" << "node39" << "votes" << 0) << - BSON("_id" << 40 << "host" << "node40" << "votes" << 0) << - BSON("_id" << 41 << "host" << "node41" << "votes" << 0) << - BSON("_id" << 42 << "host" << "node42" << "votes" << 0) << - BSON("_id" << 43 << "host" << "node43" << "votes" << 0) << - BSON("_id" << 44 << "host" << "node44" << "votes" << 0) << - BSON("_id" << 45 << "host" << "node45" << "votes" << 0) << - BSON("_id" << 46 << "host" << "node46" << "votes" << 0) << - BSON("_id" << 47 << "host" << "node47" << "votes" << 0) << - BSON("_id" << 48 << "host" << "node48" << "votes" << 0) << - BSON("_id" << 49 << "host" << "node49" << "votes" << 0) << - BSON("_id" << 50 << "host" << "node50" << "votes" << 0))))); - ASSERT_OK(configB.initialize(configA.toBSON())); - ASSERT_NOT_OK(configA.validate()); - ASSERT_NOT_OK(configB.validate()); - ASSERT_TRUE(configA == configB); - } + // simple comparisons + return a.getReplSetName() == b.getReplSetName() && + a.getConfigVersion() == b.getConfigVersion() && a.getNumMembers() == b.getNumMembers() && + a.getHeartbeatTimeoutPeriod() == b.getHeartbeatTimeoutPeriod() && + a.isChainingAllowed() == b.isChainingAllowed() && + a.getDefaultWriteConcern().wNumNodes == b.getDefaultWriteConcern().wNumNodes && + a.getDefaultWriteConcern().wMode == b.getDefaultWriteConcern().wMode; +} + +TEST(ReplicaSetConfig, toBSONRoundTripAbility) { + ReplicaSetConfig configA; + ReplicaSetConfig configB; + ASSERT_OK(configA.initialize(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "localhost:12345")) << "settings" + << BSON("heartbeatTimeoutSecs" << 20)))); + ASSERT_OK(configB.initialize(configA.toBSON())); + ASSERT_TRUE(configA == configB); +} + +TEST(ReplicaSetConfig, toBSONRoundTripAbilityLarge) { + ReplicaSetConfig configA; + ReplicaSetConfig configB; + ASSERT_OK(configA.initialize(BSON( + "_id" + << "asdf" + << "version" << 9 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "localhost:12345" + << "arbiterOnly" << true << "votes" << 1) + << BSON("_id" << 3 << "host" + << "localhost:3828" + << "arbiterOnly" << false << "hidden" << true << "buildIndexes" + << false << "priority" << 0 << "slaveDelay" << 17 << "votes" + << 0 << "tags" << BSON("coast" + << "east" + << "ssd" + << "true")) + << BSON("_id" << 2 << "host" + << "foo.com:3828" + << "priority" << 9 << "votes" << 0 << "tags" + << BSON("coast" + << 
"west" + << "hdd" + << "true"))) << "settings" + << BSON("heartbeatTimeoutSecs" << 20 << "chainingAllowd" << true << "getLastErrorDefaults" + << BSON("w" + << "majority") << "getLastErrorModes" + << BSON("disks" << BSON("ssd" << 1 << "hdd" << 1) << "coasts" + << BSON("coast" << 2)))))); + ASSERT_OK(configB.initialize(configA.toBSON())); + ASSERT_TRUE(configA == configB); +} + +TEST(ReplicaSetConfig, toBSONRoundTripAbilityInvalid) { + ReplicaSetConfig configA; + ReplicaSetConfig configB; + ASSERT_OK( + configA.initialize(BSON("_id" + << "" + << "version" << -3 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "localhost:12345" + << "arbiterOnly" << true << "votes" << 0) + << BSON("_id" << 0 << "host" + << "localhost:3828" + << "arbiterOnly" << false + << "buildIndexes" << false << "priority" + << 2) + << BSON("_id" << 2 << "host" + << "localhost:3828" + << "priority" << 9 << "votes" << 0)) + << "settings" << BSON("heartbeatTimeoutSecs" << -20)))); + ASSERT_OK(configB.initialize(configA.toBSON())); + ASSERT_NOT_OK(configA.validate()); + ASSERT_NOT_OK(configB.validate()); + ASSERT_TRUE(configA == configB); +} + +TEST(ReplicaSetConfig, CheckIfWriteConcernCanBeSatisfied) { + ReplicaSetConfig configA; + ASSERT_OK(configA.initialize(BSON( + "_id" + << "rs0" + << "version" << 1 << "members" << BSON_ARRAY(BSON("_id" << 0 << "host" + << "node0" + << "tags" << BSON("dc" + << "NA" + << "rack" + << "rackNA1")) + << BSON("_id" << 1 << "host" + << "node1" + << "tags" << BSON("dc" + << "NA" + << "rack" + << "rackNA2")) + << BSON("_id" << 2 << "host" + << "node2" + << "tags" << BSON("dc" + << "NA" + << "rack" + << "rackNA3")) + << BSON("_id" << 3 << "host" + << "node3" + << "tags" << BSON("dc" + << "EU" + << "rack" + << "rackEU1")) + << BSON("_id" << 4 << "host" + << "node4" + << "tags" << BSON("dc" + << "EU" + << "rack" + << "rackEU2")) + << BSON("_id" << 5 << "host" + << "node5" + << "arbiterOnly" << true)) + << "settings" << BSON("getLastErrorModes" + << BSON("valid" << BSON("dc" << 2 << "rack" << 3) + << "invalidNotEnoughValues" << BSON("dc" << 3) + << "invalidNotEnoughNodes" << BSON("rack" << 6)))))); + + WriteConcernOptions validNumberWC; + validNumberWC.wNumNodes = 5; + ASSERT_OK(configA.checkIfWriteConcernCanBeSatisfied(validNumberWC)); + + WriteConcernOptions invalidNumberWC; + invalidNumberWC.wNumNodes = 6; + ASSERT_EQUALS(ErrorCodes::CannotSatisfyWriteConcern, + configA.checkIfWriteConcernCanBeSatisfied(invalidNumberWC)); + + WriteConcernOptions majorityWC; + majorityWC.wMode = "majority"; + ASSERT_OK(configA.checkIfWriteConcernCanBeSatisfied(majorityWC)); + + WriteConcernOptions validModeWC; + validModeWC.wMode = "valid"; + ASSERT_OK(configA.checkIfWriteConcernCanBeSatisfied(validModeWC)); + + WriteConcernOptions fakeModeWC; + fakeModeWC.wMode = "fake"; + ASSERT_EQUALS(ErrorCodes::UnknownReplWriteConcern, + configA.checkIfWriteConcernCanBeSatisfied(fakeModeWC)); + + WriteConcernOptions invalidModeNotEnoughValuesWC; + invalidModeNotEnoughValuesWC.wMode = "invalidNotEnoughValues"; + ASSERT_EQUALS(ErrorCodes::CannotSatisfyWriteConcern, + configA.checkIfWriteConcernCanBeSatisfied(invalidModeNotEnoughValuesWC)); + + WriteConcernOptions invalidModeNotEnoughNodesWC; + invalidModeNotEnoughNodesWC.wMode = "invalidNotEnoughNodes"; + ASSERT_EQUALS(ErrorCodes::CannotSatisfyWriteConcern, + configA.checkIfWriteConcernCanBeSatisfied(invalidModeNotEnoughNodesWC)); +} + +TEST(ReplicaSetConfig, CheckMaximumNodesOkay) { + ReplicaSetConfig configA; + ReplicaSetConfig configB; + 
ASSERT_OK(configA.initialize( + BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "node0") + << BSON("_id" << 1 << "host" + << "node1") << BSON("_id" << 2 << "host" + << "node2") + << BSON("_id" << 3 << "host" + << "node3") << BSON("_id" << 4 << "host" + << "node4") + << BSON("_id" << 5 << "host" + << "node5") << BSON("_id" << 6 << "host" + << "node6") + << BSON("_id" << 7 << "host" + << "node7" + << "votes" << 0) << BSON("_id" << 8 << "host" + << "node8" + << "votes" << 0) + << BSON("_id" << 9 << "host" + << "node9" + << "votes" << 0) << BSON("_id" << 10 << "host" + << "node10" + << "votes" << 0) + << BSON("_id" << 11 << "host" + << "node11" + << "votes" << 0) << BSON("_id" << 12 << "host" + << "node12" + << "votes" << 0) + << BSON("_id" << 13 << "host" + << "node13" + << "votes" << 0) << BSON("_id" << 14 << "host" + << "node14" + << "votes" << 0) + << BSON("_id" << 15 << "host" + << "node15" + << "votes" << 0) << BSON("_id" << 16 << "host" + << "node16" + << "votes" << 0) + << BSON("_id" << 17 << "host" + << "node17" + << "votes" << 0) << BSON("_id" << 18 << "host" + << "node18" + << "votes" << 0) + << BSON("_id" << 19 << "host" + << "node19" + << "votes" << 0) << BSON("_id" << 20 << "host" + << "node20" + << "votes" << 0) + << BSON("_id" << 21 << "host" + << "node21" + << "votes" << 0) << BSON("_id" << 22 << "host" + << "node22" + << "votes" << 0) + << BSON("_id" << 23 << "host" + << "node23" + << "votes" << 0) << BSON("_id" << 24 << "host" + << "node24" + << "votes" << 0) + << BSON("_id" << 25 << "host" + << "node25" + << "votes" << 0) << BSON("_id" << 26 << "host" + << "node26" + << "votes" << 0) + << BSON("_id" << 27 << "host" + << "node27" + << "votes" << 0) << BSON("_id" << 28 << "host" + << "node28" + << "votes" << 0) + << BSON("_id" << 29 << "host" + << "node29" + << "votes" << 0) << BSON("_id" << 30 << "host" + << "node30" + << "votes" << 0) + << BSON("_id" << 31 << "host" + << "node31" + << "votes" << 0) << BSON("_id" << 32 << "host" + << "node32" + << "votes" << 0) + << BSON("_id" << 33 << "host" + << "node33" + << "votes" << 0) << BSON("_id" << 34 << "host" + << "node34" + << "votes" << 0) + << BSON("_id" << 35 << "host" + << "node35" + << "votes" << 0) << BSON("_id" << 36 << "host" + << "node36" + << "votes" << 0) + << BSON("_id" << 37 << "host" + << "node37" + << "votes" << 0) << BSON("_id" << 38 << "host" + << "node38" + << "votes" << 0) + << BSON("_id" << 39 << "host" + << "node39" + << "votes" << 0) << BSON("_id" << 40 << "host" + << "node40" + << "votes" << 0) + << BSON("_id" << 41 << "host" + << "node41" + << "votes" << 0) << BSON("_id" << 42 << "host" + << "node42" + << "votes" << 0) + << BSON("_id" << 43 << "host" + << "node43" + << "votes" << 0) << BSON("_id" << 44 << "host" + << "node44" + << "votes" << 0) + << BSON("_id" << 45 << "host" + << "node45" + << "votes" << 0) << BSON("_id" << 46 << "host" + << "node46" + << "votes" << 0) + << BSON("_id" << 47 << "host" + << "node47" + << "votes" << 0) << BSON("_id" << 48 << "host" + << "node48" + << "votes" << 0) + << BSON("_id" << 49 << "host" + << "node49" + << "votes" << 0))))); + ASSERT_OK(configB.initialize(configA.toBSON())); + ASSERT_OK(configA.validate()); + ASSERT_OK(configB.validate()); + ASSERT_TRUE(configA == configB); +} + +TEST(ReplicaSetConfig, CheckBeyondMaximumNodesFailsValidate) { + ReplicaSetConfig configA; + ReplicaSetConfig configB; + ASSERT_OK(configA.initialize( + BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 
0 << "host" + << "node0") + << BSON("_id" << 1 << "host" + << "node1") << BSON("_id" << 2 << "host" + << "node2") + << BSON("_id" << 3 << "host" + << "node3") << BSON("_id" << 4 << "host" + << "node4") + << BSON("_id" << 5 << "host" + << "node5") << BSON("_id" << 6 << "host" + << "node6") + << BSON("_id" << 7 << "host" + << "node7" + << "votes" << 0) << BSON("_id" << 8 << "host" + << "node8" + << "votes" << 0) + << BSON("_id" << 9 << "host" + << "node9" + << "votes" << 0) << BSON("_id" << 10 << "host" + << "node10" + << "votes" << 0) + << BSON("_id" << 11 << "host" + << "node11" + << "votes" << 0) << BSON("_id" << 12 << "host" + << "node12" + << "votes" << 0) + << BSON("_id" << 13 << "host" + << "node13" + << "votes" << 0) << BSON("_id" << 14 << "host" + << "node14" + << "votes" << 0) + << BSON("_id" << 15 << "host" + << "node15" + << "votes" << 0) << BSON("_id" << 16 << "host" + << "node16" + << "votes" << 0) + << BSON("_id" << 17 << "host" + << "node17" + << "votes" << 0) << BSON("_id" << 18 << "host" + << "node18" + << "votes" << 0) + << BSON("_id" << 19 << "host" + << "node19" + << "votes" << 0) << BSON("_id" << 20 << "host" + << "node20" + << "votes" << 0) + << BSON("_id" << 21 << "host" + << "node21" + << "votes" << 0) << BSON("_id" << 22 << "host" + << "node22" + << "votes" << 0) + << BSON("_id" << 23 << "host" + << "node23" + << "votes" << 0) << BSON("_id" << 24 << "host" + << "node24" + << "votes" << 0) + << BSON("_id" << 25 << "host" + << "node25" + << "votes" << 0) << BSON("_id" << 26 << "host" + << "node26" + << "votes" << 0) + << BSON("_id" << 27 << "host" + << "node27" + << "votes" << 0) << BSON("_id" << 28 << "host" + << "node28" + << "votes" << 0) + << BSON("_id" << 29 << "host" + << "node29" + << "votes" << 0) << BSON("_id" << 30 << "host" + << "node30" + << "votes" << 0) + << BSON("_id" << 31 << "host" + << "node31" + << "votes" << 0) << BSON("_id" << 32 << "host" + << "node32" + << "votes" << 0) + << BSON("_id" << 33 << "host" + << "node33" + << "votes" << 0) << BSON("_id" << 34 << "host" + << "node34" + << "votes" << 0) + << BSON("_id" << 35 << "host" + << "node35" + << "votes" << 0) << BSON("_id" << 36 << "host" + << "node36" + << "votes" << 0) + << BSON("_id" << 37 << "host" + << "node37" + << "votes" << 0) << BSON("_id" << 38 << "host" + << "node38" + << "votes" << 0) + << BSON("_id" << 39 << "host" + << "node39" + << "votes" << 0) << BSON("_id" << 40 << "host" + << "node40" + << "votes" << 0) + << BSON("_id" << 41 << "host" + << "node41" + << "votes" << 0) << BSON("_id" << 42 << "host" + << "node42" + << "votes" << 0) + << BSON("_id" << 43 << "host" + << "node43" + << "votes" << 0) << BSON("_id" << 44 << "host" + << "node44" + << "votes" << 0) + << BSON("_id" << 45 << "host" + << "node45" + << "votes" << 0) << BSON("_id" << 46 << "host" + << "node46" + << "votes" << 0) + << BSON("_id" << 47 << "host" + << "node47" + << "votes" << 0) << BSON("_id" << 48 << "host" + << "node48" + << "votes" << 0) + << BSON("_id" << 49 << "host" + << "node49" + << "votes" << 0) << BSON("_id" << 50 << "host" + << "node50" + << "votes" << 0))))); + ASSERT_OK(configB.initialize(configA.toBSON())); + ASSERT_NOT_OK(configA.validate()); + ASSERT_NOT_OK(configB.validate()); + ASSERT_TRUE(configA == configB); +} } // namespace } // namespace repl diff --git a/src/mongo/db/repl/replica_set_tag.cpp b/src/mongo/db/repl/replica_set_tag.cpp index 05363498fcd..631b097abdd 100644 --- a/src/mongo/db/repl/replica_set_tag.cpp +++ b/src/mongo/db/repl/replica_set_tag.cpp @@ -41,206 +41,198 @@ namespace 
mongo { namespace repl { - bool ReplicaSetTag::operator==(const ReplicaSetTag& other) const { - return _keyIndex == other._keyIndex && _valueIndex == other._valueIndex; - } - - bool ReplicaSetTag::operator!=(const ReplicaSetTag& other) const { - return !(*this == other); - } - - void ReplicaSetTagPattern::addTagCountConstraint(int32_t keyIndex, int32_t minCount) { - const std::vector<TagCountConstraint>::iterator iter = std::find_if( - _constraints.begin(), - _constraints.end(), - stdx::bind(std::equal_to<int32_t>(), - keyIndex, - stdx::bind(&TagCountConstraint::getKeyIndex, stdx::placeholders::_1))); - if (iter == _constraints.end()) { - _constraints.push_back(TagCountConstraint(keyIndex, minCount)); - } - else if (iter->getMinCount() < minCount) { - *iter = TagCountConstraint(keyIndex, minCount); - } - } - - ReplicaSetTagPattern::TagCountConstraint::TagCountConstraint(int32_t keyIndex, - int32_t minCount) : - _keyIndex(keyIndex), _minCount(minCount) {} - - ReplicaSetTagMatch::ReplicaSetTagMatch(const ReplicaSetTagPattern& pattern) { - for (ReplicaSetTagPattern::ConstraintIterator iter = pattern.constraintsBegin(); - iter != pattern.constraintsEnd(); - ++iter) { - - _boundTagValues.push_back(BoundTagValue(*iter)); - } - } - - bool ReplicaSetTagMatch::update(const ReplicaSetTag& tag) { - const std::vector<BoundTagValue>::iterator iter = std::find_if( - _boundTagValues.begin(), - _boundTagValues.end(), - stdx::bind(std::equal_to<int32_t>(), tag.getKeyIndex(), stdx::bind( - &BoundTagValue::getKeyIndex, stdx::placeholders::_1))); - if (iter != _boundTagValues.end()) { - if (!sequenceContains(iter->boundValues, tag.getValueIndex())) { - iter->boundValues.push_back(tag.getValueIndex()); - } +bool ReplicaSetTag::operator==(const ReplicaSetTag& other) const { + return _keyIndex == other._keyIndex && _valueIndex == other._valueIndex; +} + +bool ReplicaSetTag::operator!=(const ReplicaSetTag& other) const { + return !(*this == other); +} + +void ReplicaSetTagPattern::addTagCountConstraint(int32_t keyIndex, int32_t minCount) { + const std::vector<TagCountConstraint>::iterator iter = std::find_if( + _constraints.begin(), + _constraints.end(), + stdx::bind(std::equal_to<int32_t>(), + keyIndex, + stdx::bind(&TagCountConstraint::getKeyIndex, stdx::placeholders::_1))); + if (iter == _constraints.end()) { + _constraints.push_back(TagCountConstraint(keyIndex, minCount)); + } else if (iter->getMinCount() < minCount) { + *iter = TagCountConstraint(keyIndex, minCount); + } +} + +ReplicaSetTagPattern::TagCountConstraint::TagCountConstraint(int32_t keyIndex, int32_t minCount) + : _keyIndex(keyIndex), _minCount(minCount) {} + +ReplicaSetTagMatch::ReplicaSetTagMatch(const ReplicaSetTagPattern& pattern) { + for (ReplicaSetTagPattern::ConstraintIterator iter = pattern.constraintsBegin(); + iter != pattern.constraintsEnd(); + ++iter) { + _boundTagValues.push_back(BoundTagValue(*iter)); + } +} + +bool ReplicaSetTagMatch::update(const ReplicaSetTag& tag) { + const std::vector<BoundTagValue>::iterator iter = + std::find_if(_boundTagValues.begin(), + _boundTagValues.end(), + stdx::bind(std::equal_to<int32_t>(), + tag.getKeyIndex(), + stdx::bind(&BoundTagValue::getKeyIndex, stdx::placeholders::_1))); + if (iter != _boundTagValues.end()) { + if (!sequenceContains(iter->boundValues, tag.getValueIndex())) { + iter->boundValues.push_back(tag.getValueIndex()); } - return isSatisfied(); } - - bool ReplicaSetTagMatch::isSatisfied() const { - const std::vector<BoundTagValue>::const_iterator iter = std::find_if( - 
_boundTagValues.begin(), - _boundTagValues.end(), - stdx::bind(std::logical_not<bool>(), - stdx::bind(&BoundTagValue::isSatisfied, stdx::placeholders::_1))); - return iter == _boundTagValues.end(); - } - - bool ReplicaSetTagMatch::BoundTagValue::isSatisfied() const { - return constraint.getMinCount() <= int32_t(boundValues.size()); - } - - ReplicaSetTag ReplicaSetTagConfig::makeTag(const StringData& key, const StringData& value) { - int32_t keyIndex = _findKeyIndex(key); - if (size_t(keyIndex) == _tagData.size()) { - _tagData.push_back(make_pair(key.toString(), ValueVector())); - } - ValueVector& values = _tagData[keyIndex].second; - for (size_t valueIndex = 0; valueIndex < values.size(); ++valueIndex) { - if (values[valueIndex] != value) - continue; - return ReplicaSetTag(keyIndex, int32_t(valueIndex)); - } - values.push_back(value.toString()); - return ReplicaSetTag(keyIndex, int32_t(values.size()) - 1); - } - - ReplicaSetTag ReplicaSetTagConfig::findTag(const StringData& key, - const StringData& value) const { - int32_t keyIndex = _findKeyIndex(key); - if (size_t(keyIndex) == _tagData.size()) - return ReplicaSetTag(-1, -1); - const ValueVector& values = _tagData[keyIndex].second; - for (size_t valueIndex = 0; valueIndex < values.size(); ++valueIndex) { - if (values[valueIndex] == value) { - return ReplicaSetTag(keyIndex, int32_t(valueIndex)); - } - } + return isSatisfied(); +} + +bool ReplicaSetTagMatch::isSatisfied() const { + const std::vector<BoundTagValue>::const_iterator iter = + std::find_if(_boundTagValues.begin(), + _boundTagValues.end(), + stdx::bind(std::logical_not<bool>(), + stdx::bind(&BoundTagValue::isSatisfied, stdx::placeholders::_1))); + return iter == _boundTagValues.end(); +} + +bool ReplicaSetTagMatch::BoundTagValue::isSatisfied() const { + return constraint.getMinCount() <= int32_t(boundValues.size()); +} + +ReplicaSetTag ReplicaSetTagConfig::makeTag(const StringData& key, const StringData& value) { + int32_t keyIndex = _findKeyIndex(key); + if (size_t(keyIndex) == _tagData.size()) { + _tagData.push_back(make_pair(key.toString(), ValueVector())); + } + ValueVector& values = _tagData[keyIndex].second; + for (size_t valueIndex = 0; valueIndex < values.size(); ++valueIndex) { + if (values[valueIndex] != value) + continue; + return ReplicaSetTag(keyIndex, int32_t(valueIndex)); + } + values.push_back(value.toString()); + return ReplicaSetTag(keyIndex, int32_t(values.size()) - 1); +} + +ReplicaSetTag ReplicaSetTagConfig::findTag(const StringData& key, const StringData& value) const { + int32_t keyIndex = _findKeyIndex(key); + if (size_t(keyIndex) == _tagData.size()) return ReplicaSetTag(-1, -1); - } - - ReplicaSetTagPattern ReplicaSetTagConfig::makePattern() const { - return ReplicaSetTagPattern(); - } - - Status ReplicaSetTagConfig::addTagCountConstraintToPattern(ReplicaSetTagPattern* pattern, - const StringData& tagKey, - int32_t minCount) const { - int32_t keyIndex = _findKeyIndex(tagKey); - if (size_t(keyIndex) == _tagData.size()) { - return Status(ErrorCodes::NoSuchKey, - str::stream() << "No replica set tag key " << tagKey << " in config"); - } - pattern->addTagCountConstraint(keyIndex, minCount); - return Status::OK(); - } - - int32_t ReplicaSetTagConfig::_findKeyIndex(const StringData& key) const { - size_t i; - for (i = 0; i < _tagData.size(); ++i) { - if (_tagData[i].first == key) { - break; - } - } - return int32_t(i); - } - - std::string ReplicaSetTagConfig::getTagKey(const ReplicaSetTag& tag) const { - invariant(tag.isValid() && size_t(tag.getKeyIndex()) < 
_tagData.size()); - return _tagData[tag.getKeyIndex()].first; - } - - std::string ReplicaSetTagConfig::getTagValue(const ReplicaSetTag& tag) const { - invariant(tag.isValid() && size_t(tag.getKeyIndex()) < _tagData.size()); - const ValueVector& values = _tagData[tag.getKeyIndex()].second; - invariant(tag.getValueIndex() >= 0 && size_t(tag.getValueIndex()) < values.size()); - return values[tag.getValueIndex()]; - } - - void ReplicaSetTagConfig::put(const ReplicaSetTag& tag, std::ostream& os) const { - BSONObjBuilder builder; - _appendTagKey(tag.getKeyIndex(), &builder); - _appendTagValue(tag.getKeyIndex(), tag.getValueIndex(), &builder); - os << builder.done(); - } - - void ReplicaSetTagConfig::put(const ReplicaSetTagPattern& pattern, std::ostream& os) const { - BSONObjBuilder builder; - BSONArrayBuilder allConstraintsBuilder(builder.subarrayStart("constraints")); - for (ReplicaSetTagPattern::ConstraintIterator iter = pattern.constraintsBegin(); - iter != pattern.constraintsEnd(); - ++iter) { - - BSONObjBuilder constraintBuilder(allConstraintsBuilder.subobjStart()); - _appendConstraint(*iter, &constraintBuilder); + const ValueVector& values = _tagData[keyIndex].second; + for (size_t valueIndex = 0; valueIndex < values.size(); ++valueIndex) { + if (values[valueIndex] == value) { + return ReplicaSetTag(keyIndex, int32_t(valueIndex)); } - allConstraintsBuilder.doneFast(); - os << builder.done(); } - - void ReplicaSetTagConfig::put(const ReplicaSetTagMatch& matcher, std::ostream& os) const { - BSONObjBuilder builder; - BSONArrayBuilder allBindingsBuilder(builder.subarrayStart("bindings")); - for (size_t i = 0; i < matcher._boundTagValues.size(); ++i) { - - BSONObjBuilder bindingBuilder(allBindingsBuilder.subobjStart()); - _appendConstraint(matcher._boundTagValues[i].constraint, &bindingBuilder); - BSONArrayBuilder boundValues(bindingBuilder.subarrayStart("boundValues")); - for (size_t j = 0; j < matcher._boundTagValues[i].boundValues.size(); ++j) { - BSONObjBuilder bvb(boundValues.subobjStart()); - _appendTagValue(matcher._boundTagValues[i].constraint.getKeyIndex(), - matcher._boundTagValues[i].boundValues[j], - &bvb); - } + return ReplicaSetTag(-1, -1); +} + +ReplicaSetTagPattern ReplicaSetTagConfig::makePattern() const { + return ReplicaSetTagPattern(); +} + +Status ReplicaSetTagConfig::addTagCountConstraintToPattern(ReplicaSetTagPattern* pattern, + const StringData& tagKey, + int32_t minCount) const { + int32_t keyIndex = _findKeyIndex(tagKey); + if (size_t(keyIndex) == _tagData.size()) { + return Status(ErrorCodes::NoSuchKey, + str::stream() << "No replica set tag key " << tagKey << " in config"); + } + pattern->addTagCountConstraint(keyIndex, minCount); + return Status::OK(); +} + +int32_t ReplicaSetTagConfig::_findKeyIndex(const StringData& key) const { + size_t i; + for (i = 0; i < _tagData.size(); ++i) { + if (_tagData[i].first == key) { + break; } - allBindingsBuilder.doneFast(); - os << builder.done(); } - - void ReplicaSetTagConfig::_appendTagKey(int32_t keyIndex, BSONObjBuilder* builder) const { - if (keyIndex < 0 || size_t(keyIndex) >= _tagData.size()) { - builder->append("tagKey", int(keyIndex)); - } - else { - builder->append("tagKey", _tagData[keyIndex].first); + return int32_t(i); +} + +std::string ReplicaSetTagConfig::getTagKey(const ReplicaSetTag& tag) const { + invariant(tag.isValid() && size_t(tag.getKeyIndex()) < _tagData.size()); + return _tagData[tag.getKeyIndex()].first; +} + +std::string ReplicaSetTagConfig::getTagValue(const ReplicaSetTag& tag) const { + 
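+    // The invariants below hold only for tags minted by this config (or a copy of
+    // it): the key index must address an entry in _tagData, and the value index
+    // must address a value recorded for that key.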
invariant(tag.isValid() && size_t(tag.getKeyIndex()) < _tagData.size()); + const ValueVector& values = _tagData[tag.getKeyIndex()].second; + invariant(tag.getValueIndex() >= 0 && size_t(tag.getValueIndex()) < values.size()); + return values[tag.getValueIndex()]; +} + +void ReplicaSetTagConfig::put(const ReplicaSetTag& tag, std::ostream& os) const { + BSONObjBuilder builder; + _appendTagKey(tag.getKeyIndex(), &builder); + _appendTagValue(tag.getKeyIndex(), tag.getValueIndex(), &builder); + os << builder.done(); +} + +void ReplicaSetTagConfig::put(const ReplicaSetTagPattern& pattern, std::ostream& os) const { + BSONObjBuilder builder; + BSONArrayBuilder allConstraintsBuilder(builder.subarrayStart("constraints")); + for (ReplicaSetTagPattern::ConstraintIterator iter = pattern.constraintsBegin(); + iter != pattern.constraintsEnd(); + ++iter) { + BSONObjBuilder constraintBuilder(allConstraintsBuilder.subobjStart()); + _appendConstraint(*iter, &constraintBuilder); + } + allConstraintsBuilder.doneFast(); + os << builder.done(); +} + +void ReplicaSetTagConfig::put(const ReplicaSetTagMatch& matcher, std::ostream& os) const { + BSONObjBuilder builder; + BSONArrayBuilder allBindingsBuilder(builder.subarrayStart("bindings")); + for (size_t i = 0; i < matcher._boundTagValues.size(); ++i) { + BSONObjBuilder bindingBuilder(allBindingsBuilder.subobjStart()); + _appendConstraint(matcher._boundTagValues[i].constraint, &bindingBuilder); + BSONArrayBuilder boundValues(bindingBuilder.subarrayStart("boundValues")); + for (size_t j = 0; j < matcher._boundTagValues[i].boundValues.size(); ++j) { + BSONObjBuilder bvb(boundValues.subobjStart()); + _appendTagValue(matcher._boundTagValues[i].constraint.getKeyIndex(), + matcher._boundTagValues[i].boundValues[j], + &bvb); } } - - void ReplicaSetTagConfig::_appendTagValue(int32_t keyIndex, - int32_t valueIndex, - BSONObjBuilder* builder) const { - if (keyIndex < 0 || size_t(keyIndex) >= _tagData.size()) { - builder->append("tagValue", valueIndex); - return; - } - KeyValueVector::const_reference keyEntry = _tagData[keyIndex]; - if (valueIndex < 0 || size_t(valueIndex) < keyEntry.second.size()) { - builder->append("tagValue", valueIndex); - } - builder->append("tagValue", keyEntry.second[valueIndex]); - } - - void ReplicaSetTagConfig::_appendConstraint( - const ReplicaSetTagPattern::TagCountConstraint& constraint, - BSONObjBuilder* builder) const { - - _appendTagKey(constraint.getKeyIndex(), builder); - builder->append("minCount", int(constraint.getMinCount())); - } + allBindingsBuilder.doneFast(); + os << builder.done(); +} + +void ReplicaSetTagConfig::_appendTagKey(int32_t keyIndex, BSONObjBuilder* builder) const { + if (keyIndex < 0 || size_t(keyIndex) >= _tagData.size()) { + builder->append("tagKey", int(keyIndex)); + } else { + builder->append("tagKey", _tagData[keyIndex].first); + } +} + +void ReplicaSetTagConfig::_appendTagValue(int32_t keyIndex, + int32_t valueIndex, + BSONObjBuilder* builder) const { + if (keyIndex < 0 || size_t(keyIndex) >= _tagData.size()) { + builder->append("tagValue", valueIndex); + return; + } + KeyValueVector::const_reference keyEntry = _tagData[keyIndex]; + if (valueIndex < 0 || size_t(valueIndex) < keyEntry.second.size()) { + builder->append("tagValue", valueIndex); + } + builder->append("tagValue", keyEntry.second[valueIndex]); +} + +void ReplicaSetTagConfig::_appendConstraint( + const ReplicaSetTagPattern::TagCountConstraint& constraint, BSONObjBuilder* builder) const { + _appendTagKey(constraint.getKeyIndex(), builder); + 
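+    // Together with _appendTagKey above, a constraint renders as
+    // { tagKey: <key>, minCount: <n> } in the debugging output.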
builder->append("minCount", int(constraint.getMinCount())); +} } // namespace repl diff --git a/src/mongo/db/repl/replica_set_tag.h b/src/mongo/db/repl/replica_set_tag.h index 3f4a2022baf..01c70ed0d2f 100644 --- a/src/mongo/db/repl/replica_set_tag.h +++ b/src/mongo/db/repl/replica_set_tag.h @@ -38,265 +38,282 @@ #include "mongo/platform/cstdint.h" namespace mongo { - class BSONObjBuilder; +class BSONObjBuilder; namespace repl { +/** + * Representation of a tag on a replica set node. + * + * Tags are only meaningful when used with a copy of the ReplicaSetTagConfig that + * created them. + */ +class ReplicaSetTag { +public: /** - * Representation of a tag on a replica set node. - * - * Tags are only meaningful when used with a copy of the ReplicaSetTagConfig that - * created them. + * Default constructor, produces an uninitialized tag. */ - class ReplicaSetTag { - public: - /** - * Default constructor, produces an uninitialized tag. - */ - ReplicaSetTag() {} - - /** - * Constructs a tag with the given key and value indexes. - * Do not call directly; used by ReplicaSetTagConfig. - */ - ReplicaSetTag(int32_t keyIndex, int32_t valueIndex) : - _keyIndex(keyIndex), - _valueIndex(valueIndex) {} - - /** - * Returns true if the tag is not explicitly invalid. - */ - bool isValid() const { return _keyIndex >= 0; } - - /** - * Gets the key index of the tag. - */ - int32_t getKeyIndex() const { return _keyIndex; } - - /** - * Gets the value index of the tag. - */ - int32_t getValueIndex() const { return _valueIndex; } - - /** - * Compares two tags from the *same* ReplicaSetTagConfig for equality. - */ - bool operator==(const ReplicaSetTag& other) const; - - /** - * Compares two tags from the *same* ReplicaSetTagConfig for inequality. - */ - bool operator!=(const ReplicaSetTag& other) const; + ReplicaSetTag() {} - private: - // The index of the key in the associated ReplicaSetTagConfig. - int32_t _keyIndex; + /** + * Constructs a tag with the given key and value indexes. + * Do not call directly; used by ReplicaSetTagConfig. + */ + ReplicaSetTag(int32_t keyIndex, int32_t valueIndex) + : _keyIndex(keyIndex), _valueIndex(valueIndex) {} - // The index of the value in the entry for the key in the associated ReplicaSetTagConfig. - int32_t _valueIndex; - }; + /** + * Returns true if the tag is not explicitly invalid. + */ + bool isValid() const { + return _keyIndex >= 0; + } + + /** + * Gets the key index of the tag. + */ + int32_t getKeyIndex() const { + return _keyIndex; + } + + /** + * Gets the value index of the tag. + */ + int32_t getValueIndex() const { + return _valueIndex; + } + + /** + * Compares two tags from the *same* ReplicaSetTagConfig for equality. + */ + bool operator==(const ReplicaSetTag& other) const; + + /** + * Compares two tags from the *same* ReplicaSetTagConfig for inequality. + */ + bool operator!=(const ReplicaSetTag& other) const; + +private: + // The index of the key in the associated ReplicaSetTagConfig. + int32_t _keyIndex; + + // The index of the value in the entry for the key in the associated ReplicaSetTagConfig. + int32_t _valueIndex; +}; +/** + * Representation of a tag matching pattern, like { "dc": 2, "rack": 3 }, of the form + * used for tagged replica set writes. + */ +class ReplicaSetTagPattern { +public: /** - * Representation of a tag matching pattern, like { "dc": 2, "rack": 3 }, of the form - * used for tagged replica set writes. + * Representation of a single tag's minimum count constraint in a pattern. 
*/ - class ReplicaSetTagPattern { + class TagCountConstraint { public: - /** - * Representation of a single tag's minimum count constraint in a pattern. - */ - class TagCountConstraint { - public: - TagCountConstraint() {} - TagCountConstraint(int32_t keyIndex, int32_t minCount); - int32_t getKeyIndex() const { return _keyIndex; } - int32_t getMinCount() const { return _minCount; } - private: - int32_t _keyIndex; - int32_t _minCount; - }; - - typedef std::vector<TagCountConstraint>::const_iterator ConstraintIterator; - - /** - * Adds a count constraint for the given key index with the given count. - * - * Do not call directly, but use the addTagCountConstraintToPattern method - * of ReplicaSetTagConfig. - */ - void addTagCountConstraint(int32_t keyIndex, int32_t minCount); - - /** - * Gets the begin iterator over the constraints in this pattern. - */ - ConstraintIterator constraintsBegin() const { return _constraints.begin(); } - - /** - * Gets the end iterator over the constraints in this pattern. - */ - ConstraintIterator constraintsEnd() const { return _constraints.end(); } + TagCountConstraint() {} + TagCountConstraint(int32_t keyIndex, int32_t minCount); + int32_t getKeyIndex() const { + return _keyIndex; + } + int32_t getMinCount() const { + return _minCount; + } private: - std::vector<TagCountConstraint> _constraints; + int32_t _keyIndex; + int32_t _minCount; }; + typedef std::vector<TagCountConstraint>::const_iterator ConstraintIterator; + /** - * State object for progressive detection of ReplicaSetTagPattern constraint satisfaction. + * Adds a count constraint for the given key index with the given count. * - * This is an abstraction of the replica set write tag satisfaction problem. + * Do not call directly, but use the addTagCountConstraintToPattern method + * of ReplicaSetTagConfig. + */ + void addTagCountConstraint(int32_t keyIndex, int32_t minCount); + + /** + * Gets the begin iterator over the constraints in this pattern. + */ + ConstraintIterator constraintsBegin() const { + return _constraints.begin(); + } + + /** + * Gets the end iterator over the constraints in this pattern. + */ + ConstraintIterator constraintsEnd() const { + return _constraints.end(); + } + +private: + std::vector<TagCountConstraint> _constraints; +}; + +/** + * State object for progressive detection of ReplicaSetTagPattern constraint satisfaction. + * + * This is an abstraction of the replica set write tag satisfaction problem. + * + * Replica set tag matching is an event-driven constraint satisfaction process. This type + * represents the state of that process. It is initialized from a pattern object, then + * progressively updated with tags. After processing a sequence of tags sufficient to satisfy + * the pattern, isSatisfied() becomes true. + */ +class ReplicaSetTagMatch { + friend class ReplicaSetTagConfig; + +public: + /** + * Constructs an empty match object, equivalent to one that matches an + * empty pattern. + */ + ReplicaSetTagMatch() {} + + /** + * Constructs a clean match object for the given pattern. + */ + explicit ReplicaSetTagMatch(const ReplicaSetTagPattern& pattern); + + /** + * Updates the match state based on the data for the given tag. * - * Replica set tag matching is an event-driven constraint satisfaction process. This type - * represents the state of that process. It is initialized from a pattern object, then - * progressively updated with tags. After processing a sequence of tags sufficient to satisfy - * the pattern, isSatisfied() becomes true. 
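+ *
+ * A minimal sketch of the intended call sequence (mirroring the unit tests in
+ * replica_set_tag_test.cpp):
+ *
+ *     ReplicaSetTagConfig tagConfig;
+ *     ReplicaSetTag ny = tagConfig.makeTag("dc", "ny");
+ *     ReplicaSetTag va = tagConfig.makeTag("dc", "va");
+ *     ReplicaSetTagPattern pattern = tagConfig.makePattern();
+ *     tagConfig.addTagCountConstraintToPattern(&pattern, "dc", 2);  // OK: "dc" is a known key
+ *     ReplicaSetTagMatch matcher(pattern);
+ *     matcher.update(ny);  // one distinct "dc" value seen; isSatisfied() is false
+ *     matcher.update(va);  // a second distinct value; update() returns true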
+ * Returns true if, after this update, isSatisfied() is true. */ - class ReplicaSetTagMatch { - friend class ReplicaSetTagConfig; - public: - /** - * Constructs an empty match object, equivalent to one that matches an - * empty pattern. - */ - ReplicaSetTagMatch() {} - - /** - * Constructs a clean match object for the given pattern. - */ - explicit ReplicaSetTagMatch(const ReplicaSetTagPattern& pattern); - - /** - * Updates the match state based on the data for the given tag. - * - * Returns true if, after this update, isSatisfied() is true. - */ - bool update(const ReplicaSetTag& tag); - - /** - * Returns true if the match has received a sequence of tags sufficient to satisfy the - * pattern. - */ + bool update(const ReplicaSetTag& tag); + + /** + * Returns true if the match has received a sequence of tags sufficient to satisfy the + * pattern. + */ + bool isSatisfied() const; + +private: + /** + * Representation of the state related to a single tag key in the match pattern. + * Consists of a constraint (key index and min count for satisfaction) and a list + * of already observed values. + * + * A BoundTagValue is satisfied when the size of boundValues is at least + * constraint.getMinCount(). + */ + struct BoundTagValue { + BoundTagValue() {} + explicit BoundTagValue(const ReplicaSetTagPattern::TagCountConstraint& aConstraint) + : constraint(aConstraint) {} + + int32_t getKeyIndex() const { + return constraint.getKeyIndex(); + } bool isSatisfied() const; - private: - /** - * Representation of the state related to a single tag key in the match pattern. - * Consists of a constraint (key index and min count for satisfaction) and a list - * of already observed values. - * - * A BoundTagValue is satisfied when the size of boundValues is at least - * constraint.getMinCount(). - */ - struct BoundTagValue { - BoundTagValue() {} - explicit BoundTagValue(const ReplicaSetTagPattern::TagCountConstraint& aConstraint) : - constraint(aConstraint) {} - - int32_t getKeyIndex() const { return constraint.getKeyIndex(); } - bool isSatisfied() const; - - ReplicaSetTagPattern::TagCountConstraint constraint; - std::vector<int32_t> boundValues; - }; - std::vector<BoundTagValue> _boundTagValues; + ReplicaSetTagPattern::TagCountConstraint constraint; + std::vector<int32_t> boundValues; }; + std::vector<BoundTagValue> _boundTagValues; +}; + +/** + * Representation of the tag configuration information for a replica set. + * + * This type, like all in this file, is copyable. Tags and patterns from one instance of this + * class are compatible with other instances of this class that are *copies* of the original + * instance. + */ +class ReplicaSetTagConfig { +public: + /** + * Finds or allocates a tag with the given "key" and "value" strings. + */ + ReplicaSetTag makeTag(const StringData& key, const StringData& value); + + /** + * Finds a tag with the given key and value strings, or returns a tag whose isValid() method + * returns false if the configuration has never allocated such a tag via makeTag(). + */ + ReplicaSetTag findTag(const StringData& key, const StringData& value) const; + + /** + * Makes a new, empty pattern object. + */ + ReplicaSetTagPattern makePattern() const; /** - * Representation of the tag configuration information for a replica set. + * Adds a constraint clause to the given "pattern". This particular + * constraint requires that at least "minCount" distinct tags with the given "tagKey" + * be observed. 
Two tags "t1" and "t2" are distinct if "t1 != t2", so this constraint + * means that we must see at least "minCount" tags with the specified "tagKey". + */ + Status addTagCountConstraintToPattern(ReplicaSetTagPattern* pattern, + const StringData& tagKey, + int32_t minCount) const; + + /** + * Gets the string key for the given "tag". * - * This type, like all in this file, is copyable. Tags and patterns from one instance of this - * class are compatible with other instances of this class that are *copies* of the original - * instance. + * Behavior is undefined if "tag" is not valid or was not from this + * config or one of its copies. */ - class ReplicaSetTagConfig { - public: - /** - * Finds or allocates a tag with the given "key" and "value" strings. - */ - ReplicaSetTag makeTag(const StringData& key, const StringData& value); - - /** - * Finds a tag with the given key and value strings, or returns a tag whose isValid() method - * returns false if the configuration has never allocated such a tag via makeTag(). - */ - ReplicaSetTag findTag(const StringData& key, const StringData& value) const; - - /** - * Makes a new, empty pattern object. - */ - ReplicaSetTagPattern makePattern() const; - - /** - * Adds a constraint clause to the given "pattern". This particular - * constraint requires that at least "minCount" distinct tags with the given "tagKey" - * be observed. Two tags "t1" and "t2" are distinct if "t1 != t2", so this constraint - * means that we must see at least "minCount" tags with the specified "tagKey". - */ - Status addTagCountConstraintToPattern(ReplicaSetTagPattern* pattern, - const StringData& tagKey, - int32_t minCount) const; - - /** - * Gets the string key for the given "tag". - * - * Behavior is undefined if "tag" is not valid or was not from this - * config or one of its copies. - */ - std::string getTagKey(const ReplicaSetTag& tag) const; - - /** - * Gets the string value for the given "tag". - * - * Like getTagKey, above, behavior is undefined if "tag" is not valid or was not from this - * config or one of its copies. - */ - std::string getTagValue(const ReplicaSetTag& tag) const; - - /** - * Helper that writes a string debugging representation of "tag" to "os". - */ - void put(const ReplicaSetTag& tag, std::ostream& os) const; - - /** - * Helper that writes a string debugging representation of "pattern" to "os". - */ - void put(const ReplicaSetTagPattern& pattern, std::ostream& os) const; - - /** - * Helper that writes a string debugging representation of "matcher" to "os". - */ - void put(const ReplicaSetTagMatch& matcher, std::ostream& os) const; + std::string getTagKey(const ReplicaSetTag& tag) const; - private: - typedef std::vector<std::string> ValueVector; - typedef std::vector<std::pair<std::string, ValueVector> > KeyValueVector; - - /** - * Returns the index corresponding to "key", or _tagData.size() if there is no - * such index. - */ - int32_t _findKeyIndex(const StringData& key) const; - - /** - * Helper that writes a "tagKey" field for the given "keyIndex" to "builder". - */ - void _appendTagKey(int32_t keyIndex, BSONObjBuilder* builder) const; - - /** - * Helper that writes a "tagValue" field for the given "keyIndex" and "valueIndex" - * to "builder". - */ - void _appendTagValue(int32_t keyIndex, int32_t valueIndex, BSONObjBuilder* builder) const; - - /** - * Helper that writes a constraint object to "builder". 
- */ - void _appendConstraint(const ReplicaSetTagPattern::TagCountConstraint& constraint, - BSONObjBuilder* builder) const; - - // Data about known tags. Conceptually, it maps between keys and their indexes, - // keys and their associated values, and (key, value) pairs and the values' indexes. - KeyValueVector _tagData; - }; + /** + * Gets the string value for the given "tag". + * + * Like getTagKey, above, behavior is undefined if "tag" is not valid or was not from this + * config or one of its copies. + */ + std::string getTagValue(const ReplicaSetTag& tag) const; + + /** + * Helper that writes a string debugging representation of "tag" to "os". + */ + void put(const ReplicaSetTag& tag, std::ostream& os) const; + + /** + * Helper that writes a string debugging representation of "pattern" to "os". + */ + void put(const ReplicaSetTagPattern& pattern, std::ostream& os) const; + + /** + * Helper that writes a string debugging representation of "matcher" to "os". + */ + void put(const ReplicaSetTagMatch& matcher, std::ostream& os) const; + +private: + typedef std::vector<std::string> ValueVector; + typedef std::vector<std::pair<std::string, ValueVector>> KeyValueVector; + + /** + * Returns the index corresponding to "key", or _tagData.size() if there is no + * such index. + */ + int32_t _findKeyIndex(const StringData& key) const; + + /** + * Helper that writes a "tagKey" field for the given "keyIndex" to "builder". + */ + void _appendTagKey(int32_t keyIndex, BSONObjBuilder* builder) const; + + /** + * Helper that writes a "tagValue" field for the given "keyIndex" and "valueIndex" + * to "builder". + */ + void _appendTagValue(int32_t keyIndex, int32_t valueIndex, BSONObjBuilder* builder) const; + + /** + * Helper that writes a constraint object to "builder". + */ + void _appendConstraint(const ReplicaSetTagPattern::TagCountConstraint& constraint, + BSONObjBuilder* builder) const; + + // Data about known tags. Conceptually, it maps between keys and their indexes, + // keys and their associated values, and (key, value) pairs and the values' indexes. 
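+    // For instance, after makeTag("dc", "ny") and makeTag("dc", "va"), _tagData is
+    // [ ("dc", ["ny", "va"]) ] and the tag ("dc", "va") is the index pair (0, 1).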
+ KeyValueVector _tagData; +}; } // namespace repl } // namespace mongo diff --git a/src/mongo/db/repl/replica_set_tag_test.cpp b/src/mongo/db/repl/replica_set_tag_test.cpp index 1a2bdf9e120..1d70ee39bbe 100644 --- a/src/mongo/db/repl/replica_set_tag_test.cpp +++ b/src/mongo/db/repl/replica_set_tag_test.cpp @@ -33,129 +33,129 @@ namespace mongo { namespace repl { namespace { - template <typename T> - class StreamPutter { - public: - StreamPutter(const ReplicaSetTagConfig& tagConfig, const T& item) : - _tagConfig(&tagConfig), _item(&item) {} - void put(std::ostream& os) const { - _tagConfig->put(*_item, os); - } - - private: - const ReplicaSetTagConfig* _tagConfig; - const T* _item; - }; - - template <typename T> - StreamPutter<T> streamput(const ReplicaSetTagConfig& tagConfig, const T& item) { - return StreamPutter<T>(tagConfig, item); +template <typename T> +class StreamPutter { +public: + StreamPutter(const ReplicaSetTagConfig& tagConfig, const T& item) + : _tagConfig(&tagConfig), _item(&item) {} + void put(std::ostream& os) const { + _tagConfig->put(*_item, os); } - template <typename T> - std::ostream& operator<<(std::ostream& os, const StreamPutter<T>& putter) { - putter.put(os); - return os; +private: + const ReplicaSetTagConfig* _tagConfig; + const T* _item; +}; + +template <typename T> +StreamPutter<T> streamput(const ReplicaSetTagConfig& tagConfig, const T& item) { + return StreamPutter<T>(tagConfig, item); +} + +template <typename T> +std::ostream& operator<<(std::ostream& os, const StreamPutter<T>& putter) { + putter.put(os); + return os; +} + +TEST(ReplicaSetTagConfigTest, MakeAndFindTags) { + ReplicaSetTagConfig tagConfig; + ReplicaSetTag dcNY = tagConfig.makeTag("dc", "ny"); + ReplicaSetTag dcRI = tagConfig.makeTag("dc", "ri"); + ReplicaSetTag rack1 = tagConfig.makeTag("rack", "1"); + ReplicaSetTag rack2 = tagConfig.makeTag("rack", "2"); + ASSERT_TRUE(dcNY.isValid()); + ASSERT_EQUALS("dc", tagConfig.getTagKey(dcNY)); + ASSERT_EQUALS("ny", tagConfig.getTagValue(dcNY)); + ASSERT_EQUALS("dc", tagConfig.getTagKey(dcRI)); + ASSERT_EQUALS("ri", tagConfig.getTagValue(dcRI)); + ASSERT_EQUALS("rack", tagConfig.getTagKey(rack1)); + ASSERT_EQUALS("1", tagConfig.getTagValue(rack1)); + ASSERT_EQUALS("rack", tagConfig.getTagKey(rack2)); + ASSERT_EQUALS("2", tagConfig.getTagValue(rack2)); + + ASSERT_EQUALS(rack1.getKeyIndex(), rack2.getKeyIndex()); + ASSERT_NOT_EQUALS(rack1.getKeyIndex(), dcRI.getKeyIndex()); + ASSERT_NOT_EQUALS(rack1.getValueIndex(), rack2.getValueIndex()); + + ASSERT_TRUE(rack1 == tagConfig.makeTag("rack", "1")); + ASSERT_TRUE(rack1 == tagConfig.findTag("rack", "1")); + ASSERT_FALSE(tagConfig.findTag("rack", "7").isValid()); + ASSERT_FALSE(tagConfig.findTag("country", "us").isValid()); +} + +class ReplicaSetTagMatchTest : public unittest::Test { +public: + void setUp() { + dcNY = tagConfig.makeTag("dc", "ny"); + dcVA = tagConfig.makeTag("dc", "va"); + dcRI = tagConfig.makeTag("dc", "ri"); + rack1 = tagConfig.makeTag("rack", "1"); + rack2 = tagConfig.makeTag("rack", "2"); + rack3 = tagConfig.makeTag("rack", "3"); + rack4 = tagConfig.makeTag("rack", "4"); } - TEST(ReplicaSetTagConfigTest, MakeAndFindTags) { - ReplicaSetTagConfig tagConfig; - ReplicaSetTag dcNY = tagConfig.makeTag("dc", "ny"); - ReplicaSetTag dcRI = tagConfig.makeTag("dc", "ri"); - ReplicaSetTag rack1 = tagConfig.makeTag("rack", "1"); - ReplicaSetTag rack2 = tagConfig.makeTag("rack", "2"); - ASSERT_TRUE(dcNY.isValid()); - ASSERT_EQUALS("dc", tagConfig.getTagKey(dcNY)); - ASSERT_EQUALS("ny", 
tagConfig.getTagValue(dcNY)); - ASSERT_EQUALS("dc", tagConfig.getTagKey(dcRI)); - ASSERT_EQUALS("ri", tagConfig.getTagValue(dcRI)); - ASSERT_EQUALS("rack", tagConfig.getTagKey(rack1)); - ASSERT_EQUALS("1", tagConfig.getTagValue(rack1)); - ASSERT_EQUALS("rack", tagConfig.getTagKey(rack2)); - ASSERT_EQUALS("2", tagConfig.getTagValue(rack2)); - - ASSERT_EQUALS(rack1.getKeyIndex(), rack2.getKeyIndex()); - ASSERT_NOT_EQUALS(rack1.getKeyIndex(), dcRI.getKeyIndex()); - ASSERT_NOT_EQUALS(rack1.getValueIndex(), rack2.getValueIndex()); - - ASSERT_TRUE(rack1 == tagConfig.makeTag("rack", "1")); - ASSERT_TRUE(rack1 == tagConfig.findTag("rack", "1")); - ASSERT_FALSE(tagConfig.findTag("rack", "7").isValid()); - ASSERT_FALSE(tagConfig.findTag("country", "us").isValid()); - } - - class ReplicaSetTagMatchTest : public unittest::Test { - public: - void setUp() { - dcNY = tagConfig.makeTag("dc", "ny"); - dcVA = tagConfig.makeTag("dc", "va"); - dcRI = tagConfig.makeTag("dc", "ri"); - rack1 = tagConfig.makeTag("rack", "1"); - rack2 = tagConfig.makeTag("rack", "2"); - rack3 = tagConfig.makeTag("rack", "3"); - rack4 = tagConfig.makeTag("rack", "4"); - } - - protected: - ReplicaSetTagConfig tagConfig; - ReplicaSetTag dcNY; - ReplicaSetTag dcVA; - ReplicaSetTag dcRI; - ReplicaSetTag rack1; - ReplicaSetTag rack2; - ReplicaSetTag rack3; - ReplicaSetTag rack4; - }; - - TEST_F(ReplicaSetTagMatchTest, EmptyPatternAlwaysSatisfied) { - ReplicaSetTagPattern pattern = tagConfig.makePattern(); - ASSERT_TRUE(ReplicaSetTagMatch(pattern).isSatisfied()); - ASSERT_OK(tagConfig.addTagCountConstraintToPattern(&pattern, "dc", 0)); - ASSERT_TRUE(ReplicaSetTagMatch(pattern).isSatisfied()); - } - - TEST_F(ReplicaSetTagMatchTest, SingleTagConstraint) { - ReplicaSetTagPattern pattern = tagConfig.makePattern(); - ASSERT_OK(tagConfig.addTagCountConstraintToPattern(&pattern, "dc", 2)); - ReplicaSetTagMatch matcher(pattern); - ASSERT_FALSE(matcher.isSatisfied()); - ASSERT_FALSE(matcher.update(dcVA)); // One DC alone won't satisfy "dc: 2". - ASSERT_FALSE(matcher.update(rack2)); // Adding one rack won't satisfy. - ASSERT_FALSE(matcher.update(rack3)); // Two racks won't satisfy "dc: 2". - ASSERT_FALSE(matcher.update(dcVA)); // Same tag twice won't satisfy. - ASSERT_TRUE(matcher.update(dcRI)); // Two DCs satisfies. - ASSERT_TRUE(matcher.isSatisfied()); - ASSERT_TRUE(matcher.update(dcNY)); // Three DCs satisfies. - ASSERT_TRUE(matcher.update(rack1)); // Once matcher is satisfied, it stays satisfied. - } - - TEST_F(ReplicaSetTagMatchTest, MaskingConstraints) { - // The highest count constraint for a tag key is the only one that matters. 
- ReplicaSetTagPattern pattern = tagConfig.makePattern(); - ASSERT_OK(tagConfig.addTagCountConstraintToPattern(&pattern, "rack", 2)); - ASSERT_OK(tagConfig.addTagCountConstraintToPattern(&pattern, "rack", 3)); - ReplicaSetTagMatch matcher(pattern); - ASSERT_FALSE(matcher.isSatisfied()); - ASSERT_FALSE(matcher.update(rack2)); - ASSERT_FALSE(matcher.update(rack3)); - ASSERT_FALSE(matcher.update(rack2)); - ASSERT_TRUE(matcher.update(rack1)); - } - - TEST_F(ReplicaSetTagMatchTest, MultipleConstraints) { - ReplicaSetTagPattern pattern = tagConfig.makePattern(); - ASSERT_OK(tagConfig.addTagCountConstraintToPattern(&pattern, "dc", 3)); - ASSERT_OK(tagConfig.addTagCountConstraintToPattern(&pattern, "rack", 2)); - ReplicaSetTagMatch matcher(pattern); - ASSERT_FALSE(matcher.isSatisfied()); - ASSERT_FALSE(matcher.update(dcVA)); - ASSERT_FALSE(matcher.update(rack2)); - ASSERT_FALSE(matcher.update(rack3)); - ASSERT_FALSE(matcher.update(dcVA)); - ASSERT_FALSE(matcher.update(dcRI)); - ASSERT_TRUE(matcher.update(dcNY)); - ASSERT_TRUE(matcher.isSatisfied()); - } +protected: + ReplicaSetTagConfig tagConfig; + ReplicaSetTag dcNY; + ReplicaSetTag dcVA; + ReplicaSetTag dcRI; + ReplicaSetTag rack1; + ReplicaSetTag rack2; + ReplicaSetTag rack3; + ReplicaSetTag rack4; +}; + +TEST_F(ReplicaSetTagMatchTest, EmptyPatternAlwaysSatisfied) { + ReplicaSetTagPattern pattern = tagConfig.makePattern(); + ASSERT_TRUE(ReplicaSetTagMatch(pattern).isSatisfied()); + ASSERT_OK(tagConfig.addTagCountConstraintToPattern(&pattern, "dc", 0)); + ASSERT_TRUE(ReplicaSetTagMatch(pattern).isSatisfied()); +} + +TEST_F(ReplicaSetTagMatchTest, SingleTagConstraint) { + ReplicaSetTagPattern pattern = tagConfig.makePattern(); + ASSERT_OK(tagConfig.addTagCountConstraintToPattern(&pattern, "dc", 2)); + ReplicaSetTagMatch matcher(pattern); + ASSERT_FALSE(matcher.isSatisfied()); + ASSERT_FALSE(matcher.update(dcVA)); // One DC alone won't satisfy "dc: 2". + ASSERT_FALSE(matcher.update(rack2)); // Adding one rack won't satisfy. + ASSERT_FALSE(matcher.update(rack3)); // Two racks won't satisfy "dc: 2". + ASSERT_FALSE(matcher.update(dcVA)); // Same tag twice won't satisfy. + ASSERT_TRUE(matcher.update(dcRI)); // Two DCs satisfies. + ASSERT_TRUE(matcher.isSatisfied()); + ASSERT_TRUE(matcher.update(dcNY)); // Three DCs satisfies. + ASSERT_TRUE(matcher.update(rack1)); // Once matcher is satisfied, it stays satisfied. +} + +TEST_F(ReplicaSetTagMatchTest, MaskingConstraints) { + // The highest count constraint for a tag key is the only one that matters. 
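+    // (addTagCountConstraint keeps at most one constraint per key and replaces an
+    // existing entry only when the new minCount is larger, so the "rack": 2 clause
+    // below is subsumed by "rack": 3.)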
+ ReplicaSetTagPattern pattern = tagConfig.makePattern(); + ASSERT_OK(tagConfig.addTagCountConstraintToPattern(&pattern, "rack", 2)); + ASSERT_OK(tagConfig.addTagCountConstraintToPattern(&pattern, "rack", 3)); + ReplicaSetTagMatch matcher(pattern); + ASSERT_FALSE(matcher.isSatisfied()); + ASSERT_FALSE(matcher.update(rack2)); + ASSERT_FALSE(matcher.update(rack3)); + ASSERT_FALSE(matcher.update(rack2)); + ASSERT_TRUE(matcher.update(rack1)); +} + +TEST_F(ReplicaSetTagMatchTest, MultipleConstraints) { + ReplicaSetTagPattern pattern = tagConfig.makePattern(); + ASSERT_OK(tagConfig.addTagCountConstraintToPattern(&pattern, "dc", 3)); + ASSERT_OK(tagConfig.addTagCountConstraintToPattern(&pattern, "rack", 2)); + ReplicaSetTagMatch matcher(pattern); + ASSERT_FALSE(matcher.isSatisfied()); + ASSERT_FALSE(matcher.update(dcVA)); + ASSERT_FALSE(matcher.update(rack2)); + ASSERT_FALSE(matcher.update(rack3)); + ASSERT_FALSE(matcher.update(dcVA)); + ASSERT_FALSE(matcher.update(dcRI)); + ASSERT_TRUE(matcher.update(dcNY)); + ASSERT_TRUE(matcher.isSatisfied()); +} } // namespace } // namespace repl diff --git a/src/mongo/db/repl/replication_coordinator.cpp b/src/mongo/db/repl/replication_coordinator.cpp index a8b1eaa2fee..5b7d25f969a 100644 --- a/src/mongo/db/repl/replication_coordinator.cpp +++ b/src/mongo/db/repl/replication_coordinator.cpp @@ -33,11 +33,11 @@ namespace mongo { namespace repl { - ReplicationCoordinator::ReplicationCoordinator() {} - ReplicationCoordinator::~ReplicationCoordinator() {} +ReplicationCoordinator::ReplicationCoordinator() {} +ReplicationCoordinator::~ReplicationCoordinator() {} - // TODO(dannenberg) remove when master slave is removed - const char *replAllDead = 0; +// TODO(dannenberg) remove when master slave is removed +const char* replAllDead = 0; -} // namespace repl -} // namespace mongo +} // namespace repl +} // namespace mongo diff --git a/src/mongo/db/repl/replication_coordinator.h b/src/mongo/db/repl/replication_coordinator.h index f9927f719dd..fe45fb2fd30 100644 --- a/src/mongo/db/repl/replication_coordinator.h +++ b/src/mongo/db/repl/replication_coordinator.h @@ -39,530 +39,518 @@ namespace mongo { - class BSONObj; - class BSONObjBuilder; - class IndexDescriptor; - class NamespaceString; - class OperationContext; - class OpTime; - struct WriteConcernOptions; +class BSONObj; +class BSONObjBuilder; +class IndexDescriptor; +class NamespaceString; +class OperationContext; +class OpTime; +struct WriteConcernOptions; namespace repl { - class BackgroundSync; - class HandshakeArgs; - class IsMasterResponse; - class OplogReader; - class ReplSetHeartbeatArgs; - class ReplSetHeartbeatResponse; - class ReplSetHtmlSummary; - class UpdatePositionArgs; +class BackgroundSync; +class HandshakeArgs; +class IsMasterResponse; +class OplogReader; +class ReplSetHeartbeatArgs; +class ReplSetHeartbeatResponse; +class ReplSetHtmlSummary; +class UpdatePositionArgs; + +/** + * Global variable that contains a std::string telling why master/slave halted + * + * "dead" means something really bad happened like replication falling completely out of sync. + * when non-null, we are dead and the string is informational + * + * TODO(dannenberg) remove when master slave goes + */ +extern const char* replAllDead; + +/** + * The ReplicationCoordinator is responsible for coordinating the interaction of replication + * with the rest of the system. The public methods on ReplicationCoordinator are the public + * API that the replication subsystem presents to the rest of the codebase. 
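+ * Every method on this interface is pure virtual: the server installs a
+ * concrete coordinator at startup, and tests can substitute their own
+ * implementation behind the same interface.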
+ */ +class ReplicationCoordinator { + MONGO_DISALLOW_COPYING(ReplicationCoordinator); + +public: + typedef boost::posix_time::milliseconds Milliseconds; + + struct StatusAndDuration { + public: + Status status; + Milliseconds duration; + + StatusAndDuration(const Status& stat, Milliseconds ms) : status(stat), duration(ms) {} + }; + + virtual ~ReplicationCoordinator(); + + /** + * Does any initial bookkeeping needed to start replication, and instructs the other + * components of the replication system to start up whatever threads and do whatever + * initialization they need. + */ + virtual void startReplication(OperationContext* txn) = 0; + + /** + * Does whatever cleanup is required to stop replication, including instructing the other + * components of the replication system to shut down and stop any threads they are using, + * blocking until all replication-related shutdown tasks are complete. + */ + virtual void shutdown() = 0; /** - * Global variable that contains a std::string telling why master/slave halted + * Returns a reference to the parsed command line arguments that are related to replication. + */ + virtual const ReplSettings& getSettings() const = 0; + + enum Mode { modeNone = 0, modeReplSet, modeMasterSlave }; + + /** + * Returns a value indicating whether this node was configured at start-up to run + * standalone, as part of a master-slave pair, or as a member of a replica set. + */ + virtual Mode getReplicationMode() const = 0; + + /** + * Returns true if this node is configured to be a member of a replica set or master/slave + * setup. + */ + virtual bool isReplEnabled() const = 0; + + /** + * Returns the current replica set state of this node (PRIMARY, SECONDARY, STARTUP, etc). + * It is invalid to call this unless getReplicationMode() == modeReplSet. + */ + virtual MemberState getMemberState() const = 0; + + /** + * Returns true if this node is in state PRIMARY or SECONDARY. * - * "dead" means something really bad happened like replication falling completely out of sync. - * when non-null, we are dead and the string is informational + * It is invalid to call this unless getReplicationMode() == modeReplSet. * - * TODO(dannenberg) remove when master slave goes + * This method may be optimized to reduce synchronization overhead compared to + * reading the current member state with getMemberState(). */ - extern const char *replAllDead; - + virtual bool isInPrimaryOrSecondaryState() const = 0; + + /** - * The ReplicationCoordinator is responsible for coordinating the interaction of replication - * with the rest of the system. The public methods on ReplicationCoordinator are the public - * API that the replication subsystem presents to the rest of the codebase. + * Returns how slave delayed this node is configured to be. + * + * Raises a DBException if this node is not a member of the current replica set + * configuration. */ - class ReplicationCoordinator { - MONGO_DISALLOW_COPYING(ReplicationCoordinator); + virtual Seconds getSlaveDelaySecs() const = 0; - public: + /** + * Clears the list of sync sources we have blacklisted. + */ + virtual void clearSyncSourceBlacklist() = 0; + + /** + * Blocks the calling thread for up to writeConcern.wTimeout millis, or until "ts" has been + * replicated to at least a set of nodes that satisfies the writeConcern, whichever comes + * first. A writeConcern.wTimeout of 0 indicates no timeout (block forever) and a + * writeConcern.wTimeout of -1 indicates return immediately after checking. 
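+     * For example, a caller willing to wait at most five seconds for replication
+     * would pass a writeConcern with wTimeout == 5000 (milliseconds).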
Return codes: + * ErrorCodes::ExceededTimeLimit if the writeConcern.wTimeout is reached before + * the data has been sufficiently replicated + * ErrorCodes::NotMaster if the node is not Primary/Master + * ErrorCodes::UnknownReplWriteConcern if the writeConcern.wMode contains a write concern + * mode that is not known + * ErrorCodes::ShutdownInProgress if we are mid-shutdown + * ErrorCodes::Interrupted if the operation was killed with killop() + */ + virtual StatusAndDuration awaitReplication(const OperationContext* txn, + const OpTime& ts, + const WriteConcernOptions& writeConcern) = 0; + + /** + * Like awaitReplication(), above, but waits for the replication of the last operation + * performed on the client associated with "txn". + */ + virtual StatusAndDuration awaitReplicationOfLastOpForClient( + const OperationContext* txn, const WriteConcernOptions& writeConcern) = 0; + + /** + * Causes this node to relinquish being primary for at least 'stepdownTime'. If 'force' is + * false, before doing so it will wait for 'waitTime' for one other node to be within 10 + * seconds of this node's optime before stepping down. Returns a Status with the code + * ErrorCodes::ExceededTimeLimit if no secondary catches up within waitTime, + * ErrorCodes::NotMaster if you are no longer primary when trying to step down, + * ErrorCodes::SecondaryAheadOfPrimary if we are primary but there is another node that + * seems to be ahead of us in replication, and Status::OK otherwise. + */ + virtual Status stepDown(OperationContext* txn, + bool force, + const Milliseconds& waitTime, + const Milliseconds& stepdownTime) = 0; + + /** + * Returns true if the node can be considered master for the purpose of introspective + * commands such as isMaster() and rs.status(). + */ + virtual bool isMasterForReportingPurposes() = 0; - typedef boost::posix_time::milliseconds Milliseconds; - - struct StatusAndDuration { - public: - Status status; - Milliseconds duration; - - StatusAndDuration(const Status& stat, Milliseconds ms) : status(stat), - duration(ms) {} - }; - - virtual ~ReplicationCoordinator(); - - /** - * Does any initial bookkeeping needed to start replication, and instructs the other - * components of the replication system to start up whatever threads and do whatever - * initialization they need. - */ - virtual void startReplication(OperationContext* txn) = 0; - - /** - * Does whatever cleanup is required to stop replication, including instructing the other - * components of the replication system to shut down and stop any threads they are using, - * blocking until all replication-related shutdown tasks are complete. - */ - virtual void shutdown() = 0; - - /** - * Returns a reference to the parsed command line arguments that are related to replication. - */ - virtual const ReplSettings& getSettings() const = 0; - - enum Mode { - modeNone = 0, - modeReplSet, - modeMasterSlave - }; - - /** - * Returns a value indicating whether this node was configured at start-up to run - * standalone, as part of a master-slave pair, or as a member of a replica set. - */ - virtual Mode getReplicationMode() const = 0; - - /** - * Returns true if this node is configured to be a member of a replica set or master/slave - * setup. - */ - virtual bool isReplEnabled() const = 0; - - /** - * Returns the current replica set state of this node (PRIMARY, SECONDARY, STARTUP, etc). - * It is invalid to call this unless getReplicationMode() == modeReplSet. 
- */ - virtual MemberState getMemberState() const = 0; - - /** - * Returns true if this node is in state PRIMARY or SECONDARY. - * - * It is invalid to call this unless getReplicationMode() == modeReplSet. - * - * This method may be optimized to reduce synchronization overhead compared to - * reading the current member state with getMemberState(). - */ - virtual bool isInPrimaryOrSecondaryState() const = 0; - - - /** - * Returns how slave delayed this node is configured to be. - * - * Raises a DBException if this node is not a member of the current replica set - * configuration. - */ - virtual Seconds getSlaveDelaySecs() const = 0; - - /** - * Clears the list of sync sources we have blacklisted. - */ - virtual void clearSyncSourceBlacklist() = 0; - - /** - * Blocks the calling thread for up to writeConcern.wTimeout millis, or until "ts" has been - * replicated to at least a set of nodes that satisfies the writeConcern, whichever comes - * first. A writeConcern.wTimeout of 0 indicates no timeout (block forever) and a - * writeConcern.wTimeout of -1 indicates return immediately after checking. Return codes: - * ErrorCodes::ExceededTimeLimit if the writeConcern.wTimeout is reached before - * the data has been sufficiently replicated - * ErrorCodes::NotMaster if the node is not Primary/Master - * ErrorCodes::UnknownReplWriteConcern if the writeConcern.wMode contains a write concern - * mode that is not known - * ErrorCodes::ShutdownInProgress if we are mid-shutdown - * ErrorCodes::Interrupted if the operation was killed with killop() - */ - virtual StatusAndDuration awaitReplication(const OperationContext* txn, - const OpTime& ts, - const WriteConcernOptions& writeConcern) = 0; - - /** - * Like awaitReplication(), above, but waits for the replication of the last operation - * performed on the client associated with "txn". - */ - virtual StatusAndDuration awaitReplicationOfLastOpForClient( - const OperationContext* txn, - const WriteConcernOptions& writeConcern) = 0; - - /** - * Causes this node to relinquish being primary for at least 'stepdownTime'. If 'force' is - * false, before doing so it will wait for 'waitTime' for one other node to be within 10 - * seconds of this node's optime before stepping down. Returns a Status with the code - * ErrorCodes::ExceededTimeLimit if no secondary catches up within waitTime, - * ErrorCodes::NotMaster if you are no longer primary when trying to step down, - * ErrorCodes::SecondaryAheadOfPrimary if we are primary but there is another node that - * seems to be ahead of us in replication, and Status::OK otherwise. - */ - virtual Status stepDown(OperationContext* txn, - bool force, - const Milliseconds& waitTime, - const Milliseconds& stepdownTime) = 0; - - /** - * Returns true if the node can be considered master for the purpose of introspective - * commands such as isMaster() and rs.status(). - */ - virtual bool isMasterForReportingPurposes() = 0; - - /** - * Returns true if it is valid for this node to accept writes on the given database. - * Currently this is true only if this node is Primary, master in master/slave, - * a standalone, or is writing to the local database. - * - * If a node was started with the replSet argument, but has not yet received a config, it - * will not be able to receive writes to a database other than local (it will not be treated - * as standalone node). - * - * NOTE: This function can only be meaningfully called while the caller holds the global - * lock in some mode other than MODE_NONE. 
- */ - virtual bool canAcceptWritesForDatabase(const StringData& dbName) = 0; - - /** - * Checks if the current replica set configuration can satisfy the given write concern. - * - * Things that are taken into consideration include: - * 1. If the set has enough data-bearing members. - * 2. If the write concern mode exists. - * 3. If there are enough members for the write concern mode specified. - */ - virtual Status checkIfWriteConcernCanBeSatisfied( - const WriteConcernOptions& writeConcern) const = 0; - - /** - * Returns Status::OK() if it is valid for this node to serve reads on the given collection - * and an errorcode indicating why the node cannot if it cannot. - */ - virtual Status checkCanServeReadsFor(OperationContext* txn, - const NamespaceString& ns, - bool slaveOk) = 0; - - /** - * Returns true if this node should ignore unique index constraints on new documents. - * Currently this is needed for nodes in STARTUP2, RECOVERING, and ROLLBACK states. - */ - virtual bool shouldIgnoreUniqueIndex(const IndexDescriptor* idx) = 0; - - /** - * Updates our internal tracking of the last OpTime applied for the given slave - * identified by "rid". Only valid to call in master/slave mode - */ - virtual Status setLastOptimeForSlave(const OID& rid, const OpTime& ts) = 0; - - /** - * Updates our internal tracking of the last OpTime applied to this node. - * - * The new value of "ts" must be no less than any prior value passed to this method, and it - * is the caller's job to properly synchronize this behavior. The exception to this rule is - * that after calls to resetLastOpTimeFromOplog(), the minimum acceptable value for "ts" is - * reset based on the contents of the oplog, and may go backwards due to rollback. - */ - virtual void setMyLastOptime(const OpTime& ts) = 0; - - /** - * Same as above, but used during places we need to zero our last optime. - */ - virtual void resetMyLastOptime() = 0; - - /** - * Updates our the message we include in heartbeat responses. - */ - virtual void setMyHeartbeatMessage(const std::string& msg) = 0; - - /** - * Returns the last optime recorded by setMyLastOptime. - */ - virtual OpTime getMyLastOptime() const = 0; - - /** - * Retrieves and returns the current election id, which is a unique id that is local to - * this node and changes every time we become primary. - * TODO(spencer): Use term instead. - */ - virtual OID getElectionId() = 0; - - /** - * Returns the RID for this node. The RID is used to identify this node to our sync source - * when sending updates about our replication progress. - */ - virtual OID getMyRID() const = 0; - - /** - * Returns the id for this node as specified in the current replica set configuration. - */ - virtual int getMyId() const = 0; - - /** - * Sets this node into a specific follower mode. - * - * Returns true if the follower mode was successfully set. Returns false if the - * node is or becomes a leader before setFollowerMode completes. - * - * Follower modes are RS_STARTUP2 (initial sync), RS_SECONDARY, RS_ROLLBACK and - * RS_RECOVERING. They are the valid states of a node whose topology coordinator has the - * follower role. - * - * This is essentially an interface that allows the applier to prevent the node from - * becoming a candidate or accepting reads, depending on circumstances in the oplog - * application process. - */ - virtual bool setFollowerMode(const MemberState& newState) = 0; - - /** - * Returns true if the coordinator wants the applier to pause application. 
- * - * If this returns true, the applier should call signalDrainComplete() when it has - * completed draining its operation buffer and no further ops are being applied. - */ - virtual bool isWaitingForApplierToDrain() = 0; - - /** - * Signals that a previously requested pause and drain of the applier buffer - * has completed. - * - * This is an interface that allows the applier to reenable writes after - * a successful election triggers the draining of the applier buffer. - */ - virtual void signalDrainComplete(OperationContext* txn) = 0; - - /** - * Signals the sync source feedback thread to wake up and send a handshake and - * replSetUpdatePosition command to our sync source. - */ - virtual void signalUpstreamUpdater() = 0; - - /** - * Prepares a BSONObj describing an invocation of the replSetUpdatePosition command that can - * be sent to this node's sync source to update it about our progress in replication. - * - * The returned bool indicates whether or not the command was created. - */ - virtual bool prepareReplSetUpdatePositionCommand(BSONObjBuilder* cmdBuilder) = 0; - - /** - * For ourself and each secondary chaining off of us, adds a BSONObj to "handshakes" - * describing an invocation of the replSetUpdateCommand that can be sent to this node's - * sync source to handshake us and our chained secondaries, informing the sync source that - * we are replicating off of it. - */ - virtual void prepareReplSetUpdatePositionCommandHandshakes( - std::vector<BSONObj>* handshakes) = 0; - - /** - * Handles an incoming replSetGetStatus command. Adds BSON to 'result'. - */ - virtual Status processReplSetGetStatus(BSONObjBuilder* result) = 0; - - /** - * Handles an incoming isMaster command for a replica set node. Should not be - * called on a master-slave or standalone node. - */ - virtual void fillIsMasterForReplSet(IsMasterResponse* result) = 0; - - /** - * Adds to "result" a description of the slaveInfo data structure used to map RIDs to their - * last known optimes. - */ - virtual void appendSlaveInfoData(BSONObjBuilder* result) = 0; - - /** - * Handles an incoming replSetGetConfig command. Adds BSON to 'result'. - */ - virtual void processReplSetGetConfig(BSONObjBuilder* result) = 0; - - /** - * Toggles maintenanceMode to the value expressed by 'activate' - * return Status::OK if the change worked, NotSecondary if it failed because we are - * PRIMARY, and OperationFailed if we are not currently in maintenance mode - */ - virtual Status setMaintenanceMode(bool activate) = 0; - - /** - * Retrieves the current count of maintenanceMode and returns 'true' if greater than 0. - */ - virtual bool getMaintenanceMode() = 0; - - /** - * Handles an incoming replSetSyncFrom command. Adds BSON to 'result' - * returns Status::OK if the sync target could be set and an ErrorCode indicating why it - * couldn't otherwise. - */ - virtual Status processReplSetSyncFrom(const HostAndPort& target, - BSONObjBuilder* resultObj) = 0; - - /** - * Handles an incoming replSetFreeze command. Adds BSON to 'resultObj' - * returns Status::OK() if the node is a member of a replica set with a config and an - * error Status otherwise - */ - virtual Status processReplSetFreeze(int secs, BSONObjBuilder* resultObj) = 0; - - /** - * Handles an incoming heartbeat command with arguments 'args'. Populates 'response'; - * returns a Status with either OK or an error message. 
- */ - virtual Status processHeartbeat(const ReplSetHeartbeatArgs& args, - ReplSetHeartbeatResponse* response) = 0; - - /** - * Arguments for the replSetReconfig command. - */ - struct ReplSetReconfigArgs { - BSONObj newConfigObj; - bool force; - }; - - /** - * Handles an incoming replSetReconfig command. Adds BSON to 'resultObj'; - * returns a Status with either OK or an error message. - */ - virtual Status processReplSetReconfig(OperationContext* txn, - const ReplSetReconfigArgs& args, - BSONObjBuilder* resultObj) = 0; - - /* - * Handles an incoming replSetInitiate command. If "configObj" is empty, generates a default - * configuration to use. - * Adds BSON to 'resultObj'; returns a Status with either OK or an error message. - */ - virtual Status processReplSetInitiate(OperationContext* txn, - const BSONObj& configObj, - BSONObjBuilder* resultObj) = 0; - - /* - * Handles an incoming replSetGetRBID command. - * Adds BSON to 'resultObj'; returns a Status with either OK or an error message. - */ - virtual Status processReplSetGetRBID(BSONObjBuilder* resultObj) = 0; - - /** - * Increments this process's rollback id. Called every time a rollback occurs. - */ - virtual void incrementRollbackID() = 0; - - /** - * Arguments to the replSetFresh command. - */ - struct ReplSetFreshArgs { - StringData setName; // Name of the replset - HostAndPort who; // host and port of the member that sent the replSetFresh command - unsigned id; // replSet id of the member that sent the replSetFresh command - int cfgver; // replSet config version that the member who sent the command thinks it has - OpTime opTime; // last optime seen by the member who sent the replSetFresh command - }; - - /* - * Handles an incoming replSetFresh command. - * Adds BSON to 'resultObj'; returns a Status with either OK or an error message. - */ - virtual Status processReplSetFresh(const ReplSetFreshArgs& args, - BSONObjBuilder* resultObj) = 0; - - /** - * Arguments to the replSetElect command. - */ - struct ReplSetElectArgs { - StringData set; // Name of the replset - int whoid; // replSet id of the member that sent the replSetFresh command - int cfgver; // replSet config version that the member who sent the command thinks it has - OID round; // unique ID for this election - }; - - /* - * Handles an incoming replSetElect command. - * Adds BSON to 'resultObj'; returns a Status with either OK or an error message. - */ - virtual Status processReplSetElect(const ReplSetElectArgs& args, - BSONObjBuilder* resultObj) = 0; - - /** - * Handles an incoming replSetUpdatePosition command, updating each node's oplog progress. - * Returns Status::OK() if all updates are processed correctly, NodeNotFound - * if any updating node cannot be found in the config, InvalidReplicaSetConfig if the - * "cfgver" sent in any of the updates doesn't match our config version, or - * NotMasterOrSecondaryCode if we are in state REMOVED or otherwise don't have a valid - * replica set config. - * If a non-OK status is returned, it is unspecified whether none or some of the updates - * were applied. - */ - virtual Status processReplSetUpdatePosition(const UpdatePositionArgs& updates) = 0; - - /** - * Handles an incoming Handshake command (or a handshake from replSetUpdatePosition). - * Associates the node's 'remoteID' with its 'handshake' object. This association is used - * to update local.slaves and to forward the node's replication progress upstream when this - * node is being chained through. 
- * - * Returns ErrorCodes::NodeNotFound if no replica set member exists with the given member ID - * and ErrorCodes::NotMasterOrSecondaryCode if we're in state REMOVED or otherwise don't - * have a valid config. - */ - virtual Status processHandshake(OperationContext* txn, const HandshakeArgs& handshake) = 0; - - /** - * Returns a bool indicating whether or not this node builds indexes. - */ - virtual bool buildsIndexes() = 0; - - /** - * Returns a vector of members that have applied the operation with OpTime 'op'. - */ - virtual std::vector<HostAndPort> getHostsWrittenTo(const OpTime& op) = 0; - - /** - * Returns a vector of the members other than ourself in the replica set, as specified in - * the replica set config. Invalid to call if we are not in replica set mode. Returns - * an empty vector if we do not have a valid config. - */ - virtual std::vector<HostAndPort> getOtherNodesInReplSet() const = 0; - - /** - * Returns a BSONObj containing a representation of the current default write concern. - */ - virtual WriteConcernOptions getGetLastErrorDefault() = 0; - - /** - * Checks that the --replSet flag was passed when starting up the node and that the node - * has a valid replica set config. - * - * Returns a Status indicating whether those conditions are met with errorcode - * NoReplicationEnabled if --replSet was not present during start up or with errorcode - * NotYetInitialized in the absence of a valid config. Also adds error info to "result". - */ - virtual Status checkReplEnabledForCommand(BSONObjBuilder* result) = 0; - - /** - * Chooses a viable sync source, or, if none available, returns empty HostAndPort. - */ - virtual HostAndPort chooseNewSyncSource(const OpTime& lastOpTimeFetched) = 0; - - /** - * Blacklists choosing 'host' as a sync source until time 'until'. - */ - virtual void blacklistSyncSource(const HostAndPort& host, Date_t until) = 0; - - /** - * Loads the optime from the last op in the oplog into the coordinator's lastOpApplied - * value. - */ - virtual void resetLastOpTimeFromOplog(OperationContext* txn) = 0; - - /** - * Determines if a new sync source should be considered. - * currentSource: the current sync source - */ - virtual bool shouldChangeSyncSource(const HostAndPort& currentSource) = 0; - - /** - * Writes into 'output' all the information needed to generate a summary of the current - * replication state for use by the web interface. - */ - virtual void summarizeAsHtml(ReplSetHtmlSummary* output) = 0; - - protected: - - ReplicationCoordinator(); + /** + * Returns true if it is valid for this node to accept writes on the given database. + * Currently this is true only if this node is Primary, master in master/slave, + * a standalone, or is writing to the local database. + * + * If a node was started with the replSet argument, but has not yet received a config, it + * will not be able to receive writes to a database other than local (it will not be treated + * as standalone node). + * + * NOTE: This function can only be meaningfully called while the caller holds the global + * lock in some mode other than MODE_NONE. + */ + virtual bool canAcceptWritesForDatabase(const StringData& dbName) = 0; + + /** + * Checks if the current replica set configuration can satisfy the given write concern. + * + * Things that are taken into consideration include: + * 1. If the set has enough data-bearing members. + * 2. If the write concern mode exists. + * 3. If there are enough members for the write concern mode specified. 
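+     *
+     * A hypothetical caller-side sketch ('replCoord' is a stand-in for the coordinator;
+     * 'wMode' and 'wTimeout' are the WriteConcernOptions fields referenced elsewhere in
+     * this header; the values are illustrative):
+     *
+     *     WriteConcernOptions writeConcern;
+     *     writeConcern.wMode = "majority";
+     *     writeConcern.wTimeout = 5000;
+     *     Status status = replCoord->checkIfWriteConcernCanBeSatisfied(writeConcern);
+     *     if (!status.isOK()) {
+     *         // e.g. reject a default write concern that can never be satisfied
+     *     }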
+     */
+    virtual Status checkIfWriteConcernCanBeSatisfied(
+        const WriteConcernOptions& writeConcern) const = 0;
+
+    /**
+     * Returns Status::OK() if it is valid for this node to serve reads on the given collection,
+     * and an error code indicating why not if it cannot.
+     */
+    virtual Status checkCanServeReadsFor(OperationContext* txn,
+                                         const NamespaceString& ns,
+                                         bool slaveOk) = 0;
+
+    /**
+     * Returns true if this node should ignore unique index constraints on new documents.
+     * Currently this is needed for nodes in STARTUP2, RECOVERING, and ROLLBACK states.
+     */
+    virtual bool shouldIgnoreUniqueIndex(const IndexDescriptor* idx) = 0;
+
+    /**
+     * Updates our internal tracking of the last OpTime applied for the given slave
+     * identified by "rid". Only valid to call in master/slave mode.
+     */
+    virtual Status setLastOptimeForSlave(const OID& rid, const OpTime& ts) = 0;
+
+    /**
+     * Updates our internal tracking of the last OpTime applied to this node.
+     *
+     * The new value of "ts" must be no less than any prior value passed to this method, and it
+     * is the caller's job to properly synchronize this behavior. The exception to this rule is
+     * that after calls to resetLastOpTimeFromOplog(), the minimum acceptable value for "ts" is
+     * reset based on the contents of the oplog, and may go backwards due to rollback.
+     */
+    virtual void setMyLastOptime(const OpTime& ts) = 0;
+
+    /**
+     * Same as above, but used in places where we need to zero our last optime.
+     */
+    virtual void resetMyLastOptime() = 0;
+
+    /**
+     * Updates the message we include in heartbeat responses.
+     */
+    virtual void setMyHeartbeatMessage(const std::string& msg) = 0;
+
+    /**
+     * Returns the last optime recorded by setMyLastOptime.
+     */
+    virtual OpTime getMyLastOptime() const = 0;
+
+    /**
+     * Retrieves and returns the current election id, which is a unique id that is local to
+     * this node and changes every time we become primary.
+     * TODO(spencer): Use term instead.
+     */
+    virtual OID getElectionId() = 0;
+
+    /**
+     * Returns the RID for this node. The RID is used to identify this node to our sync source
+     * when sending updates about our replication progress.
+     */
+    virtual OID getMyRID() const = 0;
+
+    /**
+     * Returns the id for this node as specified in the current replica set configuration.
+     */
+    virtual int getMyId() const = 0;
+
+    /**
+     * Sets this node into a specific follower mode.
+     *
+     * Returns true if the follower mode was successfully set. Returns false if the
+     * node is or becomes a leader before setFollowerMode completes.
+     *
+     * Follower modes are RS_STARTUP2 (initial sync), RS_SECONDARY, RS_ROLLBACK and
+     * RS_RECOVERING. They are the valid states of a node whose topology coordinator has the
+     * follower role.
+     *
+     * This is essentially an interface that allows the applier to prevent the node from
+     * becoming a candidate or accepting reads, depending on circumstances in the oplog
+     * application process.
+     */
+    virtual bool setFollowerMode(const MemberState& newState) = 0;
+
+    /**
+     * Returns true if the coordinator wants the applier to pause application.
+     *
+     * If this returns true, the applier should call signalDrainComplete() when it has
+     * completed draining its operation buffer and no further ops are being applied.
+     */
+    virtual bool isWaitingForApplierToDrain() = 0;
+
+    /**
+     * Signals that a previously requested pause and drain of the applier buffer
+     * has completed.
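+     *
+     * Illustrative applier-side sequence (a sketch only; 'opBuffer' and 'replCoord' are
+     * stand-in names, not part of this interface):
+     *
+     *     if (replCoord->isWaitingForApplierToDrain() && opBuffer.empty()) {
+     *         replCoord->signalDrainComplete(txn);  // writes may be accepted again
+     *     }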
+     *
+     * This is an interface that allows the applier to reenable writes after
+     * a successful election triggers the draining of the applier buffer.
+     */
+    virtual void signalDrainComplete(OperationContext* txn) = 0;
+
+    /**
+     * Signals the sync source feedback thread to wake up and send a handshake and
+     * replSetUpdatePosition command to our sync source.
+     */
+    virtual void signalUpstreamUpdater() = 0;
+
+    /**
+     * Prepares a BSONObj describing an invocation of the replSetUpdatePosition command that can
+     * be sent to this node's sync source to update it about our progress in replication.
+     *
+     * The returned bool indicates whether or not the command was created.
+     */
+    virtual bool prepareReplSetUpdatePositionCommand(BSONObjBuilder* cmdBuilder) = 0;
+
+    /**
+     * For ourself and each secondary chaining off of us, adds a BSONObj to "handshakes"
+     * describing an invocation of the replSetUpdatePosition command that can be sent to this
+     * node's sync source to handshake us and our chained secondaries, informing the sync
+     * source that we are replicating off of it.
+     */
+    virtual void prepareReplSetUpdatePositionCommandHandshakes(
+        std::vector<BSONObj>* handshakes) = 0;
+
+    /**
+     * Handles an incoming replSetGetStatus command. Adds BSON to 'result'.
+     */
+    virtual Status processReplSetGetStatus(BSONObjBuilder* result) = 0;
+
+    /**
+     * Handles an incoming isMaster command for a replica set node. Should not be
+     * called on a master-slave or standalone node.
+     */
+    virtual void fillIsMasterForReplSet(IsMasterResponse* result) = 0;
+
+    /**
+     * Adds to "result" a description of the slaveInfo data structure used to map RIDs to their
+     * last known optimes.
+     */
+    virtual void appendSlaveInfoData(BSONObjBuilder* result) = 0;
+
+    /**
+     * Handles an incoming replSetGetConfig command. Adds BSON to 'result'.
+     */
+    virtual void processReplSetGetConfig(BSONObjBuilder* result) = 0;
+
+    /**
+     * Toggles maintenanceMode to the value expressed by 'activate'.
+     * Returns Status::OK if the change worked, NotSecondary if it failed because we are
+     * PRIMARY, and OperationFailed if we are not currently in maintenance mode.
+     */
+    virtual Status setMaintenanceMode(bool activate) = 0;
+
+    /**
+     * Retrieves the current count of maintenanceMode and returns 'true' if greater than 0.
+     */
+    virtual bool getMaintenanceMode() = 0;
+
+    /**
+     * Handles an incoming replSetSyncFrom command. Adds BSON to 'resultObj';
+     * returns Status::OK if the sync target could be set, and an ErrorCode indicating why it
+     * couldn't otherwise.
+     */
+    virtual Status processReplSetSyncFrom(const HostAndPort& target, BSONObjBuilder* resultObj) = 0;
+
+    /**
+     * Handles an incoming replSetFreeze command. Adds BSON to 'resultObj';
+     * returns Status::OK() if the node is a member of a replica set with a config, and an
+     * error Status otherwise.
+     */
+    virtual Status processReplSetFreeze(int secs, BSONObjBuilder* resultObj) = 0;
+
+    /**
+     * Handles an incoming heartbeat command with arguments 'args'. Populates 'response';
+     * returns a Status with either OK or an error message.
+     */
+    virtual Status processHeartbeat(const ReplSetHeartbeatArgs& args,
+                                    ReplSetHeartbeatResponse* response) = 0;
+
+    /**
+     * Arguments for the replSetReconfig command.
+     */
+    struct ReplSetReconfigArgs {
+        BSONObj newConfigObj;
+        bool force;
    };
-} // namespace repl
-} // namespace mongo
+    /**
+     * Handles an incoming replSetReconfig command. Adds BSON to 'resultObj';
+     * returns a Status with either OK or an error message.
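+     *
+     * Hypothetical command-handler sketch ('cmdObj' is the incoming command document and
+     * 'resultObj' a BSONObjBuilder; both names are illustrative):
+     *
+     *     ReplSetReconfigArgs args;
+     *     args.newConfigObj = cmdObj["replSetReconfig"].Obj();
+     *     args.force = cmdObj["force"].trueValue();
+     *     return replCoord->processReplSetReconfig(txn, args, &resultObj);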
+ */ + virtual Status processReplSetReconfig(OperationContext* txn, + const ReplSetReconfigArgs& args, + BSONObjBuilder* resultObj) = 0; + + /* + * Handles an incoming replSetInitiate command. If "configObj" is empty, generates a default + * configuration to use. + * Adds BSON to 'resultObj'; returns a Status with either OK or an error message. + */ + virtual Status processReplSetInitiate(OperationContext* txn, + const BSONObj& configObj, + BSONObjBuilder* resultObj) = 0; + + /* + * Handles an incoming replSetGetRBID command. + * Adds BSON to 'resultObj'; returns a Status with either OK or an error message. + */ + virtual Status processReplSetGetRBID(BSONObjBuilder* resultObj) = 0; + + /** + * Increments this process's rollback id. Called every time a rollback occurs. + */ + virtual void incrementRollbackID() = 0; + + /** + * Arguments to the replSetFresh command. + */ + struct ReplSetFreshArgs { + StringData setName; // Name of the replset + HostAndPort who; // host and port of the member that sent the replSetFresh command + unsigned id; // replSet id of the member that sent the replSetFresh command + int cfgver; // replSet config version that the member who sent the command thinks it has + OpTime opTime; // last optime seen by the member who sent the replSetFresh command + }; + + /* + * Handles an incoming replSetFresh command. + * Adds BSON to 'resultObj'; returns a Status with either OK or an error message. + */ + virtual Status processReplSetFresh(const ReplSetFreshArgs& args, BSONObjBuilder* resultObj) = 0; + + /** + * Arguments to the replSetElect command. + */ + struct ReplSetElectArgs { + StringData set; // Name of the replset + int whoid; // replSet id of the member that sent the replSetFresh command + int cfgver; // replSet config version that the member who sent the command thinks it has + OID round; // unique ID for this election + }; + + /* + * Handles an incoming replSetElect command. + * Adds BSON to 'resultObj'; returns a Status with either OK or an error message. + */ + virtual Status processReplSetElect(const ReplSetElectArgs& args, BSONObjBuilder* resultObj) = 0; + + /** + * Handles an incoming replSetUpdatePosition command, updating each node's oplog progress. + * Returns Status::OK() if all updates are processed correctly, NodeNotFound + * if any updating node cannot be found in the config, InvalidReplicaSetConfig if the + * "cfgver" sent in any of the updates doesn't match our config version, or + * NotMasterOrSecondaryCode if we are in state REMOVED or otherwise don't have a valid + * replica set config. + * If a non-OK status is returned, it is unspecified whether none or some of the updates + * were applied. + */ + virtual Status processReplSetUpdatePosition(const UpdatePositionArgs& updates) = 0; + + /** + * Handles an incoming Handshake command (or a handshake from replSetUpdatePosition). + * Associates the node's 'remoteID' with its 'handshake' object. This association is used + * to update local.slaves and to forward the node's replication progress upstream when this + * node is being chained through. + * + * Returns ErrorCodes::NodeNotFound if no replica set member exists with the given member ID + * and ErrorCodes::NotMasterOrSecondaryCode if we're in state REMOVED or otherwise don't + * have a valid config. + */ + virtual Status processHandshake(OperationContext* txn, const HandshakeArgs& handshake) = 0; + + /** + * Returns a bool indicating whether or not this node builds indexes. 
+ */ + virtual bool buildsIndexes() = 0; + + /** + * Returns a vector of members that have applied the operation with OpTime 'op'. + */ + virtual std::vector<HostAndPort> getHostsWrittenTo(const OpTime& op) = 0; + + /** + * Returns a vector of the members other than ourself in the replica set, as specified in + * the replica set config. Invalid to call if we are not in replica set mode. Returns + * an empty vector if we do not have a valid config. + */ + virtual std::vector<HostAndPort> getOtherNodesInReplSet() const = 0; + + /** + * Returns a BSONObj containing a representation of the current default write concern. + */ + virtual WriteConcernOptions getGetLastErrorDefault() = 0; + + /** + * Checks that the --replSet flag was passed when starting up the node and that the node + * has a valid replica set config. + * + * Returns a Status indicating whether those conditions are met with errorcode + * NoReplicationEnabled if --replSet was not present during start up or with errorcode + * NotYetInitialized in the absence of a valid config. Also adds error info to "result". + */ + virtual Status checkReplEnabledForCommand(BSONObjBuilder* result) = 0; + + /** + * Chooses a viable sync source, or, if none available, returns empty HostAndPort. + */ + virtual HostAndPort chooseNewSyncSource(const OpTime& lastOpTimeFetched) = 0; + + /** + * Blacklists choosing 'host' as a sync source until time 'until'. + */ + virtual void blacklistSyncSource(const HostAndPort& host, Date_t until) = 0; + + /** + * Loads the optime from the last op in the oplog into the coordinator's lastOpApplied + * value. + */ + virtual void resetLastOpTimeFromOplog(OperationContext* txn) = 0; + + /** + * Determines if a new sync source should be considered. + * currentSource: the current sync source + */ + virtual bool shouldChangeSyncSource(const HostAndPort& currentSource) = 0; + + /** + * Writes into 'output' all the information needed to generate a summary of the current + * replication state for use by the web interface. 
+ */ + virtual void summarizeAsHtml(ReplSetHtmlSummary* output) = 0; + +protected: + ReplicationCoordinator(); +}; + +} // namespace repl +} // namespace mongo diff --git a/src/mongo/db/repl/replication_coordinator_external_state.cpp b/src/mongo/db/repl/replication_coordinator_external_state.cpp index 68403755b07..fbeddfba68a 100644 --- a/src/mongo/db/repl/replication_coordinator_external_state.cpp +++ b/src/mongo/db/repl/replication_coordinator_external_state.cpp @@ -33,8 +33,8 @@ namespace mongo { namespace repl { - ReplicationCoordinatorExternalState::ReplicationCoordinatorExternalState() {} - ReplicationCoordinatorExternalState::~ReplicationCoordinatorExternalState() {} +ReplicationCoordinatorExternalState::ReplicationCoordinatorExternalState() {} +ReplicationCoordinatorExternalState::~ReplicationCoordinatorExternalState() {} -} // namespace repl -} // namespace mongo +} // namespace repl +} // namespace mongo diff --git a/src/mongo/db/repl/replication_coordinator_external_state.h b/src/mongo/db/repl/replication_coordinator_external_state.h index bb44acf5fbe..4d654dfc148 100644 --- a/src/mongo/db/repl/replication_coordinator_external_state.h +++ b/src/mongo/db/repl/replication_coordinator_external_state.h @@ -36,144 +36,145 @@ namespace mongo { - class BSONObj; - class OID; - class OperationContext; - class Status; - struct HostAndPort; - template <typename T> class StatusWith; +class BSONObj; +class OID; +class OperationContext; +class Status; +struct HostAndPort; +template <typename T> +class StatusWith; namespace repl { +/** + * This class represents the interface the ReplicationCoordinator uses to interact with the + * rest of the system. All functionality of the ReplicationCoordinatorImpl that would introduce + * dependencies on large sections of the server code and thus break the unit testability of + * ReplicationCoordinatorImpl should be moved here. + */ +class ReplicationCoordinatorExternalState { + MONGO_DISALLOW_COPYING(ReplicationCoordinatorExternalState); + +public: + ReplicationCoordinatorExternalState(); + virtual ~ReplicationCoordinatorExternalState(); + + /** + * Starts the background sync, producer, and sync source feedback threads + * + * NOTE: Only starts threads if they are not already started, + */ + virtual void startThreads() = 0; + + /** + * Starts the Master/Slave threads and sets up logOp + */ + virtual void startMasterSlave(OperationContext* txn) = 0; + + /** + * Performs any necessary external state specific shutdown tasks, such as cleaning up + * the threads it started. + */ + virtual void shutdown() = 0; + + /** + * Creates the oplog and writes the first entry. + */ + virtual void initiateOplog(OperationContext* txn) = 0; + + /** + * Simple wrapper around SyncSourceFeedback::forwardSlaveHandshake. Signals to the + * SyncSourceFeedback thread that it needs to wake up and send a replication handshake + * upstream. + */ + virtual void forwardSlaveHandshake() = 0; + + /** + * Simple wrapper around SyncSourceFeedback::forwardSlaveProgress. Signals to the + * SyncSourceFeedback thread that it needs to wake up and send a replSetUpdatePosition + * command upstream. + */ + virtual void forwardSlaveProgress() = 0; + + /** + * Queries the singleton document in local.me. If it exists and our hostname has not + * changed since we wrote, returns the RID stored in the object. If the document does not + * exist or our hostname doesn't match what was recorded in local.me, generates a new OID + * to use as our RID, stores it in local.me, and returns it. 
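+     *
+     * Illustrative shape of the local.me singleton (the "_id" and "host" field names come
+     * from the implementation in this patch; the values here are made up):
+     *
+     *     { _id: ObjectId("55b7f79c..."), host: "db1.example.net:27017" }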
+ */ + virtual OID ensureMe(OperationContext*) = 0; + + /** + * Returns true if "host" is one of the network identities of this node. + */ + virtual bool isSelf(const HostAndPort& host) = 0; + + /** + * Gets the replica set config document from local storage, or returns an error. + */ + virtual StatusWith<BSONObj> loadLocalConfigDocument(OperationContext* txn) = 0; + + /** + * Stores the replica set config document in local storage, or returns an error. + */ + virtual Status storeLocalConfigDocument(OperationContext* txn, const BSONObj& config) = 0; + + /** + * Sets the global opTime to be 'newTime'. + */ + virtual void setGlobalOpTime(const OpTime& newTime) = 0; + + /** + * Gets the last optime of an operation performed on this host, from stable + * storage. + */ + virtual StatusWith<OpTime> loadLastOpTime(OperationContext* txn) = 0; + + /** + * Returns the HostAndPort of the remote client connected to us that initiated the operation + * represented by "txn". + */ + virtual HostAndPort getClientHostAndPort(const OperationContext* txn) = 0; + + /** + * Closes all connections except those marked with the keepOpen property, which should + * just be connections used for heartbeating. + * This is used during stepdown, and transition out of primary. + */ + virtual void closeConnections() = 0; + + /** + * Kills all operations that have a Client that is associated with an incoming user + * connection. Used during stepdown. + */ + virtual void killAllUserOperations(OperationContext* txn) = 0; + /** - * This class represents the interface the ReplicationCoordinator uses to interact with the - * rest of the system. All functionality of the ReplicationCoordinatorImpl that would introduce - * dependencies on large sections of the server code and thus break the unit testability of - * ReplicationCoordinatorImpl should be moved here. - */ - class ReplicationCoordinatorExternalState { - MONGO_DISALLOW_COPYING(ReplicationCoordinatorExternalState); - public: - - ReplicationCoordinatorExternalState(); - virtual ~ReplicationCoordinatorExternalState(); - - /** - * Starts the background sync, producer, and sync source feedback threads - * - * NOTE: Only starts threads if they are not already started, - */ - virtual void startThreads() = 0; - - /** - * Starts the Master/Slave threads and sets up logOp - */ - virtual void startMasterSlave(OperationContext* txn) = 0; - - /** - * Performs any necessary external state specific shutdown tasks, such as cleaning up - * the threads it started. - */ - virtual void shutdown() = 0; - - /** - * Creates the oplog and writes the first entry. - */ - virtual void initiateOplog(OperationContext* txn) = 0; - - /** - * Simple wrapper around SyncSourceFeedback::forwardSlaveHandshake. Signals to the - * SyncSourceFeedback thread that it needs to wake up and send a replication handshake - * upstream. - */ - virtual void forwardSlaveHandshake() = 0; - - /** - * Simple wrapper around SyncSourceFeedback::forwardSlaveProgress. Signals to the - * SyncSourceFeedback thread that it needs to wake up and send a replSetUpdatePosition - * command upstream. - */ - virtual void forwardSlaveProgress() = 0; - - /** - * Queries the singleton document in local.me. If it exists and our hostname has not - * changed since we wrote, returns the RID stored in the object. If the document does not - * exist or our hostname doesn't match what was recorded in local.me, generates a new OID - * to use as our RID, stores it in local.me, and returns it. 
- */ - virtual OID ensureMe(OperationContext*) = 0; - - /** - * Returns true if "host" is one of the network identities of this node. - */ - virtual bool isSelf(const HostAndPort& host) = 0; - - /** - * Gets the replica set config document from local storage, or returns an error. - */ - virtual StatusWith<BSONObj> loadLocalConfigDocument(OperationContext* txn) = 0; - - /** - * Stores the replica set config document in local storage, or returns an error. - */ - virtual Status storeLocalConfigDocument(OperationContext* txn, const BSONObj& config) = 0; - - /** - * Sets the global opTime to be 'newTime'. - */ - virtual void setGlobalOpTime(const OpTime& newTime) = 0; - - /** - * Gets the last optime of an operation performed on this host, from stable - * storage. - */ - virtual StatusWith<OpTime> loadLastOpTime(OperationContext* txn) = 0; - - /** - * Returns the HostAndPort of the remote client connected to us that initiated the operation - * represented by "txn". - */ - virtual HostAndPort getClientHostAndPort(const OperationContext* txn) = 0; - - /** - * Closes all connections except those marked with the keepOpen property, which should - * just be connections used for heartbeating. - * This is used during stepdown, and transition out of primary. - */ - virtual void closeConnections() = 0; - - /** - * Kills all operations that have a Client that is associated with an incoming user - * connection. Used during stepdown. - */ - virtual void killAllUserOperations(OperationContext* txn) = 0; - - /** - * Clears all cached sharding metadata on this server. This is called after stepDown to - * ensure that if the node becomes primary again in the future it will reload an up-to-date - * version of the sharding data. - */ - virtual void clearShardingState() = 0; - - /** - * Notifies the bgsync and syncSourceFeedback threads to choose a new sync source. - */ - virtual void signalApplierToChooseNewSyncSource() = 0; - - /** - * Returns an OperationContext, owned by the caller, that may be used in methods of - * the same instance that require an OperationContext. - */ - virtual OperationContext* createOperationContext(const std::string& threadName) = 0; - - /** - * Drops all temporary collections on all databases except "local". - * - * The implementation may assume that the caller has acquired the global exclusive lock - * for "txn". - */ - virtual void dropAllTempCollections(OperationContext* txn) = 0; - }; - -} // namespace repl -} // namespace mongo + * Clears all cached sharding metadata on this server. This is called after stepDown to + * ensure that if the node becomes primary again in the future it will reload an up-to-date + * version of the sharding data. + */ + virtual void clearShardingState() = 0; + + /** + * Notifies the bgsync and syncSourceFeedback threads to choose a new sync source. + */ + virtual void signalApplierToChooseNewSyncSource() = 0; + + /** + * Returns an OperationContext, owned by the caller, that may be used in methods of + * the same instance that require an OperationContext. + */ + virtual OperationContext* createOperationContext(const std::string& threadName) = 0; + + /** + * Drops all temporary collections on all databases except "local". + * + * The implementation may assume that the caller has acquired the global exclusive lock + * for "txn". 
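+     *
+     * Hypothetical call-site sketch ('externalState' is an illustrative name; the lock
+     * types match those used for initiateOplog() in this patch):
+     *
+     *     ScopedTransaction scopedXact(txn, MODE_X);
+     *     Lock::GlobalWrite globalWrite(txn->lockState());
+     *     externalState->dropAllTempCollections(txn);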
+ */ + virtual void dropAllTempCollections(OperationContext* txn) = 0; +}; + +} // namespace repl +} // namespace mongo diff --git a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp index e2472bd3406..03ad878aac9 100644 --- a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp +++ b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp @@ -65,229 +65,216 @@ namespace mongo { namespace repl { namespace { - const char configCollectionName[] = "local.system.replset"; - const char configDatabaseName[] = "local"; - const char meCollectionName[] = "local.me"; - const char meDatabaseName[] = "local"; - const char tsFieldName[] = "ts"; +const char configCollectionName[] = "local.system.replset"; +const char configDatabaseName[] = "local"; +const char meCollectionName[] = "local.me"; +const char meDatabaseName[] = "local"; +const char tsFieldName[] = "ts"; } // namespace - ReplicationCoordinatorExternalStateImpl::ReplicationCoordinatorExternalStateImpl() : - _startedThreads(false) - , _nextThreadId(0) {} - ReplicationCoordinatorExternalStateImpl::~ReplicationCoordinatorExternalStateImpl() {} +ReplicationCoordinatorExternalStateImpl::ReplicationCoordinatorExternalStateImpl() + : _startedThreads(false), _nextThreadId(0) {} +ReplicationCoordinatorExternalStateImpl::~ReplicationCoordinatorExternalStateImpl() {} - void ReplicationCoordinatorExternalStateImpl::startThreads() { - boost::lock_guard<boost::mutex> lk(_threadMutex); - if (_startedThreads) { - return; - } - log() << "Starting replication applier threads"; - _applierThread.reset(new boost::thread(runSyncThread)); +void ReplicationCoordinatorExternalStateImpl::startThreads() { + boost::lock_guard<boost::mutex> lk(_threadMutex); + if (_startedThreads) { + return; + } + log() << "Starting replication applier threads"; + _applierThread.reset(new boost::thread(runSyncThread)); + BackgroundSync* bgsync = BackgroundSync::get(); + _producerThread.reset(new boost::thread(stdx::bind(&BackgroundSync::producerThread, bgsync))); + _syncSourceFeedbackThread.reset( + new boost::thread(stdx::bind(&SyncSourceFeedback::run, &_syncSourceFeedback))); + _startedThreads = true; +} + +void ReplicationCoordinatorExternalStateImpl::startMasterSlave(OperationContext* txn) { + repl::startMasterSlave(txn); +} + +void ReplicationCoordinatorExternalStateImpl::shutdown() { + boost::lock_guard<boost::mutex> lk(_threadMutex); + if (_startedThreads) { + log() << "Stopping replication applier threads"; + _syncSourceFeedback.shutdown(); + _syncSourceFeedbackThread->join(); + _applierThread->join(); BackgroundSync* bgsync = BackgroundSync::get(); - _producerThread.reset(new boost::thread(stdx::bind(&BackgroundSync::producerThread, - bgsync))); - _syncSourceFeedbackThread.reset(new boost::thread(stdx::bind(&SyncSourceFeedback::run, - &_syncSourceFeedback))); - _startedThreads = true; + bgsync->shutdown(); + _producerThread->join(); } - - void ReplicationCoordinatorExternalStateImpl::startMasterSlave(OperationContext* txn) { - repl::startMasterSlave(txn); +} + +void ReplicationCoordinatorExternalStateImpl::initiateOplog(OperationContext* txn) { + createOplog(txn); + + MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN { + ScopedTransaction scopedXact(txn, MODE_X); + Lock::GlobalWrite globalWrite(txn->lockState()); + WriteUnitOfWork wuow(txn); + logOpInitiate(txn, + BSON("msg" + << "initiating set")); + wuow.commit(); } - - void ReplicationCoordinatorExternalStateImpl::shutdown() { - 
boost::lock_guard<boost::mutex> lk(_threadMutex); - if (_startedThreads) { - log() << "Stopping replication applier threads"; - _syncSourceFeedback.shutdown(); - _syncSourceFeedbackThread->join(); - _applierThread->join(); - BackgroundSync* bgsync = BackgroundSync::get(); - bgsync->shutdown(); - _producerThread->join(); + MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "initiate oplog entry", "local.oplog.rs"); +} + +void ReplicationCoordinatorExternalStateImpl::forwardSlaveHandshake() { + _syncSourceFeedback.forwardSlaveHandshake(); +} + +void ReplicationCoordinatorExternalStateImpl::forwardSlaveProgress() { + _syncSourceFeedback.forwardSlaveProgress(); +} + +OID ReplicationCoordinatorExternalStateImpl::ensureMe(OperationContext* txn) { + std::string myname = getHostName(); + OID myRID; + { + ScopedTransaction transaction(txn, MODE_IX); + Lock::DBLock lock(txn->lockState(), meDatabaseName, MODE_X); + + BSONObj me; + // local.me is an identifier for a server for getLastError w:2+ + // TODO: handle WriteConflictExceptions below + if (!Helpers::getSingleton(txn, meCollectionName, me) || !me.hasField("host") || + me["host"].String() != myname) { + myRID = OID::gen(); + + // clean out local.me + Helpers::emptyCollection(txn, meCollectionName); + + // repopulate + BSONObjBuilder b; + b.append("_id", myRID); + b.append("host", myname); + Helpers::putSingleton(txn, meCollectionName, b.done()); + } else { + myRID = me["_id"].OID(); } } + return myRID; +} - void ReplicationCoordinatorExternalStateImpl::initiateOplog(OperationContext* txn) { - createOplog(txn); - +StatusWith<BSONObj> ReplicationCoordinatorExternalStateImpl::loadLocalConfigDocument( + OperationContext* txn) { + try { MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN { - ScopedTransaction scopedXact(txn, MODE_X); - Lock::GlobalWrite globalWrite(txn->lockState()); - WriteUnitOfWork wuow(txn); - logOpInitiate(txn, BSON("msg" << "initiating set")); - wuow.commit(); - } MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "initiate oplog entry", "local.oplog.rs"); - } - - void ReplicationCoordinatorExternalStateImpl::forwardSlaveHandshake() { - _syncSourceFeedback.forwardSlaveHandshake(); - } - - void ReplicationCoordinatorExternalStateImpl::forwardSlaveProgress() { - _syncSourceFeedback.forwardSlaveProgress(); - } - - OID ReplicationCoordinatorExternalStateImpl::ensureMe(OperationContext* txn) { - std::string myname = getHostName(); - OID myRID; - { - ScopedTransaction transaction(txn, MODE_IX); - Lock::DBLock lock(txn->lockState(), meDatabaseName, MODE_X); - - BSONObj me; - // local.me is an identifier for a server for getLastError w:2+ - // TODO: handle WriteConflictExceptions below - if (!Helpers::getSingleton(txn, meCollectionName, me) || - !me.hasField("host") || - me["host"].String() != myname) { - - myRID = OID::gen(); - - // clean out local.me - Helpers::emptyCollection(txn, meCollectionName); - - // repopulate - BSONObjBuilder b; - b.append("_id", myRID); - b.append("host", myname); - Helpers::putSingleton(txn, meCollectionName, b.done()); - } else { - myRID = me["_id"].OID(); + BSONObj config; + if (!Helpers::getSingleton(txn, configCollectionName, config)) { + return StatusWith<BSONObj>( + ErrorCodes::NoMatchingDocument, + str::stream() << "Did not find replica set configuration document in " + << configCollectionName); } + return StatusWith<BSONObj>(config); } - return myRID; + MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "load replica set config", configCollectionName); + } catch (const DBException& ex) { + return StatusWith<BSONObj>(ex.toStatus()); } +} - 
StatusWith<BSONObj> ReplicationCoordinatorExternalStateImpl::loadLocalConfigDocument( - OperationContext* txn) { - try { - MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN { - BSONObj config; - if (!Helpers::getSingleton(txn, configCollectionName, config)) { - return StatusWith<BSONObj>( - ErrorCodes::NoMatchingDocument, - str::stream() << "Did not find replica set configuration document in " - << configCollectionName); - } - return StatusWith<BSONObj>(config); - } MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, - "load replica set config", - configCollectionName); - } - catch (const DBException& ex) { - return StatusWith<BSONObj>(ex.toStatus()); +Status ReplicationCoordinatorExternalStateImpl::storeLocalConfigDocument(OperationContext* txn, + const BSONObj& config) { + try { + MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN { + ScopedTransaction transaction(txn, MODE_IX); + Lock::DBLock dbWriteLock(txn->lockState(), configDatabaseName, MODE_X); + Helpers::putSingleton(txn, configCollectionName, config); + return Status::OK(); } + MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "save replica set config", configCollectionName); + } catch (const DBException& ex) { + return ex.toStatus(); } - - Status ReplicationCoordinatorExternalStateImpl::storeLocalConfigDocument( - OperationContext* txn, - const BSONObj& config) { - try { - MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN { - ScopedTransaction transaction(txn, MODE_IX); - Lock::DBLock dbWriteLock(txn->lockState(), configDatabaseName, MODE_X); - Helpers::putSingleton(txn, configCollectionName, config); - return Status::OK(); - } MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, - "save replica set config", - configCollectionName); +} + +void ReplicationCoordinatorExternalStateImpl::setGlobalOpTime(const OpTime& newTime) { + setNewOptime(newTime); +} + +StatusWith<OpTime> ReplicationCoordinatorExternalStateImpl::loadLastOpTime(OperationContext* txn) { + // TODO: handle WriteConflictExceptions below + try { + BSONObj oplogEntry; + if (!Helpers::getLast(txn, rsoplog, oplogEntry)) { + return StatusWith<OpTime>(ErrorCodes::NoMatchingDocument, + str::stream() << "Did not find any entries in " << rsoplog); } - catch (const DBException& ex) { - return ex.toStatus(); + BSONElement tsElement = oplogEntry[tsFieldName]; + if (tsElement.eoo()) { + return StatusWith<OpTime>(ErrorCodes::NoSuchKey, + str::stream() << "Most recent entry in " << rsoplog + << " missing \"" << tsFieldName << "\" field"); } - - } - - void ReplicationCoordinatorExternalStateImpl::setGlobalOpTime(const OpTime& newTime) { - setNewOptime(newTime); - } - - StatusWith<OpTime> ReplicationCoordinatorExternalStateImpl::loadLastOpTime( - OperationContext* txn) { - - // TODO: handle WriteConflictExceptions below - try { - BSONObj oplogEntry; - if (!Helpers::getLast(txn, rsoplog, oplogEntry)) { - return StatusWith<OpTime>( - ErrorCodes::NoMatchingDocument, - str::stream() << "Did not find any entries in " << rsoplog); - } - BSONElement tsElement = oplogEntry[tsFieldName]; - if (tsElement.eoo()) { - return StatusWith<OpTime>( - ErrorCodes::NoSuchKey, - str::stream() << "Most recent entry in " << rsoplog << " missing \"" << - tsFieldName << "\" field"); - } - if (tsElement.type() != Timestamp) { - return StatusWith<OpTime>( - ErrorCodes::TypeMismatch, - str::stream() << "Expected type of \"" << tsFieldName << - "\" in most recent " << rsoplog << - " entry to have type Timestamp, but found " << typeName(tsElement.type())); - } - return StatusWith<OpTime>(tsElement._opTime()); - } - catch (const DBException& ex) { - return 
StatusWith<OpTime>(ex.toStatus()); + if (tsElement.type() != Timestamp) { + return StatusWith<OpTime>(ErrorCodes::TypeMismatch, + str::stream() << "Expected type of \"" << tsFieldName + << "\" in most recent " << rsoplog + << " entry to have type Timestamp, but found " + << typeName(tsElement.type())); } + return StatusWith<OpTime>(tsElement._opTime()); + } catch (const DBException& ex) { + return StatusWith<OpTime>(ex.toStatus()); } - - bool ReplicationCoordinatorExternalStateImpl::isSelf(const HostAndPort& host) { - return repl::isSelf(host); - - } - - HostAndPort ReplicationCoordinatorExternalStateImpl::getClientHostAndPort( - const OperationContext* txn) { - return HostAndPort(txn->getClient()->clientAddress(true)); - } - - void ReplicationCoordinatorExternalStateImpl::closeConnections() { - MessagingPort::closeAllSockets(ScopedConn::keepOpen); - } - - void ReplicationCoordinatorExternalStateImpl::killAllUserOperations(OperationContext* txn) { - GlobalEnvironmentExperiment* environment = getGlobalEnvironment(); - environment->killAllUserOperations(txn); - } - - void ReplicationCoordinatorExternalStateImpl::clearShardingState() { - shardingState.resetShardingState(); - } - - void ReplicationCoordinatorExternalStateImpl::signalApplierToChooseNewSyncSource() { - BackgroundSync::get()->clearSyncTarget(); - } - - OperationContext* ReplicationCoordinatorExternalStateImpl::createOperationContext( - const std::string& threadName) { - Client::initThreadIfNotAlready(threadName.c_str()); - return new OperationContextImpl; - } - - void ReplicationCoordinatorExternalStateImpl::dropAllTempCollections(OperationContext* txn) { - std::vector<std::string> dbNames; - StorageEngine* storageEngine = getGlobalEnvironment()->getGlobalStorageEngine(); - storageEngine->listDatabases(&dbNames); - - for (std::vector<std::string>::iterator it = dbNames.begin(); it != dbNames.end(); ++it) { - // The local db is special because it isn't replicated. It is cleared at startup even on - // replica set members. - if (*it == "local") - continue; - LOG(2) << "Removing temporary collections from " << *it; - Database* db = dbHolder().get(txn, *it); - // Since we must be holding the global lock during this function, if listDatabases - // returned this dbname, we should be able to get a reference to it - it can't have - // been dropped. 
- invariant(db); - db->clearTmpCollections(txn); - } +} + +bool ReplicationCoordinatorExternalStateImpl::isSelf(const HostAndPort& host) { + return repl::isSelf(host); +} + +HostAndPort ReplicationCoordinatorExternalStateImpl::getClientHostAndPort( + const OperationContext* txn) { + return HostAndPort(txn->getClient()->clientAddress(true)); +} + +void ReplicationCoordinatorExternalStateImpl::closeConnections() { + MessagingPort::closeAllSockets(ScopedConn::keepOpen); +} + +void ReplicationCoordinatorExternalStateImpl::killAllUserOperations(OperationContext* txn) { + GlobalEnvironmentExperiment* environment = getGlobalEnvironment(); + environment->killAllUserOperations(txn); +} + +void ReplicationCoordinatorExternalStateImpl::clearShardingState() { + shardingState.resetShardingState(); +} + +void ReplicationCoordinatorExternalStateImpl::signalApplierToChooseNewSyncSource() { + BackgroundSync::get()->clearSyncTarget(); +} + +OperationContext* ReplicationCoordinatorExternalStateImpl::createOperationContext( + const std::string& threadName) { + Client::initThreadIfNotAlready(threadName.c_str()); + return new OperationContextImpl; +} + +void ReplicationCoordinatorExternalStateImpl::dropAllTempCollections(OperationContext* txn) { + std::vector<std::string> dbNames; + StorageEngine* storageEngine = getGlobalEnvironment()->getGlobalStorageEngine(); + storageEngine->listDatabases(&dbNames); + + for (std::vector<std::string>::iterator it = dbNames.begin(); it != dbNames.end(); ++it) { + // The local db is special because it isn't replicated. It is cleared at startup even on + // replica set members. + if (*it == "local") + continue; + LOG(2) << "Removing temporary collections from " << *it; + Database* db = dbHolder().get(txn, *it); + // Since we must be holding the global lock during this function, if listDatabases + // returned this dbname, we should be able to get a reference to it - it can't have + // been dropped. 
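+        // If that assumption is ever violated, invariant() aborts the process rather than
+        // letting us continue with a NULL Database pointer.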
+ invariant(db); + db->clearTmpCollections(txn); } +} -} // namespace repl -} // namespace mongo +} // namespace repl +} // namespace mongo diff --git a/src/mongo/db/repl/replication_coordinator_external_state_impl.h b/src/mongo/db/repl/replication_coordinator_external_state_impl.h index ed4c01b5823..7918479cbc1 100644 --- a/src/mongo/db/repl/replication_coordinator_external_state_impl.h +++ b/src/mongo/db/repl/replication_coordinator_external_state_impl.h @@ -39,60 +39,60 @@ namespace mongo { namespace repl { - class ReplicationCoordinatorExternalStateImpl : public ReplicationCoordinatorExternalState { - MONGO_DISALLOW_COPYING(ReplicationCoordinatorExternalStateImpl); - public: +class ReplicationCoordinatorExternalStateImpl : public ReplicationCoordinatorExternalState { + MONGO_DISALLOW_COPYING(ReplicationCoordinatorExternalStateImpl); - ReplicationCoordinatorExternalStateImpl(); - virtual ~ReplicationCoordinatorExternalStateImpl(); - virtual void startThreads(); - virtual void startMasterSlave(OperationContext* txn); - virtual void shutdown(); - virtual void initiateOplog(OperationContext* txn); - virtual void forwardSlaveHandshake(); - virtual void forwardSlaveProgress(); - virtual OID ensureMe(OperationContext* txn); - virtual bool isSelf(const HostAndPort& host); - virtual StatusWith<BSONObj> loadLocalConfigDocument(OperationContext* txn); - virtual Status storeLocalConfigDocument(OperationContext* txn, const BSONObj& config); - virtual void setGlobalOpTime(const OpTime& newTime); - virtual StatusWith<OpTime> loadLastOpTime(OperationContext* txn); - virtual HostAndPort getClientHostAndPort(const OperationContext* txn); - virtual void closeConnections(); - virtual void killAllUserOperations(OperationContext* txn); - virtual void clearShardingState(); - virtual void signalApplierToChooseNewSyncSource(); - virtual OperationContext* createOperationContext(const std::string& threadName); - virtual void dropAllTempCollections(OperationContext* txn); +public: + ReplicationCoordinatorExternalStateImpl(); + virtual ~ReplicationCoordinatorExternalStateImpl(); + virtual void startThreads(); + virtual void startMasterSlave(OperationContext* txn); + virtual void shutdown(); + virtual void initiateOplog(OperationContext* txn); + virtual void forwardSlaveHandshake(); + virtual void forwardSlaveProgress(); + virtual OID ensureMe(OperationContext* txn); + virtual bool isSelf(const HostAndPort& host); + virtual StatusWith<BSONObj> loadLocalConfigDocument(OperationContext* txn); + virtual Status storeLocalConfigDocument(OperationContext* txn, const BSONObj& config); + virtual void setGlobalOpTime(const OpTime& newTime); + virtual StatusWith<OpTime> loadLastOpTime(OperationContext* txn); + virtual HostAndPort getClientHostAndPort(const OperationContext* txn); + virtual void closeConnections(); + virtual void killAllUserOperations(OperationContext* txn); + virtual void clearShardingState(); + virtual void signalApplierToChooseNewSyncSource(); + virtual OperationContext* createOperationContext(const std::string& threadName); + virtual void dropAllTempCollections(OperationContext* txn); - std::string getNextOpContextThreadName(); + std::string getNextOpContextThreadName(); - private: - // Guards starting threads and setting _startedThreads - boost::mutex _threadMutex; +private: + // Guards starting threads and setting _startedThreads + boost::mutex _threadMutex; - // True when the threads have been started - bool _startedThreads; + // True when the threads have been started + bool _startedThreads; - // The 
SyncSourceFeedback class is responsible for sending replSetUpdatePosition commands - // for forwarding replication progress information upstream when there is chained - // replication. - SyncSourceFeedback _syncSourceFeedback; + // The SyncSourceFeedback class is responsible for sending replSetUpdatePosition commands + // for forwarding replication progress information upstream when there is chained + // replication. + SyncSourceFeedback _syncSourceFeedback; - // Thread running SyncSourceFeedback::run(). - boost::scoped_ptr<boost::thread> _syncSourceFeedbackThread; + // Thread running SyncSourceFeedback::run(). + boost::scoped_ptr<boost::thread> _syncSourceFeedbackThread; - // Thread running runSyncThread(). - boost::scoped_ptr<boost::thread> _applierThread; + // Thread running runSyncThread(). + boost::scoped_ptr<boost::thread> _applierThread; - // Thread running BackgroundSync::producerThread(). - boost::scoped_ptr<boost::thread> _producerThread; + // Thread running BackgroundSync::producerThread(). + boost::scoped_ptr<boost::thread> _producerThread; - // Mutex guarding the _nextThreadId value to prevent concurrent incrementing. - boost::mutex _nextThreadIdMutex; - // Number used to uniquely name threads. - long long _nextThreadId; - }; + // Mutex guarding the _nextThreadId value to prevent concurrent incrementing. + boost::mutex _nextThreadIdMutex; + // Number used to uniquely name threads. + long long _nextThreadId; +}; -} // namespace repl -} // namespace mongo +} // namespace repl +} // namespace mongo diff --git a/src/mongo/db/repl/replication_coordinator_external_state_mock.cpp b/src/mongo/db/repl/replication_coordinator_external_state_mock.cpp index e88cf78d9ce..941575a7f26 100644 --- a/src/mongo/db/repl/replication_coordinator_external_state_mock.cpp +++ b/src/mongo/db/repl/replication_coordinator_external_state_mock.cpp @@ -42,114 +42,108 @@ namespace mongo { namespace repl { - ReplicationCoordinatorExternalStateMock::ReplicationCoordinatorExternalStateMock() - : _localRsConfigDocument(ErrorCodes::NoMatchingDocument, "No local config document"), - _lastOpTime(ErrorCodes::NoMatchingDocument, "No last oplog entry"), - _canAcquireGlobalSharedLock(true), - _storeLocalConfigDocumentStatus(Status::OK()), - _storeLocalConfigDocumentShouldHang(false), - _connectionsClosed(false) { - } - - ReplicationCoordinatorExternalStateMock::~ReplicationCoordinatorExternalStateMock() {} - - void ReplicationCoordinatorExternalStateMock::startThreads() {} - void ReplicationCoordinatorExternalStateMock::startMasterSlave(OperationContext*) {} - void ReplicationCoordinatorExternalStateMock::initiateOplog(OperationContext* txn) {} - void ReplicationCoordinatorExternalStateMock::shutdown() {} - void ReplicationCoordinatorExternalStateMock::forwardSlaveHandshake() {} - void ReplicationCoordinatorExternalStateMock::forwardSlaveProgress() {} - - OID ReplicationCoordinatorExternalStateMock::ensureMe(OperationContext*) { - return OID::gen(); - } - - bool ReplicationCoordinatorExternalStateMock::isSelf(const HostAndPort& host) { - return sequenceContains(_selfHosts, host); - } - - void ReplicationCoordinatorExternalStateMock::addSelf(const HostAndPort& host) { - _selfHosts.push_back(host); - } - - HostAndPort ReplicationCoordinatorExternalStateMock::getClientHostAndPort( - const OperationContext* txn) { - return _clientHostAndPort; - } - - void ReplicationCoordinatorExternalStateMock::setClientHostAndPort( - const HostAndPort& clientHostAndPort) { - _clientHostAndPort = clientHostAndPort; - } - - 
StatusWith<BSONObj> ReplicationCoordinatorExternalStateMock::loadLocalConfigDocument( - OperationContext* txn) { - return _localRsConfigDocument; - } - - Status ReplicationCoordinatorExternalStateMock::storeLocalConfigDocument( - OperationContext* txn, - const BSONObj& config) { - { - boost::unique_lock<boost::mutex> lock(_shouldHangMutex); - while (_storeLocalConfigDocumentShouldHang) { - _shouldHangCondVar.wait(lock); - } - } - if (_storeLocalConfigDocumentStatus.isOK()) { - setLocalConfigDocument(StatusWith<BSONObj>(config)); - return Status::OK(); +ReplicationCoordinatorExternalStateMock::ReplicationCoordinatorExternalStateMock() + : _localRsConfigDocument(ErrorCodes::NoMatchingDocument, "No local config document"), + _lastOpTime(ErrorCodes::NoMatchingDocument, "No last oplog entry"), + _canAcquireGlobalSharedLock(true), + _storeLocalConfigDocumentStatus(Status::OK()), + _storeLocalConfigDocumentShouldHang(false), + _connectionsClosed(false) {} + +ReplicationCoordinatorExternalStateMock::~ReplicationCoordinatorExternalStateMock() {} + +void ReplicationCoordinatorExternalStateMock::startThreads() {} +void ReplicationCoordinatorExternalStateMock::startMasterSlave(OperationContext*) {} +void ReplicationCoordinatorExternalStateMock::initiateOplog(OperationContext* txn) {} +void ReplicationCoordinatorExternalStateMock::shutdown() {} +void ReplicationCoordinatorExternalStateMock::forwardSlaveHandshake() {} +void ReplicationCoordinatorExternalStateMock::forwardSlaveProgress() {} + +OID ReplicationCoordinatorExternalStateMock::ensureMe(OperationContext*) { + return OID::gen(); +} + +bool ReplicationCoordinatorExternalStateMock::isSelf(const HostAndPort& host) { + return sequenceContains(_selfHosts, host); +} + +void ReplicationCoordinatorExternalStateMock::addSelf(const HostAndPort& host) { + _selfHosts.push_back(host); +} + +HostAndPort ReplicationCoordinatorExternalStateMock::getClientHostAndPort( + const OperationContext* txn) { + return _clientHostAndPort; +} + +void ReplicationCoordinatorExternalStateMock::setClientHostAndPort( + const HostAndPort& clientHostAndPort) { + _clientHostAndPort = clientHostAndPort; +} + +StatusWith<BSONObj> ReplicationCoordinatorExternalStateMock::loadLocalConfigDocument( + OperationContext* txn) { + return _localRsConfigDocument; +} + +Status ReplicationCoordinatorExternalStateMock::storeLocalConfigDocument(OperationContext* txn, + const BSONObj& config) { + { + boost::unique_lock<boost::mutex> lock(_shouldHangMutex); + while (_storeLocalConfigDocumentShouldHang) { + _shouldHangCondVar.wait(lock); } - return _storeLocalConfigDocumentStatus; } - - void ReplicationCoordinatorExternalStateMock::setLocalConfigDocument( - const StatusWith<BSONObj>& localConfigDocument) { - - _localRsConfigDocument = localConfigDocument; + if (_storeLocalConfigDocumentStatus.isOK()) { + setLocalConfigDocument(StatusWith<BSONObj>(config)); + return Status::OK(); } + return _storeLocalConfigDocumentStatus; +} - void ReplicationCoordinatorExternalStateMock::setGlobalOpTime(const OpTime& newTime) { - } +void ReplicationCoordinatorExternalStateMock::setLocalConfigDocument( + const StatusWith<BSONObj>& localConfigDocument) { + _localRsConfigDocument = localConfigDocument; +} - StatusWith<OpTime> ReplicationCoordinatorExternalStateMock::loadLastOpTime( - OperationContext* txn) { - return _lastOpTime; - } +void ReplicationCoordinatorExternalStateMock::setGlobalOpTime(const OpTime& newTime) {} - void ReplicationCoordinatorExternalStateMock::setLastOpTime( - const StatusWith<OpTime>& 
lastApplied) { - _lastOpTime = lastApplied; - } +StatusWith<OpTime> ReplicationCoordinatorExternalStateMock::loadLastOpTime(OperationContext* txn) { + return _lastOpTime; +} - void ReplicationCoordinatorExternalStateMock::setStoreLocalConfigDocumentStatus(Status status) { - _storeLocalConfigDocumentStatus = status; - } +void ReplicationCoordinatorExternalStateMock::setLastOpTime(const StatusWith<OpTime>& lastApplied) { + _lastOpTime = lastApplied; +} - void ReplicationCoordinatorExternalStateMock::setStoreLocalConfigDocumentToHang(bool hang) { - boost::unique_lock<boost::mutex> lock(_shouldHangMutex); - _storeLocalConfigDocumentShouldHang = hang; - if (!hang) { - _shouldHangCondVar.notify_all(); - } - } +void ReplicationCoordinatorExternalStateMock::setStoreLocalConfigDocumentStatus(Status status) { + _storeLocalConfigDocumentStatus = status; +} - void ReplicationCoordinatorExternalStateMock::closeConnections() { - _connectionsClosed = true; +void ReplicationCoordinatorExternalStateMock::setStoreLocalConfigDocumentToHang(bool hang) { + boost::unique_lock<boost::mutex> lock(_shouldHangMutex); + _storeLocalConfigDocumentShouldHang = hang; + if (!hang) { + _shouldHangCondVar.notify_all(); } +} - void ReplicationCoordinatorExternalStateMock::killAllUserOperations(OperationContext* txn) {} +void ReplicationCoordinatorExternalStateMock::closeConnections() { + _connectionsClosed = true; +} - void ReplicationCoordinatorExternalStateMock::clearShardingState() {} +void ReplicationCoordinatorExternalStateMock::killAllUserOperations(OperationContext* txn) {} - void ReplicationCoordinatorExternalStateMock::signalApplierToChooseNewSyncSource() {} +void ReplicationCoordinatorExternalStateMock::clearShardingState() {} - OperationContext* ReplicationCoordinatorExternalStateMock::createOperationContext( - const std::string& threadName) { - return new OperationContextReplMock; - } +void ReplicationCoordinatorExternalStateMock::signalApplierToChooseNewSyncSource() {} + +OperationContext* ReplicationCoordinatorExternalStateMock::createOperationContext( + const std::string& threadName) { + return new OperationContextReplMock; +} - void ReplicationCoordinatorExternalStateMock::dropAllTempCollections(OperationContext* txn) {} +void ReplicationCoordinatorExternalStateMock::dropAllTempCollections(OperationContext* txn) {} -} // namespace repl -} // namespace mongo +} // namespace repl +} // namespace mongo diff --git a/src/mongo/db/repl/replication_coordinator_external_state_mock.h b/src/mongo/db/repl/replication_coordinator_external_state_mock.h index 0e44b0cc2e7..dd648dd72e9 100644 --- a/src/mongo/db/repl/replication_coordinator_external_state_mock.h +++ b/src/mongo/db/repl/replication_coordinator_external_state_mock.h @@ -41,79 +41,80 @@ namespace mongo { namespace repl { - class ReplicationCoordinatorExternalStateMock : public ReplicationCoordinatorExternalState { - MONGO_DISALLOW_COPYING(ReplicationCoordinatorExternalStateMock); - public: - class GlobalSharedLockAcquirer; +class ReplicationCoordinatorExternalStateMock : public ReplicationCoordinatorExternalState { + MONGO_DISALLOW_COPYING(ReplicationCoordinatorExternalStateMock); - ReplicationCoordinatorExternalStateMock(); - virtual ~ReplicationCoordinatorExternalStateMock(); - virtual void startThreads(); - virtual void startMasterSlave(OperationContext*); - virtual void shutdown(); - virtual void initiateOplog(OperationContext* txn); - virtual void forwardSlaveHandshake(); - virtual void forwardSlaveProgress(); - virtual OID ensureMe(OperationContext*); - 
virtual bool isSelf(const HostAndPort& host);
- virtual HostAndPort getClientHostAndPort(const OperationContext* txn);
- virtual StatusWith<BSONObj> loadLocalConfigDocument(OperationContext* txn);
- virtual Status storeLocalConfigDocument(OperationContext* txn, const BSONObj& config);
- virtual void setGlobalOpTime(const OpTime& newTime);
- virtual StatusWith<OpTime> loadLastOpTime(OperationContext* txn);
- virtual void closeConnections();
- virtual void killAllUserOperations(OperationContext* txn);
- virtual void clearShardingState();
- virtual void signalApplierToChooseNewSyncSource();
- virtual OperationContext* createOperationContext(const std::string& threadName);
- virtual void dropAllTempCollections(OperationContext* txn);
+public:
+ class GlobalSharedLockAcquirer;
- /**
- * Adds "host" to the list of hosts that this mock will match when responding to "isSelf"
- * messages.
- */
- void addSelf(const HostAndPort& host);
+ ReplicationCoordinatorExternalStateMock();
+ virtual ~ReplicationCoordinatorExternalStateMock();
+ virtual void startThreads();
+ virtual void startMasterSlave(OperationContext*);
+ virtual void shutdown();
+ virtual void initiateOplog(OperationContext* txn);
+ virtual void forwardSlaveHandshake();
+ virtual void forwardSlaveProgress();
+ virtual OID ensureMe(OperationContext*);
+ virtual bool isSelf(const HostAndPort& host);
+ virtual HostAndPort getClientHostAndPort(const OperationContext* txn);
+ virtual StatusWith<BSONObj> loadLocalConfigDocument(OperationContext* txn);
+ virtual Status storeLocalConfigDocument(OperationContext* txn, const BSONObj& config);
+ virtual void setGlobalOpTime(const OpTime& newTime);
+ virtual StatusWith<OpTime> loadLastOpTime(OperationContext* txn);
+ virtual void closeConnections();
+ virtual void killAllUserOperations(OperationContext* txn);
+ virtual void clearShardingState();
+ virtual void signalApplierToChooseNewSyncSource();
+ virtual OperationContext* createOperationContext(const std::string& threadName);
+ virtual void dropAllTempCollections(OperationContext* txn);
- /**
- * Sets the return value for subsequent calls to loadLocalConfigDocument().
- */
- void setLocalConfigDocument(const StatusWith<BSONObj>& localConfigDocument);
+ /**
+ * Adds "host" to the list of hosts that this mock will match when responding to "isSelf"
+ * messages.
+ */
+ void addSelf(const HostAndPort& host);
- /**
- * Sets the return value for subsequent calls to getClientHostAndPort().
- */
- void setClientHostAndPort(const HostAndPort& clientHostAndPort);
+ /**
+ * Sets the return value for subsequent calls to loadLocalConfigDocument().
+ */
+ void setLocalConfigDocument(const StatusWith<BSONObj>& localConfigDocument);
- /**
- * Sets the return value for subsequent calls to loadLastOpTimeApplied.
- */
- void setLastOpTime(const StatusWith<OpTime>& lastApplied);
+ /**
+ * Sets the return value for subsequent calls to getClientHostAndPort().
+ */
+ void setClientHostAndPort(const HostAndPort& clientHostAndPort);
- /**
- * Sets the return value for subsequent calls to storeLocalConfigDocument().
- * If "status" is Status::OK(), the subsequent calls will call the underlying function.
- */
- void setStoreLocalConfigDocumentStatus(Status status);
+ /**
+ * Sets the return value for subsequent calls to loadLastOpTimeApplied.
+ */
+ void setLastOpTime(const StatusWith<OpTime>& lastApplied);
- /**
- * Sets whether or not subsequent calls to storeLocalConfigDocument() should hang
- * indefinitely or not based on the value of "hang". 
- */
- void setStoreLocalConfigDocumentToHang(bool hang);
+ /**
+ * Sets the return value for subsequent calls to storeLocalConfigDocument().
+ * If "status" is Status::OK(), the subsequent calls will call the underlying function.
+ */
+ void setStoreLocalConfigDocumentStatus(Status status);
- private:
- StatusWith<BSONObj> _localRsConfigDocument;
- StatusWith<OpTime> _lastOpTime;
- std::vector<HostAndPort> _selfHosts;
- bool _canAcquireGlobalSharedLock;
- Status _storeLocalConfigDocumentStatus;
- // mutex and cond var for controlling storeLocalConfigDocument()'s hanging
- boost::mutex _shouldHangMutex;
- boost::condition _shouldHangCondVar;
- bool _storeLocalConfigDocumentShouldHang;
- bool _connectionsClosed;
- HostAndPort _clientHostAndPort;
- };
+ /**
+ * Sets whether or not subsequent calls to storeLocalConfigDocument() should hang
+ * indefinitely or not based on the value of "hang".
+ */
+ void setStoreLocalConfigDocumentToHang(bool hang);
-} // namespace repl
-} // namespace mongo
+private:
+ StatusWith<BSONObj> _localRsConfigDocument;
+ StatusWith<OpTime> _lastOpTime;
+ std::vector<HostAndPort> _selfHosts;
+ bool _canAcquireGlobalSharedLock;
+ Status _storeLocalConfigDocumentStatus;
+ // mutex and cond var for controlling storeLocalConfigDocument()'s hanging
+ boost::mutex _shouldHangMutex;
+ boost::condition _shouldHangCondVar;
+ bool _storeLocalConfigDocumentShouldHang;
+ bool _connectionsClosed;
+ HostAndPort _clientHostAndPort;
+};
+
+} // namespace repl
+} // namespace mongo
diff --git a/src/mongo/db/repl/replication_coordinator_global.cpp b/src/mongo/db/repl/replication_coordinator_global.cpp
index a586f65b437..03891d163d6 100644
--- a/src/mongo/db/repl/replication_coordinator_global.cpp
+++ b/src/mongo/db/repl/replication_coordinator_global.cpp
@@ -34,16 +34,16 @@ namespace mongo {
namespace repl {
namespace {
- ReplicationCoordinator* coordinator = NULL;
-} // namespace
+ReplicationCoordinator* coordinator = NULL;
+} // namespace
- ReplicationCoordinator* getGlobalReplicationCoordinator() {
- return coordinator;
- }
+ReplicationCoordinator* getGlobalReplicationCoordinator() {
+ return coordinator;
+}
- void setGlobalReplicationCoordinator(ReplicationCoordinator* newCoordinator) {
- coordinator = newCoordinator;
- }
+void setGlobalReplicationCoordinator(ReplicationCoordinator* newCoordinator) {
+ coordinator = newCoordinator;
+}
-} // namespace repl
-} // namespace mongo
+} // namespace repl
+} // namespace mongo
diff --git a/src/mongo/db/repl/replication_coordinator_global.h b/src/mongo/db/repl/replication_coordinator_global.h
index c107959dbf6..a18033fd162 100644
--- a/src/mongo/db/repl/replication_coordinator_global.h
+++ b/src/mongo/db/repl/replication_coordinator_global.h
@@ -33,8 +33,8 @@ namespace mongo {
namespace repl {
- ReplicationCoordinator* getGlobalReplicationCoordinator();
- void setGlobalReplicationCoordinator(ReplicationCoordinator* coordinator);
+ReplicationCoordinator* getGlobalReplicationCoordinator();
+void setGlobalReplicationCoordinator(ReplicationCoordinator* coordinator);
-} // namespace repl
-} // namespace mongo
+} // namespace repl
+} // namespace mongo
diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp
index 40b28dbb546..adac75fab35 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl.cpp
@@ -67,1620 +67,1568 @@ namespace mongo {
namespace repl {
namespace {
- typedef StatusWith<ReplicationExecutor::CallbackHandle> 
CBHStatus; +typedef StatusWith<ReplicationExecutor::CallbackHandle> CBHStatus; - void lockAndCall(boost::unique_lock<boost::mutex>* lk, const stdx::function<void ()>& fn) { - if (!lk->owns_lock()) { - lk->lock(); - } - fn(); +void lockAndCall(boost::unique_lock<boost::mutex>* lk, const stdx::function<void()>& fn) { + if (!lk->owns_lock()) { + lk->lock(); } + fn(); +} - /** - * Implements the force-reconfig behavior of incrementing config version by a large random - * number. - */ - BSONObj incrementConfigVersionByRandom(BSONObj config) { - BSONObjBuilder builder; - for (BSONObjIterator iter(config); iter.more(); iter.next()) { - BSONElement elem = *iter; - if (elem.fieldNameStringData() == ReplicaSetConfig::kVersionFieldName && - elem.isNumber()) { - - boost::scoped_ptr<SecureRandom> generator(SecureRandom::create()); - const int random = std::abs(static_cast<int>(generator->nextInt64()) % 100000); - builder.appendIntOrLL(ReplicaSetConfig::kVersionFieldName, - elem.numberLong() + 10000 + random); - } - else { - builder.append(elem); - } - } - return builder.obj(); - } - -} //namespace - - struct ReplicationCoordinatorImpl::WaiterInfo { - - /** - * Constructor takes the list of waiters and enqueues itself on the list, removing itself - * in the destructor. - */ - WaiterInfo(std::vector<WaiterInfo*>* _list, - unsigned int _opID, - const OpTime* _opTime, - const WriteConcernOptions* _writeConcern, - boost::condition_variable* _condVar) : list(_list), - master(true), - opID(_opID), - opTime(_opTime), - writeConcern(_writeConcern), - condVar(_condVar) { - list->push_back(this); +/** + * Implements the force-reconfig behavior of incrementing config version by a large random + * number. + */ +BSONObj incrementConfigVersionByRandom(BSONObj config) { + BSONObjBuilder builder; + for (BSONObjIterator iter(config); iter.more(); iter.next()) { + BSONElement elem = *iter; + if (elem.fieldNameStringData() == ReplicaSetConfig::kVersionFieldName && elem.isNumber()) { + boost::scoped_ptr<SecureRandom> generator(SecureRandom::create()); + const int random = std::abs(static_cast<int>(generator->nextInt64()) % 100000); + builder.appendIntOrLL(ReplicaSetConfig::kVersionFieldName, + elem.numberLong() + 10000 + random); + } else { + builder.append(elem); } + } + return builder.obj(); +} - ~WaiterInfo() { - list->erase(std::remove(list->begin(), list->end(), this), list->end()); - } +} // namespace - std::vector<WaiterInfo*>* list; - bool master; // Set to false to indicate that stepDown was called while waiting - const unsigned int opID; - const OpTime* opTime; - const WriteConcernOptions* writeConcern; - boost::condition_variable* condVar; - }; +struct ReplicationCoordinatorImpl::WaiterInfo { + /** + * Constructor takes the list of waiters and enqueues itself on the list, removing itself + * in the destructor. 
+ */ + WaiterInfo(std::vector<WaiterInfo*>* _list, + unsigned int _opID, + const OpTime* _opTime, + const WriteConcernOptions* _writeConcern, + boost::condition_variable* _condVar) + : list(_list), + master(true), + opID(_opID), + opTime(_opTime), + writeConcern(_writeConcern), + condVar(_condVar) { + list->push_back(this); + } + + ~WaiterInfo() { + list->erase(std::remove(list->begin(), list->end(), this), list->end()); + } + + std::vector<WaiterInfo*>* list; + bool master; // Set to false to indicate that stepDown was called while waiting + const unsigned int opID; + const OpTime* opTime; + const WriteConcernOptions* writeConcern; + boost::condition_variable* condVar; +}; namespace { - ReplicationCoordinator::Mode getReplicationModeFromSettings(const ReplSettings& settings) { - if (settings.usingReplSets()) { - return ReplicationCoordinator::modeReplSet; - } - if (settings.master || settings.slave) { - return ReplicationCoordinator::modeMasterSlave; - } - return ReplicationCoordinator::modeNone; +ReplicationCoordinator::Mode getReplicationModeFromSettings(const ReplSettings& settings) { + if (settings.usingReplSets()) { + return ReplicationCoordinator::modeReplSet; + } + if (settings.master || settings.slave) { + return ReplicationCoordinator::modeMasterSlave; } + return ReplicationCoordinator::modeNone; +} } // namespace - ReplicationCoordinatorImpl::ReplicationCoordinatorImpl( - const ReplSettings& settings, - ReplicationCoordinatorExternalState* externalState, - ReplicationExecutor::NetworkInterface* network, - TopologyCoordinator* topCoord, - int64_t prngSeed) : - _settings(settings), - _replMode(getReplicationModeFromSettings(settings)), - _topCoord(topCoord), - _replExecutor(network, prngSeed), - _externalState(externalState), - _inShutdown(false), - _memberState(MemberState::RS_STARTUP), - _isWaitingForDrainToComplete(false), - _rsConfigState(kConfigPreStart), - _selfIndex(-1), - _sleptLastElection(false), - _canAcceptNonLocalWrites(!(settings.usingReplSets() || settings.slave)), - _canServeNonLocalReads(0U) { - - if (!isReplEnabled()) { - return; - } - - boost::scoped_ptr<SecureRandom> rbidGenerator(SecureRandom::create()); - _rbid = static_cast<int>(rbidGenerator->nextInt64()); - if (_rbid < 0) { - // Ensure _rbid is always positive - _rbid = -_rbid; - } - - // Make sure there is always an entry in _slaveInfo for ourself. - SlaveInfo selfInfo; - selfInfo.self = true; - _slaveInfo.push_back(selfInfo); +ReplicationCoordinatorImpl::ReplicationCoordinatorImpl( + const ReplSettings& settings, + ReplicationCoordinatorExternalState* externalState, + ReplicationExecutor::NetworkInterface* network, + TopologyCoordinator* topCoord, + int64_t prngSeed) + : _settings(settings), + _replMode(getReplicationModeFromSettings(settings)), + _topCoord(topCoord), + _replExecutor(network, prngSeed), + _externalState(externalState), + _inShutdown(false), + _memberState(MemberState::RS_STARTUP), + _isWaitingForDrainToComplete(false), + _rsConfigState(kConfigPreStart), + _selfIndex(-1), + _sleptLastElection(false), + _canAcceptNonLocalWrites(!(settings.usingReplSets() || settings.slave)), + _canServeNonLocalReads(0U) { + if (!isReplEnabled()) { + return; + } + + boost::scoped_ptr<SecureRandom> rbidGenerator(SecureRandom::create()); + _rbid = static_cast<int>(rbidGenerator->nextInt64()); + if (_rbid < 0) { + // Ensure _rbid is always positive + _rbid = -_rbid; + } + + // Make sure there is always an entry in _slaveInfo for ourself. 
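The constructor above derives the rollback id by truncating SecureRandom's 64-bit output to an int and negating it when it comes out negative. A minimal standalone sketch of that pattern, with std::mt19937_64 standing in for MongoDB's SecureRandom (an assumption for illustration) and a guard for the INT_MIN corner that plain negation would miss:

#include <cstdint>
#include <iostream>
#include <limits>
#include <random>

// Derive a positive int id from a random 64-bit value, mirroring the
// "_rbid = -_rbid" normalization in the constructor above.
int makePositiveRbid() {
    std::mt19937_64 gen{std::random_device{}()};
    int rbid = static_cast<int>(static_cast<int64_t>(gen()));
    if (rbid == std::numeric_limits<int>::min()) {
        rbid = 1;  // negating INT_MIN would overflow, so pin it instead
    } else if (rbid < 0) {
        rbid = -rbid;  // ensure the id is always positive
    }
    return rbid;
}

int main() {
    std::cout << "rbid: " << makePositiveRbid() << '\n';
}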
+ SlaveInfo selfInfo;
+ selfInfo.self = true;
+ _slaveInfo.push_back(selfInfo);
+}
+
+ReplicationCoordinatorImpl::~ReplicationCoordinatorImpl() {}
+
+void ReplicationCoordinatorImpl::waitForStartUpComplete() {
+ boost::unique_lock<boost::mutex> lk(_mutex);
+ while (_rsConfigState == kConfigPreStart || _rsConfigState == kConfigStartingUp) {
+ _rsConfigStateChange.wait(lk);
+ }
+}
+
+ReplicaSetConfig ReplicationCoordinatorImpl::getReplicaSetConfig_forTest() {
+ boost::lock_guard<boost::mutex> lk(_mutex);
+ return _rsConfig;
+}
+
+bool ReplicationCoordinatorImpl::_startLoadLocalConfig(OperationContext* txn) {
+ StatusWith<BSONObj> cfg = _externalState->loadLocalConfigDocument(txn);
+ if (!cfg.isOK()) {
+ log() << "Did not find local replica set configuration document at startup; "
+ << cfg.getStatus();
+ return true;
 }
+ ReplicaSetConfig localConfig;
+ Status status = localConfig.initialize(cfg.getValue());
+ if (!status.isOK()) {
+ error() << "Locally stored replica set configuration does not parse; See "
+ "http://www.mongodb.org/dochub/core/recover-replica-set-from-invalid-config "
+ "for information on how to recover from this. Got \"" << status
+ << "\" while parsing " << cfg.getValue();
+ fassertFailedNoTrace(28545);
+ }
+
+ StatusWith<OpTime> lastOpTimeStatus = _externalState->loadLastOpTime(txn);
+
+ // Use a callback here, because _finishLoadLocalConfig calls isSelf() which requires
+ // that the server's networking layer be up and running and accepting connections, which
+ // doesn't happen until startReplication finishes.
+ _replExecutor.scheduleWork(stdx::bind(&ReplicationCoordinatorImpl::_finishLoadLocalConfig,
+ this,
+ stdx::placeholders::_1,
+ localConfig,
+ lastOpTimeStatus));
+ return false;
+}
+
+void ReplicationCoordinatorImpl::_finishLoadLocalConfig(
+ const ReplicationExecutor::CallbackData& cbData,
+ const ReplicaSetConfig& localConfig,
+ const StatusWith<OpTime>& lastOpTimeStatus) {
+ if (!cbData.status.isOK()) {
+ LOG(1) << "Loading local replica set configuration failed due to " << cbData.status;
+ return;
+ }
+
+ StatusWith<int> myIndex =
+ validateConfigForStartUp(_externalState.get(), _rsConfig, localConfig);
+ if (!myIndex.isOK()) {
+ if (myIndex.getStatus() == ErrorCodes::NodeNotFound ||
+ myIndex.getStatus() == ErrorCodes::DuplicateKey) {
+ warning() << "Locally stored replica set configuration does not have a valid entry "
+ "for the current node; waiting for reconfig or remote heartbeat; Got \""
+ << myIndex.getStatus() << "\" while validating " << localConfig.toBSON();
+ myIndex = StatusWith<int>(-1);
+ } else {
+ error() << "Locally stored replica set configuration is invalid; See "
+ "http://www.mongodb.org/dochub/core/recover-replica-set-from-invalid-config"
+ " for information on how to recover from this. Got \"" << myIndex.getStatus()
+ << "\" while validating " << localConfig.toBSON();
+ fassertFailedNoTrace(28544);
        }
    }

    if (localConfig.getReplSetName() != _settings.ourSetName()) {
        warning() << "Local replica set configuration document reports set name of "
                  << localConfig.getReplSetName() << ", but command line reports "
                  << _settings.ourSetName() << "; waiting for reconfig or remote heartbeat";
        myIndex = StatusWith<int>(-1);
    }

    // Do not check optime, if this node is an arbiter.
    bool isArbiter =
        myIndex.getValue() != -1 && localConfig.getMemberAt(myIndex.getValue()).isArbiter();
    OpTime lastOpTime(0, 0);
    if (!isArbiter) {
        if (!lastOpTimeStatus.isOK()) {
            warning() << "Failed to load timestamp of most recently applied operation; "
                      << lastOpTimeStatus.getStatus();
        } else {
            lastOpTime = lastOpTimeStatus.getValue();
        }
    }
-
-        boost::unique_lock<boost::mutex> lk(_mutex);
-        invariant(_rsConfigState == kConfigStartingUp);
-        const PostMemberStateUpdateAction action =
-            _setCurrentRSConfig_inlock(localConfig, myIndex.getValue());
-        _setMyLastOptime_inlock(&lk, lastOpTime, false);
-        _externalState->setGlobalOpTime(lastOpTime);
-        if (lk.owns_lock()) {
-            lk.unlock();
-        }
-        _performPostMemberStateUpdateAction(action);
-        _externalState->startThreads();
+    boost::unique_lock<boost::mutex> lk(_mutex);
+    invariant(_rsConfigState == kConfigStartingUp);
+    const PostMemberStateUpdateAction action =
+        _setCurrentRSConfig_inlock(localConfig, myIndex.getValue());
+    _setMyLastOptime_inlock(&lk, lastOpTime, false);
+    _externalState->setGlobalOpTime(lastOpTime);
+    if (lk.owns_lock()) {
+        lk.unlock();
    }
+    _performPostMemberStateUpdateAction(action);
+    _externalState->startThreads();
+}

-    void ReplicationCoordinatorImpl::startReplication(OperationContext* txn) {
-        if (!isReplEnabled()) {
-            boost::lock_guard<boost::mutex> lk(_mutex);
-            _setConfigState_inlock(kConfigReplicationDisabled);
-            return;
-        }
-
-        {
-            OID rid = _externalState->ensureMe(txn);
-
-            boost::lock_guard<boost::mutex> lk(_mutex);
-            fassert(18822, !_inShutdown);
-            _setConfigState_inlock(kConfigStartingUp);
-            _myRID = rid;
-            _slaveInfo[_getMyIndexInSlaveInfo_inlock()].rid = rid;
-        }
-
-        if (!_settings.usingReplSets()) {
-            // Must be Master/Slave
-            invariant(_settings.master || _settings.slave);
-            _externalState->startMasterSlave(txn);
-            return;
-        }
-
-        _topCoordDriverThread.reset(new boost::thread(stdx::bind(&ReplicationExecutor::run,
-                                                                 &_replExecutor)));
-
-        bool doneLoadingConfig = _startLoadLocalConfig(txn);
-        if (doneLoadingConfig) {
-            // If we're not done loading the config, then the config state will be set by
-            // _finishLoadLocalConfig. 
- boost::lock_guard<boost::mutex> lk(_mutex); - invariant(!_rsConfig.isInitialized()); - _setConfigState_inlock(kConfigUninitialized); - } +void ReplicationCoordinatorImpl::startReplication(OperationContext* txn) { + if (!isReplEnabled()) { + boost::lock_guard<boost::mutex> lk(_mutex); + _setConfigState_inlock(kConfigReplicationDisabled); + return; } - void ReplicationCoordinatorImpl::shutdown() { - // Shutdown must: - // * prevent new threads from blocking in awaitReplication - // * wake up all existing threads blocking in awaitReplication - // * tell the ReplicationExecutor to shut down - // * wait for the thread running the ReplicationExecutor to finish - - if (!_settings.usingReplSets()) { - return; - } + { + OID rid = _externalState->ensureMe(txn); - boost::thread* hbReconfigThread = NULL; - { - boost::lock_guard<boost::mutex> lk(_mutex); - fassert(28533, !_inShutdown); - _inShutdown = true; - if (_rsConfigState == kConfigPreStart) { - warning() << "ReplicationCoordinatorImpl::shutdown() called before " - "startReplication() finished. Shutting down without cleaning up the " - "replication system"; - return; - } - fassert(18823, _rsConfigState != kConfigStartingUp); - for (std::vector<WaiterInfo*>::iterator it = _replicationWaiterList.begin(); - it != _replicationWaiterList.end(); ++it) { - WaiterInfo* waiter = *it; - waiter->condVar->notify_all(); - } - - // Since we've set _inShutdown we know that _heartbeatReconfigThread will not be - // changed again, which makes it safe to store the pointer to it to be accessed outside - // of _mutex. - hbReconfigThread = _heartbeatReconfigThread.get(); - } - - if (hbReconfigThread) { - hbReconfigThread->join(); - } - - _replExecutor.shutdown(); - _topCoordDriverThread->join(); // must happen outside _mutex - _externalState->shutdown(); - } - - const ReplSettings& ReplicationCoordinatorImpl::getSettings() const { - return _settings; + boost::lock_guard<boost::mutex> lk(_mutex); + fassert(18822, !_inShutdown); + _setConfigState_inlock(kConfigStartingUp); + _myRID = rid; + _slaveInfo[_getMyIndexInSlaveInfo_inlock()].rid = rid; } - ReplicationCoordinator::Mode ReplicationCoordinatorImpl::getReplicationMode() const { - return _getReplicationMode_inlock(); + if (!_settings.usingReplSets()) { + // Must be Master/Slave + invariant(_settings.master || _settings.slave); + _externalState->startMasterSlave(txn); + return; } - ReplicationCoordinator::Mode ReplicationCoordinatorImpl::_getReplicationMode_inlock() const { - return _replMode; - } + _topCoordDriverThread.reset( + new boost::thread(stdx::bind(&ReplicationExecutor::run, &_replExecutor))); - MemberState ReplicationCoordinatorImpl::getMemberState() const { + bool doneLoadingConfig = _startLoadLocalConfig(txn); + if (doneLoadingConfig) { + // If we're not done loading the config, then the config state will be set by + // _finishLoadLocalConfig. 
boost::lock_guard<boost::mutex> lk(_mutex); - return _getMemberState_inlock(); + invariant(!_rsConfig.isInitialized()); + _setConfigState_inlock(kConfigUninitialized); } +} + +void ReplicationCoordinatorImpl::shutdown() { + // Shutdown must: + // * prevent new threads from blocking in awaitReplication + // * wake up all existing threads blocking in awaitReplication + // * tell the ReplicationExecutor to shut down + // * wait for the thread running the ReplicationExecutor to finish - MemberState ReplicationCoordinatorImpl::_getMemberState_inlock() const { - return _memberState; + if (!_settings.usingReplSets()) { + return; } - Seconds ReplicationCoordinatorImpl::getSlaveDelaySecs() const { + boost::thread* hbReconfigThread = NULL; + { boost::lock_guard<boost::mutex> lk(_mutex); - invariant(_rsConfig.isInitialized()); - uassert(28524, - "Node not a member of the current set configuration", - _selfIndex != -1); - return _rsConfig.getMemberAt(_selfIndex).getSlaveDelay(); - } - - void ReplicationCoordinatorImpl::clearSyncSourceBlacklist() { - CBHStatus cbh = _replExecutor.scheduleWork( - stdx::bind(&ReplicationCoordinatorImpl::_clearSyncSourceBlacklist_finish, - this, - stdx::placeholders::_1)); - if (cbh.getStatus() == ErrorCodes::ShutdownInProgress) { + fassert(28533, !_inShutdown); + _inShutdown = true; + if (_rsConfigState == kConfigPreStart) { + warning() << "ReplicationCoordinatorImpl::shutdown() called before " + "startReplication() finished. Shutting down without cleaning up the " + "replication system"; return; } - fassert(18907, cbh.getStatus()); - _replExecutor.wait(cbh.getValue()); + fassert(18823, _rsConfigState != kConfigStartingUp); + for (std::vector<WaiterInfo*>::iterator it = _replicationWaiterList.begin(); + it != _replicationWaiterList.end(); + ++it) { + WaiterInfo* waiter = *it; + waiter->condVar->notify_all(); + } + + // Since we've set _inShutdown we know that _heartbeatReconfigThread will not be + // changed again, which makes it safe to store the pointer to it to be accessed outside + // of _mutex. 
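The shutdown path here sets _inShutdown and wakes every replication waiter while holding _mutex, copies the heartbeat-reconfig thread pointer under that same lock, and then performs all joins only after the lock is released, since joining a thread that may still need the mutex would deadlock. A minimal sketch of that flag, notify, then join-outside-the-lock ordering, with std::thread in place of the real executor threads:

#include <condition_variable>
#include <iostream>
#include <mutex>
#include <thread>

std::mutex mu;
std::condition_variable cv;
bool inShutdown = false;

void worker() {
    std::unique_lock<std::mutex> lk(mu);
    cv.wait(lk, [] { return inShutdown; });  // blocks like a replication waiter
}

int main() {
    std::thread t(worker);
    {
        std::lock_guard<std::mutex> lk(mu);
        inShutdown = true;  // 1. mark shutdown while holding the lock
        cv.notify_all();    // 2. wake every blocked waiter
    }
    t.join();               // 3. join only after releasing the mutex
    std::cout << "clean shutdown\n";
}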
+ hbReconfigThread = _heartbeatReconfigThread.get(); + } + + if (hbReconfigThread) { + hbReconfigThread->join(); + } + + _replExecutor.shutdown(); + _topCoordDriverThread->join(); // must happen outside _mutex + _externalState->shutdown(); +} + +const ReplSettings& ReplicationCoordinatorImpl::getSettings() const { + return _settings; +} + +ReplicationCoordinator::Mode ReplicationCoordinatorImpl::getReplicationMode() const { + return _getReplicationMode_inlock(); +} + +ReplicationCoordinator::Mode ReplicationCoordinatorImpl::_getReplicationMode_inlock() const { + return _replMode; +} + +MemberState ReplicationCoordinatorImpl::getMemberState() const { + boost::lock_guard<boost::mutex> lk(_mutex); + return _getMemberState_inlock(); +} + +MemberState ReplicationCoordinatorImpl::_getMemberState_inlock() const { + return _memberState; +} + +Seconds ReplicationCoordinatorImpl::getSlaveDelaySecs() const { + boost::lock_guard<boost::mutex> lk(_mutex); + invariant(_rsConfig.isInitialized()); + uassert(28524, "Node not a member of the current set configuration", _selfIndex != -1); + return _rsConfig.getMemberAt(_selfIndex).getSlaveDelay(); +} + +void ReplicationCoordinatorImpl::clearSyncSourceBlacklist() { + CBHStatus cbh = _replExecutor.scheduleWork( + stdx::bind(&ReplicationCoordinatorImpl::_clearSyncSourceBlacklist_finish, + this, + stdx::placeholders::_1)); + if (cbh.getStatus() == ErrorCodes::ShutdownInProgress) { + return; + } + fassert(18907, cbh.getStatus()); + _replExecutor.wait(cbh.getValue()); +} + +void ReplicationCoordinatorImpl::_clearSyncSourceBlacklist_finish( + const ReplicationExecutor::CallbackData& cbData) { + if (cbData.status == ErrorCodes::CallbackCanceled) + return; + _topCoord->clearSyncSourceBlacklist(); +} + +bool ReplicationCoordinatorImpl::setFollowerMode(const MemberState& newState) { + StatusWith<ReplicationExecutor::EventHandle> finishedSettingFollowerState = + _replExecutor.makeEvent(); + if (finishedSettingFollowerState.getStatus() == ErrorCodes::ShutdownInProgress) { + return false; } - - void ReplicationCoordinatorImpl::_clearSyncSourceBlacklist_finish( - const ReplicationExecutor::CallbackData& cbData) { - if (cbData.status == ErrorCodes::CallbackCanceled) - return; - _topCoord->clearSyncSourceBlacklist(); + fassert(18812, finishedSettingFollowerState.getStatus()); + bool success = false; + CBHStatus cbh = + _replExecutor.scheduleWork(stdx::bind(&ReplicationCoordinatorImpl::_setFollowerModeFinish, + this, + stdx::placeholders::_1, + newState, + finishedSettingFollowerState.getValue(), + &success)); + if (cbh.getStatus() == ErrorCodes::ShutdownInProgress) { + return false; } + fassert(18699, cbh.getStatus()); + _replExecutor.waitForEvent(finishedSettingFollowerState.getValue()); + return success; +} - bool ReplicationCoordinatorImpl::setFollowerMode(const MemberState& newState) { - StatusWith<ReplicationExecutor::EventHandle> finishedSettingFollowerState = - _replExecutor.makeEvent(); - if (finishedSettingFollowerState.getStatus() == ErrorCodes::ShutdownInProgress) { - return false; - } - fassert(18812, finishedSettingFollowerState.getStatus()); - bool success = false; - CBHStatus cbh = _replExecutor.scheduleWork( - stdx::bind(&ReplicationCoordinatorImpl::_setFollowerModeFinish, - this, - stdx::placeholders::_1, - newState, - finishedSettingFollowerState.getValue(), - &success)); - if (cbh.getStatus() == ErrorCodes::ShutdownInProgress) { - return false; - } - fassert(18699, cbh.getStatus()); - _replExecutor.waitForEvent(finishedSettingFollowerState.getValue()); - 
return success;
+void ReplicationCoordinatorImpl::_setFollowerModeFinish(
+ const ReplicationExecutor::CallbackData& cbData,
+ const MemberState& newState,
+ const ReplicationExecutor::EventHandle& finishedSettingFollowerMode,
+ bool* success) {
+ if (cbData.status == ErrorCodes::CallbackCanceled) {
+ return;
    }
-
-    void ReplicationCoordinatorImpl::_setFollowerModeFinish(
-            const ReplicationExecutor::CallbackData& cbData,
-            const MemberState& newState,
-            const ReplicationExecutor::EventHandle& finishedSettingFollowerMode,
-            bool* success) {
-
-        if (cbData.status == ErrorCodes::CallbackCanceled) {
-            return;
-        }
-        if (newState == _topCoord->getMemberState()) {
-            *success = true;
-            _replExecutor.signalEvent(finishedSettingFollowerMode);
-            return;
-        }
-        if (_topCoord->getRole() == TopologyCoordinator::Role::leader) {
-            *success = false;
-            _replExecutor.signalEvent(finishedSettingFollowerMode);
-            return;
-        }
-
-        if (_topCoord->getRole() == TopologyCoordinator::Role::candidate) {
-            // We are a candidate, which means _topCoord believes us to be in state RS_SECONDARY, and
-            // we know that newState != RS_SECONDARY because we would have returned early, above if
-            // the old and new state were equal. So, cancel the running election and try again to
-            // finish setting the follower mode.
-            invariant(_freshnessChecker);
-            _freshnessChecker->cancel(&_replExecutor);
-            if (_electCmdRunner) {
-                _electCmdRunner->cancel(&_replExecutor);
-            }
-            _replExecutor.onEvent(
-                    _electionFinishedEvent,
-                    stdx::bind(&ReplicationCoordinatorImpl::_setFollowerModeFinish,
-                               this,
-                               stdx::placeholders::_1,
-                               newState,
-                               finishedSettingFollowerMode,
-                               success));
-            return;
-        }
-
-        boost::unique_lock<boost::mutex> lk(_mutex);
-        _topCoord->setFollowerMode(newState.s);
-
-        const PostMemberStateUpdateAction action =
-            _updateMemberStateFromTopologyCoordinator_inlock();
+    if (newState == _topCoord->getMemberState()) {
        *success = true;
        _replExecutor.signalEvent(finishedSettingFollowerMode);
-        lk.unlock();
-        _performPostMemberStateUpdateAction(action);
+        return;
    }
-
-    bool ReplicationCoordinatorImpl::isWaitingForApplierToDrain() {
-        boost::lock_guard<boost::mutex> lk(_mutex);
-        return _isWaitingForDrainToComplete;
-    }
-
-    void ReplicationCoordinatorImpl::signalDrainComplete(OperationContext* txn) {
-        // This logic is a little complicated in order to avoid acquiring the global exclusive lock
-        // unnecessarily.  This is important because the applier may call signalDrainComplete()
-        // whenever it wants, not only when the ReplicationCoordinator is expecting it.
-        //
-        // The steps are:
-        // 1.) Check to see if we're waiting for this signal.  If not, return early.
-        // 2.) Otherwise, release the mutex while acquiring the global exclusive lock,
-        //     since that might take a while (NB there's a deadlock cycle otherwise, too).
-        // 3.) Re-check to see if we've somehow left drain mode.  If we have not, clear
-        //     _isWaitingForDrainToComplete, set the flag allowing non-local database writes and
-        //     drop the mutex.  At this point, no writes can occur from other threads, due to the
-        //     global exclusive lock.
-        // 4.) Drop all temp collections.
-        // 5.) Drop the global exclusive lock. 
-        //
-        // Because replicatable writes are forbidden while in drain mode, and we don't exit drain
-        // mode until we have the global exclusive lock, which forbids all other threads from making
-        // writes, we know that from the time that _isWaitingForDrainToComplete is set in
-        // _performPostMemberStateUpdateAction(kActionWinElection) until this method returns, no
-        // external writes will be processed.  This is important so that a new temp collection isn't
-        // introduced on the new primary before we drop all the temp collections.
-
-        boost::unique_lock<boost::mutex> lk(_mutex);
-        if (!_isWaitingForDrainToComplete) {
-            return;
-        }
-        lk.unlock();
-        ScopedTransaction transaction(txn, MODE_X);
-        Lock::GlobalWrite globalWriteLock(txn->lockState());
-        lk.lock();
-        if (!_isWaitingForDrainToComplete) {
-            return;
+    if (_topCoord->getRole() == TopologyCoordinator::Role::leader) {
+        *success = false;
+        _replExecutor.signalEvent(finishedSettingFollowerMode);
+        return;
+    }
+
+    if (_topCoord->getRole() == TopologyCoordinator::Role::candidate) {
+        // We are a candidate, which means _topCoord believes us to be in state RS_SECONDARY, and
+        // we know that newState != RS_SECONDARY because we would have returned early, above if
+        // the old and new state were equal.  So, cancel the running election and try again to
+        // finish setting the follower mode.
+        invariant(_freshnessChecker);
+        _freshnessChecker->cancel(&_replExecutor);
+        if (_electCmdRunner) {
+            _electCmdRunner->cancel(&_replExecutor);
+        }
+        _replExecutor.onEvent(_electionFinishedEvent,
+                              stdx::bind(&ReplicationCoordinatorImpl::_setFollowerModeFinish,
+                                         this,
+                                         stdx::placeholders::_1,
+                                         newState,
+                                         finishedSettingFollowerMode,
+                                         success));
+        return;
+    }
+
+    boost::unique_lock<boost::mutex> lk(_mutex);
+    _topCoord->setFollowerMode(newState.s);
+
+    const PostMemberStateUpdateAction action = _updateMemberStateFromTopologyCoordinator_inlock();
+    *success = true;
+    _replExecutor.signalEvent(finishedSettingFollowerMode);
+    lk.unlock();
+    _performPostMemberStateUpdateAction(action);
+}
+
+bool ReplicationCoordinatorImpl::isWaitingForApplierToDrain() {
+    boost::lock_guard<boost::mutex> lk(_mutex);
+    return _isWaitingForDrainToComplete;
+}
+
+void ReplicationCoordinatorImpl::signalDrainComplete(OperationContext* txn) {
+    // This logic is a little complicated in order to avoid acquiring the global exclusive lock
+    // unnecessarily.  This is important because the applier may call signalDrainComplete()
+    // whenever it wants, not only when the ReplicationCoordinator is expecting it.
+    //
+    // The steps are:
+    // 1.) Check to see if we're waiting for this signal.  If not, return early.
+    // 2.) Otherwise, release the mutex while acquiring the global exclusive lock,
+    //     since that might take a while (NB there's a deadlock cycle otherwise, too).
+    // 3.) Re-check to see if we've somehow left drain mode.  If we have not, clear
+    //     _isWaitingForDrainToComplete, set the flag allowing non-local database writes and
+    //     drop the mutex.  At this point, no writes can occur from other threads, due to the
+    //     global exclusive lock.
+    // 4.) Drop all temp collections.
+    // 5.) Drop the global exclusive lock. 
+ // + // Because replicatable writes are forbidden while in drain mode, and we don't exit drain + // mode until we have the global exclusive lock, which forbids all other threads from making + // writes, we know that from the time that _isWaitingForDrainToComplete is set in + // _performPostMemberStateUpdateAction(kActionWinElection) until this method returns, no + // external writes will be processed. This is important so that a new temp collection isn't + // introduced on the new primary before we drop all the temp collections. + + boost::unique_lock<boost::mutex> lk(_mutex); + if (!_isWaitingForDrainToComplete) { + return; + } + lk.unlock(); + ScopedTransaction transaction(txn, MODE_X); + Lock::GlobalWrite globalWriteLock(txn->lockState()); + lk.lock(); + if (!_isWaitingForDrainToComplete) { + return; + } + _isWaitingForDrainToComplete = false; + _canAcceptNonLocalWrites = true; + lk.unlock(); + _externalState->dropAllTempCollections(txn); + log() << "transition to primary complete; database writes are now permitted" << rsLog; +} + +void ReplicationCoordinatorImpl::signalUpstreamUpdater() { + _externalState->forwardSlaveHandshake(); +} + +ReplicationCoordinatorImpl::SlaveInfo* ReplicationCoordinatorImpl::_findSlaveInfoByMemberID_inlock( + int memberId) { + for (SlaveInfoVector::iterator it = _slaveInfo.begin(); it != _slaveInfo.end(); ++it) { + if (it->memberId == memberId) { + return &(*it); + } + } + return NULL; +} + +ReplicationCoordinatorImpl::SlaveInfo* ReplicationCoordinatorImpl::_findSlaveInfoByRID_inlock( + const OID& rid) { + for (SlaveInfoVector::iterator it = _slaveInfo.begin(); it != _slaveInfo.end(); ++it) { + if (it->rid == rid) { + return &(*it); + } + } + return NULL; +} + +void ReplicationCoordinatorImpl::_addSlaveInfo_inlock(const SlaveInfo& slaveInfo) { + invariant(_getReplicationMode_inlock() == modeMasterSlave); + _slaveInfo.push_back(slaveInfo); + + // Wake up any threads waiting for replication that now have their replication + // check satisfied + _wakeReadyWaiters_inlock(); +} + +void ReplicationCoordinatorImpl::_updateSlaveInfoOptime_inlock(SlaveInfo* slaveInfo, OpTime ts) { + slaveInfo->opTime = ts; + + // Wake up any threads waiting for replication that now have their replication + // check satisfied + _wakeReadyWaiters_inlock(); +} + +void ReplicationCoordinatorImpl::_updateSlaveInfoFromConfig_inlock() { + invariant(_settings.usingReplSets()); + + SlaveInfoVector oldSlaveInfos; + _slaveInfo.swap(oldSlaveInfos); + + if (_selfIndex == -1) { + // If we aren't in the config then the only data we care about is for ourself + for (SlaveInfoVector::const_iterator it = oldSlaveInfos.begin(); it != oldSlaveInfos.end(); + ++it) { + if (it->self) { + SlaveInfo slaveInfo = *it; + slaveInfo.memberId = -1; + _slaveInfo.push_back(slaveInfo); + return; + } } - _isWaitingForDrainToComplete = false; - _canAcceptNonLocalWrites = true; - lk.unlock(); - _externalState->dropAllTempCollections(txn); - log() << "transition to primary complete; database writes are now permitted" << rsLog; + invariant(false); // There should always have been an entry for ourself } - void ReplicationCoordinatorImpl::signalUpstreamUpdater() { - _externalState->forwardSlaveHandshake(); - } + for (int i = 0; i < _rsConfig.getNumMembers(); ++i) { + const MemberConfig& memberConfig = _rsConfig.getMemberAt(i); + int memberId = memberConfig.getId(); + const HostAndPort& memberHostAndPort = memberConfig.getHostAndPort(); - ReplicationCoordinatorImpl::SlaveInfo* - 
ReplicationCoordinatorImpl::_findSlaveInfoByMemberID_inlock(int memberId) { - for (SlaveInfoVector::iterator it = _slaveInfo.begin(); it != _slaveInfo.end(); ++it) { - if (it->memberId == memberId) { - return &(*it); - } - } - return NULL; - } + SlaveInfo slaveInfo; - ReplicationCoordinatorImpl::SlaveInfo* - ReplicationCoordinatorImpl::_findSlaveInfoByRID_inlock(const OID& rid) { - for (SlaveInfoVector::iterator it = _slaveInfo.begin(); it != _slaveInfo.end(); ++it) { - if (it->rid == rid) { - return &(*it); + // Check if the node existed with the same member ID and hostname in the old data + for (SlaveInfoVector::const_iterator it = oldSlaveInfos.begin(); it != oldSlaveInfos.end(); + ++it) { + if ((it->memberId == memberId && it->hostAndPort == memberHostAndPort) || + (i == _selfIndex && it->self)) { + slaveInfo = *it; } } - return NULL; - } - void ReplicationCoordinatorImpl::_addSlaveInfo_inlock(const SlaveInfo& slaveInfo) { - invariant(_getReplicationMode_inlock() == modeMasterSlave); + // Make sure you have the most up-to-date info for member ID and hostAndPort. + slaveInfo.memberId = memberId; + slaveInfo.hostAndPort = memberHostAndPort; _slaveInfo.push_back(slaveInfo); - - // Wake up any threads waiting for replication that now have their replication - // check satisfied - _wakeReadyWaiters_inlock(); } + invariant(static_cast<int>(_slaveInfo.size()) == _rsConfig.getNumMembers()); +} - void ReplicationCoordinatorImpl::_updateSlaveInfoOptime_inlock(SlaveInfo* slaveInfo, - OpTime ts) { - - slaveInfo->opTime = ts; - - // Wake up any threads waiting for replication that now have their replication - // check satisfied - _wakeReadyWaiters_inlock(); - } - - void ReplicationCoordinatorImpl::_updateSlaveInfoFromConfig_inlock() { +size_t ReplicationCoordinatorImpl::_getMyIndexInSlaveInfo_inlock() const { + if (_getReplicationMode_inlock() == modeMasterSlave) { + // Self data always lives in the first entry in _slaveInfo for master/slave + return 0; + } else { invariant(_settings.usingReplSets()); - - SlaveInfoVector oldSlaveInfos; - _slaveInfo.swap(oldSlaveInfos); - if (_selfIndex == -1) { - // If we aren't in the config then the only data we care about is for ourself - for (SlaveInfoVector::const_iterator it = oldSlaveInfos.begin(); - it != oldSlaveInfos.end(); ++it) { - if (it->self) { - SlaveInfo slaveInfo = *it; - slaveInfo.memberId = -1; - _slaveInfo.push_back(slaveInfo); - return; - } - } - invariant(false); // There should always have been an entry for ourself - } - - for (int i = 0; i < _rsConfig.getNumMembers(); ++i) { - const MemberConfig& memberConfig = _rsConfig.getMemberAt(i); - int memberId = memberConfig.getId(); - const HostAndPort& memberHostAndPort = memberConfig.getHostAndPort(); - - SlaveInfo slaveInfo; - - // Check if the node existed with the same member ID and hostname in the old data - for (SlaveInfoVector::const_iterator it = oldSlaveInfos.begin(); - it != oldSlaveInfos.end(); ++it) { - if ((it->memberId == memberId && it->hostAndPort == memberHostAndPort) - || (i == _selfIndex && it->self)) { - slaveInfo = *it; - } - } - - // Make sure you have the most up-to-date info for member ID and hostAndPort. 
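_updateSlaveInfoFromConfig_inlock above rebuilds the progress vector from the new member list, carrying an old entry's optime forward only when it matches by member id and host (or is the self entry). A simplified standalone sketch of that carry-over, with a hypothetical Progress struct in place of SlaveInfo:

#include <iostream>
#include <string>
#include <utility>
#include <vector>

struct Progress {
    int memberId = -1;
    std::string host;
    long long opTime = 0;  // stands in for the replicated OpTime
};

std::vector<Progress> rebuild(const std::vector<Progress>& old,
                              const std::vector<std::pair<int, std::string>>& members) {
    std::vector<Progress> fresh;
    for (const auto& m : members) {
        Progress p;
        for (const auto& o : old) {
            if (o.memberId == m.first && o.host == m.second) {
                p = o;  // known node: keep its replication progress
            }
        }
        p.memberId = m.first;  // always refresh id and host from the config
        p.host = m.second;
        fresh.push_back(p);
    }
    return fresh;
}

int main() {
    std::vector<Progress> old{{1, "a:27017", 42}, {2, "b:27017", 40}};
    for (const auto& p : rebuild(old, {{1, "a:27017"}, {3, "c:27017"}})) {
        std::cout << p.memberId << " " << p.host << " " << p.opTime << '\n';
    }
}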
- slaveInfo.memberId = memberId; - slaveInfo.hostAndPort = memberHostAndPort; - _slaveInfo.push_back(slaveInfo); - } - invariant(static_cast<int>(_slaveInfo.size()) == _rsConfig.getNumMembers()); - } - - size_t ReplicationCoordinatorImpl::_getMyIndexInSlaveInfo_inlock() const { - if (_getReplicationMode_inlock() == modeMasterSlave) { - // Self data always lives in the first entry in _slaveInfo for master/slave + invariant(_slaveInfo.size() == 1); return 0; - } - else { - invariant(_settings.usingReplSets()); - if (_selfIndex == -1) { - invariant(_slaveInfo.size() == 1); - return 0; - } - else { - return _selfIndex; - } + } else { + return _selfIndex; } } +} - Status ReplicationCoordinatorImpl::setLastOptimeForSlave(const OID& rid, - const OpTime& ts) { - boost::unique_lock<boost::mutex> lock(_mutex); - massert(28576, - "Received an old style replication progress update, which is only used for Master/" - "Slave replication now, but this node is not using Master/Slave replication. " - "This is likely caused by an old (pre-2.6) member syncing from this node.", - _getReplicationMode_inlock() == modeMasterSlave); +Status ReplicationCoordinatorImpl::setLastOptimeForSlave(const OID& rid, const OpTime& ts) { + boost::unique_lock<boost::mutex> lock(_mutex); + massert(28576, + "Received an old style replication progress update, which is only used for Master/" + "Slave replication now, but this node is not using Master/Slave replication. " + "This is likely caused by an old (pre-2.6) member syncing from this node.", + _getReplicationMode_inlock() == modeMasterSlave); - SlaveInfo* slaveInfo = _findSlaveInfoByRID_inlock(rid); - if (slaveInfo) { - if (slaveInfo->opTime < ts) { - _updateSlaveInfoOptime_inlock(slaveInfo, ts); - } - } - else { - SlaveInfo newSlaveInfo; - newSlaveInfo.rid = rid; - newSlaveInfo.opTime = ts; - _addSlaveInfo_inlock(newSlaveInfo); + SlaveInfo* slaveInfo = _findSlaveInfoByRID_inlock(rid); + if (slaveInfo) { + if (slaveInfo->opTime < ts) { + _updateSlaveInfoOptime_inlock(slaveInfo, ts); } + } else { + SlaveInfo newSlaveInfo; + newSlaveInfo.rid = rid; + newSlaveInfo.opTime = ts; + _addSlaveInfo_inlock(newSlaveInfo); + } + return Status::OK(); +} + +void ReplicationCoordinatorImpl::setMyHeartbeatMessage(const std::string& msg) { + CBHStatus cbh = _replExecutor.scheduleWork(stdx::bind( + &TopologyCoordinator::setMyHeartbeatMessage, _topCoord.get(), _replExecutor.now(), msg)); + if (cbh.getStatus() == ErrorCodes::ShutdownInProgress) { + return; + } + fassert(28540, cbh.getStatus()); + _replExecutor.wait(cbh.getValue()); +} + +void ReplicationCoordinatorImpl::setMyLastOptime(const OpTime& ts) { + boost::unique_lock<boost::mutex> lock(_mutex); + _setMyLastOptime_inlock(&lock, ts, false); +} + +void ReplicationCoordinatorImpl::resetMyLastOptime() { + boost::unique_lock<boost::mutex> lock(_mutex); + _setMyLastOptime_inlock(&lock, OpTime(), true); +} + +void ReplicationCoordinatorImpl::_setMyLastOptime_inlock(boost::unique_lock<boost::mutex>* lock, + const OpTime& ts, + bool isRollbackAllowed) { + invariant(lock->owns_lock()); + SlaveInfo* mySlaveInfo = &_slaveInfo[_getMyIndexInSlaveInfo_inlock()]; + invariant(isRollbackAllowed || mySlaveInfo->opTime <= ts); + _updateSlaveInfoOptime_inlock(mySlaveInfo, ts); + + if (_getReplicationMode_inlock() != modeReplSet) { + return; + } + if (_getMemberState_inlock().primary()) { + return; + } + lock->unlock(); + _externalState->forwardSlaveProgress(); // Must do this outside _mutex +} + +OpTime ReplicationCoordinatorImpl::getMyLastOptime() const { + 
boost::lock_guard<boost::mutex> lock(_mutex); + return _getMyLastOptime_inlock(); +} + +OpTime ReplicationCoordinatorImpl::_getMyLastOptime_inlock() const { + return _slaveInfo[_getMyIndexInSlaveInfo_inlock()].opTime; +} + +Status ReplicationCoordinatorImpl::setLastOptime_forTest(const OID& rid, const OpTime& ts) { + boost::lock_guard<boost::mutex> lock(_mutex); + invariant(_getReplicationMode_inlock() == modeReplSet); + + const UpdatePositionArgs::UpdateInfo update(rid, ts, -1, -1); + return _setLastOptime_inlock(update); +} + +Status ReplicationCoordinatorImpl::_setLastOptime_inlock( + const UpdatePositionArgs::UpdateInfo& args) { + if (_selfIndex == -1) { + // Ignore updates when we're in state REMOVED + return Status(ErrorCodes::NotMasterOrSecondaryCode, + "Received replSetUpdatePosition command but we are in state REMOVED"); + } + invariant(_getReplicationMode_inlock() == modeReplSet); + + if (args.rid == _getMyRID_inlock() || + args.memberId == _rsConfig.getMemberAt(_selfIndex).getId()) { + // Do not let remote nodes tell us what our optime is. return Status::OK(); } - void ReplicationCoordinatorImpl::setMyHeartbeatMessage(const std::string& msg) { - CBHStatus cbh = _replExecutor.scheduleWork( - stdx::bind(&TopologyCoordinator::setMyHeartbeatMessage, - _topCoord.get(), - _replExecutor.now(), - msg)); - if (cbh.getStatus() == ErrorCodes::ShutdownInProgress) { - return; - } - fassert(28540, cbh.getStatus()); - _replExecutor.wait(cbh.getValue()); - } - - void ReplicationCoordinatorImpl::setMyLastOptime(const OpTime& ts) { - boost::unique_lock<boost::mutex> lock(_mutex); - _setMyLastOptime_inlock(&lock, ts, false); - } - - void ReplicationCoordinatorImpl::resetMyLastOptime() { - boost::unique_lock<boost::mutex> lock(_mutex); - _setMyLastOptime_inlock(&lock, OpTime(), true); - } + LOG(2) << "received notification that node with RID " << args.rid + << " has reached optime: " << args.ts; - void ReplicationCoordinatorImpl::_setMyLastOptime_inlock( - boost::unique_lock<boost::mutex>* lock, const OpTime& ts, bool isRollbackAllowed) { - invariant(lock->owns_lock()); - SlaveInfo* mySlaveInfo = &_slaveInfo[_getMyIndexInSlaveInfo_inlock()]; - invariant(isRollbackAllowed || mySlaveInfo->opTime <= ts); - _updateSlaveInfoOptime_inlock(mySlaveInfo, ts); - - if (_getReplicationMode_inlock() != modeReplSet) { - return; + SlaveInfo* slaveInfo = NULL; + if (args.memberId >= 0) { + if (args.cfgver != _rsConfig.getConfigVersion()) { + std::string errmsg = str::stream() + << "Received replSetUpdatePosition for node with memberId " << args.memberId + << " whose config version of " << args.cfgver + << " doesn't match our config version of " << _rsConfig.getConfigVersion(); + LOG(1) << errmsg; + return Status(ErrorCodes::InvalidReplicaSetConfig, errmsg); } - if (_getMemberState_inlock().primary()) { + + slaveInfo = _findSlaveInfoByMemberID_inlock(args.memberId); + if (!slaveInfo) { + invariant(!_rsConfig.findMemberByID(args.memberId)); + + std::string errmsg = str::stream() + << "Received replSetUpdatePosition for node with memberId " << args.memberId + << " which doesn't exist in our config"; + LOG(1) << errmsg; + return Status(ErrorCodes::NodeNotFound, errmsg); + } + } else { + // The command we received didn't contain a memberId, most likely this is because it + // came from a member running something prior to 3.0. + // Fall back to finding the node by RID. 
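_setLastOptime_inlock above resolves an incoming update by memberId first, rejecting it when the sender's config version does not match ours, and only falls back to the legacy RID lookup for pre-3.0 peers; in both cases a remote optime is only ever moved forward. A standalone sketch of that lookup order, with simplified stand-ins for UpdatePositionArgs and the slave-info table:

#include <iostream>
#include <map>
#include <string>

struct Update {
    int memberId;     // -1 when the sender predates 3.0
    std::string rid;  // legacy replica id
    long long cfgver;
    long long ts;
};

const long long kOurConfigVersion = 3;
std::map<int, long long> optimeByMemberId = {{1, 10}};
std::map<std::string, long long> optimeByRid = {{"abc", 10}};

bool applyUpdate(const Update& u) {
    if (u.memberId >= 0) {
        if (u.cfgver != kOurConfigVersion) {
            return false;  // InvalidReplicaSetConfig: version mismatch
        }
        auto it = optimeByMemberId.find(u.memberId);
        if (it == optimeByMemberId.end()) {
            return false;  // NodeNotFound: not in our config
        }
        if (it->second < u.ts) {
            it->second = u.ts;  // only update remote optimes if they increase
        }
        return true;
    }
    auto it = optimeByRid.find(u.rid);  // legacy path for pre-3.0 senders
    if (it == optimeByRid.end()) {
        return false;  // NodeNotFound: no handshake received yet
    }
    if (it->second < u.ts) {
        it->second = u.ts;
    }
    return true;
}

int main() {
    std::cout << applyUpdate({1, "", 3, 12}) << '\n';      // memberId path
    std::cout << applyUpdate({-1, "abc", 0, 15}) << '\n';  // RID fallback
}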
+ slaveInfo = _findSlaveInfoByRID_inlock(args.rid); + if (!slaveInfo) { + std::string errmsg = str::stream() + << "Received replSetUpdatePosition for node with RID " << args.rid + << ", but we haven't yet received a handshake for that node."; + LOG(1) << errmsg; + return Status(ErrorCodes::NodeNotFound, errmsg); + } + invariant(slaveInfo->memberId >= 0); + } + invariant(slaveInfo); + invariant(args.memberId < 0 || args.memberId == slaveInfo->memberId); + + LOG(3) << "Node with RID " << args.rid << " and memberId " << slaveInfo->memberId + << " currently has optime " << slaveInfo->opTime << "; updating to " << args.ts; + + // Only update remote optimes if they increase. + if (slaveInfo->opTime < args.ts) { + _updateSlaveInfoOptime_inlock(slaveInfo, args.ts); + } + return Status::OK(); +} + +void ReplicationCoordinatorImpl::interrupt(unsigned opId) { + boost::lock_guard<boost::mutex> lk(_mutex); + for (std::vector<WaiterInfo*>::iterator it = _replicationWaiterList.begin(); + it != _replicationWaiterList.end(); + ++it) { + WaiterInfo* info = *it; + if (info->opID == opId) { + info->condVar->notify_all(); return; } - lock->unlock(); - _externalState->forwardSlaveProgress(); // Must do this outside _mutex } - OpTime ReplicationCoordinatorImpl::getMyLastOptime() const { - boost::lock_guard<boost::mutex> lock(_mutex); - return _getMyLastOptime_inlock(); - } + _replExecutor.scheduleWork( + stdx::bind(&ReplicationCoordinatorImpl::_signalStepDownWaitersFromCallback, + this, + stdx::placeholders::_1)); +} - OpTime ReplicationCoordinatorImpl::_getMyLastOptime_inlock() const { - return _slaveInfo[_getMyIndexInSlaveInfo_inlock()].opTime; +void ReplicationCoordinatorImpl::interruptAll() { + boost::lock_guard<boost::mutex> lk(_mutex); + for (std::vector<WaiterInfo*>::iterator it = _replicationWaiterList.begin(); + it != _replicationWaiterList.end(); + ++it) { + WaiterInfo* info = *it; + info->condVar->notify_all(); } - Status ReplicationCoordinatorImpl::setLastOptime_forTest(const OID& rid, const OpTime& ts) { - boost::lock_guard<boost::mutex> lock(_mutex); - invariant(_getReplicationMode_inlock() == modeReplSet); + _replExecutor.scheduleWork( + stdx::bind(&ReplicationCoordinatorImpl::_signalStepDownWaitersFromCallback, + this, + stdx::placeholders::_1)); +} - const UpdatePositionArgs::UpdateInfo update(rid, ts, -1, -1); - return _setLastOptime_inlock(update); +bool ReplicationCoordinatorImpl::_doneWaitingForReplication_inlock( + const OpTime& opTime, const WriteConcernOptions& writeConcern) { + Status status = _checkIfWriteConcernCanBeSatisfied_inlock(writeConcern); + if (!status.isOK()) { + return true; } - Status ReplicationCoordinatorImpl::_setLastOptime_inlock( - const UpdatePositionArgs::UpdateInfo& args) { - - if (_selfIndex == -1) { - // Ignore updates when we're in state REMOVED - return Status(ErrorCodes::NotMasterOrSecondaryCode, - "Received replSetUpdatePosition command but we are in state REMOVED"); - } - invariant(_getReplicationMode_inlock() == modeReplSet); - - if (args.rid == _getMyRID_inlock() || - args.memberId == _rsConfig.getMemberAt(_selfIndex).getId()) { - // Do not let remote nodes tell us what our optime is. 
- return Status::OK(); - } - - LOG(2) << "received notification that node with RID " << args.rid << - " has reached optime: " << args.ts; - - SlaveInfo* slaveInfo = NULL; - if (args.memberId >= 0) { - if (args.cfgver != _rsConfig.getConfigVersion()) { - std::string errmsg = str::stream() - << "Received replSetUpdatePosition for node with memberId " - << args.memberId << " whose config version of " << args.cfgver - << " doesn't match our config version of " - << _rsConfig.getConfigVersion(); - LOG(1) << errmsg; - return Status(ErrorCodes::InvalidReplicaSetConfig, errmsg); - } - - slaveInfo = _findSlaveInfoByMemberID_inlock(args.memberId); - if (!slaveInfo) { - invariant(!_rsConfig.findMemberByID(args.memberId)); - - std::string errmsg = str::stream() - << "Received replSetUpdatePosition for node with memberId " - << args.memberId << " which doesn't exist in our config"; - LOG(1) << errmsg; - return Status(ErrorCodes::NodeNotFound, errmsg); - } - } - else { - // The command we received didn't contain a memberId, most likely this is because it - // came from a member running something prior to 3.0. - // Fall back to finding the node by RID. - slaveInfo = _findSlaveInfoByRID_inlock(args.rid); - if (!slaveInfo) { - std::string errmsg = str::stream() - << "Received replSetUpdatePosition for node with RID " << args.rid - << ", but we haven't yet received a handshake for that node."; - LOG(1) << errmsg; - return Status(ErrorCodes::NodeNotFound, errmsg); - } - invariant(slaveInfo->memberId >= 0); + if (!writeConcern.wMode.empty()) { + StringData patternName; + if (writeConcern.wMode == "majority") { + patternName = "$majority"; + } else { + patternName = writeConcern.wMode; } - invariant(slaveInfo); - invariant(args.memberId < 0 || args.memberId == slaveInfo->memberId); - - LOG(3) << "Node with RID " << args.rid << " and memberId " << slaveInfo->memberId - << " currently has optime " << slaveInfo->opTime << "; updating to " << args.ts; - - // Only update remote optimes if they increase. - if (slaveInfo->opTime < args.ts) { - _updateSlaveInfoOptime_inlock(slaveInfo, args.ts); + StatusWith<ReplicaSetTagPattern> tagPattern = _rsConfig.findCustomWriteMode(patternName); + if (!tagPattern.isOK()) { + return true; } - return Status::OK(); + return _haveTaggedNodesReachedOpTime_inlock(opTime, tagPattern.getValue()); + } else { + return _haveNumNodesReachedOpTime_inlock(opTime, writeConcern.wNumNodes); } +} - void ReplicationCoordinatorImpl::interrupt(unsigned opId) { - boost::lock_guard<boost::mutex> lk(_mutex); - for (std::vector<WaiterInfo*>::iterator it = _replicationWaiterList.begin(); - it != _replicationWaiterList.end(); ++it) { - WaiterInfo* info = *it; - if (info->opID == opId) { - info->condVar->notify_all(); - return; - } - } - - _replExecutor.scheduleWork( - stdx::bind(&ReplicationCoordinatorImpl::_signalStepDownWaitersFromCallback, - this, - stdx::placeholders::_1)); +bool ReplicationCoordinatorImpl::_haveNumNodesReachedOpTime_inlock(const OpTime& opTime, + int numNodes) { + if (_getMyLastOptime_inlock() < opTime) { + // Secondaries that are for some reason ahead of us should not allow us to + // satisfy a write concern if we aren't caught up ourselves. 
+ return false; } - void ReplicationCoordinatorImpl::interruptAll() { - boost::lock_guard<boost::mutex> lk(_mutex); - for (std::vector<WaiterInfo*>::iterator it = _replicationWaiterList.begin(); - it != _replicationWaiterList.end(); ++it) { - WaiterInfo* info = *it; - info->condVar->notify_all(); + for (SlaveInfoVector::iterator it = _slaveInfo.begin(); it != _slaveInfo.end(); ++it) { + const OpTime& slaveTime = it->opTime; + if (slaveTime >= opTime) { + --numNodes; } - _replExecutor.scheduleWork( - stdx::bind(&ReplicationCoordinatorImpl::_signalStepDownWaitersFromCallback, - this, - stdx::placeholders::_1)); - } - - bool ReplicationCoordinatorImpl::_doneWaitingForReplication_inlock( - const OpTime& opTime, const WriteConcernOptions& writeConcern) { - Status status = _checkIfWriteConcernCanBeSatisfied_inlock(writeConcern); - if (!status.isOK()) { + if (numNodes <= 0) { return true; } - - if (!writeConcern.wMode.empty()) { - StringData patternName; - if (writeConcern.wMode == "majority") { - patternName = "$majority"; - } - else { - patternName = writeConcern.wMode; - } - StatusWith<ReplicaSetTagPattern> tagPattern = - _rsConfig.findCustomWriteMode(patternName); - if (!tagPattern.isOK()) { - return true; - } - return _haveTaggedNodesReachedOpTime_inlock(opTime, tagPattern.getValue()); - } - else { - return _haveNumNodesReachedOpTime_inlock(opTime, writeConcern.wNumNodes); - } - } - - bool ReplicationCoordinatorImpl::_haveNumNodesReachedOpTime_inlock(const OpTime& opTime, - int numNodes) { - if (_getMyLastOptime_inlock() < opTime) { - // Secondaries that are for some reason ahead of us should not allow us to - // satisfy a write concern if we aren't caught up ourselves. - return false; - } - - for (SlaveInfoVector::iterator it = _slaveInfo.begin(); - it != _slaveInfo.end(); ++it) { - - const OpTime& slaveTime = it->opTime; - if (slaveTime >= opTime) { - --numNodes; - } - - if (numNodes <= 0) { - return true; - } - } - return false; } - - bool ReplicationCoordinatorImpl::_haveTaggedNodesReachedOpTime_inlock( - const OpTime& opTime, const ReplicaSetTagPattern& tagPattern) { - - ReplicaSetTagMatch matcher(tagPattern); - for (SlaveInfoVector::iterator it = _slaveInfo.begin(); - it != _slaveInfo.end(); ++it) { - - const OpTime& slaveTime = it->opTime; - if (slaveTime >= opTime) { - // This node has reached the desired optime, now we need to check if it is a part - // of the tagPattern. - const MemberConfig* memberConfig = _rsConfig.findMemberByID(it->memberId); - invariant(memberConfig); - for (MemberConfig::TagIterator it = memberConfig->tagsBegin(); - it != memberConfig->tagsEnd(); ++it) { - if (matcher.update(*it)) { - return true; - } + return false; +} + +bool ReplicationCoordinatorImpl::_haveTaggedNodesReachedOpTime_inlock( + const OpTime& opTime, const ReplicaSetTagPattern& tagPattern) { + ReplicaSetTagMatch matcher(tagPattern); + for (SlaveInfoVector::iterator it = _slaveInfo.begin(); it != _slaveInfo.end(); ++it) { + const OpTime& slaveTime = it->opTime; + if (slaveTime >= opTime) { + // This node has reached the desired optime, now we need to check if it is a part + // of the tagPattern. 
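+            // (matcher.update() reports true once the tags seen so far satisfy
+            // every constraint in tagPattern, so we can stop at the first match.)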
+ const MemberConfig* memberConfig = _rsConfig.findMemberByID(it->memberId); + invariant(memberConfig); + for (MemberConfig::TagIterator it = memberConfig->tagsBegin(); + it != memberConfig->tagsEnd(); + ++it) { + if (matcher.update(*it)) { + return true; } } } - return false; } - - ReplicationCoordinator::StatusAndDuration ReplicationCoordinatorImpl::awaitReplication( - const OperationContext* txn, - const OpTime& opTime, - const WriteConcernOptions& writeConcern) { - Timer timer; - boost::unique_lock<boost::mutex> lock(_mutex); - return _awaitReplication_inlock(&timer, &lock, txn, opTime, writeConcern); + return false; +} + +ReplicationCoordinator::StatusAndDuration ReplicationCoordinatorImpl::awaitReplication( + const OperationContext* txn, const OpTime& opTime, const WriteConcernOptions& writeConcern) { + Timer timer; + boost::unique_lock<boost::mutex> lock(_mutex); + return _awaitReplication_inlock(&timer, &lock, txn, opTime, writeConcern); +} + +ReplicationCoordinator::StatusAndDuration +ReplicationCoordinatorImpl::awaitReplicationOfLastOpForClient( + const OperationContext* txn, const WriteConcernOptions& writeConcern) { + Timer timer; + boost::unique_lock<boost::mutex> lock(_mutex); + return _awaitReplication_inlock( + &timer, &lock, txn, txn->getClient()->getLastOp(), writeConcern); +} + +ReplicationCoordinator::StatusAndDuration ReplicationCoordinatorImpl::_awaitReplication_inlock( + const Timer* timer, + boost::unique_lock<boost::mutex>* lock, + const OperationContext* txn, + const OpTime& opTime, + const WriteConcernOptions& writeConcern) { + const Mode replMode = _getReplicationMode_inlock(); + if (replMode == modeNone || serverGlobalParams.configsvr) { + // no replication check needed (validated above) + return StatusAndDuration(Status::OK(), Milliseconds(timer->millis())); } - ReplicationCoordinator::StatusAndDuration - ReplicationCoordinatorImpl::awaitReplicationOfLastOpForClient( - const OperationContext* txn, - const WriteConcernOptions& writeConcern) { - Timer timer; - boost::unique_lock<boost::mutex> lock(_mutex); - return _awaitReplication_inlock( - &timer, &lock, txn, txn->getClient()->getLastOp(), writeConcern); + if (replMode == modeMasterSlave && writeConcern.wMode == "majority") { + // with master/slave, majority is equivalent to w=1 + return StatusAndDuration(Status::OK(), Milliseconds(timer->millis())); } - ReplicationCoordinator::StatusAndDuration ReplicationCoordinatorImpl::_awaitReplication_inlock( - const Timer* timer, - boost::unique_lock<boost::mutex>* lock, - const OperationContext* txn, - const OpTime& opTime, - const WriteConcernOptions& writeConcern) { - - const Mode replMode = _getReplicationMode_inlock(); - if (replMode == modeNone || serverGlobalParams.configsvr) { - // no replication check needed (validated above) - return StatusAndDuration(Status::OK(), Milliseconds(timer->millis())); - } - - if (replMode == modeMasterSlave && writeConcern.wMode == "majority") { - // with master/slave, majority is equivalent to w=1 - return StatusAndDuration(Status::OK(), Milliseconds(timer->millis())); - } - - if (opTime.isNull()) { - // If waiting for the empty optime, always say it's been replicated. 
- return StatusAndDuration(Status::OK(), Milliseconds(timer->millis())); - } - - if (replMode == modeReplSet && !_memberState.primary()) { - return StatusAndDuration(Status(ErrorCodes::NotMaster, - "Not master while waiting for replication"), - Milliseconds(timer->millis())); - } - - if (writeConcern.wMode.empty()) { - if (writeConcern.wNumNodes < 1) { - return StatusAndDuration(Status::OK(), Milliseconds(timer->millis())); - } - else if (writeConcern.wNumNodes == 1 && _getMyLastOptime_inlock() >= opTime) { - return StatusAndDuration(Status::OK(), Milliseconds(timer->millis())); - } - } - - // Must hold _mutex before constructing waitInfo as it will modify _replicationWaiterList - boost::condition_variable condVar; - WaiterInfo waitInfo( - &_replicationWaiterList, txn->getOpID(), &opTime, &writeConcern, &condVar); - while (!_doneWaitingForReplication_inlock(opTime, writeConcern)) { - const int elapsed = timer->millis(); - - Status interruptedStatus = txn->checkForInterruptNoAssert(); - if (!interruptedStatus.isOK()) { - return StatusAndDuration(interruptedStatus, Milliseconds(elapsed)); - } - - if (!waitInfo.master) { - return StatusAndDuration(Status(ErrorCodes::NotMaster, - "Not master anymore while waiting for replication" - " - this most likely means that a step down" - " occurred while waiting for replication"), - Milliseconds(elapsed)); - } - - if (writeConcern.wTimeout != WriteConcernOptions::kNoTimeout && - elapsed > writeConcern.wTimeout) { - return StatusAndDuration(Status(ErrorCodes::ExceededTimeLimit, - "waiting for replication timed out"), - Milliseconds(elapsed)); - } - - if (_inShutdown) { - return StatusAndDuration(Status(ErrorCodes::ShutdownInProgress, - "Replication is being shut down"), - Milliseconds(elapsed)); - } - - try { - if (writeConcern.wTimeout == WriteConcernOptions::kNoTimeout) { - condVar.wait(*lock); - } - else { - condVar.timed_wait(*lock, Milliseconds(writeConcern.wTimeout - elapsed)); - } - } catch (const boost::thread_interrupted&) {} - } - - Status status = _checkIfWriteConcernCanBeSatisfied_inlock(writeConcern); - if (!status.isOK()) { - return StatusAndDuration(status, Milliseconds(timer->millis())); - } - + if (opTime.isNull()) { + // If waiting for the empty optime, always say it's been replicated. return StatusAndDuration(Status::OK(), Milliseconds(timer->millis())); } - Status ReplicationCoordinatorImpl::stepDown(OperationContext* txn, - bool force, - const Milliseconds& waitTime, - const Milliseconds& stepdownTime) { - const Date_t startTime = _replExecutor.now(); - const Date_t stepDownUntil(startTime.millis + stepdownTime.total_milliseconds()); - const Date_t waitUntil(startTime.millis + waitTime.total_milliseconds()); - - if (!getMemberState().primary()) { - // Note this check is inherently racy - it's always possible for the node to - // stepdown from some other path before we acquire the global shared lock, but - // that's okay because we are resiliant to that happening in _stepDownContinue. - return Status(ErrorCodes::NotMaster, "not primary so can't step down"); - } - - LockResult lockState = txn->lockState()->lockGlobalBegin(MODE_S); - // We've requested the global shared lock which will stop new writes from coming in, - // but existing writes could take a long time to finish, so kill all user operations - // to help us get the global lock faster. 
- _externalState->killAllUserOperations(txn); - - if (lockState == LOCK_WAITING) { - lockState = txn->lockState()->lockGlobalComplete(stepdownTime.total_milliseconds()); - if (lockState == LOCK_TIMEOUT) { - return Status(ErrorCodes::ExceededTimeLimit, - "Could not acquire the global shared lock within the amount of time " - "specified that we should step down for"); - } - } - invariant(lockState == LOCK_OK); - ON_BLOCK_EXIT(&Locker::unlockAll, txn->lockState()); - // From this point onward we are guaranteed to be holding the global shared lock. - - StatusWith<ReplicationExecutor::EventHandle> finishedEvent = _replExecutor.makeEvent(); - if (finishedEvent.getStatus() == ErrorCodes::ShutdownInProgress) { - return finishedEvent.getStatus(); - } - fassert(26000, finishedEvent.getStatus()); - Status result(ErrorCodes::InternalError, "didn't set status in _stepDownContinue"); - CBHStatus cbh = _replExecutor.scheduleWork( - stdx::bind(&ReplicationCoordinatorImpl::_stepDownContinue, - this, - stdx::placeholders::_1, - finishedEvent.getValue(), - txn, - waitUntil, - stepDownUntil, - force, - &result)); - if (cbh.getStatus() == ErrorCodes::ShutdownInProgress) { - return cbh.getStatus(); - } - fassert(18809, cbh.getStatus()); - cbh = _replExecutor.scheduleWorkAt( - waitUntil, - stdx::bind(&ReplicationCoordinatorImpl::_signalStepDownWaitersFromCallback, - this, - stdx::placeholders::_1)); - if (cbh.getStatus() == ErrorCodes::ShutdownInProgress) { - return cbh.getStatus(); - } - fassert(26001, cbh.getStatus()); - _replExecutor.waitForEvent(finishedEvent.getValue()); - return result; + if (replMode == modeReplSet && !_memberState.primary()) { + return StatusAndDuration( + Status(ErrorCodes::NotMaster, "Not master while waiting for replication"), + Milliseconds(timer->millis())); } - void ReplicationCoordinatorImpl::_signalStepDownWaitersFromCallback( - const ReplicationExecutor::CallbackData& cbData) { - if (!cbData.status.isOK()) { - return; - } - - _signalStepDownWaiters(); - } - - void ReplicationCoordinatorImpl::_signalStepDownWaiters() { - std::for_each(_stepDownWaiters.begin(), - _stepDownWaiters.end(), - stdx::bind(&ReplicationExecutor::signalEvent, - &_replExecutor, - stdx::placeholders::_1)); - _stepDownWaiters.clear(); - } - - void ReplicationCoordinatorImpl::_stepDownContinue( - const ReplicationExecutor::CallbackData& cbData, - const ReplicationExecutor::EventHandle finishedEvent, - OperationContext* txn, - const Date_t waitUntil, - const Date_t stepDownUntil, - bool force, - Status* result) { - if (cbData.status == ErrorCodes::CallbackCanceled) { - // Cancelation only occurs on shutdown, which will also handle signaling the event. 
- *result = Status(ErrorCodes::ShutdownInProgress, "Shutting down replication"); - return; + if (writeConcern.wMode.empty()) { + if (writeConcern.wNumNodes < 1) { + return StatusAndDuration(Status::OK(), Milliseconds(timer->millis())); + } else if (writeConcern.wNumNodes == 1 && _getMyLastOptime_inlock() >= opTime) { + return StatusAndDuration(Status::OK(), Milliseconds(timer->millis())); } + } - ScopeGuard allFinishedGuard = MakeGuard( - stdx::bind(&ReplicationExecutor::signalEvent, &_replExecutor, finishedEvent)); - if (!cbData.status.isOK()) { - *result = cbData.status; - return; - } + // Must hold _mutex before constructing waitInfo as it will modify _replicationWaiterList + boost::condition_variable condVar; + WaiterInfo waitInfo(&_replicationWaiterList, txn->getOpID(), &opTime, &writeConcern, &condVar); + while (!_doneWaitingForReplication_inlock(opTime, writeConcern)) { + const int elapsed = timer->millis(); Status interruptedStatus = txn->checkForInterruptNoAssert(); if (!interruptedStatus.isOK()) { - *result = interruptedStatus; - return; + return StatusAndDuration(interruptedStatus, Milliseconds(elapsed)); } - if (_topCoord->getRole() != TopologyCoordinator::Role::leader) { - *result = Status(ErrorCodes::NotMaster, - "Already stepped down from primary while processing step down " - "request"); - return; - } - const Date_t now = _replExecutor.now(); - if (now >= stepDownUntil) { - *result = Status(ErrorCodes::ExceededTimeLimit, - "By the time we were ready to step down, we were already past the " - "time we were supposed to step down until"); - return; + if (!waitInfo.master) { + return StatusAndDuration(Status(ErrorCodes::NotMaster, + "Not master anymore while waiting for replication" + " - this most likely means that a step down" + " occurred while waiting for replication"), + Milliseconds(elapsed)); } - bool forceNow = now >= waitUntil ? force : false; - if (_topCoord->stepDown(stepDownUntil, forceNow, getMyLastOptime())) { - // Schedule work to (potentially) step back up once the stepdown period has ended. - _replExecutor.scheduleWorkAt(stepDownUntil, - stdx::bind(&ReplicationCoordinatorImpl::_handleTimePassing, - this, - stdx::placeholders::_1)); - boost::unique_lock<boost::mutex> lk(_mutex); - const PostMemberStateUpdateAction action = - _updateMemberStateFromTopologyCoordinator_inlock(); - lk.unlock(); - _performPostMemberStateUpdateAction(action); - *result = Status::OK(); - return; + if (writeConcern.wTimeout != WriteConcernOptions::kNoTimeout && + elapsed > writeConcern.wTimeout) { + return StatusAndDuration( + Status(ErrorCodes::ExceededTimeLimit, "waiting for replication timed out"), + Milliseconds(elapsed)); } - // Step down failed. Keep waiting if we can, otherwise finish. 
-        if (now >= waitUntil) {
-            *result = Status(ErrorCodes::ExceededTimeLimit, str::stream() <<
-                "No electable secondaries caught up as of " <<
-                dateToISOStringLocal(now));
-            return;
+        if (_inShutdown) {
+            return StatusAndDuration(
+                Status(ErrorCodes::ShutdownInProgress, "Replication is being shut down"),
+                Milliseconds(elapsed));
         }

-        if (_stepDownWaiters.empty()) {
-            StatusWith<ReplicationExecutor::EventHandle> reschedEvent =
-                _replExecutor.makeEvent();
-            if (!reschedEvent.isOK()) {
-                *result = reschedEvent.getStatus();
-                return;
+        try {
+            if (writeConcern.wTimeout == WriteConcernOptions::kNoTimeout) {
+                condVar.wait(*lock);
+            } else {
+                condVar.timed_wait(*lock, Milliseconds(writeConcern.wTimeout - elapsed));
             }
-            _stepDownWaiters.push_back(reschedEvent.getValue());
-        }
-        CBHStatus cbh = _replExecutor.onEvent(
-            _stepDownWaiters.back(),
-            stdx::bind(&ReplicationCoordinatorImpl::_stepDownContinue,
-                       this,
-                       stdx::placeholders::_1,
-                       finishedEvent,
-                       txn,
-                       waitUntil,
-                       stepDownUntil,
-                       force,
-                       result));
-        if (!cbh.isOK()) {
-            *result = cbh.getStatus();
-            return;
-        }
-        allFinishedGuard.Dismiss();
-    }
-
-    void ReplicationCoordinatorImpl::_handleTimePassing(
-        const ReplicationExecutor::CallbackData& cbData) {
-        if (!cbData.status.isOK()) {
-            return;
-        }
+        } catch (const boost::thread_interrupted&) {
+        }
+    }
+
+    Status status = _checkIfWriteConcernCanBeSatisfied_inlock(writeConcern);
+    if (!status.isOK()) {
+        return StatusAndDuration(status, Milliseconds(timer->millis()));
+    }
+
+    return StatusAndDuration(Status::OK(), Milliseconds(timer->millis()));
+}
+
+Status ReplicationCoordinatorImpl::stepDown(OperationContext* txn,
+                                            bool force,
+                                            const Milliseconds& waitTime,
+                                            const Milliseconds& stepdownTime) {
+    const Date_t startTime = _replExecutor.now();
+    const Date_t stepDownUntil(startTime.millis + stepdownTime.total_milliseconds());
+    const Date_t waitUntil(startTime.millis + waitTime.total_milliseconds());
+
+    if (!getMemberState().primary()) {
+        // Note this check is inherently racy - it's always possible for the node to
+        // step down from some other path before we acquire the global shared lock, but
+        // that's okay because we are resilient to that happening in _stepDownContinue.
+        return Status(ErrorCodes::NotMaster, "not primary so can't step down");
+    }
+
+    LockResult lockState = txn->lockState()->lockGlobalBegin(MODE_S);
+    // We've requested the global shared lock which will stop new writes from coming in,
+    // but existing writes could take a long time to finish, so kill all user operations
+    // to help us get the global lock faster.
+    _externalState->killAllUserOperations(txn);
+
+    if (lockState == LOCK_WAITING) {
+        lockState = txn->lockState()->lockGlobalComplete(stepdownTime.total_milliseconds());
+        if (lockState == LOCK_TIMEOUT) {
+            return Status(ErrorCodes::ExceededTimeLimit,
+                          "Could not acquire the global shared lock within the amount of time "
+                          "specified that we should step down for");
+        }
+    }
+    invariant(lockState == LOCK_OK);
+    ON_BLOCK_EXIT(&Locker::unlockAll, txn->lockState());
+    // From this point onward we are guaranteed to be holding the global shared lock.
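+    // (The shared lock conflicts with all write locks, so no new writes can commit
+    // while the stepdown work scheduled below runs on the executor.)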
+ + StatusWith<ReplicationExecutor::EventHandle> finishedEvent = _replExecutor.makeEvent(); + if (finishedEvent.getStatus() == ErrorCodes::ShutdownInProgress) { + return finishedEvent.getStatus(); + } + fassert(26000, finishedEvent.getStatus()); + Status result(ErrorCodes::InternalError, "didn't set status in _stepDownContinue"); + CBHStatus cbh = + _replExecutor.scheduleWork(stdx::bind(&ReplicationCoordinatorImpl::_stepDownContinue, + this, + stdx::placeholders::_1, + finishedEvent.getValue(), + txn, + waitUntil, + stepDownUntil, + force, + &result)); + if (cbh.getStatus() == ErrorCodes::ShutdownInProgress) { + return cbh.getStatus(); + } + fassert(18809, cbh.getStatus()); + cbh = _replExecutor.scheduleWorkAt( + waitUntil, + stdx::bind(&ReplicationCoordinatorImpl::_signalStepDownWaitersFromCallback, + this, + stdx::placeholders::_1)); + if (cbh.getStatus() == ErrorCodes::ShutdownInProgress) { + return cbh.getStatus(); + } + fassert(26001, cbh.getStatus()); + _replExecutor.waitForEvent(finishedEvent.getValue()); + return result; +} + +void ReplicationCoordinatorImpl::_signalStepDownWaitersFromCallback( + const ReplicationExecutor::CallbackData& cbData) { + if (!cbData.status.isOK()) { + return; + } + + _signalStepDownWaiters(); +} + +void ReplicationCoordinatorImpl::_signalStepDownWaiters() { + std::for_each( + _stepDownWaiters.begin(), + _stepDownWaiters.end(), + stdx::bind(&ReplicationExecutor::signalEvent, &_replExecutor, stdx::placeholders::_1)); + _stepDownWaiters.clear(); +} + +void ReplicationCoordinatorImpl::_stepDownContinue( + const ReplicationExecutor::CallbackData& cbData, + const ReplicationExecutor::EventHandle finishedEvent, + OperationContext* txn, + const Date_t waitUntil, + const Date_t stepDownUntil, + bool force, + Status* result) { + if (cbData.status == ErrorCodes::CallbackCanceled) { + // Cancelation only occurs on shutdown, which will also handle signaling the event. + *result = Status(ErrorCodes::ShutdownInProgress, "Shutting down replication"); + return; + } + + ScopeGuard allFinishedGuard = + MakeGuard(stdx::bind(&ReplicationExecutor::signalEvent, &_replExecutor, finishedEvent)); + if (!cbData.status.isOK()) { + *result = cbData.status; + return; + } + + Status interruptedStatus = txn->checkForInterruptNoAssert(); + if (!interruptedStatus.isOK()) { + *result = interruptedStatus; + return; + } + + if (_topCoord->getRole() != TopologyCoordinator::Role::leader) { + *result = Status(ErrorCodes::NotMaster, + "Already stepped down from primary while processing step down " + "request"); + return; + } + const Date_t now = _replExecutor.now(); + if (now >= stepDownUntil) { + *result = Status(ErrorCodes::ExceededTimeLimit, + "By the time we were ready to step down, we were already past the " + "time we were supposed to step down until"); + return; + } + bool forceNow = now >= waitUntil ? force : false; + if (_topCoord->stepDown(stepDownUntil, forceNow, getMyLastOptime())) { + // Schedule work to (potentially) step back up once the stepdown period has ended. 
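+        // (_handleTimePassing, defined below, lets a single-node set become a
+        // candidate again once the stepdown period expires.)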
+ _replExecutor.scheduleWorkAt(stepDownUntil, + stdx::bind(&ReplicationCoordinatorImpl::_handleTimePassing, + this, + stdx::placeholders::_1)); - if (_topCoord->becomeCandidateIfStepdownPeriodOverAndSingleNodeSet(_replExecutor.now())) { - _performPostMemberStateUpdateAction(kActionWinElection); - } + boost::unique_lock<boost::mutex> lk(_mutex); + const PostMemberStateUpdateAction action = + _updateMemberStateFromTopologyCoordinator_inlock(); + lk.unlock(); + _performPostMemberStateUpdateAction(action); + *result = Status::OK(); + return; } - bool ReplicationCoordinatorImpl::isMasterForReportingPurposes() { - if (_settings.usingReplSets()) { - boost::lock_guard<boost::mutex> lock(_mutex); - if (_getReplicationMode_inlock() == modeReplSet && - _getMemberState_inlock().primary()) { - return true; - } - return false; - } - - if (!_settings.slave) - return true; - - - // TODO(dannenberg) replAllDead is bad and should be removed when master slave is removed - if (replAllDead) { - return false; - } - - if (_settings.master) { - // if running with --master --slave, allow. - return true; - } - - return false; + // Step down failed. Keep waiting if we can, otherwise finish. + if (now >= waitUntil) { + *result = Status(ErrorCodes::ExceededTimeLimit, + str::stream() << "No electable secondaries caught up as of " + << dateToISOStringLocal(now)); + return; } - bool ReplicationCoordinatorImpl::canAcceptWritesForDatabase(const StringData& dbName) { - // _canAcceptNonLocalWrites is always true for standalone nodes, always false for nodes - // started with --slave, and adjusted based on primary+drain state in replica sets. - // - // That is, stand-alone nodes, non-slave nodes and drained replica set primaries can always - // accept writes. Similarly, writes are always permitted to the "local" database. Finally, - // in the event that a node is started with --slave and --master, we allow writes unless the - // master/slave system has set the replAllDead flag. 
- if (_canAcceptNonLocalWrites) { - return true; - } - if (dbName == "local") { - return true; + if (_stepDownWaiters.empty()) { + StatusWith<ReplicationExecutor::EventHandle> reschedEvent = _replExecutor.makeEvent(); + if (!reschedEvent.isOK()) { + *result = reschedEvent.getStatus(); + return; } - return !replAllDead && _settings.master; + _stepDownWaiters.push_back(reschedEvent.getValue()); + } + CBHStatus cbh = _replExecutor.onEvent(_stepDownWaiters.back(), + stdx::bind(&ReplicationCoordinatorImpl::_stepDownContinue, + this, + stdx::placeholders::_1, + finishedEvent, + txn, + waitUntil, + stepDownUntil, + force, + result)); + if (!cbh.isOK()) { + *result = cbh.getStatus(); + return; } + allFinishedGuard.Dismiss(); +} - Status ReplicationCoordinatorImpl::checkCanServeReadsFor(OperationContext* txn, - const NamespaceString& ns, - bool slaveOk) { - if (txn->getClient()->isInDirectClient()) { - return Status::OK(); - } - if (canAcceptWritesForDatabase(ns.db())) { - return Status::OK(); - } - if (_settings.slave || _settings.master) { - return Status::OK(); - } - if (slaveOk) { - if (_canServeNonLocalReads.loadRelaxed()) { - return Status::OK(); - } - return Status( - ErrorCodes::NotMasterOrSecondaryCode, - "not master or secondary; cannot currently read from this replSet member"); - } - return Status(ErrorCodes::NotMasterNoSlaveOkCode, "not master and slaveOk=false"); +void ReplicationCoordinatorImpl::_handleTimePassing( + const ReplicationExecutor::CallbackData& cbData) { + if (!cbData.status.isOK()) { + return; } - bool ReplicationCoordinatorImpl::isInPrimaryOrSecondaryState() const { - return _canServeNonLocalReads.loadRelaxed(); + if (_topCoord->becomeCandidateIfStepdownPeriodOverAndSingleNodeSet(_replExecutor.now())) { + _performPostMemberStateUpdateAction(kActionWinElection); } +} - bool ReplicationCoordinatorImpl::shouldIgnoreUniqueIndex(const IndexDescriptor* idx) { - if (!idx->unique()) { - return false; - } - // Never ignore _id index - if (idx->isIdIndex()) { - return false; - } - if (nsToDatabaseSubstring(idx->parentNS()) == "local" ) { - // always enforce on local - return false; - } +bool ReplicationCoordinatorImpl::isMasterForReportingPurposes() { + if (_settings.usingReplSets()) { boost::lock_guard<boost::mutex> lock(_mutex); - if (_getReplicationMode_inlock() != modeReplSet) { - return false; - } - // see SERVER-6671 - MemberState ms = _getMemberState_inlock(); - switch ( ms.s ) { - case MemberState::RS_SECONDARY: - case MemberState::RS_RECOVERING: - case MemberState::RS_ROLLBACK: - case MemberState::RS_STARTUP2: + if (_getReplicationMode_inlock() == modeReplSet && _getMemberState_inlock().primary()) { return true; - default: - return false; } + return false; } - OID ReplicationCoordinatorImpl::getElectionId() { - boost::lock_guard<boost::mutex> lock(_mutex); - return _electionId; - } + if (!_settings.slave) + return true; - OID ReplicationCoordinatorImpl::getMyRID() const { - boost::lock_guard<boost::mutex> lock(_mutex); - return _getMyRID_inlock(); - } - OID ReplicationCoordinatorImpl::_getMyRID_inlock() const { - return _myRID; + // TODO(dannenberg) replAllDead is bad and should be removed when master slave is removed + if (replAllDead) { + return false; } - int ReplicationCoordinatorImpl::getMyId() const { - boost::lock_guard<boost::mutex> lock(_mutex); - return _getMyId_inlock(); + if (_settings.master) { + // if running with --master --slave, allow. 
+ return true; } - int ReplicationCoordinatorImpl::_getMyId_inlock() const { - const MemberConfig& self = _rsConfig.getMemberAt(_selfIndex); - return self.getId(); - } + return false; +} - bool ReplicationCoordinatorImpl::prepareReplSetUpdatePositionCommand( - BSONObjBuilder* cmdBuilder) { - boost::lock_guard<boost::mutex> lock(_mutex); - invariant(_rsConfig.isInitialized()); - // do not send updates if we have been removed from the config - if (_selfIndex == -1) { - return false; - } - cmdBuilder->append("replSetUpdatePosition", 1); - // create an array containing objects each member connected to us and for ourself - BSONArrayBuilder arrayBuilder(cmdBuilder->subarrayStart("optimes")); - { - for (SlaveInfoVector::const_iterator itr = _slaveInfo.begin(); - itr != _slaveInfo.end(); ++itr) { - if (itr->opTime.isNull()) { - // Don't include info on members we haven't heard from yet. - continue; - } - BSONObjBuilder entry(arrayBuilder.subobjStart()); - entry.append("_id", itr->rid); - entry.append("optime", itr->opTime); - entry.append("memberId", itr->memberId); - entry.append("cfgver", _rsConfig.getConfigVersion()); - // SERVER-14550 Even though the "config" field isn't used on the other end in 3.0, - // we need to keep sending it for 2.6 compatibility. - // TODO(spencer): Remove this after 3.0 is released. - const MemberConfig* member = _rsConfig.findMemberByID(itr->memberId); - fassert(18651, member); // We ensured the member existed in processHandshake. - entry.append("config", member->toBSON(_rsConfig.getTagConfig())); - } - } +bool ReplicationCoordinatorImpl::canAcceptWritesForDatabase(const StringData& dbName) { + // _canAcceptNonLocalWrites is always true for standalone nodes, always false for nodes + // started with --slave, and adjusted based on primary+drain state in replica sets. + // + // That is, stand-alone nodes, non-slave nodes and drained replica set primaries can always + // accept writes. Similarly, writes are always permitted to the "local" database. Finally, + // in the event that a node is started with --slave and --master, we allow writes unless the + // master/slave system has set the replAllDead flag. + if (_canAcceptNonLocalWrites) { return true; } - - void ReplicationCoordinatorImpl::prepareReplSetUpdatePositionCommandHandshakes( - std::vector<BSONObj>* handshakes) { - boost::lock_guard<boost::mutex> lock(_mutex); - // do not send handshakes if we have been removed from the config - if (_selfIndex == -1) { - return; - } - // handshake objs for ourself and all chained members - for (SlaveInfoVector::const_iterator itr = _slaveInfo.begin(); - itr != _slaveInfo.end(); ++itr) { - if (!itr->rid.isSet()) { - // Don't include info on members we haven't heard from yet. - continue; - } - - BSONObjBuilder cmd; - cmd.append("replSetUpdatePosition", 1); - { - BSONObjBuilder subCmd (cmd.subobjStart("handshake")); - subCmd.append("handshake", itr->rid); - subCmd.append("member", itr->memberId); - // SERVER-14550 Even though the "config" field isn't used on the other end in 3.0, - // we need to keep sending it for 2.6 compatibility. - // TODO(spencer): Remove this after 3.0 is released. - const MemberConfig* member = _rsConfig.findMemberByID(itr->memberId); - fassert(18650, member); // We ensured the member existed in processHandshake. 
- subCmd.append("config", member->toBSON(_rsConfig.getTagConfig())); - } - handshakes->push_back(cmd.obj()); - } + if (dbName == "local") { + return true; } + return !replAllDead && _settings.master; +} - Status ReplicationCoordinatorImpl::processReplSetGetStatus(BSONObjBuilder* response) { - Status result(ErrorCodes::InternalError, "didn't set status in prepareStatusResponse"); - CBHStatus cbh = _replExecutor.scheduleWork( - stdx::bind(&TopologyCoordinator::prepareStatusResponse, - _topCoord.get(), - stdx::placeholders::_1, - _replExecutor.now(), - time(0) - serverGlobalParams.started, - getMyLastOptime(), - response, - &result)); - if (cbh.getStatus() == ErrorCodes::ShutdownInProgress) { - return Status(ErrorCodes::ShutdownInProgress, "replication shutdown in progress"); - } - fassert(18640, cbh.getStatus()); - _replExecutor.wait(cbh.getValue()); - - return result; +Status ReplicationCoordinatorImpl::checkCanServeReadsFor(OperationContext* txn, + const NamespaceString& ns, + bool slaveOk) { + if (txn->getClient()->isInDirectClient()) { + return Status::OK(); } - - void ReplicationCoordinatorImpl::fillIsMasterForReplSet(IsMasterResponse* response) { - invariant(getSettings().usingReplSets()); - - CBHStatus cbh = _replExecutor.scheduleWork( - stdx::bind(&ReplicationCoordinatorImpl::_fillIsMasterForReplSet_finish, - this, - stdx::placeholders::_1, - response)); - if (cbh.getStatus() == ErrorCodes::ShutdownInProgress) { - response->markAsShutdownInProgress(); - return; - } - fassert(28602, cbh.getStatus()); - - _replExecutor.wait(cbh.getValue()); - if (isWaitingForApplierToDrain()) { - // Report that we are secondary to ismaster callers until drain completes. - response->setIsMaster(false); - response->setIsSecondary(true); - } + if (canAcceptWritesForDatabase(ns.db())) { + return Status::OK(); } - - void ReplicationCoordinatorImpl::_fillIsMasterForReplSet_finish( - const ReplicationExecutor::CallbackData& cbData, IsMasterResponse* response) { - if (cbData.status == ErrorCodes::CallbackCanceled) { - response->markAsShutdownInProgress(); - return; - } - _topCoord->fillIsMasterForReplSet(response); + if (_settings.slave || _settings.master) { + return Status::OK(); } - - void ReplicationCoordinatorImpl::appendSlaveInfoData(BSONObjBuilder* result) { - boost::lock_guard<boost::mutex> lock(_mutex); - BSONArrayBuilder slaves(result->subarrayStart("slaves")); - { - for (SlaveInfoVector::const_iterator itr = _slaveInfo.begin(); - itr != _slaveInfo.end(); ++itr) { - BSONObjBuilder entry(slaves.subobjStart()); - entry.append("rid", itr->rid); - entry.append("optime", itr->opTime); - entry.append("host", itr->hostAndPort.toString()); - if (_getReplicationMode_inlock() == modeReplSet) { - if (_selfIndex == -1) { - continue; - } - invariant(itr->memberId >= 0); - entry.append("memberId", itr->memberId); - } - } + if (slaveOk) { + if (_canServeNonLocalReads.loadRelaxed()) { + return Status::OK(); } + return Status(ErrorCodes::NotMasterOrSecondaryCode, + "not master or secondary; cannot currently read from this replSet member"); } + return Status(ErrorCodes::NotMasterNoSlaveOkCode, "not master and slaveOk=false"); +} - void ReplicationCoordinatorImpl::processReplSetGetConfig(BSONObjBuilder* result) { - boost::lock_guard<boost::mutex> lock(_mutex); - result->append("config", _rsConfig.toBSON()); - } +bool ReplicationCoordinatorImpl::isInPrimaryOrSecondaryState() const { + return _canServeNonLocalReads.loadRelaxed(); +} - bool ReplicationCoordinatorImpl::getMaintenanceMode() { - bool maintenanceMode(false); 
- CBHStatus cbh = _replExecutor.scheduleWork( - stdx::bind(&ReplicationCoordinatorImpl::_getMaintenanceMode_helper, - this, - stdx::placeholders::_1, - &maintenanceMode)); - if (cbh.getStatus() == ErrorCodes::ShutdownInProgress) { - return false; - } - fassert(18811, cbh.getStatus()); - _replExecutor.wait(cbh.getValue()); - return maintenanceMode; +bool ReplicationCoordinatorImpl::shouldIgnoreUniqueIndex(const IndexDescriptor* idx) { + if (!idx->unique()) { + return false; } - - void ReplicationCoordinatorImpl::_getMaintenanceMode_helper( - const ReplicationExecutor::CallbackData& cbData, - bool* maintenanceMode) { - if (cbData.status == ErrorCodes::CallbackCanceled) { - return; - } - *maintenanceMode = _topCoord->getMaintenanceCount() > 0; + // Never ignore _id index + if (idx->isIdIndex()) { + return false; } - - Status ReplicationCoordinatorImpl::setMaintenanceMode(bool activate) { - if (_getReplicationMode_inlock() != modeReplSet) { - return Status(ErrorCodes::NoReplicationEnabled, - "can only set maintenance mode on replica set members"); - } - - Status result(ErrorCodes::InternalError, "didn't set status in _setMaintenanceMode_helper"); - CBHStatus cbh = _replExecutor.scheduleWork( - stdx::bind(&ReplicationCoordinatorImpl::_setMaintenanceMode_helper, - this, - stdx::placeholders::_1, - activate, - &result)); - if (cbh.getStatus() == ErrorCodes::ShutdownInProgress) { - return cbh.getStatus(); - } - fassert(18698, cbh.getStatus()); - _replExecutor.wait(cbh.getValue()); - return result; + if (nsToDatabaseSubstring(idx->parentNS()) == "local") { + // always enforce on local + return false; } - - void ReplicationCoordinatorImpl::_setMaintenanceMode_helper( - const ReplicationExecutor::CallbackData& cbData, - bool activate, - Status* result) { - if (cbData.status == ErrorCodes::CallbackCanceled) { - *result = Status(ErrorCodes::ShutdownInProgress, "replication system is shutting down"); - return; - } - - boost::unique_lock<boost::mutex> lk(_mutex); - if (_getMemberState_inlock().primary()) { - *result = Status(ErrorCodes::NotSecondary, "primaries can't modify maintenance mode"); - return; - } - - int curMaintenanceCalls = _topCoord->getMaintenanceCount(); - if (activate) { - log() << "replSet going into maintenance mode with " << curMaintenanceCalls - << " other maintenance mode tasks in progress" << rsLog; - _topCoord->adjustMaintenanceCountBy(1); - } - else if (curMaintenanceCalls > 0) { - invariant(_topCoord->getRole() == TopologyCoordinator::Role::follower); - - _topCoord->adjustMaintenanceCountBy(-1); - - log() << "leaving maintenance mode (" << curMaintenanceCalls-1 - << " other maintenance mode tasks ongoing)" << rsLog; - } else { - warning() << "Attempted to leave maintenance mode but it is not currently active"; - *result = Status(ErrorCodes::OperationFailed, "already out of maintenance mode"); - return; - } - - const PostMemberStateUpdateAction action = - _updateMemberStateFromTopologyCoordinator_inlock(); - *result = Status::OK(); - lk.unlock(); - _performPostMemberStateUpdateAction(action); + boost::lock_guard<boost::mutex> lock(_mutex); + if (_getReplicationMode_inlock() != modeReplSet) { + return false; } - - Status ReplicationCoordinatorImpl::processReplSetSyncFrom(const HostAndPort& target, - BSONObjBuilder* resultObj) { - Status result(ErrorCodes::InternalError, "didn't set status in prepareSyncFromResponse"); - CBHStatus cbh = _replExecutor.scheduleWork( - stdx::bind(&TopologyCoordinator::prepareSyncFromResponse, - _topCoord.get(), - stdx::placeholders::_1, - target, - 
_getMyLastOptime_inlock(), - resultObj, - &result)); - if (cbh.getStatus() == ErrorCodes::ShutdownInProgress) { - return Status(ErrorCodes::ShutdownInProgress, "replication shutdown in progress"); - } - fassert(18649, cbh.getStatus()); - _replExecutor.wait(cbh.getValue()); - return result; - } - - Status ReplicationCoordinatorImpl::processReplSetFreeze(int secs, BSONObjBuilder* resultObj) { - Status result(ErrorCodes::InternalError, "didn't set status in prepareFreezeResponse"); - CBHStatus cbh = _replExecutor.scheduleWork( - stdx::bind(&ReplicationCoordinatorImpl::_processReplSetFreeze_finish, - this, - stdx::placeholders::_1, - secs, - resultObj, - &result)); - if (cbh.getStatus() == ErrorCodes::ShutdownInProgress) { - return cbh.getStatus(); - } - fassert(18641, cbh.getStatus()); - _replExecutor.wait(cbh.getValue()); - return result; - } - - void ReplicationCoordinatorImpl::_processReplSetFreeze_finish( - const ReplicationExecutor::CallbackData& cbData, - int secs, - BSONObjBuilder* response, - Status* result) { - if (cbData.status == ErrorCodes::CallbackCanceled) { - *result = Status(ErrorCodes::ShutdownInProgress, "replication system is shutting down"); - return; - } - - _topCoord->prepareFreezeResponse(_replExecutor.now(), secs, response); - - if (_topCoord->getRole() == TopologyCoordinator::Role::candidate) { - // If we just unfroze and ended our stepdown period and we are a one node replica set, - // the topology coordinator will have gone into the candidate role to signal that we - // need to elect ourself. - _performPostMemberStateUpdateAction(kActionWinElection); - } - *result = Status::OK(); + // see SERVER-6671 + MemberState ms = _getMemberState_inlock(); + switch (ms.s) { + case MemberState::RS_SECONDARY: + case MemberState::RS_RECOVERING: + case MemberState::RS_ROLLBACK: + case MemberState::RS_STARTUP2: + return true; + default: + return false; } - - Status ReplicationCoordinatorImpl::processHeartbeat(const ReplSetHeartbeatArgs& args, - ReplSetHeartbeatResponse* response) { +} + +OID ReplicationCoordinatorImpl::getElectionId() { + boost::lock_guard<boost::mutex> lock(_mutex); + return _electionId; +} + +OID ReplicationCoordinatorImpl::getMyRID() const { + boost::lock_guard<boost::mutex> lock(_mutex); + return _getMyRID_inlock(); +} + +OID ReplicationCoordinatorImpl::_getMyRID_inlock() const { + return _myRID; +} + +int ReplicationCoordinatorImpl::getMyId() const { + boost::lock_guard<boost::mutex> lock(_mutex); + return _getMyId_inlock(); +} + +int ReplicationCoordinatorImpl::_getMyId_inlock() const { + const MemberConfig& self = _rsConfig.getMemberAt(_selfIndex); + return self.getId(); +} + +bool ReplicationCoordinatorImpl::prepareReplSetUpdatePositionCommand(BSONObjBuilder* cmdBuilder) { + boost::lock_guard<boost::mutex> lock(_mutex); + invariant(_rsConfig.isInitialized()); + // do not send updates if we have been removed from the config + if (_selfIndex == -1) { + return false; + } + cmdBuilder->append("replSetUpdatePosition", 1); + // create an array containing objects each member connected to us and for ourself + BSONArrayBuilder arrayBuilder(cmdBuilder->subarrayStart("optimes")); + { + for (SlaveInfoVector::const_iterator itr = _slaveInfo.begin(); itr != _slaveInfo.end(); + ++itr) { + if (itr->opTime.isNull()) { + // Don't include info on members we haven't heard from yet. 
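+                // (A null optime means we have not yet received any position
+                // report for this member.)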
+ continue; + } + BSONObjBuilder entry(arrayBuilder.subobjStart()); + entry.append("_id", itr->rid); + entry.append("optime", itr->opTime); + entry.append("memberId", itr->memberId); + entry.append("cfgver", _rsConfig.getConfigVersion()); + // SERVER-14550 Even though the "config" field isn't used on the other end in 3.0, + // we need to keep sending it for 2.6 compatibility. + // TODO(spencer): Remove this after 3.0 is released. + const MemberConfig* member = _rsConfig.findMemberByID(itr->memberId); + fassert(18651, member); // We ensured the member existed in processHandshake. + entry.append("config", member->toBSON(_rsConfig.getTagConfig())); + } + } + return true; +} + +void ReplicationCoordinatorImpl::prepareReplSetUpdatePositionCommandHandshakes( + std::vector<BSONObj>* handshakes) { + boost::lock_guard<boost::mutex> lock(_mutex); + // do not send handshakes if we have been removed from the config + if (_selfIndex == -1) { + return; + } + // handshake objs for ourself and all chained members + for (SlaveInfoVector::const_iterator itr = _slaveInfo.begin(); itr != _slaveInfo.end(); ++itr) { + if (!itr->rid.isSet()) { + // Don't include info on members we haven't heard from yet. + continue; + } + + BSONObjBuilder cmd; + cmd.append("replSetUpdatePosition", 1); { - boost::lock_guard<boost::mutex> lock(_mutex); - if (_rsConfigState == kConfigPreStart || _rsConfigState == kConfigStartingUp) { - return Status(ErrorCodes::NotYetInitialized, - "Received heartbeat while still initializing replication system"); + BSONObjBuilder subCmd(cmd.subobjStart("handshake")); + subCmd.append("handshake", itr->rid); + subCmd.append("member", itr->memberId); + // SERVER-14550 Even though the "config" field isn't used on the other end in 3.0, + // we need to keep sending it for 2.6 compatibility. + // TODO(spencer): Remove this after 3.0 is released. + const MemberConfig* member = _rsConfig.findMemberByID(itr->memberId); + fassert(18650, member); // We ensured the member existed in processHandshake. + subCmd.append("config", member->toBSON(_rsConfig.getTagConfig())); + } + handshakes->push_back(cmd.obj()); + } +} + +Status ReplicationCoordinatorImpl::processReplSetGetStatus(BSONObjBuilder* response) { + Status result(ErrorCodes::InternalError, "didn't set status in prepareStatusResponse"); + CBHStatus cbh = + _replExecutor.scheduleWork(stdx::bind(&TopologyCoordinator::prepareStatusResponse, + _topCoord.get(), + stdx::placeholders::_1, + _replExecutor.now(), + time(0) - serverGlobalParams.started, + getMyLastOptime(), + response, + &result)); + if (cbh.getStatus() == ErrorCodes::ShutdownInProgress) { + return Status(ErrorCodes::ShutdownInProgress, "replication shutdown in progress"); + } + fassert(18640, cbh.getStatus()); + _replExecutor.wait(cbh.getValue()); + + return result; +} + +void ReplicationCoordinatorImpl::fillIsMasterForReplSet(IsMasterResponse* response) { + invariant(getSettings().usingReplSets()); + + CBHStatus cbh = _replExecutor.scheduleWork( + stdx::bind(&ReplicationCoordinatorImpl::_fillIsMasterForReplSet_finish, + this, + stdx::placeholders::_1, + response)); + if (cbh.getStatus() == ErrorCodes::ShutdownInProgress) { + response->markAsShutdownInProgress(); + return; + } + fassert(28602, cbh.getStatus()); + + _replExecutor.wait(cbh.getValue()); + if (isWaitingForApplierToDrain()) { + // Report that we are secondary to ismaster callers until drain completes. 
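+        // (Drain mode: the node has won an election but must finish applying
+        // previously buffered ops before it can accept writes as primary.)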
+ response->setIsMaster(false); + response->setIsSecondary(true); + } +} + +void ReplicationCoordinatorImpl::_fillIsMasterForReplSet_finish( + const ReplicationExecutor::CallbackData& cbData, IsMasterResponse* response) { + if (cbData.status == ErrorCodes::CallbackCanceled) { + response->markAsShutdownInProgress(); + return; + } + _topCoord->fillIsMasterForReplSet(response); +} + +void ReplicationCoordinatorImpl::appendSlaveInfoData(BSONObjBuilder* result) { + boost::lock_guard<boost::mutex> lock(_mutex); + BSONArrayBuilder slaves(result->subarrayStart("slaves")); + { + for (SlaveInfoVector::const_iterator itr = _slaveInfo.begin(); itr != _slaveInfo.end(); + ++itr) { + BSONObjBuilder entry(slaves.subobjStart()); + entry.append("rid", itr->rid); + entry.append("optime", itr->opTime); + entry.append("host", itr->hostAndPort.toString()); + if (_getReplicationMode_inlock() == modeReplSet) { + if (_selfIndex == -1) { + continue; + } + invariant(itr->memberId >= 0); + entry.append("memberId", itr->memberId); } } - - Status result(ErrorCodes::InternalError, "didn't set status in prepareHeartbeatResponse"); - CBHStatus cbh = _replExecutor.scheduleWork( - stdx::bind(&ReplicationCoordinatorImpl::_processHeartbeatFinish, - this, - stdx::placeholders::_1, - args, - response, - &result)); - if (cbh.getStatus() == ErrorCodes::ShutdownInProgress) { - return Status(ErrorCodes::ShutdownInProgress, "replication shutdown in progress"); - } - fassert(18508, cbh.getStatus()); - _replExecutor.wait(cbh.getValue()); - return result; } +} - void ReplicationCoordinatorImpl::_processHeartbeatFinish( - const ReplicationExecutor::CallbackData& cbData, - const ReplSetHeartbeatArgs& args, - ReplSetHeartbeatResponse* response, - Status* outStatus) { +void ReplicationCoordinatorImpl::processReplSetGetConfig(BSONObjBuilder* result) { + boost::lock_guard<boost::mutex> lock(_mutex); + result->append("config", _rsConfig.toBSON()); +} - if (cbData.status == ErrorCodes::CallbackCanceled) { - *outStatus = Status(ErrorCodes::ShutdownInProgress, "Replication shutdown in progress"); - return; - } - fassert(18910, cbData.status); - const Date_t now = _replExecutor.now(); - *outStatus = _topCoord->prepareHeartbeatResponse( - now, - args, - _settings.ourSetName(), - getMyLastOptime(), - response); - if ((outStatus->isOK() || *outStatus == ErrorCodes::InvalidReplicaSetConfig) && - _selfIndex < 0) { - // If this node does not belong to the configuration it knows about, send heartbeats - // back to any node that sends us a heartbeat, in case one of those remote nodes has - // a configuration that contains us. Chances are excellent that it will, since that - // is the only reason for a remote node to send this node a heartbeat request. 
- if (!args.getSenderHost().empty() && _seedList.insert(args.getSenderHost()).second) { - _scheduleHeartbeatToTarget(args.getSenderHost(), -1, now); - } - } +bool ReplicationCoordinatorImpl::getMaintenanceMode() { + bool maintenanceMode(false); + CBHStatus cbh = _replExecutor.scheduleWork( + stdx::bind(&ReplicationCoordinatorImpl::_getMaintenanceMode_helper, + this, + stdx::placeholders::_1, + &maintenanceMode)); + if (cbh.getStatus() == ErrorCodes::ShutdownInProgress) { + return false; } - - Status ReplicationCoordinatorImpl::processReplSetReconfig(OperationContext* txn, - const ReplSetReconfigArgs& args, - BSONObjBuilder* resultObj) { - - log() << "replSetReconfig admin command received from client"; - - boost::unique_lock<boost::mutex> lk(_mutex); - - while (_rsConfigState == kConfigPreStart || _rsConfigState == kConfigStartingUp) { - _rsConfigStateChange.wait(lk); - } - - switch (_rsConfigState) { + fassert(18811, cbh.getStatus()); + _replExecutor.wait(cbh.getValue()); + return maintenanceMode; +} + +void ReplicationCoordinatorImpl::_getMaintenanceMode_helper( + const ReplicationExecutor::CallbackData& cbData, bool* maintenanceMode) { + if (cbData.status == ErrorCodes::CallbackCanceled) { + return; + } + *maintenanceMode = _topCoord->getMaintenanceCount() > 0; +} + +Status ReplicationCoordinatorImpl::setMaintenanceMode(bool activate) { + if (_getReplicationMode_inlock() != modeReplSet) { + return Status(ErrorCodes::NoReplicationEnabled, + "can only set maintenance mode on replica set members"); + } + + Status result(ErrorCodes::InternalError, "didn't set status in _setMaintenanceMode_helper"); + CBHStatus cbh = _replExecutor.scheduleWork( + stdx::bind(&ReplicationCoordinatorImpl::_setMaintenanceMode_helper, + this, + stdx::placeholders::_1, + activate, + &result)); + if (cbh.getStatus() == ErrorCodes::ShutdownInProgress) { + return cbh.getStatus(); + } + fassert(18698, cbh.getStatus()); + _replExecutor.wait(cbh.getValue()); + return result; +} + +void ReplicationCoordinatorImpl::_setMaintenanceMode_helper( + const ReplicationExecutor::CallbackData& cbData, bool activate, Status* result) { + if (cbData.status == ErrorCodes::CallbackCanceled) { + *result = Status(ErrorCodes::ShutdownInProgress, "replication system is shutting down"); + return; + } + + boost::unique_lock<boost::mutex> lk(_mutex); + if (_getMemberState_inlock().primary()) { + *result = Status(ErrorCodes::NotSecondary, "primaries can't modify maintenance mode"); + return; + } + + int curMaintenanceCalls = _topCoord->getMaintenanceCount(); + if (activate) { + log() << "replSet going into maintenance mode with " << curMaintenanceCalls + << " other maintenance mode tasks in progress" << rsLog; + _topCoord->adjustMaintenanceCountBy(1); + } else if (curMaintenanceCalls > 0) { + invariant(_topCoord->getRole() == TopologyCoordinator::Role::follower); + + _topCoord->adjustMaintenanceCountBy(-1); + + log() << "leaving maintenance mode (" << curMaintenanceCalls - 1 + << " other maintenance mode tasks ongoing)" << rsLog; + } else { + warning() << "Attempted to leave maintenance mode but it is not currently active"; + *result = Status(ErrorCodes::OperationFailed, "already out of maintenance mode"); + return; + } + + const PostMemberStateUpdateAction action = _updateMemberStateFromTopologyCoordinator_inlock(); + *result = Status::OK(); + lk.unlock(); + _performPostMemberStateUpdateAction(action); +} + +Status ReplicationCoordinatorImpl::processReplSetSyncFrom(const HostAndPort& target, + BSONObjBuilder* resultObj) { + Status 
result(ErrorCodes::InternalError, "didn't set status in prepareSyncFromResponse"); + CBHStatus cbh = + _replExecutor.scheduleWork(stdx::bind(&TopologyCoordinator::prepareSyncFromResponse, + _topCoord.get(), + stdx::placeholders::_1, + target, + _getMyLastOptime_inlock(), + resultObj, + &result)); + if (cbh.getStatus() == ErrorCodes::ShutdownInProgress) { + return Status(ErrorCodes::ShutdownInProgress, "replication shutdown in progress"); + } + fassert(18649, cbh.getStatus()); + _replExecutor.wait(cbh.getValue()); + return result; +} + +Status ReplicationCoordinatorImpl::processReplSetFreeze(int secs, BSONObjBuilder* resultObj) { + Status result(ErrorCodes::InternalError, "didn't set status in prepareFreezeResponse"); + CBHStatus cbh = _replExecutor.scheduleWork( + stdx::bind(&ReplicationCoordinatorImpl::_processReplSetFreeze_finish, + this, + stdx::placeholders::_1, + secs, + resultObj, + &result)); + if (cbh.getStatus() == ErrorCodes::ShutdownInProgress) { + return cbh.getStatus(); + } + fassert(18641, cbh.getStatus()); + _replExecutor.wait(cbh.getValue()); + return result; +} + +void ReplicationCoordinatorImpl::_processReplSetFreeze_finish( + const ReplicationExecutor::CallbackData& cbData, + int secs, + BSONObjBuilder* response, + Status* result) { + if (cbData.status == ErrorCodes::CallbackCanceled) { + *result = Status(ErrorCodes::ShutdownInProgress, "replication system is shutting down"); + return; + } + + _topCoord->prepareFreezeResponse(_replExecutor.now(), secs, response); + + if (_topCoord->getRole() == TopologyCoordinator::Role::candidate) { + // If we just unfroze and ended our stepdown period and we are a one node replica set, + // the topology coordinator will have gone into the candidate role to signal that we + // need to elect ourself. 
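+        // (kActionWinElection completes the transition to PRIMARY immediately;
+        // a single-node set needs no further votes.)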
+ _performPostMemberStateUpdateAction(kActionWinElection); + } + *result = Status::OK(); +} + +Status ReplicationCoordinatorImpl::processHeartbeat(const ReplSetHeartbeatArgs& args, + ReplSetHeartbeatResponse* response) { + { + boost::lock_guard<boost::mutex> lock(_mutex); + if (_rsConfigState == kConfigPreStart || _rsConfigState == kConfigStartingUp) { + return Status(ErrorCodes::NotYetInitialized, + "Received heartbeat while still initializing replication system"); + } + } + + Status result(ErrorCodes::InternalError, "didn't set status in prepareHeartbeatResponse"); + CBHStatus cbh = + _replExecutor.scheduleWork(stdx::bind(&ReplicationCoordinatorImpl::_processHeartbeatFinish, + this, + stdx::placeholders::_1, + args, + response, + &result)); + if (cbh.getStatus() == ErrorCodes::ShutdownInProgress) { + return Status(ErrorCodes::ShutdownInProgress, "replication shutdown in progress"); + } + fassert(18508, cbh.getStatus()); + _replExecutor.wait(cbh.getValue()); + return result; +} + +void ReplicationCoordinatorImpl::_processHeartbeatFinish( + const ReplicationExecutor::CallbackData& cbData, + const ReplSetHeartbeatArgs& args, + ReplSetHeartbeatResponse* response, + Status* outStatus) { + if (cbData.status == ErrorCodes::CallbackCanceled) { + *outStatus = Status(ErrorCodes::ShutdownInProgress, "Replication shutdown in progress"); + return; + } + fassert(18910, cbData.status); + const Date_t now = _replExecutor.now(); + *outStatus = _topCoord->prepareHeartbeatResponse( + now, args, _settings.ourSetName(), getMyLastOptime(), response); + if ((outStatus->isOK() || *outStatus == ErrorCodes::InvalidReplicaSetConfig) && + _selfIndex < 0) { + // If this node does not belong to the configuration it knows about, send heartbeats + // back to any node that sends us a heartbeat, in case one of those remote nodes has + // a configuration that contains us. Chances are excellent that it will, since that + // is the only reason for a remote node to send this node a heartbeat request. 
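The guard that follows this comment relies on a std::set idiom that is easy to misread: insert() returns a pair of iterator and bool, and the bool is true only when the element was not already present. The effect is that each previously unknown heartbeat sender gets exactly one immediate heartbeat scheduled back, not one per incoming request. A self-contained illustration (host strings here are made up):

#include <iostream>
#include <set>
#include <string>

int main() {
    std::set<std::string> seedList;
    for (const char* sender : {"a:27017", "b:27017", "a:27017"}) {
        // insert().second is true only on first insertion of this sender.
        if (seedList.insert(sender).second) {
            std::cout << "first contact from " << sender
                      << ", scheduling heartbeat\n";
        }
    }
    // Prints two lines, for a:27017 and b:27017; the repeat is ignored.
}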
+ if (!args.getSenderHost().empty() && _seedList.insert(args.getSenderHost()).second) { + _scheduleHeartbeatToTarget(args.getSenderHost(), -1, now); + } + } +} + +Status ReplicationCoordinatorImpl::processReplSetReconfig(OperationContext* txn, + const ReplSetReconfigArgs& args, + BSONObjBuilder* resultObj) { + log() << "replSetReconfig admin command received from client"; + + boost::unique_lock<boost::mutex> lk(_mutex); + + while (_rsConfigState == kConfigPreStart || _rsConfigState == kConfigStartingUp) { + _rsConfigStateChange.wait(lk); + } + + switch (_rsConfigState) { case kConfigSteady: break; case kConfigUninitialized: return Status(ErrorCodes::NotYetInitialized, "Node not yet initialized; use the replSetInitiate command"); case kConfigReplicationDisabled: - invariant(false); // should be unreachable due to !_settings.usingReplSets() check above + invariant( + false); // should be unreachable due to !_settings.usingReplSets() check above case kConfigInitiating: case kConfigReconfiguring: case kConfigHBReconfiguring: @@ -1690,279 +1638,260 @@ namespace { default: severe() << "Unexpected _rsConfigState " << int(_rsConfigState); fassertFailed(18914); - } + } - invariant(_rsConfig.isInitialized()); + invariant(_rsConfig.isInitialized()); - if (!args.force && !_getMemberState_inlock().primary()) { - return Status(ErrorCodes::NotMaster, str::stream() << - "replSetReconfig should only be run on PRIMARY, but my state is " << - _getMemberState_inlock().toString() << - "; use the \"force\" argument to override"); - } + if (!args.force && !_getMemberState_inlock().primary()) { + return Status(ErrorCodes::NotMaster, + str::stream() + << "replSetReconfig should only be run on PRIMARY, but my state is " + << _getMemberState_inlock().toString() + << "; use the \"force\" argument to override"); + } - _setConfigState_inlock(kConfigReconfiguring); - ScopeGuard configStateGuard = MakeGuard( - lockAndCall, - &lk, - stdx::bind(&ReplicationCoordinatorImpl::_setConfigState_inlock, - this, - kConfigSteady)); + _setConfigState_inlock(kConfigReconfiguring); + ScopeGuard configStateGuard = MakeGuard( + lockAndCall, + &lk, + stdx::bind(&ReplicationCoordinatorImpl::_setConfigState_inlock, this, kConfigSteady)); - ReplicaSetConfig oldConfig = _rsConfig; - lk.unlock(); + ReplicaSetConfig oldConfig = _rsConfig; + lk.unlock(); - ReplicaSetConfig newConfig; - BSONObj newConfigObj = args.newConfigObj; - if (args.force) { - newConfigObj = incrementConfigVersionByRandom(newConfigObj); - } - Status status = newConfig.initialize(newConfigObj); - if (!status.isOK()) { - error() << "replSetReconfig got " << status << " while parsing " << newConfigObj; - return Status(ErrorCodes::InvalidReplicaSetConfig, status.reason());; - } - if (newConfig.getReplSetName() != _settings.ourSetName()) { - str::stream errmsg; - errmsg << "Attempting to reconfigure a replica set with name " << - newConfig.getReplSetName() << ", but command line reports " << - _settings.ourSetName() << "; rejecting"; - error() << std::string(errmsg); - return Status(ErrorCodes::InvalidReplicaSetConfig, errmsg); - } - - StatusWith<int> myIndex = validateConfigForReconfig( - _externalState.get(), - oldConfig, - newConfig, - args.force); - if (!myIndex.isOK()) { - error() << "replSetReconfig got " << myIndex.getStatus() << " while validating " << - newConfigObj; - return Status(ErrorCodes::NewReplicaSetConfigurationIncompatible, - myIndex.getStatus().reason()); - } + ReplicaSetConfig newConfig; + BSONObj newConfigObj = args.newConfigObj; + if (args.force) { + 
newConfigObj = incrementConfigVersionByRandom(newConfigObj); + } + Status status = newConfig.initialize(newConfigObj); + if (!status.isOK()) { + error() << "replSetReconfig got " << status << " while parsing " << newConfigObj; + return Status(ErrorCodes::InvalidReplicaSetConfig, status.reason()); + ; + } + if (newConfig.getReplSetName() != _settings.ourSetName()) { + str::stream errmsg; + errmsg << "Attempting to reconfigure a replica set with name " << newConfig.getReplSetName() + << ", but command line reports " << _settings.ourSetName() << "; rejecting"; + error() << std::string(errmsg); + return Status(ErrorCodes::InvalidReplicaSetConfig, errmsg); + } - log() << "replSetReconfig config object with " << newConfig.getNumMembers() << - " members parses ok"; + StatusWith<int> myIndex = + validateConfigForReconfig(_externalState.get(), oldConfig, newConfig, args.force); + if (!myIndex.isOK()) { + error() << "replSetReconfig got " << myIndex.getStatus() << " while validating " + << newConfigObj; + return Status(ErrorCodes::NewReplicaSetConfigurationIncompatible, + myIndex.getStatus().reason()); + } - if (!args.force) { - status = checkQuorumForReconfig(&_replExecutor, - newConfig, - myIndex.getValue()); - if (!status.isOK()) { - error() << "replSetReconfig failed; " << status; - return status; - } - } + log() << "replSetReconfig config object with " << newConfig.getNumMembers() + << " members parses ok"; - status = _externalState->storeLocalConfigDocument(txn, newConfig.toBSON()); + if (!args.force) { + status = checkQuorumForReconfig(&_replExecutor, newConfig, myIndex.getValue()); if (!status.isOK()) { - error() << "replSetReconfig failed to store config document; " << status; + error() << "replSetReconfig failed; " << status; return status; } - - const stdx::function<void (const ReplicationExecutor::CallbackData&)> reconfigFinishFn( - stdx::bind(&ReplicationCoordinatorImpl::_finishReplSetReconfig, - this, - stdx::placeholders::_1, - newConfig, - myIndex.getValue())); - - // If it's a force reconfig, the primary node may not be electable after the configuration - // change. In case we are that primary node, finish the reconfig under the global lock, - // so that the step down occurs safely. - CBHStatus cbh = - args.force ? 
- _replExecutor.scheduleWorkWithGlobalExclusiveLock(reconfigFinishFn) : - _replExecutor.scheduleWork(reconfigFinishFn); - if (cbh.getStatus() == ErrorCodes::ShutdownInProgress) { - return status; - } - fassert(18824, cbh.getStatus()); - configStateGuard.Dismiss(); - _replExecutor.wait(cbh.getValue()); - return Status::OK(); } - void ReplicationCoordinatorImpl::_finishReplSetReconfig( - const ReplicationExecutor::CallbackData& cbData, - const ReplicaSetConfig& newConfig, - int myIndex) { - - boost::unique_lock<boost::mutex> lk(_mutex); - invariant(_rsConfigState == kConfigReconfiguring); - invariant(_rsConfig.isInitialized()); - const PostMemberStateUpdateAction action = _setCurrentRSConfig_inlock(newConfig, myIndex); - lk.unlock(); - _performPostMemberStateUpdateAction(action); + status = _externalState->storeLocalConfigDocument(txn, newConfig.toBSON()); + if (!status.isOK()) { + error() << "replSetReconfig failed to store config document; " << status; + return status; } - Status ReplicationCoordinatorImpl::processReplSetInitiate(OperationContext* txn, - const BSONObj& configObj, - BSONObjBuilder* resultObj) { - log() << "replSetInitiate admin command received from client"; - - boost::unique_lock<boost::mutex> lk(_mutex); - if (!_settings.usingReplSets()) { - return Status(ErrorCodes::NoReplicationEnabled, "server is not running with --replSet"); - } - - while (_rsConfigState == kConfigPreStart || _rsConfigState == kConfigStartingUp) { - _rsConfigStateChange.wait(lk); - } - - if (_rsConfigState != kConfigUninitialized) { - resultObj->append("info", - "try querying local.system.replset to see current configuration"); - return Status(ErrorCodes::AlreadyInitialized, "already initialized"); - } - invariant(!_rsConfig.isInitialized()); - _setConfigState_inlock(kConfigInitiating); - ScopeGuard configStateGuard = MakeGuard( - lockAndCall, - &lk, - stdx::bind(&ReplicationCoordinatorImpl::_setConfigState_inlock, - this, - kConfigUninitialized)); - lk.unlock(); - - ReplicaSetConfig newConfig; - Status status = newConfig.initialize(configObj); - if (!status.isOK()) { - error() << "replSet initiate got " << status << " while parsing " << configObj; - return Status(ErrorCodes::InvalidReplicaSetConfig, status.reason());; - } - if (newConfig.getReplSetName() != _settings.ourSetName()) { - str::stream errmsg; - errmsg << "Attempting to initiate a replica set with name " << - newConfig.getReplSetName() << ", but command line reports " << - _settings.ourSetName() << "; rejecting"; - error() << std::string(errmsg); - return Status(ErrorCodes::InvalidReplicaSetConfig, errmsg); - } - - StatusWith<int> myIndex = validateConfigForInitiate(_externalState.get(), newConfig); - if (!myIndex.isOK()) { - error() << "replSet initiate got " << myIndex.getStatus() << " while validating " << - configObj; - return Status(ErrorCodes::InvalidReplicaSetConfig, myIndex.getStatus().reason()); - } - - log() << "replSet replSetInitiate config object with " << newConfig.getNumMembers() << - " members parses ok"; - - status = checkQuorumForInitiate( - &_replExecutor, - newConfig, - myIndex.getValue()); - - if (!status.isOK()) { - error() << "replSet replSetInitiate failed; " << status; - return status; - } + const stdx::function<void(const ReplicationExecutor::CallbackData&)> reconfigFinishFn( + stdx::bind(&ReplicationCoordinatorImpl::_finishReplSetReconfig, + this, + stdx::placeholders::_1, + newConfig, + myIndex.getValue())); - status = _externalState->storeLocalConfigDocument(txn, newConfig.toBSON()); - if (!status.isOK()) { - 
error() << "replSet replSetInitiate failed to store config document; " << status; - return status; - } - - CBHStatus cbh = _replExecutor.scheduleWork( - stdx::bind(&ReplicationCoordinatorImpl::_finishReplSetInitiate, - this, - stdx::placeholders::_1, - newConfig, - myIndex.getValue())); - if (cbh.getStatus() == ErrorCodes::ShutdownInProgress) { - return status; - } - configStateGuard.Dismiss(); - fassert(18654, cbh.getStatus()); - _replExecutor.wait(cbh.getValue()); - - if (status.isOK()) { - // Create the oplog with the first entry, and start repl threads. - _externalState->initiateOplog(txn); - _externalState->startThreads(); - } + // If it's a force reconfig, the primary node may not be electable after the configuration + // change. In case we are that primary node, finish the reconfig under the global lock, + // so that the step down occurs safely. + CBHStatus cbh = args.force ? _replExecutor.scheduleWorkWithGlobalExclusiveLock(reconfigFinishFn) + : _replExecutor.scheduleWork(reconfigFinishFn); + if (cbh.getStatus() == ErrorCodes::ShutdownInProgress) { + return status; + } + fassert(18824, cbh.getStatus()); + configStateGuard.Dismiss(); + _replExecutor.wait(cbh.getValue()); + return Status::OK(); +} + +void ReplicationCoordinatorImpl::_finishReplSetReconfig( + const ReplicationExecutor::CallbackData& cbData, + const ReplicaSetConfig& newConfig, + int myIndex) { + boost::unique_lock<boost::mutex> lk(_mutex); + invariant(_rsConfigState == kConfigReconfiguring); + invariant(_rsConfig.isInitialized()); + const PostMemberStateUpdateAction action = _setCurrentRSConfig_inlock(newConfig, myIndex); + lk.unlock(); + _performPostMemberStateUpdateAction(action); +} + +Status ReplicationCoordinatorImpl::processReplSetInitiate(OperationContext* txn, + const BSONObj& configObj, + BSONObjBuilder* resultObj) { + log() << "replSetInitiate admin command received from client"; + + boost::unique_lock<boost::mutex> lk(_mutex); + if (!_settings.usingReplSets()) { + return Status(ErrorCodes::NoReplicationEnabled, "server is not running with --replSet"); + } + + while (_rsConfigState == kConfigPreStart || _rsConfigState == kConfigStartingUp) { + _rsConfigStateChange.wait(lk); + } + + if (_rsConfigState != kConfigUninitialized) { + resultObj->append("info", "try querying local.system.replset to see current configuration"); + return Status(ErrorCodes::AlreadyInitialized, "already initialized"); + } + invariant(!_rsConfig.isInitialized()); + _setConfigState_inlock(kConfigInitiating); + ScopeGuard configStateGuard = MakeGuard( + lockAndCall, + &lk, + stdx::bind( + &ReplicationCoordinatorImpl::_setConfigState_inlock, this, kConfigUninitialized)); + lk.unlock(); + + ReplicaSetConfig newConfig; + Status status = newConfig.initialize(configObj); + if (!status.isOK()) { + error() << "replSet initiate got " << status << " while parsing " << configObj; + return Status(ErrorCodes::InvalidReplicaSetConfig, status.reason()); + ; + } + if (newConfig.getReplSetName() != _settings.ourSetName()) { + str::stream errmsg; + errmsg << "Attempting to initiate a replica set with name " << newConfig.getReplSetName() + << ", but command line reports " << _settings.ourSetName() << "; rejecting"; + error() << std::string(errmsg); + return Status(ErrorCodes::InvalidReplicaSetConfig, errmsg); + } + + StatusWith<int> myIndex = validateConfigForInitiate(_externalState.get(), newConfig); + if (!myIndex.isOK()) { + error() << "replSet initiate got " << myIndex.getStatus() << " while validating " + << configObj; + return 
Status(ErrorCodes::InvalidReplicaSetConfig, myIndex.getStatus().reason()); + } + + log() << "replSet replSetInitiate config object with " << newConfig.getNumMembers() + << " members parses ok"; + + status = checkQuorumForInitiate(&_replExecutor, newConfig, myIndex.getValue()); + + if (!status.isOK()) { + error() << "replSet replSetInitiate failed; " << status; return status; } - void ReplicationCoordinatorImpl::_finishReplSetInitiate( - const ReplicationExecutor::CallbackData& cbData, - const ReplicaSetConfig& newConfig, - int myIndex) { - - boost::unique_lock<boost::mutex> lk(_mutex); - invariant(_rsConfigState == kConfigInitiating); - invariant(!_rsConfig.isInitialized()); - const PostMemberStateUpdateAction action = _setCurrentRSConfig_inlock(newConfig, myIndex); - lk.unlock(); - _performPostMemberStateUpdateAction(action); + status = _externalState->storeLocalConfigDocument(txn, newConfig.toBSON()); + if (!status.isOK()) { + error() << "replSet replSetInitiate failed to store config document; " << status; + return status; } - void ReplicationCoordinatorImpl::_setConfigState_inlock(ConfigState newState) { - if (newState != _rsConfigState) { - _rsConfigState = newState; - _rsConfigStateChange.notify_all(); - } + CBHStatus cbh = + _replExecutor.scheduleWork(stdx::bind(&ReplicationCoordinatorImpl::_finishReplSetInitiate, + this, + stdx::placeholders::_1, + newConfig, + myIndex.getValue())); + if (cbh.getStatus() == ErrorCodes::ShutdownInProgress) { + return status; } + configStateGuard.Dismiss(); + fassert(18654, cbh.getStatus()); + _replExecutor.wait(cbh.getValue()); - ReplicationCoordinatorImpl::PostMemberStateUpdateAction - ReplicationCoordinatorImpl::_updateMemberStateFromTopologyCoordinator_inlock() { - const MemberState newState = _topCoord->getMemberState(); - if (newState == _memberState) { - if (_topCoord->getRole() == TopologyCoordinator::Role::candidate) { - invariant(_rsConfig.getNumMembers() == 1 && - _selfIndex == 0 && - _rsConfig.getMemberAt(0).isElectable()); - return kActionWinElection; - } - return kActionNone; - } - PostMemberStateUpdateAction result; - if (_memberState.primary() || newState.removed()) { - // Wake up any threads blocked in awaitReplication, close connections, etc. - for (std::vector<WaiterInfo*>::iterator it = _replicationWaiterList.begin(); - it != _replicationWaiterList.end(); ++it) { - WaiterInfo* info = *it; - info->master = false; - info->condVar->notify_all(); - } - _canAcceptNonLocalWrites = false; - result = kActionCloseAllConnections; - } - else { - if (_memberState.secondary() && !newState.primary()) { - // Switching out of SECONDARY, but not to PRIMARY. - _canServeNonLocalReads.store(0U); - } - else if (newState.secondary()) { - // Switching into SECONDARY, but not from PRIMARY. - _canServeNonLocalReads.store(1U); - } - result = kActionChooseNewSyncSource; - } - if (newState.secondary() && _topCoord->getRole() == TopologyCoordinator::Role::candidate) { - // When transitioning to SECONDARY, the only way for _topCoord to report the candidate - // role is if the configuration represents a single-node replica set. In that case, the - // overriding requirement is to elect this singleton node primary. - invariant(_rsConfig.getNumMembers() == 1 && - _selfIndex == 0 && + if (status.isOK()) { + // Create the oplog with the first entry, and start repl threads. 
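For orientation, right before the initiateOplog()/startThreads() calls that follow: processReplSetInitiate() is a fixed pipeline in which every stage returns early on failure, and only after the config is parsed, validated, quorum-checked, and persisted does the node create the oplog and start the replication threads. A heavily abridged sketch of that order (step names match the calls in this hunk; the body is illustrative, not the real implementation):

// Condensed shape of the replSetInitiate flow shown above.
Status initiateSketch(const BSONObj& configObj) {
    ReplicaSetConfig config;
    Status parse = config.initialize(configObj);  // 1. parse the user's document
    if (!parse.isOK())
        return Status(ErrorCodes::InvalidReplicaSetConfig, parse.reason());
    // 2. reject if config.getReplSetName() != the name given to --replSet
    // 3. validateConfigForInitiate()  -> our index in the config, or an error
    // 4. checkQuorumForInitiate()     -> every member must answer a heartbeat
    // 5. storeLocalConfigDocument()   -> persist to local.system.replset
    // 6. _finishReplSetInitiate()     -> install the config on the executor
    // 7. initiateOplog() + startThreads(), only after all of the above succeed
    return Status::OK();
}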
+ _externalState->initiateOplog(txn); + _externalState->startThreads(); + } + return status; +} + +void ReplicationCoordinatorImpl::_finishReplSetInitiate( + const ReplicationExecutor::CallbackData& cbData, + const ReplicaSetConfig& newConfig, + int myIndex) { + boost::unique_lock<boost::mutex> lk(_mutex); + invariant(_rsConfigState == kConfigInitiating); + invariant(!_rsConfig.isInitialized()); + const PostMemberStateUpdateAction action = _setCurrentRSConfig_inlock(newConfig, myIndex); + lk.unlock(); + _performPostMemberStateUpdateAction(action); +} + +void ReplicationCoordinatorImpl::_setConfigState_inlock(ConfigState newState) { + if (newState != _rsConfigState) { + _rsConfigState = newState; + _rsConfigStateChange.notify_all(); + } +} + +ReplicationCoordinatorImpl::PostMemberStateUpdateAction +ReplicationCoordinatorImpl::_updateMemberStateFromTopologyCoordinator_inlock() { + const MemberState newState = _topCoord->getMemberState(); + if (newState == _memberState) { + if (_topCoord->getRole() == TopologyCoordinator::Role::candidate) { + invariant(_rsConfig.getNumMembers() == 1 && _selfIndex == 0 && _rsConfig.getMemberAt(0).isElectable()); - result = kActionWinElection; + return kActionWinElection; } - - _memberState = newState; - log() << "transition to " << newState.toString() << rsLog; - return result; + return kActionNone; } - - void ReplicationCoordinatorImpl::_performPostMemberStateUpdateAction( - PostMemberStateUpdateAction action) { - - switch (action) { + PostMemberStateUpdateAction result; + if (_memberState.primary() || newState.removed()) { + // Wake up any threads blocked in awaitReplication, close connections, etc. + for (std::vector<WaiterInfo*>::iterator it = _replicationWaiterList.begin(); + it != _replicationWaiterList.end(); + ++it) { + WaiterInfo* info = *it; + info->master = false; + info->condVar->notify_all(); + } + _canAcceptNonLocalWrites = false; + result = kActionCloseAllConnections; + } else { + if (_memberState.secondary() && !newState.primary()) { + // Switching out of SECONDARY, but not to PRIMARY. + _canServeNonLocalReads.store(0U); + } else if (newState.secondary()) { + // Switching into SECONDARY, but not from PRIMARY. + _canServeNonLocalReads.store(1U); + } + result = kActionChooseNewSyncSource; + } + if (newState.secondary() && _topCoord->getRole() == TopologyCoordinator::Role::candidate) { + // When transitioning to SECONDARY, the only way for _topCoord to report the candidate + // role is if the configuration represents a single-node replica set. In that case, the + // overriding requirement is to elect this singleton node primary. 
+ invariant(_rsConfig.getNumMembers() == 1 && _selfIndex == 0 && + _rsConfig.getMemberAt(0).isElectable()); + result = kActionWinElection; + } + + _memberState = newState; + log() << "transition to " << newState.toString() << rsLog; + return result; +} + +void ReplicationCoordinatorImpl::_performPostMemberStateUpdateAction( + PostMemberStateUpdateAction action) { + switch (action) { case kActionNone: break; case kActionChooseNewSyncSource: @@ -1987,441 +1916,426 @@ namespace { default: severe() << "Unknown post member state update action " << static_cast<int>(action); fassertFailed(26010); - } - } - - Status ReplicationCoordinatorImpl::processReplSetGetRBID(BSONObjBuilder* resultObj) { - boost::lock_guard<boost::mutex> lk(_mutex); - resultObj->append("rbid", _rbid); - return Status::OK(); - } - - void ReplicationCoordinatorImpl::incrementRollbackID() { - boost::lock_guard<boost::mutex> lk(_mutex); - ++_rbid; - } - - Status ReplicationCoordinatorImpl::processReplSetFresh(const ReplSetFreshArgs& args, - BSONObjBuilder* resultObj) { - - Status result(ErrorCodes::InternalError, "didn't set status in prepareFreshResponse"); - CBHStatus cbh = _replExecutor.scheduleWork( - stdx::bind(&ReplicationCoordinatorImpl::_processReplSetFresh_finish, - this, - stdx::placeholders::_1, - args, - resultObj, - &result)); - if (cbh.getStatus() == ErrorCodes::ShutdownInProgress) { - return Status(ErrorCodes::ShutdownInProgress, "replication shutdown in progress"); - } - fassert(18652, cbh.getStatus()); - _replExecutor.wait(cbh.getValue()); - return result; - } - - void ReplicationCoordinatorImpl::_processReplSetFresh_finish( - const ReplicationExecutor::CallbackData& cbData, - const ReplSetFreshArgs& args, - BSONObjBuilder* response, - Status* result) { - if (cbData.status == ErrorCodes::CallbackCanceled) { - *result = Status(ErrorCodes::ShutdownInProgress, "replication shutdown in progress"); - return; - } - - _topCoord->prepareFreshResponse( - args, _replExecutor.now(), getMyLastOptime(), response, result); - } - - Status ReplicationCoordinatorImpl::processReplSetElect(const ReplSetElectArgs& args, - BSONObjBuilder* responseObj) { - Status result = Status(ErrorCodes::InternalError, "status not set by callback"); - CBHStatus cbh = _replExecutor.scheduleWork( - stdx::bind(&ReplicationCoordinatorImpl::_processReplSetElect_finish, - this, - stdx::placeholders::_1, - args, - responseObj, - &result)); - if (cbh.getStatus() == ErrorCodes::ShutdownInProgress) { - return Status(ErrorCodes::ShutdownInProgress, "replication shutdown in progress"); - } - fassert(18657, cbh.getStatus()); - _replExecutor.wait(cbh.getValue()); - return result; - } - - void ReplicationCoordinatorImpl::_processReplSetElect_finish( - const ReplicationExecutor::CallbackData& cbData, - const ReplSetElectArgs& args, - BSONObjBuilder* response, - Status* result) { - if (cbData.status == ErrorCodes::CallbackCanceled) { - *result = Status(ErrorCodes::ShutdownInProgress, "replication shutdown in progress"); - return; - } - - _topCoord->prepareElectResponse( - args, _replExecutor.now(), getMyLastOptime(), response, result); - } - - ReplicationCoordinatorImpl::PostMemberStateUpdateAction - ReplicationCoordinatorImpl::_setCurrentRSConfig_inlock( - const ReplicaSetConfig& newConfig, - int myIndex) { - invariant(_settings.usingReplSets()); - _cancelHeartbeats(); - _setConfigState_inlock(kConfigSteady); - OpTime myOptime = _getMyLastOptime_inlock(); // Must get this before changing our config. 
- _topCoord->updateConfig( - newConfig, - myIndex, - _replExecutor.now(), - myOptime); - _rsConfig = newConfig; - log() << "New replica set config in use: " << _rsConfig.toBSON() << rsLog; - _selfIndex = myIndex; - if (_selfIndex >= 0) { - log() << "This node is " << - _rsConfig.getMemberAt(_selfIndex).getHostAndPort() << " in the config"; - } - else { - log() << "This node is not a member of the config"; - } - - const PostMemberStateUpdateAction action = - _updateMemberStateFromTopologyCoordinator_inlock(); - _updateSlaveInfoFromConfig_inlock(); - if (_selfIndex >= 0) { - // Don't send heartbeats if we're not in the config, if we get re-added one of the - // nodes in the set will contact us. - _startHeartbeats(); - } - _wakeReadyWaiters_inlock(); - return action; - } - - void ReplicationCoordinatorImpl::_wakeReadyWaiters_inlock(){ - for (std::vector<WaiterInfo*>::iterator it = _replicationWaiterList.begin(); - it != _replicationWaiterList.end(); ++it) { - WaiterInfo* info = *it; - if (_doneWaitingForReplication_inlock(*info->opTime, *info->writeConcern)) { - info->condVar->notify_all(); - } - } - } - - Status ReplicationCoordinatorImpl::processReplSetUpdatePosition( - const UpdatePositionArgs& updates) { - - boost::unique_lock<boost::mutex> lock(_mutex); - Status status = Status::OK(); - bool somethingChanged = false; - for (UpdatePositionArgs::UpdateIterator update = updates.updatesBegin(); - update != updates.updatesEnd(); - ++update) { - status = _setLastOptime_inlock(*update); - if (!status.isOK()) { - break; - } - somethingChanged = true; - } - - if (somethingChanged && !_getMemberState_inlock().primary()) { - lock.unlock(); - _externalState->forwardSlaveProgress(); // Must do this outside _mutex - } - return status; } - - Status ReplicationCoordinatorImpl::processHandshake(OperationContext* txn, - const HandshakeArgs& handshake) { - LOG(2) << "Received handshake " << handshake.toBSON(); - - boost::unique_lock<boost::mutex> lock(_mutex); - if (_getReplicationMode_inlock() == modeReplSet) { - if (_selfIndex == -1) { - // Ignore updates when we're in state REMOVED - return Status(ErrorCodes::NotMasterOrSecondaryCode, - "Received replSetUpdatePosition command but we are in state REMOVED"); - } - - int memberId = handshake.getMemberId(); - const MemberConfig* member = _rsConfig.findMemberByID(memberId); - if (!member) { - return Status(ErrorCodes::NodeNotFound, - str::stream() << "Node with replica set memberId " << memberId << - " could not be found in replica set config while attempting" - " to associate it with RID " << handshake.getRid() << - " in replication handshake. 
ReplSet Config: " << - _rsConfig.toBSON().toString()); - } - SlaveInfo* slaveInfo = _findSlaveInfoByMemberID_inlock(handshake.getMemberId()); - invariant(slaveInfo); // If it's in the config it must be in _slaveInfo - slaveInfo->rid = handshake.getRid(); - slaveInfo->hostAndPort = member->getHostAndPort(); - - if (!_getMemberState_inlock().primary()) { - lock.unlock(); - _externalState->forwardSlaveHandshake(); // must do outside _mutex - } - return Status::OK(); - } - - // master-slave from here down - SlaveInfo* slaveInfo = _findSlaveInfoByRID_inlock(handshake.getRid()); - if (slaveInfo) { - return Status::OK(); // nothing to do +} + +Status ReplicationCoordinatorImpl::processReplSetGetRBID(BSONObjBuilder* resultObj) { + boost::lock_guard<boost::mutex> lk(_mutex); + resultObj->append("rbid", _rbid); + return Status::OK(); +} + +void ReplicationCoordinatorImpl::incrementRollbackID() { + boost::lock_guard<boost::mutex> lk(_mutex); + ++_rbid; +} + +Status ReplicationCoordinatorImpl::processReplSetFresh(const ReplSetFreshArgs& args, + BSONObjBuilder* resultObj) { + Status result(ErrorCodes::InternalError, "didn't set status in prepareFreshResponse"); + CBHStatus cbh = _replExecutor.scheduleWork( + stdx::bind(&ReplicationCoordinatorImpl::_processReplSetFresh_finish, + this, + stdx::placeholders::_1, + args, + resultObj, + &result)); + if (cbh.getStatus() == ErrorCodes::ShutdownInProgress) { + return Status(ErrorCodes::ShutdownInProgress, "replication shutdown in progress"); + } + fassert(18652, cbh.getStatus()); + _replExecutor.wait(cbh.getValue()); + return result; +} + +void ReplicationCoordinatorImpl::_processReplSetFresh_finish( + const ReplicationExecutor::CallbackData& cbData, + const ReplSetFreshArgs& args, + BSONObjBuilder* response, + Status* result) { + if (cbData.status == ErrorCodes::CallbackCanceled) { + *result = Status(ErrorCodes::ShutdownInProgress, "replication shutdown in progress"); + return; + } + + _topCoord->prepareFreshResponse(args, _replExecutor.now(), getMyLastOptime(), response, result); +} + +Status ReplicationCoordinatorImpl::processReplSetElect(const ReplSetElectArgs& args, + BSONObjBuilder* responseObj) { + Status result = Status(ErrorCodes::InternalError, "status not set by callback"); + CBHStatus cbh = _replExecutor.scheduleWork( + stdx::bind(&ReplicationCoordinatorImpl::_processReplSetElect_finish, + this, + stdx::placeholders::_1, + args, + responseObj, + &result)); + if (cbh.getStatus() == ErrorCodes::ShutdownInProgress) { + return Status(ErrorCodes::ShutdownInProgress, "replication shutdown in progress"); + } + fassert(18657, cbh.getStatus()); + _replExecutor.wait(cbh.getValue()); + return result; +} + +void ReplicationCoordinatorImpl::_processReplSetElect_finish( + const ReplicationExecutor::CallbackData& cbData, + const ReplSetElectArgs& args, + BSONObjBuilder* response, + Status* result) { + if (cbData.status == ErrorCodes::CallbackCanceled) { + *result = Status(ErrorCodes::ShutdownInProgress, "replication shutdown in progress"); + return; + } + + _topCoord->prepareElectResponse(args, _replExecutor.now(), getMyLastOptime(), response, result); +} + +ReplicationCoordinatorImpl::PostMemberStateUpdateAction +ReplicationCoordinatorImpl::_setCurrentRSConfig_inlock(const ReplicaSetConfig& newConfig, + int myIndex) { + invariant(_settings.usingReplSets()); + _cancelHeartbeats(); + _setConfigState_inlock(kConfigSteady); + OpTime myOptime = _getMyLastOptime_inlock(); // Must get this before changing our config. 
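The "Must get this before changing our config" comment on the line above is load-bearing: the optime is read from state that is reorganized when the configuration changes, so it has to be captured before the updateConfig() call that follows swaps the topology underneath it. The overall install order in _setCurrentRSConfig_inlock() is worth spelling out; the stub helpers below stand in for the calls visible in this hunk and are illustrative only:

// Checklist sketch of the config install order (toy types, not MongoDB's).
struct ConfigInstaller {
    void cancelHeartbeats() {}             // old heartbeat targets are stale
    long readMyLastOptime() { return 0; }  // must run before the config swap
    void updateTopology(long) {}           // swap config + topology state
    void startHeartbeats() {}
    void wakeReadyWaiters() {}

    void install(bool inConfig) {
        cancelHeartbeats();
        long myOptime = readMyLastOptime();  // capture under the OLD config
        updateTopology(myOptime);
        if (inConfig)
            startHeartbeats();  // if not in the config, wait to be re-added
        wakeReadyWaiters();     // new config may satisfy pending write concerns
    }
};

int main() {
    ConfigInstaller installer;
    installer.install(/*inConfig=*/true);
}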
+ _topCoord->updateConfig(newConfig, myIndex, _replExecutor.now(), myOptime); + _rsConfig = newConfig; + log() << "New replica set config in use: " << _rsConfig.toBSON() << rsLog; + _selfIndex = myIndex; + if (_selfIndex >= 0) { + log() << "This node is " << _rsConfig.getMemberAt(_selfIndex).getHostAndPort() + << " in the config"; + } else { + log() << "This node is not a member of the config"; + } + + const PostMemberStateUpdateAction action = _updateMemberStateFromTopologyCoordinator_inlock(); + _updateSlaveInfoFromConfig_inlock(); + if (_selfIndex >= 0) { + // Don't send heartbeats if we're not in the config, if we get re-added one of the + // nodes in the set will contact us. + _startHeartbeats(); + } + _wakeReadyWaiters_inlock(); + return action; +} + +void ReplicationCoordinatorImpl::_wakeReadyWaiters_inlock() { + for (std::vector<WaiterInfo*>::iterator it = _replicationWaiterList.begin(); + it != _replicationWaiterList.end(); + ++it) { + WaiterInfo* info = *it; + if (_doneWaitingForReplication_inlock(*info->opTime, *info->writeConcern)) { + info->condVar->notify_all(); } - - SlaveInfo newSlaveInfo; - newSlaveInfo.rid = handshake.getRid(); - newSlaveInfo.memberId = -1; - newSlaveInfo.hostAndPort = _externalState->getClientHostAndPort(txn); - // Don't call _addSlaveInfo_inlock as that would wake sleepers unnecessarily. - _slaveInfo.push_back(newSlaveInfo); - - return Status::OK(); } +} - bool ReplicationCoordinatorImpl::buildsIndexes() { - boost::lock_guard<boost::mutex> lk(_mutex); - if (_selfIndex == -1) { - return true; +Status ReplicationCoordinatorImpl::processReplSetUpdatePosition(const UpdatePositionArgs& updates) { + boost::unique_lock<boost::mutex> lock(_mutex); + Status status = Status::OK(); + bool somethingChanged = false; + for (UpdatePositionArgs::UpdateIterator update = updates.updatesBegin(); + update != updates.updatesEnd(); + ++update) { + status = _setLastOptime_inlock(*update); + if (!status.isOK()) { + break; } - const MemberConfig& self = _rsConfig.getMemberAt(_selfIndex); - return self.shouldBuildIndexes(); + somethingChanged = true; } - std::vector<HostAndPort> ReplicationCoordinatorImpl::getHostsWrittenTo(const OpTime& op) { - std::vector<HostAndPort> hosts; - boost::lock_guard<boost::mutex> lk(_mutex); - for (size_t i = 0; i < _slaveInfo.size(); ++i) { - const SlaveInfo& slaveInfo = _slaveInfo[i]; - if (slaveInfo.opTime < op) { - continue; - } - - if (_getReplicationMode_inlock() == modeMasterSlave && - slaveInfo.rid == _getMyRID_inlock()) { - // Master-slave doesn't know the HostAndPort for itself at this point. 
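The continue that follows skips this node's own entry, since under master-slave its HostAndPort is unknown. The selection rule of the surrounding getHostsWrittenTo() loop, reduced to a runnable example: a host counts as "written to" when its recorded optime is at least the target, i.e. the loop keeps everything the "skip if behind" test does not reject. Types and values here are toys:

#include <iostream>
#include <vector>

struct Entry {
    long optime;
    const char* host;
};

int main() {
    std::vector<Entry> slaves = {{10, "a:27017"}, {25, "b:27017"}, {30, "c:27017"}};
    long target = 25;
    for (const Entry& e : slaves)
        if (!(e.optime < target))         // same "skip if behind" test as above
            std::cout << e.host << "\n";  // prints b:27017 and c:27017
}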
- continue; - } - hosts.push_back(slaveInfo.hostAndPort); - } - return hosts; + if (somethingChanged && !_getMemberState_inlock().primary()) { + lock.unlock(); + _externalState->forwardSlaveProgress(); // Must do this outside _mutex } + return status; +} - std::vector<HostAndPort> ReplicationCoordinatorImpl::getOtherNodesInReplSet() const { - boost::lock_guard<boost::mutex> lk(_mutex); - invariant(_settings.usingReplSets()); +Status ReplicationCoordinatorImpl::processHandshake(OperationContext* txn, + const HandshakeArgs& handshake) { + LOG(2) << "Received handshake " << handshake.toBSON(); - std::vector<HostAndPort> nodes; + boost::unique_lock<boost::mutex> lock(_mutex); + if (_getReplicationMode_inlock() == modeReplSet) { if (_selfIndex == -1) { - return nodes; + // Ignore updates when we're in state REMOVED + return Status(ErrorCodes::NotMasterOrSecondaryCode, + "Received replSetUpdatePosition command but we are in state REMOVED"); } - for (int i = 0; i < _rsConfig.getNumMembers(); ++i) { - if (i == _selfIndex) - continue; + int memberId = handshake.getMemberId(); + const MemberConfig* member = _rsConfig.findMemberByID(memberId); + if (!member) { + return Status(ErrorCodes::NodeNotFound, + str::stream() + << "Node with replica set memberId " << memberId + << " could not be found in replica set config while attempting" + " to associate it with RID " << handshake.getRid() + << " in replication handshake. ReplSet Config: " + << _rsConfig.toBSON().toString()); + } + SlaveInfo* slaveInfo = _findSlaveInfoByMemberID_inlock(handshake.getMemberId()); + invariant(slaveInfo); // If it's in the config it must be in _slaveInfo + slaveInfo->rid = handshake.getRid(); + slaveInfo->hostAndPort = member->getHostAndPort(); - nodes.push_back(_rsConfig.getMemberAt(i).getHostAndPort()); + if (!_getMemberState_inlock().primary()) { + lock.unlock(); + _externalState->forwardSlaveHandshake(); // must do outside _mutex } - return nodes; + return Status::OK(); } - Status ReplicationCoordinatorImpl::checkIfWriteConcernCanBeSatisfied( - const WriteConcernOptions& writeConcern) const { - boost::lock_guard<boost::mutex> lock(_mutex); - return _checkIfWriteConcernCanBeSatisfied_inlock(writeConcern); + // master-slave from here down + SlaveInfo* slaveInfo = _findSlaveInfoByRID_inlock(handshake.getRid()); + if (slaveInfo) { + return Status::OK(); // nothing to do } - Status ReplicationCoordinatorImpl::_checkIfWriteConcernCanBeSatisfied_inlock( - const WriteConcernOptions& writeConcern) const { - if (_getReplicationMode_inlock() == modeNone) { - return Status(ErrorCodes::NoReplicationEnabled, - "No replication enabled when checking if write concern can be satisfied"); - } + SlaveInfo newSlaveInfo; + newSlaveInfo.rid = handshake.getRid(); + newSlaveInfo.memberId = -1; + newSlaveInfo.hostAndPort = _externalState->getClientHostAndPort(txn); + // Don't call _addSlaveInfo_inlock as that would wake sleepers unnecessarily. + _slaveInfo.push_back(newSlaveInfo); - if (_getReplicationMode_inlock() == modeMasterSlave) { - if (!writeConcern.wMode.empty()) { - return Status(ErrorCodes::UnknownReplWriteConcern, - "Cannot use named write concern modes in master-slave"); - } - // No way to know how many slaves there are, so assume any numeric mode is possible. 
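The Status::OK() that follows is exactly that permissive fallback: with no roster of its slaves, a master-slave node has to assume any numeric w value might be satisfiable. The whole of _checkIfWriteConcernCanBeSatisfied_inlock() branches purely on replication mode; restated as a sketch (simplified signature, since the real method reads the mode and config from members under _mutex):

// Branch structure of the write-concern satisfiability check above.
Status canSatisfySketch(Mode mode,
                        const WriteConcernOptions& wc,
                        const ReplicaSetConfig& cfg) {
    if (mode == modeNone)
        return Status(ErrorCodes::NoReplicationEnabled, "no replication enabled");
    if (mode == modeMasterSlave) {
        if (!wc.wMode.empty())  // named modes need a replica set config
            return Status(ErrorCodes::UnknownReplWriteConcern,
                          "named write concern modes need a replica set");
        return Status::OK();    // numeric w: slave count unknown, assume yes
    }
    return cfg.checkIfWriteConcernCanBeSatisfied(wc);  // replSet: config decides
}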
- return Status::OK(); - } + return Status::OK(); +} - invariant(_getReplicationMode_inlock() == modeReplSet); - return _rsConfig.checkIfWriteConcernCanBeSatisfied(writeConcern); +bool ReplicationCoordinatorImpl::buildsIndexes() { + boost::lock_guard<boost::mutex> lk(_mutex); + if (_selfIndex == -1) { + return true; } + const MemberConfig& self = _rsConfig.getMemberAt(_selfIndex); + return self.shouldBuildIndexes(); +} - WriteConcernOptions ReplicationCoordinatorImpl::getGetLastErrorDefault() { - boost::mutex::scoped_lock lock(_mutex); - if (_rsConfig.isInitialized()) { - return _rsConfig.getDefaultWriteConcern(); +std::vector<HostAndPort> ReplicationCoordinatorImpl::getHostsWrittenTo(const OpTime& op) { + std::vector<HostAndPort> hosts; + boost::lock_guard<boost::mutex> lk(_mutex); + for (size_t i = 0; i < _slaveInfo.size(); ++i) { + const SlaveInfo& slaveInfo = _slaveInfo[i]; + if (slaveInfo.opTime < op) { + continue; } - return WriteConcernOptions(); - } - Status ReplicationCoordinatorImpl::checkReplEnabledForCommand(BSONObjBuilder* result) { - if (!_settings.usingReplSets()) { - if (serverGlobalParams.configsvr) { - result->append("info", "configsvr"); // for shell prompt - } - return Status(ErrorCodes::NoReplicationEnabled, "not running with --replSet"); + if (_getReplicationMode_inlock() == modeMasterSlave && + slaveInfo.rid == _getMyRID_inlock()) { + // Master-slave doesn't know the HostAndPort for itself at this point. + continue; } + hosts.push_back(slaveInfo.hostAndPort); + } + return hosts; +} - if (getMemberState().startup()) { - result->append("info", "run rs.initiate(...) if not yet done for the set"); - return Status(ErrorCodes::NotYetInitialized, "no replset config has been received"); - } +std::vector<HostAndPort> ReplicationCoordinatorImpl::getOtherNodesInReplSet() const { + boost::lock_guard<boost::mutex> lk(_mutex); + invariant(_settings.usingReplSets()); - return Status::OK(); + std::vector<HostAndPort> nodes; + if (_selfIndex == -1) { + return nodes; } - bool ReplicationCoordinatorImpl::isReplEnabled() const { - return getReplicationMode() != modeNone; - } + for (int i = 0; i < _rsConfig.getNumMembers(); ++i) { + if (i == _selfIndex) + continue; - void ReplicationCoordinatorImpl::_chooseNewSyncSource( - const ReplicationExecutor::CallbackData& cbData, - const OpTime& lastOpTimeFetched, - HostAndPort* newSyncSource) { - if (cbData.status == ErrorCodes::CallbackCanceled) { - return; - } - *newSyncSource = _topCoord->chooseNewSyncSource(_replExecutor.now(), lastOpTimeFetched); - } - - HostAndPort ReplicationCoordinatorImpl::chooseNewSyncSource(const OpTime& lastOpTimeFetched) { - HostAndPort newSyncSource; - CBHStatus cbh = _replExecutor.scheduleWork( - stdx::bind(&ReplicationCoordinatorImpl::_chooseNewSyncSource, - this, - stdx::placeholders::_1, - lastOpTimeFetched, - &newSyncSource)); - if (cbh.getStatus() == ErrorCodes::ShutdownInProgress) { - return newSyncSource; // empty - } - fassert(18740, cbh.getStatus()); - _replExecutor.wait(cbh.getValue()); - return newSyncSource; + nodes.push_back(_rsConfig.getMemberAt(i).getHostAndPort()); } + return nodes; +} - void ReplicationCoordinatorImpl::_blacklistSyncSource( - const ReplicationExecutor::CallbackData& cbData, - const HostAndPort& host, - Date_t until) { - if (cbData.status == ErrorCodes::CallbackCanceled) { - return; - } - _topCoord->blacklistSyncSource(host, until); - - CBHStatus cbh = _replExecutor.scheduleWorkAt( - until, - stdx::bind(&ReplicationCoordinatorImpl::_unblacklistSyncSource, - this, - 
stdx::placeholders::_1, - host)); - if (cbh.getStatus() == ErrorCodes::ShutdownInProgress) { - return; - } - fassert(28610, cbh.getStatus()); - } +Status ReplicationCoordinatorImpl::checkIfWriteConcernCanBeSatisfied( + const WriteConcernOptions& writeConcern) const { + boost::lock_guard<boost::mutex> lock(_mutex); + return _checkIfWriteConcernCanBeSatisfied_inlock(writeConcern); +} - void ReplicationCoordinatorImpl::_unblacklistSyncSource( - const ReplicationExecutor::CallbackData& cbData, - const HostAndPort& host) { - if (cbData.status == ErrorCodes::CallbackCanceled) - return; - _topCoord->unblacklistSyncSource(host, _replExecutor.now()); +Status ReplicationCoordinatorImpl::_checkIfWriteConcernCanBeSatisfied_inlock( + const WriteConcernOptions& writeConcern) const { + if (_getReplicationMode_inlock() == modeNone) { + return Status(ErrorCodes::NoReplicationEnabled, + "No replication enabled when checking if write concern can be satisfied"); } - void ReplicationCoordinatorImpl::blacklistSyncSource(const HostAndPort& host, Date_t until) { - CBHStatus cbh = _replExecutor.scheduleWork( - stdx::bind(&ReplicationCoordinatorImpl::_blacklistSyncSource, - this, - stdx::placeholders::_1, - host, - until)); - if (cbh.getStatus() == ErrorCodes::ShutdownInProgress) { - return; + if (_getReplicationMode_inlock() == modeMasterSlave) { + if (!writeConcern.wMode.empty()) { + return Status(ErrorCodes::UnknownReplWriteConcern, + "Cannot use named write concern modes in master-slave"); } - fassert(18741, cbh.getStatus()); - _replExecutor.wait(cbh.getValue()); + // No way to know how many slaves there are, so assume any numeric mode is possible. + return Status::OK(); } - void ReplicationCoordinatorImpl::resetLastOpTimeFromOplog(OperationContext* txn) { - StatusWith<OpTime> lastOpTimeStatus = _externalState->loadLastOpTime(txn); - OpTime lastOpTime(0, 0); - if (!lastOpTimeStatus.isOK()) { - warning() << "Failed to load timestamp of most recently applied operation; " << - lastOpTimeStatus.getStatus(); - } - else { - lastOpTime = lastOpTimeStatus.getValue(); - } - boost::unique_lock<boost::mutex> lk(_mutex); - _setMyLastOptime_inlock(&lk, lastOpTime, true); - _externalState->setGlobalOpTime(lastOpTime); + invariant(_getReplicationMode_inlock() == modeReplSet); + return _rsConfig.checkIfWriteConcernCanBeSatisfied(writeConcern); +} + +WriteConcernOptions ReplicationCoordinatorImpl::getGetLastErrorDefault() { + boost::mutex::scoped_lock lock(_mutex); + if (_rsConfig.isInitialized()) { + return _rsConfig.getDefaultWriteConcern(); + } + return WriteConcernOptions(); +} + +Status ReplicationCoordinatorImpl::checkReplEnabledForCommand(BSONObjBuilder* result) { + if (!_settings.usingReplSets()) { + if (serverGlobalParams.configsvr) { + result->append("info", "configsvr"); // for shell prompt + } + return Status(ErrorCodes::NoReplicationEnabled, "not running with --replSet"); + } + + if (getMemberState().startup()) { + result->append("info", "run rs.initiate(...) 
if not yet done for the set"); + return Status(ErrorCodes::NotYetInitialized, "no replset config has been received"); + } + + return Status::OK(); +} + +bool ReplicationCoordinatorImpl::isReplEnabled() const { + return getReplicationMode() != modeNone; +} + +void ReplicationCoordinatorImpl::_chooseNewSyncSource( + const ReplicationExecutor::CallbackData& cbData, + const OpTime& lastOpTimeFetched, + HostAndPort* newSyncSource) { + if (cbData.status == ErrorCodes::CallbackCanceled) { + return; + } + *newSyncSource = _topCoord->chooseNewSyncSource(_replExecutor.now(), lastOpTimeFetched); +} + +HostAndPort ReplicationCoordinatorImpl::chooseNewSyncSource(const OpTime& lastOpTimeFetched) { + HostAndPort newSyncSource; + CBHStatus cbh = + _replExecutor.scheduleWork(stdx::bind(&ReplicationCoordinatorImpl::_chooseNewSyncSource, + this, + stdx::placeholders::_1, + lastOpTimeFetched, + &newSyncSource)); + if (cbh.getStatus() == ErrorCodes::ShutdownInProgress) { + return newSyncSource; // empty + } + fassert(18740, cbh.getStatus()); + _replExecutor.wait(cbh.getValue()); + return newSyncSource; +} + +void ReplicationCoordinatorImpl::_blacklistSyncSource( + const ReplicationExecutor::CallbackData& cbData, const HostAndPort& host, Date_t until) { + if (cbData.status == ErrorCodes::CallbackCanceled) { + return; + } + _topCoord->blacklistSyncSource(host, until); + + CBHStatus cbh = + _replExecutor.scheduleWorkAt(until, + stdx::bind(&ReplicationCoordinatorImpl::_unblacklistSyncSource, + this, + stdx::placeholders::_1, + host)); + if (cbh.getStatus() == ErrorCodes::ShutdownInProgress) { + return; + } + fassert(28610, cbh.getStatus()); +} + +void ReplicationCoordinatorImpl::_unblacklistSyncSource( + const ReplicationExecutor::CallbackData& cbData, const HostAndPort& host) { + if (cbData.status == ErrorCodes::CallbackCanceled) + return; + _topCoord->unblacklistSyncSource(host, _replExecutor.now()); +} + +void ReplicationCoordinatorImpl::blacklistSyncSource(const HostAndPort& host, Date_t until) { + CBHStatus cbh = + _replExecutor.scheduleWork(stdx::bind(&ReplicationCoordinatorImpl::_blacklistSyncSource, + this, + stdx::placeholders::_1, + host, + until)); + if (cbh.getStatus() == ErrorCodes::ShutdownInProgress) { + return; + } + fassert(18741, cbh.getStatus()); + _replExecutor.wait(cbh.getValue()); +} + +void ReplicationCoordinatorImpl::resetLastOpTimeFromOplog(OperationContext* txn) { + StatusWith<OpTime> lastOpTimeStatus = _externalState->loadLastOpTime(txn); + OpTime lastOpTime(0, 0); + if (!lastOpTimeStatus.isOK()) { + warning() << "Failed to load timestamp of most recently applied operation; " + << lastOpTimeStatus.getStatus(); + } else { + lastOpTime = lastOpTimeStatus.getValue(); + } + boost::unique_lock<boost::mutex> lk(_mutex); + _setMyLastOptime_inlock(&lk, lastOpTime, true); + _externalState->setGlobalOpTime(lastOpTime); +} + +void ReplicationCoordinatorImpl::_shouldChangeSyncSource( + const ReplicationExecutor::CallbackData& cbData, + const HostAndPort& currentSource, + bool* shouldChange) { + if (cbData.status == ErrorCodes::CallbackCanceled) { + return; + } + + *shouldChange = _topCoord->shouldChangeSyncSource(currentSource, _replExecutor.now()); +} + +bool ReplicationCoordinatorImpl::shouldChangeSyncSource(const HostAndPort& currentSource) { + bool shouldChange(false); + CBHStatus cbh = + _replExecutor.scheduleWork(stdx::bind(&ReplicationCoordinatorImpl::_shouldChangeSyncSource, + this, + stdx::placeholders::_1, + currentSource, + &shouldChange)); + if (cbh.getStatus() == 
ErrorCodes::ShutdownInProgress) { + return false; } + fassert(18906, cbh.getStatus()); + _replExecutor.wait(cbh.getValue()); + return shouldChange; +} - void ReplicationCoordinatorImpl::_shouldChangeSyncSource( - const ReplicationExecutor::CallbackData& cbData, - const HostAndPort& currentSource, - bool* shouldChange) { - if (cbData.status == ErrorCodes::CallbackCanceled) { - return; - } - - *shouldChange = _topCoord->shouldChangeSyncSource(currentSource, _replExecutor.now()); +void ReplicationCoordinatorImpl::summarizeAsHtml(ReplSetHtmlSummary* output) { + CBHStatus cbh = + _replExecutor.scheduleWork(stdx::bind(&ReplicationCoordinatorImpl::_summarizeAsHtml_finish, + this, + stdx::placeholders::_1, + output)); + if (cbh.getStatus() == ErrorCodes::ShutdownInProgress) { + return; } + fassert(28638, cbh.getStatus()); + _replExecutor.wait(cbh.getValue()); +} - bool ReplicationCoordinatorImpl::shouldChangeSyncSource(const HostAndPort& currentSource) { - bool shouldChange(false); - CBHStatus cbh = _replExecutor.scheduleWork( - stdx::bind(&ReplicationCoordinatorImpl::_shouldChangeSyncSource, - this, - stdx::placeholders::_1, - currentSource, - &shouldChange)); - if (cbh.getStatus() == ErrorCodes::ShutdownInProgress) { - return false; - } - fassert(18906, cbh.getStatus()); - _replExecutor.wait(cbh.getValue()); - return shouldChange; - } - - void ReplicationCoordinatorImpl::summarizeAsHtml(ReplSetHtmlSummary* output) { - CBHStatus cbh = _replExecutor.scheduleWork( - stdx::bind(&ReplicationCoordinatorImpl::_summarizeAsHtml_finish, - this, - stdx::placeholders::_1, - output)); - if (cbh.getStatus() == ErrorCodes::ShutdownInProgress) { - return; - } - fassert(28638, cbh.getStatus()); - _replExecutor.wait(cbh.getValue()); +void ReplicationCoordinatorImpl::_summarizeAsHtml_finish( + const ReplicationExecutor::CallbackData& cbData, ReplSetHtmlSummary* output) { + if (cbData.status == ErrorCodes::CallbackCanceled) { + return; } - void ReplicationCoordinatorImpl::_summarizeAsHtml_finish( - const ReplicationExecutor::CallbackData& cbData, - ReplSetHtmlSummary* output) { - if (cbData.status == ErrorCodes::CallbackCanceled) { - return; - } + output->setSelfOptime(getMyLastOptime()); + output->setSelfUptime(time(0) - serverGlobalParams.started); + output->setNow(_replExecutor.now()); - output->setSelfOptime(getMyLastOptime()); - output->setSelfUptime(time(0) - serverGlobalParams.started); - output->setNow(_replExecutor.now()); - - _topCoord->summarizeAsHtml(output); - } + _topCoord->summarizeAsHtml(output); +} -} // namespace repl -} // namespace mongo +} // namespace repl +} // namespace mongo diff --git a/src/mongo/db/repl/replication_coordinator_impl.h b/src/mongo/db/repl/replication_coordinator_impl.h index ae08703b628..bdf6166cbcd 100644 --- a/src/mongo/db/repl/replication_coordinator_impl.h +++ b/src/mongo/db/repl/replication_coordinator_impl.h @@ -50,836 +50,819 @@ namespace mongo { - class Timer; - template <typename T> class StatusWith; +class Timer; +template <typename T> +class StatusWith; namespace repl { - class ElectCmdRunner; - class FreshnessChecker; - class HeartbeatResponseAction; - class OplogReader; - class SyncSourceFeedback; - class TopologyCoordinator; +class ElectCmdRunner; +class FreshnessChecker; +class HeartbeatResponseAction; +class OplogReader; +class SyncSourceFeedback; +class TopologyCoordinator; - class ReplicationCoordinatorImpl : public ReplicationCoordinator, - public KillOpListenerInterface { - MONGO_DISALLOW_COPYING(ReplicationCoordinatorImpl); +class 
ReplicationCoordinatorImpl : public ReplicationCoordinator, public KillOpListenerInterface { + MONGO_DISALLOW_COPYING(ReplicationCoordinatorImpl); - public: +public: + // Takes ownership of the "externalState", "topCoord" and "network" objects. + ReplicationCoordinatorImpl(const ReplSettings& settings, + ReplicationCoordinatorExternalState* externalState, + ReplicationExecutor::NetworkInterface* network, + TopologyCoordinator* topoCoord, + int64_t prngSeed); + virtual ~ReplicationCoordinatorImpl(); - // Takes ownership of the "externalState", "topCoord" and "network" objects. - ReplicationCoordinatorImpl(const ReplSettings& settings, - ReplicationCoordinatorExternalState* externalState, - ReplicationExecutor::NetworkInterface* network, - TopologyCoordinator* topoCoord, - int64_t prngSeed); - virtual ~ReplicationCoordinatorImpl(); + // ================== Members of public ReplicationCoordinator API =================== - // ================== Members of public ReplicationCoordinator API =================== + virtual void startReplication(OperationContext* txn); - virtual void startReplication(OperationContext* txn); + virtual void shutdown(); - virtual void shutdown(); + virtual const ReplSettings& getSettings() const; - virtual const ReplSettings& getSettings() const; + virtual Mode getReplicationMode() const; - virtual Mode getReplicationMode() const; + virtual MemberState getMemberState() const; - virtual MemberState getMemberState() const; + virtual bool isInPrimaryOrSecondaryState() const; - virtual bool isInPrimaryOrSecondaryState() const; + virtual Seconds getSlaveDelaySecs() const; - virtual Seconds getSlaveDelaySecs() const; + virtual void clearSyncSourceBlacklist(); - virtual void clearSyncSourceBlacklist(); + /* + * Implementation of the KillOpListenerInterface interrupt method so that we can wake up + * threads blocked in awaitReplication() when a killOp command comes in. + */ + virtual void interrupt(unsigned opId); - /* - * Implementation of the KillOpListenerInterface interrupt method so that we can wake up - * threads blocked in awaitReplication() when a killOp command comes in. - */ - virtual void interrupt(unsigned opId); + /* + * Implementation of the KillOpListenerInterface interruptAll method so that we can wake up + * threads blocked in awaitReplication() when we kill all operations. + */ + virtual void interruptAll(); - /* - * Implementation of the KillOpListenerInterface interruptAll method so that we can wake up - * threads blocked in awaitReplication() when we kill all operations. 
- */ - virtual void interruptAll(); + virtual ReplicationCoordinator::StatusAndDuration awaitReplication( + const OperationContext* txn, const OpTime& ts, const WriteConcernOptions& writeConcern); - virtual ReplicationCoordinator::StatusAndDuration awaitReplication( - const OperationContext* txn, - const OpTime& ts, - const WriteConcernOptions& writeConcern); + virtual ReplicationCoordinator::StatusAndDuration awaitReplicationOfLastOpForClient( + const OperationContext* txn, const WriteConcernOptions& writeConcern); - virtual ReplicationCoordinator::StatusAndDuration awaitReplicationOfLastOpForClient( - const OperationContext* txn, - const WriteConcernOptions& writeConcern); + virtual Status stepDown(OperationContext* txn, + bool force, + const Milliseconds& waitTime, + const Milliseconds& stepdownTime); - virtual Status stepDown(OperationContext* txn, - bool force, - const Milliseconds& waitTime, - const Milliseconds& stepdownTime); + virtual bool isMasterForReportingPurposes(); - virtual bool isMasterForReportingPurposes(); + virtual bool canAcceptWritesForDatabase(const StringData& dbName); - virtual bool canAcceptWritesForDatabase(const StringData& dbName); + virtual Status checkIfWriteConcernCanBeSatisfied(const WriteConcernOptions& writeConcern) const; - virtual Status checkIfWriteConcernCanBeSatisfied( - const WriteConcernOptions& writeConcern) const; + virtual Status checkCanServeReadsFor(OperationContext* txn, + const NamespaceString& ns, + bool slaveOk); - virtual Status checkCanServeReadsFor(OperationContext* txn, - const NamespaceString& ns, - bool slaveOk); + virtual bool shouldIgnoreUniqueIndex(const IndexDescriptor* idx); - virtual bool shouldIgnoreUniqueIndex(const IndexDescriptor* idx); + virtual Status setLastOptimeForSlave(const OID& rid, const OpTime& ts); - virtual Status setLastOptimeForSlave(const OID& rid, const OpTime& ts); + virtual void setMyLastOptime(const OpTime& ts); - virtual void setMyLastOptime(const OpTime& ts); + virtual void resetMyLastOptime(); - virtual void resetMyLastOptime(); + virtual void setMyHeartbeatMessage(const std::string& msg); - virtual void setMyHeartbeatMessage(const std::string& msg); + virtual OpTime getMyLastOptime() const; - virtual OpTime getMyLastOptime() const; + virtual OID getElectionId(); - virtual OID getElectionId(); + virtual OID getMyRID() const; - virtual OID getMyRID() const; + virtual int getMyId() const; - virtual int getMyId() const; + virtual bool setFollowerMode(const MemberState& newState); - virtual bool setFollowerMode(const MemberState& newState); + virtual bool isWaitingForApplierToDrain(); - virtual bool isWaitingForApplierToDrain(); + virtual void signalDrainComplete(OperationContext* txn); - virtual void signalDrainComplete(OperationContext* txn); + virtual void signalUpstreamUpdater(); - virtual void signalUpstreamUpdater(); + virtual bool prepareReplSetUpdatePositionCommand(BSONObjBuilder* cmdBuilder); - virtual bool prepareReplSetUpdatePositionCommand(BSONObjBuilder* cmdBuilder); + virtual void prepareReplSetUpdatePositionCommandHandshakes(std::vector<BSONObj>* handshakes); - virtual void prepareReplSetUpdatePositionCommandHandshakes( - std::vector<BSONObj>* handshakes); + virtual Status processReplSetGetStatus(BSONObjBuilder* result); - virtual Status processReplSetGetStatus(BSONObjBuilder* result); + virtual void fillIsMasterForReplSet(IsMasterResponse* result); - virtual void fillIsMasterForReplSet(IsMasterResponse* result); + virtual void appendSlaveInfoData(BSONObjBuilder* result); - virtual void 
appendSlaveInfoData(BSONObjBuilder* result); + virtual void processReplSetGetConfig(BSONObjBuilder* result); - virtual void processReplSetGetConfig(BSONObjBuilder* result); + virtual Status setMaintenanceMode(bool activate); - virtual Status setMaintenanceMode(bool activate); + virtual bool getMaintenanceMode(); - virtual bool getMaintenanceMode(); + virtual Status processReplSetSyncFrom(const HostAndPort& target, BSONObjBuilder* resultObj); - virtual Status processReplSetSyncFrom(const HostAndPort& target, - BSONObjBuilder* resultObj); + virtual Status processReplSetFreeze(int secs, BSONObjBuilder* resultObj); - virtual Status processReplSetFreeze(int secs, BSONObjBuilder* resultObj); + virtual Status processHeartbeat(const ReplSetHeartbeatArgs& args, + ReplSetHeartbeatResponse* response); - virtual Status processHeartbeat(const ReplSetHeartbeatArgs& args, - ReplSetHeartbeatResponse* response); + virtual Status processReplSetReconfig(OperationContext* txn, + const ReplSetReconfigArgs& args, + BSONObjBuilder* resultObj); - virtual Status processReplSetReconfig(OperationContext* txn, - const ReplSetReconfigArgs& args, - BSONObjBuilder* resultObj); + virtual Status processReplSetInitiate(OperationContext* txn, + const BSONObj& configObj, + BSONObjBuilder* resultObj); - virtual Status processReplSetInitiate(OperationContext* txn, - const BSONObj& configObj, - BSONObjBuilder* resultObj); + virtual Status processReplSetGetRBID(BSONObjBuilder* resultObj); - virtual Status processReplSetGetRBID(BSONObjBuilder* resultObj); + virtual void incrementRollbackID(); - virtual void incrementRollbackID(); + virtual Status processReplSetFresh(const ReplSetFreshArgs& args, BSONObjBuilder* resultObj); - virtual Status processReplSetFresh(const ReplSetFreshArgs& args, - BSONObjBuilder* resultObj); + virtual Status processReplSetElect(const ReplSetElectArgs& args, BSONObjBuilder* response); - virtual Status processReplSetElect(const ReplSetElectArgs& args, - BSONObjBuilder* response); + virtual Status processReplSetUpdatePosition(const UpdatePositionArgs& updates); - virtual Status processReplSetUpdatePosition(const UpdatePositionArgs& updates); + virtual Status processHandshake(OperationContext* txn, const HandshakeArgs& handshake); - virtual Status processHandshake(OperationContext* txn, const HandshakeArgs& handshake); + virtual bool buildsIndexes(); - virtual bool buildsIndexes(); + virtual std::vector<HostAndPort> getHostsWrittenTo(const OpTime& op); - virtual std::vector<HostAndPort> getHostsWrittenTo(const OpTime& op); + virtual std::vector<HostAndPort> getOtherNodesInReplSet() const; - virtual std::vector<HostAndPort> getOtherNodesInReplSet() const; + virtual WriteConcernOptions getGetLastErrorDefault(); - virtual WriteConcernOptions getGetLastErrorDefault(); + virtual Status checkReplEnabledForCommand(BSONObjBuilder* result); - virtual Status checkReplEnabledForCommand(BSONObjBuilder* result); + virtual bool isReplEnabled() const; - virtual bool isReplEnabled() const; + virtual HostAndPort chooseNewSyncSource(const OpTime& lastOpTimeFetched); - virtual HostAndPort chooseNewSyncSource(const OpTime& lastOpTimeFetched); + virtual void blacklistSyncSource(const HostAndPort& host, Date_t until); - virtual void blacklistSyncSource(const HostAndPort& host, Date_t until); + virtual void resetLastOpTimeFromOplog(OperationContext* txn); - virtual void resetLastOpTimeFromOplog(OperationContext* txn); + virtual bool shouldChangeSyncSource(const HostAndPort& currentSource); - virtual bool 
shouldChangeSyncSource(const HostAndPort& currentSource); + virtual void summarizeAsHtml(ReplSetHtmlSummary* s); - virtual void summarizeAsHtml(ReplSetHtmlSummary* s); + // ================== Test support API =================== - // ================== Test support API =================== + /** + * If called after startReplication(), blocks until all asynchronous + * activities associated with replication start-up complete. + */ + void waitForStartUpComplete(); - /** - * If called after startReplication(), blocks until all asynchronous - * activities associated with replication start-up complete. - */ - void waitForStartUpComplete(); + /** + * Gets the replica set configuration in use by the node. + */ + ReplicaSetConfig getReplicaSetConfig_forTest(); - /** - * Gets the replica set configuration in use by the node. - */ - ReplicaSetConfig getReplicaSetConfig_forTest(); + /** + * Simple wrapper around _setLastOptime_inlock to make it easier to test. + */ + Status setLastOptime_forTest(const OID& rid, const OpTime& ts); - /** - * Simple wrapper around _setLastOptime_inlock to make it easier to test. - */ - Status setLastOptime_forTest(const OID& rid, const OpTime& ts); +private: + /** + * Configuration states for a replica set node. + * + * Transition diagram: + * + * PreStart ------------------> ReplicationDisabled + * | + * | + * v + * StartingUp -------> Uninitialized <------> Initiating + * \ ^ | + * ------- | | + * | | | + * v v | + * Reconfig <---> Steady <----> HBReconfig | + * ^ / + * | / + * \ / + * ----------------------- + */ + enum ConfigState { + kConfigPreStart, + kConfigStartingUp, + kConfigReplicationDisabled, + kConfigUninitialized, + kConfigSteady, + kConfigInitiating, + kConfigReconfiguring, + kConfigHBReconfiguring + }; - private: + /** + * Type describing actions to take after a change to the MemberState _memberState. + */ + enum PostMemberStateUpdateAction { + kActionNone, + kActionCloseAllConnections, // Also indicates that we should clear sharding state. + kActionChooseNewSyncSource, + kActionWinElection + }; - /** - * Configuration states for a replica set node. - * - * Transition diagram: - * - * PreStart ------------------> ReplicationDisabled - * | - * | - * v - * StartingUp -------> Uninitialized <------> Initiating - * \ ^ | - * ------- | | - * | | | - * v v | - * Reconfig <---> Steady <----> HBReconfig | - * ^ / - * | / - * \ / - * ----------------------- - */ - enum ConfigState { - kConfigPreStart, - kConfigStartingUp, - kConfigReplicationDisabled, - kConfigUninitialized, - kConfigSteady, - kConfigInitiating, - kConfigReconfiguring, - kConfigHBReconfiguring - }; - - /** - * Type describing actions to take after a change to the MemberState _memberState. - */ - enum PostMemberStateUpdateAction { - kActionNone, - kActionCloseAllConnections, // Also indicates that we should clear sharding state. - kActionChooseNewSyncSource, - kActionWinElection - }; - - // Struct that holds information about clients waiting for replication. - struct WaiterInfo; - - // Struct that holds information about nodes in this replication group, mainly used for - // tracking replication progress for write concern satisfaction. - struct SlaveInfo { - OpTime opTime; // Our last known OpTime that this slave has replicated to. - HostAndPort hostAndPort; // Client address of the slave. - int memberId; // Id of the node in the replica set config, or -1 if we're not a replSet. - OID rid; // RID of the node. 
- bool self; // Whether this SlaveInfo stores the information about ourself - SlaveInfo() : memberId(-1), self(false) {} - }; - - typedef std::vector<SlaveInfo> SlaveInfoVector; - - typedef std::vector<ReplicationExecutor::CallbackHandle> HeartbeatHandles; - - /** - * Looks up the SlaveInfo in _slaveInfo associated with the given RID and returns a pointer - * to it, or returns NULL if there is no SlaveInfo with the given RID. - */ - SlaveInfo* _findSlaveInfoByRID_inlock(const OID& rid); - - /** - * Looks up the SlaveInfo in _slaveInfo associated with the given member ID and returns a - * pointer to it, or returns NULL if there is no SlaveInfo with the given member ID. - */ - SlaveInfo* _findSlaveInfoByMemberID_inlock(int memberID); - - /** - * Adds the given SlaveInfo to _slaveInfo and wakes up any threads waiting for replication - * that now have their write concern satisfied. Only valid to call in master/slave setups. - */ - void _addSlaveInfo_inlock(const SlaveInfo& slaveInfo); - - /** - * Updates the item in _slaveInfo pointed to by 'slaveInfo' with the given OpTime 'ts' - * and wakes up any threads waiting for replication that now have their write concern - * satisfied. - */ - void _updateSlaveInfoOptime_inlock(SlaveInfo* slaveInfo, OpTime ts); - - /** - * Returns the index into _slaveInfo where data corresponding to ourself is stored. - * For more info on the rules about how we know where our entry is, see the comment for - * _slaveInfo. - */ - size_t _getMyIndexInSlaveInfo_inlock() const; - - /** - * Helper method that removes entries from _slaveInfo if they correspond to a node - * with a member ID that is not in the current replica set config. Will always leave an - * entry for ourself at the beginning of _slaveInfo, even if we aren't present in the - * config. - */ - void _updateSlaveInfoFromConfig_inlock(); - - /** - * Helper to update our saved config, cancel any pending heartbeats, and kick off sending - * new heartbeats based on the new config. Must *only* be called from within the - * ReplicationExecutor context. - * - * Returns an action to be performed after unlocking _mutex, via - * _performPostMemberStateUpdateAction. - */ - PostMemberStateUpdateAction _setCurrentRSConfig_inlock( - const ReplicaSetConfig& newConfig, - int myIndex); - - /** - * Helper to wake waiters in _replicationWaiterList that are doneWaitingForReplication. - */ - void _wakeReadyWaiters_inlock(); - - /** - * Helper method for setting/unsetting maintenance mode. Scheduled by setMaintenanceMode() - * to run in a global write lock in the replication executor thread. - */ - void _setMaintenanceMode_helper(const ReplicationExecutor::CallbackData& cbData, - bool activate, - Status* result); - - /** - * Helper method for retrieving maintenance mode. Scheduled by getMaintenanceMode() to run - * in the replication executor thread. - */ - void _getMaintenanceMode_helper(const ReplicationExecutor::CallbackData& cbData, - bool* maintenanceMode); - - /** - * Bottom half of fillIsMasterForReplSet. - */ - void _fillIsMasterForReplSet_finish(const ReplicationExecutor::CallbackData& cbData, - IsMasterResponse* result); - - /** - * Bottom half of processReplSetFresh. - */ - void _processReplSetFresh_finish(const ReplicationExecutor::CallbackData& cbData, - const ReplSetFreshArgs& args, - BSONObjBuilder* response, - Status* result); - - /** - * Bottom half of processReplSetElect. 
- */ - void _processReplSetElect_finish(const ReplicationExecutor::CallbackData& cbData, - const ReplSetElectArgs& args, - BSONObjBuilder* response, - Status* result); - - /** - * Bottom half of processReplSetFreeze. - */ - void _processReplSetFreeze_finish(const ReplicationExecutor::CallbackData& cbData, - int secs, - BSONObjBuilder* response, - Status* result); - /* - * Bottom half of clearSyncSourceBlacklist - */ - void _clearSyncSourceBlacklist_finish(const ReplicationExecutor::CallbackData& cbData); - - /** - * Scheduled to cause the ReplicationCoordinator to reconsider any state that might - * need to change as a result of time passing - for instance becoming PRIMARY when a single - * node replica set member's stepDown period ends. - */ - void _handleTimePassing(const ReplicationExecutor::CallbackData& cbData); - - /** - * Helper method for _awaitReplication that takes an already locked unique_lock and a - * Timer for timing the operation which has been counting since before the lock was - * acquired. - */ - ReplicationCoordinator::StatusAndDuration _awaitReplication_inlock( - const Timer* timer, - boost::unique_lock<boost::mutex>* lock, - const OperationContext* txn, - const OpTime& ts, - const WriteConcernOptions& writeConcern); - - /* - * Returns true if the given writeConcern is satisfied up to "optime" or is unsatisfiable. - */ - bool _doneWaitingForReplication_inlock(const OpTime& opTime, - const WriteConcernOptions& writeConcern); - - /** - * Helper for _doneWaitingForReplication_inlock that takes an integer write concern. - */ - bool _haveNumNodesReachedOpTime_inlock(const OpTime& opTime, int numNodes); - - /** - * Helper for _doneWaitingForReplication_inlock that takes a tag pattern representing a - * named write concern mode. - */ - bool _haveTaggedNodesReachedOpTime_inlock(const OpTime& opTime, - const ReplicaSetTagPattern& tagPattern); - - Status _checkIfWriteConcernCanBeSatisfied_inlock( - const WriteConcernOptions& writeConcern) const; - - /** - * Triggers all callbacks that are blocked waiting for new heartbeat data - * to decide whether or not to finish a step down. - * Should only be called from executor callbacks. - */ - void _signalStepDownWaitersFromCallback(const ReplicationExecutor::CallbackData& cbData); - void _signalStepDownWaiters(); - - /** - * Helper for stepDown run within a ReplicationExecutor callback. This method assumes - * it is running within a global shared lock, and thus that no writes are going on at the - * same time. - */ - void _stepDownContinue(const ReplicationExecutor::CallbackData& cbData, - const ReplicationExecutor::EventHandle finishedEvent, - OperationContext* txn, - Date_t waitUntil, - Date_t stepdownUntil, - bool force, - Status* result); - - OID _getMyRID_inlock() const; - - int _getMyId_inlock() const; - - OpTime _getMyLastOptime_inlock() const; - - - /** - * Bottom half of setFollowerMode. - * - * May reschedule itself after the current election, so it is not sufficient to - * wait for a callback scheduled to execute this method to complete. Instead, - * supply an event, "finishedSettingFollowerMode", and wait for that event to - * be signaled. Do not observe "*success" until after the event is signaled. - */ - void _setFollowerModeFinish( - const ReplicationExecutor::CallbackData& cbData, - const MemberState& newState, - const ReplicationExecutor::EventHandle& finishedSettingFollowerMode, - bool* success); - - /** - * Helper method for updating our tracking of the last optime applied by a given node. 
- * This is only valid to call on replica sets. - */ - Status _setLastOptime_inlock(const UpdatePositionArgs::UpdateInfo& args); - - /** - * Helper method for setMyLastOptime that takes in a unique lock on - * _mutex. The passed in lock must already be locked. It is unspecified what state the - * lock will be in after this method finishes. - * - * This function has the same rules for "ts" as setMyLastOptime(), unless - * "isRollbackAllowed" is true. - */ - void _setMyLastOptime_inlock(boost::unique_lock<boost::mutex>* lock, - const OpTime& ts, - bool isRollbackAllowed); - - /** - * Schedules a heartbeat to be sent to "target" at "when". "targetIndex" is the index - * into the replica set config members array that corresponds to the "target", or -1 if - * "target" is not in _rsConfig. - */ - void _scheduleHeartbeatToTarget(const HostAndPort& target, int targetIndex, Date_t when); - - /** - * Processes each heartbeat response. - * - * Schedules additional heartbeats, triggers elections and step downs, etc. - */ - void _handleHeartbeatResponse(const ReplicationExecutor::RemoteCommandCallbackData& cbData, - int targetIndex); - - void _trackHeartbeatHandle(const StatusWith<ReplicationExecutor::CallbackHandle>& handle); - - void _untrackHeartbeatHandle(const ReplicationExecutor::CallbackHandle& handle); - - /** - * Helper for _handleHeartbeatResponse. - * - * Updates the optime associated with the member at "memberIndex" in our config. - */ - void _updateOpTimeFromHeartbeat_inlock(int memberIndex, OpTime optime); - - /** - * Starts a heartbeat for each member in the current config. Called within the executor - * context. - */ - void _startHeartbeats(); - - /** - * Cancels all heartbeats. Called within executor context. - */ - void _cancelHeartbeats(); - - /** - * Asynchronously sends a heartbeat to "target". "targetIndex" is the index - * into the replica set config members array that corresponds to the "target", or -1 if - * we don't have a valid replica set config. - * - * Scheduled by _scheduleHeartbeatToTarget. - */ - void _doMemberHeartbeat(ReplicationExecutor::CallbackData cbData, - const HostAndPort& target, - int targetIndex); - - - MemberState _getMemberState_inlock() const; - - /** - * Returns the current replication mode. This method requires the caller to be holding - * "_mutex" to be called safely. - */ - Mode _getReplicationMode_inlock() const; - - /** - * Starts loading the replication configuration from local storage, and if it is valid, - * schedules a callback (of _finishLoadLocalConfig) to set it as the current replica set - * config (sets _rsConfig and _thisMembersConfigIndex). - * Returns true if it finishes loading the local config, which most likely means there - * was no local config at all or it was invalid in some way, and false if there was a valid - * config detected but more work is needed to set it as the local config (which will be - * handled by the callback to _finishLoadLocalConfig). - */ - bool _startLoadLocalConfig(OperationContext* txn); - - /** - * Callback that finishes the work started in _startLoadLocalConfig and sets _rsConfigState - * to kConfigSteady, so that we can begin processing heartbeats and reconfigs. - */ - void _finishLoadLocalConfig(const ReplicationExecutor::CallbackData& cbData, - const ReplicaSetConfig& localConfig, - const StatusWith<OpTime>& lastOpTimeStatus); - - /** - * Callback that finishes the work of processReplSetInitiate() inside the replication - * executor context, in the event of a successful quorum check. 
- */
- void _finishReplSetInitiate(
- const ReplicationExecutor::CallbackData& cbData,
- const ReplicaSetConfig& newConfig,
- int myIndex);
-
- /**
- * Callback that finishes the work of processReplSetReconfig inside the replication
- * executor context, in the event of a successful quorum check.
- */
- void _finishReplSetReconfig(
- const ReplicationExecutor::CallbackData& cbData,
- const ReplicaSetConfig& newConfig,
- int myIndex);
-
- /**
- * Changes _rsConfigState to newState, and notifies any waiters.
- */
- void _setConfigState_inlock(ConfigState newState);
-
- /**
- * Updates the cached value, _memberState, to match _topCoord's reported
- * member state, from getMemberState().
- *
- * Returns an enum indicating what action to take after releasing _mutex, if any.
- * Call _performPostMemberStateUpdateAction on the return value after releasing
- * _mutex.
- */
- PostMemberStateUpdateAction _updateMemberStateFromTopologyCoordinator_inlock();
-
- /**
- * Performs a post member-state update action. Do not call while holding _mutex.
- */
- void _performPostMemberStateUpdateAction(PostMemberStateUpdateAction action);
-
- /**
- * Begins an attempt to elect this node.
- * Called after an incoming heartbeat changes this node's view of the set such that it
- * believes it can be elected PRIMARY.
- * For proper concurrency, must be called via a ReplicationExecutor callback.
- */
- void _startElectSelf();
-
- /**
- * Callback called when the FreshnessChecker has completed; checks the results and
- * decides whether to continue election proceedings.
- * _electionFinishedEvent is signaled when the election completes.
- */
- void _onFreshnessCheckComplete();
-
- /**
- * Callback called when the ElectCmdRunner has completed; checks the results and
- * decides whether to complete the election and change state to primary.
- * _electionFinishedEvent is signaled when the election completes.
- */
- void _onElectCmdRunnerComplete();
-
- /**
- * Callback called after a random delay, to prevent repeated election ties.
- */
- void _recoverFromElectionTie(const ReplicationExecutor::CallbackData& cbData);
-
- /**
- * Chooses a new sync source. Must be scheduled as a callback.
- *
- * Calls into the Topology Coordinator, which uses its current view of the set to choose
- * the most appropriate sync source.
- */
- void _chooseNewSyncSource(const ReplicationExecutor::CallbackData& cbData,
- const OpTime& lastOpTimeFetched,
- HostAndPort* newSyncSource);
-
- /**
- * Adds 'host' to the sync source blacklist until 'until'. A blacklisted source cannot
- * be chosen as a sync source. Schedules a callback to unblacklist the sync source to be
- * run at 'until'.
- *
- * Must be scheduled as a callback.
- */
- void _blacklistSyncSource(const ReplicationExecutor::CallbackData& cbData,
- const HostAndPort& host,
- Date_t until);
-
- /**
- * Removes 'host' from the sync source blacklist. If 'host' isn't found, it's simply
- * ignored and no error is thrown.
- *
- * Must be scheduled as a callback.
- */
- void _unblacklistSyncSource(const ReplicationExecutor::CallbackData& cbData,
- const HostAndPort& host);
-
- /**
- * Determines if a new sync source should be considered.
- *
- * Must be scheduled as a callback.
- */
- void _shouldChangeSyncSource(const ReplicationExecutor::CallbackData& cbData,
- const HostAndPort& currentSource,
- bool* shouldChange);
-
- /**
- * Schedules a request that the given host step down; logs any errors.
- */ - void _requestRemotePrimaryStepdown(const HostAndPort& target); - - void _heartbeatStepDownStart(); - - /** - * Completes a step-down of the current node triggered by a heartbeat. Must - * be run with a global shared or global exclusive lock. - */ - void _heartbeatStepDownFinish(const ReplicationExecutor::CallbackData& cbData); - - /** - * Schedules a replica set config change. - */ - void _scheduleHeartbeatReconfig(const ReplicaSetConfig& newConfig); - - /** - * Callback that continues a heartbeat-initiated reconfig after a running election - * completes. - */ - void _heartbeatReconfigAfterElectionCanceled( - const ReplicationExecutor::CallbackData& cbData, - const ReplicaSetConfig& newConfig); - - /** - * Method to write a configuration transmitted via heartbeat message to stable storage. - */ - void _heartbeatReconfigStore(const ReplicaSetConfig& newConfig); - - /** - * Conclusion actions of a heartbeat-triggered reconfiguration. - */ - void _heartbeatReconfigFinish(const ReplicationExecutor::CallbackData& cbData, - const ReplicaSetConfig& newConfig, - StatusWith<int> myIndex); - - /** - * Utility method that schedules or performs actions specified by a HeartbeatResponseAction - * returned by a TopologyCoordinator::processHeartbeatResponse call with the given - * value of "responseStatus". - */ - void _handleHeartbeatResponseAction( - const HeartbeatResponseAction& action, - const StatusWith<ReplSetHeartbeatResponse>& responseStatus); - - /** - * Bottom half of processHeartbeat(), which runs in the replication executor. - */ - void _processHeartbeatFinish(const ReplicationExecutor::CallbackData& cbData, - const ReplSetHeartbeatArgs& args, - ReplSetHeartbeatResponse* response, - Status* outStatus); - - void _summarizeAsHtml_finish(const ReplicationExecutor::CallbackData& cbData, - ReplSetHtmlSummary* output); - - // - // All member variables are labeled with one of the following codes indicating the - // synchronization rules for accessing them. - // - // (R) Read-only in concurrent operation; no synchronization required. - // (S) Self-synchronizing; access in any way from any context. - // (PS) Pointer is read-only in concurrent operation, item pointed to is self-synchronizing; - // Access in any context. - // (M) Reads and writes guarded by _mutex - // (X) Reads and writes must be performed in a callback in _replExecutor - // (MX) Must hold _mutex and be in a callback in _replExecutor to write; must either hold - // _mutex or be in a callback in _replExecutor to read. - // (GX) Readable under a global intent lock. Must either hold global lock in exclusive - // mode (MODE_X) or both hold global lock in shared mode (MODE_S) and be in executor - // context to write. - // (I) Independently synchronized, see member variable comment. - - // Protects member data of this ReplicationCoordinator. - mutable boost::mutex _mutex; // (S) - - // Handles to actively queued heartbeats. - HeartbeatHandles _heartbeatHandles; // (X) - - // When this node does not know itself to be a member of a config, it adds - // every host that sends it a heartbeat request to this set, and also starts - // sending heartbeat requests to that host. This set is cleared whenever - // a node discovers that it is a member of a config. - unordered_set<HostAndPort> _seedList; // (X) - - // Parsed command line arguments related to replication. - const ReplSettings _settings; // (R) - - // Mode of replication specified by _settings. 
- const Mode _replMode; // (R) - - // Pointer to the TopologyCoordinator owned by this ReplicationCoordinator. - boost::scoped_ptr<TopologyCoordinator> _topCoord; // (X) - - // Executor that drives the topology coordinator. - ReplicationExecutor _replExecutor; // (S) - - // Pointer to the ReplicationCoordinatorExternalState owned by this ReplicationCoordinator. - boost::scoped_ptr<ReplicationCoordinatorExternalState> _externalState; // (PS) - - // Thread that drives actions in the topology coordinator - // Set in startReplication() and thereafter accessed in shutdown. - boost::scoped_ptr<boost::thread> _topCoordDriverThread; // (I) - - // Thread that is used to write new configs received via a heartbeat reconfig - // to stable storage. It is an error to change this if _inShutdown is true. - boost::scoped_ptr<boost::thread> _heartbeatReconfigThread; // (M) - - // Our RID, used to identify us to our sync source when sending replication progress - // updates upstream. Set once in startReplication() and then never modified again. - OID _myRID; // (M) - - // Rollback ID. Used to check if a rollback happened during some interval of time - // TODO: ideally this should only change on rollbacks NOT on mongod restarts also. - int _rbid; // (M) - - // list of information about clients waiting on replication. Does *not* own the - // WaiterInfos. - std::vector<WaiterInfo*> _replicationWaiterList; // (M) - - // Set to true when we are in the process of shutting down replication. - bool _inShutdown; // (M) - - // Election ID of the last election that resulted in this node becoming primary. - OID _electionId; // (M) - - // Vector containing known information about each member (such as replication - // progress and member ID) in our replica set or each member replicating from - // us in a master-slave deployment. In master/slave, the first entry is - // guaranteed to correspond to ourself. In replica sets where we don't have a - // valid config or are in state REMOVED then the vector will be a single element - // just with info about ourself. In replica sets with a valid config the elements - // will be in the same order as the members in the replica set config, thus - // the entry for ourself will be at _thisMemberConfigIndex. - SlaveInfoVector _slaveInfo; // (M) - - // Current ReplicaSet state. - MemberState _memberState; // (MX) - - // True if we are waiting for the applier to finish draining. - bool _isWaitingForDrainToComplete; // (M) - - // Used to signal threads waiting for changes to _rsConfigState. - boost::condition_variable _rsConfigStateChange; // (M) - - // Represents the configuration state of the coordinator, which controls how and when - // _rsConfig may change. See the state transition diagram in the type definition of - // ConfigState for details. - ConfigState _rsConfigState; // (M) - - // The current ReplicaSet configuration object, including the information about tag groups - // that is used to satisfy write concern requests with named gle modes. - ReplicaSetConfig _rsConfig; // (MX) - - // This member's index position in the current config. - int _selfIndex; // (MX) - - // Vector of events that should be signaled whenever new heartbeat data comes in. - std::vector<ReplicationExecutor::EventHandle> _stepDownWaiters; // (X) - - // State for conducting an election of this node. - // the presence of a non-null _freshnessChecker pointer indicates that an election is - // currently in progress. Only one election is allowed at once. 
- boost::scoped_ptr<FreshnessChecker> _freshnessChecker; // (X) - - boost::scoped_ptr<ElectCmdRunner> _electCmdRunner; // (X) + // Struct that holds information about clients waiting for replication. + struct WaiterInfo; + + // Struct that holds information about nodes in this replication group, mainly used for + // tracking replication progress for write concern satisfaction. + struct SlaveInfo { + OpTime opTime; // Our last known OpTime that this slave has replicated to. + HostAndPort hostAndPort; // Client address of the slave. + int memberId; // Id of the node in the replica set config, or -1 if we're not a replSet. + OID rid; // RID of the node. + bool self; // Whether this SlaveInfo stores the information about ourself + SlaveInfo() : memberId(-1), self(false) {} + }; - // Event that the election code will signal when the in-progress election completes. - // Unspecified value when _freshnessChecker is NULL. - ReplicationExecutor::EventHandle _electionFinishedEvent; // (X) + typedef std::vector<SlaveInfo> SlaveInfoVector; + + typedef std::vector<ReplicationExecutor::CallbackHandle> HeartbeatHandles; + + /** + * Looks up the SlaveInfo in _slaveInfo associated with the given RID and returns a pointer + * to it, or returns NULL if there is no SlaveInfo with the given RID. + */ + SlaveInfo* _findSlaveInfoByRID_inlock(const OID& rid); + + /** + * Looks up the SlaveInfo in _slaveInfo associated with the given member ID and returns a + * pointer to it, or returns NULL if there is no SlaveInfo with the given member ID. + */ + SlaveInfo* _findSlaveInfoByMemberID_inlock(int memberID); + + /** + * Adds the given SlaveInfo to _slaveInfo and wakes up any threads waiting for replication + * that now have their write concern satisfied. Only valid to call in master/slave setups. + */ + void _addSlaveInfo_inlock(const SlaveInfo& slaveInfo); + + /** + * Updates the item in _slaveInfo pointed to by 'slaveInfo' with the given OpTime 'ts' + * and wakes up any threads waiting for replication that now have their write concern + * satisfied. + */ + void _updateSlaveInfoOptime_inlock(SlaveInfo* slaveInfo, OpTime ts); + + /** + * Returns the index into _slaveInfo where data corresponding to ourself is stored. + * For more info on the rules about how we know where our entry is, see the comment for + * _slaveInfo. + */ + size_t _getMyIndexInSlaveInfo_inlock() const; + + /** + * Helper method that removes entries from _slaveInfo if they correspond to a node + * with a member ID that is not in the current replica set config. Will always leave an + * entry for ourself at the beginning of _slaveInfo, even if we aren't present in the + * config. + */ + void _updateSlaveInfoFromConfig_inlock(); + + /** + * Helper to update our saved config, cancel any pending heartbeats, and kick off sending + * new heartbeats based on the new config. Must *only* be called from within the + * ReplicationExecutor context. + * + * Returns an action to be performed after unlocking _mutex, via + * _performPostMemberStateUpdateAction. + */ + PostMemberStateUpdateAction _setCurrentRSConfig_inlock(const ReplicaSetConfig& newConfig, + int myIndex); + + /** + * Helper to wake waiters in _replicationWaiterList that are doneWaitingForReplication. + */ + void _wakeReadyWaiters_inlock(); + + /** + * Helper method for setting/unsetting maintenance mode. Scheduled by setMaintenanceMode() + * to run in a global write lock in the replication executor thread. 
+ */ + void _setMaintenanceMode_helper(const ReplicationExecutor::CallbackData& cbData, + bool activate, + Status* result); + + /** + * Helper method for retrieving maintenance mode. Scheduled by getMaintenanceMode() to run + * in the replication executor thread. + */ + void _getMaintenanceMode_helper(const ReplicationExecutor::CallbackData& cbData, + bool* maintenanceMode); + + /** + * Bottom half of fillIsMasterForReplSet. + */ + void _fillIsMasterForReplSet_finish(const ReplicationExecutor::CallbackData& cbData, + IsMasterResponse* result); + + /** + * Bottom half of processReplSetFresh. + */ + void _processReplSetFresh_finish(const ReplicationExecutor::CallbackData& cbData, + const ReplSetFreshArgs& args, + BSONObjBuilder* response, + Status* result); + + /** + * Bottom half of processReplSetElect. + */ + void _processReplSetElect_finish(const ReplicationExecutor::CallbackData& cbData, + const ReplSetElectArgs& args, + BSONObjBuilder* response, + Status* result); + + /** + * Bottom half of processReplSetFreeze. + */ + void _processReplSetFreeze_finish(const ReplicationExecutor::CallbackData& cbData, + int secs, + BSONObjBuilder* response, + Status* result); + /* + * Bottom half of clearSyncSourceBlacklist + */ + void _clearSyncSourceBlacklist_finish(const ReplicationExecutor::CallbackData& cbData); + + /** + * Scheduled to cause the ReplicationCoordinator to reconsider any state that might + * need to change as a result of time passing - for instance becoming PRIMARY when a single + * node replica set member's stepDown period ends. + */ + void _handleTimePassing(const ReplicationExecutor::CallbackData& cbData); + + /** + * Helper method for _awaitReplication that takes an already locked unique_lock and a + * Timer for timing the operation which has been counting since before the lock was + * acquired. + */ + ReplicationCoordinator::StatusAndDuration _awaitReplication_inlock( + const Timer* timer, + boost::unique_lock<boost::mutex>* lock, + const OperationContext* txn, + const OpTime& ts, + const WriteConcernOptions& writeConcern); + + /* + * Returns true if the given writeConcern is satisfied up to "optime" or is unsatisfiable. + */ + bool _doneWaitingForReplication_inlock(const OpTime& opTime, + const WriteConcernOptions& writeConcern); + + /** + * Helper for _doneWaitingForReplication_inlock that takes an integer write concern. + */ + bool _haveNumNodesReachedOpTime_inlock(const OpTime& opTime, int numNodes); + + /** + * Helper for _doneWaitingForReplication_inlock that takes a tag pattern representing a + * named write concern mode. + */ + bool _haveTaggedNodesReachedOpTime_inlock(const OpTime& opTime, + const ReplicaSetTagPattern& tagPattern); + + Status _checkIfWriteConcernCanBeSatisfied_inlock(const WriteConcernOptions& writeConcern) const; + + /** + * Triggers all callbacks that are blocked waiting for new heartbeat data + * to decide whether or not to finish a step down. + * Should only be called from executor callbacks. + */ + void _signalStepDownWaitersFromCallback(const ReplicationExecutor::CallbackData& cbData); + void _signalStepDownWaiters(); + + /** + * Helper for stepDown run within a ReplicationExecutor callback. This method assumes + * it is running within a global shared lock, and thus that no writes are going on at the + * same time. 
+ */ + void _stepDownContinue(const ReplicationExecutor::CallbackData& cbData, + const ReplicationExecutor::EventHandle finishedEvent, + OperationContext* txn, + Date_t waitUntil, + Date_t stepdownUntil, + bool force, + Status* result); + + OID _getMyRID_inlock() const; + + int _getMyId_inlock() const; + + OpTime _getMyLastOptime_inlock() const; + + + /** + * Bottom half of setFollowerMode. + * + * May reschedule itself after the current election, so it is not sufficient to + * wait for a callback scheduled to execute this method to complete. Instead, + * supply an event, "finishedSettingFollowerMode", and wait for that event to + * be signaled. Do not observe "*success" until after the event is signaled. + */ + void _setFollowerModeFinish(const ReplicationExecutor::CallbackData& cbData, + const MemberState& newState, + const ReplicationExecutor::EventHandle& finishedSettingFollowerMode, + bool* success); + + /** + * Helper method for updating our tracking of the last optime applied by a given node. + * This is only valid to call on replica sets. + */ + Status _setLastOptime_inlock(const UpdatePositionArgs::UpdateInfo& args); + + /** + * Helper method for setMyLastOptime that takes in a unique lock on + * _mutex. The passed in lock must already be locked. It is unspecified what state the + * lock will be in after this method finishes. + * + * This function has the same rules for "ts" as setMyLastOptime(), unless + * "isRollbackAllowed" is true. + */ + void _setMyLastOptime_inlock(boost::unique_lock<boost::mutex>* lock, + const OpTime& ts, + bool isRollbackAllowed); + + /** + * Schedules a heartbeat to be sent to "target" at "when". "targetIndex" is the index + * into the replica set config members array that corresponds to the "target", or -1 if + * "target" is not in _rsConfig. + */ + void _scheduleHeartbeatToTarget(const HostAndPort& target, int targetIndex, Date_t when); + + /** + * Processes each heartbeat response. + * + * Schedules additional heartbeats, triggers elections and step downs, etc. + */ + void _handleHeartbeatResponse(const ReplicationExecutor::RemoteCommandCallbackData& cbData, + int targetIndex); + + void _trackHeartbeatHandle(const StatusWith<ReplicationExecutor::CallbackHandle>& handle); + + void _untrackHeartbeatHandle(const ReplicationExecutor::CallbackHandle& handle); + + /** + * Helper for _handleHeartbeatResponse. + * + * Updates the optime associated with the member at "memberIndex" in our config. + */ + void _updateOpTimeFromHeartbeat_inlock(int memberIndex, OpTime optime); + + /** + * Starts a heartbeat for each member in the current config. Called within the executor + * context. + */ + void _startHeartbeats(); + + /** + * Cancels all heartbeats. Called within executor context. + */ + void _cancelHeartbeats(); + + /** + * Asynchronously sends a heartbeat to "target". "targetIndex" is the index + * into the replica set config members array that corresponds to the "target", or -1 if + * we don't have a valid replica set config. + * + * Scheduled by _scheduleHeartbeatToTarget. + */ + void _doMemberHeartbeat(ReplicationExecutor::CallbackData cbData, + const HostAndPort& target, + int targetIndex); + + + MemberState _getMemberState_inlock() const; + + /** + * Returns the current replication mode. This method requires the caller to be holding + * "_mutex" to be called safely. 
+ */
+ Mode _getReplicationMode_inlock() const;
+
+ /**
+ * Starts loading the replication configuration from local storage, and if it is valid,
+ * schedules a callback (of _finishLoadLocalConfig) to set it as the current replica set
+ * config (sets _rsConfig and _thisMembersConfigIndex).
+ * Returns true if it finishes loading the local config, which most likely means there
+ * was no local config at all or it was invalid in some way, and false if there was a valid
+ * config detected but more work is needed to set it as the local config (which will be
+ * handled by the callback to _finishLoadLocalConfig).
+ */
+ bool _startLoadLocalConfig(OperationContext* txn);
+
+ /**
+ * Callback that finishes the work started in _startLoadLocalConfig and sets _rsConfigState
+ * to kConfigSteady, so that we can begin processing heartbeats and reconfigs.
+ */
+ void _finishLoadLocalConfig(const ReplicationExecutor::CallbackData& cbData,
+ const ReplicaSetConfig& localConfig,
+ const StatusWith<OpTime>& lastOpTimeStatus);
+
+ /**
+ * Callback that finishes the work of processReplSetInitiate() inside the replication
+ * executor context, in the event of a successful quorum check.
+ */
+ void _finishReplSetInitiate(const ReplicationExecutor::CallbackData& cbData,
+ const ReplicaSetConfig& newConfig,
+ int myIndex);
+
+ /**
+ * Callback that finishes the work of processReplSetReconfig inside the replication
+ * executor context, in the event of a successful quorum check.
+ */
+ void _finishReplSetReconfig(const ReplicationExecutor::CallbackData& cbData,
+ const ReplicaSetConfig& newConfig,
+ int myIndex);
+
+ /**
+ * Changes _rsConfigState to newState, and notifies any waiters.
+ */
+ void _setConfigState_inlock(ConfigState newState);
+
+ /**
+ * Updates the cached value, _memberState, to match _topCoord's reported
+ * member state, from getMemberState().
+ *
+ * Returns an enum indicating what action to take after releasing _mutex, if any.
+ * Call _performPostMemberStateUpdateAction on the return value after releasing
+ * _mutex.
+ */
+ PostMemberStateUpdateAction _updateMemberStateFromTopologyCoordinator_inlock();
+
+ /**
+ * Performs a post member-state update action. Do not call while holding _mutex.
+ */
+ void _performPostMemberStateUpdateAction(PostMemberStateUpdateAction action);
+
+ /**
+ * Begins an attempt to elect this node.
+ * Called after an incoming heartbeat changes this node's view of the set such that it
+ * believes it can be elected PRIMARY.
+ * For proper concurrency, must be called via a ReplicationExecutor callback.
+ */
+ void _startElectSelf();
+
+ /**
+ * Callback called when the FreshnessChecker has completed; checks the results and
+ * decides whether to continue election proceedings.
+ * _electionFinishedEvent is signaled when the election completes.
+ */
+ void _onFreshnessCheckComplete();
+
+ /**
+ * Callback called when the ElectCmdRunner has completed; checks the results and
+ * decides whether to complete the election and change state to primary.
+ * _electionFinishedEvent is signaled when the election completes.
+ */
+ void _onElectCmdRunnerComplete();
+
+ /**
+ * Callback called after a random delay, to prevent repeated election ties.
+ */
+ void _recoverFromElectionTie(const ReplicationExecutor::CallbackData& cbData);
+
+ /**
+ * Chooses a new sync source. Must be scheduled as a callback.
+ *
+ * Calls into the Topology Coordinator, which uses its current view of the set to choose
+ * the most appropriate sync source.
+ */ + void _chooseNewSyncSource(const ReplicationExecutor::CallbackData& cbData, + const OpTime& lastOpTimeFetched, + HostAndPort* newSyncSource); + + /** + * Adds 'host' to the sync source blacklist until 'until'. A blacklisted source cannot + * be chosen as a sync source. Schedules a callback to unblacklist the sync source to be + * run at 'until'. + * + * Must be scheduled as a callback. + */ + void _blacklistSyncSource(const ReplicationExecutor::CallbackData& cbData, + const HostAndPort& host, + Date_t until); + + /** + * Removes 'host' from the sync source blacklist. If 'host' isn't found, it's simply + * ignored and no error is thrown. + * + * Must be scheduled as a callback. + */ + void _unblacklistSyncSource(const ReplicationExecutor::CallbackData& cbData, + const HostAndPort& host); + + /** + * Determines if a new sync source should be considered. + * + * Must be scheduled as a callback. + */ + void _shouldChangeSyncSource(const ReplicationExecutor::CallbackData& cbData, + const HostAndPort& currentSource, + bool* shouldChange); + + /** + * Schedules a request that the given host step down; logs any errors. + */ + void _requestRemotePrimaryStepdown(const HostAndPort& target); + + void _heartbeatStepDownStart(); + + /** + * Completes a step-down of the current node triggered by a heartbeat. Must + * be run with a global shared or global exclusive lock. + */ + void _heartbeatStepDownFinish(const ReplicationExecutor::CallbackData& cbData); + + /** + * Schedules a replica set config change. + */ + void _scheduleHeartbeatReconfig(const ReplicaSetConfig& newConfig); + + /** + * Callback that continues a heartbeat-initiated reconfig after a running election + * completes. + */ + void _heartbeatReconfigAfterElectionCanceled(const ReplicationExecutor::CallbackData& cbData, + const ReplicaSetConfig& newConfig); + + /** + * Method to write a configuration transmitted via heartbeat message to stable storage. + */ + void _heartbeatReconfigStore(const ReplicaSetConfig& newConfig); + + /** + * Conclusion actions of a heartbeat-triggered reconfiguration. + */ + void _heartbeatReconfigFinish(const ReplicationExecutor::CallbackData& cbData, + const ReplicaSetConfig& newConfig, + StatusWith<int> myIndex); + + /** + * Utility method that schedules or performs actions specified by a HeartbeatResponseAction + * returned by a TopologyCoordinator::processHeartbeatResponse call with the given + * value of "responseStatus". + */ + void _handleHeartbeatResponseAction(const HeartbeatResponseAction& action, + const StatusWith<ReplSetHeartbeatResponse>& responseStatus); + + /** + * Bottom half of processHeartbeat(), which runs in the replication executor. + */ + void _processHeartbeatFinish(const ReplicationExecutor::CallbackData& cbData, + const ReplSetHeartbeatArgs& args, + ReplSetHeartbeatResponse* response, + Status* outStatus); + + void _summarizeAsHtml_finish(const ReplicationExecutor::CallbackData& cbData, + ReplSetHtmlSummary* output); + + // + // All member variables are labeled with one of the following codes indicating the + // synchronization rules for accessing them. + // + // (R) Read-only in concurrent operation; no synchronization required. + // (S) Self-synchronizing; access in any way from any context. + // (PS) Pointer is read-only in concurrent operation, item pointed to is self-synchronizing; + // Access in any context. 
+ // (M) Reads and writes guarded by _mutex + // (X) Reads and writes must be performed in a callback in _replExecutor + // (MX) Must hold _mutex and be in a callback in _replExecutor to write; must either hold + // _mutex or be in a callback in _replExecutor to read. + // (GX) Readable under a global intent lock. Must either hold global lock in exclusive + // mode (MODE_X) or both hold global lock in shared mode (MODE_S) and be in executor + // context to write. + // (I) Independently synchronized, see member variable comment. + + // Protects member data of this ReplicationCoordinator. + mutable boost::mutex _mutex; // (S) + + // Handles to actively queued heartbeats. + HeartbeatHandles _heartbeatHandles; // (X) + + // When this node does not know itself to be a member of a config, it adds + // every host that sends it a heartbeat request to this set, and also starts + // sending heartbeat requests to that host. This set is cleared whenever + // a node discovers that it is a member of a config. + unordered_set<HostAndPort> _seedList; // (X) + + // Parsed command line arguments related to replication. + const ReplSettings _settings; // (R) + + // Mode of replication specified by _settings. + const Mode _replMode; // (R) + + // Pointer to the TopologyCoordinator owned by this ReplicationCoordinator. + boost::scoped_ptr<TopologyCoordinator> _topCoord; // (X) + + // Executor that drives the topology coordinator. + ReplicationExecutor _replExecutor; // (S) + + // Pointer to the ReplicationCoordinatorExternalState owned by this ReplicationCoordinator. + boost::scoped_ptr<ReplicationCoordinatorExternalState> _externalState; // (PS) + + // Thread that drives actions in the topology coordinator + // Set in startReplication() and thereafter accessed in shutdown. + boost::scoped_ptr<boost::thread> _topCoordDriverThread; // (I) + + // Thread that is used to write new configs received via a heartbeat reconfig + // to stable storage. It is an error to change this if _inShutdown is true. + boost::scoped_ptr<boost::thread> _heartbeatReconfigThread; // (M) + + // Our RID, used to identify us to our sync source when sending replication progress + // updates upstream. Set once in startReplication() and then never modified again. + OID _myRID; // (M) + + // Rollback ID. Used to check if a rollback happened during some interval of time + // TODO: ideally this should only change on rollbacks NOT on mongod restarts also. + int _rbid; // (M) + + // list of information about clients waiting on replication. Does *not* own the + // WaiterInfos. + std::vector<WaiterInfo*> _replicationWaiterList; // (M) + + // Set to true when we are in the process of shutting down replication. + bool _inShutdown; // (M) + + // Election ID of the last election that resulted in this node becoming primary. + OID _electionId; // (M) + + // Vector containing known information about each member (such as replication + // progress and member ID) in our replica set or each member replicating from + // us in a master-slave deployment. In master/slave, the first entry is + // guaranteed to correspond to ourself. In replica sets where we don't have a + // valid config or are in state REMOVED then the vector will be a single element + // just with info about ourself. In replica sets with a valid config the elements + // will be in the same order as the members in the replica set config, thus + // the entry for ourself will be at _thisMemberConfigIndex. + SlaveInfoVector _slaveInfo; // (M) + + // Current ReplicaSet state. 
+ MemberState _memberState; // (MX) + + // True if we are waiting for the applier to finish draining. + bool _isWaitingForDrainToComplete; // (M) + + // Used to signal threads waiting for changes to _rsConfigState. + boost::condition_variable _rsConfigStateChange; // (M) + + // Represents the configuration state of the coordinator, which controls how and when + // _rsConfig may change. See the state transition diagram in the type definition of + // ConfigState for details. + ConfigState _rsConfigState; // (M) + + // The current ReplicaSet configuration object, including the information about tag groups + // that is used to satisfy write concern requests with named gle modes. + ReplicaSetConfig _rsConfig; // (MX) + + // This member's index position in the current config. + int _selfIndex; // (MX) + + // Vector of events that should be signaled whenever new heartbeat data comes in. + std::vector<ReplicationExecutor::EventHandle> _stepDownWaiters; // (X) + + // State for conducting an election of this node. + // the presence of a non-null _freshnessChecker pointer indicates that an election is + // currently in progress. Only one election is allowed at once. + boost::scoped_ptr<FreshnessChecker> _freshnessChecker; // (X) + + boost::scoped_ptr<ElectCmdRunner> _electCmdRunner; // (X) - // Whether we slept last time we attempted an election but possibly tied with other nodes. - bool _sleptLastElection; // (X) + // Event that the election code will signal when the in-progress election completes. + // Unspecified value when _freshnessChecker is NULL. + ReplicationExecutor::EventHandle _electionFinishedEvent; // (X) - // Flag that indicates whether writes to databases other than "local" are allowed. Used to - // answer the canAcceptWritesForDatabase() question. Always true for standalone nodes and - // masters in master-slave relationships. - bool _canAcceptNonLocalWrites; // (GX) + // Whether we slept last time we attempted an election but possibly tied with other nodes. + bool _sleptLastElection; // (X) - // Flag that indicates whether reads from databases other than "local" are allowed. Unlike - // _canAcceptNonLocalWrites, above, this question is about admission control on secondaries, - // and we do not require that its observers be strongly synchronized. Accidentally - // providing the prior value for a limited period of time is acceptable. Also unlike - // _canAcceptNonLocalWrites, its value is only meaningful on replica set secondaries. - AtomicUInt32 _canServeNonLocalReads; // (S) - }; + // Flag that indicates whether writes to databases other than "local" are allowed. Used to + // answer the canAcceptWritesForDatabase() question. Always true for standalone nodes and + // masters in master-slave relationships. + bool _canAcceptNonLocalWrites; // (GX) + + // Flag that indicates whether reads from databases other than "local" are allowed. Unlike + // _canAcceptNonLocalWrites, above, this question is about admission control on secondaries, + // and we do not require that its observers be strongly synchronized. Accidentally + // providing the prior value for a limited period of time is acceptable. Also unlike + // _canAcceptNonLocalWrites, its value is only meaningful on replica set secondaries. 
+ AtomicUInt32 _canServeNonLocalReads; // (S) +}; -} // namespace repl -} // namespace mongo +} // namespace repl +} // namespace mongo diff --git a/src/mongo/db/repl/replication_coordinator_impl_elect.cpp b/src/mongo/db/repl/replication_coordinator_impl_elect.cpp index aa4618210b5..cb34f89c2d7 100644 --- a/src/mongo/db/repl/replication_coordinator_impl_elect.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl_elect.cpp @@ -42,54 +42,55 @@ namespace mongo { namespace repl { namespace { - class LoseElectionGuard { - MONGO_DISALLOW_COPYING(LoseElectionGuard); - public: - LoseElectionGuard( - TopologyCoordinator* topCoord, - ReplicationExecutor* executor, - boost::scoped_ptr<FreshnessChecker>* freshnessChecker, - boost::scoped_ptr<ElectCmdRunner>* electCmdRunner, - ReplicationExecutor::EventHandle* electionFinishedEvent) - : _topCoord(topCoord), - _executor(executor), - _freshnessChecker(freshnessChecker), - _electCmdRunner(electCmdRunner), - _electionFinishedEvent(electionFinishedEvent), - _dismissed(false) { +class LoseElectionGuard { + MONGO_DISALLOW_COPYING(LoseElectionGuard); + +public: + LoseElectionGuard(TopologyCoordinator* topCoord, + ReplicationExecutor* executor, + boost::scoped_ptr<FreshnessChecker>* freshnessChecker, + boost::scoped_ptr<ElectCmdRunner>* electCmdRunner, + ReplicationExecutor::EventHandle* electionFinishedEvent) + : _topCoord(topCoord), + _executor(executor), + _freshnessChecker(freshnessChecker), + _electCmdRunner(electCmdRunner), + _electionFinishedEvent(electionFinishedEvent), + _dismissed(false) {} + + ~LoseElectionGuard() { + if (_dismissed) { + return; } - - ~LoseElectionGuard() { - if (_dismissed) { - return; - } - _topCoord->processLoseElection(); - _freshnessChecker->reset(NULL); - _electCmdRunner->reset(NULL); - if (_electionFinishedEvent->isValid()) { - _executor->signalEvent(*_electionFinishedEvent); - } + _topCoord->processLoseElection(); + _freshnessChecker->reset(NULL); + _electCmdRunner->reset(NULL); + if (_electionFinishedEvent->isValid()) { + _executor->signalEvent(*_electionFinishedEvent); } + } - void dismiss() { _dismissed = true; } + void dismiss() { + _dismissed = true; + } - private: - TopologyCoordinator* const _topCoord; - ReplicationExecutor* const _executor; - boost::scoped_ptr<FreshnessChecker>* const _freshnessChecker; - boost::scoped_ptr<ElectCmdRunner>* const _electCmdRunner; - const ReplicationExecutor::EventHandle* _electionFinishedEvent; - bool _dismissed; - }; +private: + TopologyCoordinator* const _topCoord; + ReplicationExecutor* const _executor; + boost::scoped_ptr<FreshnessChecker>* const _freshnessChecker; + boost::scoped_ptr<ElectCmdRunner>* const _electCmdRunner; + const ReplicationExecutor::EventHandle* _electionFinishedEvent; + bool _dismissed; +}; } // namespace - void ReplicationCoordinatorImpl::_startElectSelf() { - invariant(!_freshnessChecker); - invariant(!_electCmdRunner); +void ReplicationCoordinatorImpl::_startElectSelf() { + invariant(!_freshnessChecker); + invariant(!_electCmdRunner); - boost::unique_lock<boost::mutex> lk(_mutex); - switch (_rsConfigState) { + boost::unique_lock<boost::mutex> lk(_mutex); + switch (_rsConfigState) { case kConfigSteady: break; case kConfigInitiating: @@ -100,183 +101,183 @@ namespace { _topCoord->processLoseElection(); return; default: - severe() << "Entered replica set election code while in illegal config state " << - int(_rsConfigState); + severe() << "Entered replica set election code while in illegal config state " + << int(_rsConfigState); fassertFailed(18913); - } + 
} - log() << "Standing for election"; - const StatusWith<ReplicationExecutor::EventHandle> finishEvh = _replExecutor.makeEvent(); - if (finishEvh.getStatus() == ErrorCodes::ShutdownInProgress) { - return; - } - fassert(18680, finishEvh.getStatus()); - _electionFinishedEvent = finishEvh.getValue(); - LoseElectionGuard lossGuard(_topCoord.get(), - &_replExecutor, - &_freshnessChecker, - &_electCmdRunner, - &_electionFinishedEvent); + log() << "Standing for election"; + const StatusWith<ReplicationExecutor::EventHandle> finishEvh = _replExecutor.makeEvent(); + if (finishEvh.getStatus() == ErrorCodes::ShutdownInProgress) { + return; + } + fassert(18680, finishEvh.getStatus()); + _electionFinishedEvent = finishEvh.getValue(); + LoseElectionGuard lossGuard(_topCoord.get(), + &_replExecutor, + &_freshnessChecker, + &_electCmdRunner, + &_electionFinishedEvent); + + + invariant(_rsConfig.getMemberAt(_selfIndex).isElectable()); + OpTime lastOpTimeApplied(_getMyLastOptime_inlock()); + + if (lastOpTimeApplied == OpTime()) { + log() << "replSet info not trying to elect self, " + "do not yet have a complete set of data from any point in time"; + return; + } + _freshnessChecker.reset(new FreshnessChecker); + + // This is necessary because the freshnessChecker may call directly into winning an + // election, if there are no other MaybeUp nodes. Winning an election attempts to lock + // _mutex again. + lk.unlock(); + + StatusWith<ReplicationExecutor::EventHandle> nextPhaseEvh = _freshnessChecker->start( + &_replExecutor, + lastOpTimeApplied, + _rsConfig, + _selfIndex, + _topCoord->getMaybeUpHostAndPorts(), + stdx::bind(&ReplicationCoordinatorImpl::_onFreshnessCheckComplete, this)); + if (nextPhaseEvh.getStatus() == ErrorCodes::ShutdownInProgress) { + return; + } + fassert(18681, nextPhaseEvh.getStatus()); + lossGuard.dismiss(); +} + +void ReplicationCoordinatorImpl::_onFreshnessCheckComplete() { + invariant(_freshnessChecker); + invariant(!_electCmdRunner); + LoseElectionGuard lossGuard(_topCoord.get(), + &_replExecutor, + &_freshnessChecker, + &_electCmdRunner, + &_electionFinishedEvent); + + if (_freshnessChecker->isCanceled()) { + LOG(2) << "Election canceled during freshness check phase"; + return; + } - invariant(_rsConfig.getMemberAt(_selfIndex).isElectable()); - OpTime lastOpTimeApplied(_getMyLastOptime_inlock()); + const Date_t now(_replExecutor.now()); + const FreshnessChecker::ElectionAbortReason abortReason = + _freshnessChecker->shouldAbortElection(); - if (lastOpTimeApplied == OpTime()) { - log() << "replSet info not trying to elect self, " - "do not yet have a complete set of data from any point in time"; + // need to not sleep after last time sleeping, + switch (abortReason) { + case FreshnessChecker::None: + break; + case FreshnessChecker::FreshnessTie: + if ((_selfIndex != 0) && !_sleptLastElection) { + const long long ms = _replExecutor.nextRandomInt64(1000) + 50; + const Date_t nextCandidateTime = now + ms; + log() << "replSet possible election tie; sleeping " << ms << "ms until " + << dateToISOStringLocal(nextCandidateTime); + _topCoord->setElectionSleepUntil(nextCandidateTime); + _replExecutor.scheduleWorkAt( + nextCandidateTime, + stdx::bind(&ReplicationCoordinatorImpl::_recoverFromElectionTie, + this, + stdx::placeholders::_1)); + _sleptLastElection = true; + return; + } + _sleptLastElection = false; + break; + case FreshnessChecker::FresherNodeFound: + log() << "not electing self, we are not freshest"; return; - } - - _freshnessChecker.reset(new FreshnessChecker); - - // This is 
necessary because the freshnessChecker may call directly into winning an - // election, if there are no other MaybeUp nodes. Winning an election attempts to lock - // _mutex again. - lk.unlock(); - - StatusWith<ReplicationExecutor::EventHandle> nextPhaseEvh = _freshnessChecker->start( - &_replExecutor, - lastOpTimeApplied, - _rsConfig, - _selfIndex, - _topCoord->getMaybeUpHostAndPorts(), - stdx::bind(&ReplicationCoordinatorImpl::_onFreshnessCheckComplete, this)); - if (nextPhaseEvh.getStatus() == ErrorCodes::ShutdownInProgress) { + case FreshnessChecker::QuorumUnreachable: + log() << "not electing self, we could not contact enough voting members"; + return; + default: + log() << "not electing self due to election abort message :" + << static_cast<int>(abortReason); return; - } - fassert(18681, nextPhaseEvh.getStatus()); - lossGuard.dismiss(); } - void ReplicationCoordinatorImpl::_onFreshnessCheckComplete() { - invariant(_freshnessChecker); - invariant(!_electCmdRunner); - LoseElectionGuard lossGuard(_topCoord.get(), - &_replExecutor, - &_freshnessChecker, - &_electCmdRunner, - &_electionFinishedEvent); - - if (_freshnessChecker->isCanceled()) { - LOG(2) << "Election canceled during freshness check phase"; - return; - } + log() << "replSet info electSelf"; + // Secure our vote for ourself first + if (!_topCoord->voteForMyself(now)) { + return; + } - const Date_t now(_replExecutor.now()); - const FreshnessChecker::ElectionAbortReason abortReason = - _freshnessChecker->shouldAbortElection(); - - // need to not sleep after last time sleeping, - switch (abortReason) { - case FreshnessChecker::None: - break; - case FreshnessChecker::FreshnessTie: - if ((_selfIndex != 0) && !_sleptLastElection) { - const long long ms = _replExecutor.nextRandomInt64(1000) + 50; - const Date_t nextCandidateTime = now + ms; - log() << "replSet possible election tie; sleeping " << ms << "ms until " << - dateToISOStringLocal(nextCandidateTime); - _topCoord->setElectionSleepUntil(nextCandidateTime); - _replExecutor.scheduleWorkAt( - nextCandidateTime, - stdx::bind(&ReplicationCoordinatorImpl::_recoverFromElectionTie, - this, - stdx::placeholders::_1)); - _sleptLastElection = true; - return; - } - _sleptLastElection = false; - break; - case FreshnessChecker::FresherNodeFound: - log() << "not electing self, we are not freshest"; - return; - case FreshnessChecker::QuorumUnreachable: - log() << "not electing self, we could not contact enough voting members"; - return; - default: - log() << "not electing self due to election abort message :" - << static_cast<int>(abortReason); - return; - } + _electCmdRunner.reset(new ElectCmdRunner); + StatusWith<ReplicationExecutor::EventHandle> nextPhaseEvh = _electCmdRunner->start( + &_replExecutor, + _rsConfig, + _selfIndex, + _topCoord->getMaybeUpHostAndPorts(), + stdx::bind(&ReplicationCoordinatorImpl::_onElectCmdRunnerComplete, this)); + if (nextPhaseEvh.getStatus() == ErrorCodes::ShutdownInProgress) { + return; + } + fassert(18685, nextPhaseEvh.getStatus()); + lossGuard.dismiss(); +} + +void ReplicationCoordinatorImpl::_onElectCmdRunnerComplete() { + LoseElectionGuard lossGuard(_topCoord.get(), + &_replExecutor, + &_freshnessChecker, + &_electCmdRunner, + &_electionFinishedEvent); + + invariant(_freshnessChecker); + invariant(_electCmdRunner); + if (_electCmdRunner->isCanceled()) { + LOG(2) << "Election canceled during elect self phase"; + return; + } - log() << "replSet info electSelf"; - // Secure our vote for ourself first - if (!_topCoord->voteForMyself(now)) { - return; - } + 
const int receivedVotes = _electCmdRunner->getReceivedVotes(); - _electCmdRunner.reset(new ElectCmdRunner); - StatusWith<ReplicationExecutor::EventHandle> nextPhaseEvh = _electCmdRunner->start( - &_replExecutor, - _rsConfig, - _selfIndex, - _topCoord->getMaybeUpHostAndPorts(), - stdx::bind(&ReplicationCoordinatorImpl::_onElectCmdRunnerComplete, this)); - if (nextPhaseEvh.getStatus() == ErrorCodes::ShutdownInProgress) { - return; - } - fassert(18685, nextPhaseEvh.getStatus()); - lossGuard.dismiss(); + if (receivedVotes < _rsConfig.getMajorityVoteCount()) { + log() << "replSet couldn't elect self, only received " << receivedVotes + << " votes, but needed at least " << _rsConfig.getMajorityVoteCount(); + // Suppress ourselves from standing for election again, giving other nodes a chance + // to win their elections. + const long long ms = _replExecutor.nextRandomInt64(1000) + 50; + const Date_t now(_replExecutor.now()); + const Date_t nextCandidateTime = now + ms; + log() << "waiting until " << nextCandidateTime << " before standing for election again"; + _topCoord->setElectionSleepUntil(nextCandidateTime); + _replExecutor.scheduleWorkAt( + nextCandidateTime, + stdx::bind(&ReplicationCoordinatorImpl::_recoverFromElectionTie, + this, + stdx::placeholders::_1)); + return; } - void ReplicationCoordinatorImpl::_onElectCmdRunnerComplete() { - LoseElectionGuard lossGuard(_topCoord.get(), - &_replExecutor, - &_freshnessChecker, - &_electCmdRunner, - &_electionFinishedEvent); - - invariant(_freshnessChecker); - invariant(_electCmdRunner); - if (_electCmdRunner->isCanceled()) { - LOG(2) << "Election canceled during elect self phase"; - return; - } + if (_rsConfig.getConfigVersion() != _freshnessChecker->getOriginalConfigVersion()) { + log() << "replSet config version changed during our election, ignoring result"; + return; + } - const int receivedVotes = _electCmdRunner->getReceivedVotes(); - - if (receivedVotes < _rsConfig.getMajorityVoteCount()) { - log() << "replSet couldn't elect self, only received " << receivedVotes << - " votes, but needed at least " << _rsConfig.getMajorityVoteCount(); - // Suppress ourselves from standing for election again, giving other nodes a chance - // to win their elections. 
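Both failure paths in this hunk, the freshness tie above and the insufficient-votes case here, back off for a random 50 to 1049 ms (nextRandomInt64(1000) + 50) before standing for election again, so that two colliding candidates are unlikely to tie a second time. A minimal standalone sketch of the same jittered-backoff idea, using only the standard library rather than the executor's random source:

    #include <chrono>
    #include <random>

    // Sketch: jittered election backoff. A failed or tied candidate waits a
    // random 50-1049 ms before standing again, decorrelating retries so two
    // nodes that collided once rarely collide on the next attempt.
    std::chrono::milliseconds electionBackoff(std::mt19937_64& rng) {
        std::uniform_int_distribution<long long> jitter(0, 999);
        return std::chrono::milliseconds(jitter(rng) + 50);
    }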
- const long long ms = _replExecutor.nextRandomInt64(1000) + 50; - const Date_t now(_replExecutor.now()); - const Date_t nextCandidateTime = now + ms; - log() << "waiting until " << nextCandidateTime << " before standing for election again"; - _topCoord->setElectionSleepUntil(nextCandidateTime); - _replExecutor.scheduleWorkAt( - nextCandidateTime, - stdx::bind(&ReplicationCoordinatorImpl::_recoverFromElectionTie, - this, - stdx::placeholders::_1)); - return; - } + log() << "replSet election succeeded, assuming primary role"; - if (_rsConfig.getConfigVersion() != _freshnessChecker->getOriginalConfigVersion()) { - log() << "replSet config version changed during our election, ignoring result"; - return; - } - - log() << "replSet election succeeded, assuming primary role"; + lossGuard.dismiss(); + _freshnessChecker.reset(NULL); + _electCmdRunner.reset(NULL); + _performPostMemberStateUpdateAction(kActionWinElection); + _replExecutor.signalEvent(_electionFinishedEvent); +} - lossGuard.dismiss(); - _freshnessChecker.reset(NULL); - _electCmdRunner.reset(NULL); - _performPostMemberStateUpdateAction(kActionWinElection); - _replExecutor.signalEvent(_electionFinishedEvent); +void ReplicationCoordinatorImpl::_recoverFromElectionTie( + const ReplicationExecutor::CallbackData& cbData) { + if (!cbData.status.isOK()) { + return; } - - void ReplicationCoordinatorImpl::_recoverFromElectionTie( - const ReplicationExecutor::CallbackData& cbData) { - if (!cbData.status.isOK()) { - return; - } - if (_topCoord->checkShouldStandForElection(_replExecutor.now(), getMyLastOptime())) { - _startElectSelf(); - } + if (_topCoord->checkShouldStandForElection(_replExecutor.now(), getMyLastOptime())) { + _startElectSelf(); } +} } // namespace repl } // namespace mongo diff --git a/src/mongo/db/repl/replication_coordinator_impl_elect_test.cpp b/src/mongo/db/repl/replication_coordinator_impl_elect_test.cpp index ac36d2802ec..7f005cdb0e3 100644 --- a/src/mongo/db/repl/replication_coordinator_impl_elect_test.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl_elect_test.cpp @@ -48,364 +48,369 @@ namespace mongo { namespace repl { namespace { - typedef ReplicationExecutor::RemoteCommandRequest RemoteCommandRequest; - - class ReplCoordElectTest : public ReplCoordTest { - protected: - void simulateEnoughHeartbeatsForElectability(); - void simulateFreshEnoughForElectability(); - }; - - void ReplCoordElectTest::simulateEnoughHeartbeatsForElectability() { - ReplicationCoordinatorImpl* replCoord = getReplCoord(); - ReplicaSetConfig rsConfig = replCoord->getReplicaSetConfig_forTest(); - NetworkInterfaceMock* net = getNet(); - net->enterNetwork(); - for (int i = 0; i < rsConfig.getNumMembers() - 1; ++i) { - const NetworkInterfaceMock::NetworkOperationIterator noi = net->getNextReadyRequest(); - const ReplicationExecutor::RemoteCommandRequest& request = noi->getRequest(); - log() << request.target.toString() << " processing " << request.cmdObj; - ReplSetHeartbeatArgs hbArgs; - if (hbArgs.initialize(request.cmdObj).isOK()) { - ReplSetHeartbeatResponse hbResp; - hbResp.setSetName(rsConfig.getReplSetName()); - hbResp.setState(MemberState::RS_SECONDARY); - hbResp.setVersion(rsConfig.getConfigVersion()); - BSONObjBuilder respObj; - respObj << "ok" << 1; - hbResp.addToBSON(&respObj); - net->scheduleResponse(noi, net->now(), makeResponseStatus(respObj.obj())); - } - else { - error() << "Black holing unexpected request to " << request.target << ": " << - request.cmdObj; - net->blackHole(noi); - } - net->runReadyNetworkOperations(); +typedef 
ReplicationExecutor::RemoteCommandRequest RemoteCommandRequest; + +class ReplCoordElectTest : public ReplCoordTest { +protected: + void simulateEnoughHeartbeatsForElectability(); + void simulateFreshEnoughForElectability(); +}; + +void ReplCoordElectTest::simulateEnoughHeartbeatsForElectability() { + ReplicationCoordinatorImpl* replCoord = getReplCoord(); + ReplicaSetConfig rsConfig = replCoord->getReplicaSetConfig_forTest(); + NetworkInterfaceMock* net = getNet(); + net->enterNetwork(); + for (int i = 0; i < rsConfig.getNumMembers() - 1; ++i) { + const NetworkInterfaceMock::NetworkOperationIterator noi = net->getNextReadyRequest(); + const ReplicationExecutor::RemoteCommandRequest& request = noi->getRequest(); + log() << request.target.toString() << " processing " << request.cmdObj; + ReplSetHeartbeatArgs hbArgs; + if (hbArgs.initialize(request.cmdObj).isOK()) { + ReplSetHeartbeatResponse hbResp; + hbResp.setSetName(rsConfig.getReplSetName()); + hbResp.setState(MemberState::RS_SECONDARY); + hbResp.setVersion(rsConfig.getConfigVersion()); + BSONObjBuilder respObj; + respObj << "ok" << 1; + hbResp.addToBSON(&respObj); + net->scheduleResponse(noi, net->now(), makeResponseStatus(respObj.obj())); + } else { + error() << "Black holing unexpected request to " << request.target << ": " + << request.cmdObj; + net->blackHole(noi); } - net->exitNetwork(); + net->runReadyNetworkOperations(); } + net->exitNetwork(); +} - void ReplCoordElectTest::simulateFreshEnoughForElectability() { - ReplicationCoordinatorImpl* replCoord = getReplCoord(); - ReplicaSetConfig rsConfig = replCoord->getReplicaSetConfig_forTest(); - NetworkInterfaceMock* net = getNet(); - net->enterNetwork(); - for (int i = 0; i < rsConfig.getNumMembers() - 1; ++i) { - const NetworkInterfaceMock::NetworkOperationIterator noi = net->getNextReadyRequest(); - const ReplicationExecutor::RemoteCommandRequest& request = noi->getRequest(); - log() << request.target.toString() << " processing " << request.cmdObj; - if (request.cmdObj.firstElement().fieldNameStringData() == "replSetFresh") { - net->scheduleResponse(noi, net->now(), makeResponseStatus( - BSON("ok" << 1 << - "fresher" << false << - "opTime" << Date_t(OpTime(0, 0).asDate()) << - "veto" << false))); - } - else { - error() << "Black holing unexpected request to " << request.target << ": " << - request.cmdObj; - net->blackHole(noi); - } - net->runReadyNetworkOperations(); +void ReplCoordElectTest::simulateFreshEnoughForElectability() { + ReplicationCoordinatorImpl* replCoord = getReplCoord(); + ReplicaSetConfig rsConfig = replCoord->getReplicaSetConfig_forTest(); + NetworkInterfaceMock* net = getNet(); + net->enterNetwork(); + for (int i = 0; i < rsConfig.getNumMembers() - 1; ++i) { + const NetworkInterfaceMock::NetworkOperationIterator noi = net->getNextReadyRequest(); + const ReplicationExecutor::RemoteCommandRequest& request = noi->getRequest(); + log() << request.target.toString() << " processing " << request.cmdObj; + if (request.cmdObj.firstElement().fieldNameStringData() == "replSetFresh") { + net->scheduleResponse( + noi, + net->now(), + makeResponseStatus(BSON("ok" << 1 << "fresher" << false << "opTime" + << Date_t(OpTime(0, 0).asDate()) << "veto" << false))); + } else { + error() << "Black holing unexpected request to " << request.target << ": " + << request.cmdObj; + net->blackHole(noi); } - net->exitNetwork(); + net->runReadyNetworkOperations(); } + net->exitNetwork(); +} - TEST_F(ReplCoordElectTest, ElectTooSoon) { - 
logger::globalLogDomain()->setMinimumLoggedSeverity(logger::LogSeverity::Debug(3)); - // Election never starts because we haven't set a lastOpTimeApplied value yet, via a - // heartbeat. - startCapturingLogMessages(); - assertStartSuccess( - BSON("_id" << "mySet" << - "version" << 1 << - "members" << BSON_ARRAY(BSON("_id" << 1 << "host" << "node1:12345") << - BSON("_id" << 2 << "host" << "node2:12345"))), - HostAndPort("node1", 12345)); - ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); - simulateEnoughHeartbeatsForElectability(); - stopCapturingLogMessages(); - ASSERT_EQUALS(1, countLogLinesContaining("node has no applied oplog entries")); - } +TEST_F(ReplCoordElectTest, ElectTooSoon) { + logger::globalLogDomain()->setMinimumLoggedSeverity(logger::LogSeverity::Debug(3)); + // Election never starts because we haven't set a lastOpTimeApplied value yet, via a + // heartbeat. + startCapturingLogMessages(); + assertStartSuccess(BSON("_id" + << "mySet" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "node1:12345") + << BSON("_id" << 2 << "host" + << "node2:12345"))), + HostAndPort("node1", 12345)); + ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); + simulateEnoughHeartbeatsForElectability(); + stopCapturingLogMessages(); + ASSERT_EQUALS(1, countLogLinesContaining("node has no applied oplog entries")); +} - /** - * This test checks that an election can happen when only one node is up, and it has the - * vote(s) to win. - */ - TEST_F(ReplCoordElectTest, ElectTwoNodesWithOneZeroVoter) { - OperationContextReplMock txn; - assertStartSuccess( - BSON("_id" << "mySet" << - "version" << 1 << - "members" << BSON_ARRAY(BSON("_id" << 1 << "host" << "node1:12345") << - BSON("_id" << 2 << "host" << "node2:12345" << - "votes" << 0 << "hidden" << true << - "priority" << 0))), - HostAndPort("node1", 12345)); - - getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY); - - ASSERT(getReplCoord()->getMemberState().secondary()) << - getReplCoord()->getMemberState().toString(); - - getReplCoord()->setMyLastOptime(OpTime(10,0)); - - NetworkInterfaceMock* net = getNet(); - net->enterNetwork(); - const NetworkInterfaceMock::NetworkOperationIterator noi = net->getNextReadyRequest(); - net->scheduleResponse(noi, - net->now(), - ResponseStatus(ErrorCodes::OperationFailed, "timeout")); - net->runReadyNetworkOperations(); - const NetworkInterfaceMock::NetworkOperationIterator noi2 = net->getNextReadyRequest(); - net->scheduleResponse(noi2, - net->now(), - ResponseStatus(ErrorCodes::OperationFailed, "timeout")); - net->runReadyNetworkOperations(); - net->exitNetwork(); - - ASSERT(getReplCoord()->getMemberState().primary()) << - getReplCoord()->getMemberState().toString(); - ASSERT(getReplCoord()->isWaitingForApplierToDrain()); - - // Since we're still in drain mode, expect that we report ismaster: false, issecondary:true. - IsMasterResponse imResponse; - getReplCoord()->fillIsMasterForReplSet(&imResponse); - ASSERT_FALSE(imResponse.isMaster()) << imResponse.toBSON().toString(); - ASSERT_TRUE(imResponse.isSecondary()) << imResponse.toBSON().toString(); - getReplCoord()->signalDrainComplete(&txn); - getReplCoord()->fillIsMasterForReplSet(&imResponse); - ASSERT_TRUE(imResponse.isMaster()) << imResponse.toBSON().toString(); - ASSERT_FALSE(imResponse.isSecondary()) << imResponse.toBSON().toString(); - } +/** + * This test checks that an election can happen when only one node is up, and it has the + * vote(s) to win. 
+ */ +TEST_F(ReplCoordElectTest, ElectTwoNodesWithOneZeroVoter) { + OperationContextReplMock txn; + assertStartSuccess( + BSON("_id" + << "mySet" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "node1:12345") + << BSON("_id" << 2 << "host" + << "node2:12345" + << "votes" << 0 << "hidden" << true << "priority" << 0))), + HostAndPort("node1", 12345)); + + getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY); + + ASSERT(getReplCoord()->getMemberState().secondary()) + << getReplCoord()->getMemberState().toString(); + + getReplCoord()->setMyLastOptime(OpTime(10, 0)); + + NetworkInterfaceMock* net = getNet(); + net->enterNetwork(); + const NetworkInterfaceMock::NetworkOperationIterator noi = net->getNextReadyRequest(); + net->scheduleResponse(noi, net->now(), ResponseStatus(ErrorCodes::OperationFailed, "timeout")); + net->runReadyNetworkOperations(); + const NetworkInterfaceMock::NetworkOperationIterator noi2 = net->getNextReadyRequest(); + net->scheduleResponse(noi2, net->now(), ResponseStatus(ErrorCodes::OperationFailed, "timeout")); + net->runReadyNetworkOperations(); + net->exitNetwork(); + + ASSERT(getReplCoord()->getMemberState().primary()) + << getReplCoord()->getMemberState().toString(); + ASSERT(getReplCoord()->isWaitingForApplierToDrain()); + + // Since we're still in drain mode, expect that we report ismaster: false, issecondary:true. + IsMasterResponse imResponse; + getReplCoord()->fillIsMasterForReplSet(&imResponse); + ASSERT_FALSE(imResponse.isMaster()) << imResponse.toBSON().toString(); + ASSERT_TRUE(imResponse.isSecondary()) << imResponse.toBSON().toString(); + getReplCoord()->signalDrainComplete(&txn); + getReplCoord()->fillIsMasterForReplSet(&imResponse); + ASSERT_TRUE(imResponse.isMaster()) << imResponse.toBSON().toString(); + ASSERT_FALSE(imResponse.isSecondary()) << imResponse.toBSON().toString(); +} - TEST_F(ReplCoordElectTest, Elect1NodeSuccess) { - OperationContextReplMock txn; - startCapturingLogMessages(); - assertStartSuccess( - BSON("_id" << "mySet" << - "version" << 1 << - "members" << BSON_ARRAY(BSON("_id" << 1 << "host" << "node1:12345"))), - HostAndPort("node1", 12345)); - - getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY); - - ASSERT(getReplCoord()->getMemberState().primary()) << - getReplCoord()->getMemberState().toString(); - ASSERT(getReplCoord()->isWaitingForApplierToDrain()); - - // Since we're still in drain mode, expect that we report ismaster: false, issecondary:true. 
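Much of the churn in these test hunks is clang-format re-wrapping the streaming BSON() and BSON_ARRAY() macro expressions used to build replica-set configs. The shape being constructed is just a config document; a sketch with the same illustrative hosts the tests use:

    // Sketch: the config document the tests assemble with the BSON macros;
    // operator<< chains alternating keys and values into a single BSONObj.
    BSONObj configObj = BSON("_id" << "mySet"
                                   << "version" << 1
                                   << "members"
                                   << BSON_ARRAY(BSON("_id" << 1 << "host" << "node1:12345")
                                                 << BSON("_id" << 2 << "host" << "node2:12345")));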
- IsMasterResponse imResponse; - getReplCoord()->fillIsMasterForReplSet(&imResponse); - ASSERT_FALSE(imResponse.isMaster()) << imResponse.toBSON().toString(); - ASSERT_TRUE(imResponse.isSecondary()) << imResponse.toBSON().toString(); - getReplCoord()->signalDrainComplete(&txn); - getReplCoord()->fillIsMasterForReplSet(&imResponse); - ASSERT_TRUE(imResponse.isMaster()) << imResponse.toBSON().toString(); - ASSERT_FALSE(imResponse.isSecondary()) << imResponse.toBSON().toString(); - } +TEST_F(ReplCoordElectTest, Elect1NodeSuccess) { + OperationContextReplMock txn; + startCapturingLogMessages(); + assertStartSuccess(BSON("_id" + << "mySet" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "node1:12345"))), + HostAndPort("node1", 12345)); + + getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY); + + ASSERT(getReplCoord()->getMemberState().primary()) + << getReplCoord()->getMemberState().toString(); + ASSERT(getReplCoord()->isWaitingForApplierToDrain()); + + // Since we're still in drain mode, expect that we report ismaster: false, issecondary:true. + IsMasterResponse imResponse; + getReplCoord()->fillIsMasterForReplSet(&imResponse); + ASSERT_FALSE(imResponse.isMaster()) << imResponse.toBSON().toString(); + ASSERT_TRUE(imResponse.isSecondary()) << imResponse.toBSON().toString(); + getReplCoord()->signalDrainComplete(&txn); + getReplCoord()->fillIsMasterForReplSet(&imResponse); + ASSERT_TRUE(imResponse.isMaster()) << imResponse.toBSON().toString(); + ASSERT_FALSE(imResponse.isSecondary()) << imResponse.toBSON().toString(); +} - TEST_F(ReplCoordElectTest, ElectManyNodesSuccess) { - BSONObj configObj = BSON("_id" << "mySet" << - "version" << 1 << - "members" << BSON_ARRAY(BSON("_id" << 1 << "host" << "node1:12345") - << BSON("_id" << 2 << "host" << "node2:12345") - << BSON("_id" << 3 << "host" << "node3:12345") - )); - assertStartSuccess(configObj, HostAndPort("node1", 12345)); - OperationContextNoop txn; - getReplCoord()->setMyLastOptime(OpTime (100, 1)); - ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); - startCapturingLogMessages(); - simulateSuccessfulElection(); - stopCapturingLogMessages(); - ASSERT_EQUALS(1, countLogLinesContaining("election succeeded")); - } +TEST_F(ReplCoordElectTest, ElectManyNodesSuccess) { + BSONObj configObj = BSON("_id" + << "mySet" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "node1:12345") + << BSON("_id" << 2 << "host" + << "node2:12345") + << BSON("_id" << 3 << "host" + << "node3:12345"))); + assertStartSuccess(configObj, HostAndPort("node1", 12345)); + OperationContextNoop txn; + getReplCoord()->setMyLastOptime(OpTime(100, 1)); + ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); + startCapturingLogMessages(); + simulateSuccessfulElection(); + stopCapturingLogMessages(); + ASSERT_EQUALS(1, countLogLinesContaining("election succeeded")); +} - TEST_F(ReplCoordElectTest, ElectNotEnoughVotes) { - // one responds with -10000 votes, and one doesn't respond, and we are not elected - startCapturingLogMessages(); - BSONObj configObj = BSON("_id" << "mySet" << - "version" << 1 << - "members" << BSON_ARRAY(BSON("_id" << 1 << "host" << "node1:12345") - << BSON("_id" << 2 << "host" << "node2:12345") - << BSON("_id" << 3 << "host" << "node3:12345") - )); - assertStartSuccess(configObj, HostAndPort("node1", 12345)); - ReplicaSetConfig config = assertMakeRSConfig(configObj); - - OperationContextNoop txn; - OpTime time1(100, 1); - getReplCoord()->setMyLastOptime(time1); - 
ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); - - simulateEnoughHeartbeatsForElectability(); - simulateFreshEnoughForElectability(); - NetworkInterfaceMock* net = getNet(); - net->enterNetwork(); - while (net->hasReadyRequests()) { - const NetworkInterfaceMock::NetworkOperationIterator noi = net->getNextReadyRequest(); - const ReplicationExecutor::RemoteCommandRequest& request = noi->getRequest(); - log() << request.target.toString() << " processing " << request.cmdObj; - if (request.target != HostAndPort("node2", 12345)) { - net->blackHole(noi); - } - else if (request.cmdObj.firstElement().fieldNameStringData() != "replSetElect") { - net->blackHole(noi); - } - else { - net->scheduleResponse( - noi, - net->now(), - makeResponseStatus(BSON("ok" << 1 << - "vote" << -10000 << - "round" << OID()))); - } - net->runReadyNetworkOperations(); +TEST_F(ReplCoordElectTest, ElectNotEnoughVotes) { + // one responds with -10000 votes, and one doesn't respond, and we are not elected + startCapturingLogMessages(); + BSONObj configObj = BSON("_id" + << "mySet" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "node1:12345") + << BSON("_id" << 2 << "host" + << "node2:12345") + << BSON("_id" << 3 << "host" + << "node3:12345"))); + assertStartSuccess(configObj, HostAndPort("node1", 12345)); + ReplicaSetConfig config = assertMakeRSConfig(configObj); + + OperationContextNoop txn; + OpTime time1(100, 1); + getReplCoord()->setMyLastOptime(time1); + ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); + + simulateEnoughHeartbeatsForElectability(); + simulateFreshEnoughForElectability(); + NetworkInterfaceMock* net = getNet(); + net->enterNetwork(); + while (net->hasReadyRequests()) { + const NetworkInterfaceMock::NetworkOperationIterator noi = net->getNextReadyRequest(); + const ReplicationExecutor::RemoteCommandRequest& request = noi->getRequest(); + log() << request.target.toString() << " processing " << request.cmdObj; + if (request.target != HostAndPort("node2", 12345)) { + net->blackHole(noi); + } else if (request.cmdObj.firstElement().fieldNameStringData() != "replSetElect") { + net->blackHole(noi); + } else { + net->scheduleResponse( + noi, + net->now(), + makeResponseStatus(BSON("ok" << 1 << "vote" << -10000 << "round" << OID()))); } - net->exitNetwork(); - stopCapturingLogMessages(); - ASSERT_EQUALS(1, - countLogLinesContaining("replSet couldn't elect self, only received -9999 votes")); + net->runReadyNetworkOperations(); } + net->exitNetwork(); + stopCapturingLogMessages(); + ASSERT_EQUALS( + 1, countLogLinesContaining("replSet couldn't elect self, only received -9999 votes")); +} - TEST_F(ReplCoordElectTest, ElectWrongTypeForVote) { - // one responds with a bad 'vote' field, and one doesn't respond, and we are not elected - startCapturingLogMessages(); - BSONObj configObj = BSON("_id" << "mySet" << - "version" << 1 << - "members" << BSON_ARRAY(BSON("_id" << 1 << "host" << "node1:12345") - << BSON("_id" << 2 << "host" << "node2:12345") - << BSON("_id" << 3 << "host" << "node3:12345") - )); - assertStartSuccess(configObj, HostAndPort("node1", 12345)); - ReplicaSetConfig config = assertMakeRSConfig(configObj); - - OperationContextNoop txn; - OpTime time1(100, 1); - getReplCoord()->setMyLastOptime(time1); - ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); - - simulateEnoughHeartbeatsForElectability(); - simulateFreshEnoughForElectability(); - NetworkInterfaceMock* net = getNet(); - net->enterNetwork(); - while 
(net->hasReadyRequests()) { - const NetworkInterfaceMock::NetworkOperationIterator noi = net->getNextReadyRequest(); - const ReplicationExecutor::RemoteCommandRequest& request = noi->getRequest(); - log() << request.target.toString() << " processing " << request.cmdObj; - if (request.target != HostAndPort("node2", 12345)) { - net->blackHole(noi); - } - else if (request.cmdObj.firstElement().fieldNameStringData() != "replSetElect") { - net->blackHole(noi); - } - else { - net->scheduleResponse( - noi, - net->now(), - makeResponseStatus(BSON("ok" << 1 << - "vote" << "yea" << - "round" << OID()))); - } - net->runReadyNetworkOperations(); +TEST_F(ReplCoordElectTest, ElectWrongTypeForVote) { + // one responds with a bad 'vote' field, and one doesn't respond, and we are not elected + startCapturingLogMessages(); + BSONObj configObj = BSON("_id" + << "mySet" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "node1:12345") + << BSON("_id" << 2 << "host" + << "node2:12345") + << BSON("_id" << 3 << "host" + << "node3:12345"))); + assertStartSuccess(configObj, HostAndPort("node1", 12345)); + ReplicaSetConfig config = assertMakeRSConfig(configObj); + + OperationContextNoop txn; + OpTime time1(100, 1); + getReplCoord()->setMyLastOptime(time1); + ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); + + simulateEnoughHeartbeatsForElectability(); + simulateFreshEnoughForElectability(); + NetworkInterfaceMock* net = getNet(); + net->enterNetwork(); + while (net->hasReadyRequests()) { + const NetworkInterfaceMock::NetworkOperationIterator noi = net->getNextReadyRequest(); + const ReplicationExecutor::RemoteCommandRequest& request = noi->getRequest(); + log() << request.target.toString() << " processing " << request.cmdObj; + if (request.target != HostAndPort("node2", 12345)) { + net->blackHole(noi); + } else if (request.cmdObj.firstElement().fieldNameStringData() != "replSetElect") { + net->blackHole(noi); + } else { + net->scheduleResponse(noi, + net->now(), + makeResponseStatus(BSON("ok" << 1 << "vote" + << "yea" + << "round" << OID()))); } - net->exitNetwork(); - stopCapturingLogMessages(); - ASSERT_EQUALS(1, - countLogLinesContaining("wrong type for vote argument in replSetElect command")); + net->runReadyNetworkOperations(); } + net->exitNetwork(); + stopCapturingLogMessages(); + ASSERT_EQUALS(1, + countLogLinesContaining("wrong type for vote argument in replSetElect command")); +} - TEST_F(ReplCoordElectTest, ElectionDuringHBReconfigFails) { - // start up, receive reconfig via heartbeat while at the same time, become candidate. - // candidate state should be cleared. 
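The network-drain loops in ElectNotEnoughVotes and ElectWrongTypeForVote above share one pattern: pull each ready request off the mock network, script a response only for the request under test, and black-hole everything else so no operation is left pending. Condensed into a sketch using the same NetworkInterfaceMock calls (the scripted ok:1 payload is illustrative):

    // Sketch: deterministic handling of mock network traffic. Only the
    // replSetElect request aimed at node2 receives a scripted response.
    net->enterNetwork();
    while (net->hasReadyRequests()) {
        const NetworkInterfaceMock::NetworkOperationIterator noi = net->getNextReadyRequest();
        const ReplicationExecutor::RemoteCommandRequest& request = noi->getRequest();
        if (request.target == HostAndPort("node2", 12345) &&
            request.cmdObj.firstElement().fieldNameStringData() == "replSetElect") {
            net->scheduleResponse(noi, net->now(), makeResponseStatus(BSON("ok" << 1)));
        } else {
            net->blackHole(noi);  // swallow requests this test does not care about
        }
        net->runReadyNetworkOperations();
    }
    net->exitNetwork();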
- OperationContextNoop txn; - assertStartSuccess( - BSON("_id" << "mySet" << - "version" << 2 << - "members" << BSON_ARRAY(BSON("_id" << 1 << "host" << "node1:12345") << - BSON("_id" << 2 << "host" << "node2:12345") << - BSON("_id" << 3 << "host" << "node3:12345") << - BSON("_id" << 4 << "host" << "node4:12345") << - BSON("_id" << 5 << "host" << "node5:12345") )), - HostAndPort("node1", 12345)); - ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); - getReplCoord()->setMyLastOptime(OpTime(100,0)); - - // set hbreconfig to hang while in progress - getExternalState()->setStoreLocalConfigDocumentToHang(true); - - // hb reconfig - NetworkInterfaceMock* net = getNet(); - net->enterNetwork(); - ReplSetHeartbeatResponse hbResp2; - ReplicaSetConfig config; - config.initialize(BSON("_id" << "mySet" << - "version" << 3 << - "members" << BSON_ARRAY(BSON("_id" << 1 << - "host" << "node1:12345") << - BSON("_id" << 2 << - "host" << "node2:12345")))); - hbResp2.setConfig(config); - hbResp2.setVersion(3); - hbResp2.setSetName("mySet"); - hbResp2.setState(MemberState::RS_SECONDARY); - BSONObjBuilder respObj2; - respObj2 << "ok" << 1; - hbResp2.addToBSON(&respObj2); - net->runUntil(net->now() + 10*1000); // run until we've sent a heartbeat request - const NetworkInterfaceMock::NetworkOperationIterator noi2 = net->getNextReadyRequest(); - net->scheduleResponse(noi2, net->now(), makeResponseStatus(respObj2.obj())); - net->runReadyNetworkOperations(); - getNet()->exitNetwork(); - - // prepare candidacy - BSONObjBuilder result; - ReplicationCoordinator::ReplSetReconfigArgs args; - args.force = false; - args.newConfigObj = config.toBSON(); - ASSERT_EQUALS(ErrorCodes::ConfigurationInProgress, - getReplCoord()->processReplSetReconfig(&txn, args, &result)); - - logger::globalLogDomain()->setMinimumLoggedSeverity(logger::LogSeverity::Debug(2)); - startCapturingLogMessages(); - - // receive sufficient heartbeats to trigger an election - ReplicationCoordinatorImpl* replCoord = getReplCoord(); - ReplicaSetConfig rsConfig = replCoord->getReplicaSetConfig_forTest(); - net->enterNetwork(); - for (int i = 0; i < 2; ++i) { - const NetworkInterfaceMock::NetworkOperationIterator noi = net->getNextReadyRequest(); - const ReplicationExecutor::RemoteCommandRequest& request = noi->getRequest(); - log() << request.target.toString() << " processing " << request.cmdObj; - ReplSetHeartbeatArgs hbArgs; - if (hbArgs.initialize(request.cmdObj).isOK()) { - ReplSetHeartbeatResponse hbResp; - hbResp.setSetName(rsConfig.getReplSetName()); - hbResp.setState(MemberState::RS_SECONDARY); - hbResp.setVersion(rsConfig.getConfigVersion()); - BSONObjBuilder respObj; - respObj << "ok" << 1; - hbResp.addToBSON(&respObj); - net->scheduleResponse(noi, net->now(), makeResponseStatus(respObj.obj())); - } - else { - error() << "Black holing unexpected request to " << request.target << ": " << - request.cmdObj; - net->blackHole(noi); - } - net->runReadyNetworkOperations(); +TEST_F(ReplCoordElectTest, ElectionDuringHBReconfigFails) { + // start up, receive reconfig via heartbeat while at the same time, become candidate. + // candidate state should be cleared. 
+ OperationContextNoop txn; + assertStartSuccess( + BSON("_id" + << "mySet" + << "version" << 2 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "node1:12345") + << BSON("_id" << 2 << "host" + << "node2:12345") << BSON("_id" << 3 << "host" + << "node3:12345") + << BSON("_id" << 4 << "host" + << "node4:12345") << BSON("_id" << 5 << "host" + << "node5:12345"))), + HostAndPort("node1", 12345)); + ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); + getReplCoord()->setMyLastOptime(OpTime(100, 0)); + + // set hbreconfig to hang while in progress + getExternalState()->setStoreLocalConfigDocumentToHang(true); + + // hb reconfig + NetworkInterfaceMock* net = getNet(); + net->enterNetwork(); + ReplSetHeartbeatResponse hbResp2; + ReplicaSetConfig config; + config.initialize(BSON("_id" + << "mySet" + << "version" << 3 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "node1:12345") + << BSON("_id" << 2 << "host" + << "node2:12345")))); + hbResp2.setConfig(config); + hbResp2.setVersion(3); + hbResp2.setSetName("mySet"); + hbResp2.setState(MemberState::RS_SECONDARY); + BSONObjBuilder respObj2; + respObj2 << "ok" << 1; + hbResp2.addToBSON(&respObj2); + net->runUntil(net->now() + 10 * 1000); // run until we've sent a heartbeat request + const NetworkInterfaceMock::NetworkOperationIterator noi2 = net->getNextReadyRequest(); + net->scheduleResponse(noi2, net->now(), makeResponseStatus(respObj2.obj())); + net->runReadyNetworkOperations(); + getNet()->exitNetwork(); + + // prepare candidacy + BSONObjBuilder result; + ReplicationCoordinator::ReplSetReconfigArgs args; + args.force = false; + args.newConfigObj = config.toBSON(); + ASSERT_EQUALS(ErrorCodes::ConfigurationInProgress, + getReplCoord()->processReplSetReconfig(&txn, args, &result)); + + logger::globalLogDomain()->setMinimumLoggedSeverity(logger::LogSeverity::Debug(2)); + startCapturingLogMessages(); + + // receive sufficient heartbeats to trigger an election + ReplicationCoordinatorImpl* replCoord = getReplCoord(); + ReplicaSetConfig rsConfig = replCoord->getReplicaSetConfig_forTest(); + net->enterNetwork(); + for (int i = 0; i < 2; ++i) { + const NetworkInterfaceMock::NetworkOperationIterator noi = net->getNextReadyRequest(); + const ReplicationExecutor::RemoteCommandRequest& request = noi->getRequest(); + log() << request.target.toString() << " processing " << request.cmdObj; + ReplSetHeartbeatArgs hbArgs; + if (hbArgs.initialize(request.cmdObj).isOK()) { + ReplSetHeartbeatResponse hbResp; + hbResp.setSetName(rsConfig.getReplSetName()); + hbResp.setState(MemberState::RS_SECONDARY); + hbResp.setVersion(rsConfig.getConfigVersion()); + BSONObjBuilder respObj; + respObj << "ok" << 1; + hbResp.addToBSON(&respObj); + net->scheduleResponse(noi, net->now(), makeResponseStatus(respObj.obj())); + } else { + error() << "Black holing unexpected request to " << request.target << ": " + << request.cmdObj; + net->blackHole(noi); } - - stopCapturingLogMessages(); - // ensure node does not stand for election - ASSERT_EQUALS(1, - countLogLinesContaining("Not standing for election; processing " - "a configuration change")); - getExternalState()->setStoreLocalConfigDocumentToHang(false); + net->runReadyNetworkOperations(); } + stopCapturingLogMessages(); + // ensure node does not stand for election + ASSERT_EQUALS(1, + countLogLinesContaining( + "Not standing for election; processing " + "a configuration change")); + getExternalState()->setStoreLocalConfigDocumentToHang(false); +} } } } diff --git 
a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp index f80d2a5c1a7..cd247f9a864 100644 --- a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp @@ -55,146 +55,125 @@ namespace mongo { namespace repl { namespace { - typedef StatusWith<ReplicationExecutor::CallbackHandle> CBHStatus; - typedef ReplicationExecutor::RemoteCommandRequest CmdRequest; - typedef ReplicationExecutor::CallbackHandle CBHandle; +typedef StatusWith<ReplicationExecutor::CallbackHandle> CBHStatus; +typedef ReplicationExecutor::RemoteCommandRequest CmdRequest; +typedef ReplicationExecutor::CallbackHandle CBHandle; -} //namespace - - void ReplicationCoordinatorImpl::_doMemberHeartbeat(ReplicationExecutor::CallbackData cbData, - const HostAndPort& target, - int targetIndex) { - - _untrackHeartbeatHandle(cbData.myHandle); - if (cbData.status == ErrorCodes::CallbackCanceled) { - return; - } +} // namespace - const Date_t now = _replExecutor.now(); - const std::pair<ReplSetHeartbeatArgs, Milliseconds> hbRequest = - _topCoord->prepareHeartbeatRequest( - now, - _settings.ourSetName(), - target); - - const CmdRequest request(target, "admin", hbRequest.first.toBSON(), hbRequest.second); - const ReplicationExecutor::RemoteCommandCallbackFn callback = stdx::bind( - &ReplicationCoordinatorImpl::_handleHeartbeatResponse, - this, - stdx::placeholders::_1, - targetIndex); - - _trackHeartbeatHandle(_replExecutor.scheduleRemoteCommand(request, callback)); +void ReplicationCoordinatorImpl::_doMemberHeartbeat(ReplicationExecutor::CallbackData cbData, + const HostAndPort& target, + int targetIndex) { + _untrackHeartbeatHandle(cbData.myHandle); + if (cbData.status == ErrorCodes::CallbackCanceled) { + return; } - void ReplicationCoordinatorImpl::_scheduleHeartbeatToTarget( - const HostAndPort& target, - int targetIndex, - Date_t when) { - - LOG(2) << "Scheduling heartbeat to " << target << " at " << dateToISOStringUTC(when); - _trackHeartbeatHandle( - _replExecutor.scheduleWorkAt( - when, - stdx::bind(&ReplicationCoordinatorImpl::_doMemberHeartbeat, - this, - stdx::placeholders::_1, - target, - targetIndex))); + const Date_t now = _replExecutor.now(); + const std::pair<ReplSetHeartbeatArgs, Milliseconds> hbRequest = + _topCoord->prepareHeartbeatRequest(now, _settings.ourSetName(), target); + + const CmdRequest request(target, "admin", hbRequest.first.toBSON(), hbRequest.second); + const ReplicationExecutor::RemoteCommandCallbackFn callback = + stdx::bind(&ReplicationCoordinatorImpl::_handleHeartbeatResponse, + this, + stdx::placeholders::_1, + targetIndex); + + _trackHeartbeatHandle(_replExecutor.scheduleRemoteCommand(request, callback)); +} + +void ReplicationCoordinatorImpl::_scheduleHeartbeatToTarget(const HostAndPort& target, + int targetIndex, + Date_t when) { + LOG(2) << "Scheduling heartbeat to " << target << " at " << dateToISOStringUTC(when); + _trackHeartbeatHandle( + _replExecutor.scheduleWorkAt(when, + stdx::bind(&ReplicationCoordinatorImpl::_doMemberHeartbeat, + this, + stdx::placeholders::_1, + target, + targetIndex))); +} + +void ReplicationCoordinatorImpl::_handleHeartbeatResponse( + const ReplicationExecutor::RemoteCommandCallbackData& cbData, int targetIndex) { + // remove handle from queued heartbeats + _untrackHeartbeatHandle(cbData.myHandle); + + // Parse and validate the response. At the end of this step, if responseStatus is OK then + // hbResponse is valid. 
+ Status responseStatus = cbData.response.getStatus(); + if (responseStatus == ErrorCodes::CallbackCanceled) { + return; } - void ReplicationCoordinatorImpl::_handleHeartbeatResponse( - const ReplicationExecutor::RemoteCommandCallbackData& cbData, int targetIndex) { - - // remove handle from queued heartbeats - _untrackHeartbeatHandle(cbData.myHandle); - - // Parse and validate the response. At the end of this step, if responseStatus is OK then - // hbResponse is valid. - Status responseStatus = cbData.response.getStatus(); - if (responseStatus == ErrorCodes::CallbackCanceled) { - return; + const HostAndPort& target = cbData.request.target; + ReplSetHeartbeatResponse hbResponse; + BSONObj resp; + if (responseStatus.isOK()) { + resp = cbData.response.getValue().data; + responseStatus = hbResponse.initialize(resp); + } + const bool isUnauthorized = (responseStatus.code() == ErrorCodes::Unauthorized) || + (responseStatus.code() == ErrorCodes::AuthenticationFailed); + const Date_t now = _replExecutor.now(); + const OpTime lastApplied = getMyLastOptime(); // Locks and unlocks _mutex. + Milliseconds networkTime(0); + StatusWith<ReplSetHeartbeatResponse> hbStatusResponse(hbResponse); + + if (responseStatus.isOK()) { + networkTime = cbData.response.getValue().elapsedMillis; + } else { + log() << "Error in heartbeat request to " << target << "; " << responseStatus; + if (!resp.isEmpty()) { + LOG(3) << "heartbeat response: " << resp; } - const HostAndPort& target = cbData.request.target; - ReplSetHeartbeatResponse hbResponse; - BSONObj resp; - if (responseStatus.isOK()) { - resp = cbData.response.getValue().data; - responseStatus = hbResponse.initialize(resp); - } - const bool isUnauthorized = (responseStatus.code() == ErrorCodes::Unauthorized) || - (responseStatus.code() == ErrorCodes::AuthenticationFailed); - const Date_t now = _replExecutor.now(); - const OpTime lastApplied = getMyLastOptime(); // Locks and unlocks _mutex. 
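Every outcome of the parse above is funneled into a StatusWith<ReplSetHeartbeatResponse>: either an error Status or a usable value, checked with isOK() before getValue() may be touched. The calling convention in a minimal sketch (parseHeartbeat is a hypothetical stand-in for the hbResponse.initialize() call above):

    // Sketch of the StatusWith<T> error-or-value convention used here.
    StatusWith<ReplSetHeartbeatResponse> parseHeartbeat(const BSONObj& resp);

    void onHeartbeatResponse(const BSONObj& resp) {
        StatusWith<ReplSetHeartbeatResponse> sw = parseHeartbeat(resp);
        if (!sw.isOK()) {
            log() << "bad heartbeat response: " << sw.getStatus();
            return;  // only the error Status exists; there is no value
        }
        const ReplSetHeartbeatResponse& hb = sw.getValue();  // safe after isOK()
        // ... hand hb to the topology coordinator ...
    }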
- Milliseconds networkTime(0); - StatusWith<ReplSetHeartbeatResponse> hbStatusResponse(hbResponse); - - if (responseStatus.isOK()) { + if (isUnauthorized) { networkTime = cbData.response.getValue().elapsedMillis; } - else { - log() << "Error in heartbeat request to " << target << "; " << responseStatus; - if (!resp.isEmpty()) { - LOG(3) << "heartbeat response: " << resp; - } + hbStatusResponse = StatusWith<ReplSetHeartbeatResponse>(responseStatus); + } - if (isUnauthorized) { - networkTime = cbData.response.getValue().elapsedMillis; - } - hbStatusResponse = StatusWith<ReplSetHeartbeatResponse>(responseStatus); - } + HeartbeatResponseAction action = _topCoord->processHeartbeatResponse( + now, networkTime, target, hbStatusResponse, lastApplied); - HeartbeatResponseAction action = - _topCoord->processHeartbeatResponse( - now, - networkTime, - target, - hbStatusResponse, - lastApplied); - - if (action.getAction() == HeartbeatResponseAction::NoAction && - hbStatusResponse.isOK() && - hbStatusResponse.getValue().hasOpTime() && - targetIndex >= 0 && - hbStatusResponse.getValue().hasState() && - hbStatusResponse.getValue().getState() != MemberState::RS_PRIMARY) { - boost::lock_guard<boost::mutex> lk(_mutex); - if (hbStatusResponse.getValue().getVersion() == _rsConfig.getConfigVersion()) { - _updateOpTimeFromHeartbeat_inlock(targetIndex, - hbStatusResponse.getValue().getOpTime()); - } + if (action.getAction() == HeartbeatResponseAction::NoAction && hbStatusResponse.isOK() && + hbStatusResponse.getValue().hasOpTime() && targetIndex >= 0 && + hbStatusResponse.getValue().hasState() && + hbStatusResponse.getValue().getState() != MemberState::RS_PRIMARY) { + boost::lock_guard<boost::mutex> lk(_mutex); + if (hbStatusResponse.getValue().getVersion() == _rsConfig.getConfigVersion()) { + _updateOpTimeFromHeartbeat_inlock(targetIndex, hbStatusResponse.getValue().getOpTime()); } + } - _signalStepDownWaiters(); + _signalStepDownWaiters(); - _scheduleHeartbeatToTarget( - target, - targetIndex, - std::max(now, action.getNextHeartbeatStartDate())); + _scheduleHeartbeatToTarget( + target, targetIndex, std::max(now, action.getNextHeartbeatStartDate())); - _handleHeartbeatResponseAction(action, hbStatusResponse); - } + _handleHeartbeatResponseAction(action, hbStatusResponse); +} - void ReplicationCoordinatorImpl::_updateOpTimeFromHeartbeat_inlock(int targetIndex, - OpTime optime) { - invariant(_selfIndex >= 0); - invariant(targetIndex >= 0); - - SlaveInfo& slaveInfo = _slaveInfo[targetIndex]; - if (optime > slaveInfo.opTime && slaveInfo.rid.isSet()) { - // TODO(spencer): The second part of the above if-statement can be removed after 3.0 - // but for now, to maintain compatibility with 2.6, we can't record optimes for any - // nodes we haven't heard from via replSetUpdatePosition yet to associate an RID. 
- _updateSlaveInfoOptime_inlock(&slaveInfo, optime); - } - } +void ReplicationCoordinatorImpl::_updateOpTimeFromHeartbeat_inlock(int targetIndex, OpTime optime) { + invariant(_selfIndex >= 0); + invariant(targetIndex >= 0); - void ReplicationCoordinatorImpl::_handleHeartbeatResponseAction( - const HeartbeatResponseAction& action, - const StatusWith<ReplSetHeartbeatResponse>& responseStatus) { + SlaveInfo& slaveInfo = _slaveInfo[targetIndex]; + if (optime > slaveInfo.opTime && slaveInfo.rid.isSet()) { + // TODO(spencer): The second part of the above if-statement can be removed after 3.0 + // but for now, to maintain compatibility with 2.6, we can't record optimes for any + // nodes we haven't heard from via replSetUpdatePosition yet to associate an RID. + _updateSlaveInfoOptime_inlock(&slaveInfo, optime); + } +} - switch (action.getAction()) { +void ReplicationCoordinatorImpl::_handleHeartbeatResponseAction( + const HeartbeatResponseAction& action, + const StatusWith<ReplSetHeartbeatResponse>& responseStatus) { + switch (action.getAction()) { case HeartbeatResponseAction::NoAction: // Update the cached member state if different than the current topology member state if (_memberState != _topCoord->getMemberState()) { @@ -219,331 +198,309 @@ namespace { case HeartbeatResponseAction::StepDownRemotePrimary: { invariant(action.getPrimaryConfigIndex() != _selfIndex); _requestRemotePrimaryStepdown( - _rsConfig.getMemberAt(action.getPrimaryConfigIndex()).getHostAndPort()); + _rsConfig.getMemberAt(action.getPrimaryConfigIndex()).getHostAndPort()); break; } default: severe() << "Illegal heartbeat response action code " << int(action.getAction()); invariant(false); - } } +} namespace { - /** - * This callback is purely for logging and has no effect on any other operations - */ - void remoteStepdownCallback(const ReplicationExecutor::RemoteCommandCallbackData& cbData) { - - const Status status = cbData.response.getStatus(); - if (status == ErrorCodes::CallbackCanceled) { - return; - } +/** + * This callback is purely for logging and has no effect on any other operations + */ +void remoteStepdownCallback(const ReplicationExecutor::RemoteCommandCallbackData& cbData) { + const Status status = cbData.response.getStatus(); + if (status == ErrorCodes::CallbackCanceled) { + return; + } - if (status.isOK()) { - LOG(1) << "replset: stepdown of primary(" << cbData.request.target - << ") succeeded with response -- " - << cbData.response.getValue().data; - } - else { - warning() << "replset: stepdown of primary(" << cbData.request.target - << ") failed due to " << cbData.response.getStatus(); - } + if (status.isOK()) { + LOG(1) << "replset: stepdown of primary(" << cbData.request.target + << ") succeeded with response -- " << cbData.response.getValue().data; + } else { + warning() << "replset: stepdown of primary(" << cbData.request.target << ") failed due to " + << cbData.response.getStatus(); } +} } // namespace - void ReplicationCoordinatorImpl::_requestRemotePrimaryStepdown(const HostAndPort& target) { - CmdRequest request(target, "admin", BSON("replSetStepDown" << 1)); +void ReplicationCoordinatorImpl::_requestRemotePrimaryStepdown(const HostAndPort& target) { + CmdRequest request(target, "admin", BSON("replSetStepDown" << 1)); - log() << "Requesting " << target << " step down from primary"; - CBHStatus cbh = _replExecutor.scheduleRemoteCommand( - request, remoteStepdownCallback); - if (cbh.getStatus() != ErrorCodes::ShutdownInProgress) { - fassert(18808, cbh.getStatus()); - } + log() << "Requesting " << 
target << " step down from primary"; + CBHStatus cbh = _replExecutor.scheduleRemoteCommand(request, remoteStepdownCallback); + if (cbh.getStatus() != ErrorCodes::ShutdownInProgress) { + fassert(18808, cbh.getStatus()); } - - void ReplicationCoordinatorImpl::_heartbeatStepDownStart() { - log() << "Stepping down from primary in response to heartbeat"; - _replExecutor.scheduleWorkWithGlobalExclusiveLock( - stdx::bind(&ReplicationCoordinatorImpl::_heartbeatStepDownFinish, - this, - stdx::placeholders::_1)); +} + +void ReplicationCoordinatorImpl::_heartbeatStepDownStart() { + log() << "Stepping down from primary in response to heartbeat"; + _replExecutor.scheduleWorkWithGlobalExclusiveLock(stdx::bind( + &ReplicationCoordinatorImpl::_heartbeatStepDownFinish, this, stdx::placeholders::_1)); +} + +void ReplicationCoordinatorImpl::_heartbeatStepDownFinish( + const ReplicationExecutor::CallbackData& cbData) { + if (cbData.status == ErrorCodes::CallbackCanceled) { + return; } - - void ReplicationCoordinatorImpl::_heartbeatStepDownFinish( - const ReplicationExecutor::CallbackData& cbData) { - - if (cbData.status == ErrorCodes::CallbackCanceled) { - return; - } - invariant(cbData.txn); - // TODO Add invariant that we've got global shared or global exclusive lock, when supported - // by lock manager. - boost::unique_lock<boost::mutex> lk(_mutex); - _topCoord->stepDownIfPending(); - const PostMemberStateUpdateAction action = - _updateMemberStateFromTopologyCoordinator_inlock(); - lk.unlock(); - _performPostMemberStateUpdateAction(action); + invariant(cbData.txn); + // TODO Add invariant that we've got global shared or global exclusive lock, when supported + // by lock manager. + boost::unique_lock<boost::mutex> lk(_mutex); + _topCoord->stepDownIfPending(); + const PostMemberStateUpdateAction action = _updateMemberStateFromTopologyCoordinator_inlock(); + lk.unlock(); + _performPostMemberStateUpdateAction(action); +} + +void ReplicationCoordinatorImpl::_scheduleHeartbeatReconfig(const ReplicaSetConfig& newConfig) { + boost::lock_guard<boost::mutex> lk(_mutex); + if (_inShutdown) { + return; } - void ReplicationCoordinatorImpl::_scheduleHeartbeatReconfig(const ReplicaSetConfig& newConfig) { - boost::lock_guard<boost::mutex> lk(_mutex); - if (_inShutdown) { - return; - } - - switch (_rsConfigState) { + switch (_rsConfigState) { case kConfigStartingUp: - LOG(1) << "Ignoring new configuration with version " << newConfig.getConfigVersion() << - " because still attempting to load local configuration information"; + LOG(1) << "Ignoring new configuration with version " << newConfig.getConfigVersion() + << " because still attempting to load local configuration information"; return; case kConfigUninitialized: case kConfigSteady: - LOG(1) << "Received new config via heartbeat with version " << - newConfig.getConfigVersion(); + LOG(1) << "Received new config via heartbeat with version " + << newConfig.getConfigVersion(); break; case kConfigInitiating: case kConfigReconfiguring: case kConfigHBReconfiguring: - LOG(1) << "Ignoring new configuration with version " << newConfig.getConfigVersion() << - " because already in the midst of a configuration process"; + LOG(1) << "Ignoring new configuration with version " << newConfig.getConfigVersion() + << " because already in the midst of a configuration process"; return; default: - severe() << "Reconfiguration request occurred while _rsConfigState == " << - int(_rsConfigState) << "; aborting."; + severe() << "Reconfiguration request occurred while _rsConfigState == " + << 
int(_rsConfigState) << "; aborting."; fassertFailed(18807); - } - _setConfigState_inlock(kConfigHBReconfiguring); - invariant(!_rsConfig.isInitialized() || - _rsConfig.getConfigVersion() < newConfig.getConfigVersion()); - if (_freshnessChecker) { - _freshnessChecker->cancel(&_replExecutor); - if (_electCmdRunner) { - _electCmdRunner->cancel(&_replExecutor); - } - _replExecutor.onEvent( - _electionFinishedEvent, - stdx::bind(&ReplicationCoordinatorImpl::_heartbeatReconfigAfterElectionCanceled, - this, - stdx::placeholders::_1, - newConfig)); - return; - } - invariant(!_heartbeatReconfigThread.get()); - _heartbeatReconfigThread.reset( - new boost::thread(stdx::bind(&ReplicationCoordinatorImpl::_heartbeatReconfigStore, - this, - newConfig)));; } - - void ReplicationCoordinatorImpl::_heartbeatReconfigAfterElectionCanceled( - const ReplicationExecutor::CallbackData& cbData, - const ReplicaSetConfig& newConfig) { - if (cbData.status == ErrorCodes::CallbackCanceled) { - return; - } - fassert(18911, cbData.status); - boost::lock_guard<boost::mutex> lk(_mutex); - if (_inShutdown) { - return; + _setConfigState_inlock(kConfigHBReconfiguring); + invariant(!_rsConfig.isInitialized() || + _rsConfig.getConfigVersion() < newConfig.getConfigVersion()); + if (_freshnessChecker) { + _freshnessChecker->cancel(&_replExecutor); + if (_electCmdRunner) { + _electCmdRunner->cancel(&_replExecutor); } - - invariant(!_heartbeatReconfigThread.get()); - _heartbeatReconfigThread.reset( - new boost::thread(stdx::bind(&ReplicationCoordinatorImpl::_heartbeatReconfigStore, - this, - newConfig))); + _replExecutor.onEvent( + _electionFinishedEvent, + stdx::bind(&ReplicationCoordinatorImpl::_heartbeatReconfigAfterElectionCanceled, + this, + stdx::placeholders::_1, + newConfig)); + return; + } + invariant(!_heartbeatReconfigThread.get()); + _heartbeatReconfigThread.reset(new boost::thread( + stdx::bind(&ReplicationCoordinatorImpl::_heartbeatReconfigStore, this, newConfig))); + ; +} + +void ReplicationCoordinatorImpl::_heartbeatReconfigAfterElectionCanceled( + const ReplicationExecutor::CallbackData& cbData, const ReplicaSetConfig& newConfig) { + if (cbData.status == ErrorCodes::CallbackCanceled) { + return; + } + fassert(18911, cbData.status); + boost::lock_guard<boost::mutex> lk(_mutex); + if (_inShutdown) { + return; } - void ReplicationCoordinatorImpl::_heartbeatReconfigStore(const ReplicaSetConfig& newConfig) { - class StoreThreadGuard { - public: - StoreThreadGuard(boost::unique_lock<boost::mutex>* lk, - boost::scoped_ptr<boost::thread>* thread, - bool* inShutdown) : - _lk(lk), - _thread(thread), - _inShutdown(inShutdown) {} - ~StoreThreadGuard() { - if (!_lk->owns_lock()) { - _lk->lock(); - } - if (*_inShutdown) { - return; - } - _thread->get()->detach(); - _thread->reset(NULL); + invariant(!_heartbeatReconfigThread.get()); + _heartbeatReconfigThread.reset(new boost::thread( + stdx::bind(&ReplicationCoordinatorImpl::_heartbeatReconfigStore, this, newConfig))); +} + +void ReplicationCoordinatorImpl::_heartbeatReconfigStore(const ReplicaSetConfig& newConfig) { + class StoreThreadGuard { + public: + StoreThreadGuard(boost::unique_lock<boost::mutex>* lk, + boost::scoped_ptr<boost::thread>* thread, + bool* inShutdown) + : _lk(lk), _thread(thread), _inShutdown(inShutdown) {} + ~StoreThreadGuard() { + if (!_lk->owns_lock()) { + _lk->lock(); } - - private: - boost::unique_lock<boost::mutex>* const _lk; - boost::scoped_ptr<boost::thread>* const _thread; - bool* const _inShutdown; - }; - - boost::unique_lock<boost::mutex> 
lk(_mutex, boost::defer_lock_t()); - StoreThreadGuard guard(&lk, &_heartbeatReconfigThread, &_inShutdown); - - const StatusWith<int> myIndex = validateConfigForHeartbeatReconfig( - _externalState.get(), - newConfig); - - if (myIndex.getStatus() == ErrorCodes::NodeNotFound) { - lk.lock(); - // If this node absent in newConfig, and this node was not previously initialized, - // return to kConfigUninitialized immediately, rather than storing the config and - // transitioning into the RS_REMOVED state. See SERVER-15740. - if (!_rsConfig.isInitialized()) { - invariant(_rsConfigState == kConfigHBReconfiguring); - LOG(1) << "Ignoring new configuration in heartbeat response because we are " - "uninitialized and not a member of the new configuration"; - _setConfigState_inlock(kConfigUninitialized); + if (*_inShutdown) { return; } - lk.unlock(); + _thread->get()->detach(); + _thread->reset(NULL); } - if (!myIndex.getStatus().isOK() && myIndex.getStatus() != ErrorCodes::NodeNotFound) { - warning() << "Not persisting new configuration in heartbeat response to disk because " - "it is invalid: "<< myIndex.getStatus(); - } - else { - boost::scoped_ptr<OperationContext> txn( - _externalState->createOperationContext("WriteReplSetConfig")); - Status status = _externalState->storeLocalConfigDocument(txn.get(), newConfig.toBSON()); - - lk.lock(); - if (!status.isOK()) { - error() << "Ignoring new configuration in heartbeat response because we failed to" - " write it to stable storage; " << status; - invariant(_rsConfigState == kConfigHBReconfiguring); - if (_rsConfig.isInitialized()) { - _setConfigState_inlock(kConfigSteady); - } - else { - _setConfigState_inlock(kConfigUninitialized); - } - return; - } - - lk.unlock(); + private: + boost::unique_lock<boost::mutex>* const _lk; + boost::scoped_ptr<boost::thread>* const _thread; + bool* const _inShutdown; + }; - _externalState->startThreads(); - } + boost::unique_lock<boost::mutex> lk(_mutex, boost::defer_lock_t()); + StoreThreadGuard guard(&lk, &_heartbeatReconfigThread, &_inShutdown); - const stdx::function<void (const ReplicationExecutor::CallbackData&)> reconfigFinishFn( - stdx::bind(&ReplicationCoordinatorImpl::_heartbeatReconfigFinish, - this, - stdx::placeholders::_1, - newConfig, - myIndex)); + const StatusWith<int> myIndex = + validateConfigForHeartbeatReconfig(_externalState.get(), newConfig); - // Make sure that the reconfigFinishFn doesn't finish until we've reset - // _heartbeatReconfigThread. + if (myIndex.getStatus() == ErrorCodes::NodeNotFound) { lk.lock(); - if (_memberState.primary()) { - // If the primary is receiving a heartbeat reconfig, that strongly suggests - // that there has been a force reconfiguration. In any event, it might lead - // to this node stepping down as primary, so we'd better do it with the global - // lock. - _replExecutor.scheduleWorkWithGlobalExclusiveLock(reconfigFinishFn); - } - else { - _replExecutor.scheduleWork(reconfigFinishFn); + // If this node absent in newConfig, and this node was not previously initialized, + // return to kConfigUninitialized immediately, rather than storing the config and + // transitioning into the RS_REMOVED state. See SERVER-15740. 
+ if (!_rsConfig.isInitialized()) { + invariant(_rsConfigState == kConfigHBReconfiguring); + LOG(1) << "Ignoring new configuration in heartbeat response because we are " + "uninitialized and not a member of the new configuration"; + _setConfigState_inlock(kConfigUninitialized); + return; } + lk.unlock(); } - void ReplicationCoordinatorImpl::_heartbeatReconfigFinish( - const ReplicationExecutor::CallbackData& cbData, - const ReplicaSetConfig& newConfig, - StatusWith<int> myIndex) { - if (cbData.status == ErrorCodes::CallbackCanceled) { - return; - } + if (!myIndex.getStatus().isOK() && myIndex.getStatus() != ErrorCodes::NodeNotFound) { + warning() << "Not persisting new configuration in heartbeat response to disk because " + "it is invalid: " << myIndex.getStatus(); + } else { + boost::scoped_ptr<OperationContext> txn( + _externalState->createOperationContext("WriteReplSetConfig")); + Status status = _externalState->storeLocalConfigDocument(txn.get(), newConfig.toBSON()); - boost::unique_lock<boost::mutex> lk(_mutex); - invariant(_rsConfigState == kConfigHBReconfiguring); - invariant(!_rsConfig.isInitialized() || - _rsConfig.getConfigVersion() < newConfig.getConfigVersion()); - - if (_getMemberState_inlock().primary() && !cbData.txn) { - // Not having an OperationContext in the CallbackData means we definitely aren't holding - // the global lock. Since we're primary and this reconfig could cause us to stepdown, - // reschedule this work with the global exclusive lock so the stepdown is safe. - // TODO(spencer): When we *do* have an OperationContext, consult it to confirm that - // we are indeed holding the global lock. - _replExecutor.scheduleWorkWithGlobalExclusiveLock( - stdx::bind(&ReplicationCoordinatorImpl::_heartbeatReconfigFinish, - this, - stdx::placeholders::_1, - newConfig, - myIndex)); + lk.lock(); + if (!status.isOK()) { + error() << "Ignoring new configuration in heartbeat response because we failed to" + " write it to stable storage; " << status; + invariant(_rsConfigState == kConfigHBReconfiguring); + if (_rsConfig.isInitialized()) { + _setConfigState_inlock(kConfigSteady); + } else { + _setConfigState_inlock(kConfigUninitialized); + } return; } - if (!myIndex.isOK()) { - switch (myIndex.getStatus().code()) { + lk.unlock(); + + _externalState->startThreads(); + } + + const stdx::function<void(const ReplicationExecutor::CallbackData&)> reconfigFinishFn( + stdx::bind(&ReplicationCoordinatorImpl::_heartbeatReconfigFinish, + this, + stdx::placeholders::_1, + newConfig, + myIndex)); + + // Make sure that the reconfigFinishFn doesn't finish until we've reset + // _heartbeatReconfigThread. + lk.lock(); + if (_memberState.primary()) { + // If the primary is receiving a heartbeat reconfig, that strongly suggests + // that there has been a force reconfiguration. In any event, it might lead + // to this node stepping down as primary, so we'd better do it with the global + // lock. 
+ _replExecutor.scheduleWorkWithGlobalExclusiveLock(reconfigFinishFn); + } else { + _replExecutor.scheduleWork(reconfigFinishFn); + } +} + +void ReplicationCoordinatorImpl::_heartbeatReconfigFinish( + const ReplicationExecutor::CallbackData& cbData, + const ReplicaSetConfig& newConfig, + StatusWith<int> myIndex) { + if (cbData.status == ErrorCodes::CallbackCanceled) { + return; + } + + boost::unique_lock<boost::mutex> lk(_mutex); + invariant(_rsConfigState == kConfigHBReconfiguring); + invariant(!_rsConfig.isInitialized() || + _rsConfig.getConfigVersion() < newConfig.getConfigVersion()); + + if (_getMemberState_inlock().primary() && !cbData.txn) { + // Not having an OperationContext in the CallbackData means we definitely aren't holding + // the global lock. Since we're primary and this reconfig could cause us to stepdown, + // reschedule this work with the global exclusive lock so the stepdown is safe. + // TODO(spencer): When we *do* have an OperationContext, consult it to confirm that + // we are indeed holding the global lock. + _replExecutor.scheduleWorkWithGlobalExclusiveLock( + stdx::bind(&ReplicationCoordinatorImpl::_heartbeatReconfigFinish, + this, + stdx::placeholders::_1, + newConfig, + myIndex)); + return; + } + + if (!myIndex.isOK()) { + switch (myIndex.getStatus().code()) { case ErrorCodes::NodeNotFound: - log() << "Cannot find self in new replica set configuration; I must be removed; " << - myIndex.getStatus(); + log() << "Cannot find self in new replica set configuration; I must be removed; " + << myIndex.getStatus(); break; case ErrorCodes::DuplicateKey: error() << "Several entries in new config represent this node; " - "Removing self until an acceptable configuration arrives; " << - myIndex.getStatus(); + "Removing self until an acceptable configuration arrives; " + << myIndex.getStatus(); break; default: error() << "Could not validate configuration received from remote node; " - "Removing self until an acceptable configuration arrives; " << - myIndex.getStatus(); + "Removing self until an acceptable configuration arrives; " + << myIndex.getStatus(); break; - } - myIndex = StatusWith<int>(-1); } - const PostMemberStateUpdateAction action = - _setCurrentRSConfig_inlock(newConfig, myIndex.getValue()); - lk.unlock(); - _performPostMemberStateUpdateAction(action); + myIndex = StatusWith<int>(-1); } - - void ReplicationCoordinatorImpl::_trackHeartbeatHandle(const StatusWith<CBHandle>& handle) { - if (handle.getStatus() == ErrorCodes::ShutdownInProgress) { - return; - } - fassert(18912, handle.getStatus()); - _heartbeatHandles.push_back(handle.getValue()); + const PostMemberStateUpdateAction action = + _setCurrentRSConfig_inlock(newConfig, myIndex.getValue()); + lk.unlock(); + _performPostMemberStateUpdateAction(action); +} + +void ReplicationCoordinatorImpl::_trackHeartbeatHandle(const StatusWith<CBHandle>& handle) { + if (handle.getStatus() == ErrorCodes::ShutdownInProgress) { + return; } - - void ReplicationCoordinatorImpl::_untrackHeartbeatHandle(const CBHandle& handle) { - const HeartbeatHandles::iterator newEnd = std::remove( - _heartbeatHandles.begin(), - _heartbeatHandles.end(), - handle); - invariant(newEnd != _heartbeatHandles.end()); - _heartbeatHandles.erase(newEnd, _heartbeatHandles.end()); - } - - void ReplicationCoordinatorImpl::_cancelHeartbeats() { - std::for_each(_heartbeatHandles.begin(), - _heartbeatHandles.end(), - stdx::bind(&ReplicationExecutor::cancel, - &_replExecutor, - stdx::placeholders::_1)); - // Heartbeat callbacks will remove themselves from 
_heartbeatHandles when they execute with - // CallbackCanceled status, so it's better to leave the handles in the list, for now. - } - - void ReplicationCoordinatorImpl::_startHeartbeats() { - const Date_t now = _replExecutor.now(); - _seedList.clear(); - for (int i = 0; i < _rsConfig.getNumMembers(); ++i) { - if (i == _selfIndex) { - continue; - } - _scheduleHeartbeatToTarget(_rsConfig.getMemberAt(i).getHostAndPort(), i, now); + fassert(18912, handle.getStatus()); + _heartbeatHandles.push_back(handle.getValue()); +} + +void ReplicationCoordinatorImpl::_untrackHeartbeatHandle(const CBHandle& handle) { + const HeartbeatHandles::iterator newEnd = + std::remove(_heartbeatHandles.begin(), _heartbeatHandles.end(), handle); + invariant(newEnd != _heartbeatHandles.end()); + _heartbeatHandles.erase(newEnd, _heartbeatHandles.end()); +} + +void ReplicationCoordinatorImpl::_cancelHeartbeats() { + std::for_each(_heartbeatHandles.begin(), + _heartbeatHandles.end(), + stdx::bind(&ReplicationExecutor::cancel, &_replExecutor, stdx::placeholders::_1)); + // Heartbeat callbacks will remove themselves from _heartbeatHandles when they execute with + // CallbackCanceled status, so it's better to leave the handles in the list, for now. +} + +void ReplicationCoordinatorImpl::_startHeartbeats() { + const Date_t now = _replExecutor.now(); + _seedList.clear(); + for (int i = 0; i < _rsConfig.getNumMembers(); ++i) { + if (i == _selfIndex) { + continue; } + _scheduleHeartbeatToTarget(_rsConfig.getMemberAt(i).getHostAndPort(), i, now); } +} -} // namespace repl -} // namespace mongo +} // namespace repl +} // namespace mongo diff --git a/src/mongo/db/repl/replication_coordinator_impl_heartbeat_test.cpp b/src/mongo/db/repl/replication_coordinator_impl_heartbeat_test.cpp index 9008dbb9854..a2edb95ad6b 100644 --- a/src/mongo/db/repl/replication_coordinator_impl_heartbeat_test.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl_heartbeat_test.cpp @@ -47,200 +47,208 @@ namespace mongo { namespace repl { namespace { - class ReplCoordHBTest : public ReplCoordTest { - protected: - void assertMemberState(MemberState expected, std::string msg = ""); - ReplSetHeartbeatResponse receiveHeartbeatFrom( - const ReplicaSetConfig& rsConfig, - int sourceId, - const HostAndPort& source); - }; - - void ReplCoordHBTest::assertMemberState(const MemberState expected, std::string msg) { - const MemberState actual = getReplCoord()->getMemberState(); - ASSERT(expected == actual) << "Expected coordinator to report state " << - expected.toString() << " but found " << actual.toString() << " - " << msg; +class ReplCoordHBTest : public ReplCoordTest { +protected: + void assertMemberState(MemberState expected, std::string msg = ""); + ReplSetHeartbeatResponse receiveHeartbeatFrom(const ReplicaSetConfig& rsConfig, + int sourceId, + const HostAndPort& source); +}; + +void ReplCoordHBTest::assertMemberState(const MemberState expected, std::string msg) { + const MemberState actual = getReplCoord()->getMemberState(); + ASSERT(expected == actual) << "Expected coordinator to report state " << expected.toString() + << " but found " << actual.toString() << " - " << msg; +} + +ReplSetHeartbeatResponse ReplCoordHBTest::receiveHeartbeatFrom(const ReplicaSetConfig& rsConfig, + int sourceId, + const HostAndPort& source) { + ReplSetHeartbeatArgs hbArgs; + hbArgs.setProtocolVersion(1); + hbArgs.setConfigVersion(rsConfig.getConfigVersion()); + hbArgs.setSetName(rsConfig.getReplSetName()); + hbArgs.setSenderHost(source); + hbArgs.setSenderId(sourceId); + 
ASSERT(hbArgs.isInitialized()); + + ReplSetHeartbeatResponse response; + ASSERT_OK(getReplCoord()->processHeartbeat(hbArgs, &response)); + return response; +} + +TEST_F(ReplCoordHBTest, JoinExistingReplSet) { + logger::globalLogDomain()->setMinimumLoggedSeverity(logger::LogSeverity::Debug(3)); + ReplicaSetConfig rsConfig = + assertMakeRSConfig(BSON("_id" + << "mySet" + << "version" << 3 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "h1:1") + << BSON("_id" << 2 << "host" + << "h2:1") << BSON("_id" << 3 << "host" + << "h3:1")))); + init("mySet"); + addSelf(HostAndPort("h2", 1)); + const Date_t startDate = getNet()->now(); + start(); + enterNetwork(); + assertMemberState(MemberState::RS_STARTUP); + NetworkInterfaceMock* net = getNet(); + ASSERT_FALSE(net->hasReadyRequests()); + exitNetwork(); + receiveHeartbeatFrom(rsConfig, 1, HostAndPort("h1", 1)); + + enterNetwork(); + NetworkInterfaceMock::NetworkOperationIterator noi = net->getNextReadyRequest(); + const ReplicationExecutor::RemoteCommandRequest& request = noi->getRequest(); + ASSERT_EQUALS(HostAndPort("h1", 1), request.target); + ReplSetHeartbeatArgs hbArgs; + ASSERT_OK(hbArgs.initialize(request.cmdObj)); + ASSERT_EQUALS("mySet", hbArgs.getSetName()); + ASSERT_EQUALS(-2, hbArgs.getConfigVersion()); + ReplSetHeartbeatResponse hbResp; + hbResp.setSetName("mySet"); + hbResp.setState(MemberState::RS_PRIMARY); + hbResp.noteReplSet(); + hbResp.setVersion(rsConfig.getConfigVersion()); + hbResp.setConfig(rsConfig); + BSONObjBuilder responseBuilder; + responseBuilder << "ok" << 1; + hbResp.addToBSON(&responseBuilder); + net->scheduleResponse(noi, startDate + 200, makeResponseStatus(responseBuilder.obj())); + assertRunUntil(startDate + 200); + + // Because the new config is stored using an out-of-band thread, we need to perform some + // extra synchronization to let the executor finish the heartbeat reconfig. We know that + // after the out-of-band thread completes, it schedules new heartbeats. We assume that no + // other network operations get scheduled during or before the reconfig, though this may + // cease to be true in the future. + noi = net->getNextReadyRequest(); + + assertMemberState(MemberState::RS_STARTUP2); + OperationContextNoop txn; + ReplicaSetConfig storedConfig; + ASSERT_OK(storedConfig.initialize( + unittest::assertGet(getExternalState()->loadLocalConfigDocument(&txn)))); + ASSERT_OK(storedConfig.validate()); + ASSERT_EQUALS(3, storedConfig.getConfigVersion()); + ASSERT_EQUALS(3, storedConfig.getNumMembers()); + exitNetwork(); +} + +TEST_F(ReplCoordHBTest, DoNotJoinReplSetIfNotAMember) { + // Tests that a node in RS_STARTUP will not transition to RS_REMOVED if it receives a + // configuration that does not contain it. 
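+    // The config built below names h1 through h3 only, while this node registers
+    // itself as h4, so it is deliberately absent from the incoming configuration.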
+ logger::globalLogDomain()->setMinimumLoggedSeverity(logger::LogSeverity::Debug(3)); + ReplicaSetConfig rsConfig = + assertMakeRSConfig(BSON("_id" + << "mySet" + << "version" << 3 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "h1:1") + << BSON("_id" << 2 << "host" + << "h2:1") << BSON("_id" << 3 << "host" + << "h3:1")))); + init("mySet"); + addSelf(HostAndPort("h4", 1)); + const Date_t startDate = getNet()->now(); + start(); + enterNetwork(); + assertMemberState(MemberState::RS_STARTUP, "1"); + NetworkInterfaceMock* net = getNet(); + ASSERT_FALSE(net->hasReadyRequests()); + exitNetwork(); + receiveHeartbeatFrom(rsConfig, 1, HostAndPort("h1", 1)); + + enterNetwork(); + NetworkInterfaceMock::NetworkOperationIterator noi = net->getNextReadyRequest(); + const ReplicationExecutor::RemoteCommandRequest& request = noi->getRequest(); + ASSERT_EQUALS(HostAndPort("h1", 1), request.target); + ReplSetHeartbeatArgs hbArgs; + ASSERT_OK(hbArgs.initialize(request.cmdObj)); + ASSERT_EQUALS("mySet", hbArgs.getSetName()); + ASSERT_EQUALS(-2, hbArgs.getConfigVersion()); + ReplSetHeartbeatResponse hbResp; + hbResp.setSetName("mySet"); + hbResp.setState(MemberState::RS_PRIMARY); + hbResp.noteReplSet(); + hbResp.setVersion(rsConfig.getConfigVersion()); + hbResp.setConfig(rsConfig); + BSONObjBuilder responseBuilder; + responseBuilder << "ok" << 1; + hbResp.addToBSON(&responseBuilder); + net->scheduleResponse(noi, startDate + 200, makeResponseStatus(responseBuilder.obj())); + assertRunUntil(startDate + 2200); + + // Because the new config is stored using an out-of-band thread, we need to perform some + // extra synchronization to let the executor finish the heartbeat reconfig. We know that + // after the out-of-band thread completes, it schedules new heartbeats. We assume that no + // other network operations get scheduled during or before the reconfig, though this may + // cease to be true in the future. 
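+    // Fetching the next ready request doubles as that synchronization point: a new
+    // heartbeat can only become ready once the out-of-band reconfig has finished.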
+ noi = net->getNextReadyRequest(); + + assertMemberState(MemberState::RS_STARTUP, "2"); + OperationContextNoop txn; + + StatusWith<BSONObj> loadedConfig(getExternalState()->loadLocalConfigDocument(&txn)); + ASSERT_NOT_OK(loadedConfig.getStatus()) << loadedConfig.getValue(); + exitNetwork(); +} + +TEST_F(ReplCoordHBTest, NotYetInitializedConfigStateEarlyReturn) { + // ensure that if we've yet to receive an initial config, we return NotYetInitialized + init("mySet"); + ReplSetHeartbeatArgs hbArgs; + hbArgs.setProtocolVersion(1); + hbArgs.setConfigVersion(3); + hbArgs.setSetName("mySet"); + hbArgs.setSenderHost(HostAndPort("h1:1")); + hbArgs.setSenderId(1); + ASSERT(hbArgs.isInitialized()); + + ReplSetHeartbeatResponse response; + Status status = getReplCoord()->processHeartbeat(hbArgs, &response); + ASSERT_EQUALS(ErrorCodes::NotYetInitialized, status.code()); +} + +TEST_F(ReplCoordHBTest, OnlyUnauthorizedUpCausesRecovering) { + // Tests that a node that only has auth error heartbeats is recovering + logger::globalLogDomain()->setMinimumLoggedSeverity(logger::LogSeverity::Debug(3)); + assertStartSuccess(BSON("_id" + << "mySet" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "node1:12345") + << BSON("_id" << 2 << "host" + << "node2:12345"))), + HostAndPort("node1", 12345)); + ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); + + // process heartbeat + enterNetwork(); + const NetworkInterfaceMock::NetworkOperationIterator noi = getNet()->getNextReadyRequest(); + const ReplicationExecutor::RemoteCommandRequest& request = noi->getRequest(); + log() << request.target.toString() << " processing " << request.cmdObj; + getNet()->scheduleResponse( + noi, + getNet()->now(), + makeResponseStatus(BSON("ok" << 0.0 << "errmsg" + << "unauth'd" + << "code" << ErrorCodes::Unauthorized))); + + if (request.target != HostAndPort("node2", 12345) && + request.cmdObj.firstElement().fieldNameStringData() != "replSetHeartbeat") { + error() << "Black holing unexpected request to " << request.target << ": " + << request.cmdObj; + getNet()->blackHole(noi); } + getNet()->runReadyNetworkOperations(); + exitNetwork(); - ReplSetHeartbeatResponse ReplCoordHBTest::receiveHeartbeatFrom( - const ReplicaSetConfig& rsConfig, - int sourceId, - const HostAndPort& source) { - ReplSetHeartbeatArgs hbArgs; - hbArgs.setProtocolVersion(1); - hbArgs.setConfigVersion(rsConfig.getConfigVersion()); - hbArgs.setSetName(rsConfig.getReplSetName()); - hbArgs.setSenderHost(source); - hbArgs.setSenderId(sourceId); - ASSERT(hbArgs.isInitialized()); - - ReplSetHeartbeatResponse response; - ASSERT_OK(getReplCoord()->processHeartbeat(hbArgs, &response)); - return response; - } - - TEST_F(ReplCoordHBTest, JoinExistingReplSet) { - logger::globalLogDomain()->setMinimumLoggedSeverity(logger::LogSeverity::Debug(3)); - ReplicaSetConfig rsConfig = assertMakeRSConfig( - BSON("_id" << "mySet" << - "version" << 3 << - "members" << BSON_ARRAY(BSON("_id" << 1 << "host" << "h1:1") << - BSON("_id" << 2 << "host" << "h2:1") << - BSON("_id" << 3 << "host" << "h3:1")))); - init("mySet"); - addSelf(HostAndPort("h2", 1)); - const Date_t startDate = getNet()->now(); - start(); - enterNetwork(); - assertMemberState(MemberState::RS_STARTUP); - NetworkInterfaceMock* net = getNet(); - ASSERT_FALSE(net->hasReadyRequests()); - exitNetwork(); - receiveHeartbeatFrom(rsConfig, 1, HostAndPort("h1", 1)); - - enterNetwork(); - NetworkInterfaceMock::NetworkOperationIterator noi = net->getNextReadyRequest(); - const 
ReplicationExecutor::RemoteCommandRequest& request = noi->getRequest(); - ASSERT_EQUALS(HostAndPort("h1", 1), request.target); - ReplSetHeartbeatArgs hbArgs; - ASSERT_OK(hbArgs.initialize(request.cmdObj)); - ASSERT_EQUALS("mySet", hbArgs.getSetName()); - ASSERT_EQUALS(-2, hbArgs.getConfigVersion()); - ReplSetHeartbeatResponse hbResp; - hbResp.setSetName("mySet"); - hbResp.setState(MemberState::RS_PRIMARY); - hbResp.noteReplSet(); - hbResp.setVersion(rsConfig.getConfigVersion()); - hbResp.setConfig(rsConfig); - BSONObjBuilder responseBuilder; - responseBuilder << "ok" << 1; - hbResp.addToBSON(&responseBuilder); - net->scheduleResponse(noi, startDate + 200, makeResponseStatus(responseBuilder.obj())); - assertRunUntil(startDate + 200); - - // Because the new config is stored using an out-of-band thread, we need to perform some - // extra synchronization to let the executor finish the heartbeat reconfig. We know that - // after the out-of-band thread completes, it schedules new heartbeats. We assume that no - // other network operations get scheduled during or before the reconfig, though this may - // cease to be true in the future. - noi = net->getNextReadyRequest(); - - assertMemberState(MemberState::RS_STARTUP2); - OperationContextNoop txn; - ReplicaSetConfig storedConfig; - ASSERT_OK(storedConfig.initialize( - unittest::assertGet(getExternalState()->loadLocalConfigDocument(&txn)))); - ASSERT_OK(storedConfig.validate()); - ASSERT_EQUALS(3, storedConfig.getConfigVersion()); - ASSERT_EQUALS(3, storedConfig.getNumMembers()); - exitNetwork(); - } - - TEST_F(ReplCoordHBTest, DoNotJoinReplSetIfNotAMember) { - // Tests that a node in RS_STARTUP will not transition to RS_REMOVED if it receives a - // configuration that does not contain it. - logger::globalLogDomain()->setMinimumLoggedSeverity(logger::LogSeverity::Debug(3)); - ReplicaSetConfig rsConfig = assertMakeRSConfig( - BSON("_id" << "mySet" << - "version" << 3 << - "members" << BSON_ARRAY(BSON("_id" << 1 << "host" << "h1:1") << - BSON("_id" << 2 << "host" << "h2:1") << - BSON("_id" << 3 << "host" << "h3:1")))); - init("mySet"); - addSelf(HostAndPort("h4", 1)); - const Date_t startDate = getNet()->now(); - start(); - enterNetwork(); - assertMemberState(MemberState::RS_STARTUP, "1"); - NetworkInterfaceMock* net = getNet(); - ASSERT_FALSE(net->hasReadyRequests()); - exitNetwork(); - receiveHeartbeatFrom(rsConfig, 1, HostAndPort("h1", 1)); - - enterNetwork(); - NetworkInterfaceMock::NetworkOperationIterator noi = net->getNextReadyRequest(); - const ReplicationExecutor::RemoteCommandRequest& request = noi->getRequest(); - ASSERT_EQUALS(HostAndPort("h1", 1), request.target); - ReplSetHeartbeatArgs hbArgs; - ASSERT_OK(hbArgs.initialize(request.cmdObj)); - ASSERT_EQUALS("mySet", hbArgs.getSetName()); - ASSERT_EQUALS(-2, hbArgs.getConfigVersion()); - ReplSetHeartbeatResponse hbResp; - hbResp.setSetName("mySet"); - hbResp.setState(MemberState::RS_PRIMARY); - hbResp.noteReplSet(); - hbResp.setVersion(rsConfig.getConfigVersion()); - hbResp.setConfig(rsConfig); - BSONObjBuilder responseBuilder; - responseBuilder << "ok" << 1; - hbResp.addToBSON(&responseBuilder); - net->scheduleResponse(noi, startDate + 200, makeResponseStatus(responseBuilder.obj())); - assertRunUntil(startDate + 2200); - - // Because the new config is stored using an out-of-band thread, we need to perform some - // extra synchronization to let the executor finish the heartbeat reconfig. We know that - // after the out-of-band thread completes, it schedules new heartbeats. 
We assume that no - // other network operations get scheduled during or before the reconfig, though this may - // cease to be true in the future. - noi = net->getNextReadyRequest(); - - assertMemberState(MemberState::RS_STARTUP, "2"); - OperationContextNoop txn; - - StatusWith<BSONObj> loadedConfig(getExternalState()->loadLocalConfigDocument(&txn)); - ASSERT_NOT_OK(loadedConfig.getStatus()) << loadedConfig.getValue(); - exitNetwork(); - } - - TEST_F(ReplCoordHBTest, NotYetInitializedConfigStateEarlyReturn) { - // ensure that if we've yet to receive an initial config, we return NotYetInitialized - init("mySet"); - ReplSetHeartbeatArgs hbArgs; - hbArgs.setProtocolVersion(1); - hbArgs.setConfigVersion(3); - hbArgs.setSetName("mySet"); - hbArgs.setSenderHost(HostAndPort("h1:1")); - hbArgs.setSenderId(1); - ASSERT(hbArgs.isInitialized()); - - ReplSetHeartbeatResponse response; - Status status = getReplCoord()->processHeartbeat(hbArgs, &response); - ASSERT_EQUALS(ErrorCodes::NotYetInitialized, status.code()); - } - - TEST_F(ReplCoordHBTest, OnlyUnauthorizedUpCausesRecovering) { - // Tests that a node that only has auth error heartbeats is recovering - logger::globalLogDomain()->setMinimumLoggedSeverity(logger::LogSeverity::Debug(3)); - assertStartSuccess( - BSON("_id" << "mySet" << - "version" << 1 << - "members" << BSON_ARRAY(BSON("_id" << 1 << "host" << "node1:12345") << - BSON("_id" << 2 << "host" << "node2:12345"))), - HostAndPort("node1", 12345)); - ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); - - // process heartbeat - enterNetwork(); - const NetworkInterfaceMock::NetworkOperationIterator noi = getNet()->getNextReadyRequest(); - const ReplicationExecutor::RemoteCommandRequest& request = noi->getRequest(); - log() << request.target.toString() << " processing " << request.cmdObj; - getNet()->scheduleResponse(noi, getNet()->now(), makeResponseStatus( - BSON("ok" << 0.0 << - "errmsg" << "unauth'd" << - "code" << ErrorCodes::Unauthorized))); - - if (request.target != HostAndPort("node2", 12345) - && request.cmdObj.firstElement().fieldNameStringData() != "replSetHeartbeat") { - error() << "Black holing unexpected request to " - << request.target << ": " << request.cmdObj; - getNet()->blackHole(noi); - } - getNet()->runReadyNetworkOperations(); - exitNetwork(); - - ASSERT_TRUE(getTopoCoord().getMemberState().recovering()); - assertMemberState(MemberState::RS_RECOVERING, "0"); - } + ASSERT_TRUE(getTopoCoord().getMemberState().recovering()); + assertMemberState(MemberState::RS_RECOVERING, "0"); +} } // namespace } // namespace repl diff --git a/src/mongo/db/repl/replication_coordinator_impl_reconfig_test.cpp b/src/mongo/db/repl/replication_coordinator_impl_reconfig_test.cpp index 39ba0557b18..079f6e2227a 100644 --- a/src/mongo/db/repl/replication_coordinator_impl_reconfig_test.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl_reconfig_test.cpp @@ -39,7 +39,7 @@ #include "mongo/db/repl/replication_coordinator_external_state_mock.h" #include "mongo/db/repl/replication_coordinator_impl.h" #include "mongo/db/repl/replication_coordinator_test_fixture.h" -#include "mongo/db/repl/replication_coordinator.h" // ReplSetReconfigArgs +#include "mongo/db/repl/replication_coordinator.h" // ReplSetReconfigArgs #include "mongo/unittest/unittest.h" #include "mongo/util/log.h" @@ -47,391 +47,418 @@ namespace mongo { namespace repl { namespace { - typedef ReplicationCoordinator::ReplSetReconfigArgs ReplSetReconfigArgs; - typedef ReplicationExecutor::RemoteCommandRequest 
RemoteCommandRequest; - - TEST_F(ReplCoordTest, ReconfigBeforeInitialized) { - // start up but do not initiate - OperationContextNoop txn; - init(); - start(); - BSONObjBuilder result; - ReplSetReconfigArgs args; - - ASSERT_EQUALS(ErrorCodes::NotYetInitialized, - getReplCoord()->processReplSetReconfig(&txn, args, &result)); - ASSERT_TRUE(result.obj().isEmpty()); - } - - TEST_F(ReplCoordTest, ReconfigWhileNotPrimary) { - // start up, become secondary, receive reconfig - OperationContextNoop txn; - init(); - assertStartSuccess( - BSON("_id" << "mySet" << - "version" << 2 << - "members" << BSON_ARRAY(BSON("_id" << 1 << "host" << "node1:12345") << - BSON("_id" << 2 << "host" << "node2:12345") )), - HostAndPort("node1", 12345)); - - BSONObjBuilder result; - ReplSetReconfigArgs args; - args.force = false; - ASSERT_EQUALS(ErrorCodes::NotMaster, - getReplCoord()->processReplSetReconfig(&txn, args, &result)); - ASSERT_TRUE(result.obj().isEmpty()); - } - - TEST_F(ReplCoordTest, ReconfigWithUninitializableConfig) { - // start up, become primary, receive uninitializable config - OperationContextNoop txn; - assertStartSuccess( - BSON("_id" << "mySet" << - "version" << 2 << - "members" << BSON_ARRAY(BSON("_id" << 1 << "host" << "node1:12345") << - BSON("_id" << 2 << "host" << "node2:12345") )), - HostAndPort("node1", 12345)); - ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); - getReplCoord()->setMyLastOptime(OpTime(100, 0)); - simulateSuccessfulElection(); - - BSONObjBuilder result; - ReplSetReconfigArgs args; - args.force = false; - args.newConfigObj = BSON("_id" << "mySet" << - "version" << 2 << - "invalidlyNamedField" << 3 << - "members" << BSON_ARRAY(BSON("_id" << 1 << - "host" << "node1:12345" << - "arbiterOnly" << true) << - BSON("_id" << 2 << - "host" << "node2:12345" << - "arbiterOnly" << true))); - // ErrorCodes::BadValue should be propagated from ReplicaSetConfig::initialize() - ASSERT_EQUALS(ErrorCodes::InvalidReplicaSetConfig, - getReplCoord()->processReplSetReconfig(&txn, args, &result)); - ASSERT_TRUE(result.obj().isEmpty()); - } - - TEST_F(ReplCoordTest, ReconfigWithWrongReplSetName) { - // start up, become primary, receive config with incorrect replset name - OperationContextNoop txn; - assertStartSuccess( - BSON("_id" << "mySet" << - "version" << 2 << - "members" << BSON_ARRAY(BSON("_id" << 1 << "host" << "node1:12345") << - BSON("_id" << 2 << "host" << "node2:12345") )), - HostAndPort("node1", 12345)); - ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); - getReplCoord()->setMyLastOptime(OpTime(100, 0)); - simulateSuccessfulElection(); - - BSONObjBuilder result; - ReplSetReconfigArgs args; - args.force = false; - args.newConfigObj = BSON("_id" << "notMySet" << - "version" << 3 << - "members" << BSON_ARRAY(BSON("_id" << 1 << - "host" << "node1:12345") << - BSON("_id" << 2 << - "host" << "node2:12345"))); - - ASSERT_EQUALS(ErrorCodes::InvalidReplicaSetConfig, - getReplCoord()->processReplSetReconfig(&txn, args, &result)); - ASSERT_TRUE(result.obj().isEmpty()); - } - - TEST_F(ReplCoordTest, ReconfigValidateFails) { - // start up, become primary, validate fails - OperationContextNoop txn; - assertStartSuccess( - BSON("_id" << "mySet" << - "version" << 2 << - "members" << BSON_ARRAY(BSON("_id" << 1 << "host" << "node1:12345") << - BSON("_id" << 2 << "host" << "node2:12345") )), - HostAndPort("node1", 12345)); - ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); - getReplCoord()->setMyLastOptime(OpTime(100, 0)); - 
simulateSuccessfulElection(); - - BSONObjBuilder result; - ReplSetReconfigArgs args; - args.force = false; - args.newConfigObj = BSON("_id" << "mySet" << - "version" << -3 << - "members" << BSON_ARRAY(BSON("_id" << 1 << - "host" << "node1:12345") << - BSON("_id" << 2 << - "host" << "node2:12345"))); - - ASSERT_EQUALS(ErrorCodes::NewReplicaSetConfigurationIncompatible, - getReplCoord()->processReplSetReconfig(&txn, args, &result)); - ASSERT_TRUE(result.obj().isEmpty()); - } - - void doReplSetInitiate(ReplicationCoordinatorImpl* replCoord, Status* status) { - OperationContextNoop txn; - BSONObjBuilder garbage; - *status = replCoord->processReplSetInitiate( - &txn, - BSON("_id" << "mySet" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 1 << "host" << "node1:12345") << - BSON("_id" << 2 << "host" << "node2:12345"))), - &garbage); - } - - void doReplSetReconfig(ReplicationCoordinatorImpl* replCoord, Status* status) { - OperationContextNoop txn; - BSONObjBuilder garbage; - ReplSetReconfigArgs args; - args.force = false; - args.newConfigObj = BSON("_id" << "mySet" << - "version" << 3 << - "members" << BSON_ARRAY( - BSON("_id" << 1 << "host" << "node1:12345") << - BSON("_id" << 2 << "host" << "node2:12345" << - "priority" << 3))); - *status = replCoord->processReplSetReconfig(&txn, args, &garbage); - } - - TEST_F(ReplCoordTest, ReconfigQuorumCheckFails) { - // start up, become primary, fail during quorum check due to a heartbeat - // containing a higher config version - OperationContextNoop txn; - assertStartSuccess( - BSON("_id" << "mySet" << - "version" << 2 << - "members" << BSON_ARRAY(BSON("_id" << 1 << "host" << "node1:12345") << - BSON("_id" << 2 << "host" << "node2:12345") )), - HostAndPort("node1", 12345)); - ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); - getReplCoord()->setMyLastOptime(OpTime(100, 0)); - simulateSuccessfulElection(); - - Status status(ErrorCodes::InternalError, "Not Set"); - boost::thread reconfigThread(stdx::bind(doReplSetReconfig, getReplCoord(), &status)); - - NetworkInterfaceMock* net = getNet(); - getNet()->enterNetwork(); - const NetworkInterfaceMock::NetworkOperationIterator noi = net->getNextReadyRequest(); - const ReplicationExecutor::RemoteCommandRequest& request = noi->getRequest(); - repl::ReplSetHeartbeatArgs hbArgs; - ASSERT_OK(hbArgs.initialize(request.cmdObj)); - repl::ReplSetHeartbeatResponse hbResp; - hbResp.setSetName("mySet"); - hbResp.setState(MemberState::RS_SECONDARY); - hbResp.setVersion(5); - BSONObjBuilder respObj; - respObj << "ok" << 1; - hbResp.addToBSON(&respObj); - net->scheduleResponse(noi, net->now(), makeResponseStatus(respObj.obj())); - net->runReadyNetworkOperations(); - getNet()->exitNetwork(); - reconfigThread.join(); - ASSERT_EQUALS(ErrorCodes::NewReplicaSetConfigurationIncompatible, status); - } - - TEST_F(ReplCoordTest, ReconfigStoreLocalConfigDocumentFails) { - // start up, become primary, saving the config fails - OperationContextNoop txn; - assertStartSuccess( - BSON("_id" << "mySet" << - "version" << 2 << - "members" << BSON_ARRAY(BSON("_id" << 1 << "host" << "node1:12345") << - BSON("_id" << 2 << "host" << "node2:12345") )), - HostAndPort("node1", 12345)); - ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); - getReplCoord()->setMyLastOptime(OpTime(100, 0)); - simulateSuccessfulElection(); - - Status status(ErrorCodes::InternalError, "Not Set"); - getExternalState()->setStoreLocalConfigDocumentStatus(Status(ErrorCodes::OutOfDiskSpace, - "The test set this")); - 
boost::thread reconfigThread(stdx::bind(doReplSetReconfig, getReplCoord(), &status)); - - NetworkInterfaceMock* net = getNet(); - getNet()->enterNetwork(); - const NetworkInterfaceMock::NetworkOperationIterator noi = net->getNextReadyRequest(); - const ReplicationExecutor::RemoteCommandRequest& request = noi->getRequest(); - repl::ReplSetHeartbeatArgs hbArgs; - ASSERT_OK(hbArgs.initialize(request.cmdObj)); - repl::ReplSetHeartbeatResponse hbResp; - hbResp.setSetName("mySet"); - hbResp.setState(MemberState::RS_SECONDARY); - hbResp.setVersion(2); - BSONObjBuilder respObj; - respObj << "ok" << 1; - hbResp.addToBSON(&respObj); - net->scheduleResponse(noi, net->now(), makeResponseStatus(respObj.obj())); - net->runReadyNetworkOperations(); - getNet()->exitNetwork(); - reconfigThread.join(); - ASSERT_EQUALS(ErrorCodes::OutOfDiskSpace, status); - } - - TEST_F(ReplCoordTest, ReconfigWhileReconfiggingFails) { - // start up, become primary, reconfig, then before that reconfig concludes, reconfig again - OperationContextNoop txn; - assertStartSuccess( - BSON("_id" << "mySet" << - "version" << 2 << - "members" << BSON_ARRAY(BSON("_id" << 1 << "host" << "node1:12345") << - BSON("_id" << 2 << "host" << "node2:12345") )), - HostAndPort("node1", 12345)); - ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); - getReplCoord()->setMyLastOptime(OpTime(100, 0)); - simulateSuccessfulElection(); - - Status status(ErrorCodes::InternalError, "Not Set"); - // first reconfig - boost::thread reconfigThread(stdx::bind(doReplSetReconfig, getReplCoord(), &status)); - getNet()->enterNetwork(); - getNet()->blackHole(getNet()->getNextReadyRequest()); - getNet()->exitNetwork(); - - // second reconfig - BSONObjBuilder result; - ReplSetReconfigArgs args; - args.force = false; - args.newConfigObj = BSON("_id" << "mySet" << - "version" << 3 << - "members" << BSON_ARRAY(BSON("_id" << 1 << - "host" << "node1:12345") << - BSON("_id" << 2 << - "host" << "node2:12345"))); - - ASSERT_EQUALS(ErrorCodes::ConfigurationInProgress, - getReplCoord()->processReplSetReconfig(&txn, args, &result)); - ASSERT_TRUE(result.obj().isEmpty()); - - shutdown(); - reconfigThread.join(); - } - - TEST_F(ReplCoordTest, ReconfigWhileInitializingFails) { - // start up, initiate, then before that initiate concludes, reconfig - OperationContextNoop txn; - init(); - start(HostAndPort("node1", 12345)); - ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); - getReplCoord()->setMyLastOptime(OpTime(100, 0)); - - // initiate - Status status(ErrorCodes::InternalError, "Not Set"); - boost::thread initateThread(stdx::bind(doReplSetInitiate, getReplCoord(), &status)); - getNet()->enterNetwork(); - getNet()->blackHole(getNet()->getNextReadyRequest()); - getNet()->exitNetwork(); - - // reconfig - BSONObjBuilder result; - ReplSetReconfigArgs args; - args.force = false; - args.newConfigObj = BSON("_id" << "mySet" << - "version" << 3 << - "members" << BSON_ARRAY(BSON("_id" << 1 << - "host" << "node1:12345") << - BSON("_id" << 2 << - "host" << "node2:12345"))); - - ASSERT_EQUALS(ErrorCodes::ConfigurationInProgress, - getReplCoord()->processReplSetReconfig(&txn, args, &result)); - ASSERT_TRUE(result.obj().isEmpty()); - - shutdown(); - initateThread.join(); - } - - TEST_F(ReplCoordTest, ReconfigSuccessful) { - // start up, become primary, reconfig successfully - OperationContextNoop txn; - assertStartSuccess( - BSON("_id" << "mySet" << - "version" << 2 << - "members" << BSON_ARRAY(BSON("_id" << 1 << "host" << "node1:12345") << - BSON("_id" << 2 
<< "host" << "node2:12345"))), - HostAndPort("node1", 12345)); - ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); - getReplCoord()->setMyLastOptime(OpTime(100, 0)); - simulateSuccessfulElection(); - - Status status(ErrorCodes::InternalError, "Not Set"); - boost::thread reconfigThread(stdx::bind(doReplSetReconfig, getReplCoord(), &status)); - - NetworkInterfaceMock* net = getNet(); - getNet()->enterNetwork(); - const NetworkInterfaceMock::NetworkOperationIterator noi = net->getNextReadyRequest(); - const ReplicationExecutor::RemoteCommandRequest& request = noi->getRequest(); - repl::ReplSetHeartbeatArgs hbArgs; - ASSERT_OK(hbArgs.initialize(request.cmdObj)); - repl::ReplSetHeartbeatResponse hbResp; - hbResp.setSetName("mySet"); - hbResp.setState(MemberState::RS_SECONDARY); - hbResp.setVersion(2); - BSONObjBuilder respObj; - respObj << "ok" << 1; - hbResp.addToBSON(&respObj); - net->scheduleResponse(noi, net->now(), makeResponseStatus(respObj.obj())); - net->runReadyNetworkOperations(); - getNet()->exitNetwork(); - reconfigThread.join(); - ASSERT_OK(status); - } - - TEST_F(ReplCoordTest, ReconfigDuringHBReconfigFails) { - // start up, become primary, receive reconfig via heartbeat, then a second one - // from reconfig - OperationContextNoop txn; - assertStartSuccess( - BSON("_id" << "mySet" << - "version" << 2 << - "members" << BSON_ARRAY(BSON("_id" << 1 << "host" << "node1:12345") << - BSON("_id" << 2 << "host" << "node2:12345") )), - HostAndPort("node1", 12345)); - ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); - getReplCoord()->setMyLastOptime(OpTime(100,0)); - simulateSuccessfulElection(); - ASSERT_TRUE(getReplCoord()->getMemberState().primary()); - - // set hbreconfig to hang while in progress - getExternalState()->setStoreLocalConfigDocumentToHang(true); - - // hb reconfig - NetworkInterfaceMock* net = getNet(); - net->enterNetwork(); - ReplSetHeartbeatResponse hbResp2; - ReplicaSetConfig config; - config.initialize(BSON("_id" << "mySet" << - "version" << 3 << - "members" << BSON_ARRAY(BSON("_id" << 1 << - "host" << "node1:12345") << - BSON("_id" << 2 << - "host" << "node2:12345")))); - hbResp2.setConfig(config); - hbResp2.setVersion(3); - hbResp2.setSetName("mySet"); - hbResp2.setState(MemberState::RS_SECONDARY); - BSONObjBuilder respObj2; - respObj2 << "ok" << 1; - hbResp2.addToBSON(&respObj2); - net->runUntil(net->now() + 10*1000); // run until we've sent a heartbeat request - const NetworkInterfaceMock::NetworkOperationIterator noi2 = net->getNextReadyRequest(); - net->scheduleResponse(noi2, net->now(), makeResponseStatus(respObj2.obj())); - net->runReadyNetworkOperations(); - getNet()->exitNetwork(); - - // reconfig - BSONObjBuilder result; - ReplSetReconfigArgs args; - args.force = false; - args.newConfigObj = config.toBSON(); - ASSERT_EQUALS(ErrorCodes::ConfigurationInProgress, - getReplCoord()->processReplSetReconfig(&txn, args, &result)); - - getExternalState()->setStoreLocalConfigDocumentToHang(false); - } +typedef ReplicationCoordinator::ReplSetReconfigArgs ReplSetReconfigArgs; +typedef ReplicationExecutor::RemoteCommandRequest RemoteCommandRequest; + +TEST_F(ReplCoordTest, ReconfigBeforeInitialized) { + // start up but do not initiate + OperationContextNoop txn; + init(); + start(); + BSONObjBuilder result; + ReplSetReconfigArgs args; + + ASSERT_EQUALS(ErrorCodes::NotYetInitialized, + getReplCoord()->processReplSetReconfig(&txn, args, &result)); + ASSERT_TRUE(result.obj().isEmpty()); +} + +TEST_F(ReplCoordTest, 
ReconfigWhileNotPrimary) { + // start up, become secondary, receive reconfig + OperationContextNoop txn; + init(); + assertStartSuccess(BSON("_id" + << "mySet" + << "version" << 2 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "node1:12345") + << BSON("_id" << 2 << "host" + << "node2:12345"))), + HostAndPort("node1", 12345)); + + BSONObjBuilder result; + ReplSetReconfigArgs args; + args.force = false; + ASSERT_EQUALS(ErrorCodes::NotMaster, + getReplCoord()->processReplSetReconfig(&txn, args, &result)); + ASSERT_TRUE(result.obj().isEmpty()); +} + +TEST_F(ReplCoordTest, ReconfigWithUninitializableConfig) { + // start up, become primary, receive uninitializable config + OperationContextNoop txn; + assertStartSuccess(BSON("_id" + << "mySet" + << "version" << 2 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "node1:12345") + << BSON("_id" << 2 << "host" + << "node2:12345"))), + HostAndPort("node1", 12345)); + ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); + getReplCoord()->setMyLastOptime(OpTime(100, 0)); + simulateSuccessfulElection(); + + BSONObjBuilder result; + ReplSetReconfigArgs args; + args.force = false; + args.newConfigObj = BSON("_id" + << "mySet" + << "version" << 2 << "invalidlyNamedField" << 3 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "node1:12345" + << "arbiterOnly" << true) + << BSON("_id" << 2 << "host" + << "node2:12345" + << "arbiterOnly" << true))); + // ErrorCodes::BadValue should be propagated from ReplicaSetConfig::initialize() + ASSERT_EQUALS(ErrorCodes::InvalidReplicaSetConfig, + getReplCoord()->processReplSetReconfig(&txn, args, &result)); + ASSERT_TRUE(result.obj().isEmpty()); +} + +TEST_F(ReplCoordTest, ReconfigWithWrongReplSetName) { + // start up, become primary, receive config with incorrect replset name + OperationContextNoop txn; + assertStartSuccess(BSON("_id" + << "mySet" + << "version" << 2 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "node1:12345") + << BSON("_id" << 2 << "host" + << "node2:12345"))), + HostAndPort("node1", 12345)); + ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); + getReplCoord()->setMyLastOptime(OpTime(100, 0)); + simulateSuccessfulElection(); + + BSONObjBuilder result; + ReplSetReconfigArgs args; + args.force = false; + args.newConfigObj = BSON("_id" + << "notMySet" + << "version" << 3 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "node1:12345") + << BSON("_id" << 2 << "host" + << "node2:12345"))); + + ASSERT_EQUALS(ErrorCodes::InvalidReplicaSetConfig, + getReplCoord()->processReplSetReconfig(&txn, args, &result)); + ASSERT_TRUE(result.obj().isEmpty()); +} + +TEST_F(ReplCoordTest, ReconfigValidateFails) { + // start up, become primary, validate fails + OperationContextNoop txn; + assertStartSuccess(BSON("_id" + << "mySet" + << "version" << 2 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "node1:12345") + << BSON("_id" << 2 << "host" + << "node2:12345"))), + HostAndPort("node1", 12345)); + ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); + getReplCoord()->setMyLastOptime(OpTime(100, 0)); + simulateSuccessfulElection(); + + BSONObjBuilder result; + ReplSetReconfigArgs args; + args.force = false; + args.newConfigObj = BSON("_id" + << "mySet" + << "version" << -3 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "node1:12345") + << BSON("_id" << 2 << "host" + << "node2:12345"))); + + ASSERT_EQUALS(ErrorCodes::NewReplicaSetConfigurationIncompatible, + 
getReplCoord()->processReplSetReconfig(&txn, args, &result)); + ASSERT_TRUE(result.obj().isEmpty()); +} + +void doReplSetInitiate(ReplicationCoordinatorImpl* replCoord, Status* status) { + OperationContextNoop txn; + BSONObjBuilder garbage; + *status = + replCoord->processReplSetInitiate(&txn, + BSON("_id" + << "mySet" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "node1:12345") + << BSON("_id" << 2 << "host" + << "node2:12345"))), + &garbage); +} + +void doReplSetReconfig(ReplicationCoordinatorImpl* replCoord, Status* status) { + OperationContextNoop txn; + BSONObjBuilder garbage; + ReplSetReconfigArgs args; + args.force = false; + args.newConfigObj = BSON("_id" + << "mySet" + << "version" << 3 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "node1:12345") + << BSON("_id" << 2 << "host" + << "node2:12345" + << "priority" << 3))); + *status = replCoord->processReplSetReconfig(&txn, args, &garbage); +} + +TEST_F(ReplCoordTest, ReconfigQuorumCheckFails) { + // start up, become primary, fail during quorum check due to a heartbeat + // containing a higher config version + OperationContextNoop txn; + assertStartSuccess(BSON("_id" + << "mySet" + << "version" << 2 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "node1:12345") + << BSON("_id" << 2 << "host" + << "node2:12345"))), + HostAndPort("node1", 12345)); + ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); + getReplCoord()->setMyLastOptime(OpTime(100, 0)); + simulateSuccessfulElection(); + + Status status(ErrorCodes::InternalError, "Not Set"); + boost::thread reconfigThread(stdx::bind(doReplSetReconfig, getReplCoord(), &status)); + + NetworkInterfaceMock* net = getNet(); + getNet()->enterNetwork(); + const NetworkInterfaceMock::NetworkOperationIterator noi = net->getNextReadyRequest(); + const ReplicationExecutor::RemoteCommandRequest& request = noi->getRequest(); + repl::ReplSetHeartbeatArgs hbArgs; + ASSERT_OK(hbArgs.initialize(request.cmdObj)); + repl::ReplSetHeartbeatResponse hbResp; + hbResp.setSetName("mySet"); + hbResp.setState(MemberState::RS_SECONDARY); + hbResp.setVersion(5); + BSONObjBuilder respObj; + respObj << "ok" << 1; + hbResp.addToBSON(&respObj); + net->scheduleResponse(noi, net->now(), makeResponseStatus(respObj.obj())); + net->runReadyNetworkOperations(); + getNet()->exitNetwork(); + reconfigThread.join(); + ASSERT_EQUALS(ErrorCodes::NewReplicaSetConfigurationIncompatible, status); +} + +TEST_F(ReplCoordTest, ReconfigStoreLocalConfigDocumentFails) { + // start up, become primary, saving the config fails + OperationContextNoop txn; + assertStartSuccess(BSON("_id" + << "mySet" + << "version" << 2 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "node1:12345") + << BSON("_id" << 2 << "host" + << "node2:12345"))), + HostAndPort("node1", 12345)); + ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); + getReplCoord()->setMyLastOptime(OpTime(100, 0)); + simulateSuccessfulElection(); + + Status status(ErrorCodes::InternalError, "Not Set"); + getExternalState()->setStoreLocalConfigDocumentStatus( + Status(ErrorCodes::OutOfDiskSpace, "The test set this")); + boost::thread reconfigThread(stdx::bind(doReplSetReconfig, getReplCoord(), &status)); + + NetworkInterfaceMock* net = getNet(); + getNet()->enterNetwork(); + const NetworkInterfaceMock::NetworkOperationIterator noi = net->getNextReadyRequest(); + const ReplicationExecutor::RemoteCommandRequest& request = noi->getRequest(); + repl::ReplSetHeartbeatArgs hbArgs; + 
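+    // Answer the quorum-check heartbeat so the reconfig proceeds to the point
+    // where persisting the config fails with the injected OutOfDiskSpace status.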
+    ASSERT_OK(hbArgs.initialize(request.cmdObj));
+    repl::ReplSetHeartbeatResponse hbResp;
+    hbResp.setSetName("mySet");
+    hbResp.setState(MemberState::RS_SECONDARY);
+    hbResp.setVersion(2);
+    BSONObjBuilder respObj;
+    respObj << "ok" << 1;
+    hbResp.addToBSON(&respObj);
+    net->scheduleResponse(noi, net->now(), makeResponseStatus(respObj.obj()));
+    net->runReadyNetworkOperations();
+    getNet()->exitNetwork();
+    reconfigThread.join();
+    ASSERT_EQUALS(ErrorCodes::OutOfDiskSpace, status);
+}
+
+TEST_F(ReplCoordTest, ReconfigWhileReconfiggingFails) {
+    // start up, become primary, reconfig, then before that reconfig concludes, reconfig again
+    OperationContextNoop txn;
+    assertStartSuccess(BSON("_id"
+                            << "mySet"
+                            << "version" << 2 << "members"
+                            << BSON_ARRAY(BSON("_id" << 1 << "host"
+                                               << "node1:12345")
+                                          << BSON("_id" << 2 << "host"
+                                                        << "node2:12345"))),
+                       HostAndPort("node1", 12345));
+    ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY));
+    getReplCoord()->setMyLastOptime(OpTime(100, 0));
+    simulateSuccessfulElection();
+
+    Status status(ErrorCodes::InternalError, "Not Set");
+    // first reconfig
+    boost::thread reconfigThread(stdx::bind(doReplSetReconfig, getReplCoord(), &status));
+    getNet()->enterNetwork();
+    getNet()->blackHole(getNet()->getNextReadyRequest());
+    getNet()->exitNetwork();
+
+    // second reconfig
+    BSONObjBuilder result;
+    ReplSetReconfigArgs args;
+    args.force = false;
+    args.newConfigObj = BSON("_id"
+                             << "mySet"
+                             << "version" << 3 << "members"
+                             << BSON_ARRAY(BSON("_id" << 1 << "host"
+                                                << "node1:12345")
+                                           << BSON("_id" << 2 << "host"
+                                                         << "node2:12345")));
+
+    ASSERT_EQUALS(ErrorCodes::ConfigurationInProgress,
+                  getReplCoord()->processReplSetReconfig(&txn, args, &result));
+    ASSERT_TRUE(result.obj().isEmpty());
+
+    shutdown();
+    reconfigThread.join();
+}
+
+TEST_F(ReplCoordTest, ReconfigWhileInitializingFails) {
+    // start up, initiate, then before that initiate concludes, reconfig
+    OperationContextNoop txn;
+    init();
+    start(HostAndPort("node1", 12345));
+    ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY));
+    getReplCoord()->setMyLastOptime(OpTime(100, 0));
+
+    // initiate
+    Status status(ErrorCodes::InternalError, "Not Set");
+    boost::thread initiateThread(stdx::bind(doReplSetInitiate, getReplCoord(), &status));
+    getNet()->enterNetwork();
+    getNet()->blackHole(getNet()->getNextReadyRequest());
+    getNet()->exitNetwork();
+
+    // reconfig
+    BSONObjBuilder result;
+    ReplSetReconfigArgs args;
+    args.force = false;
+    args.newConfigObj = BSON("_id"
+                             << "mySet"
+                             << "version" << 3 << "members"
+                             << BSON_ARRAY(BSON("_id" << 1 << "host"
+                                                << "node1:12345")
+                                           << BSON("_id" << 2 << "host"
+                                                         << "node2:12345")));
+
+    ASSERT_EQUALS(ErrorCodes::ConfigurationInProgress,
+                  getReplCoord()->processReplSetReconfig(&txn, args, &result));
+    ASSERT_TRUE(result.obj().isEmpty());
+
+    shutdown();
+    initiateThread.join();
+}
+
+TEST_F(ReplCoordTest, ReconfigSuccessful) {
+    // start up, become primary, reconfig successfully
+    OperationContextNoop txn;
+    assertStartSuccess(BSON("_id"
+                            << "mySet"
+                            << "version" << 2 << "members"
+                            << BSON_ARRAY(BSON("_id" << 1 << "host"
+                                               << "node1:12345")
+                                          << BSON("_id" << 2 << "host"
+                                                        << "node2:12345"))),
+                       HostAndPort("node1", 12345));
+    ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY));
+    getReplCoord()->setMyLastOptime(OpTime(100, 0));
+    simulateSuccessfulElection();
+
+    Status status(ErrorCodes::InternalError, "Not Set");
+    boost::thread reconfigThread(stdx::bind(doReplSetReconfig, getReplCoord(), &status));
+
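+    // The reconfig runs on its own thread; this thread now plays the role of
+    // node2 and acknowledges the quorum-check heartbeat so the reconfig succeeds.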
+ NetworkInterfaceMock* net = getNet(); + getNet()->enterNetwork(); + const NetworkInterfaceMock::NetworkOperationIterator noi = net->getNextReadyRequest(); + const ReplicationExecutor::RemoteCommandRequest& request = noi->getRequest(); + repl::ReplSetHeartbeatArgs hbArgs; + ASSERT_OK(hbArgs.initialize(request.cmdObj)); + repl::ReplSetHeartbeatResponse hbResp; + hbResp.setSetName("mySet"); + hbResp.setState(MemberState::RS_SECONDARY); + hbResp.setVersion(2); + BSONObjBuilder respObj; + respObj << "ok" << 1; + hbResp.addToBSON(&respObj); + net->scheduleResponse(noi, net->now(), makeResponseStatus(respObj.obj())); + net->runReadyNetworkOperations(); + getNet()->exitNetwork(); + reconfigThread.join(); + ASSERT_OK(status); +} + +TEST_F(ReplCoordTest, ReconfigDuringHBReconfigFails) { + // start up, become primary, receive reconfig via heartbeat, then a second one + // from reconfig + OperationContextNoop txn; + assertStartSuccess(BSON("_id" + << "mySet" + << "version" << 2 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "node1:12345") + << BSON("_id" << 2 << "host" + << "node2:12345"))), + HostAndPort("node1", 12345)); + ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); + getReplCoord()->setMyLastOptime(OpTime(100, 0)); + simulateSuccessfulElection(); + ASSERT_TRUE(getReplCoord()->getMemberState().primary()); + + // set hbreconfig to hang while in progress + getExternalState()->setStoreLocalConfigDocumentToHang(true); + + // hb reconfig + NetworkInterfaceMock* net = getNet(); + net->enterNetwork(); + ReplSetHeartbeatResponse hbResp2; + ReplicaSetConfig config; + config.initialize(BSON("_id" + << "mySet" + << "version" << 3 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "node1:12345") + << BSON("_id" << 2 << "host" + << "node2:12345")))); + hbResp2.setConfig(config); + hbResp2.setVersion(3); + hbResp2.setSetName("mySet"); + hbResp2.setState(MemberState::RS_SECONDARY); + BSONObjBuilder respObj2; + respObj2 << "ok" << 1; + hbResp2.addToBSON(&respObj2); + net->runUntil(net->now() + 10 * 1000); // run until we've sent a heartbeat request + const NetworkInterfaceMock::NetworkOperationIterator noi2 = net->getNextReadyRequest(); + net->scheduleResponse(noi2, net->now(), makeResponseStatus(respObj2.obj())); + net->runReadyNetworkOperations(); + getNet()->exitNetwork(); + + // reconfig + BSONObjBuilder result; + ReplSetReconfigArgs args; + args.force = false; + args.newConfigObj = config.toBSON(); + ASSERT_EQUALS(ErrorCodes::ConfigurationInProgress, + getReplCoord()->processReplSetReconfig(&txn, args, &result)); + + getExternalState()->setStoreLocalConfigDocumentToHang(false); +} // TEST_F(ReplCoordTest, HBReconfigDuringReconfigFails) { // // start up, become primary, reconfig, while reconfigging receive reconfig via heartbeat @@ -446,7 +473,7 @@ namespace { // getReplCoord()->setMyLastOptime(OpTime(100,0)); // simulateSuccessfulElection(); // ASSERT_TRUE(getReplCoord()->getCurrentMemberState().primary()); -// +// // // schedule hb reconfig // NetworkInterfaceMock* net = getNet(); // net->enterNetwork(); @@ -468,7 +495,7 @@ namespace { // respObj2 << "ok" << 1; // hbResp.addToBSON(&respObj2); // net->scheduleResponse(noi, net->now(), makeResponseStatus(respObj2.obj())); -// +// // // start reconfig thread // Status status2(ErrorCodes::InternalError, "Not Set"); // boost::thread reconfigThread(stdx::bind(doReplSetReconfig, getReplCoord(), &status2)); @@ -499,41 +526,44 @@ namespace { // 
logger::globalLogDomain()->setMinimumLoggedSeverity(logger::LogSeverity::Log()); // } - TEST_F(ReplCoordTest, ForceReconfigWhileNotPrimarySuccessful) { - // start up, become a secondary, receive a forced reconfig - OperationContextNoop txn; - init(); - assertStartSuccess( - BSON("_id" << "mySet" << - "version" << 2 << - "members" << BSON_ARRAY(BSON("_id" << 1 << "host" << "node1:12345") << - BSON("_id" << 2 << "host" << "node2:12345") )), - HostAndPort("node1", 12345)); - ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); - getReplCoord()->setMyLastOptime(OpTime(100, 0)); - - // fail before forced - BSONObjBuilder result; - ReplSetReconfigArgs args; - args.force = false; - args.newConfigObj = BSON("_id" << "mySet" << - "version" << 3 << - "members" << BSON_ARRAY(BSON("_id" << 1 << - "host" << "node1:12345") << - BSON("_id" << 2 << - "host" << "node2:12345"))); - ASSERT_EQUALS(ErrorCodes::NotMaster, - getReplCoord()->processReplSetReconfig(&txn, args, &result)); - - // forced should succeed - args.force = true; - ASSERT_OK(getReplCoord()->processReplSetReconfig(&txn, args, &result)); - getReplCoord()->processReplSetGetConfig(&result); - - // ensure forced reconfig results in a random larger version - ASSERT_GREATER_THAN(result.obj()["config"].Obj()["version"].numberInt(), 3); - } - -} // anonymous namespace -} // namespace repl -} // namespace mongo +TEST_F(ReplCoordTest, ForceReconfigWhileNotPrimarySuccessful) { + // start up, become a secondary, receive a forced reconfig + OperationContextNoop txn; + init(); + assertStartSuccess(BSON("_id" + << "mySet" + << "version" << 2 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "node1:12345") + << BSON("_id" << 2 << "host" + << "node2:12345"))), + HostAndPort("node1", 12345)); + ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); + getReplCoord()->setMyLastOptime(OpTime(100, 0)); + + // fail before forced + BSONObjBuilder result; + ReplSetReconfigArgs args; + args.force = false; + args.newConfigObj = BSON("_id" + << "mySet" + << "version" << 3 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "node1:12345") + << BSON("_id" << 2 << "host" + << "node2:12345"))); + ASSERT_EQUALS(ErrorCodes::NotMaster, + getReplCoord()->processReplSetReconfig(&txn, args, &result)); + + // forced should succeed + args.force = true; + ASSERT_OK(getReplCoord()->processReplSetReconfig(&txn, args, &result)); + getReplCoord()->processReplSetGetConfig(&result); + + // ensure forced reconfig results in a random larger version + ASSERT_GREATER_THAN(result.obj()["config"].Obj()["version"].numberInt(), 3); +} + +} // anonymous namespace +} // namespace repl +} // namespace mongo diff --git a/src/mongo/db/repl/replication_coordinator_impl_test.cpp b/src/mongo/db/repl/replication_coordinator_impl_test.cpp index 205a2a9ff2a..cc256ca6fe3 100644 --- a/src/mongo/db/repl/replication_coordinator_impl_test.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl_test.cpp @@ -45,7 +45,7 @@ #include "mongo/db/repl/repl_set_heartbeat_args.h" #include "mongo/db/repl/repl_settings.h" #include "mongo/db/repl/replica_set_config.h" -#include "mongo/db/repl/replication_coordinator.h" // ReplSetReconfigArgs +#include "mongo/db/repl/replication_coordinator.h" // ReplSetReconfigArgs #include "mongo/db/repl/replication_coordinator_external_state_mock.h" #include "mongo/db/repl/replication_coordinator_impl.h" #include "mongo/db/repl/replication_coordinator_test_fixture.h" @@ -62,2148 +62,2238 @@ namespace mongo { namespace repl { namespace { - 
typedef ReplicationCoordinator::ReplSetReconfigArgs ReplSetReconfigArgs; +typedef ReplicationCoordinator::ReplSetReconfigArgs ReplSetReconfigArgs; + +TEST_F(ReplCoordTest, StartupWithValidLocalConfig) { + assertStartSuccess(BSON("_id" + << "mySet" + << "version" << 2 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "node1:12345"))), + HostAndPort("node1", 12345)); +} + +TEST_F(ReplCoordTest, StartupWithConfigMissingSelf) { + startCapturingLogMessages(); + assertStartSuccess(BSON("_id" + << "mySet" + << "version" << 2 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "node1:12345") + << BSON("_id" << 2 << "host" + << "node2:54321"))), + HostAndPort("node3", 12345)); + stopCapturingLogMessages(); + ASSERT_EQUALS(1, countLogLinesContaining("NodeNotFound")); +} + +TEST_F(ReplCoordTest, StartupWithLocalConfigSetNameMismatch) { + init("mySet"); + startCapturingLogMessages(); + assertStartSuccess(BSON("_id" + << "notMySet" + << "version" << 2 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "node1:12345"))), + HostAndPort("node1", 12345)); + stopCapturingLogMessages(); + ASSERT_EQUALS(1, countLogLinesContaining("reports set name of notMySet,")); +} + +TEST_F(ReplCoordTest, StartupWithNoLocalConfig) { + startCapturingLogMessages(); + start(); + stopCapturingLogMessages(); + ASSERT_EQUALS(1, countLogLinesContaining("Did not find local ")); + ASSERT_EQUALS(MemberState::RS_STARTUP, getReplCoord()->getMemberState().s); +} + +TEST_F(ReplCoordTest, InitiateFailsWithEmptyConfig) { + OperationContextNoop txn; + init("mySet"); + start(HostAndPort("node1", 12345)); + BSONObjBuilder result; + ASSERT_EQUALS(ErrorCodes::InvalidReplicaSetConfig, + getReplCoord()->processReplSetInitiate(&txn, BSONObj(), &result)); + ASSERT_EQUALS(MemberState::RS_STARTUP, getReplCoord()->getMemberState().s); +} + +TEST_F(ReplCoordTest, InitiateSucceedsWithOneNodeConfig) { + OperationContextNoop txn; + init("mySet"); + start(HostAndPort("node1", 12345)); + ASSERT_EQUALS(MemberState::RS_STARTUP, getReplCoord()->getMemberState().s); + + // Starting uninitialized, show that we can perform the initiate behavior. + BSONObjBuilder result1; + ASSERT_OK( + getReplCoord()->processReplSetInitiate(&txn, + BSON("_id" + << "mySet" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "node1:12345"))), + &result1)); + ASSERT_EQUALS(ReplicationCoordinator::modeReplSet, getReplCoord()->getReplicationMode()); + + // Show that initiate fails after it has already succeeded. + BSONObjBuilder result2; + ASSERT_EQUALS( + ErrorCodes::AlreadyInitialized, + getReplCoord()->processReplSetInitiate(&txn, + BSON("_id" + << "mySet" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "node1:12345"))), + &result2)); + + // Still in repl set mode, even after failed reinitiate. + ASSERT_EQUALS(ReplicationCoordinator::modeReplSet, getReplCoord()->getReplicationMode()); +} + +TEST_F(ReplCoordTest, InitiateSucceedsAfterFailing) { + OperationContextNoop txn; + init("mySet"); + start(HostAndPort("node1", 12345)); + BSONObjBuilder result; + ASSERT_EQUALS(ErrorCodes::InvalidReplicaSetConfig, + getReplCoord()->processReplSetInitiate(&txn, BSONObj(), &result)); + ASSERT_EQUALS(MemberState::RS_STARTUP, getReplCoord()->getMemberState().s); + + // Having failed to initiate once, show that we can now initiate. 
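+    // This time a valid single-node config is supplied in place of the empty
+    // BSONObj that produced InvalidReplicaSetConfig above.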
+ BSONObjBuilder result1; + ASSERT_OK( + getReplCoord()->processReplSetInitiate(&txn, + BSON("_id" + << "mySet" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "node1:12345"))), + &result1)); + ASSERT_EQUALS(ReplicationCoordinator::modeReplSet, getReplCoord()->getReplicationMode()); +} + +TEST_F(ReplCoordTest, InitiateFailsIfAlreadyInitialized) { + OperationContextNoop txn; + assertStartSuccess(BSON("_id" + << "mySet" + << "version" << 2 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "node1:12345"))), + HostAndPort("node1", 12345)); + BSONObjBuilder result; + ASSERT_EQUALS( + ErrorCodes::AlreadyInitialized, + getReplCoord()->processReplSetInitiate(&txn, + BSON("_id" + << "mySet" + << "version" << 2 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "node1:12345"))), + &result)); +} + +TEST_F(ReplCoordTest, InitiateFailsIfSelfMissing) { + OperationContextNoop txn; + BSONObjBuilder result; + init("mySet"); + start(HostAndPort("node1", 12345)); + ASSERT_EQUALS( + ErrorCodes::InvalidReplicaSetConfig, + getReplCoord()->processReplSetInitiate(&txn, + BSON("_id" + << "mySet" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "node4"))), + &result)); +} + +void doReplSetInitiate(ReplicationCoordinatorImpl* replCoord, Status* status) { + OperationContextNoop txn; + BSONObjBuilder garbage; + *status = + replCoord->processReplSetInitiate(&txn, + BSON("_id" + << "mySet" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "node1:12345") + << BSON("_id" << 1 << "host" + << "node2:54321"))), + &garbage); +} + +TEST_F(ReplCoordTest, InitiateFailsIfQuorumNotMet) { + init("mySet"); + start(HostAndPort("node1", 12345)); + ASSERT_EQUALS(MemberState::RS_STARTUP, getReplCoord()->getMemberState().s); + + ReplSetHeartbeatArgs hbArgs; + hbArgs.setSetName("mySet"); + hbArgs.setProtocolVersion(1); + hbArgs.setConfigVersion(1); + hbArgs.setCheckEmpty(true); + hbArgs.setSenderHost(HostAndPort("node1", 12345)); + hbArgs.setSenderId(0); + + Status status(ErrorCodes::InternalError, "Not set"); + boost::thread prsiThread(stdx::bind(doReplSetInitiate, getReplCoord(), &status)); + const Date_t startDate = getNet()->now(); + getNet()->enterNetwork(); + const NetworkInterfaceMock::NetworkOperationIterator noi = getNet()->getNextReadyRequest(); + ASSERT_EQUALS(HostAndPort("node2", 54321), noi->getRequest().target); + ASSERT_EQUALS("admin", noi->getRequest().dbname); + ASSERT_EQUALS(hbArgs.toBSON(), noi->getRequest().cmdObj); + getNet()->scheduleResponse( + noi, startDate + 10, ResponseStatus(ErrorCodes::NoSuchKey, "No response")); + getNet()->runUntil(startDate + 10); + getNet()->exitNetwork(); + ASSERT_EQUALS(startDate + 10, getNet()->now()); + prsiThread.join(); + ASSERT_EQUALS(ErrorCodes::NodeNotFound, status); + ASSERT_EQUALS(MemberState::RS_STARTUP, getReplCoord()->getMemberState().s); +} + +TEST_F(ReplCoordTest, InitiatePassesIfQuorumMet) { + init("mySet"); + start(HostAndPort("node1", 12345)); + ASSERT_EQUALS(MemberState::RS_STARTUP, getReplCoord()->getMemberState().s); + + ReplSetHeartbeatArgs hbArgs; + hbArgs.setSetName("mySet"); + hbArgs.setProtocolVersion(1); + hbArgs.setConfigVersion(1); + hbArgs.setCheckEmpty(true); + hbArgs.setSenderHost(HostAndPort("node1", 12345)); + hbArgs.setSenderId(0); + + Status status(ErrorCodes::InternalError, "Not set"); + boost::thread prsiThread(stdx::bind(doReplSetInitiate, getReplCoord(), &status)); + const Date_t startDate = getNet()->now(); + getNet()->enterNetwork(); + 
const NetworkInterfaceMock::NetworkOperationIterator noi = getNet()->getNextReadyRequest();
+    ASSERT_EQUALS(HostAndPort("node2", 54321), noi->getRequest().target);
+    ASSERT_EQUALS("admin", noi->getRequest().dbname);
+    ASSERT_EQUALS(hbArgs.toBSON(), noi->getRequest().cmdObj);
+    ReplSetHeartbeatResponse hbResp;
+    hbResp.setVersion(0);
+    getNet()->scheduleResponse(noi,
+                               startDate + 10,
+                               ResponseStatus(ReplicationExecutor::RemoteCommandResponse(
+                                   hbResp.toBSON(), Milliseconds(8))));
+    getNet()->runUntil(startDate + 10);
+    getNet()->exitNetwork();
+    ASSERT_EQUALS(startDate + 10, getNet()->now());
+    prsiThread.join();
+    ASSERT_OK(status);
+    ASSERT_EQUALS(ReplicationCoordinator::modeReplSet, getReplCoord()->getReplicationMode());
+}
+
+TEST_F(ReplCoordTest, InitiateFailsWithSetNameMismatch) {
+    OperationContextNoop txn;
+    init("mySet");
+    start(HostAndPort("node1", 12345));
+    ASSERT_EQUALS(MemberState::RS_STARTUP, getReplCoord()->getMemberState().s);
+
+    BSONObjBuilder result1;
+    ASSERT_EQUALS(
+        ErrorCodes::InvalidReplicaSetConfig,
+        getReplCoord()->processReplSetInitiate(&txn,
+                                               BSON("_id"
+                                                    << "wrongSet"
+                                                    << "version" << 1 << "members"
+                                                    << BSON_ARRAY(BSON("_id" << 0 << "host"
+                                                                             << "node1:12345"))),
+                                               &result1));
+    ASSERT_EQUALS(MemberState::RS_STARTUP, getReplCoord()->getMemberState().s);
+}
+
+TEST_F(ReplCoordTest, InitiateFailsWithoutReplSetFlag) {
+    OperationContextNoop txn;
+    init("");
+    start(HostAndPort("node1", 12345));
+    ASSERT_EQUALS(MemberState::RS_STARTUP, getReplCoord()->getMemberState().s);
+
+    BSONObjBuilder result1;
+    ASSERT_EQUALS(
+        ErrorCodes::NoReplicationEnabled,
+        getReplCoord()->processReplSetInitiate(&txn,
+                                               BSON("_id"
+                                                    << "mySet"
+                                                    << "version" << 1 << "members"
+                                                    << BSON_ARRAY(BSON("_id" << 0 << "host"
+                                                                             << "node1:12345"))),
+                                               &result1));
+    ASSERT_EQUALS(MemberState::RS_STARTUP, getReplCoord()->getMemberState().s);
+}
+
+TEST_F(ReplCoordTest, InitiateFailsWhileStoringLocalConfigDocument) {
+    OperationContextNoop txn;
+    init("mySet");
+    start(HostAndPort("node1", 12345));
+    ASSERT_EQUALS(MemberState::RS_STARTUP, getReplCoord()->getMemberState().s);
+
+    BSONObjBuilder result1;
+    getExternalState()->setStoreLocalConfigDocumentStatus(
+        Status(ErrorCodes::OutOfDiskSpace, "The test set this"));
+    ASSERT_EQUALS(
+        ErrorCodes::OutOfDiskSpace,
+        getReplCoord()->processReplSetInitiate(&txn,
+                                               BSON("_id"
+                                                    << "mySet"
+                                                    << "version" << 1 << "members"
+                                                    << BSON_ARRAY(BSON("_id" << 0 << "host"
+                                                                             << "node1:12345"))),
+                                               &result1));
+    ASSERT_EQUALS(MemberState::RS_STARTUP, getReplCoord()->getMemberState().s);
+}
+
+TEST_F(ReplCoordTest, CheckReplEnabledForCommandNotRepl) {
+    // pass in settings to avoid having a replSet
+    ReplSettings settings;
+    init(settings);
+    start();
+
+    // check status NoReplicationEnabled and empty result
+    BSONObjBuilder result;
+    Status status = getReplCoord()->checkReplEnabledForCommand(&result);
+    ASSERT_EQUALS(status, ErrorCodes::NoReplicationEnabled);
+    ASSERT_TRUE(result.obj().isEmpty());
+}
+
+TEST_F(ReplCoordTest, checkReplEnabledForCommandConfigSvr) {
+    ReplSettings settings;
+    serverGlobalParams.configsvr = true;
+    init(settings);
+    start();
+
+    // check status NoReplicationEnabled and result mentions configsvr
+    BSONObjBuilder result;
+    Status status = getReplCoord()->checkReplEnabledForCommand(&result);
+    ASSERT_EQUALS(status, ErrorCodes::NoReplicationEnabled);
+    ASSERT_EQUALS(result.obj()["info"].String(), "configsvr");
+    serverGlobalParams.configsvr = false;
+}
+
+TEST_F(ReplCoordTest, checkReplEnabledForCommandNoConfig) {
+    start();
+
+
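+    // No config has been initiated here, so the coordinator reports
+    // NotYetInitialized and its "info" message directs the user to rs.initiate.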
// check status NotYetInitialized and result mentions rs.initiate + BSONObjBuilder result; + Status status = getReplCoord()->checkReplEnabledForCommand(&result); + ASSERT_EQUALS(status, ErrorCodes::NotYetInitialized); + ASSERT_TRUE(result.obj()["info"].String().find("rs.initiate") != std::string::npos); +} + +TEST_F(ReplCoordTest, checkReplEnabledForCommandWorking) { + assertStartSuccess(BSON("_id" + << "mySet" + << "version" << 2 << "members" << BSON_ARRAY(BSON("host" + << "node1:12345" + << "_id" << 0))), + HostAndPort("node1", 12345)); + + // check status OK and result is empty + BSONObjBuilder result; + Status status = getReplCoord()->checkReplEnabledForCommand(&result); + ASSERT_EQUALS(status, Status::OK()); + ASSERT_TRUE(result.obj().isEmpty()); +} + +TEST_F(ReplCoordTest, BasicRBIDUsage) { + start(); + BSONObjBuilder result; + getReplCoord()->processReplSetGetRBID(&result); + long long initialValue = result.obj()["rbid"].Int(); + getReplCoord()->incrementRollbackID(); + + BSONObjBuilder result2; + getReplCoord()->processReplSetGetRBID(&result2); + long long incrementedValue = result2.obj()["rbid"].Int(); + ASSERT_EQUALS(incrementedValue, initialValue + 1); +} + +TEST_F(ReplCoordTest, AwaitReplicationNoReplEnabled) { + init(""); + OperationContextNoop txn; + OpTime time(100, 1); + + WriteConcernOptions writeConcern; + writeConcern.wTimeout = WriteConcernOptions::kNoWaiting; + writeConcern.wNumNodes = 2; + + // Because we didn't set ReplSettings.replSet, it will think we're a standalone so + // awaitReplication will always work. + ReplicationCoordinator::StatusAndDuration statusAndDur = + getReplCoord()->awaitReplication(&txn, time, writeConcern); + ASSERT_OK(statusAndDur.status); +} + +TEST_F(ReplCoordTest, AwaitReplicationMasterSlaveMajorityBaseCase) { + ReplSettings settings; + settings.master = true; + init(settings); + OperationContextNoop txn; + OpTime time(100, 1); + + WriteConcernOptions writeConcern; + writeConcern.wTimeout = WriteConcernOptions::kNoWaiting; + writeConcern.wNumNodes = 2; + + + writeConcern.wNumNodes = 0; + writeConcern.wMode = "majority"; + // w:majority always works on master/slave + ReplicationCoordinator::StatusAndDuration statusAndDur = + getReplCoord()->awaitReplication(&txn, time, writeConcern); + ASSERT_OK(statusAndDur.status); +} + +TEST_F(ReplCoordTest, AwaitReplicationReplSetBaseCases) { + assertStartSuccess(BSON("_id" + << "mySet" + << "version" << 2 << "members" + << BSON_ARRAY(BSON("host" + << "node1:12345" + << "_id" << 0) + << BSON("host" + << "node2:12345" + << "_id" << 1) << BSON("host" + << "node3:12345" + << "_id" << 2))), + HostAndPort("node1", 12345)); + + OperationContextNoop txn; + OpTime time(100, 1); + + WriteConcernOptions writeConcern; + writeConcern.wTimeout = WriteConcernOptions::kNoWaiting; + writeConcern.wNumNodes = 0; // Waiting for 0 nodes always works + writeConcern.wMode = ""; + + // Should fail when not primary + ReplicationCoordinator::StatusAndDuration statusAndDur = + getReplCoord()->awaitReplication(&txn, time, writeConcern); + ASSERT_EQUALS(ErrorCodes::NotMaster, statusAndDur.status); + + ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); + getReplCoord()->setMyLastOptime(OpTime(100, 0)); + simulateSuccessfulElection(); + + statusAndDur = getReplCoord()->awaitReplication(&txn, time, writeConcern); + ASSERT_OK(statusAndDur.status); +} + +TEST_F(ReplCoordTest, AwaitReplicationNumberOfNodesNonBlocking) { + OperationContextNoop txn; + assertStartSuccess( + BSON("_id" + << "mySet" + << "version" << 2 << 
"members" + << BSON_ARRAY(BSON("host" + << "node1:12345" + << "_id" << 0) + << BSON("host" + << "node2:12345" + << "_id" << 1) << BSON("host" + << "node3:12345" + << "_id" << 2) << BSON("host" + << "node4:12345" + << "_id" << 3))), + HostAndPort("node1", 12345)); + ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); + getReplCoord()->setMyLastOptime(OpTime(100, 0)); + simulateSuccessfulElection(); + + OID client1 = OID::gen(); + OID client2 = OID::gen(); + OID client3 = OID::gen(); + OpTime time1(100, 1); + OpTime time2(100, 2); + + HandshakeArgs handshake1; + ASSERT_OK(handshake1.initialize(BSON("handshake" << client1 << "member" << 1))); + ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake1)); + HandshakeArgs handshake2; + ASSERT_OK(handshake2.initialize(BSON("handshake" << client2 << "member" << 2))); + ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake2)); + HandshakeArgs handshake3; + ASSERT_OK(handshake3.initialize(BSON("handshake" << client3 << "member" << 3))); + ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake3)); + + WriteConcernOptions writeConcern; + writeConcern.wTimeout = WriteConcernOptions::kNoWaiting; + writeConcern.wNumNodes = 1; + + // 1 node waiting for time 1 + ReplicationCoordinator::StatusAndDuration statusAndDur = + getReplCoord()->awaitReplication(&txn, time1, writeConcern); + ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit, statusAndDur.status); + getReplCoord()->setMyLastOptime(time1); + statusAndDur = getReplCoord()->awaitReplication(&txn, time1, writeConcern); + ASSERT_OK(statusAndDur.status); + + // 2 nodes waiting for time1 + writeConcern.wNumNodes = 2; + statusAndDur = getReplCoord()->awaitReplication(&txn, time1, writeConcern); + ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit, statusAndDur.status); + ASSERT_OK(getReplCoord()->setLastOptime_forTest(client1, time1)); + statusAndDur = getReplCoord()->awaitReplication(&txn, time1, writeConcern); + ASSERT_OK(statusAndDur.status); + + // 2 nodes waiting for time2 + statusAndDur = getReplCoord()->awaitReplication(&txn, time2, writeConcern); + ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit, statusAndDur.status); + getReplCoord()->setMyLastOptime(time2); + statusAndDur = getReplCoord()->awaitReplication(&txn, time2, writeConcern); + ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit, statusAndDur.status); + ASSERT_OK(getReplCoord()->setLastOptime_forTest(client3, time2)); + statusAndDur = getReplCoord()->awaitReplication(&txn, time2, writeConcern); + ASSERT_OK(statusAndDur.status); + + // 3 nodes waiting for time2 + writeConcern.wNumNodes = 3; + statusAndDur = getReplCoord()->awaitReplication(&txn, time2, writeConcern); + ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit, statusAndDur.status); + ASSERT_OK(getReplCoord()->setLastOptime_forTest(client2, time2)); + statusAndDur = getReplCoord()->awaitReplication(&txn, time2, writeConcern); + ASSERT_OK(statusAndDur.status); +} + +TEST_F(ReplCoordTest, AwaitReplicationNamedModesNonBlocking) { + OperationContextNoop txn; + assertStartSuccess( + BSON("_id" + << "mySet" + << "version" << 2 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "node0" + << "tags" << BSON("dc" + << "NA" + << "rack" + << "rackNA1")) + << BSON("_id" << 1 << "host" + << "node1" + << "tags" << BSON("dc" + << "NA" + << "rack" + << "rackNA2")) + << BSON("_id" << 2 << "host" + << "node2" + << "tags" << BSON("dc" + << "NA" + << "rack" + << "rackNA3")) + << BSON("_id" << 3 << "host" + << "node3" + << "tags" << BSON("dc" + << "EU" + << "rack" + << "rackEU1")) + << BSON("_id" << 
4 << "host" + << "node4" + << "tags" << BSON("dc" + << "EU" + << "rack" + << "rackEU2"))) << "settings" + << BSON("getLastErrorModes" << BSON("multiDC" << BSON("dc" << 2) << "multiDCAndRack" + << BSON("dc" << 2 << "rack" << 3)))), + HostAndPort("node0")); + ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); + getReplCoord()->setMyLastOptime(OpTime(100, 0)); + simulateSuccessfulElection(); + + OID clientRID1 = OID::gen(); + OID clientRID2 = OID::gen(); + OID clientRID3 = OID::gen(); + OID clientRID4 = OID::gen(); + OpTime time1(100, 1); + OpTime time2(100, 2); + + HandshakeArgs handshake1; + ASSERT_OK(handshake1.initialize(BSON("handshake" << clientRID1 << "member" << 1))); + ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake1)); + HandshakeArgs handshake2; + ASSERT_OK(handshake2.initialize(BSON("handshake" << clientRID2 << "member" << 2))); + ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake2)); + HandshakeArgs handshake3; + ASSERT_OK(handshake3.initialize(BSON("handshake" << clientRID3 << "member" << 3))); + ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake3)); + HandshakeArgs handshake4; + ASSERT_OK(handshake4.initialize(BSON("handshake" << clientRID4 << "member" << 4))); + ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake4)); + + // Test invalid write concern + WriteConcernOptions invalidWriteConcern; + invalidWriteConcern.wTimeout = WriteConcernOptions::kNoWaiting; + invalidWriteConcern.wMode = "fakemode"; + + ReplicationCoordinator::StatusAndDuration statusAndDur = + getReplCoord()->awaitReplication(&txn, time1, invalidWriteConcern); + ASSERT_EQUALS(ErrorCodes::UnknownReplWriteConcern, statusAndDur.status); + + + // Set up valid write concerns for the rest of the test + WriteConcernOptions majorityWriteConcern; + majorityWriteConcern.wTimeout = WriteConcernOptions::kNoWaiting; + majorityWriteConcern.wMode = "majority"; + + WriteConcernOptions multiDCWriteConcern; + multiDCWriteConcern.wTimeout = WriteConcernOptions::kNoWaiting; + multiDCWriteConcern.wMode = "multiDC"; + + WriteConcernOptions multiRackWriteConcern; + multiRackWriteConcern.wTimeout = WriteConcernOptions::kNoWaiting; + multiRackWriteConcern.wMode = "multiDCAndRack"; + + + // Nothing satisfied + getReplCoord()->setMyLastOptime(time1); + statusAndDur = getReplCoord()->awaitReplication(&txn, time1, majorityWriteConcern); + ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit, statusAndDur.status); + statusAndDur = getReplCoord()->awaitReplication(&txn, time1, multiDCWriteConcern); + ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit, statusAndDur.status); + statusAndDur = getReplCoord()->awaitReplication(&txn, time1, multiRackWriteConcern); + ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit, statusAndDur.status); + + // Majority satisfied but not either custom mode + getReplCoord()->setLastOptime_forTest(clientRID1, time1); + getReplCoord()->setLastOptime_forTest(clientRID2, time1); + + statusAndDur = getReplCoord()->awaitReplication(&txn, time1, majorityWriteConcern); + ASSERT_OK(statusAndDur.status); + statusAndDur = getReplCoord()->awaitReplication(&txn, time1, multiDCWriteConcern); + ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit, statusAndDur.status); + statusAndDur = getReplCoord()->awaitReplication(&txn, time1, multiRackWriteConcern); + ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit, statusAndDur.status); + + // All modes satisfied + getReplCoord()->setLastOptime_forTest(clientRID3, time1); + + statusAndDur = getReplCoord()->awaitReplication(&txn, time1, majorityWriteConcern); + 
ASSERT_OK(statusAndDur.status); + statusAndDur = getReplCoord()->awaitReplication(&txn, time1, multiDCWriteConcern); + ASSERT_OK(statusAndDur.status); + statusAndDur = getReplCoord()->awaitReplication(&txn, time1, multiRackWriteConcern); + ASSERT_OK(statusAndDur.status); + + // multiDC satisfied but not majority or multiRack + getReplCoord()->setMyLastOptime(time2); + getReplCoord()->setLastOptime_forTest(clientRID3, time2); + + statusAndDur = getReplCoord()->awaitReplication(&txn, time2, majorityWriteConcern); + ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit, statusAndDur.status); + statusAndDur = getReplCoord()->awaitReplication(&txn, time2, multiDCWriteConcern); + ASSERT_OK(statusAndDur.status); + statusAndDur = getReplCoord()->awaitReplication(&txn, time2, multiRackWriteConcern); + ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit, statusAndDur.status); +} - TEST_F(ReplCoordTest, StartupWithValidLocalConfig) { - assertStartSuccess( - BSON("_id" << "mySet" << - "version" << 2 << - "members" << BSON_ARRAY(BSON("_id" << 1 << "host" << "node1:12345"))), - HostAndPort("node1", 12345)); - } - - TEST_F(ReplCoordTest, StartupWithConfigMissingSelf) { - startCapturingLogMessages(); - assertStartSuccess( - BSON("_id" << "mySet" << - "version" << 2 << - "members" << BSON_ARRAY(BSON("_id" << 1 << "host" << "node1:12345") << - BSON("_id" << 2 << "host" << "node2:54321"))), - HostAndPort("node3", 12345)); - stopCapturingLogMessages(); - ASSERT_EQUALS(1, countLogLinesContaining("NodeNotFound")); - } - - TEST_F(ReplCoordTest, StartupWithLocalConfigSetNameMismatch) { - init("mySet"); - startCapturingLogMessages(); - assertStartSuccess( - BSON("_id" << "notMySet" << - "version" << 2 << - "members" << BSON_ARRAY(BSON("_id" << 1 << "host" << "node1:12345"))), - HostAndPort("node1", 12345)); - stopCapturingLogMessages(); - ASSERT_EQUALS(1, countLogLinesContaining("reports set name of notMySet,")); - } - - TEST_F(ReplCoordTest, StartupWithNoLocalConfig) { - startCapturingLogMessages(); - start(); - stopCapturingLogMessages(); - ASSERT_EQUALS(1, countLogLinesContaining("Did not find local ")); - ASSERT_EQUALS(MemberState::RS_STARTUP, getReplCoord()->getMemberState().s); - } - - TEST_F(ReplCoordTest, InitiateFailsWithEmptyConfig) { - OperationContextNoop txn; - init("mySet"); - start(HostAndPort("node1", 12345)); - BSONObjBuilder result; - ASSERT_EQUALS(ErrorCodes::InvalidReplicaSetConfig, - getReplCoord()->processReplSetInitiate(&txn, BSONObj(), &result)); - ASSERT_EQUALS(MemberState::RS_STARTUP, getReplCoord()->getMemberState().s); - } - - TEST_F(ReplCoordTest, InitiateSucceedsWithOneNodeConfig) { - OperationContextNoop txn; - init("mySet"); - start(HostAndPort("node1", 12345)); - ASSERT_EQUALS(MemberState::RS_STARTUP, getReplCoord()->getMemberState().s); - - // Starting uninitialized, show that we can perform the initiate behavior. - BSONObjBuilder result1; - ASSERT_OK(getReplCoord()->processReplSetInitiate( - &txn, - BSON("_id" << "mySet" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 0 << "host" << "node1:12345"))), - &result1)); - ASSERT_EQUALS(ReplicationCoordinator::modeReplSet, getReplCoord()->getReplicationMode()); - - // Show that initiate fails after it has already succeeded. 
- BSONObjBuilder result2; - ASSERT_EQUALS(ErrorCodes::AlreadyInitialized, - getReplCoord()->processReplSetInitiate( - &txn, - BSON("_id" << "mySet" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 0 << "host" << "node1:12345"))), - &result2)); - - // Still in repl set mode, even after failed reinitiate. - ASSERT_EQUALS(ReplicationCoordinator::modeReplSet, getReplCoord()->getReplicationMode()); - } - - TEST_F(ReplCoordTest, InitiateSucceedsAfterFailing) { - OperationContextNoop txn; - init("mySet"); - start(HostAndPort("node1", 12345)); - BSONObjBuilder result; - ASSERT_EQUALS(ErrorCodes::InvalidReplicaSetConfig, - getReplCoord()->processReplSetInitiate(&txn, BSONObj(), &result)); - ASSERT_EQUALS(MemberState::RS_STARTUP, getReplCoord()->getMemberState().s); - - // Having failed to initiate once, show that we can now initiate. - BSONObjBuilder result1; - ASSERT_OK(getReplCoord()->processReplSetInitiate( - &txn, - BSON("_id" << "mySet" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 0 << "host" << "node1:12345"))), - &result1)); - ASSERT_EQUALS(ReplicationCoordinator::modeReplSet, getReplCoord()->getReplicationMode()); - } - - TEST_F(ReplCoordTest, InitiateFailsIfAlreadyInitialized) { - OperationContextNoop txn; - assertStartSuccess( - BSON("_id" << "mySet" << - "version" << 2 << - "members" << BSON_ARRAY(BSON("_id" << 1 << "host" << "node1:12345"))), - HostAndPort("node1", 12345)); - BSONObjBuilder result; - ASSERT_EQUALS(ErrorCodes::AlreadyInitialized, - getReplCoord()->processReplSetInitiate( - &txn, - BSON("_id" << "mySet" << - "version" << 2 << - "members" << BSON_ARRAY(BSON("_id" << 1 << - "host" << "node1:12345"))), - &result)); - } - - TEST_F(ReplCoordTest, InitiateFailsIfSelfMissing) { - OperationContextNoop txn; - BSONObjBuilder result; - init("mySet"); - start(HostAndPort("node1", 12345)); - ASSERT_EQUALS(ErrorCodes::InvalidReplicaSetConfig, - getReplCoord()->processReplSetInitiate( - &txn, - BSON("_id" << "mySet" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 0 << "host" << "node4"))), - &result)); - } - - void doReplSetInitiate(ReplicationCoordinatorImpl* replCoord, Status* status) { - OperationContextNoop txn; - BSONObjBuilder garbage; - *status = replCoord->processReplSetInitiate( - &txn, - BSON("_id" << "mySet" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 0 << "host" << "node1:12345") << - BSON("_id" << 1 << "host" << "node2:54321"))), - &garbage); - } - - TEST_F(ReplCoordTest, InitiateFailsIfQuorumNotMet) { - init("mySet"); - start(HostAndPort("node1", 12345)); - ASSERT_EQUALS(MemberState::RS_STARTUP, getReplCoord()->getMemberState().s); - - ReplSetHeartbeatArgs hbArgs; - hbArgs.setSetName("mySet"); - hbArgs.setProtocolVersion(1); - hbArgs.setConfigVersion(1); - hbArgs.setCheckEmpty(true); - hbArgs.setSenderHost(HostAndPort("node1", 12345)); - hbArgs.setSenderId(0); - - Status status(ErrorCodes::InternalError, "Not set"); - boost::thread prsiThread(stdx::bind(doReplSetInitiate, getReplCoord(), &status)); - const Date_t startDate = getNet()->now(); - getNet()->enterNetwork(); - const NetworkInterfaceMock::NetworkOperationIterator noi = getNet()->getNextReadyRequest(); - ASSERT_EQUALS(HostAndPort("node2", 54321), noi->getRequest().target); - ASSERT_EQUALS("admin", noi->getRequest().dbname); - ASSERT_EQUALS(hbArgs.toBSON(), noi->getRequest().cmdObj); - getNet()->scheduleResponse(noi, startDate + 10, ResponseStatus(ErrorCodes::NoSuchKey, - "No response")); - getNet()->runUntil(startDate + 10); - 
getNet()->exitNetwork(); - ASSERT_EQUALS(startDate + 10, getNet()->now()); - prsiThread.join(); - ASSERT_EQUALS(ErrorCodes::NodeNotFound, status); - ASSERT_EQUALS(MemberState::RS_STARTUP, getReplCoord()->getMemberState().s); - } - - TEST_F(ReplCoordTest, InitiatePassesIfQuorumMet) { - init("mySet"); - start(HostAndPort("node1", 12345)); - ASSERT_EQUALS(MemberState::RS_STARTUP, getReplCoord()->getMemberState().s); - - ReplSetHeartbeatArgs hbArgs; - hbArgs.setSetName("mySet"); - hbArgs.setProtocolVersion(1); - hbArgs.setConfigVersion(1); - hbArgs.setCheckEmpty(true); - hbArgs.setSenderHost(HostAndPort("node1", 12345)); - hbArgs.setSenderId(0); - - Status status(ErrorCodes::InternalError, "Not set"); - boost::thread prsiThread(stdx::bind(doReplSetInitiate, getReplCoord(), &status)); - const Date_t startDate = getNet()->now(); - getNet()->enterNetwork(); - const NetworkInterfaceMock::NetworkOperationIterator noi = getNet()->getNextReadyRequest(); - ASSERT_EQUALS(HostAndPort("node2", 54321), noi->getRequest().target); - ASSERT_EQUALS("admin", noi->getRequest().dbname); - ASSERT_EQUALS(hbArgs.toBSON(), noi->getRequest().cmdObj); - ReplSetHeartbeatResponse hbResp; - hbResp.setVersion(0); - getNet()->scheduleResponse( - noi, - startDate + 10, - ResponseStatus(ReplicationExecutor::RemoteCommandResponse(hbResp.toBSON(), - Milliseconds(8)))); - getNet()->runUntil(startDate + 10); - getNet()->exitNetwork(); - ASSERT_EQUALS(startDate + 10, getNet()->now()); - prsiThread.join(); - ASSERT_OK(status); - ASSERT_EQUALS(ReplicationCoordinator::modeReplSet, getReplCoord()->getReplicationMode()); +/** + * Used to wait for replication in a separate thread without blocking execution of the test. + * To use, set the optime and write concern to be passed to awaitReplication and then call + * start(), which will spawn a thread that calls awaitReplication. No calls may be made + * on the ReplicationAwaiter instance between calling start and getResult(). After returning + * from getResult(), you can call reset() to allow the awaiter to be reused for another + * awaitReplication call. 
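+ *
+ * A typical call sequence, sketched with illustrative names:
+ *
+ *     ReplicationAwaiter awaiter(getReplCoord(), &txn);
+ *     awaiter.setOpTime(someTime);
+ *     awaiter.setWriteConcern(someWriteConcern);
+ *     awaiter.start(&txn);
+ *     // ... advance member optimes from the test thread ...
+ *     ReplicationCoordinator::StatusAndDuration res = awaiter.getResult();
+ *     awaiter.reset();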
+ */ +class ReplicationAwaiter { +public: + ReplicationAwaiter(ReplicationCoordinatorImpl* replCoord, OperationContext* txn) + : _replCoord(replCoord), + _finished(false), + _result(ReplicationCoordinator::StatusAndDuration( + Status::OK(), ReplicationCoordinator::Milliseconds(0))) {} + + void setOpTime(const OpTime& ot) { + _optime = ot; } - TEST_F(ReplCoordTest, InitiateFailsWithSetNameMismatch) { - OperationContextNoop txn; - init("mySet"); - start(HostAndPort("node1", 12345)); - ASSERT_EQUALS(MemberState::RS_STARTUP, getReplCoord()->getMemberState().s); - - BSONObjBuilder result1; - ASSERT_EQUALS( - ErrorCodes::InvalidReplicaSetConfig, - getReplCoord()->processReplSetInitiate( - &txn, - BSON("_id" << "wrongSet" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 0 << "host" << "node1:12345"))), - &result1)); - ASSERT_EQUALS(MemberState::RS_STARTUP, getReplCoord()->getMemberState().s); + void setWriteConcern(const WriteConcernOptions& wc) { + _writeConcern = wc; } - TEST_F(ReplCoordTest, InitiateFailsWithoutReplSetFlag) { - OperationContextNoop txn; - init(""); - start(HostAndPort("node1", 12345)); - ASSERT_EQUALS(MemberState::RS_STARTUP, getReplCoord()->getMemberState().s); - - BSONObjBuilder result1; - ASSERT_EQUALS( - ErrorCodes::NoReplicationEnabled, - getReplCoord()->processReplSetInitiate( - &txn, - BSON("_id" << "mySet" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 0 << "host" << "node1:12345"))), - &result1)); - ASSERT_EQUALS(MemberState::RS_STARTUP, getReplCoord()->getMemberState().s); + // may block + ReplicationCoordinator::StatusAndDuration getResult() { + _thread->join(); + ASSERT(_finished); + return _result; } - TEST_F(ReplCoordTest, InitiateFailsWhileStoringLocalConfigDocument) { - OperationContextNoop txn; - init("mySet"); - start(HostAndPort("node1", 12345)); - ASSERT_EQUALS(MemberState::RS_STARTUP, getReplCoord()->getMemberState().s); - - BSONObjBuilder result1; - getExternalState()->setStoreLocalConfigDocumentStatus(Status(ErrorCodes::OutOfDiskSpace, - "The test set this")); - ASSERT_EQUALS( - ErrorCodes::OutOfDiskSpace, - getReplCoord()->processReplSetInitiate( - &txn, - BSON("_id" << "mySet" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 0 << "host" << "node1:12345"))), - &result1)); - ASSERT_EQUALS(MemberState::RS_STARTUP, getReplCoord()->getMemberState().s); + void start(OperationContext* txn) { + ASSERT(!_finished); + _thread.reset( + new boost::thread(stdx::bind(&ReplicationAwaiter::_awaitReplication, this, txn))); } - TEST_F(ReplCoordTest, CheckReplEnabledForCommandNotRepl) { - // pass in settings to avoid having a replSet - ReplSettings settings; - init(settings); - start(); - - // check status NoReplicationEnabled and empty result - BSONObjBuilder result; - Status status = getReplCoord()->checkReplEnabledForCommand(&result); - ASSERT_EQUALS(status, ErrorCodes::NoReplicationEnabled); - ASSERT_TRUE(result.obj().isEmpty()); + void reset() { + ASSERT(_finished); + _finished = false; + _result = ReplicationCoordinator::StatusAndDuration( + Status::OK(), ReplicationCoordinator::Milliseconds(0)); } - TEST_F(ReplCoordTest, checkReplEnabledForCommandConfigSvr) { - ReplSettings settings; - serverGlobalParams.configsvr = true; - init(settings); - start(); - - // check status NoReplicationEnabled and result mentions configsrv - BSONObjBuilder result; - Status status = getReplCoord()->checkReplEnabledForCommand(&result); - ASSERT_EQUALS(status, ErrorCodes::NoReplicationEnabled); - 
ASSERT_EQUALS(result.obj()["info"].String(), "configsvr"); - serverGlobalParams.configsvr = false; +private: + void _awaitReplication(OperationContext* txn) { + _result = _replCoord->awaitReplication(txn, _optime, _writeConcern); + _finished = true; } - TEST_F(ReplCoordTest, checkReplEnabledForCommandNoConfig) { - start(); - - // check status NotYetInitialized and result mentions rs.initiate - BSONObjBuilder result; - Status status = getReplCoord()->checkReplEnabledForCommand(&result); - ASSERT_EQUALS(status, ErrorCodes::NotYetInitialized); - ASSERT_TRUE(result.obj()["info"].String().find("rs.initiate") != std::string::npos); + ReplicationCoordinatorImpl* _replCoord; + bool _finished; + OpTime _optime; + WriteConcernOptions _writeConcern; + ReplicationCoordinator::StatusAndDuration _result; + boost::scoped_ptr<boost::thread> _thread; +}; + +TEST_F(ReplCoordTest, AwaitReplicationNumberOfNodesBlocking) { + OperationContextNoop txn; + assertStartSuccess(BSON("_id" + << "mySet" + << "version" << 2 << "members" + << BSON_ARRAY(BSON("host" + << "node1:12345" + << "_id" << 0) + << BSON("host" + << "node2:12345" + << "_id" << 1) << BSON("host" + << "node3:12345" + << "_id" << 2))), + HostAndPort("node1", 12345)); + ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); + getReplCoord()->setMyLastOptime(OpTime(100, 0)); + simulateSuccessfulElection(); + + ReplicationAwaiter awaiter(getReplCoord(), &txn); + + OID client1 = OID::gen(); + OID client2 = OID::gen(); + OpTime time1(100, 1); + OpTime time2(100, 2); + + HandshakeArgs handshake1; + ASSERT_OK(handshake1.initialize(BSON("handshake" << client1 << "member" << 1))); + ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake1)); + HandshakeArgs handshake2; + ASSERT_OK(handshake2.initialize(BSON("handshake" << client2 << "member" << 2))); + ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake2)); + + WriteConcernOptions writeConcern; + writeConcern.wTimeout = WriteConcernOptions::kNoTimeout; + writeConcern.wNumNodes = 2; + + // 2 nodes waiting for time1 + awaiter.setOpTime(time1); + awaiter.setWriteConcern(writeConcern); + awaiter.start(&txn); + getReplCoord()->setMyLastOptime(time1); + ASSERT_OK(getReplCoord()->setLastOptime_forTest(client1, time1)); + ReplicationCoordinator::StatusAndDuration statusAndDur = awaiter.getResult(); + ASSERT_OK(statusAndDur.status); + awaiter.reset(); + + // 2 nodes waiting for time2 + awaiter.setOpTime(time2); + awaiter.start(&txn); + getReplCoord()->setMyLastOptime(time2); + ASSERT_OK(getReplCoord()->setLastOptime_forTest(client1, time2)); + statusAndDur = awaiter.getResult(); + ASSERT_OK(statusAndDur.status); + awaiter.reset(); + + // 3 nodes waiting for time2 + writeConcern.wNumNodes = 3; + awaiter.setWriteConcern(writeConcern); + awaiter.start(&txn); + ASSERT_OK(getReplCoord()->setLastOptime_forTest(client2, time2)); + statusAndDur = awaiter.getResult(); + ASSERT_OK(statusAndDur.status); + awaiter.reset(); +} + +TEST_F(ReplCoordTest, AwaitReplicationTimeout) { + OperationContextNoop txn; + assertStartSuccess(BSON("_id" + << "mySet" + << "version" << 2 << "members" + << BSON_ARRAY(BSON("host" + << "node1:12345" + << "_id" << 0) + << BSON("host" + << "node2:12345" + << "_id" << 1) << BSON("host" + << "node3:12345" + << "_id" << 2))), + HostAndPort("node1", 12345)); + ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); + getReplCoord()->setMyLastOptime(OpTime(100, 0)); + simulateSuccessfulElection(); + + ReplicationAwaiter awaiter(getReplCoord(), &txn); + + OID client = 
OID::gen(); + OpTime time1(100, 1); + OpTime time2(100, 2); + + HandshakeArgs handshake; + ASSERT_OK(handshake.initialize(BSON("handshake" << client << "member" << 1))); + ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake)); + + WriteConcernOptions writeConcern; + writeConcern.wTimeout = 50; + writeConcern.wNumNodes = 2; + + // 2 nodes waiting for time2 + awaiter.setOpTime(time2); + awaiter.setWriteConcern(writeConcern); + awaiter.start(&txn); + getReplCoord()->setMyLastOptime(time2); + ASSERT_OK(getReplCoord()->setLastOptime_forTest(client, time1)); + ReplicationCoordinator::StatusAndDuration statusAndDur = awaiter.getResult(); + ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit, statusAndDur.status); + awaiter.reset(); +} + +TEST_F(ReplCoordTest, AwaitReplicationShutdown) { + OperationContextNoop txn; + assertStartSuccess(BSON("_id" + << "mySet" + << "version" << 2 << "members" + << BSON_ARRAY(BSON("host" + << "node1:12345" + << "_id" << 0) + << BSON("host" + << "node2:12345" + << "_id" << 1) << BSON("host" + << "node3:12345" + << "_id" << 2))), + HostAndPort("node1", 12345)); + ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); + getReplCoord()->setMyLastOptime(OpTime(100, 0)); + simulateSuccessfulElection(); + + ReplicationAwaiter awaiter(getReplCoord(), &txn); + + OID client1 = OID::gen(); + OID client2 = OID::gen(); + OpTime time1(100, 1); + OpTime time2(100, 2); + + HandshakeArgs handshake1; + ASSERT_OK(handshake1.initialize(BSON("handshake" << client1 << "member" << 1))); + ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake1)); + HandshakeArgs handshake2; + ASSERT_OK(handshake2.initialize(BSON("handshake" << client2 << "member" << 2))); + ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake2)); + + WriteConcernOptions writeConcern; + writeConcern.wTimeout = WriteConcernOptions::kNoTimeout; + writeConcern.wNumNodes = 2; + + // 2 nodes waiting for time2 + awaiter.setOpTime(time2); + awaiter.setWriteConcern(writeConcern); + awaiter.start(&txn); + ASSERT_OK(getReplCoord()->setLastOptime_forTest(client1, time1)); + ASSERT_OK(getReplCoord()->setLastOptime_forTest(client2, time1)); + shutdown(); + ReplicationCoordinator::StatusAndDuration statusAndDur = awaiter.getResult(); + ASSERT_EQUALS(ErrorCodes::ShutdownInProgress, statusAndDur.status); + awaiter.reset(); +} + +TEST_F(ReplCoordTest, AwaitReplicationStepDown) { + // Test that a thread blocked in awaitReplication will be woken up and return NotMaster + // if the node steps down while it is waiting. 
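+    // Flow of this test: a waiter blocks on w:2 at time2 while the secondaries
+    // only reach time1; stepDown() then forces the waiter to wake with NotMaster
+    // instead of hanging.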
+ OperationContextReplMock txn; + assertStartSuccess(BSON("_id" + << "mySet" + << "version" << 2 << "members" + << BSON_ARRAY(BSON("host" + << "node1:12345" + << "_id" << 0) + << BSON("host" + << "node2:12345" + << "_id" << 1) << BSON("host" + << "node3:12345" + << "_id" << 2))), + HostAndPort("node1", 12345)); + ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); + getReplCoord()->setMyLastOptime(OpTime(100, 0)); + simulateSuccessfulElection(); + + ReplicationAwaiter awaiter(getReplCoord(), &txn); + + OID client1 = OID::gen(); + OID client2 = OID::gen(); + OpTime time1(100, 1); + OpTime time2(100, 2); + + HandshakeArgs handshake1; + ASSERT_OK(handshake1.initialize(BSON("handshake" << client1 << "member" << 1))); + ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake1)); + HandshakeArgs handshake2; + ASSERT_OK(handshake2.initialize(BSON("handshake" << client2 << "member" << 2))); + ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake2)); + + WriteConcernOptions writeConcern; + writeConcern.wTimeout = WriteConcernOptions::kNoTimeout; + writeConcern.wNumNodes = 2; + + // 2 nodes waiting for time2 + awaiter.setOpTime(time2); + awaiter.setWriteConcern(writeConcern); + awaiter.start(&txn); + ASSERT_OK(getReplCoord()->setLastOptime_forTest(client1, time1)); + ASSERT_OK(getReplCoord()->setLastOptime_forTest(client2, time1)); + getReplCoord()->stepDown(&txn, true, Milliseconds(0), Milliseconds(1000)); + ReplicationCoordinator::StatusAndDuration statusAndDur = awaiter.getResult(); + ASSERT_EQUALS(ErrorCodes::NotMaster, statusAndDur.status); + awaiter.reset(); +} + +class OperationContextNoopWithInterrupt : public OperationContextReplMock { +public: + OperationContextNoopWithInterrupt() : _opID(0), _interruptOp(false) {} + + virtual unsigned int getOpID() const { + return _opID; } - TEST_F(ReplCoordTest, checkReplEnabledForCommandWorking) { - assertStartSuccess(BSON("_id" << "mySet" << - "version" << 2 << - "members" << BSON_ARRAY(BSON("host" << "node1:12345" << - "_id" << 0 ))), - HostAndPort("node1", 12345)); - - // check status OK and result is empty - BSONObjBuilder result; - Status status = getReplCoord()->checkReplEnabledForCommand(&result); - ASSERT_EQUALS(status, Status::OK()); - ASSERT_TRUE(result.obj().isEmpty()); + /** + * Can only be called before any multi-threaded access to this object has begun. + */ + void setOpID(unsigned int opID) { + _opID = opID; } - TEST_F(ReplCoordTest, BasicRBIDUsage) { - start(); - BSONObjBuilder result; - getReplCoord()->processReplSetGetRBID(&result); - long long initialValue = result.obj()["rbid"].Int(); - getReplCoord()->incrementRollbackID(); - - BSONObjBuilder result2; - getReplCoord()->processReplSetGetRBID(&result2); - long long incrementedValue = result2.obj()["rbid"].Int(); - ASSERT_EQUALS(incrementedValue, initialValue + 1); + virtual void checkForInterrupt() const { + if (_interruptOp) { + uasserted(ErrorCodes::Interrupted, "operation was interrupted"); + } } - TEST_F(ReplCoordTest, AwaitReplicationNoReplEnabled) { - init(""); - OperationContextNoop txn; - OpTime time(100, 1); - - WriteConcernOptions writeConcern; - writeConcern.wTimeout = WriteConcernOptions::kNoWaiting; - writeConcern.wNumNodes = 2; - - // Because we didn't set ReplSettings.replSet, it will think we're a standalone so - // awaitReplication will always work. 
- ReplicationCoordinator::StatusAndDuration statusAndDur = - getReplCoord()->awaitReplication(&txn, time, writeConcern); - ASSERT_OK(statusAndDur.status); + virtual Status checkForInterruptNoAssert() const { + if (_interruptOp) { + return Status(ErrorCodes::Interrupted, "operation was interrupted"); + } + return Status::OK(); } - TEST_F(ReplCoordTest, AwaitReplicationMasterSlaveMajorityBaseCase) { - ReplSettings settings; - settings.master = true; - init(settings); - OperationContextNoop txn; - OpTime time(100, 1); - - WriteConcernOptions writeConcern; - writeConcern.wTimeout = WriteConcernOptions::kNoWaiting; - writeConcern.wNumNodes = 2; - - - writeConcern.wNumNodes = 0; - writeConcern.wMode = "majority"; - // w:majority always works on master/slave - ReplicationCoordinator::StatusAndDuration statusAndDur = getReplCoord()->awaitReplication( - &txn, time, writeConcern); - ASSERT_OK(statusAndDur.status); + /** + * Can only be called before any multi-threaded access to this object has begun. + */ + void setInterruptOp(bool interrupt) { + _interruptOp = interrupt; } - TEST_F(ReplCoordTest, AwaitReplicationReplSetBaseCases) { - assertStartSuccess( - BSON("_id" << "mySet" << - "version" << 2 << - "members" << BSON_ARRAY(BSON("host" << "node1:12345" << "_id" << 0) << - BSON("host" << "node2:12345" << "_id" << 1) << - BSON("host" << "node3:12345" << "_id" << 2))), - HostAndPort("node1", 12345)); - - OperationContextNoop txn; - OpTime time(100, 1); - - WriteConcernOptions writeConcern; - writeConcern.wTimeout = WriteConcernOptions::kNoWaiting; - writeConcern.wNumNodes = 0; // Waiting for 0 nodes always works - writeConcern.wMode = ""; - - // Should fail when not primary - ReplicationCoordinator::StatusAndDuration statusAndDur = getReplCoord()->awaitReplication( - &txn, time, writeConcern); - ASSERT_EQUALS(ErrorCodes::NotMaster, statusAndDur.status); - - ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); - getReplCoord()->setMyLastOptime(OpTime(100, 0)); - simulateSuccessfulElection(); - - statusAndDur = getReplCoord()->awaitReplication(&txn, time, writeConcern); - ASSERT_OK(statusAndDur.status); - } +private: + unsigned int _opID; + bool _interruptOp; +}; + +TEST_F(ReplCoordTest, AwaitReplicationInterrupt) { + // Tests that a thread blocked in awaitReplication can be killed by a killOp operation + OperationContextNoopWithInterrupt txn; + assertStartSuccess(BSON("_id" + << "mySet" + << "version" << 2 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "node1") + << BSON("_id" << 1 << "host" + << "node2") << BSON("_id" << 2 << "host" + << "node3"))), + HostAndPort("node1")); + ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); + getReplCoord()->setMyLastOptime(OpTime(100, 0)); + simulateSuccessfulElection(); + + ReplicationAwaiter awaiter(getReplCoord(), &txn); + + OID client1 = OID::gen(); + OID client2 = OID::gen(); + OpTime time1(100, 1); + OpTime time2(100, 2); + + HandshakeArgs handshake1; + ASSERT_OK(handshake1.initialize(BSON("handshake" << client1 << "member" << 1))); + ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake1)); + HandshakeArgs handshake2; + ASSERT_OK(handshake2.initialize(BSON("handshake" << client2 << "member" << 2))); + ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake2)); + + WriteConcernOptions writeConcern; + writeConcern.wTimeout = WriteConcernOptions::kNoTimeout; + writeConcern.wNumNodes = 2; + + unsigned int opID = 100; + txn.setOpID(opID); + + // 2 nodes waiting for time2 + awaiter.setOpTime(time2); + 
awaiter.setWriteConcern(writeConcern); + awaiter.start(&txn); + ASSERT_OK(getReplCoord()->setLastOptime_forTest(client1, time1)); + ASSERT_OK(getReplCoord()->setLastOptime_forTest(client2, time1)); + + txn.setInterruptOp(true); + getReplCoord()->interrupt(opID); + ReplicationCoordinator::StatusAndDuration statusAndDur = awaiter.getResult(); + ASSERT_EQUALS(ErrorCodes::Interrupted, statusAndDur.status); + awaiter.reset(); +} + +class StepDownTest : public ReplCoordTest { +protected: + OID myRid; + OID rid2; + OID rid3; + +private: + virtual void setUp() { + ReplCoordTest::setUp(); + init("mySet/test1:1234,test2:1234,test3:1234"); - TEST_F(ReplCoordTest, AwaitReplicationNumberOfNodesNonBlocking) { - OperationContextNoop txn; - assertStartSuccess( - BSON("_id" << "mySet" << - "version" << 2 << - "members" << BSON_ARRAY(BSON("host" << "node1:12345" << "_id" << 0) << - BSON("host" << "node2:12345" << "_id" << 1) << - BSON("host" << "node3:12345" << "_id" << 2) << - BSON("host" << "node4:12345" << "_id" << 3))), - HostAndPort("node1", 12345)); + assertStartSuccess(BSON("_id" + << "mySet" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "test1:1234") + << BSON("_id" << 1 << "host" + << "test2:1234") + << BSON("_id" << 2 << "host" + << "test3:1234"))), + HostAndPort("test1", 1234)); ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); - getReplCoord()->setMyLastOptime(OpTime(100, 0)); - simulateSuccessfulElection(); - - OID client1 = OID::gen(); - OID client2 = OID::gen(); - OID client3 = OID::gen(); - OpTime time1(100, 1); - OpTime time2(100, 2); - - HandshakeArgs handshake1; - ASSERT_OK(handshake1.initialize(BSON("handshake" << client1 << "member" << 1))); - ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake1)); + myRid = getReplCoord()->getMyRID(); + rid2 = OID::gen(); + rid3 = OID::gen(); HandshakeArgs handshake2; - ASSERT_OK(handshake2.initialize(BSON("handshake" << client2 << "member" << 2))); - ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake2)); + handshake2.initialize( + BSON("handshake" << rid2 << "member" << 1 << "config" << BSON("_id" << 1 << "host" + << "test2:1234"))); HandshakeArgs handshake3; - ASSERT_OK(handshake3.initialize(BSON("handshake" << client3 << "member" << 3))); - ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake3)); - - WriteConcernOptions writeConcern; - writeConcern.wTimeout = WriteConcernOptions::kNoWaiting; - writeConcern.wNumNodes = 1; - - // 1 node waiting for time 1 - ReplicationCoordinator::StatusAndDuration statusAndDur = - getReplCoord()->awaitReplication(&txn, time1, writeConcern); - ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit, statusAndDur.status); - getReplCoord()->setMyLastOptime(time1); - statusAndDur = getReplCoord()->awaitReplication(&txn, time1, writeConcern); - ASSERT_OK(statusAndDur.status); - - // 2 nodes waiting for time1 - writeConcern.wNumNodes = 2; - statusAndDur = getReplCoord()->awaitReplication(&txn, time1, writeConcern); - ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit, statusAndDur.status); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(client1, time1)); - statusAndDur = getReplCoord()->awaitReplication(&txn, time1, writeConcern); - ASSERT_OK(statusAndDur.status); - - // 2 nodes waiting for time2 - statusAndDur = getReplCoord()->awaitReplication(&txn, time2, writeConcern); - ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit, statusAndDur.status); - getReplCoord()->setMyLastOptime(time2); - statusAndDur = getReplCoord()->awaitReplication(&txn, time2, writeConcern); - 
ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit, statusAndDur.status); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(client3, time2)); - statusAndDur = getReplCoord()->awaitReplication(&txn, time2, writeConcern); - ASSERT_OK(statusAndDur.status); - - // 3 nodes waiting for time2 - writeConcern.wNumNodes = 3; - statusAndDur = getReplCoord()->awaitReplication(&txn, time2, writeConcern); - ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit, statusAndDur.status); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(client2, time2)); - statusAndDur = getReplCoord()->awaitReplication(&txn, time2, writeConcern); - ASSERT_OK(statusAndDur.status); - } - - TEST_F(ReplCoordTest, AwaitReplicationNamedModesNonBlocking) { + handshake3.initialize( + BSON("handshake" << rid3 << "member" << 2 << "config" << BSON("_id" << 2 << "host" + << "test3:1234"))); OperationContextNoop txn; - assertStartSuccess( - BSON("_id" << "mySet" << - "version" << 2 << - "members" << BSON_ARRAY(BSON("_id" << 0 << - "host" << "node0" << - "tags" << BSON("dc" << "NA" << - "rack" << "rackNA1")) << - BSON("_id" << 1 << - "host" << "node1" << - "tags" << BSON("dc" << "NA" << - "rack" << "rackNA2")) << - BSON("_id" << 2 << - "host" << "node2" << - "tags" << BSON("dc" << "NA" << - "rack" << "rackNA3")) << - BSON("_id" << 3 << - "host" << "node3" << - "tags" << BSON("dc" << "EU" << - "rack" << "rackEU1")) << - BSON("_id" << 4 << - "host" << "node4" << - "tags" << BSON("dc" << "EU" << - "rack" << "rackEU2"))) << - "settings" << BSON("getLastErrorModes" << - BSON("multiDC" << BSON("dc" << 2) << - "multiDCAndRack" << BSON("dc" << 2 << "rack" << 3)))), - HostAndPort("node0")); - ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); - getReplCoord()->setMyLastOptime(OpTime(100, 0)); - simulateSuccessfulElection(); - - OID clientRID1 = OID::gen(); - OID clientRID2 = OID::gen(); - OID clientRID3 = OID::gen(); - OID clientRID4 = OID::gen(); - OpTime time1(100, 1); - OpTime time2(100, 2); - - HandshakeArgs handshake1; - ASSERT_OK(handshake1.initialize(BSON("handshake" << clientRID1 << "member" << 1))); - ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake1)); - HandshakeArgs handshake2; - ASSERT_OK(handshake2.initialize(BSON("handshake" << clientRID2 << "member" << 2))); ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake2)); - HandshakeArgs handshake3; - ASSERT_OK(handshake3.initialize(BSON("handshake" << clientRID3 << "member" << 3))); ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake3)); - HandshakeArgs handshake4; - ASSERT_OK(handshake4.initialize(BSON("handshake" << clientRID4 << "member" << 4))); - ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake4)); - - // Test invalid write concern - WriteConcernOptions invalidWriteConcern; - invalidWriteConcern.wTimeout = WriteConcernOptions::kNoWaiting; - invalidWriteConcern.wMode = "fakemode"; - - ReplicationCoordinator::StatusAndDuration statusAndDur = - getReplCoord()->awaitReplication(&txn, time1, invalidWriteConcern); - ASSERT_EQUALS(ErrorCodes::UnknownReplWriteConcern, statusAndDur.status); - - - // Set up valid write concerns for the rest of the test - WriteConcernOptions majorityWriteConcern; - majorityWriteConcern.wTimeout = WriteConcernOptions::kNoWaiting; - majorityWriteConcern.wMode = "majority"; - - WriteConcernOptions multiDCWriteConcern; - multiDCWriteConcern.wTimeout = WriteConcernOptions::kNoWaiting; - multiDCWriteConcern.wMode = "multiDC"; - - WriteConcernOptions multiRackWriteConcern; - multiRackWriteConcern.wTimeout = 
WriteConcernOptions::kNoWaiting; - multiRackWriteConcern.wMode = "multiDCAndRack"; - - - // Nothing satisfied - getReplCoord()->setMyLastOptime(time1); - statusAndDur = getReplCoord()->awaitReplication(&txn, time1, majorityWriteConcern); - ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit, statusAndDur.status); - statusAndDur = getReplCoord()->awaitReplication(&txn, time1, multiDCWriteConcern); - ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit, statusAndDur.status); - statusAndDur = getReplCoord()->awaitReplication(&txn, time1, multiRackWriteConcern); - ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit, statusAndDur.status); - - // Majority satisfied but not either custom mode - getReplCoord()->setLastOptime_forTest(clientRID1, time1); - getReplCoord()->setLastOptime_forTest(clientRID2, time1); - - statusAndDur = getReplCoord()->awaitReplication(&txn, time1, majorityWriteConcern); - ASSERT_OK(statusAndDur.status); - statusAndDur = getReplCoord()->awaitReplication(&txn, time1, multiDCWriteConcern); - ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit, statusAndDur.status); - statusAndDur = getReplCoord()->awaitReplication(&txn, time1, multiRackWriteConcern); - ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit, statusAndDur.status); - - // All modes satisfied - getReplCoord()->setLastOptime_forTest(clientRID3, time1); - - statusAndDur = getReplCoord()->awaitReplication(&txn, time1, majorityWriteConcern); - ASSERT_OK(statusAndDur.status); - statusAndDur = getReplCoord()->awaitReplication(&txn, time1, multiDCWriteConcern); - ASSERT_OK(statusAndDur.status); - statusAndDur = getReplCoord()->awaitReplication(&txn, time1, multiRackWriteConcern); - ASSERT_OK(statusAndDur.status); - - // multiDC satisfied but not majority or multiRack - getReplCoord()->setMyLastOptime(time2); - getReplCoord()->setLastOptime_forTest(clientRID3, time2); - - statusAndDur = getReplCoord()->awaitReplication(&txn, time2, majorityWriteConcern); - ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit, statusAndDur.status); - statusAndDur = getReplCoord()->awaitReplication(&txn, time2, multiDCWriteConcern); - ASSERT_OK(statusAndDur.status); - statusAndDur = getReplCoord()->awaitReplication(&txn, time2, multiRackWriteConcern); - ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit, statusAndDur.status); } - - /** - * Used to wait for replication in a separate thread without blocking execution of the test. - * To use, set the optime and write concern to be passed to awaitReplication and then call - * start(), which will spawn a thread that calls awaitReplication. No calls may be made - * on the ReplicationAwaiter instance between calling start and getResult(). After returning - * from getResult(), you can call reset() to allow the awaiter to be reused for another - * awaitReplication call. 
- */ - class ReplicationAwaiter { - public: - - ReplicationAwaiter(ReplicationCoordinatorImpl* replCoord, OperationContext* txn) : - _replCoord(replCoord), _finished(false), - _result(ReplicationCoordinator::StatusAndDuration( - Status::OK(), ReplicationCoordinator::Milliseconds(0))) {} - - void setOpTime(const OpTime& ot) { - _optime = ot; - } - - void setWriteConcern(const WriteConcernOptions& wc) { - _writeConcern = wc; - } - - // may block - ReplicationCoordinator::StatusAndDuration getResult() { - _thread->join(); - ASSERT(_finished); - return _result; - } - - void start(OperationContext* txn) { - ASSERT(!_finished); - _thread.reset(new boost::thread(stdx::bind(&ReplicationAwaiter::_awaitReplication, - this, - txn))); - } - - void reset() { - ASSERT(_finished); - _finished = false; - _result = ReplicationCoordinator::StatusAndDuration( - Status::OK(), ReplicationCoordinator::Milliseconds(0)); - } - - private: - - void _awaitReplication(OperationContext* txn) { - _result = _replCoord->awaitReplication(txn, _optime, _writeConcern); - _finished = true; - } - - ReplicationCoordinatorImpl* _replCoord; - bool _finished; - OpTime _optime; - WriteConcernOptions _writeConcern; - ReplicationCoordinator::StatusAndDuration _result; - boost::scoped_ptr<boost::thread> _thread; - }; - - TEST_F(ReplCoordTest, AwaitReplicationNumberOfNodesBlocking) { - OperationContextNoop txn; - assertStartSuccess( - BSON("_id" << "mySet" << - "version" << 2 << - "members" << BSON_ARRAY(BSON("host" << "node1:12345" << "_id" << 0) << - BSON("host" << "node2:12345" << "_id" << 1) << - BSON("host" << "node3:12345" << "_id" << 2))), - HostAndPort("node1", 12345)); - ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); - getReplCoord()->setMyLastOptime(OpTime(100, 0)); - simulateSuccessfulElection(); - - ReplicationAwaiter awaiter(getReplCoord(), &txn); - - OID client1 = OID::gen(); - OID client2 = OID::gen(); - OpTime time1(100, 1); - OpTime time2(100, 2); - - HandshakeArgs handshake1; - ASSERT_OK(handshake1.initialize(BSON("handshake" << client1 << "member" << 1))); - ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake1)); - HandshakeArgs handshake2; - ASSERT_OK(handshake2.initialize(BSON("handshake" << client2 << "member" << 2))); - ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake2)); - - WriteConcernOptions writeConcern; - writeConcern.wTimeout = WriteConcernOptions::kNoTimeout; - writeConcern.wNumNodes = 2; - - // 2 nodes waiting for time1 - awaiter.setOpTime(time1); - awaiter.setWriteConcern(writeConcern); - awaiter.start(&txn); - getReplCoord()->setMyLastOptime(time1); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(client1, time1)); - ReplicationCoordinator::StatusAndDuration statusAndDur = awaiter.getResult(); - ASSERT_OK(statusAndDur.status); - awaiter.reset(); - - // 2 nodes waiting for time2 - awaiter.setOpTime(time2); - awaiter.start(&txn); - getReplCoord()->setMyLastOptime(time2); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(client1, time2)); - statusAndDur = awaiter.getResult(); - ASSERT_OK(statusAndDur.status); - awaiter.reset(); - - // 3 nodes waiting for time2 - writeConcern.wNumNodes = 3; - awaiter.setWriteConcern(writeConcern); - awaiter.start(&txn); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(client2, time2)); - statusAndDur = awaiter.getResult(); - ASSERT_OK(statusAndDur.status); - awaiter.reset(); - } - - TEST_F(ReplCoordTest, AwaitReplicationTimeout) { - OperationContextNoop txn; - assertStartSuccess( - BSON("_id" << "mySet" << - "version" << 2 << - 
"members" << BSON_ARRAY(BSON("host" << "node1:12345" << "_id" << 0) << - BSON("host" << "node2:12345" << "_id" << 1) << - BSON("host" << "node3:12345" << "_id" << 2))), - HostAndPort("node1", 12345)); - ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); - getReplCoord()->setMyLastOptime(OpTime(100, 0)); - simulateSuccessfulElection(); - - ReplicationAwaiter awaiter(getReplCoord(), &txn); - - OID client = OID::gen(); - OpTime time1(100, 1); - OpTime time2(100, 2); - - HandshakeArgs handshake; - ASSERT_OK(handshake.initialize(BSON("handshake" << client << "member" << 1))); - ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake)); - - WriteConcernOptions writeConcern; - writeConcern.wTimeout = 50; - writeConcern.wNumNodes = 2; - - // 2 nodes waiting for time2 - awaiter.setOpTime(time2); - awaiter.setWriteConcern(writeConcern); - awaiter.start(&txn); - getReplCoord()->setMyLastOptime(time2); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(client, time1)); - ReplicationCoordinator::StatusAndDuration statusAndDur = awaiter.getResult(); - ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit, statusAndDur.status); - awaiter.reset(); +}; + +TEST_F(StepDownTest, StepDownNotPrimary) { + OperationContextReplMock txn; + OpTime optime1(100, 1); + // All nodes are caught up + getReplCoord()->setMyLastOptime(optime1); + ASSERT_OK(getReplCoord()->setLastOptime_forTest(rid2, optime1)); + ASSERT_OK(getReplCoord()->setLastOptime_forTest(rid3, optime1)); + + Status status = getReplCoord()->stepDown(&txn, false, Milliseconds(0), Milliseconds(0)); + ASSERT_EQUALS(ErrorCodes::NotMaster, status); + ASSERT_TRUE(getReplCoord()->getMemberState().secondary()); +} + +TEST_F(StepDownTest, StepDownTimeoutAcquiringGlobalLock) { + OperationContextReplMock txn; + OpTime optime1(100, 1); + // All nodes are caught up + getReplCoord()->setMyLastOptime(optime1); + ASSERT_OK(getReplCoord()->setLastOptime_forTest(rid2, optime1)); + ASSERT_OK(getReplCoord()->setLastOptime_forTest(rid3, optime1)); + + simulateSuccessfulElection(); + + // Make sure stepDown cannot grab the global shared lock + Lock::GlobalWrite lk(txn.lockState()); + + Status status = getReplCoord()->stepDown(&txn, false, Milliseconds(0), Milliseconds(1000)); + ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit, status); + ASSERT_TRUE(getReplCoord()->getMemberState().primary()); +} + +TEST_F(StepDownTest, StepDownNoWaiting) { + OperationContextReplMock txn; + OpTime optime1(100, 1); + // All nodes are caught up + getReplCoord()->setMyLastOptime(optime1); + ASSERT_OK(getReplCoord()->setLastOptime_forTest(rid2, optime1)); + ASSERT_OK(getReplCoord()->setLastOptime_forTest(rid3, optime1)); + + simulateSuccessfulElection(); + + enterNetwork(); + getNet()->runUntil(getNet()->now() + 2000); + ASSERT(getNet()->hasReadyRequests()); + NetworkInterfaceMock::NetworkOperationIterator noi = getNet()->getNextReadyRequest(); + ReplicationExecutor::RemoteCommandRequest request = noi->getRequest(); + log() << request.target.toString() << " processing " << request.cmdObj; + ReplSetHeartbeatArgs hbArgs; + if (hbArgs.initialize(request.cmdObj).isOK()) { + ReplSetHeartbeatResponse hbResp; + hbResp.setSetName(hbArgs.getSetName()); + hbResp.setState(MemberState::RS_SECONDARY); + hbResp.setVersion(hbArgs.getConfigVersion()); + hbResp.setOpTime(optime1); + BSONObjBuilder respObj; + respObj << "ok" << 1; + hbResp.addToBSON(&respObj); + getNet()->scheduleResponse(noi, getNet()->now(), makeResponseStatus(respObj.obj())); } - - TEST_F(ReplCoordTest, AwaitReplicationShutdown) { - 
OperationContextNoop txn; - assertStartSuccess( - BSON("_id" << "mySet" << - "version" << 2 << - "members" << BSON_ARRAY(BSON("host" << "node1:12345" << "_id" << 0) << - BSON("host" << "node2:12345" << "_id" << 1) << - BSON("host" << "node3:12345" << "_id" << 2))), - HostAndPort("node1", 12345)); - ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); - getReplCoord()->setMyLastOptime(OpTime(100, 0)); - simulateSuccessfulElection(); - - ReplicationAwaiter awaiter(getReplCoord(), &txn); - - OID client1 = OID::gen(); - OID client2 = OID::gen(); - OpTime time1(100, 1); - OpTime time2(100, 2); - - HandshakeArgs handshake1; - ASSERT_OK(handshake1.initialize(BSON("handshake" << client1 << "member" << 1))); - ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake1)); - HandshakeArgs handshake2; - ASSERT_OK(handshake2.initialize(BSON("handshake" << client2 << "member" << 2))); - ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake2)); - - WriteConcernOptions writeConcern; - writeConcern.wTimeout = WriteConcernOptions::kNoTimeout; - writeConcern.wNumNodes = 2; - - // 2 nodes waiting for time2 - awaiter.setOpTime(time2); - awaiter.setWriteConcern(writeConcern); - awaiter.start(&txn); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(client1, time1)); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(client2, time1)); - shutdown(); - ReplicationCoordinator::StatusAndDuration statusAndDur = awaiter.getResult(); - ASSERT_EQUALS(ErrorCodes::ShutdownInProgress, statusAndDur.status); - awaiter.reset(); + while (getNet()->hasReadyRequests()) { + getNet()->blackHole(getNet()->getNextReadyRequest()); } + getNet()->runReadyNetworkOperations(); + exitNetwork(); + + + ASSERT_TRUE(getReplCoord()->getMemberState().primary()); + ASSERT_OK(getReplCoord()->stepDown(&txn, false, Milliseconds(0), Milliseconds(1000))); + enterNetwork(); // So we can safely inspect the topology coordinator + ASSERT_EQUALS(Date_t(getNet()->now().millis + 1000), getTopoCoord().getStepDownTime()); + ASSERT_TRUE(getTopoCoord().getMemberState().secondary()); + exitNetwork(); + ASSERT_TRUE(getReplCoord()->getMemberState().secondary()); +} + +TEST_F(ReplCoordTest, StepDownAndBackUpSingleNode) { + init("mySet"); + + assertStartSuccess(BSON("_id" + << "mySet" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "test1:1234"))), + HostAndPort("test1", 1234)); + OperationContextReplMock txn; + getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY); + + ASSERT_TRUE(getReplCoord()->getMemberState().primary()); + ASSERT_OK(getReplCoord()->stepDown(&txn, true, Milliseconds(0), Milliseconds(1000))); + getNet()->enterNetwork(); // Must do this before inspecting the topocoord + Date_t stepdownUntil = Date_t(getNet()->now().millis + 1000); + ASSERT_EQUALS(stepdownUntil, getTopoCoord().getStepDownTime()); + ASSERT_TRUE(getTopoCoord().getMemberState().secondary()); + ASSERT_TRUE(getReplCoord()->getMemberState().secondary()); + + // Now run time forward and make sure that the node becomes primary again when the stepdown + // period ends. + getNet()->runUntil(stepdownUntil); + ASSERT_EQUALS(stepdownUntil, getNet()->now()); + ASSERT_TRUE(getTopoCoord().getMemberState().primary()); + getNet()->exitNetwork(); + ASSERT_TRUE(getReplCoord()->getMemberState().primary()); +} - TEST_F(ReplCoordTest, AwaitReplicationStepDown) { - // Test that a thread blocked in awaitReplication will be woken up and return NotMaster - // if the node steps down while it is waiting. 
- OperationContextReplMock txn; - assertStartSuccess( - BSON("_id" << "mySet" << - "version" << 2 << - "members" << BSON_ARRAY(BSON("host" << "node1:12345" << "_id" << 0) << - BSON("host" << "node2:12345" << "_id" << 1) << - BSON("host" << "node3:12345" << "_id" << 2))), - HostAndPort("node1", 12345)); - ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); - getReplCoord()->setMyLastOptime(OpTime(100, 0)); - simulateSuccessfulElection(); - - ReplicationAwaiter awaiter(getReplCoord(), &txn); - - OID client1 = OID::gen(); - OID client2 = OID::gen(); - OpTime time1(100, 1); - OpTime time2(100, 2); - - HandshakeArgs handshake1; - ASSERT_OK(handshake1.initialize(BSON("handshake" << client1 << "member" << 1))); - ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake1)); - HandshakeArgs handshake2; - ASSERT_OK(handshake2.initialize(BSON("handshake" << client2 << "member" << 2))); - ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake2)); - - WriteConcernOptions writeConcern; - writeConcern.wTimeout = WriteConcernOptions::kNoTimeout; - writeConcern.wNumNodes = 2; - - // 2 nodes waiting for time2 - awaiter.setOpTime(time2); - awaiter.setWriteConcern(writeConcern); - awaiter.start(&txn); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(client1, time1)); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(client2, time1)); - getReplCoord()->stepDown(&txn, true, Milliseconds(0), Milliseconds(1000)); - ReplicationCoordinator::StatusAndDuration statusAndDur = awaiter.getResult(); - ASSERT_EQUALS(ErrorCodes::NotMaster, statusAndDur.status); - awaiter.reset(); +/** + * Used to run wait for stepDown() to finish in a separate thread without blocking execution of + * the test. To use, set the values of "force", "waitTime", and "stepDownTime", which will be + * used as the arguments passed to stepDown, and then call + * start(), which will spawn a thread that calls stepDown. No calls may be made + * on the StepDownRunner instance between calling start and getResult(). After returning + * from getResult(), you can call reset() to allow the StepDownRunner to be reused for another + * stepDown call. + */ +class StepDownRunner { +public: + StepDownRunner(ReplicationCoordinatorImpl* replCoord) + : _replCoord(replCoord), + _finished(false), + _result(Status::OK()), + _force(false), + _waitTime(0), + _stepDownTime(0) {} + + // may block + Status getResult() { + _thread->join(); + ASSERT(_finished); + return _result; } - class OperationContextNoopWithInterrupt : public OperationContextReplMock { - public: - - OperationContextNoopWithInterrupt() : _opID(0), _interruptOp(false) {} - - virtual unsigned int getOpID() const { - return _opID; - } - - /** - * Can only be called before any multi-threaded access to this object has begun. - */ - void setOpID(unsigned int opID) { - _opID = opID; - } - - virtual void checkForInterrupt() const { - if (_interruptOp) { - uasserted(ErrorCodes::Interrupted, "operation was interrupted"); - } - } - - virtual Status checkForInterruptNoAssert() const { - if (_interruptOp) { - return Status(ErrorCodes::Interrupted, "operation was interrupted"); - } - return Status::OK(); - } - - /** - * Can only be called before any multi-threaded access to this object has begun. 
- */ - void setInterruptOp(bool interrupt) { - _interruptOp = interrupt; - } - - private: - unsigned int _opID; - bool _interruptOp; - }; - - TEST_F(ReplCoordTest, AwaitReplicationInterrupt) { - // Tests that a thread blocked in awaitReplication can be killed by a killOp operation - OperationContextNoopWithInterrupt txn; - assertStartSuccess( - BSON("_id" << "mySet" << - "version" << 2 << - "members" << BSON_ARRAY(BSON("_id" << 0 << "host" << "node1") << - BSON("_id" << 1 << "host" << "node2") << - BSON("_id" << 2 << "host" << "node3"))), - HostAndPort("node1")); - ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); - getReplCoord()->setMyLastOptime(OpTime(100, 0)); - simulateSuccessfulElection(); - - ReplicationAwaiter awaiter(getReplCoord(), &txn); - - OID client1 = OID::gen(); - OID client2 = OID::gen(); - OpTime time1(100, 1); - OpTime time2(100, 2); - - HandshakeArgs handshake1; - ASSERT_OK(handshake1.initialize(BSON("handshake" << client1 << "member" << 1))); - ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake1)); - HandshakeArgs handshake2; - ASSERT_OK(handshake2.initialize(BSON("handshake" << client2 << "member" << 2))); - ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake2)); - - WriteConcernOptions writeConcern; - writeConcern.wTimeout = WriteConcernOptions::kNoTimeout; - writeConcern.wNumNodes = 2; - - unsigned int opID = 100; - txn.setOpID(opID); - - // 2 nodes waiting for time2 - awaiter.setOpTime(time2); - awaiter.setWriteConcern(writeConcern); - awaiter.start(&txn); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(client1, time1)); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(client2, time1)); - - txn.setInterruptOp(true); - getReplCoord()->interrupt(opID); - ReplicationCoordinator::StatusAndDuration statusAndDur = awaiter.getResult(); - ASSERT_EQUALS(ErrorCodes::Interrupted, statusAndDur.status); - awaiter.reset(); + void start(OperationContext* txn) { + ASSERT(!_finished); + _thread.reset(new boost::thread(stdx::bind(&StepDownRunner::_stepDown, this, txn))); } - class StepDownTest : public ReplCoordTest { - protected: - OID myRid; - OID rid2; - OID rid3; - - private: - virtual void setUp() { - ReplCoordTest::setUp(); - init("mySet/test1:1234,test2:1234,test3:1234"); - - assertStartSuccess( - BSON("_id" << "mySet" << - "version" << 1 << - "members" << BSON_ARRAY(BSON("_id" << 0 << "host" << "test1:1234") << - BSON("_id" << 1 << "host" << "test2:1234") << - BSON("_id" << 2 << "host" << "test3:1234"))), - HostAndPort("test1", 1234)); - ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); - myRid = getReplCoord()->getMyRID(); - rid2 = OID::gen(); - rid3 = OID::gen(); - HandshakeArgs handshake2; - handshake2.initialize(BSON("handshake" << rid2 << - "member" << 1 << - "config" << BSON("_id" << 1 << "host" << "test2:1234"))); - HandshakeArgs handshake3; - handshake3.initialize(BSON("handshake" << rid3 << - "member" << 2 << - "config" << BSON("_id" << 2 << "host" << "test3:1234"))); - OperationContextNoop txn; - ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake2)); - ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake3)); - } - }; - - TEST_F(StepDownTest, StepDownNotPrimary) { - OperationContextReplMock txn; - OpTime optime1(100, 1); - // All nodes are caught up - getReplCoord()->setMyLastOptime(optime1); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(rid2, optime1)); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(rid3, optime1)); - - Status status = getReplCoord()->stepDown(&txn, false, Milliseconds(0), 
Milliseconds(0)); - ASSERT_EQUALS(ErrorCodes::NotMaster, status); - ASSERT_TRUE(getReplCoord()->getMemberState().secondary()); + void reset() { + ASSERT(_finished); + _finished = false; + _result = Status(ErrorCodes::InternalError, "Result Status never set"); } - TEST_F(StepDownTest, StepDownTimeoutAcquiringGlobalLock) { - OperationContextReplMock txn; - OpTime optime1(100, 1); - // All nodes are caught up - getReplCoord()->setMyLastOptime(optime1); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(rid2, optime1)); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(rid3, optime1)); - - simulateSuccessfulElection(); - - // Make sure stepDown cannot grab the global shared lock - Lock::GlobalWrite lk(txn.lockState()); - - Status status = getReplCoord()->stepDown(&txn, false, Milliseconds(0), Milliseconds(1000)); - ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit, status); - ASSERT_TRUE(getReplCoord()->getMemberState().primary()); + void setForce(bool force) { + _force = force; } - TEST_F(StepDownTest, StepDownNoWaiting) { - OperationContextReplMock txn; - OpTime optime1(100, 1); - // All nodes are caught up - getReplCoord()->setMyLastOptime(optime1); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(rid2, optime1)); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(rid3, optime1)); - - simulateSuccessfulElection(); - - enterNetwork(); - getNet()->runUntil(getNet()->now() + 2000); - ASSERT(getNet()->hasReadyRequests()); - NetworkInterfaceMock::NetworkOperationIterator noi = getNet()->getNextReadyRequest(); - ReplicationExecutor::RemoteCommandRequest request = noi->getRequest(); - log() << request.target.toString() << " processing " << request.cmdObj; - ReplSetHeartbeatArgs hbArgs; - if (hbArgs.initialize(request.cmdObj).isOK()) { - ReplSetHeartbeatResponse hbResp; - hbResp.setSetName(hbArgs.getSetName()); - hbResp.setState(MemberState::RS_SECONDARY); - hbResp.setVersion(hbArgs.getConfigVersion()); - hbResp.setOpTime(optime1); - BSONObjBuilder respObj; - respObj << "ok" << 1; - hbResp.addToBSON(&respObj); - getNet()->scheduleResponse(noi, getNet()->now(), makeResponseStatus(respObj.obj())); - } - while (getNet()->hasReadyRequests()) { - getNet()->blackHole(getNet()->getNextReadyRequest()); - } - getNet()->runReadyNetworkOperations(); - exitNetwork(); - - - ASSERT_TRUE(getReplCoord()->getMemberState().primary()); - ASSERT_OK(getReplCoord()->stepDown(&txn, false, Milliseconds(0), Milliseconds(1000))); - enterNetwork(); // So we can safely inspect the topology coordinator - ASSERT_EQUALS(Date_t(getNet()->now().millis + 1000), getTopoCoord().getStepDownTime()); - ASSERT_TRUE(getTopoCoord().getMemberState().secondary()); - exitNetwork(); - ASSERT_TRUE(getReplCoord()->getMemberState().secondary()); + void setWaitTime(const Milliseconds& waitTime) { + _waitTime = waitTime; } - TEST_F(ReplCoordTest, StepDownAndBackUpSingleNode) { - init("mySet"); - - assertStartSuccess( - BSON("_id" << "mySet" << - "version" << 1 << - "members" << BSON_ARRAY(BSON("_id" << 0 << "host" << "test1:1234"))), - HostAndPort("test1", 1234)); - OperationContextReplMock txn; - getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY); - - ASSERT_TRUE(getReplCoord()->getMemberState().primary()); - ASSERT_OK(getReplCoord()->stepDown(&txn, true, Milliseconds(0), Milliseconds(1000))); - getNet()->enterNetwork(); // Must do this before inspecting the topocoord - Date_t stepdownUntil = Date_t(getNet()->now().millis + 1000); - ASSERT_EQUALS(stepdownUntil, getTopoCoord().getStepDownTime()); - 
ASSERT_TRUE(getTopoCoord().getMemberState().secondary()); - ASSERT_TRUE(getReplCoord()->getMemberState().secondary()); - - // Now run time forward and make sure that the node becomes primary again when the stepdown - // period ends. - getNet()->runUntil(stepdownUntil); - ASSERT_EQUALS(stepdownUntil, getNet()->now()); - ASSERT_TRUE(getTopoCoord().getMemberState().primary()); - getNet()->exitNetwork(); - ASSERT_TRUE(getReplCoord()->getMemberState().primary()); + void setStepDownTime(const Milliseconds& stepDownTime) { + _stepDownTime = stepDownTime; } - /** - * Used to run wait for stepDown() to finish in a separate thread without blocking execution of - * the test. To use, set the values of "force", "waitTime", and "stepDownTime", which will be - * used as the arguments passed to stepDown, and then call - * start(), which will spawn a thread that calls stepDown. No calls may be made - * on the StepDownRunner instance between calling start and getResult(). After returning - * from getResult(), you can call reset() to allow the StepDownRunner to be reused for another - * stepDown call. - */ - class StepDownRunner { - public: - - StepDownRunner(ReplicationCoordinatorImpl* replCoord) : - _replCoord(replCoord), _finished(false), _result(Status::OK()), _force(false), - _waitTime(0), _stepDownTime(0) {} - - // may block - Status getResult() { - _thread->join(); - ASSERT(_finished); - return _result; - } - - void start(OperationContext* txn) { - ASSERT(!_finished); - _thread.reset(new boost::thread(stdx::bind(&StepDownRunner::_stepDown, - this, - txn))); - } - - void reset() { - ASSERT(_finished); - _finished = false; - _result = Status(ErrorCodes::InternalError, "Result Status never set"); - } - - void setForce(bool force) { - _force = force; - } - - void setWaitTime(const Milliseconds& waitTime) { - _waitTime = waitTime; - } - - void setStepDownTime(const Milliseconds& stepDownTime) { - _stepDownTime = stepDownTime; - } - - private: - - void _stepDown(OperationContext* txn) { - _result = _replCoord->stepDown(txn, _force, _waitTime, _stepDownTime); - _finished = true; - } - - ReplicationCoordinatorImpl* _replCoord; - bool _finished; - Status _result; - boost::scoped_ptr<boost::thread> _thread; - bool _force; - Milliseconds _waitTime; - Milliseconds _stepDownTime; - }; - - TEST_F(StepDownTest, StepDownNotCaughtUp) { - OperationContextReplMock txn; - OpTime optime1(100, 1); - OpTime optime2(100, 2); - // No secondary is caught up - getReplCoord()->setMyLastOptime(optime2); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(rid2, optime1)); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(rid3, optime1)); - - // Try to stepDown but time out because no secondaries are caught up - StepDownRunner runner(getReplCoord()); - runner.setForce(false); - runner.setWaitTime(Milliseconds(0)); - runner.setStepDownTime(Milliseconds(1000)); - - simulateSuccessfulElection(); - - runner.start(&txn); - Status status = runner.getResult(); - ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit, status); - ASSERT_TRUE(getReplCoord()->getMemberState().primary()); - - // Now use "force" to force it to step down even though no one is caught up - runner.reset(); - getNet()->enterNetwork(); - const Date_t startDate = getNet()->now(); - while (startDate + 1000 < getNet()->now()) { - while (getNet()->hasReadyRequests()) { - getNet()->blackHole(getNet()->getNextReadyRequest()); - } - getNet()->runUntil(startDate + 1000); - } - getNet()->exitNetwork(); - ASSERT_TRUE(getReplCoord()->getMemberState().primary()); - runner.setForce(true); - 
runner.start(&txn);
-        status = runner.getResult();
-        ASSERT_OK(status);
-        ASSERT_TRUE(getReplCoord()->getMemberState().secondary());
-
+private:
+    void _stepDown(OperationContext* txn) {
+        _result = _replCoord->stepDown(txn, _force, _waitTime, _stepDownTime);
+        _finished = true;
    }

-    TEST_F(StepDownTest, StepDownCatchUp) {
-        OperationContextReplMock txn;
-        OpTime optime1(100, 1);
-        OpTime optime2(100, 2);
-        // No secondary is caught up
-        getReplCoord()->setMyLastOptime(optime2);
-        ASSERT_OK(getReplCoord()->setLastOptime_forTest(rid2, optime1));
-        ASSERT_OK(getReplCoord()->setLastOptime_forTest(rid3, optime1));
-
-        // stepDown where the secondary actually has to catch up before the stepDown can succeed
-        StepDownRunner runner(getReplCoord());
-        runner.setForce(false);
-        runner.setWaitTime(Milliseconds(10000));
-        runner.setStepDownTime(Milliseconds(60000));
-
-        simulateSuccessfulElection();
-
-        runner.start(&txn);
-
-        // Make a secondary actually catch up
-        enterNetwork();
-        getNet()->runUntil(getNet()->now() + 2000);
-        ASSERT(getNet()->hasReadyRequests());
-        NetworkInterfaceMock::NetworkOperationIterator noi = getNet()->getNextReadyRequest();
-        ReplicationExecutor::RemoteCommandRequest request = noi->getRequest();
-        log() << request.target.toString() << " processing " << request.cmdObj;
-        ReplSetHeartbeatArgs hbArgs;
-        if (hbArgs.initialize(request.cmdObj).isOK()) {
-            ReplSetHeartbeatResponse hbResp;
-            hbResp.setSetName(hbArgs.getSetName());
-            hbResp.setState(MemberState::RS_SECONDARY);
-            hbResp.setVersion(hbArgs.getConfigVersion());
-            hbResp.setOpTime(optime2);
-            BSONObjBuilder respObj;
-            respObj << "ok" << 1;
-            hbResp.addToBSON(&respObj);
-            getNet()->scheduleResponse(noi, getNet()->now(), makeResponseStatus(respObj.obj()));
-        }
+    ReplicationCoordinatorImpl* _replCoord;
+    bool _finished;
+    Status _result;
+    boost::scoped_ptr<boost::thread> _thread;
+    bool _force;
+    Milliseconds _waitTime;
+    Milliseconds _stepDownTime;
+};
+
+TEST_F(StepDownTest, StepDownNotCaughtUp) {
+    OperationContextReplMock txn;
+    OpTime optime1(100, 1);
+    OpTime optime2(100, 2);
+    // No secondary is caught up
+    getReplCoord()->setMyLastOptime(optime2);
+    ASSERT_OK(getReplCoord()->setLastOptime_forTest(rid2, optime1));
+    ASSERT_OK(getReplCoord()->setLastOptime_forTest(rid3, optime1));
+
+    // Try to stepDown but time out because no secondaries are caught up
+    StepDownRunner runner(getReplCoord());
+    runner.setForce(false);
+    runner.setWaitTime(Milliseconds(0));
+    runner.setStepDownTime(Milliseconds(1000));
+
+    simulateSuccessfulElection();
+
+    runner.start(&txn);
+    Status status = runner.getResult();
+    ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit, status);
+    ASSERT_TRUE(getReplCoord()->getMemberState().primary());
+
+    // Now use "force" to force it to step down even though no one is caught up
+    runner.reset();
+    getNet()->enterNetwork();
+    const Date_t startDate = getNet()->now();
+    while (getNet()->now() < startDate + 1000) {
        while (getNet()->hasReadyRequests()) {
            getNet()->blackHole(getNet()->getNextReadyRequest());
        }
-        getNet()->runReadyNetworkOperations();
-        exitNetwork();
-
-        ASSERT_OK(runner.getResult());
-        ASSERT_TRUE(getReplCoord()->getMemberState().secondary());
+        getNet()->runUntil(startDate + 1000);
    }
-
-    TEST_F(StepDownTest, InterruptStepDown) {
-        OperationContextNoopWithInterrupt txn;
-        OpTime optime1(100, 1);
-        OpTime optime2(100, 2);
-        // No secondary is caught up
-        getReplCoord()->setMyLastOptime(optime2);
-        ASSERT_OK(getReplCoord()->setLastOptime_forTest(rid2, optime1));
-
ASSERT_OK(getReplCoord()->setLastOptime_forTest(rid3, optime1)); - - // stepDown where the secondary actually has to catch up before the stepDown can succeed - StepDownRunner runner(getReplCoord()); - runner.setForce(false); - runner.setWaitTime(Milliseconds(10000)); - runner.setStepDownTime(Milliseconds(60000)); - - simulateSuccessfulElection(); - ASSERT_TRUE(getReplCoord()->getMemberState().primary()); - - runner.start(&txn); - - unsigned int opID = 100; - txn.setOpID(opID); - txn.setInterruptOp(true); - getReplCoord()->interrupt(opID); - - ASSERT_EQUALS(ErrorCodes::Interrupted, runner.getResult()); - ASSERT_TRUE(getReplCoord()->getMemberState().primary()); - } - - TEST_F(ReplCoordTest, GetReplicationModeNone) { - init(); - ASSERT_EQUALS(MemberState::RS_STARTUP, getReplCoord()->getMemberState().s); - } - - TEST_F(ReplCoordTest, GetReplicationModeMaster) { - // modeMasterSlave if master set - ReplSettings settings; - settings.master = true; - init(settings); - ASSERT_EQUALS(ReplicationCoordinator::modeMasterSlave, - getReplCoord()->getReplicationMode()); - } - - TEST_F(ReplCoordTest, GetReplicationModeSlave) { - // modeMasterSlave if the slave flag was set - ReplSettings settings; - settings.slave = SimpleSlave; - init(settings); - ASSERT_EQUALS(ReplicationCoordinator::modeMasterSlave, - getReplCoord()->getReplicationMode()); + getNet()->exitNetwork(); + ASSERT_TRUE(getReplCoord()->getMemberState().primary()); + runner.setForce(true); + runner.start(&txn); + status = runner.getResult(); + ASSERT_OK(status); + ASSERT_TRUE(getReplCoord()->getMemberState().secondary()); +} + +TEST_F(StepDownTest, StepDownCatchUp) { + OperationContextReplMock txn; + OpTime optime1(100, 1); + OpTime optime2(100, 2); + // No secondary is caught up + getReplCoord()->setMyLastOptime(optime2); + ASSERT_OK(getReplCoord()->setLastOptime_forTest(rid2, optime1)); + ASSERT_OK(getReplCoord()->setLastOptime_forTest(rid3, optime1)); + + // stepDown where the secondary actually has to catch up before the stepDown can succeed + StepDownRunner runner(getReplCoord()); + runner.setForce(false); + runner.setWaitTime(Milliseconds(10000)); + runner.setStepDownTime(Milliseconds(60000)); + + simulateSuccessfulElection(); + + runner.start(&txn); + + // Make a secondary actually catch up + enterNetwork(); + getNet()->runUntil(getNet()->now() + 2000); + ASSERT(getNet()->hasReadyRequests()); + NetworkInterfaceMock::NetworkOperationIterator noi = getNet()->getNextReadyRequest(); + ReplicationExecutor::RemoteCommandRequest request = noi->getRequest(); + log() << request.target.toString() << " processing " << request.cmdObj; + ReplSetHeartbeatArgs hbArgs; + if (hbArgs.initialize(request.cmdObj).isOK()) { + ReplSetHeartbeatResponse hbResp; + hbResp.setSetName(hbArgs.getSetName()); + hbResp.setState(MemberState::RS_SECONDARY); + hbResp.setVersion(hbArgs.getConfigVersion()); + hbResp.setOpTime(optime2); + BSONObjBuilder respObj; + respObj << "ok" << 1; + hbResp.addToBSON(&respObj); + getNet()->scheduleResponse(noi, getNet()->now(), makeResponseStatus(respObj.obj())); } - - TEST_F(ReplCoordTest, GetReplicationModeRepl) { - // modeReplSet if the set name was supplied. 
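// The step-down tests above and below all drive the simulated network with the
// same pump; a condensed sketch of that pattern (illustrative only; stitched
// together from the surrounding tests, not new code in this commit):
enterNetwork();                              // take exclusive control of the NetworkInterfaceMock
getNet()->runUntil(getNet()->now() + 2000);  // advance simulated time until heartbeats come due
ASSERT(getNet()->hasReadyRequests());        // the tests assert a request is waiting
NetworkInterfaceMock::NetworkOperationIterator noi = getNet()->getNextReadyRequest();
getNet()->scheduleResponse(noi, getNet()->now(), makeResponseStatus(BSON("ok" << 1)));
while (getNet()->hasReadyRequests()) {
    getNet()->blackHole(getNet()->getNextReadyRequest());  // drop requests the test ignores
}
getNet()->runReadyNetworkOperations();  // deliver the scheduled response to the executor
exitNetwork();                          // hand the network back to the executor threads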
- ReplSettings settings; - settings.replSet = "mySet/node1:12345"; - init(settings); - ASSERT_EQUALS(ReplicationCoordinator::modeReplSet, getReplCoord()->getReplicationMode()); - ASSERT_EQUALS(MemberState::RS_STARTUP, getReplCoord()->getMemberState().s); - assertStartSuccess( - BSON("_id" << "mySet" << - "version" << 2 << - "members" << BSON_ARRAY(BSON("host" << "node1:12345" << "_id" << 0 ))), - HostAndPort("node1", 12345)); + while (getNet()->hasReadyRequests()) { + getNet()->blackHole(getNet()->getNextReadyRequest()); } - - TEST_F(ReplCoordTest, TestPrepareReplSetUpdatePositionCommand) { - OperationContextNoop txn; - init("mySet/test1:1234,test2:1234,test3:1234"); - assertStartSuccess( - BSON("_id" << "mySet" << - "version" << 1 << - "members" << BSON_ARRAY(BSON("_id" << 0 << "host" << "test1:1234") << - BSON("_id" << 1 << "host" << "test2:1234") << - BSON("_id" << 2 << "host" << "test3:1234"))), - HostAndPort("test1", 1234)); - OID myRid = getReplCoord()->getMyRID(); - OID rid2 = OID::gen(); - OID rid3 = OID::gen(); - HandshakeArgs handshake2; - handshake2.initialize(BSON("handshake" << rid2 << - "member" << 1 << - "config" << BSON("_id" << 1 << "host" << "test2:1234"))); - HandshakeArgs handshake3; - handshake3.initialize(BSON("handshake" << rid3 << - "member" << 2 << - "config" << BSON("_id" << 2 << "host" << "test3:1234"))); - ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake2)); - ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake3)); - OpTime optime1(100, 1); - OpTime optime2(100, 2); - OpTime optime3(2, 1); - getReplCoord()->setMyLastOptime(optime1); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(rid2, optime2)); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(rid3, optime3)); - - // Check that the proper BSON is generated for the replSetUpdatePositionCommand - BSONObjBuilder cmdBuilder; - getReplCoord()->prepareReplSetUpdatePositionCommand(&cmdBuilder); - BSONObj cmd = cmdBuilder.done(); - - ASSERT_EQUALS(2, cmd.nFields()); - ASSERT_EQUALS("replSetUpdatePosition", cmd.firstElement().fieldNameStringData()); - - std::set<OID> rids; - BSONForEach(entryElement, cmd["optimes"].Obj()) { - BSONObj entry = entryElement.Obj(); - OID rid = entry["_id"].OID(); - rids.insert(rid); - if (rid == myRid) { - ASSERT_EQUALS(optime1, entry["optime"]._opTime()); - } else if (rid == rid2) { - ASSERT_EQUALS(optime2, entry["optime"]._opTime()); - } else { - ASSERT_EQUALS(rid3, rid); - ASSERT_EQUALS(optime3, entry["optime"]._opTime()); - } + getNet()->runReadyNetworkOperations(); + exitNetwork(); + + ASSERT_OK(runner.getResult()); + ASSERT_TRUE(getReplCoord()->getMemberState().secondary()); +} + +TEST_F(StepDownTest, InterruptStepDown) { + OperationContextNoopWithInterrupt txn; + OpTime optime1(100, 1); + OpTime optime2(100, 2); + // No secondary is caught up + getReplCoord()->setMyLastOptime(optime2); + ASSERT_OK(getReplCoord()->setLastOptime_forTest(rid2, optime1)); + ASSERT_OK(getReplCoord()->setLastOptime_forTest(rid3, optime1)); + + // stepDown where the secondary actually has to catch up before the stepDown can succeed + StepDownRunner runner(getReplCoord()); + runner.setForce(false); + runner.setWaitTime(Milliseconds(10000)); + runner.setStepDownTime(Milliseconds(60000)); + + simulateSuccessfulElection(); + ASSERT_TRUE(getReplCoord()->getMemberState().primary()); + + runner.start(&txn); + + unsigned int opID = 100; + txn.setOpID(opID); + txn.setInterruptOp(true); + getReplCoord()->interrupt(opID); + + ASSERT_EQUALS(ErrorCodes::Interrupted, runner.getResult()); + 
ASSERT_TRUE(getReplCoord()->getMemberState().primary()); +} + +TEST_F(ReplCoordTest, GetReplicationModeNone) { + init(); + ASSERT_EQUALS(MemberState::RS_STARTUP, getReplCoord()->getMemberState().s); +} + +TEST_F(ReplCoordTest, GetReplicationModeMaster) { + // modeMasterSlave if master set + ReplSettings settings; + settings.master = true; + init(settings); + ASSERT_EQUALS(ReplicationCoordinator::modeMasterSlave, getReplCoord()->getReplicationMode()); +} + +TEST_F(ReplCoordTest, GetReplicationModeSlave) { + // modeMasterSlave if the slave flag was set + ReplSettings settings; + settings.slave = SimpleSlave; + init(settings); + ASSERT_EQUALS(ReplicationCoordinator::modeMasterSlave, getReplCoord()->getReplicationMode()); +} + +TEST_F(ReplCoordTest, GetReplicationModeRepl) { + // modeReplSet if the set name was supplied. + ReplSettings settings; + settings.replSet = "mySet/node1:12345"; + init(settings); + ASSERT_EQUALS(ReplicationCoordinator::modeReplSet, getReplCoord()->getReplicationMode()); + ASSERT_EQUALS(MemberState::RS_STARTUP, getReplCoord()->getMemberState().s); + assertStartSuccess(BSON("_id" + << "mySet" + << "version" << 2 << "members" << BSON_ARRAY(BSON("host" + << "node1:12345" + << "_id" << 0))), + HostAndPort("node1", 12345)); +} + +TEST_F(ReplCoordTest, TestPrepareReplSetUpdatePositionCommand) { + OperationContextNoop txn; + init("mySet/test1:1234,test2:1234,test3:1234"); + assertStartSuccess( + BSON("_id" + << "mySet" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "test1:1234") + << BSON("_id" << 1 << "host" + << "test2:1234") << BSON("_id" << 2 << "host" + << "test3:1234"))), + HostAndPort("test1", 1234)); + OID myRid = getReplCoord()->getMyRID(); + OID rid2 = OID::gen(); + OID rid3 = OID::gen(); + HandshakeArgs handshake2; + handshake2.initialize( + BSON("handshake" << rid2 << "member" << 1 << "config" << BSON("_id" << 1 << "host" + << "test2:1234"))); + HandshakeArgs handshake3; + handshake3.initialize( + BSON("handshake" << rid3 << "member" << 2 << "config" << BSON("_id" << 2 << "host" + << "test3:1234"))); + ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake2)); + ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake3)); + OpTime optime1(100, 1); + OpTime optime2(100, 2); + OpTime optime3(2, 1); + getReplCoord()->setMyLastOptime(optime1); + ASSERT_OK(getReplCoord()->setLastOptime_forTest(rid2, optime2)); + ASSERT_OK(getReplCoord()->setLastOptime_forTest(rid3, optime3)); + + // Check that the proper BSON is generated for the replSetUpdatePositionCommand + BSONObjBuilder cmdBuilder; + getReplCoord()->prepareReplSetUpdatePositionCommand(&cmdBuilder); + BSONObj cmd = cmdBuilder.done(); + + ASSERT_EQUALS(2, cmd.nFields()); + ASSERT_EQUALS("replSetUpdatePosition", cmd.firstElement().fieldNameStringData()); + + std::set<OID> rids; + BSONForEach(entryElement, cmd["optimes"].Obj()) { + BSONObj entry = entryElement.Obj(); + OID rid = entry["_id"].OID(); + rids.insert(rid); + if (rid == myRid) { + ASSERT_EQUALS(optime1, entry["optime"]._opTime()); + } else if (rid == rid2) { + ASSERT_EQUALS(optime2, entry["optime"]._opTime()); + } else { + ASSERT_EQUALS(rid3, rid); + ASSERT_EQUALS(optime3, entry["optime"]._opTime()); } - ASSERT_EQUALS(3U, rids.size()); // Make sure we saw all 3 nodes } - - TEST_F(ReplCoordTest, TestHandshakes) { - init("mySet/test1:1234,test2:1234,test3:1234"); - assertStartSuccess( - BSON("_id" << "mySet" << - "version" << 1 << - "members" << BSON_ARRAY(BSON("_id" << 0 << "host" << "test1:1234") << - BSON("_id" << 1 << 
"host" << "test2:1234") << - BSON("_id" << 2 << "host" << "test3:1234"))), - HostAndPort("test2", 1234)); - // Test generating basic handshake with no chaining - std::vector<BSONObj> handshakes; - OperationContextNoop txn; - getReplCoord()->prepareReplSetUpdatePositionCommandHandshakes(&handshakes); - ASSERT_EQUALS(1U, handshakes.size()); - BSONObj handshakeCmd = handshakes[0]; + ASSERT_EQUALS(3U, rids.size()); // Make sure we saw all 3 nodes +} + +TEST_F(ReplCoordTest, TestHandshakes) { + init("mySet/test1:1234,test2:1234,test3:1234"); + assertStartSuccess( + BSON("_id" + << "mySet" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "test1:1234") + << BSON("_id" << 1 << "host" + << "test2:1234") << BSON("_id" << 2 << "host" + << "test3:1234"))), + HostAndPort("test2", 1234)); + // Test generating basic handshake with no chaining + std::vector<BSONObj> handshakes; + OperationContextNoop txn; + getReplCoord()->prepareReplSetUpdatePositionCommandHandshakes(&handshakes); + ASSERT_EQUALS(1U, handshakes.size()); + BSONObj handshakeCmd = handshakes[0]; + ASSERT_EQUALS(2, handshakeCmd.nFields()); + ASSERT_EQUALS("replSetUpdatePosition", handshakeCmd.firstElement().fieldNameStringData()); + BSONObj handshake = handshakeCmd["handshake"].Obj(); + ASSERT_EQUALS(getReplCoord()->getMyRID(), handshake["handshake"].OID()); + ASSERT_EQUALS(1, handshake["member"].Int()); + handshakes.clear(); + + // Have other nodes handshake us and make sure we process it right. + OID slave1RID = OID::gen(); + OID slave2RID = OID::gen(); + HandshakeArgs slave1Handshake; + slave1Handshake.initialize( + BSON("handshake" << slave1RID << "member" << 0 << "config" << BSON("_id" << 0 << "host" + << "test1:1234"))); + HandshakeArgs slave2Handshake; + slave2Handshake.initialize( + BSON("handshake" << slave2RID << "member" << 2 << "config" << BSON("_id" << 2 << "host" + << "test2:1234"))); + ASSERT_OK(getReplCoord()->processHandshake(&txn, slave1Handshake)); + ASSERT_OK(getReplCoord()->processHandshake(&txn, slave2Handshake)); + + getReplCoord()->prepareReplSetUpdatePositionCommandHandshakes(&handshakes); + ASSERT_EQUALS(3U, handshakes.size()); + std::set<OID> rids; + for (std::vector<BSONObj>::iterator it = handshakes.begin(); it != handshakes.end(); ++it) { + BSONObj handshakeCmd = *it; ASSERT_EQUALS(2, handshakeCmd.nFields()); ASSERT_EQUALS("replSetUpdatePosition", handshakeCmd.firstElement().fieldNameStringData()); - BSONObj handshake = handshakeCmd["handshake"].Obj(); - ASSERT_EQUALS(getReplCoord()->getMyRID(), handshake["handshake"].OID()); - ASSERT_EQUALS(1, handshake["member"].Int()); - handshakes.clear(); - - // Have other nodes handshake us and make sure we process it right. 
- OID slave1RID = OID::gen(); - OID slave2RID = OID::gen(); - HandshakeArgs slave1Handshake; - slave1Handshake.initialize(BSON("handshake" << slave1RID << - "member" << 0 << - "config" << BSON("_id" << 0 << "host" << "test1:1234"))); - HandshakeArgs slave2Handshake; - slave2Handshake.initialize(BSON("handshake" << slave2RID << - "member" << 2 << - "config" << BSON("_id" << 2 << "host" << "test2:1234"))); - ASSERT_OK(getReplCoord()->processHandshake(&txn, slave1Handshake)); - ASSERT_OK(getReplCoord()->processHandshake(&txn, slave2Handshake)); - - getReplCoord()->prepareReplSetUpdatePositionCommandHandshakes(&handshakes); - ASSERT_EQUALS(3U, handshakes.size()); - std::set<OID> rids; - for (std::vector<BSONObj>::iterator it = handshakes.begin(); it != handshakes.end(); ++it) { - BSONObj handshakeCmd = *it; - ASSERT_EQUALS(2, handshakeCmd.nFields()); - ASSERT_EQUALS("replSetUpdatePosition", - handshakeCmd.firstElement().fieldNameStringData()); - - BSONObj handshake = handshakeCmd["handshake"].Obj(); - OID rid = handshake["handshake"].OID(); - rids.insert(rid); - if (rid == getReplCoord()->getMyRID()) { - ASSERT_EQUALS(1, handshake["member"].Int()); - } else if (rid == slave1RID) { - ASSERT_EQUALS(0, handshake["member"].Int()); - } else { - ASSERT_EQUALS(slave2RID, rid); - ASSERT_EQUALS(2, handshake["member"].Int()); - } - } - ASSERT_EQUALS(3U, rids.size()); // Make sure we saw all 3 nodes - } - - TEST_F(ReplCoordTest, SetMaintenanceMode) { - init("mySet/test1:1234,test2:1234,test3:1234"); - assertStartSuccess( - BSON("_id" << "mySet" << - "version" << 1 << - "members" << BSON_ARRAY(BSON("_id" << 0 << "host" << "test1:1234") << - BSON("_id" << 1 << "host" << "test2:1234") << - BSON("_id" << 2 << "host" << "test3:1234"))), - HostAndPort("test2", 1234)); - OperationContextNoop txn; - getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY); - getReplCoord()->setMyLastOptime(OpTime(100, 0)); - - // Can't unset maintenance mode if it was never set to begin with. - Status status = getReplCoord()->setMaintenanceMode(false); - ASSERT_EQUALS(ErrorCodes::OperationFailed, status); - ASSERT_TRUE(getReplCoord()->getMemberState().secondary()); - - // valid set - ASSERT_OK(getReplCoord()->setMaintenanceMode(true)); - ASSERT_TRUE(getReplCoord()->getMemberState().recovering()); - - // If we go into rollback while in maintenance mode, our state changes to RS_ROLLBACK. - getReplCoord()->setFollowerMode(MemberState::RS_ROLLBACK); - ASSERT_TRUE(getReplCoord()->getMemberState().rollback()); - - // When we go back to SECONDARY, we still observe RECOVERING because of maintenance mode. - getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY); - ASSERT_TRUE(getReplCoord()->getMemberState().recovering()); - - // Can set multiple times - ASSERT_OK(getReplCoord()->setMaintenanceMode(true)); - ASSERT_OK(getReplCoord()->setMaintenanceMode(true)); - - // Need to unset the number of times you set - ASSERT_OK(getReplCoord()->setMaintenanceMode(false)); - ASSERT_OK(getReplCoord()->setMaintenanceMode(false)); - ASSERT_OK(getReplCoord()->setMaintenanceMode(false)); - status = getReplCoord()->setMaintenanceMode(false); - // fourth one fails b/c we only set three times - ASSERT_EQUALS(ErrorCodes::OperationFailed, status); - // Unsetting maintenance mode changes our state to secondary if maintenance mode was - // the only thinking keeping us out of it. - ASSERT_TRUE(getReplCoord()->getMemberState().secondary()); - - // From rollback, entering and exiting maintenance mode doesn't change perceived - // state. 
- getReplCoord()->setFollowerMode(MemberState::RS_ROLLBACK); - ASSERT_TRUE(getReplCoord()->getMemberState().rollback()); - ASSERT_OK(getReplCoord()->setMaintenanceMode(true)); - ASSERT_TRUE(getReplCoord()->getMemberState().rollback()); - ASSERT_OK(getReplCoord()->setMaintenanceMode(false)); - ASSERT_TRUE(getReplCoord()->getMemberState().rollback()); - - // Rollback is sticky even if entered while in maintenance mode. - getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY); - ASSERT_TRUE(getReplCoord()->getMemberState().secondary()); - ASSERT_OK(getReplCoord()->setMaintenanceMode(true)); - ASSERT_TRUE(getReplCoord()->getMemberState().recovering()); - getReplCoord()->setFollowerMode(MemberState::RS_ROLLBACK); - ASSERT_TRUE(getReplCoord()->getMemberState().rollback()); - ASSERT_OK(getReplCoord()->setMaintenanceMode(false)); - ASSERT_TRUE(getReplCoord()->getMemberState().rollback()); - getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY); - ASSERT_TRUE(getReplCoord()->getMemberState().secondary()); - - // Can't modify maintenance mode when PRIMARY - simulateSuccessfulElection(); - - status = getReplCoord()->setMaintenanceMode(true); - ASSERT_EQUALS(ErrorCodes::NotSecondary, status); - ASSERT_TRUE(getReplCoord()->getMemberState().primary()); - - simulateStepDownOnIsolation(); - - status = getReplCoord()->setMaintenanceMode(false); - ASSERT_EQUALS(ErrorCodes::OperationFailed, status); - ASSERT_OK(getReplCoord()->setMaintenanceMode(true)); - ASSERT_OK(getReplCoord()->setMaintenanceMode(false)); - } - - TEST_F(ReplCoordTest, GetHostsWrittenToReplSet) { - HostAndPort myHost("node1:12345"); - HostAndPort client1Host("node2:12345"); - HostAndPort client2Host("node3:12345") ; - assertStartSuccess( - BSON("_id" << "mySet" << - "version" << 2 << - "members" << BSON_ARRAY(BSON("_id" << 0 << "host" << myHost.toString()) << - BSON("_id" << 1 << "host" << client1Host.toString()) << - BSON("_id" << 2 << "host" << client2Host.toString()))), - HostAndPort("node1", 12345)); - OperationContextNoop txn; - - OID client1 = OID::gen(); - OID client2 = OID::gen(); - OpTime time1(100, 1); - OpTime time2(100, 2); - - HandshakeArgs handshake1; - ASSERT_OK(handshake1.initialize(BSON("handshake" << client1 << "member" << 1))); - ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake1)); - HandshakeArgs handshake2; - ASSERT_OK(handshake2.initialize(BSON("handshake" << client2 << "member" << 2))); - ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake2)); - - getReplCoord()->setMyLastOptime(time2); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(client1, time1)); - - std::vector<HostAndPort> caughtUpHosts = getReplCoord()->getHostsWrittenTo(time2); - ASSERT_EQUALS(1U, caughtUpHosts.size()); - ASSERT_EQUALS(myHost, caughtUpHosts[0]); - - ASSERT_OK(getReplCoord()->setLastOptime_forTest(client2, time2)); - caughtUpHosts = getReplCoord()->getHostsWrittenTo(time2); - ASSERT_EQUALS(2U, caughtUpHosts.size()); - if (myHost == caughtUpHosts[0]) { - ASSERT_EQUALS(client2Host, caughtUpHosts[1]); - } - else { - ASSERT_EQUALS(client2Host, caughtUpHosts[0]); - ASSERT_EQUALS(myHost, caughtUpHosts[1]); - } - } - TEST_F(ReplCoordTest, GetHostsWrittenToMasterSlave) { - ReplSettings settings; - settings.master = true; - init(settings); - HostAndPort clientHost("node2:12345"); - OperationContextNoop txn; - - OID client = OID::gen(); - OpTime time1(100, 1); - OpTime time2(100, 2); - - getExternalState()->setClientHostAndPort(clientHost); - HandshakeArgs handshake; - ASSERT_OK(handshake.initialize(BSON("handshake" << 
client))); - ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake)); - - getReplCoord()->setMyLastOptime(time2); - ASSERT_OK(getReplCoord()->setLastOptimeForSlave(client, time1)); - - std::vector<HostAndPort> caughtUpHosts = getReplCoord()->getHostsWrittenTo(time2); - ASSERT_EQUALS(0U, caughtUpHosts.size()); // self doesn't get included in master-slave - - ASSERT_OK(getReplCoord()->setLastOptimeForSlave(client, time2)); - caughtUpHosts = getReplCoord()->getHostsWrittenTo(time2); - ASSERT_EQUALS(1U, caughtUpHosts.size()); - ASSERT_EQUALS(clientHost, caughtUpHosts[0]); - } - - TEST_F(ReplCoordTest, GetOtherNodesInReplSetNoConfig) { - start(); - ASSERT_EQUALS(0U, getReplCoord()->getOtherNodesInReplSet().size()); - } - - TEST_F(ReplCoordTest, GetOtherNodesInReplSet) { - assertStartSuccess( - BSON("_id" << "mySet" << - "version" << 2 << - "members" << BSON_ARRAY(BSON("_id" << 0 << "host" << "h1") << - BSON("_id" << 1 << "host" << "h2") << - BSON("_id" << 2 << - "host" << "h3" << - "priority" << 0 << - "hidden" << true))), - HostAndPort("h1")); - - std::vector<HostAndPort> otherNodes = getReplCoord()->getOtherNodesInReplSet(); - ASSERT_EQUALS(2U, otherNodes.size()); - if (otherNodes[0] == HostAndPort("h2")) { - ASSERT_EQUALS(HostAndPort("h3"), otherNodes[1]); - } - else { - ASSERT_EQUALS(HostAndPort("h3"), otherNodes[0]); - ASSERT_EQUALS(HostAndPort("h2"), otherNodes[0]); - } - } - - TEST_F(ReplCoordTest, IsMasterNoConfig) { - start(); - IsMasterResponse response; - - getReplCoord()->fillIsMasterForReplSet(&response); - ASSERT_FALSE(response.isConfigSet()); - BSONObj responseObj = response.toBSON(); - ASSERT_FALSE(responseObj["ismaster"].Bool()); - ASSERT_FALSE(responseObj["secondary"].Bool()); - ASSERT_TRUE(responseObj["isreplicaset"].Bool()); - ASSERT_EQUALS("Does not have a valid replica set config", responseObj["info"].String()); - - IsMasterResponse roundTripped; - ASSERT_OK(roundTripped.initialize(response.toBSON())); - } - - TEST_F(ReplCoordTest, IsMaster) { - HostAndPort h1("h1"); - HostAndPort h2("h2"); - HostAndPort h3("h3"); - HostAndPort h4("h4"); - assertStartSuccess( - BSON("_id" << "mySet" << - "version" << 2 << - "members" << BSON_ARRAY(BSON("_id" << 0 << "host" << h1.toString()) << - BSON("_id" << 1 << "host" << h2.toString()) << - BSON("_id" << 2 << - "host" << h3.toString() << - "arbiterOnly" << true) << - BSON("_id" << 3 << - "host" << h4.toString() << - "priority" << 0 << - "tags" << BSON("key1" << "value1" << - "key2" << "value2")))), - h4); - getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY); - ASSERT_TRUE(getReplCoord()->getMemberState().secondary()); - - IsMasterResponse response; - getReplCoord()->fillIsMasterForReplSet(&response); - - ASSERT_EQUALS("mySet", response.getReplSetName()); - ASSERT_EQUALS(2, response.getReplSetVersion()); - ASSERT_FALSE(response.isMaster()); - ASSERT_TRUE(response.isSecondary()); - // TODO(spencer): test that response includes current primary when there is one. 
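// Reference sketch of the isMaster reply that the IsMaster tests here decode
// through IsMasterResponse (illustrative only; field names are the standard
// isMaster wire format matching the getters asserted on, not output of this
// commit):
BSONObj isMaster = BSON("ismaster" << false << "secondary" << true << "setName"
                        << "mySet" << "setVersion" << 2 << "hosts"
                        << BSON_ARRAY("h1" << "h2") << "passives" << BSON_ARRAY("h4")
                        << "arbiters" << BSON_ARRAY("h3") << "me"
                        << "h4" << "tags" << BSON("key1" << "value1" << "key2" << "value2"));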
- ASSERT_FALSE(response.isArbiterOnly()); - ASSERT_TRUE(response.isPassive()); - ASSERT_FALSE(response.isHidden()); - ASSERT_TRUE(response.shouldBuildIndexes()); - ASSERT_EQUALS(0, response.getSlaveDelay().total_seconds()); - ASSERT_EQUALS(h4, response.getMe()); - - std::vector<HostAndPort> hosts = response.getHosts(); - ASSERT_EQUALS(2U, hosts.size()); - if (hosts[0] == h1) { - ASSERT_EQUALS(h2, hosts[1]); - } - else { - ASSERT_EQUALS(h2, hosts[0]); - ASSERT_EQUALS(h1, hosts[1]); + BSONObj handshake = handshakeCmd["handshake"].Obj(); + OID rid = handshake["handshake"].OID(); + rids.insert(rid); + if (rid == getReplCoord()->getMyRID()) { + ASSERT_EQUALS(1, handshake["member"].Int()); + } else if (rid == slave1RID) { + ASSERT_EQUALS(0, handshake["member"].Int()); + } else { + ASSERT_EQUALS(slave2RID, rid); + ASSERT_EQUALS(2, handshake["member"].Int()); } - std::vector<HostAndPort> passives = response.getPassives(); - ASSERT_EQUALS(1U, passives.size()); - ASSERT_EQUALS(h4, passives[0]); - std::vector<HostAndPort> arbiters = response.getArbiters(); - ASSERT_EQUALS(1U, arbiters.size()); - ASSERT_EQUALS(h3, arbiters[0]); - - unordered_map<std::string, std::string> tags = response.getTags(); - ASSERT_EQUALS(2U, tags.size()); - ASSERT_EQUALS("value1", tags["key1"]); - ASSERT_EQUALS("value2", tags["key2"]); - - IsMasterResponse roundTripped; - ASSERT_OK(roundTripped.initialize(response.toBSON())); - } - - TEST_F(ReplCoordTest, ShutDownBeforeStartUpFinished) { - init(); - startCapturingLogMessages(); - getReplCoord()->shutdown(); - stopCapturingLogMessages(); - ASSERT_EQUALS(1, - countLogLinesContaining("shutdown() called before startReplication() finished")); - } - - TEST_F(ReplCoordTest, UpdatePositionWithRIDTest) { - OperationContextNoop txn; - assertStartSuccess( - BSON("_id" << "mySet" << - "version" << 2 << - "members" << BSON_ARRAY(BSON("host" << "node1:12345" << "_id" << 0) << - BSON("host" << "node2:12345" << "_id" << 1) << - BSON("host" << "node3:12345" << "_id" << 2) << - BSON("host" << "node4:12345" << "_id" << 3) << - BSON("host" << "node5:12345" << "_id" << 4))), - HostAndPort("node1", 12345)); - ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); - getReplCoord()->setMyLastOptime(OpTime(100, 0)); - simulateSuccessfulElection(); - - OID selfRID = getReplCoord()->getMyRID(); - OID client1 = OID::gen(); - OID client2 = OID::gen(); - OID client3 = OID::gen(); - OID client4 = OID::gen(); - OpTime time1(100, 1); - OpTime time2(100, 2); - OpTime staleTime(10, 0); - getReplCoord()->setMyLastOptime(time2); - - WriteConcernOptions writeConcern; - writeConcern.wTimeout = WriteConcernOptions::kNoWaiting; - writeConcern.wNumNodes = 2; - - // receive an updateposition for 3 members, with new enough time, but no handshakes yet - UpdatePositionArgs args; - ASSERT_OK(args.initialize(BSON("replSetUpdatePosition" << 1 << - "optimes" << BSON_ARRAY( - BSON("_id" << client1 << "optime" << time1) << - BSON("_id" << client2 << "optime" << time1) << - BSON("_id" << client3 << "optime" << time1))))); - ASSERT_EQUALS(ErrorCodes::NodeNotFound, - getReplCoord()->processReplSetUpdatePosition(args)); - ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit, - getReplCoord()->awaitReplication(&txn, time1, writeConcern).status); - - // handshake for middle of three nodes, updatePosition should end early, not updating - // any members, write concern 2 should still fail - HandshakeArgs handshake2; - ASSERT_OK(handshake2.initialize(BSON("handshake" << client2 << "member" << 2))); - 
ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake2)); - ASSERT_EQUALS(ErrorCodes::NodeNotFound, - getReplCoord()->processReplSetUpdatePosition(args)); - ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit, - getReplCoord()->awaitReplication(&txn, time1, writeConcern).status); - - // handshake for first of three nodes, updatePosition should end early, but the first two - // should get through and writeconcern <=3 should pass, but 4 should fail - HandshakeArgs handshake1; - ASSERT_OK(handshake1.initialize(BSON("handshake" << client1 << "member" << 1))); - ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake1)); - ASSERT_EQUALS(ErrorCodes::NodeNotFound, - getReplCoord()->processReplSetUpdatePosition(args)); - ASSERT_OK(getReplCoord()->awaitReplication(&txn, time1, writeConcern).status); - writeConcern.wNumNodes = 3; - ASSERT_OK(getReplCoord()->awaitReplication(&txn, time1, writeConcern).status); - writeConcern.wNumNodes = 4; - ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit, - getReplCoord()->awaitReplication(&txn, time1, writeConcern).status); - - // receive a stale value for ourself, should not cause progress to go backwards - HandshakeArgs handshake3; - ASSERT_OK(handshake3.initialize(BSON("handshake" << client3 << "member" << 3))); - ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake3)); - HandshakeArgs handshake4; - ASSERT_OK(handshake4.initialize(BSON("handshake" << client4 << "member" << 4))); - ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake4)); - UpdatePositionArgs args2; - ASSERT_OK(args2.initialize(BSON("replSetUpdatePosition" << 1 << - "optimes" << BSON_ARRAY( - BSON("_id" << selfRID << "optime" << staleTime) << - BSON("_id" << client3 << "optime" << time2) << - BSON("_id" << client4 << "optime" << time2))))); - ASSERT_OK(getReplCoord()->processReplSetUpdatePosition(args2)); - // all nodes should have through time1 and three should have through time2 - writeConcern.wNumNodes = 5; - ASSERT_OK(getReplCoord()->awaitReplication(&txn, time1, writeConcern).status); - writeConcern.wNumNodes = 3; - ASSERT_OK(getReplCoord()->awaitReplication(&txn, time2, writeConcern).status); - writeConcern.wNumNodes = 4; - ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit, - getReplCoord()->awaitReplication(&txn, time2, writeConcern).status); - - // receive a stale value for another, should not cause progress to go backwards - UpdatePositionArgs args3; - ASSERT_OK(args3.initialize(BSON("replSetUpdatePosition" << 1 << - "optimes" << BSON_ARRAY( - BSON("_id" << client1 << "optime" << time2) << - BSON("_id" << client2 << "optime" << time2) << - BSON("_id" << client3 << "optime" << staleTime))))); - ASSERT_OK(getReplCoord()->processReplSetUpdatePosition(args3)); - // all nodes should have through time2 - writeConcern.wNumNodes = 5; - ASSERT_OK(getReplCoord()->awaitReplication(&txn, time2, writeConcern).status); - } - - TEST_F(ReplCoordTest, UpdatePositionWithConfigVersionAndMemberIdTest) { - OperationContextNoop txn; - assertStartSuccess( - BSON("_id" << "mySet" << - "version" << 2 << - "members" << BSON_ARRAY(BSON("host" << "node1:12345" << "_id" << 0) << - BSON("host" << "node2:12345" << "_id" << 1) << - BSON("host" << "node3:12345" << "_id" << 2))), - HostAndPort("node1", 12345)); - ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); - getReplCoord()->setMyLastOptime(OpTime(100, 0)); - simulateSuccessfulElection(); - - OpTime time1(100, 1); - OpTime time2(100, 2); - OpTime staleTime(10, 0); - getReplCoord()->setMyLastOptime(time1); - - WriteConcernOptions writeConcern; - 
writeConcern.wTimeout = WriteConcernOptions::kNoWaiting; - writeConcern.wNumNodes = 1; - - ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit, - getReplCoord()->awaitReplication(&txn, time2, writeConcern).status); - - // receive updatePosition containing ourself, should not process the update for self - UpdatePositionArgs args; - ASSERT_OK(args.initialize(BSON("replSetUpdatePosition" << 1 << - "optimes" << BSON_ARRAY( - BSON("cfgver" << 2 << - "memberId" << 0 << - "optime" << time2))))); - - ASSERT_OK(getReplCoord()->processReplSetUpdatePosition(args)); - ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit, - getReplCoord()->awaitReplication(&txn, time2, writeConcern).status); - - // receive updatePosition with incorrect config version - UpdatePositionArgs args2; - ASSERT_OK(args2.initialize(BSON("replSetUpdatePosition" << 1 << - "optimes" << BSON_ARRAY( - BSON("cfgver" << 3 << - "memberId" << 1 << - "optime" << time2))))); - - ASSERT_EQUALS(ErrorCodes::InvalidReplicaSetConfig, - getReplCoord()->processReplSetUpdatePosition(args2)); - ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit, - getReplCoord()->awaitReplication(&txn, time2, writeConcern).status); - - // receive updatePosition with nonexistent member id - UpdatePositionArgs args3; - ASSERT_OK(args3.initialize(BSON("replSetUpdatePosition" << 1 << - "optimes" << BSON_ARRAY( - BSON("cfgver" << 2 << - "memberId" << 9 << - "optime" << time2))))); - - ASSERT_EQUALS(ErrorCodes::NodeNotFound, - getReplCoord()->processReplSetUpdatePosition(args3)); - ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit, - getReplCoord()->awaitReplication(&txn, time2, writeConcern).status); - - // receive a good update position - getReplCoord()->setMyLastOptime(time2); - UpdatePositionArgs args4; - ASSERT_OK(args4.initialize(BSON("replSetUpdatePosition" << 1 << - "optimes" << BSON_ARRAY( - BSON("cfgver" << 2 << - "memberId" << 1 << - "optime" << time2) << - BSON("cfgver" << 2 << - "memberId" << 2 << - "optime" << time2))))); - - ASSERT_OK(getReplCoord()->processReplSetUpdatePosition(args4)); - ASSERT_OK(getReplCoord()->awaitReplication(&txn, time2, writeConcern).status); - - writeConcern.wNumNodes = 3; - ASSERT_OK(getReplCoord()->awaitReplication(&txn, time2, writeConcern).status); - } - - void doReplSetReconfig(ReplicationCoordinatorImpl* replCoord, Status* status) { - OperationContextNoop txn; - BSONObjBuilder garbage; - ReplSetReconfigArgs args; - args.force = false; - args.newConfigObj = BSON("_id" << "mySet" << - "version" << 3 << - "members" << BSON_ARRAY( - BSON("_id" << 0 << - "host" << "node1:12345" << - "priority" << 3) << - BSON("_id" << 1 << "host" << "node2:12345") << - BSON("_id" << 2 << "host" << "node3:12345"))); - *status = replCoord->processReplSetReconfig(&txn, args, &garbage); } - - TEST_F(ReplCoordTest, AwaitReplicationReconfigSimple) { - OperationContextNoop txn; - assertStartSuccess( - BSON("_id" << "mySet" << - "version" << 2 << - "members" << BSON_ARRAY(BSON("host" << "node1:12345" << "_id" << 0) << - BSON("host" << "node2:12345" << "_id" << 1) << - BSON("host" << "node3:12345" << "_id" << 2))), - HostAndPort("node1", 12345)); - ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); - getReplCoord()->setMyLastOptime(OpTime(100, 2)); - simulateSuccessfulElection(); - - OID selfRID = getReplCoord()->getMyRID(); - OID node2 = OID::gen(); - OID node3 = OID::gen(); - OpTime time(100, 2); - - HandshakeArgs handshake; - ASSERT_OK(handshake.initialize(BSON("handshake" << node2 << "member" << 1))); - ASSERT_OK(getReplCoord()->processHandshake(&txn, 
handshake)); - ASSERT_OK(handshake.initialize(BSON("handshake" << node3 << "member" << 2))); - ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake)); - - // 3 nodes waiting for time - WriteConcernOptions writeConcern; - writeConcern.wTimeout = WriteConcernOptions::kNoTimeout; - writeConcern.wNumNodes = 3; - - ReplicationAwaiter awaiter(getReplCoord(), &txn); - awaiter.setOpTime(time); - awaiter.setWriteConcern(writeConcern); - awaiter.start(&txn); - - // reconfig - Status status(ErrorCodes::InternalError, "Not Set"); - boost::thread reconfigThread(stdx::bind(doReplSetReconfig, getReplCoord(), &status)); - - NetworkInterfaceMock* net = getNet(); - getNet()->enterNetwork(); - const NetworkInterfaceMock::NetworkOperationIterator noi = net->getNextReadyRequest(); - const ReplicationExecutor::RemoteCommandRequest& request = noi->getRequest(); - repl::ReplSetHeartbeatArgs hbArgs; - ASSERT_OK(hbArgs.initialize(request.cmdObj)); - repl::ReplSetHeartbeatResponse hbResp; - hbResp.setSetName("mySet"); - hbResp.setState(MemberState::RS_SECONDARY); - hbResp.setVersion(2); - BSONObjBuilder respObj; - respObj << "ok" << 1; - hbResp.addToBSON(&respObj); - net->scheduleResponse(noi, net->now(), makeResponseStatus(respObj.obj())); - net->runReadyNetworkOperations(); - getNet()->exitNetwork(); - reconfigThread.join(); - ASSERT_OK(status); - - // satisfy write concern - ASSERT_OK(getReplCoord()->setLastOptime_forTest(selfRID, time)); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(node2, time)); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(node3, time)); - ReplicationCoordinator::StatusAndDuration statusAndDur = awaiter.getResult(); - ASSERT_OK(statusAndDur.status); - awaiter.reset(); + ASSERT_EQUALS(3U, rids.size()); // Make sure we saw all 3 nodes +} + +TEST_F(ReplCoordTest, SetMaintenanceMode) { + init("mySet/test1:1234,test2:1234,test3:1234"); + assertStartSuccess( + BSON("_id" + << "mySet" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "test1:1234") + << BSON("_id" << 1 << "host" + << "test2:1234") << BSON("_id" << 2 << "host" + << "test3:1234"))), + HostAndPort("test2", 1234)); + OperationContextNoop txn; + getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY); + getReplCoord()->setMyLastOptime(OpTime(100, 0)); + + // Can't unset maintenance mode if it was never set to begin with. + Status status = getReplCoord()->setMaintenanceMode(false); + ASSERT_EQUALS(ErrorCodes::OperationFailed, status); + ASSERT_TRUE(getReplCoord()->getMemberState().secondary()); + + // valid set + ASSERT_OK(getReplCoord()->setMaintenanceMode(true)); + ASSERT_TRUE(getReplCoord()->getMemberState().recovering()); + + // If we go into rollback while in maintenance mode, our state changes to RS_ROLLBACK. + getReplCoord()->setFollowerMode(MemberState::RS_ROLLBACK); + ASSERT_TRUE(getReplCoord()->getMemberState().rollback()); + + // When we go back to SECONDARY, we still observe RECOVERING because of maintenance mode. 
+ getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY); + ASSERT_TRUE(getReplCoord()->getMemberState().recovering()); + + // Can set multiple times + ASSERT_OK(getReplCoord()->setMaintenanceMode(true)); + ASSERT_OK(getReplCoord()->setMaintenanceMode(true)); + + // Need to unset the number of times you set + ASSERT_OK(getReplCoord()->setMaintenanceMode(false)); + ASSERT_OK(getReplCoord()->setMaintenanceMode(false)); + ASSERT_OK(getReplCoord()->setMaintenanceMode(false)); + status = getReplCoord()->setMaintenanceMode(false); + // fourth one fails b/c we only set three times + ASSERT_EQUALS(ErrorCodes::OperationFailed, status); + // Unsetting maintenance mode changes our state to secondary if maintenance mode was + // the only thinking keeping us out of it. + ASSERT_TRUE(getReplCoord()->getMemberState().secondary()); + + // From rollback, entering and exiting maintenance mode doesn't change perceived + // state. + getReplCoord()->setFollowerMode(MemberState::RS_ROLLBACK); + ASSERT_TRUE(getReplCoord()->getMemberState().rollback()); + ASSERT_OK(getReplCoord()->setMaintenanceMode(true)); + ASSERT_TRUE(getReplCoord()->getMemberState().rollback()); + ASSERT_OK(getReplCoord()->setMaintenanceMode(false)); + ASSERT_TRUE(getReplCoord()->getMemberState().rollback()); + + // Rollback is sticky even if entered while in maintenance mode. + getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY); + ASSERT_TRUE(getReplCoord()->getMemberState().secondary()); + ASSERT_OK(getReplCoord()->setMaintenanceMode(true)); + ASSERT_TRUE(getReplCoord()->getMemberState().recovering()); + getReplCoord()->setFollowerMode(MemberState::RS_ROLLBACK); + ASSERT_TRUE(getReplCoord()->getMemberState().rollback()); + ASSERT_OK(getReplCoord()->setMaintenanceMode(false)); + ASSERT_TRUE(getReplCoord()->getMemberState().rollback()); + getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY); + ASSERT_TRUE(getReplCoord()->getMemberState().secondary()); + + // Can't modify maintenance mode when PRIMARY + simulateSuccessfulElection(); + + status = getReplCoord()->setMaintenanceMode(true); + ASSERT_EQUALS(ErrorCodes::NotSecondary, status); + ASSERT_TRUE(getReplCoord()->getMemberState().primary()); + + simulateStepDownOnIsolation(); + + status = getReplCoord()->setMaintenanceMode(false); + ASSERT_EQUALS(ErrorCodes::OperationFailed, status); + ASSERT_OK(getReplCoord()->setMaintenanceMode(true)); + ASSERT_OK(getReplCoord()->setMaintenanceMode(false)); +} + +TEST_F(ReplCoordTest, GetHostsWrittenToReplSet) { + HostAndPort myHost("node1:12345"); + HostAndPort client1Host("node2:12345"); + HostAndPort client2Host("node3:12345"); + assertStartSuccess(BSON("_id" + << "mySet" + << "version" << 2 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" << myHost.toString()) + << BSON("_id" << 1 << "host" << client1Host.toString()) + << BSON("_id" << 2 << "host" << client2Host.toString()))), + HostAndPort("node1", 12345)); + OperationContextNoop txn; + + OID client1 = OID::gen(); + OID client2 = OID::gen(); + OpTime time1(100, 1); + OpTime time2(100, 2); + + HandshakeArgs handshake1; + ASSERT_OK(handshake1.initialize(BSON("handshake" << client1 << "member" << 1))); + ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake1)); + HandshakeArgs handshake2; + ASSERT_OK(handshake2.initialize(BSON("handshake" << client2 << "member" << 2))); + ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake2)); + + getReplCoord()->setMyLastOptime(time2); + ASSERT_OK(getReplCoord()->setLastOptime_forTest(client1, time1)); + + std::vector<HostAndPort> 
+ ASSERT_EQUALS(1U, caughtUpHosts.size());
+ ASSERT_EQUALS(myHost, caughtUpHosts[0]);
+
+ ASSERT_OK(getReplCoord()->setLastOptime_forTest(client2, time2));
+ caughtUpHosts = getReplCoord()->getHostsWrittenTo(time2);
+ ASSERT_EQUALS(2U, caughtUpHosts.size());
+ if (myHost == caughtUpHosts[0]) {
+ ASSERT_EQUALS(client2Host, caughtUpHosts[1]);
+ } else {
+ ASSERT_EQUALS(client2Host, caughtUpHosts[0]);
+ ASSERT_EQUALS(myHost, caughtUpHosts[1]);
}
-
- void doReplSetReconfigToFewer(ReplicationCoordinatorImpl* replCoord, Status* status) {
- OperationContextNoop txn;
- BSONObjBuilder garbage;
- ReplSetReconfigArgs args;
- args.force = false;
- args.newConfigObj = BSON("_id" << "mySet" <<
- "version" << 3 <<
- "members" << BSON_ARRAY(
- BSON("_id" << 0 << "host" << "node1:12345") <<
- BSON("_id" << 2 << "host" << "node3:12345")));
- *status = replCoord->processReplSetReconfig(&txn, args, &garbage);
+}
+
+TEST_F(ReplCoordTest, GetHostsWrittenToMasterSlave) {
+ ReplSettings settings;
+ settings.master = true;
+ init(settings);
+ HostAndPort clientHost("node2:12345");
+ OperationContextNoop txn;
+
+ OID client = OID::gen();
+ OpTime time1(100, 1);
+ OpTime time2(100, 2);
+
+ getExternalState()->setClientHostAndPort(clientHost);
+ HandshakeArgs handshake;
+ ASSERT_OK(handshake.initialize(BSON("handshake" << client)));
+ ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake));
+
+ getReplCoord()->setMyLastOptime(time2);
+ ASSERT_OK(getReplCoord()->setLastOptimeForSlave(client, time1));
+
+ std::vector<HostAndPort> caughtUpHosts = getReplCoord()->getHostsWrittenTo(time2);
+ ASSERT_EQUALS(0U, caughtUpHosts.size()); // self doesn't get included in master-slave
+
+ ASSERT_OK(getReplCoord()->setLastOptimeForSlave(client, time2));
+ caughtUpHosts = getReplCoord()->getHostsWrittenTo(time2);
+ ASSERT_EQUALS(1U, caughtUpHosts.size());
+ ASSERT_EQUALS(clientHost, caughtUpHosts[0]);
+}
+
+TEST_F(ReplCoordTest, GetOtherNodesInReplSetNoConfig) {
+ start();
+ ASSERT_EQUALS(0U, getReplCoord()->getOtherNodesInReplSet().size());
+}
+
+TEST_F(ReplCoordTest, GetOtherNodesInReplSet) {
+ assertStartSuccess(BSON("_id"
+ << "mySet"
+ << "version" << 2 << "members"
+ << BSON_ARRAY(BSON("_id" << 0 << "host"
+ << "h1")
+ << BSON("_id" << 1 << "host"
+ << "h2")
+ << BSON("_id" << 2 << "host"
+ << "h3"
+ << "priority" << 0 << "hidden" << true))),
+ HostAndPort("h1"));
+
+ std::vector<HostAndPort> otherNodes = getReplCoord()->getOtherNodesInReplSet();
+ ASSERT_EQUALS(2U, otherNodes.size());
+ if (otherNodes[0] == HostAndPort("h2")) {
+ ASSERT_EQUALS(HostAndPort("h3"), otherNodes[1]);
+ } else {
+ ASSERT_EQUALS(HostAndPort("h3"), otherNodes[0]);
+ ASSERT_EQUALS(HostAndPort("h2"), otherNodes[1]);
}
-
- TEST_F(ReplCoordTest, AwaitReplicationReconfigNodeCountExceedsNumberOfNodes) {
- OperationContextNoop txn;
- assertStartSuccess(
- BSON("_id" << "mySet" <<
- "version" << 2 <<
- "members" << BSON_ARRAY(BSON("host" << "node1:12345" << "_id" << 0) <<
- BSON("host" << "node2:12345" << "_id" << 1) <<
- BSON("host" << "node3:12345" << "_id" << 2))),
- HostAndPort("node1", 12345));
- ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY));
- getReplCoord()->setMyLastOptime(OpTime(100, 2));
- simulateSuccessfulElection();
-
- OID node2 = OID::gen();
- OID node3 = OID::gen();
- OpTime time(100, 2);
-
- HandshakeArgs handshake;
- ASSERT_OK(handshake.initialize(BSON("handshake" << node2 << "member" << 1)));
- ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake));
- ASSERT_OK(handshake.initialize(BSON("handshake" << node3 << "member" << 2)));
- ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake));
-
- // 3 nodes waiting for time
- WriteConcernOptions writeConcern;
- writeConcern.wTimeout = WriteConcernOptions::kNoTimeout;
- writeConcern.wNumNodes = 3;
-
- ReplicationAwaiter awaiter(getReplCoord(), &txn);
- awaiter.setOpTime(time);
- awaiter.setWriteConcern(writeConcern);
- awaiter.start(&txn);
-
- // reconfig to fewer nodes
- Status status(ErrorCodes::InternalError, "Not Set");
- boost::thread reconfigThread(stdx::bind(doReplSetReconfigToFewer, getReplCoord(), &status));
-
- NetworkInterfaceMock* net = getNet();
- getNet()->enterNetwork();
- const NetworkInterfaceMock::NetworkOperationIterator noi = net->getNextReadyRequest();
- const ReplicationExecutor::RemoteCommandRequest& request = noi->getRequest();
- repl::ReplSetHeartbeatArgs hbArgs;
- ASSERT_OK(hbArgs.initialize(request.cmdObj));
- repl::ReplSetHeartbeatResponse hbResp;
- hbResp.setSetName("mySet");
- hbResp.setState(MemberState::RS_SECONDARY);
- hbResp.setVersion(2);
- BSONObjBuilder respObj;
- respObj << "ok" << 1;
- hbResp.addToBSON(&respObj);
- net->scheduleResponse(noi, net->now(), makeResponseStatus(respObj.obj()));
- net->runReadyNetworkOperations();
- getNet()->exitNetwork();
- reconfigThread.join();
- ASSERT_OK(status);
-
- // writeconcern feasibility should be reevaluated and an error should be returned
- ReplicationCoordinator::StatusAndDuration statusAndDur = awaiter.getResult();
- ASSERT_EQUALS(ErrorCodes::CannotSatisfyWriteConcern, statusAndDur.status);
- awaiter.reset();
+}
+
+TEST_F(ReplCoordTest, IsMasterNoConfig) {
+ start();
+ IsMasterResponse response;
+
+ getReplCoord()->fillIsMasterForReplSet(&response);
+ ASSERT_FALSE(response.isConfigSet());
+ BSONObj responseObj = response.toBSON();
+ ASSERT_FALSE(responseObj["ismaster"].Bool());
+ ASSERT_FALSE(responseObj["secondary"].Bool());
+ ASSERT_TRUE(responseObj["isreplicaset"].Bool());
+ ASSERT_EQUALS("Does not have a valid replica set config", responseObj["info"].String());
+
+ IsMasterResponse roundTripped;
+ ASSERT_OK(roundTripped.initialize(response.toBSON()));
+}
+
+TEST_F(ReplCoordTest, IsMaster) {
+ HostAndPort h1("h1");
+ HostAndPort h2("h2");
+ HostAndPort h3("h3");
+ HostAndPort h4("h4");
+ assertStartSuccess(
+ BSON("_id"
+ << "mySet"
+ << "version" << 2 << "members"
+ << BSON_ARRAY(BSON("_id" << 0 << "host" << h1.toString())
+ << BSON("_id" << 1 << "host" << h2.toString())
+ << BSON("_id" << 2 << "host" << h3.toString() << "arbiterOnly" << true)
+ << BSON("_id" << 3 << "host" << h4.toString() << "priority" << 0
+ << "tags" << BSON("key1"
+ << "value1"
+ << "key2"
+ << "value2")))),
+ h4);
+ getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY);
+ ASSERT_TRUE(getReplCoord()->getMemberState().secondary());
+
+ IsMasterResponse response;
+ getReplCoord()->fillIsMasterForReplSet(&response);
+
+ ASSERT_EQUALS("mySet", response.getReplSetName());
+ ASSERT_EQUALS(2, response.getReplSetVersion());
+ ASSERT_FALSE(response.isMaster());
+ ASSERT_TRUE(response.isSecondary());
+ // TODO(spencer): test that response includes current primary when there is one.
+ ASSERT_FALSE(response.isArbiterOnly());
+ ASSERT_TRUE(response.isPassive());
+ ASSERT_FALSE(response.isHidden());
+ ASSERT_TRUE(response.shouldBuildIndexes());
+ ASSERT_EQUALS(0, response.getSlaveDelay().total_seconds());
+ ASSERT_EQUALS(h4, response.getMe());
+
+ std::vector<HostAndPort> hosts = response.getHosts();
+ ASSERT_EQUALS(2U, hosts.size());
+ if (hosts[0] == h1) {
+ ASSERT_EQUALS(h2, hosts[1]);
+ } else {
+ ASSERT_EQUALS(h2, hosts[0]);
+ ASSERT_EQUALS(h1, hosts[1]);
}
-
- TEST_F(ReplCoordTest, AwaitReplicationReconfigToSmallerMajority) {
- OperationContextNoop txn;
- assertStartSuccess(
- BSON("_id" << "mySet" <<
- "version" << 2 <<
- "members" << BSON_ARRAY(BSON("host" << "node1:12345" << "_id" << 0) <<
- BSON("host" << "node2:12345" << "_id" << 1) <<
- BSON("host" << "node3:12345" << "_id" << 2) <<
- BSON("host" << "node4:12345" << "_id" << 3) <<
- BSON("host" << "node5:12345" << "_id" << 4))),
- HostAndPort("node1", 12345));
- ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY));
- getReplCoord()->setMyLastOptime(OpTime(100, 1));
- simulateSuccessfulElection();
-
- OID node2 = OID::gen();
- OID node3 = OID::gen();
- OpTime time(100, 2);
-
- HandshakeArgs handshake;
- ASSERT_OK(handshake.initialize(BSON("handshake" << node2 << "member" << 1)));
- ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake));
- ASSERT_OK(handshake.initialize(BSON("handshake" << node3 << "member" << 2)));
- ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake));
- ASSERT_OK(getReplCoord()->setLastOptime_forTest(node2, time));
- ASSERT_OK(getReplCoord()->setLastOptime_forTest(node3, time));
-
- // majority nodes waiting for time
- WriteConcernOptions writeConcern;
- writeConcern.wTimeout = WriteConcernOptions::kNoTimeout;
- writeConcern.wMode = "majority";
-
- ReplicationAwaiter awaiter(getReplCoord(), &txn);
- awaiter.setOpTime(time);
- awaiter.setWriteConcern(writeConcern);
- awaiter.start(&txn);
-
- // demonstrate that majority cannot currently be satisfied
- WriteConcernOptions writeConcern2;
- writeConcern2.wTimeout = WriteConcernOptions::kNoWaiting;
- writeConcern2.wMode = "majority";
- ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit,
- getReplCoord()->awaitReplication(&txn, time, writeConcern2).status);
-
- // reconfig to three nodes
- Status status(ErrorCodes::InternalError, "Not Set");
- boost::thread reconfigThread(stdx::bind(doReplSetReconfig, getReplCoord(), &status));
-
- NetworkInterfaceMock* net = getNet();
- getNet()->enterNetwork();
- const NetworkInterfaceMock::NetworkOperationIterator noi = net->getNextReadyRequest();
- const ReplicationExecutor::RemoteCommandRequest& request = noi->getRequest();
- repl::ReplSetHeartbeatArgs hbArgs;
- ASSERT_OK(hbArgs.initialize(request.cmdObj));
- repl::ReplSetHeartbeatResponse hbResp;
- hbResp.setSetName("mySet");
- hbResp.setState(MemberState::RS_SECONDARY);
- hbResp.setVersion(2);
- BSONObjBuilder respObj;
- respObj << "ok" << 1;
- hbResp.addToBSON(&respObj);
- net->scheduleResponse(noi, net->now(), makeResponseStatus(respObj.obj()));
- net->runReadyNetworkOperations();
- getNet()->exitNetwork();
- reconfigThread.join();
- ASSERT_OK(status);
-
- // writeconcern feasibility should be reevaluated and be satisfied
- ReplicationCoordinator::StatusAndDuration statusAndDur = awaiter.getResult();
- ASSERT_OK(statusAndDur.status);
- awaiter.reset();
- }
-
- TEST_F(ReplCoordTest, AwaitReplicationMajority) {
- // Test that majority write concern can only be
- // satisfied by voting data-bearing members.
- OperationContextNoop txn; - assertStartSuccess( - BSON("_id" << "mySet" << - "version" << 2 << - "members" << BSON_ARRAY(BSON("host" << "node1:12345" << "_id" << 0) << - BSON("host" << "node2:12345" << "_id" << 1) << - BSON("host" << "node3:12345" << "_id" << 2) << - BSON("host" << "node4:12345" << - "_id" << 3 << - "votes" << 0) << - BSON("host" << "node5:12345" << - "_id" << 4 << - "arbiterOnly" << true))), - HostAndPort("node1", 12345)); - ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); - OpTime time(100, 0); - getReplCoord()->setMyLastOptime(time); - simulateSuccessfulElection(); - - WriteConcernOptions majorityWriteConcern; - majorityWriteConcern.wTimeout = WriteConcernOptions::kNoWaiting; - majorityWriteConcern.wMode = "majority"; - - ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit, - getReplCoord()->awaitReplication(&txn, time, majorityWriteConcern).status); - - OID client1 = OID::gen(); - HandshakeArgs handshake1; - ASSERT_OK(handshake1.initialize(BSON("handshake" << client1 << "member" << 1))); - ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake1)); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(client1, time)); - ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit, - getReplCoord()->awaitReplication(&txn, time, majorityWriteConcern).status); - - // this member does not vote and as a result should not count towards write concern - OID client3 = OID::gen(); - HandshakeArgs handshake3; - ASSERT_OK(handshake3.initialize(BSON("handshake" << client3 << "member" << 3))); - ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake3)); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(client3, time)); - ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit, - getReplCoord()->awaitReplication(&txn, time, majorityWriteConcern).status); - - OID client2 = OID::gen(); - HandshakeArgs handshake2; - ASSERT_OK(handshake2.initialize(BSON("handshake" << client2 << "member" << 2))); - ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake2)); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(client2, time)); - ASSERT_OK(getReplCoord()->awaitReplication(&txn, time, majorityWriteConcern).status); - } - - // TODO(schwerin): Unit test election id updating + std::vector<HostAndPort> passives = response.getPassives(); + ASSERT_EQUALS(1U, passives.size()); + ASSERT_EQUALS(h4, passives[0]); + std::vector<HostAndPort> arbiters = response.getArbiters(); + ASSERT_EQUALS(1U, arbiters.size()); + ASSERT_EQUALS(h3, arbiters[0]); + + unordered_map<std::string, std::string> tags = response.getTags(); + ASSERT_EQUALS(2U, tags.size()); + ASSERT_EQUALS("value1", tags["key1"]); + ASSERT_EQUALS("value2", tags["key2"]); + + IsMasterResponse roundTripped; + ASSERT_OK(roundTripped.initialize(response.toBSON())); +} + +TEST_F(ReplCoordTest, ShutDownBeforeStartUpFinished) { + init(); + startCapturingLogMessages(); + getReplCoord()->shutdown(); + stopCapturingLogMessages(); + ASSERT_EQUALS(1, + countLogLinesContaining("shutdown() called before startReplication() finished")); +} + +TEST_F(ReplCoordTest, UpdatePositionWithRIDTest) { + OperationContextNoop txn; + assertStartSuccess(BSON("_id" + << "mySet" + << "version" << 2 << "members" + << BSON_ARRAY(BSON("host" + << "node1:12345" + << "_id" << 0) + << BSON("host" + << "node2:12345" + << "_id" << 1) << BSON("host" + << "node3:12345" + << "_id" << 2) + << BSON("host" + << "node4:12345" + << "_id" << 3) << BSON("host" + << "node5:12345" + << "_id" << 4))), + HostAndPort("node1", 12345)); + ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); + 
getReplCoord()->setMyLastOptime(OpTime(100, 0)); + simulateSuccessfulElection(); + + OID selfRID = getReplCoord()->getMyRID(); + OID client1 = OID::gen(); + OID client2 = OID::gen(); + OID client3 = OID::gen(); + OID client4 = OID::gen(); + OpTime time1(100, 1); + OpTime time2(100, 2); + OpTime staleTime(10, 0); + getReplCoord()->setMyLastOptime(time2); + + WriteConcernOptions writeConcern; + writeConcern.wTimeout = WriteConcernOptions::kNoWaiting; + writeConcern.wNumNodes = 2; + + // receive an updateposition for 3 members, with new enough time, but no handshakes yet + UpdatePositionArgs args; + ASSERT_OK(args.initialize(BSON("replSetUpdatePosition" + << 1 << "optimes" + << BSON_ARRAY(BSON("_id" << client1 << "optime" << time1) + << BSON("_id" << client2 << "optime" << time1) + << BSON("_id" << client3 << "optime" << time1))))); + ASSERT_EQUALS(ErrorCodes::NodeNotFound, getReplCoord()->processReplSetUpdatePosition(args)); + ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit, + getReplCoord()->awaitReplication(&txn, time1, writeConcern).status); + + // handshake for middle of three nodes, updatePosition should end early, not updating + // any members, write concern 2 should still fail + HandshakeArgs handshake2; + ASSERT_OK(handshake2.initialize(BSON("handshake" << client2 << "member" << 2))); + ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake2)); + ASSERT_EQUALS(ErrorCodes::NodeNotFound, getReplCoord()->processReplSetUpdatePosition(args)); + ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit, + getReplCoord()->awaitReplication(&txn, time1, writeConcern).status); + + // handshake for first of three nodes, updatePosition should end early, but the first two + // should get through and writeconcern <=3 should pass, but 4 should fail + HandshakeArgs handshake1; + ASSERT_OK(handshake1.initialize(BSON("handshake" << client1 << "member" << 1))); + ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake1)); + ASSERT_EQUALS(ErrorCodes::NodeNotFound, getReplCoord()->processReplSetUpdatePosition(args)); + ASSERT_OK(getReplCoord()->awaitReplication(&txn, time1, writeConcern).status); + writeConcern.wNumNodes = 3; + ASSERT_OK(getReplCoord()->awaitReplication(&txn, time1, writeConcern).status); + writeConcern.wNumNodes = 4; + ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit, + getReplCoord()->awaitReplication(&txn, time1, writeConcern).status); + + // receive a stale value for ourself, should not cause progress to go backwards + HandshakeArgs handshake3; + ASSERT_OK(handshake3.initialize(BSON("handshake" << client3 << "member" << 3))); + ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake3)); + HandshakeArgs handshake4; + ASSERT_OK(handshake4.initialize(BSON("handshake" << client4 << "member" << 4))); + ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake4)); + UpdatePositionArgs args2; + ASSERT_OK(args2.initialize( + BSON("replSetUpdatePosition" + << 1 << "optimes" << BSON_ARRAY(BSON("_id" << selfRID << "optime" << staleTime) + << BSON("_id" << client3 << "optime" << time2) + << BSON("_id" << client4 << "optime" << time2))))); + ASSERT_OK(getReplCoord()->processReplSetUpdatePosition(args2)); + // all nodes should have through time1 and three should have through time2 + writeConcern.wNumNodes = 5; + ASSERT_OK(getReplCoord()->awaitReplication(&txn, time1, writeConcern).status); + writeConcern.wNumNodes = 3; + ASSERT_OK(getReplCoord()->awaitReplication(&txn, time2, writeConcern).status); + writeConcern.wNumNodes = 4; + ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit, + 
getReplCoord()->awaitReplication(&txn, time2, writeConcern).status); + + // receive a stale value for another, should not cause progress to go backwards + UpdatePositionArgs args3; + ASSERT_OK(args3.initialize( + BSON("replSetUpdatePosition" + << 1 << "optimes" << BSON_ARRAY(BSON("_id" << client1 << "optime" << time2) + << BSON("_id" << client2 << "optime" << time2) + << BSON("_id" << client3 << "optime" << staleTime))))); + ASSERT_OK(getReplCoord()->processReplSetUpdatePosition(args3)); + // all nodes should have through time2 + writeConcern.wNumNodes = 5; + ASSERT_OK(getReplCoord()->awaitReplication(&txn, time2, writeConcern).status); +} + +TEST_F(ReplCoordTest, UpdatePositionWithConfigVersionAndMemberIdTest) { + OperationContextNoop txn; + assertStartSuccess(BSON("_id" + << "mySet" + << "version" << 2 << "members" + << BSON_ARRAY(BSON("host" + << "node1:12345" + << "_id" << 0) + << BSON("host" + << "node2:12345" + << "_id" << 1) << BSON("host" + << "node3:12345" + << "_id" << 2))), + HostAndPort("node1", 12345)); + ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); + getReplCoord()->setMyLastOptime(OpTime(100, 0)); + simulateSuccessfulElection(); + + OpTime time1(100, 1); + OpTime time2(100, 2); + OpTime staleTime(10, 0); + getReplCoord()->setMyLastOptime(time1); + + WriteConcernOptions writeConcern; + writeConcern.wTimeout = WriteConcernOptions::kNoWaiting; + writeConcern.wNumNodes = 1; + + ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit, + getReplCoord()->awaitReplication(&txn, time2, writeConcern).status); + + // receive updatePosition containing ourself, should not process the update for self + UpdatePositionArgs args; + ASSERT_OK(args.initialize( + BSON("replSetUpdatePosition" + << 1 << "optimes" + << BSON_ARRAY(BSON("cfgver" << 2 << "memberId" << 0 << "optime" << time2))))); + + ASSERT_OK(getReplCoord()->processReplSetUpdatePosition(args)); + ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit, + getReplCoord()->awaitReplication(&txn, time2, writeConcern).status); + + // receive updatePosition with incorrect config version + UpdatePositionArgs args2; + ASSERT_OK(args2.initialize( + BSON("replSetUpdatePosition" + << 1 << "optimes" + << BSON_ARRAY(BSON("cfgver" << 3 << "memberId" << 1 << "optime" << time2))))); + + ASSERT_EQUALS(ErrorCodes::InvalidReplicaSetConfig, + getReplCoord()->processReplSetUpdatePosition(args2)); + ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit, + getReplCoord()->awaitReplication(&txn, time2, writeConcern).status); + + // receive updatePosition with nonexistent member id + UpdatePositionArgs args3; + ASSERT_OK(args3.initialize( + BSON("replSetUpdatePosition" + << 1 << "optimes" + << BSON_ARRAY(BSON("cfgver" << 2 << "memberId" << 9 << "optime" << time2))))); + + ASSERT_EQUALS(ErrorCodes::NodeNotFound, getReplCoord()->processReplSetUpdatePosition(args3)); + ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit, + getReplCoord()->awaitReplication(&txn, time2, writeConcern).status); + + // receive a good update position + getReplCoord()->setMyLastOptime(time2); + UpdatePositionArgs args4; + ASSERT_OK(args4.initialize( + BSON("replSetUpdatePosition" + << 1 << "optimes" + << BSON_ARRAY(BSON("cfgver" << 2 << "memberId" << 1 << "optime" << time2) + << BSON("cfgver" << 2 << "memberId" << 2 << "optime" << time2))))); + + ASSERT_OK(getReplCoord()->processReplSetUpdatePosition(args4)); + ASSERT_OK(getReplCoord()->awaitReplication(&txn, time2, writeConcern).status); + + writeConcern.wNumNodes = 3; + ASSERT_OK(getReplCoord()->awaitReplication(&txn, time2, writeConcern).status); +} 
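The two update-position tests above pin down an ordering contract for replSetUpdatePosition entries: an entry whose cfgver does not match the local config version is rejected with InvalidReplicaSetConfig, an unknown memberId yields NodeNotFound, an entry for the receiving node itself is skipped rather than treated as an error, and a stale optime never moves recorded progress backwards. A minimal standalone sketch of that contract follows; every name in it is illustrative, chosen for this example rather than taken from the actual ReplicationCoordinatorImpl code.

// Sketch only: the entry-validation order implied by the tests above.
// Types and names here are hypothetical, not MongoDB's real API.
#include <map>

enum SketchStatus { kOK, kInvalidReplicaSetConfig, kNodeNotFound };

struct SketchOpTime {
    long long secs;
    long long inc;
    bool operator>(const SketchOpTime& rhs) const {
        return secs != rhs.secs ? secs > rhs.secs : inc > rhs.inc;
    }
};

SketchStatus applyUpdateEntry(long long myConfigVersion,
                              int selfMemberId,
                              std::map<int, SketchOpTime>& optimesByMemberId,
                              long long cfgver,
                              int memberId,
                              const SketchOpTime& optime) {
    if (cfgver != myConfigVersion) {
        return kInvalidReplicaSetConfig;  // wrong "cfgver": reject the whole update
    }
    if (memberId == selfMemberId) {
        return kOK;  // an entry for ourself is skipped, not an error
    }
    std::map<int, SketchOpTime>::iterator it = optimesByMemberId.find(memberId);
    if (it == optimesByMemberId.end()) {
        return kNodeNotFound;  // unknown "memberId": nothing to update
    }
    if (optime > it->second) {
        it->second = optime;  // stale optimes never move progress backwards
    }
    return kOK;
}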
+ +void doReplSetReconfig(ReplicationCoordinatorImpl* replCoord, Status* status) { + OperationContextNoop txn; + BSONObjBuilder garbage; + ReplSetReconfigArgs args; + args.force = false; + args.newConfigObj = BSON("_id" + << "mySet" + << "version" << 3 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "node1:12345" + << "priority" << 3) + << BSON("_id" << 1 << "host" + << "node2:12345") + << BSON("_id" << 2 << "host" + << "node3:12345"))); + *status = replCoord->processReplSetReconfig(&txn, args, &garbage); +} + +TEST_F(ReplCoordTest, AwaitReplicationReconfigSimple) { + OperationContextNoop txn; + assertStartSuccess(BSON("_id" + << "mySet" + << "version" << 2 << "members" + << BSON_ARRAY(BSON("host" + << "node1:12345" + << "_id" << 0) + << BSON("host" + << "node2:12345" + << "_id" << 1) << BSON("host" + << "node3:12345" + << "_id" << 2))), + HostAndPort("node1", 12345)); + ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); + getReplCoord()->setMyLastOptime(OpTime(100, 2)); + simulateSuccessfulElection(); + + OID selfRID = getReplCoord()->getMyRID(); + OID node2 = OID::gen(); + OID node3 = OID::gen(); + OpTime time(100, 2); + + HandshakeArgs handshake; + ASSERT_OK(handshake.initialize(BSON("handshake" << node2 << "member" << 1))); + ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake)); + ASSERT_OK(handshake.initialize(BSON("handshake" << node3 << "member" << 2))); + ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake)); + + // 3 nodes waiting for time + WriteConcernOptions writeConcern; + writeConcern.wTimeout = WriteConcernOptions::kNoTimeout; + writeConcern.wNumNodes = 3; + + ReplicationAwaiter awaiter(getReplCoord(), &txn); + awaiter.setOpTime(time); + awaiter.setWriteConcern(writeConcern); + awaiter.start(&txn); + + // reconfig + Status status(ErrorCodes::InternalError, "Not Set"); + boost::thread reconfigThread(stdx::bind(doReplSetReconfig, getReplCoord(), &status)); + + NetworkInterfaceMock* net = getNet(); + getNet()->enterNetwork(); + const NetworkInterfaceMock::NetworkOperationIterator noi = net->getNextReadyRequest(); + const ReplicationExecutor::RemoteCommandRequest& request = noi->getRequest(); + repl::ReplSetHeartbeatArgs hbArgs; + ASSERT_OK(hbArgs.initialize(request.cmdObj)); + repl::ReplSetHeartbeatResponse hbResp; + hbResp.setSetName("mySet"); + hbResp.setState(MemberState::RS_SECONDARY); + hbResp.setVersion(2); + BSONObjBuilder respObj; + respObj << "ok" << 1; + hbResp.addToBSON(&respObj); + net->scheduleResponse(noi, net->now(), makeResponseStatus(respObj.obj())); + net->runReadyNetworkOperations(); + getNet()->exitNetwork(); + reconfigThread.join(); + ASSERT_OK(status); + + // satisfy write concern + ASSERT_OK(getReplCoord()->setLastOptime_forTest(selfRID, time)); + ASSERT_OK(getReplCoord()->setLastOptime_forTest(node2, time)); + ASSERT_OK(getReplCoord()->setLastOptime_forTest(node3, time)); + ReplicationCoordinator::StatusAndDuration statusAndDur = awaiter.getResult(); + ASSERT_OK(statusAndDur.status); + awaiter.reset(); +} + +void doReplSetReconfigToFewer(ReplicationCoordinatorImpl* replCoord, Status* status) { + OperationContextNoop txn; + BSONObjBuilder garbage; + ReplSetReconfigArgs args; + args.force = false; + args.newConfigObj = BSON("_id" + << "mySet" + << "version" << 3 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "node1:12345") + << BSON("_id" << 2 << "host" + << "node3:12345"))); + *status = replCoord->processReplSetReconfig(&txn, args, &garbage); +} + +TEST_F(ReplCoordTest, 
AwaitReplicationReconfigNodeCountExceedsNumberOfNodes) {
+ OperationContextNoop txn;
+ assertStartSuccess(BSON("_id"
+ << "mySet"
+ << "version" << 2 << "members"
+ << BSON_ARRAY(BSON("host"
+ << "node1:12345"
+ << "_id" << 0)
+ << BSON("host"
+ << "node2:12345"
+ << "_id" << 1) << BSON("host"
+ << "node3:12345"
+ << "_id" << 2))),
+ HostAndPort("node1", 12345));
+ ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY));
+ getReplCoord()->setMyLastOptime(OpTime(100, 2));
+ simulateSuccessfulElection();
+
+ OID node2 = OID::gen();
+ OID node3 = OID::gen();
+ OpTime time(100, 2);
+
+ HandshakeArgs handshake;
+ ASSERT_OK(handshake.initialize(BSON("handshake" << node2 << "member" << 1)));
+ ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake));
+ ASSERT_OK(handshake.initialize(BSON("handshake" << node3 << "member" << 2)));
+ ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake));
+
+ // 3 nodes waiting for time
+ WriteConcernOptions writeConcern;
+ writeConcern.wTimeout = WriteConcernOptions::kNoTimeout;
+ writeConcern.wNumNodes = 3;
+
+ ReplicationAwaiter awaiter(getReplCoord(), &txn);
+ awaiter.setOpTime(time);
+ awaiter.setWriteConcern(writeConcern);
+ awaiter.start(&txn);
+
+ // reconfig to fewer nodes
+ Status status(ErrorCodes::InternalError, "Not Set");
+ boost::thread reconfigThread(stdx::bind(doReplSetReconfigToFewer, getReplCoord(), &status));
+
+ NetworkInterfaceMock* net = getNet();
+ getNet()->enterNetwork();
+ const NetworkInterfaceMock::NetworkOperationIterator noi = net->getNextReadyRequest();
+ const ReplicationExecutor::RemoteCommandRequest& request = noi->getRequest();
+ repl::ReplSetHeartbeatArgs hbArgs;
+ ASSERT_OK(hbArgs.initialize(request.cmdObj));
+ repl::ReplSetHeartbeatResponse hbResp;
+ hbResp.setSetName("mySet");
+ hbResp.setState(MemberState::RS_SECONDARY);
+ hbResp.setVersion(2);
+ BSONObjBuilder respObj;
+ respObj << "ok" << 1;
+ hbResp.addToBSON(&respObj);
+ net->scheduleResponse(noi, net->now(), makeResponseStatus(respObj.obj()));
+ net->runReadyNetworkOperations();
+ getNet()->exitNetwork();
+ reconfigThread.join();
+ ASSERT_OK(status);
+
+ // writeconcern feasibility should be reevaluated and an error should be returned
+ ReplicationCoordinator::StatusAndDuration statusAndDur = awaiter.getResult();
+ ASSERT_EQUALS(ErrorCodes::CannotSatisfyWriteConcern, statusAndDur.status);
+ awaiter.reset();
+}
+
+TEST_F(ReplCoordTest, AwaitReplicationReconfigToSmallerMajority) {
+ OperationContextNoop txn;
+ assertStartSuccess(BSON("_id"
+ << "mySet"
+ << "version" << 2 << "members"
+ << BSON_ARRAY(BSON("host"
+ << "node1:12345"
+ << "_id" << 0)
+ << BSON("host"
+ << "node2:12345"
+ << "_id" << 1) << BSON("host"
+ << "node3:12345"
+ << "_id" << 2)
+ << BSON("host"
+ << "node4:12345"
+ << "_id" << 3) << BSON("host"
+ << "node5:12345"
+ << "_id" << 4))),
+ HostAndPort("node1", 12345));
+ ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY));
+ getReplCoord()->setMyLastOptime(OpTime(100, 1));
+ simulateSuccessfulElection();
+
+ OID node2 = OID::gen();
+ OID node3 = OID::gen();
+ OpTime time(100, 2);
+
+ HandshakeArgs handshake;
+ ASSERT_OK(handshake.initialize(BSON("handshake" << node2 << "member" << 1)));
+ ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake));
+ ASSERT_OK(handshake.initialize(BSON("handshake" << node3 << "member" << 2)));
+ ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake));
+ ASSERT_OK(getReplCoord()->setLastOptime_forTest(node2, time));
+ ASSERT_OK(getReplCoord()->setLastOptime_forTest(node3, time));
+
+ // majority nodes waiting for time
+ WriteConcernOptions writeConcern;
+ writeConcern.wTimeout = WriteConcernOptions::kNoTimeout;
+ writeConcern.wMode = "majority";
+
+ ReplicationAwaiter awaiter(getReplCoord(), &txn);
+ awaiter.setOpTime(time);
+ awaiter.setWriteConcern(writeConcern);
+ awaiter.start(&txn);
+
+ // demonstrate that majority cannot currently be satisfied
+ WriteConcernOptions writeConcern2;
+ writeConcern2.wTimeout = WriteConcernOptions::kNoWaiting;
+ writeConcern2.wMode = "majority";
+ ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit,
+ getReplCoord()->awaitReplication(&txn, time, writeConcern2).status);
+
+ // reconfig to three nodes
+ Status status(ErrorCodes::InternalError, "Not Set");
+ boost::thread reconfigThread(stdx::bind(doReplSetReconfig, getReplCoord(), &status));
+
+ NetworkInterfaceMock* net = getNet();
+ getNet()->enterNetwork();
+ const NetworkInterfaceMock::NetworkOperationIterator noi = net->getNextReadyRequest();
+ const ReplicationExecutor::RemoteCommandRequest& request = noi->getRequest();
+ repl::ReplSetHeartbeatArgs hbArgs;
+ ASSERT_OK(hbArgs.initialize(request.cmdObj));
+ repl::ReplSetHeartbeatResponse hbResp;
+ hbResp.setSetName("mySet");
+ hbResp.setState(MemberState::RS_SECONDARY);
+ hbResp.setVersion(2);
+ BSONObjBuilder respObj;
+ respObj << "ok" << 1;
+ hbResp.addToBSON(&respObj);
+ net->scheduleResponse(noi, net->now(), makeResponseStatus(respObj.obj()));
+ net->runReadyNetworkOperations();
+ getNet()->exitNetwork();
+ reconfigThread.join();
+ ASSERT_OK(status);
+
+ // writeconcern feasibility should be reevaluated and be satisfied
+ ReplicationCoordinator::StatusAndDuration statusAndDur = awaiter.getResult();
+ ASSERT_OK(statusAndDur.status);
+ awaiter.reset();
+}
+
+TEST_F(ReplCoordTest, AwaitReplicationMajority) {
+ // Test that majority write concern can only be
+ // satisfied by voting data-bearing members.
+ OperationContextNoop txn; + assertStartSuccess(BSON("_id" + << "mySet" + << "version" << 2 << "members" + << BSON_ARRAY(BSON("host" + << "node1:12345" + << "_id" << 0) + << BSON("host" + << "node2:12345" + << "_id" << 1) << BSON("host" + << "node3:12345" + << "_id" << 2) + << BSON("host" + << "node4:12345" + << "_id" << 3 << "votes" << 0) + << BSON("host" + << "node5:12345" + << "_id" << 4 << "arbiterOnly" << true))), + HostAndPort("node1", 12345)); + ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); + OpTime time(100, 0); + getReplCoord()->setMyLastOptime(time); + simulateSuccessfulElection(); + + WriteConcernOptions majorityWriteConcern; + majorityWriteConcern.wTimeout = WriteConcernOptions::kNoWaiting; + majorityWriteConcern.wMode = "majority"; + + ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit, + getReplCoord()->awaitReplication(&txn, time, majorityWriteConcern).status); + + OID client1 = OID::gen(); + HandshakeArgs handshake1; + ASSERT_OK(handshake1.initialize(BSON("handshake" << client1 << "member" << 1))); + ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake1)); + ASSERT_OK(getReplCoord()->setLastOptime_forTest(client1, time)); + ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit, + getReplCoord()->awaitReplication(&txn, time, majorityWriteConcern).status); + + // this member does not vote and as a result should not count towards write concern + OID client3 = OID::gen(); + HandshakeArgs handshake3; + ASSERT_OK(handshake3.initialize(BSON("handshake" << client3 << "member" << 3))); + ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake3)); + ASSERT_OK(getReplCoord()->setLastOptime_forTest(client3, time)); + ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit, + getReplCoord()->awaitReplication(&txn, time, majorityWriteConcern).status); + + OID client2 = OID::gen(); + HandshakeArgs handshake2; + ASSERT_OK(handshake2.initialize(BSON("handshake" << client2 << "member" << 2))); + ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake2)); + ASSERT_OK(getReplCoord()->setLastOptime_forTest(client2, time)); + ASSERT_OK(getReplCoord()->awaitReplication(&txn, time, majorityWriteConcern).status); +} + +// TODO(schwerin): Unit test election id updating } // namespace } // namespace repl diff --git a/src/mongo/db/repl/replication_coordinator_mock.cpp b/src/mongo/db/repl/replication_coordinator_mock.cpp index c601a8d4f44..b05412174ef 100644 --- a/src/mongo/db/repl/replication_coordinator_mock.cpp +++ b/src/mongo/db/repl/replication_coordinator_mock.cpp @@ -37,263 +37,258 @@ namespace mongo { namespace repl { - using std::vector; - - ReplicationCoordinatorMock::ReplicationCoordinatorMock(const ReplSettings& settings) : - _settings(settings) {} - ReplicationCoordinatorMock::~ReplicationCoordinatorMock() {} - - void ReplicationCoordinatorMock::startReplication(OperationContext* txn) { - // TODO - } - - void ReplicationCoordinatorMock::shutdown() { - // TODO - } - - const ReplSettings& ReplicationCoordinatorMock::getSettings() const { - return _settings; - } - - bool ReplicationCoordinatorMock::isReplEnabled() const { - return _settings.usingReplSets() || _settings.master || _settings.slave; - } - - ReplicationCoordinator::Mode ReplicationCoordinatorMock::getReplicationMode() const { - return modeNone; - } - - MemberState ReplicationCoordinatorMock::getMemberState() const { - // TODO - invariant(false); - } - - bool ReplicationCoordinatorMock::isInPrimaryOrSecondaryState() const { - invariant(false); - } - - Seconds ReplicationCoordinatorMock::getSlaveDelaySecs() const { - return 
Seconds(0); - } - - void ReplicationCoordinatorMock::clearSyncSourceBlacklist() {} - - ReplicationCoordinator::StatusAndDuration ReplicationCoordinatorMock::awaitReplication( - const OperationContext* txn, - const OpTime& ts, - const WriteConcernOptions& writeConcern) { - // TODO - return StatusAndDuration(Status::OK(), Milliseconds(0)); - } - - ReplicationCoordinator::StatusAndDuration - ReplicationCoordinatorMock::awaitReplicationOfLastOpForClient( - const OperationContext* txn, - const WriteConcernOptions& writeConcern) { - return StatusAndDuration(Status::OK(), Milliseconds(0)); - } - - Status ReplicationCoordinatorMock::stepDown(OperationContext* txn, - bool force, - const Milliseconds& waitTime, - const Milliseconds& stepdownTime) { - return Status::OK(); - } - - bool ReplicationCoordinatorMock::isMasterForReportingPurposes() { - // TODO - return true; - } - - bool ReplicationCoordinatorMock::canAcceptWritesForDatabase(const StringData& dbName) { - // TODO - return true; - } - - Status ReplicationCoordinatorMock::checkCanServeReadsFor(OperationContext* txn, - const NamespaceString& ns, - bool slaveOk) { - // TODO - return Status::OK(); - } - - bool ReplicationCoordinatorMock::shouldIgnoreUniqueIndex(const IndexDescriptor* idx) { - // TODO - return false; - } - - Status ReplicationCoordinatorMock::setLastOptimeForSlave(const OID& rid, const OpTime& ts) { - return Status::OK(); - } - - void ReplicationCoordinatorMock::setMyHeartbeatMessage(const std::string& msg) { - // TODO - } - - void ReplicationCoordinatorMock::setMyLastOptime(const OpTime& ts) {} - - void ReplicationCoordinatorMock::resetMyLastOptime() {} - - OpTime ReplicationCoordinatorMock::getMyLastOptime() const { - // TODO - return OpTime(); - } - - - OID ReplicationCoordinatorMock::getElectionId() { - // TODO - return OID(); - } - - OID ReplicationCoordinatorMock::getMyRID() const { - return OID(); - } - - int ReplicationCoordinatorMock::getMyId() const { - return 0; - } - - bool ReplicationCoordinatorMock::setFollowerMode(const MemberState& newState) { - return true; - } - - bool ReplicationCoordinatorMock::isWaitingForApplierToDrain() { - return false; - } - - void ReplicationCoordinatorMock::signalDrainComplete(OperationContext*) {} - - void ReplicationCoordinatorMock::signalUpstreamUpdater() {} - - bool ReplicationCoordinatorMock::prepareReplSetUpdatePositionCommand( - BSONObjBuilder* cmdBuilder) { - return true; - } - - void ReplicationCoordinatorMock::prepareReplSetUpdatePositionCommandHandshakes( - std::vector<BSONObj>* handshakes) {} - - void ReplicationCoordinatorMock::processReplSetGetConfig(BSONObjBuilder* result) { - // TODO - } - - Status ReplicationCoordinatorMock::processReplSetGetStatus(BSONObjBuilder* result) { - return Status::OK(); - } - - void ReplicationCoordinatorMock::fillIsMasterForReplSet(IsMasterResponse* result) {} - - void ReplicationCoordinatorMock::appendSlaveInfoData(BSONObjBuilder* result) {} - - Status ReplicationCoordinatorMock::setMaintenanceMode(bool activate) { - return Status::OK(); - } - - bool ReplicationCoordinatorMock::getMaintenanceMode() { - return false; - } - - Status ReplicationCoordinatorMock::processReplSetSyncFrom(const HostAndPort& target, - BSONObjBuilder* resultObj) { - // TODO - return Status::OK(); - } - - Status ReplicationCoordinatorMock::processReplSetFreeze(int secs, BSONObjBuilder* resultObj) { - // TODO - return Status::OK(); - } - - Status ReplicationCoordinatorMock::processHeartbeat(const ReplSetHeartbeatArgs& args, - ReplSetHeartbeatResponse* response) { - 
return Status::OK(); - } - - Status ReplicationCoordinatorMock::processReplSetReconfig(OperationContext* txn, - const ReplSetReconfigArgs& args, - BSONObjBuilder* resultObj) { - return Status::OK(); - } - - Status ReplicationCoordinatorMock::processReplSetInitiate(OperationContext* txn, - const BSONObj& configObj, - BSONObjBuilder* resultObj) { - return Status::OK(); - } - - Status ReplicationCoordinatorMock::processReplSetGetRBID(BSONObjBuilder* resultObj) { - return Status::OK(); - } - - void ReplicationCoordinatorMock::incrementRollbackID() {} +using std::vector; + +ReplicationCoordinatorMock::ReplicationCoordinatorMock(const ReplSettings& settings) + : _settings(settings) {} +ReplicationCoordinatorMock::~ReplicationCoordinatorMock() {} + +void ReplicationCoordinatorMock::startReplication(OperationContext* txn) { + // TODO +} + +void ReplicationCoordinatorMock::shutdown() { + // TODO +} + +const ReplSettings& ReplicationCoordinatorMock::getSettings() const { + return _settings; +} + +bool ReplicationCoordinatorMock::isReplEnabled() const { + return _settings.usingReplSets() || _settings.master || _settings.slave; +} + +ReplicationCoordinator::Mode ReplicationCoordinatorMock::getReplicationMode() const { + return modeNone; +} - Status ReplicationCoordinatorMock::processReplSetFresh(const ReplSetFreshArgs& args, - BSONObjBuilder* resultObj) { - return Status::OK(); - } +MemberState ReplicationCoordinatorMock::getMemberState() const { + // TODO + invariant(false); +} - Status ReplicationCoordinatorMock::processReplSetElect(const ReplSetElectArgs& args, - BSONObjBuilder* resultObj) { - // TODO - return Status::OK(); - } - - Status ReplicationCoordinatorMock::processReplSetUpdatePosition( - const UpdatePositionArgs& updates) { - // TODO - return Status::OK(); - } - - Status ReplicationCoordinatorMock::processHandshake(OperationContext* txn, - const HandshakeArgs& handshake) { - return Status::OK(); - } - - bool ReplicationCoordinatorMock::buildsIndexes() { - // TODO - return true; - } - - std::vector<HostAndPort> ReplicationCoordinatorMock::getHostsWrittenTo(const OpTime& op) { - return std::vector<HostAndPort>(); - } - - vector<HostAndPort> ReplicationCoordinatorMock::getOtherNodesInReplSet() const { - return std::vector<HostAndPort>(); - } - - Status ReplicationCoordinatorMock::checkIfWriteConcernCanBeSatisfied( - const WriteConcernOptions& writeConcern) const { - return Status::OK(); - } - - WriteConcernOptions ReplicationCoordinatorMock::getGetLastErrorDefault() { - return WriteConcernOptions(); - } - - Status ReplicationCoordinatorMock::checkReplEnabledForCommand(BSONObjBuilder* result) { - // TODO - return Status::OK(); - } - - HostAndPort ReplicationCoordinatorMock::chooseNewSyncSource(const OpTime& lastOpTimeFetched) { - invariant(false); - return HostAndPort(); - } - - void ReplicationCoordinatorMock::blacklistSyncSource(const HostAndPort& host, Date_t until) { - invariant(false); - } - - void ReplicationCoordinatorMock::resetLastOpTimeFromOplog(OperationContext* txn) { - invariant(false); - } - - bool ReplicationCoordinatorMock::shouldChangeSyncSource(const HostAndPort& currentSource) { - invariant(false); - } - - void ReplicationCoordinatorMock::summarizeAsHtml(ReplSetHtmlSummary* output) {} - -} // namespace repl -} // namespace mongo +bool ReplicationCoordinatorMock::isInPrimaryOrSecondaryState() const { + invariant(false); +} + +Seconds ReplicationCoordinatorMock::getSlaveDelaySecs() const { + return Seconds(0); +} + +void ReplicationCoordinatorMock::clearSyncSourceBlacklist() 
{} + +ReplicationCoordinator::StatusAndDuration ReplicationCoordinatorMock::awaitReplication( + const OperationContext* txn, const OpTime& ts, const WriteConcernOptions& writeConcern) { + // TODO + return StatusAndDuration(Status::OK(), Milliseconds(0)); +} + +ReplicationCoordinator::StatusAndDuration +ReplicationCoordinatorMock::awaitReplicationOfLastOpForClient( + const OperationContext* txn, const WriteConcernOptions& writeConcern) { + return StatusAndDuration(Status::OK(), Milliseconds(0)); +} + +Status ReplicationCoordinatorMock::stepDown(OperationContext* txn, + bool force, + const Milliseconds& waitTime, + const Milliseconds& stepdownTime) { + return Status::OK(); +} + +bool ReplicationCoordinatorMock::isMasterForReportingPurposes() { + // TODO + return true; +} + +bool ReplicationCoordinatorMock::canAcceptWritesForDatabase(const StringData& dbName) { + // TODO + return true; +} + +Status ReplicationCoordinatorMock::checkCanServeReadsFor(OperationContext* txn, + const NamespaceString& ns, + bool slaveOk) { + // TODO + return Status::OK(); +} + +bool ReplicationCoordinatorMock::shouldIgnoreUniqueIndex(const IndexDescriptor* idx) { + // TODO + return false; +} + +Status ReplicationCoordinatorMock::setLastOptimeForSlave(const OID& rid, const OpTime& ts) { + return Status::OK(); +} + +void ReplicationCoordinatorMock::setMyHeartbeatMessage(const std::string& msg) { + // TODO +} + +void ReplicationCoordinatorMock::setMyLastOptime(const OpTime& ts) {} + +void ReplicationCoordinatorMock::resetMyLastOptime() {} + +OpTime ReplicationCoordinatorMock::getMyLastOptime() const { + // TODO + return OpTime(); +} + + +OID ReplicationCoordinatorMock::getElectionId() { + // TODO + return OID(); +} + +OID ReplicationCoordinatorMock::getMyRID() const { + return OID(); +} + +int ReplicationCoordinatorMock::getMyId() const { + return 0; +} + +bool ReplicationCoordinatorMock::setFollowerMode(const MemberState& newState) { + return true; +} + +bool ReplicationCoordinatorMock::isWaitingForApplierToDrain() { + return false; +} + +void ReplicationCoordinatorMock::signalDrainComplete(OperationContext*) {} + +void ReplicationCoordinatorMock::signalUpstreamUpdater() {} + +bool ReplicationCoordinatorMock::prepareReplSetUpdatePositionCommand(BSONObjBuilder* cmdBuilder) { + return true; +} + +void ReplicationCoordinatorMock::prepareReplSetUpdatePositionCommandHandshakes( + std::vector<BSONObj>* handshakes) {} + +void ReplicationCoordinatorMock::processReplSetGetConfig(BSONObjBuilder* result) { + // TODO +} + +Status ReplicationCoordinatorMock::processReplSetGetStatus(BSONObjBuilder* result) { + return Status::OK(); +} + +void ReplicationCoordinatorMock::fillIsMasterForReplSet(IsMasterResponse* result) {} + +void ReplicationCoordinatorMock::appendSlaveInfoData(BSONObjBuilder* result) {} + +Status ReplicationCoordinatorMock::setMaintenanceMode(bool activate) { + return Status::OK(); +} + +bool ReplicationCoordinatorMock::getMaintenanceMode() { + return false; +} + +Status ReplicationCoordinatorMock::processReplSetSyncFrom(const HostAndPort& target, + BSONObjBuilder* resultObj) { + // TODO + return Status::OK(); +} + +Status ReplicationCoordinatorMock::processReplSetFreeze(int secs, BSONObjBuilder* resultObj) { + // TODO + return Status::OK(); +} + +Status ReplicationCoordinatorMock::processHeartbeat(const ReplSetHeartbeatArgs& args, + ReplSetHeartbeatResponse* response) { + return Status::OK(); +} + +Status ReplicationCoordinatorMock::processReplSetReconfig(OperationContext* txn, + const ReplSetReconfigArgs& args, + 
BSONObjBuilder* resultObj) { + return Status::OK(); +} + +Status ReplicationCoordinatorMock::processReplSetInitiate(OperationContext* txn, + const BSONObj& configObj, + BSONObjBuilder* resultObj) { + return Status::OK(); +} + +Status ReplicationCoordinatorMock::processReplSetGetRBID(BSONObjBuilder* resultObj) { + return Status::OK(); +} + +void ReplicationCoordinatorMock::incrementRollbackID() {} + +Status ReplicationCoordinatorMock::processReplSetFresh(const ReplSetFreshArgs& args, + BSONObjBuilder* resultObj) { + return Status::OK(); +} + +Status ReplicationCoordinatorMock::processReplSetElect(const ReplSetElectArgs& args, + BSONObjBuilder* resultObj) { + // TODO + return Status::OK(); +} + +Status ReplicationCoordinatorMock::processReplSetUpdatePosition(const UpdatePositionArgs& updates) { + // TODO + return Status::OK(); +} + +Status ReplicationCoordinatorMock::processHandshake(OperationContext* txn, + const HandshakeArgs& handshake) { + return Status::OK(); +} + +bool ReplicationCoordinatorMock::buildsIndexes() { + // TODO + return true; +} + +std::vector<HostAndPort> ReplicationCoordinatorMock::getHostsWrittenTo(const OpTime& op) { + return std::vector<HostAndPort>(); +} + +vector<HostAndPort> ReplicationCoordinatorMock::getOtherNodesInReplSet() const { + return std::vector<HostAndPort>(); +} + +Status ReplicationCoordinatorMock::checkIfWriteConcernCanBeSatisfied( + const WriteConcernOptions& writeConcern) const { + return Status::OK(); +} + +WriteConcernOptions ReplicationCoordinatorMock::getGetLastErrorDefault() { + return WriteConcernOptions(); +} + +Status ReplicationCoordinatorMock::checkReplEnabledForCommand(BSONObjBuilder* result) { + // TODO + return Status::OK(); +} + +HostAndPort ReplicationCoordinatorMock::chooseNewSyncSource(const OpTime& lastOpTimeFetched) { + invariant(false); + return HostAndPort(); +} + +void ReplicationCoordinatorMock::blacklistSyncSource(const HostAndPort& host, Date_t until) { + invariant(false); +} + +void ReplicationCoordinatorMock::resetLastOpTimeFromOplog(OperationContext* txn) { + invariant(false); +} + +bool ReplicationCoordinatorMock::shouldChangeSyncSource(const HostAndPort& currentSource) { + invariant(false); +} + +void ReplicationCoordinatorMock::summarizeAsHtml(ReplSetHtmlSummary* output) {} + +} // namespace repl +} // namespace mongo diff --git a/src/mongo/db/repl/replication_coordinator_mock.h b/src/mongo/db/repl/replication_coordinator_mock.h index 3f645554c7c..1e31629ee36 100644 --- a/src/mongo/db/repl/replication_coordinator_mock.h +++ b/src/mongo/db/repl/replication_coordinator_mock.h @@ -34,158 +34,148 @@ namespace mongo { namespace repl { - /** - * A mock ReplicationCoordinator. Currently it is extremely simple and exists solely to link - * into dbtests. - */ - class ReplicationCoordinatorMock : public ReplicationCoordinator { - MONGO_DISALLOW_COPYING(ReplicationCoordinatorMock); - - public: - - ReplicationCoordinatorMock(const ReplSettings& settings); - virtual ~ReplicationCoordinatorMock(); +/** + * A mock ReplicationCoordinator. Currently it is extremely simple and exists solely to link + * into dbtests. 
+ */ +class ReplicationCoordinatorMock : public ReplicationCoordinator { + MONGO_DISALLOW_COPYING(ReplicationCoordinatorMock); - virtual void startReplication(OperationContext* txn); +public: + ReplicationCoordinatorMock(const ReplSettings& settings); + virtual ~ReplicationCoordinatorMock(); - virtual void shutdown(); + virtual void startReplication(OperationContext* txn); - virtual const ReplSettings& getSettings() const; + virtual void shutdown(); - virtual bool isReplEnabled() const; + virtual const ReplSettings& getSettings() const; - virtual Mode getReplicationMode() const; + virtual bool isReplEnabled() const; - virtual MemberState getMemberState() const; + virtual Mode getReplicationMode() const; - virtual bool isInPrimaryOrSecondaryState() const; + virtual MemberState getMemberState() const; - virtual Seconds getSlaveDelaySecs() const; + virtual bool isInPrimaryOrSecondaryState() const; - virtual void clearSyncSourceBlacklist(); + virtual Seconds getSlaveDelaySecs() const; - virtual ReplicationCoordinator::StatusAndDuration awaitReplication( - const OperationContext* txn, - const OpTime& ts, - const WriteConcernOptions& writeConcern); + virtual void clearSyncSourceBlacklist(); - virtual ReplicationCoordinator::StatusAndDuration awaitReplicationOfLastOpForClient( - const OperationContext* txn, - const WriteConcernOptions& writeConcern); + virtual ReplicationCoordinator::StatusAndDuration awaitReplication( + const OperationContext* txn, const OpTime& ts, const WriteConcernOptions& writeConcern); - virtual Status stepDown(OperationContext* txn, - bool force, - const Milliseconds& waitTime, - const Milliseconds& stepdownTime); + virtual ReplicationCoordinator::StatusAndDuration awaitReplicationOfLastOpForClient( + const OperationContext* txn, const WriteConcernOptions& writeConcern); - virtual bool isMasterForReportingPurposes(); + virtual Status stepDown(OperationContext* txn, + bool force, + const Milliseconds& waitTime, + const Milliseconds& stepdownTime); - virtual bool canAcceptWritesForDatabase(const StringData& dbName); + virtual bool isMasterForReportingPurposes(); - virtual Status checkIfWriteConcernCanBeSatisfied( - const WriteConcernOptions& writeConcern) const; + virtual bool canAcceptWritesForDatabase(const StringData& dbName); - virtual Status checkCanServeReadsFor(OperationContext* txn, - const NamespaceString& ns, - bool slaveOk); + virtual Status checkIfWriteConcernCanBeSatisfied(const WriteConcernOptions& writeConcern) const; - virtual bool shouldIgnoreUniqueIndex(const IndexDescriptor* idx); + virtual Status checkCanServeReadsFor(OperationContext* txn, + const NamespaceString& ns, + bool slaveOk); - virtual Status setLastOptimeForSlave(const OID& rid, const OpTime& ts); + virtual bool shouldIgnoreUniqueIndex(const IndexDescriptor* idx); - virtual void setMyLastOptime(const OpTime& ts); + virtual Status setLastOptimeForSlave(const OID& rid, const OpTime& ts); - virtual void resetMyLastOptime(); + virtual void setMyLastOptime(const OpTime& ts); - virtual void setMyHeartbeatMessage(const std::string& msg); + virtual void resetMyLastOptime(); - virtual OpTime getMyLastOptime() const; + virtual void setMyHeartbeatMessage(const std::string& msg); - virtual OID getElectionId(); + virtual OpTime getMyLastOptime() const; - virtual OID getMyRID() const; + virtual OID getElectionId(); - virtual int getMyId() const; + virtual OID getMyRID() const; - virtual bool setFollowerMode(const MemberState& newState); + virtual int getMyId() const; - virtual bool 
isWaitingForApplierToDrain(); + virtual bool setFollowerMode(const MemberState& newState); - virtual void signalDrainComplete(OperationContext*); + virtual bool isWaitingForApplierToDrain(); - virtual void signalUpstreamUpdater(); + virtual void signalDrainComplete(OperationContext*); - virtual bool prepareReplSetUpdatePositionCommand(BSONObjBuilder* cmdBuilder); + virtual void signalUpstreamUpdater(); - virtual void prepareReplSetUpdatePositionCommandHandshakes( - std::vector<BSONObj>* handshakes); + virtual bool prepareReplSetUpdatePositionCommand(BSONObjBuilder* cmdBuilder); - virtual Status processReplSetGetStatus(BSONObjBuilder* result); + virtual void prepareReplSetUpdatePositionCommandHandshakes(std::vector<BSONObj>* handshakes); - virtual void fillIsMasterForReplSet(IsMasterResponse* result); + virtual Status processReplSetGetStatus(BSONObjBuilder* result); - virtual void appendSlaveInfoData(BSONObjBuilder* result); + virtual void fillIsMasterForReplSet(IsMasterResponse* result); - virtual void processReplSetGetConfig(BSONObjBuilder* result); + virtual void appendSlaveInfoData(BSONObjBuilder* result); - virtual Status setMaintenanceMode(bool activate); + virtual void processReplSetGetConfig(BSONObjBuilder* result); - virtual bool getMaintenanceMode(); + virtual Status setMaintenanceMode(bool activate); - virtual Status processReplSetSyncFrom(const HostAndPort& target, - BSONObjBuilder* resultObj); + virtual bool getMaintenanceMode(); - virtual Status processReplSetFreeze(int secs, BSONObjBuilder* resultObj); + virtual Status processReplSetSyncFrom(const HostAndPort& target, BSONObjBuilder* resultObj); - virtual Status processHeartbeat(const ReplSetHeartbeatArgs& args, - ReplSetHeartbeatResponse* response); + virtual Status processReplSetFreeze(int secs, BSONObjBuilder* resultObj); - virtual Status processReplSetReconfig(OperationContext* txn, - const ReplSetReconfigArgs& args, - BSONObjBuilder* resultObj); + virtual Status processHeartbeat(const ReplSetHeartbeatArgs& args, + ReplSetHeartbeatResponse* response); - virtual Status processReplSetInitiate(OperationContext* txn, - const BSONObj& configObj, - BSONObjBuilder* resultObj); + virtual Status processReplSetReconfig(OperationContext* txn, + const ReplSetReconfigArgs& args, + BSONObjBuilder* resultObj); - virtual Status processReplSetGetRBID(BSONObjBuilder* resultObj); + virtual Status processReplSetInitiate(OperationContext* txn, + const BSONObj& configObj, + BSONObjBuilder* resultObj); - virtual void incrementRollbackID(); + virtual Status processReplSetGetRBID(BSONObjBuilder* resultObj); - virtual Status processReplSetFresh(const ReplSetFreshArgs& args, - BSONObjBuilder* resultObj); + virtual void incrementRollbackID(); - virtual Status processReplSetElect(const ReplSetElectArgs& args, - BSONObjBuilder* resultObj); + virtual Status processReplSetFresh(const ReplSetFreshArgs& args, BSONObjBuilder* resultObj); - virtual Status processReplSetUpdatePosition(const UpdatePositionArgs& updates); + virtual Status processReplSetElect(const ReplSetElectArgs& args, BSONObjBuilder* resultObj); - virtual Status processHandshake(OperationContext* txn, const HandshakeArgs& handshake); + virtual Status processReplSetUpdatePosition(const UpdatePositionArgs& updates); - virtual bool buildsIndexes(); + virtual Status processHandshake(OperationContext* txn, const HandshakeArgs& handshake); - virtual std::vector<HostAndPort> getHostsWrittenTo(const OpTime& op); + virtual bool buildsIndexes(); - virtual std::vector<HostAndPort> 
getOtherNodesInReplSet() const; + virtual std::vector<HostAndPort> getHostsWrittenTo(const OpTime& op); - virtual WriteConcernOptions getGetLastErrorDefault(); + virtual std::vector<HostAndPort> getOtherNodesInReplSet() const; - virtual Status checkReplEnabledForCommand(BSONObjBuilder* result); + virtual WriteConcernOptions getGetLastErrorDefault(); - virtual HostAndPort chooseNewSyncSource(const OpTime& lastOpTimeFetched); + virtual Status checkReplEnabledForCommand(BSONObjBuilder* result); - virtual void blacklistSyncSource(const HostAndPort& host, Date_t until); + virtual HostAndPort chooseNewSyncSource(const OpTime& lastOpTimeFetched); - virtual void resetLastOpTimeFromOplog(OperationContext* txn); + virtual void blacklistSyncSource(const HostAndPort& host, Date_t until); - virtual bool shouldChangeSyncSource(const HostAndPort& currentSource); + virtual void resetLastOpTimeFromOplog(OperationContext* txn); - virtual void summarizeAsHtml(ReplSetHtmlSummary* output); + virtual bool shouldChangeSyncSource(const HostAndPort& currentSource); - private: + virtual void summarizeAsHtml(ReplSetHtmlSummary* output); - const ReplSettings _settings; - }; +private: + const ReplSettings _settings; +}; -} // namespace repl -} // namespace mongo +} // namespace repl +} // namespace mongo diff --git a/src/mongo/db/repl/replication_coordinator_test_fixture.cpp b/src/mongo/db/repl/replication_coordinator_test_fixture.cpp index 2479c5d4724..e129a4693df 100644 --- a/src/mongo/db/repl/replication_coordinator_test_fixture.cpp +++ b/src/mongo/db/repl/replication_coordinator_test_fixture.cpp @@ -49,225 +49,208 @@ namespace mongo { namespace repl { namespace { - bool stringContains(const std::string &haystack, const std::string& needle) { - return haystack.find(needle) != std::string::npos; - } +bool stringContains(const std::string& haystack, const std::string& needle) { + return haystack.find(needle) != std::string::npos; +} } // namespace - ReplicaSetConfig ReplCoordTest::assertMakeRSConfig(const BSONObj& configBson) { - ReplicaSetConfig config; - ASSERT_OK(config.initialize(configBson)); - ASSERT_OK(config.validate()); - return config; - } - - ReplCoordTest::ReplCoordTest() : _callShutdown(false) {} - ReplCoordTest::~ReplCoordTest() {} - - void ReplCoordTest::setUp() { - _settings.replSet = "mySet/node1:12345,node2:54321"; - } +ReplicaSetConfig ReplCoordTest::assertMakeRSConfig(const BSONObj& configBson) { + ReplicaSetConfig config; + ASSERT_OK(config.initialize(configBson)); + ASSERT_OK(config.validate()); + return config; +} - void ReplCoordTest::tearDown() { - if (_externalState) { - _externalState->setStoreLocalConfigDocumentToHang(false); - } - if (_callShutdown) { - shutdown(); - } - } +ReplCoordTest::ReplCoordTest() : _callShutdown(false) {} +ReplCoordTest::~ReplCoordTest() {} - void ReplCoordTest::assertRunUntil(Date_t newTime) { - this->_net->runUntil(newTime); - ASSERT_EQUALS(newTime, getNet()->now()); - } +void ReplCoordTest::setUp() { + _settings.replSet = "mySet/node1:12345,node2:54321"; +} - void ReplCoordTest::enterNetwork() { - getNet()->enterNetwork(); +void ReplCoordTest::tearDown() { + if (_externalState) { + _externalState->setStoreLocalConfigDocumentToHang(false); } - - void ReplCoordTest::exitNetwork() { - getNet()->exitNetwork(); + if (_callShutdown) { + shutdown(); } - - void ReplCoordTest::addSelf(const HostAndPort& selfHost) { - getExternalState()->addSelf(selfHost); +} + +void ReplCoordTest::assertRunUntil(Date_t newTime) { + this->_net->runUntil(newTime); + ASSERT_EQUALS(newTime, 
getNet()->now()); +} + +void ReplCoordTest::enterNetwork() { + getNet()->enterNetwork(); +} + +void ReplCoordTest::exitNetwork() { + getNet()->exitNetwork(); +} + +void ReplCoordTest::addSelf(const HostAndPort& selfHost) { + getExternalState()->addSelf(selfHost); +} + +void ReplCoordTest::init() { + invariant(!_repl); + invariant(!_callShutdown); + + // PRNG seed for tests. + const int64_t seed = 0; + + _topo = new TopologyCoordinatorImpl(Seconds(0)); + _net = new NetworkInterfaceMock; + _externalState = new ReplicationCoordinatorExternalStateMock; + _repl.reset(new ReplicationCoordinatorImpl(_settings, _externalState, _net, _topo, seed)); +} + +void ReplCoordTest::init(const ReplSettings& settings) { + _settings = settings; + init(); +} + +void ReplCoordTest::init(const std::string& replSet) { + _settings.replSet = replSet; + init(); +} + +void ReplCoordTest::start() { + invariant(!_callShutdown); + // if we haven't initialized yet, do that first. + if (!_repl) { + init(); } - void ReplCoordTest::init() { - invariant(!_repl); - invariant(!_callShutdown); - - // PRNG seed for tests. - const int64_t seed = 0; - - _topo = new TopologyCoordinatorImpl(Seconds(0)); - _net = new NetworkInterfaceMock; - _externalState = new ReplicationCoordinatorExternalStateMock; - _repl.reset(new ReplicationCoordinatorImpl(_settings, - _externalState, - _net, - _topo, - seed)); - } + OperationContextNoop txn; + _repl->startReplication(&txn); + _repl->waitForStartUpComplete(); + _callShutdown = true; +} - void ReplCoordTest::init(const ReplSettings& settings) { - _settings = settings; +void ReplCoordTest::start(const BSONObj& configDoc, const HostAndPort& selfHost) { + if (!_repl) { init(); } + _externalState->setLocalConfigDocument(StatusWith<BSONObj>(configDoc)); + _externalState->addSelf(selfHost); + start(); +} - void ReplCoordTest::init(const std::string& replSet) { - _settings.replSet = replSet; +void ReplCoordTest::start(const HostAndPort& selfHost) { + if (!_repl) { init(); } - - void ReplCoordTest::start() { - invariant(!_callShutdown); - // if we haven't initialized yet, do that first. 
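
// A minimal, self-contained sketch (not MongoDB code) of the lazy-initialization
// idiom ReplCoordTest::start() uses in the hunk above: a test may call init()
// with custom settings first, but start() falls back to a default init() when it
// has not. The names below (Fixture, _coord) are hypothetical stand-ins.
#include <cassert>
#include <memory>
#include <string>

class Fixture {
public:
    void init(const std::string& replSet) {
        assert(!_coord);  // mirrors invariant(!_repl): init() runs at most once
        _coord.reset(new std::string(replSet));
    }
    void start() {
        if (!_coord) {  // if we haven't initialized yet, do that first
            init("mySet/node1:12345,node2:54321");
        }
    }

private:
    std::unique_ptr<std::string> _coord;
};

int main() {
    Fixture f;
    f.start();  // no explicit init(): start() supplies the defaults
    return 0;
}
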
- if (!_repl) { - init(); - } - - OperationContextNoop txn; - _repl->startReplication(&txn); - _repl->waitForStartUpComplete(); - _callShutdown = true; - } - - void ReplCoordTest::start(const BSONObj& configDoc, const HostAndPort& selfHost) { - if (!_repl) { - init(); - } - _externalState->setLocalConfigDocument(StatusWith<BSONObj>(configDoc)); - _externalState->addSelf(selfHost); - start(); - } - - void ReplCoordTest::start(const HostAndPort& selfHost) { - if (!_repl) { - init(); - } - _externalState->addSelf(selfHost); - start(); - } - - void ReplCoordTest::assertStartSuccess( - const BSONObj& configDoc, - const HostAndPort& selfHost) { - start(configDoc, selfHost); - ASSERT_NE(MemberState::RS_STARTUP, getReplCoord()->getMemberState().s); - } - - ResponseStatus ReplCoordTest::makeResponseStatus(const BSONObj& doc, Milliseconds millis) { - log() << "Responding with " << doc; - return ResponseStatus(ReplicationExecutor::RemoteCommandResponse(doc, millis)); - } - - void ReplCoordTest::simulateSuccessfulElection() { - OperationContextReplMock txn; - ReplicationCoordinatorImpl* replCoord = getReplCoord(); - NetworkInterfaceMock* net = getNet(); - ReplicaSetConfig rsConfig = replCoord->getReplicaSetConfig_forTest(); - ASSERT(replCoord->getMemberState().secondary()) << - replCoord->getMemberState().toString(); - while (!replCoord->getMemberState().primary()) { - log() << "Waiting on network in state " << replCoord->getMemberState(); - getNet()->enterNetwork(); - const NetworkInterfaceMock::NetworkOperationIterator noi = net->getNextReadyRequest(); - const ReplicationExecutor::RemoteCommandRequest& request = noi->getRequest(); - log() << request.target.toString() << " processing " << request.cmdObj; - ReplSetHeartbeatArgs hbArgs; - if (hbArgs.initialize(request.cmdObj).isOK()) { - ReplSetHeartbeatResponse hbResp; - hbResp.setSetName(rsConfig.getReplSetName()); - hbResp.setState(MemberState::RS_SECONDARY); - hbResp.setVersion(rsConfig.getConfigVersion()); - BSONObjBuilder respObj; - respObj << "ok" << 1; - hbResp.addToBSON(&respObj); - net->scheduleResponse(noi, net->now(), makeResponseStatus(respObj.obj())); - } - else if (request.cmdObj.firstElement().fieldNameStringData() == "replSetFresh") { - net->scheduleResponse(noi, net->now(), makeResponseStatus( - BSON("ok" << 1 << - "fresher" << false << - "opTime" << Date_t(OpTime(0, 0).asDate()) << - "veto" << false))); - } - else if (request.cmdObj.firstElement().fieldNameStringData() == "replSetElect") { - net->scheduleResponse(noi, net->now(), makeResponseStatus( - BSON("ok" << 1 << - "vote" << 1 << - "round" << request.cmdObj["round"].OID()))); - } - else { - error() << "Black holing unexpected request to " << request.target << ": " << - request.cmdObj; - net->blackHole(noi); - } - net->runReadyNetworkOperations(); - getNet()->exitNetwork(); + _externalState->addSelf(selfHost); + start(); +} + +void ReplCoordTest::assertStartSuccess(const BSONObj& configDoc, const HostAndPort& selfHost) { + start(configDoc, selfHost); + ASSERT_NE(MemberState::RS_STARTUP, getReplCoord()->getMemberState().s); +} + +ResponseStatus ReplCoordTest::makeResponseStatus(const BSONObj& doc, Milliseconds millis) { + log() << "Responding with " << doc; + return ResponseStatus(ReplicationExecutor::RemoteCommandResponse(doc, millis)); +} + +void ReplCoordTest::simulateSuccessfulElection() { + OperationContextReplMock txn; + ReplicationCoordinatorImpl* replCoord = getReplCoord(); + NetworkInterfaceMock* net = getNet(); + ReplicaSetConfig rsConfig = 
replCoord->getReplicaSetConfig_forTest(); + ASSERT(replCoord->getMemberState().secondary()) << replCoord->getMemberState().toString(); + while (!replCoord->getMemberState().primary()) { + log() << "Waiting on network in state " << replCoord->getMemberState(); + getNet()->enterNetwork(); + const NetworkInterfaceMock::NetworkOperationIterator noi = net->getNextReadyRequest(); + const ReplicationExecutor::RemoteCommandRequest& request = noi->getRequest(); + log() << request.target.toString() << " processing " << request.cmdObj; + ReplSetHeartbeatArgs hbArgs; + if (hbArgs.initialize(request.cmdObj).isOK()) { + ReplSetHeartbeatResponse hbResp; + hbResp.setSetName(rsConfig.getReplSetName()); + hbResp.setState(MemberState::RS_SECONDARY); + hbResp.setVersion(rsConfig.getConfigVersion()); + BSONObjBuilder respObj; + respObj << "ok" << 1; + hbResp.addToBSON(&respObj); + net->scheduleResponse(noi, net->now(), makeResponseStatus(respObj.obj())); + } else if (request.cmdObj.firstElement().fieldNameStringData() == "replSetFresh") { + net->scheduleResponse( + noi, + net->now(), + makeResponseStatus(BSON("ok" << 1 << "fresher" << false << "opTime" + << Date_t(OpTime(0, 0).asDate()) << "veto" << false))); + } else if (request.cmdObj.firstElement().fieldNameStringData() == "replSetElect") { + net->scheduleResponse(noi, + net->now(), + makeResponseStatus(BSON("ok" << 1 << "vote" << 1 << "round" + << request.cmdObj["round"].OID()))); + } else { + error() << "Black holing unexpected request to " << request.target << ": " + << request.cmdObj; + net->blackHole(noi); } - ASSERT(replCoord->isWaitingForApplierToDrain()); - ASSERT(replCoord->getMemberState().primary()) << - replCoord->getMemberState().toString(); - - IsMasterResponse imResponse; - replCoord->fillIsMasterForReplSet(&imResponse); - ASSERT_FALSE(imResponse.isMaster()) << imResponse.toBSON().toString(); - ASSERT_TRUE(imResponse.isSecondary()) << imResponse.toBSON().toString(); - replCoord->signalDrainComplete(&txn); - replCoord->fillIsMasterForReplSet(&imResponse); - ASSERT_TRUE(imResponse.isMaster()) << imResponse.toBSON().toString(); - ASSERT_FALSE(imResponse.isSecondary()) << imResponse.toBSON().toString(); - - ASSERT(replCoord->getMemberState().primary()) << - replCoord->getMemberState().toString(); + net->runReadyNetworkOperations(); + getNet()->exitNetwork(); } - - void ReplCoordTest::simulateStepDownOnIsolation() { - ReplicationCoordinatorImpl* replCoord = getReplCoord(); - NetworkInterfaceMock* net = getNet(); - ReplicaSetConfig rsConfig = replCoord->getReplicaSetConfig_forTest(); - ASSERT(replCoord->getMemberState().primary()) << - replCoord->getMemberState().toString(); - while (replCoord->getMemberState().primary()) { - log() << "Waiting on network in state " << replCoord->getMemberState(); - getNet()->enterNetwork(); - net->runUntil(net->now() + 10000); - const NetworkInterfaceMock::NetworkOperationIterator noi = net->getNextReadyRequest(); - const ReplicationExecutor::RemoteCommandRequest& request = noi->getRequest(); - log() << request.target.toString() << " processing " << request.cmdObj; - ReplSetHeartbeatArgs hbArgs; - if (hbArgs.initialize(request.cmdObj).isOK()) { - net->scheduleResponse(noi, - net->now(), - ResponseStatus(ErrorCodes::NetworkTimeout, "Nobody's home")); - } - else { - error() << "Black holing unexpected request to " << request.target << ": " << - request.cmdObj; - net->blackHole(noi); - } - net->runReadyNetworkOperations(); - getNet()->exitNetwork(); + ASSERT(replCoord->isWaitingForApplierToDrain()); + 
ASSERT(replCoord->getMemberState().primary()) << replCoord->getMemberState().toString(); + + IsMasterResponse imResponse; + replCoord->fillIsMasterForReplSet(&imResponse); + ASSERT_FALSE(imResponse.isMaster()) << imResponse.toBSON().toString(); + ASSERT_TRUE(imResponse.isSecondary()) << imResponse.toBSON().toString(); + replCoord->signalDrainComplete(&txn); + replCoord->fillIsMasterForReplSet(&imResponse); + ASSERT_TRUE(imResponse.isMaster()) << imResponse.toBSON().toString(); + ASSERT_FALSE(imResponse.isSecondary()) << imResponse.toBSON().toString(); + + ASSERT(replCoord->getMemberState().primary()) << replCoord->getMemberState().toString(); +} + +void ReplCoordTest::simulateStepDownOnIsolation() { + ReplicationCoordinatorImpl* replCoord = getReplCoord(); + NetworkInterfaceMock* net = getNet(); + ReplicaSetConfig rsConfig = replCoord->getReplicaSetConfig_forTest(); + ASSERT(replCoord->getMemberState().primary()) << replCoord->getMemberState().toString(); + while (replCoord->getMemberState().primary()) { + log() << "Waiting on network in state " << replCoord->getMemberState(); + getNet()->enterNetwork(); + net->runUntil(net->now() + 10000); + const NetworkInterfaceMock::NetworkOperationIterator noi = net->getNextReadyRequest(); + const ReplicationExecutor::RemoteCommandRequest& request = noi->getRequest(); + log() << request.target.toString() << " processing " << request.cmdObj; + ReplSetHeartbeatArgs hbArgs; + if (hbArgs.initialize(request.cmdObj).isOK()) { + net->scheduleResponse( + noi, net->now(), ResponseStatus(ErrorCodes::NetworkTimeout, "Nobody's home")); + } else { + error() << "Black holing unexpected request to " << request.target << ": " + << request.cmdObj; + net->blackHole(noi); } + net->runReadyNetworkOperations(); + getNet()->exitNetwork(); } - - void ReplCoordTest::shutdown() { - invariant(_callShutdown); - _net->exitNetwork(); - _repl->shutdown(); - _callShutdown = false; - } - - int64_t ReplCoordTest::countLogLinesContaining(const std::string& needle) { - return std::count_if(getCapturedLogMessages().begin(), - getCapturedLogMessages().end(), - stdx::bind(stringContains, - stdx::placeholders::_1, - needle)); - } +} + +void ReplCoordTest::shutdown() { + invariant(_callShutdown); + _net->exitNetwork(); + _repl->shutdown(); + _callShutdown = false; +} + +int64_t ReplCoordTest::countLogLinesContaining(const std::string& needle) { + return std::count_if(getCapturedLogMessages().begin(), + getCapturedLogMessages().end(), + stdx::bind(stringContains, stdx::placeholders::_1, needle)); +} } // namespace repl } // namespace mongo diff --git a/src/mongo/db/repl/replication_coordinator_test_fixture.h b/src/mongo/db/repl/replication_coordinator_test_fixture.h index 48f644abbe7..5e4094c466d 100644 --- a/src/mongo/db/repl/replication_coordinator_test_fixture.h +++ b/src/mongo/db/repl/replication_coordinator_test_fixture.h @@ -38,155 +38,163 @@ namespace mongo { - class BSONObj; - struct HostAndPort; +class BSONObj; +struct HostAndPort; namespace repl { - class NetworkInterfaceMock; - class ReplicaSetConfig; - class ReplicationCoordinatorExternalStateMock; - class ReplicationCoordinatorImpl; - class TopologyCoordinatorImpl; - - /** - * Fixture for testing ReplicationCoordinatorImpl behaviors. - */ - class ReplCoordTest : public mongo::unittest::Test { - public: - /** - * Makes a ResponseStatus with the given "doc" response and optional elapsed time "millis". 
- */ - static ResponseStatus makeResponseStatus(const BSONObj& doc, - Milliseconds millis = Milliseconds(0)); - - /** - * Constructs a ReplicaSetConfig from the given BSON, or raises a test failure exception. - */ - static ReplicaSetConfig assertMakeRSConfig(const BSONObj& configBSON); - - ReplCoordTest(); - virtual ~ReplCoordTest(); - - protected: - virtual void setUp(); - virtual void tearDown(); - - /** - * Gets the network mock. - */ - NetworkInterfaceMock* getNet() { return _net; } - - /** - * Gets the replication coordinator under test. - */ - ReplicationCoordinatorImpl* getReplCoord() { return _repl.get();} - - /** - * Gets the topology coordinator used by the replication coordinator under test. - */ - TopologyCoordinatorImpl& getTopoCoord() { return *_topo;} - - /** - * Gets the external state used by the replication coordinator under test. - */ - ReplicationCoordinatorExternalStateMock* getExternalState() { return _externalState; } - - /** - * Adds "selfHost" to the list of hosts that identify as "this" host. - */ - void addSelf(const HostAndPort& selfHost); - - /** - * Moves time forward in the network until the new time, and asserts if now!=newTime after - */ - void assertRunUntil(Date_t newTime); - - /** - * Shorthand for getNet()->enterNetwork() - */ - void enterNetwork(); - - /** - * Shorthand for getNet()->exitNetwork() - */ - void exitNetwork(); - - /** - * Initializes the objects under test; this behavior is optional, in case you need to call - * any methods on the network or coordinator objects before calling start. - */ - void init(); - - /** - * Initializes the objects under test, using the given "settings". - */ - void init(const ReplSettings& settings); - - /** - * Initializes the objects under test, using "replSet" as the name of the replica set under - * test. - */ - void init(const std::string& replSet); - - /** - * Starts the replication coordinator under test, with no local config document and - * no notion of what host or hosts are represented by the network interface. - */ - void start(); - - /** - * Starts the replication coordinator under test, with the given configuration in - * local storage and the given host name. - */ - void start(const BSONObj& configDoc, const HostAndPort& selfHost); - - /** - * Starts the replication coordinator under test with the given host name. - */ - void start(const HostAndPort& selfHost); - - /** - * Brings the repl coord from SECONDARY to PRIMARY by simulating the messages required to - * elect it. - * - * Behavior is unspecified if node does not have a clean config, is not in SECONDARY, etc. - */ - void simulateSuccessfulElection(); - - /** - * Brings the repl coord from PRIMARY to SECONDARY by simulating a period of time in which - * all heartbeats respond with an error condition, such as time out. - */ - void simulateStepDownOnIsolation(); - - /** - * Asserts that calling start(configDoc, selfHost) successfully initiates the - * ReplicationCoordinator under test. - */ - void assertStartSuccess(const BSONObj& configDoc, const HostAndPort& selfHost); - - /** - * Shuts down the objects under test. - */ - void shutdown(); - - /** - * Returns the number of collected log lines containing "needle". 
- */ - int64_t countLogLinesContaining(const std::string& needle); - - private: - boost::scoped_ptr<ReplicationCoordinatorImpl> _repl; - // Owned by ReplicationCoordinatorImpl - TopologyCoordinatorImpl* _topo; - // Owned by ReplicationCoordinatorImpl - NetworkInterfaceMock* _net; - // Owned by ReplicationCoordinatorImpl - ReplicationCoordinatorExternalStateMock* _externalState; - ReplSettings _settings; - bool _callShutdown; - }; +class NetworkInterfaceMock; +class ReplicaSetConfig; +class ReplicationCoordinatorExternalStateMock; +class ReplicationCoordinatorImpl; +class TopologyCoordinatorImpl; + +/** + * Fixture for testing ReplicationCoordinatorImpl behaviors. + */ +class ReplCoordTest : public mongo::unittest::Test { +public: + /** + * Makes a ResponseStatus with the given "doc" response and optional elapsed time "millis". + */ + static ResponseStatus makeResponseStatus(const BSONObj& doc, + Milliseconds millis = Milliseconds(0)); + + /** + * Constructs a ReplicaSetConfig from the given BSON, or raises a test failure exception. + */ + static ReplicaSetConfig assertMakeRSConfig(const BSONObj& configBSON); + + ReplCoordTest(); + virtual ~ReplCoordTest(); + +protected: + virtual void setUp(); + virtual void tearDown(); + + /** + * Gets the network mock. + */ + NetworkInterfaceMock* getNet() { + return _net; + } + + /** + * Gets the replication coordinator under test. + */ + ReplicationCoordinatorImpl* getReplCoord() { + return _repl.get(); + } + + /** + * Gets the topology coordinator used by the replication coordinator under test. + */ + TopologyCoordinatorImpl& getTopoCoord() { + return *_topo; + } + + /** + * Gets the external state used by the replication coordinator under test. + */ + ReplicationCoordinatorExternalStateMock* getExternalState() { + return _externalState; + } + + /** + * Adds "selfHost" to the list of hosts that identify as "this" host. + */ + void addSelf(const HostAndPort& selfHost); + + /** + * Moves time forward in the network until the new time, and asserts if now!=newTime after + */ + void assertRunUntil(Date_t newTime); + + /** + * Shorthand for getNet()->enterNetwork() + */ + void enterNetwork(); + + /** + * Shorthand for getNet()->exitNetwork() + */ + void exitNetwork(); + + /** + * Initializes the objects under test; this behavior is optional, in case you need to call + * any methods on the network or coordinator objects before calling start. + */ + void init(); + + /** + * Initializes the objects under test, using the given "settings". + */ + void init(const ReplSettings& settings); + + /** + * Initializes the objects under test, using "replSet" as the name of the replica set under + * test. + */ + void init(const std::string& replSet); + + /** + * Starts the replication coordinator under test, with no local config document and + * no notion of what host or hosts are represented by the network interface. + */ + void start(); + + /** + * Starts the replication coordinator under test, with the given configuration in + * local storage and the given host name. + */ + void start(const BSONObj& configDoc, const HostAndPort& selfHost); + + /** + * Starts the replication coordinator under test with the given host name. + */ + void start(const HostAndPort& selfHost); + + /** + * Brings the repl coord from SECONDARY to PRIMARY by simulating the messages required to + * elect it. + * + * Behavior is unspecified if node does not have a clean config, is not in SECONDARY, etc. 
+ */ + void simulateSuccessfulElection(); + + /** + * Brings the repl coord from PRIMARY to SECONDARY by simulating a period of time in which + * all heartbeats respond with an error condition, such as time out. + */ + void simulateStepDownOnIsolation(); + + /** + * Asserts that calling start(configDoc, selfHost) successfully initiates the + * ReplicationCoordinator under test. + */ + void assertStartSuccess(const BSONObj& configDoc, const HostAndPort& selfHost); + + /** + * Shuts down the objects under test. + */ + void shutdown(); + + /** + * Returns the number of collected log lines containing "needle". + */ + int64_t countLogLinesContaining(const std::string& needle); + +private: + boost::scoped_ptr<ReplicationCoordinatorImpl> _repl; + // Owned by ReplicationCoordinatorImpl + TopologyCoordinatorImpl* _topo; + // Owned by ReplicationCoordinatorImpl + NetworkInterfaceMock* _net; + // Owned by ReplicationCoordinatorImpl + ReplicationCoordinatorExternalStateMock* _externalState; + ReplSettings _settings; + bool _callShutdown; +}; } // namespace repl } // namespace mongo diff --git a/src/mongo/db/repl/replication_executor.cpp b/src/mongo/db/repl/replication_executor.cpp index ad24282b404..c7cb4c9cb9c 100644 --- a/src/mongo/db/repl/replication_executor.cpp +++ b/src/mongo/db/repl/replication_executor.cpp @@ -39,519 +39,452 @@ namespace mongo { namespace repl { namespace { - stdx::function<void ()> makeNoExcept(const stdx::function<void ()> &fn); +stdx::function<void()> makeNoExcept(const stdx::function<void()>& fn); } // namespace - const ReplicationExecutor::Milliseconds ReplicationExecutor::kNoTimeout(-1); - const Date_t ReplicationExecutor::kNoExpirationDate(-1); - - ReplicationExecutor::ReplicationExecutor(NetworkInterface* netInterface, int64_t prngSeed) : - _random(prngSeed), - _networkInterface(netInterface), - _totalEventWaiters(0), - _inShutdown(false), - _dblockWorkers(threadpool::ThreadPool::DoNotStartThreadsTag(), - 1, - "replCallbackWithGlobalLock-"), - _nextId(0) { - } - - ReplicationExecutor::~ReplicationExecutor() {} - - std::string ReplicationExecutor::getDiagnosticString() { - boost::lock_guard<boost::mutex> lk(_mutex); - return _getDiagnosticString_inlock(); - } - - std::string ReplicationExecutor::_getDiagnosticString_inlock() const { - str::stream output; - output << "ReplicationExecutor"; - output << " networkInProgress:" << _networkInProgressQueue.size(); - output << " exclusiveInProgress:" << _exclusiveLockInProgressQueue.size(); - output << " sleeperQueue:" << _sleepersQueue.size(); - output << " ready:" << _readyQueue.size(); - output << " free:" << _freeQueue.size(); - output << " unsignaledEvents:" << _unsignaledEvents.size(); - output << " eventWaiters:" << _totalEventWaiters; - output << " shuttingDown:" << _inShutdown; - output << " networkInterface:" << _networkInterface->getDiagnosticString(); - return output; - } - - Date_t ReplicationExecutor::now() { - return _networkInterface->now(); - } - - void ReplicationExecutor::run() { - setThreadName("ReplicationExecutor"); - _networkInterface->startup(); - _dblockWorkers.startThreads(); - std::pair<WorkItem, CallbackHandle> work; - while ((work = getWork()).first.callback) { - { - boost::lock_guard<boost::mutex> lk(_terribleExLockSyncMutex); - const Status inStatus = work.first.isCanceled ? 
- Status(ErrorCodes::CallbackCanceled, "Callback canceled") : - Status::OK(); - makeNoExcept(stdx::bind(work.first.callback, - CallbackData(this, work.second, inStatus)))(); - } - signalEvent(work.first.finishedEvent); - } - finishShutdown(); - _networkInterface->shutdown(); - } - - void ReplicationExecutor::shutdown() { - // Correct shutdown needs to: - // * Disable future work queueing. - // * drain all of the unsignaled events, sleepers, and ready queue, by running those - // callbacks with a "shutdown" or "canceled" status. - // * Signal all threads blocked in waitForEvent, and wait for them to return from that method. - boost::lock_guard<boost::mutex> lk(_mutex); - _inShutdown = true; - - _readyQueue.splice(_readyQueue.end(), _exclusiveLockInProgressQueue); - _readyQueue.splice(_readyQueue.end(), _networkInProgressQueue); - _readyQueue.splice(_readyQueue.end(), _sleepersQueue); - for (EventList::iterator event = _unsignaledEvents.begin(); - event != _unsignaledEvents.end(); - ++event) { - - _readyQueue.splice(_readyQueue.end(), event->waiters); - } - for (WorkQueue::iterator readyWork = _readyQueue.begin(); - readyWork != _readyQueue.end(); - ++readyWork) { - - readyWork->isCanceled = true; +const ReplicationExecutor::Milliseconds ReplicationExecutor::kNoTimeout(-1); +const Date_t ReplicationExecutor::kNoExpirationDate(-1); + +ReplicationExecutor::ReplicationExecutor(NetworkInterface* netInterface, int64_t prngSeed) + : _random(prngSeed), + _networkInterface(netInterface), + _totalEventWaiters(0), + _inShutdown(false), + _dblockWorkers( + threadpool::ThreadPool::DoNotStartThreadsTag(), 1, "replCallbackWithGlobalLock-"), + _nextId(0) {} + +ReplicationExecutor::~ReplicationExecutor() {} + +std::string ReplicationExecutor::getDiagnosticString() { + boost::lock_guard<boost::mutex> lk(_mutex); + return _getDiagnosticString_inlock(); +} + +std::string ReplicationExecutor::_getDiagnosticString_inlock() const { + str::stream output; + output << "ReplicationExecutor"; + output << " networkInProgress:" << _networkInProgressQueue.size(); + output << " exclusiveInProgress:" << _exclusiveLockInProgressQueue.size(); + output << " sleeperQueue:" << _sleepersQueue.size(); + output << " ready:" << _readyQueue.size(); + output << " free:" << _freeQueue.size(); + output << " unsignaledEvents:" << _unsignaledEvents.size(); + output << " eventWaiters:" << _totalEventWaiters; + output << " shuttingDown:" << _inShutdown; + output << " networkInterface:" << _networkInterface->getDiagnosticString(); + return output; +} + +Date_t ReplicationExecutor::now() { + return _networkInterface->now(); +} + +void ReplicationExecutor::run() { + setThreadName("ReplicationExecutor"); + _networkInterface->startup(); + _dblockWorkers.startThreads(); + std::pair<WorkItem, CallbackHandle> work; + while ((work = getWork()).first.callback) { + { + boost::lock_guard<boost::mutex> lk(_terribleExLockSyncMutex); + const Status inStatus = work.first.isCanceled + ? Status(ErrorCodes::CallbackCanceled, "Callback canceled") + : Status::OK(); + makeNoExcept( + stdx::bind(work.first.callback, CallbackData(this, work.second, inStatus)))(); } + signalEvent(work.first.finishedEvent); + } + finishShutdown(); + _networkInterface->shutdown(); +} + +void ReplicationExecutor::shutdown() { + // Correct shutdown needs to: + // * Disable future work queueing. + // * drain all of the unsignaled events, sleepers, and ready queue, by running those + // callbacks with a "shutdown" or "canceled" status. 
+ // * Signal all threads blocked in waitForEvent, and wait for them to return from that method. + boost::lock_guard<boost::mutex> lk(_mutex); + _inShutdown = true; + + _readyQueue.splice(_readyQueue.end(), _exclusiveLockInProgressQueue); + _readyQueue.splice(_readyQueue.end(), _networkInProgressQueue); + _readyQueue.splice(_readyQueue.end(), _sleepersQueue); + for (EventList::iterator event = _unsignaledEvents.begin(); event != _unsignaledEvents.end(); + ++event) { + _readyQueue.splice(_readyQueue.end(), event->waiters); + } + for (WorkQueue::iterator readyWork = _readyQueue.begin(); readyWork != _readyQueue.end(); + ++readyWork) { + readyWork->isCanceled = true; + } + _networkInterface->signalWorkAvailable(); +} + +void ReplicationExecutor::finishShutdown() { + _dblockWorkers.join(); + boost::unique_lock<boost::mutex> lk(_mutex); + invariant(_inShutdown); + invariant(_exclusiveLockInProgressQueue.empty()); + invariant(_readyQueue.empty()); + invariant(_sleepersQueue.empty()); + + while (!_unsignaledEvents.empty()) { + EventList::iterator event = _unsignaledEvents.begin(); + invariant(event->waiters.empty()); + signalEvent_inlock(EventHandle(event, ++_nextId)); + } + + while (_totalEventWaiters > 0) + _noMoreWaitingThreads.wait(lk); + + invariant(_exclusiveLockInProgressQueue.empty()); + invariant(_readyQueue.empty()); + invariant(_sleepersQueue.empty()); + invariant(_unsignaledEvents.empty()); +} + +void ReplicationExecutor::maybeNotifyShutdownComplete_inlock() { + if (_totalEventWaiters == 0) + _noMoreWaitingThreads.notify_all(); +} + +StatusWith<ReplicationExecutor::EventHandle> ReplicationExecutor::makeEvent() { + boost::lock_guard<boost::mutex> lk(_mutex); + return makeEvent_inlock(); +} + +StatusWith<ReplicationExecutor::EventHandle> ReplicationExecutor::makeEvent_inlock() { + if (_inShutdown) + return StatusWith<EventHandle>(ErrorCodes::ShutdownInProgress, "Shutdown in progress"); + + if (_signaledEvents.empty()) + _signaledEvents.push_back(Event()); + const EventList::iterator iter = _signaledEvents.begin(); + invariant(iter->waiters.empty()); + iter->generation++; + iter->isSignaled = false; + _unsignaledEvents.splice(_unsignaledEvents.end(), _signaledEvents, iter); + return StatusWith<EventHandle>(EventHandle(iter, ++_nextId)); +} + +void ReplicationExecutor::signalEvent(const EventHandle& event) { + boost::lock_guard<boost::mutex> lk(_mutex); + signalEvent_inlock(event); +} + +void ReplicationExecutor::signalEvent_inlock(const EventHandle& event) { + invariant(!event._iter->isSignaled); + invariant(event._iter->generation == event._generation); + event._iter->isSignaled = true; + _signaledEvents.splice(_signaledEvents.end(), _unsignaledEvents, event._iter); + if (!event._iter->waiters.empty()) { + _readyQueue.splice(_readyQueue.end(), event._iter->waiters); _networkInterface->signalWorkAvailable(); } - - void ReplicationExecutor::finishShutdown() { - _dblockWorkers.join(); - boost::unique_lock<boost::mutex> lk(_mutex); - invariant(_inShutdown); - invariant(_exclusiveLockInProgressQueue.empty()); - invariant(_readyQueue.empty()); - invariant(_sleepersQueue.empty()); - - while (!_unsignaledEvents.empty()) { - EventList::iterator event = _unsignaledEvents.begin(); - invariant(event->waiters.empty()); - signalEvent_inlock(EventHandle(event, ++_nextId)); - } - - while (_totalEventWaiters > 0) - _noMoreWaitingThreads.wait(lk); - - invariant(_exclusiveLockInProgressQueue.empty()); - invariant(_readyQueue.empty()); - invariant(_sleepersQueue.empty()); - 
invariant(_unsignaledEvents.empty()); - } - - void ReplicationExecutor::maybeNotifyShutdownComplete_inlock() { - if (_totalEventWaiters == 0) - _noMoreWaitingThreads.notify_all(); - } - - StatusWith<ReplicationExecutor::EventHandle> ReplicationExecutor::makeEvent() { - boost::lock_guard<boost::mutex> lk(_mutex); - return makeEvent_inlock(); - } - - StatusWith<ReplicationExecutor::EventHandle> ReplicationExecutor::makeEvent_inlock() { - if (_inShutdown) - return StatusWith<EventHandle>(ErrorCodes::ShutdownInProgress, "Shutdown in progress"); - - if (_signaledEvents.empty()) - _signaledEvents.push_back(Event()); - const EventList::iterator iter = _signaledEvents.begin(); - invariant(iter->waiters.empty()); - iter->generation++; - iter->isSignaled = false; - _unsignaledEvents.splice(_unsignaledEvents.end(), _signaledEvents, iter); - return StatusWith<EventHandle>(EventHandle(iter, ++_nextId)); - } - - void ReplicationExecutor::signalEvent(const EventHandle& event) { - boost::lock_guard<boost::mutex> lk(_mutex); - signalEvent_inlock(event); - } - - void ReplicationExecutor::signalEvent_inlock(const EventHandle& event) { - invariant(!event._iter->isSignaled); - invariant(event._iter->generation == event._generation); - event._iter->isSignaled = true; - _signaledEvents.splice(_signaledEvents.end(), _unsignaledEvents, event._iter); - if (!event._iter->waiters.empty()) { - _readyQueue.splice(_readyQueue.end(), event._iter->waiters); - _networkInterface->signalWorkAvailable(); - } - event._iter->isSignaledCondition->notify_all(); - } - - StatusWith<ReplicationExecutor::CallbackHandle> ReplicationExecutor::onEvent( - const EventHandle& event, - const CallbackFn& work) { - boost::lock_guard<boost::mutex> lk(_mutex); - invariant(event.isValid()); - invariant(event._generation <= event._iter->generation); - WorkQueue* queue = &_readyQueue; - if (event._generation == event._iter->generation && !event._iter->isSignaled) { - queue = &event._iter->waiters; - } - else { - queue = &_readyQueue; - } - return enqueueWork_inlock(queue, work); - } - - void ReplicationExecutor::waitForEvent(const EventHandle& event) { - boost::unique_lock<boost::mutex> lk(_mutex); - invariant(event.isValid()); - ++_totalEventWaiters; - while ((event._generation == event._iter->generation) && !event._iter->isSignaled) { - event._iter->isSignaledCondition->wait(lk); - } - --_totalEventWaiters; - maybeNotifyShutdownComplete_inlock(); - } - - static void remoteCommandFinished( - const ReplicationExecutor::CallbackData& cbData, - const ReplicationExecutor::RemoteCommandCallbackFn& cb, - const ReplicationExecutor::RemoteCommandRequest& request, - const ResponseStatus& response) { - - if (cbData.status.isOK()) { - cb(ReplicationExecutor::RemoteCommandCallbackData( - cbData.executor, cbData.myHandle, request, response)); - } - else { - cb(ReplicationExecutor::RemoteCommandCallbackData( - cbData.executor, - cbData.myHandle, - request, - ResponseStatus(cbData.status))); - } - } - - static void remoteCommandFailedEarly( - const ReplicationExecutor::CallbackData& cbData, - const ReplicationExecutor::RemoteCommandCallbackFn& cb, - const ReplicationExecutor::RemoteCommandRequest& request) { - - invariant(!cbData.status.isOK()); + event._iter->isSignaledCondition->notify_all(); +} + +StatusWith<ReplicationExecutor::CallbackHandle> ReplicationExecutor::onEvent( + const EventHandle& event, const CallbackFn& work) { + boost::lock_guard<boost::mutex> lk(_mutex); + invariant(event.isValid()); + invariant(event._generation <= 
event._iter->generation); + WorkQueue* queue = &_readyQueue; + if (event._generation == event._iter->generation && !event._iter->isSignaled) { + queue = &event._iter->waiters; + } else { + queue = &_readyQueue; + } + return enqueueWork_inlock(queue, work); +} + +void ReplicationExecutor::waitForEvent(const EventHandle& event) { + boost::unique_lock<boost::mutex> lk(_mutex); + invariant(event.isValid()); + ++_totalEventWaiters; + while ((event._generation == event._iter->generation) && !event._iter->isSignaled) { + event._iter->isSignaledCondition->wait(lk); + } + --_totalEventWaiters; + maybeNotifyShutdownComplete_inlock(); +} + +static void remoteCommandFinished(const ReplicationExecutor::CallbackData& cbData, + const ReplicationExecutor::RemoteCommandCallbackFn& cb, + const ReplicationExecutor::RemoteCommandRequest& request, + const ResponseStatus& response) { + if (cbData.status.isOK()) { cb(ReplicationExecutor::RemoteCommandCallbackData( - cbData.executor, - cbData.myHandle, - request, - ResponseStatus(cbData.status))); - } - - void ReplicationExecutor::_finishRemoteCommand( - const RemoteCommandRequest& request, - const ResponseStatus& response, - const CallbackHandle& cbHandle, - const uint64_t expectedHandleGeneration, - const RemoteCommandCallbackFn& cb) { - - const WorkQueue::iterator iter = cbHandle._iter; - boost::lock_guard<boost::mutex> lk(_mutex); - if (_inShutdown) { - return; - } - if (expectedHandleGeneration != iter->generation) { - return; - } - iter->callback = stdx::bind(remoteCommandFinished, - stdx::placeholders::_1, - cb, - request, - response); - _readyQueue.splice(_readyQueue.end(), _networkInProgressQueue, iter); - } - - StatusWith<ReplicationExecutor::CallbackHandle> ReplicationExecutor::scheduleRemoteCommand( - const RemoteCommandRequest& request, - const RemoteCommandCallbackFn& cb) { - RemoteCommandRequest scheduledRequest = request; - if (request.timeout == kNoTimeout) { - scheduledRequest.expirationDate = kNoExpirationDate; - } - else { - scheduledRequest.expirationDate = - _networkInterface->now() + scheduledRequest.timeout.total_milliseconds(); - } - boost::lock_guard<boost::mutex> lk(_mutex); - StatusWith<CallbackHandle> handle = enqueueWork_inlock( - &_networkInProgressQueue, - stdx::bind(remoteCommandFailedEarly, - stdx::placeholders::_1, - cb, - scheduledRequest)); - if (handle.isOK()) { - handle.getValue()._iter->isNetworkOperation = true; - _networkInterface->startCommand( - handle.getValue(), - scheduledRequest, - stdx::bind(&ReplicationExecutor::_finishRemoteCommand, - this, - scheduledRequest, - stdx::placeholders::_1, - handle.getValue(), - handle.getValue()._iter->generation, - cb)); - } - return handle; - } - - StatusWith<ReplicationExecutor::CallbackHandle> ReplicationExecutor::scheduleWork( - const CallbackFn& work) { - boost::lock_guard<boost::mutex> lk(_mutex); - _networkInterface->signalWorkAvailable(); - return enqueueWork_inlock(&_readyQueue, work); - } - - StatusWith<ReplicationExecutor::CallbackHandle> ReplicationExecutor::scheduleWorkAt( - Date_t when, - const CallbackFn& work) { - - boost::lock_guard<boost::mutex> lk(_mutex); - WorkQueue temp; - StatusWith<CallbackHandle> cbHandle = enqueueWork_inlock(&temp, work); - if (!cbHandle.isOK()) - return cbHandle; - cbHandle.getValue()._iter->readyDate = when; - WorkQueue::iterator insertBefore = _sleepersQueue.begin(); - while (insertBefore != _sleepersQueue.end() && insertBefore->readyDate <= when) - ++insertBefore; - _sleepersQueue.splice(insertBefore, temp, temp.begin()); + 
cbData.executor, cbData.myHandle, request, response)); + } else { + cb(ReplicationExecutor::RemoteCommandCallbackData( + cbData.executor, cbData.myHandle, request, ResponseStatus(cbData.status))); + } +} + +static void remoteCommandFailedEarly(const ReplicationExecutor::CallbackData& cbData, + const ReplicationExecutor::RemoteCommandCallbackFn& cb, + const ReplicationExecutor::RemoteCommandRequest& request) { + invariant(!cbData.status.isOK()); + cb(ReplicationExecutor::RemoteCommandCallbackData( + cbData.executor, cbData.myHandle, request, ResponseStatus(cbData.status))); +} + +void ReplicationExecutor::_finishRemoteCommand(const RemoteCommandRequest& request, + const ResponseStatus& response, + const CallbackHandle& cbHandle, + const uint64_t expectedHandleGeneration, + const RemoteCommandCallbackFn& cb) { + const WorkQueue::iterator iter = cbHandle._iter; + boost::lock_guard<boost::mutex> lk(_mutex); + if (_inShutdown) { + return; + } + if (expectedHandleGeneration != iter->generation) { + return; + } + iter->callback = + stdx::bind(remoteCommandFinished, stdx::placeholders::_1, cb, request, response); + _readyQueue.splice(_readyQueue.end(), _networkInProgressQueue, iter); +} + +StatusWith<ReplicationExecutor::CallbackHandle> ReplicationExecutor::scheduleRemoteCommand( + const RemoteCommandRequest& request, const RemoteCommandCallbackFn& cb) { + RemoteCommandRequest scheduledRequest = request; + if (request.timeout == kNoTimeout) { + scheduledRequest.expirationDate = kNoExpirationDate; + } else { + scheduledRequest.expirationDate = + _networkInterface->now() + scheduledRequest.timeout.total_milliseconds(); + } + boost::lock_guard<boost::mutex> lk(_mutex); + StatusWith<CallbackHandle> handle = enqueueWork_inlock( + &_networkInProgressQueue, + stdx::bind(remoteCommandFailedEarly, stdx::placeholders::_1, cb, scheduledRequest)); + if (handle.isOK()) { + handle.getValue()._iter->isNetworkOperation = true; + _networkInterface->startCommand(handle.getValue(), + scheduledRequest, + stdx::bind(&ReplicationExecutor::_finishRemoteCommand, + this, + scheduledRequest, + stdx::placeholders::_1, + handle.getValue(), + handle.getValue()._iter->generation, + cb)); + } + return handle; +} + +StatusWith<ReplicationExecutor::CallbackHandle> ReplicationExecutor::scheduleWork( + const CallbackFn& work) { + boost::lock_guard<boost::mutex> lk(_mutex); + _networkInterface->signalWorkAvailable(); + return enqueueWork_inlock(&_readyQueue, work); +} + +StatusWith<ReplicationExecutor::CallbackHandle> ReplicationExecutor::scheduleWorkAt( + Date_t when, const CallbackFn& work) { + boost::lock_guard<boost::mutex> lk(_mutex); + WorkQueue temp; + StatusWith<CallbackHandle> cbHandle = enqueueWork_inlock(&temp, work); + if (!cbHandle.isOK()) return cbHandle; - } - - void ReplicationExecutor::doOperationWithGlobalExclusiveLock( - OperationContext* txn, - const CallbackHandle& cbHandle) { - boost::unique_lock<boost::mutex> lk(_mutex); - if (_inShutdown) - return; - const WorkQueue::iterator iter = cbHandle._iter; - const uint64_t generation = iter->generation; - invariant(generation == cbHandle._generation); - WorkItem work = *iter; - iter->callback = CallbackFn(); - _freeQueue.splice(_freeQueue.begin(), _exclusiveLockInProgressQueue, iter); + cbHandle.getValue()._iter->readyDate = when; + WorkQueue::iterator insertBefore = _sleepersQueue.begin(); + while (insertBefore != _sleepersQueue.end() && insertBefore->readyDate <= when) + ++insertBefore; + _sleepersQueue.splice(insertBefore, temp, temp.begin()); + return cbHandle; 
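
// A minimal sketch, using only the standard library, of the sorted insert that
// scheduleWorkAt() performs just above: walk to the first entry with a later
// ready date, then splice the new node in front of it, so no WorkItem is ever
// copied. Ints stand in for Date_t; the values are illustrative only.
#include <cassert>
#include <list>

int main() {
    std::list<int> sleepersQueue;  // ready dates, kept ascending
    sleepersQueue.push_back(10);
    sleepersQueue.push_back(20);
    sleepersQueue.push_back(40);

    std::list<int> temp;  // freshly enqueued item, as with enqueueWork_inlock(&temp, work)
    temp.push_back(30);
    const int when = 30;

    std::list<int>::iterator insertBefore = sleepersQueue.begin();
    while (insertBefore != sleepersQueue.end() && *insertBefore <= when)
        ++insertBefore;
    sleepersQueue.splice(insertBefore, temp, temp.begin());  // O(1) node move

    const int expected[] = {10, 20, 30, 40};
    int i = 0;
    for (std::list<int>::iterator it = sleepersQueue.begin(); it != sleepersQueue.end(); ++it)
        assert(*it == expected[i++]);
    return 0;
}
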
+} + +void ReplicationExecutor::doOperationWithGlobalExclusiveLock(OperationContext* txn, + const CallbackHandle& cbHandle) { + boost::unique_lock<boost::mutex> lk(_mutex); + if (_inShutdown) + return; + const WorkQueue::iterator iter = cbHandle._iter; + const uint64_t generation = iter->generation; + invariant(generation == cbHandle._generation); + WorkItem work = *iter; + iter->callback = CallbackFn(); + _freeQueue.splice(_freeQueue.begin(), _exclusiveLockInProgressQueue, iter); + lk.unlock(); + { + boost::lock_guard<boost::mutex> terribleLock(_terribleExLockSyncMutex); + work.callback(CallbackData(this, + cbHandle, + (work.isCanceled + ? Status(ErrorCodes::CallbackCanceled, "Callback canceled") + : Status::OK()), + txn)); + } + lk.lock(); + signalEvent_inlock(work.finishedEvent); +} + +StatusWith<ReplicationExecutor::CallbackHandle> +ReplicationExecutor::scheduleWorkWithGlobalExclusiveLock(const CallbackFn& work) { + boost::lock_guard<boost::mutex> lk(_mutex); + StatusWith<CallbackHandle> handle = enqueueWork_inlock(&_exclusiveLockInProgressQueue, work); + if (handle.isOK()) { + const stdx::function<void(OperationContext*)> doOp = + stdx::bind(&ReplicationExecutor::doOperationWithGlobalExclusiveLock, + this, + stdx::placeholders::_1, + handle.getValue()); + _dblockWorkers.schedule(makeNoExcept(stdx::bind( + &NetworkInterface::runCallbackWithGlobalExclusiveLock, _networkInterface.get(), doOp))); + } + return handle; +} + +void ReplicationExecutor::cancel(const CallbackHandle& cbHandle) { + boost::unique_lock<boost::mutex> lk(_mutex); + if (cbHandle._iter->generation != cbHandle._generation) { + return; + } + cbHandle._iter->isCanceled = true; + if (cbHandle._iter->isNetworkOperation) { lk.unlock(); - { - boost::lock_guard<boost::mutex> terribleLock(_terribleExLockSyncMutex); - work.callback(CallbackData(this, - cbHandle, - (work.isCanceled ? 
- Status(ErrorCodes::CallbackCanceled, "Callback canceled") : - Status::OK()), - txn)); - } - lk.lock(); - signalEvent_inlock(work.finishedEvent); - } - - StatusWith<ReplicationExecutor::CallbackHandle> - ReplicationExecutor::scheduleWorkWithGlobalExclusiveLock( - const CallbackFn& work) { - - boost::lock_guard<boost::mutex> lk(_mutex); - StatusWith<CallbackHandle> handle = enqueueWork_inlock(&_exclusiveLockInProgressQueue, - work); - if (handle.isOK()) { - const stdx::function<void (OperationContext*)> doOp = stdx::bind( - &ReplicationExecutor::doOperationWithGlobalExclusiveLock, - this, - stdx::placeholders::_1, - handle.getValue()); - _dblockWorkers.schedule( - makeNoExcept(stdx::bind( - &NetworkInterface::runCallbackWithGlobalExclusiveLock, - _networkInterface.get(), - doOp))); - } - return handle; - } - - void ReplicationExecutor::cancel(const CallbackHandle& cbHandle) { - boost::unique_lock<boost::mutex> lk(_mutex); - if (cbHandle._iter->generation != cbHandle._generation) { - return; - } - cbHandle._iter->isCanceled = true; - if (cbHandle._iter->isNetworkOperation) { - lk.unlock(); - _networkInterface->cancelCommand(cbHandle); - } - } - - void ReplicationExecutor::wait(const CallbackHandle& cbHandle) { - waitForEvent(cbHandle._finishedEvent); - } - - std::pair<ReplicationExecutor::WorkItem, ReplicationExecutor::CallbackHandle> - ReplicationExecutor::getWork() { - boost::unique_lock<boost::mutex> lk(_mutex); - while (true) { - const Date_t now = _networkInterface->now(); - Date_t nextWakeupDate = scheduleReadySleepers_inlock(now); - if (!_readyQueue.empty()) { - break; - } - else if (_inShutdown) { - return std::make_pair(WorkItem(), CallbackHandle()); - } - lk.unlock(); - if (nextWakeupDate == Date_t(~0ULL)) { - _networkInterface->waitForWork(); - } - else { - _networkInterface->waitForWorkUntil(nextWakeupDate); - } - lk.lock(); - } - const CallbackHandle cbHandle(_readyQueue.begin()); - const WorkItem work = *cbHandle._iter; - _readyQueue.begin()->callback = CallbackFn(); - _freeQueue.splice(_freeQueue.begin(), _readyQueue, _readyQueue.begin()); - return std::make_pair(work, cbHandle); - } - - int64_t ReplicationExecutor::nextRandomInt64(int64_t limit) { - return _random.nextInt64(limit); - } - - Date_t ReplicationExecutor::scheduleReadySleepers_inlock(const Date_t now) { - WorkQueue::iterator iter = _sleepersQueue.begin(); - while ((iter != _sleepersQueue.end()) && (iter->readyDate <= now)) { - ++iter; - } - _readyQueue.splice(_readyQueue.end(), _sleepersQueue, _sleepersQueue.begin(), iter); - if (iter == _sleepersQueue.end()) { - // indicate no sleeper to wait for - return Date_t(~0ULL); + _networkInterface->cancelCommand(cbHandle); + } +} + +void ReplicationExecutor::wait(const CallbackHandle& cbHandle) { + waitForEvent(cbHandle._finishedEvent); +} + +std::pair<ReplicationExecutor::WorkItem, ReplicationExecutor::CallbackHandle> +ReplicationExecutor::getWork() { + boost::unique_lock<boost::mutex> lk(_mutex); + while (true) { + const Date_t now = _networkInterface->now(); + Date_t nextWakeupDate = scheduleReadySleepers_inlock(now); + if (!_readyQueue.empty()) { + break; + } else if (_inShutdown) { + return std::make_pair(WorkItem(), CallbackHandle()); } - return iter->readyDate; - } - - StatusWith<ReplicationExecutor::CallbackHandle> ReplicationExecutor::enqueueWork_inlock( - WorkQueue* queue, const CallbackFn& callback) { - - invariant(callback); - StatusWith<EventHandle> event = makeEvent_inlock(); - if (!event.isOK()) - return StatusWith<CallbackHandle>(event.getStatus()); - - 
if (_freeQueue.empty()) - _freeQueue.push_front(WorkItem()); - const WorkQueue::iterator iter = _freeQueue.begin(); - iter->generation++; - iter->callback = callback; - iter->finishedEvent = event.getValue(); - iter->readyDate = Date_t(); - iter->isCanceled = false; - queue->splice(queue->end(), _freeQueue, iter); - return StatusWith<CallbackHandle>(CallbackHandle(iter)); - } - - ReplicationExecutor::EventHandle::EventHandle(const EventList::iterator& iter, uint64_t id) : - _iter(iter), - _generation(iter->generation), - _id(id) { - } - - ReplicationExecutor::CallbackHandle::CallbackHandle(const WorkQueue::iterator& iter) : - _iter(iter), - _generation(iter->generation), - _finishedEvent(iter->finishedEvent) { - } - - ReplicationExecutor::CallbackData::CallbackData(ReplicationExecutor* theExecutor, - const CallbackHandle& theHandle, - const Status& theStatus, - OperationContext* theTxn) : - executor(theExecutor), - myHandle(theHandle), - status(theStatus), - txn(theTxn) { - } - - ReplicationExecutor::RemoteCommandRequest::RemoteCommandRequest() : - timeout(kNoTimeout), - expirationDate(kNoExpirationDate) { - } - - ReplicationExecutor::RemoteCommandRequest::RemoteCommandRequest( - const HostAndPort& theTarget, - const std::string& theDbName, - const BSONObj& theCmdObj, - const Milliseconds timeoutMillis) : - target(theTarget), - dbname(theDbName), - cmdObj(theCmdObj), - timeout(timeoutMillis) { - if (timeoutMillis == kNoTimeout) { - expirationDate = kNoExpirationDate; + lk.unlock(); + if (nextWakeupDate == Date_t(~0ULL)) { + _networkInterface->waitForWork(); + } else { + _networkInterface->waitForWorkUntil(nextWakeupDate); } + lk.lock(); } - - std::string ReplicationExecutor::RemoteCommandRequest::getDiagnosticString() { - str::stream out; - out << "RemoteCommand -- target:" << target.toString() << " db:" << dbname; - - if (expirationDate != kNoExpirationDate) - out << " expDate:" << expirationDate.toString(); - - out << " cmd:" << cmdObj.getOwned().toString(); - return out; - } - - ReplicationExecutor::RemoteCommandCallbackData::RemoteCommandCallbackData( - ReplicationExecutor* theExecutor, - const CallbackHandle& theHandle, - const RemoteCommandRequest& theRequest, - const ResponseStatus& theResponse) : - executor(theExecutor), - myHandle(theHandle), - request(theRequest), - response(theResponse) { - } - - ReplicationExecutor::WorkItem::WorkItem() : generation(0U), - isNetworkOperation(false), - isCanceled(false) {} - - ReplicationExecutor::Event::Event() : - generation(0), - isSignaled(false), - isSignaledCondition(new boost::condition_variable) { - } - - ReplicationExecutor::NetworkInterface::NetworkInterface() {} - ReplicationExecutor::NetworkInterface::~NetworkInterface() {} + const CallbackHandle cbHandle(_readyQueue.begin()); + const WorkItem work = *cbHandle._iter; + _readyQueue.begin()->callback = CallbackFn(); + _freeQueue.splice(_freeQueue.begin(), _readyQueue, _readyQueue.begin()); + return std::make_pair(work, cbHandle); +} + +int64_t ReplicationExecutor::nextRandomInt64(int64_t limit) { + return _random.nextInt64(limit); +} + +Date_t ReplicationExecutor::scheduleReadySleepers_inlock(const Date_t now) { + WorkQueue::iterator iter = _sleepersQueue.begin(); + while ((iter != _sleepersQueue.end()) && (iter->readyDate <= now)) { + ++iter; + } + _readyQueue.splice(_readyQueue.end(), _sleepersQueue, _sleepersQueue.begin(), iter); + if (iter == _sleepersQueue.end()) { + // indicate no sleeper to wait for + return Date_t(~0ULL); + } + return iter->readyDate; +} + 
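
// A standard-library sketch of the free-list recycling that enqueueWork_inlock()
// (next hunk) performs: take a node from the free queue, bump its generation so
// stale handles become detectable, and splice it onto the target queue. The
// WorkItem type here is a simplified stand-in, not MongoDB's.
#include <cassert>
#include <list>

struct WorkItem {
    WorkItem() : generation(0), isCanceled(false) {}
    unsigned long generation;
    bool isCanceled;
};

int main() {
    std::list<WorkItem> freeQueue(1);  // one recyclable node
    std::list<WorkItem> readyQueue;

    std::list<WorkItem>::iterator iter = freeQueue.begin();
    unsigned long staleGeneration = iter->generation;  // what an old handle recorded

    iter->generation++;  // recycle: disambiguate from handles to the old work
    iter->isCanceled = false;
    readyQueue.splice(readyQueue.end(), freeQueue, iter);

    assert(freeQueue.empty());
    assert(iter->generation != staleGeneration);  // old handles are now detectably stale
    return 0;
}
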
+StatusWith<ReplicationExecutor::CallbackHandle> ReplicationExecutor::enqueueWork_inlock( + WorkQueue* queue, const CallbackFn& callback) { + invariant(callback); + StatusWith<EventHandle> event = makeEvent_inlock(); + if (!event.isOK()) + return StatusWith<CallbackHandle>(event.getStatus()); + + if (_freeQueue.empty()) + _freeQueue.push_front(WorkItem()); + const WorkQueue::iterator iter = _freeQueue.begin(); + iter->generation++; + iter->callback = callback; + iter->finishedEvent = event.getValue(); + iter->readyDate = Date_t(); + iter->isCanceled = false; + queue->splice(queue->end(), _freeQueue, iter); + return StatusWith<CallbackHandle>(CallbackHandle(iter)); +} + +ReplicationExecutor::EventHandle::EventHandle(const EventList::iterator& iter, uint64_t id) + : _iter(iter), _generation(iter->generation), _id(id) {} + +ReplicationExecutor::CallbackHandle::CallbackHandle(const WorkQueue::iterator& iter) + : _iter(iter), _generation(iter->generation), _finishedEvent(iter->finishedEvent) {} + +ReplicationExecutor::CallbackData::CallbackData(ReplicationExecutor* theExecutor, + const CallbackHandle& theHandle, + const Status& theStatus, + OperationContext* theTxn) + : executor(theExecutor), myHandle(theHandle), status(theStatus), txn(theTxn) {} + +ReplicationExecutor::RemoteCommandRequest::RemoteCommandRequest() + : timeout(kNoTimeout), expirationDate(kNoExpirationDate) {} + +ReplicationExecutor::RemoteCommandRequest::RemoteCommandRequest(const HostAndPort& theTarget, + const std::string& theDbName, + const BSONObj& theCmdObj, + const Milliseconds timeoutMillis) + : target(theTarget), dbname(theDbName), cmdObj(theCmdObj), timeout(timeoutMillis) { + if (timeoutMillis == kNoTimeout) { + expirationDate = kNoExpirationDate; + } +} + +std::string ReplicationExecutor::RemoteCommandRequest::getDiagnosticString() { + str::stream out; + out << "RemoteCommand -- target:" << target.toString() << " db:" << dbname; + + if (expirationDate != kNoExpirationDate) + out << " expDate:" << expirationDate.toString(); + + out << " cmd:" << cmdObj.getOwned().toString(); + return out; +} + +ReplicationExecutor::RemoteCommandCallbackData::RemoteCommandCallbackData( + ReplicationExecutor* theExecutor, + const CallbackHandle& theHandle, + const RemoteCommandRequest& theRequest, + const ResponseStatus& theResponse) + : executor(theExecutor), myHandle(theHandle), request(theRequest), response(theResponse) {} + +ReplicationExecutor::WorkItem::WorkItem() + : generation(0U), isNetworkOperation(false), isCanceled(false) {} + +ReplicationExecutor::Event::Event() + : generation(0), isSignaled(false), isSignaledCondition(new boost::condition_variable) {} + +ReplicationExecutor::NetworkInterface::NetworkInterface() {} +ReplicationExecutor::NetworkInterface::~NetworkInterface() {} namespace { - void callNoExcept(const stdx::function<void ()>& fn) { - try { - fn(); - } - catch (...) { - std::terminate(); - } +void callNoExcept(const stdx::function<void()>& fn) { + try { + fn(); + } catch (...) 
{ + std::terminate(); } +} - stdx::function<void ()> makeNoExcept(const stdx::function<void ()> &fn) { - return stdx::bind(callNoExcept, fn); - } +stdx::function<void()> makeNoExcept(const stdx::function<void()>& fn) { + return stdx::bind(callNoExcept, fn); +} } // namespace diff --git a/src/mongo/db/repl/replication_executor.h b/src/mongo/db/repl/replication_executor.h index fcac23469b4..1708b1ec87d 100644 --- a/src/mongo/db/repl/replication_executor.h +++ b/src/mongo/db/repl/replication_executor.h @@ -49,576 +49,578 @@ namespace mongo { - class OperationContext; +class OperationContext; namespace repl { +/** + * Event loop for driving state machines in replication. + * + * The event loop has notions of events and callbacks. + * + * Callbacks are function objects representing work to be performed in some sequential order by + * the executor. They may be scheduled by client threads or by other callbacks. Methods that + * schedule callbacks return a CallbackHandle if they are able to enqueue the callback in the + * appropriate work queue. Every CallbackHandle represents an invocation of a function that + * will happen before the executor returns from run(). Calling cancel(CallbackHandle) schedules + * the specified callback to run with a flag indicating that it is "canceled," but it will run. + * Client threads may block waiting for a callback to execute by calling wait(CallbackHandle). + * + * Events are level-triggered and may only be signaled one time. Client threads and callbacks + * may schedule callbacks to be run by the executor after the event is signaled, and client + * threads may ask the executor to block them until after the event is signaled. + * + * If an event is unsignaled when shutdown is called, the executor will ensure that any threads + * blocked in waitForEvent() eventually return. + * + * Logically, Callbacks and Events exist for the life of the executor. That means that while + * the executor is in scope, no CallbackHandle or EventHandle is stale. + * + * Usage: Instantiate an executor, schedule a work item, call run(). + * + * Implementation details: + * + * The executor is composed of several WorkQueues, which are queues of WorkItems. WorkItems + * describe units of work -- a callback and state needed to track its lifecycle. The iterators + * pointing to WorkItems are spliced between the WorkQueues, rather than copying WorkItems + * themselves. Further, those WorkQueue::iterators are never invalidated during the life of an + * executor. They may be recycled to represent new work items, but when that happens, a counter + * on the WorkItem is incremented, to disambiguate. Handles referencing WorkQueue::iterators, + * called CallbackHandles, are thus valid for the life of the executor, simplifying lifecycle + * management. + * + * All work executed by the run() method of the executor is popped off the front of the + * _readyQueue. Remote commands blocked on the network can be found in the + * _networkInProgressQueue. Callbacks waiting for a timer to expire are in the _sleepersQueue. + * When the network returns or the timer expires, items from these two queues are transferred to + * the back of the _readyQueue. + * + * The _exclusiveLockInProgressQueue, which represents work items to execute while holding the + * GlobalWrite lock, is exceptional. 
WorkItems in that queue execute in unspecified order with
+ * respect to work in the _readyQueue or other WorkItems in the _exclusiveLockInProgressQueue,
+ * but they are executed in a single serial order with respect to those other WorkItems. The
+ * _terribleExLockSyncMutex is used to provide this serialization, until such time as the global
+ * lock may be passed from one thread to another.
+ *
+ * Events work similarly to WorkItems, and EventList is akin to WorkQueue.
+ */
+class ReplicationExecutor {
+    MONGO_DISALLOW_COPYING(ReplicationExecutor);
+
+public:
+    typedef boost::posix_time::milliseconds Milliseconds;
+    struct CallbackData;
+    class CallbackHandle;
+    class EventHandle;
+    class NetworkInterface;
+    struct RemoteCommandCallbackData;
+    struct RemoteCommandRequest;
+    struct RemoteCommandResponse;
+    typedef StatusWith<RemoteCommandResponse> ResponseStatus;
+
+    static const Milliseconds kNoTimeout;
+    static const Date_t kNoExpirationDate;
+
+    /**
+     * Type of a regular callback function.
+     *
+     * The status argument passed at invocation will have code ErrorCodes::CallbackCanceled if
+     * the callback was canceled for any reason (including shutdown). Otherwise, it should have
+     * Status::OK().
+     */
+    typedef stdx::function<void(const CallbackData&)> CallbackFn;
+
+    /**
+     * Type of a callback from a request to run a command on a remote MongoDB node.
+     *
+     * The StatusWith<const BSONObj> will have ErrorCodes::CallbackCanceled if the callback was
+     * canceled. Otherwise, its status will represent any failure to execute the command.
+     * If the command executed and a response came back, then the status object will contain
+     * the BSONObj returned by the command, with the "ok" field indicating the success of the
+     * command in the usual way.
+     */
+    typedef stdx::function<void(const RemoteCommandCallbackData&)> RemoteCommandCallbackFn;
+
+    /**
+     * Constructs a new executor.
+     *
+     * Takes ownership of the passed NetworkInterface object.
+     */
+    explicit ReplicationExecutor(NetworkInterface* netInterface, int64_t prngSeed);
+
+    /**
+     * Destroys an executor.
+     */
+    ~ReplicationExecutor();
+
+    /**
+     * Returns diagnostic information.
+     */
+    std::string getDiagnosticString();
+
+    /**
+     * Gets the current time as reported by the network interface.
+     */
+    Date_t now();
+
+    /**
+     * Executes the run loop. May be called up to one time.
+     *
+     * Returns after the executor has been shutdown and is safe to delete.
+     */
+    void run();
+
+    /**
+     * Signals to the executor that it should shut down. The only reliable indication
+     * that shutdown has completed is that the run() method returns.
+     *
+     * May be called by client threads or callbacks running in the executor.
+     */
+    void shutdown();
+
+    /**
+     * Creates a new event. Returns a handle to the event, or ErrorCodes::ShutdownInProgress if
+     * makeEvent() fails because the executor is shutting down.
+     *
+     * May be called by client threads or callbacks running in the executor.
+     */
+    StatusWith<EventHandle> makeEvent();
+
+    /**
+     * Signals the event, making waiting client threads and callbacks runnable.
+     *
+     * May be called up to one time per event.
+     *
+     * May be called by client threads or callbacks running in the executor.
+     */
+    void signalEvent(const EventHandle&);
+
+    /**
+     * Schedules a callback, "work", to run after "event" is signaled. If "event"
+     * has already been signaled, marks "work" as immediately runnable.
+     *
+     * If "event" has yet to be signaled when "shutdown()" is called, "work" will
+     * be scheduled with a status of ErrorCodes::CallbackCanceled.
+     *
+     * May be called by client threads or callbacks running in the executor.
+     */
+    StatusWith<CallbackHandle> onEvent(const EventHandle& event, const CallbackFn& work);
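
// The event machinery declared here pairs a signaled flag with a condition
// variable. A self-contained C++11 sketch of that one-shot event shape (the
// original uses boost primitives); this Event type is a hypothetical stand-in:
#include <cassert>
#include <condition_variable>
#include <mutex>
#include <thread>

struct Event {
    std::mutex mutex;
    std::condition_variable isSignaledCondition;
    bool isSignaled;
    Event() : isSignaled(false) {}

    void signal() {  // may be called up to one time, as with signalEvent()
        std::lock_guard<std::mutex> lk(mutex);
        isSignaled = true;
        isSignaledCondition.notify_all();
    }
    void wait() {  // blocks until signaled, as with waitForEvent()
        std::unique_lock<std::mutex> lk(mutex);
        while (!isSignaled)
            isSignaledCondition.wait(lk);
    }
};

int main() {
    Event ev;
    std::thread waiter([&ev] { ev.wait(); });
    ev.signal();
    waiter.join();
    assert(ev.isSignaled);
    return 0;
}
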
+ * + * If "event" has yet to be signaled when "shutdown()" is called, "work" will + * be scheduled with a status of ErrorCodes::CallbackCanceled. + * + * May be called by client threads or callbacks running in the executor. + */ + StatusWith<CallbackHandle> onEvent(const EventHandle& event, const CallbackFn& work); + + /** + * Blocks the calling thread until after "event" is signaled. Also returns + * if the event is never signaled but shutdown() is called on the executor. + * + * NOTE: Do not call from a callback running in the executor. + * + * TODO(schwerin): Change return type so that the caller can know which of the two reasons + * led to this method returning. + */ + void waitForEvent(const EventHandle& event); + + /** + * Schedules "work" to be run by the executor ASAP. + * + * Returns a handle for waiting on or canceling the callback, or + * ErrorCodes::ShutdownInProgress. + * + * May be called by client threads or callbacks running in the executor. + */ + StatusWith<CallbackHandle> scheduleWork(const CallbackFn& work); + + /** + * Schedules "work" to be run by the executor no sooner than "when". + * + * Returns a handle for waiting on or canceling the callback, or + * ErrorCodes::ShutdownInProgress. + * + * May be called by client threads or callbacks running in the executor. + */ + StatusWith<CallbackHandle> scheduleWorkAt(Date_t when, const CallbackFn& work); + + /** + * Schedules "work" to be run by the executor while holding the global exclusive lock. + * + * The "work" will run exclusively, as though it were executed by the main + * run loop, but there are no ordering guarantees provided with respect to + * any other work item. + * + * Returns a handle for waiting on or canceling the callback, or + * ErrorCodes::ShutdownInProgress. + * + * May be called by client threads or callbacks running in the executor. + */ + StatusWith<CallbackHandle> scheduleWorkWithGlobalExclusiveLock(const CallbackFn& work); + + /** + * Schedules "cb" to be run by the executor with the result of executing the remote command + * described by "request". + * + * Returns a handle for waiting on or canceling the callback, or + * ErrorCodes::ShutdownInProgress. + * + * May be called by client threads or callbacks running in the executor. + */ + StatusWith<CallbackHandle> scheduleRemoteCommand(const RemoteCommandRequest& request, + const RemoteCommandCallbackFn& cb); + + /** + * If the callback referenced by "cbHandle" hasn't already executed, marks it as + * canceled and runnable. + * + * May be called by client threads or callbacks running in the executor. + */ + void cancel(const CallbackHandle& cbHandle); + + /** + * Blocks until the executor finishes running the callback referenced by "cbHandle". + * + * Becaue callbacks all run during shutdown if they weren't run beforehand, there is no need + * to indicate the reason for returning from wait(CallbackHandle). It is always that the + * callback ran. + * + * NOTE: Do not call from a callback running in the executor. + */ + void wait(const CallbackHandle& cbHandle); + + /** + * Returns an int64_t generated by the prng with a max value of "limit". + */ + int64_t nextRandomInt64(int64_t limit); + +private: + struct Event; + struct WorkItem; + + /** + * A linked list of WorkItem objects. + * + * WorkItems get moved among lists by splicing iterators of work lists together, + * not by copying underlying WorkItem objects. + */ + typedef stdx::list<WorkItem> WorkQueue; + + /** + * A linked list of Event objects, like WorkQueue, above. 
+ */ + typedef stdx::list<Event> EventList; + + /** + * Returns diagnostic info + */ + std::string _getDiagnosticString_inlock() const; + /** + * Implementation of makeEvent() for use when _mutex is already held. + */ + StatusWith<EventHandle> makeEvent_inlock(); + + /** + * Gets a single piece of work to execute. + * + * If the "callback" member of the returned WorkItem is falsey, that is a signal + * to the run loop to wait for shutdown. + */ + std::pair<WorkItem, CallbackHandle> getWork(); + + /** + * Marks as runnable any sleepers whose ready date has passed as of "now". + * Returns the date when the next sleeper will be ready, or Date_t(~0ULL) if there are no + * remaining sleepers. + */ + Date_t scheduleReadySleepers_inlock(Date_t now); + + /** + * Enqueues "callback" into "queue". + * + * Assumes that "queue" is sorted by readyDate, and performs insertion sort, starting + * at the back of the "queue" working toward the front. + * + * Use Date_t(0) for readyDate to mean "ready now". + */ + StatusWith<CallbackHandle> enqueueWork_inlock(WorkQueue* queue, const CallbackFn& callback); + + /** + * Implementation of signalEvent() that assumes the caller owns _mutex. + */ + void signalEvent_inlock(const EventHandle& event); + + /** + * Notifies interested parties that shutdown has completed, if it has. + */ + void maybeNotifyShutdownComplete_inlock(); + + /** + * Completes the shutdown process. Called by run(). + */ + void finishShutdown(); + + void _finishRemoteCommand(const RemoteCommandRequest& request, + const StatusWith<RemoteCommandResponse>& response, + const CallbackHandle& cbHandle, + const uint64_t expectedHandleGeneration, + const RemoteCommandCallbackFn& cb); + + /** + * Executes the callback referenced by "cbHandle", and moves the underlying + * WorkQueue::iterator into the _freeQueue. "txn" is a pointer to the OperationContext + * owning the global exclusive lock. + * + * Serializes execution of "cbHandle" with the execution of other callbacks. + */ + void doOperationWithGlobalExclusiveLock(OperationContext* txn, const CallbackHandle& cbHandle); + + // PRNG; seeded at class construction time. + PseudoRandom _random; + + boost::scoped_ptr<NetworkInterface> _networkInterface; + boost::mutex _mutex; + boost::mutex _terribleExLockSyncMutex; + boost::condition_variable _noMoreWaitingThreads; + WorkQueue _freeQueue; + WorkQueue _readyQueue; + WorkQueue _exclusiveLockInProgressQueue; + WorkQueue _networkInProgressQueue; + WorkQueue _sleepersQueue; + EventList _unsignaledEvents; + EventList _signaledEvents; + int64_t _totalEventWaiters; + bool _inShutdown; + threadpool::ThreadPool _dblockWorkers; + uint64_t _nextId; +}; + +/** + * Reference to an event object in the executor. + */ +class ReplicationExecutor::EventHandle { + friend class ReplicationExecutor; + +public: + EventHandle() : _generation(0), _id(0) {} + + /** + * Returns true if the handle is valid, meaning that it identifies + */ + bool isValid() const { + return _id != 0; + } + + bool operator==(const EventHandle& other) const { + return (_id == other._id); + } + + bool operator!=(const EventHandle& other) const { + return !(*this == other); + } + +private: + EventHandle(const EventList::iterator& iter, const uint64_t id); + + EventList::iterator _iter; + uint64_t _generation; + uint64_t _id; +}; + +/** + * Reference to a scheduled callback. 
+ */ +class ReplicationExecutor::CallbackHandle { + friend class ReplicationExecutor; + +public: + CallbackHandle() : _generation(0) {} + + bool isValid() const { + return _finishedEvent.isValid(); + } + + bool operator==(const CallbackHandle& other) const { + return (_finishedEvent == other._finishedEvent); + } + + bool operator!=(const CallbackHandle& other) const { + return !(*this == other); + } + +private: + explicit CallbackHandle(const WorkQueue::iterator& iter); + + WorkQueue::iterator _iter; + uint64_t _generation; + EventHandle _finishedEvent; +}; + +struct ReplicationExecutor::CallbackData { + CallbackData(ReplicationExecutor* theExecutor, + const CallbackHandle& theHandle, + const Status& theStatus, + OperationContext* txn = NULL); + + ReplicationExecutor* executor; + CallbackHandle myHandle; + Status status; + OperationContext* txn; +}; + +/** + * Type of object describing a command to execute against a remote MongoDB node. + */ +struct ReplicationExecutor::RemoteCommandRequest { + RemoteCommandRequest(); + RemoteCommandRequest(const HostAndPort& theTarget, + const std::string& theDbName, + const BSONObj& theCmdObj, + const Milliseconds timeoutMillis = kNoTimeout); + + // Returns diagnostic info. + std::string getDiagnosticString(); + + HostAndPort target; + std::string dbname; + BSONObj cmdObj; + Milliseconds timeout; + Date_t expirationDate; // Set by scheduleRemoteCommand. +}; + +struct ReplicationExecutor::RemoteCommandResponse { + RemoteCommandResponse() : data(), elapsedMillis(Milliseconds(0)) {} + RemoteCommandResponse(BSONObj obj, Milliseconds millis) : data(obj), elapsedMillis(millis) {} + + BSONObj data; + Milliseconds elapsedMillis; +}; + +/** + * Interface to networking and lock manager. + */ +class ReplicationExecutor::NetworkInterface { + MONGO_DISALLOW_COPYING(NetworkInterface); + +public: + typedef RemoteCommandResponse Response; + typedef stdx::function<void(const ResponseStatus&)> RemoteCommandCompletionFn; + + virtual ~NetworkInterface(); + + /** + * Returns diagnostic info. + */ + virtual std::string getDiagnosticString() = 0; + + /** + * Starts up the network interface. + * + * It is valid to call all methods except shutdown() before this method completes. That is, + * implementations may not assume that startup() completes before startCommand() first + * executes. + * + * Called by the owning ReplicationExecutor inside its run() method. + */ + virtual void startup() = 0; + + /** + * Shuts down the network interface. Must be called before this instance gets deleted, + * if startup() is called. + * + * Called by the owning ReplicationExecutor inside its run() method. + */ + virtual void shutdown() = 0; + + /** + * Blocks the current thread (presumably the executor thread) until the network interface + * knows of work for the executor to perform. + */ + virtual void waitForWork() = 0; + + /** + * Similar to waitForWork, but only blocks until "when". + */ + virtual void waitForWorkUntil(Date_t when) = 0; + /** - * Event loop for driving state machines in replication. - * - * The event loop has notions of events and callbacks. - * - * Callbacks are function objects representing work to be performed in some sequential order by - * the executor. They may be scheduled by client threads or by other callbacks. Methods that - * schedule callbacks return a CallbackHandle if they are able to enqueue the callback in the - * appropriate work queue. Every CallbackHandle represents an invocation of a function that - * will happen before the executor returns from run(). 
Calling cancel(CallbackHandle) schedules - * the specified callback to run with a flag indicating that it is "canceled," but it will run. - * Client threads may block waiting for a callback to execute by calling wait(CallbackHandle). - * - * Events are level-triggered and may only be signaled one time. Client threads and callbacks - * may schedule callbacks to be run by the executor after the event is signaled, and client - * threads may ask the executor to block them until after the event is signaled. - * - * If an event is unsignaled when shutdown is called, the executor will ensure that any threads - * blocked in waitForEvent() eventually return. - * - * Logically, Callbacks and Events exist for the life of the executor. That means that while - * the executor is in scope, no CallbackHandle or EventHandle is stale. - * - * Usage: Instantiate an executor, schedule a work item, call run(). - * - * Implementation details: - * - * The executor is composed of several WorkQueues, which are queues of WorkItems. WorkItems - * describe units of work -- a callback and state needed to track its lifecycle. The iterators - * pointing to WorkItems are spliced between the WorkQueues, rather than copying WorkItems - * themselves. Further, those WorkQueue::iterators are never invalidated during the life of an - * executor. They may be recycled to represent new work items, but when that happens, a counter - * on the WorkItem is incremented, to disambiguate. Handles referencing WorkQueue::iterators, - * called CallbackHandles, are thus valid for the life of the executor, simplifying lifecycle - * management. - * - * All work executed by the run() method of the executor is popped off the front of the - * _readyQueue. Remote commands blocked on the network can be found in the - * _networkInProgressQueue. Callbacks waiting for a timer to expire are in the _sleepersQueue. - * When the network returns or the timer expires, items from these two queues are transferred to - * the back of the _readyQueue. - * - * The _exclusiveLockInProgressQueue, which represents work items to execute while holding the - * GlobalWrite lock, is exceptional. WorkItems in that queue execute in unspecified order with - * respect to work in the _readyQueue or other WorkItems in the _exclusiveLockInProgressQueue, - * but they are executed in a single serial order with respect to those other WorkItems. The - * _terribleExLockSyncMutex is used to provide this serialization, until such time as the global - * lock may be passed from one thread to another. - * - * Events work similiarly to WorkItems, and EventList is akin to WorkQueue. - */ - class ReplicationExecutor { - MONGO_DISALLOW_COPYING(ReplicationExecutor); - public: - typedef boost::posix_time::milliseconds Milliseconds; - struct CallbackData; - class CallbackHandle; - class EventHandle; - class NetworkInterface; - struct RemoteCommandCallbackData; - struct RemoteCommandRequest; - struct RemoteCommandResponse; - typedef StatusWith<RemoteCommandResponse> ResponseStatus; - - static const Milliseconds kNoTimeout; - static const Date_t kNoExpirationDate; - - /** - * Type of a regular callback function. - * - * The status argument passed at invocation will have code ErrorCodes::CallbackCanceled if - * the callback was canceled for any reason (including shutdown). Otherwise, it should have - * Status::OK(). - */ - typedef stdx::function<void (const CallbackData&)> CallbackFn; - - /** - * Type of a callback from a request to run a command on a remote MongoDB node. 
- * - * The StatusWith<const BSONObj> will have ErrorCodes::CallbackCanceled if the callback was - * canceled. Otherwise, its status will represent any failure to execute the command. - * If the command executed and a response came back, then the status object will contain - * the BSONObj returned by the command, with the "ok" field indicating the success of the - * command in the usual way. - */ - typedef stdx::function<void (const RemoteCommandCallbackData&)> RemoteCommandCallbackFn; - - /** - * Constructs a new executor. - * - * Takes ownership of the passed NetworkInterface object. - */ - explicit ReplicationExecutor(NetworkInterface* netInterface, int64_t pnrgSeed); - - /** - * Destroys an executor. - */ - ~ReplicationExecutor(); - - /** - * Returns diagnostic information. - */ - std::string getDiagnosticString(); - - /** - * Gets the current time as reported by the network interface. - */ - Date_t now(); - - /** - * Executes the run loop. May be called up to one time. - * - * Returns after the executor has been shutdown and is safe to delete. - */ - void run(); - - /** - * Signals to the executor that it should shut down. The only reliable indication - * that shutdown has completed is that the run() method returns. - * - * May be called by client threads or callbacks running in the executor. - */ - void shutdown(); - - /** - * Creates a new event. Returns a handle to the event, or ErrorCodes::ShutdownInProgress if - * makeEvent() fails because the executor is shutting down. - * - * May be called by client threads or callbacks running in the executor. - */ - StatusWith<EventHandle> makeEvent(); - - /** - * Signals the event, making waiting client threads and callbacks runnable. - * - * May be called up to one time per event. - * - * May be called by client threads or callbacks running in the executor. - */ - void signalEvent(const EventHandle&); - - /** - * Schedules a callback, "work", to run after "event" is signaled. If "event" - * has already been signaled, marks "work" as immediately runnable. - * - * If "event" has yet to be signaled when "shutdown()" is called, "work" will - * be scheduled with a status of ErrorCodes::CallbackCanceled. - * - * May be called by client threads or callbacks running in the executor. - */ - StatusWith<CallbackHandle> onEvent(const EventHandle& event, const CallbackFn& work); - - /** - * Blocks the calling thread until after "event" is signaled. Also returns - * if the event is never signaled but shutdown() is called on the executor. - * - * NOTE: Do not call from a callback running in the executor. - * - * TODO(schwerin): Change return type so that the caller can know which of the two reasons - * led to this method returning. - */ - void waitForEvent(const EventHandle& event); - - /** - * Schedules "work" to be run by the executor ASAP. - * - * Returns a handle for waiting on or canceling the callback, or - * ErrorCodes::ShutdownInProgress. - * - * May be called by client threads or callbacks running in the executor. - */ - StatusWith<CallbackHandle> scheduleWork(const CallbackFn& work); - - /** - * Schedules "work" to be run by the executor no sooner than "when". - * - * Returns a handle for waiting on or canceling the callback, or - * ErrorCodes::ShutdownInProgress. - * - * May be called by client threads or callbacks running in the executor. - */ - StatusWith<CallbackHandle> scheduleWorkAt(Date_t when, const CallbackFn& work); - - /** - * Schedules "work" to be run by the executor while holding the global exclusive lock. 
- * - * The "work" will run exclusively, as though it were executed by the main - * run loop, but there are no ordering guarantees provided with respect to - * any other work item. - * - * Returns a handle for waiting on or canceling the callback, or - * ErrorCodes::ShutdownInProgress. - * - * May be called by client threads or callbacks running in the executor. - */ - StatusWith<CallbackHandle> scheduleWorkWithGlobalExclusiveLock( - const CallbackFn& work); - - /** - * Schedules "cb" to be run by the executor with the result of executing the remote command - * described by "request". - * - * Returns a handle for waiting on or canceling the callback, or - * ErrorCodes::ShutdownInProgress. - * - * May be called by client threads or callbacks running in the executor. - */ - StatusWith<CallbackHandle> scheduleRemoteCommand( - const RemoteCommandRequest& request, - const RemoteCommandCallbackFn& cb); - - /** - * If the callback referenced by "cbHandle" hasn't already executed, marks it as - * canceled and runnable. - * - * May be called by client threads or callbacks running in the executor. - */ - void cancel(const CallbackHandle& cbHandle); - - /** - * Blocks until the executor finishes running the callback referenced by "cbHandle". - * - * Becaue callbacks all run during shutdown if they weren't run beforehand, there is no need - * to indicate the reason for returning from wait(CallbackHandle). It is always that the - * callback ran. - * - * NOTE: Do not call from a callback running in the executor. - */ - void wait(const CallbackHandle& cbHandle); - - /** - * Returns an int64_t generated by the prng with a max value of "limit". - */ - int64_t nextRandomInt64(int64_t limit); - - private: - struct Event; - struct WorkItem; - - /** - * A linked list of WorkItem objects. - * - * WorkItems get moved among lists by splicing iterators of work lists together, - * not by copying underlying WorkItem objects. - */ - typedef stdx::list<WorkItem> WorkQueue; - - /** - * A linked list of Event objects, like WorkQueue, above. - */ - typedef stdx::list<Event> EventList; - - /** - * Returns diagnostic info - */ - std::string _getDiagnosticString_inlock() const; - /** - * Implementation of makeEvent() for use when _mutex is already held. - */ - StatusWith<EventHandle> makeEvent_inlock(); - - /** - * Gets a single piece of work to execute. - * - * If the "callback" member of the returned WorkItem is falsey, that is a signal - * to the run loop to wait for shutdown. - */ - std::pair<WorkItem, CallbackHandle> getWork(); - - /** - * Marks as runnable any sleepers whose ready date has passed as of "now". - * Returns the date when the next sleeper will be ready, or Date_t(~0ULL) if there are no - * remaining sleepers. - */ - Date_t scheduleReadySleepers_inlock(Date_t now); - - /** - * Enqueues "callback" into "queue". - * - * Assumes that "queue" is sorted by readyDate, and performs insertion sort, starting - * at the back of the "queue" working toward the front. - * - * Use Date_t(0) for readyDate to mean "ready now". - */ - StatusWith<CallbackHandle> enqueueWork_inlock(WorkQueue* queue, const CallbackFn& callback); - - /** - * Implementation of signalEvent() that assumes the caller owns _mutex. - */ - void signalEvent_inlock(const EventHandle& event); - - /** - * Notifies interested parties that shutdown has completed, if it has. - */ - void maybeNotifyShutdownComplete_inlock(); - - /** - * Completes the shutdown process. Called by run(). 
- */ - void finishShutdown(); - - void _finishRemoteCommand( - const RemoteCommandRequest& request, - const StatusWith<RemoteCommandResponse>& response, - const CallbackHandle& cbHandle, - const uint64_t expectedHandleGeneration, - const RemoteCommandCallbackFn& cb); - - /** - * Executes the callback referenced by "cbHandle", and moves the underlying - * WorkQueue::iterator into the _freeQueue. "txn" is a pointer to the OperationContext - * owning the global exclusive lock. - * - * Serializes execution of "cbHandle" with the execution of other callbacks. - */ - void doOperationWithGlobalExclusiveLock(OperationContext* txn, - const CallbackHandle& cbHandle); - - // PRNG; seeded at class construction time. - PseudoRandom _random; - - boost::scoped_ptr<NetworkInterface> _networkInterface; - boost::mutex _mutex; - boost::mutex _terribleExLockSyncMutex; - boost::condition_variable _noMoreWaitingThreads; - WorkQueue _freeQueue; - WorkQueue _readyQueue; - WorkQueue _exclusiveLockInProgressQueue; - WorkQueue _networkInProgressQueue; - WorkQueue _sleepersQueue; - EventList _unsignaledEvents; - EventList _signaledEvents; - int64_t _totalEventWaiters; - bool _inShutdown; - threadpool::ThreadPool _dblockWorkers; - uint64_t _nextId; - }; - - /** - * Reference to an event object in the executor. - */ - class ReplicationExecutor::EventHandle { - friend class ReplicationExecutor; - public: - EventHandle() : _generation(0), _id(0) {} - - /** - * Returns true if the handle is valid, meaning that it identifies - */ - bool isValid() const { return _id != 0; } - - bool operator==(const EventHandle &other) const { - return (_id == other._id); - } - - bool operator!=(const EventHandle &other) const { - return !(*this == other); - } - - private: - EventHandle(const EventList::iterator& iter, const uint64_t id); - - EventList::iterator _iter; - uint64_t _generation; - uint64_t _id; - }; - - /** - * Reference to a scheduled callback. - */ - class ReplicationExecutor::CallbackHandle { - friend class ReplicationExecutor; - public: - CallbackHandle() : _generation(0) {} - - bool isValid() const { return _finishedEvent.isValid(); } - - bool operator==(const CallbackHandle &other) const { - return (_finishedEvent == other._finishedEvent); - } - - bool operator!=(const CallbackHandle &other) const { - return !(*this == other); - } - - private: - explicit CallbackHandle(const WorkQueue::iterator& iter); - - WorkQueue::iterator _iter; - uint64_t _generation; - EventHandle _finishedEvent; - }; - - struct ReplicationExecutor::CallbackData { - CallbackData(ReplicationExecutor* theExecutor, - const CallbackHandle& theHandle, - const Status& theStatus, - OperationContext* txn = NULL); - - ReplicationExecutor* executor; - CallbackHandle myHandle; - Status status; - OperationContext* txn; - }; - - /** - * Type of object describing a command to execute against a remote MongoDB node. - */ - struct ReplicationExecutor::RemoteCommandRequest { - RemoteCommandRequest(); - RemoteCommandRequest(const HostAndPort& theTarget, - const std::string& theDbName, - const BSONObj& theCmdObj, - const Milliseconds timeoutMillis = kNoTimeout); - - // Returns diagnostic info. - std::string getDiagnosticString(); - - HostAndPort target; - std::string dbname; - BSONObj cmdObj; - Milliseconds timeout; - Date_t expirationDate; // Set by scheduleRemoteCommand. 
- }; - - struct ReplicationExecutor::RemoteCommandResponse { - RemoteCommandResponse() : data(), elapsedMillis(Milliseconds(0)) {} - RemoteCommandResponse(BSONObj obj, Milliseconds millis) - : data(obj), - elapsedMillis(millis) {} - - BSONObj data; - Milliseconds elapsedMillis; - }; - - /** - * Interface to networking and lock manager. - */ - class ReplicationExecutor::NetworkInterface { - MONGO_DISALLOW_COPYING(NetworkInterface); - public: - typedef RemoteCommandResponse Response; - typedef stdx::function<void (const ResponseStatus&)> RemoteCommandCompletionFn; - - virtual ~NetworkInterface(); - - /** - * Returns diagnostic info. - */ - virtual std::string getDiagnosticString() = 0; - - /** - * Starts up the network interface. - * - * It is valid to call all methods except shutdown() before this method completes. That is, - * implementations may not assume that startup() completes before startCommand() first - * executes. - * - * Called by the owning ReplicationExecutor inside its run() method. - */ - virtual void startup() = 0; - - /** - * Shuts down the network interface. Must be called before this instance gets deleted, - * if startup() is called. - * - * Called by the owning ReplicationExecutor inside its run() method. - */ - virtual void shutdown() = 0; - - /** - * Blocks the current thread (presumably the executor thread) until the network interface - * knows of work for the executor to perform. - */ - virtual void waitForWork() = 0; - - /** - * Similar to waitForWork, but only blocks until "when". - */ - virtual void waitForWorkUntil(Date_t when) = 0; - - /** - * Signals to the network interface that there is new work (such as a signaled event) for - * the executor to process. Wakes the executor from waitForWork() and friends. - */ - virtual void signalWorkAvailable() = 0; - - /** - * Returns the current time. - */ - virtual Date_t now() = 0; - - /** - * Starts asynchronous execution of the command described by "request". - */ - virtual void startCommand(const CallbackHandle& cbHandle, - const RemoteCommandRequest& request, - const RemoteCommandCompletionFn& onFinish) = 0; - - /** - * Requests cancelation of the network activity associated with "cbHandle" if it has not yet - * completed. - */ - virtual void cancelCommand(const CallbackHandle& cbHandle) = 0; - - /** - * Runs the given callback while holding the global exclusive lock. - */ - virtual void runCallbackWithGlobalExclusiveLock( - const stdx::function<void (OperationContext*)>& callback) = 0; - - protected: - NetworkInterface(); - }; - - typedef ReplicationExecutor::ResponseStatus ResponseStatus; - - // Must be after NetworkInterface class - struct ReplicationExecutor::RemoteCommandCallbackData { - RemoteCommandCallbackData(ReplicationExecutor* theExecutor, - const CallbackHandle& theHandle, - const RemoteCommandRequest& theRequest, - const StatusWith<RemoteCommandResponse>& theResponse); - - ReplicationExecutor* executor; - CallbackHandle myHandle; - RemoteCommandRequest request; - StatusWith<RemoteCommandResponse> response; - }; - - /** - * Description of a scheduled but not-yet-run work item. - * - * Once created, WorkItem objects remain in scope until the executor is destroyed. - * However, over their lifetime, they may represent many different work items. This - * divorces the lifetime of CallbackHandles from the lifetime of WorkItem objects, but - * requires a unique generation identifier in CallbackHandles and WorkItem objects. - * - * WorkItem is copyable so that it may be stored in a list. 
However, in practice they - * should only be copied by getWork() and when allocating new entries into a WorkQueue (not - * when moving entries between work lists). - */ - struct ReplicationExecutor::WorkItem { - WorkItem(); - uint64_t generation; - CallbackFn callback; - EventHandle finishedEvent; - Date_t readyDate; - bool isNetworkOperation; - bool isCanceled; - }; - - /** - * Description of an unsignaled event. - * - * Like WorkItem, above, but for events. On signaling, the executor bumps the - * generation, marks all waiters as runnable, and moves the event from the "unsignaled" - * EventList to the "signaled" EventList, the latter being a free list of events. - */ - struct ReplicationExecutor::Event { - Event(); - uint64_t generation; - bool isSignaled; - WorkQueue waiters; - boost::shared_ptr<boost::condition_variable> isSignaledCondition; - }; + * Signals to the network interface that there is new work (such as a signaled event) for + * the executor to process. Wakes the executor from waitForWork() and friends. + */ + virtual void signalWorkAvailable() = 0; + + /** + * Returns the current time. + */ + virtual Date_t now() = 0; + + /** + * Starts asynchronous execution of the command described by "request". + */ + virtual void startCommand(const CallbackHandle& cbHandle, + const RemoteCommandRequest& request, + const RemoteCommandCompletionFn& onFinish) = 0; + + /** + * Requests cancelation of the network activity associated with "cbHandle" if it has not yet + * completed. + */ + virtual void cancelCommand(const CallbackHandle& cbHandle) = 0; + + /** + * Runs the given callback while holding the global exclusive lock. + */ + virtual void runCallbackWithGlobalExclusiveLock( + const stdx::function<void(OperationContext*)>& callback) = 0; + +protected: + NetworkInterface(); +}; + +typedef ReplicationExecutor::ResponseStatus ResponseStatus; + +// Must be after NetworkInterface class +struct ReplicationExecutor::RemoteCommandCallbackData { + RemoteCommandCallbackData(ReplicationExecutor* theExecutor, + const CallbackHandle& theHandle, + const RemoteCommandRequest& theRequest, + const StatusWith<RemoteCommandResponse>& theResponse); + + ReplicationExecutor* executor; + CallbackHandle myHandle; + RemoteCommandRequest request; + StatusWith<RemoteCommandResponse> response; +}; + +/** + * Description of a scheduled but not-yet-run work item. + * + * Once created, WorkItem objects remain in scope until the executor is destroyed. + * However, over their lifetime, they may represent many different work items. This + * divorces the lifetime of CallbackHandles from the lifetime of WorkItem objects, but + * requires a unique generation identifier in CallbackHandles and WorkItem objects. + * + * WorkItem is copyable so that it may be stored in a list. However, in practice they + * should only be copied by getWork() and when allocating new entries into a WorkQueue (not + * when moving entries between work lists). + */ +struct ReplicationExecutor::WorkItem { + WorkItem(); + uint64_t generation; + CallbackFn callback; + EventHandle finishedEvent; + Date_t readyDate; + bool isNetworkOperation; + bool isCanceled; +}; + +/** + * Description of an unsignaled event. + * + * Like WorkItem, above, but for events. On signaling, the executor bumps the + * generation, marks all waiters as runnable, and moves the event from the "unsignaled" + * EventList to the "signaled" EventList, the latter being a free list of events. 
+ */ +struct ReplicationExecutor::Event { + Event(); + uint64_t generation; + bool isSignaled; + WorkQueue waiters; + boost::shared_ptr<boost::condition_variable> isSignaledCondition; +}; } // namespace repl } // namespace mongo diff --git a/src/mongo/db/repl/replication_executor_test.cpp b/src/mongo/db/repl/replication_executor_test.cpp index 24f1109ac33..a6305056a7e 100644 --- a/src/mongo/db/repl/replication_executor_test.cpp +++ b/src/mongo/db/repl/replication_executor_test.cpp @@ -44,477 +44,434 @@ namespace repl { namespace { - bool operator==(const ReplicationExecutor::RemoteCommandRequest lhs, - const ReplicationExecutor::RemoteCommandRequest rhs) { - return lhs.target == rhs.target && - lhs.dbname == rhs.dbname && - lhs.cmdObj == rhs.cmdObj; +bool operator==(const ReplicationExecutor::RemoteCommandRequest lhs, + const ReplicationExecutor::RemoteCommandRequest rhs) { + return lhs.target == rhs.target && lhs.dbname == rhs.dbname && lhs.cmdObj == rhs.cmdObj; +} + +bool operator!=(const ReplicationExecutor::RemoteCommandRequest lhs, + const ReplicationExecutor::RemoteCommandRequest rhs) { + return !(lhs == rhs); +} + +void setStatus(const ReplicationExecutor::CallbackData& cbData, Status* target) { + *target = cbData.status; +} + +void setStatusAndShutdown(const ReplicationExecutor::CallbackData& cbData, Status* target) { + setStatus(cbData, target); + if (cbData.status != ErrorCodes::CallbackCanceled) + cbData.executor->shutdown(); +} + +void setStatusAndTriggerEvent(const ReplicationExecutor::CallbackData& cbData, + Status* outStatus, + ReplicationExecutor::EventHandle event) { + *outStatus = cbData.status; + if (!cbData.status.isOK()) + return; + cbData.executor->signalEvent(event); +} + +void scheduleSetStatusAndShutdown(const ReplicationExecutor::CallbackData& cbData, + Status* outStatus1, + Status* outStatus2) { + if (!cbData.status.isOK()) { + *outStatus1 = cbData.status; + return; } + *outStatus1 = cbData.executor->scheduleWork(stdx::bind(setStatusAndShutdown, + stdx::placeholders::_1, + outStatus2)).getStatus(); +} - bool operator!=(const ReplicationExecutor::RemoteCommandRequest lhs, - const ReplicationExecutor::RemoteCommandRequest rhs) { - return !(lhs == rhs); - } - - void setStatus(const ReplicationExecutor::CallbackData& cbData, Status* target) { - *target = cbData.status; - } - - void setStatusAndShutdown(const ReplicationExecutor::CallbackData& cbData, - Status* target) { - setStatus(cbData, target); - if (cbData.status != ErrorCodes::CallbackCanceled) - cbData.executor->shutdown(); - } - - void setStatusAndTriggerEvent(const ReplicationExecutor::CallbackData& cbData, - Status* outStatus, - ReplicationExecutor::EventHandle event) { - *outStatus = cbData.status; - if (!cbData.status.isOK()) - return; - cbData.executor->signalEvent(event); - } +const int64_t prngSeed = 1; - void scheduleSetStatusAndShutdown(const ReplicationExecutor::CallbackData& cbData, - Status* outStatus1, - Status* outStatus2) { - if (!cbData.status.isOK()) { - *outStatus1 = cbData.status; - return; - } - *outStatus1= cbData.executor->scheduleWork(stdx::bind(setStatusAndShutdown, - stdx::placeholders::_1, - outStatus2)).getStatus(); +class ReplicationExecutorTest : public unittest::Test { +protected: + NetworkInterfaceMock* getNet() { + return _net; } - - const int64_t prngSeed = 1; - - class ReplicationExecutorTest : public unittest::Test { - protected: - NetworkInterfaceMock* getNet() { return _net; } - ReplicationExecutor& getExecutor() { return *_executor; } - - void launchExecutorThread(); - 
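        // These helpers, defined below, start ReplicationExecutor::run() on a
        // background boost::thread and enter the mock network; on join, the
        // mock network is exited before the thread is joined.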
void joinExecutorThread(); - - virtual void setUp(); - virtual void tearDown(); - - private: - NetworkInterfaceMock* _net; - boost::scoped_ptr<ReplicationExecutor> _executor; - boost::scoped_ptr<boost::thread> _executorThread; - }; - - void ReplicationExecutorTest::launchExecutorThread() { - ASSERT(!_executorThread); - _executorThread.reset( - new boost::thread(stdx::bind(&ReplicationExecutor::run, _executor.get()))); - _net->enterNetwork(); - } - - void ReplicationExecutorTest::joinExecutorThread() { - ASSERT(_executorThread); - _net->exitNetwork(); - _executorThread->join(); - _executorThread.reset(); - } - - void ReplicationExecutorTest::setUp() { - _net = new NetworkInterfaceMock; - _executor.reset(new ReplicationExecutor(_net, prngSeed)); - } - - void ReplicationExecutorTest::tearDown() { - if (_executorThread) { - _executor->shutdown(); - joinExecutorThread(); - } - } - - TEST_F(ReplicationExecutorTest, RunOne) { - ReplicationExecutor& executor = getExecutor(); - Status status(ErrorCodes::InternalError, "Not mutated"); - ASSERT_OK(executor.scheduleWork(stdx::bind(setStatusAndShutdown, - stdx::placeholders::_1, - &status)).getStatus()); - executor.run(); - ASSERT_OK(status); - } - - TEST_F(ReplicationExecutorTest, Schedule1ButShutdown) { - ReplicationExecutor& executor = getExecutor(); - Status status(ErrorCodes::InternalError, "Not mutated"); - ASSERT_OK(executor.scheduleWork(stdx::bind(setStatusAndShutdown, - stdx::placeholders::_1, - &status)).getStatus()); - executor.shutdown(); - executor.run(); - ASSERT_EQUALS(status, ErrorCodes::CallbackCanceled); + ReplicationExecutor& getExecutor() { + return *_executor; } - TEST_F(ReplicationExecutorTest, Schedule2Cancel1) { - ReplicationExecutor& executor = getExecutor(); - Status status1(ErrorCodes::InternalError, "Not mutated"); - Status status2(ErrorCodes::InternalError, "Not mutated"); - ReplicationExecutor::CallbackHandle cb = unittest::assertGet( - executor.scheduleWork(stdx::bind(setStatusAndShutdown, - stdx::placeholders::_1, - &status1))); - executor.cancel(cb); - ASSERT_OK(executor.scheduleWork(stdx::bind(setStatusAndShutdown, - stdx::placeholders::_1, - &status2)).getStatus()); - executor.run(); - ASSERT_EQUALS(status1, ErrorCodes::CallbackCanceled); - ASSERT_OK(status2); - } - - TEST_F(ReplicationExecutorTest, OneSchedulesAnother) { - ReplicationExecutor& executor = getExecutor(); - Status status1(ErrorCodes::InternalError, "Not mutated"); - Status status2(ErrorCodes::InternalError, "Not mutated"); - ASSERT_OK(executor.scheduleWork(stdx::bind(scheduleSetStatusAndShutdown, - stdx::placeholders::_1, - &status1, - &status2)).getStatus()); - executor.run(); - ASSERT_OK(status1); - ASSERT_OK(status2); - } - - class EventChainAndWaitingTest { - MONGO_DISALLOW_COPYING(EventChainAndWaitingTest); - public: - EventChainAndWaitingTest(); - void run(); - private: - void onGo(const ReplicationExecutor::CallbackData& cbData); - void onGoAfterTriggered(const ReplicationExecutor::CallbackData& cbData); - - NetworkInterfaceMock* net; - ReplicationExecutor executor; - boost::thread executorThread; - const ReplicationExecutor::EventHandle goEvent; - const ReplicationExecutor::EventHandle event2; - const ReplicationExecutor::EventHandle event3; - ReplicationExecutor::EventHandle triggerEvent; - ReplicationExecutor::CallbackFn triggered2; - ReplicationExecutor::CallbackFn triggered3; - Status status1; - Status status2; - Status status3; - Status status4; - Status status5; - }; - - TEST(ReplicationExecutorTest, EventChainAndWaiting) { - 
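        // Runs the fixture's full scenario: onGo() chains further callbacks onto a
        // newly made event, the events are signaled and waited on in turn, and all
        // five recorded statuses are expected to come back OK.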
EventChainAndWaitingTest().run(); - } - - EventChainAndWaitingTest::EventChainAndWaitingTest() : - net(new NetworkInterfaceMock), - executor(net, prngSeed), - executorThread(stdx::bind(&ReplicationExecutor::run, &executor)), - goEvent(unittest::assertGet(executor.makeEvent())), - event2(unittest::assertGet(executor.makeEvent())), - event3(unittest::assertGet(executor.makeEvent())), - status1(ErrorCodes::InternalError, "Not mutated"), - status2(ErrorCodes::InternalError, "Not mutated"), - status3(ErrorCodes::InternalError, "Not mutated"), - status4(ErrorCodes::InternalError, "Not mutated"), - status5(ErrorCodes::InternalError, "Not mutated") { - - triggered2 = stdx::bind(setStatusAndTriggerEvent, - stdx::placeholders::_1, - &status2, - event2); - triggered3 = stdx::bind(setStatusAndTriggerEvent, - stdx::placeholders::_1, - &status3, - event3); - } - - void EventChainAndWaitingTest::run() { - executor.onEvent(goEvent, - stdx::bind(&EventChainAndWaitingTest::onGo, - this, - stdx::placeholders::_1)); - executor.signalEvent(goEvent); - executor.waitForEvent(goEvent); - executor.waitForEvent(event2); - executor.waitForEvent(event3); - - ReplicationExecutor::EventHandle neverSignaledEvent = - unittest::assertGet(executor.makeEvent()); - boost::thread neverSignaledWaiter(stdx::bind(&ReplicationExecutor::waitForEvent, - &executor, - neverSignaledEvent)); - ReplicationExecutor::CallbackHandle shutdownCallback = unittest::assertGet( - executor.scheduleWork(stdx::bind(setStatusAndShutdown, - stdx::placeholders::_1, - &status5))); - executor.wait(shutdownCallback); - neverSignaledWaiter.join(); - executorThread.join(); - ASSERT_OK(status1); - ASSERT_OK(status2); - ASSERT_OK(status3); - ASSERT_OK(status4); - ASSERT_OK(status5); - } - - void EventChainAndWaitingTest::onGo(const ReplicationExecutor::CallbackData& cbData) { - if (!cbData.status.isOK()) { - status1 = cbData.status; - return; - } - ReplicationExecutor* executor = cbData.executor; - StatusWith<ReplicationExecutor::EventHandle> errorOrTriggerEvent = executor->makeEvent(); - if (!errorOrTriggerEvent.isOK()) { - status1 = errorOrTriggerEvent.getStatus(); - executor->shutdown(); - return; - } - triggerEvent = errorOrTriggerEvent.getValue(); - StatusWith<ReplicationExecutor::CallbackHandle> cbHandle = executor->onEvent( - triggerEvent, triggered2); - if (!cbHandle.isOK()) { - status1 = cbHandle.getStatus(); - executor->shutdown(); - return; - } - cbHandle = executor->onEvent(triggerEvent, triggered3); - if (!cbHandle.isOK()) { - status1 = cbHandle.getStatus(); - executor->shutdown(); - return; - } - - cbHandle = executor->onEvent( - goEvent, - stdx::bind(&EventChainAndWaitingTest::onGoAfterTriggered, - this, - stdx::placeholders::_1)); - if (!cbHandle.isOK()) { - status1 = cbHandle.getStatus(); - executor->shutdown(); - return; - } - status1 = Status::OK(); - } - - void EventChainAndWaitingTest::onGoAfterTriggered( - const ReplicationExecutor::CallbackData& cbData) { - status4 = cbData.status; - if (!cbData.status.isOK()) { - return; - } - cbData.executor->signalEvent(triggerEvent); - } - - TEST_F(ReplicationExecutorTest, ScheduleWorkAt) { - NetworkInterfaceMock* net = getNet(); - ReplicationExecutor& executor = getExecutor(); - launchExecutorThread(); - Status status1(ErrorCodes::InternalError, "Not mutated"); - Status status2(ErrorCodes::InternalError, "Not mutated"); - Status status3(ErrorCodes::InternalError, "Not mutated"); - const Date_t now = net->now(); - const ReplicationExecutor::CallbackHandle cb1 = - 
unittest::assertGet(executor.scheduleWorkAt(Date_t(now.millis + 100), - stdx::bind(setStatus, - stdx::placeholders::_1, - &status1))); - unittest::assertGet(executor.scheduleWorkAt(Date_t(now.millis + 5000), - stdx::bind(setStatus, - stdx::placeholders::_1, - &status3))); - const ReplicationExecutor::CallbackHandle cb2 = - unittest::assertGet(executor.scheduleWorkAt(Date_t(now.millis + 200), - stdx::bind(setStatusAndShutdown, - stdx::placeholders::_1, - &status2))); - const Date_t startTime = net->now(); - net->runUntil(startTime + 200 /*ms*/); - ASSERT_EQUALS(startTime + 200, net->now()); - executor.wait(cb1); - executor.wait(cb2); - ASSERT_OK(status1); - ASSERT_OK(status2); - executor.shutdown(); + void launchExecutorThread(); + void joinExecutorThread(); + + virtual void setUp(); + virtual void tearDown(); + +private: + NetworkInterfaceMock* _net; + boost::scoped_ptr<ReplicationExecutor> _executor; + boost::scoped_ptr<boost::thread> _executorThread; +}; + +void ReplicationExecutorTest::launchExecutorThread() { + ASSERT(!_executorThread); + _executorThread.reset( + new boost::thread(stdx::bind(&ReplicationExecutor::run, _executor.get()))); + _net->enterNetwork(); +} + +void ReplicationExecutorTest::joinExecutorThread() { + ASSERT(_executorThread); + _net->exitNetwork(); + _executorThread->join(); + _executorThread.reset(); +} + +void ReplicationExecutorTest::setUp() { + _net = new NetworkInterfaceMock; + _executor.reset(new ReplicationExecutor(_net, prngSeed)); +} + +void ReplicationExecutorTest::tearDown() { + if (_executorThread) { + _executor->shutdown(); joinExecutorThread(); - ASSERT_EQUALS(status3, ErrorCodes::CallbackCanceled); } - - std::string getRequestDescription(const ReplicationExecutor::RemoteCommandRequest& request) { - return mongoutils::str::stream() << "Request(" << request.target.toString() << ", " << - request.dbname << ", " << request.cmdObj << ')'; +} + +TEST_F(ReplicationExecutorTest, RunOne) { + ReplicationExecutor& executor = getExecutor(); + Status status(ErrorCodes::InternalError, "Not mutated"); + ASSERT_OK( + executor.scheduleWork(stdx::bind(setStatusAndShutdown, stdx::placeholders::_1, &status)) + .getStatus()); + executor.run(); + ASSERT_OK(status); +} + +TEST_F(ReplicationExecutorTest, Schedule1ButShutdown) { + ReplicationExecutor& executor = getExecutor(); + Status status(ErrorCodes::InternalError, "Not mutated"); + ASSERT_OK( + executor.scheduleWork(stdx::bind(setStatusAndShutdown, stdx::placeholders::_1, &status)) + .getStatus()); + executor.shutdown(); + executor.run(); + ASSERT_EQUALS(status, ErrorCodes::CallbackCanceled); +} + +TEST_F(ReplicationExecutorTest, Schedule2Cancel1) { + ReplicationExecutor& executor = getExecutor(); + Status status1(ErrorCodes::InternalError, "Not mutated"); + Status status2(ErrorCodes::InternalError, "Not mutated"); + ReplicationExecutor::CallbackHandle cb = unittest::assertGet( + executor.scheduleWork(stdx::bind(setStatusAndShutdown, stdx::placeholders::_1, &status1))); + executor.cancel(cb); + ASSERT_OK( + executor.scheduleWork(stdx::bind(setStatusAndShutdown, stdx::placeholders::_1, &status2)) + .getStatus()); + executor.run(); + ASSERT_EQUALS(status1, ErrorCodes::CallbackCanceled); + ASSERT_OK(status2); +} + +TEST_F(ReplicationExecutorTest, OneSchedulesAnother) { + ReplicationExecutor& executor = getExecutor(); + Status status1(ErrorCodes::InternalError, "Not mutated"); + Status status2(ErrorCodes::InternalError, "Not mutated"); + ASSERT_OK(executor.scheduleWork(stdx::bind(scheduleSetStatusAndShutdown, + 
stdx::placeholders::_1, + &status1, + &status2)).getStatus()); + executor.run(); + ASSERT_OK(status1); + ASSERT_OK(status2); +} + +class EventChainAndWaitingTest { + MONGO_DISALLOW_COPYING(EventChainAndWaitingTest); + +public: + EventChainAndWaitingTest(); + void run(); + +private: + void onGo(const ReplicationExecutor::CallbackData& cbData); + void onGoAfterTriggered(const ReplicationExecutor::CallbackData& cbData); + + NetworkInterfaceMock* net; + ReplicationExecutor executor; + boost::thread executorThread; + const ReplicationExecutor::EventHandle goEvent; + const ReplicationExecutor::EventHandle event2; + const ReplicationExecutor::EventHandle event3; + ReplicationExecutor::EventHandle triggerEvent; + ReplicationExecutor::CallbackFn triggered2; + ReplicationExecutor::CallbackFn triggered3; + Status status1; + Status status2; + Status status3; + Status status4; + Status status5; +}; + +TEST(ReplicationExecutorTest, EventChainAndWaiting) { + EventChainAndWaitingTest().run(); +} + +EventChainAndWaitingTest::EventChainAndWaitingTest() + : net(new NetworkInterfaceMock), + executor(net, prngSeed), + executorThread(stdx::bind(&ReplicationExecutor::run, &executor)), + goEvent(unittest::assertGet(executor.makeEvent())), + event2(unittest::assertGet(executor.makeEvent())), + event3(unittest::assertGet(executor.makeEvent())), + status1(ErrorCodes::InternalError, "Not mutated"), + status2(ErrorCodes::InternalError, "Not mutated"), + status3(ErrorCodes::InternalError, "Not mutated"), + status4(ErrorCodes::InternalError, "Not mutated"), + status5(ErrorCodes::InternalError, "Not mutated") { + triggered2 = stdx::bind(setStatusAndTriggerEvent, stdx::placeholders::_1, &status2, event2); + triggered3 = stdx::bind(setStatusAndTriggerEvent, stdx::placeholders::_1, &status3, event3); +} + +void EventChainAndWaitingTest::run() { + executor.onEvent(goEvent, + stdx::bind(&EventChainAndWaitingTest::onGo, this, stdx::placeholders::_1)); + executor.signalEvent(goEvent); + executor.waitForEvent(goEvent); + executor.waitForEvent(event2); + executor.waitForEvent(event3); + + ReplicationExecutor::EventHandle neverSignaledEvent = unittest::assertGet(executor.makeEvent()); + boost::thread neverSignaledWaiter( + stdx::bind(&ReplicationExecutor::waitForEvent, &executor, neverSignaledEvent)); + ReplicationExecutor::CallbackHandle shutdownCallback = unittest::assertGet( + executor.scheduleWork(stdx::bind(setStatusAndShutdown, stdx::placeholders::_1, &status5))); + executor.wait(shutdownCallback); + neverSignaledWaiter.join(); + executorThread.join(); + ASSERT_OK(status1); + ASSERT_OK(status2); + ASSERT_OK(status3); + ASSERT_OK(status4); + ASSERT_OK(status5); +} + +void EventChainAndWaitingTest::onGo(const ReplicationExecutor::CallbackData& cbData) { + if (!cbData.status.isOK()) { + status1 = cbData.status; + return; } - - static void setStatusOnRemoteCommandCompletion( - const ReplicationExecutor::RemoteCommandCallbackData& cbData, - const ReplicationExecutor::RemoteCommandRequest& expectedRequest, - Status* outStatus) { - - if (cbData.request != expectedRequest) { - *outStatus = Status( - ErrorCodes::BadValue, - mongoutils::str::stream() << "Actual request: " << - getRequestDescription(cbData.request) << "; expected: " << - getRequestDescription(expectedRequest)); - return; - } - *outStatus = cbData.response.getStatus(); + ReplicationExecutor* executor = cbData.executor; + StatusWith<ReplicationExecutor::EventHandle> errorOrTriggerEvent = executor->makeEvent(); + if (!errorOrTriggerEvent.isOK()) { + status1 = 
errorOrTriggerEvent.getStatus(); + executor->shutdown(); + return; } - - TEST_F(ReplicationExecutorTest, ScheduleRemoteCommand) { - NetworkInterfaceMock* net = getNet(); - ReplicationExecutor& executor = getExecutor(); - launchExecutorThread(); - Status status1(ErrorCodes::InternalError, "Not mutated"); - const ReplicationExecutor::RemoteCommandRequest request( - HostAndPort("localhost", 27017), - "mydb", - BSON("whatsUp" << "doc")); - ReplicationExecutor::CallbackHandle cbHandle = unittest::assertGet( - executor.scheduleRemoteCommand( - request, - stdx::bind(setStatusOnRemoteCommandCompletion, - stdx::placeholders::_1, - request, - &status1))); - ASSERT(net->hasReadyRequests()); - NetworkInterfaceMock::NetworkOperationIterator noi = net->getNextReadyRequest(); - net->scheduleResponse(noi, - net->now(), - ResponseStatus(ErrorCodes::NoSuchKey, "I'm missing")); - net->runReadyNetworkOperations(); - ASSERT(!net->hasReadyRequests()); - executor.wait(cbHandle); - executor.shutdown(); - joinExecutorThread(); - ASSERT_EQUALS(ErrorCodes::NoSuchKey, status1); + triggerEvent = errorOrTriggerEvent.getValue(); + StatusWith<ReplicationExecutor::CallbackHandle> cbHandle = + executor->onEvent(triggerEvent, triggered2); + if (!cbHandle.isOK()) { + status1 = cbHandle.getStatus(); + executor->shutdown(); + return; } - - TEST_F(ReplicationExecutorTest, ScheduleAndCancelRemoteCommand) { - ReplicationExecutor& executor = getExecutor(); - Status status1(ErrorCodes::InternalError, "Not mutated"); - const ReplicationExecutor::RemoteCommandRequest request( - HostAndPort("localhost", 27017), - "mydb", - BSON("whatsUp" << "doc")); - ReplicationExecutor::CallbackHandle cbHandle = unittest::assertGet( - executor.scheduleRemoteCommand( - request, - stdx::bind(setStatusOnRemoteCommandCompletion, - stdx::placeholders::_1, - request, - &status1))); - executor.cancel(cbHandle); - launchExecutorThread(); - getNet()->runReadyNetworkOperations(); - executor.wait(cbHandle); - executor.shutdown(); - joinExecutorThread(); - ASSERT_EQUALS(ErrorCodes::CallbackCanceled, status1); + cbHandle = executor->onEvent(triggerEvent, triggered3); + if (!cbHandle.isOK()) { + status1 = cbHandle.getStatus(); + executor->shutdown(); + return; } - TEST_F(ReplicationExecutorTest, ScheduleExclusiveLockOperation) { - ReplicationExecutor& executor = getExecutor(); - Status status1(ErrorCodes::InternalError, "Not mutated"); - ASSERT_OK(executor.scheduleWorkWithGlobalExclusiveLock( - stdx::bind(setStatusAndShutdown, - stdx::placeholders::_1, - &status1)).getStatus()); - executor.run(); - ASSERT_OK(status1); + cbHandle = executor->onEvent( + goEvent, + stdx::bind(&EventChainAndWaitingTest::onGoAfterTriggered, this, stdx::placeholders::_1)); + if (!cbHandle.isOK()) { + status1 = cbHandle.getStatus(); + executor->shutdown(); + return; } + status1 = Status::OK(); +} - TEST_F(ReplicationExecutorTest, RemoteCommandWithTimeout) { - NetworkInterfaceMock* net = getNet(); - ReplicationExecutor& executor = getExecutor(); - Status status(ErrorCodes::InternalError, ""); - launchExecutorThread(); - const ReplicationExecutor::RemoteCommandRequest request( - HostAndPort("lazy", 27017), - "admin", - BSON("sleep" << 1), - ReplicationExecutor::Milliseconds(1)); - ReplicationExecutor::CallbackHandle cbHandle = unittest::assertGet( - executor.scheduleRemoteCommand( - request, - stdx::bind(setStatusOnRemoteCommandCompletion, - stdx::placeholders::_1, - request, - &status))); - ASSERT(net->hasReadyRequests()); - const Date_t startTime = net->now(); - 
NetworkInterfaceMock::NetworkOperationIterator noi = net->getNextReadyRequest(); - net->scheduleResponse(noi, - startTime + 2, - ResponseStatus(ErrorCodes::ExceededTimeLimit, "I took too long")); - net->runUntil(startTime + 2); - ASSERT_EQUALS(startTime + 2, net->now()); - executor.wait(cbHandle); - ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit, status); +void EventChainAndWaitingTest::onGoAfterTriggered(const ReplicationExecutor::CallbackData& cbData) { + status4 = cbData.status; + if (!cbData.status.isOK()) { + return; } - - TEST_F(ReplicationExecutorTest, CallbackHandleComparison) { - ReplicationExecutor& executor = getExecutor(); - Status status(ErrorCodes::InternalError, ""); - const ReplicationExecutor::RemoteCommandRequest request( - HostAndPort("lazy", 27017), - "admin", - BSON("cmd" << 1)); - ReplicationExecutor::CallbackHandle cbHandle1 = unittest::assertGet( - executor.scheduleRemoteCommand( - request, - stdx::bind(setStatusOnRemoteCommandCompletion, - stdx::placeholders::_1, - request, - &status))); - ReplicationExecutor::CallbackHandle cbHandle2 = unittest::assertGet( - executor.scheduleRemoteCommand( - request, - stdx::bind(setStatusOnRemoteCommandCompletion, - stdx::placeholders::_1, - request, - &status))); - - // test equality - ASSERT_TRUE(cbHandle1 == cbHandle1); - ASSERT_TRUE(cbHandle2 == cbHandle2); - ASSERT_FALSE(cbHandle1 != cbHandle1); - ASSERT_FALSE(cbHandle2 != cbHandle2); - - // test inequality - ASSERT_TRUE(cbHandle1 != cbHandle2); - ASSERT_TRUE(cbHandle2 != cbHandle1); - ASSERT_FALSE(cbHandle1 == cbHandle2); - ASSERT_FALSE(cbHandle2 == cbHandle1); - - ReplicationExecutor::CallbackHandle cbHandle1Copy = cbHandle1; - ASSERT_TRUE(cbHandle1 == cbHandle1Copy); - ASSERT_TRUE(cbHandle1Copy == cbHandle1); - ASSERT_FALSE(cbHandle1Copy != cbHandle1); - ASSERT_FALSE(cbHandle1 != cbHandle1Copy); - - std::vector<ReplicationExecutor::CallbackHandle> cbs; - cbs.push_back(cbHandle1); - cbs.push_back(cbHandle2); - ASSERT(cbHandle1 != cbHandle2); - std::vector<ReplicationExecutor::CallbackHandle>::iterator foundHandle = - std::find(cbs.begin(), - cbs.end(), - cbHandle1); - ASSERT_TRUE(cbs.end() != foundHandle); - ASSERT_TRUE(cbHandle1 == *foundHandle); - launchExecutorThread(); - executor.shutdown(); - joinExecutorThread(); + cbData.executor->signalEvent(triggerEvent); +} + +TEST_F(ReplicationExecutorTest, ScheduleWorkAt) { + NetworkInterfaceMock* net = getNet(); + ReplicationExecutor& executor = getExecutor(); + launchExecutorThread(); + Status status1(ErrorCodes::InternalError, "Not mutated"); + Status status2(ErrorCodes::InternalError, "Not mutated"); + Status status3(ErrorCodes::InternalError, "Not mutated"); + const Date_t now = net->now(); + const ReplicationExecutor::CallbackHandle cb1 = unittest::assertGet(executor.scheduleWorkAt( + Date_t(now.millis + 100), stdx::bind(setStatus, stdx::placeholders::_1, &status1))); + unittest::assertGet(executor.scheduleWorkAt( + Date_t(now.millis + 5000), stdx::bind(setStatus, stdx::placeholders::_1, &status3))); + const ReplicationExecutor::CallbackHandle cb2 = unittest::assertGet(executor.scheduleWorkAt( + Date_t(now.millis + 200), + stdx::bind(setStatusAndShutdown, stdx::placeholders::_1, &status2))); + const Date_t startTime = net->now(); + net->runUntil(startTime + 200 /*ms*/); + ASSERT_EQUALS(startTime + 200, net->now()); + executor.wait(cb1); + executor.wait(cb2); + ASSERT_OK(status1); + ASSERT_OK(status2); + executor.shutdown(); + joinExecutorThread(); + ASSERT_EQUALS(status3, ErrorCodes::CallbackCanceled); +} + +std::string 
getRequestDescription(const ReplicationExecutor::RemoteCommandRequest& request) { + return mongoutils::str::stream() << "Request(" << request.target.toString() << ", " + << request.dbname << ", " << request.cmdObj << ')'; +} + +static void setStatusOnRemoteCommandCompletion( + const ReplicationExecutor::RemoteCommandCallbackData& cbData, + const ReplicationExecutor::RemoteCommandRequest& expectedRequest, + Status* outStatus) { + if (cbData.request != expectedRequest) { + *outStatus = Status(ErrorCodes::BadValue, + mongoutils::str::stream() + << "Actual request: " << getRequestDescription(cbData.request) + << "; expected: " << getRequestDescription(expectedRequest)); + return; } + *outStatus = cbData.response.getStatus(); +} + +TEST_F(ReplicationExecutorTest, ScheduleRemoteCommand) { + NetworkInterfaceMock* net = getNet(); + ReplicationExecutor& executor = getExecutor(); + launchExecutorThread(); + Status status1(ErrorCodes::InternalError, "Not mutated"); + const ReplicationExecutor::RemoteCommandRequest request(HostAndPort("localhost", 27017), + "mydb", + BSON("whatsUp" + << "doc")); + ReplicationExecutor::CallbackHandle cbHandle = + unittest::assertGet(executor.scheduleRemoteCommand( + request, + stdx::bind( + setStatusOnRemoteCommandCompletion, stdx::placeholders::_1, request, &status1))); + ASSERT(net->hasReadyRequests()); + NetworkInterfaceMock::NetworkOperationIterator noi = net->getNextReadyRequest(); + net->scheduleResponse(noi, net->now(), ResponseStatus(ErrorCodes::NoSuchKey, "I'm missing")); + net->runReadyNetworkOperations(); + ASSERT(!net->hasReadyRequests()); + executor.wait(cbHandle); + executor.shutdown(); + joinExecutorThread(); + ASSERT_EQUALS(ErrorCodes::NoSuchKey, status1); +} + +TEST_F(ReplicationExecutorTest, ScheduleAndCancelRemoteCommand) { + ReplicationExecutor& executor = getExecutor(); + Status status1(ErrorCodes::InternalError, "Not mutated"); + const ReplicationExecutor::RemoteCommandRequest request(HostAndPort("localhost", 27017), + "mydb", + BSON("whatsUp" + << "doc")); + ReplicationExecutor::CallbackHandle cbHandle = + unittest::assertGet(executor.scheduleRemoteCommand( + request, + stdx::bind( + setStatusOnRemoteCommandCompletion, stdx::placeholders::_1, request, &status1))); + executor.cancel(cbHandle); + launchExecutorThread(); + getNet()->runReadyNetworkOperations(); + executor.wait(cbHandle); + executor.shutdown(); + joinExecutorThread(); + ASSERT_EQUALS(ErrorCodes::CallbackCanceled, status1); +} + +TEST_F(ReplicationExecutorTest, ScheduleExclusiveLockOperation) { + ReplicationExecutor& executor = getExecutor(); + Status status1(ErrorCodes::InternalError, "Not mutated"); + ASSERT_OK(executor.scheduleWorkWithGlobalExclusiveLock( + stdx::bind(setStatusAndShutdown, stdx::placeholders::_1, &status1)) + .getStatus()); + executor.run(); + ASSERT_OK(status1); +} + +TEST_F(ReplicationExecutorTest, RemoteCommandWithTimeout) { + NetworkInterfaceMock* net = getNet(); + ReplicationExecutor& executor = getExecutor(); + Status status(ErrorCodes::InternalError, ""); + launchExecutorThread(); + const ReplicationExecutor::RemoteCommandRequest request(HostAndPort("lazy", 27017), + "admin", + BSON("sleep" << 1), + ReplicationExecutor::Milliseconds(1)); + ReplicationExecutor::CallbackHandle cbHandle = + unittest::assertGet(executor.scheduleRemoteCommand( + request, + stdx::bind( + setStatusOnRemoteCommandCompletion, stdx::placeholders::_1, request, &status))); + ASSERT(net->hasReadyRequests()); + const Date_t startTime = net->now(); + 
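    // The canned response below is scheduled at startTime + 2, past the request's
    // 1 ms timeout; its ExceededTimeLimit status is what the completion callback
    // records into "status".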
NetworkInterfaceMock::NetworkOperationIterator noi = net->getNextReadyRequest(); + net->scheduleResponse( + noi, startTime + 2, ResponseStatus(ErrorCodes::ExceededTimeLimit, "I took too long")); + net->runUntil(startTime + 2); + ASSERT_EQUALS(startTime + 2, net->now()); + executor.wait(cbHandle); + ASSERT_EQUALS(ErrorCodes::ExceededTimeLimit, status); +} + +TEST_F(ReplicationExecutorTest, CallbackHandleComparison) { + ReplicationExecutor& executor = getExecutor(); + Status status(ErrorCodes::InternalError, ""); + const ReplicationExecutor::RemoteCommandRequest request( + HostAndPort("lazy", 27017), "admin", BSON("cmd" << 1)); + ReplicationExecutor::CallbackHandle cbHandle1 = + unittest::assertGet(executor.scheduleRemoteCommand( + request, + stdx::bind( + setStatusOnRemoteCommandCompletion, stdx::placeholders::_1, request, &status))); + ReplicationExecutor::CallbackHandle cbHandle2 = + unittest::assertGet(executor.scheduleRemoteCommand( + request, + stdx::bind( + setStatusOnRemoteCommandCompletion, stdx::placeholders::_1, request, &status))); + + // test equality + ASSERT_TRUE(cbHandle1 == cbHandle1); + ASSERT_TRUE(cbHandle2 == cbHandle2); + ASSERT_FALSE(cbHandle1 != cbHandle1); + ASSERT_FALSE(cbHandle2 != cbHandle2); + + // test inequality + ASSERT_TRUE(cbHandle1 != cbHandle2); + ASSERT_TRUE(cbHandle2 != cbHandle1); + ASSERT_FALSE(cbHandle1 == cbHandle2); + ASSERT_FALSE(cbHandle2 == cbHandle1); + + ReplicationExecutor::CallbackHandle cbHandle1Copy = cbHandle1; + ASSERT_TRUE(cbHandle1 == cbHandle1Copy); + ASSERT_TRUE(cbHandle1Copy == cbHandle1); + ASSERT_FALSE(cbHandle1Copy != cbHandle1); + ASSERT_FALSE(cbHandle1 != cbHandle1Copy); + + std::vector<ReplicationExecutor::CallbackHandle> cbs; + cbs.push_back(cbHandle1); + cbs.push_back(cbHandle2); + ASSERT(cbHandle1 != cbHandle2); + std::vector<ReplicationExecutor::CallbackHandle>::iterator foundHandle = + std::find(cbs.begin(), cbs.end(), cbHandle1); + ASSERT_TRUE(cbs.end() != foundHandle); + ASSERT_TRUE(cbHandle1 == *foundHandle); + launchExecutorThread(); + executor.shutdown(); + joinExecutorThread(); +} } // namespace } // namespace repl } // namespace mongo diff --git a/src/mongo/db/repl/replication_info.cpp b/src/mongo/db/repl/replication_info.cpp index cf7ff0faabb..72449c7c6a5 100644 --- a/src/mongo/db/repl/replication_info.cpp +++ b/src/mongo/db/repl/replication_info.cpp @@ -49,182 +49,193 @@ namespace mongo { - using std::auto_ptr; - using std::list; - using std::string; - using std::stringstream; +using std::auto_ptr; +using std::list; +using std::string; +using std::stringstream; namespace repl { - void appendReplicationInfo(OperationContext* txn, BSONObjBuilder& result, int level) { - ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator(); - if (replCoord->getSettings().usingReplSets()) { - IsMasterResponse isMasterResponse; - replCoord->fillIsMasterForReplSet(&isMasterResponse); - result.appendElements(isMasterResponse.toBSON()); - if (level) { - replCoord->appendSlaveInfoData(&result); - } - return; - } - - // TODO(dannenberg) replAllDead is bad and should be removed when master slave is removed - if (replAllDead) { - result.append("ismaster", 0); - string s = string("dead: ") + replAllDead; - result.append("info", s); +void appendReplicationInfo(OperationContext* txn, BSONObjBuilder& result, int level) { + ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator(); + if (replCoord->getSettings().usingReplSets()) { + IsMasterResponse isMasterResponse; + 
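// The "level" argument is a verbosity knob; the branches below make it concrete:
//
//   appendReplicationInfo(txn, result, 0);  // isMaster-style summary only
//   appendReplicationInfo(txn, result, 1);  // adds slave info / the local.sources array
//   appendReplicationInfo(txn, result, 2);  // additionally dials each source to compute lagSeconds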
replCoord->fillIsMasterForReplSet(&isMasterResponse); + result.appendElements(isMasterResponse.toBSON()); + if (level) { + replCoord->appendSlaveInfoData(&result); } - else { - result.appendBool("ismaster", - getGlobalReplicationCoordinator()->isMasterForReportingPurposes()); + return; + } + + // TODO(dannenberg) replAllDead is bad and should be removed when master slave is removed + if (replAllDead) { + result.append("ismaster", 0); + string s = string("dead: ") + replAllDead; + result.append("info", s); + } else { + result.appendBool("ismaster", + getGlobalReplicationCoordinator()->isMasterForReportingPurposes()); + } + + if (level) { + BSONObjBuilder sources(result.subarrayStart("sources")); + + int n = 0; + list<BSONObj> src; + { + const char* localSources = "local.sources"; + AutoGetCollectionForRead ctx(txn, localSources); + auto_ptr<PlanExecutor> exec( + InternalPlanner::collectionScan(txn, localSources, ctx.getCollection())); + BSONObj obj; + PlanExecutor::ExecState state; + while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) { + src.push_back(obj); + } } - - if (level) { - BSONObjBuilder sources( result.subarrayStart( "sources" ) ); - - int n = 0; - list<BSONObj> src; + + for (list<BSONObj>::const_iterator i = src.begin(); i != src.end(); i++) { + BSONObj s = *i; + BSONObjBuilder bb; + bb.append(s["host"]); + string sourcename = s["source"].valuestr(); + if (sourcename != "main") + bb.append(s["source"]); { - const char* localSources = "local.sources"; - AutoGetCollectionForRead ctx(txn, localSources); - auto_ptr<PlanExecutor> exec( - InternalPlanner::collectionScan(txn, localSources, ctx.getCollection())); - BSONObj obj; - PlanExecutor::ExecState state; - while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) { - src.push_back(obj); - } + BSONElement e = s["syncedTo"]; + BSONObjBuilder t(bb.subobjStart("syncedTo")); + t.appendDate("time", e.timestampTime()); + t.append("inc", e.timestampInc()); + t.done(); } - - for( list<BSONObj>::const_iterator i = src.begin(); i != src.end(); i++ ) { - BSONObj s = *i; - BSONObjBuilder bb; - bb.append( s["host"] ); - string sourcename = s["source"].valuestr(); - if ( sourcename != "main" ) - bb.append( s["source"] ); - { - BSONElement e = s["syncedTo"]; - BSONObjBuilder t( bb.subobjStart( "syncedTo" ) ); - t.appendDate( "time" , e.timestampTime() ); - t.append( "inc" , e.timestampInc() ); - t.done(); - } - - if ( level > 1 ) { - wassert(!txn->lockState()->isLocked()); - // note: there is no so-style timeout on this connection; perhaps we should have one. - ScopedDbConnection conn(s["host"].valuestr()); - - DBClientConnection *cliConn = dynamic_cast< DBClientConnection* >( &conn.conn() ); - if ( cliConn && replAuthenticate(cliConn) ) { - BSONObj first = conn->findOne( (string)"local.oplog.$" + sourcename, - Query().sort( BSON( "$natural" << 1 ) ) ); - BSONObj last = conn->findOne( (string)"local.oplog.$" + sourcename, - Query().sort( BSON( "$natural" << -1 ) ) ); - bb.appendDate( "masterFirst" , first["ts"].timestampTime() ); - bb.appendDate( "masterLast" , last["ts"].timestampTime() ); - double lag = (double) (last["ts"].timestampTime() - s["syncedTo"].timestampTime()); - bb.append( "lagSeconds" , lag / 1000 ); - } - conn.done(); + + if (level > 1) { + wassert(!txn->lockState()->isLocked()); + // note: there is no so-style timeout on this connection; perhaps we should have one. 
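// The lag reported below is simply "newest op on the master minus the last op
// this source has synced", converted from milliseconds to seconds. Worked
// example with hypothetical timestamps:
//
//   masterLast = 18:00:10.000  (last["ts"].timestampTime())
//   syncedTo   = 18:00:05.000  (s["syncedTo"].timestampTime())
//   lagSeconds = (masterLast - syncedTo) / 1000  ==  5000 / 1000  ==  5.0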
+ ScopedDbConnection conn(s["host"].valuestr()); + + DBClientConnection* cliConn = dynamic_cast<DBClientConnection*>(&conn.conn()); + if (cliConn && replAuthenticate(cliConn)) { + BSONObj first = conn->findOne((string) "local.oplog.$" + sourcename, + Query().sort(BSON("$natural" << 1))); + BSONObj last = conn->findOne((string) "local.oplog.$" + sourcename, + Query().sort(BSON("$natural" << -1))); + bb.appendDate("masterFirst", first["ts"].timestampTime()); + bb.appendDate("masterLast", last["ts"].timestampTime()); + double lag = + (double)(last["ts"].timestampTime() - s["syncedTo"].timestampTime()); + bb.append("lagSeconds", lag / 1000); } - - sources.append( BSONObjBuilder::numStr( n++ ) , bb.obj() ); + conn.done(); } - - sources.done(); - replCoord->appendSlaveInfoData(&result); + sources.append(BSONObjBuilder::numStr(n++), bb.obj()); } + + sources.done(); + + replCoord->appendSlaveInfoData(&result); } - - class ReplicationInfoServerStatus : public ServerStatusSection { - public: - ReplicationInfoServerStatus() : ServerStatusSection( "repl" ){} - bool includeByDefault() const { return true; } - - BSONObj generateSection(OperationContext* txn, - const BSONElement& configElement) const { - - if (!getGlobalReplicationCoordinator()->isReplEnabled()) { - return BSONObj(); - } - - int level = configElement.numberInt(); - - BSONObjBuilder result; - appendReplicationInfo(txn, result, level); - getGlobalReplicationCoordinator()->processReplSetGetRBID(&result); - - return result.obj(); +} + +class ReplicationInfoServerStatus : public ServerStatusSection { +public: + ReplicationInfoServerStatus() : ServerStatusSection("repl") {} + bool includeByDefault() const { + return true; + } + + BSONObj generateSection(OperationContext* txn, const BSONElement& configElement) const { + if (!getGlobalReplicationCoordinator()->isReplEnabled()) { + return BSONObj(); } - } replicationInfoServerStatus; + int level = configElement.numberInt(); - class OplogInfoServerStatus : public ServerStatusSection { - public: - OplogInfoServerStatus() : ServerStatusSection( "oplog" ){} - bool includeByDefault() const { return false; } + BSONObjBuilder result; + appendReplicationInfo(txn, result, level); + getGlobalReplicationCoordinator()->processReplSetGetRBID(&result); - BSONObj generateSection(OperationContext* txn, - const BSONElement& configElement) const { + return result.obj(); + } - ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator(); - if (!replCoord->isReplEnabled()) { - return BSONObj(); - } +} replicationInfoServerStatus; - BSONObjBuilder result; - result.append("latestOptime", replCoord->getMyLastOptime()); - - const char* oplogNS = - replCoord->getReplicationMode() == ReplicationCoordinator::modeReplSet ? 
- rsoplog : "local.oplog.$main"; - BSONObj o; - uassert(17347, - "Problem reading earliest entry from oplog", - Helpers::getSingleton(txn, oplogNS, o)); - result.append("earliestOptime", o["ts"]._opTime()); - return result.obj(); - } - } oplogInfoServerStatus; +class OplogInfoServerStatus : public ServerStatusSection { +public: + OplogInfoServerStatus() : ServerStatusSection("oplog") {} + bool includeByDefault() const { + return false; + } - class CmdIsMaster : public Command { - public: - virtual bool requiresAuth() { return false; } - virtual bool slaveOk() const { - return true; - } - virtual void help( stringstream &help ) const { - help << "Check if this server is primary for a replica pair/set; also if it is --master or --slave in simple master/slave setups.\n"; - help << "{ isMaster : 1 }"; - } - virtual bool isWriteCommandForConfigServer() const { return false; } - virtual void addRequiredPrivileges(const std::string& dbname, - const BSONObj& cmdObj, - std::vector<Privilege>* out) {} // No auth required - CmdIsMaster() : Command("isMaster", true, "ismaster") { } - virtual bool run(OperationContext* txn, const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) { - /* currently request to arbiter is (somewhat arbitrarily) an ismaster request that is not - authenticated. - */ - if ( cmdObj["forShell"].trueValue() ) - lastError.disableForCommand(); - - appendReplicationInfo(txn, result, 0); - - result.appendNumber("maxBsonObjectSize", BSONObjMaxUserSize); - result.appendNumber("maxMessageSizeBytes", MaxMessageSizeBytes); - result.appendNumber("maxWriteBatchSize", BatchedCommandRequest::kMaxWriteBatchSize); - result.appendDate("localTime", jsTime()); - result.append("maxWireVersion", maxWireVersion); - result.append("minWireVersion", minWireVersion); - return true; + BSONObj generateSection(OperationContext* txn, const BSONElement& configElement) const { + ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator(); + if (!replCoord->isReplEnabled()) { + return BSONObj(); } - } cmdismaster; - OpCounterServerStatusSection replOpCounterServerStatusSection( "opcountersRepl", &replOpCounters ); + BSONObjBuilder result; + result.append("latestOptime", replCoord->getMyLastOptime()); + + const char* oplogNS = replCoord->getReplicationMode() == ReplicationCoordinator::modeReplSet + ? rsoplog + : "local.oplog.$main"; + BSONObj o; + uassert(17347, + "Problem reading earliest entry from oplog", + Helpers::getSingleton(txn, oplogNS, o)); + result.append("earliestOptime", o["ts"]._opTime()); + return result.obj(); + } +} oplogInfoServerStatus; + +class CmdIsMaster : public Command { +public: + virtual bool requiresAuth() { + return false; + } + virtual bool slaveOk() const { + return true; + } + virtual void help(stringstream& help) const { + help << "Check if this server is primary for a replica pair/set; also if it is --master or " + "--slave in simple master/slave setups.\n"; + help << "{ isMaster : 1 }"; + } + virtual bool isWriteCommandForConfigServer() const { + return false; + } + virtual void addRequiredPrivileges(const std::string& dbname, + const BSONObj& cmdObj, + std::vector<Privilege>* out) {} // No auth required + CmdIsMaster() : Command("isMaster", true, "ismaster") {} + virtual bool run(OperationContext* txn, + const string&, + BSONObj& cmdObj, + int, + string& errmsg, + BSONObjBuilder& result, + bool /*fromRepl*/) { + /* currently request to arbiter is (somewhat arbitrarily) an ismaster request that is not + authenticated. 
+ */ + if (cmdObj["forShell"].trueValue()) + lastError.disableForCommand(); + + appendReplicationInfo(txn, result, 0); + + result.appendNumber("maxBsonObjectSize", BSONObjMaxUserSize); + result.appendNumber("maxMessageSizeBytes", MaxMessageSizeBytes); + result.appendNumber("maxWriteBatchSize", BatchedCommandRequest::kMaxWriteBatchSize); + result.appendDate("localTime", jsTime()); + result.append("maxWireVersion", maxWireVersion); + result.append("minWireVersion", minWireVersion); + return true; + } +} cmdismaster; + +OpCounterServerStatusSection replOpCounterServerStatusSection("opcountersRepl", &replOpCounters); -} // namespace repl -} // namespace mongo +} // namespace repl +} // namespace mongo diff --git a/src/mongo/db/repl/replset_commands.cpp b/src/mongo/db/repl/replset_commands.cpp index 5e03fa43766..20c40774837 100644 --- a/src/mongo/db/repl/replset_commands.cpp +++ b/src/mongo/db/repl/replset_commands.cpp @@ -57,671 +57,736 @@ namespace mongo { namespace repl { - using std::string; - using std::stringstream; - using std::vector; +using std::string; +using std::stringstream; +using std::vector; - unsigned replSetForceInitialSyncFailure = 0; +unsigned replSetForceInitialSyncFailure = 0; - // Testing only, enabled via command-line. - class CmdReplSetTest : public ReplSetCommand { - public: - virtual void help( stringstream &help ) const { - help << "Just for regression tests.\n"; +// Testing only, enabled via command-line. +class CmdReplSetTest : public ReplSetCommand { +public: + virtual void help(stringstream& help) const { + help << "Just for regression tests.\n"; + } + // No auth needed because it only works when enabled via command line. + virtual void addRequiredPrivileges(const std::string& dbname, + const BSONObj& cmdObj, + std::vector<Privilege>* out) {} + CmdReplSetTest() : ReplSetCommand("replSetTest") {} + virtual bool run(OperationContext* txn, + const string&, + BSONObj& cmdObj, + int, + string& errmsg, + BSONObjBuilder& result, + bool fromRepl) { + log() << "replSet replSetTest command received: " << cmdObj.toString(); + + if (cmdObj.hasElement("forceInitialSyncFailure")) { + replSetForceInitialSyncFailure = (unsigned)cmdObj["forceInitialSyncFailure"].Number(); + return true; } - // No auth needed because it only works when enabled via command line. - virtual void addRequiredPrivileges(const std::string& dbname, - const BSONObj& cmdObj, - std::vector<Privilege>* out) {} - CmdReplSetTest() : ReplSetCommand("replSetTest") { } - virtual bool run(OperationContext* txn, const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { - log() << "replSet replSetTest command received: " << cmdObj.toString(); - - if( cmdObj.hasElement("forceInitialSyncFailure") ) { - replSetForceInitialSyncFailure = (unsigned) cmdObj["forceInitialSyncFailure"].Number(); - return true; - } - Status status = getGlobalReplicationCoordinator()->checkReplEnabledForCommand(&result); - if (!status.isOK()) - return appendCommandStatus(result, status); + Status status = getGlobalReplicationCoordinator()->checkReplEnabledForCommand(&result); + if (!status.isOK()) + return appendCommandStatus(result, status); - return false; - } - }; - MONGO_INITIALIZER(RegisterReplSetTestCmd)(InitializerContext* context) { - if (Command::testCommandsEnabled) { - // Leaked intentionally: a Command registers itself when constructed. 
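// The leak is safe because registration is a side effect of construction: the
// Command base constructor records "this" in a process-wide registry that must
// outlive static destruction. A minimal sketch of the pattern (the registry
// accessor shown is hypothetical, not MongoDB's actual member):
//
//   class Command {
//   public:
//       explicit Command(const char* name) { commandRegistry()[name] = this; }
//       virtual ~Command();
//   };
//   // gated on Command::testCommandsEnabled at initializer time:
//   new CmdReplSetTest();  // never deleted; owned, in effect, by the process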
- new CmdReplSetTest(); - } - return Status::OK(); + return false; } +}; +MONGO_INITIALIZER(RegisterReplSetTestCmd)(InitializerContext* context) { + if (Command::testCommandsEnabled) { + // Leaked intentionally: a Command registers itself when constructed. + new CmdReplSetTest(); + } + return Status::OK(); +} - /** get rollback id. used to check if a rollback happened during some interval of time. - as consumed, the rollback id is not in any particular order, it simply changes on each rollback. - @see incRBID() - */ - class CmdReplSetGetRBID : public ReplSetCommand { - public: - virtual void help( stringstream &help ) const { - help << "internal"; - } - CmdReplSetGetRBID() : ReplSetCommand("replSetGetRBID") {} - virtual void addRequiredPrivileges(const std::string& dbname, - const BSONObj& cmdObj, - std::vector<Privilege>* out) { - ActionSet actions; - actions.addAction(ActionType::internal); - out->push_back(Privilege(ResourcePattern::forClusterResource(), actions)); - } - virtual bool run(OperationContext* txn, const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { - Status status = getGlobalReplicationCoordinator()->checkReplEnabledForCommand(&result); - if (!status.isOK()) - return appendCommandStatus(result, status); - - status = getGlobalReplicationCoordinator()->processReplSetGetRBID(&result); +/** get rollback id. used to check if a rollback happened during some interval of time. + as consumed, the rollback id is not in any particular order, it simply changes on each rollback. + @see incRBID() +*/ +class CmdReplSetGetRBID : public ReplSetCommand { +public: + virtual void help(stringstream& help) const { + help << "internal"; + } + CmdReplSetGetRBID() : ReplSetCommand("replSetGetRBID") {} + virtual void addRequiredPrivileges(const std::string& dbname, + const BSONObj& cmdObj, + std::vector<Privilege>* out) { + ActionSet actions; + actions.addAction(ActionType::internal); + out->push_back(Privilege(ResourcePattern::forClusterResource(), actions)); + } + virtual bool run(OperationContext* txn, + const string&, + BSONObj& cmdObj, + int, + string& errmsg, + BSONObjBuilder& result, + bool fromRepl) { + Status status = getGlobalReplicationCoordinator()->checkReplEnabledForCommand(&result); + if (!status.isOK()) return appendCommandStatus(result, status); - } - } cmdReplSetRBID; - - class CmdReplSetGetStatus : public ReplSetCommand { - public: - virtual void help( stringstream &help ) const { - help << "Report status of a replica set from the POV of this server\n"; - help << "{ replSetGetStatus : 1 }"; - help << "\nhttp://dochub.mongodb.org/core/replicasetcommands"; - } - virtual void addRequiredPrivileges(const std::string& dbname, - const BSONObj& cmdObj, - std::vector<Privilege>* out) { - ActionSet actions; - actions.addAction(ActionType::replSetGetStatus); - out->push_back(Privilege(ResourcePattern::forClusterResource(), actions)); - } - CmdReplSetGetStatus() : ReplSetCommand("replSetGetStatus", true) { } - virtual bool run(OperationContext* txn, const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { - if ( cmdObj["forShell"].trueValue() ) - lastError.disableForCommand(); - Status status = getGlobalReplicationCoordinator()->checkReplEnabledForCommand(&result); - if (!status.isOK()) - return appendCommandStatus(result, status); + status = getGlobalReplicationCoordinator()->processReplSetGetRBID(&result); + return appendCommandStatus(result, status); + } +} cmdReplSetRBID; + +class CmdReplSetGetStatus : 
public ReplSetCommand { +public: + virtual void help(stringstream& help) const { + help << "Report status of a replica set from the POV of this server\n"; + help << "{ replSetGetStatus : 1 }"; + help << "\nhttp://dochub.mongodb.org/core/replicasetcommands"; + } + virtual void addRequiredPrivileges(const std::string& dbname, + const BSONObj& cmdObj, + std::vector<Privilege>* out) { + ActionSet actions; + actions.addAction(ActionType::replSetGetStatus); + out->push_back(Privilege(ResourcePattern::forClusterResource(), actions)); + } + CmdReplSetGetStatus() : ReplSetCommand("replSetGetStatus", true) {} + virtual bool run(OperationContext* txn, + const string&, + BSONObj& cmdObj, + int, + string& errmsg, + BSONObjBuilder& result, + bool fromRepl) { + if (cmdObj["forShell"].trueValue()) + lastError.disableForCommand(); + + Status status = getGlobalReplicationCoordinator()->checkReplEnabledForCommand(&result); + if (!status.isOK()) + return appendCommandStatus(result, status); - status = getGlobalReplicationCoordinator()->processReplSetGetStatus(&result); + status = getGlobalReplicationCoordinator()->processReplSetGetStatus(&result); + return appendCommandStatus(result, status); + } +} cmdReplSetGetStatus; + +class CmdReplSetGetConfig : public ReplSetCommand { +public: + virtual void help(stringstream& help) const { + help << "Returns the current replica set configuration"; + help << "{ replSetGetConfig : 1 }"; + help << "\nhttp://dochub.mongodb.org/core/replicasetcommands"; + } + virtual void addRequiredPrivileges(const std::string& dbname, + const BSONObj& cmdObj, + std::vector<Privilege>* out) { + ActionSet actions; + actions.addAction(ActionType::replSetGetConfig); + out->push_back(Privilege(ResourcePattern::forClusterResource(), actions)); + } + CmdReplSetGetConfig() : ReplSetCommand("replSetGetConfig", true) {} + virtual bool run(OperationContext* txn, + const string&, + BSONObj& cmdObj, + int, + string& errmsg, + BSONObjBuilder& result, + bool fromRepl) { + Status status = getGlobalReplicationCoordinator()->checkReplEnabledForCommand(&result); + if (!status.isOK()) return appendCommandStatus(result, status); - } - } cmdReplSetGetStatus; - - class CmdReplSetGetConfig : public ReplSetCommand { - public: - virtual void help( stringstream &help ) const { - help << "Returns the current replica set configuration"; - help << "{ replSetGetConfig : 1 }"; - help << "\nhttp://dochub.mongodb.org/core/replicasetcommands"; - } - virtual void addRequiredPrivileges(const std::string& dbname, - const BSONObj& cmdObj, - std::vector<Privilege>* out) { - ActionSet actions; - actions.addAction(ActionType::replSetGetConfig); - out->push_back(Privilege(ResourcePattern::forClusterResource(), actions)); - } - CmdReplSetGetConfig() : ReplSetCommand("replSetGetConfig", true) { } - virtual bool run(OperationContext* txn, const string& , BSONObj& cmdObj, - int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { - Status status = getGlobalReplicationCoordinator()->checkReplEnabledForCommand(&result); - if (!status.isOK()) - return appendCommandStatus(result, status); - getGlobalReplicationCoordinator()->processReplSetGetConfig(&result); - return true; - } - } cmdReplSetGetConfig; + getGlobalReplicationCoordinator()->processReplSetGetConfig(&result); + return true; + } +} cmdReplSetGetConfig; namespace { - HostAndPort someHostAndPortForMe() { - const char* ips = serverGlobalParams.bind_ip.c_str(); - while (*ips) { - std::string ip; - const char* comma = strchr(ips, ','); - if (comma) { - ip = std::string(ips, 
comma - ips); - ips = comma + 1; - } - else { - ip = std::string(ips); - ips = ""; - } - HostAndPort h = HostAndPort(ip, serverGlobalParams.port); - if (!h.isLocalHost()) { - return h; - } +HostAndPort someHostAndPortForMe() { + const char* ips = serverGlobalParams.bind_ip.c_str(); + while (*ips) { + std::string ip; + const char* comma = strchr(ips, ','); + if (comma) { + ip = std::string(ips, comma - ips); + ips = comma + 1; + } else { + ip = std::string(ips); + ips = ""; + } + HostAndPort h = HostAndPort(ip, serverGlobalParams.port); + if (!h.isLocalHost()) { + return h; } - - std::string h = getHostName(); - verify(!h.empty()); - verify(h != "localhost"); - return HostAndPort(h, serverGlobalParams.port); } -} // namespace - class CmdReplSetInitiate : public ReplSetCommand { - public: - virtual bool isWriteCommandForConfigServer() const { return false; } - CmdReplSetInitiate() : ReplSetCommand("replSetInitiate") { } - virtual void help(stringstream& h) const { - h << "Initiate/christen a replica set."; - h << "\nhttp://dochub.mongodb.org/core/replicasetcommands"; - } - virtual void addRequiredPrivileges(const std::string& dbname, - const BSONObj& cmdObj, - std::vector<Privilege>* out) { - ActionSet actions; - actions.addAction(ActionType::replSetConfigure); - out->push_back(Privilege(ResourcePattern::forClusterResource(), actions)); - } - virtual bool run(OperationContext* txn, - const string& , - BSONObj& cmdObj, - int, string& errmsg, - BSONObjBuilder& result, - bool fromRepl) { - - BSONObj configObj; - if( cmdObj["replSetInitiate"].type() == Object ) { - configObj = cmdObj["replSetInitiate"].Obj(); - } + std::string h = getHostName(); + verify(!h.empty()); + verify(h != "localhost"); + return HostAndPort(h, serverGlobalParams.port); +} +} // namespace - if (configObj.isEmpty()) { - result.append("info2", "no configuration explicitly specified -- making one"); - log() << "replSet info initiate : no configuration specified. " - "Using a default configuration for the set"; - - ReplicationCoordinatorExternalStateImpl externalState; - std::string name; - std::vector<HostAndPort> seeds; - std::set<HostAndPort> seedSet; - parseReplSetSeedList( - &externalState, - getGlobalReplicationCoordinator()->getSettings().replSet, - name, - seeds, - seedSet); // may throw... 
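// For reference, the builder code below produces a default initiate config of
// this shape (set name and hosts are hypothetical; they come from --replSet
// and the parsed seed list):
//
//   {
//       _id: "rs0",
//       version: 1,
//       members: [
//           { _id: 0, host: "thisHost:27017" },   // someHostAndPortForMe()
//           { _id: 1, host: "seed1:27017" }       // one entry per seed
//       ]
//   }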
- - BSONObjBuilder b; - b.append("_id", name); - b.append("version", 1); - BSONObjBuilder members; - HostAndPort me = someHostAndPortForMe(); - members.append("0", BSON( "_id" << 0 << "host" << me.toString() )); - result.append("me", me.toString()); - for( unsigned i = 0; i < seeds.size(); i++ ) { - members.append(BSONObjBuilder::numStr(i+1), - BSON( "_id" << i+1 << "host" << seeds[i].toString())); - } - b.appendArray("members", members.obj()); - configObj = b.obj(); - log() << "replSet created this configuration for initiation : " << - configObj.toString(); +class CmdReplSetInitiate : public ReplSetCommand { +public: + virtual bool isWriteCommandForConfigServer() const { + return false; + } + CmdReplSetInitiate() : ReplSetCommand("replSetInitiate") {} + virtual void help(stringstream& h) const { + h << "Initiate/christen a replica set."; + h << "\nhttp://dochub.mongodb.org/core/replicasetcommands"; + } + virtual void addRequiredPrivileges(const std::string& dbname, + const BSONObj& cmdObj, + std::vector<Privilege>* out) { + ActionSet actions; + actions.addAction(ActionType::replSetConfigure); + out->push_back(Privilege(ResourcePattern::forClusterResource(), actions)); + } + virtual bool run(OperationContext* txn, + const string&, + BSONObj& cmdObj, + int, + string& errmsg, + BSONObjBuilder& result, + bool fromRepl) { + BSONObj configObj; + if (cmdObj["replSetInitiate"].type() == Object) { + configObj = cmdObj["replSetInitiate"].Obj(); + } + + if (configObj.isEmpty()) { + result.append("info2", "no configuration explicitly specified -- making one"); + log() << "replSet info initiate : no configuration specified. " + "Using a default configuration for the set"; + + ReplicationCoordinatorExternalStateImpl externalState; + std::string name; + std::vector<HostAndPort> seeds; + std::set<HostAndPort> seedSet; + parseReplSetSeedList(&externalState, + getGlobalReplicationCoordinator()->getSettings().replSet, + name, + seeds, + seedSet); // may throw... + + BSONObjBuilder b; + b.append("_id", name); + b.append("version", 1); + BSONObjBuilder members; + HostAndPort me = someHostAndPortForMe(); + members.append("0", BSON("_id" << 0 << "host" << me.toString())); + result.append("me", me.toString()); + for (unsigned i = 0; i < seeds.size(); i++) { + members.append(BSONObjBuilder::numStr(i + 1), + BSON("_id" << i + 1 << "host" << seeds[i].toString())); } + b.appendArray("members", members.obj()); + configObj = b.obj(); + log() << "replSet created this configuration for initiation : " << configObj.toString(); + } - if (configObj.getField("version").eoo()) { - // Missing version field defaults to version 1. - BSONObjBuilder builder; - builder.appendElements(configObj); - builder.append("version", 1); - configObj = builder.obj(); - } + if (configObj.getField("version").eoo()) { + // Missing version field defaults to version 1. + BSONObjBuilder builder; + builder.appendElements(configObj); + builder.append("version", 1); + configObj = builder.obj(); + } + + Status status = + getGlobalReplicationCoordinator()->processReplSetInitiate(txn, configObj, &result); + return appendCommandStatus(result, status); + } +} cmdReplSetInitiate; + +class CmdReplSetReconfig : public ReplSetCommand { + RWLock mutex; /* we don't need rw but we wanted try capability. 
:-( */ +public: + virtual void help(stringstream& help) const { + help << "Adjust configuration of a replica set\n"; + help << "{ replSetReconfig : config_object }"; + help << "\nhttp://dochub.mongodb.org/core/replicasetcommands"; + } + virtual void addRequiredPrivileges(const std::string& dbname, + const BSONObj& cmdObj, + std::vector<Privilege>* out) { + ActionSet actions; + actions.addAction(ActionType::replSetConfigure); + out->push_back(Privilege(ResourcePattern::forClusterResource(), actions)); + } + CmdReplSetReconfig() : ReplSetCommand("replSetReconfig"), mutex("rsreconfig") {} + virtual bool run(OperationContext* txn, + const string& a, + BSONObj& b, + int e, + string& errmsg, + BSONObjBuilder& c, + bool d) { + try { + rwlock_try_write lk(mutex); + return _run(txn, a, b, e, errmsg, c, d); + } catch (rwlock_try_write::exception&) { + } + errmsg = "a replSetReconfig is already in progress"; + return false; + } - Status status = getGlobalReplicationCoordinator()->processReplSetInitiate(txn, - configObj, - &result); +private: + bool _run(OperationContext* txn, + const string&, + BSONObj& cmdObj, + int, + string& errmsg, + BSONObjBuilder& result, + bool fromRepl) { + Status status = getGlobalReplicationCoordinator()->checkReplEnabledForCommand(&result); + if (!status.isOK()) { return appendCommandStatus(result, status); } - } cmdReplSetInitiate; - - class CmdReplSetReconfig : public ReplSetCommand { - RWLock mutex; /* we don't need rw but we wanted try capability. :-( */ - public: - virtual void help( stringstream &help ) const { - help << "Adjust configuration of a replica set\n"; - help << "{ replSetReconfig : config_object }"; - help << "\nhttp://dochub.mongodb.org/core/replicasetcommands"; - } - virtual void addRequiredPrivileges(const std::string& dbname, - const BSONObj& cmdObj, - std::vector<Privilege>* out) { - ActionSet actions; - actions.addAction(ActionType::replSetConfigure); - out->push_back(Privilege(ResourcePattern::forClusterResource(), actions)); - } - CmdReplSetReconfig() : ReplSetCommand("replSetReconfig"), mutex("rsreconfig") { } - virtual bool run(OperationContext* txn, const string& a, BSONObj& b, int e, string& errmsg, BSONObjBuilder& c, bool d) { - try { - rwlock_try_write lk(mutex); - return _run(txn, a,b,e,errmsg,c,d); - } - catch(rwlock_try_write::exception&) { } - errmsg = "a replSetReconfig is already in progress"; + + if (cmdObj["replSetReconfig"].type() != Object) { + errmsg = "no configuration specified"; return false; } - private: - bool _run(OperationContext* txn, const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { - Status status = getGlobalReplicationCoordinator()->checkReplEnabledForCommand(&result); - if (!status.isOK()) { - return appendCommandStatus(result, status); - } - if( cmdObj["replSetReconfig"].type() != Object ) { - errmsg = "no configuration specified"; - return false; - } + ReplicationCoordinator::ReplSetReconfigArgs parsedArgs; + parsedArgs.newConfigObj = cmdObj["replSetReconfig"].Obj(); + parsedArgs.force = cmdObj.hasField("force") && cmdObj["force"].trueValue(); + status = + getGlobalReplicationCoordinator()->processReplSetReconfig(txn, parsedArgs, &result); - ReplicationCoordinator::ReplSetReconfigArgs parsedArgs; - parsedArgs.newConfigObj = cmdObj["replSetReconfig"].Obj(); - parsedArgs.force = cmdObj.hasField("force") && cmdObj["force"].trueValue(); - status = getGlobalReplicationCoordinator()->processReplSetReconfig(txn, - parsedArgs, - &result); + ScopedTransaction scopedXact(txn, 
MODE_X); + Lock::GlobalWrite globalWrite(txn->lockState()); - ScopedTransaction scopedXact(txn, MODE_X); - Lock::GlobalWrite globalWrite(txn->lockState()); + WriteUnitOfWork wuow(txn); + if (status.isOK() && !parsedArgs.force) { + logOpInitiate(txn, + BSON("msg" + << "Reconfig set" + << "version" << parsedArgs.newConfigObj["version"])); + } + wuow.commit(); - WriteUnitOfWork wuow(txn); - if (status.isOK() && !parsedArgs.force) { - logOpInitiate(txn, BSON("msg" << "Reconfig set" << - "version" << parsedArgs.newConfigObj["version"])); - } - wuow.commit(); + return appendCommandStatus(result, status); + } +} cmdReplSetReconfig; + +class CmdReplSetFreeze : public ReplSetCommand { +public: + virtual void help(stringstream& help) const { + help << "{ replSetFreeze : <seconds> }"; + help << "'freeze' state of member to the extent we can do that. What this really means is " + "that\n"; + help << "this node will not attempt to become primary until the time period specified " + "expires.\n"; + help << "You can call again with {replSetFreeze:0} to unfreeze sooner.\n"; + help << "A process restart unfreezes the member also.\n"; + help << "\nhttp://dochub.mongodb.org/core/replicasetcommands"; + } + virtual void addRequiredPrivileges(const std::string& dbname, + const BSONObj& cmdObj, + std::vector<Privilege>* out) { + ActionSet actions; + actions.addAction(ActionType::replSetStateChange); + out->push_back(Privilege(ResourcePattern::forClusterResource(), actions)); + } + CmdReplSetFreeze() : ReplSetCommand("replSetFreeze") {} + virtual bool run(OperationContext* txn, + const string&, + BSONObj& cmdObj, + int, + string& errmsg, + BSONObjBuilder& result, + bool fromRepl) { + Status status = getGlobalReplicationCoordinator()->checkReplEnabledForCommand(&result); + if (!status.isOK()) + return appendCommandStatus(result, status); + int secs = (int)cmdObj.firstElement().numberInt(); + return appendCommandStatus( + result, getGlobalReplicationCoordinator()->processReplSetFreeze(secs, &result)); + } +} cmdReplSetFreeze; + +class CmdReplSetStepDown : public ReplSetCommand { +public: + virtual void help(stringstream& help) const { + help << "{ replSetStepDown : <seconds> }\n"; + help << "Step down as primary. Will not try to reelect self for the specified time period " + "(1 minute if no numeric secs value specified).\n"; + help << "(If another member with same priority takes over in the meantime, it will stay " + "primary.)\n"; + help << "http://dochub.mongodb.org/core/replicasetcommands"; + } + virtual void addRequiredPrivileges(const std::string& dbname, + const BSONObj& cmdObj, + std::vector<Privilege>* out) { + ActionSet actions; + actions.addAction(ActionType::replSetStateChange); + out->push_back(Privilege(ResourcePattern::forClusterResource(), actions)); + } + CmdReplSetStepDown() : ReplSetCommand("replSetStepDown") {} + virtual bool run(OperationContext* txn, + const string&, + BSONObj& cmdObj, + int, + string& errmsg, + BSONObjBuilder& result, + bool fromRepl) { + Status status = getGlobalReplicationCoordinator()->checkReplEnabledForCommand(&result); + if (!status.isOK()) return appendCommandStatus(result, status); - } - } cmdReplSetReconfig; - - class CmdReplSetFreeze : public ReplSetCommand { - public: - virtual void help( stringstream &help ) const { - help << "{ replSetFreeze : <seconds> }"; - help << "'freeze' state of member to the extent we can do that. 
What this really means is that\n"; - help << "this node will not attempt to become primary until the time period specified expires.\n"; - help << "You can call again with {replSetFreeze:0} to unfreeze sooner.\n"; - help << "A process restart unfreezes the member also.\n"; - help << "\nhttp://dochub.mongodb.org/core/replicasetcommands"; - } - virtual void addRequiredPrivileges(const std::string& dbname, - const BSONObj& cmdObj, - std::vector<Privilege>* out) { - ActionSet actions; - actions.addAction(ActionType::replSetStateChange); - out->push_back(Privilege(ResourcePattern::forClusterResource(), actions)); - } - CmdReplSetFreeze() : ReplSetCommand("replSetFreeze") { } - virtual bool run(OperationContext* txn, const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { - Status status = getGlobalReplicationCoordinator()->checkReplEnabledForCommand(&result); - if (!status.isOK()) - return appendCommandStatus(result, status); - int secs = (int) cmdObj.firstElement().numberInt(); - return appendCommandStatus( - result, - getGlobalReplicationCoordinator()->processReplSetFreeze(secs, &result)); - } - } cmdReplSetFreeze; - - class CmdReplSetStepDown: public ReplSetCommand { - public: - virtual void help( stringstream &help ) const { - help << "{ replSetStepDown : <seconds> }\n"; - help << "Step down as primary. Will not try to reelect self for the specified time period (1 minute if no numeric secs value specified).\n"; - help << "(If another member with same priority takes over in the meantime, it will stay primary.)\n"; - help << "http://dochub.mongodb.org/core/replicasetcommands"; - } - virtual void addRequiredPrivileges(const std::string& dbname, - const BSONObj& cmdObj, - std::vector<Privilege>* out) { - ActionSet actions; - actions.addAction(ActionType::replSetStateChange); - out->push_back(Privilege(ResourcePattern::forClusterResource(), actions)); - } - CmdReplSetStepDown() : ReplSetCommand("replSetStepDown") { } - virtual bool run(OperationContext* txn, const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { - Status status = getGlobalReplicationCoordinator()->checkReplEnabledForCommand(&result); - if (!status.isOK()) - return appendCommandStatus(result, status); + const bool force = cmdObj["force"].trueValue(); - const bool force = cmdObj["force"].trueValue(); + long long stepDownForSecs = cmdObj.firstElement().numberLong(); + if (stepDownForSecs == 0) { + stepDownForSecs = 60; + } else if (stepDownForSecs < 0) { + status = Status(ErrorCodes::BadValue, "stepdown period must be a positive integer"); + return appendCommandStatus(result, status); + } - long long stepDownForSecs = cmdObj.firstElement().numberLong(); - if (stepDownForSecs == 0) { - stepDownForSecs = 60; - } - else if (stepDownForSecs < 0) { - status = Status(ErrorCodes::BadValue, - "stepdown period must be a positive integer"); - return appendCommandStatus(result, status); + long long secondaryCatchUpPeriodSecs; + status = bsonExtractIntegerField( + cmdObj, "secondaryCatchUpPeriodSecs", &secondaryCatchUpPeriodSecs); + if (status.code() == ErrorCodes::NoSuchKey) { + // if field is absent, default values + if (force) { + secondaryCatchUpPeriodSecs = 0; + } else { + secondaryCatchUpPeriodSecs = 10; } + } else if (!status.isOK()) { + return appendCommandStatus(result, status); + } - long long secondaryCatchUpPeriodSecs; - status = bsonExtractIntegerField(cmdObj, - "secondaryCatchUpPeriodSecs", - &secondaryCatchUpPeriodSecs); - if (status.code() == 
ErrorCodes::NoSuchKey) { - // if field is absent, default values - if (force) { - secondaryCatchUpPeriodSecs = 0; - } - else { - secondaryCatchUpPeriodSecs = 10; - } - } - else if (!status.isOK()) { - return appendCommandStatus(result, status); - } + if (secondaryCatchUpPeriodSecs < 0) { + status = Status(ErrorCodes::BadValue, + "secondaryCatchUpPeriodSecs period must be a positive or absent"); + return appendCommandStatus(result, status); + } - if (secondaryCatchUpPeriodSecs < 0) { - status = Status(ErrorCodes::BadValue, - "secondaryCatchUpPeriodSecs period must be a positive or absent"); - return appendCommandStatus(result, status); - } + if (stepDownForSecs < secondaryCatchUpPeriodSecs) { + status = Status(ErrorCodes::BadValue, + "stepdown period must be longer than secondaryCatchUpPeriodSecs"); + return appendCommandStatus(result, status); + } - if (stepDownForSecs < secondaryCatchUpPeriodSecs) { - status = Status(ErrorCodes::BadValue, - "stepdown period must be longer than secondaryCatchUpPeriodSecs"); - return appendCommandStatus(result, status); - } + log() << "Attempting to step down in response to replSetStepDown command"; - log() << "Attempting to step down in response to replSetStepDown command"; + status = getGlobalReplicationCoordinator()->stepDown( + txn, + force, + ReplicationCoordinator::Milliseconds(secondaryCatchUpPeriodSecs * 1000), + ReplicationCoordinator::Milliseconds(stepDownForSecs * 1000)); + return appendCommandStatus(result, status); + } +} cmdReplSetStepDown; - status = getGlobalReplicationCoordinator()->stepDown( - txn, - force, - ReplicationCoordinator::Milliseconds(secondaryCatchUpPeriodSecs * 1000), - ReplicationCoordinator::Milliseconds(stepDownForSecs * 1000)); +class CmdReplSetMaintenance : public ReplSetCommand { +public: + virtual void help(stringstream& help) const { + help << "{ replSetMaintenance : bool }\n"; + help << "Enable or disable maintenance mode."; + } + virtual void addRequiredPrivileges(const std::string& dbname, + const BSONObj& cmdObj, + std::vector<Privilege>* out) { + ActionSet actions; + actions.addAction(ActionType::replSetStateChange); + out->push_back(Privilege(ResourcePattern::forClusterResource(), actions)); + } + CmdReplSetMaintenance() : ReplSetCommand("replSetMaintenance") {} + virtual bool run(OperationContext* txn, + const string&, + BSONObj& cmdObj, + int, + string& errmsg, + BSONObjBuilder& result, + bool fromRepl) { + Status status = getGlobalReplicationCoordinator()->checkReplEnabledForCommand(&result); + if (!status.isOK()) return appendCommandStatus(result, status); - } - } cmdReplSetStepDown; - class CmdReplSetMaintenance: public ReplSetCommand { - public: - virtual void help( stringstream &help ) const { - help << "{ replSetMaintenance : bool }\n"; - help << "Enable or disable maintenance mode."; - } - virtual void addRequiredPrivileges(const std::string& dbname, - const BSONObj& cmdObj, - std::vector<Privilege>* out) { - ActionSet actions; - actions.addAction(ActionType::replSetStateChange); - out->push_back(Privilege(ResourcePattern::forClusterResource(), actions)); - } - CmdReplSetMaintenance() : ReplSetCommand("replSetMaintenance") { } - virtual bool run(OperationContext* txn, const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { - Status status = getGlobalReplicationCoordinator()->checkReplEnabledForCommand(&result); - if (!status.isOK()) - return appendCommandStatus(result, status); + return appendCommandStatus(result, + 
getGlobalReplicationCoordinator()->setMaintenanceMode( + cmdObj["replSetMaintenance"].trueValue())); + } +} cmdReplSetMaintenance; - return appendCommandStatus( - result, - getGlobalReplicationCoordinator()->setMaintenanceMode( - cmdObj["replSetMaintenance"].trueValue())); - } - } cmdReplSetMaintenance; +class CmdReplSetSyncFrom : public ReplSetCommand { +public: + virtual void help(stringstream& help) const { + help << "{ replSetSyncFrom : \"host:port\" }\n"; + help << "Change who this member is syncing from."; + } + virtual void addRequiredPrivileges(const std::string& dbname, + const BSONObj& cmdObj, + std::vector<Privilege>* out) { + ActionSet actions; + actions.addAction(ActionType::replSetStateChange); + out->push_back(Privilege(ResourcePattern::forClusterResource(), actions)); + } + CmdReplSetSyncFrom() : ReplSetCommand("replSetSyncFrom") {} + virtual bool run(OperationContext* txn, + const string&, + BSONObj& cmdObj, + int, + string& errmsg, + BSONObjBuilder& result, + bool fromRepl) { + Status status = getGlobalReplicationCoordinator()->checkReplEnabledForCommand(&result); + if (!status.isOK()) + return appendCommandStatus(result, status); - class CmdReplSetSyncFrom: public ReplSetCommand { - public: - virtual void help( stringstream &help ) const { - help << "{ replSetSyncFrom : \"host:port\" }\n"; - help << "Change who this member is syncing from."; - } - virtual void addRequiredPrivileges(const std::string& dbname, - const BSONObj& cmdObj, - std::vector<Privilege>* out) { - ActionSet actions; - actions.addAction(ActionType::replSetStateChange); - out->push_back(Privilege(ResourcePattern::forClusterResource(), actions)); - } - CmdReplSetSyncFrom() : ReplSetCommand("replSetSyncFrom") { } - virtual bool run(OperationContext* txn, const string&, - BSONObj& cmdObj, - int, - string& errmsg, - BSONObjBuilder& result, - bool fromRepl) { - Status status = getGlobalReplicationCoordinator()->checkReplEnabledForCommand(&result); - if (!status.isOK()) - return appendCommandStatus(result, status); + HostAndPort targetHostAndPort; + status = targetHostAndPort.initialize(cmdObj["replSetSyncFrom"].valuestrsafe()); + if (!status.isOK()) + return appendCommandStatus(result, status); - HostAndPort targetHostAndPort; - status = targetHostAndPort.initialize(cmdObj["replSetSyncFrom"].valuestrsafe()); - if (!status.isOK()) - return appendCommandStatus(result, status); + return appendCommandStatus( + result, + getGlobalReplicationCoordinator()->processReplSetSyncFrom(targetHostAndPort, &result)); + } +} cmdReplSetSyncFrom; - return appendCommandStatus( - result, - getGlobalReplicationCoordinator()->processReplSetSyncFrom(targetHostAndPort, - &result)); - } - } cmdReplSetSyncFrom; +class CmdReplSetUpdatePosition : public ReplSetCommand { +public: + virtual void help(stringstream& help) const { + help << "internal"; + } + virtual void addRequiredPrivileges(const std::string& dbname, + const BSONObj& cmdObj, + std::vector<Privilege>* out) { + ActionSet actions; + actions.addAction(ActionType::internal); + out->push_back(Privilege(ResourcePattern::forClusterResource(), actions)); + } + CmdReplSetUpdatePosition() : ReplSetCommand("replSetUpdatePosition") {} + virtual bool run(OperationContext* txn, + const string&, + BSONObj& cmdObj, + int, + string& errmsg, + BSONObjBuilder& result, + bool fromRepl) { + Status status = getGlobalReplicationCoordinator()->checkReplEnabledForCommand(&result); + if (!status.isOK()) + return appendCommandStatus(result, status); - class CmdReplSetUpdatePosition: public 
ReplSetCommand { - public: - virtual void help( stringstream &help ) const { - help << "internal"; - } - virtual void addRequiredPrivileges(const std::string& dbname, - const BSONObj& cmdObj, - std::vector<Privilege>* out) { - ActionSet actions; - actions.addAction(ActionType::internal); - out->push_back(Privilege(ResourcePattern::forClusterResource(), actions)); - } - CmdReplSetUpdatePosition() : ReplSetCommand("replSetUpdatePosition") { } - virtual bool run(OperationContext* txn, const string& , BSONObj& cmdObj, int, string& errmsg, - BSONObjBuilder& result, bool fromRepl) { - Status status = getGlobalReplicationCoordinator()->checkReplEnabledForCommand(&result); + if (cmdObj.hasField("handshake")) { + // we have received a handshake, not an update message + // handshakes are done here to ensure the receiving end supports the update command + + HandshakeArgs handshake; + status = handshake.initialize(cmdObj["handshake"].embeddedObject()); if (!status.isOK()) return appendCommandStatus(result, status); - if (cmdObj.hasField("handshake")) { - // we have received a handshake, not an update message - // handshakes are done here to ensure the receiving end supports the update command - - HandshakeArgs handshake; - status = handshake.initialize(cmdObj["handshake"].embeddedObject()); - if (!status.isOK()) - return appendCommandStatus(result, status); - - if (!handshake.hasMemberId()) { - return appendCommandStatus( - result, - Status(ErrorCodes::NoSuchKey, - "replSetUpdatePosition handshake was missing 'member' field")); - } - + if (!handshake.hasMemberId()) { return appendCommandStatus( - result, - getGlobalReplicationCoordinator()->processHandshake(txn, handshake)); + result, + Status(ErrorCodes::NoSuchKey, + "replSetUpdatePosition handshake was missing 'member' field")); } - UpdatePositionArgs args; - status = args.initialize(cmdObj); - if (!status.isOK()) - return appendCommandStatus(result, status); - return appendCommandStatus( - result, - getGlobalReplicationCoordinator()->processReplSetUpdatePosition(args)); - + result, getGlobalReplicationCoordinator()->processHandshake(txn, handshake)); } - } cmdReplSetUpdatePosition; + + UpdatePositionArgs args; + status = args.initialize(cmdObj); + if (!status.isOK()) + return appendCommandStatus(result, status); + + return appendCommandStatus( + result, getGlobalReplicationCoordinator()->processReplSetUpdatePosition(args)); + } +} cmdReplSetUpdatePosition; namespace { - /** - * Returns true if there is no data on this server. Useful when starting replication. - * The "local" database does NOT count except for "rs.oplog" collection. - * Used to set the hasData field on replset heartbeat command response. - */ - bool replHasDatabases(OperationContext* txn) { - vector<string> names; - StorageEngine* storageEngine = getGlobalEnvironment()->getGlobalStorageEngine(); - storageEngine->listDatabases(&names); - - if( names.size() >= 2 ) return true; - if( names.size() == 1 ) { - if( names[0] != "local" ) - return true; - - // we have a local database. return true if oplog isn't empty - BSONObj o; - if (Helpers::getSingleton(txn, repl::rsoplog, o)) { - return true; - } +/** + * Returns true if there is no data on this server. Useful when starting replication. + * The "local" database does NOT count except for "rs.oplog" collection. + * Used to set the hasData field on replset heartbeat command response. 
+ */ +bool replHasDatabases(OperationContext* txn) { + vector<string> names; + StorageEngine* storageEngine = getGlobalEnvironment()->getGlobalStorageEngine(); + storageEngine->listDatabases(&names); + + if (names.size() >= 2) + return true; + if (names.size() == 1) { + if (names[0] != "local") + return true; + + // we have a local database. return true if oplog isn't empty + BSONObj o; + if (Helpers::getSingleton(txn, repl::rsoplog, o)) { + return true; } - return false; } + return false; +} -} // namespace +} // namespace - MONGO_FP_DECLARE(rsDelayHeartbeatResponse); +MONGO_FP_DECLARE(rsDelayHeartbeatResponse); - /* { replSetHeartbeat : <setname> } */ - class CmdReplSetHeartbeat : public ReplSetCommand { - public: - void help(stringstream& h) const { h << "internal"; } - CmdReplSetHeartbeat() : ReplSetCommand("replSetHeartbeat") { } - virtual void addRequiredPrivileges(const std::string& dbname, - const BSONObj& cmdObj, - std::vector<Privilege>* out) { - ActionSet actions; - actions.addAction(ActionType::internal); - out->push_back(Privilege(ResourcePattern::forClusterResource(), actions)); +/* { replSetHeartbeat : <setname> } */ +class CmdReplSetHeartbeat : public ReplSetCommand { +public: + void help(stringstream& h) const { + h << "internal"; + } + CmdReplSetHeartbeat() : ReplSetCommand("replSetHeartbeat") {} + virtual void addRequiredPrivileges(const std::string& dbname, + const BSONObj& cmdObj, + std::vector<Privilege>* out) { + ActionSet actions; + actions.addAction(ActionType::internal); + out->push_back(Privilege(ResourcePattern::forClusterResource(), actions)); + } + virtual bool run(OperationContext* txn, + const string&, + BSONObj& cmdObj, + int, + string& errmsg, + BSONObjBuilder& result, + bool fromRepl) { + MONGO_FAIL_POINT_BLOCK(rsDelayHeartbeatResponse, delay) { + const BSONObj& data = delay.getData(); + sleepsecs(data["delay"].numberInt()); + } + + Status status = Status(ErrorCodes::InternalError, "status not set in heartbeat code"); + /* we don't call ReplSetCommand::check() here because heartbeat + checks many things that are pre-initialization. */ + if (!getGlobalReplicationCoordinator()->getSettings().usingReplSets()) { + status = Status(ErrorCodes::NoReplicationEnabled, "not running with --replSet"); + return appendCommandStatus(result, status); } - virtual bool run(OperationContext* txn, const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) { - MONGO_FAIL_POINT_BLOCK(rsDelayHeartbeatResponse, delay) { - const BSONObj& data = delay.getData(); - sleepsecs(data["delay"].numberInt()); - } + /* we want to keep heartbeat connections open when relinquishing primary. + tag them here. */ + { + AbstractMessagingPort* mp = txn->getClient()->port(); + if (mp) + mp->tag |= ScopedConn::keepOpen; + } - Status status = Status(ErrorCodes::InternalError, "status not set in heartbeat code"); - /* we don't call ReplSetCommand::check() here because heartbeat - checks many things that are pre-initialization. */ - if (!getGlobalReplicationCoordinator()->getSettings().usingReplSets()) { - status = Status(ErrorCodes::NoReplicationEnabled, "not running with --replSet"); - return appendCommandStatus(result, status); - } + ReplSetHeartbeatArgs args; + status = args.initialize(cmdObj); + if (!status.isOK()) { + return appendCommandStatus(result, status); + } - /* we want to keep heartbeat connections open when relinquishing primary. - tag them here. 
*/ - { - AbstractMessagingPort *mp = txn->getClient()->port(); - if( mp ) - mp->tag |= ScopedConn::keepOpen; - } + // ugh. + if (args.getCheckEmpty()) { + result.append("hasData", replHasDatabases(txn)); + } - ReplSetHeartbeatArgs args; - status = args.initialize(cmdObj); - if (!status.isOK()) { - return appendCommandStatus(result, status); - } + ReplSetHeartbeatResponse response; + status = getGlobalReplicationCoordinator()->processHeartbeat(args, &response); + if (status.isOK()) + response.addToBSON(&result); + return appendCommandStatus(result, status); + } +} cmdReplSetHeartbeat; - // ugh. - if (args.getCheckEmpty()) { - result.append("hasData", replHasDatabases(txn)); - } +/** the first cmd called by a node seeking election and it's a basic sanity + test: do any of the nodes it can reach know that it can't be the primary? + */ +class CmdReplSetFresh : public ReplSetCommand { +public: + void help(stringstream& h) const { + h << "internal"; + } + CmdReplSetFresh() : ReplSetCommand("replSetFresh") {} + virtual void addRequiredPrivileges(const std::string& dbname, + const BSONObj& cmdObj, + std::vector<Privilege>* out) { + ActionSet actions; + actions.addAction(ActionType::internal); + out->push_back(Privilege(ResourcePattern::forClusterResource(), actions)); + } - ReplSetHeartbeatResponse response; - status = getGlobalReplicationCoordinator()->processHeartbeat(args, &response); - if (status.isOK()) - response.addToBSON(&result); + virtual bool run(OperationContext* txn, + const string&, + BSONObj& cmdObj, + int, + string& errmsg, + BSONObjBuilder& result, + bool fromRepl) { + Status status = getGlobalReplicationCoordinator()->checkReplEnabledForCommand(&result); + if (!status.isOK()) return appendCommandStatus(result, status); - } - } cmdReplSetHeartbeat; - - /** the first cmd called by a node seeking election and it's a basic sanity - test: do any of the nodes it can reach know that it can't be the primary? 
- */ - class CmdReplSetFresh : public ReplSetCommand { - public: - void help(stringstream& h) const { h << "internal"; } - CmdReplSetFresh() : ReplSetCommand("replSetFresh") { } - virtual void addRequiredPrivileges(const std::string& dbname, - const BSONObj& cmdObj, - std::vector<Privilege>* out) { - ActionSet actions; - actions.addAction(ActionType::internal); - out->push_back(Privilege(ResourcePattern::forClusterResource(), actions)); - } - virtual bool run(OperationContext* txn, - const string&, - BSONObj& cmdObj, - int, - string& errmsg, - BSONObjBuilder& result, - bool fromRepl) { - Status status = getGlobalReplicationCoordinator()->checkReplEnabledForCommand(&result); - if (!status.isOK()) - return appendCommandStatus(result, status); + ReplicationCoordinator::ReplSetFreshArgs parsedArgs; + parsedArgs.id = cmdObj["id"].Int(); + parsedArgs.setName = cmdObj["set"].checkAndGetStringData(); + parsedArgs.who = HostAndPort(cmdObj["who"].String()); + BSONElement cfgverElement = cmdObj["cfgver"]; + uassert(28525, + str::stream() << "Expected cfgver argument to replSetFresh command to have " + "numeric type, but found " << typeName(cfgverElement.type()), + cfgverElement.isNumber()); + parsedArgs.cfgver = cfgverElement.safeNumberLong(); + parsedArgs.opTime = OpTime(cmdObj["opTime"].Date()); + + status = getGlobalReplicationCoordinator()->processReplSetFresh(parsedArgs, &result); + return appendCommandStatus(result, status); + } +} cmdReplSetFresh; - ReplicationCoordinator::ReplSetFreshArgs parsedArgs; - parsedArgs.id = cmdObj["id"].Int(); - parsedArgs.setName = cmdObj["set"].checkAndGetStringData(); - parsedArgs.who = HostAndPort(cmdObj["who"].String()); - BSONElement cfgverElement = cmdObj["cfgver"]; - uassert(28525, - str::stream() << "Expected cfgver argument to replSetFresh command to have " - "numeric type, but found " << typeName(cfgverElement.type()), - cfgverElement.isNumber()); - parsedArgs.cfgver = cfgverElement.safeNumberLong(); - parsedArgs.opTime = OpTime(cmdObj["opTime"].Date()); - - status = getGlobalReplicationCoordinator()->processReplSetFresh(parsedArgs, &result); - return appendCommandStatus(result, status); - } - } cmdReplSetFresh; - - class CmdReplSetElect : public ReplSetCommand { - public: - void help(stringstream& h) const { h << "internal"; } - CmdReplSetElect() : ReplSetCommand("replSetElect") { } - virtual void addRequiredPrivileges(const std::string& dbname, - const BSONObj& cmdObj, - std::vector<Privilege>* out) { - ActionSet actions; - actions.addAction(ActionType::internal); - out->push_back(Privilege(ResourcePattern::forClusterResource(), actions)); - } - private: - virtual bool run(OperationContext* txn, - const string&, - BSONObj& cmdObj, - int, - string& errmsg, - BSONObjBuilder& result, - bool fromRepl) { - DEV log() << "replSet received elect msg " << cmdObj.toString(); - else LOG(2) << "replSet received elect msg " << cmdObj.toString(); - - Status status = getGlobalReplicationCoordinator()->checkReplEnabledForCommand(&result); - if (!status.isOK()) - return appendCommandStatus(result, status); +class CmdReplSetElect : public ReplSetCommand { +public: + void help(stringstream& h) const { + h << "internal"; + } + CmdReplSetElect() : ReplSetCommand("replSetElect") {} + virtual void addRequiredPrivileges(const std::string& dbname, + const BSONObj& cmdObj, + std::vector<Privilege>* out) { + ActionSet actions; + actions.addAction(ActionType::internal); + out->push_back(Privilege(ResourcePattern::forClusterResource(), actions)); + } - 
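// replSetFresh above and replSetElect here validate "cfgver" identically:
// accept any numeric BSON type, fail fast otherwise, then normalize to
// long long. The reusable shape of that check (assertion id hypothetical;
// the real commands use 28525 and 28526):
//
//   BSONElement cfgverElement = cmdObj["cfgver"];
//   uassert(12345,
//           str::stream() << "expected numeric cfgver, but found "
//                         << typeName(cfgverElement.type()),
//           cfgverElement.isNumber());
//   long long cfgver = cfgverElement.safeNumberLong();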
ReplicationCoordinator::ReplSetElectArgs parsedArgs; - parsedArgs.set = cmdObj["set"].checkAndGetStringData(); - parsedArgs.whoid = cmdObj["whoid"].Int(); - BSONElement cfgverElement = cmdObj["cfgver"]; - uassert(28526, - str::stream() << "Expected cfgver argument to replSetElect command to have " - "numeric type, but found " << typeName(cfgverElement.type()), - cfgverElement.isNumber()); - parsedArgs.cfgver = cfgverElement.safeNumberLong(); - parsedArgs.round = cmdObj["round"].OID(); - - status = getGlobalReplicationCoordinator()->processReplSetElect(parsedArgs, &result); +private: + virtual bool run(OperationContext* txn, + const string&, + BSONObj& cmdObj, + int, + string& errmsg, + BSONObjBuilder& result, + bool fromRepl) { + DEV log() << "replSet received elect msg " << cmdObj.toString(); + else LOG(2) << "replSet received elect msg " << cmdObj.toString(); + + Status status = getGlobalReplicationCoordinator()->checkReplEnabledForCommand(&result); + if (!status.isOK()) return appendCommandStatus(result, status); - } - } cmdReplSetElect; -} // namespace repl -} // namespace mongo + ReplicationCoordinator::ReplSetElectArgs parsedArgs; + parsedArgs.set = cmdObj["set"].checkAndGetStringData(); + parsedArgs.whoid = cmdObj["whoid"].Int(); + BSONElement cfgverElement = cmdObj["cfgver"]; + uassert(28526, + str::stream() << "Expected cfgver argument to replSetElect command to have " + "numeric type, but found " << typeName(cfgverElement.type()), + cfgverElement.isNumber()); + parsedArgs.cfgver = cfgverElement.safeNumberLong(); + parsedArgs.round = cmdObj["round"].OID(); + + status = getGlobalReplicationCoordinator()->processReplSetElect(parsedArgs, &result); + return appendCommandStatus(result, status); + } +} cmdReplSetElect; + +} // namespace repl +} // namespace mongo diff --git a/src/mongo/db/repl/replset_commands.h b/src/mongo/db/repl/replset_commands.h index ff39e063da4..50e764ac691 100644 --- a/src/mongo/db/repl/replset_commands.h +++ b/src/mongo/db/repl/replset_commands.h @@ -36,20 +36,28 @@ namespace mongo { namespace repl { - extern unsigned replSetForceInitialSyncFailure; +extern unsigned replSetForceInitialSyncFailure; - /** - * Base class for repl set commands. Checks basic things such if we're in - * rs mode before the command does its real work. - */ - class ReplSetCommand : public Command { - protected: - ReplSetCommand(const char * s, bool show=false) : Command(s, show) { } - virtual bool slaveOk() const { return true; } - virtual bool adminOnly() const { return true; } - virtual bool isWriteCommandForConfigServer() const { return false; } - virtual void help( std::stringstream &help ) const { help << "internal"; } - }; +/** + * Base class for repl set commands. Checks basic things such if we're in + * rs mode before the command does its real work. 
+ */ +class ReplSetCommand : public Command { +protected: + ReplSetCommand(const char* s, bool show = false) : Command(s, show) {} + virtual bool slaveOk() const { + return true; + } + virtual bool adminOnly() const { + return true; + } + virtual bool isWriteCommandForConfigServer() const { + return false; + } + virtual void help(std::stringstream& help) const { + help << "internal"; + } +}; -} // namespace repl -} // namespace mongo +} // namespace repl +} // namespace mongo diff --git a/src/mongo/db/repl/replset_web_handler.cpp b/src/mongo/db/repl/replset_web_handler.cpp index 12fe6ab8bd0..6ec53363396 100644 --- a/src/mongo/db/repl/replset_web_handler.cpp +++ b/src/mongo/db/repl/replset_web_handler.cpp @@ -41,56 +41,55 @@ namespace mongo { namespace repl { - using namespace html; +using namespace html; - class ReplSetHandler : public DbWebHandler { - public: - ReplSetHandler() : DbWebHandler( "_replSet" , 1 , false ) {} +class ReplSetHandler : public DbWebHandler { +public: + ReplSetHandler() : DbWebHandler("_replSet", 1, false) {} - virtual bool handles( const std::string& url ) const { - return str::startsWith( url , "/_replSet" ); - } - - virtual void handle( OperationContext* txn, - const char *rq, - const std::string& url, - BSONObj params, - std::string& responseMsg, - int& responseCode, - std::vector<std::string>& headers, - const SockAddr &from ) { - responseMsg = _replSet(txn); - responseCode = 200; - } + virtual bool handles(const std::string& url) const { + return str::startsWith(url, "/_replSet"); + } - /* /_replSet show replica set status in html format */ - std::string _replSet(OperationContext* txn) { - std::stringstream s; - s << start("Replica Set Status " + prettyHostName()); - s << p( a("/", "back", "Home") + " | " + - a("/local/system.replset/?html=1", "", "View Replset Config") + " | " + - a("/replSetGetStatus?text=1", "", "replSetGetStatus") + " | " + - a("http://dochub.mongodb.org/core/replicasets", "", "Docs") - ); + virtual void handle(OperationContext* txn, + const char* rq, + const std::string& url, + BSONObj params, + std::string& responseMsg, + int& responseCode, + std::vector<std::string>& headers, + const SockAddr& from) { + responseMsg = _replSet(txn); + responseCode = 200; + } - ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator(); - if (replCoord->getReplicationMode() != ReplicationCoordinator::modeReplSet) { - s << p("Not using --replSet"); - s << _end(); - return s.str(); - } + /* /_replSet show replica set status in html format */ + std::string _replSet(OperationContext* txn) { + std::stringstream s; + s << start("Replica Set Status " + prettyHostName()); + s << p(a("/", "back", "Home") + " | " + + a("/local/system.replset/?html=1", "", "View Replset Config") + " | " + + a("/replSetGetStatus?text=1", "", "replSetGetStatus") + " | " + + a("http://dochub.mongodb.org/core/replicasets", "", "Docs")); - ReplSetHtmlSummary summary; - replCoord->summarizeAsHtml(&summary); - s << summary.toHtmlString(); - - s << p("Recent replset log activity:"); - fillRsLog(&s); + ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator(); + if (replCoord->getReplicationMode() != ReplicationCoordinator::modeReplSet) { + s << p("Not using --replSet"); s << _end(); return s.str(); } - } replSetHandler; + ReplSetHtmlSummary summary; + replCoord->summarizeAsHtml(&summary); + s << summary.toHtmlString(); + + s << p("Recent replset log activity:"); + fillRsLog(&s); + s << _end(); + return s.str(); + } + +} replSetHandler; -} // namespace repl -} // 
namespace mongo +} // namespace repl +} // namespace mongo diff --git a/src/mongo/db/repl/resync.cpp b/src/mongo/db/repl/resync.cpp index 16e385c9b48..7581c572cfb 100644 --- a/src/mongo/db/repl/resync.cpp +++ b/src/mongo/db/repl/resync.cpp @@ -35,100 +35,100 @@ namespace mongo { - using std::string; - using std::stringstream; +using std::string; +using std::stringstream; namespace repl { - // operator requested resynchronization of replication (on a slave or secondary). {resync: 1} - class CmdResync : public Command { - public: - virtual bool slaveOk() const { - return true; - } - virtual bool adminOnly() const { - return true; - } - virtual bool isWriteCommandForConfigServer() const { return true; } - virtual void addRequiredPrivileges(const std::string& dbname, - const BSONObj& cmdObj, - std::vector<Privilege>* out) { - ActionSet actions; - actions.addAction(ActionType::resync); - out->push_back(Privilege(ResourcePattern::forClusterResource(), actions)); - } +// operator requested resynchronization of replication (on a slave or secondary). {resync: 1} +class CmdResync : public Command { +public: + virtual bool slaveOk() const { + return true; + } + virtual bool adminOnly() const { + return true; + } + virtual bool isWriteCommandForConfigServer() const { + return true; + } + virtual void addRequiredPrivileges(const std::string& dbname, + const BSONObj& cmdObj, + std::vector<Privilege>* out) { + ActionSet actions; + actions.addAction(ActionType::resync); + out->push_back(Privilege(ResourcePattern::forClusterResource(), actions)); + } - void help(stringstream& h) const { - h << "resync (from scratch) a stale slave or replica set secondary node.\n"; - } + void help(stringstream& h) const { + h << "resync (from scratch) a stale slave or replica set secondary node.\n"; + } - CmdResync() : Command("resync") { } - virtual bool run(OperationContext* txn, - const string& dbname, - BSONObj& cmdObj, - int, - string& errmsg, - BSONObjBuilder& result, - bool fromRepl) { + CmdResync() : Command("resync") {} + virtual bool run(OperationContext* txn, + const string& dbname, + BSONObj& cmdObj, + int, + string& errmsg, + BSONObjBuilder& result, + bool fromRepl) { + ScopedTransaction transaction(txn, MODE_X); + Lock::GlobalWrite globalWriteLock(txn->lockState()); - ScopedTransaction transaction(txn, MODE_X); - Lock::GlobalWrite globalWriteLock(txn->lockState()); - - ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator(); - if (getGlobalReplicationCoordinator()->getSettings().usingReplSets()) { - const MemberState memberState = replCoord->getMemberState(); - if (memberState.startup()) { - return appendCommandStatus(result, Status(ErrorCodes::NotYetInitialized, - "no replication yet active")); - } - if (memberState.primary() || - !replCoord->setFollowerMode(MemberState::RS_STARTUP2)) { - return appendCommandStatus(result, Status(ErrorCodes::NotSecondary, - "primaries cannot resync")); - } - BackgroundSync::get()->setInitialSyncRequestedFlag(true); - return true; + ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator(); + if (getGlobalReplicationCoordinator()->getSettings().usingReplSets()) { + const MemberState memberState = replCoord->getMemberState(); + if (memberState.startup()) { + return appendCommandStatus( + result, Status(ErrorCodes::NotYetInitialized, "no replication yet active")); } - - // below this comment pertains only to master/slave replication - if ( cmdObj.getBoolField( "force" ) ) { - if ( !waitForSyncToFinish(txn, errmsg ) ) - return false; - replAllDead = 
"resync forced"; + if (memberState.primary() || !replCoord->setFollowerMode(MemberState::RS_STARTUP2)) { + return appendCommandStatus( + result, Status(ErrorCodes::NotSecondary, "primaries cannot resync")); } - // TODO(dannenberg) replAllDead is bad and should be removed when masterslave is removed - if (!replAllDead) { - errmsg = "not dead, no need to resync"; - return false; - } - if ( !waitForSyncToFinish(txn, errmsg ) ) - return false; - - ReplSource::forceResyncDead( txn, "client" ); - result.append( "info", "triggered resync for all sources" ); - + BackgroundSync::get()->setInitialSyncRequestedFlag(true); return true; } - bool waitForSyncToFinish(OperationContext* txn, string &errmsg) const { - // Wait for slave thread to finish syncing, so sources will be be - // reloaded with new saved state on next pass. - Timer t; - while ( 1 ) { - if ( syncing == 0 || t.millis() > 30000 ) - break; - { - Lock::TempRelease t(txn->lockState()); - relinquishSyncingSome = 1; - sleepmillis(1); - } - } - if ( syncing ) { - errmsg = "timeout waiting for sync() to finish"; + // below this comment pertains only to master/slave replication + if (cmdObj.getBoolField("force")) { + if (!waitForSyncToFinish(txn, errmsg)) return false; + replAllDead = "resync forced"; + } + // TODO(dannenberg) replAllDead is bad and should be removed when masterslave is removed + if (!replAllDead) { + errmsg = "not dead, no need to resync"; + return false; + } + if (!waitForSyncToFinish(txn, errmsg)) + return false; + + ReplSource::forceResyncDead(txn, "client"); + result.append("info", "triggered resync for all sources"); + + return true; + } + + bool waitForSyncToFinish(OperationContext* txn, string& errmsg) const { + // Wait for slave thread to finish syncing, so sources will be be + // reloaded with new saved state on next pass. + Timer t; + while (1) { + if (syncing == 0 || t.millis() > 30000) + break; + { + Lock::TempRelease t(txn->lockState()); + relinquishSyncingSome = 1; + sleepmillis(1); } - return true; } - } cmdResync; -} // namespace repl -} // namespace mongo + if (syncing) { + errmsg = "timeout waiting for sync() to finish"; + return false; + } + return true; + } +} cmdResync; +} // namespace repl +} // namespace mongo diff --git a/src/mongo/db/repl/rs_base.h b/src/mongo/db/repl/rs_base.h index 1ba1838ce2b..99f7d188eac 100644 --- a/src/mongo/db/repl/rs_base.h +++ b/src/mongo/db/repl/rs_base.h @@ -36,55 +36,61 @@ namespace mongo { namespace repl { - /** - * most operations on a ReplSet object should be done while locked. that - * logic implemented here. - * - * Order of locking: lock the replica set, then take a rwlock. - */ - class RSBase : boost::noncopyable { - private: - mongo::mutex m; - int _locked; - ThreadLocalValue<bool> _lockedByMe; - protected: - RSBase() : m("RSBase"), _locked(0) { } - ~RSBase() { } +/** + * most operations on a ReplSet object should be done while locked. that + * logic implemented here. + * + * Order of locking: lock the replica set, then take a rwlock. + */ +class RSBase : boost::noncopyable { +private: + mongo::mutex m; + int _locked; + ThreadLocalValue<bool> _lockedByMe; + +protected: + RSBase() : m("RSBase"), _locked(0) {} + ~RSBase() {} + +public: + class lock { + RSBase& rsbase; + std::auto_ptr<scoped_lock> sl; public: - class lock { - RSBase& rsbase; - std::auto_ptr<scoped_lock> sl; - public: - lock(RSBase* b) : rsbase(*b) { - if( rsbase._lockedByMe.get() ) - return; // recursive is ok... 
+ lock(RSBase* b) : rsbase(*b) { + if (rsbase._lockedByMe.get()) + return; // recursive is ok... - sl.reset( new scoped_lock(rsbase.m) ); - DEV verify(rsbase._locked == 0); - rsbase._locked++; - rsbase._lockedByMe.set(true); + sl.reset(new scoped_lock(rsbase.m)); + DEV verify(rsbase._locked == 0); + rsbase._locked++; + rsbase._lockedByMe.set(true); + } + ~lock() { + if (sl.get()) { + verify(rsbase._lockedByMe.get()); + DEV verify(rsbase._locked == 1); + rsbase._lockedByMe.set(false); + rsbase._locked--; } - ~lock() { - if( sl.get() ) { - verify( rsbase._lockedByMe.get() ); - DEV verify(rsbase._locked == 1); - rsbase._lockedByMe.set(false); - rsbase._locked--; - } - } - }; + } + }; - /* for asserts */ - bool locked() const { return _locked != 0; } + /* for asserts */ + bool locked() const { + return _locked != 0; + } - /** if true, is locked, and was locked by this thread. note if false, it could be in the - * lock or not for another just for asserts & such so we can make the contracts clear on - * who locks what when. we don't use these locks that frequently, so the little bit of - * overhead is fine. - */ - bool lockedByMe() { return _lockedByMe.get(); } - }; + /** if true, is locked, and was locked by this thread. note if false, it could be in the + * lock or not for another just for asserts & such so we can make the contracts clear on + * who locks what when. we don't use these locks that frequently, so the little bit of + * overhead is fine. + */ + bool lockedByMe() { + return _lockedByMe.get(); + } +}; -} // namespace repl -} // namespace mongo +} // namespace repl +} // namespace mongo diff --git a/src/mongo/db/repl/rs_exception.h b/src/mongo/db/repl/rs_exception.h index fcbf2c7d502..862226750d3 100644 --- a/src/mongo/db/repl/rs_exception.h +++ b/src/mongo/db/repl/rs_exception.h @@ -33,15 +33,19 @@ namespace mongo { namespace repl { - class VoteException : public std::exception { - public: - const char * what() const throw () { return "VoteException"; } - }; +class VoteException : public std::exception { +public: + const char* what() const throw() { + return "VoteException"; + } +}; - class RetryAfterSleepException : public std::exception { - public: - const char * what() const throw () { return "RetryAfterSleepException"; } - }; +class RetryAfterSleepException : public std::exception { +public: + const char* what() const throw() { + return "RetryAfterSleepException"; + } +}; -} // namespace repl -} // namespace mongo +} // namespace repl +} // namespace mongo diff --git a/src/mongo/db/repl/rs_initialsync.cpp b/src/mongo/db/repl/rs_initialsync.cpp index 2514a88953d..0e2fa1a5a9a 100644 --- a/src/mongo/db/repl/rs_initialsync.cpp +++ b/src/mongo/db/repl/rs_initialsync.cpp @@ -58,526 +58,513 @@ namespace mongo { namespace repl { namespace { - using std::list; - using std::string; - - // Failpoint which fails initial sync and leaves on oplog entry in the buffer. - MONGO_FP_DECLARE(failInitSyncWithBufferedEntriesLeft); - - /** - * Truncates the oplog (removes any documents) and resets internal variables that were - * originally initialized or affected by using values from the oplog at startup time. These - * include the last applied optime, the last fetched optime, and the sync source blacklist. - * Also resets the bgsync thread so that it reconnects its sync source after the oplog has been - * truncated. 
- */ - void truncateAndResetOplog(OperationContext* txn, - ReplicationCoordinator* replCoord, - BackgroundSync* bgsync) { - // Clear minvalid - setMinValid(txn, OpTime()); - - AutoGetDb autoDb(txn, "local", MODE_X); - massert(28585, "no local database found", autoDb.getDb()); - invariant(txn->lockState()->isCollectionLockedForMode(rsoplog, MODE_X)); - // Note: the following order is important. - // The bgsync thread uses an empty optime as a sentinel to know to wait - // for initial sync; thus, we must - // ensure the lastAppliedOptime is empty before restarting the bgsync thread - // via stop(). - // We must clear the sync source blacklist after calling stop() - // because the bgsync thread, while running, may update the blacklist. - replCoord->resetMyLastOptime(); - bgsync->stop(); - bgsync->setLastAppliedHash(0); - bgsync->clearBuffer(); - - replCoord->clearSyncSourceBlacklist(); - - // Truncate the oplog in case there was a prior initial sync that failed. - Collection* collection = autoDb.getDb()->getCollection(rsoplog); - fassert(28565, collection); - MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN { - WriteUnitOfWork wunit(txn); - Status status = collection->truncate(txn); - fassert(28564, status); - wunit.commit(); - } MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "truncate", collection->ns().ns()); +using std::list; +using std::string; + +// Failpoint which fails initial sync and leaves on oplog entry in the buffer. +MONGO_FP_DECLARE(failInitSyncWithBufferedEntriesLeft); + +/** + * Truncates the oplog (removes any documents) and resets internal variables that were + * originally initialized or affected by using values from the oplog at startup time. These + * include the last applied optime, the last fetched optime, and the sync source blacklist. + * Also resets the bgsync thread so that it reconnects its sync source after the oplog has been + * truncated. + */ +void truncateAndResetOplog(OperationContext* txn, + ReplicationCoordinator* replCoord, + BackgroundSync* bgsync) { + // Clear minvalid + setMinValid(txn, OpTime()); + + AutoGetDb autoDb(txn, "local", MODE_X); + massert(28585, "no local database found", autoDb.getDb()); + invariant(txn->lockState()->isCollectionLockedForMode(rsoplog, MODE_X)); + // Note: the following order is important. + // The bgsync thread uses an empty optime as a sentinel to know to wait + // for initial sync; thus, we must + // ensure the lastAppliedOptime is empty before restarting the bgsync thread + // via stop(). + // We must clear the sync source blacklist after calling stop() + // because the bgsync thread, while running, may update the blacklist. + replCoord->resetMyLastOptime(); + bgsync->stop(); + bgsync->setLastAppliedHash(0); + bgsync->clearBuffer(); + + replCoord->clearSyncSourceBlacklist(); + + // Truncate the oplog in case there was a prior initial sync that failed. + Collection* collection = autoDb.getDb()->getCollection(rsoplog); + fassert(28565, collection); + MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN { + WriteUnitOfWork wunit(txn); + Status status = collection->truncate(txn); + fassert(28564, status); + wunit.commit(); } + MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "truncate", collection->ns().ns()); +} - /** - * Confirms that the "admin" database contains a supported version of the auth - * data schema. Terminates the process if the "admin" contains clearly incompatible - * auth data. - */ - void checkAdminDatabasePostClone(OperationContext* txn, Database* adminDb) { - // Assumes txn holds MODE_X or MODE_S lock on "admin" database. 
- if (!adminDb) { +/** + * Confirms that the "admin" database contains a supported version of the auth + * data schema. Terminates the process if the "admin" contains clearly incompatible + * auth data. + */ +void checkAdminDatabasePostClone(OperationContext* txn, Database* adminDb) { + // Assumes txn holds MODE_X or MODE_S lock on "admin" database. + if (!adminDb) { + return; + } + Collection* const usersCollection = + adminDb->getCollection(AuthorizationManager::usersCollectionNamespace); + const bool hasUsers = + usersCollection && !Helpers::findOne(txn, usersCollection, BSONObj(), false).isNull(); + Collection* const adminVersionCollection = + adminDb->getCollection(AuthorizationManager::versionCollectionNamespace); + BSONObj authSchemaVersionDocument; + if (!adminVersionCollection || + !Helpers::findOne(txn, + adminVersionCollection, + AuthorizationManager::versionDocumentQuery, + authSchemaVersionDocument)) { + if (!hasUsers) { + // It's OK to have no auth version document if there are no user documents. return; } - Collection* const usersCollection = - adminDb->getCollection(AuthorizationManager::usersCollectionNamespace); - const bool hasUsers = usersCollection && - !Helpers::findOne(txn, usersCollection, BSONObj(), false).isNull(); - Collection* const adminVersionCollection = - adminDb->getCollection(AuthorizationManager::versionCollectionNamespace); - BSONObj authSchemaVersionDocument; - if (!adminVersionCollection || !Helpers::findOne(txn, - adminVersionCollection, - AuthorizationManager::versionDocumentQuery, - authSchemaVersionDocument)) { - if (!hasUsers) { - // It's OK to have no auth version document if there are no user documents. - return; - } - severe() << "During initial sync, found documents in " << - AuthorizationManager::usersCollectionNamespace << - " but could not find an auth schema version document in " << - AuthorizationManager::versionCollectionNamespace; - severe() << "This indicates that the primary of this replica set was not successfully " - "upgraded to schema version " << AuthorizationManager::schemaVersion26Final << - ", which is the minimum supported schema version in this version of MongoDB"; - fassertFailedNoTrace(28620); - } - long long foundSchemaVersion; - Status status = bsonExtractIntegerField(authSchemaVersionDocument, - AuthorizationManager::schemaVersionFieldName, - &foundSchemaVersion); - if (!status.isOK()) { - severe() << "During initial sync, found malformed auth schema version document: " << - status << "; document: " << authSchemaVersionDocument; - fassertFailedNoTrace(28618); - } - if ((foundSchemaVersion != AuthorizationManager::schemaVersion26Final) && - (foundSchemaVersion != AuthorizationManager::schemaVersion28SCRAM)) { - severe() << "During initial sync, found auth schema version " << foundSchemaVersion << - ", but this version of MongoDB only supports schema versions " << - AuthorizationManager::schemaVersion26Final << " and " << - AuthorizationManager::schemaVersion28SCRAM; - fassertFailedNoTrace(28619); - } + severe() << "During initial sync, found documents in " + << AuthorizationManager::usersCollectionNamespace + << " but could not find an auth schema version document in " + << AuthorizationManager::versionCollectionNamespace; + severe() << "This indicates that the primary of this replica set was not successfully " + "upgraded to schema version " << AuthorizationManager::schemaVersion26Final + << ", which is the minimum supported schema version in this version of MongoDB"; + fassertFailedNoTrace(28620); } - - bool 
_initialSyncClone(OperationContext* txn, - Cloner& cloner, - const std::string& host, - const list<string>& dbs, - bool dataPass) { - - for( list<string>::const_iterator i = dbs.begin(); i != dbs.end(); i++ ) { - const string db = *i; - if ( db == "local" ) - continue; - - if ( dataPass ) - log() << "initial sync cloning db: " << db; - else - log() << "initial sync cloning indexes for : " << db; - - string err; - int errCode; - CloneOptions options; - options.fromDB = db; - options.logForRepl = false; - options.slaveOk = true; - options.useReplAuth = true; - options.snapshot = false; - options.mayYield = true; - options.mayBeInterrupted = false; - options.syncData = dataPass; - options.syncIndexes = ! dataPass; - - // Make database stable - ScopedTransaction transaction(txn, MODE_IX); - Lock::DBLock dbWrite(txn->lockState(), db, MODE_X); - - if (!cloner.go(txn, db, host, options, NULL, err, &errCode)) { - log() << "initial sync: error while " - << (dataPass ? "cloning " : "indexing ") << db - << ". " << (err.empty() ? "" : err + ". "); - return false; - } - - if (db == "admin") { - checkAdminDatabasePostClone(txn, dbHolder().get(txn, db)); - } - } - - return true; + long long foundSchemaVersion; + Status status = bsonExtractIntegerField(authSchemaVersionDocument, + AuthorizationManager::schemaVersionFieldName, + &foundSchemaVersion); + if (!status.isOK()) { + severe() << "During initial sync, found malformed auth schema version document: " << status + << "; document: " << authSchemaVersionDocument; + fassertFailedNoTrace(28618); } - - /** - * Replays the sync target's oplog from lastOp to the latest op on the sync target. - * - * @param syncer either initial sync (can reclone missing docs) or "normal" sync (no recloning) - * @param r the oplog reader - * @return if applying the oplog succeeded - */ - bool _initialSyncApplyOplog( OperationContext* ctx, - repl::SyncTail& syncer, - OplogReader* r) { - const OpTime startOpTime = getGlobalReplicationCoordinator()->getMyLastOptime(); - BSONObj lastOp; - - // If the fail point is set, exit failing. 
- if (MONGO_FAIL_POINT(failInitSyncWithBufferedEntriesLeft)) { - log() << "adding fake oplog entry to buffer."; - BackgroundSync::get()->pushTestOpToBuffer( - BSON("ts" << startOpTime << "v" << 1 << "op" << "n")); + if ((foundSchemaVersion != AuthorizationManager::schemaVersion26Final) && + (foundSchemaVersion != AuthorizationManager::schemaVersion28SCRAM)) { + severe() << "During initial sync, found auth schema version " << foundSchemaVersion + << ", but this version of MongoDB only supports schema versions " + << AuthorizationManager::schemaVersion26Final << " and " + << AuthorizationManager::schemaVersion28SCRAM; + fassertFailedNoTrace(28619); + } +} + +bool _initialSyncClone(OperationContext* txn, + Cloner& cloner, + const std::string& host, + const list<string>& dbs, + bool dataPass) { + for (list<string>::const_iterator i = dbs.begin(); i != dbs.end(); i++) { + const string db = *i; + if (db == "local") + continue; + + if (dataPass) + log() << "initial sync cloning db: " << db; + else + log() << "initial sync cloning indexes for : " << db; + + string err; + int errCode; + CloneOptions options; + options.fromDB = db; + options.logForRepl = false; + options.slaveOk = true; + options.useReplAuth = true; + options.snapshot = false; + options.mayYield = true; + options.mayBeInterrupted = false; + options.syncData = dataPass; + options.syncIndexes = !dataPass; + + // Make database stable + ScopedTransaction transaction(txn, MODE_IX); + Lock::DBLock dbWrite(txn->lockState(), db, MODE_X); + + if (!cloner.go(txn, db, host, options, NULL, err, &errCode)) { + log() << "initial sync: error while " << (dataPass ? "cloning " : "indexing ") << db + << ". " << (err.empty() ? "" : err + ". "); return false; } - try { - // It may have been a long time since we last used this connection to - // query the oplog, depending on the size of the databases we needed to clone. - // A common problem is that TCP keepalives are set too infrequent, and thus - // our connection here is terminated by a firewall due to inactivity. - // Solution is to increase the TCP keepalive frequency. - lastOp = r->getLastOp(rsoplog); - } catch ( SocketException & ) { - HostAndPort host = r->getHost(); - log() << "connection lost to " << host.toString() << - "; is your tcp keepalive interval set appropriately?"; - if ( !r->connect(host) ) { - error() << "initial sync couldn't connect to " << host.toString(); - throw; - } - // retry - lastOp = r->getLastOp(rsoplog); + if (db == "admin") { + checkAdminDatabasePostClone(txn, dbHolder().get(txn, db)); } + } - if (lastOp.isEmpty()) { - error() << "initial sync lastOp is empty"; - sleepsecs(1); - return false; + return true; +} + +/** + * Replays the sync target's oplog from lastOp to the latest op on the sync target. + * + * @param syncer either initial sync (can reclone missing docs) or "normal" sync (no recloning) + * @param r the oplog reader + * @return if applying the oplog succeeded + */ +bool _initialSyncApplyOplog(OperationContext* ctx, repl::SyncTail& syncer, OplogReader* r) { + const OpTime startOpTime = getGlobalReplicationCoordinator()->getMyLastOptime(); + BSONObj lastOp; + + // If the fail point is set, exit failing. 
+ if (MONGO_FAIL_POINT(failInitSyncWithBufferedEntriesLeft)) { + log() << "adding fake oplog entry to buffer."; + BackgroundSync::get()->pushTestOpToBuffer(BSON("ts" << startOpTime << "v" << 1 << "op" + << "n")); + return false; + } + + try { + // It may have been a long time since we last used this connection to + // query the oplog, depending on the size of the databases we needed to clone. + // A common problem is that TCP keepalives are set too infrequent, and thus + // our connection here is terminated by a firewall due to inactivity. + // Solution is to increase the TCP keepalive frequency. + lastOp = r->getLastOp(rsoplog); + } catch (SocketException&) { + HostAndPort host = r->getHost(); + log() << "connection lost to " << host.toString() + << "; is your tcp keepalive interval set appropriately?"; + if (!r->connect(host)) { + error() << "initial sync couldn't connect to " << host.toString(); + throw; } + // retry + lastOp = r->getLastOp(rsoplog); + } - OpTime stopOpTime = lastOp["ts"]._opTime(); + if (lastOp.isEmpty()) { + error() << "initial sync lastOp is empty"; + sleepsecs(1); + return false; + } - // If we already have what we need then return. - if (stopOpTime == startOpTime) - return true; + OpTime stopOpTime = lastOp["ts"]._opTime(); - verify( !stopOpTime.isNull() ); - verify( stopOpTime > startOpTime ); + // If we already have what we need then return. + if (stopOpTime == startOpTime) + return true; - // apply till stopOpTime - try { - LOG(2) << "Applying oplog entries from " << startOpTime.toStringPretty() - << " until " << stopOpTime.toStringPretty(); - syncer.oplogApplication(ctx, stopOpTime); + verify(!stopOpTime.isNull()); + verify(stopOpTime > startOpTime); - if (inShutdown()) { - return false; - } - } - catch (const DBException&) { - getGlobalReplicationCoordinator()->resetMyLastOptime(); - BackgroundSync::get()->setLastAppliedHash(0); - warning() << "initial sync failed during oplog application phase, and will retry"; + // apply till stopOpTime + try { + LOG(2) << "Applying oplog entries from " << startOpTime.toStringPretty() << " until " + << stopOpTime.toStringPretty(); + syncer.oplogApplication(ctx, stopOpTime); - sleepsecs(5); + if (inShutdown()) { return false; } - - return true; + } catch (const DBException&) { + getGlobalReplicationCoordinator()->resetMyLastOptime(); + BackgroundSync::get()->setLastAppliedHash(0); + warning() << "initial sync failed during oplog application phase, and will retry"; + + sleepsecs(5); + return false; } - void _tryToApplyOpWithRetry(OperationContext* txn, InitialSync* init, const BSONObj& op) { - try { - if (!init->syncApply(txn, op)) { - bool retry; - { - ScopedTransaction transaction(txn, MODE_X); - Lock::GlobalWrite lk(txn->lockState()); - retry = init->shouldRetry(txn, op); - } + return true; +} + +void _tryToApplyOpWithRetry(OperationContext* txn, InitialSync* init, const BSONObj& op) { + try { + if (!init->syncApply(txn, op)) { + bool retry; + { + ScopedTransaction transaction(txn, MODE_X); + Lock::GlobalWrite lk(txn->lockState()); + retry = init->shouldRetry(txn, op); + } - if (retry) { - // retry - if (!init->syncApply(txn, op)) { - uasserted(28542, - str::stream() << "During initial sync, failed to apply op: " - << op); - } + if (retry) { + // retry + if (!init->syncApply(txn, op)) { + uasserted(28542, + str::stream() << "During initial sync, failed to apply op: " << op); } - // If shouldRetry() returns false, fall through. 
- // This can happen if the document that was moved and missed by Cloner - // subsequently got deleted and no longer exists on the Sync Target at all } + // If shouldRetry() returns false, fall through. + // This can happen if the document that was moved and missed by Cloner + // subsequently got deleted and no longer exists on the Sync Target at all } - catch (const DBException& e) { - error() << "exception: " << causedBy(e) << " on: " << op.toString(); - uasserted(28541, - str::stream() << "During initial sync, failed to apply op: " - << op); - } + } catch (const DBException& e) { + error() << "exception: " << causedBy(e) << " on: " << op.toString(); + uasserted(28541, str::stream() << "During initial sync, failed to apply op: " << op); } +} - /** - * Do the initial sync for this member. There are several steps to this process: - * - * 0. Add _initialSyncFlag to minValid collection to tell us to restart initial sync if we - * crash in the middle of this procedure - * 1. Record start time. - * 2. Clone. - * 3. Set minValid1 to sync target's latest op time. - * 4. Apply ops from start to minValid1, fetching missing docs as needed. - * 5. Set minValid2 to sync target's latest op time. - * 6. Apply ops from minValid1 to minValid2. - * 7. Build indexes. - * 8. Set minValid3 to sync target's latest op time. - * 9. Apply ops from minValid2 to minValid3. - 10. Cleanup minValid collection: remove _initialSyncFlag field, set ts to minValid3 OpTime - * - * At that point, initial sync is finished. Note that the oplog from the sync target is applied - * three times: step 4, 6, and 8. 4 may involve refetching, 6 should not. By the end of 6, - * this member should have consistent data. 8 is "cosmetic," it is only to get this member - * closer to the latest op time before it can transition out of startup state - * - * Returns a Status with ErrorCode::ShutdownInProgress if the node enters shutdown, - * ErrorCode::InitialSyncOplogSourceMissing if the node fails to find an sync source, Status::OK - * if everything worked, and ErrorCode::InitialSyncFailure for all other error cases. - */ - Status _initialSync() { - - log() << "initial sync pending"; - - BackgroundSync* bgsync(BackgroundSync::get()); - OperationContextImpl txn; - ReplicationCoordinator* replCoord(getGlobalReplicationCoordinator()); - - // reset state for initial sync - truncateAndResetOplog(&txn, replCoord, bgsync); - - OplogReader r; - OpTime nullOpTime(0, 0); - - while (r.getHost().empty()) { - // We must prime the sync source selector so that it considers all candidates regardless - // of oplog position, by passing in "nullOpTime" as the last op fetched time. - r.connectToSyncSource(&txn, nullOpTime, replCoord); - if (r.getHost().empty()) { - std::string msg = - "no valid sync sources found in current replset to do an initial sync"; - log() << msg; - return Status(ErrorCodes::InitialSyncOplogSourceMissing, msg); - } +/** + * Do the initial sync for this member. There are several steps to this process: + * + * 0. Add _initialSyncFlag to minValid collection to tell us to restart initial sync if we + * crash in the middle of this procedure + * 1. Record start time. + * 2. Clone. + * 3. Set minValid1 to sync target's latest op time. + * 4. Apply ops from start to minValid1, fetching missing docs as needed. + * 5. Set minValid2 to sync target's latest op time. + * 6. Apply ops from minValid1 to minValid2. + * 7. Build indexes. + * 8. Set minValid3 to sync target's latest op time. + * 9. Apply ops from minValid2 to minValid3. + 10. 
Cleanup minValid collection: remove _initialSyncFlag field, set ts to minValid3 OpTime + * + * At that point, initial sync is finished. Note that the oplog from the sync target is applied + * three times: step 4, 6, and 8. 4 may involve refetching, 6 should not. By the end of 6, + * this member should have consistent data. 8 is "cosmetic," it is only to get this member + * closer to the latest op time before it can transition out of startup state + * + * Returns a Status with ErrorCode::ShutdownInProgress if the node enters shutdown, + * ErrorCode::InitialSyncOplogSourceMissing if the node fails to find an sync source, Status::OK + * if everything worked, and ErrorCode::InitialSyncFailure for all other error cases. + */ +Status _initialSync() { + log() << "initial sync pending"; + + BackgroundSync* bgsync(BackgroundSync::get()); + OperationContextImpl txn; + ReplicationCoordinator* replCoord(getGlobalReplicationCoordinator()); + + // reset state for initial sync + truncateAndResetOplog(&txn, replCoord, bgsync); + + OplogReader r; + OpTime nullOpTime(0, 0); + + while (r.getHost().empty()) { + // We must prime the sync source selector so that it considers all candidates regardless + // of oplog position, by passing in "nullOpTime" as the last op fetched time. + r.connectToSyncSource(&txn, nullOpTime, replCoord); + if (r.getHost().empty()) { + std::string msg = + "no valid sync sources found in current replset to do an initial sync"; + log() << msg; + return Status(ErrorCodes::InitialSyncOplogSourceMissing, msg); + } - if (inShutdown()) { - return Status(ErrorCodes::ShutdownInProgress, "shutting down"); - } + if (inShutdown()) { + return Status(ErrorCodes::ShutdownInProgress, "shutting down"); } + } - InitialSync init(bgsync); - init.setHostname(r.getHost().toString()); + InitialSync init(bgsync); + init.setHostname(r.getHost().toString()); - BSONObj lastOp = r.getLastOp(rsoplog); - if ( lastOp.isEmpty() ) { - std::string msg = "initial sync couldn't read remote oplog"; - log() << msg; - sleepsecs(15); - return Status(ErrorCodes::InitialSyncFailure, msg); - } + BSONObj lastOp = r.getLastOp(rsoplog); + if (lastOp.isEmpty()) { + std::string msg = "initial sync couldn't read remote oplog"; + log() << msg; + sleepsecs(15); + return Status(ErrorCodes::InitialSyncFailure, msg); + } - if (getGlobalReplicationCoordinator()->getSettings().fastsync) { - log() << "fastsync: skipping database clone"; + if (getGlobalReplicationCoordinator()->getSettings().fastsync) { + log() << "fastsync: skipping database clone"; - // prime oplog - try { - _tryToApplyOpWithRetry(&txn, &init, lastOp); - std::deque<BSONObj> ops; - ops.push_back(lastOp); - writeOpsToOplog(&txn, ops); - return Status::OK(); - } catch (DBException& e) { - // Return if in shutdown - if (inShutdown()) { - return Status(ErrorCodes::ShutdownInProgress, "shutdown in progress"); - } - throw; + // prime oplog + try { + _tryToApplyOpWithRetry(&txn, &init, lastOp); + std::deque<BSONObj> ops; + ops.push_back(lastOp); + writeOpsToOplog(&txn, ops); + return Status::OK(); + } catch (DBException& e) { + // Return if in shutdown + if (inShutdown()) { + return Status(ErrorCodes::ShutdownInProgress, "shutdown in progress"); } + throw; } + } - // Add field to minvalid document to tell us to restart initial sync if we crash - setInitialSyncFlag(&txn); + // Add field to minvalid document to tell us to restart initial sync if we crash + setInitialSyncFlag(&txn); - log() << "initial sync drop all databases"; - dropAllDatabasesExceptLocal(&txn); + log() << "initial 
sync drop all databases"; + dropAllDatabasesExceptLocal(&txn); - log() << "initial sync clone all databases"; + log() << "initial sync clone all databases"; - list<string> dbs = r.conn()->getDatabaseNames(); - { - // Clone admin database first, to catch schema errors. - list<string>::iterator admin = std::find(dbs.begin(), dbs.end(), "admin"); - if (admin != dbs.end()) { - dbs.splice(dbs.begin(), dbs, admin); - } + list<string> dbs = r.conn()->getDatabaseNames(); + { + // Clone admin database first, to catch schema errors. + list<string>::iterator admin = std::find(dbs.begin(), dbs.end(), "admin"); + if (admin != dbs.end()) { + dbs.splice(dbs.begin(), dbs, admin); } + } - Cloner cloner; - if (!_initialSyncClone(&txn, cloner, r.conn()->getServerAddress(), dbs, true)) { - return Status(ErrorCodes::InitialSyncFailure, "initial sync failed data cloning"); - } + Cloner cloner; + if (!_initialSyncClone(&txn, cloner, r.conn()->getServerAddress(), dbs, true)) { + return Status(ErrorCodes::InitialSyncFailure, "initial sync failed data cloning"); + } - log() << "initial sync data copy, starting syncup"; + log() << "initial sync data copy, starting syncup"; - // prime oplog - _tryToApplyOpWithRetry(&txn, &init, lastOp); - std::deque<BSONObj> ops; - ops.push_back(lastOp); - writeOpsToOplog(&txn, ops); + // prime oplog + _tryToApplyOpWithRetry(&txn, &init, lastOp); + std::deque<BSONObj> ops; + ops.push_back(lastOp); + writeOpsToOplog(&txn, ops); - std::string msg = "oplog sync 1 of 3"; - log() << msg; - if (!_initialSyncApplyOplog(&txn, init, &r)) { - return Status(ErrorCodes::InitialSyncFailure, - str::stream() << "initial sync failed: " << msg); - } + std::string msg = "oplog sync 1 of 3"; + log() << msg; + if (!_initialSyncApplyOplog(&txn, init, &r)) { + return Status(ErrorCodes::InitialSyncFailure, + str::stream() << "initial sync failed: " << msg); + } - // Now we sync to the latest op on the sync target _again_, as we may have recloned ops - // that were "from the future" compared with minValid. During this second application, - // nothing should need to be recloned. - msg = "oplog sync 2 of 3"; - log() << msg; - if (!_initialSyncApplyOplog(&txn, init, &r)) { - return Status(ErrorCodes::InitialSyncFailure, - str::stream() << "initial sync failed: " << msg); - } - // data should now be consistent + // Now we sync to the latest op on the sync target _again_, as we may have recloned ops + // that were "from the future" compared with minValid. During this second application, + // nothing should need to be recloned. 
+ msg = "oplog sync 2 of 3"; + log() << msg; + if (!_initialSyncApplyOplog(&txn, init, &r)) { + return Status(ErrorCodes::InitialSyncFailure, + str::stream() << "initial sync failed: " << msg); + } + // data should now be consistent - msg = "initial sync building indexes"; - log() << msg; - if (!_initialSyncClone(&txn, cloner, r.conn()->getServerAddress(), dbs, false)) { - return Status(ErrorCodes::InitialSyncFailure, - str::stream() << "initial sync failed: " << msg); - } + msg = "initial sync building indexes"; + log() << msg; + if (!_initialSyncClone(&txn, cloner, r.conn()->getServerAddress(), dbs, false)) { + return Status(ErrorCodes::InitialSyncFailure, + str::stream() << "initial sync failed: " << msg); + } - msg = "oplog sync 3 of 3"; - log() << msg; + msg = "oplog sync 3 of 3"; + log() << msg; - SyncTail tail(bgsync, multiSyncApply); - if (!_initialSyncApplyOplog(&txn, tail, &r)) { - return Status(ErrorCodes::InitialSyncFailure, - str::stream() << "initial sync failed: " << msg); - } - - // --------- + SyncTail tail(bgsync, multiSyncApply); + if (!_initialSyncApplyOplog(&txn, tail, &r)) { + return Status(ErrorCodes::InitialSyncFailure, + str::stream() << "initial sync failed: " << msg); + } - Status status = getGlobalAuthorizationManager()->initialize(&txn); - if (!status.isOK()) { - warning() << "Failed to reinitialize auth data after initial sync. " << status; - return status; - } + // --------- - log() << "initial sync finishing up"; + Status status = getGlobalAuthorizationManager()->initialize(&txn); + if (!status.isOK()) { + warning() << "Failed to reinitialize auth data after initial sync. " << status; + return status; + } - { - ScopedTransaction scopedXact(&txn, MODE_IX); - AutoGetDb autodb(&txn, "local", MODE_X); - OpTime lastOpTimeWritten(getGlobalReplicationCoordinator()->getMyLastOptime()); - log() << "replSet set minValid=" << lastOpTimeWritten; + log() << "initial sync finishing up"; - // Initial sync is now complete. Flag this by setting minValid to the last thing - // we synced. - setMinValid(&txn, lastOpTimeWritten); + { + ScopedTransaction scopedXact(&txn, MODE_IX); + AutoGetDb autodb(&txn, "local", MODE_X); + OpTime lastOpTimeWritten(getGlobalReplicationCoordinator()->getMyLastOptime()); + log() << "replSet set minValid=" << lastOpTimeWritten; - // Clear the initial sync flag. - clearInitialSyncFlag(&txn); - BackgroundSync::get()->setInitialSyncRequestedFlag(false); - } + // Initial sync is now complete. Flag this by setting minValid to the last thing + // we synced. + setMinValid(&txn, lastOpTimeWritten); - // If we just cloned & there were no ops applied, we still want the primary to know where - // we're up to - bgsync->notify(&txn); - - log() << "initial sync done"; - std::vector<BSONObj> handshakeObjs; - replCoord->prepareReplSetUpdatePositionCommandHandshakes(&handshakeObjs); - for (std::vector<BSONObj>::iterator it = handshakeObjs.begin(); - it != handshakeObjs.end(); - ++it) { - BSONObj res; - try { - if (!r.conn()->runCommand("admin", *it, res)) { - warning() << "InitialSync error reporting sync progress during handshake"; - return Status::OK(); - } - } - catch (const DBException& e) { - warning() << "InitialSync error reporting sync progress during handshake: " - << e.what(); - return Status::OK(); - } - } + // Clear the initial sync flag. 
+ clearInitialSyncFlag(&txn); + BackgroundSync::get()->setInitialSyncRequestedFlag(false); + } + + // If we just cloned & there were no ops applied, we still want the primary to know where + // we're up to + bgsync->notify(&txn); - BSONObjBuilder updateCmd; + log() << "initial sync done"; + std::vector<BSONObj> handshakeObjs; + replCoord->prepareReplSetUpdatePositionCommandHandshakes(&handshakeObjs); + for (std::vector<BSONObj>::iterator it = handshakeObjs.begin(); it != handshakeObjs.end(); + ++it) { BSONObj res; - if (!replCoord->prepareReplSetUpdatePositionCommand(&updateCmd)) { - warning() << "InitialSync couldn't generate updatePosition command"; - return Status::OK(); - } try { - if (!r.conn()->runCommand("admin", updateCmd.obj(), res)) { - warning() << "InitialSync error reporting sync progress during updatePosition"; + if (!r.conn()->runCommand("admin", *it, res)) { + warning() << "InitialSync error reporting sync progress during handshake"; return Status::OK(); } - } - catch (const DBException& e) { - warning() << "InitialSync error reporting sync progress during updatePosition: " - << e.what(); + } catch (const DBException& e) { + warning() << "InitialSync error reporting sync progress during handshake: " << e.what(); return Status::OK(); } + } + BSONObjBuilder updateCmd; + BSONObj res; + if (!replCoord->prepareReplSetUpdatePositionCommand(&updateCmd)) { + warning() << "InitialSync couldn't generate updatePosition command"; + return Status::OK(); + } + try { + if (!r.conn()->runCommand("admin", updateCmd.obj(), res)) { + warning() << "InitialSync error reporting sync progress during updatePosition"; + return Status::OK(); + } + } catch (const DBException& e) { + warning() << "InitialSync error reporting sync progress during updatePosition: " + << e.what(); return Status::OK(); } -} // namespace - void syncDoInitialSync() { - static const int maxFailedAttempts = 10; + return Status::OK(); +} +} // namespace - { - OperationContextImpl txn; - createOplog(&txn); - } +void syncDoInitialSync() { + static const int maxFailedAttempts = 10; - int failedAttempts = 0; - while ( failedAttempts < maxFailedAttempts ) { - try { - // leave loop when successful - Status status = _initialSync(); - if (status.isOK()) { - break; - } - if (status == ErrorCodes::InitialSyncOplogSourceMissing) { - sleepsecs(1); - return; - } + { + OperationContextImpl txn; + createOplog(&txn); + } + + int failedAttempts = 0; + while (failedAttempts < maxFailedAttempts) { + try { + // leave loop when successful + Status status = _initialSync(); + if (status.isOK()) { + break; } - catch(const DBException& e) { - error() << e ; - // Return if in shutdown - if (inShutdown()) { - return; - } + if (status == ErrorCodes::InitialSyncOplogSourceMissing) { + sleepsecs(1); + return; } - + } catch (const DBException& e) { + error() << e; + // Return if in shutdown if (inShutdown()) { return; } - - error() << "initial sync attempt failed, " - << (maxFailedAttempts - ++failedAttempts) << " attempts remaining"; - sleepsecs(5); } - // No need to print a stack - if (failedAttempts >= maxFailedAttempts) { - severe() << "The maximum number of retries have been exhausted for initial sync."; - fassertFailedNoTrace(16233); + if (inShutdown()) { + return; } + + error() << "initial sync attempt failed, " << (maxFailedAttempts - ++failedAttempts) + << " attempts remaining"; + sleepsecs(5); + } + + // No need to print a stack + if (failedAttempts >= maxFailedAttempts) { + severe() << "The maximum number of retries have been exhausted for 
initial sync."; + fassertFailedNoTrace(16233); } +} -} // namespace repl -} // namespace mongo +} // namespace repl +} // namespace mongo diff --git a/src/mongo/db/repl/rs_initialsync.h b/src/mongo/db/repl/rs_initialsync.h index 659bb5ad577..7add22b9a37 100644 --- a/src/mongo/db/repl/rs_initialsync.h +++ b/src/mongo/db/repl/rs_initialsync.h @@ -30,10 +30,10 @@ namespace mongo { namespace repl { - /** - * Begins an initial sync of a node. This drops all data, chooses a sync source, - * and runs the cloner from that sync source. The node's state is not changed. - */ - void syncDoInitialSync(); +/** + * Begins an initial sync of a node. This drops all data, chooses a sync source, + * and runs the cloner from that sync source. The node's state is not changed. + */ +void syncDoInitialSync(); } } diff --git a/src/mongo/db/repl/rs_rollback.cpp b/src/mongo/db/repl/rs_rollback.cpp index 93740b37827..e1260054e40 100644 --- a/src/mongo/db/repl/rs_rollback.cpp +++ b/src/mongo/db/repl/rs_rollback.cpp @@ -98,796 +98,753 @@ namespace mongo { - using boost::shared_ptr; - using std::auto_ptr; - using std::endl; - using std::list; - using std::map; - using std::set; - using std::string; - using std::pair; +using boost::shared_ptr; +using std::auto_ptr; +using std::endl; +using std::list; +using std::map; +using std::set; +using std::string; +using std::pair; namespace repl { namespace { - class RSFatalException : public std::exception { - public: - RSFatalException(std::string m = "replica set fatal exception") - : msg(m) {} - virtual ~RSFatalException() throw() {}; - virtual const char* what() const throw() { - return msg.c_str(); - } - private: - std::string msg; - }; - - struct DocID { - // ns and _id both point into ownedObj's buffer - BSONObj ownedObj; - const char* ns; - BSONElement _id; - bool operator<(const DocID& other) const { - int comp = strcmp(ns, other.ns); - if (comp < 0) - return true; - if (comp > 0) - return false; - return _id < other._id; - } - }; +class RSFatalException : public std::exception { +public: + RSFatalException(std::string m = "replica set fatal exception") : msg(m) {} + virtual ~RSFatalException() throw(){}; + virtual const char* what() const throw() { + return msg.c_str(); + } - struct FixUpInfo { - // note this is a set -- if there are many $inc's on a single document we need to rollback, - // we only need to refetch it once. - set<DocID> toRefetch; +private: + std::string msg; +}; + +struct DocID { + // ns and _id both point into ownedObj's buffer + BSONObj ownedObj; + const char* ns; + BSONElement _id; + bool operator<(const DocID& other) const { + int comp = strcmp(ns, other.ns); + if (comp < 0) + return true; + if (comp > 0) + return false; + return _id < other._id; + } +}; - // collections to drop - set<string> toDrop; +struct FixUpInfo { + // note this is a set -- if there are many $inc's on a single document we need to rollback, + // we only need to refetch it once. + set<DocID> toRefetch; - set<string> collectionsToResync; + // collections to drop + set<string> toDrop; - OpTime commonPoint; - RecordId commonPointOurDiskloc; + set<string> collectionsToResync; - int rbid; // remote server's current rollback sequence # - }; + OpTime commonPoint; + RecordId commonPointOurDiskloc; + int rbid; // remote server's current rollback sequence # +}; - /** helper to get rollback id from another server. 
*/ - int getRBID(DBClientConnection *c) { - bo info; - c->simpleCommand("admin", &info, "replSetGetRBID"); - return info["rbid"].numberInt(); - } +/** helper to get rollback id from another server. */ +int getRBID(DBClientConnection* c) { + bo info; + c->simpleCommand("admin", &info, "replSetGetRBID"); + return info["rbid"].numberInt(); +} - void refetch(FixUpInfo& fixUpInfo, const BSONObj& ourObj) { - const char* op = ourObj.getStringField("op"); - if (*op == 'n') - return; - if (ourObj.objsize() > 512 * 1024 * 1024) - throw RSFatalException("rollback too large"); +void refetch(FixUpInfo& fixUpInfo, const BSONObj& ourObj) { + const char* op = ourObj.getStringField("op"); + if (*op == 'n') + return; + + if (ourObj.objsize() > 512 * 1024 * 1024) + throw RSFatalException("rollback too large"); - DocID doc; - doc.ownedObj = ourObj.getOwned(); - doc.ns = doc.ownedObj.getStringField("ns"); - if (*doc.ns == '\0') { - warning() << "replSet WARNING ignoring op on rollback no ns TODO : " + DocID doc; + doc.ownedObj = ourObj.getOwned(); + doc.ns = doc.ownedObj.getStringField("ns"); + if (*doc.ns == '\0') { + warning() << "replSet WARNING ignoring op on rollback no ns TODO : " << doc.ownedObj.toString(); - return; - } + return; + } - BSONObj obj = doc.ownedObj.getObjectField(*op=='u' ? "o2" : "o"); - if (obj.isEmpty()) { - warning() << "replSet warning ignoring op on rollback : " << doc.ownedObj.toString(); - return; - } + BSONObj obj = doc.ownedObj.getObjectField(*op == 'u' ? "o2" : "o"); + if (obj.isEmpty()) { + warning() << "replSet warning ignoring op on rollback : " << doc.ownedObj.toString(); + return; + } - if (*op == 'c') { - BSONElement first = obj.firstElement(); - NamespaceString nss(doc.ns); // foo.$cmd - string cmdname = first.fieldName(); - Command* cmd = Command::findCommand(cmdname.c_str()); - if (cmd == NULL) { - severe() << "replSet warning rollback no such command " << first.fieldName(); - fassertFailedNoTrace(18751); - } - if (cmdname == "create") { - // Create collection operation - // { ts: ..., h: ..., op: "c", ns: "foo.$cmd", o: { create: "abc", ... } } - string ns = nss.db().toString() + '.' + obj["create"].String(); // -> foo.abc - fixUpInfo.toDrop.insert(ns); - return; - } - else if (cmdname == "drop") { - string ns = nss.db().toString() + '.' + first.valuestr(); - fixUpInfo.collectionsToResync.insert(ns); - return; - } - else if (cmdname == "dropIndexes" || cmdname == "deleteIndexes") { - // TODO: this is bad. we simply full resync the collection here, - // which could be very slow. - warning() << "replSet info rollback of dropIndexes is slow in this version of " - << "mongod"; - string ns = nss.db().toString() + '.' + first.valuestr(); - fixUpInfo.collectionsToResync.insert(ns); - return; - } - else if (cmdname == "renameCollection") { - // TODO: slow. 
- warning() << "replSet info rollback of renameCollection is slow in this version of " - << "mongod"; - string from = first.valuestr(); - string to = obj["to"].String(); - fixUpInfo.collectionsToResync.insert(from); - fixUpInfo.collectionsToResync.insert(to); - return; - } - else if (cmdname == "dropDatabase") { - severe() << "replSet error rollback : can't rollback drop database full resync " - << "will be required"; - log() << "replSet " << obj.toString(); - throw RSFatalException(); - } - else if (cmdname == "collMod") { - if (obj.nFields() == 2 && obj["usePowerOf2Sizes"].type() == Bool) { - log() << "replSet not rolling back change of usePowerOf2Sizes: " << obj; - } - else { - severe() << "replSet error cannot rollback a collMod command: " << obj; - throw RSFatalException(); - } - } - else { - severe() << "replSet error can't rollback this command yet: " - << obj.toString(); - log() << "replSet cmdname=" << cmdname; - throw RSFatalException(); - } + if (*op == 'c') { + BSONElement first = obj.firstElement(); + NamespaceString nss(doc.ns); // foo.$cmd + string cmdname = first.fieldName(); + Command* cmd = Command::findCommand(cmdname.c_str()); + if (cmd == NULL) { + severe() << "replSet warning rollback no such command " << first.fieldName(); + fassertFailedNoTrace(18751); } - - doc._id = obj["_id"]; - if (doc._id.eoo()) { - warning() << "replSet WARNING ignoring op on rollback no _id TODO : " << doc.ns << ' ' - << doc.ownedObj.toString(); + if (cmdname == "create") { + // Create collection operation + // { ts: ..., h: ..., op: "c", ns: "foo.$cmd", o: { create: "abc", ... } } + string ns = nss.db().toString() + '.' + obj["create"].String(); // -> foo.abc + fixUpInfo.toDrop.insert(ns); + return; + } else if (cmdname == "drop") { + string ns = nss.db().toString() + '.' + first.valuestr(); + fixUpInfo.collectionsToResync.insert(ns); + return; + } else if (cmdname == "dropIndexes" || cmdname == "deleteIndexes") { + // TODO: this is bad. we simply full resync the collection here, + // which could be very slow. + warning() << "replSet info rollback of dropIndexes is slow in this version of " + << "mongod"; + string ns = nss.db().toString() + '.' + first.valuestr(); + fixUpInfo.collectionsToResync.insert(ns); return; + } else if (cmdname == "renameCollection") { + // TODO: slow. 
+ warning() << "replSet info rollback of renameCollection is slow in this version of " + << "mongod"; + string from = first.valuestr(); + string to = obj["to"].String(); + fixUpInfo.collectionsToResync.insert(from); + fixUpInfo.collectionsToResync.insert(to); + return; + } else if (cmdname == "dropDatabase") { + severe() << "replSet error rollback : can't rollback drop database full resync " + << "will be required"; + log() << "replSet " << obj.toString(); + throw RSFatalException(); + } else if (cmdname == "collMod") { + if (obj.nFields() == 2 && obj["usePowerOf2Sizes"].type() == Bool) { + log() << "replSet not rolling back change of usePowerOf2Sizes: " << obj; + } else { + severe() << "replSet error cannot rollback a collMod command: " << obj; + throw RSFatalException(); + } + } else { + severe() << "replSet error can't rollback this command yet: " << obj.toString(); + log() << "replSet cmdname=" << cmdname; + throw RSFatalException(); } + } - fixUpInfo.toRefetch.insert(doc); + doc._id = obj["_id"]; + if (doc._id.eoo()) { + warning() << "replSet WARNING ignoring op on rollback no _id TODO : " << doc.ns << ' ' + << doc.ownedObj.toString(); + return; } - StatusWith<FixUpInfo> syncRollbackFindCommonPoint(OperationContext* txn, - DBClientConnection* them) { - Client::Context ctx(txn, rsoplog); - FixUpInfo fixUpInfo; + fixUpInfo.toRefetch.insert(doc); +} - boost::scoped_ptr<PlanExecutor> exec( - InternalPlanner::collectionScan(txn, - rsoplog, - ctx.db()->getCollection(rsoplog), - InternalPlanner::BACKWARD)); +StatusWith<FixUpInfo> syncRollbackFindCommonPoint(OperationContext* txn, DBClientConnection* them) { + Client::Context ctx(txn, rsoplog); + FixUpInfo fixUpInfo; - BSONObj ourObj; - RecordId ourLoc; + boost::scoped_ptr<PlanExecutor> exec(InternalPlanner::collectionScan( + txn, rsoplog, ctx.db()->getCollection(rsoplog), InternalPlanner::BACKWARD)); - if (PlanExecutor::ADVANCED != exec->getNext(&ourObj, &ourLoc)) { - return StatusWith<FixUpInfo>(ErrorCodes::OplogStartMissing, "no oplog during initsync"); - } + BSONObj ourObj; + RecordId ourLoc; - const Query query = Query().sort(reverseNaturalObj); - const BSONObj fields = BSON("ts" << 1 << "h" << 1); + if (PlanExecutor::ADVANCED != exec->getNext(&ourObj, &ourLoc)) { + return StatusWith<FixUpInfo>(ErrorCodes::OplogStartMissing, "no oplog during initsync"); + } - //auto_ptr<DBClientCursor> u = us->query(rsoplog, query, 0, 0, &fields, 0, 0); + const Query query = Query().sort(reverseNaturalObj); + const BSONObj fields = BSON("ts" << 1 << "h" << 1); - fixUpInfo.rbid = getRBID(them); - auto_ptr<DBClientCursor> oplogCursor = them->query(rsoplog, query, 0, 0, &fields, 0, 0); + // auto_ptr<DBClientCursor> u = us->query(rsoplog, query, 0, 0, &fields, 0, 0); - if (oplogCursor.get() == NULL || !oplogCursor->more()) - throw RSFatalException("remote oplog empty or unreadable"); + fixUpInfo.rbid = getRBID(them); + auto_ptr<DBClientCursor> oplogCursor = them->query(rsoplog, query, 0, 0, &fields, 0, 0); - OpTime ourTime = ourObj["ts"]._opTime(); - BSONObj theirObj = oplogCursor->nextSafe(); - OpTime theirTime = theirObj["ts"]._opTime(); + if (oplogCursor.get() == NULL || !oplogCursor->more()) + throw RSFatalException("remote oplog empty or unreadable"); - long long diff = static_cast<long long>(ourTime.getSecs()) - - static_cast<long long>(theirTime.getSecs()); - // diff could be positive, negative, or zero - log() << "replSet info rollback our last optime: " << ourTime.toStringPretty(); - log() << "replSet info rollback their last optime: " << 
theirTime.toStringPretty(); - log() << "replSet info rollback diff in end of log times: " << diff << " seconds"; - if (diff > 1800) { - log() << "replSet rollback too long a time period for a rollback."; - throw RSFatalException("rollback error: not willing to roll back " - "more than 30 minutes of data"); - } + OpTime ourTime = ourObj["ts"]._opTime(); + BSONObj theirObj = oplogCursor->nextSafe(); + OpTime theirTime = theirObj["ts"]._opTime(); - unsigned long long scanned = 0; - while (1) { - scanned++; - // todo add code to assure no excessive scanning for too long - if (ourTime == theirTime) { - if (ourObj["h"].Long() == theirObj["h"].Long()) { - // found the point back in time where we match. - // todo : check a few more just to be careful about hash collisions. - log() << "replSet rollback found matching events at " - << ourTime.toStringPretty(); - log() << "replSet rollback findcommonpoint scanned : " << scanned; - fixUpInfo.commonPoint = ourTime; - fixUpInfo.commonPointOurDiskloc = ourLoc; - break; - } + long long diff = + static_cast<long long>(ourTime.getSecs()) - static_cast<long long>(theirTime.getSecs()); + // diff could be positive, negative, or zero + log() << "replSet info rollback our last optime: " << ourTime.toStringPretty(); + log() << "replSet info rollback their last optime: " << theirTime.toStringPretty(); + log() << "replSet info rollback diff in end of log times: " << diff << " seconds"; + if (diff > 1800) { + log() << "replSet rollback too long a time period for a rollback."; + throw RSFatalException( + "rollback error: not willing to roll back " + "more than 30 minutes of data"); + } - refetch(fixUpInfo, ourObj); + unsigned long long scanned = 0; + while (1) { + scanned++; + // todo add code to assure no excessive scanning for too long + if (ourTime == theirTime) { + if (ourObj["h"].Long() == theirObj["h"].Long()) { + // found the point back in time where we match. + // todo : check a few more just to be careful about hash collisions. 
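// The find-common-point loop above walks the local and remote oplogs
// backwards in lockstep, advancing whichever side has the later timestamp,
// until it reaches an entry where both the timestamp and the hash ("h")
// field agree. A self-contained sketch of that two-cursor scan over
// simplified (ts, hash) pairs; Entry is an illustrative stand-in:
#include <cstddef>
#include <stdexcept>
#include <vector>

struct Entry {
    long long ts;    // optime; the vectors below are newest-first
    long long hash;  // the oplog entry's "h" field
};

// Returns the index into `ours` of the common point, mirroring the BACKWARD
// collection scan and the reverse-natural remote query above.
std::size_t findCommonPoint(const std::vector<Entry>& ours,
                            const std::vector<Entry>& theirs) {
    std::size_t i = 0, j = 0;
    while (i < ours.size() && j < theirs.size()) {
        if (ours[i].ts == theirs[j].ts) {
            if (ours[i].hash == theirs[j].hash)
                return i;  // matching event: everything older is shared history
            ++i;           // same time, different op: refetch ours, advance both
            ++j;
        } else if (theirs[j].ts > ours[i].ts) {
            ++j;           // remote is ahead in time; rewind the remote cursor
        } else {
            ++i;           // we are ahead; record ours for refetch and rewind
        }
    }
    throw std::runtime_error("reached the beginning of an oplog without a match");
}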
+ log() << "replSet rollback found matching events at " << ourTime.toStringPretty(); + log() << "replSet rollback findcommonpoint scanned : " << scanned; + fixUpInfo.commonPoint = ourTime; + fixUpInfo.commonPointOurDiskloc = ourLoc; + break; + } - if (!oplogCursor->more()) { - log() << "replSet rollback error RS100 reached beginning of remote oplog"; - log() << "replSet them: " << them->toString() << " scanned: " << scanned; - log() << "replSet theirTime: " << theirTime.toStringLong(); - log() << "replSet ourTime: " << ourTime.toStringLong(); - throw RSFatalException("RS100 reached beginning of remote oplog [2]"); - } - theirObj = oplogCursor->nextSafe(); - theirTime = theirObj["ts"]._opTime(); - - if (PlanExecutor::ADVANCED != exec->getNext(&ourObj, &ourLoc)) { - log() << "replSet rollback error RS101 reached beginning of local oplog"; - log() << "replSet them: " << them->toString() << " scanned: " << scanned; - log() << "replSet theirTime: " << theirTime.toStringLong(); - log() << "replSet ourTime: " << ourTime.toStringLong(); - throw RSFatalException("RS101 reached beginning of local oplog [1]"); - } - ourTime = ourObj["ts"]._opTime(); + refetch(fixUpInfo, ourObj); + + if (!oplogCursor->more()) { + log() << "replSet rollback error RS100 reached beginning of remote oplog"; + log() << "replSet them: " << them->toString() << " scanned: " << scanned; + log() << "replSet theirTime: " << theirTime.toStringLong(); + log() << "replSet ourTime: " << ourTime.toStringLong(); + throw RSFatalException("RS100 reached beginning of remote oplog [2]"); } - else if (theirTime > ourTime) { - if (!oplogCursor->more()) { - log() << "replSet rollback error RS100 reached beginning of remote oplog"; - log() << "replSet them: " << them->toString() << " scanned: " - << scanned; - log() << "replSet theirTime: " << theirTime.toStringLong(); - log() << "replSet ourTime: " << ourTime.toStringLong(); - throw RSFatalException("RS100 reached beginning of remote oplog [1]"); - } - theirObj = oplogCursor->nextSafe(); - theirTime = theirObj["ts"]._opTime(); + theirObj = oplogCursor->nextSafe(); + theirTime = theirObj["ts"]._opTime(); + + if (PlanExecutor::ADVANCED != exec->getNext(&ourObj, &ourLoc)) { + log() << "replSet rollback error RS101 reached beginning of local oplog"; + log() << "replSet them: " << them->toString() << " scanned: " << scanned; + log() << "replSet theirTime: " << theirTime.toStringLong(); + log() << "replSet ourTime: " << ourTime.toStringLong(); + throw RSFatalException("RS101 reached beginning of local oplog [1]"); } - else { - // theirTime < ourTime - refetch(fixUpInfo, ourObj); - if (PlanExecutor::ADVANCED != exec->getNext(&ourObj, &ourLoc)) { - log() << "replSet rollback error RS101 reached beginning of local oplog"; - log() << "replSet them: " << them->toString() << " scanned: " << scanned; - log() << "replSet theirTime: " << theirTime.toStringLong(); - log() << "replSet ourTime: " << ourTime.toStringLong(); - throw RSFatalException("RS101 reached beginning of local oplog [2]"); - } - ourTime = ourObj["ts"]._opTime(); + ourTime = ourObj["ts"]._opTime(); + } else if (theirTime > ourTime) { + if (!oplogCursor->more()) { + log() << "replSet rollback error RS100 reached beginning of remote oplog"; + log() << "replSet them: " << them->toString() << " scanned: " << scanned; + log() << "replSet theirTime: " << theirTime.toStringLong(); + log() << "replSet ourTime: " << ourTime.toStringLong(); + throw RSFatalException("RS100 reached beginning of remote oplog [1]"); + } + theirObj = 
oplogCursor->nextSafe(); + theirTime = theirObj["ts"]._opTime(); + } else { + // theirTime < ourTime + refetch(fixUpInfo, ourObj); + if (PlanExecutor::ADVANCED != exec->getNext(&ourObj, &ourLoc)) { + log() << "replSet rollback error RS101 reached beginning of local oplog"; + log() << "replSet them: " << them->toString() << " scanned: " << scanned; + log() << "replSet theirTime: " << theirTime.toStringLong(); + log() << "replSet ourTime: " << ourTime.toStringLong(); + throw RSFatalException("RS101 reached beginning of local oplog [2]"); } + ourTime = ourObj["ts"]._opTime(); } - - return StatusWith<FixUpInfo>(fixUpInfo); } - bool copyCollectionFromRemote(OperationContext* txn, - const string& host, - const string& ns, - string& errmsg) { - Cloner cloner; + return StatusWith<FixUpInfo>(fixUpInfo); +} - DBClientConnection *tmpConn = new DBClientConnection(); - // cloner owns _conn in auto_ptr - cloner.setConnection(tmpConn); - uassert(15908, errmsg, - tmpConn->connect(HostAndPort(host), errmsg) && replAuthenticate(tmpConn)); +bool copyCollectionFromRemote(OperationContext* txn, + const string& host, + const string& ns, + string& errmsg) { + Cloner cloner; - return cloner.copyCollection(txn, ns, BSONObj(), errmsg, true, false, true, false); - } + DBClientConnection* tmpConn = new DBClientConnection(); + // cloner owns _conn in auto_ptr + cloner.setConnection(tmpConn); + uassert( + 15908, errmsg, tmpConn->connect(HostAndPort(host), errmsg) && replAuthenticate(tmpConn)); - void syncFixUp(OperationContext* txn, - FixUpInfo& fixUpInfo, - OplogReader* oplogreader, - ReplicationCoordinator* replCoord) { - DBClientConnection* them = oplogreader->conn(); + return cloner.copyCollection(txn, ns, BSONObj(), errmsg, true, false, true, false); +} - // fetch all first so we needn't handle interruption in a fancy way +void syncFixUp(OperationContext* txn, + FixUpInfo& fixUpInfo, + OplogReader* oplogreader, + ReplicationCoordinator* replCoord) { + DBClientConnection* them = oplogreader->conn(); - unsigned long long totalSize = 0; + // fetch all first so we needn't handle interruption in a fancy way - list< pair<DocID, BSONObj> > goodVersions; + unsigned long long totalSize = 0; - BSONObj newMinValid; + list<pair<DocID, BSONObj>> goodVersions; - // fetch all the goodVersions of each document from current primary - DocID doc; - unsigned long long numFetched = 0; - try { - for (set<DocID>::iterator it = fixUpInfo.toRefetch.begin(); - it != fixUpInfo.toRefetch.end(); - it++) { - doc = *it; - - verify(!doc._id.eoo()); - - { - // TODO : slow. lots of round trips. - numFetched++; - BSONObj good = them->findOne(doc.ns, doc._id.wrap(), - NULL, QueryOption_SlaveOk).getOwned(); - totalSize += good.objsize(); - uassert(13410, "replSet too much data to roll back", - totalSize < 300 * 1024 * 1024); - - // note good might be eoo, indicating we should delete it - goodVersions.push_back(pair<DocID, BSONObj>(doc,good)); - } - } - newMinValid = oplogreader->getLastOp(rsoplog); - if (newMinValid.isEmpty()) { - error() << "rollback error newMinValid empty?"; - return; + BSONObj newMinValid; + + // fetch all the goodVersions of each document from current primary + DocID doc; + unsigned long long numFetched = 0; + try { + for (set<DocID>::iterator it = fixUpInfo.toRefetch.begin(); it != fixUpInfo.toRefetch.end(); + it++) { + doc = *it; + + verify(!doc._id.eoo()); + + { + // TODO : slow. lots of round trips. 
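// syncFixUp re-fetches the authoritative ("good") version of every affected
// document from the sync source up front, before writing anything locally,
// so interruption never leaves a half-applied fix-up; the running total is
// capped so an oversized rollback fails early. A standalone sketch of that
// fetch-all-first pattern; Doc, Blob, and the Fetcher callable are
// hypothetical stand-ins for the findOne round trip:
#include <cstddef>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

struct Doc {
    std::string ns;
    std::string id;
};
struct Blob {
    std::vector<char> bytes;  // stands in for a BSONObj; empty => delete locally
};

template <typename Fetcher>
std::vector<std::pair<Doc, Blob> > fetchGoodVersions(const std::vector<Doc>& toRefetch,
                                                     Fetcher fetchOne,
                                                     std::size_t maxTotalBytes) {
    std::vector<std::pair<Doc, Blob> > good;
    std::size_t total = 0;
    for (std::size_t k = 0; k < toRefetch.size(); ++k) {
        Blob b = fetchOne(toRefetch[k]);  // one round trip per document (slow, as noted)
        total += b.bytes.size();
        if (total >= maxTotalBytes)       // mirrors the 300 MB uassert above
            throw std::runtime_error("too much data to roll back");
        good.push_back(std::make_pair(toRefetch[k], b));
    }
    return good;
}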
+ numFetched++; + BSONObj good = + them->findOne(doc.ns, doc._id.wrap(), NULL, QueryOption_SlaveOk).getOwned(); + totalSize += good.objsize(); + uassert(13410, "replSet too much data to roll back", totalSize < 300 * 1024 * 1024); + + // note good might be eoo, indicating we should delete it + goodVersions.push_back(pair<DocID, BSONObj>(doc, good)); } } - catch (DBException& e) { - LOG(1) << "rollback re-get objects: " << e.toString(); - error() << "rollback couldn't re-get ns:" << doc.ns << " _id:" << doc._id << ' ' - << numFetched << '/' << fixUpInfo.toRefetch.size(); - throw e; - } - - log() << "rollback 3.5"; - if (fixUpInfo.rbid != getRBID(oplogreader->conn())) { - // our source rolled back itself. so the data we received isn't necessarily consistent. - warning() << "rollback rbid on source changed during rollback, cancelling this attempt"; + newMinValid = oplogreader->getLastOp(rsoplog); + if (newMinValid.isEmpty()) { + error() << "rollback error newMinValid empty?"; return; } + } catch (DBException& e) { + LOG(1) << "rollback re-get objects: " << e.toString(); + error() << "rollback couldn't re-get ns:" << doc.ns << " _id:" << doc._id << ' ' + << numFetched << '/' << fixUpInfo.toRefetch.size(); + throw e; + } - // update them - log() << "rollback 4 n:" << goodVersions.size(); - - bool warn = false; + log() << "rollback 3.5"; + if (fixUpInfo.rbid != getRBID(oplogreader->conn())) { + // our source rolled back itself. so the data we received isn't necessarily consistent. + warning() << "rollback rbid on source changed during rollback, cancelling this attempt"; + return; + } - invariant(!fixUpInfo.commonPointOurDiskloc.isNull()); - invariant(txn->lockState()->isW()); + // update them + log() << "rollback 4 n:" << goodVersions.size(); - // we have items we are writing that aren't from a point-in-time. thus best not to come - // online until we get to that point in freshness. - OpTime minValid = newMinValid["ts"]._opTime(); - log() << "replSet minvalid=" << minValid.toStringLong(); - setMinValid(txn, minValid); + bool warn = false; - // any full collection resyncs required? - if (!fixUpInfo.collectionsToResync.empty()) { - for (set<string>::iterator it = fixUpInfo.collectionsToResync.begin(); - it != fixUpInfo.collectionsToResync.end(); - it++) { - string ns = *it; - log() << "rollback 4.1 coll resync " << ns; + invariant(!fixUpInfo.commonPointOurDiskloc.isNull()); + invariant(txn->lockState()->isW()); - const NamespaceString nss(ns); + // we have items we are writing that aren't from a point-in-time. thus best not to come + // online until we get to that point in freshness. + OpTime minValid = newMinValid["ts"]._opTime(); + log() << "replSet minvalid=" << minValid.toStringLong(); + setMinValid(txn, minValid); - Database* db = dbHolder().openDb(txn, nss.db().toString()); - invariant(db); + // any full collection resyncs required? + if (!fixUpInfo.collectionsToResync.empty()) { + for (set<string>::iterator it = fixUpInfo.collectionsToResync.begin(); + it != fixUpInfo.collectionsToResync.end(); + it++) { + string ns = *it; + log() << "rollback 4.1 coll resync " << ns; - { - WriteUnitOfWork wunit(txn); - db->dropCollection(txn, ns); - wunit.commit(); - } + const NamespaceString nss(ns); - { - string errmsg; + Database* db = dbHolder().openDb(txn, nss.db().toString()); + invariant(db); - // This comes as a GlobalWrite lock, so there is no DB to be acquired after - // resume, so we can skip the DB stability checks. 
Also - // copyCollectionFromRemote will acquire its own database pointer, under the - // appropriate locks, so just releasing and acquiring the lock is safe. - invariant(txn->lockState()->isW()); - Lock::TempRelease release(txn->lockState()); - - bool ok = copyCollectionFromRemote(txn, them->getServerAddress(), ns, errmsg); - uassert(15909, str::stream() << "replSet rollback error resyncing collection " - << ns << ' ' << errmsg, ok); - } + { + WriteUnitOfWork wunit(txn); + db->dropCollection(txn, ns); + wunit.commit(); } - // we did more reading from primary, so check it again for a rollback (which would mess - // us up), and make minValid newer. - log() << "rollback 4.2"; + { + string errmsg; - string err; - try { - newMinValid = oplogreader->getLastOp(rsoplog); - if (newMinValid.isEmpty()) { - err = "can't get minvalid from sync source"; - } - else { - OpTime minValid = newMinValid["ts"]._opTime(); - log() << "replSet minvalid=" << minValid.toStringLong(); - setMinValid(txn, minValid); - } - } - catch (DBException& e) { - err = "can't get/set minvalid: "; - err += e.what(); - } - if (fixUpInfo.rbid != getRBID(oplogreader->conn())) { - // our source rolled back itself. so the data we received isn't necessarily - // consistent. however, we've now done writes. thus we have a problem. - err += "rbid at primary changed during resync/rollback"; - } - if (!err.empty()) { - error() << "replSet error rolling back : " << err - << ". A full resync will be necessary."; - // TODO: reset minvalid so that we are permanently in fatal state - // TODO: don't be fatal, but rather, get all the data first. - throw RSFatalException(); + // This comes as a GlobalWrite lock, so there is no DB to be acquired after + // resume, so we can skip the DB stability checks. Also + // copyCollectionFromRemote will acquire its own database pointer, under the + // appropriate locks, so just releasing and acquiring the lock is safe. + invariant(txn->lockState()->isW()); + Lock::TempRelease release(txn->lockState()); + + bool ok = copyCollectionFromRemote(txn, them->getServerAddress(), ns, errmsg); + uassert(15909, + str::stream() << "replSet rollback error resyncing collection " << ns << ' ' + << errmsg, + ok); } - log() << "rollback 4.3"; } - map<string,shared_ptr<Helpers::RemoveSaver> > removeSavers; + // we did more reading from primary, so check it again for a rollback (which would mess + // us up), and make minValid newer. 
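// The setMinValid call above records the sync source's latest optime before
// any fix-up write happens: the writes below are not from a single point in
// time, so the node must not present itself as consistent (SECONDARY) until
// its applied optime catches up to that mark. A tiny sketch of the gating
// check; OpTimeT and ReplState are illustrative stand-ins:
#include <cstdint>

typedef std::uint64_t OpTimeT;

struct ReplState {
    OpTimeT lastApplied;  // what we have applied so far
    OpTimeT minValid;     // persisted before fix-up, refreshed after more reads
};

// Only past minValid is the data point-in-time consistent again.
bool canBecomeSecondary(const ReplState& s) {
    return s.lastApplied >= s.minValid;
}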
+ log() << "rollback 4.2"; - log() << "rollback 4.6"; - // drop collections to drop before doing individual fixups - that might make things faster - // below actually if there were subsequent inserts to rollback - for (set<string>::iterator it = fixUpInfo.toDrop.begin(); - it != fixUpInfo.toDrop.end(); - it++) { - log() << "replSet rollback drop: " << *it; - - Database* db = dbHolder().get(txn, nsToDatabaseSubstring(*it)); - if (db) { - WriteUnitOfWork wunit(txn); - - shared_ptr<Helpers::RemoveSaver>& removeSaver = removeSavers[*it]; - if (!removeSaver) - removeSaver.reset(new Helpers::RemoveSaver("rollback", "", *it)); - - // perform a collection scan and write all documents in the collection to disk - boost::scoped_ptr<PlanExecutor> exec( - InternalPlanner::collectionScan(txn, - *it, - db->getCollection(*it))); - BSONObj curObj; - PlanExecutor::ExecState execState; - while (PlanExecutor::ADVANCED == (execState = exec->getNext(&curObj, NULL))) { - removeSaver->goingToDelete(curObj); - } - if (execState != PlanExecutor::IS_EOF) { - if (execState == PlanExecutor::FAILURE && - WorkingSetCommon::isValidStatusMemberObject(curObj)) { - Status errorStatus = WorkingSetCommon::getMemberObjectStatus(curObj); - severe() << "rolling back createCollection on " << *it - << " failed with " << errorStatus - << ". A full resync is necessary."; - } - else { - severe() << "rolling back createCollection on " << *it - << " failed. A full resync is necessary."; - } - - throw RSFatalException(); - } - - db->dropCollection(txn, *it); - wunit.commit(); + string err; + try { + newMinValid = oplogreader->getLastOp(rsoplog); + if (newMinValid.isEmpty()) { + err = "can't get minvalid from sync source"; + } else { + OpTime minValid = newMinValid["ts"]._opTime(); + log() << "replSet minvalid=" << minValid.toStringLong(); + setMinValid(txn, minValid); } + } catch (DBException& e) { + err = "can't get/set minvalid: "; + err += e.what(); + } + if (fixUpInfo.rbid != getRBID(oplogreader->conn())) { + // our source rolled back itself. so the data we received isn't necessarily + // consistent. however, we've now done writes. thus we have a problem. + err += "rbid at primary changed during resync/rollback"; } + if (!err.empty()) { + error() << "replSet error rolling back : " << err + << ". A full resync will be necessary."; + // TODO: reset minvalid so that we are permanently in fatal state + // TODO: don't be fatal, but rather, get all the data first. 
+ throw RSFatalException(); + } + log() << "rollback 4.3"; + } - log() << "rollback 4.7"; - Client::Context ctx(txn, rsoplog); - Collection* oplogCollection = ctx.db()->getCollection(rsoplog); - uassert(13423, - str::stream() << "replSet error in rollback can't find " << rsoplog, - oplogCollection); - - unsigned deletes = 0, updates = 0; - time_t lastProgressUpdate = time(0); - time_t progressUpdateGap = 10; - for (list<pair<DocID, BSONObj> >::iterator it = goodVersions.begin(); - it != goodVersions.end(); - it++) { - time_t now = time(0); - if (now - lastProgressUpdate > progressUpdateGap) { - log() << "replSet " << deletes << " delete and " - << updates << " update operations processed out of " - << goodVersions.size() << " total operations"; - lastProgressUpdate = now; + map<string, shared_ptr<Helpers::RemoveSaver>> removeSavers; + + log() << "rollback 4.6"; + // drop collections to drop before doing individual fixups - that might make things faster + // below actually if there were subsequent inserts to rollback + for (set<string>::iterator it = fixUpInfo.toDrop.begin(); it != fixUpInfo.toDrop.end(); it++) { + log() << "replSet rollback drop: " << *it; + + Database* db = dbHolder().get(txn, nsToDatabaseSubstring(*it)); + if (db) { + WriteUnitOfWork wunit(txn); + + shared_ptr<Helpers::RemoveSaver>& removeSaver = removeSavers[*it]; + if (!removeSaver) + removeSaver.reset(new Helpers::RemoveSaver("rollback", "", *it)); + + // perform a collection scan and write all documents in the collection to disk + boost::scoped_ptr<PlanExecutor> exec( + InternalPlanner::collectionScan(txn, *it, db->getCollection(*it))); + BSONObj curObj; + PlanExecutor::ExecState execState; + while (PlanExecutor::ADVANCED == (execState = exec->getNext(&curObj, NULL))) { + removeSaver->goingToDelete(curObj); } - const DocID& doc = it->first; - BSONObj pattern = doc._id.wrap(); // { _id : ... } - try { - verify(doc.ns && *doc.ns); - if (fixUpInfo.collectionsToResync.count(doc.ns)) { - // we just synced this entire collection - continue; + if (execState != PlanExecutor::IS_EOF) { + if (execState == PlanExecutor::FAILURE && + WorkingSetCommon::isValidStatusMemberObject(curObj)) { + Status errorStatus = WorkingSetCommon::getMemberObjectStatus(curObj); + severe() << "rolling back createCollection on " << *it << " failed with " + << errorStatus << ". A full resync is necessary."; + } else { + severe() << "rolling back createCollection on " << *it + << " failed. 
A full resync is necessary."; } - // keep an archive of items rolled back - shared_ptr<Helpers::RemoveSaver>& removeSaver = removeSavers[doc.ns]; - if (!removeSaver) - removeSaver.reset(new Helpers::RemoveSaver("rollback", "", doc.ns)); - - // todo: lots of overhead in context, this can be faster - Client::Context ctx(txn, doc.ns); + throw RSFatalException(); + } - // Add the doc to our rollback file - BSONObj obj; + db->dropCollection(txn, *it); + wunit.commit(); + } + } - Collection* collection = ctx.db()->getCollection(doc.ns); + log() << "rollback 4.7"; + Client::Context ctx(txn, rsoplog); + Collection* oplogCollection = ctx.db()->getCollection(rsoplog); + uassert(13423, + str::stream() << "replSet error in rollback can't find " << rsoplog, + oplogCollection); + + unsigned deletes = 0, updates = 0; + time_t lastProgressUpdate = time(0); + time_t progressUpdateGap = 10; + for (list<pair<DocID, BSONObj>>::iterator it = goodVersions.begin(); it != goodVersions.end(); + it++) { + time_t now = time(0); + if (now - lastProgressUpdate > progressUpdateGap) { + log() << "replSet " << deletes << " delete and " << updates + << " update operations processed out of " << goodVersions.size() + << " total operations"; + lastProgressUpdate = now; + } + const DocID& doc = it->first; + BSONObj pattern = doc._id.wrap(); // { _id : ... } + try { + verify(doc.ns && *doc.ns); + if (fixUpInfo.collectionsToResync.count(doc.ns)) { + // we just synced this entire collection + continue; + } - // Do not log an error when undoing an insert on a no longer existent collection. - // It is likely that the collection was dropped as part of rolling back a - // createCollection command and regardless, the document no longer exists. - if (collection) { - bool found = Helpers::findOne(txn, collection, pattern, obj, false); - if (found) { - removeSaver->goingToDelete(obj); - } - else { - error() << "rollback cannot find object: " << pattern - << " in namespace " << doc.ns; - } + // keep an archive of items rolled back + shared_ptr<Helpers::RemoveSaver>& removeSaver = removeSavers[doc.ns]; + if (!removeSaver) + removeSaver.reset(new Helpers::RemoveSaver("rollback", "", doc.ns)); + + // todo: lots of overhead in context, this can be faster + Client::Context ctx(txn, doc.ns); + + // Add the doc to our rollback file + BSONObj obj; + + Collection* collection = ctx.db()->getCollection(doc.ns); + + // Do not log an error when undoing an insert on a no longer existent collection. + // It is likely that the collection was dropped as part of rolling back a + // createCollection command and regardless, the document no longer exists. + if (collection) { + bool found = Helpers::findOne(txn, collection, pattern, obj, false); + if (found) { + removeSaver->goingToDelete(obj); + } else { + error() << "rollback cannot find object: " << pattern << " in namespace " + << doc.ns; } + } - if (it->second.isEmpty()) { - // wasn't on the primary; delete. - // TODO 1.6 : can't delete from a capped collection. need to handle that here. - deletes++; - - if (collection) { - if (collection->isCapped()) { - // can't delete from a capped collection - so we truncate instead. if - // this item must go, so must all successors!!! - try { - // TODO: IIRC cappedTruncateAfter does not handle completely empty. - // this will crazy slow if no _id index. 
- long long start = Listener::getElapsedTimeMillis(); - RecordId loc = Helpers::findOne(txn, collection, pattern, false); - if (Listener::getElapsedTimeMillis() - start > 200) - log() << "replSet warning roll back slow no _id index for " - << doc.ns << " perhaps?"; - // would be faster but requires index: - // RecordId loc = Helpers::findById(nsd, pattern); - if (!loc.isNull()) { - try { - collection->temp_cappedTruncateAfter(txn, loc, true); - } - catch (DBException& e) { - if (e.getCode() == 13415) { - // hack: need to just make cappedTruncate do this... - MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN { - WriteUnitOfWork wunit(txn); - uassertStatusOK(collection->truncate(txn)); - wunit.commit(); - } MONGO_WRITE_CONFLICT_RETRY_LOOP_END( - txn, - "truncate", - collection->ns().ns()); - } - else { - throw e; + if (it->second.isEmpty()) { + // wasn't on the primary; delete. + // TODO 1.6 : can't delete from a capped collection. need to handle that here. + deletes++; + + if (collection) { + if (collection->isCapped()) { + // can't delete from a capped collection - so we truncate instead. if + // this item must go, so must all successors!!! + try { + // TODO: IIRC cappedTruncateAfter does not handle completely empty. + // this will crazy slow if no _id index. + long long start = Listener::getElapsedTimeMillis(); + RecordId loc = Helpers::findOne(txn, collection, pattern, false); + if (Listener::getElapsedTimeMillis() - start > 200) + log() << "replSet warning roll back slow no _id index for " + << doc.ns << " perhaps?"; + // would be faster but requires index: + // RecordId loc = Helpers::findById(nsd, pattern); + if (!loc.isNull()) { + try { + collection->temp_cappedTruncateAfter(txn, loc, true); + } catch (DBException& e) { + if (e.getCode() == 13415) { + // hack: need to just make cappedTruncate do this... + MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN { + WriteUnitOfWork wunit(txn); + uassertStatusOK(collection->truncate(txn)); + wunit.commit(); } + MONGO_WRITE_CONFLICT_RETRY_LOOP_END( + txn, "truncate", collection->ns().ns()); + } else { + throw e; } } } - catch (DBException& e) { - log() << "replSet error rolling back capped collection rec " - << doc.ns << ' ' << e.toString(); - } - } - else { - deleteObjects(txn, - ctx.db(), - doc.ns, - pattern, - PlanExecutor::YIELD_MANUAL, - true, // justone - false, // logop - true); // god + } catch (DBException& e) { + log() << "replSet error rolling back capped collection rec " << doc.ns + << ' ' << e.toString(); } - // did we just empty the collection? if so let's check if it even - // exists on the source. - if (collection->numRecords(txn) == 0) { - try { - std::list<BSONObj> lst = - them->getCollectionInfos( ctx.db()->name(), - BSON( "name" << nsToCollectionSubstring( doc.ns ) ) ); - if (lst.empty()) { - // we should drop - WriteUnitOfWork wunit(txn); - ctx.db()->dropCollection(txn, doc.ns); - wunit.commit(); - } - } - catch (DBException&) { - // this isn't *that* big a deal, but is bad. - log() << "replSet warning rollback error querying for existence of " - << doc.ns << " at the primary, ignoring"; + } else { + deleteObjects(txn, + ctx.db(), + doc.ns, + pattern, + PlanExecutor::YIELD_MANUAL, + true, // justone + false, // logop + true); // god + } + // did we just empty the collection? if so let's check if it even + // exists on the source. 
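// The MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN/END pair above re-runs a write
// unit of work whenever the storage engine reports a write conflict (the
// conflicting transaction was rolled back, so retrying is safe). A
// self-contained sketch of that retry idiom; WriteConflict stands in for
// the real WriteConflictException, and the retry cap is illustrative:
#include <stdexcept>

struct WriteConflict : std::runtime_error {
    WriteConflict() : std::runtime_error("write conflict") {}
};

template <typename Op>
void writeConflictRetry(Op attempt, int maxRetries = 1000) {
    for (int tries = 0; tries < maxRetries; ++tries) {
        try {
            attempt();  // begin a unit of work, do the write, commit
            return;
        } catch (const WriteConflict&) {
            // conflict: nothing was committed, loop around and try again
        }
    }
    throw std::runtime_error("giving up after repeated write conflicts");
}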
+ if (collection->numRecords(txn) == 0) { + try { + std::list<BSONObj> lst = them->getCollectionInfos( + ctx.db()->name(), BSON("name" << nsToCollectionSubstring(doc.ns))); + if (lst.empty()) { + // we should drop + WriteUnitOfWork wunit(txn); + ctx.db()->dropCollection(txn, doc.ns); + wunit.commit(); } + } catch (DBException&) { + // this isn't *that* big a deal, but is bad. + log() << "replSet warning rollback error querying for existence of " + << doc.ns << " at the primary, ignoring"; } } } - else { - // TODO faster... - OpDebug debug; - updates++; - - const NamespaceString requestNs(doc.ns); - UpdateRequest request(requestNs); - - request.setQuery(pattern); - request.setUpdates(it->second); - request.setGod(); - request.setUpsert(); - UpdateLifecycleImpl updateLifecycle(true, requestNs); - request.setLifecycle(&updateLifecycle); - - update(txn, ctx.db(), request, &debug); - - } - } - catch (DBException& e) { - log() << "replSet exception in rollback ns:" << doc.ns << ' ' << pattern.toString() - << ' ' << e.toString() << " ndeletes:" << deletes; - warn = true; + } else { + // TODO faster... + OpDebug debug; + updates++; + + const NamespaceString requestNs(doc.ns); + UpdateRequest request(requestNs); + + request.setQuery(pattern); + request.setUpdates(it->second); + request.setGod(); + request.setUpsert(); + UpdateLifecycleImpl updateLifecycle(true, requestNs); + request.setLifecycle(&updateLifecycle); + + update(txn, ctx.db(), request, &debug); } - } - - removeSavers.clear(); // this effectively closes all of them - log() << "rollback 5 d:" << deletes << " u:" << updates; - log() << "rollback 6"; - - // clean up oplog - LOG(2) << "replSet rollback truncate oplog after " << - fixUpInfo.commonPoint.toStringPretty(); - // TODO: fatal error if this throws? - oplogCollection->temp_cappedTruncateAfter(txn, fixUpInfo.commonPointOurDiskloc, false); - - Status status = getGlobalAuthorizationManager()->initialize(txn); - if (!status.isOK()) { - warning() << "Failed to reinitialize auth data after rollback: " << status; + } catch (DBException& e) { + log() << "replSet exception in rollback ns:" << doc.ns << ' ' << pattern.toString() + << ' ' << e.toString() << " ndeletes:" << deletes; warn = true; } + } - // Reload the lastOpTimeApplied value in the replcoord and the lastAppliedHash value in - // bgsync to reflect our new last op. - replCoord->resetLastOpTimeFromOplog(txn); - BackgroundSync::get()->loadLastAppliedHash(txn); + removeSavers.clear(); // this effectively closes all of them + log() << "rollback 5 d:" << deletes << " u:" << updates; + log() << "rollback 6"; - // done - if (warn) - warning() << "issues during syncRollback, see log"; - else - log() << "rollback done"; - } + // clean up oplog + LOG(2) << "replSet rollback truncate oplog after " << fixUpInfo.commonPoint.toStringPretty(); + // TODO: fatal error if this throws? + oplogCollection->temp_cappedTruncateAfter(txn, fixUpInfo.commonPointOurDiskloc, false); - unsigned _syncRollback(OperationContext* txn, - OplogReader* oplogreader, - ReplicationCoordinator* replCoord) { - invariant(!txn->lockState()->isLocked()); + Status status = getGlobalAuthorizationManager()->initialize(txn); + if (!status.isOK()) { + warning() << "Failed to reinitialize auth data after rollback: " << status; + warn = true; + } - log() << "rollback 0"; + // Reload the lastOpTimeApplied value in the replcoord and the lastAppliedHash value in + // bgsync to reflect our new last op. 
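// For each refetched document the loop above either deletes it (the sync
// source no longer has it) or restores it with a god-mode upsert keyed on
// _id, so it is rewritten in place if present and recreated if an earlier
// rollback step already removed it. The same decision over a toy map-based
// "collection" (an illustrative stand-in, not a MongoDB type):
#include <map>
#include <string>

typedef std::map<std::string, std::string> Collection;  // _id -> document

void applyGoodVersion(Collection& coll,
                      const std::string& id,
                      const std::string& goodVersion) {
    if (goodVersion.empty())
        coll.erase(id);          // wasn't on the primary: delete our copy
    else
        coll[id] = goodVersion;  // upsert: operator[] inserts when absent
}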
+ replCoord->resetLastOpTimeFromOplog(txn); + BackgroundSync::get()->loadLastAppliedHash(txn); + + // done + if (warn) + warning() << "issues during syncRollback, see log"; + else + log() << "rollback done"; +} + +unsigned _syncRollback(OperationContext* txn, + OplogReader* oplogreader, + ReplicationCoordinator* replCoord) { + invariant(!txn->lockState()->isLocked()); + + log() << "rollback 0"; + + Lock::GlobalWrite globalWrite(txn->lockState(), 20000); + if (!globalWrite.isLocked()) { + warning() << "rollback couldn't get write lock in a reasonable time"; + return 2; + } - Lock::GlobalWrite globalWrite(txn->lockState(), 20000); - if (!globalWrite.isLocked()) { - warning() << "rollback couldn't get write lock in a reasonable time"; - return 2; - } + /** by doing this, we will not service reads (return an error as we aren't in secondary + * state. that perhaps is moot because of the write lock above, but that write lock + * probably gets deferred or removed or yielded later anyway. + * + * also, this is better for status reporting - we know what is happening. + */ + if (!replCoord->setFollowerMode(MemberState::RS_ROLLBACK)) { + warning() << "Cannot transition from " << replCoord->getMemberState() << " to " + << MemberState(MemberState::RS_ROLLBACK); + return 0; + } - /** by doing this, we will not service reads (return an error as we aren't in secondary - * state. that perhaps is moot because of the write lock above, but that write lock - * probably gets deferred or removed or yielded later anyway. - * - * also, this is better for status reporting - we know what is happening. - */ - if (!replCoord->setFollowerMode(MemberState::RS_ROLLBACK)) { - warning() << "Cannot transition from " << replCoord->getMemberState() << - " to " << MemberState(MemberState::RS_ROLLBACK); - return 0; - } + FixUpInfo how; + log() << "rollback 1"; + { + oplogreader->resetCursor(); - FixUpInfo how; - log() << "rollback 1"; - { - oplogreader->resetCursor(); - - log() << "rollback 2 FindCommonPoint"; - try { - StatusWith<FixUpInfo> res = syncRollbackFindCommonPoint(txn, oplogreader->conn()); - if (!res.isOK()) { - switch (res.getStatus().code()) { - case ErrorCodes::OplogStartMissing: - return 1; - default: - throw new RSFatalException(res.getStatus().toString()); - } - } - else { - how = res.getValue(); + log() << "rollback 2 FindCommonPoint"; + try { + StatusWith<FixUpInfo> res = syncRollbackFindCommonPoint(txn, oplogreader->conn()); + if (!res.isOK()) { + switch (res.getStatus().code()) { + case ErrorCodes::OplogStartMissing: + return 1; + default: + throw new RSFatalException(res.getStatus().toString()); } + } else { + how = res.getValue(); } - catch (RSFatalException& e) { - error() << string(e.what()); - fassertFailedNoTrace(18752); - return 2; - } - catch (DBException& e) { - warning() << string("rollback 2 exception ") + e.toString() + "; sleeping 1 min"; + } catch (RSFatalException& e) { + error() << string(e.what()); + fassertFailedNoTrace(18752); + return 2; + } catch (DBException& e) { + warning() << string("rollback 2 exception ") + e.toString() + "; sleeping 1 min"; - // Release the GlobalWrite lock while sleeping. We should always come here with a - // GlobalWrite lock - invariant(txn->lockState()->isW()); - Lock::TempRelease(txn->lockState()); + // Release the GlobalWrite lock while sleeping. 
We should always come here with a + // GlobalWrite lock + invariant(txn->lockState()->isW()); + Lock::TempRelease(txn->lockState()); - sleepsecs(60); - throw; - } + sleepsecs(60); + throw; } + } - log() << "replSet rollback 3 fixup"; + log() << "replSet rollback 3 fixup"; + replCoord->incrementRollbackID(); + try { + syncFixUp(txn, how, oplogreader, replCoord); + } catch (RSFatalException& e) { + error() << "exception during rollback: " << e.what(); + fassertFailedNoTrace(18753); + return 2; + } catch (...) { replCoord->incrementRollbackID(); - try { - syncFixUp(txn, how, oplogreader, replCoord); - } - catch (RSFatalException& e) { - error() << "exception during rollback: " << e.what(); - fassertFailedNoTrace(18753); - return 2; - } - catch (...) { - replCoord->incrementRollbackID(); - - if (!replCoord->setFollowerMode(MemberState::RS_RECOVERING)) { - warning() << "Failed to transition into " << - MemberState(MemberState::RS_RECOVERING) << "; expected to be in state " << - MemberState(MemberState::RS_ROLLBACK) << "but found self in " << - replCoord->getMemberState(); - } - throw; - } - replCoord->incrementRollbackID(); - - // success - leave "ROLLBACK" state - // can go to SECONDARY once minvalid is achieved if (!replCoord->setFollowerMode(MemberState::RS_RECOVERING)) { - warning() << "Failed to transition into " << MemberState(MemberState::RS_RECOVERING) << - "; expected to be in state " << MemberState(MemberState::RS_ROLLBACK) << - "but found self in " << replCoord->getMemberState(); + warning() << "Failed to transition into " << MemberState(MemberState::RS_RECOVERING) + << "; expected to be in state " << MemberState(MemberState::RS_ROLLBACK) + << "but found self in " << replCoord->getMemberState(); } - return 0; + throw; } -} // namespace - - void syncRollback(OperationContext* txn, - OpTime lastOpTimeApplied, - OplogReader* oplogreader, - ReplicationCoordinator* replCoord) { - // check that we are at minvalid, otherwise we cannot rollback as we may be in an - // inconsistent state - { - OpTime minvalid = getMinValid(txn); - if( minvalid > lastOpTimeApplied ) { - severe() << "replSet need to rollback, but in inconsistent state" << endl; - log() << "minvalid: " << minvalid.toString() << " our last optime: " - << lastOpTimeApplied.toString() << endl; - fassertFailedNoTrace(18750); - return; - } + replCoord->incrementRollbackID(); + + // success - leave "ROLLBACK" state + // can go to SECONDARY once minvalid is achieved + if (!replCoord->setFollowerMode(MemberState::RS_RECOVERING)) { + warning() << "Failed to transition into " << MemberState(MemberState::RS_RECOVERING) + << "; expected to be in state " << MemberState(MemberState::RS_ROLLBACK) + << "but found self in " << replCoord->getMemberState(); + } + + return 0; +} +} // namespace + +void syncRollback(OperationContext* txn, + OpTime lastOpTimeApplied, + OplogReader* oplogreader, + ReplicationCoordinator* replCoord) { + // check that we are at minvalid, otherwise we cannot rollback as we may be in an + // inconsistent state + { + OpTime minvalid = getMinValid(txn); + if (minvalid > lastOpTimeApplied) { + severe() << "replSet need to rollback, but in inconsistent state" << endl; + log() << "minvalid: " << minvalid.toString() + << " our last optime: " << lastOpTimeApplied.toString() << endl; + fassertFailedNoTrace(18750); + return; } + } - log() << "beginning rollback" << rsLog; + log() << "beginning rollback" << rsLog; - unsigned s = _syncRollback(txn, oplogreader, replCoord); - if (s) - sleepsecs(s); - - log() << "rollback finished" << 
rsLog; - } + unsigned s = _syncRollback(txn, oplogreader, replCoord); + if (s) + sleepsecs(s); + + log() << "rollback finished" << rsLog; +} -} // namespace repl -} // namespace mongo +} // namespace repl +} // namespace mongo diff --git a/src/mongo/db/repl/rs_rollback.h b/src/mongo/db/repl/rs_rollback.h index 8c5644f816a..955ea5b1199 100644 --- a/src/mongo/db/repl/rs_rollback.h +++ b/src/mongo/db/repl/rs_rollback.h @@ -29,39 +29,38 @@ #pragma once namespace mongo { - class OperationContext; - class OpTime; +class OperationContext; +class OpTime; namespace repl { - class OplogReader; - class ReplicationCoordinator; +class OplogReader; +class ReplicationCoordinator; - /** - * Initiates the rollback process. - * This function assumes the preconditions for undertaking rollback have already been met; - * we have ops in our oplog that our sync source does not have, and we are not currently - * PRIMARY. - * The rollback procedure is: - * - find the common point between this node and its sync source - * - undo operations by fetching all documents affected, then replaying - * the sync source's oplog until we reach the time in the oplog when we fetched the last - * document. - * This function can throw std::exception on failures. - * This function runs a command on the sync source to detect if the sync source rolls back - * while our rollback is in progress. - * - * @param txn Used to read and write from this node's databases - * @param lastOpTimeWritten The last OpTime applied by the applier - * @param oplogreader Must already be connected to a sync source. Used to fetch documents. - * @param replCoord Used to track the rollback ID and to change the follower state - * - * Failures: some failure cases are fatal; others throw std::exception. - */ - - void syncRollback(OperationContext* txn, - OpTime lastOpTimeWritten, - OplogReader* oplogreader, - ReplicationCoordinator* replCoord); +/** + * Initiates the rollback process. + * This function assumes the preconditions for undertaking rollback have already been met; + * we have ops in our oplog that our sync source does not have, and we are not currently + * PRIMARY. + * The rollback procedure is: + * - find the common point between this node and its sync source + * - undo operations by fetching all documents affected, then replaying + * the sync source's oplog until we reach the time in the oplog when we fetched the last + * document. + * This function can throw std::exception on failures. + * This function runs a command on the sync source to detect if the sync source rolls back + * while our rollback is in progress. + * + * @param txn Used to read and write from this node's databases + * @param lastOpTimeWritten The last OpTime applied by the applier + * @param oplogreader Must already be connected to a sync source. Used to fetch documents. + * @param replCoord Used to track the rollback ID and to change the follower state + * + * Failures: some failure cases are fatal; others throw std::exception. 
+ */ +void syncRollback(OperationContext* txn, + OpTime lastOpTimeWritten, + OplogReader* oplogreader, + ReplicationCoordinator* replCoord); } } diff --git a/src/mongo/db/repl/rs_sync.cpp b/src/mongo/db/repl/rs_sync.cpp index 75154edcd20..9d8be3778ce 100644 --- a/src/mongo/db/repl/rs_sync.cpp +++ b/src/mongo/db/repl/rs_sync.cpp @@ -62,89 +62,85 @@ namespace mongo { namespace repl { - void runSyncThread() { - Client::initThread("rsSync"); - cc().getAuthorizationSession()->grantInternalAuthorization(); - ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator(); - - // Set initial indexPrefetch setting - const std::string& prefetch = replCoord->getSettings().rsIndexPrefetch; - if (!prefetch.empty()) { - BackgroundSync::IndexPrefetchConfig prefetchConfig = BackgroundSync::PREFETCH_ALL; - if (prefetch == "none") - prefetchConfig = BackgroundSync::PREFETCH_NONE; - else if (prefetch == "_id_only") - prefetchConfig = BackgroundSync::PREFETCH_ID_ONLY; - else if (prefetch == "all") - prefetchConfig = BackgroundSync::PREFETCH_ALL; - else { - warning() << "unrecognized indexPrefetch setting " << prefetch << ", defaulting " - << "to \"all\""; - } - BackgroundSync::get()->setIndexPrefetchConfig(prefetchConfig); +void runSyncThread() { + Client::initThread("rsSync"); + cc().getAuthorizationSession()->grantInternalAuthorization(); + ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator(); + + // Set initial indexPrefetch setting + const std::string& prefetch = replCoord->getSettings().rsIndexPrefetch; + if (!prefetch.empty()) { + BackgroundSync::IndexPrefetchConfig prefetchConfig = BackgroundSync::PREFETCH_ALL; + if (prefetch == "none") + prefetchConfig = BackgroundSync::PREFETCH_NONE; + else if (prefetch == "_id_only") + prefetchConfig = BackgroundSync::PREFETCH_ID_ONLY; + else if (prefetch == "all") + prefetchConfig = BackgroundSync::PREFETCH_ALL; + else { + warning() << "unrecognized indexPrefetch setting " << prefetch << ", defaulting " + << "to \"all\""; } + BackgroundSync::get()->setIndexPrefetchConfig(prefetchConfig); + } - while (!inShutdown()) { - // After a reconfig, we may not be in the replica set anymore, so - // check that we are in the set (and not an arbiter) before - // trying to sync with other replicas. - // TODO(spencer): Use a condition variable to await loading a config - if (replCoord->getMemberState().startup()) { - log() << "replSet warning did not receive a valid config yet, sleeping 5 seconds "; - sleepsecs(5); - continue; - } + while (!inShutdown()) { + // After a reconfig, we may not be in the replica set anymore, so + // check that we are in the set (and not an arbiter) before + // trying to sync with other replicas. 
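// The indexPrefetch handling above maps the replica-set setting string onto
// an enum, warning and defaulting to "all" on unrecognized values. A compact
// standalone equivalent of that parse:
#include <iostream>
#include <string>

enum IndexPrefetchConfig { PREFETCH_NONE, PREFETCH_ID_ONLY, PREFETCH_ALL };

IndexPrefetchConfig parsePrefetch(const std::string& s) {
    if (s == "none")
        return PREFETCH_NONE;
    if (s == "_id_only")
        return PREFETCH_ID_ONLY;
    if (s != "all")
        std::cerr << "unrecognized indexPrefetch setting " << s
                  << ", defaulting to \"all\"\n";
    return PREFETCH_ALL;
}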
+ // TODO(spencer): Use a condition variable to await loading a config + if (replCoord->getMemberState().startup()) { + log() << "replSet warning did not receive a valid config yet, sleeping 5 seconds "; + sleepsecs(5); + continue; + } - const MemberState memberState = replCoord->getMemberState(); + const MemberState memberState = replCoord->getMemberState(); - // An arbiter can never transition to any other state, and doesn't replicate, ever - if (memberState.arbiter()) { - break; - } + // An arbiter can never transition to any other state, and doesn't replicate, ever + if (memberState.arbiter()) { + break; + } + + // If we are removed then we don't belong to the set anymore + if (memberState.removed()) { + sleepsecs(5); + continue; + } - // If we are removed then we don't belong to the set anymore - if (memberState.removed()) { - sleepsecs(5); + try { + if (memberState.primary() && !replCoord->isWaitingForApplierToDrain()) { + sleepsecs(1); continue; } - try { - - if (memberState.primary() && !replCoord->isWaitingForApplierToDrain()) { - sleepsecs(1); - continue; - } - - bool initialSyncRequested = BackgroundSync::get()->getInitialSyncRequestedFlag(); - // Check criteria for doing an initial sync: - // 1. If the oplog is empty, do an initial sync - // 2. If minValid has _initialSyncFlag set, do an initial sync - // 3. If initialSyncRequested is true - if (getGlobalReplicationCoordinator()->getMyLastOptime().isNull() || - getInitialSyncFlag() || - initialSyncRequested) { - syncDoInitialSync(); - continue; // start from top again in case sync failed. - } - if (!replCoord->setFollowerMode(MemberState::RS_RECOVERING)) { - continue; - } - - /* we have some data. continue tailing. */ - SyncTail tail(BackgroundSync::get(), multiSyncApply); - tail.oplogApplication(); + bool initialSyncRequested = BackgroundSync::get()->getInitialSyncRequestedFlag(); + // Check criteria for doing an initial sync: + // 1. If the oplog is empty, do an initial sync + // 2. If minValid has _initialSyncFlag set, do an initial sync + // 3. If initialSyncRequested is true + if (getGlobalReplicationCoordinator()->getMyLastOptime().isNull() || + getInitialSyncFlag() || initialSyncRequested) { + syncDoInitialSync(); + continue; // start from top again in case sync failed. } - catch(const DBException& e) { - log() << "Received exception while syncing: " << e.toString(); - sleepsecs(10); - } - catch(const std::exception& e) { - log() << "Received exception while syncing: " << e.what(); - sleepsecs(10); + if (!replCoord->setFollowerMode(MemberState::RS_RECOVERING)) { + continue; } + + /* we have some data. continue tailing. */ + SyncTail tail(BackgroundSync::get(), multiSyncApply); + tail.oplogApplication(); + } catch (const DBException& e) { + log() << "Received exception while syncing: " << e.toString(); + sleepsecs(10); + } catch (const std::exception& e) { + log() << "Received exception while syncing: " << e.what(); + sleepsecs(10); } - cc().shutdown(); } + cc().shutdown(); +} -} // namespace repl -} // namespace mongo +} // namespace repl +} // namespace mongo diff --git a/src/mongo/db/repl/rs_sync.h b/src/mongo/db/repl/rs_sync.h index a031f70c611..ad5d9be88a5 100644 --- a/src/mongo/db/repl/rs_sync.h +++ b/src/mongo/db/repl/rs_sync.h @@ -41,8 +41,8 @@ namespace mongo { namespace repl { - // Body of the thread that will do the background sync. - void runSyncThread(); +// Body of the thread that will do the background sync. 
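// runSyncThread above is a supervisor loop: wait out startup and removed
// states, exit permanently for arbiters, enter initial sync when the oplog
// is empty, the initial-sync flag is set, or a sync was requested, otherwise
// tail the oplog, and back off ten seconds on any exception. A standalone
// skeleton of that loop; Node is a hypothetical stand-in for the coordinator
// and background-sync plumbing:
#include <chrono>
#include <exception>
#include <thread>

enum class MemberState { Startup, Arbiter, Removed, Other };

template <typename Node>
void runSyncLoop(Node& node) {
    while (!node.inShutdown()) {
        const MemberState s = node.memberState();
        if (s == MemberState::Startup || s == MemberState::Removed) {
            std::this_thread::sleep_for(std::chrono::seconds(5));  // await a config
            continue;
        }
        if (s == MemberState::Arbiter)
            break;  // arbiters never replicate
        try {
            if (node.oplogEmpty() || node.initialSyncFlag() || node.initialSyncRequested()) {
                node.doInitialSync();  // then re-evaluate everything from the top
                continue;
            }
            node.tailOplog();  // steady-state: apply ops as they arrive
        } catch (const std::exception&) {
            std::this_thread::sleep_for(std::chrono::seconds(10));  // back off, retry
        }
    }
}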
+void runSyncThread(); -} // namespace repl -} // namespace mongo +} // namespace repl +} // namespace mongo diff --git a/src/mongo/db/repl/rslog.cpp b/src/mongo/db/repl/rslog.cpp index 9a02f64ce27..f48d6399847 100644 --- a/src/mongo/db/repl/rslog.cpp +++ b/src/mongo/db/repl/rslog.cpp @@ -36,12 +36,12 @@ namespace mongo { namespace repl { - static RamLog* _rsLog = RamLog::get("rs"); - logger::Tee* rsLog = _rsLog; +static RamLog* _rsLog = RamLog::get("rs"); +logger::Tee* rsLog = _rsLog; - void fillRsLog(std::stringstream* s) { - _rsLog->toHTML(*s); - } +void fillRsLog(std::stringstream* s) { + _rsLog->toHTML(*s); +} -} // namespace repl -} // namespace mongo +} // namespace repl +} // namespace mongo diff --git a/src/mongo/db/repl/rslog.h b/src/mongo/db/repl/rslog.h index 7a6624c876d..5b0b694d7bf 100644 --- a/src/mongo/db/repl/rslog.h +++ b/src/mongo/db/repl/rslog.h @@ -33,15 +33,15 @@ namespace mongo { namespace logger { - class Tee; -} // namespace logger +class Tee; +} // namespace logger namespace repl { - void fillRsLog(std::stringstream* s); +void fillRsLog(std::stringstream* s); - // ramlog used for replSet actions - extern logger::Tee* rsLog; +// ramlog used for replSet actions +extern logger::Tee* rsLog; -} // namespace repl -} // namespace mongo +} // namespace repl +} // namespace mongo diff --git a/src/mongo/db/repl/scatter_gather_algorithm.cpp b/src/mongo/db/repl/scatter_gather_algorithm.cpp index 5e65a8f2df1..78fc22fa38f 100644 --- a/src/mongo/db/repl/scatter_gather_algorithm.cpp +++ b/src/mongo/db/repl/scatter_gather_algorithm.cpp @@ -33,7 +33,7 @@ namespace mongo { namespace repl { - ScatterGatherAlgorithm::~ScatterGatherAlgorithm() {} +ScatterGatherAlgorithm::~ScatterGatherAlgorithm() {} } // namespace repl } // namespace mongo diff --git a/src/mongo/db/repl/scatter_gather_algorithm.h b/src/mongo/db/repl/scatter_gather_algorithm.h index 7622f0e385e..22d05c1f3a4 100644 --- a/src/mongo/db/repl/scatter_gather_algorithm.h +++ b/src/mongo/db/repl/scatter_gather_algorithm.h @@ -34,47 +34,47 @@ namespace mongo { - template <typename T> class StatusWith; +template <typename T> +class StatusWith; namespace repl { +/** + * Interface for a specialization of a scatter-gather algorithm that sends + * requests to a set of targets, and then processes responses until it has + * seen enough. + * + * To use, call getRequests() to get a vector of request objects describing network operations. + * Start performing the network operations in any order, and then, until + * hasReceivedSufficientResponses() returns true, call processResponse for each response as it + * arrives. Once hasReceivedSufficientResponses() you may cancel outstanding network + * operations, and must stop calling processResponse. Implementations of this interface may + * assume that processResponse() is never called after hasReceivedSufficientResponses() returns + * true. + */ +class ScatterGatherAlgorithm { +public: /** - * Interface for a specialization of a scatter-gather algorithm that sends - * requests to a set of targets, and then processes responses until it has - * seen enough. - * - * To use, call getRequests() to get a vector of request objects describing network operations. - * Start performing the network operations in any order, and then, until - * hasReceivedSufficientResponses() returns true, call processResponse for each response as it - * arrives. Once hasReceivedSufficientResponses() you may cancel outstanding network - * operations, and must stop calling processResponse. 
Implementations of this interface may - * assume that processResponse() is never called after hasReceivedSufficientResponses() returns - * true. + * Returns the list of requests that should be sent. */ - class ScatterGatherAlgorithm { - public: - /** - * Returns the list of requests that should be sent. - */ - virtual std::vector<ReplicationExecutor::RemoteCommandRequest> getRequests() const = 0; + virtual std::vector<ReplicationExecutor::RemoteCommandRequest> getRequests() const = 0; - /** - * Method to call once for each received response. - */ - virtual void processResponse( - const ReplicationExecutor::RemoteCommandRequest& request, - const ResponseStatus& response) = 0; + /** + * Method to call once for each received response. + */ + virtual void processResponse(const ReplicationExecutor::RemoteCommandRequest& request, + const ResponseStatus& response) = 0; - /** - * Returns true if no more calls to processResponse are needed to consider the - * algorithm complete. Once this method returns true, one should no longer - * call processResponse. - */ - virtual bool hasReceivedSufficientResponses() const = 0; + /** + * Returns true if no more calls to processResponse are needed to consider the + * algorithm complete. Once this method returns true, one should no longer + * call processResponse. + */ + virtual bool hasReceivedSufficientResponses() const = 0; - protected: - virtual ~ScatterGatherAlgorithm(); // Shouldn't actually be virtual. - }; +protected: + virtual ~ScatterGatherAlgorithm(); // Shouldn't actually be virtual. +}; } // namespace repl } // namespace mongo diff --git a/src/mongo/db/repl/scatter_gather_runner.cpp b/src/mongo/db/repl/scatter_gather_runner.cpp index ce2d8a7dbb9..15934106f59 100644 --- a/src/mongo/db/repl/scatter_gather_runner.cpp +++ b/src/mongo/db/repl/scatter_gather_runner.cpp @@ -41,121 +41,105 @@ namespace mongo { namespace repl { - ScatterGatherRunner::ScatterGatherRunner(ScatterGatherAlgorithm* algorithm) : - _algorithm(algorithm), - _started(false) { +ScatterGatherRunner::ScatterGatherRunner(ScatterGatherAlgorithm* algorithm) + : _algorithm(algorithm), _started(false) {} + +ScatterGatherRunner::~ScatterGatherRunner() {} + +static void startTrampoline(const ReplicationExecutor::CallbackData& cbData, + ScatterGatherRunner* runner, + StatusWith<ReplicationExecutor::EventHandle>* result) { + *result = runner->start(cbData.executor); +} + +Status ScatterGatherRunner::run(ReplicationExecutor* executor) { + StatusWith<ReplicationExecutor::EventHandle> finishEvh(ErrorCodes::InternalError, "Not set"); + StatusWith<ReplicationExecutor::CallbackHandle> startCBH = executor->scheduleWork( + stdx::bind(startTrampoline, stdx::placeholders::_1, this, &finishEvh)); + if (!startCBH.isOK()) { + return startCBH.getStatus(); } - - ScatterGatherRunner::~ScatterGatherRunner() { + executor->wait(startCBH.getValue()); + if (!finishEvh.isOK()) { + return finishEvh.getStatus(); } - - static void startTrampoline(const ReplicationExecutor::CallbackData& cbData, - ScatterGatherRunner* runner, - StatusWith<ReplicationExecutor::EventHandle>* result) { - - *result = runner->start(cbData.executor); - } - - Status ScatterGatherRunner::run(ReplicationExecutor* executor) { - StatusWith<ReplicationExecutor::EventHandle> finishEvh(ErrorCodes::InternalError, - "Not set"); - StatusWith<ReplicationExecutor::CallbackHandle> startCBH = executor->scheduleWork( - stdx::bind(startTrampoline, stdx::placeholders::_1, this, &finishEvh)); - if (!startCBH.isOK()) { - return startCBH.getStatus(); - } - 
executor->wait(startCBH.getValue()); - if (!finishEvh.isOK()) { - return finishEvh.getStatus(); - } - executor->waitForEvent(finishEvh.getValue()); - return Status::OK(); + executor->waitForEvent(finishEvh.getValue()); + return Status::OK(); +} + +StatusWith<ReplicationExecutor::EventHandle> ScatterGatherRunner::start( + ReplicationExecutor* executor, const stdx::function<void()>& onCompletion) { + invariant(!_started); + _started = true; + _actualResponses = 0; + _onCompletion = onCompletion; + StatusWith<ReplicationExecutor::EventHandle> evh = executor->makeEvent(); + if (!evh.isOK()) { + return evh; } - - StatusWith<ReplicationExecutor::EventHandle> ScatterGatherRunner::start( - ReplicationExecutor* executor, - const stdx::function<void ()>& onCompletion) { - - invariant(!_started); - _started = true; - _actualResponses = 0; - _onCompletion = onCompletion; - StatusWith<ReplicationExecutor::EventHandle> evh = executor->makeEvent(); - if (!evh.isOK()) { - return evh; - } - _sufficientResponsesReceived = evh.getValue(); - ScopeGuard earlyReturnGuard = MakeGuard( - &ScatterGatherRunner::_signalSufficientResponsesReceived, - this, - executor); - - const ReplicationExecutor::RemoteCommandCallbackFn cb = stdx::bind( - &ScatterGatherRunner::_processResponse, - stdx::placeholders::_1, - this); - - std::vector<ReplicationExecutor::RemoteCommandRequest> requests = _algorithm->getRequests(); - for (size_t i = 0; i < requests.size(); ++i) { - const StatusWith<ReplicationExecutor::CallbackHandle> cbh = - executor->scheduleRemoteCommand(requests[i], cb); - if (cbh.getStatus() == ErrorCodes::ShutdownInProgress) { - return StatusWith<ReplicationExecutor::EventHandle>(cbh.getStatus()); - } - fassert(18743, cbh.getStatus()); - _callbacks.push_back(cbh.getValue()); + _sufficientResponsesReceived = evh.getValue(); + ScopeGuard earlyReturnGuard = + MakeGuard(&ScatterGatherRunner::_signalSufficientResponsesReceived, this, executor); + + const ReplicationExecutor::RemoteCommandCallbackFn cb = + stdx::bind(&ScatterGatherRunner::_processResponse, stdx::placeholders::_1, this); + + std::vector<ReplicationExecutor::RemoteCommandRequest> requests = _algorithm->getRequests(); + for (size_t i = 0; i < requests.size(); ++i) { + const StatusWith<ReplicationExecutor::CallbackHandle> cbh = + executor->scheduleRemoteCommand(requests[i], cb); + if (cbh.getStatus() == ErrorCodes::ShutdownInProgress) { + return StatusWith<ReplicationExecutor::EventHandle>(cbh.getStatus()); } - - if (_callbacks.empty() || _algorithm->hasReceivedSufficientResponses()) { - invariant(_algorithm->hasReceivedSufficientResponses()); - _signalSufficientResponsesReceived(executor); - } - - earlyReturnGuard.Dismiss(); - return evh; + fassert(18743, cbh.getStatus()); + _callbacks.push_back(cbh.getValue()); } - void ScatterGatherRunner::cancel(ReplicationExecutor* executor) { - invariant(_started); + if (_callbacks.empty() || _algorithm->hasReceivedSufficientResponses()) { + invariant(_algorithm->hasReceivedSufficientResponses()); _signalSufficientResponsesReceived(executor); } - void ScatterGatherRunner::_processResponse( - const ReplicationExecutor::RemoteCommandCallbackData& cbData, - ScatterGatherRunner* runner) { - - // It is possible that the ScatterGatherRunner has already gone out of scope, if the - // response indicates the callback was canceled. In that case, do not access any members - // of "runner" and return immediately. 
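// The runner above drives a ScatterGatherAlgorithm through the contract
// documented in scatter_gather_algorithm.h: send every request returned by
// getRequests(), feed each response to processResponse() until
// hasReceivedSufficientResponses() turns true, and never deliver a response
// after that. A synchronous, single-threaded sketch of that contract;
// Request, Response, and the Sender callable are simplified stand-ins for
// the executor's remote-command machinery:
#include <cstddef>
#include <string>
#include <vector>

struct Request {
    std::string target;
};
struct Response {
    bool ok;
    std::string data;
};

class Algorithm {  // mirrors the ScatterGatherAlgorithm interface
public:
    virtual std::vector<Request> getRequests() const = 0;
    virtual void processResponse(const Request& req, const Response& resp) = 0;
    virtual bool hasReceivedSufficientResponses() const = 0;
    virtual ~Algorithm() {}
};

template <typename Sender>
void runScatterGather(Algorithm& algo, Sender send) {
    const std::vector<Request> requests = algo.getRequests();
    for (std::size_t i = 0; i < requests.size(); ++i) {
        if (algo.hasReceivedSufficientResponses())
            return;  // the real runner cancels the still-outstanding commands
        algo.processResponse(requests[i], send(requests[i]));
    }
}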
- if (cbData.response.getStatus() == ErrorCodes::CallbackCanceled) { - return; - } - - ++runner->_actualResponses; - runner->_algorithm->processResponse(cbData.request, cbData.response); - if (runner->_algorithm->hasReceivedSufficientResponses()) { - runner->_signalSufficientResponsesReceived(cbData.executor); - } - else { - invariant(runner->_actualResponses < runner->_callbacks.size()); - } + earlyReturnGuard.Dismiss(); + return evh; +} + +void ScatterGatherRunner::cancel(ReplicationExecutor* executor) { + invariant(_started); + _signalSufficientResponsesReceived(executor); +} + +void ScatterGatherRunner::_processResponse( + const ReplicationExecutor::RemoteCommandCallbackData& cbData, ScatterGatherRunner* runner) { + // It is possible that the ScatterGatherRunner has already gone out of scope, if the + // response indicates the callback was canceled. In that case, do not access any members + // of "runner" and return immediately. + if (cbData.response.getStatus() == ErrorCodes::CallbackCanceled) { + return; } - void ScatterGatherRunner::_signalSufficientResponsesReceived(ReplicationExecutor* executor) { - if (_sufficientResponsesReceived.isValid()) { - std::for_each(_callbacks.begin(), - _callbacks.end(), - stdx::bind(&ReplicationExecutor::cancel, - executor, - stdx::placeholders::_1)); - const ReplicationExecutor::EventHandle h = _sufficientResponsesReceived; - _sufficientResponsesReceived = ReplicationExecutor::EventHandle(); - if (_onCompletion) { - _onCompletion(); - } - executor->signalEvent(h); + ++runner->_actualResponses; + runner->_algorithm->processResponse(cbData.request, cbData.response); + if (runner->_algorithm->hasReceivedSufficientResponses()) { + runner->_signalSufficientResponsesReceived(cbData.executor); + } else { + invariant(runner->_actualResponses < runner->_callbacks.size()); + } +} + +void ScatterGatherRunner::_signalSufficientResponsesReceived(ReplicationExecutor* executor) { + if (_sufficientResponsesReceived.isValid()) { + std::for_each(_callbacks.begin(), + _callbacks.end(), + stdx::bind(&ReplicationExecutor::cancel, executor, stdx::placeholders::_1)); + const ReplicationExecutor::EventHandle h = _sufficientResponsesReceived; + _sufficientResponsesReceived = ReplicationExecutor::EventHandle(); + if (_onCompletion) { + _onCompletion(); } + executor->signalEvent(h); } +} } // namespace repl } // namespace mongo diff --git a/src/mongo/db/repl/scatter_gather_runner.h b/src/mongo/db/repl/scatter_gather_runner.h index 63921a91596..551db7a2e7a 100644 --- a/src/mongo/db/repl/scatter_gather_runner.h +++ b/src/mongo/db/repl/scatter_gather_runner.h @@ -36,88 +36,90 @@ namespace mongo { - template <typename T> class StatusWith; +template <typename T> +class StatusWith; namespace repl { - class ScatterGatherAlgorithm; +class ScatterGatherAlgorithm; + +/** + * Implementation of a scatter-gather behavior using a ReplicationExecutor. + */ +class ScatterGatherRunner { + MONGO_DISALLOW_COPYING(ScatterGatherRunner); + +public: + /** + * Constructs a new runner whose underlying algorithm is "algorithm". + * + * "algorithm" must remain in scope until the runner's destructor completes. + */ + explicit ScatterGatherRunner(ScatterGatherAlgorithm* algorithm); + + ~ScatterGatherRunner(); + + /** + * Runs the scatter-gather process using "executor", and blocks until it completes. + * + * Must _not_ be run from inside the executor context. 
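+     *
+     * A minimal usage sketch (editor's addition, not part of this change; "MyAlgorithm"
+     * is a hypothetical ScatterGatherAlgorithm subclass):
+     *
+     *     MyAlgorithm algorithm;                   // supplies getRequests()/processResponse()
+     *     ScatterGatherRunner runner(&algorithm);  // "algorithm" must outlive the runner
+     *     Status status = runner.run(&executor);   // blocks until responses are sufficient
+     *     // on Status::OK(), it is safe to examine state on "algorithm"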
+     *
+     * Returns ErrorCodes::ShutdownInProgress if the executor enters or is already in
+     * the shutdown state before run() can schedule execution of the scatter-gather
+     * in the executor. Note that if the executor is shut down after the algorithm
+     * is scheduled but before it completes, this method will return Status::OK(),
+     * just as it does when it runs successfully to completion.
+     */
+    Status run(ReplicationExecutor* executor);
+
+    /**
+     * Starts executing the scatter-gather process using "executor".
+     *
+     * On success, returns an event handle that will be signaled when the runner has
+     * finished executing the scatter-gather process. After that event has been
+     * signaled, it is safe for the caller to examine any state on "algorithm".
+     *
+     * This method must be called inside the executor context.
+     *
+     * onCompletion is an optional callback that will be executed in executor context
+     * immediately prior to signaling the event handle returned here. It must never
+     * throw exceptions. It may examine the state of the algorithm object.
+     *
+     * NOTE: If the executor starts to shut down before onCompletion executes, onCompletion may
+     * never execute, even though the returned event will eventually be signaled.
+     */
+    StatusWith<ReplicationExecutor::EventHandle> start(
+        ReplicationExecutor* executor,
+        const stdx::function<void()>& onCompletion = stdx::function<void()>());
+
+    /**
+     * Informs the runner to cancel further processing. The "executor" argument
+     * must point to the same executor passed to "start()".
+     *
+     * Like start, this method must be called from within the executor context.
+     */
+    void cancel(ReplicationExecutor* executor);
+
+private:
+    /**
+     * Callback invoked once for every response from the network.
+     */
+    static void _processResponse(const ReplicationExecutor::RemoteCommandCallbackData& cbData,
+                                 ScatterGatherRunner* runner);

    /**
-     * Implementation of a scatter-gather behavior using a ReplicationExecutor.
+     * Method that performs all actions required when _algorithm indicates a sufficient
+     * number of responses have been received.
     */
-    class ScatterGatherRunner {
-        MONGO_DISALLOW_COPYING(ScatterGatherRunner);
-    public:
-        /**
-         * Constructs a new runner whose underlying algorithm is "algorithm".
-         *
-         * "algorithm" must remain in scope until the runner's destructor completes.
-         */
-        explicit ScatterGatherRunner(ScatterGatherAlgorithm* algorithm);
-
-        ~ScatterGatherRunner();
-
-        /**
-         * Runs the scatter-gather process using "executor", and blocks until it completes.
-         *
-         * Must _not_ be run from inside the executor context.
-         *
-         * Returns ErrorCodes::ShutdownInProgress if the executor enters or is already in
-         * the shutdown state before run() can schedule execution of the scatter-gather
-         * in the executor. Note that if the executor is shut down after the algorithm
-         * is scheduled but before it completes, this method will return Status::OK(),
-         * just as it does when it runs successfully to completion.
-         */
-        Status run(ReplicationExecutor* executor);
-
-        /**
-         * Starts executing the scatter-gather process using "executor".
-         *
-         * On success, returns an event handle that will be signaled when the runner has
-         * finished executing the scatter-gather process. After that event has been
-         * signaled, it is safe for the caller to examine any state on "algorithm".
-         *
-         * This method must be called inside the executor context.
- * - * onCompletion is an optional callback that will be executed in executor context - * immediately prior to signaling the event handle returned here. It must never - * throw exceptions. It may examine the state of the algorithm object. - * - * NOTE: If the executor starts to shut down before onCompletion executes, onCompletion may - * never execute, even though the returned event will eventually be signaled. - */ - StatusWith<ReplicationExecutor::EventHandle> start( - ReplicationExecutor* executor, - const stdx::function<void ()>& onCompletion = stdx::function<void ()>()); - - /** - * Informs the runner to cancel further processing. The "executor" argument - * must point to the same executor passed to "start()". - * - * Like start, this method must be called from within the executor context. - */ - void cancel(ReplicationExecutor* executor); - - private: - /** - * Callback invoked once for every response from the network. - */ - static void _processResponse(const ReplicationExecutor::RemoteCommandCallbackData& cbData, - ScatterGatherRunner* runner); - - /** - * Method that performs all actions required when _algorithm indicates a sufficient - * number of respones have been received. - */ - void _signalSufficientResponsesReceived(ReplicationExecutor* executor); - - ScatterGatherAlgorithm* _algorithm; - stdx::function<void ()> _onCompletion; - ReplicationExecutor::EventHandle _sufficientResponsesReceived; - std::vector<ReplicationExecutor::CallbackHandle> _callbacks; - size_t _actualResponses; - bool _started; - }; + void _signalSufficientResponsesReceived(ReplicationExecutor* executor); + + ScatterGatherAlgorithm* _algorithm; + stdx::function<void()> _onCompletion; + ReplicationExecutor::EventHandle _sufficientResponsesReceived; + std::vector<ReplicationExecutor::CallbackHandle> _callbacks; + size_t _actualResponses; + bool _started; +}; } // namespace repl } // namespace mongo diff --git a/src/mongo/db/repl/scatter_gather_test.cpp b/src/mongo/db/repl/scatter_gather_test.cpp index 270aa53a499..303ce841211 100644 --- a/src/mongo/db/repl/scatter_gather_test.cpp +++ b/src/mongo/db/repl/scatter_gather_test.cpp @@ -41,305 +41,292 @@ namespace mongo { namespace repl { namespace { - /** - * Algorithm for testing the ScatterGatherRunner, which will finish running when finish() is - * called, or upon receiving responses from two nodes. Creates a three requests algorithm - * simulating running an algorithm against three other nodes. 
- */ - class ScatterGatherTestAlgorithm : public ScatterGatherAlgorithm { - public: - ScatterGatherTestAlgorithm(int64_t maxResponses = 2) : - _done(false), - _numResponses(0), - _maxResponses(maxResponses) {} - - virtual std::vector<ReplicationExecutor::RemoteCommandRequest> getRequests() const { - std::vector<ReplicationExecutor::RemoteCommandRequest> requests; - for (int i = 0; i < 3; i++) { - requests.push_back(ReplicationExecutor::RemoteCommandRequest( - HostAndPort("hostname", i), - "admin", - BSONObj(), - Milliseconds(30*1000))); - } - return requests; - } - - virtual void processResponse( - const ReplicationExecutor::RemoteCommandRequest& request, - const ResponseStatus& response) { - _numResponses++; - } - - void finish() { - _done = true; - } - - virtual bool hasReceivedSufficientResponses() const { - if (_done) { - return _done; - } - - return _numResponses >= _maxResponses; - } - - int getResponseCount() { - return _numResponses; +/** + * Algorithm for testing the ScatterGatherRunner, which will finish running when finish() is + * called, or upon receiving responses from two nodes. Creates a three requests algorithm + * simulating running an algorithm against three other nodes. + */ +class ScatterGatherTestAlgorithm : public ScatterGatherAlgorithm { +public: + ScatterGatherTestAlgorithm(int64_t maxResponses = 2) + : _done(false), _numResponses(0), _maxResponses(maxResponses) {} + + virtual std::vector<ReplicationExecutor::RemoteCommandRequest> getRequests() const { + std::vector<ReplicationExecutor::RemoteCommandRequest> requests; + for (int i = 0; i < 3; i++) { + requests.push_back(ReplicationExecutor::RemoteCommandRequest( + HostAndPort("hostname", i), "admin", BSONObj(), Milliseconds(30 * 1000))); } - - private: - - bool _done; - int64_t _numResponses; - int64_t _maxResponses; - }; - - /** - * ScatterGatherTest base class which sets up the ReplicationExecutor and NetworkInterfaceMock. - */ - class ScatterGatherTest : public mongo::unittest::Test { - protected: - - NetworkInterfaceMock* getNet() { return _net; } - ReplicationExecutor* getExecutor() { return _executor.get(); } - - int64_t countLogLinesContaining(const std::string& needle); - private: - - void setUp(); - void tearDown(); - - // owned by _executor - NetworkInterfaceMock* _net; - boost::scoped_ptr<ReplicationExecutor> _executor; - boost::scoped_ptr<boost::thread> _executorThread; - }; - - void ScatterGatherTest::setUp() { - _net = new NetworkInterfaceMock; - _executor.reset(new ReplicationExecutor(_net, 1 /* prng seed */)); - _executorThread.reset(new boost::thread(stdx::bind(&ReplicationExecutor::run, - _executor.get()))); + return requests; } - void ScatterGatherTest::tearDown() { - _executor->shutdown(); - _executorThread->join(); + virtual void processResponse(const ReplicationExecutor::RemoteCommandRequest& request, + const ResponseStatus& response) { + _numResponses++; } + void finish() { + _done = true; + } - // Used to run a ScatterGatherRunner in a separate thread, to avoid blocking test execution. 
- class ScatterGatherRunnerRunner { - public: - - ScatterGatherRunnerRunner(ScatterGatherRunner* sgr, ReplicationExecutor* executor) : - _sgr(sgr), - _executor(executor), - _result(Status(ErrorCodes::BadValue, "failed to set status")) {} - - // Could block if _sgr has not finished - Status getResult() { - _thread->join(); - return _result; - } - - void run() { - _thread.reset(new boost::thread(stdx::bind(&ScatterGatherRunnerRunner::_run, - this, - _executor))); - } - - private: - - void _run(ReplicationExecutor* executor) { - _result = _sgr->run(_executor); + virtual bool hasReceivedSufficientResponses() const { + if (_done) { + return _done; } - ScatterGatherRunner* _sgr; - ReplicationExecutor* _executor; - Status _result; - boost::scoped_ptr<boost::thread> _thread; - }; - - // Simple onCompletion function which will toggle a bool, so that we can check the logs to - // ensure the onCompletion function ran when expected. - void onCompletionTestFunction(bool* ran) { - *ran = true; + return _numResponses >= _maxResponses; } - // Confirm that running via start() will finish and run the onComplete function once sufficient - // responses have been received. - // Confirm that deleting both the ScatterGatherTestAlgorithm and ScatterGatherRunner while - // scheduled callbacks still exist will not be unsafe (ASAN builder) after the algorithm has - // completed. - TEST_F(ScatterGatherTest, DeleteAlgorithmAfterItHasCompleted) { - ScatterGatherTestAlgorithm* sga = new ScatterGatherTestAlgorithm(); - ScatterGatherRunner* sgr = new ScatterGatherRunner(sga); - bool ranCompletion = false; - StatusWith<ReplicationExecutor::EventHandle> status = sgr->start(getExecutor(), - stdx::bind(&onCompletionTestFunction, &ranCompletion)); - ASSERT_OK(status.getStatus()); - ASSERT_FALSE(ranCompletion); - - NetworkInterfaceMock* net = getNet(); - net->enterNetwork(); - NetworkInterfaceMock::NetworkOperationIterator noi = net->getNextReadyRequest(); - net->scheduleResponse(noi, - net->now()+2000, - ResponseStatus(ReplicationExecutor::RemoteCommandResponse( - BSON("ok" << 1), - boost::posix_time::milliseconds(10)))); - ASSERT_FALSE(ranCompletion); - - noi = net->getNextReadyRequest(); - net->scheduleResponse(noi, - net->now()+2000, - ResponseStatus(ReplicationExecutor::RemoteCommandResponse( - BSON("ok" << 1), - boost::posix_time::milliseconds(10)))); - ASSERT_FALSE(ranCompletion); - - noi = net->getNextReadyRequest(); - net->scheduleResponse(noi, - net->now()+5000, - ResponseStatus(ReplicationExecutor::RemoteCommandResponse( - BSON("ok" << 1), - boost::posix_time::milliseconds(10)))); - ASSERT_FALSE(ranCompletion); - - net->runUntil(net->now()+2000); - ASSERT_TRUE(ranCompletion); - - delete sga; - delete sgr; - - net->runReadyNetworkOperations(); - - net->exitNetwork(); + int getResponseCount() { + return _numResponses; } - // Confirm that shutting the ReplicationExecutor down before calling run() will cause run() - // to return ErrorCodes::ShutdownInProgress. - TEST_F(ScatterGatherTest, ShutdownExecutorBeforeRun) { - ScatterGatherTestAlgorithm sga; - ScatterGatherRunner sgr(&sga); - getExecutor()->shutdown(); - sga.finish(); - Status status = sgr.run(getExecutor()); - ASSERT_EQUALS(ErrorCodes::ShutdownInProgress, status); - } +private: + bool _done; + int64_t _numResponses; + int64_t _maxResponses; +}; - // Confirm that shutting the ReplicationExecutor down after calling run(), but before run() - // finishes will cause run() to return Status::OK(). 
- TEST_F(ScatterGatherTest, ShutdownExecutorAfterRun) { - ScatterGatherTestAlgorithm sga; - ScatterGatherRunner sgr(&sga); - ScatterGatherRunnerRunner sgrr(&sgr, getExecutor()); - sgrr.run(); - // need to wait for the scatter-gather to be scheduled in the executor - NetworkInterfaceMock* net = getNet(); - net->enterNetwork(); - NetworkInterfaceMock::NetworkOperationIterator noi = net->getNextReadyRequest(); - net->blackHole(noi); - net->exitNetwork(); - getExecutor()->shutdown(); - Status status = sgrr.getResult(); - ASSERT_OK(status); +/** + * ScatterGatherTest base class which sets up the ReplicationExecutor and NetworkInterfaceMock. + */ +class ScatterGatherTest : public mongo::unittest::Test { +protected: + NetworkInterfaceMock* getNet() { + return _net; } - - // Confirm that shutting the ReplicationExecutor down before calling start() will cause start() - // to return ErrorCodes::ShutdownInProgress and should not run onCompletion(). - TEST_F(ScatterGatherTest, ShutdownExecutorBeforeStart) { - ScatterGatherTestAlgorithm sga; - ScatterGatherRunner sgr(&sga); - getExecutor()->shutdown(); - bool ranCompletion = false; - StatusWith<ReplicationExecutor::EventHandle> status = sgr.start(getExecutor(), - stdx::bind(&onCompletionTestFunction, &ranCompletion)); - sga.finish(); - ASSERT_FALSE(ranCompletion); - ASSERT_EQUALS(ErrorCodes::ShutdownInProgress, status.getStatus()); + ReplicationExecutor* getExecutor() { + return _executor.get(); } - // Confirm that shutting the ReplicationExecutor down after calling start() will cause start() - // to return Status::OK and should not run onCompletion(). - TEST_F(ScatterGatherTest, ShutdownExecutorAfterStart) { - ScatterGatherTestAlgorithm sga; - ScatterGatherRunner sgr(&sga); - bool ranCompletion = false; - StatusWith<ReplicationExecutor::EventHandle> status = sgr.start(getExecutor(), - stdx::bind(&onCompletionTestFunction, &ranCompletion)); - getExecutor()->shutdown(); - sga.finish(); - ASSERT_FALSE(ranCompletion); - ASSERT_OK(status.getStatus()); + int64_t countLogLinesContaining(const std::string& needle); + +private: + void setUp(); + void tearDown(); + + // owned by _executor + NetworkInterfaceMock* _net; + boost::scoped_ptr<ReplicationExecutor> _executor; + boost::scoped_ptr<boost::thread> _executorThread; +}; + +void ScatterGatherTest::setUp() { + _net = new NetworkInterfaceMock; + _executor.reset(new ReplicationExecutor(_net, 1 /* prng seed */)); + _executorThread.reset( + new boost::thread(stdx::bind(&ReplicationExecutor::run, _executor.get()))); +} + +void ScatterGatherTest::tearDown() { + _executor->shutdown(); + _executorThread->join(); +} + + +// Used to run a ScatterGatherRunner in a separate thread, to avoid blocking test execution. +class ScatterGatherRunnerRunner { +public: + ScatterGatherRunnerRunner(ScatterGatherRunner* sgr, ReplicationExecutor* executor) + : _sgr(sgr), + _executor(executor), + _result(Status(ErrorCodes::BadValue, "failed to set status")) {} + + // Could block if _sgr has not finished + Status getResult() { + _thread->join(); + return _result; } - // Confirm that responses are not processed once sufficient responses have been received. 
- TEST_F(ScatterGatherTest, DoNotProcessMoreThanSufficientResponses) { - ScatterGatherTestAlgorithm sga; - ScatterGatherRunner sgr(&sga); - bool ranCompletion = false; - StatusWith<ReplicationExecutor::EventHandle> status = sgr.start(getExecutor(), - stdx::bind(&onCompletionTestFunction, &ranCompletion)); - ASSERT_OK(status.getStatus()); - ASSERT_FALSE(ranCompletion); - - NetworkInterfaceMock* net = getNet(); - net->enterNetwork(); - NetworkInterfaceMock::NetworkOperationIterator noi = net->getNextReadyRequest(); - net->scheduleResponse(noi, - net->now()+2000, - ResponseStatus(ReplicationExecutor::RemoteCommandResponse( - BSON("ok" << 1), - boost::posix_time::milliseconds(10)))); - ASSERT_FALSE(ranCompletion); - - noi = net->getNextReadyRequest(); - net->scheduleResponse(noi, - net->now()+2000, - ResponseStatus(ReplicationExecutor::RemoteCommandResponse( - BSON("ok" << 1), - boost::posix_time::milliseconds(10)))); - ASSERT_FALSE(ranCompletion); - - noi = net->getNextReadyRequest(); - net->scheduleResponse(noi, - net->now()+5000, - ResponseStatus(ReplicationExecutor::RemoteCommandResponse( - BSON("ok" << 1), - boost::posix_time::milliseconds(10)))); - ASSERT_FALSE(ranCompletion); - - net->runUntil(net->now()+2000); - ASSERT_TRUE(ranCompletion); - - - net->runReadyNetworkOperations(); - // the third resposne should not be processed, so the count should not increment - ASSERT_EQUALS(2, sga.getResponseCount()); - - net->exitNetwork(); + void run() { + _thread.reset( + new boost::thread(stdx::bind(&ScatterGatherRunnerRunner::_run, this, _executor))); } - // Confirm that starting with sufficient responses received will immediate complete. - TEST_F(ScatterGatherTest, DoNotCreateCallbacksIfHasSufficientResponsesReturnsTrueImmediately) { - ScatterGatherTestAlgorithm sga; - // set hasReceivedSufficientResponses to return true before the run starts - sga.finish(); - ScatterGatherRunner sgr(&sga); - bool ranCompletion = false; - StatusWith<ReplicationExecutor::EventHandle> status = sgr.start(getExecutor(), - stdx::bind(&onCompletionTestFunction, &ranCompletion)); - ASSERT_OK(status.getStatus()); - ASSERT_TRUE(ranCompletion); - - NetworkInterfaceMock* net = getNet(); - net->enterNetwork(); - ASSERT_FALSE(net->hasReadyRequests()); - net->exitNetwork(); +private: + void _run(ReplicationExecutor* executor) { + _result = _sgr->run(_executor); } + ScatterGatherRunner* _sgr; + ReplicationExecutor* _executor; + Status _result; + boost::scoped_ptr<boost::thread> _thread; +}; + +// Simple onCompletion function which will toggle a bool, so that we can check the logs to +// ensure the onCompletion function ran when expected. +void onCompletionTestFunction(bool* ran) { + *ran = true; +} + +// Confirm that running via start() will finish and run the onComplete function once sufficient +// responses have been received. +// Confirm that deleting both the ScatterGatherTestAlgorithm and ScatterGatherRunner while +// scheduled callbacks still exist will not be unsafe (ASAN builder) after the algorithm has +// completed. 
+TEST_F(ScatterGatherTest, DeleteAlgorithmAfterItHasCompleted) { + ScatterGatherTestAlgorithm* sga = new ScatterGatherTestAlgorithm(); + ScatterGatherRunner* sgr = new ScatterGatherRunner(sga); + bool ranCompletion = false; + StatusWith<ReplicationExecutor::EventHandle> status = + sgr->start(getExecutor(), stdx::bind(&onCompletionTestFunction, &ranCompletion)); + ASSERT_OK(status.getStatus()); + ASSERT_FALSE(ranCompletion); + + NetworkInterfaceMock* net = getNet(); + net->enterNetwork(); + NetworkInterfaceMock::NetworkOperationIterator noi = net->getNextReadyRequest(); + net->scheduleResponse(noi, + net->now() + 2000, + ResponseStatus(ReplicationExecutor::RemoteCommandResponse( + BSON("ok" << 1), boost::posix_time::milliseconds(10)))); + ASSERT_FALSE(ranCompletion); + + noi = net->getNextReadyRequest(); + net->scheduleResponse(noi, + net->now() + 2000, + ResponseStatus(ReplicationExecutor::RemoteCommandResponse( + BSON("ok" << 1), boost::posix_time::milliseconds(10)))); + ASSERT_FALSE(ranCompletion); + + noi = net->getNextReadyRequest(); + net->scheduleResponse(noi, + net->now() + 5000, + ResponseStatus(ReplicationExecutor::RemoteCommandResponse( + BSON("ok" << 1), boost::posix_time::milliseconds(10)))); + ASSERT_FALSE(ranCompletion); + + net->runUntil(net->now() + 2000); + ASSERT_TRUE(ranCompletion); + + delete sga; + delete sgr; + + net->runReadyNetworkOperations(); + + net->exitNetwork(); +} + +// Confirm that shutting the ReplicationExecutor down before calling run() will cause run() +// to return ErrorCodes::ShutdownInProgress. +TEST_F(ScatterGatherTest, ShutdownExecutorBeforeRun) { + ScatterGatherTestAlgorithm sga; + ScatterGatherRunner sgr(&sga); + getExecutor()->shutdown(); + sga.finish(); + Status status = sgr.run(getExecutor()); + ASSERT_EQUALS(ErrorCodes::ShutdownInProgress, status); +} + +// Confirm that shutting the ReplicationExecutor down after calling run(), but before run() +// finishes will cause run() to return Status::OK(). +TEST_F(ScatterGatherTest, ShutdownExecutorAfterRun) { + ScatterGatherTestAlgorithm sga; + ScatterGatherRunner sgr(&sga); + ScatterGatherRunnerRunner sgrr(&sgr, getExecutor()); + sgrr.run(); + // need to wait for the scatter-gather to be scheduled in the executor + NetworkInterfaceMock* net = getNet(); + net->enterNetwork(); + NetworkInterfaceMock::NetworkOperationIterator noi = net->getNextReadyRequest(); + net->blackHole(noi); + net->exitNetwork(); + getExecutor()->shutdown(); + Status status = sgrr.getResult(); + ASSERT_OK(status); +} + +// Confirm that shutting the ReplicationExecutor down before calling start() will cause start() +// to return ErrorCodes::ShutdownInProgress and should not run onCompletion(). +TEST_F(ScatterGatherTest, ShutdownExecutorBeforeStart) { + ScatterGatherTestAlgorithm sga; + ScatterGatherRunner sgr(&sga); + getExecutor()->shutdown(); + bool ranCompletion = false; + StatusWith<ReplicationExecutor::EventHandle> status = + sgr.start(getExecutor(), stdx::bind(&onCompletionTestFunction, &ranCompletion)); + sga.finish(); + ASSERT_FALSE(ranCompletion); + ASSERT_EQUALS(ErrorCodes::ShutdownInProgress, status.getStatus()); +} + +// Confirm that shutting the ReplicationExecutor down after calling start() will cause start() +// to return Status::OK and should not run onCompletion(). 
+TEST_F(ScatterGatherTest, ShutdownExecutorAfterStart) {
+    ScatterGatherTestAlgorithm sga;
+    ScatterGatherRunner sgr(&sga);
+    bool ranCompletion = false;
+    StatusWith<ReplicationExecutor::EventHandle> status =
+        sgr.start(getExecutor(), stdx::bind(&onCompletionTestFunction, &ranCompletion));
+    getExecutor()->shutdown();
+    sga.finish();
+    ASSERT_FALSE(ranCompletion);
+    ASSERT_OK(status.getStatus());
+}
+
+// Confirm that responses are not processed once sufficient responses have been received.
+TEST_F(ScatterGatherTest, DoNotProcessMoreThanSufficientResponses) {
+    ScatterGatherTestAlgorithm sga;
+    ScatterGatherRunner sgr(&sga);
+    bool ranCompletion = false;
+    StatusWith<ReplicationExecutor::EventHandle> status =
+        sgr.start(getExecutor(), stdx::bind(&onCompletionTestFunction, &ranCompletion));
+    ASSERT_OK(status.getStatus());
+    ASSERT_FALSE(ranCompletion);
+
+    NetworkInterfaceMock* net = getNet();
+    net->enterNetwork();
+    NetworkInterfaceMock::NetworkOperationIterator noi = net->getNextReadyRequest();
+    net->scheduleResponse(noi,
+                          net->now() + 2000,
+                          ResponseStatus(ReplicationExecutor::RemoteCommandResponse(
+                              BSON("ok" << 1), boost::posix_time::milliseconds(10))));
+    ASSERT_FALSE(ranCompletion);
+
+    noi = net->getNextReadyRequest();
+    net->scheduleResponse(noi,
+                          net->now() + 2000,
+                          ResponseStatus(ReplicationExecutor::RemoteCommandResponse(
+                              BSON("ok" << 1), boost::posix_time::milliseconds(10))));
+    ASSERT_FALSE(ranCompletion);
+
+    noi = net->getNextReadyRequest();
+    net->scheduleResponse(noi,
+                          net->now() + 5000,
+                          ResponseStatus(ReplicationExecutor::RemoteCommandResponse(
+                              BSON("ok" << 1), boost::posix_time::milliseconds(10))));
+    ASSERT_FALSE(ranCompletion);
+
+    net->runUntil(net->now() + 2000);
+    ASSERT_TRUE(ranCompletion);
+
+
+    net->runReadyNetworkOperations();
+    // the third response should not be processed, so the count should not increment
+    ASSERT_EQUALS(2, sga.getResponseCount());
+
+    net->exitNetwork();
+}
+
+// Confirm that starting with sufficient responses received will immediately complete.
+TEST_F(ScatterGatherTest, DoNotCreateCallbacksIfHasSufficientResponsesReturnsTrueImmediately) {
+    ScatterGatherTestAlgorithm sga;
+    // set hasReceivedSufficientResponses to return true before the run starts
+    sga.finish();
+    ScatterGatherRunner sgr(&sga);
+    bool ranCompletion = false;
+    StatusWith<ReplicationExecutor::EventHandle> status =
+        sgr.start(getExecutor(), stdx::bind(&onCompletionTestFunction, &ranCompletion));
+    ASSERT_OK(status.getStatus());
+    ASSERT_TRUE(ranCompletion);
+
+    NetworkInterfaceMock* net = getNet();
+    net->enterNetwork();
+    ASSERT_FALSE(net->hasReadyRequests());
+    net->exitNetwork();
+}
+
 #if 0
 // TODO Enable this test once we have a way to test for invariants.
@@ -383,41 +370,39 @@ namespace {
         net->exitNetwork();
         ASSERT_FALSE(ranCompletion);
     }
-#endif // 0
-
-    // Confirm that running via run() will finish once sufficient responses have been received.
- TEST_F(ScatterGatherTest, SuccessfulScatterGatherViaRun) { - ScatterGatherTestAlgorithm sga; - ScatterGatherRunner sgr(&sga); - ScatterGatherRunnerRunner sgrr(&sgr, getExecutor()); - sgrr.run(); - - NetworkInterfaceMock* net = getNet(); - net->enterNetwork(); - NetworkInterfaceMock::NetworkOperationIterator noi = net->getNextReadyRequest(); - net->scheduleResponse(noi, - net->now(), - ResponseStatus(ReplicationExecutor::RemoteCommandResponse( - BSON("ok" << 1), - boost::posix_time::milliseconds(10)))); - net->runReadyNetworkOperations(); - - noi = net->getNextReadyRequest(); - net->blackHole(noi); - net->runReadyNetworkOperations(); - - noi = net->getNextReadyRequest(); - net->scheduleResponse(noi, - net->now(), - ResponseStatus(ReplicationExecutor::RemoteCommandResponse( - BSON("ok" << 1), - boost::posix_time::milliseconds(10)))); - net->runReadyNetworkOperations(); - net->exitNetwork(); - - Status status = sgrr.getResult(); - ASSERT_OK(status); - } +#endif // 0 + +// Confirm that running via run() will finish once sufficient responses have been received. +TEST_F(ScatterGatherTest, SuccessfulScatterGatherViaRun) { + ScatterGatherTestAlgorithm sga; + ScatterGatherRunner sgr(&sga); + ScatterGatherRunnerRunner sgrr(&sgr, getExecutor()); + sgrr.run(); + + NetworkInterfaceMock* net = getNet(); + net->enterNetwork(); + NetworkInterfaceMock::NetworkOperationIterator noi = net->getNextReadyRequest(); + net->scheduleResponse(noi, + net->now(), + ResponseStatus(ReplicationExecutor::RemoteCommandResponse( + BSON("ok" << 1), boost::posix_time::milliseconds(10)))); + net->runReadyNetworkOperations(); + + noi = net->getNextReadyRequest(); + net->blackHole(noi); + net->runReadyNetworkOperations(); + + noi = net->getNextReadyRequest(); + net->scheduleResponse(noi, + net->now(), + ResponseStatus(ReplicationExecutor::RemoteCommandResponse( + BSON("ok" << 1), boost::posix_time::milliseconds(10)))); + net->runReadyNetworkOperations(); + net->exitNetwork(); + + Status status = sgrr.getResult(); + ASSERT_OK(status); +} } // namespace } // namespace repl diff --git a/src/mongo/db/repl/scoped_conn.cpp b/src/mongo/db/repl/scoped_conn.cpp index e4116fcf584..113261733da 100644 --- a/src/mongo/db/repl/scoped_conn.cpp +++ b/src/mongo/db/repl/scoped_conn.cpp @@ -39,41 +39,42 @@ namespace mongo { namespace repl { - static const int DEFAULT_HEARTBEAT_TIMEOUT_SECS = 10; +static const int DEFAULT_HEARTBEAT_TIMEOUT_SECS = 10; - // This is a bitmask with the first bit set. It's used to mark connections that should be kept - // open during stepdowns - const unsigned ScopedConn::keepOpen = 1; - ScopedConn::M& ScopedConn::_map = *(new ScopedConn::M()); - mutex ScopedConn::mapMutex("ScopedConn::mapMutex"); +// This is a bitmask with the first bit set. It's used to mark connections that should be kept +// open during stepdowns +const unsigned ScopedConn::keepOpen = 1; +ScopedConn::M& ScopedConn::_map = *(new ScopedConn::M()); +mutex ScopedConn::mapMutex("ScopedConn::mapMutex"); - ScopedConn::ConnectionInfo::ConnectionInfo() : lock("ConnectionInfo"), - cc(new DBClientConnection(/*reconnect*/ true, - /*timeout*/ DEFAULT_HEARTBEAT_TIMEOUT_SECS)), - connected(false) { - cc->_logLevel = logger::LogSeverity::Debug(2); - } +ScopedConn::ConnectionInfo::ConnectionInfo() + : lock("ConnectionInfo"), + cc(new DBClientConnection(/*reconnect*/ true, + /*timeout*/ DEFAULT_HEARTBEAT_TIMEOUT_SECS)), + connected(false) { + cc->_logLevel = logger::LogSeverity::Debug(2); +} - // we should already be locked... 
- bool ScopedConn::connect() { - std::string err; - if (!connInfo->cc->connect(HostAndPort(_hostport), err)) { - log() << "couldn't connect to " << _hostport << ": " << err; - return false; - } - connInfo->connected = true; - connInfo->tagPort(); - - // if we cannot authenticate against a member, then either its key file - // or our key file has to change. if our key file has to change, we'll - // be rebooting. if their file has to change, they'll be rebooted so the - // connection created above will go dead, reconnect, and reauth. - if (getGlobalAuthorizationManager()->isAuthEnabled()) { - return authenticateInternalUser(connInfo->cc.get()); - } +// we should already be locked... +bool ScopedConn::connect() { + std::string err; + if (!connInfo->cc->connect(HostAndPort(_hostport), err)) { + log() << "couldn't connect to " << _hostport << ": " << err; + return false; + } + connInfo->connected = true; + connInfo->tagPort(); - return true; + // if we cannot authenticate against a member, then either its key file + // or our key file has to change. if our key file has to change, we'll + // be rebooting. if their file has to change, they'll be rebooted so the + // connection created above will go dead, reconnect, and reauth. + if (getGlobalAuthorizationManager()->isAuthEnabled()) { + return authenticateInternalUser(connInfo->cc.get()); } -} // namespace repl -} // namespace mongo + return true; +} + +} // namespace repl +} // namespace mongo diff --git a/src/mongo/db/repl/scoped_conn.h b/src/mongo/db/repl/scoped_conn.h index b357d17648e..85f6dd5080f 100644 --- a/src/mongo/db/repl/scoped_conn.h +++ b/src/mongo/db/repl/scoped_conn.h @@ -41,118 +41,122 @@ namespace mongo { namespace repl { - /** here we keep a single connection (with reconnect) for a set of hosts, - one each, and allow one user at a time per host. if in use already for that - host, we block. so this is an easy way to keep a 1-deep pool of connections - that many threads can share. +/** here we keep a single connection (with reconnect) for a set of hosts, + one each, and allow one user at a time per host. if in use already for that + host, we block. so this is an easy way to keep a 1-deep pool of connections + that many threads can share. - thread-safe. + thread-safe. - Example: - { - ScopedConn c("foo.acme.com:9999"); - c->runCommand(...); - } + Example: + { + ScopedConn c("foo.acme.com:9999"); + c->runCommand(...); + } - throws exception on connect error (but fine to try again later with a new - scopedconn object for same host). - */ - class ScopedConn { - public: - // A flag to keep ScopedConns open when all other sockets are disconnected - static const unsigned keepOpen; - - /** throws assertions if connect failure etc. */ - ScopedConn(const std::string& hostport); - ~ScopedConn() { - // conLock releases... - } - void reconnect() { - connInfo->cc.reset(new DBClientConnection(true, connInfo->getTimeout())); - connInfo->cc->_logLevel = logger::LogSeverity::Debug(2); - connInfo->connected = false; - connect(); - } + throws exception on connect error (but fine to try again later with a new + scopedconn object for same host). +*/ +class ScopedConn { +public: + // A flag to keep ScopedConns open when all other sockets are disconnected + static const unsigned keepOpen; + + /** throws assertions if connect failure etc. */ + ScopedConn(const std::string& hostport); + ~ScopedConn() { + // conLock releases... 
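+        // (editor's note: destroying the scoped "connLock" member releases the
+        // per-host mutex, letting the next waiting thread use this host's connection)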
+    }
+    void reconnect() {
+        connInfo->cc.reset(new DBClientConnection(true, connInfo->getTimeout()));
+        connInfo->cc->_logLevel = logger::LogSeverity::Debug(2);
+        connInfo->connected = false;
+        connect();
+    }

-        void setTimeout(time_t timeout) {
-            connInfo->setTimeout(timeout);
-        }
+    void setTimeout(time_t timeout) {
+        connInfo->setTimeout(timeout);
+    }

-        /* If we were to run a query and not exhaust the cursor, future use of the connection would be problematic.
-           So here what we do is wrapper known safe methods and not allow cursor-style queries at all. This makes
-           ScopedConn limited in functionality but very safe. More non-cursor wrappers can be added here if needed.
-        */
-        bool runCommand(const std::string &dbname, const BSONObj& cmd, BSONObj &info, int options=0) {
-            return conn()->runCommand(dbname, cmd, info, options);
-        }
-        unsigned long long count(const std::string &ns) {
-            return conn()->count(ns);
-        }
-        BSONObj findOne(const std::string &ns, const Query& q, const BSONObj *fieldsToReturn = 0, int queryOptions = 0) {
-            return conn()->findOne(ns, q, fieldsToReturn, queryOptions);
+    /* If we were to run a query and not exhaust the cursor, future use of the connection would be problematic.
+       So here what we do is wrap known safe methods and not allow cursor-style queries at all. This makes
+       ScopedConn limited in functionality but very safe. More non-cursor wrappers can be added here if needed.
+    */
+    bool runCommand(const std::string& dbname, const BSONObj& cmd, BSONObj& info, int options = 0) {
+        return conn()->runCommand(dbname, cmd, info, options);
+    }
+    unsigned long long count(const std::string& ns) {
+        return conn()->count(ns);
+    }
+    BSONObj findOne(const std::string& ns,
+                    const Query& q,
+                    const BSONObj* fieldsToReturn = 0,
+                    int queryOptions = 0) {
+        return conn()->findOne(ns, q, fieldsToReturn, queryOptions);
+    }
+
+private:
+    std::auto_ptr<scoped_lock> connLock;
+    static mongo::mutex mapMutex;
+    struct ConnectionInfo {
+        mongo::mutex lock;
+        boost::scoped_ptr<DBClientConnection> cc;
+        bool connected;
+        ConnectionInfo();
+
+        void tagPort() {
+            MessagingPort& mp = cc->port();
+            mp.tag |= ScopedConn::keepOpen;
        }

-    private:
-        std::auto_ptr<scoped_lock> connLock;
-        static mongo::mutex mapMutex;
-        struct ConnectionInfo {
-            mongo::mutex lock;
-            boost::scoped_ptr<DBClientConnection> cc;
-            bool connected;
-            ConnectionInfo();
-
-            void tagPort() {
-                MessagingPort& mp = cc->port();
-                mp.tag |= ScopedConn::keepOpen;
-            }
-
-            void setTimeout(time_t timeout) {
-                _timeout = timeout;
-                cc->setSoTimeout(_timeout);
-            }
-
-            int getTimeout() {
-                return _timeout;
-            }
-
-        private:
-            int _timeout;
-        } *connInfo;
-        typedef std::map<std::string,ScopedConn::ConnectionInfo*> M;
-        static M& _map;
-        boost::scoped_ptr<DBClientConnection>& conn() { return connInfo->cc; }
-        const std::string _hostport;
-
-        // we should already be locked...
- bool connect(); - - }; - - inline ScopedConn::ScopedConn(const std::string& hostport) : _hostport(hostport) { - bool first = false; - { - scoped_lock lk(mapMutex); - connInfo = _map[_hostport]; - if( connInfo == 0 ) { - connInfo = _map[_hostport] = new ConnectionInfo(); - first = true; - connLock.reset( new scoped_lock(connInfo->lock) ); - } + void setTimeout(time_t timeout) { + _timeout = timeout; + cc->setSoTimeout(_timeout); } - // already locked connLock above - if (first) { - connect(); - return; + int getTimeout() { + return _timeout; } - connLock.reset( new scoped_lock(connInfo->lock) ); - if (connInfo->connected) { - return; + private: + int _timeout; + } * connInfo; + typedef std::map<std::string, ScopedConn::ConnectionInfo*> M; + static M& _map; + boost::scoped_ptr<DBClientConnection>& conn() { + return connInfo->cc; + } + const std::string _hostport; + + // we should already be locked... + bool connect(); +}; + +inline ScopedConn::ScopedConn(const std::string& hostport) : _hostport(hostport) { + bool first = false; + { + scoped_lock lk(mapMutex); + connInfo = _map[_hostport]; + if (connInfo == 0) { + connInfo = _map[_hostport] = new ConnectionInfo(); + first = true; + connLock.reset(new scoped_lock(connInfo->lock)); } + } - // Keep trying to connect if we're not yet connected + // already locked connLock above + if (first) { connect(); + return; } -} // namespace repl -} // namespace mongo + + connLock.reset(new scoped_lock(connInfo->lock)); + if (connInfo->connected) { + return; + } + + // Keep trying to connect if we're not yet connected + connect(); +} +} // namespace repl +} // namespace mongo diff --git a/src/mongo/db/repl/server.h b/src/mongo/db/repl/server.h index 9216c0bc87d..d376bc7faa5 100644 --- a/src/mongo/db/repl/server.h +++ b/src/mongo/db/repl/server.h @@ -38,39 +38,43 @@ namespace mongo { - namespace task { +namespace task { - typedef stdx::function<void()> lam; +typedef stdx::function<void()> lam; - /** typical usage is: task::fork( new Server("threadname") ); */ - class Server : public Task { - public: - /** send a message to the port */ - void send(lam); +/** typical usage is: task::fork( new Server("threadname") ); */ +class Server : public Task { +public: + /** send a message to the port */ + void send(lam); - Server(const std::string& name) : m("server"), _name(name), rq(false) { } - virtual ~Server() { } + Server(const std::string& name) : m("server"), _name(name), rq(false) {} + virtual ~Server() {} - /** send message but block until function completes */ - void call(const lam&); + /** send message but block until function completes */ + void call(const lam&); - void requeue() { rq = true; } - - protected: - // REMINDER : for use in mongod, you will want to have this call Client::initThread(). - virtual void starting() { } + void requeue() { + rq = true; + } - private: - virtual bool initClient() { return true; } - virtual std::string name() const { return _name; } - void doWork(); - std::deque<lam> d; - mongo::mutex m; - boost::condition c; - std::string _name; - bool rq; - }; +protected: + // REMINDER : for use in mongod, you will want to have this call Client::initThread(). 
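+    // For example, a mongod subclass might override this as follows (editor's sketch,
+    // not part of this change):
+    //     virtual void starting() { Client::initThread("myTaskThread"); }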
+ virtual void starting() {} +private: + virtual bool initClient() { + return true; } - + virtual std::string name() const { + return _name; + } + void doWork(); + std::deque<lam> d; + mongo::mutex m; + boost::condition c; + std::string _name; + bool rq; +}; +} } diff --git a/src/mongo/db/repl/sync.cpp b/src/mongo/db/repl/sync.cpp index bc7d86c8d31..d77827bc57c 100644 --- a/src/mongo/db/repl/sync.cpp +++ b/src/mongo/db/repl/sync.cpp @@ -48,109 +48,103 @@ namespace mongo { - using std::endl; - using std::string; +using std::endl; +using std::string; namespace repl { - void Sync::setHostname(const string& hostname) { - hn = hostname; - } +void Sync::setHostname(const string& hostname) { + hn = hostname; +} - BSONObj Sync::getMissingDoc(OperationContext* txn, Database* db, const BSONObj& o) { - OplogReader missingObjReader; // why are we using OplogReader to run a non-oplog query? - const char *ns = o.getStringField("ns"); +BSONObj Sync::getMissingDoc(OperationContext* txn, Database* db, const BSONObj& o) { + OplogReader missingObjReader; // why are we using OplogReader to run a non-oplog query? + const char* ns = o.getStringField("ns"); - // capped collections - Collection* collection = db->getCollection(ns); - if ( collection && collection->isCapped() ) { - log() << "replication missing doc, but this is okay for a capped collection (" << ns << ")" << endl; - return BSONObj(); - } + // capped collections + Collection* collection = db->getCollection(ns); + if (collection && collection->isCapped()) { + log() << "replication missing doc, but this is okay for a capped collection (" << ns << ")" + << endl; + return BSONObj(); + } - const int retryMax = 3; - for (int retryCount = 1; retryCount <= retryMax; ++retryCount) { - if (retryCount != 1) { - // if we are retrying, sleep a bit to let the network possibly recover - sleepsecs(retryCount * retryCount); - } - try { - bool ok = missingObjReader.connect(HostAndPort(hn)); - if (!ok) { - warning() << "network problem detected while connecting to the " - << "sync source, attempt " << retryCount << " of " - << retryMax << endl; - continue; // try again - } - } - catch (const SocketException&) { + const int retryMax = 3; + for (int retryCount = 1; retryCount <= retryMax; ++retryCount) { + if (retryCount != 1) { + // if we are retrying, sleep a bit to let the network possibly recover + sleepsecs(retryCount * retryCount); + } + try { + bool ok = missingObjReader.connect(HostAndPort(hn)); + if (!ok) { warning() << "network problem detected while connecting to the " - << "sync source, attempt " << retryCount << " of " - << retryMax << endl; - continue; // try again - } - - // might be more than just _id in the update criteria - BSONObj query = BSONObjBuilder().append(o.getObjectField("o2")["_id"]).obj(); - BSONObj missingObj; - try { - missingObj = missingObjReader.findOne(ns, query); - } - catch (const SocketException&) { - warning() << "network problem detected while fetching a missing document from the " - << "sync source, attempt " << retryCount << " of " - << retryMax << endl; - continue; // try again - } - catch (DBException& e) { - log() << "replication assertion fetching missing object: " << e.what() << endl; - throw; + << "sync source, attempt " << retryCount << " of " << retryMax << endl; + continue; // try again } + } catch (const SocketException&) { + warning() << "network problem detected while connecting to the " + << "sync source, attempt " << retryCount << " of " << retryMax << endl; + continue; // try again + } - // success! 
- return missingObj; + // might be more than just _id in the update criteria + BSONObj query = BSONObjBuilder().append(o.getObjectField("o2")["_id"]).obj(); + BSONObj missingObj; + try { + missingObj = missingObjReader.findOne(ns, query); + } catch (const SocketException&) { + warning() << "network problem detected while fetching a missing document from the " + << "sync source, attempt " << retryCount << " of " << retryMax << endl; + continue; // try again + } catch (DBException& e) { + log() << "replication assertion fetching missing object: " << e.what() << endl; + throw; } - // retry count exceeded - msgasserted(15916, - str::stream() << "Can no longer connect to initial sync source: " << hn); - } - bool Sync::shouldRetry(OperationContext* txn, const BSONObj& o) { - const NamespaceString nss(o.getStringField("ns")); - MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN { - // Take an X lock on the database in order to preclude other modifications. - // Also, the database might not exist yet, so create it. - AutoGetOrCreateDb autoDb(txn, nss.db(), MODE_X); - Database* const db = autoDb.getDb(); - - // we don't have the object yet, which is possible on initial sync. get it. - log() << "adding missing object" << endl; // rare enough we can log - BSONObj missingObj = getMissingDoc(txn, db, o); - - if( missingObj.isEmpty() ) { - log() << "missing object not found on source." - " presumably deleted later in oplog"; - log() << "o2: " << o.getObjectField("o2").toString(); - log() << "o firstfield: " << o.getObjectField("o").firstElementFieldName(); - return false; - } - else { - WriteUnitOfWork wunit(txn); - - Collection* const coll = db->getOrCreateCollection(txn, nss.toString()); - invariant(coll); - - StatusWith<RecordId> result = coll->insertDocument(txn, missingObj, true); - uassert(15917, - str::stream() << "failed to insert missing doc: " - << result.getStatus().toString(), - result.isOK() ); - LOG(1) << "inserted missing doc: " << missingObj.toString() << endl; - wunit.commit(); - return true; - } - } MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "InsertRetry", nss.ns()); + // success! + return missingObj; + } + // retry count exceeded + msgasserted(15916, str::stream() << "Can no longer connect to initial sync source: " << hn); +} + +bool Sync::shouldRetry(OperationContext* txn, const BSONObj& o) { + const NamespaceString nss(o.getStringField("ns")); + MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN { + // Take an X lock on the database in order to preclude other modifications. + // Also, the database might not exist yet, so create it. + AutoGetOrCreateDb autoDb(txn, nss.db(), MODE_X); + Database* const db = autoDb.getDb(); + + // we don't have the object yet, which is possible on initial sync. get it. + log() << "adding missing object" << endl; // rare enough we can log + BSONObj missingObj = getMissingDoc(txn, db, o); + + if (missingObj.isEmpty()) { + log() << "missing object not found on source." 
+ " presumably deleted later in oplog"; + log() << "o2: " << o.getObjectField("o2").toString(); + log() << "o firstfield: " << o.getObjectField("o").firstElementFieldName(); + return false; + } else { + WriteUnitOfWork wunit(txn); + + Collection* const coll = db->getOrCreateCollection(txn, nss.toString()); + invariant(coll); + + StatusWith<RecordId> result = coll->insertDocument(txn, missingObj, true); + uassert( + 15917, + str::stream() << "failed to insert missing doc: " << result.getStatus().toString(), + result.isOK()); + LOG(1) << "inserted missing doc: " << missingObj.toString() << endl; + wunit.commit(); + return true; + } } + MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "InsertRetry", nss.ns()); +} -} // namespace repl -} // namespace mongo +} // namespace repl +} // namespace mongo diff --git a/src/mongo/db/repl/sync.h b/src/mongo/db/repl/sync.h index cdda55f4f13..0788288429d 100644 --- a/src/mongo/db/repl/sync.h +++ b/src/mongo/db/repl/sync.h @@ -33,25 +33,26 @@ #include "mongo/db/jsobj.h" namespace mongo { - class Database; - class OperationContext; +class Database; +class OperationContext; namespace repl { - class Sync { - protected: - std::string hn; - public: - Sync(const std::string& hostname) : hn(hostname) {} - virtual ~Sync() {} - virtual BSONObj getMissingDoc(OperationContext* txn, Database* db, const BSONObj& o); - - /** - * If applyOperation_inlock should be called again after an update fails. - */ - virtual bool shouldRetry(OperationContext* txn, const BSONObj& o); - void setHostname(const std::string& hostname); - }; - -} // namespace repl -} // namespace mongo +class Sync { +protected: + std::string hn; + +public: + Sync(const std::string& hostname) : hn(hostname) {} + virtual ~Sync() {} + virtual BSONObj getMissingDoc(OperationContext* txn, Database* db, const BSONObj& o); + + /** + * If applyOperation_inlock should be called again after an update fails. 
+ */ + virtual bool shouldRetry(OperationContext* txn, const BSONObj& o); + void setHostname(const std::string& hostname); +}; + +} // namespace repl +} // namespace mongo diff --git a/src/mongo/db/repl/sync_source_feedback.cpp b/src/mongo/db/repl/sync_source_feedback.cpp index c70f101904e..1a1057b8fae 100644 --- a/src/mongo/db/repl/sync_source_feedback.cpp +++ b/src/mongo/db/repl/sync_source_feedback.cpp @@ -50,270 +50,261 @@ namespace mongo { - using std::endl; - using std::string; +using std::endl; +using std::string; namespace repl { - // used in replAuthenticate - static const BSONObj userReplQuery = fromjson("{\"user\":\"repl\"}"); +// used in replAuthenticate +static const BSONObj userReplQuery = fromjson("{\"user\":\"repl\"}"); - SyncSourceFeedback::SyncSourceFeedback() : _positionChanged(false), - _handshakeNeeded(false), - _shutdownSignaled(false) {} - SyncSourceFeedback::~SyncSourceFeedback() {} +SyncSourceFeedback::SyncSourceFeedback() + : _positionChanged(false), _handshakeNeeded(false), _shutdownSignaled(false) {} +SyncSourceFeedback::~SyncSourceFeedback() {} - void SyncSourceFeedback::_resetConnection() { - LOG(1) << "resetting connection in sync source feedback"; - _connection.reset(); - } - - bool SyncSourceFeedback::replAuthenticate() { - if (!getGlobalAuthorizationManager()->isAuthEnabled()) - return true; - - if (!isInternalAuthSet()) - return false; - return authenticateInternalUser(_connection.get()); - } - - void SyncSourceFeedback::ensureMe(OperationContext* txn) { - string myname = getHostName(); - { - ScopedTransaction transaction(txn, MODE_IX); - Lock::DBLock dlk(txn->lockState(), "local", MODE_X); - Client::Context ctx(txn, "local"); - - // local.me is an identifier for a server for getLastError w:2+ - if (!Helpers::getSingleton(txn, "local.me", _me) || - !_me.hasField("host") || - _me["host"].String() != myname) { - - WriteUnitOfWork wunit(txn); +void SyncSourceFeedback::_resetConnection() { + LOG(1) << "resetting connection in sync source feedback"; + _connection.reset(); +} - // clean out local.me - Helpers::emptyCollection(txn, "local.me"); - - // repopulate - BSONObjBuilder b; - b.appendOID("_id", 0, true); - b.append("host", myname); - _me = b.obj(); - Helpers::putSingleton(txn, "local.me", _me); +bool SyncSourceFeedback::replAuthenticate() { + if (!getGlobalAuthorizationManager()->isAuthEnabled()) + return true; - wunit.commit(); - } - // _me is used outside of a read lock, so we must copy it out of the mmap - _me = _me.getOwned(); + if (!isInternalAuthSet()) + return false; + return authenticateInternalUser(_connection.get()); +} + +void SyncSourceFeedback::ensureMe(OperationContext* txn) { + string myname = getHostName(); + { + ScopedTransaction transaction(txn, MODE_IX); + Lock::DBLock dlk(txn->lockState(), "local", MODE_X); + Client::Context ctx(txn, "local"); + + // local.me is an identifier for a server for getLastError w:2+ + if (!Helpers::getSingleton(txn, "local.me", _me) || !_me.hasField("host") || + _me["host"].String() != myname) { + WriteUnitOfWork wunit(txn); + + // clean out local.me + Helpers::emptyCollection(txn, "local.me"); + + // repopulate + BSONObjBuilder b; + b.appendOID("_id", 0, true); + b.append("host", myname); + _me = b.obj(); + Helpers::putSingleton(txn, "local.me", _me); + + wunit.commit(); } + // _me is used outside of a read lock, so we must copy it out of the mmap + _me = _me.getOwned(); } +} - bool SyncSourceFeedback::replHandshake(OperationContext* txn) { - ReplicationCoordinator* replCoord = 
getGlobalReplicationCoordinator(); - if (replCoord->getMemberState().primary()) { - // primary has no one to handshake to - return true; - } - // construct a vector of handshake obj for us as well as all chained members - std::vector<BSONObj> handshakeObjs; - replCoord->prepareReplSetUpdatePositionCommandHandshakes(&handshakeObjs); - LOG(1) << "handshaking upstream updater"; - for (std::vector<BSONObj>::iterator it = handshakeObjs.begin(); - it != handshakeObjs.end(); - ++it) { - BSONObj res; - try { - LOG(2) << "Sending to " << _connection.get()->toString() << " the replication " - "handshake: " << *it; - if (!_connection->runCommand("admin", *it, res)) { - std::string errMsg = res["errmsg"].valuestrsafe(); - massert(17447, "upstream updater is not supported by the member from which we" - " are syncing, please update all nodes to 2.6 or later.", - errMsg.find("no such cmd") == std::string::npos); - - log() << "replSet error while handshaking the upstream updater: " - << errMsg; - - // sleep half a second if we are not in our sync source's config - // TODO(dannenberg) after 3.0, remove the string comparison - if (res["code"].numberInt() == ErrorCodes::NodeNotFound || - errMsg.find("could not be found in replica set config while attempting " - "to associate it with") != std::string::npos) { - - // black list sync target for 10 seconds and find a new one - replCoord->blacklistSyncSource(_syncTarget, - Date_t(curTimeMillis64() + 10*1000)); - BackgroundSync::get()->clearSyncTarget(); - } - - _resetConnection(); - return false; - } - } - catch (const DBException& e) { - log() << "SyncSourceFeedback error sending handshake: " << e.what() << endl; - _resetConnection(); - return false; - } - } +bool SyncSourceFeedback::replHandshake(OperationContext* txn) { + ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator(); + if (replCoord->getMemberState().primary()) { + // primary has no one to handshake to return true; } - - bool SyncSourceFeedback::_connect(OperationContext* txn, const HostAndPort& host) { - if (hasConnection()) { - return true; - } - log() << "replset setting syncSourceFeedback to " << host.toString(); - _connection.reset(new DBClientConnection(false, OplogReader::tcp_timeout)); - string errmsg; + // construct a vector of handshake obj for us as well as all chained members + std::vector<BSONObj> handshakeObjs; + replCoord->prepareReplSetUpdatePositionCommandHandshakes(&handshakeObjs); + LOG(1) << "handshaking upstream updater"; + for (std::vector<BSONObj>::iterator it = handshakeObjs.begin(); it != handshakeObjs.end(); + ++it) { + BSONObj res; try { - if (!_connection->connect(host, errmsg) || - (getGlobalAuthorizationManager()->isAuthEnabled() && !replAuthenticate())) { + LOG(2) << "Sending to " << _connection.get()->toString() << " the replication " + "handshake: " << *it; + if (!_connection->runCommand("admin", *it, res)) { + std::string errMsg = res["errmsg"].valuestrsafe(); + massert(17447, + "upstream updater is not supported by the member from which we" + " are syncing, please update all nodes to 2.6 or later.", + errMsg.find("no such cmd") == std::string::npos); + + log() << "replSet error while handshaking the upstream updater: " << errMsg; + + // sleep half a second if we are not in our sync source's config + // TODO(dannenberg) after 3.0, remove the string comparison + if (res["code"].numberInt() == ErrorCodes::NodeNotFound || + errMsg.find( + "could not be found in replica set config while attempting " + "to associate it with") != std::string::npos) { + // 
black list sync target for 10 seconds and find a new one + replCoord->blacklistSyncSource(_syncTarget, + Date_t(curTimeMillis64() + 10 * 1000)); + BackgroundSync::get()->clearSyncTarget(); + } + _resetConnection(); - log() << "repl: " << errmsg << endl; return false; } - } - catch (const DBException& e) { - log() << "Error connecting to " << host.toString() << ": " << e.what(); + } catch (const DBException& e) { + log() << "SyncSourceFeedback error sending handshake: " << e.what() << endl; _resetConnection(); return false; } - - return hasConnection(); } + return true; +} - void SyncSourceFeedback::forwardSlaveHandshake() { - boost::unique_lock<boost::mutex> lock(_mtx); - _handshakeNeeded = true; - _cond.notify_all(); +bool SyncSourceFeedback::_connect(OperationContext* txn, const HostAndPort& host) { + if (hasConnection()) { + return true; } - - void SyncSourceFeedback::forwardSlaveProgress() { - boost::unique_lock<boost::mutex> lock(_mtx); - _positionChanged = true; - _cond.notify_all(); + log() << "replset setting syncSourceFeedback to " << host.toString(); + _connection.reset(new DBClientConnection(false, OplogReader::tcp_timeout)); + string errmsg; + try { + if (!_connection->connect(host, errmsg) || + (getGlobalAuthorizationManager()->isAuthEnabled() && !replAuthenticate())) { + _resetConnection(); + log() << "repl: " << errmsg << endl; + return false; + } + } catch (const DBException& e) { + log() << "Error connecting to " << host.toString() << ": " << e.what(); + _resetConnection(); + return false; } - Status SyncSourceFeedback::updateUpstream(OperationContext* txn) { - ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator(); - if (replCoord->getMemberState().primary()) { - // primary has no one to update to + return hasConnection(); +} + +void SyncSourceFeedback::forwardSlaveHandshake() { + boost::unique_lock<boost::mutex> lock(_mtx); + _handshakeNeeded = true; + _cond.notify_all(); +} + +void SyncSourceFeedback::forwardSlaveProgress() { + boost::unique_lock<boost::mutex> lock(_mtx); + _positionChanged = true; + _cond.notify_all(); +} + +Status SyncSourceFeedback::updateUpstream(OperationContext* txn) { + ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator(); + if (replCoord->getMemberState().primary()) { + // primary has no one to update to + return Status::OK(); + } + BSONObjBuilder cmd; + { + boost::unique_lock<boost::mutex> lock(_mtx); + if (_handshakeNeeded) { + // Don't send updates if there are nodes that haven't yet been handshaked + return Status(ErrorCodes::NodeNotFound, + "Need to send handshake before updating position upstream"); + } + // the command could not be created, likely because the node was removed from the set + if (!replCoord->prepareReplSetUpdatePositionCommand(&cmd)) { return Status::OK(); } - BSONObjBuilder cmd; + } + BSONObj res; + + LOG(2) << "Sending slave oplog progress to upstream updater: " << cmd.done(); + try { + _connection->runCommand("admin", cmd.obj(), res); + } catch (const DBException& e) { + log() << "SyncSourceFeedback error sending update: " << e.what() << endl; + // blacklist sync target for .5 seconds and find a new one + replCoord->blacklistSyncSource(_syncTarget, Date_t(curTimeMillis64() + 500)); + BackgroundSync::get()->clearSyncTarget(); + _resetConnection(); + return e.toStatus(); + } + + Status status = Command::getStatusFromCommandResult(res); + if (!status.isOK()) { + log() << "SyncSourceFeedback error sending update, response: " << res.toString() << endl; + // blacklist sync target for .5 seconds 
and find a new one + replCoord->blacklistSyncSource(_syncTarget, Date_t(curTimeMillis64() + 500)); + BackgroundSync::get()->clearSyncTarget(); + _resetConnection(); + } + return status; +} + +void SyncSourceFeedback::shutdown() { + boost::unique_lock<boost::mutex> lock(_mtx); + _shutdownSignaled = true; + _cond.notify_all(); +} + +void SyncSourceFeedback::run() { + Client::initThread("SyncSourceFeedback"); + OperationContextImpl txn; + + bool positionChanged = false; + bool handshakeNeeded = false; + ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator(); + while (!inShutdown()) { // TODO(spencer): Remove once legacy repl coordinator is gone. { boost::unique_lock<boost::mutex> lock(_mtx); - if (_handshakeNeeded) { - // Don't send updates if there are nodes that haven't yet been handshaked - return Status(ErrorCodes::NodeNotFound, - "Need to send handshake before updating position upstream"); + while (!_positionChanged && !_handshakeNeeded && !_shutdownSignaled) { + _cond.wait(lock); } - // the command could not be created, likely because the node was removed from the set - if (!replCoord->prepareReplSetUpdatePositionCommand(&cmd)) { - return Status::OK(); + + if (_shutdownSignaled) { + break; } - } - BSONObj res; - LOG(2) << "Sending slave oplog progress to upstream updater: " << cmd.done(); - try { - _connection->runCommand("admin", cmd.obj(), res); + positionChanged = _positionChanged; + handshakeNeeded = _handshakeNeeded; + _positionChanged = false; + _handshakeNeeded = false; } - catch (const DBException& e) { - log() << "SyncSourceFeedback error sending update: " << e.what() << endl; - // blacklist sync target for .5 seconds and find a new one - replCoord->blacklistSyncSource(_syncTarget, - Date_t(curTimeMillis64() + 500)); - BackgroundSync::get()->clearSyncTarget(); + + MemberState state = replCoord->getMemberState(); + if (state.primary() || state.startup()) { _resetConnection(); - return e.toStatus(); + continue; } - - Status status = Command::getStatusFromCommandResult(res); - if (!status.isOK()) { - log() << "SyncSourceFeedback error sending update, response: " << res.toString() <<endl; - // blacklist sync target for .5 seconds and find a new one - replCoord->blacklistSyncSource(_syncTarget, - Date_t(curTimeMillis64() + 500)); - BackgroundSync::get()->clearSyncTarget(); + const HostAndPort target = BackgroundSync::get()->getSyncTarget(); + if (_syncTarget != target) { _resetConnection(); + _syncTarget = target; } - return status; - } - - void SyncSourceFeedback::shutdown() { - boost::unique_lock<boost::mutex> lock(_mtx); - _shutdownSignaled = true; - _cond.notify_all(); - } - - void SyncSourceFeedback::run() { - Client::initThread("SyncSourceFeedback"); - OperationContextImpl txn; - - bool positionChanged = false; - bool handshakeNeeded = false; - ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator(); - while (!inShutdown()) { // TODO(spencer): Remove once legacy repl coordinator is gone. 
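
// Aside: forwardSlaveProgress()/forwardSlaveHandshake() above and the wait loop
// below form a flag-plus-condition-variable handoff: producers set a dirty flag
// under the mutex and notify; the consumer sleeps until some flag is set, then
// snapshots and clears the flags while still holding the lock, so no update is
// lost or handled twice. A minimal standalone sketch of the same pattern, using
// std:: primitives instead of boost (names here are illustrative, not MongoDB's):

#include <condition_variable>
#include <mutex>

class UpdateSignal {
public:
    // Producer side: record that there is work, then wake the consumer.
    void notifyPositionChanged() {
        std::lock_guard<std::mutex> lk(_mtx);
        _positionChanged = true;
        _cond.notify_all();
    }

    // Consumer side: sleep until a flag is set, then snapshot-and-clear it
    // under the lock. Returns false once shutdown has been requested.
    bool waitForChange() {
        std::unique_lock<std::mutex> lk(_mtx);
        _cond.wait(lk, [this] { return _positionChanged || _shutdown; });
        _positionChanged = false;
        return !_shutdown;
    }

    void shutdown() {
        std::lock_guard<std::mutex> lk(_mtx);
        _shutdown = true;
        _cond.notify_all();
    }

private:
    std::mutex _mtx;
    std::condition_variable _cond;
    bool _positionChanged = false;
    bool _shutdown = false;
};
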
- { - boost::unique_lock<boost::mutex> lock(_mtx); - while (!_positionChanged && !_handshakeNeeded && !_shutdownSignaled) { - _cond.wait(lock); - } - - if (_shutdownSignaled) { - break; - } - - positionChanged = _positionChanged; - handshakeNeeded = _handshakeNeeded; - _positionChanged = false; - _handshakeNeeded = false; - } - - MemberState state = replCoord->getMemberState(); - if (state.primary() || state.startup()) { - _resetConnection(); + if (!hasConnection()) { + // fix connection if need be + if (target.empty()) { + sleepmillis(500); continue; } - const HostAndPort target = BackgroundSync::get()->getSyncTarget(); - if (_syncTarget != target) { - _resetConnection(); - _syncTarget = target; + if (!_connect(&txn, target)) { + sleepmillis(500); + continue; } - if (!hasConnection()) { - // fix connection if need be - if (target.empty()) { - sleepmillis(500); - continue; - } - if (!_connect(&txn, target)) { - sleepmillis(500); - continue; - } - handshakeNeeded = true; + handshakeNeeded = true; + } + if (handshakeNeeded) { + positionChanged = true; + if (!replHandshake(&txn)) { + boost::unique_lock<boost::mutex> lock(_mtx); + _handshakeNeeded = true; + continue; } - if (handshakeNeeded) { - positionChanged = true; - if (!replHandshake(&txn)) { - boost::unique_lock<boost::mutex> lock(_mtx); + } + if (positionChanged) { + Status status = updateUpstream(&txn); + if (!status.isOK()) { + boost::unique_lock<boost::mutex> lock(_mtx); + _positionChanged = true; + if (status == ErrorCodes::NodeNotFound) { _handshakeNeeded = true; - continue; - } - } - if (positionChanged) { - Status status = updateUpstream(&txn); - if (!status.isOK()) { - boost::unique_lock<boost::mutex> lock(_mtx); - _positionChanged = true; - if (status == ErrorCodes::NodeNotFound) { - _handshakeNeeded = true; - } } } } - cc().shutdown(); } -} // namespace repl -} // namespace mongo + cc().shutdown(); +} +} // namespace repl +} // namespace mongo diff --git a/src/mongo/db/repl/sync_source_feedback.h b/src/mongo/db/repl/sync_source_feedback.h index 0f6d24cb1a0..fce865494a2 100644 --- a/src/mongo/db/repl/sync_source_feedback.h +++ b/src/mongo/db/repl/sync_source_feedback.h @@ -38,81 +38,81 @@ #include "mongo/util/net/hostandport.h" namespace mongo { - class OperationContext; +class OperationContext; namespace repl { - class SyncSourceFeedback { - public: - SyncSourceFeedback(); - ~SyncSourceFeedback(); - - /// Ensures local.me is populated and populates it if not. - /// TODO(spencer): Remove this function once the LegacyReplicationCoordinator is gone. - void ensureMe(OperationContext* txn); - - /// Notifies the SyncSourceFeedbackThread to wake up and send a handshake up the replication - /// chain, upon receiving a handshake. - void forwardSlaveHandshake(); - - /// Notifies the SyncSourceFeedbackThread to wake up and send an update upstream of slave - /// replication progress. - void forwardSlaveProgress(); - - /// Loops continuously until shutdown() is called, passing updates when they are present. - /// TODO(spencer): Currently also can terminate when the global inShutdown() function - /// returns true. Remove that once the legacy repl coordinator is gone. - void run(); - - /// Signals the run() method to terminate. - void shutdown(); - - private: - void _resetConnection(); - - /** - * Authenticates _connection using the server's cluster-membership credentials. - * - * Returns true on successful authentication. 
- */ - bool replAuthenticate(); - - /* Sends initialization information to our sync target, also determines whether or not they - * support the updater command. - */ - bool replHandshake(OperationContext* txn); - - /* Inform the sync target of our current position in the oplog, as well as the positions - * of all secondaries chained through us. - * ErrorCodes::NodeNotFound indicates that the caller should re-run replHandshake before - * calling this again. - */ - Status updateUpstream(OperationContext* txn); - - bool hasConnection() { - return _connection.get(); - } - - /// Connect to sync target. - bool _connect(OperationContext* txn, const HostAndPort& host); - - // stores our OID to be passed along in commands - /// TODO(spencer): Remove this once the LegacyReplicationCoordinator is gone. - BSONObj _me; - // the member we are currently syncing from - HostAndPort _syncTarget; - // our connection to our sync target - boost::scoped_ptr<DBClientConnection> _connection; - // protects cond, _shutdownSignaled, and the indicator bools. - boost::mutex _mtx; - // used to alert our thread of changes which need to be passed up the chain - boost::condition _cond; - // used to indicate a position change which has not yet been pushed along - bool _positionChanged; - // used to indicate a connection change which has not yet been shook on - bool _handshakeNeeded; - // Once this is set to true the _run method will terminate - bool _shutdownSignaled; - }; -} // namespace repl -} // namespace mongo +class SyncSourceFeedback { +public: + SyncSourceFeedback(); + ~SyncSourceFeedback(); + + /// Ensures local.me is populated and populates it if not. + /// TODO(spencer): Remove this function once the LegacyReplicationCoordinator is gone. + void ensureMe(OperationContext* txn); + + /// Notifies the SyncSourceFeedbackThread to wake up and send a handshake up the replication + /// chain, upon receiving a handshake. + void forwardSlaveHandshake(); + + /// Notifies the SyncSourceFeedbackThread to wake up and send an update upstream of slave + /// replication progress. + void forwardSlaveProgress(); + + /// Loops continuously until shutdown() is called, passing updates when they are present. + /// TODO(spencer): Currently also can terminate when the global inShutdown() function + /// returns true. Remove that once the legacy repl coordinator is gone. + void run(); + + /// Signals the run() method to terminate. + void shutdown(); + +private: + void _resetConnection(); + + /** + * Authenticates _connection using the server's cluster-membership credentials. + * + * Returns true on successful authentication. + */ + bool replAuthenticate(); + + /* Sends initialization information to our sync target, also determines whether or not they + * support the updater command. + */ + bool replHandshake(OperationContext* txn); + + /* Inform the sync target of our current position in the oplog, as well as the positions + * of all secondaries chained through us. + * ErrorCodes::NodeNotFound indicates that the caller should re-run replHandshake before + * calling this again. + */ + Status updateUpstream(OperationContext* txn); + + bool hasConnection() { + return _connection.get(); + } + + /// Connect to sync target. + bool _connect(OperationContext* txn, const HostAndPort& host); + + // stores our OID to be passed along in commands + /// TODO(spencer): Remove this once the LegacyReplicationCoordinator is gone. 
+    BSONObj _me;
+    // the member we are currently syncing from
+    HostAndPort _syncTarget;
+    // our connection to our sync target
+    boost::scoped_ptr<DBClientConnection> _connection;
+    // protects _cond, _shutdownSignaled, and the indicator bools.
+    boost::mutex _mtx;
+    // used to alert our thread of changes which need to be passed up the chain
+    boost::condition _cond;
+    // used to indicate a position change which has not yet been pushed along
+    bool _positionChanged;
+    // used to indicate a connection change for which a handshake has not yet been sent
+    bool _handshakeNeeded;
+    // Once this is set to true the run() method will terminate
+    bool _shutdownSignaled;
+};
+}  // namespace repl
+}  // namespace mongo
diff --git a/src/mongo/db/repl/sync_tail.cpp b/src/mongo/db/repl/sync_tail.cpp
index 4b8c8dbfecb..dbde6ca3549 100644
--- a/src/mongo/db/repl/sync_tail.cpp
+++ b/src/mongo/db/repl/sync_tail.cpp
@@ -59,268 +59,242 @@
 namespace mongo {
-    using std::endl;
+using std::endl;
 namespace repl {
 #if defined(MONGO_PLATFORM_64)
-    const int replWriterThreadCount = 16;
-    const int replPrefetcherThreadCount = 16;
+const int replWriterThreadCount = 16;
+const int replPrefetcherThreadCount = 16;
 #elif defined(MONGO_PLATFORM_32)
-    const int replWriterThreadCount = 2;
-    const int replPrefetcherThreadCount = 2;
+const int replWriterThreadCount = 2;
+const int replPrefetcherThreadCount = 2;
 #else
 #error need to include something that defines MONGO_PLATFORM_XX
 #endif
-    static Counter64 opsAppliedStats;
+static Counter64 opsAppliedStats;
-    //The oplog entries applied
-    static ServerStatusMetricField<Counter64> displayOpsApplied( "repl.apply.ops",
-                                                        &opsAppliedStats );
+// The oplog entries applied
+static ServerStatusMetricField<Counter64> displayOpsApplied("repl.apply.ops", &opsAppliedStats);
-    MONGO_FP_DECLARE(rsSyncApplyStop);
+MONGO_FP_DECLARE(rsSyncApplyStop);
-    // Number and time of each ApplyOps worker pool round
-    static TimerStats applyBatchStats;
-    static ServerStatusMetricField<TimerStats> displayOpBatchesApplied(
-                                                    "repl.apply.batches",
-                                                    &applyBatchStats );
-    void initializePrefetchThread() {
-        if (!ClientBasic::getCurrent()) {
-            Client::initThreadIfNotAlready();
-            cc().getAuthorizationSession()->grantInternalAuthorization();
-        }
+// Number and time of each ApplyOps worker pool round
+static TimerStats applyBatchStats;
+static ServerStatusMetricField<TimerStats> displayOpBatchesApplied("repl.apply.batches",
+                                                                   &applyBatchStats);
+void initializePrefetchThread() {
+    if (!ClientBasic::getCurrent()) {
+        Client::initThreadIfNotAlready();
+        cc().getAuthorizationSession()->grantInternalAuthorization();
     }
-    namespace {
-        bool isCrudOpType( const char* field ) {
-            switch ( field[0] ) {
-            case 'd':
-            case 'i':
-            case 'u':
-                return field[1] == 0;
-            }
-            return false;
-        }
+}
+namespace {
+bool isCrudOpType(const char* field) {
+    switch (field[0]) {
+        case 'd':
+        case 'i':
+        case 'u':
+            return field[1] == 0;
     }
+    return false;
+}
+}

-    SyncTail::SyncTail(BackgroundSyncInterface *q, MultiSyncApplyFunc func) :
-        Sync(""),
-        _networkQueue(q),
-        _applyFunc(func),
-        _writerPool(replWriterThreadCount, "repl writer worker "),
-        _prefetcherPool(replPrefetcherThreadCount, "repl prefetch worker ")
-    {}
+SyncTail::SyncTail(BackgroundSyncInterface* q, MultiSyncApplyFunc func)
+    : Sync(""),
+      _networkQueue(q),
+      _applyFunc(func),
+      _writerPool(replWriterThreadCount, "repl writer worker "),
+      _prefetcherPool(replPrefetcherThreadCount, "repl prefetch worker ") {}

-    SyncTail::~SyncTail() {}
+SyncTail::~SyncTail() {}

-    bool
SyncTail::peek(BSONObj* op) { - return _networkQueue->peek(op); +bool SyncTail::peek(BSONObj* op) { + return _networkQueue->peek(op); +} +/* apply the log op that is in param o + @return bool success (true) or failure (false) +*/ +bool SyncTail::syncApply(OperationContext* txn, const BSONObj& op, bool convertUpdateToUpsert) { + if (inShutdown()) { + return true; } - /* apply the log op that is in param o - @return bool success (true) or failure (false) - */ - bool SyncTail::syncApply(OperationContext* txn, - const BSONObj &op, - bool convertUpdateToUpsert) { - if (inShutdown()) { - return true; - } - - // Count each log op application as a separate operation, for reporting purposes - txn->getCurOp()->reset(); + // Count each log op application as a separate operation, for reporting purposes + txn->getCurOp()->reset(); - const char *ns = op.getStringField("ns"); - verify(ns); + const char* ns = op.getStringField("ns"); + verify(ns); - if ( (*ns == '\0') || (*ns == '.') ) { - // this is ugly - // this is often a no-op - // but can't be 100% sure - if( *op.getStringField("op") != 'n' ) { - error() << "replSet skipping bad op in oplog: " << op.toString(); - } - return true; + if ((*ns == '\0') || (*ns == '.')) { + // this is ugly + // this is often a no-op + // but can't be 100% sure + if (*op.getStringField("op") != 'n') { + error() << "replSet skipping bad op in oplog: " << op.toString(); } + return true; + } - const char* opType = op["op"].valuestrsafe(); - - bool isCommand(opType[0] == 'c'); + const char* opType = op["op"].valuestrsafe(); - for ( int createCollection = 0; createCollection < 2; createCollection++ ) { - try { - boost::scoped_ptr<Lock::GlobalWrite> globalWriteLock; + bool isCommand(opType[0] == 'c'); - // DB lock always acquires the global lock - boost::scoped_ptr<Lock::DBLock> dbLock; - boost::scoped_ptr<Lock::CollectionLock> collectionLock; + for (int createCollection = 0; createCollection < 2; createCollection++) { + try { + boost::scoped_ptr<Lock::GlobalWrite> globalWriteLock; - bool isIndexBuild = opType[0] == 'i' && - nsToCollectionSubstring( ns ) == "system.indexes"; + // DB lock always acquires the global lock + boost::scoped_ptr<Lock::DBLock> dbLock; + boost::scoped_ptr<Lock::CollectionLock> collectionLock; - if (isCommand) { - // a command may need a global write lock. so we will conservatively go - // ahead and grab one here. suboptimal. :-( - globalWriteLock.reset(new Lock::GlobalWrite(txn->lockState())); - } - else if (isIndexBuild) { - dbLock.reset(new Lock::DBLock(txn->lockState(), - nsToDatabaseSubstring(ns), MODE_X)); - } - else if (isCrudOpType(opType)) { - LockMode mode = createCollection ? MODE_X : MODE_IX; - dbLock.reset(new Lock::DBLock(txn->lockState(), - nsToDatabaseSubstring(ns), mode)); - collectionLock.reset(new Lock::CollectionLock(txn->lockState(), ns, mode)); - - if (!createCollection && !dbHolder().get(txn, nsToDatabaseSubstring(ns))) { - // need to create database, try again - continue; - } - } - else { - // Unknown op? - dbLock.reset(new Lock::DBLock(txn->lockState(), - nsToDatabaseSubstring(ns), MODE_X)); - } + bool isIndexBuild = opType[0] == 'i' && nsToCollectionSubstring(ns) == "system.indexes"; - Client::Context ctx(txn, ns); + if (isCommand) { + // a command may need a global write lock. so we will conservatively go + // ahead and grab one here. suboptimal. 
:-( + globalWriteLock.reset(new Lock::GlobalWrite(txn->lockState())); + } else if (isIndexBuild) { + dbLock.reset(new Lock::DBLock(txn->lockState(), nsToDatabaseSubstring(ns), MODE_X)); + } else if (isCrudOpType(opType)) { + LockMode mode = createCollection ? MODE_X : MODE_IX; + dbLock.reset(new Lock::DBLock(txn->lockState(), nsToDatabaseSubstring(ns), mode)); + collectionLock.reset(new Lock::CollectionLock(txn->lockState(), ns, mode)); - if ( createCollection == 0 && - !isIndexBuild && - isCrudOpType(opType) && - ctx.db()->getCollection(ns) == NULL ) { - // uh, oh, we need to create collection - // try again + if (!createCollection && !dbHolder().get(txn, nsToDatabaseSubstring(ns))) { + // need to create database, try again continue; } - - // For non-initial-sync, we convert updates to upserts - // to suppress errors when replaying oplog entries. - bool ok = !applyOperation_inlock(txn, ctx.db(), op, true, convertUpdateToUpsert); - opsAppliedStats.increment(); - return ok; + } else { + // Unknown op? + dbLock.reset(new Lock::DBLock(txn->lockState(), nsToDatabaseSubstring(ns), MODE_X)); } - catch (const WriteConflictException&) { - log() << "WriteConflictException while doing oplog application on: " << ns - << ", retrying."; - createCollection--; + + Client::Context ctx(txn, ns); + + if (createCollection == 0 && !isIndexBuild && isCrudOpType(opType) && + ctx.db()->getCollection(ns) == NULL) { + // uh, oh, we need to create collection + // try again + continue; } - } - // Keeps the compiler warnings happy - invariant(false); - return false; + // For non-initial-sync, we convert updates to upserts + // to suppress errors when replaying oplog entries. + bool ok = !applyOperation_inlock(txn, ctx.db(), op, true, convertUpdateToUpsert); + opsAppliedStats.increment(); + return ok; + } catch (const WriteConflictException&) { + log() << "WriteConflictException while doing oplog application on: " << ns + << ", retrying."; + createCollection--; + } } - // The pool threads call this to prefetch each op - void SyncTail::prefetchOp(const BSONObj& op) { - initializePrefetchThread(); - - const char *ns = op.getStringField("ns"); - if (ns && (ns[0] != '\0')) { - try { - // one possible tweak here would be to stay in the read lock for this database - // for multiple prefetches if they are for the same database. - OperationContextImpl txn; - AutoGetCollectionForRead ctx(&txn, ns); - Database* db = ctx.getDb(); - if (db) { - prefetchPagesForReplicatedOp(&txn, db, op); - } - } - catch (const DBException& e) { - LOG(2) << "ignoring exception in prefetchOp(): " << e.what() << endl; - } - catch (const std::exception& e) { - log() << "Unhandled std::exception in prefetchOp(): " << e.what() << endl; - fassertFailed(16397); + // Keeps the compiler warnings happy + invariant(false); + return false; +} + +// The pool threads call this to prefetch each op +void SyncTail::prefetchOp(const BSONObj& op) { + initializePrefetchThread(); + + const char* ns = op.getStringField("ns"); + if (ns && (ns[0] != '\0')) { + try { + // one possible tweak here would be to stay in the read lock for this database + // for multiple prefetches if they are for the same database. 
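
// Aside: the "possible tweak" mentioned above -- holding one database read lock
// across a run of consecutive prefetches that target the same database -- could
// look roughly like this. Purely a sketch under simplified types: ops are reduced
// to (database name, payload) pairs, and lockDb/prefetch are caller-supplied
// stand-ins for the real lock and prefetch calls, not APIs from this codebase.

#include <cstddef>
#include <deque>
#include <string>
#include <utility>

template <typename LockDbFn, typename PrefetchFn>
void prefetchGroupedByDatabase(const std::deque<std::pair<std::string, int>>& ops,
                               LockDbFn lockDb,
                               PrefetchFn prefetch) {
    std::size_t i = 0;
    while (i < ops.size()) {
        const std::string& dbName = ops[i].first;
        auto guard = lockDb(dbName);  // one shared lock spans the whole run
        std::size_t j = i;
        while (j < ops.size() && ops[j].first == dbName) {
            prefetch(ops[j].second);  // page in index/document for one op
            ++j;
        }
        i = j;  // guard released here; the next run may hit another database
    }
}
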
+ OperationContextImpl txn; + AutoGetCollectionForRead ctx(&txn, ns); + Database* db = ctx.getDb(); + if (db) { + prefetchPagesForReplicatedOp(&txn, db, op); } + } catch (const DBException& e) { + LOG(2) << "ignoring exception in prefetchOp(): " << e.what() << endl; + } catch (const std::exception& e) { + log() << "Unhandled std::exception in prefetchOp(): " << e.what() << endl; + fassertFailed(16397); } } +} - // Doles out all the work to the reader pool threads and waits for them to complete - void SyncTail::prefetchOps(const std::deque<BSONObj>& ops) { - for (std::deque<BSONObj>::const_iterator it = ops.begin(); - it != ops.end(); - ++it) { - _prefetcherPool.schedule(&prefetchOp, *it); - } - _prefetcherPool.join(); +// Doles out all the work to the reader pool threads and waits for them to complete +void SyncTail::prefetchOps(const std::deque<BSONObj>& ops) { + for (std::deque<BSONObj>::const_iterator it = ops.begin(); it != ops.end(); ++it) { + _prefetcherPool.schedule(&prefetchOp, *it); } - - // Doles out all the work to the writer pool threads and waits for them to complete - void SyncTail::applyOps(const std::vector< std::vector<BSONObj> >& writerVectors) { - TimerHolder timer(&applyBatchStats); - for (std::vector< std::vector<BSONObj> >::const_iterator it = writerVectors.begin(); - it != writerVectors.end(); - ++it) { - if (!it->empty()) { - _writerPool.schedule(_applyFunc, boost::cref(*it), this); - } + _prefetcherPool.join(); +} + +// Doles out all the work to the writer pool threads and waits for them to complete +void SyncTail::applyOps(const std::vector<std::vector<BSONObj>>& writerVectors) { + TimerHolder timer(&applyBatchStats); + for (std::vector<std::vector<BSONObj>>::const_iterator it = writerVectors.begin(); + it != writerVectors.end(); + ++it) { + if (!it->empty()) { + _writerPool.schedule(_applyFunc, boost::cref(*it), this); } - _writerPool.join(); } + _writerPool.join(); +} - // Doles out all the work to the writer pool threads and waits for them to complete - OpTime SyncTail::multiApply(OperationContext* txn, std::deque<BSONObj>& ops) { +// Doles out all the work to the writer pool threads and waits for them to complete +OpTime SyncTail::multiApply(OperationContext* txn, std::deque<BSONObj>& ops) { + if (getGlobalEnvironment()->getGlobalStorageEngine()->isMmapV1()) { + // Use a ThreadPool to prefetch all the operations in a batch. + prefetchOps(ops); + } - if (getGlobalEnvironment()->getGlobalStorageEngine()->isMmapV1()) { - // Use a ThreadPool to prefetch all the operations in a batch. - prefetchOps(ops); - } - - std::vector< std::vector<BSONObj> > writerVectors(replWriterThreadCount); - fillWriterVectors(ops, &writerVectors); - LOG(2) << "replication batch size is " << ops.size() << endl; - // We must grab this because we're going to grab write locks later. - // We hold this mutex the entire time we're writing; it doesn't matter - // because all readers are blocked anyway. 
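
// Aside: the fillWriterVectors() call above is what makes the parallel apply
// safe: each op is routed to a writer bucket by a hash of its namespace (plus
// the document _id on storage engines with document-level locking), so all ops
// against the same key land on the same worker and per-key order is preserved.
// The routing, reduced to a sketch with std::hash in place of MurmurHash3; ops
// are simplified to (ns, payload) pairs, with payload standing in for the BSON op:

#include <cstddef>
#include <deque>
#include <functional>
#include <string>
#include <utility>
#include <vector>

std::vector<std::vector<int>> partitionByNamespace(
    const std::deque<std::pair<std::string, int>>& ops, std::size_t numWriters) {
    std::vector<std::vector<int>> buckets(numWriters);
    for (const auto& op : ops) {
        const std::size_t h = std::hash<std::string>()(op.first);  // hash of "ns"
        buckets[h % numWriters].push_back(op.second);              // stable per-ns routing
    }
    return buckets;
}
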
- SimpleMutex::scoped_lock fsynclk(filesLockedFsync); - - // stop all readers until we're done - Lock::ParallelBatchWriterMode pbwm(txn->lockState()); - - ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator(); - if (replCoord->getMemberState().primary() && - !replCoord->isWaitingForApplierToDrain()) { - - severe() << "attempting to replicate ops while primary"; - fassertFailed(28527); - } + std::vector<std::vector<BSONObj>> writerVectors(replWriterThreadCount); + fillWriterVectors(ops, &writerVectors); + LOG(2) << "replication batch size is " << ops.size() << endl; + // We must grab this because we're going to grab write locks later. + // We hold this mutex the entire time we're writing; it doesn't matter + // because all readers are blocked anyway. + SimpleMutex::scoped_lock fsynclk(filesLockedFsync); + + // stop all readers until we're done + Lock::ParallelBatchWriterMode pbwm(txn->lockState()); + + ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator(); + if (replCoord->getMemberState().primary() && !replCoord->isWaitingForApplierToDrain()) { + severe() << "attempting to replicate ops while primary"; + fassertFailed(28527); + } - applyOps(writerVectors); + applyOps(writerVectors); - if (inShutdown()) { - return OpTime(); - } + if (inShutdown()) { + return OpTime(); + } - OpTime lastOpTime = writeOpsToOplog(txn, ops); + OpTime lastOpTime = writeOpsToOplog(txn, ops); - BackgroundSync::get()->notify(txn); + BackgroundSync::get()->notify(txn); - return lastOpTime; - } + return lastOpTime; +} - void SyncTail::fillWriterVectors(const std::deque<BSONObj>& ops, - std::vector< std::vector<BSONObj> >* writerVectors) { - - for (std::deque<BSONObj>::const_iterator it = ops.begin(); - it != ops.end(); - ++it) { - const BSONElement e = it->getField("ns"); - verify(e.type() == String); - const char* ns = e.valuestr(); - int len = e.valuestrsize(); - uint32_t hash = 0; - MurmurHash3_x86_32( ns, len, 0, &hash); - - const char* opType = it->getField( "op" ).valuestrsafe(); - - if (getGlobalEnvironment()->getGlobalStorageEngine()->supportsDocLocking() && - isCrudOpType(opType)) { - BSONElement id; - switch (opType[0]) { +void SyncTail::fillWriterVectors(const std::deque<BSONObj>& ops, + std::vector<std::vector<BSONObj>>* writerVectors) { + for (std::deque<BSONObj>::const_iterator it = ops.begin(); it != ops.end(); ++it) { + const BSONElement e = it->getField("ns"); + verify(e.type() == String); + const char* ns = e.valuestr(); + int len = e.valuestrsize(); + uint32_t hash = 0; + MurmurHash3_x86_32(ns, len, 0, &hash); + + const char* opType = it->getField("op").valuestrsafe(); + + if (getGlobalEnvironment()->getGlobalStorageEngine()->supportsDocLocking() && + isCrudOpType(opType)) { + BSONElement id; + switch (opType[0]) { case 'u': id = it->getField("o2").Obj()["_id"]; break; @@ -328,381 +302,373 @@ namespace repl { case 'i': id = it->getField("o").Obj()["_id"]; break; - } - - const size_t idHash = BSONElement::Hasher()( id ); - MurmurHash3_x86_32(&idHash, sizeof(idHash), hash, &hash); } - (*writerVectors)[hash % writerVectors->size()].push_back(*it); + const size_t idHash = BSONElement::Hasher()(id); + MurmurHash3_x86_32(&idHash, sizeof(idHash), hash, &hash); } - } - void SyncTail::oplogApplication(OperationContext* txn, const OpTime& endOpTime) { - _applyOplogUntil(txn, endOpTime); - } - /* applies oplog from "now" until endOpTime using the applier threads for initial sync*/ - void SyncTail::_applyOplogUntil(OperationContext* txn, const OpTime& endOpTime) { - unsigned long 
long bytesApplied = 0; - unsigned long long entriesApplied = 0; - while (true) { - OpQueue ops; - - while (!tryPopAndWaitForMore(txn, &ops, getGlobalReplicationCoordinator())) { - // nothing came back last time, so go again - if (ops.empty()) continue; - - // Check if we reached the end - const BSONObj currentOp = ops.back(); - const OpTime currentOpTime = currentOp["ts"]._opTime(); + (*writerVectors)[hash % writerVectors->size()].push_back(*it); + } +} +void SyncTail::oplogApplication(OperationContext* txn, const OpTime& endOpTime) { + _applyOplogUntil(txn, endOpTime); +} - // When we reach the end return this batch - if (currentOpTime == endOpTime) { - break; - } - else if (currentOpTime > endOpTime) { - severe() << "Applied past expected end " << endOpTime << " to " << currentOpTime - << " without seeing it. Rollback?"; - fassertFailedNoTrace(18693); - } +/* applies oplog from "now" until endOpTime using the applier threads for initial sync*/ +void SyncTail::_applyOplogUntil(OperationContext* txn, const OpTime& endOpTime) { + unsigned long long bytesApplied = 0; + unsigned long long entriesApplied = 0; + while (true) { + OpQueue ops; - // apply replication batch limits - if (ops.getSize() > replBatchLimitBytes) - break; - if (ops.getDeque().size() > replBatchLimitOperations) - break; - }; + while (!tryPopAndWaitForMore(txn, &ops, getGlobalReplicationCoordinator())) { + // nothing came back last time, so go again + if (ops.empty()) + continue; - if (ops.empty()) { - severe() << "got no ops for batch..."; - fassertFailedNoTrace(18692); + // Check if we reached the end + const BSONObj currentOp = ops.back(); + const OpTime currentOpTime = currentOp["ts"]._opTime(); + + // When we reach the end return this batch + if (currentOpTime == endOpTime) { + break; + } else if (currentOpTime > endOpTime) { + severe() << "Applied past expected end " << endOpTime << " to " << currentOpTime + << " without seeing it. 
Rollback?"; + fassertFailedNoTrace(18693); } - const BSONObj lastOp = ops.back().getOwned(); + // apply replication batch limits + if (ops.getSize() > replBatchLimitBytes) + break; + if (ops.getDeque().size() > replBatchLimitOperations) + break; + }; - // Tally operation information - bytesApplied += ops.getSize(); - entriesApplied += ops.getDeque().size(); + if (ops.empty()) { + severe() << "got no ops for batch..."; + fassertFailedNoTrace(18692); + } - const OpTime lastOpTime = multiApply(txn, ops.getDeque()); + const BSONObj lastOp = ops.back().getOwned(); - if (inShutdown()) { - return; - } + // Tally operation information + bytesApplied += ops.getSize(); + entriesApplied += ops.getDeque().size(); - // if the last op applied was our end, return - if (lastOpTime == endOpTime) { - LOG(1) << "SyncTail applied " << entriesApplied - << " entries (" << bytesApplied << " bytes)" - << " and finished at opTime " << endOpTime.toStringPretty(); - return; - } - } // end of while (true) - } + const OpTime lastOpTime = multiApply(txn, ops.getDeque()); -namespace { - void tryToGoLiveAsASecondary(OperationContext* txn, ReplicationCoordinator* replCoord) { - if (replCoord->isInPrimaryOrSecondaryState()) { + if (inShutdown()) { return; } - ScopedTransaction transaction(txn, MODE_S); - Lock::GlobalRead readLock(txn->lockState()); - - if (replCoord->getMaintenanceMode()) { - // we're not actually going live + // if the last op applied was our end, return + if (lastOpTime == endOpTime) { + LOG(1) << "SyncTail applied " << entriesApplied << " entries (" << bytesApplied + << " bytes)" + << " and finished at opTime " << endOpTime.toStringPretty(); return; } + } // end of while (true) +} - // Only state RECOVERING can transition to SECONDARY. - MemberState state(replCoord->getMemberState()); - if (!state.recovering()) { - return; - } +namespace { +void tryToGoLiveAsASecondary(OperationContext* txn, ReplicationCoordinator* replCoord) { + if (replCoord->isInPrimaryOrSecondaryState()) { + return; + } - OpTime minvalid = getMinValid(txn); - if (minvalid > replCoord->getMyLastOptime()) { - return; - } + ScopedTransaction transaction(txn, MODE_S); + Lock::GlobalRead readLock(txn->lockState()); - bool worked = replCoord->setFollowerMode(MemberState::RS_SECONDARY); - if (!worked) { - warning() << "Failed to transition into " << MemberState(MemberState::RS_SECONDARY) - << ". Current state: " << replCoord->getMemberState(); - } + if (replCoord->getMaintenanceMode()) { + // we're not actually going live + return; + } + + // Only state RECOVERING can transition to SECONDARY. + MemberState state(replCoord->getMemberState()); + if (!state.recovering()) { + return; + } + + OpTime minvalid = getMinValid(txn); + if (minvalid > replCoord->getMyLastOptime()) { + return; + } + + bool worked = replCoord->setFollowerMode(MemberState::RS_SECONDARY); + if (!worked) { + warning() << "Failed to transition into " << MemberState(MemberState::RS_SECONDARY) + << ". Current state: " << replCoord->getMemberState(); } } +} - /* tail an oplog. ok to return, will be re-called. */ - void SyncTail::oplogApplication() { - ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator(); +/* tail an oplog. ok to return, will be re-called. 
*/ +void SyncTail::oplogApplication() { + ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator(); - while(!inShutdown()) { - OpQueue ops; - OperationContextImpl txn; + while (!inShutdown()) { + OpQueue ops; + OperationContextImpl txn; - Timer batchTimer; - int lastTimeChecked = 0; + Timer batchTimer; + int lastTimeChecked = 0; - do { - int now = batchTimer.seconds(); + do { + int now = batchTimer.seconds(); - // apply replication batch limits - if (!ops.empty()) { - if (now > replBatchLimitSeconds) - break; - if (ops.getDeque().size() > replBatchLimitOperations) - break; - } - // occasionally check some things - // (always checked in the first iteration of this do-while loop, because - // ops is empty) - if (ops.empty() || now > lastTimeChecked) { - BackgroundSync* bgsync = BackgroundSync::get(); - if (bgsync->getInitialSyncRequestedFlag()) { - // got a resync command - return; - } - lastTimeChecked = now; - // can we become secondary? - // we have to check this before calling mgr, as we must be a secondary to - // become primary - tryToGoLiveAsASecondary(&txn, replCoord); + // apply replication batch limits + if (!ops.empty()) { + if (now > replBatchLimitSeconds) + break; + if (ops.getDeque().size() > replBatchLimitOperations) + break; + } + // occasionally check some things + // (always checked in the first iteration of this do-while loop, because + // ops is empty) + if (ops.empty() || now > lastTimeChecked) { + BackgroundSync* bgsync = BackgroundSync::get(); + if (bgsync->getInitialSyncRequestedFlag()) { + // got a resync command + return; } + lastTimeChecked = now; + // can we become secondary? + // we have to check this before calling mgr, as we must be a secondary to + // become primary + tryToGoLiveAsASecondary(&txn, replCoord); + } - const int slaveDelaySecs = replCoord->getSlaveDelaySecs().total_seconds(); - if (!ops.empty() && slaveDelaySecs > 0) { - const BSONObj& lastOp = ops.getDeque().back(); - const unsigned int opTimestampSecs = lastOp["ts"]._opTime().getSecs(); + const int slaveDelaySecs = replCoord->getSlaveDelaySecs().total_seconds(); + if (!ops.empty() && slaveDelaySecs > 0) { + const BSONObj& lastOp = ops.getDeque().back(); + const unsigned int opTimestampSecs = lastOp["ts"]._opTime().getSecs(); - // Stop the batch as the lastOp is too new to be applied. If we continue - // on, we can get ops that are way ahead of the delay and this will - // make this thread sleep longer when handleSlaveDelay is called - // and apply ops much sooner than we like. - if (opTimestampSecs > static_cast<unsigned int>(time(0) - slaveDelaySecs)) { - break; - } + // Stop the batch as the lastOp is too new to be applied. If we continue + // on, we can get ops that are way ahead of the delay and this will + // make this thread sleep longer when handleSlaveDelay is called + // and apply ops much sooner than we like. 
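
// Aside: the check below, as a standalone predicate. With a slave delay of D
// seconds, an op stamped at time T may not be applied before T + D, so any op
// with T > now - D is too new and must end the batch here rather than be applied
// early. A sketch with plain time_t arithmetic (illustrative types only):

#include <ctime>

inline bool tooNewForSlaveDelay(unsigned int opTimestampSecs, int slaveDelaySecs) {
    // e.g. now = 1000 and slaveDelaySecs = 60: ops stamped after 940 must wait.
    return opTimestampSecs > static_cast<unsigned int>(std::time(nullptr) - slaveDelaySecs);
}
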
+ if (opTimestampSecs > static_cast<unsigned int>(time(0) - slaveDelaySecs)) { + break; } - // keep fetching more ops as long as we haven't filled up a full batch yet - } while (!tryPopAndWaitForMore(&txn, &ops, replCoord) && // tryPopAndWaitForMore returns - // true when we need to end a - // batch early - (ops.getSize() < replBatchLimitBytes) && - !inShutdown()); - - // For pausing replication in tests - while (MONGO_FAIL_POINT(rsSyncApplyStop)) { - sleepmillis(0); } + // keep fetching more ops as long as we haven't filled up a full batch yet + } while (!tryPopAndWaitForMore(&txn, &ops, replCoord) && // tryPopAndWaitForMore returns + // true when we need to end a + // batch early + (ops.getSize() < replBatchLimitBytes) && + !inShutdown()); + + // For pausing replication in tests + while (MONGO_FAIL_POINT(rsSyncApplyStop)) { + sleepmillis(0); + } - if (ops.empty()) { - continue; - } + if (ops.empty()) { + continue; + } - const BSONObj& lastOp = ops.getDeque().back(); - handleSlaveDelay(lastOp); + const BSONObj& lastOp = ops.getDeque().back(); + handleSlaveDelay(lastOp); - // Set minValid to the last op to be applied in this next batch. - // This will cause this node to go into RECOVERING state - // if we should crash and restart before updating the oplog - OpTime minValid = lastOp["ts"]._opTime(); - setMinValid(&txn, minValid); - multiApply(&txn, ops.getDeque()); - } + // Set minValid to the last op to be applied in this next batch. + // This will cause this node to go into RECOVERING state + // if we should crash and restart before updating the oplog + OpTime minValid = lastOp["ts"]._opTime(); + setMinValid(&txn, minValid); + multiApply(&txn, ops.getDeque()); } +} - // Copies ops out of the bgsync queue into the deque passed in as a parameter. - // Returns true if the batch should be ended early. - // Batch should end early if we encounter a command, or if - // there are no further ops in the bgsync queue to read. - // This function also blocks 1 second waiting for new ops to appear in the bgsync - // queue. We can't block forever because there are maintenance things we need - // to periodically check in the loop. - bool SyncTail::tryPopAndWaitForMore(OperationContext* txn, - SyncTail::OpQueue* ops, - ReplicationCoordinator* replCoord) { - BSONObj op; - // Check to see if there are ops waiting in the bgsync queue - bool peek_success = peek(&op); - - if (!peek_success) { - // if we don't have anything in the queue, wait a bit for something to appear - if (ops->empty()) { - if (replCoord->isWaitingForApplierToDrain()) { - BackgroundSync::get()->waitUntilPaused(); - if (peek(&op)) { - // The producer generated a last batch of ops before pausing so return - // false so that we'll come back and apply them before signaling the drain - // is complete. - return false; - } - replCoord->signalDrainComplete(txn); +// Copies ops out of the bgsync queue into the deque passed in as a parameter. +// Returns true if the batch should be ended early. +// Batch should end early if we encounter a command, or if +// there are no further ops in the bgsync queue to read. +// This function also blocks 1 second waiting for new ops to appear in the bgsync +// queue. We can't block forever because there are maintenance things we need +// to periodically check in the loop. 
+bool SyncTail::tryPopAndWaitForMore(OperationContext* txn,
+                                    SyncTail::OpQueue* ops,
+                                    ReplicationCoordinator* replCoord) {
+    BSONObj op;
+    // Check to see if there are ops waiting in the bgsync queue
+    bool peek_success = peek(&op);
+
+    if (!peek_success) {
+        // if we don't have anything in the queue, wait a bit for something to appear
+        if (ops->empty()) {
+            if (replCoord->isWaitingForApplierToDrain()) {
+                BackgroundSync::get()->waitUntilPaused();
+                if (peek(&op)) {
+                    // The producer generated a last batch of ops before pausing so return
+                    // false so that we'll come back and apply them before signaling the drain
+                    // is complete.
+                    return false;
                }
-                // block up to 1 second
-                _networkQueue->waitForMore();
-                return false;
+                replCoord->signalDrainComplete(txn);
            }
-
-            // otherwise, apply what we have
-            return true;
+            // block up to 1 second
+            _networkQueue->waitForMore();
+            return false;
        }

-        const char* ns = op["ns"].valuestrsafe();
-
-        // check for commands
-        if ((op["op"].valuestrsafe()[0] == 'c') ||
-            // Index builds are achieved through the use of an insert op, not a command op.
-            // The following line is the same as what the insert code uses to detect an index build.
-            ( *ns != '\0' && nsToCollectionSubstring(ns) == "system.indexes" )) {
-
-            if (ops->empty()) {
-                // apply commands one-at-a-time
-                ops->push_back(op);
-                _networkQueue->consume();
-            }
+        // otherwise, apply what we have
+        return true;
+    }

-            // otherwise, apply what we have so far and come back for the command
-            return true;
+    const char* ns = op["ns"].valuestrsafe();
+
+    // check for commands
+    if ((op["op"].valuestrsafe()[0] == 'c') ||
+        // Index builds are achieved through the use of an insert op, not a command op.
+        // The following line is the same as what the insert code uses to detect an index build.
+        (*ns != '\0' && nsToCollectionSubstring(ns) == "system.indexes")) {
+        if (ops->empty()) {
+            // apply commands one-at-a-time
+            ops->push_back(op);
+            _networkQueue->consume();
        }
-        // check for oplog version change
-        BSONElement elemVersion = op["v"];
-        int curVersion = 0;
-        if (elemVersion.eoo())
-            // missing version means version 1
-            curVersion = 1;
-        else
-            curVersion = elemVersion.Int();
-
-        if (curVersion != OPLOG_VERSION) {
-            severe() << "expected oplog version " << OPLOG_VERSION << " but found version "
-                     << curVersion << " in oplog entry: " << op;
-            fassertFailedNoTrace(18820);
-        }
-
+        // otherwise, apply what we have so far and come back for the command
+        return true;
+    }

-        // Copy the op to the deque and remove it from the bgsync queue.
-        ops->push_back(op);
-        _networkQueue->consume();
+    // check for oplog version change
+    BSONElement elemVersion = op["v"];
+    int curVersion = 0;
+    if (elemVersion.eoo())
+        // missing version means version 1
+        curVersion = 1;
+    else
+        curVersion = elemVersion.Int();
+
+    if (curVersion != OPLOG_VERSION) {
+        severe() << "expected oplog version " << OPLOG_VERSION << " but found version "
+                 << curVersion << " in oplog entry: " << op;
+        fassertFailedNoTrace(18820);
    }

-        // Go back for more ops
-        return false;
-    }

-    void SyncTail::handleSlaveDelay(const BSONObj& lastOp) {
-        ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator();
-        int slaveDelaySecs = replCoord->getSlaveDelaySecs().total_seconds();
-
-        // ignore slaveDelay if the box is still initializing. once
-        // it becomes secondary we can worry about it.
- if( slaveDelaySecs > 0 && replCoord->getMemberState().secondary() ) { - const OpTime ts = lastOp["ts"]._opTime(); - long long a = ts.getSecs(); - long long b = time(0); - long long lag = b - a; - long long sleeptime = slaveDelaySecs - lag; - if( sleeptime > 0 ) { - uassert(12000, "rs slaveDelay differential too big check clocks and systems", - sleeptime < 0x40000000); - if( sleeptime < 60 ) { - sleepsecs((int) sleeptime); - } - else { - warning() << "replSet slavedelay causing a long sleep of " << sleeptime - << " seconds"; - // sleep(hours) would prevent reconfigs from taking effect & such! - long long waitUntil = b + sleeptime; - while(time(0) < waitUntil) { - sleepsecs(6); - - // Handle reconfigs that changed the slave delay - if (replCoord->getSlaveDelaySecs().total_seconds() != slaveDelaySecs) - break; - } + // Copy the op to the deque and remove it from the bgsync queue. + ops->push_back(op); + _networkQueue->consume(); + + // Go back for more ops + return false; +} + +void SyncTail::handleSlaveDelay(const BSONObj& lastOp) { + ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator(); + int slaveDelaySecs = replCoord->getSlaveDelaySecs().total_seconds(); + + // ignore slaveDelay if the box is still initializing. once + // it becomes secondary we can worry about it. + if (slaveDelaySecs > 0 && replCoord->getMemberState().secondary()) { + const OpTime ts = lastOp["ts"]._opTime(); + long long a = ts.getSecs(); + long long b = time(0); + long long lag = b - a; + long long sleeptime = slaveDelaySecs - lag; + if (sleeptime > 0) { + uassert(12000, + "rs slaveDelay differential too big check clocks and systems", + sleeptime < 0x40000000); + if (sleeptime < 60) { + sleepsecs((int)sleeptime); + } else { + warning() << "replSet slavedelay causing a long sleep of " << sleeptime + << " seconds"; + // sleep(hours) would prevent reconfigs from taking effect & such! 
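
// Aside: the slice-and-recheck below, as a standalone sketch: rather than one
// long sleep, sleep a few seconds at a time and re-read the configured delay so
// a reconfig takes effect within seconds rather than hours. getDelaySecs is an
// illustrative caller-supplied callback, not an API from this codebase.

#include <chrono>
#include <ctime>
#include <functional>
#include <thread>

void sleepUntilWithRecheck(long long waitUntilSecs,
                           int delayWhenStarted,
                           const std::function<int()>& getDelaySecs) {
    while (std::time(nullptr) < waitUntilSecs) {
        std::this_thread::sleep_for(std::chrono::seconds(6));  // short slice
        if (getDelaySecs() != delayWhenStarted)
            break;  // a reconfig changed the delay; stop waiting early
    }
}
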
+ long long waitUntil = b + sleeptime; + while (time(0) < waitUntil) { + sleepsecs(6); + + // Handle reconfigs that changed the slave delay + if (replCoord->getSlaveDelaySecs().total_seconds() != slaveDelaySecs) + break; } } - } // endif slaveDelay - } + } + } // endif slaveDelay +} - static AtomicUInt32 replWriterWorkerId; +static AtomicUInt32 replWriterWorkerId; - static void initializeWriterThread() { - // Only do this once per thread - if (!ClientBasic::getCurrent()) { - Client::initThreadIfNotAlready(); - cc().getAuthorizationSession()->grantInternalAuthorization(); - } +static void initializeWriterThread() { + // Only do this once per thread + if (!ClientBasic::getCurrent()) { + Client::initThreadIfNotAlready(); + cc().getAuthorizationSession()->grantInternalAuthorization(); } +} - // This free function is used by the writer threads to apply each op - void multiSyncApply(const std::vector<BSONObj>& ops, SyncTail* st) { - initializeWriterThread(); +// This free function is used by the writer threads to apply each op +void multiSyncApply(const std::vector<BSONObj>& ops, SyncTail* st) { + initializeWriterThread(); - OperationContextImpl txn; + OperationContextImpl txn; - // allow us to get through the magic barrier - txn.lockState()->setIsBatchWriter(true); + // allow us to get through the magic barrier + txn.lockState()->setIsBatchWriter(true); - bool convertUpdatesToUpserts = true; + bool convertUpdatesToUpserts = true; - for (std::vector<BSONObj>::const_iterator it = ops.begin(); - it != ops.end(); - ++it) { - try { - if (!st->syncApply(&txn, *it, convertUpdatesToUpserts)) { - fassertFailedNoTrace(16359); - } + for (std::vector<BSONObj>::const_iterator it = ops.begin(); it != ops.end(); ++it) { + try { + if (!st->syncApply(&txn, *it, convertUpdatesToUpserts)) { + fassertFailedNoTrace(16359); } - catch (const DBException& e) { - error() << "writer worker caught exception: " << causedBy(e) - << " on: " << it->toString(); + } catch (const DBException& e) { + error() << "writer worker caught exception: " << causedBy(e) + << " on: " << it->toString(); - if (inShutdown()) { - return; - } - - fassertFailedNoTrace(16360); + if (inShutdown()) { + return; } + + fassertFailedNoTrace(16360); } } +} - // This free function is used by the initial sync writer threads to apply each op - void multiInitialSyncApply(const std::vector<BSONObj>& ops, SyncTail* st) { - initializeWriterThread(); - - OperationContextImpl txn; +// This free function is used by the initial sync writer threads to apply each op +void multiInitialSyncApply(const std::vector<BSONObj>& ops, SyncTail* st) { + initializeWriterThread(); - // allow us to get through the magic barrier - txn.lockState()->setIsBatchWriter(true); + OperationContextImpl txn; - for (std::vector<BSONObj>::const_iterator it = ops.begin(); - it != ops.end(); - ++it) { - try { - if (!st->syncApply(&txn, *it)) { + // allow us to get through the magic barrier + txn.lockState()->setIsBatchWriter(true); - if (st->shouldRetry(&txn, *it)) { - if (!st->syncApply(&txn, *it)) { - fassertFailedNoTrace(15915); - } + for (std::vector<BSONObj>::const_iterator it = ops.begin(); it != ops.end(); ++it) { + try { + if (!st->syncApply(&txn, *it)) { + if (st->shouldRetry(&txn, *it)) { + if (!st->syncApply(&txn, *it)) { + fassertFailedNoTrace(15915); } - - // If shouldRetry() returns false, fall through. 
- // This can happen if the document that was moved and missed by Cloner - // subsequently got deleted and no longer exists on the Sync Target at all } - } - catch (const DBException& e) { - error() << "writer worker caught exception: " << causedBy(e) - << " on: " << it->toString(); - if (inShutdown()) { - return; - } + // If shouldRetry() returns false, fall through. + // This can happen if the document that was moved and missed by Cloner + // subsequently got deleted and no longer exists on the Sync Target at all + } + } catch (const DBException& e) { + error() << "writer worker caught exception: " << causedBy(e) + << " on: " << it->toString(); - fassertFailedNoTrace(16361); + if (inShutdown()) { + return; } + + fassertFailedNoTrace(16361); } } +} -} // namespace repl -} // namespace mongo +} // namespace repl +} // namespace mongo diff --git a/src/mongo/db/repl/sync_tail.h b/src/mongo/db/repl/sync_tail.h index 1802a05f41c..82db52c02c3 100644 --- a/src/mongo/db/repl/sync_tail.h +++ b/src/mongo/db/repl/sync_tail.h @@ -36,108 +36,112 @@ namespace mongo { - class OperationContext; +class OperationContext; namespace repl { - class BackgroundSyncInterface; - class ReplicationCoordinator; +class BackgroundSyncInterface; +class ReplicationCoordinator; + +/** + * "Normal" replica set syncing + */ +class SyncTail : public Sync { + typedef void (*MultiSyncApplyFunc)(const std::vector<BSONObj>& ops, SyncTail* st); + +public: + SyncTail(BackgroundSyncInterface* q, MultiSyncApplyFunc func); + virtual ~SyncTail(); + virtual bool syncApply(OperationContext* txn, + const BSONObj& o, + bool convertUpdateToUpsert = false); /** - * "Normal" replica set syncing + * Runs _applyOplogUntil(stopOpTime) */ - class SyncTail : public Sync { - typedef void (*MultiSyncApplyFunc)(const std::vector<BSONObj>& ops, SyncTail* st); + virtual void oplogApplication(OperationContext* txn, const OpTime& stopOpTime); + + void oplogApplication(); + bool peek(BSONObj* obj); + + class OpQueue { public: - SyncTail(BackgroundSyncInterface *q, MultiSyncApplyFunc func); - virtual ~SyncTail(); - virtual bool syncApply(OperationContext* txn, - const BSONObj &o, - bool convertUpdateToUpsert = false); - - /** - * Runs _applyOplogUntil(stopOpTime) - */ - virtual void oplogApplication(OperationContext* txn, const OpTime& stopOpTime); - - void oplogApplication(); - bool peek(BSONObj* obj); - - class OpQueue { - public: - OpQueue() : _size(0) {} - size_t getSize() { return _size; } - std::deque<BSONObj>& getDeque() { return _deque; } - void push_back(BSONObj& op) { - _deque.push_back(op); - _size += op.objsize(); - } - bool empty() { - return _deque.empty(); - } - - BSONObj back() { - verify(!_deque.empty()); - return _deque.back(); - } - - private: - std::deque<BSONObj> _deque; - size_t _size; - }; - - // returns true if we should continue waiting for BSONObjs, false if we should - // stop waiting and apply the queue we have. Only returns false if !ops.empty(). - bool tryPopAndWaitForMore(OperationContext* txn, - OpQueue* ops, - ReplicationCoordinator* replCoord); - - protected: - // Cap the batches using the limit on journal commits. - // This works out to be 100 MB (64 bit) or 50 MB (32 bit) - static const unsigned int replBatchLimitBytes = dur::UncommittedBytesLimit; - static const int replBatchLimitSeconds = 1; - static const unsigned int replBatchLimitOperations = 5000; - - // Prefetch and write a deque of operations, using the supplied function. - // Initial Sync and Sync Tail each use a different function. 
- // Returns the last OpTime applied. - OpTime multiApply(OperationContext* txn, std::deque<BSONObj>& ops); - - /** - * Applies oplog entries until reaching "endOpTime". - * - * NOTE:Will not transition or check states - */ - void _applyOplogUntil(OperationContext* txn, const OpTime& endOpTime); + OpQueue() : _size(0) {} + size_t getSize() { + return _size; + } + std::deque<BSONObj>& getDeque() { + return _deque; + } + void push_back(BSONObj& op) { + _deque.push_back(op); + _size += op.objsize(); + } + bool empty() { + return _deque.empty(); + } + + BSONObj back() { + verify(!_deque.empty()); + return _deque.back(); + } private: - BackgroundSyncInterface* _networkQueue; + std::deque<BSONObj> _deque; + size_t _size; + }; - // Function to use during applyOps - MultiSyncApplyFunc _applyFunc; + // returns true if we should continue waiting for BSONObjs, false if we should + // stop waiting and apply the queue we have. Only returns false if !ops.empty(). + bool tryPopAndWaitForMore(OperationContext* txn, + OpQueue* ops, + ReplicationCoordinator* replCoord); - // Doles out all the work to the reader pool threads and waits for them to complete - void prefetchOps(const std::deque<BSONObj>& ops); - // Used by the thread pool readers to prefetch an op - static void prefetchOp(const BSONObj& op); +protected: + // Cap the batches using the limit on journal commits. + // This works out to be 100 MB (64 bit) or 50 MB (32 bit) + static const unsigned int replBatchLimitBytes = dur::UncommittedBytesLimit; + static const int replBatchLimitSeconds = 1; + static const unsigned int replBatchLimitOperations = 5000; - // Doles out all the work to the writer pool threads and waits for them to complete - void applyOps(const std::vector< std::vector<BSONObj> >& writerVectors); + // Prefetch and write a deque of operations, using the supplied function. + // Initial Sync and Sync Tail each use a different function. + // Returns the last OpTime applied. + OpTime multiApply(OperationContext* txn, std::deque<BSONObj>& ops); - void fillWriterVectors(const std::deque<BSONObj>& ops, - std::vector< std::vector<BSONObj> >* writerVectors); - void handleSlaveDelay(const BSONObj& op); + /** + * Applies oplog entries until reaching "endOpTime". 
+     *
+     * NOTE: Will not transition or check states
+     */
+    void _applyOplogUntil(OperationContext* txn, const OpTime& endOpTime);

+private:
+    BackgroundSyncInterface* _networkQueue;

+    // Function to use during applyOps
+    MultiSyncApplyFunc _applyFunc;

+    // Doles out all the work to the reader pool threads and waits for them to complete
+    void prefetchOps(const std::deque<BSONObj>& ops);
+    // Used by the thread pool readers to prefetch an op
+    static void prefetchOp(const BSONObj& op);

+    // Doles out all the work to the writer pool threads and waits for them to complete
+    void applyOps(const std::vector<std::vector<BSONObj>>& writerVectors);

+    void fillWriterVectors(const std::deque<BSONObj>& ops,
+                           std::vector<std::vector<BSONObj>>* writerVectors);
+    void handleSlaveDelay(const BSONObj& op);

+    // persistent pool of worker threads for writing ops to the databases
+    threadpool::ThreadPool _writerPool;
+    // persistent pool of worker threads for prefetching
+    threadpool::ThreadPool _prefetcherPool;
+};

-    // These free functions are used by the thread pool workers to write ops to the db.
-    void multiSyncApply(const std::vector<BSONObj>& ops, SyncTail* st);
-    void multiInitialSyncApply(const std::vector<BSONObj>& ops, SyncTail* st);
+// These free functions are used by the thread pool workers to write ops to the db.
+void multiSyncApply(const std::vector<BSONObj>& ops, SyncTail* st);
+void multiInitialSyncApply(const std::vector<BSONObj>& ops, SyncTail* st);

-} // namespace repl
-} // namespace mongo
+}  // namespace repl
+}  // namespace mongo
diff --git a/src/mongo/db/repl/topology_coordinator.cpp b/src/mongo/db/repl/topology_coordinator.cpp
index 99738a38421..7ca7ba6aa84 100644
--- a/src/mongo/db/repl/topology_coordinator.cpp
+++ b/src/mongo/db/repl/topology_coordinator.cpp
@@ -39,30 +39,30 @@
 namespace mongo {
 namespace repl {
 namespace {
-    static const int kLeaderValue = 0;
-    static const int kFollowerValue = 1;
-    static const int kCandidateValue = 2;
+static const int kLeaderValue = 0;
+static const int kFollowerValue = 1;
+static const int kCandidateValue = 2;
 }  // namespace

-    const TopologyCoordinator::Role TopologyCoordinator::Role::leader(kLeaderValue);
-    const TopologyCoordinator::Role TopologyCoordinator::Role::follower(kFollowerValue);
-    const TopologyCoordinator::Role TopologyCoordinator::Role::candidate(kCandidateValue);
+const TopologyCoordinator::Role TopologyCoordinator::Role::leader(kLeaderValue);
+const TopologyCoordinator::Role TopologyCoordinator::Role::follower(kFollowerValue);
+const TopologyCoordinator::Role TopologyCoordinator::Role::candidate(kCandidateValue);

-    TopologyCoordinator::Role::Role(int value) : _value(value) {}
+TopologyCoordinator::Role::Role(int value) : _value(value) {}

-    std::string TopologyCoordinator::Role::toString() const {
-        switch(_value) {
+std::string TopologyCoordinator::Role::toString() const {
+    switch (_value) {
        case kLeaderValue:
            return "leader";
        case kFollowerValue:
            return "follower";
        case kCandidateValue:
            return "candidate";
-        }
-        invariant(false);
    }
+    invariant(false);
+}

-    TopologyCoordinator::~TopologyCoordinator() {}
+TopologyCoordinator::~TopologyCoordinator() {}

 }  // namespace repl
 }  // namespace mongo
diff --git a/src/mongo/db/repl/topology_coordinator.h b/src/mongo/db/repl/topology_coordinator.h
index
97f85faf521..315882020f0 100644 --- a/src/mongo/db/repl/topology_coordinator.h +++ b/src/mongo/db/repl/topology_coordinator.h @@ -42,364 +42,367 @@ namespace mongo { namespace repl { - class HeartbeatResponseAction; - class ReplSetHeartbeatArgs; - class ReplicaSetConfig; - class TagSubgroup; - struct MemberState; +class HeartbeatResponseAction; +class ReplSetHeartbeatArgs; +class ReplicaSetConfig; +class TagSubgroup; +struct MemberState; + +/** + * Replication Topology Coordinator interface. + * + * This object is responsible for managing the topology of the cluster. + * Tasks include consensus and leader election, chaining, and configuration management. + * Methods of this class should be non-blocking. + */ +class TopologyCoordinator { + MONGO_DISALLOW_COPYING(TopologyCoordinator); + +public: + class Role; + + virtual ~TopologyCoordinator(); + + //////////////////////////////////////////////////////////// + // + // State inspection methods. + // + //////////////////////////////////////////////////////////// + + /** + * Gets the role of this member in the replication protocol. + */ + virtual Role getRole() const = 0; + + /** + * Gets the MemberState of this member in the replica set. + */ + virtual MemberState getMemberState() const = 0; + + /** + * Returns the address of the current sync source, or an empty HostAndPort if there is no + * current sync source. + */ + virtual HostAndPort getSyncSourceAddress() const = 0; + + /** + * Retrieves a vector of HostAndPorts containing all nodes that are neither DOWN nor + * ourself. + */ + virtual std::vector<HostAndPort> getMaybeUpHostAndPorts() const = 0; + + /** + * Gets the earliest time the current node will stand for election. + */ + virtual Date_t getStepDownTime() const = 0; + + /** + * Gets the current value of the maintenance mode counter. + */ + virtual int getMaintenanceCount() const = 0; + + //////////////////////////////////////////////////////////// + // + // Basic state manipulation methods. + // + //////////////////////////////////////////////////////////// + + /** + * Sets the index into the config used when we next choose a sync source + */ + virtual void setForceSyncSourceIndex(int index) = 0; + + /** + * Chooses and sets a new sync source, based on our current knowledge of the world. + */ + virtual HostAndPort chooseNewSyncSource(Date_t now, const OpTime& lastOpApplied) = 0; + + /** + * Suppresses selecting "host" as sync source until "until". + */ + virtual void blacklistSyncSource(const HostAndPort& host, Date_t until) = 0; + + /** + * Removes a single entry "host" from the list of potential sync sources which we + * have blacklisted, if it is supposed to be unblacklisted by "now". + */ + virtual void unblacklistSyncSource(const HostAndPort& host, Date_t now) = 0; + + /** + * Clears the list of potential sync sources we have blacklisted. + */ + virtual void clearSyncSourceBlacklist() = 0; + + /** + * Determines if a new sync source should be chosen, if a better candidate sync source is + * available. If the current sync source's last optime is more than _maxSyncSourceLagSecs + * behind any syncable source, this function returns true. + * + * "now" is used to skip over currently blacklisted sync sources. + */ + virtual bool shouldChangeSyncSource(const HostAndPort& currentSource, Date_t now) const = 0; /** - * Replication Topology Coordinator interface. + * Checks whether we are a single node set and we are not in a stepdown period. If so, + * puts us into candidate mode, otherwise does nothing. 
This is used to ensure that + * nodes in a single node replset become primary again when their stepdown period ends. + */ + virtual bool becomeCandidateIfStepdownPeriodOverAndSingleNodeSet(Date_t now) = 0; + + /** + * Sets the earliest time the current node will stand for election to "newTime". * - * This object is responsible for managing the topology of the cluster. - * Tasks include consensus and leader election, chaining, and configuration management. - * Methods of this class should be non-blocking. + * Until this time, while the node may report itself as electable, it will not stand + * for election. + */ + virtual void setElectionSleepUntil(Date_t newTime) = 0; + + /** + * Sets the reported mode of this node to one of RS_SECONDARY, RS_STARTUP2, RS_ROLLBACK or + * RS_RECOVERING, when getRole() == Role::follower. This is the interface by which the + * applier changes the reported member state of the current node, and enables or suppresses + * electability of the current node. All modes but RS_SECONDARY indicate an unelectable + * follower state (one that cannot transition to candidate). */ - class TopologyCoordinator { - MONGO_DISALLOW_COPYING(TopologyCoordinator); - public: - class Role; - - virtual ~TopologyCoordinator(); - - //////////////////////////////////////////////////////////// - // - // State inspection methods. - // - //////////////////////////////////////////////////////////// - - /** - * Gets the role of this member in the replication protocol. - */ - virtual Role getRole() const = 0; - - /** - * Gets the MemberState of this member in the replica set. - */ - virtual MemberState getMemberState() const = 0; - - /** - * Returns the address of the current sync source, or an empty HostAndPort if there is no - * current sync source. - */ - virtual HostAndPort getSyncSourceAddress() const = 0; - - /** - * Retrieves a vector of HostAndPorts containing all nodes that are neither DOWN nor - * ourself. - */ - virtual std::vector<HostAndPort> getMaybeUpHostAndPorts() const = 0; - - /** - * Gets the earliest time the current node will stand for election. - */ - virtual Date_t getStepDownTime() const = 0; - - /** - * Gets the current value of the maintenance mode counter. - */ - virtual int getMaintenanceCount() const = 0; - - //////////////////////////////////////////////////////////// - // - // Basic state manipulation methods. - // - //////////////////////////////////////////////////////////// - - /** - * Sets the index into the config used when we next choose a sync source - */ - virtual void setForceSyncSourceIndex(int index) = 0; - - /** - * Chooses and sets a new sync source, based on our current knowledge of the world. - */ - virtual HostAndPort chooseNewSyncSource(Date_t now, const OpTime& lastOpApplied) = 0; - - /** - * Suppresses selecting "host" as sync source until "until". - */ - virtual void blacklistSyncSource(const HostAndPort& host, Date_t until) = 0; - - /** - * Removes a single entry "host" from the list of potential sync sources which we - * have blacklisted, if it is supposed to be unblacklisted by "now". - */ - virtual void unblacklistSyncSource(const HostAndPort& host, Date_t now) = 0; - - /** - * Clears the list of potential sync sources we have blacklisted. - */ - virtual void clearSyncSourceBlacklist() = 0; - - /** - * Determines if a new sync source should be chosen, if a better candidate sync source is - * available. If the current sync source's last optime is more than _maxSyncSourceLagSecs - * behind any syncable source, this function returns true. 
- * - * "now" is used to skip over currently blacklisted sync sources. - */ - virtual bool shouldChangeSyncSource(const HostAndPort& currentSource, Date_t now) const = 0; - - /** - * Checks whether we are a single node set and we are not in a stepdown period. If so, - * puts us into candidate mode, otherwise does nothing. This is used to ensure that - * nodes in a single node replset become primary again when their stepdown period ends. - */ - virtual bool becomeCandidateIfStepdownPeriodOverAndSingleNodeSet(Date_t now) = 0; - - /** - * Sets the earliest time the current node will stand for election to "newTime". - * - * Until this time, while the node may report itself as electable, it will not stand - * for election. - */ - virtual void setElectionSleepUntil(Date_t newTime) = 0; - - /** - * Sets the reported mode of this node to one of RS_SECONDARY, RS_STARTUP2, RS_ROLLBACK or - * RS_RECOVERING, when getRole() == Role::follower. This is the interface by which the - * applier changes the reported member state of the current node, and enables or suppresses - * electability of the current node. All modes but RS_SECONDARY indicate an unelectable - * follower state (one that cannot transition to candidate). - */ - virtual void setFollowerMode(MemberState::MS newMode) = 0; - - /** - * Adjusts the maintenance mode count by "inc". - * - * It is an error to call this method if getRole() does not return Role::follower. - * It is an error to allow the maintenance count to go negative. - */ - virtual void adjustMaintenanceCountBy(int inc) = 0; - - //////////////////////////////////////////////////////////// - // - // Methods that prepare responses to command requests. - // - //////////////////////////////////////////////////////////// - - // produces a reply to a replSetSyncFrom command - virtual void prepareSyncFromResponse(const ReplicationExecutor::CallbackData& data, - const HostAndPort& target, - const OpTime& lastOpApplied, - BSONObjBuilder* response, - Status* result) = 0; - - // produce a reply to a replSetFresh command - virtual void prepareFreshResponse(const ReplicationCoordinator::ReplSetFreshArgs& args, - Date_t now, - OpTime lastOpApplied, - BSONObjBuilder* response, - Status* result) = 0; - - // produce a reply to a received electCmd - virtual void prepareElectResponse(const ReplicationCoordinator::ReplSetElectArgs& args, - Date_t now, - OpTime lastOpApplied, - BSONObjBuilder* response, - Status* result) = 0; - - // produce a reply to a heartbeat - virtual Status prepareHeartbeatResponse(Date_t now, - const ReplSetHeartbeatArgs& args, - const std::string& ourSetName, - const OpTime& lastOpApplied, - ReplSetHeartbeatResponse* response) = 0; - - // produce a reply to a status request - virtual void prepareStatusResponse(const ReplicationExecutor::CallbackData& data, - Date_t now, - unsigned uptime, - const OpTime& lastOpApplied, - BSONObjBuilder* response, - Status* result) = 0; - - // produce a reply to an ismaster request. It is only valid to call this if we are a - // replset. - virtual void fillIsMasterForReplSet(IsMasterResponse* response) = 0; - - // produce a reply to a freeze request - virtual void prepareFreezeResponse(Date_t now, int secs, BSONObjBuilder* response) = 0; - - //////////////////////////////////////////////////////////// - // - // Methods for sending and receiving heartbeats, - // reconfiguring and handling the results of standing for - // election. 
- // - //////////////////////////////////////////////////////////// - - /** - * Updates the topology coordinator's notion of the replica set configuration. - * - * "newConfig" is the new configuration, and "selfIndex" is the index of this - * node's configuration information in "newConfig", or "selfIndex" is -1 to - * indicate that this node is not a member of "newConfig". - * - * newConfig.isInitialized() should be true, though implementations may accept - * configurations where this is not true, for testing purposes. - */ - virtual void updateConfig(const ReplicaSetConfig& newConfig, - int selfIndex, - Date_t now, - OpTime lastOpApplied) = 0; - - /** - * Prepares a heartbeat request appropriate for sending to "target", assuming the - * current time is "now". "ourSetName" is used as the name for our replica set if - * the topology coordinator does not have a valid configuration installed. - * - * The returned pair contains proper arguments for a replSetHeartbeat command, and - * an amount of time to wait for the response. - * - * This call should be paired (with intervening network communication) with a call to - * processHeartbeatResponse for the same "target". - */ - virtual std::pair<ReplSetHeartbeatArgs, Milliseconds> prepareHeartbeatRequest( - Date_t now, - const std::string& ourSetName, - const HostAndPort& target) = 0; - - /** - * Processes a heartbeat response from "target" that arrived around "now", having - * spent "networkRoundTripTime" millis on the network. - * - * Updates internal topology coordinator state, and returns instructions about what action - * to take next. - * - * If the next action indicates StartElection, the topology coordinator has transitioned to - * the "candidate" role, and will remain there until processWinElection or - * processLoseElection are called. - * - * If the next action indicates "StepDownSelf", the topology coordinator has transitioned - * to the "follower" role from "leader", and the caller should take any necessary actions - * to become a follower. - * - * If the next action indicates "StepDownRemotePrimary", the caller should take steps to - * cause the specified remote host to step down from primary to secondary. - * - * If the next action indicates "Reconfig", the caller should verify the configuration in - * hbResponse is acceptable, perform any other reconfiguration actions it must, and call - * updateConfig with the new configuration and the appropriate value for "selfIndex". It - * must also wrap up any outstanding elections (by calling processLoseElection or - * processWinElection) before calling updateConfig. - * - * This call should be paired (with intervening network communication) with a call to - * prepareHeartbeatRequest for the same "target". - */ - virtual HeartbeatResponseAction processHeartbeatResponse( - Date_t now, - Milliseconds networkRoundTripTime, - const HostAndPort& target, - const StatusWith<ReplSetHeartbeatResponse>& hbResponse, - OpTime myLastOpApplied) = 0; - - /** - * If getRole() == Role::candidate and this node has not voted too recently, updates the - * lastVote tracker and returns true. Otherwise, returns false. - */ - virtual bool voteForMyself(Date_t now) = 0; - - /** - * Performs state updates associated with winning an election. - * - * It is an error to call this if the topology coordinator is not in candidate mode. - * - * Exactly one of either processWinElection or processLoseElection must be called if - * processHeartbeatResponse returns StartElection, to exit candidate mode. 
- */ - virtual void processWinElection(OID electionId, OpTime electionOpTime) = 0; - - /** - * Performs state updates associated with losing an election. - * - * It is an error to call this if the topology coordinator is not in candidate mode. - * - * Exactly one of either processWinElection or processLoseElection must be called if - * processHeartbeatResponse returns StartElection, to exit candidate mode. - */ - virtual void processLoseElection() = 0; - - /** - * Tries to transition the coordinator from the leader role to the follower role. - * - * Fails if "force" is not set and no follower is known to be up. It is illegal - * to call this method if the node is not leader. - * - * Returns whether or not the step down succeeded. - */ - virtual bool stepDown(Date_t until, bool force, OpTime lastOpApplied) = 0; - - /** - * Sometimes a request to step down comes in (like via a heartbeat), but we don't have the - * global exclusive lock so we can't actually stepdown at that moment. When that happens - * we record that a stepdown request is pending and schedule work to stepdown in the global - * lock. This method is called after holding the global lock to perform the actual - * stepdown, but only if the node hasn't already stepped down another way since the work was - * scheduled. Returns true if it actually steps down, and false otherwise. - */ - virtual bool stepDownIfPending() = 0; - - /** - * Considers whether or not this node should stand for election, and returns true - * if the node has transitioned to candidate role as a result of the call. - */ - virtual bool checkShouldStandForElection(Date_t now, const OpTime& lastOpApplied) = 0; - - /** - * Set the outgoing heartbeat message from self - */ - virtual void setMyHeartbeatMessage(const Date_t now, const std::string& s) = 0; - - /** - * Writes into 'output' all the information needed to generate a summary of the current - * replication state for use by the web interface. - */ - virtual void summarizeAsHtml(ReplSetHtmlSummary* output) = 0; - - protected: - TopologyCoordinator() {} - }; + virtual void setFollowerMode(MemberState::MS newMode) = 0; /** - * Type that denotes the role of a node in the replication protocol. + * Adjusts the maintenance mode count by "inc". * - * The role is distinct from MemberState, in that it only deals with the - * roles a node plays in the basic protocol -- leader, follower and candidate. - * The mapping between MemberState and Role is complex -- several MemberStates - * map to the follower role, and MemberState::RS_SECONDARY maps to either - * follower or candidate roles, e.g. + * It is an error to call this method if getRole() does not return Role::follower. + * It is an error to allow the maintenance count to go negative. */ - class TopologyCoordinator::Role { - public: - /** - * Constant indicating leader role. - */ - static const Role leader; + virtual void adjustMaintenanceCountBy(int inc) = 0; + + //////////////////////////////////////////////////////////// + // + // Methods that prepare responses to command requests. 
+ // + //////////////////////////////////////////////////////////// + + // produces a reply to a replSetSyncFrom command + virtual void prepareSyncFromResponse(const ReplicationExecutor::CallbackData& data, + const HostAndPort& target, + const OpTime& lastOpApplied, + BSONObjBuilder* response, + Status* result) = 0; + + // produce a reply to a replSetFresh command + virtual void prepareFreshResponse(const ReplicationCoordinator::ReplSetFreshArgs& args, + Date_t now, + OpTime lastOpApplied, + BSONObjBuilder* response, + Status* result) = 0; + + // produce a reply to a received electCmd + virtual void prepareElectResponse(const ReplicationCoordinator::ReplSetElectArgs& args, + Date_t now, + OpTime lastOpApplied, + BSONObjBuilder* response, + Status* result) = 0; + + // produce a reply to a heartbeat + virtual Status prepareHeartbeatResponse(Date_t now, + const ReplSetHeartbeatArgs& args, + const std::string& ourSetName, + const OpTime& lastOpApplied, + ReplSetHeartbeatResponse* response) = 0; + + // produce a reply to a status request + virtual void prepareStatusResponse(const ReplicationExecutor::CallbackData& data, + Date_t now, + unsigned uptime, + const OpTime& lastOpApplied, + BSONObjBuilder* response, + Status* result) = 0; + + // produce a reply to an ismaster request. It is only valid to call this if we are a + // replset. + virtual void fillIsMasterForReplSet(IsMasterResponse* response) = 0; + + // produce a reply to a freeze request + virtual void prepareFreezeResponse(Date_t now, int secs, BSONObjBuilder* response) = 0; + + //////////////////////////////////////////////////////////// + // + // Methods for sending and receiving heartbeats, + // reconfiguring and handling the results of standing for + // election. + // + //////////////////////////////////////////////////////////// - /** - * Constant indicating follower role. - */ - static const Role follower; + /** + * Updates the topology coordinator's notion of the replica set configuration. + * + * "newConfig" is the new configuration, and "selfIndex" is the index of this + * node's configuration information in "newConfig", or "selfIndex" is -1 to + * indicate that this node is not a member of "newConfig". + * + * newConfig.isInitialized() should be true, though implementations may accept + * configurations where this is not true, for testing purposes. + */ + virtual void updateConfig(const ReplicaSetConfig& newConfig, + int selfIndex, + Date_t now, + OpTime lastOpApplied) = 0; - /** - * Constant indicating candidate role - */ - static const Role candidate; + /** + * Prepares a heartbeat request appropriate for sending to "target", assuming the + * current time is "now". "ourSetName" is used as the name for our replica set if + * the topology coordinator does not have a valid configuration installed. + * + * The returned pair contains proper arguments for a replSetHeartbeat command, and + * an amount of time to wait for the response. + * + * This call should be paired (with intervening network communication) with a call to + * processHeartbeatResponse for the same "target". + */ + virtual std::pair<ReplSetHeartbeatArgs, Milliseconds> prepareHeartbeatRequest( + Date_t now, const std::string& ourSetName, const HostAndPort& target) = 0; + + /** + * Processes a heartbeat response from "target" that arrived around "now", having + * spent "networkRoundTripTime" millis on the network. + * + * Updates internal topology coordinator state, and returns instructions about what action + * to take next. 
+ * + * If the next action indicates StartElection, the topology coordinator has transitioned to + * the "candidate" role, and will remain there until processWinElection or + * processLoseElection are called. + * + * If the next action indicates "StepDownSelf", the topology coordinator has transitioned + * to the "follower" role from "leader", and the caller should take any necessary actions + * to become a follower. + * + * If the next action indicates "StepDownRemotePrimary", the caller should take steps to + * cause the specified remote host to step down from primary to secondary. + * + * If the next action indicates "Reconfig", the caller should verify the configuration in + * hbResponse is acceptable, perform any other reconfiguration actions it must, and call + * updateConfig with the new configuration and the appropriate value for "selfIndex". It + * must also wrap up any outstanding elections (by calling processLoseElection or + * processWinElection) before calling updateConfig. + * + * This call should be paired (with intervening network communication) with a call to + * prepareHeartbeatRequest for the same "target". + */ + virtual HeartbeatResponseAction processHeartbeatResponse( + Date_t now, + Milliseconds networkRoundTripTime, + const HostAndPort& target, + const StatusWith<ReplSetHeartbeatResponse>& hbResponse, + OpTime myLastOpApplied) = 0; + + /** + * If getRole() == Role::candidate and this node has not voted too recently, updates the + * lastVote tracker and returns true. Otherwise, returns false. + */ + virtual bool voteForMyself(Date_t now) = 0; + + /** + * Performs state updates associated with winning an election. + * + * It is an error to call this if the topology coordinator is not in candidate mode. + * + * Exactly one of either processWinElection or processLoseElection must be called if + * processHeartbeatResponse returns StartElection, to exit candidate mode. + */ + virtual void processWinElection(OID electionId, OpTime electionOpTime) = 0; + + /** + * Performs state updates associated with losing an election. + * + * It is an error to call this if the topology coordinator is not in candidate mode. + * + * Exactly one of either processWinElection or processLoseElection must be called if + * processHeartbeatResponse returns StartElection, to exit candidate mode. + */ + virtual void processLoseElection() = 0; + + /** + * Tries to transition the coordinator from the leader role to the follower role. + * + * Fails if "force" is not set and no follower is known to be up. It is illegal + * to call this method if the node is not leader. + * + * Returns whether or not the step down succeeded. + */ + virtual bool stepDown(Date_t until, bool force, OpTime lastOpApplied) = 0; + + /** + * Sometimes a request to step down comes in (like via a heartbeat), but we don't have the + * global exclusive lock so we can't actually stepdown at that moment. When that happens + * we record that a stepdown request is pending and schedule work to stepdown in the global + * lock. This method is called after holding the global lock to perform the actual + * stepdown, but only if the node hasn't already stepped down another way since the work was + * scheduled. Returns true if it actually steps down, and false otherwise. + */ + virtual bool stepDownIfPending() = 0; + + /** + * Considers whether or not this node should stand for election, and returns true + * if the node has transitioned to candidate role as a result of the call. 
+ */ + virtual bool checkShouldStandForElection(Date_t now, const OpTime& lastOpApplied) = 0; + + /** + * Set the outgoing heartbeat message from self + */ + virtual void setMyHeartbeatMessage(const Date_t now, const std::string& s) = 0; + + /** + * Writes into 'output' all the information needed to generate a summary of the current + * replication state for use by the web interface. + */ + virtual void summarizeAsHtml(ReplSetHtmlSummary* output) = 0; + +protected: + TopologyCoordinator() {} +}; + +/** + * Type that denotes the role of a node in the replication protocol. + * + * The role is distinct from MemberState, in that it only deals with the + * roles a node plays in the basic protocol -- leader, follower and candidate. + * The mapping between MemberState and Role is complex -- several MemberStates + * map to the follower role, and MemberState::RS_SECONDARY maps to either + * follower or candidate roles, e.g. + */ +class TopologyCoordinator::Role { +public: + /** + * Constant indicating leader role. + */ + static const Role leader; + + /** + * Constant indicating follower role. + */ + static const Role follower; + + /** + * Constant indicating candidate role + */ + static const Role candidate; - Role() {} + Role() {} - bool operator==(Role other) const { return _value == other._value; } - bool operator!=(Role other) const { return _value != other._value; } + bool operator==(Role other) const { + return _value == other._value; + } + bool operator!=(Role other) const { + return _value != other._value; + } - std::string toString() const; + std::string toString() const; - private: - explicit Role(int value); +private: + explicit Role(int value); - int _value; - }; + int _value; +}; -} // namespace repl -} // namespace mongo +} // namespace repl +} // namespace mongo diff --git a/src/mongo/db/repl/topology_coordinator_impl.cpp b/src/mongo/db/repl/topology_coordinator_impl.cpp index 9cb5701faef..8d78463fe6c 100644 --- a/src/mongo/db/repl/topology_coordinator_impl.cpp +++ b/src/mongo/db/repl/topology_coordinator_impl.cpp @@ -52,1200 +52,1136 @@ namespace mongo { namespace repl { - using std::vector; +using std::vector; - const Seconds TopologyCoordinatorImpl::LastVote::leaseTime = Seconds(30); +const Seconds TopologyCoordinatorImpl::LastVote::leaseTime = Seconds(30); namespace { - template <typename T> - int indexOfIterator(const std::vector<T>& vec, - typename std::vector<T>::const_iterator& it) { - return static_cast<int>(it - vec.begin()); - } - - // Interval between the time the last heartbeat from a node was received successfully, or - // the time when we gave up retrying, and when the next heartbeat should be sent to a target. - const Milliseconds kHeartbeatInterval(Seconds(2).total_milliseconds()); +template <typename T> +int indexOfIterator(const std::vector<T>& vec, typename std::vector<T>::const_iterator& it) { + return static_cast<int>(it - vec.begin()); +} - // Maximum number of retries for a failed heartbeat. - const int kMaxHeartbeatRetries = 2; +// Interval between the time the last heartbeat from a node was received successfully, or +// the time when we gave up retrying, and when the next heartbeat should be sent to a target. +const Milliseconds kHeartbeatInterval(Seconds(2).total_milliseconds()); - /** - * Returns true if the only up heartbeats are auth errors. 
- */ - bool _hasOnlyAuthErrorUpHeartbeats(const std::vector<MemberHeartbeatData>& hbdata, - const int selfIndex) { - bool foundAuthError = false; - for (std::vector<MemberHeartbeatData>::const_iterator it = hbdata.begin(); - it != hbdata.end(); - ++it) { - if (indexOfIterator(hbdata, it) == selfIndex) { - continue; - } +// Maximum number of retries for a failed heartbeat. +const int kMaxHeartbeatRetries = 2; - if (it->up()) { - return false; - } +/** + * Returns true if the only up heartbeats are auth errors. + */ +bool _hasOnlyAuthErrorUpHeartbeats(const std::vector<MemberHeartbeatData>& hbdata, + const int selfIndex) { + bool foundAuthError = false; + for (std::vector<MemberHeartbeatData>::const_iterator it = hbdata.begin(); it != hbdata.end(); + ++it) { + if (indexOfIterator(hbdata, it) == selfIndex) { + continue; + } - if (it->hasAuthIssue()) { - foundAuthError = true; - } + if (it->up()) { + return false; } - return foundAuthError; + if (it->hasAuthIssue()) { + foundAuthError = true; + } } + return foundAuthError; +} + } // namespace - PingStats::PingStats() : - count(0), - value(std::numeric_limits<unsigned int>::max()), - _lastHeartbeatStartDate(0), - _numFailuresSinceLastStart(std::numeric_limits<int>::max()) { +PingStats::PingStats() + : count(0), + value(std::numeric_limits<unsigned int>::max()), + _lastHeartbeatStartDate(0), + _numFailuresSinceLastStart(std::numeric_limits<int>::max()) {} + +void PingStats::start(Date_t now) { + _lastHeartbeatStartDate = now; + _numFailuresSinceLastStart = 0; +} + +void PingStats::hit(int millis) { + _numFailuresSinceLastStart = std::numeric_limits<int>::max(); + ++count; + value = value == std::numeric_limits<unsigned int>::max() + ? millis + : static_cast<unsigned long>((value * .8) + (millis * .2)); +} + +void PingStats::miss() { + ++_numFailuresSinceLastStart; +} + +TopologyCoordinatorImpl::TopologyCoordinatorImpl(Seconds maxSyncSourceLagSecs) + : _role(Role::follower), + _currentPrimaryIndex(-1), + _forceSyncSourceIndex(-1), + _maxSyncSourceLagSecs(maxSyncSourceLagSecs), + _selfIndex(-1), + _stepDownPending(false), + _stepDownUntil(0), + _electionSleepUntil(0), + _maintenanceModeCalls(0), + _followerMode(MemberState::RS_STARTUP2) { + invariant(getMemberState() == MemberState::RS_STARTUP); +} + +TopologyCoordinator::Role TopologyCoordinatorImpl::getRole() const { + return _role; +} + +void TopologyCoordinatorImpl::setForceSyncSourceIndex(int index) { + invariant(_forceSyncSourceIndex < _rsConfig.getNumMembers()); + _forceSyncSourceIndex = index; +} + +HostAndPort TopologyCoordinatorImpl::getSyncSourceAddress() const { + return _syncSource; +} + +HostAndPort TopologyCoordinatorImpl::chooseNewSyncSource(Date_t now, const OpTime& lastOpApplied) { + // If we are primary, then we aren't syncing from anyone (else). + if (_iAmPrimary()) { + return HostAndPort(); + } + + // If we are not a member of the current replica set configuration, no sync source is valid. 
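
An aside on PingStats::hit above: heartbeat round-trip times are smoothed with an exponential moving average that weights the running value at 0.8 and the newest sample at 0.2, so a single slow ping cannot immediately change which sync source looks closest. A minimal standalone sketch of that smoothing (the PingEma name is hypothetical, not part of this diff):

#include <cstdio>
#include <initializer_list>
#include <limits>

// Hypothetical stand-in for the smoothing inside PingStats::hit.
struct PingEma {
    unsigned int value = std::numeric_limits<unsigned int>::max();  // sentinel: no sample yet

    void hit(int millis) {
        // The first sample seeds the average; later samples blend 80/20.
        value = (value == std::numeric_limits<unsigned int>::max())
            ? static_cast<unsigned int>(millis)
            : static_cast<unsigned int>(value * 0.8 + millis * 0.2);
    }
};

int main() {
    PingEma ema;
    for (int rtt : {100, 100, 500, 100}) {
        ema.hit(rtt);
        std::printf("smoothed ping: %u ms\n", ema.value);
    }
    // The single 500 ms spike only moves the estimate to 180 ms.
    return 0;
}
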
+ if (_selfIndex == -1) { + LOG(2) << "Cannot sync from any members because we are not in the replica set config"; + return HostAndPort(); + } + + // if we have a target we've requested to sync from, use it + if (_forceSyncSourceIndex != -1) { + invariant(_forceSyncSourceIndex < _rsConfig.getNumMembers()); + _syncSource = _rsConfig.getMemberAt(_forceSyncSourceIndex).getHostAndPort(); + _forceSyncSourceIndex = -1; + std::string msg(str::stream() << "syncing from: " << _syncSource.toString() + << " by request"); + log() << msg << rsLog; + setMyHeartbeatMessage(now, msg); + return _syncSource; } - void PingStats::start(Date_t now) { - _lastHeartbeatStartDate = now; - _numFailuresSinceLastStart = 0; - } + // wait for 2N pings (not counting ourselves) before choosing a sync target + int needMorePings = (_hbdata.size() - 1) * 2 - _getTotalPings(); - void PingStats::hit(int millis) { - _numFailuresSinceLastStart = std::numeric_limits<int>::max(); - ++count; - value = value == std::numeric_limits<unsigned int>::max() ? millis : - static_cast<unsigned long>((value * .8) + (millis * .2)); - } - - void PingStats::miss() { - ++_numFailuresSinceLastStart; + if (needMorePings > 0) { + OCCASIONALLY log() << "waiting for " << needMorePings + << " pings from other members before syncing"; + _syncSource = HostAndPort(); + return _syncSource; } - TopologyCoordinatorImpl::TopologyCoordinatorImpl(Seconds maxSyncSourceLagSecs) : - _role(Role::follower), - _currentPrimaryIndex(-1), - _forceSyncSourceIndex(-1), - _maxSyncSourceLagSecs(maxSyncSourceLagSecs), - _selfIndex(-1), - _stepDownPending(false), - _stepDownUntil(0), - _electionSleepUntil(0), - _maintenanceModeCalls(0), - _followerMode(MemberState::RS_STARTUP2) - { - invariant(getMemberState() == MemberState::RS_STARTUP); + // If we are only allowed to sync from the primary, set that + if (!_rsConfig.isChainingAllowed()) { + if (_currentPrimaryIndex == -1) { + LOG(1) << "Cannot select sync source because chaining is" + " not allowed and primary is unknown/down"; + _syncSource = HostAndPort(); + return _syncSource; + } else if (_memberIsBlacklisted(*_currentPrimaryMember(), now)) { + LOG(1) << "Cannot select sync source because chaining is" + "not allowed and primary is not currently accepting our updates"; + _syncSource = HostAndPort(); + return _syncSource; + } else { + _syncSource = _rsConfig.getMemberAt(_currentPrimaryIndex).getHostAndPort(); + std::string msg(str::stream() << "syncing from primary: " << _syncSource.toString()); + log() << msg << rsLog; + setMyHeartbeatMessage(now, msg); + return _syncSource; + } } - TopologyCoordinator::Role TopologyCoordinatorImpl::getRole() const { - return _role; - } + // find the member with the lowest ping time that is ahead of me - void TopologyCoordinatorImpl::setForceSyncSourceIndex(int index) { - invariant(_forceSyncSourceIndex < _rsConfig.getNumMembers()); - _forceSyncSourceIndex = index; + // Find primary's oplog time. Reject sync candidates that are more than + // maxSyncSourceLagSecs seconds behind. 
+ OpTime primaryOpTime; + if (_currentPrimaryIndex != -1) { + primaryOpTime = _hbdata[_currentPrimaryIndex].getOpTime(); + } else { + // choose a time that will exclude no candidates, since we don't see a primary + primaryOpTime = OpTime(_maxSyncSourceLagSecs.total_seconds(), 0); } - HostAndPort TopologyCoordinatorImpl::getSyncSourceAddress() const { - return _syncSource; + if (primaryOpTime.getSecs() < + static_cast<unsigned int>(_maxSyncSourceLagSecs.total_seconds())) { + // erh - I think this means there was just a new election + // and we don't yet know the new primary's optime + primaryOpTime = OpTime(_maxSyncSourceLagSecs.total_seconds(), 0); } - HostAndPort TopologyCoordinatorImpl::chooseNewSyncSource(Date_t now, - const OpTime& lastOpApplied) { - // If we are primary, then we aren't syncing from anyone (else). - if (_iAmPrimary()) { - return HostAndPort(); - } + OpTime oldestSyncOpTime(primaryOpTime.getSecs() - _maxSyncSourceLagSecs.total_seconds(), 0); - // If we are not a member of the current replica set configuration, no sync source is valid. - if (_selfIndex == -1) { - LOG(2) << "Cannot sync from any members because we are not in the replica set config"; - return HostAndPort(); - } + int closestIndex = -1; - // if we have a target we've requested to sync from, use it - if (_forceSyncSourceIndex != -1) { - invariant(_forceSyncSourceIndex < _rsConfig.getNumMembers()); - _syncSource = _rsConfig.getMemberAt(_forceSyncSourceIndex).getHostAndPort(); - _forceSyncSourceIndex = -1; - std::string msg(str::stream() << "syncing from: " - << _syncSource.toString() << " by request"); - log() << msg << rsLog; - setMyHeartbeatMessage(now, msg); - return _syncSource; - } - - // wait for 2N pings (not counting ourselves) before choosing a sync target - int needMorePings = (_hbdata.size() - 1) * 2 - _getTotalPings(); - - if (needMorePings > 0) { - OCCASIONALLY log() << "waiting for " << needMorePings - << " pings from other members before syncing"; - _syncSource = HostAndPort(); - return _syncSource; - } - - // If we are only allowed to sync from the primary, set that - if (!_rsConfig.isChainingAllowed()) { - if (_currentPrimaryIndex == -1) { - LOG(1) << "Cannot select sync source because chaining is" - " not allowed and primary is unknown/down"; - _syncSource = HostAndPort(); - return _syncSource; + // Make two attempts. The first attempt, we ignore those nodes with + // slave delay higher than our own, hidden nodes, and nodes that are excessively lagged. + // The second attempt includes such nodes, in case those are the only ones we can reach. + // This loop attempts to set 'closestIndex'. + for (int attempts = 0; attempts < 2; ++attempts) { + for (std::vector<MemberHeartbeatData>::const_iterator it = _hbdata.begin(); + it != _hbdata.end(); + ++it) { + const int itIndex = indexOfIterator(_hbdata, it); + // Don't consider ourselves. + if (itIndex == _selfIndex) { + continue; } - else if (_memberIsBlacklisted(*_currentPrimaryMember(), now)) { - LOG(1) << "Cannot select sync source because chaining is" - "not allowed and primary is not currently accepting our updates"; - _syncSource = HostAndPort(); - return _syncSource; + // Candidate must be up to be considered. 
+ if (!it->up()) { + continue; } - else { - _syncSource = _rsConfig.getMemberAt(_currentPrimaryIndex).getHostAndPort(); - std::string msg(str::stream() << "syncing from primary: " - << _syncSource.toString()); - log() << msg << rsLog; - setMyHeartbeatMessage(now, msg); - return _syncSource; + // Candidate must be PRIMARY or SECONDARY state to be considered. + if (!it->getState().readable()) { + continue; } - } - - // find the member with the lowest ping time that is ahead of me - // Find primary's oplog time. Reject sync candidates that are more than - // maxSyncSourceLagSecs seconds behind. - OpTime primaryOpTime; - if (_currentPrimaryIndex != -1) { - primaryOpTime = _hbdata[_currentPrimaryIndex].getOpTime(); - } - else { - // choose a time that will exclude no candidates, since we don't see a primary - primaryOpTime = OpTime(_maxSyncSourceLagSecs.total_seconds(), 0); - } - - if (primaryOpTime.getSecs() < - static_cast<unsigned int>(_maxSyncSourceLagSecs.total_seconds())) { - // erh - I think this means there was just a new election - // and we don't yet know the new primary's optime - primaryOpTime = OpTime(_maxSyncSourceLagSecs.total_seconds(), 0); - } - - OpTime oldestSyncOpTime(primaryOpTime.getSecs() - _maxSyncSourceLagSecs.total_seconds(), 0); - - int closestIndex = -1; - - // Make two attempts. The first attempt, we ignore those nodes with - // slave delay higher than our own, hidden nodes, and nodes that are excessively lagged. - // The second attempt includes such nodes, in case those are the only ones we can reach. - // This loop attempts to set 'closestIndex'. - for (int attempts = 0; attempts < 2; ++attempts) { - for (std::vector<MemberHeartbeatData>::const_iterator it = _hbdata.begin(); - it != _hbdata.end(); - ++it) { - const int itIndex = indexOfIterator(_hbdata, it); - // Don't consider ourselves. - if (itIndex == _selfIndex) { - continue; - } - // Candidate must be up to be considered. - if (!it->up()) { - continue; - } - // Candidate must be PRIMARY or SECONDARY state to be considered. - if (!it->getState().readable()) { - continue; - } - - const MemberConfig& itMemberConfig(_rsConfig.getMemberAt(itIndex)); - - // Candidate must build indexes if we build indexes, to be considered. - if (_selfConfig().shouldBuildIndexes()) { - if (!itMemberConfig.shouldBuildIndexes()) { - continue; - } - } + const MemberConfig& itMemberConfig(_rsConfig.getMemberAt(itIndex)); - // only consider candidates that are ahead of where we are - if (it->getOpTime() <= lastOpApplied) { + // Candidate must build indexes if we build indexes, to be considered. + if (_selfConfig().shouldBuildIndexes()) { + if (!itMemberConfig.shouldBuildIndexes()) { continue; } + } - // omit candidates that are excessively behind, on the first attempt at least. - if (attempts == 0 && - it->getOpTime() < oldestSyncOpTime) { - continue; - } + // only consider candidates that are ahead of where we are + if (it->getOpTime() <= lastOpApplied) { + continue; + } - // omit nodes that are more latent than anything we've already considered - if ((closestIndex != -1) && - (_getPing(itMemberConfig.getHostAndPort()) - > _getPing(_rsConfig.getMemberAt(closestIndex).getHostAndPort()))) { - continue; - } + // omit candidates that are excessively behind, on the first attempt at least. 
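
The surrounding loop makes two passes over the heartbeat data: attempt 0 applies the strict filters (skip hidden members, members with a larger slave delay, and members lagged beyond the cutoff), and attempt 1 relaxes them in case such nodes are the only reachable ones. A reduced sketch of the two-pass pattern (hypothetical Candidate type; the real code additionally prefers the candidate with the lowest ping):

#include <cstdio>
#include <string>
#include <vector>

// Hypothetical, reduced model of a sync-source candidate.
struct Candidate {
    std::string host;
    bool hidden;
    int lagSecs;
};

// Attempt 0 enforces the strict filters; attempt 1 drops them so that a
// hidden or lagged member can still be chosen when nothing else qualifies.
const Candidate* choose(const std::vector<Candidate>& cands, int maxLagSecs) {
    for (int attempts = 0; attempts < 2; ++attempts) {
        for (const Candidate& c : cands) {
            if (attempts == 0 && (c.hidden || c.lagSecs > maxLagSecs)) {
                continue;  // skip this one in the first attempt
            }
            return &c;  // first acceptable candidate wins in this sketch
        }
    }
    return nullptr;  // no candidate at all
}

int main() {
    std::vector<Candidate> members = {{"db1.example.net:27017", true, 5},
                                      {"db2.example.net:27017", false, 90}};
    if (const Candidate* c = choose(members, 30)) {
        std::printf("syncing from: %s\n", c->host.c_str());  // falls back to the hidden member
    }
    return 0;
}
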
+ if (attempts == 0 && it->getOpTime() < oldestSyncOpTime) { + continue; + } - if (attempts == 0) { - if (_selfConfig().getSlaveDelay() < itMemberConfig.getSlaveDelay() - || itMemberConfig.isHidden()) { - continue; // skip this one in the first attempt - } - } + // omit nodes that are more latent than anything we've already considered + if ((closestIndex != -1) && + (_getPing(itMemberConfig.getHostAndPort()) > + _getPing(_rsConfig.getMemberAt(closestIndex).getHostAndPort()))) { + continue; + } - if (_memberIsBlacklisted(itMemberConfig, now)) { - continue; + if (attempts == 0) { + if (_selfConfig().getSlaveDelay() < itMemberConfig.getSlaveDelay() || + itMemberConfig.isHidden()) { + continue; // skip this one in the first attempt } - - // This candidate has passed all tests; set 'closestIndex' - closestIndex = itIndex; } - if (closestIndex != -1) break; // no need for second attempt - } - if (closestIndex == -1) { - // Did not find any members to sync from - std::string msg("could not find member to sync from"); - // Only log when we had a valid sync source before - if (!_syncSource.empty()) { - log() << msg << rsLog; + if (_memberIsBlacklisted(itMemberConfig, now)) { + continue; } - setMyHeartbeatMessage(now, msg); - _syncSource = HostAndPort(); - return _syncSource; + // This candidate has passed all tests; set 'closestIndex' + closestIndex = itIndex; } - _syncSource = _rsConfig.getMemberAt(closestIndex).getHostAndPort(); - std::string msg(str::stream() << "syncing from: " << _syncSource.toString(), 0); - log() << msg << rsLog; - setMyHeartbeatMessage(now, msg); - return _syncSource; + if (closestIndex != -1) + break; // no need for second attempt } - bool TopologyCoordinatorImpl::_memberIsBlacklisted(const MemberConfig& memberConfig, - Date_t now) const { - std::map<HostAndPort,Date_t>::const_iterator blacklisted = - _syncSourceBlacklist.find(memberConfig.getHostAndPort()); - if (blacklisted != _syncSourceBlacklist.end()) { - if (blacklisted->second > now) { - return true; - } + if (closestIndex == -1) { + // Did not find any members to sync from + std::string msg("could not find member to sync from"); + // Only log when we had a valid sync source before + if (!_syncSource.empty()) { + log() << msg << rsLog; } - return false; - } + setMyHeartbeatMessage(now, msg); - void TopologyCoordinatorImpl::blacklistSyncSource(const HostAndPort& host, Date_t until) { - LOG(2) << "blacklisting " << host << " until " << until.toString(); - _syncSourceBlacklist[host] = until; + _syncSource = HostAndPort(); + return _syncSource; } - - void TopologyCoordinatorImpl::unblacklistSyncSource(const HostAndPort& host, Date_t now) { - std::map<HostAndPort, Date_t>::iterator hostItr = _syncSourceBlacklist.find(host); - if (hostItr != _syncSourceBlacklist.end() && now >= hostItr->second) { - LOG(2) << "unblacklisting " << host; - _syncSourceBlacklist.erase(hostItr); + _syncSource = _rsConfig.getMemberAt(closestIndex).getHostAndPort(); + std::string msg(str::stream() << "syncing from: " << _syncSource.toString(), 0); + log() << msg << rsLog; + setMyHeartbeatMessage(now, msg); + return _syncSource; +} + +bool TopologyCoordinatorImpl::_memberIsBlacklisted(const MemberConfig& memberConfig, + Date_t now) const { + std::map<HostAndPort, Date_t>::const_iterator blacklisted = + _syncSourceBlacklist.find(memberConfig.getHostAndPort()); + if (blacklisted != _syncSourceBlacklist.end()) { + if (blacklisted->second > now) { + return true; } } + return false; +} - void TopologyCoordinatorImpl::clearSyncSourceBlacklist() { - 
_syncSourceBlacklist.clear(); - } - - void TopologyCoordinatorImpl::prepareSyncFromResponse( - const ReplicationExecutor::CallbackData& data, - const HostAndPort& target, - const OpTime& lastOpApplied, - BSONObjBuilder* response, - Status* result) { - if (data.status == ErrorCodes::CallbackCanceled) { - *result = Status(ErrorCodes::ShutdownInProgress, "replication system is shutting down"); - return; - } - - response->append("syncFromRequested", target.toString()); - - if (_selfIndex == -1) { - *result = Status(ErrorCodes::NotSecondary, - "Removed and uninitialized nodes do not sync"); - return; - } +void TopologyCoordinatorImpl::blacklistSyncSource(const HostAndPort& host, Date_t until) { + LOG(2) << "blacklisting " << host << " until " << until.toString(); + _syncSourceBlacklist[host] = until; +} - const MemberConfig& selfConfig = _selfConfig(); - if (selfConfig.isArbiter()) { - *result = Status(ErrorCodes::NotSecondary, "arbiters don't sync"); - return; - } - if (_selfIndex == _currentPrimaryIndex) { - *result = Status(ErrorCodes::NotSecondary, "primaries don't sync"); - return; - } - - ReplicaSetConfig::MemberIterator targetConfig = _rsConfig.membersEnd(); - int targetIndex = 0; - for (ReplicaSetConfig::MemberIterator it = _rsConfig.membersBegin(); - it != _rsConfig.membersEnd(); ++it) { - if (it->getHostAndPort() == target) { - targetConfig = it; - break; - } - ++targetIndex; - } - if (targetConfig == _rsConfig.membersEnd()) { - *result = Status(ErrorCodes::NodeNotFound, - str::stream() << "Could not find member \"" << target.toString() << - "\" in replica set"); - return; - } - if (targetIndex == _selfIndex) { - *result = Status(ErrorCodes::InvalidOptions, "I cannot sync from myself"); - return; - } - if (targetConfig->isArbiter()) { - *result = Status(ErrorCodes::InvalidOptions, - str::stream() << "Cannot sync from \"" << target.toString() << - "\" because it is an arbiter"); - return; - } - if (!targetConfig->shouldBuildIndexes() && selfConfig.shouldBuildIndexes()) { - *result = Status(ErrorCodes::InvalidOptions, - str::stream() << "Cannot sync from \"" << target.toString() << - "\" because it does not build indexes"); - return; - } - - const MemberHeartbeatData& hbdata = _hbdata[targetIndex]; - if (hbdata.hasAuthIssue()) { - *result = Status(ErrorCodes::Unauthorized, - str::stream() << "not authorized to communicate with " << - target.toString()); - return; - } - if (hbdata.getHealth() == 0) { - *result = Status(ErrorCodes::HostUnreachable, - str::stream() << "I cannot reach the requested member: " << - target.toString()); - return; - } - if (hbdata.getOpTime().getSecs()+10 < lastOpApplied.getSecs()) { - warning() << "attempting to sync from " << target - << ", but its latest opTime is " << hbdata.getOpTime().getSecs() - << " and ours is " << lastOpApplied.getSecs() << " so this may not work"; - response->append("warning", - str::stream() << "requested member \"" << target.toString() << - "\" is more than 10 seconds behind us"); - // not returning bad Status, just warning - } +void TopologyCoordinatorImpl::unblacklistSyncSource(const HostAndPort& host, Date_t now) { + std::map<HostAndPort, Date_t>::iterator hostItr = _syncSourceBlacklist.find(host); + if (hostItr != _syncSourceBlacklist.end() && now >= hostItr->second) { + LOG(2) << "unblacklisting " << host; + _syncSourceBlacklist.erase(hostItr); + } +} - HostAndPort prevSyncSource = getSyncSourceAddress(); - if (!prevSyncSource.empty()) { - response->append("prevSyncTarget", prevSyncSource.toString()); - } +void 
TopologyCoordinatorImpl::clearSyncSourceBlacklist() { + _syncSourceBlacklist.clear(); +} - setForceSyncSourceIndex(targetIndex); - *result = Status::OK(); +void TopologyCoordinatorImpl::prepareSyncFromResponse(const ReplicationExecutor::CallbackData& data, + const HostAndPort& target, + const OpTime& lastOpApplied, + BSONObjBuilder* response, + Status* result) { + if (data.status == ErrorCodes::CallbackCanceled) { + *result = Status(ErrorCodes::ShutdownInProgress, "replication system is shutting down"); + return; } - void TopologyCoordinatorImpl::prepareFreshResponse( - const ReplicationCoordinator::ReplSetFreshArgs& args, - const Date_t now, - const OpTime lastOpApplied, - BSONObjBuilder* response, - Status* result) { - - if (_selfIndex == -1) { - *result = Status(ErrorCodes::ReplicaSetNotFound, - "Cannot participate in elections because not initialized"); - return; - } + response->append("syncFromRequested", target.toString()); - if (args.setName != _rsConfig.getReplSetName()) { - *result = Status(ErrorCodes::ReplicaSetNotFound, - str::stream() << "Wrong repl set name. Expected: " << - _rsConfig.getReplSetName() << - ", received: " << args.setName); - return; - } + if (_selfIndex == -1) { + *result = Status(ErrorCodes::NotSecondary, "Removed and uninitialized nodes do not sync"); + return; + } - if (args.id == static_cast<unsigned>(_selfConfig().getId())) { - *result = Status(ErrorCodes::BadValue, - str::stream() << "Received replSetFresh command from member with the " - "same member ID as ourself: " << args.id); - return; - } + const MemberConfig& selfConfig = _selfConfig(); + if (selfConfig.isArbiter()) { + *result = Status(ErrorCodes::NotSecondary, "arbiters don't sync"); + return; + } + if (_selfIndex == _currentPrimaryIndex) { + *result = Status(ErrorCodes::NotSecondary, "primaries don't sync"); + return; + } - bool weAreFresher = false; - if( _rsConfig.getConfigVersion() > args.cfgver ) { - log() << "replSet member " << args.who << " is not yet aware its cfg version " - << args.cfgver << " is stale"; - response->append("info", "config version stale"); - weAreFresher = true; - } - // check not only our own optime, but any other member we can reach - else if (args.opTime < _latestKnownOpTime(lastOpApplied)) { - weAreFresher = true; - } - response->appendDate("opTime", lastOpApplied.asDate()); - response->append("fresher", weAreFresher); - - std::string errmsg; - bool doVeto = _shouldVetoMember(args, now, lastOpApplied, &errmsg); - response->append("veto", doVeto); - if (doVeto) { - response->append("errmsg", errmsg); + ReplicaSetConfig::MemberIterator targetConfig = _rsConfig.membersEnd(); + int targetIndex = 0; + for (ReplicaSetConfig::MemberIterator it = _rsConfig.membersBegin(); + it != _rsConfig.membersEnd(); + ++it) { + if (it->getHostAndPort() == target) { + targetConfig = it; + break; } - *result = Status::OK(); + ++targetIndex; + } + if (targetConfig == _rsConfig.membersEnd()) { + *result = Status(ErrorCodes::NodeNotFound, + str::stream() << "Could not find member \"" << target.toString() + << "\" in replica set"); + return; + } + if (targetIndex == _selfIndex) { + *result = Status(ErrorCodes::InvalidOptions, "I cannot sync from myself"); + return; + } + if (targetConfig->isArbiter()) { + *result = Status(ErrorCodes::InvalidOptions, + str::stream() << "Cannot sync from \"" << target.toString() + << "\" because it is an arbiter"); + return; + } + if (!targetConfig->shouldBuildIndexes() && selfConfig.shouldBuildIndexes()) { + *result = Status(ErrorCodes::InvalidOptions, + 
str::stream() << "Cannot sync from \"" << target.toString() + << "\" because it does not build indexes"); + return; + } + + const MemberHeartbeatData& hbdata = _hbdata[targetIndex]; + if (hbdata.hasAuthIssue()) { + *result = + Status(ErrorCodes::Unauthorized, + str::stream() << "not authorized to communicate with " << target.toString()); + return; + } + if (hbdata.getHealth() == 0) { + *result = + Status(ErrorCodes::HostUnreachable, + str::stream() << "I cannot reach the requested member: " << target.toString()); + return; + } + if (hbdata.getOpTime().getSecs() + 10 < lastOpApplied.getSecs()) { + warning() << "attempting to sync from " << target << ", but its latest opTime is " + << hbdata.getOpTime().getSecs() << " and ours is " << lastOpApplied.getSecs() + << " so this may not work"; + response->append("warning", + str::stream() << "requested member \"" << target.toString() + << "\" is more than 10 seconds behind us"); + // not returning bad Status, just warning + } + + HostAndPort prevSyncSource = getSyncSourceAddress(); + if (!prevSyncSource.empty()) { + response->append("prevSyncTarget", prevSyncSource.toString()); + } + + setForceSyncSourceIndex(targetIndex); + *result = Status::OK(); +} + +void TopologyCoordinatorImpl::prepareFreshResponse( + const ReplicationCoordinator::ReplSetFreshArgs& args, + const Date_t now, + const OpTime lastOpApplied, + BSONObjBuilder* response, + Status* result) { + if (_selfIndex == -1) { + *result = Status(ErrorCodes::ReplicaSetNotFound, + "Cannot participate in elections because not initialized"); + return; + } + + if (args.setName != _rsConfig.getReplSetName()) { + *result = + Status(ErrorCodes::ReplicaSetNotFound, + str::stream() << "Wrong repl set name. Expected: " << _rsConfig.getReplSetName() + << ", received: " << args.setName); + return; + } + + if (args.id == static_cast<unsigned>(_selfConfig().getId())) { + *result = Status(ErrorCodes::BadValue, + str::stream() << "Received replSetFresh command from member with the " + "same member ID as ourself: " << args.id); + return; + } + + bool weAreFresher = false; + if (_rsConfig.getConfigVersion() > args.cfgver) { + log() << "replSet member " << args.who << " is not yet aware its cfg version " + << args.cfgver << " is stale"; + response->append("info", "config version stale"); + weAreFresher = true; + } + // check not only our own optime, but any other member we can reach + else if (args.opTime < _latestKnownOpTime(lastOpApplied)) { + weAreFresher = true; + } + response->appendDate("opTime", lastOpApplied.asDate()); + response->append("fresher", weAreFresher); + + std::string errmsg; + bool doVeto = _shouldVetoMember(args, now, lastOpApplied, &errmsg); + response->append("veto", doVeto); + if (doVeto) { + response->append("errmsg", errmsg); + } + *result = Status::OK(); +} + +bool TopologyCoordinatorImpl::_shouldVetoMember( + const ReplicationCoordinator::ReplSetFreshArgs& args, + const Date_t& now, + const OpTime& lastOpApplied, + std::string* errmsg) const { + if (_rsConfig.getConfigVersion() < args.cfgver) { + // We are stale; do not veto. 
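
Both the freshness check here and the elect handler below follow the same config-version convention: if our version is lower than the caller's, we are the stale one and must not veto or vote no on that basis, while a caller with the lower version gets rejected outright. A hypothetical distillation (not code from this diff):

#include <cstdio>

// Hypothetical distillation of the config-version convention shared by the
// replSetFresh veto check and the replSetElect vote.
enum class FreshnessAction {
    kDeferWeAreStale,     // our config is older: do not veto / do not judge
    kRejectTheyAreStale,  // caller's config is older: veto (elect uses -10000)
    kCompareOpTimes       // same config version: decide on optimes instead
};

FreshnessAction classify(long long ourVersion, long long theirVersion) {
    if (ourVersion < theirVersion) {
        return FreshnessAction::kDeferWeAreStale;
    }
    if (ourVersion > theirVersion) {
        return FreshnessAction::kRejectTheyAreStale;
    }
    return FreshnessAction::kCompareOpTimes;
}

int main() {
    // Our version 7 vs. the candidate's 9: we are stale, so we defer.
    std::printf("%d\n", static_cast<int>(classify(7, 9)));  // prints 0
    return 0;
}
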
+ return false; } - bool TopologyCoordinatorImpl::_shouldVetoMember( - const ReplicationCoordinator::ReplSetFreshArgs& args, - const Date_t& now, - const OpTime& lastOpApplied, - std::string* errmsg) const { + const unsigned int memberID = args.id; + const int hopefulIndex = _getMemberIndex(memberID); + invariant(hopefulIndex != _selfIndex); + const int highestPriorityIndex = _getHighestPriorityElectableIndex(now, lastOpApplied); - if (_rsConfig.getConfigVersion() < args.cfgver) { - // We are stale; do not veto. - return false; - } - - const unsigned int memberID = args.id; - const int hopefulIndex = _getMemberIndex(memberID); - invariant(hopefulIndex != _selfIndex); - const int highestPriorityIndex = _getHighestPriorityElectableIndex(now, lastOpApplied); - - if (hopefulIndex == -1) { - *errmsg = str::stream() << "replSet couldn't find member with id " << memberID; - return true; - } + if (hopefulIndex == -1) { + *errmsg = str::stream() << "replSet couldn't find member with id " << memberID; + return true; + } - if (_iAmPrimary() && lastOpApplied >= _hbdata[hopefulIndex].getOpTime()) { - // hbinfo is not updated for ourself, so if we are primary we have to check the - // primary's last optime separately - *errmsg = str::stream() << "I am already primary, " << - _rsConfig.getMemberAt(hopefulIndex).getHostAndPort().toString() << - " can try again once I've stepped down"; - return true; - } + if (_iAmPrimary() && lastOpApplied >= _hbdata[hopefulIndex].getOpTime()) { + // hbinfo is not updated for ourself, so if we are primary we have to check the + // primary's last optime separately + *errmsg = str::stream() << "I am already primary, " + << _rsConfig.getMemberAt(hopefulIndex).getHostAndPort().toString() + << " can try again once I've stepped down"; + return true; + } - if (_currentPrimaryIndex != -1 && - (hopefulIndex != _currentPrimaryIndex) && - (_hbdata[_currentPrimaryIndex].getOpTime() >= - _hbdata[hopefulIndex].getOpTime())) { - // other members might be aware of more up-to-date nodes - *errmsg = str::stream() << - _rsConfig.getMemberAt(hopefulIndex).getHostAndPort().toString() << - " is trying to elect itself but " << - _rsConfig.getMemberAt(_currentPrimaryIndex).getHostAndPort().toString() << - " is already primary and more up-to-date"; - return true; - } + if (_currentPrimaryIndex != -1 && (hopefulIndex != _currentPrimaryIndex) && + (_hbdata[_currentPrimaryIndex].getOpTime() >= _hbdata[hopefulIndex].getOpTime())) { + // other members might be aware of more up-to-date nodes + *errmsg = + str::stream() << _rsConfig.getMemberAt(hopefulIndex).getHostAndPort().toString() + << " is trying to elect itself but " + << _rsConfig.getMemberAt(_currentPrimaryIndex).getHostAndPort().toString() + << " is already primary and more up-to-date"; + return true; + } - if ((highestPriorityIndex != -1)) { - const MemberConfig& hopefulMember = _rsConfig.getMemberAt(hopefulIndex); - const MemberConfig& priorityMember = _rsConfig.getMemberAt(highestPriorityIndex); - - if (priorityMember.getPriority() > hopefulMember.getPriority()) { - *errmsg = str::stream() - << hopefulMember.getHostAndPort().toString() - << " has lower priority of " << hopefulMember.getPriority() << " than " - << priorityMember.getHostAndPort().toString() - << " which has a priority of " << priorityMember.getPriority(); - return true; - } - } + if ((highestPriorityIndex != -1)) { + const MemberConfig& hopefulMember = _rsConfig.getMemberAt(hopefulIndex); + const MemberConfig& priorityMember = _rsConfig.getMemberAt(highestPriorityIndex); - 
UnelectableReasonMask reason = _getUnelectableReason(hopefulIndex, lastOpApplied); - reason &= ~RefusesToStand; - if (reason) { - *errmsg = str::stream() - << "I don't think " - << _rsConfig.getMemberAt(hopefulIndex).getHostAndPort().toString() - << " is electable because the " << _getUnelectableReasonString(reason); + if (priorityMember.getPriority() > hopefulMember.getPriority()) { + *errmsg = str::stream() << hopefulMember.getHostAndPort().toString() + << " has lower priority of " << hopefulMember.getPriority() + << " than " << priorityMember.getHostAndPort().toString() + << " which has a priority of " << priorityMember.getPriority(); return true; } - - return false; } - // produce a reply to a received electCmd - void TopologyCoordinatorImpl::prepareElectResponse( - const ReplicationCoordinator::ReplSetElectArgs& args, - const Date_t now, - const OpTime lastOpApplied, - BSONObjBuilder* response, - Status* result) { - - if (_selfIndex == -1) { - *result = Status(ErrorCodes::ReplicaSetNotFound, - "Cannot participate in election because not initialized"); - return; - } - - const long long myver = _rsConfig.getConfigVersion(); - const int highestPriorityIndex = _getHighestPriorityElectableIndex(now, lastOpApplied); - - const MemberConfig* primary = _currentPrimaryMember(); - const MemberConfig* hopeful = _rsConfig.findMemberByID(args.whoid); - const MemberConfig* highestPriority = highestPriorityIndex == -1 ? NULL : - &_rsConfig.getMemberAt(highestPriorityIndex); - - int vote = 0; - if (args.set != _rsConfig.getReplSetName()) { - log() << "replSet error received an elect request for '" << args.set - << "' but our set name is '" << - _rsConfig.getReplSetName() << "'"; - } - else if ( myver < args.cfgver ) { - // we are stale. don't vote - log() << "replSetElect not voting because our config version is stale. Our version: " << - myver << ", their version: " << args.cfgver; - } - else if ( myver > args.cfgver ) { - // they are stale! - log() << "replSetElect command received stale config version # during election. " - "Our version: " << myver << ", their version: " << args.cfgver; - vote = -10000; - } - else if (!hopeful) { - log() << "replSetElect couldn't find member with id " << args.whoid; - vote = -10000; - } - else if (_iAmPrimary()) { - log() << "I am already primary, " << hopeful->getHostAndPort().toString() - << " can try again once I've stepped down"; - vote = -10000; - } - else if (primary) { - log() << hopeful->getHostAndPort().toString() << " is trying to elect itself but " - << primary->getHostAndPort().toString() << " is already primary"; - vote = -10000; - } - else if (highestPriority && highestPriority->getPriority() > hopeful->getPriority()) { - // TODO(spencer): What if the lower-priority member is more up-to-date? 
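
One protocol detail worth calling out before the vote is cast below: the lastVote lease. Having voted, a node refuses to vote for a different candidate until LastVote::leaseTime (30 seconds, set earlier in this file) has elapsed, which keeps one node's votes from backing two competing candidates in the same election window. A compressed, hypothetical model of that check (not code from this diff):

#include <cstdio>

// Hypothetical, compressed model of the 30-second vote lease enforced in
// prepareElectResponse.
struct LastVote {
    long long whenMillis;  // when we last voted
    int whoId;             // member id we voted for
};

bool mayVoteFor(const LastVote& last, int candidateId, long long nowMillis) {
    const long long kLeaseMillis = 30 * 1000;
    // Within the lease we may only re-affirm the candidate we already backed.
    if (last.whenMillis + kLeaseMillis >= nowMillis && last.whoId != candidateId) {
        return false;  // vote no: lease still active for someone else
    }
    return true;
}

int main() {
    LastVote last = {100000, 42};  // voted for member 42 at t = 100 s
    std::printf("%d\n", mayVoteFor(last, 7, 110000));   // 0: other candidate, lease active
    std::printf("%d\n", mayVoteFor(last, 42, 110000));  // 1: same candidate is fine
    std::printf("%d\n", mayVoteFor(last, 7, 200000));   // 1: lease has expired
    return 0;
}
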
- log() << hopeful->getHostAndPort().toString() << " has lower priority than " - << highestPriority->getHostAndPort().toString(); - vote = -10000; - } - else if (_lastVote.when.millis + LastVote::leaseTime.total_milliseconds() >= now.millis && - _lastVote.whoId != args.whoid) { - log() << "replSet voting no for " - << hopeful->getHostAndPort().toString() - << "; voted for " << _lastVote.whoHostAndPort.toString() << ' ' - << (now.millis - _lastVote.when.millis) / 1000 << " secs ago"; - } - else { - _lastVote.when = now; - _lastVote.whoId = args.whoid; - _lastVote.whoHostAndPort = hopeful->getHostAndPort(); - vote = _selfConfig().getNumVotes(); - invariant(hopeful->getId() == args.whoid); - if (vote > 0) { - log() << "replSetElect voting yea for " << hopeful->getHostAndPort().toString() - << " (" << args.whoid << ')'; - } - } - - response->append("vote", vote); - response->append("round", args.round); - *result = Status::OK(); + UnelectableReasonMask reason = _getUnelectableReason(hopefulIndex, lastOpApplied); + reason &= ~RefusesToStand; + if (reason) { + *errmsg = str::stream() << "I don't think " + << _rsConfig.getMemberAt(hopefulIndex).getHostAndPort().toString() + << " is electable because the " + << _getUnelectableReasonString(reason); + return true; } - // produce a reply to a heartbeat - Status TopologyCoordinatorImpl::prepareHeartbeatResponse( - Date_t now, - const ReplSetHeartbeatArgs& args, - const std::string& ourSetName, - const OpTime& lastOpApplied, - ReplSetHeartbeatResponse* response) { - - if (args.getProtocolVersion() != 1) { + return false; +} + +// produce a reply to a received electCmd +void TopologyCoordinatorImpl::prepareElectResponse( + const ReplicationCoordinator::ReplSetElectArgs& args, + const Date_t now, + const OpTime lastOpApplied, + BSONObjBuilder* response, + Status* result) { + if (_selfIndex == -1) { + *result = Status(ErrorCodes::ReplicaSetNotFound, + "Cannot participate in election because not initialized"); + return; + } + + const long long myver = _rsConfig.getConfigVersion(); + const int highestPriorityIndex = _getHighestPriorityElectableIndex(now, lastOpApplied); + + const MemberConfig* primary = _currentPrimaryMember(); + const MemberConfig* hopeful = _rsConfig.findMemberByID(args.whoid); + const MemberConfig* highestPriority = + highestPriorityIndex == -1 ? NULL : &_rsConfig.getMemberAt(highestPriorityIndex); + + int vote = 0; + if (args.set != _rsConfig.getReplSetName()) { + log() << "replSet error received an elect request for '" << args.set + << "' but our set name is '" << _rsConfig.getReplSetName() << "'"; + } else if (myver < args.cfgver) { + // we are stale. don't vote + log() << "replSetElect not voting because our config version is stale. Our version: " + << myver << ", their version: " << args.cfgver; + } else if (myver > args.cfgver) { + // they are stale! + log() << "replSetElect command received stale config version # during election. 
" + "Our version: " << myver << ", their version: " << args.cfgver; + vote = -10000; + } else if (!hopeful) { + log() << "replSetElect couldn't find member with id " << args.whoid; + vote = -10000; + } else if (_iAmPrimary()) { + log() << "I am already primary, " << hopeful->getHostAndPort().toString() + << " can try again once I've stepped down"; + vote = -10000; + } else if (primary) { + log() << hopeful->getHostAndPort().toString() << " is trying to elect itself but " + << primary->getHostAndPort().toString() << " is already primary"; + vote = -10000; + } else if (highestPriority && highestPriority->getPriority() > hopeful->getPriority()) { + // TODO(spencer): What if the lower-priority member is more up-to-date? + log() << hopeful->getHostAndPort().toString() << " has lower priority than " + << highestPriority->getHostAndPort().toString(); + vote = -10000; + } else if (_lastVote.when.millis + LastVote::leaseTime.total_milliseconds() >= now.millis && + _lastVote.whoId != args.whoid) { + log() << "replSet voting no for " << hopeful->getHostAndPort().toString() << "; voted for " + << _lastVote.whoHostAndPort.toString() << ' ' + << (now.millis - _lastVote.when.millis) / 1000 << " secs ago"; + } else { + _lastVote.when = now; + _lastVote.whoId = args.whoid; + _lastVote.whoHostAndPort = hopeful->getHostAndPort(); + vote = _selfConfig().getNumVotes(); + invariant(hopeful->getId() == args.whoid); + if (vote > 0) { + log() << "replSetElect voting yea for " << hopeful->getHostAndPort().toString() << " (" + << args.whoid << ')'; + } + } + + response->append("vote", vote); + response->append("round", args.round); + *result = Status::OK(); +} + +// produce a reply to a heartbeat +Status TopologyCoordinatorImpl::prepareHeartbeatResponse(Date_t now, + const ReplSetHeartbeatArgs& args, + const std::string& ourSetName, + const OpTime& lastOpApplied, + ReplSetHeartbeatResponse* response) { + if (args.getProtocolVersion() != 1) { + return Status(ErrorCodes::BadValue, + str::stream() << "replset: incompatible replset protocol version: " + << args.getProtocolVersion()); + } + + // Verify that replica set names match + const std::string rshb = args.getSetName(); + if (ourSetName != rshb) { + log() << "replSet set names do not match, ours: " << ourSetName + << "; remote node's: " << rshb; + response->noteMismatched(); + return Status(ErrorCodes::InconsistentReplicaSetNames, + str::stream() << "Our set name of " << ourSetName << " does not match name " + << rshb << " reported by remote node"); + } + + const MemberState myState = getMemberState(); + if (_selfIndex == -1) { + if (myState.removed()) { + return Status(ErrorCodes::InvalidReplicaSetConfig, + "Our replica set configuration is invalid or does not include us"); + } + } else { + invariant(_rsConfig.getReplSetName() == args.getSetName()); + if (args.getSenderId() == _selfConfig().getId()) { return Status(ErrorCodes::BadValue, - str::stream() << "replset: incompatible replset protocol version: " - << args.getProtocolVersion()); - } - - // Verify that replica set names match - const std::string rshb = args.getSetName(); - if (ourSetName != rshb) { - log() << "replSet set names do not match, ours: " << ourSetName << - "; remote node's: " << rshb; - response->noteMismatched(); - return Status(ErrorCodes::InconsistentReplicaSetNames, str::stream() << - "Our set name of " << ourSetName << " does not match name " << rshb << - " reported by remote node"); + str::stream() << "Received heartbeat from member with the same " + "member ID as ourself: " << 
args.getSenderId()); } + } - const MemberState myState = getMemberState(); - if (_selfIndex == -1) { - if (myState.removed()) { - return Status(ErrorCodes::InvalidReplicaSetConfig, - "Our replica set configuration is invalid or does not include us"); - } - } - else { - invariant(_rsConfig.getReplSetName() == args.getSetName()); - if (args.getSenderId() == _selfConfig().getId()) { - return Status(ErrorCodes::BadValue, - str::stream() << "Received heartbeat from member with the same " - "member ID as ourself: " << args.getSenderId()); - } - } - - // This is a replica set - response->noteReplSet(); - - // For 2.6 compatibility - if (_rsConfig.isInitialized()) { - response->setSetName(ourSetName); - } - response->setState(myState.s); - if (myState.primary()) { - response->setElectionTime(_electionTime); - } - - // Are we electable - response->setElectable(!_getMyUnelectableReason(now, lastOpApplied)); - - // Heartbeat status message - response->setHbMsg(_getHbmsg(now)); - response->setTime(Seconds(Milliseconds(now.asInt64()).total_seconds())); - response->setOpTime(lastOpApplied.asDate()); - - if (!_syncSource.empty()) { - response->setSyncingTo(_syncSource.toString()); - } + // This is a replica set + response->noteReplSet(); - if (!_rsConfig.isInitialized()) { - response->setVersion(-2); - return Status::OK(); - } + // For 2.6 compatibility + if (_rsConfig.isInitialized()) { + response->setSetName(ourSetName); + } + response->setState(myState.s); + if (myState.primary()) { + response->setElectionTime(_electionTime); + } - const long long v = _rsConfig.getConfigVersion(); - response->setVersion(v); - // Deliver new config if caller's version is older than ours - if (v > args.getConfigVersion()) { - response->setConfig(_rsConfig); - } + // Are we electable + response->setElectable(!_getMyUnelectableReason(now, lastOpApplied)); - // Resolve the caller's id in our Member list - int from = -1; - if (v == args.getConfigVersion() && args.getSenderId() != -1) { - from = _getMemberIndex(args.getSenderId()); - } - if (from == -1) { - // Can't find the member, so we leave out the stateDisagreement field - return Status::OK(); - } - invariant(from != _selfIndex); + // Heartbeat status message + response->setHbMsg(_getHbmsg(now)); + response->setTime(Seconds(Milliseconds(now.asInt64()).total_seconds())); + response->setOpTime(lastOpApplied.asDate()); - // if we thought that this node is down, let it know - if (!_hbdata[from].up()) { - response->noteStateDisagreement(); - } + if (!_syncSource.empty()) { + response->setSyncingTo(_syncSource.toString()); + } - // note that we got a heartbeat from this node - _hbdata[from].setLastHeartbeatRecv(now); + if (!_rsConfig.isInitialized()) { + response->setVersion(-2); return Status::OK(); } - - int TopologyCoordinatorImpl::_getMemberIndex(int id) const { - int index = 0; - for (ReplicaSetConfig::MemberIterator it = _rsConfig.membersBegin(); - it != _rsConfig.membersEnd(); - ++it, ++index) { - if (it->getId() == id) { - return index; - } - } - return -1; + const long long v = _rsConfig.getConfigVersion(); + response->setVersion(v); + // Deliver new config if caller's version is older than ours + if (v > args.getConfigVersion()) { + response->setConfig(_rsConfig); } - std::pair<ReplSetHeartbeatArgs, Milliseconds> TopologyCoordinatorImpl::prepareHeartbeatRequest( - Date_t now, - const std::string& ourSetName, - const HostAndPort& target) { - - PingStats& hbStats = _pings[target]; - Milliseconds alreadyElapsed(now.asInt64() - 
hbStats.getLastHeartbeatStartDate().asInt64()); - if (!_rsConfig.isInitialized() || - (hbStats.getNumFailuresSinceLastStart() > kMaxHeartbeatRetries) || - (alreadyElapsed >= _rsConfig.getHeartbeatTimeoutPeriodMillis())) { - - // This is either the first request ever for "target", or the heartbeat timeout has - // passed, so we're starting a "new" heartbeat. - hbStats.start(now); - alreadyElapsed = Milliseconds(0); - } - ReplSetHeartbeatArgs hbArgs; - hbArgs.setProtocolVersion(1); - hbArgs.setCheckEmpty(false); - if (_rsConfig.isInitialized()) { - hbArgs.setSetName(_rsConfig.getReplSetName()); - hbArgs.setConfigVersion(_rsConfig.getConfigVersion()); - if (_selfIndex >= 0) { - const MemberConfig& me = _selfConfig(); - hbArgs.setSenderHost(me.getHostAndPort()); - hbArgs.setSenderId(me.getId()); - } - } - else { - hbArgs.setSetName(ourSetName); - hbArgs.setConfigVersion(-2); - } - - const Milliseconds timeoutPeriod( - _rsConfig.isInitialized() ? - _rsConfig.getHeartbeatTimeoutPeriodMillis() : - Milliseconds( - ReplicaSetConfig::kDefaultHeartbeatTimeoutPeriod.total_milliseconds())); - const Milliseconds timeout( - timeoutPeriod.total_milliseconds() - alreadyElapsed.total_milliseconds()); - return std::make_pair(hbArgs, timeout); - } - - HeartbeatResponseAction TopologyCoordinatorImpl::processHeartbeatResponse( - Date_t now, - Milliseconds networkRoundTripTime, - const HostAndPort& target, - const StatusWith<ReplSetHeartbeatResponse>& hbResponse, - OpTime myLastOpApplied) { - - const MemberState originalState = getMemberState(); - PingStats& hbStats = _pings[target]; - invariant(hbStats.getLastHeartbeatStartDate() != Date_t(0)); - if (!hbResponse.isOK()) { - hbStats.miss(); - } - else { - hbStats.hit(networkRoundTripTime.total_milliseconds()); - // Log diagnostics. 
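// Aside: a minimal, hypothetical sketch of the PingStats bookkeeping that the
// retry logic around here relies on (invented simplification; the real class
// also tracks round-trip ping averages; would need <limits>):
//
//     struct PingStatsSketch {
//         Date_t lastStart;
//         int failuresSinceLastStart;
//         PingStatsSketch() : lastStart(0), failuresSinceLastStart(0) {}
//         void start(Date_t now) {  // begin a new heartbeat period
//             lastStart = now;
//             failuresSinceLastStart = 0;
//         }
//         void miss() { ++failuresSinceLastStart; }  // checked against kMaxHeartbeatRetries
//         void hit() {
//             // a success has to push the count past kMaxHeartbeatRetries,
//             // or the "retry immediately" scheduling below would fire again
//             // right after a good response
//             failuresSinceLastStart = std::numeric_limits<int>::max();
//         }
//     };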
- if (hbResponse.getValue().isStateDisagreement()) { - LOG(1) << target << - " thinks that we are down because they cannot send us heartbeats."; - } - } - - const bool isUnauthorized = - (hbResponse.getStatus().code() == ErrorCodes::Unauthorized) || - (hbResponse.getStatus().code() == ErrorCodes::AuthenticationFailed); - - Milliseconds alreadyElapsed(now.asInt64() - hbStats.getLastHeartbeatStartDate().asInt64()); - Date_t nextHeartbeatStartDate; - // determine next start time - if (_rsConfig.isInitialized() && - (hbStats.getNumFailuresSinceLastStart() <= kMaxHeartbeatRetries) && - (alreadyElapsed < _rsConfig.getHeartbeatTimeoutPeriodMillis())) { - - if (isUnauthorized) { - nextHeartbeatStartDate = now + kHeartbeatInterval.total_milliseconds(); - } else { - nextHeartbeatStartDate = now; - } - } - else { + // Resolve the caller's id in our Member list + int from = -1; + if (v == args.getConfigVersion() && args.getSenderId() != -1) { + from = _getMemberIndex(args.getSenderId()); + } + if (from == -1) { + // Can't find the member, so we leave out the stateDisagreement field + return Status::OK(); + } + invariant(from != _selfIndex); + + // if we thought that this node is down, let it know + if (!_hbdata[from].up()) { + response->noteStateDisagreement(); + } + + // note that we got a heartbeat from this node + _hbdata[from].setLastHeartbeatRecv(now); + return Status::OK(); +} + + +int TopologyCoordinatorImpl::_getMemberIndex(int id) const { + int index = 0; + for (ReplicaSetConfig::MemberIterator it = _rsConfig.membersBegin(); + it != _rsConfig.membersEnd(); + ++it, ++index) { + if (it->getId() == id) { + return index; + } + } + return -1; +} + +std::pair<ReplSetHeartbeatArgs, Milliseconds> TopologyCoordinatorImpl::prepareHeartbeatRequest( + Date_t now, const std::string& ourSetName, const HostAndPort& target) { + PingStats& hbStats = _pings[target]; + Milliseconds alreadyElapsed(now.asInt64() - hbStats.getLastHeartbeatStartDate().asInt64()); + if (!_rsConfig.isInitialized() || + (hbStats.getNumFailuresSinceLastStart() > kMaxHeartbeatRetries) || + (alreadyElapsed >= _rsConfig.getHeartbeatTimeoutPeriodMillis())) { + // This is either the first request ever for "target", or the heartbeat timeout has + // passed, so we're starting a "new" heartbeat. + hbStats.start(now); + alreadyElapsed = Milliseconds(0); + } + ReplSetHeartbeatArgs hbArgs; + hbArgs.setProtocolVersion(1); + hbArgs.setCheckEmpty(false); + if (_rsConfig.isInitialized()) { + hbArgs.setSetName(_rsConfig.getReplSetName()); + hbArgs.setConfigVersion(_rsConfig.getConfigVersion()); + if (_selfIndex >= 0) { + const MemberConfig& me = _selfConfig(); + hbArgs.setSenderHost(me.getHostAndPort()); + hbArgs.setSenderId(me.getId()); + } + } else { + hbArgs.setSetName(ourSetName); + hbArgs.setConfigVersion(-2); + } + + const Milliseconds timeoutPeriod( + _rsConfig.isInitialized() + ? 
_rsConfig.getHeartbeatTimeoutPeriodMillis()
+        : Milliseconds(ReplicaSetConfig::kDefaultHeartbeatTimeoutPeriod.total_milliseconds()));
+    const Milliseconds timeout(timeoutPeriod.total_milliseconds() -
+                               alreadyElapsed.total_milliseconds());
+    return std::make_pair(hbArgs, timeout);
+}
+
+HeartbeatResponseAction TopologyCoordinatorImpl::processHeartbeatResponse(
+    Date_t now,
+    Milliseconds networkRoundTripTime,
+    const HostAndPort& target,
+    const StatusWith<ReplSetHeartbeatResponse>& hbResponse,
+    OpTime myLastOpApplied) {
+    const MemberState originalState = getMemberState();
+    PingStats& hbStats = _pings[target];
+    invariant(hbStats.getLastHeartbeatStartDate() != Date_t(0));
+    if (!hbResponse.isOK()) {
+        hbStats.miss();
+    } else {
+        hbStats.hit(networkRoundTripTime.total_milliseconds());
+        // Log diagnostics.
+        if (hbResponse.getValue().isStateDisagreement()) {
+            LOG(1) << target << " thinks that we are down because they cannot send us heartbeats.";
+        }
+    }
+
+    const bool isUnauthorized = (hbResponse.getStatus().code() == ErrorCodes::Unauthorized) ||
+        (hbResponse.getStatus().code() == ErrorCodes::AuthenticationFailed);
+
+    Milliseconds alreadyElapsed(now.asInt64() - hbStats.getLastHeartbeatStartDate().asInt64());
+    Date_t nextHeartbeatStartDate;
+    // determine next start time
+    if (_rsConfig.isInitialized() &&
+        (hbStats.getNumFailuresSinceLastStart() <= kMaxHeartbeatRetries) &&
+        (alreadyElapsed < _rsConfig.getHeartbeatTimeoutPeriodMillis())) {
+        if (isUnauthorized) {
            nextHeartbeatStartDate = now + kHeartbeatInterval.total_milliseconds();
+        } else {
+            nextHeartbeatStartDate = now;
        }
+    } else {
+        nextHeartbeatStartDate = now + kHeartbeatInterval.total_milliseconds();
+    }

-        if (hbResponse.isOK() && hbResponse.getValue().hasConfig()) {
-            const long long currentConfigVersion =
-                _rsConfig.isInitialized() ? _rsConfig.getConfigVersion() : -2;
-            const ReplicaSetConfig& newConfig = hbResponse.getValue().getConfig();
-            if (newConfig.getConfigVersion() > currentConfigVersion) {
-                HeartbeatResponseAction nextAction = HeartbeatResponseAction::makeReconfigAction();
-                nextAction.setNextHeartbeatStartDate(nextHeartbeatStartDate);
-                return nextAction;
-            }
-            else {
-                // Could be we got the newer version before we got the response, or the
-                // target erroneously sent us one, even though it isn't newer.
-                if (newConfig.getConfigVersion() < currentConfigVersion) {
-                    LOG(1) << "Config version from heartbeat was older than ours.";
-                }
-                else {
-                    LOG(2) << "Config from heartbeat response was same as ours.";
-                }
-                if (logger::globalLogDomain()->shouldLog(
-                            MongoLogDefaultComponent_component,
-                            ::mongo::LogstreamBuilder::severityCast(2))) {
-                    LogstreamBuilder lsb = log();
-                    if (_rsConfig.isInitialized()) {
-                        lsb << "Current config: " << _rsConfig.toBSON() << "; ";
-                    }
-                    lsb << "Config in heartbeat: " << newConfig.toBSON();
-                }
-            }
-        }
-
-        // Check if the heartbeat target is in our config. If it isn't, there's nothing left to do,
-        // so return early. 
-        if (!_rsConfig.isInitialized()) {
-            HeartbeatResponseAction nextAction = HeartbeatResponseAction::makeNoAction();
-            nextAction.setNextHeartbeatStartDate(nextHeartbeatStartDate);
-            return nextAction;
-        }
-        const int memberIndex = _rsConfig.findMemberIndexByHostAndPort(target);
-        if (memberIndex == -1) {
-            LOG(1) << "replset: Could not find " << target << " in current config so ignoring --"
-                " current config: " << _rsConfig.toBSON();
-            HeartbeatResponseAction nextAction = HeartbeatResponseAction::makeNoAction();
+    if (hbResponse.isOK() && hbResponse.getValue().hasConfig()) {
+        const long long currentConfigVersion =
+            _rsConfig.isInitialized() ? _rsConfig.getConfigVersion() : -2;
+        const ReplicaSetConfig& newConfig = hbResponse.getValue().getConfig();
+        if (newConfig.getConfigVersion() > currentConfigVersion) {
+            HeartbeatResponseAction nextAction = HeartbeatResponseAction::makeReconfigAction();
            nextAction.setNextHeartbeatStartDate(nextHeartbeatStartDate);
            return nextAction;
-        }
-        invariant(memberIndex != _selfIndex);
-
-        MemberHeartbeatData& hbData = _hbdata[memberIndex];
-        const MemberConfig member = _rsConfig.getMemberAt(memberIndex);
-        if (!hbResponse.isOK()) {
-            if (isUnauthorized) {
-                LOG(1) << "setAuthIssue: heartbeat response failed due to authentication"
-                    " issue for member _id:" << member.getId();
-                hbData.setAuthIssue(now);
-            }
-            else if (hbStats.getNumFailuresSinceLastStart() > kMaxHeartbeatRetries ||
-                    alreadyElapsed >= _rsConfig.getHeartbeatTimeoutPeriodMillis()) {
-
-                LOG(1) << "setDownValues: heartbeat response failed for member _id:"
-                       << member.getId() << ", msg: "
-                       << hbResponse.getStatus().reason();
-
-                hbData.setDownValues(now, hbResponse.getStatus().reason());
+        } else {
+            // Could be we got the newer version before we got the response, or the
+            // target erroneously sent us one, even though it isn't newer.
+            if (newConfig.getConfigVersion() < currentConfigVersion) {
+                LOG(1) << "Config version from heartbeat was older than ours.";
+            } else {
+                LOG(2) << "Config from heartbeat response was same as ours.";
            }
-            else {
-                LOG(3) << "Bad heartbeat response from " << target <<
-                    "; trying again; Retries left: " <<
-                    (kMaxHeartbeatRetries - hbStats.getNumFailuresSinceLastStart()) <<
-                    "; " << alreadyElapsed.total_milliseconds() << "ms have already elapsed";
+            if (logger::globalLogDomain()->shouldLog(MongoLogDefaultComponent_component,
+                                                     ::mongo::LogstreamBuilder::severityCast(2))) {
+                LogstreamBuilder lsb = log();
+                if (_rsConfig.isInitialized()) {
+                    lsb << "Current config: " << _rsConfig.toBSON() << "; ";
+                }
+                lsb << "Config in heartbeat: " << newConfig.toBSON();
            }
        }
-        else {
-            ReplSetHeartbeatResponse hbr = hbResponse.getValue();
-            LOG(3) << "setUpValues: heartbeat response good for member _id:"
-                << member.getId() << ", msg: "
-                << hbr.getHbMsg();
-            hbData.setUpValues(now, member.getHostAndPort(), hbr);
-        }
-        HeartbeatResponseAction nextAction = _updateHeartbeatDataImpl(
-            memberIndex,
-            originalState,
-            now,
-            myLastOpApplied);
+    }

-        nextAction.setNextHeartbeatStartDate(nextHeartbeatStartDate);
-        return nextAction;
-    }
+    // Check if the heartbeat target is in our config. If it isn't, there's nothing left to do,
+    // so return early. 
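// Aside: the version handling above boils down to a three-way compare. A
// hypothetical reduction (names invented for illustration only):
//
//     enum ConfigAction { kStartReconfig, kRemoteIsStale, kSameVersion };
//     ConfigAction reconcileConfigVersions(long long ours, long long theirs) {
//         if (theirs > ours)
//             return kStartReconfig;  // schedule adoption of the newer config
//         if (theirs < ours)
//             return kRemoteIsStale;  // remote is behind; we only log
//         return kSameVersion;        // same version; log at higher verbosity
//     }
//
// An uninitialized local config counts as version -2, so any real config
// delivered by a heartbeat response wins the comparison.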
+ if (!_rsConfig.isInitialized()) { + HeartbeatResponseAction nextAction = HeartbeatResponseAction::makeNoAction(); nextAction.setNextHeartbeatStartDate(nextHeartbeatStartDate); return nextAction; } + const int memberIndex = _rsConfig.findMemberIndexByHostAndPort(target); + if (memberIndex == -1) { + LOG(1) << "replset: Could not find " << target << " in current config so ignoring --" + " current config: " << _rsConfig.toBSON(); + HeartbeatResponseAction nextAction = HeartbeatResponseAction::makeNoAction(); + nextAction.setNextHeartbeatStartDate(nextHeartbeatStartDate); + return nextAction; + } + invariant(memberIndex != _selfIndex); + + MemberHeartbeatData& hbData = _hbdata[memberIndex]; + const MemberConfig member = _rsConfig.getMemberAt(memberIndex); + if (!hbResponse.isOK()) { + if (isUnauthorized) { + LOG(1) << "setAuthIssue: heartbeat response failed due to authentication" + " issue for member _id:" << member.getId(); + hbData.setAuthIssue(now); + } else if (hbStats.getNumFailuresSinceLastStart() > kMaxHeartbeatRetries || + alreadyElapsed >= _rsConfig.getHeartbeatTimeoutPeriodMillis()) { + LOG(1) << "setDownValues: heartbeat response failed for member _id:" << member.getId() + << ", msg: " << hbResponse.getStatus().reason(); + + hbData.setDownValues(now, hbResponse.getStatus().reason()); + } else { + LOG(3) << "Bad heartbeat response from " << target << "; trying again; Retries left: " + << (kMaxHeartbeatRetries - hbStats.getNumFailuresSinceLastStart()) << "; " + << alreadyElapsed.total_milliseconds() << "ms have already elapsed"; + } + } else { + ReplSetHeartbeatResponse hbr = hbResponse.getValue(); + LOG(3) << "setUpValues: heartbeat response good for member _id:" << member.getId() + << ", msg: " << hbr.getHbMsg(); + hbData.setUpValues(now, member.getHostAndPort(), hbr); + } + HeartbeatResponseAction nextAction = + _updateHeartbeatDataImpl(memberIndex, originalState, now, myLastOpApplied); + + nextAction.setNextHeartbeatStartDate(nextHeartbeatStartDate); + return nextAction; +} + +HeartbeatResponseAction TopologyCoordinatorImpl::_updateHeartbeatDataImpl( + int updatedConfigIndex, + const MemberState& originalState, + Date_t now, + const OpTime& lastOpApplied) { + // This method has two interrelated responsibilities, performed in two phases. + // + // First, it updates the local notion of which remote node, if any is primary. In the + // process, it may request a remote primary to step down because there is a higher priority + // node waiting, or because the local node thinks it is primary and that it has a more + // recent electionTime. It may instead decide that the local node should step down itself, + // because a remote has a more recent election time. + // + // Second, if there is no remote primary, and the local node is not primary, it considers + // whether or not to stand for election. + invariant(updatedConfigIndex != _selfIndex); + + // We are missing from the config, so do not participate in primary maintenance or election. + if (_selfIndex == -1) { + return HeartbeatResponseAction::makeNoAction(); + } + + //////////////////// + // Phase 1 + //////////////////// + + // If we believe the node whose data was just updated is primary, confirm that + // the updated data supports that notion. If not, erase our notion of who is primary. 
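// Aside: a condensed, informal summary of the two phases described above:
//
//     phase 1 (primary bookkeeping):
//         if the updated member was our primary and no longer looks primary/up,
//             clear _currentPrimaryIndex;
//         if a higher-priority member is electable and fresh enough,
//             step down the primary (ourselves or the remote);
//         scan heartbeat data for a remote primary; if two nodes claim primacy,
//             the one with the older election time steps down;
//     phase 2 (no known primary):
//         if we are primary but cannot see a majority, step down;
//         otherwise consider checkShouldStandForElection().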
+ if (updatedConfigIndex == _currentPrimaryIndex) { + const MemberHeartbeatData& updatedHBData = _hbdata[updatedConfigIndex]; + if (!updatedHBData.up() || !updatedHBData.getState().primary()) { + _currentPrimaryIndex = -1; + } + } + + // If the current primary is not highest priority and up to date (within 10s), + // have them/me stepdown. + if (_currentPrimaryIndex != -1) { + // check if we should ask the primary (possibly ourselves) to step down + const int highestPriorityIndex = _getHighestPriorityElectableIndex(now, lastOpApplied); + if (highestPriorityIndex != -1) { + const MemberConfig& currentPrimaryMember = _rsConfig.getMemberAt(_currentPrimaryIndex); + const MemberConfig& highestPriorityMember = _rsConfig.getMemberAt(highestPriorityIndex); + const OpTime highestPriorityMemberOptime = highestPriorityIndex == _selfIndex + ? lastOpApplied + : _hbdata[highestPriorityIndex].getOpTime(); - HeartbeatResponseAction TopologyCoordinatorImpl::_updateHeartbeatDataImpl( - int updatedConfigIndex, - const MemberState& originalState, - Date_t now, - const OpTime& lastOpApplied) { - - // This method has two interrelated responsibilities, performed in two phases. - // - // First, it updates the local notion of which remote node, if any is primary. In the - // process, it may request a remote primary to step down because there is a higher priority - // node waiting, or because the local node thinks it is primary and that it has a more - // recent electionTime. It may instead decide that the local node should step down itself, - // because a remote has a more recent election time. - // - // Second, if there is no remote primary, and the local node is not primary, it considers - // whether or not to stand for election. - invariant(updatedConfigIndex != _selfIndex); - - // We are missing from the config, so do not participate in primary maintenance or election. - if (_selfIndex == -1) { - return HeartbeatResponseAction::makeNoAction(); - } - - //////////////////// - // Phase 1 - //////////////////// - - // If we believe the node whose data was just updated is primary, confirm that - // the updated data supports that notion. If not, erase our notion of who is primary. - if (updatedConfigIndex == _currentPrimaryIndex) { - const MemberHeartbeatData& updatedHBData = _hbdata[updatedConfigIndex]; - if (!updatedHBData.up() || !updatedHBData.getState().primary()) { - _currentPrimaryIndex = -1; - } - } + if ((highestPriorityMember.getPriority() > currentPrimaryMember.getPriority()) && + _isOpTimeCloseEnoughToLatestToElect(highestPriorityMemberOptime, lastOpApplied)) { + const OpTime latestOpTime = _latestKnownOpTime(lastOpApplied); - // If the current primary is not highest priority and up to date (within 10s), - // have them/me stepdown. - if (_currentPrimaryIndex != -1) { - // check if we should ask the primary (possibly ourselves) to step down - const int highestPriorityIndex = _getHighestPriorityElectableIndex(now, lastOpApplied); - if (highestPriorityIndex != -1) { - const MemberConfig& currentPrimaryMember = - _rsConfig.getMemberAt(_currentPrimaryIndex); - const MemberConfig& highestPriorityMember = - _rsConfig.getMemberAt(highestPriorityIndex); - const OpTime highestPriorityMemberOptime = highestPriorityIndex == _selfIndex ? 
- lastOpApplied : _hbdata[highestPriorityIndex].getOpTime(); - - if ((highestPriorityMember.getPriority() > currentPrimaryMember.getPriority()) && - _isOpTimeCloseEnoughToLatestToElect(highestPriorityMemberOptime, - lastOpApplied)) { - const OpTime latestOpTime = _latestKnownOpTime(lastOpApplied); - - if (_iAmPrimary()) { - if (_stepDownPending) { - return HeartbeatResponseAction::makeNoAction(); - } - _stepDownPending = true; - log() << "Stepping down self (priority " - << currentPrimaryMember.getPriority() << ") because " - << highestPriorityMember.getHostAndPort() << " has higher priority " - << highestPriorityMember.getPriority() << " and is only " - << (latestOpTime.getSecs() - highestPriorityMemberOptime.getSecs()) - << " seconds behind me"; - const Date_t until = now + - LastVote::leaseTime.total_milliseconds() + - kHeartbeatInterval.total_milliseconds(); - if (_electionSleepUntil < until) { - _electionSleepUntil = until; - } - return HeartbeatResponseAction::makeStepDownSelfAction(_selfIndex); + if (_iAmPrimary()) { + if (_stepDownPending) { + return HeartbeatResponseAction::makeNoAction(); } - else if ((highestPriorityMemberOptime == _selfIndex) && - (_electionSleepUntil <= now)) { - // If this node is the highest priority node, and it is not in - // an inter-election sleep period, ask the current primary to step down. - // This is an optimization, because the remote primary will almost certainly - // notice this node's electability promptly, via its own heartbeat process. - log() << "Requesting that " << currentPrimaryMember.getHostAndPort() - << " (priority " << currentPrimaryMember.getPriority() - << ") step down because I have higher priority " - << highestPriorityMember.getPriority() << " and am only " - << (latestOpTime.getSecs() - highestPriorityMemberOptime.getSecs()) - << " seconds behind it"; - int primaryIndex = _currentPrimaryIndex; - _currentPrimaryIndex = -1; - return HeartbeatResponseAction::makeStepDownRemoteAction(primaryIndex); + _stepDownPending = true; + log() << "Stepping down self (priority " << currentPrimaryMember.getPriority() + << ") because " << highestPriorityMember.getHostAndPort() + << " has higher priority " << highestPriorityMember.getPriority() + << " and is only " + << (latestOpTime.getSecs() - highestPriorityMemberOptime.getSecs()) + << " seconds behind me"; + const Date_t until = now + LastVote::leaseTime.total_milliseconds() + + kHeartbeatInterval.total_milliseconds(); + if (_electionSleepUntil < until) { + _electionSleepUntil = until; } + return HeartbeatResponseAction::makeStepDownSelfAction(_selfIndex); + } else if ((highestPriorityMemberOptime == _selfIndex) && + (_electionSleepUntil <= now)) { + // If this node is the highest priority node, and it is not in + // an inter-election sleep period, ask the current primary to step down. + // This is an optimization, because the remote primary will almost certainly + // notice this node's electability promptly, via its own heartbeat process. 
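// Aside: a worked example of the freshness gate above, with invented numbers.
// Say the latest known optime is t = 500s, the current primary has priority 1,
// and a priority-2 member has applied up to t = 493s. Since 493 + 10 >= 500
// (_isOpTimeCloseEnoughToLatestToElect), the higher-priority member counts as
// caught up enough and the stepdown path is taken; at t = 489s it would not
// be, and the lower-priority primary keeps its seat until the member catches up.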
+ log() << "Requesting that " << currentPrimaryMember.getHostAndPort() + << " (priority " << currentPrimaryMember.getPriority() + << ") step down because I have higher priority " + << highestPriorityMember.getPriority() << " and am only " + << (latestOpTime.getSecs() - highestPriorityMemberOptime.getSecs()) + << " seconds behind it"; + int primaryIndex = _currentPrimaryIndex; + _currentPrimaryIndex = -1; + return HeartbeatResponseAction::makeStepDownRemoteAction(primaryIndex); } } } + } - // Scan the member list's heartbeat data for who is primary, and update - // _currentPrimaryIndex and _role, or request a remote to step down, as necessary. - { - int remotePrimaryIndex = -1; - for (std::vector<MemberHeartbeatData>::const_iterator it = _hbdata.begin(); - it != _hbdata.end(); - ++it) { - const int itIndex = indexOfIterator(_hbdata, it); - if (itIndex == _selfIndex) { - continue; - } - - if( it->getState().primary() && it->up() ) { - if (remotePrimaryIndex != -1) { - // two other nodes think they are primary (asynchronously polled) - // -- wait for things to settle down. - log() << "replSet info two remote primaries (transiently)"; - return HeartbeatResponseAction::makeNoAction(); - } - remotePrimaryIndex = itIndex; - } + // Scan the member list's heartbeat data for who is primary, and update + // _currentPrimaryIndex and _role, or request a remote to step down, as necessary. + { + int remotePrimaryIndex = -1; + for (std::vector<MemberHeartbeatData>::const_iterator it = _hbdata.begin(); + it != _hbdata.end(); + ++it) { + const int itIndex = indexOfIterator(_hbdata, it); + if (itIndex == _selfIndex) { + continue; } - if (remotePrimaryIndex != -1) { - // If it's the same as last time, don't do anything further. - if (_currentPrimaryIndex == remotePrimaryIndex) { + if (it->getState().primary() && it->up()) { + if (remotePrimaryIndex != -1) { + // two other nodes think they are primary (asynchronously polled) + // -- wait for things to settle down. + log() << "replSet info two remote primaries (transiently)"; return HeartbeatResponseAction::makeNoAction(); } - // Clear last heartbeat message on ourselves (why?) - setMyHeartbeatMessage(now, ""); - - // If we are also primary, this is a problem. Determine who should step down. - if (_iAmPrimary()) { - OpTime remoteElectionTime = _hbdata[remotePrimaryIndex].getElectionTime(); - log() << "replset: another primary seen with election time " - << remoteElectionTime << " my election time is " << _electionTime; - - // Step down whomever has the older election time. - if (remoteElectionTime > _electionTime) { - if (_stepDownPending) { - return HeartbeatResponseAction::makeNoAction(); - } - _stepDownPending = true; - log() << "stepping down; another primary was elected more recently"; - return HeartbeatResponseAction::makeStepDownSelfAction(_selfIndex); - } - else { - log() << "another PRIMARY detected and it should step down" - " since it was elected earlier than me"; - return HeartbeatResponseAction::makeStepDownRemoteAction( - remotePrimaryIndex); - } - } - - _currentPrimaryIndex = remotePrimaryIndex; - return HeartbeatResponseAction::makeNoAction(); + remotePrimaryIndex = itIndex; } } - //////////////////// - // Phase 2 - //////////////////// + if (remotePrimaryIndex != -1) { + // If it's the same as last time, don't do anything further. + if (_currentPrimaryIndex == remotePrimaryIndex) { + return HeartbeatResponseAction::makeNoAction(); + } + // Clear last heartbeat message on ourselves (why?) 
+ setMyHeartbeatMessage(now, ""); - // We do not believe any remote to be primary. + // If we are also primary, this is a problem. Determine who should step down. + if (_iAmPrimary()) { + OpTime remoteElectionTime = _hbdata[remotePrimaryIndex].getElectionTime(); + log() << "replset: another primary seen with election time " << remoteElectionTime + << " my election time is " << _electionTime; - // If we are primary, check if we can still see majority of the set; - // stepdown if we can't. - if (_iAmPrimary()) { - if (CannotSeeMajority & _getMyUnelectableReason(now, lastOpApplied)) { - if (_stepDownPending) { - return HeartbeatResponseAction::makeNoAction(); + // Step down whomever has the older election time. + if (remoteElectionTime > _electionTime) { + if (_stepDownPending) { + return HeartbeatResponseAction::makeNoAction(); + } + _stepDownPending = true; + log() << "stepping down; another primary was elected more recently"; + return HeartbeatResponseAction::makeStepDownSelfAction(_selfIndex); + } else { + log() << "another PRIMARY detected and it should step down" + " since it was elected earlier than me"; + return HeartbeatResponseAction::makeStepDownRemoteAction(remotePrimaryIndex); } - _stepDownPending = true; - log() << "can't see a majority of the set, relinquishing primary"; - return HeartbeatResponseAction::makeStepDownSelfAction(_selfIndex); } - LOG(2) << "Choosing to remain primary"; + _currentPrimaryIndex = remotePrimaryIndex; return HeartbeatResponseAction::makeNoAction(); } + } - fassert(18505, _currentPrimaryIndex == -1); - - const MemberState currentState = getMemberState(); - if (originalState.recovering() && currentState.secondary()) { - // We just transitioned from RECOVERING to SECONDARY, this can only happen if we - // received a heartbeat with an auth error when previously all the heartbeats we'd - // received had auth errors. In this case, don't return makeElectAction() because - // that could cause the election to start before the ReplicationCoordinator has updated - // its notion of the member state to SECONDARY. Instead return noAction so that the - // ReplicationCooridinator knows to update its tracking of the member state off of the - // TopologyCoordinator, and leave starting the election until the next heartbeat comes - // back. - return HeartbeatResponseAction::makeNoAction(); - } + //////////////////// + // Phase 2 + //////////////////// - // At this point, there is no primary anywhere. Check to see if we should become a - // candidate. - if (!checkShouldStandForElection(now, lastOpApplied)) { - return HeartbeatResponseAction::makeNoAction(); + // We do not believe any remote to be primary. + + // If we are primary, check if we can still see majority of the set; + // stepdown if we can't. 
+ if (_iAmPrimary()) { + if (CannotSeeMajority & _getMyUnelectableReason(now, lastOpApplied)) { + if (_stepDownPending) { + return HeartbeatResponseAction::makeNoAction(); + } + _stepDownPending = true; + log() << "can't see a majority of the set, relinquishing primary"; + return HeartbeatResponseAction::makeStepDownSelfAction(_selfIndex); } - return HeartbeatResponseAction::makeElectAction(); + + LOG(2) << "Choosing to remain primary"; + return HeartbeatResponseAction::makeNoAction(); } - bool TopologyCoordinatorImpl::checkShouldStandForElection( - Date_t now, const OpTime& lastOpApplied) { - if (_currentPrimaryIndex != -1) { - return false; - } - invariant (_role != Role::leader); + fassert(18505, _currentPrimaryIndex == -1); - if (_role == Role::candidate) { - LOG(2) << "Not standing for election again; already candidate"; - return false; - } + const MemberState currentState = getMemberState(); + if (originalState.recovering() && currentState.secondary()) { + // We just transitioned from RECOVERING to SECONDARY, this can only happen if we + // received a heartbeat with an auth error when previously all the heartbeats we'd + // received had auth errors. In this case, don't return makeElectAction() because + // that could cause the election to start before the ReplicationCoordinator has updated + // its notion of the member state to SECONDARY. Instead return noAction so that the + // ReplicationCooridinator knows to update its tracking of the member state off of the + // TopologyCoordinator, and leave starting the election until the next heartbeat comes + // back. + return HeartbeatResponseAction::makeNoAction(); + } - const UnelectableReasonMask unelectableReason = _getMyUnelectableReason(now, lastOpApplied); - if (NotCloseEnoughToLatestOptime & unelectableReason) { - LOG(2) << "Not standing for election because " << - _getUnelectableReasonString(unelectableReason) << "; my last optime is " << - lastOpApplied << " and the newest is " << _latestKnownOpTime(lastOpApplied); - return false; - } - if (unelectableReason) { - LOG(2) << "Not standing for election because " << - _getUnelectableReasonString(unelectableReason); - return false; - } - if (_electionSleepUntil > now) { - LOG(2) << "Not standing for election before " << - dateToISOStringLocal(_electionSleepUntil) << " because I stood too recently"; - return false; - } - // All checks passed, become a candidate and start election proceedings. - _role = Role::candidate; - return true; + // At this point, there is no primary anywhere. Check to see if we should become a + // candidate. 
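// Aside: the majority test above ultimately rests on _aMajoritySeemsToBeUp()
// (defined below): count our own votes plus those of every member whose
// heartbeats look up, then require vUp * 2 > total voting members. Invented
// example for a 5-member, all-voting set: a primary that can see itself and two
// others has vUp = 3, and 6 > 5, so it stays primary; seeing only itself and
// one other gives vUp = 2, 4 > 5 fails, and it relinquishes.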
+ if (!checkShouldStandForElection(now, lastOpApplied)) { + return HeartbeatResponseAction::makeNoAction(); } + return HeartbeatResponseAction::makeElectAction(); +} - bool TopologyCoordinatorImpl::_aMajoritySeemsToBeUp() const { - int vUp = 0; - for (std::vector<MemberHeartbeatData>::const_iterator it = _hbdata.begin(); - it != _hbdata.end(); - ++it) { - const int itIndex = indexOfIterator(_hbdata, it); - if (itIndex == _selfIndex || it->up()) { - vUp += _rsConfig.getMemberAt(itIndex).getNumVotes(); - } - } +bool TopologyCoordinatorImpl::checkShouldStandForElection(Date_t now, const OpTime& lastOpApplied) { + if (_currentPrimaryIndex != -1) { + return false; + } + invariant(_role != Role::leader); - return vUp * 2 > _rsConfig.getTotalVotingMembers(); + if (_role == Role::candidate) { + LOG(2) << "Not standing for election again; already candidate"; + return false; } - bool TopologyCoordinatorImpl::_isOpTimeCloseEnoughToLatestToElect( - const OpTime& otherOpTime, const OpTime& ourLastOpApplied) const { - const OpTime latestKnownOpTime = _latestKnownOpTime(ourLastOpApplied); - // Use addition instead of subtraction to avoid overflow. - return otherOpTime.getSecs() + 10 >= (latestKnownOpTime.getSecs()); + const UnelectableReasonMask unelectableReason = _getMyUnelectableReason(now, lastOpApplied); + if (NotCloseEnoughToLatestOptime & unelectableReason) { + LOG(2) << "Not standing for election because " + << _getUnelectableReasonString(unelectableReason) << "; my last optime is " + << lastOpApplied << " and the newest is " << _latestKnownOpTime(lastOpApplied); + return false; + } + if (unelectableReason) { + LOG(2) << "Not standing for election because " + << _getUnelectableReasonString(unelectableReason); + return false; + } + if (_electionSleepUntil > now) { + LOG(2) << "Not standing for election before " << dateToISOStringLocal(_electionSleepUntil) + << " because I stood too recently"; + return false; } + // All checks passed, become a candidate and start election proceedings. + _role = Role::candidate; + return true; +} - bool TopologyCoordinatorImpl::_iAmPrimary() const { - if (_role == Role::leader) { - invariant(_currentPrimaryIndex == _selfIndex); - return true; +bool TopologyCoordinatorImpl::_aMajoritySeemsToBeUp() const { + int vUp = 0; + for (std::vector<MemberHeartbeatData>::const_iterator it = _hbdata.begin(); it != _hbdata.end(); + ++it) { + const int itIndex = indexOfIterator(_hbdata, it); + if (itIndex == _selfIndex || it->up()) { + vUp += _rsConfig.getMemberAt(itIndex).getNumVotes(); } - return false; } - OpTime TopologyCoordinatorImpl::_latestKnownOpTime(OpTime ourLastOpApplied) const { - OpTime latest = ourLastOpApplied; + return vUp * 2 > _rsConfig.getTotalVotingMembers(); +} - for (std::vector<MemberHeartbeatData>::const_iterator it = _hbdata.begin(); - it != _hbdata.end(); - ++it) { +bool TopologyCoordinatorImpl::_isOpTimeCloseEnoughToLatestToElect( + const OpTime& otherOpTime, const OpTime& ourLastOpApplied) const { + const OpTime latestKnownOpTime = _latestKnownOpTime(ourLastOpApplied); + // Use addition instead of subtraction to avoid overflow. 
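// Aside: on the overflow note above — with unsigned second counts, the
// subtraction form "latest - other <= 10" would wrap whenever the other node is
// ahead of the freshest optime we know. Invented numbers: latest = 100s,
// other = 105s. Addition: 105 + 10 >= 100, correctly electable; subtraction:
// 100 - 105 wraps to roughly 2^32, "<= 10" fails, and a perfectly fresh node
// would wrongly be ruled out. Hence the form used below.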
+ return otherOpTime.getSecs() + 10 >= (latestKnownOpTime.getSecs()); +} - if (indexOfIterator(_hbdata, it) == _selfIndex) { - continue; - } - if (!it->up()) { - continue; - } +bool TopologyCoordinatorImpl::_iAmPrimary() const { + if (_role == Role::leader) { + invariant(_currentPrimaryIndex == _selfIndex); + return true; + } + return false; +} - OpTime optime = it->getOpTime(); +OpTime TopologyCoordinatorImpl::_latestKnownOpTime(OpTime ourLastOpApplied) const { + OpTime latest = ourLastOpApplied; - if (optime > latest) { - latest = optime; - } + for (std::vector<MemberHeartbeatData>::const_iterator it = _hbdata.begin(); it != _hbdata.end(); + ++it) { + if (indexOfIterator(_hbdata, it) == _selfIndex) { + continue; } + if (!it->up()) { + continue; + } + + OpTime optime = it->getOpTime(); - return latest; + if (optime > latest) { + latest = optime; + } } - bool TopologyCoordinatorImpl::_isMemberHigherPriority(int memberOneIndex, - int memberTwoIndex) const { - if (memberOneIndex == -1) - return false; + return latest; +} - if (memberTwoIndex == -1) - return true; +bool TopologyCoordinatorImpl::_isMemberHigherPriority(int memberOneIndex, + int memberTwoIndex) const { + if (memberOneIndex == -1) + return false; - return _rsConfig.getMemberAt(memberOneIndex).getPriority() > - _rsConfig.getMemberAt(memberTwoIndex).getPriority(); - } + if (memberTwoIndex == -1) + return true; - int TopologyCoordinatorImpl::_getHighestPriorityElectableIndex( - Date_t now, OpTime lastOpApplied) const { - int maxIndex = -1; - for (int currentIndex = 0; currentIndex < _rsConfig.getNumMembers(); currentIndex++) { - UnelectableReasonMask reason = currentIndex == _selfIndex ? - _getMyUnelectableReason(now, lastOpApplied) : - _getUnelectableReason(currentIndex, lastOpApplied); - if (None == reason && _isMemberHigherPriority(currentIndex, maxIndex)) { - maxIndex = currentIndex; - } - } + return _rsConfig.getMemberAt(memberOneIndex).getPriority() > + _rsConfig.getMemberAt(memberTwoIndex).getPriority(); +} - return maxIndex; +int TopologyCoordinatorImpl::_getHighestPriorityElectableIndex(Date_t now, + OpTime lastOpApplied) const { + int maxIndex = -1; + for (int currentIndex = 0; currentIndex < _rsConfig.getNumMembers(); currentIndex++) { + UnelectableReasonMask reason = currentIndex == _selfIndex + ? 
_getMyUnelectableReason(now, lastOpApplied) + : _getUnelectableReason(currentIndex, lastOpApplied); + if (None == reason && _isMemberHigherPriority(currentIndex, maxIndex)) { + maxIndex = currentIndex; + } } - void TopologyCoordinatorImpl::changeMemberState_forTest(const MemberState& newMemberState, - OpTime electionTime) { - invariant(_selfIndex != -1); - if (newMemberState == getMemberState()) - return; - switch(newMemberState.s) { + return maxIndex; +} + +void TopologyCoordinatorImpl::changeMemberState_forTest(const MemberState& newMemberState, + OpTime electionTime) { + invariant(_selfIndex != -1); + if (newMemberState == getMemberState()) + return; + switch (newMemberState.s) { case MemberState::RS_PRIMARY: _role = Role::candidate; processWinElection(OID(), electionTime); @@ -1263,708 +1199,675 @@ namespace { } break; case MemberState::RS_STARTUP: - updateConfig( - ReplicaSetConfig(), - -1, - Date_t(), - OpTime()); + updateConfig(ReplicaSetConfig(), -1, Date_t(), OpTime()); break; default: severe() << "Cannot switch to state " << newMemberState; invariant(false); - } - if (getMemberState() != newMemberState.s) { - severe() << "Expected to enter state " << newMemberState << " but am now in " << - getMemberState(); - invariant(false); - } - log() << "replSet " << newMemberState; - } - - void TopologyCoordinatorImpl::_setCurrentPrimaryForTest(int primaryIndex) { - if (primaryIndex == _selfIndex) { - changeMemberState_forTest(MemberState::RS_PRIMARY); - } - else { - if (_iAmPrimary()) { - changeMemberState_forTest(MemberState::RS_SECONDARY); - } - if (primaryIndex != -1) { - ReplSetHeartbeatResponse hbResponse; - hbResponse.setState(MemberState::RS_PRIMARY); - hbResponse.setElectionTime(OpTime()); - hbResponse.setOpTime(_hbdata[primaryIndex].getOpTime()); - hbResponse.setSyncingTo(""); - hbResponse.setHbMsg(""); - _hbdata[primaryIndex].setUpValues( - _hbdata[primaryIndex].getLastHeartbeat(), - _rsConfig.getMemberAt(primaryIndex).getHostAndPort(), - hbResponse); - } - _currentPrimaryIndex = primaryIndex; - } } - - const MemberConfig* TopologyCoordinatorImpl::_currentPrimaryMember() const { - if (_currentPrimaryIndex == -1) - return NULL; - - return &(_rsConfig.getMemberAt(_currentPrimaryIndex)); + if (getMemberState() != newMemberState.s) { + severe() << "Expected to enter state " << newMemberState << " but am now in " + << getMemberState(); + invariant(false); } + log() << "replSet " << newMemberState; +} - void TopologyCoordinatorImpl::prepareStatusResponse( - const ReplicationExecutor::CallbackData& data, - Date_t now, - unsigned selfUptime, - const OpTime& lastOpApplied, - BSONObjBuilder* response, - Status* result) { - if (data.status == ErrorCodes::CallbackCanceled) { - *result = Status(ErrorCodes::ShutdownInProgress, "replication system is shutting down"); - return; - } - - // output for each member - vector<BSONObj> membersOut; - const MemberState myState = getMemberState(); +void TopologyCoordinatorImpl::_setCurrentPrimaryForTest(int primaryIndex) { + if (primaryIndex == _selfIndex) { + changeMemberState_forTest(MemberState::RS_PRIMARY); + } else { + if (_iAmPrimary()) { + changeMemberState_forTest(MemberState::RS_SECONDARY); + } + if (primaryIndex != -1) { + ReplSetHeartbeatResponse hbResponse; + hbResponse.setState(MemberState::RS_PRIMARY); + hbResponse.setElectionTime(OpTime()); + hbResponse.setOpTime(_hbdata[primaryIndex].getOpTime()); + hbResponse.setSyncingTo(""); + hbResponse.setHbMsg(""); + _hbdata[primaryIndex].setUpValues(_hbdata[primaryIndex].getLastHeartbeat(), + 
_rsConfig.getMemberAt(primaryIndex).getHostAndPort(), + hbResponse); + } + _currentPrimaryIndex = primaryIndex; + } +} + +const MemberConfig* TopologyCoordinatorImpl::_currentPrimaryMember() const { + if (_currentPrimaryIndex == -1) + return NULL; + + return &(_rsConfig.getMemberAt(_currentPrimaryIndex)); +} + +void TopologyCoordinatorImpl::prepareStatusResponse(const ReplicationExecutor::CallbackData& data, + Date_t now, + unsigned selfUptime, + const OpTime& lastOpApplied, + BSONObjBuilder* response, + Status* result) { + if (data.status == ErrorCodes::CallbackCanceled) { + *result = Status(ErrorCodes::ShutdownInProgress, "replication system is shutting down"); + return; + } + + // output for each member + vector<BSONObj> membersOut; + const MemberState myState = getMemberState(); + + if (_selfIndex == -1) { + // We're REMOVED or have an invalid config + response->append("state", static_cast<int>(myState.s)); + response->append("stateStr", myState.toString()); + response->append("uptime", selfUptime); + response->append("optime", lastOpApplied); + response->appendDate("optimeDate", Date_t(lastOpApplied.getSecs() * 1000ULL)); + if (_maintenanceModeCalls) { + response->append("maintenanceMode", _maintenanceModeCalls); + } + std::string s = _getHbmsg(now); + if (!s.empty()) + response->append("infoMessage", s); + *result = Status(ErrorCodes::InvalidReplicaSetConfig, + "Our replica set config is invalid or we are not a member of it"); + return; + } + + for (std::vector<MemberHeartbeatData>::const_iterator it = _hbdata.begin(); it != _hbdata.end(); + ++it) { + const int itIndex = indexOfIterator(_hbdata, it); + if (itIndex == _selfIndex) { + // add self + BSONObjBuilder bb; + bb.append("_id", _selfConfig().getId()); + bb.append("name", _selfConfig().getHostAndPort().toString()); + bb.append("health", 1.0); + bb.append("state", static_cast<int>(myState.s)); + bb.append("stateStr", myState.toString()); + bb.append("uptime", selfUptime); + if (!_selfConfig().isArbiter()) { + bb.append("optime", lastOpApplied); + bb.appendDate("optimeDate", Date_t(lastOpApplied.getSecs() * 1000ULL)); + } + + if (!_syncSource.empty() && !_iAmPrimary()) { + bb.append("syncingTo", _syncSource.toString()); + } - if (_selfIndex == -1) { - // We're REMOVED or have an invalid config - response->append("state", static_cast<int>(myState.s)); - response->append("stateStr", myState.toString()); - response->append("uptime", selfUptime); - response->append("optime", lastOpApplied); - response->appendDate("optimeDate", Date_t(lastOpApplied.getSecs() * 1000ULL)); if (_maintenanceModeCalls) { - response->append("maintenanceMode", _maintenanceModeCalls); + bb.append("maintenanceMode", _maintenanceModeCalls); } - std::string s = _getHbmsg(now); - if( !s.empty() ) - response->append("infoMessage", s); - *result = Status(ErrorCodes::InvalidReplicaSetConfig, - "Our replica set config is invalid or we are not a member of it"); - return; - } - for (std::vector<MemberHeartbeatData>::const_iterator it = _hbdata.begin(); - it != _hbdata.end(); - ++it) { - const int itIndex = indexOfIterator(_hbdata, it); - if (itIndex == _selfIndex) { - // add self - BSONObjBuilder bb; - bb.append("_id", _selfConfig().getId()); - bb.append("name", _selfConfig().getHostAndPort().toString()); - bb.append("health", 1.0); - bb.append("state", static_cast<int>(myState.s)); - bb.append("stateStr", myState.toString()); - bb.append("uptime", selfUptime); - if (!_selfConfig().isArbiter()) { - bb.append("optime", lastOpApplied); - bb.appendDate("optimeDate", 
Date_t(lastOpApplied.getSecs() * 1000ULL)); - } - - if (!_syncSource.empty() && !_iAmPrimary()) { - bb.append("syncingTo", _syncSource.toString()); - } - - if (_maintenanceModeCalls) { - bb.append("maintenanceMode", _maintenanceModeCalls); - } - - std::string s = _getHbmsg(now); - if( !s.empty() ) - bb.append("infoMessage", s); - - if (myState.primary()) { - bb.append("electionTime", _electionTime); - bb.appendDate("electionDate", Date_t(_electionTime.getSecs() * 1000ULL)); - } - bb.appendIntOrLL("configVersion", _rsConfig.getConfigVersion()); - bb.append("self", true); - membersOut.push_back(bb.obj()); + std::string s = _getHbmsg(now); + if (!s.empty()) + bb.append("infoMessage", s); + + if (myState.primary()) { + bb.append("electionTime", _electionTime); + bb.appendDate("electionDate", Date_t(_electionTime.getSecs() * 1000ULL)); + } + bb.appendIntOrLL("configVersion", _rsConfig.getConfigVersion()); + bb.append("self", true); + membersOut.push_back(bb.obj()); + } else { + // add non-self member + const MemberConfig& itConfig = _rsConfig.getMemberAt(itIndex); + BSONObjBuilder bb; + bb.append("_id", itConfig.getId()); + bb.append("name", itConfig.getHostAndPort().toString()); + double h = it->getHealth(); + bb.append("health", h); + const MemberState state = it->getState(); + bb.append("state", static_cast<int>(state.s)); + if (h == 0) { + // if we can't connect the state info is from the past + // and could be confusing to show + bb.append("stateStr", "(not reachable/healthy)"); + } else { + bb.append("stateStr", it->getState().toString()); + } + + const unsigned int uptime = static_cast<unsigned int>( + (it->getUpSince() ? (now - it->getUpSince()) / 1000 /* convert millis to secs */ + : 0)); + bb.append("uptime", uptime); + if (!itConfig.isArbiter()) { + bb.append("optime", it->getOpTime()); + bb.appendDate("optimeDate", Date_t(it->getOpTime().getSecs() * 1000ULL)); + } + bb.appendDate("lastHeartbeat", it->getLastHeartbeat()); + bb.appendDate("lastHeartbeatRecv", it->getLastHeartbeatRecv()); + const int ping = _getPing(itConfig.getHostAndPort()); + if (ping != -1) { + bb.append("pingMs", ping); + std::string s = it->getLastHeartbeatMsg(); + if (!s.empty()) + bb.append("lastHeartbeatMessage", s); + } + if (it->hasAuthIssue()) { + bb.append("authenticated", false); + } + const std::string syncSource = it->getSyncSource(); + if (!syncSource.empty() && !state.primary()) { + bb.append("syncingTo", syncSource); } - else { - // add non-self member - const MemberConfig& itConfig = _rsConfig.getMemberAt(itIndex); - BSONObjBuilder bb; - bb.append("_id", itConfig.getId()); - bb.append("name", itConfig.getHostAndPort().toString()); - double h = it->getHealth(); - bb.append("health", h); - const MemberState state = it->getState(); - bb.append("state", static_cast<int>(state.s)); - if( h == 0 ) { - // if we can't connect the state info is from the past - // and could be confusing to show - bb.append("stateStr", "(not reachable/healthy)"); - } - else { - bb.append("stateStr", it->getState().toString()); - } - - const unsigned int uptime = static_cast<unsigned int> ((it->getUpSince() ? 
- (now - it->getUpSince()) / 1000 /* convert millis to secs */ : 0)); - bb.append("uptime", uptime); - if (!itConfig.isArbiter()) { - bb.append("optime", it->getOpTime()); - bb.appendDate("optimeDate", Date_t(it->getOpTime().getSecs() * 1000ULL)); - } - bb.appendDate("lastHeartbeat", it->getLastHeartbeat()); - bb.appendDate("lastHeartbeatRecv", it->getLastHeartbeatRecv()); - const int ping = _getPing(itConfig.getHostAndPort()); - if (ping != -1) { - bb.append("pingMs", ping); - std::string s = it->getLastHeartbeatMsg(); - if( !s.empty() ) - bb.append("lastHeartbeatMessage", s); - } - if (it->hasAuthIssue()) { - bb.append("authenticated", false); - } - const std::string syncSource = it->getSyncSource(); - if (!syncSource.empty() && !state.primary()) { - bb.append("syncingTo", syncSource); - } - if (state == MemberState::RS_PRIMARY) { - bb.append("electionTime", it->getElectionTime()); - bb.appendDate("electionDate", - Date_t(it->getElectionTime().getSecs() * 1000ULL)); - } - bb.appendIntOrLL("configVersion", it->getConfigVersion()); - membersOut.push_back(bb.obj()); + if (state == MemberState::RS_PRIMARY) { + bb.append("electionTime", it->getElectionTime()); + bb.appendDate("electionDate", Date_t(it->getElectionTime().getSecs() * 1000ULL)); } + bb.appendIntOrLL("configVersion", it->getConfigVersion()); + membersOut.push_back(bb.obj()); } + } - // sort members bson - sort(membersOut.begin(), membersOut.end()); + // sort members bson + sort(membersOut.begin(), membersOut.end()); - response->append("set", - _rsConfig.isInitialized() ? _rsConfig.getReplSetName() : ""); - response->append("date", now); - response->append("myState", myState.s); + response->append("set", _rsConfig.isInitialized() ? _rsConfig.getReplSetName() : ""); + response->append("date", now); + response->append("myState", myState.s); - // Add sync source info - if (!_syncSource.empty() && !myState.primary() && !myState.removed()) { - response->append("syncingTo", _syncSource.toString()); - } - - response->append("members", membersOut); - *result = Status::OK(); + // Add sync source info + if (!_syncSource.empty() && !myState.primary() && !myState.removed()) { + response->append("syncingTo", _syncSource.toString()); } - void TopologyCoordinatorImpl::fillIsMasterForReplSet(IsMasterResponse* response) { - - const MemberState myState = getMemberState(); - if (!_rsConfig.isInitialized() || myState.removed()) { - response->markAsNoConfig(); - return; - } + response->append("members", membersOut); + *result = Status::OK(); +} - response->setReplSetName(_rsConfig.getReplSetName()); - response->setReplSetVersion(_rsConfig.getConfigVersion()); - response->setIsMaster(myState.primary()); - response->setIsSecondary(myState.secondary()); +void TopologyCoordinatorImpl::fillIsMasterForReplSet(IsMasterResponse* response) { + const MemberState myState = getMemberState(); + if (!_rsConfig.isInitialized() || myState.removed()) { + response->markAsNoConfig(); + return; + } - { - for (ReplicaSetConfig::MemberIterator it = _rsConfig.membersBegin(); - it != _rsConfig.membersEnd(); ++it) { - if (it->isHidden() || it->getSlaveDelay().total_seconds() > 0) { - continue; - } + response->setReplSetName(_rsConfig.getReplSetName()); + response->setReplSetVersion(_rsConfig.getConfigVersion()); + response->setIsMaster(myState.primary()); + response->setIsSecondary(myState.secondary()); - if (it->isElectable()) { - response->addHost(it->getHostAndPort()); - } - else if (it->isArbiter()) { - response->addArbiter(it->getHostAndPort()); - } - else { - 
response->addPassive(it->getHostAndPort());
-                }
            }
-        }
-
-        const MemberConfig* curPrimary = _currentPrimaryMember();
-        if (curPrimary) {
-            response->setPrimary(curPrimary->getHostAndPort());
-        }
-
-        const MemberConfig& selfConfig = _rsConfig.getMemberAt(_selfIndex);
-        if (selfConfig.isArbiter()) {
-            response->setIsArbiterOnly(true);
-        }
-        else if (selfConfig.getPriority() == 0) {
-            response->setIsPassive(true);
-        }
-        if (selfConfig.getSlaveDelay().total_seconds()) {
-            response->setSlaveDelay(selfConfig.getSlaveDelay());
-        }
-        if (selfConfig.isHidden()) {
-            response->setIsHidden(true);
-        }
-        if (!selfConfig.shouldBuildIndexes()) {
-            response->setShouldBuildIndexes(false);
-        }
-        const ReplicaSetTagConfig tagConfig = _rsConfig.getTagConfig();
-        if (selfConfig.hasTags(tagConfig)) {
-            for (MemberConfig::TagIterator tag = selfConfig.tagsBegin();
-                 tag != selfConfig.tagsEnd(); ++tag) {
-                std::string tagKey = tagConfig.getTagKey(*tag);
-                if (tagKey[0] == '$') {
-                    // Filter out internal tags
-                    continue;
-                }
-                response->addTag(tagKey, tagConfig.getTagValue(*tag));
+            if (it->isElectable()) {
+                response->addHost(it->getHostAndPort());
+            } else if (it->isArbiter()) {
+                response->addArbiter(it->getHostAndPort());
+            } else {
+                response->addPassive(it->getHostAndPort());
            }
        }
-        response->setMe(selfConfig.getHostAndPort());
-        if (_iAmPrimary()) {
-            response->setElectionId(_electionId);
-        }
    }

-    void TopologyCoordinatorImpl::prepareFreezeResponse(
-            Date_t now, int secs, BSONObjBuilder* response) {
-
-        if (secs == 0) {
-            _stepDownUntil = now;
-            log() << "replSet info 'unfreezing'";
-            response->append("info", "unfreezing");
-
-            if (_followerMode == MemberState::RS_SECONDARY &&
-                    _rsConfig.getNumMembers() == 1 &&
-                    _selfIndex == 0 &&
-                    _rsConfig.getMemberAt(_selfIndex).isElectable()) {
-                // If we are a one-node replica set, we're the one member,
-                // we're electable, and we are currently in followerMode SECONDARY,
-                // we must transition to candidate now that our stepdown period
-                // is no longer active, in lieu of heartbeats. 
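// Aside: this is the server-side handler behind the replSetFreeze command.
// Hypothetical shell usage, durations invented:
//
//     db.adminCommand({replSetFreeze: 60})   // stay out of elections ~60 seconds
//     db.adminCommand({replSetFreeze: 0})    // "unfreeze": clear the stepdown period
//
// As the secs == 0 branch shows, unfreezing a one-node set also re-enters
// candidacy directly, since there are no peers whose heartbeats would trigger it.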
- _role = Role::candidate;
- }
- }
- else {
- if ( secs == 1 )
- response->append("warning", "you really want to freeze for only 1 second?");

+ const MemberConfig* curPrimary = _currentPrimaryMember();
+ if (curPrimary) {
+ response->setPrimary(curPrimary->getHostAndPort());
+ }

- if (!_iAmPrimary()) {
- _stepDownUntil = std::max(_stepDownUntil, Date_t(now + (secs * 1000)));
- log() << "replSet info 'freezing' for " << secs << " seconds";
- }
- else {
- log() << "replSet info received freeze command but we are primary";
+ const MemberConfig& selfConfig = _rsConfig.getMemberAt(_selfIndex);
+ if (selfConfig.isArbiter()) {
+ response->setIsArbiterOnly(true);
+ } else if (selfConfig.getPriority() == 0) {
+ response->setIsPassive(true);
+ }
+ if (selfConfig.getSlaveDelay().total_seconds()) {
+ response->setSlaveDelay(selfConfig.getSlaveDelay());
+ }
+ if (selfConfig.isHidden()) {
+ response->setIsHidden(true);
+ }
+ if (!selfConfig.shouldBuildIndexes()) {
+ response->setShouldBuildIndexes(false);
+ }
+ const ReplicaSetTagConfig tagConfig = _rsConfig.getTagConfig();
+ if (selfConfig.hasTags(tagConfig)) {
+ for (MemberConfig::TagIterator tag = selfConfig.tagsBegin(); tag != selfConfig.tagsEnd();
+ ++tag) {
+ std::string tagKey = tagConfig.getTagKey(*tag);
+ if (tagKey[0] == '$') {
+ // Filter out internal tags
+ continue;
}
+ response->addTag(tagKey, tagConfig.getTagValue(*tag));
}
}
+ response->setMe(selfConfig.getHostAndPort());
+ if (_iAmPrimary()) {
+ response->setElectionId(_electionId);
+ }
+}

- bool TopologyCoordinatorImpl::becomeCandidateIfStepdownPeriodOverAndSingleNodeSet(Date_t now) {
- if (_stepDownUntil > now) {
- return false;
- }
+void TopologyCoordinatorImpl::prepareFreezeResponse(Date_t now,
+ int secs,
+ BSONObjBuilder* response) {
+ if (secs == 0) {
+ _stepDownUntil = now;
+ log() << "replSet info 'unfreezing'";
+ response->append("info", "unfreezing");

- if (_followerMode == MemberState::RS_SECONDARY &&
- _rsConfig.getNumMembers() == 1 &&
- _selfIndex == 0 &&
- _rsConfig.getMemberAt(_selfIndex).isElectable()) {
- // If the new config describes a one-node replica set, we're the one member,
+ if (_followerMode == MemberState::RS_SECONDARY && _rsConfig.getNumMembers() == 1 &&
+ _selfIndex == 0 && _rsConfig.getMemberAt(_selfIndex).isElectable()) {
+ // If we are a one-node replica set, we're the one member,
// we're electable, and we are currently in followerMode SECONDARY,
- // we must transition to candidate, in leiu of heartbeats.
+ // we must transition to candidate now that our stepdown period
+ // is no longer active, in lieu of heartbeats.
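            // (Editorial aside, not part of this commit: this single-node auto-candidacy
            // guard recurs throughout this file; isolated, using the surrounding names, it is
            //
            //     _followerMode == MemberState::RS_SECONDARY && _rsConfig.getNumMembers() == 1 &&
            //     _selfIndex == 0 && _rsConfig.getMemberAt(_selfIndex).isElectable()
            //
            // and when it holds, the role flips to candidate, as on the next line.)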
_role = Role::candidate;
- return true;
}
- return false;
- }
+ } else {
+ if (secs == 1)
+ response->append("warning", "you really want to freeze for only 1 second?");

- void TopologyCoordinatorImpl::setElectionSleepUntil(Date_t newTime) {
- if (_electionSleepUntil < newTime) {
- _electionSleepUntil = newTime;
+ if (!_iAmPrimary()) {
+ _stepDownUntil = std::max(_stepDownUntil, Date_t(now + (secs * 1000)));
+ log() << "replSet info 'freezing' for " << secs << " seconds";
+ } else {
+ log() << "replSet info received freeze command but we are primary";
}
}
+}

- OpTime TopologyCoordinatorImpl::getElectionTime() const {
- return _electionTime;
- }
-
- OID TopologyCoordinatorImpl::getElectionId() const {
- return _electionId;
- }
-
- int TopologyCoordinatorImpl::getCurrentPrimaryIndex() const {
- return _currentPrimaryIndex;
+bool TopologyCoordinatorImpl::becomeCandidateIfStepdownPeriodOverAndSingleNodeSet(Date_t now) {
+ if (_stepDownUntil > now) {
+ return false;
}

- Date_t TopologyCoordinatorImpl::getStepDownTime() const {
- return _stepDownUntil;
+ if (_followerMode == MemberState::RS_SECONDARY && _rsConfig.getNumMembers() == 1 &&
+ _selfIndex == 0 && _rsConfig.getMemberAt(_selfIndex).isElectable()) {
+ // If the new config describes a one-node replica set, we're the one member,
+ // we're electable, and we are currently in followerMode SECONDARY,
+ // we must transition to candidate, in lieu of heartbeats.
+ _role = Role::candidate;
+ return true;
}
-
- void TopologyCoordinatorImpl::_updateHeartbeatDataForReconfig(const ReplicaSetConfig& newConfig,
- int selfIndex,
- Date_t now) {
- std::vector<MemberHeartbeatData> oldHeartbeats;
- _hbdata.swap(oldHeartbeats);
-
- int index = 0;
- for (ReplicaSetConfig::MemberIterator it = newConfig.membersBegin();
- it != newConfig.membersEnd();
- ++it, ++index) {
- const MemberConfig& newMemberConfig = *it;
- // TODO: C++11: use emplace_back()
- if (index == selfIndex) {
- // Insert placeholder for ourself, though we will never consult it.
- _hbdata.push_back(MemberHeartbeatData());
- }
- else {
- MemberHeartbeatData newHeartbeatData;
- for (int oldIndex = 0; oldIndex < _rsConfig.getNumMembers(); ++oldIndex) {
- const MemberConfig& oldMemberConfig = _rsConfig.getMemberAt(oldIndex);
- if (oldMemberConfig.getId() == newMemberConfig.getId() &&
- oldMemberConfig.getHostAndPort() == newMemberConfig.getHostAndPort()) {
- // This member existed in the old config with the same member ID and
- // HostAndPort, so copy its heartbeat data over.
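                    // (Editorial sketch, not part of this commit: the carry-over rule being
                    // described is that heartbeat state survives a reconfig only when both
                    // identifiers are unchanged, i.e.
                    //
                    //     oldMemberConfig.getId() == newMemberConfig.getId() &&
                    //     oldMemberConfig.getHostAndPort() == newMemberConfig.getHostAndPort()
                    //
                    // A match copies the old entry across, as on the next line; a member whose
                    // id or address changed starts over with fresh heartbeat data.)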
- newHeartbeatData = oldHeartbeats[oldIndex]; - break; - } + return false; +} + +void TopologyCoordinatorImpl::setElectionSleepUntil(Date_t newTime) { + if (_electionSleepUntil < newTime) { + _electionSleepUntil = newTime; + } +} + +OpTime TopologyCoordinatorImpl::getElectionTime() const { + return _electionTime; +} + +OID TopologyCoordinatorImpl::getElectionId() const { + return _electionId; +} + +int TopologyCoordinatorImpl::getCurrentPrimaryIndex() const { + return _currentPrimaryIndex; +} + +Date_t TopologyCoordinatorImpl::getStepDownTime() const { + return _stepDownUntil; +} + +void TopologyCoordinatorImpl::_updateHeartbeatDataForReconfig(const ReplicaSetConfig& newConfig, + int selfIndex, + Date_t now) { + std::vector<MemberHeartbeatData> oldHeartbeats; + _hbdata.swap(oldHeartbeats); + + int index = 0; + for (ReplicaSetConfig::MemberIterator it = newConfig.membersBegin(); + it != newConfig.membersEnd(); + ++it, ++index) { + const MemberConfig& newMemberConfig = *it; + // TODO: C++11: use emplace_back() + if (index == selfIndex) { + // Insert placeholder for ourself, though we will never consult it. + _hbdata.push_back(MemberHeartbeatData()); + } else { + MemberHeartbeatData newHeartbeatData; + for (int oldIndex = 0; oldIndex < _rsConfig.getNumMembers(); ++oldIndex) { + const MemberConfig& oldMemberConfig = _rsConfig.getMemberAt(oldIndex); + if (oldMemberConfig.getId() == newMemberConfig.getId() && + oldMemberConfig.getHostAndPort() == newMemberConfig.getHostAndPort()) { + // This member existed in the old config with the same member ID and + // HostAndPort, so copy its heartbeat data over. + newHeartbeatData = oldHeartbeats[oldIndex]; + break; } - _hbdata.push_back(newHeartbeatData); } + _hbdata.push_back(newHeartbeatData); } } +} - // This function installs a new config object and recreates MemberHeartbeatData objects - // that reflect the new config. - void TopologyCoordinatorImpl::updateConfig(const ReplicaSetConfig& newConfig, - int selfIndex, - Date_t now, - OpTime lastOpApplied) { - invariant(_role != Role::candidate); - invariant(selfIndex < newConfig.getNumMembers()); +// This function installs a new config object and recreates MemberHeartbeatData objects +// that reflect the new config. +void TopologyCoordinatorImpl::updateConfig(const ReplicaSetConfig& newConfig, + int selfIndex, + Date_t now, + OpTime lastOpApplied) { + invariant(_role != Role::candidate); + invariant(selfIndex < newConfig.getNumMembers()); - _updateHeartbeatDataForReconfig(newConfig, selfIndex, now); - _rsConfig = newConfig; - _selfIndex = selfIndex; - _forceSyncSourceIndex = -1; + _updateHeartbeatDataForReconfig(newConfig, selfIndex, now); + _rsConfig = newConfig; + _selfIndex = selfIndex; + _forceSyncSourceIndex = -1; - if (_role == Role::leader) { - if (_selfIndex == -1) { - log() << "Could not remain primary because no longer a member of the replica set"; - } - else if (!_selfConfig().isElectable()) { - log() <<" Could not remain primary because no longer electable"; - } - else { - // Don't stepdown if you don't have to. - _currentPrimaryIndex = _selfIndex; - return; - } - _role = Role::follower; + if (_role == Role::leader) { + if (_selfIndex == -1) { + log() << "Could not remain primary because no longer a member of the replica set"; + } else if (!_selfConfig().isElectable()) { + log() << " Could not remain primary because no longer electable"; + } else { + // Don't stepdown if you don't have to. 
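        // (Editorial aside, not part of this commit: a primary survives a reconfig only if
        // it is still a member and still electable, i.e.
        //
        //     bool canRemainPrimary = _selfIndex != -1 && _selfConfig().isElectable();
        //
        // Both logging branches above fall through to _role = Role::follower; the surviving
        // primary simply keeps the primary index pointed at itself, as on the next line.)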
+ _currentPrimaryIndex = _selfIndex;
+ return;
}
+ _role = Role::follower;
+ }

- // By this point we know we are in Role::follower
- _currentPrimaryIndex = -1; // force secondaries to re-detect who the primary is
- _stepDownPending = false;
-
- if (_followerMode == MemberState::RS_SECONDARY &&
- _rsConfig.getNumMembers() == 1 &&
- _selfIndex == 0 &&
- _rsConfig.getMemberAt(_selfIndex).isElectable()) {
- // If the new config describes a one-node replica set, we're the one member,
- // we're electable, and we are currently in followerMode SECONDARY,
- // we must transition to candidate, in leiu of heartbeats.
- _role = Role::candidate;
- }
+ // By this point we know we are in Role::follower
+ _currentPrimaryIndex = -1; // force secondaries to re-detect who the primary is
+ _stepDownPending = false;
+
+ if (_followerMode == MemberState::RS_SECONDARY && _rsConfig.getNumMembers() == 1 &&
+ _selfIndex == 0 && _rsConfig.getMemberAt(_selfIndex).isElectable()) {
+ // If the new config describes a one-node replica set, we're the one member,
+ // we're electable, and we are currently in followerMode SECONDARY,
+ // we must transition to candidate, in lieu of heartbeats.
+ _role = Role::candidate;
}
- std::string TopologyCoordinatorImpl::_getHbmsg(Date_t now) const {
- // ignore messages over 2 minutes old
- if ((now - _hbmsgTime) > 120) {
- return "";
- }
- return _hbmsg;
+}
+std::string TopologyCoordinatorImpl::_getHbmsg(Date_t now) const {
+ // ignore messages over 2 minutes old
+ if ((now - _hbmsgTime) > 120) {
+ return "";
}
+ return _hbmsg;
+}

- void TopologyCoordinatorImpl::setMyHeartbeatMessage(const Date_t now,
- const std::string& message) {
- _hbmsgTime = now;
- _hbmsg = message;
- }
+void TopologyCoordinatorImpl::setMyHeartbeatMessage(const Date_t now, const std::string& message) {
+ _hbmsgTime = now;
+ _hbmsg = message;
+}
+
+const MemberConfig& TopologyCoordinatorImpl::_selfConfig() const {
+ return _rsConfig.getMemberAt(_selfIndex);
+}

- const MemberConfig& TopologyCoordinatorImpl::_selfConfig() const {
- return _rsConfig.getMemberAt(_selfIndex);
+TopologyCoordinatorImpl::UnelectableReasonMask TopologyCoordinatorImpl::_getUnelectableReason(
+ int index, const OpTime& lastOpApplied) const {
+ invariant(index != _selfIndex);
+ const MemberConfig& memberConfig = _rsConfig.getMemberAt(index);
+ const MemberHeartbeatData& hbData = _hbdata[index];
+ UnelectableReasonMask result = None;
+ if (memberConfig.isArbiter()) {
+ result |= ArbiterIAm;
}
+ if (memberConfig.getPriority() <= 0) {
+ result |= NoPriority;
+ }
+ if (hbData.getState() != MemberState::RS_SECONDARY) {
+ result |= NotSecondary;
+ }
+ if (!_isOpTimeCloseEnoughToLatestToElect(hbData.getOpTime(), lastOpApplied)) {
+ result |= NotCloseEnoughToLatestOptime;
+ }
+ if (hbData.up() && hbData.isUnelectable()) {
+ result |= RefusesToStand;
+ }
+ invariant(result || memberConfig.isElectable());
+ return result;
+}

- TopologyCoordinatorImpl::UnelectableReasonMask TopologyCoordinatorImpl::_getUnelectableReason(
- int index,
- const OpTime& lastOpApplied) const {
- invariant(index != _selfIndex);
- const MemberConfig& memberConfig = _rsConfig.getMemberAt(index);
- const MemberHeartbeatData& hbData = _hbdata[index];
- UnelectableReasonMask result = None;
- if (memberConfig.isArbiter()) {
- result |= ArbiterIAm;
- }
- if (memberConfig.getPriority() <= 0) {
- result |= NoPriority;
- }
- if (hbData.getState() != MemberState::RS_SECONDARY) {
- result |= NotSecondary;
- }
- if (!_isOpTimeCloseEnoughToLatestToElect(hbData.getOpTime(),
lastOpApplied)) { - result |= NotCloseEnoughToLatestOptime; - } - if (hbData.up() && hbData.isUnelectable()) { - result |= RefusesToStand; - } - invariant(result || memberConfig.isElectable()); +TopologyCoordinatorImpl::UnelectableReasonMask TopologyCoordinatorImpl::_getMyUnelectableReason( + const Date_t now, const OpTime lastApplied) const { + UnelectableReasonMask result = None; + if (lastApplied.isNull()) { + result |= NoData; + } + if (!_aMajoritySeemsToBeUp()) { + result |= CannotSeeMajority; + } + if (_selfIndex == -1) { + result |= NotInitialized; return result; } + if (_selfConfig().isArbiter()) { + result |= ArbiterIAm; + } + if (_selfConfig().getPriority() <= 0) { + result |= NoPriority; + } + if (_stepDownUntil > now) { + result |= StepDownPeriodActive; + } + if (_lastVote.whoId != -1 && _lastVote.whoId != _rsConfig.getMemberAt(_selfIndex).getId() && + _lastVote.when.millis + LastVote::leaseTime.total_milliseconds() >= now.millis) { + result |= VotedTooRecently; + } - TopologyCoordinatorImpl::UnelectableReasonMask TopologyCoordinatorImpl::_getMyUnelectableReason( - const Date_t now, - const OpTime lastApplied) const { - - UnelectableReasonMask result = None; - if (lastApplied.isNull()) { - result |= NoData; - } - if (!_aMajoritySeemsToBeUp()) { - result |= CannotSeeMajority; - } - if (_selfIndex == -1) { - result |= NotInitialized; - return result; - } - if (_selfConfig().isArbiter()) { - result |= ArbiterIAm; - } - if (_selfConfig().getPriority() <= 0) { - result |= NoPriority; - } - if (_stepDownUntil > now) { - result |= StepDownPeriodActive; - } - if (_lastVote.whoId != -1 && - _lastVote.whoId !=_rsConfig.getMemberAt(_selfIndex).getId() && - _lastVote.when.millis + LastVote::leaseTime.total_milliseconds() >= now.millis) { - result |= VotedTooRecently; - } - - // Cannot be electable unless secondary or already primary - if (!getMemberState().secondary() && !_iAmPrimary()) { - result |= NotSecondary; - } - if (!_isOpTimeCloseEnoughToLatestToElect(lastApplied, lastApplied)) { - result |= NotCloseEnoughToLatestOptime; - } - return result; + // Cannot be electable unless secondary or already primary + if (!getMemberState().secondary() && !_iAmPrimary()) { + result |= NotSecondary; } + if (!_isOpTimeCloseEnoughToLatestToElect(lastApplied, lastApplied)) { + result |= NotCloseEnoughToLatestOptime; + } + return result; +} - std::string TopologyCoordinatorImpl::_getUnelectableReasonString( - const UnelectableReasonMask ur) const { - invariant(ur); - str::stream ss; - bool hasWrittenToStream = false; - if (ur & NoData) { - ss << "node has no applied oplog entries"; - hasWrittenToStream = true; - } - if (ur & VotedTooRecently) { - if (hasWrittenToStream) { - ss << "; "; - } - hasWrittenToStream = true; - ss << "I recently voted for " << _lastVote.whoHostAndPort.toString(); - } - if (ur & CannotSeeMajority) { - if (hasWrittenToStream) { - ss << "; "; - } - hasWrittenToStream = true; - ss << "I cannot see a majority"; - } - if (ur & ArbiterIAm) { - if (hasWrittenToStream) { - ss << "; "; - } - hasWrittenToStream = true; - ss << "member is an arbiter"; +std::string TopologyCoordinatorImpl::_getUnelectableReasonString( + const UnelectableReasonMask ur) const { + invariant(ur); + str::stream ss; + bool hasWrittenToStream = false; + if (ur & NoData) { + ss << "node has no applied oplog entries"; + hasWrittenToStream = true; + } + if (ur & VotedTooRecently) { + if (hasWrittenToStream) { + ss << "; "; } - if (ur & NoPriority) { - if (hasWrittenToStream) { - ss << "; "; - } - hasWrittenToStream 
= true; - ss << "member has zero priority"; + hasWrittenToStream = true; + ss << "I recently voted for " << _lastVote.whoHostAndPort.toString(); + } + if (ur & CannotSeeMajority) { + if (hasWrittenToStream) { + ss << "; "; } - if (ur & StepDownPeriodActive) { - if (hasWrittenToStream) { - ss << "; "; - } - hasWrittenToStream = true; - ss << "I am still waiting for stepdown period to end at " << - dateToISOStringLocal(_stepDownUntil); + hasWrittenToStream = true; + ss << "I cannot see a majority"; + } + if (ur & ArbiterIAm) { + if (hasWrittenToStream) { + ss << "; "; } - if (ur & NotSecondary) { - if (hasWrittenToStream) { - ss << "; "; - } - hasWrittenToStream = true; - ss << "member is not currently a secondary"; + hasWrittenToStream = true; + ss << "member is an arbiter"; + } + if (ur & NoPriority) { + if (hasWrittenToStream) { + ss << "; "; } - if (ur & NotCloseEnoughToLatestOptime) { - if (hasWrittenToStream) { - ss << "; "; - } - hasWrittenToStream = true; - ss << "member is more than 10 seconds behind the most up-to-date member"; + hasWrittenToStream = true; + ss << "member has zero priority"; + } + if (ur & StepDownPeriodActive) { + if (hasWrittenToStream) { + ss << "; "; } - if (ur & NotInitialized) { - if (hasWrittenToStream) { - ss << "; "; - } - hasWrittenToStream = true; - ss << "node is not a member of a valid replica set configuration"; + hasWrittenToStream = true; + ss << "I am still waiting for stepdown period to end at " + << dateToISOStringLocal(_stepDownUntil); + } + if (ur & NotSecondary) { + if (hasWrittenToStream) { + ss << "; "; } - if (ur & RefusesToStand) { - if (hasWrittenToStream) { - ss << "; "; - } - hasWrittenToStream = true; - ss << "most recent heartbeat indicates node will not stand for election"; + hasWrittenToStream = true; + ss << "member is not currently a secondary"; + } + if (ur & NotCloseEnoughToLatestOptime) { + if (hasWrittenToStream) { + ss << "; "; } - if (!hasWrittenToStream) { - severe() << "Invalid UnelectableReasonMask value 0x" << integerToHex(ur); - fassertFailed(26011); + hasWrittenToStream = true; + ss << "member is more than 10 seconds behind the most up-to-date member"; + } + if (ur & NotInitialized) { + if (hasWrittenToStream) { + ss << "; "; } - ss << " (mask 0x" << integerToHex(ur) << ")"; - return ss; + hasWrittenToStream = true; + ss << "node is not a member of a valid replica set configuration"; } - - int TopologyCoordinatorImpl::_getPing(const HostAndPort& host) { - return _pings[host].getMillis(); + if (ur & RefusesToStand) { + if (hasWrittenToStream) { + ss << "; "; + } + hasWrittenToStream = true; + ss << "most recent heartbeat indicates node will not stand for election"; } - - void TopologyCoordinatorImpl::_setElectionTime(const OpTime& newElectionTime) { - _electionTime = newElectionTime; + if (!hasWrittenToStream) { + severe() << "Invalid UnelectableReasonMask value 0x" << integerToHex(ur); + fassertFailed(26011); } + ss << " (mask 0x" << integerToHex(ur) << ")"; + return ss; +} - int TopologyCoordinatorImpl::_getTotalPings() { - PingMap::iterator it = _pings.begin(); - PingMap::iterator end = _pings.end(); - int totalPings = 0; - while (it != end) { - totalPings += it->second.getCount(); - it++; - } - return totalPings; - } +int TopologyCoordinatorImpl::_getPing(const HostAndPort& host) { + return _pings[host].getMillis(); +} - std::vector<HostAndPort> TopologyCoordinatorImpl::getMaybeUpHostAndPorts() const { - std::vector<HostAndPort> upHosts; - for (std::vector<MemberHeartbeatData>::const_iterator it = _hbdata.begin(); - 
it != _hbdata.end(); - ++it) { - const int itIndex = indexOfIterator(_hbdata, it); - if (itIndex == _selfIndex) { - continue; // skip ourselves - } - if (!it->maybeUp()) { - continue; // skip DOWN nodes - } +void TopologyCoordinatorImpl::_setElectionTime(const OpTime& newElectionTime) { + _electionTime = newElectionTime; +} - upHosts.push_back(_rsConfig.getMemberAt(itIndex).getHostAndPort()); - } - return upHosts; +int TopologyCoordinatorImpl::_getTotalPings() { + PingMap::iterator it = _pings.begin(); + PingMap::iterator end = _pings.end(); + int totalPings = 0; + while (it != end) { + totalPings += it->second.getCount(); + it++; } + return totalPings; +} - bool TopologyCoordinatorImpl::voteForMyself(Date_t now) { - if (_role != Role::candidate) { - return false; +std::vector<HostAndPort> TopologyCoordinatorImpl::getMaybeUpHostAndPorts() const { + std::vector<HostAndPort> upHosts; + for (std::vector<MemberHeartbeatData>::const_iterator it = _hbdata.begin(); it != _hbdata.end(); + ++it) { + const int itIndex = indexOfIterator(_hbdata, it); + if (itIndex == _selfIndex) { + continue; // skip ourselves } - int selfId = _selfConfig().getId(); - if ((_lastVote.when + LastVote::leaseTime.total_milliseconds() >= now) - && (_lastVote.whoId != selfId)) { - log() << "replSet not voting yea for " << selfId << - " voted for " << _lastVote.whoHostAndPort.toString() << ' ' << - (now - _lastVote.when) / 1000 << " secs ago"; - return false; + if (!it->maybeUp()) { + continue; // skip DOWN nodes } - _lastVote.when = now; - _lastVote.whoId = selfId; - _lastVote.whoHostAndPort = _selfConfig().getHostAndPort(); - return true; + + upHosts.push_back(_rsConfig.getMemberAt(itIndex).getHostAndPort()); } + return upHosts; +} - MemberState TopologyCoordinatorImpl::getMemberState() const { - if (_selfIndex == -1) { - if (_rsConfig.isInitialized()) { - return MemberState::RS_REMOVED; - } - return MemberState::RS_STARTUP; - } - if (_role == Role::leader) { - invariant(_currentPrimaryIndex == _selfIndex); - return MemberState::RS_PRIMARY; - } - const MemberConfig& myConfig = _selfConfig(); - if (myConfig.isArbiter()) { - return MemberState::RS_ARBITER; - } - if (((_maintenanceModeCalls > 0) || (_hasOnlyAuthErrorUpHeartbeats(_hbdata, _selfIndex))) - && (_followerMode == MemberState::RS_SECONDARY)) { - return MemberState::RS_RECOVERING; - } - return _followerMode; +bool TopologyCoordinatorImpl::voteForMyself(Date_t now) { + if (_role != Role::candidate) { + return false; } + int selfId = _selfConfig().getId(); + if ((_lastVote.when + LastVote::leaseTime.total_milliseconds() >= now) && + (_lastVote.whoId != selfId)) { + log() << "replSet not voting yea for " << selfId << " voted for " + << _lastVote.whoHostAndPort.toString() << ' ' << (now - _lastVote.when) / 1000 + << " secs ago"; + return false; + } + _lastVote.when = now; + _lastVote.whoId = selfId; + _lastVote.whoHostAndPort = _selfConfig().getHostAndPort(); + return true; +} - void TopologyCoordinatorImpl::processWinElection( - OID electionId, - OpTime electionOpTime) { - invariant(_role == Role::candidate); - _electionTime = electionOpTime; - _electionId = electionId; - _role = Role::leader; - _currentPrimaryIndex = _selfIndex; - _syncSource = HostAndPort(); - _forceSyncSourceIndex = -1; +MemberState TopologyCoordinatorImpl::getMemberState() const { + if (_selfIndex == -1) { + if (_rsConfig.isInitialized()) { + return MemberState::RS_REMOVED; + } + return MemberState::RS_STARTUP; + } + if (_role == Role::leader) { + invariant(_currentPrimaryIndex == _selfIndex); + 
return MemberState::RS_PRIMARY; + } + const MemberConfig& myConfig = _selfConfig(); + if (myConfig.isArbiter()) { + return MemberState::RS_ARBITER; } + if (((_maintenanceModeCalls > 0) || (_hasOnlyAuthErrorUpHeartbeats(_hbdata, _selfIndex))) && + (_followerMode == MemberState::RS_SECONDARY)) { + return MemberState::RS_RECOVERING; + } + return _followerMode; +} - void TopologyCoordinatorImpl::processLoseElection() { - invariant(_role == Role::candidate); - const HostAndPort syncSourceAddress = getSyncSourceAddress(); - _electionTime = OpTime(0, 0); - _electionId = OID(); - _role = Role::follower; +void TopologyCoordinatorImpl::processWinElection(OID electionId, OpTime electionOpTime) { + invariant(_role == Role::candidate); + _electionTime = electionOpTime; + _electionId = electionId; + _role = Role::leader; + _currentPrimaryIndex = _selfIndex; + _syncSource = HostAndPort(); + _forceSyncSourceIndex = -1; +} - // Clear lastVote time, if we voted for ourselves in this election. - // This will allow us to vote for others. - if (_lastVote.whoId == _selfConfig().getId()) { - _lastVote.when = 0; - } +void TopologyCoordinatorImpl::processLoseElection() { + invariant(_role == Role::candidate); + const HostAndPort syncSourceAddress = getSyncSourceAddress(); + _electionTime = OpTime(0, 0); + _electionId = OID(); + _role = Role::follower; + + // Clear lastVote time, if we voted for ourselves in this election. + // This will allow us to vote for others. + if (_lastVote.whoId == _selfConfig().getId()) { + _lastVote.when = 0; } +} - bool TopologyCoordinatorImpl::stepDown(Date_t until, bool force, OpTime lastOpApplied) { - bool canStepDown = force; - for (int i = 0; !canStepDown && i < _rsConfig.getNumMembers(); ++i) { - if (i == _selfIndex) { - continue; - } - UnelectableReasonMask reason = _getUnelectableReason(i, lastOpApplied); - if (!reason && _hbdata[i].getOpTime() >= lastOpApplied) { - canStepDown = true; - } +bool TopologyCoordinatorImpl::stepDown(Date_t until, bool force, OpTime lastOpApplied) { + bool canStepDown = force; + for (int i = 0; !canStepDown && i < _rsConfig.getNumMembers(); ++i) { + if (i == _selfIndex) { + continue; } - - if (!canStepDown) { - return false; + UnelectableReasonMask reason = _getUnelectableReason(i, lastOpApplied); + if (!reason && _hbdata[i].getOpTime() >= lastOpApplied) { + canStepDown = true; } - _stepDownUntil = until; - _stepDownSelfAndReplaceWith(-1); - return true; } - void TopologyCoordinatorImpl::setFollowerMode(MemberState::MS newMode) { - invariant(_role == Role::follower); - switch (newMode) { + if (!canStepDown) { + return false; + } + _stepDownUntil = until; + _stepDownSelfAndReplaceWith(-1); + return true; +} + +void TopologyCoordinatorImpl::setFollowerMode(MemberState::MS newMode) { + invariant(_role == Role::follower); + switch (newMode) { case MemberState::RS_RECOVERING: case MemberState::RS_ROLLBACK: case MemberState::RS_SECONDARY: @@ -1973,129 +1876,126 @@ namespace { break; default: invariant(false); - } + } - if (_followerMode != MemberState::RS_SECONDARY) { - return; - } + if (_followerMode != MemberState::RS_SECONDARY) { + return; + } - // When a single node replica set transitions to SECONDARY, we must check if we should - // be a candidate here. This is necessary because a single node replica set has no - // heartbeats that would normally change the role to candidate. + // When a single node replica set transitions to SECONDARY, we must check if we should + // be a candidate here. 
This is necessary because a single node replica set has no + // heartbeats that would normally change the role to candidate. - if (_rsConfig.getNumMembers() == 1 && - _selfIndex == 0 && - _rsConfig.getMemberAt(_selfIndex).isElectable()) { - _role = Role::candidate; - } + if (_rsConfig.getNumMembers() == 1 && _selfIndex == 0 && + _rsConfig.getMemberAt(_selfIndex).isElectable()) { + _role = Role::candidate; } +} - bool TopologyCoordinatorImpl::stepDownIfPending() { - if (!_stepDownPending) { - return false; - } +bool TopologyCoordinatorImpl::stepDownIfPending() { + if (!_stepDownPending) { + return false; + } - int remotePrimaryIndex = -1; - for (std::vector<MemberHeartbeatData>::const_iterator it = _hbdata.begin(); - it != _hbdata.end(); ++it) { - const int itIndex = indexOfIterator(_hbdata, it); - if (itIndex == _selfIndex) { - continue; - } + int remotePrimaryIndex = -1; + for (std::vector<MemberHeartbeatData>::const_iterator it = _hbdata.begin(); it != _hbdata.end(); + ++it) { + const int itIndex = indexOfIterator(_hbdata, it); + if (itIndex == _selfIndex) { + continue; + } - if (it->getState().primary() && it->up()) { - if (remotePrimaryIndex != -1) { - // two other nodes think they are primary (asynchronously polled) - // -- wait for things to settle down. - remotePrimaryIndex = -1; - log() << "replSet info two remote primaries (transiently)"; - break; - } - remotePrimaryIndex = itIndex; + if (it->getState().primary() && it->up()) { + if (remotePrimaryIndex != -1) { + // two other nodes think they are primary (asynchronously polled) + // -- wait for things to settle down. + remotePrimaryIndex = -1; + log() << "replSet info two remote primaries (transiently)"; + break; } - } - _stepDownSelfAndReplaceWith(remotePrimaryIndex); + remotePrimaryIndex = itIndex; + } + } + _stepDownSelfAndReplaceWith(remotePrimaryIndex); + return true; +} + +void TopologyCoordinatorImpl::_stepDownSelfAndReplaceWith(int newPrimary) { + invariant(_role == Role::leader); + invariant(_selfIndex != -1); + invariant(_selfIndex != newPrimary); + invariant(_selfIndex == _currentPrimaryIndex); + _currentPrimaryIndex = newPrimary; + _role = Role::follower; + _stepDownPending = false; +} + +void TopologyCoordinatorImpl::adjustMaintenanceCountBy(int inc) { + invariant(_role == Role::follower); + _maintenanceModeCalls += inc; + invariant(_maintenanceModeCalls >= 0); +} + +int TopologyCoordinatorImpl::getMaintenanceCount() const { + return _maintenanceModeCalls; +} + +bool TopologyCoordinatorImpl::shouldChangeSyncSource(const HostAndPort& currentSource, + Date_t now) const { + // Methodology: + // If there exists a viable sync source member other than currentSource, whose oplog has + // reached an optime greater than _maxSyncSourceLagSecs later than currentSource's, return + // true. + + // If the user requested a sync source change, return true. 
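    // (Editorial sketch, not part of this commit: _forceSyncSourceIndex is set when an
    // operator issues the replSetSyncFrom command. The overall decision, with hypothetical
    // helper names standing in for the inline logic that follows, is roughly
    //
    //     if (userForcedNewTarget) return true;            // honor replSetSyncFrom
    //     if (!stillInConfig(currentSource)) return true;  // source no longer a member
    //     if (!heardFrom(currentSource)) return false;     // no optime to compare yet
    //     return someMemberIsMoreThanMaxLagAhead();        // the lag rule below
    //
    // The forced-source check comes first:)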
+ if (_forceSyncSourceIndex != -1) { return true; } - void TopologyCoordinatorImpl::_stepDownSelfAndReplaceWith(int newPrimary) { - invariant(_role == Role::leader); - invariant(_selfIndex != -1); - invariant(_selfIndex != newPrimary); - invariant(_selfIndex == _currentPrimaryIndex); - _currentPrimaryIndex = newPrimary; - _role = Role::follower; - _stepDownPending = false; - } - - void TopologyCoordinatorImpl::adjustMaintenanceCountBy(int inc) { - invariant(_role == Role::follower); - _maintenanceModeCalls += inc; - invariant(_maintenanceModeCalls >= 0); + const int currentMemberIndex = _rsConfig.findMemberIndexByHostAndPort(currentSource); + if (currentMemberIndex == -1) { + return true; } + invariant(currentMemberIndex != _selfIndex); - int TopologyCoordinatorImpl::getMaintenanceCount() const { - return _maintenanceModeCalls; + OpTime currentOpTime = _hbdata[currentMemberIndex].getOpTime(); + if (currentOpTime.isNull()) { + // Haven't received a heartbeat from the sync source yet, so can't tell if we should + // change. + return false; } - - bool TopologyCoordinatorImpl::shouldChangeSyncSource(const HostAndPort& currentSource, - Date_t now) const { - // Methodology: - // If there exists a viable sync source member other than currentSource, whose oplog has - // reached an optime greater than _maxSyncSourceLagSecs later than currentSource's, return - // true. - - // If the user requested a sync source change, return true. - if (_forceSyncSourceIndex != -1) { + unsigned int currentSecs = currentOpTime.getSecs(); + unsigned int goalSecs = currentSecs + _maxSyncSourceLagSecs.total_seconds(); + + for (std::vector<MemberHeartbeatData>::const_iterator it = _hbdata.begin(); it != _hbdata.end(); + ++it) { + const int itIndex = indexOfIterator(_hbdata, it); + const MemberConfig& candidateConfig = _rsConfig.getMemberAt(itIndex); + if (it->up() && + (candidateConfig.shouldBuildIndexes() || !_selfConfig().shouldBuildIndexes()) && + it->getState().readable() && !_memberIsBlacklisted(candidateConfig, now) && + goalSecs < it->getOpTime().getSecs()) { + log() << "changing sync target because current sync target's most recent OpTime is " + << currentOpTime.toStringLong() << " which is more than " + << _maxSyncSourceLagSecs.total_seconds() << " seconds behind member " + << candidateConfig.getHostAndPort().toString() << " whose most recent OpTime is " + << it->getOpTime().toStringLong(); + invariant(itIndex != _selfIndex); return true; } - - const int currentMemberIndex = _rsConfig.findMemberIndexByHostAndPort(currentSource); - if (currentMemberIndex == -1) { - return true; - } - invariant(currentMemberIndex != _selfIndex); - - OpTime currentOpTime = _hbdata[currentMemberIndex].getOpTime(); - if (currentOpTime.isNull()) { - // Haven't received a heartbeat from the sync source yet, so can't tell if we should - // change. 
- return false; - } - unsigned int currentSecs = currentOpTime.getSecs(); - unsigned int goalSecs = currentSecs + _maxSyncSourceLagSecs.total_seconds(); - - for (std::vector<MemberHeartbeatData>::const_iterator it = _hbdata.begin(); - it != _hbdata.end(); - ++it) { - const int itIndex = indexOfIterator(_hbdata, it); - const MemberConfig& candidateConfig = _rsConfig.getMemberAt(itIndex); - if (it->up() && - (candidateConfig.shouldBuildIndexes() || !_selfConfig().shouldBuildIndexes()) && - it->getState().readable() && - !_memberIsBlacklisted(candidateConfig, now) && - goalSecs < it->getOpTime().getSecs()) { - log() << "changing sync target because current sync target's most recent OpTime is " - << currentOpTime.toStringLong() << " which is more than " - << _maxSyncSourceLagSecs.total_seconds() << " seconds behind member " - << candidateConfig.getHostAndPort().toString() - << " whose most recent OpTime is " << it->getOpTime().toStringLong(); - invariant(itIndex != _selfIndex); - return true; - } - } - return false; } + return false; +} - void TopologyCoordinatorImpl::summarizeAsHtml(ReplSetHtmlSummary* output) { - output->setConfig(_rsConfig); - output->setHBData(_hbdata); - output->setSelfIndex(_selfIndex); - output->setPrimaryIndex(_currentPrimaryIndex); - output->setSelfState(getMemberState()); - output->setSelfHeartbeatMessage(_hbmsg); - } +void TopologyCoordinatorImpl::summarizeAsHtml(ReplSetHtmlSummary* output) { + output->setConfig(_rsConfig); + output->setHBData(_hbdata); + output->setSelfIndex(_selfIndex); + output->setPrimaryIndex(_currentPrimaryIndex); + output->setSelfState(getMemberState()); + output->setSelfHeartbeatMessage(_hbmsg); +} -} // namespace repl -} // namespace mongo +} // namespace repl +} // namespace mongo diff --git a/src/mongo/db/repl/topology_coordinator_impl.h b/src/mongo/db/repl/topology_coordinator_impl.h index 55c199fbd10..64b085bea12 100644 --- a/src/mongo/db/repl/topology_coordinator_impl.h +++ b/src/mongo/db/repl/topology_coordinator_impl.h @@ -41,360 +41,358 @@ namespace mongo { - class OperationContext; +class OperationContext; namespace repl { +/** + * Represents a latency measurement for each replica set member based on heartbeat requests. + * The measurement is an average weighted 80% to the old value, and 20% to the new value. + * + * Also stores information about heartbeat progress and retries. + */ +class PingStats { +public: + PingStats(); + /** - * Represents a latency measurement for each replica set member based on heartbeat requests. - * The measurement is an average weighted 80% to the old value, and 20% to the new value. + * Records that a new heartbeat request started at "now". * - * Also stores information about heartbeat progress and retries. + * This resets the failure count used in determining whether the next request to a target + * should be a retry or a regularly scheduled heartbeat message. */ - class PingStats { - public: - PingStats(); - - /** - * Records that a new heartbeat request started at "now". - * - * This resets the failure count used in determining whether the next request to a target - * should be a retry or a regularly scheduled heartbeat message. - */ - void start(Date_t now); - - /** - * Records that a heartbeat request completed successfully, and that "millis" milliseconds - * were spent for a single network roundtrip plus remote processing time. - */ - void hit(int millis); - - /** - * Records that a heartbeat request failed. - */ - void miss(); - - /** - * Gets the number of hit() calls. 
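     * (Editorial example, not part of this commit: with the 80/20 weighting documented
     * on this class, a previous average of 100ms followed by hit(50) would yield
     * 0.8 * 100 + 0.2 * 50 = 90ms from getMillis(), while getCount() counts each hit.)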
- */ - unsigned int getCount() const { return count; } - - /** - * Gets the weighted average round trip time for heartbeat messages to the target. - */ - unsigned int getMillis() const { return value; } - - /** - * Gets the date at which start() was last called, which is used to determine if - * a heartbeat should be retried or if the time limit has expired. - */ - Date_t getLastHeartbeatStartDate() const { return _lastHeartbeatStartDate; } - - /** - * Gets the number of failures since start() was last called. - * - * This value is incremented by calls to miss(), cleared by calls to start() and - * set to the maximum possible value by calls to hit(). - */ - int getNumFailuresSinceLastStart() const { return _numFailuresSinceLastStart; } - - private: - unsigned int count; - unsigned int value; - Date_t _lastHeartbeatStartDate; - int _numFailuresSinceLastStart; - }; + void start(Date_t now); + + /** + * Records that a heartbeat request completed successfully, and that "millis" milliseconds + * were spent for a single network roundtrip plus remote processing time. + */ + void hit(int millis); + + /** + * Records that a heartbeat request failed. + */ + void miss(); + + /** + * Gets the number of hit() calls. + */ + unsigned int getCount() const { + return count; + } + + /** + * Gets the weighted average round trip time for heartbeat messages to the target. + */ + unsigned int getMillis() const { + return value; + } - class TopologyCoordinatorImpl : public TopologyCoordinator { - public: - /** - * Constructs a Topology Coordinator object. - * @param maxSyncSourceLagSecs a sync source is re-evaluated after it lags behind further - * than this amount. - **/ - TopologyCoordinatorImpl(Seconds maxSyncSourceLagSecs); - - //////////////////////////////////////////////////////////// - // - // Implementation of TopologyCoordinator interface - // - //////////////////////////////////////////////////////////// - - virtual Role getRole() const; - virtual MemberState getMemberState() const; - virtual HostAndPort getSyncSourceAddress() const; - virtual std::vector<HostAndPort> getMaybeUpHostAndPorts() const; - virtual int getMaintenanceCount() const; - virtual void setForceSyncSourceIndex(int index); - virtual HostAndPort chooseNewSyncSource(Date_t now, - const OpTime& lastOpApplied); - virtual void blacklistSyncSource(const HostAndPort& host, Date_t until); - virtual void unblacklistSyncSource(const HostAndPort& host, Date_t now); - virtual void clearSyncSourceBlacklist(); - virtual bool shouldChangeSyncSource(const HostAndPort& currentSource, Date_t now) const; - virtual bool becomeCandidateIfStepdownPeriodOverAndSingleNodeSet(Date_t now); - virtual void setElectionSleepUntil(Date_t newTime); - virtual void setFollowerMode(MemberState::MS newMode); - virtual void adjustMaintenanceCountBy(int inc); - virtual void prepareSyncFromResponse(const ReplicationExecutor::CallbackData& data, - const HostAndPort& target, - const OpTime& lastOpApplied, - BSONObjBuilder* response, - Status* result); - virtual void prepareFreshResponse(const ReplicationCoordinator::ReplSetFreshArgs& args, - Date_t now, - OpTime lastOpApplied, - BSONObjBuilder* response, - Status* result); - virtual void prepareElectResponse(const ReplicationCoordinator::ReplSetElectArgs& args, - Date_t now, - OpTime lastOpApplied, - BSONObjBuilder* response, - Status* result); - virtual Status prepareHeartbeatResponse(Date_t now, - const ReplSetHeartbeatArgs& args, - const std::string& ourSetName, - const OpTime& lastOpApplied, - ReplSetHeartbeatResponse* 
response); - virtual void prepareStatusResponse(const ReplicationExecutor::CallbackData& data, - Date_t now, - unsigned uptime, - const OpTime& lastOpApplied, - BSONObjBuilder* response, - Status* result); - virtual void fillIsMasterForReplSet(IsMasterResponse* response); - virtual void prepareFreezeResponse(Date_t now, int secs, BSONObjBuilder* response); - virtual void updateConfig(const ReplicaSetConfig& newConfig, - int selfIndex, - Date_t now, - OpTime lastOpApplied); - virtual std::pair<ReplSetHeartbeatArgs, Milliseconds> prepareHeartbeatRequest( - Date_t now, - const std::string& ourSetName, - const HostAndPort& target); - virtual HeartbeatResponseAction processHeartbeatResponse( - Date_t now, - Milliseconds networkRoundTripTime, - const HostAndPort& target, - const StatusWith<ReplSetHeartbeatResponse>& hbResponse, - OpTime myLastOpApplied); - virtual bool voteForMyself(Date_t now); - virtual void processWinElection(OID electionId, OpTime electionOpTime); - virtual void processLoseElection(); - virtual bool checkShouldStandForElection(Date_t now, const OpTime& lastOpApplied); - virtual void setMyHeartbeatMessage(const Date_t now, const std::string& message); - virtual bool stepDown(Date_t until, bool force, OpTime lastOpApplied); - virtual bool stepDownIfPending(); - virtual Date_t getStepDownTime() const; - virtual void summarizeAsHtml(ReplSetHtmlSummary* output); - - //////////////////////////////////////////////////////////// - // - // Test support methods - // - //////////////////////////////////////////////////////////// - - // Changes _memberState to newMemberState. Only for testing. - void changeMemberState_forTest(const MemberState& newMemberState, - OpTime electionTime = OpTime(0,0)); - - // Sets "_electionTime" to "newElectionTime". Only for testing. - void _setElectionTime(const OpTime& newElectionTime); - - // Sets _currentPrimaryIndex to the given index. Should only be used in unit tests! - // TODO(spencer): Remove this once we can easily call for an election in unit tests to - // set the current primary. - void _setCurrentPrimaryForTest(int primaryIndex); - - // Returns _electionTime. Only used in unittests. - OpTime getElectionTime() const; - - // Returns _electionId. Only used in unittests. - OID getElectionId() const; - - // Returns _currentPrimaryIndex. Only used in unittests. - int getCurrentPrimaryIndex() const; - - private: - - enum UnelectableReason { - None = 0, - CannotSeeMajority = 1 << 0, - NotCloseEnoughToLatestOptime = 1 << 1, - ArbiterIAm = 1 << 2, - NotSecondary = 1 << 3, - NoPriority = 1 << 4, - StepDownPeriodActive = 1 << 5, - NoData = 1 << 6, - NotInitialized = 1 << 7, - VotedTooRecently = 1 << 8, - RefusesToStand = 1 << 9 - }; - typedef int UnelectableReasonMask; - - // Returns the number of heartbeat pings which have occurred. - int _getTotalPings(); - - // Returns the current "ping" value for the given member by their address - int _getPing(const HostAndPort& host); - - // Determines if we will veto the member specified by "args.id", given that the last op - // we have applied locally is "lastOpApplied". - // If we veto, the errmsg will be filled in with a reason - bool _shouldVetoMember(const ReplicationCoordinator::ReplSetFreshArgs& args, - const Date_t& now, - const OpTime& lastOpApplied, - std::string* errmsg) const; - - // Returns the index of the member with the matching id, or -1 if none match. 
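    // (Editorial sketch, not part of this commit: the lookup described above, written in
    // the iteration style used elsewhere in this class; illustrative only:
    //
    //     int index = 0;
    //     for (ReplicaSetConfig::MemberIterator it = _rsConfig.membersBegin();
    //          it != _rsConfig.membersEnd(); ++it, ++index) {
    //         if (it->getId() == id) return index;
    //     }
    //     return -1;  // no member with a matching id
    // )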
- int _getMemberIndex(int id) const; - - // Sees if a majority number of votes are held by members who are currently "up" - bool _aMajoritySeemsToBeUp() const; - - // Is otherOpTime close enough (within 10 seconds) to the latest known optime to qualify - // for an election - bool _isOpTimeCloseEnoughToLatestToElect(const OpTime& otherOpTime, - const OpTime& ourLastOpApplied) const; - - // Returns reason why "self" member is unelectable - UnelectableReasonMask _getMyUnelectableReason( - const Date_t now, - const OpTime lastOpApplied) const; - - // Returns reason why memberIndex is unelectable - UnelectableReasonMask _getUnelectableReason( - int memberIndex, - const OpTime& lastOpApplied) const; - - // Returns the nice text of why the node is unelectable - std::string _getUnelectableReasonString(UnelectableReasonMask ur) const; - - // Return true if we are currently primary - bool _iAmPrimary() const; - - // Scans through all members that are 'up' and return the latest known optime. - OpTime _latestKnownOpTime(OpTime ourLastOpApplied) const; - - // Scans the electable set and returns the highest priority member index - int _getHighestPriorityElectableIndex(Date_t now, OpTime lastOpApplied) const; - - // Returns true if "one" member is higher priority than "two" member - bool _isMemberHigherPriority(int memberOneIndex, int memberTwoIndex) const; - - // Helper shortcut to self config - const MemberConfig& _selfConfig() const; - - // Returns NULL if there is no primary, or the MemberConfig* for the current primary - const MemberConfig* _currentPrimaryMember() const; - - /** - * Performs updating "_hbdata" and "_currentPrimaryIndex" for processHeartbeatResponse(). - */ - HeartbeatResponseAction _updateHeartbeatDataImpl( - int updatedConfigIndex, - const MemberState& originalState, - Date_t now, - const OpTime& lastOpApplied); - - /** - * Updates _hbdata based on the newConfig, ensuring that every member in the newConfig - * has an entry in _hbdata. If any nodes in the newConfig are also present in - * _currentConfig, copies their heartbeat info into the corresponding entry in the updated - * _hbdata vector. - */ - void _updateHeartbeatDataForReconfig(const ReplicaSetConfig& newConfig, - int selfIndex, - Date_t now); - - void _stepDownSelfAndReplaceWith(int newPrimary); - - MemberState _getMyState() const; - - /** - * Looks up the provided member in the blacklist and returns true if the member's blacklist - * expire time is after 'now'. If the member is found but the expire time is before 'now', - * the function returns false. If the member is not found in the blacklist, the function - * returns false. - **/ - bool _memberIsBlacklisted(const MemberConfig& memberConfig, Date_t now) const; - - // This node's role in the replication protocol. - Role _role; - - // This is a unique id that is generated and set each time we transition to PRIMARY, as the - // result of an election. - OID _electionId; - // The time at which the current PRIMARY was elected. 
- OpTime _electionTime; - - // the index of the member we currently believe is primary, if one exists, otherwise -1 - int _currentPrimaryIndex; - - // the hostandport we are currently syncing from - // empty if no sync source (we are primary, or we cannot connect to anyone yet) - HostAndPort _syncSource; - // These members are not chosen as sync sources for a period of time, due to connection - // issues with them - std::map<HostAndPort, Date_t> _syncSourceBlacklist; - // The next sync source to be chosen, requested via a replSetSyncFrom command - int _forceSyncSourceIndex; - // How far this node must fall behind before considering switching sync sources - Seconds _maxSyncSourceLagSecs; - - // "heartbeat message" - // sent in requestHeartbeat respond in field "hbm" - std::string _hbmsg; - Date_t _hbmsgTime; // when it was logged - - // heartbeat msg to send to others; descriptive diagnostic info - std::string _getHbmsg(Date_t now) const; - - int _selfIndex; // this node's index in _members and _currentConfig - - ReplicaSetConfig _rsConfig; // The current config, including a vector of MemberConfigs - - // heartbeat data for each member. It is guaranteed that this vector will be maintained - // in the same order as the MemberConfigs in _currentConfig, therefore the member config - // index can be used to index into this vector as well. - std::vector<MemberHeartbeatData> _hbdata; - - // Indicates that we've received a request to stepdown from PRIMARY (likely via a heartbeat) - bool _stepDownPending; - - // Time when stepDown command expires - Date_t _stepDownUntil; - - // A time before which this node will not stand for election. - Date_t _electionSleepUntil; - - // The number of calls we have had to enter maintenance mode - int _maintenanceModeCalls; - - // The sub-mode of follower that we are in. Legal values are RS_SECONDARY, RS_RECOVERING, - // RS_STARTUP2 (initial sync) and RS_ROLLBACK. Only meaningful if _role == Role::follower. - // Configured via setFollowerMode(). If the sub-mode is RS_SECONDARY, then the effective - // sub-mode is either RS_SECONDARY or RS_RECOVERING, depending on _maintenanceModeCalls. - // Rather than accesing this variable direclty, one should use the getMemberState() method, - // which computes the replica set node state on the fly. - MemberState::MS _followerMode; - - typedef std::map<HostAndPort, PingStats> PingMap; - // Ping stats for each member by HostAndPort; - PingMap _pings; - - // Last vote info from the election - struct LastVote { - - static const Seconds leaseTime; - - LastVote() : when(0), whoId(-1) { } - Date_t when; - int whoId; - HostAndPort whoHostAndPort; - } _lastVote; + /** + * Gets the date at which start() was last called, which is used to determine if + * a heartbeat should be retried or if the time limit has expired. + */ + Date_t getLastHeartbeatStartDate() const { + return _lastHeartbeatStartDate; + } + /** + * Gets the number of failures since start() was last called. + * + * This value is incremented by calls to miss(), cleared by calls to start() and + * set to the maximum possible value by calls to hit(). + */ + int getNumFailuresSinceLastStart() const { + return _numFailuresSinceLastStart; + } + +private: + unsigned int count; + unsigned int value; + Date_t _lastHeartbeatStartDate; + int _numFailuresSinceLastStart; +}; + +class TopologyCoordinatorImpl : public TopologyCoordinator { +public: + /** + * Constructs a Topology Coordinator object. 
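 * (Editorial usage sketch, not part of this commit, mirroring the unit-test fixture:
 *
 *     TopologyCoordinatorImpl topoCoord(Seconds(100));
 *
 * re-evaluates a sync source once it falls more than 100 seconds behind a better member.)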
+ * @param maxSyncSourceLagSecs a sync source is re-evaluated after it lags behind further + * than this amount. + **/ + TopologyCoordinatorImpl(Seconds maxSyncSourceLagSecs); + + //////////////////////////////////////////////////////////// + // + // Implementation of TopologyCoordinator interface + // + //////////////////////////////////////////////////////////// + + virtual Role getRole() const; + virtual MemberState getMemberState() const; + virtual HostAndPort getSyncSourceAddress() const; + virtual std::vector<HostAndPort> getMaybeUpHostAndPorts() const; + virtual int getMaintenanceCount() const; + virtual void setForceSyncSourceIndex(int index); + virtual HostAndPort chooseNewSyncSource(Date_t now, const OpTime& lastOpApplied); + virtual void blacklistSyncSource(const HostAndPort& host, Date_t until); + virtual void unblacklistSyncSource(const HostAndPort& host, Date_t now); + virtual void clearSyncSourceBlacklist(); + virtual bool shouldChangeSyncSource(const HostAndPort& currentSource, Date_t now) const; + virtual bool becomeCandidateIfStepdownPeriodOverAndSingleNodeSet(Date_t now); + virtual void setElectionSleepUntil(Date_t newTime); + virtual void setFollowerMode(MemberState::MS newMode); + virtual void adjustMaintenanceCountBy(int inc); + virtual void prepareSyncFromResponse(const ReplicationExecutor::CallbackData& data, + const HostAndPort& target, + const OpTime& lastOpApplied, + BSONObjBuilder* response, + Status* result); + virtual void prepareFreshResponse(const ReplicationCoordinator::ReplSetFreshArgs& args, + Date_t now, + OpTime lastOpApplied, + BSONObjBuilder* response, + Status* result); + virtual void prepareElectResponse(const ReplicationCoordinator::ReplSetElectArgs& args, + Date_t now, + OpTime lastOpApplied, + BSONObjBuilder* response, + Status* result); + virtual Status prepareHeartbeatResponse(Date_t now, + const ReplSetHeartbeatArgs& args, + const std::string& ourSetName, + const OpTime& lastOpApplied, + ReplSetHeartbeatResponse* response); + virtual void prepareStatusResponse(const ReplicationExecutor::CallbackData& data, + Date_t now, + unsigned uptime, + const OpTime& lastOpApplied, + BSONObjBuilder* response, + Status* result); + virtual void fillIsMasterForReplSet(IsMasterResponse* response); + virtual void prepareFreezeResponse(Date_t now, int secs, BSONObjBuilder* response); + virtual void updateConfig(const ReplicaSetConfig& newConfig, + int selfIndex, + Date_t now, + OpTime lastOpApplied); + virtual std::pair<ReplSetHeartbeatArgs, Milliseconds> prepareHeartbeatRequest( + Date_t now, const std::string& ourSetName, const HostAndPort& target); + virtual HeartbeatResponseAction processHeartbeatResponse( + Date_t now, + Milliseconds networkRoundTripTime, + const HostAndPort& target, + const StatusWith<ReplSetHeartbeatResponse>& hbResponse, + OpTime myLastOpApplied); + virtual bool voteForMyself(Date_t now); + virtual void processWinElection(OID electionId, OpTime electionOpTime); + virtual void processLoseElection(); + virtual bool checkShouldStandForElection(Date_t now, const OpTime& lastOpApplied); + virtual void setMyHeartbeatMessage(const Date_t now, const std::string& message); + virtual bool stepDown(Date_t until, bool force, OpTime lastOpApplied); + virtual bool stepDownIfPending(); + virtual Date_t getStepDownTime() const; + virtual void summarizeAsHtml(ReplSetHtmlSummary* output); + + //////////////////////////////////////////////////////////// + // + // Test support methods + // + //////////////////////////////////////////////////////////// + + 
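    // (Editorial sketch, not part of this commit: typical use of these hooks, mirroring
    // the TopoCoordTest fixture in topology_coordinator_impl_test.cpp below:
    //
    //     TopologyCoordinatorImpl topo(Seconds(100));
    //     topo.changeMemberState_forTest(MemberState::RS_PRIMARY, OpTime(0, 0));
    //     topo._setCurrentPrimaryForTest(0);
    // )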
// Changes _memberState to newMemberState. Only for testing. + void changeMemberState_forTest(const MemberState& newMemberState, + OpTime electionTime = OpTime(0, 0)); + + // Sets "_electionTime" to "newElectionTime". Only for testing. + void _setElectionTime(const OpTime& newElectionTime); + + // Sets _currentPrimaryIndex to the given index. Should only be used in unit tests! + // TODO(spencer): Remove this once we can easily call for an election in unit tests to + // set the current primary. + void _setCurrentPrimaryForTest(int primaryIndex); + + // Returns _electionTime. Only used in unittests. + OpTime getElectionTime() const; + + // Returns _electionId. Only used in unittests. + OID getElectionId() const; + + // Returns _currentPrimaryIndex. Only used in unittests. + int getCurrentPrimaryIndex() const; + +private: + enum UnelectableReason { + None = 0, + CannotSeeMajority = 1 << 0, + NotCloseEnoughToLatestOptime = 1 << 1, + ArbiterIAm = 1 << 2, + NotSecondary = 1 << 3, + NoPriority = 1 << 4, + StepDownPeriodActive = 1 << 5, + NoData = 1 << 6, + NotInitialized = 1 << 7, + VotedTooRecently = 1 << 8, + RefusesToStand = 1 << 9 }; + typedef int UnelectableReasonMask; + + // Returns the number of heartbeat pings which have occurred. + int _getTotalPings(); -} // namespace repl -} // namespace mongo + // Returns the current "ping" value for the given member by their address + int _getPing(const HostAndPort& host); + + // Determines if we will veto the member specified by "args.id", given that the last op + // we have applied locally is "lastOpApplied". + // If we veto, the errmsg will be filled in with a reason + bool _shouldVetoMember(const ReplicationCoordinator::ReplSetFreshArgs& args, + const Date_t& now, + const OpTime& lastOpApplied, + std::string* errmsg) const; + + // Returns the index of the member with the matching id, or -1 if none match. + int _getMemberIndex(int id) const; + + // Sees if a majority number of votes are held by members who are currently "up" + bool _aMajoritySeemsToBeUp() const; + + // Is otherOpTime close enough (within 10 seconds) to the latest known optime to qualify + // for an election + bool _isOpTimeCloseEnoughToLatestToElect(const OpTime& otherOpTime, + const OpTime& ourLastOpApplied) const; + + // Returns reason why "self" member is unelectable + UnelectableReasonMask _getMyUnelectableReason(const Date_t now, + const OpTime lastOpApplied) const; + + // Returns reason why memberIndex is unelectable + UnelectableReasonMask _getUnelectableReason(int memberIndex, const OpTime& lastOpApplied) const; + + // Returns the nice text of why the node is unelectable + std::string _getUnelectableReasonString(UnelectableReasonMask ur) const; + + // Return true if we are currently primary + bool _iAmPrimary() const; + + // Scans through all members that are 'up' and return the latest known optime. 
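    // (Editorial sketch, not part of this commit: in outline, assuming OpTime's ordinary
    // ordering operators, the scan is
    //
    //     OpTime latest = ourLastOpApplied;
    //     for (size_t i = 0; i < _hbdata.size(); ++i) {
    //         if (_hbdata[i].up() && latest < _hbdata[i].getOpTime())
    //             latest = _hbdata[i].getOpTime();
    //     }
    //     return latest;
    // )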
+ OpTime _latestKnownOpTime(OpTime ourLastOpApplied) const; + + // Scans the electable set and returns the highest priority member index + int _getHighestPriorityElectableIndex(Date_t now, OpTime lastOpApplied) const; + + // Returns true if "one" member is higher priority than "two" member + bool _isMemberHigherPriority(int memberOneIndex, int memberTwoIndex) const; + + // Helper shortcut to self config + const MemberConfig& _selfConfig() const; + + // Returns NULL if there is no primary, or the MemberConfig* for the current primary + const MemberConfig* _currentPrimaryMember() const; + + /** + * Performs updating "_hbdata" and "_currentPrimaryIndex" for processHeartbeatResponse(). + */ + HeartbeatResponseAction _updateHeartbeatDataImpl(int updatedConfigIndex, + const MemberState& originalState, + Date_t now, + const OpTime& lastOpApplied); + + /** + * Updates _hbdata based on the newConfig, ensuring that every member in the newConfig + * has an entry in _hbdata. If any nodes in the newConfig are also present in + * _currentConfig, copies their heartbeat info into the corresponding entry in the updated + * _hbdata vector. + */ + void _updateHeartbeatDataForReconfig(const ReplicaSetConfig& newConfig, + int selfIndex, + Date_t now); + + void _stepDownSelfAndReplaceWith(int newPrimary); + + MemberState _getMyState() const; + + /** + * Looks up the provided member in the blacklist and returns true if the member's blacklist + * expire time is after 'now'. If the member is found but the expire time is before 'now', + * the function returns false. If the member is not found in the blacklist, the function + * returns false. + **/ + bool _memberIsBlacklisted(const MemberConfig& memberConfig, Date_t now) const; + + // This node's role in the replication protocol. + Role _role; + + // This is a unique id that is generated and set each time we transition to PRIMARY, as the + // result of an election. + OID _electionId; + // The time at which the current PRIMARY was elected. + OpTime _electionTime; + + // the index of the member we currently believe is primary, if one exists, otherwise -1 + int _currentPrimaryIndex; + + // the hostandport we are currently syncing from + // empty if no sync source (we are primary, or we cannot connect to anyone yet) + HostAndPort _syncSource; + // These members are not chosen as sync sources for a period of time, due to connection + // issues with them + std::map<HostAndPort, Date_t> _syncSourceBlacklist; + // The next sync source to be chosen, requested via a replSetSyncFrom command + int _forceSyncSourceIndex; + // How far this node must fall behind before considering switching sync sources + Seconds _maxSyncSourceLagSecs; + + // "heartbeat message" + // sent in requestHeartbeat respond in field "hbm" + std::string _hbmsg; + Date_t _hbmsgTime; // when it was logged + + // heartbeat msg to send to others; descriptive diagnostic info + std::string _getHbmsg(Date_t now) const; + + int _selfIndex; // this node's index in _members and _currentConfig + + ReplicaSetConfig _rsConfig; // The current config, including a vector of MemberConfigs + + // heartbeat data for each member. It is guaranteed that this vector will be maintained + // in the same order as the MemberConfigs in _currentConfig, therefore the member config + // index can be used to index into this vector as well. 
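    // (Editorial note, not part of this commit: concretely, the guarantee is
    //
    //     invariant(_hbdata.size() == static_cast<size_t>(_rsConfig.getNumMembers()));
    //
    // so _rsConfig.getMemberAt(i) and _hbdata[i] always describe the same member, and a
    // single index can be passed between config and heartbeat code.)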
+ std::vector<MemberHeartbeatData> _hbdata; + + // Indicates that we've received a request to step down from PRIMARY (likely via a heartbeat) + bool _stepDownPending; + + // Time when stepDown command expires + Date_t _stepDownUntil; + + // A time before which this node will not stand for election. + Date_t _electionSleepUntil; + + // The number of calls we have had to enter maintenance mode + int _maintenanceModeCalls; + + // The sub-mode of follower that we are in. Legal values are RS_SECONDARY, RS_RECOVERING, + // RS_STARTUP2 (initial sync) and RS_ROLLBACK. Only meaningful if _role == Role::follower. + // Configured via setFollowerMode(). If the sub-mode is RS_SECONDARY, then the effective + // sub-mode is either RS_SECONDARY or RS_RECOVERING, depending on _maintenanceModeCalls. + // Rather than accessing this variable directly, one should use the getMemberState() method, + // which computes the replica set node state on the fly. + MemberState::MS _followerMode; + + typedef std::map<HostAndPort, PingStats> PingMap; + // Ping stats for each member by HostAndPort. + PingMap _pings; + + // Last vote info from the election + struct LastVote { + static const Seconds leaseTime; + + LastVote() : when(0), whoId(-1) {} + Date_t when; + int whoId; + HostAndPort whoHostAndPort; + } _lastVote; +}; + +} // namespace repl +} // namespace mongo diff --git a/src/mongo/db/repl/topology_coordinator_impl_test.cpp b/src/mongo/db/repl/topology_coordinator_impl_test.cpp index ade27637a32..78751dc2a01 100644 --- a/src/mongo/db/repl/topology_coordinator_impl_test.cpp +++ b/src/mongo/db/repl/topology_coordinator_impl_test.cpp @@ -51,1807 +51,1240 @@ namespace mongo { namespace repl { namespace { - bool stringContains(const std::string &haystack, const std::string& needle) { - return haystack.find(needle) != std::string::npos; +bool stringContains(const std::string& haystack, const std::string& needle) { + return haystack.find(needle) != std::string::npos; +} + +class TopoCoordTest : public mongo::unittest::Test { +public: + virtual void setUp() { + _topo.reset(new TopologyCoordinatorImpl(Seconds(100))); + _now = 0; + _selfIndex = -1; + _cbData.reset(new ReplicationExecutor::CallbackData( + NULL, ReplicationExecutor::CallbackHandle(), Status::OK())); } - class TopoCoordTest : public mongo::unittest::Test { - public: - virtual void setUp() { - _topo.reset(new TopologyCoordinatorImpl(Seconds(100))); - _now = 0; - _selfIndex = -1; - _cbData.reset(new ReplicationExecutor::CallbackData( - NULL, ReplicationExecutor::CallbackHandle(), Status::OK())); - } - - virtual void tearDown() { - _topo.reset(NULL); - _cbData.reset(NULL); - } - - protected: - TopologyCoordinatorImpl& getTopoCoord() {return *_topo;} - ReplicationExecutor::CallbackData cbData() {return *_cbData;} - Date_t& now() {return _now;} - - int64_t countLogLinesContaining(const std::string& needle) { - return std::count_if(getCapturedLogMessages().begin(), - getCapturedLogMessages().end(), - stdx::bind(stringContains, - stdx::placeholders::_1, - needle)); - } - - void makeSelfPrimary(const OpTime& electionOpTime = OpTime(0,0)) { - getTopoCoord().changeMemberState_forTest(MemberState::RS_PRIMARY, electionOpTime); - getTopoCoord()._setCurrentPrimaryForTest(_selfIndex); - } - - void setSelfMemberState(const MemberState& newState) { - getTopoCoord().changeMemberState_forTest(newState); - } - - int getCurrentPrimaryIndex() { - return getTopoCoord().getCurrentPrimaryIndex(); - } - // Update config and set selfIndex - // If "now" is passed in, set _now to now+1 - void 
updateConfig(BSONObj cfg, - int selfIndex, - Date_t now = Date_t(-1), - OpTime lastOp = OpTime()) { - ReplicaSetConfig config; - ASSERT_OK(config.initialize(cfg)); - ASSERT_OK(config.validate()); - - _selfIndex = selfIndex; - - if (now == Date_t(-1)) { - getTopoCoord().updateConfig(config, selfIndex, _now++, lastOp); - } - else { - invariant(now > _now); - getTopoCoord().updateConfig(config, selfIndex, now, lastOp); - _now = now + 1; - } - } - - HeartbeatResponseAction receiveUpHeartbeat( - const HostAndPort& member, - const std::string& setName, - MemberState memberState, - OpTime electionTime, - OpTime lastOpTimeSender, - OpTime lastOpTimeReceiver) { - return _receiveHeartbeatHelper(Status::OK(), - member, - setName, - memberState, - electionTime, - lastOpTimeSender, - lastOpTimeReceiver, - Milliseconds(1)); - } - - HeartbeatResponseAction receiveDownHeartbeat( - const HostAndPort& member, - const std::string& setName, - OpTime lastOpTimeReceiver, - ErrorCodes::Error errcode = ErrorCodes::HostUnreachable) { - // timed out heartbeat to mark a node as down - - Milliseconds roundTripTime( - ReplicaSetConfig::kDefaultHeartbeatTimeoutPeriod.total_milliseconds()); - return _receiveHeartbeatHelper(Status(errcode, ""), - member, - setName, - MemberState::RS_UNKNOWN, - OpTime(), - OpTime(), - lastOpTimeReceiver, - roundTripTime); - } - - HeartbeatResponseAction heartbeatFromMember(const HostAndPort& member, - const std::string& setName, - MemberState memberState, - OpTime lastOpTimeSender, - Milliseconds roundTripTime = Milliseconds(1)) { - return _receiveHeartbeatHelper(Status::OK(), - member, - setName, - memberState, - OpTime(), - lastOpTimeSender, - OpTime(), - roundTripTime); - } - - private: - - HeartbeatResponseAction _receiveHeartbeatHelper(Status responseStatus, - const HostAndPort& member, - const std::string& setName, - MemberState memberState, - OpTime electionTime, - OpTime lastOpTimeSender, - OpTime lastOpTimeReceiver, - Milliseconds roundTripTime) { - StatusWith<ReplSetHeartbeatResponse> hbResponse = - StatusWith<ReplSetHeartbeatResponse>(responseStatus); - - if (responseStatus.isOK()) { - ReplSetHeartbeatResponse hb; - hb.setVersion(1); - hb.setState(memberState); - hb.setOpTime(lastOpTimeSender); - hb.setElectionTime(electionTime); - hbResponse = StatusWith<ReplSetHeartbeatResponse>(hb); - } - getTopoCoord().prepareHeartbeatRequest(now(), - setName, - member); - now() += roundTripTime.total_milliseconds(); - return getTopoCoord().processHeartbeatResponse(now(), - roundTripTime, - member, - hbResponse, - lastOpTimeReceiver); - } - - private: - scoped_ptr<TopologyCoordinatorImpl> _topo; - scoped_ptr<ReplicationExecutor::CallbackData> _cbData; - Date_t _now; - int _selfIndex; - }; - - TEST_F(TopoCoordTest, ChooseSyncSourceBasic) { - // if we do not have an index in the config, we should get an empty syncsource - HostAndPort newSyncSource = getTopoCoord().chooseNewSyncSource(now()++, OpTime(0,0)); - ASSERT_TRUE(newSyncSource.empty()); - - updateConfig(BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 10 << "host" << "hself") << - BSON("_id" << 20 << "host" << "h2") << - BSON("_id" << 30 << "host" << "h3"))), - 0); - setSelfMemberState(MemberState::RS_SECONDARY); - - // member h2 is the furthest ahead - heartbeatFromMember(HostAndPort("h2"), "rs0", MemberState::RS_SECONDARY, OpTime(1,0)); - heartbeatFromMember(HostAndPort("h3"), "rs0", MemberState::RS_SECONDARY, OpTime(0,0)); - - // We start with no sync source - 
ASSERT(getTopoCoord().getSyncSourceAddress().empty()); - - // Fail due to insufficient number of pings - newSyncSource = getTopoCoord().chooseNewSyncSource(now()++, OpTime(0,0)); - ASSERT_EQUALS(getTopoCoord().getSyncSourceAddress(), newSyncSource); - ASSERT(getTopoCoord().getSyncSourceAddress().empty()); - - // Record 2nd round of pings to allow choosing a new sync source; all members equidistant - heartbeatFromMember(HostAndPort("h2"), "rs0", MemberState::RS_SECONDARY, OpTime(1,0)); - heartbeatFromMember(HostAndPort("h3"), "rs0", MemberState::RS_SECONDARY, OpTime(0,0)); - - // Should choose h2, since it is furthest ahead - newSyncSource = getTopoCoord().chooseNewSyncSource(now()++, OpTime(0,0)); - ASSERT_EQUALS(getTopoCoord().getSyncSourceAddress(), newSyncSource); - ASSERT_EQUALS(HostAndPort("h2"), getTopoCoord().getSyncSourceAddress()); - - // h3 becomes further ahead, so it should be chosen - heartbeatFromMember(HostAndPort("h3"), "rs0", MemberState::RS_SECONDARY, OpTime(2,0)); - getTopoCoord().chooseNewSyncSource(now()++, OpTime(0,0)); - ASSERT_EQUALS(HostAndPort("h3"), getTopoCoord().getSyncSourceAddress()); - - // h3 becomes an invalid candidate for sync source; should choose h2 again - heartbeatFromMember(HostAndPort("h3"), "rs0", MemberState::RS_RECOVERING, OpTime(2,0)); - getTopoCoord().chooseNewSyncSource(now()++, OpTime(0,0)); - ASSERT_EQUALS(HostAndPort("h2"), getTopoCoord().getSyncSourceAddress()); - - // h3 back in SECONDARY and ahead - heartbeatFromMember(HostAndPort("h3"), "rs0", MemberState::RS_SECONDARY, OpTime(2,0)); - getTopoCoord().chooseNewSyncSource(now()++, OpTime(0,0)); - ASSERT_EQUALS(HostAndPort("h3"), getTopoCoord().getSyncSourceAddress()); - - // h3 goes down - receiveDownHeartbeat(HostAndPort("h3"), "rs0", OpTime()); - getTopoCoord().chooseNewSyncSource(now()++, OpTime(0,0)); - ASSERT_EQUALS(HostAndPort("h2"), getTopoCoord().getSyncSourceAddress()); - - // h3 back up and ahead - heartbeatFromMember(HostAndPort("h3"), "rs0", MemberState::RS_SECONDARY, OpTime(2,0)); - getTopoCoord().chooseNewSyncSource(now()++, OpTime(0,0)); - ASSERT_EQUALS(HostAndPort("h3"), getTopoCoord().getSyncSourceAddress()); - + virtual void tearDown() { + _topo.reset(NULL); + _cbData.reset(NULL); } - TEST_F(TopoCoordTest, ChooseSyncSourceCandidates) { - updateConfig(BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 1 << "host" << "hself") << - BSON("_id" << 10 << "host" << "h1") << - BSON("_id" << 20 << "host" << "h2" << - "buildIndexes" << false << "priority" << 0) << - BSON("_id" << 30 << "host" << "h3" << - "hidden" << true << "priority" << 0 << "votes" << 0) << - BSON("_id" << 40 << "host" << "h4" <<"arbiterOnly" << true) << - BSON("_id" << 50 << "host" << "h5" << - "slaveDelay" << 1 << "priority" << 0) << - BSON("_id" << 60 << "host" << "h6") << - BSON("_id" << 70 << "host" << "hprimary"))), - 0); - - setSelfMemberState(MemberState::RS_SECONDARY); - OpTime lastOpTimeWeApplied = OpTime(100,0); - - heartbeatFromMember(HostAndPort("h1"), "rs0", MemberState::RS_SECONDARY, - OpTime(501, 0), Milliseconds(700)); - heartbeatFromMember(HostAndPort("h2"), "rs0", MemberState::RS_SECONDARY, - OpTime(501, 0), Milliseconds(600)); - heartbeatFromMember(HostAndPort("h3"), "rs0", MemberState::RS_SECONDARY, - OpTime(501, 0), Milliseconds(500)); - heartbeatFromMember(HostAndPort("h4"), "rs0", MemberState::RS_SECONDARY, - OpTime(501, 0), Milliseconds(400)); - heartbeatFromMember(HostAndPort("h5"), "rs0", MemberState::RS_SECONDARY, - OpTime(501, 0), 
Milliseconds(300)); - - // This node is lagged further than maxSyncSourceLagSeconds. - heartbeatFromMember(HostAndPort("h6"), "rs0", MemberState::RS_SECONDARY, - OpTime(499, 0), Milliseconds(200)); - - ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); - heartbeatFromMember(HostAndPort("hprimary"), "rs0", MemberState::RS_PRIMARY, - OpTime(600, 0), Milliseconds(100)); - ASSERT_EQUALS(7, getCurrentPrimaryIndex()); - - // Record 2nd round of pings to allow choosing a new sync source - heartbeatFromMember(HostAndPort("h1"), "rs0", MemberState::RS_SECONDARY, - OpTime(501, 0), Milliseconds(700)); - heartbeatFromMember(HostAndPort("h2"), "rs0", MemberState::RS_SECONDARY, - OpTime(501, 0), Milliseconds(600)); - heartbeatFromMember(HostAndPort("h3"), "rs0", MemberState::RS_SECONDARY, - OpTime(501, 0), Milliseconds(500)); - heartbeatFromMember(HostAndPort("h4"), "rs0", MemberState::RS_SECONDARY, - OpTime(501, 0), Milliseconds(400)); - heartbeatFromMember(HostAndPort("h5"), "rs0", MemberState::RS_SECONDARY, - OpTime(501, 0), Milliseconds(300)); - heartbeatFromMember(HostAndPort("h6"), "rs0", MemberState::RS_SECONDARY, - OpTime(499, 0), Milliseconds(200)); - heartbeatFromMember(HostAndPort("hprimary"), "rs0", MemberState::RS_PRIMARY, - OpTime(600, 0), Milliseconds(100)); - - // Should choose primary first; it's closest - getTopoCoord().chooseNewSyncSource(now()++, lastOpTimeWeApplied); - ASSERT_EQUALS(HostAndPort("hprimary"), getTopoCoord().getSyncSourceAddress()); - - // Primary goes far far away - heartbeatFromMember(HostAndPort("hprimary"), "rs0", MemberState::RS_PRIMARY, - OpTime(600, 0), Milliseconds(100000000)); - - // Should choose h4. (if an arbiter has an oplog, it's a valid sync source) - // h6 is not considered because it is outside the maxSyncLagSeconds window, - getTopoCoord().chooseNewSyncSource(now()++, lastOpTimeWeApplied); - ASSERT_EQUALS(HostAndPort("h4"), getTopoCoord().getSyncSourceAddress()); - - // h4 goes down; should choose h1 - receiveDownHeartbeat(HostAndPort("h4"), "rs0", OpTime()); - getTopoCoord().chooseNewSyncSource(now()++, lastOpTimeWeApplied); - ASSERT_EQUALS(HostAndPort("h1"), getTopoCoord().getSyncSourceAddress()); - - // Primary and h1 go down; should choose h6 - receiveDownHeartbeat(HostAndPort("h1"), "rs0", OpTime()); - receiveDownHeartbeat(HostAndPort("hprimary"), "rs0", OpTime()); - ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); - getTopoCoord().chooseNewSyncSource(now()++, lastOpTimeWeApplied); - ASSERT_EQUALS(HostAndPort("h6"), getTopoCoord().getSyncSourceAddress()); - - // h6 goes down; should choose h5 - receiveDownHeartbeat(HostAndPort("h6"), "rs0", OpTime()); - getTopoCoord().chooseNewSyncSource(now()++, lastOpTimeWeApplied); - ASSERT_EQUALS(HostAndPort("h5"), getTopoCoord().getSyncSourceAddress()); - - // h5 goes down; should choose h3 - receiveDownHeartbeat(HostAndPort("h5"), "rs0", OpTime()); - getTopoCoord().chooseNewSyncSource(now()++, lastOpTimeWeApplied); - ASSERT_EQUALS(HostAndPort("h3"), getTopoCoord().getSyncSourceAddress()); - - // h3 goes down; no sync source candidates remain - receiveDownHeartbeat(HostAndPort("h3"), "rs0", OpTime()); - getTopoCoord().chooseNewSyncSource(now()++, lastOpTimeWeApplied); - ASSERT(getTopoCoord().getSyncSourceAddress().empty()); +protected: + TopologyCoordinatorImpl& getTopoCoord() { + return *_topo; } - - - TEST_F(TopoCoordTest, ChooseSyncSourceChainingNotAllowed) { - updateConfig(BSON("_id" << "rs0" << - "version" << 1 << - "settings" << BSON("chainingAllowed" << false) << - "members" << BSON_ARRAY( - BSON("_id" << 10 << 
"host" << "hself") << - BSON("_id" << 20 << "host" << "h2") << - BSON("_id" << 30 << "host" << "h3"))), - 0); - - setSelfMemberState(MemberState::RS_SECONDARY); - - heartbeatFromMember(HostAndPort("h2"), "rs0", MemberState::RS_SECONDARY, - OpTime(1, 0), Milliseconds(100)); - heartbeatFromMember(HostAndPort("h2"), "rs0", MemberState::RS_SECONDARY, - OpTime(1, 0), Milliseconds(100)); - heartbeatFromMember(HostAndPort("h3"), "rs0", MemberState::RS_SECONDARY, - OpTime(0, 0), Milliseconds(300)); - heartbeatFromMember(HostAndPort("h3"), "rs0", MemberState::RS_SECONDARY, - OpTime(0, 0), Milliseconds(300)); - - // No primary situation: should choose no sync source. - getTopoCoord().chooseNewSyncSource(now()++, OpTime(0,0)); - ASSERT(getTopoCoord().getSyncSourceAddress().empty()); - - // Add primary - ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); - heartbeatFromMember(HostAndPort("h3"), "rs0", MemberState::RS_PRIMARY, - OpTime(0, 0), Milliseconds(300)); - ASSERT_EQUALS(2, getCurrentPrimaryIndex()); - - // h3 is primary and should be chosen as sync source, despite being further away than h2 - // and the primary (h3) being behind our most recently applied optime - getTopoCoord().chooseNewSyncSource(now()++, OpTime(10,0)); - ASSERT_EQUALS(HostAndPort("h3"), getTopoCoord().getSyncSourceAddress()); - + ReplicationExecutor::CallbackData cbData() { + return *_cbData; } - - TEST_F(TopoCoordTest, EmptySyncSourceOnPrimary) { - updateConfig(BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 10 << "host" << "hself") << - BSON("_id" << 20 << "host" << "h2") << - BSON("_id" << 30 << "host" << "h3"))), - 0); - - setSelfMemberState(MemberState::RS_SECONDARY); - - heartbeatFromMember(HostAndPort("h2"), "rs0", MemberState::RS_SECONDARY, - OpTime(1, 0), Milliseconds(100)); - heartbeatFromMember(HostAndPort("h2"), "rs0", MemberState::RS_SECONDARY, - OpTime(1, 0), Milliseconds(100)); - heartbeatFromMember(HostAndPort("h3"), "rs0", MemberState::RS_SECONDARY, - OpTime(0, 0), Milliseconds(300)); - heartbeatFromMember(HostAndPort("h3"), "rs0", MemberState::RS_SECONDARY, - OpTime(0, 0), Milliseconds(300)); - - // No primary situation: should choose h2 sync source. 
- getTopoCoord().chooseNewSyncSource(now()++, OpTime(0,0)); - ASSERT_EQUALS(HostAndPort("h2"), getTopoCoord().getSyncSourceAddress()); - - // Become primary - makeSelfPrimary(OpTime(3.0)); - ASSERT_EQUALS(0, getCurrentPrimaryIndex()); - - // Check sync source - ASSERT_EQUALS(HostAndPort(), getTopoCoord().getSyncSourceAddress()); + Date_t& now() { + return _now; } - TEST_F(TopoCoordTest, ForceSyncSource) { - updateConfig(BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 10 << "host" << "hself") << - BSON("_id" << 20 << "host" << "h2") << - BSON("_id" << 30 << "host" << "h3"))), - 0); - - setSelfMemberState(MemberState::RS_SECONDARY); - - // two rounds of heartbeat pings from each member - heartbeatFromMember(HostAndPort("h2"), "rs0", MemberState::RS_SECONDARY, - OpTime(1, 0), Milliseconds(300)); - heartbeatFromMember(HostAndPort("h2"), "rs0", MemberState::RS_SECONDARY, - OpTime(1, 0), Milliseconds(300)); - heartbeatFromMember(HostAndPort("h3"), "rs0", MemberState::RS_SECONDARY, - OpTime(2, 0), Milliseconds(100)); - heartbeatFromMember(HostAndPort("h3"), "rs0", MemberState::RS_SECONDARY, - OpTime(2, 0), Milliseconds(100)); - - // force should overrule other defaults - getTopoCoord().chooseNewSyncSource(now()++, OpTime(0,0)); - ASSERT_EQUALS(HostAndPort("h3"), getTopoCoord().getSyncSourceAddress()); - getTopoCoord().setForceSyncSourceIndex(1); - // force should cause shouldChangeSyncSource() to return true - // even if the currentSource is the force target - ASSERT_TRUE(getTopoCoord().shouldChangeSyncSource(HostAndPort("h2"), now())); - ASSERT_TRUE(getTopoCoord().shouldChangeSyncSource(HostAndPort("h3"), now())); - getTopoCoord().chooseNewSyncSource(now()++, OpTime(0,0)); - ASSERT_EQUALS(HostAndPort("h2"), getTopoCoord().getSyncSourceAddress()); - - // force should only work for one call to chooseNewSyncSource - getTopoCoord().chooseNewSyncSource(now()++, OpTime(0,0)); - ASSERT_EQUALS(HostAndPort("h3"), getTopoCoord().getSyncSourceAddress()); - } - - TEST_F(TopoCoordTest, BlacklistSyncSource) { - updateConfig(BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 10 << "host" << "hself") << - BSON("_id" << 20 << "host" << "h2") << - BSON("_id" << 30 << "host" << "h3"))), - 0); - - setSelfMemberState(MemberState::RS_SECONDARY); - - heartbeatFromMember(HostAndPort("h2"), "rs0", MemberState::RS_SECONDARY, - OpTime(1, 0), Milliseconds(300)); - heartbeatFromMember(HostAndPort("h2"), "rs0", MemberState::RS_SECONDARY, - OpTime(1, 0), Milliseconds(300)); - heartbeatFromMember(HostAndPort("h3"), "rs0", MemberState::RS_SECONDARY, - OpTime(2, 0), Milliseconds(100)); - heartbeatFromMember(HostAndPort("h3"), "rs0", MemberState::RS_SECONDARY, - OpTime(2, 0), Milliseconds(100)); - - getTopoCoord().chooseNewSyncSource(now()++, OpTime(0,0)); - ASSERT_EQUALS(HostAndPort("h3"), getTopoCoord().getSyncSourceAddress()); - - Date_t expireTime = 1000; - getTopoCoord().blacklistSyncSource(HostAndPort("h3"), expireTime); - getTopoCoord().chooseNewSyncSource(now()++, OpTime(0,0)); - // Should choose second best choice now that h3 is blacklisted. 
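Behind that choice, the blacklist these tests exercise reduces to a map from host to expiry time: a host is skipped as a candidate only while "now" is strictly before its expiry, which is why re-choosing at expireTime itself lands back on the blacklisted node. A simplified, self-contained sketch of that rule, with std::string and int64_t standing in for HostAndPort and Date_t:

#include <cstdint>
#include <map>
#include <string>

typedef std::string Host;  // stand-in for HostAndPort
typedef int64_t Millis;    // stand-in for Date_t

std::map<Host, Millis> syncSourceBlacklist;

void blacklistSyncSource(const Host& host, Millis until) {
    syncSourceBlacklist[host] = until;
}

// A member is skipped as a sync-source candidate only while blacklisted.
bool isBlacklisted(const Host& host, Millis now) {
    std::map<Host, Millis>::const_iterator it = syncSourceBlacklist.find(host);
    return it != syncSourceBlacklist.end() && now < it->second;
}

int main() {
    blacklistSyncSource("h3", 1000);
    // Blacklisted before the expiry, eligible again at the expiry itself.
    return (isBlacklisted("h3", 999) && !isBlacklisted("h3", 1000)) ? 0 : 1;
}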
- ASSERT_EQUALS(HostAndPort("h2"), getTopoCoord().getSyncSourceAddress()); - - // After time has passed, should go back to original sync source - getTopoCoord().chooseNewSyncSource(expireTime, OpTime(0,0)); - ASSERT_EQUALS(HostAndPort("h3"), getTopoCoord().getSyncSourceAddress()); - } - - TEST_F(TopoCoordTest, BlacklistSyncSourceNoChaining) { - updateConfig(BSON("_id" << "rs0" << - "version" << 1 << - "settings" << BSON("chainingAllowed" << false) << - "members" << BSON_ARRAY( - BSON("_id" << 10 << "host" << "hself") << - BSON("_id" << 20 << "host" << "h2") << - BSON("_id" << 30 << "host" << "h3"))), - 0); - - setSelfMemberState(MemberState::RS_SECONDARY); - - heartbeatFromMember(HostAndPort("h2"), "rs0", MemberState::RS_PRIMARY, - OpTime(2, 0), Milliseconds(100)); - heartbeatFromMember(HostAndPort("h2"), "rs0", MemberState::RS_PRIMARY, - OpTime(2, 0), Milliseconds(100)); - ASSERT_EQUALS(1, getCurrentPrimaryIndex()); - - heartbeatFromMember(HostAndPort("h3"), "rs0", MemberState::RS_SECONDARY, - OpTime(2, 0), Milliseconds(100)); - heartbeatFromMember(HostAndPort("h3"), "rs0", MemberState::RS_SECONDARY, - OpTime(2, 0), Milliseconds(100)); - - getTopoCoord().chooseNewSyncSource(now()++, OpTime(0,0)); - ASSERT_EQUALS(HostAndPort("h2"), getTopoCoord().getSyncSourceAddress()); - - Date_t expireTime = 1000; - getTopoCoord().blacklistSyncSource(HostAndPort("h2"), expireTime); - getTopoCoord().chooseNewSyncSource(now()++, OpTime(0,0)); - // Can't choose any sync source now. - ASSERT(getTopoCoord().getSyncSourceAddress().empty()); - - // After time has passed, should go back to the primary - getTopoCoord().chooseNewSyncSource(expireTime, OpTime(0,0)); - ASSERT_EQUALS(HostAndPort("h2"), getTopoCoord().getSyncSourceAddress()); - } - - TEST_F(TopoCoordTest, OnlyUnauthorizedUpCausesRecovering) { - updateConfig(BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 10 << "host" << "hself") << - BSON("_id" << 20 << "host" << "h2") << - BSON("_id" << 30 << "host" << "h3"))), - 0); - - setSelfMemberState(MemberState::RS_SECONDARY); - - // Generate enough heartbeats to select a sync source below - heartbeatFromMember(HostAndPort("h2"), "rs0", MemberState::RS_SECONDARY, - OpTime(1, 0), Milliseconds(300)); - heartbeatFromMember(HostAndPort("h2"), "rs0", MemberState::RS_SECONDARY, - OpTime(1, 0), Milliseconds(300)); - heartbeatFromMember(HostAndPort("h3"), "rs0", MemberState::RS_SECONDARY, - OpTime(2, 0), Milliseconds(100)); - heartbeatFromMember(HostAndPort("h3"), "rs0", MemberState::RS_SECONDARY, - OpTime(2, 0), Milliseconds(100)); - - ASSERT_EQUALS(HostAndPort("h3"), - getTopoCoord().chooseNewSyncSource(now()++, OpTime(0,0))); - ASSERT_EQUALS(MemberState::RS_SECONDARY, getTopoCoord().getMemberState().s); - // Good state setup done - - // Mark nodes down, ensure that we have no source and are secondary - receiveDownHeartbeat(HostAndPort("h2"), "rs0", OpTime(), ErrorCodes::NetworkTimeout); - receiveDownHeartbeat(HostAndPort("h3"), "rs0", OpTime(), ErrorCodes::NetworkTimeout); - ASSERT_TRUE(getTopoCoord().chooseNewSyncSource(now()++, OpTime(0,0)).empty()); - ASSERT_EQUALS(MemberState::RS_SECONDARY, getTopoCoord().getMemberState().s); - - // Mark nodes down + unauth, ensure that we have no source and are secondary - receiveDownHeartbeat(HostAndPort("h2"), "rs0", OpTime(), ErrorCodes::NetworkTimeout); - receiveDownHeartbeat(HostAndPort("h3"), "rs0", OpTime(), ErrorCodes::Unauthorized); - ASSERT_TRUE(getTopoCoord().chooseNewSyncSource(now()++, OpTime(0,0)).empty()); - 
ASSERT_EQUALS(MemberState::RS_RECOVERING, getTopoCoord().getMemberState().s); - - // Having an auth error but with another node up should bring us out of RECOVERING - HeartbeatResponseAction action = receiveUpHeartbeat(HostAndPort("h2"), - "rs0", - MemberState::RS_SECONDARY, - OpTime(0, 0), - OpTime(2, 0), - OpTime(2, 0)); - ASSERT_EQUALS(MemberState::RS_SECONDARY, getTopoCoord().getMemberState().s); - // Test that the heartbeat that brings us from RECOVERING to SECONDARY doesn't initiate - // an election (SERVER-17164) - ASSERT_NO_ACTION(action.getAction()); - } - - TEST_F(TopoCoordTest, ReceiveHeartbeatWhileAbsentFromConfig) { - updateConfig(BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 10 << "host" << "h1") << - BSON("_id" << 20 << "host" << "h2") << - BSON("_id" << 30 << "host" << "h3"))), - -1); - ASSERT_NO_ACTION(heartbeatFromMember(HostAndPort("h2"), "rs0", MemberState::RS_SECONDARY, - OpTime(1, 0), Milliseconds(300)).getAction()); - } - - TEST_F(TopoCoordTest, PrepareSyncFromResponse) { - OpTime staleOpTime(1, 1); - OpTime ourOpTime(staleOpTime.getSecs() + 11, 1); - - Status result = Status::OK(); - BSONObjBuilder response; - - // if we do not have an index in the config, we should get ErrorCodes::NotSecondary - getTopoCoord().prepareSyncFromResponse(cbData(), HostAndPort("h1"), - ourOpTime, &response, &result); - ASSERT_EQUALS(ErrorCodes::NotSecondary, result); - ASSERT_EQUALS("Removed and uninitialized nodes do not sync", result.reason()); - - // Test trying to sync from another node when we are an arbiter - updateConfig(BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY(BSON("_id" << 0 << - "host" << "hself" << - "arbiterOnly" << true) << - BSON("_id" << 1 << - "host" << "h1"))), - 0); - - getTopoCoord().prepareSyncFromResponse(cbData(), HostAndPort("h1"), - ourOpTime, &response, &result); - ASSERT_EQUALS(ErrorCodes::NotSecondary, result); - ASSERT_EQUALS("arbiters don't sync", result.reason()); - - // Set up config for the rest of the tests - updateConfig(BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 0 << "host" << "hself") << - BSON("_id" << 1 << "host" << "h1" << "arbiterOnly" << true) << - BSON("_id" << 2 << "host" << "h2" << - "priority" << 0 << "buildIndexes" << false) << - BSON("_id" << 3 << "host" << "h3") << - BSON("_id" << 4 << "host" << "h4") << - BSON("_id" << 5 << "host" << "h5") << - BSON("_id" << 6 << "host" << "h6"))), - 0); - - // Try to sync while PRIMARY - ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); - makeSelfPrimary(); - ASSERT_EQUALS(0, getCurrentPrimaryIndex()); - getTopoCoord()._setCurrentPrimaryForTest(0); - BSONObjBuilder response1; - getTopoCoord().prepareSyncFromResponse( - cbData(), HostAndPort("h3"), ourOpTime, &response1, &result); - ASSERT_EQUALS(ErrorCodes::NotSecondary, result); - ASSERT_EQUALS("primaries don't sync", result.reason()); - ASSERT_EQUALS("h3:27017", response1.obj()["syncFromRequested"].String()); - - // Try to sync from non-existent member - setSelfMemberState(MemberState::RS_SECONDARY); - getTopoCoord()._setCurrentPrimaryForTest(-1); - BSONObjBuilder response2; - getTopoCoord().prepareSyncFromResponse( - cbData(), HostAndPort("fakemember"), ourOpTime, &response2, &result); - ASSERT_EQUALS(ErrorCodes::NodeNotFound, result); - ASSERT_EQUALS("Could not find member \"fakemember:27017\" in replica set", result.reason()); - - // Try to sync from self - BSONObjBuilder response3; - getTopoCoord().prepareSyncFromResponse( - cbData(), 
HostAndPort("hself"), ourOpTime, &response3, &result); - ASSERT_EQUALS(ErrorCodes::InvalidOptions, result); - ASSERT_EQUALS("I cannot sync from myself", result.reason()); - - // Try to sync from an arbiter - BSONObjBuilder response4; - getTopoCoord().prepareSyncFromResponse( - cbData(), HostAndPort("h1"), ourOpTime, &response4, &result); - ASSERT_EQUALS(ErrorCodes::InvalidOptions, result); - ASSERT_EQUALS("Cannot sync from \"h1:27017\" because it is an arbiter", result.reason()); - - // Try to sync from a node that doesn't build indexes - BSONObjBuilder response5; - getTopoCoord().prepareSyncFromResponse( - cbData(), HostAndPort("h2"), ourOpTime, &response5, &result); - ASSERT_EQUALS(ErrorCodes::InvalidOptions, result); - ASSERT_EQUALS("Cannot sync from \"h2:27017\" because it does not build indexes", - result.reason()); - - // Try to sync from a member that is down - receiveDownHeartbeat(HostAndPort("h4"), "rs0", OpTime()); - - BSONObjBuilder response7; - getTopoCoord().prepareSyncFromResponse( - cbData(), HostAndPort("h4"), ourOpTime, &response7, &result); - ASSERT_EQUALS(ErrorCodes::HostUnreachable, result); - ASSERT_EQUALS("I cannot reach the requested member: h4:27017", result.reason()); - - // Sync successfully from a member that is stale - heartbeatFromMember(HostAndPort("h5"), "rs0", MemberState::RS_SECONDARY, - staleOpTime, Milliseconds(100)); - - BSONObjBuilder response8; - getTopoCoord().prepareSyncFromResponse( - cbData(), HostAndPort("h5"), ourOpTime, &response8, &result); - ASSERT_OK(result); - ASSERT_EQUALS("requested member \"h5:27017\" is more than 10 seconds behind us", - response8.obj()["warning"].String()); - getTopoCoord().chooseNewSyncSource(now()++, ourOpTime); - ASSERT_EQUALS(HostAndPort("h5"), getTopoCoord().getSyncSourceAddress()); - - // Sync successfully from an up-to-date member - heartbeatFromMember(HostAndPort("h6"), "rs0", MemberState::RS_SECONDARY, - ourOpTime, Milliseconds(100)); - - BSONObjBuilder response9; - getTopoCoord().prepareSyncFromResponse( - cbData(), HostAndPort("h6"), ourOpTime, &response9, &result); - ASSERT_OK(result); - BSONObj response9Obj = response9.obj(); - ASSERT_FALSE(response9Obj.hasField("warning")); - ASSERT_EQUALS(HostAndPort("h5").toString(), response9Obj["prevSyncTarget"].String()); - getTopoCoord().chooseNewSyncSource(now()++, ourOpTime); - ASSERT_EQUALS(HostAndPort("h6"), getTopoCoord().getSyncSourceAddress()); - - // node goes down between forceSync and chooseNewSyncSource - BSONObjBuilder response10; - getTopoCoord().prepareSyncFromResponse( - cbData(), HostAndPort("h6"), ourOpTime, &response10, &result); - BSONObj response10Obj = response10.obj(); - ASSERT_FALSE(response10Obj.hasField("warning")); - ASSERT_EQUALS(HostAndPort("h6").toString(), response10Obj["prevSyncTarget"].String()); - receiveDownHeartbeat(HostAndPort("h6"), "rs0", OpTime()); - HostAndPort syncSource = getTopoCoord().chooseNewSyncSource(now()++, OpTime(0,0)); - ASSERT_EQUALS(HostAndPort("h6"), syncSource); - - // Try to sync from a member that is unauth'd - receiveDownHeartbeat(HostAndPort("h5"), "rs0", OpTime(), ErrorCodes::Unauthorized); - - BSONObjBuilder response11; - getTopoCoord().prepareSyncFromResponse( - cbData(), HostAndPort("h5"), ourOpTime, &response11, &result); - ASSERT_NOT_OK(result); - ASSERT_EQUALS(ErrorCodes::Unauthorized, result.code()); - ASSERT_EQUALS("not authorized to communicate with h5:27017", - result.reason()); - - // Sync successfully from an up-to-date member. 
- heartbeatFromMember(HostAndPort("h6"), "rs0", MemberState::RS_SECONDARY, - ourOpTime, Milliseconds(100)); - BSONObjBuilder response12; - getTopoCoord().prepareSyncFromResponse( - cbData(), HostAndPort("h6"), ourOpTime, &response12, &result); - ASSERT_OK(result); - syncSource = getTopoCoord().chooseNewSyncSource(now()++, OpTime(0,0)); - ASSERT_EQUALS(HostAndPort("h6"), syncSource); + int64_t countLogLinesContaining(const std::string& needle) { + return std::count_if(getCapturedLogMessages().begin(), + getCapturedLogMessages().end(), + stdx::bind(stringContains, stdx::placeholders::_1, needle)); } - TEST_F(TopoCoordTest, ReplSetGetStatus) { - // This test starts by configuring a TopologyCoordinator as a member of a 4 node replica - // set, with each node in a different state. - // The first node is DOWN, as if we tried heartbeating them and it failed in some way. - // The second node is in state SECONDARY, as if we've received a valid heartbeat from them. - // The third node is in state UNKNOWN, as if we've not yet had any heartbeating activity - // with them yet. The fourth node is PRIMARY and corresponds to ourself, which gets its - // information for replSetGetStatus from a different source than the nodes that aren't - // ourself. After this setup, we call prepareStatusResponse and make sure that the fields - // returned for each member match our expectations. - Date_t startupTime(100); - Date_t heartbeatTime = 5000; - Seconds uptimeSecs(10); - Date_t curTime = heartbeatTime + uptimeSecs.total_milliseconds(); - OpTime electionTime(1, 2); - OpTime oplogProgress(3, 4); - std::string setName = "mySet"; - - updateConfig(BSON("_id" << setName << - "version" << 1 << - "members" << BSON_ARRAY(BSON("_id" << 0 << "host" << "test0:1234") << - BSON("_id" << 1 << "host" << "test1:1234") << - BSON("_id" << 2 << "host" << "test2:1234") << - BSON("_id" << 3 << "host" << "test3:1234"))), - 3, - startupTime + 1); - - // Now that the replica set is setup, put the members into the states we want them in. - HostAndPort member = HostAndPort("test0:1234"); - StatusWith<ReplSetHeartbeatResponse> hbResponse = - StatusWith<ReplSetHeartbeatResponse>(Status(ErrorCodes::HostUnreachable, "")); - - getTopoCoord().prepareHeartbeatRequest(startupTime + 2, setName, member); - Date_t timeoutTime = startupTime + 2 + - ReplicaSetConfig::kDefaultHeartbeatTimeoutPeriod.total_milliseconds(); - getTopoCoord().processHeartbeatResponse(timeoutTime, - Milliseconds(5000), - member, - hbResponse, - OpTime(0,0)); - - member = HostAndPort("test1:1234"); - ReplSetHeartbeatResponse hb; - hb.setVersion(1); - hb.setState(MemberState::RS_SECONDARY); - hb.setElectionTime(electionTime); - hb.setHbMsg("READY"); - hb.setOpTime(oplogProgress); - hbResponse = StatusWith<ReplSetHeartbeatResponse>(hb); - getTopoCoord().prepareHeartbeatRequest(startupTime + 2, - setName, - member); - getTopoCoord().processHeartbeatResponse(heartbeatTime, - Milliseconds(4000), - member, - hbResponse, - OpTime(0,0)); - makeSelfPrimary(); - - // Now node 0 is down, node 1 is up, and for node 2 we have no heartbeat data yet. 
- BSONObjBuilder statusBuilder; - Status resultStatus(ErrorCodes::InternalError, "prepareStatusResponse didn't set result"); - getTopoCoord().prepareStatusResponse(cbData(), - curTime, - uptimeSecs.total_seconds(), - oplogProgress, - &statusBuilder, - &resultStatus); - ASSERT_OK(resultStatus); - BSONObj rsStatus = statusBuilder.obj(); - - // Test results for all non-self members - ASSERT_EQUALS(setName, rsStatus["set"].String()); - ASSERT_EQUALS(curTime.asInt64(), rsStatus["date"].Date().asInt64()); - std::vector<BSONElement> memberArray = rsStatus["members"].Array(); - ASSERT_EQUALS(4U, memberArray.size()); - BSONObj member0Status = memberArray[0].Obj(); - BSONObj member1Status = memberArray[1].Obj(); - BSONObj member2Status = memberArray[2].Obj(); - - // Test member 0, the node that's DOWN - ASSERT_EQUALS(0, member0Status["_id"].numberInt()); - ASSERT_EQUALS("test0:1234", member0Status["name"].str()); - ASSERT_EQUALS(0, member0Status["health"].numberDouble()); - ASSERT_EQUALS(MemberState::RS_DOWN, member0Status["state"].numberInt()); - ASSERT_EQUALS("(not reachable/healthy)", member0Status["stateStr"].str()); - ASSERT_EQUALS(0, member0Status["uptime"].numberInt()); - ASSERT_EQUALS(OpTime(), OpTime(member0Status["optime"].timestampValue())); - ASSERT_TRUE(member0Status.hasField("optimeDate")); - ASSERT_EQUALS(Date_t(OpTime().getSecs() * 1000ULL), - member0Status["optimeDate"].Date().millis); - ASSERT_EQUALS(timeoutTime, member0Status["lastHeartbeat"].date()); - ASSERT_EQUALS(Date_t(), member0Status["lastHeartbeatRecv"].date()); - - // Test member 1, the node that's SECONDARY - ASSERT_EQUALS(1, member1Status["_id"].Int()); - ASSERT_EQUALS("test1:1234", member1Status["name"].String()); - ASSERT_EQUALS(1, member1Status["health"].Double()); - ASSERT_EQUALS(MemberState::RS_SECONDARY, member1Status["state"].numberInt()); - ASSERT_EQUALS(MemberState(MemberState::RS_SECONDARY).toString(), - member1Status["stateStr"].String()); - ASSERT_EQUALS(uptimeSecs.total_seconds(), member1Status["uptime"].numberInt()); - ASSERT_EQUALS(oplogProgress, OpTime(member1Status["optime"].timestampValue())); - ASSERT_TRUE(member1Status.hasField("optimeDate")); - ASSERT_EQUALS(Date_t(oplogProgress.getSecs() * 1000ULL), - member1Status["optimeDate"].Date().millis); - ASSERT_EQUALS(heartbeatTime, member1Status["lastHeartbeat"].date()); - ASSERT_EQUALS(Date_t(), member1Status["lastHeartbeatRecv"].date()); - ASSERT_EQUALS("READY", member1Status["lastHeartbeatMessage"].str()); - - // Test member 2, the node that's UNKNOWN - ASSERT_EQUALS(2, member2Status["_id"].numberInt()); - ASSERT_EQUALS("test2:1234", member2Status["name"].str()); - ASSERT_EQUALS(-1, member2Status["health"].numberDouble()); - ASSERT_EQUALS(MemberState::RS_UNKNOWN, member2Status["state"].numberInt()); - ASSERT_EQUALS(MemberState(MemberState::RS_UNKNOWN).toString(), - member2Status["stateStr"].str()); - ASSERT_TRUE(member2Status.hasField("uptime")); - ASSERT_TRUE(member2Status.hasField("optime")); - ASSERT_TRUE(member2Status.hasField("optimeDate")); - ASSERT_FALSE(member2Status.hasField("lastHearbeat")); - ASSERT_FALSE(member2Status.hasField("lastHearbeatRecv")); - - // Now test results for ourself, the PRIMARY - ASSERT_EQUALS(MemberState::RS_PRIMARY, rsStatus["myState"].numberInt()); - BSONObj selfStatus = memberArray[3].Obj(); - ASSERT_TRUE(selfStatus["self"].boolean()); - ASSERT_EQUALS(3, selfStatus["_id"].numberInt()); - ASSERT_EQUALS("test3:1234", selfStatus["name"].str()); - ASSERT_EQUALS(1, selfStatus["health"].numberDouble()); - 
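These assertions lean on two different BSONObj accessors: the typed ones (Int(), Double(), String(), date()), which throw when the field is missing or has the wrong type, and hasField(), which is a pure existence check. A small sketch of both, buildable only inside the MongoDB source tree since it uses the real BSON library; the field values are illustrative:

#include "mongo/db/jsobj.h"  // BSONObj / BSONObjBuilder

using namespace mongo;

// Build a document shaped like one "members" entry of replSetGetStatus.
BSONObj exampleMemberStatus() {
    BSONObjBuilder bob;
    bob.append("_id", 1);
    bob.append("name", "test1:1234");
    bob.append("health", 1.0);
    bob.append("stateStr", "SECONDARY");
    return bob.obj();
}

void readBack() {
    BSONObj m = exampleMemberStatus();
    int id = m["_id"].Int();                // typed accessors throw on a
    double health = m["health"].Double();   // missing field or type mismatch...
    bool hasUptime = m.hasField("uptime");  // ...hasField() never does
    (void)id;
    (void)health;
    (void)hasUptime;
}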
ASSERT_EQUALS(MemberState::RS_PRIMARY, selfStatus["state"].numberInt()); - ASSERT_EQUALS(MemberState(MemberState::RS_PRIMARY).toString(), - selfStatus["stateStr"].str()); - ASSERT_EQUALS(uptimeSecs.total_seconds(), selfStatus["uptime"].numberInt()); - ASSERT_EQUALS(oplogProgress, OpTime(selfStatus["optime"].timestampValue())); - ASSERT_TRUE(selfStatus.hasField("optimeDate")); - ASSERT_EQUALS(Date_t(oplogProgress.getSecs() * 1000ULL), - selfStatus["optimeDate"].Date().millis); - - // TODO(spencer): Test electionTime and pingMs are set properly + void makeSelfPrimary(const OpTime& electionOpTime = OpTime(0, 0)) { + getTopoCoord().changeMemberState_forTest(MemberState::RS_PRIMARY, electionOpTime); + getTopoCoord()._setCurrentPrimaryForTest(_selfIndex); } - TEST_F(TopoCoordTest, ReplSetGetStatusFails) { - // This test starts by configuring a TopologyCoordinator to NOT be a member of a 3 node - // replica set. Then running prepareStatusResponse should fail. - Date_t startupTime(100); - Date_t heartbeatTime = 5000; - Seconds uptimeSecs(10); - Date_t curTime = heartbeatTime + uptimeSecs.total_milliseconds(); - OpTime oplogProgress(3, 4); - std::string setName = "mySet"; - - updateConfig(BSON("_id" << setName << - "version" << 1 << - "members" << BSON_ARRAY(BSON("_id" << 0 << "host" << "test0:1234") << - BSON("_id" << 1 << "host" << "test1:1234") << - BSON("_id" << 2 << "host" << "test2:1234"))), - -1, // This one is not part of the replica set. - startupTime + 1); - - BSONObjBuilder statusBuilder; - Status resultStatus(ErrorCodes::InternalError, "prepareStatusResponse didn't set result"); - getTopoCoord().prepareStatusResponse(cbData(), - curTime, - uptimeSecs.total_seconds(), - oplogProgress, - &statusBuilder, - &resultStatus); - ASSERT_NOT_OK(resultStatus); - ASSERT_EQUALS(ErrorCodes::InvalidReplicaSetConfig, resultStatus); + void setSelfMemberState(const MemberState& newState) { + getTopoCoord().changeMemberState_forTest(newState); } - TEST_F(TopoCoordTest, PrepareFreshResponse) { - ReplicationCoordinator::ReplSetFreshArgs args; - OpTime freshestOpTime(15, 10); - OpTime ourOpTime(10, 10); - OpTime staleOpTime(1, 1); - Status internalErrorStatus(ErrorCodes::InternalError, "didn't set status"); - - // if we do not have an index in the config, we should get ErrorCodes::ReplicaSetNotFound - BSONObjBuilder responseBuilder; - Status status = internalErrorStatus; - getTopoCoord().prepareFreshResponse(args, Date_t(), ourOpTime, &responseBuilder, &status); - ASSERT_EQUALS(ErrorCodes::ReplicaSetNotFound, status); - ASSERT_EQUALS("Cannot participate in elections because not initialized", status.reason()); - ASSERT_TRUE(responseBuilder.obj().isEmpty()); - - updateConfig(BSON("_id" << "rs0" << - "version" << 10 << - "members" << BSON_ARRAY( - BSON("_id" << 10 << - "host" << "hself" << - "priority" << 10) << - BSON("_id" << 20 << "host" << "h1") << - BSON("_id" << 30 << "host" << "h2") << - BSON("_id" << 40 << - "host" << "h3" << - "priority" << 10))), - 0); - - // Test with incorrect replset name - args.setName = "fakeset"; - - BSONObjBuilder responseBuilder0; - Status status0 = internalErrorStatus; - getTopoCoord().prepareFreshResponse(args, Date_t(), ourOpTime, &responseBuilder0, &status0); - ASSERT_EQUALS(ErrorCodes::ReplicaSetNotFound, status0); - ASSERT_TRUE(responseBuilder0.obj().isEmpty()); - - heartbeatFromMember(HostAndPort("h1"), "rs0", MemberState::RS_SECONDARY, ourOpTime); - - // Test with old config version - args.setName = "rs0"; - args.cfgver = 5; - args.id = 20; - args.who = 
HostAndPort("h1"); - args.opTime = ourOpTime; - - BSONObjBuilder responseBuilder1; - Status status1 = internalErrorStatus; - getTopoCoord().prepareFreshResponse(args, Date_t(), ourOpTime, &responseBuilder1, &status1); - ASSERT_OK(status1); - BSONObj response1 = responseBuilder1.obj(); - ASSERT_EQUALS("config version stale", response1["info"].String()); - ASSERT_EQUALS(ourOpTime, OpTime(response1["opTime"].timestampValue())); - ASSERT_TRUE(response1["fresher"].Bool()); - ASSERT_FALSE(response1["veto"].Bool()); - ASSERT_FALSE(response1.hasField("errmsg")); - - // Test with non-existent node. - args.cfgver = 10; - args.id = 0; - args.who = HostAndPort("fakenode"); - - BSONObjBuilder responseBuilder2; - Status status2 = internalErrorStatus; - getTopoCoord().prepareFreshResponse(args, Date_t(), ourOpTime, &responseBuilder2, &status2); - ASSERT_OK(status2); - BSONObj response2 = responseBuilder2.obj(); - ASSERT_EQUALS(ourOpTime, OpTime(response2["opTime"].timestampValue())); - ASSERT_FALSE(response2["fresher"].Bool()); - ASSERT_TRUE(response2["veto"].Bool()); - ASSERT_EQUALS("replSet couldn't find member with id 0", response2["errmsg"].String()); - - - // Test when we are primary. - args.id = 20; - args.who = HostAndPort("h1"); - - makeSelfPrimary(); - - BSONObjBuilder responseBuilder3; - Status status3 = internalErrorStatus; - getTopoCoord().prepareFreshResponse(args, Date_t(), ourOpTime, &responseBuilder3, &status3); - ASSERT_OK(status3); - BSONObj response3 = responseBuilder3.obj(); - ASSERT_FALSE(response3.hasField("info")); - ASSERT_EQUALS(ourOpTime, OpTime(response3["opTime"].timestampValue())); - ASSERT_FALSE(response3["fresher"].Bool()); - ASSERT_TRUE(response3["veto"].Bool()); - ASSERT_EQUALS("I am already primary, h1:27017 can try again once I've stepped down", - response3["errmsg"].String()); - - - // Test when someone else is primary. - heartbeatFromMember(HostAndPort("h2"), "rs0", MemberState::RS_SECONDARY, ourOpTime); - setSelfMemberState(MemberState::RS_SECONDARY); - getTopoCoord()._setCurrentPrimaryForTest(2); - - BSONObjBuilder responseBuilder4; - Status status4 = internalErrorStatus; - getTopoCoord().prepareFreshResponse(args, Date_t(), ourOpTime, &responseBuilder4, &status4); - ASSERT_OK(status4); - BSONObj response4 = responseBuilder4.obj(); - ASSERT_FALSE(response4.hasField("info")); - ASSERT_EQUALS(ourOpTime, OpTime(response4["opTime"].timestampValue())); - ASSERT_FALSE(response4["fresher"].Bool()); - ASSERT_TRUE(response4["veto"].Bool()); - ASSERT_EQUALS( - "h1:27017 is trying to elect itself but h2:27017 is already primary and more " - "up-to-date", - response4["errmsg"].String()); - - - // Test trying to elect a node that is caught up but isn't the highest priority node. 
- heartbeatFromMember(HostAndPort("h1"), "rs0", MemberState::RS_SECONDARY, ourOpTime); - heartbeatFromMember(HostAndPort("h2"), "rs0", MemberState::RS_SECONDARY, staleOpTime); - heartbeatFromMember(HostAndPort("h3"), "rs0", MemberState::RS_SECONDARY, ourOpTime); - - BSONObjBuilder responseBuilder5; - Status status5 = internalErrorStatus; - getTopoCoord().prepareFreshResponse(args, Date_t(), ourOpTime, &responseBuilder5, &status5); - ASSERT_OK(status5); - BSONObj response5 = responseBuilder5.obj(); - ASSERT_FALSE(response5.hasField("info")); - ASSERT_EQUALS(ourOpTime, OpTime(response5["opTime"].timestampValue())); - ASSERT_FALSE(response5["fresher"].Bool()); - ASSERT_TRUE(response5["veto"].Bool()); - ASSERT(response5["errmsg"].String().find("h1:27017 has lower priority of 1 than") != - std::string::npos) << response5["errmsg"].String(); - - // Test trying to elect a node that isn't electable because its down - args.id = 40; - args.who = HostAndPort("h3"); - - receiveDownHeartbeat(HostAndPort("h3"), "rs0", OpTime()); - - BSONObjBuilder responseBuilder6; - Status status6 = internalErrorStatus; - getTopoCoord().prepareFreshResponse(args, Date_t(), ourOpTime, &responseBuilder6, &status6); - ASSERT_OK(status6); - BSONObj response6 = responseBuilder6.obj(); - ASSERT_FALSE(response6.hasField("info")); - ASSERT_EQUALS(ourOpTime, OpTime(response6["opTime"].timestampValue())); - ASSERT_FALSE(response6["fresher"].Bool()); - ASSERT_TRUE(response6["veto"].Bool()); - ASSERT_NE(std::string::npos, response6["errmsg"].String().find( - "I don't think h3:27017 is electable because the member is not " - "currently a secondary")) << response6["errmsg"].String(); - - // Test trying to elect a node that isn't electable because it's PRIMARY - ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); - heartbeatFromMember(HostAndPort("h3"), "rs0", MemberState::RS_PRIMARY, ourOpTime); - ASSERT_EQUALS(3, getCurrentPrimaryIndex()); - - BSONObjBuilder responseBuilder7; - Status status7 = internalErrorStatus; - getTopoCoord().prepareFreshResponse(args, Date_t(), ourOpTime, &responseBuilder7, &status7); - ASSERT_OK(status7); - BSONObj response7 = responseBuilder7.obj(); - ASSERT_FALSE(response7.hasField("info")); - ASSERT_EQUALS(ourOpTime, OpTime(response7["opTime"].timestampValue())); - ASSERT_FALSE(response7["fresher"].Bool()); - ASSERT_TRUE(response7["veto"].Bool()); - ASSERT_NE(std::string::npos, response7["errmsg"].String().find( - "I don't think h3:27017 is electable because the member is not " - "currently a secondary")) << response7["errmsg"].String(); - - // Test trying to elect a node that isn't electable because it's STARTUP - heartbeatFromMember(HostAndPort("h3"), "rs0", MemberState::RS_STARTUP, ourOpTime); - - BSONObjBuilder responseBuilder8; - Status status8 = internalErrorStatus; - getTopoCoord().prepareFreshResponse(args, Date_t(), ourOpTime, &responseBuilder8, &status8); - ASSERT_OK(status8); - BSONObj response8 = responseBuilder8.obj(); - ASSERT_FALSE(response8.hasField("info")); - ASSERT_EQUALS(ourOpTime, OpTime(response8["opTime"].timestampValue())); - ASSERT_FALSE(response8["fresher"].Bool()); - ASSERT_TRUE(response8["veto"].Bool()); - ASSERT_NE(std::string::npos, response8["errmsg"].String().find( - "I don't think h3:27017 is electable because the member is not " - "currently a secondary")) << response8["errmsg"].String(); - - // Test trying to elect a node that isn't electable because it's RECOVERING - heartbeatFromMember(HostAndPort("h3"), "rs0", MemberState::RS_RECOVERING, ourOpTime); - - BSONObjBuilder 
responseBuilder9; - Status status9 = internalErrorStatus; - getTopoCoord().prepareFreshResponse(args, Date_t(), ourOpTime, &responseBuilder9, &status9); - ASSERT_OK(status9); - BSONObj response9 = responseBuilder9.obj(); - ASSERT_FALSE(response9.hasField("info")); - ASSERT_EQUALS(ourOpTime, OpTime(response9["opTime"].timestampValue())); - ASSERT_FALSE(response9["fresher"].Bool()); - ASSERT_TRUE(response9["veto"].Bool()); - ASSERT_NE(std::string::npos, response9["errmsg"].String().find( - "I don't think h3:27017 is electable because the member is not " - "currently a secondary")) << response9["errmsg"].String(); - - // Test trying to elect a node that is fresher but lower priority than the existing primary - args.id = 30; - args.who = HostAndPort("h2"); - - ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); - heartbeatFromMember(HostAndPort("h3"), "rs0", MemberState::RS_PRIMARY, ourOpTime); - ASSERT_EQUALS(3, getCurrentPrimaryIndex()); - heartbeatFromMember(HostAndPort("h2"), "rs0", MemberState::RS_SECONDARY, freshestOpTime); - - BSONObjBuilder responseBuilder10; - Status status10 = internalErrorStatus; - getTopoCoord().prepareFreshResponse(args, Date_t(), ourOpTime, &responseBuilder10, &status10); - ASSERT_OK(status10); - BSONObj response10 = responseBuilder10.obj(); - ASSERT_FALSE(response10.hasField("info")); - ASSERT_EQUALS(ourOpTime, OpTime(response10["opTime"].timestampValue())); - ASSERT_TRUE(response10["fresher"].Bool()); - ASSERT_TRUE(response10["veto"].Bool()); - ASSERT_TRUE(response10.hasField("errmsg")); - - - // Test trying to elect a valid node - args.id = 40; - args.who = HostAndPort("h3"); - - receiveDownHeartbeat(HostAndPort("h2"), "rs0", OpTime()); - heartbeatFromMember(HostAndPort("h3"), "rs0", MemberState::RS_SECONDARY, ourOpTime); - - BSONObjBuilder responseBuilder11; - Status status11 = internalErrorStatus; - getTopoCoord().prepareFreshResponse( - args, Date_t(), ourOpTime, &responseBuilder11, &status11); - ASSERT_OK(status11); - BSONObj response11 = responseBuilder11.obj(); - ASSERT_FALSE(response11.hasField("info")) << response11.toString(); - ASSERT_EQUALS(ourOpTime, OpTime(response11["opTime"].timestampValue())); - ASSERT_FALSE(response11["fresher"].Bool()) << response11.toString(); - ASSERT_FALSE(response11["veto"].Bool()) << response11.toString(); - ASSERT_FALSE(response11.hasField("errmsg")) << response11.toString(); - - // Test with our id - args.id = 10; - BSONObjBuilder responseBuilder12; - Status status12 = internalErrorStatus; - getTopoCoord().prepareFreshResponse( - args, Date_t(), ourOpTime, &responseBuilder12, &status12); - ASSERT_EQUALS(ErrorCodes::BadValue, status12); - ASSERT_EQUALS( - "Received replSetFresh command from member with the same member ID as ourself: 10", - status12.reason()); - ASSERT_TRUE(responseBuilder12.obj().isEmpty()); - + int getCurrentPrimaryIndex() { + return getTopoCoord().getCurrentPrimaryIndex(); } - - class HeartbeatResponseTest : public TopoCoordTest { - public: - - virtual void setUp() { - TopoCoordTest::setUp(); - updateConfig(BSON("_id" << "rs0" << - "version" << 5 << - "members" << BSON_ARRAY( - BSON("_id" << 0 << "host" << "host1:27017") << - BSON("_id" << 1 << "host" << "host2:27017") << - BSON("_id" << 2 << "host" << "host3:27017")) << - "settings" << BSON("heartbeatTimeoutSecs" << 5)), - 0); - } - - }; - - class HeartbeatResponseTestOneRetry : public HeartbeatResponseTest { - public: - virtual void setUp() { - HeartbeatResponseTest::setUp(); - - // Bring up the node we are heartbeating. 
- _target = HostAndPort("host2", 27017); - Date_t _upRequestDate = unittest::assertGet(dateFromISOString("2014-08-29T12:55Z")); - std::pair<ReplSetHeartbeatArgs, Milliseconds> uppingRequest = - getTopoCoord().prepareHeartbeatRequest(_upRequestDate, - "rs0", - _target); - HeartbeatResponseAction upAction = - getTopoCoord().processHeartbeatResponse( - _upRequestDate, - Milliseconds(0), - _target, - StatusWith<ReplSetHeartbeatResponse>(Status::OK()), - OpTime(0, 0)); // We've never applied anything. - ASSERT_EQUALS(HeartbeatResponseAction::NoAction, upAction.getAction()); - ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); - - - // Time of first request for this heartbeat period - _firstRequestDate = unittest::assertGet(dateFromISOString("2014-08-29T13:00Z")); - - // Initial heartbeat attempt prepared, at t + 0. - std::pair<ReplSetHeartbeatArgs, Milliseconds> request = - getTopoCoord().prepareHeartbeatRequest(_firstRequestDate, - "rs0", - _target); - // 5 seconds to successfully complete the heartbeat before the timeout expires. - ASSERT_EQUALS(5000, request.second.total_milliseconds()); - - // Initial heartbeat request fails at t + 4000ms - HeartbeatResponseAction action = - getTopoCoord().processHeartbeatResponse( - _firstRequestDate + 4000, // 4 seconds elapsed, retry allowed. - Milliseconds(3990), // Spent 3.99 of the 4 seconds in the network. - _target, - StatusWith<ReplSetHeartbeatResponse>(ErrorCodes::ExceededTimeLimit, - "Took too long"), - OpTime(0, 0)); // We've never applied anything. - - ASSERT_EQUALS(HeartbeatResponseAction::NoAction, action.getAction()); - ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); - // Because the heartbeat failed without timing out, we expect to retry immediately. - ASSERT_EQUALS(Date_t(_firstRequestDate + 4000), action.getNextHeartbeatStartDate()); - - // First heartbeat retry prepared, at t + 4000ms. - request = - getTopoCoord().prepareHeartbeatRequest( - _firstRequestDate + 4000, - "rs0", - _target); - // One second left to complete the heartbeat. 
- ASSERT_EQUALS(1000, request.second.total_milliseconds()); - - // Ensure a single failed heartbeat did not cause the node to be marked down - BSONObjBuilder statusBuilder; - Status resultStatus(ErrorCodes::InternalError, - "prepareStatusResponse didn't set result"); - getTopoCoord().prepareStatusResponse(cbData(), - _firstRequestDate + 4000, - 10, - OpTime(100,0), - &statusBuilder, - &resultStatus); - ASSERT_OK(resultStatus); - BSONObj rsStatus = statusBuilder.obj(); - std::vector<BSONElement> memberArray = rsStatus["members"].Array(); - BSONObj member1Status = memberArray[1].Obj(); - - ASSERT_EQUALS(1, member1Status["_id"].Int()); - ASSERT_EQUALS(1, member1Status["health"].Double()); - - } - - Date_t firstRequestDate() { - return _firstRequestDate; + // Update config and set selfIndex + // If "now" is passed in, set _now to now+1 + void updateConfig(BSONObj cfg, + int selfIndex, + Date_t now = Date_t(-1), + OpTime lastOp = OpTime()) { + ReplicaSetConfig config; + ASSERT_OK(config.initialize(cfg)); + ASSERT_OK(config.validate()); + + _selfIndex = selfIndex; + + if (now == Date_t(-1)) { + getTopoCoord().updateConfig(config, selfIndex, _now++, lastOp); + } else { + invariant(now > _now); + getTopoCoord().updateConfig(config, selfIndex, now, lastOp); + _now = now + 1; } - - HostAndPort target() { - return _target; - } - - private: - Date_t _firstRequestDate; - HostAndPort _target; - - }; - - class HeartbeatResponseTestTwoRetries : public HeartbeatResponseTestOneRetry { - public: - virtual void setUp() { - HeartbeatResponseTestOneRetry::setUp(); - // First retry fails at t + 4500ms - HeartbeatResponseAction action = - getTopoCoord().processHeartbeatResponse( - firstRequestDate() + 4500, // 4.5 of the 5 seconds elapsed; could retry. - Milliseconds(400), // Spent 0.4 of the 0.5 seconds in the network. - target(), - StatusWith<ReplSetHeartbeatResponse>(ErrorCodes::NodeNotFound, "Bad DNS?"), - OpTime(0, 0)); // We've never applied anything. - ASSERT_EQUALS(HeartbeatResponseAction::NoAction, action.getAction()); - ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); - // Because the first retry failed without timing out, we expect to retry immediately. - ASSERT_EQUALS(Date_t(firstRequestDate() + 4500), action.getNextHeartbeatStartDate()); - - // Second retry prepared at t + 4500ms. - std::pair<ReplSetHeartbeatArgs, Milliseconds> request = - getTopoCoord().prepareHeartbeatRequest( - firstRequestDate() + 4500, - "rs0", - target()); - // 500ms left to complete the heartbeat. 
- ASSERT_EQUALS(500, request.second.total_milliseconds()); - - // Ensure a second failed heartbeat did not cause the node to be marked down - BSONObjBuilder statusBuilder; - Status resultStatus(ErrorCodes::InternalError, - "prepareStatusResponse didn't set result"); - getTopoCoord().prepareStatusResponse(cbData(), - firstRequestDate() + 4000, - 10, - OpTime(100,0), - &statusBuilder, - &resultStatus); - ASSERT_OK(resultStatus); - BSONObj rsStatus = statusBuilder.obj(); - std::vector<BSONElement> memberArray = rsStatus["members"].Array(); - BSONObj member1Status = memberArray[1].Obj(); - - ASSERT_EQUALS(1, member1Status["_id"].Int()); - ASSERT_EQUALS(1, member1Status["health"].Double()); - } - }; - - class HeartbeatResponseHighVerbosityTest : public HeartbeatResponseTest { - public: - - virtual void setUp() { - HeartbeatResponseTest::setUp(); - // set verbosity as high as the highest verbosity log message we'd like to check for - logger::globalLogDomain()->setMinimumLoggedSeverity(logger::LogSeverity::Debug(3)); - } - - virtual void tearDown() { - HeartbeatResponseTest::tearDown(); - logger::globalLogDomain()->setMinimumLoggedSeverity(logger::LogSeverity::Log()); - } - - }; - - TEST_F(HeartbeatResponseHighVerbosityTest, UpdateHeartbeatDataNodeBelievesWeAreDown) { - OpTime lastOpTimeApplied = OpTime(3,0); - - // request heartbeat - std::pair<ReplSetHeartbeatArgs, Milliseconds> request = - getTopoCoord().prepareHeartbeatRequest(now()++, "rs0", HostAndPort("host2")); - - ReplSetHeartbeatResponse believesWeAreDownResponse; - believesWeAreDownResponse.noteReplSet(); - believesWeAreDownResponse.setSetName("rs0"); - believesWeAreDownResponse.setState(MemberState::RS_SECONDARY); - believesWeAreDownResponse.setElectable(true); - believesWeAreDownResponse.noteStateDisagreement(); - startCapturingLogMessages(); - HeartbeatResponseAction action = getTopoCoord().processHeartbeatResponse( - now()++, // Time is left. - Milliseconds(400), // Spent 0.4 of the 0.5 second in the network. - HostAndPort("host2"), - StatusWith<ReplSetHeartbeatResponse>(believesWeAreDownResponse), - lastOpTimeApplied); - stopCapturingLogMessages(); - ASSERT_NO_ACTION(action.getAction()); - ASSERT_EQUALS(1, countLogLinesContaining("host2:27017 thinks that we are down")); - } - TEST_F(HeartbeatResponseHighVerbosityTest, UpdateHeartbeatDataMemberNotInConfig) { - OpTime lastOpTimeApplied = OpTime(3,0); - - // request heartbeat - std::pair<ReplSetHeartbeatArgs, Milliseconds> request = - getTopoCoord().prepareHeartbeatRequest(now()++, "rs0", HostAndPort("host5")); - - ReplSetHeartbeatResponse memberMissingResponse; - memberMissingResponse.noteReplSet(); - memberMissingResponse.setSetName("rs0"); - memberMissingResponse.setState(MemberState::RS_SECONDARY); - memberMissingResponse.setElectable(true); - memberMissingResponse.noteStateDisagreement(); - startCapturingLogMessages(); - HeartbeatResponseAction action = getTopoCoord().processHeartbeatResponse( - now()++, // Time is left. - Milliseconds(400), // Spent 0.4 of the 0.5 second in the network. 
- HostAndPort("host5"), - StatusWith<ReplSetHeartbeatResponse>(memberMissingResponse), - lastOpTimeApplied); - stopCapturingLogMessages(); - ASSERT_NO_ACTION(action.getAction()); - ASSERT_EQUALS(1, countLogLinesContaining("Could not find host5:27017 in current config")); + HeartbeatResponseAction receiveUpHeartbeat(const HostAndPort& member, + const std::string& setName, + MemberState memberState, + OpTime electionTime, + OpTime lastOpTimeSender, + OpTime lastOpTimeReceiver) { + return _receiveHeartbeatHelper(Status::OK(), + member, + setName, + memberState, + electionTime, + lastOpTimeSender, + lastOpTimeReceiver, + Milliseconds(1)); } - TEST_F(HeartbeatResponseHighVerbosityTest, UpdateHeartbeatDataSameConfig) { - OpTime lastOpTimeApplied = OpTime(3,0); - - // request heartbeat - std::pair<ReplSetHeartbeatArgs, Milliseconds> request = - getTopoCoord().prepareHeartbeatRequest(now()++, "rs0", HostAndPort("host2")); - - // construct a copy of the original config for log message checking later - // see HeartbeatResponseTest for the origin of the original config - ReplicaSetConfig originalConfig; - originalConfig.initialize(BSON("_id" << "rs0" << - "version" << 5 << - "members" << BSON_ARRAY( - BSON("_id" << 0 << "host" << "host1:27017") << - BSON("_id" << 1 << "host" << "host2:27017") << - BSON("_id" << 2 << "host" << "host3:27017")) << - "settings" << BSON("heartbeatTimeoutSecs" << 5))); - - ReplSetHeartbeatResponse sameConfigResponse; - sameConfigResponse.noteReplSet(); - sameConfigResponse.setSetName("rs0"); - sameConfigResponse.setState(MemberState::RS_SECONDARY); - sameConfigResponse.setElectable(true); - sameConfigResponse.noteStateDisagreement(); - sameConfigResponse.setVersion(2); - sameConfigResponse.setConfig(originalConfig); - startCapturingLogMessages(); - HeartbeatResponseAction action = getTopoCoord().processHeartbeatResponse( - now()++, // Time is left. - Milliseconds(400), // Spent 0.4 of the 0.5 second in the network. - HostAndPort("host2"), - StatusWith<ReplSetHeartbeatResponse>(sameConfigResponse), - lastOpTimeApplied); - stopCapturingLogMessages(); - ASSERT_NO_ACTION(action.getAction()); - ASSERT_EQUALS(1, countLogLinesContaining("Config from heartbeat response was " - "same as ours.")); + HeartbeatResponseAction receiveDownHeartbeat( + const HostAndPort& member, + const std::string& setName, + OpTime lastOpTimeReceiver, + ErrorCodes::Error errcode = ErrorCodes::HostUnreachable) { + // timed out heartbeat to mark a node as down + + Milliseconds roundTripTime( + ReplicaSetConfig::kDefaultHeartbeatTimeoutPeriod.total_milliseconds()); + return _receiveHeartbeatHelper(Status(errcode, ""), + member, + setName, + MemberState::RS_UNKNOWN, + OpTime(), + OpTime(), + lastOpTimeReceiver, + roundTripTime); } - TEST_F(HeartbeatResponseHighVerbosityTest, UpdateHeartbeatDataOldConfig) { - OpTime lastOpTimeApplied = OpTime(3,0); - - // request heartbeat - std::pair<ReplSetHeartbeatArgs, Milliseconds> request = - getTopoCoord().prepareHeartbeatRequest(now()++, "rs0", HostAndPort("host2")); - - ReplSetHeartbeatResponse believesWeAreDownResponse; - believesWeAreDownResponse.noteReplSet(); - believesWeAreDownResponse.setSetName("rs0"); - believesWeAreDownResponse.setState(MemberState::RS_SECONDARY); - believesWeAreDownResponse.setElectable(true); - believesWeAreDownResponse.noteStateDisagreement(); - startCapturingLogMessages(); - HeartbeatResponseAction action = getTopoCoord().processHeartbeatResponse( - now()++, // Time is left. 
- Milliseconds(400), // Spent 0.4 of the 0.5 second in the network. - HostAndPort("host2"), - StatusWith<ReplSetHeartbeatResponse>(believesWeAreDownResponse), - lastOpTimeApplied); - stopCapturingLogMessages(); - ASSERT_NO_ACTION(action.getAction()); - ASSERT_EQUALS(1, countLogLinesContaining("host2:27017 thinks that we are down")); - - } - - TEST_F(HeartbeatResponseTestOneRetry, DecideToReconfig) { - // Confirm that action responses can come back from retries; in this, expect a Reconfig - // action. - ReplicaSetConfig newConfig; - ASSERT_OK(newConfig.initialize( - BSON("_id" << "rs0" << - "version" << 7 << - "members" << BSON_ARRAY( - BSON("_id" << 0 << "host" << "host1:27017") << - BSON("_id" << 1 << "host" << "host2:27017") << - BSON("_id" << 2 << "host" << "host3:27017") << - BSON("_id" << 3 << "host" << "host4:27017")) << - "settings" << BSON("heartbeatTimeoutSecs" << 5)))); - ASSERT_OK(newConfig.validate()); - - ReplSetHeartbeatResponse reconfigResponse; - reconfigResponse.noteReplSet(); - reconfigResponse.setSetName("rs0"); - reconfigResponse.setState(MemberState::RS_SECONDARY); - reconfigResponse.setElectable(true); - reconfigResponse.setVersion(7); - reconfigResponse.setConfig(newConfig); - HeartbeatResponseAction action = - getTopoCoord().processHeartbeatResponse( - firstRequestDate() + 4500, // Time is left. - Milliseconds(400), // Spent 0.4 of the 0.5 second in the network. - target(), - StatusWith<ReplSetHeartbeatResponse>(reconfigResponse), - OpTime(0, 0)); // We've never applied anything. - ASSERT_EQUALS(HeartbeatResponseAction::Reconfig, action.getAction()); - ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); - ASSERT_EQUALS(Date_t(firstRequestDate() + 6500), action.getNextHeartbeatStartDate()); - } - - TEST_F(HeartbeatResponseTestOneRetry, DecideToStepDownRemotePrimary) { - // Confirm that action responses can come back from retries; in this, expect a - // StepDownRemotePrimary action. - - // make self primary - ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); - makeSelfPrimary(OpTime(5,0)); - ASSERT_EQUALS(0, getCurrentPrimaryIndex()); - - ReplSetHeartbeatResponse electedMoreRecentlyResponse; - electedMoreRecentlyResponse.noteReplSet(); - electedMoreRecentlyResponse.setSetName("rs0"); - electedMoreRecentlyResponse.setState(MemberState::RS_PRIMARY); - electedMoreRecentlyResponse.setElectable(true); - electedMoreRecentlyResponse.setElectionTime(OpTime(3,0)); - electedMoreRecentlyResponse.setVersion(5); - HeartbeatResponseAction action = - getTopoCoord().processHeartbeatResponse( - firstRequestDate() + 4500, // Time is left. - Milliseconds(400), // Spent 0.4 of the 0.5 second in the network. - target(), - StatusWith<ReplSetHeartbeatResponse>(electedMoreRecentlyResponse), - OpTime(0,0)); // We've never applied anything. - ASSERT_EQUALS(HeartbeatResponseAction::StepDownRemotePrimary, action.getAction()); - ASSERT_EQUALS(1, action.getPrimaryConfigIndex()); - ASSERT_EQUALS(Date_t(firstRequestDate() + 6500), action.getNextHeartbeatStartDate()); - } - - TEST_F(HeartbeatResponseTestOneRetry, DecideToStepDownSelf) { - // Confirm that action responses can come back from retries; in this, expect a StepDownSelf - // action. 
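// Stepdown is two-phase here: processHeartbeatResponse() only records the
// decision, and the role actually flips when stepDownIfPending() runs. A
// minimal sketch of the pattern this test exercises (fixture helpers shown):
//   HeartbeatResponseAction a = getTopoCoord().processHeartbeatResponse(/*...*/);
//   if (a.getAction() == HeartbeatResponseAction::StepDownSelf) {
//       ASSERT_TRUE(getTopoCoord().stepDownIfPending());  // leader -> follower
//   }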
- - // acknowledge the other member so that we see a majority - HeartbeatResponseAction action = receiveDownHeartbeat(HostAndPort("host3"), - "rs0", - OpTime(100, 0)); - ASSERT_NO_ACTION(action.getAction()); - - // make us PRIMARY - makeSelfPrimary(); - - ReplSetHeartbeatResponse electedMoreRecentlyResponse; - electedMoreRecentlyResponse.noteReplSet(); - electedMoreRecentlyResponse.setSetName("rs0"); - electedMoreRecentlyResponse.setState(MemberState::RS_PRIMARY); - electedMoreRecentlyResponse.setElectable(false); - electedMoreRecentlyResponse.setElectionTime(OpTime(10,0)); - electedMoreRecentlyResponse.setVersion(5); - action = - getTopoCoord().processHeartbeatResponse( - firstRequestDate() + 4500, // Time is left. - Milliseconds(400), // Spent 0.4 of the 0.5 second in the network. - target(), - StatusWith<ReplSetHeartbeatResponse>(electedMoreRecentlyResponse), - OpTime(0, 0)); // We've never applied anything. - ASSERT_EQUALS(HeartbeatResponseAction::StepDownSelf, action.getAction()); - ASSERT_EQUALS(0, action.getPrimaryConfigIndex()); - ASSERT_EQUALS(Date_t(firstRequestDate() + 6500), action.getNextHeartbeatStartDate()); - // Doesn't actually do the stepdown until stepDownIfPending is called - ASSERT_TRUE(TopologyCoordinator::Role::leader == getTopoCoord().getRole()); - ASSERT_EQUALS(0, getCurrentPrimaryIndex()); - - ASSERT_TRUE(getTopoCoord().stepDownIfPending()); - ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); - ASSERT_EQUALS(1, getCurrentPrimaryIndex()); - } - - TEST_F(HeartbeatResponseTestOneRetry, DecideToStartElection) { - // Confirm that action responses can come back from retries; in this, expect a StartElection - // action. - - // acknowledge the other member so that we see a majority - OpTime election = OpTime(400,0); - OpTime lastOpTimeApplied = OpTime(300,0); - HeartbeatResponseAction action = receiveUpHeartbeat(HostAndPort("host3"), - "rs0", - MemberState::RS_SECONDARY, - election, - election, - lastOpTimeApplied); - ASSERT_NO_ACTION(action.getAction()); - - // make sure we are electable - setSelfMemberState(MemberState::RS_SECONDARY); - - ReplSetHeartbeatResponse startElectionResponse; - startElectionResponse.noteReplSet(); - startElectionResponse.setSetName("rs0"); - startElectionResponse.setState(MemberState::RS_SECONDARY); - startElectionResponse.setElectable(true); - startElectionResponse.setVersion(5); - action = - getTopoCoord().processHeartbeatResponse( - firstRequestDate() + 4500, // Time is left. - Milliseconds(400), // Spent 0.4 of the 0.5 second in the network. - target(), - StatusWith<ReplSetHeartbeatResponse>(startElectionResponse), - election); - ASSERT_EQUALS(HeartbeatResponseAction::StartElection, action.getAction()); - ASSERT_TRUE(TopologyCoordinator::Role::candidate == getTopoCoord().getRole()); - ASSERT_EQUALS(Date_t(firstRequestDate() + 6500), action.getNextHeartbeatStartDate()); + HeartbeatResponseAction heartbeatFromMember(const HostAndPort& member, + const std::string& setName, + MemberState memberState, + OpTime lastOpTimeSender, + Milliseconds roundTripTime = Milliseconds(1)) { + return _receiveHeartbeatHelper(Status::OK(), + member, + setName, + memberState, + OpTime(), + lastOpTimeSender, + OpTime(), + roundTripTime); } - TEST_F(HeartbeatResponseTestTwoRetries, HeartbeatRetriesAtMostTwice) { - // Confirm that the topology coordinator attempts to retry a failed heartbeat two times - // after initial failure, assuming that the heartbeat timeout (set to 5 seconds in the - // fixture) has not expired. 
- // - // Failed heartbeats propose taking no action, other than scheduling the next heartbeat. We - // can detect a retry vs the next regularly scheduled heartbeat because retries are - // scheduled immediately, while subsequent heartbeats are scheduled after the hard-coded - // heartbeat interval of 2 seconds. - - // Second retry fails at t + 4800ms - HeartbeatResponseAction action = - getTopoCoord().processHeartbeatResponse( - firstRequestDate() + 4800, // 4.8 of the 5 seconds elapsed; could still retry. - Milliseconds(100), // Spent 0.1 of the 0.3 seconds in the network. - target(), - StatusWith<ReplSetHeartbeatResponse>(ErrorCodes::NodeNotFound, "Bad DNS?"), - OpTime(0, 0)); // We've never applied anything. - ASSERT_EQUALS(HeartbeatResponseAction::NoAction, action.getAction()); - ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); - // Because this is the second retry, rather than retry again, we expect to wait for the - // heartbeat interval of 2 seconds to elapse. - ASSERT_EQUALS(Date_t(firstRequestDate() + 6800), action.getNextHeartbeatStartDate()); - - // Ensure a third failed heartbeat caused the node to be marked down - BSONObjBuilder statusBuilder; - Status resultStatus(ErrorCodes::InternalError, - "prepareStatusResponse didn't set result"); - getTopoCoord().prepareStatusResponse(cbData(), - firstRequestDate() + 4900, - 10, - OpTime(100,0), - &statusBuilder, - &resultStatus); - ASSERT_OK(resultStatus); - BSONObj rsStatus = statusBuilder.obj(); - std::vector<BSONElement> memberArray = rsStatus["members"].Array(); - BSONObj member1Status = memberArray[1].Obj(); - - ASSERT_EQUALS(1, member1Status["_id"].Int()); - ASSERT_EQUALS(0, member1Status["health"].Double()); +private: + HeartbeatResponseAction _receiveHeartbeatHelper(Status responseStatus, + const HostAndPort& member, + const std::string& setName, + MemberState memberState, + OpTime electionTime, + OpTime lastOpTimeSender, + OpTime lastOpTimeReceiver, + Milliseconds roundTripTime) { + StatusWith<ReplSetHeartbeatResponse> hbResponse = + StatusWith<ReplSetHeartbeatResponse>(responseStatus); + + if (responseStatus.isOK()) { + ReplSetHeartbeatResponse hb; + hb.setVersion(1); + hb.setState(memberState); + hb.setOpTime(lastOpTimeSender); + hb.setElectionTime(electionTime); + hbResponse = StatusWith<ReplSetHeartbeatResponse>(hb); + } + getTopoCoord().prepareHeartbeatRequest(now(), setName, member); + now() += roundTripTime.total_milliseconds(); + return getTopoCoord().processHeartbeatResponse( + now(), roundTripTime, member, hbResponse, lastOpTimeReceiver); } - TEST_F(HeartbeatResponseTestTwoRetries, DecideToStepDownRemotePrimary) { - // Confirm that action responses can come back from retries; in this, expect a - // StepDownRemotePrimary action. - - // make self primary - ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); - makeSelfPrimary(OpTime(5,0)); - ASSERT_EQUALS(0, getCurrentPrimaryIndex()); - - ReplSetHeartbeatResponse electedMoreRecentlyResponse; - electedMoreRecentlyResponse.noteReplSet(); - electedMoreRecentlyResponse.setSetName("rs0"); - electedMoreRecentlyResponse.setState(MemberState::RS_PRIMARY); - electedMoreRecentlyResponse.setElectable(true); - electedMoreRecentlyResponse.setElectionTime(OpTime(3,0)); - electedMoreRecentlyResponse.setVersion(5); - HeartbeatResponseAction action = - getTopoCoord().processHeartbeatResponse( - firstRequestDate() + 5000, // Time is left. - Milliseconds(400), // Spent 0.4 of the 0.5 second in the network. 
- target(), - StatusWith<ReplSetHeartbeatResponse>(electedMoreRecentlyResponse), - OpTime(0,0)); // We've never applied anything. - ASSERT_EQUALS(HeartbeatResponseAction::StepDownRemotePrimary, action.getAction()); - ASSERT_EQUALS(1, action.getPrimaryConfigIndex()); - ASSERT_EQUALS(Date_t(firstRequestDate() + 7000), action.getNextHeartbeatStartDate()); +private: + scoped_ptr<TopologyCoordinatorImpl> _topo; + scoped_ptr<ReplicationExecutor::CallbackData> _cbData; + Date_t _now; + int _selfIndex; +}; + +TEST_F(TopoCoordTest, ChooseSyncSourceBasic) { + // if we do not have an index in the config, we should get an empty syncsource + HostAndPort newSyncSource = getTopoCoord().chooseNewSyncSource(now()++, OpTime(0, 0)); + ASSERT_TRUE(newSyncSource.empty()); + + updateConfig(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 10 << "host" + << "hself") + << BSON("_id" << 20 << "host" + << "h2") << BSON("_id" << 30 << "host" + << "h3"))), + 0); + setSelfMemberState(MemberState::RS_SECONDARY); + + // member h2 is the furthest ahead + heartbeatFromMember(HostAndPort("h2"), "rs0", MemberState::RS_SECONDARY, OpTime(1, 0)); + heartbeatFromMember(HostAndPort("h3"), "rs0", MemberState::RS_SECONDARY, OpTime(0, 0)); + + // We start with no sync source + ASSERT(getTopoCoord().getSyncSourceAddress().empty()); + + // Fail due to insufficient number of pings + newSyncSource = getTopoCoord().chooseNewSyncSource(now()++, OpTime(0, 0)); + ASSERT_EQUALS(getTopoCoord().getSyncSourceAddress(), newSyncSource); + ASSERT(getTopoCoord().getSyncSourceAddress().empty()); + + // Record 2nd round of pings to allow choosing a new sync source; all members equidistant + heartbeatFromMember(HostAndPort("h2"), "rs0", MemberState::RS_SECONDARY, OpTime(1, 0)); + heartbeatFromMember(HostAndPort("h3"), "rs0", MemberState::RS_SECONDARY, OpTime(0, 0)); + + // Should choose h2, since it is furthest ahead + newSyncSource = getTopoCoord().chooseNewSyncSource(now()++, OpTime(0, 0)); + ASSERT_EQUALS(getTopoCoord().getSyncSourceAddress(), newSyncSource); + ASSERT_EQUALS(HostAndPort("h2"), getTopoCoord().getSyncSourceAddress()); + + // h3 becomes further ahead, so it should be chosen + heartbeatFromMember(HostAndPort("h3"), "rs0", MemberState::RS_SECONDARY, OpTime(2, 0)); + getTopoCoord().chooseNewSyncSource(now()++, OpTime(0, 0)); + ASSERT_EQUALS(HostAndPort("h3"), getTopoCoord().getSyncSourceAddress()); + + // h3 becomes an invalid candidate for sync source; should choose h2 again + heartbeatFromMember(HostAndPort("h3"), "rs0", MemberState::RS_RECOVERING, OpTime(2, 0)); + getTopoCoord().chooseNewSyncSource(now()++, OpTime(0, 0)); + ASSERT_EQUALS(HostAndPort("h2"), getTopoCoord().getSyncSourceAddress()); + + // h3 back in SECONDARY and ahead + heartbeatFromMember(HostAndPort("h3"), "rs0", MemberState::RS_SECONDARY, OpTime(2, 0)); + getTopoCoord().chooseNewSyncSource(now()++, OpTime(0, 0)); + ASSERT_EQUALS(HostAndPort("h3"), getTopoCoord().getSyncSourceAddress()); + + // h3 goes down + receiveDownHeartbeat(HostAndPort("h3"), "rs0", OpTime()); + getTopoCoord().chooseNewSyncSource(now()++, OpTime(0, 0)); + ASSERT_EQUALS(HostAndPort("h2"), getTopoCoord().getSyncSourceAddress()); + + // h3 back up and ahead + heartbeatFromMember(HostAndPort("h3"), "rs0", MemberState::RS_SECONDARY, OpTime(2, 0)); + getTopoCoord().chooseNewSyncSource(now()++, OpTime(0, 0)); + ASSERT_EQUALS(HostAndPort("h3"), getTopoCoord().getSyncSourceAddress()); +} + +TEST_F(TopoCoordTest, ChooseSyncSourceCandidates) { + updateConfig(BSON("_id" + 
<< "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "hself") + << BSON("_id" << 10 << "host" + << "h1") + << BSON("_id" << 20 << "host" + << "h2" + << "buildIndexes" << false << "priority" << 0) + << BSON("_id" << 30 << "host" + << "h3" + << "hidden" << true << "priority" << 0 << "votes" + << 0) << BSON("_id" << 40 << "host" + << "h4" + << "arbiterOnly" << true) + << BSON("_id" << 50 << "host" + << "h5" + << "slaveDelay" << 1 << "priority" << 0) + << BSON("_id" << 60 << "host" + << "h6") << BSON("_id" << 70 << "host" + << "hprimary"))), + 0); + + setSelfMemberState(MemberState::RS_SECONDARY); + OpTime lastOpTimeWeApplied = OpTime(100, 0); + + heartbeatFromMember( + HostAndPort("h1"), "rs0", MemberState::RS_SECONDARY, OpTime(501, 0), Milliseconds(700)); + heartbeatFromMember( + HostAndPort("h2"), "rs0", MemberState::RS_SECONDARY, OpTime(501, 0), Milliseconds(600)); + heartbeatFromMember( + HostAndPort("h3"), "rs0", MemberState::RS_SECONDARY, OpTime(501, 0), Milliseconds(500)); + heartbeatFromMember( + HostAndPort("h4"), "rs0", MemberState::RS_SECONDARY, OpTime(501, 0), Milliseconds(400)); + heartbeatFromMember( + HostAndPort("h5"), "rs0", MemberState::RS_SECONDARY, OpTime(501, 0), Milliseconds(300)); + + // This node is lagged further than maxSyncSourceLagSeconds. + heartbeatFromMember( + HostAndPort("h6"), "rs0", MemberState::RS_SECONDARY, OpTime(499, 0), Milliseconds(200)); + + ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); + heartbeatFromMember( + HostAndPort("hprimary"), "rs0", MemberState::RS_PRIMARY, OpTime(600, 0), Milliseconds(100)); + ASSERT_EQUALS(7, getCurrentPrimaryIndex()); + + // Record 2nd round of pings to allow choosing a new sync source + heartbeatFromMember( + HostAndPort("h1"), "rs0", MemberState::RS_SECONDARY, OpTime(501, 0), Milliseconds(700)); + heartbeatFromMember( + HostAndPort("h2"), "rs0", MemberState::RS_SECONDARY, OpTime(501, 0), Milliseconds(600)); + heartbeatFromMember( + HostAndPort("h3"), "rs0", MemberState::RS_SECONDARY, OpTime(501, 0), Milliseconds(500)); + heartbeatFromMember( + HostAndPort("h4"), "rs0", MemberState::RS_SECONDARY, OpTime(501, 0), Milliseconds(400)); + heartbeatFromMember( + HostAndPort("h5"), "rs0", MemberState::RS_SECONDARY, OpTime(501, 0), Milliseconds(300)); + heartbeatFromMember( + HostAndPort("h6"), "rs0", MemberState::RS_SECONDARY, OpTime(499, 0), Milliseconds(200)); + heartbeatFromMember( + HostAndPort("hprimary"), "rs0", MemberState::RS_PRIMARY, OpTime(600, 0), Milliseconds(100)); + + // Should choose primary first; it's closest + getTopoCoord().chooseNewSyncSource(now()++, lastOpTimeWeApplied); + ASSERT_EQUALS(HostAndPort("hprimary"), getTopoCoord().getSyncSourceAddress()); + + // Primary goes far far away + heartbeatFromMember(HostAndPort("hprimary"), + "rs0", + MemberState::RS_PRIMARY, + OpTime(600, 0), + Milliseconds(100000000)); + + // Should choose h4. 
(if an arbiter has an oplog, it's a valid sync source)
+    // h6 is not considered because it is outside the maxSyncSourceLagSeconds window.
+    getTopoCoord().chooseNewSyncSource(now()++, lastOpTimeWeApplied);
+    ASSERT_EQUALS(HostAndPort("h4"), getTopoCoord().getSyncSourceAddress());
+
+    // h4 goes down; should choose h1
+    receiveDownHeartbeat(HostAndPort("h4"), "rs0", OpTime());
+    getTopoCoord().chooseNewSyncSource(now()++, lastOpTimeWeApplied);
+    ASSERT_EQUALS(HostAndPort("h1"), getTopoCoord().getSyncSourceAddress());
+
+    // Primary and h1 go down; should choose h6
+    receiveDownHeartbeat(HostAndPort("h1"), "rs0", OpTime());
+    receiveDownHeartbeat(HostAndPort("hprimary"), "rs0", OpTime());
+    ASSERT_EQUALS(-1, getCurrentPrimaryIndex());
+    getTopoCoord().chooseNewSyncSource(now()++, lastOpTimeWeApplied);
+    ASSERT_EQUALS(HostAndPort("h6"), getTopoCoord().getSyncSourceAddress());
+
+    // h6 goes down; should choose h5
+    receiveDownHeartbeat(HostAndPort("h6"), "rs0", OpTime());
+    getTopoCoord().chooseNewSyncSource(now()++, lastOpTimeWeApplied);
+    ASSERT_EQUALS(HostAndPort("h5"), getTopoCoord().getSyncSourceAddress());
+
+    // h5 goes down; should choose h3
+    receiveDownHeartbeat(HostAndPort("h5"), "rs0", OpTime());
+    getTopoCoord().chooseNewSyncSource(now()++, lastOpTimeWeApplied);
+    ASSERT_EQUALS(HostAndPort("h3"), getTopoCoord().getSyncSourceAddress());
+
+    // h3 goes down; no sync source candidates remain
+    receiveDownHeartbeat(HostAndPort("h3"), "rs0", OpTime());
+    getTopoCoord().chooseNewSyncSource(now()++, lastOpTimeWeApplied);
+    ASSERT(getTopoCoord().getSyncSourceAddress().empty());
+}
+
+
+TEST_F(TopoCoordTest, ChooseSyncSourceChainingNotAllowed) {
+    updateConfig(BSON("_id"
+                      << "rs0"
+                      << "version" << 1 << "settings" << BSON("chainingAllowed" << false)
+                      << "members"
+                      << BSON_ARRAY(BSON("_id" << 10 << "host"
+                                               << "hself")
+                                    << BSON("_id" << 20 << "host"
+                                                  << "h2") << BSON("_id" << 30 << "host"
+                                                                          << "h3"))),
+                 0);
+
+    setSelfMemberState(MemberState::RS_SECONDARY);
+
+    heartbeatFromMember(
+        HostAndPort("h2"), "rs0", MemberState::RS_SECONDARY, OpTime(1, 0), Milliseconds(100));
+    heartbeatFromMember(
+        HostAndPort("h2"), "rs0", MemberState::RS_SECONDARY, OpTime(1, 0), Milliseconds(100));
+    heartbeatFromMember(
+        HostAndPort("h3"), "rs0", MemberState::RS_SECONDARY, OpTime(0, 0), Milliseconds(300));
+    heartbeatFromMember(
+        HostAndPort("h3"), "rs0", MemberState::RS_SECONDARY, OpTime(0, 0), Milliseconds(300));
+
+    // No primary situation: should choose no sync source.
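// The rule exercised below, roughly (a hedged sketch; 'config' and
// 'primaryIndex' stand in for the coordinator's internal state, not actual
// member names):
//   if (!config.isChainingAllowed() && primaryIndex == -1) {
//       return HostAndPort();  // no primary known, so nothing eligible
//   }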
+ getTopoCoord().chooseNewSyncSource(now()++, OpTime(0, 0)); + ASSERT(getTopoCoord().getSyncSourceAddress().empty()); + + // Add primary + ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); + heartbeatFromMember( + HostAndPort("h3"), "rs0", MemberState::RS_PRIMARY, OpTime(0, 0), Milliseconds(300)); + ASSERT_EQUALS(2, getCurrentPrimaryIndex()); + + // h3 is primary and should be chosen as sync source, despite being further away than h2 + // and the primary (h3) being behind our most recently applied optime + getTopoCoord().chooseNewSyncSource(now()++, OpTime(10, 0)); + ASSERT_EQUALS(HostAndPort("h3"), getTopoCoord().getSyncSourceAddress()); +} + +TEST_F(TopoCoordTest, EmptySyncSourceOnPrimary) { + updateConfig(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 10 << "host" + << "hself") + << BSON("_id" << 20 << "host" + << "h2") << BSON("_id" << 30 << "host" + << "h3"))), + 0); + + setSelfMemberState(MemberState::RS_SECONDARY); + + heartbeatFromMember( + HostAndPort("h2"), "rs0", MemberState::RS_SECONDARY, OpTime(1, 0), Milliseconds(100)); + heartbeatFromMember( + HostAndPort("h2"), "rs0", MemberState::RS_SECONDARY, OpTime(1, 0), Milliseconds(100)); + heartbeatFromMember( + HostAndPort("h3"), "rs0", MemberState::RS_SECONDARY, OpTime(0, 0), Milliseconds(300)); + heartbeatFromMember( + HostAndPort("h3"), "rs0", MemberState::RS_SECONDARY, OpTime(0, 0), Milliseconds(300)); + + // No primary situation: should choose h2 sync source. + getTopoCoord().chooseNewSyncSource(now()++, OpTime(0, 0)); + ASSERT_EQUALS(HostAndPort("h2"), getTopoCoord().getSyncSourceAddress()); + + // Become primary + makeSelfPrimary(OpTime(3.0)); + ASSERT_EQUALS(0, getCurrentPrimaryIndex()); + + // Check sync source + ASSERT_EQUALS(HostAndPort(), getTopoCoord().getSyncSourceAddress()); +} + +TEST_F(TopoCoordTest, ForceSyncSource) { + updateConfig(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 10 << "host" + << "hself") + << BSON("_id" << 20 << "host" + << "h2") << BSON("_id" << 30 << "host" + << "h3"))), + 0); + + setSelfMemberState(MemberState::RS_SECONDARY); + + // two rounds of heartbeat pings from each member + heartbeatFromMember( + HostAndPort("h2"), "rs0", MemberState::RS_SECONDARY, OpTime(1, 0), Milliseconds(300)); + heartbeatFromMember( + HostAndPort("h2"), "rs0", MemberState::RS_SECONDARY, OpTime(1, 0), Milliseconds(300)); + heartbeatFromMember( + HostAndPort("h3"), "rs0", MemberState::RS_SECONDARY, OpTime(2, 0), Milliseconds(100)); + heartbeatFromMember( + HostAndPort("h3"), "rs0", MemberState::RS_SECONDARY, OpTime(2, 0), Milliseconds(100)); + + // force should overrule other defaults + getTopoCoord().chooseNewSyncSource(now()++, OpTime(0, 0)); + ASSERT_EQUALS(HostAndPort("h3"), getTopoCoord().getSyncSourceAddress()); + getTopoCoord().setForceSyncSourceIndex(1); + // force should cause shouldChangeSyncSource() to return true + // even if the currentSource is the force target + ASSERT_TRUE(getTopoCoord().shouldChangeSyncSource(HostAndPort("h2"), now())); + ASSERT_TRUE(getTopoCoord().shouldChangeSyncSource(HostAndPort("h3"), now())); + getTopoCoord().chooseNewSyncSource(now()++, OpTime(0, 0)); + ASSERT_EQUALS(HostAndPort("h2"), getTopoCoord().getSyncSourceAddress()); + + // force should only work for one call to chooseNewSyncSource + getTopoCoord().chooseNewSyncSource(now()++, OpTime(0, 0)); + ASSERT_EQUALS(HostAndPort("h3"), getTopoCoord().getSyncSourceAddress()); +} + +TEST_F(TopoCoordTest, BlacklistSyncSource) { + updateConfig(BSON("_id" + << 
"rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 10 << "host" + << "hself") + << BSON("_id" << 20 << "host" + << "h2") << BSON("_id" << 30 << "host" + << "h3"))), + 0); + + setSelfMemberState(MemberState::RS_SECONDARY); + + heartbeatFromMember( + HostAndPort("h2"), "rs0", MemberState::RS_SECONDARY, OpTime(1, 0), Milliseconds(300)); + heartbeatFromMember( + HostAndPort("h2"), "rs0", MemberState::RS_SECONDARY, OpTime(1, 0), Milliseconds(300)); + heartbeatFromMember( + HostAndPort("h3"), "rs0", MemberState::RS_SECONDARY, OpTime(2, 0), Milliseconds(100)); + heartbeatFromMember( + HostAndPort("h3"), "rs0", MemberState::RS_SECONDARY, OpTime(2, 0), Milliseconds(100)); + + getTopoCoord().chooseNewSyncSource(now()++, OpTime(0, 0)); + ASSERT_EQUALS(HostAndPort("h3"), getTopoCoord().getSyncSourceAddress()); + + Date_t expireTime = 1000; + getTopoCoord().blacklistSyncSource(HostAndPort("h3"), expireTime); + getTopoCoord().chooseNewSyncSource(now()++, OpTime(0, 0)); + // Should choose second best choice now that h3 is blacklisted. + ASSERT_EQUALS(HostAndPort("h2"), getTopoCoord().getSyncSourceAddress()); + + // After time has passed, should go back to original sync source + getTopoCoord().chooseNewSyncSource(expireTime, OpTime(0, 0)); + ASSERT_EQUALS(HostAndPort("h3"), getTopoCoord().getSyncSourceAddress()); +} + +TEST_F(TopoCoordTest, BlacklistSyncSourceNoChaining) { + updateConfig(BSON("_id" + << "rs0" + << "version" << 1 << "settings" << BSON("chainingAllowed" << false) + << "members" + << BSON_ARRAY(BSON("_id" << 10 << "host" + << "hself") + << BSON("_id" << 20 << "host" + << "h2") << BSON("_id" << 30 << "host" + << "h3"))), + 0); + + setSelfMemberState(MemberState::RS_SECONDARY); + + heartbeatFromMember( + HostAndPort("h2"), "rs0", MemberState::RS_PRIMARY, OpTime(2, 0), Milliseconds(100)); + heartbeatFromMember( + HostAndPort("h2"), "rs0", MemberState::RS_PRIMARY, OpTime(2, 0), Milliseconds(100)); + ASSERT_EQUALS(1, getCurrentPrimaryIndex()); + + heartbeatFromMember( + HostAndPort("h3"), "rs0", MemberState::RS_SECONDARY, OpTime(2, 0), Milliseconds(100)); + heartbeatFromMember( + HostAndPort("h3"), "rs0", MemberState::RS_SECONDARY, OpTime(2, 0), Milliseconds(100)); + + getTopoCoord().chooseNewSyncSource(now()++, OpTime(0, 0)); + ASSERT_EQUALS(HostAndPort("h2"), getTopoCoord().getSyncSourceAddress()); + + Date_t expireTime = 1000; + getTopoCoord().blacklistSyncSource(HostAndPort("h2"), expireTime); + getTopoCoord().chooseNewSyncSource(now()++, OpTime(0, 0)); + // Can't choose any sync source now. 
+ ASSERT(getTopoCoord().getSyncSourceAddress().empty()); + + // After time has passed, should go back to the primary + getTopoCoord().chooseNewSyncSource(expireTime, OpTime(0, 0)); + ASSERT_EQUALS(HostAndPort("h2"), getTopoCoord().getSyncSourceAddress()); +} + +TEST_F(TopoCoordTest, OnlyUnauthorizedUpCausesRecovering) { + updateConfig(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 10 << "host" + << "hself") + << BSON("_id" << 20 << "host" + << "h2") << BSON("_id" << 30 << "host" + << "h3"))), + 0); + + setSelfMemberState(MemberState::RS_SECONDARY); + + // Generate enough heartbeats to select a sync source below + heartbeatFromMember( + HostAndPort("h2"), "rs0", MemberState::RS_SECONDARY, OpTime(1, 0), Milliseconds(300)); + heartbeatFromMember( + HostAndPort("h2"), "rs0", MemberState::RS_SECONDARY, OpTime(1, 0), Milliseconds(300)); + heartbeatFromMember( + HostAndPort("h3"), "rs0", MemberState::RS_SECONDARY, OpTime(2, 0), Milliseconds(100)); + heartbeatFromMember( + HostAndPort("h3"), "rs0", MemberState::RS_SECONDARY, OpTime(2, 0), Milliseconds(100)); + + ASSERT_EQUALS(HostAndPort("h3"), getTopoCoord().chooseNewSyncSource(now()++, OpTime(0, 0))); + ASSERT_EQUALS(MemberState::RS_SECONDARY, getTopoCoord().getMemberState().s); + // Good state setup done + + // Mark nodes down, ensure that we have no source and are secondary + receiveDownHeartbeat(HostAndPort("h2"), "rs0", OpTime(), ErrorCodes::NetworkTimeout); + receiveDownHeartbeat(HostAndPort("h3"), "rs0", OpTime(), ErrorCodes::NetworkTimeout); + ASSERT_TRUE(getTopoCoord().chooseNewSyncSource(now()++, OpTime(0, 0)).empty()); + ASSERT_EQUALS(MemberState::RS_SECONDARY, getTopoCoord().getMemberState().s); + + // Mark nodes down + unauth, ensure that we have no source and are secondary + receiveDownHeartbeat(HostAndPort("h2"), "rs0", OpTime(), ErrorCodes::NetworkTimeout); + receiveDownHeartbeat(HostAndPort("h3"), "rs0", OpTime(), ErrorCodes::Unauthorized); + ASSERT_TRUE(getTopoCoord().chooseNewSyncSource(now()++, OpTime(0, 0)).empty()); + ASSERT_EQUALS(MemberState::RS_RECOVERING, getTopoCoord().getMemberState().s); + + // Having an auth error but with another node up should bring us out of RECOVERING + HeartbeatResponseAction action = receiveUpHeartbeat(HostAndPort("h2"), + "rs0", + MemberState::RS_SECONDARY, + OpTime(0, 0), + OpTime(2, 0), + OpTime(2, 0)); + ASSERT_EQUALS(MemberState::RS_SECONDARY, getTopoCoord().getMemberState().s); + // Test that the heartbeat that brings us from RECOVERING to SECONDARY doesn't initiate + // an election (SERVER-17164) + ASSERT_NO_ACTION(action.getAction()); +} + +TEST_F(TopoCoordTest, ReceiveHeartbeatWhileAbsentFromConfig) { + updateConfig(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 10 << "host" + << "h1") + << BSON("_id" << 20 << "host" + << "h2") << BSON("_id" << 30 << "host" + << "h3"))), + -1); + ASSERT_NO_ACTION( + heartbeatFromMember( + HostAndPort("h2"), "rs0", MemberState::RS_SECONDARY, OpTime(1, 0), Milliseconds(300)) + .getAction()); +} + +TEST_F(TopoCoordTest, PrepareSyncFromResponse) { + OpTime staleOpTime(1, 1); + OpTime ourOpTime(staleOpTime.getSecs() + 11, 1); + + Status result = Status::OK(); + BSONObjBuilder response; + + // if we do not have an index in the config, we should get ErrorCodes::NotSecondary + getTopoCoord().prepareSyncFromResponse( + cbData(), HostAndPort("h1"), ourOpTime, &response, &result); + ASSERT_EQUALS(ErrorCodes::NotSecondary, result); + ASSERT_EQUALS("Removed and uninitialized nodes do not 
sync", result.reason()); + + // Test trying to sync from another node when we are an arbiter + updateConfig(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "hself" + << "arbiterOnly" << true) + << BSON("_id" << 1 << "host" + << "h1"))), + 0); + + getTopoCoord().prepareSyncFromResponse( + cbData(), HostAndPort("h1"), ourOpTime, &response, &result); + ASSERT_EQUALS(ErrorCodes::NotSecondary, result); + ASSERT_EQUALS("arbiters don't sync", result.reason()); + + // Set up config for the rest of the tests + updateConfig(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "hself") + << BSON("_id" << 1 << "host" + << "h1" + << "arbiterOnly" << true) + << BSON("_id" << 2 << "host" + << "h2" + << "priority" << 0 << "buildIndexes" << false) + << BSON("_id" << 3 << "host" + << "h3") << BSON("_id" << 4 << "host" + << "h4") + << BSON("_id" << 5 << "host" + << "h5") << BSON("_id" << 6 << "host" + << "h6"))), + 0); + + // Try to sync while PRIMARY + ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); + makeSelfPrimary(); + ASSERT_EQUALS(0, getCurrentPrimaryIndex()); + getTopoCoord()._setCurrentPrimaryForTest(0); + BSONObjBuilder response1; + getTopoCoord().prepareSyncFromResponse( + cbData(), HostAndPort("h3"), ourOpTime, &response1, &result); + ASSERT_EQUALS(ErrorCodes::NotSecondary, result); + ASSERT_EQUALS("primaries don't sync", result.reason()); + ASSERT_EQUALS("h3:27017", response1.obj()["syncFromRequested"].String()); + + // Try to sync from non-existent member + setSelfMemberState(MemberState::RS_SECONDARY); + getTopoCoord()._setCurrentPrimaryForTest(-1); + BSONObjBuilder response2; + getTopoCoord().prepareSyncFromResponse( + cbData(), HostAndPort("fakemember"), ourOpTime, &response2, &result); + ASSERT_EQUALS(ErrorCodes::NodeNotFound, result); + ASSERT_EQUALS("Could not find member \"fakemember:27017\" in replica set", result.reason()); + + // Try to sync from self + BSONObjBuilder response3; + getTopoCoord().prepareSyncFromResponse( + cbData(), HostAndPort("hself"), ourOpTime, &response3, &result); + ASSERT_EQUALS(ErrorCodes::InvalidOptions, result); + ASSERT_EQUALS("I cannot sync from myself", result.reason()); + + // Try to sync from an arbiter + BSONObjBuilder response4; + getTopoCoord().prepareSyncFromResponse( + cbData(), HostAndPort("h1"), ourOpTime, &response4, &result); + ASSERT_EQUALS(ErrorCodes::InvalidOptions, result); + ASSERT_EQUALS("Cannot sync from \"h1:27017\" because it is an arbiter", result.reason()); + + // Try to sync from a node that doesn't build indexes + BSONObjBuilder response5; + getTopoCoord().prepareSyncFromResponse( + cbData(), HostAndPort("h2"), ourOpTime, &response5, &result); + ASSERT_EQUALS(ErrorCodes::InvalidOptions, result); + ASSERT_EQUALS("Cannot sync from \"h2:27017\" because it does not build indexes", + result.reason()); + + // Try to sync from a member that is down + receiveDownHeartbeat(HostAndPort("h4"), "rs0", OpTime()); + + BSONObjBuilder response7; + getTopoCoord().prepareSyncFromResponse( + cbData(), HostAndPort("h4"), ourOpTime, &response7, &result); + ASSERT_EQUALS(ErrorCodes::HostUnreachable, result); + ASSERT_EQUALS("I cannot reach the requested member: h4:27017", result.reason()); + + // Sync successfully from a member that is stale + heartbeatFromMember( + HostAndPort("h5"), "rs0", MemberState::RS_SECONDARY, staleOpTime, Milliseconds(100)); + + BSONObjBuilder response8; + getTopoCoord().prepareSyncFromResponse( + cbData(), HostAndPort("h5"), 
ourOpTime, &response8, &result);
+    ASSERT_OK(result);
+    ASSERT_EQUALS("requested member \"h5:27017\" is more than 10 seconds behind us",
+                  response8.obj()["warning"].String());
+    getTopoCoord().chooseNewSyncSource(now()++, ourOpTime);
+    ASSERT_EQUALS(HostAndPort("h5"), getTopoCoord().getSyncSourceAddress());
+
+    // Sync successfully from an up-to-date member
+    heartbeatFromMember(
+        HostAndPort("h6"), "rs0", MemberState::RS_SECONDARY, ourOpTime, Milliseconds(100));
+
+    BSONObjBuilder response9;
+    getTopoCoord().prepareSyncFromResponse(
+        cbData(), HostAndPort("h6"), ourOpTime, &response9, &result);
+    ASSERT_OK(result);
+    BSONObj response9Obj = response9.obj();
+    ASSERT_FALSE(response9Obj.hasField("warning"));
+    ASSERT_EQUALS(HostAndPort("h5").toString(), response9Obj["prevSyncTarget"].String());
+    getTopoCoord().chooseNewSyncSource(now()++, ourOpTime);
+    ASSERT_EQUALS(HostAndPort("h6"), getTopoCoord().getSyncSourceAddress());
+
+    // node goes down between forceSync and chooseNewSyncSource
+    BSONObjBuilder response10;
+    getTopoCoord().prepareSyncFromResponse(
+        cbData(), HostAndPort("h6"), ourOpTime, &response10, &result);
+    BSONObj response10Obj = response10.obj();
+    ASSERT_FALSE(response10Obj.hasField("warning"));
+    ASSERT_EQUALS(HostAndPort("h6").toString(), response10Obj["prevSyncTarget"].String());
+    receiveDownHeartbeat(HostAndPort("h6"), "rs0", OpTime());
+    HostAndPort syncSource = getTopoCoord().chooseNewSyncSource(now()++, OpTime(0, 0));
+    ASSERT_EQUALS(HostAndPort("h6"), syncSource);
+
+    // Try to sync from a member that is unauth'd
+    receiveDownHeartbeat(HostAndPort("h5"), "rs0", OpTime(), ErrorCodes::Unauthorized);
+
+    BSONObjBuilder response11;
+    getTopoCoord().prepareSyncFromResponse(
+        cbData(), HostAndPort("h5"), ourOpTime, &response11, &result);
+    ASSERT_NOT_OK(result);
+    ASSERT_EQUALS(ErrorCodes::Unauthorized, result.code());
+    ASSERT_EQUALS("not authorized to communicate with h5:27017", result.reason());
+
+    // Sync successfully from an up-to-date member.
+    heartbeatFromMember(
+        HostAndPort("h6"), "rs0", MemberState::RS_SECONDARY, ourOpTime, Milliseconds(100));
+    BSONObjBuilder response12;
+    getTopoCoord().prepareSyncFromResponse(
+        cbData(), HostAndPort("h6"), ourOpTime, &response12, &result);
+    ASSERT_OK(result);
+    syncSource = getTopoCoord().chooseNewSyncSource(now()++, OpTime(0, 0));
+    ASSERT_EQUALS(HostAndPort("h6"), syncSource);
+}
+
+TEST_F(TopoCoordTest, ReplSetGetStatus) {
+    // This test starts by configuring a TopologyCoordinator as a member of a 4 node replica
+    // set, with each node in a different state.
+    // The first node is DOWN, as if we tried heartbeating it and the attempt failed.
+    // The second node is in state SECONDARY, as if we've received a valid heartbeat from it.
+    // The third node is in state UNKNOWN, as if we've had no heartbeating activity with it
+    // yet. The fourth node is PRIMARY and corresponds to ourself, which gets its
+    // information for replSetGetStatus from a different source than the other nodes do.
+    // After this setup, we call prepareStatusResponse and make sure that the fields
+    // returned for each member match our expectations.
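// For reference, each "members" entry asserted below has roughly this shape
// (field names taken from this test's assertions; the values are examples,
// not live output):
//   { _id: 1, name: "test1:1234", health: 1.0, state: 2, stateStr: "SECONDARY",
//     uptime: 10, optime: Timestamp(3, 4), optimeDate: ISODate("..."),
//     lastHeartbeat: ISODate("..."), lastHeartbeatRecv: ISODate("..."),
//     lastHeartbeatMessage: "READY" }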
+ Date_t startupTime(100); + Date_t heartbeatTime = 5000; + Seconds uptimeSecs(10); + Date_t curTime = heartbeatTime + uptimeSecs.total_milliseconds(); + OpTime electionTime(1, 2); + OpTime oplogProgress(3, 4); + std::string setName = "mySet"; + + updateConfig( + BSON("_id" << setName << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "test0:1234") + << BSON("_id" << 1 << "host" + << "test1:1234") << BSON("_id" << 2 << "host" + << "test2:1234") + << BSON("_id" << 3 << "host" + << "test3:1234"))), + 3, + startupTime + 1); + + // Now that the replica set is setup, put the members into the states we want them in. + HostAndPort member = HostAndPort("test0:1234"); + StatusWith<ReplSetHeartbeatResponse> hbResponse = + StatusWith<ReplSetHeartbeatResponse>(Status(ErrorCodes::HostUnreachable, "")); + + getTopoCoord().prepareHeartbeatRequest(startupTime + 2, setName, member); + Date_t timeoutTime = + startupTime + 2 + ReplicaSetConfig::kDefaultHeartbeatTimeoutPeriod.total_milliseconds(); + getTopoCoord().processHeartbeatResponse( + timeoutTime, Milliseconds(5000), member, hbResponse, OpTime(0, 0)); + + member = HostAndPort("test1:1234"); + ReplSetHeartbeatResponse hb; + hb.setVersion(1); + hb.setState(MemberState::RS_SECONDARY); + hb.setElectionTime(electionTime); + hb.setHbMsg("READY"); + hb.setOpTime(oplogProgress); + hbResponse = StatusWith<ReplSetHeartbeatResponse>(hb); + getTopoCoord().prepareHeartbeatRequest(startupTime + 2, setName, member); + getTopoCoord().processHeartbeatResponse( + heartbeatTime, Milliseconds(4000), member, hbResponse, OpTime(0, 0)); + makeSelfPrimary(); + + // Now node 0 is down, node 1 is up, and for node 2 we have no heartbeat data yet. + BSONObjBuilder statusBuilder; + Status resultStatus(ErrorCodes::InternalError, "prepareStatusResponse didn't set result"); + getTopoCoord().prepareStatusResponse(cbData(), + curTime, + uptimeSecs.total_seconds(), + oplogProgress, + &statusBuilder, + &resultStatus); + ASSERT_OK(resultStatus); + BSONObj rsStatus = statusBuilder.obj(); + + // Test results for all non-self members + ASSERT_EQUALS(setName, rsStatus["set"].String()); + ASSERT_EQUALS(curTime.asInt64(), rsStatus["date"].Date().asInt64()); + std::vector<BSONElement> memberArray = rsStatus["members"].Array(); + ASSERT_EQUALS(4U, memberArray.size()); + BSONObj member0Status = memberArray[0].Obj(); + BSONObj member1Status = memberArray[1].Obj(); + BSONObj member2Status = memberArray[2].Obj(); + + // Test member 0, the node that's DOWN + ASSERT_EQUALS(0, member0Status["_id"].numberInt()); + ASSERT_EQUALS("test0:1234", member0Status["name"].str()); + ASSERT_EQUALS(0, member0Status["health"].numberDouble()); + ASSERT_EQUALS(MemberState::RS_DOWN, member0Status["state"].numberInt()); + ASSERT_EQUALS("(not reachable/healthy)", member0Status["stateStr"].str()); + ASSERT_EQUALS(0, member0Status["uptime"].numberInt()); + ASSERT_EQUALS(OpTime(), OpTime(member0Status["optime"].timestampValue())); + ASSERT_TRUE(member0Status.hasField("optimeDate")); + ASSERT_EQUALS(Date_t(OpTime().getSecs() * 1000ULL), member0Status["optimeDate"].Date().millis); + ASSERT_EQUALS(timeoutTime, member0Status["lastHeartbeat"].date()); + ASSERT_EQUALS(Date_t(), member0Status["lastHeartbeatRecv"].date()); + + // Test member 1, the node that's SECONDARY + ASSERT_EQUALS(1, member1Status["_id"].Int()); + ASSERT_EQUALS("test1:1234", member1Status["name"].String()); + ASSERT_EQUALS(1, member1Status["health"].Double()); + ASSERT_EQUALS(MemberState::RS_SECONDARY, 
member1Status["state"].numberInt()); + ASSERT_EQUALS(MemberState(MemberState::RS_SECONDARY).toString(), + member1Status["stateStr"].String()); + ASSERT_EQUALS(uptimeSecs.total_seconds(), member1Status["uptime"].numberInt()); + ASSERT_EQUALS(oplogProgress, OpTime(member1Status["optime"].timestampValue())); + ASSERT_TRUE(member1Status.hasField("optimeDate")); + ASSERT_EQUALS(Date_t(oplogProgress.getSecs() * 1000ULL), + member1Status["optimeDate"].Date().millis); + ASSERT_EQUALS(heartbeatTime, member1Status["lastHeartbeat"].date()); + ASSERT_EQUALS(Date_t(), member1Status["lastHeartbeatRecv"].date()); + ASSERT_EQUALS("READY", member1Status["lastHeartbeatMessage"].str()); + + // Test member 2, the node that's UNKNOWN + ASSERT_EQUALS(2, member2Status["_id"].numberInt()); + ASSERT_EQUALS("test2:1234", member2Status["name"].str()); + ASSERT_EQUALS(-1, member2Status["health"].numberDouble()); + ASSERT_EQUALS(MemberState::RS_UNKNOWN, member2Status["state"].numberInt()); + ASSERT_EQUALS(MemberState(MemberState::RS_UNKNOWN).toString(), member2Status["stateStr"].str()); + ASSERT_TRUE(member2Status.hasField("uptime")); + ASSERT_TRUE(member2Status.hasField("optime")); + ASSERT_TRUE(member2Status.hasField("optimeDate")); + ASSERT_FALSE(member2Status.hasField("lastHearbeat")); + ASSERT_FALSE(member2Status.hasField("lastHearbeatRecv")); + + // Now test results for ourself, the PRIMARY + ASSERT_EQUALS(MemberState::RS_PRIMARY, rsStatus["myState"].numberInt()); + BSONObj selfStatus = memberArray[3].Obj(); + ASSERT_TRUE(selfStatus["self"].boolean()); + ASSERT_EQUALS(3, selfStatus["_id"].numberInt()); + ASSERT_EQUALS("test3:1234", selfStatus["name"].str()); + ASSERT_EQUALS(1, selfStatus["health"].numberDouble()); + ASSERT_EQUALS(MemberState::RS_PRIMARY, selfStatus["state"].numberInt()); + ASSERT_EQUALS(MemberState(MemberState::RS_PRIMARY).toString(), selfStatus["stateStr"].str()); + ASSERT_EQUALS(uptimeSecs.total_seconds(), selfStatus["uptime"].numberInt()); + ASSERT_EQUALS(oplogProgress, OpTime(selfStatus["optime"].timestampValue())); + ASSERT_TRUE(selfStatus.hasField("optimeDate")); + ASSERT_EQUALS(Date_t(oplogProgress.getSecs() * 1000ULL), + selfStatus["optimeDate"].Date().millis); + + // TODO(spencer): Test electionTime and pingMs are set properly +} + +TEST_F(TopoCoordTest, ReplSetGetStatusFails) { + // This test starts by configuring a TopologyCoordinator to NOT be a member of a 3 node + // replica set. Then running prepareStatusResponse should fail. + Date_t startupTime(100); + Date_t heartbeatTime = 5000; + Seconds uptimeSecs(10); + Date_t curTime = heartbeatTime + uptimeSecs.total_milliseconds(); + OpTime oplogProgress(3, 4); + std::string setName = "mySet"; + + updateConfig( + BSON("_id" << setName << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "test0:1234") + << BSON("_id" << 1 << "host" + << "test1:1234") << BSON("_id" << 2 << "host" + << "test2:1234"))), + -1, // This one is not part of the replica set. 
+ startupTime + 1); + + BSONObjBuilder statusBuilder; + Status resultStatus(ErrorCodes::InternalError, "prepareStatusResponse didn't set result"); + getTopoCoord().prepareStatusResponse(cbData(), + curTime, + uptimeSecs.total_seconds(), + oplogProgress, + &statusBuilder, + &resultStatus); + ASSERT_NOT_OK(resultStatus); + ASSERT_EQUALS(ErrorCodes::InvalidReplicaSetConfig, resultStatus); +} + +TEST_F(TopoCoordTest, PrepareFreshResponse) { + ReplicationCoordinator::ReplSetFreshArgs args; + OpTime freshestOpTime(15, 10); + OpTime ourOpTime(10, 10); + OpTime staleOpTime(1, 1); + Status internalErrorStatus(ErrorCodes::InternalError, "didn't set status"); + + // if we do not have an index in the config, we should get ErrorCodes::ReplicaSetNotFound + BSONObjBuilder responseBuilder; + Status status = internalErrorStatus; + getTopoCoord().prepareFreshResponse(args, Date_t(), ourOpTime, &responseBuilder, &status); + ASSERT_EQUALS(ErrorCodes::ReplicaSetNotFound, status); + ASSERT_EQUALS("Cannot participate in elections because not initialized", status.reason()); + ASSERT_TRUE(responseBuilder.obj().isEmpty()); + + updateConfig(BSON("_id" + << "rs0" + << "version" << 10 << "members" + << BSON_ARRAY(BSON("_id" << 10 << "host" + << "hself" + << "priority" << 10) + << BSON("_id" << 20 << "host" + << "h1") << BSON("_id" << 30 << "host" + << "h2") + << BSON("_id" << 40 << "host" + << "h3" + << "priority" << 10))), + 0); + + // Test with incorrect replset name + args.setName = "fakeset"; + + BSONObjBuilder responseBuilder0; + Status status0 = internalErrorStatus; + getTopoCoord().prepareFreshResponse(args, Date_t(), ourOpTime, &responseBuilder0, &status0); + ASSERT_EQUALS(ErrorCodes::ReplicaSetNotFound, status0); + ASSERT_TRUE(responseBuilder0.obj().isEmpty()); + + heartbeatFromMember(HostAndPort("h1"), "rs0", MemberState::RS_SECONDARY, ourOpTime); + + // Test with old config version + args.setName = "rs0"; + args.cfgver = 5; + args.id = 20; + args.who = HostAndPort("h1"); + args.opTime = ourOpTime; + + BSONObjBuilder responseBuilder1; + Status status1 = internalErrorStatus; + getTopoCoord().prepareFreshResponse(args, Date_t(), ourOpTime, &responseBuilder1, &status1); + ASSERT_OK(status1); + BSONObj response1 = responseBuilder1.obj(); + ASSERT_EQUALS("config version stale", response1["info"].String()); + ASSERT_EQUALS(ourOpTime, OpTime(response1["opTime"].timestampValue())); + ASSERT_TRUE(response1["fresher"].Bool()); + ASSERT_FALSE(response1["veto"].Bool()); + ASSERT_FALSE(response1.hasField("errmsg")); + + // Test with non-existent node. + args.cfgver = 10; + args.id = 0; + args.who = HostAndPort("fakenode"); + + BSONObjBuilder responseBuilder2; + Status status2 = internalErrorStatus; + getTopoCoord().prepareFreshResponse(args, Date_t(), ourOpTime, &responseBuilder2, &status2); + ASSERT_OK(status2); + BSONObj response2 = responseBuilder2.obj(); + ASSERT_EQUALS(ourOpTime, OpTime(response2["opTime"].timestampValue())); + ASSERT_FALSE(response2["fresher"].Bool()); + ASSERT_TRUE(response2["veto"].Bool()); + ASSERT_EQUALS("replSet couldn't find member with id 0", response2["errmsg"].String()); + + + // Test when we are primary. 
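// Each prepareFreshResponse() case from here on inspects the same response
// document; schematically (field set taken from the assertions below, values
// are examples only):
//   { opTime: Timestamp(...), fresher: false, veto: true,
//     errmsg: "...", info: "..." }  // errmsg/info only appear in some cases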
+    args.id = 20;
+    args.who = HostAndPort("h1");
+
+    makeSelfPrimary();
+
+    BSONObjBuilder responseBuilder3;
+    Status status3 = internalErrorStatus;
+    getTopoCoord().prepareFreshResponse(args, Date_t(), ourOpTime, &responseBuilder3, &status3);
+    ASSERT_OK(status3);
+    BSONObj response3 = responseBuilder3.obj();
+    ASSERT_FALSE(response3.hasField("info"));
+    ASSERT_EQUALS(ourOpTime, OpTime(response3["opTime"].timestampValue()));
+    ASSERT_FALSE(response3["fresher"].Bool());
+    ASSERT_TRUE(response3["veto"].Bool());
+    ASSERT_EQUALS("I am already primary, h1:27017 can try again once I've stepped down",
+                  response3["errmsg"].String());
+
+
+    // Test when someone else is primary.
+    heartbeatFromMember(HostAndPort("h2"), "rs0", MemberState::RS_SECONDARY, ourOpTime);
+    setSelfMemberState(MemberState::RS_SECONDARY);
+    getTopoCoord()._setCurrentPrimaryForTest(2);
+
+    BSONObjBuilder responseBuilder4;
+    Status status4 = internalErrorStatus;
+    getTopoCoord().prepareFreshResponse(args, Date_t(), ourOpTime, &responseBuilder4, &status4);
+    ASSERT_OK(status4);
+    BSONObj response4 = responseBuilder4.obj();
+    ASSERT_FALSE(response4.hasField("info"));
+    ASSERT_EQUALS(ourOpTime, OpTime(response4["opTime"].timestampValue()));
+    ASSERT_FALSE(response4["fresher"].Bool());
+    ASSERT_TRUE(response4["veto"].Bool());
+    ASSERT_EQUALS(
+        "h1:27017 is trying to elect itself but h2:27017 is already primary and more "
+        "up-to-date",
+        response4["errmsg"].String());
+
+
+    // Test trying to elect a node that is caught up but isn't the highest priority node.
+    heartbeatFromMember(HostAndPort("h1"), "rs0", MemberState::RS_SECONDARY, ourOpTime);
+    heartbeatFromMember(HostAndPort("h2"), "rs0", MemberState::RS_SECONDARY, staleOpTime);
+    heartbeatFromMember(HostAndPort("h3"), "rs0", MemberState::RS_SECONDARY, ourOpTime);
+
+    BSONObjBuilder responseBuilder5;
+    Status status5 = internalErrorStatus;
+    getTopoCoord().prepareFreshResponse(args, Date_t(), ourOpTime, &responseBuilder5, &status5);
+    ASSERT_OK(status5);
+    BSONObj response5 = responseBuilder5.obj();
+    ASSERT_FALSE(response5.hasField("info"));
+    ASSERT_EQUALS(ourOpTime, OpTime(response5["opTime"].timestampValue()));
+    ASSERT_FALSE(response5["fresher"].Bool());
+    ASSERT_TRUE(response5["veto"].Bool());
+    ASSERT(response5["errmsg"].String().find("h1:27017 has lower priority of 1 than") !=
+           std::string::npos)
+        << response5["errmsg"].String();
+
+    // Test trying to elect a node that isn't electable because it's down
+    args.id = 40;
+    args.who = HostAndPort("h3");
+
+    receiveDownHeartbeat(HostAndPort("h3"), "rs0", OpTime());
+
+    BSONObjBuilder responseBuilder6;
+    Status status6 = internalErrorStatus;
+    getTopoCoord().prepareFreshResponse(args, Date_t(), ourOpTime, &responseBuilder6, &status6);
+    ASSERT_OK(status6);
+    BSONObj response6 = responseBuilder6.obj();
+    ASSERT_FALSE(response6.hasField("info"));
+    ASSERT_EQUALS(ourOpTime, OpTime(response6["opTime"].timestampValue()));
+    ASSERT_FALSE(response6["fresher"].Bool());
+    ASSERT_TRUE(response6["veto"].Bool());
+    ASSERT_NE(std::string::npos,
+              response6["errmsg"].String().find(
+                  "I don't think h3:27017 is electable because the member is not "
+                  "currently a secondary"))
+        << response6["errmsg"].String();
+
+    // Test trying to elect a node that isn't electable because it's PRIMARY
+    ASSERT_EQUALS(-1, getCurrentPrimaryIndex());
+    heartbeatFromMember(HostAndPort("h3"), "rs0", MemberState::RS_PRIMARY, ourOpTime);
+    ASSERT_EQUALS(3, getCurrentPrimaryIndex());
+
+    BSONObjBuilder responseBuilder7;
+    Status status7 = internalErrorStatus;
+
getTopoCoord().prepareFreshResponse(args, Date_t(), ourOpTime, &responseBuilder7, &status7); + ASSERT_OK(status7); + BSONObj response7 = responseBuilder7.obj(); + ASSERT_FALSE(response7.hasField("info")); + ASSERT_EQUALS(ourOpTime, OpTime(response7["opTime"].timestampValue())); + ASSERT_FALSE(response7["fresher"].Bool()); + ASSERT_TRUE(response7["veto"].Bool()); + ASSERT_NE(std::string::npos, + response7["errmsg"].String().find( + "I don't think h3:27017 is electable because the member is not " + "currently a secondary")) + << response7["errmsg"].String(); + + // Test trying to elect a node that isn't electable because it's STARTUP + heartbeatFromMember(HostAndPort("h3"), "rs0", MemberState::RS_STARTUP, ourOpTime); + + BSONObjBuilder responseBuilder8; + Status status8 = internalErrorStatus; + getTopoCoord().prepareFreshResponse(args, Date_t(), ourOpTime, &responseBuilder8, &status8); + ASSERT_OK(status8); + BSONObj response8 = responseBuilder8.obj(); + ASSERT_FALSE(response8.hasField("info")); + ASSERT_EQUALS(ourOpTime, OpTime(response8["opTime"].timestampValue())); + ASSERT_FALSE(response8["fresher"].Bool()); + ASSERT_TRUE(response8["veto"].Bool()); + ASSERT_NE(std::string::npos, + response8["errmsg"].String().find( + "I don't think h3:27017 is electable because the member is not " + "currently a secondary")) + << response8["errmsg"].String(); + + // Test trying to elect a node that isn't electable because it's RECOVERING + heartbeatFromMember(HostAndPort("h3"), "rs0", MemberState::RS_RECOVERING, ourOpTime); + + BSONObjBuilder responseBuilder9; + Status status9 = internalErrorStatus; + getTopoCoord().prepareFreshResponse(args, Date_t(), ourOpTime, &responseBuilder9, &status9); + ASSERT_OK(status9); + BSONObj response9 = responseBuilder9.obj(); + ASSERT_FALSE(response9.hasField("info")); + ASSERT_EQUALS(ourOpTime, OpTime(response9["opTime"].timestampValue())); + ASSERT_FALSE(response9["fresher"].Bool()); + ASSERT_TRUE(response9["veto"].Bool()); + ASSERT_NE(std::string::npos, + response9["errmsg"].String().find( + "I don't think h3:27017 is electable because the member is not " + "currently a secondary")) + << response9["errmsg"].String(); + + // Test trying to elect a node that is fresher but lower priority than the existing primary + args.id = 30; + args.who = HostAndPort("h2"); + + ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); + heartbeatFromMember(HostAndPort("h3"), "rs0", MemberState::RS_PRIMARY, ourOpTime); + ASSERT_EQUALS(3, getCurrentPrimaryIndex()); + heartbeatFromMember(HostAndPort("h2"), "rs0", MemberState::RS_SECONDARY, freshestOpTime); + + BSONObjBuilder responseBuilder10; + Status status10 = internalErrorStatus; + getTopoCoord().prepareFreshResponse(args, Date_t(), ourOpTime, &responseBuilder10, &status10); + ASSERT_OK(status10); + BSONObj response10 = responseBuilder10.obj(); + ASSERT_FALSE(response10.hasField("info")); + ASSERT_EQUALS(ourOpTime, OpTime(response10["opTime"].timestampValue())); + ASSERT_TRUE(response10["fresher"].Bool()); + ASSERT_TRUE(response10["veto"].Bool()); + ASSERT_TRUE(response10.hasField("errmsg")); + + + // Test trying to elect a valid node + args.id = 40; + args.who = HostAndPort("h3"); + + receiveDownHeartbeat(HostAndPort("h2"), "rs0", OpTime()); + heartbeatFromMember(HostAndPort("h3"), "rs0", MemberState::RS_SECONDARY, ourOpTime); + + BSONObjBuilder responseBuilder11; + Status status11 = internalErrorStatus; + getTopoCoord().prepareFreshResponse(args, Date_t(), ourOpTime, &responseBuilder11, &status11); + ASSERT_OK(status11); + BSONObj response11 = 
responseBuilder11.obj(); + ASSERT_FALSE(response11.hasField("info")) << response11.toString(); + ASSERT_EQUALS(ourOpTime, OpTime(response11["opTime"].timestampValue())); + ASSERT_FALSE(response11["fresher"].Bool()) << response11.toString(); + ASSERT_FALSE(response11["veto"].Bool()) << response11.toString(); + ASSERT_FALSE(response11.hasField("errmsg")) << response11.toString(); + + // Test with our id + args.id = 10; + BSONObjBuilder responseBuilder12; + Status status12 = internalErrorStatus; + getTopoCoord().prepareFreshResponse(args, Date_t(), ourOpTime, &responseBuilder12, &status12); + ASSERT_EQUALS(ErrorCodes::BadValue, status12); + ASSERT_EQUALS( + "Received replSetFresh command from member with the same member ID as ourself: 10", + status12.reason()); + ASSERT_TRUE(responseBuilder12.obj().isEmpty()); +} + +class HeartbeatResponseTest : public TopoCoordTest { +public: + virtual void setUp() { + TopoCoordTest::setUp(); + updateConfig( + BSON("_id" + << "rs0" + << "version" << 5 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "host1:27017") + << BSON("_id" << 1 << "host" + << "host2:27017") << BSON("_id" << 2 << "host" + << "host3:27017")) + << "settings" << BSON("heartbeatTimeoutSecs" << 5)), + 0); } - - TEST_F(HeartbeatResponseTestTwoRetries, DecideToStepDownSelf) { - // Confirm that action responses can come back from retries; in this, expect a StepDownSelf - // action. - - // acknowledge the other member so that we see a majority - HeartbeatResponseAction action = receiveDownHeartbeat(HostAndPort("host3"), - "rs0", - OpTime(100, 0)); - ASSERT_NO_ACTION(action.getAction()); - - // make us PRIMARY - makeSelfPrimary(); - - ReplSetHeartbeatResponse electedMoreRecentlyResponse; - electedMoreRecentlyResponse.noteReplSet(); - electedMoreRecentlyResponse.setSetName("rs0"); - electedMoreRecentlyResponse.setState(MemberState::RS_PRIMARY); - electedMoreRecentlyResponse.setElectable(false); - electedMoreRecentlyResponse.setElectionTime(OpTime(10,0)); - electedMoreRecentlyResponse.setVersion(5); - action = - getTopoCoord().processHeartbeatResponse( - firstRequestDate() + 5000, // Time is left. - Milliseconds(400), // Spent 0.4 of the 0.5 second in the network. - target(), - StatusWith<ReplSetHeartbeatResponse>(electedMoreRecentlyResponse), - OpTime(0, 0)); // We've never applied anything. - ASSERT_EQUALS(HeartbeatResponseAction::StepDownSelf, action.getAction()); - ASSERT_EQUALS(0, action.getPrimaryConfigIndex()); - ASSERT_EQUALS(Date_t(firstRequestDate() + 7000), action.getNextHeartbeatStartDate()); - // Doesn't actually do the stepdown until stepDownIfPending is called - ASSERT_TRUE(TopologyCoordinator::Role::leader == getTopoCoord().getRole()); - ASSERT_EQUALS(0, getCurrentPrimaryIndex()); - - ASSERT_TRUE(getTopoCoord().stepDownIfPending()); +}; + +class HeartbeatResponseTestOneRetry : public HeartbeatResponseTest { +public: + virtual void setUp() { + HeartbeatResponseTest::setUp(); + + // Bring up the node we are heartbeating. + _target = HostAndPort("host2", 27017); + Date_t _upRequestDate = unittest::assertGet(dateFromISOString("2014-08-29T12:55Z")); + std::pair<ReplSetHeartbeatArgs, Milliseconds> uppingRequest = + getTopoCoord().prepareHeartbeatRequest(_upRequestDate, "rs0", _target); + HeartbeatResponseAction upAction = getTopoCoord().processHeartbeatResponse( + _upRequestDate, + Milliseconds(0), + _target, + StatusWith<ReplSetHeartbeatResponse>(Status::OK()), + OpTime(0, 0)); // We've never applied anything. 
+ ASSERT_EQUALS(HeartbeatResponseAction::NoAction, upAction.getAction()); ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); - ASSERT_EQUALS(1, getCurrentPrimaryIndex()); - } - - TEST_F(HeartbeatResponseTestTwoRetries, DecideToStartElection) { - // Confirm that action responses can come back from retries; in this, expect a StartElection - // action. - - // acknowledge the other member so that we see a majority - OpTime election = OpTime(400,0); - OpTime lastOpTimeApplied = OpTime(300,0); - HeartbeatResponseAction action = receiveUpHeartbeat(HostAndPort("host3"), - "rs0", - MemberState::RS_SECONDARY, - election, - election, - lastOpTimeApplied); - ASSERT_NO_ACTION(action.getAction()); - - // make sure we are electable - setSelfMemberState(MemberState::RS_SECONDARY); - - ReplSetHeartbeatResponse startElectionResponse; - startElectionResponse.noteReplSet(); - startElectionResponse.setSetName("rs0"); - startElectionResponse.setState(MemberState::RS_SECONDARY); - startElectionResponse.setElectable(true); - startElectionResponse.setVersion(5); - action = - getTopoCoord().processHeartbeatResponse( - firstRequestDate() + 5000, // Time is left. - Milliseconds(400), // Spent 0.4 of the 0.5 second in the network. - target(), - StatusWith<ReplSetHeartbeatResponse>(startElectionResponse), - election); - ASSERT_EQUALS(HeartbeatResponseAction::StartElection, action.getAction()); - ASSERT_TRUE(TopologyCoordinator::Role::candidate == getTopoCoord().getRole()); - ASSERT_EQUALS(Date_t(firstRequestDate() + 7000), action.getNextHeartbeatStartDate()); - } - TEST_F(HeartbeatResponseTest, HeartbeatTimeoutSuppressesFirstRetry) { - // Confirm that the topology coordinator does not schedule an immediate heartbeat retry if - // the heartbeat timeout period expired before the initial request completed. - HostAndPort target("host2", 27017); - Date_t firstRequestDate = unittest::assertGet(dateFromISOString("2014-08-29T13:00Z")); + // Time of first request for this heartbeat period + _firstRequestDate = unittest::assertGet(dateFromISOString("2014-08-29T13:00Z")); - // Initial heartbeat request prepared, at t + 0. + // Initial heartbeat attempt prepared, at t + 0. std::pair<ReplSetHeartbeatArgs, Milliseconds> request = - getTopoCoord().prepareHeartbeatRequest(firstRequestDate, - "rs0", - target); + getTopoCoord().prepareHeartbeatRequest(_firstRequestDate, "rs0", _target); // 5 seconds to successfully complete the heartbeat before the timeout expires. ASSERT_EQUALS(5000, request.second.total_milliseconds()); - // Initial heartbeat request fails at t + 5000ms - HeartbeatResponseAction action = - getTopoCoord().processHeartbeatResponse( - firstRequestDate + 5000, // Entire heartbeat period elapsed; no retry allowed. - Milliseconds(4990), // Spent 4.99 of the 4 seconds in the network. - target, - StatusWith<ReplSetHeartbeatResponse>(ErrorCodes::ExceededTimeLimit, - "Took too long"), - OpTime(0, 0)); // We've never applied anything. - - ASSERT_EQUALS(HeartbeatResponseAction::NoAction, action.getAction()); - ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); - // Because the heartbeat timed out, we'll retry in 2 seconds. - ASSERT_EQUALS(Date_t(firstRequestDate + 7000), action.getNextHeartbeatStartDate()); - } - - TEST_F(HeartbeatResponseTestOneRetry, HeartbeatTimeoutSuppressesSecondRetry) { - // Confirm that the topology coordinator does not schedule an second heartbeat retry if - // the heartbeat timeout period expired before the first retry completed. 
- HeartbeatResponseAction action = - getTopoCoord().processHeartbeatResponse( - firstRequestDate() + 5010, // Entire heartbeat period elapsed; no retry allowed. - Milliseconds(1000), // Spent 1 of the 1.01 seconds in the network. - target(), - StatusWith<ReplSetHeartbeatResponse>(ErrorCodes::ExceededTimeLimit, - "Took too long"), - OpTime(0, 0)); // We've never applied anything. - - ASSERT_EQUALS(HeartbeatResponseAction::NoAction, action.getAction()); - ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); - // Because the heartbeat timed out, we'll retry in 2 seconds. - ASSERT_EQUALS(Date_t(firstRequestDate() + 7010), action.getNextHeartbeatStartDate()); - } - - TEST_F(HeartbeatResponseTestTwoRetries, HeartbeatThreeNonconsecutiveFailures) { - // Confirm that the topology coordinator does not mark a node down on three - // nonconsecutive heartbeat failures. - ReplSetHeartbeatResponse response; - response.noteReplSet(); - response.setSetName("rs0"); - response.setState(MemberState::RS_SECONDARY); - response.setElectable(true); - response.setVersion(5); - - // successful response (third response due to the two failures in setUp()) - HeartbeatResponseAction action = - getTopoCoord().processHeartbeatResponse( - firstRequestDate() + 4500, - Milliseconds(400), - target(), - StatusWith<ReplSetHeartbeatResponse>(response), - OpTime(0, 0)); // We've never applied anything. + // Initial heartbeat request fails at t + 4000ms + HeartbeatResponseAction action = getTopoCoord().processHeartbeatResponse( + _firstRequestDate + 4000, // 4 seconds elapsed, retry allowed. + Milliseconds(3990), // Spent 3.99 of the 4 seconds in the network. + _target, + StatusWith<ReplSetHeartbeatResponse>(ErrorCodes::ExceededTimeLimit, "Took too long"), + OpTime(0, 0)); // We've never applied anything. ASSERT_EQUALS(HeartbeatResponseAction::NoAction, action.getAction()); ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); - // Because the heartbeat succeeded, we'll retry in 2 seconds. - ASSERT_EQUALS(Date_t(firstRequestDate() + 6500), action.getNextHeartbeatStartDate()); - - // request next heartbeat - getTopoCoord().prepareHeartbeatRequest(firstRequestDate() + 6500, "rs0", target()); - // third failed response - action = getTopoCoord().processHeartbeatResponse( - firstRequestDate() + 7100, - Milliseconds(400), - target(), - StatusWith<ReplSetHeartbeatResponse>(Status(ErrorCodes::HostUnreachable, "")), - OpTime(0, 0)); // We've never applied anything. + // Because the heartbeat failed without timing out, we expect to retry immediately. + ASSERT_EQUALS(Date_t(_firstRequestDate + 4000), action.getNextHeartbeatStartDate()); - ASSERT_EQUALS(HeartbeatResponseAction::NoAction, action.getAction()); - ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); + // First heartbeat retry prepared, at t + 4000ms. + request = getTopoCoord().prepareHeartbeatRequest(_firstRequestDate + 4000, "rs0", _target); + // One second left to complete the heartbeat. 
+ ASSERT_EQUALS(1000, request.second.total_milliseconds()); - // Ensure a third nonconsecutive heartbeat failure did not cause the node to be marked down + // Ensure a single failed heartbeat did not cause the node to be marked down BSONObjBuilder statusBuilder; - Status resultStatus(ErrorCodes::InternalError, - "prepareStatusResponse didn't set result"); - getTopoCoord().prepareStatusResponse(cbData(), - firstRequestDate() + 7000, - 600, - OpTime(100,0), - &statusBuilder, - &resultStatus); + Status resultStatus(ErrorCodes::InternalError, "prepareStatusResponse didn't set result"); + getTopoCoord().prepareStatusResponse( + cbData(), _firstRequestDate + 4000, 10, OpTime(100, 0), &statusBuilder, &resultStatus); ASSERT_OK(resultStatus); BSONObj rsStatus = statusBuilder.obj(); std::vector<BSONElement> memberArray = rsStatus["members"].Array(); @@ -1859,2382 +1292,2970 @@ namespace { ASSERT_EQUALS(1, member1Status["_id"].Int()); ASSERT_EQUALS(1, member1Status["health"].Double()); - } - TEST_F(HeartbeatResponseTest, UpdateHeartbeatDataNewPrimary) { - OpTime election = OpTime(5,0); - OpTime lastOpTimeApplied = OpTime(3,0); - - ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); - HeartbeatResponseAction nextAction = receiveUpHeartbeat(HostAndPort("host2"), - "rs0", - MemberState::RS_PRIMARY, - election, - election, - lastOpTimeApplied); - ASSERT_EQUALS(1, getCurrentPrimaryIndex()); - ASSERT_NO_ACTION(nextAction.getAction()); - ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); + Date_t firstRequestDate() { + return _firstRequestDate; } - TEST_F(HeartbeatResponseTest, UpdateHeartbeatDataTwoPrimariesNewOneOlder) { - OpTime election = OpTime(5,0); - OpTime election2 = OpTime(4,0); - OpTime lastOpTimeApplied = OpTime(3,0); - - ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); - HeartbeatResponseAction nextAction = receiveUpHeartbeat(HostAndPort("host2"), - "rs0", - MemberState::RS_PRIMARY, - election, - election, - lastOpTimeApplied); - ASSERT_EQUALS(1, getCurrentPrimaryIndex()); - ASSERT_NO_ACTION(nextAction.getAction()); - - nextAction = receiveUpHeartbeat(HostAndPort("host3"), - "rs0", - MemberState::RS_PRIMARY, - election2, - election, - lastOpTimeApplied); - // second primary does not change primary index - ASSERT_EQUALS(1, getCurrentPrimaryIndex()); - ASSERT_NO_ACTION(nextAction.getAction()); - ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); - } - - TEST_F(HeartbeatResponseTest, UpdateHeartbeatDataTwoPrimariesNewOneNewer) { - OpTime election = OpTime(4,0); - OpTime election2 = OpTime(5,0); - OpTime lastOpTimeApplied = OpTime(3,0); - - ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); - HeartbeatResponseAction nextAction = receiveUpHeartbeat(HostAndPort("host2"), - "rs0", - MemberState::RS_PRIMARY, - election, - election, - lastOpTimeApplied); - ASSERT_EQUALS(1, getCurrentPrimaryIndex()); - ASSERT_NO_ACTION(nextAction.getAction()); - - nextAction = receiveUpHeartbeat(HostAndPort("host3"), - "rs0", - MemberState::RS_PRIMARY, - election2, - election, - lastOpTimeApplied); - // second primary does not change primary index - ASSERT_EQUALS(1, getCurrentPrimaryIndex()); - ASSERT_NO_ACTION(nextAction.getAction()); - ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); - } - - TEST_F(HeartbeatResponseTest, UpdateHeartbeatDataTwoPrimariesIncludingMeNewOneOlder) { - ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); - makeSelfPrimary(OpTime(5,0)); - - OpTime election = OpTime(4,0); - OpTime lastOpTimeApplied = OpTime(3,0); - - ASSERT_EQUALS(0, 
getCurrentPrimaryIndex()); - HeartbeatResponseAction nextAction = receiveUpHeartbeat(HostAndPort("host2"), - "rs0", - MemberState::RS_PRIMARY, - election, - election, - lastOpTimeApplied); - ASSERT_EQUALS(0, getCurrentPrimaryIndex()); - ASSERT_EQUALS(HeartbeatResponseAction::StepDownRemotePrimary, nextAction.getAction()); - ASSERT_EQUALS(1, nextAction.getPrimaryConfigIndex()); - ASSERT_TRUE(TopologyCoordinator::Role::leader == getTopoCoord().getRole()); - } - - TEST_F(HeartbeatResponseTest, UpdateHeartbeatDataStepDownPrimaryForHighPriorityFreshNode) { - // In this test, the Topology coordinator sees a PRIMARY ("host2") and then sees a higher - // priority and similarly fresh node ("host3"). However, since the coordinator's node - // (host1) is not the higher priority node, it takes no action. - updateConfig(BSON("_id" << "rs0" << - "version" << 6 << - "members" << BSON_ARRAY( - BSON("_id" << 0 << "host" << "host1:27017") << - BSON("_id" << 1 << "host" << "host2:27017") << - BSON("_id" << 2 << "host" << "host3:27017" << "priority" << 3)) << - "settings" << BSON("heartbeatTimeoutSecs" << 5)), - 0); - setSelfMemberState(MemberState::RS_SECONDARY); - - OpTime election = OpTime(0,0); - OpTime lastOpTimeApplied = OpTime(13,0); - OpTime slightlyLessFreshLastOpTimeApplied = OpTime(3,0); - - ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); - HeartbeatResponseAction nextAction = receiveUpHeartbeat(HostAndPort("host2"), - "rs0", - MemberState::RS_PRIMARY, - election, - lastOpTimeApplied, - lastOpTimeApplied); - ASSERT_EQUALS(1, getCurrentPrimaryIndex()); - - nextAction = receiveUpHeartbeat(HostAndPort("host3"), - "rs0", - MemberState::RS_SECONDARY, - election, - slightlyLessFreshLastOpTimeApplied, - lastOpTimeApplied); - ASSERT_EQUALS(HeartbeatResponseAction::NoAction, nextAction.getAction()); - } - - TEST_F(HeartbeatResponseTest, UpdateHeartbeatDataStepDownSelfForHighPriorityFreshNode) { - // In this test, the Topology coordinator becomes PRIMARY and then sees a higher priority - // and equally fresh node ("host3"). As a result it responds with a StepDownSelf action. - // - // Despite having stepped down, we should remain electable, in order to dissuade lower - // priority nodes from standing for election. - updateConfig(BSON("_id" << "rs0" << - "version" << 6 << - "members" << BSON_ARRAY( - BSON("_id" << 0 << "host" << "host1:27017") << - BSON("_id" << 1 << "host" << "host2:27017") << - BSON("_id" << 2 << "host" << "host3:27017" << "priority" << 3)) << - "settings" << BSON("heartbeatTimeoutSecs" << 5)), - 0); - OpTime election = OpTime(1000,0); - - getTopoCoord().setFollowerMode(MemberState::RS_SECONDARY); - ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); - makeSelfPrimary(election); - ASSERT_EQUALS(0, getCurrentPrimaryIndex()); - - HeartbeatResponseAction nextAction = receiveUpHeartbeat(HostAndPort("host3"), - "rs0", - MemberState::RS_SECONDARY, - election, - election, - election); - ASSERT_EQUALS(HeartbeatResponseAction::StepDownSelf, nextAction.getAction()); - ASSERT_EQUALS(0, nextAction.getPrimaryConfigIndex()); - - // Process a heartbeat response to confirm that this node, which is no longer primary, - // still tells other nodes that it is electable. This will stop lower priority nodes - // from standing for election. 
- ReplSetHeartbeatArgs hbArgs; - hbArgs.setSetName("rs0"); - hbArgs.setProtocolVersion(1); - hbArgs.setConfigVersion(6); - hbArgs.setSenderId(1); - hbArgs.setSenderHost(HostAndPort("host3", 27017)); - ReplSetHeartbeatResponse hbResp; - ASSERT_OK(getTopoCoord().prepareHeartbeatResponse(now(), - hbArgs, - "rs0", - election, - &hbResp)); - ASSERT(!hbResp.hasIsElectable() || hbResp.isElectable()) << hbResp.toBSON().toString(); - } - - TEST_F(HeartbeatResponseTest, UpdateHeartbeatDataDoNotStepDownSelfForHighPriorityStaleNode) { - // In this test, the Topology coordinator becomes PRIMARY and then sees a higher priority - // and stale node ("host3"). As a result it responds with NoAction. - updateConfig(BSON("_id" << "rs0" << - "version" << 6 << - "members" << BSON_ARRAY( - BSON("_id" << 0 << "host" << "host1:27017") << - BSON("_id" << 1 << "host" << "host2:27017") << - BSON("_id" << 2 << "host" << "host3:27017" << "priority" << 3)) << - "settings" << BSON("heartbeatTimeoutSecs" << 5)), - 0); - OpTime election = OpTime(1000,0); - OpTime staleTime = OpTime(0,0); - - ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); - makeSelfPrimary(election); - ASSERT_EQUALS(0, getCurrentPrimaryIndex()); - - HeartbeatResponseAction nextAction = receiveUpHeartbeat(HostAndPort("host3"), - "rs0", - MemberState::RS_SECONDARY, - election, - staleTime, - election); - ASSERT_NO_ACTION(nextAction.getAction()); - } - - TEST_F(HeartbeatResponseTest, UpdateHeartbeatDataDoNotStepDownPrimaryForHighPriorityStaleNode) { - // In this test, the Topology coordinator sees a PRIMARY ("host2") and then sees a higher - // priority and stale node ("host3"). As a result it responds with NoAction. - updateConfig(BSON("_id" << "rs0" << - "version" << 6 << - "members" << BSON_ARRAY( - BSON("_id" << 0 << "host" << "host1:27017") << - BSON("_id" << 1 << "host" << "host2:27017") << - BSON("_id" << 2 << "host" << "host3:27017" << "priority" << 3)) << - "settings" << BSON("heartbeatTimeoutSecs" << 5)), - 0); - setSelfMemberState(MemberState::RS_SECONDARY); - - OpTime election = OpTime(1000,0); - OpTime stale = OpTime(0,0); - - ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); - HeartbeatResponseAction nextAction = receiveUpHeartbeat(HostAndPort("host2"), - "rs0", - MemberState::RS_PRIMARY, - election, - election, - election); - ASSERT_EQUALS(1, getCurrentPrimaryIndex()); - - nextAction = receiveUpHeartbeat(HostAndPort("host3"), - "rs0", - MemberState::RS_SECONDARY, - election, - stale, - election); - ASSERT_NO_ACTION(nextAction.getAction()); - } - - TEST_F(HeartbeatResponseTest, UpdateHeartbeatDataTwoPrimariesIncludingMeNewOneNewer) { - ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); - makeSelfPrimary(OpTime(2,0)); - - OpTime election = OpTime(4,0); - OpTime lastOpTimeApplied = OpTime(3,0); - - ASSERT_EQUALS(0, getCurrentPrimaryIndex()); - HeartbeatResponseAction nextAction = receiveUpHeartbeat(HostAndPort("host2"), - "rs0", - MemberState::RS_PRIMARY, - election, - election, - lastOpTimeApplied); - ASSERT_EQUALS(HeartbeatResponseAction::StepDownSelf, nextAction.getAction()); - ASSERT_EQUALS(0, nextAction.getPrimaryConfigIndex()); - // Doesn't actually do the stepdown until stepDownIfPending is called - ASSERT_TRUE(TopologyCoordinator::Role::leader == getTopoCoord().getRole()); - ASSERT_EQUALS(0, getCurrentPrimaryIndex()); - - ASSERT_TRUE(getTopoCoord().stepDownIfPending()); - ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); - ASSERT_EQUALS(1, getCurrentPrimaryIndex()); - } - - TEST_F(HeartbeatResponseTest, 
UpdateHeartbeatDataPrimaryDownNoMajority) { - setSelfMemberState(MemberState::RS_SECONDARY); - - OpTime election = OpTime(400,0); - OpTime lastOpTimeApplied = OpTime(300,0); - - ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); - HeartbeatResponseAction nextAction = receiveUpHeartbeat(HostAndPort("host2"), - "rs0", - MemberState::RS_PRIMARY, - election, - election, - lastOpTimeApplied); - ASSERT_NO_ACTION(nextAction.getAction()); - ASSERT_EQUALS(1, getCurrentPrimaryIndex()); - - nextAction = receiveDownHeartbeat(HostAndPort("host2"), "rs0", lastOpTimeApplied); - ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); - ASSERT_NO_ACTION(nextAction.getAction()); - ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); - } - - TEST_F(HeartbeatResponseTest, UpdateHeartbeatDataPrimaryDownMajorityButNoPriority) { - setSelfMemberState(MemberState::RS_SECONDARY); - - updateConfig(BSON("_id" << "rs0" << - "version" << 5 << - "members" << BSON_ARRAY( - BSON("_id" << 0 << "host" << "host1:27017" << "priority" << 0) << - BSON("_id" << 1 << "host" << "host2:27017") << - BSON("_id" << 2 << "host" << "host3:27017"))), - 0); - - OpTime election = OpTime(400,0); - OpTime lastOpTimeApplied = OpTime(300,0); - - ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); - HeartbeatResponseAction nextAction = receiveUpHeartbeat(HostAndPort("host2"), - "rs0", - MemberState::RS_PRIMARY, - election, - election, - lastOpTimeApplied); - ASSERT_NO_ACTION(nextAction.getAction()); - ASSERT_EQUALS(1, getCurrentPrimaryIndex()); - - nextAction = receiveUpHeartbeat(HostAndPort("host3"), - "rs0", - MemberState::RS_SECONDARY, - election, - election, - lastOpTimeApplied); - ASSERT_NO_ACTION(nextAction.getAction()); - ASSERT_EQUALS(1, getCurrentPrimaryIndex()); - - nextAction = receiveDownHeartbeat(HostAndPort("host2"), "rs0", lastOpTimeApplied); - ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); - ASSERT_NO_ACTION(nextAction.getAction()); - ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); + HostAndPort target() { + return _target; } - TEST_F(HeartbeatResponseTest, UpdateHeartbeatDataPrimaryDownMajorityButIAmStarting) { - setSelfMemberState(MemberState::RS_STARTUP); - - OpTime election = OpTime(400,0); - OpTime lastOpTimeApplied = OpTime(300,0); - - ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); - HeartbeatResponseAction nextAction = receiveUpHeartbeat(HostAndPort("host2"), - "rs0", - MemberState::RS_PRIMARY, - election, - election, - lastOpTimeApplied); - ASSERT_NO_ACTION(nextAction.getAction()); - ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); - - nextAction = receiveUpHeartbeat(HostAndPort("host3"), - "rs0", - MemberState::RS_SECONDARY, - election, - election, - lastOpTimeApplied); - ASSERT_NO_ACTION(nextAction.getAction()); - - nextAction = receiveDownHeartbeat(HostAndPort("host2"), "rs0", lastOpTimeApplied); - ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); - ASSERT_NO_ACTION(nextAction.getAction()); - ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); - } +private: + Date_t _firstRequestDate; + HostAndPort _target; +}; - TEST_F(HeartbeatResponseTest, UpdateHeartbeatDataPrimaryDownMajorityButIAmRecovering) { - setSelfMemberState(MemberState::RS_RECOVERING); - - OpTime election = OpTime(400,0); - OpTime lastOpTimeApplied = OpTime(300,0); - - ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); - HeartbeatResponseAction nextAction = receiveUpHeartbeat(HostAndPort("host2"), - "rs0", - MemberState::RS_PRIMARY, - election, - election, - lastOpTimeApplied); - 
ASSERT_NO_ACTION(nextAction.getAction()); - ASSERT_EQUALS(1, getCurrentPrimaryIndex()); - - nextAction = receiveDownHeartbeat(HostAndPort("host2"), "rs0", lastOpTimeApplied); - ASSERT_NO_ACTION(nextAction.getAction()); - ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); +class HeartbeatResponseTestTwoRetries : public HeartbeatResponseTestOneRetry { +public: + virtual void setUp() { + HeartbeatResponseTestOneRetry::setUp(); + // First retry fails at t + 4500ms + HeartbeatResponseAction action = getTopoCoord().processHeartbeatResponse( + firstRequestDate() + 4500, // 4.5 of the 5 seconds elapsed; could retry. + Milliseconds(400), // Spent 0.4 of the 0.5 seconds in the network. + target(), + StatusWith<ReplSetHeartbeatResponse>(ErrorCodes::NodeNotFound, "Bad DNS?"), + OpTime(0, 0)); // We've never applied anything. + ASSERT_EQUALS(HeartbeatResponseAction::NoAction, action.getAction()); ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); - } + // Because the first retry failed without timing out, we expect to retry immediately. + ASSERT_EQUALS(Date_t(firstRequestDate() + 4500), action.getNextHeartbeatStartDate()); - TEST_F(HeartbeatResponseTest, UpdateHeartbeatDataPrimaryDownMajorityButIHaveStepdownWait) { - setSelfMemberState(MemberState::RS_SECONDARY); - - OpTime election = OpTime(400,0); - OpTime lastOpTimeApplied = OpTime(300,0); - - ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); - HeartbeatResponseAction nextAction = receiveUpHeartbeat(HostAndPort("host2"), - "rs0", - MemberState::RS_PRIMARY, - election, - election, - lastOpTimeApplied); - ASSERT_NO_ACTION(nextAction.getAction()); - ASSERT_EQUALS(1, getCurrentPrimaryIndex()); - - nextAction = receiveUpHeartbeat(HostAndPort("host3"), - "rs0", - MemberState::RS_SECONDARY, - election, - election, - lastOpTimeApplied); - ASSERT_NO_ACTION(nextAction.getAction()); - - // freeze node to set stepdown wait - BSONObjBuilder response; - getTopoCoord().prepareFreezeResponse(now()++, 20, &response); - - nextAction = receiveDownHeartbeat(HostAndPort("host2"), "rs0", lastOpTimeApplied); - ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); - ASSERT_NO_ACTION(nextAction.getAction()); - ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); - } + // Second retry prepared at t + 4500ms. + std::pair<ReplSetHeartbeatArgs, Milliseconds> request = + getTopoCoord().prepareHeartbeatRequest(firstRequestDate() + 4500, "rs0", target()); + // 500ms left to complete the heartbeat. 
+ ASSERT_EQUALS(500, request.second.total_milliseconds()); - TEST_F(HeartbeatResponseTest, UpdateHeartbeatDataPrimaryDownMajorityButIAmArbiter) { - updateConfig(BSON("_id" << "rs0" << - "version" << 5 << - "members" << BSON_ARRAY( - BSON("_id" << 0 << "host" << "host1:27017" << - "arbiterOnly" << true) << - BSON("_id" << 1 << "host" << "host2:27017") << - BSON("_id" << 2 << "host" << "host3:27017"))), - 0); + // Ensure a second failed heartbeat did not cause the node to be marked down + BSONObjBuilder statusBuilder; + Status resultStatus(ErrorCodes::InternalError, "prepareStatusResponse didn't set result"); + getTopoCoord().prepareStatusResponse( + cbData(), firstRequestDate() + 4000, 10, OpTime(100, 0), &statusBuilder, &resultStatus); + ASSERT_OK(resultStatus); + BSONObj rsStatus = statusBuilder.obj(); + std::vector<BSONElement> memberArray = rsStatus["members"].Array(); + BSONObj member1Status = memberArray[1].Obj(); - OpTime election = OpTime(400,0); - OpTime lastOpTimeApplied = OpTime(300,0); - - HeartbeatResponseAction nextAction = receiveUpHeartbeat(HostAndPort("host3"), - "rs0", - MemberState::RS_SECONDARY, - election, - election, - lastOpTimeApplied); - ASSERT_NO_ACTION(nextAction.getAction()); - ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); - - nextAction = receiveUpHeartbeat(HostAndPort("host2"), - "rs0", - MemberState::RS_PRIMARY, - election, - election, - lastOpTimeApplied); - ASSERT_NO_ACTION(nextAction.getAction()); - ASSERT_EQUALS(1, getCurrentPrimaryIndex()); - - nextAction = receiveDownHeartbeat(HostAndPort("host2"), "rs0", lastOpTimeApplied); - ASSERT_NO_ACTION(nextAction.getAction()); - ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); - ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); + ASSERT_EQUALS(1, member1Status["_id"].Int()); + ASSERT_EQUALS(1, member1Status["health"].Double()); } +}; - TEST_F(HeartbeatResponseTest, UpdateHeartbeatDataPrimaryDownMajority) { - setSelfMemberState(MemberState::RS_SECONDARY); - - OpTime election = OpTime(400,0); - OpTime lastOpTimeApplied = OpTime(399,0); - - ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); - HeartbeatResponseAction nextAction = receiveUpHeartbeat(HostAndPort("host2"), - "rs0", - MemberState::RS_PRIMARY, - election, - election, - lastOpTimeApplied); - ASSERT_NO_ACTION(nextAction.getAction()); - ASSERT_EQUALS(1, getCurrentPrimaryIndex()); - - nextAction = receiveUpHeartbeat(HostAndPort("host3"), - "rs0", - MemberState::RS_SECONDARY, - election, - election, - lastOpTimeApplied); - ASSERT_NO_ACTION(nextAction.getAction()); - - nextAction = receiveDownHeartbeat(HostAndPort("host2"), "rs0", lastOpTimeApplied); - ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); - ASSERT_EQUALS(HeartbeatResponseAction::StartElection, nextAction.getAction()); - ASSERT_TRUE(TopologyCoordinator::Role::candidate == getTopoCoord().getRole()); +class HeartbeatResponseHighVerbosityTest : public HeartbeatResponseTest { +public: + virtual void setUp() { + HeartbeatResponseTest::setUp(); + // set verbosity as high as the highest verbosity log message we'd like to check for + logger::globalLogDomain()->setMinimumLoggedSeverity(logger::LogSeverity::Debug(3)); } - TEST_F(HeartbeatResponseTest, ElectionStartElectionWhileCandidate) { - // In this test, the TopologyCoordinator goes through the steps of a successful election, - // during which it receives a heartbeat that would normally trigger it to become a candidate - // and respond with a StartElection HeartbeatResponseAction. 
However, since it is already in - // candidate state, it responds with a NoAction HeartbeatResponseAction. Then finishes by - // being winning the election. - - // 1. All nodes heartbeat to indicate that they are up and that "host2" is PRIMARY. - // 2. "host2" goes down, triggering an election. - // 3. "host2" comes back, which would normally trigger election, but since the - // TopologyCoordinator is already in candidate mode, does not. - // 4. TopologyCoordinator concludes its freshness round successfully and wins the election. - - setSelfMemberState(MemberState::RS_SECONDARY); - now() += 30000; // we need to be more than LastVote::leaseTime from the start of time or - // else some Date_t math goes horribly awry - - OpTime election = OpTime(0,0); - OpTime lastOpTimeApplied = OpTime(130,0); - OID round = OID::gen(); - - ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); - HeartbeatResponseAction nextAction = receiveUpHeartbeat(HostAndPort("host2"), - "rs0", - MemberState::RS_PRIMARY, - election, - lastOpTimeApplied, - lastOpTimeApplied); - ASSERT_NO_ACTION(nextAction.getAction()); - ASSERT_EQUALS(1, getCurrentPrimaryIndex()); - - nextAction = receiveUpHeartbeat(HostAndPort("host3"), - "rs0", - MemberState::RS_SECONDARY, - election, - lastOpTimeApplied, - lastOpTimeApplied); - ASSERT_NO_ACTION(nextAction.getAction()); - - // candidate time! - nextAction = receiveDownHeartbeat(HostAndPort("host2"), "rs0", lastOpTimeApplied); - ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); - ASSERT_EQUALS(HeartbeatResponseAction::StartElection, nextAction.getAction()); - ASSERT_TRUE(TopologyCoordinator::Role::candidate == getTopoCoord().getRole()); - - // see the downed node as SECONDARY and decide to take no action, but are still a candidate - nextAction = receiveUpHeartbeat(HostAndPort("host2"), - "rs0", - MemberState::RS_SECONDARY, - election, - lastOpTimeApplied, - lastOpTimeApplied); - ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); - - // normally this would trigger StartElection, but we are already a candidate - ASSERT_NO_ACTION(nextAction.getAction()); - ASSERT_TRUE(TopologyCoordinator::Role::candidate == getTopoCoord().getRole()); - - // now voteForSelf as though we received all our fresh responses - ASSERT_TRUE(getTopoCoord().voteForMyself(now()++)); - - // now win election and ensure _electionId and _electionTime are set properly - getTopoCoord().processWinElection(round, election); - ASSERT_EQUALS(round, getTopoCoord().getElectionId()); - ASSERT_EQUALS(election, getTopoCoord().getElectionTime()); - ASSERT_TRUE(TopologyCoordinator::Role::leader == getTopoCoord().getRole()); - ASSERT_EQUALS(0, getCurrentPrimaryIndex()); + virtual void tearDown() { + HeartbeatResponseTest::tearDown(); + logger::globalLogDomain()->setMinimumLoggedSeverity(logger::LogSeverity::Log()); } +}; + +TEST_F(HeartbeatResponseHighVerbosityTest, UpdateHeartbeatDataNodeBelivesWeAreDown) { + OpTime lastOpTimeApplied = OpTime(3, 0); + + // request heartbeat + std::pair<ReplSetHeartbeatArgs, Milliseconds> request = + getTopoCoord().prepareHeartbeatRequest(now()++, "rs0", HostAndPort("host2")); + + ReplSetHeartbeatResponse believesWeAreDownResponse; + believesWeAreDownResponse.noteReplSet(); + believesWeAreDownResponse.setSetName("rs0"); + believesWeAreDownResponse.setState(MemberState::RS_SECONDARY); + believesWeAreDownResponse.setElectable(true); + believesWeAreDownResponse.noteStateDisagreement(); + startCapturingLogMessages(); + HeartbeatResponseAction action = getTopoCoord().processHeartbeatResponse( + now()++, // Time is left. 
+ Milliseconds(400), // Spent 0.4 of the 0.5 second in the network. + HostAndPort("host2"), + StatusWith<ReplSetHeartbeatResponse>(believesWeAreDownResponse), + lastOpTimeApplied); + stopCapturingLogMessages(); + ASSERT_NO_ACTION(action.getAction()); + ASSERT_EQUALS(1, countLogLinesContaining("host2:27017 thinks that we are down")); +} + +TEST_F(HeartbeatResponseHighVerbosityTest, UpdateHeartbeatDataMemberNotInConfig) { + OpTime lastOpTimeApplied = OpTime(3, 0); + + // request heartbeat + std::pair<ReplSetHeartbeatArgs, Milliseconds> request = + getTopoCoord().prepareHeartbeatRequest(now()++, "rs0", HostAndPort("host5")); + + ReplSetHeartbeatResponse memberMissingResponse; + memberMissingResponse.noteReplSet(); + memberMissingResponse.setSetName("rs0"); + memberMissingResponse.setState(MemberState::RS_SECONDARY); + memberMissingResponse.setElectable(true); + memberMissingResponse.noteStateDisagreement(); + startCapturingLogMessages(); + HeartbeatResponseAction action = getTopoCoord().processHeartbeatResponse( + now()++, // Time is left. + Milliseconds(400), // Spent 0.4 of the 0.5 second in the network. + HostAndPort("host5"), + StatusWith<ReplSetHeartbeatResponse>(memberMissingResponse), + lastOpTimeApplied); + stopCapturingLogMessages(); + ASSERT_NO_ACTION(action.getAction()); + ASSERT_EQUALS(1, countLogLinesContaining("Could not find host5:27017 in current config")); +} + +TEST_F(HeartbeatResponseHighVerbosityTest, UpdateHeartbeatDataSameConfig) { + OpTime lastOpTimeApplied = OpTime(3, 0); + + // request heartbeat + std::pair<ReplSetHeartbeatArgs, Milliseconds> request = + getTopoCoord().prepareHeartbeatRequest(now()++, "rs0", HostAndPort("host2")); + + // construct a copy of the original config for log message checking later + // see HeartbeatResponseTest for the origin of the original config + ReplicaSetConfig originalConfig; + originalConfig.initialize(BSON("_id" + << "rs0" + << "version" << 5 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "host1:27017") + << BSON("_id" << 1 << "host" + << "host2:27017") + << BSON("_id" << 2 << "host" + << "host3:27017")) << "settings" + << BSON("heartbeatTimeoutSecs" << 5))); + + ReplSetHeartbeatResponse sameConfigResponse; + sameConfigResponse.noteReplSet(); + sameConfigResponse.setSetName("rs0"); + sameConfigResponse.setState(MemberState::RS_SECONDARY); + sameConfigResponse.setElectable(true); + sameConfigResponse.noteStateDisagreement(); + sameConfigResponse.setVersion(2); + sameConfigResponse.setConfig(originalConfig); + startCapturingLogMessages(); + HeartbeatResponseAction action = getTopoCoord().processHeartbeatResponse( + now()++, // Time is left. + Milliseconds(400), // Spent 0.4 of the 0.5 second in the network. 
+ HostAndPort("host2"), + StatusWith<ReplSetHeartbeatResponse>(sameConfigResponse), + lastOpTimeApplied); + stopCapturingLogMessages(); + ASSERT_NO_ACTION(action.getAction()); + ASSERT_EQUALS(1, + countLogLinesContaining( + "Config from heartbeat response was " + "same as ours.")); +} + +TEST_F(HeartbeatResponseHighVerbosityTest, UpdateHeartbeatDataOldConfig) { + OpTime lastOpTimeApplied = OpTime(3, 0); + + // request heartbeat + std::pair<ReplSetHeartbeatArgs, Milliseconds> request = + getTopoCoord().prepareHeartbeatRequest(now()++, "rs0", HostAndPort("host2")); + + ReplSetHeartbeatResponse believesWeAreDownResponse; + believesWeAreDownResponse.noteReplSet(); + believesWeAreDownResponse.setSetName("rs0"); + believesWeAreDownResponse.setState(MemberState::RS_SECONDARY); + believesWeAreDownResponse.setElectable(true); + believesWeAreDownResponse.noteStateDisagreement(); + startCapturingLogMessages(); + HeartbeatResponseAction action = getTopoCoord().processHeartbeatResponse( + now()++, // Time is left. + Milliseconds(400), // Spent 0.4 of the 0.5 second in the network. + HostAndPort("host2"), + StatusWith<ReplSetHeartbeatResponse>(believesWeAreDownResponse), + lastOpTimeApplied); + stopCapturingLogMessages(); + ASSERT_NO_ACTION(action.getAction()); + ASSERT_EQUALS(1, countLogLinesContaining("host2:27017 thinks that we are down")); +} + +TEST_F(HeartbeatResponseTestOneRetry, DecideToReconfig) { + // Confirm that action responses can come back from retries; in this, expect a Reconfig + // action. + ReplicaSetConfig newConfig; + ASSERT_OK(newConfig.initialize(BSON("_id" + << "rs0" + << "version" << 7 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "host1:27017") + << BSON("_id" << 1 << "host" + << "host2:27017") + << BSON("_id" << 2 << "host" + << "host3:27017") + << BSON("_id" << 3 << "host" + << "host4:27017")) << "settings" + << BSON("heartbeatTimeoutSecs" << 5)))); + ASSERT_OK(newConfig.validate()); + + ReplSetHeartbeatResponse reconfigResponse; + reconfigResponse.noteReplSet(); + reconfigResponse.setSetName("rs0"); + reconfigResponse.setState(MemberState::RS_SECONDARY); + reconfigResponse.setElectable(true); + reconfigResponse.setVersion(7); + reconfigResponse.setConfig(newConfig); + HeartbeatResponseAction action = getTopoCoord().processHeartbeatResponse( + firstRequestDate() + 4500, // Time is left. + Milliseconds(400), // Spent 0.4 of the 0.5 second in the network. + target(), + StatusWith<ReplSetHeartbeatResponse>(reconfigResponse), + OpTime(0, 0)); // We've never applied anything. + ASSERT_EQUALS(HeartbeatResponseAction::Reconfig, action.getAction()); + ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); + ASSERT_EQUALS(Date_t(firstRequestDate() + 6500), action.getNextHeartbeatStartDate()); +} + +TEST_F(HeartbeatResponseTestOneRetry, DecideToStepDownRemotePrimary) { + // Confirm that action responses can come back from retries; in this, expect a + // StepDownRemotePrimary action. 
+ + // make self primary + ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); + makeSelfPrimary(OpTime(5, 0)); + ASSERT_EQUALS(0, getCurrentPrimaryIndex()); + + ReplSetHeartbeatResponse electedMoreRecentlyResponse; + electedMoreRecentlyResponse.noteReplSet(); + electedMoreRecentlyResponse.setSetName("rs0"); + electedMoreRecentlyResponse.setState(MemberState::RS_PRIMARY); + electedMoreRecentlyResponse.setElectable(true); + electedMoreRecentlyResponse.setElectionTime(OpTime(3, 0)); + electedMoreRecentlyResponse.setVersion(5); + HeartbeatResponseAction action = getTopoCoord().processHeartbeatResponse( + firstRequestDate() + 4500, // Time is left. + Milliseconds(400), // Spent 0.4 of the 0.5 second in the network. + target(), + StatusWith<ReplSetHeartbeatResponse>(electedMoreRecentlyResponse), + OpTime(0, 0)); // We've never applied anything. + ASSERT_EQUALS(HeartbeatResponseAction::StepDownRemotePrimary, action.getAction()); + ASSERT_EQUALS(1, action.getPrimaryConfigIndex()); + ASSERT_EQUALS(Date_t(firstRequestDate() + 6500), action.getNextHeartbeatStartDate()); +} + +TEST_F(HeartbeatResponseTestOneRetry, DecideToStepDownSelf) { + // Confirm that action responses can come back from retries; in this, expect a StepDownSelf + // action. + + // acknowledge the other member so that we see a majority + HeartbeatResponseAction action = + receiveDownHeartbeat(HostAndPort("host3"), "rs0", OpTime(100, 0)); + ASSERT_NO_ACTION(action.getAction()); + + // make us PRIMARY + makeSelfPrimary(); + + ReplSetHeartbeatResponse electedMoreRecentlyResponse; + electedMoreRecentlyResponse.noteReplSet(); + electedMoreRecentlyResponse.setSetName("rs0"); + electedMoreRecentlyResponse.setState(MemberState::RS_PRIMARY); + electedMoreRecentlyResponse.setElectable(false); + electedMoreRecentlyResponse.setElectionTime(OpTime(10, 0)); + electedMoreRecentlyResponse.setVersion(5); + action = getTopoCoord().processHeartbeatResponse( + firstRequestDate() + 4500, // Time is left. + Milliseconds(400), // Spent 0.4 of the 0.5 second in the network. + target(), + StatusWith<ReplSetHeartbeatResponse>(electedMoreRecentlyResponse), + OpTime(0, 0)); // We've never applied anything. + ASSERT_EQUALS(HeartbeatResponseAction::StepDownSelf, action.getAction()); + ASSERT_EQUALS(0, action.getPrimaryConfigIndex()); + ASSERT_EQUALS(Date_t(firstRequestDate() + 6500), action.getNextHeartbeatStartDate()); + // Doesn't actually do the stepdown until stepDownIfPending is called + ASSERT_TRUE(TopologyCoordinator::Role::leader == getTopoCoord().getRole()); + ASSERT_EQUALS(0, getCurrentPrimaryIndex()); + + ASSERT_TRUE(getTopoCoord().stepDownIfPending()); + ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); + ASSERT_EQUALS(1, getCurrentPrimaryIndex()); +} + +TEST_F(HeartbeatResponseTestOneRetry, DecideToStartElection) { + // Confirm that action responses can come back from retries; in this, expect a StartElection + // action. 
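These retry tests all end by asserting a Role transition, so it helps to spell out how a caller reacts to the returned action: StartElection flips a follower to candidate immediately, while StepDownSelf is deferred until stepDownIfPending() runs. A minimal sketch (stand-in enums, illustrative caller logic, not the coordinator's real dispatch):

    #include <cassert>

    // Simplified stand-ins mirroring the enums asserted in these tests.
    enum class ActionKind { NoAction, Reconfig, StartElection, StepDownSelf, StepDownRemotePrimary };
    enum class Role { leader, follower, candidate };

    // Map a heartbeat action to the local role transition the tests assert on.
    Role applyAction(Role current, ActionKind action) {
        switch (action) {
            case ActionKind::StartElection:
                return Role::candidate;  // follower stands for election at once
            case ActionKind::StepDownSelf:
                return current;          // deferred until stepDownIfPending() is called
            default:
                return current;          // NoAction, Reconfig, StepDownRemotePrimary
        }
    }

    int main() {
        assert(applyAction(Role::follower, ActionKind::StartElection) == Role::candidate);
        assert(applyAction(Role::leader, ActionKind::StepDownSelf) == Role::leader);
        return 0;
    }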
+ + // acknowledge the other member so that we see a majority + OpTime election = OpTime(400, 0); + OpTime lastOpTimeApplied = OpTime(300, 0); + HeartbeatResponseAction action = receiveUpHeartbeat(HostAndPort("host3"), + "rs0", + MemberState::RS_SECONDARY, + election, + election, + lastOpTimeApplied); + ASSERT_NO_ACTION(action.getAction()); + + // make sure we are electable + setSelfMemberState(MemberState::RS_SECONDARY); + + ReplSetHeartbeatResponse startElectionResponse; + startElectionResponse.noteReplSet(); + startElectionResponse.setSetName("rs0"); + startElectionResponse.setState(MemberState::RS_SECONDARY); + startElectionResponse.setElectable(true); + startElectionResponse.setVersion(5); + action = getTopoCoord().processHeartbeatResponse( + firstRequestDate() + 4500, // Time is left. + Milliseconds(400), // Spent 0.4 of the 0.5 second in the network. + target(), + StatusWith<ReplSetHeartbeatResponse>(startElectionResponse), + election); + ASSERT_EQUALS(HeartbeatResponseAction::StartElection, action.getAction()); + ASSERT_TRUE(TopologyCoordinator::Role::candidate == getTopoCoord().getRole()); + ASSERT_EQUALS(Date_t(firstRequestDate() + 6500), action.getNextHeartbeatStartDate()); +} + +TEST_F(HeartbeatResponseTestTwoRetries, HeartbeatRetriesAtMostTwice) { + // Confirm that the topology coordinator attempts to retry a failed heartbeat two times + // after initial failure, assuming that the heartbeat timeout (set to 5 seconds in the + // fixture) has not expired. + // + // Failed heartbeats propose taking no action, other than scheduling the next heartbeat. We + // can detect a retry vs the next regularly scheduled heartbeat because retries are + // scheduled immediately, while subsequent heartbeats are scheduled after the hard-coded + // heartbeat interval of 2 seconds. + + // Second retry fails at t + 4800ms + HeartbeatResponseAction action = getTopoCoord().processHeartbeatResponse( + firstRequestDate() + 4800, // 4.8 of the 5 seconds elapsed; could still retry. + Milliseconds(100), // Spent 0.1 of the 0.3 seconds in the network. + target(), + StatusWith<ReplSetHeartbeatResponse>(ErrorCodes::NodeNotFound, "Bad DNS?"), + OpTime(0, 0)); // We've never applied anything. + ASSERT_EQUALS(HeartbeatResponseAction::NoAction, action.getAction()); + ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); + // Because this is the second retry, rather than retry again, we expect to wait for the + // heartbeat interval of 2 seconds to elapse. + ASSERT_EQUALS(Date_t(firstRequestDate() + 6800), action.getNextHeartbeatStartDate()); + + // Ensure a third failed heartbeat caused the node to be marked down + BSONObjBuilder statusBuilder; + Status resultStatus(ErrorCodes::InternalError, "prepareStatusResponse didn't set result"); + getTopoCoord().prepareStatusResponse( + cbData(), firstRequestDate() + 4900, 10, OpTime(100, 0), &statusBuilder, &resultStatus); + ASSERT_OK(resultStatus); + BSONObj rsStatus = statusBuilder.obj(); + std::vector<BSONElement> memberArray = rsStatus["members"].Array(); + BSONObj member1Status = memberArray[1].Obj(); + + ASSERT_EQUALS(1, member1Status["_id"].Int()); + ASSERT_EQUALS(0, member1Status["health"].Double()); +} + +TEST_F(HeartbeatResponseTestTwoRetries, DecideToStepDownRemotePrimary) { + // Confirm that action responses can come back from retries; in this, expect a + // StepDownRemotePrimary action. 
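HeartbeatRetriesAtMostTwice, just above, pins the schedule down: a failure inside the 5-second window is retried immediately, at most twice, and anything else (a timeout, a success, or a third failure) waits out the 2-second heartbeat interval. The same schedule restated as a self-contained sketch, using only the constants these tests assert:

    #include <cassert>
    #include <cstdint>

    const int64_t kHeartbeatIntervalMs = 2000;  // regular schedule, per these tests
    const int kMaxRetries = 2;                  // immediate retries after a failure

    // When should the next heartbeat to this target start?
    int64_t nextHeartbeatStartMs(int64_t nowMs, bool failed, bool timedOut, int retriesSoFar) {
        if (failed && !timedOut && retriesSoFar < kMaxRetries) {
            return nowMs;  // retry immediately while the timeout window is open
        }
        return nowMs + kHeartbeatIntervalMs;  // otherwise wait out the interval
    }

    int main() {
        // First failure at t+4000ms: immediate retry (OneRetry fixture).
        assert(nextHeartbeatStartMs(4000, true, false, 0) == 4000);
        // Second failure at t+4500ms: immediate retry (TwoRetries fixture).
        assert(nextHeartbeatStartMs(4500, true, false, 1) == 4500);
        // Third failure at t+4800ms: out of retries, next start at t+6800ms.
        assert(nextHeartbeatStartMs(4800, true, false, 2) == 6800);
        // Timeout at t+5000ms suppresses retries entirely: next start at t+7000ms.
        assert(nextHeartbeatStartMs(5000, true, true, 0) == 7000);
        return 0;
    }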
+ + // make self primary + ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); + makeSelfPrimary(OpTime(5, 0)); + ASSERT_EQUALS(0, getCurrentPrimaryIndex()); + + ReplSetHeartbeatResponse electedMoreRecentlyResponse; + electedMoreRecentlyResponse.noteReplSet(); + electedMoreRecentlyResponse.setSetName("rs0"); + electedMoreRecentlyResponse.setState(MemberState::RS_PRIMARY); + electedMoreRecentlyResponse.setElectable(true); + electedMoreRecentlyResponse.setElectionTime(OpTime(3, 0)); + electedMoreRecentlyResponse.setVersion(5); + HeartbeatResponseAction action = getTopoCoord().processHeartbeatResponse( + firstRequestDate() + 5000, // Time is left. + Milliseconds(400), // Spent 0.4 of the 0.5 second in the network. + target(), + StatusWith<ReplSetHeartbeatResponse>(electedMoreRecentlyResponse), + OpTime(0, 0)); // We've never applied anything. + ASSERT_EQUALS(HeartbeatResponseAction::StepDownRemotePrimary, action.getAction()); + ASSERT_EQUALS(1, action.getPrimaryConfigIndex()); + ASSERT_EQUALS(Date_t(firstRequestDate() + 7000), action.getNextHeartbeatStartDate()); +} + +TEST_F(HeartbeatResponseTestTwoRetries, DecideToStepDownSelf) { + // Confirm that action responses can come back from retries; in this, expect a StepDownSelf + // action. + + // acknowledge the other member so that we see a majority + HeartbeatResponseAction action = + receiveDownHeartbeat(HostAndPort("host3"), "rs0", OpTime(100, 0)); + ASSERT_NO_ACTION(action.getAction()); + + // make us PRIMARY + makeSelfPrimary(); + + ReplSetHeartbeatResponse electedMoreRecentlyResponse; + electedMoreRecentlyResponse.noteReplSet(); + electedMoreRecentlyResponse.setSetName("rs0"); + electedMoreRecentlyResponse.setState(MemberState::RS_PRIMARY); + electedMoreRecentlyResponse.setElectable(false); + electedMoreRecentlyResponse.setElectionTime(OpTime(10, 0)); + electedMoreRecentlyResponse.setVersion(5); + action = getTopoCoord().processHeartbeatResponse( + firstRequestDate() + 5000, // Time is left. + Milliseconds(400), // Spent 0.4 of the 0.5 second in the network. + target(), + StatusWith<ReplSetHeartbeatResponse>(electedMoreRecentlyResponse), + OpTime(0, 0)); // We've never applied anything. + ASSERT_EQUALS(HeartbeatResponseAction::StepDownSelf, action.getAction()); + ASSERT_EQUALS(0, action.getPrimaryConfigIndex()); + ASSERT_EQUALS(Date_t(firstRequestDate() + 7000), action.getNextHeartbeatStartDate()); + // Doesn't actually do the stepdown until stepDownIfPending is called + ASSERT_TRUE(TopologyCoordinator::Role::leader == getTopoCoord().getRole()); + ASSERT_EQUALS(0, getCurrentPrimaryIndex()); + + ASSERT_TRUE(getTopoCoord().stepDownIfPending()); + ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); + ASSERT_EQUALS(1, getCurrentPrimaryIndex()); +} + +TEST_F(HeartbeatResponseTestTwoRetries, DecideToStartElection) { + // Confirm that action responses can come back from retries; in this, expect a StartElection + // action. 
+ + // acknowledge the other member so that we see a majority + OpTime election = OpTime(400, 0); + OpTime lastOpTimeApplied = OpTime(300, 0); + HeartbeatResponseAction action = receiveUpHeartbeat(HostAndPort("host3"), + "rs0", + MemberState::RS_SECONDARY, + election, + election, + lastOpTimeApplied); + ASSERT_NO_ACTION(action.getAction()); + + // make sure we are electable + setSelfMemberState(MemberState::RS_SECONDARY); + + ReplSetHeartbeatResponse startElectionResponse; + startElectionResponse.noteReplSet(); + startElectionResponse.setSetName("rs0"); + startElectionResponse.setState(MemberState::RS_SECONDARY); + startElectionResponse.setElectable(true); + startElectionResponse.setVersion(5); + action = getTopoCoord().processHeartbeatResponse( + firstRequestDate() + 5000, // Time is left. + Milliseconds(400), // Spent 0.4 of the 0.5 second in the network. + target(), + StatusWith<ReplSetHeartbeatResponse>(startElectionResponse), + election); + ASSERT_EQUALS(HeartbeatResponseAction::StartElection, action.getAction()); + ASSERT_TRUE(TopologyCoordinator::Role::candidate == getTopoCoord().getRole()); + ASSERT_EQUALS(Date_t(firstRequestDate() + 7000), action.getNextHeartbeatStartDate()); +} + +TEST_F(HeartbeatResponseTest, HeartbeatTimeoutSuppressesFirstRetry) { + // Confirm that the topology coordinator does not schedule an immediate heartbeat retry if + // the heartbeat timeout period expired before the initial request completed. + + HostAndPort target("host2", 27017); + Date_t firstRequestDate = unittest::assertGet(dateFromISOString("2014-08-29T13:00Z")); + + // Initial heartbeat request prepared, at t + 0. + std::pair<ReplSetHeartbeatArgs, Milliseconds> request = + getTopoCoord().prepareHeartbeatRequest(firstRequestDate, "rs0", target); + // 5 seconds to successfully complete the heartbeat before the timeout expires. + ASSERT_EQUALS(5000, request.second.total_milliseconds()); + + // Initial heartbeat request fails at t + 5000ms + HeartbeatResponseAction action = getTopoCoord().processHeartbeatResponse( + firstRequestDate + 5000, // Entire heartbeat period elapsed; no retry allowed. + Milliseconds(4990), // Spent 4.99 of the 5 seconds in the network. + target, + StatusWith<ReplSetHeartbeatResponse>(ErrorCodes::ExceededTimeLimit, "Took too long"), + OpTime(0, 0)); // We've never applied anything. + + ASSERT_EQUALS(HeartbeatResponseAction::NoAction, action.getAction()); + ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); + // Because the heartbeat timed out, we'll retry in 2 seconds. + ASSERT_EQUALS(Date_t(firstRequestDate + 7000), action.getNextHeartbeatStartDate()); +} + +TEST_F(HeartbeatResponseTestOneRetry, HeartbeatTimeoutSuppressesSecondRetry) { + // Confirm that the topology coordinator does not schedule a second heartbeat retry if + // the heartbeat timeout period expired before the first retry completed. + HeartbeatResponseAction action = getTopoCoord().processHeartbeatResponse( + firstRequestDate() + 5010, // Entire heartbeat period elapsed; no retry allowed. + Milliseconds(1000), // Spent 1 of the 1.01 seconds in the network. + target(), + StatusWith<ReplSetHeartbeatResponse>(ErrorCodes::ExceededTimeLimit, "Took too long"), + OpTime(0, 0)); // We've never applied anything. + + ASSERT_EQUALS(HeartbeatResponseAction::NoAction, action.getAction()); + ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); + // Because the heartbeat timed out, we'll retry in 2 seconds.
+ ASSERT_EQUALS(Date_t(firstRequestDate() + 7010), action.getNextHeartbeatStartDate()); +} + +TEST_F(HeartbeatResponseTestTwoRetries, HeartbeatThreeNonconsecutiveFailures) { + // Confirm that the topology coordinator does not mark a node down on three + // nonconsecutive heartbeat failures. + ReplSetHeartbeatResponse response; + response.noteReplSet(); + response.setSetName("rs0"); + response.setState(MemberState::RS_SECONDARY); + response.setElectable(true); + response.setVersion(5); + + // successful response (third response due to the two failures in setUp()) + HeartbeatResponseAction action = + getTopoCoord().processHeartbeatResponse(firstRequestDate() + 4500, + Milliseconds(400), + target(), + StatusWith<ReplSetHeartbeatResponse>(response), + OpTime(0, 0)); // We've never applied anything. + + ASSERT_EQUALS(HeartbeatResponseAction::NoAction, action.getAction()); + ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); + // Because the heartbeat succeeded, we'll retry in 2 seconds. + ASSERT_EQUALS(Date_t(firstRequestDate() + 6500), action.getNextHeartbeatStartDate()); + + // request next heartbeat + getTopoCoord().prepareHeartbeatRequest(firstRequestDate() + 6500, "rs0", target()); + // third failed response + action = getTopoCoord().processHeartbeatResponse( + firstRequestDate() + 7100, + Milliseconds(400), + target(), + StatusWith<ReplSetHeartbeatResponse>(Status(ErrorCodes::HostUnreachable, "")), + OpTime(0, 0)); // We've never applied anything. + + ASSERT_EQUALS(HeartbeatResponseAction::NoAction, action.getAction()); + ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); + + // Ensure a third nonconsecutive heartbeat failure did not cause the node to be marked down + BSONObjBuilder statusBuilder; + Status resultStatus(ErrorCodes::InternalError, "prepareStatusResponse didn't set result"); + getTopoCoord().prepareStatusResponse( + cbData(), firstRequestDate() + 7000, 600, OpTime(100, 0), &statusBuilder, &resultStatus); + ASSERT_OK(resultStatus); + BSONObj rsStatus = statusBuilder.obj(); + std::vector<BSONElement> memberArray = rsStatus["members"].Array(); + BSONObj member1Status = memberArray[1].Obj(); + + ASSERT_EQUALS(1, member1Status["_id"].Int()); + ASSERT_EQUALS(1, member1Status["health"].Double()); +} + +TEST_F(HeartbeatResponseTest, UpdateHeartbeatDataNewPrimary) { + OpTime election = OpTime(5, 0); + OpTime lastOpTimeApplied = OpTime(3, 0); + + ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); + HeartbeatResponseAction nextAction = receiveUpHeartbeat(HostAndPort("host2"), + "rs0", + MemberState::RS_PRIMARY, + election, + election, + lastOpTimeApplied); + ASSERT_EQUALS(1, getCurrentPrimaryIndex()); + ASSERT_NO_ACTION(nextAction.getAction()); + ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); +} + +TEST_F(HeartbeatResponseTest, UpdateHeartbeatDataTwoPrimariesNewOneOlder) { + OpTime election = OpTime(5, 0); + OpTime election2 = OpTime(4, 0); + OpTime lastOpTimeApplied = OpTime(3, 0); + + ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); + HeartbeatResponseAction nextAction = receiveUpHeartbeat(HostAndPort("host2"), + "rs0", + MemberState::RS_PRIMARY, + election, + election, + lastOpTimeApplied); + ASSERT_EQUALS(1, getCurrentPrimaryIndex()); + ASSERT_NO_ACTION(nextAction.getAction()); + + nextAction = receiveUpHeartbeat(HostAndPort("host3"), + "rs0", + MemberState::RS_PRIMARY, + election2, + election, + lastOpTimeApplied); + // second primary does not change primary index + ASSERT_EQUALS(1, 
getCurrentPrimaryIndex()); + ASSERT_NO_ACTION(nextAction.getAction()); + ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); +} + +TEST_F(HeartbeatResponseTest, UpdateHeartbeatDataTwoPrimariesNewOneNewer) { + OpTime election = OpTime(4, 0); + OpTime election2 = OpTime(5, 0); + OpTime lastOpTimeApplied = OpTime(3, 0); + + ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); + HeartbeatResponseAction nextAction = receiveUpHeartbeat(HostAndPort("host2"), + "rs0", + MemberState::RS_PRIMARY, + election, + election, + lastOpTimeApplied); + ASSERT_EQUALS(1, getCurrentPrimaryIndex()); + ASSERT_NO_ACTION(nextAction.getAction()); + + nextAction = receiveUpHeartbeat(HostAndPort("host3"), + "rs0", + MemberState::RS_PRIMARY, + election2, + election, + lastOpTimeApplied); + // second primary does not change primary index + ASSERT_EQUALS(1, getCurrentPrimaryIndex()); + ASSERT_NO_ACTION(nextAction.getAction()); + ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); +} + +TEST_F(HeartbeatResponseTest, UpdateHeartbeatDataTwoPrimariesIncludingMeNewOneOlder) { + ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); + makeSelfPrimary(OpTime(5, 0)); + + OpTime election = OpTime(4, 0); + OpTime lastOpTimeApplied = OpTime(3, 0); + + ASSERT_EQUALS(0, getCurrentPrimaryIndex()); + HeartbeatResponseAction nextAction = receiveUpHeartbeat(HostAndPort("host2"), + "rs0", + MemberState::RS_PRIMARY, + election, + election, + lastOpTimeApplied); + ASSERT_EQUALS(0, getCurrentPrimaryIndex()); + ASSERT_EQUALS(HeartbeatResponseAction::StepDownRemotePrimary, nextAction.getAction()); + ASSERT_EQUALS(1, nextAction.getPrimaryConfigIndex()); + ASSERT_TRUE(TopologyCoordinator::Role::leader == getTopoCoord().getRole()); +} + +TEST_F(HeartbeatResponseTest, UpdateHeartbeatDataStepDownPrimaryForHighPriorityFreshNode) { + // In this test, the Topology coordinator sees a PRIMARY ("host2") and then sees a higher + // priority and similarly fresh node ("host3"). However, since the coordinator's node + // (host1) is not the higher priority node, it takes no action. + updateConfig( + BSON("_id" + << "rs0" + << "version" << 6 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "host1:27017") + << BSON("_id" << 1 << "host" + << "host2:27017") << BSON("_id" << 2 << "host" + << "host3:27017" + << "priority" << 3)) + << "settings" << BSON("heartbeatTimeoutSecs" << 5)), + 0); + setSelfMemberState(MemberState::RS_SECONDARY); + + OpTime election = OpTime(0, 0); + OpTime lastOpTimeApplied = OpTime(13, 0); + OpTime slightlyLessFreshLastOpTimeApplied = OpTime(3, 0); + + ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); + HeartbeatResponseAction nextAction = receiveUpHeartbeat(HostAndPort("host2"), + "rs0", + MemberState::RS_PRIMARY, + election, + lastOpTimeApplied, + lastOpTimeApplied); + ASSERT_EQUALS(1, getCurrentPrimaryIndex()); + + nextAction = receiveUpHeartbeat(HostAndPort("host3"), + "rs0", + MemberState::RS_SECONDARY, + election, + slightlyLessFreshLastOpTimeApplied, + lastOpTimeApplied); + ASSERT_EQUALS(HeartbeatResponseAction::NoAction, nextAction.getAction()); +} + +TEST_F(HeartbeatResponseTest, UpdateHeartbeatDataStepDownSelfForHighPriorityFreshNode) { + // In this test, the Topology coordinator becomes PRIMARY and then sees a higher priority + // and equally fresh node ("host3"). As a result it responds with a StepDownSelf action. + // + // Despite having stepped down, we should remain electable, in order to dissuade lower + // priority nodes from standing for election. 
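The step-down-for-priority tests in this group hinge on freshness: a primary yields to a higher-priority member only when that member is caught up, and ignores one that is stale. A simplified sketch of the decision (stand-in types; the real freshness window is more involved than this single cutoff):

    #include <cassert>
    #include <cstdint>

    // Simplified stand-ins for a config member's priority and progress.
    struct Member {
        int priority;
        uint32_t lastAppliedSecs;
    };

    // A primary yields to a higher-priority member only if that member is
    // fresh, i.e. caught up to the primary itself (illustrative cutoff).
    bool shouldStepDownFor(const Member& self, const Member& other) {
        const bool higherPriority = other.priority > self.priority;
        const bool fresh = other.lastAppliedSecs >= self.lastAppliedSecs;
        return higherPriority && fresh;
    }

    int main() {
        Member self{1, 1000};               // default priority, optime (1000,0)
        Member freshHighPriority{3, 1000};  // equally fresh: step down
        Member staleHighPriority{3, 0};     // stale: take no action
        assert(shouldStepDownFor(self, freshHighPriority));
        assert(!shouldStepDownFor(self, staleHighPriority));
        return 0;
    }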
+ updateConfig( + BSON("_id" + << "rs0" + << "version" << 6 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "host1:27017") + << BSON("_id" << 1 << "host" + << "host2:27017") << BSON("_id" << 2 << "host" + << "host3:27017" + << "priority" << 3)) + << "settings" << BSON("heartbeatTimeoutSecs" << 5)), + 0); + OpTime election = OpTime(1000, 0); + + getTopoCoord().setFollowerMode(MemberState::RS_SECONDARY); + ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); + makeSelfPrimary(election); + ASSERT_EQUALS(0, getCurrentPrimaryIndex()); + + HeartbeatResponseAction nextAction = receiveUpHeartbeat( + HostAndPort("host3"), "rs0", MemberState::RS_SECONDARY, election, election, election); + ASSERT_EQUALS(HeartbeatResponseAction::StepDownSelf, nextAction.getAction()); + ASSERT_EQUALS(0, nextAction.getPrimaryConfigIndex()); + + // Process a heartbeat response to confirm that this node, which is no longer primary, + // still tells other nodes that it is electable. This will stop lower priority nodes + // from standing for election. + ReplSetHeartbeatArgs hbArgs; + hbArgs.setSetName("rs0"); + hbArgs.setProtocolVersion(1); + hbArgs.setConfigVersion(6); + hbArgs.setSenderId(1); + hbArgs.setSenderHost(HostAndPort("host3", 27017)); + ReplSetHeartbeatResponse hbResp; + ASSERT_OK(getTopoCoord().prepareHeartbeatResponse(now(), hbArgs, "rs0", election, &hbResp)); + ASSERT(!hbResp.hasIsElectable() || hbResp.isElectable()) << hbResp.toBSON().toString(); +} + +TEST_F(HeartbeatResponseTest, UpdateHeartbeatDataDoNotStepDownSelfForHighPriorityStaleNode) { + // In this test, the Topology coordinator becomes PRIMARY and then sees a higher priority + // and stale node ("host3"). As a result it responds with NoAction. + updateConfig( + BSON("_id" + << "rs0" + << "version" << 6 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "host1:27017") + << BSON("_id" << 1 << "host" + << "host2:27017") << BSON("_id" << 2 << "host" + << "host3:27017" + << "priority" << 3)) + << "settings" << BSON("heartbeatTimeoutSecs" << 5)), + 0); + OpTime election = OpTime(1000, 0); + OpTime staleTime = OpTime(0, 0); + + ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); + makeSelfPrimary(election); + ASSERT_EQUALS(0, getCurrentPrimaryIndex()); + + HeartbeatResponseAction nextAction = receiveUpHeartbeat( + HostAndPort("host3"), "rs0", MemberState::RS_SECONDARY, election, staleTime, election); + ASSERT_NO_ACTION(nextAction.getAction()); +} + +TEST_F(HeartbeatResponseTest, UpdateHeartbeatDataDoNotStepDownPrimaryForHighPriorityStaleNode) { + // In this test, the Topology coordinator sees a PRIMARY ("host2") and then sees a higher + // priority and stale node ("host3"). As a result it responds with NoAction. 
+ updateConfig( + BSON("_id" + << "rs0" + << "version" << 6 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "host1:27017") + << BSON("_id" << 1 << "host" + << "host2:27017") << BSON("_id" << 2 << "host" + << "host3:27017" + << "priority" << 3)) + << "settings" << BSON("heartbeatTimeoutSecs" << 5)), + 0); + setSelfMemberState(MemberState::RS_SECONDARY); + + OpTime election = OpTime(1000, 0); + OpTime stale = OpTime(0, 0); + + ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); + HeartbeatResponseAction nextAction = receiveUpHeartbeat( + HostAndPort("host2"), "rs0", MemberState::RS_PRIMARY, election, election, election); + ASSERT_EQUALS(1, getCurrentPrimaryIndex()); + + nextAction = receiveUpHeartbeat( + HostAndPort("host3"), "rs0", MemberState::RS_SECONDARY, election, stale, election); + ASSERT_NO_ACTION(nextAction.getAction()); +} + +TEST_F(HeartbeatResponseTest, UpdateHeartbeatDataTwoPrimariesIncludingMeNewOneNewer) { + ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); + makeSelfPrimary(OpTime(2, 0)); + + OpTime election = OpTime(4, 0); + OpTime lastOpTimeApplied = OpTime(3, 0); + + ASSERT_EQUALS(0, getCurrentPrimaryIndex()); + HeartbeatResponseAction nextAction = receiveUpHeartbeat(HostAndPort("host2"), + "rs0", + MemberState::RS_PRIMARY, + election, + election, + lastOpTimeApplied); + ASSERT_EQUALS(HeartbeatResponseAction::StepDownSelf, nextAction.getAction()); + ASSERT_EQUALS(0, nextAction.getPrimaryConfigIndex()); + // Doesn't actually do the stepdown until stepDownIfPending is called + ASSERT_TRUE(TopologyCoordinator::Role::leader == getTopoCoord().getRole()); + ASSERT_EQUALS(0, getCurrentPrimaryIndex()); - TEST_F(HeartbeatResponseTest, ElectionVoteForAnotherNodeBeforeFreshnessReturns) { - // In this test, the TopologyCoordinator goes through the steps of an election. However, - // before its freshness round ends, it receives a fresh command followed by an elect command - // from another node, both of which it responds positively to. The TopologyCoordinator's - // freshness round then concludes successfully, but it fails to vote for itself, since it - // recently voted for another node. - - // 1. All nodes heartbeat to indicate that they are up and that "host2" is PRIMARY. - // 2. "host2" goes down, triggering an election. - // 3. "host3" sends a fresh command, which the TopologyCoordinator responds to positively. - // 4. "host3" sends an elect command, which the TopologyCoordinator responds to positively. - // 5. The TopologyCoordinator's concludes its freshness round successfully. - // 6. The TopologyCoordinator loses the election. - - setSelfMemberState(MemberState::RS_SECONDARY); - now() += 30000; // we need to be more than LastVote::leaseTime from the start of time or - // else some Date_t math goes horribly awry - - OpTime election = OpTime(0,0); - OpTime lastOpTimeApplied = OpTime(100,0); - OpTime fresherOpApplied = OpTime(200,0); - - ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); - HeartbeatResponseAction nextAction = receiveUpHeartbeat(HostAndPort("host2"), - "rs0", - MemberState::RS_PRIMARY, - election, - lastOpTimeApplied, - lastOpTimeApplied); - ASSERT_NO_ACTION(nextAction.getAction()); - ASSERT_EQUALS(1, getCurrentPrimaryIndex()); - - nextAction = receiveUpHeartbeat(HostAndPort("host3"), - "rs0", - MemberState::RS_SECONDARY, - election, - lastOpTimeApplied, - lastOpTimeApplied); - ASSERT_NO_ACTION(nextAction.getAction()); - - // candidate time! 
- nextAction = receiveDownHeartbeat(HostAndPort("host2"), "rs0", lastOpTimeApplied); - ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); - ASSERT_EQUALS(HeartbeatResponseAction::StartElection, nextAction.getAction()); - ASSERT_TRUE(TopologyCoordinator::Role::candidate == getTopoCoord().getRole()); - - OpTime originalElectionTime = getTopoCoord().getElectionTime(); - OID originalElectionId = getTopoCoord().getElectionId(); - // prepare an incoming fresh command - ReplicationCoordinator::ReplSetFreshArgs freshArgs; - freshArgs.setName = "rs0"; - freshArgs.cfgver = 5; - freshArgs.id = 2; - freshArgs.who = HostAndPort("host3"); - freshArgs.opTime = fresherOpApplied; - - BSONObjBuilder freshResponseBuilder; - Status result = Status(ErrorCodes::InternalError, "status not set by prepareElectResponse"); - getTopoCoord().prepareFreshResponse( - freshArgs, now()++, lastOpTimeApplied, &freshResponseBuilder, &result); - BSONObj response = freshResponseBuilder.obj(); - ASSERT_OK(result); - ASSERT_EQUALS(lastOpTimeApplied, OpTime(response["opTime"].timestampValue())); - ASSERT_FALSE(response["fresher"].trueValue()); - ASSERT_FALSE(response["veto"].trueValue()); - ASSERT_TRUE(TopologyCoordinator::Role::candidate == getTopoCoord().getRole()); - // make sure incoming fresh commands do not change electionTime and electionId - ASSERT_EQUALS(originalElectionTime, getTopoCoord().getElectionTime()); - ASSERT_EQUALS(originalElectionId, getTopoCoord().getElectionId()); - - // an elect command comes in - ReplicationCoordinator::ReplSetElectArgs electArgs; - OID round = OID::gen(); - electArgs.set = "rs0"; - electArgs.round = round; - electArgs.cfgver = 5; - electArgs.whoid = 2; - - BSONObjBuilder electResponseBuilder; - result = Status(ErrorCodes::InternalError, "status not set by prepareElectResponse"); - startCapturingLogMessages(); - getTopoCoord().prepareElectResponse( - electArgs, now()++, OpTime(), &electResponseBuilder, &result); - stopCapturingLogMessages(); - response = electResponseBuilder.obj(); - ASSERT_OK(result); - ASSERT_EQUALS(1, response["vote"].Int()); - ASSERT_EQUALS(round, response["round"].OID()); - ASSERT_EQUALS(1, countLogLinesContaining("voting yea for host3:27017 (2)")); - ASSERT_TRUE(TopologyCoordinator::Role::candidate == getTopoCoord().getRole()); - // make sure incoming elect commands do not change electionTime and electionId - ASSERT_EQUALS(originalElectionTime, getTopoCoord().getElectionTime()); - ASSERT_EQUALS(originalElectionId, getTopoCoord().getElectionId()); - - // now voteForSelf as though we received all our fresh responses - ASSERT_FALSE(getTopoCoord().voteForMyself(now()++)); - - // receive a heartbeat indicating the other node was elected - nextAction = receiveUpHeartbeat(HostAndPort("host3"), - "rs0", - MemberState::RS_PRIMARY, - election, - lastOpTimeApplied, - lastOpTimeApplied); - ASSERT_NO_ACTION(nextAction.getAction()); - ASSERT_EQUALS(2, getCurrentPrimaryIndex()); - // make sure seeing a new primary does not change electionTime and electionId - ASSERT_EQUALS(originalElectionTime, getTopoCoord().getElectionTime()); - ASSERT_EQUALS(originalElectionId, getTopoCoord().getElectionId()); - - // now lose election and ensure _electionTime and _electionId are 0'd out - getTopoCoord().processLoseElection(); - ASSERT_EQUALS(OID(), getTopoCoord().getElectionId()); - ASSERT_EQUALS(OpTime(0,0), getTopoCoord().getElectionTime()); - ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); - ASSERT_EQUALS(2, getCurrentPrimaryIndex()); - } + 
ASSERT_TRUE(getTopoCoord().stepDownIfPending()); + ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); + ASSERT_EQUALS(1, getCurrentPrimaryIndex()); +} - TEST_F(HeartbeatResponseTest, ElectionRespondToFreshBeforeOurFreshnessReturns) { - // In this test, the TopologyCoordinator goes through the steps of an election. However, - // before its freshness round ends, the TopologyCoordinator receives a fresh command from - // another node, which it responds positively to. Its freshness then ends successfully and - // it wins the election. The other node's elect command then comes in and is responded to - // negatively, maintaining the TopologyCoordinator's PRIMARY state. +TEST_F(HeartbeatResponseTest, UpdateHeartbeatDataPrimaryDownNoMajority) { + setSelfMemberState(MemberState::RS_SECONDARY); - // 1. All nodes heartbeat to indicate that they are up and that "host2" is PRIMARY. - // 2. "host2" goes down, triggering an election. - // 3. "host3" sends a fresh command, which the TopologyCoordinator responds to positively. - // 4. The TopologyCoordinator concludes its freshness round successfully and wins - // the election. - // 5. "host3" sends an elect command, which the TopologyCoordinator responds to negatively. + OpTime election = OpTime(400, 0); + OpTime lastOpTimeApplied = OpTime(300, 0); - setSelfMemberState(MemberState::RS_SECONDARY); - now() += 30000; // we need to be more than LastVote::leaseTime from the start of time or - // else some Date_t math goes horribly awry - - OpTime election = OpTime(0,0); - OpTime lastOpTimeApplied = OpTime(100,0); - OpTime fresherLastOpTimeApplied = OpTime(200,0); - OID round = OID::gen(); - OID remoteRound = OID::gen(); - - ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); - HeartbeatResponseAction nextAction = receiveUpHeartbeat(HostAndPort("host2"), - "rs0", - MemberState::RS_PRIMARY, - election, - lastOpTimeApplied, - lastOpTimeApplied); - ASSERT_NO_ACTION(nextAction.getAction()); - ASSERT_EQUALS(1, getCurrentPrimaryIndex()); - - nextAction = receiveUpHeartbeat(HostAndPort("host3"), - "rs0", - MemberState::RS_SECONDARY, - election, - lastOpTimeApplied, - lastOpTimeApplied); - ASSERT_NO_ACTION(nextAction.getAction()); - - // candidate time! 
- nextAction = receiveDownHeartbeat(HostAndPort("host2"), "rs0", lastOpTimeApplied); - ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); - ASSERT_EQUALS(HeartbeatResponseAction::StartElection, nextAction.getAction()); - ASSERT_TRUE(TopologyCoordinator::Role::candidate == getTopoCoord().getRole()); - - // prepare an incoming fresh command - ReplicationCoordinator::ReplSetFreshArgs freshArgs; - freshArgs.setName = "rs0"; - freshArgs.cfgver = 5; - freshArgs.id = 2; - freshArgs.who = HostAndPort("host3"); - freshArgs.opTime = fresherLastOpTimeApplied; - - BSONObjBuilder freshResponseBuilder; - Status result = Status(ErrorCodes::InternalError, "status not set by prepareElectResponse"); - getTopoCoord().prepareFreshResponse( - freshArgs, now()++, lastOpTimeApplied, &freshResponseBuilder, &result); - BSONObj response = freshResponseBuilder.obj(); - ASSERT_OK(result); - ASSERT_EQUALS(lastOpTimeApplied, OpTime(response["opTime"].timestampValue())); - ASSERT_FALSE(response["fresher"].trueValue()); - ASSERT_FALSE(response["veto"].trueValue()); - ASSERT_TRUE(TopologyCoordinator::Role::candidate == getTopoCoord().getRole()); - - // now voteForSelf as though we received all our fresh responses - ASSERT_TRUE(getTopoCoord().voteForMyself(now()++)); - // now win election and ensure _electionId and _electionTime are set properly - getTopoCoord().processWinElection(round, election); - ASSERT_EQUALS(round, getTopoCoord().getElectionId()); - ASSERT_EQUALS(election, getTopoCoord().getElectionTime()); - ASSERT_TRUE(TopologyCoordinator::Role::leader == getTopoCoord().getRole()); - ASSERT_EQUALS(0, getCurrentPrimaryIndex()); - - // an elect command comes in - ReplicationCoordinator::ReplSetElectArgs electArgs; - electArgs.set = "rs0"; - electArgs.round = remoteRound; - electArgs.cfgver = 5; - electArgs.whoid = 2; - - BSONObjBuilder electResponseBuilder; - result = Status(ErrorCodes::InternalError, "status not set by prepareElectResponse"); - startCapturingLogMessages(); - getTopoCoord().prepareElectResponse( - electArgs, now()++, OpTime(), &electResponseBuilder, &result); - stopCapturingLogMessages(); - response = electResponseBuilder.obj(); - ASSERT_OK(result); - ASSERT_EQUALS(-10000, response["vote"].Int()); - ASSERT_EQUALS(remoteRound, response["round"].OID()); - ASSERT_TRUE(TopologyCoordinator::Role::leader == getTopoCoord().getRole()); - ASSERT_EQUALS(0, getCurrentPrimaryIndex()); - } + ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); + HeartbeatResponseAction nextAction = receiveUpHeartbeat(HostAndPort("host2"), + "rs0", + MemberState::RS_PRIMARY, + election, + election, + lastOpTimeApplied); + ASSERT_NO_ACTION(nextAction.getAction()); + ASSERT_EQUALS(1, getCurrentPrimaryIndex()); + + nextAction = receiveDownHeartbeat(HostAndPort("host2"), "rs0", lastOpTimeApplied); + ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); + ASSERT_NO_ACTION(nextAction.getAction()); + ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); +} + +TEST_F(HeartbeatResponseTest, UpdateHeartbeatDataPrimaryDownMajorityButNoPriority) { + setSelfMemberState(MemberState::RS_SECONDARY); + + updateConfig( + BSON("_id" + << "rs0" + << "version" << 5 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "host1:27017" + << "priority" << 0) + << BSON("_id" << 1 << "host" + << "host2:27017") << BSON("_id" << 2 << "host" + << "host3:27017"))), + 0); + + OpTime election = OpTime(400, 0); + OpTime lastOpTimeApplied = OpTime(300, 0); + + ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); + HeartbeatResponseAction nextAction = 
receiveUpHeartbeat(HostAndPort("host2"), + "rs0", + MemberState::RS_PRIMARY, + election, + election, + lastOpTimeApplied); + ASSERT_NO_ACTION(nextAction.getAction()); + ASSERT_EQUALS(1, getCurrentPrimaryIndex()); + + nextAction = receiveUpHeartbeat(HostAndPort("host3"), + "rs0", + MemberState::RS_SECONDARY, + election, + election, + lastOpTimeApplied); + ASSERT_NO_ACTION(nextAction.getAction()); + ASSERT_EQUALS(1, getCurrentPrimaryIndex()); + + nextAction = receiveDownHeartbeat(HostAndPort("host2"), "rs0", lastOpTimeApplied); + ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); + ASSERT_NO_ACTION(nextAction.getAction()); + ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); +} + +TEST_F(HeartbeatResponseTest, UpdateHeartbeatDataPrimaryDownMajorityButIAmStarting) { + setSelfMemberState(MemberState::RS_STARTUP); + + OpTime election = OpTime(400, 0); + OpTime lastOpTimeApplied = OpTime(300, 0); + + ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); + HeartbeatResponseAction nextAction = receiveUpHeartbeat(HostAndPort("host2"), + "rs0", + MemberState::RS_PRIMARY, + election, + election, + lastOpTimeApplied); + ASSERT_NO_ACTION(nextAction.getAction()); + ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); + + nextAction = receiveUpHeartbeat(HostAndPort("host3"), + "rs0", + MemberState::RS_SECONDARY, + election, + election, + lastOpTimeApplied); + ASSERT_NO_ACTION(nextAction.getAction()); + + nextAction = receiveDownHeartbeat(HostAndPort("host2"), "rs0", lastOpTimeApplied); + ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); + ASSERT_NO_ACTION(nextAction.getAction()); + ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); +} + +TEST_F(HeartbeatResponseTest, UpdateHeartbeatDataPrimaryDownMajorityButIAmRecovering) { + setSelfMemberState(MemberState::RS_RECOVERING); + + OpTime election = OpTime(400, 0); + OpTime lastOpTimeApplied = OpTime(300, 0); + + ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); + HeartbeatResponseAction nextAction = receiveUpHeartbeat(HostAndPort("host2"), + "rs0", + MemberState::RS_PRIMARY, + election, + election, + lastOpTimeApplied); + ASSERT_NO_ACTION(nextAction.getAction()); + ASSERT_EQUALS(1, getCurrentPrimaryIndex()); - TEST_F(HeartbeatResponseTest, ElectionCompleteElectionThenReceiveFresh) { - // In this test, the TopologyCoordinator goes through the steps of an election. After - // being successfully elected, a fresher node sends a fresh command, which the - // TopologyCoordinator responds positively to. The fresher node then sends an elect command, - // which the Topology coordinator negatively to since the TopologyCoordinator just elected - // itself. + nextAction = receiveDownHeartbeat(HostAndPort("host2"), "rs0", lastOpTimeApplied); + ASSERT_NO_ACTION(nextAction.getAction()); + ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); + ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); +} - // 1. All nodes heartbeat to indicate that they are up and that "host2" is PRIMARY. - // 2. "host2" goes down, triggering an election. - // 3. The TopologyCoordinator concludes its freshness round successfully and wins - // the election. - // 4. "host3" sends a fresh command, which the TopologyCoordinator responds to positively. - // 5. "host3" sends an elect command, which the TopologyCoordinator responds to negatively. 
+TEST_F(HeartbeatResponseTest, UpdateHeartbeatDataPrimaryDownMajorityButIHaveStepdownWait) { + setSelfMemberState(MemberState::RS_SECONDARY); - setSelfMemberState(MemberState::RS_SECONDARY); - now() += 30000; // we need to be more than LastVote::leaseTime from the start of time or - // else some Date_t math goes horribly awry - - OpTime election = OpTime(0,0); - OpTime lastOpTimeApplied = OpTime(100,0); - OpTime fresherLastOpTimeApplied = OpTime(200,0); - OID round = OID::gen(); - OID remoteRound = OID::gen(); - - ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); - HeartbeatResponseAction nextAction = receiveUpHeartbeat(HostAndPort("host2"), - "rs0", - MemberState::RS_PRIMARY, - election, - lastOpTimeApplied, - lastOpTimeApplied); - ASSERT_NO_ACTION(nextAction.getAction()); - ASSERT_EQUALS(1, getCurrentPrimaryIndex()); - - nextAction = receiveUpHeartbeat(HostAndPort("host3"), - "rs0", - MemberState::RS_SECONDARY, - election, - lastOpTimeApplied, - lastOpTimeApplied); - ASSERT_NO_ACTION(nextAction.getAction()); - - // candidate time! - nextAction = receiveDownHeartbeat(HostAndPort("host2"), "rs0", lastOpTimeApplied); - ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); - ASSERT_EQUALS(HeartbeatResponseAction::StartElection, nextAction.getAction()); - ASSERT_TRUE(TopologyCoordinator::Role::candidate == getTopoCoord().getRole()); - - // now voteForSelf as though we received all our fresh responses - ASSERT_TRUE(getTopoCoord().voteForMyself(now()++)); - // now win election - getTopoCoord().processWinElection(round, election); - ASSERT_EQUALS(0, getTopoCoord().getCurrentPrimaryIndex()); - ASSERT_TRUE(TopologyCoordinator::Role::leader == getTopoCoord().getRole()); - - // prepare an incoming fresh command - ReplicationCoordinator::ReplSetFreshArgs freshArgs; - freshArgs.setName = "rs0"; - freshArgs.cfgver = 5; - freshArgs.id = 2; - freshArgs.who = HostAndPort("host3"); - freshArgs.opTime = fresherLastOpTimeApplied; - - BSONObjBuilder freshResponseBuilder; - Status result = Status(ErrorCodes::InternalError, "status not set by prepareElectResponse"); - getTopoCoord().prepareFreshResponse( - freshArgs, now()++, lastOpTimeApplied, &freshResponseBuilder, &result); - BSONObj response = freshResponseBuilder.obj(); - ASSERT_OK(result); - ASSERT_EQUALS(lastOpTimeApplied, OpTime(response["opTime"].timestampValue())); - ASSERT_FALSE(response["fresher"].trueValue()); - ASSERT_TRUE(response["veto"].trueValue()) << response["errmsg"]; - ASSERT_TRUE(TopologyCoordinator::Role::leader == getTopoCoord().getRole()); - ASSERT_EQUALS(0, getCurrentPrimaryIndex()); - - // an elect command comes in - ReplicationCoordinator::ReplSetElectArgs electArgs; - electArgs.set = "rs0"; - electArgs.round = remoteRound; - electArgs.cfgver = 5; - electArgs.whoid = 2; - - BSONObjBuilder electResponseBuilder; - result = Status(ErrorCodes::InternalError, "status not set by prepareElectResponse"); - startCapturingLogMessages(); - getTopoCoord().prepareElectResponse( - electArgs, now()++, OpTime(), &electResponseBuilder, &result); - stopCapturingLogMessages(); - response = electResponseBuilder.obj(); - ASSERT_OK(result); - ASSERT_EQUALS(-10000, response["vote"].Int()); - ASSERT_EQUALS(remoteRound, response["round"].OID()); - ASSERT_TRUE(TopologyCoordinator::Role::leader == getTopoCoord().getRole()); - ASSERT_EQUALS(0, getCurrentPrimaryIndex()); - } + OpTime election = OpTime(400, 0); + OpTime lastOpTimeApplied = OpTime(300, 0); - TEST_F(HeartbeatResponseTest, UpdateHeartbeatDataPrimaryDownMajorityOfVotersUp) { - updateConfig(BSON("_id" << "rs0" << 
- "version" << 5 << - "members" << BSON_ARRAY( - BSON("_id" << 0 << "host" << "host1:27017") << - BSON("_id" << 1 << "host" << "host2:27017") << - BSON("_id" << 2 << "host" << "host3:27017" << "votes" << 0) << - BSON("_id" << 3 << "host" << "host4:27017" << "votes" << 0) << - BSON("_id" << 4 << "host" << "host5:27017" << "votes" << 0) << - BSON("_id" << 5 << "host" << "host6:27017" << "votes" << 0) << - BSON("_id" << 6 << "host" << "host7:27017")) << - "settings" << BSON("heartbeatTimeoutSecs" << 5)), + ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); + HeartbeatResponseAction nextAction = receiveUpHeartbeat(HostAndPort("host2"), + "rs0", + MemberState::RS_PRIMARY, + election, + election, + lastOpTimeApplied); + ASSERT_NO_ACTION(nextAction.getAction()); + ASSERT_EQUALS(1, getCurrentPrimaryIndex()); + + nextAction = receiveUpHeartbeat(HostAndPort("host3"), + "rs0", + MemberState::RS_SECONDARY, + election, + election, + lastOpTimeApplied); + ASSERT_NO_ACTION(nextAction.getAction()); + + // freeze node to set stepdown wait + BSONObjBuilder response; + getTopoCoord().prepareFreezeResponse(now()++, 20, &response); + + nextAction = receiveDownHeartbeat(HostAndPort("host2"), "rs0", lastOpTimeApplied); + ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); + ASSERT_NO_ACTION(nextAction.getAction()); + ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); +} + +TEST_F(HeartbeatResponseTest, UpdateHeartbeatDataPrimaryDownMajorityButIAmArbiter) { + updateConfig( + BSON("_id" + << "rs0" + << "version" << 5 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "host1:27017" + << "arbiterOnly" << true) + << BSON("_id" << 1 << "host" + << "host2:27017") << BSON("_id" << 2 << "host" + << "host3:27017"))), + 0); + + OpTime election = OpTime(400, 0); + OpTime lastOpTimeApplied = OpTime(300, 0); + + HeartbeatResponseAction nextAction = receiveUpHeartbeat(HostAndPort("host3"), + "rs0", + MemberState::RS_SECONDARY, + election, + election, + lastOpTimeApplied); + ASSERT_NO_ACTION(nextAction.getAction()); + ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); + + nextAction = receiveUpHeartbeat(HostAndPort("host2"), + "rs0", + MemberState::RS_PRIMARY, + election, + election, + lastOpTimeApplied); + ASSERT_NO_ACTION(nextAction.getAction()); + ASSERT_EQUALS(1, getCurrentPrimaryIndex()); + + nextAction = receiveDownHeartbeat(HostAndPort("host2"), "rs0", lastOpTimeApplied); + ASSERT_NO_ACTION(nextAction.getAction()); + ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); + ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); +} + +TEST_F(HeartbeatResponseTest, UpdateHeartbeatDataPrimaryDownMajority) { + setSelfMemberState(MemberState::RS_SECONDARY); + + OpTime election = OpTime(400, 0); + OpTime lastOpTimeApplied = OpTime(399, 0); + + ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); + HeartbeatResponseAction nextAction = receiveUpHeartbeat(HostAndPort("host2"), + "rs0", + MemberState::RS_PRIMARY, + election, + election, + lastOpTimeApplied); + ASSERT_NO_ACTION(nextAction.getAction()); + ASSERT_EQUALS(1, getCurrentPrimaryIndex()); + + nextAction = receiveUpHeartbeat(HostAndPort("host3"), + "rs0", + MemberState::RS_SECONDARY, + election, + election, + lastOpTimeApplied); + ASSERT_NO_ACTION(nextAction.getAction()); + + nextAction = receiveDownHeartbeat(HostAndPort("host2"), "rs0", lastOpTimeApplied); + ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); + ASSERT_EQUALS(HeartbeatResponseAction::StartElection, nextAction.getAction()); + ASSERT_TRUE(TopologyCoordinator::Role::candidate == 
getTopoCoord().getRole());
+}
+
+TEST_F(HeartbeatResponseTest, ElectionStartElectionWhileCandidate) {
+    // In this test, the TopologyCoordinator goes through the steps of a successful election,
+    // during which it receives a heartbeat that would normally trigger it to become a candidate
+    // and respond with a StartElection HeartbeatResponseAction. However, since it is already in
+    // candidate state, it responds with a NoAction HeartbeatResponseAction. It then finishes by
+    // winning the election.
+
+    // 1. All nodes heartbeat to indicate that they are up and that "host2" is PRIMARY.
+    // 2. "host2" goes down, triggering an election.
+    // 3. "host2" comes back, which would normally trigger an election, but since the
+    // TopologyCoordinator is already in candidate mode, it does not.
+    // 4. TopologyCoordinator concludes its freshness round successfully and wins the election.
+
+    setSelfMemberState(MemberState::RS_SECONDARY);
+    now() += 30000;  // we need to be more than LastVote::leaseTime from the start of time or
+                     // else some Date_t math goes horribly awry
+
+    OpTime election = OpTime(0, 0);
+    OpTime lastOpTimeApplied = OpTime(130, 0);
+    OID round = OID::gen();
+
+    ASSERT_EQUALS(-1, getCurrentPrimaryIndex());
+    HeartbeatResponseAction nextAction = receiveUpHeartbeat(HostAndPort("host2"),
+                                                            "rs0",
+                                                            MemberState::RS_PRIMARY,
+                                                            election,
+                                                            lastOpTimeApplied,
+                                                            lastOpTimeApplied);
+    ASSERT_NO_ACTION(nextAction.getAction());
+    ASSERT_EQUALS(1, getCurrentPrimaryIndex());
+
+    nextAction = receiveUpHeartbeat(HostAndPort("host3"),
+                                    "rs0",
+                                    MemberState::RS_SECONDARY,
+                                    election,
+                                    lastOpTimeApplied,
+                                    lastOpTimeApplied);
+    ASSERT_NO_ACTION(nextAction.getAction());
+
+    // candidate time!
+    nextAction = receiveDownHeartbeat(HostAndPort("host2"), "rs0", lastOpTimeApplied);
+    ASSERT_EQUALS(-1, getCurrentPrimaryIndex());
+    ASSERT_EQUALS(HeartbeatResponseAction::StartElection, nextAction.getAction());
+    ASSERT_TRUE(TopologyCoordinator::Role::candidate == getTopoCoord().getRole());
+
+    // see the downed node as SECONDARY and decide to take no action, but are still a candidate
+    nextAction = receiveUpHeartbeat(HostAndPort("host2"),
+                                    "rs0",
+                                    MemberState::RS_SECONDARY,
+                                    election,
+                                    lastOpTimeApplied,
+                                    lastOpTimeApplied);
+    ASSERT_EQUALS(-1, getCurrentPrimaryIndex());
+
+    // normally this would trigger StartElection, but we are already a candidate
+    ASSERT_NO_ACTION(nextAction.getAction());
+    ASSERT_TRUE(TopologyCoordinator::Role::candidate == getTopoCoord().getRole());
+
+    // now voteForSelf as though we received all our fresh responses
+    ASSERT_TRUE(getTopoCoord().voteForMyself(now()++));
+
+    // now win election and ensure _electionId and _electionTime are set properly
+    getTopoCoord().processWinElection(round, election);
+    ASSERT_EQUALS(round, getTopoCoord().getElectionId());
+    ASSERT_EQUALS(election, getTopoCoord().getElectionTime());
+    ASSERT_TRUE(TopologyCoordinator::Role::leader == getTopoCoord().getRole());
+    ASSERT_EQUALS(0, getCurrentPrimaryIndex());
+}
+
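The test above pins down the follower -> candidate -> leader transitions implied by the StartElection/NoAction heartbeat actions and processWinElection. As a minimal standalone sketch of those rules (illustrative only; Role, onPrimaryDown, and onWinElection are invented names for this sketch, not the TopologyCoordinator API):

#include <cassert>

// Toy model of the role transitions asserted by ElectionStartElectionWhileCandidate.
enum class Role { follower, candidate, leader };

// A downed primary yields StartElection only for a follower; a node that is
// already a candidate takes NoAction and keeps its role.
Role onPrimaryDown(Role r) {
    return r == Role::follower ? Role::candidate : r;
}

// Winning the election is only defined for a candidate.
Role onWinElection(Role r) {
    assert(r == Role::candidate);
    return Role::leader;
}

int main() {
    Role r = Role::follower;
    r = onPrimaryDown(r);  // follower -> candidate (StartElection)
    r = onPrimaryDown(r);  // already a candidate -> unchanged (NoAction)
    r = onWinElection(r);  // candidate -> leader
    assert(r == Role::leader);
    return 0;
}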
+TEST_F(HeartbeatResponseTest, ElectionVoteForAnotherNodeBeforeFreshnessReturns) {
+    // In this test, the TopologyCoordinator goes through the steps of an election. However,
+    // before its freshness round ends, it receives a fresh command followed by an elect command
+    // from another node, both of which it responds positively to. The TopologyCoordinator's
+    // freshness round then concludes successfully, but it fails to vote for itself, since it
+    // recently voted for another node.
+
+    // 1. All nodes heartbeat to indicate that they are up and that "host2" is PRIMARY.
+    // 2. "host2" goes down, triggering an election.
+    // 3. "host3" sends a fresh command, which the TopologyCoordinator responds to positively.
+    // 4. "host3" sends an elect command, which the TopologyCoordinator responds to positively.
+    // 5. The TopologyCoordinator concludes its freshness round successfully.
+    // 6. The TopologyCoordinator loses the election.
+
+    setSelfMemberState(MemberState::RS_SECONDARY);
+    now() += 30000;  // we need to be more than LastVote::leaseTime from the start of time or
+                     // else some Date_t math goes horribly awry
+
+    OpTime election = OpTime(0, 0);
+    OpTime lastOpTimeApplied = OpTime(100, 0);
+    OpTime fresherOpApplied = OpTime(200, 0);
+
+    ASSERT_EQUALS(-1, getCurrentPrimaryIndex());
+    HeartbeatResponseAction nextAction = receiveUpHeartbeat(HostAndPort("host2"),
+                                                            "rs0",
+                                                            MemberState::RS_PRIMARY,
+                                                            election,
+                                                            lastOpTimeApplied,
+                                                            lastOpTimeApplied);
+    ASSERT_NO_ACTION(nextAction.getAction());
+    ASSERT_EQUALS(1, getCurrentPrimaryIndex());
+
+    nextAction = receiveUpHeartbeat(HostAndPort("host3"),
+                                    "rs0",
+                                    MemberState::RS_SECONDARY,
+                                    election,
+                                    lastOpTimeApplied,
+                                    lastOpTimeApplied);
+    ASSERT_NO_ACTION(nextAction.getAction());
+
+    // candidate time!
+    nextAction = receiveDownHeartbeat(HostAndPort("host2"), "rs0", lastOpTimeApplied);
+    ASSERT_EQUALS(-1, getCurrentPrimaryIndex());
+    ASSERT_EQUALS(HeartbeatResponseAction::StartElection, nextAction.getAction());
+    ASSERT_TRUE(TopologyCoordinator::Role::candidate == getTopoCoord().getRole());
+
+    OpTime originalElectionTime = getTopoCoord().getElectionTime();
+    OID originalElectionId = getTopoCoord().getElectionId();
+    // prepare an incoming fresh command
+    ReplicationCoordinator::ReplSetFreshArgs freshArgs;
+    freshArgs.setName = "rs0";
+    freshArgs.cfgver = 5;
+    freshArgs.id = 2;
+    freshArgs.who = HostAndPort("host3");
+    freshArgs.opTime = fresherOpApplied;
+
+    BSONObjBuilder freshResponseBuilder;
+    Status result = Status(ErrorCodes::InternalError, "status not set by prepareElectResponse");
+    getTopoCoord().prepareFreshResponse(
+        freshArgs, now()++, lastOpTimeApplied, &freshResponseBuilder, &result);
+    BSONObj response = freshResponseBuilder.obj();
+    ASSERT_OK(result);
+    ASSERT_EQUALS(lastOpTimeApplied, OpTime(response["opTime"].timestampValue()));
+    ASSERT_FALSE(response["fresher"].trueValue());
+    ASSERT_FALSE(response["veto"].trueValue());
+    ASSERT_TRUE(TopologyCoordinator::Role::candidate == getTopoCoord().getRole());
+    // make sure incoming fresh commands do not change electionTime and electionId
+    ASSERT_EQUALS(originalElectionTime, getTopoCoord().getElectionTime());
+    ASSERT_EQUALS(originalElectionId, getTopoCoord().getElectionId());
+
+    // an elect command comes in
+    ReplicationCoordinator::ReplSetElectArgs electArgs;
+    OID round = OID::gen();
+    electArgs.set = "rs0";
+    electArgs.round = round;
+    electArgs.cfgver = 5;
+    electArgs.whoid = 2;
+
+    BSONObjBuilder electResponseBuilder;
+    result = Status(ErrorCodes::InternalError, "status not set by prepareElectResponse");
+    startCapturingLogMessages();
+    getTopoCoord().prepareElectResponse(
+        electArgs, now()++, OpTime(), &electResponseBuilder, &result);
+    stopCapturingLogMessages();
+    response = electResponseBuilder.obj();
+    ASSERT_OK(result);
+    ASSERT_EQUALS(1, response["vote"].Int());
+    ASSERT_EQUALS(round, response["round"].OID());
+    ASSERT_EQUALS(1, countLogLinesContaining("voting yea for host3:27017 (2)"));
+    ASSERT_TRUE(TopologyCoordinator::Role::candidate ==
getTopoCoord().getRole());
+    // make sure incoming elect commands do not change electionTime and electionId
+    ASSERT_EQUALS(originalElectionTime, getTopoCoord().getElectionTime());
+    ASSERT_EQUALS(originalElectionId, getTopoCoord().getElectionId());
+
+    // now voteForSelf as though we received all our fresh responses
+    ASSERT_FALSE(getTopoCoord().voteForMyself(now()++));
+
+    // receive a heartbeat indicating the other node was elected
+    nextAction = receiveUpHeartbeat(HostAndPort("host3"),
+                                    "rs0",
+                                    MemberState::RS_PRIMARY,
+                                    election,
+                                    lastOpTimeApplied,
+                                    lastOpTimeApplied);
+    ASSERT_NO_ACTION(nextAction.getAction());
+    ASSERT_EQUALS(2, getCurrentPrimaryIndex());
+    // make sure seeing a new primary does not change electionTime and electionId
+    ASSERT_EQUALS(originalElectionTime, getTopoCoord().getElectionTime());
+    ASSERT_EQUALS(originalElectionId, getTopoCoord().getElectionId());
+
+    // now lose election and ensure _electionTime and _electionId are 0'd out
+    getTopoCoord().processLoseElection();
+    ASSERT_EQUALS(OID(), getTopoCoord().getElectionId());
+    ASSERT_EQUALS(OpTime(0, 0), getTopoCoord().getElectionTime());
+    ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole());
+    ASSERT_EQUALS(2, getCurrentPrimaryIndex());
+}
+
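The elect responses asserted above and in the tests that follow use three raw vote integers in the response's "vote" field. A small reference sketch (the constant names are invented for readability; the actual responses carry plain integers):

// Vote values observed in the prepareElectResponse assertions of this file.
enum VoteValue {
    kVoteYea = 1,        // granted: e.g. "voting yea for host3:27017 (2)"
    kVoteNay = 0,        // withheld: wrong set name, stale local config, or voted too recently
    kVoteVeto = -10000,  // vetoed: stale remote config, unknown member, or a primary already exists
};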
+TEST_F(HeartbeatResponseTest, ElectionRespondToFreshBeforeOurFreshnessReturns) {
+    // In this test, the TopologyCoordinator goes through the steps of an election. However,
+    // before its freshness round ends, the TopologyCoordinator receives a fresh command from
+    // another node, which it responds positively to. Its freshness round then ends successfully
+    // and it wins the election. The other node's elect command then comes in and is responded
+    // to negatively, maintaining the TopologyCoordinator's PRIMARY state.
+
+    // 1. All nodes heartbeat to indicate that they are up and that "host2" is PRIMARY.
+    // 2. "host2" goes down, triggering an election.
+    // 3. "host3" sends a fresh command, which the TopologyCoordinator responds to positively.
+    // 4. The TopologyCoordinator concludes its freshness round successfully and wins
+    // the election.
+    // 5. "host3" sends an elect command, which the TopologyCoordinator responds to negatively.
+
+    setSelfMemberState(MemberState::RS_SECONDARY);
+    now() += 30000;  // we need to be more than LastVote::leaseTime from the start of time or
+                     // else some Date_t math goes horribly awry
+
+    OpTime election = OpTime(0, 0);
+    OpTime lastOpTimeApplied = OpTime(100, 0);
+    OpTime fresherLastOpTimeApplied = OpTime(200, 0);
+    OID round = OID::gen();
+    OID remoteRound = OID::gen();
+
+    ASSERT_EQUALS(-1, getCurrentPrimaryIndex());
+    HeartbeatResponseAction nextAction = receiveUpHeartbeat(HostAndPort("host2"),
+                                                            "rs0",
+                                                            MemberState::RS_PRIMARY,
+                                                            election,
+                                                            lastOpTimeApplied,
+                                                            lastOpTimeApplied);
+    ASSERT_NO_ACTION(nextAction.getAction());
+    ASSERT_EQUALS(1, getCurrentPrimaryIndex());
+
+    nextAction = receiveUpHeartbeat(HostAndPort("host3"),
+                                    "rs0",
+                                    MemberState::RS_SECONDARY,
+                                    election,
+                                    lastOpTimeApplied,
+                                    lastOpTimeApplied);
+    ASSERT_NO_ACTION(nextAction.getAction());
+
+    // candidate time!
+    nextAction = receiveDownHeartbeat(HostAndPort("host2"), "rs0", lastOpTimeApplied);
+    ASSERT_EQUALS(-1, getCurrentPrimaryIndex());
+    ASSERT_EQUALS(HeartbeatResponseAction::StartElection, nextAction.getAction());
+    ASSERT_TRUE(TopologyCoordinator::Role::candidate == getTopoCoord().getRole());
+
+    // prepare an incoming fresh command
+    ReplicationCoordinator::ReplSetFreshArgs freshArgs;
+    freshArgs.setName = "rs0";
+    freshArgs.cfgver = 5;
+    freshArgs.id = 2;
+    freshArgs.who = HostAndPort("host3");
+    freshArgs.opTime = fresherLastOpTimeApplied;
+
+    BSONObjBuilder freshResponseBuilder;
+    Status result = Status(ErrorCodes::InternalError, "status not set by prepareElectResponse");
+    getTopoCoord().prepareFreshResponse(
+        freshArgs, now()++, lastOpTimeApplied, &freshResponseBuilder, &result);
+    BSONObj response = freshResponseBuilder.obj();
+    ASSERT_OK(result);
+    ASSERT_EQUALS(lastOpTimeApplied, OpTime(response["opTime"].timestampValue()));
+    ASSERT_FALSE(response["fresher"].trueValue());
+    ASSERT_FALSE(response["veto"].trueValue());
+    ASSERT_TRUE(TopologyCoordinator::Role::candidate == getTopoCoord().getRole());
+
+    // now voteForSelf as though we received all our fresh responses
+    ASSERT_TRUE(getTopoCoord().voteForMyself(now()++));
+    // now win election and ensure _electionId and _electionTime are set properly
+    getTopoCoord().processWinElection(round, election);
+    ASSERT_EQUALS(round, getTopoCoord().getElectionId());
+    ASSERT_EQUALS(election, getTopoCoord().getElectionTime());
+    ASSERT_TRUE(TopologyCoordinator::Role::leader == getTopoCoord().getRole());
+    ASSERT_EQUALS(0, getCurrentPrimaryIndex());
+
+    // an elect command comes in
+    ReplicationCoordinator::ReplSetElectArgs electArgs;
+    electArgs.set = "rs0";
+    electArgs.round = remoteRound;
+    electArgs.cfgver = 5;
+    electArgs.whoid = 2;
+
+    BSONObjBuilder electResponseBuilder;
+    result = Status(ErrorCodes::InternalError, "status not set by prepareElectResponse");
+    startCapturingLogMessages();
+    getTopoCoord().prepareElectResponse(
+        electArgs, now()++, OpTime(), &electResponseBuilder, &result);
+    stopCapturingLogMessages();
+    response = electResponseBuilder.obj();
+    ASSERT_OK(result);
+    ASSERT_EQUALS(-10000, response["vote"].Int());
+    ASSERT_EQUALS(remoteRound, response["round"].OID());
+    ASSERT_TRUE(TopologyCoordinator::Role::leader == getTopoCoord().getRole());
+    ASSERT_EQUALS(0, getCurrentPrimaryIndex());
+}
+
+TEST_F(HeartbeatResponseTest, ElectionCompleteElectionThenReceiveFresh) {
+    // In this test, the TopologyCoordinator goes through the steps of an election. After
+    // being successfully elected, a fresher node sends a fresh command, which the
+    // TopologyCoordinator responds positively to. The fresher node then sends an elect command,
+    // which the Topology coordinator responds negatively to since the TopologyCoordinator just
+    // elected itself.
+
+    // 1. All nodes heartbeat to indicate that they are up and that "host2" is PRIMARY.
+    // 2. "host2" goes down, triggering an election.
+    // 3. The TopologyCoordinator concludes its freshness round successfully and wins
+    // the election.
+    // 4. "host3" sends a fresh command, which the TopologyCoordinator responds to positively.
+    // 5. "host3" sends an elect command, which the TopologyCoordinator responds to negatively.
+ + setSelfMemberState(MemberState::RS_SECONDARY); + now() += 30000; // we need to be more than LastVote::leaseTime from the start of time or + // else some Date_t math goes horribly awry + + OpTime election = OpTime(0, 0); + OpTime lastOpTimeApplied = OpTime(100, 0); + OpTime fresherLastOpTimeApplied = OpTime(200, 0); + OID round = OID::gen(); + OID remoteRound = OID::gen(); + + ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); + HeartbeatResponseAction nextAction = receiveUpHeartbeat(HostAndPort("host2"), + "rs0", + MemberState::RS_PRIMARY, + election, + lastOpTimeApplied, + lastOpTimeApplied); + ASSERT_NO_ACTION(nextAction.getAction()); + ASSERT_EQUALS(1, getCurrentPrimaryIndex()); + + nextAction = receiveUpHeartbeat(HostAndPort("host3"), + "rs0", + MemberState::RS_SECONDARY, + election, + lastOpTimeApplied, + lastOpTimeApplied); + ASSERT_NO_ACTION(nextAction.getAction()); + + // candidate time! + nextAction = receiveDownHeartbeat(HostAndPort("host2"), "rs0", lastOpTimeApplied); + ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); + ASSERT_EQUALS(HeartbeatResponseAction::StartElection, nextAction.getAction()); + ASSERT_TRUE(TopologyCoordinator::Role::candidate == getTopoCoord().getRole()); + + // now voteForSelf as though we received all our fresh responses + ASSERT_TRUE(getTopoCoord().voteForMyself(now()++)); + // now win election + getTopoCoord().processWinElection(round, election); + ASSERT_EQUALS(0, getTopoCoord().getCurrentPrimaryIndex()); + ASSERT_TRUE(TopologyCoordinator::Role::leader == getTopoCoord().getRole()); + + // prepare an incoming fresh command + ReplicationCoordinator::ReplSetFreshArgs freshArgs; + freshArgs.setName = "rs0"; + freshArgs.cfgver = 5; + freshArgs.id = 2; + freshArgs.who = HostAndPort("host3"); + freshArgs.opTime = fresherLastOpTimeApplied; + + BSONObjBuilder freshResponseBuilder; + Status result = Status(ErrorCodes::InternalError, "status not set by prepareElectResponse"); + getTopoCoord().prepareFreshResponse( + freshArgs, now()++, lastOpTimeApplied, &freshResponseBuilder, &result); + BSONObj response = freshResponseBuilder.obj(); + ASSERT_OK(result); + ASSERT_EQUALS(lastOpTimeApplied, OpTime(response["opTime"].timestampValue())); + ASSERT_FALSE(response["fresher"].trueValue()); + ASSERT_TRUE(response["veto"].trueValue()) << response["errmsg"]; + ASSERT_TRUE(TopologyCoordinator::Role::leader == getTopoCoord().getRole()); + ASSERT_EQUALS(0, getCurrentPrimaryIndex()); + + // an elect command comes in + ReplicationCoordinator::ReplSetElectArgs electArgs; + electArgs.set = "rs0"; + electArgs.round = remoteRound; + electArgs.cfgver = 5; + electArgs.whoid = 2; + + BSONObjBuilder electResponseBuilder; + result = Status(ErrorCodes::InternalError, "status not set by prepareElectResponse"); + startCapturingLogMessages(); + getTopoCoord().prepareElectResponse( + electArgs, now()++, OpTime(), &electResponseBuilder, &result); + stopCapturingLogMessages(); + response = electResponseBuilder.obj(); + ASSERT_OK(result); + ASSERT_EQUALS(-10000, response["vote"].Int()); + ASSERT_EQUALS(remoteRound, response["round"].OID()); + ASSERT_TRUE(TopologyCoordinator::Role::leader == getTopoCoord().getRole()); + ASSERT_EQUALS(0, getCurrentPrimaryIndex()); +} + +TEST_F(HeartbeatResponseTest, UpdateHeartbeatDataPrimaryDownMajorityOfVotersUp) { + updateConfig(BSON("_id" + << "rs0" + << "version" << 5 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "host1:27017") + << BSON("_id" << 1 << "host" + << "host2:27017") << BSON("_id" << 2 << "host" + << "host3:27017" + << "votes" << 0) + << 
BSON("_id" << 3 << "host" + << "host4:27017" + << "votes" << 0) << BSON("_id" << 4 << "host" + << "host5:27017" + << "votes" << 0) + << BSON("_id" << 5 << "host" + << "host6:27017" + << "votes" << 0) << BSON("_id" << 6 << "host" + << "host7:27017")) + << "settings" << BSON("heartbeatTimeoutSecs" << 5)), + 0); + + setSelfMemberState(MemberState::RS_SECONDARY); + + OpTime election = OpTime(400, 0); + OpTime lastOpTimeApplied = OpTime(300, 0); + + ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); + HeartbeatResponseAction nextAction = receiveUpHeartbeat(HostAndPort("host2"), + "rs0", + MemberState::RS_PRIMARY, + election, + election, + lastOpTimeApplied); + ASSERT_NO_ACTION(nextAction.getAction()); + ASSERT_EQUALS(1, getCurrentPrimaryIndex()); + + // make sure all non-voting nodes are down, that way we do not have a majority of nodes + // but do have a majority of votes since one of two voting members is up and so are we + nextAction = receiveDownHeartbeat(HostAndPort("host3"), "rs0", lastOpTimeApplied); + ASSERT_NO_ACTION(nextAction.getAction()); + nextAction = receiveDownHeartbeat(HostAndPort("host4"), "rs0", lastOpTimeApplied); + ASSERT_NO_ACTION(nextAction.getAction()); + nextAction = receiveDownHeartbeat(HostAndPort("host5"), "rs0", lastOpTimeApplied); + ASSERT_NO_ACTION(nextAction.getAction()); + nextAction = receiveDownHeartbeat(HostAndPort("host6"), "rs0", lastOpTimeApplied); + ASSERT_NO_ACTION(nextAction.getAction()); + nextAction = receiveUpHeartbeat(HostAndPort("host7"), + "rs0", + MemberState::RS_SECONDARY, + election, + lastOpTimeApplied, + lastOpTimeApplied); + ASSERT_NO_ACTION(nextAction.getAction()); + + nextAction = receiveDownHeartbeat(HostAndPort("host2"), "rs0", lastOpTimeApplied); + ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); + ASSERT_EQUALS(HeartbeatResponseAction::StartElection, nextAction.getAction()); + ASSERT_TRUE(TopologyCoordinator::Role::candidate == getTopoCoord().getRole()); +} + +TEST_F(HeartbeatResponseTest, UpdateHeartbeatDataRelinquishPrimaryDueToNodeDisappearing) { + // become PRIMARY + ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); + makeSelfPrimary(OpTime(2, 0)); + ASSERT_EQUALS(0, getCurrentPrimaryIndex()); + + // become aware of other nodes + heartbeatFromMember(HostAndPort("host2"), "rs0", MemberState::RS_SECONDARY, OpTime(1, 0)); + heartbeatFromMember(HostAndPort("host2"), "rs0", MemberState::RS_SECONDARY, OpTime(1, 0)); + heartbeatFromMember(HostAndPort("host3"), "rs0", MemberState::RS_SECONDARY, OpTime(0, 0)); + heartbeatFromMember(HostAndPort("host3"), "rs0", MemberState::RS_SECONDARY, OpTime(0, 0)); + + // lose that awareness and be sure we are going to stepdown + HeartbeatResponseAction nextAction = + receiveDownHeartbeat(HostAndPort("host2"), "rs0", OpTime(100, 0)); + ASSERT_NO_ACTION(nextAction.getAction()); + nextAction = receiveDownHeartbeat(HostAndPort("host3"), "rs0", OpTime(100, 0)); + ASSERT_EQUALS(HeartbeatResponseAction::StepDownSelf, nextAction.getAction()); + ASSERT_EQUALS(0, nextAction.getPrimaryConfigIndex()); + // Doesn't actually do the stepdown until stepDownIfPending is called + ASSERT_TRUE(TopologyCoordinator::Role::leader == getTopoCoord().getRole()); + ASSERT_EQUALS(0, getCurrentPrimaryIndex()); + + ASSERT_TRUE(getTopoCoord().stepDownIfPending()); + ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); + ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); +} + +TEST_F(HeartbeatResponseTest, UpdateHeartbeatDataRemoteDoesNotExist) { + OpTime election = OpTime(5, 0); + OpTime lastOpTimeApplied = OpTime(3, 0); + + 
ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); + HeartbeatResponseAction nextAction = receiveUpHeartbeat(HostAndPort("host9"), + "rs0", + MemberState::RS_PRIMARY, + election, + election, + lastOpTimeApplied); + ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); + ASSERT_NO_ACTION(nextAction.getAction()); + ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); +} + +class PrepareElectResponseTest : public TopoCoordTest { +public: + PrepareElectResponseTest() + : now(0), + round(OID::gen()), + cbData(NULL, ReplicationExecutor::CallbackHandle(), Status::OK()) {} + + virtual void setUp() { + TopoCoordTest::setUp(); + updateConfig(BSON("_id" + << "rs0" + << "version" << 10 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "hself") + << BSON("_id" << 1 << "host" + << "h1") << BSON("_id" << 2 << "host" + << "h2" + << "priority" << 10) + << BSON("_id" << 3 << "host" + << "h3" + << "priority" << 10))), 0); - - setSelfMemberState(MemberState::RS_SECONDARY); - - OpTime election = OpTime(400,0); - OpTime lastOpTimeApplied = OpTime(300,0); - - ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); - HeartbeatResponseAction nextAction = receiveUpHeartbeat(HostAndPort("host2"), - "rs0", - MemberState::RS_PRIMARY, - election, - election, - lastOpTimeApplied); - ASSERT_NO_ACTION(nextAction.getAction()); - ASSERT_EQUALS(1, getCurrentPrimaryIndex()); - - // make sure all non-voting nodes are down, that way we do not have a majority of nodes - // but do have a majority of votes since one of two voting members is up and so are we - nextAction = receiveDownHeartbeat(HostAndPort("host3"), "rs0", lastOpTimeApplied); - ASSERT_NO_ACTION(nextAction.getAction()); - nextAction = receiveDownHeartbeat(HostAndPort("host4"), "rs0", lastOpTimeApplied); - ASSERT_NO_ACTION(nextAction.getAction()); - nextAction = receiveDownHeartbeat(HostAndPort("host5"), "rs0", lastOpTimeApplied); - ASSERT_NO_ACTION(nextAction.getAction()); - nextAction = receiveDownHeartbeat(HostAndPort("host6"), "rs0", lastOpTimeApplied); - ASSERT_NO_ACTION(nextAction.getAction()); - nextAction = receiveUpHeartbeat(HostAndPort("host7"), - "rs0", - MemberState::RS_SECONDARY, - election, - lastOpTimeApplied, - lastOpTimeApplied); - ASSERT_NO_ACTION(nextAction.getAction()); - - nextAction = receiveDownHeartbeat(HostAndPort("host2"), "rs0", lastOpTimeApplied); - ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); - ASSERT_EQUALS(HeartbeatResponseAction::StartElection, nextAction.getAction()); - ASSERT_TRUE(TopologyCoordinator::Role::candidate == getTopoCoord().getRole()); - } - - TEST_F(HeartbeatResponseTest, UpdateHeartbeatDataRelinquishPrimaryDueToNodeDisappearing) { - // become PRIMARY - ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); - makeSelfPrimary(OpTime(2,0)); - ASSERT_EQUALS(0, getCurrentPrimaryIndex()); - - // become aware of other nodes - heartbeatFromMember(HostAndPort("host2"), "rs0", MemberState::RS_SECONDARY, OpTime(1,0)); - heartbeatFromMember(HostAndPort("host2"), "rs0", MemberState::RS_SECONDARY, OpTime(1,0)); - heartbeatFromMember(HostAndPort("host3"), "rs0", MemberState::RS_SECONDARY, OpTime(0,0)); - heartbeatFromMember(HostAndPort("host3"), "rs0", MemberState::RS_SECONDARY, OpTime(0,0)); - - // lose that awareness and be sure we are going to stepdown - HeartbeatResponseAction nextAction = receiveDownHeartbeat(HostAndPort("host2"), - "rs0", - OpTime(100, 0)); - ASSERT_NO_ACTION(nextAction.getAction()); - nextAction = receiveDownHeartbeat(HostAndPort("host3"), "rs0", OpTime(100, 0)); - 
ASSERT_EQUALS(HeartbeatResponseAction::StepDownSelf, nextAction.getAction()); - ASSERT_EQUALS(0, nextAction.getPrimaryConfigIndex()); - // Doesn't actually do the stepdown until stepDownIfPending is called - ASSERT_TRUE(TopologyCoordinator::Role::leader == getTopoCoord().getRole()); - ASSERT_EQUALS(0, getCurrentPrimaryIndex()); - - ASSERT_TRUE(getTopoCoord().stepDownIfPending()); - ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); - ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); - } - - TEST_F(HeartbeatResponseTest, UpdateHeartbeatDataRemoteDoesNotExist) { - OpTime election = OpTime(5,0); - OpTime lastOpTimeApplied = OpTime(3,0); - - ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); - HeartbeatResponseAction nextAction = receiveUpHeartbeat(HostAndPort("host9"), - "rs0", - MemberState::RS_PRIMARY, - election, - election, - lastOpTimeApplied); - ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); - ASSERT_NO_ACTION(nextAction.getAction()); - ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); - } - - class PrepareElectResponseTest : public TopoCoordTest { - public: - - PrepareElectResponseTest() : - now(0), - round(OID::gen()), - cbData(NULL, ReplicationExecutor::CallbackHandle(), Status::OK()) {} - - virtual void setUp() { - TopoCoordTest::setUp(); - updateConfig(BSON("_id" << "rs0" << - "version" << 10 << - "members" << BSON_ARRAY( - BSON("_id" << 0 << "host" << "hself") << - BSON("_id" << 1 << "host" << "h1") << - BSON("_id" << 2 << - "host" << "h2" << - "priority" << 10) << - BSON("_id" << 3 << - "host" << "h3" << - "priority" << 10))), - 0); - } - - protected: - Date_t now; - OID round; - ReplicationExecutor::CallbackData cbData; - }; - - TEST_F(PrepareElectResponseTest, ElectResponseIncorrectReplSetName) { - // Test with incorrect replset name - ReplicationCoordinator::ReplSetElectArgs args; - args.set = "fakeset"; - args.round = round; - args.cfgver = 10; - args.whoid = 1; - - BSONObjBuilder responseBuilder; - Status result = Status(ErrorCodes::InternalError, "status not set by prepareElectResponse"); - startCapturingLogMessages(); - getTopoCoord().prepareElectResponse(args, now += 60000, OpTime(), &responseBuilder, &result); - stopCapturingLogMessages(); - BSONObj response = responseBuilder.obj(); - ASSERT_OK(result); - ASSERT_EQUALS(0, response["vote"].Int()); - ASSERT_EQUALS(round, response["round"].OID()); - ASSERT_EQUALS(1, - countLogLinesContaining("received an elect request for 'fakeset' but our " - "set name is 'rs0'")); - - // Make sure nay votes, do not prevent subsequent yeas (the way a yea vote would) - args.set = "rs0"; - BSONObjBuilder responseBuilder2; - getTopoCoord().prepareElectResponse(args, now++, OpTime(), &responseBuilder2, &result); - BSONObj response2 = responseBuilder2.obj(); - ASSERT_EQUALS(1, response2["vote"].Int()); - ASSERT_EQUALS(round, response2["round"].OID()); - } - - TEST_F(PrepareElectResponseTest, ElectResponseOurConfigStale) { - // Test with us having a stale config version - ReplicationCoordinator::ReplSetElectArgs args; - args.set = "rs0"; - args.round = round; - args.cfgver = 20; - args.whoid = 1; - - BSONObjBuilder responseBuilder; - Status result = Status(ErrorCodes::InternalError, "status not set by prepareElectResponse"); - startCapturingLogMessages(); - getTopoCoord().prepareElectResponse(args, now += 60000, OpTime(), &responseBuilder, &result); - stopCapturingLogMessages(); - BSONObj response = responseBuilder.obj(); - ASSERT_OK(result); - ASSERT_EQUALS(0, response["vote"].Int()); - 
ASSERT_EQUALS(round, response["round"].OID()); - ASSERT_EQUALS(1, - countLogLinesContaining("not voting because our config version is stale")); - - // Make sure nay votes, do not prevent subsequent yeas (the way a yea vote would) - args.cfgver = 10; - BSONObjBuilder responseBuilder2; - getTopoCoord().prepareElectResponse(args, now++, OpTime(), &responseBuilder2, &result); - BSONObj response2 = responseBuilder2.obj(); - ASSERT_EQUALS(1, response2["vote"].Int()); - ASSERT_EQUALS(round, response2["round"].OID()); - } - - TEST_F(PrepareElectResponseTest, ElectResponseTheirConfigStale) { - // Test with them having a stale config version - ReplicationCoordinator::ReplSetElectArgs args; - args.set = "rs0"; - args.round = round; - args.cfgver = 5; - args.whoid = 1; - - BSONObjBuilder responseBuilder; - Status result = Status(ErrorCodes::InternalError, "status not set by prepareElectResponse"); - startCapturingLogMessages(); - getTopoCoord().prepareElectResponse(args, now += 60000, OpTime(), &responseBuilder, &result); - stopCapturingLogMessages(); - BSONObj response = responseBuilder.obj(); - ASSERT_OK(result); - ASSERT_EQUALS(-10000, response["vote"].Int()); - ASSERT_EQUALS(round, response["round"].OID()); - ASSERT_EQUALS(1, - countLogLinesContaining("received stale config version # during election")); - - // Make sure nay votes, do not prevent subsequent yeas (the way a yea vote would) - args.cfgver = 10; - BSONObjBuilder responseBuilder2; - getTopoCoord().prepareElectResponse(args, now++, OpTime(), &responseBuilder2, &result); - BSONObj response2 = responseBuilder2.obj(); - ASSERT_EQUALS(1, response2["vote"].Int()); - ASSERT_EQUALS(round, response2["round"].OID()); - } - - TEST_F(PrepareElectResponseTest, ElectResponseNonExistentNode) { - // Test with a non-existent node - ReplicationCoordinator::ReplSetElectArgs args; - args.set = "rs0"; - args.round = round; - args.cfgver = 10; - args.whoid = 99; - - BSONObjBuilder responseBuilder; - Status result = Status(ErrorCodes::InternalError, "status not set by prepareElectResponse"); - startCapturingLogMessages(); - getTopoCoord().prepareElectResponse(args, now += 60000, OpTime(), &responseBuilder, &result); - stopCapturingLogMessages(); - BSONObj response = responseBuilder.obj(); - ASSERT_OK(result); - ASSERT_EQUALS(-10000, response["vote"].Int()); - ASSERT_EQUALS(round, response["round"].OID()); - ASSERT_EQUALS(1, countLogLinesContaining("couldn't find member with id 99")); - - // Make sure nay votes, do not prevent subsequent yeas (the way a yea vote would) - args.whoid = 1; - BSONObjBuilder responseBuilder2; - getTopoCoord().prepareElectResponse(args, now++, OpTime(), &responseBuilder2, &result); - BSONObj response2 = responseBuilder2.obj(); - ASSERT_EQUALS(1, response2["vote"].Int()); - ASSERT_EQUALS(round, response2["round"].OID()); - } - - TEST_F(PrepareElectResponseTest, ElectResponseWeArePrimary) { - // Test when we are already primary - ReplicationCoordinator::ReplSetElectArgs args; - args.set = "rs0"; - args.round = round; - args.cfgver = 10; - args.whoid = 1; - - getTopoCoord()._setCurrentPrimaryForTest(0); - - BSONObjBuilder responseBuilder; - Status result = Status(ErrorCodes::InternalError, "status not set by prepareElectResponse"); - startCapturingLogMessages(); - getTopoCoord().prepareElectResponse(args, now += 60000, OpTime(), &responseBuilder, &result); - stopCapturingLogMessages(); - BSONObj response = responseBuilder.obj(); - ASSERT_OK(result); - ASSERT_EQUALS(-10000, response["vote"].Int()); - ASSERT_EQUALS(round, 
response["round"].OID()); - ASSERT_EQUALS(1, countLogLinesContaining("I am already primary")); - - // Make sure nay votes, do not prevent subsequent yeas (the way a yea vote would) - getTopoCoord()._setCurrentPrimaryForTest(-1); - BSONObjBuilder responseBuilder2; - getTopoCoord().prepareElectResponse(args, now++, OpTime(), &responseBuilder2, &result); - BSONObj response2 = responseBuilder2.obj(); - ASSERT_EQUALS(1, response2["vote"].Int()); - ASSERT_EQUALS(round, response2["round"].OID()); - } - - TEST_F(PrepareElectResponseTest, ElectResponseSomeoneElseIsPrimary) { - // Test when someone else is already primary - ReplicationCoordinator::ReplSetElectArgs args; - args.set = "rs0"; - args.round = round; - args.cfgver = 10; - args.whoid = 1; - getTopoCoord()._setCurrentPrimaryForTest(2); - - BSONObjBuilder responseBuilder; - Status result = Status(ErrorCodes::InternalError, "status not set by prepareElectResponse"); - startCapturingLogMessages(); - getTopoCoord().prepareElectResponse(args, now += 60000, OpTime(), &responseBuilder, &result); - stopCapturingLogMessages(); - BSONObj response = responseBuilder.obj(); - ASSERT_OK(result); - ASSERT_EQUALS(-10000, response["vote"].Int()); - ASSERT_EQUALS(round, response["round"].OID()); - ASSERT_EQUALS(1, countLogLinesContaining("h2:27017 is already primary")); - - // Make sure nay votes, do not prevent subsequent yeas (the way a yea vote would) - getTopoCoord()._setCurrentPrimaryForTest(-1); - BSONObjBuilder responseBuilder2; - getTopoCoord().prepareElectResponse(args, now++, OpTime(), &responseBuilder2, &result); - BSONObj response2 = responseBuilder2.obj(); - ASSERT_EQUALS(1, response2["vote"].Int()); - ASSERT_EQUALS(round, response2["round"].OID()); - } - - TEST_F(PrepareElectResponseTest, ElectResponseNotHighestPriority) { - // Test trying to elect someone who isn't the highest priority node - ReplicationCoordinator::ReplSetElectArgs args; - args.set = "rs0"; - args.round = round; - args.cfgver = 10; - args.whoid = 1; - - heartbeatFromMember(HostAndPort("h3"), "rs0", MemberState::RS_SECONDARY, jsTime()); - - BSONObjBuilder responseBuilder; - Status result = Status(ErrorCodes::InternalError, "status not set by prepareElectResponse"); - startCapturingLogMessages(); - getTopoCoord().prepareElectResponse(args, now += 60000, OpTime(), &responseBuilder, &result); - stopCapturingLogMessages(); - BSONObj response = responseBuilder.obj(); - ASSERT_OK(result); - ASSERT_EQUALS(-10000, response["vote"].Int()); - ASSERT_EQUALS(round, response["round"].OID()); - ASSERT_EQUALS(1, countLogLinesContaining("h1:27017 has lower priority than h3:27017")); - - // Make sure nay votes, do not prevent subsequent yeas (the way a yea vote would) - args.whoid = 3; - BSONObjBuilder responseBuilder2; - getTopoCoord().prepareElectResponse(args, now++, OpTime(), &responseBuilder2, &result); - BSONObj response2 = responseBuilder2.obj(); - ASSERT_EQUALS(1, response2["vote"].Int()); - ASSERT_EQUALS(round, response2["round"].OID()); - } - - TEST_F(PrepareElectResponseTest, ElectResponseHighestPriorityOfLiveNodes) { - // Test trying to elect someone who isn't the highest priority node, but all higher nodes - // are down - ReplicationCoordinator::ReplSetElectArgs args; - args.set = "rs0"; - args.round = round; - args.cfgver = 10; - args.whoid = 1; - - receiveDownHeartbeat(HostAndPort("h3"), "rs0", OpTime()); - receiveDownHeartbeat(HostAndPort("h2"), "rs0", OpTime()); - - BSONObjBuilder responseBuilder; - Status result = Status::OK(); - startCapturingLogMessages(); - 
getTopoCoord().prepareElectResponse(args, now += 60000, OpTime(), &responseBuilder, &result); - stopCapturingLogMessages(); - BSONObj response = responseBuilder.obj(); - ASSERT_EQUALS(1, response["vote"].Int()); - ASSERT_EQUALS(round, response["round"].OID()); - } - - TEST_F(PrepareElectResponseTest, ElectResponseValidVotes) { - // Test a valid vote - ReplicationCoordinator::ReplSetElectArgs args; - args.set = "rs0"; - args.round = round; - args.cfgver = 10; - args.whoid = 2; - now = 100; - - BSONObjBuilder responseBuilder1; - Status result = Status(ErrorCodes::InternalError, "status not set by prepareElectResponse"); - startCapturingLogMessages(); - getTopoCoord().prepareElectResponse(args, now += 60000, OpTime(), &responseBuilder1, &result); - stopCapturingLogMessages(); - BSONObj response1 = responseBuilder1.obj(); - ASSERT_OK(result); - ASSERT_EQUALS(1, response1["vote"].Int()); - ASSERT_EQUALS(round, response1["round"].OID()); - ASSERT_EQUALS(1, countLogLinesContaining("voting yea for h2:27017 (2)")); - - // Test what would be a valid vote except that we already voted too recently - args.whoid = 3; - - BSONObjBuilder responseBuilder2; - startCapturingLogMessages(); - getTopoCoord().prepareElectResponse(args, now, OpTime(), &responseBuilder2, &result); - stopCapturingLogMessages(); - BSONObj response2 = responseBuilder2.obj(); - ASSERT_OK(result); - ASSERT_EQUALS(0, response2["vote"].Int()); - ASSERT_EQUALS(round, response2["round"].OID()); - ASSERT_EQUALS(1, countLogLinesContaining("voting no for h3:27017; " - "voted for h2:27017 0 secs ago")); - - // Test that after enough time passes the same vote can proceed - now += 30 * 1000 + 1; // just over 30 seconds later - - BSONObjBuilder responseBuilder3; - startCapturingLogMessages(); - getTopoCoord().prepareElectResponse(args, now++, OpTime(), &responseBuilder3, &result); - stopCapturingLogMessages(); - BSONObj response3 = responseBuilder3.obj(); - ASSERT_OK(result); - ASSERT_EQUALS(1, response3["vote"].Int()); - ASSERT_EQUALS(round, response3["round"].OID()); - ASSERT_EQUALS(1, countLogLinesContaining("voting yea for h3:27017 (3)")); - } - - TEST_F(TopoCoordTest, ElectResponseNotInConfig) { - ReplicationCoordinator::ReplSetElectArgs args; - BSONObjBuilder response; - Status status = Status(ErrorCodes::InternalError, "status not set by prepareElectResponse"); - getTopoCoord().prepareElectResponse(args, now(), OpTime(), &response, &status); - ASSERT_EQUALS(ErrorCodes::ReplicaSetNotFound, status); - ASSERT_EQUALS("Cannot participate in election because not initialized", status.reason()); - } - - class PrepareFreezeResponseTest : public TopoCoordTest { - public: - - virtual void setUp() { - TopoCoordTest::setUp(); - updateConfig(BSON("_id" << "rs0" << - "version" << 5 << - "members" << BSON_ARRAY( - BSON("_id" << 0 << "host" << "host1:27017") << - BSON("_id" << 1 << "host" << "host2:27017"))), - 0); - } - - BSONObj prepareFreezeResponse(int duration) { - BSONObjBuilder response; - startCapturingLogMessages(); - getTopoCoord().prepareFreezeResponse(now()++, duration, &response); - stopCapturingLogMessages(); - return response.obj(); - } - }; - - TEST_F(PrepareFreezeResponseTest, UnfreezeEvenWhenNotFrozen) { - BSONObj response = prepareFreezeResponse(0); - ASSERT_EQUALS("unfreezing", response["info"].String()); - ASSERT_EQUALS(1, countLogLinesContaining("replSet info 'unfreezing'")); - // 1 instead of 0 because it assigns to "now" in this case - ASSERT_EQUALS(1LL, getTopoCoord().getStepDownTime().asInt64()); } - 
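// The elect-response tests above exercise three vote values without naming them:
// 1 is a yea, 0 is a nay that does not block later yeas, and -10000 is a veto that
// counts heavily against the candidate's tally. A minimal standalone sketch of that
// scheme, assuming a hard-coded 30-second vote lease (the "just over 30 seconds
// later" case in ElectResponseValidVotes); the names and the simplified
// disqualification flag are hypothetical, not the real TopologyCoordinator code.
#include <cstdint>

enum ElectVote : int {
    kVeto = -10000,  // disqualified candidate: stale config, lower priority, a primary exists, ...
    kNay = 0,        // decline without vetoing, so later yea votes stay possible
    kYea = 1,
};

// Simplified decision: veto a disqualified candidate, abstain while our last
// yea vote is still inside the 30-second lease, otherwise vote yea.
inline ElectVote castElectVote(bool candidateDisqualified, std::int64_t millisSinceLastYea) {
    if (candidateDisqualified)
        return kVeto;
    if (millisSinceLastYea <= 30 * 1000)
        return kNay;
    return kYea;
}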
TEST_F(PrepareFreezeResponseTest, FreezeForOneSecond) { - BSONObj response = prepareFreezeResponse(1); - ASSERT_EQUALS("you really want to freeze for only 1 second?", - response["warning"].String()); - ASSERT_EQUALS(1, countLogLinesContaining("replSet info 'freezing' for 1 seconds")); - // 1001 because "now" was incremented once during initialization + 1000 ms wait - ASSERT_EQUALS(1001LL, getTopoCoord().getStepDownTime().asInt64()); - } - - TEST_F(PrepareFreezeResponseTest, FreezeForManySeconds) { - BSONObj response = prepareFreezeResponse(20); - ASSERT_TRUE(response.isEmpty()); - ASSERT_EQUALS(1, countLogLinesContaining("replSet info 'freezing' for 20 seconds")); - // 20001 because "now" was incremented once during initialization + 20000 ms wait - ASSERT_EQUALS(20001LL, getTopoCoord().getStepDownTime().asInt64()); - } - - TEST_F(PrepareFreezeResponseTest, UnfreezeEvenWhenNotFrozenWhilePrimary) { - makeSelfPrimary(); - BSONObj response = prepareFreezeResponse(0); - ASSERT_EQUALS("unfreezing", response["info"].String()); - // doesn't mention being primary in this case for some reason - ASSERT_EQUALS(0, countLogLinesContaining( - "replSet info received freeze command but we are primary")); - // 1 instead of 0 because it assigns to "now" in this case - ASSERT_EQUALS(1LL, getTopoCoord().getStepDownTime().asInt64()); - } - - TEST_F(PrepareFreezeResponseTest, FreezeForOneSecondWhilePrimary) { - makeSelfPrimary(); - BSONObj response = prepareFreezeResponse(1); - ASSERT_EQUALS("you really want to freeze for only 1 second?", - response["warning"].String()); - ASSERT_EQUALS(1, countLogLinesContaining( - "replSet info received freeze command but we are primary")); - ASSERT_EQUALS(0LL, getTopoCoord().getStepDownTime().asInt64()); - } - - TEST_F(PrepareFreezeResponseTest, FreezeForManySecondsWhilePrimary) { - makeSelfPrimary(); - BSONObj response = prepareFreezeResponse(20); - ASSERT_TRUE(response.isEmpty()); - ASSERT_EQUALS(1, countLogLinesContaining( - "replSet info received freeze command but we are primary")); - ASSERT_EQUALS(0LL, getTopoCoord().getStepDownTime().asInt64()); - } - - TEST_F(TopoCoordTest, UnfreezeWhileLoneNode) { - updateConfig(BSON("_id" << "rs0" << - "version" << 5 << - "members" << BSON_ARRAY(BSON("_id" << 0 << "host" << "host1:27017"))), +protected: + Date_t now; + OID round; + ReplicationExecutor::CallbackData cbData; +}; + +TEST_F(PrepareElectResponseTest, ElectResponseIncorrectReplSetName) { + // Test with incorrect replset name + ReplicationCoordinator::ReplSetElectArgs args; + args.set = "fakeset"; + args.round = round; + args.cfgver = 10; + args.whoid = 1; + + BSONObjBuilder responseBuilder; + Status result = Status(ErrorCodes::InternalError, "status not set by prepareElectResponse"); + startCapturingLogMessages(); + getTopoCoord().prepareElectResponse(args, now += 60000, OpTime(), &responseBuilder, &result); + stopCapturingLogMessages(); + BSONObj response = responseBuilder.obj(); + ASSERT_OK(result); + ASSERT_EQUALS(0, response["vote"].Int()); + ASSERT_EQUALS(round, response["round"].OID()); + ASSERT_EQUALS(1, + countLogLinesContaining( + "received an elect request for 'fakeset' but our " + "set name is 'rs0'")); + + // Make sure nay votes, do not prevent subsequent yeas (the way a yea vote would) + args.set = "rs0"; + BSONObjBuilder responseBuilder2; + getTopoCoord().prepareElectResponse(args, now++, OpTime(), &responseBuilder2, &result); + BSONObj response2 = responseBuilder2.obj(); + ASSERT_EQUALS(1, response2["vote"].Int()); + ASSERT_EQUALS(round, 
response2["round"].OID()); +} + +TEST_F(PrepareElectResponseTest, ElectResponseOurConfigStale) { + // Test with us having a stale config version + ReplicationCoordinator::ReplSetElectArgs args; + args.set = "rs0"; + args.round = round; + args.cfgver = 20; + args.whoid = 1; + + BSONObjBuilder responseBuilder; + Status result = Status(ErrorCodes::InternalError, "status not set by prepareElectResponse"); + startCapturingLogMessages(); + getTopoCoord().prepareElectResponse(args, now += 60000, OpTime(), &responseBuilder, &result); + stopCapturingLogMessages(); + BSONObj response = responseBuilder.obj(); + ASSERT_OK(result); + ASSERT_EQUALS(0, response["vote"].Int()); + ASSERT_EQUALS(round, response["round"].OID()); + ASSERT_EQUALS(1, countLogLinesContaining("not voting because our config version is stale")); + + // Make sure nay votes, do not prevent subsequent yeas (the way a yea vote would) + args.cfgver = 10; + BSONObjBuilder responseBuilder2; + getTopoCoord().prepareElectResponse(args, now++, OpTime(), &responseBuilder2, &result); + BSONObj response2 = responseBuilder2.obj(); + ASSERT_EQUALS(1, response2["vote"].Int()); + ASSERT_EQUALS(round, response2["round"].OID()); +} + +TEST_F(PrepareElectResponseTest, ElectResponseTheirConfigStale) { + // Test with them having a stale config version + ReplicationCoordinator::ReplSetElectArgs args; + args.set = "rs0"; + args.round = round; + args.cfgver = 5; + args.whoid = 1; + + BSONObjBuilder responseBuilder; + Status result = Status(ErrorCodes::InternalError, "status not set by prepareElectResponse"); + startCapturingLogMessages(); + getTopoCoord().prepareElectResponse(args, now += 60000, OpTime(), &responseBuilder, &result); + stopCapturingLogMessages(); + BSONObj response = responseBuilder.obj(); + ASSERT_OK(result); + ASSERT_EQUALS(-10000, response["vote"].Int()); + ASSERT_EQUALS(round, response["round"].OID()); + ASSERT_EQUALS(1, countLogLinesContaining("received stale config version # during election")); + + // Make sure nay votes, do not prevent subsequent yeas (the way a yea vote would) + args.cfgver = 10; + BSONObjBuilder responseBuilder2; + getTopoCoord().prepareElectResponse(args, now++, OpTime(), &responseBuilder2, &result); + BSONObj response2 = responseBuilder2.obj(); + ASSERT_EQUALS(1, response2["vote"].Int()); + ASSERT_EQUALS(round, response2["round"].OID()); +} + +TEST_F(PrepareElectResponseTest, ElectResponseNonExistentNode) { + // Test with a non-existent node + ReplicationCoordinator::ReplSetElectArgs args; + args.set = "rs0"; + args.round = round; + args.cfgver = 10; + args.whoid = 99; + + BSONObjBuilder responseBuilder; + Status result = Status(ErrorCodes::InternalError, "status not set by prepareElectResponse"); + startCapturingLogMessages(); + getTopoCoord().prepareElectResponse(args, now += 60000, OpTime(), &responseBuilder, &result); + stopCapturingLogMessages(); + BSONObj response = responseBuilder.obj(); + ASSERT_OK(result); + ASSERT_EQUALS(-10000, response["vote"].Int()); + ASSERT_EQUALS(round, response["round"].OID()); + ASSERT_EQUALS(1, countLogLinesContaining("couldn't find member with id 99")); + + // Make sure nay votes, do not prevent subsequent yeas (the way a yea vote would) + args.whoid = 1; + BSONObjBuilder responseBuilder2; + getTopoCoord().prepareElectResponse(args, now++, OpTime(), &responseBuilder2, &result); + BSONObj response2 = responseBuilder2.obj(); + ASSERT_EQUALS(1, response2["vote"].Int()); + ASSERT_EQUALS(round, response2["round"].OID()); +} + +TEST_F(PrepareElectResponseTest, 
ElectResponseWeArePrimary) { + // Test when we are already primary + ReplicationCoordinator::ReplSetElectArgs args; + args.set = "rs0"; + args.round = round; + args.cfgver = 10; + args.whoid = 1; + + getTopoCoord()._setCurrentPrimaryForTest(0); + + BSONObjBuilder responseBuilder; + Status result = Status(ErrorCodes::InternalError, "status not set by prepareElectResponse"); + startCapturingLogMessages(); + getTopoCoord().prepareElectResponse(args, now += 60000, OpTime(), &responseBuilder, &result); + stopCapturingLogMessages(); + BSONObj response = responseBuilder.obj(); + ASSERT_OK(result); + ASSERT_EQUALS(-10000, response["vote"].Int()); + ASSERT_EQUALS(round, response["round"].OID()); + ASSERT_EQUALS(1, countLogLinesContaining("I am already primary")); + + // Make sure nay votes, do not prevent subsequent yeas (the way a yea vote would) + getTopoCoord()._setCurrentPrimaryForTest(-1); + BSONObjBuilder responseBuilder2; + getTopoCoord().prepareElectResponse(args, now++, OpTime(), &responseBuilder2, &result); + BSONObj response2 = responseBuilder2.obj(); + ASSERT_EQUALS(1, response2["vote"].Int()); + ASSERT_EQUALS(round, response2["round"].OID()); +} + +TEST_F(PrepareElectResponseTest, ElectResponseSomeoneElseIsPrimary) { + // Test when someone else is already primary + ReplicationCoordinator::ReplSetElectArgs args; + args.set = "rs0"; + args.round = round; + args.cfgver = 10; + args.whoid = 1; + getTopoCoord()._setCurrentPrimaryForTest(2); + + BSONObjBuilder responseBuilder; + Status result = Status(ErrorCodes::InternalError, "status not set by prepareElectResponse"); + startCapturingLogMessages(); + getTopoCoord().prepareElectResponse(args, now += 60000, OpTime(), &responseBuilder, &result); + stopCapturingLogMessages(); + BSONObj response = responseBuilder.obj(); + ASSERT_OK(result); + ASSERT_EQUALS(-10000, response["vote"].Int()); + ASSERT_EQUALS(round, response["round"].OID()); + ASSERT_EQUALS(1, countLogLinesContaining("h2:27017 is already primary")); + + // Make sure nay votes, do not prevent subsequent yeas (the way a yea vote would) + getTopoCoord()._setCurrentPrimaryForTest(-1); + BSONObjBuilder responseBuilder2; + getTopoCoord().prepareElectResponse(args, now++, OpTime(), &responseBuilder2, &result); + BSONObj response2 = responseBuilder2.obj(); + ASSERT_EQUALS(1, response2["vote"].Int()); + ASSERT_EQUALS(round, response2["round"].OID()); +} + +TEST_F(PrepareElectResponseTest, ElectResponseNotHighestPriority) { + // Test trying to elect someone who isn't the highest priority node + ReplicationCoordinator::ReplSetElectArgs args; + args.set = "rs0"; + args.round = round; + args.cfgver = 10; + args.whoid = 1; + + heartbeatFromMember(HostAndPort("h3"), "rs0", MemberState::RS_SECONDARY, jsTime()); + + BSONObjBuilder responseBuilder; + Status result = Status(ErrorCodes::InternalError, "status not set by prepareElectResponse"); + startCapturingLogMessages(); + getTopoCoord().prepareElectResponse(args, now += 60000, OpTime(), &responseBuilder, &result); + stopCapturingLogMessages(); + BSONObj response = responseBuilder.obj(); + ASSERT_OK(result); + ASSERT_EQUALS(-10000, response["vote"].Int()); + ASSERT_EQUALS(round, response["round"].OID()); + ASSERT_EQUALS(1, countLogLinesContaining("h1:27017 has lower priority than h3:27017")); + + // Make sure nay votes, do not prevent subsequent yeas (the way a yea vote would) + args.whoid = 3; + BSONObjBuilder responseBuilder2; + getTopoCoord().prepareElectResponse(args, now++, OpTime(), &responseBuilder2, &result); + BSONObj response2 = 
responseBuilder2.obj(); + ASSERT_EQUALS(1, response2["vote"].Int()); + ASSERT_EQUALS(round, response2["round"].OID()); +} + +TEST_F(PrepareElectResponseTest, ElectResponseHighestPriorityOfLiveNodes) { + // Test trying to elect someone who isn't the highest priority node, but all higher nodes + // are down + ReplicationCoordinator::ReplSetElectArgs args; + args.set = "rs0"; + args.round = round; + args.cfgver = 10; + args.whoid = 1; + + receiveDownHeartbeat(HostAndPort("h3"), "rs0", OpTime()); + receiveDownHeartbeat(HostAndPort("h2"), "rs0", OpTime()); + + BSONObjBuilder responseBuilder; + Status result = Status::OK(); + startCapturingLogMessages(); + getTopoCoord().prepareElectResponse(args, now += 60000, OpTime(), &responseBuilder, &result); + stopCapturingLogMessages(); + BSONObj response = responseBuilder.obj(); + ASSERT_EQUALS(1, response["vote"].Int()); + ASSERT_EQUALS(round, response["round"].OID()); +} + +TEST_F(PrepareElectResponseTest, ElectResponseValidVotes) { + // Test a valid vote + ReplicationCoordinator::ReplSetElectArgs args; + args.set = "rs0"; + args.round = round; + args.cfgver = 10; + args.whoid = 2; + now = 100; + + BSONObjBuilder responseBuilder1; + Status result = Status(ErrorCodes::InternalError, "status not set by prepareElectResponse"); + startCapturingLogMessages(); + getTopoCoord().prepareElectResponse(args, now += 60000, OpTime(), &responseBuilder1, &result); + stopCapturingLogMessages(); + BSONObj response1 = responseBuilder1.obj(); + ASSERT_OK(result); + ASSERT_EQUALS(1, response1["vote"].Int()); + ASSERT_EQUALS(round, response1["round"].OID()); + ASSERT_EQUALS(1, countLogLinesContaining("voting yea for h2:27017 (2)")); + + // Test what would be a valid vote except that we already voted too recently + args.whoid = 3; + + BSONObjBuilder responseBuilder2; + startCapturingLogMessages(); + getTopoCoord().prepareElectResponse(args, now, OpTime(), &responseBuilder2, &result); + stopCapturingLogMessages(); + BSONObj response2 = responseBuilder2.obj(); + ASSERT_OK(result); + ASSERT_EQUALS(0, response2["vote"].Int()); + ASSERT_EQUALS(round, response2["round"].OID()); + ASSERT_EQUALS(1, + countLogLinesContaining( + "voting no for h3:27017; " + "voted for h2:27017 0 secs ago")); + + // Test that after enough time passes the same vote can proceed + now += 30 * 1000 + 1; // just over 30 seconds later + + BSONObjBuilder responseBuilder3; + startCapturingLogMessages(); + getTopoCoord().prepareElectResponse(args, now++, OpTime(), &responseBuilder3, &result); + stopCapturingLogMessages(); + BSONObj response3 = responseBuilder3.obj(); + ASSERT_OK(result); + ASSERT_EQUALS(1, response3["vote"].Int()); + ASSERT_EQUALS(round, response3["round"].OID()); + ASSERT_EQUALS(1, countLogLinesContaining("voting yea for h3:27017 (3)")); +} + +TEST_F(TopoCoordTest, ElectResponseNotInConfig) { + ReplicationCoordinator::ReplSetElectArgs args; + BSONObjBuilder response; + Status status = Status(ErrorCodes::InternalError, "status not set by prepareElectResponse"); + getTopoCoord().prepareElectResponse(args, now(), OpTime(), &response, &status); + ASSERT_EQUALS(ErrorCodes::ReplicaSetNotFound, status); + ASSERT_EQUALS("Cannot participate in election because not initialized", status.reason()); +} + +class PrepareFreezeResponseTest : public TopoCoordTest { +public: + virtual void setUp() { + TopoCoordTest::setUp(); + updateConfig(BSON("_id" + << "rs0" + << "version" << 5 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "host1:27017") + << BSON("_id" << 1 << "host" + << "host2:27017"))), 
0); - setSelfMemberState(MemberState::RS_SECONDARY); - - BSONObjBuilder response; - getTopoCoord().prepareFreezeResponse(now()++, 20, &response); - ASSERT(response.obj().isEmpty()); - BSONObjBuilder response2; - getTopoCoord().prepareFreezeResponse(now()++, 0, &response2); - ASSERT_EQUALS("unfreezing", response2.obj()["info"].String()); - ASSERT(TopologyCoordinator::Role::candidate == getTopoCoord().getRole()); } - class ShutdownInProgressTest : public TopoCoordTest { - public: - - ShutdownInProgressTest() : - ourCbData(NULL, - ReplicationExecutor::CallbackHandle(), - Status(ErrorCodes::CallbackCanceled, "")) {} - - virtual ReplicationExecutor::CallbackData cbData() { return ourCbData; } - - private: - ReplicationExecutor::CallbackData ourCbData; - }; - - TEST_F(ShutdownInProgressTest, ShutdownInProgressWhenCallbackCanceledSyncFrom) { - Status result = Status::OK(); + BSONObj prepareFreezeResponse(int duration) { BSONObjBuilder response; - getTopoCoord().prepareSyncFromResponse(cbData(), - HostAndPort("host2:27017"), - OpTime(0,0), - &response, - &result); - ASSERT_EQUALS(ErrorCodes::ShutdownInProgress, result); - ASSERT_TRUE(response.obj().isEmpty()); - - } - - TEST_F(ShutdownInProgressTest, ShutDownInProgressWhenCallbackCanceledStatus) { - Status result = Status::OK(); - BSONObjBuilder response; - getTopoCoord().prepareStatusResponse(cbData(), - Date_t(0), - 0, - OpTime(0,0), - &response, - &result); - ASSERT_EQUALS(ErrorCodes::ShutdownInProgress, result); - ASSERT_TRUE(response.obj().isEmpty()); - } - - class PrepareHeartbeatResponseTest : public TopoCoordTest { - public: - - virtual void setUp() { - TopoCoordTest::setUp(); - updateConfig(BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 10 << "host" << "hself") << - BSON("_id" << 20 << "host" << "h2") << - BSON("_id" << 30 << "host" << "h3"))), - 0); - setSelfMemberState(MemberState::RS_SECONDARY); - } - - void prepareHeartbeatResponse(const ReplSetHeartbeatArgs& args, - OpTime lastOpApplied, - ReplSetHeartbeatResponse* response, - Status* result) { - *result = getTopoCoord().prepareHeartbeatResponse(now()++, - args, - "rs0", - lastOpApplied, - response); - } - - }; - - TEST_F(PrepareHeartbeatResponseTest, PrepareHeartbeatResponseBadProtocolVersion) { - // set up args with bad protocol version - ReplSetHeartbeatArgs args; - args.setProtocolVersion(3); - ReplSetHeartbeatResponse response; - Status result(ErrorCodes::InternalError, "prepareHeartbeatResponse didn't set result"); - - // prepare response and check the results - prepareHeartbeatResponse(args, OpTime(0,0), &response, &result); - ASSERT_EQUALS(ErrorCodes::BadValue, result); - ASSERT_EQUALS("replset: incompatible replset protocol version: 3", result.reason()); - ASSERT_EQUALS("", response.getHbMsg()); - } - - TEST_F(PrepareHeartbeatResponseTest, PrepareHeartbeatResponseFromSelf) { - // set up args with incorrect replset name - ReplSetHeartbeatArgs args; - args.setProtocolVersion(1); - args.setSetName("rs0"); - args.setSenderId(10); - ReplSetHeartbeatResponse response; - Status result(ErrorCodes::InternalError, "prepareHeartbeatResponse didn't set result"); - prepareHeartbeatResponse(args, OpTime(0,0), &response, &result); - ASSERT_EQUALS(ErrorCodes::BadValue, result); - ASSERT(result.reason().find("from member with the same member ID as our self")) << - "Actual string was \"" << result.reason() << '"'; - ASSERT_EQUALS("", response.getHbMsg()); - } - - TEST_F(PrepareHeartbeatResponseTest, PrepareHeartbeatResponseBadSetName) { - // set up args 
with incorrect replset name - ReplSetHeartbeatArgs args; - args.setProtocolVersion(1); - args.setSetName("rs1"); - ReplSetHeartbeatResponse response; - Status result(ErrorCodes::InternalError, "prepareHeartbeatResponse didn't set result"); - startCapturingLogMessages(); - prepareHeartbeatResponse(args, OpTime(0,0), &response, &result); + getTopoCoord().prepareFreezeResponse(now()++, duration, &response); stopCapturingLogMessages(); - ASSERT_EQUALS(ErrorCodes::InconsistentReplicaSetNames, result); - ASSERT(result.reason().find("repl set names do not match")) << "Actual string was \"" << - result.reason() << '"'; - ASSERT_EQUALS(1, - countLogLinesContaining("replSet set names do not match, ours: rs0; remote " - "node's: rs1")); - ASSERT_TRUE(response.isMismatched()); - ASSERT_EQUALS("", response.getHbMsg()); + return response.obj(); } - - TEST_F(PrepareHeartbeatResponseTest, PrepareHeartbeatResponseSenderIDMissing) { - // set up args without a senderID - ReplSetHeartbeatArgs args; - args.setProtocolVersion(1); - args.setSetName("rs0"); - args.setConfigVersion(1); - ReplSetHeartbeatResponse response; - Status result(ErrorCodes::InternalError, "prepareHeartbeatResponse didn't set result"); - - // prepare response and check the results - prepareHeartbeatResponse(args, OpTime(0,0), &response, &result); - ASSERT_OK(result); - ASSERT_FALSE(response.isElectable()); - ASSERT_TRUE(response.isReplSet()); - ASSERT_EQUALS(MemberState::RS_SECONDARY, response.getState().s); - ASSERT_EQUALS(OpTime(0,0), response.getOpTime()); - ASSERT_EQUALS(Seconds(0).total_milliseconds(), response.getTime().total_milliseconds()); - ASSERT_EQUALS("", response.getHbMsg()); - ASSERT_EQUALS("rs0", response.getReplicaSetName()); - ASSERT_EQUALS(1, response.getVersion()); - } - - TEST_F(PrepareHeartbeatResponseTest, PrepareHeartbeatResponseSenderIDNotInConfig) { - // set up args with a senderID which is not present in our config - ReplSetHeartbeatArgs args; - args.setProtocolVersion(1); - args.setSetName("rs0"); - args.setConfigVersion(1); - args.setSenderId(2); - ReplSetHeartbeatResponse response; - Status result(ErrorCodes::InternalError, "prepareHeartbeatResponse didn't set result"); - - // prepare response and check the results - prepareHeartbeatResponse(args, OpTime(0,0), &response, &result); - ASSERT_OK(result); - ASSERT_FALSE(response.isElectable()); - ASSERT_TRUE(response.isReplSet()); - ASSERT_EQUALS(MemberState::RS_SECONDARY, response.getState().s); - ASSERT_EQUALS(OpTime(0,0), response.getOpTime()); - ASSERT_EQUALS(Seconds(0).total_milliseconds(), response.getTime().total_milliseconds()); - ASSERT_EQUALS("", response.getHbMsg()); - ASSERT_EQUALS("rs0", response.getReplicaSetName()); - ASSERT_EQUALS(1, response.getVersion()); +}; + +TEST_F(PrepareFreezeResponseTest, UnfreezeEvenWhenNotFrozen) { + BSONObj response = prepareFreezeResponse(0); + ASSERT_EQUALS("unfreezing", response["info"].String()); + ASSERT_EQUALS(1, countLogLinesContaining("replSet info 'unfreezing'")); + // 1 instead of 0 because it assigns to "now" in this case + ASSERT_EQUALS(1LL, getTopoCoord().getStepDownTime().asInt64()); +} + +TEST_F(PrepareFreezeResponseTest, FreezeForOneSecond) { + BSONObj response = prepareFreezeResponse(1); + ASSERT_EQUALS("you really want to freeze for only 1 second?", response["warning"].String()); + ASSERT_EQUALS(1, countLogLinesContaining("replSet info 'freezing' for 1 seconds")); + // 1001 because "now" was incremented once during initialization + 1000 ms wait + ASSERT_EQUALS(1001LL, 
getTopoCoord().getStepDownTime().asInt64()); +} + +TEST_F(PrepareFreezeResponseTest, FreezeForManySeconds) { + BSONObj response = prepareFreezeResponse(20); + ASSERT_TRUE(response.isEmpty()); + ASSERT_EQUALS(1, countLogLinesContaining("replSet info 'freezing' for 20 seconds")); + // 20001 because "now" was incremented once during initialization + 20000 ms wait + ASSERT_EQUALS(20001LL, getTopoCoord().getStepDownTime().asInt64()); +} + +TEST_F(PrepareFreezeResponseTest, UnfreezeEvenWhenNotFrozenWhilePrimary) { + makeSelfPrimary(); + BSONObj response = prepareFreezeResponse(0); + ASSERT_EQUALS("unfreezing", response["info"].String()); + // doesn't mention being primary in this case for some reason + ASSERT_EQUALS( + 0, countLogLinesContaining("replSet info received freeze command but we are primary")); + // 1 instead of 0 because it assigns to "now" in this case + ASSERT_EQUALS(1LL, getTopoCoord().getStepDownTime().asInt64()); +} + +TEST_F(PrepareFreezeResponseTest, FreezeForOneSecondWhilePrimary) { + makeSelfPrimary(); + BSONObj response = prepareFreezeResponse(1); + ASSERT_EQUALS("you really want to freeze for only 1 second?", response["warning"].String()); + ASSERT_EQUALS( + 1, countLogLinesContaining("replSet info received freeze command but we are primary")); + ASSERT_EQUALS(0LL, getTopoCoord().getStepDownTime().asInt64()); +} + +TEST_F(PrepareFreezeResponseTest, FreezeForManySecondsWhilePrimary) { + makeSelfPrimary(); + BSONObj response = prepareFreezeResponse(20); + ASSERT_TRUE(response.isEmpty()); + ASSERT_EQUALS( + 1, countLogLinesContaining("replSet info received freeze command but we are primary")); + ASSERT_EQUALS(0LL, getTopoCoord().getStepDownTime().asInt64()); +} + +TEST_F(TopoCoordTest, UnfreezeWhileLoneNode) { + updateConfig(BSON("_id" + << "rs0" + << "version" << 5 << "members" << BSON_ARRAY(BSON("_id" << 0 << "host" + << "host1:27017"))), + 0); + setSelfMemberState(MemberState::RS_SECONDARY); + + BSONObjBuilder response; + getTopoCoord().prepareFreezeResponse(now()++, 20, &response); + ASSERT(response.obj().isEmpty()); + BSONObjBuilder response2; + getTopoCoord().prepareFreezeResponse(now()++, 0, &response2); + ASSERT_EQUALS("unfreezing", response2.obj()["info"].String()); + ASSERT(TopologyCoordinator::Role::candidate == getTopoCoord().getRole()); +} + +class ShutdownInProgressTest : public TopoCoordTest { +public: + ShutdownInProgressTest() + : ourCbData(NULL, + ReplicationExecutor::CallbackHandle(), + Status(ErrorCodes::CallbackCanceled, "")) {} + + virtual ReplicationExecutor::CallbackData cbData() { + return ourCbData; } - TEST_F(PrepareHeartbeatResponseTest, PrepareHeartbeatResponseConfigVersionLow) { - // set up args with a config version lower than ours - ReplSetHeartbeatArgs args; - args.setProtocolVersion(1); - args.setConfigVersion(0); - args.setSetName("rs0"); - args.setSenderId(20); - ReplSetHeartbeatResponse response; - Status result(ErrorCodes::InternalError, "prepareHeartbeatResponse didn't set result"); - - // prepare response and check the results - prepareHeartbeatResponse(args, OpTime(0,0), &response, &result); - ASSERT_OK(result); - ASSERT_TRUE(response.hasConfig()); - ASSERT_FALSE(response.isElectable()); - ASSERT_TRUE(response.isReplSet()); - ASSERT_EQUALS(MemberState::RS_SECONDARY, response.getState().s); - ASSERT_EQUALS(OpTime(0,0), response.getOpTime()); - ASSERT_EQUALS(Seconds(0).total_milliseconds(), response.getTime().total_milliseconds()); - ASSERT_EQUALS("", response.getHbMsg()); - ASSERT_EQUALS("rs0", response.getReplicaSetName()); - 
ASSERT_EQUALS(1, response.getVersion()); - } - - TEST_F(PrepareHeartbeatResponseTest, PrepareHeartbeatResponseConfigVersionHigh) { - // set up args with a config version higher than ours - ReplSetHeartbeatArgs args; - args.setProtocolVersion(1); - args.setConfigVersion(10); - args.setSetName("rs0"); - args.setSenderId(20); - ReplSetHeartbeatResponse response; - Status result(ErrorCodes::InternalError, "prepareHeartbeatResponse didn't set result"); - - // prepare response and check the results - prepareHeartbeatResponse(args, OpTime(0,0), &response, &result); - ASSERT_OK(result); - ASSERT_FALSE(response.hasConfig()); - ASSERT_FALSE(response.isElectable()); - ASSERT_TRUE(response.isReplSet()); - ASSERT_EQUALS(MemberState::RS_SECONDARY, response.getState().s); - ASSERT_EQUALS(OpTime(0,0), response.getOpTime()); - ASSERT_EQUALS(Seconds(0).total_milliseconds(), response.getTime().total_milliseconds()); - ASSERT_EQUALS("", response.getHbMsg()); - ASSERT_EQUALS("rs0", response.getReplicaSetName()); - ASSERT_EQUALS(1, response.getVersion()); - } - - TEST_F(PrepareHeartbeatResponseTest, PrepareHeartbeatResponseSenderDown) { - // set up args with sender down from our perspective - ReplSetHeartbeatArgs args; - args.setProtocolVersion(1); - args.setConfigVersion(1); - args.setSetName("rs0"); - args.setSenderId(20); - ReplSetHeartbeatResponse response; - Status result(ErrorCodes::InternalError, "prepareHeartbeatResponse didn't set result"); - - // prepare response and check the results - prepareHeartbeatResponse(args, OpTime(0,0), &response, &result); - ASSERT_OK(result); - ASSERT_FALSE(response.isElectable()); - ASSERT_TRUE(response.isReplSet()); - ASSERT_EQUALS(MemberState::RS_SECONDARY, response.getState().s); - ASSERT_EQUALS(OpTime(0,0), response.getOpTime()); - ASSERT_EQUALS(Seconds(0).total_milliseconds(), response.getTime().total_milliseconds()); - ASSERT_EQUALS("", response.getHbMsg()); - ASSERT_EQUALS("rs0", response.getReplicaSetName()); - ASSERT_EQUALS(1, response.getVersion()); - ASSERT_TRUE(response.isStateDisagreement()); - } - - TEST_F(PrepareHeartbeatResponseTest, PrepareHeartbeatResponseSenderUp) { - // set up args and acknowledge sender - heartbeatFromMember(HostAndPort("h2"), "rs0", MemberState::RS_SECONDARY, OpTime(0,0)); - ReplSetHeartbeatArgs args; - args.setProtocolVersion(1); - args.setConfigVersion(1); - args.setSetName("rs0"); - args.setSenderId(20); - ReplSetHeartbeatResponse response; - Status result(ErrorCodes::InternalError, "prepareHeartbeatResponse didn't set result"); - - // prepare response and check the results - prepareHeartbeatResponse(args, OpTime(100,0), &response, &result); - ASSERT_OK(result); - // this changed to true because we can now see a majority, unlike in the previous cases - ASSERT_TRUE(response.isElectable()); - ASSERT_TRUE(response.isReplSet()); - ASSERT_EQUALS(MemberState::RS_SECONDARY, response.getState().s); - ASSERT_EQUALS(OpTime(100,0), response.getOpTime()); - ASSERT_EQUALS(Seconds(0).total_milliseconds(), response.getTime().total_milliseconds()); - ASSERT_EQUALS("", response.getHbMsg()); - ASSERT_EQUALS("rs0", response.getReplicaSetName()); - ASSERT_EQUALS(1, response.getVersion()); - } - - TEST_F(TopoCoordTest, PrepareHeartbeatResponseNoConfigYet) { - // set up args - ReplSetHeartbeatArgs args; - args.setProtocolVersion(1); - args.setConfigVersion(1); - args.setSetName("rs0"); - args.setSenderId(20); - ReplSetHeartbeatResponse response; - // prepare response and check the results - Status result = 
getTopoCoord().prepareHeartbeatResponse(now()++, - args, - "rs0", - OpTime(0,0), - &response); - ASSERT_OK(result); - // not electable because we have no config installed yet, unlike in the previous cases - ASSERT_FALSE(response.isElectable()); - ASSERT_TRUE(response.isReplSet()); - ASSERT_EQUALS(MemberState::RS_STARTUP, response.getState().s); - ASSERT_EQUALS(OpTime(0,0), response.getOpTime()); - ASSERT_EQUALS(Seconds(0).total_milliseconds(), response.getTime().total_milliseconds()); - ASSERT_EQUALS("", response.getHbMsg()); - ASSERT_EQUALS("", response.getReplicaSetName()); - ASSERT_EQUALS(-2, response.getVersion()); - } - - TEST_F(PrepareHeartbeatResponseTest, PrepareHeartbeatResponseAsPrimary) { - makeSelfPrimary(OpTime(10,0)); - heartbeatFromMember(HostAndPort("h2"), "rs0", MemberState::RS_SECONDARY, OpTime(0,0)); - - ReplSetHeartbeatArgs args; - args.setProtocolVersion(1); - args.setConfigVersion(1); - args.setSetName("rs0"); - args.setSenderId(20); - ReplSetHeartbeatResponse response; - Status result(ErrorCodes::InternalError, "prepareHeartbeatResponse didn't set result"); - - // prepare response and check the results - prepareHeartbeatResponse(args, OpTime(11,0), &response, &result); - ASSERT_OK(result); - // electable because we are already primary - ASSERT_TRUE(response.isElectable()); - ASSERT_TRUE(response.isReplSet()); - ASSERT_EQUALS(MemberState::RS_PRIMARY, response.getState().s); - ASSERT_EQUALS(OpTime(11,0), response.getOpTime()); - ASSERT_EQUALS(OpTime(10,0), response.getElectionTime()); - ASSERT_EQUALS(Seconds(0).total_milliseconds(), response.getTime().total_milliseconds()); - ASSERT_EQUALS("", response.getHbMsg()); - ASSERT_EQUALS("rs0", response.getReplicaSetName()); - ASSERT_EQUALS(1, response.getVersion()); - } - - TEST_F(PrepareHeartbeatResponseTest, PrepareHeartbeatResponseWithSyncSource) { - // get a sync source - heartbeatFromMember(HostAndPort("h3"), "rs0", MemberState::RS_SECONDARY, OpTime(0,0)); - heartbeatFromMember(HostAndPort("h3"), "rs0", MemberState::RS_SECONDARY, OpTime(0,0)); - heartbeatFromMember(HostAndPort("h2"), "rs0", MemberState::RS_SECONDARY, OpTime(1,0)); - heartbeatFromMember(HostAndPort("h2"), "rs0", MemberState::RS_SECONDARY, OpTime(1,0)); - getTopoCoord().chooseNewSyncSource(now()++, OpTime(0,0)); - - // set up args - ReplSetHeartbeatArgs args; - args.setProtocolVersion(1); - args.setConfigVersion(1); - args.setSetName("rs0"); - args.setSenderId(20); - ReplSetHeartbeatResponse response; - Status result(ErrorCodes::InternalError, "prepareHeartbeatResponse didn't set result"); - - // prepare response and check the results - prepareHeartbeatResponse(args, OpTime(100,0), &response, &result); - ASSERT_OK(result); - ASSERT_TRUE(response.isElectable()); - ASSERT_TRUE(response.isReplSet()); - ASSERT_EQUALS(MemberState::RS_SECONDARY, response.getState().s); - ASSERT_EQUALS(OpTime(100,0), response.getOpTime()); - ASSERT_EQUALS(Seconds(0).total_milliseconds(), response.getTime().total_milliseconds()); - // changed to a syncing message because our sync source changed recently - ASSERT_EQUALS("syncing from: h2:27017", response.getHbMsg()); - ASSERT_EQUALS("rs0", response.getReplicaSetName()); - ASSERT_EQUALS(1, response.getVersion()); - ASSERT_EQUALS(HostAndPort("h2").toString(), response.getSyncingTo()); - } - - TEST_F(TopoCoordTest, SetFollowerSecondaryWhenLoneNode) { - ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); - ASSERT_EQUALS(MemberState::RS_STARTUP, getTopoCoord().getMemberState().s); - updateConfig(BSON("_id" << 
"rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 1 << "host" << "hself"))), - 0); - ASSERT_EQUALS(MemberState::RS_STARTUP2, getTopoCoord().getMemberState().s); - - // if we are the only node, we should become a candidate when we transition to SECONDARY - ASSERT_FALSE(TopologyCoordinator::Role::candidate == getTopoCoord().getRole()); - getTopoCoord().setFollowerMode(MemberState::RS_SECONDARY); - ASSERT_TRUE(TopologyCoordinator::Role::candidate == getTopoCoord().getRole()); - ASSERT_EQUALS(MemberState::RS_SECONDARY, getTopoCoord().getMemberState().s); - } - - TEST_F(TopoCoordTest, CandidateWhenLoneSecondaryNodeReconfig) { - ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); - ASSERT_EQUALS(MemberState::RS_STARTUP, getTopoCoord().getMemberState().s); - ReplicaSetConfig cfg; - cfg.initialize(BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 1 << "host" << "hself" << "priority" << 0)))); - getTopoCoord().updateConfig(cfg, 0, now()++, OpTime()); - ASSERT_EQUALS(MemberState::RS_STARTUP2, getTopoCoord().getMemberState().s); - - ASSERT_FALSE(TopologyCoordinator::Role::candidate == getTopoCoord().getRole()); - getTopoCoord().setFollowerMode(MemberState::RS_SECONDARY); - ASSERT_FALSE(TopologyCoordinator::Role::candidate == getTopoCoord().getRole()); - ASSERT_EQUALS(MemberState::RS_SECONDARY, getTopoCoord().getMemberState().s); - - // we should become a candidate when we reconfig to become electable - - updateConfig(BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 1 << "host" << "hself"))), - 0); - ASSERT_TRUE(TopologyCoordinator::Role::candidate == getTopoCoord().getRole()); - } - - TEST_F(TopoCoordTest, SetFollowerSecondaryWhenLoneUnelectableNode) { - ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); - ASSERT_EQUALS(MemberState::RS_STARTUP, getTopoCoord().getMemberState().s); - ReplicaSetConfig cfg; - cfg.initialize(BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 1 << "host" << "hself" << "priority" << 0)))); - - getTopoCoord().updateConfig(cfg, 0, now()++, OpTime()); - ASSERT_EQUALS(MemberState::RS_STARTUP2, getTopoCoord().getMemberState().s); - - // despite being the only node, we are unelectable, so we should not become a candidate - ASSERT_FALSE(TopologyCoordinator::Role::candidate == getTopoCoord().getRole()); - getTopoCoord().setFollowerMode(MemberState::RS_SECONDARY); - ASSERT_FALSE(TopologyCoordinator::Role::candidate == getTopoCoord().getRole()); - ASSERT_EQUALS(MemberState::RS_SECONDARY, getTopoCoord().getMemberState().s); - } - - TEST_F(TopoCoordTest, ReconfigToBeAddedToTheSet) { - ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); - ASSERT_EQUALS(MemberState::RS_STARTUP, getTopoCoord().getMemberState().s); - // config to be absent from the set - updateConfig(BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 1 << "host" << "host2:27017") << - BSON("_id" << 2 << "host" << "host3:27017"))), - -1); - // should become removed since we are not in the set - ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); - ASSERT_EQUALS(MemberState::RS_REMOVED, getTopoCoord().getMemberState().s); - - // reconfig to add to set - updateConfig(BSON("_id" << "rs0" << - "version" << 2 << - "members" << BSON_ARRAY( - BSON("_id" << 0 << "host" << "host1:27017") << - BSON("_id" << 1 << "host" << "host2:27017") << - BSON("_id" << 2 << "host" 
<< "host3:27017"))), - 0); - // having been added to the config, we should no longer be REMOVED and should enter STARTUP2 - ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); - ASSERT_EQUALS(MemberState::RS_STARTUP2, getTopoCoord().getMemberState().s); - } - - TEST_F(TopoCoordTest, ReconfigToBeRemovedFromTheSet) { - ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); - ASSERT_EQUALS(MemberState::RS_STARTUP, getTopoCoord().getMemberState().s); - updateConfig(BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 0 << "host" << "host1:27017") << - BSON("_id" << 1 << "host" << "host2:27017") << - BSON("_id" << 2 << "host" << "host3:27017"))), - 0); - ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); - ASSERT_EQUALS(MemberState::RS_STARTUP2, getTopoCoord().getMemberState().s); - - // reconfig to remove self - updateConfig(BSON("_id" << "rs0" << - "version" << 2 << - "members" << BSON_ARRAY( - BSON("_id" << 1 << "host" << "host2:27017") << - BSON("_id" << 2 << "host" << "host3:27017"))), - -1); - // should become removed since we are no longer in the set - ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); - ASSERT_EQUALS(MemberState::RS_REMOVED, getTopoCoord().getMemberState().s); - } - - TEST_F(TopoCoordTest, ReconfigToBeRemovedFromTheSetAsPrimary) { - ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); - ASSERT_EQUALS(MemberState::RS_STARTUP, getTopoCoord().getMemberState().s); - updateConfig(BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 0 << "host" << "host1:27017"))), +private: + ReplicationExecutor::CallbackData ourCbData; +}; + +TEST_F(ShutdownInProgressTest, ShutdownInProgressWhenCallbackCanceledSyncFrom) { + Status result = Status::OK(); + BSONObjBuilder response; + getTopoCoord().prepareSyncFromResponse( + cbData(), HostAndPort("host2:27017"), OpTime(0, 0), &response, &result); + ASSERT_EQUALS(ErrorCodes::ShutdownInProgress, result); + ASSERT_TRUE(response.obj().isEmpty()); +} + +TEST_F(ShutdownInProgressTest, ShutDownInProgressWhenCallbackCanceledStatus) { + Status result = Status::OK(); + BSONObjBuilder response; + getTopoCoord().prepareStatusResponse(cbData(), Date_t(0), 0, OpTime(0, 0), &response, &result); + ASSERT_EQUALS(ErrorCodes::ShutdownInProgress, result); + ASSERT_TRUE(response.obj().isEmpty()); +} + +class PrepareHeartbeatResponseTest : public TopoCoordTest { +public: + virtual void setUp() { + TopoCoordTest::setUp(); + updateConfig(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 10 << "host" + << "hself") + << BSON("_id" << 20 << "host" + << "h2") << BSON("_id" << 30 << "host" + << "h3"))), 0); - ASSERT_FALSE(TopologyCoordinator::Role::candidate == getTopoCoord().getRole()); - ASSERT_EQUALS(MemberState::RS_STARTUP2, getTopoCoord().getMemberState().s); - getTopoCoord().setFollowerMode(MemberState::RS_SECONDARY); - ASSERT_TRUE(TopologyCoordinator::Role::candidate == getTopoCoord().getRole()); - - // win election and primary - getTopoCoord().processWinElection(OID::gen(), OpTime(0,0)); - ASSERT_TRUE(TopologyCoordinator::Role::leader == getTopoCoord().getRole()); - ASSERT_EQUALS(MemberState::RS_PRIMARY, getTopoCoord().getMemberState().s); - - // reconfig to remove self - updateConfig(BSON("_id" << "rs0" << - "version" << 2 << - "members" << BSON_ARRAY( - BSON("_id" << 1 << "host" << "host2:27017") << - BSON("_id" << 2 << "host" << 
"host3:27017"))), - -1); - // should become removed since we are no longer in the set even though we were primary - ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); - ASSERT_EQUALS(MemberState::RS_REMOVED, getTopoCoord().getMemberState().s); - } - - TEST_F(TopoCoordTest, ReconfigCanNoLongerBePrimary) { - ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); - ASSERT_EQUALS(MemberState::RS_STARTUP, getTopoCoord().getMemberState().s); - updateConfig(BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 0 << "host" << "host1:27017"))), - 0); - ASSERT_FALSE(TopologyCoordinator::Role::candidate == getTopoCoord().getRole()); - ASSERT_EQUALS(MemberState::RS_STARTUP2, getTopoCoord().getMemberState().s); - getTopoCoord().setFollowerMode(MemberState::RS_SECONDARY); - ASSERT_TRUE(TopologyCoordinator::Role::candidate == getTopoCoord().getRole()); - - // win election and primary - getTopoCoord().processWinElection(OID::gen(), OpTime(0,0)); - ASSERT_TRUE(TopologyCoordinator::Role::leader == getTopoCoord().getRole()); - ASSERT_EQUALS(MemberState::RS_PRIMARY, getTopoCoord().getMemberState().s); - - // now lose primary due to loss of electability - updateConfig(BSON("_id" << "rs0" << - "version" << 2 << - "members" << BSON_ARRAY( - BSON("_id" << 0 << "host" << "host1:27017" << "priority" << 0) << - BSON("_id" << 1 << "host" << "host2:27017") << - BSON("_id" << 2 << "host" << "host3:27017"))), - 0); - ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); - ASSERT_EQUALS(MemberState::RS_SECONDARY, getTopoCoord().getMemberState().s); - } - - TEST_F(TopoCoordTest, ReconfigContinueToBePrimary) { - ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); - ASSERT_EQUALS(MemberState::RS_STARTUP, getTopoCoord().getMemberState().s); - updateConfig(BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 0 << "host" << "host1:27017"))), - 0); - - ASSERT_FALSE(TopologyCoordinator::Role::candidate == getTopoCoord().getRole()); - ASSERT_EQUALS(MemberState::RS_STARTUP2, getTopoCoord().getMemberState().s); - getTopoCoord().setFollowerMode(MemberState::RS_SECONDARY); - ASSERT_TRUE(TopologyCoordinator::Role::candidate == getTopoCoord().getRole()); - - // win election and primary - getTopoCoord().processWinElection(OID::gen(), OpTime(0,0)); - ASSERT_TRUE(TopologyCoordinator::Role::leader == getTopoCoord().getRole()); - ASSERT_EQUALS(MemberState::RS_PRIMARY, getTopoCoord().getMemberState().s); - - // Now reconfig in ways that leave us electable and ensure we are still the primary. 
- // Add hosts - updateConfig(BSON("_id" << "rs0" << - "version" << 2 << - "members" << BSON_ARRAY( - BSON("_id" << 0 << "host" << "host1:27017") << - BSON("_id" << 1 << "host" << "host2:27017") << - BSON("_id" << 2 << "host" << "host3:27017"))), - 0, - Date_t(-1), - OpTime(10,0)); - ASSERT_TRUE(TopologyCoordinator::Role::leader == getTopoCoord().getRole()); - ASSERT_EQUALS(MemberState::RS_PRIMARY, getTopoCoord().getMemberState().s); - - // Change priorities and tags - updateConfig(BSON("_id" << "rs0" << - "version" << 2 << - "members" << BSON_ARRAY( - BSON("_id" << 0 << "host" << "host1:27017" << "priority" << 10) << - BSON("_id" << 1 << - "host" << "host2:27017" << - "priority" << 5 << - "tags" << BSON("dc" << "NA" << "rack" << "rack1")))), - 0, - Date_t(-1), - OpTime(10,0)); - ASSERT_TRUE(TopologyCoordinator::Role::leader == getTopoCoord().getRole()); - ASSERT_EQUALS(MemberState::RS_PRIMARY, getTopoCoord().getMemberState().s); - } - - TEST_F(TopoCoordTest, ReconfigKeepSecondary) { - updateConfig(BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 1 << "host" << "host1:27017") << - BSON("_id" << 2 << "host" << "host2:27017"))), - 0); - ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); - ASSERT_EQUALS(MemberState::RS_STARTUP2, getTopoCoord().getMemberState().s); setSelfMemberState(MemberState::RS_SECONDARY); - ASSERT_EQUALS(MemberState::RS_SECONDARY, getTopoCoord().getMemberState().s); - - // reconfig and stay secondary - updateConfig(BSON("_id" << "rs0" << - "version" << 2 << - "members" << BSON_ARRAY( - BSON("_id" << 0 << "host" << "host1:27017") << - BSON("_id" << 1 << "host" << "host2:27017") << - BSON("_id" << 2 << "host" << "host3:27017"))), - 0); - ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); - ASSERT_EQUALS(MemberState::RS_SECONDARY, getTopoCoord().getMemberState().s); } - TEST_F(HeartbeatResponseTest, ReconfigBetweenHeartbeatRequestAndRepsonse) { - OpTime election = OpTime(14,0); - OpTime lastOpTimeApplied = OpTime(13,0); - - // all three members up and secondaries - setSelfMemberState(MemberState::RS_SECONDARY); - - HeartbeatResponseAction nextAction = receiveUpHeartbeat(HostAndPort("host3"), - "rs0", - MemberState::RS_PRIMARY, - election, - lastOpTimeApplied, - lastOpTimeApplied); - ASSERT_NO_ACTION(nextAction.getAction()); - - nextAction = receiveUpHeartbeat(HostAndPort("host2"), - "rs0", - MemberState::RS_SECONDARY, - election, - lastOpTimeApplied, - lastOpTimeApplied); - ASSERT_NO_ACTION(nextAction.getAction()); - - // now request from host3 and receive after host2 has been removed via reconfig - getTopoCoord().prepareHeartbeatRequest(now()++, "rs0", HostAndPort("host3")); - - updateConfig(BSON("_id" << "rs0" << - "version" << 2 << - "members" << BSON_ARRAY( - BSON("_id" << 0 << "host" << "host1:27017") << - BSON("_id" << 2 << "host" << "host3:27017"))), - 0); - - ReplSetHeartbeatResponse hb; - hb.initialize(BSON("ok" << 1 << - "v" << 1 << - "state" << MemberState::RS_PRIMARY)); - hb.setOpTime(lastOpTimeApplied); - hb.setElectionTime(election); - StatusWith<ReplSetHeartbeatResponse> hbResponse = StatusWith<ReplSetHeartbeatResponse>(hb); - HeartbeatResponseAction action = getTopoCoord().processHeartbeatResponse(now()++, - Milliseconds(0), - HostAndPort("host3"), - hbResponse, - lastOpTimeApplied); - - // now primary should be host3, index 1, and we should perform NoAction in response - ASSERT_EQUALS(1, getCurrentPrimaryIndex()); - ASSERT_NO_ACTION(action.getAction()); - } - - 
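// The reconfig-during-heartbeat tests above and below turn on one detail: a
// response that arrives after a reconfig is attributed against the *new*
// config, so a member's index can shift (host3 becomes index 1 once host2 is
// dropped) or vanish entirely (index -1 once host3 itself is dropped). A
// minimal standalone model of that lookup, with hypothetical names rather
// than the real TopologyCoordinator API:
#include <cstddef>
#include <string>
#include <vector>

// Returns the index of `host` in the post-reconfig member list, or -1 when
// the host is no longer a member.
inline int memberIndexOf(const std::vector<std::string>& members, const std::string& host) {
    for (std::size_t i = 0; i < members.size(); ++i) {
        if (members[i] == host)
            return static_cast<int>(i);
    }
    return -1;
}
// Usage mirroring the assertions: memberIndexOf({"host1:27017", "host3:27017"},
// "host3:27017") yields 1, while looking up a removed host yields -1, matching
// the asserted getCurrentPrimaryIndex() values in these two tests.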
TEST_F(HeartbeatResponseTest, ReconfigNodeRemovedBetweenHeartbeatRequestAndRepsonse) { - OpTime election = OpTime(14,0); - OpTime lastOpTimeApplied = OpTime(13,0); - - // all three members up and secondaries - setSelfMemberState(MemberState::RS_SECONDARY); - - HeartbeatResponseAction nextAction = receiveUpHeartbeat(HostAndPort("host3"), - "rs0", - MemberState::RS_PRIMARY, - election, - lastOpTimeApplied, - lastOpTimeApplied); - ASSERT_NO_ACTION(nextAction.getAction()); - - nextAction = receiveUpHeartbeat(HostAndPort("host2"), - "rs0", - MemberState::RS_SECONDARY, - election, - lastOpTimeApplied, - lastOpTimeApplied); - ASSERT_NO_ACTION(nextAction.getAction()); - - // now request from host3 and receive after host2 has been removed via reconfig - getTopoCoord().prepareHeartbeatRequest(now()++, "rs0", HostAndPort("host3")); - - updateConfig(BSON("_id" << "rs0" << - "version" << 2 << - "members" << BSON_ARRAY( - BSON("_id" << 0 << "host" << "host1:27017") << - BSON("_id" << 1 << "host" << "host2:27017"))), - 0); - - ReplSetHeartbeatResponse hb; - hb.initialize(BSON("ok" << 1 << - "v" << 1 << - "state" << MemberState::RS_PRIMARY)); - hb.setOpTime(lastOpTimeApplied); - hb.setElectionTime(election); - StatusWith<ReplSetHeartbeatResponse> hbResponse = StatusWith<ReplSetHeartbeatResponse>(hb); - HeartbeatResponseAction action = getTopoCoord().processHeartbeatResponse(now()++, - Milliseconds(0), - HostAndPort("host3"), - hbResponse, - lastOpTimeApplied); - - // primary should not be set and we should perform NoAction in response - ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); - ASSERT_NO_ACTION(action.getAction()); - } - - TEST_F(HeartbeatResponseTest, ShouldChangeSyncSourceMemberNotInConfig) { - // In this test, the TopologyCoordinator should tell us to change sync sources away from - // "host4" since "host4" is absent from the config - ASSERT_TRUE(getTopoCoord().shouldChangeSyncSource(HostAndPort("host4"), now())); - } - - TEST_F(HeartbeatResponseTest, ShouldChangeSyncSourceMemberHasYetToHeartbeat) { - // In this test, the TopologyCoordinator should not tell us to change sync sources away from - // "host2" since we do not yet have a heartbeat (and as a result do not yet have an optime) - // for "host2" - ASSERT_FALSE(getTopoCoord().shouldChangeSyncSource(HostAndPort("host2"), now())); - } - - TEST_F(HeartbeatResponseTest, ShouldChangeSyncSourceFresherHappierMemberExists) { - // In this test, the TopologyCoordinator should tell us to change sync sources away from - // "host2" and to "host3" since "host2" is more than maxSyncSourceLagSecs(30) behind "host3" - OpTime election = OpTime(0,0); - OpTime lastOpTimeApplied = OpTime(4,0); - // ahead by more than maxSyncSourceLagSecs (30) - OpTime fresherLastOpTimeApplied = OpTime(3005,0); - - HeartbeatResponseAction nextAction = receiveUpHeartbeat(HostAndPort("host2"), - "rs0", - MemberState::RS_SECONDARY, - election, - lastOpTimeApplied, - lastOpTimeApplied); - ASSERT_NO_ACTION(nextAction.getAction()); - - nextAction = receiveUpHeartbeat(HostAndPort("host3"), - "rs0", - MemberState::RS_SECONDARY, - election, - fresherLastOpTimeApplied, - lastOpTimeApplied); - ASSERT_NO_ACTION(nextAction.getAction()); - - // set up complete, time for actual check - startCapturingLogMessages(); - ASSERT_TRUE(getTopoCoord().shouldChangeSyncSource(HostAndPort("host2"), now())); - stopCapturingLogMessages(); - ASSERT_EQUALS(1, countLogLinesContaining("changing sync target")); - } - - TEST_F(HeartbeatResponseTest, ShouldChangeSyncSourceFresherMemberIsBlackListed) { - // In this 
test, the TopologyCoordinator should not tell us to change sync sources away from - // "host2" and to "host3" despite "host2" being more than maxSyncSourceLagSecs(30) behind - // "host3", since "host3" is blacklisted - // Then, confirm that unblacklisting only works if time has passed the blacklist time. - OpTime election = OpTime(0,0); - OpTime lastOpTimeApplied = OpTime(400,0); - // ahead by more than maxSyncSourceLagSecs (30) - OpTime fresherLastOpTimeApplied = OpTime(3005,0); - - HeartbeatResponseAction nextAction = receiveUpHeartbeat(HostAndPort("host2"), - "rs0", - MemberState::RS_SECONDARY, - election, - lastOpTimeApplied, - lastOpTimeApplied); - ASSERT_NO_ACTION(nextAction.getAction()); - - nextAction = receiveUpHeartbeat(HostAndPort("host3"), - "rs0", - MemberState::RS_SECONDARY, - election, - fresherLastOpTimeApplied, - lastOpTimeApplied); - ASSERT_NO_ACTION(nextAction.getAction()); - getTopoCoord().blacklistSyncSource(HostAndPort("host3"), now() + 100); - - // set up complete, time for actual check - ASSERT_FALSE(getTopoCoord().shouldChangeSyncSource(HostAndPort("host2"), now())); - - // unblacklist with too early a time (node should remained blacklisted) - getTopoCoord().unblacklistSyncSource(HostAndPort("host3"), now() + 90); - ASSERT_FALSE(getTopoCoord().shouldChangeSyncSource(HostAndPort("host2"), now())); - - // unblacklist and it should succeed - getTopoCoord().unblacklistSyncSource(HostAndPort("host3"), now() + 100); - startCapturingLogMessages(); - ASSERT_TRUE(getTopoCoord().shouldChangeSyncSource(HostAndPort("host2"), now())); - stopCapturingLogMessages(); - ASSERT_EQUALS(1, countLogLinesContaining("changing sync target")); - } - - TEST_F(HeartbeatResponseTest, ShouldChangeSyncSourceFresherMemberIsDown) { - // In this test, the TopologyCoordinator should not tell us to change sync sources away from - // "host2" and to "host3" despite "host2" being more than maxSyncSourceLagSecs(30) behind - // "host3", since "host3" is down - OpTime election = OpTime(0,0); - OpTime lastOpTimeApplied = OpTime(400,0); - // ahead by more than maxSyncSourceLagSecs (30) - OpTime fresherLastOpTimeApplied = OpTime(3005,0); - - HeartbeatResponseAction nextAction = receiveUpHeartbeat(HostAndPort("host2"), - "rs0", - MemberState::RS_SECONDARY, - election, - lastOpTimeApplied, - lastOpTimeApplied); - ASSERT_NO_ACTION(nextAction.getAction()); - - nextAction = receiveUpHeartbeat(HostAndPort("host3"), - "rs0", - MemberState::RS_SECONDARY, - election, - fresherLastOpTimeApplied, - lastOpTimeApplied); - ASSERT_NO_ACTION(nextAction.getAction()); - - // set up complete, time for actual check - nextAction = receiveDownHeartbeat(HostAndPort("host3"), "rs0", lastOpTimeApplied); - ASSERT_NO_ACTION(nextAction.getAction()); - ASSERT_FALSE(getTopoCoord().shouldChangeSyncSource(HostAndPort("host2"), now())); - } - - TEST_F(HeartbeatResponseTest, ShouldChangeSyncSourceFresherMemberIsNotReadable) { - // In this test, the TopologyCoordinator should not tell us to change sync sources away from - // "host2" and to "host3" despite "host2" being more than maxSyncSourceLagSecs(30) behind - // "host3", since "host3" is in a non-readable mode (RS_ROLLBACK) - OpTime election = OpTime(0,0); - OpTime lastOpTimeApplied = OpTime(4,0); - // ahead by more than maxSyncSourceLagSecs (30) - OpTime fresherLastOpTimeApplied = OpTime(3005,0); - - HeartbeatResponseAction nextAction = receiveUpHeartbeat(HostAndPort("host2"), - "rs0", - MemberState::RS_SECONDARY, - election, - lastOpTimeApplied, - lastOpTimeApplied); - 
ASSERT_NO_ACTION(nextAction.getAction()); - - nextAction = receiveUpHeartbeat(HostAndPort("host3"), - "rs0", - MemberState::RS_ROLLBACK, - election, - fresherLastOpTimeApplied, - lastOpTimeApplied); - ASSERT_NO_ACTION(nextAction.getAction()); - - // set up complete, time for actual check - ASSERT_FALSE(getTopoCoord().shouldChangeSyncSource(HostAndPort("host2"), now())); - } - - TEST_F(HeartbeatResponseTest, ShouldChangeSyncSourceFresherMemberDoesNotBuildIndexes) { - // In this test, the TopologyCoordinator should not tell us to change sync sources away from - // "host2" and to "host3" despite "host2" being more than maxSyncSourceLagSecs(30) behind - // "host3", since "host3" does not build indexes - OpTime election = OpTime(0,0); - OpTime lastOpTimeApplied = OpTime(4,0); - // ahead by more than maxSyncSourceLagSecs (30) - OpTime fresherLastOpTimeApplied = OpTime(3005,0); - - updateConfig(BSON("_id" << "rs0" << - "version" << 6 << - "members" << BSON_ARRAY( - BSON("_id" << 0 << "host" << "hself") << - BSON("_id" << 1 << "host" << "host2") << - BSON("_id" << 2 << "host" << "host3" << - "buildIndexes" << false << "priority" << 0))), - 0); - HeartbeatResponseAction nextAction = receiveUpHeartbeat(HostAndPort("host2"), - "rs0", - MemberState::RS_SECONDARY, - election, - lastOpTimeApplied, - lastOpTimeApplied); - ASSERT_NO_ACTION(nextAction.getAction()); - nextAction = receiveUpHeartbeat(HostAndPort("host3"), - "rs0", - MemberState::RS_SECONDARY, - election, - fresherLastOpTimeApplied, - lastOpTimeApplied); - ASSERT_NO_ACTION(nextAction.getAction()); - - // set up complete, time for actual check - ASSERT_FALSE(getTopoCoord().shouldChangeSyncSource(HostAndPort("host2"), now())); - } - - TEST_F(HeartbeatResponseTest, ShouldChangeSyncSourceFresherMemberDoesNotBuildIndexesNorDoWe) { - // In this test, the TopologyCoordinator should tell us to change sync sources away from - // "host2" and to "host3" despite "host3" not building indexes because we do not build - // indexes either and "host2" is more than maxSyncSourceLagSecs(30) behind "host3" - OpTime election = OpTime(0,0); - OpTime lastOpTimeApplied = OpTime(4,0); - // ahead by more than maxSyncSourceLagSecs (30) - OpTime fresherLastOpTimeApplied = OpTime(3005,0); - - updateConfig(BSON("_id" << "rs0" << - "version" << 7 << - "members" << BSON_ARRAY( - BSON("_id" << 0 << "host" << "hself" << - "buildIndexes" << false << "priority" << 0) << - BSON("_id" << 1 << "host" << "host2") << - BSON("_id" << 2 << "host" << "host3" << - "buildIndexes" << false << "priority" << 0))), - 0); - HeartbeatResponseAction nextAction = receiveUpHeartbeat(HostAndPort("host2"), - "rs0", - MemberState::RS_SECONDARY, - election, - lastOpTimeApplied, - lastOpTimeApplied); - ASSERT_NO_ACTION(nextAction.getAction()); - nextAction = receiveUpHeartbeat(HostAndPort("host3"), - "rs0", - MemberState::RS_SECONDARY, - election, - fresherLastOpTimeApplied, - lastOpTimeApplied); - ASSERT_NO_ACTION(nextAction.getAction()); - - // set up complete, time for actual check - startCapturingLogMessages(); - ASSERT_TRUE(getTopoCoord().shouldChangeSyncSource(HostAndPort("host2"), now())); - stopCapturingLogMessages(); - ASSERT_EQUALS(1, countLogLinesContaining("changing sync target")); - } - - TEST_F(TopoCoordTest, CheckShouldStandForElectionWithPrimary) { - updateConfig(BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 10 << "host" << "hself") << - BSON("_id" << 20 << "host" << "h2") << - BSON("_id" << 30 << "host" << "h3"))), - 0); - 
setSelfMemberState(MemberState::RS_SECONDARY); - - heartbeatFromMember(HostAndPort("h2"), "rs0", MemberState::RS_PRIMARY, OpTime(1,0)); - ASSERT_FALSE(getTopoCoord().checkShouldStandForElection(now()++, OpTime(0,0))); - } - - TEST_F(TopoCoordTest, CheckShouldStandForElectionNotCloseEnoughToLastOptime) { - updateConfig(BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 10 << "host" << "hself") << - BSON("_id" << 20 << "host" << "h2") << - BSON("_id" << 30 << "host" << "h3"))), - 0); - setSelfMemberState(MemberState::RS_SECONDARY); - - heartbeatFromMember(HostAndPort("h2"), "rs0", MemberState::RS_SECONDARY, OpTime(10000,0)); - ASSERT_FALSE(getTopoCoord().checkShouldStandForElection(now()++, OpTime(100,0))); - } - - TEST_F(TopoCoordTest, VoteForMyselfFailsWhileNotCandidate) { - updateConfig(BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 10 << "host" << "hself") << - BSON("_id" << 20 << "host" << "h2") << - BSON("_id" << 30 << "host" << "h3"))), - 0); - setSelfMemberState(MemberState::RS_SECONDARY); - ASSERT_FALSE(getTopoCoord().voteForMyself(now()++)); - } - - TEST_F(TopoCoordTest, GetMemberStateArbiter) { - updateConfig(BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 10 << "host" << "hself" << "arbiterOnly" << true) << - BSON("_id" << 20 << "host" << "h2") << - BSON("_id" << 30 << "host" << "h3"))), - 0); - ASSERT_EQUALS(MemberState::RS_ARBITER, getTopoCoord().getMemberState().s); - } - - TEST_F(TopoCoordTest, UnelectableIfAbsentFromConfig) { - logger::globalLogDomain()->setMinimumLoggedSeverity(logger::LogSeverity::Debug(3)); - startCapturingLogMessages(); - ASSERT_FALSE(getTopoCoord().checkShouldStandForElection(now()++, OpTime(10,0))); - stopCapturingLogMessages(); - ASSERT_EQUALS(1, countLogLinesContaining("not a member of a valid replica set config")); - logger::globalLogDomain()->setMinimumLoggedSeverity(logger::LogSeverity::Log()); - } - - TEST_F(TopoCoordTest, UnelectableIfVotedRecently) { - updateConfig(BSON("_id" << "rs0" << - "version" << 1 << - "members" << BSON_ARRAY( - BSON("_id" << 10 << "host" << "hself") << - BSON("_id" << 20 << "host" << "h2") << - BSON("_id" << 30 << "host" << "h3"))), - 0); - setSelfMemberState(MemberState::RS_SECONDARY); - heartbeatFromMember(HostAndPort("h2"), "rs0", MemberState::RS_SECONDARY, OpTime(100,0)); - - // vote for another node - OID remoteRound = OID::gen(); - ReplicationCoordinator::ReplSetElectArgs electArgs; - electArgs.set = "rs0"; - electArgs.round = remoteRound; - electArgs.cfgver = 1; - electArgs.whoid = 20; - - // need to be 30 secs beyond the start of time to pass last vote lease - now() += 30*1000; - BSONObjBuilder electResponseBuilder; - Status result = Status(ErrorCodes::InternalError, "status not set by prepareElectResponse"); - getTopoCoord().prepareElectResponse( - electArgs, now()++, OpTime(100,0), &electResponseBuilder, &result); - BSONObj response = electResponseBuilder.obj(); - ASSERT_OK(result); - std::cout << response; - ASSERT_EQUALS(1, response["vote"].Int()); - ASSERT_EQUALS(remoteRound, response["round"].OID()); - - logger::globalLogDomain()->setMinimumLoggedSeverity(logger::LogSeverity::Debug(3)); - startCapturingLogMessages(); - ASSERT_FALSE(getTopoCoord().checkShouldStandForElection(now()++, OpTime(10,0))); - stopCapturingLogMessages(); - ASSERT_EQUALS(1, countLogLinesContaining("I recently voted for ")); - logger::globalLogDomain()->setMinimumLoggedSeverity(logger::LogSeverity::Log()); + void 
prepareHeartbeatResponse(const ReplSetHeartbeatArgs& args, + OpTime lastOpApplied, + ReplSetHeartbeatResponse* response, + Status* result) { + *result = + getTopoCoord().prepareHeartbeatResponse(now()++, args, "rs0", lastOpApplied, response); } +}; + +TEST_F(PrepareHeartbeatResponseTest, PrepareHeartbeatResponseBadProtocolVersion) { + // set up args with bad protocol version + ReplSetHeartbeatArgs args; + args.setProtocolVersion(3); + ReplSetHeartbeatResponse response; + Status result(ErrorCodes::InternalError, "prepareHeartbeatResponse didn't set result"); + + // prepare response and check the results + prepareHeartbeatResponse(args, OpTime(0, 0), &response, &result); + ASSERT_EQUALS(ErrorCodes::BadValue, result); + ASSERT_EQUALS("replset: incompatible replset protocol version: 3", result.reason()); + ASSERT_EQUALS("", response.getHbMsg()); +} + +TEST_F(PrepareHeartbeatResponseTest, PrepareHeartbeatResponseFromSelf) { + // set up args with a sender ID that matches our own member ID + ReplSetHeartbeatArgs args; + args.setProtocolVersion(1); + args.setSetName("rs0"); + args.setSenderId(10); + ReplSetHeartbeatResponse response; + Status result(ErrorCodes::InternalError, "prepareHeartbeatResponse didn't set result"); + prepareHeartbeatResponse(args, OpTime(0, 0), &response, &result); + ASSERT_EQUALS(ErrorCodes::BadValue, result); + ASSERT(result.reason().find("from member with the same member ID as our self") != + std::string::npos) + << "Actual string was \"" << result.reason() << '"'; + ASSERT_EQUALS("", response.getHbMsg()); +} + +TEST_F(PrepareHeartbeatResponseTest, PrepareHeartbeatResponseBadSetName) { + // set up args with incorrect replset name + ReplSetHeartbeatArgs args; + args.setProtocolVersion(1); + args.setSetName("rs1"); + ReplSetHeartbeatResponse response; + Status result(ErrorCodes::InternalError, "prepareHeartbeatResponse didn't set result"); + + startCapturingLogMessages(); + prepareHeartbeatResponse(args, OpTime(0, 0), &response, &result); + stopCapturingLogMessages(); + ASSERT_EQUALS(ErrorCodes::InconsistentReplicaSetNames, result); + ASSERT(result.reason().find("repl set names do not match") != std::string::npos) + << "Actual string was \"" << result.reason() << '"'; + ASSERT_EQUALS(1, + countLogLinesContaining( + "replSet set names do not match, ours: rs0; remote " + "node's: rs1")); + ASSERT_TRUE(response.isMismatched()); + ASSERT_EQUALS("", response.getHbMsg()); +} + +TEST_F(PrepareHeartbeatResponseTest, PrepareHeartbeatResponseSenderIDMissing) { + // set up args without a senderID + ReplSetHeartbeatArgs args; + args.setProtocolVersion(1); + args.setSetName("rs0"); + args.setConfigVersion(1); + ReplSetHeartbeatResponse response; + Status result(ErrorCodes::InternalError, "prepareHeartbeatResponse didn't set result"); + + // prepare response and check the results + prepareHeartbeatResponse(args, OpTime(0, 0), &response, &result); + ASSERT_OK(result); + ASSERT_FALSE(response.isElectable()); + ASSERT_TRUE(response.isReplSet()); + ASSERT_EQUALS(MemberState::RS_SECONDARY, response.getState().s); + ASSERT_EQUALS(OpTime(0, 0), response.getOpTime()); + ASSERT_EQUALS(Seconds(0).total_milliseconds(), response.getTime().total_milliseconds()); + ASSERT_EQUALS("", response.getHbMsg()); + ASSERT_EQUALS("rs0", response.getReplicaSetName()); + ASSERT_EQUALS(1, response.getVersion()); +} + +TEST_F(PrepareHeartbeatResponseTest, PrepareHeartbeatResponseSenderIDNotInConfig) { + // set up args with a senderID which is not present in our config + ReplSetHeartbeatArgs args; + args.setProtocolVersion(1); + args.setSetName("rs0"); +
args.setConfigVersion(1); + args.setSenderId(2); + ReplSetHeartbeatResponse response; + Status result(ErrorCodes::InternalError, "prepareHeartbeatResponse didn't set result"); + + // prepare response and check the results + prepareHeartbeatResponse(args, OpTime(0, 0), &response, &result); + ASSERT_OK(result); + ASSERT_FALSE(response.isElectable()); + ASSERT_TRUE(response.isReplSet()); + ASSERT_EQUALS(MemberState::RS_SECONDARY, response.getState().s); + ASSERT_EQUALS(OpTime(0, 0), response.getOpTime()); + ASSERT_EQUALS(Seconds(0).total_milliseconds(), response.getTime().total_milliseconds()); + ASSERT_EQUALS("", response.getHbMsg()); + ASSERT_EQUALS("rs0", response.getReplicaSetName()); + ASSERT_EQUALS(1, response.getVersion()); +} + +TEST_F(PrepareHeartbeatResponseTest, PrepareHeartbeatResponseConfigVersionLow) { + // set up args with a config version lower than ours + ReplSetHeartbeatArgs args; + args.setProtocolVersion(1); + args.setConfigVersion(0); + args.setSetName("rs0"); + args.setSenderId(20); + ReplSetHeartbeatResponse response; + Status result(ErrorCodes::InternalError, "prepareHeartbeatResponse didn't set result"); + + // prepare response and check the results + prepareHeartbeatResponse(args, OpTime(0, 0), &response, &result); + ASSERT_OK(result); + ASSERT_TRUE(response.hasConfig()); + ASSERT_FALSE(response.isElectable()); + ASSERT_TRUE(response.isReplSet()); + ASSERT_EQUALS(MemberState::RS_SECONDARY, response.getState().s); + ASSERT_EQUALS(OpTime(0, 0), response.getOpTime()); + ASSERT_EQUALS(Seconds(0).total_milliseconds(), response.getTime().total_milliseconds()); + ASSERT_EQUALS("", response.getHbMsg()); + ASSERT_EQUALS("rs0", response.getReplicaSetName()); + ASSERT_EQUALS(1, response.getVersion()); +} + +TEST_F(PrepareHeartbeatResponseTest, PrepareHeartbeatResponseConfigVersionHigh) { + // set up args with a config version higher than ours + ReplSetHeartbeatArgs args; + args.setProtocolVersion(1); + args.setConfigVersion(10); + args.setSetName("rs0"); + args.setSenderId(20); + ReplSetHeartbeatResponse response; + Status result(ErrorCodes::InternalError, "prepareHeartbeatResponse didn't set result"); + + // prepare response and check the results + prepareHeartbeatResponse(args, OpTime(0, 0), &response, &result); + ASSERT_OK(result); + ASSERT_FALSE(response.hasConfig()); + ASSERT_FALSE(response.isElectable()); + ASSERT_TRUE(response.isReplSet()); + ASSERT_EQUALS(MemberState::RS_SECONDARY, response.getState().s); + ASSERT_EQUALS(OpTime(0, 0), response.getOpTime()); + ASSERT_EQUALS(Seconds(0).total_milliseconds(), response.getTime().total_milliseconds()); + ASSERT_EQUALS("", response.getHbMsg()); + ASSERT_EQUALS("rs0", response.getReplicaSetName()); + ASSERT_EQUALS(1, response.getVersion()); +} + +TEST_F(PrepareHeartbeatResponseTest, PrepareHeartbeatResponseSenderDown) { + // set up args with sender down from our perspective + ReplSetHeartbeatArgs args; + args.setProtocolVersion(1); + args.setConfigVersion(1); + args.setSetName("rs0"); + args.setSenderId(20); + ReplSetHeartbeatResponse response; + Status result(ErrorCodes::InternalError, "prepareHeartbeatResponse didn't set result"); + + // prepare response and check the results + prepareHeartbeatResponse(args, OpTime(0, 0), &response, &result); + ASSERT_OK(result); + ASSERT_FALSE(response.isElectable()); + ASSERT_TRUE(response.isReplSet()); + ASSERT_EQUALS(MemberState::RS_SECONDARY, response.getState().s); + ASSERT_EQUALS(OpTime(0, 0), response.getOpTime()); + ASSERT_EQUALS(Seconds(0).total_milliseconds(), 
response.getTime().total_milliseconds()); + ASSERT_EQUALS("", response.getHbMsg()); + ASSERT_EQUALS("rs0", response.getReplicaSetName()); + ASSERT_EQUALS(1, response.getVersion()); + ASSERT_TRUE(response.isStateDisagreement()); +} + +TEST_F(PrepareHeartbeatResponseTest, PrepareHeartbeatResponseSenderUp) { + // set up args and acknowledge sender + heartbeatFromMember(HostAndPort("h2"), "rs0", MemberState::RS_SECONDARY, OpTime(0, 0)); + ReplSetHeartbeatArgs args; + args.setProtocolVersion(1); + args.setConfigVersion(1); + args.setSetName("rs0"); + args.setSenderId(20); + ReplSetHeartbeatResponse response; + Status result(ErrorCodes::InternalError, "prepareHeartbeatResponse didn't set result"); + + // prepare response and check the results + prepareHeartbeatResponse(args, OpTime(100, 0), &response, &result); + ASSERT_OK(result); + // this changed to true because we can now see a majority, unlike in the previous cases + ASSERT_TRUE(response.isElectable()); + ASSERT_TRUE(response.isReplSet()); + ASSERT_EQUALS(MemberState::RS_SECONDARY, response.getState().s); + ASSERT_EQUALS(OpTime(100, 0), response.getOpTime()); + ASSERT_EQUALS(Seconds(0).total_milliseconds(), response.getTime().total_milliseconds()); + ASSERT_EQUALS("", response.getHbMsg()); + ASSERT_EQUALS("rs0", response.getReplicaSetName()); + ASSERT_EQUALS(1, response.getVersion()); +} + +TEST_F(TopoCoordTest, PrepareHeartbeatResponseNoConfigYet) { + // set up args + ReplSetHeartbeatArgs args; + args.setProtocolVersion(1); + args.setConfigVersion(1); + args.setSetName("rs0"); + args.setSenderId(20); + ReplSetHeartbeatResponse response; + // prepare response and check the results + Status result = + getTopoCoord().prepareHeartbeatResponse(now()++, args, "rs0", OpTime(0, 0), &response); + ASSERT_OK(result); + // not electable because we do not have a config yet + ASSERT_FALSE(response.isElectable()); + ASSERT_TRUE(response.isReplSet()); + ASSERT_EQUALS(MemberState::RS_STARTUP, response.getState().s); + ASSERT_EQUALS(OpTime(0, 0), response.getOpTime()); + ASSERT_EQUALS(Seconds(0).total_milliseconds(), response.getTime().total_milliseconds()); + ASSERT_EQUALS("", response.getHbMsg()); + ASSERT_EQUALS("", response.getReplicaSetName()); + ASSERT_EQUALS(-2, response.getVersion()); +} + +TEST_F(PrepareHeartbeatResponseTest, PrepareHeartbeatResponseAsPrimary) { + makeSelfPrimary(OpTime(10, 0)); + heartbeatFromMember(HostAndPort("h2"), "rs0", MemberState::RS_SECONDARY, OpTime(0, 0)); + + ReplSetHeartbeatArgs args; + args.setProtocolVersion(1); + args.setConfigVersion(1); + args.setSetName("rs0"); + args.setSenderId(20); + ReplSetHeartbeatResponse response; + Status result(ErrorCodes::InternalError, "prepareHeartbeatResponse didn't set result"); + + // prepare response and check the results + prepareHeartbeatResponse(args, OpTime(11, 0), &response, &result); + ASSERT_OK(result); + // electable because we are already primary + ASSERT_TRUE(response.isElectable()); + ASSERT_TRUE(response.isReplSet()); + ASSERT_EQUALS(MemberState::RS_PRIMARY, response.getState().s); + ASSERT_EQUALS(OpTime(11, 0), response.getOpTime()); + ASSERT_EQUALS(OpTime(10, 0), response.getElectionTime()); + ASSERT_EQUALS(Seconds(0).total_milliseconds(), response.getTime().total_milliseconds()); + ASSERT_EQUALS("", response.getHbMsg()); + ASSERT_EQUALS("rs0", response.getReplicaSetName()); + ASSERT_EQUALS(1, response.getVersion()); +} + +TEST_F(PrepareHeartbeatResponseTest, PrepareHeartbeatResponseWithSyncSource) { + // get a
sync source + heartbeatFromMember(HostAndPort("h3"), "rs0", MemberState::RS_SECONDARY, OpTime(0, 0)); + heartbeatFromMember(HostAndPort("h3"), "rs0", MemberState::RS_SECONDARY, OpTime(0, 0)); + heartbeatFromMember(HostAndPort("h2"), "rs0", MemberState::RS_SECONDARY, OpTime(1, 0)); + heartbeatFromMember(HostAndPort("h2"), "rs0", MemberState::RS_SECONDARY, OpTime(1, 0)); + getTopoCoord().chooseNewSyncSource(now()++, OpTime(0, 0)); + + // set up args + ReplSetHeartbeatArgs args; + args.setProtocolVersion(1); + args.setConfigVersion(1); + args.setSetName("rs0"); + args.setSenderId(20); + ReplSetHeartbeatResponse response; + Status result(ErrorCodes::InternalError, "prepareHeartbeatResponse didn't set result"); + + // prepare response and check the results + prepareHeartbeatResponse(args, OpTime(100, 0), &response, &result); + ASSERT_OK(result); + ASSERT_TRUE(response.isElectable()); + ASSERT_TRUE(response.isReplSet()); + ASSERT_EQUALS(MemberState::RS_SECONDARY, response.getState().s); + ASSERT_EQUALS(OpTime(100, 0), response.getOpTime()); + ASSERT_EQUALS(Seconds(0).total_milliseconds(), response.getTime().total_milliseconds()); + // changed to a syncing message because our sync source changed recently + ASSERT_EQUALS("syncing from: h2:27017", response.getHbMsg()); + ASSERT_EQUALS("rs0", response.getReplicaSetName()); + ASSERT_EQUALS(1, response.getVersion()); + ASSERT_EQUALS(HostAndPort("h2").toString(), response.getSyncingTo()); +} + +TEST_F(TopoCoordTest, SetFollowerSecondaryWhenLoneNode) { + ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); + ASSERT_EQUALS(MemberState::RS_STARTUP, getTopoCoord().getMemberState().s); + updateConfig(BSON("_id" + << "rs0" + << "version" << 1 << "members" << BSON_ARRAY(BSON("_id" << 1 << "host" + << "hself"))), + 0); + ASSERT_EQUALS(MemberState::RS_STARTUP2, getTopoCoord().getMemberState().s); + + // if we are the only node, we should become a candidate when we transition to SECONDARY + ASSERT_FALSE(TopologyCoordinator::Role::candidate == getTopoCoord().getRole()); + getTopoCoord().setFollowerMode(MemberState::RS_SECONDARY); + ASSERT_TRUE(TopologyCoordinator::Role::candidate == getTopoCoord().getRole()); + ASSERT_EQUALS(MemberState::RS_SECONDARY, getTopoCoord().getMemberState().s); +} + +TEST_F(TopoCoordTest, CandidateWhenLoneSecondaryNodeReconfig) { + ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); + ASSERT_EQUALS(MemberState::RS_STARTUP, getTopoCoord().getMemberState().s); + ReplicaSetConfig cfg; + cfg.initialize(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "hself" + << "priority" << 0)))); + getTopoCoord().updateConfig(cfg, 0, now()++, OpTime()); + ASSERT_EQUALS(MemberState::RS_STARTUP2, getTopoCoord().getMemberState().s); + + ASSERT_FALSE(TopologyCoordinator::Role::candidate == getTopoCoord().getRole()); + getTopoCoord().setFollowerMode(MemberState::RS_SECONDARY); + ASSERT_FALSE(TopologyCoordinator::Role::candidate == getTopoCoord().getRole()); + ASSERT_EQUALS(MemberState::RS_SECONDARY, getTopoCoord().getMemberState().s); + + // we should become a candidate when we reconfig to become electable + + updateConfig(BSON("_id" + << "rs0" + << "version" << 1 << "members" << BSON_ARRAY(BSON("_id" << 1 << "host" + << "hself"))), + 0); + ASSERT_TRUE(TopologyCoordinator::Role::candidate == getTopoCoord().getRole()); +} + +TEST_F(TopoCoordTest, SetFollowerSecondaryWhenLoneUnelectableNode) { + ASSERT_TRUE(TopologyCoordinator::Role::follower == 
getTopoCoord().getRole()); + ASSERT_EQUALS(MemberState::RS_STARTUP, getTopoCoord().getMemberState().s); + ReplicaSetConfig cfg; + cfg.initialize(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "hself" + << "priority" << 0)))); + + getTopoCoord().updateConfig(cfg, 0, now()++, OpTime()); + ASSERT_EQUALS(MemberState::RS_STARTUP2, getTopoCoord().getMemberState().s); + + // despite being the only node, we are unelectable, so we should not become a candidate + ASSERT_FALSE(TopologyCoordinator::Role::candidate == getTopoCoord().getRole()); + getTopoCoord().setFollowerMode(MemberState::RS_SECONDARY); + ASSERT_FALSE(TopologyCoordinator::Role::candidate == getTopoCoord().getRole()); + ASSERT_EQUALS(MemberState::RS_SECONDARY, getTopoCoord().getMemberState().s); +} + +TEST_F(TopoCoordTest, ReconfigToBeAddedToTheSet) { + ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); + ASSERT_EQUALS(MemberState::RS_STARTUP, getTopoCoord().getMemberState().s); + // config to be absent from the set + updateConfig(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "host2:27017") + << BSON("_id" << 2 << "host" + << "host3:27017"))), + -1); + // should become removed since we are not in the set + ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); + ASSERT_EQUALS(MemberState::RS_REMOVED, getTopoCoord().getMemberState().s); + + // reconfig to add to set + updateConfig( + BSON("_id" + << "rs0" + << "version" << 2 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "host1:27017") + << BSON("_id" << 1 << "host" + << "host2:27017") << BSON("_id" << 2 << "host" + << "host3:27017"))), + 0); + // having been added to the config, we should no longer be REMOVED and should enter STARTUP2 + ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); + ASSERT_EQUALS(MemberState::RS_STARTUP2, getTopoCoord().getMemberState().s); +} + +TEST_F(TopoCoordTest, ReconfigToBeRemovedFromTheSet) { + ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); + ASSERT_EQUALS(MemberState::RS_STARTUP, getTopoCoord().getMemberState().s); + updateConfig( + BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "host1:27017") + << BSON("_id" << 1 << "host" + << "host2:27017") << BSON("_id" << 2 << "host" + << "host3:27017"))), + 0); + ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); + ASSERT_EQUALS(MemberState::RS_STARTUP2, getTopoCoord().getMemberState().s); + + // reconfig to remove self + updateConfig(BSON("_id" + << "rs0" + << "version" << 2 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "host2:27017") + << BSON("_id" << 2 << "host" + << "host3:27017"))), + -1); + // should become removed since we are no longer in the set + ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); + ASSERT_EQUALS(MemberState::RS_REMOVED, getTopoCoord().getMemberState().s); +} + +TEST_F(TopoCoordTest, ReconfigToBeRemovedFromTheSetAsPrimary) { + ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); + ASSERT_EQUALS(MemberState::RS_STARTUP, getTopoCoord().getMemberState().s); + updateConfig(BSON("_id" + << "rs0" + << "version" << 1 << "members" << BSON_ARRAY(BSON("_id" << 0 << "host" + << "host1:27017"))), + 0); + ASSERT_FALSE(TopologyCoordinator::Role::candidate == getTopoCoord().getRole()); + ASSERT_EQUALS(MemberState::RS_STARTUP2, 
getTopoCoord().getMemberState().s); + getTopoCoord().setFollowerMode(MemberState::RS_SECONDARY); + ASSERT_TRUE(TopologyCoordinator::Role::candidate == getTopoCoord().getRole()); + + // win election and primary + getTopoCoord().processWinElection(OID::gen(), OpTime(0, 0)); + ASSERT_TRUE(TopologyCoordinator::Role::leader == getTopoCoord().getRole()); + ASSERT_EQUALS(MemberState::RS_PRIMARY, getTopoCoord().getMemberState().s); + + // reconfig to remove self + updateConfig(BSON("_id" + << "rs0" + << "version" << 2 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "host2:27017") + << BSON("_id" << 2 << "host" + << "host3:27017"))), + -1); + // should become removed since we are no longer in the set even though we were primary + ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); + ASSERT_EQUALS(MemberState::RS_REMOVED, getTopoCoord().getMemberState().s); +} + +TEST_F(TopoCoordTest, ReconfigCanNoLongerBePrimary) { + ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); + ASSERT_EQUALS(MemberState::RS_STARTUP, getTopoCoord().getMemberState().s); + updateConfig(BSON("_id" + << "rs0" + << "version" << 1 << "members" << BSON_ARRAY(BSON("_id" << 0 << "host" + << "host1:27017"))), + 0); + ASSERT_FALSE(TopologyCoordinator::Role::candidate == getTopoCoord().getRole()); + ASSERT_EQUALS(MemberState::RS_STARTUP2, getTopoCoord().getMemberState().s); + getTopoCoord().setFollowerMode(MemberState::RS_SECONDARY); + ASSERT_TRUE(TopologyCoordinator::Role::candidate == getTopoCoord().getRole()); + + // win election and primary + getTopoCoord().processWinElection(OID::gen(), OpTime(0, 0)); + ASSERT_TRUE(TopologyCoordinator::Role::leader == getTopoCoord().getRole()); + ASSERT_EQUALS(MemberState::RS_PRIMARY, getTopoCoord().getMemberState().s); + + // now lose primary due to loss of electability + updateConfig( + BSON("_id" + << "rs0" + << "version" << 2 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "host1:27017" + << "priority" << 0) + << BSON("_id" << 1 << "host" + << "host2:27017") << BSON("_id" << 2 << "host" + << "host3:27017"))), + 0); + ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); + ASSERT_EQUALS(MemberState::RS_SECONDARY, getTopoCoord().getMemberState().s); +} + +TEST_F(TopoCoordTest, ReconfigContinueToBePrimary) { + ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); + ASSERT_EQUALS(MemberState::RS_STARTUP, getTopoCoord().getMemberState().s); + updateConfig(BSON("_id" + << "rs0" + << "version" << 1 << "members" << BSON_ARRAY(BSON("_id" << 0 << "host" + << "host1:27017"))), + 0); + + ASSERT_FALSE(TopologyCoordinator::Role::candidate == getTopoCoord().getRole()); + ASSERT_EQUALS(MemberState::RS_STARTUP2, getTopoCoord().getMemberState().s); + getTopoCoord().setFollowerMode(MemberState::RS_SECONDARY); + ASSERT_TRUE(TopologyCoordinator::Role::candidate == getTopoCoord().getRole()); + + // win election and primary + getTopoCoord().processWinElection(OID::gen(), OpTime(0, 0)); + ASSERT_TRUE(TopologyCoordinator::Role::leader == getTopoCoord().getRole()); + ASSERT_EQUALS(MemberState::RS_PRIMARY, getTopoCoord().getMemberState().s); + + // Now reconfig in ways that leave us electable and ensure we are still the primary. 
+ // Add hosts + updateConfig( + BSON("_id" + << "rs0" + << "version" << 2 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "host1:27017") + << BSON("_id" << 1 << "host" + << "host2:27017") << BSON("_id" << 2 << "host" + << "host3:27017"))), + 0, + Date_t(-1), + OpTime(10, 0)); + ASSERT_TRUE(TopologyCoordinator::Role::leader == getTopoCoord().getRole()); + ASSERT_EQUALS(MemberState::RS_PRIMARY, getTopoCoord().getMemberState().s); + + // Change priorities and tags + updateConfig( + BSON("_id" + << "rs0" + << "version" << 2 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "host1:27017" + << "priority" << 10) + << BSON("_id" << 1 << "host" + << "host2:27017" + << "priority" << 5 << "tags" << BSON("dc" + << "NA" + << "rack" + << "rack1")))), + 0, + Date_t(-1), + OpTime(10, 0)); + ASSERT_TRUE(TopologyCoordinator::Role::leader == getTopoCoord().getRole()); + ASSERT_EQUALS(MemberState::RS_PRIMARY, getTopoCoord().getMemberState().s); +} + +TEST_F(TopoCoordTest, ReconfigKeepSecondary) { + updateConfig(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 1 << "host" + << "host1:27017") + << BSON("_id" << 2 << "host" + << "host2:27017"))), + 0); + ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); + ASSERT_EQUALS(MemberState::RS_STARTUP2, getTopoCoord().getMemberState().s); + setSelfMemberState(MemberState::RS_SECONDARY); + ASSERT_EQUALS(MemberState::RS_SECONDARY, getTopoCoord().getMemberState().s); + + // reconfig and stay secondary + updateConfig( + BSON("_id" + << "rs0" + << "version" << 2 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "host1:27017") + << BSON("_id" << 1 << "host" + << "host2:27017") << BSON("_id" << 2 << "host" + << "host3:27017"))), + 0); + ASSERT_TRUE(TopologyCoordinator::Role::follower == getTopoCoord().getRole()); + ASSERT_EQUALS(MemberState::RS_SECONDARY, getTopoCoord().getMemberState().s); +} + +TEST_F(HeartbeatResponseTest, ReconfigBetweenHeartbeatRequestAndRepsonse) { + OpTime election = OpTime(14, 0); + OpTime lastOpTimeApplied = OpTime(13, 0); + + // all three members up and secondaries + setSelfMemberState(MemberState::RS_SECONDARY); + + HeartbeatResponseAction nextAction = receiveUpHeartbeat(HostAndPort("host3"), + "rs0", + MemberState::RS_PRIMARY, + election, + lastOpTimeApplied, + lastOpTimeApplied); + ASSERT_NO_ACTION(nextAction.getAction()); + + nextAction = receiveUpHeartbeat(HostAndPort("host2"), + "rs0", + MemberState::RS_SECONDARY, + election, + lastOpTimeApplied, + lastOpTimeApplied); + ASSERT_NO_ACTION(nextAction.getAction()); + + // now request from host3 and receive after host2 has been removed via reconfig + getTopoCoord().prepareHeartbeatRequest(now()++, "rs0", HostAndPort("host3")); + + updateConfig(BSON("_id" + << "rs0" + << "version" << 2 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "host1:27017") + << BSON("_id" << 2 << "host" + << "host3:27017"))), + 0); + + ReplSetHeartbeatResponse hb; + hb.initialize(BSON("ok" << 1 << "v" << 1 << "state" << MemberState::RS_PRIMARY)); + hb.setOpTime(lastOpTimeApplied); + hb.setElectionTime(election); + StatusWith<ReplSetHeartbeatResponse> hbResponse = StatusWith<ReplSetHeartbeatResponse>(hb); + HeartbeatResponseAction action = getTopoCoord().processHeartbeatResponse( + now()++, Milliseconds(0), HostAndPort("host3"), hbResponse, lastOpTimeApplied); + + // now primary should be host3, index 1, and we should perform NoAction in response + ASSERT_EQUALS(1, getCurrentPrimaryIndex()); + 
ASSERT_NO_ACTION(action.getAction()); +} + +TEST_F(HeartbeatResponseTest, ReconfigNodeRemovedBetweenHeartbeatRequestAndRepsonse) { + OpTime election = OpTime(14, 0); + OpTime lastOpTimeApplied = OpTime(13, 0); + + // all three members up and secondaries + setSelfMemberState(MemberState::RS_SECONDARY); + + HeartbeatResponseAction nextAction = receiveUpHeartbeat(HostAndPort("host3"), + "rs0", + MemberState::RS_PRIMARY, + election, + lastOpTimeApplied, + lastOpTimeApplied); + ASSERT_NO_ACTION(nextAction.getAction()); + + nextAction = receiveUpHeartbeat(HostAndPort("host2"), + "rs0", + MemberState::RS_SECONDARY, + election, + lastOpTimeApplied, + lastOpTimeApplied); + ASSERT_NO_ACTION(nextAction.getAction()); + + // now request from host3 and receive after host3 has been removed via reconfig + getTopoCoord().prepareHeartbeatRequest(now()++, "rs0", HostAndPort("host3")); + + updateConfig(BSON("_id" + << "rs0" + << "version" << 2 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "host1:27017") + << BSON("_id" << 1 << "host" + << "host2:27017"))), + 0); + + ReplSetHeartbeatResponse hb; + hb.initialize(BSON("ok" << 1 << "v" << 1 << "state" << MemberState::RS_PRIMARY)); + hb.setOpTime(lastOpTimeApplied); + hb.setElectionTime(election); + StatusWith<ReplSetHeartbeatResponse> hbResponse = StatusWith<ReplSetHeartbeatResponse>(hb); + HeartbeatResponseAction action = getTopoCoord().processHeartbeatResponse( + now()++, Milliseconds(0), HostAndPort("host3"), hbResponse, lastOpTimeApplied); + + // primary should not be set and we should perform NoAction in response + ASSERT_EQUALS(-1, getCurrentPrimaryIndex()); + ASSERT_NO_ACTION(action.getAction()); +} + +TEST_F(HeartbeatResponseTest, ShouldChangeSyncSourceMemberNotInConfig) { + // In this test, the TopologyCoordinator should tell us to change sync sources away from + // "host4" since "host4" is absent from the config + ASSERT_TRUE(getTopoCoord().shouldChangeSyncSource(HostAndPort("host4"), now())); +} + +TEST_F(HeartbeatResponseTest, ShouldChangeSyncSourceMemberHasYetToHeartbeat) { + // In this test, the TopologyCoordinator should not tell us to change sync sources away from + // "host2" since we do not yet have a heartbeat (and as a result do not yet have an optime) + // for "host2" + ASSERT_FALSE(getTopoCoord().shouldChangeSyncSource(HostAndPort("host2"), now())); +} + +TEST_F(HeartbeatResponseTest, ShouldChangeSyncSourceFresherHappierMemberExists) { + // In this test, the TopologyCoordinator should tell us to change sync sources away from + // "host2" and to "host3" since "host2" is more than maxSyncSourceLagSecs(30) behind "host3" + OpTime election = OpTime(0, 0); + OpTime lastOpTimeApplied = OpTime(4, 0); + // ahead by more than maxSyncSourceLagSecs (30) + OpTime fresherLastOpTimeApplied = OpTime(3005, 0); + + HeartbeatResponseAction nextAction = receiveUpHeartbeat(HostAndPort("host2"), + "rs0", + MemberState::RS_SECONDARY, + election, + lastOpTimeApplied, + lastOpTimeApplied); + ASSERT_NO_ACTION(nextAction.getAction()); + + nextAction = receiveUpHeartbeat(HostAndPort("host3"), + "rs0", + MemberState::RS_SECONDARY, + election, + fresherLastOpTimeApplied, + lastOpTimeApplied); + ASSERT_NO_ACTION(nextAction.getAction()); + + // set up complete, time for actual check + startCapturingLogMessages(); + ASSERT_TRUE(getTopoCoord().shouldChangeSyncSource(HostAndPort("host2"), now())); + stopCapturingLogMessages(); + ASSERT_EQUALS(1, countLogLinesContaining("changing sync target")); +} + +TEST_F(HeartbeatResponseTest,
ShouldChangeSyncSourceFresherMemberIsBlackListed) { + // In this test, the TopologyCoordinator should not tell us to change sync sources away from + // "host2" and to "host3" despite "host2" being more than maxSyncSourceLagSecs(30) behind + // "host3", since "host3" is blacklisted + // Then, confirm that unblacklisting only works if time has passed the blacklist time. + OpTime election = OpTime(0, 0); + OpTime lastOpTimeApplied = OpTime(400, 0); + // ahead by more than maxSyncSourceLagSecs (30) + OpTime fresherLastOpTimeApplied = OpTime(3005, 0); + + HeartbeatResponseAction nextAction = receiveUpHeartbeat(HostAndPort("host2"), + "rs0", + MemberState::RS_SECONDARY, + election, + lastOpTimeApplied, + lastOpTimeApplied); + ASSERT_NO_ACTION(nextAction.getAction()); + + nextAction = receiveUpHeartbeat(HostAndPort("host3"), + "rs0", + MemberState::RS_SECONDARY, + election, + fresherLastOpTimeApplied, + lastOpTimeApplied); + ASSERT_NO_ACTION(nextAction.getAction()); + getTopoCoord().blacklistSyncSource(HostAndPort("host3"), now() + 100); + + // set up complete, time for actual check + ASSERT_FALSE(getTopoCoord().shouldChangeSyncSource(HostAndPort("host2"), now())); + + // unblacklist with too early a time (node should remain blacklisted) + getTopoCoord().unblacklistSyncSource(HostAndPort("host3"), now() + 90); + ASSERT_FALSE(getTopoCoord().shouldChangeSyncSource(HostAndPort("host2"), now())); + + // unblacklist and it should succeed + getTopoCoord().unblacklistSyncSource(HostAndPort("host3"), now() + 100); + startCapturingLogMessages(); + ASSERT_TRUE(getTopoCoord().shouldChangeSyncSource(HostAndPort("host2"), now())); + stopCapturingLogMessages(); + ASSERT_EQUALS(1, countLogLinesContaining("changing sync target")); +} + +TEST_F(HeartbeatResponseTest, ShouldChangeSyncSourceFresherMemberIsDown) { + // In this test, the TopologyCoordinator should not tell us to change sync sources away from + // "host2" and to "host3" despite "host2" being more than maxSyncSourceLagSecs(30) behind + // "host3", since "host3" is down + OpTime election = OpTime(0, 0); + OpTime lastOpTimeApplied = OpTime(400, 0); + // ahead by more than maxSyncSourceLagSecs (30) + OpTime fresherLastOpTimeApplied = OpTime(3005, 0); + + HeartbeatResponseAction nextAction = receiveUpHeartbeat(HostAndPort("host2"), + "rs0", + MemberState::RS_SECONDARY, + election, + lastOpTimeApplied, + lastOpTimeApplied); + ASSERT_NO_ACTION(nextAction.getAction()); + + nextAction = receiveUpHeartbeat(HostAndPort("host3"), + "rs0", + MemberState::RS_SECONDARY, + election, + fresherLastOpTimeApplied, + lastOpTimeApplied); + ASSERT_NO_ACTION(nextAction.getAction()); + + // set up complete, time for actual check + nextAction = receiveDownHeartbeat(HostAndPort("host3"), "rs0", lastOpTimeApplied); + ASSERT_NO_ACTION(nextAction.getAction()); + ASSERT_FALSE(getTopoCoord().shouldChangeSyncSource(HostAndPort("host2"), now())); +} + +TEST_F(HeartbeatResponseTest, ShouldChangeSyncSourceFresherMemberIsNotReadable) { + // In this test, the TopologyCoordinator should not tell us to change sync sources away from + // "host2" and to "host3" despite "host2" being more than maxSyncSourceLagSecs(30) behind + // "host3", since "host3" is in a non-readable mode (RS_ROLLBACK) + OpTime election = OpTime(0, 0); + OpTime lastOpTimeApplied = OpTime(4, 0); + // ahead by more than maxSyncSourceLagSecs (30) + OpTime fresherLastOpTimeApplied = OpTime(3005, 0); + + HeartbeatResponseAction nextAction = receiveUpHeartbeat(HostAndPort("host2"), + "rs0", + MemberState::RS_SECONDARY, +
election, + lastOpTimeApplied, + lastOpTimeApplied); + ASSERT_NO_ACTION(nextAction.getAction()); + + nextAction = receiveUpHeartbeat(HostAndPort("host3"), + "rs0", + MemberState::RS_ROLLBACK, + election, + fresherLastOpTimeApplied, + lastOpTimeApplied); + ASSERT_NO_ACTION(nextAction.getAction()); + + // set up complete, time for actual check + ASSERT_FALSE(getTopoCoord().shouldChangeSyncSource(HostAndPort("host2"), now())); +} + +TEST_F(HeartbeatResponseTest, ShouldChangeSyncSourceFresherMemberDoesNotBuildIndexes) { + // In this test, the TopologyCoordinator should not tell us to change sync sources away from + // "host2" and to "host3" despite "host2" being more than maxSyncSourceLagSecs(30) behind + // "host3", since "host3" does not build indexes + OpTime election = OpTime(0, 0); + OpTime lastOpTimeApplied = OpTime(4, 0); + // ahead by more than maxSyncSourceLagSecs (30) + OpTime fresherLastOpTimeApplied = OpTime(3005, 0); + + updateConfig(BSON("_id" + << "rs0" + << "version" << 6 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "hself") + << BSON("_id" << 1 << "host" + << "host2") + << BSON("_id" << 2 << "host" + << "host3" + << "buildIndexes" << false << "priority" << 0))), + 0); + HeartbeatResponseAction nextAction = receiveUpHeartbeat(HostAndPort("host2"), + "rs0", + MemberState::RS_SECONDARY, + election, + lastOpTimeApplied, + lastOpTimeApplied); + ASSERT_NO_ACTION(nextAction.getAction()); + nextAction = receiveUpHeartbeat(HostAndPort("host3"), + "rs0", + MemberState::RS_SECONDARY, + election, + fresherLastOpTimeApplied, + lastOpTimeApplied); + ASSERT_NO_ACTION(nextAction.getAction()); + + // set up complete, time for actual check + ASSERT_FALSE(getTopoCoord().shouldChangeSyncSource(HostAndPort("host2"), now())); +} + +TEST_F(HeartbeatResponseTest, ShouldChangeSyncSourceFresherMemberDoesNotBuildIndexesNorDoWe) { + // In this test, the TopologyCoordinator should tell us to change sync sources away from + // "host2" and to "host3" despite "host3" not building indexes because we do not build + // indexes either and "host2" is more than maxSyncSourceLagSecs(30) behind "host3" + OpTime election = OpTime(0, 0); + OpTime lastOpTimeApplied = OpTime(4, 0); + // ahead by more than maxSyncSourceLagSecs (30) + OpTime fresherLastOpTimeApplied = OpTime(3005, 0); + + updateConfig(BSON("_id" + << "rs0" + << "version" << 7 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "hself" + << "buildIndexes" << false << "priority" << 0) + << BSON("_id" << 1 << "host" + << "host2") + << BSON("_id" << 2 << "host" + << "host3" + << "buildIndexes" << false << "priority" << 0))), + 0); + HeartbeatResponseAction nextAction = receiveUpHeartbeat(HostAndPort("host2"), + "rs0", + MemberState::RS_SECONDARY, + election, + lastOpTimeApplied, + lastOpTimeApplied); + ASSERT_NO_ACTION(nextAction.getAction()); + nextAction = receiveUpHeartbeat(HostAndPort("host3"), + "rs0", + MemberState::RS_SECONDARY, + election, + fresherLastOpTimeApplied, + lastOpTimeApplied); + ASSERT_NO_ACTION(nextAction.getAction()); + + // set up complete, time for actual check + startCapturingLogMessages(); + ASSERT_TRUE(getTopoCoord().shouldChangeSyncSource(HostAndPort("host2"), now())); + stopCapturingLogMessages(); + ASSERT_EQUALS(1, countLogLinesContaining("changing sync target")); +} + +TEST_F(TopoCoordTest, CheckShouldStandForElectionWithPrimary) { + updateConfig(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 10 << "host" + << "hself") + << BSON("_id" << 20 << "host" + << "h2") << 
BSON("_id" << 30 << "host" + << "h3"))), + 0); + setSelfMemberState(MemberState::RS_SECONDARY); + + heartbeatFromMember(HostAndPort("h2"), "rs0", MemberState::RS_PRIMARY, OpTime(1, 0)); + ASSERT_FALSE(getTopoCoord().checkShouldStandForElection(now()++, OpTime(0, 0))); +} + +TEST_F(TopoCoordTest, CheckShouldStandForElectionNotCloseEnoughToLastOptime) { + updateConfig(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 10 << "host" + << "hself") + << BSON("_id" << 20 << "host" + << "h2") << BSON("_id" << 30 << "host" + << "h3"))), + 0); + setSelfMemberState(MemberState::RS_SECONDARY); + + heartbeatFromMember(HostAndPort("h2"), "rs0", MemberState::RS_SECONDARY, OpTime(10000, 0)); + ASSERT_FALSE(getTopoCoord().checkShouldStandForElection(now()++, OpTime(100, 0))); +} + +TEST_F(TopoCoordTest, VoteForMyselfFailsWhileNotCandidate) { + updateConfig(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 10 << "host" + << "hself") + << BSON("_id" << 20 << "host" + << "h2") << BSON("_id" << 30 << "host" + << "h3"))), + 0); + setSelfMemberState(MemberState::RS_SECONDARY); + ASSERT_FALSE(getTopoCoord().voteForMyself(now()++)); +} + +TEST_F(TopoCoordTest, GetMemberStateArbiter) { + updateConfig(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 10 << "host" + << "hself" + << "arbiterOnly" << true) + << BSON("_id" << 20 << "host" + << "h2") << BSON("_id" << 30 << "host" + << "h3"))), + 0); + ASSERT_EQUALS(MemberState::RS_ARBITER, getTopoCoord().getMemberState().s); +} + +TEST_F(TopoCoordTest, UnelectableIfAbsentFromConfig) { + logger::globalLogDomain()->setMinimumLoggedSeverity(logger::LogSeverity::Debug(3)); + startCapturingLogMessages(); + ASSERT_FALSE(getTopoCoord().checkShouldStandForElection(now()++, OpTime(10, 0))); + stopCapturingLogMessages(); + ASSERT_EQUALS(1, countLogLinesContaining("not a member of a valid replica set config")); + logger::globalLogDomain()->setMinimumLoggedSeverity(logger::LogSeverity::Log()); +} + +TEST_F(TopoCoordTest, UnelectableIfVotedRecently) { + updateConfig(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 10 << "host" + << "hself") + << BSON("_id" << 20 << "host" + << "h2") << BSON("_id" << 30 << "host" + << "h3"))), + 0); + setSelfMemberState(MemberState::RS_SECONDARY); + heartbeatFromMember(HostAndPort("h2"), "rs0", MemberState::RS_SECONDARY, OpTime(100, 0)); + + // vote for another node + OID remoteRound = OID::gen(); + ReplicationCoordinator::ReplSetElectArgs electArgs; + electArgs.set = "rs0"; + electArgs.round = remoteRound; + electArgs.cfgver = 1; + electArgs.whoid = 20; + + // need to be 30 secs beyond the start of time to pass last vote lease + now() += 30 * 1000; + BSONObjBuilder electResponseBuilder; + Status result = Status(ErrorCodes::InternalError, "status not set by prepareElectResponse"); + getTopoCoord().prepareElectResponse( + electArgs, now()++, OpTime(100, 0), &electResponseBuilder, &result); + BSONObj response = electResponseBuilder.obj(); + ASSERT_OK(result); + std::cout << response; + ASSERT_EQUALS(1, response["vote"].Int()); + ASSERT_EQUALS(remoteRound, response["round"].OID()); + + logger::globalLogDomain()->setMinimumLoggedSeverity(logger::LogSeverity::Debug(3)); + startCapturingLogMessages(); + ASSERT_FALSE(getTopoCoord().checkShouldStandForElection(now()++, OpTime(10, 0))); + stopCapturingLogMessages(); + ASSERT_EQUALS(1, countLogLinesContaining("I recently voted for ")); + 
logger::globalLogDomain()->setMinimumLoggedSeverity(logger::LogSeverity::Log()); +} } // namespace } // namespace repl diff --git a/src/mongo/db/repl/update_position_args.cpp b/src/mongo/db/repl/update_position_args.cpp index 78b08bfc483..3cf98b33173 100644 --- a/src/mongo/db/repl/update_position_args.cpp +++ b/src/mongo/db/repl/update_position_args.cpp @@ -39,108 +39,105 @@ namespace mongo { namespace repl { - UpdatePositionArgs::UpdateInfo::UpdateInfo( - const OID& anRid, const OpTime& aTs, long long aCfgver, long long aMemberId) - : rid(anRid), ts(aTs), cfgver(aCfgver), memberId(aMemberId) {} +UpdatePositionArgs::UpdateInfo::UpdateInfo(const OID& anRid, + const OpTime& aTs, + long long aCfgver, + long long aMemberId) + : rid(anRid), ts(aTs), cfgver(aCfgver), memberId(aMemberId) {} namespace { - const std::string kCommandFieldName = "replSetUpdatePosition"; - const std::string kUpdateArrayFieldName = "optimes"; - - const std::string kLegalUpdatePositionFieldNames[] = { - kCommandFieldName, - kUpdateArrayFieldName, - }; - - const std::string kMemberRIDFieldName = "_id"; - const std::string kMemberConfigFieldName = "config"; - const std::string kOpTimeFieldName = "optime"; - const std::string kMemberIdFieldName = "memberId"; - const std::string kConfigVersionFieldName = "cfgver"; - - const std::string kLegalUpdateInfoFieldNames[] = { - kMemberConfigFieldName, - kMemberRIDFieldName, - kOpTimeFieldName, - kMemberIdFieldName, - kConfigVersionFieldName, - }; - -} // namespace +const std::string kCommandFieldName = "replSetUpdatePosition"; +const std::string kUpdateArrayFieldName = "optimes"; + +const std::string kLegalUpdatePositionFieldNames[] = { + kCommandFieldName, kUpdateArrayFieldName, +}; + +const std::string kMemberRIDFieldName = "_id"; +const std::string kMemberConfigFieldName = "config"; +const std::string kOpTimeFieldName = "optime"; +const std::string kMemberIdFieldName = "memberId"; +const std::string kConfigVersionFieldName = "cfgver"; + +const std::string kLegalUpdateInfoFieldNames[] = { + kMemberConfigFieldName, + kMemberRIDFieldName, + kOpTimeFieldName, + kMemberIdFieldName, + kConfigVersionFieldName, +}; + +} // namespace + +Status UpdatePositionArgs::initialize(const BSONObj& argsObj) { + Status status = + bsonCheckOnlyHasFields("UpdatePositionArgs", argsObj, kLegalUpdatePositionFieldNames); + + if (!status.isOK()) + return status; + + // grab the array of changes + BSONElement updateArray; + status = bsonExtractTypedField(argsObj, kUpdateArrayFieldName, Array, &updateArray); + if (!status.isOK()) + return status; + + // now parse each array entry into an update + BSONObjIterator i(updateArray.Obj()); + while (i.more()) { + BSONObj entry = i.next().Obj(); + status = bsonCheckOnlyHasFields("UpdateInfoArgs", entry, kLegalUpdateInfoFieldNames); + if (!status.isOK()) + return status; - OpTime ts; + status = bsonExtractOpTimeField(entry, kOpTimeFieldName, &ts); + if (!status.isOK()) + return status; + // TODO(spencer): The following three fields are optional in 3.0, but should be made + // required or ignored in a later release + long long cfgver; + status = bsonExtractIntegerFieldWithDefault(entry, kConfigVersionFieldName, -1, &cfgver); if (!status.isOK()) return status; - // grab the array of changes - BSONElement updateArray; - status = bsonExtractTypedField(argsObj, kUpdateArrayFieldName, Array, &updateArray); + OID
rid; + status = bsonExtractOIDFieldWithDefault(entry, kMemberRIDFieldName, OID(), &rid); + if (!status.isOK()) + return status; - // now parse each array entry into an update - BSONObjIterator i(updateArray.Obj()); - while(i.more()) { - BSONObj entry = i.next().Obj(); - status = bsonCheckOnlyHasFields("UpdateInfoArgs", - entry, - kLegalUpdateInfoFieldNames); - if (!status.isOK()) - return status; - - OpTime ts; - status = bsonExtractOpTimeField(entry, kOpTimeFieldName, &ts); - if (!status.isOK()) - return status; - - // TODO(spencer): The following three fields are optional in 3.0, but should be made - // required or ignored in a later release - long long cfgver; - status = bsonExtractIntegerFieldWithDefault(entry, kConfigVersionFieldName, -1, &cfgver); - if (!status.isOK()) - return status; - - OID rid; - status = bsonExtractOIDFieldWithDefault(entry, kMemberRIDFieldName, OID(), &rid); - if (!status.isOK()) - return status; - - long long memberID; - status = bsonExtractIntegerFieldWithDefault(entry, kMemberIdFieldName, -1, &memberID); - if (!status.isOK()) - return status; - - _updates.push_back(UpdateInfo(rid, ts, cfgver, memberID)); - } + long long memberID; + status = bsonExtractIntegerFieldWithDefault(entry, kMemberIdFieldName, -1, &memberID); + if (!status.isOK()) + return status; - return Status::OK(); + _updates.push_back(UpdateInfo(rid, ts, cfgver, memberID)); } - BSONObj UpdatePositionArgs::toBSON() const { - BSONObjBuilder builder; - // add command name - builder.append(kCommandFieldName, 1); - - // build array of updates - if (!_updates.empty()) { - BSONArrayBuilder updateArray(builder.subarrayStart(kUpdateArrayFieldName)); - for (UpdatePositionArgs::UpdateIterator update = updatesBegin(); - update != updatesEnd(); - ++update) { - updateArray.append(BSON(kMemberRIDFieldName << update->rid << - kOpTimeFieldName << update->ts << - kConfigVersionFieldName << update->cfgver << - kMemberIdFieldName << update->memberId)); - } - updateArray.doneFast(); + return Status::OK(); +} + +BSONObj UpdatePositionArgs::toBSON() const { + BSONObjBuilder builder; + // add command name + builder.append(kCommandFieldName, 1); + + // build array of updates + if (!_updates.empty()) { + BSONArrayBuilder updateArray(builder.subarrayStart(kUpdateArrayFieldName)); + for (UpdatePositionArgs::UpdateIterator update = updatesBegin(); update != updatesEnd(); + ++update) { + updateArray.append(BSON(kMemberRIDFieldName << update->rid << kOpTimeFieldName + << update->ts << kConfigVersionFieldName + << update->cfgver << kMemberIdFieldName + << update->memberId)); } - return builder.obj(); + updateArray.doneFast(); } + return builder.obj(); +} } // namespace repl } // namespace mongo diff --git a/src/mongo/db/repl/update_position_args.h b/src/mongo/db/repl/update_position_args.h index 9d9afebced2..a30b5b8029f 100644 --- a/src/mongo/db/repl/update_position_args.h +++ b/src/mongo/db/repl/update_position_args.h @@ -34,49 +34,54 @@ namespace mongo { - class Status; +class Status; namespace repl { - /** - * Arguments to the replSetUpdatePosition command. - */ - class UpdatePositionArgs { - public: - struct UpdateInfo { - UpdateInfo(const OID& anRid, const OpTime& aTs, long long aCfgver, long long aMemberId); +/** + * Arguments to the replSetUpdatePosition command.
+ */ +class UpdatePositionArgs { +public: + struct UpdateInfo { + UpdateInfo(const OID& anRid, const OpTime& aTs, long long aCfgver, long long aMemberId); - OID rid; - OpTime ts; - long long cfgver; - long long memberId; - }; + OID rid; + OpTime ts; + long long cfgver; + long long memberId; + }; - typedef std::vector<UpdateInfo>::const_iterator UpdateIterator; + typedef std::vector<UpdateInfo>::const_iterator UpdateIterator; - /** - * Initializes this UpdatePositionArgs from the contents of "argsObj". - */ - Status initialize(const BSONObj& argsObj); + /** + * Initializes this UpdatePositionArgs from the contents of "argsObj". + */ + Status initialize(const BSONObj& argsObj); - /** - * Gets a begin iterator over the UpdateInfos stored in this UpdatePositionArgs. - */ - UpdateIterator updatesBegin() const { return _updates.begin(); } + /** + * Gets a begin iterator over the UpdateInfos stored in this UpdatePositionArgs. + */ + UpdateIterator updatesBegin() const { + return _updates.begin(); + } - /** - * Gets an end iterator over the UpdateInfos stored in this UpdatePositionArgs. - */ - UpdateIterator updatesEnd() const { return _updates.end(); } + /** + * Gets an end iterator over the UpdateInfos stored in this UpdatePositionArgs. + */ + UpdateIterator updatesEnd() const { + return _updates.end(); + } - /** - * Returns a BSONified version of the object. - * _updates is only included if it is not empty. - */ - BSONObj toBSON() const; - private: - std::vector<UpdateInfo> _updates; - }; + /** + * Returns a BSONified version of the object. + * _updates is only included if it is not empty. + */ + BSONObj toBSON() const; + +private: + std::vector<UpdateInfo> _updates; +}; -} // namespace repl -} // namespace mongo +} // namespace repl +} // namespace mongo
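The PrepareHeartbeatResponse* tests in the topology-coordinator file above pin down the argument checks in prepareHeartbeatResponse: the protocol version, the sender ID against our own member ID, the set name, and the config version, with a copy of our config attached whenever the sender's version is lower than ours. A minimal caller sketch of that contract, not production code; it uses only calls that appear in this diff, and `topoCoord`, `now`, and `lastOpApplied` are assumed to be in scope with illustrative values:

    // Sketch of one heartbeat exchange as exercised by the tests above.
    ReplSetHeartbeatArgs args;
    args.setProtocolVersion(1);  // anything else is rejected with ErrorCodes::BadValue
    args.setSetName("rs0");      // a mismatch yields InconsistentReplicaSetNames
    args.setConfigVersion(0);    // lower than ours, so the response will carry our config
    args.setSenderId(20);        // must not equal our own member ID

    ReplSetHeartbeatResponse response;
    Status result =
        topoCoord.prepareHeartbeatResponse(now, args, "rs0", lastOpApplied, &response);
    if (result.isOK() && response.hasConfig()) {
        // The sender's config was stale; it can install the copy carried in 'response'.
    }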
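ShouldChangeSyncSourceFresherMemberIsBlackListed also fixes the unblacklist contract, which the test interleaves with heartbeat setup: unblacklistSyncSource is a no-op until the time it is handed reaches the expiry that was passed to blacklistSyncSource. The contract in isolation, assuming the same test fixture (getTopoCoord(), now(), and the ASSERT macros) as the surrounding file:

    // host3 is blacklisted as a sync source until now() + 100.
    getTopoCoord().blacklistSyncSource(HostAndPort("host3"), now() + 100);
    // Too early: host3 stays blacklisted, so no sync-source change is suggested.
    getTopoCoord().unblacklistSyncSource(HostAndPort("host3"), now() + 90);
    ASSERT_FALSE(getTopoCoord().shouldChangeSyncSource(HostAndPort("host2"), now()));
    // At (or past) the blacklist expiry, the unblacklist takes effect.
    getTopoCoord().unblacklistSyncSource(HostAndPort("host3"), now() + 100);
    ASSERT_TRUE(getTopoCoord().shouldChangeSyncSource(HostAndPort("host2"), now()));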
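On the update_position_args side, the field-name constants in update_position_args.cpp spell out the wire format that initialize() parses and toBSON() re-emits: { replSetUpdatePosition: 1, optimes: [ { _id, optime, cfgver, memberId } ] }. A round-trip sketch under that reading; it uses only calls appearing in this diff (plus OID::gen(), used in the test file above), and the concrete values are illustrative:

    // Build a command document in the shape implied by the field-name constants.
    BSONObj cmd = BSON("replSetUpdatePosition"
                       << 1 << "optimes"
                       << BSON_ARRAY(BSON("_id" << OID::gen() << "optime" << OpTime(100, 1)
                                          << "cfgver" << 2 << "memberId" << 1)));

    UpdatePositionArgs args;
    Status status = args.initialize(cmd);  // validates field names, then walks "optimes"
    if (status.isOK()) {
        for (UpdatePositionArgs::UpdateIterator it = args.updatesBegin();
             it != args.updatesEnd();
             ++it) {
            // it->rid, it->ts, it->cfgver and it->memberId carry one member's position.
        }
        BSONObj roundTripped = args.toBSON();  // rebuilds the same document shape
    }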