diff options
author | matt dannenberg <matt.dannenberg@10gen.com> | 2016-01-05 10:29:01 -0500 |
---|---|---|
committer | Scott Hernandez <scotthernandez@tart.local> | 2016-02-24 09:55:46 -0500 |
commit | 7bc59dac4f46e8f59786130262fb1dfea68fb605 (patch) | |
tree | 0ad5c45f99655a7cc60abc6162993ec1eab0070d /src | |
parent | b5a76e83860d0cff964af4989d798f19ffce4aae (diff) | |
download | mongo-7bc59dac4f46e8f59786130262fb1dfea68fb605.tar.gz |
SERVER-22276 SERVER-22277 implement "j" flag in write concern apply to secondary as well as primary
(cherry picked from commit 2c2e6a38f559f25559c2b24eff51511c6fbc4a5b)
Diffstat (limited to 'src')
65 files changed, 2365 insertions, 698 deletions
diff --git a/src/mongo/db/commands/cleanup_orphaned_cmd.cpp b/src/mongo/db/commands/cleanup_orphaned_cmd.cpp index e1980b9a0b1..6eafbe5a695 100644 --- a/src/mongo/db/commands/cleanup_orphaned_cmd.cpp +++ b/src/mongo/db/commands/cleanup_orphaned_cmd.cpp @@ -58,7 +58,7 @@ namespace { const int kDefaultWTimeoutMs = 60 * 1000; const WriteConcernOptions DefaultWriteConcern(WriteConcernOptions::kMajority, - WriteConcernOptions::NONE, + WriteConcernOptions::SyncMode::UNSET, kDefaultWTimeoutMs); enum CleanupResult { CleanupResult_Done, CleanupResult_Continue, CleanupResult_Error }; diff --git a/src/mongo/db/commands/write_commands/batch_executor.cpp b/src/mongo/db/commands/write_commands/batch_executor.cpp index 34eb38a13f2..146bfd7e87f 100644 --- a/src/mongo/db/commands/write_commands/batch_executor.cpp +++ b/src/mongo/db/commands/write_commands/batch_executor.cpp @@ -293,7 +293,8 @@ void WriteBatchExecutor::executeBatch(const BatchedCommandRequest& request, const WriteConcernOptions& writeConcern = _txn->getWriteConcern(); bool silentWC = writeConcern.wMode.empty() && writeConcern.wNumNodes == 0 && - writeConcern.syncMode == WriteConcernOptions::NONE; + (writeConcern.syncMode == WriteConcernOptions::SyncMode::NONE || + writeConcern.syncMode == WriteConcernOptions::SyncMode::UNSET); Timer commandTimer; diff --git a/src/mongo/db/query/find_and_modify_request_test.cpp b/src/mongo/db/query/find_and_modify_request_test.cpp index 27490715e02..761b344a96f 100644 --- a/src/mongo/db/query/find_and_modify_request_test.cpp +++ b/src/mongo/db/query/find_and_modify_request_test.cpp @@ -154,7 +154,7 @@ TEST(FindAndModifyRequest, UpdateWithSort) { TEST(FindAndModifyRequest, UpdateWithWriteConcern) { const BSONObj query(BSON("x" << 1)); const BSONObj update(BSON("y" << 1)); - const WriteConcernOptions writeConcern(2, WriteConcernOptions::FSYNC, 150); + const WriteConcernOptions writeConcern(2, WriteConcernOptions::SyncMode::FSYNC, 150); auto request = FindAndModifyRequest::makeUpdate(NamespaceString("test.user"), query, update); request.setWriteConcern(writeConcern); @@ -174,7 +174,7 @@ TEST(FindAndModifyRequest, UpdateWithFullSpec) { const BSONObj update(BSON("y" << 1)); const BSONObj sort(BSON("z" << -1)); const BSONObj field(BSON("x" << 1 << "y" << 1)); - const WriteConcernOptions writeConcern(2, WriteConcernOptions::FSYNC, 150); + const WriteConcernOptions writeConcern(2, WriteConcernOptions::SyncMode::FSYNC, 150); auto request = FindAndModifyRequest::makeUpdate(NamespaceString("test.user"), query, update); request.setFieldProjection(field); @@ -246,7 +246,7 @@ TEST(FindAndModifyRequest, RemoveWithSort) { TEST(FindAndModifyRequest, RemoveWithWriteConcern) { const BSONObj query(BSON("x" << 1)); - const WriteConcernOptions writeConcern(2, WriteConcernOptions::FSYNC, 150); + const WriteConcernOptions writeConcern(2, WriteConcernOptions::SyncMode::FSYNC, 150); auto request = FindAndModifyRequest::makeRemove(NamespaceString("test.user"), query); request.setWriteConcern(writeConcern); @@ -265,7 +265,7 @@ TEST(FindAndModifyRequest, RemoveWithFullSpec) { const BSONObj query(BSON("x" << 1)); const BSONObj sort(BSON("z" << -1)); const BSONObj field(BSON("x" << 1 << "y" << 1)); - const WriteConcernOptions writeConcern(2, WriteConcernOptions::FSYNC, 150); + const WriteConcernOptions writeConcern(2, WriteConcernOptions::SyncMode::FSYNC, 150); auto request = FindAndModifyRequest::makeRemove(NamespaceString("test.user"), query); request.setFieldProjection(field); diff --git a/src/mongo/db/range_deleter.cpp b/src/mongo/db/range_deleter.cpp index 7a4aaf878be..768cbd791b8 100644 --- a/src/mongo/db/range_deleter.cpp +++ b/src/mongo/db/range_deleter.cpp @@ -259,7 +259,7 @@ const int kWTimeoutMillis = 60 * 60 * 1000; bool _waitForMajority(OperationContext* txn, std::string* errMsg) { const WriteConcernOptions writeConcern( - WriteConcernOptions::kMajority, WriteConcernOptions::NONE, kWTimeoutMillis); + WriteConcernOptions::kMajority, WriteConcernOptions::SyncMode::UNSET, kWTimeoutMillis); repl::ReplicationCoordinator::StatusAndDuration replStatus = repl::getGlobalReplicationCoordinator()->awaitReplicationOfLastOpForClient(txn, diff --git a/src/mongo/db/repl/SConscript b/src/mongo/db/repl/SConscript index 778192b8567..5d2d8f1fe06 100644 --- a/src/mongo/db/repl/SConscript +++ b/src/mongo/db/repl/SConscript @@ -446,6 +446,7 @@ env.Library('replica_set_messages', 'handshake_args.cpp', 'is_master_response.cpp', 'member_config.cpp', + 'old_update_position_args.cpp', 'read_concern_response.cpp', 'repl_set_declare_election_winner_args.cpp', 'repl_set_heartbeat_args.cpp', diff --git a/src/mongo/db/repl/bgsync.cpp b/src/mongo/db/repl/bgsync.cpp index 13859b4deea..0e5e4cbb882 100644 --- a/src/mongo/db/repl/bgsync.cpp +++ b/src/mongo/db/repl/bgsync.cpp @@ -238,7 +238,7 @@ void BackgroundSync::_producerThread() { } // We need to wait until initial sync has started. - if (_replCoord->getMyLastOptime().isNull()) { + if (_replCoord->getMyLastAppliedOpTime().isNull()) { sleepsecs(1); return; } @@ -406,11 +406,11 @@ void BackgroundSync::_produce(OperationContext* txn) { log() << "Starting rollback due to " << fetcherReturnStatus; // Wait till all buffered oplog entries have drained and been applied. - auto lastApplied = _replCoord->getMyLastOptime(); - if (lastApplied != _lastOpTimeFetched) { + auto lastApplied = _replCoord->getMyLastAppliedOpTime(); + if (lastApplied != lastOpTimeFetched) { log() << "Waiting for all operations from " << lastApplied << " until " - << _lastOpTimeFetched << " to be applied before starting rollback."; - while (_lastOpTimeFetched > (lastApplied = _replCoord->getMyLastOptime())) { + << lastOpTimeFetched << " to be applied before starting rollback."; + while (lastOpTimeFetched > (lastApplied = _replCoord->getMyLastAppliedOpTime())) { sleepmillis(10); if (isStopped() || inShutdown()) { return; @@ -730,7 +730,7 @@ void BackgroundSync::start(OperationContext* txn) { _stopped = false; // reset _last fields with current oplog data - _lastOpTimeFetched = _replCoord->getMyLastOptime(); + _lastOpTimeFetched = _replCoord->getMyLastAppliedOpTime(); _lastFetchedHash = lastFetchedHash; LOG(1) << "bgsync fetch queue set to: " << _lastOpTimeFetched << " " << _lastFetchedHash; diff --git a/src/mongo/db/repl/data_replicator.cpp b/src/mongo/db/repl/data_replicator.cpp index ef296dd486a..aed481df509 100644 --- a/src/mongo/db/repl/data_replicator.cpp +++ b/src/mongo/db/repl/data_replicator.cpp @@ -525,7 +525,7 @@ DataReplicator::DataReplicator(DataReplicatorOptions opts, ReplicationExecutor* uassert(ErrorCodes::BadValue, "invalid rollback function", _opts.rollbackFn); uassert(ErrorCodes::BadValue, "invalid replSetUpdatePosition command object creation function", - _opts.prepareReplSetUpdatePositionCommandFn); + _opts.prepareOldReplSetUpdatePositionCommandFn); uassert(ErrorCodes::BadValue, "invalid getMyLastOptime function", _opts.getMyLastOptime); uassert(ErrorCodes::BadValue, "invalid setMyLastOptime function", _opts.setMyLastOptime); uassert(ErrorCodes::BadValue, "invalid setFollowerMode function", _opts.setFollowerMode); @@ -664,9 +664,9 @@ TimestampStatus DataReplicator::flushAndPause() { return TimestampStatus(_lastTimestampApplied); } -void DataReplicator::_resetState_inlock(Timestamp lastAppliedOptime) { +void DataReplicator::_resetState_inlock(Timestamp lastAppliedOpTime) { invariant(!_anyActiveHandles_inlock()); - _lastTimestampApplied = _lastTimestampFetched = lastAppliedOptime; + _lastTimestampApplied = _lastTimestampFetched = lastAppliedOpTime; _oplogBuffer.clear(); } @@ -1020,7 +1020,7 @@ void DataReplicator::_doNextActions_Steady_inlock() { if (!_reporterPaused && (!_reporter || !_reporter->getStatus().isOK())) { // TODO get reporter in good shape _reporter.reset( - new Reporter(_exec, _opts.prepareReplSetUpdatePositionCommandFn, _syncSource)); + new Reporter(_exec, _opts.prepareOldReplSetUpdatePositionCommandFn, _syncSource)); } } diff --git a/src/mongo/db/repl/data_replicator.h b/src/mongo/db/repl/data_replicator.h index bd3fd86101e..992aeb71d58 100644 --- a/src/mongo/db/repl/data_replicator.h +++ b/src/mongo/db/repl/data_replicator.h @@ -127,7 +127,7 @@ struct DataReplicatorOptions { Applier::ApplyOperationFn applierFn; RollbackFn rollbackFn; - Reporter::PrepareReplSetUpdatePositionCommandFn prepareReplSetUpdatePositionCommandFn; + Reporter::PrepareReplSetUpdatePositionCommandFn prepareOldReplSetUpdatePositionCommandFn; GetMyLastOptimeFn getMyLastOptime; SetMyLastOptimeFn setMyLastOptime; SetFollowerModeFn setFollowerMode; @@ -205,7 +205,7 @@ public: // For testing only - void _resetState_inlock(Timestamp lastAppliedOptime); + void _resetState_inlock(Timestamp lastAppliedOpTime); void _setInitialSyncStorageInterface(CollectionCloner::StorageInterface* si); private: diff --git a/src/mongo/db/repl/data_replicator_test.cpp b/src/mongo/db/repl/data_replicator_test.cpp index adff8d96782..046e73956df 100644 --- a/src/mongo/db/repl/data_replicator_test.cpp +++ b/src/mongo/db/repl/data_replicator_test.cpp @@ -179,7 +179,7 @@ protected: return _rollbackFn(txn, lastOpTimeWritten, syncSource); }; - options.prepareReplSetUpdatePositionCommandFn = + options.prepareOldReplSetUpdatePositionCommandFn = []() -> StatusWith<BSONObj> { return BSON("replSetUpdatePosition" << 1); }; options.getMyLastOptime = [this]() { return _myLastOpTime; }; options.setMyLastOptime = [this](const OpTime& opTime) { _setMyLastOptime(opTime); }; diff --git a/src/mongo/db/repl/initial_sync.cpp b/src/mongo/db/repl/initial_sync.cpp index e0ca82a6ea0..09553e3d93e 100644 --- a/src/mongo/db/repl/initial_sync.cpp +++ b/src/mongo/db/repl/initial_sync.cpp @@ -112,7 +112,7 @@ void InitialSync::_applyOplogUntil(OperationContext* txn, const OpTime& endOpTim const OpTime lastOpTime = multiApply(txn, ops); - replCoord->setMyLastOptime(lastOpTime); + replCoord->setMyLastAppliedOpTime(lastOpTime); setNewTimestamp(lastOpTime.getTimestamp()); if (inShutdown()) { @@ -122,8 +122,7 @@ void InitialSync::_applyOplogUntil(OperationContext* txn, const OpTime& endOpTim // if the last op applied was our end, return if (lastOpTime == endOpTime) { LOG(1) << "SyncTail applied " << entriesApplied << " entries (" << bytesApplied - << " bytes)" - << " and finished at opTime " << endOpTime; + << " bytes) and finished at opTime " << endOpTime; return; } } // end of while (true) diff --git a/src/mongo/db/repl/member_heartbeat_data.cpp b/src/mongo/db/repl/member_heartbeat_data.cpp index df6e5b0912d..c267a6ba8ed 100644 --- a/src/mongo/db/repl/member_heartbeat_data.cpp +++ b/src/mongo/db/repl/member_heartbeat_data.cpp @@ -42,7 +42,7 @@ namespace repl { MemberHeartbeatData::MemberHeartbeatData() : _health(-1), _authIssue(false) { _lastResponse.setState(MemberState::RS_UNKNOWN); _lastResponse.setElectionTime(Timestamp()); - _lastResponse.setOpTime(OpTime()); + _lastResponse.setAppliedOpTime(OpTime()); } void MemberHeartbeatData::setUpValues(Date_t now, @@ -60,10 +60,9 @@ void MemberHeartbeatData::setUpValues(Date_t now, if (!hbResponse.hasElectionTime()) { hbResponse.setElectionTime(_lastResponse.getElectionTime()); } - if (!hbResponse.hasOpTime()) { - hbResponse.setOpTime(_lastResponse.getOpTime()); + if (!hbResponse.hasAppliedOpTime()) { + hbResponse.setAppliedOpTime(_lastResponse.getAppliedOpTime()); } - // Log if the state changes if (_lastResponse.getState() != hbResponse.getState()) { log() << "Member " << host.toString() << " is now in state " @@ -82,7 +81,7 @@ void MemberHeartbeatData::setDownValues(Date_t now, const std::string& heartbeat _lastResponse = ReplSetHeartbeatResponse(); _lastResponse.setState(MemberState::RS_DOWN); _lastResponse.setElectionTime(Timestamp()); - _lastResponse.setOpTime(OpTime()); + _lastResponse.setAppliedOpTime(OpTime()); _lastResponse.setHbMsg(heartbeatMessage); _lastResponse.setSyncingTo(HostAndPort()); } @@ -96,7 +95,7 @@ void MemberHeartbeatData::setAuthIssue(Date_t now) { _lastResponse = ReplSetHeartbeatResponse(); _lastResponse.setState(MemberState::RS_UNKNOWN); _lastResponse.setElectionTime(Timestamp()); - _lastResponse.setOpTime(OpTime()); + _lastResponse.setAppliedOpTime(OpTime()); _lastResponse.setHbMsg(""); _lastResponse.setSyncingTo(HostAndPort()); } diff --git a/src/mongo/db/repl/member_heartbeat_data.h b/src/mongo/db/repl/member_heartbeat_data.h index d5b87a3767e..e64dcb4ef67 100644 --- a/src/mongo/db/repl/member_heartbeat_data.h +++ b/src/mongo/db/repl/member_heartbeat_data.h @@ -68,8 +68,8 @@ public: const HostAndPort& getSyncSource() const { return _lastResponse.getSyncingTo(); } - OpTime getOpTime() const { - return _lastResponse.getOpTime(); + OpTime getAppliedOpTime() const { + return _lastResponse.getAppliedOpTime(); } int getConfigVersion() const { return _lastResponse.getConfigVersion(); diff --git a/src/mongo/db/repl/minvalid.cpp b/src/mongo/db/repl/minvalid.cpp index 990d6224e50..90753cff0f4 100644 --- a/src/mongo/db/repl/minvalid.cpp +++ b/src/mongo/db/repl/minvalid.cpp @@ -39,6 +39,7 @@ #include "mongo/db/operation_context.h" #include "mongo/db/operation_context_impl.h" #include "mongo/db/repl/oplog.h" +#include "mongo/db/repl/replication_coordinator_global.h" #include "mongo/util/assert_util.h" #include "mongo/util/log.h" @@ -62,7 +63,10 @@ void clearInitialSyncFlag(OperationContext* txn) { } MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "clearInitialSyncFlags", minvalidNS); + auto replCoord = repl::ReplicationCoordinator::get(txn); + OpTime time = replCoord->getMyLastAppliedOpTime(); txn->recoveryUnit()->waitUntilDurable(); + replCoord->setMyLastDurableOpTime(time); LOG(3) << "clearing initial sync flag"; } diff --git a/src/mongo/db/repl/old_update_position_args.cpp b/src/mongo/db/repl/old_update_position_args.cpp new file mode 100644 index 00000000000..1a01a1fa3e8 --- /dev/null +++ b/src/mongo/db/repl/old_update_position_args.cpp @@ -0,0 +1,154 @@ +/** + * Copyright 2014 MongoDB Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include "mongo/platform/basic.h" + +#include "mongo/db/repl/old_update_position_args.h" + +#include "mongo/base/status.h" +#include "mongo/bson/util/bson_check.h" +#include "mongo/bson/util/bson_extract.h" +#include "mongo/db/jsobj.h" + +namespace mongo { +namespace repl { + + +OldUpdatePositionArgs::UpdateInfo::UpdateInfo(const OID& anRid, + const OpTime& aTs, + long long aCfgver, + long long aMemberId) + : rid(anRid), ts(aTs), cfgver(aCfgver), memberId(aMemberId) {} + +namespace { + +const std::string kCommandFieldName = "replSetUpdatePosition"; +const std::string kUpdateArrayFieldName = "optimes"; + +const std::string kLegalUpdatePositionFieldNames[] = { + kCommandFieldName, kUpdateArrayFieldName, +}; + +const std::string kMemberRIDFieldName = "_id"; +const std::string kMemberConfigFieldName = "config"; +const std::string kOpTimeFieldName = "optime"; +const std::string kMemberIdFieldName = "memberId"; +const std::string kConfigVersionFieldName = "cfgver"; + +const std::string kLegalUpdateInfoFieldNames[] = { + kMemberConfigFieldName, + kMemberRIDFieldName, + kOpTimeFieldName, + kMemberIdFieldName, + kConfigVersionFieldName, +}; + +} // namespace + +Status OldUpdatePositionArgs::initialize(const BSONObj& argsObj) { + Status status = + bsonCheckOnlyHasFields("OldUpdatePositionArgs", argsObj, kLegalUpdatePositionFieldNames); + + if (!status.isOK()) + return status; + + // grab the array of changes + BSONElement updateArray; + status = bsonExtractTypedField(argsObj, kUpdateArrayFieldName, Array, &updateArray); + if (!status.isOK()) + return status; + + // now parse each array entry into an update + BSONObjIterator i(updateArray.Obj()); + while (i.more()) { + BSONObj entry = i.next().Obj(); + status = bsonCheckOnlyHasFields("UpdateInfoArgs", entry, kLegalUpdateInfoFieldNames); + if (!status.isOK()) + return status; + + OpTime opTime; + if (entry[kOpTimeFieldName].isABSONObj()) { + // In protocol version 1, { ts: <timestamp>, t: term } + Status status = bsonExtractOpTimeField(entry, kOpTimeFieldName, &opTime); + if (!status.isOK()) + return status; + } else { + Timestamp ts; + status = bsonExtractTimestampField(entry, kOpTimeFieldName, &ts); + if (!status.isOK()) + return status; + opTime = OpTime(ts, OpTime::kUninitializedTerm); + } + if (!status.isOK()) + return status; + + // TODO(spencer): The following three fields are optional in 3.0, but should be made + // required or ignored in 3.0 + long long cfgver; + status = bsonExtractIntegerFieldWithDefault(entry, kConfigVersionFieldName, -1, &cfgver); + if (!status.isOK()) + return status; + + OID rid; + status = bsonExtractOIDFieldWithDefault(entry, kMemberRIDFieldName, OID(), &rid); + if (!status.isOK()) + return status; + + long long memberID; + status = bsonExtractIntegerFieldWithDefault(entry, kMemberIdFieldName, -1, &memberID); + if (!status.isOK()) + return status; + + _updates.push_back(UpdateInfo(rid, opTime, cfgver, memberID)); + } + + return Status::OK(); +} + +BSONObj OldUpdatePositionArgs::toBSON() const { + BSONObjBuilder builder; + // add command name + builder.append(kCommandFieldName, 1); + + // build array of updates + if (!_updates.empty()) { + BSONArrayBuilder updateArray(builder.subarrayStart(kUpdateArrayFieldName)); + for (OldUpdatePositionArgs::UpdateIterator update = updatesBegin(); update != updatesEnd(); + ++update) { + updateArray.append(BSON(kMemberRIDFieldName << update->rid << kOpTimeFieldName + << update->ts.getTimestamp() + << kConfigVersionFieldName << update->cfgver + << kMemberIdFieldName << update->memberId)); + } + updateArray.doneFast(); + } + return builder.obj(); +} + +} // namespace repl +} // namespace mongo diff --git a/src/mongo/db/repl/old_update_position_args.h b/src/mongo/db/repl/old_update_position_args.h new file mode 100644 index 00000000000..fa9d1a3ef90 --- /dev/null +++ b/src/mongo/db/repl/old_update_position_args.h @@ -0,0 +1,88 @@ +/** + * Copyright (C) 2014 MongoDB Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#pragma once + +#include <vector> + +#include "mongo/db/jsobj.h" +#include "mongo/db/repl/optime.h" + +namespace mongo { + +class Status; + +namespace repl { + +/** + * Arguments to the handshake command. + */ +class OldUpdatePositionArgs { +public: + struct UpdateInfo { + UpdateInfo(const OID& anRid, const OpTime& aTs, long long aCfgver, long long aMemberId); + + OID rid; + OpTime ts; + long long cfgver; + long long memberId; + }; + + typedef std::vector<UpdateInfo>::const_iterator UpdateIterator; + + /** + * Initializes this OldUpdatePositionArgs from the contents of "argsObj". + */ + Status initialize(const BSONObj& argsObj); + + /** + * Gets a begin iterator over the UpdateInfos stored in this OldUpdatePositionArgs. + */ + UpdateIterator updatesBegin() const { + return _updates.begin(); + } + + /** + * Gets an end iterator over the UpdateInfos stored in this OldUpdatePositionArgs. + */ + UpdateIterator updatesEnd() const { + return _updates.end(); + } + + /** + * Returns a BSONified version of the object. + * _updates is only included if it is not empty. + */ + BSONObj toBSON() const; + +private: + std::vector<UpdateInfo> _updates; +}; + +} // namespace repl +} // namespace mongo diff --git a/src/mongo/db/repl/oplog.cpp b/src/mongo/db/repl/oplog.cpp index d5ff327d03c..f4c2f136965 100644 --- a/src/mongo/db/repl/oplog.cpp +++ b/src/mongo/db/repl/oplog.cpp @@ -218,7 +218,7 @@ public: : _newOpTime(newOpTime), _replCoord(replCoord) {} virtual void commit() { - _replCoord->setMyLastOptimeForward(_newOpTime); + _replCoord->setMyLastAppliedOpTimeForward(_newOpTime); } virtual void rollback() {} @@ -464,7 +464,7 @@ OpTime writeOpsToOplog(OperationContext* txn, const std::vector<BSONObj>& ops) { OpTime lastOptime; MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN { - lastOptime = replCoord->getMyLastOptime(); + lastOptime = replCoord->getMyLastAppliedOpTime(); invariant(!ops.empty()); ScopedTransaction transaction(txn, MODE_IX); Lock::DBLock lk(txn->lockState(), "local", MODE_X); diff --git a/src/mongo/db/repl/repl_client_info.cpp b/src/mongo/db/repl/repl_client_info.cpp index b339c10b4f0..678938f2755 100644 --- a/src/mongo/db/repl/repl_client_info.cpp +++ b/src/mongo/db/repl/repl_client_info.cpp @@ -46,7 +46,7 @@ const Client::Decoration<ReplClientInfo> ReplClientInfo::forClient = void ReplClientInfo::setLastOpToSystemLastOpTime(OperationContext* txn) { ReplicationCoordinator* replCoord = repl::ReplicationCoordinator::get(txn->getServiceContext()); if (replCoord->isReplEnabled() && txn->writesAreReplicated()) { - setLastOp(replCoord->getMyLastOptime()); + setLastOp(replCoord->getMyLastAppliedOpTime()); } } diff --git a/src/mongo/db/repl/repl_set_heartbeat_response.cpp b/src/mongo/db/repl/repl_set_heartbeat_response.cpp index 8ccb6241950..8c06dc7bd7d 100644 --- a/src/mongo/db/repl/repl_set_heartbeat_response.cpp +++ b/src/mongo/db/repl/repl_set_heartbeat_response.cpp @@ -58,7 +58,8 @@ const std::string kIsReplSetFieldName = "rs"; const std::string kMemberStateFieldName = "state"; const std::string kMismatchFieldName = "mismatch"; const std::string kOkFieldName = "ok"; -const std::string kOpTimeFieldName = "opTime"; +const std::string kDurableOpTimeFieldName = "durableOpTime"; +const std::string kAppliedOpTimeFieldName = "opTime"; const std::string kPrimaryIdFieldName = "primaryId"; const std::string kReplSetFieldName = "set"; const std::string kSyncSourceFieldName = "syncingTo"; @@ -117,12 +118,15 @@ void ReplSetHeartbeatResponse::addToBSON(BSONObjBuilder* builder, bool isProtoco if (_primaryIdSet) { builder->append(kPrimaryIdFieldName, _primaryId); } - if (_opTimeSet) { + if (_durableOpTimeSet) { + _durableOpTime.append(builder, kDurableOpTimeFieldName); + } + if (_appliedOpTimeSet) { if (isProtocolVersionV1) { - _opTime.append(builder, kOpTimeFieldName); + _appliedOpTime.append(builder, kAppliedOpTimeFieldName); } else { - builder->appendDate(kOpTimeFieldName, - Date_t::fromMillisSinceEpoch(_opTime.getTimestamp().asLL())); + builder->appendDate(kAppliedOpTimeFieldName, + Date_t::fromMillisSinceEpoch(_appliedOpTime.getTimestamp().asLL())); } } } @@ -209,30 +213,39 @@ Status ReplSetHeartbeatResponse::initialize(const BSONObj& doc, long long term) return termStatus; } + Status status = bsonExtractOpTimeField(doc, kDurableOpTimeFieldName, &_durableOpTime); + if (!status.isOK()) { + if (status != ErrorCodes::NoSuchKey) { + return status; + } + } else { + _durableOpTimeSet = true; + } + // In order to support both the 3.0(V0) and 3.2(V1) heartbeats we must parse the OpTime // field based on its type. If it is a Date, we parse it as the timestamp and use // initialize's term argument to complete the OpTime type. If it is an Object, then it's // V1 and we construct an OpTime out of its nested fields. - const BSONElement opTimeElement = doc[kOpTimeFieldName]; - if (opTimeElement.eoo()) { - _opTimeSet = false; - } else if (opTimeElement.type() == bsonTimestamp) { - _opTimeSet = true; - _opTime = OpTime(opTimeElement.timestamp(), term); - } else if (opTimeElement.type() == Date) { - _opTimeSet = true; - _opTime = OpTime(Timestamp(opTimeElement.date()), term); - } else if (opTimeElement.type() == Object) { - Status status = bsonExtractOpTimeField(doc, kOpTimeFieldName, &_opTime); - _opTimeSet = true; + const BSONElement appliedOpTimeElement = doc[kAppliedOpTimeFieldName]; + if (appliedOpTimeElement.eoo()) { + _appliedOpTimeSet = false; + } else if (appliedOpTimeElement.type() == bsonTimestamp) { + _appliedOpTimeSet = true; + _appliedOpTime = OpTime(appliedOpTimeElement.timestamp(), term); + } else if (appliedOpTimeElement.type() == Date) { + _appliedOpTimeSet = true; + _appliedOpTime = OpTime(Timestamp(appliedOpTimeElement.date()), term); + } else if (appliedOpTimeElement.type() == Object) { + Status status = bsonExtractOpTimeField(doc, kAppliedOpTimeFieldName, &_appliedOpTime); + _appliedOpTimeSet = true; // since a v1 OpTime was in the response, the member must be part of a replset _isReplSet = true; } else { return Status(ErrorCodes::TypeMismatch, - str::stream() << "Expected \"" << kOpTimeFieldName + str::stream() << "Expected \"" << kAppliedOpTimeFieldName << "\" field in response to replSetHeartbeat " "command to have type Date or Timestamp, but found type " - << typeName(opTimeElement.type())); + << typeName(appliedOpTimeElement.type())); } const BSONElement electableElement = doc[kIsElectableFieldName]; @@ -274,7 +287,7 @@ Status ReplSetHeartbeatResponse::initialize(const BSONObj& doc, long long term) const BSONElement configVersionElement = doc[kConfigVersionFieldName]; // If we have an optime then we must have a configVersion - if (_opTimeSet && configVersionElement.eoo()) { + if (_appliedOpTimeSet && configVersionElement.eoo()) { return Status(ErrorCodes::NoSuchKey, str::stream() << "Response to replSetHeartbeat missing required \"" << kConfigVersionFieldName @@ -362,9 +375,14 @@ long long ReplSetHeartbeatResponse::getPrimaryId() const { return _primaryId; } -OpTime ReplSetHeartbeatResponse::getOpTime() const { - invariant(_opTimeSet); - return _opTime; +OpTime ReplSetHeartbeatResponse::getAppliedOpTime() const { + invariant(_appliedOpTimeSet); + return _appliedOpTime; +} + +OpTime ReplSetHeartbeatResponse::getDurableOpTime() const { + invariant(_durableOpTimeSet); + return _durableOpTime; } } // namespace repl diff --git a/src/mongo/db/repl/repl_set_heartbeat_response.h b/src/mongo/db/repl/repl_set_heartbeat_response.h index b3fba2a4803..2b968bbb17d 100644 --- a/src/mongo/db/repl/repl_set_heartbeat_response.h +++ b/src/mongo/db/repl/repl_set_heartbeat_response.h @@ -125,10 +125,14 @@ public: long long getTerm() const { return _term; } - bool hasOpTime() const { - return _opTimeSet; + bool hasAppliedOpTime() const { + return _appliedOpTimeSet; } - OpTime getOpTime() const; + OpTime getAppliedOpTime() const; + bool hasDurableOpTime() const { + return _durableOpTimeSet; + } + OpTime getDurableOpTime() const; /** * Sets _mismatch to true. @@ -232,9 +236,13 @@ public: _primaryIdSet = true; _primaryId = primaryId; } - void setOpTime(OpTime time) { - _opTimeSet = true; - _opTime = time; + void setAppliedOpTime(OpTime time) { + _appliedOpTimeSet = true; + _appliedOpTime = time; + } + void setDurableOpTime(OpTime time) { + _durableOpTimeSet = true; + _durableOpTime = time; } void setTerm(long long term) { _term = term; @@ -247,8 +255,11 @@ private: bool _timeSet = false; Seconds _time = Seconds(0); // Seconds since UNIX epoch. - bool _opTimeSet = false; - OpTime _opTime; + bool _appliedOpTimeSet = false; + OpTime _appliedOpTime; + + bool _durableOpTimeSet = false; + OpTime _durableOpTime; bool _electableSet = false; bool _electable = false; diff --git a/src/mongo/db/repl/repl_set_heartbeat_response_test.cpp b/src/mongo/db/repl/repl_set_heartbeat_response_test.cpp index 45c8dba3e1f..3c7adf479ee 100644 --- a/src/mongo/db/repl/repl_set_heartbeat_response_test.cpp +++ b/src/mongo/db/repl/repl_set_heartbeat_response_test.cpp @@ -51,7 +51,8 @@ TEST(ReplSetHeartbeatResponse, DefaultConstructThenSlowlyBuildToFullObj) { ASSERT_EQUALS(false, hbResponse.hasElectionTime()); ASSERT_EQUALS(false, hbResponse.hasIsElectable()); ASSERT_EQUALS(false, hbResponse.hasTime()); - ASSERT_EQUALS(false, hbResponse.hasOpTime()); + ASSERT_EQUALS(false, hbResponse.hasDurableOpTime()); + ASSERT_EQUALS(false, hbResponse.hasAppliedOpTime()); ASSERT_EQUALS(false, hbResponse.hasConfig()); ASSERT_EQUALS(false, hbResponse.isMismatched()); ASSERT_EQUALS(false, hbResponse.isReplSet()); @@ -75,7 +76,8 @@ TEST(ReplSetHeartbeatResponse, DefaultConstructThenSlowlyBuildToFullObj) { ASSERT_EQUALS(false, hbResponse.hasElectionTime()); ASSERT_EQUALS(false, hbResponse.hasIsElectable()); ASSERT_EQUALS(false, hbResponse.hasTime()); - ASSERT_EQUALS(false, hbResponse.hasOpTime()); + ASSERT_EQUALS(false, hbResponse.hasDurableOpTime()); + ASSERT_EQUALS(false, hbResponse.hasAppliedOpTime()); ASSERT_EQUALS(false, hbResponse.hasConfig()); ASSERT_EQUALS(false, hbResponse.isMismatched()); ASSERT_EQUALS(false, hbResponse.isReplSet()); @@ -101,7 +103,8 @@ TEST(ReplSetHeartbeatResponse, DefaultConstructThenSlowlyBuildToFullObj) { ASSERT_EQUALS(false, hbResponse.hasElectionTime()); ASSERT_EQUALS(false, hbResponse.hasIsElectable()); ASSERT_EQUALS(false, hbResponse.hasTime()); - ASSERT_EQUALS(false, hbResponse.hasOpTime()); + ASSERT_EQUALS(false, hbResponse.hasDurableOpTime()); + ASSERT_EQUALS(false, hbResponse.hasAppliedOpTime()); ASSERT_EQUALS(false, hbResponse.hasConfig()); ASSERT_EQUALS(false, hbResponse.isMismatched()); ASSERT_EQUALS(false, hbResponse.isReplSet()); @@ -128,7 +131,8 @@ TEST(ReplSetHeartbeatResponse, DefaultConstructThenSlowlyBuildToFullObj) { ASSERT_EQUALS(true, hbResponse.hasElectionTime()); ASSERT_EQUALS(false, hbResponse.hasIsElectable()); ASSERT_EQUALS(false, hbResponse.hasTime()); - ASSERT_EQUALS(false, hbResponse.hasOpTime()); + ASSERT_EQUALS(false, hbResponse.hasDurableOpTime()); + ASSERT_EQUALS(false, hbResponse.hasAppliedOpTime()); ASSERT_EQUALS(false, hbResponse.hasConfig()); ASSERT_EQUALS(false, hbResponse.isMismatched()); ASSERT_EQUALS(false, hbResponse.isReplSet()); @@ -150,14 +154,15 @@ TEST(ReplSetHeartbeatResponse, DefaultConstructThenSlowlyBuildToFullObj) { ASSERT_EQUALS(Status::OK(), initializeResult); ASSERT_EQUALS(hbResponseObj.toString(), hbResponseObjRoundTripChecker.toString()); - // set opTime - hbResponse.setOpTime(OpTime(Timestamp(10), 0)); + // set durableOpTime + hbResponse.setDurableOpTime(OpTime(Timestamp(10), 0)); ++fieldsSet; ASSERT_EQUALS(false, hbResponse.hasState()); ASSERT_EQUALS(true, hbResponse.hasElectionTime()); ASSERT_EQUALS(false, hbResponse.hasIsElectable()); ASSERT_EQUALS(false, hbResponse.hasTime()); - ASSERT_EQUALS(true, hbResponse.hasOpTime()); + ASSERT_EQUALS(true, hbResponse.hasDurableOpTime()); + ASSERT_EQUALS(false, hbResponse.hasAppliedOpTime()); ASSERT_EQUALS(false, hbResponse.hasConfig()); ASSERT_EQUALS(false, hbResponse.isMismatched()); ASSERT_EQUALS(false, hbResponse.isReplSet()); @@ -167,7 +172,7 @@ TEST(ReplSetHeartbeatResponse, DefaultConstructThenSlowlyBuildToFullObj) { ASSERT_EQUALS(HostAndPort(), hbResponse.getSyncingTo()); ASSERT_EQUALS(1, hbResponse.getConfigVersion()); ASSERT_EQUALS(Timestamp(10, 0), hbResponse.getElectionTime()); - ASSERT_EQUALS(OpTime(Timestamp(0, 10), 0), hbResponse.getOpTime()); + ASSERT_EQUALS(OpTime(Timestamp(0, 10), 0), hbResponse.getDurableOpTime()); hbResponseObj = hbResponse.toBSON(false); ASSERT_EQUALS(fieldsSet, hbResponseObj.nFields()); @@ -175,7 +180,41 @@ TEST(ReplSetHeartbeatResponse, DefaultConstructThenSlowlyBuildToFullObj) { ASSERT_EQUALS("", hbResponseObj["hbmsg"].String()); ASSERT_EQUALS(1, hbResponseObj["v"].Number()); ASSERT_EQUALS(Timestamp(10, 0), hbResponseObj["electionTime"].timestamp()); - ASSERT_EQUALS(Timestamp(0, 10), hbResponseObj["opTime"].timestamp()); + ASSERT_EQUALS(Timestamp(0, 10), hbResponseObj["durableOpTime"]["ts"].timestamp()); + + initializeResult = hbResponseObjRoundTripChecker.initialize(hbResponseObj, 0); + ASSERT_EQUALS(Status::OK(), initializeResult); + ASSERT_EQUALS(hbResponseObj.toString(), hbResponseObjRoundTripChecker.toBSON(false).toString()); + + // set appliedOpTime + hbResponse.setAppliedOpTime(OpTime(Timestamp(50), 0)); + ++fieldsSet; + ASSERT_EQUALS(false, hbResponse.hasState()); + ASSERT_EQUALS(true, hbResponse.hasElectionTime()); + ASSERT_EQUALS(false, hbResponse.hasIsElectable()); + ASSERT_EQUALS(false, hbResponse.hasTime()); + ASSERT_EQUALS(true, hbResponse.hasDurableOpTime()); + ASSERT_EQUALS(true, hbResponse.hasAppliedOpTime()); + ASSERT_EQUALS(false, hbResponse.hasConfig()); + ASSERT_EQUALS(false, hbResponse.isMismatched()); + ASSERT_EQUALS(false, hbResponse.isReplSet()); + ASSERT_EQUALS(false, hbResponse.isStateDisagreement()); + ASSERT_EQUALS("rs0", hbResponse.getReplicaSetName()); + ASSERT_EQUALS("", hbResponse.getHbMsg()); + ASSERT_EQUALS(HostAndPort(), hbResponse.getSyncingTo()); + ASSERT_EQUALS(1, hbResponse.getConfigVersion()); + ASSERT_EQUALS(Timestamp(10, 0), hbResponse.getElectionTime()); + ASSERT_EQUALS(OpTime(Timestamp(0, 10), 0), hbResponse.getDurableOpTime()); + ASSERT_EQUALS(OpTime(Timestamp(0, 50), 0), hbResponse.getAppliedOpTime()); + + hbResponseObj = hbResponse.toBSON(false); + ASSERT_EQUALS(fieldsSet, hbResponseObj.nFields()); + ASSERT_EQUALS("rs0", hbResponseObj["set"].String()); + ASSERT_EQUALS("", hbResponseObj["hbmsg"].String()); + ASSERT_EQUALS(1, hbResponseObj["v"].Number()); + ASSERT_EQUALS(Timestamp(10, 0), hbResponseObj["electionTime"].timestamp()); + ASSERT_EQUALS(Timestamp(0, 50), hbResponseObj["opTime"].timestamp()); + ASSERT_EQUALS(Timestamp(0, 10), hbResponseObj["durableOpTime"]["ts"].timestamp()); initializeResult = hbResponseObjRoundTripChecker.initialize(hbResponseObj, 0); ASSERT_EQUALS(Status::OK(), initializeResult); @@ -188,7 +227,8 @@ TEST(ReplSetHeartbeatResponse, DefaultConstructThenSlowlyBuildToFullObj) { ASSERT_EQUALS(true, hbResponse.hasElectionTime()); ASSERT_EQUALS(false, hbResponse.hasIsElectable()); ASSERT_EQUALS(true, hbResponse.hasTime()); - ASSERT_EQUALS(true, hbResponse.hasOpTime()); + ASSERT_EQUALS(true, hbResponse.hasDurableOpTime()); + ASSERT_EQUALS(true, hbResponse.hasAppliedOpTime()); ASSERT_EQUALS(false, hbResponse.hasConfig()); ASSERT_EQUALS(false, hbResponse.isMismatched()); ASSERT_EQUALS(false, hbResponse.isReplSet()); @@ -198,7 +238,8 @@ TEST(ReplSetHeartbeatResponse, DefaultConstructThenSlowlyBuildToFullObj) { ASSERT_EQUALS(HostAndPort(), hbResponse.getSyncingTo()); ASSERT_EQUALS(1, hbResponse.getConfigVersion()); ASSERT_EQUALS(Timestamp(10, 0), hbResponse.getElectionTime()); - ASSERT_EQUALS(OpTime(Timestamp(0, 10), 0), hbResponse.getOpTime()); + ASSERT_EQUALS(OpTime(Timestamp(0, 10), 0), hbResponse.getDurableOpTime()); + ASSERT_EQUALS(OpTime(Timestamp(0, 50), 0), hbResponse.getAppliedOpTime()); ASSERT_EQUALS(Seconds(10), hbResponse.getTime()); hbResponseObj = hbResponse.toBSON(false); @@ -207,7 +248,8 @@ TEST(ReplSetHeartbeatResponse, DefaultConstructThenSlowlyBuildToFullObj) { ASSERT_EQUALS("", hbResponseObj["hbmsg"].String()); ASSERT_EQUALS(1, hbResponseObj["v"].Number()); ASSERT_EQUALS(Timestamp(10, 0), hbResponseObj["electionTime"].timestamp()); - ASSERT_EQUALS(Timestamp(0, 10), hbResponseObj["opTime"].timestamp()); + ASSERT_EQUALS(Timestamp(0, 50), hbResponseObj["opTime"].timestamp()); + ASSERT_EQUALS(Timestamp(0, 10), hbResponseObj["durableOpTime"]["ts"].timestamp()); ASSERT_EQUALS(10, hbResponseObj["time"].numberLong()); initializeResult = hbResponseObjRoundTripChecker.initialize(hbResponseObj, 0); @@ -221,7 +263,8 @@ TEST(ReplSetHeartbeatResponse, DefaultConstructThenSlowlyBuildToFullObj) { ASSERT_EQUALS(true, hbResponse.hasElectionTime()); ASSERT_EQUALS(true, hbResponse.hasIsElectable()); ASSERT_EQUALS(true, hbResponse.hasTime()); - ASSERT_EQUALS(true, hbResponse.hasOpTime()); + ASSERT_EQUALS(true, hbResponse.hasDurableOpTime()); + ASSERT_EQUALS(true, hbResponse.hasAppliedOpTime()); ASSERT_EQUALS(false, hbResponse.hasConfig()); ASSERT_EQUALS(false, hbResponse.isMismatched()); ASSERT_EQUALS(false, hbResponse.isReplSet()); @@ -231,7 +274,8 @@ TEST(ReplSetHeartbeatResponse, DefaultConstructThenSlowlyBuildToFullObj) { ASSERT_EQUALS(HostAndPort(), hbResponse.getSyncingTo()); ASSERT_EQUALS(1, hbResponse.getConfigVersion()); ASSERT_EQUALS(Timestamp(10, 0), hbResponse.getElectionTime()); - ASSERT_EQUALS(OpTime(Timestamp(0, 10), 0), hbResponse.getOpTime()); + ASSERT_EQUALS(OpTime(Timestamp(0, 10), 0), hbResponse.getDurableOpTime()); + ASSERT_EQUALS(OpTime(Timestamp(0, 50), 0), hbResponse.getAppliedOpTime()); ASSERT_EQUALS(Seconds(10), hbResponse.getTime()); ASSERT_EQUALS(true, hbResponse.isElectable()); @@ -241,7 +285,8 @@ TEST(ReplSetHeartbeatResponse, DefaultConstructThenSlowlyBuildToFullObj) { ASSERT_EQUALS("", hbResponseObj["hbmsg"].String()); ASSERT_EQUALS(1, hbResponseObj["v"].Number()); ASSERT_EQUALS(Timestamp(10, 0), hbResponseObj["electionTime"].timestamp()); - ASSERT_EQUALS(Timestamp(0, 10), hbResponseObj["opTime"].timestamp()); + ASSERT_EQUALS(Timestamp(0, 50), hbResponseObj["opTime"].timestamp()); + ASSERT_EQUALS(Timestamp(0, 10), hbResponseObj["durableOpTime"]["ts"].timestamp()); ASSERT_EQUALS(10, hbResponseObj["time"].numberLong()); ASSERT_EQUALS(true, hbResponseObj["e"].trueValue()); @@ -257,7 +302,8 @@ TEST(ReplSetHeartbeatResponse, DefaultConstructThenSlowlyBuildToFullObj) { ASSERT_EQUALS(true, hbResponse.hasElectionTime()); ASSERT_EQUALS(true, hbResponse.hasIsElectable()); ASSERT_EQUALS(true, hbResponse.hasTime()); - ASSERT_EQUALS(true, hbResponse.hasOpTime()); + ASSERT_EQUALS(true, hbResponse.hasDurableOpTime()); + ASSERT_EQUALS(true, hbResponse.hasAppliedOpTime()); ASSERT_EQUALS(true, hbResponse.hasConfig()); ASSERT_EQUALS(false, hbResponse.isMismatched()); ASSERT_EQUALS(false, hbResponse.isReplSet()); @@ -267,7 +313,8 @@ TEST(ReplSetHeartbeatResponse, DefaultConstructThenSlowlyBuildToFullObj) { ASSERT_EQUALS(HostAndPort(), hbResponse.getSyncingTo()); ASSERT_EQUALS(1, hbResponse.getConfigVersion()); ASSERT_EQUALS(Timestamp(10, 0), hbResponse.getElectionTime()); - ASSERT_EQUALS(OpTime(Timestamp(0, 10), 0), hbResponse.getOpTime()); + ASSERT_EQUALS(OpTime(Timestamp(0, 10), 0), hbResponse.getDurableOpTime()); + ASSERT_EQUALS(OpTime(Timestamp(0, 50), 0), hbResponse.getAppliedOpTime()); ASSERT_EQUALS(Seconds(10), hbResponse.getTime()); ASSERT_EQUALS(true, hbResponse.isElectable()); ASSERT_EQUALS(config.toBSON().toString(), hbResponse.getConfig().toBSON().toString()); @@ -278,7 +325,8 @@ TEST(ReplSetHeartbeatResponse, DefaultConstructThenSlowlyBuildToFullObj) { ASSERT_EQUALS("", hbResponseObj["hbmsg"].String()); ASSERT_EQUALS(1, hbResponseObj["v"].Number()); ASSERT_EQUALS(Timestamp(10, 0), hbResponseObj["electionTime"].timestamp()); - ASSERT_EQUALS(Timestamp(0, 10), hbResponseObj["opTime"].timestamp()); + ASSERT_EQUALS(Timestamp(0, 50), hbResponseObj["opTime"].timestamp()); + ASSERT_EQUALS(Timestamp(0, 10), hbResponseObj["durableOpTime"]["ts"].timestamp()); ASSERT_EQUALS(10, hbResponseObj["time"].numberLong()); ASSERT_EQUALS(true, hbResponseObj["e"].trueValue()); ASSERT_EQUALS(config.toBSON().toString(), hbResponseObj["config"].Obj().toString()); @@ -294,7 +342,8 @@ TEST(ReplSetHeartbeatResponse, DefaultConstructThenSlowlyBuildToFullObj) { ASSERT_EQUALS(true, hbResponse.hasElectionTime()); ASSERT_EQUALS(true, hbResponse.hasIsElectable()); ASSERT_EQUALS(true, hbResponse.hasTime()); - ASSERT_EQUALS(true, hbResponse.hasOpTime()); + ASSERT_EQUALS(true, hbResponse.hasDurableOpTime()); + ASSERT_EQUALS(true, hbResponse.hasAppliedOpTime()); ASSERT_EQUALS(true, hbResponse.hasConfig()); ASSERT_EQUALS(false, hbResponse.isMismatched()); ASSERT_EQUALS(false, hbResponse.isReplSet()); @@ -306,7 +355,8 @@ TEST(ReplSetHeartbeatResponse, DefaultConstructThenSlowlyBuildToFullObj) { ASSERT_EQUALS(HostAndPort(), hbResponse.getSyncingTo()); ASSERT_EQUALS(1, hbResponse.getConfigVersion()); ASSERT_EQUALS(Timestamp(10, 0), hbResponse.getElectionTime()); - ASSERT_EQUALS(OpTime(Timestamp(0, 10), 0), hbResponse.getOpTime()); + ASSERT_EQUALS(OpTime(Timestamp(0, 10), 0), hbResponse.getDurableOpTime()); + ASSERT_EQUALS(OpTime(Timestamp(0, 50), 0), hbResponse.getAppliedOpTime()); ASSERT_EQUALS(Seconds(10), hbResponse.getTime()); ASSERT_EQUALS(true, hbResponse.isElectable()); ASSERT_EQUALS(config.toBSON().toString(), hbResponse.getConfig().toBSON().toString()); @@ -317,7 +367,8 @@ TEST(ReplSetHeartbeatResponse, DefaultConstructThenSlowlyBuildToFullObj) { ASSERT_EQUALS("", hbResponseObj["hbmsg"].String()); ASSERT_EQUALS(1, hbResponseObj["v"].Number()); ASSERT_EQUALS(Timestamp(10, 0), hbResponseObj["electionTime"].timestamp()); - ASSERT_EQUALS(Timestamp(0, 10), hbResponseObj["opTime"].timestamp()); + ASSERT_EQUALS(Timestamp(0, 50), hbResponseObj["opTime"].timestamp()); + ASSERT_EQUALS(Timestamp(0, 10), hbResponseObj["durableOpTime"]["ts"].timestamp()); ASSERT_EQUALS(10, hbResponseObj["time"].numberLong()); ASSERT_EQUALS(true, hbResponseObj["e"].trueValue()); ASSERT_EQUALS(config.toBSON().toString(), hbResponseObj["config"].Obj().toString()); @@ -334,7 +385,8 @@ TEST(ReplSetHeartbeatResponse, DefaultConstructThenSlowlyBuildToFullObj) { ASSERT_EQUALS(true, hbResponse.hasElectionTime()); ASSERT_EQUALS(true, hbResponse.hasIsElectable()); ASSERT_EQUALS(true, hbResponse.hasTime()); - ASSERT_EQUALS(true, hbResponse.hasOpTime()); + ASSERT_EQUALS(true, hbResponse.hasDurableOpTime()); + ASSERT_EQUALS(true, hbResponse.hasAppliedOpTime()); ASSERT_EQUALS(true, hbResponse.hasConfig()); ASSERT_EQUALS(false, hbResponse.isMismatched()); ASSERT_EQUALS(false, hbResponse.isReplSet()); @@ -346,7 +398,8 @@ TEST(ReplSetHeartbeatResponse, DefaultConstructThenSlowlyBuildToFullObj) { ASSERT_EQUALS(HostAndPort(), hbResponse.getSyncingTo()); ASSERT_EQUALS(1, hbResponse.getConfigVersion()); ASSERT_EQUALS(Timestamp(10, 0), hbResponse.getElectionTime()); - ASSERT_EQUALS(OpTime(Timestamp(0, 10), 0), hbResponse.getOpTime()); + ASSERT_EQUALS(OpTime(Timestamp(0, 10), 0), hbResponse.getDurableOpTime()); + ASSERT_EQUALS(OpTime(Timestamp(0, 50), 0), hbResponse.getAppliedOpTime()); ASSERT_EQUALS(Seconds(10), hbResponse.getTime()); ASSERT_EQUALS(true, hbResponse.isElectable()); ASSERT_EQUALS(config.toBSON().toString(), hbResponse.getConfig().toBSON().toString()); @@ -357,7 +410,8 @@ TEST(ReplSetHeartbeatResponse, DefaultConstructThenSlowlyBuildToFullObj) { ASSERT_EQUALS("", hbResponseObj["hbmsg"].String()); ASSERT_EQUALS(1, hbResponseObj["v"].Number()); ASSERT_EQUALS(Timestamp(10, 0), hbResponseObj["electionTime"].timestamp()); - ASSERT_EQUALS(Timestamp(0, 10), hbResponseObj["opTime"].timestamp()); + ASSERT_EQUALS(Timestamp(0, 50), hbResponseObj["opTime"].timestamp()); + ASSERT_EQUALS(Timestamp(0, 10), hbResponseObj["durableOpTime"]["ts"].timestamp()); ASSERT_EQUALS(10, hbResponseObj["time"].numberLong()); ASSERT_EQUALS(true, hbResponseObj["e"].trueValue()); ASSERT_EQUALS(config.toBSON().toString(), hbResponseObj["config"].Obj().toString()); @@ -376,7 +430,8 @@ TEST(ReplSetHeartbeatResponse, DefaultConstructThenSlowlyBuildToFullObj) { ASSERT_EQUALS(true, hbResponse.hasElectionTime()); ASSERT_EQUALS(true, hbResponse.hasIsElectable()); ASSERT_EQUALS(true, hbResponse.hasTime()); - ASSERT_EQUALS(true, hbResponse.hasOpTime()); + ASSERT_EQUALS(true, hbResponse.hasDurableOpTime()); + ASSERT_EQUALS(true, hbResponse.hasAppliedOpTime()); ASSERT_EQUALS(true, hbResponse.hasConfig()); ASSERT_EQUALS(false, hbResponse.isMismatched()); ASSERT_EQUALS(true, hbResponse.isReplSet()); @@ -388,7 +443,8 @@ TEST(ReplSetHeartbeatResponse, DefaultConstructThenSlowlyBuildToFullObj) { ASSERT_EQUALS(HostAndPort(), hbResponse.getSyncingTo()); ASSERT_EQUALS(1, hbResponse.getConfigVersion()); ASSERT_EQUALS(Timestamp(10, 0), hbResponse.getElectionTime()); - ASSERT_EQUALS(OpTime(Timestamp(0, 10), 0), hbResponse.getOpTime()); + ASSERT_EQUALS(OpTime(Timestamp(0, 10), 0), hbResponse.getDurableOpTime()); + ASSERT_EQUALS(OpTime(Timestamp(0, 50), 0), hbResponse.getAppliedOpTime()); ASSERT_EQUALS(Seconds(10), hbResponse.getTime()); ASSERT_EQUALS(true, hbResponse.isElectable()); ASSERT_EQUALS(config.toBSON().toString(), hbResponse.getConfig().toBSON().toString()); @@ -399,7 +455,8 @@ TEST(ReplSetHeartbeatResponse, DefaultConstructThenSlowlyBuildToFullObj) { ASSERT_EQUALS("", hbResponseObj["hbmsg"].String()); ASSERT_EQUALS(1, hbResponseObj["v"].Number()); ASSERT_EQUALS(Timestamp(10, 0), hbResponseObj["electionTime"].timestamp()); - ASSERT_EQUALS(Timestamp(0, 10), hbResponseObj["opTime"].timestamp()); + ASSERT_EQUALS(Timestamp(0, 50), hbResponseObj["opTime"].timestamp()); + ASSERT_EQUALS(Timestamp(0, 10), hbResponseObj["durableOpTime"]["ts"].timestamp()); ASSERT_EQUALS(10, hbResponseObj["time"].numberLong()); ASSERT_EQUALS(true, hbResponseObj["e"].trueValue()); ASSERT_EQUALS(config.toBSON().toString(), hbResponseObj["config"].Obj().toString()); @@ -419,7 +476,8 @@ TEST(ReplSetHeartbeatResponse, DefaultConstructThenSlowlyBuildToFullObj) { ASSERT_EQUALS(true, hbResponse.hasElectionTime()); ASSERT_EQUALS(true, hbResponse.hasIsElectable()); ASSERT_EQUALS(true, hbResponse.hasTime()); - ASSERT_EQUALS(true, hbResponse.hasOpTime()); + ASSERT_EQUALS(true, hbResponse.hasDurableOpTime()); + ASSERT_EQUALS(true, hbResponse.hasAppliedOpTime()); ASSERT_EQUALS(true, hbResponse.hasConfig()); ASSERT_EQUALS(false, hbResponse.isMismatched()); ASSERT_EQUALS(true, hbResponse.isReplSet()); @@ -431,7 +489,8 @@ TEST(ReplSetHeartbeatResponse, DefaultConstructThenSlowlyBuildToFullObj) { ASSERT_EQUALS(HostAndPort("syncTarget"), hbResponse.getSyncingTo()); ASSERT_EQUALS(1, hbResponse.getConfigVersion()); ASSERT_EQUALS(Timestamp(10, 0), hbResponse.getElectionTime()); - ASSERT_EQUALS(OpTime(Timestamp(0, 10), 0), hbResponse.getOpTime()); + ASSERT_EQUALS(OpTime(Timestamp(0, 10), 0), hbResponse.getDurableOpTime()); + ASSERT_EQUALS(OpTime(Timestamp(0, 50), 0), hbResponse.getAppliedOpTime()); ASSERT_EQUALS(Seconds(10), hbResponse.getTime()); ASSERT_EQUALS(true, hbResponse.isElectable()); ASSERT_EQUALS(config.toBSON().toString(), hbResponse.getConfig().toBSON().toString()); @@ -442,7 +501,8 @@ TEST(ReplSetHeartbeatResponse, DefaultConstructThenSlowlyBuildToFullObj) { ASSERT_EQUALS("", hbResponseObj["hbmsg"].String()); ASSERT_EQUALS(1, hbResponseObj["v"].Number()); ASSERT_EQUALS(Timestamp(10, 0), hbResponseObj["electionTime"].timestamp()); - ASSERT_EQUALS(Timestamp(0, 10), hbResponseObj["opTime"].timestamp()); + ASSERT_EQUALS(Timestamp(0, 50), hbResponseObj["opTime"].timestamp()); + ASSERT_EQUALS(Timestamp(0, 10), hbResponseObj["durableOpTime"]["ts"].timestamp()); ASSERT_EQUALS(10, hbResponseObj["time"].numberLong()); ASSERT_EQUALS(true, hbResponseObj["e"].trueValue()); ASSERT_EQUALS(config.toBSON().toString(), hbResponseObj["config"].Obj().toString()); @@ -462,7 +522,8 @@ TEST(ReplSetHeartbeatResponse, DefaultConstructThenSlowlyBuildToFullObj) { ASSERT_EQUALS(true, hbResponse.hasElectionTime()); ASSERT_EQUALS(true, hbResponse.hasIsElectable()); ASSERT_EQUALS(true, hbResponse.hasTime()); - ASSERT_EQUALS(true, hbResponse.hasOpTime()); + ASSERT_EQUALS(true, hbResponse.hasDurableOpTime()); + ASSERT_EQUALS(true, hbResponse.hasAppliedOpTime()); ASSERT_EQUALS(true, hbResponse.hasConfig()); ASSERT_EQUALS(false, hbResponse.isMismatched()); ASSERT_EQUALS(true, hbResponse.isReplSet()); @@ -474,7 +535,8 @@ TEST(ReplSetHeartbeatResponse, DefaultConstructThenSlowlyBuildToFullObj) { ASSERT_EQUALS(HostAndPort("syncTarget"), hbResponse.getSyncingTo()); ASSERT_EQUALS(1, hbResponse.getConfigVersion()); ASSERT_EQUALS(Timestamp(10, 0), hbResponse.getElectionTime()); - ASSERT_EQUALS(OpTime(Timestamp(0, 10), 0), hbResponse.getOpTime()); + ASSERT_EQUALS(OpTime(Timestamp(0, 10), 0), hbResponse.getDurableOpTime()); + ASSERT_EQUALS(OpTime(Timestamp(0, 50), 0), hbResponse.getAppliedOpTime()); ASSERT_EQUALS(Seconds(10), hbResponse.getTime()); ASSERT_EQUALS(true, hbResponse.isElectable()); ASSERT_EQUALS(config.toBSON().toString(), hbResponse.getConfig().toBSON().toString()); @@ -485,7 +547,8 @@ TEST(ReplSetHeartbeatResponse, DefaultConstructThenSlowlyBuildToFullObj) { ASSERT_EQUALS("lub dub", hbResponseObj["hbmsg"].String()); ASSERT_EQUALS(1, hbResponseObj["v"].Number()); ASSERT_EQUALS(Timestamp(10, 0), hbResponseObj["electionTime"].timestamp()); - ASSERT_EQUALS(Timestamp(0, 10), hbResponseObj["opTime"].timestamp()); + ASSERT_EQUALS(Timestamp(0, 50), hbResponseObj["opTime"].timestamp()); + ASSERT_EQUALS(Timestamp(0, 10), hbResponseObj["durableOpTime"]["ts"].timestamp()); ASSERT_EQUALS(10, hbResponseObj["time"].numberLong()); ASSERT_EQUALS(true, hbResponseObj["e"].trueValue()); ASSERT_EQUALS(config.toBSON().toString(), hbResponseObj["config"].Obj().toString()); @@ -505,7 +568,8 @@ TEST(ReplSetHeartbeatResponse, DefaultConstructThenSlowlyBuildToFullObj) { ASSERT_EQUALS(true, hbResponse.hasElectionTime()); ASSERT_EQUALS(true, hbResponse.hasIsElectable()); ASSERT_EQUALS(true, hbResponse.hasTime()); - ASSERT_EQUALS(true, hbResponse.hasOpTime()); + ASSERT_EQUALS(true, hbResponse.hasDurableOpTime()); + ASSERT_EQUALS(true, hbResponse.hasAppliedOpTime()); ASSERT_EQUALS(true, hbResponse.hasConfig()); ASSERT_EQUALS(true, hbResponse.isMismatched()); ASSERT_EQUALS(true, hbResponse.isReplSet()); @@ -517,7 +581,8 @@ TEST(ReplSetHeartbeatResponse, DefaultConstructThenSlowlyBuildToFullObj) { ASSERT_EQUALS(HostAndPort("syncTarget"), hbResponse.getSyncingTo()); ASSERT_EQUALS(1, hbResponse.getConfigVersion()); ASSERT_EQUALS(Timestamp(10, 0), hbResponse.getElectionTime()); - ASSERT_EQUALS(OpTime(Timestamp(0, 10), 0), hbResponse.getOpTime()); + ASSERT_EQUALS(OpTime(Timestamp(0, 10), 0), hbResponse.getDurableOpTime()); + ASSERT_EQUALS(OpTime(Timestamp(0, 50), 0), hbResponse.getAppliedOpTime()); ASSERT_EQUALS(Seconds(10), hbResponse.getTime()); ASSERT_EQUALS(true, hbResponse.isElectable()); ASSERT_EQUALS(config.toBSON().toString(), hbResponse.getConfig().toBSON().toString()); @@ -557,7 +622,23 @@ TEST(ReplSetHeartbeatResponse, InitializeWrongTimeType) { result.reason()); } -TEST(ReplSetHeartbeatResponse, InitializeWrongOpTimeType) { +TEST(ReplSetHeartbeatResponse, InitializeWrongDurableOpTimeType) { + ReplSetHeartbeatResponse hbResponse; + BSONObj initializerObj = BSON("ok" << 1.0 << "durableOpTime" + << "hello"); + Status result = hbResponse.initialize(initializerObj, 0); + ASSERT_EQUALS(ErrorCodes::TypeMismatch, result); + ASSERT_EQUALS("\"durableOpTime\" had the wrong type. Expected Object, found String", + result.reason()); + + BSONObj initializerObj2 = BSON("ok" << 1.0 << "durableOpTime" << OpTime().getTimestamp()); + Status result2 = hbResponse.initialize(initializerObj2, 0); + ASSERT_EQUALS(ErrorCodes::TypeMismatch, result2); + ASSERT_EQUALS("\"durableOpTime\" had the wrong type. Expected Object, found Timestamp", + result2.reason()); +} + +TEST(ReplSetHeartbeatResponse, InitializeWrongAppliedOpTimeType) { ReplSetHeartbeatResponse hbResponse; BSONObj initializerObj = BSON("ok" << 1.0 << "opTime" << "hello"); @@ -719,7 +800,7 @@ TEST(ReplSetHeartbeatResponse, InitializeBothOpTimeTypesSameResult) { result = hbResponseTimestamp.initialize(initializerTimestamp.obj(), 0); ASSERT_EQUALS(Status::OK(), result); - ASSERT_EQUALS(hbResponseTimestamp.getOpTime(), hbResponseTimestamp.getOpTime()); + ASSERT_EQUALS(hbResponseTimestamp.getAppliedOpTime(), hbResponseTimestamp.getAppliedOpTime()); } TEST(ReplSetHeartbeatResponse, NoConfigStillInitializing) { diff --git a/src/mongo/db/repl/repl_set_html_summary.cpp b/src/mongo/db/repl/repl_set_html_summary.cpp index 821e8258d2e..218dff908fd 100644 --- a/src/mongo/db/repl/repl_set_html_summary.cpp +++ b/src/mongo/db/repl/repl_set_html_summary.cpp @@ -185,8 +185,9 @@ const std::string ReplSetHtmlSummary::toHtmlString() const { } memberTable << td(grey(memberHB.getLastHeartbeatMsg(), !up)); // TODO(dannenberg): change timestamp to optime in V1 - memberTable << td( - memberHB.getLastHeartbeat() == Date_t() ? "?" : memberHB.getOpTime().toString()); + memberTable << td(memberHB.getLastHeartbeat() == Date_t() + ? "?" + : memberHB.getAppliedOpTime().toString()); } memberTable << _tr(); } @@ -200,7 +201,7 @@ const std::string ReplSetHtmlSummary::toHtmlString() const { const MemberConfig& selfConfig = _config.getMemberAt(_selfIndex); if (_primaryIndex >= 0 && _primaryIndex != _selfIndex && !selfConfig.isArbiter()) { - int lag = _hbData[_primaryIndex].getOpTime().getTimestamp().getSecs() - + int lag = _hbData[_primaryIndex].getAppliedOpTime().getTimestamp().getSecs() - _selfOptime.getTimestamp().getSecs(); s << tr("Lag: ", str::stream() << lag << " secs"); } diff --git a/src/mongo/db/repl/replica_set_config.cpp b/src/mongo/db/repl/replica_set_config.cpp index 2e86cecd9c9..4c578feca3a 100644 --- a/src/mongo/db/repl/replica_set_config.cpp +++ b/src/mongo/db/repl/replica_set_config.cpp @@ -61,21 +61,24 @@ const std::string kMembersFieldName = "members"; const std::string kSettingsFieldName = "settings"; const std::string kStepDownCheckWriteConcernModeName = "$stepDownCheck"; const std::string kProtocolVersionFieldName = "protocolVersion"; +const std::string kWriteConcernMajorityJournalDefaultFieldName = + "writeConcernMajorityJournalDefault"; const std::string kLegalConfigTopFieldNames[] = {kIdFieldName, ReplicaSetConfig::kVersionFieldName, kMembersFieldName, kSettingsFieldName, kProtocolVersionFieldName, - ReplicaSetConfig::kConfigServerFieldName}; + ReplicaSetConfig::kConfigServerFieldName, + kWriteConcernMajorityJournalDefaultFieldName}; -const std::string kElectionTimeoutFieldName = "electionTimeoutMillis"; -const std::string kHeartbeatIntervalFieldName = "heartbeatIntervalMillis"; -const std::string kHeartbeatTimeoutFieldName = "heartbeatTimeoutSecs"; const std::string kChainingAllowedFieldName = "chainingAllowed"; +const std::string kElectionTimeoutFieldName = "electionTimeoutMillis"; const std::string kGetLastErrorDefaultsFieldName = "getLastErrorDefaults"; const std::string kGetLastErrorModesFieldName = "getLastErrorModes"; const std::string kReplicaSetIdFieldName = "replicaSetId"; +const std::string kHeartbeatIntervalFieldName = "heartbeatIntervalMillis"; +const std::string kHeartbeatTimeoutFieldName = "heartbeatTimeoutSecs"; } // namespace @@ -164,6 +167,16 @@ Status ReplicaSetConfig::_initialize(const BSONObj& cfg, } // + // Parse writeConcernMajorityJournalDefault + // + status = bsonExtractBooleanFieldWithDefault(cfg, + kWriteConcernMajorityJournalDefaultFieldName, + _protocolVersion == 1, + &_writeConcernMajorityJournalDefault); + if (!status.isOK()) + return status; + + // // Parse settings // BSONElement settingsElement; @@ -490,6 +503,12 @@ Status ReplicaSetConfig::validate() const { "Nodes being used for config servers must be started with the " "--configsvr flag"); } + if (!_writeConcernMajorityJournalDefault) { + return Status(ErrorCodes::BadValue, + str::stream() << kWriteConcernMajorityJournalDefaultFieldName + << " must be true in replica set configurations being " + "used for config servers"); + } } else if (serverGlobalParams.configsvr) { return Status(ErrorCodes::BadValue, "Nodes started with the --configsvr flag must have configsvr:true in " @@ -652,8 +671,20 @@ BSONObj ReplicaSetConfig::toBSON() const { configBuilder.append(kConfigServerFieldName, _configServer); } + // Only include writeConcernMajorityJournalDefault if it is not the default version for this + // ProtocolVersion to prevent breaking cross version-3.2.1 compatibilty of ReplicaSetConfigs. if (_protocolVersion > 0) { configBuilder.append(kProtocolVersionFieldName, _protocolVersion); + // Only include writeConcernMajorityJournalDefault if it is not the default version for this + // ProtocolVersion to prevent breaking cross version-3.2.1 compatibilty of + // ReplicaSetConfigs. + if (!_writeConcernMajorityJournalDefault) { + configBuilder.append(kWriteConcernMajorityJournalDefaultFieldName, + _writeConcernMajorityJournalDefault); + } + } else if (_writeConcernMajorityJournalDefault) { + configBuilder.append(kWriteConcernMajorityJournalDefaultFieldName, + _writeConcernMajorityJournalDefault); } BSONArrayBuilder members(configBuilder.subarrayStart(kMembersFieldName)); diff --git a/src/mongo/db/repl/replica_set_config.h b/src/mongo/db/repl/replica_set_config.h index 5a2541b8ecf..509a84b0b5c 100644 --- a/src/mongo/db/repl/replica_set_config.h +++ b/src/mongo/db/repl/replica_set_config.h @@ -231,6 +231,14 @@ public: } /** + * Returns whether or not majority write concerns should implicitly journal, if j has not been + * explicitly set. + */ + bool getWriteConcernMajorityShouldJournal() const { + return _writeConcernMajorityJournalDefault; + } + + /** * Returns true if this replica set is for use as a config server replica set. */ bool isConfigServer() const { @@ -349,6 +357,7 @@ private: Milliseconds _heartbeatInterval = kDefaultHeartbeatInterval; Seconds _heartbeatTimeoutPeriod = kDefaultHeartbeatTimeoutPeriod; bool _chainingAllowed = kDefaultChainingAllowed; + bool _writeConcernMajorityJournalDefault = false; int _majorityVoteCount = 0; int _writeMajority = 0; int _totalVotingMembers = 0; diff --git a/src/mongo/db/repl/replica_set_config_test.cpp b/src/mongo/db/repl/replica_set_config_test.cpp index 19cda44163d..05e534584e7 100644 --- a/src/mongo/db/repl/replica_set_config_test.cpp +++ b/src/mongo/db/repl/replica_set_config_test.cpp @@ -76,6 +76,7 @@ TEST(ReplicaSetConfig, ParseMinimalConfigAndCheckDefaults) { ASSERT_EQUALS(ReplicaSetConfig::kDefaultElectionTimeoutPeriod, config.getElectionTimeoutPeriod()); ASSERT_TRUE(config.isChainingAllowed()); + ASSERT_FALSE(config.getWriteConcernMajorityShouldJournal()); ASSERT_FALSE(config.isConfigServer()); ASSERT_EQUALS(0, config.getProtocolVersion()); } @@ -104,6 +105,7 @@ TEST(ReplicaSetConfig, ParseLargeConfigAndCheckAccessors) { ASSERT_EQUALS(0, config.getDefaultWriteConcern().wNumNodes); ASSERT_EQUALS("majority", config.getDefaultWriteConcern().wMode); ASSERT_FALSE(config.isChainingAllowed()); + ASSERT_TRUE(config.getWriteConcernMajorityShouldJournal()); ASSERT_FALSE(config.isConfigServer()); ASSERT_EQUALS(Seconds(5), config.getHeartbeatInterval()); ASSERT_EQUALS(Seconds(120), config.getHeartbeatTimeoutPeriod()); @@ -977,7 +979,7 @@ TEST(ReplicaSetConfig, toBSONRoundTripAbilityLarge) { ASSERT_OK(configA.initialize(BSON( "_id" << "asdf" - << "version" << 9 << "members" + << "version" << 9 << "writeConcernMajorityJournalDefault" << true << "members" << BSON_ARRAY(BSON("_id" << 0 << "host" << "localhost:12345" << "arbiterOnly" << true << "votes" << 1) @@ -995,14 +997,14 @@ TEST(ReplicaSetConfig, toBSONRoundTripAbilityLarge) { << BSON("coast" << "west" << "hdd" - << "true"))) << "protocolVersion" << 0 - << "settings" << BSON("heartbeatIntervalMillis" - << 5000 << "heartbeatTimeoutSecs" << 20 << "electionTimeoutMillis" - << 4 << "chainingAllowd" << true << "getLastErrorDefaults" - << BSON("w" - << "majority") << "getLastErrorModes" - << BSON("disks" << BSON("ssd" << 1 << "hdd" << 1) << "coasts" - << BSON("coast" << 2)))))); + << "true"))) << "protocolVersion" << 0 << "settings" + + << BSON("heartbeatIntervalMillis" + << 5000 << "heartbeatTimeoutSecs" << 20 << "electionTimeoutMillis" << 4 + << "chainingAllowd" << true << "getLastErrorDefaults" << BSON("w" + << "majority") + << "getLastErrorModes" << BSON("disks" << BSON("ssd" << 1 << "hdd" << 1) << "coasts" + << BSON("coast" << 2)))))); BSONObj configObjA = configA.toBSON(); // Ensure a protocolVersion does not show up if it is 0 to maintain cross version compatibility. ASSERT_FALSE(configObjA.hasField("protocolVersion")); @@ -1199,6 +1201,23 @@ TEST(ReplicaSetConfig, CheckConfigServerCantHaveSlaveDelay) { ASSERT_STRING_CONTAINS(status.reason(), "cannot have a non-zero slaveDelay"); } +TEST(ReplicaSetConfig, CheckConfigServerMustHaveTrueForWriteConcernMajorityJournalDefault) { + serverGlobalParams.configsvr = true; + ON_BLOCK_EXIT([&] { serverGlobalParams.configsvr = false; }); + ReplicaSetConfig configA; + ASSERT_OK( + configA.initialize(BSON("_id" + << "rs0" + << "protocolVersion" << 1 << "version" << 1 << "configsvr" << true + << "members" << BSON_ARRAY(BSON("_id" << 0 << "host" + << "localhost:12345") + << BSON("_id" << 1 << "host" + << "localhost:54321")) + << "writeConcernMajorityJournalDefault" << false))); + Status status = configA.validate(); + ASSERT_EQUALS(ErrorCodes::BadValue, status); + ASSERT_STRING_CONTAINS(status.reason(), " must be true in replica set configurations being "); +} TEST(ReplicaSetConfig, GetPriorityTakeoverDelay) { ReplicaSetConfig configA; @@ -1341,6 +1360,50 @@ TEST(ReplicaSetConfig, ReplSetId) { "\"replicaSetId\" had the wrong type. Expected OID, found NumberInt32"); } +TEST(ReplicaSetConfig, ConfirmDefaultValuesOfAndAbilityToSetWriteConcernMajorityJournalDefault) { + // PV0, should default to false. + ReplicaSetConfig config; + ASSERT_OK(config.initialize(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "localhost:12345"))))); + ASSERT_OK(config.validate()); + ASSERT_FALSE(config.getWriteConcernMajorityShouldJournal()); + ASSERT_FALSE(config.toBSON().hasField("writeConcernMajorityJournalDefault")); + + // Should be able to set it true in PV0. + ASSERT_OK(config.initialize(BSON("_id" + << "rs0" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "localhost:12345")) + << "writeConcernMajorityJournalDefault" << true))); + ASSERT_OK(config.validate()); + ASSERT_TRUE(config.getWriteConcernMajorityShouldJournal()); + ASSERT_TRUE(config.toBSON().hasField("writeConcernMajorityJournalDefault")); + + // PV1, should default to true. + ASSERT_OK(config.initialize(BSON("_id" + << "rs0" + << "protocolVersion" << 1 << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "localhost:12345"))))); + ASSERT_OK(config.validate()); + ASSERT_TRUE(config.getWriteConcernMajorityShouldJournal()); + ASSERT_FALSE(config.toBSON().hasField("writeConcernMajorityJournalDefault")); + + // Should be able to set it false in PV1. + ASSERT_OK(config.initialize(BSON("_id" + << "rs0" + << "protocolVersion" << 1 << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "localhost:12345")) + << "writeConcernMajorityJournalDefault" << false))); + ASSERT_OK(config.validate()); + ASSERT_FALSE(config.getWriteConcernMajorityShouldJournal()); + ASSERT_TRUE(config.toBSON().hasField("writeConcernMajorityJournalDefault")); +} } // namespace } // namespace repl } // namespace mongo diff --git a/src/mongo/db/repl/replication_coordinator.h b/src/mongo/db/repl/replication_coordinator.h index 1d31b059bb7..630b9277b48 100644 --- a/src/mongo/db/repl/replication_coordinator.h +++ b/src/mongo/db/repl/replication_coordinator.h @@ -67,6 +67,7 @@ namespace repl { class BackgroundSync; class HandshakeArgs; class IsMasterResponse; +class OldUpdatePositionArgs; class OplogReader; class OpTime; class ReadConcernArgs; @@ -293,11 +294,22 @@ public: * * The new value of "opTime" must be no less than any prior value passed to this method, and * it is the caller's job to properly synchronize this behavior. The exception to this rule - * is that after calls to resetLastOpTimeFromOplog(), the minimum acceptable value for + * is that after calls to resetLastOpTimesFromOplog(), the minimum acceptable value for * "opTime" is reset based on the contents of the oplog, and may go backwards due to * rollback. */ - virtual void setMyLastOptime(const OpTime& opTime) = 0; + virtual void setMyLastAppliedOpTime(const OpTime& opTime) = 0; + + /** + * Updates our internal tracking of the last OpTime durable to this node. + * + * The new value of "opTime" must be no less than any prior value passed to this method, and + * it is the caller's job to properly synchronize this behavior. The exception to this rule + * is that after calls to resetLastOpTimesFromOplog(), the minimum acceptable value for + * "opTime" is reset based on the contents of the oplog, and may go backwards due to + * rollback. + */ + virtual void setMyLastDurableOpTime(const OpTime& opTime) = 0; /** * Updates our internal tracking of the last OpTime applied to this node, but only @@ -307,12 +319,22 @@ public: * This function is used by logOp() on a primary, since the ops in the oplog do not * necessarily commit in sequential order. */ - virtual void setMyLastOptimeForward(const OpTime& opTime) = 0; + virtual void setMyLastAppliedOpTimeForward(const OpTime& opTime) = 0; + + /** + * Updates our internal tracking of the last OpTime durable to this node, but only + * if the supplied optime is later than the current last OpTime known to the replication + * coordinator. + * + * This function is used by logOp() on a primary, since the ops in the oplog do not + * necessarily commit in sequential order. + */ + virtual void setMyLastDurableOpTimeForward(const OpTime& opTime) = 0; /** * Same as above, but used during places we need to zero our last optime. */ - virtual void resetMyLastOptime() = 0; + virtual void resetMyLastOpTimes() = 0; /** * Updates our the message we include in heartbeat responses. @@ -320,9 +342,14 @@ public: virtual void setMyHeartbeatMessage(const std::string& msg) = 0; /** - * Returns the last optime recorded by setMyLastOptime. + * Returns the last optime recorded by setMyLastAppliedOpTime. */ - virtual OpTime getMyLastOptime() const = 0; + virtual OpTime getMyLastAppliedOpTime() const = 0; + + /** + * Returns the last optime recorded by setMyLastDurableOpTime. + */ + virtual OpTime getMyLastDurableOpTime() const = 0; /** * Waits until the optime of the current node is at least the opTime specified in @@ -408,6 +435,7 @@ public: * * The returned bool indicates whether or not the command was created. */ + virtual bool prepareOldReplSetUpdatePositionCommand(BSONObjBuilder* cmdBuilder) = 0; virtual bool prepareReplSetUpdatePositionCommand(BSONObjBuilder* cmdBuilder) = 0; /** @@ -573,7 +601,12 @@ public: * were applied. * "configVersion" will be populated with our config version if and only if we return * InvalidReplicaSetConfig. + * + * The OldUpdatePositionArgs version provides support for the pre-3.2.2 format of + * UpdatePositionArgs. */ + virtual Status processReplSetUpdatePosition(const OldUpdatePositionArgs& updates, + long long* configVersion) = 0; virtual Status processReplSetUpdatePosition(const UpdatePositionArgs& updates, long long* configVersion) = 0; @@ -594,8 +627,9 @@ public: /** * Returns a vector of members that have applied the operation with OpTime 'op'. + * "durablyWritten" indicates whether the operation has to be durably applied. */ - virtual std::vector<HostAndPort> getHostsWrittenTo(const OpTime& op) = 0; + virtual std::vector<HostAndPort> getHostsWrittenTo(const OpTime& op, bool durablyWritten) = 0; /** * Returns a vector of the members other than ourself in the replica set, as specified in @@ -620,10 +654,10 @@ public: virtual Status checkReplEnabledForCommand(BSONObjBuilder* result) = 0; /** - * Loads the optime from the last op in the oplog into the coordinator's lastOpApplied - * value. + * Loads the optime from the last op in the oplog into the coordinator's lastAppliedOpTime and + * lastDurableOpTime values. */ - virtual void resetLastOpTimeFromOplog(OperationContext* txn) = 0; + virtual void resetLastOpTimesFromOplog(OperationContext* txn) = 0; /** * Returns the OpTime of the latest replica set-committed op known to this server. @@ -661,6 +695,12 @@ public: virtual bool isV1ElectionProtocol() = 0; /** + * Returns whether or not majority write concerns should implicitly journal, if j has not been + * explicitly set. + */ + virtual bool getWriteConcernMajorityShouldJournal() = 0; + + /** * Writes into 'output' all the information needed to generate a summary of the current * replication state for use by the web interface. */ @@ -739,6 +779,13 @@ public: */ virtual size_t getNumUncommittedSnapshots() = 0; + /** + * Returns a new WriteConcernOptions based on "wc" but with UNSET syncMode reset to JOURNAL or + * NONE based on our rsConfig. + */ + virtual WriteConcernOptions populateUnsetWriteConcernOptionsSyncMode( + WriteConcernOptions wc) = 0; + protected: ReplicationCoordinator(); }; diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp index b98004f05fc..76be96c4b3a 100644 --- a/src/mongo/db/repl/replication_coordinator_impl.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl.cpp @@ -47,6 +47,7 @@ #include "mongo/db/repl/handshake_args.h" #include "mongo/db/repl/is_master_response.h" #include "mongo/db/repl/last_vote.h" +#include "mongo/db/repl/old_update_position_args.h" #include "mongo/db/repl/read_concern_args.h" #include "mongo/db/repl/read_concern_response.h" #include "mongo/db/repl/repl_client_info.h" @@ -64,6 +65,7 @@ #include "mongo/db/repl/update_position_args.h" #include "mongo/db/repl/vote_requester.h" #include "mongo/db/server_options.h" +#include "mongo/db/write_concern.h" #include "mongo/db/write_concern_options.h" #include "mongo/executor/connection_pool_stats.h" #include "mongo/rpc/metadata/repl_set_metadata.h" @@ -161,17 +163,17 @@ DataReplicatorOptions createDataReplicatorOptions(ReplicationCoordinator* replCo options.applierFn = [](OperationContext*, const BSONObj&) -> Status { return Status::OK(); }; options.rollbackFn = [](OperationContext*, const OpTime&, const HostAndPort&) { return Status::OK(); }; - options.prepareReplSetUpdatePositionCommandFn = [replCoord]() -> StatusWith<BSONObj> { + options.prepareOldReplSetUpdatePositionCommandFn = [replCoord]() -> StatusWith<BSONObj> { BSONObjBuilder bob; - if (replCoord->prepareReplSetUpdatePositionCommand(&bob)) { + if (replCoord->prepareOldReplSetUpdatePositionCommand(&bob)) { return bob.obj(); } return Status(ErrorCodes::OperationFailed, "unable to prepare replSetUpdatePosition command object"); }; - options.getMyLastOptime = [replCoord]() { return replCoord->getMyLastOptime(); }; + options.getMyLastOptime = [replCoord]() { return replCoord->getMyLastAppliedOpTime(); }; options.setMyLastOptime = - [replCoord](const OpTime& opTime) { replCoord->setMyLastOptime(opTime); }; + [replCoord](const OpTime& opTime) { replCoord->setMyLastAppliedOpTime(opTime); }; options.setFollowerMode = [replCoord](const MemberState& newState) { return replCoord->setFollowerMode(newState); }; options.syncSourceSelector = replCoord; @@ -186,7 +188,8 @@ ReplicationCoordinatorImpl::ReplicationCoordinatorImpl( int64_t prngSeed, NetworkInterface* network, StorageInterface* storage, - ReplicationExecutor* replExec) + ReplicationExecutor* replExec, + stdx::function<bool()>* isDurableStorageEngineFn) : _settings(settings), _replMode(getReplicationModeFromSettings(settings)), _topCoord(topCoord), @@ -202,7 +205,10 @@ ReplicationCoordinatorImpl::ReplicationCoordinatorImpl( _sleptLastElection(false), _canAcceptNonLocalWrites(!(settings.usingReplSets() || settings.isSlave())), _canServeNonLocalReads(0U), - _dr(createDataReplicatorOptions(this), &_replExecutor) { + _dr(createDataReplicatorOptions(this), &_replExecutor), + _isDurableStorageEngine(isDurableStorageEngineFn ? *isDurableStorageEngineFn : []() -> bool { + return getGlobalServiceContext()->getGlobalStorageEngine()->isDurable(); + }) { if (!isReplEnabled()) { return; } @@ -228,16 +234,23 @@ ReplicationCoordinatorImpl::ReplicationCoordinatorImpl( TopologyCoordinator* topCoord, int64_t prngSeed) : ReplicationCoordinatorImpl( - settings, externalState, topCoord, prngSeed, network, storage, nullptr) {} + settings, externalState, topCoord, prngSeed, network, storage, nullptr, nullptr) {} ReplicationCoordinatorImpl::ReplicationCoordinatorImpl( const ReplSettings& settings, ReplicationCoordinatorExternalState* externalState, TopologyCoordinator* topCoord, ReplicationExecutor* replExec, - int64_t prngSeed) - : ReplicationCoordinatorImpl( - settings, externalState, topCoord, prngSeed, nullptr, nullptr, replExec) {} + int64_t prngSeed, + stdx::function<bool()>* isDurableStorageEngineFn) + : ReplicationCoordinatorImpl(settings, + externalState, + topCoord, + prngSeed, + nullptr, + nullptr, + replExec, + isDurableStorageEngineFn) {} ReplicationCoordinatorImpl::~ReplicationCoordinatorImpl() {} @@ -404,7 +417,8 @@ void ReplicationCoordinatorImpl::_finishLoadLocalConfig( invariant(_rsConfigState == kConfigStartingUp); const PostMemberStateUpdateAction action = _setCurrentRSConfig_inlock(cbData, localConfig, myIndex.getValue()); - _setMyLastOptimeAndReport_inlock(&lk, lastOpTime, false); + _setMyLastAppliedOpTime_inlock(lastOpTime, false); + _setMyLastDurableOpTimeAndReport_inlock(&lk, lastOpTime, false); _externalState->setGlobalTimestamp(lastOpTime.getTimestamp()); // Step down is impossible, so we don't need to wait for the returned event. _updateTerm_incallback(term); @@ -741,9 +755,26 @@ void ReplicationCoordinatorImpl::_addSlaveInfo_inlock(const SlaveInfo& slaveInfo _wakeReadyWaiters_inlock(); } -void ReplicationCoordinatorImpl::_updateSlaveInfoOptime_inlock(SlaveInfo* slaveInfo, - const OpTime& opTime) { - slaveInfo->opTime = opTime; +void ReplicationCoordinatorImpl::_updateSlaveInfoAppliedOpTime_inlock(SlaveInfo* slaveInfo, + const OpTime& opTime) { + slaveInfo->lastAppliedOpTime = opTime; + slaveInfo->lastUpdate = _replExecutor.now(); + slaveInfo->down = false; + + // Wake up any threads waiting for replication that now have their replication + // check satisfied + _wakeReadyWaiters_inlock(); +} + +void ReplicationCoordinatorImpl::_updateSlaveInfoDurableOpTime_inlock(SlaveInfo* slaveInfo, + const OpTime& opTime) { + // lastAppliedOpTime cannot be behind lastDurableOpTime. + if (slaveInfo->lastAppliedOpTime < opTime) { + log() << "Durable progress is ahead of the applied progress. This is likely due to a " + "rollback."; + return; + } + slaveInfo->lastDurableOpTime = opTime; slaveInfo->lastUpdate = _replExecutor.now(); slaveInfo->down = false; @@ -824,13 +855,13 @@ Status ReplicationCoordinatorImpl::setLastOptimeForSlave(const OID& rid, const T OpTime opTime(ts, OpTime::kUninitializedTerm); SlaveInfo* slaveInfo = _findSlaveInfoByRID_inlock(rid); if (slaveInfo) { - if (slaveInfo->opTime < opTime) { - _updateSlaveInfoOptime_inlock(slaveInfo, opTime); + if (slaveInfo->lastAppliedOpTime < opTime) { + _updateSlaveInfoAppliedOpTime_inlock(slaveInfo, opTime); } } else { SlaveInfo newSlaveInfo; newSlaveInfo.rid = rid; - newSlaveInfo.opTime = opTime; + newSlaveInfo.lastAppliedOpTime = opTime; _addSlaveInfo_inlock(newSlaveInfo); } return Status::OK(); @@ -841,28 +872,41 @@ void ReplicationCoordinatorImpl::setMyHeartbeatMessage(const std::string& msg) { &TopologyCoordinator::setMyHeartbeatMessage, _topCoord.get(), _replExecutor.now(), msg)); } -void ReplicationCoordinatorImpl::setMyLastOptimeForward(const OpTime& opTime) { +void ReplicationCoordinatorImpl::setMyLastAppliedOpTimeForward(const OpTime& opTime) { + stdx::unique_lock<stdx::mutex> lock(_mutex); + if (opTime > _getMyLastAppliedOpTime_inlock()) { + _setMyLastAppliedOpTimeAndReport_inlock(&lock, opTime, false); + } +} + +void ReplicationCoordinatorImpl::setMyLastDurableOpTimeForward(const OpTime& opTime) { stdx::unique_lock<stdx::mutex> lock(_mutex); - if (opTime > _getMyLastOptime_inlock()) { - _setMyLastOptimeAndReport_inlock(&lock, opTime, false); + if (opTime > _getMyLastDurableOpTime_inlock()) { + _setMyLastDurableOpTimeAndReport_inlock(&lock, opTime, false); } } -void ReplicationCoordinatorImpl::setMyLastOptime(const OpTime& opTime) { +void ReplicationCoordinatorImpl::setMyLastAppliedOpTime(const OpTime& opTime) { + stdx::unique_lock<stdx::mutex> lock(_mutex); + _setMyLastAppliedOpTimeAndReport_inlock(&lock, opTime, false); +} + +void ReplicationCoordinatorImpl::setMyLastDurableOpTime(const OpTime& opTime) { stdx::unique_lock<stdx::mutex> lock(_mutex); - _setMyLastOptimeAndReport_inlock(&lock, opTime, false); + _setMyLastDurableOpTimeAndReport_inlock(&lock, opTime, false); } -void ReplicationCoordinatorImpl::resetMyLastOptime() { +void ReplicationCoordinatorImpl::resetMyLastOpTimes() { stdx::unique_lock<stdx::mutex> lock(_mutex); // Reset to uninitialized OpTime - _setMyLastOptimeAndReport_inlock(&lock, OpTime(), true); + _setMyLastAppliedOpTime_inlock(OpTime(), true); + _setMyLastDurableOpTimeAndReport_inlock(&lock, OpTime(), true); } -void ReplicationCoordinatorImpl::_setMyLastOptimeAndReport_inlock( +void ReplicationCoordinatorImpl::_setMyLastAppliedOpTimeAndReport_inlock( stdx::unique_lock<stdx::mutex>* lock, const OpTime& opTime, bool isRollbackAllowed) { invariant(lock->owns_lock()); - _setMyLastOptime_inlock(opTime, isRollbackAllowed); + _setMyLastAppliedOpTime_inlock(opTime, isRollbackAllowed); if (getReplicationMode() != modeReplSet) { return; @@ -877,11 +921,29 @@ void ReplicationCoordinatorImpl::_setMyLastOptimeAndReport_inlock( _externalState->forwardSlaveProgress(); // Must do this outside _mutex } -void ReplicationCoordinatorImpl::_setMyLastOptime_inlock(const OpTime& opTime, - bool isRollbackAllowed) { +void ReplicationCoordinatorImpl::_setMyLastDurableOpTimeAndReport_inlock( + stdx::unique_lock<stdx::mutex>* lock, const OpTime& opTime, bool isRollbackAllowed) { + invariant(lock->owns_lock()); + _setMyLastDurableOpTime_inlock(opTime, isRollbackAllowed); + + if (getReplicationMode() != modeReplSet) { + return; + } + + if (_getMemberState_inlock().primary()) { + return; + } + + lock->unlock(); + + _externalState->forwardSlaveProgress(); // Must do this outside _mutex +} + +void ReplicationCoordinatorImpl::_setMyLastAppliedOpTime_inlock(const OpTime& opTime, + bool isRollbackAllowed) { SlaveInfo* mySlaveInfo = &_slaveInfo[_getMyIndexInSlaveInfo_inlock()]; - invariant(isRollbackAllowed || mySlaveInfo->opTime <= opTime); - _updateSlaveInfoOptime_inlock(mySlaveInfo, opTime); + invariant(isRollbackAllowed || mySlaveInfo->lastAppliedOpTime <= opTime); + _updateSlaveInfoAppliedOpTime_inlock(mySlaveInfo, opTime); for (auto& opTimeWaiter : _opTimeWaiterList) { if (*(opTimeWaiter->opTime) <= opTime) { @@ -890,9 +952,27 @@ void ReplicationCoordinatorImpl::_setMyLastOptime_inlock(const OpTime& opTime, } } -OpTime ReplicationCoordinatorImpl::getMyLastOptime() const { +void ReplicationCoordinatorImpl::_setMyLastDurableOpTime_inlock(const OpTime& opTime, + bool isRollbackAllowed) { + SlaveInfo* mySlaveInfo = &_slaveInfo[_getMyIndexInSlaveInfo_inlock()]; + invariant(isRollbackAllowed || mySlaveInfo->lastDurableOpTime <= opTime); + // lastAppliedOpTime cannot be behind lastDurableOpTime. + if (mySlaveInfo->lastAppliedOpTime < opTime) { + log() << "Durable progress is ahead of the applied progress. This is likely due to a " + "rollback."; + return; + } + _updateSlaveInfoDurableOpTime_inlock(mySlaveInfo, opTime); +} + +OpTime ReplicationCoordinatorImpl::getMyLastAppliedOpTime() const { + stdx::lock_guard<stdx::mutex> lock(_mutex); + return _getMyLastAppliedOpTime_inlock(); +} + +OpTime ReplicationCoordinatorImpl::getMyLastDurableOpTime() const { stdx::lock_guard<stdx::mutex> lock(_mutex); - return _getMyLastOptime_inlock(); + return _getMyLastDurableOpTime_inlock(); } ReadConcernResponse ReplicationCoordinatorImpl::waitUntilOpTime(OperationContext* txn, @@ -933,7 +1013,7 @@ ReadConcernResponse ReplicationCoordinatorImpl::waitUntilOpTime(OperationContext auto loopCondition = [this, isMajorityReadConcern, targetOpTime] { return isMajorityReadConcern ? !_currentCommittedSnapshot || targetOpTime > _currentCommittedSnapshot->opTime - : targetOpTime > _getMyLastOptime_inlock(); + : targetOpTime > _getMyLastAppliedOpTime_inlock(); }; while (loopCondition()) { @@ -950,6 +1030,9 @@ ReadConcernResponse ReplicationCoordinatorImpl::waitUntilOpTime(OperationContext stdx::condition_variable condVar; WriteConcernOptions writeConcern; writeConcern.wMode = WriteConcernOptions::kMajority; + writeConcern.syncMode = getWriteConcernMajorityShouldJournal_inlock() + ? WriteConcernOptions::SyncMode::JOURNAL + : WriteConcernOptions::SyncMode::NONE; WaiterInfo waitInfo(isMajorityReadConcern ? &_replicationWaiterList : &_opTimeWaiterList, txn->getOpID(), @@ -967,25 +1050,111 @@ ReadConcernResponse ReplicationCoordinatorImpl::waitUntilOpTime(OperationContext return ReadConcernResponse(Status::OK(), Milliseconds(timer.millis())); } -OpTime ReplicationCoordinatorImpl::_getMyLastOptime_inlock() const { - return _slaveInfo[_getMyIndexInSlaveInfo_inlock()].opTime; +OpTime ReplicationCoordinatorImpl::_getMyLastAppliedOpTime_inlock() const { + return _slaveInfo[_getMyIndexInSlaveInfo_inlock()].lastAppliedOpTime; +} + +OpTime ReplicationCoordinatorImpl::_getMyLastDurableOpTime_inlock() const { + return _slaveInfo[_getMyIndexInSlaveInfo_inlock()].lastDurableOpTime; } -Status ReplicationCoordinatorImpl::setLastOptime_forTest(long long cfgVer, - long long memberId, - const OpTime& opTime) { +Status ReplicationCoordinatorImpl::setLastDurableOptime_forTest(long long cfgVer, + long long memberId, + const OpTime& opTime) { stdx::lock_guard<stdx::mutex> lock(_mutex); invariant(getReplicationMode() == modeReplSet); - const UpdatePositionArgs::UpdateInfo update(OID(), opTime, cfgVer, memberId); + const UpdatePositionArgs::UpdateInfo update(OpTime(), opTime, cfgVer, memberId); long long configVersion; return _setLastOptime_inlock(update, &configVersion); } +Status ReplicationCoordinatorImpl::setLastAppliedOptime_forTest(long long cfgVer, + long long memberId, + const OpTime& opTime) { + stdx::lock_guard<stdx::mutex> lock(_mutex); + invariant(getReplicationMode() == modeReplSet); + + const UpdatePositionArgs::UpdateInfo update(opTime, OpTime(), cfgVer, memberId); + long long configVersion; + return _setLastOptime_inlock(update, &configVersion); +} + +Status ReplicationCoordinatorImpl::_setLastOptime_inlock( + const OldUpdatePositionArgs::UpdateInfo& args, long long* configVersion) { + if (_selfIndex == -1) { + // Ignore updates when we're in state REMOVED + return Status(ErrorCodes::NotMasterOrSecondary, + "Received replSetUpdatePosition command but we are in state REMOVED"); + } + invariant(getReplicationMode() == modeReplSet); + + if (args.memberId < 0) { + std::string errmsg = str::stream() + << "Received replSetUpdatePosition for node with memberId " << args.memberId + << " which is negative and therefore invalid"; + LOG(1) << errmsg; + return Status(ErrorCodes::NodeNotFound, errmsg); + } + + if (args.memberId == _rsConfig.getMemberAt(_selfIndex).getId()) { + // Do not let remote nodes tell us what our optime is. + return Status::OK(); + } + + LOG(2) << "received notification that node with memberID " << args.memberId + << " in config with version " << args.cfgver + << " has durably reached optime: " << args.ts; + + SlaveInfo* slaveInfo = NULL; + if (args.cfgver != _rsConfig.getConfigVersion()) { + std::string errmsg = str::stream() + << "Received replSetUpdatePosition for node with memberId " << args.memberId + << " whose config version of " << args.cfgver << " doesn't match our config version of " + << _rsConfig.getConfigVersion(); + LOG(1) << errmsg; + *configVersion = _rsConfig.getConfigVersion(); + return Status(ErrorCodes::InvalidReplicaSetConfig, errmsg); + } + + slaveInfo = _findSlaveInfoByMemberID_inlock(args.memberId); + if (!slaveInfo) { + invariant(!_rsConfig.findMemberByID(args.memberId)); + + std::string errmsg = str::stream() + << "Received replSetUpdatePosition for node with memberId " << args.memberId + << " which doesn't exist in our config"; + LOG(1) << errmsg; + return Status(ErrorCodes::NodeNotFound, errmsg); + } + + invariant(args.memberId == slaveInfo->memberId); + + LOG(3) << "Node with memberID " << args.memberId << " has durably applied operationss through " + << slaveInfo->lastDurableOpTime << " and has applied operations through " + << slaveInfo->lastAppliedOpTime << "; updating to new durable operation with timestamp " + << args.ts; + + // Only update remote optimes if they increase. + if (slaveInfo->lastAppliedOpTime < args.ts) { + _updateSlaveInfoAppliedOpTime_inlock(slaveInfo, args.ts); + } + if (slaveInfo->lastDurableOpTime < args.ts) { + _updateSlaveInfoDurableOpTime_inlock(slaveInfo, args.ts); + } + + + // Update liveness for this node. + slaveInfo->lastUpdate = _replExecutor.now(); + slaveInfo->down = false; + _cancelAndRescheduleLivenessUpdate_inlock(args.memberId); + return Status::OK(); +} + Status ReplicationCoordinatorImpl::_setLastOptime_inlock(const UpdatePositionArgs::UpdateInfo& args, long long* configVersion) { if (_selfIndex == -1) { - // Ignore updates when we're in state REMOVED + // Ignore updates when we're in state REMOVED. return Status(ErrorCodes::NotMasterOrSecondary, "Received replSetUpdatePosition command but we are in state REMOVED"); } @@ -1005,7 +1174,9 @@ Status ReplicationCoordinatorImpl::_setLastOptime_inlock(const UpdatePositionArg } LOG(2) << "received notification that node with memberID " << args.memberId - << " in config with version " << args.cfgver << " has reached optime: " << args.ts; + << " in config with version " << args.cfgver + << " has reached optime: " << args.appliedOpTime + << " and is durable through: " << args.durableOpTime; SlaveInfo* slaveInfo = NULL; if (args.cfgver != _rsConfig.getConfigVersion()) { @@ -1032,11 +1203,17 @@ Status ReplicationCoordinatorImpl::_setLastOptime_inlock(const UpdatePositionArg invariant(args.memberId == slaveInfo->memberId); LOG(3) << "Node with memberID " << args.memberId << " currently has optime " - << slaveInfo->opTime << "; updating to " << args.ts; + << slaveInfo->lastAppliedOpTime << " durable through " << slaveInfo->lastDurableOpTime + << "; updating to optime " << args.appliedOpTime << " and durable through " + << args.durableOpTime; + // Only update remote optimes if they increase. - if (slaveInfo->opTime < args.ts) { - _updateSlaveInfoOptime_inlock(slaveInfo, args.ts); + if (slaveInfo->lastAppliedOpTime < args.appliedOpTime) { + _updateSlaveInfoAppliedOpTime_inlock(slaveInfo, args.appliedOpTime); + } + if (slaveInfo->lastDurableOpTime < args.durableOpTime) { + _updateSlaveInfoDurableOpTime_inlock(slaveInfo, args.durableOpTime); } // Update liveness for this node. @@ -1092,17 +1269,22 @@ void ReplicationCoordinatorImpl::interruptAll() { bool ReplicationCoordinatorImpl::_doneWaitingForReplication_inlock( const OpTime& opTime, SnapshotName minSnapshot, const WriteConcernOptions& writeConcern) { + invariant(writeConcern.syncMode != WriteConcernOptions::SyncMode::UNSET); Status status = _checkIfWriteConcernCanBeSatisfied_inlock(writeConcern); if (!status.isOK()) { return true; } if (writeConcern.wMode.empty()) - return _haveNumNodesReachedOpTime_inlock(opTime, writeConcern.wNumNodes); + return _haveNumNodesReachedOpTime_inlock(opTime, + writeConcern.wNumNodes, + writeConcern.syncMode == + WriteConcernOptions::SyncMode::JOURNAL); StringData patternName; if (writeConcern.wMode == WriteConcernOptions::kMajority) { - if (_externalState->snapshotsEnabled()) { + if (writeConcern.syncMode == WriteConcernOptions::SyncMode::JOURNAL && + _externalState->snapshotsEnabled()) { if (!_currentCommittedSnapshot) { return false; } @@ -1119,20 +1301,26 @@ bool ReplicationCoordinatorImpl::_doneWaitingForReplication_inlock( if (!tagPattern.isOK()) { return true; } - return _haveTaggedNodesReachedOpTime_inlock(opTime, tagPattern.getValue()); + return _haveTaggedNodesReachedOpTime_inlock(opTime, + tagPattern.getValue(), + writeConcern.syncMode == + WriteConcernOptions::SyncMode::JOURNAL); } -bool ReplicationCoordinatorImpl::_haveNumNodesReachedOpTime_inlock(const OpTime& opTime, - int numNodes) { - if (_getMyLastOptime_inlock() < opTime) { - // Secondaries that are for some reason ahead of us should not allow us to - // satisfy a write concern if we aren't caught up ourselves. +bool ReplicationCoordinatorImpl::_haveNumNodesReachedOpTime_inlock(const OpTime& targetOpTime, + int numNodes, + bool durablyWritten) { + // Replication progress that is for some reason ahead of us should not allow us to + // satisfy a write concern if we aren't caught up ourselves. + OpTime myOpTime = + durablyWritten ? _getMyLastDurableOpTime_inlock() : _getMyLastAppliedOpTime_inlock(); + if (myOpTime < targetOpTime) { return false; } for (SlaveInfoVector::iterator it = _slaveInfo.begin(); it != _slaveInfo.end(); ++it) { - const OpTime& slaveTime = it->opTime; - if (slaveTime >= opTime) { + const OpTime& slaveTime = durablyWritten ? it->lastDurableOpTime : it->lastAppliedOpTime; + if (slaveTime >= targetOpTime) { --numNodes; } @@ -1144,10 +1332,10 @@ bool ReplicationCoordinatorImpl::_haveNumNodesReachedOpTime_inlock(const OpTime& } bool ReplicationCoordinatorImpl::_haveTaggedNodesReachedOpTime_inlock( - const OpTime& opTime, const ReplicaSetTagPattern& tagPattern) { + const OpTime& opTime, const ReplicaSetTagPattern& tagPattern, bool durablyWritten) { ReplicaSetTagMatch matcher(tagPattern); for (SlaveInfoVector::iterator it = _slaveInfo.begin(); it != _slaveInfo.end(); ++it) { - const OpTime& slaveTime = it->opTime; + const OpTime& slaveTime = durablyWritten ? it->lastDurableOpTime : it->lastAppliedOpTime; if (slaveTime >= opTime) { // This node has reached the desired optime, now we need to check if it is a part // of the tagPattern. @@ -1168,18 +1356,25 @@ bool ReplicationCoordinatorImpl::_haveTaggedNodesReachedOpTime_inlock( ReplicationCoordinator::StatusAndDuration ReplicationCoordinatorImpl::awaitReplication( OperationContext* txn, const OpTime& opTime, const WriteConcernOptions& writeConcern) { Timer timer; + WriteConcernOptions fixedWriteConcern = populateUnsetWriteConcernOptionsSyncMode(writeConcern); stdx::unique_lock<stdx::mutex> lock(_mutex); - return _awaitReplication_inlock(&timer, &lock, txn, opTime, SnapshotName::min(), writeConcern); + return _awaitReplication_inlock( + &timer, &lock, txn, opTime, SnapshotName::min(), fixedWriteConcern); } ReplicationCoordinator::StatusAndDuration ReplicationCoordinatorImpl::awaitReplicationOfLastOpForClient( OperationContext* txn, const WriteConcernOptions& writeConcern) { Timer timer; + WriteConcernOptions fixedWriteConcern = populateUnsetWriteConcernOptionsSyncMode(writeConcern); stdx::unique_lock<stdx::mutex> lock(_mutex); const auto& clientInfo = ReplClientInfo::forClient(txn->getClient()); - return _awaitReplication_inlock( - &timer, &lock, txn, clientInfo.getLastOp(), clientInfo.getLastSnapshot(), writeConcern); + return _awaitReplication_inlock(&timer, + &lock, + txn, + clientInfo.getLastOp(), + clientInfo.getLastSnapshot(), + fixedWriteConcern); } ReplicationCoordinator::StatusAndDuration ReplicationCoordinatorImpl::_awaitReplication_inlock( @@ -1214,7 +1409,7 @@ ReplicationCoordinator::StatusAndDuration ReplicationCoordinatorImpl::_awaitRepl if (writeConcern.wMode.empty()) { if (writeConcern.wNumNodes < 1) { return StatusAndDuration(Status::OK(), Milliseconds(timer->millis())); - } else if (writeConcern.wNumNodes == 1 && _getMyLastOptime_inlock() >= opTime) { + } else if (writeConcern.wNumNodes == 1 && _getMyLastAppliedOpTime_inlock() >= opTime) { return StatusAndDuration(Status::OK(), Milliseconds(timer->millis())); } } @@ -1403,7 +1598,7 @@ void ReplicationCoordinatorImpl::_stepDownContinue( return; } bool forceNow = now >= waitUntil ? force : false; - if (_topCoord->stepDown(stepDownUntil, forceNow, getMyLastOptime())) { + if (_topCoord->stepDown(stepDownUntil, forceNow, getMyLastAppliedOpTime())) { // Schedule work to (potentially) step back up once the stepdown period has ended. _replExecutor.scheduleWorkAt(stepDownUntil, stdx::bind(&ReplicationCoordinatorImpl::_handleTimePassing, @@ -1612,6 +1807,37 @@ int ReplicationCoordinatorImpl::_getMyId_inlock() const { bool ReplicationCoordinatorImpl::prepareReplSetUpdatePositionCommand(BSONObjBuilder* cmdBuilder) { stdx::lock_guard<stdx::mutex> lock(_mutex); invariant(_rsConfig.isInitialized()); + // Do not send updates if we have been removed from the config. + if (_selfIndex == -1) { + return false; + } + cmdBuilder->append("replSetUpdatePosition", 1); + // Create an array containing objects each live member connected to us and for ourself. + BSONArrayBuilder arrayBuilder(cmdBuilder->subarrayStart("optimes")); + for (SlaveInfoVector::iterator itr = _slaveInfo.begin(); itr != _slaveInfo.end(); ++itr) { + if (itr->lastAppliedOpTime.isNull()) { + // Don't include info on members we haven't heard from yet. + continue; + } + // Don't include members we think are down. + if (!itr->self && itr->down) { + continue; + } + + BSONObjBuilder entry(arrayBuilder.subobjStart()); + itr->lastDurableOpTime.append(&entry, "durableOpTime"); + itr->lastAppliedOpTime.append(&entry, "appliedOpTime"); + entry.append("memberId", itr->memberId); + entry.append("cfgver", _rsConfig.getConfigVersion()); + } + + return true; +} + +bool ReplicationCoordinatorImpl::prepareOldReplSetUpdatePositionCommand( + BSONObjBuilder* cmdBuilder) { + stdx::lock_guard<stdx::mutex> lock(_mutex); + invariant(_rsConfig.isInitialized()); // do not send updates if we have been removed from the config if (_selfIndex == -1) { return false; @@ -1620,7 +1846,7 @@ bool ReplicationCoordinatorImpl::prepareReplSetUpdatePositionCommand(BSONObjBuil // create an array containing objects each member connected to us and for ourself BSONArrayBuilder arrayBuilder(cmdBuilder->subarrayStart("optimes")); for (SlaveInfoVector::iterator itr = _slaveInfo.begin(); itr != _slaveInfo.end(); ++itr) { - if (itr->opTime.isNull()) { + if (itr->lastDurableOpTime.isNull()) { // Don't include info on members we haven't heard from yet. continue; } @@ -1632,9 +1858,9 @@ bool ReplicationCoordinatorImpl::prepareReplSetUpdatePositionCommand(BSONObjBuil BSONObjBuilder entry(arrayBuilder.subobjStart()); entry.append("_id", itr->rid); if (isV1ElectionProtocol()) { - itr->opTime.append(&entry, "optime"); + itr->lastDurableOpTime.append(&entry, "optime"); } else { - entry.append("optime", itr->opTime.getTimestamp()); + entry.append("optime", itr->lastDurableOpTime.getTimestamp()); } entry.append("memberId", itr->memberId); entry.append("cfgver", _rsConfig.getConfigVersion()); @@ -1645,21 +1871,15 @@ bool ReplicationCoordinatorImpl::prepareReplSetUpdatePositionCommand(BSONObjBuil Status ReplicationCoordinatorImpl::processReplSetGetStatus(BSONObjBuilder* response) { Status result(ErrorCodes::InternalError, "didn't set status in prepareStatusResponse"); - CBHStatus cbh = - _replExecutor.scheduleWork(stdx::bind(&TopologyCoordinator::prepareStatusResponse, - _topCoord.get(), - stdx::placeholders::_1, - _replExecutor.now(), - time(0) - serverGlobalParams.started, - getMyLastOptime(), - response, - &result)); - if (cbh.getStatus() == ErrorCodes::ShutdownInProgress) { - return Status(ErrorCodes::ShutdownInProgress, "replication shutdown in progress"); - } - fassert(18640, cbh.getStatus()); - _replExecutor.wait(cbh.getValue()); - + _scheduleWorkAndWaitForCompletion(stdx::bind(&TopologyCoordinator::prepareStatusResponse, + _topCoord.get(), + stdx::placeholders::_1, + _replExecutor.now(), + time(0) - serverGlobalParams.started, + getMyLastAppliedOpTime(), + getLastCommittedOpTime(), + response, + &result)); return result; } @@ -1704,11 +1924,11 @@ void ReplicationCoordinatorImpl::appendSlaveInfoData(BSONObjBuilder* result) { entry.append("rid", itr->rid); if (isV1ElectionProtocol()) { BSONObjBuilder opTime(entry.subobjStart("optime")); - opTime.append("ts", itr->opTime.getTimestamp()); - opTime.append("term", itr->opTime.getTerm()); + opTime.append("ts", itr->lastDurableOpTime.getTimestamp()); + opTime.append("term", itr->lastDurableOpTime.getTerm()); opTime.done(); } else { - entry.append("optime", itr->opTime.getTimestamp()); + entry.append("optime", itr->lastDurableOpTime.getTimestamp()); } entry.append("host", itr->hostAndPort.toString()); if (getReplicationMode() == modeReplSet) { @@ -1850,7 +2070,7 @@ Status ReplicationCoordinatorImpl::processReplSetSyncFrom(const HostAndPort& tar _topCoord.get(), stdx::placeholders::_1, target, - _getMyLastOptime_inlock(), + _getMyLastAppliedOpTime_inlock(), resultObj, &result)); if (cbh.getStatus() == ErrorCodes::ShutdownInProgress) { @@ -1939,8 +2159,12 @@ void ReplicationCoordinatorImpl::_processHeartbeatFinish( auto senderHost(args.getSenderHost()); const Date_t now = _replExecutor.now(); - *outStatus = _topCoord->prepareHeartbeatResponse( - now, args, _settings.ourSetName(), getMyLastOptime(), response); + *outStatus = _topCoord->prepareHeartbeatResponse(now, + args, + _settings.ourSetName(), + getMyLastAppliedOpTime(), + getMyLastDurableOpTime(), + response); if ((outStatus->isOK() || *outStatus == ErrorCodes::InvalidReplicaSetConfig) && _selfIndex < 0) { // If this node does not belong to the configuration it knows about, send heartbeats @@ -2409,7 +2633,8 @@ void ReplicationCoordinatorImpl::_processReplSetFresh_finish( return; } - _topCoord->prepareFreshResponse(args, _replExecutor.now(), getMyLastOptime(), response, result); + _topCoord->prepareFreshResponse( + args, _replExecutor.now(), getMyLastAppliedOpTime(), response, result); } Status ReplicationCoordinatorImpl::processReplSetElect(const ReplSetElectArgs& args, @@ -2440,7 +2665,8 @@ void ReplicationCoordinatorImpl::_processReplSetElect_finish( return; } - _topCoord->prepareElectResponse(args, _replExecutor.now(), getMyLastOptime(), response, result); + _topCoord->prepareElectResponse( + args, _replExecutor.now(), getMyLastAppliedOpTime(), response, result); } ReplicationCoordinatorImpl::PostMemberStateUpdateAction @@ -2453,7 +2679,7 @@ ReplicationCoordinatorImpl::_setCurrentRSConfig_inlock( _setConfigState_inlock(kConfigSteady); // Must get this before changing our config. - OpTime myOptime = _getMyLastOptime_inlock(); + OpTime myOptime = _getMyLastAppliedOpTime_inlock(); _topCoord->updateConfig(newConfig, myIndex, _replExecutor.now(), myOptime); _cachedTerm = _topCoord->getTerm(); const ReplicaSetConfig oldConfig = _rsConfig; @@ -2512,6 +2738,31 @@ void ReplicationCoordinatorImpl::_wakeReadyWaiters_inlock() { } } +Status ReplicationCoordinatorImpl::processReplSetUpdatePosition( + const OldUpdatePositionArgs& updates, long long* configVersion) { + stdx::unique_lock<stdx::mutex> lock(_mutex); + Status status = Status::OK(); + bool somethingChanged = false; + for (OldUpdatePositionArgs::UpdateIterator update = updates.updatesBegin(); + update != updates.updatesEnd(); + ++update) { + status = _setLastOptime_inlock(*update, configVersion); + if (!status.isOK()) { + break; + } + somethingChanged = true; + } + + if (somethingChanged && !_getMemberState_inlock().primary()) { + lock.unlock(); + // Must do this outside _mutex + // TODO: enable _dr, remove _externalState when DataReplicator is used excl. + //_dr.slavesHaveProgressed(); + _externalState->forwardSlaveProgress(); + } + return status; +} + Status ReplicationCoordinatorImpl::processReplSetUpdatePosition(const UpdatePositionArgs& updates, long long* configVersion) { stdx::unique_lock<stdx::mutex> lock(_mutex); @@ -2572,19 +2823,25 @@ bool ReplicationCoordinatorImpl::buildsIndexes() { return self.shouldBuildIndexes(); } -std::vector<HostAndPort> ReplicationCoordinatorImpl::getHostsWrittenTo(const OpTime& op) { +std::vector<HostAndPort> ReplicationCoordinatorImpl::getHostsWrittenTo(const OpTime& op, + bool durablyWritten) { std::vector<HostAndPort> hosts; stdx::lock_guard<stdx::mutex> lk(_mutex); for (size_t i = 0; i < _slaveInfo.size(); ++i) { const SlaveInfo& slaveInfo = _slaveInfo[i]; - if (slaveInfo.opTime < op) { + if (getReplicationMode() == modeMasterSlave && slaveInfo.rid == _getMyRID_inlock()) { + // Master-slave doesn't know the HostAndPort for itself at this point. continue; } - if (getReplicationMode() == modeMasterSlave && slaveInfo.rid == _getMyRID_inlock()) { - // Master-slave doesn't know the HostAndPort for itself at this point. + if (durablyWritten) { + if (slaveInfo.lastDurableOpTime < op) { + continue; + } + } else if (slaveInfo.lastAppliedOpTime < op) { continue; } + hosts.push_back(slaveInfo.hostAndPort); } return hosts; @@ -2739,7 +2996,7 @@ void ReplicationCoordinatorImpl::blacklistSyncSource(const HostAndPort& host, Da _replExecutor.wait(cbh.getValue()); } -void ReplicationCoordinatorImpl::resetLastOpTimeFromOplog(OperationContext* txn) { +void ReplicationCoordinatorImpl::resetLastOpTimesFromOplog(OperationContext* txn) { StatusWith<OpTime> lastOpTimeStatus = _externalState->loadLastOpTime(txn); OpTime lastOpTime; if (!lastOpTimeStatus.isOK()) { @@ -2748,8 +3005,10 @@ void ReplicationCoordinatorImpl::resetLastOpTimeFromOplog(OperationContext* txn) } else { lastOpTime = lastOpTimeStatus.getValue(); } + stdx::unique_lock<stdx::mutex> lk(_mutex); - _setMyLastOptimeAndReport_inlock(&lk, lastOpTime, true); + _setMyLastAppliedOpTime_inlock(lastOpTime, true); + _setMyLastDurableOpTimeAndReport_inlock(&lk, lastOpTime, true); _externalState->setGlobalTimestamp(lastOpTime.getTimestamp()); } @@ -2764,7 +3023,7 @@ void ReplicationCoordinatorImpl::_shouldChangeSyncSource( } *shouldChange = _topCoord->shouldChangeSyncSource(currentSource, - getMyLastOptime(), + getMyLastAppliedOpTime(), syncSourceLastOpTime, syncSourceHasSyncSource, _replExecutor.now()); @@ -2801,7 +3060,7 @@ void ReplicationCoordinatorImpl::_updateLastCommittedOpTime_inlock() { auto memberConfig = _rsConfig.findMemberByID(sI.memberId); invariant(memberConfig); if (memberConfig->isVoter()) { - votingNodesOpTimes.push_back(sI.opTime); + votingNodesOpTimes.push_back(sI.lastDurableOpTime); } } @@ -2823,8 +3082,9 @@ void ReplicationCoordinatorImpl::_setLastCommittedOpTime(const OpTime& committed } void ReplicationCoordinatorImpl::_setLastCommittedOpTime_inlock(const OpTime& committedOpTime) { - if (committedOpTime <= _lastCommittedOpTime) + if (committedOpTime <= _lastCommittedOpTime) { return; // This may have come from an out-of-order heartbeat. Ignore it. + } // This check is performed to ensure primaries do not commit an OpTime from a previous term. if (_getMemberState_inlock().primary() && committedOpTime < _firstOpTimeOfMyTerm) { @@ -2832,7 +3092,8 @@ void ReplicationCoordinatorImpl::_setLastCommittedOpTime_inlock(const OpTime& co } if (_getMemberState_inlock().arbiter()) { - _setMyLastOptime_inlock(committedOpTime, false); + _setMyLastAppliedOpTime_inlock(committedOpTime, false); + _setMyLastDurableOpTime_inlock(committedOpTime, false); } _lastCommittedOpTime = committedOpTime; @@ -2840,6 +3101,7 @@ void ReplicationCoordinatorImpl::_setLastCommittedOpTime_inlock(const OpTime& co _externalState->notifyOplogMetadataWaiters(); auto maxSnapshotForOpTime = SnapshotInfo{committedOpTime, SnapshotName::max()}; + if (!_uncommittedSnapshots.empty() && _uncommittedSnapshots.front() <= maxSnapshotForOpTime) { // At least one uncommitted snapshot is ready to be blessed as committed. @@ -2920,7 +3182,7 @@ void ReplicationCoordinatorImpl::_processReplSetRequestVotes_finish( } stdx::unique_lock<stdx::mutex> lk(_mutex); - _topCoord->processReplSetRequestVotes(args, response, _getMyLastOptime_inlock()); + _topCoord->processReplSetRequestVotes(args, response, _getMyLastAppliedOpTime_inlock()); *result = Status::OK(); } @@ -2991,13 +3253,21 @@ void ReplicationCoordinatorImpl::_prepareReplResponseMetadata_finish( rpc::ReplSetMetadata* metadata) { OpTime lastReadableOpTime = getCurrentCommittedSnapshotOpTime(); OpTime lastVisibleOpTime = std::max(lastOpTimeFromClient, lastReadableOpTime); - _topCoord->prepareReplResponseMetadata(metadata, lastVisibleOpTime, getLastCommittedOpTime()); + _topCoord->prepareReplResponseMetadata(metadata, lastVisibleOpTime, _lastCommittedOpTime); } bool ReplicationCoordinatorImpl::isV1ElectionProtocol() { return _protVersion.load() == 1; } +bool ReplicationCoordinatorImpl::getWriteConcernMajorityShouldJournal() { + return getConfig().getWriteConcernMajorityShouldJournal(); +} + +bool ReplicationCoordinatorImpl::getWriteConcernMajorityShouldJournal_inlock() const { + return _rsConfig.getWriteConcernMajorityShouldJournal(); +} + Status ReplicationCoordinatorImpl::processHeartbeatV1(const ReplSetHeartbeatArgsV1& args, ReplSetHeartbeatResponse* response) { { @@ -3038,8 +3308,12 @@ void ReplicationCoordinatorImpl::_processHeartbeatFinishV1( auto senderHost(args.getSenderHost()); const Date_t now = _replExecutor.now(); - *outStatus = _topCoord->prepareHeartbeatResponseV1( - now, args, _settings.ourSetName(), getMyLastOptime(), response); + *outStatus = _topCoord->prepareHeartbeatResponseV1(now, + args, + _settings.ourSetName(), + getMyLastAppliedOpTime(), + getMyLastDurableOpTime(), + response); if ((outStatus->isOK() || *outStatus == ErrorCodes::InvalidReplicaSetConfig) && _selfIndex < 0) { @@ -3089,7 +3363,8 @@ void ReplicationCoordinatorImpl::_summarizeAsHtml_finish(const CallbackArgs& cbD return; } - output->setSelfOptime(getMyLastOptime()); + // TODO(dannenberg) consider putting both optimes into the htmlsummary. + output->setSelfOptime(getMyLastAppliedOpTime()); output->setSelfUptime(time(0) - serverGlobalParams.started); output->setNow(_replExecutor.now()); @@ -3394,5 +3669,19 @@ void ReplicationCoordinatorImpl::_scheduleElectionWinNotification() { } } +WriteConcernOptions ReplicationCoordinatorImpl::populateUnsetWriteConcernOptionsSyncMode( + WriteConcernOptions wc) { + WriteConcernOptions writeConcern(wc); + if (writeConcern.syncMode == WriteConcernOptions::SyncMode::UNSET) { + if (writeConcern.wMode == WriteConcernOptions::kMajority && _isDurableStorageEngine() && + getWriteConcernMajorityShouldJournal()) { + writeConcern.syncMode = WriteConcernOptions::SyncMode::JOURNAL; + } else { + writeConcern.syncMode = WriteConcernOptions::SyncMode::NONE; + } + } + return writeConcern; +} + } // namespace repl } // namespace mongo diff --git a/src/mongo/db/repl/replication_coordinator_impl.h b/src/mongo/db/repl/replication_coordinator_impl.h index 8252ac53412..c5cde2742bf 100644 --- a/src/mongo/db/repl/replication_coordinator_impl.h +++ b/src/mongo/db/repl/replication_coordinator_impl.h @@ -37,6 +37,7 @@ #include "mongo/db/concurrency/d_concurrency.h" #include "mongo/db/repl/data_replicator.h" #include "mongo/db/repl/member_state.h" +#include "mongo/db/repl/old_update_position_args.h" #include "mongo/db/repl/optime.h" #include "mongo/db/repl/replica_set_config.h" #include "mongo/db/repl/replication_coordinator.h" @@ -104,7 +105,8 @@ public: ReplicationCoordinatorExternalState* externalState, TopologyCoordinator* topoCoord, ReplicationExecutor* replExec, - int64_t prngSeed); + int64_t prngSeed, + stdx::function<bool()>* isDurableStorageEngineFn); virtual ~ReplicationCoordinatorImpl(); // ================== Members of public ReplicationCoordinator API =================== @@ -170,15 +172,18 @@ public: virtual Status setLastOptimeForSlave(const OID& rid, const Timestamp& ts); - virtual void setMyLastOptime(const OpTime& opTime); + virtual void setMyLastAppliedOpTime(const OpTime& opTime); + virtual void setMyLastDurableOpTime(const OpTime& opTime); - virtual void setMyLastOptimeForward(const OpTime& opTime); + virtual void setMyLastAppliedOpTimeForward(const OpTime& opTime); + virtual void setMyLastDurableOpTimeForward(const OpTime& opTime); - virtual void resetMyLastOptime(); + virtual void resetMyLastOpTimes(); virtual void setMyHeartbeatMessage(const std::string& msg); - virtual OpTime getMyLastOptime() const override; + virtual OpTime getMyLastAppliedOpTime() const override; + virtual OpTime getMyLastDurableOpTime() const override; virtual ReadConcernResponse waitUntilOpTime(OperationContext* txn, const ReadConcernArgs& settings) override; @@ -199,6 +204,7 @@ public: virtual void signalUpstreamUpdater() override; + virtual bool prepareOldReplSetUpdatePositionCommand(BSONObjBuilder* cmdBuilder) override; virtual bool prepareReplSetUpdatePositionCommand(BSONObjBuilder* cmdBuilder) override; virtual Status processReplSetGetStatus(BSONObjBuilder* result) override; @@ -245,6 +251,8 @@ public: virtual Status processReplSetElect(const ReplSetElectArgs& args, BSONObjBuilder* response) override; + virtual Status processReplSetUpdatePosition(const OldUpdatePositionArgs& updates, + long long* configVersion) override; virtual Status processReplSetUpdatePosition(const UpdatePositionArgs& updates, long long* configVersion) override; @@ -252,7 +260,8 @@ public: virtual bool buildsIndexes() override; - virtual std::vector<HostAndPort> getHostsWrittenTo(const OpTime& op) override; + virtual std::vector<HostAndPort> getHostsWrittenTo(const OpTime& op, + bool durablyWritten) override; virtual std::vector<HostAndPort> getOtherNodesInReplSet() const override; @@ -266,7 +275,7 @@ public: virtual void blacklistSyncSource(const HostAndPort& host, Date_t until) override; - virtual void resetLastOpTimeFromOplog(OperationContext* txn) override; + virtual void resetLastOpTimesFromOplog(OperationContext* txn) override; virtual bool shouldChangeSyncSource(const HostAndPort& currentSource, const OpTime& syncSourceLastOpTime, @@ -290,6 +299,8 @@ public: virtual bool isV1ElectionProtocol() override; + virtual bool getWriteConcernMajorityShouldJournal() override; + virtual void summarizeAsHtml(ReplSetHtmlSummary* s) override; virtual void dropAllSnapshots() override; @@ -315,6 +326,9 @@ public: virtual size_t getNumUncommittedSnapshots() override; + virtual WriteConcernOptions populateUnsetWriteConcernOptionsSyncMode( + WriteConcernOptions wc) override; + // ================== Test support API =================== /** @@ -341,9 +355,10 @@ public: Date_t getPriorityTakeover_forTest() const; /** - * Simple wrapper around _setLastOptime_inlock to make it easier to test. + * Simple wrappers around _setLastOptime_inlock to make it easier to test. */ - Status setLastOptime_forTest(long long cfgVer, long long memberId, const OpTime& opTime); + Status setLastAppliedOptime_forTest(long long cfgVer, long long memberId, const OpTime& opTime); + Status setLastDurableOptime_forTest(long long cfgVer, long long memberId, const OpTime& opTime); /** * Non-blocking version of stepDown. @@ -436,7 +451,8 @@ private: int64_t prngSeed, executor::NetworkInterface* network, StorageInterface* storage, - ReplicationExecutor* replExec); + ReplicationExecutor* replExec, + stdx::function<bool()>* isDurableStorageEngineFn); /** * Configuration states for a replica set node. * @@ -485,7 +501,10 @@ private: // Struct that holds information about nodes in this replication group, mainly used for // tracking replication progress for write concern satisfaction. struct SlaveInfo { - OpTime opTime; // Our last known OpTime that this slave has replicated to. + // Our last known OpTime that this slave has applied and journaled to. + OpTime lastDurableOpTime; + // Our last known OpTime that this slave has applied, whether journaled or unjournaled. + OpTime lastAppliedOpTime; HostAndPort hostAndPort; // Client address of the slave. int memberId = -1; // Id of the node in the replica set config, or -1 if we're not a replSet. @@ -519,11 +538,18 @@ private: void _addSlaveInfo_inlock(const SlaveInfo& slaveInfo); /** - * Updates the item in _slaveInfo pointed to by 'slaveInfo' with the given OpTime 'opTime' - * and wakes up any threads waiting for replication that now have their write concern - * satisfied. + * Updates the durableOpTime field on the item in _slaveInfo pointed to by 'slaveInfo' with the + * given OpTime 'opTime' and wakes up any threads waiting for replication that now have their + * write concern satisfied. */ - void _updateSlaveInfoOptime_inlock(SlaveInfo* slaveInfo, const OpTime& opTime); + void _updateSlaveInfoDurableOpTime_inlock(SlaveInfo* slaveInfo, const OpTime& opTime); + + /** + * Updates the appliedOpTime field on the item in _slaveInfo pointed to by 'slaveInfo' with the + * given OpTime 'opTime' and wakes up any threads waiting for replication that now have their + * write concern satisfied. + */ + void _updateSlaveInfoAppliedOpTime_inlock(SlaveInfo* slaveInfo, const OpTime& opTime); /** * Returns the index into _slaveInfo where data corresponding to ourself is stored. @@ -533,6 +559,11 @@ private: size_t _getMyIndexInSlaveInfo_inlock() const; /** + * Returns the _writeConcernMajorityJournalDefault of our current _rsConfig. + */ + bool getWriteConcernMajorityShouldJournal_inlock() const; + + /** * Helper method that removes entries from _slaveInfo if they correspond to a node * with a member ID that is not in the current replica set config. Will always leave an * entry for ourself at the beginning of _slaveInfo, even if we aren't present in the @@ -665,15 +696,18 @@ private: /** * Helper for _doneWaitingForReplication_inlock that takes an integer write concern. + * "durablyWritten" indicates whether the operation has to be durably applied. */ - bool _haveNumNodesReachedOpTime_inlock(const OpTime& opTime, int numNodes); + bool _haveNumNodesReachedOpTime_inlock(const OpTime& opTime, int numNodes, bool durablyWritten); /** * Helper for _doneWaitingForReplication_inlock that takes a tag pattern representing a * named write concern mode. + * "durablyWritten" indicates whether the operation has to be durably applied. */ bool _haveTaggedNodesReachedOpTime_inlock(const OpTime& opTime, - const ReplicaSetTagPattern& tagPattern); + const ReplicaSetTagPattern& tagPattern, + bool durablyWritten); Status _checkIfWriteConcernCanBeSatisfied_inlock(const WriteConcernOptions& writeConcern) const; @@ -702,7 +736,8 @@ private: int _getMyId_inlock() const; - OpTime _getMyLastOptime_inlock() const; + OpTime _getMyLastAppliedOpTime_inlock() const; + OpTime _getMyLastDurableOpTime_inlock() const; /** * Bottom half of setFollowerMode. @@ -722,24 +757,44 @@ private: * This is only valid to call on replica sets. * "configVersion" will be populated with our config version if it and the configVersion * of "args" differ. + * + * The OldUpdatePositionArgs version provides support for the pre-3.2.2 format of + * UpdatePositionArgs. */ + Status _setLastOptime_inlock(const OldUpdatePositionArgs::UpdateInfo& args, + long long* configVersion); Status _setLastOptime_inlock(const UpdatePositionArgs::UpdateInfo& args, long long* configVersion); /** - * Helper method for setMyLastOptime that takes in a unique lock on + * Helper method for setMyLastAppliedOptime that takes in a unique lock on + * _mutex. The passed in lock must already be locked. It is unspecified what state the + * lock will be in after this method finishes. + * + * This function has the same rules for "opTime" as setMyLastAppliedOptime(), unless + * "isRollbackAllowed" is true. + * + * This function will also report our position externally (like upstream) if necessary. + */ + void _setMyLastAppliedOpTimeAndReport_inlock(stdx::unique_lock<stdx::mutex>* lock, + const OpTime& opTime, + bool isRollbackAllowed); + void _setMyLastAppliedOpTime_inlock(const OpTime& opTime, bool isRollbackAllowed); + + /** + * Helper method for setMyLastDurableOptime that takes in a unique lock on * _mutex. The passed in lock must already be locked. It is unspecified what state the * lock will be in after this method finishes. * - * This function has the same rules for "opTime" as setMyLastOptime(), unless + * This function has the same rules for "opTime" as setMyLastDurableOptime(), unless * "isRollbackAllowed" is true. * * This function will also report our position externally (like upstream) if necessary. */ - void _setMyLastOptimeAndReport_inlock(stdx::unique_lock<stdx::mutex>* lock, - const OpTime& opTime, - bool isRollbackAllowed); - void _setMyLastOptime_inlock(const OpTime& opTime, bool isRollbackAllowed); + void _setMyLastDurableOpTimeAndReport_inlock(stdx::unique_lock<stdx::mutex>* lock, + const OpTime& opTime, + bool isRollbackAllowed); + void _setMyLastDurableOpTime_inlock(const OpTime& opTime, bool isRollbackAllowed); /** * Schedules a heartbeat to be sent to "target" at "when". "targetIndex" is the index @@ -766,9 +821,12 @@ private: /** * Helper for _handleHeartbeatResponse. * - * Updates the optime associated with the member at "memberIndex" in our config. + * Updates the lastDurableOpTime and lastAppliedOpTime associated with the member at + * "memberIndex" in our config. */ - void _updateOpTimeFromHeartbeat_inlock(int memberIndex, const OpTime& optime); + void _updateOpTimesFromHeartbeat_inlock(int targetIndex, + const OpTime& durableOpTime, + const OpTime& appliedOpTime); /** * Starts a heartbeat for each member in the current config. Called within the executor @@ -1235,8 +1293,7 @@ private: // TODO: ideally this should only change on rollbacks NOT on mongod restarts also. int _rbid; // (M) - // list of information about clients waiting on replication. Does *not* own the - // WaiterInfos. + // list of information about clients waiting on replication. Does *not* own the WaiterInfos. std::vector<WaiterInfo*> _replicationWaiterList; // (M) // list of information about clients waiting for a particular opTime. @@ -1391,6 +1448,9 @@ private: // Cached copy of the current config protocol version. AtomicInt64 _protVersion; // (S) + + // Lambda indicating durability of storageEngine. + stdx::function<bool()> _isDurableStorageEngine; // (R) }; } // namespace repl diff --git a/src/mongo/db/repl/replication_coordinator_impl_elect.cpp b/src/mongo/db/repl/replication_coordinator_impl_elect.cpp index c2105a1007a..e1109d0cc4d 100644 --- a/src/mongo/db/repl/replication_coordinator_impl_elect.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl_elect.cpp @@ -121,7 +121,7 @@ void ReplicationCoordinatorImpl::_startElectSelf() { invariant(_rsConfig.getMemberAt(_selfIndex).isElectable()); - OpTime lastOpTimeApplied(_getMyLastOptime_inlock()); + OpTime lastOpTimeApplied(_getMyLastAppliedOpTime_inlock()); if (lastOpTimeApplied.isNull()) { log() << "not trying to elect self, " @@ -275,7 +275,7 @@ void ReplicationCoordinatorImpl::_recoverFromElectionTie( return; } auto now = _replExecutor.now(); - auto lastOpApplied = getMyLastOptime(); + auto lastOpApplied = getMyLastAppliedOpTime(); if (_topCoord->checkShouldStandForElection(now, lastOpApplied)) { fassert(28817, _topCoord->becomeCandidateIfElectable(now, lastOpApplied)); _startElectSelf(); diff --git a/src/mongo/db/repl/replication_coordinator_impl_elect_test.cpp b/src/mongo/db/repl/replication_coordinator_impl_elect_test.cpp index ce50d2e9619..6dac4852d0f 100644 --- a/src/mongo/db/repl/replication_coordinator_impl_elect_test.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl_elect_test.cpp @@ -161,7 +161,7 @@ TEST_F(ReplCoordElectTest, ElectionSucceedsWhenNodeIsTheOnlyElectableNode) { ASSERT(getReplCoord()->getMemberState().secondary()) << getReplCoord()->getMemberState().toString(); - getReplCoord()->setMyLastOptime(OpTime(Timestamp(10, 0), 0)); + getReplCoord()->setMyLastAppliedOpTime(OpTime(Timestamp(10, 0), 0)); NetworkInterfaceMock* net = getNet(); net->enterNetwork(); @@ -232,7 +232,7 @@ TEST_F(ReplCoordElectTest, ElectionSucceedsWhenAllNodesVoteYea) { << "node3:12345"))); assertStartSuccess(configObj, HostAndPort("node1", 12345)); OperationContextNoop txn; - getReplCoord()->setMyLastOptime(OpTime{{100, 1}, 0}); + getReplCoord()->setMyLastAppliedOpTime(OpTime{{100, 1}, 0}); getExternalState()->setLastOpTime(OpTime{{100, 1}, 0}); ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); @@ -259,7 +259,7 @@ TEST_F(ReplCoordElectTest, ElectionFailsWhenOneNodeVotesNay) { OperationContextNoop txn; OpTime time1(Timestamp(100, 1), 0); - getReplCoord()->setMyLastOptime(time1); + getReplCoord()->setMyLastAppliedOpTime(time1); ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); simulateEnoughHeartbeatsForElectability(); @@ -304,7 +304,7 @@ TEST_F(ReplCoordElectTest, VotesWithStringValuesAreNotCountedAsYeas) { OperationContextNoop txn; OpTime time1(Timestamp(100, 1), 0); - getReplCoord()->setMyLastOptime(time1); + getReplCoord()->setMyLastAppliedOpTime(time1); ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); simulateEnoughHeartbeatsForElectability(); @@ -349,7 +349,7 @@ TEST_F(ReplCoordElectTest, ElectionsAbortWhenNodeTransitionsToRollbackState) { OperationContextNoop txn; OpTime time1(Timestamp(100, 1), 0); - getReplCoord()->setMyLastOptime(time1); + getReplCoord()->setMyLastAppliedOpTime(time1); ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); simulateEnoughHeartbeatsForElectability(); @@ -384,7 +384,7 @@ TEST_F(ReplCoordElectTest, NodeWillNotStandForElectionDuringHeartbeatReconfig) { << "node5:12345"))), HostAndPort("node1", 12345)); ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); - getReplCoord()->setMyLastOptime(OpTime(Timestamp(100, 0), 0)); + getReplCoord()->setMyLastAppliedOpTime(OpTime(Timestamp(100, 0), 0)); // set hbreconfig to hang while in progress getExternalState()->setStoreLocalConfigDocumentToHang(true); @@ -478,7 +478,7 @@ TEST_F(ReplCoordElectTest, StepsDownRemoteIfNodeHasHigherPriorityThanCurrentPrim OperationContextNoop txn; OpTime time1(Timestamp(100, 1), 0); - getReplCoord()->setMyLastOptime(time1); + getReplCoord()->setMyLastAppliedOpTime(time1); ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); auto net = getNet(); diff --git a/src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp b/src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp index 1ed1f7769b3..97b545a1292 100644 --- a/src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl_elect_v1.cpp @@ -124,9 +124,9 @@ void ReplicationCoordinatorImpl::_startElectSelfV1() { invariant(_rsConfig.getMemberAt(_selfIndex).isElectable()); - OpTime lastOpTimeApplied(_getMyLastOptime_inlock()); + OpTime lastOpTimeDurable(_getMyLastDurableOpTime_inlock()); - if (lastOpTimeApplied == OpTime()) { + if (lastOpTimeDurable == OpTime()) { log() << "not trying to elect self, " "do not yet have a complete set of data from any point in time"; return; @@ -147,7 +147,7 @@ void ReplicationCoordinatorImpl::_startElectSelfV1() { _selfIndex, _topCoord->getTerm(), true, // dry run - getMyLastOptime(), + getMyLastDurableOpTime(), stdx::bind(&ReplicationCoordinatorImpl::_onDryRunComplete, this, term)); if (nextPhaseEvh.getStatus() == ErrorCodes::ShutdownInProgress) { return; @@ -245,7 +245,7 @@ void ReplicationCoordinatorImpl::_startVoteRequester(long long newTerm) { _selfIndex, _topCoord->getTerm(), false, - getMyLastOptime(), + getMyLastDurableOpTime(), stdx::bind(&ReplicationCoordinatorImpl::_onVoteRequestComplete, this, newTerm)); if (nextPhaseEvh.getStatus() == ErrorCodes::ShutdownInProgress) { return; diff --git a/src/mongo/db/repl/replication_coordinator_impl_elect_v1_test.cpp b/src/mongo/db/repl/replication_coordinator_impl_elect_v1_test.cpp index bd8ddfbc139..e29e772b011 100644 --- a/src/mongo/db/repl/replication_coordinator_impl_elect_v1_test.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl_elect_v1_test.cpp @@ -104,7 +104,8 @@ TEST_F(ReplCoordElectV1Test, ElectionSucceedsWhenNodeIsTheOnlyElectableNode) { ASSERT(getReplCoord()->getMemberState().secondary()) << getReplCoord()->getMemberState().toString(); - getReplCoord()->setMyLastOptime(OpTime(Timestamp(10, 0), 0)); + getReplCoord()->setMyLastAppliedOpTime(OpTime(Timestamp(10, 0), 0)); + getReplCoord()->setMyLastDurableOpTime(OpTime(Timestamp(10, 0), 0)); auto electionTimeoutWhen = getReplCoord()->getElectionTimeout_forTest(); ASSERT_NOT_EQUALS(Date_t(), electionTimeoutWhen); @@ -160,7 +161,8 @@ TEST_F(ReplCoordElectV1Test, StartElectionDoesNotStartAnElectionWhenNodeIsRecove ASSERT(getReplCoord()->getMemberState().recovering()) << getReplCoord()->getMemberState().toString(); - getReplCoord()->setMyLastOptime(OpTime(Timestamp(10, 0), 0)); + getReplCoord()->setMyLastAppliedOpTime(OpTime(Timestamp(10, 0), 0)); + getReplCoord()->setMyLastDurableOpTime(OpTime(Timestamp(10, 0), 0)); simulateEnoughHeartbeatsForElectability(); auto electionTimeoutWhen = getReplCoord()->getElectionTimeout_forTest(); @@ -177,7 +179,8 @@ TEST_F(ReplCoordElectV1Test, ElectionSucceedsWhenNodeIsTheOnlyNode) { << "node1:12345")) << "protocolVersion" << 1), HostAndPort("node1", 12345)); - getReplCoord()->setMyLastOptime(OpTime(Timestamp(10, 0), 0)); + getReplCoord()->setMyLastAppliedOpTime(OpTime(Timestamp(10, 0), 0)); + getReplCoord()->setMyLastDurableOpTime(OpTime(Timestamp(10, 0), 0)); getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY); getReplCoord()->waitForElectionFinish_forTest(); ASSERT(getReplCoord()->getMemberState().primary()) @@ -208,7 +211,8 @@ TEST_F(ReplCoordElectV1Test, ElectionSucceedsWhenAllNodesVoteYea) { << 1); assertStartSuccess(configObj, HostAndPort("node1", 12345)); OperationContextNoop txn; - getReplCoord()->setMyLastOptime(OpTime(Timestamp(100, 1), 0)); + getReplCoord()->setMyLastAppliedOpTime(OpTime(Timestamp(100, 1), 0)); + getReplCoord()->setMyLastDurableOpTime(OpTime(Timestamp(100, 1), 0)); ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); startCapturingLogMessages(); simulateSuccessfulV1Election(); @@ -243,7 +247,8 @@ TEST_F(ReplCoordElectV1Test, ElectionSucceedsWhenMaxSevenNodesVoteYea) { << "protocolVersion" << 1); assertStartSuccess(configObj, HostAndPort("node1", 12345)); OperationContextNoop txn; - getReplCoord()->setMyLastOptime(OpTime(Timestamp(100, 1), 0)); + getReplCoord()->setMyLastAppliedOpTime(OpTime(Timestamp(100, 1), 0)); + getReplCoord()->setMyLastDurableOpTime(OpTime(Timestamp(100, 1), 0)); ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); startCapturingLogMessages(); simulateSuccessfulV1Election(); @@ -276,7 +281,8 @@ TEST_F(ReplCoordElectV1Test, ElectionFailsWhenInsufficientVotesAreReceivedDuring OperationContextNoop txn; OpTime time1(Timestamp(100, 1), 0); - getReplCoord()->setMyLastOptime(time1); + getReplCoord()->setMyLastAppliedOpTime(time1); + getReplCoord()->setMyLastDurableOpTime(time1); ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); simulateEnoughHeartbeatsForElectability(); @@ -331,7 +337,8 @@ TEST_F(ReplCoordElectV1Test, ElectionFailsWhenDryRunResponseContainsANewerTerm) OperationContextNoop txn; OpTime time1(Timestamp(100, 1), 0); - getReplCoord()->setMyLastOptime(time1); + getReplCoord()->setMyLastAppliedOpTime(time1); + getReplCoord()->setMyLastDurableOpTime(time1); ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); simulateEnoughHeartbeatsForElectability(); @@ -390,7 +397,8 @@ TEST_F(ReplCoordElectV1Test, NodeWillNotStandForElectionDuringHeartbeatReconfig) << "protocolVersion" << 1), HostAndPort("node1", 12345)); ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); - getReplCoord()->setMyLastOptime(OpTime(Timestamp(100, 0), 0)); + getReplCoord()->setMyLastAppliedOpTime(OpTime(Timestamp(100, 0), 0)); + getReplCoord()->setMyLastDurableOpTime(OpTime(Timestamp(100, 0), 0)); // set hbreconfig to hang while in progress getExternalState()->setStoreLocalConfigDocumentToHang(true); @@ -498,7 +506,8 @@ TEST_F(ReplCoordElectV1Test, NodeWillNotStandForElectionDuringHeartbeatReconfig) // // OperationContextNoop txn; // OpTime time1(Timestamp(100, 1), 0); -// getReplCoord()->setMyLastOptime(time1); +// getReplCoord()->setMyLastAppliedOpTime(time1); +// getReplCoord()->setMyLastDurableOpTime(time1); // ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); // // simulateEnoughHeartbeatsForElectability(); @@ -556,7 +565,8 @@ TEST_F(ReplCoordElectV1Test, ElectionFailsWhenInsufficientVotesAreReceivedDuring OperationContextNoop txn; OpTime time1(Timestamp(100, 1), 0); - getReplCoord()->setMyLastOptime(time1); + getReplCoord()->setMyLastAppliedOpTime(time1); + getReplCoord()->setMyLastDurableOpTime(time1); ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); simulateEnoughHeartbeatsForElectability(); @@ -603,7 +613,8 @@ TEST_F(ReplCoordElectV1Test, ElectionsAbortWhenNodeTransitionsToRollbackState) { OperationContextNoop txn; OpTime time1(Timestamp(100, 1), 0); - getReplCoord()->setMyLastOptime(time1); + getReplCoord()->setMyLastAppliedOpTime(time1); + getReplCoord()->setMyLastDurableOpTime(time1); ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); simulateEnoughHeartbeatsForElectability(); @@ -637,7 +648,8 @@ TEST_F(ReplCoordElectV1Test, ElectionFailsWhenVoteRequestResponseContainsANewerT OperationContextNoop txn; OpTime time1(Timestamp(100, 1), 0); - getReplCoord()->setMyLastOptime(time1); + getReplCoord()->setMyLastAppliedOpTime(time1); + getReplCoord()->setMyLastDurableOpTime(time1); ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); simulateEnoughHeartbeatsForElectability(); @@ -687,7 +699,8 @@ TEST_F(ReplCoordElectV1Test, ElectionFailsWhenTermChangesDuringDryRun) { OperationContextNoop txn; OpTime time1(Timestamp(100, 1), 0); - getReplCoord()->setMyLastOptime(time1); + getReplCoord()->setMyLastAppliedOpTime(time1); + getReplCoord()->setMyLastDurableOpTime(time1); ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); simulateEnoughHeartbeatsForElectability(); @@ -722,7 +735,8 @@ TEST_F(ReplCoordElectV1Test, ElectionFailsWhenTermChangesDuringActualElection) { OperationContextNoop txn; OpTime time1(Timestamp(100, 1), 0); - getReplCoord()->setMyLastOptime(time1); + getReplCoord()->setMyLastAppliedOpTime(time1); + getReplCoord()->setMyLastDurableOpTime(time1); ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); simulateEnoughHeartbeatsForElectability(); @@ -775,7 +789,8 @@ TEST_F(ReplCoordElectV1Test, SchedulesPriorityTakeoverIfNodeHasHigherPriorityTha OperationContextNoop txn; OpTime time1(Timestamp(100, 1), 0); - replCoord->setMyLastOptime(time1); + replCoord->setMyLastAppliedOpTime(time1); + replCoord->setMyLastDurableOpTime(time1); ASSERT(replCoord->setFollowerMode(MemberState::RS_SECONDARY)); ASSERT_EQUALS(Date_t(), replCoord->getPriorityTakeover_forTest()); diff --git a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp index 74fb8fc8fe9..211abb412a2 100644 --- a/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl_heartbeat.cpp @@ -156,7 +156,7 @@ void ReplicationCoordinatorImpl::_handleHeartbeatResponse( } } const Date_t now = _replExecutor.now(); - const OpTime lastApplied = getMyLastOptime(); // Locks and unlocks _mutex. + const OpTime lastApplied = getMyLastAppliedOpTime(); // Locks and unlocks _mutex. Milliseconds networkTime(0); StatusWith<ReplSetHeartbeatResponse> hbStatusResponse(hbResponse); @@ -183,15 +183,20 @@ void ReplicationCoordinatorImpl::_handleHeartbeatResponse( now, networkTime, target, hbStatusResponse, lastApplied); if (action.getAction() == HeartbeatResponseAction::NoAction && hbStatusResponse.isOK() && - hbStatusResponse.getValue().hasOpTime() && targetIndex >= 0 && - hbStatusResponse.getValue().hasState() && + targetIndex >= 0 && hbStatusResponse.getValue().hasState() && hbStatusResponse.getValue().getState() != MemberState::RS_PRIMARY) { - stdx::unique_lock<stdx::mutex> lk(_mutex); - if (hbStatusResponse.getValue().getConfigVersion() == _rsConfig.getConfigVersion()) { - _updateOpTimeFromHeartbeat_inlock(targetIndex, hbStatusResponse.getValue().getOpTime()); - // TODO: Enable with Data Replicator - // lk.unlock(); - //_dr.slavesHaveProgressed(); + ReplSetHeartbeatResponse hbResp = hbStatusResponse.getValue(); + if (hbResp.hasAppliedOpTime()) { + stdx::unique_lock<stdx::mutex> lk(_mutex); + if (hbResp.getConfigVersion() == _rsConfig.getConfigVersion()) { + _updateOpTimesFromHeartbeat_inlock( + targetIndex, + hbResp.hasDurableOpTime() ? hbResp.getDurableOpTime() : OpTime(), + hbResp.getAppliedOpTime()); + // TODO: Enable with Data Replicator + // lk.unlock(); + //_dr.slavesHaveProgressed(); + } } } @@ -204,14 +209,18 @@ void ReplicationCoordinatorImpl::_handleHeartbeatResponse( _handleHeartbeatResponseAction(action, hbStatusResponse); } -void ReplicationCoordinatorImpl::_updateOpTimeFromHeartbeat_inlock(int targetIndex, - const OpTime& optime) { +void ReplicationCoordinatorImpl::_updateOpTimesFromHeartbeat_inlock(int targetIndex, + const OpTime& durableOpTime, + const OpTime& appliedOpTime) { invariant(_selfIndex >= 0); invariant(targetIndex >= 0); SlaveInfo& slaveInfo = _slaveInfo[targetIndex]; - if (optime > slaveInfo.opTime) { - _updateSlaveInfoOptime_inlock(&slaveInfo, optime); + if (appliedOpTime > slaveInfo.lastAppliedOpTime) { + _updateSlaveInfoAppliedOpTime_inlock(&slaveInfo, appliedOpTime); + } + if (durableOpTime > slaveInfo.lastDurableOpTime) { + _updateSlaveInfoDurableOpTime_inlock(&slaveInfo, durableOpTime); } } @@ -608,7 +617,7 @@ void ReplicationCoordinatorImpl::_handleLivenessTimeout( // Secondaries might not see other secondaries in the cluster if they are not // downstream. HeartbeatResponseAction action = - _topCoord->setMemberAsDown(now, memberIndex, _getMyLastOptime_inlock()); + _topCoord->setMemberAsDown(now, memberIndex, _getMyLastDurableOpTime_inlock()); // Don't mind potential asynchronous stepdown as this is the last step of // liveness check. _handleHeartbeatResponseAction(action, makeStatusWith<ReplSetHeartbeatResponse>()); @@ -747,7 +756,7 @@ void ReplicationCoordinatorImpl::_startElectSelfIfEligibleV1(bool isPriorityTake _cancelAndRescheduleElectionTimeout_inlock(); } - if (!_topCoord->becomeCandidateIfElectable(_replExecutor.now(), getMyLastOptime())) { + if (!_topCoord->becomeCandidateIfElectable(_replExecutor.now(), getMyLastDurableOpTime())) { if (isPriorityTakeOver) { log() << "Not starting an election for a priority takeover, since we are not " "electable"; diff --git a/src/mongo/db/repl/replication_coordinator_impl_heartbeat_v1_test.cpp b/src/mongo/db/repl/replication_coordinator_impl_heartbeat_v1_test.cpp index c3dc217d91e..2233e21cc21 100644 --- a/src/mongo/db/repl/replication_coordinator_impl_heartbeat_v1_test.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl_heartbeat_v1_test.cpp @@ -353,7 +353,7 @@ TEST_F(ReplCoordHBV1Test, ArbiterRecordsCommittedOpTimeFromHeartbeatMetadata) { ASSERT_OK(metadata.getStatus()); getReplCoord()->processReplSetMetadata(metadata.getValue()); - ASSERT_EQ(getReplCoord()->getMyLastOptime().getTimestamp(), expected.getTimestamp()); + ASSERT_EQ(getReplCoord()->getMyLastAppliedOpTime().getTimestamp(), expected.getTimestamp()); }; OpTime committedOpTime{Timestamp{10, 10}, 10}; diff --git a/src/mongo/db/repl/replication_coordinator_impl_reconfig_test.cpp b/src/mongo/db/repl/replication_coordinator_impl_reconfig_test.cpp index 609a8249a4f..5c97e3bc976 100644 --- a/src/mongo/db/repl/replication_coordinator_impl_reconfig_test.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl_reconfig_test.cpp @@ -80,7 +80,8 @@ TEST_F(ReplCoordTest, NodeReturnsNotMasterWhenReconfigReceivedWhileSecondary) { HostAndPort("node1", 12345)); ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); - getReplCoord()->setMyLastOptime(OpTime(Timestamp(100, 0), 0)); + getReplCoord()->setMyLastAppliedOpTime(OpTime(Timestamp(100, 0), 0)); + getReplCoord()->setMyLastDurableOpTime(OpTime(Timestamp(100, 0), 0)); BSONObjBuilder result; ReplSetReconfigArgs args; @@ -102,7 +103,8 @@ TEST_F(ReplCoordTest, NodeReturnsInvalidReplicaSetConfigWhenReconfigReceivedWith << "node2:12345"))), HostAndPort("node1", 12345)); ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); - getReplCoord()->setMyLastOptime(OpTime(Timestamp(100, 0), 0)); + getReplCoord()->setMyLastAppliedOpTime(OpTime(Timestamp(100, 0), 0)); + getReplCoord()->setMyLastDurableOpTime(OpTime(Timestamp(100, 0), 0)); simulateSuccessfulV1Election(); BSONObjBuilder result; @@ -135,7 +137,8 @@ TEST_F(ReplCoordTest, NodeReturnsInvalidReplicaSetConfigWhenReconfigReceivedWith << "node2:12345"))), HostAndPort("node1", 12345)); ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); - getReplCoord()->setMyLastOptime(OpTime(Timestamp(100, 0), 0)); + getReplCoord()->setMyLastAppliedOpTime(OpTime(Timestamp(100, 0), 0)); + getReplCoord()->setMyLastDurableOpTime(OpTime(Timestamp(100, 0), 0)); simulateSuccessfulV1Election(); BSONObjBuilder result; @@ -167,7 +170,8 @@ TEST_F(ReplCoordTest, NodeReturnsInvalidReplicaSetConfigWhenReconfigReceivedWith << BSON("replicaSetId" << OID::gen())), HostAndPort("node1", 12345)); ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); - getReplCoord()->setMyLastOptime(OpTime(Timestamp(100, 0), 0)); + getReplCoord()->setMyLastAppliedOpTime(OpTime(Timestamp(100, 0), 0)); + getReplCoord()->setMyLastDurableOpTime(OpTime(Timestamp(100, 0), 0)); simulateSuccessfulV1Election(); BSONObjBuilder result; @@ -200,7 +204,8 @@ TEST_F(ReplCoordTest, << "node2:12345"))), HostAndPort("node1", 12345)); ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); - getReplCoord()->setMyLastOptime(OpTime(Timestamp(100, 0), 0)); + getReplCoord()->setMyLastAppliedOpTime(OpTime(Timestamp(100, 0), 0)); + getReplCoord()->setMyLastDurableOpTime(OpTime(Timestamp(100, 0), 0)); simulateSuccessfulV1Election(); BSONObjBuilder result; @@ -265,7 +270,8 @@ TEST_F(ReplCoordTest, << "node2:12345"))), HostAndPort("node1", 12345)); ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); - getReplCoord()->setMyLastOptime(OpTime(Timestamp(100, 0), 0)); + getReplCoord()->setMyLastAppliedOpTime(OpTime(Timestamp(100, 0), 0)); + getReplCoord()->setMyLastDurableOpTime(OpTime(Timestamp(100, 0), 0)); simulateSuccessfulV1Election(); Status status(ErrorCodes::InternalError, "Not Set"); @@ -303,7 +309,8 @@ TEST_F(ReplCoordTest, NodeReturnsOutOfDiskSpaceWhenSavingANewConfigFailsDuringRe << "node2:12345"))), HostAndPort("node1", 12345)); ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); - getReplCoord()->setMyLastOptime(OpTime(Timestamp(100, 0), 0)); + getReplCoord()->setMyLastAppliedOpTime(OpTime(Timestamp(100, 0), 0)); + getReplCoord()->setMyLastDurableOpTime(OpTime(Timestamp(100, 0), 0)); simulateSuccessfulV1Election(); Status status(ErrorCodes::InternalError, "Not Set"); @@ -329,7 +336,8 @@ TEST_F(ReplCoordTest, << "node2:12345"))), HostAndPort("node1", 12345)); ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); - getReplCoord()->setMyLastOptime(OpTime(Timestamp(100, 0), 0)); + getReplCoord()->setMyLastAppliedOpTime(OpTime(Timestamp(100, 0), 0)); + getReplCoord()->setMyLastDurableOpTime(OpTime(Timestamp(100, 0), 0)); simulateSuccessfulV1Election(); Status status(ErrorCodes::InternalError, "Not Set"); @@ -365,7 +373,8 @@ TEST_F(ReplCoordTest, NodeReturnsConfigurationInProgressWhenReceivingAReconfigWh init(); start(HostAndPort("node1", 12345)); ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); - getReplCoord()->setMyLastOptime(OpTime(Timestamp(100, 0), 0)); + getReplCoord()->setMyLastAppliedOpTime(OpTime(Timestamp(100, 0), 0)); + getReplCoord()->setMyLastDurableOpTime(OpTime(Timestamp(100, 0), 0)); // initiate Status status(ErrorCodes::InternalError, "Not Set"); @@ -407,7 +416,8 @@ TEST_F(ReplCoordTest, PrimaryNodeAcceptsNewConfigWhenReceivingAReconfigWithAComp << BSON("replicaSetId" << OID::gen())), HostAndPort("node1", 12345)); ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); - getReplCoord()->setMyLastOptime(OpTime(Timestamp(100, 0), 0)); + getReplCoord()->setMyLastAppliedOpTime(OpTime(Timestamp(100, 0), 0)); + getReplCoord()->setMyLastDurableOpTime(OpTime(Timestamp(100, 0), 0)); simulateSuccessfulV1Election(); Status status(ErrorCodes::InternalError, "Not Set"); @@ -448,7 +458,8 @@ TEST_F( << "node2:12345"))), HostAndPort("node1", 12345)); ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); - getReplCoord()->setMyLastOptime(OpTime(Timestamp(100, 0), 0)); + getReplCoord()->setMyLastAppliedOpTime(OpTime(Timestamp(100, 0), 0)); + getReplCoord()->setMyLastDurableOpTime(OpTime(Timestamp(100, 0), 0)); simulateSuccessfulV1Election(); ASSERT_TRUE(getReplCoord()->getMemberState().primary()); @@ -503,7 +514,8 @@ TEST_F(ReplCoordTest, NodeDoesNotAcceptHeartbeatReconfigWhileInTheMidstOfReconfi << "node2:12345"))), HostAndPort("node1", 12345)); ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); - getReplCoord()->setMyLastOptime(OpTime(Timestamp(100, 0), 0)); + getReplCoord()->setMyLastAppliedOpTime(OpTime(Timestamp(100, 0), 0)); + getReplCoord()->setMyLastDurableOpTime(OpTime(Timestamp(100, 0), 0)); simulateSuccessfulV1Election(); ASSERT_TRUE(getReplCoord()->getMemberState().primary()); @@ -565,7 +577,8 @@ TEST_F(ReplCoordTest, NodeAcceptsConfigFromAReconfigWithForceTrueWhileNotPrimary << "node2:12345"))), HostAndPort("node1", 12345)); ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); - getReplCoord()->setMyLastOptime(OpTime(Timestamp(100, 0), 0)); + getReplCoord()->setMyLastAppliedOpTime(OpTime(Timestamp(100, 0), 0)); + getReplCoord()->setMyLastDurableOpTime(OpTime(Timestamp(100, 0), 0)); // fail before forced BSONObjBuilder result; diff --git a/src/mongo/db/repl/replication_coordinator_impl_test.cpp b/src/mongo/db/repl/replication_coordinator_impl_test.cpp index db7fc578af8..d7f37369a35 100644 --- a/src/mongo/db/repl/replication_coordinator_impl_test.cpp +++ b/src/mongo/db/repl/replication_coordinator_impl_test.cpp @@ -39,6 +39,7 @@ #include "mongo/db/operation_context_noop.h" #include "mongo/db/repl/handshake_args.h" #include "mongo/db/repl/is_master_response.h" +#include "mongo/db/repl/old_update_position_args.h" #include "mongo/db/repl/operation_context_repl_mock.h" #include "mongo/db/repl/optime.h" #include "mongo/db/repl/read_concern_args.h" @@ -94,7 +95,8 @@ struct OpTimeWithTermZero { }; void runSingleNodeElection(ReplicationCoordinatorImpl* replCoord) { - replCoord->setMyLastOptime(OpTime(Timestamp(1, 0), 0)); + replCoord->setMyLastAppliedOpTime(OpTime(Timestamp(1, 0), 0)); + replCoord->setMyLastDurableOpTime(OpTime(Timestamp(1, 0), 0)); ASSERT(replCoord->setFollowerMode(MemberState::RS_SECONDARY)); replCoord->waitForElectionFinish_forTest(); @@ -749,7 +751,8 @@ TEST_F(ReplCoordTest, NodeReturnsOkWhenRunningAwaitReplicationAgainstPrimaryWith // Become primary. ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); - getReplCoord()->setMyLastOptime(OpTimeWithTermZero(100, 0)); + getReplCoord()->setMyLastAppliedOpTime(OpTimeWithTermZero(100, 0)); + getReplCoord()->setMyLastDurableOpTime(OpTimeWithTermZero(100, 0)); simulateSuccessfulV1Election(); ASSERT(getReplCoord()->getMemberState().primary()); @@ -760,6 +763,80 @@ TEST_F(ReplCoordTest, NodeReturnsOkWhenRunningAwaitReplicationAgainstPrimaryWith ASSERT_TRUE(getExternalState()->isApplierSignaledToCancelFetcher()); } +TEST_F(ReplCoordTest, + NodeReturnsWriteConcernFailedUntilASufficientNumberOfNodesHaveTheWriteDurable) { + OperationContextNoop txn; + assertStartSuccess( + BSON("_id" + << "mySet" + << "version" << 2 << "members" + << BSON_ARRAY(BSON("host" + << "node1:12345" + << "_id" << 0) + << BSON("host" + << "node2:12345" + << "_id" << 1) << BSON("host" + << "node3:12345" + << "_id" << 2) << BSON("host" + << "node4:12345" + << "_id" << 3))), + HostAndPort("node1", 12345)); + ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); + getReplCoord()->setMyLastAppliedOpTime(OpTimeWithTermZero(100, 0)); + getReplCoord()->setMyLastDurableOpTime(OpTimeWithTermZero(100, 0)); + simulateSuccessfulV1Election(); + + OpTimeWithTermZero time1(100, 1); + OpTimeWithTermZero time2(100, 2); + + WriteConcernOptions writeConcern; + writeConcern.wTimeout = WriteConcernOptions::kNoWaiting; + writeConcern.wNumNodes = 1; + writeConcern.syncMode = WriteConcernOptions::SyncMode::JOURNAL; + + // 1 node waiting for time 1 + ReplicationCoordinator::StatusAndDuration statusAndDur = + getReplCoord()->awaitReplication(&txn, time1, writeConcern); + ASSERT_EQUALS(ErrorCodes::WriteConcernFailed, statusAndDur.status); + getReplCoord()->setMyLastAppliedOpTime(time1); + getReplCoord()->setMyLastDurableOpTime(time1); + statusAndDur = getReplCoord()->awaitReplication(&txn, time1, writeConcern); + ASSERT_OK(statusAndDur.status); + + // 2 nodes waiting for time1 + writeConcern.wNumNodes = 2; + statusAndDur = getReplCoord()->awaitReplication(&txn, time1, writeConcern); + ASSERT_EQUALS(ErrorCodes::WriteConcernFailed, statusAndDur.status); + // Applied is not durable and will not satisfy WriteConcern with SyncMode JOURNAL. + ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(2, 1, time1)); + statusAndDur = getReplCoord()->awaitReplication(&txn, time1, writeConcern); + ASSERT_EQUALS(ErrorCodes::WriteConcernFailed, statusAndDur.status); + ASSERT_OK(getReplCoord()->setLastDurableOptime_forTest(2, 1, time1)); + statusAndDur = getReplCoord()->awaitReplication(&txn, time1, writeConcern); + ASSERT_OK(statusAndDur.status); + + // 2 nodes waiting for time2 + statusAndDur = getReplCoord()->awaitReplication(&txn, time2, writeConcern); + ASSERT_EQUALS(ErrorCodes::WriteConcernFailed, statusAndDur.status); + getReplCoord()->setMyLastAppliedOpTime(time2); + getReplCoord()->setMyLastDurableOpTime(time2); + statusAndDur = getReplCoord()->awaitReplication(&txn, time2, writeConcern); + ASSERT_EQUALS(ErrorCodes::WriteConcernFailed, statusAndDur.status); + ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(2, 2, time2)); + ASSERT_OK(getReplCoord()->setLastDurableOptime_forTest(2, 2, time2)); + statusAndDur = getReplCoord()->awaitReplication(&txn, time2, writeConcern); + ASSERT_OK(statusAndDur.status); + + // 3 nodes waiting for time2 + writeConcern.wNumNodes = 3; + statusAndDur = getReplCoord()->awaitReplication(&txn, time2, writeConcern); + ASSERT_EQUALS(ErrorCodes::WriteConcernFailed, statusAndDur.status); + ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(2, 3, time2)); + ASSERT_OK(getReplCoord()->setLastDurableOptime_forTest(2, 3, time2)); + statusAndDur = getReplCoord()->awaitReplication(&txn, time2, writeConcern); + ASSERT_OK(statusAndDur.status); +} + TEST_F(ReplCoordTest, NodeReturnsWriteConcernFailedUntilASufficientNumberOfNodesHaveTheWrite) { OperationContextNoop txn; assertStartSuccess( @@ -778,7 +855,8 @@ TEST_F(ReplCoordTest, NodeReturnsWriteConcernFailedUntilASufficientNumberOfNodes << "_id" << 3))), HostAndPort("node1", 12345)); ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); - getReplCoord()->setMyLastOptime(OpTimeWithTermZero(100, 0)); + getReplCoord()->setMyLastAppliedOpTime(OpTimeWithTermZero(100, 0)); + getReplCoord()->setMyLastDurableOpTime(OpTimeWithTermZero(100, 0)); simulateSuccessfulV1Election(); OpTimeWithTermZero time1(100, 1); @@ -792,7 +870,8 @@ TEST_F(ReplCoordTest, NodeReturnsWriteConcernFailedUntilASufficientNumberOfNodes ReplicationCoordinator::StatusAndDuration statusAndDur = getReplCoord()->awaitReplication(&txn, time1, writeConcern); ASSERT_EQUALS(ErrorCodes::WriteConcernFailed, statusAndDur.status); - getReplCoord()->setMyLastOptime(time1); + getReplCoord()->setMyLastAppliedOpTime(time1); + getReplCoord()->setMyLastDurableOpTime(time1); statusAndDur = getReplCoord()->awaitReplication(&txn, time1, writeConcern); ASSERT_OK(statusAndDur.status); @@ -800,17 +879,19 @@ TEST_F(ReplCoordTest, NodeReturnsWriteConcernFailedUntilASufficientNumberOfNodes writeConcern.wNumNodes = 2; statusAndDur = getReplCoord()->awaitReplication(&txn, time1, writeConcern); ASSERT_EQUALS(ErrorCodes::WriteConcernFailed, statusAndDur.status); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(2, 1, time1)); + ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(2, 1, time1)); statusAndDur = getReplCoord()->awaitReplication(&txn, time1, writeConcern); ASSERT_OK(statusAndDur.status); // 2 nodes waiting for time2 statusAndDur = getReplCoord()->awaitReplication(&txn, time2, writeConcern); ASSERT_EQUALS(ErrorCodes::WriteConcernFailed, statusAndDur.status); - getReplCoord()->setMyLastOptime(time2); + getReplCoord()->setMyLastAppliedOpTime(time2); + getReplCoord()->setMyLastDurableOpTime(time2); statusAndDur = getReplCoord()->awaitReplication(&txn, time2, writeConcern); ASSERT_EQUALS(ErrorCodes::WriteConcernFailed, statusAndDur.status); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(2, 2, time2)); + ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(2, 2, time2)); + ASSERT_OK(getReplCoord()->setLastDurableOptime_forTest(2, 2, time2)); statusAndDur = getReplCoord()->awaitReplication(&txn, time2, writeConcern); ASSERT_OK(statusAndDur.status); @@ -818,7 +899,7 @@ TEST_F(ReplCoordTest, NodeReturnsWriteConcernFailedUntilASufficientNumberOfNodes writeConcern.wNumNodes = 3; statusAndDur = getReplCoord()->awaitReplication(&txn, time2, writeConcern); ASSERT_EQUALS(ErrorCodes::WriteConcernFailed, statusAndDur.status); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(2, 3, time2)); + ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(2, 3, time2)); statusAndDur = getReplCoord()->awaitReplication(&txn, time2, writeConcern); ASSERT_OK(statusAndDur.status); } @@ -842,7 +923,8 @@ TEST_F(ReplCoordTest, << "node4"))), HostAndPort("node0")); ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); - getReplCoord()->setMyLastOptime(OpTime(Timestamp(100, 0), 0)); + getReplCoord()->setMyLastAppliedOpTime(OpTime(Timestamp(100, 0), 0)); + getReplCoord()->setMyLastDurableOpTime(OpTime(Timestamp(100, 0), 0)); simulateSuccessfulV1Election(); OpTime time1(Timestamp(100, 1), 1); @@ -858,7 +940,9 @@ TEST_F(ReplCoordTest, ASSERT_EQUALS(ErrorCodes::UnknownReplWriteConcern, statusAndDur.status); } -TEST_F(ReplCoordTest, NodeReturnsWriteConcernFailedUntilASufficientSetOfNodesHaveTheWrite) { +TEST_F( + ReplCoordTest, + NodeReturnsWriteConcernFailedUntilASufficientSetOfNodesHaveTheWriteAndTheWriteIsInACommittedSnapshot) { auto service = stdx::make_unique<ServiceContextNoop>(); auto client = service->makeClient("test"); OperationContextNoop txn(client.get(), 100); @@ -901,7 +985,8 @@ TEST_F(ReplCoordTest, NodeReturnsWriteConcernFailedUntilASufficientSetOfNodesHav << BSON("dc" << 2 << "rack" << 3)))), HostAndPort("node0")); ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); - getReplCoord()->setMyLastOptime(OpTime(Timestamp(100, 0), 0)); + getReplCoord()->setMyLastAppliedOpTime(OpTime(Timestamp(100, 0), 0)); + getReplCoord()->setMyLastDurableOpTime(OpTime(Timestamp(100, 0), 0)); simulateSuccessfulV1Election(); OpTime time1(Timestamp(100, 1), 1); @@ -912,6 +997,7 @@ TEST_F(ReplCoordTest, NodeReturnsWriteConcernFailedUntilASufficientSetOfNodesHav WriteConcernOptions majorityWriteConcern; majorityWriteConcern.wTimeout = WriteConcernOptions::kNoWaiting; majorityWriteConcern.wMode = WriteConcernOptions::kMajority; + majorityWriteConcern.syncMode = WriteConcernOptions::SyncMode::JOURNAL; WriteConcernOptions multiDCWriteConcern; multiDCWriteConcern.wTimeout = WriteConcernOptions::kNoWaiting; @@ -923,7 +1009,8 @@ TEST_F(ReplCoordTest, NodeReturnsWriteConcernFailedUntilASufficientSetOfNodesHav // Nothing satisfied - getReplCoord()->setMyLastOptime(time1); + getReplCoord()->setMyLastAppliedOpTime(time1); + getReplCoord()->setMyLastDurableOpTime(time1); ReplicationCoordinator::StatusAndDuration statusAndDur = getReplCoord()->awaitReplication(&txn, time1, majorityWriteConcern); ASSERT_EQUALS(ErrorCodes::WriteConcernFailed, statusAndDur.status); @@ -933,8 +1020,10 @@ TEST_F(ReplCoordTest, NodeReturnsWriteConcernFailedUntilASufficientSetOfNodesHav ASSERT_EQUALS(ErrorCodes::WriteConcernFailed, statusAndDur.status); // Majority satisfied but not either custom mode - getReplCoord()->setLastOptime_forTest(2, 1, time1); - getReplCoord()->setLastOptime_forTest(2, 2, time1); + getReplCoord()->setLastAppliedOptime_forTest(2, 1, time1); + getReplCoord()->setLastDurableOptime_forTest(2, 1, time1); + getReplCoord()->setLastAppliedOptime_forTest(2, 2, time1); + getReplCoord()->setLastDurableOptime_forTest(2, 2, time1); getReplCoord()->onSnapshotCreate(time1, SnapshotName(1)); statusAndDur = getReplCoord()->awaitReplication(&txn, time1, majorityWriteConcern); @@ -945,7 +1034,8 @@ TEST_F(ReplCoordTest, NodeReturnsWriteConcernFailedUntilASufficientSetOfNodesHav ASSERT_EQUALS(ErrorCodes::WriteConcernFailed, statusAndDur.status); // All modes satisfied - getReplCoord()->setLastOptime_forTest(2, 3, time1); + getReplCoord()->setLastAppliedOptime_forTest(2, 3, time1); + getReplCoord()->setLastDurableOptime_forTest(2, 3, time1); statusAndDur = getReplCoord()->awaitReplication(&txn, time1, majorityWriteConcern); ASSERT_OK(statusAndDur.status); @@ -979,8 +1069,10 @@ TEST_F(ReplCoordTest, NodeReturnsWriteConcernFailedUntilASufficientSetOfNodesHav ASSERT_OK(statusAndDur.status); // multiDC satisfied but not majority or multiRack - getReplCoord()->setMyLastOptime(time2); - getReplCoord()->setLastOptime_forTest(2, 3, time2); + getReplCoord()->setMyLastAppliedOpTime(time2); + getReplCoord()->setMyLastDurableOpTime(time2); + getReplCoord()->setLastAppliedOptime_forTest(2, 3, time2); + getReplCoord()->setLastDurableOptime_forTest(2, 3, time2); statusAndDur = getReplCoord()->awaitReplication(&txn, time2, majorityWriteConcern); ASSERT_EQUALS(ErrorCodes::WriteConcernFailed, statusAndDur.status); @@ -1009,7 +1101,7 @@ public: _optime = ot; } - void setWriteConcern(const WriteConcernOptions& wc) { + void setWriteConcern(WriteConcernOptions wc) { _writeConcern = wc; } @@ -1061,7 +1153,8 @@ TEST_F(ReplCoordTest, NodeReturnsOkWhenAWriteConcernWithNoTimeoutHasBeenSatisfie << "_id" << 2))), HostAndPort("node1", 12345)); ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); - getReplCoord()->setMyLastOptime(OpTimeWithTermZero(100, 0)); + getReplCoord()->setMyLastAppliedOpTime(OpTimeWithTermZero(100, 0)); + getReplCoord()->setMyLastDurableOpTime(OpTimeWithTermZero(100, 0)); simulateSuccessfulV1Election(); ReplicationAwaiter awaiter(getReplCoord(), &txn); @@ -1077,8 +1170,9 @@ TEST_F(ReplCoordTest, NodeReturnsOkWhenAWriteConcernWithNoTimeoutHasBeenSatisfie awaiter.setOpTime(time1); awaiter.setWriteConcern(writeConcern); awaiter.start(&txn); - getReplCoord()->setMyLastOptime(time1); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(2, 1, time1)); + getReplCoord()->setMyLastAppliedOpTime(time1); + getReplCoord()->setMyLastDurableOpTime(time1); + ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(2, 1, time1)); ReplicationCoordinator::StatusAndDuration statusAndDur = awaiter.getResult(); ASSERT_OK(statusAndDur.status); awaiter.reset(); @@ -1086,8 +1180,9 @@ TEST_F(ReplCoordTest, NodeReturnsOkWhenAWriteConcernWithNoTimeoutHasBeenSatisfie // 2 nodes waiting for time2 awaiter.setOpTime(time2); awaiter.start(&txn); - getReplCoord()->setMyLastOptime(time2); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(2, 1, time2)); + getReplCoord()->setMyLastAppliedOpTime(time2); + getReplCoord()->setMyLastDurableOpTime(time2); + ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(2, 1, time2)); statusAndDur = awaiter.getResult(); ASSERT_OK(statusAndDur.status); awaiter.reset(); @@ -1096,7 +1191,7 @@ TEST_F(ReplCoordTest, NodeReturnsOkWhenAWriteConcernWithNoTimeoutHasBeenSatisfie writeConcern.wNumNodes = 3; awaiter.setWriteConcern(writeConcern); awaiter.start(&txn); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(2, 2, time2)); + ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(2, 2, time2)); statusAndDur = awaiter.getResult(); ASSERT_OK(statusAndDur.status); awaiter.reset(); @@ -1117,7 +1212,8 @@ TEST_F(ReplCoordTest, NodeReturnsWriteConcernFailedWhenAWriteConcernTimesOutBefo << "_id" << 2))), HostAndPort("node1", 12345)); ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); - getReplCoord()->setMyLastOptime(OpTimeWithTermZero(100, 0)); + getReplCoord()->setMyLastAppliedOpTime(OpTimeWithTermZero(100, 0)); + getReplCoord()->setMyLastDurableOpTime(OpTimeWithTermZero(100, 0)); simulateSuccessfulV1Election(); ReplicationAwaiter awaiter(getReplCoord(), &txn); @@ -1133,8 +1229,9 @@ TEST_F(ReplCoordTest, NodeReturnsWriteConcernFailedWhenAWriteConcernTimesOutBefo awaiter.setOpTime(time2); awaiter.setWriteConcern(writeConcern); awaiter.start(&txn); - getReplCoord()->setMyLastOptime(time2); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(2, 1, time1)); + getReplCoord()->setMyLastAppliedOpTime(time2); + getReplCoord()->setMyLastDurableOpTime(time2); + ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(2, 1, time1)); ReplicationCoordinator::StatusAndDuration statusAndDur = awaiter.getResult(); ASSERT_EQUALS(ErrorCodes::WriteConcernFailed, statusAndDur.status); awaiter.reset(); @@ -1156,7 +1253,8 @@ TEST_F(ReplCoordTest, << "_id" << 2))), HostAndPort("node1", 12345)); ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); - getReplCoord()->setMyLastOptime(OpTimeWithTermZero(100, 0)); + getReplCoord()->setMyLastAppliedOpTime(OpTimeWithTermZero(100, 0)); + getReplCoord()->setMyLastDurableOpTime(OpTimeWithTermZero(100, 0)); simulateSuccessfulV1Election(); ReplicationAwaiter awaiter(getReplCoord(), &txn); @@ -1172,8 +1270,8 @@ TEST_F(ReplCoordTest, awaiter.setOpTime(time2); awaiter.setWriteConcern(writeConcern); awaiter.start(&txn); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(2, 1, time1)); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(2, 2, time1)); + ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(2, 1, time1)); + ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(2, 2, time1)); shutdown(); ReplicationCoordinator::StatusAndDuration statusAndDur = awaiter.getResult(); ASSERT_EQUALS(ErrorCodes::ShutdownInProgress, statusAndDur.status); @@ -1197,7 +1295,8 @@ TEST_F(ReplCoordTest, NodeReturnsNotMasterWhenSteppingDownBeforeSatisfyingAWrite << "_id" << 2))), HostAndPort("node1", 12345)); ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); - getReplCoord()->setMyLastOptime(OpTimeWithTermZero(100, 0)); + getReplCoord()->setMyLastAppliedOpTime(OpTimeWithTermZero(100, 0)); + getReplCoord()->setMyLastDurableOpTime(OpTimeWithTermZero(100, 0)); simulateSuccessfulV1Election(); ReplicationAwaiter awaiter(getReplCoord(), &txn); @@ -1213,8 +1312,8 @@ TEST_F(ReplCoordTest, NodeReturnsNotMasterWhenSteppingDownBeforeSatisfyingAWrite awaiter.setOpTime(time2); awaiter.setWriteConcern(writeConcern); awaiter.start(&txn); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(2, 1, time1)); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(2, 2, time1)); + ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(2, 1, time1)); + ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(2, 2, time1)); getReplCoord()->stepDown(&txn, true, Milliseconds(0), Milliseconds(1000)); ReplicationCoordinator::StatusAndDuration statusAndDur = awaiter.getResult(); ASSERT_EQUALS(ErrorCodes::NotMaster, statusAndDur.status); @@ -1236,7 +1335,8 @@ TEST_F(ReplCoordTest, << "node3"))), HostAndPort("node1")); ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); - getReplCoord()->setMyLastOptime(OpTimeWithTermZero(100, 0)); + getReplCoord()->setMyLastAppliedOpTime(OpTimeWithTermZero(100, 0)); + getReplCoord()->setMyLastDurableOpTime(OpTimeWithTermZero(100, 0)); simulateSuccessfulV1Election(); ReplicationAwaiter awaiter(getReplCoord(), &txn); @@ -1253,8 +1353,8 @@ TEST_F(ReplCoordTest, awaiter.setOpTime(time2); awaiter.setWriteConcern(writeConcern); awaiter.start(&txn); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(2, 1, time1)); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(2, 2, time1)); + ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(2, 1, time1)); + ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(2, 2, time1)); txn.setCheckForInterruptStatus(kInterruptedStatus); getReplCoord()->interrupt(opID); @@ -1310,7 +1410,8 @@ TEST_F(ReplCoordTest, NodeChangesTermAndStepsDownWhenAndOnlyWhenUpdateTermSuppli << "test3:1234")) << "protocolVersion" << 1), HostAndPort("test1", 1234)); - getReplCoord()->setMyLastOptime(OpTime(Timestamp(100, 1), 0)); + getReplCoord()->setMyLastAppliedOpTime(OpTime(Timestamp(100, 1), 0)); + getReplCoord()->setMyLastDurableOpTime(OpTime(Timestamp(100, 1), 0)); ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); ASSERT_TRUE(getReplCoord()->getMemberState().secondary()); @@ -1356,7 +1457,8 @@ TEST_F(ReplCoordTest, ConcurrentStepDownShouldNotSignalTheSameFinishEventMoreTha << "test3:1234")) << "protocolVersion" << 1), HostAndPort("test1", 1234)); - getReplCoord()->setMyLastOptime(OpTime(Timestamp(100, 1), 0)); + getReplCoord()->setMyLastAppliedOpTime(OpTime(Timestamp(100, 1), 0)); + getReplCoord()->setMyLastDurableOpTime(OpTime(Timestamp(100, 1), 0)); ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); ASSERT_TRUE(getReplCoord()->getMemberState().secondary()); @@ -1418,9 +1520,10 @@ TEST_F(StepDownTest, NodeReturnsNotMasterWhenAskedToStepDownAsANonPrimaryNode) { OperationContextReplMock txn; OpTimeWithTermZero optime1(100, 1); // All nodes are caught up - getReplCoord()->setMyLastOptime(optime1); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(1, 1, optime1)); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(1, 2, optime1)); + getReplCoord()->setMyLastAppliedOpTime(optime1); + getReplCoord()->setMyLastDurableOpTime(optime1); + ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(1, 1, optime1)); + ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(1, 2, optime1)); Status status = getReplCoord()->stepDown(&txn, false, Milliseconds(0), Milliseconds(0)); ASSERT_EQUALS(ErrorCodes::NotMaster, status); @@ -1432,9 +1535,10 @@ TEST_F(StepDownTest, OperationContextReplMock txn; OpTimeWithTermZero optime1(100, 1); // All nodes are caught up - getReplCoord()->setMyLastOptime(optime1); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(1, 1, optime1)); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(1, 2, optime1)); + getReplCoord()->setMyLastAppliedOpTime(optime1); + getReplCoord()->setMyLastDurableOpTime(optime1); + ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(1, 1, optime1)); + ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(1, 2, optime1)); simulateSuccessfulV1Election(); @@ -1451,9 +1555,10 @@ TEST_F(StepDownTest, OperationContextReplMock txn; OpTimeWithTermZero optime1(100, 1); // All nodes are caught up - getReplCoord()->setMyLastOptime(optime1); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(1, 1, optime1)); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(1, 2, optime1)); + getReplCoord()->setMyLastAppliedOpTime(optime1); + getReplCoord()->setMyLastDurableOpTime(optime1); + ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(1, 1, optime1)); + ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(1, 2, optime1)); simulateSuccessfulV1Election(); @@ -1469,7 +1574,8 @@ TEST_F(StepDownTest, hbResp.setSetName(hbArgs.getSetName()); hbResp.setState(MemberState::RS_SECONDARY); hbResp.setConfigVersion(hbArgs.getConfigVersion()); - hbResp.setOpTime(optime1); + hbResp.setDurableOpTime(optime1); + hbResp.setAppliedOpTime(optime1); BSONObjBuilder respObj; respObj << "ok" << 1; hbResp.addToBSON(&respObj, false); @@ -1525,9 +1631,10 @@ TEST_F(StepDownTest, OpTimeWithTermZero optime2(100, 2); // No secondary is caught up auto repl = getReplCoord(); - repl->setMyLastOptime(optime2); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(1, 1, optime1)); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(1, 2, optime1)); + repl->setMyLastAppliedOpTime(optime2); + repl->setMyLastDurableOpTime(optime2); + ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(1, 1, optime1)); + ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(1, 2, optime1)); simulateSuccessfulV1Election(); @@ -1559,9 +1666,10 @@ TEST_F(StepDownTest, OpTimeWithTermZero optime2(100, 2); // No secondary is caught up auto repl = getReplCoord(); - repl->setMyLastOptime(optime2); - ASSERT_OK(repl->setLastOptime_forTest(1, 1, optime1)); - ASSERT_OK(repl->setLastOptime_forTest(1, 2, optime1)); + repl->setMyLastAppliedOpTime(optime2); + repl->setMyLastDurableOpTime(optime2); + ASSERT_OK(repl->setLastAppliedOptime_forTest(1, 1, optime1)); + ASSERT_OK(repl->setLastAppliedOptime_forTest(1, 2, optime1)); simulateSuccessfulV1Election(); @@ -1589,7 +1697,8 @@ TEST_F(StepDownTest, hbResp.setSetName(hbArgs.getSetName()); hbResp.setState(MemberState::RS_SECONDARY); hbResp.setConfigVersion(hbArgs.getConfigVersion()); - hbResp.setOpTime(optime2); + hbResp.setAppliedOpTime(optime2); + hbResp.setDurableOpTime(optime2); BSONObjBuilder respObj; respObj << "ok" << 1; hbResp.addToBSON(&respObj, false); @@ -1615,9 +1724,10 @@ TEST_F(StepDownTest, OpTimeWithTermZero optime2(100, 2); // No secondary is caught up auto repl = getReplCoord(); - repl->setMyLastOptime(optime2); - ASSERT_OK(repl->setLastOptime_forTest(1, 1, optime1)); - ASSERT_OK(repl->setLastOptime_forTest(1, 2, optime1)); + repl->setMyLastAppliedOpTime(optime2); + repl->setMyLastDurableOpTime(optime2); + ASSERT_OK(repl->setLastAppliedOptime_forTest(1, 1, optime1)); + ASSERT_OK(repl->setLastAppliedOptime_forTest(1, 2, optime1)); simulateSuccessfulV1Election(); @@ -1673,7 +1783,8 @@ TEST_F(StepDownTest, hbResp.setSetName(hbArgs.getSetName()); hbResp.setState(MemberState::RS_SECONDARY); hbResp.setConfigVersion(hbArgs.getConfigVersion()); - hbResp.setOpTime(optime2); + hbResp.setAppliedOpTime(optime2); + hbResp.setDurableOpTime(optime2); BSONObjBuilder respObj; respObj << "ok" << 1; hbResp.addToBSON(&respObj, false); @@ -1697,9 +1808,10 @@ TEST_F(StepDownTest, NodeReturnsInterruptedWhenInterruptedDuringStepDown) { OpTimeWithTermZero optime2(100, 2); // No secondary is caught up auto repl = getReplCoord(); - repl->setMyLastOptime(optime2); - ASSERT_OK(repl->setLastOptime_forTest(1, 1, optime1)); - ASSERT_OK(repl->setLastOptime_forTest(1, 2, optime1)); + repl->setMyLastAppliedOpTime(optime2); + repl->setMyLastDurableOpTime(optime2); + ASSERT_OK(repl->setLastAppliedOptime_forTest(1, 1, optime1)); + ASSERT_OK(repl->setLastAppliedOptime_forTest(1, 2, optime1)); simulateSuccessfulV1Election(); ASSERT_TRUE(repl->getMemberState().primary()); @@ -1770,18 +1882,92 @@ TEST_F(ReplCoordTest, NodeIncludesOtherMembersProgressInUpdatePositionCommand) { << "test1:1234") << BSON("_id" << 1 << "host" << "test2:1234") << BSON("_id" << 2 << "host" + << "test3:1234") + << BSON("_id" << 3 << "host" + << "test4:1234"))), + HostAndPort("test1", 1234)); + OpTime optime1({2, 1}, 1); + OpTime optime2({100, 1}, 1); + OpTime optime3({100, 2}, 1); + getReplCoord()->setMyLastAppliedOpTime(optime1); + getReplCoord()->setMyLastDurableOpTime(optime1); + ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(1, 1, optime2)); + ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(1, 2, optime3)); + ASSERT_OK(getReplCoord()->setLastDurableOptime_forTest(1, 2, optime3)); + ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(1, 3, optime3)); + ASSERT_OK(getReplCoord()->setLastDurableOptime_forTest(1, 3, optime1)); + + // Check that the proper BSON is generated for the replSetUpdatePositionCommand + BSONObjBuilder cmdBuilder; + getReplCoord()->prepareReplSetUpdatePositionCommand(&cmdBuilder); + BSONObj cmd = cmdBuilder.done(); + + ASSERT_EQUALS(2, cmd.nFields()); + ASSERT_EQUALS("replSetUpdatePosition", cmd.firstElement().fieldNameStringData()); + + std::set<long long> memberIds; + BSONForEach(entryElement, cmd["optimes"].Obj()) { + OpTime durableOpTime; + OpTime appliedOpTime; + BSONObj entry = entryElement.Obj(); + long long memberId = entry["memberId"].Number(); + memberIds.insert(memberId); + if (memberId == 0) { + log() << 0; + ASSERT_OK(bsonExtractOpTimeField(entry, "appliedOpTime", &appliedOpTime)); + ASSERT_OK(bsonExtractOpTimeField(entry, "durableOpTime", &durableOpTime)); + ASSERT_EQUALS(optime1, appliedOpTime); + ASSERT_EQUALS(optime1, durableOpTime); + } else if (memberId == 1) { + log() << 1; + ASSERT_OK(bsonExtractOpTimeField(entry, "appliedOpTime", &appliedOpTime)); + ASSERT_OK(bsonExtractOpTimeField(entry, "durableOpTime", &durableOpTime)); + ASSERT_EQUALS(optime2, appliedOpTime); + ASSERT_EQUALS(OpTime(), durableOpTime); + } else if (memberId == 2) { + log() << 2; + ASSERT_OK(bsonExtractOpTimeField(entry, "appliedOpTime", &appliedOpTime)); + ASSERT_OK(bsonExtractOpTimeField(entry, "durableOpTime", &durableOpTime)); + ASSERT_EQUALS(optime3, appliedOpTime); + ASSERT_EQUALS(optime3, durableOpTime); + } else { + log() << 3; + ASSERT_EQUALS(3, memberId); + ASSERT_OK(bsonExtractOpTimeField(entry, "appliedOpTime", &appliedOpTime)); + ASSERT_OK(bsonExtractOpTimeField(entry, "durableOpTime", &durableOpTime)); + ASSERT_EQUALS(optime3, appliedOpTime); + ASSERT_EQUALS(optime1, durableOpTime); + } + } + ASSERT_EQUALS(4U, memberIds.size()); // Make sure we saw all 4 nodes +} + +TEST_F(ReplCoordTest, NodeIncludesOtherMembersProgressInOldUpdatePositionCommand) { + OperationContextNoop txn; + init("mySet/test1:1234,test2:1234,test3:1234"); + assertStartSuccess( + BSON("_id" + << "mySet" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "test1:1234") + << BSON("_id" << 1 << "host" + << "test2:1234") << BSON("_id" << 2 << "host" << "test3:1234"))), HostAndPort("test1", 1234)); OpTimeWithTermZero optime1(100, 1); OpTimeWithTermZero optime2(100, 2); OpTimeWithTermZero optime3(2, 1); - getReplCoord()->setMyLastOptime(optime1); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(1, 1, optime2)); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(1, 2, optime3)); + getReplCoord()->setMyLastAppliedOpTime(optime1); + getReplCoord()->setMyLastDurableOpTime(optime1); + ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(1, 1, optime2)); + ASSERT_OK(getReplCoord()->setLastDurableOptime_forTest(1, 1, optime2)); + ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(1, 2, optime3)); + ASSERT_OK(getReplCoord()->setLastDurableOptime_forTest(1, 2, optime3)); // Check that the proper BSON is generated for the replSetUpdatePositionCommand BSONObjBuilder cmdBuilder; - getReplCoord()->prepareReplSetUpdatePositionCommand(&cmdBuilder); + getReplCoord()->prepareOldReplSetUpdatePositionCommand(&cmdBuilder); BSONObj cmd = cmdBuilder.done(); ASSERT_EQUALS(2, cmd.nFields()); @@ -1821,7 +2007,8 @@ TEST_F(ReplCoordTest, HostAndPort("test2", 1234)); OperationContextNoop txn; getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY); - getReplCoord()->setMyLastOptime(OpTimeWithTermZero(100, 0)); + getReplCoord()->setMyLastAppliedOpTime(OpTimeWithTermZero(100, 0)); + getReplCoord()->setMyLastDurableOpTime(OpTimeWithTermZero(100, 0)); // Can't unset maintenance mode if it was never set to begin with. Status status = getReplCoord()->setMaintenanceMode(false); @@ -1844,7 +2031,8 @@ TEST_F(ReplCoordTest, HostAndPort("test2", 1234)); OperationContextNoop txn; getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY); - getReplCoord()->setMyLastOptime(OpTimeWithTermZero(100, 0)); + getReplCoord()->setMyLastAppliedOpTime(OpTimeWithTermZero(100, 0)); + getReplCoord()->setMyLastDurableOpTime(OpTimeWithTermZero(100, 0)); // valid set ASSERT_OK(getReplCoord()->setMaintenanceMode(true)); ASSERT_TRUE(getReplCoord()->getMemberState().recovering()); @@ -1872,7 +2060,8 @@ TEST_F(ReplCoordTest, AllowAsManyUnsetMaintenanceModesAsThereHaveBeenSetMaintena HostAndPort("test2", 1234)); OperationContextNoop txn; getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY); - getReplCoord()->setMyLastOptime(OpTimeWithTermZero(100, 0)); + getReplCoord()->setMyLastAppliedOpTime(OpTimeWithTermZero(100, 0)); + getReplCoord()->setMyLastDurableOpTime(OpTimeWithTermZero(100, 0)); // Can set multiple times ASSERT_OK(getReplCoord()->setMaintenanceMode(true)); ASSERT_OK(getReplCoord()->setMaintenanceMode(true)); @@ -1902,7 +2091,8 @@ TEST_F(ReplCoordTest, SettingAndUnsettingMaintenanceModeShouldNotAffectRollbackS HostAndPort("test2", 1234)); OperationContextNoop txn; getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY); - getReplCoord()->setMyLastOptime(OpTimeWithTermZero(100, 0)); + getReplCoord()->setMyLastAppliedOpTime(OpTimeWithTermZero(100, 0)); + getReplCoord()->setMyLastDurableOpTime(OpTimeWithTermZero(100, 0)); // From rollback, entering and exiting maintenance mode doesn't change perceived // state. @@ -1940,7 +2130,8 @@ TEST_F(ReplCoordTest, DoNotAllowMaintenanceModeWhilePrimary) { HostAndPort("test2", 1234)); OperationContextNoop txn; getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY); - getReplCoord()->setMyLastOptime(OpTimeWithTermZero(100, 0)); + getReplCoord()->setMyLastAppliedOpTime(OpTimeWithTermZero(100, 0)); + getReplCoord()->setMyLastDurableOpTime(OpTimeWithTermZero(100, 0)); // Can't modify maintenance mode when PRIMARY simulateSuccessfulV1Election(); @@ -1972,7 +2163,8 @@ TEST_F(ReplCoordTest, DoNotAllowSettingMaintenanceModeWhileConductingAnElection) HostAndPort("test2", 1234)); OperationContextNoop txn; getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY); - getReplCoord()->setMyLastOptime(OpTimeWithTermZero(100, 0)); + getReplCoord()->setMyLastAppliedOpTime(OpTimeWithTermZero(100, 0)); + getReplCoord()->setMyLastDurableOpTime(OpTimeWithTermZero(100, 0)); // TODO this election shouldn't have to happen. simulateSuccessfulV1Election(); @@ -2018,6 +2210,50 @@ TEST_F(ReplCoordTest, DoNotAllowSettingMaintenanceModeWhileConductingAnElection) } TEST_F(ReplCoordTest, + NodeReturnsACompleteListOfNodesWeKnowHaveTheWriteDurablyInResponseToGetHostsWrittenTo) { + HostAndPort myHost("node1:12345"); + HostAndPort client1Host("node2:12345"); + HostAndPort client2Host("node3:12345"); + assertStartSuccess(BSON("_id" + << "mySet" + << "version" << 2 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" << myHost.toString()) + << BSON("_id" << 1 << "host" << client1Host.toString()) + << BSON("_id" << 2 << "host" << client2Host.toString()))), + HostAndPort("node1", 12345)); + OperationContextNoop txn; + + OpTimeWithTermZero time1(100, 1); + OpTimeWithTermZero time2(100, 2); + + getReplCoord()->setMyLastAppliedOpTime(time2); + getReplCoord()->setMyLastDurableOpTime(time2); + ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(2, 1, time1)); + ASSERT_OK(getReplCoord()->setLastDurableOptime_forTest(2, 1, time1)); + + std::vector<HostAndPort> caughtUpHosts = getReplCoord()->getHostsWrittenTo(time2, true); + ASSERT_EQUALS(1U, caughtUpHosts.size()); + ASSERT_EQUALS(myHost, caughtUpHosts[0]); + + // Ensure updating applied does not affect the results for getHostsWritten durably. + ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(2, 2, time2)); + caughtUpHosts = getReplCoord()->getHostsWrittenTo(time2, true); + ASSERT_EQUALS(1U, caughtUpHosts.size()); + ASSERT_EQUALS(myHost, caughtUpHosts[0]); + + ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(2, 2, time2)); + ASSERT_OK(getReplCoord()->setLastDurableOptime_forTest(2, 2, time2)); + caughtUpHosts = getReplCoord()->getHostsWrittenTo(time2, true); + ASSERT_EQUALS(2U, caughtUpHosts.size()); + if (myHost == caughtUpHosts[0]) { + ASSERT_EQUALS(client2Host, caughtUpHosts[1]); + } else { + ASSERT_EQUALS(client2Host, caughtUpHosts[0]); + ASSERT_EQUALS(myHost, caughtUpHosts[1]); + } +} + +TEST_F(ReplCoordTest, NodeReturnsACompleteListOfNodesWeKnowHaveTheWriteInResponseToGetHostsWrittenTo) { HostAndPort myHost("node1:12345"); HostAndPort client1Host("node2:12345"); @@ -2034,15 +2270,16 @@ TEST_F(ReplCoordTest, OpTimeWithTermZero time1(100, 1); OpTimeWithTermZero time2(100, 2); - getReplCoord()->setMyLastOptime(time2); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(2, 1, time1)); + getReplCoord()->setMyLastAppliedOpTime(time2); + getReplCoord()->setMyLastDurableOpTime(time2); + ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(2, 1, time1)); - std::vector<HostAndPort> caughtUpHosts = getReplCoord()->getHostsWrittenTo(time2); + std::vector<HostAndPort> caughtUpHosts = getReplCoord()->getHostsWrittenTo(time2, false); ASSERT_EQUALS(1U, caughtUpHosts.size()); ASSERT_EQUALS(myHost, caughtUpHosts[0]); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(2, 2, time2)); - caughtUpHosts = getReplCoord()->getHostsWrittenTo(time2); + ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(2, 2, time2)); + caughtUpHosts = getReplCoord()->getHostsWrittenTo(time2, false); ASSERT_EQUALS(2U, caughtUpHosts.size()); if (myHost == caughtUpHosts[0]) { ASSERT_EQUALS(client2Host, caughtUpHosts[1]); @@ -2068,14 +2305,15 @@ TEST_F(ReplCoordTest, NodeDoesNotIncludeItselfWhenRunningGetHostsWrittenToInMast ASSERT_OK(handshake.initialize(BSON("handshake" << client))); ASSERT_OK(getReplCoord()->processHandshake(&txn, handshake)); - getReplCoord()->setMyLastOptime(time2); + getReplCoord()->setMyLastAppliedOpTime(time2); + getReplCoord()->setMyLastDurableOpTime(time2); ASSERT_OK(getReplCoord()->setLastOptimeForSlave(client, time1.timestamp)); - std::vector<HostAndPort> caughtUpHosts = getReplCoord()->getHostsWrittenTo(time2); + std::vector<HostAndPort> caughtUpHosts = getReplCoord()->getHostsWrittenTo(time2, false); ASSERT_EQUALS(0U, caughtUpHosts.size()); // self doesn't get included in master-slave ASSERT_OK(getReplCoord()->setLastOptimeForSlave(client, time2.timestamp)); - caughtUpHosts = getReplCoord()->getHostsWrittenTo(time2); + caughtUpHosts = getReplCoord()->getHostsWrittenTo(time2, false); ASSERT_EQUALS(1U, caughtUpHosts.size()); ASSERT_EQUALS(clientHost, caughtUpHosts[0]); } @@ -2208,13 +2446,61 @@ TEST_F(ReplCoordTest, DoNotProcessSelfWhenUpdatePositionContainsInfoAboutSelf) { << "_id" << 2))), HostAndPort("node1", 12345)); ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); - getReplCoord()->setMyLastOptime(OpTimeWithTermZero(100, 0)); + getReplCoord()->setMyLastAppliedOpTime(OpTimeWithTermZero(100, 0)); + getReplCoord()->setMyLastDurableOpTime(OpTimeWithTermZero(100, 0)); + simulateSuccessfulV1Election(); + + OpTime time1({100, 1}, 2); + OpTime time2({100, 2}, 2); + getReplCoord()->setMyLastAppliedOpTime(time1); + getReplCoord()->setMyLastDurableOpTime(time1); + + WriteConcernOptions writeConcern; + writeConcern.wTimeout = WriteConcernOptions::kNoWaiting; + writeConcern.wNumNodes = 1; + + ASSERT_EQUALS(ErrorCodes::WriteConcernFailed, + getReplCoord()->awaitReplication(&txn, time2, writeConcern).status); + + // receive updatePosition containing ourself, should not process the update for self + UpdatePositionArgs args; + ASSERT_OK(args.initialize( + BSON("replSetUpdatePosition" + << 1 << "optimes" + << BSON_ARRAY(BSON("cfgver" << 2 << "memberId" << 0 << "durableOpTime" + << BSON("ts" << time2.getTimestamp() << "t" << 2) + << "appliedOpTime" + << BSON("ts" << time2.getTimestamp() << "t" << 2)))))); + + ASSERT_OK(getReplCoord()->processReplSetUpdatePosition(args, 0)); + ASSERT_EQUALS(ErrorCodes::WriteConcernFailed, + getReplCoord()->awaitReplication(&txn, time2, writeConcern).status); +} + +TEST_F(ReplCoordTest, DoNotProcessSelfWhenOldUpdatePositionContainsInfoAboutSelf) { + OperationContextNoop txn; + assertStartSuccess(BSON("_id" + << "mySet" + << "version" << 2 << "members" + << BSON_ARRAY(BSON("host" + << "node1:12345" + << "_id" << 0) + << BSON("host" + << "node2:12345" + << "_id" << 1) << BSON("host" + << "node3:12345" + << "_id" << 2))), + HostAndPort("node1", 12345)); + ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); + getReplCoord()->setMyLastAppliedOpTime(OpTimeWithTermZero(100, 0)); + getReplCoord()->setMyLastDurableOpTime(OpTimeWithTermZero(100, 0)); simulateSuccessfulV1Election(); OpTimeWithTermZero time1(100, 1); OpTimeWithTermZero time2(100, 2); OpTimeWithTermZero staleTime(10, 0); - getReplCoord()->setMyLastOptime(time1); + getReplCoord()->setMyLastAppliedOpTime(time1); + getReplCoord()->setMyLastDurableOpTime(time1); WriteConcernOptions writeConcern; writeConcern.wTimeout = WriteConcernOptions::kNoWaiting; @@ -2224,7 +2510,7 @@ TEST_F(ReplCoordTest, DoNotProcessSelfWhenUpdatePositionContainsInfoAboutSelf) { getReplCoord()->awaitReplication(&txn, time2, writeConcern).status); // receive updatePosition containing ourself, should not process the update for self - UpdatePositionArgs args; + OldUpdatePositionArgs args; ASSERT_OK(args.initialize(BSON("replSetUpdatePosition" << 1 << "optimes" << BSON_ARRAY(BSON("cfgver" << 2 << "memberId" << 0 << "optime" @@ -2250,20 +2536,67 @@ TEST_F(ReplCoordTest, DoNotProcessUpdatePositionWhenItsConfigVersionIsIncorrect) << "_id" << 2))), HostAndPort("node1", 12345)); ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); - getReplCoord()->setMyLastOptime(OpTimeWithTermZero(100, 0)); + getReplCoord()->setMyLastAppliedOpTime(OpTimeWithTermZero(100, 0)); + getReplCoord()->setMyLastDurableOpTime(OpTimeWithTermZero(100, 0)); + simulateSuccessfulV1Election(); + + OpTime time1({100, 1}, 3); + OpTime time2({100, 2}, 3); + getReplCoord()->setMyLastAppliedOpTime(time1); + getReplCoord()->setMyLastDurableOpTime(time1); + + WriteConcernOptions writeConcern; + writeConcern.wTimeout = WriteConcernOptions::kNoWaiting; + writeConcern.wNumNodes = 1; + + // receive updatePosition with incorrect config version + UpdatePositionArgs args; + ASSERT_OK(args.initialize( + BSON("replSetUpdatePosition" + << 1 << "optimes" + << BSON_ARRAY(BSON("cfgver" << 3 << "memberId" << 1 << "durableOpTime" + << BSON("ts" << time2.getTimestamp() << "t" << 3) + << "appliedOpTime" + << BSON("ts" << time2.getTimestamp() << "t" << 3)))))); + + long long cfgver; + ASSERT_EQUALS(ErrorCodes::InvalidReplicaSetConfig, + getReplCoord()->processReplSetUpdatePosition(args, &cfgver)); + ASSERT_EQUALS(ErrorCodes::WriteConcernFailed, + getReplCoord()->awaitReplication(&txn, time2, writeConcern).status); +} + +TEST_F(ReplCoordTest, DoNotProcessOldUpdatePositionWhenItsConfigVersionIsIncorrect) { + OperationContextNoop txn; + assertStartSuccess(BSON("_id" + << "mySet" + << "version" << 2 << "members" + << BSON_ARRAY(BSON("host" + << "node1:12345" + << "_id" << 0) + << BSON("host" + << "node2:12345" + << "_id" << 1) << BSON("host" + << "node3:12345" + << "_id" << 2))), + HostAndPort("node1", 12345)); + ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); + getReplCoord()->setMyLastAppliedOpTime(OpTimeWithTermZero(100, 0)); + getReplCoord()->setMyLastDurableOpTime(OpTimeWithTermZero(100, 0)); simulateSuccessfulV1Election(); OpTimeWithTermZero time1(100, 1); OpTimeWithTermZero time2(100, 2); OpTimeWithTermZero staleTime(10, 0); - getReplCoord()->setMyLastOptime(time1); + getReplCoord()->setMyLastAppliedOpTime(time1); + getReplCoord()->setMyLastDurableOpTime(time1); WriteConcernOptions writeConcern; writeConcern.wTimeout = WriteConcernOptions::kNoWaiting; writeConcern.wNumNodes = 1; // receive updatePosition with incorrect config version - UpdatePositionArgs args; + OldUpdatePositionArgs args; ASSERT_OK(args.initialize(BSON("replSetUpdatePosition" << 1 << "optimes" << BSON_ARRAY(BSON("cfgver" << 3 << "memberId" << 1 << "optime" @@ -2291,20 +2624,65 @@ TEST_F(ReplCoordTest, DoNotProcessUpdatePositionOfMembersWhoseIdsAreNotInTheConf << "_id" << 2))), HostAndPort("node1", 12345)); ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); - getReplCoord()->setMyLastOptime(OpTimeWithTermZero(100, 0)); + getReplCoord()->setMyLastAppliedOpTime(OpTimeWithTermZero(100, 0)); + getReplCoord()->setMyLastDurableOpTime(OpTimeWithTermZero(100, 0)); + simulateSuccessfulV1Election(); + + OpTime time1({100, 1}, 2); + OpTime time2({100, 2}, 2); + getReplCoord()->setMyLastAppliedOpTime(time1); + getReplCoord()->setMyLastDurableOpTime(time1); + + WriteConcernOptions writeConcern; + writeConcern.wTimeout = WriteConcernOptions::kNoWaiting; + writeConcern.wNumNodes = 1; + + // receive updatePosition with nonexistent member id + UpdatePositionArgs args; + ASSERT_OK(args.initialize( + BSON("replSetUpdatePosition" + << 1 << "optimes" + << BSON_ARRAY(BSON("cfgver" << 2 << "memberId" << 9 << "durableOpTime" + << BSON("ts" << time2.getTimestamp() << "t" << 2) + << "appliedOpTime" + << BSON("ts" << time2.getTimestamp() << "t" << 2)))))); + + ASSERT_EQUALS(ErrorCodes::NodeNotFound, getReplCoord()->processReplSetUpdatePosition(args, 0)); + ASSERT_EQUALS(ErrorCodes::WriteConcernFailed, + getReplCoord()->awaitReplication(&txn, time2, writeConcern).status); +} + +TEST_F(ReplCoordTest, DoNotProcessOldUpdatePositionOfMembersWhoseIdsAreNotInTheConfig) { + OperationContextNoop txn; + assertStartSuccess(BSON("_id" + << "mySet" + << "version" << 2 << "members" + << BSON_ARRAY(BSON("host" + << "node1:12345" + << "_id" << 0) + << BSON("host" + << "node2:12345" + << "_id" << 1) << BSON("host" + << "node3:12345" + << "_id" << 2))), + HostAndPort("node1", 12345)); + ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); + getReplCoord()->setMyLastAppliedOpTime(OpTimeWithTermZero(100, 0)); + getReplCoord()->setMyLastDurableOpTime(OpTimeWithTermZero(100, 0)); simulateSuccessfulV1Election(); OpTimeWithTermZero time1(100, 1); OpTimeWithTermZero time2(100, 2); OpTimeWithTermZero staleTime(10, 0); - getReplCoord()->setMyLastOptime(time1); + getReplCoord()->setMyLastAppliedOpTime(time1); + getReplCoord()->setMyLastDurableOpTime(time1); WriteConcernOptions writeConcern; writeConcern.wTimeout = WriteConcernOptions::kNoWaiting; writeConcern.wNumNodes = 1; // receive updatePosition with nonexistent member id - UpdatePositionArgs args; + OldUpdatePositionArgs args; ASSERT_OK(args.initialize(BSON("replSetUpdatePosition" << 1 << "optimes" << BSON_ARRAY(BSON("cfgver" << 2 << "memberId" << 9 << "optime" @@ -2331,21 +2709,24 @@ TEST_F(ReplCoordTest, << "_id" << 2))), HostAndPort("node1", 12345)); ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); - getReplCoord()->setMyLastOptime(OpTimeWithTermZero(100, 0)); + getReplCoord()->setMyLastAppliedOpTime(OpTimeWithTermZero(100, 0)); + getReplCoord()->setMyLastDurableOpTime(OpTimeWithTermZero(100, 0)); simulateSuccessfulV1Election(); OpTimeWithTermZero time1(100, 1); OpTimeWithTermZero time2(100, 2); OpTimeWithTermZero staleTime(10, 0); - getReplCoord()->setMyLastOptime(time1); + getReplCoord()->setMyLastAppliedOpTime(time1); + getReplCoord()->setMyLastDurableOpTime(time1); WriteConcernOptions writeConcern; writeConcern.wTimeout = WriteConcernOptions::kNoWaiting; writeConcern.wNumNodes = 1; // receive a good update position - getReplCoord()->setMyLastOptime(time2); - UpdatePositionArgs args; + getReplCoord()->setMyLastAppliedOpTime(time2); + getReplCoord()->setMyLastDurableOpTime(time2); + OldUpdatePositionArgs args; ASSERT_OK(args.initialize( BSON("replSetUpdatePosition" << 1 << "optimes" @@ -2393,7 +2774,8 @@ TEST_F(ReplCoordTest, AwaitReplicationShouldResolveAsNormalDuringAReconfig) { << "_id" << 2))), HostAndPort("node1", 12345)); ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); - getReplCoord()->setMyLastOptime(OpTimeWithTermZero(100, 2)); + getReplCoord()->setMyLastAppliedOpTime(OpTimeWithTermZero(100, 2)); + getReplCoord()->setMyLastDurableOpTime(OpTimeWithTermZero(100, 2)); simulateSuccessfulV1Election(); OpTimeWithTermZero time(100, 2); @@ -2408,6 +2790,12 @@ TEST_F(ReplCoordTest, AwaitReplicationShouldResolveAsNormalDuringAReconfig) { awaiter.setWriteConcern(writeConcern); awaiter.start(&txn); + ReplicationAwaiter awaiterJournaled(getReplCoord(), &txn); + writeConcern.wMode = WriteConcernOptions::kMajority; + awaiterJournaled.setOpTime(time); + awaiterJournaled.setWriteConcern(writeConcern); + awaiterJournaled.start(&txn); + // reconfig Status status(ErrorCodes::InternalError, "Not Set"); stdx::thread reconfigThread(stdx::bind(doReplSetReconfig, getReplCoord(), &status)); @@ -2417,12 +2805,22 @@ TEST_F(ReplCoordTest, AwaitReplicationShouldResolveAsNormalDuringAReconfig) { ASSERT_OK(status); // satisfy write concern - ASSERT_OK(getReplCoord()->setLastOptime_forTest(3, 0, time)); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(3, 1, time)); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(3, 2, time)); + ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(3, 0, time)); + ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(3, 1, time)); + ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(3, 2, time)); ReplicationCoordinator::StatusAndDuration statusAndDur = awaiter.getResult(); ASSERT_OK(statusAndDur.status); awaiter.reset(); + + ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(3, 0, time)); + ASSERT_OK(getReplCoord()->setLastDurableOptime_forTest(3, 0, time)); + ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(3, 1, time)); + ASSERT_OK(getReplCoord()->setLastDurableOptime_forTest(3, 1, time)); + ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(3, 2, time)); + ASSERT_OK(getReplCoord()->setLastDurableOptime_forTest(3, 2, time)); + ReplicationCoordinator::StatusAndDuration statusAndDurJournaled = awaiterJournaled.getResult(); + ASSERT_OK(statusAndDurJournaled.status); + awaiterJournaled.reset(); } void doReplSetReconfigToFewer(ReplicationCoordinatorImpl* replCoord, Status* status) { @@ -2457,7 +2855,8 @@ TEST_F( << "_id" << 2))), HostAndPort("node1", 12345)); ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); - getReplCoord()->setMyLastOptime(OpTimeWithTermZero(100, 2)); + getReplCoord()->setMyLastAppliedOpTime(OpTimeWithTermZero(100, 2)); + getReplCoord()->setMyLastDurableOpTime(OpTimeWithTermZero(100, 2)); simulateSuccessfulV1Election(); OpTimeWithTermZero time(100, 2); @@ -2472,6 +2871,12 @@ TEST_F( awaiter.setWriteConcern(writeConcern); awaiter.start(&txn); + ReplicationAwaiter awaiterJournaled(getReplCoord(), &txn); + writeConcern.wMode = WriteConcernOptions::kMajority; + awaiterJournaled.setOpTime(time); + awaiterJournaled.setWriteConcern(writeConcern); + awaiterJournaled.start(&txn); + // reconfig to fewer nodes Status status(ErrorCodes::InternalError, "Not Set"); stdx::thread reconfigThread(stdx::bind(doReplSetReconfigToFewer, getReplCoord(), &status)); @@ -2485,6 +2890,9 @@ TEST_F( ReplicationCoordinator::StatusAndDuration statusAndDur = awaiter.getResult(); ASSERT_EQUALS(ErrorCodes::CannotSatisfyWriteConcern, statusAndDur.status); awaiter.reset(); + ReplicationCoordinator::StatusAndDuration statusAndDurJournaled = awaiterJournaled.getResult(); + ASSERT_EQUALS(ErrorCodes::CannotSatisfyWriteConcern, statusAndDurJournaled.status); + awaiterJournaled.reset(); } TEST_F(ReplCoordTest, @@ -2508,14 +2916,16 @@ TEST_F(ReplCoordTest, << "_id" << 4))), HostAndPort("node1", 12345)); ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); - getReplCoord()->setMyLastOptime(OpTimeWithTermZero(100, 1)); + getReplCoord()->setMyLastAppliedOpTime(OpTimeWithTermZero(100, 1)); + getReplCoord()->setMyLastDurableOpTime(OpTimeWithTermZero(100, 1)); simulateSuccessfulV1Election(); OpTime time(Timestamp(100, 2), 1); - getReplCoord()->setMyLastOptime(time); + getReplCoord()->setMyLastAppliedOpTime(time); + getReplCoord()->setMyLastDurableOpTime(time); getReplCoord()->onSnapshotCreate(time, SnapshotName(1)); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(2, 1, time)); + ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(2, 1, time)); // majority nodes waiting for time @@ -2550,7 +2960,7 @@ TEST_F(ReplCoordTest, } TEST_F(ReplCoordTest, - NodeReturnsFromMajorityWriteConcernOnlyOnceAMajorityOfVotingNodesHaveReceivedTheWrite) { + NodeReturnsFromMajorityWriteConcernOnlyOnceTheWriteAppearsInACommittedSnapShot) { // Test that we can satisfy majority write concern can only be // satisfied by voting data-bearing members. OperationContextNoop txn; @@ -2574,26 +2984,31 @@ TEST_F(ReplCoordTest, HostAndPort("node1", 12345)); ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); OpTime time(Timestamp(100, 0), 1); - getReplCoord()->setMyLastOptime(time); + getReplCoord()->setMyLastAppliedOpTime(time); + getReplCoord()->setMyLastDurableOpTime(time); simulateSuccessfulV1Election(); WriteConcernOptions majorityWriteConcern; majorityWriteConcern.wTimeout = WriteConcernOptions::kNoWaiting; majorityWriteConcern.wMode = WriteConcernOptions::kMajority; + majorityWriteConcern.syncMode = WriteConcernOptions::SyncMode::JOURNAL; ASSERT_EQUALS(ErrorCodes::WriteConcernFailed, getReplCoord()->awaitReplication(&txn, time, majorityWriteConcern).status); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(2, 1, time)); + ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(2, 1, time)); + ASSERT_OK(getReplCoord()->setLastDurableOptime_forTest(2, 1, time)); ASSERT_EQUALS(ErrorCodes::WriteConcernFailed, getReplCoord()->awaitReplication(&txn, time, majorityWriteConcern).status); // this member does not vote and as a result should not count towards write concern - ASSERT_OK(getReplCoord()->setLastOptime_forTest(2, 3, time)); + ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(2, 3, time)); + ASSERT_OK(getReplCoord()->setLastDurableOptime_forTest(2, 3, time)); ASSERT_EQUALS(ErrorCodes::WriteConcernFailed, getReplCoord()->awaitReplication(&txn, time, majorityWriteConcern).status); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(2, 2, time)); + ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(2, 2, time)); + ASSERT_OK(getReplCoord()->setLastDurableOptime_forTest(2, 2, time)); ASSERT_EQUALS(ErrorCodes::WriteConcernFailed, getReplCoord()->awaitReplication(&txn, time, majorityWriteConcern).status); @@ -2626,30 +3041,38 @@ TEST_F(ReplCoordTest, ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); OpTime zero(Timestamp(0, 0), 0); OpTime time(Timestamp(100, 0), 1); - getReplCoord()->setMyLastOptime(time); + getReplCoord()->setMyLastAppliedOpTime(time); + getReplCoord()->setMyLastDurableOpTime(time); simulateSuccessfulV1Election(); ASSERT_EQUALS(zero, getReplCoord()->getLastCommittedOpTime()); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(2, 1, time)); + ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(2, 1, time)); + ASSERT_OK(getReplCoord()->setLastDurableOptime_forTest(2, 1, time)); ASSERT_EQUALS(zero, getReplCoord()->getLastCommittedOpTime()); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(2, 3, time)); + ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(2, 3, time)); + ASSERT_OK(getReplCoord()->setLastDurableOptime_forTest(2, 3, time)); ASSERT_EQUALS(zero, getReplCoord()->getLastCommittedOpTime()); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(2, 2, time)); + ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(2, 2, time)); + ASSERT_OK(getReplCoord()->setLastDurableOptime_forTest(2, 2, time)); ASSERT_EQUALS(time, getReplCoord()->getLastCommittedOpTime()); // Set a new, later OpTime. OpTime newTime(Timestamp(100, 1), 1); - getReplCoord()->setMyLastOptime(newTime); + getReplCoord()->setMyLastAppliedOpTime(newTime); + getReplCoord()->setMyLastDurableOpTime(newTime); ASSERT_EQUALS(time, getReplCoord()->getLastCommittedOpTime()); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(2, 3, newTime)); + ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(2, 3, newTime)); + ASSERT_OK(getReplCoord()->setLastDurableOptime_forTest(2, 3, newTime)); ASSERT_EQUALS(time, getReplCoord()->getLastCommittedOpTime()); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(2, 2, newTime)); + ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(2, 2, newTime)); + ASSERT_OK(getReplCoord()->setLastDurableOptime_forTest(2, 2, newTime)); // Reached majority of voting nodes with newTime. ASSERT_EQUALS(time, getReplCoord()->getLastCommittedOpTime()); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(2, 1, newTime)); + ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(2, 1, newTime)); + ASSERT_OK(getReplCoord()->setLastDurableOptime_forTest(2, 1, newTime)); ASSERT_EQUALS(newTime, getReplCoord()->getLastCommittedOpTime()); } @@ -2662,7 +3085,8 @@ TEST_F(ReplCoordTest, NodeReturnsShutdownInProgressWhenWaitingUntilAnOpTimeDurin << "_id" << 0))), HostAndPort("node1", 12345)); - getReplCoord()->setMyLastOptime(OpTimeWithTermZero(10, 0)); + getReplCoord()->setMyLastAppliedOpTime(OpTimeWithTermZero(10, 0)); + getReplCoord()->setMyLastDurableOpTime(OpTimeWithTermZero(10, 0)); shutdown(); @@ -2682,7 +3106,8 @@ TEST_F(ReplCoordTest, NodeReturnsInterruptedWhenWaitingUntilAnOpTimeIsInterrupte << "_id" << 0))), HostAndPort("node1", 12345)); - getReplCoord()->setMyLastOptime(OpTimeWithTermZero(10, 0)); + getReplCoord()->setMyLastAppliedOpTime(OpTimeWithTermZero(10, 0)); + getReplCoord()->setMyLastDurableOpTime(OpTimeWithTermZero(10, 0)); txn.setCheckForInterruptStatus(Status(ErrorCodes::Interrupted, "test")); @@ -2717,7 +3142,8 @@ TEST_F(ReplCoordTest, NodeReturnsOkImmediatelyWhenWaitingUntilOpTimePassesAnOpTi << "_id" << 0))), HostAndPort("node1", 12345)); - getReplCoord()->setMyLastOptime(OpTimeWithTermZero(100, 0)); + getReplCoord()->setMyLastAppliedOpTime(OpTimeWithTermZero(100, 0)); + getReplCoord()->setMyLastDurableOpTime(OpTimeWithTermZero(100, 0)); auto result = getReplCoord()->waitUntilOpTime( &txn, ReadConcernArgs(OpTimeWithTermZero(50, 0), ReadConcernLevel::kLocalReadConcern)); @@ -2736,7 +3162,8 @@ TEST_F(ReplCoordTest, NodeReturnsOkImmediatelyWhenWaitingUntilOpTimePassesAnOpTi OpTimeWithTermZero time(100, 0); - getReplCoord()->setMyLastOptime(time); + getReplCoord()->setMyLastAppliedOpTime(time); + getReplCoord()->setMyLastDurableOpTime(time); auto result = getReplCoord()->waitUntilOpTime( &txn, ReadConcernArgs(time, ReadConcernLevel::kLocalReadConcern)); @@ -2779,7 +3206,8 @@ TEST_F(ReplCoordTest, ReadAfterCommittedWhileShutdown) { HostAndPort("node1", 12345)); runSingleNodeElection(getReplCoord()); - getReplCoord()->setMyLastOptime(OpTime(Timestamp(10, 0), 0)); + getReplCoord()->setMyLastAppliedOpTime(OpTime(Timestamp(10, 0), 0)); + getReplCoord()->setMyLastDurableOpTime(OpTime(Timestamp(10, 0), 0)); shutdown(); @@ -2800,7 +3228,8 @@ TEST_F(ReplCoordTest, ReadAfterCommittedInterrupted) { HostAndPort("node1", 12345)); runSingleNodeElection(getReplCoord()); - getReplCoord()->setMyLastOptime(OpTime(Timestamp(10, 0), 0)); + getReplCoord()->setMyLastAppliedOpTime(OpTime(Timestamp(10, 0), 0)); + getReplCoord()->setMyLastDurableOpTime(OpTime(Timestamp(10, 0), 0)); txn.setCheckForInterruptStatus(Status(ErrorCodes::Interrupted, "test")); @@ -2821,7 +3250,8 @@ TEST_F(ReplCoordTest, ReadAfterCommittedGreaterOpTime) { HostAndPort("node1", 12345)); runSingleNodeElection(getReplCoord()); - getReplCoord()->setMyLastOptime(OpTime(Timestamp(100, 0), 1)); + getReplCoord()->setMyLastAppliedOpTime(OpTime(Timestamp(100, 0), 1)); + getReplCoord()->setMyLastDurableOpTime(OpTime(Timestamp(100, 0), 1)); getReplCoord()->onSnapshotCreate(OpTime(Timestamp(100, 0), 1), SnapshotName(1)); auto result = getReplCoord()->waitUntilOpTime( &txn, ReadConcernArgs(OpTime(Timestamp(50, 0), 1), ReadConcernLevel::kMajorityReadConcern)); @@ -2840,7 +3270,8 @@ TEST_F(ReplCoordTest, ReadAfterCommittedEqualOpTime) { HostAndPort("node1", 12345)); runSingleNodeElection(getReplCoord()); OpTime time(Timestamp(100, 0), 1); - getReplCoord()->setMyLastOptime(time); + getReplCoord()->setMyLastAppliedOpTime(time); + getReplCoord()->setMyLastDurableOpTime(time); getReplCoord()->onSnapshotCreate(time, SnapshotName(1)); auto result = getReplCoord()->waitUntilOpTime( &txn, ReadConcernArgs(time, ReadConcernLevel::kMajorityReadConcern)); @@ -2858,13 +3289,15 @@ TEST_F(ReplCoordTest, ReadAfterCommittedDeferredGreaterOpTime) { << "_id" << 0))), HostAndPort("node1", 12345)); runSingleNodeElection(getReplCoord()); - getReplCoord()->setMyLastOptime(OpTime(Timestamp(0, 0), 1)); + getReplCoord()->setMyLastAppliedOpTime(OpTime(Timestamp(0, 0), 1)); + getReplCoord()->setMyLastDurableOpTime(OpTime(Timestamp(0, 0), 1)); OpTime committedOpTime(Timestamp(200, 0), 1); auto pseudoLogOp = stdx::async(stdx::launch::async, [this, &committedOpTime]() { // Not guaranteed to be scheduled after waitUntil blocks... - getReplCoord()->setMyLastOptime(committedOpTime); + getReplCoord()->setMyLastAppliedOpTime(committedOpTime); + getReplCoord()->setMyLastDurableOpTime(committedOpTime); getReplCoord()->onSnapshotCreate(committedOpTime, SnapshotName(1)); }); @@ -2886,7 +3319,8 @@ TEST_F(ReplCoordTest, ReadAfterCommittedDeferredEqualOpTime) { << "_id" << 0))), HostAndPort("node1", 12345)); runSingleNodeElection(getReplCoord()); - getReplCoord()->setMyLastOptime(OpTime(Timestamp(0, 0), 1)); + getReplCoord()->setMyLastAppliedOpTime(OpTime(Timestamp(0, 0), 1)); + getReplCoord()->setMyLastDurableOpTime(OpTime(Timestamp(0, 0), 1)); OpTime opTimeToWait(Timestamp(100, 0), 1); @@ -2894,7 +3328,8 @@ TEST_F(ReplCoordTest, ReadAfterCommittedDeferredEqualOpTime) { stdx::async(stdx::launch::async, [this, &opTimeToWait]() { // Not guaranteed to be scheduled after waitUntil blocks... - getReplCoord()->setMyLastOptime(opTimeToWait); + getReplCoord()->setMyLastAppliedOpTime(opTimeToWait); + getReplCoord()->setMyLastDurableOpTime(opTimeToWait); getReplCoord()->onSnapshotCreate(opTimeToWait, SnapshotName(1)); }); @@ -3373,9 +3808,11 @@ TEST_F(ReplCoordTest, AdvanceCommittedSnapshotToMostRecentSnapshotPriorToOpTimeW getReplCoord()->onSnapshotCreate(time5, SnapshotName(3)); // ensure current snapshot follows price is right rules (closest but not greater than) - getReplCoord()->setMyLastOptime(time3); + getReplCoord()->setMyLastAppliedOpTime(time3); + getReplCoord()->setMyLastDurableOpTime(time3); ASSERT_EQUALS(time2, getReplCoord()->getCurrentCommittedSnapshotOpTime()); - getReplCoord()->setMyLastOptime(time4); + getReplCoord()->setMyLastAppliedOpTime(time4); + getReplCoord()->setMyLastDurableOpTime(time4); ASSERT_EQUALS(time2, getReplCoord()->getCurrentCommittedSnapshotOpTime()); } @@ -3403,7 +3840,8 @@ TEST_F(ReplCoordTest, DoNotAdvanceCommittedSnapshotWhenAnOpTimeIsNewerThanOurLat getReplCoord()->onSnapshotCreate(time5, SnapshotName(3)); // ensure current snapshot will not advance beyond existing snapshots - getReplCoord()->setMyLastOptime(time6); + getReplCoord()->setMyLastAppliedOpTime(time6); + getReplCoord()->setMyLastDurableOpTime(time6); ASSERT_EQUALS(time5, getReplCoord()->getCurrentCommittedSnapshotOpTime()); } @@ -3431,7 +3869,8 @@ TEST_F(ReplCoordTest, getReplCoord()->onSnapshotCreate(time2, SnapshotName(2)); getReplCoord()->onSnapshotCreate(time5, SnapshotName(3)); - getReplCoord()->setMyLastOptime(time6); + getReplCoord()->setMyLastAppliedOpTime(time6); + getReplCoord()->setMyLastDurableOpTime(time6); ASSERT_EQUALS(time5, getReplCoord()->getCurrentCommittedSnapshotOpTime()); // ensure current snapshot updates on new snapshot if we are that far @@ -3467,7 +3906,34 @@ TEST_F(ReplCoordTest, ZeroCommittedSnapshotWhenAllSnapshotsAreDropped) { ASSERT_EQUALS(OpTime(), getReplCoord()->getCurrentCommittedSnapshotOpTime()); } -TEST_F(ReplCoordTest, NodeChangesMyLastOpTimeWhenAndOnlyWhenSetMyLastOpTimeReceivesANewerOpTime) { +TEST_F(ReplCoordTest, DoNotAdvanceCommittedSnapshotWhenAppliedOpTimeChanges) { + init("mySet"); + + assertStartSuccess(BSON("_id" + << "mySet" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "test1:1234"))), + HostAndPort("test1", 1234)); + OperationContextReplMock txn; + runSingleNodeElection(getReplCoord()); + + OpTime time1(Timestamp(100, 1), 1); + OpTime time2(Timestamp(100, 2), 1); + + getReplCoord()->onSnapshotCreate(time1, SnapshotName(1)); + + getReplCoord()->setMyLastAppliedOpTime(time1); + ASSERT_EQUALS(OpTime(), getReplCoord()->getCurrentCommittedSnapshotOpTime()); + getReplCoord()->setMyLastAppliedOpTime(time2); + ASSERT_EQUALS(OpTime(), getReplCoord()->getCurrentCommittedSnapshotOpTime()); + getReplCoord()->setMyLastAppliedOpTime(time2); + getReplCoord()->setMyLastDurableOpTime(time2); + ASSERT_EQUALS(time1, getReplCoord()->getCurrentCommittedSnapshotOpTime()); +} + +TEST_F(ReplCoordTest, + NodeChangesMyLastOpTimeWhenAndOnlyWhensetMyLastDurableOpTimeReceivesANewerOpTime) { assertStartSuccess(BSON("_id" << "mySet" << "version" << 2 << "members" << BSON_ARRAY(BSON("host" @@ -3480,12 +3946,13 @@ TEST_F(ReplCoordTest, NodeChangesMyLastOpTimeWhenAndOnlyWhenSetMyLastOpTimeRecei OpTime time2(Timestamp(100, 2), 1); OpTime time3(Timestamp(100, 3), 1); - getReplCoord()->setMyLastOptime(time1); - ASSERT_EQUALS(time1, getReplCoord()->getMyLastOptime()); - getReplCoord()->setMyLastOptimeForward(time3); - ASSERT_EQUALS(time3, getReplCoord()->getMyLastOptime()); - getReplCoord()->setMyLastOptimeForward(time2); - ASSERT_EQUALS(time3, getReplCoord()->getMyLastOptime()); + getReplCoord()->setMyLastAppliedOpTime(time1); + ASSERT_EQUALS(time1, getReplCoord()->getMyLastAppliedOpTime()); + getReplCoord()->setMyLastAppliedOpTimeForward(time3); + ASSERT_EQUALS(time3, getReplCoord()->getMyLastAppliedOpTime()); + getReplCoord()->setMyLastAppliedOpTimeForward(time2); + getReplCoord()->setMyLastDurableOpTimeForward(time2); + ASSERT_EQUALS(time3, getReplCoord()->getMyLastAppliedOpTime()); } TEST_F(ReplCoordTest, OnlyForwardSyncProgressForOtherNodesWhenTheNodesAreBelievedToBeUp) { @@ -3502,8 +3969,10 @@ TEST_F(ReplCoordTest, OnlyForwardSyncProgressForOtherNodesWhenTheNodesAreBelieve << BSON("electionTimeoutMillis" << 2000 << "heartbeatIntervalMillis" << 40000)), HostAndPort("test1", 1234)); OpTime optime(Timestamp(100, 2), 0); - getReplCoord()->setMyLastOptime(optime); - ASSERT_OK(getReplCoord()->setLastOptime_forTest(1, 1, optime)); + getReplCoord()->setMyLastAppliedOpTime(optime); + getReplCoord()->setMyLastDurableOpTime(optime); + ASSERT_OK(getReplCoord()->setLastAppliedOptime_forTest(1, 1, optime)); + ASSERT_OK(getReplCoord()->setLastDurableOptime_forTest(1, 1, optime)); // Check that we have two entries in our UpdatePosition (us and node 1). BSONObjBuilder cmdBuilder; @@ -3514,11 +3983,29 @@ TEST_F(ReplCoordTest, OnlyForwardSyncProgressForOtherNodesWhenTheNodesAreBelieve BSONObj entry = entryElement.Obj(); long long memberId = entry["memberId"].Number(); memberIds.insert(memberId); + OpTime appliedOpTime; + OpTime durableOpTime; + bsonExtractOpTimeField(entry, "appliedOpTime", &appliedOpTime); + ASSERT_EQUALS(optime, appliedOpTime); + bsonExtractOpTimeField(entry, "durableOpTime", &durableOpTime); + ASSERT_EQUALS(optime, durableOpTime); + } + ASSERT_EQUALS(2U, memberIds.size()); + + // Check that this true for old style (pre-3.2.2) UpdatePosition as well. + BSONObjBuilder cmdBuilder2; + getReplCoord()->prepareOldReplSetUpdatePositionCommand(&cmdBuilder2); + BSONObj cmd2 = cmdBuilder2.done(); + std::set<long long> memberIds2; + BSONForEach(entryElement, cmd2["optimes"].Obj()) { + BSONObj entry = entryElement.Obj(); + long long memberId = entry["memberId"].Number(); + memberIds2.insert(memberId); OpTime entryOpTime; bsonExtractOpTimeField(entry, "optime", &entryOpTime); ASSERT_EQUALS(optime, entryOpTime); } - ASSERT_EQUALS(2U, memberIds.size()); + ASSERT_EQUALS(2U, memberIds2.size()); // Advance the clock far enough to cause the other node to be marked as DOWN. const Date_t startDate = getNet()->now(); @@ -3534,19 +4021,37 @@ TEST_F(ReplCoordTest, OnlyForwardSyncProgressForOtherNodesWhenTheNodesAreBelieve // Check there is one entry in our UpdatePosition, since we shouldn't forward for a // DOWN node. - BSONObjBuilder cmdBuilder2; - getReplCoord()->prepareReplSetUpdatePositionCommand(&cmdBuilder2); - BSONObj cmd2 = cmdBuilder2.done(); - std::set<long long> memberIds2; - BSONForEach(entryElement, cmd2["optimes"].Obj()) { + BSONObjBuilder cmdBuilder3; + getReplCoord()->prepareReplSetUpdatePositionCommand(&cmdBuilder3); + BSONObj cmd3 = cmdBuilder3.done(); + std::set<long long> memberIds3; + BSONForEach(entryElement, cmd3["optimes"].Obj()) { BSONObj entry = entryElement.Obj(); long long memberId = entry["memberId"].Number(); - memberIds2.insert(memberId); + memberIds3.insert(memberId); + OpTime appliedOpTime; + OpTime durableOpTime; + bsonExtractOpTimeField(entry, "appliedOpTime", &appliedOpTime); + ASSERT_EQUALS(optime, appliedOpTime); + bsonExtractOpTimeField(entry, "durableOpTime", &durableOpTime); + ASSERT_EQUALS(optime, durableOpTime); + } + ASSERT_EQUALS(1U, memberIds3.size()); + + // Check that this true for old style (pre-3.2.2) UpdatePosition as well. + BSONObjBuilder cmdBuilder4; + getReplCoord()->prepareOldReplSetUpdatePositionCommand(&cmdBuilder4); + BSONObj cmd4 = cmdBuilder4.done(); + std::set<long long> memberIds4; + BSONForEach(entryElement, cmd4["optimes"].Obj()) { + BSONObj entry = entryElement.Obj(); + long long memberId = entry["memberId"].Number(); + memberIds4.insert(memberId); OpTime entryOpTime; bsonExtractOpTimeField(entry, "optime", &entryOpTime); ASSERT_EQUALS(optime, entryOpTime); } - ASSERT_EQUALS(1U, memberIds2.size()); + ASSERT_EQUALS(1U, memberIds4.size()); } TEST_F(ReplCoordTest, StepDownWhenHandleLivenessTimeoutMarksAMajorityOfVotingNodesDown) { @@ -3571,10 +4076,11 @@ TEST_F(ReplCoordTest, StepDownWhenHandleLivenessTimeoutMarksAMajorityOfVotingNod HostAndPort("node1", 12345)); ASSERT(getReplCoord()->setFollowerMode(MemberState::RS_SECONDARY)); OpTime startingOpTime = OpTime(Timestamp(100, 1), 0); - getReplCoord()->setMyLastOptime(startingOpTime); + getReplCoord()->setMyLastAppliedOpTime(startingOpTime); + getReplCoord()->setMyLastDurableOpTime(startingOpTime); // Receive notification that every node is up. - UpdatePositionArgs args; + OldUpdatePositionArgs args; ASSERT_OK(args.initialize( BSON("replSetUpdatePosition" << 1 << "optimes" << BSON_ARRAY(BSON("cfgver" << 2 << "memberId" << 1 << "optime" @@ -3591,7 +4097,7 @@ TEST_F(ReplCoordTest, StepDownWhenHandleLivenessTimeoutMarksAMajorityOfVotingNod simulateSuccessfulV1Election(); // Keep two nodes alive. - UpdatePositionArgs args1; + OldUpdatePositionArgs args1; ASSERT_OK(args1.initialize( BSON("replSetUpdatePosition" << 1 << "optimes" << BSON_ARRAY(BSON("cfgver" << 2 << "memberId" << 1 << "optime" @@ -3608,7 +4114,7 @@ TEST_F(ReplCoordTest, StepDownWhenHandleLivenessTimeoutMarksAMajorityOfVotingNod ASSERT_EQUALS(MemberState::RS_PRIMARY, getReplCoord()->getMemberState().s); // Keep one node alive via two methods (UpdatePosition and requestHeartbeat). - UpdatePositionArgs args2; + OldUpdatePositionArgs args2; ASSERT_OK(args2.initialize( BSON("replSetUpdatePosition" << 1 << "optimes" << BSON_ARRAY(BSON("cfgver" << 2 << "memberId" << 1 << "optime" @@ -3654,7 +4160,8 @@ TEST_F(ReplCoordTest, WaitForMemberState) { HostAndPort("test1", 1234)); auto replCoord = getReplCoord(); auto initialTerm = replCoord->getTerm(); - replCoord->setMyLastOptime(OpTime(Timestamp(1, 0), 0)); + replCoord->setMyLastAppliedOpTime(OpTime(Timestamp(1, 0), 0)); + replCoord->setMyLastDurableOpTime(OpTime(Timestamp(1, 0), 0)); ASSERT_TRUE(replCoord->setFollowerMode(MemberState::RS_SECONDARY)); // Successful dry run election increases term. @@ -3688,7 +4195,8 @@ TEST_F(ReplCoordTest, WaitForDrainFinish) { HostAndPort("test1", 1234)); auto replCoord = getReplCoord(); auto initialTerm = replCoord->getTerm(); - replCoord->setMyLastOptime(OpTime(Timestamp(1, 0), 0)); + replCoord->setMyLastAppliedOpTime(OpTime(Timestamp(1, 0), 0)); + replCoord->setMyLastDurableOpTime(OpTime(Timestamp(1, 0), 0)); ASSERT_TRUE(replCoord->setFollowerMode(MemberState::RS_SECONDARY)); // Successful dry run election increases term. @@ -3713,8 +4221,168 @@ TEST_F(ReplCoordTest, WaitForDrainFinish) { ASSERT_OK(replCoord->waitForDrainFinish(Milliseconds(0))); } -// TODO(schwerin): Unit test election id updating +TEST_F(ReplCoordTest, UpdatePositionArgsReturnsNoSuchKeyWhenParsingOldUpdatePositionArgs) { + OldUpdatePositionArgs args; + UpdatePositionArgs args2; + OpTime opTime = OpTime(Timestamp(100, 1), 0); + ASSERT_EQUALS( + ErrorCodes::NoSuchKey, + args2.initialize(BSON( + "replSetUpdatePosition" + << 1 << "optimes" + << BSON_ARRAY( + BSON("cfgver" << 2 << "memberId" << 1 << "optime" << opTime.getTimestamp()) + << BSON("cfgver" << 2 << "memberId" << 2 << "optime" << opTime.getTimestamp()) + << BSON("cfgver" << 2 << "memberId" << 3 << "optime" << opTime.getTimestamp()) + << BSON("cfgver" << 2 << "memberId" << 4 << "optime" + << opTime.getTimestamp()))))); + + ASSERT_OK(args.initialize( + BSON("replSetUpdatePosition" + << 1 << "optimes" + << BSON_ARRAY( + BSON("cfgver" << 2 << "memberId" << 1 << "optime" << opTime.getTimestamp()) + << BSON("cfgver" << 2 << "memberId" << 2 << "optime" << opTime.getTimestamp()) + << BSON("cfgver" << 2 << "memberId" << 3 << "optime" << opTime.getTimestamp()) + << BSON("cfgver" << 2 << "memberId" << 4 << "optime" + << opTime.getTimestamp()))))); +} + + +TEST_F(ReplCoordTest, OldUpdatePositionArgsReturnsBadValueWhenParsingUpdatePositionArgs) { + OldUpdatePositionArgs args; + UpdatePositionArgs args2; + OpTime opTime = OpTime(Timestamp(100, 1), 0); + ASSERT_EQUALS( + ErrorCodes::BadValue, + args.initialize(BSON( + "replSetUpdatePosition" + << 1 << "optimes" + << BSON_ARRAY(BSON("cfgver" << 2 << "memberId" << 1 << "durableOpTime" + << BSON("ts" << opTime.getTimestamp() << "t" << 3) + << "appliedOpTime" + << BSON("ts" << opTime.getTimestamp() << "t" << 3)) + << BSON("cfgver" << 2 << "memberId" << 2 << "durableOpTime" + << BSON("ts" << opTime.getTimestamp() << "t" << 3) + << "appliedOpTime" + << BSON("ts" << opTime.getTimestamp() << "t" << 3)) + << BSON("cfgver" << 2 << "memberId" << 3 << "durableOpTime" + << BSON("ts" << opTime.getTimestamp() << "t" << 3) + << "appliedOpTime" + << BSON("ts" << opTime.getTimestamp() << "t" << 3)) + << BSON("cfgver" << 2 << "memberId" << 4 << "durableOpTime" + << BSON("ts" << opTime.getTimestamp() << "t" << 3) + << "appliedOpTime" + << BSON("ts" << opTime.getTimestamp() << "t" << 3)))))); + ASSERT_OK(args2.initialize( + BSON("replSetUpdatePosition" + << 1 << "optimes" + << BSON_ARRAY(BSON("cfgver" << 2 << "memberId" << 1 << "durableOpTime" + << BSON("ts" << opTime.getTimestamp() << "t" << 3) + << "appliedOpTime" + << BSON("ts" << opTime.getTimestamp() << "t" << 3)) + << BSON("cfgver" << 2 << "memberId" << 2 << "durableOpTime" + << BSON("ts" << opTime.getTimestamp() << "t" << 3) + << "appliedOpTime" + << BSON("ts" << opTime.getTimestamp() << "t" << 3)) + << BSON("cfgver" << 2 << "memberId" << 3 << "durableOpTime" + << BSON("ts" << opTime.getTimestamp() << "t" << 3) + << "appliedOpTime" + << BSON("ts" << opTime.getTimestamp() << "t" << 3)) + << BSON("cfgver" << 2 << "memberId" << 4 << "durableOpTime" + << BSON("ts" << opTime.getTimestamp() << "t" << 3) + << "appliedOpTime" + << BSON("ts" << opTime.getTimestamp() << "t" << 3)))))); +} + +TEST_F( + ReplCoordTest, + PopulateUnsetWriteConcernOptionsSyncModeReturnsInputWithSyncModeNoneIfUnsetAndWriteConcernMajorityJournalDefaultIsFalse) { + init("mySet"); + + assertStartSuccess(BSON("_id" + << "mySet" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "test1:1234")) + << "writeConcernMajorityJournalDefault" << false), + HostAndPort("test1", 1234)); + + WriteConcernOptions wc; + wc.wMode = WriteConcernOptions::kMajority; + wc.syncMode = WriteConcernOptions::SyncMode::UNSET; + ASSERT(WriteConcernOptions::SyncMode::NONE == + getReplCoord()->populateUnsetWriteConcernOptionsSyncMode(wc).syncMode); +} + +TEST_F( + ReplCoordTest, + PopulateUnsetWriteConcernOptionsSyncModeReturnsInputWithSyncModeJournalIfUnsetAndWriteConcernMajorityJournalDefaultIsTrue) { + init("mySet"); + assertStartSuccess(BSON("_id" + << "mySet" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "test1:1234")) + << "writeConcernMajorityJournalDefault" << true), + HostAndPort("test1", 1234)); + + WriteConcernOptions wc; + wc.wMode = WriteConcernOptions::kMajority; + wc.syncMode = WriteConcernOptions::SyncMode::UNSET; + ASSERT(WriteConcernOptions::SyncMode::JOURNAL == + getReplCoord()->populateUnsetWriteConcernOptionsSyncMode(wc).syncMode); +} + +TEST_F(ReplCoordTest, PopulateUnsetWriteConcernOptionsSyncModeReturnsInputIfSyncModeIsNotUnset) { + init("mySet"); + + assertStartSuccess(BSON("_id" + << "mySet" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "test1:1234")) + << "writeConcernMajorityJournalDefault" << false), + HostAndPort("test1", 1234)); + + WriteConcernOptions wc; + wc.wMode = WriteConcernOptions::kMajority; + ASSERT(WriteConcernOptions::SyncMode::NONE == + getReplCoord()->populateUnsetWriteConcernOptionsSyncMode(wc).syncMode); + + wc.syncMode = WriteConcernOptions::SyncMode::JOURNAL; + ASSERT(WriteConcernOptions::SyncMode::JOURNAL == + getReplCoord()->populateUnsetWriteConcernOptionsSyncMode(wc).syncMode); + + wc.syncMode = WriteConcernOptions::SyncMode::FSYNC; + ASSERT(WriteConcernOptions::SyncMode::FSYNC == + getReplCoord()->populateUnsetWriteConcernOptionsSyncMode(wc).syncMode); +} + +TEST_F(ReplCoordTest, PopulateUnsetWriteConcernOptionsSyncModeReturnsInputIfWModeIsNotMajority) { + init("mySet"); + + assertStartSuccess(BSON("_id" + << "mySet" + << "version" << 1 << "members" + << BSON_ARRAY(BSON("_id" << 0 << "host" + << "test1:1234")) + << "writeConcernMajorityJournalDefault" << false), + HostAndPort("test1", 1234)); + + WriteConcernOptions wc; + wc.syncMode = WriteConcernOptions::SyncMode::UNSET; + wc.wMode = "not the value of kMajority"; + ASSERT(WriteConcernOptions::SyncMode::NONE == + getReplCoord()->populateUnsetWriteConcernOptionsSyncMode(wc).syncMode); + + wc.wMode = "like literally anythingelse"; + ASSERT(WriteConcernOptions::SyncMode::NONE == + getReplCoord()->populateUnsetWriteConcernOptionsSyncMode(wc).syncMode); +} + +// TODO(schwerin): Unit test election id updating } // namespace } // namespace repl } // namespace mongo diff --git a/src/mongo/db/repl/replication_coordinator_mock.cpp b/src/mongo/db/repl/replication_coordinator_mock.cpp index acacb6c9584..2ceb947cb8e 100644 --- a/src/mongo/db/repl/replication_coordinator_mock.cpp +++ b/src/mongo/db/repl/replication_coordinator_mock.cpp @@ -148,22 +148,36 @@ void ReplicationCoordinatorMock::setMyHeartbeatMessage(const std::string& msg) { // TODO } -void ReplicationCoordinatorMock::setMyLastOptime(const OpTime& opTime) { - _myLastOpTime = opTime; +void ReplicationCoordinatorMock::setMyLastAppliedOpTime(const OpTime& opTime) { + _myLastAppliedOpTime = opTime; } -void ReplicationCoordinatorMock::setMyLastOptimeForward(const OpTime& opTime) { - if (opTime > _myLastOpTime) { - _myLastOpTime = opTime; +void ReplicationCoordinatorMock::setMyLastDurableOpTime(const OpTime& opTime) { + _myLastDurableOpTime = opTime; +} + +void ReplicationCoordinatorMock::setMyLastAppliedOpTimeForward(const OpTime& opTime) { + if (opTime > _myLastAppliedOpTime) { + _myLastAppliedOpTime = opTime; + } +} + +void ReplicationCoordinatorMock::setMyLastDurableOpTimeForward(const OpTime& opTime) { + if (opTime > _myLastDurableOpTime) { + _myLastDurableOpTime = opTime; } } -void ReplicationCoordinatorMock::resetMyLastOptime() { - _myLastOpTime = OpTime(); +void ReplicationCoordinatorMock::resetMyLastOpTimes() { + _myLastDurableOpTime = OpTime(); } -OpTime ReplicationCoordinatorMock::getMyLastOptime() const { - return _myLastOpTime; +OpTime ReplicationCoordinatorMock::getMyLastAppliedOpTime() const { + return _myLastAppliedOpTime; +} + +OpTime ReplicationCoordinatorMock::getMyLastDurableOpTime() const { + return _myLastDurableOpTime; } ReadConcernResponse ReplicationCoordinatorMock::waitUntilOpTime(OperationContext* txn, @@ -203,6 +217,12 @@ Status ReplicationCoordinatorMock::waitForDrainFinish(Milliseconds timeout) { void ReplicationCoordinatorMock::signalUpstreamUpdater() {} +bool ReplicationCoordinatorMock::prepareOldReplSetUpdatePositionCommand( + BSONObjBuilder* cmdBuilder) { + cmdBuilder->append("replSetUpdatePosition", 1); + return true; +} + bool ReplicationCoordinatorMock::prepareReplSetUpdatePositionCommand(BSONObjBuilder* cmdBuilder) { cmdBuilder->append("replSetUpdatePosition", 1); return true; @@ -284,6 +304,12 @@ Status ReplicationCoordinatorMock::processReplSetElect(const ReplSetElectArgs& a return Status::OK(); } +Status ReplicationCoordinatorMock::processReplSetUpdatePosition( + const OldUpdatePositionArgs& updates, long long* configVersion) { + // TODO + return Status::OK(); +} + Status ReplicationCoordinatorMock::processReplSetUpdatePosition(const UpdatePositionArgs& updates, long long* configVersion) { // TODO @@ -300,7 +326,8 @@ bool ReplicationCoordinatorMock::buildsIndexes() { return true; } -std::vector<HostAndPort> ReplicationCoordinatorMock::getHostsWrittenTo(const OpTime& op) { +std::vector<HostAndPort> ReplicationCoordinatorMock::getHostsWrittenTo(const OpTime& op, + bool durablyWritten) { return std::vector<HostAndPort>(); } @@ -328,7 +355,7 @@ HostAndPort ReplicationCoordinatorMock::chooseNewSyncSource(const Timestamp& las void ReplicationCoordinatorMock::blacklistSyncSource(const HostAndPort& host, Date_t until) {} -void ReplicationCoordinatorMock::resetLastOpTimeFromOplog(OperationContext* txn) { +void ReplicationCoordinatorMock::resetLastOpTimesFromOplog(OperationContext* txn) { invariant(false); } @@ -367,6 +394,10 @@ bool ReplicationCoordinatorMock::isV1ElectionProtocol() { return true; } +bool ReplicationCoordinatorMock::getWriteConcernMajorityShouldJournal() { + return true; +} + void ReplicationCoordinatorMock::summarizeAsHtml(ReplSetHtmlSummary* output) {} long long ReplicationCoordinatorMock::getTerm() { @@ -398,5 +429,13 @@ size_t ReplicationCoordinatorMock::getNumUncommittedSnapshots() { return 0; } +WriteConcernOptions ReplicationCoordinatorMock::populateUnsetWriteConcernOptionsSyncMode( + WriteConcernOptions wc) { + if (wc.syncMode == WriteConcernOptions::SyncMode::UNSET) { + wc.syncMode = WriteConcernOptions::SyncMode::JOURNAL; + } + return wc; +} + } // namespace repl } // namespace mongo diff --git a/src/mongo/db/repl/replication_coordinator_mock.h b/src/mongo/db/repl/replication_coordinator_mock.h index 60f1b1b23c4..4a21d9ad705 100644 --- a/src/mongo/db/repl/replication_coordinator_mock.h +++ b/src/mongo/db/repl/replication_coordinator_mock.h @@ -103,15 +103,18 @@ public: virtual Status setLastOptimeForSlave(const OID& rid, const Timestamp& ts); - virtual void setMyLastOptime(const OpTime& opTime); + virtual void setMyLastAppliedOpTime(const OpTime& opTime); + virtual void setMyLastDurableOpTime(const OpTime& opTime); - virtual void setMyLastOptimeForward(const OpTime& opTime); + virtual void setMyLastAppliedOpTimeForward(const OpTime& opTime); + virtual void setMyLastDurableOpTimeForward(const OpTime& opTime); - virtual void resetMyLastOptime(); + virtual void resetMyLastOpTimes(); virtual void setMyHeartbeatMessage(const std::string& msg); - virtual OpTime getMyLastOptime() const; + virtual OpTime getMyLastAppliedOpTime() const; + virtual OpTime getMyLastDurableOpTime() const; virtual ReadConcernResponse waitUntilOpTime(OperationContext* txn, const ReadConcernArgs& settings) override; @@ -132,6 +135,7 @@ public: virtual void signalUpstreamUpdater(); + virtual bool prepareOldReplSetUpdatePositionCommand(BSONObjBuilder* cmdBuilder); virtual bool prepareReplSetUpdatePositionCommand(BSONObjBuilder* cmdBuilder); virtual Status processReplSetGetStatus(BSONObjBuilder* result); @@ -177,6 +181,8 @@ public: virtual Status processReplSetElect(const ReplSetElectArgs& args, BSONObjBuilder* resultObj); + virtual Status processReplSetUpdatePosition(const OldUpdatePositionArgs& updates, + long long* configVersion); virtual Status processReplSetUpdatePosition(const UpdatePositionArgs& updates, long long* configVersion); @@ -184,7 +190,7 @@ public: virtual bool buildsIndexes(); - virtual std::vector<HostAndPort> getHostsWrittenTo(const OpTime& op); + virtual std::vector<HostAndPort> getHostsWrittenTo(const OpTime& op, bool durablyWritten); virtual std::vector<HostAndPort> getOtherNodesInReplSet() const; @@ -196,7 +202,7 @@ public: virtual void blacklistSyncSource(const HostAndPort& host, Date_t until); - virtual void resetLastOpTimeFromOplog(OperationContext* txn); + virtual void resetLastOpTimesFromOplog(OperationContext* txn); virtual bool shouldChangeSyncSource(const HostAndPort& currentSource, const OpTime& syncSourceLastOpTime, @@ -220,6 +226,8 @@ public: virtual bool isV1ElectionProtocol(); + virtual bool getWriteConcernMajorityShouldJournal(); + virtual void summarizeAsHtml(ReplSetHtmlSummary* output); virtual long long getTerm(); @@ -241,11 +249,15 @@ public: virtual size_t getNumUncommittedSnapshots() override; + virtual WriteConcernOptions populateUnsetWriteConcernOptionsSyncMode( + WriteConcernOptions wc) override; + private: AtomicUInt64 _snapshotNameGenerator; const ReplSettings _settings; MemberState _memberState; - OpTime _myLastOpTime; + OpTime _myLastDurableOpTime; + OpTime _myLastAppliedOpTime; }; } // namespace repl diff --git a/src/mongo/db/repl/replication_coordinator_test_fixture.cpp b/src/mongo/db/repl/replication_coordinator_test_fixture.cpp index ed30ceacde8..cb45baf086f 100644 --- a/src/mongo/db/repl/replication_coordinator_test_fixture.cpp +++ b/src/mongo/db/repl/replication_coordinator_test_fixture.cpp @@ -113,12 +113,13 @@ void ReplCoordTest::init() { TopologyCoordinatorImpl::Options settings; _topo = new TopologyCoordinatorImpl(settings); + stdx::function<bool()> _durablityLambda = []() -> bool { return true; }; _net = new NetworkInterfaceMock; _storage = new StorageInterfaceMock; _replExec.reset(new ReplicationExecutor(_net, _storage, seed)); _externalState = new ReplicationCoordinatorExternalStateMock; - _repl.reset( - new ReplicationCoordinatorImpl(_settings, _externalState, _topo, _replExec.get(), seed)); + _repl.reset(new ReplicationCoordinatorImpl( + _settings, _externalState, _topo, _replExec.get(), seed, &_durablityLambda)); } void ReplCoordTest::init(const ReplSettings& settings) { diff --git a/src/mongo/db/repl/replication_info.cpp b/src/mongo/db/repl/replication_info.cpp index 273deaa3bd9..50e84803c6f 100644 --- a/src/mongo/db/repl/replication_info.cpp +++ b/src/mongo/db/repl/replication_info.cpp @@ -178,7 +178,7 @@ public: BSONObjBuilder result; // TODO(siyuan) Output term of OpTime - result.append("latestOptime", replCoord->getMyLastOptime().getTimestamp()); + result.append("latestOptime", replCoord->getMyLastAppliedOpTime().getTimestamp()); const std::string& oplogNS = replCoord->getReplicationMode() == ReplicationCoordinator::modeReplSet diff --git a/src/mongo/db/repl/replset_commands.cpp b/src/mongo/db/repl/replset_commands.cpp index 07f247fdc98..ee0aa50a6f0 100644 --- a/src/mongo/db/repl/replset_commands.cpp +++ b/src/mongo/db/repl/replset_commands.cpp @@ -45,6 +45,7 @@ #include "mongo/db/lasterror.h" #include "mongo/db/op_observer.h" #include "mongo/db/repl/initial_sync.h" +#include "mongo/db/repl/old_update_position_args.h" #include "mongo/db/repl/oplog.h" #include "mongo/db/repl/repl_set_heartbeat_args_v1.h" #include "mongo/db/repl/repl_set_heartbeat_args.h" @@ -669,25 +670,43 @@ public: // accept and ignore handshakes sent from old (3.0-series) nodes without erroring to // enable mixed-version operation, since we no longer use the handshakes - if (cmdObj.hasField("handshake")) { + if (cmdObj.hasField("handshake")) return true; - } + + // In the case of an update from a member with an invalid replica set config, + // we return our current config version. + long long configVersion = -1; UpdatePositionArgs args; + status = args.initialize(cmdObj); - if (!status.isOK()) + if (status.isOK()) { + // v3.2.2+ style replSetUpdatePosition command. + status = getGlobalReplicationCoordinator()->processReplSetUpdatePosition( + args, &configVersion); + + if (status == ErrorCodes::InvalidReplicaSetConfig) { + result.append("configVersion", configVersion); + } return appendCommandStatus(result, status); + } else if (status == ErrorCodes::NoSuchKey) { + // Pre-3.2.2 style replSetUpdatePosition command. + OldUpdatePositionArgs oldArgs; + status = oldArgs.initialize(cmdObj); + if (!status.isOK()) + return appendCommandStatus(result, status); - // in the case of an update from a member with an invalid replica set config, - // we return our current config version - long long configVersion = -1; - status = - getGlobalReplicationCoordinator()->processReplSetUpdatePosition(args, &configVersion); + status = getGlobalReplicationCoordinator()->processReplSetUpdatePosition( + oldArgs, &configVersion); - if (status == ErrorCodes::InvalidReplicaSetConfig) { - result.append("configVersion", configVersion); + if (status == ErrorCodes::InvalidReplicaSetConfig) { + result.append("configVersion", configVersion); + } + return appendCommandStatus(result, status); + } else { + // Parsing error from UpdatePositionArgs. + return appendCommandStatus(result, status); } - return appendCommandStatus(result, status); } } cmdReplSetUpdatePosition; diff --git a/src/mongo/db/repl/reporter.cpp b/src/mongo/db/repl/reporter.cpp index fbbc4d64d33..3a6cfb81e82 100644 --- a/src/mongo/db/repl/reporter.cpp +++ b/src/mongo/db/repl/reporter.cpp @@ -41,10 +41,10 @@ namespace repl { using executor::RemoteCommandRequest; Reporter::Reporter(ReplicationExecutor* executor, - PrepareReplSetUpdatePositionCommandFn prepareReplSetUpdatePositionCommandFn, + PrepareReplSetUpdatePositionCommandFn prepareOldReplSetUpdatePositionCommandFn, const HostAndPort& target) : _executor(executor), - _prepareReplSetUpdatePositionCommandFn(prepareReplSetUpdatePositionCommandFn), + _prepareOldReplSetUpdatePositionCommandFn(prepareOldReplSetUpdatePositionCommandFn), _target(target), _status(Status::OK()), _willRunAgain(false), @@ -52,7 +52,7 @@ Reporter::Reporter(ReplicationExecutor* executor, uassert(ErrorCodes::BadValue, "null replication executor", executor); uassert(ErrorCodes::BadValue, "null function to create replSetUpdatePosition command object", - prepareReplSetUpdatePositionCommandFn); + prepareOldReplSetUpdatePositionCommandFn); uassert(ErrorCodes::BadValue, "target name cannot be empty", !target.empty()); } @@ -105,11 +105,11 @@ Status Reporter::_schedule_inlock() { LOG(2) << "Reporter scheduling report to : " << _target; - auto prepareResult = _prepareReplSetUpdatePositionCommandFn(); + auto prepareResult = _prepareOldReplSetUpdatePositionCommandFn(); if (!prepareResult.isOK()) { // Returning NodeNotFound because currently this is the only way - // prepareReplSetUpdatePositionCommand() can fail in production. + // prepareOldReplSetUpdatePositionCommand() can fail in production. return Status(ErrorCodes::NodeNotFound, "Reporter failed to create replSetUpdatePositionCommand command."); } diff --git a/src/mongo/db/repl/reporter.h b/src/mongo/db/repl/reporter.h index 585539454c1..5fba25e5ba0 100644 --- a/src/mongo/db/repl/reporter.h +++ b/src/mongo/db/repl/reporter.h @@ -50,7 +50,7 @@ public: using PrepareReplSetUpdatePositionCommandFn = stdx::function<StatusWith<BSONObj>()>; Reporter(ReplicationExecutor* executor, - PrepareReplSetUpdatePositionCommandFn prepareReplSetUpdatePositionCommandFn, + PrepareReplSetUpdatePositionCommandFn prepareOldReplSetUpdatePositionCommandFn, const HostAndPort& target); virtual ~Reporter(); @@ -105,7 +105,7 @@ private: ReplicationExecutor* _executor; // Prepares update command object. - PrepareReplSetUpdatePositionCommandFn _prepareReplSetUpdatePositionCommandFn; + PrepareReplSetUpdatePositionCommandFn _prepareOldReplSetUpdatePositionCommandFn; // Host to whom the Reporter sends updates. HostAndPort _target; diff --git a/src/mongo/db/repl/reporter_test.cpp b/src/mongo/db/repl/reporter_test.cpp index c5533d7adb2..01904e81c92 100644 --- a/src/mongo/db/repl/reporter_test.cpp +++ b/src/mongo/db/repl/reporter_test.cpp @@ -52,7 +52,7 @@ public: _result = newResult; } - bool prepareReplSetUpdatePositionCommand(BSONObjBuilder* cmdBuilder) { + bool prepareOldReplSetUpdatePositionCommand(BSONObjBuilder* cmdBuilder) { if (!_result) { return _result; } @@ -85,7 +85,7 @@ protected: std::unique_ptr<Reporter> reporter; std::unique_ptr<MockProgressManager> posUpdater; - Reporter::PrepareReplSetUpdatePositionCommandFn prepareReplSetUpdatePositionCommandFn; + Reporter::PrepareReplSetUpdatePositionCommandFn prepareOldReplSetUpdatePositionCommandFn; }; ReporterTest::ReporterTest() {} @@ -93,16 +93,16 @@ ReporterTest::ReporterTest() {} void ReporterTest::setUp() { ReplicationExecutorTest::setUp(); posUpdater.reset(new MockProgressManager()); - prepareReplSetUpdatePositionCommandFn = [this]() -> StatusWith<BSONObj> { + prepareOldReplSetUpdatePositionCommandFn = [this]() -> StatusWith<BSONObj> { BSONObjBuilder bob; - if (posUpdater->prepareReplSetUpdatePositionCommand(&bob)) { + if (posUpdater->prepareOldReplSetUpdatePositionCommand(&bob)) { return bob.obj(); } return Status(ErrorCodes::OperationFailed, "unable to prepare replSetUpdatePosition command object"); }; reporter.reset(new Reporter(&getReplExecutor(), - [this]() { return prepareReplSetUpdatePositionCommandFn(); }, + [this]() { return prepareOldReplSetUpdatePositionCommandFn(); }, HostAndPort("h1"))); launchExecutorThread(); } @@ -138,12 +138,12 @@ TEST_F(ReporterTest, InvalidConstruction) { UserException); // null ReplicationExecutor - ASSERT_THROWS(Reporter(nullptr, prepareReplSetUpdatePositionCommandFn, HostAndPort("h1")), + ASSERT_THROWS(Reporter(nullptr, prepareOldReplSetUpdatePositionCommandFn, HostAndPort("h1")), UserException); // empty HostAndPort ASSERT_THROWS( - Reporter(&getReplExecutor(), prepareReplSetUpdatePositionCommandFn, HostAndPort()), + Reporter(&getReplExecutor(), prepareOldReplSetUpdatePositionCommandFn, HostAndPort()), UserException); } diff --git a/src/mongo/db/repl/rs_initialsync.cpp b/src/mongo/db/repl/rs_initialsync.cpp index 569480452d1..3d62914b081 100644 --- a/src/mongo/db/repl/rs_initialsync.cpp +++ b/src/mongo/db/repl/rs_initialsync.cpp @@ -88,11 +88,11 @@ void truncateAndResetOplog(OperationContext* txn, // Note: the following order is important. // The bgsync thread uses an empty optime as a sentinel to know to wait // for initial sync; thus, we must - // ensure the lastAppliedOptime is empty before restarting the bgsync thread + // ensure the lastAppliedOpTime is empty before restarting the bgsync thread // via stop(). // We must clear the sync source blacklist after calling stop() // because the bgsync thread, while running, may update the blacklist. - replCoord->resetMyLastOptime(); + replCoord->resetMyLastOpTimes(); bgsync->stop(); bgsync->clearBuffer(); @@ -214,7 +214,7 @@ bool _initialSyncClone(OperationContext* txn, * @return if applying the oplog succeeded. */ bool _initialSyncApplyOplog(OperationContext* ctx, repl::InitialSync* syncer, OplogReader* r) { - const OpTime startOpTime = getGlobalReplicationCoordinator()->getMyLastOptime(); + const OpTime startOpTime = getGlobalReplicationCoordinator()->getMyLastAppliedOpTime(); BSONObj lastOp; // If the fail point is set, exit failing. @@ -370,7 +370,7 @@ Status _initialSync() { // prime oplog, but don't need to actually apply the op as the cloned data already reflects it. OpTime lastOptime = writeOpsToOplog(&txn, {lastOp}); ReplClientInfo::forClient(txn.getClient()).setLastOp(lastOptime); - replCoord->setMyLastOptime(lastOptime); + replCoord->setMyLastAppliedOpTime(lastOptime); setNewTimestamp(lastOptime.getTimestamp()); std::string msg = "oplog sync 1 of 3"; @@ -425,7 +425,7 @@ Status _initialSync() { { ScopedTransaction scopedXact(&txn, MODE_IX); AutoGetDb autodb(&txn, "local", MODE_X); - OpTime lastOpTimeWritten(getGlobalReplicationCoordinator()->getMyLastOptime()); + OpTime lastOpTimeWritten(getGlobalReplicationCoordinator()->getMyLastAppliedOpTime()); log() << "set minValid=" << lastOpTimeWritten; // Initial sync is now complete. Flag this by setting minValid to the last thing diff --git a/src/mongo/db/repl/rs_rollback.cpp b/src/mongo/db/repl/rs_rollback.cpp index a9073f49524..076507b4106 100644 --- a/src/mongo/db/repl/rs_rollback.cpp +++ b/src/mongo/db/repl/rs_rollback.cpp @@ -791,9 +791,9 @@ void syncFixUp(OperationContext* txn, warn = true; } - // Reload the lastOpTimeApplied value in the replcoord and the lastAppliedHash value in - // bgsync to reflect our new last op. - replCoord->resetLastOpTimeFromOplog(txn); + // Reload the lastAppliedOpTime and lastDurableOpTime value in the replcoord and the + // lastAppliedHash value in bgsync to reflect our new last op. + replCoord->resetLastOpTimesFromOplog(txn); // done if (warn) diff --git a/src/mongo/db/repl/rs_rollback_test.cpp b/src/mongo/db/repl/rs_rollback_test.cpp index 7320cf98365..835389c3494 100644 --- a/src/mongo/db/repl/rs_rollback_test.cpp +++ b/src/mongo/db/repl/rs_rollback_test.cpp @@ -76,13 +76,13 @@ ReplSettings createReplSettings() { class ReplicationCoordinatorRollbackMock : public ReplicationCoordinatorMock { public: ReplicationCoordinatorRollbackMock(); - void resetLastOpTimeFromOplog(OperationContext* txn) override; + void resetLastOpTimesFromOplog(OperationContext* txn) override; }; ReplicationCoordinatorRollbackMock::ReplicationCoordinatorRollbackMock() : ReplicationCoordinatorMock(createReplSettings()) {} -void ReplicationCoordinatorRollbackMock::resetLastOpTimeFromOplog(OperationContext* txn) {} +void ReplicationCoordinatorRollbackMock::resetLastOpTimesFromOplog(OperationContext* txn) {} class RollbackSourceMock : public RollbackSource { public: diff --git a/src/mongo/db/repl/rs_sync.cpp b/src/mongo/db/repl/rs_sync.cpp index 83dd0131567..9e5c0e7e344 100644 --- a/src/mongo/db/repl/rs_sync.cpp +++ b/src/mongo/db/repl/rs_sync.cpp @@ -107,7 +107,7 @@ void runSyncThread() { // 1. If the oplog is empty, do an initial sync // 2. If minValid has _initialSyncFlag set, do an initial sync // 3. If initialSyncRequested is true - if (getGlobalReplicationCoordinator()->getMyLastOptime().isNull() || + if (getGlobalReplicationCoordinator()->getMyLastAppliedOpTime().isNull() || getInitialSyncFlag() || initialSyncRequested) { syncDoInitialSync(); continue; // start from top again in case sync failed. diff --git a/src/mongo/db/repl/sync_source_feedback.cpp b/src/mongo/db/repl/sync_source_feedback.cpp index 7c08a04be27..43b84f45e38 100644 --- a/src/mongo/db/repl/sync_source_feedback.cpp +++ b/src/mongo/db/repl/sync_source_feedback.cpp @@ -60,6 +60,7 @@ namespace repl { void SyncSourceFeedback::_resetConnection() { LOG(1) << "resetting connection in sync source feedback"; _connection.reset(); + _fallBackToOldUpdatePosition = false; } bool SyncSourceFeedback::replAuthenticate() { @@ -105,18 +106,24 @@ void SyncSourceFeedback::forwardSlaveProgress() { _cond.notify_all(); } -Status SyncSourceFeedback::updateUpstream(OperationContext* txn) { +Status SyncSourceFeedback::updateUpstream(OperationContext* txn, bool oldStyle) { auto replCoord = repl::ReplicationCoordinator::get(txn); if (replCoord->getMemberState().primary()) { - // primary has no one to update to + // Primary has no one to send updates to. return Status::OK(); } BSONObjBuilder cmd; { stdx::unique_lock<stdx::mutex> lock(_mtx); - // the command could not be created, likely because the node was removed from the set - if (!replCoord->prepareReplSetUpdatePositionCommand(&cmd)) { - return Status::OK(); + // The command could not be created, likely because this node was removed from the set. + if (!oldStyle) { + if (!replCoord->prepareReplSetUpdatePositionCommand(&cmd)) { + return Status::OK(); + } + } else { + if (!replCoord->prepareOldReplSetUpdatePositionCommand(&cmd)) { + return Status::OK(); + } } } BSONObj res; @@ -125,8 +132,9 @@ Status SyncSourceFeedback::updateUpstream(OperationContext* txn) { try { _connection->runCommand("admin", cmd.obj(), res); } catch (const DBException& e) { - log() << "SyncSourceFeedback error sending update: " << e.what() << endl; - // blacklist sync target for .5 seconds and find a new one + log() << "SyncSourceFeedback error sending " << (oldStyle ? "old style " : "") + << "update: " << e.what(); + // Blacklist sync target for .5 seconds and find a new one. replCoord->blacklistSyncSource(_syncTarget, Date_t::now() + Milliseconds(500)); BackgroundSync::get()->clearSyncTarget(); _resetConnection(); @@ -135,11 +143,15 @@ Status SyncSourceFeedback::updateUpstream(OperationContext* txn) { Status status = Command::getStatusFromCommandResult(res); if (!status.isOK()) { - log() << "SyncSourceFeedback error sending update, response: " << res.toString() << endl; - // blacklist sync target for .5 seconds and find a new one, unless we were rejected due - // to the syncsource having a newer config - if (status != ErrorCodes::InvalidReplicaSetConfig || res["configVersion"].eoo() || - res["configVersion"].numberLong() < replCoord->getConfig().getConfigVersion()) { + log() << "SyncSourceFeedback error sending " << (oldStyle ? "old style " : "") + << "update, response: " << res.toString(); + if (status == ErrorCodes::BadValue && !oldStyle) { + log() << "SyncSourceFeedback falling back to old style UpdatePosition command"; + _fallBackToOldUpdatePosition = true; + } else if (status != ErrorCodes::InvalidReplicaSetConfig || res["configVersion"].eoo() || + res["configVersion"].numberLong() < replCoord->getConfig().getConfigVersion()) { + // Blacklist sync target for .5 seconds and find a new one, unless we were rejected due + // to the syncsource having a newer config. replCoord->blacklistSyncSource(_syncTarget, Date_t::now() + Milliseconds(500)); BackgroundSync::get()->clearSyncTarget(); _resetConnection(); @@ -195,9 +207,16 @@ void SyncSourceFeedback::run() { continue; } } - Status status = updateUpstream(txn.get()); + bool oldFallBackValue = _fallBackToOldUpdatePosition; + Status status = updateUpstream(txn.get(), _fallBackToOldUpdatePosition); if (!status.isOK()) { - log() << "updateUpstream failed: " << status << ", will retry"; + if (_fallBackToOldUpdatePosition != oldFallBackValue) { + stdx::unique_lock<stdx::mutex> lock(_mtx); + _positionChanged = true; + } else { + log() << (_fallBackToOldUpdatePosition ? "old style " : "") << "updateUpstream" + << " failed: " << status << ", will retry"; + } } } } diff --git a/src/mongo/db/repl/sync_source_feedback.h b/src/mongo/db/repl/sync_source_feedback.h index d1dc13444e1..ed45b59a752 100644 --- a/src/mongo/db/repl/sync_source_feedback.h +++ b/src/mongo/db/repl/sync_source_feedback.h @@ -68,8 +68,10 @@ private: /* Inform the sync target of our current position in the oplog, as well as the positions * of all secondaries chained through us. + * "oldStyle" indicates whether or not the upstream node is pre-3.2.2 and needs the older style + * ReplSetUpdatePosition commands as a result. */ - Status updateUpstream(OperationContext* txn); + Status updateUpstream(OperationContext* txn, bool oldStyle); bool hasConnection() { return _connection.get(); @@ -92,6 +94,8 @@ private: bool _positionChanged = false; // Once this is set to true the _run method will terminate bool _shutdownSignaled = false; + // Indicates whether our syncSource can't accept the new version of the UpdatePosition command. + bool _fallBackToOldUpdatePosition = false; }; } // namespace repl } // namespace mongo diff --git a/src/mongo/db/repl/sync_tail.cpp b/src/mongo/db/repl/sync_tail.cpp index 2e5d0182bcc..b81ee3bfa8e 100644 --- a/src/mongo/db/repl/sync_tail.cpp +++ b/src/mongo/db/repl/sync_tail.cpp @@ -216,13 +216,9 @@ ApplyBatchFinalizer::~ApplyBatchFinalizer() { } void ApplyBatchFinalizer::record(OpTime newOp) { - const bool mustWaitUntilDurable = _replCoord->isV1ElectionProtocol(); - if (!mustWaitUntilDurable) { - // We have to use setMyLastOptimeForward since this thread races with - // logTransitionToPrimaryToOplog. - _replCoord->setMyLastOptimeForward(newOp); - return; - } + // We have to use setMyLastAppliedOpTimeForward since this thread races with + // logTransitionToPrimaryToOplog. + _replCoord->setMyLastAppliedOpTimeForward(newOp); stdx::unique_lock<stdx::mutex> lock(_mutex); _latestOpTime = newOp; @@ -252,9 +248,9 @@ void ApplyBatchFinalizer::_run() { auto txn = cc().makeOperationContext(); txn->recoveryUnit()->goingToWaitUntilDurable(); txn->recoveryUnit()->waitUntilDurable(); - // We have to use setMyLastOptimeForward since this thread races with + // We have to use setMyLastDurableOpTimeForward since this thread races with // logTransitionToPrimaryToOplog. - _replCoord->setMyLastOptimeForward(latestOpTime); + _replCoord->setMyLastDurableOpTimeForward(latestOpTime); } } } // anonymous namespace containing ApplyBatchFinalizer definitions. @@ -716,7 +712,7 @@ void SyncTail::oplogApplication() { auto minValidBoundaries = getMinValid(&txn); OpTime originalEndOpTime(minValidBoundaries.end); - OpTime lastWriteOpTime{replCoord->getMyLastOptime()}; + OpTime lastWriteOpTime{replCoord->getMyLastAppliedOpTime()}; while (!inShutdown()) { OpQueue ops; @@ -747,6 +743,10 @@ void SyncTail::oplogApplication() { if (replCoord->isWaitingForApplierToDrain()) { replCoord->signalDrainComplete(&txn); } + + // Reset when triggered in case it was from a rollback, safe to do at any time. + lastWriteOpTime = replCoord->getMyLastAppliedOpTime(); + continue; // This wasn't a real op. Don't try to apply it. } diff --git a/src/mongo/db/repl/topology_coordinator.h b/src/mongo/db/repl/topology_coordinator.h index fa4edcc9bd9..11b61cb4490 100644 --- a/src/mongo/db/repl/topology_coordinator.h +++ b/src/mongo/db/repl/topology_coordinator.h @@ -233,6 +233,7 @@ public: const ReplSetHeartbeatArgs& args, const std::string& ourSetName, const OpTime& lastOpApplied, + const OpTime& lastOpDurable, ReplSetHeartbeatResponse* response) = 0; // produce a reply to a V1 heartbeat @@ -240,6 +241,7 @@ public: const ReplSetHeartbeatArgsV1& args, const std::string& ourSetName, const OpTime& lastOpApplied, + const OpTime& lastOpDurable, ReplSetHeartbeatResponse* response) = 0; // produce a reply to a status request diff --git a/src/mongo/db/repl/topology_coordinator_impl.cpp b/src/mongo/db/repl/topology_coordinator_impl.cpp index 0c4169fe4b9..32d857be5c8 100644 --- a/src/mongo/db/repl/topology_coordinator_impl.cpp +++ b/src/mongo/db/repl/topology_coordinator_impl.cpp @@ -204,7 +204,7 @@ HostAndPort TopologyCoordinatorImpl::chooseNewSyncSource(Date_t now, // Find primary's oplog time. Reject sync candidates that are more than // _options.maxSyncSourceLagSecs seconds behind. if (_currentPrimaryIndex != -1) { - OpTime primaryOpTime = _hbdata[_currentPrimaryIndex].getOpTime(); + OpTime primaryOpTime = _hbdata[_currentPrimaryIndex].getAppliedOpTime(); // Check if primaryOpTime is still close to 0 because we haven't received // our first heartbeat from a new primary yet. @@ -257,7 +257,7 @@ HostAndPort TopologyCoordinatorImpl::chooseNewSyncSource(Date_t now, continue; } // Candidates cannot be excessively behind. - if (it->getOpTime() < oldestSyncOpTime) { + if (it->getAppliedOpTime() < oldestSyncOpTime) { continue; } // Candidate must not have a configured delay larger than ours. @@ -272,7 +272,7 @@ HostAndPort TopologyCoordinatorImpl::chooseNewSyncSource(Date_t now, } } // only consider candidates that are ahead of where we are - if (it->getOpTime().getTimestamp() <= lastTimestampApplied) { + if (it->getAppliedOpTime().getTimestamp() <= lastTimestampApplied) { continue; } // Candidate cannot be more latent than anything we've already considered. @@ -421,10 +421,10 @@ void TopologyCoordinatorImpl::prepareSyncFromResponse(const ReplicationExecutor: str::stream() << "I cannot reach the requested member: " << target.toString()); return; } - if (hbdata.getOpTime().getSecs() + 10 < lastOpApplied.getSecs()) { + if (hbdata.getAppliedOpTime().getSecs() + 10 < lastOpApplied.getSecs()) { warning() << "attempting to sync from " << target << ", but its latest opTime is " - << hbdata.getOpTime().getSecs() << " and ours is " << lastOpApplied.getSecs() - << " so this may not work"; + << hbdata.getAppliedOpTime().getSecs() << " and ours is " + << lastOpApplied.getSecs() << " so this may not work"; response->append("warning", str::stream() << "requested member \"" << target.toString() << "\" is more than 10 seconds behind us"); @@ -518,7 +518,7 @@ bool TopologyCoordinatorImpl::_shouldVetoMember( return true; } - if (_iAmPrimary() && lastOpApplied >= _hbdata[hopefulIndex].getOpTime()) { + if (_iAmPrimary() && lastOpApplied >= _hbdata[hopefulIndex].getAppliedOpTime()) { // hbinfo is not updated for ourself, so if we are primary we have to check the // primary's last optime separately *errmsg = str::stream() << "I am already primary, " @@ -528,7 +528,8 @@ bool TopologyCoordinatorImpl::_shouldVetoMember( } if (_currentPrimaryIndex != -1 && (hopefulIndex != _currentPrimaryIndex) && - (_hbdata[_currentPrimaryIndex].getOpTime() >= _hbdata[hopefulIndex].getOpTime())) { + (_hbdata[_currentPrimaryIndex].getAppliedOpTime() >= + _hbdata[hopefulIndex].getAppliedOpTime())) { // other members might be aware of more up-to-date nodes *errmsg = str::stream() << _rsConfig.getMemberAt(hopefulIndex).getHostAndPort().toString() @@ -646,6 +647,7 @@ Status TopologyCoordinatorImpl::prepareHeartbeatResponse(Date_t now, const ReplSetHeartbeatArgs& args, const std::string& ourSetName, const OpTime& lastOpApplied, + const OpTime& lastOpDurable, ReplSetHeartbeatResponse* response) { if (args.getProtocolVersion() != 1) { return Status(ErrorCodes::BadValue, @@ -694,7 +696,8 @@ Status TopologyCoordinatorImpl::prepareHeartbeatResponse(Date_t now, // Heartbeat status message response->setHbMsg(_getHbmsg(now)); response->setTime(duration_cast<Seconds>(now - Date_t{})); - response->setOpTime(lastOpApplied); + response->setAppliedOpTime(lastOpApplied); + response->setDurableOpTime(lastOpDurable); if (!_syncSource.empty()) { response->setSyncingTo(_syncSource); @@ -737,6 +740,7 @@ Status TopologyCoordinatorImpl::prepareHeartbeatResponseV1(Date_t now, const ReplSetHeartbeatArgsV1& args, const std::string& ourSetName, const OpTime& lastOpApplied, + const OpTime& lastOpDurable, ReplSetHeartbeatResponse* response) { // Verify that replica set names match const std::string rshb = args.getSetName(); @@ -770,7 +774,8 @@ Status TopologyCoordinatorImpl::prepareHeartbeatResponseV1(Date_t now, response->setElectionTime(_electionTime); } - response->setOpTime(lastOpApplied); + response->setAppliedOpTime(lastOpApplied); + response->setDurableOpTime(lastOpDurable); if (_currentPrimaryIndex != -1) { response->setPrimaryId(_rsConfig.getMemberAt(_currentPrimaryIndex).getId()); @@ -1148,7 +1153,7 @@ HeartbeatResponseAction TopologyCoordinatorImpl::_updatePrimaryFromHBData( const MemberConfig& highestPriorityMember = _rsConfig.getMemberAt(highestPriorityIndex); const OpTime highestPriorityMemberOptime = highestPriorityIndex == _selfIndex ? lastOpApplied - : _hbdata[highestPriorityIndex].getOpTime(); + : _hbdata[highestPriorityIndex].getAppliedOpTime(); if ((highestPriorityMember.getPriority() > currentPrimaryMember.getPriority()) && _isOpTimeCloseEnoughToLatestToElect(highestPriorityMemberOptime, lastOpApplied)) { @@ -1378,7 +1383,7 @@ OpTime TopologyCoordinatorImpl::_latestKnownOpTime(const OpTime& ourLastOpApplie continue; } - OpTime optime = it->getOpTime(); + OpTime optime = it->getAppliedOpTime(); if (optime > latest) { latest = optime; @@ -1467,7 +1472,7 @@ void TopologyCoordinatorImpl::_setCurrentPrimaryForTest(int primaryIndex) { ReplSetHeartbeatResponse hbResponse; hbResponse.setState(MemberState::RS_PRIMARY); hbResponse.setElectionTime(Timestamp()); - hbResponse.setOpTime(_hbdata[primaryIndex].getOpTime()); + hbResponse.setAppliedOpTime(_hbdata[primaryIndex].getAppliedOpTime()); hbResponse.setSyncingTo(HostAndPort()); hbResponse.setHbMsg(""); _hbdata[primaryIndex].setUpValues(_hbdata[primaryIndex].getLastHeartbeat(), @@ -1598,15 +1603,16 @@ void TopologyCoordinatorImpl::prepareStatusResponse(const ReplicationExecutor::C if (!itConfig.isArbiter()) { if (_rsConfig.getProtocolVersion() == 1) { BSONObjBuilder opTime(bb.subobjStart("optime")); - opTime.append("ts", it->getOpTime().getTimestamp()); - opTime.append("t", it->getOpTime().getTerm()); + opTime.append("ts", it->getAppliedOpTime().getTimestamp()); + opTime.append("t", it->getAppliedOpTime().getTerm()); opTime.done(); } else { - bb.append("optime", it->getOpTime().getTimestamp()); + bb.append("optime", it->getAppliedOpTime().getTimestamp()); } - bb.appendDate("optimeDate", - Date_t::fromDurationSinceEpoch(Seconds(it->getOpTime().getSecs()))); + bb.appendDate( + "optimeDate", + Date_t::fromDurationSinceEpoch(Seconds(it->getAppliedOpTime().getSecs()))); } bb.appendDate("lastHeartbeat", it->getLastHeartbeat()); bb.appendDate("lastHeartbeatRecv", it->getLastHeartbeatRecv()); @@ -1914,7 +1920,7 @@ TopologyCoordinatorImpl::UnelectableReasonMask TopologyCoordinatorImpl::_getUnel result |= NotSecondary; } if (_rsConfig.getProtocolVersion() == 0 && - !_isOpTimeCloseEnoughToLatestToElect(hbData.getOpTime(), lastOpApplied)) { + !_isOpTimeCloseEnoughToLatestToElect(hbData.getAppliedOpTime(), lastOpApplied)) { result |= NotCloseEnoughToLatestOptime; } if (hbData.up() && hbData.isUnelectable()) { @@ -2175,7 +2181,7 @@ bool TopologyCoordinatorImpl::stepDown(Date_t until, bool force, const OpTime& l continue; } UnelectableReasonMask reason = _getUnelectableReason(i, lastOpApplied); - if (!reason && _hbdata[i].getOpTime() >= lastOpApplied) { + if (!reason && _hbdata[i].getAppliedOpTime() >= lastOpApplied) { canStepDown = true; } } @@ -2309,7 +2315,7 @@ bool TopologyCoordinatorImpl::shouldChangeSyncSource(const HostAndPort& currentS invariant(currentSourceIndex != _selfIndex); OpTime currentSourceOpTime = - std::max(syncSourceLastOpTime, _hbdata[currentSourceIndex].getOpTime()); + std::max(syncSourceLastOpTime, _hbdata[currentSourceIndex].getAppliedOpTime()); if (currentSourceOpTime.isNull()) { // Haven't received a heartbeat from the sync source yet, so can't tell if we should @@ -2333,12 +2339,12 @@ bool TopologyCoordinatorImpl::shouldChangeSyncSource(const HostAndPort& currentS if (it->up() && (candidateConfig.isVoter() || !_selfConfig().isVoter()) && (candidateConfig.shouldBuildIndexes() || !_selfConfig().shouldBuildIndexes()) && it->getState().readable() && !_memberIsBlacklisted(candidateConfig, now) && - goalSecs < it->getOpTime().getSecs()) { + goalSecs < it->getAppliedOpTime().getSecs()) { log() << "re-evaluating sync source because our current sync source's most recent " << "OpTime is " << currentSourceOpTime.toString() << " which is more than " << _options.maxSyncSourceLagSecs << " behind member " << candidateConfig.getHostAndPort().toString() << " whose most recent OpTime is " - << it->getOpTime().toString(); + << it->getAppliedOpTime().toString(); invariant(itIndex != _selfIndex); return true; } diff --git a/src/mongo/db/repl/topology_coordinator_impl.h b/src/mongo/db/repl/topology_coordinator_impl.h index 019f6fbaf1d..9c8fad87bfe 100644 --- a/src/mongo/db/repl/topology_coordinator_impl.h +++ b/src/mongo/db/repl/topology_coordinator_impl.h @@ -183,11 +183,13 @@ public: const ReplSetHeartbeatArgs& args, const std::string& ourSetName, const OpTime& lastOpApplied, + const OpTime& lastOpDurable, ReplSetHeartbeatResponse* response); virtual Status prepareHeartbeatResponseV1(Date_t now, const ReplSetHeartbeatArgsV1& args, const std::string& ourSetName, const OpTime& lastOpApplied, + const OpTime& lastOpDurable, ReplSetHeartbeatResponse* response); virtual void prepareStatusResponse(const ReplicationExecutor::CallbackArgs& data, Date_t now, diff --git a/src/mongo/db/repl/topology_coordinator_impl_test.cpp b/src/mongo/db/repl/topology_coordinator_impl_test.cpp index 294c591bbfe..5a73906d164 100644 --- a/src/mongo/db/repl/topology_coordinator_impl_test.cpp +++ b/src/mongo/db/repl/topology_coordinator_impl_test.cpp @@ -207,7 +207,8 @@ private: ReplSetHeartbeatResponse hb; hb.setConfigVersion(1); hb.setState(memberState); - hb.setOpTime(lastOpTimeSender); + hb.setDurableOpTime(lastOpTimeSender); + hb.setAppliedOpTime(lastOpTimeSender); hb.setElectionTime(electionTime); StatusWith<ReplSetHeartbeatResponse> hbResponse = responseStatus.isOK() @@ -1327,7 +1328,8 @@ TEST_F(TopoCoordTest, ReplSetGetStatus) { hb.setState(MemberState::RS_SECONDARY); hb.setElectionTime(electionTime); hb.setHbMsg("READY"); - hb.setOpTime(oplogProgress); + hb.setAppliedOpTime(oplogProgress); + hb.setDurableOpTime(oplogProgress); StatusWith<ReplSetHeartbeatResponse> hbResponseGood = StatusWith<ReplSetHeartbeatResponse>(hb); updateConfig( @@ -2960,7 +2962,8 @@ TEST_F( hbArgs.setSenderId(1); hbArgs.setSenderHost(HostAndPort("host3", 27017)); ReplSetHeartbeatResponse hbResp; - ASSERT_OK(getTopoCoord().prepareHeartbeatResponse(now(), hbArgs, "rs0", election, &hbResp)); + ASSERT_OK( + getTopoCoord().prepareHeartbeatResponse(now(), hbArgs, "rs0", election, election, &hbResp)); ASSERT(!hbResp.hasIsElectable() || hbResp.isElectable()) << hbResp.toString(); } @@ -4245,8 +4248,8 @@ public: OpTime lastOpApplied, ReplSetHeartbeatResponse* response, Status* result) { - *result = - getTopoCoord().prepareHeartbeatResponse(now()++, args, "rs0", lastOpApplied, response); + *result = getTopoCoord().prepareHeartbeatResponse( + now()++, args, "rs0", lastOpApplied, lastOpApplied, response); } }; @@ -4319,7 +4322,7 @@ TEST_F(PrepareHeartbeatResponseTest, ASSERT_FALSE(response.isElectable()); ASSERT_TRUE(response.isReplSet()); ASSERT_EQUALS(MemberState::RS_SECONDARY, response.getState().s); - ASSERT_EQUALS(OpTime(), response.getOpTime()); + ASSERT_EQUALS(OpTime(), response.getDurableOpTime()); ASSERT_EQUALS(0, durationCount<Seconds>(response.getTime())); ASSERT_EQUALS("", response.getHbMsg()); ASSERT_EQUALS("rs0", response.getReplicaSetName()); @@ -4343,7 +4346,7 @@ TEST_F(PrepareHeartbeatResponseTest, ASSERT_FALSE(response.isElectable()); ASSERT_TRUE(response.isReplSet()); ASSERT_EQUALS(MemberState::RS_SECONDARY, response.getState().s); - ASSERT_EQUALS(OpTime(), response.getOpTime()); + ASSERT_EQUALS(OpTime(), response.getDurableOpTime()); ASSERT_EQUALS(0, durationCount<Seconds>(response.getTime())); ASSERT_EQUALS("", response.getHbMsg()); ASSERT_EQUALS("rs0", response.getReplicaSetName()); @@ -4368,7 +4371,7 @@ TEST_F(PrepareHeartbeatResponseTest, ASSERT_FALSE(response.isElectable()); ASSERT_TRUE(response.isReplSet()); ASSERT_EQUALS(MemberState::RS_SECONDARY, response.getState().s); - ASSERT_EQUALS(OpTime(), response.getOpTime()); + ASSERT_EQUALS(OpTime(), response.getDurableOpTime()); ASSERT_EQUALS(0, durationCount<Seconds>(response.getTime())); ASSERT_EQUALS("", response.getHbMsg()); ASSERT_EQUALS("rs0", response.getReplicaSetName()); @@ -4393,7 +4396,7 @@ TEST_F(PrepareHeartbeatResponseTest, ASSERT_FALSE(response.isElectable()); ASSERT_TRUE(response.isReplSet()); ASSERT_EQUALS(MemberState::RS_SECONDARY, response.getState().s); - ASSERT_EQUALS(OpTime(), response.getOpTime()); + ASSERT_EQUALS(OpTime(), response.getDurableOpTime()); ASSERT_EQUALS(0, durationCount<Seconds>(response.getTime())); ASSERT_EQUALS("", response.getHbMsg()); ASSERT_EQUALS("rs0", response.getReplicaSetName()); @@ -4417,7 +4420,7 @@ TEST_F(PrepareHeartbeatResponseTest, ASSERT_FALSE(response.isElectable()); ASSERT_TRUE(response.isReplSet()); ASSERT_EQUALS(MemberState::RS_SECONDARY, response.getState().s); - ASSERT_EQUALS(OpTime(), response.getOpTime()); + ASSERT_EQUALS(OpTime(), response.getDurableOpTime()); ASSERT_EQUALS(0, durationCount<Seconds>(response.getTime())); ASSERT_EQUALS("", response.getHbMsg()); ASSERT_EQUALS("rs0", response.getReplicaSetName()); @@ -4444,7 +4447,7 @@ TEST_F(PrepareHeartbeatResponseTest, ASSERT_TRUE(response.isElectable()); ASSERT_TRUE(response.isReplSet()); ASSERT_EQUALS(MemberState::RS_SECONDARY, response.getState().s); - ASSERT_EQUALS(OpTime(Timestamp(100, 0), 0), response.getOpTime()); + ASSERT_EQUALS(OpTime(Timestamp(100, 0), 0), response.getDurableOpTime()); ASSERT_EQUALS(0, durationCount<Seconds>(response.getTime())); ASSERT_EQUALS("", response.getHbMsg()); ASSERT_EQUALS("rs0", response.getReplicaSetName()); @@ -4460,13 +4463,13 @@ TEST_F(TopoCoordTest, SetConfigVersionToNegativeTwoInHeartbeatResponseWhenNoConf args.setSenderId(20); ReplSetHeartbeatResponse response; // prepare response and check the results - Status result = - getTopoCoord().prepareHeartbeatResponse(now()++, args, "rs0", OpTime(), &response); + Status result = getTopoCoord().prepareHeartbeatResponse( + now()++, args, "rs0", OpTime(), OpTime(), &response); ASSERT_OK(result); ASSERT_FALSE(response.isElectable()); ASSERT_TRUE(response.isReplSet()); ASSERT_EQUALS(MemberState::RS_STARTUP, response.getState().s); - ASSERT_EQUALS(OpTime(), response.getOpTime()); + ASSERT_EQUALS(OpTime(), response.getDurableOpTime()); ASSERT_EQUALS(0, durationCount<Seconds>(response.getTime())); ASSERT_EQUALS("", response.getHbMsg()); ASSERT_EQUALS("rs0", response.getReplicaSetName()); @@ -4493,7 +4496,7 @@ TEST_F(PrepareHeartbeatResponseTest, ASSERT_TRUE(response.isElectable()); ASSERT_TRUE(response.isReplSet()); ASSERT_EQUALS(MemberState::RS_PRIMARY, response.getState().s); - ASSERT_EQUALS(OpTime(Timestamp(11, 0), 0), response.getOpTime()); + ASSERT_EQUALS(OpTime(Timestamp(11, 0), 0), response.getDurableOpTime()); ASSERT_EQUALS(Timestamp(10, 0), response.getElectionTime()); ASSERT_EQUALS(0, durationCount<Seconds>(response.getTime())); ASSERT_EQUALS("", response.getHbMsg()); @@ -4527,7 +4530,7 @@ TEST_F(PrepareHeartbeatResponseTest, ASSERT_TRUE(response.isElectable()); ASSERT_TRUE(response.isReplSet()); ASSERT_EQUALS(MemberState::RS_SECONDARY, response.getState().s); - ASSERT_EQUALS(OpTime(Timestamp(100, 0), 0), response.getOpTime()); + ASSERT_EQUALS(OpTime(Timestamp(100, 0), 0), response.getDurableOpTime()); ASSERT_EQUALS(0, durationCount<Seconds>(response.getTime())); // changed to a syncing message because our sync source changed recently ASSERT_EQUALS("syncing from: h2:27017", response.getHbMsg()); @@ -4854,7 +4857,7 @@ TEST_F(HeartbeatResponseTest, ReconfigBetweenHeartbeatRequestAndRepsonse) { ReplSetHeartbeatResponse hb; hb.initialize(BSON("ok" << 1 << "v" << 1 << "state" << MemberState::RS_PRIMARY), 0); - hb.setOpTime(lastOpTimeApplied); + hb.setDurableOpTime(lastOpTimeApplied); hb.setElectionTime(election.getTimestamp()); StatusWith<ReplSetHeartbeatResponse> hbResponse = StatusWith<ReplSetHeartbeatResponse>(hb); HeartbeatResponseAction action = getTopoCoord().processHeartbeatResponse( @@ -4903,7 +4906,7 @@ TEST_F(HeartbeatResponseTest, ReconfigNodeRemovedBetweenHeartbeatRequestAndRepso ReplSetHeartbeatResponse hb; hb.initialize(BSON("ok" << 1 << "v" << 1 << "state" << MemberState::RS_PRIMARY), 0); - hb.setOpTime(lastOpTimeApplied); + hb.setDurableOpTime(lastOpTimeApplied); hb.setElectionTime(election.getTimestamp()); StatusWith<ReplSetHeartbeatResponse> hbResponse = StatusWith<ReplSetHeartbeatResponse>(hb); HeartbeatResponseAction action = getTopoCoord().processHeartbeatResponse( diff --git a/src/mongo/db/repl/topology_coordinator_impl_v1_test.cpp b/src/mongo/db/repl/topology_coordinator_impl_v1_test.cpp index db549689d69..c8e533b1f86 100644 --- a/src/mongo/db/repl/topology_coordinator_impl_v1_test.cpp +++ b/src/mongo/db/repl/topology_coordinator_impl_v1_test.cpp @@ -209,7 +209,8 @@ private: ReplSetHeartbeatResponse hb; hb.setConfigVersion(1); hb.setState(memberState); - hb.setOpTime(lastOpTimeSender); + hb.setDurableOpTime(lastOpTimeSender); + hb.setAppliedOpTime(lastOpTimeSender); hb.setElectionTime(electionTime); hb.setTerm(getTopoCoord().getTerm()); @@ -1320,7 +1321,8 @@ TEST_F(TopoCoordTest, ReplSetGetStatus) { hb.setState(MemberState::RS_SECONDARY); hb.setElectionTime(electionTime); hb.setHbMsg("READY"); - hb.setOpTime(oplogProgress); + hb.setDurableOpTime(oplogProgress); + hb.setAppliedOpTime(oplogProgress); StatusWith<ReplSetHeartbeatResponse> hbResponseGood = StatusWith<ReplSetHeartbeatResponse>(hb); updateConfig( @@ -1553,7 +1555,7 @@ public: ReplSetHeartbeatResponse* response, Status* result) { *result = getTopoCoord().prepareHeartbeatResponseV1( - now()++, args, "rs0", lastOpApplied, response); + now()++, args, "rs0", lastOpApplied, lastOpApplied, response); } }; @@ -1625,13 +1627,13 @@ TEST_F(TopoCoordTest, SetConfigVersionToNegativeTwoInHeartbeatResponseWhenNoConf args.setSenderId(20); ReplSetHeartbeatResponse response; // prepare response and check the results - Status result = - getTopoCoord().prepareHeartbeatResponseV1(now()++, args, "rs0", OpTime(), &response); + Status result = getTopoCoord().prepareHeartbeatResponseV1( + now()++, args, "rs0", OpTime(), OpTime(), &response); ASSERT_OK(result); // this change to true because we can now see a majority, unlike in the previous cases ASSERT_EQUALS("rs0", response.getReplicaSetName()); ASSERT_EQUALS(MemberState::RS_STARTUP, response.getState().s); - ASSERT_EQUALS(OpTime(), response.getOpTime()); + ASSERT_EQUALS(OpTime(), response.getDurableOpTime()); // default term of topology coordinator is -1 ASSERT_EQUALS(-1, response.getTerm()); ASSERT_EQUALS(-2, response.getConfigVersion()); @@ -1651,7 +1653,7 @@ TEST_F(PrepareHeartbeatResponseV1Test, ASSERT_OK(result); ASSERT_EQUALS("rs0", response.getReplicaSetName()); ASSERT_EQUALS(MemberState::RS_SECONDARY, response.getState().s); - ASSERT_EQUALS(OpTime(), response.getOpTime()); + ASSERT_EQUALS(OpTime(), response.getDurableOpTime()); ASSERT_EQUALS(0, response.getTerm()); ASSERT_EQUALS(1, response.getConfigVersion()); } @@ -1671,7 +1673,7 @@ TEST_F(PrepareHeartbeatResponseV1Test, ASSERT_OK(result); ASSERT_EQUALS("rs0", response.getReplicaSetName()); ASSERT_EQUALS(MemberState::RS_SECONDARY, response.getState().s); - ASSERT_EQUALS(OpTime(), response.getOpTime()); + ASSERT_EQUALS(OpTime(), response.getDurableOpTime()); ASSERT_EQUALS(0, response.getTerm()); ASSERT_EQUALS(1, response.getConfigVersion()); } @@ -1692,7 +1694,7 @@ TEST_F(PrepareHeartbeatResponseV1Test, ASSERT_TRUE(response.hasConfig()); ASSERT_EQUALS("rs0", response.getReplicaSetName()); ASSERT_EQUALS(MemberState::RS_SECONDARY, response.getState().s); - ASSERT_EQUALS(OpTime(), response.getOpTime()); + ASSERT_EQUALS(OpTime(), response.getDurableOpTime()); ASSERT_EQUALS(0, response.getTerm()); ASSERT_EQUALS(1, response.getConfigVersion()); } @@ -1713,7 +1715,7 @@ TEST_F(PrepareHeartbeatResponseV1Test, ASSERT_FALSE(response.hasConfig()); ASSERT_EQUALS("rs0", response.getReplicaSetName()); ASSERT_EQUALS(MemberState::RS_SECONDARY, response.getState().s); - ASSERT_EQUALS(OpTime(), response.getOpTime()); + ASSERT_EQUALS(OpTime(), response.getDurableOpTime()); ASSERT_EQUALS(0, response.getTerm()); ASSERT_EQUALS(1, response.getConfigVersion()); } @@ -1736,7 +1738,7 @@ TEST_F(PrepareHeartbeatResponseV1Test, SetStatePrimaryInHeartbeatResponseWhenPri ASSERT_EQUALS(MemberState::RS_PRIMARY, response.getState().s); ASSERT_TRUE(response.hasElectionTime()); ASSERT_EQUALS(getTopoCoord().getElectionTime(), response.getElectionTime()); - ASSERT_EQUALS(OpTime(Timestamp(11, 0), 0), response.getOpTime()); + ASSERT_EQUALS(OpTime(Timestamp(11, 0), 0), response.getDurableOpTime()); ASSERT_EQUALS(0, response.getTerm()); ASSERT_EQUALS(1, response.getConfigVersion()); } @@ -1767,7 +1769,7 @@ TEST_F(PrepareHeartbeatResponseV1Test, ASSERT_EQUALS("rs0", response.getReplicaSetName()); ASSERT_EQUALS(MemberState::RS_SECONDARY, response.getState().s); ASSERT_FALSE(response.hasElectionTime()); - ASSERT_EQUALS(OpTime(Timestamp(100, 0), 0), response.getOpTime()); + ASSERT_EQUALS(OpTime(Timestamp(100, 0), 0), response.getDurableOpTime()); ASSERT_EQUALS(0, response.getTerm()); ASSERT_EQUALS(1, response.getConfigVersion()); ASSERT_EQUALS(HostAndPort("h2"), response.getSyncingTo()); @@ -3133,7 +3135,7 @@ TEST_F(HeartbeatResponseTestV1, ReconfigNodeRemovedBetweenHeartbeatRequestAndRep ReplSetHeartbeatResponse hb; hb.initialize(BSON("ok" << 1 << "v" << 1 << "state" << MemberState::RS_PRIMARY), 0); - hb.setOpTime(lastOpTimeApplied); + hb.setDurableOpTime(lastOpTimeApplied); hb.setElectionTime(election.getTimestamp()); StatusWith<ReplSetHeartbeatResponse> hbResponse = StatusWith<ReplSetHeartbeatResponse>(hb); HeartbeatResponseAction action = getTopoCoord().processHeartbeatResponse( @@ -3182,7 +3184,7 @@ TEST_F(HeartbeatResponseTestV1, ReconfigBetweenHeartbeatRequestAndRepsonse) { ReplSetHeartbeatResponse hb; hb.initialize(BSON("ok" << 1 << "v" << 1 << "state" << MemberState::RS_PRIMARY), 0); - hb.setOpTime(lastOpTimeApplied); + hb.setDurableOpTime(lastOpTimeApplied); hb.setElectionTime(election.getTimestamp()); StatusWith<ReplSetHeartbeatResponse> hbResponse = StatusWith<ReplSetHeartbeatResponse>(hb); HeartbeatResponseAction action = getTopoCoord().processHeartbeatResponse( diff --git a/src/mongo/db/repl/update_position_args.cpp b/src/mongo/db/repl/update_position_args.cpp index 6ccddfa96aa..c4e2688fdd1 100644 --- a/src/mongo/db/repl/update_position_args.cpp +++ b/src/mongo/db/repl/update_position_args.cpp @@ -1,5 +1,5 @@ /** - * Copyright 2014 MongoDB Inc. + * Copyright 2016 MongoDB Inc. * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License, version 3, @@ -39,11 +39,11 @@ namespace mongo { namespace repl { -UpdatePositionArgs::UpdateInfo::UpdateInfo(const OID& anRid, - const OpTime& aTs, +UpdatePositionArgs::UpdateInfo::UpdateInfo(const OpTime& applied, + const OpTime& durable, long long aCfgver, long long aMemberId) - : rid(anRid), ts(aTs), cfgver(aCfgver), memberId(aMemberId) {} + : appliedOpTime(applied), durableOpTime(durable), cfgver(aCfgver), memberId(aMemberId) {} namespace { @@ -54,32 +54,21 @@ const std::string kLegalUpdatePositionFieldNames[] = { kCommandFieldName, kUpdateArrayFieldName, }; -const std::string kMemberRIDFieldName = "_id"; -const std::string kMemberConfigFieldName = "config"; -const std::string kOpTimeFieldName = "optime"; +const std::string kAppliedOpTimeFieldName = "appliedOpTime"; +const std::string kDurableOpTimeFieldName = "durableOpTime"; const std::string kMemberIdFieldName = "memberId"; const std::string kConfigVersionFieldName = "cfgver"; const std::string kLegalUpdateInfoFieldNames[] = { - kMemberConfigFieldName, - kMemberRIDFieldName, - kOpTimeFieldName, - kMemberIdFieldName, - kConfigVersionFieldName, + kAppliedOpTimeFieldName, kDurableOpTimeFieldName, kMemberIdFieldName, kConfigVersionFieldName, }; } // namespace Status UpdatePositionArgs::initialize(const BSONObj& argsObj) { - Status status = - bsonCheckOnlyHasFields("UpdatePositionArgs", argsObj, kLegalUpdatePositionFieldNames); - - if (!status.isOK()) - return status; - // grab the array of changes BSONElement updateArray; - status = bsonExtractTypedField(argsObj, kUpdateArrayFieldName, Array, &updateArray); + Status status = bsonExtractTypedField(argsObj, kUpdateArrayFieldName, Array, &updateArray); if (!status.isOK()) return status; @@ -87,23 +76,14 @@ Status UpdatePositionArgs::initialize(const BSONObj& argsObj) { BSONObjIterator i(updateArray.Obj()); while (i.more()) { BSONObj entry = i.next().Obj(); - status = bsonCheckOnlyHasFields("UpdateInfoArgs", entry, kLegalUpdateInfoFieldNames); + + OpTime appliedOpTime; + status = bsonExtractOpTimeField(entry, kAppliedOpTimeFieldName, &appliedOpTime); if (!status.isOK()) return status; - OpTime opTime; - if (entry[kOpTimeFieldName].isABSONObj()) { - // In protocol version 1, { ts: <timestamp>, t: term } - Status status = bsonExtractOpTimeField(entry, kOpTimeFieldName, &opTime); - if (!status.isOK()) - return status; - } else { - Timestamp ts; - status = bsonExtractTimestampField(entry, kOpTimeFieldName, &ts); - if (!status.isOK()) - return status; - opTime = OpTime(ts, OpTime::kUninitializedTerm); - } + OpTime durableOpTime; + status = bsonExtractOpTimeField(entry, kDurableOpTimeFieldName, &durableOpTime); if (!status.isOK()) return status; @@ -114,17 +94,12 @@ Status UpdatePositionArgs::initialize(const BSONObj& argsObj) { if (!status.isOK()) return status; - OID rid; - status = bsonExtractOIDFieldWithDefault(entry, kMemberRIDFieldName, OID(), &rid); - if (!status.isOK()) - return status; - long long memberID; status = bsonExtractIntegerFieldWithDefault(entry, kMemberIdFieldName, -1, &memberID); if (!status.isOK()) return status; - _updates.push_back(UpdateInfo(rid, opTime, cfgver, memberID)); + _updates.push_back(UpdateInfo(appliedOpTime, durableOpTime, cfgver, memberID)); } return Status::OK(); @@ -140,10 +115,11 @@ BSONObj UpdatePositionArgs::toBSON() const { BSONArrayBuilder updateArray(builder.subarrayStart(kUpdateArrayFieldName)); for (UpdatePositionArgs::UpdateIterator update = updatesBegin(); update != updatesEnd(); ++update) { - updateArray.append(BSON(kMemberRIDFieldName << update->rid << kOpTimeFieldName - << update->ts.getTimestamp() - << kConfigVersionFieldName << update->cfgver - << kMemberIdFieldName << update->memberId)); + BSONObjBuilder updateEntry(updateArray.subobjStart()); + updateEntry.append(kConfigVersionFieldName, update->cfgver); + updateEntry.append(kMemberIdFieldName, update->memberId); + update->durableOpTime.append(&updateEntry, kDurableOpTimeFieldName); + update->appliedOpTime.append(&updateEntry, kAppliedOpTimeFieldName); } updateArray.doneFast(); } diff --git a/src/mongo/db/repl/update_position_args.h b/src/mongo/db/repl/update_position_args.h index ecaf9ec5d4e..823a775588e 100644 --- a/src/mongo/db/repl/update_position_args.h +++ b/src/mongo/db/repl/update_position_args.h @@ -1,5 +1,5 @@ /** - * Copyright (C) 2014 MongoDB Inc. + * Copyright (C) 2016 MongoDB Inc. * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License, version 3, @@ -45,10 +45,13 @@ namespace repl { class UpdatePositionArgs { public: struct UpdateInfo { - UpdateInfo(const OID& anRid, const OpTime& aTs, long long aCfgver, long long aMemberId); + UpdateInfo(const OpTime& applied, + const OpTime& durable, + long long aCfgver, + long long aMemberId); - OID rid; - OpTime ts; + OpTime appliedOpTime; + OpTime durableOpTime; long long cfgver; long long memberId; }; diff --git a/src/mongo/db/s/migration_impl.cpp b/src/mongo/db/s/migration_impl.cpp index d33fc302d9b..e761f27a78d 100644 --- a/src/mongo/db/s/migration_impl.cpp +++ b/src/mongo/db/s/migration_impl.cpp @@ -64,7 +64,7 @@ Tee* const migrateLog = RamLog::get("migrate"); const int kDefaultWriteTimeoutForMigrationMs = 60 * 1000; const WriteConcernOptions DefaultWriteConcernForMigration(2, - WriteConcernOptions::NONE, + WriteConcernOptions::SyncMode::NONE, kDefaultWriteTimeoutForMigrationMs); WriteConcernOptions getDefaultWriteConcernForMigration() { @@ -77,7 +77,7 @@ WriteConcernOptions getDefaultWriteConcernForMigration() { } } - return WriteConcernOptions(1, WriteConcernOptions::NONE, 0); + return WriteConcernOptions(1, WriteConcernOptions::SyncMode::NONE, 0); } BSONObj createRecvChunkCommitRequest(const MigrationSessionId& sessionId) { diff --git a/src/mongo/db/s/sharding_state_recovery.cpp b/src/mongo/db/s/sharding_state_recovery.cpp index ec95bb5f956..804f4b21ca2 100644 --- a/src/mongo/db/s/sharding_state_recovery.cpp +++ b/src/mongo/db/s/sharding_state_recovery.cpp @@ -64,7 +64,7 @@ const char kMinOpTimeUpdaters[] = "minOpTimeUpdaters"; const Seconds kWriteTimeout(15); const WriteConcernOptions kMajorityWriteConcern(WriteConcernOptions::kMajority, - WriteConcernOptions::NONE, + WriteConcernOptions::SyncMode::UNSET, kWriteTimeout); MONGO_EXPORT_STARTUP_SERVER_PARAMETER(recoverShardingState, bool, true); diff --git a/src/mongo/db/write_concern.cpp b/src/mongo/db/write_concern.cpp index 6f447d8b666..3c9086ca39a 100644 --- a/src/mongo/db/write_concern.cpp +++ b/src/mongo/db/write_concern.cpp @@ -26,6 +26,8 @@ * it in the license file. */ +#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kWrite + #include "mongo/platform/basic.h" #include "mongo/db/write_concern.h" @@ -43,6 +45,7 @@ #include "mongo/db/storage/storage_engine.h" #include "mongo/db/write_concern_options.h" #include "mongo/rpc/protocol.h" +#include "mongo/util/log.h" namespace mongo { @@ -59,23 +62,13 @@ static ServerStatusMetricField<Counter64> gleWtimeoutsDisplay("getLastError.wtim void setupSynchronousCommit(OperationContext* txn) { const WriteConcernOptions& writeConcern = txn->getWriteConcern(); - if (writeConcern.syncMode == WriteConcernOptions::JOURNAL || - writeConcern.syncMode == WriteConcernOptions::FSYNC) { + if (writeConcern.syncMode == WriteConcernOptions::SyncMode::JOURNAL || + writeConcern.syncMode == WriteConcernOptions::SyncMode::FSYNC) { txn->recoveryUnit()->goingToWaitUntilDurable(); } } namespace { -// The consensus protocol requires that w: majority implies j: true on all nodes. -void addJournalSyncForWMajority(WriteConcernOptions* writeConcern) { - if (repl::getGlobalReplicationCoordinator()->isV1ElectionProtocol() && - writeConcern->wMode == WriteConcernOptions::kMajority && - writeConcern->syncMode == WriteConcernOptions::NONE && - getGlobalServiceContext()->getGlobalStorageEngine()->isDurable()) { - writeConcern->syncMode = WriteConcernOptions::JOURNAL; - } -} - const std::string kLocalDB = "local"; } // namespace @@ -89,8 +82,6 @@ StatusWith<WriteConcernOptions> extractWriteConcern(OperationContext* txn, if (writeConcern.wNumNodes == 0 && writeConcern.wMode.empty()) { writeConcern.wNumNodes = 1; } - // Upgrade default write concern if necessary. - addJournalSyncForWMajority(&writeConcern); BSONElement writeConcernElement; Status wcStatus = bsonExtractTypedField(cmdObj, "writeConcern", Object, &writeConcernElement); @@ -118,17 +109,15 @@ StatusWith<WriteConcernOptions> extractWriteConcern(OperationContext* txn, return wcStatus; } - // Upgrade parsed write concern if necessary. - addJournalSyncForWMajority(&writeConcern); - return writeConcern; } + Status validateWriteConcern(OperationContext* txn, const WriteConcernOptions& writeConcern, const std::string& dbName) { const bool isJournalEnabled = getGlobalServiceContext()->getGlobalStorageEngine()->isDurable(); - if (writeConcern.syncMode == WriteConcernOptions::JOURNAL && !isJournalEnabled) { + if (writeConcern.syncMode == WriteConcernOptions::SyncMode::JOURNAL && !isJournalEnabled) { return Status(ErrorCodes::BadValue, "cannot use 'j' option when a host does not have journaling enabled"); } @@ -220,7 +209,7 @@ void WriteConcernResult::appendTo(const WriteConcernOptions& writeConcern, // GLE, but with journaling we don't actually need to run the fsync (fsync command is // preferred in 2.6). So we add a "waited" field if one doesn't exist. - if (writeConcern.syncMode == WriteConcernOptions::FSYNC) { + if (writeConcern.syncMode == WriteConcernOptions::SyncMode::FSYNC) { if (fsyncFiles < 0 && (wTime < 0 || !wTimedOut)) { dassert(result->asTempObj()["waited"].eoo()); result->appendNumber("waited", syncMillis); @@ -248,11 +237,18 @@ Status waitForWriteConcern(OperationContext* txn, // Next handle blocking on disk Timer syncTimer; + auto replCoord = repl::getGlobalReplicationCoordinator(); + WriteConcernOptions writeConcernWithPopulatedSyncMode = + replCoord->populateUnsetWriteConcernOptionsSyncMode(writeConcern); - switch (writeConcern.syncMode) { - case WriteConcernOptions::NONE: + + switch (writeConcernWithPopulatedSyncMode.syncMode) { + case WriteConcernOptions::SyncMode::UNSET: + severe() << "Attempting to wait on a WriteConcern with an unset sync option"; + fassertFailed(34410); + case WriteConcernOptions::SyncMode::NONE: break; - case WriteConcernOptions::FSYNC: { + case WriteConcernOptions::SyncMode::FSYNC: { StorageEngine* storageEngine = getGlobalServiceContext()->getGlobalStorageEngine(); if (!storageEngine->isDurable()) { result->fsyncFiles = storageEngine->flushAllFiles(true); @@ -262,8 +258,16 @@ Status waitForWriteConcern(OperationContext* txn, } break; } - case WriteConcernOptions::JOURNAL: - txn->recoveryUnit()->waitUntilDurable(); + case WriteConcernOptions::SyncMode::JOURNAL: + if (replCoord->getReplicationMode() != repl::ReplicationCoordinator::Mode::modeNone) { + // Wait for ops to become durable then update replication system's + // knowledge of this. + OpTime appliedOpTime = replCoord->getMyLastAppliedOpTime(); + txn->recoveryUnit()->waitUntilDurable(); + replCoord->setMyLastDurableOpTimeForward(appliedOpTime); + } else { + txn->recoveryUnit()->waitUntilDurable(); + } break; } @@ -277,7 +281,8 @@ Status waitForWriteConcern(OperationContext* txn, } // needed to avoid incrementing gleWtimeStats SERVER-9005 - if (writeConcern.wNumNodes <= 1 && writeConcern.wMode.empty()) { + if (writeConcernWithPopulatedSyncMode.wNumNodes <= 1 && + writeConcernWithPopulatedSyncMode.wMode.empty()) { // no desired replication check return Status::OK(); } @@ -285,14 +290,17 @@ Status waitForWriteConcern(OperationContext* txn, // Now we wait for replication // Note that replica set stepdowns and gle mode changes are thrown as errors repl::ReplicationCoordinator::StatusAndDuration replStatus = - repl::getGlobalReplicationCoordinator()->awaitReplication(txn, replOpTime, writeConcern); + repl::getGlobalReplicationCoordinator()->awaitReplication( + txn, replOpTime, writeConcernWithPopulatedSyncMode); if (replStatus.status == ErrorCodes::WriteConcernFailed) { gleWtimeouts.increment(); result->err = "timeout"; result->wTimedOut = true; } // Add stats - result->writtenTo = repl::getGlobalReplicationCoordinator()->getHostsWrittenTo(replOpTime); + result->writtenTo = repl::getGlobalReplicationCoordinator()->getHostsWrittenTo( + replOpTime, + writeConcernWithPopulatedSyncMode.syncMode == WriteConcernOptions::SyncMode::JOURNAL); gleWtimeStats.recordMillis(durationCount<Milliseconds>(replStatus.duration)); result->wTime = durationCount<Milliseconds>(replStatus.duration); diff --git a/src/mongo/db/write_concern_options.cpp b/src/mongo/db/write_concern_options.cpp index b099d868a96..58af7b36a9d 100644 --- a/src/mongo/db/write_concern_options.cpp +++ b/src/mongo/db/write_concern_options.cpp @@ -72,6 +72,7 @@ WriteConcernOptions::WriteConcernOptions(const std::string& mode, : syncMode(sync), wNumNodes(0), wMode(mode), wTimeout(durationCount<Milliseconds>(timeout)) {} Status WriteConcernOptions::parse(const BSONObj& obj) { + reset(); if (obj.isEmpty()) { return Status(ErrorCodes::FailedToParse, "write concern object cannot be empty"); } @@ -94,10 +95,11 @@ Status WriteConcernOptions::parse(const BSONObj& obj) { return Status(ErrorCodes::FailedToParse, "fsync and j options cannot be used together"); if (j) { - syncMode = JOURNAL; - } - if (fsync) { - syncMode = FSYNC; + syncMode = SyncMode::JOURNAL; + } else if (fsync) { + syncMode = SyncMode::FSYNC; + } else if (!jEl.eoo()) { + syncMode = SyncMode::NONE; } BSONElement e = obj["w"]; @@ -172,10 +174,12 @@ BSONObj WriteConcernOptions::toBSON() const { builder.append("w", wMode); } - if (syncMode == FSYNC) { + if (syncMode == SyncMode::FSYNC) { builder.append("fsync", true); - } else if (syncMode == JOURNAL) { + } else if (syncMode == SyncMode::JOURNAL) { builder.append("j", true); + } else if (syncMode == SyncMode::NONE) { + builder.append("j", false); } builder.append("wtimeout", wTimeout); diff --git a/src/mongo/db/write_concern_options.h b/src/mongo/db/write_concern_options.h index 1bac963f16f..5acc54e5294 100644 --- a/src/mongo/db/write_concern_options.h +++ b/src/mongo/db/write_concern_options.h @@ -37,7 +37,7 @@ class Status; struct WriteConcernOptions { public: - enum SyncMode { NONE, FSYNC, JOURNAL }; + enum class SyncMode { UNSET, NONE, FSYNC, JOURNAL }; static const int kNoTimeout = 0; static const int kNoWaiting = -1; @@ -51,6 +51,9 @@ public: WriteConcernOptions() { reset(); + // We set syncMode to NONE to avoid having an UNSET syncMode in default WriteConcernOptions + // since that can cause invariants to trigger. + syncMode = SyncMode::NONE; } WriteConcernOptions(int numNodes, SyncMode sync, int timeout); @@ -94,7 +97,7 @@ public: bool validForConfigServers() const; void reset() { - syncMode = NONE; + syncMode = SyncMode::UNSET; wNumNodes = 0; wMode = ""; wTimeout = 0; diff --git a/src/mongo/s/catalog/replset/catalog_manager_replica_set.cpp b/src/mongo/s/catalog/replset/catalog_manager_replica_set.cpp index 3313438448f..6355d87ad62 100644 --- a/src/mongo/s/catalog/replset/catalog_manager_replica_set.cpp +++ b/src/mongo/s/catalog/replset/catalog_manager_replica_set.cpp @@ -95,10 +95,12 @@ const ReadPreferenceSetting kConfigReadSelector(ReadPreference::Nearest, TagSet{ const ReadPreferenceSetting kConfigPrimaryPreferredSelector(ReadPreference::PrimaryPreferred, TagSet{}); const WriteConcernOptions kMajorityWriteConcern(WriteConcernOptions::kMajority, - // Note: Even though we're setting NONE here, + // Note: Even though we're setting UNSET here, // kMajority implies JOURNAL if journaling is - // supported by mongod. - WriteConcernOptions::NONE, + // supported by mongod and + // writeConcernMajorityJournalDefault is set to true + // in the ReplicaSetConfig. + WriteConcernOptions::SyncMode::UNSET, Seconds(15)); const int kMaxConfigVersionInitRetry = 3; @@ -791,6 +793,7 @@ bool CatalogManagerReplicaSet::runUserManagementWriteCommand(OperationContext* t // Make sure that if the command has a write concern that it is w:1 or w:majority, and // convert w:1 or no write concern to w:majority before sending. WriteConcernOptions writeConcern; + writeConcern.reset(); const char* writeConcernFieldName = "writeConcern"; BSONElement writeConcernElement = cmdObj[writeConcernFieldName]; bool initialCmdHadWriteConcern = !writeConcernElement.eoo(); diff --git a/src/mongo/s/catalog/replset/dist_lock_catalog_impl.cpp b/src/mongo/s/catalog/replset/dist_lock_catalog_impl.cpp index 2ce6c844619..3664fdfcbc5 100644 --- a/src/mongo/s/catalog/replset/dist_lock_catalog_impl.cpp +++ b/src/mongo/s/catalog/replset/dist_lock_catalog_impl.cpp @@ -65,10 +65,10 @@ const char kFindAndModifyResponseResultDocField[] = "value"; const char kLocalTimeField[] = "localTime"; const ReadPreferenceSetting kReadPref(ReadPreference::PrimaryOnly, TagSet()); const WriteConcernOptions kMajorityWriteConcern(WriteConcernOptions::kMajority, - // Note: Even though we're setting NONE here, + // Note: Even though we're setting UNSET here, // kMajority implies JOURNAL if journaling is // supported by this mongod. - WriteConcernOptions::NONE, + WriteConcernOptions::SyncMode::UNSET, Seconds(15)); /** diff --git a/src/mongo/s/catalog/type_settings.cpp b/src/mongo/s/catalog/type_settings.cpp index c19809efa30..d26972b5990 100644 --- a/src/mongo/s/catalog/type_settings.cpp +++ b/src/mongo/s/catalog/type_settings.cpp @@ -192,7 +192,7 @@ std::unique_ptr<WriteConcernOptions> SettingsType::getWriteConcern() const { dassert(_key == BalancerDocKey); if (isSecondaryThrottleSet() && !getSecondaryThrottle()) { - return stdx::make_unique<WriteConcernOptions>(1, WriteConcernOptions::NONE, 0); + return stdx::make_unique<WriteConcernOptions>(1, WriteConcernOptions::SyncMode::NONE, 0); } else if (!isMigrationWriteConcernSet()) { // Default setting. return nullptr; |