summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/mongo/db/repl/topology_coordinator.cpp32
-rw-r--r--src/mongo/db/repl/topology_coordinator.h6
-rw-r--r--src/mongo/db/repl/topology_coordinator_v1_test.cpp1
3 files changed, 34 insertions, 5 deletions
diff --git a/src/mongo/db/repl/topology_coordinator.cpp b/src/mongo/db/repl/topology_coordinator.cpp
index 1de492a3eef..41593036ad2 100644
--- a/src/mongo/db/repl/topology_coordinator.cpp
+++ b/src/mongo/db/repl/topology_coordinator.cpp
@@ -830,6 +830,12 @@ bool TopologyCoordinator::haveNumNodesReachedOpTime(const OpTime& targetOpTime,
return false;
}
+ // Invariants that we only wait for an OpTime in the term that this node is currently writing
+ // to. In other words, we do not support waiting for an OpTime written by a previous primary
+ // because comparing members' lastApplied/lastDurable alone is not sufficient to tell if the
+ // OpTime has been replicated.
+ invariant(targetOpTime.getTerm() == getMyLastAppliedOpTime().getTerm());
+
for (auto&& memberData : _memberData) {
const auto isArbiter = _rsConfig.getMemberAt(memberData.getConfigIndex()).isArbiter();
@@ -841,7 +847,14 @@ bool TopologyCoordinator::haveNumNodesReachedOpTime(const OpTime& targetOpTime,
const OpTime& memberOpTime =
durablyWritten ? memberData.getLastDurableOpTime() : memberData.getLastAppliedOpTime();
- if (memberOpTime >= targetOpTime) {
+ // In addition to checking if a member has a greater/equal timestamp field we also need to
+ // make sure that the memberOpTime is in the same term as the OpTime we wait for. If a
+ // member's OpTime has a higher term, it indicates that this node will be stepping down. And
+ // thus we do not know if the target OpTime in our previous term has been replicated to the
+ // member because the memberOpTime in a higher term could correspond to an operation in a
+ // divergent branch of history regardless of its timestamp.
+ if (memberOpTime.getTerm() == targetOpTime.getTerm() &&
+ memberOpTime.getTimestamp() >= targetOpTime.getTimestamp()) {
--numNodes;
}
@@ -856,10 +869,25 @@ bool TopologyCoordinator::haveTaggedNodesReachedOpTime(const OpTime& opTime,
const ReplSetTagPattern& tagPattern,
bool durablyWritten) {
ReplSetTagMatch matcher(tagPattern);
+
+ // Invariants that we only wait for an OpTime in the term that this node is currently writing
+ // to. In other words, we do not support waiting for an OpTime written by a previous primary
+ // because comparing members' lastApplied/lastDurable alone is not sufficient to tell if the
+ // OpTime has been replicated.
+ invariant(opTime.getTerm() == getMyLastAppliedOpTime().getTerm());
+
for (auto&& memberData : _memberData) {
const OpTime& memberOpTime =
durablyWritten ? memberData.getLastDurableOpTime() : memberData.getLastAppliedOpTime();
- if (memberOpTime >= opTime) {
+
+ // In addition to checking if a member has a greater/equal timestamp field we also need to
+ // make sure that the memberOpTime is in the same term as the OpTime we wait for. If a
+ // member's OpTime has a higher term, it indicates that this node will be stepping down. And
+ // thus we do not know if the target OpTime in our previous term has been replicated to the
+ // member because the memberOpTime in a higher term could correspond to an operation in a
+ // divergent branch of history regardless of its timestamp.
+ if (memberOpTime.getTerm() == opTime.getTerm() &&
+ memberOpTime.getTimestamp() >= opTime.getTimestamp()) {
// This node has reached the desired optime, now we need to check if it is a part
// of the tagPattern.
int memberIndex = memberData.getConfigIndex();
diff --git a/src/mongo/db/repl/topology_coordinator.h b/src/mongo/db/repl/topology_coordinator.h
index d1e8a8eb2b9..ec30a024d4b 100644
--- a/src/mongo/db/repl/topology_coordinator.h
+++ b/src/mongo/db/repl/topology_coordinator.h
@@ -407,14 +407,14 @@ public:
const StatusWith<ReplSetHeartbeatResponse>& hbResponse);
/**
- * Returns whether or not at least 'numNodes' have reached the given opTime.
+ * Returns whether or not at least 'numNodes' have reached the given opTime with the same term.
* "durablyWritten" indicates whether the operation has to be durably applied.
*/
bool haveNumNodesReachedOpTime(const OpTime& opTime, int numNodes, bool durablyWritten);
/**
- * Returns whether or not at least one node matching the tagPattern has reached
- * the given opTime.
+ * Returns whether or not at least one node matching the tagPattern has reached the given opTime
+ * with the same term.
* "durablyWritten" indicates whether the operation has to be durably applied.
*/
bool haveTaggedNodesReachedOpTime(const OpTime& opTime,
diff --git a/src/mongo/db/repl/topology_coordinator_v1_test.cpp b/src/mongo/db/repl/topology_coordinator_v1_test.cpp
index 3a272d4d67a..208ce19ef11 100644
--- a/src/mongo/db/repl/topology_coordinator_v1_test.cpp
+++ b/src/mongo/db/repl/topology_coordinator_v1_test.cpp
@@ -114,6 +114,7 @@ protected:
getTopoCoord().changeMemberState_forTest(MemberState::RS_PRIMARY, electionTimestamp);
getTopoCoord().setCurrentPrimary_forTest(_selfIndex, electionTimestamp);
OpTime dummyOpTime(Timestamp(1, 1), getTopoCoord().getTerm());
+ setMyOpTime(dummyOpTime);
ASSERT_OK(getTopoCoord().completeTransitionToPrimary(dummyOpTime));
}