author     Medha Potluri <medha.potluri@mongodb.com>  2019-08-06 11:04:20 -0400
committer  Medha Potluri <medha.potluri@mongodb.com>  2019-08-16 14:30:34 -0400
commit     8494ca7d8a88aa3d5df96e89beafacd4caca3801 (patch)
tree       819fc2ee7c4b6d0c2e3a7cc6a0d93d34ae9dd70c
parent     d362c1c39ca79dd20e0aa6e9f93171fc5bd2cdec (diff)
SERVER-41504 Track the number of ops during catchup in replSetGetStatus on primaries
-rw-r--r--  jstests/replsets/catchup.js                              19
-rw-r--r--  src/mongo/db/repl/replication_coordinator.h               6
-rw-r--r--  src/mongo/db/repl/replication_coordinator_impl.cpp       19
-rw-r--r--  src/mongo/db/repl/replication_coordinator_impl.h          6
-rw-r--r--  src/mongo/db/repl/replication_coordinator_mock.cpp        4
-rw-r--r--  src/mongo/db/repl/replication_coordinator_mock.h          2
-rw-r--r--  src/mongo/db/repl/replication_metrics.cpp                 6
-rw-r--r--  src/mongo/db/repl/replication_metrics.h                   1
-rw-r--r--  src/mongo/db/repl/replication_metrics.idl                 5
-rw-r--r--  src/mongo/db/repl/sync_tail.cpp                           4
-rw-r--r--  src/mongo/embedded/replication_coordinator_embedded.cpp   4
-rw-r--r--  src/mongo/embedded/replication_coordinator_embedded.h     2
12 files changed, 75 insertions(+), 3 deletions(-)
diff --git a/jstests/replsets/catchup.js b/jstests/replsets/catchup.js
index a5b6292fa95..7457aa3a569 100644
--- a/jstests/replsets/catchup.js
+++ b/jstests/replsets/catchup.js
@@ -127,6 +127,13 @@ assert(!res.electionCandidateMetrics.targetCatchupOpTime,
() => "Response should not have an 'electionCandidateMetrics.targetCatchupOpTime' field: " +
tojson(res.electionCandidateMetrics));
+// Check that the 'electionCandidateMetrics' section of the replSetGetStatus response has a
+// 'numCatchUpOps' field once the primary is caught up, and that it has the correct value.
+assert(res.electionCandidateMetrics.numCatchUpOps,
+ () => "Response should have an 'electionCandidateMetrics.numCatchUpOps' field: " +
+ tojson(res.electionCandidateMetrics));
+assert.eq(res.electionCandidateMetrics.numCatchUpOps, 0);
+
jsTest.log("Case 2: The primary needs to catch up, succeeds in time.");
initialNewPrimaryStatus =
assert.commandWorked(rst.getSecondaries()[0].adminCommand({serverStatus: 1}));
@@ -172,15 +179,21 @@ verifyCatchUpConclusionReason(initialNewPrimaryStatus.electionMetrics,
newNewPrimaryStatus.electionMetrics,
'numCatchUpsSucceeded');
-// Check that the 'electionCandidateMetrics' section of the replSetGetStatus response has a
-// 'targetCatchupOpTime' field once heartbeats have updated the target opTime for catchup, and that
-// it has the correct value.
+// Check that the 'electionCandidateMetrics' section of the replSetGetStatus response has
+// 'targetCatchupOpTime' and 'numCatchUpOps' fields once the primary is caught up, and that they
+// have the correct values.
assert(res.electionCandidateMetrics.targetCatchupOpTime,
() => "Response should have an 'electionCandidateMetrics.targetCatchupOpTime' field: " +
tojson(res.electionCandidateMetrics));
assert.eq(res.electionCandidateMetrics.targetCatchupOpTime.ts,
stepUpResults.latestOpOnOldPrimary.ts);
assert.eq(res.electionCandidateMetrics.targetCatchupOpTime.t, stepUpResults.latestOpOnOldPrimary.t);
+assert(res.electionCandidateMetrics.numCatchUpOps,
+ () => "Response should have an 'electionCandidateMetrics.numCatchUpOps' field: " +
+ tojson(res.electionCandidateMetrics));
+// numCatchUpOps should be 4 because the 'foo' collection is implicitly created during the 3
+// inserts, and that's where the additional oplog entry comes from.
+assert.eq(res.electionCandidateMetrics.numCatchUpOps, 4);
// Wait for all secondaries to catch up
rst.awaitReplication();
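
For orientation, a minimal shell-level sketch (not part of this patch; the abbreviated output in the comments is illustrative) of where the new field surfaces in the replSetGetStatus response:

    // Sketch: reading the new metric from a primary that has completed catchup.
    // Assumes 'rst' is a ReplSetTest, as in catchup.js above.
    const status = assert.commandWorked(
        rst.getPrimary().adminCommand({replSetGetStatus: 1}));
    // Illustrative shape of the relevant section once the primary is caught up:
    //   electionCandidateMetrics: {
    //       ...,
    //       targetCatchupOpTime: {ts: Timestamp(...), t: NumberLong(...)},
    //       numCatchUpOps: NumberLong(4),
    //       ...
    //   }
    jsTestLog("Ops applied during catchup: " +
              tojson(status.electionCandidateMetrics.numCatchUpOps));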
diff --git a/src/mongo/db/repl/replication_coordinator.h b/src/mongo/db/repl/replication_coordinator.h
index 5eb8007d66c..2f8bd118552 100644
--- a/src/mongo/db/repl/replication_coordinator.h
+++ b/src/mongo/db/repl/replication_coordinator.h
@@ -879,6 +879,12 @@ public:
virtual Status abortCatchupIfNeeded(PrimaryCatchUpConclusionReason reason) = 0;
/**
+ * Increment the counter for the number of ops applied during catchup if the node is in catchup
+ * mode.
+ */
+ virtual void incrementNumCatchUpOpsIfCatchingUp(int numOps) = 0;
+
+ /**
* Signals that drop pending collections have been removed from storage.
*/
virtual void signalDropPendingCollectionsRemovedFromStorage() = 0;
diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp
index c44ccf40836..305434968a2 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl.cpp
@@ -3066,6 +3066,8 @@ void ReplicationCoordinatorImpl::CatchupState::start_inlock() {
return;
}
_timeoutCbh = status.getValue();
+
+ _numCatchUpOps = 0;
}
void ReplicationCoordinatorImpl::CatchupState::abort_inlock(PrimaryCatchUpConclusionReason reason) {
@@ -3101,6 +3103,9 @@ void ReplicationCoordinatorImpl::CatchupState::signalHeartbeatUpdate_inlock() {
if (*targetOpTime <= myLastApplied) {
log() << "Caught up to the latest optime known via heartbeats after becoming primary. "
<< "Target optime: " << *targetOpTime << ". My Last Applied: " << myLastApplied;
+ // Report the number of ops applied during catchup in replSetGetStatus once the primary is
+ // caught up.
+ ReplicationMetrics::get(getGlobalServiceContext()).setNumCatchUpOps(_numCatchUpOps);
abort_inlock(PrimaryCatchUpConclusionReason::kAlreadyCaughtUp);
return;
}
@@ -3135,6 +3140,9 @@ void ReplicationCoordinatorImpl::CatchupState::signalHeartbeatUpdate_inlock() {
if (*targetOpTime <= myLastApplied) {
log() << "Caught up to the latest known optime successfully after becoming primary. "
<< "Target optime: " << *targetOpTime << ". My Last Applied: " << myLastApplied;
+ // Report the number of ops applied during catchup in replSetGetStatus once the primary
+ // is caught up.
+ ReplicationMetrics::get(getGlobalServiceContext()).setNumCatchUpOps(_numCatchUpOps);
abort_inlock(PrimaryCatchUpConclusionReason::kSucceeded);
}
};
@@ -3142,6 +3150,10 @@ void ReplicationCoordinatorImpl::CatchupState::signalHeartbeatUpdate_inlock() {
_repl->_opTimeWaiterList.add_inlock(_waiter.get());
}
+void ReplicationCoordinatorImpl::CatchupState::incrementNumCatchUpOps_inlock(int numOps) {
+ _numCatchUpOps += numOps;
+}
+
Status ReplicationCoordinatorImpl::abortCatchupIfNeeded(PrimaryCatchUpConclusionReason reason) {
stdx::lock_guard<stdx::mutex> lk(_mutex);
if (_catchupState) {
@@ -3151,6 +3163,13 @@ Status ReplicationCoordinatorImpl::abortCatchupIfNeeded(PrimaryCatchUpConclusion
return Status(ErrorCodes::IllegalOperation, "The node is not in catch-up mode.");
}
+void ReplicationCoordinatorImpl::incrementNumCatchUpOpsIfCatchingUp(int numOps) {
+ stdx::lock_guard<stdx::mutex> lk(_mutex);
+ if (_catchupState) {
+ _catchupState->incrementNumCatchUpOps_inlock(numOps);
+ }
+}
+
void ReplicationCoordinatorImpl::signalDropPendingCollectionsRemovedFromStorage() {
stdx::lock_guard<stdx::mutex> lock(_mutex);
_wakeReadyWaiters_inlock();
diff --git a/src/mongo/db/repl/replication_coordinator_impl.h b/src/mongo/db/repl/replication_coordinator_impl.h
index a3df3f74555..d566238d64c 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.h
+++ b/src/mongo/db/repl/replication_coordinator_impl.h
@@ -323,6 +323,8 @@ public:
virtual Status abortCatchupIfNeeded(PrimaryCatchUpConclusionReason reason) override;
+ virtual void incrementNumCatchUpOpsIfCatchingUp(int numOps) override;
+
void signalDropPendingCollectionsRemovedFromStorage() final;
virtual boost::optional<Timestamp> getRecoveryTimestamp() override;
@@ -672,6 +674,8 @@ private:
void abort_inlock(PrimaryCatchUpConclusionReason reason);
// Heartbeat calls this function to update the target optime.
void signalHeartbeatUpdate_inlock();
+ // Increment the counter for the number of ops applied during catchup.
+ void incrementNumCatchUpOps_inlock(int numOps);
private:
ReplicationCoordinatorImpl* _repl; // Not owned.
@@ -680,6 +684,8 @@ private:
// Handle to a Waiter that contains the current target optime to reach after which
// we can exit catchup mode.
std::unique_ptr<CallbackWaiter> _waiter;
+ // Counter for the number of ops applied during catchup.
+ int _numCatchUpOps;
};
// Inner class to manage the concurrency of _canAcceptNonLocalWrites and _canServeNonLocalReads.
diff --git a/src/mongo/db/repl/replication_coordinator_mock.cpp b/src/mongo/db/repl/replication_coordinator_mock.cpp
index 6f828d6953f..cd8bfb9dc3a 100644
--- a/src/mongo/db/repl/replication_coordinator_mock.cpp
+++ b/src/mongo/db/repl/replication_coordinator_mock.cpp
@@ -533,6 +533,10 @@ Status ReplicationCoordinatorMock::abortCatchupIfNeeded(PrimaryCatchUpConclusion
return Status::OK();
}
+void ReplicationCoordinatorMock::incrementNumCatchUpOpsIfCatchingUp(int numOps) {
+ return;
+}
+
void ReplicationCoordinatorMock::signalDropPendingCollectionsRemovedFromStorage() {}
boost::optional<Timestamp> ReplicationCoordinatorMock::getRecoveryTimestamp() {
diff --git a/src/mongo/db/repl/replication_coordinator_mock.h b/src/mongo/db/repl/replication_coordinator_mock.h
index c7c08e81e9a..50b6d1a5af2 100644
--- a/src/mongo/db/repl/replication_coordinator_mock.h
+++ b/src/mongo/db/repl/replication_coordinator_mock.h
@@ -305,6 +305,8 @@ public:
virtual Status abortCatchupIfNeeded(PrimaryCatchUpConclusionReason reason) override;
+ virtual void incrementNumCatchUpOpsIfCatchingUp(int numOps) override;
+
void signalDropPendingCollectionsRemovedFromStorage() final;
virtual boost::optional<Timestamp> getRecoveryTimestamp() override;
diff --git a/src/mongo/db/repl/replication_metrics.cpp b/src/mongo/db/repl/replication_metrics.cpp
index 6603158c483..b6fb3289f35 100644
--- a/src/mongo/db/repl/replication_metrics.cpp
+++ b/src/mongo/db/repl/replication_metrics.cpp
@@ -273,6 +273,11 @@ void ReplicationMetrics::setTargetCatchupOpTime(OpTime opTime) {
_electionCandidateMetrics.setTargetCatchupOpTime(opTime);
}
+void ReplicationMetrics::setNumCatchUpOps(int numCatchUpOps) {
+ stdx::lock_guard<stdx::mutex> lk(_mutex);
+ _electionCandidateMetrics.setNumCatchUpOps(numCatchUpOps);
+}
+
void ReplicationMetrics::setNewTermStartDate(Date_t newTermStartDate) {
stdx::lock_guard<stdx::mutex> lk(_mutex);
_electionCandidateMetrics.setNewTermStartDate(newTermStartDate);
@@ -299,6 +304,7 @@ BSONObj ReplicationMetrics::getElectionCandidateMetricsBSON() {
void ReplicationMetrics::clearElectionCandidateMetrics() {
stdx::lock_guard<stdx::mutex> lk(_mutex);
_electionCandidateMetrics.setTargetCatchupOpTime(boost::none);
+ _electionCandidateMetrics.setNumCatchUpOps(boost::none);
_electionCandidateMetrics.setNewTermStartDate(boost::none);
_nodeIsCandidateOrPrimary = false;
}
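
clearElectionCandidateMetrics() resets numCatchUpOps to boost::none along with the other candidate metrics, so the field should drop out of replSetGetStatus once the metrics are cleared. A hedged test-style sketch (it is an assumption here that stepping down is one of the events that clears them):

    // Sketch: after the candidate metrics are cleared, the field is absent again.
    const oldPrimary = rst.getPrimary();
    assert.commandWorked(oldPrimary.adminCommand({replSetStepDown: 60, force: true}));
    const res2 = assert.commandWorked(oldPrimary.adminCommand({replSetGetStatus: 1}));
    assert(!res2.electionCandidateMetrics ||
               !res2.electionCandidateMetrics.hasOwnProperty('numCatchUpOps'),
           () => "numCatchUpOps should be cleared: " +
               tojson(res2.electionCandidateMetrics));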
diff --git a/src/mongo/db/repl/replication_metrics.h b/src/mongo/db/repl/replication_metrics.h
index 8781c51f6a3..a169e8a0da5 100644
--- a/src/mongo/db/repl/replication_metrics.h
+++ b/src/mongo/db/repl/replication_metrics.h
@@ -83,6 +83,7 @@ public:
// consistent state.
void setElectionCandidateMetrics(Date_t lastElectionDate);
void setTargetCatchupOpTime(OpTime opTime);
+ void setNumCatchUpOps(int numCatchUpOps);
void setNewTermStartDate(Date_t newTermStartDate);
boost::optional<OpTime> getTargetCatchupOpTime_forTesting();
diff --git a/src/mongo/db/repl/replication_metrics.idl b/src/mongo/db/repl/replication_metrics.idl
index e065a86631e..c17140b9125 100644
--- a/src/mongo/db/repl/replication_metrics.idl
+++ b/src/mongo/db/repl/replication_metrics.idl
@@ -130,6 +130,11 @@ structs:
description: "The node's target opTime for catchup"
type: optime
optional: true
+ numCatchUpOps:
+ description: "Number of ops applied during catchup when the primary successfully
+ catches up"
+ type: long
+ optional: true
newTermStartDate:
description: "Time the new term oplog entry was written"
type: date
diff --git a/src/mongo/db/repl/sync_tail.cpp b/src/mongo/db/repl/sync_tail.cpp
index 5b291d04739..17614f16d25 100644
--- a/src/mongo/db/repl/sync_tail.cpp
+++ b/src/mongo/db/repl/sync_tail.cpp
@@ -1380,6 +1380,10 @@ StatusWith<OpTime> SyncTail::multiApply(OperationContext* opCtx, MultiApplier::O
}
}
+ // Increment the counter for the number of ops applied during catchup if the node is in catchup
+ // mode.
+ replCoord->incrementNumCatchUpOpsIfCatchingUp(ops.size());
+
// We have now written all database writes and updated the oplog to match.
return ops.back().getOpTime();
}
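
Because multiApply() reports whole batches via ops.size(), numCatchUpOps counts oplog entries applied, not client writes. A short shell sketch of the arithmetic behind the assert.eq(..., 4) in catchup.js above (collection name from that test; the database name is assumed):

    // 3 inserts into a collection that does not exist yet generate 4 oplog
    // entries: 1 implicit create + 3 inserts. A new primary replaying them
    // during catchup therefore ends up with numCatchUpOps == 4.
    const coll = rst.getPrimary().getDB('test')['foo'];
    for (let i = 0; i < 3; i++) {
        assert.commandWorked(coll.insert({x: i}));
    }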
diff --git a/src/mongo/embedded/replication_coordinator_embedded.cpp b/src/mongo/embedded/replication_coordinator_embedded.cpp
index f6f836fb410..b11d0106d77 100644
--- a/src/mongo/embedded/replication_coordinator_embedded.cpp
+++ b/src/mongo/embedded/replication_coordinator_embedded.cpp
@@ -359,6 +359,10 @@ Status ReplicationCoordinatorEmbedded::abortCatchupIfNeeded(PrimaryCatchUpConclu
UASSERT_NOT_IMPLEMENTED;
}
+void ReplicationCoordinatorEmbedded::incrementNumCatchUpOpsIfCatchingUp(int numOps) {
+ UASSERT_NOT_IMPLEMENTED;
+}
+
Status ReplicationCoordinatorEmbedded::processReplSetUpdatePosition(const UpdatePositionArgs&,
long long*) {
UASSERT_NOT_IMPLEMENTED;
diff --git a/src/mongo/embedded/replication_coordinator_embedded.h b/src/mongo/embedded/replication_coordinator_embedded.h
index 805983f21f9..1246adf7e93 100644
--- a/src/mongo/embedded/replication_coordinator_embedded.h
+++ b/src/mongo/embedded/replication_coordinator_embedded.h
@@ -251,6 +251,8 @@ public:
Status abortCatchupIfNeeded(PrimaryCatchUpConclusionReason reason) override;
+ void incrementNumCatchUpOpsIfCatchingUp(int numOps) override;
+
void signalDropPendingCollectionsRemovedFromStorage() final;
boost::optional<Timestamp> getRecoveryTimestamp() override;