summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Maria van Keulen <maria@mongodb.com> 2019-06-04 15:02:35 -0400
committer: Maria van Keulen <maria@mongodb.com> 2019-06-06 13:19:17 -0400
commit: 70de3cf1c8394284cbb89d42df7d1f8672b8b412 (patch)
tree: eccdd6c39cc6f05af919d38e383a6da3ec9bfe14
parent: b9677eb4b98fba42c92e97a70eb58b812db1ce7a (diff)
download: mongo-70de3cf1c8394284cbb89d42df7d1f8672b8b412.tar.gz
SERVER-41287 Measure count and duration of periods when isLagged is true
(cherry picked from commit c27cafcd34190924cd62267fa8b42baf3caa69cb)
-rw-r--r-- src/mongo/db/storage/flow_control.cpp | 38
-rw-r--r-- src/mongo/db/storage/flow_control.h | 7
2 files changed, 28 insertions, 17 deletions
diff --git a/src/mongo/db/storage/flow_control.cpp b/src/mongo/db/storage/flow_control.cpp
index af180435812..e4ba6ad4b03 100644
--- a/src/mongo/db/storage/flow_control.cpp
+++ b/src/mongo/db/storage/flow_control.cpp
@@ -190,18 +190,6 @@ double FlowControl::_getLocksPerOp() {
BSONObj FlowControl::generateSection(OperationContext* opCtx,
const BSONElement& configElement) const {
- // Flow Control does not have use for lag measured on nodes that cannot accept writes.
- const bool canAcceptWrites = _replCoord->canAcceptNonLocalWrites();
-
- // Flow Control is only enabled if FCV is 4.2.
- const bool isFCV42 =
- (serverGlobalParams.featureCompatibility.isVersionInitialized() &&
- serverGlobalParams.featureCompatibility.getVersion() ==
- ServerGlobalParams::FeatureCompatibility::Version::kFullyUpgradedTo42);
-
- const Date_t myLastAppliedWall = _replCoord->getMyLastAppliedOpTimeAndWallTime().wallTime;
- const Date_t lastCommittedWall = _replCoord->getLastCommittedOpTimeAndWallTime().wallTime;
-
BSONObjBuilder bob;
// Most of these values are only computed and meaningful when flow control is enabled.
bob.append("enabled", gFlowControlEnabled.load());
@@ -210,9 +198,9 @@ BSONObj FlowControl::generateSection(OperationContext* opCtx,
FlowControlTicketholder::get(opCtx)->totalTimeAcquiringMicros());
bob.append("locksPerOp", _lastLocksPerOp.load());
bob.append("sustainerRate", _lastSustainerAppliedCount.load());
- bob.append("isLagged",
- isFCV42 && canAcceptWrites &&
- getLagMillis(myLastAppliedWall, lastCommittedWall) >= getThresholdLagMillis());
+ bob.append("isLagged", _isLagged.load());
+ bob.append("isLaggedCount", _isLaggedCount.load());
+ bob.append("isLaggedTimeMicros", _isLaggedTimeMicros.load());
return bob.obj();
}
@@ -353,6 +341,11 @@ int FlowControl::getNumTickets() {
gFlowControlTicketMultiplierConstant.load(),
_kMaxTickets);
_lastTimeSustainerAdvanced = Date_t::now();
+ if (_isLagged.load()) {
+ _isLagged.store(false);
+ auto waitTime = curTimeMicros64() - _startWaitTime;
+ _isLaggedTimeMicros.fetchAndAddRelaxed(waitTime);
+ }
} else if (sustainerAdvanced(_prevMemberData, _currMemberData)) {
// Expected case where flow control has meaningful data from the last period to make a new
// calculation.
@@ -363,16 +356,25 @@ int FlowControl::getNumTickets() {
locksPerOp,
getLagMillis(myLastApplied.wallTime, lastCommitted.wallTime),
thresholdLagMillis);
+ if (!_isLagged.load()) {
+ _isLagged.store(true);
+ _isLaggedCount.fetchAndAddRelaxed(1);
+ _startWaitTime = curTimeMicros64();
+ }
} else {
// Unexpected case where consecutive readings from the topology state don't meet some basic
// expectations.
ret = _lastTargetTicketsPermitted.load();
_lastTimeSustainerAdvanced = Date_t::now();
+ // Since this case does not give conclusive evidence that isLagged could have meaningfully
+ // transitioned from true to false, it does not make sense to update the _isLagged*
+ // variables here.
}
ret = std::max(ret, gFlowControlMinTicketsPerSecond.load());
- LOG(DEBUG_LOG_LEVEL) << "Are lagged? " << !isHealthy << " Curr lag millis: "
+ LOG(DEBUG_LOG_LEVEL) << "Are lagged? " << (_isLagged.load() ? "true" : "false")
+ << " Curr lag millis: "
<< getLagMillis(myLastApplied.wallTime, lastCommitted.wallTime)
<< " OpsLagged: "
<< _approximateOpsBetween(lastCommitted.opTime.getTimestamp(),
@@ -381,7 +383,9 @@ int FlowControl::getNumTickets() {
<< " Last granted: " << _lastTargetTicketsPermitted.load()
<< " Last sustainer applied: " << _lastSustainerAppliedCount.load()
<< " Acquisitions since last check: " << locksUsedLastPeriod
- << " Locks per op: " << _lastLocksPerOp.load();
+ << " Locks per op: " << _lastLocksPerOp.load()
+ << " Count of lagged periods: " << _isLaggedCount.load()
+ << " Total duration of lagged periods: " << _isLaggedTimeMicros.load();
_lastTargetTicketsPermitted.store(ret);
diff --git a/src/mongo/db/storage/flow_control.h b/src/mongo/db/storage/flow_control.h
index 8e6014d29b8..e3905a3728b 100644
--- a/src/mongo/db/storage/flow_control.h
+++ b/src/mongo/db/storage/flow_control.h
@@ -120,6 +120,10 @@ private:
AtomicWord<int> _lastTargetTicketsPermitted{_kMaxTickets};
AtomicWord<double> _lastLocksPerOp{0.0};
AtomicWord<int> _lastSustainerAppliedCount{0};
+ AtomicWord<bool> _isLagged{false};
+ AtomicWord<int> _isLaggedCount{0};
+ // Use an int64_t as this is serialized to bson which does not support unsigned 64-bit numbers.
+ AtomicWord<std::int64_t> _isLaggedTimeMicros{0};
mutable stdx::mutex _sampledOpsMutex;
std::deque<Sample> _sampledOpsApplied;
@@ -134,6 +138,9 @@ private:
std::vector<repl::MemberData> _prevMemberData;
Date_t _lastTimeSustainerAdvanced;
+
+ // This value is used for calculating server status metrics.
+ std::uint64_t _startWaitTime = 0;
};
} // namespace mongo