From 70842af1543116b91c94072285a33f0384308b71 Mon Sep 17 00:00:00 2001 From: Andrew Shuvalov Date: Wed, 2 Jun 2021 13:31:34 +0000 Subject: SERVER-56361: Better FTDC diagnostics of RSM --- jstests/noPassthrough/ftdc_connection_pool.js | 6 + src/mongo/client/SConscript | 3 +- src/mongo/client/replica_set_monitor.cpp | 43 ++++-- src/mongo/client/replica_set_monitor.h | 20 ++- src/mongo/client/replica_set_monitor_manager.cpp | 39 ++++-- src/mongo/client/replica_set_monitor_manager.h | 6 +- src/mongo/client/replica_set_monitor_stats.cpp | 147 +++++++++++++++++++++ src/mongo/client/replica_set_monitor_stats.h | 129 ++++++++++++++++++ src/mongo/client/replica_set_monitor_test.cpp | 88 ++++++++---- src/mongo/client/replica_set_monitor_transport.cpp | 24 ++-- src/mongo/client/replica_set_monitor_transport.h | 10 +- 11 files changed, 442 insertions(+), 73 deletions(-) create mode 100644 src/mongo/client/replica_set_monitor_stats.cpp create mode 100644 src/mongo/client/replica_set_monitor_stats.h diff --git a/jstests/noPassthrough/ftdc_connection_pool.js b/jstests/noPassthrough/ftdc_connection_pool.js index f1fb7336aa0..351713c5a8c 100644 --- a/jstests/noPassthrough/ftdc_connection_pool.js +++ b/jstests/noPassthrough/ftdc_connection_pool.js @@ -24,6 +24,12 @@ load('jstests/libs/ftdc.js'); assert(stats.hasOwnProperty('totalAvailable')); assert(stats.hasOwnProperty('totalCreated')); assert(stats.hasOwnProperty('totalRefreshing')); + assert("getHostAndRefresh" in stats["replicaSetMonitor"]); + const getHostStats = stats["replicaSetMonitor"]["getHostAndRefresh"]; + assert(getHostStats.hasOwnProperty('currentlyActive')); + assert("hello" in stats["replicaSetMonitor"]); + const helloStats = stats["replicaSetMonitor"]["hello"]; + assert(helloStats.hasOwnProperty('currentlyActive')); // The connPoolStats command reply has "hosts", but FTDC's stats do not. assert(!stats.hasOwnProperty('hosts')); diff --git a/src/mongo/client/SConscript b/src/mongo/client/SConscript index 0f8b225c8ab..5a930faa1f5 100644 --- a/src/mongo/client/SConscript +++ b/src/mongo/client/SConscript @@ -186,7 +186,8 @@ clientDriverEnv.Library( 'global_conn_pool.cpp', 'replica_set_monitor.cpp', 'replica_set_monitor_manager.cpp', - 'replica_set_monitor_transport.cpp', + 'replica_set_monitor_stats.cpp', + 'replica_set_monitor_transport.cpp', ], LIBDEPS=[ '$BUILD_DIR/mongo/db/write_concern_options', diff --git a/src/mongo/client/replica_set_monitor.cpp b/src/mongo/client/replica_set_monitor.cpp index 339ea0316f1..fc5b8bbaed3 100644 --- a/src/mongo/client/replica_set_monitor.cpp +++ b/src/mongo/client/replica_set_monitor.cpp @@ -96,7 +96,7 @@ const int64_t unknownLatency = numeric_limits::max(); // After this period of time, RSM logging becomes more verbose. static constexpr int kRsmVerbosityThresholdTimeoutSec = 20; // When 'is master' reply latency is over 2 sec, it will be logged. -static constexpr int kSlowIsMasterThresholdMicros = 2L * 1000 * 1000; +static constexpr Milliseconds kSlowIsMasterThreshold{500}; static Seconds kHelloTimeout = Seconds{10}; @@ -222,21 +222,30 @@ Milliseconds ReplicaSetMonitor::getHelloTimeout() { ReplicaSetMonitor::ReplicaSetMonitor(StringData name, const std::set& seeds, - ReplicaSetMonitorTransportPtr transport) + ReplicaSetMonitorTransportPtr transport, + std::shared_ptr managerStats) : _state(std::make_shared(name, seeds)), _executor(globalRSMonitorManager.getExecutor()), - _rsmTransport(std::move(transport)) {} + _rsmTransport(std::move(transport)), + _stats(std::make_shared(managerStats)) {} ReplicaSetMonitor::ReplicaSetMonitor(const MongoURI& uri, - std::unique_ptr transport) + std::unique_ptr transport, + std::shared_ptr managerStats) : _state(std::make_shared(uri)), _executor(globalRSMonitorManager.getExecutor()), - _rsmTransport(std::move(transport)) {} + _rsmTransport(std::move(transport)), + _stats(std::make_shared(managerStats)) {} void ReplicaSetMonitor::init() { _scheduleRefresh(_executor->now()); } +// Test only constructor. +ReplicaSetMonitor::ReplicaSetMonitor(const SetStatePtr& initialState, + std::shared_ptr managerStats) + : _state(initialState), _stats(std::make_shared(managerStats)) {} + ReplicaSetMonitor::~ReplicaSetMonitor() { // need this lock because otherwise can get race with _scheduleRefresh() stdx::lock_guard lk(_mutex); @@ -315,7 +324,8 @@ StatusWith ReplicaSetMonitor::getHostOrRefresh(const ReadPreference return {std::move(out)}; } - const auto startTimeMs = Date_t::now(); + const Timer startTimer; + const auto statsCollector = _stats->collectGetHostAndRefreshStats(); while (true) { // We might not have found any matching hosts due to the scan, which just completed may have @@ -332,7 +342,7 @@ StatusWith ReplicaSetMonitor::getHostOrRefresh(const ReadPreference return {ErrorCodes::ShutdownInProgress, str::stream() << "Server is shutting down"}; } - const Milliseconds remaining = maxWait - (Date_t::now() - startTimeMs); + const Milliseconds remaining = maxWait - Milliseconds(startTimer.millis()); if (remaining < kFindHostMaxBackOffTime || areRefreshRetriesDisabledForTest.load()) { break; @@ -355,7 +365,7 @@ HostAndPort ReplicaSetMonitor::getMasterOrUassert() { Refresher ReplicaSetMonitor::startOrContinueRefresh() { stdx::lock_guard lk(_state->mutex); - Refresher out(_state, _executor, _rsmTransport.get()); + Refresher out(_state, _executor, _rsmTransport.get(), _stats.get()); DEV _state->checkInvariants(); return out; } @@ -524,8 +534,13 @@ void ReplicaSetMonitor::markAsRemoved() { Refresher::Refresher(const SetStatePtr& setState, executor::TaskExecutor* executor, - ReplicaSetMonitorTransport* transport) - : _set(setState), _scan(setState->currentScan), _executor(executor), _rsmTransport(transport) { + ReplicaSetMonitorTransport* transport, + ReplicaSetMonitorStats* stats) + : _set(setState), + _scan(setState->currentScan), + _executor(executor), + _rsmTransport(transport), + _stats(stats) { if (_scan) return; // participate in in-progress scan @@ -909,7 +924,7 @@ HostAndPort Refresher::_refreshUntilMatches(const ReadPreferenceSetting* criteri { Timer timer; auto helloFuture = _rsmTransport->sayHello( - ns.host, _set->name, _set->setUri, getHelloTimeout()); + ns.host, _set->name, _set->setUri, getHelloTimeout(), _stats); helloReplyStatus = helloFuture.getNoThrow(); pingMicros = timer.micros(); } @@ -1068,9 +1083,9 @@ void Node::update(const IsMasterReply& reply, bool verbose) { // update latency with smoothed moving average (1/4th the delta) latencyMicros += (reply.latencyMicros - latencyMicros) / 4; } - if (reply.latencyMicros > kSlowIsMasterThresholdMicros) { // > 2 seconds. - log() << "ReplicaSet Monitor received reply with latency above 2 seconds: " - << reply.raw; + if (Milliseconds(reply.latencyMicros / 1000) > kSlowIsMasterThreshold) { + log() << "Slow ReplicaSet Monitor reply for " << reply.setName << " from " << host + << ": " << reply.raw << " (" << (reply.latencyMicros / 1000) << " ms)"; } } diff --git a/src/mongo/client/replica_set_monitor.h b/src/mongo/client/replica_set_monitor.h index 115696c04ca..b7dbb312c82 100644 --- a/src/mongo/client/replica_set_monitor.h +++ b/src/mongo/client/replica_set_monitor.h @@ -32,13 +32,13 @@ #include #include -#include #include #include #include "mongo/base/disallow_copying.h" #include "mongo/base/string_data.h" #include "mongo/client/mongo_uri.h" +#include "mongo/client/replica_set_monitor_stats.h" #include "mongo/client/replica_set_monitor_transport.h" #include "mongo/executor/task_executor.h" #include "mongo/platform/atomic_word.h" @@ -74,9 +74,12 @@ public: */ ReplicaSetMonitor(StringData name, const std::set& seeds, - ReplicaSetMonitorTransportPtr transport); + ReplicaSetMonitorTransportPtr transport, + std::shared_ptr managerStats); - ReplicaSetMonitor(const MongoURI& uri, ReplicaSetMonitorTransportPtr transport); + ReplicaSetMonitor(const MongoURI& uri, + ReplicaSetMonitorTransportPtr transport, + std::shared_ptr managerStats); /** * Schedules the initial refresh task into task executor. @@ -275,8 +278,11 @@ public: /** * Allows tests to set initial conditions and introspect the current state. + * This constructor is used only in tests. */ - explicit ReplicaSetMonitor(const SetStatePtr& initialState) : _state(initialState) {} + explicit ReplicaSetMonitor(const SetStatePtr& initialState, + std::shared_ptr managerStats = + std::make_shared()); ~ReplicaSetMonitor(); /** @@ -313,6 +319,8 @@ private: executor::TaskExecutor* _executor; AtomicBool _isRemovedFromManager{false}; ReplicaSetMonitorTransportPtr _rsmTransport; + + std::shared_ptr _stats; }; @@ -357,7 +365,8 @@ public: */ explicit Refresher(const SetStatePtr& setState, executor::TaskExecutor* executor, - ReplicaSetMonitorTransport* transport); + ReplicaSetMonitorTransport* transport, + ReplicaSetMonitorStats* stats); struct NextStep { enum StepKind { @@ -436,6 +445,7 @@ private: executor::TaskExecutor* _executor; ReplicaSetMonitorTransport* _rsmTransport; + ReplicaSetMonitorStats* const _stats; }; } // namespace mongo diff --git a/src/mongo/client/replica_set_monitor_manager.cpp b/src/mongo/client/replica_set_monitor_manager.cpp index 7345fd95fc8..5e370546b59 100644 --- a/src/mongo/client/replica_set_monitor_manager.cpp +++ b/src/mongo/client/replica_set_monitor_manager.cpp @@ -106,7 +106,7 @@ auto makeThreadPool(const std::string& poolName) { return stdx::make_unique(threadPoolOptions); } -void ReplicaSetMonitorManager::_setupTaskExecutorInLock(const std::string& name) { +void ReplicaSetMonitorManager::_setupTaskExecutorAndStatsInLock(const std::string& name) { auto hookList = stdx::make_unique(); // do not restart taskExecutor if is in shutdown @@ -131,6 +131,10 @@ void ReplicaSetMonitorManager::_setupTaskExecutorInLock(const std::string& name) << redact(name); _taskExecutor->startup(); } + + if (!_stats) { + _stats = std::make_shared(); + } } namespace { @@ -144,7 +148,7 @@ shared_ptr ReplicaSetMonitorManager::getOrCreateMonitor( invariant(connStr.type() == ConnectionString::SET); stdx::lock_guard lk(_mutex); - _setupTaskExecutorInLock(connStr.toString()); + _setupTaskExecutorAndStatsInLock(connStr.toString()); auto setName = connStr.getSetName(); auto monitor = _monitors[setName].lock(); if (monitor) { @@ -159,7 +163,9 @@ shared_ptr ReplicaSetMonitorManager::getOrCreateMonitor( if (!transport) { transport = makeRsmTransport(); } - auto newMonitor = std::make_shared(setName, servers, std::move(transport)); + invariant(_stats); + auto newMonitor = + std::make_shared(setName, servers, std::move(transport), _stats); _monitors[setName] = newMonitor; newMonitor->init(); return newMonitor; @@ -170,7 +176,7 @@ shared_ptr ReplicaSetMonitorManager::getOrCreateMonitor( invariant(uri.type() == ConnectionString::SET); stdx::lock_guard lk(_mutex); - _setupTaskExecutorInLock(uri.toString()); + _setupTaskExecutorAndStatsInLock(uri.toString()); const auto& setName = uri.getSetName(); auto monitor = _monitors[setName].lock(); if (monitor) { @@ -180,7 +186,8 @@ shared_ptr ReplicaSetMonitorManager::getOrCreateMonitor( log() << "Starting new replica set monitor for " << uri.toString(); - auto newMonitor = std::make_shared(uri, std::move(transport)); + invariant(_stats); + auto newMonitor = std::make_shared(uri, std::move(transport), _stats); _monitors[setName] = newMonitor; newMonitor->init(); return newMonitor; @@ -253,15 +260,21 @@ void ReplicaSetMonitorManager::report(BSONObjBuilder* builder, bool forFTDC) { // calling ShardRegistry::updateConfigServerConnectionString. auto setNames = getAllSetNames(); - BSONObjBuilder setStats( - builder->subobjStart(forFTDC ? "replicaSetPingTimesMillis" : "replicaSets")); - - for (const auto& setName : setNames) { - auto monitor = getMonitor(setName); - if (!monitor) { - continue; + { + BSONObjBuilder setStats( + builder->subobjStart(forFTDC ? "replicaSetPingTimesMillis" : "replicaSets")); + + for (const auto& setName : setNames) { + auto monitor = getMonitor(setName); + if (!monitor) { + continue; + } + monitor->appendInfo(setStats, forFTDC); } - monitor->appendInfo(setStats, forFTDC); + } + + if (_stats) { + _stats->report(builder, forFTDC); } } diff --git a/src/mongo/client/replica_set_monitor_manager.h b/src/mongo/client/replica_set_monitor_manager.h index 89bee98cfc8..2d44e8b07b1 100644 --- a/src/mongo/client/replica_set_monitor_manager.h +++ b/src/mongo/client/replica_set_monitor_manager.h @@ -34,6 +34,7 @@ #include #include "mongo/base/disallow_copying.h" +#include "mongo/client/replica_set_monitor_stats.h" #include "mongo/client/replica_set_monitor_transport.h" #include "mongo/executor/task_executor.h" #include "mongo/stdx/mutex.h" @@ -116,10 +117,13 @@ private: // Needs to be after `_taskExecutor`, so that it will be destroyed before the `_taskExecutor`. ReplicaSetMonitorsMap _monitors; - void _setupTaskExecutorInLock(const std::string& name); + void _setupTaskExecutorAndStatsInLock(const std::string& name); // set to true when shutdown has been called. bool _isShutdown{false}; + + // Internally synchronized. + std::shared_ptr _stats; }; } // namespace mongo diff --git a/src/mongo/client/replica_set_monitor_stats.cpp b/src/mongo/client/replica_set_monitor_stats.cpp new file mode 100644 index 00000000000..adea837e0f6 --- /dev/null +++ b/src/mongo/client/replica_set_monitor_stats.cpp @@ -0,0 +1,147 @@ +/** + * Copyright (C) 2021-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * . + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include "mongo/client/replica_set_monitor_stats.h" +#include "mongo/bson/bsonobjbuilder.h" + +namespace mongo { + +constexpr Microseconds ReplicaSetMonitorManagerStats::kMaxLatencyDefaultResetTimeout; + +static constexpr char kRSMPrefix[] = "replicaSetMonitor"; +static constexpr char kGetHostPrefix[] = "getHostAndRefresh"; +static constexpr char kHelloPrefix[] = "hello"; + +ReplicaSetMonitorManagerStats::ReplicaSetMonitorManagerStats(Microseconds resetTimeout) + : _resetTimeout(resetTimeout) {} + +void ReplicaSetMonitorManagerStats::report(BSONObjBuilder* builder, bool forFTDC) { + if (!forFTDC) { + return; + } + BSONObjBuilder rsmStats(builder->subobjStart(kRSMPrefix)); + { + BSONObjBuilder getHostStats(rsmStats.subobjStart(kGetHostPrefix)); + getHostStats.appendNumber("totalCalls", _getHostAndRefreshTotal); + getHostStats.appendNumber("currentlyActive", _getHostAndRefreshCurrent); + getHostStats.appendNumber("totalLatencyMicros", _getHostAndRefreshAggregateLatency); + + stdx::lock_guard lk(_mutex); + getHostStats.appendNumber("maxLatencyMicros", + durationCount(_getHostAndRefreshMaxLatency)); + } + { + BSONObjBuilder helloStats(rsmStats.subobjStart(kHelloPrefix)); + helloStats.appendNumber("totalCalls", _helloTotal); + helloStats.appendNumber("currentlyActive", _helloCurrent); + helloStats.appendNumber("totalLatencyMicros", _helloAggregateLatency); + + stdx::lock_guard lk(_mutex); + helloStats.appendNumber("maxLatencyMicros", durationCount(_helloMaxLatency)); + } +} + +void ReplicaSetMonitorManagerStats::enterGetHostAndRefresh() { + _getHostAndRefreshTotal.increment(1); + _getHostAndRefreshCurrent.increment(1); +} + +static auto kProcessLatency = [](Microseconds& currentMaxLatency, + Microseconds& newLatency, + Timer& resetTimer, + Microseconds resetTimeout) { + bool replace = false; + if (newLatency > currentMaxLatency) { + replace = true; + } + if (Microseconds(resetTimer.micros()) > resetTimeout) { + replace = true; + resetTimer.reset(); + } + if (replace) { + currentMaxLatency = newLatency; + } +}; + +void ReplicaSetMonitorManagerStats::leaveGetHostAndRefresh(Microseconds latency) { + _getHostAndRefreshCurrent.decrement(1); + _getHostAndRefreshAggregateLatency.increment(latency.count()); + + stdx::lock_guard lk(_mutex); + kProcessLatency(_getHostAndRefreshMaxLatency, + latency, + _lastTimeGetHostAndRefreshMaxLatencyUpdated, + _resetTimeout); +} + +void ReplicaSetMonitorManagerStats::enterHello() { + _helloTotal.increment(1); + _helloCurrent.increment(1); +} + +void ReplicaSetMonitorManagerStats::leaveHello(Microseconds latency) { + _helloCurrent.decrement(1); + _helloAggregateLatency.increment(latency.count()); + + stdx::lock_guard lk(_mutex); + kProcessLatency(_helloMaxLatency, latency, _lastTimeHelloMaxLatencyUpdated, _resetTimeout); +} + + +ReplicaSetMonitorStats::ReplicaSetMonitorStats( + std::shared_ptr managerStats) + : _managerStats(managerStats) {} + +void ReplicaSetMonitorStats::_enterGetHostAndRefresh() { + _getHostAndRefreshTotal.increment(1); + _getHostAndRefreshCurrent.increment(1); + _managerStats->enterGetHostAndRefresh(); +} + +void ReplicaSetMonitorStats::_leaveGetHostAndRefresh(const Timer& suppliedTimer) { + const Microseconds latency = Microseconds(suppliedTimer.micros()); + _getHostAndRefreshCurrent.decrement(1); + _getHostAndRefreshAggregateLatency.increment(latency.count()); + _managerStats->leaveGetHostAndRefresh(latency); +} + +void ReplicaSetMonitorStats::_enterHello() { + _helloTotal.increment(1); + _helloCurrent.increment(1); + _managerStats->enterHello(); +} + +void ReplicaSetMonitorStats::_leaveHello(const Timer& suppliedTimer) { + const Microseconds latency = Microseconds(suppliedTimer.micros()); + _helloCurrent.decrement(1); + _helloLatency.increment(latency.count()); + _managerStats->leaveHello(latency); +} + +} // namespace mongo diff --git a/src/mongo/client/replica_set_monitor_stats.h b/src/mongo/client/replica_set_monitor_stats.h new file mode 100644 index 00000000000..6778e25b1a9 --- /dev/null +++ b/src/mongo/client/replica_set_monitor_stats.h @@ -0,0 +1,129 @@ +/** + * Copyright (C) 2021-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * . + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#pragma once + +#include +#include + +#include "mongo/base/counter.h" +#include "mongo/util/duration.h" +#include "mongo/util/scopeguard.h" +#include "mongo/util/timer.h" + +namespace mongo { + +// Common and aggregate stats for all RSMs. +class ReplicaSetMonitorManagerStats { +public: + static constexpr Microseconds kMaxLatencyDefaultResetTimeout{1 * 1000 * 1000}; + + explicit ReplicaSetMonitorManagerStats( + Microseconds resetTimeout = kMaxLatencyDefaultResetTimeout); + + /** + * Reports information about the replica sets tracked by us, for diagnostic purposes. + */ + void report(BSONObjBuilder* builder, bool forFTDC); + + void enterGetHostAndRefresh(); + void leaveGetHostAndRefresh(Microseconds latency); + + void enterHello(); + void leaveHello(Microseconds latency); + +private: + const Microseconds _resetTimeout; + + mutable stdx::mutex _mutex; // NOLINT. + + // Stats for the outer loop of getHostAndRefresh(). + + Counter64 _getHostAndRefreshTotal; + Counter64 _getHostAndRefreshCurrent; + Counter64 _getHostAndRefreshAggregateLatency; + + Timer _lastTimeGetHostAndRefreshMaxLatencyUpdated; + // Keeps track of largest individual RSM latency registered during last time period. + Microseconds _getHostAndRefreshMaxLatency; + + Counter64 _helloTotal; + Counter64 _helloCurrent; + Counter64 _helloAggregateLatency; + + Timer _lastTimeHelloMaxLatencyUpdated; + Microseconds _helloMaxLatency; +}; + +// Stats for an RSM. +class ReplicaSetMonitorStats : public std::enable_shared_from_this { +public: + explicit ReplicaSetMonitorStats(std::shared_ptr managerStats); + + /** + * Returns a scope guard class instance to collect statistics. + * Invokes 'completionCallback' on leaving the scope. + * Callbacks arg: calculated latency. + */ + auto collectGetHostAndRefreshStats() { + _enterGetHostAndRefresh(); + return MakeGuard([ self = shared_from_this(), timer = std::make_shared() ] { + self->_leaveGetHostAndRefresh(*timer); + }); + } + + auto collectHelloStats() { + _enterHello(); + return MakeGuard([ self = shared_from_this(), timer = std::make_shared() ] { + self->_leaveHello(*timer); + }); + } + +private: + void _enterGetHostAndRefresh(); + void _leaveGetHostAndRefresh(const Timer& suppliedTimer); + + void _enterHello(); + void _leaveHello(const Timer& suppliedTimer); + + // Parent stats. + std::shared_ptr _managerStats; + + // Stats for the outer loop of getHostAndRefresh(). + + Counter64 _getHostAndRefreshTotal; + Counter64 _getHostAndRefreshCurrent; + Counter64 _getHostAndRefreshAggregateLatency; + + Counter64 _helloTotal; + Counter64 _helloCurrent; + Counter64 _helloLatency; +}; + +} // namespace mongo diff --git a/src/mongo/client/replica_set_monitor_test.cpp b/src/mongo/client/replica_set_monitor_test.cpp index 7540854d9ce..42728a92f79 100644 --- a/src/mongo/client/replica_set_monitor_test.cpp +++ b/src/mongo/client/replica_set_monitor_test.cpp @@ -59,6 +59,13 @@ std::vector basicSeedsBuilder() { const std::vector basicSeeds = basicSeedsBuilder(); const std::set basicSeedsSet(basicSeeds.begin(), basicSeeds.end()); +struct StatsForTest { + std::shared_ptr managerStats = + std::make_shared(); + std::shared_ptr stats = + std::make_shared(managerStats); +}; + // NOTE: Unless stated otherwise, all tests assume exclusive access to state belongs to the // current (only) thread, so they do not lock SetState::mutex before examining state. This is // NOT something that non-test code should do. @@ -347,7 +354,8 @@ TEST(ReplicaSetMonitor, IsMasterSecondaryWithTags) { TEST(ReplicaSetMonitor, CheckAllSeedsSerial) { SetStatePtr state = std::make_shared("name", basicSeedsSet); - Refresher refresher(state, nullptr, nullptr); + StatsForTest stats; + Refresher refresher(state, nullptr, nullptr, stats.stats.get()); set seen; @@ -395,7 +403,8 @@ TEST(ReplicaSetMonitor, CheckAllSeedsSerial) { TEST(ReplicaSetMonitor, CheckAllSeedsParallel) { SetStatePtr state = std::make_shared("name", basicSeedsSet); - Refresher refresher(state, nullptr, nullptr); + StatsForTest stats; + Refresher refresher(state, nullptr, nullptr, stats.stats.get()); set seen; @@ -453,7 +462,8 @@ TEST(ReplicaSetMonitor, CheckAllSeedsParallel) { TEST(ReplicaSetMonitor, NoMasterInitAllUp) { SetStatePtr state = std::make_shared("name", basicSeedsSet); - Refresher refresher(state, nullptr, nullptr); + StatsForTest stats; + Refresher refresher(state, nullptr, nullptr, stats.stats.get()); set seen; @@ -500,7 +510,8 @@ TEST(ReplicaSetMonitor, NoMasterInitAllUp) { TEST(ReplicaSetMonitor, MasterNotInSeeds_NoPrimaryInIsMaster) { SetStatePtr state = std::make_shared("name", basicSeedsSet); - Refresher refresher(state, nullptr, nullptr); + StatsForTest stats; + Refresher refresher(state, nullptr, nullptr, stats.stats.get()); set seen; @@ -577,7 +588,8 @@ TEST(ReplicaSetMonitor, MasterNotInSeeds_NoPrimaryInIsMaster) { TEST(ReplicaSetMonitor, MasterNotInSeeds_PrimaryInIsMaster) { SetStatePtr state = std::make_shared("name", basicSeedsSet); - Refresher refresher(state, nullptr, nullptr); + StatsForTest stats; + Refresher refresher(state, nullptr, nullptr, stats.stats.get()); set seen; @@ -643,7 +655,8 @@ TEST(ReplicaSetMonitor, SlavesUsableEvenIfNoMaster) { std::set seeds; seeds.insert(HostAndPort("a")); SetStatePtr state = std::make_shared("name", seeds); - Refresher refresher(state, nullptr, nullptr); + StatsForTest stats; + Refresher refresher(state, nullptr, nullptr, stats.stats.get()); const ReadPreferenceSetting secondary(ReadPreference::SecondaryOnly, TagSet()); @@ -685,7 +698,8 @@ TEST(ReplicaSetMonitor, SlavesUsableEvenIfNoMaster) { // Test multiple nodes that claim to be master (we use a last-wins policy) TEST(ReplicaSetMonitor, MultipleMasterLastNodeWins) { SetStatePtr state = std::make_shared("name", basicSeedsSet); - Refresher refresher(state, nullptr, nullptr); + StatsForTest stats; + Refresher refresher(state, nullptr, nullptr, stats.stats.get()); set seen; @@ -748,7 +762,8 @@ TEST(ReplicaSetMonitor, MultipleMasterLastNodeWins) { // Test nodes disagree about who is in the set, master is source of truth TEST(ReplicaSetMonitor, MasterIsSourceOfTruth) { SetStatePtr state = std::make_shared("name", basicSeedsSet); - Refresher refresher(state, nullptr, nullptr); + StatsForTest stats; + Refresher refresher(state, nullptr, nullptr, stats.stats.get()); BSONArray primaryHosts = BSON_ARRAY("a" << "b" @@ -789,7 +804,8 @@ TEST(ReplicaSetMonitor, MasterIsSourceOfTruth) { // Test multiple master nodes that disagree about set membership TEST(ReplicaSetMonitor, MultipleMastersDisagree) { SetStatePtr state = std::make_shared("name", basicSeedsSet); - Refresher refresher(state, nullptr, nullptr); + StatsForTest stats; + Refresher refresher(state, nullptr, nullptr, stats.stats.get()); BSONArray hostsForSeed[3]; hostsForSeed[0] = BSON_ARRAY("a" @@ -887,7 +903,8 @@ TEST(ReplicaSetMonitor, MultipleMastersDisagree) { // Ensure getMatchingHost returns hosts even if scan is ongoing TEST(ReplicaSetMonitor, GetMatchingDuringScan) { SetStatePtr state = std::make_shared("name", basicSeedsSet); - Refresher refresher(state, nullptr, nullptr); + StatsForTest stats; + Refresher refresher(state, nullptr, nullptr, stats.stats.get()); const ReadPreferenceSetting primaryOnly(ReadPreference::PrimaryOnly, TagSet()); const ReadPreferenceSetting secondaryOnly(ReadPreference::SecondaryOnly, TagSet()); @@ -990,7 +1007,8 @@ TEST(ReplicaSetMonitor, OutOfBandFailedHost) { // Newly elected primary with electionId >= maximum electionId seen by the Refresher TEST(ReplicaSetMonitorTests, NewPrimaryWithMaxElectionId) { SetStatePtr state = std::make_shared("name", basicSeedsSet); - Refresher refresher(state, nullptr, nullptr); + StatsForTest stats; + Refresher refresher(state, nullptr, nullptr, stats.stats.get()); set seen; @@ -1056,7 +1074,8 @@ TEST(ReplicaSetMonitorTests, NewPrimaryWithMaxElectionId) { // Ignore electionId of secondaries TEST(ReplicaSetMonitorTests, IgnoreElectionIdFromSecondaries) { SetStatePtr state = std::make_shared("name", basicSeedsSet); - Refresher refresher(state, nullptr, nullptr); + StatsForTest stats; + Refresher refresher(state, nullptr, nullptr, stats.stats.get()); set seen; @@ -1103,7 +1122,8 @@ TEST(ReplicaSetMonitorTests, IgnoreElectionIdFromSecondaries) { // Stale Primary with obsolete electionId TEST(ReplicaSetMonitorTests, StalePrimaryWithObsoleteElectionId) { SetStatePtr state = std::make_shared("name", basicSeedsSet); - Refresher refresher(state, nullptr, nullptr); + StatsForTest stats; + Refresher refresher(state, nullptr, nullptr, stats.stats.get()); const OID firstElectionId = OID::gen(); const OID secondElectionId = OID::gen(); @@ -1233,7 +1253,8 @@ TEST(ReplicaSetMonitor, PrimaryIsUpCheck) { */ TEST(ReplicaSetMonitorTests, TwoPrimaries2ndHasNewerConfigVersion) { SetStatePtr state = std::make_shared("name", basicSeedsSet); - Refresher refresher(state, nullptr, nullptr); + StatsForTest stats; + Refresher refresher(state, nullptr, nullptr, stats.stats.get()); auto ns = refresher.getNextStep(); ASSERT_EQUALS(ns.step, NextStep::CONTACT_HOST); @@ -1295,7 +1316,8 @@ TEST(ReplicaSetMonitorTests, TwoPrimaries2ndHasNewerConfigVersion) { */ TEST(ReplicaSetMonitorTests, TwoPrimaries2ndHasOlderConfigVersion) { SetStatePtr state = std::make_shared("name", basicSeedsSet); - Refresher refresher(state, nullptr, nullptr); + StatsForTest stats; + Refresher refresher(state, nullptr, nullptr, stats.stats.get()); auto ns = refresher.getNextStep(); ASSERT_EQUALS(ns.step, NextStep::CONTACT_HOST); @@ -1355,7 +1377,8 @@ TEST(ReplicaSetMonitorTests, TwoPrimaries2ndHasOlderConfigVersion) { */ TEST(ReplicaSetMonitor, MaxStalenessMSMatch) { SetStatePtr state = std::make_shared("name", basicSeedsSet); - Refresher refresher(state, nullptr, nullptr); + StatsForTest stats; + Refresher refresher(state, nullptr, nullptr, stats.stats.get()); repl::OpTime opTime{Timestamp{10, 10}, 10}; const ReadPreferenceSetting secondary(ReadPreference::SecondaryOnly, TagSet(), Seconds(100)); @@ -1408,7 +1431,8 @@ TEST(ReplicaSetMonitor, MaxStalenessMSMatch) { */ TEST(ReplicaSetMonitor, MaxStalenessMSNoMatch) { SetStatePtr state = std::make_shared("name", basicSeedsSet); - Refresher refresher(state, nullptr, nullptr); + StatsForTest stats; + Refresher refresher(state, nullptr, nullptr, stats.stats.get()); repl::OpTime opTime{Timestamp{10, 10}, 10}; const ReadPreferenceSetting secondary(ReadPreference::SecondaryOnly, TagSet(), Seconds(200)); @@ -1460,7 +1484,8 @@ TEST(ReplicaSetMonitor, MaxStalenessMSNoMatch) { */ TEST(ReplicaSetMonitor, MaxStalenessMSNoPrimaryMatch) { SetStatePtr state = std::make_shared("name", basicSeedsSet); - Refresher refresher(state, nullptr, nullptr); + StatsForTest stats; + Refresher refresher(state, nullptr, nullptr, stats.stats.get()); repl::OpTime opTime{Timestamp{10, 10}, 10}; const ReadPreferenceSetting secondary(ReadPreference::SecondaryOnly, TagSet(), Seconds(200)); @@ -1515,7 +1540,8 @@ TEST(ReplicaSetMonitor, MaxStalenessMSNoPrimaryMatch) { */ TEST(ReplicaSetMonitor, MaxStalenessMSAllFailed) { SetStatePtr state = std::make_shared("name", basicSeedsSet); - Refresher refresher(state, nullptr, nullptr); + StatsForTest stats; + Refresher refresher(state, nullptr, nullptr, stats.stats.get()); repl::OpTime opTime{Timestamp{10, 10}, 10}; const ReadPreferenceSetting secondary(ReadPreference::SecondaryOnly, TagSet(), Seconds(200)); @@ -1569,7 +1595,8 @@ TEST(ReplicaSetMonitor, MaxStalenessMSAllFailed) { */ TEST(ReplicaSetMonitor, MaxStalenessMSAllButPrimaryFailed) { SetStatePtr state = std::make_shared("name", basicSeedsSet); - Refresher refresher(state, nullptr, nullptr); + StatsForTest stats; + Refresher refresher(state, nullptr, nullptr, stats.stats.get()); repl::OpTime opTime{Timestamp{10, 10}, 10}; const ReadPreferenceSetting secondary(ReadPreference::SecondaryOnly, TagSet(), Seconds(200)); @@ -1622,7 +1649,8 @@ TEST(ReplicaSetMonitor, MaxStalenessMSAllButPrimaryFailed) { */ TEST(ReplicaSetMonitor, MaxStalenessMSOneSecondaryFailed) { SetStatePtr state = std::make_shared("name", basicSeedsSet); - Refresher refresher(state, nullptr, nullptr); + StatsForTest stats; + Refresher refresher(state, nullptr, nullptr, stats.stats.get()); repl::OpTime opTime{Timestamp{10, 10}, 10}; const ReadPreferenceSetting secondary(ReadPreference::SecondaryOnly, TagSet(), Seconds(200)); @@ -1674,7 +1702,8 @@ TEST(ReplicaSetMonitor, MaxStalenessMSOneSecondaryFailed) { */ TEST(ReplicaSetMonitor, MaxStalenessMSNonStaleSecondaryMatched) { SetStatePtr state = std::make_shared("name", basicSeedsSet); - Refresher refresher(state, nullptr, nullptr); + StatsForTest stats; + Refresher refresher(state, nullptr, nullptr, stats.stats.get()); repl::OpTime opTime{Timestamp{10, 10}, 10}; const ReadPreferenceSetting secondary(ReadPreference::SecondaryOnly, TagSet(), Seconds(200)); @@ -1727,7 +1756,8 @@ TEST(ReplicaSetMonitor, MaxStalenessMSNonStaleSecondaryMatched) { */ TEST(ReplicaSetMonitor, MaxStalenessMSNoLastWrite) { SetStatePtr state = std::make_shared("name", basicSeedsSet); - Refresher refresher(state, nullptr, nullptr); + StatsForTest stats; + Refresher refresher(state, nullptr, nullptr, stats.stats.get()); const ReadPreferenceSetting secondary(ReadPreference::SecondaryOnly, TagSet(), Seconds(200)); BSONArray hosts = BSON_ARRAY("a" @@ -1769,7 +1799,8 @@ TEST(ReplicaSetMonitor, MaxStalenessMSNoLastWrite) { */ TEST(ReplicaSetMonitor, MaxStalenessMSZeroNoLastWrite) { SetStatePtr state = std::make_shared("name", basicSeedsSet); - Refresher refresher(state, nullptr, nullptr); + StatsForTest stats; + Refresher refresher(state, nullptr, nullptr, stats.stats.get()); const ReadPreferenceSetting secondary(ReadPreference::SecondaryOnly, TagSet(), Seconds(0)); BSONArray hosts = BSON_ARRAY("a" @@ -1811,7 +1842,8 @@ TEST(ReplicaSetMonitor, MaxStalenessMSZeroNoLastWrite) { */ TEST(ReplicaSetMonitor, MinOpTimeMatched) { SetStatePtr state = std::make_shared("name", basicSeedsSet); - Refresher refresher(state, nullptr, nullptr); + StatsForTest stats; + Refresher refresher(state, nullptr, nullptr, stats.stats.get()); repl::OpTime minOpTimeSetting{Timestamp{10, 10}, 10}; repl::OpTime opTimeNonStale{Timestamp{10, 10}, 11}; @@ -1856,7 +1888,8 @@ TEST(ReplicaSetMonitor, MinOpTimeMatched) { */ TEST(ReplicaSetMonitor, MinOpTimeNotMatched) { SetStatePtr state = std::make_shared("name", basicSeedsSet); - Refresher refresher(state, nullptr, nullptr); + StatsForTest stats; + Refresher refresher(state, nullptr, nullptr, stats.stats.get()); repl::OpTime minOpTimeSetting{Timestamp{10, 10}, 10}; repl::OpTime opTimeNonStale{Timestamp{10, 10}, 11}; @@ -1901,7 +1934,8 @@ TEST(ReplicaSetMonitor, MinOpTimeNotMatched) { */ TEST(ReplicaSetMonitor, MinOpTimeIgnored) { SetStatePtr state = std::make_shared("name", basicSeedsSet); - Refresher refresher(state, nullptr, nullptr); + StatsForTest stats; + Refresher refresher(state, nullptr, nullptr, stats.stats.get()); repl::OpTime minOpTimeSetting{Timestamp{10, 10}, 10}; repl::OpTime opTimeStale{Timestamp{10, 10}, 9}; diff --git a/src/mongo/client/replica_set_monitor_transport.cpp b/src/mongo/client/replica_set_monitor_transport.cpp index 58cc7fa5671..7b3a6e74ef1 100644 --- a/src/mongo/client/replica_set_monitor_transport.cpp +++ b/src/mongo/client/replica_set_monitor_transport.cpp @@ -44,10 +44,12 @@ const int kLogLevel = 2; ReplicaSetMonitorTransport::~ReplicaSetMonitorTransport() {} -Future ReplicaSetMonitorDbClientTransport::sayHello(HostAndPort host, - const std::string& setName, - const MongoURI& setUri, - Milliseconds timeout) noexcept { +Future ReplicaSetMonitorDbClientTransport::sayHello( + HostAndPort host, + const std::string& setName, + const MongoURI& setUri, + Milliseconds timeout, + ReplicaSetMonitorStats* stats) noexcept { MongoURI targetURI; const auto& hostStr = host.toString(); Timer timer; @@ -62,6 +64,7 @@ Future ReplicaSetMonitorDbClientTransport::sayHello(HostAndPort host, LOG(kLogLevel) << "ReplicaSetMonitor " << setName << " sending hello request to " << hostStr; + const auto statsCollector = stats->collectHelloStats(); ScopedDbConnection conn(targetURI, durationCount(timeout)); bool ignoredOutParam = false; BSONObj reply; @@ -84,10 +87,12 @@ ReplicaSetMonitorExecutorTransport::ReplicaSetMonitorExecutorTransport( executor::TaskExecutor* executor) : _executor(executor) {} -Future ReplicaSetMonitorExecutorTransport::sayHello(HostAndPort host, - const std::string& setName, - const MongoURI& setUri, - Milliseconds timeout) noexcept { +Future ReplicaSetMonitorExecutorTransport::sayHello( + HostAndPort host, + const std::string& setName, + const MongoURI& setUri, + Milliseconds timeout, + ReplicaSetMonitorStats* stats) noexcept { try { auto pf = makePromiseFuture(); BSONObjBuilder bob; @@ -105,7 +110,8 @@ Future ReplicaSetMonitorExecutorTransport::sayHello(HostAndPort host, auto swCbHandle = _executor->scheduleRemoteCommand(std::move(request), [ this, setName, - requestState = std::make_shared(host, std::move(pf.promise)) + requestState = std::make_shared(host, std::move(pf.promise)), + statsCollector = stats->collectHelloStats() ](const executor::TaskExecutor::RemoteCommandCallbackArgs& result) mutable { LOG(kLogLevel) << "Replica set monitor " << setName << " received reply from " << requestState->host.toString() << ": " diff --git a/src/mongo/client/replica_set_monitor_transport.h b/src/mongo/client/replica_set_monitor_transport.h index cf6d3607dc2..660d5d4041a 100644 --- a/src/mongo/client/replica_set_monitor_transport.h +++ b/src/mongo/client/replica_set_monitor_transport.h @@ -31,6 +31,7 @@ #include "mongo/platform/basic.h" #include "mongo/client/mongo_uri.h" +#include "mongo/client/replica_set_monitor_stats.h" #include "mongo/executor/task_executor.h" #include "mongo/s/is_mongos.h" #include "mongo/transport/transport_layer.h" @@ -54,7 +55,8 @@ public: virtual Future sayHello(HostAndPort host, const std::string& setName, const MongoURI& setUri, - Milliseconds timeout) noexcept = 0; + Milliseconds timeout, + ReplicaSetMonitorStats* stats) noexcept = 0; virtual ~ReplicaSetMonitorTransport(); }; using ReplicaSetMonitorTransportPtr = std::unique_ptr; @@ -68,7 +70,8 @@ public: Future sayHello(HostAndPort host, const std::string& setName, const MongoURI& setUri, - Milliseconds timeout) noexcept override; + Milliseconds timeout, + ReplicaSetMonitorStats* stats) noexcept override; }; /** @@ -83,7 +86,8 @@ public: Future sayHello(HostAndPort host, const std::string& setName, const MongoURI& setUri, - Milliseconds timeout) noexcept override; + Milliseconds timeout, + ReplicaSetMonitorStats* stats) noexcept override; private: void _haltIfIncompatibleServer(Status status); -- cgit v1.2.1