From dac50f778efc784c316b79d1d936865f8cca59fe Mon Sep 17 00:00:00 2001 From: Shaileja Jain Date: Thu, 13 Jun 2019 11:22:10 -0400 Subject: SERVER-40490 RSM should check replicas at most twice per second --- src/mongo/client/replica_set_monitor.cpp | 29 +++++++++- src/mongo/client/replica_set_monitor_internal.h | 4 +- .../client/replica_set_monitor_test_concurrent.cpp | 62 ++++++++++++++++++++++ 3 files changed, 92 insertions(+), 3 deletions(-) (limited to 'src') diff --git a/src/mongo/client/replica_set_monitor.cpp b/src/mongo/client/replica_set_monitor.cpp index d134f935070..58640f16596 100644 --- a/src/mongo/client/replica_set_monitor.cpp +++ b/src/mongo/client/replica_set_monitor.cpp @@ -511,7 +511,25 @@ void Refresher::scheduleNetworkRequests(WithLock withLock) { if (ns.step != Refresher::NextStep::CONTACT_HOST) break; - scheduleIsMaster(ns.host, withLock); + // cancel any scheduled isMaster calls that haven't yet been called + Node* node = _set->findOrCreateNode(ns.host); + if (node->scheduledIsMasterHandle) { + _set->executor->cancel(node->scheduledIsMasterHandle); + } + + // ensure that the call to isMaster is scheduled at most every 500ms + if (_set->executor && (_set->executor->now() < node->nextPossibleIsMasterCall) && + !_set->isMocked) { + // schedule a new call + node->scheduledIsMasterHandle = uassertStatusOK(_set->executor->scheduleWorkAt( + node->nextPossibleIsMasterCall, + [ *this, host = ns.host ](const CallbackArgs& cbArgs) mutable { + stdx::lock_guard lk(_set->mutex); + scheduleIsMaster(host, lk); + })); + } else { + scheduleIsMaster(ns.host, withLock); + } } DEV _set->checkInvariants(); @@ -544,7 +562,6 @@ void Refresher::scheduleIsMaster(const HostAndPort& host, WithLock withLock) { nullptr, Milliseconds(int64_t(socketTimeoutSecs * 1000))); request.sslMode = _set->setUri.getSSLMode(); - auto status = _set->executor ->scheduleRemoteCommand( @@ -671,6 +688,14 @@ void Refresher::receivedIsMaster(const HostAndPort& from, return; } + // ensure that isMaster calls occur at most 500ms after the previous call ended + if (_set->executor) { + Node* node = _set->findNode(from); + if (node) { + node->nextPossibleIsMasterCall = _set->executor->now() + Milliseconds(500); + } + } + if (reply.setName != _set->name) { if (reply.raw["isreplicaset"].trueValue()) { // The reply came from a node in the state referred to as RSGhost in the SDAM diff --git a/src/mongo/client/replica_set_monitor_internal.h b/src/mongo/client/replica_set_monitor_internal.h index e7a57cc6543..76b67e975e0 100644 --- a/src/mongo/client/replica_set_monitor_internal.h +++ b/src/mongo/client/replica_set_monitor_internal.h @@ -134,7 +134,9 @@ public: Date_t lastWriteDate{}; // from isMasterReply Date_t lastWriteDateUpdateTime{}; // set to the local system's time at the time of updating // lastWriteDate - repl::OpTime opTime{}; // from isMasterReply + Date_t nextPossibleIsMasterCall{}; // time that previous isMaster check ended + executor::TaskExecutor::CallbackHandle scheduledIsMasterHandle; // + repl::OpTime opTime{}; // from isMasterReply }; using Nodes = std::vector; diff --git a/src/mongo/client/replica_set_monitor_test_concurrent.cpp b/src/mongo/client/replica_set_monitor_test_concurrent.cpp index 106dba50beb..d1506ed1bc7 100644 --- a/src/mongo/client/replica_set_monitor_test_concurrent.cpp +++ b/src/mongo/client/replica_set_monitor_test_concurrent.cpp @@ -298,5 +298,67 @@ TEST_F(ReplicaSetMonitorConcurrentTest, StepdownAndElection) { advanceTime(Milliseconds(100)); } } + +// Check that isMaster is being called at most every 500ms. +// +// 1. Create a replica set with two secondaries, Node 0 and Node 1 +// 2. Begin a ReplicaSetMonitor::getHostOrRefresh call with primaryOnly +// 3. Node 0 responds but Node 1 does not +// 4. After 0.5s call ReplicaSetMonitor::getHostOrRefresh again +TEST_F(ReplicaSetMonitorConcurrentTest, IsMasterFrequency) { + MockReplicaSet replSet("test", 2, /* hasPrimary = */ false, /* dollarPrefixHosts = */ false); + const auto node0 = HostAndPort(replSet.getSecondaries()[0]); + const auto node1 = HostAndPort(replSet.getSecondaries()[1]); + + auto state = std::make_shared( + replSet.getURI(), &getNotifier(), &getExecutor()); + auto monitor = std::make_shared(state); + + // Node 1 is unresponsive. + replSet.kill(replSet.getSecondaries()[1]); + monitor->init(); + + std::vector> primaryFutures; + primaryFutures.push_back(monitor->getHostOrRefresh(primaryOnly, Seconds(4))); + + // Because Node 1 is unresponsive, the monitor rechecks Node 0 every 500ms + // until Node 1 times out after 5 seconds. + auto checkUntil = [&](auto timeToStop, auto func) { + while (elapsed() < timeToStop) { + processReadyRequests(replSet); + func(); + // all primaryFutures are not ready, because the monitor cannot find the primary + for (SemiFuture& future : primaryFutures) { + ASSERT(!future.isReady()); + } + advanceTime(Milliseconds(100)); + } + }; + + // Up until 500ms there is only one isMaster call. + // At 500ms, the monitor rechecks node 0. + checkUntil(Milliseconds(500), [&]() { + ASSERT_EQ(getNumChecks(node0), 1); + ASSERT_EQ(getNumChecks(node1), 1); + }); + + // Triggers isMaster calls that will be delayed until 1000ms + checkUntil(Milliseconds(1000), [&]() { + // this should schedule a new isMaster call at 1000ms and cancel the + // previous job scheduled for the same time + primaryFutures.push_back(monitor->getHostOrRefresh(primaryOnly, Seconds(4))); + ASSERT_EQ(getNumChecks(node0), 2); + ASSERT_EQ(getNumChecks(node1), 1); + }); + + // At 1000ms, only one scheduled isMaster call runs, because all others have been canceled + checkUntil(Milliseconds(1500), [&]() { + ASSERT_EQ(getNumChecks(node0), 3); + ASSERT_EQ(getNumChecks(node1), 1); + }); + + // TODO: advanceTime(Seconds(5)) and getNoThrow() each entry in primaryFutures once + // race conditions in RSM expedited scans are fixed +} } // namespace } // namespace mongo -- cgit v1.2.1