diff options
-rw-r--r-- | buildscripts/resmokeconfig/suites/sharding_last_stable_mongos.yml | 3 |
-rw-r--r-- | jstests/sharding/cursor_timeout.js | 27 |
-rw-r--r-- | src/mongo/db/clientcursor.cpp | 4 |
-rw-r--r-- | src/mongo/s/cluster_cursor_stats.cpp | 25 |
-rw-r--r-- | src/mongo/s/query/cluster_cursor_cleanup_job.cpp | 7 |
-rw-r--r-- | src/mongo/s/query/cluster_cursor_manager.cpp | 27 |
-rw-r--r-- | src/mongo/s/query/cluster_cursor_manager.h | 23 |
7 files changed, 90 insertions, 26 deletions
diff --git a/buildscripts/resmokeconfig/suites/sharding_last_stable_mongos.yml b/buildscripts/resmokeconfig/suites/sharding_last_stable_mongos.yml index 562cefc1d1e..166bc7a35bc 100644 --- a/buildscripts/resmokeconfig/suites/sharding_last_stable_mongos.yml +++ b/buildscripts/resmokeconfig/suites/sharding_last_stable_mongos.yml @@ -36,6 +36,9 @@ selector: # TODO Assumes shardCollection can handle the collation option; enable when 3.4 becomes # 'last-stable'. - jstests/sharding/shard_collection_basic.js + # TODO Requires mongos to have the 'clientCursorMonitorFrequencySecs' server parameter; enable + # when 3.4 becomes 'last-stable'. + - jstests/sharding/cursor_timeout.js executor: js_test: diff --git a/jstests/sharding/cursor_timeout.js b/jstests/sharding/cursor_timeout.js index cea17d93dcc..90097261eb9 100644 --- a/jstests/sharding/cursor_timeout.js +++ b/jstests/sharding/cursor_timeout.js @@ -15,8 +15,14 @@ shards: 2, other: { chunkSize: 1, - shardOptions: {setParameter: "cursorTimeoutMillis=1000"}, - mongosOptions: {setParameter: "cursorTimeoutMillis=1000"} + shardOptions: { + verbose: 1, + setParameter: {cursorTimeoutMillis: 1000, clientCursorMonitorFrequencySecs: 1} + }, + mongosOptions: { + verbose: 1, + setParameter: {cursorTimeoutMillis: 1000, clientCursorMonitorFrequencySecs: 1} + } } }); @@ -66,9 +72,20 @@ cursorWithNoTimeout.next(); // Wait until the idle cursor background job has killed the cursors that do not have the "no - // timeout" flag set. We use the "cursorTimeoutMillis" setParameter above to reduce the amount - // of time we need to wait here. - sleep(5000); + // timeout" flag set. We use the "cursorTimeoutMillis" and "clientCursorMonitorFrequencySecs" + // setParameters above to reduce the amount of time we need to wait here. 
+ assert.soon(function() { + return coll.getDB().serverStatus().metrics.cursor.timedOut > 0; + }, "sharded cursor failed to time out", 5000); + + // Wait for the shard to have two open cursors on it (shardedCursorWithNoTimeout and + // cursorWithNoTimeout). + // We cannot reliably use metrics.cursor.timedOut here, because this will be 2 if + // shardedCursorWithTimeout is killed for timing out on the shard, and 1 if + // shardedCursorWithTimeout is killed by a killCursors command from the mongos. + assert.soon(function() { + return shardColl.getDB().serverStatus().metrics.cursor.open.total == 2; + }, "cursor failed to time out", 5000); assert.throws(function() { shardedCursorWithTimeout.itcount(); diff --git a/src/mongo/db/clientcursor.cpp b/src/mongo/db/clientcursor.cpp index a4f80e44319..4cc2dd643a4 100644 --- a/src/mongo/db/clientcursor.cpp +++ b/src/mongo/db/clientcursor.cpp @@ -71,6 +71,7 @@ static ServerStatusMetricField<Counter64> dCursorStatusTimedout("cursor.timedOut &cursorStatsTimedOut); MONGO_EXPORT_SERVER_PARAMETER(cursorTimeoutMillis, int, 10 * 60 * 1000 /* 10 minutes */); +MONGO_EXPORT_SERVER_PARAMETER(clientCursorMonitorFrequencySecs, int, 4); long long ClientCursor::totalOpen() { return cursorStatsOpen.get(); @@ -263,7 +264,6 @@ public: void run() { Client::initThread("clientcursormon"); Timer t; - const int Secs = 4; while (!inShutdown()) { { const ServiceContext::UniqueOperationContext txnPtr = cc().makeOperationContext(); @@ -271,7 +271,7 @@ public: cursorStatsTimedOut.increment( CursorManager::timeoutCursorsGlobal(&txn, t.millisReset())); } - sleepsecs(Secs); + sleepsecs(clientCursorMonitorFrequencySecs); } } }; diff --git a/src/mongo/s/cluster_cursor_stats.cpp b/src/mongo/s/cluster_cursor_stats.cpp index 03ba9fc26d0..db72b245795 100644 --- a/src/mongo/s/cluster_cursor_stats.cpp +++ b/src/mongo/s/cluster_cursor_stats.cpp @@ -41,18 +41,23 @@ namespace { class ClusterCursorStats final : public ServerStatusMetric { public: - ClusterCursorStats() : 
ServerStatusMetric("cursor.open") {} + ClusterCursorStats() : ServerStatusMetric("cursor") {} void appendAtLeaf(BSONObjBuilder& b) const final { - BSONObjBuilder openBob(b.subobjStart(_leafName)); - auto stats = grid.getCursorManager()->stats(); - - openBob.append("multiTarget", static_cast<long long>(stats.cursorsSharded)); - openBob.append("singleTarget", static_cast<long long>(stats.cursorsNotSharded)); - openBob.append("pinned", static_cast<long long>(stats.cursorsPinned)); - openBob.append("total", - static_cast<long long>(stats.cursorsSharded + stats.cursorsNotSharded)); - openBob.done(); + BSONObjBuilder cursorBob(b.subobjStart(_leafName)); + cursorBob.append("timedOut", + static_cast<long long>(grid.getCursorManager()->cursorsTimedOut())); + { + BSONObjBuilder openBob(cursorBob.subobjStart("open")); + auto stats = grid.getCursorManager()->stats(); + openBob.append("multiTarget", static_cast<long long>(stats.cursorsSharded)); + openBob.append("singleTarget", static_cast<long long>(stats.cursorsNotSharded)); + openBob.append("pinned", static_cast<long long>(stats.cursorsPinned)); + openBob.append("total", + static_cast<long long>(stats.cursorsSharded + stats.cursorsNotSharded)); + openBob.doneFast(); + } + cursorBob.done(); } } clusterCursorStats; diff --git a/src/mongo/s/query/cluster_cursor_cleanup_job.cpp b/src/mongo/s/query/cluster_cursor_cleanup_job.cpp index a7e4159c9b5..068d06ac98a 100644 --- a/src/mongo/s/query/cluster_cursor_cleanup_job.cpp +++ b/src/mongo/s/query/cluster_cursor_cleanup_job.cpp @@ -51,6 +51,9 @@ ExportedServerParameter<long long, ServerParameterType::kStartupAndRuntime> "cursorTimeoutMillis", &cursorTimeoutMillis); +// Frequency with which ClusterCursorCleanupJob is run. 
+MONGO_EXPORT_SERVER_PARAMETER(clientCursorMonitorFrequencySecs, long long, 4); + } // namespace ClusterCursorCleanupJob clusterCursorCleanupJob; @@ -67,8 +70,8 @@ void ClusterCursorCleanupJob::run() { while (!inShutdown()) { manager->killMortalCursorsInactiveSince(Date_t::now() - Milliseconds(cursorTimeoutMillis.load())); - manager->reapZombieCursors(); - sleepFor(Seconds(4)); + manager->incrementCursorsTimedOut(manager->reapZombieCursors()); + sleepsecs(clientCursorMonitorFrequencySecs); } } diff --git a/src/mongo/s/query/cluster_cursor_manager.cpp b/src/mongo/s/query/cluster_cursor_manager.cpp index 3c84fa7fa54..924a472d7a5 100644 --- a/src/mongo/s/query/cluster_cursor_manager.cpp +++ b/src/mongo/s/query/cluster_cursor_manager.cpp @@ -332,6 +332,7 @@ void ClusterCursorManager::killMortalCursorsInactiveSince(Date_t cutoff) { CursorEntry& entry = cursorIdEntryPair.second; if (entry.getLifetimeType() == CursorLifetime::Mortal && entry.getLastActive() <= cutoff) { + entry.setInactive(); log() << "Marking cursor id " << cursorIdEntryPair.first << " for deletion, idle since " << entry.getLastActive().toString(); entry.setKillPending(); @@ -350,13 +351,22 @@ void ClusterCursorManager::killAllCursors() { } } -void ClusterCursorManager::reapZombieCursors() { +std::size_t ClusterCursorManager::reapZombieCursors() { + struct CursorDescriptor { + CursorDescriptor(NamespaceString ns, CursorId cursorId, bool isInactive) + : ns(std::move(ns)), cursorId(cursorId), isInactive(isInactive) {} + + NamespaceString ns; + CursorId cursorId; + bool isInactive; + }; + // List all zombie cursors under the manager lock, and kill them one-by-one while not holding // the lock (ClusterClientCursor::kill() is blocking, so we don't want to hold a lock while // issuing the kill). 
stdx::unique_lock<stdx::mutex> lk(_mutex); - std::vector<std::pair<NamespaceString, CursorId>> zombieCursorDescriptors; + std::vector<CursorDescriptor> zombieCursorDescriptors; for (auto& nsContainerPair : _namespaceToContainerMap) { const NamespaceString& nss = nsContainerPair.first; for (auto& cursorIdEntryPair : nsContainerPair.second.entryMap) { @@ -365,13 +375,15 @@ void ClusterCursorManager::reapZombieCursors() { if (!entry.getKillPending()) { continue; } - zombieCursorDescriptors.emplace_back(nss, cursorId); + zombieCursorDescriptors.emplace_back(nss, cursorId, entry.isInactive()); } } - for (auto& namespaceCursorIdPair : zombieCursorDescriptors) { + std::size_t cursorsTimedOut = 0; + + for (auto& cursorDescriptor : zombieCursorDescriptors) { StatusWith<std::unique_ptr<ClusterClientCursor>> zombieCursor = - detachCursor_inlock(namespaceCursorIdPair.first, namespaceCursorIdPair.second); + detachCursor_inlock(cursorDescriptor.ns, cursorDescriptor.cursorId); if (!zombieCursor.isOK()) { // Cursor in use, or has already been deleted. continue; @@ -381,7 +393,12 @@ void ClusterCursorManager::reapZombieCursors() { zombieCursor.getValue()->kill(); zombieCursor.getValue().reset(); lk.lock(); + + if (cursorDescriptor.isInactive) { + ++cursorsTimedOut; + } } + return cursorsTimedOut; } ClusterCursorManager::Stats ClusterCursorManager::stats() const { diff --git a/src/mongo/s/query/cluster_cursor_manager.h b/src/mongo/s/query/cluster_cursor_manager.h index 0561163b26b..7770cc741c8 100644 --- a/src/mongo/s/query/cluster_cursor_manager.h +++ b/src/mongo/s/query/cluster_cursor_manager.h @@ -309,7 +309,7 @@ public: /** * Attempts to performs a blocking kill and deletion of all non-pinned cursors that are marked - * as 'kill pending'. + * as 'kill pending'. Returns the number of cursors that were marked as inactive. * * If no other non-const methods are called simultaneously, it is guaranteed that this method * will delete all non-pinned cursors marked as 'kill pending'. 
Otherwise, no such guarantee is @@ -318,7 +318,7 @@ public: * * Can block. */ - void reapZombieCursors(); + std::size_t reapZombieCursors(); /** * Returns the number of open cursors on a ClusterCursorManager, broken down by type. @@ -340,6 +340,14 @@ public: */ boost::optional<NamespaceString> getNamespaceForCursorId(CursorId cursorId) const; + void incrementCursorsTimedOut(size_t inc) { + _cursorsTimedOut += inc; + } + + size_t cursorsTimedOut() const { + return _cursorsTimedOut; + } + private: class CursorEntry; using CursorEntryMap = std::unordered_map<CursorId, CursorEntry>; @@ -410,6 +418,10 @@ private: return _killPending; } + bool isInactive() const { + return _isInactive; + } + CursorType getCursorType() const { return _cursorType; } @@ -447,6 +459,10 @@ private: _killPending = true; } + void setInactive() { + _isInactive = true; + } + void setLastActive(Date_t lastActive) { _lastActive = lastActive; } @@ -454,6 +470,7 @@ private: private: std::unique_ptr<ClusterClientCursor> _cursor; bool _killPending = false; + bool _isInactive = false; CursorType _cursorType = CursorType::NamespaceNotSharded; CursorLifetime _cursorLifetime = CursorLifetime::Mortal; Date_t _lastActive; @@ -508,6 +525,8 @@ private: // when the last cursor on the given namespace is destroyed. std::unordered_map<NamespaceString, CursorEntryContainer, NamespaceString::Hasher> _namespaceToContainerMap; + + size_t _cursorsTimedOut = 0; }; } // namespace |