SERVER-29304 Exclude time spent blocking for awaitData from getMore latency reporting.

This change applies to various server diagnostics which include latency metrics: the slow query logs, the system.profile collection, top, global operation latency stats, and per-collection operation latency stats. The secs_running and microsecs_running fields reported in currentOp output, on the other hand, will include time spent blocking for awaitData. Furthermore, this change fixes the following: - getMore operations are no longer double-counted. - Lock acquisition times are now consistently included. This fixes a bug in which time spent acquiring locks was accidentally omitted in some cases.
author: David Storch <david.storch@10gen.com> 2017-06-05 19:33:37 -0400
committer: David Storch <david.storch@10gen.com> 2017-06-14 18:52:38 -0400
commit: a68f209e4ff495e805e4d1b12d6fc3381c56a771 (patch)
tree: f3828ef00f300c6d2d5965fe0d9f9ebb27180572 /src
parent: 2b2ea96ea4a926018dc02bfc1c33ebcfed8bc2a0 (diff)
download: mongo-a68f209e4ff495e805e4d1b12d6fc3381c56a771.tar.gz
11 files changed, 142 insertions, 69 deletions
diff --git a/src/mongo/db/assemble_response.cpp b/src/mongo/db/assemble_response.cpp
index 06ac6c090c6..2ecfa4093c3 100644
--- a/src/mongo/db/assemble_response.cpp
+++ b/src/mongo/db/assemble_response.cpp
@@ -493,12 +493,13 @@ mongo::DbResponse mongo::assembleResponse(OperationContext* opCtx, const Message
     }
     currentOp.ensureStarted();
     currentOp.done();
-    debug.executionTimeMicros = currentOp.totalTimeMicros();
+    debug.executionTimeMicros = durationCount<Microseconds>(currentOp.elapsedTimeExcludingPauses());
 
-    logThresholdMs += currentOp.getExpectedLatencyMs();
     Top::get(opCtx->getServiceContext())
         .incrementGlobalLatencyStats(
-            opCtx, currentOp.totalTimeMicros(), currentOp.getReadWriteType());
+            opCtx,
+            durationCount<Microseconds>(currentOp.elapsedTimeExcludingPauses()),
+            currentOp.getReadWriteType());
 
     const bool shouldSample = serverGlobalParams.sampleRate == 1.0
         ? true
diff --git a/src/mongo/db/commands/find_and_modify.cpp b/src/mongo/db/commands/find_and_modify.cpp
index 01d5d70b584..202bbdbf8f6 100644
--- a/src/mongo/db/commands/find_and_modify.cpp
+++ b/src/mongo/db/commands/find_and_modify.cpp
@@ -209,7 +209,7 @@ void recordStatsForTopCommand(OperationContext* opCtx) {
                 curOp->getNS(),
                 curOp->getLogicalOp(),
                 Top::LockType::WriteLocked,
-                curOp->elapsedMicros(),
+                durationCount<Microseconds>(curOp->elapsedTimeExcludingPauses()),
                 curOp->isCommand(),
                 curOp->getReadWriteType());
 }
diff --git a/src/mongo/db/commands/geo_near_cmd.cpp b/src/mongo/db/commands/geo_near_cmd.cpp
index 3b56d5074e3..8f693e74680 100644
--- a/src/mongo/db/commands/geo_near_cmd.cpp
+++ b/src/mongo/db/commands/geo_near_cmd.cpp
@@ -320,7 +320,8 @@ public:
             stats.append("avgDistance", totalDistance / results);
         }
         stats.append("maxDistance", farthestDist);
-        stats.appendIntOrLL("time", curOp->elapsedMicros() / 1000);
+        stats.appendIntOrLL("time",
+                            durationCount<Microseconds>(curOp->elapsedTimeExcludingPauses()));
         stats.done();
 
         collection->infoCache()->notifyOfQuery(opCtx, summary.indexesUsed);
diff --git a/src/mongo/db/commands/getmore_cmd.cpp b/src/mongo/db/commands/getmore_cmd.cpp
index 6cc3799d6a4..615caf264e5 100644
--- a/src/mongo/db/commands/getmore_cmd.cpp
+++ b/src/mongo/db/commands/getmore_cmd.cpp
@@ -168,21 +168,21 @@ public:
         // Cursors come in one of two flavors:
         // - Cursors owned by the collection cursor manager, such as those generated via the find
         //   command. For these cursors, we hold the appropriate collection lock for the duration of
-        //   the getMore using AutoGetCollectionForRead. This will automatically update the CurOp
-        //   object appropriately and record execution time via Top upon completion.
+        //   the getMore using AutoGetCollectionForRead.
         // - Cursors owned by the global cursor manager, such as those generated via the aggregate
         //   command. These cursors either hold no collection state or manage their collection state
-        //   internally, so we acquire no locks. In this case we use the AutoStatsTracker object to
-        //   update the CurOp object appropriately and record execution time via Top upon
-        //   completion.
+        //   internally, so we acquire no locks.
         //
-        // Thus, only one of 'readLock' and 'statsTracker' will be populated as we populate
-        // 'cursorManager'.
+        // While we only need to acquire locks in the case of a cursor which is *not* globally
+        // owned, we need to create an AutoStatsTracker in either case. This is responsible for
+        // updating statistics in CurOp and Top. We avoid using AutoGetCollectionForReadCommand
+        // because we may need to drop and reacquire locks when the cursor is awaitData, but we
+        // don't want to update the stats twice.
         //
-        // Note that we declare our locks before our ClientCursorPin, in order to ensure that
+        // Note that we acquire our locks before our ClientCursorPin, in order to ensure that
         // the pin's destructor is called before the lock's destructor (if there is one) so that the
         // cursor cleanup can occur under the lock.
-        boost::optional<AutoGetCollectionForReadCommand> readLock;
+        boost::optional<AutoGetCollectionForRead> readLock;
         boost::optional<AutoStatsTracker> statsTracker;
         CursorManager* cursorManager;
 
@@ -204,6 +204,13 @@ public:
                                                                ChunkVersion::IGNORED());
 
             readLock.emplace(opCtx, request.nss);
+            const int doNotChangeProfilingLevel = 0;
+            statsTracker.emplace(opCtx,
+                                 request.nss,
+                                 Top::LockType::ReadLocked,
+                                 readLock->getDb() ? readLock->getDb()->getProfilingLevel()
+                                                   : doNotChangeProfilingLevel);
+
             Collection* collection = readLock->getCollection();
             if (!collection) {
                 return appendCommandStatus(result,
@@ -368,15 +375,13 @@ public:
                 exec->saveState();
                 readLock.reset();
 
-                // Block waiting for data.
+                // Block waiting for data. Time spent blocking is not counted towards the total
+                // operation latency.
+                curOp->pauseTimer();
                 const auto timeout = opCtx->getRemainingMaxTimeMicros();
                 notifier->wait(notifierVersion, timeout);
                 notifier.reset();
-
-                // Set expected latency to match wait time. This makes sure the logs aren't spammed
-                // by awaitData queries that exceed slowms due to blocking on the
-                // CappedInsertNotifier.
-                curOp->setExpectedLatencyMs(durationCount<Milliseconds>(timeout));
+                curOp->resumeTimer();
 
                 readLock.emplace(opCtx, request.nss);
                 exec->restoreState();
diff --git a/src/mongo/db/curop.cpp b/src/mongo/db/curop.cpp
index 1894aaa5fef..451fa96132b 100644
--- a/src/mongo/db/curop.cpp
+++ b/src/mongo/db/curop.cpp
@@ -349,8 +349,8 @@ void appendAsObjOrString(StringData name,
 
 void CurOp::reportState(BSONObjBuilder* builder) {
     if (_start) {
-        builder->append("secs_running", elapsedSeconds());
-        builder->append("microsecs_running", static_cast<long long int>(elapsedMicros()));
+        builder->append("secs_running", durationCount<Seconds>(elapsedTimeTotal()));
+        builder->append("microsecs_running", durationCount<Microseconds>(elapsedTimeTotal()));
     }
 
     builder->append("op", logicalOpToString(_logicalOp));
diff --git a/src/mongo/db/curop.h b/src/mongo/db/curop.h
index c14af5e731f..bb1f82b1a5d 100644
--- a/src/mongo/db/curop.h
+++ b/src/mongo/db/curop.h
@@ -237,8 +237,7 @@ public:
         if (!shouldSample || _dbprofile <= 0)
             return false;
 
-        long long opMicros = isDone() ? totalTimeMicros() : elapsedMicros();
-        return opMicros >= serverGlobalParams.slowMS * 1000LL;
+        return elapsedTimeExcludingPauses() >= Milliseconds{serverGlobalParams.slowMS};
     }
 
     /**
@@ -292,17 +291,67 @@ public:
         return _end > 0;
     }
 
-    long long totalTimeMicros() {
-        massert(12601, "CurOp not marked done yet", _end);
-        return _end - startTime();
+    /**
+     * Stops the operation latency timer from "ticking". Time spent paused is not included in the
+     * latencies returned by elapsedTimeExcludingPauses().
+     *
+     * Illegal to call if either the CurOp has not been started, or the CurOp is already in a paused
+     * state.
+     */
+    void pauseTimer() {
+        invariant(isStarted());
+        invariant(_lastPauseTime == 0);
+        _lastPauseTime = curTimeMicros64();
     }
 
-    long long elapsedMicros() {
-        return curTimeMicros64() - startTime();
+    /**
+     * Starts the operation latency timer "ticking" again. Illegal to call if the CurOp has not been
+     * started and then subsequently paused.
+     */
+    void resumeTimer() {
+        invariant(isStarted());
+        invariant(_lastPauseTime > 0);
+        _totalPausedDuration +=
+            Microseconds{static_cast<long long>(curTimeMicros64()) - _lastPauseTime};
+        _lastPauseTime = 0;
     }
 
-    int elapsedSeconds() {
-        return static_cast<int>(elapsedMicros() / (1000 * 1000));
+    /**
+     * If this op has been marked as done(), returns the wall clock duration between being marked as
+     * started with ensureStarted() and the call to done().
+     *
+     * Otherwise, returns the wall clock duration between the start time and now.
+     *
+     * If this op has not yet been started, returns 0.
+     */
+    Microseconds elapsedTimeTotal() {
+        if (!isStarted()) {
+            return Microseconds{0};
+        }
+
+        if (!_end) {
+            return Microseconds{static_cast<long long>(curTimeMicros64() - startTime())};
+        } else {
+            return Microseconds{static_cast<long long>(_end - startTime())};
+        }
+    }
+
+    /**
+     * Returns the total elapsed duration minus any time spent in a paused state. See
+     * elapsedTimeTotal() for the definition of the total duration and pause/resumeTimer() for
+     * details on pausing.
+     *
+     * If this op has not yet been started, returns 0.
+     *
+     * Illegal to call while the timer is paused.
+     */
+    Microseconds elapsedTimeExcludingPauses() {
+        invariant(!_lastPauseTime);
+        if (!isStarted()) {
+            return Microseconds{0};
+        }
+
+        return elapsedTimeTotal() - _totalPausedDuration;
     }
 
     /**
@@ -377,17 +426,6 @@ public:
     }
 
     /**
-     * Access to _expectedLatencyMs is not synchronized, so it is illegal for threads other than the
-     * one executing the operation to call getExpectedLatencyMs() and setExpectedLatencyMs().
-     */
-    long long getExpectedLatencyMs() const {
-        return _expectedLatencyMs;
-    }
-    void setExpectedLatencyMs(long long latency) {
-        _expectedLatencyMs = latency;
-    }
-
-    /**
      * this should be used very sparingly
      * generally the Context should set this up
      * but sometimes you want to do it ahead of time
@@ -416,9 +454,20 @@ private:
     CurOpStack* _stack;
     CurOp* _parent{nullptr};
     Command* _command{nullptr};
+
+    // The time at which this CurOp instance was marked as started.
     long long _start{0};
+
+    // The time at which this CurOp instance was marked as done.
     long long _end{0};
 
+    // The time at which this CurOp instance had its timer paused, or 0 if the timer is not
+    // currently paused.
+    long long _lastPauseTime{0};
+
+    // The cumulative duration for which the timer has been paused.
+    Microseconds _totalPausedDuration{0};
+
     // _networkOp represents the network-level op code: OP_QUERY, OP_GET_MORE, OP_COMMAND, etc.
     NetworkOp _networkOp{opInvalid};  // only set this through setNetworkOp_inlock() to keep synced
     // _logicalOp is the logical operation type, ie 'dbQuery' regardless of whether this is an
@@ -436,11 +485,6 @@ private:
     ProgressMeter _progressMeter;
     int _numYields{0};
 
-    // this is how much "extra" time a query might take
-    // a writebacklisten for example will block for 30s
-    // so this should be 30000 in that case
-    long long _expectedLatencyMs{0};
-
     std::string _planSummary;
 };
 
diff --git a/src/mongo/db/db_raii.cpp b/src/mongo/db/db_raii.cpp
index b9863e956bf..bff855fcc5b 100644
--- a/src/mongo/db/db_raii.cpp
+++ b/src/mongo/db/db_raii.cpp
@@ -38,9 +38,14 @@
 #include "mongo/db/repl/replication_coordinator_global.h"
 #include "mongo/db/s/collection_sharding_state.h"
 #include "mongo/db/stats/top.h"
+#include "mongo/util/fail_point_service.h"
 
 namespace mongo {
 
+namespace {
+MONGO_FP_DECLARE(setAutoGetCollectionWait);
+}  // namespace
+
 AutoGetDb::AutoGetDb(OperationContext* opCtx, StringData ns, LockMode mode)
     : _dbLock(opCtx, ns, mode), _db(dbHolder().get(opCtx, ns)) {}
 
@@ -59,6 +64,12 @@ AutoGetCollection::AutoGetCollection(OperationContext* opCtx,
         db->getViewCatalog()->lookup(opCtx, nss.ns()))
         uasserted(ErrorCodes::CommandNotSupportedOnView,
                   str::stream() << "Namespace " << nss.ns() << " is a view, not a collection");
+
+    // Wait for a configured amount of time after acquiring locks if the failpoint is enabled.
+    MONGO_FAIL_POINT_BLOCK(setAutoGetCollectionWait, customWait) {
+        const BSONObj& data = customWait.getData();
+        sleepFor(Milliseconds(data["waitForMillis"].numberInt()));
+    }
 }
 
 AutoGetCollectionOrView::AutoGetCollectionOrView(OperationContext* opCtx,
@@ -107,7 +118,7 @@ AutoStatsTracker::~AutoStatsTracker() {
                 curOp->getNS(),
                 curOp->getLogicalOp(),
                 _lockType,
-                _timer.micros(),
+                durationCount<Microseconds>(curOp->elapsedTimeExcludingPauses()),
                 curOp->isCommand(),
                 curOp->getReadWriteType());
 }
diff --git a/src/mongo/db/db_raii.h b/src/mongo/db/db_raii.h
index ce9b1b200ff..641b64e7988 100644
--- a/src/mongo/db/db_raii.h
+++ b/src/mongo/db/db_raii.h
@@ -238,7 +238,6 @@ public:
     ~AutoStatsTracker();
 
 private:
-    const Timer _timer;
     OperationContext* _opCtx;
     Top::LockType _lockType;
 };
diff --git a/src/mongo/db/ops/write_ops_exec.cpp b/src/mongo/db/ops/write_ops_exec.cpp
index 757290ef1cc..f8b20c2d10c 100644
--- a/src/mongo/db/ops/write_ops_exec.cpp
+++ b/src/mongo/db/ops/write_ops_exec.cpp
@@ -82,7 +82,8 @@ MONGO_FP_DECLARE(failAllRemoves);
 void finishCurOp(OperationContext* opCtx, CurOp* curOp) {
     try {
         curOp->done();
-        long long executionTimeMicros = curOp->totalTimeMicros();
+        long long executionTimeMicros =
+            durationCount<Microseconds>(curOp->elapsedTimeExcludingPauses());
         curOp->debug().executionTimeMicros = executionTimeMicros;
 
         recordCurOpMetrics(opCtx);
@@ -91,7 +92,7 @@ void finishCurOp(OperationContext* opCtx, CurOp* curOp) {
                     curOp->getNS(),
                     curOp->getLogicalOp(),
                     Top::LockType::WriteLocked,
-                    curOp->totalTimeMicros(),
+                    durationCount<Microseconds>(curOp->elapsedTimeExcludingPauses()),
                     curOp->isCommand(),
                     curOp->getReadWriteType());
 
@@ -102,8 +103,7 @@ void finishCurOp(OperationContext* opCtx, CurOp* curOp) {
 
         const bool logAll = logger::globalLogDomain()->shouldLog(logger::LogComponent::kCommand,
                                                                  logger::LogSeverity::Debug(1));
-        const bool logSlow = executionTimeMicros >
-            (serverGlobalParams.slowMS + curOp->getExpectedLatencyMs()) * 1000LL;
+        const bool logSlow = executionTimeMicros > (serverGlobalParams.slowMS * 1000LL);
 
         const bool shouldSample = serverGlobalParams.sampleRate == 1.0
             ? true
@@ -405,7 +405,7 @@ WriteResult performInserts(OperationContext* opCtx, const InsertOp& wholeOp) {
                     wholeOp.ns.ns(),
                     LogicalOp::opInsert,
                     Top::LockType::WriteLocked,
-                    curOp.totalTimeMicros(),
+                    durationCount<Microseconds>(curOp.elapsedTimeExcludingPauses()),
                     curOp.isCommand(),
                     curOp.getReadWriteType());
 
diff --git a/src/mongo/db/query/explain.cpp b/src/mongo/db/query/explain.cpp
index b243336655b..b3784552f6d 100644
--- a/src/mongo/db/query/explain.cpp
+++ b/src/mongo/db/query/explain.cpp
@@ -763,7 +763,8 @@ void Explain::explainStages(PlanExecutor* exec,
 
         // Generate exec stats BSON for the winning plan.
         OperationContext* opCtx = exec->getOpCtx();
-        long long totalTimeMillis = CurOp::get(opCtx)->elapsedMicros() / 1000;
+        long long totalTimeMillis =
+            durationCount<Milliseconds>(CurOp::get(opCtx)->elapsedTimeTotal());
         generateExecStats(winningStats.get(), verbosity, &execBob, totalTimeMillis);
 
         // Also generate exec stats for all plans, if the verbosity level is high enough.
diff --git a/src/mongo/db/query/find.cpp b/src/mongo/db/query/find.cpp
index e03b3318b1a..56bb92cfb77 100644
--- a/src/mongo/db/query/find.cpp
+++ b/src/mongo/db/query/find.cpp
@@ -235,6 +235,7 @@ Message getMore(OperationContext* opCtx,
     invariant(ntoreturn >= 0);
 
     CurOp& curOp = *CurOp::get(opCtx);
+    curOp.ensureStarted();
 
     // For testing, we may want to fail if we receive a getmore.
     if (MONGO_FAIL_POINT(failReceivedGetmore)) {
@@ -247,18 +248,22 @@ Message getMore(OperationContext* opCtx,
 
     // Cursors come in one of two flavors:
     // - Cursors owned by the collection cursor manager, such as those generated via the find
-    //   command. For these cursors, we hold the appropriate collection lock for the duration of
-    //   the getMore using AutoGetCollectionForRead. This will automatically update the CurOp
-    //   object appropriately and record execution time via Top upon completion.
+    //   command. For these cursors, we hold the appropriate collection lock for the duration of the
+    //   getMore using AutoGetCollectionForRead.
     // - Cursors owned by the global cursor manager, such as those generated via the aggregate
     //   command. These cursors either hold no collection state or manage their collection state
-    //   internally, so we acquire no locks. In this case we use the AutoStatsTracker object to
-    //   update the CurOp object appropriately and record execution time via Top upon
-    //   completion.
+    //   internally, so we acquire no locks.
     //
-    // Thus, only one of 'readLock' and 'statsTracker' will be populated as we populate
-    // 'cursorManager'.
-    boost::optional<AutoGetCollectionForReadCommand> readLock;
+    // While we only need to acquire locks in the case of a cursor which is *not* globally owned, we
+    // need to create an AutoStatsTracker in either case. This is responsible for updating
+    // statistics in CurOp and Top. We avoid using AutoGetCollectionForReadCommand because we may
+    // need to drop and reacquire locks when the cursor is awaitData, but we don't want to update
+    // the stats twice.
+    //
+    // Note that we acquire our locks before our ClientCursorPin, in order to ensure that the pin's
+    // destructor is called before the lock's destructor (if there is one) so that the cursor
+    // cleanup can occur under the lock.
+    boost::optional<AutoGetCollectionForRead> readLock;
     boost::optional<AutoStatsTracker> statsTracker;
     CursorManager* cursorManager;
 
@@ -283,6 +288,12 @@ Message getMore(OperationContext* opCtx,
         }
     } else {
         readLock.emplace(opCtx, nss);
+        const int doNotChangeProfilingLevel = 0;
+        statsTracker.emplace(opCtx,
+                             nss,
+                             Top::LockType::ReadLocked,
+                             readLock->getDb() ? readLock->getDb()->getProfilingLevel()
+                                               : doNotChangeProfilingLevel);
         Collection* collection = readLock->getCollection();
         uassert(
             ErrorCodes::OperationFailed, "collection dropped between getMore calls", collection);
@@ -416,14 +427,13 @@ Message getMore(OperationContext* opCtx,
             exec->saveState();
             readLock.reset();
 
-            // Block waiting for data for up to 1 second.
+            // Block waiting for data for up to 1 second. Time spent blocking is not counted towards
+            // the total operation latency.
+            curOp.pauseTimer();
             Seconds timeout(1);
             notifier->wait(notifierVersion, timeout);
             notifier.reset();
-
-            // Set expected latency to match wait time. This makes sure the logs aren't spammed
-            // by awaitData queries that exceed slowms due to blocking on the CappedInsertNotifier.
-            curOp.setExpectedLatencyMs(durationCount<Milliseconds>(timeout));
+            curOp.resumeTimer();
 
             // Reacquiring locks.
             readLock.emplace(opCtx, nss);
@@ -507,6 +517,7 @@ std::string runQuery(OperationContext* opCtx,
                      const NamespaceString& nss,
                      Message& result) {
     CurOp& curOp = *CurOp::get(opCtx);
+    curOp.ensureStarted();
 
     uassert(ErrorCodes::InvalidNamespace,
             str::stream() << "Invalid ns [" << nss.ns() << "]",
author	David Storch <david.storch@10gen.com>	2017-06-05 19:33:37 -0400
committer	David Storch <david.storch@10gen.com>	2017-06-14 18:52:38 -0400
commit	a68f209e4ff495e805e4d1b12d6fc3381c56a771 (patch)
tree	f3828ef00f300c6d2d5965fe0d9f9ebb27180572 /src
parent	2b2ea96ea4a926018dc02bfc1c33ebcfed8bc2a0 (diff)
download	mongo-a68f209e4ff495e805e4d1b12d6fc3381c56a771.tar.gz