SERVER-32912 Ensure that killCursors always invalidates a pinned cursor

author: Bernard Gorman <bernard.gorman@gmail.com> 2018-02-02 16:02:30 +0000
committer: Bernard Gorman <bernard.gorman@gmail.com> 2018-02-13 11:04:43 +0000
commit: 1221b80ee0d65879a4c76ff98f78e92b53766cc0 (patch)
tree: a6e4f81b3a82160ba3663daa4e4eaac07fad652e /src
parent: ca0a855dfc0f479d85b76a640b12a259c0547310 (diff)
download: mongo-1221b80ee0d65879a4c76ff98f78e92b53766cc0.tar.gz
10 files changed, 163 insertions, 74 deletions
diff --git a/src/mongo/db/clientcursor.cpp b/src/mongo/db/clientcursor.cpp
index 46817485eba..3b0f6b4d297 100644
--- a/src/mongo/db/clientcursor.cpp
+++ b/src/mongo/db/clientcursor.cpp
@@ -115,8 +115,8 @@ ClientCursor::~ClientCursor() {
     }
 }
 
-void ClientCursor::markAsKilled(const std::string& reason) {
-    _exec->markAsKilled(reason);
+void ClientCursor::markAsKilled(Status killStatus) {
+    _exec->markAsKilled(killStatus);
 }
 
 void ClientCursor::dispose(OperationContext* opCtx) {
diff --git a/src/mongo/db/clientcursor.h b/src/mongo/db/clientcursor.h
index f6a97bfe4c9..f0f8772df73 100644
--- a/src/mongo/db/clientcursor.h
+++ b/src/mongo/db/clientcursor.h
@@ -268,10 +268,10 @@ private:
     ~ClientCursor();
 
     /**
-     * Marks this cursor as killed, so any future uses will return an error status including
-     * 'reason'.
+     * Marks this cursor as killed, so any future uses will return 'killStatus'. It is an error to
+     * call this method with Status::OK.
      */
-    void markAsKilled(const std::string& reason);
+    void markAsKilled(Status killStatus);
 
     /**
      * Disposes this ClientCursor's PlanExecutor. Must be called before deleting a ClientCursor to
diff --git a/src/mongo/db/commands/getmore_cmd.cpp b/src/mongo/db/commands/getmore_cmd.cpp
index 1e57c922448..901eaea3306 100644
--- a/src/mongo/db/commands/getmore_cmd.cpp
+++ b/src/mongo/db/commands/getmore_cmd.cpp
@@ -67,6 +67,37 @@ namespace {
 
 MONGO_FP_DECLARE(rsStopGetMoreCmd);
 
+// Helper function which sets the 'msg' field of the opCtx's CurOp to the specified string, and
+// returns the original value of the field.
+std::string updateCurOpMsg(OperationContext* opCtx, const std::string& newMsg) {
+    stdx::lock_guard<Client> lk(*opCtx->getClient());
+    auto oldMsg = CurOp::get(opCtx)->getMessage();
+    CurOp::get(opCtx)->setMessage_inlock(newMsg.c_str());
+    return oldMsg;
+}
+
+// This helper function works much like MONGO_FAIL_POINT_PAUSE_WHILE_SET, but it additionally
+// releases and re-acquires the collection readLock at regular intervals, in order to avoid
+// deadlocks caused by the pinned-cursor failpoints in this file (see SERVER-21997). Finally, it
+// also sets the 'msg' field of the opCtx's CurOp to the given string while the failpoint is active.
+void waitWhileFailPointEnabled(FailPoint* failPoint,
+                               OperationContext* opCtx,
+                               const NamespaceString& nss,
+                               const std::string& curOpMsg,
+                               boost::optional<AutoGetCollectionForRead>* readLock) {
+    invariant(failPoint);
+    auto origCurOpMsg = updateCurOpMsg(opCtx, curOpMsg);
+
+    while (MONGO_FAIL_POINT((*failPoint))) {
+        sleepFor(Milliseconds(10));
+        if (readLock && *readLock) {
+            readLock->reset();
+            readLock->emplace(opCtx, nss);
+        }
+    }
+    updateCurOpMsg(opCtx, origCurOpMsg);
+}
+
 /**
  * A command for running getMore() against an existing cursor registered with a CursorManager.
  * Used to generate the next batch of results for a ClientCursor.
@@ -228,15 +259,15 @@ public:
 
         ClientCursor* cursor = ccPin.getValue().getCursor();
 
-        // If the fail point is enabled, busy wait until it is disabled.
-        while (MONGO_FAIL_POINT(keepCursorPinnedDuringGetMore)) {
-            if (readLock) {
-                // We unlock and re-acquire the locks periodically in order to avoid deadlock (see
-                // SERVER-21997 for details).
-                sleepFor(Milliseconds(10));
-                readLock.reset();
-                readLock.emplace(opCtx, request.nss);
-            }
+        // If the 'waitAfterPinningCursorBeforeGetMoreBatch' fail point is enabled, set the 'msg'
+        // field of this operation's CurOp to signal that we've hit this point and then spin until
+        // the failpoint is released.
+        if (MONGO_FAIL_POINT(waitAfterPinningCursorBeforeGetMoreBatch)) {
+            waitWhileFailPointEnabled(&waitAfterPinningCursorBeforeGetMoreBatch,
+                                      opCtx,
+                                      request.nss,
+                                      "waitAfterPinningCursorBeforeGetMoreBatch",
+                                      &readLock);
         }
 
         // A user can only call getMore on their own cursor. If there were multiple users
@@ -344,6 +375,18 @@ public:
             awaitDataState(opCtx).shouldWaitForInserts = true;
         }
 
+        // We're about to begin running the PlanExecutor in order to fill the getMore batch. If the
+        // 'waitWithPinnedCursorDuringGetMoreBatch' failpoint is active, set the 'msg' field of this
+        // operation's CurOp to signal that we've hit this point and then spin until the failpoint
+        // is released.
+        if (MONGO_FAIL_POINT(waitWithPinnedCursorDuringGetMoreBatch)) {
+            waitWhileFailPointEnabled(&waitWithPinnedCursorDuringGetMoreBatch,
+                                      opCtx,
+                                      request.nss,
+                                      "waitWithPinnedCursorDuringGetMoreBatch",
+                                      &readLock);
+        }
+
         Status batchStatus = generateBatch(opCtx, cursor, request, &nextBatch, &state, &numResults);
         if (!batchStatus.isOK()) {
             return CommandHelpers::appendCommandStatus(result, batchStatus);
@@ -388,6 +431,18 @@ public:
             cursorFreer.Dismiss();
         }
 
+        // We're about to unpin the cursor as the ClientCursorPin goes out of scope (or delete it,
+        // if it has been exhausted). If the 'waitBeforeUnpinningOrDeletingCursorAfterGetMoreBatch'
+        // failpoint is active, set the 'msg' field of this operation's CurOp to signal that we've
+        // hit this point and then spin until the failpoint is released.
+        if (MONGO_FAIL_POINT(waitBeforeUnpinningOrDeletingCursorAfterGetMoreBatch)) {
+            waitWhileFailPointEnabled(&waitBeforeUnpinningOrDeletingCursorAfterGetMoreBatch,
+                                      opCtx,
+                                      request.nss,
+                                      "waitBeforeUnpinningOrDeletingCursorAfterGetMoreBatch",
+                                      &readLock);
+        }
+
         return true;
     }
 
@@ -451,28 +506,27 @@ public:
             return Status::OK();
         }
 
-        if (PlanExecutor::FAILURE == *state) {
-            nextBatch->abandon();
-
-            error() << "GetMore command executor error: " << PlanExecutor::statestr(*state)
-                    << ", stats: " << redact(Explain::getWinningPlanStats(exec));
-
-            return Status(ErrorCodes::OperationFailed,
-                          str::stream() << "GetMore command executor error: "
-                                        << WorkingSetCommon::toStatusString(obj));
-        } else if (PlanExecutor::DEAD == *state) {
-            nextBatch->abandon();
-
-            return Status(ErrorCodes::QueryPlanKilled,
-                          str::stream() << "PlanExecutor killed: "
-                                        << WorkingSetCommon::toStatusString(obj));
-        } else if (PlanExecutor::IS_EOF == *state) {
-            // This causes the reported latest oplog timestamp to advance even when there are
-            // no results for this particular query.
-            nextBatch->setLatestOplogTimestamp(exec->getLatestOplogTimestamp());
+        switch (*state) {
+            case PlanExecutor::FAILURE:
+                // Log an error message and then perform the same cleanup as DEAD.
+                error() << "GetMore command executor error: " << PlanExecutor::statestr(*state)
+                        << ", stats: " << redact(Explain::getWinningPlanStats(exec));
+            case PlanExecutor::DEAD: {
+                nextBatch->abandon();
+                // We should always have a valid status member object at this point.
+                auto status = WorkingSetCommon::getMemberObjectStatus(obj);
+                invariant(!status.isOK());
+                return status;
+            }
+            case PlanExecutor::IS_EOF:
+                // This causes the reported latest oplog timestamp to advance even when there are
+                // no results for this particular query.
+                nextBatch->setLatestOplogTimestamp(exec->getLatestOplogTimestamp());
+            default:
+                return Status::OK();
         }
 
-        return Status::OK();
+        MONGO_UNREACHABLE;
     }
 
 } getMoreCmd;
diff --git a/src/mongo/db/cursor_manager.cpp b/src/mongo/db/cursor_manager.cpp
index fac25953f3c..5168b638e0a 100644
--- a/src/mongo/db/cursor_manager.cpp
+++ b/src/mongo/db/cursor_manager.cpp
@@ -26,6 +26,8 @@
 *    it in the license file.
 */
 
+#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kQuery
+
 #include "mongo/platform/basic.h"
 
 #include "mongo/db/cursor_manager.h"
@@ -51,6 +53,7 @@
 #include "mongo/platform/random.h"
 #include "mongo/stdx/memory.h"
 #include "mongo/util/exit.h"
+#include "mongo/util/log.h"
 #include "mongo/util/startup_test.h"
 
 namespace mongo {
@@ -420,7 +423,7 @@ void CursorManager::invalidateAll(OperationContext* opCtx,
         for (auto&& exec : partition) {
             // The PlanExecutor is owned elsewhere, so we just mark it as killed and let it be
             // cleaned up later.
-            exec->markAsKilled(reason);
+            exec->markAsKilled({ErrorCodes::QueryPlanKilled, reason});
         }
     }
     allExecPartitions.clear();
@@ -431,7 +434,7 @@ void CursorManager::invalidateAll(OperationContext* opCtx,
     for (auto&& partition : allCurrentPartitions) {
         for (auto it = partition.begin(); it != partition.end();) {
             auto* cursor = it->second;
-            cursor->markAsKilled(reason);
+            cursor->markAsKilled({ErrorCodes::QueryPlanKilled, reason});
 
             // If there's an operation actively using the cursor, then that operation is now
             // responsible for cleaning it up.  Otherwise we can immediately dispose of it.
@@ -541,9 +544,7 @@ StatusWith<ClientCursorPin> CursorManager::pinCursor(OperationContext* opCtx,
             !cursor->_operationUsingCursor);
     if (cursor->getExecutor()->isMarkedAsKilled()) {
         // This cursor was killed while it was idle.
-        Status error{ErrorCodes::QueryPlanKilled,
-                     str::stream() << "cursor killed because: "
-                                   << cursor->getExecutor()->getKillReason()};
+        Status error = cursor->getExecutor()->getKillStatus();
         lockedPartition->erase(cursor->cursorid());
         cursor->dispose(opCtx);
         delete cursor;
@@ -572,10 +573,24 @@ void CursorManager::unpin(OperationContext* opCtx, ClientCursor* cursor) {
     // Avoid computing the current time within the critical section.
     auto now = opCtx->getServiceContext()->getPreciseClockSource()->now();
 
-    auto partitionLock = _cursorMap->lockOnePartition(cursor->cursorid());
+    auto partition = _cursorMap->lockOnePartition(cursor->cursorid());
     invariant(cursor->_operationUsingCursor);
+
+    // We must verify that no interrupts have occurred since we finished building the current
+    // batch. Otherwise, the cursor will be checked back in, the interrupted opCtx will be
+    // destroyed, and subsequent getMores with a fresh opCtx will succeed.
+    auto interruptStatus = cursor->_operationUsingCursor->checkForInterruptNoAssert();
     cursor->_operationUsingCursor = nullptr;
     cursor->_lastUseDate = now;
+    if (!interruptStatus.isOK()) {
+        // If an interrupt occurred after the batch was completed, we remove the now-unpinned cursor
+        // from the CursorManager, then dispose of and delete it.
+        LOG(0) << "removing cursor " << cursor->cursorid()
+               << " after completing batch: " << interruptStatus;
+        partition->erase(cursor->cursorid());
+        cursor->dispose(opCtx);
+        delete cursor;
+    }
 }
 
 void CursorManager::getCursorIds(std::set<CursorId>* openCursors) const {
diff --git a/src/mongo/db/query/find.cpp b/src/mongo/db/query/find.cpp
index 6038193508e..75ba802b40f 100644
--- a/src/mongo/db/query/find.cpp
+++ b/src/mongo/db/query/find.cpp
@@ -202,14 +202,22 @@ void generateBatch(int ntoreturn,
     }
 
     // Propagate any errors to the caller.
-    if (PlanExecutor::FAILURE == *state) {
-        error() << "getMore executor error, stats: " << redact(Explain::getWinningPlanStats(exec));
-        uasserted(17406, "getMore executor error: " + WorkingSetCommon::toStatusString(obj));
-    } else if (PlanExecutor::DEAD == *state) {
-        uasserted(ErrorCodes::QueryPlanKilled,
-                  str::stream() << "PlanExecutor killed: "
-                                << WorkingSetCommon::toStatusString(obj));
+    switch (*state) {
+        // Log an error message and then perform the same cleanup as DEAD.
+        case PlanExecutor::FAILURE:
+            error() << "getMore executor error, stats: "
+                    << redact(Explain::getWinningPlanStats(exec));
+        case PlanExecutor::DEAD: {
+            // We should always have a valid status object by this point.
+            auto status = WorkingSetCommon::getMemberObjectStatus(obj);
+            invariant(!status.isOK());
+            uassertStatusOK(status);
+        }
+        default:
+            return;
     }
+
+    MONGO_UNREACHABLE;
 }
 
 }  // namespace
@@ -324,7 +332,6 @@ Message getMore(OperationContext* opCtx,
             cursorid = 0;
             resultFlags = ResultFlag_CursorNotFound;
         } else {
-            invariant(ccPin == ErrorCodes::QueryPlanKilled || ccPin == ErrorCodes::Unauthorized);
             uassertStatusOK(ccPin.getStatus());
         }
     } else {
diff --git a/src/mongo/db/query/find_common.cpp b/src/mongo/db/query/find_common.cpp
index b59b6e2a699..affea8b8967 100644
--- a/src/mongo/db/query/find_common.cpp
+++ b/src/mongo/db/query/find_common.cpp
@@ -36,10 +36,14 @@
 
 namespace mongo {
 
-MONGO_FP_DECLARE(keepCursorPinnedDuringGetMore);
-
 MONGO_FP_DECLARE(disableAwaitDataForGetMoreCmd);
 
+MONGO_FP_DECLARE(waitAfterPinningCursorBeforeGetMoreBatch);
+
+MONGO_FP_DECLARE(waitWithPinnedCursorDuringGetMoreBatch);
+
+MONGO_FP_DECLARE(waitBeforeUnpinningOrDeletingCursorAfterGetMoreBatch);
+
 const OperationContext::Decoration<AwaitDataState> awaitDataState =
     OperationContext::declareDecoration<AwaitDataState>();
 
diff --git a/src/mongo/db/query/find_common.h b/src/mongo/db/query/find_common.h
index 0e83e3cb546..6aaa8e390ff 100644
--- a/src/mongo/db/query/find_common.h
+++ b/src/mongo/db/query/find_common.h
@@ -54,14 +54,22 @@ extern const OperationContext::Decoration<AwaitDataState> awaitDataState;
 class BSONObj;
 class QueryRequest;
 
-// Enabling this fail point will cause the getMore command to busy wait after pinning the cursor,
-// until the fail point is disabled.
-MONGO_FP_FORWARD_DECLARE(keepCursorPinnedDuringGetMore);
-
 // Failpoint for making getMore not wait for an awaitdata cursor. Allows us to avoid waiting during
 // tests.
 MONGO_FP_FORWARD_DECLARE(disableAwaitDataForGetMoreCmd);
 
+// Enabling this fail point will cause the getMore command to busy wait after pinning the cursor but
+// before we have started building the batch, until the fail point is disabled.
+MONGO_FP_FORWARD_DECLARE(waitAfterPinningCursorBeforeGetMoreBatch);
+
+// Enabling this fail point will cause the getMore command to busy wait with its cursor pinned while
+// building the batch, until the fail point is disabled.
+MONGO_FP_FORWARD_DECLARE(waitWithPinnedCursorDuringGetMoreBatch);
+
+// Enabling this failpoint will cause the getMore to wait just before it unpins its cursor after it
+// has completed building the current batch.
+MONGO_FP_FORWARD_DECLARE(waitBeforeUnpinningOrDeletingCursorAfterGetMoreBatch);
+
 /**
  * Suite of find/getMore related functions used in both the mongod and mongos query paths.
  */
diff --git a/src/mongo/db/query/plan_executor.cpp b/src/mongo/db/query/plan_executor.cpp
index b3ad1aa08d0..a85210d0751 100644
--- a/src/mongo/db/query/plan_executor.cpp
+++ b/src/mongo/db/query/plan_executor.cpp
@@ -370,9 +370,7 @@ Status PlanExecutor::restoreStateWithoutRetrying() {
     }
 
     _currentState = kUsable;
-    return isMarkedAsKilled()
-        ? Status{ErrorCodes::QueryPlanKilled, "query killed during yield: " + *_killReason}
-        : Status::OK();
+    return _killStatus;
 }
 
 void PlanExecutor::detachFromOperationContext() {
@@ -501,10 +499,8 @@ PlanExecutor::ExecState PlanExecutor::getNextImpl(Snapshotted<BSONObj>* objOut,
     invariant(_currentState == kUsable);
     if (isMarkedAsKilled()) {
         if (NULL != objOut) {
-            Status status(ErrorCodes::OperationFailed,
-                          str::stream() << "Operation aborted because: " << *_killReason);
             *objOut = Snapshotted<BSONObj>(SnapshotId(),
-                                           WorkingSetCommon::buildMemberStatusObject(status));
+                                           WorkingSetCommon::buildMemberStatusObject(_killStatus));
         }
         return PlanExecutor::DEAD;
     }
@@ -646,8 +642,12 @@ bool PlanExecutor::isEOF() {
     return isMarkedAsKilled() || (_stash.empty() && _root->isEOF());
 }
 
-void PlanExecutor::markAsKilled(string reason) {
-    _killReason = std::move(reason);
+void PlanExecutor::markAsKilled(Status killStatus) {
+    invariant(!killStatus.isOK());
+    // If killed multiple times, only retain the first status.
+    if (_killStatus.isOK()) {
+        _killStatus = killStatus;
+    }
 }
 
 void PlanExecutor::dispose(OperationContext* opCtx, CursorManager* cursorManager) {
@@ -678,8 +678,7 @@ Status PlanExecutor::executePlan() {
 
     if (PlanExecutor::DEAD == state || PlanExecutor::FAILURE == state) {
         if (isMarkedAsKilled()) {
-            return Status(ErrorCodes::QueryPlanKilled,
-                          str::stream() << "Operation aborted because: " << *_killReason);
+            return _killStatus;
         }
 
         auto errorStatus = WorkingSetCommon::getMemberObjectStatus(obj);
diff --git a/src/mongo/db/query/plan_executor.h b/src/mongo/db/query/plan_executor.h
index 9ce62619e2d..182dfa168c5 100644
--- a/src/mongo/db/query/plan_executor.h
+++ b/src/mongo/db/query/plan_executor.h
@@ -374,10 +374,12 @@ public:
     /**
      * If we're yielding locks, the database we're operating over or any collection we're relying on
      * may be dropped. Plan executors are notified of such events by calling markAsKilled().
-     * Callers must specify the 'reason' for why this executor is being killed. Subsequent calls to
-     * getNext() will return DEAD, and fill 'objOut' with an error detail including 'reason'.
+     * Callers must specify the reason for why this executor is being killed. Subsequent calls to
+     * getNext() will return DEAD, and fill 'objOut' with an error reflecting 'killStatus'. If this
+     * method is called multiple times, only the first 'killStatus' will be retained. It is an error
+     * to call this method with Status::OK.
      */
-    void markAsKilled(std::string reason);
+    void markAsKilled(Status killStatus);
 
     /**
      * Cleans up any state associated with this PlanExecutor. Must be called before deleting this
@@ -446,12 +448,12 @@ public:
     }
 
     bool isMarkedAsKilled() const {
-        return static_cast<bool>(_killReason);
+        return !_killStatus.isOK();
     }
 
-    const std::string& getKillReason() {
+    Status getKillStatus() {
         invariant(isMarkedAsKilled());
-        return *_killReason;
+        return _killStatus;
     }
 
     bool isDisposed() const {
@@ -550,9 +552,9 @@ private:
     std::unique_ptr<QuerySolution> _qs;
     std::unique_ptr<PlanStage> _root;
 
-    // If _killReason has a value, then we have been killed and the value represents the reason for
-    // the kill.
-    boost::optional<std::string> _killReason;
+    // If _killStatus has a non-OK value, then we have been killed and the value represents the
+    // reason for the kill.
+    Status _killStatus = Status::OK();
 
     // What namespace are we operating over?
     NamespaceString _nss;
diff --git a/src/mongo/s/query/cluster_find.cpp b/src/mongo/s/query/cluster_find.cpp
index d8f27ea170b..1ce61880996 100644
--- a/src/mongo/s/query/cluster_find.cpp
+++ b/src/mongo/s/query/cluster_find.cpp
@@ -405,7 +405,7 @@ StatusWith<CursorResponse> ClusterFind::runGetMore(OperationContext* opCtx,
     invariant(request.cursorid == pinnedCursor.getValue().getCursorId());
 
     // If the fail point is enabled, busy wait until it is disabled.
-    while (MONGO_FAIL_POINT(keepCursorPinnedDuringGetMore)) {
+    while (MONGO_FAIL_POINT(waitAfterPinningCursorBeforeGetMoreBatch)) {
     }
 
     if (auto readPref = pinnedCursor.getValue().getReadPreference()) {
author	Bernard Gorman <bernard.gorman@gmail.com>	2018-02-02 16:02:30 +0000
committer	Bernard Gorman <bernard.gorman@gmail.com>	2018-02-13 11:04:43 +0000
commit	1221b80ee0d65879a4c76ff98f78e92b53766cc0 (patch)
tree	a6e4f81b3a82160ba3663daa4e4eaac07fad652e /src
parent	ca0a855dfc0f479d85b76a640b12a259c0547310 (diff)
download	mongo-1221b80ee0d65879a4c76ff98f78e92b53766cc0.tar.gz