SERVER-2454 Return error on find/getMore PlanExecutor::DEAD

This is a partial backport to the 3.0 branch. It is intended to fix the issue reported in SERVER-20973.
author: James Wahlin <james.wahlin@10gen.com> 2015-10-23 11:05:39 -0400
committer: James Wahlin <james.wahlin@10gen.com> 2015-10-23 14:13:34 -0400
commit: 817c963bea7f12220deee14ecaa512161d39555a (patch)
tree: 254df03f52fc844358c9053fd1f612dc630dae52
parent: 9967e79ea6c20eded9818539c59964afc20058af (diff)
download: mongo-817c963bea7f12220deee14ecaa512161d39555a.tar.gz
6 files changed, 45 insertions, 49 deletions
diff --git a/jstests/concurrency/fsm_workload_modifiers/make_capped.js b/jstests/concurrency/fsm_workload_modifiers/make_capped.js
index 40ba3d09c24..d17f291faa6 100644
--- a/jstests/concurrency/fsm_workload_modifiers/make_capped.js
+++ b/jstests/concurrency/fsm_workload_modifiers/make_capped.js
@@ -20,7 +20,7 @@ function makeCapped($config, $super) {
             db[collName].drop();
             assertAlways.commandWorked(db.createCollection(collName, {
                 capped: true,
-                size: 4096 // bytes
+                size: 16384 // bytes
             }));
         });
 
diff --git a/jstests/replsets/resync.js b/jstests/replsets/resync.js
index af75d8bb813..49eb570a318 100755
--- a/jstests/replsets/resync.js
+++ b/jstests/replsets/resync.js
@@ -39,7 +39,18 @@
 
     function hasCycled() {
         var oplog = a_conn.getDB("local").oplog.rs;
-        return oplog.find( { "o.x" : 1 } ).sort( { $natural : 1 } ).limit(10).itcount() == 0;
+        try {
+            // Collection scan to determine if the oplog entry from the first insert has been
+            // deleted yet.
+            return oplog.find( { "o.x" : 1 } ).sort( { $natural : 1 } ).limit(10).itcount() == 0;
+        }
+        catch (except) {
+            // An error is expected in the case that capped deletions blow away the position of the
+            // collection scan during a yield. In this case, we just try again.
+            var errorRegex = /CappedPositionLost/;
+            assert(errorRegex.test(except.message));
+            return hasCycled();
+        }
     }
 
     // Make sure the oplog has rolled over on the primary and secondary that is up, 
diff --git a/src/mongo/base/error_codes.err b/src/mongo/base/error_codes.err
index 680404aeeea..41d9ce6c581 100644
--- a/src/mongo/base/error_codes.err
+++ b/src/mongo/base/error_codes.err
@@ -120,6 +120,7 @@ error_code("ConflictingOperationInProgress", 117)
 
 # backported
 error_code("OplogStartMissing", 120)
+error_code("CappedPositionLost", 121)
 
 # Non-sequential error codes (for compatibility only)
 error_code("NotMaster", 10107) #this comes from assert_util.h
diff --git a/src/mongo/db/exec/collection_scan.cpp b/src/mongo/db/exec/collection_scan.cpp
index 3a7700299a6..c5028da674a 100644
--- a/src/mongo/db/exec/collection_scan.cpp
+++ b/src/mongo/db/exec/collection_scan.cpp
@@ -35,6 +35,7 @@
 #include "mongo/db/exec/filter.h"
 #include "mongo/db/exec/scoped_timer.h"
 #include "mongo/db/exec/working_set.h"
+#include "mongo/db/exec/working_set_common.h"
 #include "mongo/db/catalog/collection.h"
 #include "mongo/db/storage/record_fetcher.h"
 #include "mongo/util/fail_point_service.h"
@@ -78,6 +79,12 @@ PlanStage::StageState CollectionScan::work(WorkingSetID* out) {
     ScopedTimer timer(&_commonStats.executionTimeMillis);
 
     if (_isDead) {
+        Status status(
+            ErrorCodes::CappedPositionLost,
+            str::stream()
+                << "CollectionScan died due to position in capped collection being deleted. "
+                << "Last seen record id: " << _lastSeenLoc);
+        *out = WorkingSetCommon::allocateStatusMember(_workingSet, status);
         return PlanStage::DEAD;
     }
 
@@ -85,6 +92,9 @@ PlanStage::StageState CollectionScan::work(WorkingSetID* out) {
     if (NULL == _iter) {
         if (_params.collection == NULL) {
             _isDead = true;
+            Status status(ErrorCodes::CappedPositionLost,
+                          str::stream() << "CollectionScan died due to NULL collection param");
+            *out = WorkingSetCommon::allocateStatusMember(_workingSet, status);
             return PlanStage::DEAD;
         }
 
@@ -100,6 +110,11 @@ PlanStage::StageState CollectionScan::work(WorkingSetID* out) {
             // the stream. This is related to the _lastSeenLock handling in invalidate.
             if (_iter->getNext() != _lastSeenLoc) {
                 _isDead = true;
+                Status status(ErrorCodes::CappedPositionLost,
+                              str::stream() << "CollectionScan died due to failure to restore "
+                                            << "tailable cursor position. "
+                                            << "Last seen record id: " << _lastSeenLoc);
+                *out = WorkingSetCommon::allocateStatusMember(_workingSet, status);
                 return PlanStage::DEAD;
             }
         }
@@ -221,8 +236,7 @@ void CollectionScan::restoreState(OperationContext* opCtx) {
     ++_commonStats.unyields;
     if (NULL != _iter) {
         if (!_iter->restoreState(opCtx)) {
-            warning() << "Collection dropped or state deleted during yield of CollectionScan: "
-                      << opCtx->getNS();
+            warning() << "Could not restore RecordCursor for CollectionScan: " << opCtx->getNS();
             _isDead = true;
         }
     }
diff --git a/src/mongo/db/query/find.cpp b/src/mongo/db/query/find.cpp
index 08567c1c986..a35f69bf1f7 100644
--- a/src/mongo/db/query/find.cpp
+++ b/src/mongo/db/query/find.cpp
@@ -366,23 +366,9 @@ QueryResult::View getMore(OperationContext* txn,
             // Propagate this error to caller.
             if (PlanExecutor::FAILURE == state) {
                 scoped_ptr<PlanStageStats> stats(exec->getStats());
-                error() << "Plan executor error, stats: " << Explain::statsToBSON(*stats);
-                uasserted(17406,
-                          "getMore executor error: " + WorkingSetCommon::toStatusString(obj));
-            }
-
-            // If we're dead there's no way to get more results.
-            saveClientCursor = false;
-
-            // In the old system tailable capped cursors would be killed off at the
-            // cursorid level.  If a tailable capped cursor is nuked the cursorid
-            // would vanish.
-            //
-            // In the new system they die and are cleaned up later (or time out).
-            // So this is where we get to remove the cursorid.
-            if (0 == numResults) {
-                resultFlags = ResultFlag_CursorNotFound;
+                error() << "getMore executor error, stats: " << Explain::statsToBSON(*stats);
             }
+            uasserted(17406, "getMore executor error: " + WorkingSetCommon::toStatusString(obj));
         } else if (PlanExecutor::IS_EOF == state) {
             // EOF is also end of the line unless it's tailable.
             saveClientCursor = queryOptions & QueryOption_CursorTailable;
@@ -786,16 +772,17 @@ std::string runQuery(OperationContext* txn,
     exec->deregisterExec();
 
     // Caller expects exceptions thrown in certain cases.
-    if (PlanExecutor::FAILURE == state) {
-        scoped_ptr<PlanStageStats> stats(exec->getStats());
-        error() << "Plan executor error, stats: " << Explain::statsToBSON(*stats);
-        uasserted(17144, "Executor error: " + WorkingSetCommon::toStatusString(obj));
+    if (PlanExecutor::FAILURE == state || PlanExecutor::DEAD == state) {
+        if (PlanExecutor::FAILURE == state) {
+            const std::unique_ptr<PlanStageStats> stats(exec->getStats());
+            error() << "Plan executor error during find: " << PlanExecutor::statestr(state)
+                    << ", stats: " << Explain::statsToBSON(*stats);
+        }
+        uasserted(17144,
+                  "Plan executor error during find: " + WorkingSetCommon::toStatusString(obj));
     }
 
-    // Why save a dead executor?
-    if (PlanExecutor::DEAD == state) {
-        saveClientCursor = false;
-    } else if (pq.getOptions().tailable) {
+    if (pq.getOptions().tailable) {
         // If we're tailing a capped collection, we don't bother saving the cursor if the
         // collection is empty. Otherwise, the semantics of the tailable cursor is that the
         // client will keep trying to read from it. So we'll keep it around.
diff --git a/src/mongo/dbtests/querytests.cpp b/src/mongo/dbtests/querytests.cpp
index b53f7166218..666e5e2761b 100644
--- a/src/mongo/dbtests/querytests.cpp
+++ b/src/mongo/dbtests/querytests.cpp
@@ -514,25 +514,10 @@ public:
         insert(ns, BSON("a" << 2));
         insert(ns, BSON("a" << 3));
 
-        // This can either have been killed, or jumped to the right thing.
-        // Key is that it can't skip.
+        // We have overwritten the previous cursor position and should encounter a dead cursor.
         if (c->more()) {
-            BSONObj x = c->next();
-            ASSERT_EQUALS(2, x["a"].numberInt());
+            ASSERT_THROWS(c->nextSafe(), AssertionException);
         }
-
-        // Inserting a document into a capped collection can force another document out.
-        // In this case, the capped collection has 2 documents, so inserting two more clobbers
-        // whatever RecordId that the underlying cursor had as its state.
-        //
-        // In the Cursor world, the ClientCursor was responsible for manipulating cursors.  It
-        // would detect that the cursor's "refloc" (translation: diskloc required to maintain
-        // iteration state) was being clobbered and it would kill the cursor.
-        //
-        // In the Runner world there is no notion of a "refloc" and as such the invalidation
-        // broadcast code doesn't know enough to know that the underlying collection iteration
-        // can't proceed.
-        // ASSERT_EQUALS( 0, c->getCursorId() );
     }
 };
 
@@ -555,11 +540,9 @@ public:
         insert(ns, BSON("a" << 3));
         insert(ns, BSON("a" << 4));
 
-        // This can either have been killed, or jumped to the right thing.
-        // Key is that it can't skip.
+        // We have overwritten the previous cursor position and should encounter a dead cursor.
         if (c->more()) {
-            BSONObj x = c->next();
-            ASSERT_EQUALS(2, x["a"].numberInt());
+            ASSERT_THROWS(c->nextSafe(), AssertionException);
         }
     }
 };
author	James Wahlin <james.wahlin@10gen.com>	2015-10-23 11:05:39 -0400
committer	James Wahlin <james.wahlin@10gen.com>	2015-10-23 14:13:34 -0400
commit	817c963bea7f12220deee14ecaa512161d39555a (patch)
tree	254df03f52fc844358c9053fd1f612dc630dae52
parent	9967e79ea6c20eded9818539c59964afc20058af (diff)
download	mongo-817c963bea7f12220deee14ecaa512161d39555a.tar.gz