2 files changed, 93 insertions, 5 deletions
diff --git a/jstests/noPassthrough/stepdown_during_getmore_then_legacy_killop.js b/jstests/noPassthrough/stepdown_during_getmore_then_legacy_killop.js
new file mode 100644
index 00000000000..6ef77858a0e
--- /dev/null
+++ b/jstests/noPassthrough/stepdown_during_getmore_then_legacy_killop.js
@@ -0,0 +1,90 @@
+/**
+ * Test the situation where
+ * - A stepdown happens at the end of a getMore operation, after it has checked for interrupt the
+     last time. The command will fail because the connection with the client will be cut as part of
+     the stepdown, but the cursor will still exist.
+ * - Kill the cursor using OP_KILL_CURSORS.
+ *
+ * This is intended to reproduce SERVER-37838.
+ * @tags: [requires_replication]
+ */
+(function() {
+    "use strict";
+
+    load("jstests/libs/fixture_helpers.js");  // For waitUntilServerHangsOnFailPoint().
+
+    // This test runs manual getMores using different connections, which will not inherit the
+    // implicit session of the cursor establishing command.
+    TestData.disableImplicitSessions = true;
+
+    const kName = "stepdown_during_getmore_then_legacy_killop";
+    const rst = new ReplSetTest({name: kName, nodes: 2});
+    rst.startSet();
+    rst.initiate();
+
+    const primary = rst.getPrimary();
+    const testDB = primary.getDB("test");
+    const coll = testDB[kName];
+
+    for (let i = 0; i < 10; i++) {
+        assert.commandWorked(coll.insert({x: 1}));
+    }
+    rst.awaitReplication();
+
+    // Start an aggregation.
+    const csCmdRes = assert.commandWorked(
+        coll.runCommand({aggregate: coll.getName(), pipeline: [], cursor: {batchSize: 2}}));
+
+    const cursorId = csCmdRes.cursor.id;
+
+    // Start a getMore that waits on a failpoint which comes after any checks for interrupt. This
+    // ensures that once we kill the operation (by causing a stepdown), it will still succeed,
+    // since it will have no opportunity to notice it's been killed.
+    const kFailPointName = "waitBeforeUnpinningOrDeletingCursorAfterGetMoreBatch";
+    assert.commandWorked(
+        primary.adminCommand({configureFailPoint: kFailPointName, mode: 'alwaysOn'}));
+
+    let code = `const collName = "${coll.getName()}";`;
+    code += `const cursorId = ${cursorId};`;
+    function runGetMore() {
+        const getMoreCmd = {getMore: cursorId, collection: collName, batchSize: 2};
+
+        const exn = assert.throws(() => db[collName].runCommand(getMoreCmd));
+        assert.gte(exn.message.indexOf("network error"), 0);
+    }
+    code += `(${runGetMore.toString()})();`;
+    const getMoreJoiner = startParallelShell(code, primary.port);
+
+    // Be sure the server is hanging on the failpoint.
+    assert.soon(function() {
+        const filter = {"msg": kFailPointName, "command.getMore": cursorId};
+        const ops = primary.getDB("admin")
+                        .aggregate([{$currentOp: {allUsers: true}}, {$match: filter}])
+                        .toArray();
+        return ops.length == 1;
+    });
+
+    // Now step down the node. This will cause the getMore operation to be marked as killed, though
+    // it will never check for interrupt because it's hanging on a failpoint that's past any
+    // interrupt checks. When the operation is done using its cursor, it will realize that its been
+    // killed and mark the _cursor_ as having been killed, so that further attempts to use it will
+    // fail.
+    assert.throws(function() {
+        primary.adminCommand({replSetStepDown: ReplSetTest.kDefaultTimeoutMS, force: true});
+    });
+
+    assert.commandWorked(primary.adminCommand({configureFailPoint: kFailPointName, mode: 'off'}));
+    getMoreJoiner();
+
+    // Close the cursor with an OP_KILL_CURSORS.
+    (function() {
+        testDB.getMongo().forceReadMode("legacy");
+
+        const curs = new DBCommandCursor(testDB, csCmdRes);
+        curs.close();
+
+        testDB.getMongo().forceReadMode("commands");
+    })();
+
+    rst.stopSet();
+})();
diff --git a/src/mongo/db/cursor_manager.cpp b/src/mongo/db/cursor_manager.cpp
index 092e95eedb0..03caf70537b 100644
--- a/src/mongo/db/cursor_manager.cpp
+++ b/src/mongo/db/cursor_manager.cpp
@@ -185,9 +185,8 @@ bool GlobalCursorIdCache::killCursor(OperationContext* opCtx, CursorId id, bool
     if (CursorManager::isGloballyManagedCursor(id)) {
         auto pin = globalCursorManager->pinCursor(opCtx, id, CursorManager::kNoCheckSession);
         if (!pin.isOK()) {
-            invariant(pin == ErrorCodes::CursorNotFound || pin == ErrorCodes::Unauthorized);
-            // No such cursor.  TODO: Consider writing to audit log here (even though we don't
-            // have a namespace).
+            // Either the cursor doesn't exist, or it was killed during the last time it was being
+            // used, and was cleaned up after this call. Either way, we cannot kill it.
             return false;
         }
         nss = pin.getValue().getCursor()->nss();
@@ -196,8 +195,7 @@ bool GlobalCursorIdCache::killCursor(OperationContext* opCtx, CursorId id, bool
         uint32_t nsid = idFromCursorId(id);
         IdToNssMap::const_iterator it = _idToNss.find(nsid);
         if (it == _idToNss.end()) {
-            // No namespace corresponding to this cursor id prefix.  TODO: Consider writing to
-            // audit log here (even though we don't have a namespace).
+            // No namespace corresponding to this cursor id prefix.
             return false;
         }
         nss = it->second;