SERVER-47117 clear maxTimeMSForHedgedReads delay from the getMore on hedge reads

(cherry picked from commit d05acff26c4c5a7fb27e9d0c7e31e4a3167fa7fe)
author: Misha Tyulenev <misha@mongodb.com> 2020-04-16 14:04:31 -0400
committer: Evergreen Agent <no-reply@evergreen.mongodb.com> 2020-04-23 16:36:41 +0000
commit: 7ee6f5f0157eea08876c705dcf6e02309d6153da (patch)
tree: dd69a74576b02be018e9a0afd3e1083a6ae9d1b9
parent: c3b47f6e122ae36ff297944a9d90ab9f1151ec17 (diff)
download: mongo-7ee6f5f0157eea08876c705dcf6e02309d6153da.tar.gz
10 files changed, 140 insertions, 51 deletions
diff --git a/buildscripts/resmokeconfig/suites/causally_consistent_hedged_reads_jscore_passthrough.yml b/buildscripts/resmokeconfig/suites/causally_consistent_hedged_reads_jscore_passthrough.yml
index e0d35ae3acb..71fb2fb59ae 100644
--- a/buildscripts/resmokeconfig/suites/causally_consistent_hedged_reads_jscore_passthrough.yml
+++ b/buildscripts/resmokeconfig/suites/causally_consistent_hedged_reads_jscore_passthrough.yml
@@ -137,7 +137,7 @@ executor:
     mongos_options:
       set_parameters:
         enableTestCommands: 1
-        #TODO SERVER-47117
+        #TODO SERVER-47543
         maxTimeMSForHedgedReads: 10000
         logComponentVerbosity:
           verbosity: 0
diff --git a/jstests/sharding/hedged_reads.js b/jstests/sharding/hedged_reads.js
index f1d2e1a97ab..105fed6cb8c 100644
--- a/jstests/sharding/hedged_reads.js
+++ b/jstests/sharding/hedged_reads.js
@@ -74,10 +74,18 @@ const testDB = st.s.getDB(dbName);
 
 const kBlockCmdTimeMS = 5 * 60 * 1000;
 const kWaitKillOpTimeoutMS = 5 * 1000;
+const numDocs = 10;
 
 assert.commandWorked(st.s.adminCommand({enableSharding: dbName}));
 st.ensurePrimaryShard(dbName, st.shard0.shardName);
 
+let bulk = testDB[collName].initializeUnorderedBulkOp();
+for (let i = 0; i < numDocs; i++) {
+    bulk.insert({x: i});
+}
+assert.commandWorked(bulk.execute());
+assert.commandWorked(st.s.adminCommand({flushRouterConfig: 1}));
+
 jsTest.log(
     "Verify that maxTimeMS expiration of the additional request does not affect the command result");
 // The hedged read will have the maxTimeMS set to 10ms, hence need to sleep longer than that.
@@ -113,6 +121,7 @@ try {
         kWaitKillOpTimeoutMS);
 } finally {
     clearCommandDelay(sortedNodes[0]);
+    clearCommandDelay(sortedNodes[1]);
 }
 
 jsTest.log(
@@ -142,8 +151,38 @@ try {
         "Timed out waiting for the operation run by the additional request to be killed",
         kWaitKillOpTimeoutMS);
 } finally {
+    clearCommandDelay(sortedNodes[0]);
     clearCommandDelay(sortedNodes[1]);
 }
 
+jsTest.log("Verify that the getMore on hedge request do not inherit maxTimeMS");
+try {
+    assert.commandWorked(st.s.adminCommand({setParameter: 1, maxTimeMSForHedgedReads: 100}));
+
+    // force to open hedge read cursor on sortedNodes[1]
+    setCommandDelay(sortedNodes[0], "find", 100, ns);
+
+    // $where with sleep is used because blocking command via failCommand does not affect the opCtx
+    // deadlines as it blocks and unblocks the command before it starts execution.
+    const comment = "test_getmore_on_additional_request_" + ObjectId();
+    let findRes = assert.commandWorked(testDB.runCommand({
+        find: collName,
+        filter: {$where: "sleep(200); return true;", x: {$gte: 0}},
+        $readPreference: {mode: "nearest"},
+        batchSize: 0,
+        comment: comment
+    }));
+
+    const cursorId = findRes.cursor.id;
+    assert.neq(0, cursorId);
+
+    // confirm that getMore does not time out.
+    let getMoreRes =
+        assert.commandWorked(testDB.runCommand({getMore: cursorId, collection: collName}));
+    assert.eq(getMoreRes.cursor.nextBatch.length, numDocs);
+} finally {
+    clearCommandDelay(sortedNodes[0]);
+}
+
 st.stop();
 }());
diff --git a/src/mongo/db/command_generic_argument.cpp b/src/mongo/db/command_generic_argument.cpp
index d03b4e64eae..85556c371a6 100644
--- a/src/mongo/db/command_generic_argument.cpp
+++ b/src/mongo/db/command_generic_argument.cpp
@@ -52,7 +52,7 @@ struct SpecialArgRecord {
 // If that changes, it should be added. When you add to this list, consider whether you
 // should also change the filterCommandRequestForPassthrough() function.
 // clang-format off
-static constexpr std::array<SpecialArgRecord, 28> specials{{
+static constexpr std::array<SpecialArgRecord, 29> specials{{
     //                                       /-isGeneric
     //                                       |  /-stripFromRequest
     //                                       |  |  /-stripFromReply
@@ -83,7 +83,8 @@ static constexpr std::array<SpecialArgRecord, 28> specials{{
     {"operationTime"_sd,                     0, 0, 1},
     {"lastCommittedOpTime"_sd,               0, 0, 1},
     {"readOnly"_sd,                          0, 0, 1},
-    {"comment"_sd,                           1, 0, 0}}};
+    {"comment"_sd,                           1, 0, 0},
+    {"maxTimeMSOpOnly"_sd,                   1, 0, 0}}};
 // clang-format on
 
 template <bool SpecialArgRecord::*pmo>
diff --git a/src/mongo/db/cursor_manager.cpp b/src/mongo/db/cursor_manager.cpp
index eb7359d9f4d..c24abcdd8b5 100644
--- a/src/mongo/db/cursor_manager.cpp
+++ b/src/mongo/db/cursor_manager.cpp
@@ -367,6 +367,11 @@ ClientCursorPin CursorManager::registerCursor(OperationContext* opCtx,
         _opKeyMap.emplace(*opKey, cursorId);
     }
 
+    // Restores the maxTimeMS provided in the cursor generating command in the case it used
+    // maxTimeMSOpOnly. This way the pinned cursor will have the leftover time consistent with the
+    // maxTimeMS.
+    opCtx->restoreMaxTimeMS();
+
     return ClientCursorPin(opCtx, unownedCursor, this);
 }
 
diff --git a/src/mongo/db/operation_context.cpp b/src/mongo/db/operation_context.cpp
index 5401f36c20a..260de1bf20b 100644
--- a/src/mongo/db/operation_context.cpp
+++ b/src/mongo/db/operation_context.cpp
@@ -181,6 +181,27 @@ Microseconds OperationContext::getRemainingMaxTimeMicros() const {
     return _maxTime - getElapsedTime();
 }
 
+void OperationContext::restoreMaxTimeMS() {
+    if (!_storedMaxTime) {
+        return;
+    }
+
+    auto maxTime = *_storedMaxTime;
+    _storedMaxTime = boost::none;
+
+    if (maxTime <= Microseconds::zero()) {
+        maxTime = Microseconds::max();
+    }
+
+    if (maxTime == Microseconds::max()) {
+        _deadline = Date_t::max();
+    } else {
+        auto clock = getServiceContext()->getFastClockSource();
+        _deadline = clock->now() + clock->getPrecision() + maxTime - _elapsedTime.elapsed();
+    }
+    _maxTime = maxTime;
+}
+
 namespace {
 
 // Helper function for checkForInterrupt fail point.  Decides whether the operation currently
diff --git a/src/mongo/db/operation_context.h b/src/mongo/db/operation_context.h
index c1e224bfee7..ea565afd52c 100644
--- a/src/mongo/db/operation_context.h
+++ b/src/mongo/db/operation_context.h
@@ -423,6 +423,15 @@ public:
         return _exhaust;
     }
 
+    void storeMaxTimeMS(Microseconds maxTime) {
+        _storedMaxTime = maxTime;
+    }
+
+    /**
+     * Restore deadline to match the value stored in _storedMaxTime.
+     */
+    void restoreMaxTimeMS();
+
 private:
     StatusWith<stdx::cv_status> waitForConditionOrInterruptNoAssertUntil(
         stdx::condition_variable& cv, BasicLockableAdapter m, Date_t deadline) noexcept override;
@@ -540,12 +549,15 @@ private:
     bool _isExecutingShutdown = false;
 
     // Max operation time requested by the user or by the cursor in the case of a getMore with no
-    // user-specified maxTime. This is tracked with microsecond granularity for the purpose of
+    // user-specified maxTimeMS. This is tracked with microsecond granularity for the purpose of
     // assigning unused execution time back to a cursor at the end of an operation, only. The
     // _deadline and the service context's fast clock are the only values consulted for determining
     // if the operation's timelimit has been exceeded.
     Microseconds _maxTime = Microseconds::max();
 
+    // The value of the maxTimeMS requested by user in the case it was overwritten.
+    boost::optional<Microseconds> _storedMaxTime;
+
     // Timer counting the elapsed time since the construction of this OperationContext.
     Timer _elapsedTime;
 
diff --git a/src/mongo/db/query/query_request.cpp b/src/mongo/db/query/query_request.cpp
index dd80516535c..f5ac96d3d43 100644
--- a/src/mongo/db/query/query_request.cpp
+++ b/src/mongo/db/query/query_request.cpp
@@ -110,6 +110,7 @@ const char QueryRequest::kInternalReadAtClusterTimeField[] = "$_internalReadAtCl
 const char QueryRequest::kRequestResumeTokenField[] = "$_requestResumeToken";
 const char QueryRequest::kResumeAfterField[] = "$_resumeAfter";
 const char QueryRequest::kUse44SortKeys[] = "_use44SortKeys";
+const char QueryRequest::kMaxTimeMSOpOnlyField[] = "maxTimeMSOpOnly";
 
 // Field names for sorting options.
 const char QueryRequest::kNaturalSortField[] = "$natural";
diff --git a/src/mongo/db/query/query_request.h b/src/mongo/db/query/query_request.h
index 3593489cec8..115e8de7b60 100644
--- a/src/mongo/db/query/query_request.h
+++ b/src/mongo/db/query/query_request.h
@@ -80,6 +80,7 @@ public:
     static const char kRequestResumeTokenField[];
     static const char kResumeAfterField[];
     static const char kUse44SortKeys[];
+    static const char kMaxTimeMSOpOnlyField[];
 
     static const char kNaturalSortField[];
 
diff --git a/src/mongo/db/service_entry_point_common.cpp b/src/mongo/db/service_entry_point_common.cpp
index f773374f44c..0301647444c 100644
--- a/src/mongo/db/service_entry_point_common.cpp
+++ b/src/mongo/db/service_entry_point_common.cpp
@@ -981,6 +981,7 @@ void execCommandDatabase(OperationContext* opCtx,
         std::unique_ptr<MaintenanceModeSetter> mmSetter;
 
         BSONElement cmdOptionMaxTimeMSField;
+        BSONElement maxTimeMSOpOnlyField;
         BSONElement allowImplicitCollectionCreationField;
         BSONElement helpField;
 
@@ -989,6 +990,11 @@ void execCommandDatabase(OperationContext* opCtx,
             StringData fieldName = element.fieldNameStringData();
             if (fieldName == QueryRequest::cmdOptionMaxTimeMS) {
                 cmdOptionMaxTimeMSField = element;
+            } else if (fieldName == QueryRequest::kMaxTimeMSOpOnlyField) {
+                uassert(ErrorCodes::InvalidOptions,
+                        "Can not specify maxTimeMSOpOnly for non internal clients",
+                        isInternalClient);
+                maxTimeMSOpOnlyField = element;
             } else if (fieldName == "allowImplicitCollectionCreation") {
                 allowImplicitCollectionCreationField = element;
             } else if (fieldName == CommandHelpers::kHelpFieldName) {
@@ -1008,9 +1014,9 @@ void execCommandDatabase(OperationContext* opCtx,
 
         if (CommandHelpers::isHelpRequest(helpField)) {
             CurOp::get(opCtx)->ensureStarted();
-            // We disable last-error for help requests due to SERVER-11492, because config servers
-            // use help requests to determine which commands are database writes, and so must be
-            // forwarded to all config servers.
+            // We disable last-error for help requests due to SERVER-11492, because config
+            // servers use help requests to determine which commands are database writes, and so
+            // must be forwarded to all config servers.
             LastError::get(opCtx->getClient()).disable();
             Command::generateHelpResponse(opCtx, replyBuilder, *command);
             return;
@@ -1076,20 +1082,28 @@ void execCommandDatabase(OperationContext* opCtx,
             opCounters->gotCommand();
         }
 
-        // Parse the 'maxTimeMS' command option, and use it to set a deadline for the operation on
-        // the OperationContext. The 'maxTimeMS' option unfortunately has a different meaning for a
-        // getMore command, where it is used to communicate the maximum time to wait for new inserts
-        // on tailable cursors, not as a deadline for the operation.
+        // Parse the 'maxTimeMS' command option, and use it to set a deadline for the operation
+        // on the OperationContext. The 'maxTimeMS' option unfortunately has a different meaning
+        // for a getMore command, where it is used to communicate the maximum time to wait for
+        // new inserts on tailable cursors, not as a deadline for the operation.
         // TODO SERVER-34277 Remove the special handling for maxTimeMS for getMores. This will
-        // require introducing a new 'max await time' parameter for getMore, and eventually banning
-        // maxTimeMS altogether on a getMore command.
-        const int maxTimeMS =
-            uassertStatusOK(QueryRequest::parseMaxTimeMS(cmdOptionMaxTimeMSField));
-        if (maxTimeMS > 0 && command->getLogicalOp() != LogicalOp::opGetMore) {
+        // require introducing a new 'max await time' parameter for getMore, and eventually
+        // banning maxTimeMS altogether on a getMore command.
+        int maxTimeMS = uassertStatusOK(QueryRequest::parseMaxTimeMS(cmdOptionMaxTimeMSField));
+        int maxTimeMSOpOnly = uassertStatusOK(QueryRequest::parseMaxTimeMS(maxTimeMSOpOnlyField));
+
+        if ((maxTimeMS > 0 || maxTimeMSOpOnly > 0) &&
+            command->getLogicalOp() != LogicalOp::opGetMore) {
             uassert(40119,
                     "Illegal attempt to set operation deadline within DBDirectClient",
                     !opCtx->getClient()->isInDirectClient());
-            opCtx->setDeadlineAfterNowBy(Milliseconds{maxTimeMS}, ErrorCodes::MaxTimeMSExpired);
+            if (maxTimeMSOpOnly > 0 && (maxTimeMS == 0 || maxTimeMSOpOnly < maxTimeMS)) {
+                opCtx->storeMaxTimeMS(Milliseconds{maxTimeMS});
+                opCtx->setDeadlineAfterNowBy(Milliseconds{maxTimeMSOpOnly},
+                                             ErrorCodes::MaxTimeMSExpired);
+            } else if (maxTimeMS > 0) {
+                opCtx->setDeadlineAfterNowBy(Milliseconds{maxTimeMS}, ErrorCodes::MaxTimeMSExpired);
+            }
         }
 
         auto& readConcernArgs = repl::ReadConcernArgs::get(opCtx);
@@ -1130,8 +1144,8 @@ void execCommandDatabase(OperationContext* opCtx,
             opCtx->lockState()->setShouldConflictWithSecondaryBatchApplication(false);
         }
 
-        // Remember whether or not this operation is starting a transaction, in case something later
-        // in the execution needs to adjust its behavior based on this.
+        // Remember whether or not this operation is starting a transaction, in case something
+        // later in the execution needs to adjust its behavior based on this.
         opCtx->setIsStartingMultiDocumentTransaction(startTransaction);
 
         auto& oss = OperationShardingState::get(opCtx);
@@ -1156,9 +1170,10 @@ void execCommandDatabase(OperationContext* opCtx,
         // This may trigger the maxTimeAlwaysTimeOut failpoint.
         auto status = opCtx->checkForInterruptNoAssert();
 
-        // We still proceed if the primary stepped down, but accept other kinds of interruptions.
-        // We defer to individual commands to allow themselves to be interruptible by stepdowns,
-        // since commands like 'voteRequest' should conversely continue executing.
+        // We still proceed if the primary stepped down, but accept other kinds of
+        // interruptions. We defer to individual commands to allow themselves to be
+        // interruptible by stepdowns, since commands like 'voteRequest' should conversely
+        // continue executing.
         if (status != ErrorCodes::PrimarySteppedDown &&
             status != ErrorCodes::InterruptedDueToReplStateChange) {
             uassertStatusOK(status);
@@ -1208,10 +1223,10 @@ void execCommandDatabase(OperationContext* opCtx,
             auto engine = opCtx->getServiceContext()->getStorageEngine();
             invariant(engine && engine->supportsReadConcernSnapshot());
 
-            // SnapshotTooOld errors indicate that PIT ops are failing to find an available snapshot
-            // at their specified atClusterTime. Therefore, we'll try to increase the snapshot
-            // history window that the storage engine maintains in order to increase the likelihood
-            // of successful future PIT atClusterTime requests.
+            // SnapshotTooOld errors indicate that PIT ops are failing to find an available
+            // snapshot at their specified atClusterTime. Therefore, we'll try to increase the
+            // snapshot history window that the storage engine maintains in order to increase
+            // the likelihood of successful future PIT atClusterTime requests.
             SnapshotWindowUtil::incrementSnapshotTooOldErrorCount();
             SnapshotWindowUtil::increaseTargetSnapshotWindowSize(opCtx);
         } else {
@@ -1349,7 +1364,8 @@ DbResponse receivedCommands(OperationContext* opCtx,
                             ServiceEntryPointCommon::getRedactedCopyForLogging(c, request.body)));
 
             {
-                // Try to set this as early as possible, as soon as we have figured out the command.
+                // Try to set this as early as possible, as soon as we have figured out the
+                // command.
                 stdx::lock_guard<Client> lk(*opCtx->getClient());
                 CurOp::get(opCtx)->setLogicalOp_inlock(c->getLogicalOp());
             }
@@ -1588,12 +1604,12 @@ DbResponse receivedGetMore(OperationContext* opCtx,
             // Make sure that killCursorGlobal does not throw an exception if it is interrupted.
             UninterruptibleLockGuard noInterrupt(opCtx->lockState());
 
-            // If an error was thrown prior to auth checks, then the cursor should remain alive in
-            // order to prevent an unauthorized user from resulting in the death of a cursor. In
-            // other error cases, the cursor is dead and should be cleaned up.
+            // If an error was thrown prior to auth checks, then the cursor should remain alive
+            // in order to prevent an unauthorized user from resulting in the death of a cursor.
+            // In other error cases, the cursor is dead and should be cleaned up.
             //
-            // If killing the cursor fails, ignore the error and don't try again. The cursor should
-            // be reaped by the client cursor timeout thread.
+            // If killing the cursor fails, ignore the error and don't try again. The cursor
+            // should be reaped by the client cursor timeout thread.
             CursorManager::get(opCtx)
                 ->killCursor(opCtx, cursorid, false /* shouldAudit */)
                 .ignore();
diff --git a/src/mongo/executor/network_interface_tl.cpp b/src/mongo/executor/network_interface_tl.cpp
index ea058794a53..b00c1024e74 100644
--- a/src/mongo/executor/network_interface_tl.cpp
+++ b/src/mongo/executor/network_interface_tl.cpp
@@ -47,7 +47,7 @@ namespace mongo {
 namespace executor {
 
 namespace {
-static inline const std::string kMaxTimeMSOptionName = "maxTimeMS";
+static inline const std::string kMaxTimeMSOpOnlyField = "maxTimeMSOpOnly";
 }  // unnamed namespace
 
 /**
@@ -704,26 +704,19 @@ void NetworkInterfaceTL::RequestManager::trySend(
 
     if (requestState->isHedge) {
         invariant(cmdStatePtr->requestOnAny.hedgeOptions);
-
-        // Attach a maxTimeMS to the request.
         auto maxTimeMS = request.hedgeOptions->maxTimeMSForHedgedReads;
-        if (request.timeout == request.kNoTimeout || request.timeout > Milliseconds(maxTimeMS)) {
-            BSONObjBuilder updatedCmdBuilder;
-            for (const auto& elem : request.cmdObj) {
-                if (elem.fieldNameStringData() != kMaxTimeMSOptionName) {
-                    updatedCmdBuilder.append(elem);
-                }
-            }
-            updatedCmdBuilder.append(kMaxTimeMSOptionName, maxTimeMS);
-            request.cmdObj = updatedCmdBuilder.obj();
 
-            LOGV2_DEBUG(4647200,
-                        2,
-                        "Set maxTimeMS for request",
-                        "maxTimeMS"_attr = maxTimeMS,
-                        "request_id"_attr = cmdStatePtr->requestOnAny.id,
-                        "target"_attr = cmdStatePtr->requestOnAny.target[idx]);
-        }
+        BSONObjBuilder updatedCmdBuilder;
+        updatedCmdBuilder.appendElements(request.cmdObj);
+        updatedCmdBuilder.append(kMaxTimeMSOpOnlyField, maxTimeMS);
+        request.cmdObj = updatedCmdBuilder.obj();
+
+        LOGV2_DEBUG(4647200,
+                    2,
+                    "Setup hedge request",
+                    "request_id"_attr = cmdStatePtr->requestOnAny.id,
+                    "request"_attr = redact(request.toString()),
+                    "target"_attr = cmdStatePtr->requestOnAny.target[idx]);
 
         if (cmdStatePtr->interface->_svcCtx) {
             auto hm = HedgingMetrics::get(cmdStatePtr->interface->_svcCtx);
author	Misha Tyulenev <misha@mongodb.com>	2020-04-16 14:04:31 -0400
committer	Evergreen Agent <no-reply@evergreen.mongodb.com>	2020-04-23 16:36:41 +0000
commit	7ee6f5f0157eea08876c705dcf6e02309d6153da (patch)
tree	dd69a74576b02be018e9a0afd3e1083a6ae9d1b9
parent	c3b47f6e122ae36ff297944a9d90ab9f1151ec17 (diff)
download	mongo-7ee6f5f0157eea08876c705dcf6e02309d6153da.tar.gz