From 83e1a7097433b98109bbdf67f4ae3eb2421c926f Mon Sep 17 00:00:00 2001 From: Charlie Swanson Date: Fri, 30 Mar 2018 15:54:57 -0400 Subject: SERVER-33942 Avoid setting deadline for maxTimeMS on tailable cursors --- .../tailable_getmore_to_mongos_does_not_timeout.js | 29 +++++++++++ .../tailable_getmore_does_not_timeout.js | 57 ++++++++++++++++++++++ src/mongo/db/commands/getmore_cmd.cpp | 1 - src/mongo/db/operation_context.h | 8 --- src/mongo/db/service_entry_point_common.cpp | 22 +++++---- src/mongo/s/commands/cluster_getmore_cmd.cpp | 4 ++ src/mongo/s/commands/strategy.cpp | 30 +++++++----- src/mongo/s/query/cluster_find.cpp | 9 ++-- 8 files changed, 123 insertions(+), 37 deletions(-) create mode 100644 jstests/noPassthrough/tailable_getmore_to_mongos_does_not_timeout.js create mode 100644 jstests/noPassthroughWithMongod/tailable_getmore_does_not_timeout.js diff --git a/jstests/noPassthrough/tailable_getmore_to_mongos_does_not_timeout.js b/jstests/noPassthrough/tailable_getmore_to_mongos_does_not_timeout.js new file mode 100644 index 00000000000..7c37d73ebdd --- /dev/null +++ b/jstests/noPassthrough/tailable_getmore_to_mongos_does_not_timeout.js @@ -0,0 +1,29 @@ +// Tests that specifying a maxTimeMS on a getMore request to mongos is not interpreted as a deadline +// for the operationfor a tailable + awaitData cursor. +// This test was designed to reproduce SERVER-33942 against a mongos. +// @tags: [requires_sharding] +(function() { + "use strict"; + + const st = new ShardingTest({shards: 2}); + + const db = st.s.getDB("test"); + const coll = db.capped; + assert.commandWorked(db.runCommand({create: "capped", capped: true, size: 1024})); + assert.writeOK(coll.insert({})); + const findResult = assert.commandWorked( + db.runCommand({find: "capped", filter: {}, tailable: true, awaitData: true})); + + const cursorId = findResult.cursor.id; + assert.neq(cursorId, 0); + + // Test that the getMores on this tailable cursor are immune to interrupt. + assert.commandWorked( + db.adminCommand({configureFailPoint: "maxTimeAlwaysTimeOut", mode: "alwaysOn"})); + assert.commandWorked(db.runCommand({getMore: cursorId, collection: "capped", maxTimeMS: 30})); + assert.commandWorked(db.runCommand({getMore: cursorId, collection: "capped"})); + assert.commandWorked( + db.adminCommand({configureFailPoint: "maxTimeAlwaysTimeOut", mode: "off"})); + + st.stop(); +}()); diff --git a/jstests/noPassthroughWithMongod/tailable_getmore_does_not_timeout.js b/jstests/noPassthroughWithMongod/tailable_getmore_does_not_timeout.js new file mode 100644 index 00000000000..0e8d05db09b --- /dev/null +++ b/jstests/noPassthroughWithMongod/tailable_getmore_does_not_timeout.js @@ -0,0 +1,57 @@ +// Tests that specifying a maxTimeMS on a getMore for a tailable + awaitData cursor is not +// interpreted as a deadline for the operation. +// This test was designed to reproduce SERVER-33942 against a mongod. +(function() { + "use strict"; + + const coll = db.tailable_getmore_no_timeout; + coll.drop(); + + assert.commandWorked(db.runCommand({create: coll.getName(), capped: true, size: 1024})); + + for (let i = 0; i < 10; ++i) { + assert.writeOK(coll.insert({_id: i})); + } + + const findResponse = assert.commandWorked( + db.runCommand({find: coll.getName(), filter: {}, tailable: true, awaitData: true})); + const cursorId = findResponse.cursor.id; + assert.neq(0, cursorId); + + // Start an operation in a parallel shell that holds the lock for a while. + const awaitSleepShell = startParallelShell( + () => assert.commandFailedWithCode(db.adminCommand({sleep: 1, lock: "w", secs: 600}), + ErrorCodes.Interrupted)); + + // Start a getMore and verify that it is waiting for the lock. + const getMoreMaxTimeMS = 10; + const awaitGetMoreShell = startParallelShell(` + // Wait for the sleep command to take the lock. + assert.soon(() => db.getSiblingDB("admin") + .currentOp({"command.sleep": 1, active: true}) + .inprog.length === 1); + // Start the getMore with a low maxTimeMS. + assert.commandWorked(db.runCommand({ + getMore: ${cursorId.toString()}, + collection: "${coll.getName()}", + maxTimeMS: ${getMoreMaxTimeMS} + })); + `); + + // Wait to see the getMore waiting on the lock. + assert.soon( + () => + db.currentOp({"command.getMore": cursorId, waitingForLock: true}).inprog.length === 1); + + // Sleep for twice the maxTimeMS to prove that the getMore won't time out waiting for the lock. + sleep(2 * getMoreMaxTimeMS); + + // Then kill the command with the lock, allowing the getMore to continue successfully. + const sleepOps = db.getSiblingDB("admin").currentOp({"command.sleep": 1, active: true}).inprog; + assert.eq(sleepOps.length, 1); + const sleepOpId = sleepOps[0].opid; + assert.commandWorked(db.adminCommand({killOp: 1, op: sleepOpId})); + + awaitSleepShell(); + awaitGetMoreShell(); +}()); diff --git a/src/mongo/db/commands/getmore_cmd.cpp b/src/mongo/db/commands/getmore_cmd.cpp index 12102bf0ed4..e62e3667afd 100644 --- a/src/mongo/db/commands/getmore_cmd.cpp +++ b/src/mongo/db/commands/getmore_cmd.cpp @@ -384,7 +384,6 @@ public: // any leftover time from the maxTimeMS of the operation that spawned this cursor, // applying it to this getMore. if (cursor->isAwaitData() && !disableAwaitDataFailpointActive) { - opCtx->clearDeadline(); awaitDataState(opCtx).waitForInsertsDeadline = opCtx->getServiceContext()->getPreciseClockSource()->now() + request.awaitDataTimeout.value_or(Seconds{1}); diff --git a/src/mongo/db/operation_context.h b/src/mongo/db/operation_context.h index 7c7862c6629..1e53a5648e5 100644 --- a/src/mongo/db/operation_context.h +++ b/src/mongo/db/operation_context.h @@ -405,14 +405,6 @@ public: return _deadline; } - /** - * Reset the deadline for this operation. - */ - void clearDeadline() { - _deadline = Date_t::max(); - _maxTime = computeMaxTimeFromDeadline(_deadline); - } - /** * Returns the number of milliseconds remaining for this operation's time limit or * Milliseconds::max() if the operation has no time limit. diff --git a/src/mongo/db/service_entry_point_common.cpp b/src/mongo/db/service_entry_point_common.cpp index d1b46dccb16..f460ffa9cd3 100644 --- a/src/mongo/db/service_entry_point_common.cpp +++ b/src/mongo/db/service_entry_point_common.cpp @@ -573,7 +573,6 @@ void execCommandDatabase(OperationContext* opCtx, BSONElement cmdOptionMaxTimeMSField; BSONElement allowImplicitCollectionCreationField; BSONElement helpField; - BSONElement queryOptionMaxTimeMSField; StringMap topLevelFields; for (auto&& element : request.body) { @@ -585,7 +584,8 @@ void execCommandDatabase(OperationContext* opCtx, } else if (fieldName == CommandHelpers::kHelpFieldName) { helpField = element; } else if (fieldName == QueryRequest::queryOptionMaxTimeMS) { - queryOptionMaxTimeMSField = element; + uasserted(ErrorCodes::InvalidOptions, + "no such command option $maxTimeMs; use maxTimeMS instead"); } uassert(ErrorCodes::FailedToParse, @@ -658,14 +658,16 @@ void execCommandDatabase(OperationContext* opCtx, opCounters->gotCommand(); } - // Handle command option maxTimeMS. - int maxTimeMS = uassertStatusOK(QueryRequest::parseMaxTimeMS(cmdOptionMaxTimeMSField)); - - uassert(ErrorCodes::InvalidOptions, - "no such command option $maxTimeMs; use maxTimeMS instead", - queryOptionMaxTimeMSField.eoo()); - - if (maxTimeMS > 0) { + // Parse the 'maxTimeMS' command option, and use it to set a deadline for the operation on + // the OperationContext. The 'maxTimeMS' option unfortunately has a different meaning for a + // getMore command, where it is used to communicate the maximum time to wait for new inserts + // on tailable cursors, not as a deadline for the operation. + // TODO SERVER-34277 Remove the special handling for maxTimeMS for getMores. This will + // require introducing a new 'max await time' parameter for getMore, and eventually banning + // maxTimeMS altogether on a getMore command. + const int maxTimeMS = + uassertStatusOK(QueryRequest::parseMaxTimeMS(cmdOptionMaxTimeMSField)); + if (maxTimeMS > 0 && command->getLogicalOp() != LogicalOp::opGetMore) { uassert(40119, "Illegal attempt to set operation deadline within DBDirectClient", !opCtx->getClient()->isInDirectClient()); diff --git a/src/mongo/s/commands/cluster_getmore_cmd.cpp b/src/mongo/s/commands/cluster_getmore_cmd.cpp index 70e8bdc7048..5455eb0f1ec 100644 --- a/src/mongo/s/commands/cluster_getmore_cmd.cpp +++ b/src/mongo/s/commands/cluster_getmore_cmd.cpp @@ -81,6 +81,10 @@ public: return "retrieve more documents for a cursor id"; } + LogicalOp getLogicalOp() const final { + return LogicalOp::opGetMore; + } + Status checkAuthForCommand(Client* client, const std::string& dbname, const BSONObj& cmdObj) const final { diff --git a/src/mongo/s/commands/strategy.cpp b/src/mongo/s/commands/strategy.cpp index baafa3bbce6..fa70221c4fd 100644 --- a/src/mongo/s/commands/strategy.cpp +++ b/src/mongo/s/commands/strategy.cpp @@ -280,18 +280,6 @@ void runCommand(OperationContext* opCtx, const OpMsgRequest& request, const NetworkOp opType, BSONObjBuilder&& builder) { - // Handle command option maxTimeMS first thing while processing the command so that the - // subsequent code has the deadline available - uassert(ErrorCodes::InvalidOptions, - "no such command option $maxTimeMs; use maxTimeMS instead", - request.body[QueryRequest::queryOptionMaxTimeMS].eoo()); - - const int maxTimeMS = uassertStatusOK( - QueryRequest::parseMaxTimeMS(request.body[QueryRequest::cmdOptionMaxTimeMS])); - if (maxTimeMS > 0) { - opCtx->setDeadlineAfterNowBy(Milliseconds{maxTimeMS}); - } - auto const commandName = request.getCommandName(); auto const command = CommandHelpers::findCommand(commandName); if (!command) { @@ -303,6 +291,24 @@ void runCommand(OperationContext* opCtx, return; } + // Parse the 'maxTimeMS' command option, and use it to set a deadline for the operation on + // the OperationContext. Be sure to do this as soon as possible so that further processing by + // subsequent code has the deadline available. The 'maxTimeMS' option unfortunately has a + // different meaning for a getMore command, where it is used to communicate the maximum time to + // wait for new inserts on tailable cursors, not as a deadline for the operation. + // TODO SERVER-34277 Remove the special handling for maxTimeMS for getMores. This will + // require introducing a new 'max await time' parameter for getMore, and eventually banning + // maxTimeMS altogether on a getMore command. + uassert(ErrorCodes::InvalidOptions, + "no such command option $maxTimeMs; use maxTimeMS instead", + request.body[QueryRequest::queryOptionMaxTimeMS].eoo()); + const int maxTimeMS = uassertStatusOK( + QueryRequest::parseMaxTimeMS(request.body[QueryRequest::cmdOptionMaxTimeMS])); + if (maxTimeMS > 0 && command->getLogicalOp() != LogicalOp::opGetMore) { + opCtx->setDeadlineAfterNowBy(Milliseconds{maxTimeMS}); + } + opCtx->checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point. + // Set the logical optype, command object and namespace as soon as we identify the command. If // the command does not define a fully-qualified namespace, set CurOp to the generic command // namespace db.$cmd. diff --git a/src/mongo/s/query/cluster_find.cpp b/src/mongo/s/query/cluster_find.cpp index 52726fba207..8c9e654f5bb 100644 --- a/src/mongo/s/query/cluster_find.cpp +++ b/src/mongo/s/query/cluster_find.cpp @@ -384,12 +384,9 @@ Status setUpOperationContextStateForGetMore(OperationContext* opCtx, } if (cursor->isTailableAndAwaitData()) { - // A maxTimeMS specified on a tailable, awaitData cursor is special. Instead of imposing a - // deadline on the operation, it is used to communicate how long the server should wait for - // new results. Here we clear any deadline set during command processing and track the - // deadline instead via the 'waitForInsertsDeadline' decoration. This deadline defaults to - // 1 second if the user didn't specify a maxTimeMS. - opCtx->clearDeadline(); + // For tailable + awaitData cursors, the request may have indicated a maximum amount of time + // to wait for new data. If not, default it to 1 second. We track the deadline instead via + // the 'waitForInsertsDeadline' decoration. auto timeout = request.awaitDataTimeout.value_or(Milliseconds{1000}); awaitDataState(opCtx).waitForInsertsDeadline = opCtx->getServiceContext()->getPreciseClockSource()->now() + timeout; -- cgit v1.2.1