summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAmirsaman Memaripour <amirsaman.memaripour@mongodb.com>2022-08-24 17:17:30 +0000
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2022-09-08 20:32:08 +0000
commit97ba88b52784d3c81a23a2994f50d16f3bf2dab0 (patch)
tree611c2cf59812aca5b87a6b454ba4a13374261dfc
parent4218c040fd9bf7873b765ea5d3113f718c44e9eb (diff)
downloadmongo-97ba88b52784d3c81a23a2994f50d16f3bf2dab0.tar.gz
SERVER-67465 Ensure network timeouts do not fail hedged operations
(cherry picked from commit 1744ab66eafba2dcc6dd96d7fa0d0d77eeae35d8)
-rw-r--r--jstests/sharding/hedged_reads.js47
-rw-r--r--src/mongo/executor/network_interface_tl.cpp39
2 files changed, 70 insertions, 16 deletions
diff --git a/jstests/sharding/hedged_reads.js b/jstests/sharding/hedged_reads.js
index 5ae15750b4f..f696304e8e3 100644
--- a/jstests/sharding/hedged_reads.js
+++ b/jstests/sharding/hedged_reads.js
@@ -5,6 +5,7 @@
(function() {
"use strict";
+load("jstests/libs/parallel_shell_helpers.js");
load("jstests/libs/fail_point_util.js");
function setCommandDelay(nodeConn, command, delay, ns) {
@@ -52,6 +53,49 @@ function checkForOpWithComment(conn, comment) {
return true;
}
+/**
+ * Verifies that a `count` command sent through the cluster still succeeds when the hedged
+ * operation fails with a network timeout while the main operation succeeds.
+ *
+ * A parallel shell is started to run the `count` command (read preference `nearest`) and to
+ * assert that it worked. The `failCommand` fail-point is enabled on both nodes that are targeted
+ * to run the hedged operation:
+ *   - `fp1` (on `nodes[0]`) ensures the main target blocks the command until the fail-point is
+ *     disabled.
+ *   - `fp2` (on `nodes[1]`) makes the hedged target return a `NetworkInterfaceExceededTimeLimit`
+ *     error code as the result of running the command.
+ * The function waits on and disables `fp2` first, then waits on and disables `fp1`, and finally
+ * invokes the handle returned by `startParallelShell` (presumably joining the shell and surfacing
+ * its assertion failures -- confirm against the shell helpers).
+ *
+ * @param port - forwarded to `startParallelShell`; the caller passes the mongos port.
+ * @param dbName - database name; combined with `collName` to form the fail-point namespace.
+ * @param collName - collection the `count` command is run against.
+ * @param nodes - nodes to configure; `nodes[0]` is treated as the main target and `nodes[1]` as
+ *     the hedged target (assumes the caller orders them to match the router's targeting).
+ */
+function verifyCommandWorksWhenHedgeOperationsFailWithNetworkTimeout(
+ port, dbName, collName, nodes) {
+ jsTestLog("Verify that command works when hedged operations fail due to a network timeout");
+
+ const ns = dbName + "." + collName;
+ let fp1 = configureFailPoint(nodes[0], "failCommand", {
+ failCommands: ["count"],
+ failInternalCommands: true,
+ namespace: ns,
+ blockConnection: true,
+ blockTimeMS: 10000, // Not expected to elapse: the test turns `fp1` off before this timeout would unblock the command.
+ });
+
+ let fp2 = configureFailPoint(nodes[1], "failCommand", {
+ failCommands: ["count"],
+ failInternalCommands: true,
+ namespace: ns,
+ errorCode: NumberInt(ErrorCodes.NetworkInterfaceExceededTimeLimit),
+ });
+
+ const ps = startParallelShell(funWithArgs(function(dbName, collName) {
+ const testDB = db.getSiblingDB(dbName);
+ assert.commandWorked(testDB.runCommand(
+ {count: collName, $readPreference: {mode: "nearest"}}));
+ }, dbName, collName), port);
+
+ fp2.wait();
+ fp2.off();
+
+ fp1.wait();
+ fp1.off();
+
+ ps();
+}
+
const st = new ShardingTest({
mongos: [{
setParameter: {
@@ -95,6 +139,9 @@ assert.commandWorked(testDB.runCommand({
let sortedNodes = [...st.rs0.nodes].sort((node1, node2) => node1.host.localeCompare(node2.host));
+verifyCommandWorksWhenHedgeOperationsFailWithNetworkTimeout(
+ st.s0.port, dbName, collName, sortedNodes);
+
jsTest.log("Verify that the initial request is canceled when the hedged request responds first");
try {
// Make the initial request block.
diff --git a/src/mongo/executor/network_interface_tl.cpp b/src/mongo/executor/network_interface_tl.cpp
index 9a2695534dc..734ad7efb15 100644
--- a/src/mongo/executor/network_interface_tl.cpp
+++ b/src/mongo/executor/network_interface_tl.cpp
@@ -49,7 +49,21 @@ namespace executor {
namespace {
static inline const std::string kMaxTimeMSOpOnlyField = "maxTimeMSOpOnly";
-} // unnamed namespace
+
+/**
+ * Returns true if `status` is an error that must not, on its own, decide the outcome of a hedged
+ * operation. That is the case when `status` is `MaxTimeMSExpired` or `StaleDbVersion`, or when
+ * `ErrorCodes::isNetworkTimeoutError` or `ErrorCodes::isStaleShardVersionError` accepts it.
+ *
+ * We ignore this subset of errors because the overall operation may still succeed even though
+ * one hedged request failed. For example, a network timeout error indicates that the remote
+ * host experienced a timeout while running a remote command as part of executing the hedged
+ * operation. This is by no means an indication that the operation has failed, as other hedged
+ * operations may still succeed.
+ *
+ * The response handler in this file uses this predicate for hedged requests only: a skipped
+ * result is logged at debug level and the handler returns without arriving at the finish line.
+ *
+ * TODO SERVER-68704 will include other error categories that are safe to ignore.
+ */
+bool skipHedgeResult(const Status& status) {
+ return status == ErrorCodes::MaxTimeMSExpired || status == ErrorCodes::StaleDbVersion ||
+ ErrorCodes::isNetworkTimeoutError(status) || ErrorCodes::isStaleShardVersionError(status);
+}
+
+} // namespace
/**
* SynchronizedCounters is synchronized bucket of event counts for commands
@@ -822,21 +836,14 @@ void NetworkInterfaceTL::RequestState::resolve(Future<RemoteCommandResponse> fut
returnConnection(status);
const auto commandStatus = getStatusFromCommandResult(response.data);
- if (isHedge) {
- // Ignore maxTimeMS expiration, StaleDbVersion or any error belonging to
- // StaleShardVersionError
- // error category for hedged reads without triggering the finish line.
- if (commandStatus == ErrorCodes::MaxTimeMSExpired ||
- commandStatus == ErrorCodes::StaleDbVersion ||
- ErrorCodes::isStaleShardVersionError(commandStatus)) {
- LOGV2_DEBUG(4660701,
- 2,
- "Hedged request returned status",
- "requestId"_attr = request->id,
- "target"_attr = request->target,
- "status"_attr = commandStatus);
- return;
- }
+ if (isHedge && skipHedgeResult(commandStatus)) {
+ LOGV2_DEBUG(4660701,
+ 2,
+ "Hedged request returned status",
+ "requestId"_attr = request->id,
+ "target"_attr = request->target,
+ "status"_attr = commandStatus);
+ return;
}
if (!cmdState->finishLine.arriveStrongly()) {