From ffff3f7fce95d45192c8b6cb65845907d6128922 Mon Sep 17 00:00:00 2001 From: Cheahuychou Mao Date: Fri, 27 Mar 2020 02:27:49 -0400 Subject: SERVER-47167 Make hedged read tests more robust --- jstests/sharding/hedged_reads.js | 74 +++++++++++------------ jstests/sharding/hedging_metrics_server_status.js | 30 ++++++--- src/mongo/executor/network_interface_tl.cpp | 3 +- 3 files changed, 59 insertions(+), 48 deletions(-) diff --git a/jstests/sharding/hedged_reads.js b/jstests/sharding/hedged_reads.js index 288f0d4d60a..f1d2e1a97ab 100644 --- a/jstests/sharding/hedged_reads.js +++ b/jstests/sharding/hedged_reads.js @@ -7,16 +7,17 @@ load("jstests/libs/fail_point_util.js"); -function setCommandDelay(nodeConn, command, delay) { +function setCommandDelay(nodeConn, command, delay, ns) { assert.commandWorked(nodeConn.adminCommand({ configureFailPoint: "failCommand", - mode: "alwaysOn", + mode: {times: 1}, data: { failInternalCommands: true, blockConnection: true, blockTimeMS: delay, failCommands: [command], - } + namespace: ns + }, })); } @@ -51,53 +52,50 @@ function checkForOpWithComment(conn, comment) { return true; } -const st = new ShardingTest({shards: 1, rs: {nodes: 2}}); +const st = new ShardingTest({ + mongos: [{ + setParameter: { + logComponentVerbosity: tojson({network: {verbosity: 2}}), + // Force the mongos's replica set monitors to always include all the eligible nodes. + "failpoint.scanningServerSelectorIgnoreLatencyWindow": tojson({mode: "alwaysOn"}), + "failpoint.sdamServerSelectorIgnoreLatencyWindow": tojson({mode: "alwaysOn"}), + // Force the mongos to send requests to hosts in alphabetical order of host names. + "failpoint.networkInterfaceSendRequestsToTargetHostsInAlphabeticalOrder": + tojson({mode: "alwaysOn"}) + } + }], + shards: 1, + rs: {nodes: 2, setParameter: {logComponentVerbosity: tojson({command: {verbosity: 1}})}} +}); const dbName = "hedged_reads"; const collName = "test"; const ns = dbName + "." + collName; const testDB = st.s.getDB(dbName); -const kLargeTimeoutMS = 5 * 60 * 1000; -const waitKillOpTimeoutMS = 5 * 1000; +const kBlockCmdTimeMS = 5 * 60 * 1000; +const kWaitKillOpTimeoutMS = 5 * 1000; assert.commandWorked(st.s.adminCommand({enableSharding: dbName})); st.ensurePrimaryShard(dbName, st.shard0.shardName); -// Force the mongos's replica set monitors to always include all the eligible nodes. -const replicaSetMonitorProtocol = - assert.commandWorked(st.s.adminCommand({getParameter: 1, replicaSetMonitorProtocol: 1})) - .replicaSetMonitorProtocol; -let serverSelectorFailPoint = configureFailPoint(st.s, - replicaSetMonitorProtocol === "scanning" - ? "scanningServerSelectorIgnoreLatencyWindow" - : "sdamServerSelectorIgnoreLatencyWindow"); - jsTest.log( "Verify that maxTimeMS expiration of the additional request does not affect the command result"); // The hedged read will have the maxTimeMS set to 10ms, hence need to sleep longer than that. assert.commandWorked(testDB.runCommand({ - find: collName, - filter: {$where: "sleep(100); return true;"}, + count: collName, + query: {$where: "sleep(100); return true;", x: {$gte: 0}}, $readPreference: {mode: "nearest"} })); -// Force the mongos to send requests to hosts in alphabetical order of host names. -let sendRequestsFailPoint = - configureFailPoint(st.s, "networkInterfaceSendRequestsToTargetHostsInAlphabeticalOrder"); let sortedNodes = [...st.rs0.nodes].sort((node1, node2) => node1.host.localeCompare(node2.host)); -st.rs0.nodes.forEach(function(conn) { - assert.commandWorked( - conn.adminCommand({setParameter: 1, logComponentVerbosity: {command: {verbosity: 1}}})); -}); - -assert.commandWorked( - st.s.adminCommand({setParameter: 1, logComponentVerbosity: {network: {verbosity: 2}}})); - jsTest.log("Verify that the initial request is canceled when the hedged request responds first"); try { // Make the initial request block. - setCommandDelay(sortedNodes[0], "count", kLargeTimeoutMS); + setCommandDelay(sortedNodes[0], "count", kBlockCmdTimeMS, ns); + + // Make the hedged request block for a while to allow the operation to start on the other node. + setCommandDelay(sortedNodes[1], "count", 100, ns); const comment = "test_kill_initial_request_" + ObjectId(); assert.commandWorked(testDB.runCommand({ @@ -112,7 +110,7 @@ try { return !checkForOpWithComment(sortedNodes[0], comment); }, "Timed out waiting for the operation run by the initial request to be killed", - waitKillOpTimeoutMS); + kWaitKillOpTimeoutMS); } finally { clearCommandDelay(sortedNodes[0]); } @@ -120,11 +118,14 @@ try { jsTest.log( "Verify that the additional request is canceled when the initial request responds first"); try { - // Make the additional/hedged request block, set a large maxTimeMSForHedgedReads to prevent - // the remote host from killing the operation by itself. + // Make the additional/hedged request block, set maxTimeMSForHedgedReads to the block time + // to prevent the remote host from killing the operation by itself. assert.commandWorked( - st.s.adminCommand({setParameter: 1, maxTimeMSForHedgedReads: kLargeTimeoutMS})); - setCommandDelay(sortedNodes[1], "count", kLargeTimeoutMS); + st.s.adminCommand({setParameter: 1, maxTimeMSForHedgedReads: kBlockCmdTimeMS})); + setCommandDelay(sortedNodes[1], "count", kBlockCmdTimeMS, ns); + + // Make the initial request block for a while to allow the operation to start on the other node. + setCommandDelay(sortedNodes[0], "count", 100, ns); const comment = "test_kill_additional_request_" + ObjectId(); assert.commandWorked(testDB.runCommand({ @@ -139,13 +140,10 @@ try { return !checkForOpWithComment(sortedNodes[1], comment); }, "Timed out waiting for the operation run by the additional request to be killed", - waitKillOpTimeoutMS); + kWaitKillOpTimeoutMS); } finally { clearCommandDelay(sortedNodes[1]); } -serverSelectorFailPoint.off(); -sendRequestsFailPoint.off(); - st.stop(); }()); diff --git a/jstests/sharding/hedging_metrics_server_status.js b/jstests/sharding/hedging_metrics_server_status.js index 377b0699a02..b40279896c4 100644 --- a/jstests/sharding/hedging_metrics_server_status.js +++ b/jstests/sharding/hedging_metrics_server_status.js @@ -43,15 +43,16 @@ function checkServerStatusHedgingMetrics(mongosConn, expectedHedgingMetrics) { serverStatusCheckTimeoutMS); } -function setCommandDelay(nodeConn, command, delay) { +function setCommandDelay(nodeConn, command, delay, ns) { assert.commandWorked(nodeConn.adminCommand({ configureFailPoint: "failCommand", - mode: "alwaysOn", + mode: {times: 1}, data: { failInternalCommands: true, blockConnection: true, blockTimeMS: delay, failCommands: [command], + namespace: ns, } })); } @@ -63,7 +64,21 @@ function clearCommandDelay(nodeConn) { })); } -const st = new ShardingTest({shards: 1, rs: {nodes: 2}, mongosOptions: {verbose: 2}}); +const st = new ShardingTest({ + mongos: [{ + setParameter: { + logComponentVerbosity: tojson({network: {verbosity: 2}}), + // Force the mongos's replica set monitors to always include all the eligible nodes. + "failpoint.scanningServerSelectorIgnoreLatencyWindow": tojson({mode: "alwaysOn"}), + "failpoint.sdamServerSelectorIgnoreLatencyWindow": tojson({mode: "alwaysOn"}), + // Force the mongos to send requests to hosts in alphabetical order of host names. + "failpoint.networkInterfaceSendRequestsToTargetHostsInAlphabeticalOrder": + tojson({mode: "alwaysOn"}) + } + }], + shards: 1, + rs: {nodes: 2, setParameter: {logComponentVerbosity: tojson({command: {verbosity: 1}})}} +}); const dbName = "foo"; const collName = "bar"; const ns = dbName + "." + collName; @@ -103,7 +118,7 @@ jsTestLog("Run commands with hedging enabled, and verify the metrics are as expe // Make the command slower on the first target host, and verify there is an advantageous // hedged read. try { - setCommandDelay(sortedNodes[0], "count", 1000); + setCommandDelay(sortedNodes[0], "count", 1000, ns); assert.commandWorked(testDB.runCommand( {count: collName, query: {x: {$gte: 0}}, $readPreference: {mode: "nearest"}})); } finally { @@ -119,8 +134,8 @@ checkServerStatusHedgingMetrics(testDB, expectedHedgingMetrics); // hedged read. Block the command on the first target host for some time to allow the hedged // request to get sent to the second target host. try { - setCommandDelay(sortedNodes[0], "count", 100); - setCommandDelay(sortedNodes[1], "count", 1000); + setCommandDelay(sortedNodes[0], "count", 100, ns); + setCommandDelay(sortedNodes[1], "count", 1000, ns); assert.commandWorked(testDB.runCommand( {count: collName, query: {x: {$gte: 0}}, $readPreference: {mode: "nearest"}})); @@ -133,8 +148,5 @@ expectedHedgingMetrics.numTotalOperations += 1; expectedHedgingMetrics.numTotalHedgedOperations += 1; checkServerStatusHedgingMetrics(testDB, expectedHedgingMetrics); -serverSelectorFailPoint.off(); -sendRequestsFailPoint.off(); - st.stop(); }()); diff --git a/src/mongo/executor/network_interface_tl.cpp b/src/mongo/executor/network_interface_tl.cpp index 3f2f6871c3d..a988ef76420 100644 --- a/src/mongo/executor/network_interface_tl.cpp +++ b/src/mongo/executor/network_interface_tl.cpp @@ -420,7 +420,8 @@ Status NetworkInterfaceTL::startCommand(const TaskExecutor::CallbackHandle& cbHa } bool targetHostsInAlphabeticalOrder = - MONGO_unlikely(networkInterfaceSendRequestsToTargetHostsInAlphabeticalOrder.shouldFail()); + MONGO_unlikely(networkInterfaceSendRequestsToTargetHostsInAlphabeticalOrder.shouldFail( + [request](const BSONObj&) { return request.hedgeOptions != boost::none; })); if (targetHostsInAlphabeticalOrder) { // Sort the target hosts by host names. -- cgit v1.2.1