/** * Tests hedging metrics in the serverStatus output. * @tags: [requires_fcv_70] */ (function() { "use strict"; load("jstests/libs/parallel_shell_helpers.js"); load("jstests/libs/fail_point_util.js"); function setCommandDelay(nodeConn, command, delay, ns) { assert.commandWorked(nodeConn.adminCommand({ configureFailPoint: "failCommand", mode: {times: 1}, data: { failInternalCommands: true, blockConnection: true, blockTimeMS: delay, failCommands: [command], namespace: ns }, })); } function clearCommandDelay(nodeConn) { assert.commandWorked(nodeConn.adminCommand({ configureFailPoint: "failCommand", mode: "off", })); } /* * Returns true if there is operation with the given comment running on the * given connection. */ function checkForOpWithComment(conn, comment) { const ret = conn.getDB("admin") .aggregate([{$currentOp: {localOps: true}}, {$match: {"command.comment": comment}}]) .toArray(); jsTestLog(`Checked currentOp with comment ${comment}: ${tojson(ret)}`); if (ret.length == 0) { return false; } if (ret.every(op => op.killPending)) { // CurrentOp actually blocks kills from proceeding. return false; } return true; } /** * The following starts a parallel shell to run a `count` command against the cluster. The * `failCommand` fail-point is enabled on both nodes that are targeted to run the hedged operation. * `fp1` ensures the main target blocks the command until the fail-point is disabled. `fp2` makes * the hedged target return a `NetworkInterfaceExceededTimeLimit` error code as the result of * running the command. This simulates a situation where the hedged operations fail due to network * timeouts, while the main operation succeeds. */ function verifyCommandWorksWhenHedgeOperationsFailWithNetworkTimeout( port, dbName, collName, nodes) { jsTestLog("Verify that command works when hedged operations fail due to a network timeout"); const ns = dbName + "." + collName; let fp1 = configureFailPoint(nodes[0], "failCommand", { failCommands: ["count"], failInternalCommands: true, namespace: ns, blockConnection: true, blockTimeMS: 10000, // this is not expected to unblock due to a timeout. }); let fp2 = configureFailPoint(nodes[1], "failCommand", { failCommands: ["count"], failInternalCommands: true, namespace: ns, errorCode: NumberInt(ErrorCodes.NetworkInterfaceExceededTimeLimit), }); const ps = startParallelShell(funWithArgs(function(dbName, collName) { const testDB = db.getSiblingDB(dbName); assert.commandWorked(testDB.runCommand( {count: collName, $readPreference: {mode: "nearest"}})); }, dbName, collName), port); fp2.wait(); fp2.off(); fp1.wait(); fp1.off(); ps(); } const st = new ShardingTest({ mongos: [{ setParameter: { logComponentVerbosity: tojson({network: {verbosity: 2}}), // Force the mongos's replica set monitors to always include all the eligible nodes. "failpoint.sdamServerSelectorIgnoreLatencyWindow": tojson({mode: "alwaysOn"}), // Force the mongos to send requests to hosts in alphabetical order of host names. "failpoint.hedgedReadsSendRequestsToTargetHostsInAlphabeticalOrder": tojson({mode: "alwaysOn"}), maxTimeMSForHedgedReads: 10 } }], shards: 1, rs: {nodes: 2, setParameter: {logComponentVerbosity: tojson({command: {verbosity: 1}})}} }); const dbName = "hedged_reads"; const collName = "test"; const ns = dbName + "." + collName; const testDB = st.s.getDB(dbName); const kBlockCmdTimeMS = 5 * 60 * 1000; const kWaitKillOpTimeoutMS = 5 * 1000; const numDocs = 10; assert.commandWorked(st.s.adminCommand({enableSharding: dbName})); st.ensurePrimaryShard(dbName, st.shard0.shardName); jsTest.log( "Verify that maxTimeMS expiration of the additional request does not affect the command result"); // The hedged read will have the maxTimeMS set to 10ms, hence need to sleep longer than that. assert.commandWorked(testDB.runCommand({ count: collName, query: {$where: "sleep(100); return true;", x: {$gte: 0}}, $readPreference: {mode: "nearest"} })); let sortedNodes = [...st.rs0.nodes].sort((node1, node2) => node1.host.localeCompare(node2.host)); verifyCommandWorksWhenHedgeOperationsFailWithNetworkTimeout( st.s0.port, dbName, collName, sortedNodes); jsTest.log("Verify that the initial request is canceled when the hedged request responds first"); try { // Make the initial request block. setCommandDelay(sortedNodes[0], "count", kBlockCmdTimeMS, ns); // Make the hedged request block for a while to allow the operation to start on the other node. setCommandDelay(sortedNodes[1], "count", 100, ns); const comment = "test_kill_initial_request_" + ObjectId(); assert.commandWorked(testDB.runCommand({ count: collName, query: {x: {$gte: 0}}, $readPreference: {mode: "nearest"}, comment: comment })); assert.soon( function() { return !checkForOpWithComment(sortedNodes[0], comment); }, "Timed out waiting for the operation run by the initial request to be killed", kWaitKillOpTimeoutMS); } finally { clearCommandDelay(sortedNodes[0]); clearCommandDelay(sortedNodes[1]); } jsTest.log( "Verify that the additional request is canceled when the initial request responds first"); try { // Make the additional/hedged request block, set maxTimeMSForHedgedReads to the block time // to prevent the remote host from killing the operation by itself. assert.commandWorked( st.s.adminCommand({setParameter: 1, maxTimeMSForHedgedReads: kBlockCmdTimeMS})); setCommandDelay(sortedNodes[1], "count", kBlockCmdTimeMS, ns); // Make the initial request block for a while to allow the operation to start on the other node. setCommandDelay(sortedNodes[0], "count", 100, ns); const comment = "test_kill_additional_request_" + ObjectId(); assert.commandWorked(testDB.runCommand({ count: collName, query: {x: {$gte: 0}}, $readPreference: {mode: "nearest"}, comment: comment })); assert.soon( function() { return !checkForOpWithComment(sortedNodes[1], comment); }, "Timed out waiting for the operation run by the additional request to be killed", kWaitKillOpTimeoutMS); } finally { clearCommandDelay(sortedNodes[0]); clearCommandDelay(sortedNodes[1]); assert.commandWorked(st.s.adminCommand({setParameter: 1, maxTimeMSForHedgedReads: 100})); } // Need causally consistent reads to verify the document count let session = testDB.getMongo().startSession({causalConsistency: true}); const sessionDB = session.getDatabase(dbName); const sessionColl = sessionDB.getCollection(collName); let bulk = sessionColl.initializeUnorderedBulkOp(); for (let i = 0; i < numDocs; i++) { bulk.insert({x: i}); } assert.commandWorked(bulk.execute()); jsTest.log("Verify that the getMore on hedge request do not inherit maxTimeMS"); try { // force to open hedge read cursor on sortedNodes[1] setCommandDelay(sortedNodes[0], "find", 500, ns); // $where with sleep is used because blocking command via failCommand does not affect the opCtx // deadlines as it blocks and unblocks the command before it starts execution. const comment = "test_getmore_on_additional_request_" + ObjectId(); let findRes = assert.commandWorked(sessionDB.runCommand({ find: collName, filter: {$where: "sleep(2000); return true;", x: {$gte: 0}}, $readPreference: {mode: "nearest"}, batchSize: 0, comment: comment })); const cursorId = findRes.cursor.id; assert.neq(0, cursorId); // confirm that getMore does not time out. let getMoreRes = assert.commandWorked(sessionDB.runCommand({getMore: cursorId, collection: collName})); assert.eq(getMoreRes.cursor.nextBatch.length, numDocs); } finally { clearCommandDelay(sortedNodes[0]); } st.stop(); }());