/** * When a network connection to the mongo shell is closed, attempting to call * Mongo.prototype.runCommand() and Mongo.prototype.runCommandWithMetadata() throws a JavaScript * exception. This override catches these exceptions (i.e. ones where isNetworkError() returns true) * and automatically re-sends the command request to the server, or propagates the error if the * command should already be using the shell's existing retryability logic. The goal of this * override is to implement retry logic such that the assertions within our existing JavaScript * tests still pass despite stepdowns of the CSRS and replica set shards happening in the * background. */ (function() { "use strict"; load("jstests/libs/override_methods/override_helpers.js"); load("jstests/libs/retryable_writes_util.js"); const kMaxNumRetries = 3; // Store a session to access ServerSession#canRetryWrites. let _serverSession; const mongoRunCommandOriginal = Mongo.prototype.runCommand; const mongoRunCommandWithMetadataOriginal = Mongo.prototype.runCommandWithMetadata; Mongo.prototype.runCommand = function runCommand(dbName, cmdObj, options) { if (typeof _serverSession === "undefined") { _serverSession = this.startSession()._serverSession; } return runWithRetriesOnNetworkErrors(this, cmdObj, mongoRunCommandOriginal, arguments); }; Mongo.prototype.runCommandWithMetadata = function runCommandWithMetadata( dbName, metadata, cmdObj) { if (typeof _serverSession === "undefined") { _serverSession = this.startSession()._serverSession; } return runWithRetriesOnNetworkErrors( this, cmdObj, mongoRunCommandWithMetadataOriginal, arguments); }; // Commands assumed to not be blindly retryable. const kNonRetryableCommands = new Set([ // Commands that take write concern and do not support txnNumbers. "_configsvrAddShard", "_configsvrAddShardToZone", "_configsvrCommitChunkMerge", "_configsvrCommitChunkMigration", "_configsvrCommitChunkSplit", "_configsvrCreateDatabase", "_configsvrEnableSharding", "_configsvrMoveChunk", "_configsvrMovePrimary", "_configsvrRemoveShard", "_configsvrRemoveShardFromZone", "_configsvrShardCollection", "_configsvrUpdateZoneKeyRange", "_mergeAuthzCollections", "_recvChunkStart", "appendOplogNote", "applyOps", "captrunc", "cleanupOrphaned", "clone", "cloneCollection", "cloneCollectionAsCapped", "collMod", "convertToCapped", "create", "createIndexes", "createRole", "createUser", "deleteIndexes", "drop", "dropAllRolesFromDatabase", "dropAllUsersFromDatabase", "dropDatabase", "dropIndexes", "dropRole", "dropUser", "emptycapped", "godinsert", "grantPrivilegesToRole", "grantRolesToRole", "grantRolesToUser", "mapreduce.shardedfinish", "moveChunk", "renameCollection", "revokePrivilegesFromRole", "revokeRolesFromRole", "revokeRolesFromUser", "updateRole", "updateUser", ]); // These commands are not idempotent because they return errors if retried after // successfully completing (like IndexNotFound, NamespaceExists, etc.), but because they // only take effect once, and many tests use them to set up state, their errors on retries // are handled specially. const kAcceptableNonRetryableCommands = new Set([ "create", "createIndexes", "deleteIndexes", "drop", "dropDatabase", // Already ignores NamespaceNotFound errors, so not handled below. "dropIndexes", ]); function isAcceptableNonRetryableCommand(cmdName) { return kAcceptableNonRetryableCommands.has(cmdName); } function isAcceptableRetryFailedResponse(cmdName, res) { return ((cmdName === "create" && res.code === ErrorCodes.NamespaceExists) || (cmdName === "createIndexes" && res.code === ErrorCodes.IndexAlreadyExists) || (cmdName === "drop" && res.code === ErrorCodes.NamespaceNotFound) || ((cmdName === "dropIndexes" || cmdName === "deleteIndexes") && res.code === ErrorCodes.IndexNotFound)); } // Commands that may return different values or fail if retried on a new primary after a // failover. const kNonFailoverTolerantCommands = new Set([ "currentOp", // Failovers can change currentOp output. "getLog", // The log is different on different servers. "killOp", // Failovers may interrupt operations intended to be killed later in the test. "logRotate", "planCacheClear", // The plan cache isn't replicated. "planCacheClearFilters", "planCacheListFilters", "planCacheListPlans", "planCacheListQueryShapes", "planCacheSetFilter", "profile", // Not replicated, so can't tolerate failovers. "setParameter", // Not replicated, so can't tolerate failovers. "stageDebug", "startSession", // Sessions are flushed to disk asynchronously. ]); // Several commands that use the plan executor swallow the actual error code from a failed plan // into their error message and instead return OperationFailed. // // TODO SERVER-32208: Remove this function once it is no longer needed. function isRetryableExecutorCodeAndMessage(code, msg) { return code === ErrorCodes.OperationFailed && typeof msg !== "undefined" && msg.indexOf("InterruptedDueToStepDown") >= 0; } function runWithRetriesOnNetworkErrors(mongo, cmdObj, clientFunction, clientFunctionArguments) { let cmdName = Object.keys(cmdObj)[0]; // If the command is in a wrapped form, then we look for the actual command object // inside the query/$query object. if (cmdName === "query" || cmdName === "$query") { cmdObj = cmdObj[cmdName]; cmdName = Object.keys(cmdObj)[0]; } const isRetryableWriteCmd = RetryableWritesUtil.isRetryableWriteCmdName(cmdName); const canRetryWrites = _serverSession.canRetryWrites(cmdObj); const startTime = Date.now(); let numRetries = !jsTest.options().skipRetryOnNetworkError ? kMaxNumRetries : 0; // Validate the command before running it, to prevent tests with non-retryable commands // from being run. if (isRetryableWriteCmd && !canRetryWrites) { throw new Error("Refusing to run a test that issues non-retryable write operations" + " since the test likely makes assertions on the write results and" + " can lead to spurious failures if a network error occurs."); } else if (cmdName === "getMore") { throw new Error( "Refusing to run a test that issues a getMore command since if a network error" + " occurs during it then we won't know whether the cursor was advanced or not."); } else if (kNonRetryableCommands.has(cmdName) && !isAcceptableNonRetryableCommand(cmdName)) { throw new Error( "Refusing to run a test that issues commands that are not blindly retryable, " + " cmdName: " + cmdName); } else if (kNonFailoverTolerantCommands.has(cmdName)) { throw new Error( "Refusing to run a test that issues commands that may return different values" + " after a failover, cmdName: " + cmdName); } else if (cmdName === "aggregate") { var stages = cmdObj.pipeline; // $listLocalSessions must be the first stage in the pipeline. const firstStage = stages && Array.isArray(stages) && (stages.length > 0) ? stages[0] : undefined; const hasListLocalStage = firstStage && (typeof firstStage === "object") && firstStage.hasOwnProperty("$listLocalSessions"); if (hasListLocalStage) { throw new Error("Refusing to run a test that issues an aggregation command with" + " $listLocalSessions because it relies on in-memory" + " state that may not survive failovers."); } // Aggregate can be either a read or a write depending on whether it has a $out stage. // $out is required to be the last stage of the pipeline. const lastStage = stages && Array.isArray(stages) && (stages.length !== 0) ? stages[stages.length - 1] : undefined; const hasOut = lastStage && (typeof lastStage === "object") && lastStage.hasOwnProperty("$out"); if (hasOut) { throw new Error("Refusing to run a test that issues an aggregation command" + " with $out because it is not retryable."); } const hasExplain = cmdObj.hasOwnProperty("explain"); if (hasExplain) { throw new Error( "Refusing to run a test that issues an aggregation command with explain" + " because it may return incomplete results if interrupted by a stepdown."); } } else if (cmdName === "mapReduce" || cmdName === "mapreduce") { throw new Error( "Refusing to run a test that issues a mapReduce command, because it calls " + " std::terminate() if interrupted by a stepdown."); } let retry = false; do { try { TestData.retryingOnNetworkError = retry; retry = true; let res = clientFunction.apply(mongo, clientFunctionArguments); if (isRetryableWriteCmd) { // findAndModify can fail during the find stage and return an executor error. if ((cmdName === "findandmodify" || cmdName === "findAndModify") && isRetryableExecutorCodeAndMessage(res.code, res.errmsg)) { print("=-=-=-= Retrying because of executor interruption: " + cmdName + ", retries remaining: " + numRetries + " error: " + tojsononeline(res)); continue; } // Don't interfere with retryable writes. return res; } if (cmdName === "explain") { // If an explain is interrupted by a stepdown, and it returns before its // connection is closed, it will return incomplete results. To prevent failing // the test, force retries of interrupted explains. if (res.hasOwnProperty("executionStats") && !res.executionStats.executionSuccess && (RetryableWritesUtil.isRetryableCode(res.executionStats.errorCode) || isRetryableExecutorCodeAndMessage(res.executionStats.errorCode, res.executionStats.errorMessage))) { print("=-=-=-= Forcing retry of interrupted explain, res: " + tojson(res)); continue; } // An explain command can fail if its child command cannot be run on the current // server. This can be hit if a primary only or not explicitly slaveOk command // is accepted by a primary node that then steps down and returns before having // its connection closed. if (!res.ok && res.errmsg.indexOf("child command cannot run on this node") >= 0) { print( "=-=-=-= Forcing retry of explain likely interrupted by transition to" + " secondary, res: " + tojson(res)); continue; } } if (!res.ok) { if (numRetries > 0) { if (RetryableWritesUtil.isRetryableCode(res.code)) { // Don't decrement retries, because the command returned before the // connection was closed, so a subsequent attempt will receive a // network error (or NotMaster error) and need to retry. print("=-=-=-= Retrying failed response with retryable code: " + res.code + ", for command: " + cmdName + ", retries remaining: " + numRetries); continue; } if (isRetryableExecutorCodeAndMessage(res.code, res.errmsg)) { print("=-=-=-= Retrying because of executor interruption: " + cmdName + ", retries remaining: " + numRetries); continue; } // listCollections and listIndexes called through mongos may return // OperationFailed if the request to establish a cursor on the targeted // shard fails with a network error. // // TODO SERVER-30949: Remove this check once those two commands retry on // retryable errors automatically. if ((cmdName === "listCollections" || cmdName === "listIndexes") && res.code === ErrorCodes.OperationFailed && res.hasOwnProperty("errmsg") && res.errmsg.indexOf("failed to read command response from shard") >= 0) { print("=-=-=-= Retrying failed mongos cursor command: " + cmdName + ", retries remaining: " + numRetries); continue; } // Thrown when an index build is interrupted during its collection scan. if (cmdName === "createIndexes" && res.codeName === "InterruptedDueToStepDown") { print("=-=-=-= Retrying because of interrupted collection scan: " + cmdName + ", retries remaining: " + numRetries); continue; } } // Swallow safe errors that may come from a retry since the command may have // completed before the connection was closed. if (isAcceptableRetryFailedResponse(cmdName, res)) { print("=-=-=-= Overriding safe failed response for: " + cmdName + ", code: " + res.code + ", retries remaining: " + numRetries); res.ok = 1; } } if (res.writeConcernError && numRetries > 0) { if (RetryableWritesUtil.isRetryableCode(res.writeConcernError.code)) { // Don't decrement retries, because the command returned before the // connection was closed, so a subsequent attempt will receive a // network error (or NotMaster error) and need to retry. print("=-=-=-= Retrying write concern error with retryable code: " + res.writeConcernError.code + ", for command: " + cmdName + ", retries remaining: " + numRetries); continue; } } TestData.retryingOnNetworkError = false; return res; } catch (e) { const kReplicaSetMonitorError = /^Could not find host matching read preference.*mode: "primary"/; if (numRetries === 0) { TestData.retryingOnNetworkError = false; jsTestLog("=-=-=-= No retries, throwing"); throw e; } else if (e.message.match(kReplicaSetMonitorError) && Date.now() - startTime < 5 * 60 * 1000) { // ReplicaSetMonitor::getHostOrRefresh() waits up to 15 seconds to find the // primary of the replica set. It is possible for the step up attempt of another // node in the replica set to take longer than 15 seconds so we allow retrying // for up to 5 minutes. print("=-=-=-= Failed to find primary when attempting to run " + cmdName + " command, will retry for another 15 seconds"); continue; } else if ((e.message.indexOf("writeConcernError") >= 0) && isRetryableError(e)) { print("=-=-=-= Retrying write concern error with retryable code for command: " + cmdName + ", retries remaining: " + numRetries + " error: " + tojson(e)); continue; } else if (!isNetworkError(e)) { throw e; } else if (isRetryableWriteCmd) { if (_serverSession.canRetryWrites(cmdObj)) { // If the command is retryable, assume the command has already gone through // or will go through the retry logic in SessionAwareClient, so propagate // the error. throw e; } } --numRetries; print("=-=-=-= Retrying on network error for command: " + cmdName + ", retries remaining: " + numRetries); } } while (numRetries >= 0); } OverrideHelpers.prependOverrideInParallelShell( "jstests/libs/override_methods/auto_retry_on_network_error.js"); const connectOriginal = connect; connect = function(url, user, pass) { let retVal; let connectionAttempts = 0; assert.soon( () => { try { connectionAttempts += 1; retVal = connectOriginal.apply(this, arguments); return true; } catch (e) { print("=-=-=-= Retrying connection to: " + url + ", attempts: " + connectionAttempts + ", failed with: " + tojson(e)); } }, "Failed connecting to url: " + tojson(url), undefined, // Default timeout. 2000); // 2 second interval. return retVal; }; })();