diff options
author | Jack Mulrow <jack.mulrow@mongodb.com> | 2017-12-07 10:24:06 -0500 |
---|---|---|
committer | Jack Mulrow <jack.mulrow@mongodb.com> | 2017-12-14 10:06:52 -0500 |
commit | 52b975db999de6e4f9169e6d733b1bfaf95c80e9 (patch) | |
tree | 3994ba5ce2758dbffaa92662a689b2cffc302e53 | |
parent | c7f1e65dcda3ac37764b445fd32c8e3310440e6b (diff) | |
download | mongo-52b975db999de6e4f9169e6d733b1bfaf95c80e9.tar.gz |
SERVER-31194 Add a version of retryable_writes_jscore_passthrough.yml with stepdowns
(cherry picked from commit 51b699b02a5858a115a95af206253104c46e4bb0)
-rw-r--r-- | buildscripts/resmokeconfig/suites/retryable_writes_jscore_passthrough.yml | 3 | ||||
-rw-r--r-- | buildscripts/resmokeconfig/suites/retryable_writes_jscore_stepdown_passthrough.yml | 142 | ||||
-rw-r--r-- | buildscripts/resmokelib/core/programs.py | 6 | ||||
-rw-r--r-- | buildscripts/resmokelib/testing/hooks/stepdown.py | 11 | ||||
-rw-r--r-- | etc/evergreen.yml | 23 | ||||
-rw-r--r-- | jstests/libs/override_methods/auto_retry_on_network_error.js | 281 | ||||
-rw-r--r-- | jstests/libs/override_methods/set_read_and_write_concerns.js | 25 | ||||
-rw-r--r-- | jstests/libs/retryable_writes_util.js | 14 | ||||
-rw-r--r-- | src/mongo/shell/session.js | 82 | ||||
-rw-r--r-- | src/mongo/shell/utils.js | 3 |
10 files changed, 559 insertions, 31 deletions
diff --git a/buildscripts/resmokeconfig/suites/retryable_writes_jscore_passthrough.yml b/buildscripts/resmokeconfig/suites/retryable_writes_jscore_passthrough.yml index 62ecd89e683..d7ca295ad1a 100644 --- a/buildscripts/resmokeconfig/suites/retryable_writes_jscore_passthrough.yml +++ b/buildscripts/resmokeconfig/suites/retryable_writes_jscore_passthrough.yml @@ -29,6 +29,9 @@ selector: # TODO SERVER-31245: Inserts to "system.indexes" bypass the check for retryability. - jstests/core/batch_write_command_insert.js + # TODO SERVER-31198: Remove once retry attempts are always decremented. + - jstests/core/write_result.js + executor: config: shell_options: diff --git a/buildscripts/resmokeconfig/suites/retryable_writes_jscore_stepdown_passthrough.yml b/buildscripts/resmokeconfig/suites/retryable_writes_jscore_stepdown_passthrough.yml new file mode 100644 index 00000000000..a52965222b6 --- /dev/null +++ b/buildscripts/resmokeconfig/suites/retryable_writes_jscore_stepdown_passthrough.yml @@ -0,0 +1,142 @@ +test_kind: js_test + +selector: + roots: + - jstests/core/**/*.js + exclude_files: + # These tests are not expected to pass with replica-sets: + - jstests/core/capped_update.js + - jstests/core/dbadmin.js + - jstests/core/opcounters_write_cmd.js + - jstests/core/read_after_optime.js + + # No-op retries are not ignored by top, the profiler, or opcount. + - jstests/core/operation_latency_histogram.js + - jstests/core/profile2.js + - jstests/core/profile3.js + - jstests/core/profile_findandmodify.js + - jstests/core/top.js + - jstests/core/views/views_stats.js + + # TODO SERVER-31249: getLastError should not be affected by no-op retries. + - jstests/core/bulk_legacy_enforce_gle.js + + # TODO SERVER-31242: findAndModify no-op retry should respect the fields option. + - jstests/core/crud_api.js + - jstests/core/find_and_modify2.js + - jstests/core/find_and_modify_server6865.js + + # TODO SERVER-31245: Inserts to "system.indexes" bypass the check for retryability. 
+ - jstests/core/batch_write_command_insert.js + + # Stepdown commands during fsync lock will fail. + - jstests/core/currentop.js + - jstests/core/fsync.js + - jstests/core/killop_drop_collection.js + + # Expect drops/creates to fail or have a certain response: + - jstests/core/drop.js + - jstests/core/dropdb.js + - jstests/core/explain_upsert.js + - jstests/core/indexes_multiple_commands.js + + # Expect certain responses, but retries of successfully completed commands may return + # different values: + - jstests/core/create_indexes.js + - jstests/core/objid5.js + + # Expect results to return in a certain order, secondaries may apply ops out of order. + - jstests/core/coveredIndex1.js + - jstests/core/find1.js + - jstests/core/sortc.js + + # Use $listLocalSessions aggregation stage command: + - jstests/core/list_all_local_cursors.js + - jstests/core/list_all_local_sessions.js + - jstests/core/list_local_sessions.js + + - jstests/core/bench_test*.js # benchRun() used for writes + - jstests/core/connection_string_validation.js # Does not expect a replica set connection string. + - jstests/core/explain_large_bounds.js # Stepdown can timeout waiting for global lock. + - jstests/core/insert2.js # Creates new mongo connection. + - jstests/core/list_collections_filter.js # Temporary collections are dropped on failover. + - jstests/core/startup_log.js # Checks pid, which is different on each server. + - jstests/core/validate_cmd_ns.js # Calls _exec() directly, not retryable. + + exclude_with_any_tags: + - cannot_inject_read_write_concern + + # The next four tags correspond to the special errors thrown by the auto_retry_on_network_error.js + # override when it refuses to run a certain command. Above each tag are the messages that + # warrant it. 
+ + # "Refusing to run a test that issues a getMore command since if a network error occurs during + # it then we won't know whether the cursor was advanced or not" + - requires_getmore + + # "Refusing to run a test that issues non-retryable write operations since the test likely makes + # assertions on the write results and can lead to spurious failures if a network error occurs" + - requires_non_retryable_writes + + # "Refusing to run a test that issues commands that are not blindly retryable" + # "Refusing to run a test that issues an aggregation command with $out because it is not + # retryable" + - requires_non_retryable_commands + + # "Refusing to run a test that issues commands that may return different values after a failover" + # "Refusing to run a test that issues an aggregation command with explain because it may return + # incomplete results" + # "Refusing to run a test that issues a mapReduce command, because it calls std::terminate() if + # interrupted by a stepdown" + - does_not_support_stepdowns + +executor: + config: + shell_options: + eval: >- + testingReplication = true; + load("jstests/libs/override_methods/auto_retry_on_network_error.js"); + db = connect(TestData.connectionString); + load("jstests/libs/override_methods/enable_sessions.js"); + load("jstests/libs/override_methods/set_read_and_write_concerns.js"); + global_vars: + TestData: + alwaysInjectTransactionNumber: true + defaultReadConcernLevel: "majority" + logRetryAttempts: true + overrideRetryAttempts: 3 + sessionOptions: + readConcern: + level: "majority" + # Force DBClientRS to find the primary for non-write commands. + readPreference: + mode: "primary" + retryWrites: true + # We specify nodb so the shell used by each test will attempt to connect after loading the + # retry logic in auto_retry_on_network_error.js. 
+ nodb: "" + readMode: commands + hooks: + - class: ContinuousStepdown + # The CheckReplDBHash hook waits until all operations have replicated to and have been applied + # on the secondaries, so we run the ValidateCollections hook after it to ensure we're + # validating the entire contents of the collection. + - class: CheckReplOplogs + - class: CheckReplDBHash + - class: ValidateCollections + - class: CleanEveryN + n: 20 + fixture: + class: ReplicaSetFixture + mongod_options: + enableMajorityReadConcern: '' + oplogSize: 511 + set_parameters: + enableTestCommands: 1 + numInitialSyncAttempts: 1 + all_nodes_electable: true + num_nodes: 3 + replset_config_options: + settings: + # Speeds up failover, reduces chance DBClientRS times out retargeting the primary. + electionTimeoutMillis: 6500 diff --git a/buildscripts/resmokelib/core/programs.py b/buildscripts/resmokelib/core/programs.py index 097e0d7d1b4..ec7522dedc8 100644 --- a/buildscripts/resmokelib/core/programs.py +++ b/buildscripts/resmokelib/core/programs.py @@ -201,6 +201,12 @@ def mongo_shell_program(logger, executable=None, connection_string=None, filenam if "eval_prepend" in kwargs: eval_sb.append(str(kwargs.pop("eval_prepend"))) + # If nodb is specified, pass the connection string through TestData so it can be used inside the + # test, then delete it so it isn't given as an argument to the mongo shell. 
+ if "nodb" in kwargs and connection_string is not None: + test_data["connectionString"] = connection_string + connection_string = None + for var_name in global_vars: _format_shell_vars(eval_sb, var_name, global_vars[var_name]) diff --git a/buildscripts/resmokelib/testing/hooks/stepdown.py b/buildscripts/resmokelib/testing/hooks/stepdown.py index 46f91ab4cd0..fb6da27ccbe 100644 --- a/buildscripts/resmokelib/testing/hooks/stepdown.py +++ b/buildscripts/resmokelib/testing/hooks/stepdown.py @@ -131,7 +131,9 @@ class _StepdownThread(threading.Thread): now = time.time() if now - self._last_exec > self._stepdown_interval_secs: self._step_down_all() - self._last_exec = now + # Wait until each replica set has a primary, so the test can make progress. + self._await_primaries() + self._last_exec = time.time() now = time.time() # 'wait_secs' is used to wait 'self._stepdown_interval_secs' from the moment the last # stepdown command was sent. @@ -154,8 +156,7 @@ class _StepdownThread(threading.Thread): # Wait until we are no longer executing stepdowns. self._is_idle_evt.wait() # Wait until we all the replica sets have primaries. - for fixture in self._rs_fixtures: - fixture.get_primary() + self._await_primaries() def resume(self): """Resumes the thread.""" @@ -169,6 +170,10 @@ class _StepdownThread(threading.Thread): # Wait until stop or timeout. 
self._is_stopped_evt.wait(timeout) + def _await_primaries(self): + for fixture in self._rs_fixtures: + fixture.get_primary() + def _step_down_all(self): self._is_idle_evt.clear() for rs_fixture in self._rs_fixtures: diff --git a/etc/evergreen.yml b/etc/evergreen.yml index f85208866dc..2c64255d184 100644 --- a/etc/evergreen.yml +++ b/etc/evergreen.yml @@ -4190,6 +4190,17 @@ tasks: run_multiple_jobs: true - <<: *task_template + name: retryable_writes_jscore_stepdown_passthrough_WT + depends_on: + - name: jsCore_WT + commands: + - func: "do setup" + - func: "run tests" + vars: + resmoke_args: --suites=retryable_writes_jscore_stepdown_passthrough --storageEngine=wiredTiger + run_multiple_jobs: true + +- <<: *task_template name: watchdog commands: - func: "do setup" @@ -5111,6 +5122,7 @@ buildvariants: - name: replica_sets_rollback_refetch_no_uuid - name: replica_sets_jscore_fcv34_passthrough_WT - name: retryable_writes_jscore_passthrough_WT + - name: retryable_writes_jscore_stepdown_passthrough_WT - name: master_slave_WT - name: master_slave_auth - name: master_slave_jscore_passthrough_WT @@ -6116,6 +6128,9 @@ buildvariants: - name: retryable_writes_jscore_passthrough_WT distros: - windows-64-vs2015-large + - name: retryable_writes_jscore_stepdown_passthrough_WT + distros: + - windows-64-vs2015-large - name: session_jscore_passthrough_WT - name: sharding_WT distros: @@ -6596,6 +6611,7 @@ buildvariants: - name: replica_sets_kill_secondaries_jscore_passthrough_WT - name: replica_sets_jscore_fcv34_passthrough_WT - name: retryable_writes_jscore_passthrough_WT + - name: retryable_writes_jscore_stepdown_passthrough_WT - name: rollback_fuzzer_WT - name: serial_run_WT - name: session_jscore_passthrough_WT @@ -6896,6 +6912,9 @@ buildvariants: - name: retryable_writes_jscore_passthrough_WT distros: - rhel62-large + - name: retryable_writes_jscore_stepdown_passthrough_WT + distros: + - rhel62-large - name: rlp_WT - name: sasl - name: session_jscore_passthrough_WT @@ -7032,6 
+7051,7 @@ buildvariants: - name: replica_sets_kill_secondaries_jscore_passthrough_WT - name: replica_sets_jscore_fcv34_passthrough_WT - name: retryable_writes_jscore_passthrough_WT + - name: retryable_writes_jscore_stepdown_passthrough_WT - name: sasl - name: session_jscore_passthrough_WT - name: sharded_collections_jscore_passthrough_WT @@ -8751,6 +8771,7 @@ buildvariants: - name: replica_sets_jscore_passthrough - name: replica_sets_jscore_fcv34_passthrough - name: retryable_writes_jscore_passthrough_WT + - name: retryable_writes_jscore_stepdown_passthrough_WT - name: rlp - name: rollback_fuzzer_WT - name: sasl @@ -9147,6 +9168,7 @@ buildvariants: - name: replica_sets_kill_secondaries_jscore_passthrough_WT - name: replica_sets_jscore_fcv34_passthrough_WT - name: retryable_writes_jscore_passthrough_WT + - name: retryable_writes_jscore_stepdown_passthrough_WT - name: sasl - name: session_jscore_passthrough_WT - name: sharded_collections_jscore_passthrough_WT @@ -9298,6 +9320,7 @@ buildvariants: - name: replica_sets_kill_secondaries_jscore_passthrough_WT - name: replica_sets_jscore_fcv34_passthrough_WT - name: retryable_writes_jscore_passthrough_WT + - name: retryable_writes_jscore_stepdown_passthrough_WT - name: sasl - name: session_jscore_passthrough_WT - name: sharded_collections_jscore_passthrough_WT diff --git a/jstests/libs/override_methods/auto_retry_on_network_error.js b/jstests/libs/override_methods/auto_retry_on_network_error.js index 3386c468428..24d7f9b0528 100644 --- a/jstests/libs/override_methods/auto_retry_on_network_error.js +++ b/jstests/libs/override_methods/auto_retry_on_network_error.js @@ -13,6 +13,8 @@ load("jstests/libs/retryable_writes_util.js"); + const kMaxNumRetries = 3; + // Store a session to access ServerSession#canRetryWrites. let _serverSession; @@ -37,6 +39,118 @@ this, cmdObj, mongoRunCommandWithMetadataOriginal, arguments); }; + // Commands assumed to not be blindly retryable. 
+ const kNonRetryableCommands = new Set([ + // Commands that take write concern and do not support txnNumbers. + "_configsvrAddShard", + "_configsvrAddShardToZone", + "_configsvrCommitChunkMerge", + "_configsvrCommitChunkMigration", + "_configsvrCommitChunkSplit", + "_configsvrCreateDatabase", + "_configsvrEnableSharding", + "_configsvrMoveChunk", + "_configsvrMovePrimary", + "_configsvrRemoveShard", + "_configsvrRemoveShardFromZone", + "_configsvrShardCollection", + "_configsvrUpdateZoneKeyRange", + "_mergeAuthzCollections", + "_recvChunkStart", + "appendOplogNote", + "applyOps", + "authSchemaUpgrade", + "captrunc", + "cleanupOrphaned", + "clone", + "cloneCollection", + "cloneCollectionAsCapped", + "collMod", + "convertToCapped", + "copydb", + "create", + "createIndexes", + "createRole", + "createUser", + "deleteIndexes", + "drop", + "dropAllRolesFromDatabase", + "dropAllUsersFromDatabase", + "dropDatabase", + "dropIndexes", + "dropRole", + "dropUser", + "emptycapped", + "godinsert", + "grantPrivilegesToRole", + "grantRolesToRole", + "grantRolesToUser", + "mapreduce.shardedfinish", + "moveChunk", + "renameCollection", + "revokePrivilegesFromRole", + "revokeRolesFromRole", + "revokeRolesFromUser", + "updateRole", + "updateUser", + + // Other commands. + "eval", // May contain non-retryable commands. + "$eval", + ]); + + // These commands are not idempotent because they return errors if retried after + // successfully completing (like IndexNotFound, NamespaceExists, etc.), but because they + // only take effect once, and many tests use them to set up state, their errors on retries + // are handled specially. + const kAcceptableNonRetryableCommands = new Set([ + "create", + "createIndexes", + "deleteIndexes", + "drop", + "dropDatabase", // Already ignores NamespaceNotFound errors, so not handled below. 
+ "dropIndexes", + ]); + + function isAcceptableNonRetryableCommand(cmdName) { + return kAcceptableNonRetryableCommands.has(cmdName); + } + + function isAcceptableRetryFailedResponse(cmdName, res) { + return ((cmdName === "create" && res.code === ErrorCodes.NamespaceExists) || + (cmdName === "createIndexes" && res.code === ErrorCodes.IndexAlreadyExists) || + (cmdName === "drop" && res.code === ErrorCodes.NamespaceNotFound) || + ((cmdName === "dropIndexes" || cmdName === "deleteIndexes") && + res.code === ErrorCodes.IndexNotFound)); + } + + // Commands that may return different values or fail if retried on a new primary after a + // failover. + const kNonFailoverTolerantCommands = new Set([ + "currentOp", // Failovers can change currentOp output. + "getLog", // The log is different on different servers. + "killOp", // Failovers may interrupt operations intended to be killed later in the test. + "logRotate", + "planCacheClear", // The plan cache isn't replicated. + "planCacheClearFilters", + "planCacheListFilters", + "planCacheListPlans", + "planCacheListQueryShapes", + "planCacheSetFilter", + "profile", // Not replicated, so can't tolerate failovers. + "setParameter", // Not replicated, so can't tolerate failovers. + "stageDebug", + ]); + + // Several commands that use the plan executor swallow the actual error code from a failed plan + // into their error message and instead return OperationFailed. + // + // TODO SERVER-32208: Remove this function once it is no longer needed. 
+ function isRetryableExecutorCodeAndMessage(code, msg) { + return code === ErrorCodes.OperationFailed && typeof msg !== "undefined" && + msg.indexOf("InterruptedDueToReplStateChange") >= 0; + } + function runWithRetriesOnNetworkErrors(mongo, cmdObj, clientFunction, clientFunctionArguments) { let cmdName = Object.keys(cmdObj)[0]; @@ -50,11 +164,137 @@ const isRetryableWriteCmd = RetryableWritesUtil.isRetryableWriteCmdName(cmdName); const canRetryWrites = _serverSession.canRetryWrites(cmdObj); - let numRetries = !jsTest.options().skipRetryOnNetworkError ? 1 : 0; + let numRetries = !jsTest.options().skipRetryOnNetworkError ? kMaxNumRetries : 0; + + // Validate the command before running it, to prevent tests with non-retryable commands + // from being run. + if (isRetryableWriteCmd && !canRetryWrites) { + throw new Error("Refusing to run a test that issues non-retryable write operations" + + " since the test likely makes assertions on the write results and" + + " can lead to spurious failures if a network error occurs."); + } else if (cmdName === "getMore") { + throw new Error( + "Refusing to run a test that issues a getMore command since if a network error" + + " occurs during it then we won't know whether the cursor was advanced or not."); + } else if (kNonRetryableCommands.has(cmdName) && + !isAcceptableNonRetryableCommand(cmdName)) { + throw new Error( + "Refusing to run a test that issues commands that are not blindly retryable, " + + " cmdName: " + cmdName); + } else if (kNonFailoverTolerantCommands.has(cmdName)) { + throw new Error( + "Refusing to run a test that issues commands that may return different values" + + " after a failover, cmdName: " + cmdName); + } else if (cmdName === "aggregate") { + // Aggregate can be either a read or a write depending on whether it has a $out stage. + // $out is required to be the last stage of the pipeline. + var stages = cmdObj.pipeline; + const lastStage = stages && Array.isArray(stages) && (stages.length !== 0) + ? 
stages[stages.length - 1] + : undefined; + const hasOut = + lastStage && (typeof lastStage === 'object') && lastStage.hasOwnProperty('$out'); + const hasExplain = cmdObj.hasOwnProperty("explain"); + if (hasExplain) { + throw new Error( + "Refusing to run a test that issues an aggregation command with explain" + + " because it may return incomplete results if interrupted by a stepdown."); + } + if (hasOut) { + throw new Error("Refusing to run a test that issues an aggregation command" + + " with $out because it is not retryable."); + } + } else if (cmdName === "mapReduce" || cmdName === "mapreduce") { + throw new Error( + "Refusing to run a test that issues a mapReduce command, because it calls " + + " std::terminate() if interrupted by a stepdown."); + } do { try { - return clientFunction.apply(mongo, clientFunctionArguments); + let res = clientFunction.apply(mongo, clientFunctionArguments); + + if (isRetryableWriteCmd && canRetryWrites) { + // findAndModify can fail during the find stage and return an executor error. + if ((cmdName === "findandmodify" || cmdName === "findAndModify") && + isRetryableExecutorCodeAndMessage(res.code, res.errmsg)) { + print("=-=-=-= Retrying because of executor interruption: " + cmdName + + ", retries remaining: " + numRetries); + continue; + } + + // Don't interfere with retryable writes. + return res; + } + + if (cmdName === "explain") { + // If an explain is interrupted by a stepdown, and it returns before its + // connection is closed, it will return incomplete results. To prevent failing + // the test, force retries of interrupted explains. 
+ if (res.hasOwnProperty("executionStats") && + !res.executionStats.executionSuccess && + (RetryableWritesUtil.isRetryableCode(res.executionStats.errorCode) || + isRetryableExecutorCodeAndMessage(res.executionStats.errorCode, + res.executionStats.errorMessage))) { + print("=-=-=-= Forcing retry of interrupted explain, res: " + tojson(res)); + continue; + } + + // An explain command can fail if its child command cannot be run on the current + // server. This can be hit if a primary only or not explicitly slaveOk command + // is accepted by a primary node that then steps down and returns before having + // its connection closed. + if (!res.ok && + res.errmsg.indexOf("child command cannot run on this node") >= 0) { + print( + "=-=-=-= Forcing retry of explain likely interrupted by transition to" + + " secondary, res: " + tojson(res)); + continue; + } + } + + if (!res.ok) { + if (numRetries > 0) { + if (RetryableWritesUtil.isRetryableCode(res.code)) { + // Don't decrement retries, because the command returned before the + // connection was closed, so a subsequent attempt will receive a + // network error (or NotMaster error) and need to retry. + print("=-=-=-= Retrying failed response with retryable code: " + + res.code + ", for command: " + cmdName + ", retries remaining: " + + numRetries); + continue; + } + + if (isRetryableExecutorCodeAndMessage(res.code, res.errmsg)) { + // Don't decrement retries for the same reason as above. + print("=-=-=-= Retrying because of executor interruption: " + cmdName + + ", retries remaining: " + numRetries); + continue; + } + } + + // Swallow safe errors that may come from a retry since the command may have + // completed before the connection was closed. 
+ if (isAcceptableRetryFailedResponse(cmdName, res)) { + print("=-=-=-= Overriding safe failed response for: " + cmdName + + ", retries remaining: " + numRetries); + res.ok = 1; + } + } + + if (res.writeConcernError && numRetries > 0) { + if (RetryableWritesUtil.isRetryableCode(res.writeConcernError.code)) { + // Don't decrement retries, because the command returned before the + // connection was closed, so a subsequent attempt will receive a + // network error (or NotMaster error) and need to retry. + print("=-=-=-= Retrying write concern error with retryable code: " + + res.writeConcernError.code + ", for command: " + cmdName + + ", retries remaining: " + numRetries); + continue; + } + } + + return res; } catch (e) { if (!isNetworkError(e) || numRetries === 0) { throw e; @@ -64,21 +304,12 @@ // or will go through the retry logic in SessionAwareClient, so propagate // the error. throw e; - } else { - throw new Error( - "Cowardly refusing to run a test that issues non-retryable write" + - " operations since the test likely makes assertions on the write" + - " results and can lead to spurious failures if a network error" + - " occurs."); } - } else if (cmdName === "getMore") { - throw new Error( - "Cowardly refusing to run a test that issues a getMore command since if" + - " a network error occurs during it then we won't know whether the cursor" + - " was advanced or not."); } --numRetries; + print("=-=-=-= Retrying on network error for command: " + cmdName + + ", retries remaining: " + numRetries); } } while (numRetries >= 0); } @@ -97,4 +328,28 @@ return startParallelShellOriginal(newCode, port, noConnect); }; + + const connectOriginal = connect; + + connect = function(url, user, pass) { + let retVal; + + let connectionAttempts = 0; + assert.soon( + () => { + try { + connectionAttempts += 1; + retVal = connectOriginal.apply(this, arguments); + return true; + } catch (e) { + print("=-=-=-= Retrying connection to: " + url + ", attempts: " + + connectionAttempts + ", 
failed with: " + tojson(e)); + } + }, + "Failed connecting to url: " + tojson(url), + undefined, // Default timeout. + 2000); // 2 second interval. + + return retVal; + }; })(); diff --git a/jstests/libs/override_methods/set_read_and_write_concerns.js b/jstests/libs/override_methods/set_read_and_write_concerns.js index 8bbf29226f2..7d1e3fc8e0f 100644 --- a/jstests/libs/override_methods/set_read_and_write_concerns.js +++ b/jstests/libs/override_methods/set_read_and_write_concerns.js @@ -62,7 +62,21 @@ // These commands directly support a writeConcern argument. var commandsToForceWriteConcern = [ + "_configsvrAddShard", + "_configsvrAddShardToZone", + "_configsvrCommitChunkMerge", + "_configsvrCommitChunkMigration", + "_configsvrCommitChunkSplit", + "_configsvrCreateDatabase", + "_configsvrEnableSharding", + "_configsvrMoveChunk", + "_configsvrMovePrimary", + "_configsvrRemoveShard", + "_configsvrRemoveShardFromZone", + "_configsvrShardCollection", + "_configsvrUpdateZoneKeyRange", "_mergeAuthzCollections", + "_recvChunkStart", "appendOplogNote", "applyOps", "authSchemaUpgrade", @@ -71,7 +85,7 @@ "clone", "cloneCollection", "cloneCollectionAsCapped", - // "collMod", SERVER-25196 - not supported + "collMod", "convertToCapped", "copydb", "create", @@ -80,7 +94,6 @@ "createUser", "delete", "drop", - "dropDatabase", "dropAllRolesFromDatabase", "dropAllUsersFromDatabase", "dropDatabase", @@ -95,14 +108,10 @@ "grantRolesToRole", "grantRolesToUser", "insert", - "mapReduceFinish", - "mergeAuthzCollections", + "mapreduce.shardedfinish", "moveChunk", - "movePrimary", - "remove", "renameCollection", - "resvChunkStart", - "revokePriviligesFromRole", + "revokePrivilegesFromRole", "revokeRolesFromRole", "revokeRolesFromUser", "setFeatureCompatibilityVersion", diff --git a/jstests/libs/retryable_writes_util.js b/jstests/libs/retryable_writes_util.js index d545f4e9ed8..5105157eba1 100644 --- a/jstests/libs/retryable_writes_util.js +++ b/jstests/libs/retryable_writes_util.js @@ -2,14 
+2,22 @@ * Utilities for testing retryable writes. */ var RetryableWritesUtil = (function() { - const retryableWriteCommands = + /** + * Returns true if the error code is retryable, assuming the command is idempotent. + */ + function isRetryableCode(code) { + return ErrorCodes.isNetworkError(code) || ErrorCodes.isNotMasterError(code) || + ErrorCodes.isWriteConcernError(code) || ErrorCodes.isInterruption(code); + } + + const kRetryableWriteCommands = new Set(["delete", "findandmodify", "findAndModify", "insert", "update"]); /** * Returns true if the command name is that of a retryable write command. */ function isRetryableWriteCmdName(cmdName) { - return retryableWriteCommands.has(cmdName); + return kRetryableWriteCommands.has(cmdName); } const kStorageEnginesWithoutDocumentLocking = new Set(["ephemeralForTest", "mmapv1"]); @@ -22,5 +30,5 @@ var RetryableWritesUtil = (function() { return !kStorageEnginesWithoutDocumentLocking.has(storageEngineName); } - return {isRetryableWriteCmdName, storageEngineSupportsRetryableWrites}; + return {isRetryableCode, isRetryableWriteCmdName, storageEngineSupportsRetryableWrites}; })(); diff --git a/src/mongo/shell/session.js b/src/mongo/shell/session.js index 48e3eab554a..eb0cc9a6464 100644 --- a/src/mongo/shell/session.js +++ b/src/mongo/shell/session.js @@ -280,6 +280,47 @@ var { } } + /** + * Returns true if the error code is retryable, assuming the command is idempotent. + */ + function isRetryableCode(code) { + return ErrorCodes.isNetworkError(code) || ErrorCodes.isNotMasterError(code) || + // The driver's spec does not allow retrying on writeConcern errors, so only do so + // when testing retryable writes. + (jsTest.options().alwaysInjectTransactionNumber && + ErrorCodes.isWriteConcernError(code)); + } + + /** + * Returns the error code from a write response that should be used in the check for + * retryability. 
+ */ + function getEffectiveWriteErrorCode(res) { + let code; + if (res instanceof WriteResult) { + if (res.hasWriteError()) { + code = res.getWriteError().code; + } else if (res.hasWriteConcernError()) { + code = res.getWriteConcernError().code; + } + } else if (res instanceof BulkWriteResult) { + if (res.hasWriteErrors()) { + code = res.getWriteErrorAt(0).code; + } else if (res.hasWriteConcernError()) { + code = res.getWriteConcernError().code; + } + } else { + if (res.writeError) { + code = res.writeError.code; + } else if (res.writeErrors) { + code = res.writeErrors[0].code; + } else if (res.writeConcernError) { + code = res.writeConcernError.code; + } + } + return code; + } + function runClientFunctionWithRetries( driverSession, cmdObj, clientFunction, clientFunctionArguments) { let cmdName = Object.keys(cmdObj)[0]; @@ -296,13 +337,42 @@ var { ? 1 : 0; + if (numRetries > 0 && jsTest.options().overrideRetryAttempts) { + numRetries = jsTest.options().overrideRetryAttempts; + } + do { try { - const res = clientFunction.apply(client, clientFunctionArguments); - if (res.ok === 1 || numRetries === 0 || - !ErrorCodes.isNotMasterError(res.code)) { - return res; + let res = clientFunction.apply(client, clientFunctionArguments); + + if (numRetries > 0) { + if (!res.ok && isRetryableCode(res.code)) { + // Don't decrement retries, because the command returned before the + // connection was closed, so a subsequent attempt will receive a + // network error (or NotMaster error) and need to retry. 
+ if (jsTest.options().logRetryAttempts) { + print("=-=-=-= Retrying failed response with retryable code: " + + res.code + ", for command: " + cmdName + + ", retries remaining: " + numRetries); + } + continue; + } + + let code = getEffectiveWriteErrorCode(res); + if (isRetryableCode(code)) { + // Don't decrement retries, because the command returned before the + // connection was closed, so a subsequent attempt will receive a network + // error (or NotMaster error) and need to retry. + if (jsTest.options().logRetryAttempts) { + print("=-=-=-= Retrying write with retryable write error code: " + + code + ", for command: " + cmdName + ", retries remaining: " + + numRetries); + } + continue; + } } + + return res; } catch (e) { if (!isNetworkError(e) || numRetries === 0) { throw e; @@ -329,6 +399,10 @@ var { } --numRetries; + if (jsTest.options().logRetryAttempts) { + print("=-=-=-= Retrying on network error for command: " + cmdName + + ", retries remaining: " + numRetries); + } } while (numRetries >= 0); } diff --git a/src/mongo/shell/utils.js b/src/mongo/shell/utils.js index 638580ebf3c..f7ec71ed8b6 100644 --- a/src/mongo/shell/utils.js +++ b/src/mongo/shell/utils.js @@ -261,6 +261,9 @@ jsTestOptions = function() { alwaysInjectTransactionNumber: TestData.alwaysInjectTransactionNumber, skipGossipingClusterTime: TestData.skipGossipingClusterTime || false, disableEnableSessions: TestData.disableEnableSessions, + overrideRetryAttempts: TestData.overrideRetryAttempts || 0, + logRetryAttempts: TestData.logRetryAttempts || false, + connectionString: TestData.connectionString || "", }); } return _jsTestOptions; |