SERVER-31194 Add a version of retryable_writes_jscore_passthrough.yml with stepdowns

author: Jack Mulrow <jack.mulrow@mongodb.com> 2017-12-07 10:24:06 -0500
committer: Jack Mulrow <jack.mulrow@mongodb.com> 2017-12-07 14:36:44 -0500
commit: 51b699b02a5858a115a95af206253104c46e4bb0 (patch)
tree: 4177385765c59bfddfb6bc21031bd6e9be3bebb6
parent: 00d92ece19c5c4057d21eb237a2f9905b196191d (diff)
download: mongo-51b699b02a5858a115a95af206253104c46e4bb0.tar.gz
10 files changed, 559 insertions, 31 deletions
diff --git a/buildscripts/resmokeconfig/suites/retryable_writes_jscore_passthrough.yml b/buildscripts/resmokeconfig/suites/retryable_writes_jscore_passthrough.yml
index 62ecd89e683..d7ca295ad1a 100644
--- a/buildscripts/resmokeconfig/suites/retryable_writes_jscore_passthrough.yml
+++ b/buildscripts/resmokeconfig/suites/retryable_writes_jscore_passthrough.yml
@@ -29,6 +29,9 @@ selector:
   # TODO SERVER-31245: Inserts to "system.indexes" bypass the check for retryability.
   - jstests/core/batch_write_command_insert.js
 
+  # TODO SERVER-31198: Remove once retry attempts are always decremented.
+  - jstests/core/write_result.js
+
 executor:
   config:
     shell_options:
diff --git a/buildscripts/resmokeconfig/suites/retryable_writes_jscore_stepdown_passthrough.yml b/buildscripts/resmokeconfig/suites/retryable_writes_jscore_stepdown_passthrough.yml
new file mode 100644
index 00000000000..a52965222b6
--- /dev/null
+++ b/buildscripts/resmokeconfig/suites/retryable_writes_jscore_stepdown_passthrough.yml
@@ -0,0 +1,142 @@
+test_kind: js_test
+
+selector:
+  roots:
+  - jstests/core/**/*.js
+  exclude_files:
+  # These tests are not expected to pass with replica-sets:
+  - jstests/core/capped_update.js
+  - jstests/core/dbadmin.js
+  - jstests/core/opcounters_write_cmd.js
+  - jstests/core/read_after_optime.js
+
+  # No-op retries are not ignored by top, the profiler, or opcount.
+  - jstests/core/operation_latency_histogram.js
+  - jstests/core/profile2.js
+  - jstests/core/profile3.js
+  - jstests/core/profile_findandmodify.js
+  - jstests/core/top.js
+  - jstests/core/views/views_stats.js
+
+  # TODO SERVER-31249: getLastError should not be affected by no-op retries.
+  - jstests/core/bulk_legacy_enforce_gle.js
+
+  # TODO SERVER-31242: findAndModify no-op retry should respect the fields option.
+  - jstests/core/crud_api.js
+  - jstests/core/find_and_modify2.js
+  - jstests/core/find_and_modify_server6865.js
+
+  # TODO SERVER-31245: Inserts to "system.indexes" bypass the check for retryability.
+  - jstests/core/batch_write_command_insert.js
+
+  # Stepdown commands during fsync lock will fail.
+  - jstests/core/currentop.js
+  - jstests/core/fsync.js
+  - jstests/core/killop_drop_collection.js
+
+  # Expect drops/creates to fail or have a certain response:
+  - jstests/core/drop.js
+  - jstests/core/dropdb.js
+  - jstests/core/explain_upsert.js
+  - jstests/core/indexes_multiple_commands.js
+
+  # Expect certain responses, but retries of successfully completed commands may return
+  # different values:
+  - jstests/core/create_indexes.js
+  - jstests/core/objid5.js
+
+  # Expect results to return in a certain order; secondaries may apply ops out of order.
+  - jstests/core/coveredIndex1.js
+  - jstests/core/find1.js
+  - jstests/core/sortc.js
+
+  # Use $listLocalSessions aggregation stage command:
+  - jstests/core/list_all_local_cursors.js
+  - jstests/core/list_all_local_sessions.js
+  - jstests/core/list_local_sessions.js
+
+  - jstests/core/bench_test*.js # benchRun() used for writes
+  - jstests/core/connection_string_validation.js # Does not expect a replica set connection string.
+  - jstests/core/explain_large_bounds.js # Stepdown can timeout waiting for global lock.
+  - jstests/core/insert2.js # Creates new mongo connection.
+  - jstests/core/list_collections_filter.js # Temporary collections are dropped on failover.
+  - jstests/core/startup_log.js # Checks pid, which is different on each server.
+  - jstests/core/validate_cmd_ns.js # Calls _exec() directly, not retryable.
+
+  exclude_with_any_tags:
+  - cannot_inject_read_write_concern
+
+  # The next four tags correspond to the special errors thrown by the auto_retry_on_network_error.js
+  # override when it refuses to run a certain command. Above each tag are the messages that
+  # warrant it.
+
+  # "Refusing to run a test that issues a getMore command since if a network error occurs during
+  #   it then we won't know whether the cursor was advanced or not"
+  - requires_getmore
+
+  # "Refusing to run a test that issues non-retryable write operations since the test likely makes
+  #   assertions on the write results and can lead to spurious failures if a network error occurs"
+  - requires_non_retryable_writes
+
+  # "Refusing to run a test that issues commands that are not blindly retryable"
+  # "Refusing to run a test that issues an aggregation command with $out because it is not
+  #   retryable"
+  - requires_non_retryable_commands
+
+  # "Refusing to run a test that issues commands that may return different values after a failover"
+  # "Refusing to run a test that issues an aggregation command with explain because it may return
+  #   incomplete results"
+  # "Refusing to run a test that issues a mapReduce command, because it calls std::terminate() if
+  #   interrupted by a stepdown"
+  - does_not_support_stepdowns
+
+executor:
+  config:
+    shell_options:
+      eval: >-
+        testingReplication = true;
+        load("jstests/libs/override_methods/auto_retry_on_network_error.js");
+        db = connect(TestData.connectionString);
+        load("jstests/libs/override_methods/enable_sessions.js");
+        load("jstests/libs/override_methods/set_read_and_write_concerns.js");
+      global_vars:
+        TestData:
+          alwaysInjectTransactionNumber: true
+          defaultReadConcernLevel: "majority"
+          logRetryAttempts: true
+          overrideRetryAttempts: 3
+          sessionOptions:
+            readConcern:
+              level: "majority"
+            # Force DBClientRS to find the primary for non-write commands.
+            readPreference:
+              mode: "primary"
+            retryWrites: true
+      # We specify nodb so the shell used by each test will attempt to connect after loading the
+      # retry logic in auto_retry_on_network_error.js.
+      nodb: ""
+      readMode: commands
+  hooks:
+  - class: ContinuousStepdown
+  # The CheckReplDBHash hook waits until all operations have replicated to and have been applied
+  # on the secondaries, so we run the ValidateCollections hook after it to ensure we're
+  # validating the entire contents of the collection.
+  - class: CheckReplOplogs
+  - class: CheckReplDBHash
+  - class: ValidateCollections
+  - class: CleanEveryN
+    n: 20
+  fixture:
+    class: ReplicaSetFixture
+    mongod_options:
+      enableMajorityReadConcern: ''
+      oplogSize: 511
+      set_parameters:
+        enableTestCommands: 1
+        numInitialSyncAttempts: 1
+    all_nodes_electable: true
+    num_nodes: 3
+    replset_config_options:
+      settings:
+        # Speeds up failover, reduces chance DBClientRS times out retargeting the primary.
+        electionTimeoutMillis: 6500
diff --git a/buildscripts/resmokelib/core/programs.py b/buildscripts/resmokelib/core/programs.py
index 097e0d7d1b4..ec7522dedc8 100644
--- a/buildscripts/resmokelib/core/programs.py
+++ b/buildscripts/resmokelib/core/programs.py
@@ -201,6 +201,12 @@ def mongo_shell_program(logger, executable=None, connection_string=None, filenam
     if "eval_prepend" in kwargs:
         eval_sb.append(str(kwargs.pop("eval_prepend")))
 
+    # If nodb is specified, pass the connection string through TestData so it can be used inside the
+    # test, then delete it so it isn't given as an argument to the mongo shell.
+    if "nodb" in kwargs and connection_string is not None:
+        test_data["connectionString"] = connection_string
+        connection_string = None
+
     for var_name in global_vars:
         _format_shell_vars(eval_sb, var_name, global_vars[var_name])
 
diff --git a/buildscripts/resmokelib/testing/hooks/stepdown.py b/buildscripts/resmokelib/testing/hooks/stepdown.py
index 8ad70794807..438f4208cad 100644
--- a/buildscripts/resmokelib/testing/hooks/stepdown.py
+++ b/buildscripts/resmokelib/testing/hooks/stepdown.py
@@ -135,7 +135,9 @@ class _StepdownThread(threading.Thread):
             now = time.time()
             if now - self._last_exec > self._stepdown_interval_secs:
                 self._step_down_all()
-                self._last_exec = now
+                # Wait until each replica set has a primary, so the test can make progress.
+                self._await_primaries()
+                self._last_exec = time.time()
             now = time.time()
             # 'wait_secs' is used to wait 'self._stepdown_interval_secs' from the moment the last
             # stepdown command was sent.
@@ -158,8 +160,7 @@ class _StepdownThread(threading.Thread):
         # Wait until we are no longer executing stepdowns.
         self._is_idle_evt.wait()
         # Wait until we all the replica sets have primaries.
-        for fixture in self._rs_fixtures:
-            fixture.get_primary()
+        self._await_primaries()
 
     def resume(self):
         """Resumes the thread."""
@@ -177,6 +178,10 @@ class _StepdownThread(threading.Thread):
         # Wait until stop or timeout.
         self._is_stopped_evt.wait(timeout)
 
+    def _await_primaries(self):
+        for fixture in self._rs_fixtures:
+            fixture.get_primary()
+
     def _step_down_all(self):
         self._is_idle_evt.clear()
         for rs_fixture in self._rs_fixtures:
diff --git a/etc/evergreen.yml b/etc/evergreen.yml
index 2a61e2bbf2e..ff9f8297e99 100644
--- a/etc/evergreen.yml
+++ b/etc/evergreen.yml
@@ -4224,6 +4224,17 @@ tasks:
       run_multiple_jobs: true
 
 - <<: *task_template
+  name: retryable_writes_jscore_stepdown_passthrough_WT
+  depends_on:
+  - name: jsCore_WT
+  commands:
+  - func: "do setup"
+  - func: "run tests"
+    vars:
+      resmoke_args: --suites=retryable_writes_jscore_stepdown_passthrough --storageEngine=wiredTiger
+      run_multiple_jobs: true
+
+- <<: *task_template
   name: watchdog
   commands:
   - func: "do setup"
@@ -5145,6 +5156,7 @@ buildvariants:
   - name: replica_sets_pv0
   - name: replica_sets_rollback_refetch_no_uuid
   - name: retryable_writes_jscore_passthrough_WT
+  - name: retryable_writes_jscore_stepdown_passthrough_WT
   - name: master_slave_WT
   - name: master_slave_auth
   - name: master_slave_jscore_passthrough_WT
@@ -6145,6 +6157,9 @@ buildvariants:
   - name: retryable_writes_jscore_passthrough_WT
     distros:
     - windows-64-vs2015-large
+  - name: retryable_writes_jscore_stepdown_passthrough_WT
+    distros:
+    - windows-64-vs2015-large
   - name: session_jscore_passthrough_WT
   - name: sharding_WT
     distros:
@@ -6640,6 +6655,7 @@ buildvariants:
   - name: replica_sets_jscore_passthrough_WT
   - name: replica_sets_kill_secondaries_jscore_passthrough_WT
   - name: retryable_writes_jscore_passthrough_WT
+  - name: retryable_writes_jscore_stepdown_passthrough_WT
   - name: rollback_fuzzer_WT
   - name: serial_run_WT
   - name: session_jscore_passthrough_WT
@@ -6939,6 +6955,9 @@ buildvariants:
   - name: retryable_writes_jscore_passthrough_WT
     distros:
     - rhel62-large
+  - name: retryable_writes_jscore_stepdown_passthrough_WT
+    distros:
+    - rhel62-large
   - name: rlp_WT
   - name: sasl
   - name: session_jscore_passthrough_WT
@@ -7074,6 +7093,7 @@ buildvariants:
   - name: replica_sets_resync_static_jscore_passthrough_WT
   - name: replica_sets_kill_secondaries_jscore_passthrough_WT
   - name: retryable_writes_jscore_passthrough_WT
+  - name: retryable_writes_jscore_stepdown_passthrough_WT
   - name: sasl
   - name: session_jscore_passthrough_WT
   - name: sharded_collections_jscore_passthrough_WT
@@ -8795,6 +8815,7 @@ buildvariants:
   - name: replica_sets_pv0
   - name: replica_sets_jscore_passthrough
   - name: retryable_writes_jscore_passthrough_WT
+  - name: retryable_writes_jscore_stepdown_passthrough_WT
   - name: rlp
   - name: rollback_fuzzer_WT
   - name: sasl
@@ -9189,6 +9210,7 @@ buildvariants:
   - name: replica_sets_resync_static_jscore_passthrough_WT
   - name: replica_sets_kill_secondaries_jscore_passthrough_WT
   - name: retryable_writes_jscore_passthrough_WT
+  - name: retryable_writes_jscore_stepdown_passthrough_WT
   - name: sasl
   - name: session_jscore_passthrough_WT
   - name: sharded_collections_jscore_passthrough_WT
@@ -9341,6 +9363,7 @@ buildvariants:
   - name: replica_sets_resync_static_jscore_passthrough_WT
   - name: replica_sets_kill_secondaries_jscore_passthrough_WT
   - name: retryable_writes_jscore_passthrough_WT
+  - name: retryable_writes_jscore_stepdown_passthrough_WT
   - name: sasl
   - name: session_jscore_passthrough_WT
   - name: sharded_collections_jscore_passthrough_WT
diff --git a/jstests/libs/override_methods/auto_retry_on_network_error.js b/jstests/libs/override_methods/auto_retry_on_network_error.js
index 3386c468428..24d7f9b0528 100644
--- a/jstests/libs/override_methods/auto_retry_on_network_error.js
+++ b/jstests/libs/override_methods/auto_retry_on_network_error.js
@@ -13,6 +13,8 @@
 
     load("jstests/libs/retryable_writes_util.js");
 
+    const kMaxNumRetries = 3;
+
     // Store a session to access ServerSession#canRetryWrites.
     let _serverSession;
 
@@ -37,6 +39,118 @@
             this, cmdObj, mongoRunCommandWithMetadataOriginal, arguments);
     };
 
+    // Commands assumed to not be blindly retryable.
+    const kNonRetryableCommands = new Set([
+        // Commands that take write concern and do not support txnNumbers.
+        "_configsvrAddShard",
+        "_configsvrAddShardToZone",
+        "_configsvrCommitChunkMerge",
+        "_configsvrCommitChunkMigration",
+        "_configsvrCommitChunkSplit",
+        "_configsvrCreateDatabase",
+        "_configsvrEnableSharding",
+        "_configsvrMoveChunk",
+        "_configsvrMovePrimary",
+        "_configsvrRemoveShard",
+        "_configsvrRemoveShardFromZone",
+        "_configsvrShardCollection",
+        "_configsvrUpdateZoneKeyRange",
+        "_mergeAuthzCollections",
+        "_recvChunkStart",
+        "appendOplogNote",
+        "applyOps",
+        "authSchemaUpgrade",
+        "captrunc",
+        "cleanupOrphaned",
+        "clone",
+        "cloneCollection",
+        "cloneCollectionAsCapped",
+        "collMod",
+        "convertToCapped",
+        "copydb",
+        "create",
+        "createIndexes",
+        "createRole",
+        "createUser",
+        "deleteIndexes",
+        "drop",
+        "dropAllRolesFromDatabase",
+        "dropAllUsersFromDatabase",
+        "dropDatabase",
+        "dropIndexes",
+        "dropRole",
+        "dropUser",
+        "emptycapped",
+        "godinsert",
+        "grantPrivilegesToRole",
+        "grantRolesToRole",
+        "grantRolesToUser",
+        "mapreduce.shardedfinish",
+        "moveChunk",
+        "renameCollection",
+        "revokePrivilegesFromRole",
+        "revokeRolesFromRole",
+        "revokeRolesFromUser",
+        "updateRole",
+        "updateUser",
+
+        // Other commands.
+        "eval",  // May contain non-retryable commands.
+        "$eval",
+    ]);
+
+    // These commands are not idempotent because they return errors if retried after
+    // successfully completing (like IndexNotFound, NamespaceExists, etc.), but because they
+    // only take effect once, and many tests use them to set up state, their errors on retries
+    // are handled specially.
+    const kAcceptableNonRetryableCommands = new Set([
+        "create",
+        "createIndexes",
+        "deleteIndexes",
+        "drop",
+        "dropDatabase",  // Already ignores NamespaceNotFound errors, so not handled below.
+        "dropIndexes",
+    ]);
+
+    function isAcceptableNonRetryableCommand(cmdName) {
+        return kAcceptableNonRetryableCommands.has(cmdName);
+    }
+
+    function isAcceptableRetryFailedResponse(cmdName, res) {
+        return ((cmdName === "create" && res.code === ErrorCodes.NamespaceExists) ||
+                (cmdName === "createIndexes" && res.code === ErrorCodes.IndexAlreadyExists) ||
+                (cmdName === "drop" && res.code === ErrorCodes.NamespaceNotFound) ||
+                ((cmdName === "dropIndexes" || cmdName === "deleteIndexes") &&
+                 res.code === ErrorCodes.IndexNotFound));
+    }
+
+    // Commands that may return different values or fail if retried on a new primary after a
+    // failover.
+    const kNonFailoverTolerantCommands = new Set([
+        "currentOp",  // Failovers can change currentOp output.
+        "getLog",     // The log is different on different servers.
+        "killOp",     // Failovers may interrupt operations intended to be killed later in the test.
+        "logRotate",
+        "planCacheClear",  // The plan cache isn't replicated.
+        "planCacheClearFilters",
+        "planCacheListFilters",
+        "planCacheListPlans",
+        "planCacheListQueryShapes",
+        "planCacheSetFilter",
+        "profile",       // Not replicated, so can't tolerate failovers.
+        "setParameter",  // Not replicated, so can't tolerate failovers.
+        "stageDebug",
+    ]);
+
+    // Several commands that use the plan executor swallow the actual error code from a failed plan
+    // into their error message and instead return OperationFailed.
+    //
+    // TODO SERVER-32208: Remove this function once it is no longer needed.
+    function isRetryableExecutorCodeAndMessage(code, msg) {
+        return code === ErrorCodes.OperationFailed && typeof msg !== "undefined" &&
+            msg.indexOf("InterruptedDueToReplStateChange") >= 0;
+    }
+
     function runWithRetriesOnNetworkErrors(mongo, cmdObj, clientFunction, clientFunctionArguments) {
         let cmdName = Object.keys(cmdObj)[0];
 
@@ -50,11 +164,137 @@
         const isRetryableWriteCmd = RetryableWritesUtil.isRetryableWriteCmdName(cmdName);
         const canRetryWrites = _serverSession.canRetryWrites(cmdObj);
 
-        let numRetries = !jsTest.options().skipRetryOnNetworkError ? 1 : 0;
+        let numRetries = !jsTest.options().skipRetryOnNetworkError ? kMaxNumRetries : 0;
+
+        // Validate the command before running it, to prevent tests with non-retryable commands
+        // from being run.
+        if (isRetryableWriteCmd && !canRetryWrites) {
+            throw new Error("Refusing to run a test that issues non-retryable write operations" +
+                            " since the test likely makes assertions on the write results and" +
+                            " can lead to spurious failures if a network error occurs.");
+        } else if (cmdName === "getMore") {
+            throw new Error(
+                "Refusing to run a test that issues a getMore command since if a network error" +
+                " occurs during it then we won't know whether the cursor was advanced or not.");
+        } else if (kNonRetryableCommands.has(cmdName) &&
+                   !isAcceptableNonRetryableCommand(cmdName)) {
+            throw new Error(
+                "Refusing to run a test that issues commands that are not blindly retryable, " +
+                " cmdName: " + cmdName);
+        } else if (kNonFailoverTolerantCommands.has(cmdName)) {
+            throw new Error(
+                "Refusing to run a test that issues commands that may return different values" +
+                " after a failover, cmdName: " + cmdName);
+        } else if (cmdName === "aggregate") {
+            // Aggregate can be either a read or a write depending on whether it has a $out stage.
+            // $out is required to be the last stage of the pipeline.
+            var stages = cmdObj.pipeline;
+            const lastStage = stages && Array.isArray(stages) && (stages.length !== 0)
+                ? stages[stages.length - 1]
+                : undefined;
+            const hasOut =
+                lastStage && (typeof lastStage === 'object') && lastStage.hasOwnProperty('$out');
+            const hasExplain = cmdObj.hasOwnProperty("explain");
+            if (hasExplain) {
+                throw new Error(
+                    "Refusing to run a test that issues an aggregation command with explain" +
+                    " because it may return incomplete results if interrupted by a stepdown.");
+            }
+            if (hasOut) {
+                throw new Error("Refusing to run a test that issues an aggregation command" +
+                                " with $out because it is not retryable.");
+            }
+        } else if (cmdName === "mapReduce" || cmdName === "mapreduce") {
+            throw new Error(
+                "Refusing to run a test that issues a mapReduce command, because it calls " +
+                " std::terminate() if interrupted by a stepdown.");
+        }
 
         do {
             try {
-                return clientFunction.apply(mongo, clientFunctionArguments);
+                let res = clientFunction.apply(mongo, clientFunctionArguments);
+
+                if (isRetryableWriteCmd && canRetryWrites) {
+                    // findAndModify can fail during the find stage and return an executor error.
+                    if ((cmdName === "findandmodify" || cmdName === "findAndModify") &&
+                        isRetryableExecutorCodeAndMessage(res.code, res.errmsg)) {
+                        print("=-=-=-= Retrying because of executor interruption: " + cmdName +
+                              ", retries remaining: " + numRetries);
+                        continue;
+                    }
+
+                    // Don't interfere with retryable writes.
+                    return res;
+                }
+
+                if (cmdName === "explain") {
+                    // If an explain is interrupted by a stepdown, and it returns before its
+                    // connection is closed, it will return incomplete results. To prevent failing
+                    // the test, force retries of interrupted explains.
+                    if (res.hasOwnProperty("executionStats") &&
+                        !res.executionStats.executionSuccess &&
+                        (RetryableWritesUtil.isRetryableCode(res.executionStats.errorCode) ||
+                         isRetryableExecutorCodeAndMessage(res.executionStats.errorCode,
+                                                           res.executionStats.errorMessage))) {
+                        print("=-=-=-= Forcing retry of interrupted explain, res: " + tojson(res));
+                        continue;
+                    }
+
+                    // An explain command can fail if its child command cannot be run on the current
+                    // server. This can be hit if a primary only or not explicitly slaveOk command
+                    // is accepted by a primary node that then steps down and returns before having
+                    // its connection closed.
+                    if (!res.ok &&
+                        res.errmsg.indexOf("child command cannot run on this node") >= 0) {
+                        print(
+                            "=-=-=-= Forcing retry of explain likely interrupted by transition to" +
+                            " secondary, res: " + tojson(res));
+                        continue;
+                    }
+                }
+
+                if (!res.ok) {
+                    if (numRetries > 0) {
+                        if (RetryableWritesUtil.isRetryableCode(res.code)) {
+                            // Don't decrement retries, because the command returned before the
+                            // connection was closed, so a subsequent attempt will receive a
+                            // network error (or NotMaster error) and need to retry.
+                            print("=-=-=-= Retrying failed response with retryable code: " +
+                                  res.code + ", for command: " + cmdName + ", retries remaining: " +
+                                  numRetries);
+                            continue;
+                        }
+
+                        if (isRetryableExecutorCodeAndMessage(res.code, res.errmsg)) {
+                            // Don't decrement retries for the same reason as above.
+                            print("=-=-=-= Retrying because of executor interruption: " + cmdName +
+                                  ", retries remaining: " + numRetries);
+                            continue;
+                        }
+                    }
+
+                    // Swallow safe errors that may come from a retry since the command may have
+                    // completed before the connection was closed.
+                    if (isAcceptableRetryFailedResponse(cmdName, res)) {
+                        print("=-=-=-= Overriding safe failed response for: " + cmdName +
+                              ", retries remaining: " + numRetries);
+                        res.ok = 1;
+                    }
+                }
+
+                if (res.writeConcernError && numRetries > 0) {
+                    if (RetryableWritesUtil.isRetryableCode(res.writeConcernError.code)) {
+                        // Don't decrement retries, because the command returned before the
+                        // connection was closed, so a subsequent attempt will receive a
+                        // network error (or NotMaster error) and need to retry.
+                        print("=-=-=-= Retrying write concern error with retryable code: " +
+                              res.writeConcernError.code + ", for command: " + cmdName +
+                              ", retries remaining: " + numRetries);
+                        continue;
+                    }
+                }
+
+                return res;
             } catch (e) {
                 if (!isNetworkError(e) || numRetries === 0) {
                     throw e;
@@ -64,21 +304,12 @@
                         // or will go through the retry logic in SessionAwareClient, so propagate
                         // the error.
                         throw e;
-                    } else {
-                        throw new Error(
-                            "Cowardly refusing to run a test that issues non-retryable write" +
-                            " operations since the test likely makes assertions on the write" +
-                            " results and can lead to spurious failures if a network error" +
-                            " occurs.");
                     }
-                } else if (cmdName === "getMore") {
-                    throw new Error(
-                        "Cowardly refusing to run a test that issues a getMore command since if" +
-                        " a network error occurs during it then we won't know whether the cursor" +
-                        " was advanced or not.");
                 }
 
                 --numRetries;
+                print("=-=-=-= Retrying on network error for command: " + cmdName +
+                      ", retries remaining: " + numRetries);
             }
         } while (numRetries >= 0);
     }
@@ -97,4 +328,28 @@
 
         return startParallelShellOriginal(newCode, port, noConnect);
     };
+
+    const connectOriginal = connect;
+
+    connect = function(url, user, pass) {
+        let retVal;
+
+        let connectionAttempts = 0;
+        assert.soon(
+            () => {
+                try {
+                    connectionAttempts += 1;
+                    retVal = connectOriginal.apply(this, arguments);
+                    return true;
+                } catch (e) {
+                    print("=-=-=-= Retrying connection to: " + url + ", attempts: " +
+                          connectionAttempts + ", failed with: " + tojson(e));
+                }
+            },
+            "Failed connecting to url: " + tojson(url),
+            undefined,  // Default timeout.
+            2000);      // 2 second interval.
+
+        return retVal;
+    };
 })();
diff --git a/jstests/libs/override_methods/set_read_and_write_concerns.js b/jstests/libs/override_methods/set_read_and_write_concerns.js
index 8bbf29226f2..7d1e3fc8e0f 100644
--- a/jstests/libs/override_methods/set_read_and_write_concerns.js
+++ b/jstests/libs/override_methods/set_read_and_write_concerns.js
@@ -62,7 +62,21 @@
 
         // These commands directly support a writeConcern argument.
         var commandsToForceWriteConcern = [
+            "_configsvrAddShard",
+            "_configsvrAddShardToZone",
+            "_configsvrCommitChunkMerge",
+            "_configsvrCommitChunkMigration",
+            "_configsvrCommitChunkSplit",
+            "_configsvrCreateDatabase",
+            "_configsvrEnableSharding",
+            "_configsvrMoveChunk",
+            "_configsvrMovePrimary",
+            "_configsvrRemoveShard",
+            "_configsvrRemoveShardFromZone",
+            "_configsvrShardCollection",
+            "_configsvrUpdateZoneKeyRange",
             "_mergeAuthzCollections",
+            "_recvChunkStart",
             "appendOplogNote",
             "applyOps",
             "authSchemaUpgrade",
@@ -71,7 +85,7 @@
             "clone",
             "cloneCollection",
             "cloneCollectionAsCapped",
-            // "collMod", SERVER-25196 - not supported
+            "collMod",
             "convertToCapped",
             "copydb",
             "create",
@@ -80,7 +94,6 @@
             "createUser",
             "delete",
             "drop",
-            "dropDatabase",
             "dropAllRolesFromDatabase",
             "dropAllUsersFromDatabase",
             "dropDatabase",
@@ -95,14 +108,10 @@
             "grantRolesToRole",
             "grantRolesToUser",
             "insert",
-            "mapReduceFinish",
-            "mergeAuthzCollections",
+            "mapreduce.shardedfinish",
             "moveChunk",
-            "movePrimary",
-            "remove",
             "renameCollection",
-            "resvChunkStart",
-            "revokePriviligesFromRole",
+            "revokePrivilegesFromRole",
             "revokeRolesFromRole",
             "revokeRolesFromUser",
             "setFeatureCompatibilityVersion",
diff --git a/jstests/libs/retryable_writes_util.js b/jstests/libs/retryable_writes_util.js
index d545f4e9ed8..5105157eba1 100644
--- a/jstests/libs/retryable_writes_util.js
+++ b/jstests/libs/retryable_writes_util.js
@@ -2,14 +2,22 @@
  * Utilities for testing retryable writes.
  */
 var RetryableWritesUtil = (function() {
-    const retryableWriteCommands =
+    /**
+     * Returns true if the error code is retryable, assuming the command is idempotent.
+     */
+    function isRetryableCode(code) {
+        return ErrorCodes.isNetworkError(code) || ErrorCodes.isNotMasterError(code) ||
+            ErrorCodes.isWriteConcernError(code) || ErrorCodes.isInterruption(code);
+    }
+
+    const kRetryableWriteCommands =
         new Set(["delete", "findandmodify", "findAndModify", "insert", "update"]);
 
     /**
      * Returns true if the command name is that of a retryable write command.
      */
     function isRetryableWriteCmdName(cmdName) {
-        return retryableWriteCommands.has(cmdName);
+        return kRetryableWriteCommands.has(cmdName);
     }
 
     const kStorageEnginesWithoutDocumentLocking = new Set(["ephemeralForTest", "mmapv1"]);
@@ -22,5 +30,5 @@ var RetryableWritesUtil = (function() {
         return !kStorageEnginesWithoutDocumentLocking.has(storageEngineName);
     }
 
-    return {isRetryableWriteCmdName, storageEngineSupportsRetryableWrites};
+    return {isRetryableCode, isRetryableWriteCmdName, storageEngineSupportsRetryableWrites};
 })();
diff --git a/src/mongo/shell/session.js b/src/mongo/shell/session.js
index 48e3eab554a..eb0cc9a6464 100644
--- a/src/mongo/shell/session.js
+++ b/src/mongo/shell/session.js
@@ -280,6 +280,47 @@ var {
             }
         }
 
+        /**
+         * Returns true if the error code is retryable, assuming the command is idempotent.
+         */
+        function isRetryableCode(code) {
+            return ErrorCodes.isNetworkError(code) || ErrorCodes.isNotMasterError(code) ||
+                // The driver's spec does not allow retrying on writeConcern errors, so only do so
+                // when testing retryable writes.
+                (jsTest.options().alwaysInjectTransactionNumber &&
+                 ErrorCodes.isWriteConcernError(code));
+        }
+
+        /**
+         * Returns the error code from a write response that should be used in the check for
+         * retryability.
+         */
+        function getEffectiveWriteErrorCode(res) {
+            let code;
+            if (res instanceof WriteResult) {
+                if (res.hasWriteError()) {
+                    code = res.getWriteError().code;
+                } else if (res.hasWriteConcernError()) {
+                    code = res.getWriteConcernError().code;
+                }
+            } else if (res instanceof BulkWriteResult) {
+                if (res.hasWriteErrors()) {
+                    code = res.getWriteErrorAt(0).code;
+                } else if (res.hasWriteConcernError()) {
+                    code = res.getWriteConcernError().code;
+                }
+            } else {
+                if (res.writeError) {
+                    code = res.writeError.code;
+                } else if (res.writeErrors) {
+                    code = res.writeErrors[0].code;
+                } else if (res.writeConcernError) {
+                    code = res.writeConcernError.code;
+                }
+            }
+            return code;
+        }
+
         function runClientFunctionWithRetries(
             driverSession, cmdObj, clientFunction, clientFunctionArguments) {
             let cmdName = Object.keys(cmdObj)[0];
@@ -296,13 +337,42 @@ var {
                 ? 1
                 : 0;
 
+            if (numRetries > 0 && jsTest.options().overrideRetryAttempts) {
+                numRetries = jsTest.options().overrideRetryAttempts;
+            }
+
             do {
                 try {
-                    const res = clientFunction.apply(client, clientFunctionArguments);
-                    if (res.ok === 1 || numRetries === 0 ||
-                        !ErrorCodes.isNotMasterError(res.code)) {
-                        return res;
+                    let res = clientFunction.apply(client, clientFunctionArguments);
+
+                    if (numRetries > 0) {
+                        if (!res.ok && isRetryableCode(res.code)) {
+                            // Don't decrement retries, because the command returned before the
+                            // connection was closed, so a subsequent attempt will receive a
+                            // network error (or NotMaster error) and need to retry.
+                            if (jsTest.options().logRetryAttempts) {
+                                print("=-=-=-= Retrying failed response with retryable code: " +
+                                      res.code + ", for command: " + cmdName +
+                                      ", retries remaining: " + numRetries);
+                            }
+                            continue;
+                        }
+
+                        let code = getEffectiveWriteErrorCode(res);
+                        if (isRetryableCode(code)) {
+                            // Don't decrement retries, because the command returned before the
+                            // connection was closed, so a subsequent attempt will receive a network
+                            // error (or NotMaster error) and need to retry.
+                            if (jsTest.options().logRetryAttempts) {
+                                print("=-=-=-= Retrying write with retryable write error code: " +
+                                      code + ", for command: " + cmdName + ", retries remaining: " +
+                                      numRetries);
+                            }
+                            continue;
+                        }
                     }
+
+                    return res;
                 } catch (e) {
                     if (!isNetworkError(e) || numRetries === 0) {
                         throw e;
@@ -329,6 +399,10 @@ var {
                 }
 
                 --numRetries;
+                if (jsTest.options().logRetryAttempts) {
+                    print("=-=-=-= Retrying on network error for command: " + cmdName +
+                          ", retries remaining: " + numRetries);
+                }
             } while (numRetries >= 0);
         }
 
diff --git a/src/mongo/shell/utils.js b/src/mongo/shell/utils.js
index 638580ebf3c..f7ec71ed8b6 100644
--- a/src/mongo/shell/utils.js
+++ b/src/mongo/shell/utils.js
@@ -261,6 +261,9 @@ jsTestOptions = function() {
             alwaysInjectTransactionNumber: TestData.alwaysInjectTransactionNumber,
             skipGossipingClusterTime: TestData.skipGossipingClusterTime || false,
             disableEnableSessions: TestData.disableEnableSessions,
+            overrideRetryAttempts: TestData.overrideRetryAttempts || 0,
+            logRetryAttempts: TestData.logRetryAttempts || false,
+            connectionString: TestData.connectionString || "",
         });
     }
     return _jsTestOptions;
author	Jack Mulrow <jack.mulrow@mongodb.com>	2017-12-07 10:24:06 -0500
committer	Jack Mulrow <jack.mulrow@mongodb.com>	2017-12-07 14:36:44 -0500
commit	51b699b02a5858a115a95af206253104c46e4bb0 (patch)
tree	4177385765c59bfddfb6bc21031bd6e9be3bebb6
parent	00d92ece19c5c4057d21eb237a2f9905b196191d (diff)
download	mongo-51b699b02a5858a115a95af206253104c46e4bb0.tar.gz