summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Jack Mulrow <jack.mulrow@mongodb.com> 2017-12-07 10:24:06 -0500
committer Jack Mulrow <jack.mulrow@mongodb.com> 2017-12-07 14:36:44 -0500
commit 51b699b02a5858a115a95af206253104c46e4bb0 (patch)
tree 4177385765c59bfddfb6bc21031bd6e9be3bebb6
parent 00d92ece19c5c4057d21eb237a2f9905b196191d (diff)
download mongo-51b699b02a5858a115a95af206253104c46e4bb0.tar.gz
SERVER-31194 Add a version of retryable_writes_jscore_passthrough.yml with stepdowns
-rw-r--r-- buildscripts/resmokeconfig/suites/retryable_writes_jscore_passthrough.yml 3
-rw-r--r-- buildscripts/resmokeconfig/suites/retryable_writes_jscore_stepdown_passthrough.yml 142
-rw-r--r-- buildscripts/resmokelib/core/programs.py 6
-rw-r--r-- buildscripts/resmokelib/testing/hooks/stepdown.py 11
-rw-r--r-- etc/evergreen.yml 23
-rw-r--r-- jstests/libs/override_methods/auto_retry_on_network_error.js 281
-rw-r--r-- jstests/libs/override_methods/set_read_and_write_concerns.js 25
-rw-r--r-- jstests/libs/retryable_writes_util.js 14
-rw-r--r-- src/mongo/shell/session.js 82
-rw-r--r-- src/mongo/shell/utils.js 3
10 files changed, 559 insertions, 31 deletions
diff --git a/buildscripts/resmokeconfig/suites/retryable_writes_jscore_passthrough.yml b/buildscripts/resmokeconfig/suites/retryable_writes_jscore_passthrough.yml
index 62ecd89e683..d7ca295ad1a 100644
--- a/buildscripts/resmokeconfig/suites/retryable_writes_jscore_passthrough.yml
+++ b/buildscripts/resmokeconfig/suites/retryable_writes_jscore_passthrough.yml
@@ -29,6 +29,9 @@ selector:
# TODO SERVER-31245: Inserts to "system.indexes" bypass the check for retryability.
- jstests/core/batch_write_command_insert.js
+ # TODO SERVER-31198: Remove once retry attempts are always decremented.
+ - jstests/core/write_result.js
+
executor:
config:
shell_options:
diff --git a/buildscripts/resmokeconfig/suites/retryable_writes_jscore_stepdown_passthrough.yml b/buildscripts/resmokeconfig/suites/retryable_writes_jscore_stepdown_passthrough.yml
new file mode 100644
index 00000000000..a52965222b6
--- /dev/null
+++ b/buildscripts/resmokeconfig/suites/retryable_writes_jscore_stepdown_passthrough.yml
@@ -0,0 +1,142 @@
+test_kind: js_test
+
+selector:
+ roots:
+ - jstests/core/**/*.js
+ exclude_files:
+ # These tests are not expected to pass with replica-sets:
+ - jstests/core/capped_update.js
+ - jstests/core/dbadmin.js
+ - jstests/core/opcounters_write_cmd.js
+ - jstests/core/read_after_optime.js
+
+ # No-op retries are not ignored by top, the profiler, or opcount.
+ - jstests/core/operation_latency_histogram.js
+ - jstests/core/profile2.js
+ - jstests/core/profile3.js
+ - jstests/core/profile_findandmodify.js
+ - jstests/core/top.js
+ - jstests/core/views/views_stats.js
+
+ # TODO SERVER-31249: getLastError should not be affected by no-op retries.
+ - jstests/core/bulk_legacy_enforce_gle.js
+
+ # TODO SERVER-31242: findAndModify no-op retry should respect the fields option.
+ - jstests/core/crud_api.js
+ - jstests/core/find_and_modify2.js
+ - jstests/core/find_and_modify_server6865.js
+
+ # TODO SERVER-31245: Inserts to "system.indexes" bypass the check for retryability.
+ - jstests/core/batch_write_command_insert.js
+
+ # Stepdown commands during fsync lock will fail.
+ - jstests/core/currentop.js
+ - jstests/core/fsync.js
+ - jstests/core/killop_drop_collection.js
+
+ # Expect drops/creates to fail or have a certain response:
+ - jstests/core/drop.js
+ - jstests/core/dropdb.js
+ - jstests/core/explain_upsert.js
+ - jstests/core/indexes_multiple_commands.js
+
+ # Expect certain responses, but retries of successfully completed commands may return
+ # different values:
+ - jstests/core/create_indexes.js
+ - jstests/core/objid5.js
+
+ # Expect results to return in a certain order, but secondaries may apply ops out of order.
+ - jstests/core/coveredIndex1.js
+ - jstests/core/find1.js
+ - jstests/core/sortc.js
+
+ # Use $listLocalSessions aggregation stage command:
+ - jstests/core/list_all_local_cursors.js
+ - jstests/core/list_all_local_sessions.js
+ - jstests/core/list_local_sessions.js
+
+ - jstests/core/bench_test*.js # benchRun() used for writes
+ - jstests/core/connection_string_validation.js # Does not expect a replica set connection string.
+ - jstests/core/explain_large_bounds.js # Stepdown can timeout waiting for global lock.
+ - jstests/core/insert2.js # Creates new mongo connection.
+ - jstests/core/list_collections_filter.js # Temporary collections are dropped on failover.
+ - jstests/core/startup_log.js # Checks pid, which is different on each server.
+ - jstests/core/validate_cmd_ns.js # Calls _exec() directly, not retryable.
+
+ exclude_with_any_tags:
+ - cannot_inject_read_write_concern
+
+ # The next four tags correspond to the special errors thrown by the auto_retry_on_network_error.js
+ # override when it refuses to run a certain command. Above each tag are the messages that
+ # warrant it.
+
+ # "Refusing to run a test that issues a getMore command since if a network error occurs during
+ # it then we won't know whether the cursor was advanced or not"
+ - requires_getmore
+
+ # "Refusing to run a test that issues non-retryable write operations since the test likely makes
+ # assertions on the write results and can lead to spurious failures if a network error occurs"
+ - requires_non_retryable_writes
+
+ # "Refusing to run a test that issues commands that are not blindly retryable"
+ # "Refusing to run a test that issues an aggregation command with $out because it is not
+ # retryable"
+ - requires_non_retryable_commands
+
+ # "Refusing to run a test that issues commands that may return different values after a failover"
+ # "Refusing to run a test that issues an aggregation command with explain because it may return
+ # incomplete results"
+ # "Refusing to run a test that issues a mapReduce command, because it calls std::terminate() if
+ # interrupted by a stepdown"
+ - does_not_support_stepdowns
+
+executor:
+ config:
+ shell_options:
+ eval: >-
+ testingReplication = true;
+ load("jstests/libs/override_methods/auto_retry_on_network_error.js");
+ db = connect(TestData.connectionString);
+ load("jstests/libs/override_methods/enable_sessions.js");
+ load("jstests/libs/override_methods/set_read_and_write_concerns.js");
+ global_vars:
+ TestData:
+ alwaysInjectTransactionNumber: true
+ defaultReadConcernLevel: "majority"
+ logRetryAttempts: true
+ overrideRetryAttempts: 3
+ sessionOptions:
+ readConcern:
+ level: "majority"
+ # Force DBClientRS to find the primary for non-write commands.
+ readPreference:
+ mode: "primary"
+ retryWrites: true
+ # We specify nodb so the shell used by each test will attempt to connect after loading the
+ # retry logic in auto_retry_on_network_error.js.
+ nodb: ""
+ readMode: commands
+ hooks:
+ - class: ContinuousStepdown
+ # The CheckReplDBHash hook waits until all operations have replicated to and have been applied
+ # on the secondaries, so we run the ValidateCollections hook after it to ensure we're
+ # validating the entire contents of the collection.
+ - class: CheckReplOplogs
+ - class: CheckReplDBHash
+ - class: ValidateCollections
+ - class: CleanEveryN
+ n: 20
+ fixture:
+ class: ReplicaSetFixture
+ mongod_options:
+ enableMajorityReadConcern: ''
+ oplogSize: 511
+ set_parameters:
+ enableTestCommands: 1
+ numInitialSyncAttempts: 1
+ all_nodes_electable: true
+ num_nodes: 3
+ replset_config_options:
+ settings:
+ # Speeds up failover, reduces chance DBClientRS times out retargeting the primary.
+ electionTimeoutMillis: 6500
diff --git a/buildscripts/resmokelib/core/programs.py b/buildscripts/resmokelib/core/programs.py
index 097e0d7d1b4..ec7522dedc8 100644
--- a/buildscripts/resmokelib/core/programs.py
+++ b/buildscripts/resmokelib/core/programs.py
@@ -201,6 +201,12 @@ def mongo_shell_program(logger, executable=None, connection_string=None, filenam
if "eval_prepend" in kwargs:
eval_sb.append(str(kwargs.pop("eval_prepend")))
+ # If nodb is specified, pass the connection string through TestData so it can be used inside the
+ # test, then delete it so it isn't given as an argument to the mongo shell.
+ if "nodb" in kwargs and connection_string is not None:
+ test_data["connectionString"] = connection_string
+ connection_string = None
+
for var_name in global_vars:
_format_shell_vars(eval_sb, var_name, global_vars[var_name])
diff --git a/buildscripts/resmokelib/testing/hooks/stepdown.py b/buildscripts/resmokelib/testing/hooks/stepdown.py
index 8ad70794807..438f4208cad 100644
--- a/buildscripts/resmokelib/testing/hooks/stepdown.py
+++ b/buildscripts/resmokelib/testing/hooks/stepdown.py
@@ -135,7 +135,9 @@ class _StepdownThread(threading.Thread):
now = time.time()
if now - self._last_exec > self._stepdown_interval_secs:
self._step_down_all()
- self._last_exec = now
+ # Wait until each replica set has a primary, so the test can make progress.
+ self._await_primaries()
+ self._last_exec = time.time()
now = time.time()
# 'wait_secs' is used to wait 'self._stepdown_interval_secs' from the moment the last
# stepdown command was sent.
@@ -158,8 +160,7 @@ class _StepdownThread(threading.Thread):
# Wait until we are no longer executing stepdowns.
self._is_idle_evt.wait()
# Wait until all the replica sets have primaries.
- for fixture in self._rs_fixtures:
- fixture.get_primary()
+ self._await_primaries()
def resume(self):
"""Resumes the thread."""
@@ -177,6 +178,10 @@ class _StepdownThread(threading.Thread):
# Wait until stop or timeout.
self._is_stopped_evt.wait(timeout)
+ def _await_primaries(self):  # Calls get_primary() on every replica set fixture; callers use this to wait for primaries.
+ for fixture in self._rs_fixtures:
+ fixture.get_primary()
+
def _step_down_all(self):
self._is_idle_evt.clear()
for rs_fixture in self._rs_fixtures:
diff --git a/etc/evergreen.yml b/etc/evergreen.yml
index 2a61e2bbf2e..ff9f8297e99 100644
--- a/etc/evergreen.yml
+++ b/etc/evergreen.yml
@@ -4224,6 +4224,17 @@ tasks:
run_multiple_jobs: true
- <<: *task_template
+ name: retryable_writes_jscore_stepdown_passthrough_WT
+ depends_on:
+ - name: jsCore_WT
+ commands:
+ - func: "do setup"
+ - func: "run tests"
+ vars:
+ resmoke_args: --suites=retryable_writes_jscore_stepdown_passthrough --storageEngine=wiredTiger
+ run_multiple_jobs: true
+
+- <<: *task_template
name: watchdog
commands:
- func: "do setup"
@@ -5145,6 +5156,7 @@ buildvariants:
- name: replica_sets_pv0
- name: replica_sets_rollback_refetch_no_uuid
- name: retryable_writes_jscore_passthrough_WT
+ - name: retryable_writes_jscore_stepdown_passthrough_WT
- name: master_slave_WT
- name: master_slave_auth
- name: master_slave_jscore_passthrough_WT
@@ -6145,6 +6157,9 @@ buildvariants:
- name: retryable_writes_jscore_passthrough_WT
distros:
- windows-64-vs2015-large
+ - name: retryable_writes_jscore_stepdown_passthrough_WT
+ distros:
+ - windows-64-vs2015-large
- name: session_jscore_passthrough_WT
- name: sharding_WT
distros:
@@ -6640,6 +6655,7 @@ buildvariants:
- name: replica_sets_jscore_passthrough_WT
- name: replica_sets_kill_secondaries_jscore_passthrough_WT
- name: retryable_writes_jscore_passthrough_WT
+ - name: retryable_writes_jscore_stepdown_passthrough_WT
- name: rollback_fuzzer_WT
- name: serial_run_WT
- name: session_jscore_passthrough_WT
@@ -6939,6 +6955,9 @@ buildvariants:
- name: retryable_writes_jscore_passthrough_WT
distros:
- rhel62-large
+ - name: retryable_writes_jscore_stepdown_passthrough_WT
+ distros:
+ - rhel62-large
- name: rlp_WT
- name: sasl
- name: session_jscore_passthrough_WT
@@ -7074,6 +7093,7 @@ buildvariants:
- name: replica_sets_resync_static_jscore_passthrough_WT
- name: replica_sets_kill_secondaries_jscore_passthrough_WT
- name: retryable_writes_jscore_passthrough_WT
+ - name: retryable_writes_jscore_stepdown_passthrough_WT
- name: sasl
- name: session_jscore_passthrough_WT
- name: sharded_collections_jscore_passthrough_WT
@@ -8795,6 +8815,7 @@ buildvariants:
- name: replica_sets_pv0
- name: replica_sets_jscore_passthrough
- name: retryable_writes_jscore_passthrough_WT
+ - name: retryable_writes_jscore_stepdown_passthrough_WT
- name: rlp
- name: rollback_fuzzer_WT
- name: sasl
@@ -9189,6 +9210,7 @@ buildvariants:
- name: replica_sets_resync_static_jscore_passthrough_WT
- name: replica_sets_kill_secondaries_jscore_passthrough_WT
- name: retryable_writes_jscore_passthrough_WT
+ - name: retryable_writes_jscore_stepdown_passthrough_WT
- name: sasl
- name: session_jscore_passthrough_WT
- name: sharded_collections_jscore_passthrough_WT
@@ -9341,6 +9363,7 @@ buildvariants:
- name: replica_sets_resync_static_jscore_passthrough_WT
- name: replica_sets_kill_secondaries_jscore_passthrough_WT
- name: retryable_writes_jscore_passthrough_WT
+ - name: retryable_writes_jscore_stepdown_passthrough_WT
- name: sasl
- name: session_jscore_passthrough_WT
- name: sharded_collections_jscore_passthrough_WT
diff --git a/jstests/libs/override_methods/auto_retry_on_network_error.js b/jstests/libs/override_methods/auto_retry_on_network_error.js
index 3386c468428..24d7f9b0528 100644
--- a/jstests/libs/override_methods/auto_retry_on_network_error.js
+++ b/jstests/libs/override_methods/auto_retry_on_network_error.js
@@ -13,6 +13,8 @@
load("jstests/libs/retryable_writes_util.js");
+ const kMaxNumRetries = 3;
+
// Store a session to access ServerSession#canRetryWrites.
let _serverSession;
@@ -37,6 +39,118 @@
this, cmdObj, mongoRunCommandWithMetadataOriginal, arguments);
};
+ // Commands assumed to not be blindly retryable; runWithRetriesOnNetworkErrors() throws before running any of them unless the command is also listed in kAcceptableNonRetryableCommands.
+ const kNonRetryableCommands = new Set([
+ // Commands that take write concern and do not support txnNumbers.
+ "_configsvrAddShard",
+ "_configsvrAddShardToZone",
+ "_configsvrCommitChunkMerge",
+ "_configsvrCommitChunkMigration",
+ "_configsvrCommitChunkSplit",
+ "_configsvrCreateDatabase",
+ "_configsvrEnableSharding",
+ "_configsvrMoveChunk",
+ "_configsvrMovePrimary",
+ "_configsvrRemoveShard",
+ "_configsvrRemoveShardFromZone",
+ "_configsvrShardCollection",
+ "_configsvrUpdateZoneKeyRange",
+ "_mergeAuthzCollections",
+ "_recvChunkStart",
+ "appendOplogNote",
+ "applyOps",
+ "authSchemaUpgrade",
+ "captrunc",
+ "cleanupOrphaned",
+ "clone",
+ "cloneCollection",
+ "cloneCollectionAsCapped",
+ "collMod",
+ "convertToCapped",
+ "copydb",
+ "create",
+ "createIndexes",
+ "createRole",
+ "createUser",
+ "deleteIndexes",
+ "drop",
+ "dropAllRolesFromDatabase",
+ "dropAllUsersFromDatabase",
+ "dropDatabase",
+ "dropIndexes",
+ "dropRole",
+ "dropUser",
+ "emptycapped",
+ "godinsert",
+ "grantPrivilegesToRole",
+ "grantRolesToRole",
+ "grantRolesToUser",
+ "mapreduce.shardedfinish",
+ "moveChunk",
+ "renameCollection",
+ "revokePrivilegesFromRole",
+ "revokeRolesFromRole",
+ "revokeRolesFromUser",
+ "updateRole",
+ "updateUser",
+
+ // Other commands.
+ "eval", // May contain non-retryable commands.
+ "$eval",
+ ]);
+
+ // These commands are not idempotent because they return errors if retried after
+ // successfully completing (like IndexNotFound, NamespaceExists, etc.), but because they
+ // only take effect once, and many tests use them to set up state, their errors on retries
+ // are handled specially: isAcceptableRetryFailedResponse() recognizes the expected error and the retry loop then reports the command as successful (res.ok = 1).
+ const kAcceptableNonRetryableCommands = new Set([
+ "create",
+ "createIndexes",
+ "deleteIndexes",
+ "drop",
+ "dropDatabase", // Already ignores NamespaceNotFound errors, so not handled below.
+ "dropIndexes",
+ ]);
+
+ function isAcceptableNonRetryableCommand(cmdName) {  // True iff cmdName is in kAcceptableNonRetryableCommands, i.e. exempt from the kNonRetryableCommands refusal.
+ return kAcceptableNonRetryableCommands.has(cmdName);
+ }
+
+ function isAcceptableRetryFailedResponse(cmdName, res) {  // True when res.code is the error expected from re-running cmdName after it already took effect.
+ return ((cmdName === "create" && res.code === ErrorCodes.NamespaceExists) ||
+ (cmdName === "createIndexes" && res.code === ErrorCodes.IndexAlreadyExists) ||
+ (cmdName === "drop" && res.code === ErrorCodes.NamespaceNotFound) ||
+ ((cmdName === "dropIndexes" || cmdName === "deleteIndexes") &&  // deleteIndexes is treated exactly like dropIndexes here.
+ res.code === ErrorCodes.IndexNotFound));
+ }
+
+ // Commands that may return different values or fail if retried on a new primary after a
+ // failover; runWithRetriesOnNetworkErrors() refuses to run a test that issues any of them.
+ const kNonFailoverTolerantCommands = new Set([
+ "currentOp", // Failovers can change currentOp output.
+ "getLog", // The log is different on different servers.
+ "killOp", // Failovers may interrupt operations intended to be killed later in the test.
+ "logRotate",
+ "planCacheClear", // The plan cache isn't replicated.
+ "planCacheClearFilters",
+ "planCacheListFilters",
+ "planCacheListPlans",
+ "planCacheListQueryShapes",
+ "planCacheSetFilter",
+ "profile", // Not replicated, so can't tolerate failovers.
+ "setParameter", // Not replicated, so can't tolerate failovers.
+ "stageDebug",
+ ]);
+
+ // Several commands that use the plan executor swallow the actual error code from a failed plan
+ // into their error message and instead return OperationFailed. This helper therefore matches on OperationFailed plus the text "InterruptedDueToReplStateChange" in the message.
+ //
+ // TODO SERVER-32208: Remove this function once it is no longer needed.
+ function isRetryableExecutorCodeAndMessage(code, msg) {
+ return code === ErrorCodes.OperationFailed && typeof msg !== "undefined" &&
+ msg.indexOf("InterruptedDueToReplStateChange") >= 0;  // NOTE(review): only undefined is guarded; a null or non-string msg would throw here — confirm callers.
+ }
+
function runWithRetriesOnNetworkErrors(mongo, cmdObj, clientFunction, clientFunctionArguments) {
let cmdName = Object.keys(cmdObj)[0];
@@ -50,11 +164,137 @@
const isRetryableWriteCmd = RetryableWritesUtil.isRetryableWriteCmdName(cmdName);
const canRetryWrites = _serverSession.canRetryWrites(cmdObj);
- let numRetries = !jsTest.options().skipRetryOnNetworkError ? 1 : 0;
+ let numRetries = !jsTest.options().skipRetryOnNetworkError ? kMaxNumRetries : 0;
+
+ // Validate the command before running it, to prevent tests with non-retryable commands
+ // from being run.
+ if (isRetryableWriteCmd && !canRetryWrites) {
+ throw new Error("Refusing to run a test that issues non-retryable write operations" +
+ " since the test likely makes assertions on the write results and" +
+ " can lead to spurious failures if a network error occurs.");
+ } else if (cmdName === "getMore") {
+ throw new Error(
+ "Refusing to run a test that issues a getMore command since if a network error" +
+ " occurs during it then we won't know whether the cursor was advanced or not.");
+ } else if (kNonRetryableCommands.has(cmdName) &&
+ !isAcceptableNonRetryableCommand(cmdName)) {
+ throw new Error(
+ "Refusing to run a test that issues commands that are not blindly retryable, " +
+ " cmdName: " + cmdName);
+ } else if (kNonFailoverTolerantCommands.has(cmdName)) {
+ throw new Error(
+ "Refusing to run a test that issues commands that may return different values" +
+ " after a failover, cmdName: " + cmdName);
+ } else if (cmdName === "aggregate") {
+ // Aggregate can be either a read or a write depending on whether it has a $out stage.
+ // $out is required to be the last stage of the pipeline.
+ var stages = cmdObj.pipeline;
+ const lastStage = stages && Array.isArray(stages) && (stages.length !== 0)
+ ? stages[stages.length - 1]
+ : undefined;
+ const hasOut =
+ lastStage && (typeof lastStage === 'object') && lastStage.hasOwnProperty('$out');
+ const hasExplain = cmdObj.hasOwnProperty("explain");
+ if (hasExplain) {
+ throw new Error(
+ "Refusing to run a test that issues an aggregation command with explain" +
+ " because it may return incomplete results if interrupted by a stepdown.");
+ }
+ if (hasOut) {
+ throw new Error("Refusing to run a test that issues an aggregation command" +
+ " with $out because it is not retryable.");
+ }
+ } else if (cmdName === "mapReduce" || cmdName === "mapreduce") {
+ throw new Error(
+ "Refusing to run a test that issues a mapReduce command, because it calls " +
+ " std::terminate() if interrupted by a stepdown.");
+ }
do {
try {
- return clientFunction.apply(mongo, clientFunctionArguments);
+ let res = clientFunction.apply(mongo, clientFunctionArguments);
+
+ if (isRetryableWriteCmd && canRetryWrites) {
+ // findAndModify can fail during the find stage and return an executor error.
+ if ((cmdName === "findandmodify" || cmdName === "findAndModify") &&
+ isRetryableExecutorCodeAndMessage(res.code, res.errmsg)) {
+ print("=-=-=-= Retrying because of executor interruption: " + cmdName +
+ ", retries remaining: " + numRetries);
+ continue;
+ }
+
+ // Don't interfere with retryable writes.
+ return res;
+ }
+
+ if (cmdName === "explain") {
+ // If an explain is interrupted by a stepdown, and it returns before its
+ // connection is closed, it will return incomplete results. To prevent failing
+ // the test, force retries of interrupted explains.
+ if (res.hasOwnProperty("executionStats") &&
+ !res.executionStats.executionSuccess &&
+ (RetryableWritesUtil.isRetryableCode(res.executionStats.errorCode) ||
+ isRetryableExecutorCodeAndMessage(res.executionStats.errorCode,
+ res.executionStats.errorMessage))) {
+ print("=-=-=-= Forcing retry of interrupted explain, res: " + tojson(res));
+ continue;
+ }
+
+ // An explain command can fail if its child command cannot be run on the current
+ // server. This can be hit if a primary only or not explicitly slaveOk command
+ // is accepted by a primary node that then steps down and returns before having
+ // its connection closed.
+ if (!res.ok &&
+ res.errmsg.indexOf("child command cannot run on this node") >= 0) {
+ print(
+ "=-=-=-= Forcing retry of explain likely interrupted by transition to" +
+ " secondary, res: " + tojson(res));
+ continue;
+ }
+ }
+
+ if (!res.ok) {
+ if (numRetries > 0) {
+ if (RetryableWritesUtil.isRetryableCode(res.code)) {
+ // Don't decrement retries, because the command returned before the
+ // connection was closed, so a subsequent attempt will receive a
+ // network error (or NotMaster error) and need to retry.
+ print("=-=-=-= Retrying failed response with retryable code: " +
+ res.code + ", for command: " + cmdName + ", retries remaining: " +
+ numRetries);
+ continue;
+ }
+
+ if (isRetryableExecutorCodeAndMessage(res.code, res.errmsg)) {
+ // Don't decrement retries for the same reason as above.
+ print("=-=-=-= Retrying because of executor interruption: " + cmdName +
+ ", retries remaining: " + numRetries);
+ continue;
+ }
+ }
+
+ // Swallow safe errors that may come from a retry since the command may have
+ // completed before the connection was closed.
+ if (isAcceptableRetryFailedResponse(cmdName, res)) {
+ print("=-=-=-= Overriding safe failed response for: " + cmdName +
+ ", retries remaining: " + numRetries);
+ res.ok = 1;
+ }
+ }
+
+ if (res.writeConcernError && numRetries > 0) {
+ if (RetryableWritesUtil.isRetryableCode(res.writeConcernError.code)) {
+ // Don't decrement retries, because the command returned before the
+ // connection was closed, so a subsequent attempt will receive a
+ // network error (or NotMaster error) and need to retry.
+ print("=-=-=-= Retrying write concern error with retryable code: " +
+ res.writeConcernError.code + ", for command: " + cmdName +
+ ", retries remaining: " + numRetries);
+ continue;
+ }
+ }
+
+ return res;
} catch (e) {
if (!isNetworkError(e) || numRetries === 0) {
throw e;
@@ -64,21 +304,12 @@
// or will go through the retry logic in SessionAwareClient, so propagate
// the error.
throw e;
- } else {
- throw new Error(
- "Cowardly refusing to run a test that issues non-retryable write" +
- " operations since the test likely makes assertions on the write" +
- " results and can lead to spurious failures if a network error" +
- " occurs.");
}
- } else if (cmdName === "getMore") {
- throw new Error(
- "Cowardly refusing to run a test that issues a getMore command since if" +
- " a network error occurs during it then we won't know whether the cursor" +
- " was advanced or not.");
}
--numRetries;
+ print("=-=-=-= Retrying on network error for command: " + cmdName +
+ ", retries remaining: " + numRetries);
}
} while (numRetries >= 0);
}
@@ -97,4 +328,28 @@
return startParallelShellOriginal(newCode, port, noConnect);
};
+
+ const connectOriginal = connect;  // Keep a reference to the existing connect() so the override below can delegate to it.
+
+ connect = function(url, user, pass) {  // Replaces connect() with a version that retries via assert.soon() until an attempt succeeds or assert.soon() gives up.
+ let retVal;
+
+ let connectionAttempts = 0;
+ assert.soon(
+ () => {
+ try {
+ connectionAttempts += 1;
+ retVal = connectOriginal.apply(this, arguments);  // Arrow function: `this` and `arguments` are those of the outer connect() call.
+ return true;
+ } catch (e) {
+ print("=-=-=-= Retrying connection to: " + url + ", attempts: " +
+ connectionAttempts + ", failed with: " + tojson(e));
+ }  // No return after a failure: the callback yields undefined (falsy), so assert.soon() polls again.
+ },
+ "Failed connecting to url: " + tojson(url),
+ undefined, // Default timeout.
+ 2000); // 2 second interval.
+
+ return retVal;
+ };
})();
diff --git a/jstests/libs/override_methods/set_read_and_write_concerns.js b/jstests/libs/override_methods/set_read_and_write_concerns.js
index 8bbf29226f2..7d1e3fc8e0f 100644
--- a/jstests/libs/override_methods/set_read_and_write_concerns.js
+++ b/jstests/libs/override_methods/set_read_and_write_concerns.js
@@ -62,7 +62,21 @@
// These commands directly support a writeConcern argument.
var commandsToForceWriteConcern = [
+ "_configsvrAddShard",
+ "_configsvrAddShardToZone",
+ "_configsvrCommitChunkMerge",
+ "_configsvrCommitChunkMigration",
+ "_configsvrCommitChunkSplit",
+ "_configsvrCreateDatabase",
+ "_configsvrEnableSharding",
+ "_configsvrMoveChunk",
+ "_configsvrMovePrimary",
+ "_configsvrRemoveShard",
+ "_configsvrRemoveShardFromZone",
+ "_configsvrShardCollection",
+ "_configsvrUpdateZoneKeyRange",
"_mergeAuthzCollections",
+ "_recvChunkStart",
"appendOplogNote",
"applyOps",
"authSchemaUpgrade",
@@ -71,7 +85,7 @@
"clone",
"cloneCollection",
"cloneCollectionAsCapped",
- // "collMod", SERVER-25196 - not supported
+ "collMod",
"convertToCapped",
"copydb",
"create",
@@ -80,7 +94,6 @@
"createUser",
"delete",
"drop",
- "dropDatabase",
"dropAllRolesFromDatabase",
"dropAllUsersFromDatabase",
"dropDatabase",
@@ -95,14 +108,10 @@
"grantRolesToRole",
"grantRolesToUser",
"insert",
- "mapReduceFinish",
- "mergeAuthzCollections",
+ "mapreduce.shardedfinish",
"moveChunk",
- "movePrimary",
- "remove",
"renameCollection",
- "resvChunkStart",
- "revokePriviligesFromRole",
+ "revokePrivilegesFromRole",
"revokeRolesFromRole",
"revokeRolesFromUser",
"setFeatureCompatibilityVersion",
diff --git a/jstests/libs/retryable_writes_util.js b/jstests/libs/retryable_writes_util.js
index d545f4e9ed8..5105157eba1 100644
--- a/jstests/libs/retryable_writes_util.js
+++ b/jstests/libs/retryable_writes_util.js
@@ -2,14 +2,22 @@
* Utilities for testing retryable writes.
*/
var RetryableWritesUtil = (function() {
- const retryableWriteCommands =
+ /**
+ * Returns true if the error code is retryable, assuming the command is idempotent.
+ *
+ * A code is treated as retryable when ErrorCodes classifies it as a network error, a
+ * NotMaster error, a write concern error, or an interruption.
+ */
+ function isRetryableCode(code) {
+ return ErrorCodes.isNetworkError(code) || ErrorCodes.isNotMasterError(code) ||
+ ErrorCodes.isWriteConcernError(code) || ErrorCodes.isInterruption(code);
+ }
+
+ const kRetryableWriteCommands =
new Set(["delete", "findandmodify", "findAndModify", "insert", "update"]);
/**
* Returns true if the command name is that of a retryable write command.
*/
function isRetryableWriteCmdName(cmdName) {
- return retryableWriteCommands.has(cmdName);
+ return kRetryableWriteCommands.has(cmdName);
}
const kStorageEnginesWithoutDocumentLocking = new Set(["ephemeralForTest", "mmapv1"]);
@@ -22,5 +30,5 @@ var RetryableWritesUtil = (function() {
return !kStorageEnginesWithoutDocumentLocking.has(storageEngineName);
}
- return {isRetryableWriteCmdName, storageEngineSupportsRetryableWrites};
+ return {isRetryableCode, isRetryableWriteCmdName, storageEngineSupportsRetryableWrites};
})();
diff --git a/src/mongo/shell/session.js b/src/mongo/shell/session.js
index 48e3eab554a..eb0cc9a6464 100644
--- a/src/mongo/shell/session.js
+++ b/src/mongo/shell/session.js
@@ -280,6 +280,47 @@ var {
}
}
+ /**
+ * Returns true if the error code is retryable, assuming the command is idempotent.
+ *
+ * Network and NotMaster error codes always qualify. Write concern error codes qualify only
+ * when jsTest.options().alwaysInjectTransactionNumber is set. The result is meant to be used
+ * for its truthiness: when that option is unset and no other check matches, the expression
+ * evaluates to the option's falsy value rather than to the boolean false.
+ */
+ function isRetryableCode(code) {
+ return ErrorCodes.isNetworkError(code) || ErrorCodes.isNotMasterError(code) ||
+ // The driver's spec does not allow retrying on writeConcern errors, so only do so
+ // when testing retryable writes.
+ (jsTest.options().alwaysInjectTransactionNumber &&
+ ErrorCodes.isWriteConcernError(code));
+ }
+
+ /**
+ * Returns the error code from a write response that should be used in the check for
+ * retryability.
+ *
+ * Handles three response shapes: a WriteResult, a BulkWriteResult, and any other object,
+ * which is inspected for writeError / writeErrors / writeConcernError fields. In every
+ * shape a write error takes precedence over a write concern error, and when several write
+ * errors are present only the first one is consulted. Returns undefined when the response
+ * carries neither kind of error.
+ */
+ function getEffectiveWriteErrorCode(res) {
+ let code;
+ if (res instanceof WriteResult) {
+ if (res.hasWriteError()) {
+ code = res.getWriteError().code;
+ } else if (res.hasWriteConcernError()) {
+ code = res.getWriteConcernError().code;
+ }
+ } else if (res instanceof BulkWriteResult) {
+ if (res.hasWriteErrors()) {
+ code = res.getWriteErrorAt(0).code;
+ } else if (res.hasWriteConcernError()) {
+ code = res.getWriteConcernError().code;
+ }
+ } else {
+ if (res.writeError) {
+ code = res.writeError.code;
+ } else if (res.writeErrors) {
+ code = res.writeErrors[0].code;  // NOTE(review): assumes writeErrors is non-empty; an empty array is truthy and would throw here — confirm.
+ } else if (res.writeConcernError) {
+ code = res.writeConcernError.code;
+ }
+ }
+ return code;
+ }
+
function runClientFunctionWithRetries(
driverSession, cmdObj, clientFunction, clientFunctionArguments) {
let cmdName = Object.keys(cmdObj)[0];
@@ -296,13 +337,42 @@ var {
? 1
: 0;
+ if (numRetries > 0 && jsTest.options().overrideRetryAttempts) {
+ numRetries = jsTest.options().overrideRetryAttempts;
+ }
+
do {
try {
- const res = clientFunction.apply(client, clientFunctionArguments);
- if (res.ok === 1 || numRetries === 0 ||
- !ErrorCodes.isNotMasterError(res.code)) {
- return res;
+ let res = clientFunction.apply(client, clientFunctionArguments);
+
+ if (numRetries > 0) {
+ if (!res.ok && isRetryableCode(res.code)) {
+ // Don't decrement retries, because the command returned before the
+ // connection was closed, so a subsequent attempt will receive a
+ // network error (or NotMaster error) and need to retry.
+ if (jsTest.options().logRetryAttempts) {
+ print("=-=-=-= Retrying failed response with retryable code: " +
+ res.code + ", for command: " + cmdName +
+ ", retries remaining: " + numRetries);
+ }
+ continue;
+ }
+
+ let code = getEffectiveWriteErrorCode(res);
+ if (isRetryableCode(code)) {
+ // Don't decrement retries, because the command returned before the
+ // connection was closed, so a subsequent attempt will receive a network
+ // error (or NotMaster error) and need to retry.
+ if (jsTest.options().logRetryAttempts) {
+ print("=-=-=-= Retrying write with retryable write error code: " +
+ code + ", for command: " + cmdName + ", retries remaining: " +
+ numRetries);
+ }
+ continue;
+ }
}
+
+ return res;
} catch (e) {
if (!isNetworkError(e) || numRetries === 0) {
throw e;
@@ -329,6 +399,10 @@ var {
}
--numRetries;
+ if (jsTest.options().logRetryAttempts) {
+ print("=-=-=-= Retrying on network error for command: " + cmdName +
+ ", retries remaining: " + numRetries);
+ }
} while (numRetries >= 0);
}
diff --git a/src/mongo/shell/utils.js b/src/mongo/shell/utils.js
index 638580ebf3c..f7ec71ed8b6 100644
--- a/src/mongo/shell/utils.js
+++ b/src/mongo/shell/utils.js
@@ -261,6 +261,9 @@ jsTestOptions = function() {
alwaysInjectTransactionNumber: TestData.alwaysInjectTransactionNumber,
skipGossipingClusterTime: TestData.skipGossipingClusterTime || false,
disableEnableSessions: TestData.disableEnableSessions,
+ overrideRetryAttempts: TestData.overrideRetryAttempts || 0,
+ logRetryAttempts: TestData.logRetryAttempts || false,
+ connectionString: TestData.connectionString || "",
});
}
return _jsTestOptions;