author     Pavi Vetriselvan <pvselvan@umich.edu>             2020-03-16 11:13:36 -0400
committer  Evergreen Agent <no-reply@evergreen.mongodb.com>  2020-05-14 12:42:24 +0000
commit     f4528563033d933ca920b3e4b2a5e3344e198a5c (patch)
tree       8c20856b344e02483dceb1e13f35533e41db3ecd
parent     cd9fbb56900343e7b1193922a2c4b197895e7f56 (diff)
SERVER-45094 add disabled replica set reconfig passthroughs
SERVER-45094 add retryable read logic to network_error_and_txn_override.js (cherry picked from commit f59f63db6c37c0d4657b57d559c95d830b0e34c2)
SERVER-45094 add replica_sets_reconfig_jscore_passthrough suite (cherry picked from commit 4d91fac171cbe3f2af53d9258965399e648a1947)
SERVER-45094 use w:1 writes and remove causal consistency in reconfig passthrough (cherry picked from commit a43cb23defc6182d08a7814e4731ef98f2d30b6a)
SERVER-45094 add replica_sets_reconfig_jscore_stepdown_passthrough (cherry picked from commit 81e0ad27c280c02a49beb65ff4473d5dce62b089)
SERVER-45094 add replica_sets_reconfig_kill_primary_jscore_passthrough (cherry picked from commit 2debab7987b24bf902f9a128654ce928441c29a2)
SERVER-47678 stepdown and kill primary reconfig passthroughs should ignore ReplicaSetMonitorErrors (cherry picked from commit 91672e58f1169c7edd684b911f20f62b8a71f8d1)
SERVER-47544 always increase election timeout to 24 hours in passthrough suites (cherry picked from commit 81d53a715f49827a9f2538d4572f9b01f2b12887)
-rw-r--r--  buildscripts/resmokeconfig/suites/replica_sets_reconfig_jscore_passthrough.yml              |  84
-rw-r--r--  buildscripts/resmokeconfig/suites/replica_sets_reconfig_jscore_stepdown_passthrough.yml     | 198
-rw-r--r--  buildscripts/resmokeconfig/suites/replica_sets_reconfig_kill_primary_jscore_passthrough.yml | 214
-rw-r--r--  buildscripts/resmokelib/testing/fixtures/replicaset.py                                      |  20
-rw-r--r--  buildscripts/resmokelib/testing/hooks/reconfig_background.py                                |  71
-rw-r--r--  buildscripts/resmokelib/testing/hooks/stepdown.py                                            |  26
-rw-r--r--  etc/evergreen.yml                                                                            |  30
-rw-r--r--  jstests/hooks/run_reconfig_background.js                                                     | 147
-rw-r--r--  jstests/libs/override_methods/network_error_and_txn_override.js                             |  52
-rw-r--r--  jstests/replsets/txn_override_unittests.js                                                   | 210
10 files changed, 1043 insertions, 9 deletions
diff --git a/buildscripts/resmokeconfig/suites/replica_sets_reconfig_jscore_passthrough.yml b/buildscripts/resmokeconfig/suites/replica_sets_reconfig_jscore_passthrough.yml
new file mode 100644
index 00000000000..8285339493b
--- /dev/null
+++ b/buildscripts/resmokeconfig/suites/replica_sets_reconfig_jscore_passthrough.yml
@@ -0,0 +1,84 @@
+test_kind: js_test
+# This suite starts a 5-node replica set and uses the DoReconfigInBackground hook to periodically run
+# safe reconfigs against the primary. These reconfigs change the number of voting nodes in the
+# replica set, which changes the voting majority used to satisfy the config commitment check and
+# oplog commitment check.
+
+selector:
+ roots:
+ - jstests/core/**/*.js
+ exclude_files:
+ # Transactions do not support retryability of individual operations.
+ # TODO: Remove this once it is supported (SERVER-33952).
+ - jstests/core/txns/**/*.js
+ # These tests are not expected to pass with replica-sets:
+ - jstests/core/dbadmin.js
+ - jstests/core/opcounters_write_cmd.js
+ - jstests/core/read_after_optime.js
+ - jstests/core/capped_update.js
+ # The set_param1.js test attempts to compare the response from running the {getParameter: "*"}
+ # command multiple times, which may observe the change to the "transactionLifetimeLimitSeconds"
+ # server parameter.
+ - jstests/core/set_param1.js
+
+ # These tests run commands using legacy queries, which are not supported on sessions.
+ - jstests/core/comment_field.js
+ - jstests/core/exhaust.js
+ - jstests/core/invalidated_legacy_cursors.js
+ - jstests/core/validate_cmd_ns.js
+
+ # Unacknowledged writes prohibited in an explicit session.
+ - jstests/core/batch_write_command_w0.js
+ - jstests/core/crud_api.js
+
+ - jstests/core/connection_string_validation.js # Does not expect a replica set connection string.
+
+ exclude_with_any_tags:
+ - assumes_read_preference_unchanged
+ - requires_sharding
+
+executor:
+ archive:
+ hooks:
+ - CheckReplDBHash
+ - CheckReplOplogs
+ - ValidateCollections
+ config:
+ shell_options:
+ eval: >-
+ testingReplication = true;
+ load('jstests/libs/override_methods/network_error_and_txn_override.js');
+ load("jstests/libs/override_methods/enable_sessions.js");
+ global_vars:
+ TestData:
+ networkErrorAndTxnOverrideConfig:
+ backgroundReconfigs: true
+ sessionOptions:
+ # Force DBClientRS to find the primary for non-write commands to make sure reads still
+ # work as expected during reconfigs.
+ readPreference:
+ mode: "primary"
+ readMode: commands
+ hooks:
+ # The CheckReplDBHash hook waits until all operations have replicated to and have been applied
+ # on the secondaries, so we run the ValidateCollections hook after it to ensure we're
+ # validating the entire contents of the collection.
+ - class: DoReconfigInBackground
+ shell_options:
+ nodb: ""
+ - class: CheckReplOplogs
+ - class: CheckReplDBHash
+ - class: ValidateCollections
+ - class: CleanEveryN
+ n: 20
+ fixture:
+ class: ReplicaSetFixture
+ mongod_options:
+ enableMajorityReadConcern: ''
+ set_parameters:
+ enableTestCommands: 1
+ logComponentVerbosity:
+ replication:
+ heartbeats: 2
+ all_nodes_electable: true
+ num_nodes: 5
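
The "voting majority" mentioned in the suite comment above is a strict majority of the members with votes: 1, so the reconfigs run by DoReconfigInBackground move the threshold that both the config commitment check and the oplog commitment check must satisfy. A minimal Python sketch of that arithmetic (illustration only; not part of this patch):

    def commitment_majority(num_voting_members):
        # Strict majority of the voting members: floor(n / 2) + 1.
        return num_voting_members // 2 + 1

    # With 5 nodes, the hook toggles one non-primary member's votes between 0 and 1,
    # so the set oscillates between these thresholds.
    for voters in (3, 4, 5):
        print(voters, "voting members -> majority of", commitment_majority(voters))
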
diff --git a/buildscripts/resmokeconfig/suites/replica_sets_reconfig_jscore_stepdown_passthrough.yml b/buildscripts/resmokeconfig/suites/replica_sets_reconfig_jscore_stepdown_passthrough.yml
new file mode 100644
index 00000000000..d221ccba640
--- /dev/null
+++ b/buildscripts/resmokeconfig/suites/replica_sets_reconfig_jscore_stepdown_passthrough.yml
@@ -0,0 +1,198 @@
+test_kind: js_test
+# This suite starts a 5-node replica set and runs both the DoReconfigInBackground and
+# ContinuousStepdown hooks to periodically run safe reconfigs and stepdowns against the
+# primary. This tests that the concurrency between stepdowns and reconfigs is still ultimately
+# safe.
+
+selector:
+ roots:
+ - jstests/core/**/*.js
+ exclude_files:
+ # Transactions do not support retryability of individual operations.
+ # TODO: Remove this once it is supported (SERVER-33952).
+ - jstests/core/txns/**/*.js
+ # These tests are not expected to pass with replica-sets:
+ - jstests/core/dbadmin.js
+ - jstests/core/opcounters_write_cmd.js
+ - jstests/core/read_after_optime.js
+ - jstests/core/capped_update.js
+ # The set_param1.js test attempts to compare the response from running the {getParameter: "*"}
+ # command multiple times, which may observe the change to the "transactionLifetimeLimitSeconds"
+ # server parameter.
+ - jstests/core/set_param1.js
+
+ # No-op retries are not ignored by top, the profiler, or opcount.
+ - jstests/core/operation_latency_histogram.js
+ - jstests/core/profile2.js
+ - jstests/core/profile3.js
+ - jstests/core/profile_findandmodify.js
+ - jstests/core/top.js
+ - jstests/core/views/views_stats.js
+
+ # TODO SERVER-31249: getLastError should not be affected by no-op retries.
+ - jstests/core/bulk_legacy_enforce_gle.js
+
+ # TODO SERVER-31242: findAndModify no-op retry should respect the fields option.
+ - jstests/core/crud_api.js
+ - jstests/core/find_and_modify.js
+ - jstests/core/find_and_modify2.js
+ - jstests/core/find_and_modify_pipeline_update.js
+ - jstests/core/find_and_modify_server6865.js
+
+ # These tests run commands using legacy queries, which are not supported on sessions.
+ - jstests/core/comment_field.js
+ - jstests/core/exhaust.js
+ - jstests/core/validate_cmd_ns.js
+
+ # Stepdown commands during fsync lock will fail.
+ - jstests/core/currentop.js
+ - jstests/core/fsync.js
+ - jstests/core/killop_drop_collection.js
+
+ # Assert on the ismaster field of an isMaster response. If a primary steps down after accepting
+ # an isMaster command and returns before its connection is closed, the response can contain
+ # ismaster: false.
+ - jstests/core/dbadmin.js
+ - jstests/core/ismaster.js
+
+ # Spawns new mongo shells, which don't retry connecting on stepdown errors.
+ - jstests/core/shell_connection_strings.js
+
+ # Expect drops/creates to fail or have a certain response:
+ - jstests/core/drop.js
+ - jstests/core/dropdb.js
+ - jstests/core/explain_upsert.js
+ - jstests/core/indexes_multiple_commands.js
+
+ # Expect certain responses, but retries of successfully completed commands may return
+ # different values:
+ - jstests/core/create_indexes.js
+ - jstests/core/objid5.js
+
+ # Unacknowledged writes prohibited in an explicit session.
+ - jstests/core/batch_write_command_w0.js
+
+ - jstests/core/bench_test*.js # benchRun() used for writes
+ - jstests/core/benchrun_pipeline_updates.js # benchRun() used for writes
+ - jstests/core/connection_string_validation.js # Does not expect a replica set connection string.
+ - jstests/core/list_collections_filter.js # Temporary collections are dropped on failover.
+ - jstests/core/top.js # Tests read commands (including getMore) against the secondary
+ - jstests/core/drop3.js # getMore is not causally consistent if collection is dropped
+ - jstests/core/validate_cmd_ns.js # Calls _exec() directly, not retryable.
+ - jstests/core/list_collections_filter.js # Temporary collections are dropped on failover.
+ - jstests/core/explain_large_bounds.js # Stepdown can timeout waiting for global lock.
+
+ # Tests that fail for Causal Consistency as they have statements that do not support
+ # non-local read concern.
+ - jstests/core/collation.js
+ # Parallel shell is not causally consistent
+ - jstests/core/benchrun_pipeline_updates.js
+ - jstests/core/find_and_modify_concurrent_update.js
+ - jstests/core/shellstartparallel.js
+
+ exclude_with_any_tags:
+ ##
+ # The next four tags correspond to the special errors thrown by the auto_retry_on_network_error.js
+ # override when it refuses to run a certain command. Above each tag are the message(s) that cause
+ # the tag to be warranted.
+ ##
+ # "Refusing to run a test that issues a getMore command since if a network error occurs during
+ # it then we won't know whether the cursor was advanced or not"
+ - requires_getmore
+ # "Refusing to run a test that issues non-retryable write operations since the test likely makes
+ # assertions on the write results and can lead to spurious failures if a network error occurs"
+ - requires_non_retryable_writes
+ # "Refusing to run a test that issues commands that are not blindly retryable"
+ # "Refusing to run a test that issues an aggregation command with $out because it is not
+ # retryable"
+ - requires_non_retryable_commands
+ # "Refusing to run a test that issues commands that may return different values after a failover"
+ # "Refusing to run a test that issues an aggregation command with explain because it may return
+ # incomplete results"
+ # "Refusing to run a test that issues an aggregation command with
+ # $listLocalSessions because it relies on in-memory state that may not survive failovers"
+ # "Refusing to run a test that issues a mapReduce command, because it calls std::terminate() if
+ # interrupted by a stepdown"
+ - does_not_support_stepdowns
+ ##
+ # The next two tags correspond to the special errors thrown by the
+ # set_read_and_write_concerns.js override when it refuses to replace the readConcern or
+ # writeConcern of a particular command. Above each tag are the message(s) that cause the tag to be
+ # warranted.
+ ##
+ # "Cowardly refusing to override read concern of command: ..."
+ - assumes_read_concern_unchanged
+ # "Cowardly refusing to override write concern of command: ..."
+ - assumes_write_concern_unchanged
+ ## The next tag corresponds to long-running operations, as they may exhaust their number
+ # of retries and result in a network error being thrown.
+ - operations_longer_than_stepdown_interval
+ - does_not_support_causal_consistency
+ - uses_transactions
+ # collStats is not causally consistent
+ - requires_collstats
+ - requires_dbstats
+ - requires_datasize
+ - requires_sharding
+
+executor:
+ archive:
+ hooks:
+ - CheckReplDBHash
+ - CheckReplOplogs
+ - ValidateCollections
+ config:
+ shell_options:
+ eval: >-
+ testingReplication = true;
+ load('jstests/libs/override_methods/network_error_and_txn_override.js');
+ db = connect(TestData.connectionString);
+ load("jstests/libs/override_methods/enable_sessions.js");
+ load("jstests/libs/override_methods/set_read_and_write_concerns.js");
+ global_vars:
+ TestData:
+ runningWithCausalConsistency: true
+ alwaysInjectTransactionNumber: true
+ defaultReadConcernLevel: "majority"
+ logRetryAttempts: true
+ networkErrorAndTxnOverrideConfig:
+ retryOnNetworkErrors: true
+ backgroundReconfigs: true
+ overrideRetryAttempts: 3
+ sessionOptions:
+ writeConcern:
+ w: "majority"
+ readConcern:
+ level: "majority"
+ # Force DBClientRS to find the primary for non-write commands.
+ readPreference:
+ mode: "primary"
+ retryWrites: true
+ # We specify nodb so the shell used by each test will attempt to connect after loading the
+ # retry logic in auto_retry_on_network_error.js.
+ nodb: ""
+ readMode: commands
+ hooks:
+ - class: DoReconfigInBackground
+ shell_options:
+ nodb: ""
+ - class: ContinuousStepdown
+ # The CheckReplDBHash hook waits until all operations have replicated to and have been applied
+ # on the secondaries, so we run the ValidateCollections hook after it to ensure we're
+ # validating the entire contents of the collection.
+ - class: CheckReplOplogs
+ - class: CheckReplDBHash
+ - class: ValidateCollections
+ - class: CleanEveryN
+ n: 20
+ fixture:
+ class: ReplicaSetFixture
+ mongod_options:
+ enableMajorityReadConcern: ''
+ set_parameters:
+ enableTestCommands: 1
+ logComponentVerbosity:
+ replication:
+ heartbeats: 2
+ all_nodes_electable: true
+ num_nodes: 5
diff --git a/buildscripts/resmokeconfig/suites/replica_sets_reconfig_kill_primary_jscore_passthrough.yml b/buildscripts/resmokeconfig/suites/replica_sets_reconfig_kill_primary_jscore_passthrough.yml
new file mode 100644
index 00000000000..6c973e3dd06
--- /dev/null
+++ b/buildscripts/resmokeconfig/suites/replica_sets_reconfig_kill_primary_jscore_passthrough.yml
@@ -0,0 +1,214 @@
+test_kind: js_test
+# This suite starts a 5-node replica set and runs both the DoReconfigInBackground and
+# ContinuousStepdown hooks with kill: true and background_reconfig: true to periodically run
+# safe reconfigs and send kill signals to the primary.
+# This tests that the concurrency between killing the primary and reconfigs is still
+# ultimately safe.
+
+selector:
+ roots:
+ - jstests/core/**/*.js
+ exclude_files:
+ # Transactions do not support retryability of individual operations.
+ # TODO: Remove this once it is supported (SERVER-33952).
+ - jstests/core/txns/**/*.js
+ # These tests are not expected to pass with replica-sets:
+ - jstests/core/dbadmin.js
+ - jstests/core/opcounters_write_cmd.js
+ - jstests/core/read_after_optime.js
+ - jstests/core/capped_update.js
+ # The set_param1.js test attempts to compare the response from running the {getParameter: "*"}
+ # command multiple times, which may observe the change to the "transactionLifetimeLimitSeconds"
+ # server parameter.
+ - jstests/core/set_param1.js
+
+ # No-op retries are not ignored by top, the profiler, or opcount.
+ - jstests/core/operation_latency_histogram.js
+ - jstests/core/profile2.js
+ - jstests/core/profile3.js
+ - jstests/core/profile_findandmodify.js
+ - jstests/core/top.js
+ - jstests/core/views/views_stats.js
+
+ # TODO SERVER-31249: getLastError should not be affected by no-op retries.
+ - jstests/core/bulk_legacy_enforce_gle.js
+
+ # TODO SERVER-31242: findAndModify no-op retry should respect the fields option.
+ - jstests/core/crud_api.js
+ - jstests/core/find_and_modify.js
+ - jstests/core/find_and_modify2.js
+ - jstests/core/find_and_modify_pipeline_update.js
+ - jstests/core/find_and_modify_server6865.js
+
+ # These tests run commands using legacy queries, which are not supported on sessions.
+ - jstests/core/comment_field.js
+ - jstests/core/exhaust.js
+ - jstests/core/validate_cmd_ns.js
+
+ # Stepdown commands during fsync lock will fail.
+ - jstests/core/currentop.js
+ - jstests/core/fsync.js
+ - jstests/core/killop_drop_collection.js
+
+ # Assert on the ismaster field of an isMaster response. If a primary steps down after accepting
+ # an isMaster command and returns before its connection is closed, the response can contain
+ # ismaster: false.
+ - jstests/core/dbadmin.js
+ - jstests/core/ismaster.js
+
+ # Spawns new mongo shells, which don't retry connecting on stepdown errors.
+ - jstests/core/shell_connection_strings.js
+
+ # Expect drops/creates to fail or have a certain response:
+ - jstests/core/drop.js
+ - jstests/core/dropdb.js
+ - jstests/core/explain_upsert.js
+ - jstests/core/indexes_multiple_commands.js
+
+ # Expect certain responses, but retries of successfully completed commands may return
+ # different values:
+ - jstests/core/create_indexes.js
+ - jstests/core/objid5.js
+
+ # Unacknowledged writes prohibited in an explicit session.
+ - jstests/core/batch_write_command_w0.js
+
+ - jstests/core/bench_test*.js # benchRun() used for writes
+ - jstests/core/benchrun_pipeline_updates.js # benchRun() used for writes
+ - jstests/core/connection_string_validation.js # Does not expect a replica set connection string.
+ - jstests/core/list_collections_filter.js # Temporary collections are dropped on failover.
+ - jstests/core/top.js # Tests read commands (including getMore) against the secondary
+ - jstests/core/drop3.js # getMore is not causally consistent if collection is dropped
+ - jstests/core/validate_cmd_ns.js # Calls _exec() directly, not retryable.
+ - jstests/core/list_collections_filter.js # Temporary collections are dropped on failover.
+ - jstests/core/explain_large_bounds.js # Stepdown can timeout waiting for global lock.
+
+ # Tests that fail for Causal Consistency as they have statements that do not support
+ # non-local read concern.
+ - jstests/core/collation.js
+ # Starts a parallel shell but won't restart it after unclean shutdown.
+ # TODO SERVER-33229: Remove these exclusions
+ - jstests/core/compact_keeps_indexes.js
+ - jstests/core/benchrun_pipeline_updates.js
+ - jstests/core/find_and_modify_concurrent_update.js
+ - jstests/core/shellstartparallel.js
+
+ # Inserts enough data that recovery takes more than 8 seconds, so we never get a working primary.
+ - jstests/core/geo_s2ordering.js
+
+ exclude_with_any_tags:
+ ##
+ # The next four tags correspond to the special errors thrown by the auto_retry_on_network_error.js
+ # override when it refuses to run a certain command. Above each tag are the message(s) that cause
+ # the tag to be warranted.
+ ##
+ # "Refusing to run a test that issues a getMore command since if a network error occurs during
+ # it then we won't know whether the cursor was advanced or not"
+ - requires_getmore
+ # "Refusing to run a test that issues non-retryable write operations since the test likely makes
+ # assertions on the write results and can lead to spurious failures if a network error occurs"
+ - requires_non_retryable_writes
+ # "Refusing to run a test that issues commands that are not blindly retryable"
+ # "Refusing to run a test that issues an aggregation command with $out because it is not
+ # retryable"
+ - requires_non_retryable_commands
+ # "Refusing to run a test that issues commands that may return different values after a failover"
+ # "Refusing to run a test that issues an aggregation command with explain because it may return
+ # incomplete results"
+ # "Refusing to run a test that issues an aggregation command with
+ # $listLocalSessions because it relies on in-memory state that may not survive failovers"
+ # "Refusing to run a test that issues a mapReduce command, because it calls std::terminate() if
+ # interrupted by a stepdown"
+ - does_not_support_stepdowns
+ ##
+ # The next two tags correspond to the special errors thrown by the
+ # set_read_and_write_concerns.js override when it refuses to replace the readConcern or
+ # writeConcern of a particular command. Above each tag are the message(s) that cause the tag to be
+ # warranted.
+ ##
+ # "Cowardly refusing to override read concern of command: ..."
+ - assumes_read_concern_unchanged
+ # "Cowardly refusing to override write concern of command: ..."
+ - assumes_write_concern_unchanged
+ ##
+ # The next three tags correspond to the special errors thrown by the
+ # fail_unclean_shutdown_incompatible_commands.js override when it refuses to run commands that are
+ # inaccurate after an unclean shutdown. Above each tag is the message that causes the tag to be
+ # warranted.
+ ##
+ # "Cowardly fail if fastcount is run with a mongod that had an unclean shutdown: ..."
+ - requires_fastcount
+ # "Cowardly fail if dbStats is run with a mongod that had an unclean shutdown: ..."
+ - requires_dbstats
+ # "Cowardly fail if collStats is run with a mongod that had an unclean shutdown: ..."
+ - requires_collstats
+ # "Cowardly fail if unbounded dataSize is run with a mongod that had an unclean shutdown: ..."
+ - requires_datasize
+ ## The next tag corresponds to long-running operations, as they may exhaust their number
+ # of retries and result in a network error being thrown.
+ - operations_longer_than_stepdown_interval
+
+executor:
+ archive:
+ hooks:
+ - CheckReplDBHash
+ - CheckReplOplogs
+ - ValidateCollections
+ config:
+ shell_options:
+ eval: >-
+ testingReplication = true;
+ load('jstests/libs/override_methods/network_error_and_txn_override.js');
+ db = connect(TestData.connectionString);
+ load("jstests/libs/override_methods/enable_sessions.js");
+ load("jstests/libs/override_methods/set_read_and_write_concerns.js");
+ load("jstests/libs/override_methods/fail_unclean_shutdown_incompatible_commands.js");
+ load("jstests/libs/override_methods/fail_unclean_shutdown_start_parallel_shell.js");
+ global_vars:
+ TestData:
+ alwaysInjectTransactionNumber: true
+ defaultReadConcernLevel: "majority"
+ logRetryAttempts: true
+ networkErrorAndTxnOverrideConfig:
+ retryOnNetworkErrors: true
+ backgroundReconfigs: true
+ overrideRetryAttempts: 3
+ sessionOptions:
+ writeConcern:
+ w: "majority"
+ readConcern:
+ level: "majority"
+ # Force DBClientRS to find the primary for non-write commands.
+ readPreference:
+ mode: "primary"
+ retryWrites: true
+ # We specify nodb so the shell used by each test will attempt to connect after loading the
+ # retry logic in auto_retry_on_network_error.js.
+ nodb: ""
+ readMode: commands
+ hooks:
+ - class: DoReconfigInBackground
+ shell_options:
+ nodb: ""
+ - class: ContinuousStepdown
+ kill: true
+ background_reconfig: true
+ # The CheckReplDBHash hook waits until all operations have replicated to and have been applied
+ # on the secondaries, so we run the ValidateCollections hook after it to ensure we're
+ # validating the entire contents of the collection.
+ - class: CheckReplOplogs
+ - class: CheckReplDBHash
+ - class: ValidateCollections
+ - class: CleanEveryN
+ n: 20
+ fixture:
+ class: ReplicaSetFixture
+ mongod_options:
+ enableMajorityReadConcern: ''
+ set_parameters:
+ enableTestCommands: 1
+ logComponentVerbosity:
+ replication:
+ heartbeats: 2
+ all_nodes_electable: true
+ num_nodes: 5
diff --git a/buildscripts/resmokelib/testing/fixtures/replicaset.py b/buildscripts/resmokelib/testing/fixtures/replicaset.py
index fb8e90b5296..5de9996de1c 100644
--- a/buildscripts/resmokelib/testing/fixtures/replicaset.py
+++ b/buildscripts/resmokelib/testing/fixtures/replicaset.py
@@ -187,12 +187,10 @@ class ReplicaSetFixture(interface.ReplFixture): # pylint: disable=too-many-inst
replset_settings = self.replset_config_options["settings"]
repl_config["settings"] = replset_settings
- # If not all nodes are electable and no election timeout was specified, then we increase
- # the election timeout to 24 hours to prevent spurious elections.
- if not self.all_nodes_electable:
- repl_config.setdefault("settings", {})
- if "electionTimeoutMillis" not in repl_config["settings"]:
- repl_config["settings"]["electionTimeoutMillis"] = 24 * 60 * 60 * 1000
+ # Increase the election timeout to 24 hours to prevent spurious elections.
+ repl_config.setdefault("settings", {})
+ if "electionTimeoutMillis" not in repl_config["settings"]:
+ repl_config["settings"]["electionTimeoutMillis"] = 24 * 60 * 60 * 1000
# Start up a single node replica set then reconfigure to the correct size (if the config
# contains more than 1 node), so the primary is elected more quickly.
@@ -500,6 +498,16 @@ class ReplicaSetFixture(interface.ReplFixture): # pylint: disable=too-many-inst
primary = self.get_primary()
return [node for node in self.nodes if node.port != primary.port]
+ def get_voting_members(self):
+ """Return the number of voting nodes in the replica set."""
+ primary = self.get_primary()
+ client = primary.mongo_client()
+
+ members = client.admin.command({"replSetGetConfig": 1})['config']['members']
+ voting_members = [member['host'] for member in members if member['votes'] == 1]
+
+ return voting_members
+
def get_initial_sync_node(self):
"""Return initial sync node from the replica set."""
return self.initial_sync_node
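
The new get_voting_members() helper returns the host strings of all members whose votes field is 1; the stepdown hook change below uses its length to decide whether it is safe to kill the primary. A rough pymongo equivalent, for reference (the connection address is a placeholder, not taken from the patch):

    from pymongo import MongoClient

    # Placeholder address; in the fixture this is the current primary's port.
    client = MongoClient("localhost:20000")
    members = client.admin.command("replSetGetConfig")["config"]["members"]
    voting_members = [m["host"] for m in members if m["votes"] == 1]
    print(voting_members)  # e.g. ["localhost:20000", "localhost:20001", ...]
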
diff --git a/buildscripts/resmokelib/testing/hooks/reconfig_background.py b/buildscripts/resmokelib/testing/hooks/reconfig_background.py
new file mode 100644
index 00000000000..dbf9b33a242
--- /dev/null
+++ b/buildscripts/resmokelib/testing/hooks/reconfig_background.py
@@ -0,0 +1,71 @@
+"""Test hook for running safe reconfigs against the primary of a replica set.
+
+This hook runs continuously in a background thread while the test is running.
+"""
+
+import os.path
+
+from buildscripts.resmokelib import errors
+from buildscripts.resmokelib.testing.hooks import jsfile
+from buildscripts.resmokelib.testing.testcases import interface as testcase
+from buildscripts.resmokelib.testing.hooks.background_job import _BackgroundJob, _ContinuousDynamicJSTestCase
+
+
+class DoReconfigInBackground(jsfile.JSHook):
+ """A hook for running a safe reconfig against a replica set while a test is running."""
+
+ def __init__(self, hook_logger, fixture, shell_options=None):
+ """Initialize DoReconfigInBackground."""
+ description = "Run reconfigs against the primary while the test is running."
+ js_filename = os.path.join("jstests", "hooks", "run_reconfig_background.js")
+ jsfile.JSHook.__init__(self, hook_logger, fixture, js_filename, description,
+ shell_options=shell_options)
+
+ self._background_job = None
+
+ def before_suite(self, test_report):
+ """Start the background thread."""
+ self._background_job = _BackgroundJob("ReconfigInBackground")
+ self.logger.info("Starting the background reconfig thread.")
+ self._background_job.start()
+
+ def after_suite(self, test_report):
+ """Signal the background thread to exit, and wait until it does."""
+ if self._background_job is None:
+ return
+
+ self.logger.info("Stopping the background reconfig thread.")
+ self._background_job.stop()
+
+ def before_test(self, test, test_report):
+ """Instruct the background thread to run reconfigs while 'test' is also running."""
+ if self._background_job is None:
+ return
+
+ hook_test_case = _ContinuousDynamicJSTestCase.create_before_test(
+ self.logger.test_case_logger, test, self, self._js_filename, self._shell_options)
+ hook_test_case.configure(self.fixture)
+
+ self.logger.info("Resuming the background reconfig thread.")
+ self._background_job.resume(hook_test_case, test_report)
+
+ def after_test(self, test, test_report): # noqa: D205,D400
+ """Instruct the background thread to stop running reconfigs now that 'test' has
+ finished running.
+ """
+ if self._background_job is None:
+ return
+
+ self.logger.info("Pausing the background reconfig thread.")
+ self._background_job.pause()
+
+ if self._background_job.exc_info is not None:
+ if isinstance(self._background_job.exc_info[1], errors.TestFailure):
+ # If the mongo shell process running the JavaScript file exited with a non-zero
+ # return code, then we raise an errors.ServerFailure exception to cause resmoke.py's
+ # test execution to stop.
+ raise errors.ServerFailure(self._background_job.exc_info[1].args[0])
+ else:
+ self.logger.error("Encountered an error inside the background reconfig thread.",
+ exc_info=self._background_job.exc_info)
+ raise self._background_job.exc_info[1]
diff --git a/buildscripts/resmokelib/testing/hooks/stepdown.py b/buildscripts/resmokelib/testing/hooks/stepdown.py
index de2c89e9a38..4cfd09fd52d 100644
--- a/buildscripts/resmokelib/testing/hooks/stepdown.py
+++ b/buildscripts/resmokelib/testing/hooks/stepdown.py
@@ -27,7 +27,7 @@ class ContinuousStepdown(interface.Hook): # pylint: disable=too-many-instance-a
self, hook_logger, fixture, config_stepdown=True, shard_stepdown=True,
stepdown_interval_ms=8000, terminate=False, kill=False,
use_stepdown_permitted_file=False, wait_for_mongos_retarget=False,
- stepdown_via_heartbeats=True):
+ stepdown_via_heartbeats=True, background_reconfig=False):
"""Initialize the ContinuousStepdown.
Args:
@@ -64,6 +64,8 @@ class ContinuousStepdown(interface.Hook): # pylint: disable=too-many-instance-a
self._terminate = terminate or kill
self._kill = kill
+ self._background_reconfig = background_reconfig
+
# The stepdown file names need to match the same construction as found in
# jstests/concurrency/fsm_libs/resmoke_runner.js.
dbpath_prefix = fixture.get_dbpath_prefix()
@@ -87,7 +89,7 @@ class ContinuousStepdown(interface.Hook): # pylint: disable=too-many-instance-a
self._stepdown_thread = _StepdownThread(
self.logger, self._mongos_fixtures, self._rs_fixtures, self._stepdown_interval_secs,
self._terminate, self._kill, lifecycle, self._wait_for_mongos_retarget,
- self._stepdown_via_heartbeats)
+ self._stepdown_via_heartbeats, self._background_reconfig)
self.logger.info("Starting the stepdown thread.")
self._stepdown_thread.start()
@@ -348,7 +350,8 @@ class FileBasedStepdownLifecycle(object):
class _StepdownThread(threading.Thread): # pylint: disable=too-many-instance-attributes
def __init__( # pylint: disable=too-many-arguments
self, logger, mongos_fixtures, rs_fixtures, stepdown_interval_secs, terminate, kill,
- stepdown_lifecycle, wait_for_mongos_retarget, stepdown_via_heartbeats):
+ stepdown_lifecycle, wait_for_mongos_retarget, stepdown_via_heartbeats,
+ background_reconfig):
"""Initialize _StepdownThread."""
threading.Thread.__init__(self, name="StepdownThread")
self.daemon = True
@@ -365,6 +368,7 @@ class _StepdownThread(threading.Thread): # pylint: disable=too-many-instance-at
self.__lifecycle = stepdown_lifecycle
self._should_wait_for_mongos_retarget = wait_for_mongos_retarget
self._stepdown_via_heartbeats = stepdown_via_heartbeats
+ self._background_reconfig = background_reconfig
self._last_exec = time.time()
# Event set when the thread has been stopped using the 'stop()' method.
@@ -474,6 +478,22 @@ class _StepdownThread(threading.Thread): # pylint: disable=too-many-instance-at
rs_fixture.replset_name))
if self._terminate:
+ # If we're running with background reconfigs, it's possible to be in a scenario
+ # where we would kill a necessary voting node (e.g. in a 5-node replica set where
+ # only 2 members are voting). In this scenario, we want to avoid killing the
+ # primary because no secondary can step up.
+ if self._background_reconfig:
+ # stagger the kill thread so that it runs a little after the reconfig thread
+ time.sleep(1)
+ voting_members = rs_fixture.get_voting_members()
+
+ self.logger.info("Current voting members: %s", voting_members)
+
+ if len(voting_members) <= 3:
+ # Do not kill or terminate the primary if we don't have enough voting nodes to
+ # elect a new primary.
+ return
+
should_kill = self._kill and random.choice([True, False])
action = "Killing" if should_kill else "Terminating"
self.logger.info("%s the primary on port %d of replica set '%s'.", action, primary.port,
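
Put together, the guard added above makes the kill/terminate path conditional on the voting membership that the background reconfig thread may have just changed. A simplified Python sketch of the decision flow (names mirror the hook, but this is an illustration, not the hook itself):

    import random
    import time

    def choose_primary_action(rs_fixture, kill, background_reconfig, logger):
        """Return 'kill', 'terminate', or None (skip) for the current primary."""
        if background_reconfig:
            time.sleep(1)  # let the reconfig thread act first, as in the patch
            voting_members = rs_fixture.get_voting_members()
            logger.info("Current voting members: %s", voting_members)
            if len(voting_members) <= 3:
                # Too few voters would remain to elect a new primary; skip this round.
                return None
        should_kill = kill and random.choice([True, False])
        return "kill" if should_kill else "terminate"
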
diff --git a/etc/evergreen.yml b/etc/evergreen.yml
index 48f75418a26..fc22975e4b9 100644
--- a/etc/evergreen.yml
+++ b/etc/evergreen.yml
@@ -6088,6 +6088,36 @@ tasks:
vars:
resmoke_args: --suites=replica_sets_jscore_passthrough --storageEngine=wiredTiger
+- <<: *task_template
+ name: replica_sets_reconfig_jscore_passthrough
+ depends_on:
+ - name: jsCore
+ commands:
+ - func: "do setup"
+ - func: "run tests"
+ vars:
+ resmoke_args: --suites=replica_sets_reconfig_jscore_passthrough --storageEngine=wiredTiger
+
+- <<: *task_template
+ name: replica_sets_reconfig_jscore_stepdown_passthrough
+ depends_on:
+ - name: jsCore
+ commands:
+ - func: "do setup"
+ - func: "run tests"
+ vars:
+ resmoke_args: --suites=replica_sets_reconfig_jscore_stepdown_passthrough --storageEngine=wiredTiger
+
+- <<: *task_template
+ name: replica_sets_reconfig_kill_primary_jscore_passthrough
+ depends_on:
+ - name: jsCore
+ commands:
+ - func: "do setup"
+ - func: "run tests"
+ vars:
+ resmoke_args: --suites=replica_sets_reconfig_kill_primary_jscore_passthrough --storageEngine=wiredTiger
+
- name: replica_sets_jscore_passthrough_gen
depends_on:
- name: jsCore
diff --git a/jstests/hooks/run_reconfig_background.js b/jstests/hooks/run_reconfig_background.js
new file mode 100644
index 00000000000..4717a8d4710
--- /dev/null
+++ b/jstests/hooks/run_reconfig_background.js
@@ -0,0 +1,147 @@
+/**
+ * This hook runs the reconfig command against the primary of a replica set:
+ * The reconfig command first chooses a random node (not the primary) and will change
+ * its votes and priority to 0 or 1 depending on the current value.
+ *
+ * This hook will run concurrently with tests.
+ */
+
+'use strict';
+
+(function() {
+load('jstests/libs/discover_topology.js'); // For Topology and DiscoverTopology.
+load('jstests/libs/parallelTester.js'); // For Thread.
+
+/**
+ * Returns true if the error code is transient.
+ */
+function isIgnorableError(codeName) {
+ if (codeName === "ConfigurationInProgress" || codeName === "NotMaster" ||
+ codeName === "InterruptedDueToReplStateChange" || codeName === "PrimarySteppedDown" ||
+ codeName === "NodeNotFound" || codeName === "ShutdownInProgress") {
+ return true;
+ }
+ return false;
+}
+
+/**
+ * Runs the reconfig command against the primary of a replica set.
+ *
+ * The reconfig command randomly chooses a node to change its votes and priority to 0 or 1
+ * based on what the node's current votes and priority fields are. We always check that
+ * at least two voting nodes exist in the set, which ensures that we can always have a
+ * primary in the case of stepdowns.
+ * We also want to avoid changing the votes and priority of the current primary to 0, since this
+ * will result in an error.
+ *
+ * The number of voting nodes in the replica set determines what the config majority is for both
+ * reconfig config commitment and reconfig oplog commitment.
+ *
+ * This function should not throw if everything is working properly.
+ */
+function reconfigBackground(primary, numNodes) {
+ Random.setRandomSeed();
+ // Calls 'func' with the print() function overridden to be a no-op.
+ const quietly = (func) => {
+ const printOriginal = print;
+ try {
+ print = Function.prototype;
+ func();
+ } finally {
+ print = printOriginal;
+ }
+ };
+
+ // The stepdown and kill primary hooks run concurrently with this reconfig hook. It is
+ // possible that the topology will not be properly updated in time, meaning that the
+ // current primary can be undefined if a secondary has not stepped up soon enough.
+ if (primary === undefined) {
+ jsTestLog("Skipping reconfig because we do not have a primary yet.");
+ return {ok: 1};
+ }
+
+ jsTestLog("primary is " + primary);
+
+ // Suppress the log messages generated establishing new mongo connections. The
+ // run_reconfig_background.js hook is executed frequently by resmoke.py and
+ // could lead to generating an overwhelming amount of log messages.
+ let conn;
+ quietly(() => {
+ conn = new Mongo(primary);
+ });
+ assert.neq(
+ null, conn, "Failed to connect to primary '" + primary + "' for background reconfigs");
+
+ var config = assert.commandWorked(conn.getDB("admin").runCommand({replSetGetConfig: 1})).config;
+
+ // Find the correct host in the member config
+ const primaryHostIndex = (cfg, pHost) => cfg.members.findIndex(m => m.host === pHost);
+ const primaryIndex = primaryHostIndex(config, primary);
+ jsTestLog("primaryIndex is " + primaryIndex);
+
+ // Calculate the total number of voting nodes in this set so that we make sure we
+ // always have at least two voting nodes. This is so that the primary can always
+ // safely step down because there is at least one other electable secondary.
+ const numVotingNodes = config.members.filter(member => member.votes === 1).length;
+
+ // Randomly change the vote of a node to 1 or 0 depending on its current value. Do not
+ // change the primary's votes.
+ var indexToChange = primaryIndex;
+ while (indexToChange === primaryIndex) {
+ // randInt is exclusive of the upper bound.
+ indexToChange = Random.randInt(numNodes);
+ }
+
+ jsTestLog("Running reconfig to change votes of node at index" + indexToChange);
+
+ // Change the priority to correspond to the votes. If the member's current votes field
+ // is 1, only change it to 0 if there are more than 3 voting members in this set.
+ // We want to ensure that there are at least 3 voting nodes so that killing the primary
+ // will not affect a majority.
+ config.version++;
+ config.members[indexToChange].votes =
+ (config.members[indexToChange].votes === 1 && numVotingNodes > 3) ? 0 : 1;
+ config.members[indexToChange].priority = config.members[indexToChange].votes;
+
+ let votingRes = conn.getDB("admin").runCommand({replSetReconfig: config});
+ if (!votingRes.ok && !isIgnorableError(votingRes.codeName)) {
+ jsTestLog("Reconfig to change votes FAILED.");
+ return votingRes;
+ }
+
+ return {ok: 1};
+}
+
+// It is possible that the primary will be killed before actually running the reconfig
+// command. If we fail with a network error, ignore it.
+let res;
+try {
+ const conn = connect(TestData.connectionString);
+ const topology = DiscoverTopology.findConnectedNodes(conn.getMongo());
+
+ if (topology.type !== Topology.kReplicaSet) {
+ throw new Error('Unsupported topology configuration: ' + tojson(topology));
+ }
+
+ const numNodes = topology.nodes.length;
+ res = reconfigBackground(topology.primary, numNodes);
+} catch (e) {
+ // If the ReplicaSetMonitor cannot find a primary because it has stepped down or
+ // been killed, it may take longer than 15 seconds for a new primary to step up.
+ // Ignore this error until we find a new primary.
+ const kReplicaSetMonitorError =
+ /^Could not find host matching read preference.*mode: "primary"/;
+
+ if (isNetworkError(e)) {
+ jsTestLog("Ignoring network error" + tojson(e));
+ } else if (e.message.match(kReplicaSetMonitorError)) {
+ jsTestLog("Ignoring read preference primary error" + tojson(e));
+ } else {
+ throw e;
+ }
+
+ res = {ok: 1};
+}
+
+assert.commandWorked(res, "reconfig hook failed: " + tojson(res));
+})();
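
The core of the hook is the vote-flip rule: a member's votes (and matching priority) is dropped to 0 only while more than three members are voting, so at least three voters remain and a majority can survive a concurrent stepdown or primary kill. The same rule, restated as a small Python check (illustration only; the hook itself is the JavaScript above):

    def next_votes(current_votes, num_voting_nodes):
        # Demote a voter only while more than 3 members vote; otherwise (re)grant the vote.
        if current_votes == 1 and num_voting_nodes > 3:
            return 0
        return 1

    assert next_votes(1, 5) == 0   # safe to demote: four voters remain
    assert next_votes(1, 3) == 1   # keep the vote: only three voters in the set
    assert next_votes(0, 3) == 1   # promote a non-voter back to voting
    # The hook then sets members[i].priority = members[i].votes and bumps config.version.
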
diff --git a/jstests/libs/override_methods/network_error_and_txn_override.js b/jstests/libs/override_methods/network_error_and_txn_override.js
index 798d9bf7fda..b08b6d0b233 100644
--- a/jstests/libs/override_methods/network_error_and_txn_override.js
+++ b/jstests/libs/override_methods/network_error_and_txn_override.js
@@ -64,6 +64,11 @@ function configuredForTxnOverride() {
return TestData.networkErrorAndTxnOverrideConfig.wrapCRUDinTransactions;
}
+function configuredForBackgroundReconfigs() {
+ assert(TestData.networkErrorAndTxnOverrideConfig, TestData);
+ return TestData.networkErrorAndTxnOverrideConfig.backgroundReconfigs;
+}
+
// Commands assumed to not be blindly retryable.
const kNonRetryableCommands = new Set([
// Commands that take write concern and do not support txnNumbers.
@@ -128,6 +133,15 @@ const kAcceptableNonRetryableCommands = new Set([
"moveChunk",
]);
+// The following read operations defined in the CRUD specification are retryable.
+// Note that estimatedDocumentCount() and countDocuments() use the count command.
+const kRetryableReadCommands = new Set(["find", "aggregate", "distinct", "count"]);
+
+// Returns true if the command name is that of a retryable read command.
+function isRetryableReadCmdName(cmdName) {
+ return kRetryableReadCommands.has(cmdName);
+}
+
// Returns if the given failed response is a safe response to ignore when retrying the
// given command type.
function isAcceptableRetryFailedResponse(cmdName, res) {
@@ -186,6 +200,20 @@ function canRetryNetworkErrorForCommand(cmdName, cmdObj) {
return true;
}
+// Returns if the given command should retry a read error when reconfigs are present.
+function canRetryReadErrorDuringBackgroundReconfig(cmdName) {
+ if (!configuredForBackgroundReconfigs()) {
+ return false;
+ }
+ return isRetryableReadCmdName(cmdName);
+}
+
+// When running the reconfig command on a node, it will drop its snapshot. Read commands issued
+// to this node before it updates its snapshot will fail with ReadConcernMajorityNotAvailableYet.
+function isRetryableReadCode(code) {
+ return code === ErrorCodes.ReadConcernMajorityNotAvailableYet;
+}
+
// Several commands that use the plan executor swallow the actual error code from a failed plan
// into their error message and instead return OperationFailed.
//
@@ -905,6 +933,19 @@ function shouldRetryWithNetworkErrorOverride(
return res;
}
+function shouldRetryForBackgroundReconfigOverride(res, cmdName, logError) {
+ assert(configuredForBackgroundReconfigs());
+ // Background reconfigs can interfere with read commands if they are using readConcern: majority
+ // and readPreference: primary. If we're running a read command and it fails with
+ // ReadConcernMajorityNotAvailableYet, retry because it should eventually succeed.
+ if (isRetryableReadCmdName(cmdName) && isRetryableReadCode(res.code)) {
+ logError("Retrying read command after 100ms because of background reconfigs");
+ sleep(100);
+ return kContinue;
+ }
+ return res;
+}
+
// Processes exceptions if configured for txn override. Retries the entire transaction on
// transient transaction errors or network errors if configured for network errors as well.
// If a retry fails, returns the response, or returns null for further exception processing.
@@ -990,6 +1031,7 @@ function runCommandOverrideBody(conn, dbName, cmdName, cmdObj, lsid, clientFunct
}
const canRetryNetworkError = canRetryNetworkErrorForCommand(cmdName, cmdObj);
+ const canRetryReadError = canRetryReadErrorDuringBackgroundReconfig(cmdName);
let numNetworkErrorRetries = canRetryNetworkError ? kMaxNumRetries : 0;
do {
try {
@@ -1020,6 +1062,16 @@ function runCommandOverrideBody(conn, dbName, cmdName, cmdObj, lsid, clientFunct
}
}
+ if (canRetryReadError) {
+ const readRetryRes =
+ shouldRetryForBackgroundReconfigOverride(res, cmdName, logError);
+ if (readRetryRes === kContinue) {
+ continue;
+ } else {
+ res = readRetryRes;
+ }
+ }
+
return res;
} catch (e) {
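
Conceptually, the override now treats ReadConcernMajorityNotAvailableYet on find/aggregate/distinct/count the same way it treats a transient failure: back off briefly and reissue the command. A small Python sketch of that loop (the retry cap and helper names are illustrative, not taken from the override):

    import time

    RETRYABLE_READ_COMMANDS = {"find", "aggregate", "distinct", "count"}

    def run_with_reconfig_read_retries(run_command, cmd_name, max_attempts=5, backoff_secs=0.1):
        """Reissue a retryable read that fails with ReadConcernMajorityNotAvailableYet."""
        for attempt in range(max_attempts):
            res = run_command()
            retryable = (cmd_name in RETRYABLE_READ_COMMANDS and res.get("ok") == 0 and
                         res.get("codeName") == "ReadConcernMajorityNotAvailableYet")
            if retryable and attempt + 1 < max_attempts:
                time.sleep(backoff_secs)  # the JS override sleeps 100ms before retrying
                continue
            return res
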
diff --git a/jstests/replsets/txn_override_unittests.js b/jstests/replsets/txn_override_unittests.js
index 8fe114b7789..8bab3f5b167 100644
--- a/jstests/replsets/txn_override_unittests.js
+++ b/jstests/replsets/txn_override_unittests.js
@@ -1896,6 +1896,168 @@ const txnOverridePlusRetryOnNetworkErrorTestsFcv42 = [
}
];
+const retryOnReadErrorsFromBackgroundReconfigTest = [
+ {
+ name: "find retries on ReadConcernMajorityNotAvailableYet",
+ test: function() {
+ assert.commandWorked(testDB.createCollection(collName1));
+ assert.commandWorked(coll1.insert({_id: 1}));
+ failCommandWithFailPoint(["find"],
+ {errorCode: ErrorCodes.ReadConcernMajorityNotAvailableYet});
+ assert.eq(coll1.findOne({_id: 1}), {_id: 1});
+ }
+ },
+ {
+ name: "aggregate retries on ReadConcernMajorityNotAvailableYet",
+ test: function() {
+ assert.commandWorked(testDB.createCollection(collName1));
+ assert.commandWorked(coll1.insert({a: 1}));
+ assert.commandWorked(coll1.insert({a: 1}));
+ assert.commandWorked(coll1.insert({a: 2}));
+ failCommandWithFailPoint(["aggregate"],
+ {errorCode: ErrorCodes.ReadConcernMajorityNotAvailableYet});
+ const cursor = coll1.aggregate([{$match: {a: 1}}]);
+ assert.eq(cursor.toArray().length, 2);
+ }
+ },
+ {
+ name: "distinct retries on ReadConcernMajorityNotAvailableYet",
+ test: function() {
+ assert.commandWorked(testDB.createCollection(collName1));
+ assert.commandWorked(coll1.insert({a: 1}));
+ assert.commandWorked(coll1.insert({a: 1}));
+ assert.commandWorked(coll1.insert({a: 2}));
+ failCommandWithFailPoint(["distinct"],
+ {errorCode: ErrorCodes.ReadConcernMajorityNotAvailableYet});
+ assert.eq(coll1.distinct("a").sort(), [1, 2]);
+ }
+ },
+ {
+ name: "count retries on ReadConcernMajorityNotAvailableYet",
+ test: function() {
+ assert.commandWorked(testDB.createCollection(collName1));
+ assert.commandWorked(coll1.insert({a: 1}));
+ assert.commandWorked(coll1.insert({a: 1}));
+ assert.commandWorked(coll1.insert({a: 2}));
+ failCommandWithFailPoint(["count"],
+ {errorCode: ErrorCodes.ReadConcernMajorityNotAvailableYet});
+ assert.eq(coll1.count({a: 1}), 2);
+ }
+ },
+];
+
+const retryReadsOnNetworkErrorsWithNetworkRetryAndBackgroundReconfigTest = [
+ {
+ name: "find retries on network errors",
+ test: function() {
+ assert.commandWorked(testDB.createCollection(collName1));
+ assert.commandWorked(coll1.insert({_id: 1}));
+ failCommandWithFailPoint(["find"], {closeConnection: true});
+ assert.eq(coll1.findOne({_id: 1}), {_id: 1});
+ }
+ },
+ {
+ name: "aggregate retries on network errors",
+ test: function() {
+ assert.commandWorked(testDB.createCollection(collName1));
+ assert.commandWorked(coll1.insert({a: 1}));
+ assert.commandWorked(coll1.insert({a: 1}));
+ assert.commandWorked(coll1.insert({a: 2}));
+ failCommandWithFailPoint(["aggregate"], {closeConnection: true});
+ const cursor = coll1.aggregate([{$match: {a: 1}}]);
+ assert.eq(cursor.toArray().length, 2);
+ }
+ },
+ {
+ name: "distinct retries on network errors",
+ test: function() {
+ assert.commandWorked(testDB.createCollection(collName1));
+ assert.commandWorked(coll1.insert({a: 1}));
+ assert.commandWorked(coll1.insert({a: 1}));
+ assert.commandWorked(coll1.insert({a: 2}));
+ failCommandWithFailPoint(["distinct"], {closeConnection: true});
+ assert.eq(coll1.distinct("a").sort(), [1, 2]);
+ }
+ },
+ {
+ name: "count retries on network errors",
+ test: function() {
+ assert.commandWorked(testDB.createCollection(collName1));
+ assert.commandWorked(coll1.insert({a: 1}));
+ assert.commandWorked(coll1.insert({a: 1}));
+ assert.commandWorked(coll1.insert({a: 2}));
+ failCommandWithFailPoint(["count"], {closeConnection: true});
+ assert.eq(coll1.count({a: 1}), 2);
+ }
+ },
+];
+
+const doNotRetryReadErrorWithOutBackgroundReconfigTest = [
+ {
+ name: "find fails on ReadConcernMajorityNotAvailableYet",
+ test: function() {
+ assert.commandWorked(testDB.createCollection(collName1));
+ assert.commandWorked(coll1.insert({_id: 1}));
+ failCommandWithFailPoint(["find"],
+ {errorCode: ErrorCodes.ReadConcernMajorityNotAvailableYet});
+ assert.commandFailedWithCode(
+ assert.throws(function() {
+ coll1.findOne({_id: 1});
+ }),
+ ErrorCodes.ReadConcernMajorityNotAvailableYet);
+ }
+ },
+ {
+ name: "aggregate fails on ReadConcernMajorityNotAvailableYet",
+ test: function() {
+ assert.commandWorked(testDB.createCollection(collName1));
+ assert.commandWorked(coll1.insert({a: 1}));
+ assert.commandWorked(coll1.insert({a: 1}));
+ assert.commandWorked(coll1.insert({a: 2}));
+ failCommandWithFailPoint(["aggregate"],
+ {errorCode: ErrorCodes.ReadConcernMajorityNotAvailableYet});
+ assert.commandFailedWithCode(
+ assert.throws(function() {
+ const cursor = coll1.aggregate([{$match: {a: 1}}]);
+ assert.eq(cursor.toArray().length, 2);
+ }),
+ ErrorCodes.ReadConcernMajorityNotAvailableYet);
+ }
+ },
+ {
+ name: "distinct fails on ReadConcernMajorityNotAvailableYet",
+ test: function() {
+ assert.commandWorked(testDB.createCollection(collName1));
+ assert.commandWorked(coll1.insert({a: 1}));
+ assert.commandWorked(coll1.insert({a: 1}));
+ assert.commandWorked(coll1.insert({a: 2}));
+ failCommandWithFailPoint(["distinct"],
+ {errorCode: ErrorCodes.ReadConcernMajorityNotAvailableYet});
+ assert.commandFailedWithCode(
+ assert.throws(function() {
+ coll1.distinct("a");
+ }),
+ ErrorCodes.ReadConcernMajorityNotAvailableYet);
+ }
+ },
+ {
+ name: "count fails on ReadConcernMajorityNotAvailableYet",
+ test: function() {
+ assert.commandWorked(testDB.createCollection(collName1));
+ assert.commandWorked(coll1.insert({a: 1}));
+ assert.commandWorked(coll1.insert({a: 1}));
+ assert.commandWorked(coll1.insert({a: 2}));
+ failCommandWithFailPoint(["count"],
+ {errorCode: ErrorCodes.ReadConcernMajorityNotAvailableYet});
+ assert.commandFailedWithCode(
+ assert.throws(function() {
+ coll1.count({a: 1});
+ }),
+ ErrorCodes.ReadConcernMajorityNotAvailableYet);
+ }
+ },
+];
+
TestData.networkErrorAndTxnOverrideConfig = {};
TestData.sessionOptions = new SessionOptions();
TestData.overrideRetryAttempts = 3;
@@ -1912,6 +2074,7 @@ jsTestLog("=-=-=-=-=-= Testing with 'retry on network error' by itself. =-=-=-=-
TestData.sessionOptions = new SessionOptions({retryWrites: true});
TestData.networkErrorAndTxnOverrideConfig.retryOnNetworkErrors = true;
TestData.networkErrorAndTxnOverrideConfig.wrapCRUDinTransactions = false;
+TestData.networkErrorAndTxnOverrideConfig.backgroundReconfigs = false;
session = conn.startSession(TestData.sessionOptions);
testDB = session.getDatabase(dbName);
@@ -1924,6 +2087,7 @@ jsTestLog("=-=-=-=-=-= Testing with 'txn override' by itself. =-=-=-=-=-=");
TestData.sessionOptions = new SessionOptions({retryWrites: false});
TestData.networkErrorAndTxnOverrideConfig.retryOnNetworkErrors = false;
TestData.networkErrorAndTxnOverrideConfig.wrapCRUDinTransactions = true;
+TestData.networkErrorAndTxnOverrideConfig.backgroundReconfigs = false;
session = conn.startSession(TestData.sessionOptions);
testDB = session.getDatabase(dbName);
@@ -1939,6 +2103,7 @@ jsTestLog("=-=-=-=-=-= Testing 'both txn override and retry on network error'. =
TestData.sessionOptions = new SessionOptions({retryWrites: true});
TestData.networkErrorAndTxnOverrideConfig.retryOnNetworkErrors = true;
TestData.networkErrorAndTxnOverrideConfig.wrapCRUDinTransactions = true;
+TestData.networkErrorAndTxnOverrideConfig.backgroundReconfigs = false;
session = conn.startSession(TestData.sessionOptions);
testDB = session.getDatabase(dbName);
@@ -1952,5 +2117,50 @@ if (usingFcv42) {
(testCase) => runTest("txnOverridePlusRetryOnNetworkErrorTestsFcv42", testCase));
}
+jsTestLog("=-=-=-=-=-= Testing 'retry on read errors from background reconfigs'. =-=-=-=-=-=");
+TestData.sessionOptions = new SessionOptions({retryWrites: false});
+TestData.networkErrorAndTxnOverrideConfig.retryOnNetworkErrors = false;
+TestData.networkErrorAndTxnOverrideConfig.backgroundReconfigs = true;
+TestData.networkErrorAndTxnOverrideConfig.wrapCRUDinTransactions = false;
+
+session = conn.startSession(TestData.sessionOptions);
+testDB = session.getDatabase(dbName);
+coll1 = testDB[collName1];
+coll2 = testDB[collName2];
+
+retryOnReadErrorsFromBackgroundReconfigTest.forEach(
+ (testCase) => runTest("retryOnReadErrorsFromBackgroundReconfigTest", testCase));
+
+jsTestLog(
+ "=-=-=-=-=-= Testing 'retry on network errors during network error retry and background reconfigs'. =-=-=-=-=-=");
+TestData.sessionOptions = new SessionOptions({retryWrites: true});
+TestData.networkErrorAndTxnOverrideConfig.retryOnNetworkErrors = true;
+TestData.networkErrorAndTxnOverrideConfig.backgroundReconfigs = true;
+TestData.networkErrorAndTxnOverrideConfig.wrapCRUDinTransactions = false;
+
+session = conn.startSession(TestData.sessionOptions);
+testDB = session.getDatabase(dbName);
+coll1 = testDB[collName1];
+coll2 = testDB[collName2];
+
+retryReadsOnNetworkErrorsWithNetworkRetryAndBackgroundReconfigTest.forEach(
+ (testCase) =>
+ runTest("retryReadsOnNetworkErrorsWithNetworkRetryAndBackgroundReconfigTest", testCase));
+
+jsTestLog(
+ "=-=-=-=-=-= Testing 'don't retry on network errors during background reconfigs'. =-=-=-=-=-=");
+TestData.sessionOptions = new SessionOptions({retryWrites: true});
+TestData.networkErrorAndTxnOverrideConfig.retryOnNetworkErrors = true;
+TestData.networkErrorAndTxnOverrideConfig.backgroundReconfigs = false;
+TestData.networkErrorAndTxnOverrideConfig.wrapCRUDinTransactions = false;
+
+session = conn.startSession(TestData.sessionOptions);
+testDB = session.getDatabase(dbName);
+coll1 = testDB[collName1];
+coll2 = testDB[collName2];
+
+doNotRetryReadErrorWithOutBackgroundReconfigTest.forEach(
+ (testCase) => runTest("doNotRetryReadErrorWithOutBackgroundReconfigTest", testCase));
+
rst.stopSet();
})();