author     Pavi Vetriselvan <pvselvan@umich.edu>  2020-03-16 11:13:36 -0400
committer  Evergreen Agent <no-reply@evergreen.mongodb.com>  2020-05-14 12:42:24 +0000
commit     f4528563033d933ca920b3e4b2a5e3344e198a5c
tree       8c20856b344e02483dceb1e13f35533e41db3ecd
parent     cd9fbb56900343e7b1193922a2c4b197895e7f56
SERVER-45094 add disabled replica set reconfig passthroughs
SERVER-45094 add retryable read logic to network_error_and_txn_override.js
(cherry picked from commit f59f63db6c37c0d4657b57d559c95d830b0e34c2)
SERVER-45094 add replica_sets_reconfig_jscore_passthrough suite
(cherry picked from commit 4d91fac171cbe3f2af53d9258965399e648a1947)
SERVER-45094 use w:1 writes and remove causal consistency in reconfig passthrough
(cherry picked from commit a43cb23defc6182d08a7814e4731ef98f2d30b6a)
SERVER-45094 add replica_sets_reconfig_jscore_stepdown_passthrough
(cherry picked from commit 81e0ad27c280c02a49beb65ff4473d5dce62b089)
SERVER-45094 add replica_sets_reconfig_kill_primary_jscore_passthrough
(cherry picked from commit 2debab7987b24bf902f9a128654ce928441c29a2)
SERVER-47678 stepdown and kill primary reconfig passthroughs should ignore ReplicaSetMonitorErrors
(cherry picked from commit 91672e58f1169c7edd684b911f20f62b8a71f8d1)
SERVER-47544 always increase election timeout to 24 hours in passthrough suites
(cherry picked from commit 81d53a715f49827a9f2538d4572f9b01f2b12887)
10 files changed, 1043 insertions(+), 9 deletions(-)
diff --git a/buildscripts/resmokeconfig/suites/replica_sets_reconfig_jscore_passthrough.yml b/buildscripts/resmokeconfig/suites/replica_sets_reconfig_jscore_passthrough.yml
new file mode 100644
index 00000000000..8285339493b
--- /dev/null
+++ b/buildscripts/resmokeconfig/suites/replica_sets_reconfig_jscore_passthrough.yml
@@ -0,0 +1,84 @@
+test_kind: js_test
+# This suite starts a 5-node replica set and uses the DoReconfigInBackground hook to periodically
+# run safe reconfigs against the primary. These reconfigs change the number of voting nodes in
+# the replica set, which changes the voting majority used to satisfy the config commitment check
+# and the oplog commitment check.
+
+selector:
+  roots:
+  - jstests/core/**/*.js
+  exclude_files:
+  # Transactions do not support retryability of individual operations.
+  # TODO: Remove this once it is supported (SERVER-33952).
+  - jstests/core/txns/**/*.js
+  # These tests are not expected to pass with replica sets:
+  - jstests/core/dbadmin.js
+  - jstests/core/opcounters_write_cmd.js
+  - jstests/core/read_after_optime.js
+  - jstests/core/capped_update.js
+  # The set_param1.js test attempts to compare the response from running the {getParameter: "*"}
+  # command multiple times, which may observe the change to the "transactionLifetimeLimitSeconds"
+  # server parameter.
+  - jstests/core/set_param1.js
+
+  # These tests run commands using legacy queries, which are not supported on sessions.
+  - jstests/core/comment_field.js
+  - jstests/core/exhaust.js
+  - jstests/core/invalidated_legacy_cursors.js
+  - jstests/core/validate_cmd_ns.js
+
+  # Unacknowledged writes are prohibited in an explicit session.
+  - jstests/core/batch_write_command_w0.js
+  - jstests/core/crud_api.js
+
+  - jstests/core/connection_string_validation.js # Does not expect a replica set connection string.
+
+  exclude_with_any_tags:
+  - assumes_read_preference_unchanged
+  - requires_sharding
+
+executor:
+  archive:
+    hooks:
+      - CheckReplDBHash
+      - CheckReplOplogs
+      - ValidateCollections
+  config:
+    shell_options:
+      eval: >-
+        testingReplication = true;
+        load('jstests/libs/override_methods/network_error_and_txn_override.js');
+        load("jstests/libs/override_methods/enable_sessions.js");
+      global_vars:
+        TestData:
+          networkErrorAndTxnOverrideConfig:
+            backgroundReconfigs: true
+          sessionOptions:
+            # Force DBClientRS to find the primary for non-write commands to make sure reads
+            # still work as expected during reconfigs.
+            readPreference:
+              mode: "primary"
+      readMode: commands
+  hooks:
+  # The CheckReplDBHash hook waits until all operations have replicated to and have been applied
+  # on the secondaries, so we run the ValidateCollections hook after it to ensure we're
+  # validating the entire contents of the collection.
+  - class: DoReconfigInBackground
+    shell_options:
+      nodb: ""
+  - class: CheckReplOplogs
+  - class: CheckReplDBHash
+  - class: ValidateCollections
+  - class: CleanEveryN
+    n: 20
+  fixture:
+    class: ReplicaSetFixture
+    mongod_options:
+      enableMajorityReadConcern: ''
+      set_parameters:
+        enableTestCommands: 1
+        logComponentVerbosity:
+          replication:
+            heartbeats: 2
+    all_nodes_electable: true
+    num_nodes: 5
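A note on the majority arithmetic behind this suite's header comment: both commitment checks of a safe reconfig are satisfied by a majority of voting members only, so toggling votes moves the bar. A minimal Python sketch, illustrative only and not part of the patch, assuming the 5-node set this suite configures:

def commitment_majority(num_voting_nodes):
    # The config commitment and oplog commitment checks of a safe reconfig
    # are satisfied by a majority of *voting* members, not of all members.
    return num_voting_nodes // 2 + 1

for voters in (5, 4, 3):
    print(voters, "voting nodes -> majority of", commitment_majority(voters))
# 5 voting nodes -> majority of 3
# 4 voting nodes -> majority of 3
# 3 voting nodes -> majority of 2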
diff --git a/buildscripts/resmokeconfig/suites/replica_sets_reconfig_jscore_stepdown_passthrough.yml b/buildscripts/resmokeconfig/suites/replica_sets_reconfig_jscore_stepdown_passthrough.yml
new file mode 100644
index 00000000000..d221ccba640
--- /dev/null
+++ b/buildscripts/resmokeconfig/suites/replica_sets_reconfig_jscore_stepdown_passthrough.yml
@@ -0,0 +1,198 @@
+test_kind: js_test
+# This suite starts a 5-node replica set and runs both the DoReconfigInBackground and
+# ContinuousStepdown hooks to periodically run safe reconfigs and stepdowns against the
+# primary. This tests that the concurrency between stepdowns and reconfigs is still ultimately
+# safe.
+
+selector:
+  roots:
+  - jstests/core/**/*.js
+  exclude_files:
+  # Transactions do not support retryability of individual operations.
+  # TODO: Remove this once it is supported (SERVER-33952).
+  - jstests/core/txns/**/*.js
+  # These tests are not expected to pass with replica sets:
+  - jstests/core/dbadmin.js
+  - jstests/core/opcounters_write_cmd.js
+  - jstests/core/read_after_optime.js
+  - jstests/core/capped_update.js
+  # The set_param1.js test attempts to compare the response from running the {getParameter: "*"}
+  # command multiple times, which may observe the change to the "transactionLifetimeLimitSeconds"
+  # server parameter.
+  - jstests/core/set_param1.js
+
+  # No-op retries are not ignored by top, the profiler, or opcount.
+  - jstests/core/operation_latency_histogram.js
+  - jstests/core/profile2.js
+  - jstests/core/profile3.js
+  - jstests/core/profile_findandmodify.js
+  - jstests/core/top.js
+  - jstests/core/views/views_stats.js
+
+  # TODO SERVER-31249: getLastError should not be affected by no-op retries.
+  - jstests/core/bulk_legacy_enforce_gle.js
+
+  # TODO SERVER-31242: findAndModify no-op retry should respect the fields option.
+  - jstests/core/crud_api.js
+  - jstests/core/find_and_modify.js
+  - jstests/core/find_and_modify2.js
+  - jstests/core/find_and_modify_pipeline_update.js
+  - jstests/core/find_and_modify_server6865.js
+
+  # These tests run commands using legacy queries, which are not supported on sessions.
+  - jstests/core/comment_field.js
+  - jstests/core/exhaust.js
+  - jstests/core/validate_cmd_ns.js
+
+  # Stepdown commands during fsync lock will fail.
+  - jstests/core/currentop.js
+  - jstests/core/fsync.js
+  - jstests/core/killop_drop_collection.js
+
+  # These tests assert on the ismaster field of an isMaster response. If a primary steps down
+  # after accepting an isMaster command and returns before its connection is closed, the
+  # response can contain ismaster: false.
+  - jstests/core/dbadmin.js
+  - jstests/core/ismaster.js
+
+  # Spawns new mongo shells, which don't retry connecting on stepdown errors.
+  - jstests/core/shell_connection_strings.js
+
+  # These tests expect drops/creates to fail or to have a certain response:
+  - jstests/core/drop.js
+  - jstests/core/dropdb.js
+  - jstests/core/explain_upsert.js
+  - jstests/core/indexes_multiple_commands.js
+
+  # These tests expect certain responses, but retries of successfully completed commands may
+  # return different values:
+  - jstests/core/create_indexes.js
+  - jstests/core/objid5.js
+
+  # Unacknowledged writes are prohibited in an explicit session.
+  - jstests/core/batch_write_command_w0.js
+
+  - jstests/core/bench_test*.js # benchRun() used for writes
+  - jstests/core/benchrun_pipeline_updates.js # benchRun() used for writes
+  - jstests/core/connection_string_validation.js # Does not expect a replica set connection string.
+  - jstests/core/list_collections_filter.js # Temporary collections are dropped on failover.
+  - jstests/core/top.js # Tests read commands (including getMore) against the secondary.
+  - jstests/core/drop3.js # getMore is not causally consistent if the collection is dropped.
+  - jstests/core/validate_cmd_ns.js # Calls _exec() directly, not retryable.
+  - jstests/core/explain_large_bounds.js # Stepdown can time out waiting for the global lock.
+
+  # Tests that fail for causal consistency because they contain statements that do not support
+  # non-local read concern.
+  - jstests/core/collation.js
+  # The parallel shell is not causally consistent.
+  - jstests/core/benchrun_pipeline_updates.js
+  - jstests/core/find_and_modify_concurrent_update.js
+  - jstests/core/shellstartparallel.js
+
+  exclude_with_any_tags:
+  ##
+  # The next four tags correspond to the special errors thrown by the
+  # auto_retry_on_network_error.js override when it refuses to run a certain command. Above each
+  # tag are the message(s) that cause the tag to be warranted.
+  ##
+  # "Refusing to run a test that issues a getMore command since if a network error occurs during
+  # it then we won't know whether the cursor was advanced or not"
+  - requires_getmore
+  # "Refusing to run a test that issues non-retryable write operations since the test likely
+  # makes assertions on the write results and can lead to spurious failures if a network error
+  # occurs"
+  - requires_non_retryable_writes
+  # "Refusing to run a test that issues commands that are not blindly retryable"
+  # "Refusing to run a test that issues an aggregation command with $out because it is not
+  # retryable"
+  - requires_non_retryable_commands
+  # "Refusing to run a test that issues commands that may return different values after a
+  # failover"
+  # "Refusing to run a test that issues an aggregation command with explain because it may
+  # return incomplete results"
+  # "Refusing to run a test that issues an aggregation command with $listLocalSessions because
+  # it relies on in-memory state that may not survive failovers"
+  # "Refusing to run a test that issues a mapReduce command, because it calls std::terminate()
+  # if interrupted by a stepdown"
+  - does_not_support_stepdowns
+  ##
+  # The next two tags correspond to the special errors thrown by the
+  # set_read_and_write_concerns.js override when it refuses to replace the readConcern or
+  # writeConcern of a particular command. Above each tag are the message(s) that cause the tag
+  # to be warranted.
+  ##
+  # "Cowardly refusing to override read concern of command: ..."
+  - assumes_read_concern_unchanged
+  # "Cowardly refusing to override write concern of command: ..."
+  - assumes_write_concern_unchanged
+  ##
+  # The next tag corresponds to long-running operations, as they may exhaust their number of
+  # retries and result in a network error being thrown.
+  ##
+  - operations_longer_than_stepdown_interval
+  - does_not_support_causal_consistency
+  - uses_transactions
+  # collStats is not causally consistent.
+  - requires_collstats
+  - requires_dbstats
+  - requires_datasize
+  - requires_sharding
+
+executor:
+  archive:
+    hooks:
+      - CheckReplDBHash
+      - CheckReplOplogs
+      - ValidateCollections
+  config:
+    shell_options:
+      eval: >-
+        testingReplication = true;
+        load('jstests/libs/override_methods/network_error_and_txn_override.js');
+        db = connect(TestData.connectionString);
+        load("jstests/libs/override_methods/enable_sessions.js");
+        load("jstests/libs/override_methods/set_read_and_write_concerns.js");
+      global_vars:
+        TestData:
+          runningWithCausalConsistency: true
+          alwaysInjectTransactionNumber: true
+          defaultReadConcernLevel: "majority"
+          logRetryAttempts: true
+          networkErrorAndTxnOverrideConfig:
+            retryOnNetworkErrors: true
+            backgroundReconfigs: true
+          overrideRetryAttempts: 3
+          sessionOptions:
+            writeConcern:
+              w: "majority"
+            readConcern:
+              level: "majority"
+            # Force DBClientRS to find the primary for non-write commands.
+            readPreference:
+              mode: "primary"
+            retryWrites: true
+      # We specify nodb so the shell used by each test will attempt to connect after loading the
+      # retry logic in auto_retry_on_network_error.js.
+      nodb: ""
+      readMode: commands
+  hooks:
+  - class: DoReconfigInBackground
+    shell_options:
+      nodb: ""
+  - class: ContinuousStepdown
+  # The CheckReplDBHash hook waits until all operations have replicated to and have been applied
+  # on the secondaries, so we run the ValidateCollections hook after it to ensure we're
+  # validating the entire contents of the collection.
+  - class: CheckReplOplogs
+  - class: CheckReplDBHash
+  - class: ValidateCollections
+  - class: CleanEveryN
+    n: 20
+  fixture:
+    class: ReplicaSetFixture
+    mongod_options:
+      enableMajorityReadConcern: ''
+      set_parameters:
+        enableTestCommands: 1
+        logComponentVerbosity:
+          replication:
+            heartbeats: 2
+    all_nodes_electable: true
+    num_nodes: 5
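The sessionOptions block above maps onto ordinary driver-level settings. As a rough PyMongo analogue (illustrative only; the URI is a placeholder, and the shell override also injects transaction numbers, which this does not show):

from pymongo import MongoClient

# Rough driver-side analogue of the injected sessionOptions: majority
# read/write concern, primary reads, and retryable writes.
client = MongoClient("mongodb://localhost:20000/?replicaSet=rs",  # placeholder URI
                     retryWrites=True, readPreference="primary",
                     w="majority", readConcernLevel="majority")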
diff --git a/buildscripts/resmokeconfig/suites/replica_sets_reconfig_kill_primary_jscore_passthrough.yml b/buildscripts/resmokeconfig/suites/replica_sets_reconfig_kill_primary_jscore_passthrough.yml
new file mode 100644
index 00000000000..6c973e3dd06
--- /dev/null
+++ b/buildscripts/resmokeconfig/suites/replica_sets_reconfig_kill_primary_jscore_passthrough.yml
@@ -0,0 +1,214 @@
+test_kind: js_test
+# This suite starts a 5-node replica set and runs both the DoReconfigInBackground and
+# ContinuousStepdown hooks, with kill: true and background_reconfig: true, to periodically run
+# safe reconfigs and send kill signals to the primary. This tests that the concurrency between
+# killing the primary and reconfigs is still ultimately safe.
+
+selector:
+  roots:
+  - jstests/core/**/*.js
+  exclude_files:
+  # Transactions do not support retryability of individual operations.
+  # TODO: Remove this once it is supported (SERVER-33952).
+  - jstests/core/txns/**/*.js
+  # These tests are not expected to pass with replica sets:
+  - jstests/core/dbadmin.js
+  - jstests/core/opcounters_write_cmd.js
+  - jstests/core/read_after_optime.js
+  - jstests/core/capped_update.js
+  # The set_param1.js test attempts to compare the response from running the {getParameter: "*"}
+  # command multiple times, which may observe the change to the "transactionLifetimeLimitSeconds"
+  # server parameter.
+  - jstests/core/set_param1.js
+
+  # No-op retries are not ignored by top, the profiler, or opcount.
+  - jstests/core/operation_latency_histogram.js
+  - jstests/core/profile2.js
+  - jstests/core/profile3.js
+  - jstests/core/profile_findandmodify.js
+  - jstests/core/top.js
+  - jstests/core/views/views_stats.js
+
+  # TODO SERVER-31249: getLastError should not be affected by no-op retries.
+  - jstests/core/bulk_legacy_enforce_gle.js
+
+  # TODO SERVER-31242: findAndModify no-op retry should respect the fields option.
+  - jstests/core/crud_api.js
+  - jstests/core/find_and_modify.js
+  - jstests/core/find_and_modify2.js
+  - jstests/core/find_and_modify_pipeline_update.js
+  - jstests/core/find_and_modify_server6865.js
+
+  # These tests run commands using legacy queries, which are not supported on sessions.
+  - jstests/core/comment_field.js
+  - jstests/core/exhaust.js
+  - jstests/core/validate_cmd_ns.js
+
+  # Stepdown commands during fsync lock will fail.
+  - jstests/core/currentop.js
+  - jstests/core/fsync.js
+  - jstests/core/killop_drop_collection.js
+
+  # These tests assert on the ismaster field of an isMaster response. If a primary steps down
+  # after accepting an isMaster command and returns before its connection is closed, the
+  # response can contain ismaster: false.
+  - jstests/core/dbadmin.js
+  - jstests/core/ismaster.js
+
+  # Spawns new mongo shells, which don't retry connecting on stepdown errors.
+  - jstests/core/shell_connection_strings.js
+
+  # These tests expect drops/creates to fail or to have a certain response:
+  - jstests/core/drop.js
+  - jstests/core/dropdb.js
+  - jstests/core/explain_upsert.js
+  - jstests/core/indexes_multiple_commands.js
+
+  # These tests expect certain responses, but retries of successfully completed commands may
+  # return different values:
+  - jstests/core/create_indexes.js
+  - jstests/core/objid5.js
+
+  # Unacknowledged writes are prohibited in an explicit session.
+  - jstests/core/batch_write_command_w0.js
+
+  - jstests/core/bench_test*.js # benchRun() used for writes
+  - jstests/core/benchrun_pipeline_updates.js # benchRun() used for writes
+  - jstests/core/connection_string_validation.js # Does not expect a replica set connection string.
+  - jstests/core/list_collections_filter.js # Temporary collections are dropped on failover.
+  - jstests/core/top.js # Tests read commands (including getMore) against the secondary.
+  - jstests/core/drop3.js # getMore is not causally consistent if the collection is dropped.
+  - jstests/core/validate_cmd_ns.js # Calls _exec() directly, not retryable.
+  - jstests/core/explain_large_bounds.js # Stepdown can time out waiting for the global lock.
+
+  # Tests that fail for causal consistency because they contain statements that do not support
+  # non-local read concern.
+  - jstests/core/collation.js
+  # Starts a parallel shell but won't restart it after unclean shutdown.
+  # TODO SERVER-33229: Remove these exclusions.
+  - jstests/core/compact_keeps_indexes.js
+  - jstests/core/benchrun_pipeline_updates.js
+  - jstests/core/find_and_modify_concurrent_update.js
+  - jstests/core/shellstartparallel.js
+
+  # Inserts enough data that recovery takes more than 8 seconds, so we never get a working primary.
+  - jstests/core/geo_s2ordering.js
+
+  exclude_with_any_tags:
+  ##
+  # The next four tags correspond to the special errors thrown by the
+  # auto_retry_on_network_error.js override when it refuses to run a certain command. Above each
+  # tag are the message(s) that cause the tag to be warranted.
+  ##
+  # "Refusing to run a test that issues a getMore command since if a network error occurs during
+  # it then we won't know whether the cursor was advanced or not"
+  - requires_getmore
+  # "Refusing to run a test that issues non-retryable write operations since the test likely
+  # makes assertions on the write results and can lead to spurious failures if a network error
+  # occurs"
+  - requires_non_retryable_writes
+  # "Refusing to run a test that issues commands that are not blindly retryable"
+  # "Refusing to run a test that issues an aggregation command with $out because it is not
+  # retryable"
+  - requires_non_retryable_commands
+  # "Refusing to run a test that issues commands that may return different values after a
+  # failover"
+  # "Refusing to run a test that issues an aggregation command with explain because it may
+  # return incomplete results"
+  # "Refusing to run a test that issues an aggregation command with $listLocalSessions because
+  # it relies on in-memory state that may not survive failovers"
+  # "Refusing to run a test that issues a mapReduce command, because it calls std::terminate()
+  # if interrupted by a stepdown"
+  - does_not_support_stepdowns
+  ##
+  # The next two tags correspond to the special errors thrown by the
+  # set_read_and_write_concerns.js override when it refuses to replace the readConcern or
+  # writeConcern of a particular command. Above each tag are the message(s) that cause the tag
+  # to be warranted.
+  ##
+  # "Cowardly refusing to override read concern of command: ..."
+  - assumes_read_concern_unchanged
+  # "Cowardly refusing to override write concern of command: ..."
+  - assumes_write_concern_unchanged
+  ##
+  # The next four tags correspond to the special errors thrown by the
+  # fail_unclean_shutdown_incompatible_commands.js override when it refuses to run commands
+  # that are inaccurate after an unclean shutdown. Above each tag is the message that causes
+  # the tag to be warranted.
+  ##
+  # "Cowardly fail if fastcount is run with a mongod that had an unclean shutdown: ..."
+  - requires_fastcount
+  # "Cowardly fail if dbStats is run with a mongod that had an unclean shutdown: ..."
+  - requires_dbstats
+  # "Cowardly fail if collStats is run with a mongod that had an unclean shutdown: ..."
+  - requires_collstats
+  # "Cowardly fail if unbounded dataSize is run with a mongod that had an unclean shutdown: ..."
+  - requires_datasize
+  ##
+  # The next tag corresponds to long-running operations, as they may exhaust their number of
+  # retries and result in a network error being thrown.
+  ##
+  - operations_longer_than_stepdown_interval
+
+executor:
+  archive:
+    hooks:
+      - CheckReplDBHash
+      - CheckReplOplogs
+      - ValidateCollections
+  config:
+    shell_options:
+      eval: >-
+        testingReplication = true;
+        load('jstests/libs/override_methods/network_error_and_txn_override.js');
+        db = connect(TestData.connectionString);
+        load("jstests/libs/override_methods/enable_sessions.js");
+        load("jstests/libs/override_methods/set_read_and_write_concerns.js");
+        load("jstests/libs/override_methods/fail_unclean_shutdown_incompatible_commands.js");
+        load("jstests/libs/override_methods/fail_unclean_shutdown_start_parallel_shell.js");
+      global_vars:
+        TestData:
+          alwaysInjectTransactionNumber: true
+          defaultReadConcernLevel: "majority"
+          logRetryAttempts: true
+          networkErrorAndTxnOverrideConfig:
+            retryOnNetworkErrors: true
+            backgroundReconfigs: true
+          overrideRetryAttempts: 3
+          sessionOptions:
+            writeConcern:
+              w: "majority"
+            readConcern:
+              level: "majority"
+            # Force DBClientRS to find the primary for non-write commands.
+            readPreference:
+              mode: "primary"
+            retryWrites: true
+      # We specify nodb so the shell used by each test will attempt to connect after loading the
+      # retry logic in auto_retry_on_network_error.js.
+      nodb: ""
+      readMode: commands
+  hooks:
+  - class: DoReconfigInBackground
+    shell_options:
+      nodb: ""
+  - class: ContinuousStepdown
+    kill: true
+    background_reconfig: true
+  # The CheckReplDBHash hook waits until all operations have replicated to and have been applied
+  # on the secondaries, so we run the ValidateCollections hook after it to ensure we're
+  # validating the entire contents of the collection.
+  - class: CheckReplOplogs
+  - class: CheckReplDBHash
+  - class: ValidateCollections
+  - class: CleanEveryN
+    n: 20
+  fixture:
+    class: ReplicaSetFixture
+    mongod_options:
+      enableMajorityReadConcern: ''
+      set_parameters:
+        enableTestCommands: 1
+        logComponentVerbosity:
+          replication:
+            heartbeats: 2
+    all_nodes_electable: true
+    num_nodes: 5
diff --git a/buildscripts/resmokelib/testing/fixtures/replicaset.py b/buildscripts/resmokelib/testing/fixtures/replicaset.py
index fb8e90b5296..5de9996de1c 100644
--- a/buildscripts/resmokelib/testing/fixtures/replicaset.py
+++ b/buildscripts/resmokelib/testing/fixtures/replicaset.py
@@ -187,12 +187,10 @@ class ReplicaSetFixture(interface.ReplFixture):  # pylint: disable=too-many-inst
             replset_settings = self.replset_config_options["settings"]
             repl_config["settings"] = replset_settings
 
-        # If not all nodes are electable and no election timeout was specified, then we increase
-        # the election timeout to 24 hours to prevent spurious elections.
-        if not self.all_nodes_electable:
-            repl_config.setdefault("settings", {})
-            if "electionTimeoutMillis" not in repl_config["settings"]:
-                repl_config["settings"]["electionTimeoutMillis"] = 24 * 60 * 60 * 1000
+        # Increase the election timeout to 24 hours to prevent spurious elections.
+        repl_config.setdefault("settings", {})
+        if "electionTimeoutMillis" not in repl_config["settings"]:
+            repl_config["settings"]["electionTimeoutMillis"] = 24 * 60 * 60 * 1000
 
         # Start up a single node replica set then reconfigure to the correct size (if the config
         # contains more than 1 node), so the primary is elected more quickly.
@@ -500,6 +498,16 @@ class ReplicaSetFixture(interface.ReplFixture):  # pylint: disable=too-many-inst
         primary = self.get_primary()
         return [node for node in self.nodes if node.port != primary.port]
 
+    def get_voting_members(self):
+        """Return the list of voting members of the replica set."""
+        primary = self.get_primary()
+        client = primary.mongo_client()
+
+        members = client.admin.command({"replSetGetConfig": 1})['config']['members']
+        voting_members = [member['host'] for member in members if member['votes'] == 1]
+
+        return voting_members
+
     def get_initial_sync_node(self):
         """Return initial sync node from the replica set."""
        return self.initial_sync_node
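get_voting_members() is a thin wrapper over replSetGetConfig. An equivalent standalone query, as a PyMongo sketch with a placeholder URI, would be:

from pymongo import MongoClient

def voting_members(primary_uri):
    # Ask the primary for the current replica set config and keep the hosts
    # whose members carry a vote, mirroring get_voting_members() above.
    client = MongoClient(primary_uri)  # placeholder URI for the primary
    members = client.admin.command("replSetGetConfig")["config"]["members"]
    return [m["host"] for m in members if m["votes"] == 1]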
+ """ + if self._background_job is None: + return + + self.logger.info("Pausing the background reconfig thread.") + self._background_job.pause() + + if self._background_job.exc_info is not None: + if isinstance(self._background_job.exc_info[1], errors.TestFailure): + # If the mongo shell process running the JavaScript file exited with a non-zero + # return code, then we raise an errors.ServerFailure exception to cause resmoke.py's + # test execution to stop. + raise errors.ServerFailure(self._background_job.exc_info[1].args[0]) + else: + self.logger.error("Encountered an error inside the background reconfig thread.", + exc_info=self._background_job.exc_info) + raise self._background_job.exc_info[1] diff --git a/buildscripts/resmokelib/testing/hooks/stepdown.py b/buildscripts/resmokelib/testing/hooks/stepdown.py index de2c89e9a38..4cfd09fd52d 100644 --- a/buildscripts/resmokelib/testing/hooks/stepdown.py +++ b/buildscripts/resmokelib/testing/hooks/stepdown.py @@ -27,7 +27,7 @@ class ContinuousStepdown(interface.Hook): # pylint: disable=too-many-instance-a self, hook_logger, fixture, config_stepdown=True, shard_stepdown=True, stepdown_interval_ms=8000, terminate=False, kill=False, use_stepdown_permitted_file=False, wait_for_mongos_retarget=False, - stepdown_via_heartbeats=True): + stepdown_via_heartbeats=True, background_reconfig=False): """Initialize the ContinuousStepdown. Args: @@ -64,6 +64,8 @@ class ContinuousStepdown(interface.Hook): # pylint: disable=too-many-instance-a self._terminate = terminate or kill self._kill = kill + self._background_reconfig = background_reconfig + # The stepdown file names need to match the same construction as found in # jstests/concurrency/fsm_libs/resmoke_runner.js. dbpath_prefix = fixture.get_dbpath_prefix() @@ -87,7 +89,7 @@ class ContinuousStepdown(interface.Hook): # pylint: disable=too-many-instance-a self._stepdown_thread = _StepdownThread( self.logger, self._mongos_fixtures, self._rs_fixtures, self._stepdown_interval_secs, self._terminate, self._kill, lifecycle, self._wait_for_mongos_retarget, - self._stepdown_via_heartbeats) + self._stepdown_via_heartbeats, self._background_reconfig) self.logger.info("Starting the stepdown thread.") self._stepdown_thread.start() @@ -348,7 +350,8 @@ class FileBasedStepdownLifecycle(object): class _StepdownThread(threading.Thread): # pylint: disable=too-many-instance-attributes def __init__( # pylint: disable=too-many-arguments self, logger, mongos_fixtures, rs_fixtures, stepdown_interval_secs, terminate, kill, - stepdown_lifecycle, wait_for_mongos_retarget, stepdown_via_heartbeats): + stepdown_lifecycle, wait_for_mongos_retarget, stepdown_via_heartbeats, + background_reconfig): """Initialize _StepdownThread.""" threading.Thread.__init__(self, name="StepdownThread") self.daemon = True @@ -365,6 +368,7 @@ class _StepdownThread(threading.Thread): # pylint: disable=too-many-instance-at self.__lifecycle = stepdown_lifecycle self._should_wait_for_mongos_retarget = wait_for_mongos_retarget self._stepdown_via_heartbeats = stepdown_via_heartbeats + self._background_reconfig = background_reconfig self._last_exec = time.time() # Event set when the thread has been stopped using the 'stop()' method. @@ -474,6 +478,22 @@ class _StepdownThread(threading.Thread): # pylint: disable=too-many-instance-at rs_fixture.replset_name)) if self._terminate: + # If we're running with background reconfigs, it's possible to be in a scenario + # where we kill a necessary voting node (i.e. in a 5 node repl set), only 2 are + # voting. 
diff --git a/buildscripts/resmokelib/testing/hooks/stepdown.py b/buildscripts/resmokelib/testing/hooks/stepdown.py
index de2c89e9a38..4cfd09fd52d 100644
--- a/buildscripts/resmokelib/testing/hooks/stepdown.py
+++ b/buildscripts/resmokelib/testing/hooks/stepdown.py
@@ -27,7 +27,7 @@ class ContinuousStepdown(interface.Hook):  # pylint: disable=too-many-instance-a
             self, hook_logger, fixture, config_stepdown=True, shard_stepdown=True,
             stepdown_interval_ms=8000, terminate=False, kill=False,
             use_stepdown_permitted_file=False, wait_for_mongos_retarget=False,
-            stepdown_via_heartbeats=True):
+            stepdown_via_heartbeats=True, background_reconfig=False):
         """Initialize the ContinuousStepdown.
 
         Args:
@@ -64,6 +64,8 @@ class ContinuousStepdown(interface.Hook):  # pylint: disable=too-many-instance-a
         self._terminate = terminate or kill
         self._kill = kill
 
+        self._background_reconfig = background_reconfig
+
         # The stepdown file names need to match the same construction as found in
         # jstests/concurrency/fsm_libs/resmoke_runner.js.
         dbpath_prefix = fixture.get_dbpath_prefix()
@@ -87,7 +89,7 @@ class ContinuousStepdown(interface.Hook):  # pylint: disable=too-many-instance-a
         self._stepdown_thread = _StepdownThread(
             self.logger, self._mongos_fixtures, self._rs_fixtures, self._stepdown_interval_secs,
             self._terminate, self._kill, lifecycle, self._wait_for_mongos_retarget,
-            self._stepdown_via_heartbeats)
+            self._stepdown_via_heartbeats, self._background_reconfig)
         self.logger.info("Starting the stepdown thread.")
         self._stepdown_thread.start()
 
@@ -348,7 +350,8 @@ class FileBasedStepdownLifecycle(object):
 class _StepdownThread(threading.Thread):  # pylint: disable=too-many-instance-attributes
     def __init__(  # pylint: disable=too-many-arguments
             self, logger, mongos_fixtures, rs_fixtures, stepdown_interval_secs, terminate, kill,
-            stepdown_lifecycle, wait_for_mongos_retarget, stepdown_via_heartbeats):
+            stepdown_lifecycle, wait_for_mongos_retarget, stepdown_via_heartbeats,
+            background_reconfig):
         """Initialize _StepdownThread."""
         threading.Thread.__init__(self, name="StepdownThread")
         self.daemon = True
@@ -365,6 +368,7 @@ class _StepdownThread(threading.Thread):  # pylint: disable=too-many-instance-at
         self.__lifecycle = stepdown_lifecycle
         self._should_wait_for_mongos_retarget = wait_for_mongos_retarget
         self._stepdown_via_heartbeats = stepdown_via_heartbeats
+        self._background_reconfig = background_reconfig
 
         self._last_exec = time.time()
         # Event set when the thread has been stopped using the 'stop()' method.
@@ -474,6 +478,22 @@ class _StepdownThread(threading.Thread):  # pylint: disable=too-many-instance-at
                     rs_fixture.replset_name))
 
         if self._terminate:
+            # If we're running with background reconfigs, it's possible to be in a scenario
+            # where we would kill a necessary voting node (e.g. in a 5-node replica set where
+            # only 2 members are voting). In this scenario, we want to avoid killing the
+            # primary because no secondary can step up.
+            if self._background_reconfig:
+                # Stagger the kill thread so that it runs a little after the reconfig thread.
+                time.sleep(1)
+                voting_members = rs_fixture.get_voting_members()
+
+                self.logger.info("Current voting members: %s", voting_members)
+
+                if len(voting_members) <= 3:
+                    # Do not kill or terminate the primary if we don't have enough voting
+                    # nodes to elect a new primary.
+                    return
+
             should_kill = self._kill and random.choice([True, False])
             action = "Killing" if should_kill else "Terminating"
             self.logger.info("%s the primary on port %d of replica set '%s'.", action,
                              primary.port,
diff --git a/etc/evergreen.yml b/etc/evergreen.yml
index 48f75418a26..fc22975e4b9 100644
--- a/etc/evergreen.yml
+++ b/etc/evergreen.yml
@@ -6088,6 +6088,36 @@ tasks:
     vars:
       resmoke_args: --suites=replica_sets_jscore_passthrough --storageEngine=wiredTiger
 
+- <<: *task_template
+  name: replica_sets_reconfig_jscore_passthrough
+  depends_on:
+  - name: jsCore
+  commands:
+  - func: "do setup"
+  - func: "run tests"
+    vars:
+      resmoke_args: --suites=replica_sets_reconfig_jscore_passthrough --storageEngine=wiredTiger
+
+- <<: *task_template
+  name: replica_sets_reconfig_jscore_stepdown_passthrough
+  depends_on:
+  - name: jsCore
+  commands:
+  - func: "do setup"
+  - func: "run tests"
+    vars:
+      resmoke_args: --suites=replica_sets_reconfig_jscore_stepdown_passthrough --storageEngine=wiredTiger
+
+- <<: *task_template
+  name: replica_sets_reconfig_kill_primary_jscore_passthrough
+  depends_on:
+  - name: jsCore
+  commands:
+  - func: "do setup"
+  - func: "run tests"
+    vars:
+      resmoke_args: --suites=replica_sets_reconfig_kill_primary_jscore_passthrough --storageEngine=wiredTiger
+
 - name: replica_sets_jscore_passthrough_gen
   depends_on:
   - name: jsCore
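Each Evergreen task above boils down to a resmoke invocation. A local equivalent, sketched via subprocess for concreteness (run from the repository root; the other two tasks differ only in the suite name):

import subprocess

# Local equivalent of the replica_sets_reconfig_jscore_passthrough task.
subprocess.run([
    "python", "buildscripts/resmoke.py",
    "--suites=replica_sets_reconfig_jscore_passthrough",
    "--storageEngine=wiredTiger",
], check=True)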
diff --git a/jstests/hooks/run_reconfig_background.js b/jstests/hooks/run_reconfig_background.js
new file mode 100644
index 00000000000..4717a8d4710
--- /dev/null
+++ b/jstests/hooks/run_reconfig_background.js
@@ -0,0 +1,147 @@
+/**
+ * This hook runs the reconfig command against the primary of a replica set. It first chooses a
+ * random node (not the primary) and changes its votes and priority to 0 or 1 depending on the
+ * current value.
+ *
+ * This hook will run concurrently with tests.
+ */
+
+'use strict';
+
+(function() {
+load('jstests/libs/discover_topology.js');  // For Topology and DiscoverTopology.
+load('jstests/libs/parallelTester.js');     // For Thread.
+
+/**
+ * Returns true if the error code is transient.
+ */
+function isIgnorableError(codeName) {
+    if (codeName === "ConfigurationInProgress" || codeName === "NotMaster" ||
+        codeName === "InterruptedDueToReplStateChange" || codeName === "PrimarySteppedDown" ||
+        codeName === "NodeNotFound" || codeName === "ShutdownInProgress") {
+        return true;
+    }
+    return false;
+}
+
+/**
+ * Runs the reconfig command against the primary of a replica set.
+ *
+ * The reconfig command randomly chooses a node and changes its votes and priority to 0 or 1
+ * based on the node's current votes and priority fields. We always check that there are at
+ * least two voting nodes in the set, which ensures that we can always have a primary in the
+ * case of stepdowns.
+ * We also want to avoid changing the votes and priority of the current primary to 0, since
+ * this will result in an error.
+ *
+ * The number of voting nodes in the replica set determines what the config majority is for
+ * both reconfig config commitment and reconfig oplog commitment.
+ *
+ * This function should not throw if everything is working properly.
+ */
+function reconfigBackground(primary, numNodes) {
+    // Calls 'func' with the print() function overridden to be a no-op.
+    Random.setRandomSeed();
+    const quietly = (func) => {
+        const printOriginal = print;
+        try {
+            print = Function.prototype;
+            func();
+        } finally {
+            print = printOriginal;
+        }
+    };
+
+    // The stepdown and kill primary hooks run concurrently with this reconfig hook. It is
+    // possible that the topology will not be properly updated in time, meaning that the
+    // current primary can be undefined if a secondary has not stepped up soon enough.
+    if (primary === undefined) {
+        jsTestLog("Skipping reconfig because we do not have a primary yet.");
+        return {ok: 1};
+    }
+
+    jsTestLog("primary is " + primary);
+
+    // Suppress the log messages generated when establishing new mongo connections. The
+    // run_reconfig_background.js hook is executed frequently by resmoke.py and
+    // could lead to generating an overwhelming amount of log messages.
+    let conn;
+    quietly(() => {
+        conn = new Mongo(primary);
+    });
+    assert.neq(
+        null, conn, "Failed to connect to primary '" + primary + "' for background reconfigs");
+
+    var config = assert.commandWorked(conn.getDB("admin").runCommand({replSetGetConfig: 1})).config;
+
+    // Find the correct host in the member config.
+    const primaryHostIndex = (cfg, pHost) => cfg.members.findIndex(m => m.host === pHost);
+    const primaryIndex = primaryHostIndex(config, primary);
+    jsTestLog("primaryIndex is " + primaryIndex);
+
+    // Calculate the total number of voting nodes in this set so that we make sure we
+    // always have at least two voting nodes. This is so that the primary can always
+    // safely step down because there is at least one other electable secondary.
+    const numVotingNodes = config.members.filter(member => member.votes === 1).length;
+
+    // Randomly change the vote of a node to 1 or 0 depending on its current value. Do not
+    // change the primary's votes.
+    var indexToChange = primaryIndex;
+    while (indexToChange === primaryIndex) {
+        // randInt is exclusive of the upper bound.
+        indexToChange = Random.randInt(numNodes);
+    }
+
+    jsTestLog("Running reconfig to change votes of node at index " + indexToChange);
+
+    // Change the priority to correspond to the votes. If the member's current votes field
+    // is 1, only change it to 0 if there are more than 3 voting members in this set.
+    // We want to ensure that there are at least 3 voting nodes so that killing the primary
+    // will not affect a majority.
+    config.version++;
+    config.members[indexToChange].votes =
+        (config.members[indexToChange].votes === 1 && numVotingNodes > 3) ? 0 : 1;
+    config.members[indexToChange].priority = config.members[indexToChange].votes;
+
+    let votingRes = conn.getDB("admin").runCommand({replSetReconfig: config});
+    if (!votingRes.ok && !isIgnorableError(votingRes.codeName)) {
+        jsTestLog("Reconfig to change votes FAILED.");
+        return votingRes;
+    }
+
+    return {ok: 1};
+}
+
+// It is possible that the primary will be killed before actually running the reconfig
+// command. If we fail with a network error, ignore it.
+let res;
+try {
+    const conn = connect(TestData.connectionString);
+    const topology = DiscoverTopology.findConnectedNodes(conn.getMongo());
+
+    if (topology.type !== Topology.kReplicaSet) {
+        throw new Error('Unsupported topology configuration: ' + tojson(topology));
+    }
+
+    const numNodes = topology.nodes.length;
+    res = reconfigBackground(topology.primary, numNodes);
+} catch (e) {
+    // If the ReplicaSetMonitor cannot find a primary because it has stepped down or
+    // been killed, it may take longer than 15 seconds for a new primary to step up.
+    // Ignore this error until we find a new primary.
+    const kReplicaSetMonitorError =
+        /^Could not find host matching read preference.*mode: "primary"/;
+
+    if (isNetworkError(e)) {
+        jsTestLog("Ignoring network error: " + tojson(e));
+    } else if (e.message.match(kReplicaSetMonitorError)) {
+        jsTestLog("Ignoring read preference primary error: " + tojson(e));
+    } else {
+        throw e;
+    }
+
+    res = {ok: 1};
+}
+
+assert.commandWorked(res, "reconfig hook failed: " + tojson(res));
+})();
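The vote-toggling rule in reconfigBackground() is compact enough to restate. A Python rendering for illustration (field names mirror the config document the hook manipulates above):

import random

def next_config(config, primary_index):
    # Pick a random member other than the primary, then flip its vote: a
    # voting member loses its vote only while more than 3 members vote, so
    # at least 3 voting members always remain. Priority follows votes.
    members = config["members"]
    num_voting = sum(1 for m in members if m["votes"] == 1)

    index = primary_index
    while index == primary_index:  # never touch the primary's votes
        index = random.randrange(len(members))

    member = members[index]
    member["votes"] = 0 if (member["votes"] == 1 and num_voting > 3) else 1
    member["priority"] = member["votes"]
    config["version"] += 1
    return config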
diff --git a/jstests/libs/override_methods/network_error_and_txn_override.js b/jstests/libs/override_methods/network_error_and_txn_override.js
index 798d9bf7fda..b08b6d0b233 100644
--- a/jstests/libs/override_methods/network_error_and_txn_override.js
+++ b/jstests/libs/override_methods/network_error_and_txn_override.js
@@ -64,6 +64,11 @@ function configuredForTxnOverride() {
     return TestData.networkErrorAndTxnOverrideConfig.wrapCRUDinTransactions;
 }
 
+function configuredForBackgroundReconfigs() {
+    assert(TestData.networkErrorAndTxnOverrideConfig, TestData);
+    return TestData.networkErrorAndTxnOverrideConfig.backgroundReconfigs;
+}
+
 // Commands assumed to not be blindly retryable.
 const kNonRetryableCommands = new Set([
     // Commands that take write concern and do not support txnNumbers.
@@ -128,6 +133,15 @@ const kAcceptableNonRetryableCommands = new Set([
     "moveChunk",
 ]);
 
+// The following read operations defined in the CRUD specification are retryable.
+// Note that estimatedDocumentCount() and countDocuments() use the count command.
+const kRetryableReadCommands = new Set(["find", "aggregate", "distinct", "count"]);
+
+// Returns true if the command name is that of a retryable read command.
+function isRetryableReadCmdName(cmdName) {
+    return kRetryableReadCommands.has(cmdName);
+}
+
 // Returns if the given failed response is a safe response to ignore when retrying the
 // given command type.
 function isAcceptableRetryFailedResponse(cmdName, res) {
@@ -186,6 +200,20 @@ function canRetryNetworkErrorForCommand(cmdName, cmdObj) {
     return true;
 }
 
+// Returns true if the given command should retry a read error when reconfigs are present.
+function canRetryReadErrorDuringBackgroundReconfig(cmdName) {
+    if (!configuredForBackgroundReconfigs()) {
+        return false;
+    }
+    return isRetryableReadCmdName(cmdName);
+}
+
+// When running the reconfig command on a node, it will drop its snapshot. Read commands issued
+// to this node before it updates its snapshot will fail with ReadConcernMajorityNotAvailableYet.
+function isRetryableReadCode(code) {
+    return code === ErrorCodes.ReadConcernMajorityNotAvailableYet;
+}
+
 // Several commands that use the plan executor swallow the actual error code from a failed plan
 // into their error message and instead return OperationFailed.
 //
@@ -905,6 +933,19 @@ function shouldRetryWithNetworkErrorOverride(
     return res;
 }
 
+function shouldRetryForBackgroundReconfigOverride(res, cmdName, logError) {
+    assert(configuredForBackgroundReconfigs());
+    // Background reconfigs can interfere with read commands if they are using readConcern:
+    // majority and readPreference: primary. If we're running a read command and it fails with
+    // ReadConcernMajorityNotAvailableYet, retry because it should eventually succeed.
+    if (isRetryableReadCmdName(cmdName) && isRetryableReadCode(res.code)) {
+        logError("Retrying read command after 100ms because of background reconfigs");
+        sleep(100);
+        return kContinue;
+    }
+    return res;
+}
+
 // Processes exceptions if configured for txn override. Retries the entire transaction on
 // transient transaction errors or network errors if configured for network errors as well.
 // If a retry fails, returns the response, or returns null for further exception processing.
@@ -990,6 +1031,7 @@ function runCommandOverrideBody(conn, dbName, cmdName, cmdObj, lsid, clientFunct
     }
 
     const canRetryNetworkError = canRetryNetworkErrorForCommand(cmdName, cmdObj);
+    const canRetryReadError = canRetryReadErrorDuringBackgroundReconfig(cmdName);
     let numNetworkErrorRetries = canRetryNetworkError ? kMaxNumRetries : 0;
     do {
         try {
@@ -1020,6 +1062,16 @@ function runCommandOverrideBody(conn, dbName, cmdName, cmdObj, lsid, clientFunct
                 }
             }
 
+            if (canRetryReadError) {
+                const readRetryRes =
+                    shouldRetryForBackgroundReconfigOverride(res, cmdName, logError);
+                if (readRetryRes === kContinue) {
+                    continue;
+                } else {
+                    res = readRetryRes;
+                }
+            }
+
             return res;
         } catch (e) {
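The control flow of the new read-retry path, restated as a small Python sketch. run_read stands in for the shell's client function and the attempt bound is an assumption of the sketch; the error code and the 100 ms sleep come from the override above:

import time

READ_CONCERN_MAJORITY_NOT_AVAILABLE_YET = 134  # ErrorCodes.ReadConcernMajorityNotAvailableYet

def run_with_reconfig_retries(run_read, max_attempts=3):
    # Reissue a retryable read while it fails because a concurrent reconfig
    # dropped the node's committed snapshot; any other result is returned.
    res = run_read()
    attempts = 1
    while res.get("code") == READ_CONCERN_MAJORITY_NOT_AVAILABLE_YET and attempts < max_attempts:
        time.sleep(0.1)  # the override sleeps 100 ms between attempts
        res = run_read()
        attempts += 1
    return res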
diff --git a/jstests/replsets/txn_override_unittests.js b/jstests/replsets/txn_override_unittests.js
index 8fe114b7789..8bab3f5b167 100644
--- a/jstests/replsets/txn_override_unittests.js
+++ b/jstests/replsets/txn_override_unittests.js
@@ -1896,6 +1896,168 @@ const txnOverridePlusRetryOnNetworkErrorTestsFcv42 = [
     }
 ];
 
+const retryOnReadErrorsFromBackgroundReconfigTest = [
+    {
+        name: "find retries on ReadConcernMajorityNotAvailableYet",
+        test: function() {
+            assert.commandWorked(testDB.createCollection(collName1));
+            assert.commandWorked(coll1.insert({_id: 1}));
+            failCommandWithFailPoint(["find"],
+                                     {errorCode: ErrorCodes.ReadConcernMajorityNotAvailableYet});
+            assert.eq(coll1.findOne({_id: 1}), {_id: 1});
+        }
+    },
+    {
+        name: "aggregate retries on ReadConcernMajorityNotAvailableYet",
+        test: function() {
+            assert.commandWorked(testDB.createCollection(collName1));
+            assert.commandWorked(coll1.insert({a: 1}));
+            assert.commandWorked(coll1.insert({a: 1}));
+            assert.commandWorked(coll1.insert({a: 2}));
+            failCommandWithFailPoint(["aggregate"],
+                                     {errorCode: ErrorCodes.ReadConcernMajorityNotAvailableYet});
+            const cursor = coll1.aggregate([{$match: {a: 1}}]);
+            assert.eq(cursor.toArray().length, 2);
+        }
+    },
+    {
+        name: "distinct retries on ReadConcernMajorityNotAvailableYet",
+        test: function() {
+            assert.commandWorked(testDB.createCollection(collName1));
+            assert.commandWorked(coll1.insert({a: 1}));
+            assert.commandWorked(coll1.insert({a: 1}));
+            assert.commandWorked(coll1.insert({a: 2}));
+            failCommandWithFailPoint(["distinct"],
+                                     {errorCode: ErrorCodes.ReadConcernMajorityNotAvailableYet});
+            assert.eq(coll1.distinct("a").sort(), [1, 2]);
+        }
+    },
+    {
+        name: "count retries on ReadConcernMajorityNotAvailableYet",
+        test: function() {
+            assert.commandWorked(testDB.createCollection(collName1));
+            assert.commandWorked(coll1.insert({a: 1}));
+            assert.commandWorked(coll1.insert({a: 1}));
+            assert.commandWorked(coll1.insert({a: 2}));
+            failCommandWithFailPoint(["count"],
+                                     {errorCode: ErrorCodes.ReadConcernMajorityNotAvailableYet});
+            assert.eq(coll1.count({a: 1}), 2);
+        }
+    },
+];
+
+const retryReadsOnNetworkErrorsWithNetworkRetryAndBackgroundReconfigTest = [
+    {
+        name: "find retries on network errors",
+        test: function() {
+            assert.commandWorked(testDB.createCollection(collName1));
+            assert.commandWorked(coll1.insert({_id: 1}));
+            failCommandWithFailPoint(["find"], {closeConnection: true});
+            assert.eq(coll1.findOne({_id: 1}), {_id: 1});
+        }
+    },
+    {
+        name: "aggregate retries on network errors",
+        test: function() {
+            assert.commandWorked(testDB.createCollection(collName1));
+            assert.commandWorked(coll1.insert({a: 1}));
+            assert.commandWorked(coll1.insert({a: 1}));
+            assert.commandWorked(coll1.insert({a: 2}));
+            failCommandWithFailPoint(["aggregate"], {closeConnection: true});
+            const cursor = coll1.aggregate([{$match: {a: 1}}]);
+            assert.eq(cursor.toArray().length, 2);
+        }
+    },
+    {
+        name: "distinct retries on network errors",
+        test: function() {
+            assert.commandWorked(testDB.createCollection(collName1));
+            assert.commandWorked(coll1.insert({a: 1}));
+            assert.commandWorked(coll1.insert({a: 1}));
+            assert.commandWorked(coll1.insert({a: 2}));
+            failCommandWithFailPoint(["distinct"], {closeConnection: true});
+            assert.eq(coll1.distinct("a").sort(), [1, 2]);
+        }
+    },
+    {
+        name: "count retries on network errors",
+        test: function() {
+            assert.commandWorked(testDB.createCollection(collName1));
+            assert.commandWorked(coll1.insert({a: 1}));
+            assert.commandWorked(coll1.insert({a: 1}));
+            assert.commandWorked(coll1.insert({a: 2}));
+            failCommandWithFailPoint(["count"], {closeConnection: true});
+            assert.eq(coll1.count({a: 1}), 2);
+        }
+    },
+];
+
+const doNotRetryReadErrorWithOutBackgroundReconfigTest = [
+    {
+        name: "find fails on ReadConcernMajorityNotAvailableYet",
+        test: function() {
+            assert.commandWorked(testDB.createCollection(collName1));
+            assert.commandWorked(coll1.insert({_id: 1}));
+            failCommandWithFailPoint(["find"],
+                                     {errorCode: ErrorCodes.ReadConcernMajorityNotAvailableYet});
+            assert.commandFailedWithCode(
+                assert.throws(function() {
+                    coll1.findOne({_id: 1});
+                }),
+                ErrorCodes.ReadConcernMajorityNotAvailableYet);
+        }
+    },
+    {
+        name: "aggregate fails on ReadConcernMajorityNotAvailableYet",
+        test: function() {
+            assert.commandWorked(testDB.createCollection(collName1));
+            assert.commandWorked(coll1.insert({a: 1}));
+            assert.commandWorked(coll1.insert({a: 1}));
+            assert.commandWorked(coll1.insert({a: 2}));
+            failCommandWithFailPoint(["aggregate"],
+                                     {errorCode: ErrorCodes.ReadConcernMajorityNotAvailableYet});
+            assert.commandFailedWithCode(
+                assert.throws(function() {
+                    const cursor = coll1.aggregate([{$match: {a: 1}}]);
+                    assert.eq(cursor.toArray().length, 2);
+                }),
+                ErrorCodes.ReadConcernMajorityNotAvailableYet);
+        }
+    },
+    {
+        name: "distinct fails on ReadConcernMajorityNotAvailableYet",
+        test: function() {
+            assert.commandWorked(testDB.createCollection(collName1));
+            assert.commandWorked(coll1.insert({a: 1}));
+            assert.commandWorked(coll1.insert({a: 1}));
+            assert.commandWorked(coll1.insert({a: 2}));
+            failCommandWithFailPoint(["distinct"],
+                                     {errorCode: ErrorCodes.ReadConcernMajorityNotAvailableYet});
+            assert.commandFailedWithCode(
+                assert.throws(function() {
+                    coll1.distinct("a");
+                }),
+                ErrorCodes.ReadConcernMajorityNotAvailableYet);
+        }
+    },
+    {
+        name: "count fails on ReadConcernMajorityNotAvailableYet",
+        test: function() {
+            assert.commandWorked(testDB.createCollection(collName1));
+            assert.commandWorked(coll1.insert({a: 1}));
+            assert.commandWorked(coll1.insert({a: 1}));
+            assert.commandWorked(coll1.insert({a: 2}));
+            failCommandWithFailPoint(["count"],
+                                     {errorCode: ErrorCodes.ReadConcernMajorityNotAvailableYet});
+            assert.commandFailedWithCode(
+                assert.throws(function() {
+                    coll1.count({a: 1});
+                }),
+                ErrorCodes.ReadConcernMajorityNotAvailableYet);
+        }
+    },
+];
+
 TestData.networkErrorAndTxnOverrideConfig = {};
 TestData.sessionOptions = new SessionOptions();
 TestData.overrideRetryAttempts = 3;
@@ -1912,6 +2074,7 @@ jsTestLog("=-=-=-=-=-= Testing with 'retry on network error' by itself. =-=-=-=-
 TestData.sessionOptions = new SessionOptions({retryWrites: true});
 TestData.networkErrorAndTxnOverrideConfig.retryOnNetworkErrors = true;
 TestData.networkErrorAndTxnOverrideConfig.wrapCRUDinTransactions = false;
+TestData.networkErrorAndTxnOverrideConfig.backgroundReconfigs = false;
 
 session = conn.startSession(TestData.sessionOptions);
 testDB = session.getDatabase(dbName);
@@ -1924,6 +2087,7 @@ jsTestLog("=-=-=-=-=-= Testing with 'txn override' by itself. =-=-=-=-=-=");
 TestData.sessionOptions = new SessionOptions({retryWrites: false});
 TestData.networkErrorAndTxnOverrideConfig.retryOnNetworkErrors = false;
 TestData.networkErrorAndTxnOverrideConfig.wrapCRUDinTransactions = true;
+TestData.networkErrorAndTxnOverrideConfig.backgroundReconfigs = false;
 
 session = conn.startSession(TestData.sessionOptions);
 testDB = session.getDatabase(dbName);
@@ -1939,6 +2103,7 @@ jsTestLog("=-=-=-=-=-= Testing 'both txn override and retry on network error'. =
 TestData.sessionOptions = new SessionOptions({retryWrites: true});
 TestData.networkErrorAndTxnOverrideConfig.retryOnNetworkErrors = true;
 TestData.networkErrorAndTxnOverrideConfig.wrapCRUDinTransactions = true;
+TestData.networkErrorAndTxnOverrideConfig.backgroundReconfigs = false;
 
 session = conn.startSession(TestData.sessionOptions);
 testDB = session.getDatabase(dbName);
@@ -1952,5 +2117,50 @@ if (usingFcv42) {
         (testCase) => runTest("txnOverridePlusRetryOnNetworkErrorTestsFcv42", testCase));
 }
 
+jsTestLog("=-=-=-=-=-= Testing 'retry on read errors from background reconfigs'. =-=-=-=-=-=");
+TestData.sessionOptions = new SessionOptions({retryWrites: false});
+TestData.networkErrorAndTxnOverrideConfig.retryOnNetworkErrors = false;
+TestData.networkErrorAndTxnOverrideConfig.backgroundReconfigs = true;
+TestData.networkErrorAndTxnOverrideConfig.wrapCRUDinTransactions = false;
+
+session = conn.startSession(TestData.sessionOptions);
+testDB = session.getDatabase(dbName);
+coll1 = testDB[collName1];
+coll2 = testDB[collName2];
+
+retryOnReadErrorsFromBackgroundReconfigTest.forEach(
+    (testCase) => runTest("retryOnReadErrorsFromBackgroundReconfigTest", testCase));
+
+jsTestLog(
+    "=-=-=-=-=-= Testing 'retry on network errors during network error retry and background reconfigs'. =-=-=-=-=-=");
+TestData.sessionOptions = new SessionOptions({retryWrites: true});
+TestData.networkErrorAndTxnOverrideConfig.retryOnNetworkErrors = true;
+TestData.networkErrorAndTxnOverrideConfig.backgroundReconfigs = true;
+TestData.networkErrorAndTxnOverrideConfig.wrapCRUDinTransactions = false;
+
+session = conn.startSession(TestData.sessionOptions);
+testDB = session.getDatabase(dbName);
+coll1 = testDB[collName1];
+coll2 = testDB[collName2];
+
+retryReadsOnNetworkErrorsWithNetworkRetryAndBackgroundReconfigTest.forEach(
+    (testCase) =>
+        runTest("retryReadsOnNetworkErrorsWithNetworkRetryAndBackgroundReconfigTest", testCase));
+
+jsTestLog(
+    "=-=-=-=-=-= Testing 'don't retry on network errors during background reconfigs'. =-=-=-=-=-=");
+TestData.sessionOptions = new SessionOptions({retryWrites: true});
+TestData.networkErrorAndTxnOverrideConfig.retryOnNetworkErrors = true;
+TestData.networkErrorAndTxnOverrideConfig.backgroundReconfigs = false;
+TestData.networkErrorAndTxnOverrideConfig.wrapCRUDinTransactions = false;
+
+session = conn.startSession(TestData.sessionOptions);
+testDB = session.getDatabase(dbName);
+coll1 = testDB[collName1];
+coll2 = testDB[collName2];
+
+doNotRetryReadErrorWithOutBackgroundReconfigTest.forEach(
+    (testCase) => runTest("doNotRetryReadErrorWithOutBackgroundReconfigTest", testCase));
+
 rst.stopSet();
 })();