diff options
author | Pavi Vetriselvan <pvselvan@umich.edu> | 2020-03-16 11:13:36 -0400 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2020-05-14 12:42:24 +0000 |
commit | f4528563033d933ca920b3e4b2a5e3344e198a5c (patch) | |
tree | 8c20856b344e02483dceb1e13f35533e41db3ecd /buildscripts | |
parent | cd9fbb56900343e7b1193922a2c4b197895e7f56 (diff) | |
download | mongo-f4528563033d933ca920b3e4b2a5e3344e198a5c.tar.gz |
SERVER-45094 add disabled replica set reconfig passthroughs
SERVER-45094 add retryable read logic to network_error_and_txn_override.js
(cherry picked from commit f59f63db6c37c0d4657b57d559c95d830b0e34c2)
SERVER-45094 add replica_sets_reconfig_jscore_passthrough suite
(cherry picked from commit 4d91fac171cbe3f2af53d9258965399e648a1947)
SERVER-45094 use w:1 writes and remove causal consistency in reconfig passthrough
(cherry picked from commit a43cb23defc6182d08a7814e4731ef98f2d30b6a)
SERVER-45094 add replica_sets_reconfig_jscore_stepdown_passthrough
(cherry picked from commit 81e0ad27c280c02a49beb65ff4473d5dce62b089)
SERVER-45094 add replica_sets_reconfig_kill_primary_jscore_passthrough
(cherry picked from commit 2debab7987b24bf902f9a128654ce928441c29a2)
SERVER-47678 stepdown and kill primary reconfig passthroughs should ignore ReplicaSetMonitorErrors
(cherry picked from commit 91672e58f1169c7edd684b911f20f62b8a71f8d1)
SERVER-47544 always increase election timeout to 24 hours in passthrough suites
(cherry picked from commit 81d53a715f49827a9f2538d4572f9b01f2b12887)
Diffstat (limited to 'buildscripts')
6 files changed, 604 insertions, 9 deletions
diff --git a/buildscripts/resmokeconfig/suites/replica_sets_reconfig_jscore_passthrough.yml b/buildscripts/resmokeconfig/suites/replica_sets_reconfig_jscore_passthrough.yml new file mode 100644 index 00000000000..8285339493b --- /dev/null +++ b/buildscripts/resmokeconfig/suites/replica_sets_reconfig_jscore_passthrough.yml @@ -0,0 +1,84 @@ +test_kind: js_test +# This suite starts a 5-node replica set and uses DoReconfigInBackground hook to periodically run +# safe reconfigs against the primary. These reconfigs change the number of voting nodes in the +# replica set, which changes the voting majority used to satisfy the config commitment check and +# oplog commitment check. + +selector: + roots: + - jstests/core/**/*.js + exclude_files: + # Transactions do not support retryability of individual operations. + # TODO: Remove this once it is supported (SERVER-33952). + - jstests/core/txns/**/*.js + # These tests are not expected to pass with replica-sets: + - jstests/core/dbadmin.js + - jstests/core/opcounters_write_cmd.js + - jstests/core/read_after_optime.js + - jstests/core/capped_update.js + # The set_param1.js test attempts to compare the response from running the {getParameter: "*"} + # command multiple times, which may observe the change to the "transactionLifetimeLimitSeconds" + # server parameter. + - jstests/core/set_param1.js + + # These test run commands using legacy queries, which are not supported on sessions. + - jstests/core/comment_field.js + - jstests/core/exhaust.js + - jstests/core/invalidated_legacy_cursors.js + - jstests/core/validate_cmd_ns.js + + # Unacknowledged writes prohibited in an explicit session. + - jstests/core/batch_write_command_w0.js + - jstests/core/crud_api.js + + - jstests/core/connection_string_validation.js # Does not expect a replica set connection string. 
+ + exclude_with_any_tags: + - assumes_read_preference_unchanged + - requires_sharding + +executor: + archive: + hooks: + - CheckReplDBHash + - CheckReplOplogs + - ValidateCollections + config: + shell_options: + eval: >- + testingReplication = true; + load('jstests/libs/override_methods/network_error_and_txn_override.js'); + load("jstests/libs/override_methods/enable_sessions.js"); + global_vars: + TestData: + networkErrorAndTxnOverrideConfig: + backgroundReconfigs: true + sessionOptions: + # Force DBClientRS to find the primary for non-write commands to make sure reads still + # work as expected during reconfigs. + readPreference: + mode: "primary" + readMode: commands + hooks: + # The CheckReplDBHash hook waits until all operations have replicated to and have been applied + # on the secondaries, so we run the ValidateCollections hook after it to ensure we're + # validating the entire contents of the collection. + - class: DoReconfigInBackground + shell_options: + nodb: "" + - class: CheckReplOplogs + - class: CheckReplDBHash + - class: ValidateCollections + - class: CleanEveryN + n: 20 + fixture: + class: ReplicaSetFixture + mongod_options: + enableMajorityReadConcern: '' + set_parameters: + enableTestCommands: 1 + logComponentVerbosity: + replication: + heartbeats: 2 + all_nodes_electable: true + num_nodes: 5 diff --git a/buildscripts/resmokeconfig/suites/replica_sets_reconfig_jscore_stepdown_passthrough.yml b/buildscripts/resmokeconfig/suites/replica_sets_reconfig_jscore_stepdown_passthrough.yml new file mode 100644 index 00000000000..d221ccba640 --- /dev/null +++ b/buildscripts/resmokeconfig/suites/replica_sets_reconfig_jscore_stepdown_passthrough.yml @@ -0,0 +1,198 @@ +test_kind: js_test +# This suite starts a 5-node replica set and runs both the DoReconfigInBackground and +# ContinuousStepdown hook to periodically run safe reconfigs and step downs against the +# primary. 
This tests that the concurrency between stepdowns and reconfigs is still ultimately +# safe. + +selector: + roots: + - jstests/core/**/*.js + exclude_files: + # Transactions do not support retryability of individual operations. + # TODO: Remove this once it is supported (SERVER-33952). + - jstests/core/txns/**/*.js + # These tests are not expected to pass with replica-sets: + - jstests/core/dbadmin.js + - jstests/core/opcounters_write_cmd.js + - jstests/core/read_after_optime.js + - jstests/core/capped_update.js + # The set_param1.js test attempts to compare the response from running the {getParameter: "*"} + # command multiple times, which may observe the change to the "transactionLifetimeLimitSeconds" + # server parameter. + - jstests/core/set_param1.js + + # No-op retries are not ignored by top, the profiler, or opcount. + - jstests/core/operation_latency_histogram.js + - jstests/core/profile2.js + - jstests/core/profile3.js + - jstests/core/profile_findandmodify.js + - jstests/core/top.js + - jstests/core/views/views_stats.js + + # TODO SERVER-31249: getLastError should not be affected by no-op retries. + - jstests/core/bulk_legacy_enforce_gle.js + + # TODO SERVER-31242: findAndModify no-op retry should respect the fields option. + - jstests/core/crud_api.js + - jstests/core/find_and_modify.js + - jstests/core/find_and_modify2.js + - jstests/core/find_and_modify_pipeline_update.js + - jstests/core/find_and_modify_server6865.js + + # These test run commands using legacy queries, which are not supported on sessions. + - jstests/core/comment_field.js + - jstests/core/exhaust.js + - jstests/core/validate_cmd_ns.js + + # Stepdown commands during fsync lock will fail. + - jstests/core/currentop.js + - jstests/core/fsync.js + - jstests/core/killop_drop_collection.js + + # Assert on the ismaster field of an isMaster response. 
If a primary steps down after accepting + # an isMaster command and returns before its connection is closed, the response can contain + # ismaster: false. + - jstests/core/dbadmin.js + - jstests/core/ismaster.js + + # Spawns new mongo shells, which don't retry connecting on stepdown errors. + - jstests/core/shell_connection_strings.js + + # Expect drops/creates to fail or have a certain response: + - jstests/core/drop.js + - jstests/core/dropdb.js + - jstests/core/explain_upsert.js + - jstests/core/indexes_multiple_commands.js + + # Expect certain responses, but retries of successfully completed commands may return + # different values: + - jstests/core/create_indexes.js + - jstests/core/objid5.js + + # Unacknowledged writes prohibited in an explicit session. + - jstests/core/batch_write_command_w0.js + + - jstests/core/bench_test*.js # benchRun() used for writes + - jstests/core/benchrun_pipeline_updates.js # benchRun() used for writes + - jstests/core/connection_string_validation.js # Does not expect a replica set connection string. + - jstests/core/list_collections_filter.js # Temporary collections are dropped on failover. + - jstests/core/top.js # Tests read commands (including getMore) against the secondary + - jstests/core/drop3.js # getMore is not causally consistent if collection is dropped + - jstests/core/validate_cmd_ns.js # Calls _exec() directly, not retryable. + - jstests/core/list_collections_filter.js # Temporary collections are dropped on failover. + - jstests/core/explain_large_bounds.js # Stepdown can timeout waiting for global lock. + + # Tests that fail for Causal Consistency as they have statements that do not support + # non-local read concern. 
+ - jstests/core/collation.js + # Parallel shell is not causally consistent + - jstests/core/benchrun_pipeline_updates.js + - jstests/core/find_and_modify_concurrent_update.js + - jstests/core/shellstartparallel.js + + exclude_with_any_tags: + ## + # The next four tags correspond to the special errors thrown by the auto_retry_on_network_error.js + # override when it refuses to run a certain command. Above each tag are the message(s) that cause + # the tag to be warranted. + ## + # "Refusing to run a test that issues a getMore command since if a network error occurs during + # it then we won't know whether the cursor was advanced or not" + - requires_getmore + # "Refusing to run a test that issues non-retryable write operations since the test likely makes + # assertions on the write results and can lead to spurious failures if a network error occurs" + - requires_non_retryable_writes + # "Refusing to run a test that issues commands that are not blindly retryable" + # "Refusing to run a test that issues an aggregation command with $out because it is not + # retryable" + - requires_non_retryable_commands + # "Refusing to run a test that issues commands that may return different values after a failover" + # "Refusing to run a test that issues an aggregation command with explain because it may return + # incomplete results" + # "Refusing to run a test that issues an aggregation command with + # $listLocalSessions because it relies on in-memory state that may not survive failovers" + # "Refusing to run a test that issues a mapReduce command, because it calls std::terminate() if + # interrupted by a stepdown" + - does_not_support_stepdowns + ## + # The next two tags correspond to the special errors thrown by the + # set_read_and_write_concerns.js override when it refuses to replace the readConcern or + # writeConcern of a particular command. Above each tag are the message(s) that cause the tag to be + # warranted. 
+ ## + # "Cowardly refusing to override read concern of command: ..." + - assumes_read_concern_unchanged + # "Cowardly refusing to override write concern of command: ..." + - assumes_write_concern_unchanged + ## The next tag corresponds to long-running operations, as they may exhaust their number + # of retries and result in a network error being thrown. + - operations_longer_than_stepdown_interval + - does_not_support_causal_consistency + - uses_transactions + # collStats is not causally consistent + - requires_collstats + - requires_dbstats + - requires_datasize + - requires_sharding + +executor: + archive: + hooks: + - CheckReplDBHash + - CheckReplOplogs + - ValidateCollections + config: + shell_options: + eval: >- + testingReplication = true; + load('jstests/libs/override_methods/network_error_and_txn_override.js'); + db = connect(TestData.connectionString); + load("jstests/libs/override_methods/enable_sessions.js"); + load("jstests/libs/override_methods/set_read_and_write_concerns.js"); + global_vars: + TestData: + runningWithCausalConsistency: true + alwaysInjectTransactionNumber: true + defaultReadConcernLevel: "majority" + logRetryAttempts: true + networkErrorAndTxnOverrideConfig: + retryOnNetworkErrors: true + backgroundReconfigs: true + overrideRetryAttempts: 3 + sessionOptions: + writeConcern: + w: "majority" + readConcern: + level: "majority" + # Force DBClientRS to find the primary for non-write commands. + readPreference: + mode: "primary" + retryWrites: true + # We specify nodb so the shell used by each test will attempt to connect after loading the + # retry logic in auto_retry_on_network_error.js. 
+ nodb: "" + readMode: commands + hooks: + - class: DoReconfigInBackground + shell_options: + nodb: "" + - class: ContinuousStepdown + # The CheckReplDBHash hook waits until all operations have replicated to and have been applied + # on the secondaries, so we run the ValidateCollections hook after it to ensure we're + # validating the entire contents of the collection. + - class: CheckReplOplogs + - class: CheckReplDBHash + - class: ValidateCollections + - class: CleanEveryN + n: 20 + fixture: + class: ReplicaSetFixture + mongod_options: + enableMajorityReadConcern: '' + set_parameters: + enableTestCommands: 1 + logComponentVerbosity: + replication: + heartbeats: 2 + all_nodes_electable: true + num_nodes: 5 diff --git a/buildscripts/resmokeconfig/suites/replica_sets_reconfig_kill_primary_jscore_passthrough.yml b/buildscripts/resmokeconfig/suites/replica_sets_reconfig_kill_primary_jscore_passthrough.yml new file mode 100644 index 00000000000..6c973e3dd06 --- /dev/null +++ b/buildscripts/resmokeconfig/suites/replica_sets_reconfig_kill_primary_jscore_passthrough.yml @@ -0,0 +1,214 @@ +test_kind: js_test +# This suite starts a 5-node replica set and runs both the DoReconfigInBackground and +# ContinuousStepdown hook with kill: true and background_reconfig: true to periodically run +# safe reconfigs and send kill signals to the primary. +# This tests that the concurrency between killing the primary and reconfigs is still +# ultimately safe. + +selector: + roots: + - jstests/core/**/*.js + exclude_files: + # Transactions do not support retryability of individual operations. + # TODO: Remove this once it is supported (SERVER-33952). 
+ - jstests/core/txns/**/*.js + # These tests are not expected to pass with replica-sets: + - jstests/core/dbadmin.js + - jstests/core/opcounters_write_cmd.js + - jstests/core/read_after_optime.js + - jstests/core/capped_update.js + # The set_param1.js test attempts to compare the response from running the {getParameter: "*"} + # command multiple times, which may observe the change to the "transactionLifetimeLimitSeconds" + # server parameter. + - jstests/core/set_param1.js + + # No-op retries are not ignored by top, the profiler, or opcount. + - jstests/core/operation_latency_histogram.js + - jstests/core/profile2.js + - jstests/core/profile3.js + - jstests/core/profile_findandmodify.js + - jstests/core/top.js + - jstests/core/views/views_stats.js + + # TODO SERVER-31249: getLastError should not be affected by no-op retries. + - jstests/core/bulk_legacy_enforce_gle.js + + # TODO SERVER-31242: findAndModify no-op retry should respect the fields option. + - jstests/core/crud_api.js + - jstests/core/find_and_modify.js + - jstests/core/find_and_modify2.js + - jstests/core/find_and_modify_pipeline_update.js + - jstests/core/find_and_modify_server6865.js + + # These test run commands using legacy queries, which are not supported on sessions. + - jstests/core/comment_field.js + - jstests/core/exhaust.js + - jstests/core/validate_cmd_ns.js + + # Stepdown commands during fsync lock will fail. + - jstests/core/currentop.js + - jstests/core/fsync.js + - jstests/core/killop_drop_collection.js + + # Assert on the ismaster field of an isMaster response. If a primary steps down after accepting + # an isMaster command and returns before its connection is closed, the response can contain + # ismaster: false. + - jstests/core/dbadmin.js + - jstests/core/ismaster.js + + # Spawns new mongo shells, which don't retry connecting on stepdown errors. 
+ - jstests/core/shell_connection_strings.js + + # Expect drops/creates to fail or have a certain response: + - jstests/core/drop.js + - jstests/core/dropdb.js + - jstests/core/explain_upsert.js + - jstests/core/indexes_multiple_commands.js + + # Expect certain responses, but retries of successfully completed commands may return + # different values: + - jstests/core/create_indexes.js + - jstests/core/objid5.js + + # Unacknowledged writes prohibited in an explicit session. + - jstests/core/batch_write_command_w0.js + + - jstests/core/bench_test*.js # benchRun() used for writes + - jstests/core/benchrun_pipeline_updates.js # benchRun() used for writes + - jstests/core/connection_string_validation.js # Does not expect a replica set connection string. + - jstests/core/list_collections_filter.js # Temporary collections are dropped on failover. + - jstests/core/top.js # Tests read commands (including getMore) against the secondary + - jstests/core/drop3.js # getMore is not causally consistent if collection is dropped + - jstests/core/validate_cmd_ns.js # Calls _exec() directly, not retryable. + - jstests/core/list_collections_filter.js # Temporary collections are dropped on failover. + - jstests/core/explain_large_bounds.js # Stepdown can timeout waiting for global lock. + + # Tests that fail for Causal Consistency as they have statements that do not support + # non-local read concern. + - jstests/core/collation.js + # Starts a parallel shell but won't restart it after unclean shutdown. + # TODO SERVER-33229: Remove these exclusions + - jstests/core/compact_keeps_indexes.js + - jstests/core/benchrun_pipeline_updates.js + - jstests/core/find_and_modify_concurrent_update.js + - jstests/core/shellstartparallel.js + + # Inserts enough data that recovery takes more than 8 seconds, so we never get a working primary. 
+ - jstests/core/geo_s2ordering.js + + exclude_with_any_tags: + ## + # The next four tags correspond to the special errors thrown by the auto_retry_on_network_error.js + # override when it refuses to run a certain command. Above each tag are the message(s) that cause + # the tag to be warranted. + ## + # "Refusing to run a test that issues a getMore command since if a network error occurs during + # it then we won't know whether the cursor was advanced or not" + - requires_getmore + # "Refusing to run a test that issues non-retryable write operations since the test likely makes + # assertions on the write results and can lead to spurious failures if a network error occurs" + - requires_non_retryable_writes + # "Refusing to run a test that issues commands that are not blindly retryable" + # "Refusing to run a test that issues an aggregation command with $out because it is not + # retryable" + - requires_non_retryable_commands + # "Refusing to run a test that issues commands that may return different values after a failover" + # "Refusing to run a test that issues an aggregation command with explain because it may return + # incomplete results" + # "Refusing to run a test that issues an aggregation command with + # $listLocalSessions because it relies on in-memory state that may not survive failovers" + # "Refusing to run a test that issues a mapReduce command, because it calls std::terminate() if + # interrupted by a stepdown" + - does_not_support_stepdowns + ## + # The next two tags correspond to the special errors thrown by the + # set_read_and_write_concerns.js override when it refuses to replace the readConcern or + # writeConcern of a particular command. Above each tag are the message(s) that cause the tag to be + # warranted. + ## + # "Cowardly refusing to override read concern of command: ..." + - assumes_read_concern_unchanged + # "Cowardly refusing to override write concern of command: ..." 
+ - assumes_write_concern_unchanged + ## + # The next four tags correspond to the special errors thrown by the + # fail_unclean_shutdown_incompatible_commands.js override when it refuses to run commands that are + # inaccurate after an unclean shutdown. Above each tag is the message that causes the tag to be + # warranted. + ## + # "Cowardly fail if fastcount is run with a mongod that had an unclean shutdown: ..." + - requires_fastcount + # "Cowardly fail if dbStats is run with a mongod that had an unclean shutdown: ..." + - requires_dbstats + # "Cowardly fail if collStats is run with a mongod that had an unclean shutdown: ..." + - requires_collstats + # "Cowardly fail if unbounded dataSize is run with a mongod that had an unclean shutdown: ..." + - requires_datasize + ## The next tag corresponds to long-running operations, as they may exhaust their number + # of retries and result in a network error being thrown. + - operations_longer_than_stepdown_interval + +executor: + archive: + hooks: + - CheckReplDBHash + - CheckReplOplogs + - ValidateCollections + config: + shell_options: + eval: >- + testingReplication = true; + load('jstests/libs/override_methods/network_error_and_txn_override.js'); + db = connect(TestData.connectionString); + load("jstests/libs/override_methods/enable_sessions.js"); + load("jstests/libs/override_methods/set_read_and_write_concerns.js"); + load("jstests/libs/override_methods/fail_unclean_shutdown_incompatible_commands.js"); + load("jstests/libs/override_methods/fail_unclean_shutdown_start_parallel_shell.js"); + global_vars: + TestData: + alwaysInjectTransactionNumber: true + defaultReadConcernLevel: "majority" + logRetryAttempts: true + networkErrorAndTxnOverrideConfig: + retryOnNetworkErrors: true + backgroundReconfigs: true + overrideRetryAttempts: 3 + sessionOptions: + writeConcern: + w: "majority" + readConcern: + level: "majority" + # Force DBClientRS to find the primary for non-write commands. 
+ readPreference: + mode: "primary" + retryWrites: true + # We specify nodb so the shell used by each test will attempt to connect after loading the + # retry logic in auto_retry_on_network_error.js. + nodb: "" + readMode: commands + hooks: + - class: DoReconfigInBackground + shell_options: + nodb: "" + - class: ContinuousStepdown + kill: true + background_reconfig: true + # The CheckReplDBHash hook waits until all operations have replicated to and have been applied + # on the secondaries, so we run the ValidateCollections hook after it to ensure we're + # validating the entire contents of the collection. + - class: CheckReplOplogs + - class: CheckReplDBHash + - class: ValidateCollections + - class: CleanEveryN + n: 20 + fixture: + class: ReplicaSetFixture + mongod_options: + enableMajorityReadConcern: '' + set_parameters: + enableTestCommands: 1 + logComponentVerbosity: + replication: + heartbeats: 2 + all_nodes_electable: true + num_nodes: 5 diff --git a/buildscripts/resmokelib/testing/fixtures/replicaset.py b/buildscripts/resmokelib/testing/fixtures/replicaset.py index fb8e90b5296..5de9996de1c 100644 --- a/buildscripts/resmokelib/testing/fixtures/replicaset.py +++ b/buildscripts/resmokelib/testing/fixtures/replicaset.py @@ -187,12 +187,10 @@ class ReplicaSetFixture(interface.ReplFixture): # pylint: disable=too-many-inst replset_settings = self.replset_config_options["settings"] repl_config["settings"] = replset_settings - # If not all nodes are electable and no election timeout was specified, then we increase - # the election timeout to 24 hours to prevent spurious elections. - if not self.all_nodes_electable: - repl_config.setdefault("settings", {}) - if "electionTimeoutMillis" not in repl_config["settings"]: - repl_config["settings"]["electionTimeoutMillis"] = 24 * 60 * 60 * 1000 + # Increase the election timeout to 24 hours to prevent spurious elections. 
+ repl_config.setdefault("settings", {}) + if "electionTimeoutMillis" not in repl_config["settings"]: + repl_config["settings"]["electionTimeoutMillis"] = 24 * 60 * 60 * 1000 # Start up a single node replica set then reconfigure to the correct size (if the config # contains more than 1 node), so the primary is elected more quickly. @@ -500,6 +498,16 @@ class ReplicaSetFixture(interface.ReplFixture): # pylint: disable=too-many-inst primary = self.get_primary() return [node for node in self.nodes if node.port != primary.port] + def get_voting_members(self): + """Return the hosts of the voting members of the replica set.""" + primary = self.get_primary() + client = primary.mongo_client() + + members = client.admin.command({"replSetGetConfig": 1})['config']['members'] + voting_members = [member['host'] for member in members if member['votes'] == 1] + + return voting_members + def get_initial_sync_node(self): """Return initial sync node from the replica set.""" return self.initial_sync_node diff --git a/buildscripts/resmokelib/testing/hooks/reconfig_background.py b/buildscripts/resmokelib/testing/hooks/reconfig_background.py new file mode 100644 index 00000000000..dbf9b33a242 --- /dev/null +++ b/buildscripts/resmokelib/testing/hooks/reconfig_background.py @@ -0,0 +1,71 @@ +"""Test hook for running safe reconfigs against the primary of a replica set. + +This hook runs continuously in a background thread while the test is running. 
+""" + +import os.path + +from buildscripts.resmokelib import errors +from buildscripts.resmokelib.testing.hooks import jsfile +from buildscripts.resmokelib.testing.testcases import interface as testcase +from buildscripts.resmokelib.testing.hooks.background_job import _BackgroundJob, _ContinuousDynamicJSTestCase + + +class DoReconfigInBackground(jsfile.JSHook): + """A hook for running a safe reconfig against a replica set while a test is running.""" + + def __init__(self, hook_logger, fixture, shell_options=None): + """Initialize DoReconfigInBackground.""" + description = "Run reconfigs against the primary while the test is running." + js_filename = os.path.join("jstests", "hooks", "run_reconfig_background.js") + jsfile.JSHook.__init__(self, hook_logger, fixture, js_filename, description, + shell_options=shell_options) + + self._background_job = None + + def before_suite(self, test_report): + """Start the background thread.""" + self._background_job = _BackgroundJob("ReconfigInBackground") + self.logger.info("Starting the background reconfig thread.") + self._background_job.start() + + def after_suite(self, test_report): + """Signal the background thread to exit, and wait until it does.""" + if self._background_job is None: + return + + self.logger.info("Stopping the background reconfig thread.") + self._background_job.stop() + + def before_test(self, test, test_report): + """Instruct the background thread to run reconfigs while 'test' is also running.""" + if self._background_job is None: + return + + hook_test_case = _ContinuousDynamicJSTestCase.create_before_test( + self.logger.test_case_logger, test, self, self._js_filename, self._shell_options) + hook_test_case.configure(self.fixture) + + self.logger.info("Resuming the background reconfig thread.") + self._background_job.resume(hook_test_case, test_report) + + def after_test(self, test, test_report): # noqa: D205,D400 + """Instruct the background thread to stop running reconfigs now that 'test' has + finished 
running. + """ + if self._background_job is None: + return + + self.logger.info("Pausing the background reconfig thread.") + self._background_job.pause() + + if self._background_job.exc_info is not None: + if isinstance(self._background_job.exc_info[1], errors.TestFailure): + # If the mongo shell process running the JavaScript file exited with a non-zero + # return code, then we raise an errors.ServerFailure exception to cause resmoke.py's + # test execution to stop. + raise errors.ServerFailure(self._background_job.exc_info[1].args[0]) + else: + self.logger.error("Encountered an error inside the background reconfig thread.", + exc_info=self._background_job.exc_info) + raise self._background_job.exc_info[1] diff --git a/buildscripts/resmokelib/testing/hooks/stepdown.py b/buildscripts/resmokelib/testing/hooks/stepdown.py index de2c89e9a38..4cfd09fd52d 100644 --- a/buildscripts/resmokelib/testing/hooks/stepdown.py +++ b/buildscripts/resmokelib/testing/hooks/stepdown.py @@ -27,7 +27,7 @@ class ContinuousStepdown(interface.Hook): # pylint: disable=too-many-instance-a self, hook_logger, fixture, config_stepdown=True, shard_stepdown=True, stepdown_interval_ms=8000, terminate=False, kill=False, use_stepdown_permitted_file=False, wait_for_mongos_retarget=False, - stepdown_via_heartbeats=True): + stepdown_via_heartbeats=True, background_reconfig=False): """Initialize the ContinuousStepdown. Args: @@ -64,6 +64,8 @@ class ContinuousStepdown(interface.Hook): # pylint: disable=too-many-instance-a self._terminate = terminate or kill self._kill = kill + self._background_reconfig = background_reconfig + # The stepdown file names need to match the same construction as found in # jstests/concurrency/fsm_libs/resmoke_runner.js. 
dbpath_prefix = fixture.get_dbpath_prefix() @@ -87,7 +89,7 @@ class ContinuousStepdown(interface.Hook): # pylint: disable=too-many-instance-a self._stepdown_thread = _StepdownThread( self.logger, self._mongos_fixtures, self._rs_fixtures, self._stepdown_interval_secs, self._terminate, self._kill, lifecycle, self._wait_for_mongos_retarget, - self._stepdown_via_heartbeats) + self._stepdown_via_heartbeats, self._background_reconfig) self.logger.info("Starting the stepdown thread.") self._stepdown_thread.start() @@ -348,7 +350,8 @@ class FileBasedStepdownLifecycle(object): class _StepdownThread(threading.Thread): # pylint: disable=too-many-instance-attributes def __init__( # pylint: disable=too-many-arguments self, logger, mongos_fixtures, rs_fixtures, stepdown_interval_secs, terminate, kill, - stepdown_lifecycle, wait_for_mongos_retarget, stepdown_via_heartbeats): + stepdown_lifecycle, wait_for_mongos_retarget, stepdown_via_heartbeats, + background_reconfig): """Initialize _StepdownThread.""" threading.Thread.__init__(self, name="StepdownThread") self.daemon = True @@ -365,6 +368,7 @@ class _StepdownThread(threading.Thread): # pylint: disable=too-many-instance-at self.__lifecycle = stepdown_lifecycle self._should_wait_for_mongos_retarget = wait_for_mongos_retarget self._stepdown_via_heartbeats = stepdown_via_heartbeats + self._background_reconfig = background_reconfig self._last_exec = time.time() # Event set when the thread has been stopped using the 'stop()' method. @@ -474,6 +478,22 @@ class _StepdownThread(threading.Thread): # pylint: disable=too-many-instance-at rs_fixture.replset_name)) if self._terminate: + # If we're running with background reconfigs, it's possible to be in a scenario + # where we kill a necessary voting node (i.e. in a 5 node repl set), only 2 are + # voting. In this scenario, we want to avoid killing the primary because no + # secondary can step up. 
+ if self._background_reconfig: + # stagger the kill thread so that it runs a little after the reconfig thread + time.sleep(1) + voting_members = rs_fixture.get_voting_members() + + self.logger.info("Current voting members: %s", voting_members) + + if len(voting_members) <= 3: + # Do not kill or terminate the primary if we don't have enough voting nodes to + # elect a new primary. + return + should_kill = self._kill and random.choice([True, False]) action = "Killing" if should_kill else "Terminating" self.logger.info("%s the primary on port %d of replica set '%s'.", action, primary.port, |