summaryrefslogtreecommitdiff
path: root/buildscripts
diff options
context:
space:
mode:
authorPavi Vetriselvan <pvselvan@umich.edu>2020-03-16 11:13:36 -0400
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2020-05-14 12:42:24 +0000
commitf4528563033d933ca920b3e4b2a5e3344e198a5c (patch)
tree8c20856b344e02483dceb1e13f35533e41db3ecd /buildscripts
parentcd9fbb56900343e7b1193922a2c4b197895e7f56 (diff)
downloadmongo-f4528563033d933ca920b3e4b2a5e3344e198a5c.tar.gz
SERVER-45094 add disabled replica set reconfig passthroughs
SERVER-45094 add retryable read logic to network_error_and_txn_override.js (cherry picked from commit f59f63db6c37c0d4657b57d559c95d830b0e34c2) SERVER-45094 add replica_sets_reconfig_jscore_passthrough suite (cherry picked from commit 4d91fac171cbe3f2af53d9258965399e648a1947) SERVER-45094 use w:1 writes and remove causal consistency in reconfig passthrough (cherry picked from commit a43cb23defc6182d08a7814e4731ef98f2d30b6a) SERVER-45094 add replica_sets_reconfig_jscore_stepdown_passthrough (cherry picked from commit 81e0ad27c280c02a49beb65ff4473d5dce62b089) SERVER-45094 add replica_sets_reconfig_kill_primary_jscore_passthrough (cherry picked from commit 2debab7987b24bf902f9a128654ce928441c29a2) SERVER-47678 stepdown and kill primary reconfig passthroughs should ignore ReplicaSetMonitorErrors (cherry picked from commit 91672e58f1169c7edd684b911f20f62b8a71f8d1) SERVER-47544 always increase election timeout to 24 hours in passthrough suites (cherry picked from commit 81d53a715f49827a9f2538d4572f9b01f2b12887)
Diffstat (limited to 'buildscripts')
-rw-r--r--buildscripts/resmokeconfig/suites/replica_sets_reconfig_jscore_passthrough.yml84
-rw-r--r--buildscripts/resmokeconfig/suites/replica_sets_reconfig_jscore_stepdown_passthrough.yml198
-rw-r--r--buildscripts/resmokeconfig/suites/replica_sets_reconfig_kill_primary_jscore_passthrough.yml214
-rw-r--r--buildscripts/resmokelib/testing/fixtures/replicaset.py20
-rw-r--r--buildscripts/resmokelib/testing/hooks/reconfig_background.py71
-rw-r--r--buildscripts/resmokelib/testing/hooks/stepdown.py26
6 files changed, 604 insertions, 9 deletions
diff --git a/buildscripts/resmokeconfig/suites/replica_sets_reconfig_jscore_passthrough.yml b/buildscripts/resmokeconfig/suites/replica_sets_reconfig_jscore_passthrough.yml
new file mode 100644
index 00000000000..8285339493b
--- /dev/null
+++ b/buildscripts/resmokeconfig/suites/replica_sets_reconfig_jscore_passthrough.yml
@@ -0,0 +1,84 @@
+test_kind: js_test
+# This suite starts a 5-node replica set and uses DoReconfigInBackground hook to periodically run
+# safe reconfigs against the primary. These reconfigs change the number of voting nodes in the
+# replica set, which changes the voting majority used to satisfy the config commitment check and
+# oplog commitment check.
+
+selector:
+ roots:
+ - jstests/core/**/*.js
+ exclude_files:
+ # Transactions do not support retryability of individual operations.
+ # TODO: Remove this once it is supported (SERVER-33952).
+ - jstests/core/txns/**/*.js
+ # These tests are not expected to pass with replica-sets:
+ - jstests/core/dbadmin.js
+ - jstests/core/opcounters_write_cmd.js
+ - jstests/core/read_after_optime.js
+ - jstests/core/capped_update.js
+ # The set_param1.js test attempts to compare the response from running the {getParameter: "*"}
+ # command multiple times, which may observe the change to the "transactionLifetimeLimitSeconds"
+ # server parameter.
+ - jstests/core/set_param1.js
+
+ # These tests run commands using legacy queries, which are not supported on sessions.
+ - jstests/core/comment_field.js
+ - jstests/core/exhaust.js
+ - jstests/core/invalidated_legacy_cursors.js
+ - jstests/core/validate_cmd_ns.js
+
+ # Unacknowledged writes prohibited in an explicit session.
+ - jstests/core/batch_write_command_w0.js
+ - jstests/core/crud_api.js
+
+ - jstests/core/connection_string_validation.js # Does not expect a replica set connection string.
+
+ exclude_with_any_tags:
+ - assumes_read_preference_unchanged
+ - requires_sharding
+
+executor:
+ archive:
+ hooks:
+ - CheckReplDBHash
+ - CheckReplOplogs
+ - ValidateCollections
+ config:
+ shell_options:
+ eval: >-
+ testingReplication = true;
+ load('jstests/libs/override_methods/network_error_and_txn_override.js');
+ load("jstests/libs/override_methods/enable_sessions.js");
+ global_vars:
+ TestData:
+ networkErrorAndTxnOverrideConfig:
+ backgroundReconfigs: true
+ sessionOptions:
+ # Force DBClientRS to find the primary for non-write commands to make sure reads still
+ # work as expected during reconfigs.
+ readPreference:
+ mode: "primary"
+ readMode: commands
+ hooks:
+ # The CheckReplDBHash hook waits until all operations have replicated to and have been applied
+ # on the secondaries, so we run the ValidateCollections hook after it to ensure we're
+ # validating the entire contents of the collection.
+ - class: DoReconfigInBackground
+ shell_options:
+ nodb: ""
+ - class: CheckReplOplogs
+ - class: CheckReplDBHash
+ - class: ValidateCollections
+ - class: CleanEveryN
+ n: 20
+ fixture:
+ class: ReplicaSetFixture
+ mongod_options:
+ enableMajorityReadConcern: ''
+ set_parameters:
+ enableTestCommands: 1
+ logComponentVerbosity:
+ replication:
+ heartbeats: 2
+ all_nodes_electable: true
+ num_nodes: 5
diff --git a/buildscripts/resmokeconfig/suites/replica_sets_reconfig_jscore_stepdown_passthrough.yml b/buildscripts/resmokeconfig/suites/replica_sets_reconfig_jscore_stepdown_passthrough.yml
new file mode 100644
index 00000000000..d221ccba640
--- /dev/null
+++ b/buildscripts/resmokeconfig/suites/replica_sets_reconfig_jscore_stepdown_passthrough.yml
@@ -0,0 +1,198 @@
+test_kind: js_test
+# This suite starts a 5-node replica set and runs both the DoReconfigInBackground and
+# ContinuousStepdown hook to periodically run safe reconfigs and step downs against the
+# primary. This tests that the concurrency between stepdowns and reconfigs is still ultimately
+# safe.
+
+selector:
+ roots:
+ - jstests/core/**/*.js
+ exclude_files:
+ # Transactions do not support retryability of individual operations.
+ # TODO: Remove this once it is supported (SERVER-33952).
+ - jstests/core/txns/**/*.js
+ # These tests are not expected to pass with replica-sets:
+ - jstests/core/dbadmin.js
+ - jstests/core/opcounters_write_cmd.js
+ - jstests/core/read_after_optime.js
+ - jstests/core/capped_update.js
+ # The set_param1.js test attempts to compare the response from running the {getParameter: "*"}
+ # command multiple times, which may observe the change to the "transactionLifetimeLimitSeconds"
+ # server parameter.
+ - jstests/core/set_param1.js
+
+ # No-op retries are not ignored by top, the profiler, or opcount.
+ - jstests/core/operation_latency_histogram.js
+ - jstests/core/profile2.js
+ - jstests/core/profile3.js
+ - jstests/core/profile_findandmodify.js
+ - jstests/core/top.js
+ - jstests/core/views/views_stats.js
+
+ # TODO SERVER-31249: getLastError should not be affected by no-op retries.
+ - jstests/core/bulk_legacy_enforce_gle.js
+
+ # TODO SERVER-31242: findAndModify no-op retry should respect the fields option.
+ - jstests/core/crud_api.js
+ - jstests/core/find_and_modify.js
+ - jstests/core/find_and_modify2.js
+ - jstests/core/find_and_modify_pipeline_update.js
+ - jstests/core/find_and_modify_server6865.js
+
+ # These tests run commands using legacy queries, which are not supported on sessions.
+ - jstests/core/comment_field.js
+ - jstests/core/exhaust.js
+ - jstests/core/validate_cmd_ns.js
+
+ # Stepdown commands during fsync lock will fail.
+ - jstests/core/currentop.js
+ - jstests/core/fsync.js
+ - jstests/core/killop_drop_collection.js
+
+ # Assert on the ismaster field of an isMaster response. If a primary steps down after accepting
+ # an isMaster command and returns before its connection is closed, the response can contain
+ # ismaster: false.
+ - jstests/core/dbadmin.js
+ - jstests/core/ismaster.js
+
+ # Spawns new mongo shells, which don't retry connecting on stepdown errors.
+ - jstests/core/shell_connection_strings.js
+
+ # Expect drops/creates to fail or have a certain response:
+ - jstests/core/drop.js
+ - jstests/core/dropdb.js
+ - jstests/core/explain_upsert.js
+ - jstests/core/indexes_multiple_commands.js
+
+ # Expect certain responses, but retries of successfully completed commands may return
+ # different values:
+ - jstests/core/create_indexes.js
+ - jstests/core/objid5.js
+
+ # Unacknowledged writes prohibited in an explicit session.
+ - jstests/core/batch_write_command_w0.js
+
+ - jstests/core/bench_test*.js # benchRun() used for writes
+ - jstests/core/benchrun_pipeline_updates.js # benchRun() used for writes
+ - jstests/core/connection_string_validation.js # Does not expect a replica set connection string.
+ - jstests/core/list_collections_filter.js # Temporary collections are dropped on failover.
+ - jstests/core/top.js # Tests read commands (including getMore) against the secondary
+ - jstests/core/drop3.js # getMore is not causally consistent if collection is dropped
+ - jstests/core/validate_cmd_ns.js # Calls _exec() directly, not retryable.
+ - jstests/core/list_collections_filter.js # Temporary collections are dropped on failover.
+ - jstests/core/explain_large_bounds.js # Stepdown can timeout waiting for global lock.
+
+ # Tests that fail for Causal Consistency as they have statements that do not support
+ # non-local read concern.
+ - jstests/core/collation.js
+ # Parallel shell is not causally consistent
+ - jstests/core/benchrun_pipeline_updates.js
+ - jstests/core/find_and_modify_concurrent_update.js
+ - jstests/core/shellstartparallel.js
+
+ exclude_with_any_tags:
+ ##
+ # The next four tags correspond to the special errors thrown by the auto_retry_on_network_error.js
+ # override when it refuses to run a certain command. Above each tag are the message(s) that cause
+ # the tag to be warranted.
+ ##
+ # "Refusing to run a test that issues a getMore command since if a network error occurs during
+ # it then we won't know whether the cursor was advanced or not"
+ - requires_getmore
+ # "Refusing to run a test that issues non-retryable write operations since the test likely makes
+ # assertions on the write results and can lead to spurious failures if a network error occurs"
+ - requires_non_retryable_writes
+ # "Refusing to run a test that issues commands that are not blindly retryable"
+ # "Refusing to run a test that issues an aggregation command with $out because it is not
+ # retryable"
+ - requires_non_retryable_commands
+ # "Refusing to run a test that issues commands that may return different values after a failover"
+ # "Refusing to run a test that issues an aggregation command with explain because it may return
+ # incomplete results"
+ # "Refusing to run a test that issues an aggregation command with
+ # $listLocalSessions because it relies on in-memory state that may not survive failovers"
+ # "Refusing to run a test that issues a mapReduce command, because it calls std::terminate() if
+ # interrupted by a stepdown"
+ - does_not_support_stepdowns
+ ##
+ # The next two tags correspond to the special errors thrown by the
+ # set_read_and_write_concerns.js override when it refuses to replace the readConcern or
+ # writeConcern of a particular command. Above each tag are the message(s) that cause the tag to be
+ # warranted.
+ ##
+ # "Cowardly refusing to override read concern of command: ..."
+ - assumes_read_concern_unchanged
+ # "Cowardly refusing to override write concern of command: ..."
+ - assumes_write_concern_unchanged
+ ## The next tag corresponds to long-running operations, as they may exhaust their number
+ # of retries and result in a network error being thrown.
+ - operations_longer_than_stepdown_interval
+ - does_not_support_causal_consistency
+ - uses_transactions
+ # collStats is not causally consistent
+ - requires_collstats
+ - requires_dbstats
+ - requires_datasize
+ - requires_sharding
+
+executor:
+ archive:
+ hooks:
+ - CheckReplDBHash
+ - CheckReplOplogs
+ - ValidateCollections
+ config:
+ shell_options:
+ eval: >-
+ testingReplication = true;
+ load('jstests/libs/override_methods/network_error_and_txn_override.js');
+ db = connect(TestData.connectionString);
+ load("jstests/libs/override_methods/enable_sessions.js");
+ load("jstests/libs/override_methods/set_read_and_write_concerns.js");
+ global_vars:
+ TestData:
+ runningWithCausalConsistency: true
+ alwaysInjectTransactionNumber: true
+ defaultReadConcernLevel: "majority"
+ logRetryAttempts: true
+ networkErrorAndTxnOverrideConfig:
+ retryOnNetworkErrors: true
+ backgroundReconfigs: true
+ overrideRetryAttempts: 3
+ sessionOptions:
+ writeConcern:
+ w: "majority"
+ readConcern:
+ level: "majority"
+ # Force DBClientRS to find the primary for non-write commands.
+ readPreference:
+ mode: "primary"
+ retryWrites: true
+ # We specify nodb so the shell used by each test will attempt to connect after loading the
+ # retry logic in auto_retry_on_network_error.js.
+ nodb: ""
+ readMode: commands
+ hooks:
+ - class: DoReconfigInBackground
+ shell_options:
+ nodb: ""
+ - class: ContinuousStepdown
+ # The CheckReplDBHash hook waits until all operations have replicated to and have been applied
+ # on the secondaries, so we run the ValidateCollections hook after it to ensure we're
+ # validating the entire contents of the collection.
+ - class: CheckReplOplogs
+ - class: CheckReplDBHash
+ - class: ValidateCollections
+ - class: CleanEveryN
+ n: 20
+ fixture:
+ class: ReplicaSetFixture
+ mongod_options:
+ enableMajorityReadConcern: ''
+ set_parameters:
+ enableTestCommands: 1
+ logComponentVerbosity:
+ replication:
+ heartbeats: 2
+ all_nodes_electable: true
+ num_nodes: 5
diff --git a/buildscripts/resmokeconfig/suites/replica_sets_reconfig_kill_primary_jscore_passthrough.yml b/buildscripts/resmokeconfig/suites/replica_sets_reconfig_kill_primary_jscore_passthrough.yml
new file mode 100644
index 00000000000..6c973e3dd06
--- /dev/null
+++ b/buildscripts/resmokeconfig/suites/replica_sets_reconfig_kill_primary_jscore_passthrough.yml
@@ -0,0 +1,214 @@
+test_kind: js_test
+# This suite starts a 5-node replica set and runs both the DoReconfigInBackground and
+# ContinuousStepdown hook with kill: true and background_reconfig: true to periodically run
+# safe reconfigs and send kill signals to the primary.
+# This tests that the concurrency between killing the primary and reconfigs is still
+# ultimately safe.
+
+selector:
+ roots:
+ - jstests/core/**/*.js
+ exclude_files:
+ # Transactions do not support retryability of individual operations.
+ # TODO: Remove this once it is supported (SERVER-33952).
+ - jstests/core/txns/**/*.js
+ # These tests are not expected to pass with replica-sets:
+ - jstests/core/dbadmin.js
+ - jstests/core/opcounters_write_cmd.js
+ - jstests/core/read_after_optime.js
+ - jstests/core/capped_update.js
+ # The set_param1.js test attempts to compare the response from running the {getParameter: "*"}
+ # command multiple times, which may observe the change to the "transactionLifetimeLimitSeconds"
+ # server parameter.
+ - jstests/core/set_param1.js
+
+ # No-op retries are not ignored by top, the profiler, or opcount.
+ - jstests/core/operation_latency_histogram.js
+ - jstests/core/profile2.js
+ - jstests/core/profile3.js
+ - jstests/core/profile_findandmodify.js
+ - jstests/core/top.js
+ - jstests/core/views/views_stats.js
+
+ # TODO SERVER-31249: getLastError should not be affected by no-op retries.
+ - jstests/core/bulk_legacy_enforce_gle.js
+
+ # TODO SERVER-31242: findAndModify no-op retry should respect the fields option.
+ - jstests/core/crud_api.js
+ - jstests/core/find_and_modify.js
+ - jstests/core/find_and_modify2.js
+ - jstests/core/find_and_modify_pipeline_update.js
+ - jstests/core/find_and_modify_server6865.js
+
+ # These tests run commands using legacy queries, which are not supported on sessions.
+ - jstests/core/comment_field.js
+ - jstests/core/exhaust.js
+ - jstests/core/validate_cmd_ns.js
+
+ # Stepdown commands during fsync lock will fail.
+ - jstests/core/currentop.js
+ - jstests/core/fsync.js
+ - jstests/core/killop_drop_collection.js
+
+ # Assert on the ismaster field of an isMaster response. If a primary steps down after accepting
+ # an isMaster command and returns before its connection is closed, the response can contain
+ # ismaster: false.
+ - jstests/core/dbadmin.js
+ - jstests/core/ismaster.js
+
+ # Spawns new mongo shells, which don't retry connecting on stepdown errors.
+ - jstests/core/shell_connection_strings.js
+
+ # Expect drops/creates to fail or have a certain response:
+ - jstests/core/drop.js
+ - jstests/core/dropdb.js
+ - jstests/core/explain_upsert.js
+ - jstests/core/indexes_multiple_commands.js
+
+ # Expect certain responses, but retries of successfully completed commands may return
+ # different values:
+ - jstests/core/create_indexes.js
+ - jstests/core/objid5.js
+
+ # Unacknowledged writes prohibited in an explicit session.
+ - jstests/core/batch_write_command_w0.js
+
+ - jstests/core/bench_test*.js # benchRun() used for writes
+ - jstests/core/benchrun_pipeline_updates.js # benchRun() used for writes
+ - jstests/core/connection_string_validation.js # Does not expect a replica set connection string.
+ - jstests/core/list_collections_filter.js # Temporary collections are dropped on failover.
+ - jstests/core/top.js # Tests read commands (including getMore) against the secondary
+ - jstests/core/drop3.js # getMore is not causally consistent if collection is dropped
+ - jstests/core/validate_cmd_ns.js # Calls _exec() directly, not retryable.
+ - jstests/core/list_collections_filter.js # Temporary collections are dropped on failover.
+ - jstests/core/explain_large_bounds.js # Stepdown can timeout waiting for global lock.
+
+ # Tests that fail for Causal Consistency as they have statements that do not support
+ # non-local read concern.
+ - jstests/core/collation.js
+ # Starts a parallel shell but won't restart it after unclean shutdown.
+ # TODO SERVER-33229: Remove these exclusions
+ - jstests/core/compact_keeps_indexes.js
+ - jstests/core/benchrun_pipeline_updates.js
+ - jstests/core/find_and_modify_concurrent_update.js
+ - jstests/core/shellstartparallel.js
+
+ # Inserts enough data that recovery takes more than 8 seconds, so we never get a working primary.
+ - jstests/core/geo_s2ordering.js
+
+ exclude_with_any_tags:
+ ##
+ # The next four tags correspond to the special errors thrown by the auto_retry_on_network_error.js
+ # override when it refuses to run a certain command. Above each tag are the message(s) that cause
+ # the tag to be warranted.
+ ##
+ # "Refusing to run a test that issues a getMore command since if a network error occurs during
+ # it then we won't know whether the cursor was advanced or not"
+ - requires_getmore
+ # "Refusing to run a test that issues non-retryable write operations since the test likely makes
+ # assertions on the write results and can lead to spurious failures if a network error occurs"
+ - requires_non_retryable_writes
+ # "Refusing to run a test that issues commands that are not blindly retryable"
+ # "Refusing to run a test that issues an aggregation command with $out because it is not
+ # retryable"
+ - requires_non_retryable_commands
+ # "Refusing to run a test that issues commands that may return different values after a failover"
+ # "Refusing to run a test that issues an aggregation command with explain because it may return
+ # incomplete results"
+ # "Refusing to run a test that issues an aggregation command with
+ # $listLocalSessions because it relies on in-memory state that may not survive failovers"
+ # "Refusing to run a test that issues a mapReduce command, because it calls std::terminate() if
+ # interrupted by a stepdown"
+ - does_not_support_stepdowns
+ ##
+ # The next two tags correspond to the special errors thrown by the
+ # set_read_and_write_concerns.js override when it refuses to replace the readConcern or
+ # writeConcern of a particular command. Above each tag are the message(s) that cause the tag to be
+ # warranted.
+ ##
+ # "Cowardly refusing to override read concern of command: ..."
+ - assumes_read_concern_unchanged
+ # "Cowardly refusing to override write concern of command: ..."
+ - assumes_write_concern_unchanged
+ ##
+ # The next four tags correspond to the special errors thrown by the
+ # fail_unclean_shutdown_incompatible_commands.js override when it refuses to run commands that are
+ # inaccurate after an unclean shutdown. Above each tag is the message that causes the tag to be
+ # warranted.
+ ##
+ # "Cowardly fail if fastcount is run with a mongod that had an unclean shutdown: ..."
+ - requires_fastcount
+ # "Cowardly fail if dbStats is run with a mongod that had an unclean shutdown: ..."
+ - requires_dbstats
+ # "Cowardly fail if collStats is run with a mongod that had an unclean shutdown: ..."
+ - requires_collstats
+ # "Cowardly fail if unbounded dataSize is run with a mongod that had an unclean shutdown: ..."
+ - requires_datasize
+ ## The next tag corresponds to long-running operations, as they may exhaust their number
+ # of retries and result in a network error being thrown.
+ - operations_longer_than_stepdown_interval
+
+executor:
+ archive:
+ hooks:
+ - CheckReplDBHash
+ - CheckReplOplogs
+ - ValidateCollections
+ config:
+ shell_options:
+ eval: >-
+ testingReplication = true;
+ load('jstests/libs/override_methods/network_error_and_txn_override.js');
+ db = connect(TestData.connectionString);
+ load("jstests/libs/override_methods/enable_sessions.js");
+ load("jstests/libs/override_methods/set_read_and_write_concerns.js");
+ load("jstests/libs/override_methods/fail_unclean_shutdown_incompatible_commands.js");
+ load("jstests/libs/override_methods/fail_unclean_shutdown_start_parallel_shell.js");
+ global_vars:
+ TestData:
+ alwaysInjectTransactionNumber: true
+ defaultReadConcernLevel: "majority"
+ logRetryAttempts: true
+ networkErrorAndTxnOverrideConfig:
+ retryOnNetworkErrors: true
+ backgroundReconfigs: true
+ overrideRetryAttempts: 3
+ sessionOptions:
+ writeConcern:
+ w: "majority"
+ readConcern:
+ level: "majority"
+ # Force DBClientRS to find the primary for non-write commands.
+ readPreference:
+ mode: "primary"
+ retryWrites: true
+ # We specify nodb so the shell used by each test will attempt to connect after loading the
+ # retry logic in auto_retry_on_network_error.js.
+ nodb: ""
+ readMode: commands
+ hooks:
+ - class: DoReconfigInBackground
+ shell_options:
+ nodb: ""
+ - class: ContinuousStepdown
+ kill: true
+ background_reconfig: true
+ # The CheckReplDBHash hook waits until all operations have replicated to and have been applied
+ # on the secondaries, so we run the ValidateCollections hook after it to ensure we're
+ # validating the entire contents of the collection.
+ - class: CheckReplOplogs
+ - class: CheckReplDBHash
+ - class: ValidateCollections
+ - class: CleanEveryN
+ n: 20
+ fixture:
+ class: ReplicaSetFixture
+ mongod_options:
+ enableMajorityReadConcern: ''
+ set_parameters:
+ enableTestCommands: 1
+ logComponentVerbosity:
+ replication:
+ heartbeats: 2
+ all_nodes_electable: true
+ num_nodes: 5
diff --git a/buildscripts/resmokelib/testing/fixtures/replicaset.py b/buildscripts/resmokelib/testing/fixtures/replicaset.py
index fb8e90b5296..5de9996de1c 100644
--- a/buildscripts/resmokelib/testing/fixtures/replicaset.py
+++ b/buildscripts/resmokelib/testing/fixtures/replicaset.py
@@ -187,12 +187,10 @@ class ReplicaSetFixture(interface.ReplFixture): # pylint: disable=too-many-inst
replset_settings = self.replset_config_options["settings"]
repl_config["settings"] = replset_settings
- # If not all nodes are electable and no election timeout was specified, then we increase
- # the election timeout to 24 hours to prevent spurious elections.
- if not self.all_nodes_electable:
- repl_config.setdefault("settings", {})
- if "electionTimeoutMillis" not in repl_config["settings"]:
- repl_config["settings"]["electionTimeoutMillis"] = 24 * 60 * 60 * 1000
+ # Increase the election timeout to 24 hours to prevent spurious elections.
+ repl_config.setdefault("settings", {})
+ if "electionTimeoutMillis" not in repl_config["settings"]:
+ repl_config["settings"]["electionTimeoutMillis"] = 24 * 60 * 60 * 1000
# Start up a single node replica set then reconfigure to the correct size (if the config
# contains more than 1 node), so the primary is elected more quickly.
@@ -500,6 +498,16 @@ class ReplicaSetFixture(interface.ReplFixture): # pylint: disable=too-many-inst
primary = self.get_primary()
return [node for node in self.nodes if node.port != primary.port]
+ def get_voting_members(self):
+ """Return the hosts of the voting members (votes == 1) in the replica set config."""
+ primary = self.get_primary()
+ client = primary.mongo_client()
+
+ members = client.admin.command({"replSetGetConfig": 1})['config']['members']
+ voting_members = [member['host'] for member in members if member['votes'] == 1]
+
+ return voting_members
+
def get_initial_sync_node(self):
"""Return initial sync node from the replica set."""
return self.initial_sync_node
diff --git a/buildscripts/resmokelib/testing/hooks/reconfig_background.py b/buildscripts/resmokelib/testing/hooks/reconfig_background.py
new file mode 100644
index 00000000000..dbf9b33a242
--- /dev/null
+++ b/buildscripts/resmokelib/testing/hooks/reconfig_background.py
@@ -0,0 +1,71 @@
+"""Test hook for running safe reconfigs against the primary of a replica set.
+
+This hook runs continuously in a background thread while the test is running.
+"""
+
+import os.path
+
+from buildscripts.resmokelib import errors
+from buildscripts.resmokelib.testing.hooks import jsfile
+from buildscripts.resmokelib.testing.testcases import interface as testcase
+from buildscripts.resmokelib.testing.hooks.background_job import _BackgroundJob, _ContinuousDynamicJSTestCase
+
+
+class DoReconfigInBackground(jsfile.JSHook):
+ """A hook for running a safe reconfig against a replica set while a test is running."""
+
+ def __init__(self, hook_logger, fixture, shell_options=None):
+ """Initialize DoReconfigInBackground."""
+ description = "Run reconfigs against the primary while the test is running."
+ js_filename = os.path.join("jstests", "hooks", "run_reconfig_background.js")
+ jsfile.JSHook.__init__(self, hook_logger, fixture, js_filename, description,
+ shell_options=shell_options)
+
+ self._background_job = None
+
+ def before_suite(self, test_report):
+ """Start the background thread."""
+ self._background_job = _BackgroundJob("ReconfigInBackground")
+ self.logger.info("Starting the background reconfig thread.")
+ self._background_job.start()
+
+ def after_suite(self, test_report):
+ """Signal the background thread to exit, and wait until it does."""
+ if self._background_job is None:
+ return
+
+ self.logger.info("Stopping the background reconfig thread.")
+ self._background_job.stop()
+
+ def before_test(self, test, test_report):
+ """Instruct the background thread to run reconfigs while 'test' is also running."""
+ if self._background_job is None:
+ return
+
+ hook_test_case = _ContinuousDynamicJSTestCase.create_before_test(
+ self.logger.test_case_logger, test, self, self._js_filename, self._shell_options)
+ hook_test_case.configure(self.fixture)
+
+ self.logger.info("Resuming the background reconfig thread.")
+ self._background_job.resume(hook_test_case, test_report)
+
+ def after_test(self, test, test_report): # noqa: D205,D400
+ """Instruct the background thread to stop running reconfigs now that 'test' has
+ finished running.
+ """
+ if self._background_job is None:
+ return
+
+ self.logger.info("Pausing the background reconfig thread.")
+ self._background_job.pause()
+
+ if self._background_job.exc_info is not None:
+ if isinstance(self._background_job.exc_info[1], errors.TestFailure):
+ # If the mongo shell process running the JavaScript file exited with a non-zero
+ # return code, then we raise an errors.ServerFailure exception to cause resmoke.py's
+ # test execution to stop.
+ raise errors.ServerFailure(self._background_job.exc_info[1].args[0])
+ else:
+ self.logger.error("Encountered an error inside the background reconfig thread.",
+ exc_info=self._background_job.exc_info)
+ raise self._background_job.exc_info[1]
diff --git a/buildscripts/resmokelib/testing/hooks/stepdown.py b/buildscripts/resmokelib/testing/hooks/stepdown.py
index de2c89e9a38..4cfd09fd52d 100644
--- a/buildscripts/resmokelib/testing/hooks/stepdown.py
+++ b/buildscripts/resmokelib/testing/hooks/stepdown.py
@@ -27,7 +27,7 @@ class ContinuousStepdown(interface.Hook): # pylint: disable=too-many-instance-a
self, hook_logger, fixture, config_stepdown=True, shard_stepdown=True,
stepdown_interval_ms=8000, terminate=False, kill=False,
use_stepdown_permitted_file=False, wait_for_mongos_retarget=False,
- stepdown_via_heartbeats=True):
+ stepdown_via_heartbeats=True, background_reconfig=False):
"""Initialize the ContinuousStepdown.
Args:
@@ -64,6 +64,8 @@ class ContinuousStepdown(interface.Hook): # pylint: disable=too-many-instance-a
self._terminate = terminate or kill
self._kill = kill
+ self._background_reconfig = background_reconfig
+
# The stepdown file names need to match the same construction as found in
# jstests/concurrency/fsm_libs/resmoke_runner.js.
dbpath_prefix = fixture.get_dbpath_prefix()
@@ -87,7 +89,7 @@ class ContinuousStepdown(interface.Hook): # pylint: disable=too-many-instance-a
self._stepdown_thread = _StepdownThread(
self.logger, self._mongos_fixtures, self._rs_fixtures, self._stepdown_interval_secs,
self._terminate, self._kill, lifecycle, self._wait_for_mongos_retarget,
- self._stepdown_via_heartbeats)
+ self._stepdown_via_heartbeats, self._background_reconfig)
self.logger.info("Starting the stepdown thread.")
self._stepdown_thread.start()
@@ -348,7 +350,8 @@ class FileBasedStepdownLifecycle(object):
class _StepdownThread(threading.Thread): # pylint: disable=too-many-instance-attributes
def __init__( # pylint: disable=too-many-arguments
self, logger, mongos_fixtures, rs_fixtures, stepdown_interval_secs, terminate, kill,
- stepdown_lifecycle, wait_for_mongos_retarget, stepdown_via_heartbeats):
+ stepdown_lifecycle, wait_for_mongos_retarget, stepdown_via_heartbeats,
+ background_reconfig):
"""Initialize _StepdownThread."""
threading.Thread.__init__(self, name="StepdownThread")
self.daemon = True
@@ -365,6 +368,7 @@ class _StepdownThread(threading.Thread): # pylint: disable=too-many-instance-at
self.__lifecycle = stepdown_lifecycle
self._should_wait_for_mongos_retarget = wait_for_mongos_retarget
self._stepdown_via_heartbeats = stepdown_via_heartbeats
+ self._background_reconfig = background_reconfig
self._last_exec = time.time()
# Event set when the thread has been stopped using the 'stop()' method.
@@ -474,6 +478,22 @@ class _StepdownThread(threading.Thread): # pylint: disable=too-many-instance-at
rs_fixture.replset_name))
if self._terminate:
+ # If we're running with background reconfigs, it's possible to be in a scenario
+ # where we kill a necessary voting node (e.g. in a 5-node replica set where only
+ # 2 nodes are voting). In this scenario, we want to avoid killing the primary
+ # because no secondary can step up.
+ if self._background_reconfig:
+ # stagger the kill thread so that it runs a little after the reconfig thread
+ time.sleep(1)
+ voting_members = rs_fixture.get_voting_members()
+
+ self.logger.info("Current voting members: %s", voting_members)
+
+ if len(voting_members) <= 3:
+ # Do not kill or terminate the primary if we don't have enough voting nodes to
+ # elect a new primary.
+ return
+
should_kill = self._kill and random.choice([True, False])
action = "Killing" if should_kill else "Terminating"
self.logger.info("%s the primary on port %d of replica set '%s'.", action, primary.port,