SERVER-45094 add disabled replica set reconfig passthroughs

SERVER-45094 add retryable read logic to network_error_and_txn_override.js (cherry picked from commit f59f63db6c37c0d4657b57d559c95d830b0e34c2) SERVER-45094 add replica_sets_reconfig_jscore_passthrough suite (cherry picked from commit 4d91fac171cbe3f2af53d9258965399e648a1947) SERVER-45094 use w:1 writes and remove causal consistency in reconfig passthrough (cherry picked from commit a43cb23defc6182d08a7814e4731ef98f2d30b6a) SERVER-45094 add replica_sets_reconfig_jscore_stepdown_passthrough (cherry picked from commit 81e0ad27c280c02a49beb65ff4473d5dce62b089) SERVER-45094 add replica_sets_reconfig_kill_primary_jscore_passthrough (cherry picked from commit 2debab7987b24bf902f9a128654ce928441c29a2) SERVER-47678 stepdown and kill primary reconfig passthroughs should ignore ReplicaSetMonitorErrors (cherry picked from commit 91672e58f1169c7edd684b911f20f62b8a71f8d1) SERVER-47544 always increase election timeout to 24 hours in passthrough suites (cherry picked from commit 81d53a715f49827a9f2538d4572f9b01f2b12887)
author: Pavi Vetriselvan <pvselvan@umich.edu> 2020-03-16 11:13:36 -0400
committer: Evergreen Agent <no-reply@evergreen.mongodb.com> 2020-05-14 12:42:24 +0000
commit: f4528563033d933ca920b3e4b2a5e3344e198a5c (patch)
tree: 8c20856b344e02483dceb1e13f35533e41db3ecd /buildscripts
parent: cd9fbb56900343e7b1193922a2c4b197895e7f56 (diff)
download: mongo-f4528563033d933ca920b3e4b2a5e3344e198a5c.tar.gz
6 files changed, 604 insertions, 9 deletions
diff --git a/buildscripts/resmokeconfig/suites/replica_sets_reconfig_jscore_passthrough.yml b/buildscripts/resmokeconfig/suites/replica_sets_reconfig_jscore_passthrough.yml
new file mode 100644
index 00000000000..8285339493b
--- /dev/null
+++ b/buildscripts/resmokeconfig/suites/replica_sets_reconfig_jscore_passthrough.yml
@@ -0,0 +1,84 @@
+test_kind: js_test
+# This suite starts a 5-node replica set and uses DoReconfigInBackground hook to periodically run
+# safe reconfigs against the primary. These reconfigs change the number of voting nodes in the
+# replica set, which changes the voting majority used to satisfy the config commitment check and
+# oplog commitment check.
+
+selector:
+  roots:
+  - jstests/core/**/*.js
+  exclude_files:
+  # Transactions do not support retryability of individual operations.
+  # TODO: Remove this once it is supported (SERVER-33952).
+  - jstests/core/txns/**/*.js
+  # These tests are not expected to pass with replica-sets:
+  - jstests/core/dbadmin.js
+  - jstests/core/opcounters_write_cmd.js
+  - jstests/core/read_after_optime.js
+  - jstests/core/capped_update.js
+  # The set_param1.js test attempts to compare the response from running the {getParameter: "*"}
+  # command multiple times, which may observe the change to the "transactionLifetimeLimitSeconds"
+  # server parameter.
+  - jstests/core/set_param1.js
+
+  # These tests run commands using legacy queries, which are not supported on sessions.
+  - jstests/core/comment_field.js
+  - jstests/core/exhaust.js
+  - jstests/core/invalidated_legacy_cursors.js
+  - jstests/core/validate_cmd_ns.js
+
+  # Unacknowledged writes prohibited in an explicit session.
+  - jstests/core/batch_write_command_w0.js
+  - jstests/core/crud_api.js
+
+  - jstests/core/connection_string_validation.js # Does not expect a replica set connection string.
+
+  exclude_with_any_tags:
+  - assumes_read_preference_unchanged
+  - requires_sharding
+
+executor:
+  archive:
+    hooks:
+      - CheckReplDBHash
+      - CheckReplOplogs
+      - ValidateCollections
+  config:
+    shell_options:
+      eval: >-
+        testingReplication = true;
+        load('jstests/libs/override_methods/network_error_and_txn_override.js');
+        load("jstests/libs/override_methods/enable_sessions.js");
+      global_vars:
+        TestData:
+          networkErrorAndTxnOverrideConfig:
+            backgroundReconfigs: true
+          sessionOptions:
+            # Force DBClientRS to find the primary for non-write commands to make sure reads still
+            # work as expected during reconfigs.
+            readPreference:
+              mode: "primary"
+      readMode: commands
+  hooks:
+  # The CheckReplDBHash hook waits until all operations have replicated to and have been applied
+  # on the secondaries, so we run the ValidateCollections hook after it to ensure we're
+  # validating the entire contents of the collection.
+  - class: DoReconfigInBackground
+    shell_options:
+      nodb: ""
+  - class: CheckReplOplogs
+  - class: CheckReplDBHash
+  - class: ValidateCollections
+  - class: CleanEveryN
+    n: 20
+  fixture:
+    class: ReplicaSetFixture
+    mongod_options:
+      enableMajorityReadConcern: ''
+      set_parameters:
+        enableTestCommands: 1
+        logComponentVerbosity:
+          replication:
+            heartbeats: 2
+    all_nodes_electable: true
+    num_nodes: 5
diff --git a/buildscripts/resmokeconfig/suites/replica_sets_reconfig_jscore_stepdown_passthrough.yml b/buildscripts/resmokeconfig/suites/replica_sets_reconfig_jscore_stepdown_passthrough.yml
new file mode 100644
index 00000000000..d221ccba640
--- /dev/null
+++ b/buildscripts/resmokeconfig/suites/replica_sets_reconfig_jscore_stepdown_passthrough.yml
@@ -0,0 +1,198 @@
+test_kind: js_test
+# This suite starts a 5-node replica set and runs both the DoReconfigInBackground and
+# ContinuousStepdown hooks to periodically run safe reconfigs and step downs against the
+# primary. This tests that the concurrency between stepdowns and reconfigs is still ultimately
+# safe.
+
+selector:
+  roots:
+  - jstests/core/**/*.js
+  exclude_files:
+  # Transactions do not support retryability of individual operations.
+  # TODO: Remove this once it is supported (SERVER-33952).
+  - jstests/core/txns/**/*.js
+  # These tests are not expected to pass with replica-sets:
+  - jstests/core/dbadmin.js
+  - jstests/core/opcounters_write_cmd.js
+  - jstests/core/read_after_optime.js
+  - jstests/core/capped_update.js
+  # The set_param1.js test attempts to compare the response from running the {getParameter: "*"}
+  # command multiple times, which may observe the change to the "transactionLifetimeLimitSeconds"
+  # server parameter.
+  - jstests/core/set_param1.js
+
+  # No-op retries are not ignored by top, the profiler, or opcount.
+  - jstests/core/operation_latency_histogram.js
+  - jstests/core/profile2.js
+  - jstests/core/profile3.js
+  - jstests/core/profile_findandmodify.js
+  - jstests/core/top.js
+  - jstests/core/views/views_stats.js
+
+  # TODO SERVER-31249: getLastError should not be affected by no-op retries.
+  - jstests/core/bulk_legacy_enforce_gle.js
+
+  # TODO SERVER-31242: findAndModify no-op retry should respect the fields option.
+  - jstests/core/crud_api.js
+  - jstests/core/find_and_modify.js
+  - jstests/core/find_and_modify2.js
+  - jstests/core/find_and_modify_pipeline_update.js
+  - jstests/core/find_and_modify_server6865.js
+
+  # These tests run commands using legacy queries, which are not supported on sessions.
+  - jstests/core/comment_field.js
+  - jstests/core/exhaust.js
+  - jstests/core/validate_cmd_ns.js
+
+  # Stepdown commands during fsync lock will fail.
+  - jstests/core/currentop.js
+  - jstests/core/fsync.js
+  - jstests/core/killop_drop_collection.js
+
+  # Assert on the ismaster field of an isMaster response. If a primary steps down after accepting
+  # an isMaster command and returns before its connection is closed, the response can contain
+  # ismaster: false.
+  - jstests/core/dbadmin.js
+  - jstests/core/ismaster.js
+
+  # Spawns new mongo shells, which don't retry connecting on stepdown errors.
+  - jstests/core/shell_connection_strings.js
+
+  # Expect drops/creates to fail or have a certain response:
+  - jstests/core/drop.js
+  - jstests/core/dropdb.js
+  - jstests/core/explain_upsert.js
+  - jstests/core/indexes_multiple_commands.js
+
+  # Expect certain responses, but retries of successfully completed commands may return
+  # different values:
+  - jstests/core/create_indexes.js
+  - jstests/core/objid5.js
+
+  # Unacknowledged writes prohibited in an explicit session.
+  - jstests/core/batch_write_command_w0.js
+
+  - jstests/core/bench_test*.js # benchRun() used for writes
+  - jstests/core/benchrun_pipeline_updates.js # benchRun() used for writes
+  - jstests/core/connection_string_validation.js # Does not expect a replica set connection string.
+  - jstests/core/list_collections_filter.js # Temporary collections are dropped on failover.
+  - jstests/core/top.js # Tests read commands (including getMore) against the secondary
+  - jstests/core/drop3.js # getMore is not causally consistent if collection is dropped
+  - jstests/core/validate_cmd_ns.js # Calls _exec() directly, not retryable.
+  - jstests/core/list_collections_filter.js # Temporary collections are dropped on failover.
+  - jstests/core/explain_large_bounds.js # Stepdown can timeout waiting for global lock.
+
+  # Tests that fail for Causal Consistency as they have statements that do not support
+  # non-local read concern.
+  - jstests/core/collation.js
+  # Parallel shell is not causally consistent
+  - jstests/core/benchrun_pipeline_updates.js
+  - jstests/core/find_and_modify_concurrent_update.js
+  - jstests/core/shellstartparallel.js
+
+  exclude_with_any_tags:
+  ##
+  # The next four tags correspond to the special errors thrown by the
+  # network_error_and_txn_override.js override when it refuses to run a certain command. Above each
+  # tag are the message(s) that cause the tag to be warranted.
+  ##
+  # "Refusing to run a test that issues a getMore command since if a network error occurs during
+  #   it then we won't know whether the cursor was advanced or not"
+  - requires_getmore
+  # "Refusing to run a test that issues non-retryable write operations since the test likely makes
+  #   assertions on the write results and can lead to spurious failures if a network error occurs"
+  - requires_non_retryable_writes
+  # "Refusing to run a test that issues commands that are not blindly retryable"
+  # "Refusing to run a test that issues an aggregation command with $out because it is not
+  #   retryable"
+  - requires_non_retryable_commands
+  # "Refusing to run a test that issues commands that may return different values after a failover"
+  # "Refusing to run a test that issues an aggregation command with explain because it may return
+  #   incomplete results"
+  # "Refusing to run a test that issues an aggregation command with
+  #   $listLocalSessions because it relies on in-memory state that may not survive failovers"
+  # "Refusing to run a test that issues a mapReduce command, because it calls std::terminate() if
+  #   interrupted by a stepdown"
+  - does_not_support_stepdowns
+  ##
+  # The next two tags correspond to the special errors thrown by the
+  # set_read_and_write_concerns.js override when it refuses to replace the readConcern or
+  # writeConcern of a particular command. Above each tag are the message(s) that cause the tag to be
+  # warranted.
+  ##
+  # "Cowardly refusing to override read concern of command: ..."
+  - assumes_read_concern_unchanged
+  # "Cowardly refusing to override write concern of command: ..."
+  - assumes_write_concern_unchanged
+  ## The next tag corresponds to long-running operations, as they may exhaust their number
+  # of retries and result in a network error being thrown.
+  - operations_longer_than_stepdown_interval
+  - does_not_support_causal_consistency
+  - uses_transactions
+  # collStats is not causally consistent
+  - requires_collstats
+  - requires_dbstats
+  - requires_datasize
+  - requires_sharding
+
+executor:
+  archive:
+    hooks:
+      - CheckReplDBHash
+      - CheckReplOplogs
+      - ValidateCollections
+  config:
+    shell_options:
+      eval: >-
+        testingReplication = true;
+        load('jstests/libs/override_methods/network_error_and_txn_override.js');
+        db = connect(TestData.connectionString);
+        load("jstests/libs/override_methods/enable_sessions.js");
+        load("jstests/libs/override_methods/set_read_and_write_concerns.js");
+      global_vars:
+        TestData:
+          runningWithCausalConsistency: true
+          alwaysInjectTransactionNumber: true
+          defaultReadConcernLevel: "majority"
+          logRetryAttempts: true
+          networkErrorAndTxnOverrideConfig:
+            retryOnNetworkErrors: true
+            backgroundReconfigs: true
+          overrideRetryAttempts: 3
+          sessionOptions:
+            writeConcern:
+              w: "majority"
+            readConcern:
+              level: "majority"
+            # Force DBClientRS to find the primary for non-write commands.
+            readPreference:
+              mode: "primary"
+            retryWrites: true
+      # We specify nodb so the shell used by each test will attempt to connect after loading the
+      # retry logic in network_error_and_txn_override.js.
+      nodb: ""
+      readMode: commands
+  hooks:
+  - class: DoReconfigInBackground
+    shell_options:
+      nodb: ""
+  - class: ContinuousStepdown
+  # The CheckReplDBHash hook waits until all operations have replicated to and have been applied
+  # on the secondaries, so we run the ValidateCollections hook after it to ensure we're
+  # validating the entire contents of the collection.
+  - class: CheckReplOplogs
+  - class: CheckReplDBHash
+  - class: ValidateCollections
+  - class: CleanEveryN
+    n: 20
+  fixture:
+    class: ReplicaSetFixture
+    mongod_options:
+      enableMajorityReadConcern: ''
+      set_parameters:
+        enableTestCommands: 1
+        logComponentVerbosity:
+          replication:
+            heartbeats: 2
+    all_nodes_electable: true
+    num_nodes: 5
diff --git a/buildscripts/resmokeconfig/suites/replica_sets_reconfig_kill_primary_jscore_passthrough.yml b/buildscripts/resmokeconfig/suites/replica_sets_reconfig_kill_primary_jscore_passthrough.yml
new file mode 100644
index 00000000000..6c973e3dd06
--- /dev/null
+++ b/buildscripts/resmokeconfig/suites/replica_sets_reconfig_kill_primary_jscore_passthrough.yml
@@ -0,0 +1,214 @@
+test_kind: js_test
+# This suite starts a 5-node replica set and runs both the DoReconfigInBackground and
+# ContinuousStepdown hook with kill: true and background_reconfig: true to periodically run
+# safe reconfigs and send kill signals to the primary.
+# This tests that the concurrency between killing the primary and reconfigs is still
+# ultimately safe.
+
+selector:
+  roots:
+  - jstests/core/**/*.js
+  exclude_files:
+  # Transactions do not support retryability of individual operations.
+  # TODO: Remove this once it is supported (SERVER-33952).
+  - jstests/core/txns/**/*.js
+  # These tests are not expected to pass with replica-sets:
+  - jstests/core/dbadmin.js
+  - jstests/core/opcounters_write_cmd.js
+  - jstests/core/read_after_optime.js
+  - jstests/core/capped_update.js
+  # The set_param1.js test attempts to compare the response from running the {getParameter: "*"}
+  # command multiple times, which may observe the change to the "transactionLifetimeLimitSeconds"
+  # server parameter.
+  - jstests/core/set_param1.js
+
+  # No-op retries are not ignored by top, the profiler, or opcount.
+  - jstests/core/operation_latency_histogram.js
+  - jstests/core/profile2.js
+  - jstests/core/profile3.js
+  - jstests/core/profile_findandmodify.js
+  - jstests/core/top.js
+  - jstests/core/views/views_stats.js
+
+  # TODO SERVER-31249: getLastError should not be affected by no-op retries.
+  - jstests/core/bulk_legacy_enforce_gle.js
+
+  # TODO SERVER-31242: findAndModify no-op retry should respect the fields option.
+  - jstests/core/crud_api.js
+  - jstests/core/find_and_modify.js
+  - jstests/core/find_and_modify2.js
+  - jstests/core/find_and_modify_pipeline_update.js
+  - jstests/core/find_and_modify_server6865.js
+
+  # These tests run commands using legacy queries, which are not supported on sessions.
+  - jstests/core/comment_field.js
+  - jstests/core/exhaust.js
+  - jstests/core/validate_cmd_ns.js
+
+  # Stepdown commands during fsync lock will fail.
+  - jstests/core/currentop.js
+  - jstests/core/fsync.js
+  - jstests/core/killop_drop_collection.js
+
+  # Assert on the ismaster field of an isMaster response. If a primary steps down after accepting
+  # an isMaster command and returns before its connection is closed, the response can contain
+  # ismaster: false.
+  - jstests/core/dbadmin.js
+  - jstests/core/ismaster.js
+
+  # Spawns new mongo shells, which don't retry connecting on stepdown errors.
+  - jstests/core/shell_connection_strings.js
+
+  # Expect drops/creates to fail or have a certain response:
+  - jstests/core/drop.js
+  - jstests/core/dropdb.js
+  - jstests/core/explain_upsert.js
+  - jstests/core/indexes_multiple_commands.js
+
+  # Expect certain responses, but retries of successfully completed commands may return
+  # different values:
+  - jstests/core/create_indexes.js
+  - jstests/core/objid5.js
+
+  # Unacknowledged writes prohibited in an explicit session.
+  - jstests/core/batch_write_command_w0.js
+
+  - jstests/core/bench_test*.js # benchRun() used for writes
+  - jstests/core/benchrun_pipeline_updates.js # benchRun() used for writes
+  - jstests/core/connection_string_validation.js # Does not expect a replica set connection string.
+  - jstests/core/list_collections_filter.js # Temporary collections are dropped on failover.
+  - jstests/core/top.js # Tests read commands (including getMore) against the secondary
+  - jstests/core/drop3.js # getMore is not causally consistent if collection is dropped
+  - jstests/core/validate_cmd_ns.js # Calls _exec() directly, not retryable.
+  - jstests/core/list_collections_filter.js # Temporary collections are dropped on failover.
+  - jstests/core/explain_large_bounds.js # Stepdown can timeout waiting for global lock.
+
+  # Tests that fail for Causal Consistency as they have statements that do not support
+  # non-local read concern.
+  - jstests/core/collation.js
+  # Starts a parallel shell but won't restart it after unclean shutdown.
+  # TODO SERVER-33229: Remove these exclusions
+  - jstests/core/compact_keeps_indexes.js
+  - jstests/core/benchrun_pipeline_updates.js
+  - jstests/core/find_and_modify_concurrent_update.js
+  - jstests/core/shellstartparallel.js
+
+  # Inserts enough data that recovery takes more than 8 seconds, so we never get a working primary.
+  - jstests/core/geo_s2ordering.js
+
+  exclude_with_any_tags:
+  ##
+  # The next four tags correspond to the special errors thrown by the
+  # network_error_and_txn_override.js override when it refuses to run a certain command. Above each
+  # tag are the message(s) that cause the tag to be warranted.
+  ##
+  # "Refusing to run a test that issues a getMore command since if a network error occurs during
+  #   it then we won't know whether the cursor was advanced or not"
+  - requires_getmore
+  # "Refusing to run a test that issues non-retryable write operations since the test likely makes
+  #   assertions on the write results and can lead to spurious failures if a network error occurs"
+  - requires_non_retryable_writes
+  # "Refusing to run a test that issues commands that are not blindly retryable"
+  # "Refusing to run a test that issues an aggregation command with $out because it is not
+  #   retryable"
+  - requires_non_retryable_commands
+  # "Refusing to run a test that issues commands that may return different values after a failover"
+  # "Refusing to run a test that issues an aggregation command with explain because it may return
+  #   incomplete results"
+  # "Refusing to run a test that issues an aggregation command with
+  #   $listLocalSessions because it relies on in-memory state that may not survive failovers"
+  # "Refusing to run a test that issues a mapReduce command, because it calls std::terminate() if
+  #   interrupted by a stepdown"
+  - does_not_support_stepdowns
+  ##
+  # The next two tags correspond to the special errors thrown by the
+  # set_read_and_write_concerns.js override when it refuses to replace the readConcern or
+  # writeConcern of a particular command. Above each tag are the message(s) that cause the tag to be
+  # warranted.
+  ##
+  # "Cowardly refusing to override read concern of command: ..."
+  - assumes_read_concern_unchanged
+  # "Cowardly refusing to override write concern of command: ..."
+  - assumes_write_concern_unchanged
+  ##
+  # The next four tags correspond to the special errors thrown by the
+  # fail_unclean_shutdown_incompatible_commands.js override when it refuses to run commands that are
+  # inaccurate after an unclean shutdown. Above each tag is the message that causes the tag to be
+  # warranted.
+  ##
+  # "Cowardly fail if fastcount is run with a mongod that had an unclean shutdown: ..."
+  - requires_fastcount
+  # "Cowardly fail if dbStats is run with a mongod that had an unclean shutdown: ..."
+  - requires_dbstats
+  # "Cowardly fail if collStats is run with a mongod that had an unclean shutdown: ..."
+  - requires_collstats
+  # "Cowardly fail if unbounded dataSize is run with a mongod that had an unclean shutdown: ..."
+  - requires_datasize
+  ## The next tag corresponds to long-running operations, as they may exhaust their number
+  # of retries and result in a network error being thrown.
+  - operations_longer_than_stepdown_interval
+
+executor:
+  archive:
+    hooks:
+      - CheckReplDBHash
+      - CheckReplOplogs
+      - ValidateCollections
+  config:
+    shell_options:
+      eval: >-
+        testingReplication = true;
+        load('jstests/libs/override_methods/network_error_and_txn_override.js');
+        db = connect(TestData.connectionString);
+        load("jstests/libs/override_methods/enable_sessions.js");
+        load("jstests/libs/override_methods/set_read_and_write_concerns.js");
+        load("jstests/libs/override_methods/fail_unclean_shutdown_incompatible_commands.js");
+        load("jstests/libs/override_methods/fail_unclean_shutdown_start_parallel_shell.js");
+      global_vars:
+        TestData:
+          alwaysInjectTransactionNumber: true
+          defaultReadConcernLevel: "majority"
+          logRetryAttempts: true
+          networkErrorAndTxnOverrideConfig:
+            retryOnNetworkErrors: true
+            backgroundReconfigs: true
+          overrideRetryAttempts: 3
+          sessionOptions:
+            writeConcern:
+              w: "majority"
+            readConcern:
+              level: "majority"
+            # Force DBClientRS to find the primary for non-write commands.
+            readPreference:
+              mode: "primary"
+            retryWrites: true
+      # We specify nodb so the shell used by each test will attempt to connect after loading the
+      # retry logic in network_error_and_txn_override.js.
+      nodb: ""
+      readMode: commands
+  hooks:
+  - class: DoReconfigInBackground
+    shell_options:
+      nodb: ""
+  - class: ContinuousStepdown
+    kill: true
+    background_reconfig: true
+  # The CheckReplDBHash hook waits until all operations have replicated to and have been applied
+  # on the secondaries, so we run the ValidateCollections hook after it to ensure we're
+  # validating the entire contents of the collection.
+  - class: CheckReplOplogs
+  - class: CheckReplDBHash
+  - class: ValidateCollections
+  - class: CleanEveryN
+    n: 20
+  fixture:
+    class: ReplicaSetFixture
+    mongod_options:
+      enableMajorityReadConcern: ''
+      set_parameters:
+        enableTestCommands: 1
+        logComponentVerbosity:
+          replication:
+            heartbeats: 2
+    all_nodes_electable: true
+    num_nodes: 5
diff --git a/buildscripts/resmokelib/testing/fixtures/replicaset.py b/buildscripts/resmokelib/testing/fixtures/replicaset.py
index fb8e90b5296..5de9996de1c 100644
--- a/buildscripts/resmokelib/testing/fixtures/replicaset.py
+++ b/buildscripts/resmokelib/testing/fixtures/replicaset.py
@@ -187,12 +187,10 @@ class ReplicaSetFixture(interface.ReplFixture):  # pylint: disable=too-many-inst
             replset_settings = self.replset_config_options["settings"]
             repl_config["settings"] = replset_settings
 
-        # If not all nodes are electable and no election timeout was specified, then we increase
-        # the election timeout to 24 hours to prevent spurious elections.
-        if not self.all_nodes_electable:
-            repl_config.setdefault("settings", {})
-            if "electionTimeoutMillis" not in repl_config["settings"]:
-                repl_config["settings"]["electionTimeoutMillis"] = 24 * 60 * 60 * 1000
+        # Increase the election timeout to 24 hours to prevent spurious elections.
+        repl_config.setdefault("settings", {})
+        if "electionTimeoutMillis" not in repl_config["settings"]:
+            repl_config["settings"]["electionTimeoutMillis"] = 24 * 60 * 60 * 1000
 
         # Start up a single node replica set then reconfigure to the correct size (if the config
         # contains more than 1 node), so the primary is elected more quickly.
@@ -500,6 +498,16 @@ class ReplicaSetFixture(interface.ReplFixture):  # pylint: disable=too-many-inst
         primary = self.get_primary()
         return [node for node in self.nodes if node.port != primary.port]
 
+    def get_voting_members(self):
+        """Return the hosts of the voting nodes in the replica set."""
+        primary = self.get_primary()
+        client = primary.mongo_client()
+
+        members = client.admin.command({"replSetGetConfig": 1})['config']['members']
+        voting_members = [member['host'] for member in members if member['votes'] == 1]
+
+        return voting_members
+
     def get_initial_sync_node(self):
         """Return initial sync node from the replica set."""
         return self.initial_sync_node
diff --git a/buildscripts/resmokelib/testing/hooks/reconfig_background.py b/buildscripts/resmokelib/testing/hooks/reconfig_background.py
new file mode 100644
index 00000000000..dbf9b33a242
--- /dev/null
+++ b/buildscripts/resmokelib/testing/hooks/reconfig_background.py
@@ -0,0 +1,71 @@
+"""Test hook for running safe reconfigs against the primary of a replica set.
+
+This hook runs continuously in a background thread while the test is running.
+"""
+
+import os.path
+
+from buildscripts.resmokelib import errors
+from buildscripts.resmokelib.testing.hooks import jsfile
+from buildscripts.resmokelib.testing.testcases import interface as testcase
+from buildscripts.resmokelib.testing.hooks.background_job import _BackgroundJob, _ContinuousDynamicJSTestCase
+
+
+class DoReconfigInBackground(jsfile.JSHook):
+    """A hook for running a safe reconfig against a replica set while a test is running."""
+
+    def __init__(self, hook_logger, fixture, shell_options=None):
+        """Initialize DoReconfigInBackground."""
+        description = "Run reconfigs against the primary while the test is running."
+        js_filename = os.path.join("jstests", "hooks", "run_reconfig_background.js")
+        jsfile.JSHook.__init__(self, hook_logger, fixture, js_filename, description,
+                               shell_options=shell_options)
+
+        self._background_job = None
+
+    def before_suite(self, test_report):
+        """Start the background thread."""
+        self._background_job = _BackgroundJob("ReconfigInBackground")
+        self.logger.info("Starting the background reconfig thread.")
+        self._background_job.start()
+
+    def after_suite(self, test_report):
+        """Signal the background thread to exit, and wait until it does."""
+        if self._background_job is None:
+            return
+
+        self.logger.info("Stopping the background reconfig thread.")
+        self._background_job.stop()
+
+    def before_test(self, test, test_report):
+        """Instruct the background thread to run reconfigs while 'test' is also running."""
+        if self._background_job is None:
+            return
+
+        hook_test_case = _ContinuousDynamicJSTestCase.create_before_test(
+            self.logger.test_case_logger, test, self, self._js_filename, self._shell_options)
+        hook_test_case.configure(self.fixture)
+
+        self.logger.info("Resuming the background reconfig thread.")
+        self._background_job.resume(hook_test_case, test_report)
+
+    def after_test(self, test, test_report):  # noqa: D205,D400
+        """Instruct the background thread to stop running reconfigs now that 'test' has
+        finished running.
+        """
+        if self._background_job is None:
+            return
+
+        self.logger.info("Pausing the background reconfig thread.")
+        self._background_job.pause()
+
+        if self._background_job.exc_info is not None:
+            if isinstance(self._background_job.exc_info[1], errors.TestFailure):
+                # If the mongo shell process running the JavaScript file exited with a non-zero
+                # return code, then we raise an errors.ServerFailure exception to cause resmoke.py's
+                # test execution to stop.
+                raise errors.ServerFailure(self._background_job.exc_info[1].args[0])
+            else:
+                self.logger.error("Encountered an error inside the background reconfig thread.",
+                                  exc_info=self._background_job.exc_info)
+                raise self._background_job.exc_info[1]
diff --git a/buildscripts/resmokelib/testing/hooks/stepdown.py b/buildscripts/resmokelib/testing/hooks/stepdown.py
index de2c89e9a38..4cfd09fd52d 100644
--- a/buildscripts/resmokelib/testing/hooks/stepdown.py
+++ b/buildscripts/resmokelib/testing/hooks/stepdown.py
@@ -27,7 +27,7 @@ class ContinuousStepdown(interface.Hook):  # pylint: disable=too-many-instance-a
             self, hook_logger, fixture, config_stepdown=True, shard_stepdown=True,
             stepdown_interval_ms=8000, terminate=False, kill=False,
             use_stepdown_permitted_file=False, wait_for_mongos_retarget=False,
-            stepdown_via_heartbeats=True):
+            stepdown_via_heartbeats=True, background_reconfig=False):
         """Initialize the ContinuousStepdown.
 
         Args:
@@ -64,6 +64,8 @@ class ContinuousStepdown(interface.Hook):  # pylint: disable=too-many-instance-a
         self._terminate = terminate or kill
         self._kill = kill
 
+        self._background_reconfig = background_reconfig
+
         # The stepdown file names need to match the same construction as found in
         # jstests/concurrency/fsm_libs/resmoke_runner.js.
         dbpath_prefix = fixture.get_dbpath_prefix()
@@ -87,7 +89,7 @@ class ContinuousStepdown(interface.Hook):  # pylint: disable=too-many-instance-a
         self._stepdown_thread = _StepdownThread(
             self.logger, self._mongos_fixtures, self._rs_fixtures, self._stepdown_interval_secs,
             self._terminate, self._kill, lifecycle, self._wait_for_mongos_retarget,
-            self._stepdown_via_heartbeats)
+            self._stepdown_via_heartbeats, self._background_reconfig)
         self.logger.info("Starting the stepdown thread.")
         self._stepdown_thread.start()
 
@@ -348,7 +350,8 @@ class FileBasedStepdownLifecycle(object):
 class _StepdownThread(threading.Thread):  # pylint: disable=too-many-instance-attributes
     def __init__(  # pylint: disable=too-many-arguments
             self, logger, mongos_fixtures, rs_fixtures, stepdown_interval_secs, terminate, kill,
-            stepdown_lifecycle, wait_for_mongos_retarget, stepdown_via_heartbeats):
+            stepdown_lifecycle, wait_for_mongos_retarget, stepdown_via_heartbeats,
+            background_reconfig):
         """Initialize _StepdownThread."""
         threading.Thread.__init__(self, name="StepdownThread")
         self.daemon = True
@@ -365,6 +368,7 @@ class _StepdownThread(threading.Thread):  # pylint: disable=too-many-instance-at
         self.__lifecycle = stepdown_lifecycle
         self._should_wait_for_mongos_retarget = wait_for_mongos_retarget
         self._stepdown_via_heartbeats = stepdown_via_heartbeats
+        self._background_reconfig = background_reconfig
 
         self._last_exec = time.time()
         # Event set when the thread has been stopped using the 'stop()' method.
@@ -474,6 +478,22 @@ class _StepdownThread(threading.Thread):  # pylint: disable=too-many-instance-at
                                            rs_fixture.replset_name))
 
         if self._terminate:
+            # If we're running with background reconfigs, it's possible to be in a scenario
+            # where we kill a necessary voting node (i.e. in a 5 node repl set), only 2 are
+            # voting. In this scenario, we want to avoid killing the primary because no
+            # secondary can step up.
+            if self._background_reconfig:
+                # stagger the kill thread so that it runs a little after the reconfig thread
+                time.sleep(1)
+                voting_members = rs_fixture.get_voting_members()
+
+                self.logger.info("Current voting members: %s", voting_members)
+
+                if len(voting_members) <= 3:
+                    # Do not kill or terminate the primary if we don't have enough voting nodes to
+                    # elect a new primary.
+                    return
+
             should_kill = self._kill and random.choice([True, False])
             action = "Killing" if should_kill else "Terminating"
             self.logger.info("%s the primary on port %d of replica set '%s'.", action, primary.port,
author	Pavi Vetriselvan <pvselvan@umich.edu>	2020-03-16 11:13:36 -0400
committer	Evergreen Agent <no-reply@evergreen.mongodb.com>	2020-05-14 12:42:24 +0000
commit	f4528563033d933ca920b3e4b2a5e3344e198a5c (patch)
tree	8c20856b344e02483dceb1e13f35533e41db3ecd /buildscripts
parent	cd9fbb56900343e7b1193922a2c4b197895e7f56 (diff)
download	mongo-f4528563033d933ca920b3e4b2a5e3344e198a5c.tar.gz