author     Max Hirschhorn <max.hirschhorn@mongodb.com>   2017-05-01 18:36:48 -0400
committer  Max Hirschhorn <max.hirschhorn@mongodb.com>   2017-05-01 18:36:48 -0400
commit     2d7be840ecf1b7928a99def51fe3bea8304738f8 (patch)
tree       ef8edd3871e9b52d43bf14e87dbaf254b86f30ff
parent     9964889f0936441211cf065a15504cdc5973d646 (diff)
SERVER-27285 Run jsCore tests while periodically killing secondaries.
Adds a new replica_sets_kill_secondaries_jscore_passthrough.yml suite
in which, after tests have run for a certain period of time (30
seconds by default), resmoke.py sends a SIGKILL to all of the replica
set's secondaries. Each node is then restarted individually, with the
primary shut down, to verify that it reaches the SECONDARY state
within 5 minutes of starting up.
(cherry picked from commit 07f5d153305c0bf10ef55b5dc73eb9a2ca8cb104)
(cherry picked from commit e02c3c769bbcbe26d9132caf28cad6d2d2b4766a)
Also includes the remainder of the changes from
068878410614c789f23b2abc6c5b9680c82abe5e to rename
core_small_oplog_rs_kill_secondaries.yml to
replica_sets_kill_secondaries_jscore_passthrough.yml.
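
For local runs, the new suite should be invokable through resmoke.py with
the same arguments the Evergreen task below passes; this invocation simply
mirrors the resmoke_args from etc/evergreen.yml and assumes it is run from
the repository root:

    python buildscripts/resmoke.py --suites=replica_sets_kill_secondaries_jscore_passthrough --storageEngine=wiredTiger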
-rw-r--r--  buildscripts/resmokeconfig/suites/replica_sets_kill_secondaries_jscore_passthrough.yml |  32
-rw-r--r--  buildscripts/resmokelib/core/process.py                                                |   9
-rw-r--r--  buildscripts/resmokelib/testing/fixtures/replicaset.py                                 |  17
-rw-r--r--  buildscripts/resmokelib/testing/hooks.py                                               | 291
-rw-r--r--  etc/evergreen.yml                                                                      |  23
5 files changed, 365 insertions(+), 7 deletions(-)
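
The kill behaviour itself comes from the kill=True path added to
Process.stop() in buildscripts/resmokelib/core/process.py below. As a rough,
self-contained sketch of that terminate-versus-kill distinction (plain Python
subprocess, not the resmoke Process wrapper; the sleep command is just a
stand-in child process):

    import subprocess

    proc = subprocess.Popen(["sleep", "60"])  # stand-in for a mongod child process

    kill = True  # corresponds to resmoke's new stop(kill=True) path
    if kill:
        proc.kill()        # SIGKILL on POSIX: unclean shutdown, like the hook inflicts on secondaries
    else:
        proc.terminate()   # SIGTERM on POSIX: ask the process to shut down cleanly
    proc.wait()            # reap the child so the example exits cleanly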
diff --git a/buildscripts/resmokeconfig/suites/replica_sets_kill_secondaries_jscore_passthrough.yml b/buildscripts/resmokeconfig/suites/replica_sets_kill_secondaries_jscore_passthrough.yml
new file mode 100644
index 00000000000..a4b6c5a5feb
--- /dev/null
+++ b/buildscripts/resmokeconfig/suites/replica_sets_kill_secondaries_jscore_passthrough.yml
@@ -0,0 +1,32 @@
+selector:
+  js_test:
+    roots:
+    - jstests/core/*.js
+    exclude_files:
+    # These tests are not expected to pass with replica-sets:
+    - jstests/core/dbadmin.js
+    - jstests/core/opcounters_write_cmd.js
+    - jstests/core/read_after_optime.js
+    - jstests/core/capped_update.js
+    # The following tests perform a write with a writeConcern of w=2 when 'testingReplication' is
+    # true. This causes the test to hang because the secondary is running with the "rsSyncStopApply"
+    # failpoint enabled.
+    - jstests/core/geo_update_btree.js
+
+executor:
+  js_test:
+    config:
+      shell_options:
+        eval: "testingReplication = true;"
+        readMode: commands
+    hooks:
+    - class: PeriodicKillSecondaries
+    fixture:
+      class: ReplicaSetFixture
+      mongod_options:
+        oplogSize: 511
+        set_parameters:
+          enableTestCommands: 1
+          numInitialSyncAttempts: 1
+      num_nodes: 2
+      voting_secondaries: false
diff --git a/buildscripts/resmokelib/core/process.py b/buildscripts/resmokelib/core/process.py
index b8efa8af25a..eaa03cb241c 100644
--- a/buildscripts/resmokelib/core/process.py
+++ b/buildscripts/resmokelib/core/process.py
@@ -163,12 +163,12 @@ class Process(object):
             if return_code == win32con.STILL_ACTIVE:
                 raise
 
-    def stop(self):
+    def stop(self, kill=False):
         """Terminate the process."""
         if sys.platform == "win32":
 
             # Attempt to cleanly shutdown mongod.
-            if len(self.args) > 0 and self.args[0].find("mongod") != -1:
+            if not kill and len(self.args) > 0 and self.args[0].find("mongod") != -1:
                 mongo_signal_handle = None
                 try:
                     mongo_signal_handle = win32event.OpenEvent(
@@ -214,7 +214,10 @@ class Process(object):
                 raise
         else:
             try:
-                self._process.terminate()
+                if kill:
+                    self._process.kill()
+                else:
+                    self._process.terminate()
             except OSError as err:
                 # ESRCH (errno=3) is received when the process has already died.
                 if err.errno != 3:
diff --git a/buildscripts/resmokelib/testing/fixtures/replicaset.py b/buildscripts/resmokelib/testing/fixtures/replicaset.py
index 4928e88b3ff..ac65f037853 100644
--- a/buildscripts/resmokelib/testing/fixtures/replicaset.py
+++ b/buildscripts/resmokelib/testing/fixtures/replicaset.py
@@ -22,6 +22,7 @@ class ReplicaSetFixture(interface.ReplFixture):
     """
 
     # Error response codes copied from mongo/base/error_codes.err.
+    _ALREADY_INITIALIZED = 23
     _NODE_NOT_FOUND = 74
 
     def __init__(self,
@@ -35,7 +36,8 @@ class ReplicaSetFixture(interface.ReplFixture):
                  start_initial_sync_node=False,
                  write_concern_majority_journal_default=None,
                  auth_options=None,
-                 replset_config_options=None):
+                 replset_config_options=None,
+                 voting_secondaries=True):
 
         interface.ReplFixture.__init__(self, logger, job_num)
 
@@ -47,6 +49,7 @@ class ReplicaSetFixture(interface.ReplFixture):
         self.write_concern_majority_journal_default = write_concern_majority_journal_default
         self.auth_options = auth_options
         self.replset_config_options = utils.default_if_none(replset_config_options, {})
+        self.voting_secondaries = voting_secondaries
 
         # The dbpath in mongod_options is used as the dbpath prefix for replica set members and
         # takes precedence over other settings. The ShardedClusterFixture uses this parameter to
@@ -97,9 +100,10 @@ class ReplicaSetFixture(interface.ReplFixture):
             member_info = {"_id": i, "host": node.get_connection_string()}
             if i > 0:
                 member_info["priority"] = 0
-                if i >= 7:
-                    # Only 7 nodes in a replica set can vote, so the other members must be non-voting.
-                    member_info["votes"] = 0
+                if i >= 7 or not self.voting_secondaries:
+                    # Only 7 nodes in a replica set can vote, so the other members must still be
+                    # non-voting when this fixture is configured to have voting secondaries.
+                    member_info["votes"] = 0
             members.append(member_info)
         if self.initial_sync_node:
             members.append({"_id": self.initial_sync_node_idx,
@@ -164,6 +168,11 @@ class ReplicaSetFixture(interface.ReplFixture):
                 client.admin.command(cmd_obj)
                 break
             except pymongo.errors.OperationFailure as err:
+                # Ignore errors from the "replSetInitiate" command when the replica set has already
+                # been initiated.
+                if err.code == ReplicaSetFixture._ALREADY_INITIALIZED:
+                    return
+
                 # Retry on NodeNotFound errors from the "replSetInitiate" command.
                 if err.code != ReplicaSetFixture._NODE_NOT_FOUND:
                     raise
diff --git a/buildscripts/resmokelib/testing/hooks.py b/buildscripts/resmokelib/testing/hooks.py
index 3442c4d1ef8..db95390073a 100644
--- a/buildscripts/resmokelib/testing/hooks.py
+++ b/buildscripts/resmokelib/testing/hooks.py
@@ -7,6 +7,7 @@ from __future__ import absolute_import
 
 import os
 import sys
+import time
 
 import bson
 import pymongo
@@ -406,6 +407,295 @@ class CheckReplOplogs(JsCustomBehavior):
                                   shell_options=shell_options)
 
 
+class PeriodicKillSecondaries(CustomBehavior):
+    """
+    Periodically kills the secondaries in a replica set and verifies
+    that they can reach the SECONDARY state without having connectivity
+    to the primary after an unclean shutdown.
+    """
+
+    DEFAULT_PERIOD_SECS = 30
+
+    def __init__(self, logger, fixture, period_secs=DEFAULT_PERIOD_SECS):
+        if not isinstance(fixture, fixtures.ReplicaSetFixture):
+            raise TypeError("%s either does not support replication or does not support writing to"
+                            " its oplog early"
+                            % (fixture.__class__.__name__))
+
+        if fixture.num_nodes <= 1:
+            raise ValueError("PeriodicKillSecondaries requires the replica set to contain at least"
+                             " one secondary")
+
+        description = ("PeriodicKillSecondaries (kills the secondary after running tests for a"
+                       " configurable period of time)")
+        CustomBehavior.__init__(self, logger, fixture, description)
+
+        self._period_secs = period_secs
+        self._start_time = None
+
+    def after_suite(self, test_report):
+        if self._start_time is not None:
+            # Ensure that we test killing the secondary and having it reach state SECONDARY after
+            # being restarted at least once when running the suite.
+            self._run(test_report)
+
+    def before_test(self, test, test_report):
+        if self._start_time is not None:
+            # The "rsSyncApplyStop" failpoint is already enabled.
+            return
+
+        # Enable the "rsSyncApplyStop" failpoint on each of the secondaries to prevent them from
+        # applying any oplog entries while the test is running.
+        for secondary in self.fixture.get_secondaries():
+            client = utils.new_mongo_client(port=secondary.port)
+            try:
+                client.admin.command(bson.SON([
+                    ("configureFailPoint", "rsSyncApplyStop"),
+                    ("mode", "alwaysOn")]))
+            except pymongo.errors.OperationFailure as err:
+                self.logger.exception(
+                    "Unable to disable oplog application on the mongod on port %d", secondary.port)
+                raise errors.ServerFailure(
+                    "Unable to disable oplog application on the mongod on port %d: %s"
+                    % (secondary.port, err.args[0]))
+
+        self._start_time = time.time()
+
+    def after_test(self, test, test_report):
+        self._last_test_name = test.short_name()
+
+        # Kill the secondaries and verify that they can reach the SECONDARY state if the specified
+        # period has elapsed.
+        should_check_secondaries = time.time() - self._start_time >= self._period_secs
+        if not should_check_secondaries:
+            return
+
+        self._run(test_report)
+
+    def _run(self, test_report):
+        self.hook_test_case = testcases.TestCase(
+            self.logger,
+            "Hook",
+            "%s:%s" % (self._last_test_name, self.logger_name))
+        CustomBehavior.start_dynamic_test(self.hook_test_case, test_report)
+
+        try:
+            self._kill_secondaries()
+            self._check_secondaries_and_restart_fixture()
+
+            # Validate all collections on all nodes after having the secondaries reconcile the end
+            # of their oplogs.
+            self._validate_collections(test_report)
+
+            # Verify that the dbhashes match across all nodes after having the secondaries reconcile
+            # the end of their oplogs.
+            self._check_repl_dbhash(test_report)
+
+            self._restart_and_clear_fixture()
+        except Exception as err:
+            self.hook_test_case.logger.exception(
+                "Encountered an error running PeriodicKillSecondaries.")
+            self.hook_test_case.return_code = 2
+            test_report.addFailure(self.hook_test_case, sys.exc_info())
+            raise errors.StopExecution(err.args[0])
+        else:
+            self.hook_test_case.return_code = 0
+            test_report.addSuccess(self.hook_test_case)
+        finally:
+            test_report.stopTest(self.hook_test_case)
+
+        # Set the hook back into a state where it will disable oplog application at the start
+        # of the next test that runs.
+        self._start_time = None
+
+    def _kill_secondaries(self):
+        for secondary in self.fixture.get_secondaries():
+            # Disable the "rsSyncApplyStop" failpoint on the secondary to have it resume applying
+            # oplog entries.
+            client = utils.new_mongo_client(port=secondary.port)
+            try:
+                client.admin.command(bson.SON([
+                    ("configureFailPoint", "rsSyncApplyStop"),
+                    ("mode", "off")]))
+            except pymongo.errors.OperationFailure as err:
+                self.logger.exception(
+                    "Unable to re-enable oplog application on the mongod on port %d",
+                    secondary.port)
+                raise errors.ServerFailure(
+                    "Unable to re-enable oplog application on the mongod on port %d: %s"
+                    % (secondary.port, err.args[0]))
+
+            # Wait a little bit for the secondary to start applying oplog entries so that we are more
+            # likely to kill the mongod process while it is partway into applying a batch.
+            time.sleep(0.1)
+
+            # Check that the secondary is still running before forcibly terminating it. This ensures
+            # we still detect some cases in which the secondary has already crashed.
+            if not secondary.is_running():
+                raise errors.ServerFailure(
+                    "mongod on port %d was expected to be running in"
+                    " PeriodicKillSecondaries.after_test(), but wasn't."
+                    % (secondary.port))
+
+            self.hook_test_case.logger.info(
+                "Killing the secondary on port %d..." % (secondary.port))
+            secondary.mongod.stop(kill=True)
+
+        # Teardown may or may not be considered a success as a result of killing a secondary, so we
+        # ignore the return value of Fixture.teardown().
+        self.fixture.teardown()
+
+    def _check_secondaries_and_restart_fixture(self):
+        preserve_dbpaths = []
+        for node in self.fixture.nodes:
+            preserve_dbpaths.append(node.preserve_dbpath)
+            node.preserve_dbpath = True
+
+        for secondary in self.fixture.get_secondaries():
+            self._check_invariants_as_standalone(secondary)
+
+            # Start the 'secondary' mongod back up as part of the replica set and wait for it to
+            # reach state SECONDARY.
+            secondary.setup()
+            secondary.await_ready()
+            self._await_secondary_state(secondary)
+
+            teardown_success = secondary.teardown()
+            if not teardown_success:
+                raise errors.ServerFailure(
+                    "%s did not exit cleanly after reconciling the end of its oplog" % (secondary))
+
+        self.hook_test_case.logger.info(
+            "Starting the fixture back up again with its data files intact...")
+
+        try:
+            self.fixture.setup()
+            self.fixture.await_ready()
+        finally:
+            for (i, node) in enumerate(self.fixture.nodes):
+                node.preserve_dbpath = preserve_dbpaths[i]
+
+    def _validate_collections(self, test_report):
+        validate_test_case = ValidateCollections(self.logger, self.fixture)
+        validate_test_case.before_suite(test_report)
+        validate_test_case.before_test(self.hook_test_case, test_report)
+        validate_test_case.after_test(self.hook_test_case, test_report)
+        validate_test_case.after_suite(test_report)
+
+    def _check_repl_dbhash(self, test_report):
+        dbhash_test_case = CheckReplDBHash(self.logger, self.fixture)
+        dbhash_test_case.before_suite(test_report)
+        dbhash_test_case.before_test(self.hook_test_case, test_report)
+        dbhash_test_case.after_test(self.hook_test_case, test_report)
+        dbhash_test_case.after_suite(test_report)
+
+    def _restart_and_clear_fixture(self):
+        # We restart the fixture after setting 'preserve_dbpath' back to its original value in order
+        # to clear the contents of the data directory if desired. The CleanEveryN hook cannot be
+        # used in combination with the PeriodicKillSecondaries hook because we may attempt to call
+        # Fixture.teardown() while the "rsSyncApplyStop" failpoint is still enabled on the
+        # secondaries, causing them to exit with a non-zero return code.
+        self.hook_test_case.logger.info(
+            "Finished verifying data consistency, stopping the fixture...")
+
+        teardown_success = self.fixture.teardown()
+        if not teardown_success:
+            raise errors.ServerFailure(
+                "%s did not exit cleanly after verifying data consistency"
+                % (self.fixture))
+
+        self.hook_test_case.logger.info("Starting the fixture back up again...")
+        self.fixture.setup()
+        self.fixture.await_ready()
+
+    def _check_invariants_as_standalone(self, secondary):
+        # We remove the --replSet option in order to start the node as a standalone.
+        replset_name = secondary.mongod_options.pop("replSet")
+
+        try:
+            secondary.setup()
+            secondary.await_ready()
+
+            client = utils.new_mongo_client(port=secondary.port)
+            minvalid_doc = client.local["replset.minvalid"].find_one()
+
+            latest_oplog_doc = client.local["oplog.rs"].find_one(
+                sort=[("$natural", pymongo.DESCENDING)])
+
+            if minvalid_doc is not None:
+                # Check the invariants 'begin <= minValid', 'minValid <= oplogDeletePoint', and
+                # 'minValid <= top of oplog' before the secondary has reconciled the end of its
+                # oplog.
+                null_ts = bson.Timestamp(0, 0)
+                begin_ts = minvalid_doc.get("begin", {}).get("ts", null_ts)
+                minvalid_ts = minvalid_doc.get("ts", begin_ts)
+                oplog_delete_point_ts = minvalid_doc.get("oplogDeleteFromPoint", minvalid_ts)
+
+                if minvalid_ts == null_ts:
+                    # The server treats the "ts" field in the minValid document as missing when its
+                    # value is the null timestamp.
+                    minvalid_ts = begin_ts
+
+                if oplog_delete_point_ts == null_ts:
+                    # The server treats the "oplogDeleteFromPoint" field as missing when its value
+                    # is the null timestamp.
+                    oplog_delete_point_ts = minvalid_ts
+
+                latest_oplog_entry_ts = latest_oplog_doc.get("ts", oplog_delete_point_ts)
+
+                if not begin_ts <= minvalid_ts:
+                    raise errors.ServerFailure(
+                        "The condition begin <= minValid (%s <= %s) doesn't hold: minValid"
+                        " document=%s, latest oplog entry=%s"
+                        % (begin_ts, minvalid_ts, minvalid_doc, latest_oplog_doc))
+
+                if not minvalid_ts <= oplog_delete_point_ts:
+                    raise errors.ServerFailure(
+                        "The condition minValid <= oplogDeletePoint (%s <= %s) doesn't hold:"
+                        " minValid document=%s, latest oplog entry=%s"
+                        % (minvalid_ts, oplog_delete_point_ts, minvalid_doc, latest_oplog_doc))
+
+                if not minvalid_ts <= latest_oplog_entry_ts:
+                    raise errors.ServerFailure(
+                        "The condition minValid <= top of oplog (%s <= %s) doesn't hold: minValid"
+                        " document=%s, latest oplog entry=%s"
+                        % (minvalid_ts, latest_oplog_entry_ts, minvalid_doc, latest_oplog_doc))
+
+            teardown_success = secondary.teardown()
+            if not teardown_success:
+                raise errors.ServerFailure(
+                    "%s did not exit cleanly after being started up as a standalone" % (secondary))
+        except pymongo.errors.OperationFailure as err:
+            self.hook_test_case.logger.exception(
+                "Failed to read the minValid document or the latest oplog entry from the mongod on"
+                " port %d",
+                secondary.port)
+            raise errors.ServerFailure(
+                "Failed to read the minValid document or the latest oplog entry from the mongod on"
+                " port %d: %s"
+                % (secondary.port, err.args[0]))
+        finally:
+            # Set the secondary's options back to their original values.
+            secondary.mongod_options["replSet"] = replset_name
+
+    def _await_secondary_state(self, secondary):
+        client = utils.new_mongo_client(port=secondary.port)
+        try:
+            client.admin.command(bson.SON([
+                ("replSetTest", 1),
+                ("waitForMemberState", 2),  # 2 = SECONDARY
+                ("timeoutMillis", fixtures.ReplFixture.AWAIT_REPL_TIMEOUT_MINS * 60 * 1000)]))
+        except pymongo.errors.OperationFailure as err:
+            self.hook_test_case.logger.exception(
+                "mongod on port %d failed to reach state SECONDARY after %d seconds",
+                secondary.port,
+                fixtures.ReplFixture.AWAIT_REPL_TIMEOUT_MINS * 60)
+            raise errors.ServerFailure(
+                "mongod on port %d failed to reach state SECONDARY after %d seconds: %s"
+                % (secondary.port, fixtures.ReplFixture.AWAIT_REPL_TIMEOUT_MINS * 60, err.args[0]))
+
+
 _CUSTOM_BEHAVIORS = {
     "CleanEveryN": CleanEveryN,
     "CheckReplDBHash": CheckReplDBHash,
@@ -413,4 +703,5 @@ _CUSTOM_BEHAVIORS = {
     "ValidateCollections": ValidateCollections,
     "IntermediateInitialSync": IntermediateInitialSync,
     "BackgroundInitialSync": BackgroundInitialSync,
+    "PeriodicKillSecondaries": PeriodicKillSecondaries,
 }
diff --git a/etc/evergreen.yml b/etc/evergreen.yml
index 9d9b79123b8..a70fe56b87f 100644
--- a/etc/evergreen.yml
+++ b/etc/evergreen.yml
@@ -2128,6 +2128,17 @@ tasks:
     run_multiple_jobs: true
 
 - <<: *task_template
+  name: replica_sets_kill_secondaries_jscore_passthrough_WT
+  depends_on:
+  - name: jsCore_WT
+  commands:
+  - func: "do setup"
+  - func: "run tests"
+    vars:
+      resmoke_args: --suites=replica_sets_kill_secondaries_jscore_passthrough --storageEngine=wiredTiger
+      run_multiple_jobs: true
+
+- <<: *task_template
   name: mongosTest
   commands:
   - func: "do setup"
@@ -4938,6 +4949,9 @@ buildvariants:
   - name: replica_sets_resync_static_jscore_passthrough_WT
     distros:
     - windows-64-vs2015-large
+  - name: replica_sets_kill_secondaries_jscore_passthrough_WT
+    distros:
+    - windows-64-vs2015-large
   - name: master_slave
   - name: master_slave_WT
   - name: master_slave_jscore_passthrough
@@ -5377,6 +5391,7 @@ buildvariants:
   - name: replica_sets_WT
   - name: replica_sets_jscore_passthrough
   - name: replica_sets_jscore_passthrough_WT
+  - name: replica_sets_kill_secondaries_jscore_passthrough_WT
   - name: master_slave
   - name: master_slave_WT
   - name: master_slave_jscore_passthrough
@@ -5646,6 +5661,9 @@ buildvariants:
   - name: replica_sets_resync_static_jscore_passthrough_WT
     distros:
    - rhel62-large
+  - name: replica_sets_kill_secondaries_jscore_passthrough_WT
+    distros:
+    - rhel62-large
   - name: master_slave
   - name: master_slave_WT
   - name: master_slave_auth
@@ -5866,6 +5884,9 @@ buildvariants:
   - name: replica_sets_resync_static_jscore_passthrough_WT
     distros:
     - rhel62-large
+  - name: replica_sets_kill_secondaries_jscore_passthrough_WT
+    distros:
+    - rhel62-large
   - name: master_slave
   - name: master_slave_WT
   - name: master_slave_auth
@@ -8341,6 +8362,7 @@ buildvariants:
   - name: replica_sets_initsync_static_jscore_passthrough_WT
   - name: replica_sets_resync_static_jscore_passthrough
   - name: replica_sets_resync_static_jscore_passthrough_WT
+  - name: replica_sets_kill_secondaries_jscore_passthrough_WT
   - name: master_slave
   - name: master_slave_WT
   - name: master_slave_auth
@@ -8514,6 +8536,7 @@ buildvariants:
   - name: replica_sets_initsync_static_jscore_passthrough_WT
   - name: replica_sets_resync_static_jscore_passthrough
   - name: replica_sets_resync_static_jscore_passthrough_WT
+  - name: replica_sets_kill_secondaries_jscore_passthrough_WT
   - name: master_slave
   - name: master_slave_WT
   - name: master_slave_auth