author     Max Hirschhorn <max.hirschhorn@mongodb.com>   2017-05-01 18:36:48 -0400
committer  Max Hirschhorn <max.hirschhorn@mongodb.com>   2017-05-01 18:36:48 -0400
commit     2d7be840ecf1b7928a99def51fe3bea8304738f8 (patch)
tree       ef8edd3871e9b52d43bf14e87dbaf254b86f30ff
parent     9964889f0936441211cf065a15504cdc5973d646 (diff)
SERVER-27285 Run jsCore tests while periodically killing secondaries.
Adds a new replica_sets_kill_secondaries_jscore_passthrough.yml suite
in which, after tests have run for a certain period of time (30
seconds by default), resmoke.py sends a SIGKILL to all of the replica
set's secondaries. Each node is then restarted individually, with the
primary shut down, to verify that it reaches the SECONDARY state
within 5 minutes of starting up.
(cherry picked from commit 07f5d153305c0bf10ef55b5dc73eb9a2ca8cb104)
(cherry picked from commit e02c3c769bbcbe26d9132caf28cad6d2d2b4766a)
Also includes the remainder of the changes from
068878410614c789f23b2abc6c5b9680c82abe5e to rename
core_small_oplog_rs_kill_secondaries.yml to
replica_sets_kill_secondaries_jscore_passthrough.yml.
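
For local runs, the new suite should be invokable through resmoke.py with
the same arguments the Evergreen task below passes; this invocation simply
mirrors the resmoke_args from etc/evergreen.yml and assumes it is run from
the repository root:

    python buildscripts/resmoke.py --suites=replica_sets_kill_secondaries_jscore_passthrough --storageEngine=wiredTiger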
-rw-r--r--  buildscripts/resmokeconfig/suites/replica_sets_kill_secondaries_jscore_passthrough.yml |  32
-rw-r--r--  buildscripts/resmokelib/core/process.py                                                |   9
-rw-r--r--  buildscripts/resmokelib/testing/fixtures/replicaset.py                                 |  17
-rw-r--r--  buildscripts/resmokelib/testing/hooks.py                                               | 291
-rw-r--r--  etc/evergreen.yml                                                                      |  23
5 files changed, 365 insertions(+), 7 deletions(-)
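
The kill behaviour itself comes from the kill=True path added to
Process.stop() in buildscripts/resmokelib/core/process.py below. As a rough,
self-contained sketch of that terminate-versus-kill distinction (plain Python
subprocess, not the resmoke Process wrapper; the sleep command is just a
stand-in child process):

    import subprocess

    proc = subprocess.Popen(["sleep", "60"])  # stand-in for a mongod child process

    kill = True  # corresponds to resmoke's new stop(kill=True) path
    if kill:
        proc.kill()        # SIGKILL on POSIX: unclean shutdown, like the hook inflicts on secondaries
    else:
        proc.terminate()   # SIGTERM on POSIX: ask the process to shut down cleanly
    proc.wait()            # reap the child so the example exits cleanly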
diff --git a/buildscripts/resmokeconfig/suites/replica_sets_kill_secondaries_jscore_passthrough.yml b/buildscripts/resmokeconfig/suites/replica_sets_kill_secondaries_jscore_passthrough.yml
new file mode 100644
index 00000000000..a4b6c5a5feb
--- /dev/null
+++ b/buildscripts/resmokeconfig/suites/replica_sets_kill_secondaries_jscore_passthrough.yml
@@ -0,0 +1,32 @@
+selector:
+  js_test:
+    roots:
+    - jstests/core/*.js
+    exclude_files:
+    # These tests are not expected to pass with replica-sets:
+    - jstests/core/dbadmin.js
+    - jstests/core/opcounters_write_cmd.js
+    - jstests/core/read_after_optime.js
+    - jstests/core/capped_update.js
+    # The following tests perform a write with a writeConcern of w=2 when 'testingReplication' is
+    # true. This causes the test to hang because the secondary is running with the "rsSyncStopApply"
+    # failpoint enabled.
+    - jstests/core/geo_update_btree.js
+
+executor:
+  js_test:
+    config:
+      shell_options:
+        eval: "testingReplication = true;"
+        readMode: commands
+    hooks:
+    - class: PeriodicKillSecondaries
+    fixture:
+      class: ReplicaSetFixture
+      mongod_options:
+        oplogSize: 511
+        set_parameters:
+          enableTestCommands: 1
+          numInitialSyncAttempts: 1
+      num_nodes: 2
+      voting_secondaries: false
diff --git a/buildscripts/resmokelib/core/process.py b/buildscripts/resmokelib/core/process.py
index b8efa8af25a..eaa03cb241c 100644
--- a/buildscripts/resmokelib/core/process.py
+++ b/buildscripts/resmokelib/core/process.py
@@ -163,12 +163,12 @@ class Process(object):
             if return_code == win32con.STILL_ACTIVE:
                 raise
 
-    def stop(self):
+    def stop(self, kill=False):
         """Terminate the process."""
         if sys.platform == "win32":
 
             # Attempt to cleanly shutdown mongod.
-            if len(self.args) > 0 and self.args[0].find("mongod") != -1:
+            if not kill and len(self.args) > 0 and self.args[0].find("mongod") != -1:
                 mongo_signal_handle = None
                 try:
                     mongo_signal_handle = win32event.OpenEvent(
@@ -214,7 +214,10 @@ class Process(object):
                 raise
         else:
             try:
-                self._process.terminate()
+                if kill:
+                    self._process.kill()
+                else:
+                    self._process.terminate()
             except OSError as err:
                 # ESRCH (errno=3) is received when the process has already died.
                 if err.errno != 3:
diff --git a/buildscripts/resmokelib/testing/fixtures/replicaset.py b/buildscripts/resmokelib/testing/fixtures/replicaset.py
index 4928e88b3ff..ac65f037853 100644
--- a/buildscripts/resmokelib/testing/fixtures/replicaset.py
+++ b/buildscripts/resmokelib/testing/fixtures/replicaset.py
@@ -22,6 +22,7 @@ class ReplicaSetFixture(interface.ReplFixture):
     """
 
     # Error response codes copied from mongo/base/error_codes.err.
+    _ALREADY_INITIALIZED = 23
     _NODE_NOT_FOUND = 74
 
     def __init__(self,
@@ -35,7 +36,8 @@ class ReplicaSetFixture(interface.ReplFixture):
                  start_initial_sync_node=False,
                  write_concern_majority_journal_default=None,
                  auth_options=None,
-                 replset_config_options=None):
+                 replset_config_options=None,
+                 voting_secondaries=True):
 
         interface.ReplFixture.__init__(self, logger, job_num)
 
@@ -47,6 +49,7 @@ class ReplicaSetFixture(interface.ReplFixture):
         self.write_concern_majority_journal_default = write_concern_majority_journal_default
         self.auth_options = auth_options
         self.replset_config_options = utils.default_if_none(replset_config_options, {})
+        self.voting_secondaries = voting_secondaries
 
         # The dbpath in mongod_options is used as the dbpath prefix for replica set members and
         # takes precedence over other settings. The ShardedClusterFixture uses this parameter to
@@ -97,9 +100,10 @@ class ReplicaSetFixture(interface.ReplFixture):
             member_info = {"_id": i, "host": node.get_connection_string()}
             if i > 0:
                 member_info["priority"] = 0
-                if i >= 7:
-                    # Only 7 nodes in a replica set can vote, so the other members must be non-voting.
-                    member_info["votes"] = 0
+                if i >= 7 or not self.voting_secondaries:
+                    # Only 7 nodes in a replica set can vote, so the other members must still be
+                    # non-voting when this fixture is configured to have voting secondaries.
+                    member_info["votes"] = 0
             members.append(member_info)
         if self.initial_sync_node:
             members.append({"_id": self.initial_sync_node_idx,
@@ -164,6 +168,11 @@ class ReplicaSetFixture(interface.ReplFixture):
                 client.admin.command(cmd_obj)
                 break
             except pymongo.errors.OperationFailure as err:
+                # Ignore errors from the "replSetInitiate" command when the replica set has already
+                # been initiated.
+                if err.code == ReplicaSetFixture._ALREADY_INITIALIZED:
+                    return
+
                 # Retry on NodeNotFound errors from the "replSetInitiate" command.
                 if err.code != ReplicaSetFixture._NODE_NOT_FOUND:
                     raise
diff --git a/buildscripts/resmokelib/testing/hooks.py b/buildscripts/resmokelib/testing/hooks.py
index 3442c4d1ef8..db95390073a 100644
--- a/buildscripts/resmokelib/testing/hooks.py
+++ b/buildscripts/resmokelib/testing/hooks.py
@@ -7,6 +7,7 @@ from __future__ import absolute_import
 
 import os
 import sys
+import time
 
 import bson
 import pymongo
@@ -406,6 +407,295 @@ class CheckReplOplogs(JsCustomBehavior):
                                   shell_options=shell_options)
 
 
+class PeriodicKillSecondaries(CustomBehavior):
+    """
+    Periodically kills the secondaries in a replica set and verifies
+    that they can reach the SECONDARY state without having connectivity
+    to the primary after an unclean shutdown.
+    """
+
+    DEFAULT_PERIOD_SECS = 30
+
+    def __init__(self, logger, fixture, period_secs=DEFAULT_PERIOD_SECS):
+        if not isinstance(fixture, fixtures.ReplicaSetFixture):
+            raise TypeError("%s either does not support replication or does not support writing to"
+                            " its oplog early"
+                            % (fixture.__class__.__name__))
+
+        if fixture.num_nodes <= 1:
+            raise ValueError("PeriodicKillSecondaries requires the replica set to contain at least"
+                             " one secondary")
+
+        description = ("PeriodicKillSecondaries (kills the secondary after running tests for a"
+                       " configurable period of time)")
+        CustomBehavior.__init__(self, logger, fixture, description)
+
+        self._period_secs = period_secs
+        self._start_time = None
+
+    def after_suite(self, test_report):
+        if self._start_time is not None:
+            # Ensure that we test killing the secondary and having it reach state SECONDARY after
+            # being restarted at least once when running the suite.
+            self._run(test_report)
+
+    def before_test(self, test, test_report):
+        if self._start_time is not None:
+            # The "rsSyncApplyStop" failpoint is already enabled.
+            return
+
+        # Enable the "rsSyncApplyStop" failpoint on each of the secondaries to prevent them from
+        # applying any oplog entries while the test is running.
+        for secondary in self.fixture.get_secondaries():
+            client = utils.new_mongo_client(port=secondary.port)
+            try:
+                client.admin.command(bson.SON([
+                    ("configureFailPoint", "rsSyncApplyStop"),
+                    ("mode", "alwaysOn")]))
+            except pymongo.errors.OperationFailure as err:
+                self.logger.exception(
+                    "Unable to disable oplog application on the mongod on port %d", secondary.port)
+                raise errors.ServerFailure(
+                    "Unable to disable oplog application on the mongod on port %d: %s"
+                    % (secondary.port, err.args[0]))
+
+        self._start_time = time.time()
+
+    def after_test(self, test, test_report):
+        self._last_test_name = test.short_name()
+
+        # Kill the secondaries and verify that they can reach the SECONDARY state if the specified
+        # period has elapsed.
+        should_check_secondaries = time.time() - self._start_time >= self._period_secs
+        if not should_check_secondaries:
+            return
+
+        self._run(test_report)
+
+    def _run(self, test_report):
+        self.hook_test_case = testcases.TestCase(
+            self.logger,
+            "Hook",
+            "%s:%s" % (self._last_test_name, self.logger_name))
+        CustomBehavior.start_dynamic_test(self.hook_test_case, test_report)
+
+        try:
+            self._kill_secondaries()
+            self._check_secondaries_and_restart_fixture()
+
+            # Validate all collections on all nodes after having the secondaries reconcile the end
+            # of their oplogs.
+            self._validate_collections(test_report)
+
+            # Verify that the dbhashes match across all nodes after having the secondaries reconcile
+            # the end of their oplogs.
+            self._check_repl_dbhash(test_report)
+
+            self._restart_and_clear_fixture()
+        except Exception as err:
+            self.hook_test_case.logger.exception(
+                "Encountered an error running PeriodicKillSecondaries.")
+            self.hook_test_case.return_code = 2
+            test_report.addFailure(self.hook_test_case, sys.exc_info())
+            raise errors.StopExecution(err.args[0])
+        else:
+            self.hook_test_case.return_code = 0
+            test_report.addSuccess(self.hook_test_case)
+        finally:
+            test_report.stopTest(self.hook_test_case)
+
+        # Set the hook back into a state where it will disable oplog application at the start
+        # of the next test that runs.
+        self._start_time = None
+
+    def _kill_secondaries(self):
+        for secondary in self.fixture.get_secondaries():
+            # Disable the "rsSyncApplyStop" failpoint on the secondary to have it resume applying
+            # oplog entries.
+            client = utils.new_mongo_client(port=secondary.port)
+            try:
+                client.admin.command(bson.SON([
+                    ("configureFailPoint", "rsSyncApplyStop"),
+                    ("mode", "off")]))
+            except pymongo.errors.OperationFailure as err:
+                self.logger.exception(
+                    "Unable to re-enable oplog application on the mongod on port %d",
+                    secondary.port)
+                raise errors.ServerFailure(
+                    "Unable to re-enable oplog application on the mongod on port %d: %s"
+                    % (secondary.port, err.args[0]))
+
+            # Wait a little bit for the secondary to start applying oplog entries so that we are more
+            # likely to kill the mongod process while it is partway into applying a batch.
+            time.sleep(0.1)
+
+            # Check that the secondary is still running before forcibly terminating it. This ensures
+            # we still detect some cases in which the secondary has already crashed.
+            if not secondary.is_running():
+                raise errors.ServerFailure(
+                    "mongod on port %d was expected to be running in"
+                    " PeriodicKillSecondaries.after_test(), but wasn't."
+                    % (secondary.port))
+
+            self.hook_test_case.logger.info(
+                "Killing the secondary on port %d..." % (secondary.port))
+            secondary.mongod.stop(kill=True)
+
+        # Teardown may or may not be considered a success as a result of killing a secondary, so we
+        # ignore the return value of Fixture.teardown().
+        self.fixture.teardown()
+
+    def _check_secondaries_and_restart_fixture(self):
+        preserve_dbpaths = []
+        for node in self.fixture.nodes:
+            preserve_dbpaths.append(node.preserve_dbpath)
+            node.preserve_dbpath = True
+
+        for secondary in self.fixture.get_secondaries():
+            self._check_invariants_as_standalone(secondary)
+
+            # Start the 'secondary' mongod back up as part of the replica set and wait for it to
+            # reach state SECONDARY.
+            secondary.setup()
+            secondary.await_ready()
+            self._await_secondary_state(secondary)
+
+            teardown_success = secondary.teardown()
+            if not teardown_success:
+                raise errors.ServerFailure(
+                    "%s did not exit cleanly after reconciling the end of its oplog" % (secondary))
+
+        self.hook_test_case.logger.info(
+            "Starting the fixture back up again with its data files intact...")
+
+        try:
+            self.fixture.setup()
+            self.fixture.await_ready()
+        finally:
+            for (i, node) in enumerate(self.fixture.nodes):
+                node.preserve_dbpath = preserve_dbpaths[i]
+
+    def _validate_collections(self, test_report):
+        validate_test_case = ValidateCollections(self.logger, self.fixture)
+        validate_test_case.before_suite(test_report)
+        validate_test_case.before_test(self.hook_test_case, test_report)
+        validate_test_case.after_test(self.hook_test_case, test_report)
+        validate_test_case.after_suite(test_report)
+
+    def _check_repl_dbhash(self, test_report):
+        dbhash_test_case = CheckReplDBHash(self.logger, self.fixture)
+        dbhash_test_case.before_suite(test_report)
+        dbhash_test_case.before_test(self.hook_test_case, test_report)
+        dbhash_test_case.after_test(self.hook_test_case, test_report)
+        dbhash_test_case.after_suite(test_report)
+
+    def _restart_and_clear_fixture(self):
+        # We restart the fixture after setting 'preserve_dbpath' back to its original value in order
+        # to clear the contents of the data directory if desired. The CleanEveryN hook cannot be
+        # used in combination with the PeriodicKillSecondaries hook because we may attempt to call
+        # Fixture.teardown() while the "rsSyncApplyStop" failpoint is still enabled on the
+        # secondaries, causing them to exit with a non-zero return code.
+        self.hook_test_case.logger.info(
+            "Finished verifying data consistency, stopping the fixture...")
+
+        teardown_success = self.fixture.teardown()
+        if not teardown_success:
+            raise errors.ServerFailure(
+                "%s did not exit cleanly after verifying data consistency"
+                % (self.fixture))
+
+        self.hook_test_case.logger.info("Starting the fixture back up again...")
+        self.fixture.setup()
+        self.fixture.await_ready()
+
+    def _check_invariants_as_standalone(self, secondary):
+        # We remove the --replSet option in order to start the node as a standalone.
+        replset_name = secondary.mongod_options.pop("replSet")
+
+        try:
+            secondary.setup()
+            secondary.await_ready()
+
+            client = utils.new_mongo_client(port=secondary.port)
+            minvalid_doc = client.local["replset.minvalid"].find_one()
+
+            latest_oplog_doc = client.local["oplog.rs"].find_one(
+                sort=[("$natural", pymongo.DESCENDING)])
+
+            if minvalid_doc is not None:
+                # Check the invariants 'begin <= minValid', 'minValid <= oplogDeletePoint', and
+                # 'minValid <= top of oplog' before the secondary has reconciled the end of its
+                # oplog.
+                null_ts = bson.Timestamp(0, 0)
+                begin_ts = minvalid_doc.get("begin", {}).get("ts", null_ts)
+                minvalid_ts = minvalid_doc.get("ts", begin_ts)
+                oplog_delete_point_ts = minvalid_doc.get("oplogDeleteFromPoint", minvalid_ts)
+
+                if minvalid_ts == null_ts:
+                    # The server treats the "ts" field in the minValid document as missing when its
+                    # value is the null timestamp.
+                    minvalid_ts = begin_ts
+
+                if oplog_delete_point_ts == null_ts:
+                    # The server treats the "oplogDeleteFromPoint" field as missing when its value
+                    # is the null timestamp.
+                    oplog_delete_point_ts = minvalid_ts
+
+                latest_oplog_entry_ts = latest_oplog_doc.get("ts", oplog_delete_point_ts)
+
+                if not begin_ts <= minvalid_ts:
+                    raise errors.ServerFailure(
+                        "The condition begin <= minValid (%s <= %s) doesn't hold: minValid"
+                        " document=%s, latest oplog entry=%s"
+                        % (begin_ts, minvalid_ts, minvalid_doc, latest_oplog_doc))
+
+                if not minvalid_ts <= oplog_delete_point_ts:
+                    raise errors.ServerFailure(
+                        "The condition minValid <= oplogDeletePoint (%s <= %s) doesn't hold:"
+                        " minValid document=%s, latest oplog entry=%s"
+                        % (minvalid_ts, oplog_delete_point_ts, minvalid_doc, latest_oplog_doc))
+
+                if not minvalid_ts <= latest_oplog_entry_ts:
+                    raise errors.ServerFailure(
+                        "The condition minValid <= top of oplog (%s <= %s) doesn't hold: minValid"
+                        " document=%s, latest oplog entry=%s"
+                        % (minvalid_ts, latest_oplog_entry_ts, minvalid_doc, latest_oplog_doc))
+
+            teardown_success = secondary.teardown()
+            if not teardown_success:
+                raise errors.ServerFailure(
+                    "%s did not exit cleanly after being started up as a standalone" % (secondary))
+        except pymongo.errors.OperationFailure as err:
+            self.hook_test_case.logger.exception(
+                "Failed to read the minValid document or the latest oplog entry from the mongod on"
+                " port %d",
+                secondary.port)
+            raise errors.ServerFailure(
+                "Failed to read the minValid document or the latest oplog entry from the mongod on"
+                " port %d: %s"
+                % (secondary.port, err.args[0]))
+        finally:
+            # Set the secondary's options back to their original values.
+            secondary.mongod_options["replSet"] = replset_name
+
+    def _await_secondary_state(self, secondary):
+        client = utils.new_mongo_client(port=secondary.port)
+        try:
+            client.admin.command(bson.SON([
+                ("replSetTest", 1),
+                ("waitForMemberState", 2),  # 2 = SECONDARY
+                ("timeoutMillis", fixtures.ReplFixture.AWAIT_REPL_TIMEOUT_MINS * 60 * 1000)]))
+        except pymongo.errors.OperationFailure as err:
+            self.hook_test_case.logger.exception(
+                "mongod on port %d failed to reach state SECONDARY after %d seconds",
+                secondary.port,
+                fixtures.ReplFixture.AWAIT_REPL_TIMEOUT_MINS * 60)
+            raise errors.ServerFailure(
+                "mongod on port %d failed to reach state SECONDARY after %d seconds: %s"
+                % (secondary.port, fixtures.ReplFixture.AWAIT_REPL_TIMEOUT_MINS * 60, err.args[0]))
+
+
 _CUSTOM_BEHAVIORS = {
     "CleanEveryN": CleanEveryN,
     "CheckReplDBHash": CheckReplDBHash,
@@ -413,4 +703,5 @@ _CUSTOM_BEHAVIORS = {
     "ValidateCollections": ValidateCollections,
     "IntermediateInitialSync": IntermediateInitialSync,
     "BackgroundInitialSync": BackgroundInitialSync,
+    "PeriodicKillSecondaries": PeriodicKillSecondaries,
 }
diff --git a/etc/evergreen.yml b/etc/evergreen.yml
index 9d9b79123b8..a70fe56b87f 100644
--- a/etc/evergreen.yml
+++ b/etc/evergreen.yml
@@ -2128,6 +2128,17 @@ tasks:
     run_multiple_jobs: true
 
 - <<: *task_template
+  name: replica_sets_kill_secondaries_jscore_passthrough_WT
+  depends_on:
+  - name: jsCore_WT
+  commands:
+  - func: "do setup"
+  - func: "run tests"
+    vars:
+      resmoke_args: --suites=replica_sets_kill_secondaries_jscore_passthrough --storageEngine=wiredTiger
+      run_multiple_jobs: true
+
+- <<: *task_template
   name: mongosTest
   commands:
   - func: "do setup"
@@ -4938,6 +4949,9 @@ buildvariants:
   - name: replica_sets_resync_static_jscore_passthrough_WT
     distros:
     - windows-64-vs2015-large
+  - name: replica_sets_kill_secondaries_jscore_passthrough_WT
+    distros:
+    - windows-64-vs2015-large
   - name: master_slave
   - name: master_slave_WT
   - name: master_slave_jscore_passthrough
@@ -5377,6 +5391,7 @@ buildvariants:
   - name: replica_sets_WT
   - name: replica_sets_jscore_passthrough
   - name: replica_sets_jscore_passthrough_WT
+  - name: replica_sets_kill_secondaries_jscore_passthrough_WT
   - name: master_slave
   - name: master_slave_WT
   - name: master_slave_jscore_passthrough
@@ -5646,6 +5661,9 @@ buildvariants:
   - name: replica_sets_resync_static_jscore_passthrough_WT
     distros:
    - rhel62-large
+  - name: replica_sets_kill_secondaries_jscore_passthrough_WT
+    distros:
+    - rhel62-large
   - name: master_slave
   - name: master_slave_WT
   - name: master_slave_auth
@@ -5866,6 +5884,9 @@ buildvariants:
   - name: replica_sets_resync_static_jscore_passthrough_WT
     distros:
     - rhel62-large
+  - name: replica_sets_kill_secondaries_jscore_passthrough_WT
+    distros:
+    - rhel62-large
   - name: master_slave
   - name: master_slave_WT
   - name: master_slave_auth
@@ -8341,6 +8362,7 @@ buildvariants:
   - name: replica_sets_initsync_static_jscore_passthrough_WT
   - name: replica_sets_resync_static_jscore_passthrough
   - name: replica_sets_resync_static_jscore_passthrough_WT
+  - name: replica_sets_kill_secondaries_jscore_passthrough_WT
   - name: master_slave
   - name: master_slave_WT
   - name: master_slave_auth
@@ -8514,6 +8536,7 @@ buildvariants:
   - name: replica_sets_initsync_static_jscore_passthrough_WT
   - name: replica_sets_resync_static_jscore_passthrough
   - name: replica_sets_resync_static_jscore_passthrough_WT
+  - name: replica_sets_kill_secondaries_jscore_passthrough_WT
   - name: master_slave
   - name: master_slave_WT
   - name: master_slave_auth