author     Max Hirschhorn <max.hirschhorn@mongodb.com>    2017-01-30 20:33:17 -0500
committer  Max Hirschhorn <max.hirschhorn@mongodb.com>    2017-01-30 20:33:17 -0500
commit     07f5d153305c0bf10ef55b5dc73eb9a2ca8cb104 (patch)
tree       16fd1455655d46c45ad3cbcbdeabdd842ae45054 /buildscripts
parent     3942d88af18cd7a2d9fff8ea3800f8c7769e5c9f (diff)
download   mongo-07f5d153305c0bf10ef55b5dc73eb9a2ca8cb104.tar.gz
SERVER-27285 Run jsCore tests while periodically killing secondaries.
Adds a new core_small_oplog_rs_kill_secondaries.yml suite in which, after tests have been running for a certain period of time (30 seconds by default), resmoke.py sends a SIGKILL to all of the replica set's secondaries. Each node is then restarted individually, with the primary kept down, to verify that it reaches the SECONDARY state within 5 minutes of starting up.
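The hook checks the elapsed time at test boundaries rather than on a background timer. The sketch below is illustrative only (FakeHook is a stand-in for PeriodicKillSecondaries, and the loop is a stand-in for resmoke.py's executor); it shows how per-test before_test()/after_test() callbacks produce the "periodic" behavior described above.

    # Illustrative sketch only -- not resmoke.py's executor.
    import time

    class FakeHook(object):
        def __init__(self, period_secs=30):
            self._period_secs = period_secs
            self._start_time = None

        def before_test(self):
            if self._start_time is None:
                # The real hook also enables the "rsSyncApplyStop" failpoint here.
                self._start_time = time.time()

        def after_test(self):
            if time.time() - self._start_time >= self._period_secs:
                print("period elapsed: kill secondaries, restart, verify SECONDARY state")
                self._start_time = None  # re-arm for the next run of tests

    hook = FakeHook(period_secs=2)
    for _ in range(5):
        hook.before_test()
        time.sleep(1)  # stands in for running one jsCore test
        hook.after_test()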
Diffstat (limited to 'buildscripts')
-rw-r--r--  buildscripts/resmokeconfig/suites/core_small_oplog_rs_kill_secondaries.yml |  32
-rw-r--r--  buildscripts/resmokelib/core/process.py                                    |   9
-rw-r--r--  buildscripts/resmokelib/testing/fixtures/replicaset.py                     |  17
-rw-r--r--  buildscripts/resmokelib/testing/hooks.py                                   | 291
4 files changed, 342 insertions(+), 7 deletions(-)
diff --git a/buildscripts/resmokeconfig/suites/core_small_oplog_rs_kill_secondaries.yml b/buildscripts/resmokeconfig/suites/core_small_oplog_rs_kill_secondaries.yml
new file mode 100644
index 00000000000..a4b6c5a5feb
--- /dev/null
+++ b/buildscripts/resmokeconfig/suites/core_small_oplog_rs_kill_secondaries.yml
@@ -0,0 +1,32 @@
+selector:
+  js_test:
+    roots:
+    - jstests/core/*.js
+    exclude_files:
+    # These tests are not expected to pass with replica-sets:
+    - jstests/core/dbadmin.js
+    - jstests/core/opcounters_write_cmd.js
+    - jstests/core/read_after_optime.js
+    - jstests/core/capped_update.js
+    # The following tests perform a write with a writeConcern of w=2 when 'testingReplication' is
+    # true. This causes the test to hang because the secondary is running with the "rsSyncApplyStop"
+    # failpoint enabled.
+    - jstests/core/geo_update_btree.js
+
+executor:
+  js_test:
+    config:
+      shell_options:
+        eval: "testingReplication = true;"
+        readMode: commands
+    hooks:
+    - class: PeriodicKillSecondaries
+    fixture:
+      class: ReplicaSetFixture
+      mongod_options:
+        oplogSize: 511
+        set_parameters:
+          enableTestCommands: 1
+          numInitialSyncAttempts: 1
+      num_nodes: 2
+      voting_secondaries: false
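For reference, the "class: PeriodicKillSecondaries" entry above is resolved by name through the _CUSTOM_BEHAVIORS registry that this patch extends at the bottom of buildscripts/resmokelib/testing/hooks.py. A minimal sketch of that lookup is shown below; make_hook() is a hypothetical helper (the real wiring lives in resmoke's executor code), and DummyBehavior stands in for the registered hook classes.

    # Hypothetical sketch of the name-to-class lookup; not resmoke code.
    class DummyBehavior(object):
        def __init__(self, logger, fixture, period_secs=30):
            self.logger = logger
            self.fixture = fixture
            self.period_secs = period_secs

    _CUSTOM_BEHAVIORS = {"PeriodicKillSecondaries": DummyBehavior}

    def make_hook(hook_config, logger, fixture):
        config = dict(hook_config)                        # e.g. {"class": "PeriodicKillSecondaries"}
        behavior_class = _CUSTOM_BEHAVIORS[config.pop("class")]
        return behavior_class(logger, fixture, **config)  # any remaining keys become kwargs

    hook = make_hook({"class": "PeriodicKillSecondaries"}, logger=None, fixture=None)
    print(type(hook).__name__, hook.period_secs)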
diff --git a/buildscripts/resmokelib/core/process.py b/buildscripts/resmokelib/core/process.py
index b8efa8af25a..eaa03cb241c 100644
--- a/buildscripts/resmokelib/core/process.py
+++ b/buildscripts/resmokelib/core/process.py
@@ -163,12 +163,12 @@ class Process(object):
if return_code == win32con.STILL_ACTIVE:
raise
- def stop(self):
+ def stop(self, kill=False):
"""Terminate the process."""
if sys.platform == "win32":
# Attempt to cleanly shutdown mongod.
- if len(self.args) > 0 and self.args[0].find("mongod") != -1:
+ if not kill and len(self.args) > 0 and self.args[0].find("mongod") != -1:
mongo_signal_handle = None
try:
mongo_signal_handle = win32event.OpenEvent(
@@ -214,7 +214,10 @@ class Process(object):
raise
else:
try:
- self._process.terminate()
+ if kill:
+ self._process.kill()
+ else:
+ self._process.terminate()
except OSError as err:
# ESRCH (errno=3) is received when the process has already died.
if err.errno != 3:
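On non-Windows platforms, the new kill flag above simply selects between SIGTERM and SIGKILL on the underlying subprocess. A minimal, self-contained illustration (not resmoke code) of that difference:

    # Popen.terminate() sends SIGTERM (clean shutdown); Popen.kill() sends SIGKILL
    # (unclean shutdown), which is what Process.stop(kill=True) opts into above.
    import subprocess
    import sys
    import time

    proc = subprocess.Popen([sys.executable, "-c", "import time; time.sleep(60)"])
    time.sleep(1)

    kill = True  # mirrors calling Process.stop(kill=True)
    if kill:
        proc.kill()       # SIGKILL: the child gets no chance to clean up
    else:
        proc.terminate()  # SIGTERM: the child may shut down cleanly

    print("exit code:", proc.wait())  # -9 for SIGKILL, -15 for SIGTERM on POSIX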
diff --git a/buildscripts/resmokelib/testing/fixtures/replicaset.py b/buildscripts/resmokelib/testing/fixtures/replicaset.py
index 71e5925679b..46202a41043 100644
--- a/buildscripts/resmokelib/testing/fixtures/replicaset.py
+++ b/buildscripts/resmokelib/testing/fixtures/replicaset.py
@@ -22,6 +22,7 @@ class ReplicaSetFixture(interface.ReplFixture):
"""
# Error response codes copied from mongo/base/error_codes.err.
+ _ALREADY_INITIALIZED = 23
_NODE_NOT_FOUND = 74
def __init__(self,
@@ -35,7 +36,8 @@ class ReplicaSetFixture(interface.ReplFixture):
start_initial_sync_node=False,
write_concern_majority_journal_default=None,
auth_options=None,
- replset_config_options=None):
+ replset_config_options=None,
+ voting_secondaries=True):
interface.ReplFixture.__init__(self, logger, job_num)
@@ -47,6 +49,7 @@ class ReplicaSetFixture(interface.ReplFixture):
self.write_concern_majority_journal_default = write_concern_majority_journal_default
self.auth_options = auth_options
self.replset_config_options = utils.default_if_none(replset_config_options, {})
+ self.voting_secondaries = voting_secondaries
# The dbpath in mongod_options is used as the dbpath prefix for replica set members and
# takes precedence over other settings. The ShardedClusterFixture uses this parameter to
@@ -98,9 +101,10 @@ class ReplicaSetFixture(interface.ReplFixture):
member_info = {"_id": i, "host": node.get_connection_string()}
if i > 0:
member_info["priority"] = 0
- if i >= 7:
- # Only 7 nodes in a replica set can vote, so the other members must be non-voting.
- member_info["votes"] = 0
+ if i >= 7 or not self.voting_secondaries:
+ # Only 7 nodes in a replica set can vote, so members beyond the first 7 must be
+ # non-voting even when this fixture is configured to have voting secondaries.
+ member_info["votes"] = 0
members.append(member_info)
if self.initial_sync_node:
members.append({"_id": self.initial_sync_node_idx,
@@ -144,6 +148,11 @@ class ReplicaSetFixture(interface.ReplFixture):
client.admin.command(initiate_cmd_obj)
break
except pymongo.errors.OperationFailure as err:
+ # Ignore errors from the "replSetInitiate" command when the replica set has already
+ # been initiated.
+ if err.code == ReplicaSetFixture._ALREADY_INITIALIZED:
+ return
+
# Retry on NodeNotFound errors from the "replSetInitiate" command.
if err.code != ReplicaSetFixture._NODE_NOT_FOUND:
raise
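With the suite's num_nodes: 2 and the new voting_secondaries: false setting, the replSetInitiate document the fixture builds would look roughly like the sketch below. This is a hypothetical example, not fixture output: the hostnames, ports, and set name are placeholders. Note that MongoDB only permits "votes": 0 on members that also have "priority": 0, which the fixture already assigns to every non-primary member.

    # Hypothetical shape of the initiate command for a two-node set with a
    # non-voting secondary; the real fixture derives hosts and the set name itself.
    initiate_cmd_obj = {
        "replSetInitiate": {
            "_id": "rs",
            "members": [
                {"_id": 0, "host": "localhost:20000"},
                # Non-primary members get priority 0; with voting_secondaries=False
                # they are also marked non-voting.
                {"_id": 1, "host": "localhost:20001", "priority": 0, "votes": 0},
            ],
        },
    }
    print(initiate_cmd_obj)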
diff --git a/buildscripts/resmokelib/testing/hooks.py b/buildscripts/resmokelib/testing/hooks.py
index ebcb562c731..82196af0c75 100644
--- a/buildscripts/resmokelib/testing/hooks.py
+++ b/buildscripts/resmokelib/testing/hooks.py
@@ -7,6 +7,7 @@ from __future__ import absolute_import
import os
import sys
+import time
import bson
import pymongo
@@ -390,6 +391,295 @@ class CheckReplOplogs(JsCustomBehavior):
shell_options=shell_options)
+class PeriodicKillSecondaries(CustomBehavior):
+ """
+ Periodically kills the secondaries in a replica set and verifies
+ that they can reach the SECONDARY state without having connectivity
+ to the primary after an unclean shutdown.
+ """
+
+ DEFAULT_PERIOD_SECS = 30
+
+ def __init__(self, logger, fixture, period_secs=DEFAULT_PERIOD_SECS):
+ if not isinstance(fixture, fixtures.ReplicaSetFixture):
+ raise TypeError("%s either does not support replication or does not support writing to"
+ " its oplog early"
+ % (fixture.__class__.__name__))
+
+ if fixture.num_nodes <= 1:
+ raise ValueError("PeriodicKillSecondaries requires the replica set to contain at least"
+ " one secondary")
+
+ description = ("PeriodicKillSecondaries (kills the secondary after running tests for a"
+ " configurable period of time)")
+ CustomBehavior.__init__(self, logger, fixture, description)
+
+ self._period_secs = period_secs
+ self._start_time = None
+
+ def after_suite(self, test_report):
+ if self._start_time is not None:
+ # Ensure that we test killing the secondary and having it reach state SECONDARY after
+ # being restarted at least once when running the suite.
+ self._run(test_report)
+
+ def before_test(self, test, test_report):
+ if self._start_time is not None:
+ # The "rsSyncApplyStop" failpoint is already enabled.
+ return
+
+ # Enable the "rsSyncApplyStop" failpoint on each of the secondaries to prevent them from
+ # applying any oplog entries while the test is running.
+ for secondary in self.fixture.get_secondaries():
+ client = utils.new_mongo_client(port=secondary.port)
+ try:
+ client.admin.command(bson.SON([
+ ("configureFailPoint", "rsSyncApplyStop"),
+ ("mode", "alwaysOn")]))
+ except pymongo.errors.OperationFailure as err:
+ self.logger.exception(
+ "Unable to disable oplog application on the mongod on port %d", secondary.port)
+ raise errors.ServerFailure(
+ "Unable to disable oplog application on the mongod on port %d: %s"
+ % (secondary.port, err.args[0]))
+
+ self._start_time = time.time()
+
+ def after_test(self, test, test_report):
+ self._last_test_name = test.short_name()
+
+ # Kill the secondaries and verify that they can reach the SECONDARY state if the specified
+ # period has elapsed.
+ should_check_secondaries = time.time() - self._start_time >= self._period_secs
+ if not should_check_secondaries:
+ return
+
+ self._run(test_report)
+
+ def _run(self, test_report):
+ self.hook_test_case = testcases.TestCase(
+ self.logger,
+ "Hook",
+ "%s:%s" % (self._last_test_name, self.logger_name))
+ CustomBehavior.start_dynamic_test(self.hook_test_case, test_report)
+
+ try:
+ self._kill_secondaries()
+ self._check_secondaries_and_restart_fixture()
+
+ # Validate all collections on all nodes after having the secondaries reconcile the end
+ # of their oplogs.
+ self._validate_collections(test_report)
+
+ # Verify that the dbhashes match across all nodes after having the secondaries reconcile
+ # the end of their oplogs.
+ self._check_repl_dbhash(test_report)
+
+ self._restart_and_clear_fixture()
+ except Exception as err:
+ self.hook_test_case.logger.exception(
+ "Encountered an error running PeriodicKillSecondaries.")
+ self.hook_test_case.return_code = 2
+ test_report.addFailure(self.hook_test_case, sys.exc_info())
+ raise errors.StopExecution(err.args[0])
+ else:
+ self.hook_test_case.return_code = 0
+ test_report.addSuccess(self.hook_test_case)
+ finally:
+ test_report.stopTest(self.hook_test_case)
+
+ # Set the hook back into a state where it will disable oplog application at the start
+ # of the next test that runs.
+ self._start_time = None
+
+ def _kill_secondaries(self):
+ for secondary in self.fixture.get_secondaries():
+ # Disable the "rsSyncApplyStop" failpoint on the secondary to have it resume applying
+ # oplog entries.
+ client = utils.new_mongo_client(port=secondary.port)
+ try:
+ client.admin.command(bson.SON([
+ ("configureFailPoint", "rsSyncApplyStop"),
+ ("mode", "off")]))
+ except pymongo.errors.OperationFailure as err:
+ self.logger.exception(
+ "Unable to re-enable oplog application on the mongod on port %d",
+ secondary.port)
+ raise errors.ServerFailure(
+ "Unable to re-enable oplog application on the mongod on port %d: %s"
+ % (secondary.port, err.args[0]))
+
+ # Wait a little bit for the secondary to start applying oplog entries so that we are more
+ # likely to kill the mongod process while it is partway into applying a batch.
+ time.sleep(0.1)
+
+ # Check that the secondary is still running before forcibly terminating it. This ensures
+ # we still detect some cases in which the secondary has already crashed.
+ if not secondary.is_running():
+ raise errors.ServerFailure(
+ "mongod on port %d was expected to be running in"
+ " PeriodicKillSecondaries.after_test(), but wasn't."
+ % (secondary.port))
+
+ self.hook_test_case.logger.info(
+ "Killing the secondary on port %d..." % (secondary.port))
+ secondary.mongod.stop(kill=True)
+
+ # Teardown may or may not be considered a success as a result of killing a secondary, so we
+ # ignore the return value of Fixture.teardown().
+ self.fixture.teardown()
+
+ def _check_secondaries_and_restart_fixture(self):
+ preserve_dbpaths = []
+ for node in self.fixture.nodes:
+ preserve_dbpaths.append(node.preserve_dbpath)
+ node.preserve_dbpath = True
+
+ for secondary in self.fixture.get_secondaries():
+ self._check_invariants_as_standalone(secondary)
+
+ # Start the 'secondary' mongod back up as part of the replica set and wait for it to
+ # reach state SECONDARY.
+ secondary.setup()
+ secondary.await_ready()
+ self._await_secondary_state(secondary)
+
+ teardown_success = secondary.teardown()
+ if not teardown_success:
+ raise errors.ServerFailure(
+ "%s did not exit cleanly after reconciling the end of its oplog" % (secondary))
+
+ self.hook_test_case.logger.info(
+ "Starting the fixture back up again with its data files intact...")
+
+ try:
+ self.fixture.setup()
+ self.fixture.await_ready()
+ finally:
+ for (i, node) in enumerate(self.fixture.nodes):
+ node.preserve_dbpath = preserve_dbpaths[i]
+
+ def _validate_collections(self, test_report):
+ validate_test_case = ValidateCollections(self.logger, self.fixture)
+ validate_test_case.before_suite(test_report)
+ validate_test_case.before_test(self.hook_test_case, test_report)
+ validate_test_case.after_test(self.hook_test_case, test_report)
+ validate_test_case.after_suite(test_report)
+
+ def _check_repl_dbhash(self, test_report):
+ dbhash_test_case = CheckReplDBHash(self.logger, self.fixture)
+ dbhash_test_case.before_suite(test_report)
+ dbhash_test_case.before_test(self.hook_test_case, test_report)
+ dbhash_test_case.after_test(self.hook_test_case, test_report)
+ dbhash_test_case.after_suite(test_report)
+
+ def _restart_and_clear_fixture(self):
+ # We restart the fixture after setting 'preserve_dbpath' back to its original value in order
+ # to clear the contents of the data directory if desired. The CleanEveryN hook cannot be
+ # used in combination with the PeriodicKillSecondaries hook because we may attempt to call
+ # Fixture.teardown() while the "rsSyncApplyStop" failpoint is still enabled on the
+ # secondaries, causing them to exit with a non-zero return code.
+ self.hook_test_case.logger.info(
+ "Finished verifying data consistency, stopping the fixture...")
+
+ teardown_success = self.fixture.teardown()
+ if not teardown_success:
+ raise errors.ServerFailure(
+ "%s did not exit cleanly after verifying data consistency"
+ % (self.fixture))
+
+ self.hook_test_case.logger.info("Starting the fixture back up again...")
+ self.fixture.setup()
+ self.fixture.await_ready()
+
+ def _check_invariants_as_standalone(self, secondary):
+ # We remove the --replSet option in order to start the node as a standalone.
+ replset_name = secondary.mongod_options.pop("replSet")
+
+ try:
+ secondary.setup()
+ secondary.await_ready()
+
+ client = utils.new_mongo_client(port=secondary.port)
+ minvalid_doc = client.local["replset.minvalid"].find_one()
+
+ latest_oplog_doc = client.local["oplog.rs"].find_one(
+ sort=[("$natural", pymongo.DESCENDING)])
+
+ if minvalid_doc is not None:
+ # Check the invariants 'begin <= minValid', 'minValid <= oplogDeletePoint', and
+ # 'minValid <= top of oplog' before the secondary has reconciled the end of its
+ # oplog.
+ null_ts = bson.Timestamp(0, 0)
+ begin_ts = minvalid_doc.get("begin", {}).get("ts", null_ts)
+ minvalid_ts = minvalid_doc.get("ts", begin_ts)
+ oplog_delete_point_ts = minvalid_doc.get("oplogDeleteFromPoint", minvalid_ts)
+
+ if minvalid_ts == null_ts:
+ # The server treats the "ts" field in the minValid document as missing when its
+ # value is the null timestamp.
+ minvalid_ts = begin_ts
+
+ if oplog_delete_point_ts == null_ts:
+ # The server treats the "oplogDeleteFromPoint" field as missing when its value
+ # is the null timestamp.
+ oplog_delete_point_ts = minvalid_ts
+
+ latest_oplog_entry_ts = latest_oplog_doc.get("ts", oplog_delete_point_ts)
+
+ if not begin_ts <= minvalid_ts:
+ raise errors.ServerFailure(
+ "The condition begin <= minValid (%s <= %s) doesn't hold: minValid"
+ " document=%s, latest oplog entry=%s"
+ % (begin_ts, minvalid_ts, minvalid_doc, latest_oplog_doc))
+
+ if not minvalid_ts <= oplog_delete_point_ts:
+ raise errors.ServerFailure(
+ "The condition minValid <= oplogDeletePoint (%s <= %s) doesn't hold:"
+ " minValid document=%s, latest oplog entry=%s"
+ % (minvalid_ts, oplog_delete_point_ts, minvalid_doc, latest_oplog_doc))
+
+ if not minvalid_ts <= latest_oplog_entry_ts:
+ raise errors.ServerFailure(
+ "The condition minValid <= top of oplog (%s <= %s) doesn't hold: minValid"
+ " document=%s, latest oplog entry=%s"
+ % (minvalid_ts, latest_oplog_entry_ts, minvalid_doc, latest_oplog_doc))
+
+ teardown_success = secondary.teardown()
+ if not teardown_success:
+ raise errors.ServerFailure(
+ "%s did not exit cleanly after being started up as a standalone" % (secondary))
+ except pymongo.errors.OperationFailure as err:
+ self.hook_test_case.logger.exception(
+ "Failed to read the minValid document or the latest oplog entry from the mongod on"
+ " port %d",
+ secondary.port)
+ raise errors.ServerFailure(
+ "Failed to read the minValid document or the latest oplog entry from the mongod on"
+ " port %d: %s"
+ % (secondary.port, err.args[0]))
+ finally:
+ # Set the secondary's options back to their original values.
+ secondary.mongod_options["replSet"] = replset_name
+
+ def _await_secondary_state(self, secondary):
+ client = utils.new_mongo_client(port=secondary.port)
+ try:
+ client.admin.command(bson.SON([
+ ("replSetTest", 1),
+ ("waitForMemberState", 2), # 2 = SECONDARY
+ ("timeoutMillis", fixtures.ReplFixture.AWAIT_REPL_TIMEOUT_MINS * 60 * 1000)]))
+ except pymongo.errors.OperationFailure as err:
+ self.hook_test_case.logger.exception(
+ "mongod on port %d failed to reach state SECONDARY after %d seconds",
+ secondary.port,
+ fixtures.ReplFixture.AWAIT_REPL_TIMEOUT_MINS * 60)
+ raise errors.ServerFailure(
+ "mongod on port %d failed to reach state SECONDARY after %d seconds: %s"
+ % (secondary.port, fixtures.ReplFixture.AWAIT_REPL_TIMEOUT_MINS * 60, err.args[0]))
+
+
_CUSTOM_BEHAVIORS = {
"CleanEveryN": CleanEveryN,
"CheckReplDBHash": CheckReplDBHash,
@@ -397,4 +687,5 @@ _CUSTOM_BEHAVIORS = {
"ValidateCollections": ValidateCollections,
"IntermediateInitialSync": IntermediateInitialSync,
"BackgroundInitialSync": BackgroundInitialSync,
+ "PeriodicKillSecondaries": PeriodicKillSecondaries,
}
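The densest part of the new hook is _check_invariants_as_standalone(). The snippet below is a simplified restatement of its three timestamp invariants as a standalone function, not hook code; it assumes the minValid document and the latest oplog entry are plain dicts whose timestamp fields are bson.Timestamp values (the same pymongo bson package the hook imports).

    # Simplified restatement of the invariants begin <= minValid,
    # minValid <= oplogDeletePoint, and minValid <= top of oplog.
    import bson

    def check_minvalid_invariants(minvalid_doc, latest_oplog_doc):
        null_ts = bson.Timestamp(0, 0)

        begin_ts = minvalid_doc.get("begin", {}).get("ts", null_ts)

        minvalid_ts = minvalid_doc.get("ts", begin_ts)
        if minvalid_ts == null_ts:
            minvalid_ts = begin_ts  # a null "ts" is treated as missing

        oplog_delete_point_ts = minvalid_doc.get("oplogDeleteFromPoint", minvalid_ts)
        if oplog_delete_point_ts == null_ts:
            oplog_delete_point_ts = minvalid_ts  # likewise treated as missing

        latest_oplog_entry_ts = latest_oplog_doc.get("ts", oplog_delete_point_ts)

        assert begin_ts <= minvalid_ts, "begin <= minValid violated"
        assert minvalid_ts <= oplog_delete_point_ts, "minValid <= oplogDeletePoint violated"
        assert minvalid_ts <= latest_oplog_entry_ts, "minValid <= top of oplog violated"

    check_minvalid_invariants(
        {"ts": bson.Timestamp(1485820397, 2), "oplogDeleteFromPoint": bson.Timestamp(1485820397, 3)},
        {"ts": bson.Timestamp(1485820397, 5)})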