diff options
author | Gregory Wlodarek <gregory.wlodarek@mongodb.com> | 2022-03-25 19:18:41 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2022-03-31 16:38:23 +0000 |
commit | e827fca1f586eb49ea1536b2985bd55a0105093b (patch) | |
tree | 15aa09e60ec6e3550c3307f2444721ed31a298a4 /buildscripts | |
parent | 2dde7918ca79c09720aa500914243c28f0392df3 (diff) | |
download | mongo-e827fca1f586eb49ea1536b2985bd55a0105093b.tar.gz |
SERVER-64647 Fix errors in SimulateCrashes hook and convert it to use BGHook
Diffstat (limited to 'buildscripts')
-rw-r--r-- | buildscripts/resmokelib/testing/hooks/simulate_crash.py | 132 |
1 files changed, 31 insertions, 101 deletions
diff --git a/buildscripts/resmokelib/testing/hooks/simulate_crash.py b/buildscripts/resmokelib/testing/hooks/simulate_crash.py index 0755b4236a4..4f5db37baf9 100644 --- a/buildscripts/resmokelib/testing/hooks/simulate_crash.py +++ b/buildscripts/resmokelib/testing/hooks/simulate_crash.py @@ -10,86 +10,55 @@ indicates a problem. """ import os -import pprint import random import shutil -import threading import time import pymongo from buildscripts.resmokelib.core import process -from buildscripts.resmokelib.testing.hooks import interface +from buildscripts.resmokelib.testing.hooks import bghook -def validate(mdb, logger): +def validate(mdb, logger, acceptable_err_codes): """Return true if all collections are valid.""" for db in mdb.database_names(): for coll in mdb.get_database(db).list_collection_names(): res = mdb.get_database(db).command({"validate": coll}, check=False) if res["ok"] != 1.0 or res["valid"] is not True: - if "code" in res and res["code"] == 166: + if "code" in res and res["code"] in acceptable_err_codes: # Command not supported on view. pass else: - logger.info("FAILURE!\nValidate Response: {}", pprint.pformat(res)) + logger.info("FAILURE!\nValidate Response: ") + logger.info(res) return False return True -class BGJob(threading.Thread): - """Background job to pause nodes, copy data files, resume nodes, and validate data files.""" +class SimulateCrash(bghook.BGHook): + """A hook to simulate crashes.""" - def __init__(self, hook): - """Initialize the background job.""" - threading.Thread.__init__(self, name="SimulateCrashes") - self.daemon = True - self._hook = hook - self._lock = threading.Lock() - self._is_alive = True + def __init__(self, hook_logger, fixture): + """Initialize SimulateCrash.""" + bghook.BGHook.__init__(self, hook_logger, fixture, "Simulate crashes hook") + self.acceptable_err_codes = [166, 11600] self.backup_num = 0 - self.found_error = False - - def run(self): - """Run the background job.""" - while True: - with self._lock: - if self.is_alive is False: - break - - self._hook.pause_and_copy(self.backup_num) - if not self._hook.validate_all(self.backup_num): - self.found_error = True - self._hook.running_test.fixture.teardown() - self.is_alive = False - return + self.validate_port = self.fixture.fixturelib.get_next_port(self.fixture.job_num) - time.sleep(random.randint(1, 5)) - self.backup_num += 1 + def run_action(self): + """Copy data files and run validation on all nodes.""" + self.pause_and_copy() - def kill(self): - """Kill the background job.""" - with self._lock: - self.is_alive = False + if not self.validate_all(): + raise ValueError("Validation failed") + time.sleep(random.randint(1, 5)) + self.backup_num += 1 -class SimulateCrash(interface.Hook): - """A hook to simulate crashes.""" - - IS_BACKGROUND = True - - def __init__(self, hook_logger, fixture): - """Initialize SimulateCrash.""" - interface.Hook.__init__(self, hook_logger, fixture, "Simulate crashes hook") - self.found_error = False - self.last_validate_port = 19000 - self.logger = hook_logger - self.running_test = None - self._background_job = None - - def pause_and_copy(self, backup_num): + def pause_and_copy(self): """For all replica set nodes, this will send a SIGSTOP signal, copy the data files and send a SIGCONT signal.""" - self.logger.info("Taking snapshot #{}".format(backup_num)) + self.logger.info("Taking snapshot #{}".format(self.backup_num)) nodes_to_copy = [x for x in self.fixture.nodes] random.shuffle(nodes_to_copy) @@ -111,7 +80,8 @@ class SimulateCrash(interface.Hook): fqfn = "/".join([tup[0], filename]) self.copy_file( node.get_dbpath_prefix(), fqfn, - node.get_dbpath_prefix() + "/simulateCrashes/{}".format(backup_num)) + node.get_dbpath_prefix() + "/simulateCrashes/{}".format( + self.backup_num)) finally: node.mongod.resume() @@ -128,27 +98,24 @@ class SimulateCrash(interface.Hook): os.close(out_fd) os.close(in_fd) - def validate_all(self, backup_num): + def validate_all(self): """Start a standalone node to validate all collections on the copied data files.""" for node in self.fixture.nodes: - if self.last_validate_port >= 20000: - self.last_validate_port = 19000 - validate_port = self.last_validate_port - self.last_validate_port += 1 - - path = node.get_dbpath_prefix() + "/simulateCrashes/{}".format(backup_num) + path = node.get_dbpath_prefix() + "/simulateCrashes/{}".format(self.backup_num) self.logger.info("Starting to validate. DBPath: {} Port: {}".format( - path, validate_port)) + path, self.validate_port)) mdb = process.Process(self.logger, [ node.mongod_executable, "--dbpath", path, "--port", - str(validate_port), "--logpath", + str(self.validate_port), "--logpath", node.get_dbpath_prefix() + "/simulateCrashes/validate.log" ]) mdb.start() - client = pymongo.MongoClient("localhost:{}".format(validate_port)) - is_valid = validate(client, self.logger) + client = pymongo.MongoClient(host="localhost", port=self.validate_port, connect=True, + connectTimeoutMS=120000, serverSelectionTimeoutMS=120000, + directConnection=True) + is_valid = validate(client, self.logger, self.acceptable_err_codes) mdb.stop() mdb.wait() @@ -158,40 +125,3 @@ class SimulateCrash(interface.Hook): shutil.rmtree(path, ignore_errors=True) return True - - def before_suite(self, test_report): - """Start the background thread.""" - self.logger.info("Starting the SimulateCrashes thread.") - self._background_job = BGJob(self) - self._background_job.start() - - def after_suite(self, test_report, teardown_flag=None): - """Signal the background thread to exit, and wait until it does.""" - if self._background_job is None: - return - - self.logger.info("Stopping the SimulateCrashes thread.") - self._background_job.kill() - self._background_job.join() - - if self._background_job.found_error: - self.logger.error("Encountered an error inside the simulate crashes hook.", - exc_info=self._background_job.exc_info) - - def before_test(self, test, test_report): - """Each test will call this before it executes.""" - self.running_test = test - - def after_test(self, test, test_report): - """Each test will call this after it executes. Check if the hook found an error.""" - if self._background_job is None: - return - - if not self._background_job.found_error: - return - - self._background_job.kill() - self._background_job.join() - - self.logger.error("Encountered an error inside the simulate crashes hook.", - exc_info=self._background_job.exc_info) |