summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Gregory Wlodarek <gregory.wlodarek@mongodb.com> 2022-03-25 19:18:41 +0000
committer Evergreen Agent <no-reply@evergreen.mongodb.com> 2022-03-31 16:38:23 +0000
commit e827fca1f586eb49ea1536b2985bd55a0105093b (patch)
tree 15aa09e60ec6e3550c3307f2444721ed31a298a4
parent 2dde7918ca79c09720aa500914243c28f0392df3 (diff)
download mongo-e827fca1f586eb49ea1536b2985bd55a0105093b.tar.gz
SERVER-64647 Fix errors in SimulateCrashes hook and convert it to use BGHook
-rw-r--r-- buildscripts/resmokelib/testing/hooks/simulate_crash.py 132
1 files changed, 31 insertions, 101 deletions
diff --git a/buildscripts/resmokelib/testing/hooks/simulate_crash.py b/buildscripts/resmokelib/testing/hooks/simulate_crash.py
index 0755b4236a4..4f5db37baf9 100644
--- a/buildscripts/resmokelib/testing/hooks/simulate_crash.py
+++ b/buildscripts/resmokelib/testing/hooks/simulate_crash.py
@@ -10,86 +10,55 @@ indicates a problem.
"""
import os
-import pprint
import random
import shutil
-import threading
import time
import pymongo
from buildscripts.resmokelib.core import process
-from buildscripts.resmokelib.testing.hooks import interface
+from buildscripts.resmokelib.testing.hooks import bghook
-def validate(mdb, logger):
+def validate(mdb, logger, acceptable_err_codes):
"""Return true if all collections are valid."""
for db in mdb.database_names():
for coll in mdb.get_database(db).list_collection_names():
res = mdb.get_database(db).command({"validate": coll}, check=False)
if res["ok"] != 1.0 or res["valid"] is not True:
- if "code" in res and res["code"] == 166:
+ if "code" in res and res["code"] in acceptable_err_codes:
# Command not supported on view.
pass
else:
- logger.info("FAILURE!\nValidate Response: {}", pprint.pformat(res))
+ logger.info("FAILURE!\nValidate Response: ")
+ logger.info(res)
return False
return True
-class BGJob(threading.Thread):
- """Background job to pause nodes, copy data files, resume nodes, and validate data files."""
+class SimulateCrash(bghook.BGHook):
+ """A hook to simulate crashes."""
- def __init__(self, hook):
- """Initialize the background job."""
- threading.Thread.__init__(self, name="SimulateCrashes")
- self.daemon = True
- self._hook = hook
- self._lock = threading.Lock()
- self._is_alive = True
+ def __init__(self, hook_logger, fixture):
+ """Initialize SimulateCrash."""
+ bghook.BGHook.__init__(self, hook_logger, fixture, "Simulate crashes hook")
+ self.acceptable_err_codes = [166, 11600]
self.backup_num = 0
- self.found_error = False
-
- def run(self):
- """Run the background job."""
- while True:
- with self._lock:
- if self.is_alive is False:
- break
-
- self._hook.pause_and_copy(self.backup_num)
- if not self._hook.validate_all(self.backup_num):
- self.found_error = True
- self._hook.running_test.fixture.teardown()
- self.is_alive = False
- return
+ self.validate_port = self.fixture.fixturelib.get_next_port(self.fixture.job_num)
- time.sleep(random.randint(1, 5))
- self.backup_num += 1
+ def run_action(self):
+ """Copy data files and run validation on all nodes."""
+ self.pause_and_copy()
- def kill(self):
- """Kill the background job."""
- with self._lock:
- self.is_alive = False
+ if not self.validate_all():
+ raise ValueError("Validation failed")
+ time.sleep(random.randint(1, 5))
+ self.backup_num += 1
-class SimulateCrash(interface.Hook):
- """A hook to simulate crashes."""
-
- IS_BACKGROUND = True
-
- def __init__(self, hook_logger, fixture):
- """Initialize SimulateCrash."""
- interface.Hook.__init__(self, hook_logger, fixture, "Simulate crashes hook")
- self.found_error = False
- self.last_validate_port = 19000
- self.logger = hook_logger
- self.running_test = None
- self._background_job = None
-
- def pause_and_copy(self, backup_num):
+ def pause_and_copy(self):
"""For all replica set nodes, this will send a SIGSTOP signal, copy the data files and send a SIGCONT signal."""
- self.logger.info("Taking snapshot #{}".format(backup_num))
+ self.logger.info("Taking snapshot #{}".format(self.backup_num))
nodes_to_copy = [x for x in self.fixture.nodes]
random.shuffle(nodes_to_copy)
@@ -111,7 +80,8 @@ class SimulateCrash(interface.Hook):
fqfn = "/".join([tup[0], filename])
self.copy_file(
node.get_dbpath_prefix(), fqfn,
- node.get_dbpath_prefix() + "/simulateCrashes/{}".format(backup_num))
+ node.get_dbpath_prefix() + "/simulateCrashes/{}".format(
+ self.backup_num))
finally:
node.mongod.resume()
@@ -128,27 +98,24 @@ class SimulateCrash(interface.Hook):
os.close(out_fd)
os.close(in_fd)
- def validate_all(self, backup_num):
+ def validate_all(self):
"""Start a standalone node to validate all collections on the copied data files."""
for node in self.fixture.nodes:
- if self.last_validate_port >= 20000:
- self.last_validate_port = 19000
- validate_port = self.last_validate_port
- self.last_validate_port += 1
-
- path = node.get_dbpath_prefix() + "/simulateCrashes/{}".format(backup_num)
+ path = node.get_dbpath_prefix() + "/simulateCrashes/{}".format(self.backup_num)
self.logger.info("Starting to validate. DBPath: {} Port: {}".format(
- path, validate_port))
+ path, self.validate_port))
mdb = process.Process(self.logger, [
node.mongod_executable, "--dbpath", path, "--port",
- str(validate_port), "--logpath",
+ str(self.validate_port), "--logpath",
node.get_dbpath_prefix() + "/simulateCrashes/validate.log"
])
mdb.start()
- client = pymongo.MongoClient("localhost:{}".format(validate_port))
- is_valid = validate(client, self.logger)
+ client = pymongo.MongoClient(host="localhost", port=self.validate_port, connect=True,
+ connectTimeoutMS=120000, serverSelectionTimeoutMS=120000,
+ directConnection=True)
+ is_valid = validate(client, self.logger, self.acceptable_err_codes)
mdb.stop()
mdb.wait()
@@ -158,40 +125,3 @@ class SimulateCrash(interface.Hook):
shutil.rmtree(path, ignore_errors=True)
return True
-
- def before_suite(self, test_report):
- """Start the background thread."""
- self.logger.info("Starting the SimulateCrashes thread.")
- self._background_job = BGJob(self)
- self._background_job.start()
-
- def after_suite(self, test_report, teardown_flag=None):
- """Signal the background thread to exit, and wait until it does."""
- if self._background_job is None:
- return
-
- self.logger.info("Stopping the SimulateCrashes thread.")
- self._background_job.kill()
- self._background_job.join()
-
- if self._background_job.found_error:
- self.logger.error("Encountered an error inside the simulate crashes hook.",
- exc_info=self._background_job.exc_info)
-
- def before_test(self, test, test_report):
- """Each test will call this before it executes."""
- self.running_test = test
-
- def after_test(self, test, test_report):
- """Each test will call this after it executes. Check if the hook found an error."""
- if self._background_job is None:
- return
-
- if not self._background_job.found_error:
- return
-
- self._background_job.kill()
- self._background_job.join()
-
- self.logger.error("Encountered an error inside the simulate crashes hook.",
- exc_info=self._background_job.exc_info)