diff options
author | Jonathan Abrahams <jonathan@mongodb.com> | 2017-11-08 20:48:46 -0500 |
---|---|---|
committer | Jonathan Abrahams <jonathan@mongodb.com> | 2017-11-08 20:48:46 -0500 |
commit | 9dec24bdf2e1f63b0fcba6a32763e97288583e5d (patch) | |
tree | 04d078db235b8d506989e340fa3f25a6ecbc42c9 | |
parent | 6d8e6b9cce052cdd442e207a27df10e698b2bb00 (diff) | |
download | mongo-9dec24bdf2e1f63b0fcba6a32763e97288583e5d.tar.gz |
SERVER-31325 Powercycle new tasks:
- Replication WT
- Replication small oplog
- Sync delay
-rwxr-xr-x | buildscripts/aws_ec2.py | 45 |
-rw-r--r-- | etc/evergreen.yml | 141 |
-rwxr-xr-x | pytests/powertest.py | 266 |
3 files changed, 347 insertions, 105 deletions
diff --git a/buildscripts/aws_ec2.py b/buildscripts/aws_ec2.py index 2d643e045c1..6becdeaa6d6 100755 --- a/buildscripts/aws_ec2.py +++ b/buildscripts/aws_ec2.py @@ -44,24 +44,39 @@ class AwsEc2(object): @staticmethod def wait_for_state(instance, state, wait_time_secs=0, show_progress=False): """Wait up to 'wait_time_secs' for instance to be in 'state'. - Return True if 'state' reached.""" - end_time = time.time() + wait_time_secs + Return 0 if 'state' reached, 1 otherwise.""" if show_progress: print("Waiting for instance {} to reach '{}' state".format(instance, state), end="", file=sys.stdout) - while time.time() < end_time: + reached_state = False + end_time = time.time() + wait_time_secs + while True: if show_progress: print(".", end="", file=sys.stdout) sys.stdout.flush() - time.sleep(5) - instance.load() - if instance.state["Name"] == state: - if show_progress: - print(" Instance {}!".format(state), file=sys.stdout) - sys.stdout.flush() - return True - return False + try: + time_left = end_time - time.time() + instance.load() + if instance.state["Name"] == state: + reached_state = True + break + if time_left <= 0: + break + except botocore.exceptions.ClientError: + # A ClientError exception can sometimes be generated, due to RequestLimitExceeded, + # so we ignore it and retry until we time out. + pass + wait_interval_secs = 15 if time_left > 15 else time_left + time.sleep(wait_interval_secs) + if show_progress: + if reached_state: + print(" Instance {}!".format(instance.state["Name"]), file=sys.stdout) + else: + print(" Instance in state '{}', failed to reach state '{}'!".format( + instance.state["Name"], state), file=sys.stdout) + sys.stdout.flush() + return 0 if reached_state else 1 def control_instance(self, mode, image_id, wait_time_secs=0, show_progress=False): """Controls an AMI instance. 
Returns 0 & status information, if successful.""" @@ -69,6 +84,7 @@ class AwsEc2(object): raise ValueError( "Invalid mode '{}' specified, choose from {}.".format(mode, _MODES)) + sys.stdout.flush() instance = self.connection.Instance(image_id) try: if mode == "start": @@ -92,8 +108,9 @@ class AwsEc2(object): except botocore.exceptions.ClientError as err: return 1, err.message + ret = 0 if wait_time_secs > 0: - self.wait_for_state( + ret = self.wait_for_state( instance=instance, state=state, wait_time_secs=wait_time_secs, @@ -113,7 +130,7 @@ class AwsEc2(object): except botocore.exceptions.ClientError as err: return 1, err.message - return 0, status + return ret, status def tag_instance(self, image_id, tags): """Tags an AMI instance. """ @@ -208,7 +225,7 @@ def main(): control_options.add_option("--waitTimeSecs", dest="wait_time_secs", type=int, - default=60, + default=5 * 60, help="Time to wait for EC2 instance to reach it's new state," " defaults to '%default'.") diff --git a/etc/evergreen.yml b/etc/evergreen.yml index 626abf5dcbe..2ef2356604b 100644 --- a/etc/evergreen.yml +++ b/etc/evergreen.yml @@ -180,9 +180,9 @@ variables: ec2_artifacts: ${monitor_proc_file} ${monitor_system_file} ${log_path} ${backup_path_after} ${backup_path_before} program_options: --logLevel info --backupPathBefore ${backup_path_before} --backupPathAfter ${backup_path_after} connection_options: --sshUserHost ${ip_address} --sshConnection \"${ssh_identity} ${ssh_connection_options}\" - test_options: --testLoops 25 --seedDocNum 10000 --rsync --validate local --canary local + test_options: --testLoops 15 --seedDocNum 10000 --rsync --validate local --canary local crash_options: "--crashMethod aws_ec2 --crashOptions ${instance_id}:private_ip_address --crashWaitTime 30 --jitterForCrashWaitTime 5" - client_options: --numCrudClients 10 --numFsmClients 10 + client_options: --numCrudClients 20 --numFsmClients 20 mongodb_options: --rootDir ${remote_dir}-${task_id} --mongodbBinDir ${remote_dir} 
mongod_options: --mongodUsablePorts ${standard_port} ${secret_port} --dbPath ${db_path} --logPath ${log_path} mongod_extra_options: --mongodOptions \"--setParameter enableTestCommands=1\" @@ -1304,6 +1304,77 @@ functions: --retries ${ssh_retries|0} \ --commands "$cmds" + - command: shell.exec + params: + shell: bash + working_dir: src + script: | + set -o errexit + ${set_sudo} + # Many systems have the firewall disabled, by default. In case the firewall is + # enabled we add rules for the mongod ports on the remote. + # RHEL 7 firewall rules + if [ ! -z "$(which firewall-cmd 2> /dev/null)" ]; then + cmds="$sudo firewall-cmd --permanent --zone=public --add-port=ssh/tcp" + cmds="$cmds; $sudo firewall-cmd --permanent --zone=public --add-port=${standard_port}/tcp" + cmds="$cmds; $sudo firewall-cmd --permanent --zone=public --add-port=${secret_port}/tcp" + cmds="$cmds; $sudo firewall-cmd --reload" + cmds="$cmds; $sudo firewall-cmd --list-all" + # ArchLinux, Debian, RHEL 6 firewall rules + elif [ ! -z "$($sudo iptables --list 2> /dev/null)" ]; then + cmds="$sudo iptables -I INPUT 1 -p tcp --dport ssh -j ACCEPT" + cmds="$cmds; $sudo iptables -I INPUT 1 -p tcp --dport ${standard_port} -j ACCEPT" + cmds="$cmds; $sudo iptables -I INPUT 1 -p tcp --dport ${secret_port} -j ACCEPT" + if [ -d /etc/iptables ]; then + rules_file=/etc/iptables/iptables.rules + elif [ -f /etc/sysconfig/iptables ]; then + rules_file=/etc/sysconfig/iptables + else + rules_file=/etc/iptables.up.rules + fi + cmds="$cmds; $sudo iptables-save | $sudo tee $rules_file" + cmds="$cmds; $sudo iptables --list-rules" + elif [ ! 
-z "$($sudo service iptables status 2> /dev/null)" ]; then + cmds="$sudo iptables -I INPUT 1 -p tcp --dport ssh -j ACCEPT" + cmds="$cmds; $sudo iptables -I INPUT 1 -p tcp --dport ${standard_port} -j ACCEPT" + cmds="$cmds; $sudo iptables -I INPUT 1 -p tcp --dport ${secret_port} -j ACCEPT" + cmds="$cmds; $sudo service iptables save" + cmds="$cmds; $sudo service iptables status" + # Ubuntu firewall rules + elif [ ! -z "$($sudo ufw status 2> /dev/null)" ]; then + cmds="$sudo ufw allow ssh/tcp" + cmds="$cmds; $sudo ufw allow ${standard_port}/tcp" + cmds="$cmds; $sudo ufw allow ${secret_port}/tcp" + cmds="$cmds; $sudo ufw reload" + cmds="$cmds; $sudo ufw status" + # SuSE firewall rules + # TODO: Add firewall rules using SuSEfirewall2 + elif [ ! -z "$($sudo /sbin/SuSEfirewall2 help 2> /dev/null)" ]; then + cmds="$sudo /sbin/SuSEfirewall2 stop" + cmds="$cmds; $sudo /sbin/SuSEfirewall2 off" + # Windows firewall rules + elif [ ! -z "$(netsh advfirewall show store 2> /dev/null)" ]; then + add_rule="netsh advfirewall firewall add rule" + cmds="$add_rule name='MongoDB port ${standard_port} in' dir=in action=allow protocol=TCP localport=${standard_port}" + cmds="$cmds; $add_rule name='MongoDB port ${standard_port} out' dir=in action=allow protocol=TCP localport=${standard_port}" + cmds="$cmds; $add_rule name='MongoDB port ${secret_port} in' dir=in action=allow protocol=TCP localport=${secret_port}" + cmds="$cmds; $add_rule name='MongoDB port ${secret_port} out' dir=in action=allow protocol=TCP localport=${secret_port}" + cmds="$cmds; netsh advfirewall firewall show rule name=all | grep -A 13 'MongoDB'" + else + echo "Firewall not active or unknown firewall command on this platform" + exit 0 + fi + set -o errexit + if [ ! 
-z "$cmds" ]; then + ssh_connection_options="${ssh_identity} ${ssh_connection_options}" + ${python|/opt/mongodbtoolchain/v2/bin/python2} buildscripts/remote_operations.py \ + --verbose \ + --userHost $USER@${ip_address} \ + --sshConnectionOptions "$ssh_connection_options" \ + --retries ${ssh_retries|0} \ + --commands "$cmds" + fi + "copy EC2 artifacts": - command: shell.exec params: @@ -4157,7 +4228,7 @@ tasks: # mongod will not start if it crashed mongod while creating a namespace (SERVER-26499). vars: <<: *powercycle_test - client_options: --numCrudClients 10 --numFsmClients 0 + client_options: --numCrudClients 20 --numFsmClients 0 mongod_extra_options: --mongodOptions \"--setParameter enableTestCommands=1 --storageEngine mmapv1\" - name: powercycle_WT @@ -4196,10 +4267,67 @@ tasks: - func: "run powercycle test" vars: <<: *powercycle_test - client_options: --numCrudClients 10 --numFsmClients 10 --fsmWorkloadBlacklistFiles toggle_feature_compatibility.js + client_options: --numCrudClients 20 --numFsmClients 20 --fsmWorkloadBlacklistFiles toggle_feature_compatibility.js mongod_options: --mongodUsablePorts ${standard_port} ${secret_port} --dbPath ${db_path} --logPath ${log_path} --fcv 3.4 mongod_extra_options: --mongodOptions \"--setParameter enableTestCommands=1 --storageEngine wiredTiger\" +- name: powercycle_replication_WT + depends_on: + - name: compile + commands: + - func: "do setup" + - func: "set up remote credentials" + vars: + <<: *powercycle_remote_credentials + - func: "set up virtualenv" + - func: "set up EC2 instance" + vars: + <<: *powercycle_ec2_instance + - command: expansions.update + <<: *powercycle_expansions + - func: "run powercycle test" + vars: + <<: *powercycle_test + mongod_extra_options: --replSet powercyle --mongodOptions \"--setParameter enableTestCommands=1 --storageEngine wiredTiger\" + +- name: powercycle_replication_smalloplog_WT + depends_on: + - name: compile + commands: + - func: "do setup" + - func: "set up remote credentials" + 
vars: + <<: *powercycle_remote_credentials + - func: "set up virtualenv" + - func: "set up EC2 instance" + vars: + <<: *powercycle_ec2_instance + - command: expansions.update + <<: *powercycle_expansions + - func: "run powercycle test" + vars: + <<: *powercycle_test + mongod_extra_options: --replSet powercyle --mongodOptions \"--setParameter enableTestCommands=1 --oplogSize 1 --storageEngine wiredTiger\" + +- name: powercycle_syncdelay_WT + depends_on: + - name: compile + commands: + - func: "do setup" + - func: "set up remote credentials" + vars: + <<: *powercycle_remote_credentials + - func: "set up virtualenv" + - func: "set up EC2 instance" + vars: + <<: *powercycle_ec2_instance + - command: expansions.update + <<: *powercycle_expansions + - func: "run powercycle test" + vars: + <<: *powercycle_test + mongod_extra_options: --mongodOptions \"--setParameter enableTestCommands=1 --syncdelay 10 --storageEngine wiredTiger\" + - name: idl_tests depends_on: - name: compile @@ -5464,6 +5592,9 @@ buildvariants: - name: powercycle - name: powercycle_WT - name: powercycle_fcv3.4_WT + - name: powercycle_replication_WT + - name: powercycle_replication_smalloplog_WT + - name: powercycle_syncdelay_WT - name: package distros: - ubuntu1604-packer @@ -6000,6 +6131,8 @@ buildvariants: - name: snmp_WT - name: ssl - name: sslSpecial + - name: powercycle + - name: powercycle_WT - name: package distros: - ubuntu1604-packer diff --git a/pytests/powertest.py b/pytests/powertest.py index 94e9da2f840..92f145b4b0c 100755 --- a/pytests/powertest.py +++ b/pytests/powertest.py @@ -16,6 +16,8 @@ import importlib import logging import optparse import os +import pipes +import posixpath import random import re import shlex @@ -111,25 +113,58 @@ def exit_handler(): pass -def kill_processes(pids, kill_children=True): - """Kill a list of processes and optionally it's children.""" - for pid in pids: - LOGGER.debug("Killing process with pid %d", pid) +def child_processes(parent_pid): + """Returns a 
list of all child processes for a pid.""" + # The child processes cannot be obtained from the parent on Windows. See + # https://stackoverflow.com/questions/30220732/python-psutil-not-showing-all-child-processes + child_procs = [] + while psutil.pid_exists(parent_pid): try: - proc = psutil.Process(pid) + child_procs = [p for p in psutil.process_iter(attrs=["pid"]) if parent_pid == p.ppid()] + break except psutil.NoSuchProcess: - LOGGER.error("Could not kill process with pid %d, as it no longer exists", pid) - continue - if kill_children: - child_procs = proc.children(recursive=True) - child_pids = [] - for child in child_procs: - child_pids.append(child.pid) - kill_processes(child_pids, kill_children=False) + pass + for proc in child_procs: + proc_children = child_processes(proc.pid) + if proc_children: + child_procs += proc_children + return list(set(child_procs)) + + +def kill_process(pid, kill_children=True): + """Kill a process, and optionally it's children, by it's pid. Returns 0 if successful.""" + try: + parent = psutil.Process(pid) + except psutil.NoSuchProcess: + LOGGER.error("Could not kill process %d, as it no longer exists", pid) + return 0 + + procs = [] + if kill_children: + procs += child_processes(pid) + procs.append(parent) + + for proc in procs: try: + LOGGER.debug("Killing process %d", proc.pid) proc.kill() except psutil.NoSuchProcess: - LOGGER.error("Could not kill process with pid %d, as it no longer exists", pid) + LOGGER.error("Could not kill process %d, as it no longer exists", pid) + + _, alive = psutil.wait_procs(procs, timeout=30, callback=None) + if alive: + for proc in alive: + LOGGER.error("Process %d still alive!", proc.pid) + return 0 + + +def kill_processes(procs, kill_children=True): + """Kill a list of processes and optionally it's children.""" + for proc in procs: + LOGGER.debug("Killing parent process %d", proc.pid) + kill_process(proc.pid, kill_children=kill_children) + ret = proc.wait() + LOGGER.debug("Kill of parent process 
%d has return code of %d", proc.pid, ret) def get_extension(filename): @@ -166,14 +201,9 @@ def get_bin_dir(root_dir): return None -def executable_exists_in_path(executable): - """Returns True if 'executable' is in the PATH.""" - return distutils.spawn.find_executable(executable) is not None - - def create_temp_executable_file(cmds): """Creates an executable temporary file containing 'cmds'. Returns file name.""" - temp_file_name = NamedTempFile.create(suffix=".sh") + temp_file_name = NamedTempFile.create(suffix=".sh", dir="tmp") with NamedTempFile.get(temp_file_name) as temp_file: temp_file.write(cmds) os_st = os.stat(temp_file_name) @@ -182,7 +212,7 @@ def create_temp_executable_file(cmds): def start_cmd(cmd, use_file=False): - """Starts command and returns pid from Popen""" + """Starts command and returns proc instance from Popen""" orig_cmd = "" # Multi-commands need to be written to a temporary file to execute on Windows. @@ -202,9 +232,9 @@ def start_cmd(cmd, use_file=False): else: LOGGER.debug("Executing '%s'", cmd) - proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + proc = subprocess.Popen(cmd, close_fds=True) - return proc.pid + return proc def execute_cmd(cmd, use_file=False): @@ -483,11 +513,15 @@ class NamedTempFile(object): """Class to control temporary files.""" _FILE_MAP = {} + _DIR_LIST = [] @classmethod - def create(cls, suffix=""): - """Creates a temporary file and returns the file name.""" - temp_file = tempfile.NamedTemporaryFile(suffix=suffix, delete=False) + def create(cls, dir=None, suffix=""): + """Creates a temporary file, and optional directory, and returns the file name.""" + if dir and not os.path.isdir(dir): + os.makedirs(dir) + cls._DIR_LIST.append(dir) + temp_file = tempfile.NamedTemporaryFile(suffix=suffix, dir=dir, delete=False) cls._FILE_MAP[temp_file.name] = temp_file return temp_file.name @@ -503,14 +537,32 @@ class NamedTempFile(object): """Deletes temporary file. 
Raises an exception if the file is unknown.""" if name not in cls._FILE_MAP: raise Exception("Unknown temporary file {}.".format(name)) - os.remove(name) - del cls._FILE_MAP[name] + try: + os.remove(name) + except (IOError, OSError) as err: + LOGGER.warn("Unable to delete temporary file {} with error {}".format(name, err)) + if not os.path.exists(name): + del cls._FILE_MAP[name] + + @classmethod + def delete_dir(cls, dir): + """Deletes temporary directory. Raises an exception if the directory is unknown.""" + if dir not in cls._DIR_LIST: + raise Exception("Unknown temporary directory {}.".format(dir)) + try: + shutil.rmtree(dir) + except (IOError, OSError) as err: + LOGGER.warn("Unable to delete temporary directory {} with error {}".format(dir, err)) + if not os.path.exists(dir): + cls._DIR_LIST.remove(dir) @classmethod def delete_all(cls): - """Deletes all temporary files.""" + """Deletes all temporary files and directories.""" for name in list(cls._FILE_MAP): cls.delete(name) + for dir in cls._DIR_LIST: + cls.delete_dir(dir) class ProcessControl(object): @@ -1037,7 +1089,7 @@ def rsync(src_dir, dest_dir): """ Rsync 'src_dir' to 'dest_dir'. """ # Note rsync on Windows requires a Unix-style directory. LOGGER.info("Rsync'ing %s to %s", src_dir, dest_dir) - if not executable_exists_in_path("rsync"): + if not distutils.spawn.find_executable("rsync"): return 1, "No rsync exists on the host, not rsync'ing" cmds = "rsync -va --delete --quiet {} {}".format(src_dir, dest_dir) ret, output = execute_cmd(cmds) @@ -1077,7 +1129,8 @@ def internal_crash(use_sudo=False): def crash_server(options, crash_canary, canary_port, local_ops, script_name, client_args): - """ Crashes server and optionally writes canary doc before crash. """ + """ Crashes server and optionally writes canary doc before crash. + Returns tuple (ret, output). 
""" crash_wait_time = options.crash_wait_time + random.randint(0, options.crash_wait_time_jitter) LOGGER.info("Crashing server in %d seconds", crash_wait_time) @@ -1117,17 +1170,19 @@ def crash_server(options, crash_canary, canary_port, local_ops, script_name, cli ec2 = aws_ec2.AwsEc2() crash_func = ec2.control_instance instance_id, _ = get_aws_crash_options(options.crash_options) - crash_args = ["force-stop", instance_id, 60, True] + crash_args = ["force-stop", instance_id, 240, True] else: - LOGGER.error("Unsupported crash method '%s' provided", options.crash_method) - return + message = "Unsupported crash method '{}' provided".format(options.crash_method) + LOGGER.error("Unsupported crash method '%s' provided", message) + return 1, message # Invoke the crash canary function, right before crashing the server. if crash_canary and options.canary == "local": crash_canary["function"](*crash_canary["args"]) - _, output = crash_func(*crash_args) + ret, output = crash_func(*crash_args) LOGGER.info(output) + return ret, output def wait_for_mongod_shutdown(data_dir, timeout=120): @@ -1160,9 +1215,13 @@ def get_mongo_client_args(options): def mongo_shell(mongo_path, work_dir, host_port, mongo_cmds, retries=5, retry_sleep=5): """Starts mongo_path from work_dir, connecting to host_port and executes mongo_cmds.""" - cmds = """ + cmds = (""" cd {}; - echo '{}' | {} {}""".format(work_dir, mongo_cmds, mongo_path, host_port) + echo {} | {} {}""".format( + pipes.quote(work_dir), + pipes.quote(mongo_cmds), + pipes.quote(mongo_path), + host_port)) attempt_num = 0 while True: ret, output = execute_cmd(cmds, use_file=True) @@ -1207,9 +1266,17 @@ def mongo_reconfig_replication(mongo, host_port, repl_set): rs_config = {"_id": repl_set, "members": [{"_id": 0, "host": host_port}]} ret = mongo.admin.command("replSetInitiate", rs_config) else: - ret = mongo.admin.command("replSetGetConfig") - if ret["ok"] != 1: - return 1 + # Wait until replication is initialized. 
+ while True: + try: + ret = mongo.admin.command("replSetGetConfig") + if ret["ok"] != 1: + return 1 + break + except pymongo.errors.OperationFailure as err: + # src/mongo/base/error_codes.err: error_code("NotYetInitialized", 94) + if err.code != 94: + return 1 rs_config = ret["config"] # We only reconfig if there is a change to 'host'. if rs_config["members"][0]["host"] != host_port: @@ -1311,31 +1378,38 @@ def resmoke_client(work_dir, host_port, js_test, resmoke_suite, + repeat_num=1, no_wait=False, log_file=None): """Starts resmoke client from work_dir, connecting to host_port and executes js_test.""" - log_output = "2>& 1 | tee -a {}".format(log_file) if log_file else "" - cmds = ("cd {} ; " - "python buildscripts/resmoke.py " - "--mongo {} " - "--suites {} " - "--shellConnString mongodb://{} " - "--continueOnFailure " - "{} " - "{}".format( - work_dir, mongo_path, resmoke_suite, host_port, js_test, log_output)) - ret, output, pid = None, None, None + log_output = ">> {} 2>&1".format(log_file) if log_file else "" + cmds = ("cd {}; " + "python buildscripts/resmoke.py" + " --mongo {}" + " --suites {}" + " --shellConnString mongodb://{}" + " --continueOnFailure" + " --repeat {}" + " {}" + " {}".format( + pipes.quote(work_dir), + pipes.quote(mongo_path), + pipes.quote(resmoke_suite), + host_port, + repeat_num, + pipes.quote(js_test), + log_output)) + ret, output, proc = None, None, None if no_wait: - pid = start_cmd(cmds, use_file=True) + proc = start_cmd(cmds, use_file=True) else: ret, output = execute_cmd(cmds, use_file=True) - return ret, output, pid + return ret, output, proc def main(): """ Main program. 
""" - atexit.register(exit_handler) parser = optparse.OptionParser(usage=""" @@ -1590,8 +1664,6 @@ Examples: # Client options mongo_path = distutils.spawn.find_executable( "mongo", os.getcwd() + os.pathsep + os.environ["PATH"]) - if mongo_path: - mongo_path = os.path.abspath(mongo_path) client_options.add_option("--mongoPath", dest="mongo_path", help="Path to mongo (shell) executable, if unspecifed, mongo client" @@ -1610,13 +1682,14 @@ Examples: " [default: '%default'].", default="jstests/hooks/crud_client.js") + with_external_server = "buildscripts/resmokeconfig/suites/with_external_server.yml" client_options.add_option("--configCrudClient", dest="config_crud_client", help="The path to the CRUD client configuration YML file on the" - " local host. This is the resmoke.py suite file. If" - " unspecified, a default configuration will be used that" + " local host. This is the resmoke.py suite file. If unspecified," + " a default configuration YML file (%default) will be used that" " provides a mongo (shell) DB connection to a running mongod.", - default=None) + default=with_external_server) client_options.add_option("--numCrudClients", dest="num_crud_clients", @@ -1716,7 +1789,9 @@ Examples: options, args = parser.parse_args() logging.basicConfig(format="%(asctime)s %(levelname)s %(message)s", - level=options.log_level.upper(), filename=options.log_file) + level=logging.ERROR, + filename=options.log_file) + logging.getLogger(__name__).setLevel(options.log_level.upper()) logging.Formatter.converter = time.gmtime LOGGER.info("powertest.py invocation: %s", " ".join(sys.argv)) @@ -1743,12 +1818,13 @@ Examples: options.crash_options = "{}:{}".format(instance_id, address_type) # Initialize the mongod options + # Note - We use posixpath for Windows client to Linux server scenarios. 
if not options.root_dir: options.root_dir = "mongodb-powertest-{}".format(int(time.time())) if not options.db_path: - options.db_path = os.path.join(options.root_dir, "data", "db") + options.db_path = posixpath.join(options.root_dir, "data", "db") if not options.log_path: - options.log_path = os.path.join(options.root_dir, "log", "mongod.log") + options.log_path = posixpath.join(options.root_dir, "log", "mongod.log") mongod_options_map = parse_options(options.mongod_options) set_fcv_cmd = "set_fcv" if options.fcv_version is not None else "" @@ -1782,16 +1858,18 @@ Examples: if (options.num_crud_clients > 0 or options.num_fsm_clients > 0 or options.validate_collections == "local"): - if options.mongo_path is None: + if not options.mongo_path: LOGGER.error("mongoPath must be specified") sys.exit(1) - elif not os.path.isfile(options.mongo_path): + if not os.path.isfile(options.mongo_path): LOGGER.error("mongoPath %s does not exist", options.mongo_path) sys.exit(1) + mongo_path = os.path.abspath(os.path.normpath(options.mongo_path)) # Setup the CRUD & FSM clients. - with_external_server = "buildscripts/resmokeconfig/suites/with_external_server.yml" - config_crud_client = options.config_crud_client + if not os.path.isfile(options.config_crud_client): + LOGGER.error("configCrudClient %s does not exist", options.config_crud_client) + sys.exit(1) fsm_client = "jstests/libs/fsm_serial_client.js" fsm_workload_files = [] for fsm_workload_file in options.fsm_workload_files: @@ -1912,6 +1990,8 @@ Examples: loop_num += 1 LOGGER.info("****Starting test loop %d test time %d seconds****", loop_num, test_time) + temp_client_files = [] + if options.canary and loop_num > 1: canary_opt = "--docForCanary \"{}\"".format(canary_doc) validate_canary_cmd = "validate_canary" if options.canary else "" @@ -1962,16 +2042,16 @@ Examples: # Optionally, run local validation of collections. 
if options.validate_collections == "local": host_port = "{}:{}".format(mongod_host, secret_port) - new_config_file = NamedTempFile.create(suffix=".yml") + new_config_file = NamedTempFile.create(suffix=".yml", dir="tmp") + temp_client_files.append(new_config_file) test_data = {"skipValidationOnNamespaceNotFound": True} new_resmoke_config(with_external_server, new_config_file, test_data) ret, output, _ = resmoke_client( mongo_repo_root_dir, - options.mongo_path, + mongo_path, host_port, "jstests/hooks/run_validate_collections.js", new_config_file) - NamedTempFile.delete(new_config_file) LOGGER.info("Collection validation: %d %s", ret, output) if ret: sys.exit(ret) @@ -2022,7 +2102,7 @@ Examples: sys.exit(ret) # Start CRUD clients - crud_pids = [] + crud_procs = [] if options.num_crud_clients > 0: host_port = "{}:{}".format(mongod_host, standard_port) test_data = {"dbName": options.db_name} @@ -2032,24 +2112,25 @@ Examples: test_data["writeConcern"] = write_concern for i in xrange(options.num_crud_clients): - crud_config_file = NamedTempFile.create(suffix=".yml") + crud_config_file = NamedTempFile.create(suffix=".yml", dir="tmp") + temp_client_files.append(crud_config_file) test_data["collectionName"] = "{}-{}".format(options.collection_name, i) - new_resmoke_config(with_external_server, crud_config_file, test_data) - _, _, pid = resmoke_client( + new_resmoke_config(options.config_crud_client, crud_config_file, test_data) + _, _, proc = resmoke_client( work_dir=mongo_repo_root_dir, - mongo_path=options.mongo_path, + mongo_path=mongo_path, host_port=host_port, js_test=options.crud_client, resmoke_suite=crud_config_file, + repeat_num=100, no_wait=True, log_file="crud_{}.log".format(i)) - crud_pids.append(pid) + crud_procs.append(proc) - LOGGER.info( - "****Started %d CRUD client(s) %s****", options.num_crud_clients, crud_pids) + LOGGER.info("****Started %d CRUD client(s)****", options.num_crud_clients) # Start FSM clients - fsm_pids = [] + fsm_procs = [] if 
options.num_fsm_clients > 0: test_data = {"fsmDbBlacklist": [options.db_name]} if fsm_workload_files: @@ -2058,22 +2139,24 @@ Examples: test_data["workloadBlacklistFiles"] = fsm_workload_blacklist_files for i in xrange(options.num_fsm_clients): - fsm_config_file = NamedTempFile.create(suffix=".yml") + fsm_config_file = NamedTempFile.create(suffix=".yml", dir="tmp") + temp_client_files.append(fsm_config_file) test_data["dbNamePrefix"] = "fsm-{}".format(i) # Do collection validation only for the first FSM client. test_data["validateCollections"] = True if i == 0 else False new_resmoke_config(with_external_server, fsm_config_file, test_data) - _, _, pid = resmoke_client( + _, _, proc = resmoke_client( work_dir=mongo_repo_root_dir, - mongo_path=options.mongo_path, + mongo_path=mongo_path, host_port=host_port, js_test=fsm_client, resmoke_suite=fsm_config_file, + repeat_num=100, no_wait=True, log_file="fsm_{}.log".format(i)) - fsm_pids.append(pid) + fsm_procs.append(proc) - LOGGER.info("****Started %d FSM client(s) %s****", options.num_fsm_clients, fsm_pids) + LOGGER.info("****Started %d FSM client(s)****", options.num_fsm_clients) # Crash the server. A pre-crash canary document is optionally written to the DB. crash_canary = {} @@ -2088,17 +2171,28 @@ Examples: options.db_name, options.collection_name, canary_doc] - crash_server(options, crash_canary, standard_port, local_ops, script_name, client_args) + ret, output = crash_server( + options, crash_canary, standard_port, local_ops, script_name, client_args) + # For internal crashes 'ret' is non-zero, because the ssh session unexpectedly terminates. + if options.crash_method != "internal" and ret: + raise Exception("Crash of server failed: {}", format(output)) # Wait a bit after sending command to crash the server to avoid connecting to the # server before the actual crash occurs. time.sleep(10) + # Kill any running clients and cleanup temporary files. 
+ kill_processes(crud_procs + fsm_procs) + for temp_file in temp_client_files: + NamedTempFile.delete(temp_file) + # The EC2 instance address changes if the crash_method is 'aws_ec2'. if options.crash_method == "aws_ec2": ec2 = aws_ec2.AwsEc2() ret, aws_status = ec2.control_instance( - mode="start", image_id=instance_id, wait_time_secs=60, show_progress=True) + mode="start", image_id=instance_id, wait_time_secs=240, show_progress=True) LOGGER.info("Start instance: %d %s****", ret, aws_status) + if ret: + raise Exception("Start instance failed: {}".format(aws_status)) if not hasattr(aws_status, address_type): raise Exception("Cannot determine address_type {} from AWS EC2 status {}".format( address_type, aws_status)) @@ -2116,8 +2210,6 @@ Examples: canary_doc = copy.deepcopy(orig_canary_doc) - kill_processes(crud_pids + fsm_pids) - test_time = int(time.time()) - start_time LOGGER.info("****Completed test loop %d test time %d seconds****", loop_num, test_time) if loop_num == options.num_loops or test_time >= options.test_time: |