diff options
-rwxr-xr-x | buildscripts/aws_ec2.py | 113 | ||||
-rw-r--r-- | etc/evergreen.yml | 31 | ||||
-rwxr-xr-x | pytests/powertest.py | 84 |
3 files changed, 175 insertions, 53 deletions
diff --git a/buildscripts/aws_ec2.py b/buildscripts/aws_ec2.py index 788d5de65b8..2d643e045c1 100755 --- a/buildscripts/aws_ec2.py +++ b/buildscripts/aws_ec2.py @@ -20,9 +20,17 @@ _MODES = ("status", "create", "start", "stop", "force-stop", "reboot", "terminat class AwsEc2(object): """Class to support controlling AWS EC2 istances.""" - InstanceStatus = collections.namedtuple( - "InstanceStatus", - "instance_id image_id instance_type state private_ip_address public_ip_address tags") + InstanceStatus = collections.namedtuple("InstanceStatus", [ + "instance_id", + "image_id", + "instance_type", + "state", + "private_ip_address", + "public_ip_address", + "private_dns_name", + "public_dns_name", + "tags" + ]) def __init__(self): try: @@ -33,7 +41,29 @@ class AwsEc2(object): " for the variable names, file names and precedence order.") raise - def control_instance(self, mode, image_id): + @staticmethod + def wait_for_state(instance, state, wait_time_secs=0, show_progress=False): + """Wait up to 'wait_time_secs' for instance to be in 'state'. + Return True if 'state' reached.""" + end_time = time.time() + wait_time_secs + if show_progress: + print("Waiting for instance {} to reach '{}' state".format(instance, state), + end="", + file=sys.stdout) + while time.time() < end_time: + if show_progress: + print(".", end="", file=sys.stdout) + sys.stdout.flush() + time.sleep(5) + instance.load() + if instance.state["Name"] == state: + if show_progress: + print(" Instance {}!".format(state), file=sys.stdout) + sys.stdout.flush() + return True + return False + + def control_instance(self, mode, image_id, wait_time_secs=0, show_progress=False): """Controls an AMI instance. Returns 0 & status information, if successful.""" if mode not in _MODES: raise ValueError( @@ -42,18 +72,32 @@ class AwsEc2(object): instance = self.connection.Instance(image_id) try: if mode == "start": + state = "running" instance.start() elif mode == "stop": + state = "stopped" instance.stop() elif mode == "force-stop": + state = "stopped" instance.stop(Force=True) elif mode == "terminate": + state = "terminated" instance.terminate() elif mode == "reboot": + state = "running" instance.reboot() - except botocore.exceptions.ClientError as e: - return 1, e.message - + else: + state = None + wait_time_secs = 0 + except botocore.exceptions.ClientError as err: + return 1, err.message + + if wait_time_secs > 0: + self.wait_for_state( + instance=instance, + state=state, + wait_time_secs=wait_time_secs, + show_progress=show_progress) try: # Always provide status after executing command. status = self.InstanceStatus( @@ -63,9 +107,11 @@ class AwsEc2(object): getattr(instance, "state", None), getattr(instance, "private_ip_address", None), getattr(instance, "public_ip_address", None), + getattr(instance, "private_dns_name", None), + getattr(instance, "public_dns_name", None), getattr(instance, "tags", None)) - except botocore.exceptions.ClientError as e: - return 1, e.message + except botocore.exceptions.ClientError as err: + return 1, err.message return 0, status @@ -80,8 +126,8 @@ class AwsEc2(object): try: instance = self.connection.Instance(image_id) break - except botocore.exceptions.ClientError as e: - if e.response["Error"]["Code"] != "InvalidInstanceID.NotFound": + except botocore.exceptions.ClientError as err: + if err.response["Error"]["Code"] != "InvalidInstanceID.NotFound": raise time.sleep(i + 1) instance.create_tags(Tags=tags) @@ -122,27 +168,16 @@ class AwsEc2(object): MaxCount=1, MinCount=1, **kwargs) - except (botocore.exceptions.ClientError, botocore.exceptions.ParamValidationError) as e: - return 1, e.message + except (botocore.exceptions.ClientError, botocore.exceptions.ParamValidationError) as err: + return 1, err.message instance = instances[0] - - if wait_time_secs: - # Wait up to 'wait_time_secs' for instance to be 'running'. - end_time = time.time() + wait_time_secs - if show_progress: - print("Waiting for instance {} ".format(instance), end="", file=sys.stdout) - while time.time() < end_time: - if show_progress: - print(".", end="", file=sys.stdout) - sys.stdout.flush() - time.sleep(5) - instance.load() - if instance.state["Name"] == "running": - if show_progress: - print(" Instance running!", file=sys.stdout) - sys.stdout.flush() - break + if wait_time_secs > 0: + self.wait_for_state( + instance=instance, + state="running", + wait_time_secs=wait_time_secs, + show_progress=show_progress) self.tag_instance(instance.instance_id, tags) @@ -150,6 +185,7 @@ class AwsEc2(object): def main(): + """Main program.""" required_create_options = ["ami", "key_name"] @@ -169,6 +205,13 @@ def main(): default=None, help="EC2 image_id to perform operation on [REQUIRED for control].") + control_options.add_option("--waitTimeSecs", + dest="wait_time_secs", + type=int, + default=60, + help="Time to wait for EC2 instance to reach it's new state," + " defaults to '%default'.") + create_options.add_option("--ami", dest="ami", default=None, @@ -229,7 +272,7 @@ def main(): parser.add_option_group(control_options) parser.add_option_group(create_options) - (options, args) = parser.parse_args() + (options, _) = parser.parse_args() aws_ec2 = AwsEc2() @@ -265,7 +308,7 @@ def main(): key_name=options.key_name, security_groups=options.security_groups, tags=tags, - wait_time_secs=60, + wait_time_secs=options.wait_time_secs, show_progress=True, **my_kwargs) else: @@ -273,7 +316,11 @@ def main(): parser.print_help() parser.error("Missing required control option") - (ret_code, instance_status) = aws_ec2.control_instance(options.mode, options.image_id) + (ret_code, instance_status) = aws_ec2.control_instance( + mode=options.mode, + image_id=options.image_id, + wait_time_secs=options.wait_time_secs, + show_progress=True) print("Return code: {}, Instance status:".format(ret_code)) if ret_code: diff --git a/etc/evergreen.yml b/etc/evergreen.yml index 0bd6879d3b6..e8275f1ff46 100644 --- a/etc/evergreen.yml +++ b/etc/evergreen.yml @@ -176,11 +176,11 @@ variables: - &powercycle_test ec2_artifacts: ${log_path} ${backup_path_after} ${backup_path_before} - program_options: --logLevel debug --backupPathBefore ${backup_path_before} --backupPathAfter ${backup_path_after} + program_options: --logLevel info --backupPathBefore ${backup_path_before} --backupPathAfter ${backup_path_after} connection_options: --sshUserHost ${ip_address} --sshConnection \"${ssh_identity} ${ssh_connection_options}\" - test_options: --testLoops 25 --seedDocNum 10000 --rsync --validate local --canary remote - crash_options: --crashMethod internal --crashWaitTime 30 --jitterForCrashWaitTime 5 - client_options: --numCrudClients 5 --numFsmClients 5 + test_options: --testLoops 25 --seedDocNum 10000 --rsync --validate local --canary local + crash_options: "--crashMethod aws_ec2 --crashOptions ${instance_id}:private_ip_address --crashWaitTime 30 --jitterForCrashWaitTime 5" + client_options: --numCrudClients 10 --numFsmClients 10 mongodb_options: --rootDir ${remote_dir}-${task_id} --mongodbBinDir ${remote_dir} mongod_options: --mongodUsablePorts ${standard_port} ${secret_port} --dbPath ${db_path} --logPath ${log_path} mongod_extra_options: --mongodOptions \"--setParameter enableTestCommands=1\" @@ -1257,6 +1257,25 @@ functions: shell: bash working_dir: src script: | + if [ -z "${ec2_artifacts}" ]; then + exit 0 + fi + # Ensure we use the latest ip_address, as it could change if the EC2 instance + # has been stopped and started. + ${activate_virtualenv} + aws_ec2=$(python buildscripts/aws_ec2.py --imageId ${instance_id} --mode status) + ip_address=$(echo $aws_ec2 | sed -e "s/.*private_ip_address: //; s/ .*//") + sed -i -e "s/${ip_address}/$ip_address/" ${aws_ec2_yml} + + - command: expansions.update + params: + file: src/${aws_ec2_yml} + + - command: shell.exec + params: + shell: bash + working_dir: src + script: | # Tar/zip artifacts on remote host. if [ -z "${ec2_artifacts}" ]; then exit 0 @@ -4068,11 +4087,11 @@ tasks: - command: expansions.update <<: *powercycle_expansions - func: "run powercycle test" - # Disable the CRUD & FSM clients for mmapv1. + # Disable the FSM clients for mmapv1. # mongod will not start if it crashed mongod while creating a namespace (SERVER-26499). vars: <<: *powercycle_test - client_options: --numCrudClients 0 --numFsmClients 0 + client_options: --numCrudClients 10 --numFsmClients 0 mongod_extra_options: --mongodOptions \"--setParameter enableTestCommands=1 --storageEngine mmapv1\" - name: powercycle_WT diff --git a/pytests/powertest.py b/pytests/powertest.py index be18ebeabe2..11d642b0eb7 100755 --- a/pytests/powertest.py +++ b/pytests/powertest.py @@ -241,6 +241,20 @@ def execute_cmd(cmd, use_file=False): return error_code, output +def get_aws_crash_options(option): + """ Returns a tuple (instance_id, address_type) of the AWS crash option. """ + if ":" in option: + return tuple(option.split(":")) + return option, None + + +def get_user_host(user_host): + """ Returns a tuple (user, host) from the user_host string. """ + if "@" in user_host: + return tuple(user_host.split("@")) + return None, user_host + + def parse_options(options): """ Parses options and returns a dict. @@ -1060,10 +1074,9 @@ def crash_server(options, crash_canary, canary_port, local_ops, script_name, cli LOGGER.info("Crashing server in %d seconds", crash_wait_time) time.sleep(crash_wait_time) - crash_func = local_ops.shell - if options.crash_method == "mpower": # Provide time for power to dissipate by sleeping 10 seconds before turning it back on. + crash_func = local_ops.shell crash_args = [""" echo 0 > /dev/{crash_options} ; sleep 10 ; @@ -1072,7 +1085,6 @@ def crash_server(options, crash_canary, canary_port, local_ops, script_name, cli user_host=options.ssh_crash_user_host, ssh_connection_options=options.ssh_crash_options, shell_binary="/bin/sh") - crash_func = local_ops.shell elif options.crash_method == "internal": if options.canary == "remote": @@ -1084,6 +1096,7 @@ def crash_server(options, crash_canary, canary_port, local_ops, script_name, cli else: canary = "" canary_cmd = "" + crash_func = local_ops.shell crash_args = ["{} {} --remoteOperation {} {} {} crash_server".format( options.remote_python, script_name, @@ -1094,11 +1107,12 @@ def crash_server(options, crash_canary, canary_port, local_ops, script_name, cli elif options.crash_method == "aws_ec2": ec2 = aws_ec2.AwsEc2() crash_func = ec2.control_instance - crash_args = ["force-stop", options.crash_options] + instance_id, _ = get_aws_crash_options(options.crash_options) + crash_args = ["force-stop", instance_id, 60, True] else: LOGGER.error("Unsupported crash method '%s' provided", options.crash_method) - return False + return # Invoke the crash canary function, right before crashing the server. if crash_canary and options.canary == "local": @@ -1445,6 +1459,18 @@ Examples: help="Crash methods: {} [default: '%default']".format(crash_methods), default="internal") + aws_address_types = [ + "private_ip_address", "public_ip_address", "private_dns_name", "public_dns_name"] + crash_options.add_option("--crashOptions", + dest="crash_options", + help="Secondary argument (REQUIRED) for the following --crashMethod:" + " 'aws_ec2': specify EC2 'instance_id[:address_type]'." + " The address_type is one of {} and defaults to" + " 'public_ip_address'." + " 'mpower': specify output<num> to turn off/on, i.e.," + " 'output1'.".format(aws_address_types), + default=None) + crash_options.add_option("--crashWaitTime", dest="crash_wait_time", help="Time, in seconds, to wait before issuing crash [default:" @@ -1470,14 +1496,6 @@ Examples: help="The crash host's ssh connection options, i.e., '-i ident.pem'", default=None) - crash_options.add_option("--crashOptions", - dest="crash_options", - help="Secondary argument for the following --crashMethod:" - " 'aws_ec2': specify EC2 instance_id." - " 'mpower': specify output<num> to turn off/on, i.e.," - " 'output1'.", - default=None) - # MongoDB options mongodb_options.add_option("--downloadUrl", dest="tarball_url", @@ -1686,6 +1704,21 @@ Examples: print("{}:{}".format(script_name, __version__)) sys.exit(0) + # Setup the crash options + if ((options.crash_method == "aws_ec2" or options.crash_method == "mpower") and + options.crash_options is None): + parser.error("Missing required argument --crashOptions for crashMethod '{}'".format( + options.crash_method)) + + if options.crash_method == "aws_ec2": + instance_id, address_type = get_aws_crash_options(options.crash_options) + address_type = address_type if address_type is not None else "public_ip_address" + if address_type not in aws_address_types: + LOGGER.error("Invalid crashOptions address_type '%s' specified for crashMethod" + " 'aws_ec2', specify one of %s", address_type, aws_address_types) + sys.exit(1) + options.crash_options = "{}:{}".format(instance_id, address_type) + # Initialize the mongod options if not options.root_dir: options.root_dir = "mongodb-powertest-{}".format(int(time.time())) @@ -1768,7 +1801,8 @@ Examples: # The remote mongod host comes from the ssh_user_host, # which may be specified as user@host. - mongod_host = options.ssh_user_host.rsplit()[-1].rsplit("@")[-1] + ssh_user, ssh_host = get_user_host(options.ssh_user_host) + mongod_host = ssh_host ssh_connection_options = "{} {}".format( default_ssh_connection_options, @@ -2027,6 +2061,28 @@ Examples: # Wait a bit after sending command to crash the server to avoid connecting to the # server before the actual crash occurs. time.sleep(10) + + # The EC2 instance address changes if the crash_method is 'aws_ec2'. + if options.crash_method == "aws_ec2": + ec2 = aws_ec2.AwsEc2() + ret, aws_status = ec2.control_instance( + mode="start", image_id=instance_id, wait_time_secs=60, show_progress=True) + LOGGER.info("Start instance: %d %s****", ret, aws_status) + if not hasattr(aws_status, address_type): + raise Exception("Cannot determine address_type {} from AWS EC2 status {}".format( + address_type, aws_status)) + ssh_host = getattr(aws_status, address_type) + if ssh_user is None: + ssh_user_host = ssh_host + else: + ssh_user_host = "{}@{}".format(ssh_user, ssh_host) + mongod_host = ssh_host + local_ops = LocalToRemoteOperations( + user_host=ssh_user_host, + ssh_connection_options=ssh_connection_options, + ssh_options=ssh_options, + use_shell=True) + canary_doc = copy.deepcopy(orig_canary_doc) kill_processes(crud_pids + fsm_pids) |