summary refs log tree commit diff
diff options
context:
space:
mode:
-rwxr-xr-x buildscripts/aws_ec2.py 113
-rw-r--r-- etc/evergreen.yml 31
-rwxr-xr-x pytests/powertest.py 84
3 files changed, 175 insertions, 53 deletions
diff --git a/buildscripts/aws_ec2.py b/buildscripts/aws_ec2.py
index 788d5de65b8..2d643e045c1 100755
--- a/buildscripts/aws_ec2.py
+++ b/buildscripts/aws_ec2.py
@@ -20,9 +20,17 @@ _MODES = ("status", "create", "start", "stop", "force-stop", "reboot", "terminat
class AwsEc2(object):
"""Class to support controlling AWS EC2 instances."""
- InstanceStatus = collections.namedtuple(
- "InstanceStatus",
- "instance_id image_id instance_type state private_ip_address public_ip_address tags")
+ InstanceStatus = collections.namedtuple("InstanceStatus", [
+ "instance_id",
+ "image_id",
+ "instance_type",
+ "state",
+ "private_ip_address",
+ "public_ip_address",
+ "private_dns_name",
+ "public_dns_name",
+ "tags"
+ ])
def __init__(self):
try:
@@ -33,7 +41,29 @@ class AwsEc2(object):
" for the variable names, file names and precedence order.")
raise
- def control_instance(self, mode, image_id):
+ @staticmethod
+ def wait_for_state(instance, state, wait_time_secs=0, show_progress=False):
+ """Wait up to 'wait_time_secs' for instance to be in 'state'.
+ Return True if 'state' reached."""
+ end_time = time.time() + wait_time_secs
+ if show_progress:
+ print("Waiting for instance {} to reach '{}' state".format(instance, state),
+ end="",
+ file=sys.stdout)
+ while time.time() < end_time:
+ if show_progress:
+ print(".", end="", file=sys.stdout)
+ sys.stdout.flush()
+ time.sleep(5)
+ instance.load()
+ if instance.state["Name"] == state:
+ if show_progress:
+ print(" Instance {}!".format(state), file=sys.stdout)
+ sys.stdout.flush()
+ return True
+ return False
+
+ def control_instance(self, mode, image_id, wait_time_secs=0, show_progress=False):
"""Controls an AMI instance. Returns 0 & status information, if successful."""
if mode not in _MODES:
raise ValueError(
@@ -42,18 +72,32 @@ class AwsEc2(object):
instance = self.connection.Instance(image_id)
try:
if mode == "start":
+ state = "running"
instance.start()
elif mode == "stop":
+ state = "stopped"
instance.stop()
elif mode == "force-stop":
+ state = "stopped"
instance.stop(Force=True)
elif mode == "terminate":
+ state = "terminated"
instance.terminate()
elif mode == "reboot":
+ state = "running"
instance.reboot()
- except botocore.exceptions.ClientError as e:
- return 1, e.message
-
+ else:
+ state = None
+ wait_time_secs = 0
+ except botocore.exceptions.ClientError as err:
+ return 1, err.message
+
+ if wait_time_secs > 0:
+ self.wait_for_state(
+ instance=instance,
+ state=state,
+ wait_time_secs=wait_time_secs,
+ show_progress=show_progress)
try:
# Always provide status after executing command.
status = self.InstanceStatus(
@@ -63,9 +107,11 @@ class AwsEc2(object):
getattr(instance, "state", None),
getattr(instance, "private_ip_address", None),
getattr(instance, "public_ip_address", None),
+ getattr(instance, "private_dns_name", None),
+ getattr(instance, "public_dns_name", None),
getattr(instance, "tags", None))
- except botocore.exceptions.ClientError as e:
- return 1, e.message
+ except botocore.exceptions.ClientError as err:
+ return 1, err.message
return 0, status
@@ -80,8 +126,8 @@ class AwsEc2(object):
try:
instance = self.connection.Instance(image_id)
break
- except botocore.exceptions.ClientError as e:
- if e.response["Error"]["Code"] != "InvalidInstanceID.NotFound":
+ except botocore.exceptions.ClientError as err:
+ if err.response["Error"]["Code"] != "InvalidInstanceID.NotFound":
raise
time.sleep(i + 1)
instance.create_tags(Tags=tags)
@@ -122,27 +168,16 @@ class AwsEc2(object):
MaxCount=1,
MinCount=1,
**kwargs)
- except (botocore.exceptions.ClientError, botocore.exceptions.ParamValidationError) as e:
- return 1, e.message
+ except (botocore.exceptions.ClientError, botocore.exceptions.ParamValidationError) as err:
+ return 1, err.message
instance = instances[0]
-
- if wait_time_secs:
- # Wait up to 'wait_time_secs' for instance to be 'running'.
- end_time = time.time() + wait_time_secs
- if show_progress:
- print("Waiting for instance {} ".format(instance), end="", file=sys.stdout)
- while time.time() < end_time:
- if show_progress:
- print(".", end="", file=sys.stdout)
- sys.stdout.flush()
- time.sleep(5)
- instance.load()
- if instance.state["Name"] == "running":
- if show_progress:
- print(" Instance running!", file=sys.stdout)
- sys.stdout.flush()
- break
+ if wait_time_secs > 0:
+ self.wait_for_state(
+ instance=instance,
+ state="running",
+ wait_time_secs=wait_time_secs,
+ show_progress=show_progress)
self.tag_instance(instance.instance_id, tags)
@@ -150,6 +185,7 @@ class AwsEc2(object):
def main():
+ """Main program."""
required_create_options = ["ami", "key_name"]
@@ -169,6 +205,13 @@ def main():
default=None,
help="EC2 image_id to perform operation on [REQUIRED for control].")
+ control_options.add_option("--waitTimeSecs",
+ dest="wait_time_secs",
+ type=int,
+ default=60,
+ help="Time to wait for EC2 instance to reach it's new state,"
+ " defaults to '%default'.")
+
create_options.add_option("--ami",
dest="ami",
default=None,
@@ -229,7 +272,7 @@ def main():
parser.add_option_group(control_options)
parser.add_option_group(create_options)
- (options, args) = parser.parse_args()
+ (options, _) = parser.parse_args()
aws_ec2 = AwsEc2()
@@ -265,7 +308,7 @@ def main():
key_name=options.key_name,
security_groups=options.security_groups,
tags=tags,
- wait_time_secs=60,
+ wait_time_secs=options.wait_time_secs,
show_progress=True,
**my_kwargs)
else:
@@ -273,7 +316,11 @@ def main():
parser.print_help()
parser.error("Missing required control option")
- (ret_code, instance_status) = aws_ec2.control_instance(options.mode, options.image_id)
+ (ret_code, instance_status) = aws_ec2.control_instance(
+ mode=options.mode,
+ image_id=options.image_id,
+ wait_time_secs=options.wait_time_secs,
+ show_progress=True)
print("Return code: {}, Instance status:".format(ret_code))
if ret_code:
diff --git a/etc/evergreen.yml b/etc/evergreen.yml
index 0bd6879d3b6..e8275f1ff46 100644
--- a/etc/evergreen.yml
+++ b/etc/evergreen.yml
@@ -176,11 +176,11 @@ variables:
- &powercycle_test
ec2_artifacts: ${log_path} ${backup_path_after} ${backup_path_before}
- program_options: --logLevel debug --backupPathBefore ${backup_path_before} --backupPathAfter ${backup_path_after}
+ program_options: --logLevel info --backupPathBefore ${backup_path_before} --backupPathAfter ${backup_path_after}
connection_options: --sshUserHost ${ip_address} --sshConnection \"${ssh_identity} ${ssh_connection_options}\"
- test_options: --testLoops 25 --seedDocNum 10000 --rsync --validate local --canary remote
- crash_options: --crashMethod internal --crashWaitTime 30 --jitterForCrashWaitTime 5
- client_options: --numCrudClients 5 --numFsmClients 5
+ test_options: --testLoops 25 --seedDocNum 10000 --rsync --validate local --canary local
+ crash_options: "--crashMethod aws_ec2 --crashOptions ${instance_id}:private_ip_address --crashWaitTime 30 --jitterForCrashWaitTime 5"
+ client_options: --numCrudClients 10 --numFsmClients 10
mongodb_options: --rootDir ${remote_dir}-${task_id} --mongodbBinDir ${remote_dir}
mongod_options: --mongodUsablePorts ${standard_port} ${secret_port} --dbPath ${db_path} --logPath ${log_path}
mongod_extra_options: --mongodOptions \"--setParameter enableTestCommands=1\"
@@ -1257,6 +1257,25 @@ functions:
shell: bash
working_dir: src
script: |
+ if [ -z "${ec2_artifacts}" ]; then
+ exit 0
+ fi
+ # Ensure we use the latest ip_address, as it could change if the EC2 instance
+ # has been stopped and started.
+ ${activate_virtualenv}
+ aws_ec2=$(python buildscripts/aws_ec2.py --imageId ${instance_id} --mode status)
+ ip_address=$(echo $aws_ec2 | sed -e "s/.*private_ip_address: //; s/ .*//")
+ sed -i -e "s/${ip_address}/$ip_address/" ${aws_ec2_yml}
+
+ - command: expansions.update
+ params:
+ file: src/${aws_ec2_yml}
+
+ - command: shell.exec
+ params:
+ shell: bash
+ working_dir: src
+ script: |
# Tar/zip artifacts on remote host.
if [ -z "${ec2_artifacts}" ]; then
exit 0
@@ -4068,11 +4087,11 @@ tasks:
- command: expansions.update
<<: *powercycle_expansions
- func: "run powercycle test"
- # Disable the CRUD & FSM clients for mmapv1.
+ # Disable the FSM clients for mmapv1.
# mongod will not start if it crashed mongod while creating a namespace (SERVER-26499).
vars:
<<: *powercycle_test
- client_options: --numCrudClients 0 --numFsmClients 0
+ client_options: --numCrudClients 10 --numFsmClients 0
mongod_extra_options: --mongodOptions \"--setParameter enableTestCommands=1 --storageEngine mmapv1\"
- name: powercycle_WT
diff --git a/pytests/powertest.py b/pytests/powertest.py
index be18ebeabe2..11d642b0eb7 100755
--- a/pytests/powertest.py
+++ b/pytests/powertest.py
@@ -241,6 +241,20 @@ def execute_cmd(cmd, use_file=False):
return error_code, output
+def get_aws_crash_options(option):
+ """ Returns a tuple (instance_id, address_type) of the AWS crash option. """
+ if ":" in option:
+ return tuple(option.split(":"))
+ return option, None
+
+
+def get_user_host(user_host):
+ """ Returns a tuple (user, host) from the user_host string. """
+ if "@" in user_host:
+ return tuple(user_host.split("@"))
+ return None, user_host
+
+
def parse_options(options):
""" Parses options and returns a dict.
@@ -1060,10 +1074,9 @@ def crash_server(options, crash_canary, canary_port, local_ops, script_name, cli
LOGGER.info("Crashing server in %d seconds", crash_wait_time)
time.sleep(crash_wait_time)
- crash_func = local_ops.shell
-
if options.crash_method == "mpower":
# Provide time for power to dissipate by sleeping 10 seconds before turning it back on.
+ crash_func = local_ops.shell
crash_args = ["""
echo 0 > /dev/{crash_options} ;
sleep 10 ;
@@ -1072,7 +1085,6 @@ def crash_server(options, crash_canary, canary_port, local_ops, script_name, cli
user_host=options.ssh_crash_user_host,
ssh_connection_options=options.ssh_crash_options,
shell_binary="/bin/sh")
- crash_func = local_ops.shell
elif options.crash_method == "internal":
if options.canary == "remote":
@@ -1084,6 +1096,7 @@ def crash_server(options, crash_canary, canary_port, local_ops, script_name, cli
else:
canary = ""
canary_cmd = ""
+ crash_func = local_ops.shell
crash_args = ["{} {} --remoteOperation {} {} {} crash_server".format(
options.remote_python,
script_name,
@@ -1094,11 +1107,12 @@ def crash_server(options, crash_canary, canary_port, local_ops, script_name, cli
elif options.crash_method == "aws_ec2":
ec2 = aws_ec2.AwsEc2()
crash_func = ec2.control_instance
- crash_args = ["force-stop", options.crash_options]
+ instance_id, _ = get_aws_crash_options(options.crash_options)
+ crash_args = ["force-stop", instance_id, 60, True]
else:
LOGGER.error("Unsupported crash method '%s' provided", options.crash_method)
- return False
+ return
# Invoke the crash canary function, right before crashing the server.
if crash_canary and options.canary == "local":
@@ -1445,6 +1459,18 @@ Examples:
help="Crash methods: {} [default: '%default']".format(crash_methods),
default="internal")
+ aws_address_types = [
+ "private_ip_address", "public_ip_address", "private_dns_name", "public_dns_name"]
+ crash_options.add_option("--crashOptions",
+ dest="crash_options",
+ help="Secondary argument (REQUIRED) for the following --crashMethod:"
+ " 'aws_ec2': specify EC2 'instance_id[:address_type]'."
+ " The address_type is one of {} and defaults to"
+ " 'public_ip_address'."
+ " 'mpower': specify output<num> to turn off/on, i.e.,"
+ " 'output1'.".format(aws_address_types),
+ default=None)
+
crash_options.add_option("--crashWaitTime",
dest="crash_wait_time",
help="Time, in seconds, to wait before issuing crash [default:"
@@ -1470,14 +1496,6 @@ Examples:
help="The crash host's ssh connection options, i.e., '-i ident.pem'",
default=None)
- crash_options.add_option("--crashOptions",
- dest="crash_options",
- help="Secondary argument for the following --crashMethod:"
- " 'aws_ec2': specify EC2 instance_id."
- " 'mpower': specify output<num> to turn off/on, i.e.,"
- " 'output1'.",
- default=None)
-
# MongoDB options
mongodb_options.add_option("--downloadUrl",
dest="tarball_url",
@@ -1686,6 +1704,21 @@ Examples:
print("{}:{}".format(script_name, __version__))
sys.exit(0)
+ # Setup the crash options
+ if ((options.crash_method == "aws_ec2" or options.crash_method == "mpower") and
+ options.crash_options is None):
+ parser.error("Missing required argument --crashOptions for crashMethod '{}'".format(
+ options.crash_method))
+
+ if options.crash_method == "aws_ec2":
+ instance_id, address_type = get_aws_crash_options(options.crash_options)
+ address_type = address_type if address_type is not None else "public_ip_address"
+ if address_type not in aws_address_types:
+ LOGGER.error("Invalid crashOptions address_type '%s' specified for crashMethod"
+ " 'aws_ec2', specify one of %s", address_type, aws_address_types)
+ sys.exit(1)
+ options.crash_options = "{}:{}".format(instance_id, address_type)
+
# Initialize the mongod options
if not options.root_dir:
options.root_dir = "mongodb-powertest-{}".format(int(time.time()))
@@ -1768,7 +1801,8 @@ Examples:
# The remote mongod host comes from the ssh_user_host,
# which may be specified as user@host.
- mongod_host = options.ssh_user_host.rsplit()[-1].rsplit("@")[-1]
+ ssh_user, ssh_host = get_user_host(options.ssh_user_host)
+ mongod_host = ssh_host
ssh_connection_options = "{} {}".format(
default_ssh_connection_options,
@@ -2027,6 +2061,28 @@ Examples:
# Wait a bit after sending command to crash the server to avoid connecting to the
# server before the actual crash occurs.
time.sleep(10)
+
+ # The EC2 instance address changes if the crash_method is 'aws_ec2'.
+ if options.crash_method == "aws_ec2":
+ ec2 = aws_ec2.AwsEc2()
+ ret, aws_status = ec2.control_instance(
+ mode="start", image_id=instance_id, wait_time_secs=60, show_progress=True)
+ LOGGER.info("Start instance: %d %s****", ret, aws_status)
+ if not hasattr(aws_status, address_type):
+ raise Exception("Cannot determine address_type {} from AWS EC2 status {}".format(
+ address_type, aws_status))
+ ssh_host = getattr(aws_status, address_type)
+ if ssh_user is None:
+ ssh_user_host = ssh_host
+ else:
+ ssh_user_host = "{}@{}".format(ssh_user, ssh_host)
+ mongod_host = ssh_host
+ local_ops = LocalToRemoteOperations(
+ user_host=ssh_user_host,
+ ssh_connection_options=ssh_connection_options,
+ ssh_options=ssh_options,
+ use_shell=True)
+
canary_doc = copy.deepcopy(orig_canary_doc)
kill_processes(crud_pids + fsm_pids)