summary refs log tree commit diff
diff options
context:
space:
mode:
author Jonathan Abrahams <jonathan@Jonathans-MacBook-Pro.local> 2018-09-13 10:30:39 -0400
committer Jonathan Abrahams <jonathan@mongodb.com> 2018-09-13 12:29:28 -0400
commit f4d62c2ba9a27dc03663779d0817bc399ab2e91f (patch)
tree 9b2a9d4d362853d765f39b24d16d07f7ad103d70
parent 0aae246c5dc62e4f6541b3afe66ba04eb2d02799 (diff)
download mongo-f4d62c2ba9a27dc03663779d0817bc399ab2e91f.tar.gz
SERVER-36162 Powercycle - ensure internal crash command has been executed on the remote host
-rwxr-xr-x buildscripts/remote_operations.py 12
-rwxr-xr-x pytests/powertest.py 56
2 files changed, 59 insertions, 9 deletions
diff --git a/buildscripts/remote_operations.py b/buildscripts/remote_operations.py
index a1f95f117e8..93798ae7ac1 100755
--- a/buildscripts/remote_operations.py
+++ b/buildscripts/remote_operations.py
@@ -35,6 +35,7 @@ _OPERATIONS = ["shell", "copy_to", "copy_from"]
_SSH_CONNECTION_ERRORS = [
"Connection refused",
+ "Connection timed out during banner exchange",
"Permission denied",
"System is booting up.",
"ssh_exchange_identification: read: Connection reset by peer",
@@ -110,13 +111,22 @@ class RemoteOperations(object): # pylint: disable=too-many-instance-attributes
return self._call(cmd)
def access_established(self):
- """Return True if initial access was establsished."""
+ """Return True if initial access was established."""
return not self._access_code
def access_info(self):
"""Return the return code and output buffer from initial access attempt(s)."""
return self._access_code, self._access_buff
+ @staticmethod
+ def ssh_error(message):
+ """Return True if the error message is generated from the ssh client.
+
+ This can help determine if an error is due to a remote operation failing or an ssh
+ related issue, like a connection issue.
+ """
+ return message.startswith("ssh:")
+
def operation( # pylint: disable=too-many-branches
self, operation_type, operation_param, operation_dir=None):
"""Execute Main entry for remote operations. Returns (code, output).
diff --git a/pytests/powertest.py b/pytests/powertest.py
index 72a4486fe75..aad08ea0e5e 100755
--- a/pytests/powertest.py
+++ b/pytests/powertest.py
@@ -630,6 +630,17 @@ def install_mongod(bin_dir=None, tarball_url="latest", root_dir=None):
symlink_dir(tarball_bin_dir, root_bin_dir)
+def get_boot_datetime(uptime_string):
+ """Return the datetime value of boot_time from formatted print_uptime 'uptime_string'.
+
+ Return -1 if it is not found in 'uptime_string'.
+ """
+ match = re.search(r"last booted (.*), up", uptime_string)
+ if match:
+ return datetime.datetime(*map(int, map(float, re.split("[ :-]", match.groups()[0]))))
+ return -1
+
+
def print_uptime():
"""Print the last time the system was booted, and the uptime (in seconds)."""
boot_time_epoch = psutil.boot_time()
@@ -639,7 +650,7 @@ def print_uptime():
def call_remote_operation(local_ops, remote_python, script_name, client_args, operation):
- """Call the remote operation and returns tuple (ret, ouput)."""
+ """Call the remote operation and return tuple (ret, ouput)."""
client_call = "{} {} {} {}".format(remote_python, script_name, client_args, operation)
ret, output = local_ops.shell(client_call)
return ret, output
@@ -1101,13 +1112,18 @@ class MongodControl(object): # pylint: disable=too-many-instance-attributes
return self.service.get_pids()
+def ssh_failure_exit(code, output):
+ """Exit on ssh failure with code."""
+ EXIT_YML["ec2_ssh_failure"] = output
+ local_exit(code)
+
+
def verify_remote_access(remote_op):
"""Exit if the remote host is not accessible and save result to YML file."""
if not remote_op.access_established():
code, output = remote_op.access_info()
LOGGER.error("Exiting, unable to establish access (%d): %s", code, output)
- EXIT_YML["ec2_ssh_failure"] = output
- local_exit(code)
+ ssh_failure_exit(code, output)
class LocalToRemoteOperations(object):
@@ -1142,6 +1158,10 @@ class LocalToRemoteOperations(object):
"""Return True if remote access has been established."""
return self.remote_op.access_established()
+ def ssh_error(self, output):
+ """Return True if 'output' contains an ssh error."""
+ return self.remote_op.ssh_error(output)
+
def access_info(self):
"""Return the return code and output buffer from initial access attempt(s)."""
return self.remote_op.access_info()
@@ -1180,11 +1200,14 @@ def remote_handler(options, operations): # pylint: disable=too-many-branches,to
mongo_client_opts = get_mongo_client_args(host=host, port=options.port, options=options)
- # Perform the sequence of operations specified. If any operation fails
- # then return immediately.
+ # Perform the sequence of operations specified. If any operation fails then return immediately.
for operation in operations:
+ ret = 0
+ if operation == "noop":
+ pass
+
# This is the internal "crash" mechanism, which is executed on the remote host.
- if operation == "crash_server":
+ elif operation == "crash_server":
ret, output = internal_crash(options.remote_sudo, options.crash_option)
# An internal crash on Windows is not immediate
try:
@@ -1244,7 +1267,6 @@ def remote_handler(options, operations): # pylint: disable=too-many-branches,to
LOGGER.info("Server serverStatus: %s", mongo.admin.command("serverStatus"))
if options.repl_set:
ret = mongo_reconfig_replication(mongo, host_port, options.repl_set)
- ret = 0 if not ret else 1
elif operation == "stop_mongod":
ret, output = mongod.stop()
@@ -1299,7 +1321,6 @@ def remote_handler(options, operations): # pylint: disable=too-many-branches,to
elif operation == "remove_lock_file":
lock_file = os.path.join(options.db_path, "mongod.lock")
- ret = 0
if os.path.exists(lock_file):
LOGGER.debug("Deleting mongod lockfile %s", lock_file)
try:
@@ -2405,6 +2426,8 @@ Examples:
if ret:
local_exit(ret)
+ boot_time_after_recovery = get_boot_datetime(output)
+
# Start CRUD clients
host_port = "{}:{}".format(mongod_host, standard_port)
for i in xrange(options.num_crud_clients):
@@ -2449,11 +2472,17 @@ Examples:
crash_canary["args"] = [mongo, options.db_name, options.collection_name, canary_doc]
ret, output = crash_server_or_kill_mongod(options, crash_canary, standard_port, local_ops,
script_name, client_args)
+
+ LOGGER.info("Crash server or Kill mongod: %d %s****", ret, output)
+
# For internal crashes 'ret' is non-zero, because the ssh session unexpectedly terminates.
if options.crash_method != "internal" and ret:
raise Exception("Crash of server failed: {}".format(output))
if options.crash_method != "kill":
+ # Check if the crash failed due to an ssh error.
+ if options.crash_method == "internal" and local_ops.ssh_error(output):
+ ssh_failure_exit(ret, output)
# Wait a bit after sending command to crash the server to avoid connecting to the
# server before the actual crash occurs.
time.sleep(10)
@@ -2491,6 +2520,17 @@ Examples:
ssh_connection_options=ssh_connection_options,
ssh_options=ssh_options, use_shell=True)
verify_remote_access(local_ops)
+ ret, output = call_remote_operation(local_ops, options.remote_python, script_name,
+ client_args, "--remoteOperation noop")
+ boot_time_after_crash = get_boot_datetime(output)
+ if boot_time_after_crash == -1 or boot_time_after_recovery == -1:
+ LOGGER.warning(
+ "Cannot compare boot time after recovery: %s with boot time after crash: %s",
+ boot_time_after_recovery, boot_time_after_crash)
+ elif options.crash_method != "kill" and boot_time_after_crash <= boot_time_after_recovery:
+ raise Exception(
+ "System boot time after crash ({}) is not newer than boot time before crash ({})".
+ format(boot_time_after_crash, boot_time_after_recovery))
canary_doc = copy.deepcopy(orig_canary_doc)