diff options
author | Jonathan Abrahams <jonathan@Jonathans-MacBook-Pro.local> | 2018-09-13 10:30:39 -0400 |
---|---|---|
committer | Jonathan Abrahams <jonathan@mongodb.com> | 2018-09-13 12:29:28 -0400 |
commit | f4d62c2ba9a27dc03663779d0817bc399ab2e91f (patch) | |
tree | 9b2a9d4d362853d765f39b24d16d07f7ad103d70 | |
parent | 0aae246c5dc62e4f6541b3afe66ba04eb2d02799 (diff) | |
download | mongo-f4d62c2ba9a27dc03663779d0817bc399ab2e91f.tar.gz |
SERVER-36162 Powercycle - ensure internal crash command has been executed on the remote host
-rwxr-xr-x | buildscripts/remote_operations.py | 12 | ||||
-rwxr-xr-x | pytests/powertest.py | 56 |
2 files changed, 59 insertions, 9 deletions
diff --git a/buildscripts/remote_operations.py b/buildscripts/remote_operations.py index a1f95f117e8..93798ae7ac1 100755 --- a/buildscripts/remote_operations.py +++ b/buildscripts/remote_operations.py @@ -35,6 +35,7 @@ _OPERATIONS = ["shell", "copy_to", "copy_from"] _SSH_CONNECTION_ERRORS = [ "Connection refused", + "Connection timed out during banner exchange", "Permission denied", "System is booting up.", "ssh_exchange_identification: read: Connection reset by peer", @@ -110,13 +111,22 @@ class RemoteOperations(object): # pylint: disable=too-many-instance-attributes return self._call(cmd) def access_established(self): - """Return True if initial access was establsished.""" + """Return True if initial access was established.""" return not self._access_code def access_info(self): """Return the return code and output buffer from initial access attempt(s).""" return self._access_code, self._access_buff + @staticmethod + def ssh_error(message): + """Return True if the error message is generated from the ssh client. + + This can help determine if an error is due to a remote operation failing or an ssh + related issue, like a connection issue. + """ + return message.startswith("ssh:") + def operation( # pylint: disable=too-many-branches self, operation_type, operation_param, operation_dir=None): """Execute Main entry for remote operations. Returns (code, output). diff --git a/pytests/powertest.py b/pytests/powertest.py index 72a4486fe75..aad08ea0e5e 100755 --- a/pytests/powertest.py +++ b/pytests/powertest.py @@ -630,6 +630,17 @@ def install_mongod(bin_dir=None, tarball_url="latest", root_dir=None): symlink_dir(tarball_bin_dir, root_bin_dir) +def get_boot_datetime(uptime_string): + """Return the datetime value of boot_time from formatted print_uptime 'uptime_string'. + + Return -1 if it is not found in 'uptime_string'. + """ + match = re.search(r"last booted (.*), up", uptime_string) + if match: + return datetime.datetime(*map(int, map(float, re.split("[ :-]", match.groups()[0])))) + return -1 + + def print_uptime(): """Print the last time the system was booted, and the uptime (in seconds).""" boot_time_epoch = psutil.boot_time() @@ -639,7 +650,7 @@ def print_uptime(): def call_remote_operation(local_ops, remote_python, script_name, client_args, operation): - """Call the remote operation and returns tuple (ret, ouput).""" + """Call the remote operation and return tuple (ret, ouput).""" client_call = "{} {} {} {}".format(remote_python, script_name, client_args, operation) ret, output = local_ops.shell(client_call) return ret, output @@ -1101,13 +1112,18 @@ class MongodControl(object): # pylint: disable=too-many-instance-attributes return self.service.get_pids() +def ssh_failure_exit(code, output): + """Exit on ssh failure with code.""" + EXIT_YML["ec2_ssh_failure"] = output + local_exit(code) + + def verify_remote_access(remote_op): """Exit if the remote host is not accessible and save result to YML file.""" if not remote_op.access_established(): code, output = remote_op.access_info() LOGGER.error("Exiting, unable to establish access (%d): %s", code, output) - EXIT_YML["ec2_ssh_failure"] = output - local_exit(code) + ssh_failure_exit(code, output) class LocalToRemoteOperations(object): @@ -1142,6 +1158,10 @@ class LocalToRemoteOperations(object): """Return True if remote access has been established.""" return self.remote_op.access_established() + def ssh_error(self, output): + """Return True if 'output' contains an ssh error.""" + return self.remote_op.ssh_error(output) + def access_info(self): """Return the return code and output buffer from initial access attempt(s).""" return self.remote_op.access_info() @@ -1180,11 +1200,14 @@ def remote_handler(options, operations): # pylint: disable=too-many-branches,to mongo_client_opts = get_mongo_client_args(host=host, port=options.port, options=options) - # Perform the sequence of operations specified. If any operation fails - # then return immediately. + # Perform the sequence of operations specified. If any operation fails then return immediately. for operation in operations: + ret = 0 + if operation == "noop": + pass + # This is the internal "crash" mechanism, which is executed on the remote host. - if operation == "crash_server": + elif operation == "crash_server": ret, output = internal_crash(options.remote_sudo, options.crash_option) # An internal crash on Windows is not immediate try: @@ -1244,7 +1267,6 @@ def remote_handler(options, operations): # pylint: disable=too-many-branches,to LOGGER.info("Server serverStatus: %s", mongo.admin.command("serverStatus")) if options.repl_set: ret = mongo_reconfig_replication(mongo, host_port, options.repl_set) - ret = 0 if not ret else 1 elif operation == "stop_mongod": ret, output = mongod.stop() @@ -1299,7 +1321,6 @@ def remote_handler(options, operations): # pylint: disable=too-many-branches,to elif operation == "remove_lock_file": lock_file = os.path.join(options.db_path, "mongod.lock") - ret = 0 if os.path.exists(lock_file): LOGGER.debug("Deleting mongod lockfile %s", lock_file) try: @@ -2405,6 +2426,8 @@ Examples: if ret: local_exit(ret) + boot_time_after_recovery = get_boot_datetime(output) + # Start CRUD clients host_port = "{}:{}".format(mongod_host, standard_port) for i in xrange(options.num_crud_clients): @@ -2449,11 +2472,17 @@ Examples: crash_canary["args"] = [mongo, options.db_name, options.collection_name, canary_doc] ret, output = crash_server_or_kill_mongod(options, crash_canary, standard_port, local_ops, script_name, client_args) + + LOGGER.info("Crash server or Kill mongod: %d %s****", ret, output) + # For internal crashes 'ret' is non-zero, because the ssh session unexpectedly terminates. if options.crash_method != "internal" and ret: raise Exception("Crash of server failed: {}".format(output)) if options.crash_method != "kill": + # Check if the crash failed due to an ssh error. + if options.crash_method == "internal" and local_ops.ssh_error(output): + ssh_failure_exit(ret, output) # Wait a bit after sending command to crash the server to avoid connecting to the # server before the actual crash occurs. time.sleep(10) @@ -2491,6 +2520,17 @@ Examples: ssh_connection_options=ssh_connection_options, ssh_options=ssh_options, use_shell=True) verify_remote_access(local_ops) + ret, output = call_remote_operation(local_ops, options.remote_python, script_name, + client_args, "--remoteOperation noop") + boot_time_after_crash = get_boot_datetime(output) + if boot_time_after_crash == -1 or boot_time_after_recovery == -1: + LOGGER.warning( + "Cannot compare boot time after recovery: %s with boot time after crash: %s", + boot_time_after_recovery, boot_time_after_crash) + elif options.crash_method != "kill" and boot_time_after_crash <= boot_time_after_recovery: + raise Exception( + "System boot time after crash ({}) is not newer than boot time before crash ({})". + format(boot_time_after_crash, boot_time_after_recovery)) canary_doc = copy.deepcopy(orig_canary_doc) |