diff options
author | Mikhail Shchatko <mikhail.shchatko@mongodb.com> | 2021-04-26 11:44:09 +0300 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2021-04-28 16:16:48 +0000 |
commit | 522e0db207fe2a0d3b2bc75437b6af5b70b8827c (patch) | |
tree | c073fc97b76c5a2bd86d76f04048b9abfcb9c65b | |
parent | 9d463d0283018bd0c07604b3f74ad6830f0e513b (diff) | |
download | mongo-522e0db207fe2a0d3b2bc75437b6af5b70b8827c.tar.gz |
SERVER-56340 Retry remote host setup operations
(cherry picked from commit 3d63ca6b013383cf12e22d414697f62bae2a2b6d)
-rw-r--r-- | buildscripts/resmokelib/powercycle/lib/remote_operations.py | 16 | ||||
-rw-r--r-- | buildscripts/resmokelib/powercycle/setup/__init__.py | 16 |
2 files changed, 16 insertions, 16 deletions
diff --git a/buildscripts/resmokelib/powercycle/lib/remote_operations.py b/buildscripts/resmokelib/powercycle/lib/remote_operations.py index 1e3178163b5..8ae7d9caec7 100644 --- a/buildscripts/resmokelib/powercycle/lib/remote_operations.py +++ b/buildscripts/resmokelib/powercycle/lib/remote_operations.py @@ -58,7 +58,6 @@ class RemoteOperations(object): # pylint: disable=too-many-instance-attributes self.ssh_connection_options = ssh_connection_options if ssh_connection_options else "" self.ssh_options = ssh_options if ssh_options else "" self.scp_options = scp_options if scp_options else "" - self.retries = 5 self.retry_sleep = 10 self.ignore_ret = ignore_ret self.shell_binary = shell_binary @@ -80,7 +79,7 @@ class RemoteOperations(object): # pylint: disable=too-many-instance-attributes print(textwrap.indent(buff, "[result body] ")) return process.poll(), buff - def _call_retries(self, cmd): + def _call_retries(self, cmd, retry_count): attempt_num = 0 while True: ret, buff = self._call(cmd) @@ -88,7 +87,7 @@ class RemoteOperations(object): # pylint: disable=too-many-instance-attributes if not ret and not any(ssh_error in buff for ssh_error in _SSH_CONNECTION_ERRORS): return ret, buff attempt_num += 1 - if attempt_num > self.retries: + if attempt_num > retry_count: print("Exhausted all retry attempts.") break print("Remote attempt {} unsuccessful, retrying in {} seconds".format( @@ -100,11 +99,11 @@ class RemoteOperations(object): # pylint: disable=too-many-instance-attributes """Check if a remote session is possible.""" cmd = "ssh {} {} {} date".format(self.ssh_connection_options, self.ssh_options, self.user_host) - return self._call_retries(cmd) + return self._call_retries(cmd, 5) - def _perform_operation(self, cmd, retry): + def _perform_operation(self, cmd, retry, retry_count): if retry: - return self._call_retries(cmd) + return self._call_retries(cmd, retry_count) return self._call(cmd) @@ -126,7 +125,8 @@ class RemoteOperations(object): # pylint: disable=too-many-instance-attributes return message.startswith("ssh:") # pylint: disable=too-many-branches,too-many-arguments,too-many-locals,inconsistent-return-statements - def operation(self, operation_type, operation_param, operation_dir=None, retry=False): + def operation(self, operation_type, operation_param, operation_dir=None, retry=False, + retry_count=5): """Execute Main entry for remote operations. Returns (code, output). 'operation_type' supports remote shell and copy operations. @@ -196,7 +196,7 @@ class RemoteOperations(object): # pylint: disable=too-many-instance-attributes print(f"Created {operation_type} operation") buff = "" - ret, new_buff = self._perform_operation(cmd, retry) + ret, new_buff = self._perform_operation(cmd, retry, retry_count) buff += new_buff if ret != 0: diff --git a/buildscripts/resmokelib/powercycle/setup/__init__.py b/buildscripts/resmokelib/powercycle/setup/__init__.py index fb980750466..f0b9be02b95 100644 --- a/buildscripts/resmokelib/powercycle/setup/__init__.py +++ b/buildscripts/resmokelib/powercycle/setup/__init__.py @@ -31,7 +31,7 @@ class SetUpEC2Instance(PowercycleCommand): cmds = f"{self.sudo} mkdir -p {remote_dir}; {self.sudo} chown -R {user_group} {remote_dir}; {set_permission_stmt} {remote_dir}; ls -ld {remote_dir}" cmds = f"{cmds}; {self.sudo} mkdir -p {db_path}; {self.sudo} chown -R {user_group} {db_path}; {set_permission_stmt} {db_path}; ls -ld {db_path}" - self.remote_op.operation(SSHOperation.SHELL, cmds, None) + self.remote_op.operation(SSHOperation.SHELL, cmds, retry=True, retry_count=2) # Second operation - # Copy buildscripts and mongoDB executables to the remote host. @@ -41,7 +41,7 @@ class SetUpEC2Instance(PowercycleCommand): if os.path.isdir(shared_libs): files.append(shared_libs) - self.remote_op.operation(SSHOperation.COPY_TO, files, remote_dir) + self.remote_op.operation(SSHOperation.COPY_TO, files, remote_dir, retry=True, retry_count=2) # Third operation - # Set up virtualenv on remote. @@ -57,7 +57,7 @@ class SetUpEC2Instance(PowercycleCommand): cmds = f"{cmds}; . $activate" cmds = f"{cmds}; pip3 install -r $remote_dir/etc/pip/powercycle-requirements.txt" - self.remote_op.operation(SSHOperation.SHELL, cmds, None) + self.remote_op.operation(SSHOperation.SHELL, cmds, retry=True, retry_count=2) # Fourth operation - # Enable core dumps on non-Windows remote hosts. @@ -72,7 +72,7 @@ class SetUpEC2Instance(PowercycleCommand): cmds = f"{cmds}; if [ -f {sysctl_conf} ]" cmds = f"{cmds}; then grep ^kernel.core_pattern {sysctl_conf}" cmds = f"{cmds}; if [ $? -eq 0 ]" - cmds = f"{cmds}; then {self.sudo} sed -i \"s,kernel.core_pattern=.*,kernel.core_pattern=$core_pattern,\" {sysctl_conf}" + cmds = f"{cmds}; then {self.sudo} sed -i \"s,kernel.core_pattern=.*,kernel.core_pattern={core_pattern},\" {sysctl_conf}" cmds = f"{cmds}; else echo \"kernel.core_pattern={core_pattern}\" | {self.sudo} tee -a {sysctl_conf}" cmds = f"{cmds}; fi" cmds = f"{cmds}; else echo Cannot change the core pattern and no core dumps will be generated." @@ -81,7 +81,7 @@ class SetUpEC2Instance(PowercycleCommand): # https://unix.stackexchange.com/a/349558 in order to ensure the ssh client gets a # response from the remote machine before it restarts. cmds = f"{cmds}; nohup {self.sudo} reboot &>/dev/null & exit" - self.remote_op.operation(SSHOperation.SHELL, cmds, None) + self.remote_op.operation(SSHOperation.SHELL, cmds, retry=True, retry_count=2) # Fifth operation - # Print the ulimit & kernel.core_pattern @@ -93,7 +93,7 @@ class SetUpEC2Instance(PowercycleCommand): cmds = f"{cmds}; then /sbin/sysctl kernel.core_pattern" cmds = f"{cmds}; fi" - self.remote_op.operation(SSHOperation.SHELL, cmds, None, True) + self.remote_op.operation(SSHOperation.SHELL, cmds, retry=True, retry_count=2) # Sixth operation - # Set up curator to collect system & process stats on remote. @@ -120,7 +120,7 @@ class SetUpEC2Instance(PowercycleCommand): cmds = f"{cmds}; crontab -l" cmds = f"{cmds}; {{ {self.sudo} $HOME/curator stat system --file {monitor_system_file} > /dev/null 2>&1 & {self.sudo} $HOME/curator stat process-all --file {monitor_proc_file} > /dev/null 2>&1 & }} & disown" - self.remote_op.operation(SSHOperation.SHELL, cmds, retry=True) + self.remote_op.operation(SSHOperation.SHELL, cmds, retry=True, retry_count=2) # Seventh operation - # Install NotMyFault, used to crash Windows. @@ -132,4 +132,4 @@ class SetUpEC2Instance(PowercycleCommand): cmds = f"curl -s -o {windows_crash_zip} {windows_crash_dl}" cmds = f"{cmds}; unzip -q {windows_crash_zip} -d {windows_crash_dir}" cmds = f"{cmds}; chmod +x {windows_crash_dir}/*.exe" - self.remote_op.operation(SSHOperation.SHELL, cmds, None) + self.remote_op.operation(SSHOperation.SHELL, cmds, retry=True, retry_count=2) |