summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Mikhail Shchatko <mikhail.shchatko@mongodb.com> 2021-04-26 11:44:09 +0300
committer: Evergreen Agent <no-reply@evergreen.mongodb.com> 2021-04-28 16:16:48 +0000
commit: 522e0db207fe2a0d3b2bc75437b6af5b70b8827c (patch)
tree: c073fc97b76c5a2bd86d76f04048b9abfcb9c65b
parent: 9d463d0283018bd0c07604b3f74ad6830f0e513b (diff)
download: mongo-522e0db207fe2a0d3b2bc75437b6af5b70b8827c.tar.gz
SERVER-56340 Retry remote host setup operations
(cherry picked from commit 3d63ca6b013383cf12e22d414697f62bae2a2b6d)
-rw-r--r-- buildscripts/resmokelib/powercycle/lib/remote_operations.py 16
-rw-r--r-- buildscripts/resmokelib/powercycle/setup/__init__.py 16
2 files changed, 16 insertions, 16 deletions
diff --git a/buildscripts/resmokelib/powercycle/lib/remote_operations.py b/buildscripts/resmokelib/powercycle/lib/remote_operations.py
index 1e3178163b5..8ae7d9caec7 100644
--- a/buildscripts/resmokelib/powercycle/lib/remote_operations.py
+++ b/buildscripts/resmokelib/powercycle/lib/remote_operations.py
@@ -58,7 +58,6 @@ class RemoteOperations(object): # pylint: disable=too-many-instance-attributes
self.ssh_connection_options = ssh_connection_options if ssh_connection_options else ""
self.ssh_options = ssh_options if ssh_options else ""
self.scp_options = scp_options if scp_options else ""
- self.retries = 5
self.retry_sleep = 10
self.ignore_ret = ignore_ret
self.shell_binary = shell_binary
@@ -80,7 +79,7 @@ class RemoteOperations(object): # pylint: disable=too-many-instance-attributes
print(textwrap.indent(buff, "[result body] "))
return process.poll(), buff
- def _call_retries(self, cmd):
+ def _call_retries(self, cmd, retry_count):
attempt_num = 0
while True:
ret, buff = self._call(cmd)
@@ -88,7 +87,7 @@ class RemoteOperations(object): # pylint: disable=too-many-instance-attributes
if not ret and not any(ssh_error in buff for ssh_error in _SSH_CONNECTION_ERRORS):
return ret, buff
attempt_num += 1
- if attempt_num > self.retries:
+ if attempt_num > retry_count:
print("Exhausted all retry attempts.")
break
print("Remote attempt {} unsuccessful, retrying in {} seconds".format(
@@ -100,11 +99,11 @@ class RemoteOperations(object): # pylint: disable=too-many-instance-attributes
"""Check if a remote session is possible."""
cmd = "ssh {} {} {} date".format(self.ssh_connection_options, self.ssh_options,
self.user_host)
- return self._call_retries(cmd)
+ return self._call_retries(cmd, 5)
- def _perform_operation(self, cmd, retry):
+ def _perform_operation(self, cmd, retry, retry_count):
if retry:
- return self._call_retries(cmd)
+ return self._call_retries(cmd, retry_count)
return self._call(cmd)
@@ -126,7 +125,8 @@ class RemoteOperations(object): # pylint: disable=too-many-instance-attributes
return message.startswith("ssh:")
# pylint: disable=too-many-branches,too-many-arguments,too-many-locals,inconsistent-return-statements
- def operation(self, operation_type, operation_param, operation_dir=None, retry=False):
+ def operation(self, operation_type, operation_param, operation_dir=None, retry=False,
+ retry_count=5):
"""Execute Main entry for remote operations. Returns (code, output).
'operation_type' supports remote shell and copy operations.
@@ -196,7 +196,7 @@ class RemoteOperations(object): # pylint: disable=too-many-instance-attributes
print(f"Created {operation_type} operation")
buff = ""
- ret, new_buff = self._perform_operation(cmd, retry)
+ ret, new_buff = self._perform_operation(cmd, retry, retry_count)
buff += new_buff
if ret != 0:
diff --git a/buildscripts/resmokelib/powercycle/setup/__init__.py b/buildscripts/resmokelib/powercycle/setup/__init__.py
index fb980750466..f0b9be02b95 100644
--- a/buildscripts/resmokelib/powercycle/setup/__init__.py
+++ b/buildscripts/resmokelib/powercycle/setup/__init__.py
@@ -31,7 +31,7 @@ class SetUpEC2Instance(PowercycleCommand):
cmds = f"{self.sudo} mkdir -p {remote_dir}; {self.sudo} chown -R {user_group} {remote_dir}; {set_permission_stmt} {remote_dir}; ls -ld {remote_dir}"
cmds = f"{cmds}; {self.sudo} mkdir -p {db_path}; {self.sudo} chown -R {user_group} {db_path}; {set_permission_stmt} {db_path}; ls -ld {db_path}"
- self.remote_op.operation(SSHOperation.SHELL, cmds, None)
+ self.remote_op.operation(SSHOperation.SHELL, cmds, retry=True, retry_count=2)
# Second operation -
# Copy buildscripts and mongoDB executables to the remote host.
@@ -41,7 +41,7 @@ class SetUpEC2Instance(PowercycleCommand):
if os.path.isdir(shared_libs):
files.append(shared_libs)
- self.remote_op.operation(SSHOperation.COPY_TO, files, remote_dir)
+ self.remote_op.operation(SSHOperation.COPY_TO, files, remote_dir, retry=True, retry_count=2)
# Third operation -
# Set up virtualenv on remote.
@@ -57,7 +57,7 @@ class SetUpEC2Instance(PowercycleCommand):
cmds = f"{cmds}; . $activate"
cmds = f"{cmds}; pip3 install -r $remote_dir/etc/pip/powercycle-requirements.txt"
- self.remote_op.operation(SSHOperation.SHELL, cmds, None)
+ self.remote_op.operation(SSHOperation.SHELL, cmds, retry=True, retry_count=2)
# Fourth operation -
# Enable core dumps on non-Windows remote hosts.
@@ -72,7 +72,7 @@ class SetUpEC2Instance(PowercycleCommand):
cmds = f"{cmds}; if [ -f {sysctl_conf} ]"
cmds = f"{cmds}; then grep ^kernel.core_pattern {sysctl_conf}"
cmds = f"{cmds}; if [ $? -eq 0 ]"
- cmds = f"{cmds}; then {self.sudo} sed -i \"s,kernel.core_pattern=.*,kernel.core_pattern=$core_pattern,\" {sysctl_conf}"
+ cmds = f"{cmds}; then {self.sudo} sed -i \"s,kernel.core_pattern=.*,kernel.core_pattern={core_pattern},\" {sysctl_conf}"
cmds = f"{cmds}; else echo \"kernel.core_pattern={core_pattern}\" | {self.sudo} tee -a {sysctl_conf}"
cmds = f"{cmds}; fi"
cmds = f"{cmds}; else echo Cannot change the core pattern and no core dumps will be generated."
@@ -81,7 +81,7 @@ class SetUpEC2Instance(PowercycleCommand):
# https://unix.stackexchange.com/a/349558 in order to ensure the ssh client gets a
# response from the remote machine before it restarts.
cmds = f"{cmds}; nohup {self.sudo} reboot &>/dev/null & exit"
- self.remote_op.operation(SSHOperation.SHELL, cmds, None)
+ self.remote_op.operation(SSHOperation.SHELL, cmds, retry=True, retry_count=2)
# Fifth operation -
# Print the ulimit & kernel.core_pattern
@@ -93,7 +93,7 @@ class SetUpEC2Instance(PowercycleCommand):
cmds = f"{cmds}; then /sbin/sysctl kernel.core_pattern"
cmds = f"{cmds}; fi"
- self.remote_op.operation(SSHOperation.SHELL, cmds, None, True)
+ self.remote_op.operation(SSHOperation.SHELL, cmds, retry=True, retry_count=2)
# Sixth operation -
# Set up curator to collect system & process stats on remote.
@@ -120,7 +120,7 @@ class SetUpEC2Instance(PowercycleCommand):
cmds = f"{cmds}; crontab -l"
cmds = f"{cmds}; {{ {self.sudo} $HOME/curator stat system --file {monitor_system_file} > /dev/null 2>&1 & {self.sudo} $HOME/curator stat process-all --file {monitor_proc_file} > /dev/null 2>&1 & }} & disown"
- self.remote_op.operation(SSHOperation.SHELL, cmds, retry=True)
+ self.remote_op.operation(SSHOperation.SHELL, cmds, retry=True, retry_count=2)
# Seventh operation -
# Install NotMyFault, used to crash Windows.
@@ -132,4 +132,4 @@ class SetUpEC2Instance(PowercycleCommand):
cmds = f"curl -s -o {windows_crash_zip} {windows_crash_dl}"
cmds = f"{cmds}; unzip -q {windows_crash_zip} -d {windows_crash_dir}"
cmds = f"{cmds}; chmod +x {windows_crash_dir}/*.exe"
- self.remote_op.operation(SSHOperation.SHELL, cmds, None)
+ self.remote_op.operation(SSHOperation.SHELL, cmds, retry=True, retry_count=2)