summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMatt Martz <matt@sivel.net>2023-04-12 15:13:45 -0500
committerGitHub <noreply@github.com>2023-04-12 15:13:45 -0500
commitdab364052716530f770b2cef3588e6d03456a2b9 (patch)
treed784c9f3f1176aba62f7d6461fbe933c982ed92e
parent2e50a9e4826f946fe200b7e647e59189175f48bc (diff)
downloadansible-dab364052716530f770b2cef3588e6d03456a2b9.tar.gz
[stable-2.13] ansible-galaxy collection install retry improvements (#80180) (#80275)
* clog frag * Fix retries so that each explicit call to _call_galaxy is retried for the correct number of attempts. Fixes #80174 * Extend retry logic to common URL related connection errors. Fixes #80170 * Extend retries to downloading artifacts * Extend param docs for change * Rework the exception handling * Don't be overly broad, reduce to TimeoutError, and BadStatusLine for now * _download_file needs to raise AnsibleError.orig_exc * Remove unused import * Add IncompleteRead * Add socket.timeout for py39 * Add 502 to retry codes * Move http error code checking first * Use itertools.tee to replay the backoff_iterator instead of using a callable * Actually set a CLI default of 60s for timeout, to prevent implicit galaxy from using 10s as default from Request.open * Import typing * fix type hints * Use http.HTTPStatus instead of int HTTP error codes where feasible * Split exception handling * Add missing import --------- . (cherry picked from commit 2ae013667ef226635fe521be886efd1bf58cd46f)
-rw-r--r--changelogs/fragments/galaxy-improve-retries.yml3
-rw-r--r--lib/ansible/galaxy/api.py36
-rw-r--r--lib/ansible/galaxy/collection/concrete_artifact_manager.py51
-rw-r--r--lib/ansible/module_utils/api.py15
4 files changed, 79 insertions, 26 deletions
diff --git a/changelogs/fragments/galaxy-improve-retries.yml b/changelogs/fragments/galaxy-improve-retries.yml
new file mode 100644
index 0000000000..ab0c7b216c
--- /dev/null
+++ b/changelogs/fragments/galaxy-improve-retries.yml
@@ -0,0 +1,3 @@
+bugfixes:
+- ansible-galaxy - Improve retries for collection installs, to properly retry, and extend retry logic to common URL related connection errors
+ (https://github.com/ansible/ansible/issues/80170 https://github.com/ansible/ansible/issues/80174)
diff --git a/lib/ansible/galaxy/api.py b/lib/ansible/galaxy/api.py
index 627f71fa5f..ef16093347 100644
--- a/lib/ansible/galaxy/api.py
+++ b/lib/ansible/galaxy/api.py
@@ -11,12 +11,15 @@ import functools
import hashlib
import json
import os
+import socket
import stat
import tarfile
import time
import threading
-from urllib.error import HTTPError
+from http import HTTPStatus
+from http.client import BadStatusLine, IncompleteRead
+from urllib.error import HTTPError, URLError
from urllib.parse import quote as urlquote, urlencode, urlparse, parse_qs, urljoin
from ansible import constants as C
@@ -34,10 +37,11 @@ from ansible.utils.path import makedirs_safe
display = Display()
_CACHE_LOCK = threading.Lock()
COLLECTION_PAGE_SIZE = 100
-RETRY_HTTP_ERROR_CODES = [ # TODO: Allow user-configuration
- 429, # Too Many Requests
+RETRY_HTTP_ERROR_CODES = { # TODO: Allow user-configuration
+ HTTPStatus.TOO_MANY_REQUESTS,
520, # Galaxy rate limit error code (Cloudflare unknown error)
-]
+ HTTPStatus.BAD_GATEWAY, # Common error from galaxy that may represent any number of transient backend issues
+}
def cache_lock(func):
@@ -48,11 +52,24 @@ def cache_lock(func):
return wrapped
-def is_rate_limit_exception(exception):
+def should_retry_error(exception):
# Note: cloud.redhat.com masks rate limit errors with 403 (Forbidden) error codes.
# Since 403 could reflect the actual problem (such as an expired token), we should
# not retry by default.
- return isinstance(exception, GalaxyError) and exception.http_code in RETRY_HTTP_ERROR_CODES
+ if isinstance(exception, GalaxyError) and exception.http_code in RETRY_HTTP_ERROR_CODES:
+ return True
+
+ if isinstance(exception, AnsibleError) and (orig_exc := getattr(exception, 'orig_exc', None)):
+ # URLError is often a proxy for an underlying error, handle wrapped exceptions
+ if isinstance(orig_exc, URLError):
+ orig_exc = orig_exc.reason
+
+ # Handle common URL related errors such as TimeoutError, and BadStatusLine
+ # Note: socket.timeout is only required for Py3.9
+ if isinstance(orig_exc, (TimeoutError, BadStatusLine, IncompleteRead, socket.timeout)):
+ return True
+
+ return False
def g_connect(versions):
@@ -326,7 +343,7 @@ class GalaxyAPI:
@retry_with_delays_and_condition(
backoff_iterator=generate_jittered_backoff(retries=6, delay_base=2, delay_threshold=40),
- should_retry_error=is_rate_limit_exception
+ should_retry_error=should_retry_error
)
def _call_galaxy(self, url, args=None, headers=None, method=None, auth_required=False, error_context_msg=None,
cache=False, cache_key=None):
@@ -384,7 +401,10 @@ class GalaxyAPI:
except HTTPError as e:
raise GalaxyError(e, error_context_msg)
except Exception as e:
- raise AnsibleError("Unknown error when attempting to call Galaxy at '%s': %s" % (url, to_native(e)))
+ raise AnsibleError(
+ "Unknown error when attempting to call Galaxy at '%s': %s" % (url, to_native(e)),
+ orig_exc=e
+ )
resp_data = to_text(resp.read(), errors='surrogate_or_strict')
try:
diff --git a/lib/ansible/galaxy/collection/concrete_artifact_manager.py b/lib/ansible/galaxy/collection/concrete_artifact_manager.py
index 4115aeed80..d480af8b5f 100644
--- a/lib/ansible/galaxy/collection/concrete_artifact_manager.py
+++ b/lib/ansible/galaxy/collection/concrete_artifact_manager.py
@@ -27,9 +27,12 @@ if t.TYPE_CHECKING:
from ansible.errors import AnsibleError
from ansible.galaxy import get_collections_galaxy_meta_info
+from ansible.galaxy.api import should_retry_error
from ansible.galaxy.dependency_resolution.dataclasses import _GALAXY_YAML
from ansible.galaxy.user_agent import user_agent
from ansible.module_utils._text import to_bytes, to_native, to_text
+from ansible.module_utils.api import retry_with_delays_and_condition
+from ansible.module_utils.api import generate_jittered_backoff
from ansible.module_utils.common.process import get_bin_path
from ansible.module_utils.common._collections_compat import MutableMapping
from ansible.module_utils.common.yaml import yaml_load
@@ -159,17 +162,24 @@ class ConcreteArtifactsManager:
token=token,
) # type: bytes
except URLError as err:
- raise_from(
- AnsibleError(
- 'Failed to download collection tar '
- "from '{coll_src!s}': {download_err!s}".
- format(
- coll_src=to_native(collection.src),
- download_err=to_native(err),
- ),
+ raise AnsibleError(
+ 'Failed to download collection tar '
+ "from '{coll_src!s}': {download_err!s}".
+ format(
+ coll_src=to_native(collection.src),
+ download_err=to_native(err),
),
- err,
- )
+ ) from err
+ except Exception as err:
+ raise AnsibleError(
+ 'Failed to download collection tar '
+ "from '{coll_src!s}' due to the following unforeseen error: "
+ '{download_err!s}'.
+ format(
+ coll_src=to_native(collection.src),
+ download_err=to_native(err),
+ ),
+ ) from err
else:
display.vvv(
"Collection '{coll!s}' obtained from "
@@ -456,6 +466,10 @@ def _extract_collection_from_git(repo_url, coll_ver, b_path):
# FIXME: use random subdirs while preserving the file names
+@retry_with_delays_and_condition(
+ backoff_iterator=generate_jittered_backoff(retries=6, delay_base=2, delay_threshold=40),
+ should_retry_error=should_retry_error
+)
def _download_file(url, b_path, expected_hash, validate_certs, token=None, timeout=60):
# type: (str, bytes, t.Optional[str], bool, GalaxyToken, int) -> bytes
# ^ NOTE: used in download and verify_collections ^
@@ -474,13 +488,16 @@ def _download_file(url, b_path, expected_hash, validate_certs, token=None, timeo
display.display("Downloading %s to %s" % (url, to_text(b_tarball_dir)))
# NOTE: Galaxy redirects downloads to S3 which rejects the request
# NOTE: if an Authorization header is attached so don't redirect it
- resp = open_url(
- to_native(url, errors='surrogate_or_strict'),
- validate_certs=validate_certs,
- headers=None if token is None else token.headers(),
- unredirected_headers=['Authorization'], http_agent=user_agent(),
- timeout=timeout
- )
+ try:
+ resp = open_url(
+ to_native(url, errors='surrogate_or_strict'),
+ validate_certs=validate_certs,
+ headers=None if token is None else token.headers(),
+ unredirected_headers=['Authorization'], http_agent=user_agent(),
+ timeout=timeout
+ )
+ except Exception as err:
+ raise AnsibleError(to_native(err), orig_exc=err)
with open(b_file_path, 'wb') as download_file: # type: t.BinaryIO
actual_hash = _consume_file(resp, write_to=download_file)
diff --git a/lib/ansible/module_utils/api.py b/lib/ansible/module_utils/api.py
index e780ec6b50..2de8a4efc1 100644
--- a/lib/ansible/module_utils/api.py
+++ b/lib/ansible/module_utils/api.py
@@ -26,11 +26,15 @@ The 'api' module provides the following common argument specs:
from __future__ import (absolute_import, division, print_function)
__metaclass__ = type
+import copy
import functools
+import itertools
import random
import sys
import time
+import ansible.module_utils.compat.typing as t
+
def rate_limit_argument_spec(spec=None):
"""Creates an argument spec for working with rate limiting"""
@@ -141,6 +145,15 @@ def retry_with_delays_and_condition(backoff_iterator, should_retry_error=None):
:param backoff_iterator: An iterable of delays in seconds.
:param should_retry_error: A callable that takes an exception of the decorated function and decides whether to retry or not (returns a bool).
"""
+ def _emit_isolated_iterator_copies(original_iterator): # type: (t.Iterable[t.Any]) -> t.Generator
+ # Ref: https://stackoverflow.com/a/30232619/595220
+ _copiable_iterator, _first_iterator_copy = itertools.tee(original_iterator)
+ yield _first_iterator_copy
+ while True:
+ yield copy.copy(_copiable_iterator)
+ backoff_iterator_generator = _emit_isolated_iterator_copies(backoff_iterator)
+ del backoff_iterator # prevent accidental use elsewhere
+
if should_retry_error is None:
should_retry_error = retry_never
@@ -152,7 +165,7 @@ def retry_with_delays_and_condition(backoff_iterator, should_retry_error=None):
"""
call_retryable_function = functools.partial(function, *args, **kwargs)
- for delay in backoff_iterator:
+ for delay in next(backoff_iterator_generator):
try:
return call_retryable_function()
except Exception as e: