diff options
author | Max Hirschhorn <max.hirschhorn@mongodb.com> | 2018-06-12 19:26:55 -0500 |
---|---|---|
committer | Max Hirschhorn <max.hirschhorn@mongodb.com> | 2018-06-12 19:26:55 -0500 |
commit | 1d12fbc7362925399e8ad4304307cdffd1d60362 (patch) | |
tree | 6cf3d2d750d5f709561fb4b68aa972e8fb6c678f | |
parent | 2c33be816fac4e37ed04b13f0cf5ef55e2eda4e7 (diff) | |
download | mongo-1d12fbc7362925399e8ad4304307cdffd1d60362.tar.gz |
SERVER-35472 Avoid falling back to stderr in resmoke.py.
Instead a special return code of 75 is used to indicate that the log
output would otherwise be incomplete. The incomplete log output state is
ignored if all tests passed.
(cherry picked from commit 6ad26a359f7bc366da13d433ed612ff22f69d323)
(cherry picked from commit 8fae595c44b14aa61951ba413e293449c2b49866)
-rwxr-xr-x | buildscripts/resmoke.py | 42 | ||||
-rw-r--r-- | buildscripts/resmokelib/errors.py | 7 | ||||
-rw-r--r-- | buildscripts/resmokelib/logging/buildlogger.py | 45 | ||||
-rw-r--r-- | buildscripts/resmokelib/logging/loggers.py | 59 | ||||
-rw-r--r-- | etc/evergreen.yml | 18 |
5 files changed, 115 insertions, 56 deletions
diff --git a/buildscripts/resmoke.py b/buildscripts/resmoke.py index 82b6bcbec00..1b9865dba1b 100755 --- a/buildscripts/resmoke.py +++ b/buildscripts/resmoke.py @@ -35,6 +35,7 @@ class Resmoke(object): self._resmoke_logger = None self._archive = None self._interrupted = False + self._exit_code = 0 def configure_from_command_line(self): """Configure this instance using the command line arguments.""" @@ -46,6 +47,28 @@ class Resmoke(object): self._exec_logger = logging.loggers.EXECUTOR_LOGGER self._resmoke_logger = self._exec_logger.new_resmoke_logger() + def _exit_logging(self): + if not self._interrupted: + logging.flush.stop_thread() + self._exit_on_incomplete_logging() + + def _exit_on_incomplete_logging(self): + if logging.buildlogger.is_log_output_incomplete(): + if self._exit_code == 0: + # We don't anticipate users to look at passing Evergreen tasks very often that even + # if the log output is incomplete, we'd still rather not show anything in the + # Evergreen UI or cause a JIRA ticket to be created. + self._resmoke_logger.info( + "We failed to flush all log output to logkeeper but all tests passed, so" + " ignoring.") + return + + exit_code = errors.LoggerRuntimeConfigError.EXIT_CODE + self._resmoke_logger.info( + "Exiting with code %d rather than requested code %d because we failed to flush all" + " log output to logkeeper.", exit_code, self._exit_code) + self.exit(exit_code) + def run(self): """Run resmoke.""" if self._config is None: @@ -62,8 +85,7 @@ class Resmoke(object): else: self.run_tests() finally: - if not self._interrupted: - logging.flush.stop_thread() + self._exit_logging() def list_suites(self): """List the suites that are available to execute.""" @@ -126,8 +148,7 @@ class Resmoke(object): exit_code = max(suite.return_code for suite in suites) self.exit(exit_code) finally: - if not self._interrupted: - self._exit_archival() + self._exit_archival() if suites: reportfile.write(suites) @@ -160,12 +181,14 @@ class Resmoke(object): suite.return_code = 0 return False executor_config = suite.get_executor_config() - executor = testing.executor.TestSuiteExecutor( - self._exec_logger, suite, archive_instance=self._archive, **executor_config) try: + executor = testing.executor.TestSuiteExecutor( + self._exec_logger, suite, archive_instance=self._archive, **executor_config) executor.run() - except errors.UserInterrupt: - suite.return_code = 130 # Simulate SIGINT as exit code. + except (errors.UserInterrupt, errors.LoggerRuntimeConfigError) as err: + self._exec_logger.error("Encountered an error when running %ss of suite %s: %s", + suite.test_kind, suite.get_display_name(), err) + suite.return_code = err.EXIT_CODE return True except IOError: suite.return_code = 74 # Exit code for IOError on POSIX systems. @@ -225,11 +248,12 @@ class Resmoke(object): def _exit_archival(self): """Finish up archival tasks before exit if enabled in the cli options.""" - if self._archive: + if self._archive and not self._interrupted: self._archive.exit() def exit(self, exit_code): """Exit with the provided exit code.""" + self._exit_code = exit_code self._resmoke_logger.info("Exiting with code: %d", exit_code) sys.exit(exit_code) diff --git a/buildscripts/resmokelib/errors.py b/buildscripts/resmokelib/errors.py index 8f49a567a79..a0bddfdbc23 100644 --- a/buildscripts/resmokelib/errors.py +++ b/buildscripts/resmokelib/errors.py @@ -18,7 +18,12 @@ class StopExecution(ResmokeError): # noqa: D204 class UserInterrupt(StopExecution): # noqa: D204 """Exception raised when a user signals resmoke.py to unconditionally stop executing tests.""" - pass + EXIT_CODE = 130 # Simulate SIGINT as exit code. + + +class LoggerRuntimeConfigError(StopExecution): # noqa: D204 + """Exception raised when a logging handler couldn't be configured at runtime.""" + EXIT_CODE = 75 class TestFailure(ResmokeError): # noqa: D204 diff --git a/buildscripts/resmokelib/logging/buildlogger.py b/buildscripts/resmokelib/logging/buildlogger.py index 5fa52c85c69..2e48101d517 100644 --- a/buildscripts/resmokelib/logging/buildlogger.py +++ b/buildscripts/resmokelib/logging/buildlogger.py @@ -4,6 +4,7 @@ from __future__ import absolute_import import functools import json +import threading import requests @@ -23,6 +24,20 @@ _SEND_AFTER_SECS = 10 # Initialized by resmokelib.logging.loggers.configure_loggers() BUILDLOGGER_FALLBACK = None +_INCOMPLETE_LOG_OUTPUT = threading.Event() + + +def is_log_output_incomplete(): # noqa: D205,D400 + """Return true if we failed to write all of the log output to the buildlogger server, and return + false otherwise. + """ + return _INCOMPLETE_LOG_OUTPUT.is_set() + + +def set_log_output_incomplete(): + """Indicate that we failed to write all of the log output to the buildlogger server.""" + _INCOMPLETE_LOG_OUTPUT.set() + def _log_on_error(func): """Provide decorator that causes exceptions to be logged by the "buildlogger" Logger instance. @@ -35,6 +50,10 @@ def _log_on_error(func): """Provide wrapper function.""" try: return func(*args, **kwargs) + except requests.HTTPError as err: + BUILDLOGGER_FALLBACK.error("Encountered an HTTP error: %s", err) + except requests.RequestException as err: + BUILDLOGGER_FALLBACK.error("Encountered a network error: %s", err) except: # pylint: disable=bare-except BUILDLOGGER_FALLBACK.exception("Encountered an error.") return None @@ -155,11 +174,12 @@ class _BaseBuildloggerHandler(handlers.BufferedHandler): new_max_size) self.max_size = new_max_size return self._append_logs(log_lines_chunk) - BUILDLOGGER_FALLBACK.exception("Encountered an error.") - return 0 + BUILDLOGGER_FALLBACK.error("Encountered an HTTP error: %s", err) + except requests.RequestException as err: + BUILDLOGGER_FALLBACK.error("Encountered a network error: %s", err) except: # pylint: disable=bare-except BUILDLOGGER_FALLBACK.exception("Encountered an error.") - return 0 + return 0 def _flush_buffer_with_lock(self, buf, close_called): """Ensure all logging output has been flushed to the buildlogger server. @@ -175,14 +195,17 @@ class _BaseBuildloggerHandler(handlers.BufferedHandler): if nb_sent: self.retry_buffer = self.retry_buffer[nb_sent:] if close_called and self.retry_buffer: - # Request to the buildlogger server returned an error, so use the fallback logger to - # avoid losing the log messages entirely. - for (_, message) in self.retry_buffer: - # TODO: construct an LogRecord instance equivalent to the one passed to the - # process_record() method if we ever decide to log the time when the - # LogRecord was created, e.g. using %(asctime)s in - # _fallback_buildlogger_handler(). - BUILDLOGGER_FALLBACK.info(message) + # The request to the logkeeper returned an error. We discard the log output rather than + # writing the messages to the fallback logkeeper to avoid putting additional pressure on + # the Evergreen database. + BUILDLOGGER_FALLBACK.warning( + "Failed to flush all log output (%d messages) to logkeeper.", len( + self.retry_buffer)) + + # We set a flag to indicate that we failed to flush all log output to logkeeper so + # resmoke.py can exit with a special return code. + set_log_output_incomplete() + self.retry_buffer = [] diff --git a/buildscripts/resmokelib/logging/loggers.py b/buildscripts/resmokelib/logging/loggers.py index d319928eb5f..deb3f6c38a2 100644 --- a/buildscripts/resmokelib/logging/loggers.py +++ b/buildscripts/resmokelib/logging/loggers.py @@ -7,6 +7,7 @@ import sys from . import buildlogger from . import formatters +from .. import errors _DEFAULT_FORMAT = "[%(name)s] %(message)s" @@ -174,15 +175,18 @@ class JobLogger(BaseLogger): BaseLogger.__init__(self, name, parent=parent) self.job_num = job_num self.fixture_root_logger = fixture_root_logger - # create build_id if it should if self.build_logger_server: + # If we're configured to log messages to the buildlogger server, then request a new + # build_id for this job. self.build_id = self.build_logger_server.new_build_id("job%d" % job_num) - if self.build_id: - url = self.build_logger_server.get_build_log_url(self.build_id) - parent.info("Writing output of job #%d to %s.", job_num, url) - else: - parent.info("Encountered an error configuring buildlogger for job #%d, falling" - " back to stderr.", job_num) + if not self.build_id: + buildlogger.set_log_output_incomplete() + raise errors.LoggerRuntimeConfigError( + "Encountered an error configuring buildlogger for job #{:d}: Failed to get a" + " new build_id".format(job_num)) + + url = self.build_logger_server.get_build_log_url(self.build_id) + parent.info("Writing output of job #%d to %s.", job_num, url) else: self.build_id = None @@ -193,14 +197,19 @@ class JobLogger(BaseLogger): def new_test_logger(self, test_shortname, test_basename, command, parent): """Create a new test logger that will be a child of the given parent.""" if self.build_id: + # If we're configured to log messages to the buildlogger server, then request a new + # test_id for this test. test_id = self.build_logger_server.new_test_id(self.build_id, test_basename, command) - if test_id: - url = self.build_logger_server.get_test_log_url(self.build_id, test_id) - self.info("Writing output of %s to %s.", test_shortname, url) - return TestLogger(test_shortname, parent, self.build_id, test_id, url) - else: - self.info("Encountered an error configuring buildlogger for test %s, falling" - " back to stderr.", test_shortname) + if not test_id: + buildlogger.set_log_output_incomplete() + raise errors.LoggerRuntimeConfigError( + "Encountered an error configuring buildlogger for test {}: Failed to get a new" + " test_id".format(test_basename)) + + url = self.build_logger_server.get_test_log_url(self.build_id, test_id) + self.info("Writing output of %s to %s.", test_basename, url) + return TestLogger(test_shortname, parent, self.build_id, test_id, url) + return TestLogger(test_shortname, parent) @@ -226,14 +235,9 @@ class TestLogger(BaseLogger): logger_info = self.logging_config[TESTS_LOGGER_NAME] handler_info = _get_buildlogger_handler_info(logger_info) if handler_info is not None: - if build_id and test_id: - handler = self.build_logger_server.get_test_handler(build_id, test_id, handler_info) - handler.setFormatter(self.get_formatter(logger_info)) - self.addHandler(handler) - else: - # The build_id or test_id could not be obtained from the build logger server, so - # we're falling back to stderr. - self.addHandler(_fallback_buildlogger_handler()) + handler = self.build_logger_server.get_test_handler(build_id, test_id, handler_info) + handler.setFormatter(self.get_formatter(logger_info)) + self.addHandler(handler) def new_test_thread_logger(self, test_kind, thread_id): """Create a new child test thread logger.""" @@ -272,14 +276,9 @@ class FixtureLogger(BaseLogger): logger_info = self.logging_config[FIXTURE_LOGGER_NAME] handler_info = _get_buildlogger_handler_info(logger_info) if handler_info is not None: - if build_id: - handler = self.build_logger_server.get_global_handler(build_id, handler_info) - handler.setFormatter(self.get_formatter(logger_info)) - self.addHandler(handler) - else: - # The build_id could not be obtained from the build logger server, so we're - # falling back to stderr. - self.addHandler(_fallback_buildlogger_handler()) + handler = self.build_logger_server.get_global_handler(build_id, handler_info) + handler.setFormatter(self.get_formatter(logger_info)) + self.addHandler(handler) def new_fixture_node_logger(self, node_name): """Create a new child FixtureNodeLogger.""" diff --git a/etc/evergreen.yml b/etc/evergreen.yml index def28fbc40f..2d1d74598a3 100644 --- a/etc/evergreen.yml +++ b/etc/evergreen.yml @@ -1381,21 +1381,29 @@ functions: resmoke_exit_code=$? set -o errexit - # 74 is exit code for IOError on POSIX systems. - if [ $resmoke_exit_code = 74 ]; then - echo $resmoke_exit_code > run_tests_system_failure + # 74 is exit code for IOError on POSIX systems, which is raised when the machine is + # shutting down. + # + # 75 is exit code resmoke.py uses when the log output would be incomplete due to failing + # to communicate with logkeeper. + if [[ $resmoke_exit_code = 74 || $resmoke_exit_code = 75 ]]; then + echo $resmoke_exit_code > run_tests_infrastructure_failure exit 0 fi exit $resmoke_exit_code fi # end if [[ ${disable_unit_tests} && ! -f ${skip_tests|/dev/null} ]] + # The existence of the "run_tests_infrastructure_failure" file indicates this failure isn't + # directly actionable. We use type=setup rather than type=system or type=test for this command + # because we don't intend for any human to look at this failure. - command: shell.exec + type: setup params: working_dir: src script: | set -o verbose - if [ -f run_tests_system_failure ]; then - exit $(cat run_tests_system_failure) + if [ -f run_tests_infrastructure_failure ]; then + exit $(cat run_tests_infrastructure_failure) fi "scons compile" : |