diff options
-rw-r--r-- | buildscripts/jepsen_report.py | 226 | ||||
-rw-r--r-- | buildscripts/tests/test_jepsen_report.py | 195 | ||||
-rw-r--r-- | buildscripts/tests/test_jepsen_report_corpus.log.txt | 67 | ||||
-rw-r--r-- | etc/evergreen_yml_components/definitions.yml | 14 | ||||
-rw-r--r-- | evergreen/jepsen_docker/list-append.sh | 16 | ||||
-rw-r--r-- | evergreen/jepsen_test_run.sh | 5 |
6 files changed, 520 insertions, 3 deletions
diff --git a/buildscripts/jepsen_report.py b/buildscripts/jepsen_report.py new file mode 100644 index 00000000000..00747f42f09 --- /dev/null +++ b/buildscripts/jepsen_report.py @@ -0,0 +1,226 @@ +"""Generate Evergreen reports from the Jepsen list-append workload.""" +import json +import re +import sys +import os +from datetime import datetime, timezone +from typing import List, Optional, Tuple + +from typing_extensions import TypedDict +import click + +from buildscripts.simple_report import Result, Report + + +class ParserOutput(TypedDict): + """Result of parsing jepsen log file. Each List[str] is a list of test names.""" + + success: List[str] + unknown: List[str] + crashed: List[str] + failed: List[str] + start: int + end: int + elapsed: int + + +_JEPSEN_TIME_FORMAT = "%Y-%m-%d %H:%M:%S" +_JEPSEN_MILLI_RE = re.compile("([0-9]+){(.*)}") +_JEPSEN_TIME_RE = re.compile("[0-9]{4}-[0-8]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2},[0-9]+{.*}") + + +def _time_parse(time: str): + split = time.split(",") + date = datetime.strptime(split[0], _JEPSEN_TIME_FORMAT) + match = _JEPSEN_MILLI_RE.match(split[1]) + microseconds = 0 + if match: + microseconds = int(match[1]) * 1000 + + return date.replace(microsecond=microseconds, tzinfo=timezone.utc) + + +def _calc_time_from_log(log: str) -> Tuple[int, int, int]: + if not log: + return (0, 0, 0) + start_time = None + end_time = None + for line in log.splitlines(): + if _JEPSEN_TIME_RE.match(line): + if start_time is None: + start_time = _time_parse(line) + else: + end_time = _time_parse(line) + + if start_time is None or end_time is None: + return (0, 0, 0) + + elapsed_time = int(end_time.timestamp() - start_time.timestamp()) + + return (int(start_time.timestamp()), int(end_time.timestamp()), elapsed_time) + + +SUCCESS_RE = re.compile("([0-9]+) successes") +CRASH_RE = re.compile("([0-9]+) crashed") +UNKNOWN_RE = re.compile("([0-9]+) unknown") +FAIL_RE = re.compile("([0-9]+) failures") + + +def parse(text: List[str]) -> ParserOutput: + """Given a List of strings representing jepsen log file split by newlines, return the ParserOutput struct.""" + + successful_tests: List[str] = [] + indeterminate_tests: List[str] = [] + crashed_tests: List[str] = [] + failed_tests: List[str] = [] + target = None + table_matches = 0 + for line in text: + if "# Successful tests" in line: + target = successful_tests + continue + elif "# Indeterminate tests" in line: + target = indeterminate_tests + continue + elif "# Crashed tests" in line: + target = crashed_tests + continue + elif "# Failed tests" in line: + target = failed_tests + continue + + # at this point we're parsing this table: + # 29 successes + # 0 unknown + # 1 crashed + # 0 failures + s_match = SUCCESS_RE.match(line) + if s_match: + target = None + assert int(s_match[1]) == len(successful_tests) + table_matches += 1 + + u_match = UNKNOWN_RE.match(line) + if u_match: + target = None + assert int(u_match[1]) == len(indeterminate_tests) + table_matches += 1 + c_match = CRASH_RE.match(line) + if c_match: + target = None + assert int(c_match[1]) == len(crashed_tests) + table_matches += 1 + f_match = FAIL_RE.match(line) + if f_match: + target = None + assert int(f_match[1]) == len(failed_tests) + table_matches += 1 + + if target is not None and line.strip(): + target.append(line) + + assert table_matches == 4, f"Failed to parse summary table. Expected 4, found {table_matches}" + return ParserOutput({ + 'success': successful_tests, + 'unknown': indeterminate_tests, + 'crashed': crashed_tests, + 'failed': failed_tests, + }) + + +def _try_find_log_file(store: Optional[str], test_name) -> str: + if store is None: + return "" + + try: + with open(os.path.join(store, test_name, "jepsen.log")) as fh: + return fh.read() + + except Exception: # pylint: disable=broad-except + return "" + + +def report(out: ParserOutput, start_time: int, end_time: int, elapsed: int, + store: Optional[str]) -> Report: + """Given ParserOutput, return report.json as a dict.""" + + results = [] + failures = 0 + for test_name in out['success']: + log_raw = _try_find_log_file(store, test_name) + start_time, end_time, elapsed_time = _calc_time_from_log(log_raw) + results.append( + Result(status='pass', exit_code=0, test_file=test_name, start=start_time, end=end_time, + elapsed=elapsed_time, log_raw=log_raw)) + + for test_name in out['failed']: + log_raw = _try_find_log_file(store, test_name) + start_time, end_time, elapsed_time = _calc_time_from_log(log_raw) + failures += 1 + results.append( + Result(status='fail', exit_code=1, test_file=test_name, start=start_time, end=end_time, + elapsed=elapsed_time, log_raw=log_raw)) + + for test_name in out['crashed']: + log_raw = "Log files are unavailable for crashed tests because Jepsen does not save them separately. You may be able to find the exception and stack trace in the task log" + failures += 1 + results.append( + Result(status='fail', exit_code=1, test_file=test_name, start=start_time, end=end_time, + elapsed=elapsed, log_raw=log_raw)) + + for test_name in out['unknown']: + log_raw = _try_find_log_file(store, test_name) + start_time, end_time, elapsed_time = _calc_time_from_log(log_raw) + failures += 1 + results.append( + Result(status='fail', exit_code=1, test_file=test_name, start=start_time, end=end_time, + elapsed=elapsed_time, log_raw=log_raw)) + return Report({ + "failures": failures, + "results": results, + }) + + +def _get_log_lines(filename: str) -> List[str]: + with open(filename) as fh: + return fh.read().splitlines() + + +def _put_report(report_: Report) -> None: + with open("report.json", "w") as fh: + json.dump(report_, fh) + + +@click.command() +@click.option("--start_time", type=int, required=True) +@click.option("--end_time", type=int, required=True) +@click.option("--elapsed", type=int, required=True) +@click.option("--emit_status_files", type=bool, is_flag=True, default=False, + help="If true, emit status files for marking Evergreen tasks as system fails") +@click.option("--store", type=str, default=None, + help="Path to folder containing jepsen 'store' directory") +@click.argument("filename", type=str) +def main(filename: str, start_time: str, end_time: str, elapsed: str, emit_status_files: bool, + store: Optional[str]): + """Generate Evergreen reports from the Jepsen list-append workload.""" + + out = parse(_get_log_lines(filename)) + _put_report(report(out, start_time, end_time, elapsed, store)) + + exit_code = 255 + if out['crashed']: + exit_code = 2 + if emit_status_files: + with open("jepsen_system_fail.txt", "w") as fh: + fh.write(str(exit_code)) + else: + if out['unknown'] or out['failed']: + exit_code = 1 + else: + exit_code = 0 + + sys.exit(exit_code) + + +if __name__ == "__main__": + main() # pylint: disable=no-value-for-parameter diff --git a/buildscripts/tests/test_jepsen_report.py b/buildscripts/tests/test_jepsen_report.py new file mode 100644 index 00000000000..cdfb095ba5a --- /dev/null +++ b/buildscripts/tests/test_jepsen_report.py @@ -0,0 +1,195 @@ +"""Tests for jepsen report generator.""" +import unittest +import textwrap +import random +import os +from unittest.mock import patch, mock_open, MagicMock +from click.testing import CliRunner + +from buildscripts.jepsen_report import parse, ParserOutput, main + +_CORPUS = textwrap.dedent("""\ + "indeterminate: Command failed with error 251 (NoSuchTransaction): 'Transaction was aborted :: caused by :: from shard rs_shard2 :: caused by :: Given transaction number 53 does not match any in-progress transactions. The active transaction number is -1' on server n9:27017. The full response is {\"writeConcernError\": {\"code\": 6, \"codeName\": \"HostUnreachable\", \"errmsg\": \"operation was interrupted\", \"errInfo\": {\"writeConcern\": {\"w\": \"majority\", \"wtimeout\": 0, \"provenance\": \"clientSupplied\"}}}, \"topologyVersion\": {\"processId\": {\"$oid\": \"625f0b0ef9d6a12d9b562ff9\"}, \"counter\": 21}, \"ok\": 0.0, \"errmsg\": \"Transaction was aborted :: caused by :: from shard rs_shard2 :: caused by :: Given transaction number 53 does not match any in-progress transactions. The active transaction number is -1\", \"code\": 251, \"codeName\": \"NoSuchTransaction\", \"$clusterTime\": {\"clusterTime\": {\"$timestamp\": {\"t\": 1650395950, \"i\": 16}}, \"signature\": {\"hash\": {\"$binary\": {\"base64\": \"AAAAAAAAAAAAAAAAAAAAAAAAAAA=\", \"subType\": \"00\"}}, \"keyId\": 0}}, \"operationTime\": {\"$timestamp\": {\"t\": 1650395950, \"i\": 5}}, \"recoveryToken\": {\"recoveryShardId\": \"rs_shard1\"}}", + :index 1141}})}, + :workload {:valid? true}, + :valid? true} + + +Everything looks good! ヽ(‘ー`)ノ + + + +# Successful tests + +store/mongodb list-append w:majority r:majority tw:majority tr:snapshot partition/20220419T163539.000Z +store/mongodb list-append w:majority r:majority tw:majority tr:snapshot partition/20220419T164131.000Z +store/mongodb list-append w:majority r:majority tw:majority tr:snapshot partition/20220419T164724.000Z +store/mongodb list-append w:majority r:majority tw:majority tr:snapshot partition/20220419T165317.000Z +store/mongodb list-append w:majority r:majority tw:majority tr:snapshot partition/20220419T165910.000Z +store/mongodb list-append w:majority r:majority tw:majority tr:snapshot partition/20220419T170503.000Z +store/mongodb list-append w:majority r:majority tw:majority tr:snapshot partition/20220419T171055.000Z +store/mongodb list-append w:majority r:majority tw:majority tr:snapshot partition/20220419T171648.000Z +store/mongodb list-append w:majority r:majority tw:majority tr:snapshot partition/20220419T172239.000Z +store/mongodb list-append w:majority r:majority tw:majority tr:snapshot partition/20220419T172832.000Z +store/mongodb list-append w:majority r:majority tw:majority tr:snapshot partition/20220419T173634.000Z +store/mongodb list-append w:majority r:majority tw:majority tr:snapshot partition/20220419T174227.000Z +store/mongodb list-append w:majority r:majority tw:majority tr:snapshot partition/20220419T174820.000Z +store/mongodb list-append w:majority r:majority tw:majority tr:snapshot partition/20220419T175413.000Z +store/mongodb list-append w:majority r:majority tw:majority tr:snapshot partition/20220419T180006.000Z +store/mongodb list-append w:majority r:majority tw:majority tr:snapshot partition/20220419T180558.000Z +store/mongodb list-append w:majority r:majority tw:majority tr:snapshot partition/20220419T181148.000Z +store/mongodb list-append w:majority r:majority tw:majority tr:snapshot partition/20220419T181946.000Z +store/mongodb list-append w:majority r:majority tw:majority tr:snapshot partition/20220419T182539.000Z +store/mongodb list-append w:majority r:majority tw:majority tr:snapshot partition/20220419T183131.000Z +store/mongodb list-append w:majority r:majority tw:majority tr:snapshot partition/20220419T183720.000Z +store/mongodb list-append w:majority r:majority tw:majority tr:snapshot partition/20220419T184313.000Z +store/mongodb list-append w:majority r:majority tw:majority tr:snapshot partition/20220419T184906.000Z +store/mongodb list-append w:majority r:majority tw:majority tr:snapshot partition/20220419T185459.000Z +store/mongodb list-append w:majority r:majority tw:majority tr:snapshot partition/20220419T190052.000Z +store/mongodb list-append w:majority r:majority tw:majority tr:snapshot partition/20220419T190645.000Z +store/mongodb list-append w:majority r:majority tw:majority tr:snapshot partition/20220419T191239.000Z +store/mongodb list-append w:majority r:majority tw:majority tr:snapshot partition/20220419T191833.000Z + +# Crashed tests + +mongodb list-append w:majority r:majority tw:majority tr:snapshot partition +mongodb list-append w:majority r:majority tw:majority tr:snapshot partition + +28 successes +0 unknown +2 crashed +0 failures +""").splitlines() + + +class TestParser(unittest.TestCase): + """TestParser.""" + + @classmethod + def _corpus_generator(cls): + n_pass = random.randint(1, 30) + n_fail = random.randint(1, 30) + n_unknown = random.randint(1, 30) + n_crash = random.randint(1, 30) + + corpus = textwrap.dedent("""\ + :index 2893}})}, + :workload {:valid? true}, + :valid? true} +Everything looks good! ヽ(‘ー`)ノ +""") + successful_tests = [] + if n_pass > 0: + corpus += "\n# Successful tests\n\n" + for i in range(0, n_pass): + corpus += f"test {i}\n" + successful_tests.append(f"test {i}") + + failed_tests = [] + if n_fail > 0: + corpus += "\n# Failed tests\n\n" + for i in range(0, n_fail): + corpus += f"test {i}\n" + failed_tests.append(f"test {i}") + + indeterminate_tests = [] + if n_unknown > 0: + corpus += "\n# Indeterminate tests\n\n" + for i in range(0, n_unknown): + corpus += f"test {i}\n" + indeterminate_tests.append(f"test {i}") + + crashed_tests = [] + if n_crash > 0: + corpus += "\n# Crashed tests\n\n" + for i in range(0, n_crash): + corpus += f"test {i}\n" + crashed_tests.append(f"test {i}") + # note leading newline for this block is required to match the actual + # logs + corpus += textwrap.dedent(f""" +{n_pass} successes +{n_unknown} unknown +{n_crash} crashed +{n_fail} failures +""") + + return { + 'expected': + ParserOutput({ + 'success': successful_tests, + 'unknown': indeterminate_tests, + 'crashed': crashed_tests, + 'failed': failed_tests, + }), 'corpus': + corpus + } + + def test_parser(self): + """Test with embedded corpus.""" + out = parse(_CORPUS) + self.assertEqual(len(out['success']), 28) + self.assertEqual(len(out['unknown']), 0) + self.assertEqual(len(out['crashed']), 2) + self.assertEqual(len(out['failed']), 0) + + def test_parser2(self): + """Test with jepsen.log file.""" + with open(os.path.join(os.path.dirname(__file__), + "test_jepsen_report_corpus.log.txt")) as fh: + corpus = fh.read().splitlines() + out = parse(corpus) + self.assertEqual(len(out['success']), 29) + self.assertEqual(len(out['unknown']), 0) + self.assertEqual(len(out['crashed']), 1) + self.assertEqual(len(out['failed']), 0) + + def test_generated_corpus(self): + """Generate 100 corpuses and test them.""" + for _ in range(0, 100): + self._test_generated_corpus() + + def _test_generated_corpus(self): + gen = self._corpus_generator() + corpus = gen['corpus'].splitlines() + out = parse(corpus) + self.assertDictEqual(out, gen['expected']) + + @patch('buildscripts.jepsen_report._try_find_log_file') + @patch('buildscripts.jepsen_report._get_log_lines') + @patch('buildscripts.jepsen_report._put_report') + def test_main(self, mock_put_report, mock_get_log_lines, mock_try_find_log_file): + """Test main function.""" + gen = self._corpus_generator() + corpus = gen['corpus'].splitlines() + mock_get_log_lines.return_value = corpus + + def _try_find_log_file(_store, _test): + if _try_find_log_file.counter == 0: + _try_find_log_file.counter += 1 + with open( + os.path.join( + os.path.dirname(__file__), "test_jepsen_report_corpus.log.txt")) as fh: + return fh.read() + return "" + + _try_find_log_file.counter = 0 + mock_try_find_log_file.side_effect = _try_find_log_file + + runner = CliRunner() + result = runner.invoke(main, + ["--start_time=0", "--end_time=10", "--elapsed=10", "test.log"]) + num_tests = len(gen['expected']['success']) + len(gen['expected']['unknown']) + len( + gen['expected']['crashed']) + len(gen['expected']['failed']) + num_fails = num_tests - len(gen['expected']['success']) + + callee_dict = mock_put_report.call_args[0][0] + self.assertEqual(callee_dict['failures'], num_fails) + self.assertEqual(len(callee_dict['results']), num_tests) + mock_get_log_lines.assert_called_once_with('test.log') + if gen['expected']['crashed']: + self.assertEqual(result.exit_code, 2) + elif gen['expected']['unknown'] or gen['expected']['failure']: + self.assertEqual(result.exit_code, 1) + else: + self.assertEqual(result.exit_code, 0) diff --git a/buildscripts/tests/test_jepsen_report_corpus.log.txt b/buildscripts/tests/test_jepsen_report_corpus.log.txt new file mode 100644 index 00000000000..89d4e91892b --- /dev/null +++ b/buildscripts/tests/test_jepsen_report_corpus.log.txt @@ -0,0 +1,67 @@ +INFO [2022-04-20 02:46:34,505] jepsen test runner - jepsen.core {:perf + {:latency-graph {:valid? true}, + :rate-graph {:valid? true}, + :valid? true}, + :clock {:valid? true}, + :stats + {:valid? true, + :count 5768, + :ok-count 4930, + :fail-count 0, + :info-count 838, + :by-f + {:txn + {:valid? true, + :count 5768, + :ok-count 4930, + :fail-count 0, + :info-count 838}}}, + :exceptions {:valid? true}, + :workload {:valid? true}, + :valid? true} + + +Everything looks good! ヽ(‘ー`)ノ + + + +# Successful tests + +store/mongodb list-append w:majority r:majority tw:majority tr:snapshot partition/20220419T235350.000Z +store/mongodb list-append w:majority r:majority tw:majority tr:snapshot partition/20220419T235940.000Z +store/mongodb list-append w:majority r:majority tw:majority tr:snapshot partition/20220420T000533.000Z +store/mongodb list-append w:majority r:majority tw:majority tr:snapshot partition/20220420T001131.000Z +store/mongodb list-append w:majority r:majority tw:majority tr:snapshot partition/20220420T001724.000Z +store/mongodb list-append w:majority r:majority tw:majority tr:snapshot partition/20220420T002316.000Z +store/mongodb list-append w:majority r:majority tw:majority tr:snapshot partition/20220420T002909.000Z +store/mongodb list-append w:majority r:majority tw:majority tr:snapshot partition/20220420T003502.000Z +store/mongodb list-append w:majority r:majority tw:majority tr:snapshot partition/20220420T004057.000Z +store/mongodb list-append w:majority r:majority tw:majority tr:snapshot partition/20220420T004651.000Z +store/mongodb list-append w:majority r:majority tw:majority tr:snapshot partition/20220420T005244.000Z +store/mongodb list-append w:majority r:majority tw:majority tr:snapshot partition/20220420T005837.000Z +store/mongodb list-append w:majority r:majority tw:majority tr:snapshot partition/20220420T010429.000Z +store/mongodb list-append w:majority r:majority tw:majority tr:snapshot partition/20220420T011021.000Z +store/mongodb list-append w:majority r:majority tw:majority tr:snapshot partition/20220420T011617.000Z +store/mongodb list-append w:majority r:majority tw:majority tr:snapshot partition/20220420T012210.000Z +store/mongodb list-append w:majority r:majority tw:majority tr:snapshot partition/20220420T012802.000Z +store/mongodb list-append w:majority r:majority tw:majority tr:snapshot partition/20220420T013356.000Z +store/mongodb list-append w:majority r:majority tw:majority tr:snapshot partition/20220420T014154.000Z +store/mongodb list-append w:majority r:majority tw:majority tr:snapshot partition/20220420T014746.000Z +store/mongodb list-append w:majority r:majority tw:majority tr:snapshot partition/20220420T015338.000Z +store/mongodb list-append w:majority r:majority tw:majority tr:snapshot partition/20220420T015931.000Z +store/mongodb list-append w:majority r:majority tw:majority tr:snapshot partition/20220420T020525.000Z +store/mongodb list-append w:majority r:majority tw:majority tr:snapshot partition/20220420T021116.000Z +store/mongodb list-append w:majority r:majority tw:majority tr:snapshot partition/20220420T021708.000Z +store/mongodb list-append w:majority r:majority tw:majority tr:snapshot partition/20220420T022301.000Z +store/mongodb list-append w:majority r:majority tw:majority tr:snapshot partition/20220420T022854.000Z +store/mongodb list-append w:majority r:majority tw:majority tr:snapshot partition/20220420T023446.000Z +store/mongodb list-append w:majority r:majority tw:majority tr:snapshot partition/20220420T024039.000Z + +# Crashed tests + +mongodb list-append w:majority r:majority tw:majority tr:snapshot partition + +29 successes +0 unknown +1 crashed +0 failures diff --git a/etc/evergreen_yml_components/definitions.yml b/etc/evergreen_yml_components/definitions.yml index 1f71a030001..5552f136010 100644 --- a/etc/evergreen_yml_components/definitions.yml +++ b/etc/evergreen_yml_components/definitions.yml @@ -1699,11 +1699,25 @@ functions: display_name: Jepsen Test Results - ${execution} optional: true + "upload jepsen_log": &upload_jepsen_log + command: s3.put + params: + optional: true + aws_key: ${aws_key} + aws_secret: ${aws_secret} + local_file: src/jepsen-mongodb/jepsen_${task_name}_${execution}.log + remote_file: ${project}/${build_variant}/${revision}/jepsen_${task_name}_${execution}.log + bucket: mciuploads + permissions: public-read + content_type: text/plain + display_name: Jepsen Test Log + "save jepsen artifacts": - *tar_jepsen_logs - *archive_jepsen_logs - *tar_jepsen_results - *archive_jepsen_results + - *upload_jepsen_log ### Process & archive mongo coredumps ### "gather mongo coredumps": &gather_mongo_coredumps diff --git a/evergreen/jepsen_docker/list-append.sh b/evergreen/jepsen_docker/list-append.sh index 3eb8a8b67fa..4378b1bcadc 100644 --- a/evergreen/jepsen_docker/list-append.sh +++ b/evergreen/jepsen_docker/list-append.sh @@ -18,8 +18,18 @@ elapsed_secs=$((end_time - start_time)) cd ../../ mkdir -p src/jepsen-mongodb sudo docker cp jepsen-control:/jepsen/mongodb/store src/jepsen-mongodb/store -cp jepsen/docker/jepsen_${task_name}_${execution}.log src/jepsen-mongodb +cp jepsen/docker/jepsen_${task_name}_${execution}.log src/jepsen-mongodb/ sudo docker cp jepsen-control:/jepsen/mongodb src/jepsen-workdir -cd src/jepsen-mongodb -. ../evergreen/jepsen_report.sh +cd src +activate_venv +$python buildscripts/jepsen_report.py --start_time=$start_time --end_time=$end_time --elapsed=$elapsed_secs --emit_status_files --store ./jepsen-mongodb jepsen-mongodb/jepsen_${task_name}_${execution}.log +exit_code=$? +cat report.json + +if [ -f "jepsen_system_fail.txt" ]; then + mv jepsen_system_fail.txt jepsen-mongodb/jepsen_system_failure_${task_name}_${execution} + exit 0 +fi + +exit $exit_code diff --git a/evergreen/jepsen_test_run.sh b/evergreen/jepsen_test_run.sh index 2cf8b0ed387..b5f79426079 100644 --- a/evergreen/jepsen_test_run.sh +++ b/evergreen/jepsen_test_run.sh @@ -16,6 +16,11 @@ mkdir -p $TMPDIR export _JAVA_OPTIONS=-Djava.io.tmpdir=$TMPDIR start_time=$(date +%s) +# XXX Do not run this test in a loop or with Jepsen's repeat flag +# the report generator (jepsen_report.sh) does not support parsing more than one +# test at a time. If you need to run these tests with a repeat flag, you +# will need to adopt buildscripts/jepsen_report.py to support the log output +# of older-style Jepsen tests lein run test --test ${jepsen_test_name} \ --mongodb-dir ../ \ --working-dir ${workdir}/src/jepsen-workdir \ |