From 796a9f1504ea088bca412b8af787b735943245cf Mon Sep 17 00:00:00 2001 From: Robert Guo Date: Tue, 15 Jun 2021 17:51:01 -0400 Subject: SERVER-57439 add unittest task with undodb recordings to ! RHEL 8 --- buildscripts/resmokelib/core/process.py | 25 +++++++++--- .../resmokelib/hang_analyzer/hang_analyzer.py | 10 ++++- .../resmokelib/testing/testcases/cpp_unittest.py | 24 +++++++++++ .../resmokelib/testing/testcases/interface.py | 16 ++++++-- .../resmokelib/testing/testcases/jstest.py | 15 +++++-- etc/evergreen.yml | 46 +++++++++++++++++++++- evergreen/failed_unittests_gather.sh | 16 +++++++- evergreen/resmoke_tests_execute.sh | 8 ++-- evergreen/undo_wiki_page.sh | 13 ++++++ src/mongo/shell/servers.js | 2 +- 10 files changed, 153 insertions(+), 22 deletions(-) create mode 100644 evergreen/undo_wiki_page.sh diff --git a/buildscripts/resmokelib/core/process.py b/buildscripts/resmokelib/core/process.py index b23ee5782c1..63e18575cba 100644 --- a/buildscripts/resmokelib/core/process.py +++ b/buildscripts/resmokelib/core/process.py @@ -117,13 +117,26 @@ class Process(object): close_fds = (sys.platform != "win32") with _POPEN_LOCK: - self._process = subprocess.Popen( - self.args, bufsize=buffer_size, stdout=subprocess.PIPE, stderr=subprocess.PIPE, - close_fds=close_fds, env=self.env, creationflags=creation_flags, cwd=self._cwd) + + # Record unittests directly since resmoke does not interact with them and they can finish + # too quickly for the recorder to have a chance at attaching. + recorder_args = [] + if _config.UNDO_RECORDER_PATH is not None and self.args[0].endswith("_test"): + now_str = datetime.now().strftime("%Y-%m-%dT%H-%M-%S") + # Only use the process name since we have to be able to correlate the recording name + # with the binary name easily. 
+ recorder_output_file = "{process}-{t}.undo".format( + process=os.path.basename(self.args[0]), t=now_str) + recorder_args = [_config.UNDO_RECORDER_PATH, "-o", recorder_output_file] + + self._process = subprocess.Popen(recorder_args + self.args, bufsize=buffer_size, + stdout=subprocess.PIPE, stderr=subprocess.PIPE, + close_fds=close_fds, env=self.env, + creationflags=creation_flags, cwd=self._cwd) self.pid = self._process.pid - if _config.UNDO_RECORDER_PATH is not None and ("mongod" in self.args[0] - or "mongos" in self.args[0]): + if _config.UNDO_RECORDER_PATH is not None and (not self.args[0].endswith("_test")) and ( + "mongod" in self.args[0] or "mongos" in self.args[0]): now_str = datetime.now().strftime("%Y-%m-%dT%H-%M-%S") recorder_output_file = "{logger}-{process}-{pid}-{t}.undo".format( logger=self.logger.name.replace('/', '-'), @@ -236,7 +249,7 @@ class Process(object): if recorder_return != 0: raise errors.ServerFailure( "UndoDB live-record did not terminate correctly. This is likely a bug with UndoDB. " - "Please record the logs and notify the #server-tig Slack channel") + "Please record the logs and notify the #server-testing Slack channel") if self._stdout_pipe: self._stdout_pipe.wait_until_finished() diff --git a/buildscripts/resmokelib/hang_analyzer/hang_analyzer.py b/buildscripts/resmokelib/hang_analyzer/hang_analyzer.py index 7e53c11679e..9034394e9bf 100755 --- a/buildscripts/resmokelib/hang_analyzer/hang_analyzer.py +++ b/buildscripts/resmokelib/hang_analyzer/hang_analyzer.py @@ -71,15 +71,21 @@ class HangAnalyzer(Subcommand): processes = process_list.get_processes(self.process_ids, self.interesting_processes, self.options.process_match, self.root_logger) + def is_python_process(pname: str): + # "live-record*" and "python*" are Python processes. Sending SIGUSR1 causes resmoke.py + # to dump its stack and run the hang analyzer on its child processes. + # Sending SIGUSR1 causes live-record to save its recording and terminate. 
+ return pname.startswith("python") or pname.startswith("live-record") + # Suspending all processes, except python, to prevent them from getting unstuck when # the hang analyzer attaches to them. - for pinfo in [pinfo for pinfo in processes if not pinfo.name.startswith("python")]: + for pinfo in [pinfo for pinfo in processes if not is_python_process(pinfo.name)]: for pid in pinfo.pidv: process.pause_process(self.root_logger, pinfo.name, pid) # Dump python processes by signalling them. The resmoke.py process will generate # the report.json, when signalled, so we do this before attaching to other processes. - for pinfo in [pinfo for pinfo in processes if pinfo.name.startswith("python")]: + for pinfo in [pinfo for pinfo in processes if is_python_process(pinfo.name)]: for pid in pinfo.pidv: process.signal_python(self.root_logger, pinfo.name, pid) diff --git a/buildscripts/resmokelib/testing/testcases/cpp_unittest.py b/buildscripts/resmokelib/testing/testcases/cpp_unittest.py index 08b4a7e0cbd..9162bce16c9 100644 --- a/buildscripts/resmokelib/testing/testcases/cpp_unittest.py +++ b/buildscripts/resmokelib/testing/testcases/cpp_unittest.py @@ -1,5 +1,7 @@ """The unittest.TestCase for C++ unit tests.""" +import os +from buildscripts.resmokelib import config from buildscripts.resmokelib import core from buildscripts.resmokelib import utils from buildscripts.resmokelib.testing.testcases import interface @@ -18,6 +20,28 @@ class CPPUnitTestCase(interface.ProcessTestCase): self.program_executable = program_executable self.program_options = utils.default_if_none(program_options, {}).copy() + def run_test(self): + """Run the test.""" + try: + super().run_test() + except self.failureException: + if config.UNDO_RECORDER_PATH: + # Record the list of failed tests so we can upload them to the Evergreen task. + # Non-recorded tests rely on the core dump content to identify the test binaries. 
+ with open("failed_recorded_tests.txt", 'a') as failure_list: + failure_list.write(self.program_executable) + failure_list.write("\n") + self.logger.exception( + "*** Failed test run was recorded. ***\n" + "For instructions on using the recording instead of core dumps, see\n" + "https://wiki.corp.mongodb.com/display/COREENG/Time+Travel+Debugging+in+MongoDB\n" + "For questions or bug reports, please reach our in #server-testing") + + # Archive any available recordings if there's any failure. It's possible a problem + # with the recorder will cause no recordings to be generated. + self._cull_recordings(os.path.basename(self.program_executable)) + raise + def _make_process(self): self.program_options["job_num"] = self.fixture.job_num self.program_options["test_id"] = self._id diff --git a/buildscripts/resmokelib/testing/testcases/interface.py b/buildscripts/resmokelib/testing/testcases/interface.py index 474ddb2ae01..67733e74737 100644 --- a/buildscripts/resmokelib/testing/testcases/interface.py +++ b/buildscripts/resmokelib/testing/testcases/interface.py @@ -2,12 +2,13 @@ This is used to perform the actual test case. 
""" - +import glob import os import os.path import unittest import uuid +from buildscripts.resmokelib import config from buildscripts.resmokelib import logging from buildscripts.resmokelib.utils import registry @@ -116,8 +117,8 @@ class ProcessTestCase(TestCase): # pylint: disable=abstract-method def run_test(self): """Run the test.""" try: - shell = self._make_process() - self._execute(shell) + proc = self._make_process() + self._execute(proc) except self.failureException: raise except: @@ -145,3 +146,12 @@ class ProcessTestCase(TestCase): # pylint: disable=abstract-method def _make_process(self): """Return a new Process instance that could be used to run the test or log the command.""" raise NotImplementedError("_make_process must be implemented by TestCase subclasses") + + def _cull_recordings(self, program_executable): + """Move recordings if test fails so it doesn't get deleted.""" + # Only store my recordings. Concurrent processes may generate their own recordings that we + # should ignore. There's a problem with duplicate program names under different directories + # But that should be rare and there's no harm in having more recordings stored. + for recording in glob.glob(program_executable + "*.undo"): + self.logger.info("Keeping recording %s", recording) + os.rename(recording, recording + '.tokeep') diff --git a/buildscripts/resmokelib/testing/testcases/jstest.py b/buildscripts/resmokelib/testing/testcases/jstest.py index b2ab85df514..f6ae1033e94 100644 --- a/buildscripts/resmokelib/testing/testcases/jstest.py +++ b/buildscripts/resmokelib/testing/testcases/jstest.py @@ -230,10 +230,17 @@ class JSTestCase(interface.ProcessTestCase): def run_test(self): """Execute the test.""" - if self.num_clients == 1: - self._run_single_copy() - else: - self._run_multiple_copies() + try: + if self.num_clients == 1: + self._run_single_copy() + else: + self._run_multiple_copies() + except: + # Archive any available recordings if there's any failure. 
It's possible a problem + # with the recorder will cause no recordings to be generated. There will also be + # recordings of other processes, we keep them to avoid complicating this code. + self._cull_recordings("mongo") + raise def _raise_if_unsafe_exit(self, return_code): """Determine if a return code represents and unsafe exit.""" diff --git a/etc/evergreen.yml b/etc/evergreen.yml index 34de733d607..c812aa201aa 100644 --- a/etc/evergreen.yml +++ b/etc/evergreen.yml @@ -180,6 +180,7 @@ variables: - func: "save code coverage data" - func: "save mongo coredumps" - func: "save failed unittests" + - func: "save UndoDB recordings" - func: "save unstripped dbtest" - func: "save hang analyzer debugger files" - func: "save disk statistics" @@ -1960,6 +1961,7 @@ functions: target: undodb-recordings.tgz source_dir: src include: + - "./*.undo.tokeep" - "./*.undo" "archive UndoDB recordings": &archive_undodb_recordings @@ -2322,7 +2324,8 @@ tasks: - "./src/evergreen/run_clang_tidy.sh" ## compile_unittests ## -- name: compile_unittests +- &compile_unittests + name: compile_unittests depends_on: - name: compile_dist_test commands: @@ -2333,6 +2336,10 @@ tasks: --separate-debug compiling_for_test: true +## A copy of the compile_unittests task for the recorded unittest taskgroup ## +- <<: *compile_unittests + name: compile_unittests_for_recorded_unittest + ## run_unittests ## - name: run_unittests tags: [] @@ -2348,6 +2355,34 @@ tasks: vars: resmoke_args: --suites=unittests +## run_unittests with UndoDB live-record ## +- name: run_unittests_with_recording + depends_on: + - name: compile_unittests_for_recorded_unittest + commands: + - *f_expansions_write + - func: "run diskstats" + - func: "f_expansions_write" + - func: "monitor process threads" + - func: "collect system resource info" + - command: subprocess.exec + params: + binary: bash + args: + - "./src/evergreen/undo_wiki_page.sh" + - command: attach.artifacts + params: + files: + - undo_wiki_page_location.json + - func: 
"run tests" + vars: + resmoke_args: --suites=unittests + record_with: --recordWith /opt/undodb5/bin/live-record + # Start fewer jobs since there's a constant amount of overhead of starting + # live-record for each job. + resmoke_jobs_factor: 0.3 + + ##compile_and_archive_libfuzzertests - build libfuzzertests ## - name: compile_and_archive_libfuzzertests tags: [] @@ -7083,6 +7118,12 @@ task_groups: - compile_unittests - run_unittests +- <<: *compile_task_group_template + name: compile_test_and_package_parallel_unittest_stream_with_recording_TG + tasks: + - compile_unittests_for_recorded_unittest + - run_unittests_with_recording + - <<: *compile_task_group_template name: compile_test_and_package_parallel_dbtest_stream_TG tasks: @@ -8987,6 +9028,9 @@ buildvariants: - name: compile_test_and_package_parallel_core_stream_TG distros: - rhel80-xlarge + - name: compile_test_and_package_parallel_unittest_stream_with_recording_TG + distros: + - rhel80-xlarge - name: compile_test_and_package_parallel_unittest_stream_TG distros: - rhel80-xlarge diff --git a/evergreen/failed_unittests_gather.sh b/evergreen/failed_unittests_gather.sh index dedff4be837..ebd1e13394a 100644 --- a/evergreen/failed_unittests_gather.sh +++ b/evergreen/failed_unittests_gather.sh @@ -6,7 +6,7 @@ cd src set -eou pipefail # Only run on unit test tasks so we don't target mongod binaries from cores. -if [ "${task_name}" != "run_unittests" ] && [ "${task_name}" != "run_dbtest" ]; then +if [ "${task_name}" != "run_unittests" ] && [ "${task_name}" != "run_dbtest" ] && [ "${task_name}" != "run_unittests_with_recording" ]; then exit 0 fi @@ -31,9 +31,11 @@ for core_file in $core_files; do # may return more than 1 file. binary_file_locations=$(/usr/bin/find -H . 
-executable -name "$binary_file*${exe}" 2>/dev/null) fi + if [ -z "$binary_file_locations" ]; then echo "Cannot locate the unittest binary file ($binary_file) that generated the core file $core_file" fi + for binary_file_location in $binary_file_locations; do new_binary_file=$unittest_bin_dir/$(echo "$binary_file_location" | sed "s/.*\///") if [ -f "$binary_file_location" ] && [ ! -f "$new_binary_file" ]; then @@ -63,8 +65,18 @@ for core_file in $core_files; do done done +# For recorded tests, use the text file to copy them over instead of relying on core dumps. +has_recorded_failures="" +if [[ -f "failed_recorded_tests.txt" ]]; then + while read -r line; do + cp "$line" . + done <"failed_recorded_tests.txt" + + has_recorded_failures="true" +fi + # Copy debug symbols for dynamic builds lib_dir=build/install/lib -if [ -d "$lib_dir" ] && [[ -n "$core_files" ]]; then +if [ -d "$lib_dir" ] && [[ -n "$core_files" || -n "$has_recorded_failures" ]]; then cp -r "$lib_dir" dist-unittests fi diff --git a/evergreen/resmoke_tests_execute.sh b/evergreen/resmoke_tests_execute.sh index 561feb9c620..fbd3decd4a5 100644 --- a/evergreen/resmoke_tests_execute.sh +++ b/evergreen/resmoke_tests_execute.sh @@ -130,10 +130,11 @@ if [[ ${disable_unit_tests} = "false" && ! -f ${skip_tests} ]]; then set -o errexit if [[ -n "${record_with}" ]]; then - recording_size=$(du -ch *.undo | grep total) + recording_size=$( (du -ch ./*.undo ./*.undo.tokeep || true) | grep total) echo "UndoDB produced recordings that were $recording_size (uncompressed) on disk" - if [[ $resmoke_exit_code = 0 ]]; then - echo "Resmoke exited successfully. UndoDB recordings will not be saved." + # Unittests recordings are renamed so there's never a need to store any .undo files. + if [[ $resmoke_exit_code = 0 || "${task_name}" == "run_unittests_with_recording" ]]; then + echo "Removing UndoDB recordings of successful tests." rm *.undo || true fi fi @@ -154,5 +155,6 @@ if [[ ${disable_unit_tests} = "false" && ! 
-f ${skip_tests} ]]; then core_files=$(/usr/bin/find -H .. \( -name "*.core" -o -name "*.mdmp" \) 2>/dev/null) rm -rf $core_files fi + exit $resmoke_exit_code fi # end if [[ ${disable_unit_tests} && ! -f ${skip_tests|/dev/null} ]] diff --git a/evergreen/undo_wiki_page.sh b/evergreen/undo_wiki_page.sh new file mode 100644 index 00000000000..91855244e95 --- /dev/null +++ b/evergreen/undo_wiki_page.sh @@ -0,0 +1,13 @@ +DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)" +. "$DIR/prelude.sh" + +set -o errexit +set -o verbose + +activate_venv +$python -c 'import json; print(json.dumps([{ + "name": "*** How to use UndoDB Recordings instead of Core Dumps or Log Files ***", + "link": "https://wiki.corp.mongodb.com/display/COREENG/Time+Travel+Debugging+in+MongoDB", + "visibility": "public", + "ignore_for_fetch": True +}]))' >undo_wiki_page_location.json diff --git a/src/mongo/shell/servers.js b/src/mongo/shell/servers.js index 68b4ce21251..e33a1a386e8 100644 --- a/src/mongo/shell/servers.js +++ b/src/mongo/shell/servers.js @@ -1462,7 +1462,7 @@ var _stopUndoLiveRecord = function(undoLiveRecordPid) { if (undoReturnCode !== 0) { throw new Error( "Undo live-record failed to terminate correctly. This is likely a bug in Undo. " + - "Please record any logs and send them to the #server-tig Slack channel"); + "Please record any logs and send them to the #server-testing Slack channel"); } }; -- cgit v1.2.1