summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRobert Guo <robert.guo@mongodb.com>2021-06-15 17:51:01 -0400
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2021-06-16 16:08:23 +0000
commit796a9f1504ea088bca412b8af787b735943245cf (patch)
treed7af48782eb4494338ee50c689a74b6207f58930
parentc73b1c09eb5ce2053577abac0a4ba360e3845de8 (diff)
downloadmongo-796a9f1504ea088bca412b8af787b735943245cf.tar.gz
SERVER-57439 add unittest task with undodb recordings to ! RHEL 8
-rw-r--r--buildscripts/resmokelib/core/process.py25
-rwxr-xr-xbuildscripts/resmokelib/hang_analyzer/hang_analyzer.py10
-rw-r--r--buildscripts/resmokelib/testing/testcases/cpp_unittest.py24
-rw-r--r--buildscripts/resmokelib/testing/testcases/interface.py16
-rw-r--r--buildscripts/resmokelib/testing/testcases/jstest.py15
-rw-r--r--etc/evergreen.yml46
-rw-r--r--evergreen/failed_unittests_gather.sh16
-rw-r--r--evergreen/resmoke_tests_execute.sh8
-rw-r--r--evergreen/undo_wiki_page.sh13
-rw-r--r--src/mongo/shell/servers.js2
10 files changed, 153 insertions, 22 deletions
diff --git a/buildscripts/resmokelib/core/process.py b/buildscripts/resmokelib/core/process.py
index b23ee5782c1..63e18575cba 100644
--- a/buildscripts/resmokelib/core/process.py
+++ b/buildscripts/resmokelib/core/process.py
@@ -117,13 +117,26 @@ class Process(object):
close_fds = (sys.platform != "win32")
with _POPEN_LOCK:
- self._process = subprocess.Popen(
- self.args, bufsize=buffer_size, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
- close_fds=close_fds, env=self.env, creationflags=creation_flags, cwd=self._cwd)
+
+ # Record unittests directly since resmoke does not interact with them and they can finish
+ # too quickly for the recorder to have a chance at attaching.
+ recorder_args = []
+ if _config.UNDO_RECORDER_PATH is not None and self.args[0].endswith("_test"):
+ now_str = datetime.now().strftime("%Y-%m-%dT%H-%M-%S")
+ # Only use the process name since we have to be able to correlate the recording name
+ # with the binary name easily.
+ recorder_output_file = "{process}-{t}.undo".format(
+ process=os.path.basename(self.args[0]), t=now_str)
+ recorder_args = [_config.UNDO_RECORDER_PATH, "-o", recorder_output_file]
+
+ self._process = subprocess.Popen(recorder_args + self.args, bufsize=buffer_size,
+ stdout=subprocess.PIPE, stderr=subprocess.PIPE,
+ close_fds=close_fds, env=self.env,
+ creationflags=creation_flags, cwd=self._cwd)
self.pid = self._process.pid
- if _config.UNDO_RECORDER_PATH is not None and ("mongod" in self.args[0]
- or "mongos" in self.args[0]):
+ if _config.UNDO_RECORDER_PATH is not None and (not self.args[0].endswith("_test")) and (
+ "mongod" in self.args[0] or "mongos" in self.args[0]):
now_str = datetime.now().strftime("%Y-%m-%dT%H-%M-%S")
recorder_output_file = "{logger}-{process}-{pid}-{t}.undo".format(
logger=self.logger.name.replace('/', '-'),
@@ -236,7 +249,7 @@ class Process(object):
if recorder_return != 0:
raise errors.ServerFailure(
"UndoDB live-record did not terminate correctly. This is likely a bug with UndoDB. "
- "Please record the logs and notify the #server-tig Slack channel")
+ "Please record the logs and notify the #server-testing Slack channel")
if self._stdout_pipe:
self._stdout_pipe.wait_until_finished()
diff --git a/buildscripts/resmokelib/hang_analyzer/hang_analyzer.py b/buildscripts/resmokelib/hang_analyzer/hang_analyzer.py
index 7e53c11679e..9034394e9bf 100755
--- a/buildscripts/resmokelib/hang_analyzer/hang_analyzer.py
+++ b/buildscripts/resmokelib/hang_analyzer/hang_analyzer.py
@@ -71,15 +71,21 @@ class HangAnalyzer(Subcommand):
processes = process_list.get_processes(self.process_ids, self.interesting_processes,
self.options.process_match, self.root_logger)
+ def is_python_process(pname: str):
+ # "live-record*" and "python*" are Python processes. Sending SIGUSR1 causes resmoke.py
+ # to dump its stack and run the hang analyzer on its child processes.
+ # Sending SIGUSR1 causes live-record to save its recording and terminate.
+ return pname.startswith("python") or pname.startswith("live-record")
+
# Suspending all processes, except python, to prevent them from getting unstuck when
# the hang analyzer attaches to them.
- for pinfo in [pinfo for pinfo in processes if not pinfo.name.startswith("python")]:
+ for pinfo in [pinfo for pinfo in processes if not is_python_process(pinfo.name)]:
for pid in pinfo.pidv:
process.pause_process(self.root_logger, pinfo.name, pid)
# Dump python processes by signalling them. The resmoke.py process will generate
# the report.json, when signalled, so we do this before attaching to other processes.
- for pinfo in [pinfo for pinfo in processes if pinfo.name.startswith("python")]:
+ for pinfo in [pinfo for pinfo in processes if is_python_process(pinfo.name)]:
for pid in pinfo.pidv:
process.signal_python(self.root_logger, pinfo.name, pid)
diff --git a/buildscripts/resmokelib/testing/testcases/cpp_unittest.py b/buildscripts/resmokelib/testing/testcases/cpp_unittest.py
index 08b4a7e0cbd..9162bce16c9 100644
--- a/buildscripts/resmokelib/testing/testcases/cpp_unittest.py
+++ b/buildscripts/resmokelib/testing/testcases/cpp_unittest.py
@@ -1,5 +1,7 @@
"""The unittest.TestCase for C++ unit tests."""
+import os
+from buildscripts.resmokelib import config
from buildscripts.resmokelib import core
from buildscripts.resmokelib import utils
from buildscripts.resmokelib.testing.testcases import interface
@@ -18,6 +20,28 @@ class CPPUnitTestCase(interface.ProcessTestCase):
self.program_executable = program_executable
self.program_options = utils.default_if_none(program_options, {}).copy()
+ def run_test(self):
+ """Run the test."""
+ try:
+ super().run_test()
+ except self.failureException:
+ if config.UNDO_RECORDER_PATH:
+ # Record the list of failed tests so we can upload them to the Evergreen task.
+ # Non-recorded tests rely on the core dump content to identify the test binaries.
+ with open("failed_recorded_tests.txt", 'a') as failure_list:
+ failure_list.write(self.program_executable)
+ failure_list.write("\n")
+ self.logger.exception(
+ "*** Failed test run was recorded. ***\n"
+ "For instructions on using the recording instead of core dumps, see\n"
+ "https://wiki.corp.mongodb.com/display/COREENG/Time+Travel+Debugging+in+MongoDB\n"
+ "For questions or bug reports, please reach our in #server-testing")
+
+ # Archive any available recordings if there's any failure. It's possible a problem
+ # with the recorder will cause no recordings to be generated.
+ self._cull_recordings(os.path.basename(self.program_executable))
+ raise
+
def _make_process(self):
self.program_options["job_num"] = self.fixture.job_num
self.program_options["test_id"] = self._id
diff --git a/buildscripts/resmokelib/testing/testcases/interface.py b/buildscripts/resmokelib/testing/testcases/interface.py
index 474ddb2ae01..67733e74737 100644
--- a/buildscripts/resmokelib/testing/testcases/interface.py
+++ b/buildscripts/resmokelib/testing/testcases/interface.py
@@ -2,12 +2,13 @@
This is used to perform the actual test case.
"""
-
+import glob
import os
import os.path
import unittest
import uuid
+from buildscripts.resmokelib import config
from buildscripts.resmokelib import logging
from buildscripts.resmokelib.utils import registry
@@ -116,8 +117,8 @@ class ProcessTestCase(TestCase): # pylint: disable=abstract-method
def run_test(self):
"""Run the test."""
try:
- shell = self._make_process()
- self._execute(shell)
+ proc = self._make_process()
+ self._execute(proc)
except self.failureException:
raise
except:
@@ -145,3 +146,12 @@ class ProcessTestCase(TestCase): # pylint: disable=abstract-method
def _make_process(self):
"""Return a new Process instance that could be used to run the test or log the command."""
raise NotImplementedError("_make_process must be implemented by TestCase subclasses")
+
+ def _cull_recordings(self, program_executable):
+ """Rename failed-test recordings with a '.tokeep' suffix so they don't get deleted."""
+ # Only store my recordings. Concurrent processes may generate their own recordings that we
+ # should ignore. There's a problem with duplicate program names under different directories,
+ # but that should be rare and there's no harm in having more recordings stored.
+ for recording in glob.glob(program_executable + "*.undo"):
+ self.logger.info("Keeping recording %s", recording)
+ os.rename(recording, recording + '.tokeep')
diff --git a/buildscripts/resmokelib/testing/testcases/jstest.py b/buildscripts/resmokelib/testing/testcases/jstest.py
index b2ab85df514..f6ae1033e94 100644
--- a/buildscripts/resmokelib/testing/testcases/jstest.py
+++ b/buildscripts/resmokelib/testing/testcases/jstest.py
@@ -230,10 +230,17 @@ class JSTestCase(interface.ProcessTestCase):
def run_test(self):
"""Execute the test."""
- if self.num_clients == 1:
- self._run_single_copy()
- else:
- self._run_multiple_copies()
+ try:
+ if self.num_clients == 1:
+ self._run_single_copy()
+ else:
+ self._run_multiple_copies()
+ except:
+ # Archive any available recordings if there's any failure. It's possible a problem
+ # with the recorder will cause no recordings to be generated. There will also be
+ # recordings of other processes; we keep them to avoid complicating this code.
+ self._cull_recordings("mongo")
+ raise
def _raise_if_unsafe_exit(self, return_code):
"""Determine if a return code represents an unsafe exit."""
diff --git a/etc/evergreen.yml b/etc/evergreen.yml
index 34de733d607..c812aa201aa 100644
--- a/etc/evergreen.yml
+++ b/etc/evergreen.yml
@@ -180,6 +180,7 @@ variables:
- func: "save code coverage data"
- func: "save mongo coredumps"
- func: "save failed unittests"
+ - func: "save UndoDB recordings"
- func: "save unstripped dbtest"
- func: "save hang analyzer debugger files"
- func: "save disk statistics"
@@ -1960,6 +1961,7 @@ functions:
target: undodb-recordings.tgz
source_dir: src
include:
+ - "./*.undo.tokeep"
- "./*.undo"
"archive UndoDB recordings": &archive_undodb_recordings
@@ -2322,7 +2324,8 @@ tasks:
- "./src/evergreen/run_clang_tidy.sh"
## compile_unittests ##
-- name: compile_unittests
+- &compile_unittests
+ name: compile_unittests
depends_on:
- name: compile_dist_test
commands:
@@ -2333,6 +2336,10 @@ tasks:
--separate-debug
compiling_for_test: true
+## A copy of the compile_unittests task for the recorded unittest taskgroup ##
+- <<: *compile_unittests
+ name: compile_unittests_for_recorded_unittest
+
## run_unittests ##
- name: run_unittests
tags: []
@@ -2348,6 +2355,34 @@ tasks:
vars:
resmoke_args: --suites=unittests
+## run_unittests with UndoDB live-record ##
+- name: run_unittests_with_recording
+ depends_on:
+ - name: compile_unittests_for_recorded_unittest
+ commands:
+ - *f_expansions_write
+ - func: "run diskstats"
+ - func: "f_expansions_write"
+ - func: "monitor process threads"
+ - func: "collect system resource info"
+ - command: subprocess.exec
+ params:
+ binary: bash
+ args:
+ - "./src/evergreen/undo_wiki_page.sh"
+ - command: attach.artifacts
+ params:
+ files:
+ - undo_wiki_page_location.json
+ - func: "run tests"
+ vars:
+ resmoke_args: --suites=unittests
+ record_with: --recordWith /opt/undodb5/bin/live-record
+ # Start fewer jobs since there's a constant amount of overhead of starting
+ # live-record for each job.
+ resmoke_jobs_factor: 0.3
+
+
##compile_and_archive_libfuzzertests - build libfuzzertests ##
- name: compile_and_archive_libfuzzertests
tags: []
@@ -7084,6 +7119,12 @@ task_groups:
- run_unittests
- <<: *compile_task_group_template
+ name: compile_test_and_package_parallel_unittest_stream_with_recording_TG
+ tasks:
+ - compile_unittests_for_recorded_unittest
+ - run_unittests_with_recording
+
+- <<: *compile_task_group_template
name: compile_test_and_package_parallel_dbtest_stream_TG
tasks:
- compile_dbtest
@@ -8987,6 +9028,9 @@ buildvariants:
- name: compile_test_and_package_parallel_core_stream_TG
distros:
- rhel80-xlarge
+ - name: compile_test_and_package_parallel_unittest_stream_with_recording_TG
+ distros:
+ - rhel80-xlarge
- name: compile_test_and_package_parallel_unittest_stream_TG
distros:
- rhel80-xlarge
diff --git a/evergreen/failed_unittests_gather.sh b/evergreen/failed_unittests_gather.sh
index dedff4be837..ebd1e13394a 100644
--- a/evergreen/failed_unittests_gather.sh
+++ b/evergreen/failed_unittests_gather.sh
@@ -6,7 +6,7 @@ cd src
set -eou pipefail
# Only run on unit test tasks so we don't target mongod binaries from cores.
-if [ "${task_name}" != "run_unittests" ] && [ "${task_name}" != "run_dbtest" ]; then
+if [ "${task_name}" != "run_unittests" ] && [ "${task_name}" != "run_dbtest" ] && [ "${task_name}" != "run_unittests_with_recording" ]; then
exit 0
fi
@@ -31,9 +31,11 @@ for core_file in $core_files; do
# may return more than 1 file.
binary_file_locations=$(/usr/bin/find -H . -executable -name "$binary_file*${exe}" 2>/dev/null)
fi
+
if [ -z "$binary_file_locations" ]; then
echo "Cannot locate the unittest binary file ($binary_file) that generated the core file $core_file"
fi
+
for binary_file_location in $binary_file_locations; do
new_binary_file=$unittest_bin_dir/$(echo "$binary_file_location" | sed "s/.*\///")
if [ -f "$binary_file_location" ] && [ ! -f "$new_binary_file" ]; then
@@ -63,8 +65,18 @@ for core_file in $core_files; do
done
done
+# For recorded tests, use the text file to copy them over instead of relying on core dumps.
+has_recorded_failures=""
+if [[ -f "failed_recorded_tests.txt" ]]; then
+ while read -r line; do
+ cp "$line" .
+ done <"failed_recorded_tests.txt"
+
+ has_recorded_failures="true"
+fi
+
# Copy debug symbols for dynamic builds
lib_dir=build/install/lib
-if [ -d "$lib_dir" ] && [[ -n "$core_files" ]]; then
+if [ -d "$lib_dir" ] && [[ -n "$core_files" || -n "$has_recorded_failures" ]]; then
cp -r "$lib_dir" dist-unittests
fi
diff --git a/evergreen/resmoke_tests_execute.sh b/evergreen/resmoke_tests_execute.sh
index 561feb9c620..fbd3decd4a5 100644
--- a/evergreen/resmoke_tests_execute.sh
+++ b/evergreen/resmoke_tests_execute.sh
@@ -130,10 +130,11 @@ if [[ ${disable_unit_tests} = "false" && ! -f ${skip_tests} ]]; then
set -o errexit
if [[ -n "${record_with}" ]]; then
- recording_size=$(du -ch *.undo | grep total)
+ recording_size=$( (du -ch ./*.undo ./*.undo.tokeep || true) | grep total)
echo "UndoDB produced recordings that were $recording_size (uncompressed) on disk"
- if [[ $resmoke_exit_code = 0 ]]; then
- echo "Resmoke exited successfully. UndoDB recordings will not be saved."
+ # Unittests recordings are renamed so there's never a need to store any .undo files.
+ if [[ $resmoke_exit_code = 0 || "${task_name}" == "run_unittests_with_recording" ]]; then
+ echo "Removing UndoDB recordings of successful tests."
rm *.undo || true
fi
fi
@@ -154,5 +155,6 @@ if [[ ${disable_unit_tests} = "false" && ! -f ${skip_tests} ]]; then
core_files=$(/usr/bin/find -H .. \( -name "*.core" -o -name "*.mdmp" \) 2>/dev/null)
rm -rf $core_files
fi
+
exit $resmoke_exit_code
fi # end if [[ ${disable_unit_tests} && ! -f ${skip_tests|/dev/null} ]]
diff --git a/evergreen/undo_wiki_page.sh b/evergreen/undo_wiki_page.sh
new file mode 100644
index 00000000000..91855244e95
--- /dev/null
+++ b/evergreen/undo_wiki_page.sh
@@ -0,0 +1,13 @@
+DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
+. "$DIR/prelude.sh"
+
+set -o errexit
+set -o verbose
+
+activate_venv
+$python -c 'import json; print(json.dumps([{
+ "name": "*** How to use UndoDB Recordings instead of Core Dumps or Log Files ***",
+ "link": "https://wiki.corp.mongodb.com/display/COREENG/Time+Travel+Debugging+in+MongoDB",
+ "visibility": "public",
+ "ignore_for_fetch": True
+}]))' >undo_wiki_page_location.json
diff --git a/src/mongo/shell/servers.js b/src/mongo/shell/servers.js
index 68b4ce21251..e33a1a386e8 100644
--- a/src/mongo/shell/servers.js
+++ b/src/mongo/shell/servers.js
@@ -1462,7 +1462,7 @@ var _stopUndoLiveRecord = function(undoLiveRecordPid) {
if (undoReturnCode !== 0) {
throw new Error(
"Undo live-record failed to terminate correctly. This is likely a bug in Undo. " +
- "Please record any logs and send them to the #server-tig Slack channel");
+ "Please record any logs and send them to the #server-testing Slack channel");
}
};