author | Ben Gamari <ben@smart-cactus.org> | 2019-01-31 19:47:53 -0500
committer | Ben Gamari <ben@smart-cactus.org> | 2019-01-31 19:47:53 -0500
commit | ebe2d344899e2165182d3c41b0796ec022561790 (patch)
tree | 0771fe65262a7bbb7a465f1b9e9bb50b91649f29 /testsuite/driver
parent | 21462a3a4edede710582b8038695667d2b9d0159 (diff)
download | haskell-ebe2d344899e2165182d3c41b0796ec022561790.tar.gz
Revert "Performance tests: recover a baseline from ancestor commits and CI results."
Unfortunately this has broken all future commits due to spurious(?)
performance changes which I have been unable to work around.
This reverts commit cc2261d42f6a954d88e355aaad41f001f65c95da.
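For context, the mechanism for declaring expected metric changes is untouched by this revert: `parse_allowed_perf_changes` in `testsuite/driver/perf_notes.py` still extracts per-test allowances from the commit message. A minimal sketch of calling it, assuming `perf_notes` imports cleanly as a module from `testsuite/driver`; the "Metric Increase" stanza wording and the test name `T1234` are illustrative assumptions, not content of this commit:

```python
# Hypothetical example: the stanza below and the test name T1234 are assumptions,
# shown only to illustrate the commit-message format the driver looks for.
from perf_notes import parse_allowed_perf_changes

msg = """Some commit title

Metric Increase 'bytes allocated':
    T1234
"""

# Per the docstring in perf_notes.py, the result maps each test name to the
# changes (direction, metrics, options) its commit message allows.
print(parse_allowed_perf_changes(msg))
```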
Diffstat (limited to 'testsuite/driver')
-rw-r--r-- | testsuite/driver/perf_notes.py | 202
-rw-r--r-- | testsuite/driver/runtests.py | 36
-rw-r--r-- | testsuite/driver/testglobals.py | 12
-rw-r--r-- | testsuite/driver/testlib.py | 69
-rw-r--r-- | testsuite/driver/testutil.py | 2
5 files changed, 62 insertions, 259 deletions
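The `testglobals.py` and `testlib.py` hunks below return `stats_range_fields` to a plain map from metric name to an `(expected value, allowed percentage deviation)` pair, with `None` standing in when no prior measurement exists. A minimal sketch of that acceptance rule; the helper name `stat_in_range` is hypothetical and the real comparison lives in `Perf.check_stats_change`:

```python
from typing import Optional, Tuple

# Mirrors a post-revert stats_range_fields entry: None when there is no prior
# metric, otherwise (expected value, allowed percentage deviation).
RangeField = Optional[Tuple[float, float]]

def stat_in_range(actual: float, range_val_dev: RangeField) -> bool:
    if range_val_dev is None:
        return True  # first run of the benchmark: trivially passes as a new metric
    expected, tolerance_pct = range_val_dev
    return abs(actual - expected) <= expected * tolerance_pct / 100

# e.g. {'bytes allocated': (9300000000, 10)} accepts values within +/-10%:
assert stat_in_range(9_500_000_000, (9_300_000_000, 10))
assert not stat_in_range(11_000_000_000, (9_300_000_000, 10))
```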
diff --git a/testsuite/driver/perf_notes.py b/testsuite/driver/perf_notes.py
index c5d09e04db..6d80e07e12 100644
--- a/testsuite/driver/perf_notes.py
+++ b/testsuite/driver/perf_notes.py
@@ -13,7 +13,6 @@
 import argparse
 import re
 import subprocess
 import time
-import sys
 from collections import namedtuple
 from math import ceil, trunc
@@ -42,7 +41,7 @@ def is_worktree_dirty():
 # The metrics (a.k.a stats) are named tuples, PerfStat, in this form:
 #
 # ( test_env : 'val',   # Test environment.
-#   test     : 'val',   # Name of the test
+#   test     : 'val',   # Name of the test 
 #   way      : 'val',
 #   metric   : 'val',   # Metric being recorded
 #   value    : 'val',   # The statistic result e.g. runtime
@@ -74,21 +73,6 @@ def get_perf_stats(commit='HEAD', namespace='perf'):
     log = [parse_perf_stat(stat_str) for stat_str in log]
     return log
 
-# Check if a str is in a 40 character git commit hash.
-# str -> bool
-_commit_hash_re = re.compile('[0-9a-f]' * 40)
-def is_commit_hash(hash):
-    return _commit_hash_re.fullmatch(hash) != None
-
-# Convert a <ref> to a commit hash code.
-# str -> str
-def commit_hash(commit):
-    if is_commit_hash(commit):
-        return commit
-    return subprocess.check_output(['git', 'rev-parse', commit], \
-            stderr=subprocess.STDOUT) \
-            .decode() \
-            .strip()
 
 # Get allowed changes to performance. This is extracted from the commit message of
 # the given commit in this form:
@@ -99,20 +83,13 @@ def commit_hash(commit):
 #             'metrics': ['metricA', 'metricB', ...],
 #             'opts': {
 #                 'optionA': 'string value',
-#                 'optionB': 'string value',   # e.g. test_env: "x86_64-linux"
+#                 'optionB': 'string value',
 #                 ...
 #             }
 #         }
-_get_allowed_perf_changes_cache = {}
 def get_allowed_perf_changes(commit='HEAD'):
-    global _get_allowed_perf_changes_cache
-    commit = commit_hash(commit)
-    if not commit in _get_allowed_perf_changes_cache:
-        commitByteStr = subprocess.check_output(\
-            ['git', '--no-pager', 'log', '-n1', '--format=%B', commit])
-        _get_allowed_perf_changes_cache[commit] \
-            = parse_allowed_perf_changes(commitByteStr.decode())
-    return _get_allowed_perf_changes_cache[commit]
+    commitByteStr = subprocess.check_output(['git', '--no-pager', 'log', '-n1', '--format=%B', commit])
+    return parse_allowed_perf_changes(commitByteStr.decode())
 
 def parse_allowed_perf_changes(commitMsg):
     # Helper regex. Non-capturing unless postfixed with Cap.
@@ -125,7 +102,7 @@ def parse_allowed_perf_changes(commitMsg):
     exp = (r"^Metric"
         +s+r"(Increase|Decrease)"
         +s+r"?("+qstr+r"|"+qstrList+r")?" # Metric or list of metrics.s..
-        +s+r"?(\(" + r"(?:[^')]|"+qstr+r")*" + r"\))?" # Options surrounded in parenthesis. (allow parenthases in quoted strings)
+        +s+r"?(\(" + r"(?:[^')]|"+qstr+r")*" + r"\))?" # Options surounded in parenthesis. (allow parenthases in quoted strings)
         +s+r"?:?" # Optional ":"
         +s+r"?((?:(?!\n\n)(?!\n[^\s])(?:.|\n))*)" # Test names. Stop parsing on empty or non-indented new line.
     )
@@ -236,176 +213,11 @@ def append_perf_stat(stats, commit='HEAD', namespace='perf', max_tries=5):
         tries += 1
         time.sleep(1)
 
-    print("\nAn error occurred while writing the performance metrics to git notes.\n \
-        This is usually due to a lock-file existing somewhere in the git repo.")
+    print("\nAn error occured while writing the performance metrics to git notes.\n \
+        This is usually due to a lock-file existing somewhere in the git repo.")
 
     return False
 
-#
-# Baseline calculation
-#
-
-# Max number of ancestor commits to search when compiling a baseline performance metric.
-BaselineSearchDepth = 75
-
-# The git notes name space for local results.
-LocalNamespace = "perf"
-
-# The git notes name space for ci results.
-CiNamespace = "ci/" + LocalNamespace
-
-# (isCalculated, best fit ci test_env or None)
-BestFitCiTestEnv = (False, None)
-
-# test_env string or None
-def best_fit_ci_test_env():
-    global BestFitCiTestEnv
-    if not BestFitCiTestEnv[0]:
-        platform = sys.platform
-        isArch64 = sys.maxsize > 2**32
-        arch = "x86_64" if isArch64 else "i386"
-
-        if platform.startswith("linux"):
-            test_env = arch + "-linux-deb9"
-        elif platform.startswith("win32"):
-            # There are no windows CI test results.
-            test_env = None
-        elif isArch64 and platform.startswith("darwin"):
-            test_env = arch + "-darwin"
-        elif isArch64 and platform.startswith("freebsd"):
-            test_env = arch + "-freebsd"
-        else:
-            test_env = None
-
-        BestFitCiTestEnv = (True, test_env)
-
-    return BestFitCiTestEnv[1]
-
-_baseline_depth_commit_log = {}
-
-# Get the commit hashes for the last BaselineSearchDepth commits from and
-# including the input commit. The output commits are all commit hashes.
-# str -> [str]
-def baseline_commit_log(commit):
-    global _baseline_depth_commit_log
-    commit = commit_hash(commit)
-    if not commit in _baseline_depth_commit_log:
-        _baseline_depth_commit_log[commit] = \
-            subprocess.check_output(['git', 'log', '--format=%H', \
-                '-n' + str(BaselineSearchDepth)]) \
-                .decode().split('\n')
-    return _baseline_depth_commit_log[commit]
-
-# Cache of baseline values. This is a dict of dicts indexed on:
-# (useCiNamespace, commit) -> (test_env, test, metric, way) -> baseline
-# (bool          , str   ) -> (str     , str , str   , str) -> float
-_commit_metric_cache = {}
-
-# Get the baseline (expected value) of a test at a given commit. This searches
-# git notes from older commits for recorded metrics (locally and from ci). More
-# recent commits are favoured, then local results over ci results are favoured.
-#
-# commit: str - must be a commit hash (see commit_has())
-# name: str - test name
-# test_env: str - test environment (note a best fit test_env will be used
-#     instead when looking for ci results)
-# metric: str - test metric
-# way: str - test way
-# returns: the baseline float or None if no metric was found within
-#     BaselineSearchDepth commits and since the last expected change.
-def baseline_metric(commit, name, test_env, metric, way):
-    # For performance reasons (in order to avoid calling commit_hash), we assert
-    # commit is already a commit hash.
-    assert is_commit_hash(commit)
-
-    # Get all recent commit hashes.
-    commit_hashes = baseline_commit_log(commit)
-
-    # TODO PERF use git log to get hashes of all BaselineSearchDepth commits
-    def depth_to_commit(depth):
-        return commit_hashes[depth]
-
-    def has_expected_change(commit):
-        return get_allowed_perf_changes(commit).get(name) \
-            != None
-
-    # Bool -> String
-    def namespace(useCiNamespace):
-        return CiNamespace if useCiNamespace else LocalNamespace
-
-    ci_test_env = best_fit_ci_test_env()
-
-    # gets the metric of a given commit
-    # (Bool, Int) -> (float | None)
-    def commit_metric(useCiNamespace, currentCommit):
-        global _commit_metric_cache
-
-        # Get test environment.
-        effective_test_env = ci_test_env if useCiNamespace else test_env
-        if effective_test_env == None:
-            # This can happen when no best fit ci test is found.
-            return None
-
-        # Check for cached value.
-        cacheKeyA = (useCiNamespace, currentCommit)
-        cacheKeyB = (effective_test_env, name, metric, way)
-        if cacheKeyA in _commit_metric_cache:
-            return _commit_metric_cache[cacheKeyA].get(cacheKeyB)
-
-        # Cache miss.
-        # Calculate baselines from the current commit's git note.
-        # Note that the git note may contain data for other tests. All tests'
-        # baselines will be collected and cached for future use.
-        allCommitMetrics = get_perf_stats(
-            currentCommit,
-            namespace(useCiNamespace))
-
-        # Collect recorded values by cacheKeyB.
-        values_by_cache_key_b = {}
-        for perfStat in allCommitMetrics:
-            currentCacheKey = (perfStat.test_env, perfStat.test, \
-                perfStat.metric, perfStat.way)
-            currentValues = values_by_cache_key_b.setdefault(currentCacheKey, [])
-            currentValues.append(float(perfStat.value))
-
-        # Calculate and baseline (average of values) by cacheKeyB.
-        baseline_by_cache_key_b = {}
-        for currentCacheKey, currentValues in values_by_cache_key_b.items():
-            baseline_by_cache_key_b[currentCacheKey] = \
-                sum(currentValues) / len(currentValues)
-
-        # Save baselines to the cache.
-        _commit_metric_cache[cacheKeyA] = baseline_by_cache_key_b
-        return baseline_by_cache_key_b.get(cacheKeyB)
-
-    # Searches through previous commits trying local then ci for each commit in.
-    def search(useCiNamespace, depth):
-        # Stop if reached the max search depth, or if
-        # there is an expected change at the child commit (depth-1). This is a
-        # subtlety: Metrics recorded on commit x incorporate the expected
-        # changes for commit x. Hence metrics from x are still a valid baseline,
-        # while older commits are not. This is why we check for expected changes
-        # on depth-1 rather than depth.
-        if depth >= BaselineSearchDepth or has_expected_change( \
-                depth_to_commit(depth - 1)):
-            return None
-
-        # Check for a metric on this commit.
-        current_metric = commit_metric(useCiNamespace, depth_to_commit(depth))
-        if current_metric != None:
-            return current_metric
-
-        # Metric is not available.
-        # If tried local, now try CI. Else move to the parent commit.
-        if not useCiNamespace:
-            return search(True, depth)
-        else:
-            return search(False, depth + 1)
-
-    # Start search from parent commit using local name space.
-    return search(False, 1)
-
-
 # Check test stats. This prints the results for the user.
 # actual: the PerfStat with actual value.
 # expected_val: the expected value (this should generally be derived from get_perf_stats())
diff --git a/testsuite/driver/runtests.py b/testsuite/driver/runtests.py
index 73297dae46..247a5cc330 100644
--- a/testsuite/driver/runtests.py
+++ b/testsuite/driver/runtests.py
@@ -379,37 +379,18 @@ else:
     new_metrics = [metric for (change, metric) in t.metrics if change == MetricChange.NewMetric]
     if any(new_metrics):
         if canGitStatus:
-            reason = 'a baseline (expected value) cannot be recovered from' + \
-                ' previous git commits. This may be due to HEAD having' + \
-                ' new tests or having expected changes, the presence of' + \
-                ' expected changes since the last run of the tests, and/or' + \
-                ' the latest test run being too old.'
-            fix = 'If the tests exist on the previous' + \
-                ' commit (And are configured to run with the same ways),' + \
-                ' then check out that commit and run the tests to generate' + \
-                ' the missing metrics. Alternatively, a baseline may be' + \
-                ' recovered from ci results once fetched (where origin' + \
-                ' is the official ghc git repo):\n\n' + \
-                spacing + 'git fetch ' + \
-                'https://gitlab.haskell.org/ghc/ghc-performance-notes.git' + \
-                ' refs/notes/perf:refs/notes/' + Perf.CiNamespace
+            reason = 'the previous git commit doesn\'t have recorded metrics for the following tests.' + \
+                ' If the tests exist on the previous commit, then check it out and run the tests to generate the missing metrics.'
         else:
-            reason = "this is not a git repo so the previous git commit's" + \
-                " metrics cannot be loaded from git notes:"
-            fix = ""
+            reason = 'this is not a git repo so the previous git commit\'s metrics cannot be loaded from git notes:'
         print()
-        print(str_warn('Missing Baseline Metrics') + \
-            ' these metrics trivially pass because ' + reason)
-        print(spacing + (' ').join(set([metric.test for metric in new_metrics])))
-        if fix != "":
-            print()
-            print(fix)
+        print(str_warn('New Metrics') + ' these metrics trivially pass because ' + reason)
+        print(spacing + ('\n' + spacing).join(set([metric.test for metric in new_metrics])))
 
     # Inform of how to accept metric changes.
     if (len(t.unexpected_stat_failures) > 0):
         print()
-        print(str_info("Some stats have changed") + " If this is expected, " + \
-            "allow changes by appending the git commit message with this:")
+        print(str_info("Some stats have changed") + " If this is expected, allow changes by appending the git commit message with this:")
         print('-' * 25)
         print(Perf.allow_changes_string(t.metrics))
         print('-' * 25)
@@ -425,9 +406,8 @@ else:
     elif canGitStatus and any(stats):
         if is_worktree_dirty():
             print()
-            print(str_warn('Performance Metrics NOT Saved') + \
-                ' working tree is dirty. Commit changes or use ' + \
-                '--metrics-file to save metrics to a file.')
+            print(str_warn('Working Tree is Dirty') + ' performance metrics will not be saved.' + \
+                ' Commit changes or use --metrics-file to save metrics to a file.')
         else:
             Perf.append_perf_stat(stats)
diff --git a/testsuite/driver/testglobals.py b/testsuite/driver/testglobals.py
index 423925e87e..0e0240db8e 100644
--- a/testsuite/driver/testglobals.py
+++ b/testsuite/driver/testglobals.py
@@ -235,17 +235,11 @@ class TestOptions:
        # extra files to copy to the testdir
        self.extra_files = []
 
-       # Map from metric to (fuction from way to baseline value, allowed percentage deviation) e.g.
-       # { 'bytes allocated': (
-       #       lambda way: if way1: return None ... elif way2:return 9300000000 ...,
-       #       10) }
-       # This means no baseline is available for way1. For way 2, allow a 10%
-       # deviation from 9300000000.
+       # Map from metric to expectected value and allowed percentage deviation. e.g.
+       # { 'bytes allocated': (9300000000, 10) }
+       # To allow a 10% deviation from 9300000000 for the 'bytes allocated' metric.
        self.stats_range_fields = {}
 
-       # Is the test testing performance?
-       self.is_stats_test = False
-
        # Does this test the compiler's performance as opposed to the generated code.
        self.is_compiler_stats_test = False
diff --git a/testsuite/driver/testlib.py b/testsuite/driver/testlib.py
index b637b1992d..040e674312 100644
--- a/testsuite/driver/testlib.py
+++ b/testsuite/driver/testlib.py
@@ -65,7 +65,7 @@ def isCompilerStatsTest():
 
 def isStatsTest():
     opts = getTestOpts()
-    return opts.is_stats_test
+    return bool(opts.stats_range_fields)
 
 
 # This can be called at the top of a file of tests, to set default test options
@@ -348,18 +348,29 @@ def testing_metrics():
 # measures the performance numbers of the compiler.
 # As this is a fairly rare case in the testsuite, it defaults to false to
 # indicate that it is a 'normal' performance test.
-def _collect_stats(name, opts, metrics, deviation, is_compiler_stats_test=False):
+def _collect_stats(name, opts, metric, deviation, is_compiler_stats_test=False):
     if not re.match('^[0-9]*[a-zA-Z][a-zA-Z0-9._-]*$', name):
         failBecause('This test has an invalid name.')
 
-    # Normalize metrics to a list of strings.
-    if isinstance(metrics, str):
-        if metrics == 'all':
-            metrics = testing_metrics()
-        else:
-            metrics = [metrics]
+    tests = Perf.get_perf_stats('HEAD^')
+
+    # Might have multiple metrics being measured for a single test.
+    test = [t for t in tests if t.test == name]
+
+    if tests == [] or test == []:
+        # There are no prior metrics for this test.
+        if isinstance(metric, str):
+            if metric == 'all':
+                for field in testing_metrics():
+                    opts.stats_range_fields[field] = None
+            else:
+                opts.stats_range_fields[metric] = None
+        if isinstance(metric, list):
+            for field in metric:
+                opts.stats_range_fields[field] = None
+
+        return
 
-    opts.is_stats_test = True
     if is_compiler_stats_test:
         opts.is_compiler_stats_test = True
@@ -368,11 +379,24 @@ def _collect_stats(name, opts, metrics, deviation, is_compiler_stats_test=False)
     if config.compiler_debugged and is_compiler_stats_test:
         opts.skip = 1
 
-    for metric in metrics:
-        baselineByWay = lambda way, target_commit: Perf.baseline_metric( \
-            target_commit, name, config.test_env, metric, way)
+    # get the average value of the given metric from test
+    def get_avg_val(metric_2):
+        metric_2_metrics = [float(t.value) for t in test if t.metric == metric_2]
+        return sum(metric_2_metrics) / len(metric_2_metrics)
+
+    # 'all' is a shorthand to test for bytes allocated, peak megabytes allocated, and max bytes used.
+    if isinstance(metric, str):
+        if metric == 'all':
+            for field in testing_metrics():
+                opts.stats_range_fields[field] = (get_avg_val(field), deviation)
+            return
+        else:
+            opts.stats_range_fields[metric] = (get_avg_val(metric), deviation)
+            return
 
-        opts.stats_range_fields[metric] = (baselineByWay, deviation)
+    if isinstance(metric, list):
+        for field in metric:
+            opts.stats_range_fields[field] = (get_avg_val(field), deviation)
 
 
 # -----
@@ -1140,11 +1164,10 @@ def metric_dict(name, way, metric, value):
 # name: name of the test.
 # way: the way.
 # stats_file: the path of the stats_file containing the stats for the test.
-# range_fields: see TestOptions.stats_range_fields
+# range_fields
 # Returns a pass/fail object. Passes if the stats are withing the expected value ranges.
 # This prints the results for the user.
 def check_stats(name, way, stats_file, range_fields):
-    head_commit = Perf.commit_hash('HEAD')
     result = passed()
     if range_fields:
         try:
@@ -1154,7 +1177,7 @@ def check_stats(name, way, stats_file, range_fields):
         stats_file_contents = f.read()
         f.close()
 
-        for (metric, baseline_and_dev) in range_fields.items():
+        for (metric, range_val_dev) in range_fields.items():
             field_match = re.search('\("' + metric + '", "([0-9]+)"\)', stats_file_contents)
             if field_match == None:
                 print('Failed to find metric: ', metric)
@@ -1167,15 +1190,14 @@ def check_stats(name, way, stats_file, range_fields):
                 change = None
 
                 # If this is the first time running the benchmark, then pass.
-                baseline = baseline_and_dev[0](way, head_commit)
-                if baseline == None:
+                if range_val_dev == None:
                     metric_result = passed()
                     change = MetricChange.NewMetric
                 else:
-                    tolerance_dev = baseline_and_dev[1]
+                    (expected_val, tolerance_dev) = range_val_dev
                     (change, metric_result) = Perf.check_stats_change(
                         perf_stat,
-                        baseline,
+                        expected_val,
                         tolerance_dev,
                         config.allowed_perf_changes,
                         config.verbose >= 4)
@@ -1308,13 +1330,8 @@ def simple_run(name, way, prog, extra_run_opts):
 
     my_rts_flags = rts_flags(way)
 
-    # Collect stats if necessary:
-    # isStatsTest and not isCompilerStatsTest():
-    #     assume we are running a ghc compiled program. Collect stats.
-    # isStatsTest and way == 'ghci':
-    #     assume we are running a program via ghci. Collect stats
     stats_file = name + '.stats'
-    if isStatsTest() and (not isCompilerStatsTest() or way == 'ghci'):
+    if isStatsTest() and not isCompilerStatsTest():
         stats_args = ' +RTS -V0 -t' + stats_file + ' --machine-readable -RTS'
     else:
         stats_args = ''
diff --git a/testsuite/driver/testutil.py b/testsuite/driver/testutil.py
index cc4a4eea0d..6e0c2684d7 100644
--- a/testsuite/driver/testutil.py
+++ b/testsuite/driver/testutil.py
@@ -16,7 +16,7 @@ def strip_quotes(s):
     return s.strip('\'"')
 
 def str_fail(s):
-    return '\033[1m\033[31m' + s + '\033[0m'
+    return '\033[1m\033[43m\033[31m' + s + '\033[0m'
 
 def str_pass(s):
     return '\033[1m\033[32m' + s + '\033[0m'
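Taken together, the `_collect_stats` and `check_stats` hunks mean the driver once again derives expected values only from the parent commit's git notes: read `HEAD^`'s recorded stats, keep the samples for the test at hand, and average them per metric, falling back to `None` (trivially passing) when nothing was recorded. A condensed paraphrase of that flow, assuming it runs from `testsuite/driver` where `perf_notes` is importable; the helper name `expected_values` and the example test name are hypothetical:

```python
# Condensed paraphrase of the post-revert _collect_stats behaviour, not the
# driver's exact code.
from typing import Dict, List, Optional, Tuple
import perf_notes as Perf

def expected_values(test_name: str, metrics: List[str], deviation: int
                    ) -> Dict[str, Optional[Tuple[float, int]]]:
    # Metrics recorded for the parent commit in the local 'perf' git notes.
    prior = [t for t in Perf.get_perf_stats('HEAD^') if t.test == test_name]
    fields = {}
    for m in metrics:
        samples = [float(t.value) for t in prior if t.metric == m]
        # Average the recorded samples; None means "no baseline, trivially pass".
        fields[m] = (sum(samples) / len(samples), deviation) if samples else None
    return fields

# e.g. expected_values('T1234', ['bytes allocated'], 10) might yield
# {'bytes allocated': (9300000000.0, 10)} when HEAD^ has notes for T1234.
```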