Diffstat (limited to 'testsuite')
-rw-r--r--   testsuite/driver/perf_notes.py    | 219
-rw-r--r--   testsuite/driver/runtests.py      |  35
-rw-r--r--   testsuite/driver/testglobals.py   |  16
-rw-r--r--   testsuite/driver/testlib.py       |  70
-rw-r--r--   testsuite/driver/testutil.py      |   2
5 files changed, 277 insertions, 65 deletions
diff --git a/testsuite/driver/perf_notes.py b/testsuite/driver/perf_notes.py
index 6d80e07e12..365732ca69 100644
--- a/testsuite/driver/perf_notes.py
+++ b/testsuite/driver/perf_notes.py
@@ -13,6 +13,7 @@ import argparse
 import re
 import subprocess
 import time
+import sys

 from collections import namedtuple
 from math import ceil, trunc
@@ -41,7 +42,7 @@ def is_worktree_dirty():
 # The metrics (a.k.a stats) are named tuples, PerfStat, in this form:
 #
 # ( test_env : 'val',   # Test environment.
-#   test     : 'val',   # Name of the test
+#   test     : 'val',   # Name of the test
 #   way      : 'val',
 #   metric   : 'val',   # Metric being recorded
 #   value    : 'val',   # The statistic result e.g. runtime
@@ -50,6 +51,9 @@ def is_worktree_dirty():
 # All the fields of a metric (excluding commit field).
 PerfStat = namedtuple('PerfStat', ['test_env','test','way','metric','value'])

+# A baseline recovered from stored metrics.
+Baseline = namedtuple('Baseline', ['perfStat','commit','commitDepth'])
+
 class MetricChange:
     NewMetric = 'NewMetric'
     NoChange = 'NoChange'
@@ -73,6 +77,21 @@ def get_perf_stats(commit='HEAD', namespace='perf'):
     log = [parse_perf_stat(stat_str) for stat_str in log]
     return log

+# Check if a str is a 40 character git commit hash.
+# str -> bool
+_commit_hash_re = re.compile('[0-9a-f]' * 40)
+def is_commit_hash(hash):
+    return _commit_hash_re.fullmatch(hash) != None
+
+# Convert a <ref> to a commit hash.
+# str -> str
+def commit_hash(commit):
+    if is_commit_hash(commit):
+        return commit
+    return subprocess.check_output(['git', 'rev-parse', commit], \
+            stderr=subprocess.STDOUT) \
+        .decode() \
+        .strip()

 # Get allowed changes to performance. This is extracted from the commit message of
 # the given commit in this form:
@@ -83,13 +102,20 @@ def get_perf_stats(commit='HEAD', namespace='perf'):
 #     'metrics': ['metricA', 'metricB', ...],
 #     'opts': {
 #         'optionA': 'string value',
-#         'optionB': 'string value',
+#         'optionB': 'string value',  # e.g. test_env: "x86_64-linux"
 #         ...
 #     }
 # }
+_get_allowed_perf_changes_cache = {}
 def get_allowed_perf_changes(commit='HEAD'):
-    commitByteStr = subprocess.check_output(['git', '--no-pager', 'log', '-n1', '--format=%B', commit])
-    return parse_allowed_perf_changes(commitByteStr.decode())
+    global _get_allowed_perf_changes_cache
+    commit = commit_hash(commit)
+    if not commit in _get_allowed_perf_changes_cache:
+        commitByteStr = subprocess.check_output(\
+            ['git', '--no-pager', 'log', '-n1', '--format=%B', commit])
+        _get_allowed_perf_changes_cache[commit] \
+            = parse_allowed_perf_changes(commitByteStr.decode())
+    return _get_allowed_perf_changes_cache[commit]

 def parse_allowed_perf_changes(commitMsg):
     # Helper regex. Non-capturing unless postfixed with Cap.
@@ -102,7 +128,7 @@ def parse_allowed_perf_changes(commitMsg):
     exp = (r"^Metric"
         +s+r"(Increase|Decrease)"
         +s+r"?("+qstr+r"|"+qstrList+r")?"              # Metric or list of metrics.
-        +s+r"?(\(" + r"(?:[^')]|"+qstr+r")*" + r"\))?" # Options surounded in parenthesis. (allow parenthases in quoted strings))
+        +s+r"?(\(" + r"(?:[^')]|"+qstr+r")*" + r"\))?" # Options surrounded in parentheses. (allow parentheses in quoted strings)
         +s+r"?:?"                                      # Optional ":"
         +s+r"?((?:(?!\n\n)(?!\n[^\s])(?:.|\n))*)"      # Test names. Stop parsing on empty or non-indented new line.
     )
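An aside, not part of the patch: the hunk above tightens the regex that recognises allowed-change annotations in commit messages. For orientation, this is a minimal sketch of the kind of message section parse_allowed_perf_changes is written to accept; the test name T12345 is made up.

    # Illustrative sketch only, not from the patch. A minimal commit-message
    # section of the shape matched by the regex above (parse_allowed_perf_changes
    # is the function defined in perf_notes.py); T12345 is a made-up test name.
    sample_msg = "Metric Increase 'bytes allocated':\n    T12345\n"
    # Expected to parse to a dict keyed by test name, with 'metrics' and 'opts'
    # entries as described in the comment block above get_allowed_perf_changes.
    changes = parse_allowed_perf_changes(sample_msg)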
@@ -213,19 +239,191 @@ def append_perf_stat(stats, commit='HEAD', namespace='perf', max_tries=5):
         tries += 1
         time.sleep(1)

-    print("\nAn error occured while writing the performance metrics to git notes.\n \
-            This is usually due to a lock-file existing somewhere in the git repo.")
+    print("\nAn error occurred while writing the performance metrics to git notes.\n \
+            This is usually due to a lock-file existing somewhere in the git repo.")

     return False

+#
+# Baseline calculation
+#
+
+# Max number of ancestor commits to search when compiling a baseline performance metric.
+BaselineSearchDepth = 75
+
+# The git notes name space for local results.
+LocalNamespace = "perf"
+
+# The git notes name space for ci results.
+CiNamespace = "ci/" + LocalNamespace
+
+# (isCalculated, best fit ci test_env or None)
+BestFitCiTestEnv = (False, None)
+
+# test_env string or None
+def best_fit_ci_test_env():
+    global BestFitCiTestEnv
+    if not BestFitCiTestEnv[0]:
+        platform = sys.platform
+        isArch64 = sys.maxsize > 2**32
+        arch = "x86_64" if isArch64 else "i386"
+
+        if platform.startswith("linux"):
+            test_env = arch + "-linux-deb9"
+        elif platform.startswith("win32"):
+            # There are no windows CI test results.
+            test_env = None
+        elif isArch64 and platform.startswith("darwin"):
+            test_env = arch + "-darwin"
+        elif isArch64 and platform.startswith("freebsd"):
+            test_env = arch + "-freebsd"
+        else:
+            test_env = None
+
+        BestFitCiTestEnv = (True, test_env)
+
+    return BestFitCiTestEnv[1]
+
+_baseline_depth_commit_log = {}
+
+# Get the commit hashes for the last BaselineSearchDepth commits from and
+# including the input commit. The output commits are all commit hashes.
+# str -> [str]
+def baseline_commit_log(commit):
+    global _baseline_depth_commit_log
+    commit = commit_hash(commit)
+    if not commit in _baseline_depth_commit_log:
+        _baseline_depth_commit_log[commit] = \
+            subprocess.check_output(['git', 'log', '--format=%H', \
+                '-n' + str(BaselineSearchDepth)]) \
+            .decode().split('\n')
+    return _baseline_depth_commit_log[commit]
+
+# Cache of baseline values. This is a dict of dicts indexed on:
+# (useCiNamespace, commit) -> (test_env, test, metric, way) -> baseline
+# (bool          , str   ) -> (str     , str , str   , str) -> float
+_commit_metric_cache = {}
+
+# Get the baseline (expected value) of a test at a given commit. This searches
+# git notes from older commits for recorded metrics (locally and from ci). More
+# recent commits are favoured, then local results over ci results are favoured.
+#
+# commit: str - must be a commit hash (see commit_hash())
+# name: str - test name
+# test_env: str - test environment (note a best fit test_env will be used
+#                 instead when looking for ci results)
+# metric: str - test metric
+# way: str - test way
+# returns: the Baseline named tuple or None if no metric was found within
+#          BaselineSearchDepth commits and since the last expected change.
+def baseline_metric(commit, name, test_env, metric, way):
+    # For performance reasons (in order to avoid calling commit_hash), we assert
+    # commit is already a commit hash.
+    assert is_commit_hash(commit)
+
+    # Get all recent commit hashes.
+    commit_hashes = baseline_commit_log(commit)
+    def depth_to_commit(depth):
+        return commit_hashes[depth]
+
+    def has_expected_change(commit):
+        return get_allowed_perf_changes(commit).get(name) \
+            != None
+
+    # Bool -> String
+    def namespace(useCiNamespace):
+        return CiNamespace if useCiNamespace else LocalNamespace
+
+    ci_test_env = best_fit_ci_test_env()
+
+    # gets the metric of a given commit
+    # (Bool, Int) -> (float | None)
+    def commit_metric(useCiNamespace, depth):
+        global _commit_metric_cache
+        currentCommit = depth_to_commit(depth)
+
+        # Get test environment.
+        effective_test_env = ci_test_env if useCiNamespace else test_env
+        if effective_test_env == None:
+            # This can happen when no best fit ci test is found.
+            return None
+
+        # Check for cached value.
+        cacheKeyA = (useCiNamespace, currentCommit)
+        cacheKeyB = (effective_test_env, name, metric, way)
+        if cacheKeyA in _commit_metric_cache:
+            return _commit_metric_cache[cacheKeyA].get(cacheKeyB)
+
+        # Cache miss.
+        # Calculate baselines from the current commit's git note.
+        # Note that the git note may contain data for other tests. All tests'
+        # baselines will be collected and cached for future use.
+        allCommitMetrics = get_perf_stats(
+            currentCommit,
+            namespace(useCiNamespace))
+
+        # Collect recorded values by cacheKeyB.
+        values_by_cache_key_b = {}
+        for perfStat in allCommitMetrics:
+            currentCacheKey = (perfStat.test_env, perfStat.test, \
+                perfStat.metric, perfStat.way)
+            currentValues = values_by_cache_key_b.setdefault(currentCacheKey, [])
+            currentValues.append(float(perfStat.value))
+
+        # Calculate the baseline (average of values) by cacheKeyB.
+        baseline_by_cache_key_b = {}
+        for currentCacheKey, currentValues in values_by_cache_key_b.items():
+            baseline_by_cache_key_b[currentCacheKey] = Baseline( \
+                PerfStat( \
+                    currentCacheKey[0],
+                    currentCacheKey[1],
+                    currentCacheKey[3],
+                    currentCacheKey[2],
+                    sum(currentValues) / len(currentValues)),
+                currentCommit,
+                depth)
+
+        # Save baselines to the cache.
+        _commit_metric_cache[cacheKeyA] = baseline_by_cache_key_b
+        return baseline_by_cache_key_b.get(cacheKeyB)
+
+    # Searches through previous commits, trying local then ci results for each commit.
+    def search(useCiNamespace, depth):
+        # Stop if reached the max search depth, or if
+        # there is an expected change at the child commit (depth-1). This is a
+        # subtlety: Metrics recorded on commit x incorporate the expected
+        # changes for commit x. Hence metrics from x are still a valid baseline,
+        # while older commits are not. This is why we check for expected changes
+        # on depth-1 rather than depth.
+        if depth >= BaselineSearchDepth or has_expected_change( \
+                depth_to_commit(depth - 1)):
+            return None
+
+        # Check for a metric on this commit.
+        current_metric = commit_metric(useCiNamespace, depth)
+        if current_metric != None:
+            return current_metric
+
+        # Metric is not available.
+        # If tried local, now try CI. Else move to the parent commit.
+        if not useCiNamespace:
+            return search(True, depth)
+        else:
+            return search(False, depth + 1)
+
+    # Start search from parent commit using local name space.
+    return search(False, 1)
+
+
 # Check test stats. This prints the results for the user.
 # actual: the PerfStat with actual value.
-# expected_val: the expected value (this should generally be derived from get_perf_stats())
+# baseline: the expected Baseline value (this should generally be derived from baseline_metric())
 # tolerance_dev: allowed deviation of the actual value from the expected value.
 # allowed_perf_changes: allowed changes in stats. This is a dictionary as returned by get_allowed_perf_changes().
 # force_print: Print stats even if the test stat was in the tolerance range.
 # Returns a (MetricChange, pass/fail object) tuple. Passes if the stats are within the expected value ranges.
-def check_stats_change(actual, expected_val, tolerance_dev, allowed_perf_changes = {}, force_print = False):
+def check_stats_change(actual, baseline, tolerance_dev, allowed_perf_changes = {}, force_print = False):
+    expected_val = baseline.perfStat.value
     full_name = actual.test + ' (' + actual.way + ')'

     lowerBound = trunc(           int(expected_val) * ((100 - float(tolerance_dev))/100))
@@ -256,7 +454,8 @@ def check_stats_change(actual, expected_val, tolerance_dev, allowed_perf_changes
     # Print errors and create pass/fail object.
     result = passed()
     if not change_allowed:
-        error = change + ' not allowed'
+        error = change + ' from ' + baseline.perfStat.test_env + \
+            ' baseline @ HEAD~' + str(baseline.commitDepth)
         print(actual.metric, error + ':')
         result = failBecause('stat ' + error, tag='stat')
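An aside, not part of the patch: the recursion in search() above visits candidate baselines in a fixed order. The few lines below restate that order iteratively as a reading aid; the function and variable names here are illustrative only.

    # Illustrative restatement (not from the patch) of the order in which
    # baseline_metric()'s search() consults git notes: at each ancestor depth the
    # local 'perf' notes are tried before the 'ci/perf' notes, and the walk stops
    # at BaselineSearchDepth or at the first ancestor whose child commit carries
    # an expected change for the test.
    def baseline_search_order(max_depth):
        for depth in range(1, max_depth):
            for use_ci in (False, True):   # local notes first, then CI notes
                yield (use_ci, depth)

    # list(baseline_search_order(3)) == [(False, 1), (True, 1), (False, 2), (True, 2)]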
diff --git a/testsuite/driver/runtests.py b/testsuite/driver/runtests.py
index 247a5cc330..f61ebbc6b1 100644
--- a/testsuite/driver/runtests.py
+++ b/testsuite/driver/runtests.py
@@ -379,18 +379,36 @@ else:
     new_metrics = [metric for (change, metric) in t.metrics if change == MetricChange.NewMetric]
     if any(new_metrics):
         if canGitStatus:
-            reason = 'the previous git commit doesn\'t have recorded metrics for the following tests.' + \
-                ' If the tests exist on the previous commit, then check it out and run the tests to generate the missing metrics.'
+            reason = 'a baseline (expected value) cannot be recovered from' + \
+                ' previous git commits. This may be due to HEAD having' + \
+                ' new tests or having expected changes, the presence of' + \
+                ' expected changes since the last run of the tests, and/or' + \
+                ' the latest test run being too old.'
+            fix = 'If the tests exist on the previous' + \
+                ' commit (And are configured to run with the same ways),' + \
+                ' then check out that commit and run the tests to generate' + \
+                ' the missing metrics. Alternatively, a baseline may be' + \
+                ' recovered from ci results once fetched:\n\n' + \
+                spacing + 'git fetch ' + \
+                'https://gitlab.haskell.org/ghc/ghc-performance-notes.git' + \
+                ' refs/notes/perf:refs/notes/' + Perf.CiNamespace
         else:
-            reason = 'this is not a git repo so the previous git commit\'s metrics cannot be loaded from git notes:'
+            reason = "this is not a git repo so the previous git commit's" + \
+                " metrics cannot be loaded from git notes:"
+            fix = ""
         print()
-        print(str_warn('New Metrics') + ' these metrics trivially pass because ' + reason)
-        print(spacing + ('\n' + spacing).join(set([metric.test for metric in new_metrics])))
+        print(str_warn('Missing Baseline Metrics') + \
+                ' these metrics trivially pass because ' + reason)
+        print(spacing + (' ').join(set([metric.test for metric in new_metrics])))
+        if fix != "":
+            print()
+            print(fix)

     # Inform of how to accept metric changes.
     if (len(t.unexpected_stat_failures) > 0):
         print()
-        print(str_info("Some stats have changed") + " If this is expected, allow changes by appending the git commit message with this:")
+        print(str_info("Some stats have changed") + " If this is expected, " + \
+            "allow changes by appending the git commit message with this:")
         print('-' * 25)
         print(Perf.allow_changes_string(t.metrics))
         print('-' * 25)
@@ -406,8 +424,9 @@ else:
     elif canGitStatus and any(stats):
         if is_worktree_dirty():
             print()
-            print(str_warn('Working Tree is Dirty') + ' performance metrics will not be saved.' + \
-                ' Commit changes or use --metrics-file to save metrics to a file.')
+            print(str_warn('Performance Metrics NOT Saved') + \
+                ' working tree is dirty. Commit changes or use ' + \
+                '--metrics-file to save metrics to a file.')
         else:
             Perf.append_perf_stat(stats)

diff --git a/testsuite/driver/testglobals.py b/testsuite/driver/testglobals.py
index de1d1e660a..b3c9a55131 100644
--- a/testsuite/driver/testglobals.py
+++ b/testsuite/driver/testglobals.py
@@ -250,11 +250,21 @@ class TestOptions:
         # extra files to copy to the testdir
         self.extra_files = []

-        # Map from metric to expectected value and allowed percentage deviation. e.g.
-        # { 'bytes allocated': (9300000000, 10) }
-        # To allow a 10% deviation from 9300000000 for the 'bytes allocated' metric.
+        # Map from metric to (function from way and commit to baseline value, allowed percentage deviation) e.g.
+        # { 'bytes allocated': (
+        #       lambda way commit:
+        #           ...
+        #           if way1: return None ...
+        #           elif way2: return 9300000000 ...
+        #           ...
+        #       , 10) }
+        # This means no baseline is available for way1. For way2, allow a 10%
+        # deviation from 9300000000.
         self.stats_range_fields = {}

+        # Is the test testing performance?
+        self.is_stats_test = False
+
         # Does this test the compiler's performance as opposed to the generated code.
         self.is_compiler_stats_test = False
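The comment block above documents the new shape of stats_range_fields: each metric now maps to a pair of (baseline function, allowed percentage deviation). A short sketch, not part of the patch, of how an entry is meant to be consumed; the test name 'T12345', the way 'normal', and the helper name fetch_baseline are made up, while Perf.baseline_metric, Perf.commit_hash and config.test_env are the names the patch itself uses (see the testlib.py diff that follows).

    # Illustrative sketch only, not from the patch. Assumes the driver context
    # where perf_notes is imported as Perf and config is the testsuite config.
    def fetch_baseline(way, head_commit, metric='bytes allocated'):
        return Perf.baseline_metric(head_commit, 'T12345', config.test_env, metric, way)

    stats_range_fields = {'bytes allocated': (fetch_baseline, 10)}

    baseline_fn, tolerance_dev = stats_range_fields['bytes allocated']
    baseline = baseline_fn('normal', Perf.commit_hash('HEAD'))
    if baseline is None:
        pass   # no baseline recoverable: the metric trivially passes as a new metric
    # otherwise the measured value must lie within tolerance_dev percent of
    # baseline.perfStat.value (see check_stats_change in perf_notes.py above).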
diff --git a/testsuite/driver/testlib.py b/testsuite/driver/testlib.py
index 710800b9f0..dd66340e5f 100644
--- a/testsuite/driver/testlib.py
+++ b/testsuite/driver/testlib.py
@@ -65,7 +65,7 @@ def isCompilerStatsTest():

 def isStatsTest():
     opts = getTestOpts()
-    return bool(opts.stats_range_fields)
+    return opts.is_stats_test


 # This can be called at the top of a file of tests, to set default test options
@@ -348,29 +348,18 @@ def testing_metrics():
 # measures the performance numbers of the compiler.
 # As this is a fairly rare case in the testsuite, it defaults to false to
 # indicate that it is a 'normal' performance test.
-def _collect_stats(name, opts, metric, deviation, is_compiler_stats_test=False):
+def _collect_stats(name, opts, metrics, deviation, is_compiler_stats_test=False):
     if not re.match('^[0-9]*[a-zA-Z][a-zA-Z0-9._-]*$', name):
         failBecause('This test has an invalid name.')

-    tests = Perf.get_perf_stats('HEAD^')
-
-    # Might have multiple metrics being measured for a single test.
-    test = [t for t in tests if t.test == name]
-
-    if tests == [] or test == []:
-        # There are no prior metrics for this test.
-        if isinstance(metric, str):
-            if metric == 'all':
-                for field in testing_metrics():
-                    opts.stats_range_fields[field] = None
-            else:
-                opts.stats_range_fields[metric] = None
-        if isinstance(metric, list):
-            for field in metric:
-                opts.stats_range_fields[field] = None
-
-        return
+    # Normalize metrics to a list of strings.
+    if isinstance(metrics, str):
+        if metrics == 'all':
+            metrics = testing_metrics()
+        else:
+            metrics = [metrics]
+
+    opts.is_stats_test = True
     if is_compiler_stats_test:
         opts.is_compiler_stats_test = True
@@ -379,24 +368,12 @@ def _collect_stats(name, opts, metric, deviation, is_compiler_stats_test=False):
     if config.compiler_debugged and is_compiler_stats_test:
         opts.skip = 1

-    # get the average value of the given metric from test
-    def get_avg_val(metric_2):
-        metric_2_metrics = [float(t.value) for t in test if t.metric == metric_2]
-        return sum(metric_2_metrics) / len(metric_2_metrics)
-
-    # 'all' is a shorthand to test for bytes allocated, peak megabytes allocated, and max bytes used.
-    if isinstance(metric, str):
-        if metric == 'all':
-            for field in testing_metrics():
-                opts.stats_range_fields[field] = (get_avg_val(field), deviation)
-            return
-        else:
-            opts.stats_range_fields[metric] = (get_avg_val(metric), deviation)
-            return
+    for metric in metrics:
+        def baselineByWay(way, target_commit, metric=metric):
+            return Perf.baseline_metric( \
+                target_commit, name, config.test_env, metric, way)

-    if isinstance(metric, list):
-        for field in metric:
-            opts.stats_range_fields[field] = (get_avg_val(field), deviation)
+        opts.stats_range_fields[metric] = (baselineByWay, deviation)

 # -----

@@ -1179,10 +1156,11 @@ def metric_dict(name, way, metric, value):
 # name: name of the test.
 # way: the way.
 # stats_file: the path of the stats_file containing the stats for the test.
-# range_fields
+# range_fields: see TestOptions.stats_range_fields
 # Returns a pass/fail object. Passes if the stats are within the expected value ranges.
 # This prints the results for the user.
 def check_stats(name, way, stats_file, range_fields):
+    head_commit = Perf.commit_hash('HEAD')
     result = passed()
     if range_fields:
         try:
@@ -1192,7 +1170,7 @@ def check_stats(name, way, stats_file, range_fields):
         stats_file_contents = f.read()
         f.close()

-        for (metric, range_val_dev) in range_fields.items():
+        for (metric, baseline_and_dev) in range_fields.items():
             field_match = re.search('\("' + metric + '", "([0-9]+)"\)', stats_file_contents)
             if field_match == None:
                 print('Failed to find metric: ', metric)
@@ -1205,14 +1183,15 @@ def check_stats(name, way, stats_file, range_fields):
             change = None

             # If this is the first time running the benchmark, then pass.
-            if range_val_dev == None:
+            baseline = baseline_and_dev[0](way, head_commit)
+            if baseline == None:
                 metric_result = passed()
                 change = MetricChange.NewMetric
             else:
-                (expected_val, tolerance_dev) = range_val_dev
+                tolerance_dev = baseline_and_dev[1]
                 (change, metric_result) = Perf.check_stats_change(
                     perf_stat,
-                    expected_val,
+                    baseline,
                     tolerance_dev,
                     config.allowed_perf_changes,
                     config.verbose >= 4)
@@ -1348,8 +1327,13 @@ def simple_run(name, way, prog, extra_run_opts):

     my_rts_flags = rts_flags(way)

+    # Collect stats if necessary:
+    # isStatsTest and not isCompilerStatsTest():
+    #   assume we are running a ghc compiled program. Collect stats.
+    # isStatsTest and way == 'ghci':
+    #   assume we are running a program via ghci. Collect stats.
     stats_file = name + '.stats'
-    if isStatsTest() and not isCompilerStatsTest():
+    if isStatsTest() and (not isCompilerStatsTest() or way == 'ghci'):
         stats_args = ' +RTS -V0 -t' + stats_file + ' --machine-readable -RTS'
     else:
         stats_args = ''

diff --git a/testsuite/driver/testutil.py b/testsuite/driver/testutil.py
index d5bd2f33c3..b73204fe43 100644
--- a/testsuite/driver/testutil.py
+++ b/testsuite/driver/testutil.py
@@ -19,7 +19,7 @@ def strip_quotes(s):
     return s.strip('\'"')

 def str_fail(s):
-    return '\033[1m\033[43m\033[31m' + s + '\033[0m'
+    return '\033[1m\033[31m' + s + '\033[0m'

 def str_pass(s):
     return '\033[1m\033[32m' + s + '\033[0m'
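To close, a worked example, not taken from the patch, of the tolerance window that check_stats_change (perf_notes.py above) builds around a recovered baseline. The figures 9300000000 and 10 are the illustrative ones from the testglobals.py comment; the lowerBound line is quoted from the diff, while the upperBound line is an assumption that the upper bound mirrors it with ceil (the module imports ceil, but that line is not shown in this diff).

    # Illustrative numbers only. With a baseline of 9,300,000,000 bytes
    # allocated and a 10% allowed deviation, the accepted window is roughly:
    from math import ceil, trunc
    expected_val, tolerance_dev = 9300000000, 10
    lowerBound = trunc(int(expected_val) * ((100 - float(tolerance_dev)) / 100))  # 8370000000
    upperBound = ceil(int(expected_val) * ((100 + float(tolerance_dev)) / 100))   # 10230000000 (assumed symmetric)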