21 files changed, 1020 insertions, 1737 deletions
diff --git a/.circleci/config.yml b/.circleci/config.yml
index f35690124b..f80b2b321b 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -18,7 +18,7 @@ aliases:
     # ideally we would simply set THREADS here instead of re-detecting it every
     # time we need it below. Unfortunately, there is no way to set an environment
     # variable with the result of a shell script.
-    SKIP_PERF_TESTS: YES
+    SKIP_PERF_TESTS: "NO"  # quoted: a bare NO is a boolean under YAML 1.1 rules
     VERBOSE: 2
   - &boot
     run:
@@ -32,6 +32,12 @@ aliases:
         include mk/flavours/\$(BuildFlavour).mk
         endif
         EOF
+  - &set_git_identity
+    run:
+      name: Set Git Identity  # repo-local identity (no --global), set right after checkout
+      command: |
+        git config user.email "ghc-circleci@haskell.org"
+        git config user.name "GHC CircleCI"
   - &configure_unix
     run:
       name: Configure
@@ -64,10 +70,16 @@ aliases:
       name: Test
       command: |
         mkdir -p test-results
-        make test THREADS=`mk/detect-cpu-count.sh` SKIP_PERF_TESTS=YES JUNIT_FILE=../../test-results/junit.xml
+        METRICS_FILE=$(mktemp)
+        echo "export METRICS_FILE=$METRICS_FILE" >> $BASH_ENV
+        make test THREADS=`mk/detect-cpu-count.sh` SKIP_PERF_TESTS=$SKIP_PERF_TESTS TEST_ENV=$TEST_ENV JUNIT_FILE=../../test-results/junit.xml METRICS_FILE=$METRICS_FILE
   - &store_test_results
     store_test_results:
       path: test-results
+  - &push_perf_note
+    run:
+      name: Push Performance Git Notes  # the script appends $METRICS_FILE to refs/notes/perf and pushes it
+      command: .circleci/push-test-metrics.sh
   - &slowtest
     run:
       name: Full Test
@@ -102,8 +114,10 @@ jobs:
     environment:
       <<: *buildenv
       GHC_COLLECTOR_FLAVOR: x86_64-linux
+      TEST_ENV: x86_64-linux
     steps:
       - checkout
+      - *set_git_identity
       - *prepare
       - *submodules
       - *boot
@@ -113,6 +127,7 @@ jobs:
       - *storeartifacts
       - *test
       - *store_test_results
+      - *push_perf_note
 
   "validate-x86_64-freebsd":
     resource_class: xlarge
@@ -122,8 +137,10 @@ jobs:
       TARGET: FreeBSD
       <<: *buildenv
       GHC_COLLECTOR_FLAVOR: x86_64-freebsd
+      TEST_ENV: x86_64-freebsd
     steps:
       - checkout
+      - *set_git_identity
       - *prepare
       - *submodules
       - *boot
@@ -133,6 +150,7 @@ jobs:
       - *storeartifacts
       - *test
       - *store_test_results
+      - *push_perf_note
 
   "validate-x86_64-darwin":
     macos:
@@ -147,8 +165,10 @@ jobs:
       # Build with in-tree GMP since this isn't available on OS X by default.
       CONFIGURE_OPTS: --with-intree-gmp
       <<: *buildenv
+      TEST_ENV: x86_64-darwin
     steps:
       - checkout
+      - *set_git_identity
       - *prepare
       - *submodules
       - *boot
@@ -158,6 +178,7 @@ jobs:
       - *storeartifacts
       - *test
       - *store_test_results
+      - *push_perf_note
 
   "validate-hadrian-x86_64-linux":
     resource_class: xlarge
@@ -167,6 +188,7 @@ jobs:
       <<: *buildenv
     steps:
       - checkout
+      - *set_git_identity
       - *prepare
       - *submodules
       - *boot
@@ -179,8 +201,10 @@ jobs:
       - image: ghcci/x86_64-linux:0.0.4
     environment:
       <<: *buildenv
+      TEST_ENV: x86_64-linux-unreg
     steps:
       - checkout
+      - *set_git_identity
       - *prepare
       - *submodules
       - *boot
@@ -188,6 +212,7 @@ jobs:
       - *make
       - *test
       - *store_test_results
+      - *push_perf_note
 
   "validate-x86_64-linux-llvm":
     resource_class: xlarge
@@ -196,6 +221,7 @@ jobs:
     environment:
       <<: *buildenv
       BUILD_FLAVOUR: perf-llvm
+      TEST_ENV: x86_64-linux-llvm
     steps:
       - run:
           name: Install LLVM
@@ -206,12 +232,14 @@ jobs:
           name: Verify that llc works
           command: llc
       - checkout
+      - *set_git_identity
       - *prepare
       - *submodules
       - *boot
       - *configure_unix
       - *make
       - *test
+      - *push_perf_note
 
   # Nightly build with -DDEBUG using devel2 flavour
   "validate-x86_64-linux-debug":
@@ -221,8 +249,11 @@ jobs:
     environment:
       BUILD_FLAVOUR: devel2
       <<: *buildenv
+      TEST_ENV: x86_64-linux-debug
+      SKIP_PERF_TESTS: "YES"  # quoted: a bare YES is a boolean under YAML 1.1 rules
     steps:
       - checkout
+      - *set_git_identity
       - *prepare
       - *submodules
       - *boot
@@ -230,6 +261,7 @@ jobs:
       - *make
       - *test
       - *store_test_results
+      - *push_perf_note
 
   "validate-i386-linux":
     resource_class: xlarge
@@ -238,8 +270,10 @@ jobs:
     environment:
       <<: *buildenv
       GHC_COLLECTOR_FLAVOR: i386-linux
+      TEST_ENV: i386-linux
     steps:
       - checkout
+      - *set_git_identity
       - *prepare
       - *submodules
       - *boot
@@ -249,6 +283,7 @@ jobs:
       - *storeartifacts
       - *test
       - *store_test_results
+      - *push_perf_note
 
   "validate-x86_64-fedora":
     resource_class: xlarge
@@ -257,8 +292,10 @@ jobs:
     environment:
       <<: *buildenv
       GHC_COLLECTOR_FLAVOR: x86_64-fedora
+      TEST_ENV: x86_64-fedora
     steps:
       - checkout
+      - *set_git_identity
       - *prepare
       - *submodules
       - *boot
@@ -268,6 +305,7 @@ jobs:
       - *storeartifacts
       - *test
       - *store_test_results
+      - *push_perf_note
 
   "slow-validate-x86_64-linux":
     resource_class: xlarge
@@ -285,6 +323,7 @@ jobs:
       - *make
       - *slowtest
       - *store_test_results
+      - *push_perf_note
 
 workflows:
   version: 2
diff --git a/.circleci/push-test-metrics.sh b/.circleci/push-test-metrics.sh
new file mode 100755
index 0000000000..4ea6958d99
--- /dev/null
+++ b/.circleci/push-test-metrics.sh
@@ -0,0 +1,46 @@
+#!/usr/bin/env bash
+# vim: sw=2 et
+set -euo pipefail
+
+fail() {  # Print the arguments as an error message on stderr, then abort the script.
+  printf 'ERROR: %s\n' "$*" >&2
+  exit 1
+}
+
+GHC_ORIGIN=git@git.haskell.org:ghc
+
+# Add git.haskell.org as a known host.
+echo "|1|F3mPVCE55+KfApNIMYQ3Dv39sGE=|1bRkvJEJhAN2R0LE/lAjFCEJGl0= ecdsa-sha2-nistp256 AAAAE2VjZHNhLXNoYTItbmlzdHAyNTYAAAAIbmlzdHAyNTYAAABBBBUZS9jGBkE5UzpSo6irnIgcQcfzvbuIOsFc8+N61FwtZncRntbaKPuUimOFPgeaUZLl6Iajz6IIs7aduU0/v+I=" >> ~/.ssh/known_hosts
+echo "|1|2VUMjYSRVpT2qJPA0rA9ap9xILY=|5OThkI4ED9V0J+Es7D5FOD55Klk= ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQC+3TLluLAO4lkW60W+N2DFkS+WoRFGqLwHzgd1ifxG9TIm31wChPY3E/hgMnJmgGqWCF4UDUemmyCycEaL7FtKfzjTAclg9EfpQnozyE3T5hIo2WL7SN5O8ttG/bYGuDnn14jLnWwJyN4oz/znWFiDG9e2Oc9YFNlQ+PK8ae5xR4gqBB7EOoj9J1EiPqG2OXRr5Mei3TLsRDU6fnz/e4oFJpKWWeN6M63oePv0qoaGjxcrATZUWsuWrxVMmYo9kP1xRuFJbAUw2m4uVP+793SW1zxySi1HBMtJG+gCDdZZSwYbkV1hassLWBHv1qPttncfX8Zek3Z3VolaTmfWJTo9" >> ~/.ssh/known_hosts
+
+# Check that a git notes dont already exist.
+# This is a percausion as we reset refs/notes/perf and we want to avoid data loss.
+if [ $(git notes --ref=perf list | wc -l) -ne 0 ]
+then
+  fail "Found an existing git note on HEAD. Expected no git note."
+fi
+
+# Assert that the METRICS_FILE exists and can be read.
+if [ "$METRICS_FILE" = "" ] || ! [ -r $METRICS_FILE ]
+then
+  fail "Metrics file not found: $METRICS_FILE"
+fi
+
+# Reset the local perf notes to the remote's (forced fetch), append the metrics file to HEAD's note, then push.
+# This is favoured over a git notes merge as it avoids potential data loss/duplication from the merge strategy.
+function reset_append_note_push {
+  git fetch -f "$GHC_ORIGIN" refs/notes/perf:refs/notes/perf || true
+  echo "git notes --ref=perf append -F $METRICS_FILE HEAD"
+  git notes --ref=perf append -F "$METRICS_FILE" HEAD
+  git push "$GHC_ORIGIN" refs/notes/perf
+}
+
+# Push the metrics file as a git note. This may fail if another task pushes a note first. In that case
+# the latest note is fetched and appended again; after MAX_RETRY failed attempts the script gives up.
+MAX_RETRY=20
+until reset_append_note_push
+do
+  MAX_RETRY=$((MAX_RETRY - 1))  # plain assignment: "((MAX_RETRY--))" exits non-zero once the old value is 0
+  [ "$MAX_RETRY" -gt 0 ] || fail "Failed to push git notes; giving up."
+  printf '\n%s\n' "Failed to push git notes. Fetching, appending, and retrying..."
+done
diff --git a/libraries/base/tests/all.T b/libraries/base/tests/all.T
index 90af9020d6..aaf4aa2789 100644
--- a/libraries/base/tests/all.T
+++ b/libraries/base/tests/all.T
@@ -176,12 +176,7 @@ test('topHandler04',
 
 
 test('T8766',
-        [ stats_num_field('bytes allocated',
-                          [ (wordsize(64), 16828144, 5)
-	# with GHC-7.6.3: 83937384 (but faster execution than the next line)
-	# before:         58771216 (without call-arity-analysis)
-	# expected value: 16828144 (2014-01-14)
-                          , (wordsize(32), 8433644, 5) ])
+        [ collect_stats('bytes allocated',5)
         , only_ways(['normal'])],
       compile_and_run,
       ['-O'])
@@ -208,9 +203,7 @@ test('T8089',
 test('T8684', expect_broken(8684), compile_and_run, [''])
 test('T9826',normal, compile_and_run,[''])
 test('T9848',
-        [ stats_num_field('bytes allocated',
-                          [ (wordsize(64), 51840, 20)
-                          , (wordsize(32), 47348, 20) ])
+        [ collect_stats('bytes allocated')
         , only_ways(['normal'])],
       compile_and_run,
       ['-O'])
@@ -223,10 +216,7 @@ test('lazySTexamples', normal, compile_and_run, [''])
 test('T11760', req_smp, compile_and_run, ['-threaded -with-rtsopts=-N2'])
 test('T12874', normal, compile_and_run, [''])
 test('T13191',
-        [ stats_num_field('bytes allocated',
-                          [ (wordsize(64), 185943272, 5) ])
-        # with GHC-8.1 before liftA2 change: 325065128
-        # GHC-8.1 with custom liftA2:        185943272
+        [ collect_stats('bytes allocated', 5)
         , only_ways(['normal'])],
       compile_and_run,
       ['-O'])
@@ -234,7 +224,7 @@ test('T13525', when(opsys('mingw32'), skip), compile_and_run, [''])
 test('T13097', normal, compile_and_run, [''])
 test('functorOperators', normal, compile_and_run, [''])
 test('T3474',
-     [stats_num_field('max_bytes_used', [ (wordsize(64), 44504, 5) ]),
+     [collect_stats('max_bytes_used',5),
       only_ways(['normal'])],
      compile_and_run, ['-O'])
 test('T14425', normal, compile_and_run, [''])
diff --git a/testsuite/driver/README.md b/testsuite/driver/README.md
new file mode 100644
index 0000000000..9324fd3df6
--- /dev/null
+++ b/testsuite/driver/README.md
@@ -0,0 +1,133 @@
+GHC Driver Readme
+=================
+
+Greetings and well met.  If you are reading this, I can only assume that you
+are likely interested in working on the testsuite in some capacity.  For more
+detailed documentation, please see [here][1].
+
+## ToC
+
+1. Entry points of the testsuite performance tests
+2. Overview of how the performance test bits work
+3. Quick overview of program parts
+4. How to use the comparison tool
+5. Quick answers for "how do I do X"?
+
+
+## Entry Points of the testsuite performance tests
+
+The testsuite has two main entry points depending on which perspective you
+approach it.  From the perspective of the test writer, the entry point is the
+collect_stats function called in *.T files.  This function is declared in
+testlib.py, with its supporting infrastructure in perf_notes.py.  The purpose of this
+function is to tell the test driver what metrics to compare when processing
+the test. From the perspective of running the test-suite e.g. via make, its
+entry point is the runtests.py file. That file contains the main logic for
+running the individual tests, collecting information, handling failure, and
+outputting the final results.
+
+## Overview of how the performance test bits work.
+During a Haskell Summer of Code project, an intern went through and revamped
+most of the performance test code; as such, there have been a few changes to it
+that might be unusual to anyone previously familiar with the testsuite. One of
+the biggest immediate benefits is that all platform differences, compiler
+differences, and things such as that are not necessary to be considered by the
+test writer anymore. This is due to the fact that the test comparison relies
+entirely on locally collected metrics on the testing machine.
+
+As such, it is perfectly sufficient to write `collect_stats('all',20)` in the
+".T" files to measure the 3 potential stats that can be collected for that test
+and automatically test them for regressions, failing if there is more than a 20%
+change in any direction. In fact, even that is not necessary as
+`collect_stats()` defaults to 'all', and 20% deviation allowed.
+
+The function `collect_compiler_stats()` is completely equivalent in every way to
+`collect_stats` except that it measures the performance of the compiler itself
+rather than the performance of the code generated by the compiler. See the
+implementation of collect_stats in /driver/testlib.py for more information.
+
+If the performance of a test is improved so much that the test fails, the value
+will still be recorded. The warning that will be emitted is merely a precaution
+so that the programmer can double-check that they didn't introduce a bug;
+something that might be suspicious if the test suddenly improves by 70%,
+for example.
+
+Performance metrics for performance tests are now stored in git notes under the
+namespace 'perf'.  The format of the git note file is that each line represents
+a single metric for a particular test: `$test_env $test_name $test_way
+$metric_measured $value_collected` (delimited by tabs).
+
+One can view the maximum deviation a test allows by looking inside its
+respective all.T file; additionally, if one sets the verbosity level of the
+test-suite to a value >= 4, they will see a good amount of output per test
+detailing all the information about values.  This information will also print
+if the test falls outside of the allowed bounds.  (See the check_stats_change
+function in /driver/perf_notes.py for the exact formatting of the message.)
+
+The git notes are only appended to by the testsuite in a single atomic python
+subprocess at the end of the test run; if the run is canceled at any time, the
+notes will not be written.  The note appending command will be retried up to 4
+times in the event of a failure (such as one happening due to a lock on the
+repo) although this is never anticipated to happen.  If, for some reason, the 5
+attempts were not enough, an error message will be printed out.  Further, there
+is no current process or method for stripping duplicates, updating values, etc,
+so if the testsuite is run multiple times per commit there will be multiple
+values in the git notes corresponding to the tests ran.  In this case the
+average value is used.
+
+## Quick overview of program parts
+
+The relevant bits of the directory tree are as such:
+
+```
+├── driver                   -- Testsuite driver directory
+    ├── junit.py             -- Contains code implementing JUnit features.
+    ├── kill_extra_files.py  -- Some of the uglier implementation details.
+    ├── perf_notes.py        -- Comparison tool and performance tests.
+    ├── runtests.py          -- Main entrypoint for program; runs tests.
+    ├── testglobals.py       -- Global data structures and objects.
+    ├── testlib.py           -- Bulk of implementation is in here.
+    └── testutil.py          -- Misc helper functions.
+├── mk
+    └── test.mk              -- Master makefile for running tests.
+├── tests                    -- Main tests directory.
+```
+
+## How to Use the Comparison Tool
+
+The comparison tool exists in `/driver/perf_notes.py`.
+
+When the testsuite is run, the performance metrics of the performance tests are
+saved automatically in a local git note that will be attached to the commit.
+The comparison tool is designed to help analyze performance metrics across
+commits using this performance information.
+
+Currently, it can only be run by executing the file directly, like so:
+```
+$ python3 perf_notes.py (arguments go here)
+```
+
+If you run `perf_notes.py -h` you will see a description of all of the
+arguments and how to use them.  The optional arguments exist to filter the
+output to include only commits that you're interested in.  The most typical
+usage of this tool will likely be running `perf_notes.py HEAD 'HEAD~1' '(commit
+hash)' ...`
+
+The way the performance metrics are stored in git notes remains strictly local
+to the machine; as such, performance metrics will not exist for a commit until
+you checkout that commit and run the testsuite (or test).
+
+## Quick Answers for "How do I do X?"
+
+* Q: How do I add a flag to "make test" to extend the testsuite functionality?
+    1. Add the flag in the appropriate global object in testglobals.py
+    2. Add an argument to the parser in runtests.py that sets the flag
+    3. Go to the `testsuite/mk/test.mk` file and add a new ifeq (or ifneq)
+        block. I suggest adding the block around line 200.
+* Q: How do I modify how performance tests work?
+    * That functionality resides in perf_notes.py which has pretty good
+      in-code documentation.
+    * Additionally, one will want to look at `compile_and_run`, `simple_run`,
+      and `simple_build` in testutil.py
+
+  [1]: http://ghc.haskell.org/trac/ghc/wiki/Building/RunningTests
diff --git a/testsuite/driver/perf_notes.py b/testsuite/driver/perf_notes.py
new file mode 100644
index 0000000000..f162164e3e
--- /dev/null
+++ b/testsuite/driver/perf_notes.py
@@ -0,0 +1,382 @@
+#!/usr/bin/env python3
+
+#
+# (c) Jared Weakly 2017
+#
+# This file will be a utility to help facilitate the comparison of performance
+# metrics across arbitrary commits. The file will produce a table comparing
+# metrics between measurements taken for given commits in the environment
+# (which defaults to 'local' if not given by --test-env).
+#
+
+import argparse
+import re
+import subprocess
+import time
+
+from collections import namedtuple
+from math import ceil, trunc
+
+from testutil import passed, failBecause
+
+
+#
+# Some data access functions. At the moment this uses git notes.
+#
+
+# The metrics (a.k.a stats) are named tuples, PerfStat, in this form:
+#
+# ( test_env : 'val',      # Test environment.
+#   test     : 'val',      # Name of the test 
+#   way      : 'val',
+#   metric   : 'val',      # Metric being recorded
+#   value    : 'val',      # The statistic result e.g. runtime
+# )
+
+# All the fields of a metric (excluding commit field).
+PerfStat = namedtuple('PerfStat', ['test_env','test','way','metric','value'])
+
+class MetricChange:  # String constants naming how a metric changed relative to its expected value.
+    NewMetric = 'NewMetric'  # not used in this file; presumably a metric with no previous value — TODO confirm
+    NoChange = 'NoChange'    # the value is within the tolerance bounds (see check_stats_change)
+    Increase = 'Increase'    # the value is above the upper bound
+    Decrease = 'Decrease'    # the value is below the lower bound
+
+def parse_perf_stat(stat_str):
+    field_vals = stat_str.strip('\t').split('\t')
+    return PerfStat(*field_vals)
+
+# Get all recorded (in a git note) metrics for a given commit.
+# Returns an empty array if the note is not found.
+def get_perf_stats(commit='HEAD', namespace='perf'):
+    try:
+        log = subprocess.check_output(['git', 'notes', '--ref=' + namespace, 'show', commit], stderr=subprocess.STDOUT).decode('utf-8')
+    except subprocess.CalledProcessError:
+        return []
+
+    log = log.strip('\n').split('\n')
+    log = list(filter(None, log))
+    log = [parse_perf_stat(stat_str) for stat_str in log]
+    return log
+
+
+# Get allowed changes to performance. This is extracted from the commit message of
+# the given commit in this form:
+#     Metric  (Increase | Decrease)  ['metric' | \['metrics',..\]]  [\((test_env|way)='abc',...\)]: TestName01, TestName02, ...
+# Returns a *dictionary* from test name to a *list* of items of the form:
+#   {
+#           'direction': either 'Increase' or 'Decrease,
+#           'metrics': ['metricA', 'metricB', ...],
+#           'opts': {
+#                   'optionA': 'string value',
+#                   'optionB': 'string value',
+#                   ...
+#               }
+#   }
+def get_allowed_perf_changes(commit='HEAD'):
+    commitByteStr = subprocess.check_output(['git', '--no-pager', 'log', '-n1', '--format=%B', commit])
+    return parse_allowed_perf_changes(commitByteStr.decode())
+
+def parse_allowed_perf_changes(commitMsg):  # Extract the "Metric Increase/Decrease ..." statements from a commit message.
+    # Helper regex. Non-capturing unless postfixed with Cap.
+    s = r"(?:\s*\n?\s+)"                                    # Space, possible new line with an indent.
+    qstr = r"(?:'(?:[^'\\]|\\.)*')"                         # Quoted string.
+    qstrCap = r"(?:'((?:[^'\\]|\\.)*)')"                    # Quoted string. Captures the string without the quotes.
+    innerQstrList = r"(?:"+qstr+r"(?:"+s+r"?,"+s+r"?"+qstr+r")*)?"     # Inside of a list of strings.
+    qstrList = r"(?:\["+s+r"?"+innerQstrList+s+r"?\])"      # A list of strings (using box brackets).
+
+    exp = (r"^Metric"
+        +s+r"(Increase|Decrease)"
+        +s+r"?("+qstr+r"|"+qstrList+r")?"                   # Metric or list of metrics.
+        +s+r"?(\(" + r"(?:[^')]|"+qstr+r")*" + r"\))?"      # Options surrounded by parentheses (parentheses are allowed inside quoted strings).
+        +s+r"?:?"                                           # Optional ":"
+        +s+r"?((?:(?!\n\n)(?!\n[^\s])(?:.|\n))*)"           # Test names. Stop parsing on empty or non-indented new line.
+        )
+
+    matches = re.findall(exp, commitMsg, re.M)
+    changes = {}
+    for (direction, metrics_str, opts_str, tests_str) in matches:
+        tests = re.findall(r"(\w+)", tests_str)             # Every run of word characters counts as one test name.
+        for test in tests:
+            changes.setdefault(test, []).append({
+                'direction': direction,
+                'metrics': re.findall(qstrCap, metrics_str),
+                'opts': dict(re.findall(r"(\w+)"+s+r"?="+s+r"?"+qstrCap, opts_str))
+            })
+
+    return changes
+
+# Calculates a suggested string to append to the git commit in order to accept the
+# given changes.
+# changes: [(MetricChange, PerfStat)]
+def allow_changes_string(changes):
+    Dec = MetricChange.Decrease
+    Inc = MetricChange.Increase
+
+    # We only care about increase / decrease metrics.
+    changes = [change for change in changes if change[0] in [Inc, Dec]]
+
+    # Map tests to a map from change direction to metrics.
+    test_to_dir_to_metrics = {}
+    for (change, perf_stat) in changes:
+        change_dir_to_metrics = test_to_dir_to_metrics.setdefault(perf_stat.test, { Inc: [], Dec: [] })
+        change_dir_to_metrics[change].append(perf_stat.metric)
+
+    # Split into 3 groups.
+    # Tests where all changes are *increasing*.
+    # Tests where all changes are *decreasing*.
+    # Tests where changes are *mixed* increasing and decreasing.
+    groupDec = []
+    groupInc = []
+    groupMix = []
+    for (test, decsAndIncs) in test_to_dir_to_metrics.items():
+        decs = decsAndIncs[Dec]
+        incs = decsAndIncs[Inc]
+        if decs and incs:
+            groupMix.append(test)
+        elif not decs:
+            groupInc.append(test)
+        else:
+            groupDec.append(test)
+
+    msgs = []
+    nltab = '\n    '
+
+    # Decreasing group.
+    if groupDec:
+        msgs.append('Metric Decrease:' + nltab + nltab.join(groupDec))
+
+    # Increasing group.
+    if groupInc:
+        msgs.append('Metric Increase:' + nltab + nltab.join(groupInc))
+
+    # Mixed group.
+    if groupMix:
+        # Split mixed group tests by decrease/increase, then by metric.
+        dir_to_metric_to_tests = {
+                Dec: {},
+                Inc: {}
+            }
+        for test in groupMix:
+            for change_dir, metrics in test_to_dir_to_metrics[test].items():
+                for metric in metrics:
+                    dir_to_metric_to_tests[change_dir].setdefault(metric, []).append(test)
+
+        for change_dir in [Dec, Inc]:
+            metric_to_tests = dir_to_metric_to_tests[change_dir]
+            for metric in sorted(metric_to_tests.keys()):
+                tests = metric_to_tests[metric]
+                msgs.append('Metric ' + change_dir + ' \'' + metric + '\':' + nltab + nltab.join(tests))
+
+    return '\n\n'.join(msgs)
+
+# Formats a list of metrics into a string. Used e.g. to save metrics to a file or git note.
+def format_perf_stat(stats):
+    # If a single stat, convert to a singleton list.
+    if not isinstance(stats, list):
+        stats = [stats]
+
+    return "\n".join(["\t".join([str(stat_val) for stat_val in stat]) for stat in stats])
+
+# Appends a list of metrics to the git note of the given commit.
+# Tries up to max_tries times to write to git notes should it fail for some reason.
+# Each retry will wait 1 second.
+# Returns True if the note was successfully appended.
+def append_perf_stat(stats, commit='HEAD', namespace='perf', max_tries=5):
+    # Append to git note
+    print('Appending ' + str(len(stats)) + ' stats to git notes.')
+    stats_str = format_perf_stat(stats)
+    def try_append():
+            try:
+                return subprocess.check_output(['git', 'notes', '--ref=' + namespace, 'append', commit, '-m', stats_str])
+            except subprocess.CalledProcessError:
+                return b'Git - fatal'
+
+    tries = 0
+    while tries < max_tries:
+        if not b'Git - fatal' in try_append():
+            return True
+        tries += 1
+        time.sleep(1)
+
+    print("\nAn error occured while writing the performance metrics to git notes.\n \
+	            This is usually due to a lock-file existing somewhere in the git repo.")
+
+    return False
+
+# Check test stats. This prints the results for the user.
+# actual: the PerfStat with actual value.
+# expected_val: the expected value (this should generally be derived from get_perf_stats())
+# tolerance_dev: allowed deviation of the actual value from the expected value, in percent.
+# allowed_perf_changes: allowed changes in stats. This is a dictionary as returned by get_allowed_perf_changes().
+# force_print: Print stats even if the test stat was in the tolerance range.
+# Returns a (MetricChange, pass/fail object) tuple. Passes if the value is within the bounds or the change is explicitly allowed.
+def check_stats_change(actual, expected_val, tolerance_dev, allowed_perf_changes = {}, force_print = False):
+    full_name = actual.test + ' (' + actual.way + ')'
+
+    lowerBound = trunc(           int(expected_val) * ((100 - float(tolerance_dev))/100))
+    upperBound = trunc(0.5 + ceil(int(expected_val) * ((100 + float(tolerance_dev))/100)))
+
+    actual_dev = round(((float(actual.value) * 100)/ int(expected_val)) - 100, 1)  # NOTE(review): raises ZeroDivisionError when int(expected_val) is 0
+
+    # Find the direction of change. NOTE(review): compares actual.value with ints directly; assumes it is numeric, not a str — confirm with callers.
+    change = MetricChange.NoChange
+    if actual.value < lowerBound:
+        change = MetricChange.Decrease
+    elif actual.value > upperBound:
+        change = MetricChange.Increase
+
+    # Is the change allowed? NoChange always is; other directions need a matching allow statement for this test.
+    allowed_change_directions =  [MetricChange.NoChange] + [ allow_stmt['direction']
+            for allow_stmt in allowed_perf_changes.get(actual.test, [])
+
+            # The list of metrics is not specified, or the metric is in the list of metrics.
+            if not allow_stmt['metrics'] or actual.metric in allow_stmt['metrics']
+
+            # way/test_env are not specified, or match the actual way/test_env.
+            if ((not 'way'      in allow_stmt['opts'].keys()) or actual.way      == allow_stmt['opts']['way'])
+            if ((not 'test_env' in allow_stmt['opts'].keys()) or actual.test_env == allow_stmt['opts']['test_env'])
+        ]
+    change_allowed = change in allowed_change_directions
+
+    # Print errors and create pass/fail object.
+    result = passed()
+    if not change_allowed:
+        error = change + ' not allowed'
+        print(actual.metric, error + ':')
+        result = failBecause('stat ' + error, tag='stat')
+
+    if not change_allowed or force_print:
+        length = max(len(str(x)) for x in [expected_val, lowerBound, upperBound, actual.value])  # column width for right-aligning the values
+
+        def display(descr, val, extra):
+            print(descr, str(val).rjust(length), extra)
+
+        display('    Expected    ' + full_name + ' ' + actual.metric + ':', expected_val, '+/-' + str(tolerance_dev) + '%')
+        display('    Lower bound ' + full_name + ' ' + actual.metric + ':', lowerBound, '')
+        display('    Upper bound ' + full_name + ' ' + actual.metric + ':', upperBound, '')
+        display('    Actual      ' + full_name + ' ' + actual.metric + ':', actual.value, '')
+        if actual.value != expected_val:
+            display('    Deviation   ' + full_name + ' ' + actual.metric + ':', actual_dev, '%')
+
+    return (change, result)
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--test-env",
+                        help="The given test environment to be compared.")
+    parser.add_argument("--test-name",
+                        help="If given, filters table to include only \
+                        tests matching the given regular expression.")
+    parser.add_argument("--add-note", nargs=3,
+                        help="Development only. --add-note N commit seed \
+                        Adds N fake metrics to the given commit using the random seed.")
+    parser.add_argument("commits", nargs=argparse.REMAINDER,
+                        help="The rest of the arguments will be the commits that will be used.")
+    args = parser.parse_args()
+
+    env = 'local'
+    name = re.compile('.*')
+    # metrics is a tuple (str commit, PerfStat stat)
+    CommitAndStat = namedtuple('CommitAndStat', ['commit', 'stat'])
+    metrics = []
+    singleton_commit = len(args.commits) == 1
+
+    #
+    # Main logic of the program when called from the command-line.
+    #
+
+    if args.commits:
+        for c in args.commits:
+            metrics += [CommitAndStat(c, stat) for stat in get_perf_stats(c)]
+
+    if args.test_env:
+        metrics = [test for test in metrics if test.stat.test_env == args.test_env]
+
+    if args.test_name:
+        nameRe = re.compile(args.test_name)
+        metrics = [test for test in metrics if nameRe.search(test.test)]
+
+    if args.add_note:
+        def note_gen(n, commit, delta=''):
+            note = []
+            # Generates simple fake data. Likely not comprehensive enough to catch all edge cases.
+            if not delta:
+                note.extend([PerfStat('local', 'T'+ str(i*100), 'some_way', 'some_field', str(i*1000)) for i in range(1,int(int(n)/2)+1)])
+                note.extend([PerfStat('non-local', 'W'+ str(i*100), 'other_way', 'other_field', str(i*100)) for i in range(int(int(n)/2)+1,int(n)+1)])
+            if delta:
+                hu = abs(hash(delta))
+                hv = abs(hash(hu))
+                u = int(hu % 100)
+                v = int(hv % 10)
+                note.extend([PerfStat('local', 'T'+ str(i*100), 'some_way', 'some_field', str(i*u)) for i in range(1,int(int(n)/2)+1)])
+                note.extend([PerfStat('non-local', 'W'+ str(i*100), 'other_way', 'other_field', str(i*v)) for i in range(int(int(n)/2)+1,int(n)+1)])
+
+            append_perf_stat(note, commit)
+
+        note_gen(args.add_note[0],args.add_note[1],args.add_note[2])
+
+    #
+    # String utilities for pretty-printing
+    #
+
+    row_fmt = '{:18}' * len(args.commits)
+    commits = row_fmt.format(*[c[:10] for c in args.commits])
+
+    def cmtline(insert):
+        return row_fmt.format(*[insert for c in args.commits]).strip()
+
+    def header(unit):
+        first_line = "{:27}{:30}".format('    ','      ') + cmtline(unit)
+        second_line = ("{:27}{:30}".format('Test','Metric') + commits).strip()
+
+        # Test   Metric   c1   c2   c3 ...
+        print("-" * (len(second_line)+1))
+        print(first_line)
+        print(second_line)
+        print("-" * (len(second_line)+1))
+
+    def commit_string(test, flag):
+        def delta(v1, v2):
+            return round((100 * (v1 - v2)/v2),2)
+
+        # Get the average value per commit (or None if that commit contains no metrics).
+        # Note: if the test environment is not set, this will combine metrics from all test environments.
+        averageValuesOrNones = []
+        for commit in args.commits:
+            values = [float(t.stat.value) for t in metrics if t.commit == commit and t.stat.test == test]
+            if values == []:
+                averageValuesOrNones.append(None)
+            else:
+                averageValuesOrNones.append(sum(values) / len(values))
+
+        if flag == 'metrics':
+            strings = [str(v) if v != None else '-' for v in averageValuesOrNones]
+        if flag == 'percentages':
+            # If the baseline commit has no stats, then we can not produce any percentages.
+            baseline = averageValuesOrNones[0]
+            if baseline == None:
+                strings = ['-' for v in averageValuesOrNones]
+            else:
+                baseline = float(baseline)
+                strings = ['-' if val == None else str(delta(baseline,float(val))) + '%' for val in averageValuesOrNones]
+
+        return row_fmt.format(*strings).strip()
+
+    #
+    # The pretty-printed output
+    #
+
+    header('commit')
+    # Printing out metrics.
+    all_tests = sorted(set([(test.stat.test, test.stat.metric) for test in metrics]))
+    for test, metric in all_tests:
+        print("{:27}{:30}".format(test, metric) + commit_string(test,'metrics'))
+
+    # Has no meaningful output if there is no commit to compare to.
+    if not singleton_commit:
+        header('percent')
+
+        # Printing out percentages.
+        for test, metric in all_tests:
+            print("{:27}{:30}".format(test, metric) + commit_string(test,'percentages'))
+\ No newline at end of file
diff --git a/testsuite/driver/runtests.py b/testsuite/driver/runtests.py
index b956239d2a..fb3fe6ad54 100644
--- a/testsuite/driver/runtests.py
+++ b/testsuite/driver/runtests.py
@@ -23,8 +23,9 @@ import traceback
 # So we import it here first, so that the testsuite doesn't appear to fail.
 import subprocess
 
-from testutil import getStdout, Watcher
+from testutil import getStdout, Watcher, str_warn, str_info
 from testglobals import getConfig, ghc_env, getTestRun, TestOptions, brokens
+from perf_notes import MetricChange
 from junit import junit
 
 # Readline sometimes spews out ANSI escapes for some values of TERM,
@@ -43,11 +44,13 @@ def signal_handler(signal, frame):
 # cmd-line options
 
 parser = argparse.ArgumentParser(description="GHC's testsuite driver")
+perf_group = parser.add_mutually_exclusive_group()
 
 parser.add_argument("-e", action='append', help="A string to execute from the command line.")
 parser.add_argument("--config-file", action="append", help="config file")
 parser.add_argument("--config", action='append', help="config field")
 parser.add_argument("--rootdir", action='append', help="root of tree containing tests (default: .)")
+parser.add_argument("--metrics-file", help="file in which to save (append) the performance test metrics. If omitted, git notes will be used.")
 parser.add_argument("--summary-file", help="file in which to save the (human-readable) summary")
 parser.add_argument("--no-print-summary", action="store_true", help="should we print the summary?")
 parser.add_argument("--only", action="append", help="just this test (can be give multiple --only= flags)")
@@ -55,23 +58,32 @@ parser.add_argument("--way", action="append", help="just this way")
 parser.add_argument("--skipway", action="append", help="skip this way")
 parser.add_argument("--threads", type=int, help="threads to run simultaneously")
 parser.add_argument("--verbose", type=int, choices=[0,1,2,3,4,5], help="verbose (Values 0 through 5 accepted)")
-parser.add_argument("--skip-perf-tests", action="store_true", help="skip performance tests")
 parser.add_argument("--junit", type=argparse.FileType('wb'), help="output testsuite summary in JUnit format")
+parser.add_argument("--test-env", default='local', help="Override default chosen test-env.")
+perf_group.add_argument("--skip-perf-tests", action="store_true", help="skip performance tests")
+perf_group.add_argument("--only-perf-tests", action="store_true", help="Only do performance tests")
 
 args = parser.parse_args()
 
-for e in args.e:
-    exec(e)
+if args.e:
+    for e in args.e:
+        exec(e)
 
-for arg in args.config_file:
-    exec(open(arg).read())
+if args.config_file:
+    for arg in args.config_file:
+        exec(open(arg).read())
 
-for arg in args.config:
-    field, value = arg.split('=', 1)
-    setattr(config, field, value)
+if args.config:
+    for arg in args.config:
+        field, value = arg.split('=', 1)
+        setattr(config, field, value)
 
 all_ways = config.run_ways+config.compile_ways+config.other_ways
-config.rootdirs = args.rootdir
+
+if args.rootdir:
+    config.rootdirs = args.rootdir
+
+config.metrics_file = args.metrics_file
 config.summary_file = args.summary_file
 config.no_print_summary = args.no_print_summary
 
@@ -104,7 +116,12 @@ if args.threads:
 
 if args.verbose is not None:
     config.verbose = args.verbose
+
 config.skip_perf_tests = args.skip_perf_tests
+config.only_perf_tests = args.only_perf_tests
+
+if args.test_env:
+    config.test_env = args.test_env
 
 config.cygwin = False
 config.msys = False
@@ -223,6 +240,14 @@ if config.timeout == -1:
 
 print('Timeout is ' + str(config.timeout))
 
+# Try to get the allowed performance changes from the HEAD git commit message.
+try:
+    config.allowed_perf_changes = Perf.get_allowed_perf_changes()
+except subprocess.CalledProcessError:
+    print('Failed to get allowed metric changes from the HEAD git commit message.')
+
+print('Allowed performance changes: ' + str(len(config.allowed_perf_changes)))
+
 # -----------------------------------------------------------------------------
 # The main dude
 
@@ -326,7 +351,31 @@ else:
     # flush everything before we continue
     sys.stdout.flush()
 
-    summary(t, sys.stdout, config.no_print_summary)
+    # Warn of new metrics. The test names are sorted so the listing is stable between runs.
+    new_metrics = [metric for (change, metric) in t.metrics if change == MetricChange.NewMetric]
+    spacing = "    "
+    if any(new_metrics):
+        print()
+        print(str_warn('New Metrics') + ' the previous git commit doesn\'t have metrics for the following tests:')
+        print(spacing + ('\n' + spacing).join(sorted(set([metric.test for metric in new_metrics]))))
+
+    # Inform of how to accept metric changes.
+    if (len(t.unexpected_stat_failures) > 0):
+        print()
+        print(str_info("Some stats have changed") + " If this is expected, allow changes by appending the git commit message with this:")
+        print('-' * 25)
+        print(Perf.allow_changes_string(t.metrics))
+        print('-' * 25)
+
+    summary(t, sys.stdout, config.no_print_summary, True)
+
+    stats = [stat for (_, stat) in t.metrics]
+    if config.metrics_file:
+        print('Appending ' + str(len(stats)) + ' stats to file: ' + config.metrics_file)
+        with open(config.metrics_file, 'a') as file:
+            file.write("\n" + Perf.format_perf_stat(stats))
+    else:
+        Perf.append_perf_stat(stats)
 
     if config.summary_file:
         with open(config.summary_file, 'w') as file:
diff --git a/testsuite/driver/testglobals.py b/testsuite/driver/testglobals.py
index 311e39be7f..03a62503b4 100644
--- a/testsuite/driver/testglobals.py
+++ b/testsuite/driver/testglobals.py
@@ -31,6 +31,9 @@ class TestConfig:
         self.accept_platform = False
         self.accept_os = False
 
+        # File in which to save the performance metrics.
+        self.metrics_file = ''
+
         # File in which to save the summary
         self.summary_file = ''
 
@@ -122,6 +125,15 @@ class TestConfig:
         # Should we skip performance tests
         self.skip_perf_tests = False
 
+        # Only do performance tests
+        self.only_perf_tests = False
+
+        # Allowed performance changes (see perf_notes.get_allowed_perf_changes())
+        self.allowed_perf_changes = {}
+
+        # The test environment.
+        self.test_env = 'local'
+
 global config
 config = TestConfig()
 
@@ -156,6 +168,12 @@ class TestRun:
        self.unexpected_failures = []
        self.unexpected_stat_failures = []
 
+       # List of all metrics measured in this test run.
+       # [(change, PerfStat)] where change is one of the  MetricChange
+       # constants: NewMetric, NoChange, Increase, Decrease.
+       # NewMetric happens when the previous git commit has no metric recorded.
+       self.metrics = []
+
 global t
 t = TestRun()
 
@@ -215,16 +233,14 @@ class TestOptions:
        # extra files to copy to the testdir
        self.extra_files = []
 
-       # which -t numeric fields do we want to look at, and what bounds must
-       # they fall within?
-       # Elements of these lists should be things like
-       # ('bytes allocated',
-       #   9300000000,
-       #   10)
-       # To allow a 10% deviation from 9300000000.
-       self.compiler_stats_range_fields = {}
+       # Map from metric to expected value and allowed percentage deviation. e.g.
+       #     { 'bytes allocated': (9300000000, 10) }
+       # To allow a 10% deviation from 9300000000 for the 'bytes allocated' metric.
        self.stats_range_fields = {}
 
+       # Does this test the compiler's performance as opposed to the generated code.
+       self.is_compiler_stats_test = False
+
        # should we run this test alone, i.e. not run it in parallel with
        # any other threads
        self.alone = False
@@ -292,4 +308,3 @@ default_testopts = TestOptions()
 # (bug, directory, name) of tests marked broken
 global brokens
 brokens = []
-
diff --git a/testsuite/driver/testlib.py b/testsuite/driver/testlib.py
index ff6a8c8e74..761ba67fd2 100644
--- a/testsuite/driver/testlib.py
+++ b/testsuite/driver/testlib.py
@@ -19,7 +19,9 @@ import collections
 import subprocess
 
 from testglobals import config, ghc_env, default_testopts, brokens, t
-from testutil import strip_quotes, lndir, link_or_copy_file
+from testutil import strip_quotes, lndir, link_or_copy_file, passed, failBecause, str_fail, str_pass
+import perf_notes as Perf
+from perf_notes import MetricChange
 extra_src_files = {'T4198': ['exitminus1.c']} # TODO: See #12223
 
 global pool_sema
@@ -56,9 +58,13 @@ def setLocalTestOpts(opts):
     global testopts_local
     testopts_local.x=opts
 
+def isCompilerStatsTest():
+    opts = getTestOpts()
+    return bool(opts.is_compiler_stats_test)
+
 def isStatsTest():
     opts = getTestOpts()
-    return bool(opts.compiler_stats_range_fields or opts.stats_range_fields)
+    return bool(opts.stats_range_fields)
 
 
 # This can be called at the top of a file of tests, to set default test options
@@ -254,14 +260,14 @@ def _exit_code( name, opts, v ):
 
 def signal_exit_code( val ):
     if opsys('solaris2'):
-        return exit_code( val );
+        return exit_code( val )
     else:
         # When application running on Linux receives fatal error
         # signal, then its exit code is encoded as 128 + signal
         # value. See http://www.tldp.org/LDP/abs/html/exitcodes.html
         # I assume that Mac OS X behaves in the same way at least Mac
         # OS X builder behavior suggests this.
-        return exit_code( val+128 );
+        return exit_code( val+128 )
 
 # -----
 
@@ -307,42 +313,72 @@ def _extra_files(name, opts, files):
 
 # -----
 
-def stats_num_field( field, expecteds ):
-    return lambda name, opts, f=field, e=expecteds: _stats_num_field(name, opts, f, e);
+# Defaults to "test everything, and only break on extreme cases"
+#
+# The inputs to this function are slightly interesting:
+# metric can be either:
+#     - 'all', in which case all 3 possible metrics are collected and compared.
+#     - The specific metric one wants to use in the test.
+#     - A list of the metrics one wants to use in the test.
+#
+# Deviation defaults to 20% because the goal is correctness over performance.
+# The testsuite should avoid breaking when there is not an actual error.
+# Instead, the testsuite should notify of regressions in a non-breaking manner.
+#
+# collect_compiler_stats is used when the metrics collected are about the compiler.
+# collect_stats is used in the majority case when the metrics to be collected
+# are about the performance of the runtime code generated by the compiler.
+def collect_compiler_stats(metric='all',deviation=20):
+    return lambda name, opts, m=metric, d=deviation: _collect_stats(name, opts, m,d, True)
+
+def collect_stats(metric='all', deviation=20):
+    return lambda name, opts, m=metric, d=deviation: _collect_stats(name, opts, m, d)
+
+def testing_metrics():
+    return ['bytes allocated', 'peak_megabytes_allocated', 'max_bytes_used']
+
+# This is an internal function that is used only in the implementation.
+# It registers, in opts.stats_range_fields, the expected value and allowed
+# deviation of each requested metric, using the stats recorded for the previous
+# commit (HEAD^) as the baseline. Metrics with no baseline are registered as
+# None, meaning there is nothing to compare the new value against.
+#
+# 'is_compiler_stats_test' is somewhat of an unfortunate name.
+# If the boolean is set to true, it indicates that this test is one that
+# measures the performance numbers of the compiler.
+# As this is a fairly rare case in the testsuite, it defaults to false to
+# indicate that it is a 'normal' performance test.
+def _collect_stats(name, opts, metric, deviation, is_compiler_stats_test=False):
+    if not re.match('^[0-9]*[a-zA-Z][a-zA-Z0-9._-]*$', name):
+        framework_fail(name, 'bad-name', 'This test has an invalid name.')
 
-def _stats_num_field( name, opts, field, expecteds ):
-    if field in opts.stats_range_fields:
-        framework_fail(name, 'duplicate-numfield', 'Duplicate ' + field + ' num_field check')
+    # Normalize metric to a list of metric names; 'all' is a shorthand for
+    # every metric listed by testing_metrics().
+    if isinstance(metric, str):
+        metric = testing_metrics() if metric == 'all' else [metric]
 
-    if type(expecteds) is list:
-        for (b, expected, dev) in expecteds:
-            if b:
-                opts.stats_range_fields[field] = (expected, dev)
-                return
-        framework_warn(name, 'numfield-no-expected', 'No expected value found for ' + field + ' in num_field check')
+    # Flag compiler tests before looking for a baseline, so that the flag is
+    # set even on the first run, when there are no prior metrics.
+    if is_compiler_stats_test:
+        opts.is_compiler_stats_test = True
 
-    else:
-        (expected, dev) = expecteds
-        opts.stats_range_fields[field] = (expected, dev)
-
-def compiler_stats_num_field( field, expecteds ):
-    return lambda name, opts, f=field, e=expecteds: _compiler_stats_num_field(name, opts, f, e);
-
-def _compiler_stats_num_field( name, opts, field, expecteds ):
-    if field in opts.compiler_stats_range_fields:
-        framework_fail(name, 'duplicate-numfield', 'Duplicate ' + field + ' num_field check')
-
     # Compiler performance numbers change when debugging is on, making the results
     # useless and confusing. Therefore, skip if debugging is on.
-    if compiler_debugged():
-        skip(name, opts)
+    if config.compiler_debugged and is_compiler_stats_test:
+        opts.skip = 1
 
-    for (b, expected, dev) in expecteds:
-        if b:
-            opts.compiler_stats_range_fields[field] = (expected, dev)
-            return
+    # The baseline is whatever was recorded for this test on the previous commit.
+    # Might have multiple metrics being measured for a single test.
+    baseline = [stat for stat in Perf.get_perf_stats('HEAD^') if stat.test == name]
 
-    framework_warn(name, 'numfield-no-expected', 'No expected value found for ' + field + ' in num_field check')
+    for field in metric:
+        values = [float(stat.value) for stat in baseline if stat.metric == field]
+        if values:
+            # Expect the average of the previously recorded values.
+            opts.stats_range_fields[field] = (sum(values) / len(values), deviation)
+        else:
+            # No prior value for this metric, so there is nothing to compare to.
+            opts.stats_range_fields[field] = None
 
 # -----
 
@@ -720,6 +769,7 @@ def test_common_work(watcher, name, opts, func, args):
             and (getTestOpts().only_ways == None or way in getTestOpts().only_ways) \
             and (config.cmdline_ways == [] or way in config.cmdline_ways) \
             and (not (config.skip_perf_tests and isStatsTest())) \
+            and (not (config.only_perf_tests and not isStatsTest())) \
             and way not in getTestOpts().omit_ways
 
         # Which ways we are asked to skip
@@ -927,12 +977,6 @@ def badResult(result):
     except (KeyError, TypeError):
         return True
 
-def passed():
-    return {'passFail': 'pass'}
-
-def failBecause(reason, tag=None):
-    return {'passFail': 'fail', 'reason': reason, 'tag': tag}
-
 # -----------------------------------------------------------------------------
 # Generic command tests
 
@@ -1087,56 +1131,65 @@ def multi_compile_and_run( name, way, top_mod, extra_mods, extra_hc_opts ):
 
 def stats( name, way, stats_file ):
     opts = getTestOpts()
-    return checkStats(name, way, stats_file, opts.stats_range_fields)
+    return check_stats(name, way, stats_file, opts.stats_range_fields)
 
-# -----------------------------------------------------------------------------
-# Check -t stats info
-
-def checkStats(name, way, stats_file, range_fields):
-    full_name = name + '(' + way + ')'
+def metric_dict(name, way, metric, value):
+    return Perf.PerfStat(
+        test_env = config.test_env,
+        test     = name,
+        way      = way,
+        metric   = metric,
+        value    = value)
 
+# -----------------------------------------------------------------------------
+# Check test stats. This prints the results for the user.
+# name: name of the test.
+# way: the way.
+# stats_file: the path of the stats_file containing the stats for the test.
+# range_fields: map from metric to (expected value, allowed % deviation), or
+#     to None when the metric has no previously recorded value.
+# Returns a pass/fail object. Passes if the stats are within the expected value ranges.
+def check_stats(name, way, stats_file, range_fields):
     result = passed()
     if range_fields:
         try:
             f = open(in_testdir(stats_file))
         except IOError as e:
             return failBecause(str(e))
-        contents = f.read()
+        stats_file_contents = f.read()
         f.close()
 
-        for (field, (expected, dev)) in range_fields.items():
-            m = re.search('\("' + field + '", "([0-9]+)"\)', contents)
-            if m == None:
-                print('Failed to find field: ', field)
-                result = failBecause('no such stats field')
-            val = int(m.group(1))
-
-            lowerBound = trunc(           expected * ((100 - float(dev))/100))
-            upperBound = trunc(0.5 + ceil(expected * ((100 + float(dev))/100)))
-
-            deviation = round(((float(val) * 100)/ expected) - 100, 1)
-
-            if val < lowerBound:
-                print(field, 'value is too low:')
-                print('(If this is because you have improved GHC, please')
-                print('update the test so that GHC doesn\'t regress again)')
-                result = failBecause('stat too good', tag='stat')
-            if val > upperBound:
-                print(field, 'value is too high:')
-                result = failBecause('stat not good enough', tag='stat')
-
-            if val < lowerBound or val > upperBound or config.verbose >= 4:
-                length = max(len(str(x)) for x in [expected, lowerBound, upperBound, val])
-
-                def display(descr, val, extra):
-                    print(descr, str(val).rjust(length), extra)
-
-                display('    Expected    ' + full_name + ' ' + field + ':', expected, '+/-' + str(dev) + '%')
-                display('    Lower bound ' + full_name + ' ' + field + ':', lowerBound, '')
-                display('    Upper bound ' + full_name + ' ' + field + ':', upperBound, '')
-                display('    Actual      ' + full_name + ' ' + field + ':', val, '')
-                if val != expected:
-                    display('    Deviation   ' + full_name + ' ' + field + ':', deviation, '%')
+        for (metric, range_val_dev) in range_fields.items():
+            # Escape the metric so that it is matched literally, not as a regex.
+            field_match = re.search(r'\("' + re.escape(metric) + r'", "([0-9]+)"\)', stats_file_contents)
+            if field_match is None:
+                print('Failed to find metric: ', metric)
+                metric_result = failBecause('no such stats metric')
+            else:
+                actual_val = int(field_match.group(1))
+
+                # Store the metric so it can later be stored in a git note.
+                perf_stat = metric_dict(name, way, metric, actual_val)
+
+                # If this is the first time running the benchmark, then pass.
+                if range_val_dev is None:
+                    metric_result = passed()
+                    change = MetricChange.NewMetric
+                else:
+                    (expected_val, tolerance_dev) = range_val_dev
+                    (change, metric_result) = Perf.check_stats_change(
+                        perf_stat,
+                        expected_val,
+                        tolerance_dev,
+                        config.allowed_perf_changes,
+                        config.verbose >= 4)
+                t.metrics.append((change, perf_stat))
+
+            # If any metric fails then the test fails.
+            # Note, the remaining metrics are still run so that
+            # a complete list of changes can be presented to the user.
+            if metric_result['passFail'] == 'fail':
+                result = metric_result
+
     return result
 
@@ -1186,7 +1239,7 @@ def simple_build(name, way, extra_hc_opts, should_fail, top_mod, link, addsuf, b
         to_do = '-c' # just compile
 
     stats_file = name + '.comp.stats'
-    if opts.compiler_stats_range_fields:
+    if isCompilerStatsTest():
         extra_hc_opts += ' +RTS -V0 -t' + stats_file + ' --machine-readable -RTS'
     if backpack:
         extra_hc_opts += ' -outputdir ' + name + '.out'
@@ -1219,10 +1272,10 @@ def simple_build(name, way, extra_hc_opts, should_fail, top_mod, link, addsuf, b
 
     # ToDo: if the sub-shell was killed by ^C, then exit
 
-    statsResult = checkStats(name, way, stats_file, opts.compiler_stats_range_fields)
-
-    if badResult(statsResult):
-        return statsResult
+    if isCompilerStatsTest():
+        statsResult = check_stats(name, way, stats_file, opts.stats_range_fields)
+        if badResult(statsResult):
+            return statsResult
 
     if should_fail:
         if exit_code == 0:
@@ -1260,7 +1313,7 @@ def simple_run(name, way, prog, extra_run_opts):
     my_rts_flags = rts_flags(way)
 
     stats_file = name + '.stats'
-    if opts.stats_range_fields:
+    if isStatsTest() and not isCompilerStatsTest():
         stats_args = ' +RTS -V0 -t' + stats_file + ' --machine-readable -RTS'
     else:
         stats_args = ''
@@ -1298,7 +1351,7 @@ def simple_run(name, way, prog, extra_run_opts):
     if check_prof and not check_prof_ok(name, way):
         return failBecause('bad profile')
 
-    return checkStats(name, way, stats_file, opts.stats_range_fields)
+    return check_stats(name, way, stats_file, opts.stats_range_fields)
 
 def rts_flags(way):
     args = config.way_rts_flags.get(way, [])
@@ -1993,7 +2046,7 @@ def findTFiles(roots):
 # -----------------------------------------------------------------------------
 # Output a test summary to the specified file object
 
-def summary(t, file, short=False):
+def summary(t, file, short=False, color=False):
 
     file.write('\n')
     printUnexpectedTests(file,
@@ -2004,7 +2057,16 @@ def summary(t, file, short=False):
         # Only print the list of unexpected tests above.
         return
 
-    file.write('SUMMARY for test run started at '
+    colorize = lambda s: s
+    if color:
+        if len(t.unexpected_failures) > 0 or \
+            len(t.unexpected_stat_failures) > 0 or \
+            len(t.framework_failures) > 0:
+            colorize = str_fail
+        else:
+            colorize = str_pass
+
+    file.write(colorize('SUMMARY') + ' for test run started at '
                + time.strftime("%c %Z", t.start_time) + '\n'
                + str(datetime.timedelta(seconds=
                     round(time.time() - time.mktime(t.start_time)))).rjust(8)
diff --git a/testsuite/driver/testutil.py b/testsuite/driver/testutil.py
index 15587e6960..6e0c2684d7 100644
--- a/testsuite/driver/testutil.py
+++ b/testsuite/driver/testutil.py
@@ -5,10 +5,28 @@ import shutil
 
 import threading
 
+def passed():
+    return {'passFail': 'pass'}
+
+def failBecause(reason, tag=None):
+    return {'passFail': 'fail', 'reason': reason, 'tag': tag}
+
 def strip_quotes(s):
     # Don't wrap commands to subprocess.call/Popen in quotes.
     return s.strip('\'"')
 
+def str_fail(s):
+    return '\033[1m\033[43m\033[31m' + s + '\033[0m'
+
+def str_pass(s):
+    return '\033[1m\033[32m' + s + '\033[0m'
+
+def str_warn(s):
+    return '\033[1m\033[33m' + s + '\033[0m'
+
+def str_info(s):
+    return '\033[1m\033[34m' + s + '\033[0m'
+
 def getStdout(cmd_and_args):
     # Can't use subprocess.check_output, since we also verify that
     # no stderr was produced
diff --git a/testsuite/mk/test.mk b/testsuite/mk/test.mk
index f036110e07..65e897d849 100644
--- a/testsuite/mk/test.mk
+++ b/testsuite/mk/test.mk
@@ -215,6 +215,14 @@ ifeq "$(SKIP_PERF_TESTS)" "YES"
 RUNTEST_OPTS += --skip-perf-tests
 endif
 
+ifeq "$(ONLY_PERF_TESTS)" "YES"
+RUNTEST_OPTS += --only-perf-tests
+endif
+
+ifneq "$(TEST_ENV)" ""
+RUNTEST_OPTS += --test-env="$(TEST_ENV)"
+endif
+
 ifeq "$(CLEANUP)" "0"
 RUNTEST_OPTS += -e config.cleanup=False
 else ifeq "$(CLEANUP)" "NO"
@@ -266,6 +274,10 @@ RUNTEST_OPTS +=  \
 
 RUNTEST_OPTS += -e "config.stage=$(GhcStage)"
 
+ifneq "$(METRICS_FILE)" ""
+RUNTEST_OPTS +=  \
+	--metrics-file "$(METRICS_FILE)"
+endif
 ifneq "$(JUNIT_FILE)" ""
 RUNTEST_OPTS +=  \
   --junit "$(JUNIT_FILE)"
diff --git a/testsuite/tests/callarity/perf/all.T b/testsuite/tests/callarity/perf/all.T
index 83083d4b4d..37e40e6f9c 100644
--- a/testsuite/tests/callarity/perf/all.T
+++ b/testsuite/tests/callarity/perf/all.T
@@ -1,13 +1,7 @@
 test('T3924',
-     [stats_num_field('bytes allocated', 
-          [ (wordsize(64), 50760, 8),
-              # previously, without call-arity: 22326544
-              # 2014-01-18: 51480  (amd64/Linux)
-              # 2014-07-17: 50760  (amd64/Linux) (Roundabout adjustment)
-              # 2015-04-03: Widen 5->8% (amd64/Windows was doing better)
-            (wordsize(32), 44988, 5) ]),
-              # 2014-04-04: 44988  (Windows, 64-bit machine)
-      only_ways(['normal'])
+     [collect_stats('bytes allocated',8)
+     , only_ways(['normal'])
       ],
      compile_and_run,
      ['-O'])
+
diff --git a/testsuite/tests/deriving/perf/all.T b/testsuite/tests/deriving/perf/all.T
index 240571b4a2..1402a38b5d 100644
--- a/testsuite/tests/deriving/perf/all.T
+++ b/testsuite/tests/deriving/perf/all.T
@@ -1,13 +1,6 @@
 test('T10858',
-     [compiler_stats_num_field('bytes allocated',
-          [(wordsize(64), 221895064, 8) ]),
-          # Initial:    222312440
-          # 2016-12-19  247768192  Join points (#19288)
-          # 2017-02-12  304094944  Type-indexed Typeable
-          # 2017-02-25  275357824  Early inline patch
-          # 2017-03-28  241242968  Run Core Lint less
-          # 2017-06-07  221895064  Apparently been reducing for some time
-          #                        Today it crossed the boundary; good
+     [ collect_compiler_stats('bytes allocated',8),
       only_ways(['normal'])],
      compile,
      ['-O'])
+
diff --git a/testsuite/tests/perf/compiler/all.T b/testsuite/tests/perf/compiler/all.T
index d1d5a1ce70..b2ca109000 100644
--- a/testsuite/tests/perf/compiler/all.T
+++ b/testsuite/tests/perf/compiler/all.T
@@ -1,4 +1,4 @@
-# Tests that call 'compiler_stats_num_field' are skipped when debugging is on.
+# Tests that call 'collect_compiler_stats' are skipped when debugging is on.
 # See testsuite/driver/testlib.py.
 
 def no_lint(name, opts):
@@ -29,112 +29,8 @@ setTestOpts(no_lint)
 
 test('T1969',
      [# expect_broken(12437),
-      compiler_stats_num_field('peak_megabytes_allocated', # Note [residency]
-          [(wordsize(32), 30, 15),
-             # 2010-05-17 14 (x86/Windows)
-             #            15 (x86/OS X)
-             #            19 (x86/OS X)
-             # 2013-02-10 13 (x86/Windows)
-             # 2013-02-10 14 (x86/OSX)
-             # 2013-11-13 17 (x86/Windows, 64bit machine)
-             # 2015-07-11 21 (x86/Linux, 64bit machine) use +RTS -G1
-             # 2016-04-06 30 (x86/Linux, 64bit machine)
-           (wordsize(64), 73, 20)]),
-             #            28 (amd64/Linux)
-             #            34 (amd64/Linux)
-             # 2012-09-20 23 (amd64/Linux)
-             # 2012-10-03 25 (amd64/Linux if .hi exists)
-             # 2013-02-13 23, but unstable so increased to 10% range
-             # 2013-02-13 27, very unstable!
-             # 2014-09-10 29 (amd64/Linux) post-AMP-cleanup
-             # 2013-09-11 30, 10 (amd64/Linux)
-             # 2013-09-11 30, 15 (adapt to Phab CI)
-             # 2015-06-03 41, (amd64/Linux) use +RTS -G1
-             # 2015-10-28 55, (amd64/Linux) emit Typeable at definition site
-             # 2016-10-20 68, (amd64/Linux) allow top-level string literals
-             #                See the comment 16 on #8472.
-             # 2017-02-17 83  (amd64/Linux) Type-indexed Typeable
-             # 2017-03-31 61  (amd64/Linux) Fix memory leak in simplifier
-             # 2018-01-25 78  (amd64/Linux) Use CoreExpr for EvTerm
-             # 2018-07-10 73  (amd64/Linux) Fix space leaks
-      compiler_stats_num_field('max_bytes_used',
-          [(platform('i386-unknown-mingw32'), 5719436, 20),
-                                 # 2010-05-17 5717704 (x86/Windows)
-                                 # 2013-02-10 5159748 (x86/Windows)
-                                 # 2013-02-10 5030080 (x86/Windows)
-                                 # 2013-11-13 7295012 (x86/Windows, 64bit machine)
-                                 # 2014-04-24 5719436 (x86/Windows, 64bit machine)
-           (wordsize(32), 9418680, 1),
-             #            6707308 (x86/OS X)
-             # 2009-12-31 6149572 (x86/Linux)
-             # 2014-01-22 6429864 (x86/Linux)
-             # 2014-06-29 5949188 (x86/Linux)
-             # 2015-07-11 6241108 (x86/Linux, 64-bit machine) use +RTS -G1
-             # 2016-04-06 9093608 (x86/Linux, 64-bit machine)
-             # 2017-03-24 9261052 (x86/Linux, 64-bit machine)
-             # 2017-04-06 9418680 (x86/Linux, 64-bit machine)
-
-           (wordsize(64), 19738608, 15)]),
-             # 2014-09-10 10463640, 10  # post-AMP-update (somewhat stabelish)
-               # looks like the peak is around ~10M, but we're
-               # unlikely to GC exactly on the peak.
-               # varies quite a lot with CLEANUP and BINDIST,
-               # hence 10% range.
-               # See Note [residency] to get an accurate view.
-             # 2014-09-14  9684256, 10 # try to lower it a bit more to match Phab's CI
-             # 2014-11-03 10584344,    # ghcspeed reports higher numbers consistently
-             # 2015-07-11 11670120 (amd64/Linux)
-             # 2015-10-28 15017528 (amd64/Linux) emit typeable at definition site
-             # 2016-10-12 17285216 (amd64/Linux) it's not entirely clear why
-             # 2017-02-01 19924328 (amd64/Linux) Join points (#12988)
-             # 2017-02-14 16393848 Early inline patch
-             # 2017-03-31 16679176 Fix memory leak in simplifier
-             # 2017-08-25 19199872 Refactor the Mighty Simplifier
-             # 2018-02-19 22311600 (amd64/Linux) Unknown
-             # 2018-07-10 19738608 (amd64/Linux) Fix space leaks
-
-      compiler_stats_num_field('bytes allocated',
-          [(platform('i386-unknown-mingw32'), 301784492, 5),
-                                 #            215582916 (x86/Windows)
-                                 # 2012-10-29 298921816 (x86/Windows)
-                                 # 2013-02-10 310633884 (x86/Windows)
-                                 # 2013-11-13 317975916 (x86/Windows, 64bit machine)
-                                 # 2014-04-04 301784492 (x86/Windows, 64bit machine)
-           (wordsize(32), 324586096, 1),
-             #            221667908 (x86/OS X)
-             #            274932264 (x86/Linux)
-             # 2012-10-08 303930948 (x86/Linux, new codegen)
-             # 2013-02-10 322937684 (x86/OSX)
-             # 2014-01-22 316103268 (x86/Linux)
-             # 2014-06-29 303300692 (x86/Linux)
-             # 2015-07-11 288699104 (x86/Linux, 64-bit machine) use +RTS -G1
-             # 2016-04-06 344730660 (x86/Linux, 64-bit machine)
-             # 2017-03-24 324586096 (x86/Linux, 64-bit machine)
-           (wordsize(64), 670839456, 5)]),
-             # 2009-11-17 434845560 (amd64/Linux)
-             # 2009-12-08 459776680 (amd64/Linux)
-             # 2010-05-17 519377728 (amd64/Linux)
-             # 2011-08-05 561382568 (amd64/OS X)
-             # 2012-07-16 589168872 (amd64/Linux)
-             # 2012-07-20 595936240 (amd64/Linux)
-             # 2012-08-23 606230880 (amd64/Linux)
-             # 2012-08-29 633334184 (amd64/Linux) new codegen
-             # 2012-09-18 641959976 (amd64/Linux)
-             # 2012-10-19 661832592 (amd64/Linux) -fPIC turned on
-             # 2012-10-23 642594312 (amd64/Linux) -fPIC turned off again
-             # 2012-11-12 658786936 (amd64/Linux) UNKNOWN REASON
-             # 2013-91-17 667160192 (x86_64/Linux) new demand analyser
-             # 2013-10-18 698612512 (x86_64/Linux) fix for #8456
-             # 2014-02-10 660922376 (x86_64/Linux) call arity analysis
-             # 2014-07-17 651626680 (x86_64/Linux) roundabout update
-             # 2014-09-10 630299456 (x86_64/Linux) post-AMP-cleanup
-             # 2015-06-03 581460896 (x86_64/Linux) use +RTS -G1
-             # 2015-10-28 695430728 (x86_64/Linux) emit Typeable at definition site
-             # 2015-10-28 756138176 (x86_64/Linux) inst-decl defaults go via typechecker (#12220)
-             # 2017-02-17 831733376 (x86_64/Linux) Type-indexed Typeable
-             # 2017-02-25 695354904 (x86_64/Linux) Early inlining patch
-             # 2017-04-21 659863176 (x86_64/Linux) Unknown
-             # 2018-07-10 670839456 (x86_64/Linux) Unknown (just updating)
+      collect_compiler_stats(['peak_megabytes_allocated','max_bytes_used'],15),
+      collect_compiler_stats('bytes allocated',5),
       only_ways(['normal']),
 
       extra_hc_opts('-dcore-lint -static'),
@@ -161,63 +57,8 @@ else:
 
 test('T3294',
      [
-      compiler_stats_num_field('max_bytes_used', # Note [residency]
-          [(wordsize(32), 28686588, 15),
-             #            17725476 (x86/OS X)
-             #            14593500 (Windows)
-             # 2013-02-10 20651576 (x86/Windows)
-             # 2013-02-10 20772984 (x86/OSX)
-             # 2013-11-13 24009436 (x86/Windows, 64bit machine)
-             # 2014-04-24 19882188 (x86/Windows, 64bit machine)
-             # 2014-12-22 26525384 (x86/Windows) Increase due to silent superclasses?
-             # 2015-07-11 43196344 (x86/Linux, 64-bit machine) use +RTS -G1
-             # 2016-04-06 28686588 (x86/Linux, 64-bit machine)
-
-           (wordsize(64), 34050960, 20)]),
-             # prev:           25753192 (amd64/Linux)
-             # 29/08/2012:     37724352 (amd64/Linux)
-             #  (increase due to new codegen, see #7198)
-             # 13/13/2012:     44894544 (amd64/Linux)
-             #  (reason for increase unknown)
-             # 15/5/2013:      36904752  (amd64/Linux)
-             #  (reason for decrease unknown)
-             # 29/5/2013:      43224080  (amd64/Linux)
-             #  (reason for increase back to earlier value unknown)
-             # 2014-07-14:     36670800  (amd64/Linux)
-             #  (reason unknown, setting expected value somewhere in between)
-             # 2015-01-22:     45000000  (amd64/Linux)
-             #  varies between 40959592 and 52914488... increasing to +-20%
-             # 2015-10-28:     50367248  (amd64/Linux)
-             #  D757: emit Typeable instances at site of type definition
-             # 2016-07-11:     54609256  (Windows) before fix for #12227
-             # 2016-07-11:     52992688  (Windows) after fix for #12227
-             # 2017-02-17:     63131248  (amd64/Linux) Type indexed Typeable
-             # 2017-05-14:     34050960  (amd64/Linux) Two-pass CmmLayoutStack
-
-      compiler_stats_num_field('bytes allocated',
-          [(wordsize(32), 1377050640, 5),
-           # previous:     815479800  (x86/Linux)
-           # (^ increase due to new codegen, see #7198)
-           # 2012-10-08:  1373514844 (x86/Linux)
-           # 2013-11-13: 1478325844  (x86/Windows, 64bit machine)
-           # 2014-01-12: 1565185140  (x86/Linux)
-           # 2013-04-04: 1377050640  (x86/Windows, 64bit machine)
-           (wordsize(64), 1858491504, 5)]),
-            # old:        1357587088 (amd64/Linux)
-            # 29/08/2012: 2961778696 (amd64/Linux)
-            # (^ increase due to new codegen, see #7198)
-            # 18/09/2012: 2717327208 (amd64/Linux)
-            # 08/06/2013: 2901451552 (amd64/Linux) (reason unknown)
-            # 12/12/2013: 3083825616 (amd64/Linux) (reason unknown)
-            # 18/02/2014: 2897630040 (amd64/Linux) (call arity improvements)
-            # 12/03/2014: 2705289664 (amd64/Linux) (more call arity improvements)
-            # 2014-17-07: 2671595512 (amd64/Linux) (round-about update)
-            # 2014-09-10: 2709595808 (amd64/Linux) post-AMP cleanup
-            # 2016-07-11: 2664479936 (Windows) before fix for #12227
-            # 2016-07-11: 2739731144 (Windows) after fix for #12227 (ignoring)
-            # 2017-02-17: 2758641264 (amd64/Linux) (Type indexed Typeable)
-            # 2017-05-14: 2253557280 (amd64/Linux) Two-pass CmmLayoutStack
-            # 2017-10-24: 1858491504 (amd64/Linux) Improved linear regAlloc
+      collect_compiler_stats('max_bytes_used',15),
+      collect_compiler_stats('bytes allocated',5),
       conf_3294,
 
       # Use `+RTS -G1` for more stable residency measurements. Note [residency].
@@ -227,173 +68,27 @@ test('T3294',
      [''])
 
 test('T4801',
-     [ # expect_broken(5224),
-       # temporarily unbroken (#5227)
-###################################
-# deactivated for now, as this metric became too volatile recently
-#      compiler_stats_num_field('peak_megabytes_allocated',# Note [residency]
-#          [(platform('x86_64-apple-darwin'), 70, 1),
-#                           # expected value: 58 (amd64/OS X)
-#                           # 13/01/2014 - 70
-#           (wordsize(32), 30, 20),
-#           (wordsize(64), 48, 20)]),
-#            # prev:       50 (amd64/Linux)
-#            # 19/10/2012: 64 (amd64/Linux)
-#            #                (^ REASON UNKNOWN!)
-#            # 12/11/2012: 49 (amd64/Linux)
-#            #                (^ REASON UNKNOWN!)
-#            # 28/8/13:    60 (amd64/Linux)
-#            #                (^ REASON UNKNOWN!)
-#            # 2014-09-10: 55 post-AMP-cleanup
-#            # 2014-10-08: 62 (jumps between 55 and 71 observed -- GC tipping point?)
-#            # 2014-10-13: 48 stricter seqDmdType
-
-      compiler_stats_num_field('bytes allocated',
-          [(platform('x86_64-apple-darwin'), 417302064, 10),
-           # prev:       510938976 (amd64/OS X):
-           # 2015-12-11: 465653312 (amd64/OS X) Update, bump tolerance to +/-10%
-           # 2017-03-24: 417302064 (amd64/OS X) Correlated with Linux improvement
-
-           (wordsize(32), 199856388, 10),
-           # prev:        185669232 (x86/OSX)
-           # 2014-01-22:  211198056 (x86/Linux)
-           # 2014-09-03:  185242032 (Windows laptop)
-           # 2014-12-01:  203962148 (Windows laptop)
-           # 2016-04-06:  239556572 (x86/Linux)
-           # 2017-03-24:  199856388 (x86/Linux)
-           (wordsize(64), 388898280, 10)]),
-            # prev:       360243576 (amd64/Linux)
-            # 19/10/2012: 447190832 (amd64/Linux) (-fPIC turned on)
-            # 19/10/2012: 392409984 (amd64/Linux) (-fPIC turned off)
-            # 2014-04-08: 362939272 (amd64/Linux) cumulation of various smaller improvements over recent commits
-            # 2014-10-08: 382056344 (amd64/Linux) stricter foldr2 488e95b
-            # 2015-10-28: 434278248 (amd64/Linux) emit Typeable at definition site
-            # 2016-10-19: 388898280 (amd64/Linux) Refactor traceRn interface (#12617)
-
-###################################
-# deactivated for now, as this metric became too volatile recently
-#
-#     compiler_stats_num_field('max_bytes_used',
-#         [(platform('x86_64-apple-darwin'), 25145320, 5),
-#          (wordsize(32), 11829000, 15),
-#            #              9651948 (x86/OSX)
-#            #              10290952 (windows)
-#            # 2013-02-10   11071060 (x86/Windows)
-#            # 2013-02-10:  11207828 (x86/OSX)
-#            # (some date): 11139444
-#            # 2013-11-13:  11829000 (x86/Windows, 64bit machine)
-#          (wordsize(64), 19296544, 15)]),
-#               # prev:       20486256 (amd64/OS X)
-#               # 30/08/2012: 17305600--20391920 (varies a lot)
-#               # 19/10/2012: 26882576 (-fPIC turned on)
-#               # 19/10/2012: 18619912 (-fPIC turned off)
-#               # 24/12/2012: 21657520 (perhaps gc sampling time wibbles?)
-#               # 10/01/2014: 25166280
-#               # 13/01/2014: 22646000 (mostly due to #8647)
-#               # 18/02/2014: 25002136 (call arity analysis changes)
-#               # 12/05/2014: 25002136 (specialisation and inlining changes)
-#               # 10/09/2014: 19296544, 10 (post-AMP-cleanup)
-#               # 14/09/2014: 19585456, 15 (adapt to Phab CI env)
-       only_ways(['normal']),
-       extra_hc_opts('-static'),
+     [# collect_compiler_stats('peak_megabytes_allocated',1),
+      # expect_broken(5224),
+      # temporarily unbroken (#5227)
+      # peak_megabytes_allocated and max_bytes_used are deactivated for now, as they became too volatile recently
+      collect_compiler_stats('bytes allocated',10),
+      # collect_compiler_stats('max_bytes_used',5),
+      only_ways(['normal']),
+      extra_hc_opts('-static'),
 
-       # Use `+RTS -G1` for more stable residency measurements. Note [residency].
-       extra_hc_opts('+RTS -G1 -RTS')
-       ],
+      # Use `+RTS -G1` for more stable residency measurements. Note [residency].
+      extra_hc_opts('+RTS -G1 -RTS')
+      ],
      compile,
      [''])
 
 test('T3064',
-     [compiler_stats_num_field('peak_megabytes_allocated',# Note [residency]
-          [(wordsize(32), 36, 20),
-            # expected value: 14 (x86/Linux 28-06-2012):
-            # 2013-11-13:     18 (x86/Windows, 64bit machine)
-            # 2014-01-22:     23 (x86/Linux)
-            # 2014-12-22:     23 (x86/Linux) death to silent superclasses
-            # 2015-07-11:     28 (x86/Linux, 64-bit machine) use +RTS -G1
-            # 2017-04-06:     36 (x86/Linux, 64-bit machine) it's unclear
+     [collect_compiler_stats('peak_megabytes_allocated',20),
+      collect_compiler_stats('bytes allocated',10),
 
-           (wordsize(64), 66, 20)]),
-            # (amd64/Linux):            18
-            # (amd64/Linux) 2012-02-07: 26
-            # (amd64/Linux) 2013-02-12: 23; increased range to 10%
-            # (amd64/Linux) 2013-04-03: 26
-            # (amd64/Linux) 2013-09-11: 30; result of AMP patch
-            # Increased range to 20%.  peak-usage varies from 22 to 26,
-            #  depending on whether the old .hi file exists
-            # (amd64/Linux) 2013-09-11: 37; better arity analysis (weird)
-            # (amd64/Linux) (09/09/2014): 42, AMP changes (larger interfaces, more loading)
-            # (amd64/Linux) 2014-10-13: 38: Stricter seqDmdType
-            # (amd64/Linux) 2014-12-22: 27: death to silent superclasses
-            # (amd64/Linux) 2015-01-22: 32: Varies from 30 to 34, at least here.
-            # (amd64/Linux) 2015-06-03: 54: use +RTS -G1
-            # (amd64/Linux) 2016-10-25: 66: Presumably creep
-
-      compiler_stats_num_field('bytes allocated',
-          [(wordsize(32), 134044092, 10),
-            # 2011-06-28: 56380288  (x86/Linux)
-            # 2012-10-30: 111189536 (x86/Windows)
-            # 2013-11-13: 146626504 (x86/Windows, 64bit machine)
-            # 2014-01-22: 162457940 (x86/Linux)
-            # 2014-12-01: 162457940 (Windows)
-            # 2014-12-22: 122836340 (Windows) Death to silent superclasses
-            # 2016-04-06: 153261024 (x86/Linux) probably wildcard refactor
-            # 2017-03-24: 134044092 (x86/Linux, 64-bit machine) Update
-
-           (wordsize(64), 272759920, 5)]),
-            # (amd64/Linux) (2011-06-28):  73259544
-            # (amd64/Linux) (2013-02-07): 224798696
-            # (amd64/Linux) (2013-08-02): 236404384, increase from roles
-            # (amd64/Linux) (2013-09-11): 290165632, increase from AMP warnings
-            # (amd64/Linux) (2013-11-22): 308300448, GND via Coercible and counters for constraints solving
-            # (amd64/Linux) (2013-12-02): 329795912, Coercible refactor
-            # (amd64/Linux) (2014-02-11): 308422280, optimize Coercions in simpleOptExpr
-            # (amd64/Linux) (2014-05-23): 324022680, unknown cause
-            # (amd64/Linux) (2014-07-17): 332702112, general round of updates
-            # (amd64/Linux) (2014-08-29): 313638592, w/w for INLINABLE things
-            # (amd64/Linux) (2014-09-09): 407416464, AMP changes (larger interfaces, more loading)
-            # (amd64/Linux) (2014-09-14): 385145080, BPP changes (more NoImplicitPrelude in base)
-            # (amd64/Linux) (2014-12-10): 363103840, improvements in constraint solver
-            # (Mac)         (2014-12-18): 350418600, improvements to flattener
-            # (amd64/Linux) (2014-12-22): 243670824, Ha! Death to superclass constraints, makes
-            #                                        much less code for Monad instances
-            # (amd64/Linux) (2015-12-01): 264952256, Regression due to Simon's wildcard refactor
-            #                                        Tracked as #11151.
-            # (amd64/Linux) (2015-12-11): 304344936, Regression due to TypeInType
-            #                                        Tracked as #11196
-            # (amd64/Linux) (2016-04-15): 287460128  Improvement due to using coercionKind instead
-            #                                        of zonkTcType (Trac #11882)
-            # (amd64/Darwin) (2017-01-23): 306222424 Presumably creep from recent changes (Typeable?)
-            # (amd64/Linux) (2017-02-14): 259815560  Early inline patch: 9% improvement
-            # (amd64/Linux) (2017-03-31): 265950920  Fix memory leak in simplifier
-            # (amd64/Linux) (2017-05-01): 281509496  Avoid excessive space usage from unfoldings in CoreTidy
-            # (amd64/Linux) (2017-05-01): 258505536  I think this is improvement in coercionKind e4ab65bd
-            # (amd64/Linux) (2018-08-04): 272759920  It's unclear
-
-###################################
-# deactivated for now, as this metric became too volatile recently
-#
-#      compiler_stats_num_field('max_bytes_used',
-#          [(wordsize(32), 11202304, 20),
-#            # 2011-06-28:  2247016 (x86/Linux) (28/6/2011):
-#            #(some date):  5511604
-#            # 2013-11-13:  7218200 (x86/Windows, 64bit machine)
-#            # 2014-04-04: 11202304 (x86/Windows, 64bit machine)
-#           (wordsize(64), 13251728, 20)]),
-#            # (amd64/Linux, intree) (28/06/2011):  4032024
-#            # (amd64/Linux, intree) (07/02/2013):  9819288
-#            # (amd64/Linux)         (14/02/2013):  8687360
-#            # (amd64/Linux)         (18/02/2013):  9397488
-#            # (amd64/Linux)         (02/08/2013): 10742536, increase from roles
-#            # (amd64/Linux)         (19/08/2013): 9211816,  decrease apparently from better eta reduction
-#            # (amd64/Linux)         (11/09/2013): 12000480, increase from AMP warnings
-#            #                                     933cdf15a2d85229d3df04b437da31fdfbf4961f
-#            # (amd64/Linux)         (22/11/2013): 16266992, GND via Coercible and counters for constraints solving
-#            # (amd64/Linux)         (12/12/2013): 19821544, better One shot analysis
-#            # (amd64/Linux)         (09/09/2014): 24357392, AMP changes (larger interfaces, more loading)
-#            # (amd64/Linux)         (14/09/2014): 16053888, BPP changes (more NoImplicitPrelude in base)
-#            # (amd64/Linux)         (19/09/2014): 18744992, unknown
-#            # (amd64/Linux)         2014-10-13:   13251728, Stricter seqDmdType
+      # deactivated for now, as this metric became too volatile recently
+      # collect_compiler_stats('max_bytes_used',20)
 
        only_ways(['normal']),
 
@@ -409,37 +104,7 @@ test('T4007',
      ['$MAKE -s --no-print-directory T4007'])
 
 test('T5030',
-     [compiler_stats_num_field('bytes allocated',
-          [(wordsize(32), 345668088, 10),
-           # previous:    196457520
-           # 2012-10-08:  259547660 (x86/Linux, new codegen)
-           # 2013-11-21:  198573456 (x86 Windows, 64 bit machine)
-           # 2014-12-10:  227205560 constraint solver got worse again; more aggressive solving
-           #                        of family-applications leads to less sharing, I think
-           # 2015-07-11:  201882912 reason unknown
-           # 2016-04-06:  345668088 likely TypeInType
-
-           (wordsize(64), 794426536, 10)]),
-             # Previously 530000000 (+/- 10%)
-             # 17/1/13:   602993184  (x86_64/Linux)
-             #            (new demand analyser)
-             # 2013-06-08 538467496  (x86_64/Linux)
-             # ^ reason unknown
-             # 2013-08-02 454498592  (amd64/Linux)
-             # decrease from more aggressive coercion optimisations from roles
-             # 2013-11-12 397672152  (amd64/Linux)
-             # big decrease following better CSE and arity
-             # 2014-07-17 409314320  (amd64/Linux)
-             # general round of updates
-             # 2014-09-10 385152728  post-AMP-cleanup
-             # 2014-12-08 340969128  constraint solver perf improvements (esp kick-out)
-             # 2014-12-10 449042120  constraint solver got worse again; more aggressive solving
-             #                          of family-applications leads to less sharing, I think
-             # 2015-03-17 403932600  tweak to solver algorithm
-             # 2015-12-11 653710960  TypeInType (see #11196)
-             # 2016-10-17 794426536  20% big increase following
-             #                       31621b12 * A collection of type-inference refactorings.
-             #                       See ticket for more info
+     [collect_compiler_stats('bytes allocated', 10),
 
        only_ways(['normal'])
       ],
@@ -447,47 +112,14 @@ test('T5030',
      ['-freduction-depth=300'])
 
 test('T5631',
-     [compiler_stats_num_field('bytes allocated',
-          [(wordsize(32), 570137436, 10),
-        # expected value: 392904228 (x86/Linux)
-        # 2014-04-04:     346389856 (x86 Windows, 64 bit machine)
-        # 2014-12-01:     390199244 (Windows laptop)
-        # 2016-04-06:     570137436 (amd64/Linux) many reasons
-           (wordsize(64), 1161885448, 5)]),
-        # expected value: 774595008 (amd64/Linux):
-        # expected value: 735486328 (amd64/Linux) 2012/12/12:
-        # expected value: 690742040 (amd64/Linux) Call Arity improvements
-        # 2014-09-09:     739704712 (amd64/Linux) AMP changes
-        # 2014-11-04:     776121120 (amd64/Linux) new-flatten-skolems
-        # 2015-06-01:     812288344 (amd64/Linux) unknown cause
-        # 2015-12-11:     1128828928 (amd64/Linux) TypeInType (see #11196)
-        # 2015-12-21:     1198327544 (Mac) TypeApplications (will fix with #11196)
-        # 2015-03-18:     1124068664 (Mac) optimize Unify & zonking
-        # 2016-10-19:     1024926024 (amd64/Linux) Refactor traceRn interface (#12617)
-        # 2016-11-10:     1077429456 (amd64/Linux) Stop -dno-debug-output suppressing -ddump-tc-trace
-        # 2017-02-17:     1517484488 (amd64/Linux) Type-indexed Typeable
-        # 2017-03-03:     1065147968 (amd64/Linux) Share Typeable KindReps
-        # 2017-03-31:     1037482512 (amd64/Linux) Fix memory leak in simplifier
-        # 2017-07-27:     1106015512 (Mac) Regresssion from tracking visibility in TypeEqOrigin
-        #                                  should be fixed by #14037
-        # 2018-06-18:     1161885448 (Mac) Not entirely clear
-       only_ways(['normal'])
+     [collect_compiler_stats('bytes allocated',10),
+      only_ways(['normal'])
       ],
      compile,
      [''])
 
 test('parsing001',
-     [compiler_stats_num_field('bytes allocated',
-          [(wordsize(32), 232777056, 10),
-        # Initial:        274000576
-        # 2017-03-24:     232777056
-           (wordsize(64), 519401296, 5)]),
-        # expected value: 587079016 (amd64/Linux)
-        # 2016-09-01:     581551384 (amd64/Linux) Restore w/w limit (#11565)
-        # 2016-12-19:     493730288 (amd64/Linux) Join points (#12988)
-        # 2017-02-14:     463931280 Early inlining patch; acutal improvement 7%
-        # 2017-12-11:     490228304 BlockArguments
-        # 2018-04-09:     519401296 Inexplicable, collateral of #14737
+     [collect_compiler_stats('bytes allocated',10),
        only_ways(['normal']),
       ],
      compile_fail, [''])
@@ -495,333 +127,53 @@ test('parsing001',
 
 test('T783',
      [ only_ways(['normal']),  # no optimisation for this one
-      # expected value: 175,569,928 (x86/Linux)
-      compiler_stats_num_field('bytes allocated',
-          [(wordsize(32), 225911912, 5),
-            # 2012-10-08: 226907420 (x86/Linux)
-            # 2013-02-10: 329202116 (x86/Windows)
-            # 2013-02-10: 338465200 (x86/OSX)
-            # 2014-04-04: 319179104 (x86 Windows, 64 bit machine)
-            # 2014-09-03: 223377364 (Windows) better specialisation, raft of core-to-core optimisations
-            # 2014-12-22: 235002220 (Windows) not sure why
-            # 2016-04-06: 249332816 (x86/Linux, 64-bit machine)
-            # 2017-03-24: 225911912 (x86/Linux, 64-bit machine)
-
-           (wordsize(64), 481875416, 10)]),
-            # prev:       349263216 (amd64/Linux)
-            # 07/08/2012: 384479856 (amd64/Linux)
-            # 29/08/2012: 436927840 (amd64/Linux)
-            # 12/11/2012: 640324528 (amd64/Linux)
-            #   (OldCmm removed: not sure why this got worse, the
-            #    other perf tests remained about the same)
-            # 18/10/2013: 734038080 (amd64/Linux)
-            #   (fix for #8456)
-            # 24/10/2013: 654804144 (amd64/Linux)
-            #   (fix previous fix for #8456)
-            # 2014-07-17: 640031840 (amd64/Linux)
-            #   (general round of updates)
-            # 2014-08-29: 441932632 (amd64/Linux)
-            #   (better specialisation, raft of core-to-core optimisations)
-            # 2014-08-29: 719814352 (amd64/Linux)
-            #   (changed order of cmm block causes analyses to allocate much more,
-            #      but the changed order is slighly better in terms of runtime, and
-            #      this test seems to be an extreme outlier.)
-            # 2015-05-16: 548288760 (amd64/Linux)
-            #   (improved sequenceBlocks in nativeCodeGen, #10422)
-            # 2015-08-07: 470738808 (amd64/Linux)
-            #   (simplifying the switch plan code path for simple checks, #10677)
-            # 2015-08-28: 526230456 (amd64/Linux)
-            #    (D757: Emit Typeable instances at site of type definition)
-            # 2015-12-04: 1134085384 (amd64/Linux)
-            #    (D1535: Major overhaul of pattern match checker, #11162)
-            # 2016-02-03: 488592288 (amd64/Linux)
-            #    (D1795: Another overhaul of pattern match checker, #11374)
-            # 2017-02-14    436978192    Early inlining: 5% improvement
-            # 2017-09-08    481875416    Unknown
-
+      collect_compiler_stats('bytes allocated',10),
       extra_hc_opts('-static')
       ],
       compile,[''])
 
 test('T5321Fun',
      [ only_ways(['normal']),  # no optimisation for this one
-       compiler_stats_num_field('bytes allocated',
-           [(wordsize(32), 244387620, 10),
-             # prev:       300000000
-             # 2012-10-08: 344416344 x86/Linux
-             #  (increase due to new codegen)
-             # 2014-09-03: 299656164     (specialisation and inlining)
-             # 2014-12-10: 206406188     #  Improvements in constraint solver
-             # 2016-04-06: 279922360 x86/Linux
-             # 2017-03-24: 244387620 x86/Linux (64-bit machine)
-
-            (platform('x86_64-apple-darwin'), 446893600, 5),
-             # 2018-03-17: 423774560     #  OS X-only (reason unknown, see #11753)
-
-            (wordsize(64), 423774560, 5)])
-             # prev:       585521080
-             # 2012-08-29: 713385808     #  (increase due to new codegen)
-             # 2013-05-15: 628341952     #  (reason for decrease unknown)
-             # 2013-06-24: 694019152     #  (reason for re-increase unknown)
-             # 2014-05-12: 614409344     #  (specialisation and inlining changes)
-             # 2014-09-10: 601629032     #  post-AMP-cleanup
-             # 2014-11-06: 541287000     #  Simon's flat-skol changes to the constraint solver
-             # 2014-12-10: 408110888     #  Improvements in constraint solver
-             # 2014-12-16: 429921312     #  Flattener parameterized over roles
-             # 2015-08-10: 509921312
-             #  (undefined now takes an implicit parameter and GHC -O0 does
-             #  not recognize that the application is bottom)
-             # 2015-12-11: 565883176     #  TypeInType (see #11196)
-             # 2017-01-06: 497356688     #  Small coercion optimisations
-                                         #  The actual decrease was only 2%; earlier
-                                         #    commits had drifted down
-             # 2017-01-22: 525895608     #  Allow top-level string literals in Core. I'm not
-                                         #    convinced that this patch is
-                                         #    responsible for all of this
-                                         #    change, however. Namely I am
-                                         #    quite skeptical of the downward
-                                         #    "drift" reported above
-             # 2017-01-31: 498135752     #  Join points (#12988)
-             # 2017-02-23: 524706256     #  Type-indexed Typeable? (on Darwin)
-             # 2017-02-25: 488295304     #  Early inlining patch
-             # 2017-05-14: 449577856     #  (amd64/Linxu) Two-pass CmmLayoutStack
-             # 2017-12-13: 423774560     #  (amd64/Linxu) Typechecker improvements
+       collect_compiler_stats('bytes allocated',10)
       ],
       compile,[''])
 
 test('T5321FD',
      [ only_ways(['normal']),  # no optimisation for this one
-      compiler_stats_num_field('bytes allocated',
-          [(wordsize(32), 250757460, 10),
-            # prev:       213380256
-            # 2012-10-08: 240302920 (x86/Linux)
-            #  (increase due to new codegen)
-            # 2014-07-31: 211699816 (Windows) (-11%)
-            #  (due to better optCoercion, 5e7406d9, #9233)
-            # 2016-04-06: 250757460 (x86/Linux)
-
-           (wordsize(64), 371826136, 10)])
-            # prev:       418306336
-            # 29/08/2012: 492905640
-            #  (increase due to new codegen)
-            # 15/05/2013: 406039584
-            #  (reason for decrease unknown)
-            # 08/06/2013: 476497048
-            #  (reason for increase unknown)
-            # before 2014-07-17: 441997096
-            #  (with -8%, still in range, hence cause not known)
-            # 2014-07-17: 426960992 (-11% of previous value)
-            #  (due to better optCoercion, 5e7406d9, #9233)
-            # 2014-10-08  410895536
-            #  (various changes; biggest improvements due to 949ad67 and FastString package ids)
-            # 2015-08-10: 470895536
-            #  (undefined now takes an implicit parameter and GHC -O0 does
-            #  not recognize that the application is bottom)
-            # 2015-10-28: 532365376
-            #  D757: emit Typeable instances at site of type definition
-            # 2016-07-16: 477840432
-            #  Optimize handling of built-in OccNames
-            # 2017-05-14: 415136648 (amd64/Linux) Two-pass CmmLayoutStack
-            # 2018-04-24: 371826136 (amd64/Linux) Store size in LitString
+      collect_compiler_stats('bytes allocated',10)
       ],
       compile,[''])
 
 test('T5642',
      [ only_ways(['normal']),
        normal,
-       compiler_stats_num_field('bytes allocated',
-           [(wordsize(32), 413517560, 10),
-                     # sample from x86/Linux
-            # prev:        650000000
-            # 2014-09-03:  753045568
-            # 2014-12-10:  641085256 Improvements in constraints solver
-            # 2016-04-06:  462677300
-            # 2017-03-24:  413517560 (x86/Linux, 64-bit machine)
-
-            (wordsize(64),  838316496, 10)])
-            # prev:        1300000000
-            # 2014-07-17:  1358833928 (general round of updates)
-            # 2014-08-07:  1402242360 (caused by 1fc60ea)
-# Watch out for:
-            # 23/05/2014:  1452688392 (More aggressive specialisation means we get
-            #                          specialised copies of imported functions that
-            #                          are ultimately discarded by trimAutoRules
-            #                          It's a bizarre program with LOTS of data types)
-            # 2014-09-10:  1536924976 post-AMP-cleanup
-            # 2014-12-10:  1282916024 Improvements in constraints solver
-            # 2015-10-28:  1412808976 Emit Typeable at definition site
-            # 2015-11-22:  1071915072 Use TypeLits in the metadata encoding
-            # 2016-02-08:   950004816 Pattern match checker re-rework
-            # 2016-05-12:  1300685592 Make Generic1 poly-kinded
-            # 2016-06-05:   916484672 Refactor derived Generic instances to reduce allocations
-            # 2016-09-03:   838316496 Derive the Generic instance in perf/compiler/T5642
+       collect_compiler_stats('bytes allocated',10)
       ],
       compile,['-O'])
 
 test('T5837',
      [ only_ways(['normal']),
-      compiler_stats_num_field('bytes allocated',
-          [(wordsize(32), 27028956, 10),
-             # 40000000 (x86/Linux)
-             # 2013-11-13:  45520936 (x86/Windows, 64bit machine)
-             # 2014-09-03:  37096484 (Windows laptop, w/w for INLINABLE things
-             # 2014-12-01: 135914136 (Windows laptop, regression see below)
-             # 2014-12-08: 115905208  Constraint solver perf improvements (esp kick-out)
-             # 2016-04-06: 24199320  (x86/Linux, 64-bit machine) TypeInType
-             # 2017-03-24: 27028956  (x86/Linux, 64-bit machine)
-
-           (platform('x86_64-unknown-mingw32'), 61806136, 7),
-             # 2017-02-19                       59161648 (x64/Windows) - Unknown
-             # 2017-04-21                       54985248 (x64/Windows) - Unknown
-             # 2017-12-24                       54793816 (x64/Windows) - Unknown
-             # 2018-09-23                       61806136 (x64/Windows) - Unknown
-
-           (wordsize(64), 55813608, 7)])
-             # sample: 3926235424 (amd64/Linux, 15/2/2012)
-             # 2012-10-02 81879216
-             # 2012-09-20 87254264 amd64/Linux
-             # 2013-09-18 90587232 amd64/Linux
-             # 2013-11-21 86795752 amd64/Linux, GND via Coercible and counters
-             #                                  for constraints solving
-             # 2014-08-29 73639840 amd64/Linux, w/w for INLINABLE things
-             # 2014-10-08 73639840 amd64/Linux, Burning Bridges and other small changes
-             # 2014-11-06 271028976       Linux, Accept big regression;
-             #   See Note [An alternative story for the inert substitution] in TcFlatten
-             # 2014-12-08 234790312 Constraint solver perf improvements (esp kick-out)
-             # 2014-12-16 231155640 Mac  Flattener parameterized over roles;
-             #                           some optimization
-             # 2015-03-17 53424304  Mac  Better depth checking; fails earlier
-             # 2015-06-09 38834096  Better "improvement"; I'm not sure whey it improves things
-             # 2015-12-11 43877520  amd64/Linux, TypeInType (see #11196)
-             # 2016-03-18 48507272  Mac, accept small regression in exchange
-             #                           for other optimisations
-             # 2016-09-15 42445672  Linux; fixing #12422
-             # 2016-09-25 41832056  amd64/Linux, Rework handling of names (D2469)
-             # 2016-10-25 52597024  amd64/Linux, the test now passes (hooray), and so
-             #                          allocates more because it goes right down the
-             #                          compilation pipeline
-             # 2017-01-24 57861352  amd64/Linux, very likely due to the top-level strings
-             #                          in Core patch.
-             # 2017-02-07 50253880  Another improvement in SetLevels.  I don't think
-             #                      all the gain here is from this patch, but I think it
-             #                      just pushed it over the edge, so I'm re-centreing, and
-             #                      changing to 5% tolerance
-             # 2017-02-07 53592736  amd64/Linux Simon's earlier decrease appears
-             #                      to be environmentally-dependent.
-             #                      Also bumped acceptance threshold to 7%.
-             # 2017-02-20 58648600  amd64/Linux Type-indexed Typeable
-             # 2017-02-28 54151864  amd64/Linux Likely drift due to recent simplifier improvements
-             # 2017-02-25 52625920  amd64/Linux Early inlining patch
-             # 2017-09-06 56782344  amd64/Linux Drift manifest in unrelated LLVM patch
-             # 2017-10-24 52089424  amd64/linux Fix space leak in BinIface.getSymbolTable
-             # 2018-02-19 55813608  amd64/Linux Unknown
+      collect_compiler_stats('bytes allocated',10)
       ],
       compile, ['-freduction-depth=50'])
 
 test('T6048',
      [ only_ways(['optasm']),
-      compiler_stats_num_field('bytes allocated',
-          [(wordsize(32), 55701280, 10),
-            # prev:       38000000 (x86/Linux)
-            # 2012-10-08: 48887164 (x86/Linux)
-            # 2014-04-04: 62618072 (x86 Windows, 64 bit machine)
-            # 2014-09-03: 56315812 (x86 Windows, w/w for INLINABLE)
-            # 2014-12-01: 49987836 (x86 Windows)
-            # 2016-04-06: 55701280 (x86/Linux, 64-bit machine)
-
-           (wordsize(64), 100574504, 10)])
-             # 2012-09-18  97247032 amd64/Linux
-             # 2014-01-16 108578664 amd64/Linux (unknown, likely foldl-via-foldr)
-             # 2014-01-18  95960720 amd64/Linux Call Arity improvements
-             # 2014-02-28 105556793 amd64/Linux (unknown, tweak in base/4d9e7c9e3 resulted in change)
-             # 2014-03-05 110646312 amd64/Linux Call Arity became more elaborate
-             # 2014-07-14 125431448 amd64/Linux unknown reason. Even worse in GHC-7.8.3. *shurg*
-             # 2014-08-29 108354472 amd64/Linux w/w for INLINABLE things
-             # 2014-09-14  88186056 amd64/Linux BPP part1 change (more NoImplicitPreludes in base)
-             # 2014-01-08  95946688 amd64/Linux Mostly 4c834fd. Occasional spikes to 103822120!
-             # 2016-03-11 108225624 amd64/Linux unknown reason sadly; likely gradual creep.
-             # 2016-11-25  94327392 amd64/Linux Back down again hooray; still not sure why
-             # 2017-02-17 115715592 amd64/Linux Type-indexed Typeable
-             # 2017-04-28  90996312 Join point refactoring
-             # 2018-06-18 100574504 Darwin      Unclear
+      collect_compiler_stats('bytes allocated',10)
       ],
       compile,[''])
 
 test('T9020',
      [ only_ways(['optasm']),
-      compiler_stats_num_field('bytes allocated',
-          [(wordsize(32), 249904136, 10),
-           # Original:    381360728
-           # 2014-07-31:  343005716 (Windows) (general round of updates)
-           # 2017-03-24:  249904136 (x86/Linux, 64-bit machine)
-
-           (wordsize(64), 391876936, 10)])
-           # prev:        795469104
-           # 2014-07-17:  728263536 (general round of updates)
-           # 2014-09-10:  785871680 post-AMP-cleanup
-           # 2014-11-03:  680162056 Further Applicative and Monad adjustments
-           # 2015-10-21:  786189008 Make stronglyConnCompFromEdgedVertices deterministic
-           # 2016-01-26:  698401736 improvement from using ExpTypes instead of ReturnTvs
-           # 2016-04-06:  852298336 Refactoring of CSE #11781
-           # 2016-04-06:  698401736 Use thenIO in Applicative IO
-           # 2017-02-03:  764866144 Join points
-           # 2017-02-14:  500707080 Early inline patch; 35% decrease!
-           #                        Program size collapses in first simplification
-           # 2017-03-31:  493596312 Fix memory leak in simplifier
-           # 2017-04-28:  423163832  Remove exponential behaviour in simplifier
-           # 2018-04-09:  562206104 Inexplicable, collateral of #14737
-           # 2018-05-14:  391876936 Improved simplCast performance #15019
+      collect_compiler_stats('bytes allocated',10)
       ],
       compile,[''])
 
 test('T9675',
      [ only_ways(['optasm']),
-       compiler_stats_num_field('max_bytes_used', # Note [residency]
-          [(wordsize(64), 20499224, 15),
-          # 2014-10-13    29596552
-          # 2014-10-13    26570896   seq the DmdEnv in seqDmdType as well
-          # 2014-10-13    18582472   different machines giving different results..
-          # 2014-10-13    22220552   use the mean
-          # 2015-06-21    28056344   switch to `+RTS -G1`, tighten bound to 15%
-          # 2015-10-28    23776640   emit Typeable at definition site
-          # 2015-12-11    30837312   TypeInType (see #11196)
-          # 2016-03-14    38776008   Final demand analyzer run
-          # 2016-04-01    29871032   Fix leaks in demand analysis
-          # 2016-04-30    17675240   Fix leaks in tidy unfoldings
-          # 2018-09-21    20499224   See #15663
-           (wordsize(32), 18043224, 15)
-          # 2015-07-11    15341228   (x86/Linux, 64-bit machine) use +RTS -G1
-          # 2016-04-06    18043224   (x86/Linux, 64-bit machine)
-          ]),
-       compiler_stats_num_field('peak_megabytes_allocated', # Note [residency]
-          [(wordsize(64), 75, 15),
-          # 2014-10-13    66
-          # 2014-10-13    58         seq the DmdEnv in seqDmdType as well
-          # 2014-10-13    49         different machines giving different results...
-          # 2014-10-13    53         use the mean
-          # 2015-06-15    44         reduced for some reason
-          # 2015-06-21    105        switch to `+RTS -G1`
-          # 2015-12-04    88         new pattern checker (D1535)
-          # 2015-12-11    113        TypeInType (see #11196)
-          # 2016-04-14    144        Final demand analyzer run
-          # 2016-07-26    121        Unboxed sums?
-          # 2017-04-30    63         Fix leaks in tidy unfoldings
-          # 2018-09-21    75         See #15663
-            (wordsize(32), 56, 15)
-          # 2015-07-11    56         (x86/Linux, 64-bit machine) use +RTS -G1
-          ]),
-       compiler_stats_num_field('bytes allocated',
-          [(wordsize(64), 656137960, 10)
-          # 2014-10-13    544489040
-          # 2015-10-28    608284152  emit Typeable at definition site
-          # 2017-02-17    731171072  Type-indexed Typeable
-          # 2017-03-13    656137960  Put join ceiling underneath lambdas?
-
-          ,(wordsize(32), 322901484, 10)
-          # 2015-07-11    279480696  (x86/Linux, 64-bit machine) use +RTS -G1
-          # 2017-03-24    322901484  (x86/Linux, 64-bit machine)
-
-          ]),
+       # Note [residency]
+       collect_compiler_stats(['max_bytes_used','peak_megabytes_allocated'],15),
+       collect_compiler_stats('bytes allocated',10),
 
        # Use `+RTS -G1` for more stable residency measurements. Note [residency].
        extra_hc_opts('+RTS -G1 -RTS')
@@ -831,145 +183,40 @@ test('T9675',
 
 test('T9872a',
      [ only_ways(['normal']),
-       compiler_stats_num_field('bytes allocated',
-          [(wordsize(64), 2729927408, 5),
-          # 2014-12-10    5521332656    Initally created
-          # 2014-12-16    5848657456    Flattener parameterized over roles
-          # 2014-12-18    2680733672    Reduce type families even more eagerly
-          # 2015-12-11    3581500440    TypeInType (see #11196)
-          # 2016-04-07    3352882080    CSE improvements
-          # 2016-10-19    3134866040    Refactor traceRn interface (#12617)
-          # 2017-02-17    3298422648    Type-indexed Typeable
-          # 2017-02-25    3005891848    Early inlining patch
-          # 2018-03-26    2729927408    Flattener update with optimizations (#12919)
-
-           (wordsize(32), 1493198244, 5)
-          # was           1325592896
-          # 2016-04-06    1740903516    x86/Linux
-          # 2017-03-24    1493198244    x86/Linux, 64-bit machine
-          ]),
+       collect_compiler_stats('bytes allocated',5)
       ],
      compile_fail,
      [''])
 
 test('T9872b',
      [ only_ways(['normal']),
-       compiler_stats_num_field('bytes allocated',
-          [(wordsize(64), 3730686224, 5),
-          # 2014-12-10    6483306280    Initally created
-          # 2014-12-16    6892251912    Flattener parameterized over roles
-          # 2014-12-18    3480212048    Reduce type families even more eagerly
-          # 2015-12-11    5199926080    TypeInType (see #11196)
-          # 2016-02-08    4918990352    Improved a bit by tyConRolesRepresentational
-          # 2016-04-06:   4600233488    Refactoring of CSE #11781
-          # 2016-09-15:   4069522928    Fix #12422
-          # 2017-02-14    3730686224    Early inlining: 5% improvement
-
-           (wordsize(32), 1894037608, 5)
-          # was           1700000000
-          # 2016-04-06    2422750696    x86/Linux
-          # 2017-03-24    1894037608    x86/Linux, 64-bit machine
-          ]),
+       collect_compiler_stats('bytes allocated',5)
       ],
      compile_fail,
      [''])
 test('T9872c',
      [ only_ways(['normal']),
-       compiler_stats_num_field('bytes allocated',
-          [(wordsize(64), 3096670112, 5),
-          # 2014-12-10    5495850096    Initally created
-          # 2014-12-16    5842024784    Flattener parameterized over roles
-          # 2014-12-18    2963554096    Reduce type families even more eagerly
-          # 2015-12-11    4723613784    TypeInType (see #11196)
-          # 2016-02-08    4454071184    Improved a bit by tyConRolesRepresentational
-          # 2016-04-06:   4306667256    Refactoring of CSE #11781
-          # 2016-09-15:   3702580928    Fixing #12422
-          # 2017-02-14    3404346032    Early inlining: 5% improvement
-          # 2018-03-25    3096670112    Flattener patch with optimizations (#12919)
-
-           (wordsize(32), 1727582260, 5)
-          # was           1500000000
-          # 2016-04-06    2257242896
-          # 2017-03-24    1727582260    x86/Linux, 64-bit machine
-          ]),
+       collect_compiler_stats('bytes allocated',5)
       ],
      compile_fail,
      [''])
 test('T9872d',
      [ only_ways(['normal']),
-       compiler_stats_num_field('bytes allocated',
-          [(wordsize(64), 578498120, 7),
-          # 2014-12-18    796071864   Initally created
-          # 2014-12-18    739189056   Reduce type families even more eagerly
-          # 2015-01-07    687562440   TrieMap leaf compression
-          # 2015-03-17    726679784   tweak to solver; probably flattens more
-          # 2015-12-11    566134504   TypeInType; see #11196
-          # 2016-02-08    534693648   Improved a bit by tyConRolesRepresentational
-          # 2016-03-18    506691240   optimize Unify & zonking
-          # 2016-12-05    478169352   using tyConIsTyFamFree, I think, but only
-          #                           a 1% improvement 482 -> 478
-          # 2017-02-17    535565128   Type-indexed Typeable
-          # 2017-02-25    498855104   Early inlining
-          # 2017-03-03    462817352   Share Typeable KindReps
-          # 2018-03-25    526485920   Flattener patch does more work (#12919)
-          # 2018-04-11    572537984   simplCast improvement collateral (#11735)
-          # 2018-07-04    578498120   introduce GRefl (#15192)
-
-           (wordsize(32), 232954000, 5)
-          # some date     328810212
-          # 2015-07-11    350369584
-          # 2016-04-06    264566040   x86/Linux
-          # 2017-03-24    232954000   x86/Linux, 64-bit machine
-          ]),
+       collect_compiler_stats('bytes allocated',7)  # 7%: the 64-bit window used before the metrics rewrite
      ],
     compile,
     [''])
 
 test('T9961',
      [ only_ways(['normal']),
-       compiler_stats_num_field('bytes allocated',
-          [(wordsize(64), 498326216, 5),
-          # 2015-01-12    807117816   Initally created
-          # 2015-spring   772510192   Got better
-          # 2015-05-22    663978160   Fix for #10370 improves it more
-          # 2015-10-28    708680480   x86_64/Linux   Emit Typeable at definition site
-          # 2015-12-17    745044392   x86_64/Darwin  Creep upwards
-          # 2016-03-20    519436672   x64_64/Linux   Don't use build desugaring for large lists (#11707)
-          # 2016-03-24    568526784   x64_64/Linux   Add eqInt* variants (#11688)
-          # 2016-09-01    537297968   x64_64/Linux   Restore w/w limit (#11565)
-          # 2016-12-19    571246936   x64_64/Linux   Join points (#12988)
-          # 2017-02-14    498326216   Early inline patch; 13% improvement
-
-           (wordsize(32), 255409052, 5)
-          # was           375647160
-          # 2016-04-06    275264188   x86/Linux
-          # 2017-03-24    255409052   x86/Linux, 64-bit machine
-          ]),
+       collect_compiler_stats('bytes allocated',5)
       ],
      compile,
      ['-O'])
 
 test('T9233',
     [ only_ways(['normal']),
-      compiler_stats_num_field('bytes allocated',
-        [(wordsize(64),  973149832, 5),
-         # 2015-08-04    999826288     initial value
-         # 2016-04-14   1066246248     Final demand analyzer run
-         # 2016-06-18    984268712     shuffling around of Data.Functor.Identity
-         # 2017-01-20    920101608     Improvement to SetLevels apparently saved 4.2% in
-         #                             compiler allocation.  Program size seems virtually
-         #                             unchanged; maybe the compiler itself is a little faster
-         # 2017-01-23    861862608     worker/wrapper evald-ness flags; another 5% improvement!
-         # 2017-02-01    894486272     Join points
-         # 2017-02-07    884436192     Another improvement to SetLevels
-         # 2017-02-17    974530192     Type-indexed Typeable
-         # 2017-03-21    924299320     It's unclear
-         # 2018-06-09    973149832     It's unclear
-
-         (wordsize(32),  460112888, 5)
-         # 2016-04-06    515672240     (x86/Linux) initial value
-         # 2017-03-24    460112888     x86/Linux, 64-bit machine
-        ]),
+      collect_compiler_stats('bytes allocated',5),
       extra_clean(['T9233a.hi', 'T9233a.o'])
     ],
     multimod_compile,
@@ -977,42 +224,8 @@ test('T9233',
 
 test('T10370',
      [ only_ways(['optasm']),
-       compiler_stats_num_field('max_bytes_used', # Note [residency]
-          [(wordsize(64), 31524048, 15),
-          # 2015-10-22    19548720
-          # 2016-02-24    22823976   Changing Levity to RuntimeRep; not sure why this regresses though, even after some analysis
-          # 2016-04-14    28256896   final demand analyzer run
-          # 2016-08-08    33049304
-          #     This change happened because we changed the behavior
-          #     of inlining across hs-boot files, so that we don't
-          #     inline if something comes from a boot file.  This
-          #     affected stats on bootstrapped GHC.  However,
-          #     when I set -i0.01 with profiling, the heap profiles
-          #     were identical, so I think it's just GC noise.
-          # 2016-10-20    38221184   Allow top-level string literals.
-          #                          See the comment 16 on #8472.
-          # 2017-02-17    51126304   Type-indexed Typeable
-          # 2017-02-27    43455848   Likely drift from recent simplifier improvements
-          # 2017-02-25    41291976   Early inline patch
-          # 2017-04-30    31524048   Fix leaks in tidy unfoldings
+       collect_compiler_stats(['max_bytes_used','peak_megabytes_allocated'], 15),
 
-           (wordsize(32), 19276304, 15),
-          # 2015-10-22    11371496
-          # 2017-03-24    19276304 (x86/Linux, 64-bit machine)
-          ]),
-       compiler_stats_num_field('peak_megabytes_allocated', # Note [residency]
-          [(wordsize(64), 117, 15),
-          # 2015-10-22     76
-          # 2016-04-14    101 final demand analyzer run
-          # 2016-08-08    121 see above
-          # 2017-01-18    146 Allow top-level string literals in Core
-          # 2017-02-17    187 Type-indexed Typeable
-          # 2017-02-25    154 Early inline patch
-          # 2017-04-30    117 Fix leaks in tidy unfoldings
-           (wordsize(32),  69, 15),
-          # 2015-10-22     39
-          # 2017-03-24     69
-          ]),
        # Use `+RTS -G1` for more stable residency measurements. Note [residency].
        extra_hc_opts('+RTS -G1 -RTS')
      ],
@@ -1020,32 +233,14 @@ test('T10370',
      [''])
 
 test('T10547',
-     [ compiler_stats_num_field('bytes allocated',
-          [(platform('x86_64-unknown-mingw32'), 37485128, 20),
-          # 2017-02-19                         37485128 (x64/Windows) - Unknown
-
-           (wordsize(64), 37681360, 20),
-          # initial:    39165544
-          # 2016-11-25: 31041520 Linux   Around the time of refactoring the constraint solver;
-          #                              but I think that only pushed it over the edge
-          # 2017-02-20: 38681216 Linux   Type-indexed Typeable
-          ]),
+     [ collect_compiler_stats('bytes allocated', 20),
      ],
      compile_fail,
      ['-fprint-expanded-synonyms'])
 
 test('T12227',
      [ only_ways(['normal']),
-       compiler_stats_num_field('bytes allocated',
-          [(wordsize(64), 752214784, 5),
-          # 2016-07-11    5650186880 (Windows) before fix for #12227
-          # 2016-07-11    1822822016 (Windows) after fix for #12227
-          # 2016-12-20    1715827784 after d250d493 (INLINE in Traversable dms)
-          #                          (or thereabouts in the commit history)
-          # 2017-02-14    1060158624  Early inlining: 35% improvement
-          # 2018-01-04    812869424   Drop unused givens (#13032): 23% better
-          # 2018-06-27    752214784   Trac #15421
-          ]),
+       collect_compiler_stats('bytes allocated',5)
      ],
      compile,
      # Use `-M1G` to prevent memory thrashing with ghc-8.0.1.
@@ -1053,52 +248,21 @@ test('T12227',
 
 test('T12425',
      [ only_ways(['optasm']),
-       compiler_stats_num_field('bytes allocated',
-          [(wordsize(64), 139100464, 5),
-          # initial:      125831400
-          # 2017-01-18:   133380960  Allow top-level string literals in Core
-          # 2017-02-17:   153611448  Type-indexed Typeable
-          # 2017-03-03:   142256192  Share Typeable KindReps
-          # 2017-03-21:   134334800  Unclear
-          # 2017-04-28:   127500136  Remove exponential behaviour in simplifier
-          # 2017-05-23:   134780272  Addition of llvm-targets in dynflags (D3352)
-          # 2018-04-15:   141952368  Collateral of #14737
-          # 2018-04-26:   150743648  Do not unpack class dictionaries with INLINABLE
-          # 2018-05-14:   139100464  improved simplCast performance #15019
-          ]),
+       collect_compiler_stats('bytes allocated',5)
      ],
      compile,
      [''])
 
 test('T12234',
      [ only_ways(['optasm']),
-       compiler_stats_num_field('bytes allocated',
-          [(platform('x86_64-unknown-mingw32'), 79889200, 5),
-          # initial:      83032768
-          # 2017-02-19    89180624 (x64/Windows) - Unknown
-          # 2017-02-25    79889200 (x64/Windows) - Early inline patch
-          # 2018-05-04    86938328 (x64/Windows) - Unknown and horrible
-           (wordsize(64), 85961968, 5),
-          # initial:      72958288
-          # 2016-01-17:   76848856  (x86-64, Linux. drift?)
-          # 2017-02-01:   80882208  (Use superclass instances when solving)
-          # 2017-02-05:   74374440  (Probably OccAnal fixes)
-          # 2017-02-17:   86525344  (Type-indexed Typeable)
-          # 2017-02-25:   83032768  (Early inline patch)
-          # 2017-09-07:   81696664  (Semigroup=>Monoid patch, D3927)
-          # 2018-04-26:   85961968  (Do not unpack class dictionaries with INLINABLE)
-          ]),
+       collect_compiler_stats('bytes allocated',5),
      ],
      compile,
      [''])
 
 test('T12545',
      [ only_ways(['normal']),
-       compiler_stats_num_field('bytes allocated',
-          [(wordsize(64), 3249613688, 5),
-          # 2017-06-08    3538652464  initial
-          # 2018-06-27    3249613688  Trac #15421
-          ]),
+       collect_compiler_stats('bytes allocated',5),
        extra_clean(['T12545a.hi', 'T12545a.o'])
      ],
      multimod_compile,
@@ -1106,88 +270,39 @@ test('T12545',
 
 test('T13035',
      [ only_ways(['normal']),
-       compiler_stats_num_field('bytes allocated',
-          [(wordsize(64), 125020728, 5),
-          # 2017-01-05   90595208  initial
-          # 2017-01-19   95269000  Allow top-level string literals in Core
-          # 2017-02-05   88806416  Probably OccAnal fixes
-          # 2017-02-17   103890200 Type-indexed Typeable
-          # 2017-02-25   98390488  Early inline patch
-          # 2017-03-21   93249744  It's unclear
-          # 2017-07-19   118665640 Generate Typeable bindings for data instances
-          # 2018-06-10   125020728 It's unclear
-          ]),
+       collect_compiler_stats('bytes allocated',5),
      ],
      compile,
      [''] )
 
 test('T13056',
      [ only_ways(['optasm']),
-       compiler_stats_num_field('bytes allocated',
-         [(wordsize(64), 440548592, 10),
-         # 2017-01-06    520166912 initial
-         # 2017-01-31    546800240 Join points (#12988)
-         # 2017-02-07    524611224 new SetLevels
-         # 2017-02-14    440548592 Early inline patch: 16% improvement
-         # 2017-04-21    417860736 (darwin)
-         # 2017-04-22    Increase to +/- 10% (Darwin and Linux differ significantly)
-         ]),
+       collect_compiler_stats('bytes allocated',10),
      ],
      compile,
      ['-O1'])
 
 test('T12707',
-     [ compiler_stats_num_field('bytes allocated',
-          [(wordsize(64), 1201750816, 5),
-          # initial:    1271577192
-          # 2017-01-22: 1348865648  Allow top-level strings in Core
-          # 2017-01-31: 1280336112  Join points (#12988)
-          # 2017-02-11: 1310037632  Check local family instances vs imports
-          # 2017-02-23: 1386110512  Type-indexed Typeable? (on Darwin)
-          # 2017-03-02: 1231809592  Drift from recent simplifier improvements
-          # 2017-05-14: 1163821528  (amd64/Linux) Two-pass CmmLayoutStack
-          # 2018-04-09: 1237898376  Inexplicable, collateral of #14737
-          # 2018-04-30: 1141555816  improved simplCast performance #15019
-          # 2018-09-21: 1201750816  (amd64/darwin) Drift
-          ]),
+     [ collect_compiler_stats('bytes allocated',5),
      ],
      compile,
      [''])
 
 test('T12150',
      [ only_ways(['optasm']),
-       compiler_stats_num_field('bytes allocated',
-          [(wordsize(64), 77557800, 10)
-          # initial:    70773000
-          # 2017-08-25: 74358208  Refactor the Mighty Simplifier
-          # 2017-08-25: 78300680  Drift
-          # 2017-10-25: 73769936  amd64/linux Fix space leak in BinIface.getSymbolTable
-          # 2018-04-26: 77557800  Do not unpack class dictionaries with INLINABLE
-          ]),
+       collect_compiler_stats('bytes allocated',10)  # 10%: unchanged from the window used before the metrics rewrite
      ],
     compile,
      [''])
 
 test('T13379',
-     [ compiler_stats_num_field('bytes allocated',
-          [(platform('x86_64-apple-darwin'), 453166912, 10),
-          # 453166912: add osx-specific after two-pass CmmLayoutStack
-           (wordsize(64), 411597856, 10),
-          # initial:    411597856
-          # widen window to 10%, Darwin had 449080520, a 9.1% difference
-          ]),
+     [ collect_compiler_stats('bytes allocated',10),
      ],
      compile,
      [''])
 
 test('MultiLayerModules',
-     [ compiler_stats_num_field('bytes allocated',
-          [(wordsize(64), 5619893176, 10),
-          # initial:    12139116496
-          # 2017-05-12: 6956533312   Revert "Use a deterministic map for imp_dep_mods"
-          # 2017-05-31: 6294813000   Faster checkFamInstConsistency
-          # 2018-01-21: 5619893176   Allocate less in plus_mod_dep
-          ]),
+     [ collect_compiler_stats('bytes allocated',10),
        pre_cmd('./genMultiLayerModules'),
        extra_files(['genMultiLayerModules']),
        compile_timeout_multiplier(5)
@@ -1200,11 +315,7 @@ test('MultiLayerModules',
      ['MultiLayerModules', '-v0'])
 
 test('ManyConstructors',
-     [ compiler_stats_num_field('bytes allocated',
-          [(wordsize(64), 4246959352, 10),
-          # initial:    8130527160
-          # 2018-01-05: 4246959352  Lift constructor tag allocation out of a loop
-          ]),
+     [ collect_compiler_stats('bytes allocated',10),
        pre_cmd('./genManyConstructors'),
        extra_files(['genManyConstructors']),
      ],
@@ -1212,11 +323,7 @@ test('ManyConstructors',
      ['ManyConstructors', '-v0'])
 
 test('ManyAlternatives',
-     [ compiler_stats_num_field('bytes allocated',
-          [(wordsize(64), 1398898072, 10),
-          # initial:    1756999240
-          # 2018-01-20: 1398898072  Use IntSet in Dataflow
-          ]),
+     [ collect_compiler_stats('bytes allocated',10),
        pre_cmd('./genManyAlternatives'),
        extra_files(['genManyAlternatives']),
      ],
@@ -1224,18 +331,7 @@ test('ManyAlternatives',
      ['ManyAlternatives', '-v0'])
 
 test('T13701',
-     [ compiler_stats_num_field('bytes allocated',
-          [(platform('x86_64-apple-darwin'), 2549206272, 10),
-           (platform('x86_64-unknown-linux'), 2413253392, 10),
-           # initial:     2511285600
-           # 2017-06-23:  2188045288    treat banged variable bindings as FunBinds
-           # 2017-07-11:  2187920960
-           # 2017-07-12:  2412223768    inconsistency between Ben's machine and Harbormaster?
-           # 2017-07-17:  2133380768    Resolved the issue causing the inconsistencies in this test
-           # 2018-05-09:  2413253392    D4659 (Fix GHCi space leaks) added
-           #                            some strictness which causes some extra
-           #                            work to be done in this test.
-          ]),
+     [ collect_compiler_stats('bytes allocated',10),
        pre_cmd('./genT13701'),
        extra_files(['genT13701']),
        compile_timeout_multiplier(4)
@@ -1248,11 +344,7 @@ test('T13701',
      ['T13701', '-v0'])
 
 test('T13719',
-     [ compiler_stats_num_field('bytes allocated',
-          [(wordsize(64), 5187889872, 10),
-          # initial:    49907410784
-          # 2017-05-31: 5187889872   Faster checkFamInstConsistency
-          ]),
+     [ collect_compiler_stats('bytes allocated',10),
        pre_cmd('./genT13719'),
        extra_files(['genT13719']),
        compile_timeout_multiplier(2)
@@ -1265,12 +357,7 @@ test('T13719',
      ['T13719', '-v0'])
 
 test('T14697',
-     [ compiler_stats_num_field('bytes allocated',
-          [(wordsize(64), 371030912, 10),
-          # initial:    635573784
-          # 2018-02-23: 337290376 Cache the fingerprint of sOpt_P
-          # 2018-05-24: 371030912 Unknown
-          ]),
+     [ collect_compiler_stats('bytes allocated',10),
        # This generates too large of a command-line for poor Windows
        when(opsys('mingw32'), expect_broken(15072)),
        pre_cmd('./genT14697'),
@@ -1281,12 +368,7 @@ test('T14697',
      ['T14697', '-v0'])
 
 test('T14683',
-     [ compiler_stats_num_field('bytes allocated',
-          [(wordsize(64), 10521594688, 10),
-          # initial:      25189145632
-          # 2018-04-19:   14675353056  Cache NthCo role (#14683)
-          # 2018-04-20:   10521594688  Remove unnecessary check in simplCast
-          ]),
+     [ collect_compiler_stats('bytes allocated',10),
      ],
      multimod_compile,
      ['T14683', '-v0'])
@@ -1294,36 +376,20 @@ test('T14683',
 test('Naperian',
      [ reqlib('vector'),
        only_ways(['optasm']),
-       compiler_stats_num_field('bytes allocated',
-          [(platform('x86_64-unknown-mingw32'), 54116696, 10),
-           # 2017-12-24                       54116696 (x64/Windows) - Unknown
-          (wordsize(64), 53576760, 10)])
-           # 2018-01-25                       53576760 (x64/Linux) - The previous value looked very wrong
+       collect_compiler_stats('bytes allocated',10),
      ],
      compile,
      [''])
 
 test ('T9630',
-      [ compiler_stats_num_field('max_bytes_used', # Note [residency]
-          [(platform('x86_64-unknown-mingw32'),   39867088, 15),
-          # 2017-12-24:                     34171816 (x64/Windows)
-          (wordsize(64), 41365088, 15)
-          # initial:    56955240
-          # 2017-06-07: 41568168     Stop the specialiser generating loopy code
-          # 2018-02-25: 35324712     It's not entirely clear
-          # 2018-09-22: 41365088     It's not entirely clear (x86_64/darwin)
-          ]),
+      [ collect_compiler_stats('max_bytes_used',15), # Note [residency]
       extra_clean(['T9630a.hi', 'T9630a.o'])
       ],
       multimod_compile,
       ['T9630', '-v0 -O'])
 
 test ('T15164',
-      [ compiler_stats_num_field('bytes allocated',
-          [(wordsize(64), 1945564312, 10)
-          # initial:      3423873408
-          # 2018-05-22:   1945564312   Fix bottleneck in CallArity
-          ])
+      [ collect_compiler_stats('bytes allocated',10)
       ],
       compile,
       ['-v0 -O'])
diff --git a/testsuite/tests/perf/haddock/all.T b/testsuite/tests/perf/haddock/all.T
index 4161c2e6d3..fca30366f9 100644
--- a/testsuite/tests/perf/haddock/all.T
+++ b/testsuite/tests/perf/haddock/all.T
@@ -5,72 +5,7 @@
 test('haddock.base',
      [extra_files(['../../../../libraries/base/dist-install/haddock.t']),
       unless(in_tree_compiler(), skip), req_haddock
-     ,stats_num_field('bytes allocated',
-          [(platform('x86_64-unknown-mingw32'), 18733710728, 5)
-            # 2017-02-19                        24286343184 (x64/Windows) - Generalize kind of (->)
-            # 2017-12-24                        18733710728 (x64/Windows) - Unknown
-
-          ,(wordsize(64), 25913205656, 5)
-            # 2012-08-14:  5920822352 (amd64/Linux)
-            # 2012-09-20:  5829972376 (amd64/Linux)
-            # 2012-10-08:  5902601224 (amd64/Linux)
-            # 2013-01-17:  6064874536 (x86_64/Linux)
-            # 2013-02-10:  6282746976 (x86_64/Linux)
-            # 2013-09-17:  6634886456 (x86_64/Linux)
-            # 2013-09-18:  6294339840 (x86_64/Linux)
-            # 2013-11-21:  6756213256 (x86_64/Linux)
-            # 2014-01-12:  7128342344 (x86_64/Linux)
-            # 2014-06-12:  7498123680 (x86_64/Linux)
-            # 2014-08-05:  7992757384 (x86_64/Linux - bugfix for #314, Haddock now parses more URLs)
-            # 2014-08-08:  7946284944 (x86_64/Linux - Haddock updates to attoparsec-0.12.1.0)
-            # 2014-09-09:  8354439016 (x86_64/Linux - Applicative/Monad changes, according to Austin)
-            # 2014-09-10:  7901230808 (x86_64/Linux - Applicative/Monad changes, according to Joachim)
-            # 2014-10-07:  8322584616 (x86_64/Linux)
-            # 2014-12-14:  9502647104 (x86_64/Linux) - Update to Haddock 2.16
-            # 2014-01-08:  9014511528 (x86_64/Linux) - Eliminate so-called "silent superclass parameters" (and others)
-            # 2015-07-22:  9418857192 (x86_64/Linux) - Just slowly creeping up.
-            # 2015-10-03:  9894189856 (x86_64/Linux) - Still creeping
-            # 2015-12-11: 11119767632 (amd64/Linux) - TypeInType (see #11196)
-            # 2015-12-17: 26282821104 (x86_64/Linux) - Update Haddock to master
-            # 2015-12-17: 27812188000 (x86_64/Linux) - Move Data.Functor.* into base
-            # 2016-02-25: 30987348040 (x86_64/Linux) - RuntimeRep
-            # 2016-05-12: 32855223200 (x86_64/Linux) - Make Generic1 poly-kinded
-            # 2017-01-11: 31115778088 (x86_64/Linux) - Join points (#12988)
-            # 2017-02-11: 34819979936 (x86_64/Linux) - OccurAnal / One-Shot  (#13227)
-            # 2017-02-16: 32695562088 Better Lint for join points
-            # 2017-02-17: 38425793776 (x86_64/Linux) - Generalize kind of (->)
-            # 2017-02-12: 25592972912 (x86_64/Linux) - Type-indexed Typeable
-            # 2017-06-05: 27868466432 (x86_64/Linux) - Desugar modules compiled with -fno-code
-            # 2017-06-06: 25173968808 (x86_64/Linux) - Don't pass on -dcore-lint in Haddock.mk
-            # 2017-07-12: 23677299848 (x86_64/Linux) - Use getNameToInstancesIndex
-            # 2017-08-22: 19694554424 (x86_64/Linux) - Various Haddock optimizations
-	        # 2018-03-31: 20980255200 (x86_64/Linux) - Track type variable scope more carefully
-	        # previous to this last commit, the allocations were right below the top
-            # of the range. This commit adds only ~1.5% allocations.
-            # 2018-04-10: 18511324808 (x86_64/Linux) - TTG HsBinds and Data instances
-            # 2018-04-11: 20727464616 (x86_64/Linux) - Collateral of simplCast improvement (#14737)
-            # 2018-04-20: 18971030224 (x86_64/Linux) - Cache coercion roles
-            # 2018-05-14: 21123660336 (amd64/Linux) - D4659: strictness to fix space leaks
-            # 2018-06-14: 24662232152 (amd64/Linux) - Bump haddock
-            # 2018-10-08: 25913205656 (amd64/Linux&OSX) - D5167: Improve GHC.Prim docs
-
-          ,(platform('i386-unknown-mingw32'), 2885173512, 5)
-            # 2013-02-10:                     3358693084 (x86/Windows)
-            # 2013-11-13:                     3097751052 (x86/Windows, 64bit machine)
-            # 2014-04-04:                     3548581572 (x86/Windows, 64bit machine)
-            # 2014-12-01:                     4202377432 (x86/Windows, 64bit machine)
-            # 2015-01-20:                     4434804940 (x86/Windows, 64bit machine)
-            # 2017-04-02:                     2885173512 update
-
-          ,(wordsize(32), 3445319728, 5)])
-            # 2012-08-14: 3046487920 (x86/OSX)
-            # 2012-10-30: 2955470952 (x86/Windows)
-            # 2013-02-10: 3146596848 (x86/OSX)
-            # 2014-02-22: 3554624600 (x86/Linux - new haddock)
-            # 2014-06-29: 3799130400 (x86/Linux)
-            # 2016-04-06: 5509757068 (x86/Linux)
-            # 2017-03-24: 3819657568 (x86/Linux)
-            # 2017-04-06: 3445319728 (x86/Linux)
+     ,collect_stats('bytes allocated',5)
       ],
      stats,
      ['haddock.t'])
@@ -78,78 +13,7 @@ test('haddock.base',
 test('haddock.Cabal',
      [extra_files(['../../../../libraries/Cabal/Cabal/dist-install/haddock.t']),
       unless(in_tree_compiler(), skip), req_haddock
-     ,stats_num_field('bytes allocated',
-          [(wordsize(64), 27520214496, 5)
-            # 2012-08-14:  3255435248 (amd64/Linux)
-            # 2012-08-29:  3324606664 (amd64/Linux, new codegen)
-            # 2012-10-08:  3373401360 (amd64/Linux)
-            # 2013-03-13:  3626604824 (amd64/Linux) Cabal updated
-            # 2013-03-28:  3517301864 (amd64/Linux) fixed #7796
-            # 2013-04-26:  3658801800 (amd64/Linux) Cabal updated
-            # 2013-08-26:  3808466816 (amd64/Linux) Cabal updated
-            # 2013-11-21:  3908586784 (amd64/Linux) Cabal updated
-            # 2013-12-12:  3828567272 (amd64/Linux)
-            # 2014-01-12:  3979151552 (amd64/Linux) new parser
-            # 2014-06-29:  4200993768 (amd64/Linux)
-            # 2014-08-05:  4493770224 (x86_64/Linux - bugfix for #314, Haddock now parses more URLs)
-            # 2014-08-29:  4267311856 (x86_64/Linux - w/w for INLINABLE things)
-            # 2014-09-09:  4660249216 (x86_64/Linux - Applicative/Monad changes according to Austin)
-            # 2014-09-10:  4500376192 (x86_64/Linux - Applicative/Monad changes according to Joachim)
-            # 2014-09-24:  5840893376 (x86_64/Linux - Cabal update)
-            # 2014-10-04:  6019839624 (x86_64/Linux - Burning Bridges, Cabal update)
-            # 2014-12-14:  6387320816 (x86_64/Linux) - Update to Haddock 2.16
-            # 2015-01-22:  6710234312 (x86_64/Linux) - Cabal updated
-            # 2015-06-29:  7413958344 (x86_64/Linux) - due to #10482, not yet investigated
-            # 2015-12-11:  8114833312 (amd64/Linux) - TypeInType (See #11196)
-            # 2015-12-17:  9982130512 (amd64/Linux) - Update Haddock to master
-            # 2015-12-22: 10519532424 (amd64/Linux) - Lots of new Semigroup instances in Cabal
-            # 2016-03-29: 11517963232 (amd64/Linux) - not yet investigated
-            # 2016-03-30: 10941742184 (amd64/Linux) - defer inlining of Int* Ord methods
-            # 2016-04-06: 11542374816 (amd64/Linux) - CSE improvements and others
-            # 2016-04-07: 10963514352 (amd64/Linux) - Revert to what phabricator claims
-            # 2016-05-22: 11805238152 (amd64/Linux) - Make Generic1 poly-kinded
-            # 2016-06-05: 10997887320 (amd64/Linux) - Refactor derived Generic instances to reduce allocations
-            # 2016-06-21: 10070330520 (amd64/Linux) - D2350: Make checkFamInstConsistency less expensive
-            # 2016-08-07: 16001233464 (amd64/Linux) - Cabal update
-            #   It's worth noting that allocations scale up with the number
-            #   of modules in Cabal.  This Cabal update added a large number
-            #   of new modules; if you exclude them from the haddock run
-            #   the stats are comparable.
-            # 2016-10-01: 20619433656 (amd64/Linux) - Cabal update
-            # 2016-10-03: 21554874976 (amd64/Linux) - Cabal update
-            # 2016-10-06: 23706190072 (amd64/Linux) - Cabal update
-            # 2016-12-20: 25478853176 (amd64/Linux) - Cabal update
-            # 2017-01-14: 23272708864 (amd64/Linux) - Join points (#12988)
-            # 2017-02-11: 25533642168 (amd64/Linux) - OccurAnal / One-Shot  (#13227)
-            # 2017-02-16: 23867276992  Better Lint for join points
-            # 2017-02-17: 27784875792 (amd64/Linux) - Generalize kind of (->)
-            # 2017-02-12: 18865432648 (amd64/Linux) - Type-indexed Typeable
-            # 2017-05-31: 18269309128 (amd64/Linux) - Faster checkFamInstConsistency
-            # 2017-06-05: 22294859000 (amd64/Linux) - Desugar modules compiled with -fno-code
-            # 2017-06-05: 18753963960 (amd64/Linux) - Don't pass on -dcore-lint in Haddock.mk
-            # 2017-08-22: 15857428040 (amd64/Linux) - Various Haddock optimizations
-            # 2017-11-02: 17133915848 (amd64/Linux) - Phabricator D4144
-            # 2017-11-06: 18936339648 (amd64/Linux) - Unknown
-            # 2017-11-09: 20104611952 (amd64/Linux) - Bump Cabal
-            # 2018-01-22: 25261834904 (amd64/Linux) - Bump Cabal
-            # 2018-04-10: 23525241536 (amd64/Linux) - TTG HsBinds and Data instances
-            # 2018-05-14: 24519860272 (amd64/Linux) - D4659: strictness to fix space leaks
-            # 2018-06-14: 27520214496 (amd64/Linux) - Bump haddock
-
-          ,(platform('i386-unknown-mingw32'), 3293415576, 5)
-            # 2012-10-30:                     1733638168 (x86/Windows)
-            # 2013-02-10:                     1906532680 (x86/Windows)
-            # 2014-01-28:                     1966911336 (x86/Windows)
-            # 2014-04-24:                     2052220292 (x86/Windows)
-            # 2014-12-01:                     3088635556 (x86/Windows)
-            # 2015-01-20:                     3293415576
-
-          ,(wordsize(32), 3511151136, 5)])
-            # 2012-08-14: 1648610180 (x86/OSX)
-            # 2014-01-22: 1986290624 (x86/Linux)
-            # 2014-06-29: 2127198484 (x86/Linux)
-            # 2016-04-06: 6268156056 (x86/Linux)
-            # 2017-03-24: 3511151136 (x86/Linux)
+     ,collect_stats('bytes allocated',5)
       ],
      stats,
      ['haddock.t'])
@@ -157,49 +21,7 @@ test('haddock.Cabal',
 test('haddock.compiler',
      [extra_files(['../../../../compiler/stage2/haddock.t']),
       unless(in_tree_compiler(), skip), req_haddock
-     ,stats_num_field('bytes allocated',
-          [(platform('x86_64-unknown-mingw32'),   56775301896, 10),
-            # 2017-12-24:                     56775301896 (x64/Windows)
-            (wordsize(64), 63038317672, 10)
-            # 2012-08-14: 26070600504 (amd64/Linux)
-            # 2012-08-29: 26353100288 (amd64/Linux, new CG)
-            # 2012-09-18: 26882813032 (amd64/Linux)
-            # 2012-11-12: 25990254632 (amd64/Linux)
-            # 2014-07-17: 29809571376 (amd64/Linux) general round of updates
-            # 2012-11-27: 28708374824 (amd64/Linux)
-            # 2014-09-10: 30353349160 (amd64/Linux) post-AMP cleanup
-            # 2014-11-22: 33562468736 (amd64/Linux)
-            # 2015-06-02: 36740649320 (amd64/Linux) unknown cause
-            # 2015-06-29: 40624322224 (amd64/Linux) due to #10482, not yet investigated
-            # 2015-12-03: 44721228752 (amd64/Linux) slow creep upwards
-            # 2015-12-15: 49395782136 (amd64/Linux) more creep, following kind-equalities
-            # 2015-12-17: 58017214568 (amd64/Linux) update Haddock to master
-            # 2016-06-21: 55314944264 (amd64/Linux) D2350: Make checkFamInstConsistency less expensive
-            # 2016-11-29: 60911147344 (amd64/Linux) unknown cause
-            # 2017-02-11: 62070477608 (amd64/Linux) OccurAnal / One-Shot  (#13227) (and others)
-            # 2017-02-25: 55777283352 (amd64/Linux) Early inline patch
-            # 2017-05-31: 52762752968 (amd64/Linux) Faster checkFamInstConsistency
-            # 2017-06-05: 65378619232 (amd64/Linux) Desugar modules compiled with -fno-code
-            # 2017-06-06: 55990521024 (amd64/Linux) Don't pass on -dcore-lint in Haddock.mk
-            # 2017-07-12: 51592019560 (amd64/Linux) Use getNameToInstancesIndex
-            # 2018-04-08: 91115212032 (amd64/Linux) Trees that grow
-            # 2018-04-10: 58410358720 (amd64/Linux) Trees that grow (HsBinds, Data instances)
-            # 2018-05-14: 63038317672 (amd64/Linux) D4659: strictness to fix space leaks
-
-          ,(platform('i386-unknown-mingw32'),   367546388, 10)
-            # 2012-10-30:                     13773051312 (x86/Windows)
-            # 2013-02-10:                     14925262356 (x86/Windows)
-            # 2013-11-13:                     14328363592 (x86/Windows, 64bit machine)
-            # 2014-12-01:                       104140852 (x86/Windows, sudden shrinkage!)
-            # 2014-12-10:                       217933548 increased again
-            # 2017-04-02:                       367546388 update
-
-          ,(wordsize(32), 3775852520, 5)])
-            # 2012-08-14: 13471797488 (x86/OSX)
-            # 2014-01-22: 14581475024 (x86/Linux - new haddock)
-            # 2014-06-29: 15110426000 (x86/Linux)
-            # 2016-04-06: 16222702892 (x86/Linux)
-            # 2017-03-24: 3775852520  (x86/Linux)
+     ,collect_stats('bytes allocated',10)
       ],
      stats,
      ['haddock.t'])
diff --git a/testsuite/tests/perf/join_points/all.T b/testsuite/tests/perf/join_points/all.T
index fe202b6487..eedf0c0bff 100644
--- a/testsuite/tests/perf/join_points/all.T
+++ b/testsuite/tests/perf/join_points/all.T
@@ -7,17 +7,15 @@ setTestOpts(f)
 test('join001', normal, compile, [''])
 
 test('join002',
-  [stats_num_field('bytes allocated', [(wordsize(64), 2000290792, 5)])],
+  [collect_stats('bytes allocated',5),],
   compile_and_run,
   [''])
 test('join003',
-  [stats_num_field('bytes allocated', [(wordsize(64), 2000290792, 5)])],
+  [collect_stats('bytes allocated',5),],
   compile_and_run,
   [''])
 test('join004',
-  [stats_num_field('bytes allocated', [(wordsize(64), 16130592, 5)])],
-   # 2017-01-24   48146720    Join point rework
-   # 2017-02-05   16130592    Do Worker/Wrapper for NOINLINE things
+  [collect_stats('bytes allocated',5),],
   compile_and_run,
   [''])
 
@@ -25,11 +23,6 @@ test('join005', normal, compile, [''])
 test('join006', normal, compile, [''])
 
 test('join007',
-  [stats_num_field('bytes allocated',
-      [(platform('x86_64-unknown-mingw32'), 47368, 5),
-       # 2017-02-19                         47368 (x64/Windows) - Unknown
-
-       (wordsize(64), 50944, 5)])],
-       # Initial 50944
+  [collect_stats('bytes allocated',5),],
   compile_and_run,
   [''])
diff --git a/testsuite/tests/perf/should_run/all.T b/testsuite/tests/perf/should_run/all.T
index e3fb136d9f..0b70398e46 100644
--- a/testsuite/tests/perf/should_run/all.T
+++ b/testsuite/tests/perf/should_run/all.T
@@ -3,16 +3,14 @@
 # See Note [Solving from instances when interacting Dicts]
 
 test('T5835',
-     [stats_num_field('max_bytes_used',
-           [(wordsize(64), 44312, 10)]),
+     [collect_stats('max_bytes_used',10),
        only_ways(['normal'])
        ],
      compile_and_run,
      ['-O'])
 
 test('T12791',
-     [stats_num_field('max_bytes_used',
-           [(wordsize(64), 44312, 10)]),
+     [collect_stats('max_bytes_used',10),
        only_ways(['normal'])
        ],
      compile_and_run,
@@ -21,20 +19,14 @@ test('T12791',
 # Tests that newArray/newArray_ is being optimised correctly
 
 test('T10359',
-     [stats_num_field('bytes allocated',
-          [(wordsize(64), 450920, 5),
-           # previously   499512    (amd64/Linux)
-           # 2017-03-10   450920    (amd64/Linux)  Don't generate wrapper for !Int#
-           (wordsize(32), 351508, 5)]),
+     [collect_stats('bytes allocated',5),
       only_ways(['normal'])
       ],
      compile_and_run,
      ['-O'])
 
 test('T14955',
-     [stats_num_field('bytes allocated',
-          [(wordsize(64), 48050760, 5),
-           (wordsize(32), 351508, 5)]),
+     [collect_stats('bytes allocated',5),
       only_ways(['normal'])
       ],
      multimod_compile_and_run,
@@ -43,11 +35,8 @@ test('T14955',
 # fortunately the values here are mostly independent of the wordsize,
 # because the test allocates an unboxed array of doubles.
 test('T3586',
-     [stats_num_field('peak_megabytes_allocated', (17, 1)),
-                                 # expected value: 17 (amd64/Linux)
-      stats_num_field('bytes allocated', (16102024, 5)),
-                        # prev:           16835544 (amd64/Linux)
-                        # 2014-07-17:     16102024 (amd64/Linux), general round of updates
+     [collect_stats('peak_megabytes_allocated',1),
+      collect_stats('bytes allocated', 5),
       only_ways(['normal']),
 
       # Use `+RTS -G1` for more stable residency measurements. Note [residency].
@@ -58,17 +47,7 @@ test('T3586',
      ['-O'])
 
 test('T4830',
-     [stats_num_field('bytes allocated',
-          [(wordsize(64), 98248, 4),
-           #             127000 (amd64/Linux)
-           # 2013-02-07:  99264 (amd64/Linux)
-           # 2014-01-13:  98248 (amd64/Linux) due to #8647
-           # 2015-04-03: Widen 1->4% (amd64/Windows was doing better)
-           (wordsize(32), 70646, 3)]),
-           # 2013-02-10:  69744 (x86/Windows)
-           # 2013-02-10:  71548 (x86/OSX)
-           # 2014-01-28:  Widen range 2->3
-           #                (x86/Windows - actual 69000, lower was 69233)
+     [collect_stats('bytes allocated',4),
       only_ways(['normal'])
       ],
      compile_and_run,
@@ -82,27 +61,8 @@ test('T3245', [when(doing_ghci(), extra_hc_opts('-fobject-code'))],
 #
 test('lazy-bs-alloc',
      [extra_files(['../../numeric/should_run/arith011.stdout']),
-      stats_num_field('peak_megabytes_allocated', (2, 1)),
-                                 # expected value: 2 (amd64/Linux)
-      stats_num_field('bytes allocated',
-          [(wordsize(64), 421792, 5),
-            #             489776 (amd64/Linux)
-            # 2013-02-07: 429744 (amd64/Linux)
-            # 2013-12-12: 425400 (amd64/Linux)
-            # 2015-04-04: Widen 1->3% (amd64/Windows was failing)
-            # 2015-08-15: 431500 (Windows not good enough. avg of Windows&Linux)
-            # 2015-12-15: 444720 (amd64/Linux, D1616)
-            # 2015-12-17: 444720 (widen 3->5%, Windows is at 462688)
-            # 2017-01-30: 421792 (amd64/Linux, strangely Type-indexed Typeable)
-           (wordsize(32), 410040, 5)]),
-            # 2013-02-10: 421296 (x86/Windows)
-            # 2013-02-10: 414180 (x86/OSX)
-            # 2014-01-22: 411500 (x86/Linux)
-            # 2014-01-28: Widen 1->2% (x86/Windows was at 425212)
-            # 2016-04-06: 429760 (x86/Linux) no idea what happened
-            # 2017-02-14: 421448 Early inline patch
-            # 2017-03-24: 410040 It's not entirely clear, widen threshold to match 64-bit case
-
+      collect_stats('peak_megabytes_allocated', 1),
+      collect_stats('bytes allocated',5),
       only_ways(['normal']),
       extra_run_opts('arith011.stdout'),
       ignore_stdout,
@@ -116,25 +76,7 @@ test('lazy-bs-alloc',
      ['-O'])
 
 test('T876',
-     [stats_num_field('bytes allocated',
-          [(platform('x86_64-unknown-mingw32'), 53472, 5),
-              # 2015-04-03: 71904 (amd64/Windows, unknown cause)
-              # 2016-11-27: 66928 (amd64/Windows, unknown cause)
-              # 2017-12-24: 53472 (amd64/Windows, unknown cause)
-
-           (wordsize(64), 58128, 5),
-              # 2013-02-14: 1263712 (x86_64/Linux)
-              # 2014-02-10:   63216 (x86_64/Linux), call arity analysis
-              # 2016-11-11:   58128 (x86_64/Linux), it's not clear
-
-           (wordsize(32), 50408, 5) ]),
-              # some date:  663712  (Windows, 64-bit machine)
-              # 2014-04-04:  56820  (Windows, 64-bit machine)
-              # 2014-06-29:  53024  (x86_64/Linux)
-              # 2014-12-01:  56796  (Windows)
-              # 2015-07-11:  53156  (x86_64/Linux)
-              # 2017-03-24:  50408  (x86/Linux, 64-bit machine)
-
+     [collect_stats('bytes allocated',5),
       only_ways(['normal']),
       extra_run_opts('10000')
       ],
@@ -154,34 +96,16 @@ test('T4321',
 test('T3736', [], run_command, ['$MAKE -s --no-print-directory T3736'])
 test('T3738',
      [extra_clean(['T3738a.hi', 'T3738a.o']),
-      stats_num_field('peak_megabytes_allocated', (2, 0)),
-                    # expected value: 1 (amd64/Linux)
-                    # 2016-08-31:     2 (allocation area size bumped to 1MB)
-      stats_num_field('bytes allocated',
-                      [(wordsize(32), 45648, 5),
-                    # expected value: 50520 (x86/Linux)
-                       (wordsize(64), 50592, 8)]),
-                    # prev:           49400 (amd64/Linux)
-                    # 2014-07-17:     50520 (amd64/Linux) general round of updates
-                    # 2014-09-10:     50592 (amd64/Linux) post-AMP-update
-                    # 2015-04-03: Widen 5->8% (amd64/Windows was doing better)
+      collect_stats('peak_megabytes_allocated', 0),
+      collect_stats('bytes allocated',8),
       only_ways(['normal'])
       ],
      compile_and_run,
      ['-O'])
 
 test('MethSharing',
-     [stats_num_field('peak_megabytes_allocated', (2, 0)),
-                    # expected value: 1 (amd64/Linux)
-                    # 2016-08-31:     2 (allocation area size bumped to 1MB)
-      stats_num_field('bytes allocated',
-                      [(wordsize(32), 240071008, 5),
-                    # expected value: 2685858140 (x86/OS X)
-                    # expected:       360940756 (x86/Linux)
-                    # 2017-03-24:     240071008 (x86/Linux, 64-bit machine)
-                       (wordsize(64), 480098192, 5)]),
-                    # expected:   640067672 (amd64/Linux)
-                    # 2017-01-31: 480098192 work/wrap noinline things
+     [collect_stats('peak_megabytes_allocated',  0),
+      collect_stats('bytes allocated',5),
       only_ways(['normal'])
       ],
      compile_and_run,
@@ -197,9 +121,7 @@ test('T149',
      ['$MAKE -s --no-print-directory T149'])
 
 test('T5113',
-     [stats_num_field('bytes allocated',
-                      [(wordsize(32), 4000000, 5),
-                       (wordsize(64), 8000000, 5)]),
+     [collect_stats('bytes allocated',5),
       only_ways(['normal'])
       ],
      compile_and_run,
@@ -207,103 +129,54 @@ test('T5113',
 
 
 test('T4978',
-     [stats_num_field('bytes allocated',
-                      [(wordsize(32), 10000000, 5),
-                       (wordsize(64), 10137680, 5)]),
-                    # expected value: 10137680 (amd64/Linux)
+     [collect_stats('bytes allocated',5),
       only_ways(['normal'])
       ],
      compile_and_run,
      ['-O2'])
 
 test('T5205',
-     [stats_num_field('bytes allocated',
-                      [(wordsize(32), 49460, 5),
-                    # expected value: 47088 (x86/Darwin)
-                    # 2017-03-24:     49460 (x86/Linux, 64-bit machine)
-
-                       (platform('x86_64-unknown-mingw32'), 52264, 5),
-                    # 2016-12-14: 52264 (Separate out Windows results)
-
-                       (wordsize(64), 56208, 5)]),
-                    # expected value: 51320 (amd64/Linux)
-                    # 2014-07-17:     52600 (amd64/Linux) general round of updates
-                    # 2015-04-03: Widen 5->7% (amd64/Windows was doing better)
-                    # 2015-08-15: 50648 (Windows too good. avg of Windows&Linux)
-                    # 2015-10-30: 56208 (D757: Emit Typeable at definition site)
-                    # 2016-12-14: Narrow 7->5% (Separate out Windows results)
+     [collect_stats('bytes allocated',5),
       only_ways(['normal', 'optasm'])
       ],
      compile_and_run,
      [''])
 
 test('T5549',
-     [stats_num_field('bytes allocated',
-                      [(wordsize(32), 2896607976, 5),
-                    # expected value: 3362958676 (Windows)
-                    # 2014-12-01:     4096606332 (Windows) integer-gmp2
-                    # 2017-03-24:     2896607976 (x86/Linux, 64-bit machine)
-
-                       (wordsize(64), 5793140200, 5)]),
-                    # expected value: 6725846120 (amd64/Linux)
-                    #                 8193140752 (amd64/Linux) integer-gmp2
-                    #                 5793140200 (amd64/Linux) integer-gmp2
+     [collect_stats('bytes allocated',5),
       only_ways(['normal'])
       ],
      compile_and_run,
      ['-O'])
 
 test('T4474a',
-     [stats_num_field('bytes allocated',
-                      [(wordsize(32), 2405242767, 5),
-                       (wordsize(64), 4831890304, 5)]),
-                    # expected value: 4831890304 (amd64/OSX)
+     [collect_stats('bytes allocated',5),
       only_ways(['normal'])
       ],
      compile_and_run,
      ['-O'])
 test('T4474b',
-     [stats_num_field('bytes allocated',
-                      [(wordsize(32), 2405242767, 5),
-                       (wordsize(64), 4831890304, 5)]),
-                    # expected value: 4831890304 (amd64/OSX)
+     [collect_stats('bytes allocated',5),
       only_ways(['normal'])
       ],
      compile_and_run,
      ['-O'])
 test('T4474c',
-     [stats_num_field('bytes allocated',
-                      [(wordsize(32), 2405242767, 5),
-                       (wordsize(64), 4831890304, 5)]),
-                    # expected value: 4831890304 (amd64/OSX)
+     [collect_stats('bytes allocated',5),
       only_ways(['normal'])
       ],
      compile_and_run,
      ['-O'])
 
 test('T5237',
-     [stats_num_field('bytes allocated',
-                        [(platform('i386-unknown-mingw32'), 73280, 5),
-                         (wordsize(32), 78328, 5),
-                      # expected value: 78328 (i386/Linux)
-                         (wordsize(64), 104176, 5)]),
-                      # expected value: 110888 (amd64/Linux)
-                      # expected value: 104176 (amd64/Linux)
+     [collect_stats('bytes allocated',5),
      only_ways(['normal'])
      ],
     compile_and_run,
     ['-O ' + sse2_opts])
 
 test('T5536',
-     [stats_num_field('bytes allocated',
-                      [(wordsize(32), 446260520, 1),
-                                   # 1246287228 (i386/Linux)
-                                    # 446328556 (i386/Windows)
-                                    # 446192484 (i386/OSX)
-                       (wordsize(64), 892399040, 5)]),
-                   # expected value: 2492589480 (amd64/Linux)
-                   # 17/1/13:         892399040 (x86_64/Linux)
-                   #                  (new demand analyser)
+     [collect_stats('bytes allocated',5),  # 5% = former 64-bit window (32-bit used 1%)
      extra_clean(['T5536.data']),
      ignore_stdout,
      only_ways(['normal'])
@@ -312,37 +185,14 @@ test('T5536',
     ['-O'])
 
 test('T7257',
-     [stats_num_field('bytes allocated',
-                      [(wordsize(32), 869850704, 10),
-                        # expected value: 1246287228 (i386/Linux)
-                        # 2016-04-06: 989850664 (i386/Linux) no idea what happened
-                        # 2017-03-25: 869850704 (x86/Linux, 64-bit machine) probably sizeExpr fix
-                       (wordsize(64), 1297293264, 5)]),
-                        # 2012-09-21: 1774893760 (amd64/Linux)
-                        # 2015-11-03: 1654893248 (amd64/Linux)
-                        # 2016-06-22: 1414893248 (amd64/Linux, sizeExpr fix)
-                        # 2018-06-22: 1297293264 (amd64/Linux, atomicModifyMutVar# replacement)
-      stats_num_field('peak_megabytes_allocated',
-                      [(wordsize(32), 217, 5),
-                        # 2012-10-08: 217 (x86/Linux)
-                       (wordsize(64), 227, 5)]),
-                        # 2012-09-21: 227 (amd64/Linux)
-
+     [collect_stats('bytes allocated',10),
+      collect_stats('peak_megabytes_allocated',5),
       only_ways(['normal'])
      ],
     compile_and_run, ['-O'])
 
 test('Conversions',
-     [stats_num_field('bytes allocated',
-                      [(wordsize(32), 76768, 3),
-                        # 2012-12-18: 55316 Guessed 64-bit value / 2
-                        # 2013-02-10: 77472 (x86/OSX)
-                        # 2013-02-10: 79276 (x86/Windows)
-                        # 2014-01-13: 76768 (x86/Linux) due to #8647
-                       (wordsize(64), 107544, 5)]),
-                        # 2012-12-18: 109608 (amd64/OS X)
-                        # 2014-07-17: 107544 (amd64/Linux)
-
+     [collect_stats('bytes allocated',5),  # 5% = former 64-bit window (32-bit used 3%)
       only_ways(['normal'])
      ],
     compile_and_run, ['-O -msse2'])
@@ -351,29 +201,14 @@ test('T7507', omit_ways(['ghci']), compile_and_run, ['-O'])
 # For 7507, stack overflow is the bad case
 
 test('T7436',
-     [stats_num_field('max_bytes_used',
-          [(wordsize(64), 60360, 4),
-           #             127000 (amd64/Linux)
-           # 2013-02-07:  60360 (amd64/Linux)
-           # 2015-04-03: Widen 1->4% (amd64/Windows was doing better)
-           (wordsize(32), 42772, 4)]),
-           # 2013-02-10: 58032 (x86/Windows)
-           # 2013-02-10: 58836 (x86/OSX)
-           # 2017-03-24: 42772 (x86/Linux, 64-bit machine) no idea why
-           # 2017-04-02: Widen 1->4% (i386/Windows was doing better)
+     [collect_stats('max_bytes_used',4),
       only_ways(['normal'])
       ],
      compile_and_run,
      ['-O'])
 
 test('T7797',
-      [stats_num_field('bytes allocated',
-                      [(wordsize(32), 240044984, 5),
-                          # expected value: 2685858140 (x86/OS X)
-                          # expected: 360940756 (x86/Linux)
-                          # expected: 240044984 (x86/Windows, 64bit machine)
-                       (wordsize(64), 480050944, 5)]),
-                          # expected: 480050944 (amd64/Linux)
+      [collect_stats('bytes allocated',5),
       extra_clean(['T7797a.hi', 'T7797a.o']),
       only_ways(['normal'])
       ],
@@ -381,114 +216,62 @@ test('T7797',
      ['-O'])
 
 test('T7954',
-      [stats_num_field('bytes allocated',
-                      [(wordsize(32), 920045264, 10),
-              # some date:  1380051408    (64-bit Windows machine)
-              # 2014-04-04:  920045264    (64-bit Windows machine)
-                       (wordsize(64), 1280051632, 10)]),
-              # 2014-02-10: 1680051336 (x86_64/Linux), call arity analysis
-              # 2018-05-03: 1280051632 (x86_64/Linux), refactor numericEnumFrom
+      [collect_stats('bytes allocated',10),
       only_ways(['normal'])
       ],
      compile_and_run,
      ['-O'])
 
 test('T7850',
-     [stats_num_field('peak_megabytes_allocated',
-                      [(wordsize(32), 2, 10),
-                       (wordsize(64), 4, 10)]),
+     [collect_stats('peak_megabytes_allocated',10),
       only_ways(['normal'])],
      compile_and_run,
      ['-O'])
 
 test('T5949',
-     [stats_num_field('bytes allocated',
-                      [ (wordsize(32), 116020, 10),
-                        (wordsize(64), 201008, 10)]),
-                      # previously, it was >400000 bytes
+     [collect_stats('bytes allocated',10),
       only_ways(['normal'])],
      compile_and_run,
      ['-O'])
 
 test('T4267',
-     [stats_num_field('bytes allocated',
-                      [ (wordsize(32), 36012, 10)
-                      # 32-bit value close to 64 bit; c.f. T7619
-                      , (wordsize(64), 40992, 10) ]),
-                      # previously, it was >170000 bytes
-                      # 2014-01-17: 130000
-                      # 2014-02-10: 40992 (x86_64/Linux), call arity analysis
+     [collect_stats('bytes allocated',10),
      only_ways(['normal'])],
      compile_and_run,
      ['-O'])
 
 test('T7619',
-     [stats_num_field('bytes allocated',
-                      [ (wordsize(32), 36012, 10)
-                      # 32-bit close to 64-bit value; most of this very
-                      # small number is standard start-up boilerplate I think
-                      , (wordsize(64), 40992, 10) ]),
-                      # previously, it was >400000 bytes
+     [collect_stats('bytes allocated',10),
       only_ways(['normal'])],
      compile_and_run,
      ['-O'])
 
 test('InlineArrayAlloc',
-     [stats_num_field('bytes allocated',
-                      [ (wordsize(32), 800040960, 5)
-                      , (wordsize(64), 1600040960, 5) ]),
+     [collect_stats('bytes allocated',5),
       only_ways(['normal'])],
      compile_and_run,
      ['-O2'])
 
 test('InlineByteArrayAlloc',
-     [stats_num_field('bytes allocated',
-                      [ (wordsize(32), 1360036012, 5)
-                      , (wordsize(64), 1440040960, 5) ]),
-         # 32 and 64 bit not so different, because
-         # we are allocating *byte* arrays
+     [collect_stats('bytes allocated',5),
       only_ways(['normal'])],
      compile_and_run,
      ['-O2'])
 
 test('InlineCloneArrayAlloc',
-     [stats_num_field('bytes allocated',
-                      [ (wordsize(32), 800041120, 5)
-                      , (wordsize(64), 1600041120, 5) ]),
+     [collect_stats('bytes allocated',5),
       only_ways(['normal'])],
      compile_and_run,
      ['-O2'])
 
 test('T9203',
-     [stats_num_field('bytes allocated',
-                      [ (wordsize(32), 77969268, 5)
-                      # was
-                      # 2016-04-06     84345136 (i386/Debian) not sure
-                      # 2017-03-24     77969268 (x86/Linux, 64-bit machine) probably join points
-
-                      , (wordsize(64), 98360576, 5) ]),
-                      # was            95747304
-                      # 2019-09-10     94547280 post-AMP cleanup
-                      # 2015-10-28     95451192 emit Typeable at definition site
-                      # 2016-12-19     84620888 Join points
-                      # 2018-07-30     98360576 it's unclear
+     [collect_stats('bytes allocated',5),
       only_ways(['normal'])],
      compile_and_run,
      ['-O2'])
 
 test('T9339',
-     [stats_num_field('bytes allocated',
-                      [ (wordsize(32),    46904, 5)
-                      # is this number correct? Seems very high.
-                      # 2017-03-24:       46904 (x86/Linux, 64-bit machine) who knows
-
-                      , (platform('x86_64-unknown-mingw32'), 47088, 7)
-                      # 2017-02-19                           47088 (x64/Windows) - Unknown
-
-                      , (wordsize(64),       50728, 5) ]),
-                      # w/o fusing last: 320005080
-                      # 2014-07-22:       80050760
-                      # 2016-08-17:          50728 Join points (#12988)
+     [collect_stats('bytes allocated',5),
       only_ways(['normal'])],
      compile_and_run,
      ['-O2 -fspec-constr-keen'])
@@ -496,64 +279,44 @@ test('T9339',
 
 
 test('T8472',
-     [stats_num_field('bytes allocated',
-                      [ (wordsize(32),    50000, 80)
-                      , (wordsize(64),    51424, 80) ]),
+     [collect_stats('bytes allocated',80),
       only_ways(['normal'])],
      compile_and_run,
      ['-O2'])
 
 test('T12996',
-     [stats_num_field('bytes allocated',
-                      [ (wordsize(64),    76776, 5) ]),
+     [collect_stats('bytes allocated',5),
       only_ways(['normal'])],
      compile_and_run,
      ['-O2'])
 
 test('T13001',
-     [stats_num_field('bytes allocated',
-                      [ (wordsize(32),    46728, 20)
-                      , (wordsize(64),    50600, 20) ]),
+     [collect_stats('bytes allocated',20),
       only_ways(['normal'])],
      compile_and_run,
      ['-O2'])
 
 test('T8763',
-     [stats_num_field('bytes allocated',
-                      [ (wordsize(64),    41056, 20) ]),
+     [collect_stats('bytes allocated', 20),
       only_ways(['normal'])],
      compile_and_run,
      ['-O2'])
 
 test('T12990',
-    [stats_num_field('bytes allocated',
-                     [ (wordsize(64), 20040936, 5) ]),
-                     # 2017-01-03     34440936  w/o inlining unsaturated
-                     #                          constructor wrappers
-                     # 2017-01-03     21640904 inline wrappers
-                     # 2017-01-31     20040936 work/wrap noinline things
+    [collect_stats('bytes allocated',5),
      only_ways(['normal'])],
     compile_and_run,
     ['-O2'])
 
 test('T13218',
-    [stats_num_field('bytes allocated',
-                     [ (wordsize(64), 82040056, 5) ]),
-                     # 8.1 with default <$  163644216
-                     # 8.1 with derived <$   82040056
-     stats_num_field('max_bytes_used',
-                     [ (wordsize(64), 359128, 10) ]),
-                     # 8.1 with default <$  64408248
-                     # 8.1 with derived <$    359128
+    [collect_stats('bytes allocated',5),
+     collect_stats('max_bytes_used',10),
      only_ways(['normal'])],
     compile_and_run,
     ['-O'])
 
 test('DeriveNull',
-    [stats_num_field('bytes allocated',
-                    [ (wordsize(64), 112050856, 5) ]),
-                    # 2017-04-01     152083704 w/o derived null
-                    # 2017-04-02     112050856 derive null
+    [collect_stats('bytes allocated',5),
      only_ways(['normal'])],
     compile_and_run,
     ['-O'])
@@ -561,66 +324,41 @@ test('DeriveNull',
 test('DeriveNullTermination', normal, compile_and_run, [''])
 
 test('T13623',
-    [stats_num_field('bytes allocated',
-                    [(platform('x86_64-unknown-mingw32'),   47232, 10),
-                    # 2017-12-24     47232 unknown
-                    (wordsize(64), 50936, 5)]),
-                    # 2017-05-02     50936 initial
+    [collect_stats('bytes allocated', 10),
      only_ways(['normal'])],
     compile_and_run,
     ['-O2'])
 
 test('T14052',
-     [compiler_stats_num_field('bytes allocated',
-                      [ (wordsize(64), 2346183840, 15) ])],
+     [collect_compiler_stats('bytes allocated', 15)],
      ghci_script,
      ['T14052.script'])
 
 test('T14936',
-     [stats_num_field('bytes allocated',
-                      [(platform('x86_64-unknown-mingw32'),   47536, 10),
-                       # 2018-05-04     47536 unknown
-                       (wordsize(64), 51792, 5) ])],
+     [collect_stats('bytes allocated', 10)],
      compile_and_run,
      ['-O2'])
 
 test('T15226',
-    [stats_num_field('bytes allocated',
-                    [(platform('x86_64-unknown-mingw32'),   37488, 4),
-                    # 2018-09-23   37488  Linker changes
-                     (wordsize(64), 41040, 5) ]),
-                    # 2018-06-06   41040  Let the simplifier know the result
-                    #                     of seq# is in WHNF
-                    # initial  400041040
+    [collect_stats('bytes allocated', 5),
      only_ways(['normal'])],
     compile_and_run,
     ['-O'])
 
 test('T15226a',
-    [stats_num_field('bytes allocated',
-                    [(platform('x86_64-unknown-mingw32'),   37488, 4),
-                    # 2018-09-23   37488  Linker changes
-                     (wordsize(64), 41040, 5) ]),
-                    # 2018-06-06   41040  Look through casts for seq#
-                    # initial  400041040
+    [collect_stats('bytes allocated', 5),
      only_ways(['normal'])],
     compile_and_run,
     ['-O'])
 
 test('T15426',
-    [stats_num_field('bytes allocated',
-                    [ (wordsize(64), 41272, 20) ]),
-		    # 2018-08-10   41272  Change findIndices from INLINE to INLINABLE
-		    # initial  160041176
+    [collect_stats('bytes allocated', 20),
      only_ways(['normal'])],
     compile_and_run,
     ['-O2'])
 
 test('T15578',
-    [stats_num_field('bytes allocated',
-                    [ (wordsize(64), 800041456, 5) ]),
-                    # 2018-09-07     800041456   Improvements from #15578
-                    # initial      42400041456
+    [collect_stats('bytes allocated', 5),  # allowed deviation: 5%
      only_ways(['normal'])],
     compile_and_run,
     ['-O2'])
diff --git a/testsuite/tests/perf/space_leaks/all.T b/testsuite/tests/perf/space_leaks/all.T
index a23796d532..1f69d12112 100644
--- a/testsuite/tests/perf/space_leaks/all.T
+++ b/testsuite/tests/perf/space_leaks/all.T
@@ -1,36 +1,10 @@
 
 test('space_leak_001',
-     # Before trac #2747 was fixed this was 565.
-     # Now it's: 3 (amd64/Linux)
-     #           4 (x86/OS X)
-     #           5 (x86/Linux)
-     [stats_num_field('peak_megabytes_allocated', (3, 1)),
-                        # 3 (amd64/Linux, integer-gmp2)
-      stats_num_field('max_bytes_used',
-          [(wordsize(64), 440000, 15),
-                        # 440224 (amd64/Linux)
-                        # 417016 (x86/OS X)
-                        # 415672 (x86/Windows)
-                        # 481456 (unreg amd64/Linux)
-           (wordsize(32), 428220, 10)]),
-             # 2013-02-10 372072 (x86/OSX)
-             # 2013-02-10 439228 (x86/OSX)
-             # 2016-04-06 361400 (x86/Linux)
-             # 2017-03-24 428220 (x86/Linux, 64-bit machine)
-
-      stats_num_field('bytes allocated', 
-           [ (wordsize(64), 11315747416, 5),
-                        # expected value: 9079316016 (amd64/Linux)
-                        #                 9331570416 (x86/Linux)
-                        #                 9329073952 (x86/OS X)
-                        #                 9327959840 (x86/Windows)
-                        #                 11315747416 (amd64/Lnx, integer-gmp2)
-
-             (wordsize(32), 661907800, 5),
-              # 2014-12-01  13550759068 (Windows)
-              # 2017-03-24  661907800   (x86/Linux, 64-bit machine) No idea
-              
-            ]),
+     # This could potentially be replaced with
+     # collect_stats('all', 5) to test all 3 metrics
+     # with a 5% possible deviation.
+     [collect_stats(['peak_megabytes_allocated', 'bytes allocated'], 5),
+      collect_stats('max_bytes_used', 15),
       omit_ways(['profasm','profthreaded','threaded1','threaded2'])
       ],
      compile_and_run,
@@ -39,18 +13,14 @@ test('space_leak_001',
 test('T4334',
      # Test for a space leak in Data.List.lines (fixed with #4334)
      [extra_run_opts('1000000 2 t'),
-      stats_num_field('peak_megabytes_allocated', (2, 1)),
+      collect_stats('peak_megabytes_allocated', 2),
       # prof ways don't work well with +RTS -V0
       omit_ways(['profasm','profthreaded'])
       ],
      compile_and_run, [''])
 
 test('T2762',
-     [stats_num_field('peak_megabytes_allocated', (2, 0)),
-      # peak_megabytes_allocated is 2 with 7.0.2.
-      # Was 57 with 6.12.3.
-      # 2016-08-31:     3 (allocation area size bumped to 1MB)
-      # 2017-02-22:     2 (refactor fiBind)
+     [collect_stats('peak_megabytes_allocated', 2),
       only_ways(['normal']),
       extra_run_opts('+RTS -G1 -RTS' ),
       extra_clean(['T2762A.hi', 'T2762A.o'])],
@@ -61,36 +31,9 @@ test('T4018',
      compile_and_run, ['-fno-state-hack'])
 
 test('T4029',
-     [stats_num_field('peak_megabytes_allocated',
-          [(wordsize(64), 65, 10)]),
-            # 2016-02-26: 66 (amd64/Linux)           INITIAL
-            # 2016-05-23: 82 (amd64/Linux)           Use -G1
-            # 2016-07-13: 92 (amd64/Linux)           Changes to tidyType
-            # 2016-09-01: 71 (amd64/Linux)           Restore w/w limit (#11565)
-            # 2017-02-12: 80 (amd64/Linux)           Type-indexed Typeable
-            # 2017-02-20: 76 (amd64/Linux)           Better reading of iface files
-            # 2017-03-03: 65 (amd64/Linux)           Share Typeable KindReps or more
-            #                                        lazy interface file reading
-      stats_num_field('max_bytes_used',
-          [(wordsize(64), 18208944, 15)]),
-            # 2016-02-26: 24071720 (amd64/Linux)     INITIAL
-            # 2016-04-21: 25542832 (amd64/Linux)
-            # 2016-05-23: 25247216 (amd64/Linux)     Use -G1
-            # 2016-07-13: 27575416 (amd64/Linux)     Changes to tidyType
-            # 2016-07-20: 22920616 (amd64/Linux)     Fix laziness of instance matching
-            # 2016-09-01: 21648488 (amd64/Linux)     Restore w/w limit (#11565)
-            # 2016-10-13: 20325248 (amd64/Linux)     Creep (downwards, yay!)
-            # 2016-11-14: 21387048 (amd64/Linux)     Creep back upwards :(
-            # 2017-01-18: 21670448 (amd64/Linux)     Float string literals to toplevel
-            # 2017-02-07: 22770352 (amd64/Linux)     It is unclear
-            # 2017-02-12: 24151096 (amd64/Linux)     Type-indexed Typeable
-            # 2017-02-20: 22016200 (amd64/Linux)     Better reading of iface files
-            # 2017-03-03: 19172360 (amd64/Linux)     Share Typeable KindReps or more
-            #                                        lazy interface file reading
-            # 2017-03-07: 20476360 (amd64/Linux)     It's not entirely clear
-            # 2017-03-14: 18208944 (amd64/Darwin)    Again, not clear
-            # 2017-03-15: bumped margin to 15% due to instability
+     [collect_stats('peak_megabytes_allocated', 10), collect_stats('max_bytes_used', 15),
       extra_hc_opts('+RTS -G1 -RTS' ),
       ],
      ghci_script,
      ['T4029.script'])
+
diff --git a/testsuite/tests/pmcheck/should_compile/all.T b/testsuite/tests/pmcheck/should_compile/all.T
index 079978b5f5..393ce92463 100644
--- a/testsuite/tests/pmcheck/should_compile/all.T
+++ b/testsuite/tests/pmcheck/should_compile/all.T
@@ -36,26 +36,19 @@ test('T9951b', [], compile,
      ['-fwarn-incomplete-patterns -fwarn-overlapping-patterns'])
 test('T9951', [], compile,
      ['-fwarn-incomplete-patterns -fwarn-overlapping-patterns'])
-test('T11303', normal, compile, ['-fwarn-incomplete-patterns -fwarn-overlapping-patterns +RTS -M1G -RTS'])
-test('T11276', compiler_stats_num_field('bytes allocated',
-  [(wordsize(64), 165890392, 10)]
-    # 2018-07-14: 165890392   INITIAL
-  ), compile, ['-fwarn-incomplete-patterns -fwarn-overlapping-patterns +RTS -M1G -RTS'])
+test('T11303', normal, compile,
+     ['-fwarn-incomplete-patterns -fwarn-overlapping-patterns +RTS -M1G -RTS'])
+test('T11276', collect_compiler_stats('bytes allocated', 10), compile,
+     ['-fwarn-incomplete-patterns -fwarn-overlapping-patterns +RTS -M1G -RTS'])
 
-test('T11303b', compiler_stats_num_field('bytes allocated',
-  [(wordsize(64), 54373936, 10)]
-    # 2018-07-14: 54373936    INITIAL
-  ), compile, ['-fwarn-incomplete-patterns -fwarn-overlapping-patterns +RTS -M1G -RTS'])
+test('T11303b', collect_compiler_stats('bytes allocated', 10), compile,
+     ['-fwarn-incomplete-patterns -fwarn-overlapping-patterns +RTS -M1G -RTS'])
 
-test('T11374', compiler_stats_num_field('bytes allocated',
-  [(wordsize(64), 280144864, 10)]
-    # 2018-07-14: 280144864   INITIAL
-  ), compile, ['-fwarn-incomplete-patterns -fwarn-overlapping-patterns +RTS -M1G -RTS'])
+test('T11374', collect_compiler_stats('bytes allocated', 10), compile,
+     ['-fwarn-incomplete-patterns -fwarn-overlapping-patterns +RTS -M1G -RTS'])
 
-test('T11195', compiler_stats_num_field('bytes allocated',
-  [(wordsize(64), 7852567480, 10)]
-    # 2018-07-14: 7852567480   INITIAL
-  ), compile, ['-package ghc -fwarn-incomplete-patterns -fwarn-overlapping-patterns +RTS -M2G -RTS'])
+test('T11195', collect_compiler_stats('bytes allocated', 10), compile,
+     ['-package ghc -fwarn-incomplete-patterns -fwarn-overlapping-patterns +RTS -M2G -RTS'])
 
 test('T11984', normal, compile,
     ['-fwarn-incomplete-patterns -fwarn-overlapping-patterns'])
diff --git a/testsuite/tests/primops/should_run/all.T b/testsuite/tests/primops/should_run/all.T
index ecf995bea8..c7cdd348bf 100644
--- a/testsuite/tests/primops/should_run/all.T
+++ b/testsuite/tests/primops/should_run/all.T
@@ -7,11 +7,7 @@ test('T4442',
      compile_and_run, [''])
 test('T10481', exit_code(1), compile_and_run, [''])
 test('T10678',
-     [stats_num_field('bytes allocated',
-                      [(wordsize(64), 64004171, 5)
-                       # 2015-11-04: 88041768 +/- 5%  (before runRW#)
-                       # 2015-11-04: 64004171         (after runRW#)
-                      ]),
-      only_ways('normal')
+     [ collect_stats('bytes allocated', 5),
+      only_ways(['normal'])
      ],
      compile_and_run, ['-O'])
diff --git a/testsuite/tests/simplCore/should_compile/all.T b/testsuite/tests/simplCore/should_compile/all.T
index d6b9aa0cf9..1f6ef0059f 100644
--- a/testsuite/tests/simplCore/should_compile/all.T
+++ b/testsuite/tests/simplCore/should_compile/all.T
@@ -152,8 +152,7 @@ test('T7702',
       # we say 18mb peak allocated +/- 70% because other compiler flags have
       # a large effect on allocation which is hard to separate from the
       # allocation done by the plugin... but a regression allocates > 90mb
-      compiler_stats_num_field('peak_megabytes_allocated',
-          [(wordsize(32), 18, 70), (wordsize(64), 18, 70)])
+      collect_compiler_stats('peak_megabytes_allocated',70),
      ],
      compile,
      ['-v0 -package-db T7702plugin/pkg.T7702/local.package.conf -fplugin T7702Plugin -package T7702plugin ' + config.plugin_way_flags])
diff --git a/testsuite/tests/simplStg/should_run/all.T b/testsuite/tests/simplStg/should_run/all.T
index d3aa9376ee..2f7c69f5db 100644
--- a/testsuite/tests/simplStg/should_run/all.T
+++ b/testsuite/tests/simplStg/should_run/all.T
@@ -13,10 +13,9 @@ test('T9291', normal, compile_and_run, [''])
 test('T13536', normal, compile_and_run, [''])
 
 test('T13536a',
-    [stats_num_field('bytes allocated',
-                    [ (wordsize(64), 86664, 5) ]),
-                    # 2017-04-10     86664 -- 25769889696 if broken
+    [collect_stats('bytes allocated', 5),
      only_ways(['optasm'])],
     compile_and_run,
     [''])
 
+