testsuite: Save performance metrics in git notes.

This patch makes the following improvement: - Automatically records test metrics (per test environment) so that the programmer need not supply nor update expected values in *.T files. - On expected metric changes, the programmer need only indicate the direction of change in the git commit message. - Provides a simple python tool "perf_notes.py" to compare metrics over time. Issues: - Using just the previous commit allows performance to drift with each commit. - Currently we allow drift as we have a preference for minimizing false positives. - Some possible alternatives include: - Use metrics from a fixed commit per test: the last commit that allowed a change in performance (else the oldest metric) - Or use some sort of aggregate since the last commit that allowed a change in performance (else all available metrics) - These alternatives may result in a performance issue (with the test driver) having to heavily search git commits/notes. - Run locally, performance tests will trivially pass unless the tests were run locally on the previous commit. This is often not the case e.g. after pulling recent changes. Previously, *.T files contain statements such as: ``` stats_num_field('peak_megabytes_allocated', (2, 1)) compiler_stats_num_field('bytes allocated', [(wordsize(64), 165890392, 10)]) ``` This required the programmer to give the expected values and a tolerance deviation (percentage). With this patch, the above statements are replaced with: ``` collect_stats('peak_megabytes_allocated', 5) collect_compiler_stats('bytes allocated', 10) ``` So that programmer must only enter which metrics to test and a tolerance deviation. No expected value is required. CircleCI will then run the tests per test environment and record the metrics to a git note for that commit and push them to the git.haskell.org ghc repo. Metrics will be compared to the previous commit. If they are different by the tolerance deviation from the *.T file, then the corresponding test will fail. 
By adding to the git commit message e.g. ``` # Metric (In|De)crease <metric(s)> <options>: <tests> Metric Increase ['bytes allocated', 'peak_megabytes_allocated'] \ (test_env='linux_x86', way='default'): Test012, Test345 Metric Decrease 'bytes allocated': Test678 Metric Increase: Test711 ``` This will allow the noted changes (letting the test pass). Note that by omitting metrics or options, the change will apply to all possible metrics/options (i.e. in the above, an increase for all metrics in all test environments is allowed for Test711) phabricator will use the message in the description Reviewers: bgamari, hvr Reviewed By: bgamari Subscribers: rwbarton, carter GHC Trac Issues: #12758 Differential Revision: https://phabricator.haskell.org/D5059
author: David Eichmann <davide@Well-Typed.com> 2018-11-07 12:02:47 -0500
committer: Ben Gamari <ben@smart-cactus.org> 2018-11-07 12:07:11 -0500
commit: 932cd41d8c7984c767c1b3b58e05146f69cc5c15 (patch)
tree: 41e77f048036a19100c5bee508c77b2ab8ec55d4
parent: 82a5c2410a47b16df09039b9786c2c0e34ba130e (diff)
download: haskell-932cd41d8c7984c767c1b3b58e05146f69cc5c15.tar.gz
21 files changed, 1020 insertions, 1737 deletions
diff --git a/.circleci/config.yml b/.circleci/config.yml
index f35690124b..f80b2b321b 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -18,7 +18,7 @@ aliases:
     # ideally we would simply set THREADS here instead of re-detecting it every
     # time we need it below. Unfortunately, there is no way to set an environment
     # variable with the result of a shell script.
-    SKIP_PERF_TESTS: YES
+    SKIP_PERF_TESTS: "NO"  # quoted: a bare NO is a boolean to YAML 1.1 parsers
     VERBOSE: 2
   - &boot
     run:
@@ -32,6 +32,12 @@ aliases:
         include mk/flavours/\$(BuildFlavour).mk
         endif
         EOF
+  - &set_git_identity
+    run:
+      name: Set Git Identity
+      command: |
+        git config user.email "ghc-circleci@haskell.org"
+        git config user.name "GHC CircleCI"
   - &configure_unix
     run:
       name: Configure
@@ -64,10 +70,16 @@ aliases:
       name: Test
       command: |
         mkdir -p test-results
-        make test THREADS=`mk/detect-cpu-count.sh` SKIP_PERF_TESTS=YES JUNIT_FILE=../../test-results/junit.xml
+        METRICS_FILE=$(mktemp)
+        echo "export METRICS_FILE=$METRICS_FILE" >> $BASH_ENV
+        make test THREADS=`mk/detect-cpu-count.sh` SKIP_PERF_TESTS=$SKIP_PERF_TESTS TEST_ENV=$TEST_ENV JUNIT_FILE=../../test-results/junit.xml METRICS_FILE=$METRICS_FILE
   - &store_test_results
     store_test_results:
       path: test-results
+  - &push_perf_note
+    run:
+      name: Push Performance Git Notes
+      command: .circleci/push-test-metrics.sh
   - &slowtest
     run:
       name: Full Test
@@ -102,8 +114,10 @@ jobs:
     environment:
       <<: *buildenv
       GHC_COLLECTOR_FLAVOR: x86_64-linux
+      TEST_ENV: x86_64-linux
     steps:
       - checkout
+      - *set_git_identity
       - *prepare
       - *submodules
       - *boot
@@ -113,6 +127,7 @@ jobs:
       - *storeartifacts
       - *test
       - *store_test_results
+      - *push_perf_note
 
   "validate-x86_64-freebsd":
     resource_class: xlarge
@@ -122,8 +137,10 @@ jobs:
       TARGET: FreeBSD
       <<: *buildenv
       GHC_COLLECTOR_FLAVOR: x86_64-freebsd
+      TEST_ENV: x86_64-freebsd
     steps:
       - checkout
+      - *set_git_identity
       - *prepare
       - *submodules
       - *boot
@@ -133,6 +150,7 @@ jobs:
       - *storeartifacts
       - *test
       - *store_test_results
+      - *push_perf_note
 
   "validate-x86_64-darwin":
     macos:
@@ -147,8 +165,10 @@ jobs:
       # Build with in-tree GMP since this isn't available on OS X by default.
       CONFIGURE_OPTS: --with-intree-gmp
       <<: *buildenv
+      TEST_ENV: x86_64-darwin
     steps:
       - checkout
+      - *set_git_identity
       - *prepare
       - *submodules
       - *boot
@@ -158,6 +178,7 @@ jobs:
       - *storeartifacts
       - *test
       - *store_test_results
+      - *push_perf_note
 
   "validate-hadrian-x86_64-linux":
     resource_class: xlarge
@@ -167,6 +188,7 @@ jobs:
       <<: *buildenv
     steps:
       - checkout
+      - *set_git_identity
       - *prepare
       - *submodules
       - *boot
@@ -179,8 +201,10 @@ jobs:
       - image: ghcci/x86_64-linux:0.0.4
     environment:
       <<: *buildenv
+      TEST_ENV: x86_64-linux-unreg
     steps:
       - checkout
+      - *set_git_identity
       - *prepare
       - *submodules
       - *boot
@@ -188,6 +212,7 @@ jobs:
       - *make
       - *test
       - *store_test_results
+      - *push_perf_note
 
   "validate-x86_64-linux-llvm":
     resource_class: xlarge
@@ -196,6 +221,7 @@ jobs:
     environment:
       <<: *buildenv
       BUILD_FLAVOUR: perf-llvm
+      TEST_ENV: x86_64-linux-llvm
     steps:
       - run:
           name: Install LLVM
@@ -206,12 +232,14 @@ jobs:
           name: Verify that llc works
           command: llc
       - checkout
+      - *set_git_identity
       - *prepare
       - *submodules
       - *boot
       - *configure_unix
       - *make
       - *test
+      - *push_perf_note
 
   # Nightly build with -DDEBUG using devel2 flavour
   "validate-x86_64-linux-debug":
@@ -221,8 +249,11 @@ jobs:
     environment:
       BUILD_FLAVOUR: devel2
       <<: *buildenv
+      TEST_ENV: x86_64-linux-debug
+      SKIP_PERF_TESTS: "YES"  # quoted so the literal string YES, not a YAML boolean, reaches make
     steps:
       - checkout
+      - *set_git_identity
       - *prepare
       - *submodules
       - *boot
@@ -230,6 +261,7 @@ jobs:
       - *make
       - *test
       - *store_test_results
+      - *push_perf_note
 
   "validate-i386-linux":
     resource_class: xlarge
@@ -238,8 +270,10 @@ jobs:
     environment:
       <<: *buildenv
       GHC_COLLECTOR_FLAVOR: i386-linux
+      TEST_ENV: i386-linux
     steps:
       - checkout
+      - *set_git_identity
       - *prepare
       - *submodules
       - *boot
@@ -249,6 +283,7 @@ jobs:
       - *storeartifacts
       - *test
       - *store_test_results
+      - *push_perf_note
 
   "validate-x86_64-fedora":
     resource_class: xlarge
@@ -257,8 +292,10 @@ jobs:
     environment:
       <<: *buildenv
       GHC_COLLECTOR_FLAVOR: x86_64-fedora
+      TEST_ENV: x86_64-fedora
     steps:
       - checkout
+      - *set_git_identity
       - *prepare
       - *submodules
       - *boot
@@ -268,6 +305,7 @@ jobs:
       - *storeartifacts
       - *test
       - *store_test_results
+      - *push_perf_note
 
   "slow-validate-x86_64-linux":
     resource_class: xlarge
@@ -285,6 +323,7 @@ jobs:
       - *make
       - *slowtest
       - *store_test_results
+      - *push_perf_note
 
 workflows:
   version: 2
diff --git a/.circleci/push-test-metrics.sh b/.circleci/push-test-metrics.sh
new file mode 100755
index 0000000000..4ea6958d99
--- /dev/null
+++ b/.circleci/push-test-metrics.sh
@@ -0,0 +1,46 @@
+#!/usr/bin/env bash
+# vim: sw=2 et
+set -euo pipefail
+
+fail() {
+  echo "ERROR: $*" >&2
+  exit 1
+}
+
+GHC_ORIGIN=git@git.haskell.org:ghc
+
+# Add git.haskell.org as a known host.
+echo "|1|F3mPVCE55+KfApNIMYQ3Dv39sGE=|1bRkvJEJhAN2R0LE/lAjFCEJGl0= ecdsa-sha2-nistp256 AAAAE2VjZHNhLXNoYTItbmlzdHAyNTYAAAAIbmlzdHAyNTYAAABBBBUZS9jGBkE5UzpSo6irnIgcQcfzvbuIOsFc8+N61FwtZncRntbaKPuUimOFPgeaUZLl6Iajz6IIs7aduU0/v+I=" >> ~/.ssh/known_hosts
+echo "|1|2VUMjYSRVpT2qJPA0rA9ap9xILY=|5OThkI4ED9V0J+Es7D5FOD55Klk= ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQC+3TLluLAO4lkW60W+N2DFkS+WoRFGqLwHzgd1ifxG9TIm31wChPY3E/hgMnJmgGqWCF4UDUemmyCycEaL7FtKfzjTAclg9EfpQnozyE3T5hIo2WL7SN5O8ttG/bYGuDnn14jLnWwJyN4oz/znWFiDG9e2Oc9YFNlQ+PK8ae5xR4gqBB7EOoj9J1EiPqG2OXRr5Mei3TLsRDU6fnz/e4oFJpKWWeN6M63oePv0qoaGjxcrATZUWsuWrxVMmYo9kP1xRuFJbAUw2m4uVP+793SW1zxySi1HBMtJG+gCDdZZSwYbkV1hassLWBHv1qPttncfX8Zek3Z3VolaTmfWJTo9" >> ~/.ssh/known_hosts
+
+# Check that git notes don't already exist.
+# This is a precaution as we reset refs/notes/perf and we want to avoid data loss.
+if [ $(git notes --ref=perf list | wc -l) -ne 0 ]
+then
+  fail "Found an existing git note on HEAD. Expected no git note."
+fi
+
+# Assert that METRICS_FILE is set (the script runs under 'set -u') and names a readable file.
+if [ -z "${METRICS_FILE:-}" ] || ! [ -r "$METRICS_FILE" ]
+then
+  fail "Metrics file not found: ${METRICS_FILE:-}"
+fi
+
+# Reset the git notes and append the metrics file to the notes, then push and return the result.
+# This is favoured over a git notes merge as it avoids potential data loss/duplication from the merge strategy.
+# Uses the globals GHC_ORIGIN (remote to fetch from / push to) and METRICS_FILE (note content).
+function reset_append_note_push {
+  # Force-overwrite the local refs/notes/perf with the remote's; a failed fetch is tolerated.
+  git fetch -f $GHC_ORIGIN refs/notes/perf:refs/notes/perf || true
+  # Log the command that is about to run.
+  echo "git notes --ref=perf append -F $METRICS_FILE HEAD"
+  git notes --ref=perf append -F $METRICS_FILE HEAD
+  # The push is the last command, so its exit status is the function's result.
+  git push $GHC_ORIGIN refs/notes/perf
+}
+
+# Push the metrics file as a git note. This may fail if another task pushes a note first. In that case
+# the latest note is fetched and appended again; give up once MAX_RETRY retries have been used.
+MAX_RETRY=20
+until reset_append_note_push || [ "$MAX_RETRY" -le 0 ]
+do
+  MAX_RETRY=$((MAX_RETRY - 1))  # plain assignment: its exit status can never trip 'set -e'
+  echo ""
+  echo "Failed to push git notes. Fetching, appending, and retrying..."
+done
diff --git a/libraries/base/tests/all.T b/libraries/base/tests/all.T
index 90af9020d6..aaf4aa2789 100644
--- a/libraries/base/tests/all.T
+++ b/libraries/base/tests/all.T
@@ -176,12 +176,7 @@ test('topHandler04',
 
 
 test('T8766',
-        [ stats_num_field('bytes allocated',
-                          [ (wordsize(64), 16828144, 5)
-	# with GHC-7.6.3: 83937384 (but faster execution than the next line)
-	# before:         58771216 (without call-arity-analysis)
-	# expected value: 16828144 (2014-01-14)
-                          , (wordsize(32), 8433644, 5) ])
+        [ collect_stats('bytes allocated',5)
         , only_ways(['normal'])],
       compile_and_run,
       ['-O'])
@@ -208,9 +203,7 @@ test('T8089',
 test('T8684', expect_broken(8684), compile_and_run, [''])
 test('T9826',normal, compile_and_run,[''])
 test('T9848',
-        [ stats_num_field('bytes allocated',
-                          [ (wordsize(64), 51840, 20)
-                          , (wordsize(32), 47348, 20) ])
+        [ collect_stats('bytes allocated')
         , only_ways(['normal'])],
       compile_and_run,
       ['-O'])
@@ -223,10 +216,7 @@ test('lazySTexamples', normal, compile_and_run, [''])
 test('T11760', req_smp, compile_and_run, ['-threaded -with-rtsopts=-N2'])
 test('T12874', normal, compile_and_run, [''])
 test('T13191',
-        [ stats_num_field('bytes allocated',
-                          [ (wordsize(64), 185943272, 5) ])
-        # with GHC-8.1 before liftA2 change: 325065128
-        # GHC-8.1 with custom liftA2:        185943272
+        [ collect_stats('bytes allocated', 5)
         , only_ways(['normal'])],
       compile_and_run,
       ['-O'])
@@ -234,7 +224,7 @@ test('T13525', when(opsys('mingw32'), skip), compile_and_run, [''])
 test('T13097', normal, compile_and_run, [''])
 test('functorOperators', normal, compile_and_run, [''])
 test('T3474',
-     [stats_num_field('max_bytes_used', [ (wordsize(64), 44504, 5) ]),
+     [collect_stats('max_bytes_used',5),
       only_ways(['normal'])],
      compile_and_run, ['-O'])
 test('T14425', normal, compile_and_run, [''])
diff --git a/testsuite/driver/README.md b/testsuite/driver/README.md
new file mode 100644
index 0000000000..9324fd3df6
--- /dev/null
+++ b/testsuite/driver/README.md
@@ -0,0 +1,133 @@
+GHC Driver Readme
+=================
+
+Greetings and well met.  If you are reading this, I can only assume that you
+are likely interested in working on the testsuite in some capacity.  For more
+detailed documentation, please see [here][1].
+
+## ToC
+
+1. Entry points of the testsuite performance tests
+2. Quick overview of program parts
+3. How to use the comparison tool
+4. Important Types
+5. Quick answers for "how do I do X"?
+
+
+## Entry Points of the testsuite performance tests
+
+The testsuite has two main entry points depending on which perspective you
+approach it.  From the perspective of the test writer, the entry point is the
+collect_stats function called in *.T files.  This function is declared in
+perf_notes.py along with its associated infrastructure.  The purpose of this
+function is to tell the test driver what metrics to compare when processing
+the test. From the perspective of running the test-suite e.g. via make, its
+entry point is the runtests.py file. That file contains the main logic for
+running the individual tests, collecting information, handling failure, and
+outputting the final results.
+
+## Overview of how the performance test bits work.
+During a Haskell Summer of Code project, an intern went through and revamped
+most of the performance test code, as such there have been a few changes to it
+that might be unusual to anyone previously familiar with the testsuite. One of
+the biggest immediate benefits is that all platform differences, compiler
+differences, and things such as that are not necessary to be considered by the
+test writer anymore. This is due to the fact that the test comparison relies
+entirely on locally collected metrics on the testing machine.
+
+As such, it is perfectly sufficient to write `collect_stats('all',20)` in the
+".T" files to measure the 3 potential stats that can be collected for that test
+and automatically test them for regressions, failing if there is more than a 20%
+change in any direction. In fact, even that is not necessary as
+`collect_stats()` defaults to 'all', and 20% deviation allowed.
+
+The function `collect_compiler_stats()` is completely equivalent in every way to
+`collect_stats` except that it measures the performance of the compiler itself
+rather than the performance of the code generated by the compiler. See the
+implementation of collect_stats in /driver/testlib.py for more information.
+
+If the performance of a test is improved so much that the test fails, the value
+will still be recorded. The warning that will be emitted is merely a precaution
+so that the programmer can double-check that they didn't introduce a bug;
+something that might be suspicious if the test suddenly improves by 70%,
+for example.
+
+Performance metrics for performance tests are now stored in git notes under the
+namespace 'perf'.  The format of the git note file is that each line represents
+a single metric for a particular test: `$test_env $test_name $test_way
+$metric_measured $value_collected` (delimited by tabs).
+
+One can view the maximum deviation a test allows by looking inside its
+respective all.T file; additionally, if one sets the verbosity level of the
+test-suite to a value >= 4, they will see a good amount of output per test
+detailing all the information about values.  This information will also print
+if the test falls outside of the allowed bounds.  (see the test_cmp function in
+/driver/perf_notes.py for exact formatting of the message)
+
+The git notes are only appended to by the testsuite in a single atomic python
+subprocess at the end of the test run; if the run is canceled at any time, the
+notes will not be written.  The note appending command will be retried up to 4
+times in the event of a failure (such as one happening due to a lock on the
+repo) although this is never anticipated to happen.  If, for some reason, the 5
+attempts were not enough, an error message will be printed out.  Further, there
+is no current process or method for stripping duplicates, updating values, etc,
+so if the testsuite is run multiple times per commit there will be multiple
+values in the git notes corresponding to the tests run.  In this case the
+average value is used.
+
+## Quick overview of program parts
+
+The relevant bits of the directory tree are as such:
+
+```
+├── driver                   -- Testsuite driver directory
+    ├── junit.py             -- Contains code implementing JUnit features.
+    ├── kill_extra_files.py  -- Some of the uglier implementation details.
+    ├── perf_notes.py        -- Comparison tool and performance tests.
+    ├── runtests.py          -- Main entrypoint for program; runs tests.
+    ├── testglobals.py       -- Global data structures and objects.
+    ├── testlib.py           -- Bulk of implementation is in here.
+    └── testutil.py          -- Misc helper functions.
+├── mk
+    └── test.mk              -- Master makefile for running tests.
+├── tests                    -- Main tests directory.
+```
+
+## How to Use the Comparison Tool
+
+The comparison tool exists in `/driver/perf_notes.py`.
+
+When the testsuite is run, the performance metrics of the performance tests are
+saved automatically in a local git note that will be attached to the commit.
+The comparison tool is designed to help analyze performance metrics across
+commits using this performance information.
+
+Currently, it can only be run by executing the file directly, like so:
+```
+$ python3 perf_notes.py (arguments go here)
+```
+
+If you run `perf_notes.py -h` you will see a description of all of the
+arguments and how to use them.  The optional arguments exist to filter the
+output to include only commits that you're interested in.  The most typical
+usage of this tool will likely be running `perf_notes.py HEAD 'HEAD~1' '(commit
+hash)' ...`
+
+The way the performance metrics are stored in git notes remains strictly local
+to the machine; as such, performance metrics will not exist for a commit until
+you checkout that commit and run the testsuite (or test).
+
+## Quick Answers for "How do I do X?"
+
+* Q: How do I add a flag to "make test" to extend the testsuite functionality?
+    1. Add the flag in the appropriate global object in testglobals.py
+    2. Add an argument to the parser in runtests.py that sets the flag
+    3. Go to the `testsuite/mk/test.mk` file and add a new ifeq (or ifneq)
+        block. I suggest adding the block around line 200.
+* Q: How do I modify how performance tests work?
+    * That functionality resides in perf_notes.py which has pretty good
+      in-code documentation.
+    * Additionally, one will want to look at `compile_and_run`, `simple_run`,
+      and `simple_build` in testutil.py
+
+  [1]: http://ghc.haskell.org/trac/ghc/wiki/Building/RunningTests
diff --git a/testsuite/driver/perf_notes.py b/testsuite/driver/perf_notes.py
new file mode 100644
index 0000000000..f162164e3e
--- /dev/null
+++ b/testsuite/driver/perf_notes.py
@@ -0,0 +1,382 @@
+#!/usr/bin/env python3
+
+#
+# (c) Jared Weakly 2017
+#
+# This file will be a utility to help facilitate the comparison of performance
+# metrics across arbitrary commits. The file will produce a table comparing
+# metrics between measurements taken for given commits in the environment
+# (which defaults to 'local' if not given by --test-env).
+#
+
+import argparse
+import re
+import subprocess
+import time
+
+from collections import namedtuple
+from math import ceil, trunc
+
+from testutil import passed, failBecause
+
+
+#
+# Some data access functions. At the moment this uses git notes.
+#
+
+# The metrics (a.k.a stats) are named tuples, PerfStat, in this form:
+#
+# ( test_env : 'val',      # Test environment.
+#   test     : 'val',      # Name of the test
+#   way      : 'val',      # Way the test was run in (presumably e.g. 'normal' -- confirm)
+#   metric   : 'val',      # Metric being recorded
+#   value    : 'val',      # The statistic result e.g. runtime
+# )
+#
+# The field order below is also the column order of the tab-separated text
+# that parse_perf_stat() reads and format_perf_stat() writes.
+
+# All the fields of a metric (excluding commit field).
+PerfStat = namedtuple('PerfStat', ['test_env','test','way','metric','value'])
+
+# String constants naming the possible outcomes of comparing a metric with its
+# expected value. 'Increase' and 'Decrease' are also the direction words that
+# parse_allowed_perf_changes() extracts from commit messages.
+class MetricChange:
+    # Presumably: no earlier value exists to compare against -- confirm with callers.
+    NewMetric = 'NewMetric'
+    # The value lies within the tolerated bounds.
+    NoChange = 'NoChange'
+    # The value is above the upper bound.
+    Increase = 'Increase'
+    # The value is below the lower bound.
+    Decrease = 'Decrease'
+
+# Turn one tab-separated line of text into a PerfStat. Every field stays a string.
+def parse_perf_stat(stat_str):
+    # Trim stray tabs at either end before splitting into columns.
+    columns = stat_str.strip('\t').split('\t')
+    return PerfStat(*columns)
+
+# Read every metric stored in the git note (under `namespace`) of `commit`.
+# Returns a list of PerfStat; the list is empty when `git notes show` fails,
+# e.g. because the commit has no note.
+def get_perf_stats(commit='HEAD', namespace='perf'):
+    show_cmd = ['git', 'notes', '--ref=' + namespace, 'show', commit]
+    try:
+        note_text = subprocess.check_output(show_cmd, stderr=subprocess.STDOUT).decode('utf-8')
+    except subprocess.CalledProcessError:
+        return []
+
+    # One metric per line; blank lines are ignored.
+    note_lines = note_text.strip('\n').split('\n')
+    return [parse_perf_stat(note_line) for note_line in note_lines if note_line]
+
+
+# Get allowed changes to performance. This is extracted from the commit message of
+# the given commit in this form:
+#     Metric  (Increase | Decrease)  ['metric' | \['metrics',..\]]  [\((test_env|way)='abc',...\)]: TestName01, TestName02, ...
+# Returns a *dictionary* from test name to a *list* of items of the form:
+#   {
+#           'direction': either 'Increase' or 'Decrease',
+#           'metrics': ['metricA', 'metricB', ...],
+#           'opts': {
+#                   'optionA': 'string value',
+#                   'optionB': 'string value',
+#                   ...
+#               }
+#   }
+# The message is read with `git log -n1 --format=%B <commit>`; the parsing itself
+# is done by parse_allowed_perf_changes().
+def get_allowed_perf_changes(commit='HEAD'):
+    commitByteStr = subprocess.check_output(['git', '--no-pager', 'log', '-n1', '--format=%B', commit])
+    return parse_allowed_perf_changes(commitByteStr.decode())
+
+# Parse "Metric Increase/Decrease ..." statements out of a commit message.
+# commitMsg: the full commit message text.
+# Returns a dictionary from test name to a list of
+# {'direction', 'metrics', 'opts'} dictionaries, one per statement naming that test.
+def parse_allowed_perf_changes(commitMsg):
+    # Helper regex. Non-capturing unless postfixed with Cap.
+    s = r"(?:\s*\n?\s+)"                                    # Space, possible new line with an indent.
+    qstr = r"(?:'(?:[^'\\]|\\.)*')"                         # Quoted string.
+    qstrCap = r"(?:'((?:[^'\\]|\\.)*)')"                    # Quoted string. Captures the string without the quotes.
+    innerQstrList = r"(?:"+qstr+r"(?:"+s+r"?,"+s+r"?"+qstr+r")*)?"     # Inside of a list of strings.
+    qstrList = r"(?:\["+s+r"?"+innerQstrList+s+r"?\])"      # A list of strings (using box brackets).
+
+    exp = (r"^Metric"
+        +s+r"(Increase|Decrease)"
+        +s+r"?("+qstr+r"|"+qstrList+r")?"                   # Metric or list of metrics.
+        +s+r"?(\(" + r"(?:[^')]|"+qstr+r")*" + r"\))?"      # Options surrounded by parentheses (parentheses are allowed inside quoted strings).
+        +s+r"?:?"                                           # Optional ":"
+        +s+r"?((?:(?!\n\n)(?!\n[^\s])(?:.|\n))*)"           # Test names. Stop parsing on empty or non-indented new line.
+        )
+
+    # re.M lets "^Metric" match at the start of any line of the message.
+    matches = re.findall(exp, commitMsg, re.M)
+    changes = {}
+    for (direction, metrics_str, opts_str, tests_str) in matches:
+        # Every run of word characters in the trailing part is taken as a test name.
+        tests = re.findall(r"(\w+)", tests_str)
+        for test in tests:
+            changes.setdefault(test, []).append({
+                'direction': direction,
+                # Empty list when the statement names no metrics.
+                'metrics': re.findall(qstrCap, metrics_str),
+                # key='value' pairs found inside the parenthesised options.
+                'opts': dict(re.findall(r"(\w+)"+s+r"?="+s+r"?"+qstrCap, opts_str))
+            })
+
+    return changes
+
+# Calculates a suggested string to append to the git commit in order to accept the
+# given changes.
+# changes: [(MetricChange, PerfStat)]
+# Returns the "Metric Increase/Decrease" statements joined by blank lines; the
+# result is an empty string when no change is an Increase or a Decrease.
+def allow_changes_string(changes):
+    Dec = MetricChange.Decrease
+    Inc = MetricChange.Increase
+
+    # We only care about increase / decrease metrics.
+    changes = [change for change in changes if change[0] in [Inc, Dec]]
+
+    # Map tests to a map from change direction to metrics.
+    test_to_dir_to_metrics = {}
+    for (change, perf_stat) in changes:
+        change_dir_to_metrics = test_to_dir_to_metrics.setdefault(perf_stat.test, { Inc: [], Dec: [] })
+        change_dir_to_metrics[change].append(perf_stat.metric)
+
+    # Split into 3 groups.
+    # Tests where all changes are *increasing*.
+    # Tests where all changes are *decreasing*.
+    # Tests where changes are *mixed* increasing and decreasing.
+    groupDec = []
+    groupInc = []
+    groupMix = []
+    for (test, decsAndIncs) in test_to_dir_to_metrics.items():
+        decs = decsAndIncs[Dec]
+        incs = decsAndIncs[Inc]
+        if decs and incs:
+            groupMix.append(test)
+        elif not decs:
+            groupInc.append(test)
+        else:
+            groupDec.append(test)
+
+    msgs = []
+    # Separator that puts each test name on its own indented line.
+    nltab = '\n    '
+
+    # Decreasing group.
+    # No metric is named, so the statement covers every metric of the listed tests.
+    if groupDec:
+        msgs.append('Metric Decrease:' + nltab + nltab.join(groupDec))
+
+    # Increasing group.
+    if groupInc:
+        msgs.append('Metric Increase:' + nltab + nltab.join(groupInc))
+
+    # Mixed group.
+    if groupMix:
+        # Split mixed group tests by decrease/increase, then by metric.
+        dir_to_metric_to_tests = {
+                Dec: {},
+                Inc: {}
+            }
+        for test in groupMix:
+            for change_dir, metrics in test_to_dir_to_metrics[test].items():
+                for metric in metrics:
+                    dir_to_metric_to_tests[change_dir].setdefault(metric, []).append(test)
+
+        # One statement per (direction, metric): decreases first, metrics in sorted order.
+        for change_dir in [Dec, Inc]:
+            metric_to_tests = dir_to_metric_to_tests[change_dir]
+            for metric in sorted(metric_to_tests.keys()):
+                tests = metric_to_tests[metric]
+                msgs.append('Metric ' + change_dir + ' \'' + metric + '\':' + nltab + nltab.join(tests))
+
+    return '\n\n'.join(msgs)
+
+# Render one metric, or a list of metrics, as text: one line per metric, with
+# the fields of each metric separated by tabs. Used e.g. to save metrics to a
+# file or git note.
+def format_perf_stat(stats):
+    # A lone stat is treated as a one-element list.
+    stat_list = stats if isinstance(stats, list) else [stats]
+
+    rendered = []
+    for stat in stat_list:
+        rendered.append("\t".join(map(str, stat)))
+    return "\n".join(rendered)
+
+# Appends a list of metrics to the git note of the given commit.
+# stats: a single PerfStat or a list of PerfStats.
+# Tries up to max_tries times to write to git notes should it fail for some reason,
+# waiting 1 second between attempts.
+# Returns True if the note was successfully appended, False otherwise.
+def append_perf_stat(stats, commit='HEAD', namespace='perf', max_tries=5):
+    # Wrap a lone stat in a list so the count printed below is the number of
+    # metrics rather than the number of fields in one PerfStat.
+    if not isinstance(stats, list):
+        stats = [stats]
+
+    # Append to git note
+    print('Appending ' + str(len(stats)) + ' stats to git notes.')
+    stats_str = format_perf_stat(stats)
+    append_cmd = ['git', 'notes', '--ref=' + namespace, 'append', commit, '-m', stats_str]
+
+    def try_append():
+        # check_output raises on a non-zero exit status; that is the failure signal.
+        try:
+            subprocess.check_output(append_cmd)
+            return True
+        except subprocess.CalledProcessError:
+            return False
+
+    for attempt in range(max_tries):
+        if try_append():
+            return True
+        # Only wait when another attempt will actually follow.
+        if attempt + 1 < max_tries:
+            time.sleep(1)
+
+    print("\nAn error occurred while writing the performance metrics to git notes.\n"
+          "This is usually due to a lock-file existing somewhere in the git repo.")
+
+    return False
+
+# Check test stats. This prints the results for the user.
+# actual: the PerfStat with actual value.
+# expected_val: the expected value (this should generally be derived from get_perf_stats())
+# tolerance_dev: allowed deviation of the actual value from the expected value, as a percentage.
+# allowed_perf_changes: allowed changes in stats. This is a dictionary as returned by get_allowed_perf_changes().
+#                       It is only read here, never modified.
+# force_print: Print stats even if the test stat was in the tolerance range.
+# Returns a (MetricChange, pass/fail object) tuple. Passes if the stats are within the expected value ranges.
+def check_stats_change(actual, expected_val, tolerance_dev, allowed_perf_changes = {}, force_print = False):
+    full_name = actual.test + ' (' + actual.way + ')'
+
+    expected_int = int(expected_val)
+    lowerBound = trunc(           expected_int * ((100 - float(tolerance_dev))/100))
+    upperBound = trunc(0.5 + ceil(expected_int * ((100 + float(tolerance_dev))/100)))
+
+    # The relative deviation is undefined when the expected value is 0; keep it
+    # as None instead of raising ZeroDivisionError.
+    actual_dev = None
+    if expected_int != 0:
+        actual_dev = round(((float(actual.value) * 100)/ expected_int) - 100, 1)
+
+    # Find the direction of change.
+    change = MetricChange.NoChange
+    if actual.value < lowerBound:
+        change = MetricChange.Decrease
+    elif actual.value > upperBound:
+        change = MetricChange.Increase
+
+    # Is the change allowed?
+    allowed_change_directions =  [MetricChange.NoChange] + [ allow_stmt['direction']
+            for allow_stmt in allowed_perf_changes.get(actual.test, [])
+
+            # List of metrics are not specified or the metric is in the list of metrics.
+            if not allow_stmt['metrics'] or actual.metric in allow_stmt['metrics']
+
+            # way/test_env are not specified, or match the actual way/test_env.
+            if ((not 'way'      in allow_stmt['opts'].keys()) or actual.way      == allow_stmt['opts']['way'])
+            if ((not 'test_env' in allow_stmt['opts'].keys()) or actual.test_env == allow_stmt['opts']['test_env'])
+        ]
+    change_allowed = change in allowed_change_directions
+
+    # Print errors and create pass/fail object.
+    result = passed()
+    if not change_allowed:
+        error = change + ' not allowed'
+        print(actual.metric, error + ':')
+        result = failBecause('stat ' + error, tag='stat')
+
+    if not change_allowed or force_print:
+        # Right-align all numbers to the width of the widest one.
+        length = max(len(str(x)) for x in [expected_val, lowerBound, upperBound, actual.value])
+
+        def display(descr, val, extra):
+            print(descr, str(val).rjust(length), extra)
+
+        display('    Expected    ' + full_name + ' ' + actual.metric + ':', expected_val, '+/-' + str(tolerance_dev) + '%')
+        display('    Lower bound ' + full_name + ' ' + actual.metric + ':', lowerBound, '')
+        display('    Upper bound ' + full_name + ' ' + actual.metric + ':', upperBound, '')
+        display('    Actual      ' + full_name + ' ' + actual.metric + ':', actual.value, '')
+        if actual.value != expected_val:
+            if actual_dev is None:
+                # No percentage can be given relative to an expected value of 0.
+                display('    Deviation   ' + full_name + ' ' + actual.metric + ':', 'n/a', '')
+            else:
+                display('    Deviation   ' + full_name + ' ' + actual.metric + ':', actual_dev, '%')
+
+    return (change, result)
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--test-env",
+                        help="The given test environment to be compared.")
+    parser.add_argument("--test-name",
+                        help="If given, filters table to include only \
+                        tests matching the given regular expression.")
+    parser.add_argument("--add-note", nargs=3,
+                        help="Development only. --add-note N commit seed \
+                        Adds N fake metrics to the given commit using the random seed.")
+    parser.add_argument("commits", nargs=argparse.REMAINDER,
+                        help="The rest of the arguments will be the commits that will be used.")
+    args = parser.parse_args()
+
+    env = 'local'
+    name = re.compile('.*')
+    # metrics is a tuple (str commit, PerfStat stat)
+    CommitAndStat = namedtuple('CommitAndStat', ['commit', 'stat'])
+    metrics = []
+    singleton_commit = len(args.commits) == 1
+
+    #
+    # Main logic of the program when called from the command-line.
+    #
+
+    if args.commits:
+        for c in args.commits:
+            metrics += [CommitAndStat(c, stat) for stat in get_perf_stats(c)]
+
+    if args.test_env:
+        metrics = [test for test in metrics if test.stat.test_env == args.test_env]
+
+    if args.test_name:
+        # Each entry is a CommitAndStat, so the test name lives on its PerfStat.
+        metrics = [test for test in metrics if re.search(args.test_name, test.stat.test)]
+
+    if args.add_note:
+        def note_gen(n, commit, delta=''):
+            note = []
+            # Generates simple fake data. Likely not comprehensive enough to catch all edge cases.
+            if not delta:
+                note.extend([PerfStat('local', 'T'+ str(i*100), 'some_way', 'some_field', str(i*1000)) for i in range(1,int(int(n)/2)+1)])
+                note.extend([PerfStat('non-local', 'W'+ str(i*100), 'other_way', 'other_field', str(i*100)) for i in range(int(int(n)/2)+1,int(n)+1)])
+            if delta:
+                hu = abs(hash(delta))
+                hv = abs(hash(hu))
+                u = int(hu % 100)
+                v = int(hv % 10)
+                note.extend([PerfStat('local', 'T'+ str(i*100), 'some_way', 'some_field', str(i*u)) for i in range(1,int(int(n)/2)+1)])
+                note.extend([PerfStat('non-local', 'W'+ str(i*100), 'other_way', 'other_field', str(i*v)) for i in range(int(int(n)/2)+1,int(n)+1)])
+
+            append_perf_stat(note, commit)
+
+        note_gen(args.add_note[0],args.add_note[1],args.add_note[2])
+
+    #
+    # String utilities for pretty-printing
+    #
+
+    row_fmt = '{:18}' * len(args.commits)
+    commits = row_fmt.format(*[c[:10] for c in args.commits])
+
+    def cmtline(insert):
+        return row_fmt.format(*[insert for c in args.commits]).strip()
+
+    def header(unit):
+        first_line = "{:27}{:30}".format('    ','      ') + cmtline(unit)
+        second_line = ("{:27}{:30}".format('Test','Metric') + commits).strip()
+
+        # Test   Metric   c1   c2   c3 ...
+        print("-" * (len(second_line)+1))
+        print(first_line)
+        print(second_line)
+        print("-" * (len(second_line)+1))
+
+    def commit_string(test, metric, flag):
+        def delta(v1, v2):
+            return round((100 * (v1 - v2)/v2),2)
+
+        # Average this (test, metric) pair's values per commit (None if that commit has none).
+        # Note: if the test environment is not set, this will combine metrics from all test environments.
+        averageValuesOrNones = []
+        for commit in args.commits:
+            values = [float(t.stat.value) for t in metrics if t.commit == commit and t.stat.test == test and t.stat.metric == metric]
+            if values == []:
+                averageValuesOrNones.append(None)
+            else:
+                averageValuesOrNones.append(sum(values) / len(values))
+
+        if flag == 'metrics':
+            strings = [str(v) if v != None else '-' for v in averageValuesOrNones]
+        if flag == 'percentages':
+            # If the baseline commit has no stats, then we can not produce any percentages.
+            baseline = averageValuesOrNones[0]
+            if baseline == None:
+                strings = ['-' for v in averageValuesOrNones]
+            else:
+                # delta divides by val, so a zero average cannot be expressed as a percentage.
+                strings = ['-' if val == None or val == 0 else str(delta(baseline, val)) + '%' for val in averageValuesOrNones]
+
+        return row_fmt.format(*strings).strip()
+
+    #
+    # The pretty-printed output
+    #
+
+    header('commit')
+    # Printing out metrics.
+    all_tests = sorted(set([(test.stat.test, test.stat.metric) for test in metrics]))
+    for test, metric in all_tests:
+        print("{:27}{:30}".format(test, metric) + commit_string(test, metric, 'metrics'))
+
+    # Has no meaningful output if there is no commit to compare to.
+    if not singleton_commit:
+        header('percent')
+
+        # Printing out percentages.
+        for test, metric in all_tests:
+            print("{:27}{:30}".format(test, metric) + commit_string(test, metric, 'percentages'))
+\ No newline at end of file
diff --git a/testsuite/driver/runtests.py b/testsuite/driver/runtests.py
index b956239d2a..fb3fe6ad54 100644
--- a/testsuite/driver/runtests.py
+++ b/testsuite/driver/runtests.py
@@ -23,8 +23,9 @@ import traceback
 # So we import it here first, so that the testsuite doesn't appear to fail.
 import subprocess
 
-from testutil import getStdout, Watcher
+from testutil import getStdout, Watcher, str_warn, str_info
 from testglobals import getConfig, ghc_env, getTestRun, TestOptions, brokens
+from perf_notes import MetricChange
 from junit import junit
 
 # Readline sometimes spews out ANSI escapes for some values of TERM,
@@ -43,11 +44,13 @@ def signal_handler(signal, frame):
 # cmd-line options
 
 parser = argparse.ArgumentParser(description="GHC's testsuite driver")
+perf_group = parser.add_mutually_exclusive_group()
 
 parser.add_argument("-e", action='append', help="A string to execute from the command line.")
 parser.add_argument("--config-file", action="append", help="config file")
 parser.add_argument("--config", action='append', help="config field")
 parser.add_argument("--rootdir", action='append', help="root of tree containing tests (default: .)")
+parser.add_argument("--metrics-file", help="file in which to save (append) the performance test metrics. If omitted, git notes will be used.")
 parser.add_argument("--summary-file", help="file in which to save the (human-readable) summary")
 parser.add_argument("--no-print-summary", action="store_true", help="should we print the summary?")
 parser.add_argument("--only", action="append", help="just this test (can be give multiple --only= flags)")
@@ -55,23 +58,32 @@ parser.add_argument("--way", action="append", help="just this way")
 parser.add_argument("--skipway", action="append", help="skip this way")
 parser.add_argument("--threads", type=int, help="threads to run simultaneously")
 parser.add_argument("--verbose", type=int, choices=[0,1,2,3,4,5], help="verbose (Values 0 through 5 accepted)")
-parser.add_argument("--skip-perf-tests", action="store_true", help="skip performance tests")
 parser.add_argument("--junit", type=argparse.FileType('wb'), help="output testsuite summary in JUnit format")
+parser.add_argument("--test-env", default='local', help="Override default chosen test-env.")
+perf_group.add_argument("--skip-perf-tests", action="store_true", help="skip performance tests")
+perf_group.add_argument("--only-perf-tests", action="store_true", help="Only do performance tests")
 
 args = parser.parse_args()
 
-for e in args.e:
-    exec(e)
+if args.e:
+    for e in args.e:
+        exec(e)
 
-for arg in args.config_file:
-    exec(open(arg).read())
+if args.config_file:
+    for arg in args.config_file:
+        exec(open(arg).read())
 
-for arg in args.config:
-    field, value = arg.split('=', 1)
-    setattr(config, field, value)
+if args.config:
+    for arg in args.config:
+        field, value = arg.split('=', 1)
+        setattr(config, field, value)
 
 all_ways = config.run_ways+config.compile_ways+config.other_ways
-config.rootdirs = args.rootdir
+
+if args.rootdir:
+    config.rootdirs = args.rootdir
+
+config.metrics_file = args.metrics_file
 config.summary_file = args.summary_file
 config.no_print_summary = args.no_print_summary
 
@@ -104,7 +116,12 @@ if args.threads:
 
 if args.verbose is not None:
     config.verbose = args.verbose
+
 config.skip_perf_tests = args.skip_perf_tests
+config.only_perf_tests = args.only_perf_tests
+
+if args.test_env:
+    config.test_env = args.test_env
 
 config.cygwin = False
 config.msys = False
@@ -223,6 +240,14 @@ if config.timeout == -1:
 
 print('Timeout is ' + str(config.timeout))
 
+# Try to get allowed performance changes from the git commit.
+try:
+    config.allowed_perf_changes = Perf.get_allowed_perf_changes()
+except subprocess.CalledProcessError:
+    print('Failed to get allowed metric changes from the HEAD git commit message.')
+
+print('Allowed performance change entries found: ' + str(len(config.allowed_perf_changes)))
+
 # -----------------------------------------------------------------------------
 # The main dude
 
@@ -326,7 +351,31 @@ else:
     # flush everything before we continue
     sys.stdout.flush()
 
-    summary(t, sys.stdout, config.no_print_summary)
+    # Warn of new metrics. Test names are de-duplicated and sorted so the output is stable.
+    new_metrics = [metric for (change, metric) in t.metrics if change == MetricChange.NewMetric]
+    spacing = "    "
+    if any(new_metrics):
+        print()
+        print(str_warn('New Metrics') + ' the previous git commit doesn\'t have metrics for the following tests:')
+        print(spacing + ('\n' + spacing).join(sorted(set([metric.test for metric in new_metrics]))))
+
+    # Inform of how to accept metric changes.
+    if (len(t.unexpected_stat_failures) > 0):
+        print()
+        print(str_info("Some stats have changed") + " If this is expected, allow changes by appending the git commit message with this:")
+        print('-' * 25)
+        print(Perf.allow_changes_string(t.metrics))
+        print('-' * 25)
+
+    summary(t, sys.stdout, config.no_print_summary, True)
+
+    stats = [stat for (_, stat) in t.metrics]
+    if config.metrics_file:
+        print('Appending ' + str(len(stats)) + ' stats to file: ' + config.metrics_file)
+        with open(config.metrics_file, 'a') as file:
+            file.write("\n" + Perf.format_perf_stat(stats))
+    else:
+        Perf.append_perf_stat(stats)
 
     if config.summary_file:
         with open(config.summary_file, 'w') as file:
diff --git a/testsuite/driver/testglobals.py b/testsuite/driver/testglobals.py
index 311e39be7f..03a62503b4 100644
--- a/testsuite/driver/testglobals.py
+++ b/testsuite/driver/testglobals.py
@@ -31,6 +31,9 @@ class TestConfig:
         self.accept_platform = False
         self.accept_os = False
 
+        # File in which to save the performance metrics.
+        self.metrics_file = ''
+
         # File in which to save the summary
         self.summary_file = ''
 
@@ -122,6 +125,15 @@ class TestConfig:
         # Should we skip performance tests
         self.skip_perf_tests = False
 
+        # Only do performance tests
+        self.only_perf_tests = False
+
+        # Allowed performance changes (see perf_notes.get_allowed_perf_changes())
+        self.allowed_perf_changes = {}
+
+        # The test environment.
+        self.test_env = 'local'
+
 global config
 config = TestConfig()
 
@@ -156,6 +168,12 @@ class TestRun:
        self.unexpected_failures = []
        self.unexpected_stat_failures = []
 
+       # List of all metrics measured in this test run.
+       # [(change, PerfStat)] where change is one of the  MetricChange
+       # constants: NewMetric, NoChange, Increase, Decrease.
+       # NewMetric happens when the previous git commit has no metric recorded.
+       self.metrics = []
+
 global t
 t = TestRun()
 
@@ -215,16 +233,14 @@ class TestOptions:
        # extra files to copy to the testdir
        self.extra_files = []
 
-       # which -t numeric fields do we want to look at, and what bounds must
-       # they fall within?
-       # Elements of these lists should be things like
-       # ('bytes allocated',
-       #   9300000000,
-       #   10)
-       # To allow a 10% deviation from 9300000000.
-       self.compiler_stats_range_fields = {}
+       # Map from metric to expected value and allowed percentage deviation. e.g.
+       #     { 'bytes allocated': (9300000000, 10) }
+       # To allow a 10% deviation from 9300000000 for the 'bytes allocated' metric.
        self.stats_range_fields = {}
 
+       # Does this test the compiler's performance as opposed to the generated code.
+       self.is_compiler_stats_test = False
+
        # should we run this test alone, i.e. not run it in parallel with
        # any other threads
        self.alone = False
@@ -292,4 +308,3 @@ default_testopts = TestOptions()
 # (bug, directory, name) of tests marked broken
 global brokens
 brokens = []
-
diff --git a/testsuite/driver/testlib.py b/testsuite/driver/testlib.py
index ff6a8c8e74..761ba67fd2 100644
--- a/testsuite/driver/testlib.py
+++ b/testsuite/driver/testlib.py
@@ -19,7 +19,9 @@ import collections
 import subprocess
 
 from testglobals import config, ghc_env, default_testopts, brokens, t
-from testutil import strip_quotes, lndir, link_or_copy_file
+from testutil import strip_quotes, lndir, link_or_copy_file, passed, failBecause, str_fail, str_pass
+import perf_notes as Perf
+from perf_notes import MetricChange
 extra_src_files = {'T4198': ['exitminus1.c']} # TODO: See #12223
 
 global pool_sema
@@ -56,9 +58,13 @@ def setLocalTestOpts(opts):
     global testopts_local
     testopts_local.x=opts
 
+def isCompilerStatsTest():
+    opts = getTestOpts()
+    return bool(opts.is_compiler_stats_test)
+
 def isStatsTest():
     opts = getTestOpts()
-    return bool(opts.compiler_stats_range_fields or opts.stats_range_fields)
+    return bool(opts.stats_range_fields)
 
 
 # This can be called at the top of a file of tests, to set default test options
@@ -254,14 +260,14 @@ def _exit_code( name, opts, v ):
 
 def signal_exit_code( val ):
     if opsys('solaris2'):
-        return exit_code( val );
+        return exit_code( val )
     else:
         # When application running on Linux receives fatal error
         # signal, then its exit code is encoded as 128 + signal
         # value. See http://www.tldp.org/LDP/abs/html/exitcodes.html
         # I assume that Mac OS X behaves in the same way at least Mac
         # OS X builder behavior suggests this.
-        return exit_code( val+128 );
+        return exit_code( val+128 )
 
 # -----
 
@@ -307,42 +313,85 @@ def _extra_files(name, opts, files):
 
 # -----
 
-def stats_num_field( field, expecteds ):
-    return lambda name, opts, f=field, e=expecteds: _stats_num_field(name, opts, f, e);
+# Defaults to "test everything, and only break on extreme cases"
+#
+# The inputs to this function are slightly interesting:
+# metric can be either:
+#     - 'all', in which case all 3 possible metrics are collected and compared.
+#     - The specific metric one wants to use in the test.
+#     - A list of the metrics one wants to use in the test.
+#
+# Deviation defaults to 20% because the goal is correctness over performance.
+# The testsuite should avoid breaking when there is not an actual error.
+# Instead, the testsuite should notify of regressions in a non-breaking manner.
+#
+# collect_compiler_stats is used when the metrics collected are about the compiler.
+# collect_stats is used in the majority case when the metrics to be collected
+# are about the performance of the runtime code generated by the compiler.
+def collect_compiler_stats(metric='all',deviation=20):
+    return lambda name, opts, m=metric, d=deviation: _collect_stats(name, opts, m,d, True)
+
+def collect_stats(metric='all', deviation=20):
+    return lambda name, opts, m=metric, d=deviation: _collect_stats(name, opts, m, d)
+
+def testing_metrics():
+    return ['bytes allocated', 'peak_megabytes_allocated', 'max_bytes_used']
+
+# This is an internal function that is used only in the implementation.
+# 'is_compiler_stats_test' is somewhat of an unfortunate name.
+# If the boolean is set to true, it indicates that this test is one that
+# measures the performance numbers of the compiler.
+# As this is a fairly rare case in the testsuite, it defaults to false to
+# indicate that it is a 'normal' performance test.
+def _collect_stats(name, opts, metric, deviation, is_compiler_stats_test=False):
+    """Register the metrics to collect for test `name` in opts.stats_range_fields.
+
+    metric is 'all', a single metric name, or a list of metric names; any other
+    type registers nothing. deviation is the tolerated percentage change.
+    is_compiler_stats_test marks the test as measuring the compiler itself; it
+    only ever switches opts.is_compiler_stats_test on, never off.
+
+    Each metric maps to (expected value, deviation), where the expected value is
+    the average of that metric's values recorded for this test at HEAD^, or to
+    None when HEAD^ has no value for it, which check_stats treats as a new metric.
+    """
+    # NOTE(review): failBecause only builds a result object, which is discarded
+    # here, so an invalid name is currently not reported. Confirm the intent.
+    if not re.match('^[0-9]*[a-zA-Z][a-zA-Z0-9._-]*$', name):
+        failBecause('This test has an invalid name.')
+
+    # Set the test kind before anything else so that a test with no prior
+    # metrics is still measured as a compiler test and can record a baseline.
+    if is_compiler_stats_test:
+        opts.is_compiler_stats_test = True
+
+    # Compiler performance numbers change when debugging is on, making the results
+    # useless and confusing. Therefore, skip if debugging is on.
+    if config.compiler_debugged and is_compiler_stats_test:
+        opts.skip = 1
+
+    # 'all' is a shorthand to test for bytes allocated, peak megabytes allocated, and max bytes used.
+    if metric == 'all':
+        fields = testing_metrics()
+    elif isinstance(metric, str):
+        fields = [metric]
+    elif isinstance(metric, list):
+        fields = metric
+    else:
+        fields = []
+
+    # Might have multiple metrics being measured for a single test.
+    test = [t for t in Perf.get_perf_stats('HEAD^') if t.test == name]
+
+    for field in fields:
+        # Average the prior values of this metric; there may be several.
+        values = [float(t.value) for t in test if t.metric == field]
+        if values:
+            opts.stats_range_fields[field] = (sum(values) / len(values), deviation)
+        else:
+            # No prior value (new test or newly collected metric): nothing to
+            # compare against, so register it as a new metric.
+            opts.stats_range_fields[field] = None
-
-def _stats_num_field( name, opts, field, expecteds ):
-    if field in opts.stats_range_fields:
-        framework_fail(name, 'duplicate-numfield', 'Duplicate ' + field + ' num_field check')
-
-    if type(expecteds) is list:
-        for (b, expected, dev) in expecteds:
-            if b:
-                opts.stats_range_fields[field] = (expected, dev)
-                return
-        framework_warn(name, 'numfield-no-expected', 'No expected value found for ' + field + ' in num_field check')
-
-    else:
-        (expected, dev) = expecteds
-        opts.stats_range_fields[field] = (expected, dev)
-
-def compiler_stats_num_field( field, expecteds ):
-    return lambda name, opts, f=field, e=expecteds: _compiler_stats_num_field(name, opts, f, e);
-
-def _compiler_stats_num_field( name, opts, field, expecteds ):
-    if field in opts.compiler_stats_range_fields:
-        framework_fail(name, 'duplicate-numfield', 'Duplicate ' + field + ' num_field check')
-
-    # Compiler performance numbers change when debugging is on, making the results
-    # useless and confusing. Therefore, skip if debugging is on.
-    if compiler_debugged():
-        skip(name, opts)
-
-    for (b, expected, dev) in expecteds:
-        if b:
-            opts.compiler_stats_range_fields[field] = (expected, dev)
-            return
-
-    framework_warn(name, 'numfield-no-expected', 'No expected value found for ' + field + ' in num_field check')
 
 # -----
 
@@ -720,6 +769,7 @@ def test_common_work(watcher, name, opts, func, args):
             and (getTestOpts().only_ways == None or way in getTestOpts().only_ways) \
             and (config.cmdline_ways == [] or way in config.cmdline_ways) \
             and (not (config.skip_perf_tests and isStatsTest())) \
+            and (not (config.only_perf_tests and not isStatsTest())) \
             and way not in getTestOpts().omit_ways
 
         # Which ways we are asked to skip
@@ -927,12 +977,6 @@ def badResult(result):
     except (KeyError, TypeError):
         return True
 
-def passed():
-    return {'passFail': 'pass'}
-
-def failBecause(reason, tag=None):
-    return {'passFail': 'fail', 'reason': reason, 'tag': tag}
-
 # -----------------------------------------------------------------------------
 # Generic command tests
 
@@ -1087,56 +1131,65 @@ def multi_compile_and_run( name, way, top_mod, extra_mods, extra_hc_opts ):
 
 def stats( name, way, stats_file ):
     opts = getTestOpts()
-    return checkStats(name, way, stats_file, opts.stats_range_fields)
+    return check_stats(name, way, stats_file, opts.stats_range_fields)
 
-# -----------------------------------------------------------------------------
-# Check -t stats info
-
-def checkStats(name, way, stats_file, range_fields):
-    full_name = name + '(' + way + ')'
+def metric_dict(name, way, metric, value):
+    return Perf.PerfStat(
+        test_env = config.test_env,
+        test     = name,
+        way      = way,
+        metric   = metric,
+        value    = value)
 
+# -----------------------------------------------------------------------------
+# Check test stats. This prints the results for the user.
+# name: name of the test.
+# way: the way.
+# stats_file: the path of the stats_file containing the stats for the test.
+# range_fields: map from metric to (expected value, tolerance %), or None for a new metric.
+# Returns a pass/fail object. Passes if the stats are within the expected value ranges.
+# Every metric found in the stats file is also appended to t.metrics.
+def check_stats(name, way, stats_file, range_fields):
     result = passed()
     if range_fields:
         try:
             f = open(in_testdir(stats_file))
         except IOError as e:
             return failBecause(str(e))
-        contents = f.read()
+        stats_file_contents = f.read()
         f.close()
 
-        for (field, (expected, dev)) in range_fields.items():
-            m = re.search('\("' + field + '", "([0-9]+)"\)', contents)
-            if m == None:
-                print('Failed to find field: ', field)
-                result = failBecause('no such stats field')
-            val = int(m.group(1))
-
-            lowerBound = trunc(           expected * ((100 - float(dev))/100))
-            upperBound = trunc(0.5 + ceil(expected * ((100 + float(dev))/100)))
-
-            deviation = round(((float(val) * 100)/ expected) - 100, 1)
-
-            if val < lowerBound:
-                print(field, 'value is too low:')
-                print('(If this is because you have improved GHC, please')
-                print('update the test so that GHC doesn\'t regress again)')
-                result = failBecause('stat too good', tag='stat')
-            if val > upperBound:
-                print(field, 'value is too high:')
-                result = failBecause('stat not good enough', tag='stat')
-
-            if val < lowerBound or val > upperBound or config.verbose >= 4:
-                length = max(len(str(x)) for x in [expected, lowerBound, upperBound, val])
-
-                def display(descr, val, extra):
-                    print(descr, str(val).rjust(length), extra)
-
-                display('    Expected    ' + full_name + ' ' + field + ':', expected, '+/-' + str(dev) + '%')
-                display('    Lower bound ' + full_name + ' ' + field + ':', lowerBound, '')
-                display('    Upper bound ' + full_name + ' ' + field + ':', upperBound, '')
-                display('    Actual      ' + full_name + ' ' + field + ':', val, '')
-                if val != expected:
-                    display('    Deviation   ' + full_name + ' ' + field + ':', deviation, '%')
+        for (metric, range_val_dev) in range_fields.items():
+            field_match = re.search('\("' + metric + '", "([0-9]+)"\)', stats_file_contents)
+            if field_match == None:
+                print('Failed to find metric: ', metric)
+                metric_result = failBecause('no such stats metric')
+            else:
+                actual_val = int(field_match.group(1))
+                
+                # Store the metric so it can later be stored in a git note.
+                perf_stat = metric_dict(name, way, metric, actual_val)
+                change = None
+
+                # If this is the first time running the benchmark, then pass.
+                if range_val_dev == None:
+                    metric_result = passed()
+                    change = MetricChange.NewMetric
+                else:
+                    (expected_val, tolerance_dev) = range_val_dev
+                    (change, metric_result) = Perf.check_stats_change(
+                        perf_stat,
+                        expected_val,
+                        tolerance_dev,
+                        config.allowed_perf_changes,
+                        config.verbose >= 4)
+                t.metrics.append((change, perf_stat))
+
+            # If any metric fails then the test fails.
+            # Note, the remaining metrics are still run so that
+            # a complete list of changes can be presented to the user.
+            if metric_result['passFail'] == 'fail':
+                result = metric_result
 
     return result
 
@@ -1186,7 +1239,7 @@ def simple_build(name, way, extra_hc_opts, should_fail, top_mod, link, addsuf, b
         to_do = '-c' # just compile
 
     stats_file = name + '.comp.stats'
-    if opts.compiler_stats_range_fields:
+    if isCompilerStatsTest():
         extra_hc_opts += ' +RTS -V0 -t' + stats_file + ' --machine-readable -RTS'
     if backpack:
         extra_hc_opts += ' -outputdir ' + name + '.out'
@@ -1219,10 +1272,10 @@ def simple_build(name, way, extra_hc_opts, should_fail, top_mod, link, addsuf, b
 
     # ToDo: if the sub-shell was killed by ^C, then exit
 
-    statsResult = checkStats(name, way, stats_file, opts.compiler_stats_range_fields)
-
-    if badResult(statsResult):
-        return statsResult
+    if isCompilerStatsTest():
+        statsResult = check_stats(name, way, stats_file, opts.stats_range_fields)
+        if badResult(statsResult):
+            return statsResult
 
     if should_fail:
         if exit_code == 0:
@@ -1260,7 +1313,7 @@ def simple_run(name, way, prog, extra_run_opts):
     my_rts_flags = rts_flags(way)
 
     stats_file = name + '.stats'
-    if opts.stats_range_fields:
+    if isStatsTest() and not isCompilerStatsTest():
         stats_args = ' +RTS -V0 -t' + stats_file + ' --machine-readable -RTS'
     else:
         stats_args = ''
@@ -1298,7 +1351,7 @@ def simple_run(name, way, prog, extra_run_opts):
     if check_prof and not check_prof_ok(name, way):
         return failBecause('bad profile')
 
-    return checkStats(name, way, stats_file, opts.stats_range_fields)
+    return check_stats(name, way, stats_file, opts.stats_range_fields)
 
 def rts_flags(way):
     args = config.way_rts_flags.get(way, [])
@@ -1993,7 +2046,7 @@ def findTFiles(roots):
 # -----------------------------------------------------------------------------
 # Output a test summary to the specified file object
 
-def summary(t, file, short=False):
+def summary(t, file, short=False, color=False):
 
     file.write('\n')
     printUnexpectedTests(file,
@@ -2004,7 +2057,16 @@ def summary(t, file, short=False):
         # Only print the list of unexpected tests above.
         return
 
-    file.write('SUMMARY for test run started at '
+    colorize = lambda s: s
+    if color:
+        if len(t.unexpected_failures) > 0 or \
+            len(t.unexpected_stat_failures) > 0 or \
+            len(t.framework_failures) > 0:
+            colorize = str_fail
+        else:
+            colorize = str_pass
+
+    file.write(colorize('SUMMARY') + ' for test run started at '
                + time.strftime("%c %Z", t.start_time) + '\n'
                + str(datetime.timedelta(seconds=
                     round(time.time() - time.mktime(t.start_time)))).rjust(8)
diff --git a/testsuite/driver/testutil.py b/testsuite/driver/testutil.py
index 15587e6960..6e0c2684d7 100644
--- a/testsuite/driver/testutil.py
+++ b/testsuite/driver/testutil.py
@@ -5,10 +5,28 @@ import shutil
 
 import threading
 
+def passed():
+    return {'passFail': 'pass'}
+
+def failBecause(reason, tag=None):
+    return {'passFail': 'fail', 'reason': reason, 'tag': tag}
+
 def strip_quotes(s):
     # Don't wrap commands to subprocess.call/Popen in quotes.
     return s.strip('\'"')
 
+def str_fail(s):
+    return '\033[1m\033[43m\033[31m' + s + '\033[0m'
+
+def str_pass(s):
+    return '\033[1m\033[32m' + s + '\033[0m'
+
+def str_warn(s):
+    return '\033[1m\033[33m' + s + '\033[0m'
+
+def str_info(s):
+    return '\033[1m\033[34m' + s + '\033[0m'
+
 def getStdout(cmd_and_args):
     # Can't use subprocess.check_output, since we also verify that
     # no stderr was produced
diff --git a/testsuite/mk/test.mk b/testsuite/mk/test.mk
index f036110e07..65e897d849 100644
--- a/testsuite/mk/test.mk
+++ b/testsuite/mk/test.mk
@@ -215,6 +215,14 @@ ifeq "$(SKIP_PERF_TESTS)" "YES"
 RUNTEST_OPTS += --skip-perf-tests
 endif
 
+ifeq "$(ONLY_PERF_TESTS)" "YES"
+RUNTEST_OPTS += --only-perf-tests
+endif
+
+ifneq "$(TEST_ENV)" ""
+RUNTEST_OPTS += --test-env="$(TEST_ENV)"
+endif
+
 ifeq "$(CLEANUP)" "0"
 RUNTEST_OPTS += -e config.cleanup=False
 else ifeq "$(CLEANUP)" "NO"
@@ -266,6 +274,10 @@ RUNTEST_OPTS +=  \
 
 RUNTEST_OPTS += -e "config.stage=$(GhcStage)"
 
+ifneq "$(METRICS_FILE)" ""
+RUNTEST_OPTS +=  \
+	--metrics-file "$(METRICS_FILE)"
+endif
 ifneq "$(JUNIT_FILE)" ""
 RUNTEST_OPTS +=  \
   --junit "$(JUNIT_FILE)"
diff --git a/testsuite/tests/callarity/perf/all.T b/testsuite/tests/callarity/perf/all.T
index 83083d4b4d..37e40e6f9c 100644
--- a/testsuite/tests/callarity/perf/all.T
+++ b/testsuite/tests/callarity/perf/all.T
@@ -1,13 +1,7 @@
 test('T3924',
-     [stats_num_field('bytes allocated', 
-          [ (wordsize(64), 50760, 8),
-              # previously, without call-arity: 22326544
-              # 2014-01-18: 51480  (amd64/Linux)
-              # 2014-07-17: 50760  (amd64/Linux) (Roundabout adjustment)
-              # 2015-04-03: Widen 5->8% (amd64/Windows was doing better)
-            (wordsize(32), 44988, 5) ]),
-              # 2014-04-04: 44988  (Windows, 64-bit machine)
-      only_ways(['normal'])
+     [collect_stats('bytes allocated',8)
+     , only_ways(['normal'])
       ],
      compile_and_run,
      ['-O'])
+
diff --git a/testsuite/tests/deriving/perf/all.T b/testsuite/tests/deriving/perf/all.T
index 240571b4a2..1402a38b5d 100644
--- a/testsuite/tests/deriving/perf/all.T
+++ b/testsuite/tests/deriving/perf/all.T
@@ -1,13 +1,6 @@
 test('T10858',
-     [compiler_stats_num_field('bytes allocated',
-          [(wordsize(64), 221895064, 8) ]),
-          # Initial:    222312440
-          # 2016-12-19  247768192  Join points (#19288)
-          # 2017-02-12  304094944  Type-indexed Typeable
-          # 2017-02-25  275357824  Early inline patch
-          # 2017-03-28  241242968  Run Core Lint less
-          # 2017-06-07  221895064  Apparently been reducing for some time
-          #                        Today it crossed the boundary; good
+     [ collect_compiler_stats('bytes allocated',8),
       only_ways(['normal'])],
      compile,
      ['-O'])
+
diff --git a/testsuite/tests/perf/compiler/all.T b/testsuite/tests/perf/compiler/all.T
index d1d5a1ce70..b2ca109000 100644
--- a/testsuite/tests/perf/compiler/all.T
+++ b/testsuite/tests/perf/compiler/all.T
@@ -1,4 +1,4 @@
-# Tests that call 'compiler_stats_num_field' are skipped when debugging is on.
+# Tests that call 'collect_compiler_stats' are skipped when debugging is on.
 # See testsuite/driver/testlib.py.
 
 def no_lint(name, opts):
@@ -29,112 +29,8 @@ setTestOpts(no_lint)
 
 test('T1969',
      [# expect_broken(12437),
-      compiler_stats_num_field('peak_megabytes_allocated', # Note [residency]
-          [(wordsize(32), 30, 15),
-             # 2010-05-17 14 (x86/Windows)
-             #            15 (x86/OS X)
-             #            19 (x86/OS X)
-             # 2013-02-10 13 (x86/Windows)
-             # 2013-02-10 14 (x86/OSX)
-             # 2013-11-13 17 (x86/Windows, 64bit machine)
-             # 2015-07-11 21 (x86/Linux, 64bit machine) use +RTS -G1
-             # 2016-04-06 30 (x86/Linux, 64bit machine)
-           (wordsize(64), 73, 20)]),
-             #            28 (amd64/Linux)
-             #            34 (amd64/Linux)
-             # 2012-09-20 23 (amd64/Linux)
-             # 2012-10-03 25 (amd64/Linux if .hi exists)
-             # 2013-02-13 23, but unstable so increased to 10% range
-             # 2013-02-13 27, very unstable!
-             # 2014-09-10 29 (amd64/Linux) post-AMP-cleanup
-             # 2013-09-11 30, 10 (amd64/Linux)
-             # 2013-09-11 30, 15 (adapt to Phab CI)
-             # 2015-06-03 41, (amd64/Linux) use +RTS -G1
-             # 2015-10-28 55, (amd64/Linux) emit Typeable at definition site
-             # 2016-10-20 68, (amd64/Linux) allow top-level string literals
-             #                See the comment 16 on #8472.
-             # 2017-02-17 83  (amd64/Linux) Type-indexed Typeable
-             # 2017-03-31 61  (amd64/Linux) Fix memory leak in simplifier
-             # 2018-01-25 78  (amd64/Linux) Use CoreExpr for EvTerm
-             # 2018-07-10 73  (amd64/Linux) Fix space leaks
-      compiler_stats_num_field('max_bytes_used',
-          [(platform('i386-unknown-mingw32'), 5719436, 20),
-                                 # 2010-05-17 5717704 (x86/Windows)
-                                 # 2013-02-10 5159748 (x86/Windows)
-                                 # 2013-02-10 5030080 (x86/Windows)
-                                 # 2013-11-13 7295012 (x86/Windows, 64bit machine)
-                                 # 2014-04-24 5719436 (x86/Windows, 64bit machine)
-           (wordsize(32), 9418680, 1),
-             #            6707308 (x86/OS X)
-             # 2009-12-31 6149572 (x86/Linux)
-             # 2014-01-22 6429864 (x86/Linux)
-             # 2014-06-29 5949188 (x86/Linux)
-             # 2015-07-11 6241108 (x86/Linux, 64-bit machine) use +RTS -G1
-             # 2016-04-06 9093608 (x86/Linux, 64-bit machine)
-             # 2017-03-24 9261052 (x86/Linux, 64-bit machine)
-             # 2017-04-06 9418680 (x86/Linux, 64-bit machine)
-
-           (wordsize(64), 19738608, 15)]),
-             # 2014-09-10 10463640, 10  # post-AMP-update (somewhat stabelish)
-               # looks like the peak is around ~10M, but we're
-               # unlikely to GC exactly on the peak.
-               # varies quite a lot with CLEANUP and BINDIST,
-               # hence 10% range.
-               # See Note [residency] to get an accurate view.
-             # 2014-09-14  9684256, 10 # try to lower it a bit more to match Phab's CI
-             # 2014-11-03 10584344,    # ghcspeed reports higher numbers consistently
-             # 2015-07-11 11670120 (amd64/Linux)
-             # 2015-10-28 15017528 (amd64/Linux) emit typeable at definition site
-             # 2016-10-12 17285216 (amd64/Linux) it's not entirely clear why
-             # 2017-02-01 19924328 (amd64/Linux) Join points (#12988)
-             # 2017-02-14 16393848 Early inline patch
-             # 2017-03-31 16679176 Fix memory leak in simplifier
-             # 2017-08-25 19199872 Refactor the Mighty Simplifier
-             # 2018-02-19 22311600 (amd64/Linux) Unknown
-             # 2018-07-10 19738608 (amd64/Linux) Fix space leaks
-
-      compiler_stats_num_field('bytes allocated',
-          [(platform('i386-unknown-mingw32'), 301784492, 5),
-                                 #            215582916 (x86/Windows)
-                                 # 2012-10-29 298921816 (x86/Windows)
-                                 # 2013-02-10 310633884 (x86/Windows)
-                                 # 2013-11-13 317975916 (x86/Windows, 64bit machine)
-                                 # 2014-04-04 301784492 (x86/Windows, 64bit machine)
-           (wordsize(32), 324586096, 1),
-             #            221667908 (x86/OS X)
-             #            274932264 (x86/Linux)
-             # 2012-10-08 303930948 (x86/Linux, new codegen)
-             # 2013-02-10 322937684 (x86/OSX)
-             # 2014-01-22 316103268 (x86/Linux)
-             # 2014-06-29 303300692 (x86/Linux)
-             # 2015-07-11 288699104 (x86/Linux, 64-bit machine) use +RTS -G1
-             # 2016-04-06 344730660 (x86/Linux, 64-bit machine)
-             # 2017-03-24 324586096 (x86/Linux, 64-bit machine)
-           (wordsize(64), 670839456, 5)]),
-             # 2009-11-17 434845560 (amd64/Linux)
-             # 2009-12-08 459776680 (amd64/Linux)
-             # 2010-05-17 519377728 (amd64/Linux)
-             # 2011-08-05 561382568 (amd64/OS X)
-             # 2012-07-16 589168872 (amd64/Linux)
-             # 2012-07-20 595936240 (amd64/Linux)
-             # 2012-08-23 606230880 (amd64/Linux)
-             # 2012-08-29 633334184 (amd64/Linux) new codegen
-             # 2012-09-18 641959976 (amd64/Linux)
-             # 2012-10-19 661832592 (amd64/Linux) -fPIC turned on
-             # 2012-10-23 642594312 (amd64/Linux) -fPIC turned off again
-             # 2012-11-12 658786936 (amd64/Linux) UNKNOWN REASON
-             # 2013-91-17 667160192 (x86_64/Linux) new demand analyser
-             # 2013-10-18 698612512 (x86_64/Linux) fix for #8456
-             # 2014-02-10 660922376 (x86_64/Linux) call arity analysis
-             # 2014-07-17 651626680 (x86_64/Linux) roundabout update
-             # 2014-09-10 630299456 (x86_64/Linux) post-AMP-cleanup
-             # 2015-06-03 581460896 (x86_64/Linux) use +RTS -G1
-             # 2015-10-28 695430728 (x86_64/Linux) emit Typeable at definition site
-             # 2015-10-28 756138176 (x86_64/Linux) inst-decl defaults go via typechecker (#12220)
-             # 2017-02-17 831733376 (x86_64/Linux) Type-indexed Typeable
-             # 2017-02-25 695354904 (x86_64/Linux) Early inlining patch
-             # 2017-04-21 659863176 (x86_64/Linux) Unknown
-             # 2018-07-10 670839456 (x86_64/Linux) Unknown (just updating)
+      collect_compiler_stats(['peak_megabytes_allocated','max_bytes_used'],15),
+      collect_compiler_stats('bytes allocated',5),
       only_ways(['normal']),
 
       extra_hc_opts('-dcore-lint -static'),
@@ -161,63 +57,8 @@ else:
 
 test('T3294',
      [
-      compiler_stats_num_field('max_bytes_used', # Note [residency]
-          [(wordsize(32), 28686588, 15),
-             #            17725476 (x86/OS X)
-             #            14593500 (Windows)
-             # 2013-02-10 20651576 (x86/Windows)
-             # 2013-02-10 20772984 (x86/OSX)
-             # 2013-11-13 24009436 (x86/Windows, 64bit machine)
-             # 2014-04-24 19882188 (x86/Windows, 64bit machine)
-             # 2014-12-22 26525384 (x86/Windows) Increase due to silent superclasses?
-             # 2015-07-11 43196344 (x86/Linux, 64-bit machine) use +RTS -G1
-             # 2016-04-06 28686588 (x86/Linux, 64-bit machine)
-
-           (wordsize(64), 34050960, 20)]),
-             # prev:           25753192 (amd64/Linux)
-             # 29/08/2012:     37724352 (amd64/Linux)
-             #  (increase due to new codegen, see #7198)
-             # 13/13/2012:     44894544 (amd64/Linux)
-             #  (reason for increase unknown)
-             # 15/5/2013:      36904752  (amd64/Linux)
-             #  (reason for decrease unknown)
-             # 29/5/2013:      43224080  (amd64/Linux)
-             #  (reason for increase back to earlier value unknown)
-             # 2014-07-14:     36670800  (amd64/Linux)
-             #  (reason unknown, setting expected value somewhere in between)
-             # 2015-01-22:     45000000  (amd64/Linux)
-             #  varies between 40959592 and 52914488... increasing to +-20%
-             # 2015-10-28:     50367248  (amd64/Linux)
-             #  D757: emit Typeable instances at site of type definition
-             # 2016-07-11:     54609256  (Windows) before fix for #12227
-             # 2016-07-11:     52992688  (Windows) after fix for #12227
-             # 2017-02-17:     63131248  (amd64/Linux) Type indexed Typeable
-             # 2017-05-14:     34050960  (amd64/Linux) Two-pass CmmLayoutStack
-
-      compiler_stats_num_field('bytes allocated',
-          [(wordsize(32), 1377050640, 5),
-           # previous:     815479800  (x86/Linux)
-           # (^ increase due to new codegen, see #7198)
-           # 2012-10-08:  1373514844 (x86/Linux)
-           # 2013-11-13: 1478325844  (x86/Windows, 64bit machine)
-           # 2014-01-12: 1565185140  (x86/Linux)
-           # 2013-04-04: 1377050640  (x86/Windows, 64bit machine)
-           (wordsize(64), 1858491504, 5)]),
-            # old:        1357587088 (amd64/Linux)
-            # 29/08/2012: 2961778696 (amd64/Linux)
-            # (^ increase due to new codegen, see #7198)
-            # 18/09/2012: 2717327208 (amd64/Linux)
-            # 08/06/2013: 2901451552 (amd64/Linux) (reason unknown)
-            # 12/12/2013: 3083825616 (amd64/Linux) (reason unknown)
-            # 18/02/2014: 2897630040 (amd64/Linux) (call arity improvements)
-            # 12/03/2014: 2705289664 (amd64/Linux) (more call arity improvements)
-            # 2014-17-07: 2671595512 (amd64/Linux) (round-about update)
-            # 2014-09-10: 2709595808 (amd64/Linux) post-AMP cleanup
-            # 2016-07-11: 2664479936 (Windows) before fix for #12227
-            # 2016-07-11: 2739731144 (Windows) after fix for #12227 (ignoring)
-            # 2017-02-17: 2758641264 (amd64/Linux) (Type indexed Typeable)
-            # 2017-05-14: 2253557280 (amd64/Linux) Two-pass CmmLayoutStack
-            # 2017-10-24: 1858491504 (amd64/Linux) Improved linear regAlloc
+      collect_compiler_stats('max_bytes_used',15),
+      collect_compiler_stats('bytes allocated',5),
       conf_3294,
 
       # Use `+RTS -G1` for more stable residency measurements. Note [residency].
@@ -227,173 +68,27 @@ test('T3294',
      [''])
 
 test('T4801',
-     [ # expect_broken(5224),
-       # temporarily unbroken (#5227)
-###################################
-# deactivated for now, as this metric became too volatile recently
-#      compiler_stats_num_field('peak_megabytes_allocated',# Note [residency]
-#          [(platform('x86_64-apple-darwin'), 70, 1),
-#                           # expected value: 58 (amd64/OS X)
-#                           # 13/01/2014 - 70
-#           (wordsize(32), 30, 20),
-#           (wordsize(64), 48, 20)]),
-#            # prev:       50 (amd64/Linux)
-#            # 19/10/2012: 64 (amd64/Linux)
-#            #                (^ REASON UNKNOWN!)
-#            # 12/11/2012: 49 (amd64/Linux)
-#            #                (^ REASON UNKNOWN!)
-#            # 28/8/13:    60 (amd64/Linux)
-#            #                (^ REASON UNKNOWN!)
-#            # 2014-09-10: 55 post-AMP-cleanup
-#            # 2014-10-08: 62 (jumps between 55 and 71 observed -- GC tipping point?)
-#            # 2014-10-13: 48 stricter seqDmdType
-
-      compiler_stats_num_field('bytes allocated',
-          [(platform('x86_64-apple-darwin'), 417302064, 10),
-           # prev:       510938976 (amd64/OS X):
-           # 2015-12-11: 465653312 (amd64/OS X) Update, bump tolerance to +/-10%
-           # 2017-03-24: 417302064 (amd64/OS X) Correlated with Linux improvement
-
-           (wordsize(32), 199856388, 10),
-           # prev:        185669232 (x86/OSX)
-           # 2014-01-22:  211198056 (x86/Linux)
-           # 2014-09-03:  185242032 (Windows laptop)
-           # 2014-12-01:  203962148 (Windows laptop)
-           # 2016-04-06:  239556572 (x86/Linux)
-           # 2017-03-24:  199856388 (x86/Linux)
-           (wordsize(64), 388898280, 10)]),
-            # prev:       360243576 (amd64/Linux)
-            # 19/10/2012: 447190832 (amd64/Linux) (-fPIC turned on)
-            # 19/10/2012: 392409984 (amd64/Linux) (-fPIC turned off)
-            # 2014-04-08: 362939272 (amd64/Linux) cumulation of various smaller improvements over recent commits
-            # 2014-10-08: 382056344 (amd64/Linux) stricter foldr2 488e95b
-            # 2015-10-28: 434278248 (amd64/Linux) emit Typeable at definition site
-            # 2016-10-19: 388898280 (amd64/Linux) Refactor traceRn interface (#12617)
-
-###################################
-# deactivated for now, as this metric became too volatile recently
-#
-#     compiler_stats_num_field('max_bytes_used',
-#         [(platform('x86_64-apple-darwin'), 25145320, 5),
-#          (wordsize(32), 11829000, 15),
-#            #              9651948 (x86/OSX)
-#            #              10290952 (windows)
-#            # 2013-02-10   11071060 (x86/Windows)
-#            # 2013-02-10:  11207828 (x86/OSX)
-#            # (some date): 11139444
-#            # 2013-11-13:  11829000 (x86/Windows, 64bit machine)
-#          (wordsize(64), 19296544, 15)]),
-#               # prev:       20486256 (amd64/OS X)
-#               # 30/08/2012: 17305600--20391920 (varies a lot)
-#               # 19/10/2012: 26882576 (-fPIC turned on)
-#               # 19/10/2012: 18619912 (-fPIC turned off)
-#               # 24/12/2012: 21657520 (perhaps gc sampling time wibbles?)
-#               # 10/01/2014: 25166280
-#               # 13/01/2014: 22646000 (mostly due to #8647)
-#               # 18/02/2014: 25002136 (call arity analysis changes)
-#               # 12/05/2014: 25002136 (specialisation and inlining changes)
-#               # 10/09/2014: 19296544, 10 (post-AMP-cleanup)
-#               # 14/09/2014: 19585456, 15 (adapt to Phab CI env)
-       only_ways(['normal']),
-       extra_hc_opts('-static'),
+     [# collect_compiler_stats('peak_megabytes_allocated',1),
+      # expect_broken(5224),
+      # temporarily unbroken (#5227)
+      # the commented-out metrics are deactivated for now, as they became too volatile recently
+      collect_compiler_stats('bytes allocated',10),
+      # collect_compiler_stats('max_bytes_used',5),
+      only_ways(['normal']),
+      extra_hc_opts('-static'),
 
-       # Use `+RTS -G1` for more stable residency measurements. Note [residency].
-       extra_hc_opts('+RTS -G1 -RTS')
-       ],
+      # Use `+RTS -G1` for more stable residency measurements. Note [residency].
+      extra_hc_opts('+RTS -G1 -RTS')
+      ],
      compile,
      [''])
 
 test('T3064',
-     [compiler_stats_num_field('peak_megabytes_allocated',# Note [residency]
-          [(wordsize(32), 36, 20),
-            # expected value: 14 (x86/Linux 28-06-2012):
-            # 2013-11-13:     18 (x86/Windows, 64bit machine)
-            # 2014-01-22:     23 (x86/Linux)
-            # 2014-12-22:     23 (x86/Linux) death to silent superclasses
-            # 2015-07-11:     28 (x86/Linux, 64-bit machine) use +RTS -G1
-            # 2017-04-06:     36 (x86/Linux, 64-bit machine) it's unclear
+     [collect_compiler_stats('peak_megabytes_allocated',20),
+      collect_compiler_stats('bytes allocated',10),
 
-           (wordsize(64), 66, 20)]),
-            # (amd64/Linux):            18
-            # (amd64/Linux) 2012-02-07: 26
-            # (amd64/Linux) 2013-02-12: 23; increased range to 10%
-            # (amd64/Linux) 2013-04-03: 26
-            # (amd64/Linux) 2013-09-11: 30; result of AMP patch
-            # Increased range to 20%.  peak-usage varies from 22 to 26,
-            #  depending on whether the old .hi file exists
-            # (amd64/Linux) 2013-09-11: 37; better arity analysis (weird)
-            # (amd64/Linux) (09/09/2014): 42, AMP changes (larger interfaces, more loading)
-            # (amd64/Linux) 2014-10-13: 38: Stricter seqDmdType
-            # (amd64/Linux) 2014-12-22: 27: death to silent superclasses
-            # (amd64/Linux) 2015-01-22: 32: Varies from 30 to 34, at least here.
-            # (amd64/Linux) 2015-06-03: 54: use +RTS -G1
-            # (amd64/Linux) 2016-10-25: 66: Presumably creep
-
-      compiler_stats_num_field('bytes allocated',
-          [(wordsize(32), 134044092, 10),
-            # 2011-06-28: 56380288  (x86/Linux)
-            # 2012-10-30: 111189536 (x86/Windows)
-            # 2013-11-13: 146626504 (x86/Windows, 64bit machine)
-            # 2014-01-22: 162457940 (x86/Linux)
-            # 2014-12-01: 162457940 (Windows)
-            # 2014-12-22: 122836340 (Windows) Death to silent superclasses
-            # 2016-04-06: 153261024 (x86/Linux) probably wildcard refactor
-            # 2017-03-24: 134044092 (x86/Linux, 64-bit machine) Update
-
-           (wordsize(64), 272759920, 5)]),
-            # (amd64/Linux) (2011-06-28):  73259544
-            # (amd64/Linux) (2013-02-07): 224798696
-            # (amd64/Linux) (2013-08-02): 236404384, increase from roles
-            # (amd64/Linux) (2013-09-11): 290165632, increase from AMP warnings
-            # (amd64/Linux) (2013-11-22): 308300448, GND via Coercible and counters for constraints solving
-            # (amd64/Linux) (2013-12-02): 329795912, Coercible refactor
-            # (amd64/Linux) (2014-02-11): 308422280, optimize Coercions in simpleOptExpr
-            # (amd64/Linux) (2014-05-23): 324022680, unknown cause
-            # (amd64/Linux) (2014-07-17): 332702112, general round of updates
-            # (amd64/Linux) (2014-08-29): 313638592, w/w for INLINABLE things
-            # (amd64/Linux) (2014-09-09): 407416464, AMP changes (larger interfaces, more loading)
-            # (amd64/Linux) (2014-09-14): 385145080, BPP changes (more NoImplicitPrelude in base)
-            # (amd64/Linux) (2014-12-10): 363103840, improvements in constraint solver
-            # (Mac)         (2014-12-18): 350418600, improvements to flattener
-            # (amd64/Linux) (2014-12-22): 243670824, Ha! Death to superclass constraints, makes
-            #                                        much less code for Monad instances
-            # (amd64/Linux) (2015-12-01): 264952256, Regression due to Simon's wildcard refactor
-            #                                        Tracked as #11151.
-            # (amd64/Linux) (2015-12-11): 304344936, Regression due to TypeInType
-            #                                        Tracked as #11196
-            # (amd64/Linux) (2016-04-15): 287460128  Improvement due to using coercionKind instead
-            #                                        of zonkTcType (Trac #11882)
-            # (amd64/Darwin) (2017-01-23): 306222424 Presumably creep from recent changes (Typeable?)
-            # (amd64/Linux) (2017-02-14): 259815560  Early inline patch: 9% improvement
-            # (amd64/Linux) (2017-03-31): 265950920  Fix memory leak in simplifier
-            # (amd64/Linux) (2017-05-01): 281509496  Avoid excessive space usage from unfoldings in CoreTidy
-            # (amd64/Linux) (2017-05-01): 258505536  I think this is improvement in coercionKind e4ab65bd
-            # (amd64/Linux) (2018-08-04): 272759920  It's unclear
-
-###################################
-# deactivated for now, as this metric became too volatile recently
-#
-#      compiler_stats_num_field('max_bytes_used',
-#          [(wordsize(32), 11202304, 20),
-#            # 2011-06-28:  2247016 (x86/Linux) (28/6/2011):
-#            #(some date):  5511604
-#            # 2013-11-13:  7218200 (x86/Windows, 64bit machine)
-#            # 2014-04-04: 11202304 (x86/Windows, 64bit machine)
-#           (wordsize(64), 13251728, 20)]),
-#            # (amd64/Linux, intree) (28/06/2011):  4032024
-#            # (amd64/Linux, intree) (07/02/2013):  9819288
-#            # (amd64/Linux)         (14/02/2013):  8687360
-#            # (amd64/Linux)         (18/02/2013):  9397488
-#            # (amd64/Linux)         (02/08/2013): 10742536, increase from roles
-#            # (amd64/Linux)         (19/08/2013): 9211816,  decrease apparently from better eta reduction
-#            # (amd64/Linux)         (11/09/2013): 12000480, increase from AMP warnings
-#            #                                     933cdf15a2d85229d3df04b437da31fdfbf4961f
-#            # (amd64/Linux)         (22/11/2013): 16266992, GND via Coercible and counters for constraints solving
-#            # (amd64/Linux)         (12/12/2013): 19821544, better One shot analysis
-#            # (amd64/Linux)         (09/09/2014): 24357392, AMP changes (larger interfaces, more loading)
-#            # (amd64/Linux)         (14/09/2014): 16053888, BPP changes (more NoImplicitPrelude in base)
-#            # (amd64/Linux)         (19/09/2014): 18744992, unknown
-#            # (amd64/Linux)         2014-10-13:   13251728, Stricter seqDmdType
+      # deactivated for now, as this metric became too volatile recently
+      # collect_compiler_stats('max_bytes_used',20)
 
        only_ways(['normal']),
 
@@ -409,37 +104,7 @@ test('T4007',
      ['$MAKE -s --no-print-directory T4007'])
 
 test('T5030',
-     [compiler_stats_num_field('bytes allocated',
-          [(wordsize(32), 345668088, 10),
-           # previous:    196457520
-           # 2012-10-08:  259547660 (x86/Linux, new codegen)
-           # 2013-11-21:  198573456 (x86 Windows, 64 bit machine)
-           # 2014-12-10:  227205560 constraint solver got worse again; more aggressive solving
-           #                        of family-applications leads to less sharing, I think
-           # 2015-07-11:  201882912 reason unknown
-           # 2016-04-06:  345668088 likely TypeInType
-
-           (wordsize(64), 794426536, 10)]),
-             # Previously 530000000 (+/- 10%)
-             # 17/1/13:   602993184  (x86_64/Linux)
-             #            (new demand analyser)
-             # 2013-06-08 538467496  (x86_64/Linux)
-             # ^ reason unknown
-             # 2013-08-02 454498592  (amd64/Linux)
-             # decrease from more aggressive coercion optimisations from roles
-             # 2013-11-12 397672152  (amd64/Linux)
-             # big decrease following better CSE and arity
-             # 2014-07-17 409314320  (amd64/Linux)
-             # general round of updates
-             # 2014-09-10 385152728  post-AMP-cleanup
-             # 2014-12-08 340969128  constraint solver perf improvements (esp kick-out)
-             # 2014-12-10 449042120  constraint solver got worse again; more aggressive solving
-             #                          of family-applications leads to less sharing, I think
-             # 2015-03-17 403932600  tweak to solver algorithm
-             # 2015-12-11 653710960  TypeInType (see #11196)
-             # 2016-10-17 794426536  20% big increase following
-             #                       31621b12 * A collection of type-inference refactorings.
-             #                       See ticket for more info
+     [collect_compiler_stats('bytes allocated', 10),
 
        only_ways(['normal'])
       ],
@@ -447,47 +112,14 @@ test('T5030',
      ['-freduction-depth=300'])
 
 test('T5631',
-     [compiler_stats_num_field('bytes allocated',
-          [(wordsize(32), 570137436, 10),
-        # expected value: 392904228 (x86/Linux)
-        # 2014-04-04:     346389856 (x86 Windows, 64 bit machine)
-        # 2014-12-01:     390199244 (Windows laptop)
-        # 2016-04-06:     570137436 (amd64/Linux) many reasons
-           (wordsize(64), 1161885448, 5)]),
-        # expected value: 774595008 (amd64/Linux):
-        # expected value: 735486328 (amd64/Linux) 2012/12/12:
-        # expected value: 690742040 (amd64/Linux) Call Arity improvements
-        # 2014-09-09:     739704712 (amd64/Linux) AMP changes
-        # 2014-11-04:     776121120 (amd64/Linux) new-flatten-skolems
-        # 2015-06-01:     812288344 (amd64/Linux) unknown cause
-        # 2015-12-11:     1128828928 (amd64/Linux) TypeInType (see #11196)
-        # 2015-12-21:     1198327544 (Mac) TypeApplications (will fix with #11196)
-        # 2015-03-18:     1124068664 (Mac) optimize Unify & zonking
-        # 2016-10-19:     1024926024 (amd64/Linux) Refactor traceRn interface (#12617)
-        # 2016-11-10:     1077429456 (amd64/Linux) Stop -dno-debug-output suppressing -ddump-tc-trace
-        # 2017-02-17:     1517484488 (amd64/Linux) Type-indexed Typeable
-        # 2017-03-03:     1065147968 (amd64/Linux) Share Typeable KindReps
-        # 2017-03-31:     1037482512 (amd64/Linux) Fix memory leak in simplifier
-        # 2017-07-27:     1106015512 (Mac) Regresssion from tracking visibility in TypeEqOrigin
-        #                                  should be fixed by #14037
-        # 2018-06-18:     1161885448 (Mac) Not entirely clear
-       only_ways(['normal'])
+     [collect_compiler_stats('bytes allocated',10),
+      only_ways(['normal'])
       ],
      compile,
      [''])
 
 test('parsing001',
-     [compiler_stats_num_field('bytes allocated',
-          [(wordsize(32), 232777056, 10),
-        # Initial:        274000576
-        # 2017-03-24:     232777056
-           (wordsize(64), 519401296, 5)]),
-        # expected value: 587079016 (amd64/Linux)
-        # 2016-09-01:     581551384 (amd64/Linux) Restore w/w limit (#11565)
-        # 2016-12-19:     493730288 (amd64/Linux) Join points (#12988)
-        # 2017-02-14:     463931280 Early inlining patch; acutal improvement 7%
-        # 2017-12-11:     490228304 BlockArguments
-        # 2018-04-09:     519401296 Inexplicable, collateral of #14737
+     [collect_compiler_stats('bytes allocated',10),
        only_ways(['normal']),
       ],
      compile_fail, [''])
@@ -495,333 +127,53 @@ test('parsing001',
 
 test('T783',
      [ only_ways(['normal']),  # no optimisation for this one
-      # expected value: 175,569,928 (x86/Linux)
-      compiler_stats_num_field('bytes allocated',
-          [(wordsize(32), 225911912, 5),
-            # 2012-10-08: 226907420 (x86/Linux)
-            # 2013-02-10: 329202116 (x86/Windows)
-            # 2013-02-10: 338465200 (x86/OSX)
-            # 2014-04-04: 319179104 (x86 Windows, 64 bit machine)
-            # 2014-09-03: 223377364 (Windows) better specialisation, raft of core-to-core optimisations
-            # 2014-12-22: 235002220 (Windows) not sure why
-            # 2016-04-06: 249332816 (x86/Linux, 64-bit machine)
-            # 2017-03-24: 225911912 (x86/Linux, 64-bit machine)
-
-           (wordsize(64), 481875416, 10)]),
-            # prev:       349263216 (amd64/Linux)
-            # 07/08/2012: 384479856 (amd64/Linux)
-            # 29/08/2012: 436927840 (amd64/Linux)
-            # 12/11/2012: 640324528 (amd64/Linux)
-            #   (OldCmm removed: not sure why this got worse, the
-            #    other perf tests remained about the same)
-            # 18/10/2013: 734038080 (amd64/Linux)
-            #   (fix for #8456)
-            # 24/10/2013: 654804144 (amd64/Linux)
-            #   (fix previous fix for #8456)
-            # 2014-07-17: 640031840 (amd64/Linux)
-            #   (general round of updates)
-            # 2014-08-29: 441932632 (amd64/Linux)
-            #   (better specialisation, raft of core-to-core optimisations)
-            # 2014-08-29: 719814352 (amd64/Linux)
-            #   (changed order of cmm block causes analyses to allocate much more,
-            #      but the changed order is slighly better in terms of runtime, and
-            #      this test seems to be an extreme outlier.)
-            # 2015-05-16: 548288760 (amd64/Linux)
-            #   (improved sequenceBlocks in nativeCodeGen, #10422)
-            # 2015-08-07: 470738808 (amd64/Linux)
-            #   (simplifying the switch plan code path for simple checks, #10677)
-            # 2015-08-28: 526230456 (amd64/Linux)
-            #    (D757: Emit Typeable instances at site of type definition)
-            # 2015-12-04: 1134085384 (amd64/Linux)
-            #    (D1535: Major overhaul of pattern match checker, #11162)
-            # 2016-02-03: 488592288 (amd64/Linux)
-            #    (D1795: Another overhaul of pattern match checker, #11374)
-            # 2017-02-14    436978192    Early inlining: 5% improvement
-            # 2017-09-08    481875416    Unknown
-
+      collect_compiler_stats('bytes allocated',10),
       extra_hc_opts('-static')
       ],
       compile,[''])
 
 test('T5321Fun',
      [ only_ways(['normal']),  # no optimisation for this one
-       compiler_stats_num_field('bytes allocated',
-           [(wordsize(32), 244387620, 10),
-             # prev:       300000000
-             # 2012-10-08: 344416344 x86/Linux
-             #  (increase due to new codegen)
-             # 2014-09-03: 299656164     (specialisation and inlining)
-             # 2014-12-10: 206406188     #  Improvements in constraint solver
-             # 2016-04-06: 279922360 x86/Linux
-             # 2017-03-24: 244387620 x86/Linux (64-bit machine)
-
-            (platform('x86_64-apple-darwin'), 446893600, 5),
-             # 2018-03-17: 423774560     #  OS X-only (reason unknown, see #11753)
-
-            (wordsize(64), 423774560, 5)])
-             # prev:       585521080
-             # 2012-08-29: 713385808     #  (increase due to new codegen)
-             # 2013-05-15: 628341952     #  (reason for decrease unknown)
-             # 2013-06-24: 694019152     #  (reason for re-increase unknown)
-             # 2014-05-12: 614409344     #  (specialisation and inlining changes)
-             # 2014-09-10: 601629032     #  post-AMP-cleanup
-             # 2014-11-06: 541287000     #  Simon's flat-skol changes to the constraint solver
-             # 2014-12-10: 408110888     #  Improvements in constraint solver
-             # 2014-12-16: 429921312     #  Flattener parameterized over roles
-             # 2015-08-10: 509921312
-             #  (undefined now takes an implicit parameter and GHC -O0 does
-             #  not recognize that the application is bottom)
-             # 2015-12-11: 565883176     #  TypeInType (see #11196)
-             # 2017-01-06: 497356688     #  Small coercion optimisations
-                                         #  The actual decrease was only 2%; earlier
-                                         #    commits had drifted down
-             # 2017-01-22: 525895608     #  Allow top-level string literals in Core. I'm not
-                                         #    convinced that this patch is
-                                         #    responsible for all of this
-                                         #    change, however. Namely I am
-                                         #    quite skeptical of the downward
-                                         #    "drift" reported above
-             # 2017-01-31: 498135752     #  Join points (#12988)
-             # 2017-02-23: 524706256     #  Type-indexed Typeable? (on Darwin)
-             # 2017-02-25: 488295304     #  Early inlining patch
-             # 2017-05-14: 449577856     #  (amd64/Linxu) Two-pass CmmLayoutStack
-             # 2017-12-13: 423774560     #  (amd64/Linxu) Typechecker improvements
+       collect_compiler_stats('bytes allocated',10)
       ],
       compile,[''])
 
 test('T5321FD',
      [ only_ways(['normal']),  # no optimisation for this one
-      compiler_stats_num_field('bytes allocated',
-          [(wordsize(32), 250757460, 10),
-            # prev:       213380256
-            # 2012-10-08: 240302920 (x86/Linux)
-            #  (increase due to new codegen)
-            # 2014-07-31: 211699816 (Windows) (-11%)
-            #  (due to better optCoercion, 5e7406d9, #9233)
-            # 2016-04-06: 250757460 (x86/Linux)
-
-           (wordsize(64), 371826136, 10)])
-            # prev:       418306336
-            # 29/08/2012: 492905640
-            #  (increase due to new codegen)
-            # 15/05/2013: 406039584
-            #  (reason for decrease unknown)
-            # 08/06/2013: 476497048
-            #  (reason for increase unknown)
-            # before 2014-07-17: 441997096
-            #  (with -8%, still in range, hence cause not known)
-            # 2014-07-17: 426960992 (-11% of previous value)
-            #  (due to better optCoercion, 5e7406d9, #9233)
-            # 2014-10-08  410895536
-            #  (various changes; biggest improvements due to 949ad67 and FastString package ids)
-            # 2015-08-10: 470895536
-            #  (undefined now takes an implicit parameter and GHC -O0 does
-            #  not recognize that the application is bottom)
-            # 2015-10-28: 532365376
-            #  D757: emit Typeable instances at site of type definition
-            # 2016-07-16: 477840432
-            #  Optimize handling of built-in OccNames
-            # 2017-05-14: 415136648 (amd64/Linux) Two-pass CmmLayoutStack
-            # 2018-04-24: 371826136 (amd64/Linux) Store size in LitString
+      collect_compiler_stats('bytes allocated',10)
       ],
       compile,[''])
 
 test('T5642',
      [ only_ways(['normal']),
        normal,
-       compiler_stats_num_field('bytes allocated',
-           [(wordsize(32), 413517560, 10),
-                     # sample from x86/Linux
-            # prev:        650000000
-            # 2014-09-03:  753045568
-            # 2014-12-10:  641085256 Improvements in constraints solver
-            # 2016-04-06:  462677300
-            # 2017-03-24:  413517560 (x86/Linux, 64-bit machine)
-
-            (wordsize(64),  838316496, 10)])
-            # prev:        1300000000
-            # 2014-07-17:  1358833928 (general round of updates)
-            # 2014-08-07:  1402242360 (caused by 1fc60ea)
-# Watch out for:
-            # 23/05/2014:  1452688392 (More aggressive specialisation means we get
-            #                          specialised copies of imported functions that
-            #                          are ultimately discarded by trimAutoRules
-            #                          It's a bizarre program with LOTS of data types)
-            # 2014-09-10:  1536924976 post-AMP-cleanup
-            # 2014-12-10:  1282916024 Improvements in constraints solver
-            # 2015-10-28:  1412808976 Emit Typeable at definition site
-            # 2015-11-22:  1071915072 Use TypeLits in the metadata encoding
-            # 2016-02-08:   950004816 Pattern match checker re-rework
-            # 2016-05-12:  1300685592 Make Generic1 poly-kinded
-            # 2016-06-05:   916484672 Refactor derived Generic instances to reduce allocations
-            # 2016-09-03:   838316496 Derive the Generic instance in perf/compiler/T5642
+       collect_compiler_stats('bytes allocated',10)
       ],
       compile,['-O'])
 
 test('T5837',
      [ only_ways(['normal']),
-      compiler_stats_num_field('bytes allocated',
-          [(wordsize(32), 27028956, 10),
-             # 40000000 (x86/Linux)
-             # 2013-11-13:  45520936 (x86/Windows, 64bit machine)
-             # 2014-09-03:  37096484 (Windows laptop, w/w for INLINABLE things
-             # 2014-12-01: 135914136 (Windows laptop, regression see below)
-             # 2014-12-08: 115905208  Constraint solver perf improvements (esp kick-out)
-             # 2016-04-06: 24199320  (x86/Linux, 64-bit machine) TypeInType
-             # 2017-03-24: 27028956  (x86/Linux, 64-bit machine)
-
-           (platform('x86_64-unknown-mingw32'), 61806136, 7),
-             # 2017-02-19                       59161648 (x64/Windows) - Unknown
-             # 2017-04-21                       54985248 (x64/Windows) - Unknown
-             # 2017-12-24                       54793816 (x64/Windows) - Unknown
-             # 2018-09-23                       61806136 (x64/Windows) - Unknown
-
-           (wordsize(64), 55813608, 7)])
-             # sample: 3926235424 (amd64/Linux, 15/2/2012)
-             # 2012-10-02 81879216
-             # 2012-09-20 87254264 amd64/Linux
-             # 2013-09-18 90587232 amd64/Linux
-             # 2013-11-21 86795752 amd64/Linux, GND via Coercible and counters
-             #                                  for constraints solving
-             # 2014-08-29 73639840 amd64/Linux, w/w for INLINABLE things
-             # 2014-10-08 73639840 amd64/Linux, Burning Bridges and other small changes
-             # 2014-11-06 271028976       Linux, Accept big regression;
-             #   See Note [An alternative story for the inert substitution] in TcFlatten
-             # 2014-12-08 234790312 Constraint solver perf improvements (esp kick-out)
-             # 2014-12-16 231155640 Mac  Flattener parameterized over roles;
-             #                           some optimization
-             # 2015-03-17 53424304  Mac  Better depth checking; fails earlier
-             # 2015-06-09 38834096  Better "improvement"; I'm not sure whey it improves things
-             # 2015-12-11 43877520  amd64/Linux, TypeInType (see #11196)
-             # 2016-03-18 48507272  Mac, accept small regression in exchange
-             #                           for other optimisations
-             # 2016-09-15 42445672  Linux; fixing #12422
-             # 2016-09-25 41832056  amd64/Linux, Rework handling of names (D2469)
-             # 2016-10-25 52597024  amd64/Linux, the test now passes (hooray), and so
-             #                          allocates more because it goes right down the
-             #                          compilation pipeline
-             # 2017-01-24 57861352  amd64/Linux, very likely due to the top-level strings
-             #                          in Core patch.
-             # 2017-02-07 50253880  Another improvement in SetLevels.  I don't think
-             #                      all the gain here is from this patch, but I think it
-             #                      just pushed it over the edge, so I'm re-centreing, and
-             #                      changing to 5% tolerance
-             # 2017-02-07 53592736  amd64/Linux Simon's earlier decrease appears
-             #                      to be environmentally-dependent.
-             #                      Also bumped acceptance threshold to 7%.
-             # 2017-02-20 58648600  amd64/Linux Type-indexed Typeable
-             # 2017-02-28 54151864  amd64/Linux Likely drift due to recent simplifier improvements
-             # 2017-02-25 52625920  amd64/Linux Early inlining patch
-             # 2017-09-06 56782344  amd64/Linux Drift manifest in unrelated LLVM patch
-             # 2017-10-24 52089424  amd64/linux Fix space leak in BinIface.getSymbolTable
-             # 2018-02-19 55813608  amd64/Linux Unknown
+      collect_compiler_stats('bytes allocated',10)
       ],
       compile, ['-freduction-depth=50'])
 
 test('T6048',
      [ only_ways(['optasm']),
-      compiler_stats_num_field('bytes allocated',
-          [(wordsize(32), 55701280, 10),
-            # prev:       38000000 (x86/Linux)
-            # 2012-10-08: 48887164 (x86/Linux)
-            # 2014-04-04: 62618072 (x86 Windows, 64 bit machine)
-            # 2014-09-03: 56315812 (x86 Windows, w/w for INLINABLE)
-            # 2014-12-01: 49987836 (x86 Windows)
-            # 2016-04-06: 55701280 (x86/Linux, 64-bit machine)
-
-           (wordsize(64), 100574504, 10)])
-             # 2012-09-18  97247032 amd64/Linux
-             # 2014-01-16 108578664 amd64/Linux (unknown, likely foldl-via-foldr)
-             # 2014-01-18  95960720 amd64/Linux Call Arity improvements
-             # 2014-02-28 105556793 amd64/Linux (unknown, tweak in base/4d9e7c9e3 resulted in change)
-             # 2014-03-05 110646312 amd64/Linux Call Arity became more elaborate
-             # 2014-07-14 125431448 amd64/Linux unknown reason. Even worse in GHC-7.8.3. *shurg*
-             # 2014-08-29 108354472 amd64/Linux w/w for INLINABLE things
-             # 2014-09-14  88186056 amd64/Linux BPP part1 change (more NoImplicitPreludes in base)
-             # 2014-01-08  95946688 amd64/Linux Mostly 4c834fd. Occasional spikes to 103822120!
-             # 2016-03-11 108225624 amd64/Linux unknown reason sadly; likely gradual creep.
-             # 2016-11-25  94327392 amd64/Linux Back down again hooray; still not sure why
-             # 2017-02-17 115715592 amd64/Linux Type-indexed Typeable
-             # 2017-04-28  90996312 Join point refactoring
-             # 2018-06-18 100574504 Darwin      Unclear
+      collect_compiler_stats('bytes allocated',10)
       ],
       compile,[''])
 
 test('T9020',
      [ only_ways(['optasm']),
-      compiler_stats_num_field('bytes allocated',
-          [(wordsize(32), 249904136, 10),
-           # Original:    381360728
-           # 2014-07-31:  343005716 (Windows) (general round of updates)
-           # 2017-03-24:  249904136 (x86/Linux, 64-bit machine)
-
-           (wordsize(64), 391876936, 10)])
-           # prev:        795469104
-           # 2014-07-17:  728263536 (general round of updates)
-           # 2014-09-10:  785871680 post-AMP-cleanup
-           # 2014-11-03:  680162056 Further Applicative and Monad adjustments
-           # 2015-10-21:  786189008 Make stronglyConnCompFromEdgedVertices deterministic
-           # 2016-01-26:  698401736 improvement from using ExpTypes instead of ReturnTvs
-           # 2016-04-06:  852298336 Refactoring of CSE #11781
-           # 2016-04-06:  698401736 Use thenIO in Applicative IO
-           # 2017-02-03:  764866144 Join points
-           # 2017-02-14:  500707080 Early inline patch; 35% decrease!
-           #                        Program size collapses in first simplification
-           # 2017-03-31:  493596312 Fix memory leak in simplifier
-           # 2017-04-28:  423163832  Remove exponential behaviour in simplifier
-           # 2018-04-09:  562206104 Inexplicable, collateral of #14737
-           # 2018-05-14:  391876936 Improved simplCast performance #15019
+      collect_compiler_stats('bytes allocated',10)
       ],
       compile,[''])
 
 test('T9675',
      [ only_ways(['optasm']),
-       compiler_stats_num_field('max_bytes_used', # Note [residency]
-          [(wordsize(64), 20499224, 15),
-          # 2014-10-13    29596552
-          # 2014-10-13    26570896   seq the DmdEnv in seqDmdType as well
-          # 2014-10-13    18582472   different machines giving different results..
-          # 2014-10-13    22220552   use the mean
-          # 2015-06-21    28056344   switch to `+RTS -G1`, tighten bound to 15%
-          # 2015-10-28    23776640   emit Typeable at definition site
-          # 2015-12-11    30837312   TypeInType (see #11196)
-          # 2016-03-14    38776008   Final demand analyzer run
-          # 2016-04-01    29871032   Fix leaks in demand analysis
-          # 2016-04-30    17675240   Fix leaks in tidy unfoldings
-          # 2018-09-21    20499224   See #15663
-           (wordsize(32), 18043224, 15)
-          # 2015-07-11    15341228   (x86/Linux, 64-bit machine) use +RTS -G1
-          # 2016-04-06    18043224   (x86/Linux, 64-bit machine)
-          ]),
-       compiler_stats_num_field('peak_megabytes_allocated', # Note [residency]
-          [(wordsize(64), 75, 15),
-          # 2014-10-13    66
-          # 2014-10-13    58         seq the DmdEnv in seqDmdType as well
-          # 2014-10-13    49         different machines giving different results...
-          # 2014-10-13    53         use the mean
-          # 2015-06-15    44         reduced for some reason
-          # 2015-06-21    105        switch to `+RTS -G1`
-          # 2015-12-04    88         new pattern checker (D1535)
-          # 2015-12-11    113        TypeInType (see #11196)
-          # 2016-04-14    144        Final demand analyzer run
-          # 2016-07-26    121        Unboxed sums?
-          # 2017-04-30    63         Fix leaks in tidy unfoldings
-          # 2018-09-21    75         See #15663
-            (wordsize(32), 56, 15)
-          # 2015-07-11    56         (x86/Linux, 64-bit machine) use +RTS -G1
-          ]),
-       compiler_stats_num_field('bytes allocated',
-          [(wordsize(64), 656137960, 10)
-          # 2014-10-13    544489040
-          # 2015-10-28    608284152  emit Typeable at definition site
-          # 2017-02-17    731171072  Type-indexed Typeable
-          # 2017-03-13    656137960  Put join ceiling underneath lambdas?
-
-          ,(wordsize(32), 322901484, 10)
-          # 2015-07-11    279480696  (x86/Linux, 64-bit machine) use +RTS -G1
-          # 2017-03-24    322901484  (x86/Linux, 64-bit machine)
-
-          ]),
+       # Residency metrics; see Note [residency].
+       collect_compiler_stats(['max_bytes_used','peak_megabytes_allocated'],15),
+       collect_compiler_stats('bytes allocated',10),
 
        # Use `+RTS -G1` for more stable residency measurements. Note [residency].
        extra_hc_opts('+RTS -G1 -RTS')
@@ -831,145 +183,40 @@ test('T9675',
 
 test('T9872a',
      [ only_ways(['normal']),
-       compiler_stats_num_field('bytes allocated',
-          [(wordsize(64), 2729927408, 5),
-          # 2014-12-10    5521332656    Initally created
-          # 2014-12-16    5848657456    Flattener parameterized over roles
-          # 2014-12-18    2680733672    Reduce type families even more eagerly
-          # 2015-12-11    3581500440    TypeInType (see #11196)
-          # 2016-04-07    3352882080    CSE improvements
-          # 2016-10-19    3134866040    Refactor traceRn interface (#12617)
-          # 2017-02-17    3298422648    Type-indexed Typeable
-          # 2017-02-25    3005891848    Early inlining patch
-          # 2018-03-26    2729927408    Flattener update with optimizations (#12919)
-
-           (wordsize(32), 1493198244, 5)
-          # was           1325592896
-          # 2016-04-06    1740903516    x86/Linux
-          # 2017-03-24    1493198244    x86/Linux, 64-bit machine
-          ]),
+       collect_compiler_stats('bytes allocated',5)
       ],
      compile_fail,
      [''])
 
 test('T9872b',
      [ only_ways(['normal']),
-       compiler_stats_num_field('bytes allocated',
-          [(wordsize(64), 3730686224, 5),
-          # 2014-12-10    6483306280    Initally created
-          # 2014-12-16    6892251912    Flattener parameterized over roles
-          # 2014-12-18    3480212048    Reduce type families even more eagerly
-          # 2015-12-11    5199926080    TypeInType (see #11196)
-          # 2016-02-08    4918990352    Improved a bit by tyConRolesRepresentational
-          # 2016-04-06:   4600233488    Refactoring of CSE #11781
-          # 2016-09-15:   4069522928    Fix #12422
-          # 2017-02-14    3730686224    Early inlining: 5% improvement
-
-           (wordsize(32), 1894037608, 5)
-          # was           1700000000
-          # 2016-04-06    2422750696    x86/Linux
-          # 2017-03-24    1894037608    x86/Linux, 64-bit machine
-          ]),
+       collect_compiler_stats('bytes allocated',5)
       ],
      compile_fail,
      [''])
 test('T9872c',
      [ only_ways(['normal']),
-       compiler_stats_num_field('bytes allocated',
-          [(wordsize(64), 3096670112, 5),
-          # 2014-12-10    5495850096    Initally created
-          # 2014-12-16    5842024784    Flattener parameterized over roles
-          # 2014-12-18    2963554096    Reduce type families even more eagerly
-          # 2015-12-11    4723613784    TypeInType (see #11196)
-          # 2016-02-08    4454071184    Improved a bit by tyConRolesRepresentational
-          # 2016-04-06:   4306667256    Refactoring of CSE #11781
-          # 2016-09-15:   3702580928    Fixing #12422
-          # 2017-02-14    3404346032    Early inlining: 5% improvement
-          # 2018-03-25    3096670112    Flattener patch with optimizations (#12919)
-
-           (wordsize(32), 1727582260, 5)
-          # was           1500000000
-          # 2016-04-06    2257242896
-          # 2017-03-24    1727582260    x86/Linux, 64-bit machine
-          ]),
+       collect_compiler_stats('bytes allocated',5)
       ],
      compile_fail,
      [''])
 test('T9872d',
      [ only_ways(['normal']),
-       compiler_stats_num_field('bytes allocated',
-          [(wordsize(64), 578498120, 7),
-          # 2014-12-18    796071864   Initally created
-          # 2014-12-18    739189056   Reduce type families even more eagerly
-          # 2015-01-07    687562440   TrieMap leaf compression
-          # 2015-03-17    726679784   tweak to solver; probably flattens more
-          # 2015-12-11    566134504   TypeInType; see #11196
-          # 2016-02-08    534693648   Improved a bit by tyConRolesRepresentational
-          # 2016-03-18    506691240   optimize Unify & zonking
-          # 2016-12-05    478169352   using tyConIsTyFamFree, I think, but only
-          #                           a 1% improvement 482 -> 478
-          # 2017-02-17    535565128   Type-indexed Typeable
-          # 2017-02-25    498855104   Early inlining
-          # 2017-03-03    462817352   Share Typeable KindReps
-          # 2018-03-25    526485920   Flattener patch does more work (#12919)
-          # 2018-04-11    572537984   simplCast improvement collateral (#11735)
-          # 2018-07-04    578498120   introduce GRefl (#15192)
-
-           (wordsize(32), 232954000, 5)
-          # some date     328810212
-          # 2015-07-11    350369584
-          # 2016-04-06    264566040   x86/Linux
-          # 2017-03-24    232954000   x86/Linux, 64-bit machine
-          ]),
+       collect_compiler_stats('bytes allocated',7)  # percent; keeps the former 64-bit bound
       ],
      compile,
      [''])
 
 test('T9961',
      [ only_ways(['normal']),
-       compiler_stats_num_field('bytes allocated',
-          [(wordsize(64), 498326216, 5),
-          # 2015-01-12    807117816   Initally created
-          # 2015-spring   772510192   Got better
-          # 2015-05-22    663978160   Fix for #10370 improves it more
-          # 2015-10-28    708680480   x86_64/Linux   Emit Typeable at definition site
-          # 2015-12-17    745044392   x86_64/Darwin  Creep upwards
-          # 2016-03-20    519436672   x64_64/Linux   Don't use build desugaring for large lists (#11707)
-          # 2016-03-24    568526784   x64_64/Linux   Add eqInt* variants (#11688)
-          # 2016-09-01    537297968   x64_64/Linux   Restore w/w limit (#11565)
-          # 2016-12-19    571246936   x64_64/Linux   Join points (#12988)
-          # 2017-02-14    498326216   Early inline patch; 13% improvement
-
-           (wordsize(32), 255409052, 5)
-          # was           375647160
-          # 2016-04-06    275264188   x86/Linux
-          # 2017-03-24    255409052   x86/Linux, 64-bit machine
-          ]),
+       collect_compiler_stats('bytes allocated',5)
       ],
      compile,
      ['-O'])
 
 test('T9233',
     [ only_ways(['normal']),
-      compiler_stats_num_field('bytes allocated',
-        [(wordsize(64),  973149832, 5),
-         # 2015-08-04    999826288     initial value
-         # 2016-04-14   1066246248     Final demand analyzer run
-         # 2016-06-18    984268712     shuffling around of Data.Functor.Identity
-         # 2017-01-20    920101608     Improvement to SetLevels apparently saved 4.2% in
-         #                             compiler allocation.  Program size seems virtually
-         #                             unchanged; maybe the compiler itself is a little faster
-         # 2017-01-23    861862608     worker/wrapper evald-ness flags; another 5% improvement!
-         # 2017-02-01    894486272     Join points
-         # 2017-02-07    884436192     Another improvement to SetLevels
-         # 2017-02-17    974530192     Type-indexed Typeable
-         # 2017-03-21    924299320     It's unclear
-         # 2018-06-09    973149832     It's unclear
-
-         (wordsize(32),  460112888, 5)
-         # 2016-04-06    515672240     (x86/Linux) initial value
-         # 2017-03-24    460112888     x86/Linux, 64-bit machine
-        ]),
+      collect_compiler_stats('bytes allocated',5),
       extra_clean(['T9233a.hi', 'T9233a.o'])
     ],
     multimod_compile,
@@ -977,42 +224,8 @@ test('T9233',
 
 test('T10370',
      [ only_ways(['optasm']),
-       compiler_stats_num_field('max_bytes_used', # Note [residency]
-          [(wordsize(64), 31524048, 15),
-          # 2015-10-22    19548720
-          # 2016-02-24    22823976   Changing Levity to RuntimeRep; not sure why this regresses though, even after some analysis
-          # 2016-04-14    28256896   final demand analyzer run
-          # 2016-08-08    33049304
-          #     This change happened because we changed the behavior
-          #     of inlining across hs-boot files, so that we don't
-          #     inline if something comes from a boot file.  This
-          #     affected stats on bootstrapped GHC.  However,
-          #     when I set -i0.01 with profiling, the heap profiles
-          #     were identical, so I think it's just GC noise.
-          # 2016-10-20    38221184   Allow top-level string literals.
-          #                          See the comment 16 on #8472.
-          # 2017-02-17    51126304   Type-indexed Typeable
-          # 2017-02-27    43455848   Likely drift from recent simplifier improvements
-          # 2017-02-25    41291976   Early inline patch
-          # 2017-04-30    31524048   Fix leaks in tidy unfoldings
+       collect_compiler_stats(['max_bytes_used','peak_megabytes_allocated'], 15),  # Note [residency]
 
-           (wordsize(32), 19276304, 15),
-          # 2015-10-22    11371496
-          # 2017-03-24    19276304 (x86/Linux, 64-bit machine)
-          ]),
-       compiler_stats_num_field('peak_megabytes_allocated', # Note [residency]
-          [(wordsize(64), 117, 15),
-          # 2015-10-22     76
-          # 2016-04-14    101 final demand analyzer run
-          # 2016-08-08    121 see above
-          # 2017-01-18    146 Allow top-level string literals in Core
-          # 2017-02-17    187 Type-indexed Typeable
-          # 2017-02-25    154 Early inline patch
-          # 2017-04-30    117 Fix leaks in tidy unfoldings
-           (wordsize(32),  69, 15),
-          # 2015-10-22     39
-          # 2017-03-24     69
-          ]),
        # Use `+RTS -G1` for more stable residency measurements. Note [residency].
        extra_hc_opts('+RTS -G1 -RTS')
      ],
@@ -1020,32 +233,14 @@ test('T10370',
      [''])
 
 test('T10547',
-     [ compiler_stats_num_field('bytes allocated',
-          [(platform('x86_64-unknown-mingw32'), 37485128, 20),
-          # 2017-02-19                         37485128 (x64/Windows) - Unknown
-
-           (wordsize(64), 37681360, 20),
-          # initial:    39165544
-          # 2016-11-25: 31041520 Linux   Around the time of refactoring the constraint solver;
-          #                              but I think that only pushed it over the edge
-          # 2017-02-20: 38681216 Linux   Type-indexed Typeable
-          ]),
+     [ collect_compiler_stats('bytes allocated', 20),
      ],
      compile_fail,
      ['-fprint-expanded-synonyms'])
 
 test('T12227',
      [ only_ways(['normal']),
-       compiler_stats_num_field('bytes allocated',
-          [(wordsize(64), 752214784, 5),
-          # 2016-07-11    5650186880 (Windows) before fix for #12227
-          # 2016-07-11    1822822016 (Windows) after fix for #12227
-          # 2016-12-20    1715827784 after d250d493 (INLINE in Traversable dms)
-          #                          (or thereabouts in the commit history)
-          # 2017-02-14    1060158624  Early inlining: 35% improvement
-          # 2018-01-04    812869424   Drop unused givens (#13032): 23% better
-          # 2018-06-27    752214784   Trac #15421
-          ]),
+       collect_compiler_stats('bytes allocated',5)
      ],
      compile,
      # Use `-M1G` to prevent memory thrashing with ghc-8.0.1.
@@ -1053,52 +248,21 @@ test('T12227',
 
 test('T12425',
      [ only_ways(['optasm']),
-       compiler_stats_num_field('bytes allocated',
-          [(wordsize(64), 139100464, 5),
-          # initial:      125831400
-          # 2017-01-18:   133380960  Allow top-level string literals in Core
-          # 2017-02-17:   153611448  Type-indexed Typeable
-          # 2017-03-03:   142256192  Share Typeable KindReps
-          # 2017-03-21:   134334800  Unclear
-          # 2017-04-28:   127500136  Remove exponential behaviour in simplifier
-          # 2017-05-23:   134780272  Addition of llvm-targets in dynflags (D3352)
-          # 2018-04-15:   141952368  Collateral of #14737
-          # 2018-04-26:   150743648  Do not unpack class dictionaries with INLINABLE
-          # 2018-05-14:   139100464  improved simplCast performance #15019
-          ]),
+       collect_compiler_stats('bytes allocated',5)
      ],
      compile,
      [''])
 
 test('T12234',
      [ only_ways(['optasm']),
-       compiler_stats_num_field('bytes allocated',
-          [(platform('x86_64-unknown-mingw32'), 79889200, 5),
-          # initial:      83032768
-          # 2017-02-19    89180624 (x64/Windows) - Unknown
-          # 2017-02-25    79889200 (x64/Windows) - Early inline patch
-          # 2018-05-04    86938328 (x64/Windows) - Unknown and horrible
-           (wordsize(64), 85961968, 5),
-          # initial:      72958288
-          # 2016-01-17:   76848856  (x86-64, Linux. drift?)
-          # 2017-02-01:   80882208  (Use superclass instances when solving)
-          # 2017-02-05:   74374440  (Probably OccAnal fixes)
-          # 2017-02-17:   86525344  (Type-indexed Typeable)
-          # 2017-02-25:   83032768  (Early inline patch)
-          # 2017-09-07:   81696664  (Semigroup=>Monoid patch, D3927)
-          # 2018-04-26:   85961968  (Do not unpack class dictionaries with INLINABLE)
-          ]),
+       collect_compiler_stats('bytes allocated',5),
      ],
      compile,
      [''])
 
 test('T12545',
      [ only_ways(['normal']),
-       compiler_stats_num_field('bytes allocated',
-          [(wordsize(64), 3249613688, 5),
-          # 2017-06-08    3538652464  initial
-          # 2018-06-27    3249613688  Trac #15421
-          ]),
+       collect_compiler_stats('bytes allocated',5),
        extra_clean(['T12545a.hi', 'T12545a.o'])
      ],
      multimod_compile,
@@ -1106,88 +270,39 @@ test('T12545',
 
 test('T13035',
      [ only_ways(['normal']),
-       compiler_stats_num_field('bytes allocated',
-          [(wordsize(64), 125020728, 5),
-          # 2017-01-05   90595208  initial
-          # 2017-01-19   95269000  Allow top-level string literals in Core
-          # 2017-02-05   88806416  Probably OccAnal fixes
-          # 2017-02-17   103890200 Type-indexed Typeable
-          # 2017-02-25   98390488  Early inline patch
-          # 2017-03-21   93249744  It's unclear
-          # 2017-07-19   118665640 Generate Typeable bindings for data instances
-          # 2018-06-10   125020728 It's unclear
-          ]),
+       collect_compiler_stats('bytes allocated',5),
      ],
      compile,
      [''] )
 
 test('T13056',
      [ only_ways(['optasm']),
-       compiler_stats_num_field('bytes allocated',
-         [(wordsize(64), 440548592, 10),
-         # 2017-01-06    520166912 initial
-         # 2017-01-31    546800240 Join points (#12988)
-         # 2017-02-07    524611224 new SetLevels
-         # 2017-02-14    440548592 Early inline patch: 16% improvement
-         # 2017-04-21    417860736 (darwin)
-         # 2017-04-22    Increase to +/- 10% (Darwin and Linux differ significantly)
-         ]),
+       collect_compiler_stats('bytes allocated',10),
      ],
      compile,
      ['-O1'])
 
 test('T12707',
-     [ compiler_stats_num_field('bytes allocated',
-          [(wordsize(64), 1201750816, 5),
-          # initial:    1271577192
-          # 2017-01-22: 1348865648  Allow top-level strings in Core
-          # 2017-01-31: 1280336112  Join points (#12988)
-          # 2017-02-11: 1310037632  Check local family instances vs imports
-          # 2017-02-23: 1386110512  Type-indexed Typeable? (on Darwin)
-          # 2017-03-02: 1231809592  Drift from recent simplifier improvements
-          # 2017-05-14: 1163821528  (amd64/Linux) Two-pass CmmLayoutStack
-          # 2018-04-09: 1237898376  Inexplicable, collateral of #14737
-          # 2018-04-30: 1141555816  improved simplCast performance #15019
-          # 2018-09-21: 1201750816  (amd64/darwin) Drift
-          ]),
+     [ collect_compiler_stats('bytes allocated',5),
      ],
      compile,
      [''])
 
 test('T12150',
      [ only_ways(['optasm']),
-       compiler_stats_num_field('bytes allocated',
-          [(wordsize(64), 77557800, 10)
-          # initial:    70773000
-          # 2017-08-25: 74358208  Refactor the Mighty Simplifier
-          # 2017-08-25: 78300680  Drift
-          # 2017-10-25: 73769936  amd64/linux Fix space leak in BinIface.getSymbolTable
-          # 2018-04-26: 77557800  Do not unpack class dictionaries with INLINABLE
-          ]),
+       collect_compiler_stats('bytes allocated',10)
      ],
     compile,
      [''])
 
 test('T13379',
-     [ compiler_stats_num_field('bytes allocated',
-          [(platform('x86_64-apple-darwin'), 453166912, 10),
-          # 453166912: add osx-specific after two-pass CmmLayoutStack
-           (wordsize(64), 411597856, 10),
-          # initial:    411597856
-          # widen window to 10%, Darwin had 449080520, a 9.1% difference
-          ]),
+     [ collect_compiler_stats('bytes allocated',10),
      ],
      compile,
      [''])
 
 test('MultiLayerModules',
-     [ compiler_stats_num_field('bytes allocated',
-          [(wordsize(64), 5619893176, 10),
-          # initial:    12139116496
-          # 2017-05-12: 6956533312   Revert "Use a deterministic map for imp_dep_mods"
-          # 2017-05-31: 6294813000   Faster checkFamInstConsistency
-          # 2018-01-21: 5619893176   Allocate less in plus_mod_dep
-          ]),
+     [ collect_compiler_stats('bytes allocated',10),
        pre_cmd('./genMultiLayerModules'),
        extra_files(['genMultiLayerModules']),
        compile_timeout_multiplier(5)
@@ -1200,11 +315,7 @@ test('MultiLayerModules',
      ['MultiLayerModules', '-v0'])
 
 test('ManyConstructors',
-     [ compiler_stats_num_field('bytes allocated',
-          [(wordsize(64), 4246959352, 10),
-          # initial:    8130527160
-          # 2018-01-05: 4246959352  Lift constructor tag allocation out of a loop
-          ]),
+     [ collect_compiler_stats('bytes allocated',10),
        pre_cmd('./genManyConstructors'),
        extra_files(['genManyConstructors']),
      ],
@@ -1212,11 +323,7 @@ test('ManyConstructors',
      ['ManyConstructors', '-v0'])
 
 test('ManyAlternatives',
-     [ compiler_stats_num_field('bytes allocated',
-          [(wordsize(64), 1398898072, 10),
-          # initial:    1756999240
-          # 2018-01-20: 1398898072  Use IntSet in Dataflow
-          ]),
+     [ collect_compiler_stats('bytes allocated',10),
        pre_cmd('./genManyAlternatives'),
        extra_files(['genManyAlternatives']),
      ],
@@ -1224,18 +331,7 @@ test('ManyAlternatives',
      ['ManyAlternatives', '-v0'])
 
 test('T13701',
-     [ compiler_stats_num_field('bytes allocated',
-          [(platform('x86_64-apple-darwin'), 2549206272, 10),
-           (platform('x86_64-unknown-linux'), 2413253392, 10),
-           # initial:     2511285600
-           # 2017-06-23:  2188045288    treat banged variable bindings as FunBinds
-           # 2017-07-11:  2187920960
-           # 2017-07-12:  2412223768    inconsistency between Ben's machine and Harbormaster?
-           # 2017-07-17:  2133380768    Resolved the issue causing the inconsistencies in this test
-           # 2018-05-09:  2413253392    D4659 (Fix GHCi space leaks) added
-           #                            some strictness which causes some extra
-           #                            work to be done in this test.
-          ]),
+     [ collect_compiler_stats('bytes allocated',10),
        pre_cmd('./genT13701'),
        extra_files(['genT13701']),
        compile_timeout_multiplier(4)
@@ -1248,11 +344,7 @@ test('T13701',
      ['T13701', '-v0'])
 
 test('T13719',
-     [ compiler_stats_num_field('bytes allocated',
-          [(wordsize(64), 5187889872, 10),
-          # initial:    49907410784
-          # 2017-05-31: 5187889872   Faster checkFamInstConsistency
-          ]),
+     [ collect_compiler_stats('bytes allocated',10),
        pre_cmd('./genT13719'),
        extra_files(['genT13719']),
        compile_timeout_multiplier(2)
@@ -1265,12 +357,7 @@ test('T13719',
      ['T13719', '-v0'])
 
 test('T14697',
-     [ compiler_stats_num_field('bytes allocated',
-          [(wordsize(64), 371030912, 10),
-          # initial:    635573784
-          # 2018-02-23: 337290376 Cache the fingerprint of sOpt_P
-          # 2018-05-24: 371030912 Unknown
-          ]),
+     [ collect_compiler_stats('bytes allocated',10),
        # This generates too large of a command-line for poor Windows
        when(opsys('mingw32'), expect_broken(15072)),
        pre_cmd('./genT14697'),
@@ -1281,12 +368,7 @@ test('T14697',
      ['T14697', '-v0'])
 
 test('T14683',
-     [ compiler_stats_num_field('bytes allocated',
-          [(wordsize(64), 10521594688, 10),
-          # initial:      25189145632
-          # 2018-04-19:   14675353056  Cache NthCo role (#14683)
-          # 2018-04-20:   10521594688  Remove unnecessary check in simplCast
-          ]),
+     [ collect_compiler_stats('bytes allocated',10),
      ],
      multimod_compile,
      ['T14683', '-v0'])
@@ -1294,36 +376,20 @@ test('T14683',
 test('Naperian',
      [ reqlib('vector'),
        only_ways(['optasm']),
-       compiler_stats_num_field('bytes allocated',
-          [(platform('x86_64-unknown-mingw32'), 54116696, 10),
-           # 2017-12-24                       54116696 (x64/Windows) - Unknown
-          (wordsize(64), 53576760, 10)])
-           # 2018-01-25                       53576760 (x64/Linux) - The previous value looked very wrong
+       collect_compiler_stats('bytes allocated',10),
      ],
      compile,
      [''])
 
 test ('T9630',
-      [ compiler_stats_num_field('max_bytes_used', # Note [residency]
-          [(platform('x86_64-unknown-mingw32'),   39867088, 15),
-          # 2017-12-24:                     34171816 (x64/Windows)
-          (wordsize(64), 41365088, 15)
-          # initial:    56955240
-          # 2017-06-07: 41568168     Stop the specialiser generating loopy code
-          # 2018-02-25: 35324712     It's not entirely clear
-          # 2018-09-22: 41365088     It's not entirely clear (x86_64/darwin)
-          ]),
+      [ collect_compiler_stats('max_bytes_used',15), # Note [residency]
       extra_clean(['T9630a.hi', 'T9630a.o'])
       ],
       multimod_compile,
       ['T9630', '-v0 -O'])
 
 test ('T15164',
-      [ compiler_stats_num_field('bytes allocated',
-          [(wordsize(64), 1945564312, 10)
-          # initial:      3423873408
-          # 2018-05-22:   1945564312   Fix bottleneck in CallArity
-          ])
+      [ collect_compiler_stats('bytes allocated',10)
       ],
       compile,
       ['-v0 -O'])
diff --git a/testsuite/tests/perf/haddock/all.T b/testsuite/tests/perf/haddock/all.T
index 4161c2e6d3..fca30366f9 100644
--- a/testsuite/tests/perf/haddock/all.T
+++ b/testsuite/tests/perf/haddock/all.T
@@ -5,72 +5,7 @@
 test('haddock.base',
      [extra_files(['../../../../libraries/base/dist-install/haddock.t']),
       unless(in_tree_compiler(), skip), req_haddock
-     ,stats_num_field('bytes allocated',
-          [(platform('x86_64-unknown-mingw32'), 18733710728, 5)
-            # 2017-02-19                        24286343184 (x64/Windows) - Generalize kind of (->)
-            # 2017-12-24                        18733710728 (x64/Windows) - Unknown
-
-          ,(wordsize(64), 25913205656, 5)
-            # 2012-08-14:  5920822352 (amd64/Linux)
-            # 2012-09-20:  5829972376 (amd64/Linux)
-            # 2012-10-08:  5902601224 (amd64/Linux)
-            # 2013-01-17:  6064874536 (x86_64/Linux)
-            # 2013-02-10:  6282746976 (x86_64/Linux)
-            # 2013-09-17:  6634886456 (x86_64/Linux)
-            # 2013-09-18:  6294339840 (x86_64/Linux)
-            # 2013-11-21:  6756213256 (x86_64/Linux)
-            # 2014-01-12:  7128342344 (x86_64/Linux)
-            # 2014-06-12:  7498123680 (x86_64/Linux)
-            # 2014-08-05:  7992757384 (x86_64/Linux - bugfix for #314, Haddock now parses more URLs)
-            # 2014-08-08:  7946284944 (x86_64/Linux - Haddock updates to attoparsec-0.12.1.0)
-            # 2014-09-09:  8354439016 (x86_64/Linux - Applicative/Monad changes, according to Austin)
-            # 2014-09-10:  7901230808 (x86_64/Linux - Applicative/Monad changes, according to Joachim)
-            # 2014-10-07:  8322584616 (x86_64/Linux)
-            # 2014-12-14:  9502647104 (x86_64/Linux) - Update to Haddock 2.16
-            # 2014-01-08:  9014511528 (x86_64/Linux) - Eliminate so-called "silent superclass parameters" (and others)
-            # 2015-07-22:  9418857192 (x86_64/Linux) - Just slowly creeping up.
-            # 2015-10-03:  9894189856 (x86_64/Linux) - Still creeping
-            # 2015-12-11: 11119767632 (amd64/Linux) - TypeInType (see #11196)
-            # 2015-12-17: 26282821104 (x86_64/Linux) - Update Haddock to master
-            # 2015-12-17: 27812188000 (x86_64/Linux) - Move Data.Functor.* into base
-            # 2016-02-25: 30987348040 (x86_64/Linux) - RuntimeRep
-            # 2016-05-12: 32855223200 (x86_64/Linux) - Make Generic1 poly-kinded
-            # 2017-01-11: 31115778088 (x86_64/Linux) - Join points (#12988)
-            # 2017-02-11: 34819979936 (x86_64/Linux) - OccurAnal / One-Shot  (#13227)
-            # 2017-02-16: 32695562088 Better Lint for join points
-            # 2017-02-17: 38425793776 (x86_64/Linux) - Generalize kind of (->)
-            # 2017-02-12: 25592972912 (x86_64/Linux) - Type-indexed Typeable
-            # 2017-06-05: 27868466432 (x86_64/Linux) - Desugar modules compiled with -fno-code
-            # 2017-06-06: 25173968808 (x86_64/Linux) - Don't pass on -dcore-lint in Haddock.mk
-            # 2017-07-12: 23677299848 (x86_64/Linux) - Use getNameToInstancesIndex
-            # 2017-08-22: 19694554424 (x86_64/Linux) - Various Haddock optimizations
-	        # 2018-03-31: 20980255200 (x86_64/Linux) - Track type variable scope more carefully
-	        # previous to this last commit, the allocations were right below the top
-            # of the range. This commit adds only ~1.5% allocations.
-            # 2018-04-10: 18511324808 (x86_64/Linux) - TTG HsBinds and Data instances
-            # 2018-04-11: 20727464616 (x86_64/Linux) - Collateral of simplCast improvement (#14737)
-            # 2018-04-20: 18971030224 (x86_64/Linux) - Cache coercion roles
-            # 2018-05-14: 21123660336 (amd64/Linux) - D4659: strictness to fix space leaks
-            # 2018-06-14: 24662232152 (amd64/Linux) - Bump haddock
-            # 2018-10-08: 25913205656 (amd64/Linux&OSX) - D5167: Improve GHC.Prim docs
-
-          ,(platform('i386-unknown-mingw32'), 2885173512, 5)
-            # 2013-02-10:                     3358693084 (x86/Windows)
-            # 2013-11-13:                     3097751052 (x86/Windows, 64bit machine)
-            # 2014-04-04:                     3548581572 (x86/Windows, 64bit machine)
-            # 2014-12-01:                     4202377432 (x86/Windows, 64bit machine)
-            # 2015-01-20:                     4434804940 (x86/Windows, 64bit machine)
-            # 2017-04-02:                     2885173512 update
-
-          ,(wordsize(32), 3445319728, 5)])
-            # 2012-08-14: 3046487920 (x86/OSX)
-            # 2012-10-30: 2955470952 (x86/Windows)
-            # 2013-02-10: 3146596848 (x86/OSX)
-            # 2014-02-22: 3554624600 (x86/Linux - new haddock)
-            # 2014-06-29: 3799130400 (x86/Linux)
-            # 2016-04-06: 5509757068 (x86/Linux)
-            # 2017-03-24: 3819657568 (x86/Linux)
-            # 2017-04-06: 3445319728 (x86/Linux)
+     ,collect_stats('bytes allocated',5)
       ],
      stats,
      ['haddock.t'])
@@ -78,78 +13,7 @@ test('haddock.base',
 test('haddock.Cabal',
      [extra_files(['../../../../libraries/Cabal/Cabal/dist-install/haddock.t']),
       unless(in_tree_compiler(), skip), req_haddock
-     ,stats_num_field('bytes allocated',
-          [(wordsize(64), 27520214496, 5)
-            # 2012-08-14:  3255435248 (amd64/Linux)
-            # 2012-08-29:  3324606664 (amd64/Linux, new codegen)
-            # 2012-10-08:  3373401360 (amd64/Linux)
-            # 2013-03-13:  3626604824 (amd64/Linux) Cabal updated
-            # 2013-03-28:  3517301864 (amd64/Linux) fixed #7796
-            # 2013-04-26:  3658801800 (amd64/Linux) Cabal updated
-            # 2013-08-26:  3808466816 (amd64/Linux) Cabal updated
-            # 2013-11-21:  3908586784 (amd64/Linux) Cabal updated
-            # 2013-12-12:  3828567272 (amd64/Linux)
-            # 2014-01-12:  3979151552 (amd64/Linux) new parser
-            # 2014-06-29:  4200993768 (amd64/Linux)
-            # 2014-08-05:  4493770224 (x86_64/Linux - bugfix for #314, Haddock now parses more URLs)
-            # 2014-08-29:  4267311856 (x86_64/Linux - w/w for INLINABLE things)
-            # 2014-09-09:  4660249216 (x86_64/Linux - Applicative/Monad changes according to Austin)
-            # 2014-09-10:  4500376192 (x86_64/Linux - Applicative/Monad changes according to Joachim)
-            # 2014-09-24:  5840893376 (x86_64/Linux - Cabal update)
-            # 2014-10-04:  6019839624 (x86_64/Linux - Burning Bridges, Cabal update)
-            # 2014-12-14:  6387320816 (x86_64/Linux) - Update to Haddock 2.16
-            # 2015-01-22:  6710234312 (x86_64/Linux) - Cabal updated
-            # 2015-06-29:  7413958344 (x86_64/Linux) - due to #10482, not yet investigated
-            # 2015-12-11:  8114833312 (amd64/Linux) - TypeInType (See #11196)
-            # 2015-12-17:  9982130512 (amd64/Linux) - Update Haddock to master
-            # 2015-12-22: 10519532424 (amd64/Linux) - Lots of new Semigroup instances in Cabal
-            # 2016-03-29: 11517963232 (amd64/Linux) - not yet investigated
-            # 2016-03-30: 10941742184 (amd64/Linux) - defer inlining of Int* Ord methods
-            # 2016-04-06: 11542374816 (amd64/Linux) - CSE improvements and others
-            # 2016-04-07: 10963514352 (amd64/Linux) - Revert to what phabricator claims
-            # 2016-05-22: 11805238152 (amd64/Linux) - Make Generic1 poly-kinded
-            # 2016-06-05: 10997887320 (amd64/Linux) - Refactor derived Generic instances to reduce allocations
-            # 2016-06-21: 10070330520 (amd64/Linux) - D2350: Make checkFamInstConsistency less expensive
-            # 2016-08-07: 16001233464 (amd64/Linux) - Cabal update
-            #   It's worth noting that allocations scale up with the number
-            #   of modules in Cabal.  This Cabal update added a large number
-            #   of new modules; if you exclude them from the haddock run
-            #   the stats are comparable.
-            # 2016-10-01: 20619433656 (amd64/Linux) - Cabal update
-            # 2016-10-03: 21554874976 (amd64/Linux) - Cabal update
-            # 2016-10-06: 23706190072 (amd64/Linux) - Cabal update
-            # 2016-12-20: 25478853176 (amd64/Linux) - Cabal update
-            # 2017-01-14: 23272708864 (amd64/Linux) - Join points (#12988)
-            # 2017-02-11: 25533642168 (amd64/Linux) - OccurAnal / One-Shot  (#13227)
-            # 2017-02-16: 23867276992  Better Lint for join points
-            # 2017-02-17: 27784875792 (amd64/Linux) - Generalize kind of (->)
-            # 2017-02-12: 18865432648 (amd64/Linux) - Type-indexed Typeable
-            # 2017-05-31: 18269309128 (amd64/Linux) - Faster checkFamInstConsistency
-            # 2017-06-05: 22294859000 (amd64/Linux) - Desugar modules compiled with -fno-code
-            # 2017-06-05: 18753963960 (amd64/Linux) - Don't pass on -dcore-lint in Haddock.mk
-            # 2017-08-22: 15857428040 (amd64/Linux) - Various Haddock optimizations
-            # 2017-11-02: 17133915848 (amd64/Linux) - Phabricator D4144
-            # 2017-11-06: 18936339648 (amd64/Linux) - Unknown
-            # 2017-11-09: 20104611952 (amd64/Linux) - Bump Cabal
-            # 2018-01-22: 25261834904 (amd64/Linux) - Bump Cabal
-            # 2018-04-10: 23525241536 (amd64/Linux) - TTG HsBinds and Data instances
-            # 2018-05-14: 24519860272 (amd64/Linux) - D4659: strictness to fix space leaks
-            # 2018-06-14: 27520214496 (amd64/Linux) - Bump haddock
-
-          ,(platform('i386-unknown-mingw32'), 3293415576, 5)
-            # 2012-10-30:                     1733638168 (x86/Windows)
-            # 2013-02-10:                     1906532680 (x86/Windows)
-            # 2014-01-28:                     1966911336 (x86/Windows)
-            # 2014-04-24:                     2052220292 (x86/Windows)
-            # 2014-12-01:                     3088635556 (x86/Windows)
-            # 2015-01-20:                     3293415576
-
-          ,(wordsize(32), 3511151136, 5)])
-            # 2012-08-14: 1648610180 (x86/OSX)
-            # 2014-01-22: 1986290624 (x86/Linux)
-            # 2014-06-29: 2127198484 (x86/Linux)
-            # 2016-04-06: 6268156056 (x86/Linux)
-            # 2017-03-24: 3511151136 (x86/Linux)
+     ,collect_stats('bytes allocated',5)
       ],
      stats,
      ['haddock.t'])
@@ -157,49 +21,7 @@ test('haddock.Cabal',
 test('haddock.compiler',
      [extra_files(['../../../../compiler/stage2/haddock.t']),
       unless(in_tree_compiler(), skip), req_haddock
-     ,stats_num_field('bytes allocated',
-          [(platform('x86_64-unknown-mingw32'),   56775301896, 10),
-            # 2017-12-24:                     56775301896 (x64/Windows)
-            (wordsize(64), 63038317672, 10)
-            # 2012-08-14: 26070600504 (amd64/Linux)
-            # 2012-08-29: 26353100288 (amd64/Linux, new CG)
-            # 2012-09-18: 26882813032 (amd64/Linux)
-            # 2012-11-12: 25990254632 (amd64/Linux)
-            # 2014-07-17: 29809571376 (amd64/Linux) general round of updates
-            # 2012-11-27: 28708374824 (amd64/Linux)
-            # 2014-09-10: 30353349160 (amd64/Linux) post-AMP cleanup
-            # 2014-11-22: 33562468736 (amd64/Linux)
-            # 2015-06-02: 36740649320 (amd64/Linux) unknown cause
-            # 2015-06-29: 40624322224 (amd64/Linux) due to #10482, not yet investigated
-            # 2015-12-03: 44721228752 (amd64/Linux) slow creep upwards
-            # 2015-12-15: 49395782136 (amd64/Linux) more creep, following kind-equalities
-            # 2015-12-17: 58017214568 (amd64/Linux) update Haddock to master
-            # 2016-06-21: 55314944264 (amd64/Linux) D2350: Make checkFamInstConsistency less expensive
-            # 2016-11-29: 60911147344 (amd64/Linux) unknown cause
-            # 2017-02-11: 62070477608 (amd64/Linux) OccurAnal / One-Shot  (#13227) (and others)
-            # 2017-02-25: 55777283352 (amd64/Linux) Early inline patch
-            # 2017-05-31: 52762752968 (amd64/Linux) Faster checkFamInstConsistency
-            # 2017-06-05: 65378619232 (amd64/Linux) Desugar modules compiled with -fno-code
-            # 2017-06-06: 55990521024 (amd64/Linux) Don't pass on -dcore-lint in Haddock.mk
-            # 2017-07-12: 51592019560 (amd64/Linux) Use getNameToInstancesIndex
-            # 2018-04-08: 91115212032 (amd64/Linux) Trees that grow
-            # 2018-04-10: 58410358720 (amd64/Linux) Trees that grow (HsBinds, Data instances)
-            # 2018-05-14: 63038317672 (amd64/Linux) D4659: strictness to fix space leaks
-
-          ,(platform('i386-unknown-mingw32'),   367546388, 10)
-            # 2012-10-30:                     13773051312 (x86/Windows)
-            # 2013-02-10:                     14925262356 (x86/Windows)
-            # 2013-11-13:                     14328363592 (x86/Windows, 64bit machine)
-            # 2014-12-01:                       104140852 (x86/Windows, sudden shrinkage!)
-            # 2014-12-10:                       217933548 increased again
-            # 2017-04-02:                       367546388 update
-
-          ,(wordsize(32), 3775852520, 5)])
-            # 2012-08-14: 13471797488 (x86/OSX)
-            # 2014-01-22: 14581475024 (x86/Linux - new haddock)
-            # 2014-06-29: 15110426000 (x86/Linux)
-            # 2016-04-06: 16222702892 (x86/Linux)
-            # 2017-03-24: 3775852520  (x86/Linux)
+     ,collect_stats('bytes allocated',10)
       ],
      stats,
      ['haddock.t'])
diff --git a/testsuite/tests/perf/join_points/all.T b/testsuite/tests/perf/join_points/all.T
index fe202b6487..eedf0c0bff 100644
--- a/testsuite/tests/perf/join_points/all.T
+++ b/testsuite/tests/perf/join_points/all.T
@@ -7,17 +7,15 @@ setTestOpts(f)
 test('join001', normal, compile, [''])
 
 test('join002',
-  [stats_num_field('bytes allocated', [(wordsize(64), 2000290792, 5)])],
+  [collect_stats('bytes allocated',5),],
   compile_and_run,
   [''])
 test('join003',
-  [stats_num_field('bytes allocated', [(wordsize(64), 2000290792, 5)])],
+  [collect_stats('bytes allocated',5),],
   compile_and_run,
   [''])
 test('join004',
-  [stats_num_field('bytes allocated', [(wordsize(64), 16130592, 5)])],
-   # 2017-01-24   48146720    Join point rework
-   # 2017-02-05   16130592    Do Worker/Wrapper for NOINLINE things
+  [collect_stats('bytes allocated',5),],
   compile_and_run,
   [''])
 
@@ -25,11 +23,6 @@ test('join005', normal, compile, [''])
 test('join006', normal, compile, [''])
 
 test('join007',
-  [stats_num_field('bytes allocated',
-      [(platform('x86_64-unknown-mingw32'), 47368, 5),
-       # 2017-02-19                         47368 (x64/Windows) - Unknown
-
-       (wordsize(64), 50944, 5)])],
-       # Initial 50944
+  [collect_stats('bytes allocated',5),],
   compile_and_run,
   [''])
diff --git a/testsuite/tests/perf/should_run/all.T b/testsuite/tests/perf/should_run/all.T
index e3fb136d9f..0b70398e46 100644
--- a/testsuite/tests/perf/should_run/all.T
+++ b/testsuite/tests/perf/should_run/all.T
@@ -3,16 +3,14 @@
 # See Note [Solving from instances when interacting Dicts]
 
 test('T5835',
-     [stats_num_field('max_bytes_used',
-           [(wordsize(64), 44312, 10)]),
+     [collect_stats('max_bytes_used',10),
        only_ways(['normal'])
        ],
      compile_and_run,
      ['-O'])
 
 test('T12791',
-     [stats_num_field('max_bytes_used',
-           [(wordsize(64), 44312, 10)]),
+     [collect_stats('max_bytes_used',10),
        only_ways(['normal'])
        ],
      compile_and_run,
@@ -21,20 +19,14 @@ test('T12791',
 # Tests that newArray/newArray_ is being optimised correctly
 
 test('T10359',
-     [stats_num_field('bytes allocated',
-          [(wordsize(64), 450920, 5),
-           # previously   499512    (amd64/Linux)
-           # 2017-03-10   450920    (amd64/Linux)  Don't generate wrapper for !Int#
-           (wordsize(32), 351508, 5)]),
+     [collect_stats('bytes allocated',5),
       only_ways(['normal'])
       ],
      compile_and_run,
      ['-O'])
 
 test('T14955',
-     [stats_num_field('bytes allocated',
-          [(wordsize(64), 48050760, 5),
-           (wordsize(32), 351508, 5)]),
+     [collect_stats('bytes allocated',5),
       only_ways(['normal'])
       ],
      multimod_compile_and_run,
@@ -43,11 +35,8 @@ test('T14955',
 # fortunately the values here are mostly independent of the wordsize,
 # because the test allocates an unboxed array of doubles.
 test('T3586',
-     [stats_num_field('peak_megabytes_allocated', (17, 1)),
-                                 # expected value: 17 (amd64/Linux)
-      stats_num_field('bytes allocated', (16102024, 5)),
-                        # prev:           16835544 (amd64/Linux)
-                        # 2014-07-17:     16102024 (amd64/Linux), general round of updates
+     [collect_stats('peak_megabytes_allocated',1),
+      collect_stats('bytes allocated', 5),
       only_ways(['normal']),
 
       # Use `+RTS -G1` for more stable residency measurements. Note [residency].
@@ -58,17 +47,7 @@ test('T3586',
      ['-O'])
 
 test('T4830',
-     [stats_num_field('bytes allocated',
-          [(wordsize(64), 98248, 4),
-           #             127000 (amd64/Linux)
-           # 2013-02-07:  99264 (amd64/Linux)
-           # 2014-01-13:  98248 (amd64/Linux) due to #8647
-           # 2015-04-03: Widen 1->4% (amd64/Windows was doing better)
-           (wordsize(32), 70646, 3)]),
-           # 2013-02-10:  69744 (x86/Windows)
-           # 2013-02-10:  71548 (x86/OSX)
-           # 2014-01-28:  Widen range 2->3
-           #                (x86/Windows - actual 69000, lower was 69233)
+     [collect_stats('bytes allocated',4),
       only_ways(['normal'])
       ],
      compile_and_run,
@@ -82,27 +61,8 @@ test('T3245', [when(doing_ghci(), extra_hc_opts('-fobject-code'))],
 #
 test('lazy-bs-alloc',
      [extra_files(['../../numeric/should_run/arith011.stdout']),
-      stats_num_field('peak_megabytes_allocated', (2, 1)),
-                                 # expected value: 2 (amd64/Linux)
-      stats_num_field('bytes allocated',
-          [(wordsize(64), 421792, 5),
-            #             489776 (amd64/Linux)
-            # 2013-02-07: 429744 (amd64/Linux)
-            # 2013-12-12: 425400 (amd64/Linux)
-            # 2015-04-04: Widen 1->3% (amd64/Windows was failing)
-            # 2015-08-15: 431500 (Windows not good enough. avg of Windows&Linux)
-            # 2015-12-15: 444720 (amd64/Linux, D1616)
-            # 2015-12-17: 444720 (widen 3->5%, Windows is at 462688)
-            # 2017-01-30: 421792 (amd64/Linux, strangely Type-indexed Typeable)
-           (wordsize(32), 410040, 5)]),
-            # 2013-02-10: 421296 (x86/Windows)
-            # 2013-02-10: 414180 (x86/OSX)
-            # 2014-01-22: 411500 (x86/Linux)
-            # 2014-01-28: Widen 1->2% (x86/Windows was at 425212)
-            # 2016-04-06: 429760 (x86/Linux) no idea what happened
-            # 2017-02-14: 421448 Early inline patch
-            # 2017-03-24: 410040 It's not entirely clear, widen threshold to match 64-bit case
-
+      collect_stats('peak_megabytes_allocated', 1),
+      collect_stats('bytes allocated',5),
       only_ways(['normal']),
       extra_run_opts('arith011.stdout'),
       ignore_stdout,
@@ -116,25 +76,7 @@ test('lazy-bs-alloc',
      ['-O'])
 
 test('T876',
-     [stats_num_field('bytes allocated',
-          [(platform('x86_64-unknown-mingw32'), 53472, 5),
-              # 2015-04-03: 71904 (amd64/Windows, unknown cause)
-              # 2016-11-27: 66928 (amd64/Windows, unknown cause)
-              # 2017-12-24: 53472 (amd64/Windows, unknown cause)
-
-           (wordsize(64), 58128, 5),
-              # 2013-02-14: 1263712 (x86_64/Linux)
-              # 2014-02-10:   63216 (x86_64/Linux), call arity analysis
-              # 2016-11-11:   58128 (x86_64/Linux), it's not clear
-
-           (wordsize(32), 50408, 5) ]),
-              # some date:  663712  (Windows, 64-bit machine)
-              # 2014-04-04:  56820  (Windows, 64-bit machine)
-              # 2014-06-29:  53024  (x86_64/Linux)
-              # 2014-12-01:  56796  (Windows)
-              # 2015-07-11:  53156  (x86_64/Linux)
-              # 2017-03-24:  50408  (x86/Linux, 64-bit machine)
-
+     [collect_stats('bytes allocated',5),
       only_ways(['normal']),
       extra_run_opts('10000')
       ],
@@ -154,34 +96,16 @@ test('T4321',
 test('T3736', [], run_command, ['$MAKE -s --no-print-directory T3736'])
 test('T3738',
      [extra_clean(['T3738a.hi', 'T3738a.o']),
-      stats_num_field('peak_megabytes_allocated', (2, 0)),
-                    # expected value: 1 (amd64/Linux)
-                    # 2016-08-31:     2 (allocation area size bumped to 1MB)
-      stats_num_field('bytes allocated',
-                      [(wordsize(32), 45648, 5),
-                    # expected value: 50520 (x86/Linux)
-                       (wordsize(64), 50592, 8)]),
-                    # prev:           49400 (amd64/Linux)
-                    # 2014-07-17:     50520 (amd64/Linux) general round of updates
-                    # 2014-09-10:     50592 (amd64/Linux) post-AMP-update
-                    # 2015-04-03: Widen 5->8% (amd64/Windows was doing better)
+      collect_stats('peak_megabytes_allocated', 0),
+      collect_stats('bytes allocated',8),
       only_ways(['normal'])
       ],
      compile_and_run,
      ['-O'])
 
 test('MethSharing',
-     [stats_num_field('peak_megabytes_allocated', (2, 0)),
-                    # expected value: 1 (amd64/Linux)
-                    # 2016-08-31:     2 (allocation area size bumped to 1MB)
-      stats_num_field('bytes allocated',
-                      [(wordsize(32), 240071008, 5),
-                    # expected value: 2685858140 (x86/OS X)
-                    # expected:       360940756 (x86/Linux)
-                    # 2017-03-24:     240071008 (x86/Linux, 64-bit machine)
-                       (wordsize(64), 480098192, 5)]),
-                    # expected:   640067672 (amd64/Linux)
-                    # 2017-01-31: 480098192 work/wrap noinline things
+     [collect_stats('peak_megabytes_allocated',  0),
+      collect_stats('bytes allocated',5),
       only_ways(['normal'])
       ],
      compile_and_run,
@@ -197,9 +121,7 @@ test('T149',
      ['$MAKE -s --no-print-directory T149'])
 
 test('T5113',
-     [stats_num_field('bytes allocated',
-                      [(wordsize(32), 4000000, 5),
-                       (wordsize(64), 8000000, 5)]),
+     [collect_stats('bytes allocated',5),
       only_ways(['normal'])
       ],
      compile_and_run,
@@ -207,103 +129,54 @@ test('T5113',
 
 
 test('T4978',
-     [stats_num_field('bytes allocated',
-                      [(wordsize(32), 10000000, 5),
-                       (wordsize(64), 10137680, 5)]),
-                    # expected value: 10137680 (amd64/Linux)
+     [collect_stats('bytes allocated',5),
       only_ways(['normal'])
       ],
      compile_and_run,
      ['-O2'])
 
 test('T5205',
-     [stats_num_field('bytes allocated',
-                      [(wordsize(32), 49460, 5),
-                    # expected value: 47088 (x86/Darwin)
-                    # 2017-03-24:     49460 (x86/Linux, 64-bit machine)
-
-                       (platform('x86_64-unknown-mingw32'), 52264, 5),
-                    # 2016-12-14: 52264 (Separate out Windows results)
-
-                       (wordsize(64), 56208, 5)]),
-                    # expected value: 51320 (amd64/Linux)
-                    # 2014-07-17:     52600 (amd64/Linux) general round of updates
-                    # 2015-04-03: Widen 5->7% (amd64/Windows was doing better)
-                    # 2015-08-15: 50648 (Windows too good. avg of Windows&Linux)
-                    # 2015-10-30: 56208 (D757: Emit Typeable at definition site)
-                    # 2016-12-14: Narrow 7->5% (Separate out Windows results)
+     [collect_stats('bytes allocated',5),
       only_ways(['normal', 'optasm'])
       ],
      compile_and_run,
      [''])
 
 test('T5549',
-     [stats_num_field('bytes allocated',
-                      [(wordsize(32), 2896607976, 5),
-                    # expected value: 3362958676 (Windows)
-                    # 2014-12-01:     4096606332 (Windows) integer-gmp2
-                    # 2017-03-24:     2896607976 (x86/Linux, 64-bit machine)
-
-                       (wordsize(64), 5793140200, 5)]),
-                    # expected value: 6725846120 (amd64/Linux)
-                    #                 8193140752 (amd64/Linux) integer-gmp2
-                    #                 5793140200 (amd64/Linux) integer-gmp2
+     [collect_stats('bytes allocated',5),
       only_ways(['normal'])
       ],
      compile_and_run,
      ['-O'])
 
 test('T4474a',
-     [stats_num_field('bytes allocated',
-                      [(wordsize(32), 2405242767, 5),
-                       (wordsize(64), 4831890304, 5)]),
-                    # expected value: 4831890304 (amd64/OSX)
+     [collect_stats('bytes allocated',5),
       only_ways(['normal'])
       ],
      compile_and_run,
      ['-O'])
 test('T4474b',
-     [stats_num_field('bytes allocated',
-                      [(wordsize(32), 2405242767, 5),
-                       (wordsize(64), 4831890304, 5)]),
-                    # expected value: 4831890304 (amd64/OSX)
+     [collect_stats('bytes allocated',5),
       only_ways(['normal'])
       ],
      compile_and_run,
      ['-O'])
 test('T4474c',
-     [stats_num_field('bytes allocated',
-                      [(wordsize(32), 2405242767, 5),
-                       (wordsize(64), 4831890304, 5)]),
-                    # expected value: 4831890304 (amd64/OSX)
+     [collect_stats('bytes allocated',5),
       only_ways(['normal'])
       ],
      compile_and_run,
      ['-O'])
 
 test('T5237',
-     [stats_num_field('bytes allocated',
-                        [(platform('i386-unknown-mingw32'), 73280, 5),
-                         (wordsize(32), 78328, 5),
-                      # expected value: 78328 (i386/Linux)
-                         (wordsize(64), 104176, 5)]),
-                      # expected value: 110888 (amd64/Linux)
-                      # expected value: 104176 (amd64/Linux)
+     [collect_stats('bytes allocated',5),
      only_ways(['normal'])
      ],
     compile_and_run,
     ['-O ' + sse2_opts])
 
 test('T5536',
-     [stats_num_field('bytes allocated',
-                      [(wordsize(32), 446260520, 1),
-                                   # 1246287228 (i386/Linux)
-                                    # 446328556 (i386/Windows)
-                                    # 446192484 (i386/OSX)
-                       (wordsize(64), 892399040, 5)]),
-                   # expected value: 2492589480 (amd64/Linux)
-                   # 17/1/13:         892399040 (x86_64/Linux)
-                   #                  (new demand analyser)
+     [collect_stats('bytes allocated',1),
      extra_clean(['T5536.data']),
      ignore_stdout,
      only_ways(['normal'])
@@ -312,37 +185,14 @@ test('T5536',
     ['-O'])
 
 test('T7257',
-     [stats_num_field('bytes allocated',
-                      [(wordsize(32), 869850704, 10),
-                        # expected value: 1246287228 (i386/Linux)
-                        # 2016-04-06: 989850664 (i386/Linux) no idea what happened
-                        # 2017-03-25: 869850704 (x86/Linux, 64-bit machine) probably sizeExpr fix
-                       (wordsize(64), 1297293264, 5)]),
-                        # 2012-09-21: 1774893760 (amd64/Linux)
-                        # 2015-11-03: 1654893248 (amd64/Linux)
-                        # 2016-06-22: 1414893248 (amd64/Linux, sizeExpr fix)
-                        # 2018-06-22: 1297293264 (amd64/Linux, atomicModifyMutVar# replacement)
-      stats_num_field('peak_megabytes_allocated',
-                      [(wordsize(32), 217, 5),
-                        # 2012-10-08: 217 (x86/Linux)
-                       (wordsize(64), 227, 5)]),
-                        # 2012-09-21: 227 (amd64/Linux)
-
+     [collect_stats('bytes allocated',10),
+      collect_stats('peak_megabytes_allocated',5),
       only_ways(['normal'])
      ],
     compile_and_run, ['-O'])
 
 test('Conversions',
-     [stats_num_field('bytes allocated',
-                      [(wordsize(32), 76768, 3),
-                        # 2012-12-18: 55316 Guessed 64-bit value / 2
-                        # 2013-02-10: 77472 (x86/OSX)
-                        # 2013-02-10: 79276 (x86/Windows)
-                        # 2014-01-13: 76768 (x86/Linux) due to #8647
-                       (wordsize(64), 107544, 5)]),
-                        # 2012-12-18: 109608 (amd64/OS X)
-                        # 2014-07-17: 107544 (amd64/Linux)
-
+     [collect_stats('bytes allocated',3),
       only_ways(['normal'])
      ],
     compile_and_run, ['-O -msse2'])
@@ -351,29 +201,14 @@ test('T7507', omit_ways(['ghci']), compile_and_run, ['-O'])
 # For 7507, stack overflow is the bad case
 
 test('T7436',
-     [stats_num_field('max_bytes_used',
-          [(wordsize(64), 60360, 4),
-           #             127000 (amd64/Linux)
-           # 2013-02-07:  60360 (amd64/Linux)
-           # 2015-04-03: Widen 1->4% (amd64/Windows was doing better)
-           (wordsize(32), 42772, 4)]),
-           # 2013-02-10: 58032 (x86/Windows)
-           # 2013-02-10: 58836 (x86/OSX)
-           # 2017-03-24: 42772 (x86/Linux, 64-bit machine) no idea why
-           # 2017-04-02: Widen 1->4% (i386/Windows was doing better)
+     [collect_stats('max_bytes_used',4),
       only_ways(['normal'])
       ],
      compile_and_run,
      ['-O'])
 
 test('T7797',
-      [stats_num_field('bytes allocated',
-                      [(wordsize(32), 240044984, 5),
-                          # expected value: 2685858140 (x86/OS X)
-                          # expected: 360940756 (x86/Linux)
-                          # expected: 240044984 (x86/Windows, 64bit machine)
-                       (wordsize(64), 480050944, 5)]),
-                          # expected: 480050944 (amd64/Linux)
+      [collect_stats('bytes allocated',5),
       extra_clean(['T7797a.hi', 'T7797a.o']),
       only_ways(['normal'])
       ],
@@ -381,114 +216,62 @@ test('T7797',
      ['-O'])
 
 test('T7954',
-      [stats_num_field('bytes allocated',
-                      [(wordsize(32), 920045264, 10),
-              # some date:  1380051408    (64-bit Windows machine)
-              # 2014-04-04:  920045264    (64-bit Windows machine)
-                       (wordsize(64), 1280051632, 10)]),
-              # 2014-02-10: 1680051336 (x86_64/Linux), call arity analysis
-              # 2018-05-03: 1280051632 (x86_64/Linux), refactor numericEnumFrom
+      [collect_stats('bytes allocated',10),
       only_ways(['normal'])
       ],
      compile_and_run,
      ['-O'])
 
 test('T7850',
-     [stats_num_field('peak_megabytes_allocated',
-                      [(wordsize(32), 2, 10),
-                       (wordsize(64), 4, 10)]),
+     [collect_stats('peak_megabytes_allocated',10),
       only_ways(['normal'])],
      compile_and_run,
      ['-O'])
 
 test('T5949',
-     [stats_num_field('bytes allocated',
-                      [ (wordsize(32), 116020, 10),
-                        (wordsize(64), 201008, 10)]),
-                      # previously, it was >400000 bytes
+     [collect_stats('bytes allocated',10),
       only_ways(['normal'])],
      compile_and_run,
      ['-O'])
 
 test('T4267',
-     [stats_num_field('bytes allocated',
-                      [ (wordsize(32), 36012, 10)
-                      # 32-bit value close to 64 bit; c.f. T7619
-                      , (wordsize(64), 40992, 10) ]),
-                      # previously, it was >170000 bytes
-                      # 2014-01-17: 130000
-                      # 2014-02-10: 40992 (x86_64/Linux), call arity analysis
+     [collect_stats('bytes allocated',10),
       only_ways(['normal'])],
-     compile_and_run,
-     ['-O'])
+      compile_and_run,
+      ['-O'])
 
 test('T7619',
-     [stats_num_field('bytes allocated',
-                      [ (wordsize(32), 36012, 10)
-                      # 32-bit close to 64-bit value; most of this very
-                      # small number is standard start-up boilerplate I think
-                      , (wordsize(64), 40992, 10) ]),
-                      # previously, it was >400000 bytes
+     [collect_stats('bytes allocated',10),
       only_ways(['normal'])],
      compile_and_run,
      ['-O'])
 
 test('InlineArrayAlloc',
-     [stats_num_field('bytes allocated',
-                      [ (wordsize(32), 800040960, 5)
-                      , (wordsize(64), 1600040960, 5) ]),
+     [collect_stats('bytes allocated',5),
       only_ways(['normal'])],
      compile_and_run,
      ['-O2'])
 
 test('InlineByteArrayAlloc',
-     [stats_num_field('bytes allocated',
-                      [ (wordsize(32), 1360036012, 5)
-                      , (wordsize(64), 1440040960, 5) ]),
-         # 32 and 64 bit not so different, because
-         # we are allocating *byte* arrays
+     [collect_stats('bytes allocated',5),
       only_ways(['normal'])],
      compile_and_run,
      ['-O2'])
 
 test('InlineCloneArrayAlloc',
-     [stats_num_field('bytes allocated',
-                      [ (wordsize(32), 800041120, 5)
-                      , (wordsize(64), 1600041120, 5) ]),
+     [collect_stats('bytes allocated',5),
       only_ways(['normal'])],
      compile_and_run,
      ['-O2'])
 
 test('T9203',
-     [stats_num_field('bytes allocated',
-                      [ (wordsize(32), 77969268, 5)
-                      # was
-                      # 2016-04-06     84345136 (i386/Debian) not sure
-                      # 2017-03-24     77969268 (x86/Linux, 64-bit machine) probably join points
-
-                      , (wordsize(64), 98360576, 5) ]),
-                      # was            95747304
-                      # 2019-09-10     94547280 post-AMP cleanup
-                      # 2015-10-28     95451192 emit Typeable at definition site
-                      # 2016-12-19     84620888 Join points
-                      # 2018-07-30     98360576 it's unclear
+     [collect_stats('bytes allocated',5),
       only_ways(['normal'])],
      compile_and_run,
      ['-O2'])
 
 test('T9339',
-     [stats_num_field('bytes allocated',
-                      [ (wordsize(32),    46904, 5)
-                      # is this number correct? Seems very high.
-                      # 2017-03-24:       46904 (x86/Linux, 64-bit machine) who knows
-
-                      , (platform('x86_64-unknown-mingw32'), 47088, 7)
-                      # 2017-02-19                           47088 (x64/Windows) - Unknown
-
-                      , (wordsize(64),       50728, 5) ]),
-                      # w/o fusing last: 320005080
-                      # 2014-07-22:       80050760
-                      # 2016-08-17:          50728 Join points (#12988)
+     [collect_stats('bytes allocated',5),
       only_ways(['normal'])],
      compile_and_run,
      ['-O2 -fspec-constr-keen'])
@@ -496,64 +279,44 @@ test('T9339',
 
 
 test('T8472',
-     [stats_num_field('bytes allocated',
-                      [ (wordsize(32),    50000, 80)
-                      , (wordsize(64),    51424, 80) ]),
+     [collect_stats('bytes allocated',80),
       only_ways(['normal'])],
      compile_and_run,
      ['-O2'])
 
 test('T12996',
-     [stats_num_field('bytes allocated',
-                      [ (wordsize(64),    76776, 5) ]),
+     [collect_stats('bytes allocated',5),
       only_ways(['normal'])],
      compile_and_run,
      ['-O2'])
 
 test('T13001',
-     [stats_num_field('bytes allocated',
-                      [ (wordsize(32),    46728, 20)
-                      , (wordsize(64),    50600, 20) ]),
+     [collect_stats('bytes allocated',20),
       only_ways(['normal'])],
      compile_and_run,
      ['-O2'])
 
 test('T8763',
-     [stats_num_field('bytes allocated',
-                      [ (wordsize(64),    41056, 20) ]),
+     [collect_stats('bytes allocated', 20),
       only_ways(['normal'])],
      compile_and_run,
      ['-O2'])
 
 test('T12990',
-    [stats_num_field('bytes allocated',
-                     [ (wordsize(64), 20040936, 5) ]),
-                     # 2017-01-03     34440936  w/o inlining unsaturated
-                     #                          constructor wrappers
-                     # 2017-01-03     21640904 inline wrappers
-                     # 2017-01-31     20040936 work/wrap noinline things
+    [collect_stats('bytes allocated',5),
      only_ways(['normal'])],
     compile_and_run,
     ['-O2'])
 
 test('T13218',
-    [stats_num_field('bytes allocated',
-                     [ (wordsize(64), 82040056, 5) ]),
-                     # 8.1 with default <$  163644216
-                     # 8.1 with derived <$   82040056
-     stats_num_field('max_bytes_used',
-                     [ (wordsize(64), 359128, 10) ]),
-                     # 8.1 with default <$  64408248
-                     # 8.1 with derived <$    359128
+    [collect_stats('bytes allocated',5),
+     collect_stats('max_bytes_used',10),
      only_ways(['normal'])],
     compile_and_run,
     ['-O'])
 
 test('DeriveNull',
-    [stats_num_field('bytes allocated',
-                    [ (wordsize(64), 112050856, 5) ]),
-                    # 2017-04-01     152083704 w/o derived null
-                    # 2017-04-02     112050856 derive null
+    [collect_stats('bytes allocated',5),
      only_ways(['normal'])],
     compile_and_run,
     ['-O'])
@@ -561,66 +324,41 @@ test('DeriveNull',
 test('DeriveNullTermination', normal, compile_and_run, [''])
 
 test('T13623',
-    [stats_num_field('bytes allocated',
-                    [(platform('x86_64-unknown-mingw32'),   47232, 10),
-                    # 2017-12-24     47232 unknown
-                    (wordsize(64), 50936, 5)]),
-                    # 2017-05-02     50936 initial
+    [collect_stats('bytes allocated',10),
      only_ways(['normal'])],
     compile_and_run,
     ['-O2'])
 
 test('T14052',
-     [compiler_stats_num_field('bytes allocated',
-                      [ (wordsize(64), 2346183840, 15) ])],
+     [collect_compiler_stats('bytes allocated',15)],
      ghci_script,
      ['T14052.script'])
 
 test('T14936',
-     [stats_num_field('bytes allocated',
-                      [(platform('x86_64-unknown-mingw32'),   47536, 10),
-                       # 2018-05-04     47536 unknown
-                       (wordsize(64), 51792, 5) ])],
+     [collect_stats('bytes allocated',10)],
      compile_and_run,
      ['-O2'])
 
 test('T15226',
-    [stats_num_field('bytes allocated',
-                    [(platform('x86_64-unknown-mingw32'),   37488, 4),
-                    # 2018-09-23   37488  Linker changes
-                     (wordsize(64), 41040, 5) ]),
-                    # 2018-06-06   41040  Let the simplifier know the result
-                    #                     of seq# is in WHNF
-                    # initial  400041040
+    [collect_stats('bytes allocated',5),
      only_ways(['normal'])],
     compile_and_run,
     ['-O'])
 
 test('T15226a',
-    [stats_num_field('bytes allocated',
-                    [(platform('x86_64-unknown-mingw32'),   37488, 4),
-                    # 2018-09-23   37488  Linker changes
-                     (wordsize(64), 41040, 5) ]),
-                    # 2018-06-06   41040  Look through casts for seq#
-                    # initial  400041040
+    [collect_stats('bytes allocated',5),
      only_ways(['normal'])],
     compile_and_run,
     ['-O'])
 
 test('T15426',
-    [stats_num_field('bytes allocated',
-                    [ (wordsize(64), 41272, 20) ]),
-		    # 2018-08-10   41272  Change findIndices from INLINE to INLINABLE
-		    # initial  160041176
-     only_ways(['normal'])],
+    [collect_stats('bytes allocated', 20),
+    only_ways(['normal'])],
     compile_and_run,
     ['-O2'])
 
 test('T15578',
-    [stats_num_field('bytes allocated',
-                    [ (wordsize(64), 800041456, 5) ]),
-                    # 2018-09-07     800041456   Improvements from #15578
-                    # initial      42400041456
+    [collect_stats('bytes allocated', 5),
      only_ways(['normal'])],
     compile_and_run,
     ['-O2'])
diff --git a/testsuite/tests/perf/space_leaks/all.T b/testsuite/tests/perf/space_leaks/all.T
index a23796d532..1f69d12112 100644
--- a/testsuite/tests/perf/space_leaks/all.T
+++ b/testsuite/tests/perf/space_leaks/all.T
@@ -1,36 +1,10 @@
 
 test('space_leak_001',
-     # Before trac #2747 was fixed this was 565.
-     # Now it's: 3 (amd64/Linux)
-     #           4 (x86/OS X)
-     #           5 (x86/Linux)
-     [stats_num_field('peak_megabytes_allocated', (3, 1)),
-                        # 3 (amd64/Linux, integer-gmp2)
-      stats_num_field('max_bytes_used',
-          [(wordsize(64), 440000, 15),
-                        # 440224 (amd64/Linux)
-                        # 417016 (x86/OS X)
-                        # 415672 (x86/Windows)
-                        # 481456 (unreg amd64/Linux)
-           (wordsize(32), 428220, 10)]),
-             # 2013-02-10 372072 (x86/OSX)
-             # 2013-02-10 439228 (x86/OSX)
-             # 2016-04-06 361400 (x86/Linux)
-             # 2017-03-24 428220 (x86/Linux, 64-bit machine)
-
-      stats_num_field('bytes allocated', 
-           [ (wordsize(64), 11315747416, 5),
-                        # expected value: 9079316016 (amd64/Linux)
-                        #                 9331570416 (x86/Linux)
-                        #                 9329073952 (x86/OS X)
-                        #                 9327959840 (x86/Windows)
-                        #                 11315747416 (amd64/Lnx, integer-gmp2)
-
-             (wordsize(32), 661907800, 5),
-              # 2014-12-01  13550759068 (Windows)
-              # 2017-03-24  661907800   (x86/Linux, 64-bit machine) No idea
-              
-            ]),
+     # This could potentially be replaced with
+     # collect_stats('all',5) to test all 3 metrics with a
+     # 5% allowed deviation (max_bytes_used currently allows 15%).
+     [collect_stats(['peak_megabytes_allocated','bytes allocated'],5),
+      collect_stats('max_bytes_used',15),
       omit_ways(['profasm','profthreaded','threaded1','threaded2'])
       ],
      compile_and_run,
@@ -39,18 +13,14 @@ test('space_leak_001',
 test('T4334',
      # Test for a space leak in Data.List.lines (fixed with #4334)
      [extra_run_opts('1000000 2 t'),
-      stats_num_field('peak_megabytes_allocated', (2, 1)),
+      collect_stats('peak_megabytes_allocated',2),
       # prof ways don't work well with +RTS -V0
       omit_ways(['profasm','profthreaded'])
       ],
      compile_and_run, [''])
 
 test('T2762',
-     [stats_num_field('peak_megabytes_allocated', (2, 0)),
-      # peak_megabytes_allocated is 2 with 7.0.2.
-      # Was 57 with 6.12.3.
-      # 2016-08-31:     3 (allocation area size bumped to 1MB)
-      # 2017-02-22:     2 (refactor fiBind)
+     [collect_stats('peak_megabytes_allocated',2),
       only_ways(['normal']),
       extra_run_opts('+RTS -G1 -RTS' ),
       extra_clean(['T2762A.hi', 'T2762A.o'])],
@@ -61,36 +31,9 @@ test('T4018',
      compile_and_run, ['-fno-state-hack'])
 
 test('T4029',
-     [stats_num_field('peak_megabytes_allocated',
-          [(wordsize(64), 65, 10)]),
-            # 2016-02-26: 66 (amd64/Linux)           INITIAL
-            # 2016-05-23: 82 (amd64/Linux)           Use -G1
-            # 2016-07-13: 92 (amd64/Linux)           Changes to tidyType
-            # 2016-09-01: 71 (amd64/Linux)           Restore w/w limit (#11565)
-            # 2017-02-12: 80 (amd64/Linux)           Type-indexed Typeable
-            # 2017-02-20: 76 (amd64/Linux)           Better reading of iface files
-            # 2017-03-03: 65 (amd64/Linux)           Share Typeable KindReps or more
-            #                                        lazy interface file reading
-      stats_num_field('max_bytes_used',
-          [(wordsize(64), 18208944, 15)]),
-            # 2016-02-26: 24071720 (amd64/Linux)     INITIAL
-            # 2016-04-21: 25542832 (amd64/Linux)
-            # 2016-05-23: 25247216 (amd64/Linux)     Use -G1
-            # 2016-07-13: 27575416 (amd64/Linux)     Changes to tidyType
-            # 2016-07-20: 22920616 (amd64/Linux)     Fix laziness of instance matching
-            # 2016-09-01: 21648488 (amd64/Linux)     Restore w/w limit (#11565)
-            # 2016-10-13: 20325248 (amd64/Linux)     Creep (downwards, yay!)
-            # 2016-11-14: 21387048 (amd64/Linux)     Creep back upwards :(
-            # 2017-01-18: 21670448 (amd64/Linux)     Float string literals to toplevel
-            # 2017-02-07: 22770352 (amd64/Linux)     It is unclear
-            # 2017-02-12: 24151096 (amd64/Linux)     Type-indexed Typeable
-            # 2017-02-20: 22016200 (amd64/Linux)     Better reading of iface files
-            # 2017-03-03: 19172360 (amd64/Linux)     Share Typeable KindReps or more
-            #                                        lazy interface file reading
-            # 2017-03-07: 20476360 (amd64/Linux)     It's not entirely clear
-            # 2017-03-14: 18208944 (amd64/Darwin)    Again, not clear
-            # 2017-03-15: bumped margin to 15% due to instability
+     [collect_stats(['peak_megabytes_allocated','max_bytes_used'],10),
       extra_hc_opts('+RTS -G1 -RTS' ),
       ],
      ghci_script,
      ['T4029.script'])
+
diff --git a/testsuite/tests/pmcheck/should_compile/all.T b/testsuite/tests/pmcheck/should_compile/all.T
index 079978b5f5..393ce92463 100644
--- a/testsuite/tests/pmcheck/should_compile/all.T
+++ b/testsuite/tests/pmcheck/should_compile/all.T
@@ -36,26 +36,19 @@ test('T9951b', [], compile,
      ['-fwarn-incomplete-patterns -fwarn-overlapping-patterns'])
 test('T9951', [], compile,
      ['-fwarn-incomplete-patterns -fwarn-overlapping-patterns'])
-test('T11303', normal, compile, ['-fwarn-incomplete-patterns -fwarn-overlapping-patterns +RTS -M1G -RTS'])
-test('T11276', compiler_stats_num_field('bytes allocated',
-  [(wordsize(64), 165890392, 10)]
-    # 2018-07-14: 165890392   INITIAL
-  ), compile, ['-fwarn-incomplete-patterns -fwarn-overlapping-patterns +RTS -M1G -RTS'])
+test('T11303', normal, compile,
+     ['-fwarn-incomplete-patterns -fwarn-overlapping-patterns +RTS -M1G -RTS'])
+test('T11276', collect_compiler_stats('bytes allocated',10), compile,
+     ['-fwarn-incomplete-patterns -fwarn-overlapping-patterns +RTS -M1G -RTS'])
 
-test('T11303b', compiler_stats_num_field('bytes allocated',
-  [(wordsize(64), 54373936, 10)]
-    # 2018-07-14: 54373936    INITIAL
-  ), compile, ['-fwarn-incomplete-patterns -fwarn-overlapping-patterns +RTS -M1G -RTS'])
+test('T11303b', collect_compiler_stats('bytes allocated',10), compile,
+     ['-fwarn-incomplete-patterns -fwarn-overlapping-patterns +RTS -M1G -RTS'])
 
-test('T11374', compiler_stats_num_field('bytes allocated',
-  [(wordsize(64), 280144864, 10)]
-    # 2018-07-14: 280144864   INITIAL
-  ), compile, ['-fwarn-incomplete-patterns -fwarn-overlapping-patterns +RTS -M1G -RTS'])
+test('T11374', collect_compiler_stats('bytes allocated',10), compile,
+     ['-fwarn-incomplete-patterns -fwarn-overlapping-patterns +RTS -M1G -RTS'])
 
-test('T11195', compiler_stats_num_field('bytes allocated',
-  [(wordsize(64), 7852567480, 10)]
-    # 2018-07-14: 7852567480   INITIAL
-  ), compile, ['-package ghc -fwarn-incomplete-patterns -fwarn-overlapping-patterns +RTS -M2G -RTS'])
+test('T11195', collect_compiler_stats('bytes allocated',10), compile,
+     ['-package ghc -fwarn-incomplete-patterns -fwarn-overlapping-patterns +RTS -M2G -RTS'])
 
 test('T11984', normal, compile,
     ['-fwarn-incomplete-patterns -fwarn-overlapping-patterns'])
diff --git a/testsuite/tests/primops/should_run/all.T b/testsuite/tests/primops/should_run/all.T
index ecf995bea8..c7cdd348bf 100644
--- a/testsuite/tests/primops/should_run/all.T
+++ b/testsuite/tests/primops/should_run/all.T
@@ -7,11 +7,7 @@ test('T4442',
      compile_and_run, [''])
 test('T10481', exit_code(1), compile_and_run, [''])
 test('T10678',
-     [stats_num_field('bytes allocated',
-                      [(wordsize(64), 64004171, 5)
-                       # 2015-11-04: 88041768 +/- 5%  (before runRW#)
-                       # 2015-11-04: 64004171         (after runRW#)
-                      ]),
+     [ collect_stats('bytes allocated',5),
       only_ways('normal')
      ],
      compile_and_run, ['-O'])
diff --git a/testsuite/tests/simplCore/should_compile/all.T b/testsuite/tests/simplCore/should_compile/all.T
index d6b9aa0cf9..1f6ef0059f 100644
--- a/testsuite/tests/simplCore/should_compile/all.T
+++ b/testsuite/tests/simplCore/should_compile/all.T
@@ -152,8 +152,7 @@ test('T7702',
       # we say 18mb peak allocated +/- 70% because other compiler flags have
       # a large effect on allocation which is hard to separate from the
       # allocation done by the plugin... but a regression allocates > 90mb
-      compiler_stats_num_field('peak_megabytes_allocated',
-          [(wordsize(32), 18, 70), (wordsize(64), 18, 70)])
+      collect_compiler_stats('peak_megabytes_allocated',70),
      ],
      compile,
      ['-v0 -package-db T7702plugin/pkg.T7702/local.package.conf -fplugin T7702Plugin -package T7702plugin ' + config.plugin_way_flags])
diff --git a/testsuite/tests/simplStg/should_run/all.T b/testsuite/tests/simplStg/should_run/all.T
index d3aa9376ee..2f7c69f5db 100644
--- a/testsuite/tests/simplStg/should_run/all.T
+++ b/testsuite/tests/simplStg/should_run/all.T
@@ -13,10 +13,9 @@ test('T9291', normal, compile_and_run, [''])
 test('T13536', normal, compile_and_run, [''])
 
 test('T13536a',
-    [stats_num_field('bytes allocated',
-                    [ (wordsize(64), 86664, 5) ]),
-                    # 2017-04-10     86664 -- 25769889696 if broken
+        [ collect_stats('bytes allocated',5),
      only_ways(['optasm'])],
     compile_and_run,
     [''])
 
+
author	David Eichmann <davide@Well-Typed.com>	2018-11-07 12:02:47 -0500
committer	Ben Gamari <ben@smart-cactus.org>	2018-11-07 12:07:11 -0500
commit	932cd41d8c7984c767c1b3b58e05146f69cc5c15 (patch)
tree	41e77f048036a19100c5bee508c77b2ab8ec55d4
parent	82a5c2410a47b16df09039b9786c2c0e34ba130e (diff)
download	haskell-932cd41d8c7984c767c1b3b58e05146f69cc5c15.tar.gz