path: root/buildscripts/perf_regression_check.py
blob: 2dfce87caa0db6bc15e50f0dd2759f0dd38ac222 (plain)
import argparse
import json
import sys
import itertools
from dateutil import parser
from datetime import timedelta, datetime

# Example usage:
# perf_regression_check.py -f history_file.json --rev 18808cd923789a34abd7f13d62e7a73fafd5ce5f
# Loads the history json file, and looks for regressions at the revision 18808cd...
# Will exit with status code 1 if any regression is found, 0 otherwise.
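#
# For orientation, a minimal sketch of the history JSON this script expects, inferred
# from the fields read below. Test names, thread levels and numbers are illustrative:
#
# [
#   {
#     "order": 1, "revision": "18808cd923789a34abd7f13d62e7a73fafd5ce5f",
#     "tag": "3.1.8-baseline", "create_time": "2015-09-02T20:14:51Z",
#     "data": {
#       "results": [
#         {
#           "name": "insert_vector",
#           "results": {
#             "8":  {"ops_per_sec": 1234.5, "ops_per_sec_values": [1230.1, 1240.2, 1233.2]},
#             "16": {"ops_per_sec": 2345.6, "ops_per_sec_values": [2340.0, 2350.1, 2346.7]}
#           }
#         }
#       ]
#     }
#   }
# ]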

def compareOneResultNoise(this_one, reference, label, threadlevel="max", noiseLevel=0,
                          noiseMultiple=1, minThreshold=0.05):
    '''
    Compare a result to a reference result and check whether it is acceptable.
    Return True if the comparison failed (regression), False if it passed.
    Uses historical noise data for the comparison.
    '''
    failed = False
    if not reference:
        return failed

    ref = ""
    current = ""
    noise = 0

    if threadlevel == "max":
        ref = reference["max"]
        current = this_one["max"]
    else:
        # Don't do a comparison if the thread data is missing
        if threadlevel not in reference["results"]:
            return failed
        ref = reference["results"][threadlevel]['ops_per_sec']
        current = this_one["results"][threadlevel]['ops_per_sec']

    noise = noiseLevel * noiseMultiple
    delta = minThreshold * ref
    if (delta < noise):
        delta = noise
    # Do the check
    if ref - current >= delta:
        print ("\tregression found on %s: drop from %.2f ops/sec (commit %s) to %.2f ops/sec for comparison %s. Diff is"
               " %.2f ops/sec (%.2f%%), noise level is %.2f ops/sec and multiple is %.2f" %
               (threadlevel, ref, reference["revision"][:5], current, label, ref - current,
                100*(ref-current)/ref, noiseLevel, noiseMultiple))
        failed = True
    return failed
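
# Worked example of the rule above (illustrative numbers): with ref = 1000 ops/sec and
# minThreshold = 0.05, delta starts at 50 ops/sec; if noiseLevel = 30 and noiseMultiple = 2,
# the noise allowance of 60 ops/sec is larger, so a regression is only flagged when the
# current throughput drops by at least 60 ops/sec (to 940 or below).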


def compareResults(this_one, reference, threshold, label, noiseLevels={}, noiseMultiple=1, threadThreshold=None, threadNoiseMultiple=None):
    '''
    Compare a result to a reference result across the max and per-thread throughput.
    Return True if any comparison failed (regression), False if all passed.
    '''

    failed = False
    if not reference:
        return failed
    # Default threadThreshold to the same as the max threshold
    if not threadThreshold:
        threadThreshold = threshold
    if not threadNoiseMultiple:
        threadNoiseMultiple = noiseMultiple

    # Check max throughput first
    noise = 0
    # For the max throughput, use the max noise across the thread levels as the noise parameter
    if len(noiseLevels.values()) > 0:
        noise = max(noiseLevels.values())
    if compareOneResultNoise(this_one, reference, label, "max", noiseLevel=noise,
                             noiseMultiple=noiseMultiple, minThreshold=threshold):
        failed = True
    # Check for regression on threading levels
    for level in (r for r in this_one["results"]
                  if isinstance(this_one["results"][r], dict)):
        noise = 0
        if level in noiseLevels:
            noise = noiseLevels[level]
        if compareOneResultNoise(this_one, reference, label, level, noiseLevel=noise,
                                 noiseMultiple=threadNoiseMultiple, minThreshold=threadThreshold):
            failed = True
    if not failed:
        print "\tno regression against %s and githash %s" %(label, reference["revision"][:5])
    return failed



def main(args):
    parser = argparse.ArgumentParser()
    parser.add_argument("-f", "--file", dest="file", help="path to json file containing"
                        "history data")
    parser.add_argument("-t", "--tagFile", dest="tfile", help="path to json file containing"
                        "tag data")
    parser.add_argument("--rev", dest="rev", help="revision to examine for regressions")
    parser.add_argument("--ndays", default=7, type=int, dest="ndays", help="Check against"
                        "commit from n days ago.")
    parser.add_argument("--threshold", default=0.05, type=float, dest="threshold", help=
                        "Don't flag an error if throughput is less than 'threshold'x100 percent off")
    parser.add_argument("--noiseLevel", default=1, type=float, dest="noise", help=
                        "Don't flag an error if throughput is less than 'noise' times the computed noise level off")
    parser.add_argument("--threadThreshold", default=0.1, type=float, dest="threadThreshold", help=
                        "Don't flag an error if thread level throughput is more than"
                        "'threadThreshold'x100 percent off")
    parser.add_argument("--threadNoiseLevel", default=2, type=float, dest="threadNoise", help=
                        "Don't flag an error if thread level throughput is less than 'noise' times the computed noise level off")
    parser.add_argument("--refTag", dest="reference", help=
                        "Reference tag to compare against. Should be a valid tag name")
    parser.add_argument("--overrideFile", dest="overrideFile", help="File to read for comparison override information")
    parser.add_argument("--variant", dest="variant", help="Variant to lookup in the override file")

    args = parser.parse_args(args)
    tagHistory = None
    j = get_json(args.file)
    if args.tfile:
        t = get_json(args.tfile)
        tagHistory = History(t)
    history = History(j)
    testnames = history.testnames()
    failed = 0

    results = []
    # Default empty override structure
    overrides = {'ndays': {}, 'reference': {}}
    if args.overrideFile:
        # Read the overrides file
        foverrides = get_json(args.overrideFile)
        # Is this variant in the overrides file?
        if args.variant in foverrides:
            overrides = foverrides[args.variant]

    for test in testnames:
        # The first entry is valid. The rest is dummy data to match the existing format
        result = {'test_file' : test, 'exit_code' : 0, 'elapsed' : 5, 'start': 1441227291.962453, 'end': 1441227293.428761}
        this_one = history.seriesAtRevision(test, args.rev)
        testFailed = False
        print "checking %s.." % (test)
        if not this_one:
            print "\tno data at this revision, skipping"
            continue

        # If the new build is 10% lower than the target (3.0 will be
        # used as the baseline for 3.2 for instance), consider it
        # regressed.
        previous = history.seriesItemsNBefore(test, args.rev, 1)
        if not previous:
            print "\tno previous data, skipping"
            continue
        if compareResults(this_one, previous[0], args.threshold, "Previous", history.noiseLevels(test),
                          args.noise, args.threadThreshold, args.threadNoise):
            testFailed = True
            result['PreviousCompare'] = 'fail'
        else:
            result['PreviousCompare'] = 'pass'

        daysprevious = history.seriesItemsNDaysBefore(test, args.rev, args.ndays)
        if test in overrides['ndays']:
            print "Override in ndays for test %s" % test
            daysprevious = overrides['ndays'][test]
        if compareResults(this_one, daysprevious, args.threshold, "NDays", history.noiseLevels(test),
                          args.noise, args.threadThreshold, args.threadNoise):
            testFailed = True
            result['NDayCompare'] = 'fail'
        else:
            result['NDayCompare'] = 'pass'
        if tagHistory:
            reference = tagHistory.seriesAtTag(test, args.reference)
            if not reference:
                print "Didn't get any data for test %s with baseline %s" % (test, args.reference)
            if test in overrides['reference']:
                print "Override in references for test %s" % test
                reference = overrides['reference'][test]
            if compareResults(this_one, reference, args.threshold, "Baseline Comparison " + args.reference,
                              history.noiseLevels(test), args.noise, args.threadThreshold, args.threadNoise):
                testFailed = True
                result['BaselineCompare'] = 'fail'
            else:
                result['BaselineCompare'] = 'pass'
        if testFailed:
            result['status'] = 'fail'
            failed += 1
        else:
            result['status'] = 'pass'
        results.append(result)
        
    report = {}
    report['failures'] = failed
    report['results'] = results
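
    # An illustrative sketch of the report.json written below; test names and numbers are
    # made up, keys match the fields populated above:
    #
    # {
    #   "failures": 1,
    #   "results": [
    #     {"test_file": "insert_vector", "exit_code": 0, "elapsed": 5,
    #      "start": 1441227291.962453, "end": 1441227293.428761,
    #      "PreviousCompare": "pass", "NDayCompare": "fail", "status": "fail"}
    #   ]
    # }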

    with open('report.json', 'w') as reportFile:
        json.dump(report, reportFile, indent=4, separators=(',', ': '))
    if failed > 0:
        sys.exit(1)
    else:
        sys.exit(0)

# We wouldn't need this function if we had numpy installed on the system
def computeRange(result_list):
    '''
    Compute the max, min, and range (max - min) for the result list
    '''
    lowest = highest = result_list[0]
    for result in result_list:
        if result < lowest:
            lowest = result
        if result > highest:
            highest = result
    return (highest, lowest, highest - lowest)

def get_json(filename):
    with open(filename, 'r') as jf:
        return json.load(jf)

class History(object):
    def __init__(self, jsonobj):
        self._raw = sorted(jsonobj, key=lambda d: d["order"])
        self._noise = None

    def testnames(self):
        return set(list(itertools.chain.from_iterable([[z["name"] for z in c["data"]["results"]]
                                                       for c in self._raw])))

    def seriesAtRevision(self, testname, revision):
        s = self.series(testname)
        for result in s:
            if result["revision"] == revision:
                return result
        return None

    def seriesAtTag(self, testname, tagName):
        s = self.series(testname)
        for result in s:
            if result["tag"] == tagName:
                return result
        return None

    def seriesItemsNBefore(self, testname, revision, n):
        """
            Returns the 'n' items in the series under the given test name that
            appear prior to the specified revision.
        """
        results = []
        found = False
        s = self.series(testname)
        for result in s:
            if result["revision"] == revision:
                found = True
                break
            results.append(result)

        if found:
            return results[-n:]
        return []

    def computeNoiseLevels(self):
        """
        For each test, go through all results and compute the average
        noise (max - min of the sampled ops_per_sec values) per thread level.
        """
        self._noise = {}
        for test in self.testnames():
            self._noise[test] = {}
            # Determine the thread levels from the first result in the series.
            # There is probably a better way to do this.
            threads = []
            for result in self.series(test):
                threads = result["threads"]
                break

            for thread in threads:
                ranges = [computeRange(x["results"][thread]["ops_per_sec_values"])[2]
                          for x in self.series(test)]
                self._noise[test][thread] = sum(ranges) / len(ranges)
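
    # Illustrative example of the noise computation above (made-up numbers): if the
    # "16"-thread ops_per_sec_values ranges (max - min) across three commits are
    # 12.0, 8.0 and 10.0 ops/sec, the stored noise level for that thread is 10.0.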


    def noiseLevels(self, testname):
        """
        Return the average noise levels of the given test as a dict keyed
        by thread level.
        """
        # Check if noise has been computed. Compute it if it hasn't.
        if not self._noise:
            print "Computing noise levels"
            self.computeNoiseLevels()
        # Look up the noise values for the test
        if testname not in self._noise:
            print "Test %s not in self._noise" % (testname)
            return {}
        return self._noise[testname]


    def seriesItemsNDaysBefore(self, testname, revision, n):
        """
            Returns the items in the series under the given test name that
            appear 'n' days prior to the specified revision.
        """
        results = {}
        # Date for this revision
        s = self.seriesAtRevision(testname, revision)
        if s==[]:
            return []
        refdate = parser.parse(s["create_time"]) - timedelta(days=n)

        s = self.series(testname)
        for result in s:
            if parser.parse(result["create_time"]) < refdate:
                results = result
        return results



    def series(self, testname):
        for commit in self._raw:
            # Find the sample whose name matches the given testname
            matching = [x for x in commit["data"]["results"] if x["name"] == testname]
            if matching:
                result = matching[0]
                result["revision"] = commit["revision"]
                result["tag"] = commit["tag"]
                result["create_time"] = commit["create_time"]
                result["order"] = commit["order"]
                result["max"] = max(f["ops_per_sec"] for f in result["results"].values()
                                    if isinstance(f, dict))
                result["threads"] = [f for f in result["results"]
                                     if isinstance(result["results"][f], dict)]
                yield result


class TestResult:
    def __init__(self, json):
        self._raw = json

    #def max(self):

if __name__ == '__main__':
    main(sys.argv[1:])