path: root/buildscripts/resmokelib/testing/hooks/combine_benchmark_results.py
blob: 5df433633303bf4f3ae575b060b208e3a8856d9d
"""Module for generating the test results file fed into the perf plugin."""

from __future__ import absolute_import
from __future__ import division

import collections
import datetime
import json

from buildscripts.resmokelib import config as _config
from buildscripts.resmokelib.testing.hooks import interface


class CombineBenchmarkResults(interface.Hook):
    """
    The CombineBenchmarkResults hook combines test results from
    individual benchmark files to a single file. This is useful for
    generating the json file to feed into the Evergreen performance
    visualization plugin.
    """

    DESCRIPTION = "Combine JSON results from individual benchmarks"

    def __init__(self, hook_logger, fixture):
        interface.Hook.__init__(self, hook_logger, fixture, CombineBenchmarkResults.DESCRIPTION)
        self.report_file = _config.PERF_REPORT_FILE

        # Reports grouped by benchmark name, without the thread count.
        self.benchmark_reports = {}

        self.create_time = None
        self.end_time = None

    @staticmethod
    def _strftime(time):
        return time.strftime("%Y-%m-%dT%H:%M:%SZ")

    def after_test(self, test_case, test_report):
        if self.report_file is None:
            return

        bm_report_path = test_case.report_name()

        with open(bm_report_path, "r") as report_file:
            report_dict = json.load(report_file)
            self._parse_report(report_dict)

    def before_suite(self, test_report):
        self.create_time = datetime.datetime.now()

    def after_suite(self, test_report):
        if self.report_file is None:
            return

        self.end_time = datetime.datetime.now()
        report = self._generate_perf_plugin_report()
        with open(self.report_file, "w") as f:
            json.dump(report, f)

    def _generate_perf_plugin_report(self):
        """Format the data to look like a perf plugin report."""
        perf_report = {
            "start": self._strftime(self.create_time),
            "end": self._strftime(self.end_time),
            "errors": [],  # There are no errors if we have gotten this far.
            "results": []
        }

        for name, report in self.benchmark_reports.items():
            test_report = {
                "name": name,
                "context": report.context._asdict(),
                "results": report.generate_perf_plugin_dict()
            }

            perf_report["results"].append(test_report)

        return perf_report
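    # The combined report produced by _generate_perf_plugin_report() has roughly
    # this shape (illustrative values, not from a real run):
    # {
    #   "start": "2015-03-17T18:40:25Z",
    #   "end": "2015-03-17T18:45:00Z",
    #   "errors": [],
    #   "results": [
    #     {
    #       "name": "BM_SetInsert/arg name:1024",
    #       "context": {"date": "...", "num_cpus": 40, ...},
    #       "results": {"10": {"error_values": [0], "ops_per_sec": -33355.0,
    #                          "ops_per_sec_values": [-33355]}}
    #     }
    #   ]
    # }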

    def _parse_report(self, report_dict):
        context = report_dict["context"]

        for benchmark_res in report_dict["benchmarks"]:
            bm_name_obj = _BenchmarkThreadsReport.parse_bm_name(benchmark_res["name"])

            # Don't show Benchmark's included statistics to prevent cluttering up the graph.
            if bm_name_obj.statistic_type is not None:
                continue

            if bm_name_obj.base_name not in self.benchmark_reports:
                self.benchmark_reports[bm_name_obj.base_name] = _BenchmarkThreadsReport(context)
            self.benchmark_reports[bm_name_obj.base_name].add_report(bm_name_obj, benchmark_res)


# Capture information from a Benchmark name in a logical format.
_BenchmarkName = collections.namedtuple("_BenchmarkName", [
    "base_name",
    "thread_count",
    "statistic_type"
])
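# For example (illustrative, derived from parse_bm_name below), a name like
# "BM_SetInsert/threads:10_mean" is captured as
# _BenchmarkName(base_name="BM_SetInsert", thread_count="10", statistic_type="mean").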


class _BenchmarkThreadsReport(object):
    """
    Class representation of a report for all thread levels of a single
    benchmark test. Each report is designed to correspond to one graph
    in the Evergreen perf plugin.

    A raw Benchmark report looks like the following:
    {
      "context": {
        "date": "2015/03/17-18:40:25",
        "num_cpus": 40,
        "mhz_per_cpu": 2801,
        "cpu_scaling_enabled": false,
        "library_build_type": "debug"
      },
      "benchmarks": [
        {
          "name": "BM_SetInsert/arg name:1024/threads:10",
          "iterations": 21393,
          "real_time": 32724,
          "cpu_time": 33355,
          "bytes_per_second": 1199226,
          "items_per_second": 299807
        }
      ]
    }
    """
    CONTEXT_FIELDS = [
        "date", "cpu_scaling_enabled", "num_cpus", "mhz_per_cpu", "library_build_type"
    ]
    Context = collections.namedtuple("Context", CONTEXT_FIELDS)

    def __init__(self, context_dict):
        self.context = self.Context(**context_dict)

        # Map of thread count to the list of benchmark runs at that thread level.
        self.thread_benchmark_map = collections.defaultdict(list)

    def add_report(self, bm_name_obj, report):
        self.thread_benchmark_map[bm_name_obj.thread_count].append(report)

    def generate_perf_plugin_dict(self):
        """
        Generate perf plugin data points of the following format:

        "1": {
          "error_values": [
            0,
            0,
            0
          ],
          "ops_per_sec": 9552.108279243452,
          "ops_per_sec_values": [
            9574.812658450564,
            9522.642340821469,
            9536.252775275878
          ]
        },
        """

        res = {}
        for thread_count, reports in self.thread_benchmark_map.items():
            thread_report = {
                "error_values": [0 for _ in range(len(reports))],
                "ops_per_sec_values": []  # This is actually storing latency per op, not ops/s
            }

            # Take the negative of the latency numbers to preserve the higher-is-better semantics.
            for report in reports:
                thread_report["ops_per_sec_values"].append(-1 * report["cpu_time"])
            thread_report["ops_per_sec"] = sum(thread_report["ops_per_sec_values"]) / len(reports)

            res[thread_count] = thread_report

        return res

    @staticmethod
    def parse_bm_name(name_str):
        """
        Split the benchmark name into base_name, thread_count and statistic_type.

        The base name is the benchmark name minus the thread count and any statistics.
        Test cases in the same group will be shown on a single perf graph.

        name_str looks like one of the following:
        "BM_SetInsert/arg name:1024/threads:10_mean"
        "BM_SetInsert/arg 1/arg 2"
        "BM_SetInsert_mean"
        """

        base_name = None
        thread_count = None
        statistic_type = None

        # Step 1: get the statistic type.
        if name_str.count("_") == 2:  # The name ends with a statistic suffix (e.g. "_mean").
            statistic_type = name_str.rsplit("_", 1)[-1]
            # Remove the statistic type suffix from the name.
            name_str = name_str[:-len(statistic_type) - 1]

        # Step 2: Get the thread count and name.
        thread_section = name_str.rsplit("/", 1)[-1]
        if thread_section.startswith("threads:"):
            base_name = name_str.rsplit("/", 1)[0]
            thread_count = thread_section.split(":")[-1]
        else:  # There is no explicit thread count, so the thread count is 1.
            thread_count = "1"
            base_name = name_str

        return _BenchmarkName(base_name, thread_count, statistic_type)
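

if __name__ == "__main__":
    # Minimal illustrative sketch, not part of the hook's normal operation. It
    # assumes the module is importable (e.g. run via "python -m" from the repo
    # root). It parses one raw Benchmark entry, using the values from the
    # docstring example above, and prints the resulting perf plugin dict.
    _example_context = {
        "date": "2015/03/17-18:40:25",
        "cpu_scaling_enabled": False,
        "num_cpus": 40,
        "mhz_per_cpu": 2801,
        "library_build_type": "debug",
    }
    _threads_report = _BenchmarkThreadsReport(_example_context)
    _bm_name = _BenchmarkThreadsReport.parse_bm_name("BM_SetInsert/arg name:1024/threads:10")
    _threads_report.add_report(_bm_name, {"cpu_time": 33355})
    print(json.dumps(_threads_report.generate_perf_plugin_dict(), indent=2))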