chromium/tools/cygprofile/process_profiles.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436

#!/usr/bin/env vpython
# Copyright 2017 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Lists all the reached symbols from an instrumentation dump."""

import argparse
import logging
import operator
import os
import sys

# _SRC_PATH is the directory two levels above the one containing this file.
_SRC_PATH = os.path.abspath(os.path.join(
    os.path.dirname(__file__), os.pardir, os.pardir))
# Append <_SRC_PATH>/tools/cygprofile to the module search path so that the
# symbol_extractor import that follows can be resolved.
path = os.path.join(_SRC_PATH, 'tools', 'cygprofile')
sys.path.append(path)
import symbol_extractor


def _Median(items):
  if not items:
    return None
  sorted_items = sorted(items)
  if len(sorted_items) & 1:
    return sorted_items[len(sorted_items)/2]
  else:
    return (sorted_items[len(sorted_items)/2 - 1] +
            sorted_items[len(sorted_items)/2]) / 2


class SymbolOffsetProcessor(object):
  """Utility for processing symbols in binaries.

  This class is used to translate between general offsets into a binary and the
  starting offset of symbols in the binary. Because later phases in orderfile
  generation have complicated strategies for resolving multiple symbols that map
  to the same binary offset, this class is concerned with locating a symbol
  containing a binary offset. If such a symbol exists, the start offset will be
  unique, even when there are multiple symbol names at the same location in the
  binary.

  In the function names below, "dump" is used to refer to arbitrary offsets in a
  binary (eg, from a profiling run), while "offset" refers to a symbol
  offset. The dump offsets are relative to the start of text, as returned by
  lightweight_cygprofile.cc.

  This class manages expensive operations like extracting symbols, so that
  higher-level operations can be done in different orders without the caller
  managing all the state.
  """

  def __init__(self, binary_filename):
    """Creates a processor for a binary.

    No symbols are extracted here; each map is computed lazily on first use.

    Args:
      binary_filename: (str) Path of the binary to extract symbols from.
    """
    self._binary_filename = binary_filename
    # Lazily-populated caches; None means "not computed yet".
    self._symbol_infos = None
    self._name_to_symbol = None
    self._offset_to_primary = None
    self._offset_to_symbols = None

  def SymbolInfos(self):
    """The symbols associated with this processor's binary.

    The symbols are ordered by offset.

    Returns:
      [symbol_extractor.SymbolInfo]
    """
    if self._symbol_infos is None:
      self._symbol_infos = symbol_extractor.SymbolInfosFromBinary(
          self._binary_filename)
      self._symbol_infos.sort(key=lambda s: s.offset)
      logging.info('%d symbols from %s',
                   len(self._symbol_infos), self._binary_filename)
    return self._symbol_infos

  def NameToSymbolMap(self):
    """Map symbol names to their full information.

    If several symbols share a name, the last one in SymbolInfos() order wins.

    Returns:
      {symbol name (str): symbol_extractor.SymbolInfo}
    """
    if self._name_to_symbol is None:
      self._name_to_symbol = {s.name: s for s in self.SymbolInfos()}
    return self._name_to_symbol

  def OffsetToPrimaryMap(self):
    """The map of a symbol offset in this binary to its primary symbol.

    Several symbols can be aliased to the same address, through ICF. This
    returns the first one. The order is consistent for a given binary, as it's
    derived from the file layout. We assert that all aliased symbols are the
    same size, except that zero-sized symbols may alias nonzero-sized ones.

    Returns:
      {offset (int): primary (symbol_extractor.SymbolInfo)}
    """
    if self._offset_to_primary is None:
      self._offset_to_primary = {}
      for s in self.SymbolInfos():
        if s.offset not in self._offset_to_primary:
          self._offset_to_primary[s.offset] = s
        else:
          curr = self._offset_to_primary[s.offset]
          if curr.size != s.size:
            assert curr.size == 0 or s.size == 0, (
                'Nonzero size mismatch between {} and {}'.format(
                    curr.name, s.name))
            # Upgrade to a symbol with nonzero size, otherwise don't change
            # anything so that we use the earliest nonzero-size symbol.
            if curr.size == 0 and s.size != 0:
              self._offset_to_primary[s.offset] = s

    return self._offset_to_primary

  def OffsetToSymbolsMap(self):
    """Map offsets to the set of matching symbols.

    Unlike OffsetToPrimaryMap, this is a 1-to-many mapping.

    Returns:
      {offset (int): [symbol_extractor.SymbolInfo]}
    """
    if self._offset_to_symbols is None:
      self._offset_to_symbols = symbol_extractor.GroupSymbolInfosByOffset(
          self.SymbolInfos())
    return self._offset_to_symbols

  def OffsetsPrimarySize(self, offsets):
    """Computes the total primary size of a set of offsets.

    Args:
      offsets (int iterable) a set of offsets. Each one must be a key of
        OffsetToPrimaryMap(), otherwise a KeyError is raised.

    Returns:
      int The sum of the primary size of the offsets.
    """
    primary_map = self.OffsetToPrimaryMap()
    return sum(primary_map[x].size for x in offsets)

  def GetReachedOffsetsFromDump(self, dump):
    """Find the symbol offsets from a list of binary offsets.

    The dump is a list of offsets into a .text section. This finds the symbols
    which contain the dump offsets, and returns their offsets. Note that while
    usually a symbol offset corresponds to a single symbol, in some cases
    several symbols will map to the same offset. For that reason this function
    returns only the offset list. See cyglog_to_orderfile.py for computing more
    information about symbols.

    Args:
     dump: (int iterable) Dump offsets, for example as returned by
       ProfileManager.GetMergedOffsets().

    Returns:
      [int] Reached symbol offsets, deduplicated, in the order in which they
      are first reached by the dump. Empty if the dump is empty.
    """
    # Materialize the dump: it is traversed twice below (max() and the loop),
    # so a one-shot iterator would otherwise be exhausted before the loop.
    dump = list(dump)
    if not dump:
      return []
    dump_offset_to_symbol_info = self._GetDumpOffsetToSymbolInfo()
    logging.info('Offset to Symbol size = %d', len(dump_offset_to_symbol_info))
    # Every dump offset is used as a word index into the table, so the largest
    # one has to be strictly inside it.
    assert max(dump) // 4 < len(dump_offset_to_symbol_info), (
        'Dump offset %d is past the end of the symbol table' % max(dump))
    already_seen = set()
    reached_offsets = []
    reached_return_addresses_not_found = 0
    for dump_offset in dump:
      symbol_info = dump_offset_to_symbol_info[dump_offset // 4]
      if symbol_info is None:
        reached_return_addresses_not_found += 1
        continue
      if symbol_info.offset in already_seen:
        continue
      reached_offsets.append(symbol_info.offset)
      already_seen.add(symbol_info.offset)
    if reached_return_addresses_not_found:
      logging.warning('%d return addresses don\'t map to any symbol',
                      reached_return_addresses_not_found)
    return reached_offsets

  def MatchSymbolNames(self, symbol_names):
    """Find the symbols in this binary which match a list of symbols.

    Args:
      symbol_names (str iterable) List of symbol names.

    Returns:
      [symbol_extractor.SymbolInfo] Symbols in this binary matching the names.
      The names are matched through a set, so the order of the result is
      unspecified.
    """
    our_symbol_names = set(s.name for s in self.SymbolInfos())
    matched_names = our_symbol_names.intersection(set(symbol_names))
    name_to_symbol = self.NameToSymbolMap()
    return [name_to_symbol[n] for n in matched_names]

  def _GetDumpOffsetToSymbolInfo(self):
    """Computes an array mapping each word in .text to a symbol.

    The array covers the range from the lowest symbol offset to the highest
    symbol end, in 4-byte words. It is rebuilt on every call.

    Returns:
      [symbol_extractor.SymbolInfo or None] For every 4 bytes of the .text
        section, maps it to a symbol, or None.
    """
    symbol_infos = self.SymbolInfos()
    min_offset = min(s.offset for s in symbol_infos)
    max_offset = max(s.offset + s.size for s in symbol_infos)
    # Floor division keeps word counts and indices integral regardless of
    # whether '/' is true division.
    text_length_words = (max_offset - min_offset) // 4
    offset_to_symbol_info = [None] * text_length_words
    for s in symbol_infos:
      offset = s.offset - min_offset
      for i in range(offset // 4, (offset + s.size) // 4):
        offset_to_symbol_info[i] = s
    return offset_to_symbol_info


class ProfileManager(object):
  """Manipulates sets of profiles.

  The manager supports only lightweight-style profiles (see
  lightweight_cygprofile.cc) and not the older cygprofile offset lists.

  A "profile set" refers to a set of data from an instrumented version of chrome
  that will be processed together, usually to produce a single orderfile. A
  "run" refers to a session of chrome, visiting several pages and thus
  comprising a browser process and at least one renderer process. A "dump"
  refers to the instrumentation in chrome writing out offsets of instrumented
  functions. There may be several dumps per run, for example one describing
  chrome startup and a second describing steady-state page interaction. Each
  process in a run produces one file per dump.

  These dump files have a timestamp of the dump time. Each process produces its
  own timestamp, but the dumps from each process occur very near in time to each
  other (< 1 second). If there are several dumps per run, each set of dumps is
  marked by a "phase" in the filename which is consistent across processes. For
  example the dump for the startup could be phase 0 and then the steady-state
  would be labeled phase 1.

  We assume the files are named like *-TIMESTAMP.SUFFIX_PHASE, where TIMESTAMP
  is in nanoseconds, SUFFIX is string without dashes, PHASE is an integer
  numbering the phases as 0, 1, 2..., and the only dot is the one between
  TIMESTAMP and SUFFIX. Note that the current dump filename also includes a
  process id which is currently unused.

  This manager supports several configurations of dumps.

  * A single dump from a single run. These files are merged together to produce
    a single dump without regard for browser versus renderer methods.

  * Several phases of dumps from a single run. Files are grouped by phase as
    described above.

  * Several phases of dumps from multiple runs from a set of telemetry
    benchmarks. The timestamp is used to distinguish each run because each
    benchmark takes < 10 seconds to run but there are > 50 seconds of setup
    time. These files can be grouped into run sets that are within 30 seconds
    of each other. Each run set is then grouped into phases as before.
  """
  class _RunGroup(object):
    """A list of dump filenames whose timestamps are close to each other."""

    # Maximum distance, in nanoseconds, between a file's timestamp and the
    # group's median timestamp for the file to belong to the group.
    RUN_GROUP_THRESHOLD_NS = 30e9

    def __init__(self):
      self._filenames = []

    def Filenames(self, phase=None):
      """Returns the group's filenames, restricted to |phase| if given."""
      if phase is None:
        return self._filenames
      return [f for f in self._filenames
              if ProfileManager._Phase(f) == phase]

    def Add(self, filename):
      """Appends |filename| to the group."""
      self._filenames.append(filename)

    def IsCloseTo(self, filename):
      """Returns whether |filename|'s timestamp is near this group's median.

      The group must already contain at least one file.
      """
      run_group_ts = _Median(
          [ProfileManager._Timestamp(f) for f in self._filenames])
      return abs(ProfileManager._Timestamp(filename) -
                 run_group_ts) < self.RUN_GROUP_THRESHOLD_NS

  def __init__(self, filenames):
    """Initialize a ProfileManager.

    Args:
      filenames ([str]): List of filenames describing the profile set. They are
        stored sorted by the timestamp embedded in their names.
    """
    self._filenames = sorted(filenames, key=self._Timestamp)
    # Computed lazily by _ComputeRunGroups().
    self._run_groups = None

  def GetPhases(self):
    """Return the set of phases of all orderfiles.

    Returns:
      set(int)
    """
    return set(self._Phase(f) for f in self._filenames)

  def GetMergedOffsets(self, phase=None):
    """Merges files, as if from a single dump.

    Args:
      phase (int, optional) If present, restrict to this phase.

    Returns:
      [int] Ordered list of reached offsets. Each offset only appears
      once in the output, in the order of the first dump that contains it.
    """
    if phase is None:
      return self._GetOffsetsForGroup(self._filenames)
    return self._GetOffsetsForGroup(f for f in self._filenames
                                    if self._Phase(f) == phase)

  def GetRunGroupOffsets(self, phase=None):
    """Merges files from each run group and returns offset list for each.

    Args:
      phase (int, optional) If present, restrict to this phase.

    Returns:
     [ [int] ] List of offsets lists, each as from GetMergedOffsets.
    """
    return [self._GetOffsetsForGroup(g) for g in self._GetRunGroups(phase)]

  def _GetOffsetsForGroup(self, filenames):
    """Reads |filenames| in order and returns their deduplicated offsets."""
    dumps = [self._ReadOffsets(f) for f in filenames]
    seen_offsets = set()
    result = []
    for dump in dumps:
      for offset in dump:
        if offset not in seen_offsets:
          result.append(offset)
          seen_offsets.add(offset)
    return result

  def _GetRunGroups(self, phase=None):
    """Returns one filename list per run group, restricted to |phase|."""
    if self._run_groups is None:
      self._ComputeRunGroups()
    return [g.Filenames(phase) for g in self._run_groups]

  @classmethod
  def _Timestamp(cls, filename):
    """Returns the integer between the last '-' and the last '.'."""
    dash_index = filename.rindex('-')
    dot_index = filename.rindex('.')
    return int(filename[dash_index+1:dot_index])

  @classmethod
  def _Phase(cls, filename):
    """Returns the integer following the last '_' in |filename|."""
    return int(filename.split('_')[-1])

  def _ReadOffsets(self, filename):
    """Reads a dump file containing one integer offset per line.

    Blank lines are ignored. The file is closed before returning.
    """
    with open(filename) as f:
      return [int(line) for line in f if line.strip()]

  def _ComputeRunGroups(self):
    """Partitions the files into run groups, stored in self._run_groups.

    Files are visited in timestamp order. Each one joins the first existing
    group it is close to, or starts a new group if there is none.
    """
    self._run_groups = []
    for f in self._filenames:
      for g in self._run_groups:
        if g.IsCloseTo(f):
          g.Add(f)
          break
      else:
        g = self._RunGroup()
        g.Add(f)
        self._run_groups.append(g)


def GetReachedOffsetsFromDumpFiles(dump_filenames, library_filename):
  """Computes the symbol offsets reached according to a set of dump files.

  Args:
    dump_filenames (str iterable) Names of the dump files to merge.
    library_filename (str) The library that the dumps refer to.

  Returns:
    [int] Reached symbol offsets, following the deduplicated order of the
      offsets found in the merged dump files, or None (after logging an error)
      when the merged dump is empty.
  """
  merged_offsets = ProfileManager(dump_filenames).GetMergedOffsets()
  if merged_offsets:
    logging.info('Reached offsets = %d', len(merged_offsets))
    symbol_processor = SymbolOffsetProcessor(library_filename)
    return symbol_processor.GetReachedOffsetsFromDump(merged_offsets)
  logging.error('Empty dump, cannot continue: %s', '\n'.join(dump_filenames))
  return None


def CreateArgumentParser():
  """Returns an ArgumentParser for this script's command line.

  Required flags: --instrumented-build-dir, --build-dir, --dumps and --output.
  Optional flags: --offsets-output (defaults to None) and --library-name
  (defaults to 'libchrome.so').
  """
  parser = argparse.ArgumentParser(description='Outputs reached symbols')
  parser.add_argument('--instrumented-build-dir', type=str,
                      help='Path to the instrumented build', required=True)
  parser.add_argument('--build-dir', type=str, help='Path to the build dir',
                      required=True)
  parser.add_argument('--dumps', type=str, help='A comma-separated list of '
                      'files with instrumentation dumps', required=True)
  parser.add_argument('--output', type=str, help='Output filename',
                      required=True)
  parser.add_argument('--offsets-output', type=str,
                      help='Output filename for the symbol offsets',
                      required=False, default=None)
  parser.add_argument('--library-name', default='libchrome.so',
                      help=('Chrome shared library name (usually libchrome.so '
                            'or libmonochrome.so)'))
  return parser


def main():
  """Writes the names of the symbols reached by a set of dumps.

  The dumps are merged and resolved to symbol offsets in the instrumented
  library. The primary symbol of each reached offset is then matched by name
  against the regular (non-instrumented) library, and the matching names are
  written one per line to --output. If --offsets-output is given, the reached
  offsets in the instrumented library are written there as well.
  """
  logging.basicConfig(level=logging.INFO)
  parser = CreateArgumentParser()
  args = parser.parse_args()
  logging.info('Merging dumps')
  dump_files = args.dumps.split(',')
  # ProfileManager sorts its filenames by timestamp on construction, so no
  # separate sorting step is needed before merging.
  profile_manager = ProfileManager(dump_files)
  dumps = profile_manager.GetMergedOffsets()

  instrumented_native_lib = os.path.join(args.instrumented_build_dir,
                                         'lib.unstripped', args.library_name)
  regular_native_lib = os.path.join(args.build_dir,
                                    'lib.unstripped', args.library_name)

  instrumented_processor = SymbolOffsetProcessor(instrumented_native_lib)

  reached_offsets = instrumented_processor.GetReachedOffsetsFromDump(dumps)
  if args.offsets_output:
    with open(args.offsets_output, 'w') as f:
      f.write('\n'.join(map(str, reached_offsets)))
  logging.info('Reached Offsets = %d', len(reached_offsets))

  primary_map = instrumented_processor.OffsetToPrimaryMap()
  reached_primary_symbols = set(
      primary_map[offset] for offset in reached_offsets)
  logging.info('Reached symbol names = %d', len(reached_primary_symbols))

  # Symbols are matched by name because the two builds are different binaries.
  regular_processor = SymbolOffsetProcessor(regular_native_lib)
  matched_in_regular_build = regular_processor.MatchSymbolNames(
      s.name for s in reached_primary_symbols)
  logging.info('Matched symbols = %d', len(matched_in_regular_build))
  total_size = sum(s.size for s in matched_in_regular_build)
  logging.info('Total reached size = %d', total_size)

  with open(args.output, 'w') as f:
    for s in matched_in_regular_build:
      f.write(s.name + '\n')


# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
  main()