tools/dev/benchmarks/RepoPerf/copy_repo.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313

#!/usr/bin/env python
#
#  copy_repo.py: create multiple, interleaved copies of a set of repositories.
#
#  Subversion is a tool for revision control.
#  See http://subversion.apache.org for more information.
#
# ====================================================================
#    Licensed to the Apache Software Foundation (ASF) under one
#    or more contributor license agreements.  See the NOTICE file
#    distributed with this work for additional information
#    regarding copyright ownership.  The ASF licenses this file
#    to you under the Apache License, Version 2.0 (the
#    "License"); you may not use this file except in compliance
#    with the License.  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#    Unless required by applicable law or agreed to in writing,
#    software distributed under the License is distributed on an
#    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
#    KIND, either express or implied.  See the License for the
#    specific language governing permissions and limitations
#    under the License.
######################################################################

# General modules
import os
import random
import shutil
import sys

class Separators:
  """ This class is a container for dummy / filler files.
      They are used to create gaps between repository
      versions on disk, i.e. to simulate some aspect of
      real-world FS fragmentation.

      It gets initialized with some parent path as well as
      the desired average file size and will create a new
      such file with each call to write().  Automatic
      sharding (1000 files per sub-folder) keeps FS specific
      overhead at bay.  Call cleanup() to eventually delete
      all dummy files. """

  # Non-NUL contents written into the dummy files, one chunk at a time.
  # This must be a byte string because the files are opened in binary mode.
  buffer = b"A" * 4096

  def __init__(self, path, average_size):
    """ Initialize and store all dummy files in a '__tmp'
        sub-folder of PATH.  Any existing '__tmp' folder is
        removed first; missing parent folders get created.
        The size of each dummy file is a random value and will
        be slightly above AVERAGE_SIZE kBytes on average.  A
        value of 0 will effectively disable dummy file creation. """

    self.path = os.path.join(path, '__tmp')
    self.size = average_size
    self.count = 0

    # Start from an empty folder, dropping leftovers of previous runs.
    if os.path.exists(self.path):
      shutil.rmtree(self.path)

    # makedirs() instead of mkdir(): PATH itself may not exist yet.
    os.makedirs(self.path)

  def write(self):
    """ Add a new dummy file of random size.  Nothing is written
        (and SELF.COUNT stays unchanged) if the size comes out as 0. """

    # Throw dice for a file size.
    # Factor 1024 for kBytes, factor 2 for being an average.
    size = int(float(self.size) * random.random() * 2 * 1024.0)

    # Don't create empty files.  This also implements the
    # "average = 0 means no files" rule.
    if size <= 0:
      return

    self.count += 1

    # Create a new shard for every 1000 files.  Floor division keeps
    # the shard name an integer regardless of the division semantics.
    subfolder = os.path.join(self.path, str(self.count // 1000))
    if not os.path.exists(subfolder):
      os.mkdir(subfolder)

    # Create and write the file in 4k chunks.
    # Writing full chunks will result in average file sizes
    # being slightly above SELF.SIZE.  That's good enough
    # for our purposes.
    with open(os.path.join(subfolder, str(self.count)), "wb") as f:
      while size > 0:
        f.write(self.buffer)
        size -= len(self.buffer)

  def cleanup(self):
    """ Get rid of all the files (and folders) that we created. """

    shutil.rmtree(self.path)

class Repository:
  """ Encapsulates key information of a repository.  It is being
      used for copy sources only and contains information about
      its NAME, PATH, SHARD_SIZE, HEAD revision and MIN_UNPACKED_REV. """

  def _read_config(self, filename):
    """ Read and return all lines from FILENAME within the 'db'
        folder of this repository.
        This is used to read 'format', 'current' etc. """

    # The context manager closes the file even if reading fails.
    with open(os.path.join(self.path, 'db', filename), "r") as f:
      return f.readlines()

  def __init__(self, parent, name):
    """ Constructor collecting everything we need to know about
        the repository NAME within PARENT folder.

        The shard size is taken from the third whitespace-separated
        field on the second line of 'db/format'; 'db/min-unpacked-rev'
        and 'db/current' must each start with a line holding a single
        integer. """

    self.name = name
    self.path = os.path.join(parent, name)

    # Second line of 'format' is presumably of the form
    # "layout sharded <size>"; only the third field is used here.
    layout = self._read_config('format')[1].split()
    self.shard_size = int(layout[2])
    self.min_unpacked_rev = int(self._read_config('min-unpacked-rev')[0])
    self.head = int(self._read_config('current')[0])

  def needs_copy(self, revision):
    """ Return True if REVISION is a revision in this repository
        and is "directly copyable", i.e. is either non-packed or
        the first rev in a packed shard.  Everything else is either
        not a valid rev or already gets / got copied as part of
        some packed shard. """

    # Beyond HEAD: no such revision in this repository.
    if revision > self.head:
      return False

    # Packed range: only the first revision of a shard triggers a copy.
    if revision < self.min_unpacked_rev:
      return revision % self.shard_size == 0

    return True

  @classmethod
  def is_repository(cls, path):
    """ Quick check that PATH is (probably) a repository, i.e. that
        it contains a 'db/format' file.
        This is mainly to filter out aux files put next to
        (not inside) the repositories to copy. """

    format_path = os.path.join(path, 'db', 'format')
    return os.path.isfile(format_path)

class Multicopy:
  """ Helper class doing the actual copying.  It copies individual
      revisions and packed shards from the one source repository
      to multiple copies of it.  The copies have the same name
      as the source repo but with numbers 0 .. N-1 appended to it.

      The copy process is being initiated by the constructor
      (copies the repo skeleton w/o revision contents).  Revision
      contents is then copied by successive calls to the copy()
      method. """

  def __init__(self, source, target_parent, count):
    """ Initiate the copy process for the SOURCE repository to
        be copied COUNT times into the TARGET_PARENT directory.

        SOURCE must provide the attributes NAME, PATH, SHARD_SIZE
        and MIN_UNPACKED_REV. """

    self.source_repo = source
    self.dest_base = os.path.join(target_parent, source.name)

    self.src_revs = os.path.join(source.path, 'db', 'revs')
    self.src_revprops = os.path.join(source.path, 'db', 'revprops')

    # Per-copy target folders, indexed by copy number.
    self.dst_revs = []
    self.dst_revprops = []
    for i in range(0, count):
      self._init_copy(i)

  def _init_copy(self, number):
    """ Called from the constructor, this will copy SELF.SOURCE_REPO
        into the new repo numbered NUMBER below SELF.DEST_BASE but
        omit everything below db/revs and db/revprops.

        Must be called with NUMBER = 0, 1, 2, ... in that order
        because the target folders get appended to SELF.DST_REVS
        and SELF.DST_REVPROPS and are then looked up by NUMBER. """

    src = self.source_repo.path
    dst = self.dest_base + str(number)

    # Copy the repo skeleton w/o revs and revprops
    shutil.copytree(src, dst, ignore=shutil.ignore_patterns('revs', 'revprops'))

    # Add (empty) revs and revprops folders
    self.dst_revs.append(os.path.join(dst, 'db', 'revs'))
    self.dst_revprops.append(os.path.join(dst, 'db', 'revprops'))

    os.mkdir(self.dst_revs[number])
    os.mkdir(self.dst_revprops[number])

  def _copy_packed_shard(self, shard, number):
    """ Copy packed shard number SHARD from SELF.SOURCE_REPO to
        the copy NUMBER below SELF.DEST_BASE. """

    # Shards are simple subtrees
    pack_name = str(shard) + '.pack'
    src_revs = os.path.join(self.src_revs, pack_name)
    dst_revs = os.path.join(self.dst_revs[number], pack_name)
    src_revprops = os.path.join(self.src_revprops, pack_name)
    dst_revprops = os.path.join(self.dst_revprops[number], pack_name)

    shutil.copytree(src_revs, dst_revs)
    shutil.copytree(src_revprops, dst_revprops)

    # Special case: revprops of rev 0 are never packed => extra copy
    if shard == 0:
      src_revprops = os.path.join(self.src_revprops, '0')
      dst_revprops = os.path.join(self.dst_revprops[number], '0')

      shutil.copytree(src_revprops, dst_revprops)

  def _copy_single_revision(self, revision, number):
    """ Copy non-packed REVISION from SELF.SOURCE_REPO to the copy
        NUMBER below SELF.DEST_BASE.

        The shard folder is only created when REVISION is the first
        revision of its shard, so revisions of a shard must be copied
        starting with that one. """

    shard_size = self.source_repo.shard_size

    # Floor division: the shard folder name must be an integer
    # ("0", not "0.0") regardless of the division semantics in effect.
    shard = str(revision // shard_size)

    # Auto-create shard folder
    if revision % shard_size == 0:
      os.mkdir(os.path.join(self.dst_revs[number], shard))
      os.mkdir(os.path.join(self.dst_revprops[number], shard))

    # Copy the rev file and the revprop file
    rev_name = str(revision)
    src_rev = os.path.join(self.src_revs, shard, rev_name)
    dst_rev = os.path.join(self.dst_revs[number], shard, rev_name)
    src_revprop = os.path.join(self.src_revprops, shard, rev_name)
    dst_revprop = os.path.join(self.dst_revprops[number], shard, rev_name)

    shutil.copyfile(src_rev, dst_rev)
    shutil.copyfile(src_revprop, dst_revprop)

  def copy(self, revision, number):
    """ Copy (packed or non-packed) REVISION from SELF.SOURCE_REPO
        to the copy NUMBER below SELF.DEST_BASE.

        SELF.SOURCE_REPO.needs_copy(REVISION) must be True. """

    if revision < self.source_repo.min_unpacked_rev:
      # Packed revisions are copied shard-wise; use floor division to
      # get the integer shard number.
      self._copy_packed_shard(revision // self.source_repo.shard_size, number)
    else:
      self._copy_single_revision(revision, number)

def copy_repos(src, dst, count, separator_size):
  """ Under DST, create COUNT copies of all repositories immediately
      below SRC.

      All copies will be "interleaved" such that we copy each individual
      revision / packed shard to all target repos first before
      continuing with the next revision / packed shard.  After each
      round (revision / packed shard) insert a temporary file of
      SEPARATOR_SIZE kBytes on average to add more spacing between
      revisions.  The temp files get automatically removed at the end,
      even if copying fails half-way.

      Please note that this function will clear DST before copying
      anything into it.  DST gets created if it does not exist yet. """

  # Remove any remnants from the target folder and start with an
  # empty one.  DST may legitimately be missing on the first run.
  if os.path.exists(dst):
    shutil.rmtree(dst)
  os.makedirs(dst)

  # Repositories to copy and the respective copy utilities
  repositories = []
  copies = []

  # Find repositories, initiate copies and determine the range of
  # revisions to copy in total.  Entries are visited in sorted order
  # because os.listdir() returns them in arbitrary order and the copy
  # order determines the on-disk layout we want to be reproducible.
  max_revision = 0
  for name in sorted(os.listdir(src)):
    if Repository.is_repository(os.path.join(src, name)):
      repository = Repository(src, name)
      repositories.append(repository)
      copies.append(Multicopy(repository, dst, count))

      max_revision = max(max_revision, repository.head)

  # Temp file collection (spacers)
  separators = Separators(dst, separator_size)

  try:
    # Copy all repos in revision,number-major order
    for revision in range(0, max_revision + 1):
      for number in range(0, count):

        any_copy = False
        for repository, copier in zip(repositories, copies):
          if repository.needs_copy(revision):
            any_copy = True
            copier.copy(revision, number)

        # Don't add spacers when nothing got copied (REVISION is
        # packed in all repositories).
        if any_copy:
          separators.write()
  finally:
    # Whether or not all data is in position, remove the spacers
    separators.cleanup()

def show_usage():
  """ Print a short command line usage message to stdout. """

  usage = [
    "Copies and duplicates repositories in a way that mimics larger deployments.",
    "",
    "Usage:",
    "copy_repo.py SRC DST COUNT SEPARATOR_SIZE",
    "",
    "SRC            Immediate parent folder of all the repositories to copy.",
    "DST            Folder to copy into; current contents will be lost.",
    "COUNT          Number of copies to create of each source repository.",
    "SEPARATOR_SIZE Additional spacing, in kBytes, between revisions.",
  ]

  # A single parenthesised string argument prints identically whether
  # 'print' is a statement or a function.
  print("\n".join(usage))

# Main entry point: run only when executed as a script.  Exactly four
# arguments (SRC, DST, COUNT, SEPARATOR_SIZE) are expected; anything
# else prints the usage text instead.
if __name__ == '__main__':
  if len(sys.argv) == 5:
    copy_repos(sys.argv[1], sys.argv[2], int(sys.argv[3]), int(sys.argv[4]))
  else:
    show_usage()