1 files changed, 313 insertions, 0 deletions
diff --git a/tools/dev/benchmarks/RepoPerf/copy_repo.py b/tools/dev/benchmarks/RepoPerf/copy_repo.py
new file mode 100644
index 0000000..a95a82d
--- /dev/null
+++ b/tools/dev/benchmarks/RepoPerf/copy_repo.py
@@ -0,0 +1,313 @@
+#!/usr/bin/env python
+#
+#  copy_repo.py: create multiple, interleaved copies of a set of repositories.
+#
+#  Subversion is a tool for revision control.
+#  See http://subversion.apache.org for more information.
+#
+# ====================================================================
+#    Licensed to the Apache Software Foundation (ASF) under one
+#    or more contributor license agreements.  See the NOTICE file
+#    distributed with this work for additional information
+#    regarding copyright ownership.  The ASF licenses this file
+#    to you under the Apache License, Version 2.0 (the
+#    "License"); you may not use this file except in compliance
+#    with the License.  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing,
+#    software distributed under the License is distributed on an
+#    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+#    KIND, either express or implied.  See the License for the
+#    specific language governing permissions and limitations
+#    under the License.
+######################################################################
+
+# General modules
+import os
+import random
+import shutil
+import sys
+
+class Separators:
+  """ This class is a container for dummy / filler files.
+      It will be used to create spaces between repository
+      versions on disk, i.e. to simulate some aspect of
+      real-world FS fragmentation.
+
+      It gets initialized with some parent path as well as
+      the desired average file size and will create a new
+      such file with each call to write().  Automatic
+      sharding keeps FS specific overhead at bay.  Call
+      cleanup() to eventually delete all dummy files. """
+
+  # Non-NULL bytes (one 4k chunk) to be written into the dummy files.
+  buffer = b"A" * 4096
+
+  def __init__(self, path, average_size):
+    """ Initialize and store all dummy files in a '__tmp'
+        sub-folder of PATH.  The size of each dummy file
+        is a random value and will be slightly above
+        AVERAGE_SIZE kBytes on average.  A value of 0 will
+        effectively disable dummy file creation. """
+
+    self.path = os.path.join(path, '__tmp')
+    self.size = average_size
+    self.count = 0
+
+    if os.path.exists(self.path):
+      shutil.rmtree(self.path)
+
+    os.mkdir(self.path)
+
+  def write(self):
+    """ Add a new dummy file of random size (if that size is > 0). """
+
+    # Throw dice of a file size.
+    # Factor 1024 for kBytes, factor 2 for being an average.
+    size = int(float(self.size) * random.random() * 2 * 1024.0)
+
+    # Don't create empty files.  This also implements the
+    # "average = 0 means no files" rule.
+    if size <= 0:
+      return
+
+    self.count += 1
+
+    # Create a new shard for every 1000 files.  Floor division keeps
+    # the folder name an integer under "true division" semantics, too.
+    subfolder = os.path.join(self.path, str(self.count // 1000))
+    if not os.path.exists(subfolder):
+      os.mkdir(subfolder)
+
+    # Create and write the file in 4k chunks.  Writing full chunks
+    # will result in average file sizes being slightly above the
+    # SELF.SIZE.  That's good enough for our purposes.
+    with open(os.path.join(subfolder, str(self.count)), "wb") as f:
+      while size > 0:
+        f.write(self.buffer)
+        size -= len(self.buffer)
+
+  def cleanup(self):
+    """ Get rid of all the files (and folders) that we created. """
+
+    shutil.rmtree(self.path)
+
+class Repository:
+  """ Encapsulates key information of a repository.  It is being
+      used for copy sources only and contains information about
+      its NAME, PATH, SHARD_SIZE, HEAD revision and MIN_UNPACKED_REV. """
+
+  def _read_config(self, filename):
+    """ Read and return all lines from FILENAME in the 'db' folder.
+        This will be used to read 'format', 'current' etc. """
+
+    with open(os.path.join(self.path, 'db', filename), "rb") as f:
+      return f.readlines()
+
+  def __init__(self, parent, name):
+    """ Constructor collecting everything we need to know about
+        the repository NAME within PARENT folder. """
+
+    self.name = name
+    self.path = os.path.join(parent, name)
+
+    # The shard size is the 3rd token in the 2nd line of 'format'
+    # (e.g. "layout sharded 1000"); HEAD is the 1st token in 'current'.
+    # Splitting at any whitespace works for str and bytes alike.
+    self.shard_size = int(self._read_config('format')[1].split()[2])
+    self.min_unpacked_rev = int(self._read_config('min-unpacked-rev')[0])
+    self.head = int(self._read_config('current')[0].split()[0])
+
+  def needs_copy(self, revision):
+    """ Return True if REVISION is a revision in this repository
+        and is "directly copyable", i.e. is either non-packed or
+        the first rev in a packed shard.  Everything else is either
+        not a valid rev or already gets / got copied as part of
+        some packed shard. """
+
+    if revision > self.head:
+      return False
+    if revision < self.min_unpacked_rev:
+      return revision % self.shard_size == 0
+
+    return True
+
+  @classmethod
+  def is_repository(cls, path):
+    """ Quick check that PATH is (probably) a repository.
+        This is mainly to filter out aux files put next to
+        (not inside) the repositories to copy. """
+
+    format_path = os.path.join(path, 'db', 'format')
+    return os.path.isfile(format_path)
+
+class Multicopy:
+  """ Helper class doing the actual copying.  It copies individual
+      revisions and packed shards from the one source repository
+      to multiple copies of it.  The copies have the same name
+      as the source repo but with numbers 0 .. N-1 appended to it.
+
+      The copy process is being initiated by the constructor
+      (copies the repo skeleton w/o revision contents).  Revision
+      contents is then copied by successive calls to the copy()
+      method. """
+
+  def _init_copy(self, number):
+    """ Called from the constructor, this will copy SELF.SOURCE_REPO
+        into the new repo NUMBER below SELF.DEST_BASE but omit
+        everything below db/revs and db/revprops. """
+
+    src = self.source_repo.path
+    dst = self.dest_base + str(number)
+
+    # Copy the repo skeleton w/o revs and revprops
+    shutil.copytree(src, dst, ignore=shutil.ignore_patterns('revs', 'revprops'))
+
+    # Add revs and revprops
+    self.dst_revs.append(os.path.join(dst, 'db', 'revs'))
+    self.dst_revprops.append(os.path.join(dst, 'db', 'revprops'))
+
+    os.mkdir(self.dst_revs[number])
+    os.mkdir(self.dst_revprops[number])
+
+  def _copy_packed_shard(self, shard, number):
+    """ Copy packed shard number SHARD from SELF.SOURCE_REPO to
+        the copy NUMBER below SELF.DEST_BASE. """
+
+    # Shards are simple subtrees
+    pack = str(shard) + '.pack'
+    src_revs = os.path.join(self.src_revs, pack)
+    dst_revs = os.path.join(self.dst_revs[number], pack)
+    src_revprops = os.path.join(self.src_revprops, pack)
+    dst_revprops = os.path.join(self.dst_revprops[number], pack)
+
+    shutil.copytree(src_revs, dst_revs)
+    shutil.copytree(src_revprops, dst_revprops)
+
+    # Special case: revprops of rev 0 are never packed => extra copy
+    if shard == 0:
+      src_revprops = os.path.join(self.src_revprops, '0')
+      dst_revprops = os.path.join(self.dst_revprops[number], '0')
+      shutil.copytree(src_revprops, dst_revprops)
+
+  def _copy_single_revision(self, revision, number):
+    """ Copy non-packed REVISION from SELF.SOURCE_REPO to the copy
+        NUMBER below SELF.DEST_BASE. """
+
+    # Floor division: the shard folder name must be an integer.
+    shard = str(revision // self.source_repo.shard_size)
+
+    # Auto-create shard folder
+    if revision % self.source_repo.shard_size == 0:
+      os.mkdir(os.path.join(self.dst_revs[number], shard))
+      os.mkdir(os.path.join(self.dst_revprops[number], shard))
+
+    # Copy the rev file and the revprop file
+    src_rev = os.path.join(self.src_revs, shard, str(revision))
+    dst_rev = os.path.join(self.dst_revs[number], shard, str(revision))
+    src_revprop = os.path.join(self.src_revprops, shard, str(revision))
+    dst_revprop = os.path.join(self.dst_revprops[number], shard, str(revision))
+    shutil.copyfile(src_rev, dst_rev)
+    shutil.copyfile(src_revprop, dst_revprop)
+
+  def __init__(self, source, target_parent, count):
+    """ Initiate the copy process for the SOURCE repository to
+        be copied COUNT times into the TARGET_PARENT directory. """
+
+    self.source_repo = source
+    self.dest_base = os.path.join(target_parent, source.name)
+
+    self.src_revs = os.path.join(source.path, 'db', 'revs')
+    self.src_revprops = os.path.join(source.path, 'db', 'revprops')
+
+    self.dst_revs = []
+    self.dst_revprops = []
+    for i in range(0, count):
+      self._init_copy(i)
+
+  def copy(self, revision, number):
+    """ Copy (packed or non-packed) REVISION from SELF.SOURCE_REPO
+        to the copy NUMBER below SELF.DEST_BASE.
+
+        SELF.SOURCE_REPO.needs_copy(REVISION) must be True. """
+
+    if revision < self.source_repo.min_unpacked_rev:
+      self._copy_packed_shard(revision // self.source_repo.shard_size, number)
+    else:
+      self._copy_single_revision(revision, number)
+
+def copy_repos(src, dst, count, separator_size):
+  """ Under DST, create COUNT copies of all repositories immediately
+      below SRC.
+
+      All copies will be "interleaved" such that we copy each individual
+      revision / packed shard to all target repos first before
+      continuing with the next revision / packed shard.  After each
+      round (revision / packed shard) insert a temporary file of
+      SEPARATOR_SIZE kBytes on average to add more spacing between
+      revisions.  The temp files get automatically removed at the end.
+
+      Please note that this function will clear DST before copying
+      anything into it. """
+
+  # Remove any remnants from the target folder (it may not exist yet)
+  # and start with an empty one, even if there is nothing to copy.
+  if os.path.exists(dst):
+    shutil.rmtree(dst)
+  os.makedirs(dst)
+
+  # Find repositories, initiate copies and determine the range of
+  # revisions to copy in total.  COPIES[i] copies REPOSITORIES[i].
+  repositories = []
+  copies = []
+  max_revision = 0
+  for name in os.listdir(src):
+    if Repository.is_repository(os.path.join(src, name)):
+      repository = Repository(src, name)
+      repositories.append(repository)
+      copies.append(Multicopy(repository, dst, count))
+      max_revision = max(max_revision, repository.head)
+
+  # Temp file collection (spacers)
+  separators = Separators(dst, separator_size)
+
+  try:
+    # Copy all repos in revision,number-major order
+    for revision in range(0, max_revision + 1):
+      for number in range(0, count):
+        any_copy = False
+        for repository, multicopy in zip(repositories, copies):
+          if repository.needs_copy(revision):
+            any_copy = True
+            multicopy.copy(revision, number)
+
+        # Don't add spacers when nothing got copied (REVISION is
+        # packed in all repositories).
+        if any_copy:
+          separators.write()
+
+  finally:
+    # Remove the spacers: either all data is in position now or
+    # copying failed and they are of no use anymore.
+    separators.cleanup()
+
+def show_usage():
+  """ Print a short command line usage summary to stdout. """
+  # One parenthesized string per print works as statement and as function.
+  print("Copies and duplicates repositories in a way that mimics larger deployments.")
+  print("")
+  print("Usage:")
+  print("copy_repo.py SRC DST COUNT SEPARATOR_SIZE")
+  print("")
+  print("SRC            Immediate parent folder of all the repositories to copy.")
+  print("DST            Folder to copy into; current contents will be lost.")
+  print("COUNT          Number of copies to create of each source repository.")
+  print("SEPARATOR_SIZE Additional spacing, in kBytes, between revisions.")
+
+# Main: expect exactly the four arguments listed by show_usage().
+if len(sys.argv) == 5:
+  copy_repos(sys.argv[1], sys.argv[2], int(sys.argv[3]), int(sys.argv[4]))
+else:
+  show_usage()