summaryrefslogtreecommitdiff
path: root/tools/dev/benchmarks/RepoPerf/copy_repo.py
diff options
context:
space:
mode:
Diffstat (limited to 'tools/dev/benchmarks/RepoPerf/copy_repo.py')
-rw-r--r--tools/dev/benchmarks/RepoPerf/copy_repo.py313
1 files changed, 313 insertions, 0 deletions
diff --git a/tools/dev/benchmarks/RepoPerf/copy_repo.py b/tools/dev/benchmarks/RepoPerf/copy_repo.py
new file mode 100644
index 0000000..a95a82d
--- /dev/null
+++ b/tools/dev/benchmarks/RepoPerf/copy_repo.py
@@ -0,0 +1,313 @@
+#!/usr/bin/env python
+#
+# copy_repo.py: create multiple, interleaved copies of a set of repositories.
+#
+# Subversion is a tool for revision control.
+# See http://subversion.apache.org for more information.
+#
+# ====================================================================
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+######################################################################
+
+# General modules
+import os
+import random
+import shutil
+import sys
+
class Separators:
    """ Container for dummy / filler files.

        The files are used to create gaps between repository
        versions on disk, i.e. to simulate some aspect of
        real-world FS fragmentation.

        An instance gets initialized with some parent path as well
        as the desired average file size and creates one new filler
        file with each call to write().  The files are spread over
        numbered sub-folders ("shards") of at most 1000 entries each
        to keep FS specific overhead at bay.  Call cleanup() to
        eventually delete all dummy files. """

    # Non-NULL contents written into the dummy files, one chunk at a
    # time.  It is a bytes literal because the files are opened in
    # binary mode.
    buffer = b"A" * 4096

    def __init__(self, path, average_size):
        """ Initialize and store all dummy files in a '__tmp'
            sub-folder of PATH.  The size of each dummy file
            is a random value and will be slightly above
            AVERAGE_SIZE kBytes on average.  A value of 0 will
            effectively disable dummy file creation.

            Any existing '__tmp' folder below PATH is deleted first;
            PATH itself must already exist. """

        self.path = os.path.join(path, '__tmp')
        self.size = average_size
        self.count = 0

        # Start from an empty folder, dropping leftovers of earlier runs.
        if os.path.exists(self.path):
            shutil.rmtree(self.path)

        os.mkdir(self.path)

    def write(self):
        """ Add a new dummy file of random size.  Nothing is written
            when the random size comes out as 0. """

        # Throw dice for a file size.
        # Factor 1024 for kBytes, factor 2 for being an average.
        size = int(float(self.size) * random.random() * 2 * 1024.0)

        # Don't create empty files.  This also implements the
        # "average = 0 means no files" rule.
        if size <= 0:
            return

        self.count += 1

        # Create a new shard for every 1000 files.  Floor division
        # keeps the shard name an integer on Python 2 and 3 alike.
        subfolder = os.path.join(self.path, str(self.count // 1000))
        if not os.path.exists(subfolder):
            os.mkdir(subfolder)

        # Create and write the file in 4k chunks.
        # Writing full chunks will result in average file sizes
        # being slightly above SELF.SIZE.  That's good enough
        # for our purposes.
        with open(os.path.join(subfolder, str(self.count)), "wb") as f:
            while size > 0:
                f.write(self.buffer)
                size -= len(self.buffer)

    def cleanup(self):
        """ Get rid of all the files (and folders) that we created. """

        shutil.rmtree(self.path)
+
class Repository:
    """ Encapsulates key information of a repository.  It is being
        used for copy sources only and contains information about
        its NAME, PATH, SHARD_SIZE, HEAD revision and MIN_UNPACKED_REV. """

    def _read_config(self, filename):
        """ Read and return all lines from FILENAME in the 'db'
            folder of this repository.  This will be used to read
            'format', 'current' etc. """

        # Text mode, so the lines are str on Python 2 and 3 alike;
        # the context manager closes the file even if reading fails.
        with open(os.path.join(self.path, 'db', filename), "r") as f:
            return f.readlines()

    def __init__(self, parent, name):
        """ Constructor collecting everything we need to know about
            the repository NAME within PARENT folder.

            Reads the files 'format', 'min-unpacked-rev' and 'current'
            from the repository's 'db' folder. """

        self.name = name
        self.path = os.path.join(parent, name)

        # The shard size is the third field of the second 'format' line.
        self.shard_size = int(self._read_config('format')[1].split()[2])
        self.min_unpacked_rev = int(self._read_config('min-unpacked-rev')[0])
        self.head = int(self._read_config('current')[0])

    def needs_copy(self, revision):
        """ Return True if REVISION is a revision in this repository
            and is "directly copyable", i.e. is either non-packed or
            the first rev in a packed shard.  Everything else is either
            not a valid rev or already gets / got copied as part of
            some packed shard. """

        if revision > self.head:
            return False
        if revision < self.min_unpacked_rev:
            # Packed: only the first revision of a shard triggers a copy.
            return revision % self.shard_size == 0

        return True

    @classmethod
    def is_repository(cls, path):
        """ Quick check that PATH is (probably) a repository, i.e.
            that it contains a 'db/format' file.  This is mainly to
            filter out aux files put next to (not inside) the
            repositories to copy. """

        format_path = os.path.join(path, 'db', 'format')
        return os.path.isfile(format_path)
+
class Multicopy:
    """ Helper class doing the actual copying.  It copies individual
        revisions and packed shards from the one source repository
        to multiple copies of it.  The copies have the same name
        as the source repo but with numbers 0 .. N-1 appended to it.

        The copy process is being initiated by the constructor
        (copies the repo skeleton w/o revision contents).  Revision
        contents is then copied by successive calls to the copy()
        method. """

    def _init_copy(self, number):
        """ Called from the constructor, this will copy SELF.SOURCE_REPO
            into the new repo with index NUMBER below SELF.DEST_BASE but
            omit everything below db/revs and db/revprops.  Must be
            called with NUMBER = 0, 1, 2, ... in that order because the
            destination paths are appended to SELF.DST_REVS and
            SELF.DST_REVPROPS and later looked up by index. """

        src = self.source_repo.path
        dst = self.dest_base + str(number)

        # Copy the repo skeleton w/o revs and revprops
        shutil.copytree(src, dst,
                        ignore=shutil.ignore_patterns('revs', 'revprops'))

        # Add (empty) revs and revprops folders
        self.dst_revs.append(os.path.join(dst, 'db', 'revs'))
        self.dst_revprops.append(os.path.join(dst, 'db', 'revprops'))

        os.mkdir(self.dst_revs[number])
        os.mkdir(self.dst_revprops[number])

    def _copy_packed_shard(self, shard, number):
        """ Copy packed shard number SHARD from SELF.SOURCE_REPO to
            the copy NUMBER below SELF.DEST_BASE. """

        # Shards are simple subtrees
        pack_name = str(shard) + '.pack'
        src_revs = os.path.join(self.src_revs, pack_name)
        dst_revs = os.path.join(self.dst_revs[number], pack_name)
        src_revprops = os.path.join(self.src_revprops, pack_name)
        dst_revprops = os.path.join(self.dst_revprops[number], pack_name)

        shutil.copytree(src_revs, dst_revs)
        shutil.copytree(src_revprops, dst_revprops)

        # Special case: revprops of rev 0 are never packed => extra copy
        if shard == 0:
            src_revprops = os.path.join(self.src_revprops, '0')
            dst_revprops = os.path.join(self.dst_revprops[number], '0')

            shutil.copytree(src_revprops, dst_revprops)

    def _copy_single_revision(self, revision, number):
        """ Copy non-packed REVISION from SELF.SOURCE_REPO to the copy
            NUMBER below SELF.DEST_BASE. """

        # Floor division: the shard folder name must be an integer
        # on Python 2 and 3 alike.
        shard = str(revision // self.source_repo.shard_size)

        # Auto-create the shard folders when copying the first
        # revision of a shard.
        if revision % self.source_repo.shard_size == 0:
            os.mkdir(os.path.join(self.dst_revs[number], shard))
            os.mkdir(os.path.join(self.dst_revprops[number], shard))

        # Copy the rev file and the revprop file
        rev_name = str(revision)
        src_rev = os.path.join(self.src_revs, shard, rev_name)
        dst_rev = os.path.join(self.dst_revs[number], shard, rev_name)
        src_revprop = os.path.join(self.src_revprops, shard, rev_name)
        dst_revprop = os.path.join(self.dst_revprops[number], shard, rev_name)

        shutil.copyfile(src_rev, dst_rev)
        shutil.copyfile(src_revprop, dst_revprop)

    def __init__(self, source, target_parent, count):
        """ Initiate the copy process for the SOURCE repository to
            be copied COUNT times into the TARGET_PARENT directory.

            SOURCE must provide the attributes NAME, PATH, SHARD_SIZE
            and MIN_UNPACKED_REV. """

        self.source_repo = source
        self.dest_base = os.path.join(target_parent, source.name)

        self.src_revs = os.path.join(source.path, 'db', 'revs')
        self.src_revprops = os.path.join(source.path, 'db', 'revprops')

        # Per-copy destination folders, indexed by copy number.
        self.dst_revs = []
        self.dst_revprops = []
        for i in range(0, count):
            self._init_copy(i)

    def copy(self, revision, number):
        """ Copy (packed or non-packed) REVISION from SELF.SOURCE_REPO
            to the copy NUMBER below SELF.DEST_BASE.

            SELF.SOURCE_REPO.needs_copy(REVISION) must be True. """

        if revision < self.source_repo.min_unpacked_rev:
            shard = revision // self.source_repo.shard_size
            self._copy_packed_shard(shard, number)
        else:
            self._copy_single_revision(revision, number)
+
def copy_repos(src, dst, count, separator_size):
    """ Under DST, create COUNT copies of all repositories immediately
        below SRC.

        All copies will be "interleaved" such that we copy each
        individual revision / packed shard to all target repos first
        before continuing with the next revision / packed shard.  After
        each round (revision / packed shard) insert a temporary file of
        SEPARATOR_SIZE kBytes on average to add more spacing between
        revisions.  The temp files get automatically removed at the
        end, even if copying fails half-way.

        Please note that this function will clear DST before copying
        anything into it. """

    # Remove any remnants from the target folder.  DST may not exist
    # yet, e.g. on the very first run.
    if os.path.exists(dst):
        shutil.rmtree(dst)

    # Repositories to copy and the respective copy utilities
    repositories = []
    copies = []

    # Find repositories, initiate copies and determine the range of
    # revisions to copy in total
    max_revision = 0
    for name in os.listdir(src):
        if Repository.is_repository(os.path.join(src, name)):
            repository = Repository(src, name)
            repositories.append(repository)
            copies.append(Multicopy(repository, dst, count))

            if repository.head > max_revision:
                max_revision = repository.head

    # DST normally gets auto-created by the first repo copy.  Create
    # it explicitly if nothing has been copied (no repositories found
    # or COUNT is 0) because the spacer folder lives below it.
    if not os.path.isdir(dst):
        os.makedirs(dst)

    # Temp file collection (spacers)
    separators = Separators(dst, separator_size)

    try:
        # Copy all repos in revision,number-major order
        for revision in range(0, max_revision + 1):
            for number in range(0, count):

                any_copy = False
                for repository, multicopy in zip(repositories, copies):
                    if repository.needs_copy(revision):
                        any_copy = True
                        multicopy.copy(revision, number)

                # Don't add spacers when nothing got copied (REVISION
                # is packed in all repositories).
                if any_copy:
                    separators.write()
    finally:
        # Now that all data is in position (or copying has failed),
        # remove the spacers
        separators.cleanup()
+
def show_usage():
    """ Write a simple command line usage summary to stdout. """

    # Passing one parenthesized string to print works both as a
    # Python 2 print statement and as a Python 3 function call.
    lines = [
        "Copies and duplicates repositories in a way that mimics larger deployments.",
        "",
        "Usage:",
        "copy_repo.py SRC DST COUNT SEPARATOR_SIZE",
        "",
        "SRC Immediate parent folder of all the repositories to copy.",
        "DST Folder to copy into; current contents will be lost.",
        "COUNT Number of copies to create of each source repository.",
        "SEPARATOR_SIZE Additional spacing, in kBytes, between revisions.",
    ]
    print("\n".join(lines))
+
# Main function: copy the repositories when run as a script with
# exactly four arguments; print the usage summary otherwise.
if __name__ == '__main__':
    if len(sys.argv) == 5:
        copy_repos(sys.argv[1], sys.argv[2],
                   int(sys.argv[3]), int(sys.argv[4]))
    else:
        show_usage()