1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
|
#!/usr/bin/env python
#
# copy_repo.py: create multiple, interleaved copies of a set of repositories.
#
# Subversion is a tool for revision control.
# See http://subversion.apache.org for more information.
#
# ====================================================================
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
######################################################################
# General modules
import os
import random
import shutil
import sys
class Separators:
    """ This class is a container for dummy / filler files.
        It will be used to create spaces between repository
        versions on disk, i.e. to simulate some aspect of
        real-world FS fragmentation.

        It gets initialized with some parent path as well as
        the desired average file size and will create a new
        such file with each call to write().  Automatic
        sharding keeps FS specific overhead at bay.  Call
        cleanup() to eventually delete all dummy files. """

    # Non-NULL contents written into the dummy files, one 4k chunk at
    # a time.  This is a bytes literal because the files get opened in
    # binary mode.
    buffer = b"A" * 4096

    def __init__(self, path, average_size):
        """ Initialize and store all dummy files in a '__tmp'
            sub-folder of PATH.  The size of each dummy file
            is a random value and will be slightly above AVERAGE_SIZE
            kBytes on average.  A value of 0 will effectively
            disable dummy file creation.

            Any '__tmp' folder left over from a previous run gets
            removed first. """

        self.path = os.path.join(path, '__tmp')
        self.size = average_size
        self.count = 0

        if os.path.exists(self.path):
            shutil.rmtree(self.path)

        # makedirs instead of mkdir: PATH itself may not exist yet.
        os.makedirs(self.path)

    def write(self):
        """ Add a new dummy file of random size.  Does nothing if the
            size thrown happens to be 0. """

        # Throw dice of a file size.
        # Factor 1024 for kBytes, factor 2 for being an average.
        size = int(float(self.size) * random.random() * 2 * 1024.0)

        # Don't create empty files.  This also implements the
        # "average = 0 means no files" rule.
        if size <= 0:
            return

        self.count += 1

        # Create a new shard for every 1000 files.
        # Floor division keeps the shard name an integer ("1", not "1.0")
        # regardless of the division semantics in effect.
        subfolder = os.path.join(self.path, str(self.count // 1000))
        if not os.path.exists(subfolder):
            os.mkdir(subfolder)

        # Create and write the file in 4k chunks.
        # Writing full chunks will result in average file sizes
        # being slightly above the SELF.SIZE.  That's good enough
        # for our purposes.
        with open(os.path.join(subfolder, str(self.count)), "wb") as f:
            while size > 0:
                f.write(self.buffer)
                size -= len(self.buffer)

    def cleanup(self):
        """ Get rid of all the files (and folders) that we created. """

        shutil.rmtree(self.path)
class Repository:
    """ Encapsulates key information of a repository.  It is being
        used for copy sources only and contains information about
        its NAME, PATH, SHARD_SIZE, HEAD revision and MIN_UNPACKED_REV. """

    def _read_config(self, filename):
        """ Read and return all lines from FILENAME in the 'db' folder
            of this repository.  This will be used to read 'format',
            'current' etc.  The file is read in binary mode, so the
            lines are returned unprocessed (including line endings). """

        with open(os.path.join(self.path, 'db', filename), "rb") as f:
            return f.readlines()

    def __init__(self, parent, name):
        """ Constructor collecting everything we need to know about
            the repository NAME within PARENT folder.

            Reads 'db/format', 'db/min-unpacked-rev' and 'db/current';
            missing files or unexpected contents result in the usual
            IOError / IndexError / ValueError. """

        self.name = name
        self.path = os.path.join(parent, name)

        # The shard size is the third token on the second line of
        # 'db/format' (presumably of the form "layout sharded <N>").
        # Splitting on arbitrary whitespace works for both byte and
        # text strings and drops the trailing newline as well.
        self.shard_size = int(self._read_config('format')[1].split()[2])
        self.min_unpacked_rev = int(self._read_config('min-unpacked-rev')[0])
        self.head = int(self._read_config('current')[0])

    def needs_copy(self, revision):
        """ Return True if REVISION is a revision in this repository
            and is "directly copyable", i.e. is either non-packed or
            the first rev in a packed shard.  Everything else is either
            not a valid rev or already gets / got copied as part of
            some packed shard. """

        if revision > self.head:
            return False
        if revision < self.min_unpacked_rev:
            # Packed: only the first revision of a shard triggers a copy.
            return revision % self.shard_size == 0

        return True

    @classmethod
    def is_repository(cls, path):
        """ Quick check that PATH is (probably) a repository, i.e.
            that it contains a 'db/format' file.
            This is mainly to filter out aux files put next to
            (not inside) the repositories to copy. """

        format_path = os.path.join(path, 'db', 'format')
        return os.path.isfile(format_path)
class Multicopy:
    """ Helper class doing the actual copying.  It copies individual
        revisions and packed shards from the one source repository
        to multiple copies of it.  The copies have the same name
        as the source repo but with numbers 0 .. N-1 appended to it.

        The copy process is being initiated by the constructor
        (copies the repo skeleton w/o revision contents).  Revision
        contents is then copied by successive calls to the copy()
        method. """

    def _init_copy(self, number):
        """ Called from the constructor, this will create copy NUMBER
            of SELF.SOURCE_REPO at SELF.DEST_BASE + NUMBER but omit
            everything below db/revs and db/revprops.

            Must be called with NUMBER = 0, 1, 2, ... in that order
            because the target paths get appended to SELF.DST_REVS and
            SELF.DST_REVPROPS and are then looked up by index. """

        src = self.source_repo.path
        dst = self.dest_base + str(number)

        # Copy the repo skeleton w/o revs and revprops
        shutil.copytree(src, dst,
                        ignore=shutil.ignore_patterns('revs', 'revprops'))

        # Add (empty) revs and revprops folders
        self.dst_revs.append(os.path.join(dst, 'db', 'revs'))
        self.dst_revprops.append(os.path.join(dst, 'db', 'revprops'))

        os.mkdir(self.dst_revs[number])
        os.mkdir(self.dst_revprops[number])

    def _copy_packed_shard(self, shard, number):
        """ Copy packed shard number SHARD from SELF.SOURCE_REPO to
            the copy NUMBER below SELF.DEST_BASE. """

        # Shards are simple subtrees
        pack_name = str(shard) + '.pack'
        src_revs = os.path.join(self.src_revs, pack_name)
        dst_revs = os.path.join(self.dst_revs[number], pack_name)
        src_revprops = os.path.join(self.src_revprops, pack_name)
        dst_revprops = os.path.join(self.dst_revprops[number], pack_name)

        shutil.copytree(src_revs, dst_revs)
        shutil.copytree(src_revprops, dst_revprops)

        # Special case: revprops of rev 0 are never packed => extra copy
        if shard == 0:
            src_revprops = os.path.join(self.src_revprops, '0')
            dst_revprops = os.path.join(self.dst_revprops[number], '0')
            shutil.copytree(src_revprops, dst_revprops)

    def _copy_single_revision(self, revision, number):
        """ Copy non-packed REVISION from SELF.SOURCE_REPO to the copy
            NUMBER below SELF.DEST_BASE. """

        # Floor division: the shard folder name must be an integer
        # ("1", not "1.0") whatever division semantics are in effect.
        shard = str(revision // self.source_repo.shard_size)

        # Auto-create shard folder when copying the first rev of a shard
        if revision % self.source_repo.shard_size == 0:
            os.mkdir(os.path.join(self.dst_revs[number], shard))
            os.mkdir(os.path.join(self.dst_revprops[number], shard))

        # Copy the rev file and the revprop file
        rev_name = str(revision)
        src_rev = os.path.join(self.src_revs, shard, rev_name)
        dst_rev = os.path.join(self.dst_revs[number], shard, rev_name)
        src_revprop = os.path.join(self.src_revprops, shard, rev_name)
        dst_revprop = os.path.join(self.dst_revprops[number], shard, rev_name)

        shutil.copyfile(src_rev, dst_rev)
        shutil.copyfile(src_revprop, dst_revprop)

    def __init__(self, source, target_parent, count):
        """ Initiate the copy process for the SOURCE repository to
            be copied COUNT times into the TARGET_PARENT directory.

            SOURCE must provide the attributes NAME, PATH, SHARD_SIZE
            and MIN_UNPACKED_REV.  The COUNT repository skeletons get
            created right away. """

        self.source_repo = source
        self.dest_base = os.path.join(target_parent, source.name)

        self.src_revs = os.path.join(source.path, 'db', 'revs')
        self.src_revprops = os.path.join(source.path, 'db', 'revprops')

        # Per-copy target folders, indexed by copy number
        self.dst_revs = []
        self.dst_revprops = []
        for i in range(0, count):
            self._init_copy(i)

    def copy(self, revision, number):
        """ Copy (packed or non-packed) REVISION from SELF.SOURCE_REPO
            to the copy NUMBER below SELF.DEST_BASE.

            SELF.SOURCE_REPO.needs_copy(REVISION) must be True. """

        if revision < self.source_repo.min_unpacked_rev:
            # Packed revisions get copied as whole shards.
            self._copy_packed_shard(revision // self.source_repo.shard_size,
                                    number)
        else:
            self._copy_single_revision(revision, number)
def copy_repos(src, dst, count, separator_size):
    """ Under DST, create COUNT copies of all repositories immediately
        below SRC.

        All copies will be "interleaved" such that we copy each individual
        revision / packed shard to all target repos first before
        continuing with the next revision / packed shard.  After each
        round (revision / packed shard) insert a temporary file of
        SEPARATOR_SIZE kBytes on average to add more spacing between
        revisions.  The temp files get automatically removed at the end,
        even if copying fails half-way.

        Please note that this function will clear DST before copying
        anything into it. """

    # Remove any remnants from the target folder and start with an
    # empty one.  DST may not exist at all on the first run.
    if os.path.exists(dst):
        shutil.rmtree(dst)
    os.makedirs(dst)

    # Repositories to copy and the respective copy utilities
    repositories = []
    copies = []

    # Find repositories, initiate copies and determine the range of
    # revisions to copy in total
    max_revision = 0
    for name in os.listdir(src):
        if Repository.is_repository(os.path.join(src, name)):
            repository = Repository(src, name)
            repositories.append(repository)
            copies.append(Multicopy(repository, dst, count))

            max_revision = max(max_revision, repository.head)

    # Temp file collection (spacers)
    separators = Separators(dst, separator_size)

    try:
        # Copy all repos in revision,number-major order
        for revision in range(0, max_revision + 1):
            for number in range(0, count):

                any_copy = False
                for repository, copier in zip(repositories, copies):
                    if repository.needs_copy(revision):
                        any_copy = True
                        copier.copy(revision, number)

                # Don't add spacers when nothing got copied (REVISION is
                # packed in all repositories).
                if any_copy:
                    separators.write()
    finally:
        # Now that all data is in position (or we gave up), remove
        # the spacers
        separators.cleanup()
def show_usage():
    """ Write a simple command line usage description to stdout. """

    # Single-argument print(...) calls produce identical output whether
    # print is a statement or a function; print("") emits an empty line.
    print("Copies and duplicates repositories in a way that mimics larger deployments.")
    print("")
    print("Usage:")
    print("copy_repo.py SRC DST COUNT SEPARATOR_SIZE")
    print("")
    print("SRC Immediate parent folder of all the repositories to copy.")
    print("DST Folder to copy into; current contents will be lost.")
    print("COUNT Number of copies to create of each source repository.")
    print("SEPARATOR_SIZE Additional spacing, in kBytes, between revisions.")
# main function
# Only run when executed as a script, so that importing this module
# does not kick off a copy.  Expects exactly four command line arguments
# (SRC DST COUNT SEPARATOR_SIZE); anything else prints the usage text.
if __name__ == '__main__':
    if len(sys.argv) == 5:
        copy_repos(sys.argv[1], sys.argv[2],
                   int(sys.argv[3]), int(sys.argv[4]))
    else:
        show_usage()
|