3
# copy_repo.py: create multiple, interleaved copies of a set of repositories.
5
# Subversion is a tool for revision control.
6
# See http://subversion.apache.org for more information.
8
# ====================================================================
9
# Licensed to the Apache Software Foundation (ASF) under one
10
# or more contributor license agreements. See the NOTICE file
11
# distributed with this work for additional information
12
# regarding copyright ownership. The ASF licenses this file
13
# to you under the Apache License, Version 2.0 (the
14
# "License"); you may not use this file except in compliance
15
# with the License. You may obtain a copy of the License at
17
# http://www.apache.org/licenses/LICENSE-2.0
19
# Unless required by applicable law or agreed to in writing,
20
# software distributed under the License is distributed on an
21
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
22
# KIND, either express or implied. See the License for the
23
# specific language governing permissions and limitations
25
######################################################################
34
""" This class is a container for dummy / filler files.
35
It will be used to create spaces between repository
36
versions on disk, i.e. to simulate some aspect of
37
real-world FS fragmentation.
39
It gets initialized with some parent path as well as
40
the desired average file size and will create a new
41
such file with each call to write(). Automatic
42
sharding keeps FS specific overhead at bay. Call
43
cleanup() to eventually delete all dummy files. """
46
""" Write this non-NULL contents into the dummy files. """
48
def __init__(self, path, average_size):
    """ Initialize and store all dummy files in a '__tmp'
        sub-folder of PATH.  The size of each dummy file
        is a random value and will be slightly above
        AVERAGE_SIZE kBytes on average.  A value of 0 will
        effectively disable dummy file creation. """

    self.path = os.path.join(path, '__tmp')
    self.size = average_size

    # Running number of dummy files; the file-writing code of this
    # class derives file names and shard folders from it.
    # NOTE(review): this initialization and the makedirs() call below
    # were missing from the listing and have been restored from how
    # SELF.COUNT and SELF.PATH are used elsewhere - confirm.
    self.count = 0

    # Start with an existing but empty folder so that a later removal
    # of SELF.PATH succeeds even if no dummy file was ever written.
    if os.path.exists(self.path):
        shutil.rmtree(self.path)
    os.makedirs(self.path)
65
""" Add a new dummy file """
67
# Throw dice of a file size.
68
# Factor 1024 for kBytes, factor 2 for being an average.
69
size = (int)(float(self.size) * random.random() * 2 * 1024.0)
71
# Don't create empty files. This also implements the
72
# "average = 0 means no files" rule.
76
# Create a new shard for every 1000 files
77
subfolder = os.path.join(self.path, str(self.count / 1000))
78
if not os.path.exists(subfolder):
81
# Create and write the file in 4k chunks.
82
# Writing full chunks will result in average file sizes
83
# being slightly above the SELF.SIZE. That's good enough
85
f = open(os.path.join(subfolder, str(self.count)), "wb")
88
size -= len(self.buffer)
93
""" Get rid of all the files (and folders) that we created. """
95
shutil.rmtree(self.path)
98
""" Encapsulates key information of a repository. It is being
99
used for copy sources only and contains information about
100
its NAME, PATH, SHARD_SIZE, HEAD revision and MIN_UNPACKED_REV. """
102
def _read_config(self, filename):
    """ Read and return all lines from FILENAME in the 'db' folder
        of this repository.  This will be used to read 'format',
        'current' etc.  The file is opened in binary mode, so the
        lines are returned as stored on disk, line endings included. """

    # NOTE(review): the statements following readlines() were missing
    # from the listing; returning the lines follows from the docstring.
    # The 'with' block guarantees the handle gets closed.
    with open(os.path.join(self.path, 'db', filename), "rb") as f:
        return f.readlines()
112
def __init__(self, parent, name):
    """ Constructor collecting everything we need to know about
        the repository NAME within PARENT folder: its NAME, PATH,
        SHARD_SIZE, MIN_UNPACKED_REV and HEAD revision. """

    # NOTE(review): the NAME assignment was missing from the listing;
    # the copy helper reads SOURCE.NAME, so it has been restored.
    self.name = name
    self.path = os.path.join(parent, name)

    # The shard size is the third token on the second line of
    # 'format' (presumably "layout sharded <size>" - confirm).
    # split() without arguments tolerates repeated blanks and works
    # for byte strings as well as text.
    self.shard_size = int(self._read_config('format')[1].split()[2])
    self.min_unpacked_rev = int(self._read_config('min-unpacked-rev')[0])
    self.head = int(self._read_config('current')[0])
123
def needs_copy(self, revision):
    """ Return True if REVISION is a revision in this repository
        and is "directly copyable", i.e. is either non-packed or
        the first rev in a packed shard.  Everything else is either
        not a valid rev or already gets / got copied as part of
        some packed shard. """

    # NOTE(review): both plain return statements were missing from the
    # listing and have been restored according to the docstring.
    # Revisions outside 0 .. HEAD do not exist in this repository.
    if revision < 0 or revision > self.head:
        return False

    # Packed shards are copied as a whole when their first revision
    # comes up; all other revisions of that shard are skipped.
    if revision < self.min_unpacked_rev:
        return revision % self.shard_size == 0

    # Non-packed revisions are copied one by one.
    return True
138
@classmethod
def is_repository(cls, path):
    """ Quick check that PATH is (probably) a repository.
        This is mainly to filter out aux files put next to
        (not inside) the repositories to copy.

        Return True if PATH contains a 'db/format' file. """

    # NOTE(review): the decorator is not part of the listing; it has
    # been restored because this method takes CLS and gets called on
    # the class itself with PATH as the only argument.
    return os.path.isfile(os.path.join(path, 'db', 'format'))
147
""" Helper class doing the actual copying. It copies individual
148
revisions and packed shards from the one source repository
149
to multiple copies of it. The copies have the same name
150
as the source repo but with numbers 0 .. N-1 appended to it.
152
The copy process is being initiated by the constructor
153
(copies the repo skeleton w/o revision contents). Revision
154
contents is then copied by successive calls to the copy()
157
def _init_copy(self, number):
    """ Called from the constructor, this will copy SELF.SOURCE_REPO
        into the new repo SELF.DEST_BASE + NUMBER but omit everything
        below db/revs and db/revprops.  Both folders get created empty
        and their paths are appended to SELF.DST_REVS and
        SELF.DST_REVPROPS, respectively. """

    src = self.source_repo.path
    dst = self.dest_base + str(number)

    # Copy the repo skeleton w/o revs and revprops.
    # Note that the patterns match any entry of that name within
    # the tree, not just those directly below 'db'.
    skip_contents = shutil.ignore_patterns('revs', 'revprops')
    shutil.copytree(src, dst, ignore=skip_contents)

    # Add empty revs and revprops folders.  Create exactly the folders
    # we register instead of looking them up by NUMBER again.
    revs = os.path.join(dst, 'db', 'revs')
    revprops = os.path.join(dst, 'db', 'revprops')
    self.dst_revs.append(revs)
    self.dst_revprops.append(revprops)

    os.mkdir(revs)
    os.mkdir(revprops)
175
def _copy_packed_shard(self, shard, number):
    """ Copy packed shard number SHARD from SELF.SOURCE_REPO to
        the copy NUMBER below SELF.DEST_BASE.  This copies the
        '<SHARD>.pack' folders of both, revs and revprops. """

    # Shards are simple subtrees.  '%d' keeps the folder name
    # integral even if SHARD is the result of a float division.
    pack_name = '%d.pack' % shard

    shutil.copytree(os.path.join(self.src_revs, pack_name),
                    os.path.join(self.dst_revs[number], pack_name))
    shutil.copytree(os.path.join(self.src_revprops, pack_name),
                    os.path.join(self.dst_revprops[number], pack_name))

    # Special case: revprops of rev 0 are never packed => extra copy
    # NOTE(review): the condition guarding this special case was
    # missing from the listing; "shard == 0" follows from the comment
    # above and keeps us from copying the same folder twice - confirm.
    if shard == 0:
        shutil.copytree(os.path.join(self.src_revprops, '0'),
                        os.path.join(self.dst_revprops[number], '0'))
195
def _copy_single_revision(self, revision, number):
    """ Copy non-packed REVISION from SELF.SOURCE_REPO to the copy
        NUMBER below SELF.DEST_BASE.  Revisions of the same shard
        must be copied in ascending order because the shard folders
        get created when the first revision of a shard is copied. """

    shard_size = self.source_repo.shard_size

    # Floor division: the shard folder name must be an integer
    # ("12", not "12.0") no matter how "/" behaves for ints.
    shard = str(revision // shard_size)

    dst_rev_dir = os.path.join(self.dst_revs[number], shard)
    dst_revprop_dir = os.path.join(self.dst_revprops[number], shard)

    # Auto-create shard folder
    if revision % shard_size == 0:
        os.mkdir(dst_rev_dir)
        os.mkdir(dst_revprop_dir)

    # Copy the rev file and the revprop file
    rev_name = str(revision)
    shutil.copyfile(os.path.join(self.src_revs, shard, rev_name),
                    os.path.join(dst_rev_dir, rev_name))
    shutil.copyfile(os.path.join(self.src_revprops, shard, rev_name),
                    os.path.join(dst_revprop_dir, rev_name))
215
def __init__(self, source, target_parent, count):
    """ Initiate the copy process for the SOURCE repository to
        be copied COUNT times into the TARGET_PARENT directory.
        This creates the skeletons of all COUNT copies. """

    self.source_repo = source
    self.dest_base = os.path.join(target_parent, source.name)

    self.src_revs = os.path.join(source.path, 'db', 'revs')
    self.src_revprops = os.path.join(source.path, 'db', 'revprops')

    # Per-copy revs / revprops folders; _init_copy() appends to them.
    # NOTE(review): the DST_REVS initialization and the loop body
    # were missing from the listing; both have been restored from
    # what _init_copy() expects and documents - confirm.
    self.dst_revs = []
    self.dst_revprops = []
    for i in range(0, count):
        self._init_copy(i)
230
def copy(self, revision, number):
    """ Copy (packed or non-packed) REVISION from SELF.SOURCE_REPO
        to the copy NUMBER below SELF.DEST_BASE.

        SELF.SOURCE_REPO.needs_copy(REVISION) must be True. """

    repo = self.source_repo

    # NOTE(review): the "else" was missing from the listing; a revision
    # is either part of a packed shard or a single file, never both.
    if revision < repo.min_unpacked_rev:
        # Floor division keeps the shard number an integer.
        self._copy_packed_shard(revision // repo.shard_size, number)
    else:
        self._copy_single_revision(revision, number)
241
def copy_repos(src, dst, count, separator_size):
    """ Under DST, create COUNT copies of all repositories immediately
        below SRC.

        All copies will be "interleaved" such that we copy each
        individual revision / packed shard to all target repos first
        before continuing with the next revision / packed shard.  After
        each round (revision / packed shard) insert a temporary file of
        SEPARATOR_SIZE kBytes on average to add more spacing between
        revisions.  The temp files get automatically removed at the end.

        Please note that this function will clear DST before copying
        anything into it. """

    # NOTE(review): several statements of this function were missing
    # from the listing (clearing DST, initializing the lists and the
    # revision counter, the "any_copy" bookkeeping, writing and removing
    # the spacers).  They have been restored from the comments that
    # survived - confirm against the original script.

    # Remove any remnants from the target folder.
    # (DST gets auto-created by the first repo copy.)
    if os.path.exists(dst):
        shutil.rmtree(dst)

    # Repositories to copy and the respective copy utilities
    repositories = []
    copies = []

    # Find repositories, initiate copies and determine the range of
    # revisions to copy in total
    max_revision = 0
    for name in os.listdir(src):
        if Repository.is_repository(os.path.join(src, name)):
            repository = Repository(src, name)
            repositories.append(repository)
            copies.append(Multicopy(repository, dst, count))

            if repository.head > max_revision:
                max_revision = repository.head

    # Temp file collection (spacers)
    separators = Separators(dst, separator_size)

    # Copy all repos in revision,number-major order
    for revision in xrange(0, max_revision + 1):
        for number in xrange(0, count):

            any_copy = False
            for repository, copier in zip(repositories, copies):
                if repository.needs_copy(revision):
                    any_copy = True
                    copier.copy(revision, number)

            # Don't add spacers when nothing got copied (REVISION is
            # packed in all repositories).
            if any_copy:
                separators.write()

    # Now that all data is in position, remove the spacers
    separators.cleanup()
297
""" Write a simple CL docstring """
299
print "Copies and duplicates repositories in a way that mimics larger deployments."
302
print "copy_repo.py SRC DST COUNT SEPARATOR_SIZE"
304
print "SRC Immediate parent folder of all the repositories to copy."
305
print "DST Folder to copy into; current contents will be lost."
306
print "COUNT Number of copies to create of each source repository."
307
print "SEPARATOR_SIZE Additional spacing, in kBytes, between revisions."
311
copy_repos(sys.argv[1], sys.argv[2], int(sys.argv[3]), int(sys.argv[4]))