# Copyright (C) 2008 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

"""Import processor that dumps stats about the input (and doesn't import)."""
from bzrlib.trace import (
    note,
    warning,
    )

from bzrlib.plugins.fastimport import (
    cache_manager,
    commands,
    helpers,
    processor,
    )
class InfoProcessor(processor.ImportProcessor):
    """An import processor that dumps statistics about the input.

    No changes to the current repository are made.

    As well as providing useful information about an import
    stream before importing it, this processor is useful for
    benchmarking the speed at which data can be extracted from
    the source.
    """

    def __init__(self, target=None, params=None, verbose=0, outf=None):
        # Allow creation without a target
        processor.ImportProcessor.__init__(self, target, params, verbose,
            outf=outf)

    def pre_process(self):
        """Initialise all statistics counters before the stream is parsed."""
        self.note("Collecting statistics ...")
        # Command counts, keyed by command name
        self.cmd_counts = {}
        for cmd in commands.COMMAND_NAMES:
            self.cmd_counts[cmd] = 0
        self.file_cmd_counts = {}
        for fc in commands.FILE_COMMAND_NAMES:
            self.file_cmd_counts[fc] = 0
        # Commit analysis
        self.parent_counts = {}
        self.max_parent_count = 0
        self.committers = set()
        self.separate_authors_found = False
        self.symlinks_found = False
        self.executables_found = False
        self.sha_blob_references = False
        self.lightweight_tags = 0
        # Blob usage tracking: each blob id lives in exactly one of these sets
        self.blobs = {}
        for usage in ['new', 'used', 'unknown', 'unmarked']:
            self.blobs[usage] = set()
        self.blob_ref_counts = {}
        # Head tracking - delegate to the cache manager
        self.cache_mgr = cache_manager.CacheManager(inventory_cache_size=0)
        # Stuff to cache: a map from mark to # of times that mark is merged
        self.merges = {}
        # Stuff to cache: these are maps from mark to sets
        self.rename_old_paths = {}
        self.copy_source_paths = {}

    def post_process(self):
        """Dump the collected statistics to self.outf."""
        # Dump statistics
        cmd_names = commands.COMMAND_NAMES
        fc_names = commands.FILE_COMMAND_NAMES
        self._dump_stats_group("Command counts",
            [(c, self.cmd_counts[c]) for c in cmd_names], str)
        self._dump_stats_group("File command counts",
            [(c, self.file_cmd_counts[c]) for c in fc_names], str)

        # Commit stats
        if self.cmd_counts['commit']:
            p_items = []
            for i in xrange(0, self.max_parent_count + 1):
                if i in self.parent_counts:
                    count = self.parent_counts[i]
                    p_items.append(("parents-%d" % i, count))
            # len(dict) is the number of distinct merged marks; no need to
            # materialise .keys() first
            merges_count = len(self.merges)
            p_items.append(('total revisions merged', merges_count))
            flags = {
                'separate authors found': self.separate_authors_found,
                'executables': self.executables_found,
                'symlinks': self.symlinks_found,
                'blobs referenced by SHA': self.sha_blob_references,
                }
            self._dump_stats_group("Parent counts", p_items, str)
            self._dump_stats_group("Commit analysis", flags.iteritems(), _found)
            heads = helpers.invert_dictset(self.cache_mgr.heads)
            self._dump_stats_group("Head analysis", heads.iteritems(), None,
                _iterable_as_config_list)
            # note("\t%d\t%s" % (len(self.committers), 'unique committers'))
            self._dump_stats_group("Merges", self.merges.iteritems(), None)
            # We only show the rename old path and copy source paths when -vv
            # (verbose=2) is specified. The output here for mysql's data can't
            # be parsed currently so this bit of code needs more work anyhow ..
            if self.verbose >= 2:
                self._dump_stats_group("Rename old paths",
                    self.rename_old_paths.iteritems(), len,
                    _iterable_as_config_list)
                self._dump_stats_group("Copy source paths",
                    self.copy_source_paths.iteritems(), len,
                    _iterable_as_config_list)

        # Blob stats
        if self.cmd_counts['blob']:
            # In verbose mode, don't list every blob used
            if self.verbose:
                del self.blobs['used']
            self._dump_stats_group("Blob usage tracking",
                self.blobs.iteritems(), len, _iterable_as_config_list)
            if self.blob_ref_counts:
                blobs_by_count = helpers.invert_dict(self.blob_ref_counts)
                blob_items = blobs_by_count.items()
                blob_items.sort()
                self._dump_stats_group("Blob reference counts",
                    blob_items, len, _iterable_as_config_list)

        # Other stats
        if self.cmd_counts['reset']:
            reset_stats = {
                'lightweight tags': self.lightweight_tags,
                }
            self._dump_stats_group("Reset analysis", reset_stats.iteritems())

    def _dump_stats_group(self, title, items, normal_formatter=None,
        verbose_formatter=None):
        """Dump a statistics group.

        In verbose mode, do so as a config file so
        that other processors can load the information if they want to.

        :param title: the heading written for the group
        :param items: an iterable of (name, value) pairs
        :param normal_formatter: the callable to apply to the value
          before displaying it in normal mode
        :param verbose_formatter: the callable to apply to the value
          before displaying it in verbose mode
        """
        if self.verbose:
            self.outf.write("[%s]\n" % (title,))
            for name, value in items:
                if verbose_formatter is not None:
                    value = verbose_formatter(value)
                if type(name) == str:
                    # Config-style keys must not contain spaces
                    name = name.replace(' ', '-')
                self.outf.write("%s = %s\n" % (name, value))
            self.outf.write("\n")
        else:
            self.outf.write("%s:\n" % (title,))
            for name, value in items:
                if normal_formatter is not None:
                    value = normal_formatter(value)
                self.outf.write("\t%s\t%s\n" % (value, name))

    def progress_handler(self, cmd):
        """Process a ProgressCommand."""
        self.cmd_counts[cmd.name] += 1

    def blob_handler(self, cmd):
        """Process a BlobCommand."""
        self.cmd_counts[cmd.name] += 1
        if cmd.mark is None:
            self.blobs['unmarked'].add(cmd.id)
        else:
            self.blobs['new'].add(cmd.id)
            # Marks can be re-used so remove it from used if already there.
            # Note: we definitely do NOT want to remove it from multi if
            # it's already in that set.
            # discard() is a no-op when the id isn't present.
            self.blobs['used'].discard(cmd.id)

    def checkpoint_handler(self, cmd):
        """Process a CheckpointCommand."""
        self.cmd_counts[cmd.name] += 1

    def commit_handler(self, cmd):
        """Process a CommitCommand."""
        self.cmd_counts[cmd.name] += 1
        self.committers.add(cmd.committer)
        if cmd.author is not None:
            self.separate_authors_found = True
        for fc in cmd.file_iter():
            self.file_cmd_counts[fc.name] += 1
            if isinstance(fc, commands.FileModifyCommand):
                if fc.is_executable:
                    self.executables_found = True
                if fc.kind == commands.SYMLINK_KIND:
                    self.symlinks_found = True
                if fc.dataref is not None:
                    if fc.dataref[0] == ':':
                        # Mark reference - track its usage
                        self._track_blob(fc.dataref)
                    else:
                        # Assume it's a SHA reference instead
                        self.sha_blob_references = True
            elif isinstance(fc, commands.FileRenameCommand):
                self.rename_old_paths.setdefault(cmd.id, set()).add(fc.old_path)
            elif isinstance(fc, commands.FileCopyCommand):
                self.copy_source_paths.setdefault(cmd.id, set()).add(fc.src_path)

        # Track the heads
        parents = self.cache_mgr.track_heads(cmd)

        # Track the parent counts
        parent_count = len(parents)
        # `in` replaces the deprecated dict.has_key()
        if parent_count in self.parent_counts:
            self.parent_counts[parent_count] += 1
        else:
            self.parent_counts[parent_count] = 1
        if parent_count > self.max_parent_count:
            self.max_parent_count = parent_count

        # Remember the merges
        if cmd.merges:
            #self.merges.setdefault(cmd.ref, set()).update(cmd.merges)
            for merge in cmd.merges:
                if merge in self.merges:
                    self.merges[merge] += 1
                else:
                    self.merges[merge] = 1

    def reset_handler(self, cmd):
        """Process a ResetCommand."""
        self.cmd_counts[cmd.name] += 1
        if cmd.ref.startswith('refs/tags/'):
            self.lightweight_tags += 1
        else:
            if cmd.from_ is not None:
                self.cache_mgr.track_heads_for_ref(cmd.ref, cmd.from_)

    def tag_handler(self, cmd):
        """Process a TagCommand."""
        self.cmd_counts[cmd.name] += 1

    def feature_handler(self, cmd):
        """Process a FeatureCommand."""
        self.cmd_counts[cmd.name] += 1
        feature = cmd.feature_name
        if feature not in commands.FEATURE_NAMES:
            self.warning("feature %s is not supported - parsing may fail"
                % (feature,))

    def _track_blob(self, mark):
        """Record a reference to blob `mark`, moving it between usage sets."""
        if mark in self.blob_ref_counts:
            # Referenced 2+ times already - just bump the count
            self.blob_ref_counts[mark] += 1
        elif mark in self.blobs['used']:
            # Second reference - start explicit counting
            self.blob_ref_counts[mark] = 2
            self.blobs['used'].remove(mark)
        elif mark in self.blobs['new']:
            # First reference to a known blob
            self.blobs['used'].add(mark)
            self.blobs['new'].remove(mark)
        else:
            # Referenced but never defined in this stream
            self.blobs['unknown'].add(mark)
270
"""Format a found boolean as a string."""
271
return ['no', 'found'][b]
273
def _iterable_as_config_list(s):
274
"""Format an iterable as a sequence of comma-separated strings.
276
To match what ConfigObj expects, a single item list has a trailing comma.
280
return "%s," % (items[0],)
282
return ", ".join(items)