1
# Copyright (C) 2008 Canonical Ltd
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
# GNU General Public License for more details.
13
# You should have received a copy of the GNU General Public License
14
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""Import processor that dumps stats about the input (and doesn't import)."""
18
from bzrlib.plugins.fastimport import (
21
from fastimport import (
25
from fastimport.helpers import (
32
class InfoProcessor(processor.ImportProcessor):
33
"""An import processor that dumps statistics about the input.
35
No changes to the current repository are made.
37
As well as providing useful information about an import
38
stream before importing it, this processor is useful for
39
benchmarking the speed at which data can be extracted from
43
def __init__(self, params=None, verbose=0, outf=None):
# Forwards params and verbose to the ImportProcessor base initialiser.
# NOTE(review): the call below is cut off after ``verbose,`` -- the remaining
# argument(s) and the closing parenthesis are missing from this copy
# (presumably ``outf`` is passed on as well; confirm against upstream).
44
processor.ImportProcessor.__init__(self, params, verbose,
47
def pre_process(self):
    """Create every statistic that the handlers and post_process() use.

    All counters start at zero, all flags start as False and all
    caches start empty.
    """
    # Per-command tallies, seeded with zero for every known name so that
    # the handlers can simply use ``+= 1``.
    self.cmd_counts = dict.fromkeys(commands.COMMAND_NAMES, 0)
    self.file_cmd_counts = dict.fromkeys(commands.FILE_COMMAND_NAMES, 0)

    # Commit statistics
    self.parent_counts = {}
    self.max_parent_count = 0
    self.committers = set()
    self.separate_authors_found = False
    self.symlinks_found = False
    self.executables_found = False
    self.sha_blob_references = False
    self.lightweight_tags = 0

    # Blob usage tracking: one set of blob ids per usage category. Each
    # category needs its own set object, so dict.fromkeys() (which would
    # share a single set) is deliberately not used here.
    self.blobs = {}
    for usage in ['new', 'used', 'unknown', 'unmarked']:
        self.blobs[usage] = set()
    self.blob_ref_counts = {}

    # Head tracking
    self.reftracker = reftracker.RefTracker()

    # Stuff to cache: a map from mark to # of times that mark is merged
    self.merges = {}
    # Stuff to cache: these are maps from a commit id to a set of paths
    self.rename_old_paths = {}
    self.copy_source_paths = {}
76
def post_process(self):
# Writes the collected statistics, one titled group at a time, through
# self._dump_stats_group().
# NOTE(review): this copy has lost its indentation and some lines (the
# embedded numbering skips values), so a few names used below have no
# visible definition; those places are flagged individually.
78
cmd_names = commands.COMMAND_NAMES
79
fc_names = commands.FILE_COMMAND_NAMES
80
# Both count groups are listed in the order of the name lists above, with
# str passed as the normal-mode formatter.
self._dump_stats_group("Command counts",
81
[(c, self.cmd_counts[c]) for c in cmd_names], str)
82
self._dump_stats_group("File command counts",
83
[(c, self.file_cmd_counts[c]) for c in fc_names], str)
86
# Commit statistics are gated on at least one commit having been counted.
if self.cmd_counts['commit']:
# NOTE(review): p_items is appended to below but its initialisation is not
# visible in this copy (presumably an empty list) -- confirm.
88
# One "parents-N" entry for each parent count that actually occurred.
for i in xrange(0, self.max_parent_count + 1):
89
if i in self.parent_counts:
90
count = self.parent_counts[i]
91
p_items.append(("parents-%d" % i, count))
92
merges_count = len(self.merges.keys())
93
p_items.append(('total revisions merged', merges_count))
# NOTE(review): the four entries below look like the body of a dict literal
# whose opening line and closing brace are missing; ``flags`` is used further
# down, so presumably this was ``flags = {...}`` -- confirm.
95
'separate authors found': self.separate_authors_found,
96
'executables': self.executables_found,
97
'symlinks': self.symlinks_found,
98
'blobs referenced by SHA': self.sha_blob_references,
100
self._dump_stats_group("Parent counts", p_items, str)
101
self._dump_stats_group("Commit analysis", flags.iteritems(), _found)
102
heads = invert_dictset(self.reftracker.heads)
103
self._dump_stats_group("Head analysis", heads.iteritems(), None,
104
_iterable_as_config_list)
105
# note("\t%d\t%s" % (len(self.committers), 'unique committers'))
106
self._dump_stats_group("Merges", self.merges.iteritems(), None)
107
# We only show the rename old path and copy source paths when -vv
108
# (verbose=2) is specified. The output here for mysql's data can't
109
# be parsed currently so this bit of code needs more work anyhow ..
110
if self.verbose >= 2:
111
self._dump_stats_group("Rename old paths",
112
self.rename_old_paths.iteritems(), len,
113
_iterable_as_config_list)
114
self._dump_stats_group("Copy source paths",
115
self.copy_source_paths.iteritems(), len,
116
_iterable_as_config_list)
119
# Blob statistics are gated on at least one blob having been counted.
if self.cmd_counts['blob']:
120
# In verbose mode, don't list every blob used
# NOTE(review): the embedded numbering skips 121, so the condition that
# presumably guards the ``del`` below is missing from this copy.
122
del self.blobs['used']
123
self._dump_stats_group("Blob usage tracking",
124
self.blobs.iteritems(), len, _iterable_as_config_list)
125
if self.blob_ref_counts:
126
blobs_by_count = invert_dict(self.blob_ref_counts)
127
blob_items = blobs_by_count.items()
# NOTE(review): the embedded numbering skips 128 here, so one statement
# between building blob_items and dumping it is missing from this copy.
129
self._dump_stats_group("Blob reference counts",
130
blob_items, len, _iterable_as_config_list)
133
# Reset statistics are gated on at least one reset having been counted.
if self.cmd_counts['reset']:
# NOTE(review): as with ``flags`` above, the dict literal that defines
# ``reset_stats`` has lost its opening line and closing brace.
135
'lightweight tags': self.lightweight_tags,
137
self._dump_stats_group("Reset analysis", reset_stats.iteritems())
139
def _dump_stats_group(self, title, items, normal_formatter=None,
140
verbose_formatter=None):
141
"""Dump a statistics group.
143
In verbose mode, do so as a config file so
144
that other processors can load the information if they want to.
145
:param normal_formatter: the callable to apply to the value
146
before displaying it in normal mode
147
:param verbose_formatter: the callable to apply to the value
148
before displaying it in verbose mode
151
self.outf.write("[%s]\n" % (title,))
152
for name, value in items:
153
if verbose_formatter is not None:
154
value = verbose_formatter(value)
155
if type(name) == str:
156
name = name.replace(' ', '-')
157
self.outf.write("%s = %s\n" % (name, value))
158
self.outf.write("\n")
160
self.outf.write("%s:\n" % (title,))
161
for name, value in items:
162
if normal_formatter is not None:
163
value = normal_formatter(value)
164
self.outf.write("\t%s\t%s\n" % (value, name))
166
def progress_handler(self, cmd):
    """Process a ProgressCommand.

    Only the per-command tally is updated.

    :param cmd: the command being processed; ``cmd.name`` selects the
      entry of ``self.cmd_counts`` to increment
    :raises KeyError: if ``cmd.name`` has no entry in ``self.cmd_counts``
    """
    self.cmd_counts[cmd.name] += 1
170
def blob_handler(self, cmd):
    """Process a BlobCommand.

    Counts the command and files the blob's id under either the
    'unmarked' or the 'new' usage category.

    :param cmd: the command being processed; ``name``, ``mark`` and
      ``id`` are read from it
    """
    self.cmd_counts[cmd.name] += 1
    # NOTE(review): the test that separates the two categories was lost
    # from this copy of the file; ``cmd.mark is None`` is assumed from the
    # 'unmarked' category name -- confirm against BlobCommand.
    if cmd.mark is None:
        self.blobs['unmarked'].add(cmd.id)
    else:
        self.blobs['new'].add(cmd.id)
        # Marks can be re-used so remove it from used if already there;
        # discard() is a no-op when the id is absent.
        # Note: we definitely do NOT want to remove it from multi if
        # it's already in that set (no 'multi' set is visible in this
        # file -- presumably this refers to self.blob_ref_counts, which
        # is left untouched here).
        self.blobs['used'].discard(cmd.id)
185
def checkpoint_handler(self, cmd):
    """Process a CheckpointCommand.

    Only the per-command tally is updated.

    :param cmd: the command being processed; ``cmd.name`` selects the
      entry of ``self.cmd_counts`` to increment
    :raises KeyError: if ``cmd.name`` has no entry in ``self.cmd_counts``
    """
    self.cmd_counts[cmd.name] += 1
189
def commit_handler(self, cmd):
190
"""Process a CommitCommand."""
# Tallies the commit, records its committer, classifies each file command
# it carries, then updates the parent-count and merge statistics.
# NOTE(review): indentation and a few lines are missing from this copy (the
# embedded numbering skips values); the gaps are flagged where they occur.
191
self.cmd_counts[cmd.name] += 1
192
self.committers.add(cmd.committer)
193
if cmd.author is not None:
194
self.separate_authors_found = True
195
# Inspect every file command attached to this commit.
for fc in cmd.iter_files():
196
self.file_cmd_counts[fc.name] += 1
197
if isinstance(fc, commands.FileModifyCommand):
# NOTE(review): the numbering skips 198, so the condition that decides
# whether the flag below is set is missing from this copy.
199
self.executables_found = True
200
if stat.S_ISLNK(fc.mode):
201
self.symlinks_found = True
202
if fc.dataref is not None:
203
# A dataref starting with ':' is handed to _track_blob() as a mark.
if fc.dataref[0] == ':':
204
self._track_blob(fc.dataref)
# NOTE(review): the numbering skips 205 (presumably an ``else:``), so the
# flag below is probably meant for datarefs that do not start with ':'.
206
self.sha_blob_references = True
207
# Renames and copies remember the source path, grouped by the commit's id.
elif isinstance(fc, commands.FileRenameCommand):
208
self.rename_old_paths.setdefault(cmd.id, set()).add(fc.old_path)
209
elif isinstance(fc, commands.FileCopyCommand):
210
self.copy_source_paths.setdefault(cmd.id, set()).add(fc.src_path)
213
parents = self.reftracker.track_heads(cmd)
215
# Track the parent counts
216
parent_count = len(parents)
217
# dict.has_key() exists only in Python 2.
if self.parent_counts.has_key(parent_count):
218
self.parent_counts[parent_count] += 1
# NOTE(review): the numbering skips 219 (presumably an ``else:``) between
# the increment above and the first-time assignment below.
220
self.parent_counts[parent_count] = 1
221
if parent_count > self.max_parent_count:
222
self.max_parent_count = parent_count
224
# Remember the merges
# NOTE(review): the numbering skips 225, so a line before the loop below
# (possibly a guard on cmd.merges) is missing from this copy.
226
#self.merges.setdefault(cmd.ref, set()).update(cmd.merges)
227
# self.merges counts how many times each merged item has been seen.
for merge in cmd.merges:
228
if merge in self.merges:
229
self.merges[merge] += 1
# NOTE(review): the numbering skips 230 (presumably an ``else:``).
231
self.merges[merge] = 1
233
def reset_handler(self, cmd):
234
"""Process a ResetCommand."""
235
self.cmd_counts[cmd.name] += 1
236
# A reset whose ref starts with 'refs/tags/' is counted as a lightweight tag.
if cmd.ref.startswith('refs/tags/'):
237
self.lightweight_tags += 1
# NOTE(review): the embedded numbering skips 238 here, so a line (possibly
# an ``else:``) is missing -- it is unclear from this copy whether tag resets
# also reach the head tracking below.
239
if cmd.from_ is not None:
240
# NOTE(review): this call is cut off; its arguments and closing parenthesis
# are missing from this copy.
self.reftracker.track_heads_for_ref(
243
def tag_handler(self, cmd):
    """Process a TagCommand.

    Only the per-command tally is updated.

    :param cmd: the command being processed; ``cmd.name`` selects the
      entry of ``self.cmd_counts`` to increment
    :raises KeyError: if ``cmd.name`` has no entry in ``self.cmd_counts``
    """
    self.cmd_counts[cmd.name] += 1
247
def feature_handler(self, cmd):
    """Process a FeatureCommand.

    Counts the command and warns when the named feature is not listed
    in ``commands.FEATURE_NAMES``.

    :param cmd: the command being processed; ``name`` and
      ``feature_name`` are read from it
    """
    self.cmd_counts[cmd.name] += 1
    feature = cmd.feature_name
    if feature not in commands.FEATURE_NAMES:
        # The message is interpolated before the call, so it does not
        # depend on how warning() treats extra arguments.
        self.warning("feature %s is not supported - parsing may fail"
            % (feature,))
255
def _track_blob(self, mark):
256
if mark in self.blob_ref_counts:
257
self.blob_ref_counts[mark] += 1
259
elif mark in self.blobs['used']:
260
self.blob_ref_counts[mark] = 2
261
self.blobs['used'].remove(mark)
262
elif mark in self.blobs['new']:
263
self.blobs['used'].add(mark)
264
self.blobs['new'].remove(mark)
266
self.blobs['unknown'].add(mark)
269
"""Format a found boolean as a string."""
270
return ['no', 'found'][b]
272
def _iterable_as_config_list(s):
273
"""Format an iterable as a sequence of comma-separated strings.
275
To match what ConfigObj expects, a single item list has a trailing comma.
279
return "%s," % (items[0],)
281
return ", ".join(items)