~ubuntu-branches/debian/jessie/bzr-fastimport/jessie

« back to all changes in this revision

Viewing changes to processors/info_processor.py

Committer: Bazaar Package Importer
Author(s): Jelmer Vernooij
Date: 2010-11-06 18:40:27 UTC
mfrom: (1.1.6 upstream)
Revision ID: james.westby@ubuntu.com-20101106184027-iclo8iim9equ6i8b

Tags: 0.9.0+bzr279-1

* New upstream snapshot.
* Bump standards version to 3.9.1 (no changes).
* Run testsuite during package build.

files added:
.pc

.pc/.version

.pc/applied-patches

.pc/debian-changes-0.9.0+bzr279-1

.pc/debian-changes-0.9.0+bzr279-1/tests

.pc/debian-changes-0.9.0+bzr279-1/tests/test_branch_mapper.py

.pc/debian-changes-0.9.0+bzr279-1/tests/test_filter_processor.py

.pc/debian-changes-0.9.0+bzr279-1/tests/test_head_tracking.py

.pc/debian-changes-0.9.0+bzr279-1/tests/test_helpers.py

debian/patches

debian/patches/debian-changes-0.9.0+bzr279-1

debian/patches/series

exporters/darcs/t/testimport-gitsymlink.sh

files removed:
commands.py

dates.py

errors.py

idmapfile.py

parser.py

processor.py

processors/filter_processor.py

processors/info_processor.py

processors/query_processor.py

tests/test_commands.py

tests/test_errors.py

tests/test_parser.py

files modified:
NEWS

README.txt

__init__.py

branch_updater.py

bzr_commit_handler.py

bzr_exporter.py

cache_manager.py

debian/changelog

debian/control

debian/rules

exporters/darcs/TODO

exporters/darcs/darcs-fast-export

exporters/darcs/darcs-fast-import

exporters/darcs/darcs-fast-import.txt

exporters/darcs/git-darcs

exporters/darcs/git-darcs.txt

exporters/darcs/t/lib.sh

exporters/darcs/x2d

exporters/darcs/x2d.txt

helpers.py

processors/generic_processor.py

revision_store.py

setup.py

tests/__init__.py

tests/test_generic_processor.py

tests/test_revision_store.py

Show diffs side-by-side

added added

removed removed

processors/info_processor.py

# This program is free software; you can redistribute it and/or modify

# it under the terms of the GNU General Public License as published by

# the Free Software Foundation; either version 2 of the License, or

# (at your option) any later version.

# This program is distributed in the hope that it will be useful,

# but WITHOUT ANY WARRANTY; without even the implied warranty of

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License

# along with this program; if not, write to the Free Software

# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

"""Import processor that dumps stats about the input (and doesn't import)."""

from bzrlib.trace import (

note,

warning,

)

from bzrlib.plugins.fastimport import (

cache_manager,

commands,

helpers,

processor,

)

class InfoProcessor(processor.ImportProcessor):
    """An import processor that dumps statistics about the input.

    No changes to the current repository are made.

    As well as providing useful information about an import
    stream before importing it, this processor is useful for
    benchmarking the speed at which data can be extracted from
    the source.

    The ``*_handler`` methods only accumulate counters and sets;
    ``post_process`` is what writes the report to ``self.outf``.
    ``self.note``, ``self.warning``, ``self.verbose`` and ``self.outf``
    are not defined in this class - presumably they come from
    ``processor.ImportProcessor`` (confirm against the base class).
    """

    def __init__(self, target=None, params=None, verbose=0, outf=None):
        """Create the processor; all arguments are forwarded to the base."""
        # Allow creation without a target
        processor.ImportProcessor.__init__(self, target, params, verbose,
                                           outf=outf)

    def pre_process(self):
        """Reset every statistic that the handlers accumulate."""
        self.note("Collecting statistics ...")

        # Init statistics: one zeroed counter per known command name
        self.cmd_counts = dict.fromkeys(commands.COMMAND_NAMES, 0)
        self.file_cmd_counts = dict.fromkeys(commands.FILE_COMMAND_NAMES, 0)
        # Maps a parent count to the number of commits having that many
        self.parent_counts = {}
        self.max_parent_count = 0
        self.committers = set()
        self.separate_authors_found = False
        self.symlinks_found = False
        self.executables_found = False
        self.sha_blob_references = False
        self.lightweight_tags = 0

        # Blob usage tracking: each usage category maps to a set of ids
        self.blobs = {}
        for usage in ['new', 'used', 'unknown', 'unmarked']:
            self.blobs[usage] = set()
        # Marks referenced more than once, mapped to their reference count
        self.blob_ref_counts = {}

        # Head tracking - delegate to the cache manager
        self.cache_mgr = cache_manager.CacheManager(inventory_cache_size=0)

        # Stuff to cache: a map from mark to # of times that mark is merged
        self.merges = {}

        # Stuff to cache: these are maps from mark to sets
        self.rename_old_paths = {}
        self.copy_source_paths = {}

    def post_process(self):
        """Write the collected statistics to ``self.outf``, group by group.

        Commit, blob and reset groups are only written when at least one
        command of the corresponding kind was seen.
        """
        # Dump statistics
        cmd_names = commands.COMMAND_NAMES
        fc_names = commands.FILE_COMMAND_NAMES
        self._dump_stats_group("Command counts",
            [(c, self.cmd_counts[c]) for c in cmd_names], str)
        self._dump_stats_group("File command counts",
            [(c, self.file_cmd_counts[c]) for c in fc_names], str)

        # Commit stats
        if self.cmd_counts['commit']:
            p_items = []
            for i in xrange(0, self.max_parent_count + 1):
                if i in self.parent_counts:
                    p_items.append(("parents-%d" % i, self.parent_counts[i]))
            p_items.append(('total revisions merged', len(self.merges)))
            flags = {
                'separate authors found': self.separate_authors_found,
                'executables': self.executables_found,
                'symlinks': self.symlinks_found,
                'blobs referenced by SHA': self.sha_blob_references,
                }
            self._dump_stats_group("Parent counts", p_items, str)
            self._dump_stats_group("Commit analysis", flags.iteritems(),
                _found)
            heads = helpers.invert_dictset(self.cache_mgr.heads)
            self._dump_stats_group("Head analysis", heads.iteritems(), None,
                _iterable_as_config_list)
            # NOTE: self.committers is collected by commit_handler but is
            # not reported anywhere in this method.
            self._dump_stats_group("Merges", self.merges.iteritems(), None)
            # We only show the rename old path and copy source paths when -vv
            # (verbose=2) is specified. The output here for mysql's data can't
            # be parsed currently so this bit of code needs more work anyhow ..
            if self.verbose >= 2:
                self._dump_stats_group("Rename old paths",
                    self.rename_old_paths.iteritems(), len,
                    _iterable_as_config_list)
                self._dump_stats_group("Copy source paths",
                    self.copy_source_paths.iteritems(), len,
                    _iterable_as_config_list)

        # Blob stats
        if self.cmd_counts['blob']:
            blob_groups = self.blobs.iteritems()
            if self.verbose:
                # In verbose mode, don't list every blob used.  The group is
                # filtered out of the report rather than deleted from
                # self.blobs, so the collected data stays intact and this
                # method can safely be called again.
                blob_groups = [(usage, ids) for usage, ids in blob_groups
                               if usage != 'used']
            self._dump_stats_group("Blob usage tracking",
                blob_groups, len, _iterable_as_config_list)
        if self.blob_ref_counts:
            blobs_by_count = helpers.invert_dict(self.blob_ref_counts)
            blob_items = sorted(blobs_by_count.items())
            self._dump_stats_group("Blob reference counts",
                blob_items, len, _iterable_as_config_list)

        # Other stats
        if self.cmd_counts['reset']:
            reset_stats = {
                'lightweight tags': self.lightweight_tags,
                }
            self._dump_stats_group("Reset analysis", reset_stats.iteritems())

    def _dump_stats_group(self, title, items, normal_formatter=None,
                          verbose_formatter=None):
        """Dump a statistics group.

        In verbose mode, do so as a config file so
        that other processors can load the information if they want to.

        :param title: heading of the group; written as ``[title]`` in
          verbose mode and as ``title:`` otherwise
        :param items: an iterable of (name, value) pairs
        :param normal_formatter: the callable to apply to the value
          before displaying it in normal mode
        :param verbose_formatter: the callable to apply to the value
          before displaying it in verbose mode
        """
        if self.verbose:
            self.outf.write("[%s]\n" % (title,))
            for name, value in items:
                if verbose_formatter is not None:
                    value = verbose_formatter(value)
                # Only string names get their spaces replaced; other key
                # types (e.g. integer counts) are written unchanged.
                if isinstance(name, str):
                    name = name.replace(' ', '-')
                self.outf.write("%s = %s\n" % (name, value))
            self.outf.write("\n")
        else:
            self.outf.write("%s:\n" % (title,))
            for name, value in items:
                if normal_formatter is not None:
                    value = normal_formatter(value)
                self.outf.write("\t%s\t%s\n" % (value, name))

    def progress_handler(self, cmd):
        """Process a ProgressCommand."""
        self.cmd_counts[cmd.name] += 1

    def blob_handler(self, cmd):
        """Process a BlobCommand."""
        self.cmd_counts[cmd.name] += 1
        if cmd.mark is None:
            self.blobs['unmarked'].add(cmd.id)
        else:
            self.blobs['new'].add(cmd.id)
            # Marks can be re-used so remove it from used if already there.
            # Note: we definitely do NOT want to remove it from multi if
            # it's already in that set ("multi" presumably means
            # blob_ref_counts - confirm).
            self.blobs['used'].discard(cmd.id)

    def checkpoint_handler(self, cmd):
        """Process a CheckpointCommand."""
        self.cmd_counts[cmd.name] += 1

    def commit_handler(self, cmd):
        """Process a CommitCommand."""
        self.cmd_counts[cmd.name] += 1
        self.committers.add(cmd.committer)
        if cmd.author is not None:
            self.separate_authors_found = True
        for fc in cmd.file_iter():
            self.file_cmd_counts[fc.name] += 1
            if isinstance(fc, commands.FileModifyCommand):
                if fc.is_executable:
                    self.executables_found = True
                if fc.kind == commands.SYMLINK_KIND:
                    self.symlinks_found = True
                if fc.dataref is not None:
                    # A dataref starting with ':' is tracked as a mark;
                    # anything else is counted as a reference by SHA.
                    if fc.dataref[0] == ':':
                        self._track_blob(fc.dataref)
                    else:
                        self.sha_blob_references = True
            elif isinstance(fc, commands.FileRenameCommand):
                self.rename_old_paths.setdefault(cmd.id, set()).add(
                    fc.old_path)
            elif isinstance(fc, commands.FileCopyCommand):
                self.copy_source_paths.setdefault(cmd.id, set()).add(
                    fc.src_path)

        # Track the heads
        parents = self.cache_mgr.track_heads(cmd)

        # Track the parent counts
        parent_count = len(parents)
        self.parent_counts[parent_count] = (
            self.parent_counts.get(parent_count, 0) + 1)
        if parent_count > self.max_parent_count:
            self.max_parent_count = parent_count

        # Remember the merges: count how often each mark is merged
        if cmd.merges:
            for merge in cmd.merges:
                self.merges[merge] = self.merges.get(merge, 0) + 1

    def reset_handler(self, cmd):
        """Process a ResetCommand."""
        self.cmd_counts[cmd.name] += 1
        if cmd.ref.startswith('refs/tags/'):
            self.lightweight_tags += 1
        elif cmd.from_ is not None:
            self.cache_mgr.track_heads_for_ref(cmd.ref, cmd.from_)

    def tag_handler(self, cmd):
        """Process a TagCommand."""
        self.cmd_counts[cmd.name] += 1

    def feature_handler(self, cmd):
        """Process a FeatureCommand."""
        self.cmd_counts[cmd.name] += 1
        feature = cmd.feature_name
        if feature not in commands.FEATURE_NAMES:
            self.warning("feature %s is not supported - parsing may fail"
                % (feature,))

    def _track_blob(self, mark):
        """Record one more reference to the blob identified by mark.

        A mark moves from 'new' to 'used' on its first reference and from
        'used' into blob_ref_counts (starting at 2) on its second; further
        references increment that count.  A mark that was never seen as a
        new blob is recorded as 'unknown'.
        """
        if mark in self.blob_ref_counts:
            self.blob_ref_counts[mark] += 1
        elif mark in self.blobs['used']:
            self.blob_ref_counts[mark] = 2
            self.blobs['used'].remove(mark)
        elif mark in self.blobs['new']:
            self.blobs['used'].add(mark)
            self.blobs['new'].remove(mark)
        else:
            self.blobs['unknown'].add(mark)

def _found(b):

270

"""Format a found boolean as a string."""

271

return ['no', 'found'][b]

272

273

def _iterable_as_config_list(s):

274

"""Format an iterable as a sequence of comma-separated strings.

275

276

To match what ConfigObj expects, a single item list has a trailing comma.

277

"""

278

items = sorted(s)

279

if len(items) == 1:

280

return "%s," % (items[0],)

281

else:

282

return ", ".join(items)

Older »