~a-s-usov/bzr-fastimport/fastimport : revision 349

1

2

#

3

# This program is free software; you can redistribute it and/or modify

4

# it under the terms of the GNU General Public License as published by

5

# the Free Software Foundation; either version 2 of the License, or

6

# (at your option) any later version.

7

#

8

# This program is distributed in the hope that it will be useful,

9

# but WITHOUT ANY WARRANTY; without even the implied warranty of

10

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

11

# GNU General Public License for more details.

12

#

13

# You should have received a copy of the GNU General Public License

14

# along with this program. If not, see <http://www.gnu.org/licenses/>.

15

16

"""Import processor that dump stats about the input (and doesn't import)."""

17

18

from bzrlib.plugins.fastimport import (

19

reftracker,

20

)

21

from fastimport import (

22

commands,

23

processor,

24

)

25

from fastimport.helpers import (

26

invert_dict,

27

invert_dictset,

28

)

29

import stat

30

31

32

class InfoProcessor(processor.ImportProcessor):
    """An import processor that dumps statistics about the input.

    No changes to the current repository are made.

    As well as providing useful information about an import
    stream before importing it, this processor is useful for
    benchmarking the speed at which data can be extracted from
    the source.

    The statistics are initialised in pre_process(), accumulated by the
    ``*_handler`` methods and written to ``self.outf`` by post_process().
    """

    def __init__(self, params=None, verbose=0, outf=None):
        """Create the processor; every argument is passed to the base class.

        :param params: passed through to ImportProcessor unchanged
        :param verbose: verbosity level. This class later reads
            ``self.verbose`` (presumably set from this argument by the
            base class): any true value switches the output to
            config-file form, and 2 or more also dumps the rename/copy
            source paths.
        :param outf: output stream. This class later calls
            ``self.outf.write`` (presumably set from this argument by the
            base class).
        """
        processor.ImportProcessor.__init__(self, params, verbose,
            outf=outf)

    def pre_process(self):
        """Initialise every statistic to its empty/zero state."""
        # Init statistics: one zeroed counter per known command name
        self.cmd_counts = dict.fromkeys(commands.COMMAND_NAMES, 0)
        self.file_cmd_counts = dict.fromkeys(
            commands.FILE_COMMAND_NAMES, 0)
        # Map from number-of-parents to number of commits with that many
        self.parent_counts = {}
        self.max_parent_count = 0
        # NOTE: committers are collected but not reported by post_process
        self.committers = set()
        self.separate_authors_found = False
        self.symlinks_found = False
        self.executables_found = False
        self.sha_blob_references = False
        self.lightweight_tags = 0
        # Blob usage tracking: each usage category maps to a set of blob ids
        self.blobs = {}
        for usage in ['new', 'used', 'unknown', 'unmarked']:
            self.blobs[usage] = set()
        # Map from blob mark to reference count, for marks referenced
        # more than once
        self.blob_ref_counts = {}
        # Head tracking
        self.reftracker = reftracker.RefTracker()
        # Stuff to cache: a map from mark to # of times that mark is merged
        self.merges = {}
        # Stuff to cache: these are maps from mark to sets
        self.rename_old_paths = {}
        self.copy_source_paths = {}

    def post_process(self):
        """Dump the collected statistics, one group at a time.

        Groups that have nothing to report (no commits, no blobs, no
        multiply-referenced blobs, no resets) are skipped.
        """
        # Dump statistics
        cmd_names = commands.COMMAND_NAMES
        fc_names = commands.FILE_COMMAND_NAMES
        self._dump_stats_group("Command counts",
            [(c, self.cmd_counts[c]) for c in cmd_names], str)
        self._dump_stats_group("File command counts",
            [(c, self.file_cmd_counts[c]) for c in fc_names], str)

        # Commit stats
        if self.cmd_counts['commit']:
            # Report the parent counts in ascending order of parent number
            p_items = [("parents-%d" % n, self.parent_counts[n])
                for n in sorted(self.parent_counts)]
            p_items.append(('total revisions merged', len(self.merges)))
            flags = {
                'separate authors found': self.separate_authors_found,
                'executables': self.executables_found,
                'symlinks': self.symlinks_found,
                'blobs referenced by SHA': self.sha_blob_references,
                }
            self._dump_stats_group("Parent counts", p_items, str)
            self._dump_stats_group("Commit analysis", flags.items(), _found)
            heads = invert_dictset(self.reftracker.heads)
            self._dump_stats_group("Head analysis", heads.items(), None,
                _iterable_as_config_list)
            self._dump_stats_group("Merges", self.merges.items(), None)
            # We only show the rename old path and copy source paths when -vv
            # (verbose=2) is specified. The output here for mysql's data can't
            # be parsed currently so this bit of code needs more work anyhow ..
            if self.verbose >= 2:
                self._dump_stats_group("Rename old paths",
                    self.rename_old_paths.items(), len,
                    _iterable_as_config_list)
                self._dump_stats_group("Copy source paths",
                    self.copy_source_paths.items(), len,
                    _iterable_as_config_list)

        # Blob stats
        if self.cmd_counts['blob']:
            # In verbose mode, don't list every blob used. The 'used'
            # entry is filtered out of what gets dumped rather than
            # deleted from self.blobs, so the tracking state stays intact.
            if self.verbose:
                blob_usage = [(usage, ids)
                    for usage, ids in self.blobs.items()
                    if usage != 'used']
            else:
                blob_usage = self.blobs.items()
            self._dump_stats_group("Blob usage tracking",
                blob_usage, len, _iterable_as_config_list)
        if self.blob_ref_counts:
            blobs_by_count = invert_dict(self.blob_ref_counts)
            # Ascending by reference count
            blob_items = sorted(blobs_by_count.items())
            self._dump_stats_group("Blob reference counts",
                blob_items, len, _iterable_as_config_list)

        # Other stats
        if self.cmd_counts['reset']:
            reset_stats = {
                'lightweight tags': self.lightweight_tags,
                }
            self._dump_stats_group("Reset analysis", reset_stats.items())

    def _dump_stats_group(self, title, items, normal_formatter=None,
        verbose_formatter=None):
        """Dump a statistics group.

        In verbose mode, do so as a config file so
        that other processors can load the information if they want to.

        :param title: the heading for the group
        :param items: an iterable of (name, value) pairs
        :param normal_formatter: the callable to apply to the value
          before displaying it in normal mode
        :param verbose_formatter: the callable to apply to the value
          before displaying it in verbose mode
        """
        if self.verbose:
            self.outf.write("[%s]\n" % (title,))
            for name, value in items:
                if verbose_formatter is not None:
                    value = verbose_formatter(value)
                # Spaces in string names are replaced so the name is a
                # single token on the "name = value" line
                if isinstance(name, str):
                    name = name.replace(' ', '-')
                self.outf.write("%s = %s\n" % (name, value))
            self.outf.write("\n")
        else:
            self.outf.write("%s:\n" % (title,))
            for name, value in items:
                if normal_formatter is not None:
                    value = normal_formatter(value)
                self.outf.write("\t%s\t%s\n" % (value, name))

    def progress_handler(self, cmd):
        """Process a ProgressCommand."""
        self.cmd_counts[cmd.name] += 1

    def blob_handler(self, cmd):
        """Process a BlobCommand.

        Counts the command and files the blob's id under 'unmarked' or
        'new' depending on whether it carries a mark.
        """
        self.cmd_counts[cmd.name] += 1
        if cmd.mark is None:
            self.blobs['unmarked'].add(cmd.id)
        else:
            self.blobs['new'].add(cmd.id)
            # Marks can be re-used so remove it from used if already there.
            # Note: an entry for it in blob_ref_counts (a mark referenced
            # more than once) is deliberately left alone.
            self.blobs['used'].discard(cmd.id)

    def checkpoint_handler(self, cmd):
        """Process a CheckpointCommand."""
        self.cmd_counts[cmd.name] += 1

    def commit_handler(self, cmd):
        """Process a CommitCommand.

        Updates the command and file-command counts, the commit flags
        (separate authors, executables, symlinks, blobs referenced by
        SHA), blob usage, rename/copy source paths, the tracked heads,
        the parent counts and the merge counts.
        """
        self.cmd_counts[cmd.name] += 1
        self.committers.add(cmd.committer)
        if cmd.author is not None:
            self.separate_authors_found = True
        for fc in cmd.iter_files():
            self.file_cmd_counts[fc.name] += 1
            if isinstance(fc, commands.FileModifyCommand):
                # Any of the user/group/other execute bits
                if fc.mode & 0o111:
                    self.executables_found = True
                if stat.S_ISLNK(fc.mode):
                    self.symlinks_found = True
                if fc.dataref is not None:
                    # A dataref starting with ':' is a mark; anything
                    # else is counted as a reference by SHA
                    if fc.dataref[0] == ':':
                        self._track_blob(fc.dataref)
                    else:
                        self.sha_blob_references = True
            elif isinstance(fc, commands.FileRenameCommand):
                self.rename_old_paths.setdefault(
                    cmd.id, set()).add(fc.old_path)
            elif isinstance(fc, commands.FileCopyCommand):
                self.copy_source_paths.setdefault(
                    cmd.id, set()).add(fc.src_path)

        # Track the heads
        parents = self.reftracker.track_heads(cmd)

        # Track the parent counts
        parent_count = len(parents)
        self.parent_counts[parent_count] = (
            self.parent_counts.get(parent_count, 0) + 1)
        if parent_count > self.max_parent_count:
            self.max_parent_count = parent_count

        # Remember the merges
        if cmd.merges:
            for merge in cmd.merges:
                self.merges[merge] = self.merges.get(merge, 0) + 1

    def reset_handler(self, cmd):
        """Process a ResetCommand.

        A reset of a ref under refs/tags/ is counted as a lightweight
        tag; any other reset with a from_ updates the tracked heads.
        """
        self.cmd_counts[cmd.name] += 1
        if cmd.ref.startswith('refs/tags/'):
            self.lightweight_tags += 1
        elif cmd.from_ is not None:
            self.reftracker.track_heads_for_ref(
                cmd.ref, cmd.from_)

    def tag_handler(self, cmd):
        """Process a TagCommand."""
        self.cmd_counts[cmd.name] += 1

    def feature_handler(self, cmd):
        """Process a FeatureCommand.

        Warns when the feature is not one of commands.FEATURE_NAMES.
        """
        self.cmd_counts[cmd.name] += 1
        feature = cmd.feature_name
        if feature not in commands.FEATURE_NAMES:
            self.warning("feature %s is not supported - parsing may fail"
                % (feature,))

    def _track_blob(self, mark):
        """Record one more reference to the blob with the given mark.

        A mark moves from 'new' to 'used' on its first reference and
        from 'used' into blob_ref_counts (starting at 2) on its second;
        later references increment that count. A mark that is in none
        of these is recorded as 'unknown'.

        :param mark: the blob mark being referenced
        """
        if mark in self.blob_ref_counts:
            self.blob_ref_counts[mark] += 1
        elif mark in self.blobs['used']:
            self.blob_ref_counts[mark] = 2
            self.blobs['used'].remove(mark)
        elif mark in self.blobs['new']:
            self.blobs['used'].add(mark)
            self.blobs['new'].remove(mark)
        else:
            self.blobs['unknown'].add(mark)

267

268

def _found(b):

269

"""Format a found boolean as a string."""

270

return ['no', 'found'][b]

271

272

def _iterable_as_config_list(s):

273

"""Format an iterable as a sequence of comma-separated strings.

274

275

To match what ConfigObj expects, a single item list has a trailing comma.

276

"""

277

items = sorted(s)

278

if len(items) == 1:

279

return "%s," % (items[0],)

280

else:

281

return ", ".join(items)