1
#! /usr/bin/python -Qwarnall
3
# This script profiles loading of documents in Calligra.
4
# It is called like this:
5
# profileOfficeFileLoading.py $dir $outputfile
6
# The script outputs an csv file that contains the times various functions took
7
# for each file in directory $dir. The directory $dir is scanned recursively.
9
# Copyright 2010 Jos van den Oever <jos@vandenoever.info>
11
import sys, os, tempfile, time, signal, subprocess, re, lxml.etree, zipfile
14
'calligrawords': ['odt', 'doc', 'docx'],
15
'calligrastage': ['odp', 'ppt', 'pptx'],
16
'calligrasheets': ['ods', 'xls', 'xlsx']
30
# limit how many backtraces are recordes, since it takes a lot of time
33
def scanDirectory(rootdir, extensions):
34
if os.path.isfile(rootdir):
37
filext = map(lambda e: "." + e, extensions)
39
for root, directories, files in os.walk(rootdir):
41
if file.startswith('.'):
43
if any(map(lambda e: file.endswith(e), filext)):
44
list.append(os.path.join(root, file))
45
return frozenset(list)
47
def getExtensions(list):
50
(path, ext) = os.path.splitext(file)
51
extensions.append(ext)
52
return frozenset(extensions)
58
# functions for logging, in this case to TeamCity
62
def escape(self, text):
64
if m.group(0) == '\n':
66
if m.group(0) == '\r':
69
return re.sub('[\'\n\r\|\]]', escape, text)
70
def startTestSuite(self, name):
72
print "##teamcity[testSuiteStarted name='" + self.suitename \
75
def endTestSuite(self):
76
if not self.suitename: return
77
print "##teamcity[testSuiteFinished name='" + self.suitename \
81
def startTest(self, name):
82
if not self.suitename: return
84
print "##teamcity[testStarted name='" + self.testname + "']"
86
# fail the current test
87
def failTest(self, backtrace):
88
if not self.suitename or not self.testname: return
91
bt = bt + self.escape(l)
92
print "##teamcity[testFailed name='" + self.testname \
93
+ "' details='" + bt + "']"
95
# end test, pass duration as integer representing the milliseconds
96
def endTest(self, duration):
97
if not self.suitename or not self.testname: return
98
print "##teamcity[testFinished name='" + self.testname \
99
+ "' duration='" + str(duration) + "']"
103
def containsRealError(err):
104
# some errors reported by libxml2 are false positives, we filter them
106
if str(err).find("ERROR:RELAXNGV:RELAXNG_ERR_CONTENTVALID: Element styles failed to validate content") != -1:
108
if str(err).find("ERROR:RELAXNGV:RELAXNG_ERR_CONTENTVALID: Element automatic-styles failed to validate content") != -1:
115
self.relaxNGValidator = lxml.etree.RelaxNG( \
116
lxml.etree.parse(open(os.path.join(path, \
117
'OpenDocument-v1.2-cs01-schema-calligra.rng'),
119
self.relaxNGManifextValidator = lxml.etree.RelaxNG( \
120
lxml.etree.parse(open(os.path.join(path, \
121
'OpenDocument-v1.2-cs01-manifest-schema.rng'), \
123
# returns error string on error, None otherwise
124
def validate(self, odtpath):
126
zip = zipfile.ZipFile(odtpath, 'r')
127
except zipfile.BadZipfile as e:
129
err = self.validateFile(zip, 'content.xml',
130
self.relaxNGValidator)
131
if (containsRealError(err)):
133
err = self.validateFile(zip, 'styles.xml',
134
self.relaxNGValidator)
135
if (containsRealError(err)):
137
err = self.validateFile(zip, 'META-INF/manifest.xml',
138
self.relaxNGManifextValidator)
139
if (containsRealError(err)):
143
def validateFile(self, zip, file, validator):
145
data = zip.read(file)
146
xml = lxml.etree.XML(data);
147
except lxml.etree.XMLSyntaxError as e:
149
except KeyError as e:
151
if len(data) > 1000000:
152
# if the xml file is larger than 1M, the validator may hang
154
if not validator.validate(xml):
155
return validator.error_log.last_error
157
def getExecutablePath(exe):
160
for p in env['PATH'].split(':'):
161
exepath = os.path.join(p, exe)
162
if os.path.exists(exepath):
166
def runCommand(exepath, arguments, captureStdOut):
170
(fileno, tmpfilename) = tempfile.mkstemp()
171
stdout = os.fdopen(fileno, 'w')
173
args = 'ulimit -v 1000000 -t 60 -c 0;' + exepath
175
args += ' "' + s + '"'
177
process = subprocess.Popen(['/bin/bash', '-c', args], env=env,
178
close_fds=True, stdout=stdout, stderr=None)
179
s = os.wait4(process.pid, os.WNOHANG)
183
while s[0] == 0 and s[1] == 0 and waited < maxwaittime:
187
s = os.wait4(process.pid, os.WNOHANG)
188
if waited >= maxwaittime:
189
# if the process did not end nicely, kill it
191
os.killpg(process.pid, signal.SIGKILL)
195
s = os.wait4(process.pid, 0)
199
r.utime = s[2].ru_utime
200
r.stime = s[2].ru_stime
205
stdout = open(tmpfilename, 'r')
206
r.stdout = stdout.readlines()
208
os.remove(tmpfilename)
211
def profile(dir, file, logger, validator):
213
logger.startTest(file)
214
file = os.path.join(dir, file)
215
(path, ext) = os.path.splitext(file)
218
for f in applications.keys():
219
if ext in applications[f]:
221
exepath = getExecutablePath(exe)
224
(fileno, tmpfilename) = tempfile.mkstemp()
225
roundtripfilename = None
227
# in case of ODF file, do a roundtrip
228
m = re.match('.*(\.od.)$', file)
229
(roundtripfd, roundtripfilename) = tempfile.mkstemp("." + extensions[ext])
235
args += ["--roundtrip-filename", roundtripfilename]
236
args += ["--benchmark-loading", "--profile-filename", tmpfilename,
237
"--nocrashhandler", file]
238
r = runCommand(exepath, args, False)
239
outfile = os.fdopen(fileno, 'r')
240
r.lines = outfile.readlines()
242
os.close(roundtripfd)
244
if r.returnValue != 0:
245
if maxbacktraces > 0:
247
# generate a backtrace
248
args = ["--batch", "--eval-command=run",
249
"--eval-command=bt", "--args"] + [exepath] + args
250
exepath = getExecutablePath("gdb")
251
debugresult = runCommand(exepath, args, True)
252
r.backtrace = debugresult.stdout
253
for l in r.backtrace:
256
logger.failTest(r.backtrace)
258
logger.failTest("Crash, no backtrace: limit reached.")
261
# convert ms office file to odf
262
exepath = getExecutablePath("calligraconverter")
263
args = [file, roundtripfilename]
264
cr = runCommand(exepath, args, False)
266
err = validator.validate(roundtripfilename);
268
logger.failTest(str(err))
270
# remove the roundtripfile and the temporary file
271
os.remove(tmpfilename)
272
os.remove(roundtripfilename)
274
logger.endTest(int((r.utime + r.stime)*1000))
277
def getGlobalMinMax(times):
278
keys = times.min.keys()
279
min = times.min[keys[0]]
280
max = times.max[keys[0]]
282
if times.min[k] <= min:
284
if times.max[k] >= max:
288
def getGlobalTask(times):
289
keys = times.min.keys()
290
(min, max) = getGlobalMinMax(times)
293
if times.min[k] == min and times.max[k] == max:
297
def getFurthestEnd(start, times, excludedkey):
299
for k in times.min.keys():
300
if times.min[k] <= start and times.max[k] > end and k != excludedkey:
306
def getClosestStart(start, times, excludedkey):
308
for k in times.min.keys():
309
if times.min[k] > start and times.min[k] < s and k != excludedkey:
313
# if there is one task that starts before all other tasks and ends after all
314
# other tasks, then count the time that there is no other task running as a sub-
315
# task of this global task
316
# if there is no global task, then count the time between the first and last
317
# timepoint where there is no task running
318
def getUnaccountedTime(lines, times):
319
if len(times.min.keys()) == 0:
322
globaltask = getGlobalTask(times)
323
(min, max) = getGlobalMinMax(times)
327
while currentmax != max:
328
s = getFurthestEnd(currentmax, times, globaltask)
330
s = getClosestStart(currentmax, times, globaltask)
333
unaccounted += s - currentmax
337
def summarize(lines):
342
(fieldname, time, percentdone) = l.split('\t')
344
if not fieldname in r.min or r.min[fieldname] > time:
345
r.min[fieldname] = time
346
if not fieldname in r.max or r.max[fieldname] < time:
347
r.max[fieldname] = time
350
def profileAll(dir, loggername):
352
for v in applications.values():
355
officefiles = scanDirectory(dir, exts);
356
usedExts = getExtensions(officefiles)
359
validator = odfvalidator()
362
log.startTestSuite(loggername + '-' + ext[1:])
363
for f in officefiles:
364
(path, pathext) = os.path.splitext(f)
366
relf = os.path.relpath(f, dir)
367
result = profile(dir, relf, log, validator)
372
def addMapEntry(map, start, end):
385
def createStackTraceGraph(results):
386
exepath = getExecutablePath('dot')
392
for file in results.keys():
396
name = os.path.basename(file)
398
for l in r.backtrace:
401
m = re.search('/calligra/.*/([^/]+:\d+)$', l)
405
edges = addMapEntry(edges, key, name)
408
(fileno, tmpfilename) = tempfile.mkstemp()
409
out = os.fdopen(fileno, 'w')
410
out.write('digraph {')
411
svn = 'http://websvn.kde.org/trunk'
413
m = re.search('(/calligra/.*):(\d+)$', nodes[a])
414
n = '"' + a + '" [URL = "'
416
out.write(n + svn + m.group(1) + '?view=markup#l'
417
+ m.group(2) + '"];')
419
m = re.search('(/calligratests/.*)', nodes[a])
421
out.write(n + svn + '/tests' + m.group(1)
423
for a in edges.keys():
424
for b in edges[a].keys():
425
out.write('"' + a + '" -> "' + b + '" [penwidth='
426
+ str(edges[a][b]) + '];')
431
args = ["-Tsvg", "-ostacktraces.svg", tmpfilename]
432
r = runCommand(exepath, args, False)
433
os.remove(tmpfilename)
435
if __name__ == '__main__':
439
if len(sys.argv) > 3:
440
loggername = sys.argv[3]
441
results = profileAll(dir, loggername)
444
for r in results.keys():
445
results[r].times = summarize(results[r].lines)
446
results[r].unaccounted = getUnaccountedTime(results[r].lines,
448
for f in results[r].times.min:
452
fields[f].totalduration = 0
453
fields[f].totaldurationsquared = 0
454
fields[f].min = 10000000
455
min = results[r].times.min[f]
456
duration = results[r].times.max[f] - min
457
fields[f].totalduration += duration
458
fields[f].totaldurationsquared += duration*duration
460
if fields[f].min > min:
462
fieldnames = fields.keys()
463
def sort(keya, keyb):
464
return cmp(fields[keya].min, fields[keyb].min)
465
fieldnames.sort(sort)
467
# collect all fieldnames
468
out = open(output, 'wb')
470
out.write('filename\text\tsize\tr\tutime\tstime\tunaccounted')
475
out.write('average\t\t\t\t\t\t')
480
out.write(str(f.totalduration/f.count))
482
# write for each analyzed file
483
for file in results.keys():
485
(path, ext) = os.path.splitext(file)
487
out.write(file + '\t' + ext[1:] + '\t' + str(stat.st_size) + '\t' + str(r.returnValue))
488
out.write('\t' + str(r.utime) + '\t' + str(r.stime))
489
out.write('\t' + str(r.unaccounted))
493
out.write(str(r.times.max[f]-r.times.min[f]))
497
createStackTraceGraph(results)