1
# Copyright (C) 2009, 2010 Canonical Ltd
3
# This program is free software: you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License version 3 as
5
# published by the Free Software Foundation.
7
# This program is distributed in the hope that it will be useful, but
8
# WITHOUT ANY WARRANTY; without even the implied warranty of
9
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
10
# General Public License for more details.
12
# You should have received a copy of the GNU General Public License
13
# along with this program. If not, see <http://www.gnu.org/licenses/>.
15
"""Take a given dump file, and bring the data back.
17
Currently requires simplejson to parse.
41
if sys.platform == 'win32':
44
# This is the minimal regex that is guaranteed to match. In testing, it is
45
# about 3x faster than using simplejson, it is just less generic.
46
_object_re = re.compile(
47
r'\{"address": (?P<address>\d+)'
48
r', "type": "(?P<type>.*)"'
49
r', "size": (?P<size>\d+)'
50
r'(, "name": "(?P<name>.*)")?'
51
r'(, "len": (?P<len>\d+))?'
52
r'(, "value": "?(?P<value>.*?)"?)?'
53
r', "refs": \[(?P<refs>[^]]*)\]'
56
_refs_re = re.compile(
61
def _from_json(cls, line, temp_cache=None):
62
val = simplejson.loads(line)
63
# simplejson likes to turn everything into unicode strings, but we know
64
# everything is just a plain 'str', and we can save some bytes if we
66
obj = cls(address=val['address'],
67
type_str=str(val['type']),
70
length=val.get('len', None),
71
value=val.get('value', None),
72
name=val.get('name', None))
73
if (obj.type_str == 'str'):
74
if type(obj.value) is unicode:
75
obj.value = obj.value.encode('latin-1')
76
if temp_cache is not None:
77
obj._intern_from_cache(temp_cache)
81
def _from_line(cls, line, temp_cache=None):
82
m = _object_re.match(line)
84
raise RuntimeError('Failed to parse line: %r' % (line,))
85
(address, type_str, size, name, length, value,
86
refs) = m.group('address', 'type', 'size', 'name', 'len',
88
assert '\\' not in type_str
90
assert '\\' not in name
91
if length is not None:
93
refs = [int(val) for val in _refs_re.findall(refs)]
94
obj = cls(address=int(address),
101
if (obj.type_str == 'str'):
102
if type(obj.value) is unicode:
103
obj.value = obj.value.encode('latin-1')
104
if temp_cache is not None:
105
obj._intern_from_cache(temp_cache)
109
class _TypeSummary(object):
110
"""Information about a given type."""
112
def __init__(self, type_str):
113
self.type_str = type_str
116
self.sq_sum = 0 # used for stddev computation
118
self.max_address = None
125
avg = self.total_size / float(self.count)
126
exp_x2 = self.sq_sum / float(self.count)
127
stddev = math.sqrt(exp_x2 - avg*avg)
128
return '%s: %d, %d bytes, %.3f avg bytes, %.3f std dev, %d max @ %d' % (
129
self.type_str, self.count, self.total_size, avg, stddev,
130
self.max_size, self.max_address)
132
def _add(self, memobj):
134
self.total_size += memobj.size
135
self.sq_sum += (memobj.size * memobj.size)
136
if memobj.size > self.max_size:
137
self.max_size = memobj.size
138
self.max_address = memobj.address
141
class _ObjSummary(object):
142
"""Tracks the summary stats about objects listed."""
145
self.type_summaries = {}
148
self.summaries = None
150
def _add(self, memobj):
152
type_summary = self.type_summaries[memobj.type_str]
154
type_summary = _TypeSummary(memobj.type_str)
155
self.type_summaries[memobj.type_str] = type_summary
156
type_summary._add(memobj)
157
self.total_count += 1
158
self.total_size += memobj.size
161
if self.summaries is None:
164
'Total %d objects, %d types, Total size = %.1fMiB (%d bytes)'
165
% (self.total_count, len(self.summaries), self.total_size / 1024. / 1024,
167
' Index Count % Size % Cum Max Kind'
170
for i in xrange(min(20, len(self.summaries))):
171
summary = self.summaries[i]
172
cumulative += summary.total_size
174
'%6d%8d%4d%10d%4d%4d%8d %s'
175
% (i, summary.count, summary.count * 100.0 / self.total_count,
177
summary.total_size * 100.0 / self.total_size,
178
cumulative * 100.0 / self.total_size, summary.max_size,
180
return '\n'.join(out)
183
summaries = sorted(self.type_summaries.itervalues(),
184
key=lambda x: (x.total_size, x.count),
186
self.summaries = summaries
189
summaries = sorted(self.type_summaries.itervalues(),
190
key=lambda x: (x.count, x.total_size),
192
self.summaries = summaries
195
class ObjManager(object):
196
"""Manage the collection of MemObjects.
198
This is the interface for doing queries, etc.
201
def __init__(self, objs, show_progress=True):
203
self.show_progress = show_progress
205
def __getitem__(self, address):
206
return self.objs[address]
208
def compute_referrers(self):
209
"""Deprecated, use compute_parents instead."""
210
warn.deprecated('.compute_referrers is deprecated.'
211
' Use .compute_parents instead.')
212
return self.compute_parents()
214
def compute_parents(self):
215
"""For each object, figure out who is referencing it."""
217
get_refs = parents.get
218
total = len(self.objs)
220
enabled = gc.isenabled()
222
# We create a *lot* of temporary objects here, which are all
223
# cleaned up perfectly by refcounting, so disable gc for this loop.
226
for idx, obj in enumerate(self.objs.itervalues()):
227
if self.show_progress and idx & 0x3f == 0:
229
if tnow - tlast > 0.1:
231
sys.stderr.write('compute parents %8d / %8d \r'
233
address = obj.address
234
for ref in obj.children:
235
refs = get_refs(ref, None)
236
# This is ugly, so it should be explained.
237
# To save memory pressure, parents will point to one of 4
239
# 1) A simple integer, representing a single referrer
240
# this saves the allocation of a separate structure
242
# 2) A tuple, slightly more efficient than a list, but
243
# requires creating a new tuple to 'add' an entry.
244
# 3) A list, as before, for things with lots of
245
# parents, we use a regular list to let it grow.
246
# 4) None, no references from this object
250
elif t in (int, long):
251
refs = (refs, address)
257
refs = refs + (address,)
261
raise TypeError('unknown refs type: %s\n' % (t,))
263
if self.show_progress:
264
sys.stderr.write('compute parents %8d / %8d \r'
266
for idx, obj in enumerate(self.objs.itervalues()):
267
if self.show_progress and idx & 0x3f == 0:
269
if tnow - tlast > 0.1:
271
sys.stderr.write('set parents %8d / %8d \r'
274
refs = parents.pop(obj.address)
280
elif type(refs) in (int, long):
281
obj.parents = (refs,)
287
if self.show_progress:
288
sys.stderr.write('set parents %8d / %8d \n'
291
def remove_expensive_references(self):
292
"""Filter out references that are mere houskeeping links.
294
module.__dict__ tends to reference lots of other modules, which in turn
295
brings in the global reference cycle. Going further
296
function.__globals__ references module.__dict__, so it *too* ends up in
297
the global cycle. Generally these references aren't interesting, simply
298
because they end up referring to *everything*.
300
We filter out any reference to modules, frames, types, function globals
301
pointers & LRU sideways references.
303
source = lambda:self.objs.itervalues()
304
total_objs = len(self.objs)
305
# Add the 'null' object
306
self.objs.add(0, '<ex-reference>', 0, [])
307
for changed, obj in remove_expensive_references(source, total_objs,
311
def _compute_total_size(self, obj):
312
pending_descendents = list(obj.children)
313
seen = _intset.IDSet()
314
seen.add(obj.address)
315
total_size = obj.size
316
while pending_descendents:
317
next_ref = pending_descendents.pop()
321
next_obj = self.objs.get(next_ref, None)
324
# type and frame types tend to cause us to recurse into
325
# everything. So for now, when we encounter them, don't add
327
total_size += next_obj.size
328
pending_descendents.extend([ref for ref in next_obj.children
331
## # This single object references more than 10% of all objects, and
332
## # expands to more that 10x its direct references
333
## if count > obj.num_refs * 10 and count > break_on:
334
## import pdb; pdb.set_trace()
335
obj.total_size = total_size
338
def compute_total_size(self):
339
"""This computes the total bytes referenced from this object."""
340
# Unfortunately, this is an N^2 operation :(. The problem is that
341
# a.total_size + b.total_size != c.total_size (if c references a & b).
342
# This is because a & b may refer to common objects. Consider something
349
# D & C participate in a refcycle, and B has an alternative path to D.
350
# You certainly don't want to count D 2 times when computing the total
351
# size of A. Also, how do you give the relative contribution of B vs C
353
total = len(self.objs)
354
break_on = total / 10
355
for idx, obj in enumerate(self.objs.itervalues()):
356
if self.show_progress and idx & 0x1ff == 0:
357
sys.stderr.write('compute size %8d / %8d \r'
359
self._compute_total_size(obj)
360
if self.show_progress:
361
sys.stderr.write('compute size %8d / %8d \n'
365
summary = _ObjSummary()
366
for obj in self.objs.itervalues():
370
def get_all(self, type_str):
371
"""Return all objects that match a given type."""
372
all = [o for o in self.objs.itervalues() if o.type_str == type_str]
373
all.sort(key=lambda x:(x.size, len(x), x.num_parents),
377
def collapse_instance_dicts(self):
378
"""Hide the __dict__ member of instances.
380
When a class does not have __slots__ defined, all instances get a
381
separate '__dict__' attribute that actually holds their contents. This
382
adds a level of indirection that can make it harder than it needs to
383
be, to actually find what instance holds what objects.
385
So we collapse those references back into the object, and grow its
386
'size' at the same time.
388
# The instances I'm focusing on have a custom type name, and every
389
# instance has 2 pointers. The first is to __dict__, and the second is
390
# to the 'type' object whose name matches the type of the instance.
391
# Also __dict__ has only 1 referrer, and that is *this* object
393
# TODO: Handle old style classes. They seem to have type 'instanceobj',
394
# and reference a 'classobj' with the actual type name
396
total = len(self.objs)
398
for item_idx, (address, obj) in enumerate(self.objs.items()):
399
if obj.type_str in ('str', 'dict', 'tuple', 'list', 'type',
400
'function', 'wrapper_descriptor',
401
'code', 'classobj', 'int',
404
if self.show_progress and item_idx & 0x3f:
406
if tnow - tlast > 0.1:
408
sys.stderr.write('checked %8d / %8d collapsed %8d \r'
409
% (item_idx, total, collapsed))
410
if obj.type_str == 'module' and len(obj) == 1:
412
if dict_obj.type_str != 'dict':
419
if obj_1.type_str == 'dict' and obj_2.type_str == 'type':
420
# This is a new-style class
423
elif (obj.type_str == 'instance'
424
and obj_1.type_str == 'classobj'
425
and obj_2.type_str == 'dict'):
426
# This is an old-style class
431
extra_refs = [type_obj.address]
432
if (dict_obj.num_parents != 1
433
or dict_obj.parents[0] != address):
436
# We found an instance \o/
437
new_refs = list(dict_obj.children)
438
new_refs.extend(extra_refs)
439
obj.children = new_refs
440
obj.size = obj.size + dict_obj.size
442
if obj.type_str == 'instance':
443
obj.type_str = type_obj.value
444
# Now that all the data has been moved into the instance, remove
445
# the dict from the collection
446
del self.objs[dict_obj.address]
447
if self.show_progress:
448
sys.stderr.write('checked %8d / %8d collapsed %8d \n'
449
% (item_idx, total, collapsed))
451
self.compute_parents()
453
def refs_as_dict(self, obj):
454
"""Expand the ref list considering it to be a 'dict' structure.
456
Often we have dicts that point to simple strings and ints, etc. This
457
tries to expand that as much as possible.
459
:param obj: Should be a MemObject representing an instance (that has
460
been collapsed) or a dict.
462
return obj.refs_as_dict()
464
def refs_as_list(self, obj):
465
"""Expand the ref list, considering it to be a list structure."""
467
children = obj.children
468
for addr in children:
469
val = self.objs[addr]
470
if val.type_str == 'bool':
471
val = (val.value == 'True')
472
elif val.value is not None:
474
elif val.type_str == 'NoneType':
481
def load(source, using_json=None, show_prog=True):
482
"""Load objects from the given source.
484
:param source: If this is a string, we will open it as a file and read all
485
objects. For any other type, we will simply iterate and parse objects
486
out, so the object should be an iterator of json lines.
487
:param using_json: Use simplejson rather than the regex. This allows
488
arbitrary ordered json dicts to be parsed but still requires per-line
489
layout. Set to 'False' to indicate you want to use the regex, set to
490
'True' to force using simplejson. None will probe to see if simplejson
491
is available, and use it if it is. (With _speedups built, simplejson
492
parses faster and more accurately than the regex.)
493
:param show_prog: If True, display the progress as we read in data
496
if isinstance(source, str):
497
source, cleanup = files.open_file(source)
498
if isinstance(source, file):
499
input_size = os.fstat(source.fileno()).st_size
502
elif isinstance(source, (list, tuple)):
503
input_size = sum(map(len, source))
506
if using_json is None:
507
using_json = (simplejson is not None)
509
return _load(source, using_json, show_prog, input_size)
511
if cleanup is not None:
515
def iter_objs(source, using_json=False, show_prog=False, input_size=0,
516
objs=None, factory=None):
517
"""Iterate MemObjects from json.
519
:param source: A line iterator.
520
:param using_json: Use simplejson. See load().
521
:param show_prog: Show progress.
522
:param input_size: The size of the input if known (in bytes) or 0.
523
:param objs: Either None or a dict containing objects by address. If not
524
None, then duplicate objects will not be parsed or output.
525
:param factory: Use this to create new instances, if None, use
526
_loader._MemObjectProxy.from_args
527
:return: A generator of memory objects.
531
input_mb = input_size / 1024. / 1024.
533
address_re = re.compile(
534
r'{"address": (?P<address>\d+)'
536
bytes_read = count = 0
544
factory = _loader._MemObjectProxy_from_args
545
for line_num, line in enumerate(source):
546
bytes_read += len(line)
547
if line in ("[\n", "]\n"):
549
if line.endswith(',\n'):
552
# Skip duplicate objects
553
m = address_re.match(line)
556
address = int(m.group('address'))
559
yield decoder(factory, line, temp_cache=temp_cache)
560
if show_prog and (line_num - last > 5000):
562
mb_read = bytes_read / 1024. / 1024
563
tdelta = timer() - tstart
565
'loading... line %d, %d objs, %5.1f / %5.1f MiB read in %.1fs\r'
566
% (line_num, len(objs), mb_read, input_mb, tdelta))
568
mb_read = bytes_read / 1024. / 1024
569
tdelta = timer() - tstart
571
'loaded line %d, %d objs, %5.1f / %5.1f MiB read in %.1fs \n'
572
% (line_num, len(objs), mb_read, input_mb, tdelta))
575
def _load(source, using_json, show_prog, input_size):
576
objs = _loader.MemObjectCollection()
577
for memobj in iter_objs(source, using_json, show_prog, input_size, objs,
579
# objs.add automatically adds the object as it is created
581
return ObjManager(objs, show_progress=show_prog)
584
def remove_expensive_references(source, total_objs=0, show_progress=False):
585
"""Filter out references that are mere houskeeping links.
587
module.__dict__ tends to reference lots of other modules, which in turn
588
brings in the global reference cycle. Going further
589
function.__globals__ references module.__dict__, so it *too* ends up in
590
the global cycle. Generally these references aren't interesting, simply
591
because they end up referring to *everything*.
593
We filter out any reference to modules, frames, types, function globals
594
pointers & LRU sideways references.
596
:param source: A callable that returns an iterator of MemObjects. This
597
will be called twice.
598
:param total_objs: The total objects to be filtered, if known. If
599
show_progress is False or the count of objects is unknown, 0.
600
:return: An iterator of (changed, MemObject) objects with expensive
603
# First pass, find objects we don't want to reference any more
604
noref_objs = _intset.IDSet()
605
lru_objs = _intset.IDSet()
606
total_steps = total_objs * 2
608
for idx, obj in enumerate(source()):
609
# 'module's have a single __dict__, which tends to refer to other
610
# modules. As you start tracking into that, you end up getting into
611
# reference cycles, etc, which generally ends up referencing every
613
# 'frame' also tends to be self referential, and a single frame
614
# ends up referencing the entire current state
615
# 'type' generally is self referential through several attributes.
616
# __bases__ means we recurse all the way up to object, and object
617
# has __subclasses__, which means we recurse down into all types.
618
# In general, not helpful for debugging memory consumption
619
if show_progress and idx & 0x1ff == 0:
620
sys.stderr.write('finding expensive refs... %8d / %8d \r'
621
% (idx, total_steps))
622
if obj.type_str in ('module', 'frame', 'type'):
623
noref_objs.add(obj.address)
624
if obj.type_str == '_LRUNode':
625
lru_objs.add(obj.address)
628
# Second pass, any object which refers to something in noref_objs will
629
# have that reference removed, and replaced with the null_memobj
630
num_expensive = len(noref_objs)
631
null_memobj = _loader._MemObjectProxy_from_args(0, '<ex-reference>', 0, [])
633
yield (True, null_memobj)
634
if show_progress and total_objs == 0:
636
total_steps = total_objs * 2
637
for idx, obj in enumerate(source()):
638
if show_progress and idx & 0x1ff == 0:
639
sys.stderr.write('removing %d expensive refs... %8d / %8d \r'
640
% (num_expensive, idx + total_objs,
642
if obj.type_str == 'function':
643
# Functions have a reference to 'globals' which is not very
644
# helpful for having a clear understanding of what is going on
645
# especially since the function itself is in its own globals
646
# XXX: This is probably not a guaranteed order, but currently
647
# func_traverse returns:
648
# func_code, func_globals, func_module, func_defaults,
649
# func_doc, func_name, func_dict, func_closure
650
# We want to remove the reference to globals and module
651
refs = list(obj.children)
652
obj.children = refs[:1] + refs[3:] + [0]
655
elif obj.type_str == '_LRUNode':
656
# We remove the 'sideways' references
657
obj.children = [ref for ref in obj.children
658
if ref not in lru_objs]
661
for ref in obj.children:
662
if ref in noref_objs:
665
# No bad references, keep going
668
new_ref_list = [ref for ref in obj.children
669
if ref not in noref_objs]
670
new_ref_list.append(0)
671
obj.children = new_ref_list
674
sys.stderr.write('removed %d expensive refs from %d objs%s\n'
675
% (num_expensive, total_objs, ' '*20))