~ubuntu-branches/debian/sid/meliae/sid : revision 1

1

2

#

3

# This program is free software: you can redistribute it and/or modify

4

# it under the terms of the GNU General Public License version 3 as

5

# published by the Free Software Foundation.

6

#

7

# This program is distributed in the hope that it will be useful, but

8

# WITHOUT ANY WARRANTY; without even the implied warranty of

9

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

10

# General Public License for more details.

11

#

12

# You should have received a copy of the GNU General Public License

13

# along with this program. If not, see <http://www.gnu.org/licenses/>.

14

15

"""Take a given dump file, and bring the data back.

16

17

Currently requires simplejson to parse.

18

"""

19

20

import gc

21

import math

22

import os

23

import re

24

import sys

25

import time

26

27

try:

28

import simplejson

29

except ImportError:

30

simplejson = None

31

32

from meliae import (

33

files,

34

_intset,

35

_loader,

36

warn,

37

)

38

39

40

timer = time.time

41

if sys.platform == 'win32':

42

timer = time.clock

43

44

# This is the minimal regex that is guaranteed to match. In testing, it is

45

# about 3x faster than using simplejson, it is just less generic.

46

_object_re = re.compile(

47

r'\{"address": (?P<address>\d+)'

48

r', "type": "(?P<type>.*)"'

49

r', "size": (?P<size>\d+)'

50

r'(, "name": "(?P<name>.*)")?'

51

r'(, "len": (?P<len>\d+))?'

52

r'(, "value": "?(?P<value>.*?)"?)?'

53

r', "refs": \[(?P<refs>[^]]*)\]'

54

r'\}')

55

56

_refs_re = re.compile(

57

r'(?P<ref>\d+)'

58

)

59

60

61

def _from_json(cls, line, temp_cache=None):

62

val = simplejson.loads(line)

63

# simplejson likes to turn everything into unicode strings, but we know

64

# everything is just a plain 'str', and we can save some bytes if we

65

# cast it back

66

obj = cls(address=val['address'],

67

type_str=str(val['type']),

68

size=val['size'],

69

children=val['refs'],

70

length=val.get('len', None),

71

value=val.get('value', None),

72

name=val.get('name', None))

73

if (obj.type_str == 'str'):

74

if type(obj.value) is unicode:

75

obj.value = obj.value.encode('latin-1')

76

if temp_cache is not None:

77

obj._intern_from_cache(temp_cache)

78

return obj

79

80

81

def _from_line(cls, line, temp_cache=None):

82

m = _object_re.match(line)

83

if not m:

84

raise RuntimeError('Failed to parse line: %r' % (line,))

85

(address, type_str, size, name, length, value,

86

refs) = m.group('address', 'type', 'size', 'name', 'len',

87

'value', 'refs')

88

assert '\\' not in type_str

89

if name is not None:

90

assert '\\' not in name

91

if length is not None:

92

length = int(length)

93

refs = [int(val) for val in _refs_re.findall(refs)]

94

obj = cls(address=int(address),

95

type_str=type_str,

96

size=int(size),

97

children=refs,

98

length=length,

99

value=value,

100

name=name)

101

if (obj.type_str == 'str'):

102

if type(obj.value) is unicode:

103

obj.value = obj.value.encode('latin-1')

104

if temp_cache is not None:

105

obj._intern_from_cache(temp_cache)

106

return obj

107

108

109

class _TypeSummary(object):

110

"""Information about a given type."""

111

112

def __init__(self, type_str):

113

self.type_str = type_str

114

self.count = 0

115

self.total_size = 0

116

self.sq_sum = 0 # used for stddev computation

117

self.max_size = 0

118

self.max_address = None

119

120

def __repr__(self):

121

if self.count == 0:

122

avg = 0

123

stddev = 0

124

else:

125

avg = self.total_size / float(self.count)

126

exp_x2 = self.sq_sum / float(self.count)

127

stddev = math.sqrt(exp_x2 - avg*avg)

128

return '%s: %d, %d bytes, %.3f avg bytes, %.3f std dev, %d max @ %d' % (

129

self.type_str, self.count, self.total_size, avg, stddev,

130

self.max_size, self.max_address)

131

132

def _add(self, memobj):

133

self.count += 1

134

self.total_size += memobj.size

135

self.sq_sum += (memobj.size * memobj.size)

136

if memobj.size > self.max_size:

137

self.max_size = memobj.size

138

self.max_address = memobj.address

139

140

141

class _ObjSummary(object):

142

"""Tracks the summary stats about objects listed."""

143

144

def __init__(self):

145

self.type_summaries = {}

146

self.total_count = 0

147

self.total_size = 0

148

self.summaries = None

149

150

def _add(self, memobj):

151

try:

152

type_summary = self.type_summaries[memobj.type_str]

153

except KeyError:

154

type_summary = _TypeSummary(memobj.type_str)

155

self.type_summaries[memobj.type_str] = type_summary

156

type_summary._add(memobj)

157

self.total_count += 1

158

self.total_size += memobj.size

159

160

def __repr__(self):

161

if self.summaries is None:

162

self.by_size()

163

out = [

164

'Total %d objects, %d types, Total size = %.1fMiB (%d bytes)'

165

% (self.total_count, len(self.summaries), self.total_size / 1024. / 1024,

166

self.total_size),

167

' Index Count % Size % Cum Max Kind'

168

]

169

cumulative = 0

170

for i in xrange(min(20, len(self.summaries))):

171

summary = self.summaries[i]

172

cumulative += summary.total_size

173

out.append(

174

'%6d%8d%4d%10d%4d%4d%8d %s'

175

% (i, summary.count, summary.count * 100.0 / self.total_count,

176

summary.total_size,

177

summary.total_size * 100.0 / self.total_size,

178

cumulative * 100.0 / self.total_size, summary.max_size,

179

summary.type_str))

180

return '\n'.join(out)

181

182

def by_size(self):

183

summaries = sorted(self.type_summaries.itervalues(),

184

key=lambda x: (x.total_size, x.count),

185

reverse=True)

186

self.summaries = summaries

187

188

def by_count(self):

189

summaries = sorted(self.type_summaries.itervalues(),

190

key=lambda x: (x.count, x.total_size),

191

reverse=True)

192

self.summaries = summaries

193

194

195

class ObjManager(object):

196

"""Manage the collection of MemObjects.

197

198

This is the interface for doing queries, etc.

199

"""

200

201

def __init__(self, objs, show_progress=True):

202

self.objs = objs

203

self.show_progress = show_progress

204

205

def __getitem__(self, address):

206

return self.objs[address]

207

208

def compute_referrers(self):

209

"""Deprecated, use compute_parents instead."""

210

warn.deprecated('.compute_referrers is deprecated.'

211

' Use .compute_parents instead.')

212

return self.compute_parents()

213

214

def compute_parents(self):

215

"""For each object, figure out who is referencing it."""

216

parents = {}

217

get_refs = parents.get

218

total = len(self.objs)

219

tlast = timer()-20

220

enabled = gc.isenabled()

221

if enabled:

222

# We create a *lot* of temporary objects here, which are all

223

# cleaned up perfectly by refcounting, so disable gc for this loop.

224

gc.disable()

225

try:

226

for idx, obj in enumerate(self.objs.itervalues()):

227

if self.show_progress and idx & 0x3f == 0:

228

tnow = timer()

229

if tnow - tlast > 0.1:

230

tlast = tnow

231

sys.stderr.write('compute parents %8d / %8d \r'

232

% (idx, total))

233

address = obj.address

234

for ref in obj.children:

235

refs = get_refs(ref, None)

236

# This is ugly, so it should be explained.

237

# To save memory pressure, parents will point to one of 4

238

# types.

239

# 1) A simple integer, representing a single referrer

240

# this saves the allocation of a separate structure

241

# entirely

242

# 2) A tuple, slightly more efficient than a list, but

243

# requires creating a new tuple to 'add' an entry.

244

# 3) A list, as before, for things with lots of

245

# parents, we use a regular list to let it grow.

246

# 4) None, no references from this object

247

t = type(refs)

248

if refs is None:

249

refs = address

250

elif t in (int, long):

251

refs = (refs, address)

252

elif t is tuple:

253

if len(refs) >= 10:

254

refs = list(refs)

255

refs.append(address)

256

else:

257

refs = refs + (address,)

258

elif t is list:

259

refs.append(address)

260

else:

261

raise TypeError('unknown refs type: %s\n' % (t,))

262

parents[ref] = refs

263

if self.show_progress:

264

sys.stderr.write('compute parents %8d / %8d \r'

265

% (idx, total))

266

for idx, obj in enumerate(self.objs.itervalues()):

267

if self.show_progress and idx & 0x3f == 0:

268

tnow = timer()

269

if tnow - tlast > 0.1:

270

tlast = tnow

271

sys.stderr.write('set parents %8d / %8d \r'

272

% (idx, total))

273

try:

274

refs = parents.pop(obj.address)

275

except KeyError:

276

obj.parents = ()

277

else:

278

if refs is None:

279

obj.parents = ()

280

elif type(refs) in (int, long):

281

obj.parents = (refs,)

282

else:

283

obj.parents = refs

284

finally:

285

if enabled:

286

gc.enable()

287

if self.show_progress:

288

sys.stderr.write('set parents %8d / %8d \n'

289

% (idx, total))

290

291

def remove_expensive_references(self):

292

"""Filter out references that are mere houskeeping links.

293

294

module.__dict__ tends to reference lots of other modules, which in turn

295

brings in the global reference cycle. Going further

296

function.__globals__ references module.__dict__, so it *too* ends up in

297

the global cycle. Generally these references aren't interesting, simply

298

because they end up referring to *everything*.

299

300

We filter out any reference to modules, frames, types, function globals

301

pointers & LRU sideways references.

302

"""

303

source = lambda:self.objs.itervalues()

304

total_objs = len(self.objs)

305

# Add the 'null' object

306

self.objs.add(0, '<ex-reference>', 0, [])

307

for changed, obj in remove_expensive_references(source, total_objs,

308

self.show_progress):

309

continue

310

311

def _compute_total_size(self, obj):

312

pending_descendents = list(obj.children)

313

seen = _intset.IDSet()

314

seen.add(obj.address)

315

total_size = obj.size

316

while pending_descendents:

317

next_ref = pending_descendents.pop()

318

if next_ref in seen:

319

continue

320

seen.add(next_ref)

321

next_obj = self.objs.get(next_ref, None)

322

if next_obj is None:

323

continue

324

# type and frame types tend to cause us to recurse into

325

# everything. So for now, when we encounter them, don't add

326

# their references

327

total_size += next_obj.size

328

pending_descendents.extend([ref for ref in next_obj.children

329

if ref not in seen])

330

## count = len(seen)

331

## # This single object references more than 10% of all objects, and

332

## # expands to more that 10x its direct references

333

## if count > obj.num_refs * 10 and count > break_on:

334

## import pdb; pdb.set_trace()

335

obj.total_size = total_size

336

return obj

337

338

def compute_total_size(self):

339

"""This computes the total bytes referenced from this object."""

340

# Unfortunately, this is an N^2 operation :(. The problem is that

341

# a.total_size + b.total_size != c.total_size (if c references a & b).

342

# This is because a & b may refer to common objects. Consider something

343

# like:

344

# A _

345

# / \ / \

346

# B C |

347

# \ / /

348

# D--'

349

# D & C participate in a refcycle, and B has an alternative path to D.

350

# You certainly don't want to count D 2 times when computing the total

351

# size of A. Also, how do you give the relative contribution of B vs C

352

# in this graph?

353

total = len(self.objs)

354

break_on = total / 10

355

for idx, obj in enumerate(self.objs.itervalues()):

356

if self.show_progress and idx & 0x1ff == 0:

357

sys.stderr.write('compute size %8d / %8d \r'

358

% (idx, total))

359

self._compute_total_size(obj)

360

if self.show_progress:

361

sys.stderr.write('compute size %8d / %8d \n'

362

% (idx, total))

363

364

def summarize(self):

365

summary = _ObjSummary()

366

for obj in self.objs.itervalues():

367

summary._add(obj)

368

return summary

369

370

def get_all(self, type_str):

371

"""Return all objects that match a given type."""

372

all = [o for o in self.objs.itervalues() if o.type_str == type_str]

373

all.sort(key=lambda x:(x.size, len(x), x.num_parents),

374

reverse=True)

375

return all

376

377

def collapse_instance_dicts(self):

378

"""Hide the __dict__ member of instances.

379

380

When a class does not have __slots__ defined, all instances get a

381

separate '__dict__' attribute that actually holds their contents. This

382

adds a level of indirection that can make it harder than it needs to

383

be, to actually find what instance holds what objects.

384

385

So we collapse those references back into the object, and grow its

386

'size' at the same time.

387

"""

388

# The instances I'm focusing on have a custom type name, and every

389

# instance has 2 pointers. The first is to __dict__, and the second is

390

# to the 'type' object whose name matches the type of the instance.

391

# Also __dict__ has only 1 referrer, and that is *this* object

392

393

# TODO: Handle old style classes. They seem to have type 'instanceobj',

394

# and reference a 'classobj' with the actual type name

395

collapsed = 0

396

total = len(self.objs)

397

tlast = timer()-20

398

for item_idx, (address, obj) in enumerate(self.objs.items()):

399

if obj.type_str in ('str', 'dict', 'tuple', 'list', 'type',

400

'function', 'wrapper_descriptor',

401

'code', 'classobj', 'int',

402

'weakref'):

403

continue

404

if self.show_progress and item_idx & 0x3f:

405

tnow = timer()

406

if tnow - tlast > 0.1:

407

tlast = tnow

408

sys.stderr.write('checked %8d / %8d collapsed %8d \r'

409

% (item_idx, total, collapsed))

410

if obj.type_str == 'module' and len(obj) == 1:

411

(dict_obj,) = obj

412

if dict_obj.type_str != 'dict':

413

continue

414

extra_refs = []

415

else:

416

if len(obj) != 2:

417

continue

418

obj_1, obj_2 = obj

419

if obj_1.type_str == 'dict' and obj_2.type_str == 'type':

420

# This is a new-style class

421

dict_obj = obj_1

422

type_obj = obj_2

423

elif (obj.type_str == 'instance'

424

and obj_1.type_str == 'classobj'

425

and obj_2.type_str == 'dict'):

426

# This is an old-style class

427

type_obj = obj_1

428

dict_obj = obj_2

429

else:

430

continue

431

extra_refs = [type_obj.address]

432

if (dict_obj.num_parents != 1

433

or dict_obj.parents[0] != address):

434

continue

435

collapsed += 1

436

# We found an instance \o/

437

new_refs = list(dict_obj.children)

438

new_refs.extend(extra_refs)

439

obj.children = new_refs

440

obj.size = obj.size + dict_obj.size

441

obj.total_size = 0

442

if obj.type_str == 'instance':

443

obj.type_str = type_obj.value

444

# Now that all the data has been moved into the instance, remove

445

# the dict from the collection

446

del self.objs[dict_obj.address]

447

if self.show_progress:

448

sys.stderr.write('checked %8d / %8d collapsed %8d \n'

449

% (item_idx, total, collapsed))

450

if collapsed:

451

self.compute_parents()

452

453

def refs_as_dict(self, obj):

454

"""Expand the ref list considering it to be a 'dict' structure.

455

456

Often we have dicts that point to simple strings and ints, etc. This

457

tries to expand that as much as possible.

458

459

:param obj: Should be a MemObject representing an instance (that has

460

been collapsed) or a dict.

461

"""

462

return obj.refs_as_dict()

463

464

def refs_as_list(self, obj):

465

"""Expand the ref list, considering it to be a list structure."""

466

as_list = []

467

children = obj.children

468

for addr in children:

469

val = self.objs[addr]

470

if val.type_str == 'bool':

471

val = (val.value == 'True')

472

elif val.value is not None:

473

val = val.value

474

elif val.type_str == 'NoneType':

475

val = None

476

as_list.append(val)

477

return as_list

478

479

480

481

def load(source, using_json=None, show_prog=True):

482

"""Load objects from the given source.

483

484

:param source: If this is a string, we will open it as a file and read all

485

objects. For any other type, we will simply iterate and parse objects

486

out, so the object should be an iterator of json lines.

487

:param using_json: Use simplejson rather than the regex. This allows

488

arbitrary ordered json dicts to be parsed but still requires per-line

489

layout. Set to 'False' to indicate you want to use the regex, set to

490

'True' to force using simplejson. None will probe to see if simplejson

491

is available, and use it if it is. (With _speedups built, simplejson

492

parses faster and more accurately than the regex.)

493

:param show_prog: If True, display the progress as we read in data

494

"""

495

cleanup = None

496

if isinstance(source, str):

497

source, cleanup = files.open_file(source)

498

if isinstance(source, file):

499

input_size = os.fstat(source.fileno()).st_size

500

else:

501

input_size = 0

502

elif isinstance(source, (list, tuple)):

503

input_size = sum(map(len, source))

504

else:

505

input_size = 0

506

if using_json is None:

507

using_json = (simplejson is not None)

508

try:

509

return _load(source, using_json, show_prog, input_size)

510

finally:

511

if cleanup is not None:

512

cleanup()

513

514

515

def iter_objs(source, using_json=False, show_prog=False, input_size=0,

516

objs=None, factory=None):

517

"""Iterate MemObjects from json.

518

519

:param source: A line iterator.

520

:param using_json: Use simplejson. See load().

521

:param show_prog: Show progress.

522

:param input_size: The size of the input if known (in bytes) or 0.

523

:param objs: Either None or a dict containing objects by address. If not

524

None, then duplicate objects will not be parsed or output.

525

:param factory: Use this to create new instances, if None, use

526

_loader._MemObjectProxy.from_args

527

:return: A generator of memory objects.

528

"""

529

# TODO: cStringIO?

530

tstart = timer()

531

input_mb = input_size / 1024. / 1024.

532

temp_cache = {}

533

address_re = re.compile(

534

r'{"address": (?P<address>\d+)'

535

)

536

bytes_read = count = 0

537

last = 0

538

mb_read = 0

539

if using_json:

540

decoder = _from_json

541

else:

542

decoder = _from_line

543

if factory is None:

544

factory = _loader._MemObjectProxy_from_args

545

for line_num, line in enumerate(source):

546

bytes_read += len(line)

547

if line in ("[\n", "]\n"):

548

continue

549

if line.endswith(',\n'):

550

line = line[:-2]

551

if objs:

552

# Skip duplicate objects

553

m = address_re.match(line)

554

if not m:

555

continue

556

address = int(m.group('address'))

557

if address in objs:

558

continue

559

yield decoder(factory, line, temp_cache=temp_cache)

560

if show_prog and (line_num - last > 5000):

561

last = line_num

562

mb_read = bytes_read / 1024. / 1024

563

tdelta = timer() - tstart

564

sys.stderr.write(

565

'loading... line %d, %d objs, %5.1f / %5.1f MiB read in %.1fs\r'

566

% (line_num, len(objs), mb_read, input_mb, tdelta))

567

if show_prog:

568

mb_read = bytes_read / 1024. / 1024

569

tdelta = timer() - tstart

570

sys.stderr.write(

571

'loaded line %d, %d objs, %5.1f / %5.1f MiB read in %.1fs \n'

572

% (line_num, len(objs), mb_read, input_mb, tdelta))

573

574

575

def _load(source, using_json, show_prog, input_size):

576

objs = _loader.MemObjectCollection()

577

for memobj in iter_objs(source, using_json, show_prog, input_size, objs,

578

factory=objs.add):

579

# objs.add automatically adds the object as it is created

580

pass

581

return ObjManager(objs, show_progress=show_prog)

582

583

584

def remove_expensive_references(source, total_objs=0, show_progress=False):

585

"""Filter out references that are mere houskeeping links.

586

587

module.__dict__ tends to reference lots of other modules, which in turn

588

brings in the global reference cycle. Going further

589

function.__globals__ references module.__dict__, so it *too* ends up in

590

the global cycle. Generally these references aren't interesting, simply

591

because they end up referring to *everything*.

592

593

We filter out any reference to modules, frames, types, function globals

594

pointers & LRU sideways references.

595

596

:param source: A callable that returns an iterator of MemObjects. This

597

will be called twice.

598

:param total_objs: The total objects to be filtered, if known. If

599

show_progress is False or the count of objects is unknown, 0.

600

:return: An iterator of (changed, MemObject) objects with expensive

601

references removed.

602

"""

603

# First pass, find objects we don't want to reference any more

604

noref_objs = _intset.IDSet()

605

lru_objs = _intset.IDSet()

606

total_steps = total_objs * 2

607

seen_zero = False

608

for idx, obj in enumerate(source()):

609

# 'module's have a single __dict__, which tends to refer to other

610

# modules. As you start tracking into that, you end up getting into

611

# reference cycles, etc, which generally ends up referencing every

612

# object in memory.

613

# 'frame' also tends to be self referential, and a single frame

614

# ends up referencing the entire current state

615

# 'type' generally is self referential through several attributes.

616

# __bases__ means we recurse all the way up to object, and object

617

# has __subclasses__, which means we recurse down into all types.

618

# In general, not helpful for debugging memory consumption

619

if show_progress and idx & 0x1ff == 0:

620

sys.stderr.write('finding expensive refs... %8d / %8d \r'

621

% (idx, total_steps))

622

if obj.type_str in ('module', 'frame', 'type'):

623

noref_objs.add(obj.address)

624

if obj.type_str == '_LRUNode':

625

lru_objs.add(obj.address)

626

if obj.address == 0:

627

seen_zero = True

628

# Second pass, any object which refers to something in noref_objs will

629

# have that reference removed, and replaced with the null_memobj

630

num_expensive = len(noref_objs)

631

null_memobj = _loader._MemObjectProxy_from_args(0, '<ex-reference>', 0, [])

632

if not seen_zero:

633

yield (True, null_memobj)

634

if show_progress and total_objs == 0:

635

total_objs = idx

636

total_steps = total_objs * 2

637

for idx, obj in enumerate(source()):

638

if show_progress and idx & 0x1ff == 0:

639

sys.stderr.write('removing %d expensive refs... %8d / %8d \r'

640

% (num_expensive, idx + total_objs,

641

total_steps))

642

if obj.type_str == 'function':

643

# Functions have a reference to 'globals' which is not very

644

# helpful for having a clear understanding of what is going on

645

# especially since the function itself is in its own globals

646

# XXX: This is probably not a guaranteed order, but currently

647

# func_traverse returns:

648

# func_code, func_globals, func_module, func_defaults,

649

# func_doc, func_name, func_dict, func_closure

650

# We want to remove the reference to globals and module

651

refs = list(obj.children)

652

obj.children = refs[:1] + refs[3:] + [0]

653

yield (True, obj)

654

continue

655

elif obj.type_str == '_LRUNode':

656

# We remove the 'sideways' references

657

obj.children = [ref for ref in obj.children

658

if ref not in lru_objs]

659

yield (True, obj)

660

continue

661

for ref in obj.children:

662

if ref in noref_objs:

663

break

664

else:

665

# No bad references, keep going

666

yield (False, obj)

667

continue

668

new_ref_list = [ref for ref in obj.children

669

if ref not in noref_objs]

670

new_ref_list.append(0)

671

obj.children = new_ref_list

672

yield (True, obj)

673

if show_progress:

674

sys.stderr.write('removed %d expensive refs from %d objs%s\n'

675

% (num_expensive, total_objs, ' '*20))