~ubuntu-branches/ubuntu/trusty/python-meld3/trusty : revision 1

1

import htmlentitydefs

2

import os

3

import re

4

import types

5

import mimetools

6

from StringIO import StringIO

7

8

try:

9

from elementtree.ElementTree import TreeBuilder

10

from elementtree.ElementTree import XMLTreeBuilder

11

from elementtree.ElementTree import Comment

12

from elementtree.ElementTree import ProcessingInstruction

13

from elementtree.ElementTree import QName

14

from elementtree.ElementTree import _raise_serialization_error

15

from elementtree.ElementTree import _namespace_map

16

from elementtree.ElementTree import fixtag

17

from elementtree.ElementTree import parse as et_parse

18

from elementtree.ElementTree import ElementPath

19

except ImportError:

20

from xml.etree.ElementTree import TreeBuilder

21

from xml.etree.ElementTree import XMLTreeBuilder

22

from xml.etree.ElementTree import Comment

23

from xml.etree.ElementTree import ProcessingInstruction

24

from xml.etree.ElementTree import QName

25

from xml.etree.ElementTree import _raise_serialization_error

26

from xml.etree.ElementTree import _namespace_map

27

from xml.etree.ElementTree import fixtag

28

from xml.etree.ElementTree import parse as et_parse

29

from xml.etree.ElementTree import ElementPath

30

31

# HTMLTreeBuilder does not exist in python 2.5 standard elementtree

32

from HTMLParser import HTMLParser

33

# Tags the HTML parser closes implicitly when the same tag is opened again
# while it is still on top of the open-tag stack.
AUTOCLOSE = ("p", "li", "tr", "th", "td", "head", "body")

# Tags the HTML parser ends immediately after their start tag; an explicit
# end tag for any of them is ignored.
IGNOREEND = ("img", "hr", "meta", "link", "br")

# The unicode pattern is produced by eval'ing a fixed literal (never
# external input); is_not_ascii(s) returns a match object when 's' holds a
# character in the range U+0080..U+FFFF, otherwise None.
_non_ascii_re = re.compile(eval(r'u"[\u0080-\uffff]"'))
is_not_ascii = _non_ascii_re.search

36

37

# replace element factory

38

def Replace(text, structure=False):
    """Create a replacement node.

    The node is a _MeldElementInterface whose tag is the Replace function
    itself (serializers test 'tag is Replace') and whose attribute dict is
    empty.

    text      - the text the node carries
    structure - flag stored on the node for use at render time
    """
    node = _MeldElementInterface(Replace, {})
    node.text, node.structure = text, structure
    return node

43

44

class IO:
    """Tiny in-memory, write-only buffer that accumulates into 'self.data'."""

    def __init__(self):
        self.data = ""

    def write(self, data):
        """Append 'data' to the buffer."""
        self.data = self.data + data

    def getvalue(self):
        """Return everything written since creation or the last clear()."""
        return self.data

    def clear(self):
        """Reset the buffer to the empty string."""
        self.data = ""

56

57

class PyHelper:
    """Pure-Python implementation of the tree helper operations.

    An instance of this class is used as 'helper' whenever the optional
    C implementation is unavailable or disabled.
    """

    def findmeld(self, node, name, default=None):
        """Return the first node at or below 'node' (document order) whose
        meld id attribute equals 'name'; return 'default' if none does."""
        for candidate in self.getiterator(node):
            if candidate.attrib.get(_MELD_ID) == name:
                return candidate
        return default

    def _shallow_copy(self, node):
        # New element with the same tag, a copy of the attribute dict and
        # the same text/tail/structure; children and parent are left to
        # the caller.
        twin = _MeldElementInterface(node.tag, node.attrib.copy())
        twin.text = node.text
        twin.tail = node.tail
        twin.structure = node.structure
        return twin

    def clone(self, node, parent=None):
        """Recursively deep-copy 'node'; when 'parent' is given the copy is
        appended to it. Returns the copy."""
        # NOTE: this is not implemented by the C version (it used to be
        # but I don't want to maintain it)
        twin = self._shallow_copy(node)
        if parent is not None:
            # avoid calling parent.append to reduce function call overhead
            parent._children.append(twin)
            twin.parent = parent
        for child in node._children:
            self.clone(child, twin)
        return twin

    def _bfclone(self, nodes, parent):
        # Copy every node in 'nodes' (with its subtree) and install the
        # copies as the complete child list of 'parent' in one assignment.
        copies = []
        for node in nodes:
            twin = self._shallow_copy(node)
            twin.parent = parent
            if node._children:
                self._bfclone(node._children, twin)
            copies.append(twin)
        parent._children = copies

    def bfclone(self, node, parent=None):
        """Deep-copy 'node', set the copy's parent to 'parent' and, when
        'parent' is not None, append the copy to it. Returns the copy."""
        twin = self._shallow_copy(node)
        twin.parent = parent
        if parent is not None:
            parent._children.append(twin)
        if node._children:
            self._bfclone(node._children, twin)
        return twin

    def getiterator(self, node, tag=None):
        """Return a list of 'node' and all its descendants in document
        (pre-)order. If 'tag' is neither None nor "*", only nodes whose tag
        equals 'tag' are included."""
        match_all = tag is None or tag == "*"
        found = []
        pending = [node]
        while pending:
            current = pending.pop()
            if match_all or current.tag == tag:
                found.append(current)
            # push children reversed so the first child is popped first
            pending.extend(reversed(current._children))
        return found

    def content(self, node, text, structure=False):
        """Clear the text of 'node' and make a single Replace node carrying
        'text' and 'structure' its only child."""
        node.text = None
        filler = Replace(text, structure)
        filler.parent = node
        # explicit assignment of the payload, as the factory also does
        filler.text = text
        filler.structure = structure
        node._children = [filler]

123

124

pyhelper = PyHelper()

# The C accelerator is optional; fall back silently when it is missing.
try:
    import cmeld3 as chelper
except ImportError:
    chelper = None

# Use the C helper when it imported, unless the MELD3_PYIMPL environment
# variable is set to a non-empty value, which forces the Python helper.
if not chelper or os.getenv('MELD3_PYIMPL'):
    helper = pyhelper
else:
    helper = chelper

135

136

# meld namespace: URL, Clark-notation prefix, and the id attribute name in
# both its Clark ('{url}id') and short ('meld:id') spellings
_MELD_NS_URL = 'http://www.plope.com/software/meld3'
_MELD_PREFIX = '{' + _MELD_NS_URL + '}'
_MELD_LOCAL = 'id'
_MELD_ID = _MELD_PREFIX + _MELD_LOCAL
_MELD_SHORT_ID = 'meld:' + _MELD_LOCAL

# XHTML namespace: URL, Clark-notation prefix and the prefix length (used
# to slice the prefix off tag names)
_XHTML_NS_URL = 'http://www.w3.org/1999/xhtml'
_XHTML_PREFIX = '{' + _XHTML_NS_URL + '}'
_XHTML_PREFIX_LEN = len(_XHTML_PREFIX)

# NOTE(review): presumably a unique "no value" sentinel; its users are not
# visible in this part of the file
_marker = []

147

148

class doctype:
    """Lookup table of (name, public id, system id) doctype triples, for
    ease of use in external code."""

    html_strict = (
        'HTML',
        '-//W3C//DTD HTML 4.01//EN',
        'http://www.w3.org/TR/html4/strict.dtd',
    )
    html = (
        'HTML',
        '-//W3C//DTD HTML 4.01 Transitional//EN',
        'http://www.w3.org/TR/html4/loose.dtd',
    )
    xhtml_strict = (
        'html',
        '-//W3C//DTD XHTML 1.0 Strict//EN',
        'http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd',
    )
    xhtml = (
        'html',
        '-//W3C//DTD XHTML 1.0 Transitional//EN',
        'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd',
    )

158

159

class _MeldElementInterface:
    """Element node used in meld trees.

    Offers the usual element API (tag, attrib, text, tail, sequence-style
    child access, find*/get/set) and adds a 'parent' pointer that is kept
    up to date by every child-mutating method, a 'structure' flag, and the
    meld-specific fill/find/repeat/replace/write helpers.
    """
    parent = None
    attrib = None
    text = None
    tail = None
    structure = None
    Replace = [Replace] # this is used by C code

    # basic element API

    def __init__(self, tag, attrib):
        self.tag = tag
        self.attrib = attrib
        self._children = []

    def __repr__(self):
        return "<MeldElement %s at %x>" % (self.tag, id(self))

    def __len__(self):
        return len(self._children)

    def __getitem__(self, index):
        return self._children[index]

    def __getslice__(self, start, stop):
        return self._children[start:stop]

    def getchildren(self):
        """Return the live list of child nodes (not a copy)."""
        return self._children

    def find(self, path):
        return ElementPath.find(self, path)

    def findtext(self, path, default=None):
        return ElementPath.findtext(self, path, default)

    def findall(self, path):
        return ElementPath.findall(self, path)

    def clear(self):
        """Empty the attribute dict, drop all children and reset text and
        tail to None."""
        self.attrib.clear()
        self._children = []
        self.text = self.tail = None

    def get(self, key, default=None):
        return self.attrib.get(key, default)

    def set(self, key, value):
        self.attrib[key] = value

    def keys(self):
        return self.attrib.keys()

    def items(self):
        return self.attrib.items()

    def getiterator(self, *ignored_args, **ignored_kw):
        # we ignore any tag= passed in to us, because it's too painful
        # to support in our C version
        return helper.getiterator(self)

    # overrides to support parent pointers and factories

    def __setitem__(self, index, element):
        self._children[index] = element
        element.parent = self

    def __setslice__(self, start, stop, elements):
        for element in elements:
            element.parent = self
        self._children[start:stop] = list(elements)

    def append(self, element):
        self._children.append(element)
        element.parent = self

    def insert(self, index, element):
        self._children.insert(index, element)
        element.parent = self

    def __delitem__(self, index):
        ob = self._children[index]
        ob.parent = None
        del self._children[index]

    def __delslice__(self, start, stop):
        for ob in self._children[start:stop]:
            ob.parent = None
        del self._children[start:stop]

    def remove(self, element):
        self._children.remove(element)
        element.parent = None

    def makeelement(self, tag, attrib):
        return self.__class__(tag, attrib)

    # meld-specific

    def __mod__(self, other):
        """ Fill in the text values of meld nodes in tree; only
        support dictionarylike operand (sequence operand doesn't seem
        to make sense here)"""
        return self.fillmelds(**other)

    def fillmelds(self, **kw):
        """ Fill in the text values of meld nodes in tree using the
        keyword arguments passed in; use the keyword keys as meld ids
        and the keyword values as text that should fill in the node
        text on which that meld id is found.  Return a list of keys
        from **kw that were not able to be found anywhere in the tree.
        """
        unfilled = []
        for k in kw:
            node = self.findmeld(k)
            if node is None:
                unfilled.append(k)
            else:
                node.text = kw[k]
        return unfilled

    def _flag_matches(self, matched, unmatched, flag):
        # Set attribute 'flag' (with value 'flag') on every matched node
        # and remove it, when present, from every unmatched node.
        for node in matched:
            node.attrib[flag] = flag
        for node in unmatched:
            node.attrib.pop(flag, None)

    def fillmeldhtmlform(self, **kw):
        """ Perform magic to 'fill in' HTML form element values from a
        dictionary.  Unlike 'fillmelds', the type of element being
        'filled' is taken into consideration.

        Perform a 'findmeld' on each key in the dictionary and use the
        value that corresponds to the key to perform mutation of the
        tree, changing data in what is presumed to be one or more HTML
        form elements according to the following rules::

          If the found element is an 'input group' (its meld id ends
          with the string ':inputgroup'), set the 'checked' attribute
          on the appropriate subelement which has a 'value' attribute
          which matches the dictionary value.  Also remove the
          'checked' attribute from every other 'input' subelement of
          the input group.  If no input subelement's value matches the
          dictionary value, this key is treated as 'unfilled'.

          If the found element is an 'input type=text', 'input
          type=hidden', 'input type=submit', 'input type=password',
          'input type=reset' or 'input type=file' element, replace its
          'value' attribute with the value.

          If the found element is an 'input type=checkbox' or 'input
          type=radio' element, set its 'checked' attribute to true if
          the dict value is true, or remove its 'checked' attribute if
          the dict value is false.

          If the found element is a 'select' element and the value
          exists in the 'value=' attribute of one of its 'option'
          subelements, change that option's 'selected' attribute to
          true and mark all other option elements as unselected.  If
          the select element does not contain an option with a value
          that matches the dictionary value, do nothing and return
          this key as unfilled.

          If the found element is a 'textarea' or any other kind of
          element, replace its text with the value.

          If the element corresponding to the key is not found,
          do nothing and treat the key as 'unfilled'.

        Return a list of 'unfilled' keys, representing meld ids
        present in the dictionary but not present in the element tree
        or meld ids which could not be filled due to the lack of any
        matching subelements for 'select' nodes or 'inputgroup' nodes.
        An 'input' element of any other type is also reported as
        unfilled.
        """
        unfilled = []

        for k in kw:
            node = self.findmeld(k)

            if node is None:
                unfilled.append(k)
                continue

            val = kw[k]

            if k.endswith(':inputgroup'):
                # an input group is a list of input type="checkbox" or
                # input type="radio" elements that can be treated as a group
                # because they attempt to specify the same value
                found = []
                unfound = []

                for child in node.findall('input'):
                    input_type = child.attrib.get('type', '').lower()
                    if input_type not in ('checkbox', 'radio'):
                        continue
                    if val == child.attrib.get('value', ''):
                        found.append(child)
                    else:
                        unfound.append(child)

                if found:
                    self._flag_matches(found, unfound, 'checked')
                else:
                    unfilled.append(k)
                continue

            tag = node.tag.lower()

            if tag == 'input':
                input_type = node.attrib.get('type', 'text').lower()

                # fill in value attrib for most input types
                if input_type in ('hidden', 'submit', 'text',
                                  'password', 'reset', 'file'):
                    node.attrib['value'] = val

                # unless it's a checkbox or radio attribute, then we
                # fill in its checked attribute
                elif input_type in ('checkbox', 'radio'):
                    if val:
                        node.attrib['checked'] = 'checked'
                    else:
                        node.attrib.pop('checked', None)
                else:
                    unfilled.append(k)

            elif tag == 'select':
                # if the node is a select node, we want to select
                # the value matching val, otherwise it's unfilled
                found = []
                unfound = []

                for option in node.findall('option'):
                    if option.attrib.get('value', '') == val:
                        found.append(option)
                    else:
                        unfound.append(option)

                if found:
                    self._flag_matches(found, unfound, 'selected')
                else:
                    unfilled.append(k)

            else:
                node.text = kw[k]

        return unfilled

    def findmeld(self, name, default=None):
        """ Find a node in the tree that has a 'meld id' corresponding
        to 'name'. Iterate over all subnodes recursively looking for a
        node which matches.  If we can't find the node, return
        'default' (None unless given)."""
        # this could be faster if we indexed all the meld nodes in the
        # tree; we just walk the whole hierarchy now.
        result = helper.findmeld(self, name)
        if result is None:
            return default
        return result

    def findmelds(self):
        """ Find all nodes that have a meld id attribute and return
        the found nodes in a list"""
        return self.findwithattrib(_MELD_ID)

    def findwithattrib(self, attrib, value=None):
        """ Find all nodes that have an attribute named 'attrib'.  If
        'value' is not None, omit nodes on which the attribute value
        does not compare equally to 'value'. Return the found nodes in
        a list."""
        elements = []
        for element in helper.getiterator(self):
            attribval = element.attrib.get(attrib)
            if attribval is None:
                continue
            if value is None or value == attribval:
                elements.append(element)
        return elements

    # ZPT-alike methods
    def repeat(self, iterable, childname=None):
        """repeats an element with values from an iterable.  If
        'childname' is None (or otherwise false), repeat the element on
        which the repeat is called, otherwise find the child element
        with a 'meld:id' matching 'childname' and repeat that.  The
        element is repeated within its parent element (nodes that are
        created as a result of a repeat share the same parent).  This
        method returns a list; each item is a two-sequence in the form
        (newelement, data).  For the first item 'newelement' is the
        template element itself; for every later item it is a clone of
        the template element (including clones of its children) which
        has already been seated in its parent element in the template.
        'data' is a value from the passed in iterable.  Changing
        'newelement' (typically based on values from 'data') mutates
        the element 'in place'."""
        if childname:
            element = self.findmeld(childname)
        else:
            element = self

        parent = element.parent
        # creating a list is faster than yielding a generator (py 2.4)
        L = []
        first = True
        for thing in iterable:
            if first is True:
                clone = element
            else:
                clone = helper.bfclone(element, parent)
            L.append((clone, thing))
            first = False
        return L

    def replace(self, text, structure=False):
        """ Replace this element with a Replace node in our parent with
        the text 'text' and return the index of our position in
        our parent.  If we have no parent, do nothing, and return None.
        Pass the 'structure' flag to the replace node so it can do the right
        thing at render time. """
        parent = self.parent
        i = self.deparent()
        if i is not None:
            # reduce function call overhead by not calling self.insert
            node = Replace(text, structure)
            parent._children.insert(i, node)
            node.parent = parent
            return i

    def content(self, text, structure=False):
        """ Delete this node's children and append a Replace node that
        contains text.  Always return None.  Pass the 'structure' flag
        to the replace node so it can do the right thing at render
        time."""
        helper.content(self, text, structure)

    def attributes(self, **kw):
        """ Set attributes on this node.  Raise ValueError if a key or a
        value is not a string type. """
        for k, v in kw.items():
            # prevent this from getting to the parser if possible
            if not isinstance(k, types.StringTypes):
                raise ValueError('do not set non-stringtype as key: %s' % k)
            if not isinstance(v, types.StringTypes):
                raise ValueError('do not set non-stringtype as val: %s' % v)
            self.attrib[k] = v

    # output methods

    def _write_output(self, file, render):
        # Write the string returned by render() to 'file'.  'file' may be
        # a filename or an object with a write() method.  A file opened
        # here is always closed again, even if rendering or writing fails;
        # a caller-supplied file object is left open.
        opened_here = not hasattr(file, "write")
        if opened_here:
            file = open(file, "wb")
        try:
            file.write(render())
        finally:
            if opened_here:
                file.close()

    def write_xmlstring(self, encoding=None, doctype=None, fragment=False,
                        declaration=True, pipeline=False):
        """ Return the XML serialization of this node as a string; the
        parameters have the same meaning as for 'write_xml'. """
        data = []
        write = data.append
        if not fragment:
            if declaration:
                _write_declaration(write, encoding)
            if doctype:
                _write_doctype(write, doctype)
        _write_xml(write, self, encoding, {}, pipeline)
        return ''.join(data)

    def write_xml(self, file, encoding=None, doctype=None,
                  fragment=False, declaration=True, pipeline=False):
        """ Write XML to 'file' (which can be a filename or filelike object)

        encoding    - encoding string (if None, 'utf-8' encoding is assumed)
                      Must be a recognizable Python encoding type.
        doctype     - 3-tuple indicating name, pubid, system of doctype.
                      The default is to prevent a doctype from being emitted.
        fragment    - True if a 'fragment' should be emitted for this node (no
                      declaration, no doctype).  This causes both the
                      'declaration' and 'doctype' parameters to become ignored
                      if provided.
        declaration - emit an xml declaration header (including an encoding
                      if it's not None).  The default is to emit the
                      declaration.
        pipeline    - preserve 'meld' namespace identifiers in output
                      for use in pipelining

        If 'file' is a filename, the file is opened for binary writing
        and closed again before returning.
        """
        self._write_output(
            file,
            lambda: self.write_xmlstring(encoding, doctype, fragment,
                                         declaration, pipeline))

    def write_htmlstring(self, encoding=None, doctype=doctype.html,
                         fragment=False):
        """ Return the HTML serialization of this node as a string; the
        parameters have the same meaning as for 'write_html'. """
        data = []
        write = data.append
        if encoding is None:
            encoding = 'utf8'
        if not fragment and doctype:
            _write_doctype(write, doctype)
        if encoding in ('utf8', 'utf-8', 'latin-1', 'latin1', 'ascii'):
            # the common encodings go through the writer that is not
            # handed an encoding
            _write_html_no_encoding(write, self, {})
        else:
            _write_html(write, self, encoding, {})
        return ''.join(data)

    def write_html(self, file, encoding=None, doctype=doctype.html,
                   fragment=False):
        """ Write HTML to 'file' (which can be a filename or filelike object)

        encoding    - encoding string (if None, 'utf-8' encoding is assumed).
                      Unlike XML output, this is not used in a declaration,
                      but it is used to do actual character encoding during
                      output.  Must be a recognizable Python encoding type.
        doctype     - 3-tuple indicating name, pubid, system of doctype.
                      The default is the value of doctype.html (HTML 4.0
                      'loose')
        fragment    - True if a "fragment" should be emitted (no doctype).
                      This overrides any provided "doctype" parameter if
                      provided.

        Namespace'd elements and attributes have their namespaces removed
        during output when writing HTML, so pipelining cannot be performed.

        HTML is not valid XML, so an XML declaration header is never emitted.

        If 'file' is a filename, the file is opened for binary writing
        and closed again before returning.
        """
        self._write_output(
            file,
            lambda: self.write_htmlstring(encoding, doctype, fragment))

    def write_xhtmlstring(self, encoding=None, doctype=doctype.xhtml,
                          fragment=False, declaration=False, pipeline=False):
        """ Return the XHTML serialization of this node as a string; the
        parameters have the same meaning as for 'write_xhtml'. """
        data = []
        write = data.append
        if not fragment:
            if declaration:
                _write_declaration(write, encoding)
            if doctype:
                _write_doctype(write, doctype)
        _write_xml(write, self, encoding, {}, pipeline, xhtml=True)
        return ''.join(data)

    def write_xhtml(self, file, encoding=None, doctype=doctype.xhtml,
                    fragment=False, declaration=False, pipeline=False):
        """ Write XHTML to 'file' (which can be a filename or filelike object)

        encoding    - encoding string (if None, 'utf-8' encoding is assumed)
                      Must be a recognizable Python encoding type.
        doctype     - 3-tuple indicating name, pubid, system of doctype.
                      The default is the value of doctype.xhtml (XHTML
                      'loose').
        fragment    - True if a 'fragment' should be emitted for this node (no
                      declaration, no doctype).  This causes both the
                      'declaration' and 'doctype' parameters to be ignored.
        declaration - emit an xml declaration header (including an encoding
                      string if 'encoding' is not None)
        pipeline    - preserve 'meld' namespace identifiers in output
                      for use in pipelining

        If 'file' is a filename, the file is opened for binary writing
        and closed again before returning.
        """
        self._write_output(
            file,
            lambda: self.write_xhtmlstring(encoding, doctype, fragment,
                                           declaration, pipeline))

    def clone(self, parent=None):
        """ Create a clone of an element.  If parent is not None,
        append the element to the parent.  Recurse as necessary to create
        a deep clone of the element. """
        return helper.bfclone(self, parent)

    def deparent(self):
        """ Remove ourselves from our parent node (de-parent) and return
        the index we occupied in the parent's child list.  Return None
        (and do nothing) if we have no parent. """
        i = self.parentindex()
        if i is not None:
            del self.parent[i]
            return i

    def parentindex(self):
        """ Return our index in our parent's child list, or None if we
        have no parent """
        parent = self.parent
        if parent is not None:
            return parent._children.index(self)

    def shortrepr(self, encoding=None):
        """ Return an HTML rendering of this node produced with
        maxdepth=2. """
        data = []
        _write_html(data.append, self, encoding, {}, maxdepth=2)
        return ''.join(data)

    def diffmeld(self, other):
        """ Compute the meld element differences from this node (the
        source) to 'other' (the target).  Return a dictionary of
        sequences in the form {'unreduced':
                                   {'added':[], 'removed':[], 'moved':[]},
                               'reduced':
                                   {'added':[], 'removed':[], 'moved':[]},}
        """
        srcelements = self.findmelds()
        tgtelements = other.findmelds()
        srcids = [x.meldid() for x in srcelements]
        tgtids = [x.meldid() for x in tgtelements]

        # in the source but not in the target
        removed = [el for el in srcelements if el.meldid() not in tgtids]

        # in the target but not in the source
        added = [el for el in tgtelements if el.meldid() not in srcids]

        # present on both sides but without shared lineage; the target
        # element is the one reported
        moved = []
        for srcelement in srcelements:
            srcid = srcelement.meldid()
            if srcid in tgtids:
                tgtelement = tgtelements[tgtids.index(srcid)]
                if not sharedlineage(srcelement, tgtelement):
                    moved.append(tgtelement)

        unreduced = {'added': added, 'removed': removed, 'moved': moved}
        reduced = {'moved': diffreduce(moved),
                   'added': diffreduce(added),
                   'removed': diffreduce(removed)}

        return {'unreduced': unreduced,
                'reduced': reduced}

    def meldid(self):
        """ Return this node's meld id attribute value (None if unset) """
        return self.attrib.get(_MELD_ID)

    def lineage(self):
        """ Return a list starting with this node followed by each
        successive parent up to the root """
        L = []
        parent = self
        while parent is not None:
            L.append(parent)
            parent = parent.parent
        return L

728

729

730

def MeldTreeBuilder():
    """Return a TreeBuilder that creates _MeldElementInterface nodes."""
    builder = TreeBuilder(element_factory=_MeldElementInterface)
    return builder

732

733

class MeldParser(XMLTreeBuilder):

    """ A parser based on Fredrik's PIParser at
    http://effbot.org/zone/element-pi.htm.  It blithely ignores the
    case of a comment existing outside the root element and ignores
    processing instructions entirely.  While parsing it also checks
    that no meld id is repeated in the source, raising ValueError on
    the first repeat. """

    def __init__(self, html=0, target=None):
        XMLTreeBuilder.__init__(self, html, target)
        # assumes ElementTree 1.2.X
        self._parser.CommentHandler = self.handle_comment
        # meld ids seen so far in the current document
        self.meldids = {}

    def handle_comment(self, data):
        # feed the comment to the target as a Comment-tagged element
        target = self._target
        target.start(Comment, {})
        target.data(data)
        target.end(Comment)

    def _remember_meldid(self, meldid):
        # Record 'meldid' as seen; raise ValueError if it was seen before.
        if self.meldids.get(meldid):
            raise ValueError('Repeated meld id "%s" in source' % meldid)
        self.meldids[meldid] = 1

    def _start(self, tag, attrib_in):
        # this is used by self._parser (an Expat parser) as
        # StartElementHandler but only if _start_list is not
        # provided... so why does this method exist?
        for key in attrib_in:
            # keys arrive without the leading '{' of the clark name
            if '{' + key == _MELD_ID:
                self._remember_meldid(attrib_in[key])
        return XMLTreeBuilder._start(self, tag, attrib_in)

    def _start_list(self, tag, attrib_in):
        # This is used by self._parser (an Expat parser)
        # as StartElementHandler.  attrib_in is a flat even-length
        # sequence of name, value pairs for all attributes.
        # See http://python.org/doc/lib/xmlparser-objects.html
        for name, value in zip(attrib_in[0::2], attrib_in[1::2]):
            # For some reason, clark names are missing the leading '{'
            if self._fixname(name) == _MELD_ID:
                self._remember_meldid(value)
        return XMLTreeBuilder._start_list(self, tag, attrib_in)

    def close(self):
        """Finish parsing, forget the seen meld ids and return whatever
        XMLTreeBuilder.close returns."""
        result = XMLTreeBuilder.close(self)
        self.meldids = {}
        return result

785

786

class HTMLMeldParser(HTMLParser):
    """ A mostly-cut-and-paste of ElementTree's HTMLTreeBuilder that
    does special meld3 things (like preserve comments and munge meld
    ids).  Subclassing is not possible due to private attributes.  :-(

    Parse events are forwarded to 'self.builder'; close() returns the
    builder's result.  ValueError is raised if a meld id is repeated in
    the source.
    """

    def __init__(self, builder=None, encoding=None):
        # stack of currently open tag names
        self.__stack = []
        if builder is None:
            builder = MeldTreeBuilder()
        self.builder = builder
        self.encoding = encoding or "iso-8859-1"
        HTMLParser.__init__(self)
        # meld ids seen so far in the current document
        self.meldids = {}

    def close(self):
        HTMLParser.close(self)
        self.meldids = {}
        return self.builder.close()

    def handle_starttag(self, tag, attrs):
        if tag == "meta":
            # look for encoding directives
            http_equiv = content = None
            for k, v in attrs:
                if k == "http-equiv":
                    http_equiv = v.lower()
                elif k == "content":
                    content = v
            if http_equiv == "content-type" and content:
                # use mimetools to parse the http header
                header = mimetools.Message(
                    StringIO("%s: %s\n\n" % (http_equiv, content))
                    )
                encoding = header.getparam("charset")
                if encoding:
                    self.encoding = encoding
        if tag in AUTOCLOSE:
            # opening an auto-closing tag while the same tag is on top of
            # the stack implicitly closes the previous one
            if self.__stack and self.__stack[-1] == tag:
                self.handle_endtag(tag)
        self.__stack.append(tag)
        attrib = {}
        if attrs:
            for k, v in attrs:
                if k == _MELD_SHORT_ID:
                    # store the short 'meld:id' form under its clark name
                    k = _MELD_ID
                    if self.meldids.get(v):
                        raise ValueError(
                            'Repeated meld id "%s" in source' % v)
                    self.meldids[v] = 1
                else:
                    k = k.lower()
                attrib[k] = v
        self.builder.start(tag, attrib)
        if tag in IGNOREEND:
            # these tags are ended right away
            self.__stack.pop()
            self.builder.end(tag)

    def handle_endtag(self, tag):
        if tag in IGNOREEND:
            return
        lasttag = self.__stack.pop()
        if tag != lasttag and lasttag in AUTOCLOSE:
            # the innermost open tag is an auto-closing one that was never
            # closed explicitly: retry against the next tag on the stack
            self.handle_endtag(lasttag)
        self.builder.end(tag)

    def _emit_codepoint(self, codepoint):
        # Send the character for 'codepoint' to the builder: chr() for
        # values below 128, unichr() for everything else.
        if 0 <= codepoint < 128:
            self.builder.data(chr(codepoint))
        else:
            self.builder.data(unichr(codepoint))

    def handle_charref(self, char):
        """Handle a numeric character reference.  'char' is the text
        between '&#' and ';': decimal digits, or 'x'/'X' followed by hex
        digits."""
        # both 'x' and 'X' introduce a hexadecimal reference
        if char[:1] in ("x", "X"):
            codepoint = int(char[1:], 16)
        else:
            codepoint = int(char)
        self._emit_codepoint(codepoint)

    def handle_entityref(self, name):
        """Handle a named entity reference; names missing from
        htmlentitydefs.entitydefs go to unknown_entityref()."""
        entity = htmlentitydefs.entitydefs.get(name)
        if not entity:
            self.unknown_entityref(name)
            return
        if len(entity) == 1:
            codepoint = ord(entity)
        else:
            # multi-character definitions have the form '&#NNN;'
            codepoint = int(entity[2:-1])
        self._emit_codepoint(codepoint)

    def handle_data(self, data):
        if isinstance(data, str) and is_not_ascii(data):
            # convert to unicode, but only if necessary
            data = unicode(data, self.encoding, "ignore")
        self.builder.data(data)

    def unknown_entityref(self, name):
        pass # ignore by default; override if necessary

    def handle_comment(self, data):
        # feed the comment to the builder as a Comment-tagged element
        self.builder.start(Comment, {})
        self.builder.data(data)
        self.builder.end(Comment)

888

889

def do_parse(source, parser):
    """Parse 'source' with 'parser' and return the root node.

    After parsing, every node reachable from the root is visited and the
    'parent' attribute of each of its children is set to that node.
    """
    tree = et_parse(source, parser=parser)
    root = tree.getroot()
    for parent in root.getiterator():
        for child in parent:
            child.parent = parent
    return root

896

897

def parse_xml(source):
    """ Parse source (a filelike object) into an element tree using a
    MeldParser that feeds a MeldTreeBuilder, and return the root
    node."""
    parser = MeldParser(target=MeldTreeBuilder())
    return do_parse(source, parser)

904

905

def parse_html(source, encoding=None):
    """ Parse source (a filelike object) into an element tree using an
    HTMLMeldParser that feeds a MeldTreeBuilder, and return the root
    node.  'encoding' is handed to the parser unchanged."""
    parser = HTMLMeldParser(MeldTreeBuilder(), encoding)
    return do_parse(source, parser)

909

910

def parse_xmlstring(text):
    """ Parse the XML held in the string 'text'; see parse_xml. """
    return parse_xml(StringIO(text))

913

914

def parse_htmlstring(text, encoding=None):
    """ Parse the HTML held in the string 'text'; see parse_html. """
    return parse_html(StringIO(text), encoding)

917

918

# Search callables: each returns a match object when the given text holds
# a character from its set, otherwise None.
_attrib_special_re = re.compile(r'[&"<]')
_cdata_special_re = re.compile(r'[&<]')
attrib_needs_escaping = _attrib_special_re.search
cdata_needs_escaping = _cdata_special_re.search

920

921

def _both_case(mapping):
    """Add equivalent upper-case keys to 'mapping', in place.

    For every key present on entry, key.upper() is stored with the same
    value.  Returns None.
    """
    # Snapshot the keys first: on Python 3, keys() is a live view and
    # inserting while iterating it raises RuntimeError.
    for key in list(mapping):
        mapping[key.upper()] = mapping[key]

926

927

928

# Each table maps a name to 1 and is given upper-case twins of all its
# keys, so lookups work for either spelling.

# NOTE(review): presumably the tags that are written without a closing
# tag; the code consuming this table is not visible here
_HTMLTAGS_UNBALANCED = dict.fromkeys(
    ('area', 'base', 'basefont', 'br', 'col', 'frame', 'hr', 'img',
     'input', 'isindex', 'link', 'meta', 'param'), 1)
_both_case(_HTMLTAGS_UNBALANCED)

# tags whose text the HTML writer emits without escaping
_HTMLTAGS_NOESCAPE = dict.fromkeys(('script', 'style'), 1)
_both_case(_HTMLTAGS_NOESCAPE)

# attributes the HTML writer emits as a bare name, without a value
_HTMLATTRS_BOOLEAN = dict.fromkeys(
    ('selected', 'checked', 'compact', 'declare', 'defer', 'disabled',
     'ismap', 'multiple', 'nohref', 'noresize', 'noshade', 'nowrap'), 1)
_both_case(_HTMLATTRS_BOOLEAN)

940

941

def _write_html(write, node, encoding, namespaces, depth=-1, maxdepth=None):
    """Serialize ``node`` and its subtree as HTML through ``write``.

    write      -- callable invoked with each chunk of output.
    node       -- element to serialize; its ``tag``, ``text``, ``tail``,
                  ``attrib`` and ``_children`` are read.
    encoding   -- codec name used to encode output; None means 'utf-8'.
    namespaces -- namespace map handed to ``fixtag`` for tags in a
                  namespace other than XHTML.
    depth, maxdepth -- when ``maxdepth`` is not None, children are only
                  descended into while ``depth`` stays below it.

    Raises whatever ``_raise_serialization_error`` raises when a tag or
    attribute name cannot be sliced like a string.
    """
    if encoding is None:
        encoding = 'utf-8'

    tag = node.tag
    text = node.text
    tail = node.tail

    to_write = ""

    if tag is Replace:
        # Replacement text is escaped unless it was flagged as structure
        # (i.e. markup to be emitted verbatim).
        if not node.structure:
            if cdata_needs_escaping(text):
                text = _escape_cdata(text)
        write(text.encode(encoding))

    elif tag is Comment:
        if cdata_needs_escaping(text):
            text = _escape_cdata(text)
        # The whole comment is encoded, matching the Replace branch.
        write(('<!-- ' + text + ' -->').encode(encoding))

    elif tag is ProcessingInstruction:
        if cdata_needs_escaping(text):
            text = _escape_cdata(text)
        # Processing instructions are emitted as comments in HTML output.
        write(('<!-- ' + text + ' -->').encode(encoding))

    else:
        xmlns_items = []  # new namespaces in this scope
        try:
            if tag[:1] == "{":
                if tag[:_XHTML_PREFIX_LEN] == _XHTML_PREFIX:
                    # XHTML-namespaced tags are written unqualified.
                    tag = tag[_XHTML_PREFIX_LEN:]
                else:
                    tag, xmlns = fixtag(tag, namespaces)
                    if xmlns:
                        xmlns_items.append(xmlns)
        except TypeError:
            _raise_serialization_error(tag)

        to_write += "<%s" % tag.encode(encoding)

        attrib = node.attrib

        if attrib is not None:
            # Sort for deterministic output; nothing to sort for 0/1 keys.
            if len(attrib) > 1:
                attrib_keys = sorted(attrib)
            else:
                attrib_keys = attrib
            for k in attrib_keys:
                try:
                    if k[:1] == "{":
                        # Namespace-qualified attributes are not written.
                        continue
                except TypeError:
                    _raise_serialization_error(k)
                if k in _HTMLATTRS_BOOLEAN:
                    to_write += ' ' + k.encode(encoding)
                else:
                    v = attrib[k]
                    # NOTE(review): the value is written without escaping
                    # '"', '&' or '<' -- confirm callers pre-escape it.
                    to_write += " %s=\"%s\"" % (k, v)

        for k, v in xmlns_items:
            to_write += " %s=\"%s\"" % (k, v)

        to_write += ">"

        if text is not None and text:
            if tag in _HTMLTAGS_NOESCAPE:
                to_write += text.encode(encoding)
            elif cdata_needs_escaping(text):
                to_write += _escape_cdata(text)
            else:
                to_write += text.encode(encoding)

        write(to_write)

        for child in node._children:
            if maxdepth is not None:
                # NOTE(review): depth grows once per child visited, not
                # once per tree level, so later siblings count as deeper
                # than earlier ones -- confirm this is intended.
                depth = depth + 1
                if depth < maxdepth:
                    _write_html(write, child, encoding, namespaces, depth,
                                maxdepth)
                elif depth == maxdepth and text:
                    write(' [...]\n')
            else:
                _write_html(write, child, encoding, namespaces, depth,
                            maxdepth)

        # Empty "unbalanced" tags (br, img, ...) get no closing tag.
        if text or node._children or tag not in _HTMLTAGS_UNBALANCED:
            write("</" + tag.encode(encoding) + ">")

    if tail:
        if cdata_needs_escaping(tail):
            write(_escape_cdata(tail))
        else:
            write(tail.encode(encoding))

def _write_html_no_encoding(write, node, namespaces):
    """Serialize ``node`` and its subtree as HTML without encoding anything.

    We have a separate function for this because encoding while recursing
    is very expensive if the result will be serialized out to utf8 anyway
    (the encoding can happen afterwards).  Output is accumulated in a
    string because that is faster than calling any 'write' or 'append'
    function for every fragment.

    write      -- callable invoked with each chunk of output.
    node       -- element to serialize; its ``tag``, ``text``, ``tail``,
                  ``attrib`` and ``_children`` are read.
    namespaces -- namespace map handed to ``fixtag`` for tags in a
                  namespace other than XHTML.
    """
    tag = node.tag
    text = node.text
    tail = node.tail

    to_write = ""

    if tag is Replace:
        # Replacement text is escaped unless it was flagged as structure.
        if not node.structure:
            if cdata_needs_escaping(text):
                text = _escape_cdata_noencoding(text)
        write(text)

    elif tag is Comment:
        if cdata_needs_escaping(text):
            text = _escape_cdata_noencoding(text)
        write('<!-- ' + text + ' -->')

    elif tag is ProcessingInstruction:
        if cdata_needs_escaping(text):
            text = _escape_cdata_noencoding(text)
        # Processing instructions are emitted as comments in HTML output.
        write('<!-- ' + text + ' -->')

    else:
        xmlns_items = []  # new namespaces in this scope
        try:
            if tag[:1] == "{":
                if tag[:_XHTML_PREFIX_LEN] == _XHTML_PREFIX:
                    # XHTML-namespaced tags are written unqualified.
                    tag = tag[_XHTML_PREFIX_LEN:]
                else:
                    tag, xmlns = fixtag(tag, namespaces)
                    if xmlns:
                        xmlns_items.append(xmlns)
        except TypeError:
            _raise_serialization_error(tag)

        to_write += "<" + tag

        attrib = node.attrib

        if attrib is not None:
            # Sort for deterministic output; nothing to sort for 0/1 keys.
            if len(attrib) > 1:
                attrib_keys = sorted(attrib)
            else:
                attrib_keys = attrib
            for k in attrib_keys:
                try:
                    if k[:1] == "{":
                        # Namespace-qualified attributes are not written.
                        continue
                except TypeError:
                    _raise_serialization_error(k)
                if k in _HTMLATTRS_BOOLEAN:
                    to_write += ' ' + k
                else:
                    v = attrib[k]
                    # NOTE(review): the value is written without escaping
                    # '"', '&' or '<' -- confirm callers pre-escape it.
                    to_write += " %s=\"%s\"" % (k, v)

        for k, v in xmlns_items:
            to_write += " %s=\"%s\"" % (k, v)

        to_write += ">"

        if text is not None and text:
            if tag in _HTMLTAGS_NOESCAPE:
                to_write += text
            elif cdata_needs_escaping(text):
                to_write += _escape_cdata_noencoding(text)
            else:
                to_write += text

        write(to_write)

        for child in node._children:
            _write_html_no_encoding(write, child, namespaces)

        # Empty "unbalanced" tags (br, img, ...) get no closing tag.
        if text or node._children or tag not in _HTMLTAGS_UNBALANCED:
            write("</" + tag + ">")

    if tail:
        if cdata_needs_escaping(tail):
            write(_escape_cdata_noencoding(tail))
        else:
            write(tail)

def _write_xml(write, node, encoding, namespaces, pipeline, xhtml=False):
    """Serialize ``node`` and its subtree as XML through ``write``.

    write      -- callable invoked with each chunk of output.
    node       -- element to serialize.
    encoding   -- codec name used to encode output; None means 'utf-8'.
    namespaces -- namespace map handed to ``fixtag``; prefixes introduced
                  while writing this element are removed again before
                  returning.
    pipeline   -- when false, the meld id attribute and any literal
                  'xmlns:meld' attribute are left out of the output.
    xhtml      -- when true, the XHTML namespace prefix is stripped from
                  the tag so it is written unqualified.
    """
    if encoding is None:
        encoding = 'utf-8'
    tag = node.tag
    if tag is Comment:
        write("<!-- %s -->" % _escape_cdata(node.text, encoding))
    elif tag is ProcessingInstruction:
        write("<?%s?>" % _escape_cdata(node.text, encoding))
    elif tag is Replace:
        if node.structure:
            # this may produce invalid xml
            write(node.text.encode(encoding))
        else:
            write(_escape_cdata(node.text, encoding))
    else:
        if xhtml:
            if tag[:_XHTML_PREFIX_LEN] == _XHTML_PREFIX:
                tag = tag[_XHTML_PREFIX_LEN:]
        if node.attrib:
            items = sorted(node.attrib.items())  # lexical order
        else:
            items = []
        xmlns_items = []  # new namespaces in this scope
        try:
            if tag[:1] == "{":
                tag, xmlns = fixtag(tag, namespaces)
                if xmlns:
                    xmlns_items.append(xmlns)
        except TypeError:
            _raise_serialization_error(tag)
        write("<" + tag.encode(encoding))
        if items or xmlns_items:
            for k, v in items:
                try:
                    if k[:1] == "{":
                        if not pipeline:
                            if k == _MELD_ID:
                                continue
                        k, xmlns = fixtag(k, namespaces)
                        if xmlns:
                            xmlns_items.append(xmlns)
                    if not pipeline:
                        # special-case for HTML input
                        if k == 'xmlns:meld':
                            continue
                except TypeError:
                    _raise_serialization_error(k)
                write(" %s=\"%s\"" % (k.encode(encoding),
                                      _escape_attrib(v, encoding)))
            for k, v in xmlns_items:
                write(" %s=\"%s\"" % (k.encode(encoding),
                                      _escape_attrib(v, encoding)))
        if node.text or node._children:
            write(">")
            if node.text:
                write(_escape_cdata(node.text, encoding))
            for n in node._children:
                _write_xml(write, n, encoding, namespaces, pipeline, xhtml)
            write("</" + tag.encode(encoding) + ">")
        else:
            write(" />")
        # Namespaces declared on this element go out of scope with it.
        for k, v in xmlns_items:
            del namespaces[v]
    if node.tail:
        write(_escape_cdata(node.tail, encoding))

# overrides to elementtree to increase speed and get entity quoting correct.

# Matches an '&' that is NOT the start of something shaped like an entity
# or character reference ('&' + [#\w]* + ';'), via a negative lookahead.
nonentity_re = re.compile(r'&(?!([#\w]*;))')


def _escape_cdata(text, encoding=None):
    """Return ``text`` escaped for use as XML/HTML character data.

    If ``encoding`` is given the text is encoded first; when that raises
    UnicodeError the result of ``_encode_entity(text)`` is returned
    instead.  Bare '&' becomes '&amp;' (existing entity references are
    left alone) and '<' becomes '&lt;'.  Input that does not behave like
    a string is reported through ``_raise_serialization_error``.
    """
    try:
        if encoding:
            try:
                text = text.encode(encoding)
            except UnicodeError:
                return _encode_entity(text)
        text = nonentity_re.sub('&amp;', text)
        text = text.replace("<", "&lt;")
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)


def _escape_attrib(text, encoding=None):
    """Return ``text`` escaped for use inside a double-quoted attribute.

    Same handling as ``_escape_cdata`` plus '"' becoming '&quot;'.
    """
    try:
        if encoding:
            try:
                text = text.encode(encoding)
            except UnicodeError:
                return _encode_entity(text)
        # don't requote properly-quoted entities
        text = nonentity_re.sub('&amp;', text)
        text = text.replace("<", "&lt;")
        text = text.replace('"', "&quot;")
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)


def _escape_cdata_noencoding(text):
    """Escape character data without encoding or error translation."""
    text = nonentity_re.sub('&amp;', text)
    text = text.replace("<", "&lt;")
    return text


def _escape_attrib_noencoding(text):
    """Escape an attribute value without encoding or error translation."""
    # don't requote properly-quoted entities
    text = nonentity_re.sub('&amp;', text)
    text = text.replace("<", "&lt;")
    text = text.replace('"', "&quot;")
    return text

# utility functions

def _write_declaration(write, encoding):
    """Emit an XML declaration through ``write``.

    The ``encoding="..."`` pseudo-attribute is included only when
    ``encoding`` is truthy.  The declaration ends with a newline.
    """
    if encoding:
        write('<?xml version="1.0" encoding="%s"?>\n' % encoding)
    else:
        write('<?xml version="1.0"?>\n')

def _write_doctype(write, doctype):
    """Emit a ``<!DOCTYPE ... PUBLIC ...>`` line through ``write``.

    ``doctype`` must unpack into exactly three values: (name, pubid,
    system).  Anything else raises ValueError.
    """
    try:
        name, pubid, system = doctype
    except (ValueError, TypeError):
        # The ``doctype`` parameter shadows the module-level ``doctype``
        # object here, so the example is spelled out literally rather
        # than looked up as an attribute of the (invalid) argument.
        example = ('html',
                   '-//W3C//DTD XHTML 1.0 Transitional//EN',
                   'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd')
        raise ValueError("doctype must be supplied as a 3-tuple in the form "
                         "(name, pubid, system) e.g. %r" % (example,))
    write('<!DOCTYPE %s PUBLIC "%s" "%s">\n' % (name, pubid, system))

# Matches an XML declaration such as <?xml version="1.0"?> (non-greedy).
xml_decl_re = re.compile(r'<\?xml .*?\?>')
# Matches the opening of a start tag: '<' followed by word characters,
# where the character right after '<' is not '/', '?' or '!' (so end
# tags, processing instructions and declarations are skipped).
begin_tag_re = re.compile(r'<[^/?!]?\w+')
# A doctype declaration is rendered as
#     '<!DOCTYPE %s PUBLIC "%s" "%s">' % (name, pubid, system)
# (this used to be a bare expression statement whose result was discarded).

def insert_doctype(data, doctype=doctype.xhtml):
    """Return ``data`` with a doctype declaration inserted.

    ``doctype`` is a (name, pubid, system) 3-tuple.  The declaration is
    placed immediately after the XML declaration when ``data`` contains
    one, and at the very start otherwise.  No check is made here for a
    doctype already being present; callers such as ``prefeed`` do that.
    """
    dt_string = '<!DOCTYPE %s PUBLIC "%s" "%s">' % doctype
    match = xml_decl_re.search(data)
    if match is None:
        return dt_string + data
    end = match.end(0)
    return data[:end] + dt_string + data[end:]

def insert_meld_ns_decl(data):
    """Return ``data`` with an ``xmlns:meld`` declaration added.

    The declaration is appended right after the name of the first start
    tag found by ``begin_tag_re``.  When no start tag is found ``data``
    is returned unchanged.
    """
    match = begin_tag_re.search(data)
    if match is None:
        return data
    end = match.end(0)
    declaration = ' xmlns:meld="%s"' % _MELD_NS_URL
    return data[:end] + declaration + data[end:]

def prefeed(data, doctype=doctype.xhtml):
    """Return ``data`` with a doctype and meld namespace declaration.

    A doctype is inserted only when the text '<!DOCTYPE' does not occur
    in ``data``, and the namespace declaration only when 'xmlns:meld'
    does not occur.  Both checks are plain substring tests.
    """
    if '<!DOCTYPE' not in data:
        data = insert_doctype(data, doctype)
    if 'xmlns:meld' not in data:
        data = insert_meld_ns_decl(data)
    return data

def sharedlineage(srcelement, tgtelement):
    """Return True if both elements have ancestors with matching tags.

    The two ``parent`` chains are walked upward in lockstep, comparing
    the ``tag`` of each pair of ancestors (a missing parent counts as tag
    None).  The result is True only if every pair matches and both chains
    end at the same step.  The tags of the two elements themselves are
    not compared.
    """
    while True:
        srcparent = srcelement.parent
        tgtparent = tgtelement.parent
        srcparenttag = getattr(srcparent, 'tag', None)
        tgtparenttag = getattr(tgtparent, 'tag', None)
        if srcparenttag != tgtparenttag:
            return False
        if tgtparenttag is None and srcparenttag is None:
            # Both chains ran out together.
            return True
        if not (tgtparent and srcparent):
            return False
        # Step one level up on both sides.
        srcelement, tgtelement = srcparent, tgtparent

def diffreduce(elements):
    """Drop elements whose direct parent has already been kept.

    Each element in ``elements`` should have a non-None meld id, and the
    sequence should be preordered in depth-first traversal order.  An
    element is kept when it has no parent or when its parent is not among
    the elements kept so far.  Returns a new list in input order.
    """
    reduced = []
    for element in elements:
        parent = element.parent
        if parent is None or parent not in reduced:
            reduced.append(element)
    return reduced

def intersection(S1, S2):
    """Return the items of ``S1`` that are also in ``S2``, as a list.

    Order and duplicates follow ``S1``; membership is tested with ``in``.
    """
    return [element for element in S1 if element in S2]

def melditerator(element, meldid=None, _MELD_ID=_MELD_ID):
    """Yield ``element`` and its descendants that carry a meld id.

    Elements are produced in depth-first preorder.  With ``meldid`` set,
    only elements whose id equals it are yielded.  ``_MELD_ID`` is the
    attribute key looked up in each element's ``attrib``; it is forwarded
    on recursion so an explicitly supplied key applies to the whole
    subtree.
    """
    nodeid = element.attrib.get(_MELD_ID)
    if nodeid is not None:
        if meldid is None or nodeid == meldid:
            yield element
    for child in element._children:
        # The recursive call already applies the id filter.
        for match in melditerator(child, meldid, _MELD_ID):
            yield match

def search(name):
    """Resolve a dotted Python path such as 'pkg.mod.attr' to an object.

    The first component is imported; each further component is looked up
    as an attribute of the object found so far, and imported as a
    submodule when that attribute is missing.

    Raises ValueError if ``name`` contains no dot.  Import errors from
    ``__import__`` propagate.
    """
    if "." not in name:
        raise ValueError("unloadable datatype name: " + repr(name))
    components = name.split('.')
    start = components[0]
    g = globals()
    package = __import__(start, g, g)
    modulenames = [start]
    for component in components[1:]:
        modulenames.append(component)
        try:
            package = getattr(package, component)
        except AttributeError:
            n = '.'.join(modulenames)
            # A non-empty fromlist makes __import__ return the innermost
            # module rather than the top-level package.
            package = __import__(n, g, g, [component])
    return package

def sample_mutator(root):
    """Example mutator: fill a repeated table row with the numbers 0-19.

    Looks up the element with meld id 'tr' under ``root``, repeats it
    once per value, and sets the content of each copy's 'td1' and 'td2'
    elements to the string form of the number.
    """
    values = [(str(thing), str(thing)) for thing in range(0, 20)]

    ob = root.findmeld('tr')
    for tr, (name, desc) in ob.repeat(values):
        tr.findmeld('td1').content(name)
        tr.findmeld('td2').content(desc)

if __name__ == '__main__':
    # call interactively by invoking meld3.py with a filename and
    # a dotted-python-path name to a mutator function that accepts a single
    # argument (the root), e.g.:
    #
    # python meld3.py sample.html meld3.sample_mutator
    #
    # the rendering will be sent to stdout
    import sys
    filename = sys.argv[1]
    try:
        mutator = sys.argv[2]
    except IndexError:
        mutator = None
    # try/finally rather than ``with`` so the input file is always closed
    # without requiring the with-statement.
    source = open(filename, 'r')
    try:
        root = parse_html(source)
    finally:
        source.close()
    out = StringIO()
    if mutator:
        mutator = search(mutator)
        mutator(root)
    root.write_html(out)
    sys.stdout.write(out.getvalue())