6
from StringIO import StringIO
9
from elementtree.ElementTree import TreeBuilder
10
from elementtree.ElementTree import XMLTreeBuilder
11
from elementtree.ElementTree import Comment
12
from elementtree.ElementTree import ProcessingInstruction
13
from elementtree.ElementTree import QName
14
from elementtree.ElementTree import _raise_serialization_error
15
from elementtree.ElementTree import _namespace_map
16
from elementtree.ElementTree import fixtag
17
from elementtree.ElementTree import parse as et_parse
18
from elementtree.ElementTree import ElementPath
20
from xml.etree.ElementTree import TreeBuilder
21
from xml.etree.ElementTree import XMLTreeBuilder
22
from xml.etree.ElementTree import Comment
23
from xml.etree.ElementTree import ProcessingInstruction
24
from xml.etree.ElementTree import QName
25
from xml.etree.ElementTree import _raise_serialization_error
26
from xml.etree.ElementTree import _namespace_map
27
from xml.etree.ElementTree import fixtag
28
from xml.etree.ElementTree import parse as et_parse
29
from xml.etree.ElementTree import ElementPath
31
# HTMLTreeBuilder does not exist in python 2.5 standard elementtree
32
from HTMLParser import HTMLParser
33
AUTOCLOSE = "p", "li", "tr", "th", "td", "head", "body"
34
IGNOREEND = "img", "hr", "meta", "link", "br"
35
is_not_ascii = re.compile(eval(r'u"[\u0080-\uffff]"')).search
37
# replace element factory
38
def Replace(text, structure=False):
39
element = _MeldElementInterface(Replace, {})
41
element.structure = structure
48
def write(self, data):
58
def findmeld(self, node, name, default=None):
59
iterator = self.getiterator(node)
60
for element in iterator:
61
val = element.attrib.get(_MELD_ID)
66
def clone(self, node, parent=None):
67
# NOTE: this is not implemented by the C version (it used to be
68
# but I don't want to maintain it)
69
element = _MeldElementInterface(node.tag, node.attrib.copy())
70
element.text = node.text
71
element.tail = node.tail
72
element.structure = node.structure
73
if parent is not None:
74
# avoid calling self.append to reduce function call overhead
75
parent._children.append(element)
76
element.parent = parent
77
for child in node._children:
78
self.clone(child, element)
81
def _bfclone(self, nodes, parent):
84
element = _MeldElementInterface(node.tag, node.attrib.copy())
85
element.parent = parent
86
element.text = node.text
87
element.tail = node.tail
88
element.structure = node.structure
90
self._bfclone(node._children, element)
94
def bfclone(self, node, parent=None):
95
element = _MeldElementInterface(node.tag, node.attrib.copy())
96
element.text = node.text
97
element.tail = node.tail
98
element.structure = node.structure
99
element.parent = parent
100
if parent is not None:
101
parent._children.append(element)
103
self._bfclone(node._children, element)
106
def getiterator(self, node, tag=None):
110
if tag is None or node.tag == tag:
112
for element in node._children:
113
nodes.extend(self.getiterator(element, tag))
116
def content(self, node, text, structure=False):
118
replacenode = Replace(text, structure)
119
replacenode.parent = node
120
replacenode.text = text
121
replacenode.structure = structure
122
node._children = [replacenode]
124
pyhelper = PyHelper()
127
import cmeld3 as chelper
131
if chelper and not os.getenv('MELD3_PYIMPL'):
136
_MELD_NS_URL = 'http://www.plope.com/software/meld3'
137
_MELD_PREFIX = '{%s}' % _MELD_NS_URL
139
_MELD_ID = '%s%s' % (_MELD_PREFIX, _MELD_LOCAL)
140
_MELD_SHORT_ID = 'meld:%s' % _MELD_LOCAL
141
_XHTML_NS_URL = 'http://www.w3.org/1999/xhtml'
142
_XHTML_PREFIX = '{%s}' % _XHTML_NS_URL
143
_XHTML_PREFIX_LEN = len(_XHTML_PREFIX)
149
# lookup table for ease of use in external code
150
html_strict = ('HTML', '-//W3C//DTD HTML 4.01//EN',
151
'http://www.w3.org/TR/html4/strict.dtd')
152
html = ('HTML', '-//W3C//DTD HTML 4.01 Transitional//EN',
153
'http://www.w3.org/TR/html4/loose.dtd')
154
xhtml_strict = ('html', '-//W3C//DTD XHTML 1.0 Strict//EN',
155
'http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd')
156
xhtml = ('html', '-//W3C//DTD XHTML 1.0 Transitional//EN',
157
'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd')
159
class _MeldElementInterface:
165
Replace = [Replace] # this is used by C code
167
# overrides to reduce MRO (method resolution order) lookups
168
def __init__(self, tag, attrib):
174
return "<MeldElement %s at %x>" % (self.tag, id(self))
177
return len(self._children)
179
def __getitem__(self, index):
    """Return the entry of self._children found at 'index'.

    The lookup is delegated to the underlying list, so whatever that
    list raises for a bad index propagates to the caller.
    """
    return self._children[index]
182
def __getslice__(self, start, stop):
    """Return the slice self._children[start:stop].

    The result is produced by slicing the underlying list, so it is a
    new list and not the child list itself.
    """
    return self._children[start:stop]
185
def getchildren(self):
    """Return self._children.

    The list object itself is returned (no copy is made), so changes a
    caller makes to it are changes to this node's child list.
    """
    return self._children
188
def find(self, path):
    """Return the result of ElementPath.find(self, path).

    'path' is handed to ElementPath unchanged; this method adds no
    processing of its own.
    """
    return ElementPath.find(self, path)
191
def findtext(self, path, default=None):
    """Return the result of ElementPath.findtext(self, path, default).

    'path' and 'default' (None unless given) are handed to ElementPath
    unchanged.
    """
    return ElementPath.findtext(self, path, default)
194
def findall(self, path):
    """Return the result of ElementPath.findall(self, path).

    'path' is handed to ElementPath unchanged.
    """
    return ElementPath.findall(self, path)
200
self.text = self.tail = None
202
def get(self, key, default=None):
    """Return the value stored under 'key' in self.attrib.

    'default' (None unless given) is returned when 'key' is absent.
    """
    return self.attrib.get(key, default)
205
def set(self, key, value):
    """Store 'value' under 'key' in self.attrib.

    Any value previously stored under 'key' is replaced. Returns None.
    """
    self.attrib[key] = value
209
return self.attrib.keys()
212
return self.attrib.items()
214
def getiterator(self, *ignored_args, **ignored_kw):
    """Return helper.getiterator(self).

    Positional and keyword arguments are accepted so that callers may
    pass them, but every one of them is discarded.
    """
    # we ignore any tag= passed in to us, because it's too painful
    # to support in our C version
    return helper.getiterator(self)
219
# overrides to support parent pointers and factories
221
def __setitem__(self, index, element):
    """Store 'element' at position 'index' and make self its parent.

    Only the incoming element's 'parent' attribute is written; the
    object previously held at 'index' is not modified.
    """
    self._children[index] = element
    element.parent = self
225
def __setslice__(self, start, stop, elements):
    """Replace self._children[start:stop] with 'elements'.

    Every incoming element gets its 'parent' attribute set to self.
    'elements' may be any iterable, including a one-shot iterator.
    """
    # Materialize once up front: iterating 'elements' to set the parents
    # and then calling list() on it again would yield nothing the second
    # time for a generator, silently assigning an empty slice.
    elements = list(elements)
    for element in elements:
        element.parent = self
    self._children[start:stop] = elements
230
def append(self, element):
    """Add 'element' to the end of self._children and make self its parent."""
    self._children.append(element)
    element.parent = self
234
def insert(self, index, element):
    """Insert 'element' into self._children before position 'index'.

    The element's 'parent' attribute is set to self.
    """
    self._children.insert(index, element)
    element.parent = self
238
def __delitem__(self, index):
239
ob = self._children[index]
241
del self._children[index]
243
def __delslice__(self, start, stop):
244
obs = self._children[start:stop]
247
del self._children[start:stop]
249
def remove(self, element):
    """Remove 'element' from self._children and clear its parent.

    The removal is done with list.remove, so a ValueError is raised
    (and 'element.parent' is left untouched) when the element is not
    in the child list.
    """
    self._children.remove(element)
    element.parent = None
253
def makeelement(self, tag, attrib):
    """Return a new instance of self's own class built from (tag, attrib).

    Using self.__class__ means a subclass instance produces elements
    of that subclass.
    """
    return self.__class__(tag, attrib)
258
def __mod__(self, other):
    """ Fill in the text values of meld nodes in tree; only
    support dictionarylike operand (sequence operand doesn't seem
    to make sense here).

    'other' is expanded into keyword arguments for self.fillmelds and
    that call's result is returned. """
    return self.fillmelds(**other)
264
def fillmelds(self, **kw):
265
""" Fill in the text values of meld nodes in tree using the
266
keyword arguments passed in; use the keyword keys as meld ids
267
and the keyword values as text that should fill in the node
268
text on which that meld id is found. Return a list of keys
269
from **kw that were not able to be found anywhere in the tree.
270
Never raises an exception. """
273
node = self.findmeld(k)
280
def fillmeldhtmlform(self, **kw):
281
""" Perform magic to 'fill in' HTML form element values from a
282
dictionary. Unlike 'fillmelds', the type of element being
283
'filled' is taken into consideration.
285
Perform a 'findmeld' on each key in the dictionary and use the
286
value that corresponds to the key to perform mutation of the
287
tree, changing data in what is presumed to be one or more HTML
288
form elements according to the following rules::
290
If the found element is an 'input group' (its meld id ends
291
with the string ':inputgroup'), set the 'checked' attribute
292
on the appropriate subelement which has a 'value' attribute
293
which matches the dictionary value. Also remove the
294
'checked' attribute from every other 'input' subelement of
295
the input group. If no input subelement's value matches the
296
dictionary value, this key is treated as 'unfilled'.
298
If the found element is an 'input type=text', 'input
299
type=hidden', 'input type=submit', 'input type=password',
300
'input type=reset' or 'input type=file' element, replace its
301
'value' attribute with the value.
303
If the found element is an 'input type=checkbox' or 'input
304
type='radio' element, set its 'checked' attribute to true if
305
the dict value is true, or remove its 'checked' attribute if
306
the dict value is false.
308
If the found element is a 'select' element and the value
309
exists in the 'value=' attribute of one of its 'option'
310
subelements, change that option's 'selected' attribute to
311
true and mark all other option elements as unselected. If
312
the select element does not contain an option with a value
313
that matches the dictionary value, do nothing and return
314
this key as unfilled.
316
If the found element is a 'textarea' or any other kind of
317
element, replace its text with the value.
319
If the element corresponding to the key is not found,
320
do nothing and treat the key as 'unfilled'.
322
Return a list of 'unfilled' keys, representing meld ids
323
present in the dictionary but not present in the element tree
324
or meld ids which could not be filled due to the lack of any
325
matching subelements for 'select' nodes or 'inputgroup' nodes.
331
node = self.findmeld(k)
339
if k.endswith(':inputgroup'):
340
# an input group is a list of input type="checkbox" or
341
# input type="radio" elements that can be treated as a group
342
# because they attempt to specify the same value
348
for child in node.findall('input'):
349
input_type = child.attrib.get('type', '').lower()
350
if input_type not in ('checkbox', 'radio'):
353
input_val = child.attrib.get('value', '')
358
unfound.append(child)
365
option.attrib['checked'] = 'checked'
366
for option in unfound:
368
del option.attrib['checked']
373
tag = node.tag.lower()
377
input_type = node.attrib.get('type', 'text').lower()
379
# fill in value attrib for most input types
380
if input_type in ('hidden', 'submit', 'text',
381
'password', 'reset', 'file'):
382
node.attrib['value'] = val
384
# unless it's a checkbox or radio attribute, then we
385
# fill in its checked attribute
386
elif input_type in ('checkbox', 'radio'):
388
node.attrib['checked'] = 'checked'
391
del node.attrib['checked']
398
elif tag == 'select':
399
# if the node is a select node, we want to select
400
# the value matching val, otherwise it's unfilled
405
for option in node.findall('option'):
406
if option.attrib.get('value', '') == val:
409
unfound.append(option)
414
option.attrib['selected'] = 'selected'
415
for option in unfound:
417
del option.attrib['selected']
425
def findmeld(self, name, default=None):
426
""" Find a node in the tree that has a 'meld id' corresponding
427
to 'name'. Iterate over all subnodes recursively looking for a
428
node which matches. If we can't find the node, return None."""
429
# this could be faster if we indexed all the meld nodes in the
430
# tree; we just walk the whole hierarchy now.
431
result = helper.findmeld(self, name)
437
""" Find all nodes that have a meld id attribute and return
438
the found nodes in a list"""
439
return self.findwithattrib(_MELD_ID)
441
def findwithattrib(self, attrib, value=None):
442
""" Find all nodes that have an attribute named 'attrib'. If
443
'value' is not None, omit nodes on which the attribute value
444
does not compare equally to 'value'. Return the found nodes in
446
iterator = helper.getiterator(self)
448
for element in iterator:
449
attribval = element.attrib.get(attrib)
450
if attribval is not None:
452
elements.append(element)
454
if value == attribval:
455
elements.append(element)
459
def repeat(self, iterable, childname=None):
460
"""repeats an element with values from an iterable. If
461
'childname' is None, repeat the element on which the
462
repeat is called, otherwise find the child element with a
463
'meld:id' matching 'childname' and repeat that. The element
464
is repeated within its parent element (nodes that are created
465
as a result of a repeat share the same parent). This method
466
returns an iterable; the value of each iteration is a
467
two-sequence in the form (newelement, data). 'newelement' is
468
a clone of the template element (including clones of its
469
children) which has already been seated in its parent element
470
in the template. 'data' is a value from the passed in
471
iterable. Changing 'newelement' (typically based on values
472
from 'data') mutates the element 'in place'."""
474
element = self.findmeld(childname)
478
parent = element.parent
479
# creating a list is faster than yielding a generator (py 2.4)
482
for thing in iterable:
486
clone = helper.bfclone(element, parent)
487
L.append((clone, thing))
491
def replace(self, text, structure=False):
492
""" Replace this element with a Replace node in our parent with
493
the text 'text' and return the index of our position in
494
our parent. If we have no parent, do nothing, and return None.
495
Pass the 'structure' flag to the replace node so it can do the right
496
thing at render time. """
500
# reduce function call overhead by not calling self.insert
501
node = Replace(text, structure)
502
parent._children.insert(i, node)
506
def content(self, text, structure=False):
507
""" Delete this node's children and append a Replace node that
508
contains text. Always return None. Pass the 'structure' flag
509
to the replace node so it can do the right thing at render
511
helper.content(self, text, structure)
513
def attributes(self, **kw):
    """ Set attributes on this node.

    Each keyword name/value pair is stored in self.attrib. Both the
    name and the value must be string types; a ValueError is raised at
    the first pair that is not, and pairs handled before it stay set. """
    # types.StringTypes exists only on Python 2; fall back to str so the
    # check keeps working where that name is absent.
    string_types = getattr(types, 'StringTypes', (str,))
    for k, v in kw.items():
        # prevent this from getting to the parser if possible
        if not isinstance(k, string_types):
            # the 1-tuple keeps '%' from unpacking a tuple operand
            raise ValueError('do not set non-stringtype as key: %s' % (k,))
        if not isinstance(v, string_types):
            raise ValueError('do not set non-stringtype as val: %s' % (v,))
        self.attrib[k] = v
524
def write_xmlstring(self, encoding=None, doctype=None, fragment=False,
525
declaration=True, pipeline=False):
530
_write_declaration(write, encoding)
532
_write_doctype(write, doctype)
533
_write_xml(write, self, encoding, {}, pipeline)
536
def write_xml(self, file, encoding=None, doctype=None,
537
fragment=False, declaration=True, pipeline=False):
538
""" Write XML to 'file' (which can be a filename or filelike object)
540
encoding - encoding string (if None, 'utf-8' encoding is assumed)
541
Must be a recognizable Python encoding type.
542
doctype - 3-tuple indicating name, pubid, system of doctype.
543
The default is to prevent a doctype from being emitted.
544
fragment - True if a 'fragment' should be emitted for this node (no
545
declaration, no doctype). This causes both the
546
'declaration' and 'doctype' parameters to become ignored
548
declaration - emit an xml declaration header (including an encoding
549
if it's not None). The default is to emit the
551
pipeline - preserve 'meld' namespace identifiers in output
552
for use in pipelining
554
if not hasattr(file, "write"):
555
file = open(file, "wb")
556
data = self.write_xmlstring(encoding, doctype, fragment, declaration,
560
def write_htmlstring(self, encoding=None, doctype=doctype.html,
566
if encoding in ('utf8', 'utf-8', 'latin-1', 'latin1',
568
# optimize for common dumb-American case (only encode once at
572
_write_doctype(write, doctype)
573
_write_html_no_encoding(write, self, {})
574
joined = ''.join(data)
579
_write_doctype(write, doctype)
580
_write_html(write, self, encoding, {})
581
joined = ''.join(data)
584
def write_html(self, file, encoding=None, doctype=doctype.html,
586
""" Write HTML to 'file' (which can be a filename or filelike object)
588
encoding - encoding string (if None, 'utf-8' encoding is assumed).
589
Unlike XML output, this is not used in a declaration,
590
but it is used to do actual character encoding during
591
output. Must be a recognizable Python encoding type.
592
doctype - 3-tuple indicating name, pubid, system of doctype.
593
The default is the value of doctype.html (HTML 4.0
595
fragment - True if a "fragment" should be emitted (no doctype).
596
This overrides any provided "doctype" parameter if
599
Namespace'd elements and attributes have their namespaces removed
600
during output when writing HTML, so pipelining cannot be performed.
602
HTML is not valid XML, so an XML declaration header is never emitted.
604
if not hasattr(file, "write"):
605
file = open(file, "wb")
606
page = self.write_htmlstring(encoding, doctype, fragment)
609
def write_xhtmlstring(self, encoding=None, doctype=doctype.xhtml,
610
fragment=False, declaration=False, pipeline=False):
615
_write_declaration(write, encoding)
617
_write_doctype(write, doctype)
618
_write_xml(write, self, encoding, {}, pipeline, xhtml=True)
621
def write_xhtml(self, file, encoding=None, doctype=doctype.xhtml,
622
fragment=False, declaration=False, pipeline=False):
623
""" Write XHTML to 'file' (which can be a filename or filelike object)
625
encoding - encoding string (if None, 'utf-8' encoding is assumed)
626
Must be a recognizable Python encoding type.
627
doctype - 3-tuple indicating name, pubid, system of doctype.
628
The default is the value of doctype.xhtml (XHTML
630
fragment - True if a 'fragment' should be emitted for this node (no
631
declaration, no doctype). This causes both the
632
'declaration' and 'doctype' parameters to be ignored.
633
declaration - emit an xml declaration header (including an encoding
634
string if 'encoding' is not None)
635
pipeline - preserve 'meld' namespace identifiers in output
636
for use in pipelining
638
# use a list as a collector, and only call the write method of
639
# the file once we've collected all output (reduce function call
643
if not hasattr(file, "write"):
644
file = open(file, "wb")
645
page = self.write_xhtmlstring(encoding, doctype, fragment, declaration,
649
def clone(self, parent=None):
    """ Create a clone of an element. If parent is not None,
    append the element to the parent. Recurse as necessary to create
    a deep clone of the element.

    The work is delegated to helper.bfclone(self, parent), whose
    return value is returned unchanged. """
    return helper.bfclone(self, parent)
656
""" Remove ourselves from our parent node (de-parent) and return
657
the index of the parent which was deleted. """
658
i = self.parentindex()
663
def parentindex(self):
664
""" Return the parent node index in which we live """
666
if parent is not None:
667
return parent._children.index(self)
669
def shortrepr(self, encoding=None):
671
_write_html(data.append, self, encoding, {}, maxdepth=2)
674
def diffmeld(self, other):
675
""" Compute the meld element differences from this node (the
676
source) to 'other' (the target). Return a dictionary of
677
sequences in the form {'unreduced:
678
{'added':[], 'removed':[], 'moved':[]},
680
{'added':[], 'removed':[], 'moved':[]},}
682
srcelements = self.findmelds()
683
tgtelements = other.findmelds()
684
srcids = [ x.meldid() for x in srcelements ]
685
tgtids = [ x.meldid() for x in tgtelements ]
688
for srcelement in srcelements:
689
if srcelement.meldid() not in tgtids:
690
removed.append(srcelement)
693
for tgtelement in tgtelements:
694
if tgtelement.meldid() not in srcids:
695
added.append(tgtelement)
698
for srcelement in srcelements:
699
srcid = srcelement.meldid()
701
i = tgtids.index(srcid)
702
tgtelement = tgtelements[i]
703
if not sharedlineage(srcelement, tgtelement):
704
moved.append(tgtelement)
706
unreduced = {'added':added, 'removed':removed, 'moved':moved}
708
moved_reduced = diffreduce(moved)
709
added_reduced = diffreduce(added)
710
removed_reduced = diffreduce(removed)
712
reduced = {'moved':moved_reduced, 'added':added_reduced,
713
'removed':removed_reduced}
715
return {'unreduced':unreduced,
719
return self.attrib.get(_MELD_ID)
724
while parent is not None:
726
parent = parent.parent
730
def MeldTreeBuilder():
    """Return a TreeBuilder that uses _MeldElementInterface as its
    element factory."""
    return TreeBuilder(element_factory=_MeldElementInterface)
733
class MeldParser(XMLTreeBuilder):
735
""" A parser based on Fredrik's PIParser at
736
http://effbot.org/zone/element-pi.htm. It blithely ignores the
737
case of a comment existing outside the root element and ignores
738
processing instructions entirely. We need to validate that there
739
are no repeated meld id's in the source as well """
741
def __init__(self, html=0, target=None):
742
XMLTreeBuilder.__init__(self, html, target)
743
# assumes ElementTree 1.2.X
744
self._parser.CommentHandler = self.handle_comment
747
def handle_comment(self, data):
    """Forward a comment to self._target as a Comment element.

    Issues start(Comment, {}), data(data) and end(Comment) on the
    target, in that order.
    """
    self._target.start(Comment, {})
    self._target.data(data)
    self._target.end(Comment)
752
def _start(self, tag, attrib_in):
753
# this is used by self._parser (an Expat parser) as
754
# StartElementHandler but only if _start_list is not
755
# provided... so why does this method exist?
756
for key in attrib_in:
757
if '{' + key == _MELD_ID:
758
meldid = attrib_in[key]
759
if self.meldids.get(meldid):
760
raise ValueError, ('Repeated meld id "%s" in source' %
762
self.meldids[meldid] = 1
763
return XMLTreeBuilder._start(self, tag, attrib_in)
765
def _start_list(self, tag, attrib_in):
766
# This is used by self._parser (an Expat parser)
767
# as StartElementHandler. attrib_in is a flat even-length
768
# sequence of name, value pairs for all attributes.
769
# See http://python.org/doc/lib/xmlparser-objects.html
770
for i in range(0, len(attrib_in), 2):
771
# For some reason, clark names are missing the leading '{'
772
attrib = self._fixname(attrib_in[i])
773
if _MELD_ID == attrib:
774
meldid = attrib_in[i+1]
775
if self.meldids.get(meldid):
776
raise ValueError, ('Repeated meld id "%s" in source' %
778
self.meldids[meldid] = 1
779
return XMLTreeBuilder._start_list(self, tag, attrib_in)
782
val = XMLTreeBuilder.close(self)
786
class HTMLMeldParser(HTMLParser):
787
""" A mostly-cut-and-paste of ElementTree's HTMLTreeBuilder that
788
does special meld3 things (like preserve comments and munge meld
789
ids). Subclassing is not possible due to private attributes. :-("""
791
def __init__(self, builder=None, encoding=None):
794
builder = MeldTreeBuilder()
795
self.builder = builder
796
self.encoding = encoding or "iso-8859-1"
797
HTMLParser.__init__(self)
801
HTMLParser.close(self)
803
return self.builder.close()
805
def handle_starttag(self, tag, attrs):
807
# look for encoding directives
808
http_equiv = content = None
810
if k == "http-equiv":
811
http_equiv = v.lower()
814
if http_equiv == "content-type" and content:
815
# use mimetools to parse the http header
816
header = mimetools.Message(
817
StringIO("%s: %s\n\n" % (http_equiv, content))
819
encoding = header.getparam("charset")
821
self.encoding = encoding
823
if self.__stack and self.__stack[-1] == tag:
824
self.handle_endtag(tag)
825
self.__stack.append(tag)
829
if k == _MELD_SHORT_ID:
831
if self.meldids.get(v):
832
raise ValueError, ('Repeated meld id "%s" in source' %
838
self.builder.start(tag, attrib)
841
self.builder.end(tag)
843
def handle_endtag(self, tag):
846
lasttag = self.__stack.pop()
847
if tag != lasttag and lasttag in AUTOCLOSE:
848
self.handle_endtag(lasttag)
849
self.builder.end(tag)
851
def handle_charref(self, char):
853
char = int(char[1:], 16)
857
self.builder.data(chr(char))
859
self.builder.data(unichr(char))
861
def handle_entityref(self, name):
862
entity = htmlentitydefs.entitydefs.get(name)
867
entity = int(entity[2:-1])
868
if 0 <= entity < 128:
869
self.builder.data(chr(entity))
871
self.builder.data(unichr(entity))
873
self.unknown_entityref(name)
875
def handle_data(self, data):
    """Pass character data on to self.builder.

    When 'data' is a plain string for which is_not_ascii finds a match,
    it is first decoded to unicode using self.encoding. The "ignore"
    error handler is used, so bytes that cannot be decoded are dropped
    rather than raising.
    """
    if isinstance(data, type('')) and is_not_ascii(data):
        # convert to unicode, but only if necessary
        data = unicode(data, self.encoding, "ignore")
    self.builder.data(data)
881
def unknown_entityref(self, name):
    """Hook that receives an entity name; does nothing and returns None.

    Subclasses may override it to handle such references.
    """
    pass  # ignore by default; override if necessary
884
def handle_comment(self, data):
    """Forward a comment to self.builder as a Comment element.

    Issues start(Comment, {}), data(data) and end(Comment) on the
    builder, in that order.
    """
    self.builder.start(Comment, {})
    self.builder.data(data)
    self.builder.end(Comment)
889
def do_parse(source, parser):
890
root = et_parse(source, parser=parser).getroot()
891
iterator = root.getiterator()
897
def parse_xml(source):
    """ Parse source (a filelike object) into an element tree using
    the XML parser.

    A MeldParser is created with a MeldTreeBuilder as its target and
    both are handed to do_parse, whose result is returned. This
    function takes no 'html' switch; parse_html is the HTML entry
    point. """
    builder = MeldTreeBuilder()
    parser = MeldParser(target=builder)
    return do_parse(source, parser)
905
def parse_html(source, encoding=None):
    """Parse 'source' with the HTML parser and return do_parse's result.

    An HTMLMeldParser is created around a MeldTreeBuilder; 'encoding'
    (None unless given) is passed to that parser unchanged.
    """
    builder = MeldTreeBuilder()
    parser = HTMLMeldParser(builder, encoding)
    return do_parse(source, parser)
910
def parse_xmlstring(text):
    """Parse XML held in the string 'text'.

    The string is wrapped in a StringIO and handed to parse_xml, whose
    result is returned.
    """
    source = StringIO(text)
    return parse_xml(source)
914
def parse_htmlstring(text, encoding=None):
    """Parse HTML held in the string 'text'.

    The string is wrapped in a StringIO and handed, together with
    'encoding', to parse_html, whose result is returned.
    """
    source = StringIO(text)
    return parse_html(source, encoding)
918
attrib_needs_escaping = re.compile(r'[&"<]').search
919
cdata_needs_escaping = re.compile(r'[&<]').search
921
def _both_case(mapping):
922
# Add equivalent upper-case keys to mapping.
923
lc_keys = mapping.keys()
925
mapping[k.upper()] = mapping[k]
928
_HTMLTAGS_UNBALANCED = {'area':1, 'base':1, 'basefont':1, 'br':1, 'col':1,
929
'frame':1, 'hr':1, 'img':1, 'input':1, 'isindex':1,
930
'link':1, 'meta':1, 'param':1}
931
_both_case(_HTMLTAGS_UNBALANCED)
933
_HTMLTAGS_NOESCAPE = {'script':1, 'style':1}
934
_both_case(_HTMLTAGS_NOESCAPE)
936
_HTMLATTRS_BOOLEAN = {'selected':1, 'checked':1, 'compact':1, 'declare':1,
937
'defer':1, 'disabled':1, 'ismap':1, 'multiple':1,
938
'nohref':1, 'noresize':1, 'noshade':1, 'nowrap':1}
939
_both_case(_HTMLATTRS_BOOLEAN)
941
def _write_html(write, node, encoding, namespaces, depth=-1, maxdepth=None):
942
" Write HTML to file """
954
if not node.structure:
955
if cdata_needs_escaping(text):
956
text = _escape_cdata(text)
957
write(text.encode(encoding))
960
if cdata_needs_escaping(text):
961
text = _escape_cdata(text)
962
write('<!-- ' + text + ' -->'.encode(encoding))
964
elif tag is ProcessingInstruction:
965
if cdata_needs_escaping(text):
966
text = _escape_cdata(text)
967
write('<!-- ' + text + ' -->'.encode(encoding))
970
xmlns_items = [] # new namespaces in this scope
973
if tag[:_XHTML_PREFIX_LEN] == _XHTML_PREFIX:
974
tag = tag[_XHTML_PREFIX_LEN:]
976
tag, xmlns = fixtag(tag, namespaces)
978
xmlns_items.append(xmlns)
980
_raise_serialization_error(tag)
982
to_write += "<%s" % tag.encode(encoding)
986
if attrib is not None:
988
attrib_keys = attrib.keys()
992
for k in attrib_keys:
997
_raise_serialization_error(k)
998
if k in _HTMLATTRS_BOOLEAN:
999
to_write += ' ' + k.encode(encoding)
1002
to_write += " %s=\"%s\"" % (k, v)
1004
for k, v in xmlns_items:
1005
to_write += " %s=\"%s\"" % (k, v)
1009
if text is not None and text:
1010
if tag in _HTMLTAGS_NOESCAPE:
1011
to_write += text.encode(encoding)
1012
elif cdata_needs_escaping(text):
1013
to_write += _escape_cdata(text)
1015
to_write += text.encode(encoding)
1019
for child in node._children:
1020
if maxdepth is not None:
1022
if depth < maxdepth:
1023
_write_html(write, child, encoding, namespaces, depth,
1025
elif depth == maxdepth and text:
1029
_write_html(write, child, encoding, namespaces, depth, maxdepth)
1031
if text or node._children or tag not in _HTMLTAGS_UNBALANCED:
1032
write("</" + tag.encode(encoding) + ">")
1035
if cdata_needs_escaping(tail):
1036
write(_escape_cdata(tail))
1038
write(tail.encode(encoding))
1040
def _write_html_no_encoding(write, node, namespaces):
1041
""" Append HTML to string without any particular unicode encoding.
1042
We have a separate function for this due to the fact that encoding
1043
while recursing is very expensive if this will get serialized out to
1044
utf8 anyway (the encoding can happen afterwards). We append to a string
1045
because it's faster than calling any 'write' or 'append' function."""
1055
if not node.structure:
1056
if cdata_needs_escaping(text):
1057
text = _escape_cdata_noencoding(text)
1060
elif tag is Comment:
1061
if cdata_needs_escaping(text):
1062
text = _escape_cdata_noencoding(text)
1063
write('<!-- ' + text + ' -->')
1065
elif tag is ProcessingInstruction:
1066
if cdata_needs_escaping(text):
1067
text = _escape_cdata_noencoding(text)
1068
write('<!-- ' + text + ' -->')
1071
xmlns_items = [] # new namespaces in this scope
1074
if tag[:_XHTML_PREFIX_LEN] == _XHTML_PREFIX:
1075
tag = tag[_XHTML_PREFIX_LEN:]
1077
tag, xmlns = fixtag(tag, namespaces)
1079
xmlns_items.append(xmlns)
1081
_raise_serialization_error(tag)
1083
to_write += "<" + tag
1085
attrib = node.attrib
1087
if attrib is not None:
1089
attrib_keys = attrib.keys()
1093
attrib_keys = attrib
1094
for k in attrib_keys:
1099
_raise_serialization_error(k)
1100
if k in _HTMLATTRS_BOOLEAN:
1104
to_write += " %s=\"%s\"" % (k, v)
1106
for k, v in xmlns_items:
1107
to_write += " %s=\"%s\"" % (k, v)
1111
if text is not None and text:
1112
if tag in _HTMLTAGS_NOESCAPE:
1114
elif cdata_needs_escaping(text):
1115
to_write += _escape_cdata_noencoding(text)
1121
for child in node._children:
1122
_write_html_no_encoding(write, child, namespaces)
1124
if text or node._children or tag not in _HTMLTAGS_UNBALANCED:
1125
write("</" + tag + ">")
1128
if cdata_needs_escaping(tail):
1129
write(_escape_cdata_noencoding(tail))
1133
def _write_xml(write, node, encoding, namespaces, pipeline, xhtml=False):
1134
""" Write XML to a file """
1135
if encoding is None:
1139
write("<!-- %s -->" % _escape_cdata(node.text, encoding))
1140
elif tag is ProcessingInstruction:
1141
write("<?%s?>" % _escape_cdata(node.text, encoding))
1142
elif tag is Replace:
1144
# this may produce invalid xml
1145
write(node.text.encode(encoding))
1147
write(_escape_cdata(node.text, encoding))
1150
if tag[:_XHTML_PREFIX_LEN] == _XHTML_PREFIX:
1151
tag = tag[_XHTML_PREFIX_LEN:]
1153
items = node.attrib.items()
1155
items = [] # must always be sortable.
1156
xmlns_items = [] # new namespaces in this scope
1159
tag, xmlns = fixtag(tag, namespaces)
1161
xmlns_items.append(xmlns)
1163
_raise_serialization_error(tag)
1164
write("<" + tag.encode(encoding))
1165
if items or xmlns_items:
1166
items.sort() # lexical order
1173
k, xmlns = fixtag(k, namespaces)
1174
if xmlns: xmlns_items.append(xmlns)
1176
# special-case for HTML input
1177
if k == 'xmlns:meld':
1180
_raise_serialization_error(k)
1181
write(" %s=\"%s\"" % (k.encode(encoding),
1182
_escape_attrib(v, encoding)))
1183
for k, v in xmlns_items:
1184
write(" %s=\"%s\"" % (k.encode(encoding),
1185
_escape_attrib(v, encoding)))
1186
if node.text or node._children:
1189
write(_escape_cdata(node.text, encoding))
1190
for n in node._children:
1191
_write_xml(write, n, encoding, namespaces, pipeline, xhtml)
1192
write("</" + tag.encode(encoding) + ">")
1195
for k, v in xmlns_items:
1198
write(_escape_cdata(node.tail, encoding))
1200
# overrides to elementtree to increase speed and get entity quoting correct.
1202
nonentity_re = re.compile('&(?!([#\w]*;))') # negative lookahead assertion
1204
def _escape_cdata(text, encoding=None):
1205
# escape character data
1209
text = text.encode(encoding)
1210
except UnicodeError:
1211
return _encode_entity(text)
1212
text = nonentity_re.sub('&', text)
1213
text = text.replace("<", "<")
1215
except (TypeError, AttributeError):
1216
_raise_serialization_error(text)
1218
def _escape_attrib(text, encoding=None):
1219
# escape attribute value
1223
text = text.encode(encoding)
1224
except UnicodeError:
1225
return _encode_entity(text)
1226
# don't requote properly-quoted entities
1227
text = nonentity_re.sub('&', text)
1228
text = text.replace("<", "<")
1229
text = text.replace('"', """)
1231
except (TypeError, AttributeError):
1232
_raise_serialization_error(text)
1234
def _escape_cdata_noencoding(text):
1235
# escape character data
1236
text = nonentity_re.sub('&', text)
1237
text = text.replace("<", "<")
1240
def _escape_attrib_noencoding(text):
1241
# don't requote properly-quoted entities
1242
text = nonentity_re.sub('&', text)
1243
text = text.replace("<", "<")
1244
text = text.replace('"', """)
1249
def _write_declaration(write, encoding):
1251
write('<?xml version="1.0"?>\n')
1253
write('<?xml version="1.0" encoding="%s"?>\n' % encoding)
1255
def _write_doctype(write, doctype):
1257
name, pubid, system = doctype
1258
except (ValueError, TypeError):
1259
raise ValueError, ("doctype must be supplied as a 3-tuple in the form "
1260
"(name, pubid, system) e.g. '%s'" % doctype.xhtml)
1261
write('<!DOCTYPE %s PUBLIC "%s" "%s">\n' % (name, pubid, system))
1263
xml_decl_re = re.compile(r'<\?xml .*?\?>')
1264
begin_tag_re = re.compile(r'<[^/?!]?\w+')
1265
'<!DOCTYPE %s PUBLIC "%s" "%s">' % doctype.html
1267
def insert_doctype(data, doctype=doctype.xhtml):
1268
# jam an html doctype declaration into 'data' if it
1269
# doesn't already contain a doctype declaration
1270
match = xml_decl_re.search(data)
1271
dt_string = '<!DOCTYPE %s PUBLIC "%s" "%s">' % doctype
1272
if match is not None:
1273
start, end = match.span(0)
1274
before = data[:start]
1275
tag = data[start:end]
1277
return before + tag + dt_string + after
1279
return dt_string + data
1281
def insert_meld_ns_decl(data):
1282
match = begin_tag_re.search(data)
1283
if match is not None:
1284
start, end = match.span(0)
1285
before = data[:start]
1286
tag = data[start:end] + ' xmlns:meld="%s"' % _MELD_NS_URL
1288
data = before + tag + after
1291
def prefeed(data, doctype=doctype.xhtml):
1292
if data.find('<!DOCTYPE') == -1:
1293
data = insert_doctype(data, doctype)
1294
if data.find('xmlns:meld') == -1:
1295
data = insert_meld_ns_decl(data)
1298
def sharedlineage(srcelement, tgtelement):
1299
srcparent = srcelement.parent
1300
tgtparent = tgtelement.parent
1301
srcparenttag = getattr(srcparent, 'tag', None)
1302
tgtparenttag = getattr(tgtparent, 'tag', None)
1303
if srcparenttag != tgtparenttag:
1305
elif tgtparenttag is None and srcparenttag is None:
1307
elif tgtparent and srcparent:
1308
return sharedlineage(srcparent, tgtparent)
1311
def diffreduce(elements):
1312
# each element in 'elements' should all have non-None meldids, and should
1313
# be preordered in depth-first traversal order
1315
for element in elements:
1316
parent = element.parent
1318
reduced.append(element)
1320
if parent in reduced:
1322
reduced.append(element)
1325
def intersection(S1, S2):
1332
def melditerator(element, meldid=None, _MELD_ID=_MELD_ID):
1333
nodeid = element.attrib.get(_MELD_ID)
1334
if nodeid is not None:
1335
if meldid is None or nodeid == meldid:
1337
for child in element._children:
1338
for el2 in melditerator(child, meldid):
1339
nodeid = el2.attrib.get(_MELD_ID)
1340
if nodeid is not None:
1341
if meldid is None or nodeid == meldid:
1346
raise ValueError("unloadable datatype name: " + `name`)
1347
components = name.split('.')
1348
start = components[0]
1350
package = __import__(start, g, g)
1351
modulenames = [start]
1352
for component in components[1:]:
1353
modulenames.append(component)
1355
package = getattr(package, component)
1356
except AttributeError:
1357
n = '.'.join(modulenames)
1358
package = __import__(n, g, g, component)
1361
def sample_mutator(root):
1363
for thing in range(0, 20):
1364
values.append((str(thing), str(thing)))
1366
ob = root.findmeld('tr')
1367
for tr, (name, desc) in ob.repeat(values):
1368
tr.findmeld('td1').content(name)
1369
tr.findmeld('td2').content(desc)
1373
if __name__ == '__main__':
1374
# call interactively by invoking meld3.py with a filename and
1375
# a dotted-python-path name to a mutator function that accepts a single
1376
# argument (the root), e.g.:
1378
# python meld3.py sample.html meld3.sample_mutator
1380
# the rendering will be sent to stdout
1382
filename = sys.argv[1]
1384
mutator = sys.argv[2]
1388
root = parse_html(open(filename, 'r'))
1391
mutator = search(mutator)
1394
sys.stdout.write(io.getvalue())