1
from __future__ import absolute_import, division, unicode_literals
2
from pip.vendor.six import with_metaclass
6
from . import inputstream
7
from . import tokenizer
9
from . import treebuilders
10
from .treebuilders._base import Marker
13
from . import constants
14
from .constants import spaceCharacters, asciiUpper2Lower
15
from .constants import specialElements
16
from .constants import headingElements
17
from .constants import cdataElements, rcdataElements
18
from .constants import tokenTypes, ReparseException, namespaces
19
from .constants import htmlIntegrationPointElements, mathmlTextIntegrationPointElements
22
def parse(doc, treebuilder="etree", encoding=None,
          namespaceHTMLElements=True):
    """Parse a string or file-like object into a tree.

    doc - the document to parse; handed unchanged to HTMLParser.parse
    treebuilder - name passed to treebuilders.getTreeBuilder to select the
        tree implementation
    encoding - forwarded to HTMLParser.parse
    namespaceHTMLElements - forwarded to the HTMLParser constructor

    Returns whatever HTMLParser.parse returns for the document.
    """
    builder_class = treebuilders.getTreeBuilder(treebuilder)
    parser = HTMLParser(builder_class,
                        namespaceHTMLElements=namespaceHTMLElements)
    document = parser.parse(doc, encoding=encoding)
    return document
30
def parseFragment(doc, container="div", treebuilder="etree", encoding=None,
                  namespaceHTMLElements=True):
    """Parse a string or file-like object into a tree fragment.

    doc - the markup to parse; handed unchanged to HTMLParser.parseFragment
    container - name of the context element, forwarded to
        HTMLParser.parseFragment
    treebuilder - name passed to treebuilders.getTreeBuilder to select the
        tree implementation
    encoding - forwarded to HTMLParser.parseFragment
    namespaceHTMLElements - forwarded to the HTMLParser constructor

    Returns whatever HTMLParser.parseFragment returns.
    """
    builder_class = treebuilders.getTreeBuilder(treebuilder)
    parser = HTMLParser(builder_class,
                        namespaceHTMLElements=namespaceHTMLElements)
    fragment = parser.parseFragment(doc, container=container,
                                    encoding=encoding)
    return fragment
37
def method_decorator_metaclass(function):
    """Build a metaclass that applies ``function`` to every plain function
    defined in a class body.

    function - a callable that takes a function object and returns the
        object that should replace it in the class namespace.

    Returns the metaclass.  Attributes that are not plain functions
    (constants, nested classes, descriptors, ...) are left untouched.
    """
    # Imported locally so the block does not depend on a module-level
    # ``import types`` being present.
    import types

    class Decorated(type):
        def __new__(meta, classname, bases, classDict):
            # Iterate over a snapshot; the namespace is updated in place.
            for attributeName, attribute in list(classDict.items()):
                if isinstance(attribute, types.FunctionType):
                    classDict[attributeName] = function(attribute)
            return type.__new__(meta, classname, bases, classDict)

    # Without this return the factory yields None and cannot be used as a
    # metaclass by its callers.
    return Decorated
49
class HTMLParser(object):
50
"""HTML parser. Generates a tree structure from a stream of (possibly
53
def __init__(self, tree=None, tokenizer=tokenizer.HTMLTokenizer,
54
strict=False, namespaceHTMLElements=True, debug=False):
56
strict - raise an exception when a parse error is encountered
58
tree - a treebuilder class controlling the type of tree that will be
59
returned. Built in treebuilders can be accessed through
60
html5lib.treebuilders.getTreeBuilder(treeType)
62
tokenizer - a class that provides a stream of tokens to the treebuilder.
63
This may be replaced for e.g. a sanitizer which converts some tags to
67
# Raise an exception on the first error encountered
71
tree = treebuilders.getTreeBuilder("etree")
72
self.tree = tree(namespaceHTMLElements)
73
self.tokenizer_class = tokenizer
76
self.phases = dict([(name, cls(self, self.tree)) for name, cls in
77
getPhases(debug).items()])
79
def _parse(self, stream, innerHTML=False, container="div",
80
encoding=None, parseMeta=True, useChardet=True, **kwargs):
82
self.innerHTMLMode = innerHTML
83
self.container = container
84
self.tokenizer = self.tokenizer_class(stream, encoding=encoding,
86
useChardet=useChardet,
87
parser=self, **kwargs)
94
except ReparseException:
99
self.firstStartTag = False
101
self.log = [] # only used with debug mode
102
# "quirks" / "limited quirks" / "no quirks"
103
self.compatMode = "no quirks"
105
if self.innerHTMLMode:
106
self.innerHTML = self.container.lower()
108
if self.innerHTML in cdataElements:
109
self.tokenizer.state = self.tokenizer.rcdataState
110
elif self.innerHTML in rcdataElements:
111
self.tokenizer.state = self.tokenizer.rawtextState
112
elif self.innerHTML == 'plaintext':
113
self.tokenizer.state = self.tokenizer.plaintextState
115
# state already is data state
116
# self.tokenizer.state = self.tokenizer.dataState
118
self.phase = self.phases["beforeHtml"]
119
self.phase.insertHtmlElement()
120
self.resetInsertionMode()
122
self.innerHTML = False
123
self.phase = self.phases["initial"]
125
self.lastPhase = None
127
self.beforeRCDataPhase = None
129
self.framesetOK = True
131
def isHTMLIntegrationPoint(self, element):
132
if (element.name == "annotation-xml" and
133
element.namespace == namespaces["mathml"]):
134
return ("encoding" in element.attributes and
135
element.attributes["encoding"].translate(
137
("text/html", "application/xhtml+xml"))
139
return (element.namespace, element.name) in htmlIntegrationPointElements
141
def isMathMLTextIntegrationPoint(self, element):
    """Return True when the element's (namespace, name) pair is listed in
    mathmlTextIntegrationPointElements."""
    qualified_name = (element.namespace, element.name)
    return qualified_name in mathmlTextIntegrationPointElements
145
CharactersToken = tokenTypes["Characters"]
146
SpaceCharactersToken = tokenTypes["SpaceCharacters"]
147
StartTagToken = tokenTypes["StartTag"]
148
EndTagToken = tokenTypes["EndTag"]
149
CommentToken = tokenTypes["Comment"]
150
DoctypeToken = tokenTypes["Doctype"]
151
ParseErrorToken = tokenTypes["ParseError"]
153
for token in self.normalizedTokens():
155
while new_token is not None:
156
currentNode = self.tree.openElements[-1] if self.tree.openElements else None
157
currentNodeNamespace = currentNode.namespace if currentNode else None
158
currentNodeName = currentNode.name if currentNode else None
160
type = new_token["type"]
162
if type == ParseErrorToken:
163
self.parseError(new_token["data"], new_token.get("datavars", {}))
166
if (len(self.tree.openElements) == 0 or
167
currentNodeNamespace == self.tree.defaultNamespace or
168
(self.isMathMLTextIntegrationPoint(currentNode) and
169
((type == StartTagToken and
170
token["name"] not in frozenset(["mglyph", "malignmark"])) or
171
type in (CharactersToken, SpaceCharactersToken))) or
172
(currentNodeNamespace == namespaces["mathml"] and
173
currentNodeName == "annotation-xml" and
174
token["name"] == "svg") or
175
(self.isHTMLIntegrationPoint(currentNode) and
176
type in (StartTagToken, CharactersToken, SpaceCharactersToken))):
179
phase = self.phases["inForeignContent"]
181
if type == CharactersToken:
182
new_token = phase.processCharacters(new_token)
183
elif type == SpaceCharactersToken:
184
new_token = phase.processSpaceCharacters(new_token)
185
elif type == StartTagToken:
186
new_token = phase.processStartTag(new_token)
187
elif type == EndTagToken:
188
new_token = phase.processEndTag(new_token)
189
elif type == CommentToken:
190
new_token = phase.processComment(new_token)
191
elif type == DoctypeToken:
192
new_token = phase.processDoctype(new_token)
194
if (type == StartTagToken and token["selfClosing"]
195
and not token["selfClosingAcknowledged"]):
196
self.parseError("non-void-element-with-trailing-solidus",
197
{"name": token["name"]})
199
# When the loop finishes it's EOF
203
phases.append(self.phase)
204
reprocess = self.phase.processEOF()
206
assert self.phase not in phases
208
def normalizedTokens(self):
    """Generator: yield every token produced by self.tokenizer after it has
    been passed through self.normalizeToken."""
    for raw_token in self.tokenizer:
        normalized = self.normalizeToken(raw_token)
        yield normalized
212
def parse(self, stream, encoding=None, parseMeta=True, useChardet=True):
    """Parse a HTML document into a well-formed tree

    stream - a filelike object or string containing the HTML to be parsed

    The optional encoding parameter must be a string that indicates
    the encoding.  If specified, that encoding will be used,
    regardless of any BOM or later declaration (such as in a meta
    element)

    parseMeta and useChardet are forwarded unchanged to self._parse.

    Returns the result of self.tree.getDocument().
    """
    parse_options = {
        "innerHTML": False,
        "encoding": encoding,
        "parseMeta": parseMeta,
        "useChardet": useChardet,
    }
    self._parse(stream, **parse_options)
    return self.tree.getDocument()
226
def parseFragment(self, stream, container="div", encoding=None,
                  parseMeta=False, useChardet=True):
    """Parse a HTML fragment into a well-formed tree fragment

    container - name of the element we're setting the innerHTML property
        on (defaults to 'div')

    stream - a filelike object or string containing the HTML to be parsed

    The optional encoding parameter must be a string that indicates
    the encoding.  If specified, that encoding will be used,
    regardless of any BOM or later declaration (such as in a meta
    element)

    parseMeta, useChardet - forwarded to self._parse.  They used to be
        accepted here but silently discarded, so _parse always ran with
        its own defaults regardless of what the caller asked for.

    Returns the result of self.tree.getFragment().
    """
    self._parse(stream, True, container=container, encoding=encoding,
                parseMeta=parseMeta, useChardet=useChardet)
    return self.tree.getFragment()
243
def parseError(self, errorcode="XXX-undefined-error", datavars=None):
    """Record a parse error.

    errorcode - identifier of the error
    datavars - optional mapping with details about the error; a fresh
        empty dict is used when omitted

    Appends a (stream position, errorcode, datavars) tuple to self.errors,
    where the position comes from self.tokenizer.stream.position().
    """
    # XXX The idea is to make errorcode mandatory.
    if datavars is None:
        # A new dict per call: a shared ``{}`` default would be aliased by
        # every recorded error, so mutating one entry would change them all.
        datavars = {}
    self.errors.append((self.tokenizer.stream.position(), errorcode, datavars))
249
def normalizeToken(self, token):
    """ HTML5 specific normalizations to the token stream

    For start-tag tokens the attribute data is converted to a dict.
    Returns the (mutated) token so callers such as normalizedTokens can
    pass it on.
    """
    if token["type"] == tokenTypes["StartTag"]:
        # The data is sliced, so it is expected to be a sequence of
        # (name, value) pairs here.  Reversing before dict() makes the
        # FIRST occurrence of a duplicated attribute name win.
        token["data"] = dict(token["data"][::-1])

    # normalizedTokens yields this return value, so the token itself must
    # be returned rather than None.
    return token
257
def adjustMathMLAttributes(self, token):
    """Restore the mixed-case spelling of MathML attribute names.

    token - a tag token whose "data" entry is a dict of attributes; it is
        modified in place.

    The lower-case key is replaced by its correctly cased form (moved, not
    duplicated), matching what adjustSVGAttributes does for SVG names.
    """
    replacements = {"definitionurl": "definitionURL"}
    attributes = token["data"]
    for lowercase_name, cased_name in replacements.items():
        if lowercase_name in attributes:
            attributes[cased_name] = attributes.pop(lowercase_name)
264
def adjustSVGAttributes(self, token):
    """Restore the mixed-case spelling of SVG attribute names.

    token - a tag token whose "data" entry is a dict of attributes; it is
        modified in place.

    Every lower-case attribute name listed below is replaced by its
    camel-cased SVG form; other attributes are left untouched.
    """
    replacements = {
        "attributename": "attributeName",
        "attributetype": "attributeType",
        "basefrequency": "baseFrequency",
        "baseprofile": "baseProfile",
        "calcmode": "calcMode",
        "clippathunits": "clipPathUnits",
        "contentscripttype": "contentScriptType",
        "contentstyletype": "contentStyleType",
        "diffuseconstant": "diffuseConstant",
        "edgemode": "edgeMode",
        "externalresourcesrequired": "externalResourcesRequired",
        "filterres": "filterRes",
        "filterunits": "filterUnits",
        "glyphref": "glyphRef",
        "gradienttransform": "gradientTransform",
        "gradientunits": "gradientUnits",
        "kernelmatrix": "kernelMatrix",
        "kernelunitlength": "kernelUnitLength",
        "keypoints": "keyPoints",
        "keysplines": "keySplines",
        "keytimes": "keyTimes",
        "lengthadjust": "lengthAdjust",
        "limitingconeangle": "limitingConeAngle",
        "markerheight": "markerHeight",
        "markerunits": "markerUnits",
        "markerwidth": "markerWidth",
        "maskcontentunits": "maskContentUnits",
        "maskunits": "maskUnits",
        "numoctaves": "numOctaves",
        "pathlength": "pathLength",
        "patterncontentunits": "patternContentUnits",
        "patterntransform": "patternTransform",
        "patternunits": "patternUnits",
        "pointsatx": "pointsAtX",
        "pointsaty": "pointsAtY",
        "pointsatz": "pointsAtZ",
        "preservealpha": "preserveAlpha",
        "preserveaspectratio": "preserveAspectRatio",
        "primitiveunits": "primitiveUnits",
        # refX / refY are part of the HTML specification's SVG attribute
        # adjustment table as well.
        "refx": "refX",
        "refy": "refY",
        "repeatcount": "repeatCount",
        "repeatdur": "repeatDur",
        "requiredextensions": "requiredExtensions",
        "requiredfeatures": "requiredFeatures",
        "specularconstant": "specularConstant",
        "specularexponent": "specularExponent",
        "spreadmethod": "spreadMethod",
        "startoffset": "startOffset",
        "stddeviation": "stdDeviation",
        "stitchtiles": "stitchTiles",
        "surfacescale": "surfaceScale",
        "systemlanguage": "systemLanguage",
        "tablevalues": "tableValues",
        "targetx": "targetX",
        "targety": "targetY",
        "textlength": "textLength",
        "viewbox": "viewBox",
        "viewtarget": "viewTarget",
        "xchannelselector": "xChannelSelector",
        "ychannelselector": "yChannelSelector",
        "zoomandpan": "zoomAndPan",
    }

    attributes = token["data"]
    # Iterate over a snapshot of the keys: entries are inserted and removed
    # inside the loop.
    for originalName in list(attributes):
        if originalName in replacements:
            svgName = replacements[originalName]
            attributes[svgName] = attributes.pop(originalName)
335
def adjustForeignAttributes(self, token):
    """Rewrite namespaced attribute names on a foreign-content tag token.

    token - a tag token whose "data" entry is a dict of attributes; it is
        modified in place.

    Each attribute whose name appears in the table below is re-keyed from
    its string name to a (prefix, local name, namespace URI) tuple; other
    attributes are left untouched.
    """
    replacements = {
        "xlink:actuate": ("xlink", "actuate", namespaces["xlink"]),
        "xlink:arcrole": ("xlink", "arcrole", namespaces["xlink"]),
        "xlink:href": ("xlink", "href", namespaces["xlink"]),
        "xlink:role": ("xlink", "role", namespaces["xlink"]),
        "xlink:show": ("xlink", "show", namespaces["xlink"]),
        "xlink:title": ("xlink", "title", namespaces["xlink"]),
        "xlink:type": ("xlink", "type", namespaces["xlink"]),
        "xml:base": ("xml", "base", namespaces["xml"]),
        "xml:lang": ("xml", "lang", namespaces["xml"]),
        "xml:space": ("xml", "space", namespaces["xml"]),
        "xmlns": (None, "xmlns", namespaces["xmlns"]),
        "xmlns:xlink": ("xmlns", "xlink", namespaces["xmlns"]),
    }

    attributes = token["data"]
    # Snapshot the keys first.  The loop adds and deletes entries, and
    # mutating a dict while iterating its live key view can raise
    # RuntimeError or skip entries.
    for originalName in list(attributes):
        if originalName in replacements:
            foreignName = replacements[originalName]
            attributes[foreignName] = attributes.pop(originalName)
357
def reparseTokenNormal(self, token):
360
def resetInsertionMode(self):
361
# The name of this method is mostly historical. (It's also used in the
365
"select": "inSelect",
369
"tbody": "inTableBody",
370
"thead": "inTableBody",
371
"tfoot": "inTableBody",
372
"caption": "inCaption",
373
"colgroup": "inColumnGroup",
377
"frameset": "inFrameset",
380
for node in self.tree.openElements[::-1]:
383
if node == self.tree.openElements[0]:
384
assert self.innerHTML
386
nodeName = self.innerHTML
387
# Check for conditions that should only happen in the innerHTML
389
if nodeName in ("select", "colgroup", "head", "html"):
390
assert self.innerHTML
392
if not last and node.namespace != self.tree.defaultNamespace:
395
if nodeName in newModes:
396
new_phase = self.phases[newModes[nodeName]]
399
new_phase = self.phases["inBody"]
402
self.phase = new_phase
404
def parseRCDataRawtext(self, token, contentType):
405
"""Generic RCDATA/RAWTEXT Parsing algorithm
406
contentType - RCDATA or RAWTEXT
408
assert contentType in ("RAWTEXT", "RCDATA")
410
self.tree.insertElement(token)
412
if contentType == "RAWTEXT":
413
self.tokenizer.state = self.tokenizer.rawtextState
415
self.tokenizer.state = self.tokenizer.rcdataState
417
self.originalPhase = self.phase
419
self.phase = self.phases["text"]
422
def getPhases(debug):
424
"""Logger that records which phase processes each token"""
425
type_names = dict((value, key) for key, value in
426
constants.tokenTypes.items())
428
def wrapped(self, *args, **kwargs):
429
if function.__name__.startswith("process") and len(args) > 0:
432
info = {"type": type_names[token['type']]}
435
if token['type'] in constants.tagTokenTypes:
436
info["name"] = token['name']
438
self.parser.log.append((self.parser.tokenizer.state.__name__,
439
self.parser.phase.__class__.__name__,
440
self.__class__.__name__,
443
return function(self, *args, **kwargs)
445
return function(self, *args, **kwargs)
448
def getMetaclass(use_metaclass, metaclass_func):
450
return method_decorator_metaclass(metaclass_func)
454
class Phase(with_metaclass(getMetaclass(debug, log))):
455
"""Base class for helper object that implements each phase of processing
458
def __init__(self, parser, tree):
462
def processEOF(self):
463
raise NotImplementedError
465
def processComment(self, token):
466
# For most phases the following is correct. Where it's not it will be
468
self.tree.insertComment(token, self.tree.openElements[-1])
470
def processDoctype(self, token):
471
self.parser.parseError("unexpected-doctype")
473
def processCharacters(self, token):
474
self.tree.insertText(token["data"])
476
def processSpaceCharacters(self, token):
477
self.tree.insertText(token["data"])
479
def processStartTag(self, token):
480
return self.startTagHandler[token["name"]](token)
482
def startTagHtml(self, token):
    """Handle an <html> start tag seen after the root element exists.

    Reports "non-html-root" when parser.firstStartTag is falsy and the
    token is named "html", copies every attribute the root element does
    not already have from the token onto self.tree.openElements[0], and
    finally clears parser.firstStartTag.
    """
    if not self.parser.firstStartTag and token["name"] == "html":
        self.parser.parseError("non-html-root")
    # XXX Need a check here to see if the first start tag token emitted is
    # this token... If it's not, invoke self.parser.parseError().
    for attr_name, attr_value in token["data"].items():
        root_attributes = self.tree.openElements[0].attributes
        if attr_name in root_attributes:
            # Attributes already present on the root win.
            continue
        root_attributes[attr_name] = attr_value
    self.parser.firstStartTag = False
492
def processEndTag(self, token):
493
return self.endTagHandler[token["name"]](token)
495
class InitialPhase(Phase):
496
def processSpaceCharacters(self, token):
499
def processComment(self, token):
500
self.tree.insertComment(token, self.tree.document)
502
def processDoctype(self, token):
504
publicId = token["publicId"]
505
systemId = token["systemId"]
506
correct = token["correct"]
508
if (name != "html" or publicId is not None or
509
systemId is not None and systemId != "about:legacy-compat"):
510
self.parser.parseError("unknown-doctype")
515
self.tree.insertDoctype(token)
518
publicId = publicId.translate(asciiUpper2Lower)
520
if (not correct or token["name"] != "html"
521
or publicId.startswith(
522
("+//silmaril//dtd html pro v0r11 19970101//",
523
"-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
524
"-//as//dtd html 3.0 aswedit + extensions//",
525
"-//ietf//dtd html 2.0 level 1//",
526
"-//ietf//dtd html 2.0 level 2//",
527
"-//ietf//dtd html 2.0 strict level 1//",
528
"-//ietf//dtd html 2.0 strict level 2//",
529
"-//ietf//dtd html 2.0 strict//",
530
"-//ietf//dtd html 2.0//",
531
"-//ietf//dtd html 2.1e//",
532
"-//ietf//dtd html 3.0//",
533
"-//ietf//dtd html 3.2 final//",
534
"-//ietf//dtd html 3.2//",
535
"-//ietf//dtd html 3//",
536
"-//ietf//dtd html level 0//",
537
"-//ietf//dtd html level 1//",
538
"-//ietf//dtd html level 2//",
539
"-//ietf//dtd html level 3//",
540
"-//ietf//dtd html strict level 0//",
541
"-//ietf//dtd html strict level 1//",
542
"-//ietf//dtd html strict level 2//",
543
"-//ietf//dtd html strict level 3//",
544
"-//ietf//dtd html strict//",
545
"-//ietf//dtd html//",
546
"-//metrius//dtd metrius presentational//",
547
"-//microsoft//dtd internet explorer 2.0 html strict//",
548
"-//microsoft//dtd internet explorer 2.0 html//",
549
"-//microsoft//dtd internet explorer 2.0 tables//",
550
"-//microsoft//dtd internet explorer 3.0 html strict//",
551
"-//microsoft//dtd internet explorer 3.0 html//",
552
"-//microsoft//dtd internet explorer 3.0 tables//",
553
"-//netscape comm. corp.//dtd html//",
554
"-//netscape comm. corp.//dtd strict html//",
555
"-//o'reilly and associates//dtd html 2.0//",
556
"-//o'reilly and associates//dtd html extended 1.0//",
557
"-//o'reilly and associates//dtd html extended relaxed 1.0//",
558
"-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//",
559
"-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//",
560
"-//spyglass//dtd html 2.0 extended//",
561
"-//sq//dtd html 2.0 hotmetal + extensions//",
562
"-//sun microsystems corp.//dtd hotjava html//",
563
"-//sun microsystems corp.//dtd hotjava strict html//",
564
"-//w3c//dtd html 3 1995-03-24//",
565
"-//w3c//dtd html 3.2 draft//",
566
"-//w3c//dtd html 3.2 final//",
567
"-//w3c//dtd html 3.2//",
568
"-//w3c//dtd html 3.2s draft//",
569
"-//w3c//dtd html 4.0 frameset//",
570
"-//w3c//dtd html 4.0 transitional//",
571
"-//w3c//dtd html experimental 19960712//",
572
"-//w3c//dtd html experimental 970421//",
573
"-//w3c//dtd w3 html//",
574
"-//w3o//dtd w3 html 3.0//",
575
"-//webtechs//dtd mozilla html 2.0//",
576
"-//webtechs//dtd mozilla html//"))
578
("-//w3o//dtd w3 html strict 3.0//en//",
579
"-/w3c/dtd html 4.0 transitional/en",
581
or publicId.startswith(
582
("-//w3c//dtd html 4.01 frameset//",
583
"-//w3c//dtd html 4.01 transitional//")) and
585
or systemId and systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"):
586
self.parser.compatMode = "quirks"
587
elif (publicId.startswith(
588
("-//w3c//dtd xhtml 1.0 frameset//",
589
"-//w3c//dtd xhtml 1.0 transitional//"))
590
or publicId.startswith(
591
("-//w3c//dtd html 4.01 frameset//",
592
"-//w3c//dtd html 4.01 transitional//")) and
593
systemId is not None):
594
self.parser.compatMode = "limited quirks"
596
self.parser.phase = self.parser.phases["beforeHtml"]
598
def anythingElse(self):
599
self.parser.compatMode = "quirks"
600
self.parser.phase = self.parser.phases["beforeHtml"]
602
def processCharacters(self, token):
603
self.parser.parseError("expected-doctype-but-got-chars")
607
def processStartTag(self, token):
608
self.parser.parseError("expected-doctype-but-got-start-tag",
609
{"name": token["name"]})
613
def processEndTag(self, token):
614
self.parser.parseError("expected-doctype-but-got-end-tag",
615
{"name": token["name"]})
619
def processEOF(self):
620
self.parser.parseError("expected-doctype-but-got-eof")
624
class BeforeHtmlPhase(Phase):
626
def insertHtmlElement(self):
627
self.tree.insertRoot(impliedTagToken("html", "StartTag"))
628
self.parser.phase = self.parser.phases["beforeHead"]
631
def processEOF(self):
632
self.insertHtmlElement()
635
def processComment(self, token):
636
self.tree.insertComment(token, self.tree.document)
638
def processSpaceCharacters(self, token):
641
def processCharacters(self, token):
642
self.insertHtmlElement()
645
def processStartTag(self, token):
646
if token["name"] == "html":
647
self.parser.firstStartTag = True
648
self.insertHtmlElement()
651
def processEndTag(self, token):
652
if token["name"] not in ("head", "body", "html", "br"):
653
self.parser.parseError("unexpected-end-tag-before-html",
654
{"name": token["name"]})
656
self.insertHtmlElement()
659
class BeforeHeadPhase(Phase):
660
def __init__(self, parser, tree):
661
Phase.__init__(self, parser, tree)
663
self.startTagHandler = utils.MethodDispatcher([
664
("html", self.startTagHtml),
665
("head", self.startTagHead)
667
self.startTagHandler.default = self.startTagOther
669
self.endTagHandler = utils.MethodDispatcher([
670
(("head", "body", "html", "br"), self.endTagImplyHead)
672
self.endTagHandler.default = self.endTagOther
674
def processEOF(self):
675
self.startTagHead(impliedTagToken("head", "StartTag"))
678
def processSpaceCharacters(self, token):
681
def processCharacters(self, token):
682
self.startTagHead(impliedTagToken("head", "StartTag"))
685
def startTagHtml(self, token):
686
return self.parser.phases["inBody"].processStartTag(token)
688
def startTagHead(self, token):
689
self.tree.insertElement(token)
690
self.tree.headPointer = self.tree.openElements[-1]
691
self.parser.phase = self.parser.phases["inHead"]
693
def startTagOther(self, token):
694
self.startTagHead(impliedTagToken("head", "StartTag"))
697
def endTagImplyHead(self, token):
698
self.startTagHead(impliedTagToken("head", "StartTag"))
701
def endTagOther(self, token):
702
self.parser.parseError("end-tag-after-implied-root",
703
{"name": token["name"]})
705
class InHeadPhase(Phase):
706
def __init__(self, parser, tree):
707
Phase.__init__(self, parser, tree)
709
self.startTagHandler = utils.MethodDispatcher([
710
("html", self.startTagHtml),
711
("title", self.startTagTitle),
712
(("noscript", "noframes", "style"), self.startTagNoScriptNoFramesStyle),
713
("script", self.startTagScript),
714
(("base", "basefont", "bgsound", "command", "link"),
715
self.startTagBaseLinkCommand),
716
("meta", self.startTagMeta),
717
("head", self.startTagHead)
719
self.startTagHandler.default = self.startTagOther
721
self. endTagHandler = utils.MethodDispatcher([
722
("head", self.endTagHead),
723
(("br", "html", "body"), self.endTagHtmlBodyBr)
725
self.endTagHandler.default = self.endTagOther
728
def processEOF(self):
732
def processCharacters(self, token):
736
def startTagHtml(self, token):
737
return self.parser.phases["inBody"].processStartTag(token)
739
def startTagHead(self, token):
740
self.parser.parseError("two-heads-are-not-better-than-one")
742
def startTagBaseLinkCommand(self, token):
743
self.tree.insertElement(token)
744
self.tree.openElements.pop()
745
token["selfClosingAcknowledged"] = True
747
def startTagMeta(self, token):
    """Insert a <meta> element and, if the stream encoding is still
    tentative, switch to the encoding the element declares.

    The element is inserted and immediately popped from the open-element
    stack, and the token's self-closing flag is acknowledged.  A "charset"
    attribute takes precedence; otherwise a "content" attribute is parsed
    when "http-equiv" equals "content-type" (case-insensitively).
    """
    self.tree.insertElement(token)
    self.tree.openElements.pop()
    token["selfClosingAcknowledged"] = True

    attributes = token["data"]
    stream = self.parser.tokenizer.stream
    if stream.charEncoding[1] != "tentative":
        # The encoding is already settled; nothing to re-detect.
        return

    if "charset" in attributes:
        stream.changeEncoding(attributes["charset"])
        return

    declares_content_type = (
        "content" in attributes and
        "http-equiv" in attributes and
        attributes["http-equiv"].lower() == "content-type")
    if declares_content_type:
        # Encoding it as UTF-8 here is a hack, as really we should pass
        # the abstract Unicode string, and just use the
        # ContentAttrParser on that, but using UTF-8 allows all chars
        # to be encoded and as a ASCII-superset works.
        content_bytes = attributes["content"].encode("utf-8")
        data = inputstream.EncodingBytes(content_bytes)
        codec = inputstream.ContentAttrParser(data).parse()
        self.parser.tokenizer.stream.changeEncoding(codec)
768
def startTagTitle(self, token):
769
self.parser.parseRCDataRawtext(token, "RCDATA")
771
def startTagNoScriptNoFramesStyle(self, token):
772
# Need to decide whether to implement the scripting-disabled case
773
self.parser.parseRCDataRawtext(token, "RAWTEXT")
775
def startTagScript(self, token):
    """Insert a <script> element and hand control to the "text" phase.

    The tokenizer is switched to its scriptDataState, and the current
    phase is remembered in parser.originalPhase before the switch.
    """
    self.tree.insertElement(token)
    parser = self.parser
    tokenizer = parser.tokenizer
    tokenizer.state = tokenizer.scriptDataState
    # Remember where to come back to once the script text has been consumed.
    parser.originalPhase = parser.phase
    parser.phase = parser.phases["text"]
781
def startTagOther(self, token):
785
def endTagHead(self, token):
    """Close the <head> element: pop it from the parser tree's
    open-element stack and switch the parser to the "afterHead" phase."""
    parser = self.parser
    closed = parser.tree.openElements.pop()
    assert closed.name == "head", "Expected head got %s" % closed.name
    parser.phase = parser.phases["afterHead"]
790
def endTagHtmlBodyBr(self, token):
794
def endTagOther(self, token):
795
self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
797
def anythingElse(self):
798
self.endTagHead(impliedTagToken("head"))
800
# XXX If we implement a parser for which scripting is disabled we need to
801
# implement this phase.
803
# class InHeadNoScriptPhase(Phase):
804
class AfterHeadPhase(Phase):
805
def __init__(self, parser, tree):
806
Phase.__init__(self, parser, tree)
808
self.startTagHandler = utils.MethodDispatcher([
809
("html", self.startTagHtml),
810
("body", self.startTagBody),
811
("frameset", self.startTagFrameset),
812
(("base", "basefont", "bgsound", "link", "meta", "noframes", "script",
814
self.startTagFromHead),
815
("head", self.startTagHead)
817
self.startTagHandler.default = self.startTagOther
818
self.endTagHandler = utils.MethodDispatcher([(("body", "html", "br"),
819
self.endTagHtmlBodyBr)])
820
self.endTagHandler.default = self.endTagOther
822
def processEOF(self):
826
def processCharacters(self, token):
830
def startTagHtml(self, token):
831
return self.parser.phases["inBody"].processStartTag(token)
833
def startTagBody(self, token):
834
self.parser.framesetOK = False
835
self.tree.insertElement(token)
836
self.parser.phase = self.parser.phases["inBody"]
838
def startTagFrameset(self, token):
839
self.tree.insertElement(token)
840
self.parser.phase = self.parser.phases["inFrameset"]
842
def startTagFromHead(self, token):
843
self.parser.parseError("unexpected-start-tag-out-of-my-head",
844
{"name": token["name"]})
845
self.tree.openElements.append(self.tree.headPointer)
846
self.parser.phases["inHead"].processStartTag(token)
847
for node in self.tree.openElements[::-1]:
848
if node.name == "head":
849
self.tree.openElements.remove(node)
852
def startTagHead(self, token):
853
self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
855
def startTagOther(self, token):
859
def endTagHtmlBodyBr(self, token):
863
def endTagOther(self, token):
864
self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
866
def anythingElse(self):
867
self.tree.insertElement(impliedTagToken("body", "StartTag"))
868
self.parser.phase = self.parser.phases["inBody"]
869
self.parser.framesetOK = True
871
class InBodyPhase(Phase):
872
# http://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody
873
# the really-really-really-very crazy mode
874
def __init__(self, parser, tree):
875
Phase.__init__(self, parser, tree)
877
# Keep a ref to this for special handling of whitespace in <pre>
878
self.processSpaceCharactersNonPre = self.processSpaceCharacters
880
self.startTagHandler = utils.MethodDispatcher([
881
("html", self.startTagHtml),
882
(("base", "basefont", "bgsound", "command", "link", "meta",
883
"noframes", "script", "style", "title"),
884
self.startTagProcessInHead),
885
("body", self.startTagBody),
886
("frameset", self.startTagFrameset),
887
(("address", "article", "aside", "blockquote", "center", "details",
888
"details", "dir", "div", "dl", "fieldset", "figcaption", "figure",
889
"footer", "header", "hgroup", "main", "menu", "nav", "ol", "p",
890
"section", "summary", "ul"),
891
self.startTagCloseP),
892
(headingElements, self.startTagHeading),
893
(("pre", "listing"), self.startTagPreListing),
894
("form", self.startTagForm),
895
(("li", "dd", "dt"), self.startTagListItem),
896
("plaintext", self.startTagPlaintext),
897
("a", self.startTagA),
898
(("b", "big", "code", "em", "font", "i", "s", "small", "strike",
899
"strong", "tt", "u"), self.startTagFormatting),
900
("nobr", self.startTagNobr),
901
("button", self.startTagButton),
902
(("applet", "marquee", "object"), self.startTagAppletMarqueeObject),
903
("xmp", self.startTagXmp),
904
("table", self.startTagTable),
905
(("area", "br", "embed", "img", "keygen", "wbr"),
906
self.startTagVoidFormatting),
907
(("param", "source", "track"), self.startTagParamSource),
908
("input", self.startTagInput),
909
("hr", self.startTagHr),
910
("image", self.startTagImage),
911
("isindex", self.startTagIsIndex),
912
("textarea", self.startTagTextarea),
913
("iframe", self.startTagIFrame),
914
(("noembed", "noframes", "noscript"), self.startTagRawtext),
915
("select", self.startTagSelect),
916
(("rp", "rt"), self.startTagRpRt),
917
(("option", "optgroup"), self.startTagOpt),
918
(("math"), self.startTagMath),
919
(("svg"), self.startTagSvg),
920
(("caption", "col", "colgroup", "frame", "head",
921
"tbody", "td", "tfoot", "th", "thead",
922
"tr"), self.startTagMisplaced)
924
self.startTagHandler.default = self.startTagOther
926
self.endTagHandler = utils.MethodDispatcher([
927
("body", self.endTagBody),
928
("html", self.endTagHtml),
929
(("address", "article", "aside", "blockquote", "button", "center",
930
"details", "dialog", "dir", "div", "dl", "fieldset", "figcaption", "figure",
931
"footer", "header", "hgroup", "listing", "main", "menu", "nav", "ol", "pre",
932
"section", "summary", "ul"), self.endTagBlock),
933
("form", self.endTagForm),
935
(("dd", "dt", "li"), self.endTagListItem),
936
(headingElements, self.endTagHeading),
937
(("a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small",
938
"strike", "strong", "tt", "u"), self.endTagFormatting),
939
(("applet", "marquee", "object"), self.endTagAppletMarqueeObject),
940
("br", self.endTagBr),
942
self.endTagHandler.default = self.endTagOther
944
def isMatchingFormattingElement(self, node1, node2):
945
if node1.name != node2.name or node1.namespace != node2.namespace:
947
elif len(node1.attributes) != len(node2.attributes):
950
attributes1 = sorted(node1.attributes.items())
951
attributes2 = sorted(node2.attributes.items())
952
for attr1, attr2 in zip(attributes1, attributes2):
958
def addFormattingElement(self, token):
959
self.tree.insertElement(token)
960
element = self.tree.openElements[-1]
962
matchingElements = []
963
for node in self.tree.activeFormattingElements[::-1]:
966
elif self.isMatchingFormattingElement(node, element):
967
matchingElements.append(node)
969
assert len(matchingElements) <= 3
970
if len(matchingElements) == 3:
971
self.tree.activeFormattingElements.remove(matchingElements[-1])
972
self.tree.activeFormattingElements.append(element)
975
def processEOF(self):
976
allowed_elements = frozenset(("dd", "dt", "li", "p", "tbody", "td",
977
"tfoot", "th", "thead", "tr", "body",
979
for node in self.tree.openElements[::-1]:
980
if node.name not in allowed_elements:
981
self.parser.parseError("expected-closing-tag-but-got-eof")
985
def processSpaceCharactersDropNewline(self, token):
986
# Sometimes (start of <pre>, <listing>, and <textarea> blocks) we
987
# want to drop leading newlines
989
self.processSpaceCharacters = self.processSpaceCharactersNonPre
990
if (data.startswith("\n") and
991
self.tree.openElements[-1].name in ("pre", "listing", "textarea")
992
and not self.tree.openElements[-1].hasContent()):
995
self.tree.reconstructActiveFormattingElements()
996
self.tree.insertText(data)
998
def processCharacters(self, token):
999
if token["data"] == "\u0000":
1000
# The tokenizer should always emit null on its own
1002
self.tree.reconstructActiveFormattingElements()
1003
self.tree.insertText(token["data"])
1004
# This must be bad for performance
1005
if (self.parser.framesetOK and
1006
any([char not in spaceCharacters
1007
for char in token["data"]])):
1008
self.parser.framesetOK = False
1010
def processSpaceCharacters(self, token):
1011
self.tree.reconstructActiveFormattingElements()
1012
self.tree.insertText(token["data"])
1014
def startTagProcessInHead(self, token):
1015
return self.parser.phases["inHead"].processStartTag(token)
1017
def startTagBody(self, token):
1018
self.parser.parseError("unexpected-start-tag", {"name": "body"})
1019
if (len(self.tree.openElements) == 1
1020
or self.tree.openElements[1].name != "body"):
1021
assert self.parser.innerHTML
1023
self.parser.framesetOK = False
1024
for attr, value in token["data"].items():
1025
if attr not in self.tree.openElements[1].attributes:
1026
self.tree.openElements[1].attributes[attr] = value
1028
def startTagFrameset(self, token):
1029
self.parser.parseError("unexpected-start-tag", {"name": "frameset"})
1030
if (len(self.tree.openElements) == 1 or self.tree.openElements[1].name != "body"):
1031
assert self.parser.innerHTML
1032
elif not self.parser.framesetOK:
1035
if self.tree.openElements[1].parent:
1036
self.tree.openElements[1].parent.removeChild(self.tree.openElements[1])
1037
while self.tree.openElements[-1].name != "html":
1038
self.tree.openElements.pop()
1039
self.tree.insertElement(token)
1040
self.parser.phase = self.parser.phases["inFrameset"]
1042
def startTagCloseP(self, token):
    """Insert the element, first closing any ``p`` in button scope.

    When a ``p`` element is in button scope an implied ``p`` end tag is
    processed through ``self.endTagP`` before the new element is inserted.
    """
    if self.tree.elementInScope("p", variant="button"):
        self.endTagP(impliedTagToken("p"))
    self.tree.insertElement(token)
1047
def startTagPreListing(self, token):
    """Insert the element and arm leading-newline dropping.

    Closes a ``p`` in button scope first, clears ``framesetOK`` and swaps
    ``self.processSpaceCharacters`` for the newline-dropping variant so the
    next whitespace token is handled by it.
    """
    if self.tree.elementInScope("p", variant="button"):
        self.endTagP(impliedTagToken("p"))
    self.tree.insertElement(token)
    self.parser.framesetOK = False
    self.processSpaceCharacters = self.processSpaceCharactersDropNewline
1054
def startTagForm(self, token):
1055
if self.tree.formPointer:
1056
self.parser.parseError("unexpected-start-tag", {"name": "form"})
1058
if self.tree.elementInScope("p", variant="button"):
1059
self.endTagP(impliedTagToken("p"))
1060
self.tree.insertElement(token)
1061
self.tree.formPointer = self.tree.openElements[-1]
1063
def startTagListItem(self, token):
1064
self.parser.framesetOK = False
1066
stopNamesMap = {"li": ["li"],
1069
stopNames = stopNamesMap[token["name"]]
1070
for node in reversed(self.tree.openElements):
1071
if node.name in stopNames:
1072
self.parser.phase.processEndTag(
1073
impliedTagToken(node.name, "EndTag"))
1075
if (node.nameTuple in specialElements and
1076
node.name not in ("address", "div", "p")):
1079
if self.tree.elementInScope("p", variant="button"):
1080
self.parser.phase.processEndTag(
1081
impliedTagToken("p", "EndTag"))
1083
self.tree.insertElement(token)
1085
def startTagPlaintext(self, token):
    """Insert the element and switch the tokenizer to its plaintext state.

    A ``p`` element in button scope is closed first via ``self.endTagP``.
    """
    if self.tree.elementInScope("p", variant="button"):
        self.endTagP(impliedTagToken("p"))
    self.tree.insertElement(token)
    tokenizer = self.parser.tokenizer
    tokenizer.state = tokenizer.plaintextState
1091
def startTagHeading(self, token):
    """Insert a heading element, un-nesting a directly enclosing heading.

    A ``p`` in button scope is closed first.  If the current node is itself
    one of ``headingElements`` a parse error is reported and that node is
    popped before the new element is inserted.
    """
    if self.tree.elementInScope("p", variant="button"):
        self.endTagP(impliedTagToken("p"))
    open_elements = self.tree.openElements
    if open_elements[-1].name in headingElements:
        self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
        open_elements.pop()
    self.tree.insertElement(token)
1099
def startTagA(self, token):
1100
afeAElement = self.tree.elementInActiveFormattingElements("a")
1102
self.parser.parseError("unexpected-start-tag-implies-end-tag",
1103
{"startName": "a", "endName": "a"})
1104
self.endTagFormatting(impliedTagToken("a"))
1105
if afeAElement in self.tree.openElements:
1106
self.tree.openElements.remove(afeAElement)
1107
if afeAElement in self.tree.activeFormattingElements:
1108
self.tree.activeFormattingElements.remove(afeAElement)
1109
self.tree.reconstructActiveFormattingElements()
1110
self.addFormattingElement(token)
1112
def startTagFormatting(self, token):
    """Reconstruct active formatting elements, then add this one."""
    self.tree.reconstructActiveFormattingElements()
    self.addFormattingElement(token)
1116
def startTagNobr(self, token):
    """Add a ``nobr`` formatting element, closing an open one first.

    If a ``nobr`` is already in scope a parse error is reported, an implied
    ``nobr`` end tag is processed and the active formatting elements are
    reconstructed again before the new element is added.
    """
    self.tree.reconstructActiveFormattingElements()
    if self.tree.elementInScope("nobr"):
        self.parser.parseError("unexpected-start-tag-implies-end-tag",
                               {"startName": "nobr", "endName": "nobr"})
        self.processEndTag(impliedTagToken("nobr"))
        # XXX Need tests that trigger the following
        self.tree.reconstructActiveFormattingElements()
    self.addFormattingElement(token)
1126
def startTagButton(self, token):
1127
if self.tree.elementInScope("button"):
1128
self.parser.parseError("unexpected-start-tag-implies-end-tag",
1129
{"startName": "button", "endName": "button"})
1130
self.processEndTag(impliedTagToken("button"))
1133
self.tree.reconstructActiveFormattingElements()
1134
self.tree.insertElement(token)
1135
self.parser.framesetOK = False
1137
def startTagAppletMarqueeObject(self, token):
    """Insert the element and push a ``Marker`` onto the formatting list.

    Active formatting elements are reconstructed before insertion, and
    ``framesetOK`` is cleared afterwards.
    """
    tree = self.tree
    tree.reconstructActiveFormattingElements()
    tree.insertElement(token)
    tree.activeFormattingElements.append(Marker)
    self.parser.framesetOK = False
1143
def startTagXmp(self, token):
    """Start an ``xmp`` element and parse its content as RAWTEXT.

    A ``p`` in button scope is closed first; formatting elements are
    reconstructed and ``framesetOK`` is cleared before handing the token to
    ``parser.parseRCDataRawtext``.
    """
    if self.tree.elementInScope("p", variant="button"):
        self.endTagP(impliedTagToken("p"))
    self.tree.reconstructActiveFormattingElements()
    parser = self.parser
    parser.framesetOK = False
    parser.parseRCDataRawtext(token, "RAWTEXT")
1150
def startTagTable(self, token):
    """Insert a ``table`` element and switch to the "inTable" phase.

    Outside quirks mode, a ``p`` element in button scope is closed first by
    processing an implied ``p`` end tag.  ``framesetOK`` is cleared.
    """
    parser = self.parser
    if (parser.compatMode != "quirks" and
            self.tree.elementInScope("p", variant="button")):
        self.processEndTag(impliedTagToken("p"))
    self.tree.insertElement(token)
    parser.framesetOK = False
    parser.phase = parser.phases["inTable"]
1158
def startTagVoidFormatting(self, token):
    """Insert an element and immediately pop it off the open-element stack.

    Formatting elements are reconstructed first.  The token's
    ``selfClosingAcknowledged`` flag is set and ``framesetOK`` is cleared.
    """
    tree = self.tree
    tree.reconstructActiveFormattingElements()
    tree.insertElement(token)
    tree.openElements.pop()
    token["selfClosingAcknowledged"] = True
    self.parser.framesetOK = False
1165
def startTagInput(self, token):
    """Handle ``input`` like a void formatting element, sparing hidden ones.

    ``startTagVoidFormatting`` clears ``framesetOK``; when the ``type``
    attribute lower-cases to ``"hidden"`` the previous value is restored.
    """
    previous_frameset_ok = self.parser.framesetOK
    self.startTagVoidFormatting(token)
    attributes = token["data"]
    if ("type" in attributes and
            attributes["type"].translate(asciiUpper2Lower) == "hidden"):
        # input type=hidden doesn't change framesetOK
        self.parser.framesetOK = previous_frameset_ok
1173
def startTagParamSource(self, token):
    """Insert the element, pop it at once and acknowledge self-closing."""
    tree = self.tree
    tree.insertElement(token)
    tree.openElements.pop()
    token["selfClosingAcknowledged"] = True
1178
def startTagHr(self, token):
    """Insert an ``hr`` element and pop it immediately.

    A ``p`` in button scope is closed first.  The token's
    ``selfClosingAcknowledged`` flag is set and ``framesetOK`` is cleared.
    """
    tree = self.tree
    if tree.elementInScope("p", variant="button"):
        self.endTagP(impliedTagToken("p"))
    tree.insertElement(token)
    tree.openElements.pop()
    token["selfClosingAcknowledged"] = True
    self.parser.framesetOK = False
1186
def startTagImage(self, token):
1188
self.parser.parseError("unexpected-start-tag-treated-as",
1189
{"originalName": "image", "newName": "img"})
1190
self.processStartTag(impliedTagToken("img", "StartTag",
1191
attributes=token["data"],
1192
selfClosing=token["selfClosing"]))
1194
def startTagIsIndex(self, token):
1195
self.parser.parseError("deprecated-tag", {"name": "isindex"})
1196
if self.tree.formPointer:
1199
if "action" in token["data"]:
1200
form_attrs["action"] = token["data"]["action"]
1201
self.processStartTag(impliedTagToken("form", "StartTag",
1202
attributes=form_attrs))
1203
self.processStartTag(impliedTagToken("hr", "StartTag"))
1204
self.processStartTag(impliedTagToken("label", "StartTag"))
1205
# XXX Localization ...
1206
if "prompt" in token["data"]:
1207
prompt = token["data"]["prompt"]
1209
prompt = "This is a searchable index. Enter search keywords: "
1210
self.processCharacters(
1211
{"type": tokenTypes["Characters"], "data": prompt})
1212
attributes = token["data"].copy()
1213
if "action" in attributes:
1214
del attributes["action"]
1215
if "prompt" in attributes:
1216
del attributes["prompt"]
1217
attributes["name"] = "isindex"
1218
self.processStartTag(impliedTagToken("input", "StartTag",
1219
attributes=attributes,
1221
token["selfClosing"]))
1222
self.processEndTag(impliedTagToken("label"))
1223
self.processStartTag(impliedTagToken("hr", "StartTag"))
1224
self.processEndTag(impliedTagToken("form"))
1226
def startTagTextarea(self, token):
    """Insert a ``textarea`` and switch the tokenizer to its RCDATA state.

    Also arms the newline-dropping whitespace handler and clears
    ``framesetOK``.
    """
    self.tree.insertElement(token)
    parser = self.parser
    parser.tokenizer.state = parser.tokenizer.rcdataState
    self.processSpaceCharacters = self.processSpaceCharactersDropNewline
    parser.framesetOK = False
1232
def startTagIFrame(self, token):
    """Clear ``framesetOK`` and treat the element as RAWTEXT."""
    self.parser.framesetOK = False
    self.startTagRawtext(token)
1236
def startTagRawtext(self, token):
    """iframe, noembed noframes, noscript(if scripting enabled)

    Delegates to ``parser.parseRCDataRawtext`` with content type "RAWTEXT".
    """
    self.parser.parseRCDataRawtext(token, "RAWTEXT")
1240
def startTagOpt(self, token):
    """Insert the element, closing a currently open ``option`` first.

    If the current node is an ``option`` an implied ``option`` end tag is
    processed by the parser's current phase.  Formatting elements are then
    reconstructed and the element is inserted through ``parser.tree``.
    """
    if self.tree.openElements[-1].name == "option":
        self.parser.phase.processEndTag(impliedTagToken("option"))
    self.tree.reconstructActiveFormattingElements()
    self.parser.tree.insertElement(token)
1246
def startTagSelect(self, token):
1247
self.tree.reconstructActiveFormattingElements()
1248
self.tree.insertElement(token)
1249
self.parser.framesetOK = False
1250
if self.parser.phase in (self.parser.phases["inTable"],
1251
self.parser.phases["inCaption"],
1252
self.parser.phases["inColumnGroup"],
1253
self.parser.phases["inTableBody"],
1254
self.parser.phases["inRow"],
1255
self.parser.phases["inCell"]):
1256
self.parser.phase = self.parser.phases["inSelectInTable"]
1258
self.parser.phase = self.parser.phases["inSelect"]
1260
def startTagRpRt(self, token):
    """Insert the element, tidying up inside an open ``ruby`` first.

    When a ``ruby`` element is in scope, implied end tags are generated and
    a parse error is reported if the current node is then not ``ruby``.
    """
    tree = self.tree
    if tree.elementInScope("ruby"):
        tree.generateImpliedEndTags()
        if tree.openElements[-1].name != "ruby":
            self.parser.parseError()
    tree.insertElement(token)
1267
def startTagMath(self, token):
    """Insert a ``math`` element in the MathML namespace.

    Formatting elements are reconstructed, MathML and foreign attributes are
    adjusted on the token, and its namespace is set before insertion.  A
    self-closing token is popped straight away and acknowledged.
    """
    self.tree.reconstructActiveFormattingElements()
    self.parser.adjustMathMLAttributes(token)
    self.parser.adjustForeignAttributes(token)
    token["namespace"] = namespaces["mathml"]
    self.tree.insertElement(token)
    # Need to get the parse error right for the case where the token
    # has a namespace not equal to the xmlns attribute
    if token["selfClosing"]:
        self.tree.openElements.pop()
        token["selfClosingAcknowledged"] = True
1279
def startTagSvg(self, token):
    """Insert an ``svg`` element in the SVG namespace.

    Formatting elements are reconstructed, SVG and foreign attributes are
    adjusted on the token, and its namespace is set before insertion.  A
    self-closing token is popped straight away and acknowledged.
    """
    self.tree.reconstructActiveFormattingElements()
    self.parser.adjustSVGAttributes(token)
    self.parser.adjustForeignAttributes(token)
    token["namespace"] = namespaces["svg"]
    self.tree.insertElement(token)
    # Need to get the parse error right for the case where the token
    # has a namespace not equal to the xmlns attribute
    if token["selfClosing"]:
        self.tree.openElements.pop()
        token["selfClosingAcknowledged"] = True
1291
def startTagMisplaced(self, token):
1292
""" Elements that should be children of other elements that have a
1293
different insertion mode; here they are ignored
1294
"caption", "col", "colgroup", "frame", "frameset", "head",
1295
"option", "optgroup", "tbody", "td", "tfoot", "th", "thead",
1298
self.parser.parseError("unexpected-start-tag-ignored", {"name": token["name"]})
1300
def startTagOther(self, token):
    """Reconstruct active formatting elements, then insert the element."""
    tree = self.tree
    tree.reconstructActiveFormattingElements()
    tree.insertElement(token)
1304
def endTagP(self, token):
1305
if not self.tree.elementInScope("p", variant="button"):
1306
self.startTagCloseP(impliedTagToken("p", "StartTag"))
1307
self.parser.parseError("unexpected-end-tag", {"name": "p"})
1308
self.endTagP(impliedTagToken("p", "EndTag"))
1310
self.tree.generateImpliedEndTags("p")
1311
if self.tree.openElements[-1].name != "p":
1312
self.parser.parseError("unexpected-end-tag", {"name": "p"})
1313
node = self.tree.openElements.pop()
1314
while node.name != "p":
1315
node = self.tree.openElements.pop()
1317
def endTagBody(self, token):
1318
if not self.tree.elementInScope("body"):
1319
self.parser.parseError()
1321
elif self.tree.openElements[-1].name != "body":
1322
for node in self.tree.openElements[2:]:
1323
if node.name not in frozenset(("dd", "dt", "li", "optgroup",
1324
"option", "p", "rp", "rt",
1325
"tbody", "td", "tfoot",
1326
"th", "thead", "tr", "body",
1328
# Not sure this is the correct name for the parse error
1329
self.parser.parseError(
1330
"expected-one-end-tag-but-got-another",
1331
{"expectedName": "body", "gotName": node.name})
1333
self.parser.phase = self.parser.phases["afterBody"]
1335
def endTagHtml(self, token):
1336
# We repeat the test for the body end tag token being ignored here
1337
if self.tree.elementInScope("body"):
1338
self.endTagBody(impliedTagToken("body"))
1341
def endTagBlock(self, token):
1342
# Put us back in the right whitespace handling mode
1343
if token["name"] == "pre":
1344
self.processSpaceCharacters = self.processSpaceCharactersNonPre
1345
inScope = self.tree.elementInScope(token["name"])
1347
self.tree.generateImpliedEndTags()
1348
if self.tree.openElements[-1].name != token["name"]:
1349
self.parser.parseError("end-tag-too-early", {"name": token["name"]})
1351
node = self.tree.openElements.pop()
1352
while node.name != token["name"]:
1353
node = self.tree.openElements.pop()
1355
def endTagForm(self, token):
1356
node = self.tree.formPointer
1357
self.tree.formPointer = None
1358
if node is None or not self.tree.elementInScope(node):
1359
self.parser.parseError("unexpected-end-tag",
1362
self.tree.generateImpliedEndTags()
1363
if self.tree.openElements[-1] != node:
1364
self.parser.parseError("end-tag-too-early-ignored",
1366
self.tree.openElements.remove(node)
1368
def endTagListItem(self, token):
1369
if token["name"] == "li":
1373
if not self.tree.elementInScope(token["name"], variant=variant):
1374
self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
1376
self.tree.generateImpliedEndTags(exclude=token["name"])
1377
if self.tree.openElements[-1].name != token["name"]:
1378
self.parser.parseError(
1379
"end-tag-too-early",
1380
{"name": token["name"]})
1381
node = self.tree.openElements.pop()
1382
while node.name != token["name"]:
1383
node = self.tree.openElements.pop()
1385
def endTagHeading(self, token):
1386
for item in headingElements:
1387
if self.tree.elementInScope(item):
1388
self.tree.generateImpliedEndTags()
1390
if self.tree.openElements[-1].name != token["name"]:
1391
self.parser.parseError("end-tag-too-early", {"name": token["name"]})
1393
for item in headingElements:
1394
if self.tree.elementInScope(item):
1395
item = self.tree.openElements.pop()
1396
while item.name not in headingElements:
1397
item = self.tree.openElements.pop()
1400
def endTagFormatting(self, token):
1401
"""The much-feared adoption agency algorithm"""
1402
# http://svn.whatwg.org/webapps/complete.html#adoptionAgency revision 7867
1403
# XXX Better parseError messages appreciated.
1406
outerLoopCounter = 0
1409
while outerLoopCounter < 8:
1412
outerLoopCounter += 1
1416
# Let the formatting element be the last element in
1417
# the list of active formatting elements that:
1418
# - is between the end of the list and the last scope
1419
# marker in the list, if any, or the start of the list
1421
# - has the same tag name as the token.
1422
formattingElement = self.tree.elementInActiveFormattingElements(
1424
if (not formattingElement or
1425
(formattingElement in self.tree.openElements and
1426
not self.tree.elementInScope(formattingElement.name))):
1427
# If there is no such node, then abort these steps
1428
# and instead act as described in the "any other
1429
# end tag" entry below.
1430
self.endTagOther(token)
1433
# Otherwise, if there is such a node, but that node is
1434
# not in the stack of open elements, then this is a
1435
# parse error; remove the element from the list, and
1436
# abort these steps.
1437
elif formattingElement not in self.tree.openElements:
1438
self.parser.parseError("adoption-agency-1.2", {"name": token["name"]})
1439
self.tree.activeFormattingElements.remove(formattingElement)
1442
# Otherwise, if there is such a node, and that node is
1443
# also in the stack of open elements, but the element
1444
# is not in scope, then this is a parse error; ignore
1445
# the token, and abort these steps.
1446
elif not self.tree.elementInScope(formattingElement.name):
1447
self.parser.parseError("adoption-agency-4.4", {"name": token["name"]})
1450
# Otherwise, there is a formatting element and that
1451
# element is in the stack and is in scope. If the
1452
# element is not the current node, this is a parse
1453
# error. In any case, proceed with the algorithm as
1454
# written in the following steps.
1456
if formattingElement != self.tree.openElements[-1]:
1457
self.parser.parseError("adoption-agency-1.3", {"name": token["name"]})
1461
# Let the furthest block be the topmost node in the
1462
# stack of open elements that is lower in the stack
1463
# than the formatting element, and is an element in
1464
# the special category. There might not be one.
1465
afeIndex = self.tree.openElements.index(formattingElement)
1466
furthestBlock = None
1467
for element in self.tree.openElements[afeIndex:]:
1468
if element.nameTuple in specialElements:
1469
furthestBlock = element
1474
# If there is no furthest block, then the UA must
1475
# first pop all the nodes from the bottom of the stack
1476
# of open elements, from the current node up to and
1477
# including the formatting element, then remove the
1478
# formatting element from the list of active
1479
# formatting elements, and finally abort these steps.
1480
if furthestBlock is None:
1481
element = self.tree.openElements.pop()
1482
while element != formattingElement:
1483
element = self.tree.openElements.pop()
1484
self.tree.activeFormattingElements.remove(element)
1488
commonAncestor = self.tree.openElements[afeIndex - 1]
1491
# The bookmark is supposed to help us identify where to reinsert
1492
# nodes in step 15. We have to ensure that we reinsert nodes after
1493
# the node before the active formatting element. Note the bookmark
1494
# can move in step 9.7
1495
bookmark = self.tree.activeFormattingElements.index(formattingElement)
1498
lastNode = node = furthestBlock
1499
innerLoopCounter = 0
1501
index = self.tree.openElements.index(node)
1502
while innerLoopCounter < 3:
1503
innerLoopCounter += 1
1504
# Node is element before node in open elements
1506
node = self.tree.openElements[index]
1507
if node not in self.tree.activeFormattingElements:
1508
self.tree.openElements.remove(node)
1511
if node == formattingElement:
1514
if lastNode == furthestBlock:
1515
bookmark = self.tree.activeFormattingElements.index(node) + 1
1517
clone = node.cloneNode()
1518
# Replace node with clone
1519
self.tree.activeFormattingElements[
1520
self.tree.activeFormattingElements.index(node)] = clone
1521
self.tree.openElements[
1522
self.tree.openElements.index(node)] = clone
1525
# Remove lastNode from its parents, if any
1527
lastNode.parent.removeChild(lastNode)
1528
node.appendChild(lastNode)
1533
# Foster parent lastNode if commonAncestor is a
1534
# table, tbody, tfoot, thead, or tr we need to foster
1535
# parent the lastNode
1537
lastNode.parent.removeChild(lastNode)
1539
if commonAncestor.name in frozenset(("table", "tbody", "tfoot", "thead", "tr")):
1540
parent, insertBefore = self.tree.getTableMisnestedNodePosition()
1541
parent.insertBefore(lastNode, insertBefore)
1543
commonAncestor.appendChild(lastNode)
1546
clone = formattingElement.cloneNode()
1549
furthestBlock.reparentChildren(clone)
1552
furthestBlock.appendChild(clone)
1555
self.tree.activeFormattingElements.remove(formattingElement)
1556
self.tree.activeFormattingElements.insert(bookmark, clone)
1559
self.tree.openElements.remove(formattingElement)
1560
self.tree.openElements.insert(
1561
self.tree.openElements.index(furthestBlock) + 1, clone)
1563
def endTagAppletMarqueeObject(self, token):
    """Close an applet/marquee/object element named by the token.

    Implied end tags are generated when the element is in scope.  A parse
    error is reported if the current node is not the named element.  When
    the element is in scope, the stack is popped up to and including it and
    the active formatting elements are cleared.
    """
    tree = self.tree
    name = token["name"]
    if tree.elementInScope(name):
        tree.generateImpliedEndTags()
    if tree.openElements[-1].name != name:
        self.parser.parseError("end-tag-too-early", {"name": name})

    if tree.elementInScope(name):
        element = tree.openElements.pop()
        while element.name != name:
            element = tree.openElements.pop()
        tree.clearActiveFormattingElements()
1575
def endTagBr(self, token):
    """Treat a ``br`` end tag as a ``br`` start tag.

    Reports a parse error, reconstructs the active formatting elements,
    inserts an implied ``br`` start tag and pops it immediately.
    """
    self.parser.parseError("unexpected-end-tag-treated-as",
                           {"originalName": "br", "newName": "br element"})
    tree = self.tree
    tree.reconstructActiveFormattingElements()
    tree.insertElement(impliedTagToken("br", "StartTag"))
    tree.openElements.pop()
1582
def endTagOther(self, token):
1583
for node in self.tree.openElements[::-1]:
1584
if node.name == token["name"]:
1585
self.tree.generateImpliedEndTags(exclude=token["name"])
1586
if self.tree.openElements[-1].name != token["name"]:
1587
self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
1588
while self.tree.openElements.pop() != node:
1592
if node.nameTuple in specialElements:
1593
self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
1596
class TextPhase(Phase):
1597
def __init__(self, parser, tree):
    """Initialise the phase and its tag dispatch tables.

    Start tags all go to ``startTagOther``.  The ``script`` end tag goes to
    ``endTagScript``; every other end tag goes to ``endTagOther``.
    """
    Phase.__init__(self, parser, tree)
    self.startTagHandler = utils.MethodDispatcher([])
    self.startTagHandler.default = self.startTagOther
    self.endTagHandler = utils.MethodDispatcher([
        ("script", self.endTagScript)])
    self.endTagHandler.default = self.endTagOther
1605
def processCharacters(self, token):
    """Insert the token's ``data`` into the tree as text."""
    self.tree.insertText(token["data"])
1608
def processEOF(self):
1609
self.parser.parseError("expected-named-closing-tag-but-got-eof",
1610
{"name": self.tree.openElements[-1].name})
1611
self.tree.openElements.pop()
1612
self.parser.phase = self.parser.originalPhase
1615
def startTagOther(self, token):
    """Fail loudly: this phase's start-tag dispatcher routes every tag here.

    The assertion message names the offending tag.
    """
    assert False, "Tried to process start tag %s in RCDATA/RAWTEXT mode" % token['name']
1618
def endTagScript(self, token):
    """Pop the current ``script`` element and restore the original phase."""
    node = self.tree.openElements.pop()
    assert node.name == "script"
    self.parser.phase = self.parser.originalPhase
    # The rest of this method is all stuff that only happens if
    # document.write works
1625
def endTagOther(self, token):
    """Pop the current node and switch back to the parser's original phase."""
    self.tree.openElements.pop()
    parser = self.parser
    parser.phase = parser.originalPhase
1629
class InTablePhase(Phase):
1630
# http://www.whatwg.org/specs/web-apps/current-work/#in-table
1631
def __init__(self, parser, tree):
1632
Phase.__init__(self, parser, tree)
1633
self.startTagHandler = utils.MethodDispatcher([
1634
("html", self.startTagHtml),
1635
("caption", self.startTagCaption),
1636
("colgroup", self.startTagColgroup),
1637
("col", self.startTagCol),
1638
(("tbody", "tfoot", "thead"), self.startTagRowGroup),
1639
(("td", "th", "tr"), self.startTagImplyTbody),
1640
("table", self.startTagTable),
1641
(("style", "script"), self.startTagStyleScript),
1642
("input", self.startTagInput),
1643
("form", self.startTagForm)
1645
self.startTagHandler.default = self.startTagOther
1647
self.endTagHandler = utils.MethodDispatcher([
1648
("table", self.endTagTable),
1649
(("body", "caption", "col", "colgroup", "html", "tbody", "td",
1650
"tfoot", "th", "thead", "tr"), self.endTagIgnore)
1652
self.endTagHandler.default = self.endTagOther
1655
def clearStackToTableContext(self):
    """Pop open elements until the current node is ``table`` or ``html``."""
    # "clear the stack back to a table context"
    open_elements = self.tree.openElements
    while open_elements[-1].name not in ("table", "html"):
        # self.parser.parseError("unexpected-implied-end-tag-in-table",
        #  {"name": self.tree.openElements[-1].name})
        open_elements.pop()
    # When the current node is <html> it's an innerHTML case
1663
# processing methods
1664
def processEOF(self):
1665
if self.tree.openElements[-1].name != "html":
1666
self.parser.parseError("eof-in-table")
1668
assert self.parser.innerHTML
1671
def processSpaceCharacters(self, token):
    """Switch to the "inTableText" phase and let it handle the token.

    The phase that was current on entry is stored on the new phase as
    ``originalPhase``.
    """
    parser = self.parser
    previous_phase = parser.phase
    parser.phase = parser.phases["inTableText"]
    parser.phase.originalPhase = previous_phase
    parser.phase.processSpaceCharacters(token)
1677
def processCharacters(self, token):
    """Switch to the "inTableText" phase and let it handle the token.

    The phase that was current on entry is stored on the new phase as
    ``originalPhase``.
    """
    parser = self.parser
    previous_phase = parser.phase
    parser.phase = parser.phases["inTableText"]
    parser.phase.originalPhase = previous_phase
    parser.phase.processCharacters(token)
1683
def insertText(self, token):
    """Process a character token via "inBody" with table insertion enabled.

    ``tree.insertFromTable`` is set for the duration of the delegated call
    and is reset to ``False`` afterwards, even if that call raises, so the
    flag cannot leak into later insertions.
    """
    # If we get here there must be at least one non-whitespace character
    # Do the table magic!
    self.tree.insertFromTable = True
    try:
        self.parser.phases["inBody"].processCharacters(token)
    finally:
        self.tree.insertFromTable = False
1690
def startTagCaption(self, token):
    """Insert a ``caption`` and switch to the "inCaption" phase.

    The stack is cleared back to a table context and a ``Marker`` is pushed
    onto the active formatting elements before insertion.
    """
    self.clearStackToTableContext()
    self.tree.activeFormattingElements.append(Marker)
    self.tree.insertElement(token)
    self.parser.phase = self.parser.phases["inCaption"]
1696
def startTagColgroup(self, token):
    """Insert a ``colgroup`` and switch to the "inColumnGroup" phase.

    The stack is cleared back to a table context first.
    """
    self.clearStackToTableContext()
    self.tree.insertElement(token)
    parser = self.parser
    parser.phase = parser.phases["inColumnGroup"]
1701
def startTagCol(self, token):
1702
self.startTagColgroup(impliedTagToken("colgroup", "StartTag"))
1705
def startTagRowGroup(self, token):
    """Insert a row-group element and switch to the "inTableBody" phase.

    The stack is cleared back to a table context first.
    """
    self.clearStackToTableContext()
    self.tree.insertElement(token)
    parser = self.parser
    parser.phase = parser.phases["inTableBody"]
1710
def startTagImplyTbody(self, token):
1711
self.startTagRowGroup(impliedTagToken("tbody", "StartTag"))
1714
def startTagTable(self, token):
1715
self.parser.parseError("unexpected-start-tag-implies-end-tag",
1716
{"startName": "table", "endName": "table"})
1717
self.parser.phase.processEndTag(impliedTagToken("table"))
1718
if not self.parser.innerHTML:
1721
def startTagStyleScript(self, token):
    """Hand the start tag to the "inHead" phase and return its result."""
    in_head = self.parser.phases["inHead"]
    return in_head.processStartTag(token)
1724
def startTagInput(self, token):
1725
if ("type" in token["data"] and
1726
token["data"]["type"].translate(asciiUpper2Lower) == "hidden"):
1727
self.parser.parseError("unexpected-hidden-input-in-table")
1728
self.tree.insertElement(token)
1729
# XXX associate with form
1730
self.tree.openElements.pop()
1732
self.startTagOther(token)
1734
def startTagForm(self, token):
    """Report a parse error and register a ``form`` found inside a table.

    Only when no form pointer is set: the element is inserted, recorded as
    ``tree.formPointer`` and popped from the open-element stack again.
    """
    self.parser.parseError("unexpected-form-in-table")
    tree = self.tree
    if tree.formPointer is None:
        tree.insertElement(token)
        tree.formPointer = tree.openElements[-1]
        tree.openElements.pop()
1741
def startTagOther(self, token):
    """Report a parse error and process the start tag via "inBody".

    ``tree.insertFromTable`` is set for the duration of the delegated call
    and is reset to ``False`` afterwards, even if that call raises.
    """
    self.parser.parseError("unexpected-start-tag-implies-table-voodoo", {"name": token["name"]})
    # Do the table magic!
    self.tree.insertFromTable = True
    try:
        self.parser.phases["inBody"].processStartTag(token)
    finally:
        self.tree.insertFromTable = False
1748
def endTagTable(self, token):
1749
if self.tree.elementInScope("table", variant="table"):
1750
self.tree.generateImpliedEndTags()
1751
if self.tree.openElements[-1].name != "table":
1752
self.parser.parseError("end-tag-too-early-named",
1753
{"gotName": "table",
1754
"expectedName": self.tree.openElements[-1].name})
1755
while self.tree.openElements[-1].name != "table":
1756
self.tree.openElements.pop()
1757
self.tree.openElements.pop()
1758
self.parser.resetInsertionMode()
1761
assert self.parser.innerHTML
1762
self.parser.parseError()
1764
def endTagIgnore(self, token):
    """Report an "unexpected-end-tag" parse error and do nothing else."""
    self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
1767
def endTagOther(self, token):
    """Report a parse error and process the end tag via "inBody".

    ``tree.insertFromTable`` is set for the duration of the delegated call
    and is reset to ``False`` afterwards, even if that call raises.
    """
    self.parser.parseError("unexpected-end-tag-implies-table-voodoo", {"name": token["name"]})
    # Do the table magic!
    self.tree.insertFromTable = True
    try:
        self.parser.phases["inBody"].processEndTag(token)
    finally:
        self.tree.insertFromTable = False
1774
class InTableTextPhase(Phase):
1775
def __init__(self, parser, tree):
    """Initialise the phase with no original phase and no buffered tokens."""
    Phase.__init__(self, parser, tree)
    # Phase to return to; set by the caller that switches into this phase.
    self.originalPhase = None
    # Character tokens collected until they are flushed.
    self.characterTokens = []
1780
def flushCharacters(self):
1781
data = "".join([item["data"] for item in self.characterTokens])
1782
if any([item not in spaceCharacters for item in data]):
1783
token = {"type": tokenTypes["Characters"], "data": data}
1784
self.parser.phases["inTable"].insertText(token)
1786
self.tree.insertText(data)
1787
self.characterTokens = []
1789
def processComment(self, token):
1790
self.flushCharacters()
1791
self.parser.phase = self.originalPhase
1794
def processEOF(self):
1795
self.flushCharacters()
1796
self.parser.phase = self.originalPhase
1799
def processCharacters(self, token):
1800
if token["data"] == "\u0000":
1802
self.characterTokens.append(token)
1804
def processSpaceCharacters(self, token):
1805
# pretty sure we should never reach here
1806
self.characterTokens.append(token)
1809
def processStartTag(self, token):
1810
self.flushCharacters()
1811
self.parser.phase = self.originalPhase
1814
def processEndTag(self, token):
1815
self.flushCharacters()
1816
self.parser.phase = self.originalPhase
1819
class InCaptionPhase(Phase):
1820
# http://www.whatwg.org/specs/web-apps/current-work/#in-caption
1821
def __init__(self, parser, tree):
1822
Phase.__init__(self, parser, tree)
1824
self.startTagHandler = utils.MethodDispatcher([
1825
("html", self.startTagHtml),
1826
(("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
1827
"thead", "tr"), self.startTagTableElement)
1829
self.startTagHandler.default = self.startTagOther
1831
self.endTagHandler = utils.MethodDispatcher([
1832
("caption", self.endTagCaption),
1833
("table", self.endTagTable),
1834
(("body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th",
1835
"thead", "tr"), self.endTagIgnore)
1837
self.endTagHandler.default = self.endTagOther
1839
def ignoreEndTagCaption(self):
    """Return True when no ``caption`` element is in table scope."""
    caption_in_scope = self.tree.elementInScope("caption", variant="table")
    return not caption_in_scope
1842
def processEOF(self):
    """Delegate end-of-file handling to the "inBody" phase."""
    in_body = self.parser.phases["inBody"]
    in_body.processEOF()
1845
def processCharacters(self, token):
    """Hand the character token to "inBody" and return its result."""
    in_body = self.parser.phases["inBody"]
    return in_body.processCharacters(token)
1848
def startTagTableElement(self, token):
1849
self.parser.parseError()
1850
# XXX Have to duplicate logic here to find out if the tag is ignored
1851
ignoreEndTag = self.ignoreEndTagCaption()
1852
self.parser.phase.processEndTag(impliedTagToken("caption"))
1853
if not ignoreEndTag:
1856
def startTagOther(self, token):
    """Hand the start tag to "inBody" and return its result."""
    in_body = self.parser.phases["inBody"]
    return in_body.processStartTag(token)
1859
def endTagCaption(self, token):
1860
if not self.ignoreEndTagCaption():
1861
# AT this code is quite similar to endTagTable in "InTable"
1862
self.tree.generateImpliedEndTags()
1863
if self.tree.openElements[-1].name != "caption":
1864
self.parser.parseError("expected-one-end-tag-but-got-another",
1865
{"gotName": "caption",
1866
"expectedName": self.tree.openElements[-1].name})
1867
while self.tree.openElements[-1].name != "caption":
1868
self.tree.openElements.pop()
1869
self.tree.openElements.pop()
1870
self.tree.clearActiveFormattingElements()
1871
self.parser.phase = self.parser.phases["inTable"]
1874
assert self.parser.innerHTML
1875
self.parser.parseError()
1877
def endTagTable(self, token):
1878
self.parser.parseError()
1879
ignoreEndTag = self.ignoreEndTagCaption()
1880
self.parser.phase.processEndTag(impliedTagToken("caption"))
1881
if not ignoreEndTag:
1884
def endTagIgnore(self, token):
    """Report an "unexpected-end-tag" parse error and do nothing else."""
    self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
1887
def endTagOther(self, token):
    """Hand the end tag to "inBody" and return its result."""
    in_body = self.parser.phases["inBody"]
    return in_body.processEndTag(token)
1890
class InColumnGroupPhase(Phase):
1891
# http://www.whatwg.org/specs/web-apps/current-work/#in-column
1893
def __init__(self, parser, tree):
1894
Phase.__init__(self, parser, tree)
1896
self.startTagHandler = utils.MethodDispatcher([
1897
("html", self.startTagHtml),
1898
("col", self.startTagCol)
1900
self.startTagHandler.default = self.startTagOther
1902
self.endTagHandler = utils.MethodDispatcher([
1903
("colgroup", self.endTagColgroup),
1904
("col", self.endTagCol)
1906
self.endTagHandler.default = self.endTagOther
1908
def ignoreEndTagColgroup(self):
    """Return True when the current node is the ``html`` element."""
    current_node = self.tree.openElements[-1]
    return current_node.name == "html"
1911
def processEOF(self):
1912
if self.tree.openElements[-1].name == "html":
1913
assert self.parser.innerHTML
1916
ignoreEndTag = self.ignoreEndTagColgroup()
1917
self.endTagColgroup(impliedTagToken("colgroup"))
1918
if not ignoreEndTag:
1921
def processCharacters(self, token):
1922
ignoreEndTag = self.ignoreEndTagColgroup()
1923
self.endTagColgroup(impliedTagToken("colgroup"))
1924
if not ignoreEndTag:
1927
def startTagCol(self, token):
    """Insert a ``col`` element and pop it straight off the stack."""
    tree = self.tree
    tree.insertElement(token)
    tree.openElements.pop()
1931
def startTagOther(self, token):
1932
ignoreEndTag = self.ignoreEndTagColgroup()
1933
self.endTagColgroup(impliedTagToken("colgroup"))
1934
if not ignoreEndTag:
1937
def endTagColgroup(self, token):
1938
if self.ignoreEndTagColgroup():
1940
assert self.parser.innerHTML
1941
self.parser.parseError()
1943
self.tree.openElements.pop()
1944
self.parser.phase = self.parser.phases["inTable"]
1946
def endTagCol(self, token):
1947
self.parser.parseError("no-end-tag", {"name": "col"})
1949
def endTagOther(self, token):
1950
ignoreEndTag = self.ignoreEndTagColgroup()
1951
self.endTagColgroup(impliedTagToken("colgroup"))
1952
if not ignoreEndTag:
1955
class InTableBodyPhase(Phase):
    """Token handlers for the "in table body" insertion mode."""
    # http://www.whatwg.org/specs/web-apps/current-work/#in-table0
    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)
        self.startTagHandler = utils.MethodDispatcher([
            ("html", self.startTagHtml),
            ("tr", self.startTagTr),
            (("td", "th"), self.startTagTableCell),
            (("caption", "col", "colgroup", "tbody", "tfoot", "thead"),
             self.startTagTableOther)
        ])
        self.startTagHandler.default = self.startTagOther

        self.endTagHandler = utils.MethodDispatcher([
            (("tbody", "tfoot", "thead"), self.endTagTableRowGroup),
            ("table", self.endTagTable),
            (("body", "caption", "col", "colgroup", "html", "td", "th",
              "tr"), self.endTagIgnore)
        ])
        self.endTagHandler.default = self.endTagOther

    # helper methods
    def clearStackToTableBodyContext(self):
        """Pop open elements until a tbody, tfoot, thead or html is current."""
        while self.tree.openElements[-1].name not in ("tbody", "tfoot",
                                                      "thead", "html"):
            # self.parser.parseError("unexpected-implied-end-tag-in-table",
            #  {"name": self.tree.openElements[-1].name})
            self.tree.openElements.pop()
        if self.tree.openElements[-1].name == "html":
            assert self.parser.innerHTML

    # the rest
    def processEOF(self):
        """Delegate end of input to the "inTable" phase."""
        self.parser.phases["inTable"].processEOF()

    def processSpaceCharacters(self, token):
        """Delegate whitespace to the "inTable" phase."""
        return self.parser.phases["inTable"].processSpaceCharacters(token)

    def processCharacters(self, token):
        """Delegate character data to the "inTable" phase."""
        return self.parser.phases["inTable"].processCharacters(token)

    def startTagTr(self, token):
        """Insert a row and switch to the "inRow" phase."""
        self.clearStackToTableBodyContext()
        self.tree.insertElement(token)
        self.parser.phase = self.parser.phases["inRow"]

    def startTagTableCell(self, token):
        """A cell without a row: imply ``<tr>`` and return the token."""
        self.parser.parseError("unexpected-cell-in-table-body",
                               {"name": token["name"]})
        self.startTagTr(impliedTagToken("tr", "StartTag"))
        return token

    def startTagTableOther(self, token):
        """Close the current row group, then return the token.

        If no tbody/thead/tfoot is in table scope (innerHTML case) the
        token is dropped with a parse error instead.
        """
        # XXX AT Any ideas on how to share this with endTagTable?
        if (self.tree.elementInScope("tbody", variant="table") or
            self.tree.elementInScope("thead", variant="table") or
                self.tree.elementInScope("tfoot", variant="table")):
            self.clearStackToTableBodyContext()
            self.endTagTableRowGroup(
                impliedTagToken(self.tree.openElements[-1].name))
            return token
        else:
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()

    def startTagOther(self, token):
        """Delegate any other start tag to the "inTable" phase."""
        return self.parser.phases["inTable"].processStartTag(token)

    def endTagTableRowGroup(self, token):
        """Close the named row group if it is in table scope."""
        if self.tree.elementInScope(token["name"], variant="table"):
            self.clearStackToTableBodyContext()
            self.tree.openElements.pop()
            self.parser.phase = self.parser.phases["inTable"]
        else:
            self.parser.parseError("unexpected-end-tag-in-table-body",
                                   {"name": token["name"]})

    def endTagTable(self, token):
        """Close the current row group, then return ``</table>``.

        If no tbody/thead/tfoot is in table scope (innerHTML case) the
        token is dropped with a parse error instead.
        """
        if (self.tree.elementInScope("tbody", variant="table") or
            self.tree.elementInScope("thead", variant="table") or
                self.tree.elementInScope("tfoot", variant="table")):
            self.clearStackToTableBodyContext()
            self.endTagTableRowGroup(
                impliedTagToken(self.tree.openElements[-1].name))
            return token
        else:
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()

    def endTagIgnore(self, token):
        """Report an end tag that is not allowed here and drop it."""
        self.parser.parseError("unexpected-end-tag-in-table-body",
                               {"name": token["name"]})

    def endTagOther(self, token):
        """Delegate any other end tag to the "inTable" phase."""
        return self.parser.phases["inTable"].processEndTag(token)
2053
class InRowPhase(Phase):
    """Token handlers for the "in row" insertion mode."""
    # http://www.whatwg.org/specs/web-apps/current-work/#in-row
    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)
        self.startTagHandler = utils.MethodDispatcher([
            ("html", self.startTagHtml),
            (("td", "th"), self.startTagTableCell),
            (("caption", "col", "colgroup", "tbody", "tfoot", "thead",
              "tr"), self.startTagTableOther)
        ])
        self.startTagHandler.default = self.startTagOther

        self.endTagHandler = utils.MethodDispatcher([
            ("tr", self.endTagTr),
            ("table", self.endTagTable),
            (("tbody", "tfoot", "thead"), self.endTagTableRowGroup),
            (("body", "caption", "col", "colgroup", "html", "td", "th"),
             self.endTagIgnore)
        ])
        self.endTagHandler.default = self.endTagOther

    # helper methods (XXX unify this with other table helper methods)
    def clearStackToTableRowContext(self):
        """Pop open elements until "tr" or "html" is current.

        Each popped element is reported as a parse error.
        """
        while self.tree.openElements[-1].name not in ("tr", "html"):
            self.parser.parseError("unexpected-implied-end-tag-in-table-row",
                                   {"name": self.tree.openElements[-1].name})
            self.tree.openElements.pop()

    def ignoreEndTagTr(self):
        """Return True when no "tr" element is in table scope."""
        return not self.tree.elementInScope("tr", variant="table")

    # the rest
    def processEOF(self):
        """Delegate end of input to the "inTable" phase."""
        self.parser.phases["inTable"].processEOF()

    def processSpaceCharacters(self, token):
        """Delegate whitespace to the "inTable" phase."""
        return self.parser.phases["inTable"].processSpaceCharacters(token)

    def processCharacters(self, token):
        """Delegate character data to the "inTable" phase."""
        return self.parser.phases["inTable"].processCharacters(token)

    def startTagTableCell(self, token):
        """Insert a cell, switch to "inCell" and push a formatting marker."""
        self.clearStackToTableRowContext()
        self.tree.insertElement(token)
        self.parser.phase = self.parser.phases["inCell"]
        self.tree.activeFormattingElements.append(Marker)

    def startTagTableOther(self, token):
        """Imply ``</tr>``; return the token unless that was ignored."""
        ignoreEndTag = self.ignoreEndTagTr()
        self.endTagTr(impliedTagToken("tr"))
        # XXX how are we sure it's always ignored in the innerHTML case?
        if not ignoreEndTag:
            return token

    def startTagOther(self, token):
        """Delegate any other start tag to the "inTable" phase."""
        return self.parser.phases["inTable"].processStartTag(token)

    def endTagTr(self, token):
        """Close the row and switch to "inTableBody", or flag a parse error."""
        if not self.ignoreEndTagTr():
            self.clearStackToTableRowContext()
            self.tree.openElements.pop()
            self.parser.phase = self.parser.phases["inTableBody"]
        else:
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()

    def endTagTable(self, token):
        """Imply ``</tr>``; return ``</table>`` unless that was ignored."""
        ignoreEndTag = self.ignoreEndTagTr()
        self.endTagTr(impliedTagToken("tr"))
        # Reprocess the current tag if the tr end tag was not ignored
        # XXX how are we sure it's always ignored in the innerHTML case?
        if not ignoreEndTag:
            return token

    def endTagTableRowGroup(self, token):
        """Close the row and return the token if the group is in table scope."""
        if self.tree.elementInScope(token["name"], variant="table"):
            self.endTagTr(impliedTagToken("tr"))
            return token
        else:
            self.parser.parseError()

    def endTagIgnore(self, token):
        """Report an end tag that is not allowed here and drop it."""
        self.parser.parseError("unexpected-end-tag-in-table-row",
                               {"name": token["name"]})

    def endTagOther(self, token):
        """Delegate any other end tag to the "inTable" phase."""
        return self.parser.phases["inTable"].processEndTag(token)
2142
class InCellPhase(Phase):
    """Token handlers for the "in cell" insertion mode."""
    # http://www.whatwg.org/specs/web-apps/current-work/#in-cell
    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)
        self.startTagHandler = utils.MethodDispatcher([
            ("html", self.startTagHtml),
            (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
              "thead", "tr"), self.startTagTableOther)
        ])
        self.startTagHandler.default = self.startTagOther

        self.endTagHandler = utils.MethodDispatcher([
            (("td", "th"), self.endTagTableCell),
            (("body", "caption", "col", "colgroup", "html"), self.endTagIgnore),
            (("table", "tbody", "tfoot", "thead", "tr"), self.endTagImply)
        ])
        self.endTagHandler.default = self.endTagOther

    # helper
    def closeCell(self):
        """Process an implied end tag for whichever of td/th is in table scope."""
        if self.tree.elementInScope("td", variant="table"):
            self.endTagTableCell(impliedTagToken("td"))
        elif self.tree.elementInScope("th", variant="table"):
            self.endTagTableCell(impliedTagToken("th"))

    # the rest
    def processEOF(self):
        """Delegate end of input to the "inBody" phase."""
        self.parser.phases["inBody"].processEOF()

    def processCharacters(self, token):
        """Delegate character data to the "inBody" phase."""
        return self.parser.phases["inBody"].processCharacters(token)

    def startTagTableOther(self, token):
        """Close the open cell and return the token.

        If neither td nor th is in table scope (innerHTML case) the token
        is dropped with a parse error instead.
        """
        if (self.tree.elementInScope("td", variant="table") or
                self.tree.elementInScope("th", variant="table")):
            self.closeCell()
            return token
        else:
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()

    def startTagOther(self, token):
        """Delegate any other start tag to the "inBody" phase."""
        return self.parser.phases["inBody"].processStartTag(token)

    def endTagTableCell(self, token):
        """Close the named cell and switch to the "inRow" phase.

        If the cell is not in table scope the end tag is reported as
        unexpected and ignored.
        """
        if self.tree.elementInScope(token["name"], variant="table"):
            self.tree.generateImpliedEndTags(token["name"])
            if self.tree.openElements[-1].name != token["name"]:
                self.parser.parseError("unexpected-cell-end-tag",
                                       {"name": token["name"]})
                # Pop everything up to and including the matching cell.
                while True:
                    node = self.tree.openElements.pop()
                    if node.name == token["name"]:
                        break
            else:
                self.tree.openElements.pop()
            self.tree.clearActiveFormattingElements()
            self.parser.phase = self.parser.phases["inRow"]
        else:
            self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

    def endTagIgnore(self, token):
        """Report an end tag that is not allowed here and drop it."""
        self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

    def endTagImply(self, token):
        """Close the open cell and return the token if its element is in scope."""
        if self.tree.elementInScope(token["name"], variant="table"):
            self.closeCell()
            return token
        else:
            # sometimes innerHTML case
            self.parser.parseError()

    def endTagOther(self, token):
        """Delegate any other end tag to the "inBody" phase."""
        return self.parser.phases["inBody"].processEndTag(token)
2218
class InSelectPhase(Phase):
    """Token handlers for the "in select" insertion mode."""

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        self.startTagHandler = utils.MethodDispatcher([
            ("html", self.startTagHtml),
            ("option", self.startTagOption),
            ("optgroup", self.startTagOptgroup),
            ("select", self.startTagSelect),
            (("input", "keygen", "textarea"), self.startTagInput),
            ("script", self.startTagScript)
        ])
        self.startTagHandler.default = self.startTagOther

        self.endTagHandler = utils.MethodDispatcher([
            ("option", self.endTagOption),
            ("optgroup", self.endTagOptgroup),
            ("select", self.endTagSelect)
        ])
        self.endTagHandler.default = self.endTagOther

    # http://www.whatwg.org/specs/web-apps/current-work/#in-select
    def processEOF(self):
        """Report EOF inside a select unless only "html" remains open."""
        if self.tree.openElements[-1].name != "html":
            self.parser.parseError("eof-in-select")
        else:
            assert self.parser.innerHTML

    def processCharacters(self, token):
        """Insert character data, silently dropping a lone U+0000."""
        if token["data"] == "\u0000":
            return
        self.tree.insertText(token["data"])

    def startTagOption(self, token):
        """Insert an option, first closing an option that is still current."""
        # We need to imply </option> if <option> is the current node.
        if self.tree.openElements[-1].name == "option":
            self.tree.openElements.pop()
        self.tree.insertElement(token)

    def startTagOptgroup(self, token):
        """Insert an optgroup, first closing a current option and optgroup."""
        if self.tree.openElements[-1].name == "option":
            self.tree.openElements.pop()
        if self.tree.openElements[-1].name == "optgroup":
            self.tree.openElements.pop()
        self.tree.insertElement(token)

    def startTagSelect(self, token):
        """A nested ``<select>`` is an error and is treated as ``</select>``."""
        self.parser.parseError("unexpected-select-in-select")
        self.endTagSelect(impliedTagToken("select"))

    def startTagInput(self, token):
        """Close the select and return the token if a select is in scope."""
        self.parser.parseError("unexpected-input-in-select")
        if self.tree.elementInScope("select", variant="select"):
            self.endTagSelect(impliedTagToken("select"))
            return token
        else:
            assert self.parser.innerHTML

    def startTagScript(self, token):
        """Delegate ``<script>`` to the "inHead" phase."""
        return self.parser.phases["inHead"].processStartTag(token)

    def startTagOther(self, token):
        """Report any other start tag as unexpected and drop it."""
        self.parser.parseError("unexpected-start-tag-in-select",
                               {"name": token["name"]})

    def endTagOption(self, token):
        """Close the current option, or report the end tag as unexpected."""
        if self.tree.openElements[-1].name == "option":
            self.tree.openElements.pop()
        else:
            self.parser.parseError("unexpected-end-tag-in-select",
                                   {"name": "option"})

    def endTagOptgroup(self, token):
        """Close the current optgroup, including an option directly inside it."""
        # </optgroup> implicitly closes <option>
        if (self.tree.openElements[-1].name == "option" and
                self.tree.openElements[-2].name == "optgroup"):
            self.tree.openElements.pop()
        # It also closes </optgroup>
        if self.tree.openElements[-1].name == "optgroup":
            self.tree.openElements.pop()
        # But nothing else
        else:
            self.parser.parseError("unexpected-end-tag-in-select",
                                   {"name": "optgroup"})

    def endTagSelect(self, token):
        """Pop through the select element and reset the insertion mode."""
        if self.tree.elementInScope("select", variant="select"):
            node = self.tree.openElements.pop()
            while node.name != "select":
                node = self.tree.openElements.pop()
            self.parser.resetInsertionMode()
        else:
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()

    def endTagOther(self, token):
        """Report any other end tag as unexpected and drop it."""
        self.parser.parseError("unexpected-end-tag-in-select",
                               {"name": token["name"]})
2318
class InSelectInTablePhase(Phase):
    """Token handlers for the "in select in table" insertion mode.

    Table-related tags close the select; everything else is delegated to
    the "inSelect" phase.
    """

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        self.startTagHandler = utils.MethodDispatcher([
            (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
             self.startTagTable)
        ])
        self.startTagHandler.default = self.startTagOther

        self.endTagHandler = utils.MethodDispatcher([
            (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
             self.endTagTable)
        ])
        self.endTagHandler.default = self.endTagOther

    def processEOF(self):
        """Delegate end of input to the "inSelect" phase."""
        self.parser.phases["inSelect"].processEOF()

    def processCharacters(self, token):
        """Delegate character data to the "inSelect" phase."""
        return self.parser.phases["inSelect"].processCharacters(token)

    def startTagTable(self, token):
        """Report the error, imply ``</select>`` and return the token."""
        self.parser.parseError("unexpected-table-element-start-tag-in-select-in-table", {"name": token["name"]})
        self.endTagOther(impliedTagToken("select"))
        return token

    def startTagOther(self, token):
        """Delegate any other start tag to the "inSelect" phase."""
        return self.parser.phases["inSelect"].processStartTag(token)

    def endTagTable(self, token):
        """Report the error; close the select if the element is in table scope.

        The token is returned only when the select was closed; otherwise it
        is dropped.
        """
        self.parser.parseError("unexpected-table-element-end-tag-in-select-in-table", {"name": token["name"]})
        if self.tree.elementInScope(token["name"], variant="table"):
            self.endTagOther(impliedTagToken("select"))
            return token

    def endTagOther(self, token):
        """Delegate any other end tag to the "inSelect" phase."""
        return self.parser.phases["inSelect"].processEndTag(token)
2357
class InForeignContentPhase(Phase):
    """Token handlers used while the current node is in a foreign namespace."""

    # Start tags that break out of foreign content.
    breakoutElements = frozenset(["b", "big", "blockquote", "body", "br",
                                  "center", "code", "dd", "div", "dl", "dt",
                                  "em", "embed", "h1", "h2", "h3",
                                  "h4", "h5", "h6", "head", "hr", "i", "img",
                                  "li", "listing", "menu", "meta", "nobr",
                                  "ol", "p", "pre", "ruby", "s", "small",
                                  "span", "strong", "strike", "sub", "sup",
                                  "table", "tt", "u", "ul", "var"])

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

    def adjustSVGTagNames(self, token):
        """Rewrite a lower-cased SVG tag name to its mixed-case form in place."""
        replacements = {"altglyph": "altGlyph",
                        "altglyphdef": "altGlyphDef",
                        "altglyphitem": "altGlyphItem",
                        "animatecolor": "animateColor",
                        "animatemotion": "animateMotion",
                        "animatetransform": "animateTransform",
                        "clippath": "clipPath",
                        "feblend": "feBlend",
                        "fecolormatrix": "feColorMatrix",
                        "fecomponenttransfer": "feComponentTransfer",
                        "fecomposite": "feComposite",
                        "feconvolvematrix": "feConvolveMatrix",
                        "fediffuselighting": "feDiffuseLighting",
                        "fedisplacementmap": "feDisplacementMap",
                        "fedistantlight": "feDistantLight",
                        "feflood": "feFlood",
                        "fefunca": "feFuncA",
                        "fefuncb": "feFuncB",
                        "fefuncg": "feFuncG",
                        "fefuncr": "feFuncR",
                        "fegaussianblur": "feGaussianBlur",
                        "feimage": "feImage",
                        "femerge": "feMerge",
                        "femergenode": "feMergeNode",
                        "femorphology": "feMorphology",
                        "feoffset": "feOffset",
                        "fepointlight": "fePointLight",
                        "fespecularlighting": "feSpecularLighting",
                        "fespotlight": "feSpotLight",
                        "fetile": "feTile",
                        "feturbulence": "feTurbulence",
                        "foreignobject": "foreignObject",
                        "glyphref": "glyphRef",
                        "lineargradient": "linearGradient",
                        "radialgradient": "radialGradient",
                        "textpath": "textPath"}

        if token["name"] in replacements:
            token["name"] = replacements[token["name"]]

    def processCharacters(self, token):
        """Normalise character data, then hand it to the base Phase handler.

        A lone U+0000 is replaced by U+FFFD.  Otherwise, any non-space
        character clears the parser's ``framesetOK`` flag.
        """
        if token["data"] == "\u0000":
            token["data"] = "\uFFFD"
        elif (self.parser.framesetOK and
              any(char not in spaceCharacters for char in token["data"])):
            self.parser.framesetOK = False
        Phase.processCharacters(self, token)

    def processStartTag(self, token):
        """Insert a foreign element, or break out of foreign content.

        For a breakout element (or ``<font>`` carrying color, face or
        size) a parse error is raised, open elements are popped until an
        HTML element or an integration point is current, and the token is
        returned.  Otherwise the token is adjusted for the current node's
        namespace and inserted.
        """
        currentNode = self.tree.openElements[-1]
        if (token["name"] in self.breakoutElements or
            (token["name"] == "font" and
             set(token["data"].keys()) & set(["color", "face", "size"]))):
            self.parser.parseError("unexpected-html-element-in-foreign-content",
                                   {"name": token["name"]})
            while (self.tree.openElements[-1].namespace !=
                   self.tree.defaultNamespace and
                   not self.parser.isHTMLIntegrationPoint(self.tree.openElements[-1]) and
                   not self.parser.isMathMLTextIntegrationPoint(self.tree.openElements[-1])):
                self.tree.openElements.pop()
            return token

        else:
            if currentNode.namespace == namespaces["mathml"]:
                self.parser.adjustMathMLAttributes(token)
            elif currentNode.namespace == namespaces["svg"]:
                self.adjustSVGTagNames(token)
                self.parser.adjustSVGAttributes(token)
            self.parser.adjustForeignAttributes(token)
            token["namespace"] = currentNode.namespace
            self.tree.insertElement(token)
            if token["selfClosing"]:
                self.tree.openElements.pop()
                token["selfClosingAcknowledged"] = True

    def processEndTag(self, token):
        """Close the matching foreign element or defer to the current phase.

        The open-element stack is walked from the top.  If a node whose
        lower-cased name equals the token name is found, the stack is
        popped through it and None is returned.  If an element in the
        default namespace is reached first, the token is passed to the
        parser's current phase and that result is returned.
        """
        nodeIndex = len(self.tree.openElements) - 1
        node = self.tree.openElements[-1]
        if node.name != token["name"]:
            self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

        while True:
            if node.name.translate(asciiUpper2Lower) == token["name"]:
                # XXX this isn't in the spec but it seems necessary
                if self.parser.phase == self.parser.phases["inTableText"]:
                    self.parser.phase.flushCharacters()
                    self.parser.phase = self.parser.phase.originalPhase
                while self.tree.openElements.pop() != node:
                    assert self.tree.openElements
                new_token = None
                break
            nodeIndex -= 1

            node = self.tree.openElements[nodeIndex]
            if node.namespace != self.tree.defaultNamespace:
                continue
            else:
                new_token = self.parser.phase.processEndTag(token)
                break
        return new_token
2472
class AfterBodyPhase(Phase):
    """Token handlers for the "after body" insertion mode."""

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        self.startTagHandler = utils.MethodDispatcher([
            ("html", self.startTagHtml)
        ])
        self.startTagHandler.default = self.startTagOther

        self.endTagHandler = utils.MethodDispatcher([("html", self.endTagHtml)])
        self.endTagHandler.default = self.endTagOther

    def processEOF(self):
        """End of input needs no further work in this phase."""
        # Stop parsing
        pass

    def processComment(self, token):
        """Attach the comment to the first open element."""
        # This is needed because data is to be appended to the <html> element
        # here and not to whatever is currently open.
        self.tree.insertComment(token, self.tree.openElements[0])

    def processCharacters(self, token):
        """Report the error, fall back to "inBody" and return the token."""
        self.parser.parseError("unexpected-char-after-body")
        self.parser.phase = self.parser.phases["inBody"]
        return token

    def startTagHtml(self, token):
        """Delegate ``<html>`` to the "inBody" phase."""
        return self.parser.phases["inBody"].processStartTag(token)

    def startTagOther(self, token):
        """Report the error, fall back to "inBody" and return the token."""
        self.parser.parseError("unexpected-start-tag-after-body",
                               {"name": token["name"]})
        self.parser.phase = self.parser.phases["inBody"]
        return token

    def endTagHtml(self, name):
        """Switch to "afterAfterBody"; in innerHTML mode report an error instead."""
        if self.parser.innerHTML:
            self.parser.parseError("unexpected-end-tag-after-body-innerhtml")
        else:
            self.parser.phase = self.parser.phases["afterAfterBody"]

    def endTagOther(self, token):
        """Report the error, fall back to "inBody" and return the token."""
        self.parser.parseError("unexpected-end-tag-after-body",
                               {"name": token["name"]})
        self.parser.phase = self.parser.phases["inBody"]
        return token
2519
class InFramesetPhase(Phase):
    """Token handlers for the "in frameset" insertion mode."""
    # http://www.whatwg.org/specs/web-apps/current-work/#in-frameset
    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        self.startTagHandler = utils.MethodDispatcher([
            ("html", self.startTagHtml),
            ("frameset", self.startTagFrameset),
            ("frame", self.startTagFrame),
            ("noframes", self.startTagNoframes)
        ])
        self.startTagHandler.default = self.startTagOther

        self.endTagHandler = utils.MethodDispatcher([
            ("frameset", self.endTagFrameset)
        ])
        self.endTagHandler.default = self.endTagOther

    def processEOF(self):
        """Report EOF inside a frameset unless only "html" remains open."""
        if self.tree.openElements[-1].name != "html":
            self.parser.parseError("eof-in-frameset")
        else:
            assert self.parser.innerHTML

    def processCharacters(self, token):
        """Character data is not allowed here: report it and drop it."""
        self.parser.parseError("unexpected-char-in-frameset")

    def startTagFrameset(self, token):
        """Insert a nested frameset element."""
        self.tree.insertElement(token)

    def startTagFrame(self, token):
        """Insert the frame element and pop it straight off the stack."""
        self.tree.insertElement(token)
        self.tree.openElements.pop()

    def startTagNoframes(self, token):
        """Delegate ``<noframes>`` to the "inBody" phase."""
        return self.parser.phases["inBody"].processStartTag(token)

    def startTagOther(self, token):
        """Report any other start tag as unexpected and drop it."""
        self.parser.parseError("unexpected-start-tag-in-frameset",
                               {"name": token["name"]})

    def endTagFrameset(self, token):
        """Close the current frameset and possibly switch to "afterFrameset"."""
        if self.tree.openElements[-1].name == "html":
            # innerHTML case
            self.parser.parseError("unexpected-frameset-in-frameset-innerhtml")
        else:
            self.tree.openElements.pop()
        if (not self.parser.innerHTML and
                self.tree.openElements[-1].name != "frameset"):
            # If we're not in innerHTML mode and the current node is not a
            # "frameset" element (anymore) then switch.
            self.parser.phase = self.parser.phases["afterFrameset"]

    def endTagOther(self, token):
        """Report any other end tag as unexpected and drop it."""
        self.parser.parseError("unexpected-end-tag-in-frameset",
                               {"name": token["name"]})
2576
class AfterFramesetPhase(Phase):
    """Token handlers for the "after frameset" insertion mode."""
    # http://www.whatwg.org/specs/web-apps/current-work/#after3
    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        self.startTagHandler = utils.MethodDispatcher([
            ("html", self.startTagHtml),
            ("noframes", self.startTagNoframes)
        ])
        self.startTagHandler.default = self.startTagOther

        self.endTagHandler = utils.MethodDispatcher([
            ("html", self.endTagHtml)
        ])
        self.endTagHandler.default = self.endTagOther

    def processEOF(self):
        """End of input needs no further work in this phase."""
        # Stop parsing
        pass

    def processCharacters(self, token):
        """Character data is not allowed here: report it and drop it."""
        self.parser.parseError("unexpected-char-after-frameset")

    def startTagNoframes(self, token):
        """Delegate ``<noframes>`` to the "inHead" phase."""
        return self.parser.phases["inHead"].processStartTag(token)

    def startTagOther(self, token):
        """Report any other start tag as unexpected and drop it."""
        self.parser.parseError("unexpected-start-tag-after-frameset",
                               {"name": token["name"]})

    def endTagHtml(self, token):
        """Switch to the "afterAfterFrameset" phase."""
        self.parser.phase = self.parser.phases["afterAfterFrameset"]

    def endTagOther(self, token):
        """Report any other end tag as unexpected and drop it."""
        self.parser.parseError("unexpected-end-tag-after-frameset",
                               {"name": token["name"]})
2613
class AfterAfterBodyPhase(Phase):
    """Token handlers for the "after after body" insertion mode."""

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        self.startTagHandler = utils.MethodDispatcher([
            ("html", self.startTagHtml)
        ])
        self.startTagHandler.default = self.startTagOther

    def processEOF(self):
        """End of input needs no further work in this phase."""
        pass

    def processComment(self, token):
        """Attach the comment to the document node."""
        self.tree.insertComment(token, self.tree.document)

    def processSpaceCharacters(self, token):
        """Delegate whitespace to the "inBody" phase."""
        return self.parser.phases["inBody"].processSpaceCharacters(token)

    def processCharacters(self, token):
        """Report the error, fall back to "inBody" and return the token."""
        self.parser.parseError("expected-eof-but-got-char")
        self.parser.phase = self.parser.phases["inBody"]
        return token

    def startTagHtml(self, token):
        """Delegate ``<html>`` to the "inBody" phase."""
        return self.parser.phases["inBody"].processStartTag(token)

    def startTagOther(self, token):
        """Report the error, fall back to "inBody" and return the token."""
        self.parser.parseError("expected-eof-but-got-start-tag",
                               {"name": token["name"]})
        self.parser.phase = self.parser.phases["inBody"]
        return token

    def processEndTag(self, token):
        """Report the error, fall back to "inBody" and return the token."""
        self.parser.parseError("expected-eof-but-got-end-tag",
                               {"name": token["name"]})
        self.parser.phase = self.parser.phases["inBody"]
        return token
2651
class AfterAfterFramesetPhase(Phase):
    """Token handlers for the "after after frameset" insertion mode."""

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        self.startTagHandler = utils.MethodDispatcher([
            ("html", self.startTagHtml),
            ("noframes", self.startTagNoFrames)
        ])
        self.startTagHandler.default = self.startTagOther

    def processEOF(self):
        """End of input needs no further work in this phase."""
        pass

    def processComment(self, token):
        """Attach the comment to the document node."""
        self.tree.insertComment(token, self.tree.document)

    def processSpaceCharacters(self, token):
        """Delegate whitespace to the "inBody" phase."""
        return self.parser.phases["inBody"].processSpaceCharacters(token)

    def processCharacters(self, token):
        """Character data is not allowed here: report it and drop it."""
        self.parser.parseError("expected-eof-but-got-char")

    def startTagHtml(self, token):
        """Delegate ``<html>`` to the "inBody" phase."""
        return self.parser.phases["inBody"].processStartTag(token)

    def startTagNoFrames(self, token):
        """Delegate ``<noframes>`` to the "inHead" phase."""
        return self.parser.phases["inHead"].processStartTag(token)

    def startTagOther(self, token):
        """Report any other start tag as unexpected and drop it."""
        self.parser.parseError("expected-eof-but-got-start-tag",
                               {"name": token["name"]})

    def processEndTag(self, token):
        """Report any end tag as unexpected and drop it."""
        self.parser.parseError("expected-eof-but-got-end-tag",
                               {"name": token["name"]})
2688
"initial": InitialPhase,
2689
"beforeHtml": BeforeHtmlPhase,
2690
"beforeHead": BeforeHeadPhase,
2691
"inHead": InHeadPhase,
2692
# XXX "inHeadNoscript": InHeadNoScriptPhase,
2693
"afterHead": AfterHeadPhase,
2694
"inBody": InBodyPhase,
2696
"inTable": InTablePhase,
2697
"inTableText": InTableTextPhase,
2698
"inCaption": InCaptionPhase,
2699
"inColumnGroup": InColumnGroupPhase,
2700
"inTableBody": InTableBodyPhase,
2701
"inRow": InRowPhase,
2702
"inCell": InCellPhase,
2703
"inSelect": InSelectPhase,
2704
"inSelectInTable": InSelectInTablePhase,
2705
"inForeignContent": InForeignContentPhase,
2706
"afterBody": AfterBodyPhase,
2707
"inFrameset": InFramesetPhase,
2708
"afterFrameset": AfterFramesetPhase,
2709
"afterAfterBody": AfterAfterBodyPhase,
2710
"afterAfterFrameset": AfterAfterFramesetPhase,
2711
# XXX after after frameset
2715
def impliedTagToken(name, type="EndTag", attributes=None,
                    selfClosing=False):
    """Build a synthetic tag token for a tag the parser implies.

    name -- tag name stored under the "name" key
    type -- key looked up in ``tokenTypes`` (defaults to "EndTag")
    attributes -- value stored under "data"; a fresh empty dict is used
                  when omitted, so calls never share one mutable default
    selfClosing -- value stored under the "selfClosing" key

    Returns a dict with the keys "type", "name", "data" and "selfClosing".
    """
    if attributes is None:
        attributes = {}
    return {"type": tokenTypes[type], "name": name, "data": attributes,
            "selfClosing": selfClosing}
2723
class ParseError(Exception):
    """Error in parsed document"""