2
* Copyright (C) 2019 Apple Inc. All rights reserved.
4
* Redistribution and use in source and binary forms, with or without
5
* modification, are permitted provided that the following conditions
7
* 1. Redistributions of source code must retain the above copyright
8
* notice, this list of conditions and the following disclaimer.
9
* 2. Redistributions in binary form must reproduce the above copyright
10
* notice, this list of conditions and the following disclaimer in the
11
* documentation and/or other materials provided with the distribution.
13
* THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS''
14
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
15
* THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS
17
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
18
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
19
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
20
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
21
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
22
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
23
* THE POSSIBILITY OF SUCH DAMAGE.
26
HTMLParser = class HTMLParser {
30
// Public entry point: parses `sourceText` and drives `treeBuilder` with the result.
//
// Parameters:
//   sourceText  - the markup to parse; asserted to be a string.
//   treeBuilder - object receiving parser output. It is asserted to provide
//                 pushParserNode; begin() and end() are optional hooks that are
//                 invoked before and after the parse loop when they exist.
//   {isXML}     - optional options object; isXML is coerced to a boolean and
//                 stored in _isXML.
//
// NOTE(review): not every statement of this method is visible in this excerpt
// (for example the body of the `while` loop) - confirm against the full file.
parseDocument(sourceText, treeBuilder, {isXML} = {})
32
console.assert(typeof sourceText === "string");
33
console.assert(treeBuilder);
34
console.assert(treeBuilder.pushParserNode);
36
this._treeBuilder = treeBuilder;
39
// Every parse starts in Data mode with no pending bogus-comment opener.
this._mode = HTMLParser.Mode.Data;
40
this._data = sourceText;
41
this._bogusCommentOpener = null;
42
this._isXML = !!isXML;
44
if (this._treeBuilder.begin)
45
this._treeBuilder.begin();
47
// Loop for as long as the read position is before the end of the input.
while (this._pos < this._data.length)
50
if (this._treeBuilder.end)
51
this._treeBuilder.end();
58
// NOTE(review): the header of this method is not visible in this excerpt;
// presumably it is _isEOF(), which other methods call as an end-of-input test - confirm.
// True when the read position has reached the end of the input string.
return this._pos === this._data.length;
63
return this._data.substring(this._pos, this._pos + n);
66
// Tests the single character at the current read position against `regex`
// and returns the boolean result. The read position is not advanced.
// Past the end of the input charAt() yields "", so the result is then
// whatever `regex` gives for the empty string.
_peekCharacterRegex(regex)
68
return regex.test(this._data.charAt(this._pos));
73
// NOTE(review): the header of this method, the definition of `c` and the
// return statements are not visible in this excerpt; presumably this is
// _peekString(str), the case-sensitive look-ahead used throughout - confirm.
// Walks `str` index by index and compares `c` against the input character at
// the same offset from the current read position (strict inequality).
for (let i = 0; i < str.length; ++i) {
75
if (this._data.charAt(this._pos + i) !== c)
82
// Case-insensitive look-ahead: walks `str` index by index and compares `c`
// with the lowercased input character at the same offset from the current
// read position.
//
// str - the text to look for; it must already be lowercase (asserted below).
//
// NOTE(review): the definition of `c` and the return statements are not
// visible in this excerpt - confirm against the full file.
_peekCaseInsensitiveString(str)
84
console.assert(str.toLowerCase() === str, "String should be passed in as lowercase.");
86
for (let i = 0; i < str.length; ++i) {
87
let d = this._data.charAt(this._pos + i);
91
if (d.toLowerCase() !== c)
100
// NOTE(review): the header of this method and the body of the loop are not
// visible in this excerpt; presumably this is _consumeRegex(regex) - confirm.
// Remembers the starting read position, loops while the character at the read
// position matches `regex`, and returns the input between the starting
// position and the final read position.
let startIndex = this._pos;
101
while (regex.test(this._data.charAt(this._pos)))
104
return this._data.substring(startIndex, this._pos);
109
// NOTE(review): the header of this method is not visible in this excerpt;
// presumably it is _consumeWhitespace() - confirm.
// Delegates to _consumeRegex with the whitespace character class and returns its result.
return this._consumeRegex(/\s/);
112
// Consumes input from the current read position up to the next occurrence of `str`.
//
// Parameters:
//   str     - delimiter, searched for with indexOf() from the current read position.
//   newMode - parser mode assigned to this._mode on both visible paths.
//
// Two paths are visible:
//   1. The read position jumps to the end of the input and everything that
//      remained is returned.
//   2. The read position moves to just past the delimiter and the text before
//      the delimiter is returned (the delimiter itself is excluded).
//
// NOTE(review): the condition that selects between the two paths, and any
// guard around the `this._mode = newMode` assignments, are not visible in this
// excerpt. Presumably path 1 is taken when indexOf() finds no match - confirm.
_consumeUntilString(str, newMode)
114
let index = this._data.indexOf(str, this._pos);
116
let startIndex = this._pos;
117
this._pos = this._data.length;
119
this._mode = newMode;
120
return this._data.substring(startIndex, this._data.length);
123
let startIndex = this._pos;
124
// Skip over the delimiter as well as the text before it.
this._pos = index + str.length;
126
this._mode = newMode;
127
return this._data.substring(startIndex, index);
130
// Reads a double-quoted string: asserts that the input at the read position
// starts with `"`, then consumes up to the next `"` via _consumeUntilString
// (called without a mode argument).
// NOTE(review): statements between and after the visible lines (presumably
// skipping the opening quote and returning `string`) are not visible in this
// excerpt - confirm against the full file.
_consumeDoubleQuotedString()
132
console.assert(this._peekString(`"`));
134
let string = this._consumeUntilString(`"`);
138
// Reads a single-quoted string: asserts that the input at the read position
// starts with `'`, then consumes up to the next `'` via _consumeUntilString
// (called without a mode argument).
// NOTE(review): statements between and after the visible lines (presumably
// skipping the opening quote and returning `string`) are not visible in this
// excerpt - confirm against the full file.
_consumeSingleQuotedString()
140
console.assert(this._peekString(`'`));
142
let string = this._consumeUntilString(`'`);
147
// This is a crude implementation of HTML tokenization:
148
// https://html.spec.whatwg.org/multipage/parsing.html
152
// NOTE(review): the header of this method is not visible in this excerpt -
// confirm its name against the full file.
// Mode dispatcher: routes the current parser mode to the handler method
// dedicated to that mode and returns the handler's result.
switch (this._mode) {
153
case HTMLParser.Mode.Data:
154
return this._parseData();
155
case HTMLParser.Mode.ScriptData:
156
return this._parseScriptData();
157
case HTMLParser.Mode.TagOpen:
158
return this._parseTagOpen();
159
case HTMLParser.Mode.Attr:
160
return this._parseAttr();
161
case HTMLParser.Mode.CData:
162
return this._parseCData();
163
case HTMLParser.Mode.Doctype:
164
return this._parseDoctype();
165
case HTMLParser.Mode.Comment:
166
return this._parseComment();
167
case HTMLParser.Mode.BogusComment:
168
return this._parseBogusComment();
172
// Presumably reached only when no case above matched (the lines in between are not visible).
// NOTE(review): this throws a bare string rather than an Error object, so the
// thrown value carries no stack trace.
throw "Missing parser mode";
177
// NOTE(review): the header of this method is not visible in this excerpt;
// presumably it is _parseData(), the Data-mode handler - confirm. Guards
// between the visible statements may also be hidden.
// Consumes text up to the next "<" (passing TagOpen as the new mode) and
// pushes it as a Text node positioned where the text started.
let startPos = this._pos;
178
let text = this._consumeUntilString("<", HTMLParser.Mode.TagOpen);
180
this._push({type: HTMLParser.NodeType.Text, data: text, pos: startPos});
182
// The input ended on a dangling "<": hand the position of that "<" to _handleEOF.
if (this._isEOF() && this._data.endsWith("<"))
183
this._handleEOF(this._pos - 1);
188
// NOTE(review): the header of this method, the declaration of `scriptText`
// and the loop/branch structure around the visible statements are not visible
// in this excerpt; presumably this is _parseScriptData(), the ScriptData-mode
// handler - confirm.
// Accumulates raw text into `scriptText` one "<"-delimited piece at a time.
// When a "<" is followed by "/script>" (compared case-insensitively) the
// closing tag is skipped and the mode returns to Data. The collected text is
// pushed as a Text node, followed by a CloseTag node named "script" whose
// position is the start position plus the length of the script text.
let startPos = this._pos;
191
// Parse as text until </script>.
193
scriptText += this._consumeUntilString("<");
194
if (this._peekCaseInsensitiveString("/script>")) {
195
this._pos += "/script>".length;
196
this._mode = HTMLParser.Mode.Data;
199
// The return value of _handleEOF decides this branch (its body is not visible here).
if (this._handleEOF(startPos))
205
this._push({type: HTMLParser.NodeType.Text, data: scriptText, pos: startPos});
206
this._push({type: HTMLParser.NodeType.CloseTag, name: "script", pos: startPos + scriptText.length});
212
// NOTE(review): the header of this method, its closing braces, and the
// return/advance statements between the visible lines are not visible in this
// excerpt; presumably this is _parseTagOpen(), the TagOpen-mode handler - confirm.
//
// Records the position one character before the read position (presumably the
// "<" that was just consumed) as _currentTagStartPos, then classifies what follows:
//   "!--"        -> Comment mode
//   "!doctype"   -> Doctype mode (matched case-insensitively; the matched text
//                   as written in the input is kept in _doctypeRaw)
//   "![CDATA["   -> CData mode
//   other "!"    -> BogusComment mode
//   "?"          -> BogusComment mode, remembering "<?" as the opener
//   "/"          -> text up to ">" is pushed as a CloseTag node, mode Data
//   ASCII letter -> tag name is read; then whitespace leads to Attr mode,
//                   "/>" pushes a closed OpenTag, ">" pushes an unclosed OpenTag,
//                   and otherwise "<" + name is pushed as ErrorText
//   anything else-> a literal "<" Text node is pushed, mode Data
this._currentTagStartPos = this._pos - 1;
214
if (this._peekString("!")) {
216
if (this._peekString("!--")) {
217
this._pos += "!--".length;
218
this._mode = HTMLParser.Mode.Comment;
219
this._handleEOF(this._currentTagStartPos);
224
if (this._peekCaseInsensitiveString("!doctype")) {
225
let startPos = this._pos;
226
// "!DOCTYPE" has the same length as the lowercase form that was matched above.
this._pos += "!DOCTYPE".length;
227
this._doctypeRaw = this._data.substring(startPos, this._pos);
228
this._mode = HTMLParser.Mode.Doctype;
229
this._handleEOF(this._currentTagStartPos);
234
if (this._peekString("![CDATA[")) {
235
this._pos += "![CDATA[".length;
236
this._mode = HTMLParser.Mode.CData;
237
this._handleEOF(this._currentTagStartPos);
243
this._mode = HTMLParser.Mode.BogusComment;
244
this._handleEOF(this._currentTagStartPos);
248
if (this._peekString("?")) {
251
this._mode = HTMLParser.Mode.BogusComment;
252
this._bogusCommentOpener = "<?";
253
this._handleEOF(this._currentTagStartPos);
257
if (this._peekString("/")) {
260
let text = this._consumeUntilString(">", HTMLParser.Mode.Data);
261
this._push({type: HTMLParser.NodeType.CloseTag, name: text, pos: this._currentTagStartPos});
266
if (this._peekCharacterRegex(/[a-z]/i)) {
267
// The tag name runs up to the first whitespace, "/" or ">".
let text = this._consumeRegex(/[^\s/>]+/);
269
if (this._peekCharacterRegex(/\s/)) {
270
this._currentTagName = text;
271
this._currentTagAttributes = [];
272
this._mode = HTMLParser.Mode.Attr;
276
if (this._peekString("/>")) {
277
this._pos += "/>".length;
278
this._mode = HTMLParser.Mode.Data;
279
this._push({type: HTMLParser.NodeType.OpenTag, name: text, closed: true, pos: this._currentTagStartPos});
283
if (this._peekString(">")) {
285
this._mode = HTMLParser.Mode.Data;
286
this._push({type: HTMLParser.NodeType.OpenTag, name: text, closed: false, pos: this._currentTagStartPos});
290
// End of document. Output any remaining data as error text.
291
console.assert(this._isEOF());
292
this._push({type: HTMLParser.NodeType.ErrorText, data: "<" + text, pos: this._currentTagStartPos});
297
// Anything else, treat as text.
298
this._push({type: HTMLParser.NodeType.Text, data: "<", pos: this._currentTagStartPos});
299
this._mode = HTMLParser.Mode.Data;
304
// NOTE(review): the header of this method, its closing braces, and the
// return/advance statements and some conditions between the visible lines are
// not visible in this excerpt; presumably this is _parseAttr(), the Attr-mode
// handler - confirm.
//
// Skips leading whitespace, then either finishes the open tag or reads one attribute:
//   "/>"          -> pushes a closed OpenTag with the collected attributes, mode Data
//   ">"           -> pushes an unclosed OpenTag with the collected attributes, mode Data
//   name          -> read up to whitespace, "=", "/" or ">"
//   name="value"  -> pushed with AttrQuoteType.Double
//   name='value'  -> pushed with AttrQuoteType.Single
//   name=value    -> pushed with AttrQuoteType.None; the value is read up to
//                    whitespace, "=" or ">"
//   name alone    -> pushed with value undefined
// At end of input, everything from the tag start is pushed as ErrorText.
this._consumeWhitespace();
306
if (this._peekString("/>")) {
307
this._pos += "/>".length;
308
this._mode = HTMLParser.Mode.Data;
309
this._push({type: HTMLParser.NodeType.OpenTag, name: this._currentTagName, closed: true, attributes: this._currentTagAttributes, pos: this._currentTagStartPos});
313
if (this._peekString(">")) {
315
this._mode = HTMLParser.Mode.Data;
316
this._push({type: HTMLParser.NodeType.OpenTag, name: this._currentTagName, closed: false, attributes: this._currentTagAttributes, pos: this._currentTagStartPos});
321
let attributeNameStartPos = this._pos;
323
let attributeName = this._consumeRegex(/[^\s=/>]+/);
324
// console.assert(attributeName.length > 0, "Unexpected empty attribute name");
325
// The tag ends right after the name: record the attribute without a value.
if (this._peekString("/") || this._peekString(">")) {
327
this._pushAttribute({name: attributeName, value: undefined, namePos: attributeNameStartPos});
331
this._consumeWhitespace();
333
if (this._peekString("=")) {
337
let attributeValueStartPos = this._pos;
339
this._consumeWhitespace();
341
if (this._peekString(`"`)) {
342
let attributeValue = this._consumeDoubleQuotedString();
343
this._pushAttribute({name: attributeName, value: attributeValue, quote: HTMLParser.AttrQuoteType.Double, namePos: attributeNameStartPos, valuePos: attributeValueStartPos});
347
if (this._peekString(`'`)) {
348
let attributeValue = this._consumeSingleQuotedString();
349
this._pushAttribute({name: attributeName, value: attributeValue, quote: HTMLParser.AttrQuoteType.Single, namePos: attributeNameStartPos, valuePos: attributeValueStartPos});
353
if (this._peekString(">")) {
355
this._mode = HTMLParser.Mode.Data;
356
this._push({type: HTMLParser.NodeType.OpenTag, name: this._currentTagName, closed: false, attributes: this._currentTagAttributes, pos: this._currentTagStartPos});
360
let whitespace = this._consumeWhitespace();
362
this._pushAttribute({name: attributeName, value: undefined, quote: HTMLParser.AttrQuoteType.None, namePos: attributeNameStartPos});
366
// Unquoted value: runs up to the first whitespace, "=" or ">".
let attributeValue = this._consumeRegex(/[^\s=>]+/);
367
this._pushAttribute({name: attributeName, value: attributeValue, quote: HTMLParser.AttrQuoteType.None, namePos: attributeNameStartPos, valuePos: attributeValueStartPos});
371
if (!this._isEOF()) {
372
this._pushAttribute({name: attributeName, value: undefined, quote: HTMLParser.AttrQuoteType.None, namePos: attributeNameStartPos});
376
// End of document. Treat everything up to now as error text.
377
console.assert(this._isEOF());
378
this._push({type: HTMLParser.NodeType.ErrorText, data: this._data.substring(this._currentTagStartPos), pos: this._currentTagStartPos});
384
// NOTE(review): the header of this method and any return statement between
// the two pushes are not visible in this excerpt; presumably this is
// _parseComment(), the Comment-mode handler - confirm.
// Consumes the comment body up to "-->", passing Data as the new mode.
let text = this._consumeUntilString("-->", HTMLParser.Mode.Data);
385
// The input ran out and does not end with the terminator: emit the whole
// construct, from the tag start, as error text.
if (this._isEOF() && !this._data.endsWith("-->")) {
386
this._push({type: HTMLParser.NodeType.ErrorText, data: this._data.substring(this._currentTagStartPos), pos: this._currentTagStartPos});
390
// closePos is the offset at which the "-->" terminator begins.
let closePos = this._pos - "-->".length;
391
this._push({type: HTMLParser.NodeType.Comment, data: text, pos: this._currentTagStartPos, closePos});
396
// NOTE(review): the header of this method and any return statement between
// the two pushes are not visible in this excerpt; presumably this is
// _parseBogusComment(), the BogusComment-mode handler - confirm.
// Consumes up to the next ">", passing Data as the new mode.
let text = this._consumeUntilString(">", HTMLParser.Mode.Data);
397
// The input ran out and does not end with ">": emit the whole construct,
// from the tag start, as error text.
if (this._isEOF() && !this._data.endsWith(">")) {
398
this._push({type: HTMLParser.NodeType.ErrorText, data: this._data.substring(this._currentTagStartPos), pos: this._currentTagStartPos});
402
// closePos is the offset of the closing ">".
let closePos = this._pos - ">".length;
403
// The remembered opener (or "" when there is none) travels with the Comment
// node and is cleared afterwards.
this._push({type: HTMLParser.NodeType.Comment, data: text, opener: this._bogusCommentOpener || "", pos: this._currentTagStartPos, closePos});
404
this._bogusCommentOpener = null;
409
// NOTE(review): the header of this method and any return statement between
// the two pushes are not visible in this excerpt; presumably this is
// _parseDoctype(), the Doctype-mode handler - confirm.
// Consumes up to the next ">", passing Data as the new mode.
let text = this._consumeUntilString(">", HTMLParser.Mode.Data);
410
// The input ran out and does not end with ">": emit the whole construct,
// from the tag start, as error text.
if (this._isEOF() && !this._data.endsWith(">")) {
411
this._push({type: HTMLParser.NodeType.ErrorText, data: this._data.substring(this._currentTagStartPos), pos: this._currentTagStartPos});
415
// closePos is the offset of the closing ">".
let closePos = this._pos - ">".length;
416
// `raw` carries the stored _doctypeRaw text, which is cleared after the push.
this._push({type: HTMLParser.NodeType.Doctype, data: text, raw: this._doctypeRaw, pos: this._currentTagStartPos, closePos});
417
this._doctypeRaw = null;
422
// NOTE(review): the header of this method and any return statement between
// the two pushes are not visible in this excerpt; presumably this is
// _parseCData(), the CData-mode handler - confirm.
// Consumes the section body up to "]]>", passing Data as the new mode.
let text = this._consumeUntilString("]]>", HTMLParser.Mode.Data);
423
// The input ran out and does not end with the terminator: emit the whole
// construct, from the tag start, as error text.
if (this._isEOF() && !this._data.endsWith("]]>")) {
424
this._push({type: HTMLParser.NodeType.ErrorText, data: this._data.substring(this._currentTagStartPos), pos: this._currentTagStartPos});
428
// closePos is the offset at which the "]]>" terminator begins.
let closePos = this._pos - "]]>".length;
429
this._push({type: HTMLParser.NodeType.CData, data: text, pos: this._currentTagStartPos, closePos});
434
// NOTE(review): the header of this method is not visible in this excerpt;
// presumably it is _pushAttribute(attr), which the attribute handler calls - confirm.
// Appends `attr` to the attribute list of the tag being parsed, then calls
// _handleEOF with the start position of that tag.
this._currentTagAttributes.push(attr);
435
this._handleEOF(this._currentTagStartPos);
438
// End-of-input handling: pushes an ErrorText node that covers the input from
// `lastPosition` to the end and is positioned at `lastPosition`.
//
// lastPosition - offset in the input where the error text starts.
//
// NOTE(review): the condition guarding the push and the return statements are
// not visible in this excerpt. At least one caller uses the return value in an
// `if`, so the method presumably returns a boolean - confirm.
_handleEOF(lastPosition)
443
// End of document. Treat everything from the last position as error text.
444
this._push({type: HTMLParser.NodeType.ErrorText, data: this._data.substring(lastPosition), pos: lastPosition});
450
// NOTE(review): the header of this method is not visible in this excerpt;
// presumably it is _push(node), which every handler calls to emit a node - confirm.
// Forwards `node` to the tree builder. Before doing so, an OpenTag node named
// "script" (compared case-insensitively) switches the parser to ScriptData
// mode, unless the document is being parsed as XML.
// Custom mode for some elements.
451
if (node.type === HTMLParser.NodeType.OpenTag) {
452
if (!this._isXML && node.name.toLowerCase() === "script")
453
this._mode = HTMLParser.Mode.ScriptData;
456
this._treeBuilder.pushParserNode(node);
463
// NOTE(review): the header of this object literal and its other entries are
// not visible in this excerpt; presumably these are entries of HTMLParser.Mode,
// whose ScriptData and BogusComment members are assigned to this._mode - confirm.
ScriptData: "script-data",
468
BogusComment: "bogus-comment",
471
// String tags used as the `type` field of the nodes the parser pushes to the
// tree builder.
// NOTE(review): only some entries (and not the closing brace) are visible in
// this excerpt - confirm the full list against the complete file.
HTMLParser.NodeType = {
473
ErrorText: "error-text",
475
CloseTag: "close-tag",
481
HTMLParser.AttrQuoteType = {