/*
 * Copyright 2010 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
// Author: jmarantz@google.com (Joshua Marantz)
19
#ifndef PAGESPEED_KERNEL_HTML_HTML_LEXER_H_
20
#define PAGESPEED_KERNEL_HTML_HTML_LEXER_H_
24
#include "pagespeed/kernel/base/basictypes.h"
25
#include "pagespeed/kernel/base/printf_format.h"
26
#include "pagespeed/kernel/base/string.h"
27
#include "pagespeed/kernel/base/string_util.h"
28
#include "pagespeed/kernel/html/doctype.h"
29
#include "pagespeed/kernel/html/html_element.h"
30
#include "pagespeed/kernel/html/html_name.h"
31
#include "pagespeed/kernel/http/content_type.h"
33
namespace net_instaweb {
37
// Constructs a re-entrant HTML lexer. This lexer minimally parses tags,
38
// attributes, and comments. It is intended to parse the Wild West of the
39
// Web. It's designed to be tolerant of syntactic transgressions, merely
40
// passing through unparseable chunks as Characters.
42
// TODO(jmarantz): refactor this with html_parse, so that this class owns
43
// the symbol table and the event queue, and no longer needs to mutually
44
// depend on HtmlParse. That will make it easier to unit-test.
47
explicit HtmlLexer(HtmlParse* html_parse);
50
// Initialize a new parse session, id is only used for error messages.
51
void StartParse(const StringPiece& id, const ContentType& content_type);
53
// Parse a chunk of text, adding events to the parser by calling
54
// html_parse_->AddEvent(...).
55
void Parse(const char* text, int size);
57
// Completes parse, reporting any leftover text as a final HtmlCharacterEvent.
60
// Determines whether a tag should be terminated in HTML.
61
bool IsImplicitlyClosedTag(HtmlName::Keyword keyword) const;
63
// Determines whether a tag should be interpreted as a 'literal'
64
// tag. That is, a tag whose contents are not parsed until a
65
// corresponding matching end tag is encountered.
66
static bool IsLiteralTag(HtmlName::Keyword keyword);
68
// Determines whether a tag is interpreted as a 'literal' tag in
69
// some user agents. Since some user agents will interpret the
70
// contents of these tags, our lexer never treats them as literal tags.
72
static bool IsSometimesLiteralTag(HtmlName::Keyword keyword);
74
// Determines whether a tag can be terminated briefly (e.g. <tag/>)
75
bool TagAllowsBriefTermination(HtmlName::Keyword keyword) const;
77
// Determines whether it's OK to leave a tag unclosed.
78
bool IsOptionallyClosedTag(HtmlName::Keyword keyword) const;
80
// Print element stack to stdout (for debugging).
81
void DebugPrintStack();
83
// Returns the current lowest-level parent element in the element stack, or
84
// NULL if the stack is empty.
85
HtmlElement* Parent() const;
87
// Return the current assumed doctype of the document (based on the content
88
// type and any HTML directives encountered so far).
89
const DocType& doctype() const {
  // Hands back a reference to the stored member; no copy is made.
  return doctype_;
}
91
// Sets the limit on the maximum number of bytes that should be parsed.
92
void set_size_limit(int64 x) {
  // Plain store into size_limit_; the value is not validated here.
  size_limit_ = x;
}
94
// Indicates whether we have exceeded the limit on the maximum number of bytes
95
// that we should parse.
96
bool size_limit_exceeded() const {
  // Reports the current value of the size_limit_exceeded_ flag.
  return size_limit_exceeded_;
}
99
// Most of these routines expect c to be the last character of literal_
100
// Per-character handlers.  Each one takes a single character and returns
// nothing.
// NOTE(review): the handler names parallel the lexer-state enum values
// declared further down in this file (e.g. EvalCommentStart1 vs.
// COMMENT_START1, EvalAttrEq vs. TAG_ATTR_EQ, EvalBogusComment vs.
// BOGUS_COMMENT).  Presumably each handler is invoked while the lexer is in
// the like-named state -- confirm against the implementation file.
inline void EvalStart(char c);
101
inline void EvalTag(char c);
102
inline void EvalTagOpen(char c);
103
inline void EvalTagCloseNoName(char c);
104
inline void EvalTagClose(char c);
105
inline void EvalTagBriefClose(char c);
106
// Handlers named after the COMMENT_* states.
inline void EvalCommentStart1(char c);
107
inline void EvalCommentStart2(char c);
108
inline void EvalCommentBody(char c);
109
inline void EvalCommentEnd1(char c);
110
inline void EvalCommentEnd2(char c);
111
// Handlers named after the CDATA_* states.
inline void EvalCdataStart1(char c);
112
inline void EvalCdataStart2(char c);
113
inline void EvalCdataStart3(char c);
114
inline void EvalCdataStart4(char c);
115
inline void EvalCdataStart5(char c);
116
inline void EvalCdataStart6(char c);
117
inline void EvalCdataBody(char c);
118
inline void EvalCdataEnd1(char c);
119
inline void EvalCdataEnd2(char c);
120
// Handlers named after the TAG_ATTR* states.
inline void EvalAttribute(char c);
121
inline void EvalAttrName(char c);
122
inline void EvalAttrNameSpace(char c);
123
inline void EvalAttrEq(char c);
124
inline void EvalAttrVal(char c);
125
inline void EvalAttrValSq(char c);
126
inline void EvalAttrValDq(char c);
127
// Handlers named after the LITERAL_TAG, SCRIPT_TAG and BOGUS_COMMENT states.
inline void EvalLiteralTag(char c);
128
inline void EvalScriptTag(char c);
129
// NOTE(review): no matching state for EvalDirective is visible in this
// excerpt of the enum -- confirm which state dispatches to it.
inline void EvalDirective(char c);
130
inline void EvalBogusComment(char c);
132
// Makes an element based on token_, which will be parsed as the tag name.
136
void MakeAttribute(bool has_value);
137
void FinishAttribute(char c, bool has_value, bool brief_close);
142
void EmitTagOpen(bool allow_implicit_close); // expects element_ != NULL.
143
void EmitTagClose(HtmlElement::Style style);
144
void EmitTagBriefClose();
145
void EmitDirective();
146
void Restart(char c);
148
// Emits a syntax error message.
149
void SyntaxError(const char* format, ...) INSTAWEB_PRINTF_FORMAT(2, 3);
151
// Tries to find a HTML element on the stack matching a tag. If it
152
// finds it, it pops all the intervening elements off the stack,
153
// issuing warnings for each discarded tag, the matching element is
154
// also popped off the stack, and returned.
156
// If the tag is not matched, then no mutations are done to the stack,
157
// and NULL is returned.
159
// The tag name should be interned.
160
// TODO(jmarantz): use type system
161
HtmlElement* PopElementMatchingTag(const StringPiece& tag);
163
HtmlElement* PopElement();
164
void CloseElement(HtmlElement* element, HtmlElement::Style style);
166
// Minimal i18n analysis. With utf-8 and gb2312 we can do this
167
// context-free, and thus the method can be static. If we add
168
// more encodings we may need to turn this into a non-static method.
169
static inline bool IsI18nChar(char c) {
  // A byte counts as "i18n" when its high bit (0x80) is set, i.e. it lies
  // outside the 7-bit ASCII range.  The mask is applied after integral
  // promotion, so the result is the same whether char is signed or unsigned.
  const int high_bit = c & 0x80;
  return high_bit != 0;
}
171
// Determines whether a character can be used in a tag name as first char ...
172
static inline bool IsLegalTagFirstChar(char c);
173
// ... or subsequent char.
174
static inline bool IsLegalTagChar(char c);
176
// Determines whether a character can be used in an attribute name.
177
static inline bool IsLegalAttrNameChar(char c);
179
// The lexer is implemented as a pure state machine. There is
180
// no lookahead. The state is understood primarily in this
181
// enum, although there are a few state flavors that are managed
182
// by the other member variables, notably: has_attr_value_ and
183
// attr_name_.empty(). Those could be eliminated by adding
184
// a few more explicit states.
188
TAG_CLOSE_NO_NAME, // "</"
190
TAG_CLOSE_TERMINATE, // "</x "
192
TAG_BRIEF_CLOSE, // "<x/" or "<x /" or "<x y/" etc
193
COMMENT_START1, // "<!"
194
COMMENT_START2, // "<!-"
195
COMMENT_BODY, // "<!--"
197
COMMENT_END2, // "--"
198
CDATA_START1, // "<!["
199
CDATA_START2, // "<![C"
200
CDATA_START3, // "<![CD"
201
CDATA_START4, // "<![CDA"
202
CDATA_START5, // "<![CDAT"
203
CDATA_START6, // "<![CDATA"
204
CDATA_BODY, // "<![CDATA["
207
TAG_ATTRIBUTE, // "<x "
208
TAG_ATTR_NAME, // "<x y"
209
TAG_ATTR_NAME_SPACE, // "<x y "
210
TAG_ATTR_EQ, // "<x y="
211
TAG_ATTR_VAL, // "<x y=x" value terminated by whitespace or >
212
TAG_ATTR_VALDQ, // '<x y="' value terminated by double-quote
213
TAG_ATTR_VALSQ, // "<x y='" value terminated by single-quote
214
LITERAL_TAG, // "<style " or "<iframe ", etc.
215
SCRIPT_TAG, // "<script "
217
BOGUS_COMMENT, // "<?foo>" or "</?foo>"
220
HtmlParse* html_parse_;
222
GoogleString token_; // accumulates tag names and comments
223
GoogleString literal_; // accumulates raw text to pass through
224
GoogleString attr_name_; // accumulates attribute name
225
GoogleString attr_value_; // accumulates attribute value
226
HtmlElement::QuoteStyle attr_quote_; // quote used to delimit attribute
227
bool has_attr_value_; // distinguishes <a n=> from <a n>
228
HtmlElement* element_; // current element; used to collect attributes
230
int tag_start_line_; // line at which we last transitioned to TAG state
232
GoogleString literal_close_; // specific tag to close, e.g. </script>
233
bool script_html_comment_; // inside <script> <!--
234
bool script_html_comment_script_; // inside <script> <!-- <script>
235
// in some cases we have to drop what looks like attributes on a closing
236
// tag as part of error recovery.
237
bool discard_until_start_state_for_error_recovery_;
239
ContentType content_type_;
242
std::vector<HtmlElement*> element_stack_;
244
// Indicates that we have exceeded the enforced size limit on the maximum
245
// number of bytes of input HTML that we can parse.
246
bool size_limit_exceeded_;
247
// Whether we should skip parsing of all subsequent bytes. HtmlParse calls
248
// this once it has started or ended an HtmlElement.
250
int64 num_bytes_parsed_;
253
DISALLOW_COPY_AND_ASSIGN(HtmlLexer);
256
} // namespace net_instaweb
258
#endif // PAGESPEED_KERNEL_HTML_HTML_LEXER_H_