2
* Copyright 2010 Google Inc.
4
* Licensed under the Apache License, Version 2.0 (the "License");
5
* you may not use this file except in compliance with the License.
6
* You may obtain a copy of the License at
8
* http://www.apache.org/licenses/LICENSE-2.0
10
* Unless required by applicable law or agreed to in writing, software
11
* distributed under the License is distributed on an "AS IS" BASIS,
12
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
* See the License for the specific language governing permissions and
14
* limitations under the License.
17
// Author: jmarantz@google.com (Joshua Marantz)
19
#ifndef PAGESPEED_KERNEL_HTML_HTML_ELEMENT_H_
20
#define PAGESPEED_KERNEL_HTML_HTML_ELEMENT_H_
22
#include "pagespeed/kernel/base/basictypes.h"
23
#include "pagespeed/kernel/base/inline_slist.h"
24
#include "pagespeed/kernel/base/scoped_ptr.h"
25
#include "pagespeed/kernel/base/string.h"
26
#include "pagespeed/kernel/base/string_util.h"
27
#include "pagespeed/kernel/html/html_name.h"
28
#include "pagespeed/kernel/html/html_node.h"
30
namespace net_instaweb {
32
// Represents an HTML tag, including all its attributes. These are never
33
// constructed independently, but are managed by class HtmlParse. They
34
// are constructed when parsing an HTML document, and they can also be
35
// synthesized via methods in HtmlParse::NewElement.
37
// Note that HtmlElement* saved during filter execution are valid only until
38
// a Flush occurs. HtmlElement* can still be fully accessed during a Flush, but
39
// after that, to save memory, the contents of the HtmlElement* are cleared.
40
// After that, the only operation that is still legal is to call
41
// HtmlParse::IsRewritable(), which will return false.
42
class HtmlElement : public HtmlNode {
44
// Tags can be closed in three ways: implicitly (e.g. <img ..>),
// briefly (e.g. <br/>), or explicitly (<a...>...</a>). The
// Lexer will always record the way it parsed a tag, but synthesized
// elements will have AUTO_CLOSE, and rewritten elements may
// no longer qualify for the closing style with which they were
// parsed.
enum Style {
  AUTO_CLOSE,      // synthesized tag, or not yet closed in source
  IMPLICIT_CLOSE,  // E.g. <img...> <meta...> <link...> <br...> <input...>
  EXPLICIT_CLOSE,  // E.g. <a href=...>anchor</a>
  BRIEF_CLOSE,     // E.g. <head/>
  UNCLOSED,        // Was never closed in source, so don't serialize close-tag
  INVISIBLE,       // Programmatically hidden element
};
59
// Various ways things can be quoted (or not)
66
class Attribute : public InlineSListElement<Attribute> {
68
// A large quantity of HTML in the wild has attributes that are
69
// improperly escaped. Browsers are generally tolerant of this.
70
// But we want to avoid corrupting pages we do not understand.
72
// The result of DecodedValueOrNull() and escaped_value() is still
73
// owned by this, and will be invalidated by a subsequent call to
74
// SetValue() or SetEscapedValue().
76
// Returns the attribute name, which is not guaranteed to be case-folded.
77
// Compare keyword() to the Keyword constant found in html_name.h for
78
// fast attribute comparisons.
79
StringPiece name_str() const { return name_.value(); }
81
// Returns the HTML keyword enum. If this attribute name is not
82
// recognized, returns HtmlName::kNotAKeyword, and you can examine
84
HtmlName::Keyword keyword() const { return name_.keyword(); }
86
HtmlName name() const { return name_; }
87
void set_name(const HtmlName& name) { name_ = name; }
89
// Returns the value in its original directly from the HTML source.
90
// This may have HTML escapes in it, such as "&".
91
const char* escaped_value() const { return escaped_value_.get(); }
93
// The result of DecodedValueOrNull() is still owned by this, and
94
// will be invalidated by a subsequent call to SetValue().
96
// The result will be a NUL-terminated string containing the value of the
97
// attribute, or NULL if the attribute has no value at all (this is
98
// distinct from having the empty string for a value), or there is
99
// a decoding error. E.g.
100
// <tag a="val"> --> "val"
101
// <tag a="&amp;"> --> "&"
104
// <tag a="muñecos"> --> NULL (decoding_error()==true)
106
// Returns the unescaped value, suitable for directly operating on
107
// in filters as URLs or other data. Note that decoding_error() is
108
// true if the parsed value from HTML could not be decoded. This
110
// - the charset is not known
111
// - the charset is not supported. Currently none are supported and
112
// only values that fall in 7-bit ascii can be interpreted.
113
// - the charset is known & supported but the value does not appear to be
116
// The decoded value uses 8-bit characters to represent any unicode
117
// code-point less than 256.
118
const char* DecodedValueOrNull() const {
119
if (!decoded_value_computed_) {
120
ComputeDecodedValue();
122
return decoded_value_.get();
125
// Directly stores x into the decoding-error flag.
void set_decoding_error(bool x) { decoding_error_ = x; }
126
bool decoding_error() const {
127
if (!decoded_value_computed_) {
128
ComputeDecodedValue();
130
return decoding_error_;
133
// See comment about quote on constructor for Attribute.
134
// Returns the quotation mark associated with this URL.
135
QuoteStyle quote_style() const { return quote_style_; }
137
// Textual form of quote for printing.
138
const char* quote_str() const;
140
// Two related methods to modify the value of attribute (eg to rewrite
141
// dest of src or href). As with the constructor, copies the string in,
142
// so caller retains ownership of value.
144
// A StringPiece pointing to an empty string (that is, a char array {'\0'})
145
// indicates that the attribute value is the empty string (e.g. <foo
146
// bar="">); however, a StringPiece with a data() pointer of NULL indicates
147
// that the attribute has no value at all (e.g. <foo bar>). This is an
148
// important distinction.
150
// Note that passing a value containing NULs in the middle will cause
151
// breakage, but this isn't currently checked for.
152
// TODO(mdsteele): Perhaps we should check for this?
154
// Sets the value of the attribute. No HTML escaping is expected.
155
// This call causes the HTML-escaped value to be automatically computed
156
// by scanning the value and escaping any characters required in HTML
158
void SetValue(const StringPiece& value);
160
// Sets the escaped value. This is intended to be called from the HTML
161
// Lexer, and results in the Value being computed automatically by
162
// scanning the value for escape sequences.
163
void SetEscapedValue(const StringPiece& value);
165
// Replaces the quoting style recorded for this attribute's value.
void set_quote_style(QuoteStyle new_quote_style) {
  quote_style_ = new_quote_style;
}
169
friend class HtmlElement;
172
// Invoked lazily from the const accessors DecodedValueOrNull() and
// decoding_error() when decoded_value_computed_ is false. It is const so
// those accessors can call it; the decode-state members are mutable.
void ComputeDecodedValue() const;
174
// This should only be called from AddAttribute
175
Attribute(const HtmlName& name, const StringPiece& escaped_value,
176
QuoteStyle quote_style);
178
static inline void CopyValue(const StringPiece& src,
179
scoped_array<char>* dst);
182
QuoteStyle quote_style_ : 8;
183
mutable bool decoding_error_;
184
mutable bool decoded_value_computed_;
186
// Attribute value represented as ascii and
187
// HTML-escape-sequences, typically parsed directly from an HTML
188
// file. This is the canonical representation, and it can handle
189
// any arbitrary multi-byte characters.
191
// Note that it is acceptable to have 8-bit characters in escape
192
// sequences (typically iso8859). However we will not be able to
193
// decode such attributes.
194
scoped_array<char> escaped_value_;
196
// An 8-bit representation of the escaped_value. Escape sequences
197
// that contain character-codes >= 256 are not decoded, and will
198
// result in decoding_error_==true. Also note that a literal 8-bit
199
// code in escaped_value_ cannot be decoded either.
201
// We can get fewer decoding errors if we are careful to track the
202
// character-encoding for the document, and implement some of the
203
// popular ones, e.g. utf8, gb2312 and iso8859. Note that failing
204
// to decode an attribute value does not impact our ability to
205
// parse and reserialize the document. It just prevents us from
206
// looking at the decoded value, which is a requirement primarily
207
// for tags referencing URLs, e.g. <img src=...>.
209
// Note that we do not decode non-ASCII characters but we can
210
// represent them in escaped_value_. We can get 8-bit characters
211
// into decoded_value_ via  etc.
212
mutable scoped_array<char> decoded_value_;
214
DISALLOW_COPY_AND_ASSIGN(Attribute);
217
// Convenience names for the attribute list (an InlineSList of Attribute)
// and its mutable and const iterators.
typedef InlineSList<Attribute> AttributeList;
typedef InlineSList<Attribute>::Iterator AttributeIterator;
typedef InlineSList<Attribute>::ConstIterator AttributeConstIterator;
221
// Virtual destructor; only declared here.
virtual ~HtmlElement();
223
// Determines whether this node is still accessible via API. Note that
224
// when a FLUSH occurs after an open-element, the element will be live()
225
// but will not be rewritable. Specifically, node->live() can be true when
226
// html_parse->IsRewritable(node) is false. Once a node is closed, a FLUSH
227
// will cause the node's data to be freed, which triggers this method
229
virtual bool live() const { return (data_.get() != NULL) && data_->live_; }
231
// Marks this element as no longer live.
// NOTE(review): presumably clears the live flag and resets the begin/end
// event iterators to `end` -- confirm in the implementation.
virtual void MarkAsDead(const HtmlEventListIterator& end);
233
// Add a copy of an attribute to this element. The attribute may come
234
// from this element, or another one.
235
void AddAttribute(const Attribute& attr);
237
// Unconditionally add attribute, copying value.
238
// For binary attributes (those without values) use value=NULL.
239
// TODO(sligocki): StringPiece(NULL) seems fragile because what it is or
240
// how it's treated is not docutmented.
242
// Doesn't check for attribute duplication (which is illegal in html).
244
// The value, if non-null, is assumed to be unescaped. See also
245
// AddEscapedAttribute.
246
void AddAttribute(const HtmlName& name,
247
const StringPiece& decoded_value,
248
QuoteStyle quote_style);
249
// As AddAttribute, but assumes value has been escaped for html output.
250
void AddEscapedAttribute(const HtmlName& name,
251
const StringPiece& escaped_value,
252
QuoteStyle quote_style);
254
// Remove the attribute with the given name. Return true if the attribute
255
// was deleted, false if it wasn't there to begin with.
256
bool DeleteAttribute(HtmlName::Keyword keyword);
257
bool DeleteAttribute(StringPiece name);
259
// Look up attribute by name. NULL if no attribute exists.
260
// Use this for attributes whose value you might want to change
262
const Attribute* FindAttribute(HtmlName::Keyword keyword) const;
263
Attribute* FindAttribute(HtmlName::Keyword keyword) {
264
const HtmlElement* const_this = this;
265
const Attribute* result = const_this->FindAttribute(keyword);
266
return const_cast<Attribute*>(result);
269
// Look up attribute by its name string.
const Attribute* FindAttribute(StringPiece name) const;
// Non-const overload: forwards to the const lookup and strips the const
// qualifier from the result, so the search logic is written only once.
Attribute* FindAttribute(StringPiece name) {
  const HtmlElement* const_this = this;
  const Attribute* result = const_this->FindAttribute(name);
  return const_cast<Attribute*>(result);
}
276
// Look up decoded attribute value by name.
278
// 1. no attribute exists
279
// 2. the attribute has no value.
280
// 3. the attribute has a value, but it cannot currently be safely decoded.
281
// If you care about this distinction, call FindAttribute.
282
// Use this only if you don't intend to change the attribute value;
283
// if you might change the attribute value, use FindAttribute instead
284
// (this avoids a double lookup).
285
const char* AttributeValue(HtmlName::Keyword name) const {
286
const Attribute* attribute = FindAttribute(name);
287
if (attribute != NULL) {
288
return attribute->DecodedValueOrNull();
293
// Look up escaped attribute value by name.
295
// 1. no attribute exists
296
// 2. the attribute has no value.
297
// If you care about this distinction, call FindAttribute.
298
// Use this only if you don't intend to change the attribute value;
299
// if you might change the attribute value, use FindAttribute instead
300
// (this avoids a double lookup).
301
const char* EscapedAttributeValue(HtmlName::Keyword name) const {
302
const Attribute* attribute = FindAttribute(name);
303
if (attribute != NULL) {
304
return attribute->escaped_value();
309
// Returns the element tag name, which is not guaranteed to be
310
// case-folded. Compare keyword() to the Keyword constant found in
311
// html_name.h for fast tag name comparisons.
312
StringPiece name_str() const { return data_->name_.value(); }
314
// Returns the HTML keyword enum. If this tag name is not
315
// recognized, returns HtmlName::kNotAKeyword, and you can
316
// examine name_str().
317
HtmlName::Keyword keyword() const { return data_->name_.keyword(); }
319
// Returns a reference to the element's HtmlName held in data_ (the object
// that name_str() and keyword() read from).
const HtmlName& name() const { return data_->name_; }
321
// Changing that tag of an element should only occur if the caller knows
322
// that the old attributes make sense for the new tag. E.g. a div could
323
// be changed to a span.
324
void set_name(const HtmlName& new_tag) { data_->name_ = new_tag; }
326
// Read-only and mutable access to the element's attribute list, which
// lives in data_.
const AttributeList& attributes() const { return data_->attributes_; }
AttributeList* mutable_attributes() { return &data_->attributes_; }
329
friend class HtmlParse;
330
friend class HtmlLexer;
332
Style style() const { return data_->style_; }
333
void set_style(Style style) { data_->style_ = style; }
335
// Render an element as a string for debugging. This is not
336
// intended as a fully legal serialization.
337
virtual GoogleString ToString() const;
338
void DebugPrint() const;
340
int begin_line_number() const { return data_->begin_line_number_; }
341
int end_line_number() const { return data_->end_line_number_; }
344
virtual void SynthesizeEvents(const HtmlEventListIterator& iter,
345
HtmlEventList* queue);
347
// Event-list iterators stored in data_ for this element's begin and end
// events.
virtual HtmlEventListIterator begin() const { return data_->begin_; }
virtual HtmlEventListIterator end() const { return data_->end_; }
351
// All of the data associated with an HtmlElement is indirected through this
352
// class, so we can delete it on Flush after a CloseElement event.
354
Data(const HtmlName& name,
355
const HtmlEventListIterator& begin,
356
const HtmlEventListIterator& end);
359
// Max value for the line numbers below. Since they are 24 bits wide,
// comparing against -1 does not work properly.
static const unsigned kMaxLineNumber = 0x00ffffff;
363
// Pack four fields into 64 bits using bitfields. Warning: this
364
// stuff is quite sensitive to details, so make sure to look at
365
// object sizes before changing! Interleaving the 24-bit and
366
// 8-bit member variables gives a total size of 8 bytes for these
367
// 4 variables on a gcc 64-bit compile. But putting the two
368
// 24-bit integers together gives a total size of 16 bytes, so
371
// HtmlParse::DeleteNode will set live_ to false without
372
// deleting element->data_. Flushing an ElementClose deletes
373
// data_ but HtmlElement knows that null data_ implies !live().
374
unsigned begin_line_number_ : 24;
376
unsigned end_line_number_ : 24;
380
AttributeList attributes_;
381
HtmlEventListIterator begin_;
382
HtmlEventListIterator end_;
385
// Begin/end event iterators are used by HtmlParse to keep track
386
// of the span of events underneath an element. This is primarily to
387
// help delete the element. Events are not public.
388
void set_begin(const HtmlEventListIterator& begin) { data_->begin_ = begin; }
389
void set_end(const HtmlEventListIterator& end) { data_->end_ = end; }
391
void set_begin_line_number(int line) { data_->begin_line_number_ = line; }
392
void set_end_line_number(int line) { data_->end_line_number_ = line; }
394
// construct via HtmlParse::NewElement
395
HtmlElement(HtmlElement* parent, const HtmlName& name,
396
const HtmlEventListIterator& begin,
397
const HtmlEventListIterator& end);
399
// HtmlElement data is held in HtmlElement::Data*, which is freed
400
// when a CloseElement is Flushed. The pointers themselves are
401
// retained and can correctly answer element->IsRewritable() and
402
// element->is_live(), but the rest of the data (attributes etc)
404
void FreeData() { data_.reset(NULL); }
406
// Owns the element's Data. FreeData() resets it to NULL, after which live()
// returns false and the data_-> accessors must not be called.
scoped_ptr<Data> data_;
408
DISALLOW_COPY_AND_ASSIGN(HtmlElement);
411
} // namespace net_instaweb
413
#endif // PAGESPEED_KERNEL_HTML_HTML_ELEMENT_H_