2
* Copyright 2010 Google Inc.
4
* Licensed under the Apache License, Version 2.0 (the "License");
5
* you may not use this file except in compliance with the License.
6
* You may obtain a copy of the License at
8
* http://www.apache.org/licenses/LICENSE-2.0
10
* Unless required by applicable law or agreed to in writing, software
11
* distributed under the License is distributed on an "AS IS" BASIS,
12
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
* See the License for the specific language governing permissions and
14
* limitations under the License.
17
// Author: sligocki@google.com (Shawn Ligocki)
19
#ifndef NET_INSTAWEB_REWRITER_PUBLIC_COMMON_FILTER_H_
20
#define NET_INSTAWEB_REWRITER_PUBLIC_COMMON_FILTER_H_
22
#include "net/instaweb/htmlparse/public/empty_html_filter.h"
23
#include "net/instaweb/rewriter/public/resource.h"
24
#include "net/instaweb/rewriter/public/rewrite_driver.h"
25
#include "net/instaweb/util/public/basictypes.h"
26
#include "net/instaweb/util/public/string.h"
27
#include "net/instaweb/util/public/string_util.h"
29
namespace net_instaweb {
31
// Forward declarations: this header only uses these types through pointers.
class HtmlCharactersNode;
34
class ResponseHeaders;
38
// CommonFilter encapsulates useful functionality that many filters will want.
39
// All filters who want this functionality should inherit from CommonFilter and
40
// define the Helper methods rather than the main methods.
42
// Currently, it stores whether we are in a <noscript> element (in
43
// which case, we should be careful about moving things out of this
// element).
46
// The base-tag is maintained in the RewriteDriver, although it can be
47
// accessed via a convenience method here for historical reasons.
48
class CommonFilter : public EmptyHtmlFilter {
50
// Debug message to be inserted when resource creation fails.
51
static const char kCreateResourceFailedDebugMsg[];
53
// Constructs the filter for the given driver; the destructor is virtual
// because this class is meant to be inherited from.
explicit CommonFilter(RewriteDriver* driver);
54
virtual ~CommonFilter();
58
// URL of the requested HTML or resource.
59
const GoogleUrl& base_url() const;
61
// For rewritten resources, decoded_base_url() is the base of the original
62
// (un-rewritten) resource's URL.
63
const GoogleUrl& decoded_base_url() const;
65
// Accessor for the RewriteDriver pointer held by this filter.
RewriteDriver* driver() const { return driver_; }
66
// Accessor for noscript_element_.
// NOTE(review): presumably NULL when we are not inside a <noscript>
// element -- confirm against the implementation.
HtmlElement* noscript_element() const { return noscript_element_; }
68
// Insert a node at the best available location in or near the closing body
69
// tag during EndDocument. This is useful for filters that want to insert
70
// scripts or summary data at the end of body, but need to wait until
71
// EndDocument to do so.
73
// Tries to inject just before </body> if nothing else intervenes; otherwise
74
// tries to inject before </html> or, failing that, at the end of all content.
75
// This latter case still works in browsers, but breaks HTML validation (and
76
// is incredibly ugly). It can be necessitated by other post-</html> content,
77
// or by flushes in the body.
79
// Note that if a subclass overrides the Characters function, it needs to call
80
// the parent implementation for this function to be correct.
81
void InsertNodeAtBodyEnd(HtmlNode* data);
83
// Note: Don't override these methods, override the implementers (the *Impl
// methods) instead!
84
virtual void StartDocument();
85
virtual void StartElement(HtmlElement* element);
86
virtual void EndElement(HtmlElement* element);
88
// If a subclass overrides this function and wishes to use
89
// InsertNodeAtBodyEnd(), it needs to make an upcall to this implementation
90
// for InsertNodeAtBodyEnd() to work correctly.
91
virtual void Characters(HtmlCharactersNode* characters);
93
// Creates an input resource with the url evaluated based on input_url
94
// which may need to be absolutified relative to base_url(). Returns NULL
95
// if input resource url isn't valid, or can't legally be rewritten in the
96
// context of this page. *is_authorized will be set to false if the domain
97
// of input_url is not authorized, which could be true or false regardless of
98
// the return value: for example if we are allowing inlining of resources
99
// from unauthorized domains we will return non-NULL but *is_authorized will
100
// be false; converse cases are possible too (e.g. input_url is a data URI).
101
ResourcePtr CreateInputResource(StringPiece input_url, bool* is_authorized);
103
// Similar to CreateInputResource except that if the input_url is not
104
// authorized we insert a debug comment after the given element if possible
105
// (debug is enabled and the element is writable). The returned ResourcePtr
106
// is guaranteed to be non-NULL iff the input_url is authorized.
107
ResourcePtr CreateInputResourceOrInsertDebugComment(StringPiece input_url,
108
HtmlElement* element);
110
// Resolves input_url based on the driver's location and any base tag into
111
// out_url. If resolution fails, the resulting URL may be invalid.
112
void ResolveUrl(StringPiece input_url, GoogleUrl* out_url);
114
// Returns whether or not the base url is valid. This value will change
115
// as a filter processes the document. E.g. If there are url refs before
116
// the base tag is reached, it will return false until the filter sees the
117
// base tag. After the filter sees the base tag, it will return true.
118
bool BaseUrlIsValid() const;
120
// Returns whether the current options specify the "debug" filter.
121
// If set, then other filters can annotate output HTML with HTML
122
// comments indicating why they did or did not do an optimization,
123
// using HtmlParse::InsertComment.
// This simply forwards to the driver's DebugMode().
124
bool DebugMode() const { return driver_->DebugMode(); }
126
// Utility function to extract the mime type and/or charset from a meta tag,
127
// either the HTML4 http-equiv form or the HTML5 charset form:
128
// element is the meta tag element to process.
129
// headers is optional: if provided it is checked to see if it already has
130
// a content type with the tag's value; if so, returns false.
131
// content is set to the content attribute's value, http-equiv form only.
132
// mime_type is set to the extracted mime type, if any.
133
// charset is set to the extracted charset, if any.
134
// returns true if the details were extracted, false if not. If true is
135
// returned then content will be empty for the HTML5 charset form and
136
// non-empty for the HTML4 http-equiv form; also an http-equiv attribute
137
// with a blank mime type returns false as it's not a valid format.
138
static bool ExtractMetaTagDetails(const HtmlElement& element,
139
const ResponseHeaders* headers,
140
GoogleString* content,
141
GoogleString* mime_type,
142
GoogleString* charset);
144
// Returns true if the image element is not in a <noscript> block and it has
145
// a) no onload attribute or
146
// b) an onload attribute exists with the value being equal to the
147
// CriticalImagesBeaconFilter::kImageOnloadCode.
148
bool CanAddPagespeedOnloadToImage(const HtmlElement&);
150
// Add this filter to the logged list of applied rewriters. The intended
151
// semantics of this are that it should only include filters that modified the
152
// content of the response to the request being processed.
153
// This class logs using Name(); subclasses may do otherwise.
154
virtual void LogFilterModifiedContent();
156
// Returns the policy that says whether this filter allows domains not
157
// authorized by any pagespeed directive to be optimized. Filters that end up
158
// inlining content onto the HTML are almost the only ones that can safely do
// this. The default is RewriteDriver::kInlineOnlyAuthorizedResources.
159
virtual RewriteDriver::InlineAuthorizationPolicy AllowUnauthorizedDomain()
160
const { return RewriteDriver::kInlineOnlyAuthorizedResources; }
162
// Returns true if the filter intends to inline the resource it fetches. This
163
// is to support AllowWhenInlining. Unlike AllowUnauthorizedDomain() this
164
// doesn't have security implications and is just used for performance tuning.
// The default is false.
165
virtual bool IntendedForInlining() const { return false; }
168
// Accessors for the server context and rewrite options pointers held by this
// filter.
ServerContext* server_context() const { return server_context_; }
169
const RewriteOptions* rewrite_options() { return rewrite_options_; }
171
// Overload these implementer methods:
172
// Intentionally left abstract so that implementers don't forget to change
173
// the name from Blah to BlahImpl.
174
virtual void StartDocumentImpl() = 0;
175
virtual void StartElementImpl(HtmlElement* element) = 0;
176
virtual void EndElementImpl(HtmlElement* element) = 0;
178
// ID string used in logging. Inheritors should supply whatever short ID
// they use; the default implementation returns Name().
180
virtual const char* LoggingId() { return Name(); }
183
// These fields are gettable by inheritors.
184
RewriteDriver* driver_;
185
ServerContext* server_context_;
186
const RewriteOptions* rewrite_options_;
187
HtmlElement* noscript_element_;
188
// These are private.
// NOTE(review): end_body_point_ is presumably the insertion point used by
// InsertNodeAtBodyEnd() -- confirm against the implementation.
189
HtmlElement* end_body_point_;
192
DISALLOW_COPY_AND_ASSIGN(CommonFilter);
195
} // namespace net_instaweb
197
#endif // NET_INSTAWEB_REWRITER_PUBLIC_COMMON_FILTER_H_