2
* Copyright 2010 Google Inc.
4
* Licensed under the Apache License, Version 2.0 (the "License");
5
* you may not use this file except in compliance with the License.
6
* You may obtain a copy of the License at
8
* http://www.apache.org/licenses/LICENSE-2.0
10
* Unless required by applicable law or agreed to in writing, software
11
* distributed under the License is distributed on an "AS IS" BASIS,
12
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
* See the License for the specific language governing permissions and
14
* limitations under the License.
17
// Author: jmarantz@google.com (Joshua Marantz)
19
#ifndef NET_INSTAWEB_REWRITER_PUBLIC_REWRITE_DRIVER_H_
20
#define NET_INSTAWEB_REWRITER_PUBLIC_REWRITE_DRIVER_H_
26
#include "base/logging.h"
27
#include "net/instaweb/htmlparse/public/html_element.h"
28
#include "net/instaweb/htmlparse/public/html_parse.h"
29
#include "net/instaweb/http/public/cache_url_async_fetcher.h"
30
#include "net/instaweb/http/public/http_cache.h"
31
#include "net/instaweb/http/public/request_context.h"
32
#include "net/instaweb/http/public/user_agent_matcher.h"
33
#include "net/instaweb/rewriter/public/critical_images_finder.h"
34
#include "net/instaweb/rewriter/public/critical_selector_finder.h"
35
#include "net/instaweb/rewriter/public/downstream_cache_purger.h"
36
#include "net/instaweb/rewriter/public/output_resource_kind.h"
37
#include "net/instaweb/rewriter/public/resource.h"
38
#include "net/instaweb/rewriter/public/resource_slot.h"
39
#include "net/instaweb/rewriter/public/rewrite_context.h"
40
#include "net/instaweb/rewriter/public/rewrite_options.h"
41
#include "net/instaweb/rewriter/public/scan_filter.h"
42
#include "net/instaweb/rewriter/public/server_context.h"
43
#include "net/instaweb/util/public/basictypes.h"
44
#include "net/instaweb/util/public/google_url.h"
45
#include "net/instaweb/util/public/printf_format.h"
46
#include "net/instaweb/util/public/proto_util.h"
47
#include "net/instaweb/util/public/queued_worker_pool.h"
48
#include "net/instaweb/util/public/scheduler.h"
49
#include "net/instaweb/util/public/scoped_ptr.h"
50
#include "net/instaweb/util/public/string.h"
51
#include "net/instaweb/util/public/string_util.h"
52
#include "net/instaweb/util/public/thread_system.h"
53
#include "net/instaweb/util/public/url_segment_encoder.h"
54
#include "pagespeed/kernel/base/abstract_mutex.h"
55
#include "pagespeed/kernel/base/thread_annotations.h"
56
#include "pagespeed/kernel/http/content_type.h"
57
#include "pagespeed/kernel/http/response_headers.h"
58
#include "pagespeed/kernel/util/categorized_refcount.h"
60
namespace net_instaweb {
62
class AbstractLogRecord;
64
class AbstractPropertyPage;
66
class CriticalCssResult;
68
class CriticalLineInfo;
71
class DomainRewriteFilter;
72
class FallbackPropertyPage;
75
class FlushEarlyRenderInfo;
78
class HtmlWriterFilter;
83
class RequestProperties;
85
class ResourceContext;
87
class RewriteDriverPool;
89
class SplitHtmlConfig;
91
class UrlAsyncFetcher;
92
class UrlLeftTrimFilter;
96
// This extends class HtmlParse (which should be renamed HtmlContext) by providing
97
// context for rewriting resources (css, js, images).
98
class RewriteDriver : public HtmlParse {
100
// Status return-code for ResolveCssUrls.
101
enum CssResolutionStatus {
107
// Mode for BoundedWaitFor
109
kNoWait, // Used internally. Do not pass in.
110
kWaitForCompletion, // wait for everything to complete (up to deadline)
111
kWaitForCachedRender, // wait for at least cached rewrites to complete,
112
// and anything else that finishes within deadline.
113
kWaitForShutDown // Makes sure that all work, including any that's
114
// being done in background, finishes.
117
// Indicates document's mimetype as XHTML, HTML, or is not
118
// known/something else. Note that in Apache we might not know the
119
// correct mimetype because a downstream module might change it.
120
// It's not clear how likely this is, since mod_rewrite and mod_mime
121
// run upstream of mod_pagespeed. However if anyone sets mimetype
122
// via "Header Add", it would affect the Browser's view of the
123
// document's mimetype (which is what determines the parsing) but
124
// mod_pagespeed would not know.
126
// Note that we also have doctype().IsXhtml() but that indicates quirks-mode
127
// for CSS, and does not control how the parser parses the document.
134
// See CreateInputResource.
135
enum InlineAuthorizationPolicy {
136
kInlineUnauthorizedResources,
137
kInlineOnlyAuthorizedResources
140
// See CreateInputResource.
142
kIntendedForInlining,
146
// This string identifies, for the PropertyCache, a group of properties
147
// that are computed from the DOM, and thus can, if desired, be rewritten
148
// on every HTML request.
149
static const char kDomCohort[];
150
// The cohort for properties that are written by the beacon handler.
151
static const char kBeaconCohort[];
153
// Property Names in DomCohort.
154
// Tracks the timestamp when we last received a request for this url.
155
static const char kLastRequestTimestamp[];
156
// Tracks if we exceeded the maximum size limit of html which we should parse.
157
static const char kParseSizeLimitExceeded[];
158
// Flush Subresources Info associated with the HTML page.
159
static const char kSubresourcesPropertyName[];
160
// Status codes of previous responses.
161
static const char kStatusCodePropertyName[];
163
RewriteDriver(MessageHandler* message_handler,
164
FileSystem* file_system,
165
UrlAsyncFetcher* url_async_fetcher);
167
// Need explicit destructors to allow destruction of scoped_ptr-controlled
168
// instances without propagating the include files.
169
virtual ~RewriteDriver();
171
// Returns a fresh instance using the same options we do, using the same log
172
// record. Drivers should only be cloned within the same request.
173
RewriteDriver* Clone();
175
// Clears the current request cache of resources and base URL. The
176
// filter-chain is left intact so that a new request can be issued.
177
// Deletes all RewriteContexts.
179
// WaitForCompletion must be called prior to Clear().
182
// Initialize statistics for all filters that need it.
183
static void InitStats(Statistics* statistics);
185
// Initialize statics. Initialize/Terminate calls must be paired.
186
static void Initialize();
187
static void Terminate();
189
// Formats a "deadline exceeded" message for a given filter.
190
static GoogleString DeadlineExceededMessage(StringPiece filter_name);
192
// Sets a server context enabling the rewriting of
193
// resources. This will replace any previous server context.
194
void SetServerContext(ServerContext* server_context);
196
// Returns true if we may cache extend Css, Images, PDFs, or Scripts
198
bool MayCacheExtendCss() const;
199
bool MayCacheExtendImages() const;
200
bool MayCacheExtendPdfs() const;
201
bool MayCacheExtendScripts() const;
203
// Accessor for the user-agent string currently stored on this driver.
// Hands back a reference to the user_agent_ member itself, not a copy.
const GoogleString& user_agent() const {
  return user_agent_;
}
205
void SetUserAgent(const StringPiece& user_agent_string);
207
const RequestProperties* request_properties() const {
208
return request_properties_.get();
211
// Reinitializes request_properties_, clearing any cached values.
212
void ClearRequestProperties();
214
// Returns true if the request we're rewriting was made using SPDY.
215
bool using_spdy() const {
  // Simply forwards to the request context. request_context_ is dereferenced
  // unconditionally here, so it must already be set when this is called.
  const bool spdy = request_context_->using_spdy();
  return spdy;
}
217
bool write_property_cache_dom_cohort() const {
218
return write_property_cache_dom_cohort_;
220
void set_write_property_cache_dom_cohort(bool x) {
221
write_property_cache_dom_cohort_ = x;
224
// Returns, by value, the request context pointer held in request_context_.
RequestContextPtr request_context() {
  return request_context_;
}
225
void set_request_context(const RequestContextPtr& x);
227
// Convenience method to return the trace context from the request_context()
228
// if both are configured and NULL otherwise.
229
RequestTrace* trace_context();
231
// Convenience method to issue a trace annotation if tracing is enabled.
232
// If tracing is disabled, this function is a no-op.
233
void TracePrintf(const char* fmt, ...);
235
// Return a mutable pointer to the response headers that filters can update
236
// before the first flush. Returns NULL after Flush has occurred.
237
ResponseHeaders* mutable_response_headers() {
238
return flush_occurred_ ? NULL : response_headers_;
241
// Returns a const version of the ResponseHeaders*, independent of whether
242
// Flush has occurred. Note that ResponseHeaders* may still be NULL if
243
// no one has called set_response_headers_ptr.
245
// TODO(jmarantz): Change API to require response_headers in StartParse so
246
// we can guarantee this is non-null.
247
const ResponseHeaders* response_headers() {
248
return response_headers_;
251
// Set the pointer to the response headers that filters can update
252
// before the first flush. RewriteDriver does NOT take ownership
254
void set_response_headers_ptr(ResponseHeaders* headers) {
255
response_headers_ = headers;
258
// Reinitializes request_headers_ (a scoped ptr) with a copy of the original
259
// request headers. Note that the fetches associated with the driver could
260
// be using a modified version of the original request headers.
261
// There MUST be at most 1 call to this method after a rewrite driver object
262
// has been constructed or recycled.
263
void SetRequestHeaders(const RequestHeaders& headers);
265
const RequestHeaders* request_headers() const {
266
return request_headers_.get();
269
UserAgentMatcher* user_agent_matcher() const {
270
DCHECK(server_context() != NULL);
271
return server_context()->user_agent_matcher();
274
// Adds the filters from the options, specified by name in enabled_filters.
275
// This must be called explicitly after object construction to provide an
276
// opportunity to programmatically add custom filters beyond those defined
277
// in RewriteOptions, via AddFilter(HtmlFilter* filter) (below).
280
// Adds a filter to the very beginning of the pre-render chain, taking
281
// ownership. This should only be used for filters that must run before any
282
// filter added via PrependOwnedPreRenderFilter.
283
void AddOwnedEarlyPreRenderFilter(HtmlFilter* filter);
285
// Adds a filter to the beginning of the pre-render chain, taking ownership.
286
void PrependOwnedPreRenderFilter(HtmlFilter* filter);
287
// Adds a filter to the end of the pre-render chain, taking ownership.
288
void AppendOwnedPreRenderFilter(HtmlFilter* filter);
290
// Adds a filter to the end of the post-render chain, taking ownership.
291
void AddOwnedPostRenderFilter(HtmlFilter* filter);
292
// Same, without taking ownership.
293
void AddUnownedPostRenderFilter(HtmlFilter* filter);
295
// Add a RewriteFilter to the end of the pre-render chain and take ownership
296
// of the filter. This differs from AppendOwnedPreRenderFilter in that
297
// it adds the filter's ID into a dispatch table for serving
298
// rewritten resources. E.g. if your filter->id == "xy" and
299
// FetchResource("NAME.pagespeed.xy.HASH.EXT"...) is called, then
300
// RewriteDriver will dispatch to filter->Fetch().
302
// This is used when the filter being added is not part of the
303
// core set built into RewriteDriver and RewriteOptions, such
304
// as platform-specific or server-specific filters, or filters
305
// invented for unit-testing the framework.
306
void AppendRewriteFilter(RewriteFilter* filter);
308
// Like AppendRewriteFilter, but adds the filter to the beginning of the
310
void PrependRewriteFilter(RewriteFilter* filter);
312
// Tells RewriteDriver that a certain portion of URL namespace should not
313
// be handled via usual (HTTP proxy semantics) means. It's up to
314
// the filters to actually arrange for that to do something.
315
// Takes ownership of the claimant object. Note that it's important for the
316
// claims to be disjoint, since the RewriteContext framework needs to
317
// be able to assign compatible Resource objects for same URLs/slots among
318
// all filters that deal with them.
319
void AddResourceUrlClaimant(ResourceUrlClaimant* claimant);
321
// Controls how HTML output is written. Be sure to call this last, after
322
// all other filters have been established.
324
// TODO(jmarantz): fix this in the implementation so that the caller can
325
// install filters in any order and the writer will always be last.
326
void SetWriter(Writer* writer);
328
// Returns the Writer pointer currently held in writer_.
// NOTE(review): presumably the one installed via SetWriter() -- confirm.
Writer* writer() const {
  return writer_;
}
330
// Initiates an async fetch for a rewritten resource with the specified name.
331
// If url matches the pattern of what the driver is authorized to serve,
332
// then true is returned and the caller must listen on the callback for
333
// the completion of the request.
335
// If the driver is not authorized to serve the resource for any of the
336
// following reasons, false is returned and the callback will -not- be
337
// called - the request should be passed to another handler.
338
// * The URL is invalid or it does not match the general pagespeed pattern.
339
// * The filter id in the URL does not map to a known filter.
340
// * The filter for the id in the URL doesn't recognize the format of the URL.
341
// * The filter for the id in the URL is forbidden.
343
// In other words there are three outcomes for this routine:
344
// 1. the request was handled immediately and the callback called
345
// before the method returns. true is returned.
346
// 2. the request looks good but was queued because some other resource
347
// fetch is needed to satisfy it. true is returned.
348
// 3. the request does not look like it belongs to Instaweb. The callback
349
// will not be called, and false will be returned.
351
// In even other words, if this routine returns 'false' then the callback
352
// will not be called. If the callback -is- called, then this should be the
353
// 'final word' on this request, whether it was called with success=true or
356
// Note that if the request headers have not yet been set on the driver then
357
// they'll be taken from the fetch.
358
bool FetchResource(const StringPiece& url, AsyncFetch* fetch);
360
// Initiates an In-Place Resource Optimization (IPRO) fetch (A resource which
361
// is served under the original URL, but is still able to be rewritten).
363
// proxy_mode indicates whether we are running as a proxy where users
364
// depend on us to send contents. When set true, we will perform HTTP fetches
365
// to get contents if not in cache and will ignore kRecentFetchNotCacheable
366
// and kRecentFetchFailed since we'll have to fetch the resource for users
367
// anyway. Origin implementations (like mod_pagespeed) should set this to
368
// false and let the server serve the resource if it's not in cache.
370
// If proxy_mode is false and the resource could not be found in HTTP cache,
371
// async_fetch->Done(false) will be called and async_fetch->status_code()
372
// will be CacheUrlAsyncFetcher::kNotInCacheStatus (to distinguish this
373
// from a different reason for failure, like kRecentFetchNotCacheable).
375
// Note that if the request headers have not yet been set on the driver then
376
// they'll be taken from the fetch.
377
void FetchInPlaceResource(const GoogleUrl& gurl, bool proxy_mode,
378
AsyncFetch* async_fetch);
380
// See FetchResource. There are three differences:
381
// 1. It takes an OutputResource instead of a URL.
382
// 2. It returns whether a fetch was queued or not. This is safe
383
// to ignore because in either case the callback will be called.
384
// 3. If 'filter' is NULL then the request only checks cache and
385
// (if enabled) the file system.
386
bool FetchOutputResource(const OutputResourcePtr& output_resource,
387
RewriteFilter* filter,
388
AsyncFetch* async_fetch);
390
// Attempts to decode an output resource based on the URL pattern
391
// without actually rewriting it. No permission checks are performed on the
392
// url, though it is parsed to see if it looks like the url of a generated
393
// resource (which should mean checking the hash to ensure we generated it
395
// TODO(jmaessen): add url hash & check thereof.
396
OutputResourcePtr DecodeOutputResource(const GoogleUrl& url,
397
RewriteFilter** filter) const;
399
// As above, but does not actually create a resource object,
400
// and instead outputs the decoded information into the various out
401
// parameters. Returns whether decoding successful or not.
402
// Uses options_to_use rather than this->options() to determine which
403
// drivers are forbidden from applying, etc.
404
bool DecodeOutputResourceName(const GoogleUrl& url,
405
const RewriteOptions* options_to_use,
406
const UrlNamer* url_namer,
407
ResourceNamer* name_out,
408
OutputResourceKind* kind_out,
409
RewriteFilter** filter_out) const;
411
// Attempts to lookup the metadata cache info that would be used for the
412
// output resource at url with the RewriteOptions set on this driver.
414
// If there is a problem with the URL, returns false, and *error_out
415
// will contain an error message.
417
// If it can determine the metadata cache key successfully, returns true,
418
// and eventually callback will be invoked with the metadata cache key
419
// and the decoding results.
421
// After calling this method, the driver should not be used for anything else.
422
bool LookupMetadataForOutputResource(
424
GoogleString* error_out,
425
RewriteContext::CacheLookupResultCallback* callback);
427
// Decodes the incoming pagespeed url to original url(s).
428
bool DecodeUrl(const GoogleUrl& url,
429
StringVector* decoded_urls) const;
431
// As above, but lets one specify the options and URL namer to use.
432
// Meant for use with the decoding_driver.
433
bool DecodeUrlGivenOptions(const GoogleUrl& url,
434
const RewriteOptions* options,
435
const UrlNamer* url_namer,
436
StringVector* decoded_urls) const;
438
// Returns the FileSystem pointer held in file_system_.
FileSystem* file_system() {
  return file_system_;
}
439
// Returns the UrlAsyncFetcher pointer held in url_async_fetcher_.
UrlAsyncFetcher* async_fetcher() {
  return url_async_fetcher_;
}
441
// Set a fetcher that will be used by RewriteDriver for current request
442
// only (that is, until Clear()). RewriteDriver will take ownership of this
443
// fetcher, and will keep it around until Clear(), even if further calls
444
// to this method are made.
445
void SetSessionFetcher(UrlAsyncFetcher* f);
447
// Returns the fetcher pointer held in distributed_async_fetcher_.
UrlAsyncFetcher* distributed_fetcher() {
  return distributed_async_fetcher_;
}
448
// Does not take ownership.
449
void set_distributed_fetcher(UrlAsyncFetcher* fetcher) {
450
distributed_async_fetcher_ = fetcher;
453
// Creates a cache fetcher that uses the driver's fetcher and its options.
454
// Note: this means the driver's fetcher must survive as long as this does.
455
CacheUrlAsyncFetcher* CreateCacheFetcher();
456
// Returns a cache fetcher that does not fall back to an actual fetcher.
457
CacheUrlAsyncFetcher* CreateCacheOnlyFetcher();
459
// Returns the ServerContext pointer held in server_context_.
ServerContext* server_context() const {
  return server_context_;
}
460
Statistics* statistics() const;
462
// Takes ownership of 'options'.
463
void set_custom_options(RewriteOptions* options) {
464
set_options_for_pool(NULL, options);
467
// Takes ownership of 'options'. pool denotes the pool of rewrite drivers that
468
// use these options. May be NULL if using custom options.
469
void set_options_for_pool(RewriteDriverPool* pool, RewriteOptions* options) {
470
controlling_pool_ = pool;
471
options_.reset(options);
474
// Pool in which this driver can be recycled. May be NULL.
475
RewriteDriverPool* controlling_pool() { return controlling_pool_; }
477
// Return the options used for this RewriteDriver.
478
const RewriteOptions* options() const {
  // options_ is a smart pointer (it is reset() elsewhere in this class);
  // expose only the raw, read-only pointer to callers.
  return options_.get();
}
480
// Override HtmlParse's StartParseId to propagate any required options.
481
// Note that if this (or other variants) returns true you should use
482
// FinishParse(), otherwise Cleanup().
483
virtual bool StartParseId(const StringPiece& url, const StringPiece& id,
484
const ContentType& content_type);
486
// Override HtmlParse's FinishParse to ensure that the
487
// request-scoped cache is cleared immediately.
489
// Note that the RewriteDriver can delete itself in this method, if
490
// it's not externally managed, and if all RewriteContexts have been
492
virtual void FinishParse();
494
// As above, but asynchronous. Note that the RewriteDriver may already be
495
// deleted at the point the callback is invoked. The scheduler lock will
496
// not be held when the callback is run.
497
void FinishParseAsync(Function* callback);
499
// Report error message with description of context's location
500
// (such as filenames and line numbers). context may be NULL, in which case
501
// the current parse position will be used.
502
void InfoAt(const RewriteContext* context,
503
const char* msg, ...) INSTAWEB_PRINTF_FORMAT(3, 4);
505
// Creates a reference-counted pointer to a new OutputResource object.
507
// The content type is taken from the input_resource, but can be modified
508
// with SetType later if that is not correct (e.g. due to image transcoding).
510
// Constructs an output resource corresponding to the specified input resource
511
// and encoded using the provided encoder. Assumes permissions checking
512
// occurred when the input resource was constructed, and does not do it again.
513
// To avoid if-chains, tolerates a NULL input_resource (by returning NULL).
514
// TODO(jmaessen, jmarantz): Do we want to permit NULL input_resources here?
515
// jmarantz has evinced a distaste.
516
OutputResourcePtr CreateOutputResourceFromResource(
517
const StringPiece& filter_id,
518
const UrlSegmentEncoder* encoder,
519
const ResourceContext* data,
520
const ResourcePtr& input_resource,
521
OutputResourceKind kind,
522
GoogleString* failure_reason);
524
// Creates an output resource where the name is provided. The intent is to
525
// be able to derive the content from the name, for example, by encoding
526
// URLs and metadata.
528
// This method succeeds unless the filename is too long.
530
// This name is prepended with path for writing hrefs, and the resulting url
531
// is encoded and stored at file_prefix when working with the file system.
533
// $(PATH)/$(NAME).pagespeed[.$EXPERIMENT].$(FILTER_PREFIX).
534
// $(HASH).$(CONTENT_TYPE_EXT)
536
// EXPERIMENT is set only when there is an active experiment_spec.
538
// Could be private since you should use one of the versions below but put
539
// here with the rest like it and for documentation clarity.
540
OutputResourcePtr CreateOutputResourceWithPath(
541
const StringPiece& mapped_path, const StringPiece& unmapped_path,
542
const StringPiece& base_url, const StringPiece& filter_id,
543
const StringPiece& name, OutputResourceKind kind,
544
GoogleString* failure_reason);
546
// Fills in the resource namer based on the given filter_id, name and options
547
// stored in the driver.
548
void PopulateResourceNamer(
549
const StringPiece& filter_id,
550
const StringPiece& name,
551
ResourceNamer* full_name);
553
// Version of CreateOutputResourceWithPath which first takes only the
554
// unmapped path and finds the mapped path using the DomainLawyer
555
// and the base_url is this driver's base_url.
556
OutputResourcePtr CreateOutputResourceWithUnmappedUrl(
557
const GoogleUrl& unmapped_gurl, const StringPiece& filter_id,
558
const StringPiece& name, OutputResourceKind kind,
559
GoogleString* failure_reason);
561
// Version of CreateOutputResourceWithPath where the unmapped and mapped
562
// paths are different and the base_url is this driver's base_url.
563
OutputResourcePtr CreateOutputResourceWithMappedPath(
564
const StringPiece& mapped_path, const StringPiece& unmapped_path,
565
const StringPiece& filter_id, const StringPiece& name,
566
OutputResourceKind kind, GoogleString* failure_reason) {
567
return CreateOutputResourceWithPath(mapped_path, unmapped_path,
568
decoded_base_url_.AllExceptLeaf(),
569
filter_id, name, kind, failure_reason);
572
// Version of CreateOutputResourceWithPath where the unmapped and mapped
573
// paths and the base url are all the same. FOR TESTS ONLY.
574
OutputResourcePtr CreateOutputResourceWithPath(
575
const StringPiece& path, const StringPiece& filter_id,
576
const StringPiece& name, OutputResourceKind kind,
577
GoogleString* failure_reason) {
578
return CreateOutputResourceWithPath(path, path, path, filter_id, name,
579
kind, failure_reason);
582
// Creates an input resource based on input_url. Returns NULL if the input
583
// resource url isn't valid or is a data url, or can't legally be rewritten
584
// in the context of this page, in which case *is_authorized will be false.
585
// Assumes that resources from unauthorized domains may not be rewritten and
586
// that the resource is not intended exclusively for inlining.
587
ResourcePtr CreateInputResource(const GoogleUrl& input_url,
588
bool* is_authorized);
590
// Creates an input resource. Returns NULL if the input resource url isn't
591
// valid or is a data url, or can't legally be rewritten in the context of
592
// this page (which could mean that it was a resource from an unauthorized
593
// domain being processed by a filter that does not allow unauthorized
594
// resources, in which case *is_authorized will be false).
596
// There are two "special" options, and if you don't care about them you
597
// should just call CreateInputResource(input_url, is_authorized) to use
599
// * If resources from unauthorized domains may be inlined, set
600
// inline_authorization_policy to kInlineUnauthorizedResources, otherwise
601
// set it to kInlineOnlyAuthorizedResources.
602
// * If this resource will be inlined after fetching, then set intended_for to
603
// kIntendedForInlining, otherwise use kIntendedForGeneral. This is to
604
// support AllowWhenInlining.
605
ResourcePtr CreateInputResource(
606
const GoogleUrl& input_url,
607
InlineAuthorizationPolicy inline_authorization_policy,
608
IntendedFor intended_for,
609
bool* is_authorized);
611
// Creates an input resource from the given absolute url. Requires that the
612
// provided url has been checked, and can legally be rewritten in the current
613
// page context. Only for use by unit tests.
614
ResourcePtr CreateInputResourceAbsoluteUncheckedForTestsOnly(
615
const StringPiece& absolute_url);
617
// Returns true if some ResourceUrlClaimant has staked a claim on given URL.
618
// If this returns true, CreateInputResource will fail, but it's probably
619
// not worth logging any debug filter hints about that.
620
bool IsResourceUrlClaimed(const GoogleUrl& url) const;
622
// Checks to see if the input_url has the same origin as the base url, to
623
// make sure we're not fetching from another server. Does not consult the
624
// domain lawyer, and is not affected by AddDomain().
625
// Precondition: input_url.IsWebValid()
626
bool MatchesBaseUrl(const GoogleUrl& input_url) const;
628
// Checks to see if we can write the input_url resource in the domain_url
629
// taking into account domain authorization, wildcard allow/disallow from
630
// RewriteOptions, and the intended use of the url's resource. After the
631
// function is executed, is_authorized_domain will indicate whether input_url
632
// was found to belong to an authorized domain or not.
633
bool MayRewriteUrl(const GoogleUrl& domain_url,
634
const GoogleUrl& input_url,
635
InlineAuthorizationPolicy inline_authorization_policy,
636
IntendedFor intended_for,
637
bool* is_authorized_domain) const;
639
// Returns the appropriate base gurl to be used for resolving hrefs
640
// in the document. Note that HtmlParse::google_url() is the URL
641
// for the HTML file and is used for printing html syntax errors.
642
const GoogleUrl& base_url() const { return base_url_; }
644
// The URL that was requested if FetchResource was called.
645
StringPiece fetch_url() const { return fetch_url_; }
647
// Returns the decoded version of base_gurl() in case it was encoded by a
648
// non-default UrlNamer (for the default UrlNamer this returns the same value
649
// as base_url()). Required when fetching a resource by its encoded name.
650
const GoogleUrl& decoded_base_url() const { return decoded_base_url_; }
651
// Returns the Spec() of decoded_base_url_ as a StringPiece.
StringPiece decoded_base() const {
  return decoded_base_url_.Spec();
}
653
// Quick way to tell if the document url is https (ie was fetched via https).
654
bool IsHttps() const {
  // Decided purely from the scheme of google_url().
  const bool is_secure = google_url().SchemeIs("https");
  return is_secure;
}
656
// Returns a read-only pointer to the default_encoder_ member.
const UrlSegmentEncoder* default_encoder() const {
  return &default_encoder_;
}
658
// Finds a filter with the given ID, or returns NULL if none found.
659
RewriteFilter* FindFilter(const StringPiece& id) const;
661
// Returns refs_before_base.
662
bool refs_before_base() {
  // Plain read of the refs_before_base_ flag; no side effects.
  return refs_before_base_;
}
664
// Sets whether or not there were references to urls before the
665
// base tag (if there is a base tag). This variable has document-level
666
// scope. It is reset at the beginning of every document by
668
void set_refs_before_base() { refs_before_base_ = true; }
670
// Get/set the charset of the containing HTML page. See scan_filter.cc for
671
// an explanation of how this is determined, but NOTE that the determined
672
// charset can change as more of the HTML is seen, in particular after a
674
StringPiece containing_charset() { return containing_charset_; }
675
void set_containing_charset(const StringPiece charset) {
676
charset.CopyToString(&containing_charset_);
679
// Establishes a HtmlElement slot for rewriting.
680
HtmlResourceSlotPtr GetSlot(const ResourcePtr& resource,
682
HtmlElement::Attribute* attr);
684
// Method to start a resource rewrite. This is called by a filter during
685
// parsing, although the Rewrite might continue after deadlines expire
686
// and the rewritten HTML must be flushed. InitiateRewrite returns
687
// false if the system is not healthy enough to support resource rewrites.
688
bool InitiateRewrite(RewriteContext* rewrite_context)
689
LOCKS_EXCLUDED(rewrite_mutex());
690
void InitiateFetch(RewriteContext* rewrite_context);
692
// Provides a mechanism for a RewriteContext to notify a
693
// RewriteDriver that it is complete, to allow the RewriteDriver
694
// to delete itself or return it back to a free pool in the ServerContext.
696
// This will also call back into RewriteContext::Propagate, letting it
697
// know whether the context is still attached to the HTML DOM
698
// (and hence safe to render), and to do other bookkeeping.
700
// If 'permit_render' is false, no rendering will be asked for even if
701
// the context is still attached.
702
void RewriteComplete(RewriteContext* rewrite_context, bool permit_render);
704
// Provides a mechanism for a RewriteContext to notify a
705
// RewriteDriver that a certain number of rewrites have been discovered
706
// to need to take the slow path.
707
void ReportSlowRewrites(int num);
709
// If there are no outstanding references to this RewriteDriver,
710
// delete it or recycle it to a free pool in the ServerContext.
711
// If this is a fetch, calling this also signals to the system that you
712
// are no longer interested in its results.
715
// Adds an extra external reference to the object. You should not
716
// normally need to call it (NewRewriteDriver does it initially), unless for
717
// some reason you want to pin the object (e.g. in tests). Matches up with
719
void AddUserReference();
721
// Debugging routines to print out data about the driver.
722
GoogleString ToString(bool show_detached_contexts) const;
723
void PrintState(bool show_detached_contexts); // For debugging.
724
void PrintStateToErrorLog(bool show_detached_contexts); // For logs.
726
// Wait for outstanding Rewrite to complete. Once the rewrites are
727
// complete they can be rendered.
728
void WaitForCompletion();
730
// Wait for outstanding rewrite to complete, including any background
731
// work that may be ongoing even after results were reported.
733
// Note: while this guarantees that the result of the computation is
734
// known, the thread that performed it may still be running for a
735
// little bit and accessing the driver.
736
void WaitForShutDown();
738
// As above, but with a time bound, and taking a mode parameter to decide
739
// between WaitForCompletion or WaitForShutDown behavior.
740
// If timeout_ms <= 0, no time bound will be used.
741
void BoundedWaitFor(WaitMode mode, int64 timeout_ms)
742
LOCKS_EXCLUDED(rewrite_mutex());
744
// If this is set to true, during a Flush of HTML the system will
745
// wait for results of all rewrites rather than just waiting for
746
// cache lookups and a small deadline. Note, however, that in very
747
// rare circumstances some rewrites may still be dropped due to
750
// Note: reset every time the driver is recycled.
751
void set_fully_rewrite_on_flush(bool x) {
752
fully_rewrite_on_flush_ = x;
755
// Returns if this response has a blocking rewrite or not.
756
bool fully_rewrite_on_flush() const {
757
return fully_rewrite_on_flush_;
760
// This is relevant only when fully_rewrite_on_flush is true.
761
// When this is set to true, Flush of HTML will not wait for async events
762
// while it does wait when it is set to false.
763
void set_fast_blocking_rewrite(bool x) {
764
fast_blocking_rewrite_ = x;
767
bool fast_blocking_rewrite() const {
768
return fast_blocking_rewrite_;
771
// If the value of X-PSA-Blocking-Rewrite request header matches the blocking
772
// rewrite key, set fully_rewrite_on_flush flag.
773
void EnableBlockingRewrite(RequestHeaders* request_headers);
775
// Indicate that this RewriteDriver will be explicitly deleted, and
776
// thus should not be auto-deleted at the end of the parse. This is
777
// primarily for tests.
779
// TODO(jmarantz): Consider phasing this out to make tests behave
780
// more like servers.
781
// Stores x in externally_managed_.
void set_externally_managed(bool x) {
  externally_managed_ = x;
}
783
// Called by RewriteContext to let RewriteDriver know it will be continuing
784
// on the fetch in background, and so it should defer doing full cleanup
785
// sequences until DetachedFetchComplete() is called.
788
// Called by RewriteContext when a detached async fetch is complete, allowing
789
// the RewriteDriver to be recycled if FetchComplete() got invoked as well.
790
void DetachedFetchComplete();
792
// Cleans up the driver and any fetch rewrite contexts, unless the fetch
793
// rewrite got detached by a call to DetachFetch(), in which case a call to
794
// DetachedFetchComplete() must also be performed.
795
void FetchComplete();
797
// Deletes the specified RewriteContext. If this is the last RewriteContext
798
// active on this Driver, and there is no other outstanding activity, then
799
// the RewriteDriver itself can be recycled, and WaitForCompletion can return.
801
// We expect this method to be called on the Rewrite thread.
802
void DeleteRewriteContext(RewriteContext* rewrite_context);
804
// Forwards to the rewrite_deadline_ms() value held by options().
int rewrite_deadline_ms() {
  return options()->rewrite_deadline_ms();
}
806
// Sets a maximum amount of time to process a page across all flush
807
// windows; i.e., the entire lifecycle of this driver during a given pageload.
808
// A negative value indicates no limit.
809
// Setting fully_rewrite_on_flush() overrides this.
810
void set_max_page_processing_delay_ms(int x) {
811
max_page_processing_delay_ms_ = x;
813
int max_page_processing_delay_ms() { return max_page_processing_delay_ms_; }
815
// Sets the device type chosen for the current property_page.
816
void set_device_type(UserAgentMatcher::DeviceType x) {
  device_type_ = x;
}
817
// Returns the device type stored by set_device_type().
UserAgentMatcher::DeviceType device_type() const {
  return device_type_;
}
819
// Tries to register the given rewrite context as working on
820
// its partition key. If this context is the first one to try to handle it,
821
// returns NULL. Otherwise returns the previous such context.
823
// Must only be called from rewrite thread.
824
RewriteContext* RegisterForPartitionKey(const GoogleString& partition_key,
825
RewriteContext* candidate);
827
// Must be called after all other rewrites that are currently relying on this
828
// one have had their RepeatedSuccess or RepeatedFailure methods called.
830
// Must only be called from rewrite thread.
831
void DeregisterForPartitionKey(
832
const GoogleString& partition_key, RewriteContext* candidate);
834
// Indicates that a Flush through the HTML parser chain should happen
835
// soon, e.g. once the network pauses its incoming byte stream.
836
void RequestFlush() {
  // Only records the request; flush_requested() exposes the flag.
  flush_requested_ = true;
}
837
// True once RequestFlush() has set flush_requested_.
bool flush_requested() const {
  return flush_requested_;
}
839
// Executes a Flush() if RequestFlush() was called, e.g. from the
840
// Listener Filter (see set_event_listener below). Consider an HTML
841
// parse driven by a UrlAsyncFetcher. When the UrlAsyncFetcher
842
// temporarily runs out of bytes to read, it calls
843
// response_writer->Flush(). When that happens, we may want to
844
// consider flushing the outstanding HTML events through the system
845
// so that the browser can start fetching subresources and
846
// rendering. The event_listener (see set_event_listener below)
847
// helps determine whether enough "interesting" events have passed
848
// in the current flush window so that we should take this incoming
849
// network pause as an opportunity.
850
void ExecuteFlushIfRequested();
852
// Asynchronous version of the above. Note that you should not
853
// attempt to write out any data until the callback is invoked.
854
// (If a flush is not needed, the callback will be invoked immediately).
855
void ExecuteFlushIfRequestedAsync(Function* callback);
857
// Overrides HtmlParse::Flush so that it can happen in two phases:
858
// 1. Pre-render chain runs, resulting in async rewrite activity
859
// 2. async rewrite activity ends, calling callback, and post-render
861
// This API is used for unit-tests & Apache (which lacks a useful event
862
// model) and results in blocking behavior.
864
// FlushAsync is preferred for event-driven servers.
865
virtual void Flush();
867
// Initiates an asynchronous Flush. done->Run() will be called when
868
// the flush is complete. Further calls to ParseText should be deferred until
869
// the callback is called. Scheduler mutex is not held while done is called.
870
void FlushAsync(Function* done);
872
// Queues up a task to run on the (high-priority) rewrite thread.
873
void AddRewriteTask(Function* task);
875
// Queues up a task to run on the low-priority rewrite thread.
876
// Such tasks are expected to be safely cancelable.
877
void AddLowPriorityRewriteTask(Function* task);
879
// Accessor for the html_worker_ sequence pointer.
QueuedWorkerPool::Sequence* html_worker() {
  return html_worker_;
}
880
// Accessor for the rewrite_worker_ sequence pointer.
QueuedWorkerPool::Sequence* rewrite_worker() {
  return rewrite_worker_;
}
881
// Accessor for the low_priority_rewrite_worker_ sequence pointer.
QueuedWorkerPool::Sequence* low_priority_rewrite_worker() {
  return low_priority_rewrite_worker_;
}
885
// Accessor for the scheduler_ pointer.
Scheduler* scheduler() {
  return scheduler_;
}
887
// Used by CacheExtender, CssCombineFilter, etc. for rewriting domains
888
// of sub-resources in CSS.
889
DomainRewriteFilter* domain_rewriter() {
  // Hands out the raw pointer held by domain_rewriter_.
  return domain_rewriter_.get();
}
890
UrlLeftTrimFilter* url_trim_filter() {
  // Hands out the raw pointer held by url_trim_filter_.
  return url_trim_filter_.get();
}
892
// Rewrites CSS content to absolutify any relative embedded URLs, streaming
893
// the results to the writer. Returns 'false' if the writer returns false
894
// or if the content was not rewritten because the domains of the gurl
895
// and resolved_base match.
897
// input_css_base contains the path where the CSS text came from.
898
// output_css_base contains the path where the CSS will be written.
899
CssResolutionStatus ResolveCssUrls(const GoogleUrl& input_css_base,
900
const StringPiece& output_css_base,
901
const StringPiece& contents,
903
MessageHandler* handler);
905
// Determines if a URL relative to the given input_base needs to be
906
// absolutified given that it will end up under output_base:
907
// - If we are proxying and input_base isn't proxy encoded, then yes.
908
// - If we aren't proxying and input_base != output_base, then yes.
909
// - If we aren't proxying and the domain lawyer will shard or rewrite
910
// input_base, then yes.
911
// If not NULL also set *proxy_mode to whether proxy mode is active or not.
912
bool ShouldAbsolutifyUrl(const GoogleUrl& input_base,
913
const GoogleUrl& output_base,
914
bool* proxy_mode) const;
916
// Update the PropertyValue named 'property_name' in dom cohort with
917
// the value 'property_value'. It is the responsibility of the client to
918
// ensure that property cache and dom cohort are enabled when this function is
919
// called. It is a programming error to call this function when property
920
// cache or dom cohort is not available, more so since the value payload has
921
// to be serialised before calling this function. Hence this function will
922
// DFATAL if property cache or dom cohort is not available.
923
void UpdatePropertyValueInDomCohort(
924
AbstractPropertyPage* page,
925
StringPiece property_name,
926
StringPiece property_value);
928
// Returns the property page which contains the cached properties associated
929
// with the current URL.
930
PropertyPage* property_page() const;
931
// Returns the property page which contains the cached properties associated
932
// with the current URL and fallback URL (i.e. without query params). This
933
// should be used where a property is interested in fallback values if
934
// actual values are not present.
935
// Accessor for the fallback_property_page_ pointer.
FallbackPropertyPage* fallback_property_page() const {
  return fallback_property_page_;
}
938
// Takes ownership of page.
939
void set_property_page(PropertyPage* page);
940
// Takes ownership of page.
941
void set_fallback_property_page(FallbackPropertyPage* page);
942
// Does not take the ownership of the page.
943
void set_unowned_fallback_property_page(FallbackPropertyPage* page);
945
// Used by ImageRewriteFilter for identifying critical images.
946
const CriticalLineInfo* critical_line_info() const;
948
// Inserts the critical images present on the requested html page. It takes
949
// the ownership of critical_line_info.
950
void set_critical_line_info(CriticalLineInfo* critical_line_info);
952
CriticalKeys* beacon_critical_line_info() const;
953
void set_beacon_critical_line_info(CriticalKeys* beacon_critical_line_info);
955
const SplitHtmlConfig* split_html_config();
957
CriticalCssResult* critical_css_result() const;
958
// Sets the Critical CSS rules info in the driver and the ownership of
959
// the rules stays with the driver.
960
void set_critical_css_result(CriticalCssResult* critical_css_rules);
962
// The JS to detect above-the-fold images should only be enabled if one of the
963
// filters that uses critical image information is enabled, the property cache
964
// is enabled (since the critical image information is stored in the property
965
// cache), and it is not explicitly disabled through options.
966
bool is_critical_images_beacon_enabled();
968
// Used by ImageRewriteFilter for identifying critical images.
969
// Hands out the raw pointer held by critical_images_info_.
CriticalImagesInfo* critical_images_info() const {
  return critical_images_info_.get();
}
973
// This should only be called by the CriticalSelectorFinder. Normal users
974
// should call CriticalSelectorFinder::IsCriticalImage.
975
// TODO(jud): Remove when the finders reside in RewriteDriver and manage their
977
// Hands out the raw pointer held by critical_selector_info_.
CriticalSelectorInfo* critical_selector_info() {
  return critical_selector_info_.get();
}
981
// This should only be called by the CriticalSelectorFinder.
982
// TODO(jud): Remove when the finders reside in RewriteDriver and manage their
984
// Replaces whatever critical_selector_info_ currently holds with info
// (via reset()).
void set_critical_selector_info(CriticalSelectorInfo* info) {
  critical_selector_info_.reset(info);
}
988
// Inserts the critical images present on the requested html page. It takes
989
// ownership of critical_images_info. This should only be called by the
990
// CriticalImagesFinder, normal users should just be using the automatic
991
// management of critical_images_info that CriticalImagesFinder provides.
992
// Replaces whatever critical_images_info_ currently holds with
// critical_images_info (via reset()).
void set_critical_images_info(CriticalImagesInfo* critical_images_info) {
  critical_images_info_.reset(critical_images_info);
}
996
// Return true if we must prioritize critical selectors, and we should
997
// therefore enable its prerequisite filters as well.
998
bool CriticalSelectorsEnabled() const;
1000
// Return true if we must flatten css imports, either because the filter is
1001
// enabled explicitly or because it is enabled by CriticalSelectorsEnabled.
1002
bool FlattenCssImportsEnabled() const {
1003
return (options()->Enabled(RewriteOptions::kFlattenCssImports) ||
1004
(!options()->Forbidden(RewriteOptions::kFlattenCssImports) &&
1005
(CriticalSelectorsEnabled() ||
1006
options()->Enabled(RewriteOptions::kComputeCriticalCss))));
1009
// We expect this method to be called on the HTML parser thread.
1010
// Returns the number of images whose low quality images are inlined in the
1012
int num_inline_preview_images() const {
  return num_inline_preview_images_;
}
1014
// We expect this method to be called on the HTML parser thread.
1015
void increment_num_inline_preview_images();
1017
// We expect this method to be called on the HTML parser thread.
1018
// Returns the number of pagespeed resources flushed by flush early flow.
1019
int num_flushed_early_pagespeed_resources() const {
1020
return num_flushed_early_pagespeed_resources_;
1023
// We expect this method to be called on the HTML parser thread or after
1024
// parsing is completed.
1025
void increment_num_flushed_early_pagespeed_resources() {
1026
++num_flushed_early_pagespeed_resources_;
1029
// Increment reference count for misc. async ops that need the RewriteDriver
1031
void increment_async_events_count();
1033
// Decrements a reference count bumped up by increment_async_events_count()
1034
void decrement_async_events_count();
1036
// Determines whether the document's Content-Type has a mimetype indicating
1037
// that browsers should parse it as XHTML.
1038
XhtmlStatus MimeTypeXhtmlStatus();
1040
void set_flushed_cached_html(bool x) {
  flushed_cached_html_ = x;
}
1041
bool flushed_cached_html() { return flushed_cached_html_; }
1043
void set_flushing_cached_html(bool x) {
  flushing_cached_html_ = x;
}
1044
bool flushing_cached_html() const {
  return flushing_cached_html_;
}
1046
void set_flushed_early(bool x) {
  flushed_early_ = x;
}
1047
bool flushed_early() const {
  return flushed_early_;
}
1049
void set_flushing_early(bool x) {
  flushing_early_ = x;
}
1050
bool flushing_early() const {
  return flushing_early_;
}
1052
void set_is_lazyload_script_flushed(bool x) {
1053
is_lazyload_script_flushed_ = x;
1055
bool is_lazyload_script_flushed() const {
1056
return is_lazyload_script_flushed_; }
1058
// This method is not thread-safe. Call it only from the html parser thread.
1059
FlushEarlyInfo* flush_early_info();
1061
FlushEarlyRenderInfo* flush_early_render_info() const;
1063
// Takes the ownership of flush_early_render_info. This method is not
1064
// thread-safe. Call it only from the html parser thread.
1065
void set_flush_early_render_info(
1066
FlushEarlyRenderInfo* flush_early_render_info);
1068
// Determines whether we are currently in Debug mode; meaning that the
1069
// site owner or user has enabled filter kDebug.
1070
bool DebugMode() const {
  // Debug mode is simply the kDebug filter being enabled in options().
  return options()->Enabled(RewriteOptions::kDebug);
}
1072
// Log the given debug message(s) as HTML comments after the given element,
1073
// if not NULL, it has not been flushed, and if debug is enabled. The form
1074
// that takes a repeated field is intended for use by CachedResult, e.g:
1075
// InsertDebugComment(cached_result.debug_message(), element);
1076
// Messages are HTML-escaped before being written out to the DOM.
1077
void InsertDebugComment(StringPiece unescaped_message, HtmlNode* node);
1078
void InsertDebugComments(
1079
const protobuf::RepeatedPtrField<GoogleString>& unescaped_messages,
1080
HtmlElement* element);
1081
void InsertUnauthorizedDomainDebugComment(StringPiece url,
1082
HtmlElement* element);
1084
// Generates an unauthorized domain debug comment. Public for unit tests.
1085
static GoogleString GenerateUnauthorizedDomainDebugComment(
1086
const GoogleUrl& gurl);
1088
// Saves the origin headers for a request in flush_early_info so that it can
1089
// be used in subsequent request.
1090
void SaveOriginalHeaders(const ResponseHeaders& response_headers);
1092
// log_record() always returns a pointer to a valid AbstractLogRecord, owned
1093
// by the rewrite_driver's request context.
1094
AbstractLogRecord* log_record();
1096
// Accessor for the dom_stats_filter_ pointer.
DomStatsFilter* dom_stats_filter() const {
  return dom_stats_filter_;
}
1100
// Determines whether the system is healthy enough to rewrite resources.
1101
// Currently, systems get sick based on the health of the metadata cache.
1102
bool can_rewrite_resources() const {
  return can_rewrite_resources_;
}
1104
// Determine whether this driver is nested inside another.
1105
bool is_nested() const {
  return is_nested_;
}
1107
// Determines whether metadata was requested in the request headers and
1108
// verifies that the key in the header is the same as the expected key. An
1109
// empty expected key returns false.
1110
bool MetadataRequested(const RequestHeaders& request_headers) const;
1112
// Did the driver attempt to distribute the fetch?
1113
bool tried_to_distribute_fetch() const {
  return tried_to_distribute_fetch_;
}
1115
// Writes the specified contents into the output resource, and marks it
1116
// as optimized. 'inputs' describes the input resources that were used
1117
// to construct the output, and is used to determine whether the
1118
// result can be safely cache extended and be marked publicly cacheable.
1119
// 'content_type' and 'charset' specify the mimetype and encoding of
1120
// the contents, and will help form the Content-Type header.
1121
// 'charset' may be empty when not specified.
1123
// Note that this does not escape charset.
1125
// Callers should take care that dangerous types like 'text/html' do not
1126
// sneak into content_type.
1127
bool Write(const ResourceVector& inputs,
1128
const StringPiece& contents,
1129
const ContentType* type,
1130
StringPiece charset,
1131
OutputResource* output);
1133
void set_defer_instrumentation_script(bool x) {
1134
defer_instrumentation_script_ = x;
1136
bool defer_instrumentation_script() const {
1137
return defer_instrumentation_script_;
1140
// Sets the num_initiated_rewrites_. This should only be called from test
1142
// Overwrites num_initiated_rewrites_ with x. A ScopedMutex on
// rewrite_mutex() is constructed before the write.
void set_num_initiated_rewrites(int64 x) {
  ScopedMutex lock(rewrite_mutex());
  num_initiated_rewrites_ = x;
}
1146
// Returns num_initiated_rewrites_. A ScopedMutex on rewrite_mutex() is
// constructed before the read.
int64 num_initiated_rewrites() const {
  ScopedMutex lock(rewrite_mutex());
  return num_initiated_rewrites_;
}
1150
// Sets the num_detached_rewrites_. This should only be called from test code.
1151
// Overwrites num_detached_rewrites_ with x. A ScopedMutex on
// rewrite_mutex() is constructed before the write.
void set_num_detached_rewrites(int64 x) {
  ScopedMutex lock(rewrite_mutex());
  num_detached_rewrites_ = x;
}
1155
// Returns num_detached_rewrites_. A ScopedMutex on rewrite_mutex() is
// constructed before the read.
int64 num_detached_rewrites() const {
  ScopedMutex lock(rewrite_mutex());
  return num_detached_rewrites_;
}
1160
// Copies the characters of x into pagespeed_query_params_.
void set_pagespeed_query_params(StringPiece x) {
  x.CopyToString(&pagespeed_query_params_);
}
1163
// Returns a StringPiece built from pagespeed_query_params_.
StringPiece pagespeed_query_params() const {
  return pagespeed_query_params_;
}
1167
// Copies the characters of x into pagespeed_option_cookies_.
void set_pagespeed_option_cookies(StringPiece x) {
  x.CopyToString(&pagespeed_option_cookies_);
}
1170
// Returns a StringPiece built from pagespeed_option_cookies_.
StringPiece pagespeed_option_cookies() const {
  return pagespeed_option_cookies_;
}
1174
// We fragment the cache based on the hostname we got from the request, unless
1175
// that was overridden in the options with a cache_fragment.
1176
const GoogleString& CacheFragment() const;
1178
// Utility function to set/clear cookies for PageSpeed options. gurl is the
1179
// URL of the request from which the host is extracted for a cookie attribute.
1180
// TODO(matterbury): Get the URL from 'this' which we can't do now because it
1181
// isn't set until we've decided that the content of requested URL is HTML.
1182
// Returns true if any Set-Cookie headers are added, in which case
1183
// ComputeCaching has been called on response_headers.
1184
bool SetOrClearPageSpeedOptionCookies(const GoogleUrl& gurl,
1185
ResponseHeaders* response_headers);
1187
// Calls the provided ResourceNamer's Decode() function, passing the hash and
1188
// signature lengths from this RewriteDriver.
1189
bool Decode(StringPiece leaf, ResourceNamer* resource_namer) const;
1192
virtual void DetermineEnabledFiltersImpl();
1195
friend class DistributedRewriteContextTest;
1196
friend class RewriteContext;
1197
friend class RewriteDriverTest;
1198
friend class RewriteTestBase;
1199
friend class ServerContextTest;
1201
typedef std::map<GoogleString, RewriteFilter*> StringFilterMap;
1203
// Returns true if the given fetch request should be distributed.
1204
bool ShouldDistributeFetch(const StringPiece& filter_id);
1206
// Distributes the fetch to another task if ShouldDistributeFetch allows it
1207
// for the provided filter_id and streams the result to the provided fetch
1210
// Returns true if an attempt to distribute was made. If the attempt fails
1211
// before async_fetch was written to (before ResponseHeaders) it will call
1212
// RewriteDriver::FetchResource() and skip distribution. If the attempt fails
1213
// after writing to the ResponseHeaders then the fetch will ultimately fail
1214
// and the client will get a broken resource.
1216
// Returns false if ShouldDistributeFetch disallows the distribution.
1217
bool DistributeFetch(const StringPiece& url, const StringPiece& filter_id,
1218
AsyncFetch* async_fetch);
1220
// Checks whether outstanding rewrites are completed in a satisfactory fashion
1221
// with respect to given wait_mode and timeout, and invokes done->Run() (with
1222
// rewrite_mutex released) when either finished or timed out. May relinquish
1223
// rewrite_mutex() temporarily to invoke done.
1224
void CheckForCompletionAsync(WaitMode wait_mode, int64 timeout_ms,
1226
EXCLUSIVE_LOCKS_REQUIRED(rewrite_mutex());
1228
// A single check attempt for the above. Will either invoke callback (with
1229
// rewrite_mutex released) or ask scheduler to check again. May relinquish
1230
// rewrite_mutex() temporarily to invoke done.
1231
void TryCheckForCompletion(WaitMode wait_mode, int64 end_time_ms,
1233
EXCLUSIVE_LOCKS_REQUIRED(rewrite_mutex());
1235
// Termination predicate for above.
1236
bool IsDone(WaitMode wait_mode, bool deadline_reached)
1237
EXCLUSIVE_LOCKS_REQUIRED(rewrite_mutex());
1239
// Always wait for pending async events during shutdown or while waiting for
1240
// the completion of all rewriting (except in fast_blocking_rewrite mode).
1241
// True when wait_mode is kWaitForShutDown, or when fully_rewrite_on_flush_
// is set and fast_blocking_rewrite_ is not.
bool WaitForPendingAsyncEvents(WaitMode wait_mode) {
  return wait_mode == kWaitForShutDown ||
         (fully_rewrite_on_flush_ && !fast_blocking_rewrite_);
}
1246
// Portion of flush that happens asynchronously off the scheduler
1247
// once the rendering is complete. Calls back to 'callback' after its
1248
// processing, but with the lock released.
1249
void FlushAsyncDone(int num_rewrites, Function* callback);
1251
// Returns the amount of time to wait for rewrites to complete for the
1252
// current flush window. This combines the per-flush window deadline
1253
// (configured via rewrite_deadline_ms()) and the per-page deadline
1254
// (configured via max_page_processing_delay_ms()).
1255
int64 ComputeCurrentFlushWindowRewriteDelayMs();
1257
// Queues up invocation of FlushAsyncDone in our html_workers sequence.
1258
void QueueFlushAsyncDone(int num_rewrites, Function* callback);
1260
// Called as part of implementation of FinishParseAsync, after the
1261
// flush is complete.
1262
void QueueFinishParseAfterFlush(Function* user_callback);
1263
void FinishParseAfterFlush(Function* user_callback);
1265
bool RewritesComplete() const EXCLUSIVE_LOCKS_REQUIRED(rewrite_mutex());
1267
// Sets the base GURL in response to a base-tag being parsed. This
1268
// should only be called by ScanFilter.
1269
void SetBaseUrlIfUnset(const StringPiece& new_base);
1271
// Sets the base URL for a resource fetch. This should only be called from
1272
// test code and from FetchResource.
1273
void SetBaseUrlForFetch(const StringPiece& url);
1275
// Saves a decoding of the Base URL in decoded_base_url_. Use this
1276
// whenever updating base_url_.
1277
void SetDecodedUrlFromBase();
1279
// The rewrite_mutex is owned by the scheduler.
1280
// Returns scheduler_->mutex(). The LOCK_RETURNED annotation names that same
// expression, so lock annotations written against rewrite_mutex() refer to
// the scheduler's mutex.
AbstractMutex* rewrite_mutex() const LOCK_RETURNED(scheduler_->mutex()) {
  return scheduler_->mutex();
}
1284
// Parses an arbitrary block of an html file
1285
virtual void ParseTextInternal(const char* content, int size);
1287
// Indicates whether we should skip parsing for the given request.
1288
bool ShouldSkipParsing();
1290
// Returns the length of the signature on a signed resource URL.
1291
int SignatureLength() const;
1293
friend class ScanFilter;
1295
// Registers RewriteFilter in the map, but does not put it in the
1296
// html parse filter chain. This allows it to serve resource
1298
void RegisterRewriteFilter(RewriteFilter* filter);
1300
// Adds an already-owned rewrite filter to the pre-render chain. This
1301
// is used for filters that are unconditionally created for handling of
1302
// resources, but their presence in the html-rewrite chain is conditional
1304
void EnableRewriteFilter(const char* id);
1306
// Internal low-level helper for resource creation.
1307
// Use only when permission checking has been done explicitly on the
1308
// caller side. is_authorized_domain is passed along to Resource object
1309
// creation, in order to decide whether to keep the resource in the usual
1310
// key space or a separate one meant for unauthorized resources only.
1311
ResourcePtr CreateInputResourceUnchecked(const GoogleUrl& gurl,
1312
bool is_authorized_domain);
1314
void AddPreRenderFilters();
1315
void AddPostRenderFilters();
1317
// Helper function to decode the pagespeed url.
1318
bool DecodeOutputResourceNameHelper(const GoogleUrl& url,
1319
const RewriteOptions* options_to_use,
1320
const UrlNamer* url_namer,
1321
ResourceNamer* name_out,
1322
OutputResourceKind* kind_out,
1323
RewriteFilter** filter_out,
1324
GoogleString* url_base,
1325
StringVector* urls) const;
1327
// When HTML parsing is complete, we have learned all we can about the DOM, so
1328
// immediately write anything required into that Cohort into the page property
1329
// cache. Writes to this cohort are predicated so that they only occur if a
1330
// filter that actually makes use of it is enabled. This prevents filling the
1331
// cache with unnecessary entries. To enable writing, a filter should override
1332
// DetermineEnabled to call
1333
// RewriteDriver::set_write_property_cache_dom_cohort(true), or in the case of
1334
// a RewriteFilter, should override
1335
// RewriteFilter::UsesPropertyCacheDomCohort() to return true.
1336
void WriteDomCohortIntoPropertyCache();
1338
// Used by CreateCacheFetcher() and CreateCacheOnlyFetcher().
1339
CacheUrlAsyncFetcher* CreateCustomCacheFetcher(UrlAsyncFetcher* base_fetcher);
1341
// Just before releasing the rewrite driver, check if the feature for storing
1342
// rewritten responses (e.g. html) in cache is enabled. If yes, purge the
1343
// old response if significant amount of rewriting happened after this
1344
// response was stored in the cache. If not, release the rewrite driver. If a
1345
// purge fetch request is issued, the rewrite driver will be released after
1346
// this async fetch request is completed.
1347
void PossiblyPurgeCachedResponseAndReleaseDriver();
1349
// Log statistics to the AbstractLogRecord.
1352
// This pair of calls helps determine if code that changes event state
1353
// should wake up anyone waiting for rewrite driver's completion.
1355
// The usage pattern is something like this:
1356
// ScopedMutex lock(rewrite_mutex());
1357
// bool should_signal_cookie = PrepareShouldSignal();
1362
// SignalIfRequired(should_signal_cookie);
1364
// WARNING: SignalIfRequired() drops the lock on rewrite_mutex() temporarily,
1365
// so 'this' could get deleted after it returns, so it should not be accessed
1367
bool PrepareShouldSignal() EXCLUSIVE_LOCKS_REQUIRED(rewrite_mutex());
1368
void SignalIfRequired(bool result_of_prepare_should_signal)
1369
EXCLUSIVE_LOCKS_REQUIRED(rewrite_mutex());
1371
// Only the first base-tag is significant for a document -- any subsequent
1372
// ones are ignored. There should be no URLs referenced prior to the base
1373
// tag, if one exists. See
1375
// http://www.whatwg.org/specs/web-apps/current-work/multipage/
1376
// semantics.html#the-base-element
1377
// http://www.whatwg.org/specs/web-apps/current-work/multipage/
1378
// urls.html#document-base-url
1380
// Thus we keep the base-tag in the RewriteDriver, and also keep track of
1381
// whether it's been reset already within the document.
1384
// Stores whether or not there were references to urls before the
1385
// base tag (if there is a base tag) in this document. If there is
1386
// no base tag, this should be false. If the base tag is before all
1387
// other url references, this should also be false.
1388
bool refs_before_base_;
1390
// The charset of the containing HTML page.
1391
GoogleString containing_charset_;
1393
// Copies properties from the request headers to the request context,
1394
// if both are non-null.
1395
void PopulateRequestContext();
1397
bool filters_added_;
1398
bool externally_managed_;
1400
// Memory management stuff. Some of the reference counts we keep track of
1401
// also are used as a count of events, to help determine when we are done.
1403
// WARNING: every time you decrement reference counts, you should
1404
// check release_driver_ within the critical section, and call
1405
// PossiblyPurgeCachedResponseAndReleaseDriver() if it is true
1406
// after releasing the lock. The easiest way to get it right is to just call
1409
kRefUser, // External refcount from users
1410
kRefParsing, // Parser active
1412
// The number of rewrites (RewriteContext) that have been requested,
1413
// and not yet completed, and for which we still hope to render
1414
// them within the flush window. This is waited for.
1415
kRefPendingRewrites,
1417
// The number of rewrites (RewriteContext) that have missed the rendering
1418
// deadline. We don't wait for them, but they still need to keep
1419
// the RewriteDriver alive.
1420
kRefDetachedRewrites,
1422
// Tracks the number of RewriteContexts that have been completed,
1423
// but not yet deleted. Once RewriteComplete has been called,
1424
// rewrite_context->Propagate() is called to render slots (if not
1425
// detached) and to queue up activity that must occur prior to the
1426
// context being deleted: specifically running any successors.
1427
// After all that occurs, DeleteRewriteContext must be called and
1428
// that will decrement this counter.
1429
kRefDeletingRewrites,
1431
// Keeps track of fetch-responding work that's user-facing.
1432
kRefFetchUserFacing,
1434
// Keeps track of any background continuation of a fetch.
1435
kRefFetchBackground,
1437
// Misc async references from outside
1439
// TODO(morlovich): Split between events people might want to wait for
1440
// and events which they don't in a follow up.
1446
friend class CategorizedRefcount<RewriteDriver, RefCategory>;
1448
// Protected by rewrite_mutex().
1449
CategorizedRefcount<RewriteDriver, RefCategory> ref_counts_;
1451
// Interface to CategorizedRefcount
1452
void LastRefRemoved();
1453
StringPiece RefCategoryName(RefCategory cat);
1455
// Drops a reference of given kind, signaling any waiters
1456
// and potentially even releasing the rewrite driver.
1457
void DropReference(RefCategory cat);
1459
// Set to true when the refcount reaches 0. See comment
1460
// above RefCategory for how this should be used.
1461
bool release_driver_;
1463
// Indicates that the rewrite driver is currently parsing the HTML,
1464
// and thus should not be recycled until FinishParse() is called.
1465
bool parsing_ GUARDED_BY(rewrite_mutex());
1467
// If not kNoWait, indicates that WaitForCompletion or similar method
1468
// has been called, and another thread is waiting for us to notify it of
1469
// everything having been finished in a given mode.
1470
WaitMode waiting_ GUARDED_BY(rewrite_mutex());
1472
// This is set to true if the current wait's deadline has expired.
1473
bool waiting_deadline_reached_ GUARDED_BY(rewrite_mutex());
1475
// If this is true, the usual HTML streaming interface will let rendering
1476
// of every flush window fully complete before proceeding, rather than
1477
// using a deadline. This means rewriting of HTML may be slow, and hence
1478
// this mode should not be used for online traffic.
1479
bool fully_rewrite_on_flush_;
1481
// If this is true, we don't wait for async events before flushing bytes to
1482
// the client during a blocking rewrite; otherwise we do wait for them.
1483
bool fast_blocking_rewrite_;
1485
// NOTE(review): undocumented; presumably true while a flush has been
// requested but not yet carried out -- confirm against the implementation.
bool flush_requested_;
1486
// NOTE(review): undocumented; presumably true once at least one flush has
// happened for the current document -- confirm against the implementation.
bool flush_occurred_;
1488
// If true, cached HTML has been flushed.
1489
bool flushed_cached_html_;
1491
// If true, we are using this RewriteDriver to flush cached HTML.
1492
bool flushing_cached_html_;
1494
// If true, bytes were flushed before receiving bytes from the origin server.
// NOTE(review): the original sentence was truncated after "from the";
// "origin server" is the presumed ending -- confirm.
1496
bool flushed_early_;
1497
// If set to true, then we are using this RewriteDriver to flush HTML to the
1498
// user early. This is only set to true when
1499
// enable_flush_subresources_experimental is true.
1500
bool flushing_early_;
1502
// If true, the lazyload script has been flushed as part of flush early.
// NOTE(review): the original sentence was truncated after "flush early"
// (likely "flush early flow") -- confirm.
1504
bool is_lazyload_script_flushed_;
1506
// Tracks whether any filter that uses the dom cohort of the property cache is
1507
// enabled. Writes to the property cache for this cohort are predicated on
// this flag.
1509
bool write_property_cache_dom_cohort_;
1511
// URL of the HTML page being rewritten in the HTML flow, or the URL
1512
// of the resource being rewritten in the resource flow.
1513
GoogleUrl base_url_;
1515
// In the resource flow, the URL requested may not have the same
1516
// base as the original resource. decoded_base_url_ stores the base
1517
// of the original (un-rewritten) resource.
1518
GoogleUrl decoded_base_url_;
1520
// This is the URL that is being fetched in a fetch path (not valid in the
// HTML path).
1522
GoogleString fetch_url_;
1524
// NOTE(review): undocumented; presumably the User-Agent string of the
// current request -- confirm.
GoogleString user_agent_;
1526
// NOTE(review): undocumented; the LazyBool type suggests a lazily computed,
// cached answer to "should parsing be skipped?" -- confirm.
LazyBool should_skip_parsing_;
1528
// NOTE(review): undocumented; presumably maps filter ids to the filters able
// to serve resource fetches -- confirm.
StringFilterMap resource_filter_map_;
1530
// NOTE(review): undocumented raw pointer; ownership is not visible here --
// confirm who owns the headers and how long they remain valid.
ResponseHeaders* response_headers_;
1532
// request_headers_ is a copy of the Fetch's request headers, and it
1533
// stays alive until the rewrite driver is recycled or deleted.
1534
scoped_ptr<const RequestHeaders> request_headers_;
1536
int status_code_; // Status code of response for this request.
1538
// This group of rewrite-context-related variables is accessed
1539
// only in the main thread of RewriteDriver (aka the HTML thread).
1540
typedef std::vector<RewriteContext*> RewriteContextVector;
1541
RewriteContextVector rewrites_; // ordered list of rewrites to initiate
1543
// The maximum amount of time, in milliseconds, to wait for page processing
1544
// across all flush windows. A negative value implies no limit.
1545
int max_page_processing_delay_ms_;
1547
// Set of RewriteContext pointers, used for the initiated and detached
// rewrite sets declared in this class.
typedef std::set<RewriteContext*> RewriteContextSet;
1549
// Contains the RewriteContext* that have been queued into the
1550
// RewriteThread, but have not gotten to the point where
1551
// RewriteComplete() has been called. This set is cleared
1552
// once the rewrite_deadline_ms has passed.
1553
RewriteContextSet initiated_rewrites_ GUARDED_BY(rewrite_mutex());
1555
// Number of total initiated rewrites for the request.
1556
int64 num_initiated_rewrites_ GUARDED_BY(rewrite_mutex());
1558
// Number of total detached rewrites for the request, i.e. rewrites whose
1559
// results did not make it to the response. This is different from
1560
// kRefDetachedRewrites (and detached_rewrites_.size(), which is equal to it)
1561
// since that counter is for the number of rewrites
1562
// currently in the detached state for the current flush window,
1563
// while this variable is the total that ever got detached over the whole
// request.
1565
int64 num_detached_rewrites_ GUARDED_BY(rewrite_mutex());
1567
// Contains the RewriteContext* that were still running at the deadline.
1568
// They are said to be in a "detached" state although the RewriteContexts
1569
// themselves don't know that. They will continue performing their
1570
// Rewrite in the RewriteThread, and caching the results. And until
1571
// they complete, the RewriteDriver must stay alive and not be Recycled
1572
// or deleted. WaitForCompletion() blocks until all detached_rewrites
1573
// have been retired.
1574
RewriteContextSet detached_rewrites_ GUARDED_BY(rewrite_mutex());
1576
// Rewrites that may possibly be satisfied from the metadata cache alone.
1577
int possibly_quick_rewrites_ GUARDED_BY(rewrite_mutex());
1579
// List of RewriteContext objects for fetch to delete. We do it in
1580
// clear as a simplification.
// NOTE(review): "clear" presumably refers to the driver's Clear() method --
// confirm.
1581
RewriteContextVector fetch_rewrites_;
1583
// These objects are provided on construction or later, and are
1584
// owned by the caller (so RewriteDriver must not delete them).
1585
FileSystem* file_system_;
1586
ServerContext* server_context_;
1587
Scheduler* scheduler_;
1588
UrlAsyncFetcher* default_url_async_fetcher_; // the fetcher we got at ctor
1590
// This is the fetcher we use --- it's either the default_url_async_fetcher_,
1591
// or whatever it was temporarily overridden to by SetSessionFetcher.
1592
// This is either owned externally or via owned_url_async_fetchers_.
1593
UrlAsyncFetcher* url_async_fetcher_;
1595
// This is the fetcher that is used to distribute rewrites if enabled. This
1596
// can be NULL if distributed rewriting is not configured.
// NOTE(review): the original ownership sentence was truncated ("This is
// owned ..."); presumably the fetcher is owned externally -- confirm.
1598
UrlAsyncFetcher* distributed_async_fetcher_;
1600
// A list of all the UrlAsyncFetchers that we own, as set with
1601
// SetSessionFetcher.
1602
std::vector<UrlAsyncFetcher*> owned_url_async_fetchers_;
1604
// NOTE(review): the filter members below are undocumented. Those held in a
// scoped_ptr are owned by this driver; ownership of the raw
// dom_stats_filter_ pointer is not visible here -- confirm.
DomStatsFilter* dom_stats_filter_;
1605
scoped_ptr<HtmlWriterFilter> html_writer_filter_;
1607
// Held by value, so it lives exactly as long as this RewriteDriver.
ScanFilter scan_filter_;
1608
scoped_ptr<DomainRewriteFilter> domain_rewriter_;
1609
scoped_ptr<UrlLeftTrimFilter> url_trim_filter_;
1611
// Maps rewrite context partition keys to the context responsible for
1612
// rewriting them, in case a URL occurs more than once.
1613
typedef std::map<GoogleString, RewriteContext*> PrimaryRewriteContextMap;
1614
PrimaryRewriteContextMap primary_rewrite_context_map_;
1616
// NOTE(review): undocumented; presumably the set of HTML resource slots
// created for the current document -- confirm.
HtmlResourceSlotSet slots_;
1618
// Rewrite options held by this driver. Stored in a scoped_ptr, so the
// object is deleted with the driver unless it is released earlier.
scoped_ptr<RewriteOptions> options_;
1620
RewriteDriverPool* controlling_pool_; // or NULL if this has custom options.
1622
// Object which manages CacheUrlAsyncFetcher async operations.
1623
scoped_ptr<CacheUrlAsyncFetcher::AsyncOpHooks>
1624
cache_url_async_fetcher_async_op_hooks_;
1626
// The default resource encoder.
1627
UrlSegmentEncoder default_encoder_;
1629
// The first chain of filters called before waiting for Rewrites to complete.
1630
FilterList early_pre_render_filters_;
1631
// The second chain of filters called before waiting for Rewrites to complete.
1632
FilterList pre_render_filters_;
1635
// NOTE(review): undocumented here; the element type suggests objects that
// can claim resource URLs, and the raw pointers leave ownership unclear --
// confirm.
std::vector<ResourceUrlClaimant*> resource_claimants_;
1637
// A container of filters to delete when RewriteDriver is deleted. This
1638
// can include pre_render_filters as well as those added to the post-render
1639
// chain owned by HtmlParse.
1640
FilterVector filters_to_delete_;
1642
// Worker sequences used by this driver.
// NOTE(review): undocumented; the names suggest one sequence for HTML-side
// work, one for rewrites, and one for low-priority rewrites. These are raw
// pointers, so ownership is not visible here -- confirm.
QueuedWorkerPool::Sequence* html_worker_;
1643
QueuedWorkerPool::Sequence* rewrite_worker_;
1644
QueuedWorkerPool::Sequence* low_priority_rewrite_worker_;
1648
// Stores any cached properties associated with the current URL and fallback
1649
// URL (i.e. without query params).
1650
FallbackPropertyPage* fallback_property_page_;
1652
// True if the property page is owned by this driver, false otherwise.
1653
bool owns_property_page_;
1655
// Device type for the current property page.
1656
UserAgentMatcher::DeviceType device_type_;
1658
// NOTE(review): the three members below are undocumented; each is held in a
// scoped_ptr and is therefore owned by this driver.
scoped_ptr<CriticalLineInfo> critical_line_info_;
1659
scoped_ptr<CriticalKeys> beacon_critical_line_info_;
1661
scoped_ptr<SplitHtmlConfig> split_html_config_;
1663
// The critical image finder and critical selector finder will lazy-init these
// fields.
1665
scoped_ptr<CriticalImagesInfo> critical_images_info_;
1666
scoped_ptr<CriticalSelectorInfo> critical_selector_info_;
1668
// NOTE(review): undocumented; presumably the critical CSS result for the
// current page -- confirm.
scoped_ptr<CriticalCssResult> critical_css_result_;
1670
// Memoized computation of whether the current doc has an XHTML mimetype.
// NOTE(review): presumably xhtml_mimetype_computed_ is the "already
// computed" flag and xhtml_status_ holds the cached result -- confirm.
1671
bool xhtml_mimetype_computed_;
1672
// Declared as an 8-bit bit-field.
XhtmlStatus xhtml_status_ : 8;
1674
// Number of images whose low quality images are inlined in the html page by
1675
// InlinePreviewFilter.
1676
int num_inline_preview_images_;
1678
// Number of flushed-early PageSpeed-rewritten resources.
1679
int num_flushed_early_pagespeed_resources_;
1681
// The total number of bytes for which ParseText is called.
1684
// NOTE(review): undocumented raw pointer; ownership is not visible here --
// confirm (it may be held via filters_to_delete_).
DebugFilter* debug_filter_;
1686
// NOTE(review): undocumented; both objects are held in scoped_ptrs and are
// therefore owned by this driver.
scoped_ptr<FlushEarlyInfo> flush_early_info_;
1687
scoped_ptr<FlushEarlyRenderInfo> flush_early_render_info_;
1689
// NOTE(review): undocumented; presumably false when this driver must not
// rewrite resources for the current request -- confirm.
bool can_rewrite_resources_;
1692
// Additional request context that may outlive this RewriteDriver. (Thus,
1693
// the context is reference counted.)
1694
RequestContextPtr request_context_;
1696
// Start time for HTML requests, in milliseconds. Used for statistics
// reporting.
1697
int64 start_time_ms_;
1699
// NOTE(review): undocumented; held in a scoped_ptr, so owned by this driver.
// Presumably describes properties of the current request -- confirm.
scoped_ptr<RequestProperties> request_properties_;
1701
// Helps make sure RewriteDriver and its children are initialized exactly
1702
// once, allowing for multiple calls to RewriteDriver::Initialize as long
1703
// as they are matched to RewriteDriver::Terminate.
// Static, so a single counter is shared by all RewriteDriver instances.
1704
static int initialized_count_;
1706
// True if this RewriteDriver attempted to distribute the rewrite. This is
1707
// used to prevent a second attempt in case the first errored out.
1708
bool tried_to_distribute_fetch_;
1710
// If false, add the pagespeed_no_defer attribute to the script inserted by
1711
// the add_instrumentation filter.
1712
bool defer_instrumentation_script_;
1714
// Downstream cache object used for issuing purges.
1715
DownstreamCachePurger downstream_cache_purger_;
1717
// Any PageSpeed options stripped from the original URL.
1718
GoogleString pagespeed_query_params_;
1720
// Any PageSpeed option cookies from the original request.
1721
GoogleString pagespeed_option_cookies_;
1723
// RewriteDriver instances can be neither copied nor assigned.
DISALLOW_COPY_AND_ASSIGN(RewriteDriver);
1726
// Subclass of HTTPCache::Callback that incorporates a given RewriteOptions'
1727
// invalidation policy.
1728
class OptionsAwareHTTPCacheCallback : public HTTPCache::Callback {
1730
virtual ~OptionsAwareHTTPCacheCallback();
1731
// NOTE(review): the three virtual methods below are presumably overrides of
// HTTPCache::Callback hooks; this one likely decides, using
// rewrite_options_, whether the cached entry for 'key' is still usable --
// confirm against the implementation.
virtual bool IsCacheValid(const GoogleString& key,
1732
const ResponseHeaders& headers);
1733
// NOTE(review): presumably returns a TTL override in milliseconds for 'key';
// the value meaning "no override" is not visible here -- confirm.
virtual int64 OverrideCacheTtlMs(const GoogleString& key);
1734
// NOTE(review): presumably reports how Vary headers on resources should be
// treated under the current options -- confirm.
virtual ResponseHeaders::VaryOption RespectVaryOnResources() const;
1736
// Validates the specified response for the URL, request, given the specified
1737
// options. This is for checking if a cached response can still be used, not
1738
// for determining whether an entry should be written to an HTTP cache.
1739
static bool IsCacheValid(const GoogleString& key,
1740
const RewriteOptions& rewrite_options,
1741
const RequestContextPtr& request_ctx,
1742
const ResponseHeaders& headers);
1745
// Sub-classes need to ensure that rewrite_options remains valid till
1746
// Callback::Done finishes.
1747
OptionsAwareHTTPCacheCallback(
1748
const RewriteOptions* rewrite_options,
1749
const RequestContextPtr& request_ctx);
1752
// Plain pointer to the RewriteOptions (presumably the one passed to the
// constructor -- confirm); see the lifetime requirement stated on the
// constructor.
const RewriteOptions* rewrite_options_;
1754
// Instances can be neither copied nor assigned.
DISALLOW_COPY_AND_ASSIGN(OptionsAwareHTTPCacheCallback);
1757
} // namespace net_instaweb
1759
#endif // NET_INSTAWEB_REWRITER_PUBLIC_REWRITE_DRIVER_H_