1
// Copyright 2014 Google Inc. All Rights Reserved.
3
// Licensed under the Apache License, Version 2.0 (the "License");
4
// you may not use this file except in compliance with the License.
5
// You may obtain a copy of the License at
7
// http://www.apache.org/licenses/LICENSE-2.0
9
// Unless required by applicable law or agreed to in writing, software
10
// distributed under the License is distributed on an "AS IS" BASIS,
11
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
// See the License for the specific language governing permissions and
13
// limitations under the License.
15
#ifndef PAGESPEED_KERNEL_JS_JS_TOKENIZER_H_
16
#define PAGESPEED_KERNEL_JS_JS_TOKENIZER_H_
22
#include "pagespeed/kernel/base/basictypes.h"
23
#include "pagespeed/kernel/base/string.h"
24
#include "pagespeed/kernel/base/string_util.h"
25
#include "pagespeed/kernel/js/js_keywords.h"
26
#include "pagespeed/kernel/util/re2.h"
32
class JsTokenizerPatterns;
34
// This class accurately breaks up JavaScript code into a sequence of tokens.
35
// This includes tokens for comments and whitespace; every byte of the input is
36
// represented in the token stream, so that concatenating the text of each
37
// token will perfectly recover the original input, even in error cases (since
38
// the final, error token will contain the entire rest of the input). Also,
39
// each whitespace token is classified by the tokenizer as 1) not containing
40
// linebreaks, 2) containing linebreaks but not inducing semicolon insertion,
41
// or 3) inducing semicolon insertion.
43
// To do all this, JsTokenizer keeps track of a minimal amount of parse state
44
// to allow it to accurately differentiate between division operators and regex
45
// literals, and to determine which linebreaks will result in semicolon
46
// insertion and which will not. If the given JavaScript code is syntactically
47
// incorrect such that this differentiation becomes impossible, this class will
48
// return an error, but will still tokenize as much as it can up to that point
49
// (note however that many other kinds of syntax errors will be ignored; being
50
// a complete parser or syntax checker is a non-goal of this class).
52
// This separation of tokens and classification of whitespace means that this
53
// class can be used to create a robust JavaScript minifier (see js_minify.h).
54
// It could also perhaps be used as the basis of a more complete JavaScript
// parser or syntax checker.
58
// Creates a tokenizer that will tokenize the given UTF8-encoded input string
59
// (which must outlive the JsTokenizer object).
60
JsTokenizer(const JsTokenizerPatterns* patterns, StringPiece input);
64
// Gets the next token type from the input, and stores the relevant substring
65
// of the original input in token_out (which must be non-NULL). If the end
66
// of input has been reached, returns kEndOfInput and sets token_out to the
67
// empty string. If an error is encountered, sets has_error() to true,
68
// returns kError, and sets token_out to the remainder of the input.
69
JsKeywords::Type NextToken(StringPiece* token_out);
71
// True if an error has been encountered. All future calls to NextToken()
72
// will return JsKeywords::kError with an empty token string.
73
bool has_error() const { return error_; }
75
// Return a string representing the current parse stack, for testing only.
76
GoogleString ParseStackForTest() const;
79
// An entry in the parse stack. This does not fully capture the grammar of
80
// JavaScript -- far from it -- rather, it is just barely nuanced enough to
81
// determine which linebreaks are important for semicolon insertion, and to
82
// tell whether or not a given slash begins a regex literal. If it turns out
83
// to be insufficiently nuanced (i.e. we find new bugs), it can be refined by
84
// adding more parse states.
86
kStartOfInput, // For convenience, the bottom of the stack is always this.
88
kOperator, // A prefix or binary operator (including some keywords).
93
kBlockKeyword, // Keyword that precedes "(...)", e.g. "if" or "for".
94
kBlockHeader, // Start of block, e.g. "if (...)", "for (...)", or "else".
95
kReturnThrow, // A return or throw keyword.
96
kJumpKeyword, // A break, continue, or debugger keyword.
97
kOtherKeyword, // A const, default, or var keyword.
100
// Consumes an appropriate amount of input and returns an appropriate token.
101
JsKeywords::Type ConsumeOpenBrace(StringPiece* token_out);
102
JsKeywords::Type ConsumeCloseBrace(StringPiece* token_out);
103
JsKeywords::Type ConsumeOpenBracket(StringPiece* token_out);
104
JsKeywords::Type ConsumeCloseBracket(StringPiece* token_out);
105
JsKeywords::Type ConsumeOpenParen(StringPiece* token_out);
106
JsKeywords::Type ConsumeCloseParen(StringPiece* token_out);
107
JsKeywords::Type ConsumeBlockComment(StringPiece* token_out);
108
JsKeywords::Type ConsumeLineComment(StringPiece* token_out);
109
JsKeywords::Type ConsumeColon(StringPiece* token_out);
110
JsKeywords::Type ConsumeNumber(StringPiece* token_out);
111
JsKeywords::Type ConsumeOperator(StringPiece* token_out);
112
JsKeywords::Type ConsumePeriod(StringPiece* token_out);
113
JsKeywords::Type ConsumeQuestionMark(StringPiece* token_out);
114
JsKeywords::Type ConsumeRegex(StringPiece* token_out);
115
JsKeywords::Type ConsumeSemicolon(StringPiece* token_out);
116
JsKeywords::Type ConsumeSlash(StringPiece* token_out);
117
JsKeywords::Type ConsumeString(StringPiece* token_out);
119
// For each of these methods, if the start of the input is that kind of
120
// token, consumes the token and returns true, otherwise returns false
121
// without making changes.
122
bool TryConsumeComment(
123
JsKeywords::Type* type_out, StringPiece* token_out);
124
bool TryConsumeIdentifierOrKeyword(
125
JsKeywords::Type* type_out, StringPiece* token_out);
126
bool TryConsumeWhitespace(
127
bool allow_semicolon_insertion,
128
JsKeywords::Type* type_out, StringPiece* token_out);
130
// Sets error_ to true and returns an error token.
131
JsKeywords::Type Error(StringPiece* token_out);
133
// Stores the next num_chars characters of the input into *token_out, and
134
// then increments the start of input_ by num_chars characters. If
135
// non_whitespace is true, also sets start_of_line_ to false.
136
void Emit(int num_chars, bool non_whitespace, StringPiece* token_out);
138
// Pushes a new state onto the parse_stack_, merging states as needed.
139
void PushBlockHeader();
140
void PushExpression();
143
// If a semicolon will be inserted between the previous token and the next
144
// token (assuming there was a linebreak in between) that _wouldn't_ be
145
// inserted if the linebreak weren't there, update the parse stack to reflect
146
// the semicolon insertion and return true; otherwise do nothing and return
// false.
148
bool TryInsertLinebreakSemicolon();
150
const JsTokenizerPatterns* patterns_;
151
std::vector<ParseState> parse_stack_;
152
std::deque<std::pair<JsKeywords::Type, StringPiece> > lookahead_queue_;
153
StringPiece input_; // The portion of input that has yet to be consumed.
154
bool start_of_line_; // No non-whitespace/comment tokens on this line yet.
157
DISALLOW_COPY_AND_ASSIGN(JsTokenizer);
160
// Structure to store RE2 patterns that can be shared by instances of
161
// JsTokenizer. These patterns are slightly expensive to compile, so we'd
162
// rather not create one for every JsTokenizer instance, but unfortunately C++
163
// static initializers can run in non-deterministic order and cause other
164
// integration issues. Instead, you must create a JsTokenizerPatterns object
165
// yourself and pass it to the JsTokenizer constructor; ideally, you would just
166
// create one and share it for all JsTokenizer instances.
167
struct JsTokenizerPatterns {
169
JsTokenizerPatterns();
170
~JsTokenizerPatterns();
172
const RE2 identifier_pattern;
173
const RE2 line_comment_pattern;
174
const RE2 numeric_literal_pattern;
175
const RE2 operator_pattern;
176
const RE2 regex_literal_pattern;
177
const RE2 string_literal_pattern;
178
const RE2 whitespace_pattern;
179
const RE2 line_continuation_pattern;
182
DISALLOW_COPY_AND_ASSIGN(JsTokenizerPatterns);
187
} // namespace pagespeed
189
#endif // PAGESPEED_KERNEL_JS_JS_TOKENIZER_H_