1
/* Copyright 2009-2010 Yorba Foundation
3
* This software is licensed under the GNU Lesser General Public License
4
* (version 2.1 or later). See the COPYING file in this distribution.
12
CHAR, // an unrecognized punctuation character
13
CHAR_LITERAL, // a literal such as 'x'
18
ASTERISK, LEFT_BRACE, RIGHT_BRACE, LEFT_BRACKET, RIGHT_BRACKET, COLON, COMMA, EQUALS, ELLIPSIS,
19
HASH, LEFT_PAREN, RIGHT_PAREN, PERIOD, QUESTION_MARK, SEMICOLON, LESS_THAN, GREATER_THAN,
22
ABSTRACT, ASYNC, BASE, CLASS, CONST, CONSTRUCT, DELEGATE, ELSE, ENUM, EXTERN, FOR, FOREACH, IF,
23
INLINE, INTERFACE, INTERNAL, NAMESPACE, NEW, OUT, OVERRIDE, OWNED, PRIVATE, PROTECTED, PUBLIC,
24
REF, RETURN, SIGNAL, STATIC, STRUCT, THIS, UNOWNED, USING, VIRTUAL, WEAK, WHILE
32
const Keyword[] keywords = {
33
{ "abstract", Token.ABSTRACT },
34
{ "async", Token.ASYNC },
35
{ "base", Token.BASE },
36
{ "class", Token.CLASS },
37
{ "const", Token.CONST },
38
{ "construct", Token.CONSTRUCT },
39
{ "delegate", Token.DELEGATE },
40
{ "else", Token.ELSE },
41
{ "enum", Token.ENUM },
42
{ "extern", Token.EXTERN },
44
{ "foreach", Token.FOREACH },
46
{ "inline", Token.INLINE },
47
{ "interface", Token.INTERFACE },
48
{ "internal", Token.INTERNAL },
49
{ "namespace", Token.NAMESPACE },
52
{ "override", Token.OVERRIDE },
53
{ "owned", Token.OWNED },
54
{ "private", Token.PRIVATE },
55
{ "protected", Token.PROTECTED },
56
{ "public", Token.PUBLIC },
58
{ "return", Token.RETURN },
59
{ "signal", Token.SIGNAL },
60
{ "static", Token.STATIC },
61
{ "struct", Token.STRUCT },
62
{ "this", Token.THIS },
63
{ "unowned", Token.UNOWNED },
64
{ "using", Token.USING },
65
{ "virtual", Token.VIRTUAL },
66
{ "weak", Token.WEAK },
67
{ "while", Token.WHILE }
70
class Scanner : Object {
71
// The lookahead token. If not NONE, it extends from characters (token_start_char) to (input),
72
// and from positions (token_start) to (input_pos).
73
Token token = Token.NONE;
75
weak string token_start_char;
76
weak string input_begin;
82
// The last token retrieved with next_token() extends from characters (start_char) to
83
// (end_char), and from positions (start) to (end).
84
weak string start_char;
86
public int start; // starting character position
87
public int end; // ending character position
89
public Scanner(string input) {
95
input = input.next_char();
99
// Look at the character at the current input position without consuming it.
unichar peek_char() {
    unichar current = input.get_char();
    return current;
}
101
// Peek two characters ahead.
102
unichar peek_char2() {
103
return input == "" ? '\0' : input.next_char().get_char();
106
unichar next_char() {
107
unichar c = peek_char();
112
bool accept(unichar c) {
113
if (peek_char() == c) {
120
// Return true if the current token equals s.
121
bool match(string s) {
122
char *p = token_start_char;
124
while (*p != 0 && *q != 0 && *p == *q) {
128
return p == input && *q == 0;
131
// Read characters until we reach a triple quote (""") string terminator.
132
void read_triple_string() {
134
if (next_char() == '"' && accept('"') && accept('"'))
139
while (input != "") {
140
unichar c = next_char();
146
bool is_first_token_on_line() {
147
weak string line = input;
148
// Go back to the '#' character
149
line = line.prev_char();
150
if (direct_equal(line, input_begin))
154
line = line.prev_char();
155
unichar c = line.get_char();
156
if (direct_equal(line, input_begin) && c.isspace())
160
else if (!c.isspace())
166
while (input != "") {
167
token_start_char = input;
168
token_start = input_pos;
169
unichar c = next_char();
174
bool accept_all_chars_as_id = false;
176
accept_all_chars_as_id = true;
177
// Don't include the '@' in ID's
178
token_start_char = input;
179
token_start = input_pos;
184
if (c.isalpha() || c == '_' || (accept_all_chars_as_id && c.isalnum())) {
187
if (!c.isalnum() && c != '_')
191
// We don't use the foreach statement to iterate over the keywords array;
192
// that would copy the Keyword structure (and the string it contains) on
193
// each iteration, which would be slow.
194
if (!accept_all_chars_as_id) {
195
for (int i = 0 ; i < keywords.length ; ++i)
196
if (match(keywords[i].name))
197
return keywords[i].token;
203
unichar d = peek_char();
204
if (d == '/') { // single-line comment
205
while (input != "" && next_char() != '\n')
207
token_start_char = input;
208
token_start = input_pos;
211
if (d == '*') { // multi-line comment
212
advance(); // move past '*'
213
while (input != "") {
214
if (next_char() == '*' && peek_char() == '/') {
215
advance(); // move past '/'
219
token_start_char = input;
220
token_start = input_pos;
225
if (accept('"')) { // ""
226
if (accept('"')) // """
227
read_triple_string();
229
while (input != "") {
230
unichar d = next_char();
231
if (d == '"' || d == '\n')
233
else if (d == '\'') // escape sequence
237
return Token.STRING_LITERAL;
239
accept('\\'); // optional backslash beginning escape sequence
241
accept('\''); // closing single quote
242
return Token.CHAR_LITERAL;
243
case '*': return Token.ASTERISK;
244
case '{': return Token.LEFT_BRACE;
245
case '}': return Token.RIGHT_BRACE;
246
case '[': return Token.LEFT_BRACKET;
247
case ']': return Token.RIGHT_BRACKET;
248
case ':': return Token.COLON;
249
case ',': return Token.COMMA;
250
case '=': return Token.EQUALS;
252
if (is_first_token_on_line()) {
255
} else return Token.HASH;
256
case '(': return Token.LEFT_PAREN;
257
case ')': return Token.RIGHT_PAREN;
259
if (peek_char() == '.' && peek_char2() == '.') {
262
return Token.ELLIPSIS;
265
case '?': return Token.QUESTION_MARK;
266
case ';': return Token.SEMICOLON;
267
case '<': return Token.LESS_THAN;
268
case '>': return Token.GREATER_THAN;
269
default: return Token.CHAR;
275
public Token peek_token() {
276
if (token == Token.NONE)
277
token = read_token();
281
public Token next_token() {
282
Token t = peek_token();
284
start_char = token_start_char;
291
public bool accept_token(Token t) {
292
if (peek_token() == t) {
299
// True once the lookahead token is EOF, i.e. all input has been scanned.
public bool eof() {
    return Token.EOF == peek_token();
}
301
// Return the source text of the last token retrieved.
302
public string val() {
303
size_t bytes = (char *) end_char - (char *) start_char;
304
return start_char.ndup(bytes);
307
public unowned string get_start() {
311
public unowned string get_start_after_comments() {
312
// Skip any comments after the end character and take the first character after them
314
return token_start_char;