1
// Copyright 2009 The Go Authors. All rights reserved.
2
// Use of this source code is governed by a BSD-style
3
// license that can be found in the LICENSE file.
5
// A scanner for Go source text. Takes a []byte as source which can
6
// then be tokenized through repeated calls to the Scan function.
10
// Typical use:
//
//	var s Scanner
//	fset := token.NewFileSet() // position information is relative to fset
//	file := fset.AddFile(filename, fset.Base(), len(src)) // register file
//	s.Init(file, src, nil /* no error handler */, 0)
//	for {
//		pos, tok, lit := s.Scan()
//		if tok == token.EOF {
//			break
//		}
//		// do something here with pos, tok, and lit
//	}
33
// A Scanner holds the scanner's internal state while processing
34
// a given text. It can be allocated as part of another data
35
// structure but must be initialized via Init before use.
39
file *token.File // source file handle
40
dir string // directory portion of file.Name()
42
err ErrorHandler // error reporting; or nil
43
mode uint // scanning mode
46
ch int // current character
47
offset int // character offset
48
rdOffset int // reading offset (position after current character)
49
lineOffset int // current line offset
50
insertSemi bool // insert a semicolon before next newline
52
// public state - ok to modify
53
ErrorCount int // number of errors encountered
57
// Read the next Unicode char into S.ch.
58
// S.ch < 0 means end-of-file.
60
func (S *Scanner) next() {
61
if S.rdOffset < len(S.src) {
64
S.lineOffset = S.offset
65
S.file.AddLine(S.offset)
67
r, w := int(S.src[S.rdOffset]), 1
70
S.error(S.offset, "illegal character NUL")
73
r, w = utf8.DecodeRune(S.src[S.rdOffset:])
74
if r == utf8.RuneError && w == 1 {
75
S.error(S.offset, "illegal UTF-8 encoding")
83
S.lineOffset = S.offset
84
S.file.AddLine(S.offset)
91
// The mode parameter to the Init function is a set of flags (or 0).
92
// They control scanner behavior.
95
ScanComments = 1 << iota // return comments as COMMENT tokens
96
AllowIllegalChars // do not report an error for illegal chars
97
InsertSemis // automatically insert semicolons
100
// Init prepares the scanner S to tokenize the text src by setting the
101
// scanner at the beginning of src. The scanner uses the file set file
102
// for position information and it adds line information for each line.
103
// It is ok to re-use the same file when re-scanning the same file as
104
// line information which is already present is ignored. Init causes a
105
// panic if the file size does not match the src size.
107
// Calls to Scan will use the error handler err if they encounter a
108
// syntax error and err is not nil. Also, for each error encountered,
109
// the Scanner field ErrorCount is incremented by one. The mode parameter
110
// determines how comments, illegal characters, and semicolons are handled.
112
// Note that Init may call err if there is an error in the first character of the file.
115
func (S *Scanner) Init(file *token.File, src []byte, err ErrorHandler, mode uint) {
116
// Explicitly initialize all fields since a scanner may be reused.
117
if file.Size() != len(src) {
118
panic("file size does not match src len")
121
S.dir, _ = filepath.Split(file.Name())
137
func charString(ch int) string {
163
return "'" + s + "' (U+" + strconv.Itob(ch, 16) + ")"
167
func (S *Scanner) error(offs int, msg string) {
169
S.err.Error(S.file.Position(S.file.Pos(offs)), msg)
175
// prefix is the leading text that interpretLineComment requires before it
// treats a comment as a //line directive.
var prefix = []byte("//line ")
177
func (S *Scanner) interpretLineComment(text []byte) {
178
if bytes.HasPrefix(text, prefix) {
179
// get filename and line number, if any
180
if i := bytes.Index(text, []byte{':'}); i > 0 {
181
if line, err := strconv.Atoi(string(text[i+1:])); err == nil && line > 0 {
182
// valid //line filename:line comment;
183
filename := filepath.Clean(string(text[len(prefix):i]))
184
if filename[0] != '/' {
185
// make filename relative to current directory
186
filename = filepath.Join(S.dir, filename)
188
// update scanner position
189
S.file.AddLineInfo(S.lineOffset, filename, line-1) // -1 since comment applies to next line
196
func (S *Scanner) scanComment() {
197
// initial '/' already consumed; S.ch == '/' || S.ch == '*'
198
offs := S.offset - 1 // position of initial '/'
203
for S.ch != '\n' && S.ch >= 0 {
206
if offs == S.lineOffset {
207
// comment starts at the beginning of the current line
208
S.interpretLineComment(S.src[offs:S.offset])
218
if ch == '*' && S.ch == '/' {
224
S.error(offs, "comment not terminated")
228
func (S *Scanner) findLineEnd() bool {
229
// initial '/' already consumed
231
defer func(offs int) {
232
// reset scanner state to where it was upon calling findLineEnd
235
S.rdOffset = offs + 1
236
S.next() // consume initial '/' again
239
// read ahead until a newline, EOF, or non-comment token is found
240
for S.ch == '/' || S.ch == '*' {
242
//-style comment always contains a newline
245
/*-style comment: look for newline */
253
if ch == '*' && S.ch == '/' {
258
S.skipWhitespace() // S.insertSemi is set
259
if S.ch < 0 || S.ch == '\n' {
266
S.next() // consume '/'
273
// isLetter reports whether ch is an ASCII letter, an underscore, or a
// non-ASCII Unicode letter. Negative values (end-of-file) and values beyond
// unicode.MaxRune are never letters.
func isLetter(ch int) bool {
	if 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_' {
		return true
	}
	// unicode.IsLetter takes a rune; the range check keeps the conversion
	// from silently truncating an out-of-range int.
	return 0x80 <= ch && ch <= unicode.MaxRune && unicode.IsLetter(rune(ch))
}
278
// isDigit reports whether ch is an ASCII decimal digit or a non-ASCII
// Unicode decimal digit. Negative values (end-of-file) and values beyond
// unicode.MaxRune are never digits.
func isDigit(ch int) bool {
	if '0' <= ch && ch <= '9' {
		return true
	}
	// unicode.IsDigit takes a rune; the range check keeps the conversion
	// from silently truncating an out-of-range int.
	return 0x80 <= ch && ch <= unicode.MaxRune && unicode.IsDigit(rune(ch))
}
283
func (S *Scanner) scanIdentifier() token.Token {
285
for isLetter(S.ch) || isDigit(S.ch) {
288
return token.Lookup(S.src[offs:S.offset])
292
// digitVal returns the value of ch read as a hexadecimal digit (0-15,
// accepting both 'a'-'f' and 'A'-'F'). For any other character, including
// negative values, it returns 16, so callers can test digitVal(ch) < base
// for any base up to 16.
func digitVal(ch int) int {
	switch {
	case '0' <= ch && ch <= '9':
		return ch - '0'
	case 'a' <= ch && ch <= 'f':
		return ch - 'a' + 10
	case 'A' <= ch && ch <= 'F':
		return ch - 'A' + 10
	}
	return 16 // larger than any legal digit val
}
305
func (S *Scanner) scanMantissa(base int) {
306
for digitVal(S.ch) < base {
312
func (S *Scanner) scanNumber(seenDecimalPoint bool) token.Token {
313
// digitVal(S.ch) < 10
316
if seenDecimalPoint {
326
if S.ch == 'x' || S.ch == 'X' {
331
// octal int or float
332
seenDecimalDigit := false
334
if S.ch == '8' || S.ch == '9' {
335
// illegal octal int or float
336
seenDecimalDigit = true
339
if S.ch == '.' || S.ch == 'e' || S.ch == 'E' || S.ch == 'i' {
343
if seenDecimalDigit {
344
S.error(offs, "illegal octal number")
350
// decimal int or float
361
if S.ch == 'e' || S.ch == 'E' {
364
if S.ch == '-' || S.ch == '+' {
380
func (S *Scanner) scanEscape(quote int) {
383
var i, base, max uint32
385
case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
388
case '0', '1', '2', '3', '4', '5', '6', '7':
389
i, base, max = 3, 8, 255
392
i, base, max = 2, 16, 255
395
i, base, max = 4, 16, unicode.MaxRune
398
i, base, max = 8, 16, unicode.MaxRune
400
S.next() // always make progress
401
S.error(offs, "unknown escape sequence")
406
for ; i > 0 && S.ch != quote && S.ch >= 0; i-- {
407
d := uint32(digitVal(S.ch))
409
S.error(S.offset, "illegal character in escape sequence")
415
// in case of an error, consume remaining chars
416
for ; i > 0 && S.ch != quote && S.ch >= 0; i-- {
419
if x > max || 0xd800 <= x && x < 0xe000 {
420
S.error(offs, "escape sequence is invalid Unicode code point")
425
func (S *Scanner) scanChar() {
426
// '\'' opening already consumed
434
if ch == '\n' || ch < 0 {
435
S.error(offs, "character literal not terminated")
447
S.error(offs, "illegal character literal")
452
func (S *Scanner) scanString() {
453
// '"' opening already consumed
459
if ch == '\n' || ch < 0 {
460
S.error(offs, "string not terminated")
472
func (S *Scanner) scanRawString() {
473
// '`' opening already consumed
480
S.error(offs, "string not terminated")
489
func (S *Scanner) skipWhitespace() {
490
for S.ch == ' ' || S.ch == '\t' || S.ch == '\n' && !S.insertSemi || S.ch == '\r' {
496
// Helper functions for scanning multi-byte tokens such as >> += >>= .
497
// Different routines recognize different length tok_i based on matches
498
// of ch_i. If a token ends in '=', the result is tok1 or tok3
499
// respectively. Otherwise, the result is tok0 if there was no other
500
// matching character, or tok2 if the matching character was ch2.
502
func (S *Scanner) switch2(tok0, tok1 token.Token) token.Token {
511
func (S *Scanner) switch3(tok0, tok1 token.Token, ch2 int, tok2 token.Token) token.Token {
524
func (S *Scanner) switch4(tok0, tok1 token.Token, ch2 int, tok2, tok3 token.Token) token.Token {
541
// newline is the literal that Scan returns together with token.SEMICOLON
// when the semicolon was inserted rather than present in the source.
var newline = []byte{'\n'}
543
// Scan scans the next token and returns the token position pos,
544
// the token tok, and the literal text lit corresponding to the
545
// token. The source end is indicated by token.EOF.
547
// If the returned token is token.SEMICOLON, the corresponding
548
// literal value is ";" if the semicolon was present in the source,
549
// and "\n" if the semicolon was inserted because of a newline or at EOF.
552
// For more tolerant parsing, Scan will return a valid token if
553
// possible even if a syntax error was encountered. Thus, even
554
// if the resulting token sequence contains no illegal tokens,
555
// a client may not assume that no error occurred. Instead it
556
// must check the scanner's ErrorCount or the number of calls
557
// of the error handler, if there was one installed.
559
// Scan adds line information to the file added to the file
560
// set with Init. Token positions are relative to that file
561
// and thus relative to the file set.
563
func (S *Scanner) Scan() (token.Pos, token.Token, []byte) {
567
// current token start
572
// determine token value
575
tok = S.scanIdentifier()
577
case token.IDENT, token.BREAK, token.CONTINUE, token.FALLTHROUGH, token.RETURN:
580
case digitVal(ch) < 10:
582
tok = S.scanNumber(false)
584
S.next() // always make progress
588
S.insertSemi = false // EOF consumed
589
return S.file.Pos(offs), token.SEMICOLON, newline
593
// we only reach here if S.insertSemi was
594
// set in the first place and exited early
595
// from S.skipWhitespace()
596
S.insertSemi = false // newline consumed
597
return S.file.Pos(offs), token.SEMICOLON, newline
611
tok = S.switch2(token.COLON, token.DEFINE)
613
if digitVal(S.ch) < 10 {
615
tok = S.scanNumber(true)
616
} else if S.ch == '.' {
628
tok = token.SEMICOLON
645
tok = S.switch3(token.ADD, token.ADD_ASSIGN, '+', token.INC)
646
if tok == token.INC {
650
tok = S.switch3(token.SUB, token.SUB_ASSIGN, '-', token.DEC)
651
if tok == token.DEC {
655
tok = S.switch2(token.MUL, token.MUL_ASSIGN)
657
if S.ch == '/' || S.ch == '*' {
659
if S.insertSemi && S.findLineEnd() {
660
// reset position to the beginning of the comment
663
S.rdOffset = offs + 1
664
S.insertSemi = false // newline consumed
665
return S.file.Pos(offs), token.SEMICOLON, newline
668
if S.mode&ScanComments == 0 {
670
S.insertSemi = false // newline consumed
675
tok = S.switch2(token.QUO, token.QUO_ASSIGN)
678
tok = S.switch2(token.REM, token.REM_ASSIGN)
680
tok = S.switch2(token.XOR, token.XOR_ASSIGN)
686
tok = S.switch4(token.LSS, token.LEQ, '<', token.SHL, token.SHL_ASSIGN)
689
tok = S.switch4(token.GTR, token.GEQ, '>', token.SHR, token.SHR_ASSIGN)
691
tok = S.switch2(token.ASSIGN, token.EQL)
693
tok = S.switch2(token.NOT, token.NEQ)
697
tok = S.switch2(token.AND_NOT, token.AND_NOT_ASSIGN)
699
tok = S.switch3(token.AND, token.AND_ASSIGN, '&', token.LAND)
702
tok = S.switch3(token.OR, token.OR_ASSIGN, '|', token.LOR)
704
if S.mode&AllowIllegalChars == 0 {
705
S.error(offs, "illegal character "+charString(ch))
707
insertSemi = S.insertSemi // preserve insertSemi info
711
if S.mode&InsertSemis != 0 {
712
S.insertSemi = insertSemi
714
return S.file.Pos(offs), tok, S.src[offs:S.offset]