1
%% ``The contents of this file are subject to the Erlang Public License,
2
%% Version 1.1, (the "License"); you may not use this file except in
3
%% compliance with the License. You should have received a copy of the
4
%% Erlang Public License along with this software. If not, it can be
5
%% retrieved via the world wide web at http://www.erlang.org/.
7
%% Software distributed under the License is distributed on an "AS IS"
8
%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
9
%% the License for the specific language governing rights and limitations
12
%% The Initial Developer of the Original Code is Ericsson Utvecklings AB.
13
%% Portions created by Ericsson are Copyright 1999, Ericsson Utvecklings
14
%% AB. All Rights Reserved.''
16
%% $Id: core_scan.erl,v 1.1 2008/12/17 09:53:42 mikpe Exp $
18
%% Purpose: Scanner for Core Erlang.
20
%% For handling ISO 8859-1 (Latin-1) we use the following type
23
%% 000 - 037 NUL - US control
24
%% 040 - 057 SPC - / punctuation
25
%% 060 - 071 0 - 9 digit
26
%% 072 - 100 : - @ punctuation
27
%% 101 - 132 A - Z uppercase
28
%% 133 - 140 [ - ` punctuation
29
%% 141 - 172 a - z lowercase
30
%% 173 - 176 { - ~ punctuation
33
%% 240 - 277 NBSP - � punctuation
34
%% 300 - 326 � - � uppercase
36
%% 330 - 336 � - � uppercase
37
%% 337 - 366 � - � lowercase
39
%% 370 - 377 � - � lowercase
41
%% Many punctuation characters region have special meaning. Must
42
%% watch using � \327, bvery close to x \170
46
-export([string/1,string/2,tokens/3,format_error/1]).
48
-import(lists, [reverse/1]).
50
%% tokens(Continuation, CharList, StartPos) ->
51
%% {done, {ok, [Tok], EndPos}, Rest} |
52
%% {done, {error,{ErrorPos,core_scan,What}, EndPos}, Rest} |
53
%% {more, Continuation'}
54
%% This is the main function into the re-entrant scanner. It calls the
55
%% re-entrant pre-scanner until this says done, then calls scan/1 on
58
%% The continuation has the form:
59
%% {RestChars,CharsSoFar,CurrentPos,StartPos}
61
tokens([], Chars, Pos) -> %First call
62
tokens({[],[],Pos,Pos}, Chars, Pos);
63
tokens({Chars,SoFar0,Cp,Sp}, MoreChars, _) ->
64
In = Chars ++ MoreChars,
65
case pre_scan(In, SoFar0, Cp) of
66
{done,_,[],Ep} -> %Found nothing
68
{done,_,SoFar1,Ep} -> %Got complete tokens
69
Res = case scan(reverse(SoFar1), Sp) of
70
{ok,Toks} -> {ok,Toks,Ep};
71
{error,E} -> {error,E,Ep}
74
{more,Rest,SoFar1,Cp1} -> %Missing end token
75
{more,{Rest,SoFar1,Cp1,Sp}};
76
Other -> %An error has occurred
81
%% string([Char], StartPos) ->
82
%% {ok, [Tok], EndPos} |
83
%% {error,{Pos,core_scan,What}, EndPos}
85
string(Cs) -> string(Cs, 1).
88
%% Add an 'eof' to always get correct handling.
89
case string_pre_scan(Cs, [], Sp) of
90
{done,_,SoFar,Ep} -> %Got tokens
91
case scan(reverse(SoFar), Sp) of
92
{ok,Toks} -> {ok,Toks,Ep};
93
{error,E} -> {error,E,Ep}
95
Other -> Other %An error has occurred
98
%% string_pre_scan(Cs, SoFar0, StartPos) ->
99
%% {done,Rest,SoFar,EndPos} | {error,E,EndPos}.
101
string_pre_scan(Cs, SoFar0, Sp) ->
102
case pre_scan(Cs, SoFar0, Sp) of
103
{done,Rest,SoFar1,Ep} -> %Got complete tokens
104
{done,Rest,SoFar1,Ep};
105
{more,Rest,SoFar1,Ep} -> %Missing end token
106
string_pre_scan(Rest ++ eof, SoFar1, Ep);
107
Other -> Other %An error has occurred
110
%% format_error(Error)
111
%% Return a string describing the error.
113
format_error({string,Quote,Head}) ->
114
["unterminated " ++ string_thing(Quote) ++
115
" starting with " ++ io_lib:write_string(Head,Quote)];
116
format_error({illegal,Type}) -> io_lib:fwrite("illegal ~w", [Type]);
117
format_error(char) -> "unterminated character";
118
format_error(scan) -> "premature end";
119
format_error({base,Base}) -> io_lib:fwrite("illegal base '~w'", [Base]);
120
format_error(float) -> "bad float";
121
format_error(Other) -> io_lib:write(Other).
123
string_thing($') -> "atom";
124
string_thing($") -> "string".
126
%% Re-entrant pre-scanner.
128
%% If the input list of characters is insufficient to build a term the
129
%% scanner returns a request for more characters and a continuation to be
130
%% used when trying to build a term with more characters. To indicate
131
%% end-of-file the input character list should be replaced with 'eof'
132
%% as an empty list has meaning.
134
%% When more characters are need inside a comment, string or quoted
135
%% atom, which can become rather long, instead of pushing the
136
%% characters read so far back onto RestChars to be reread, a special
137
%% reentry token is returned indicating the middle of a construct.
138
%% The token is the start character as an atom, '%', '"' and '\''.
140
%% pre_scan([Char], SoFar, StartPos) ->
141
%% {done,RestChars,ScannedChars,NewPos} |
142
%% {more,RestChars,ScannedChars,NewPos} |
143
%% {error,{ErrorPos,core_scan,Description},NewPos}.
144
%% Main pre-scan function. It has been split into 2 functions because of
145
%% efficiency, with a good indexing compiler it would be unnecessary.
147
pre_scan([C|Cs], SoFar, Pos) ->
148
pre_scan(C, Cs, SoFar, Pos);
149
pre_scan([], SoFar, Pos) ->
151
pre_scan(eof, SoFar, Pos) ->
152
{done,eof,SoFar,Pos}.
154
%% pre_scan(Char, [Char], SoFar, Pos)
156
pre_scan($$, Cs0, SoFar0, Pos) ->
157
case pre_char(Cs0, [$$|SoFar0]) of
159
pre_scan(Cs, SoFar, Pos);
161
{more,[$$|Cs0],SoFar0, Pos};
163
pre_error(char, Pos, Pos)
165
pre_scan($', Cs, SoFar, Pos) ->
166
pre_string(Cs, $', '\'', Pos, [$'|SoFar], Pos);
167
pre_scan({'\'',Sp}, Cs, SoFar, Pos) -> %Re-entering quoted atom
168
pre_string(Cs, $', '\'', Sp, SoFar, Pos);
169
pre_scan($", Cs, SoFar, Pos) ->
170
pre_string(Cs, $", '"', Pos, [$"|SoFar], Pos);
171
pre_scan({'"',Sp}, Cs, SoFar, Pos) -> %Re-entering string
172
pre_string(Cs, $", '"', Sp, SoFar, Pos);
173
pre_scan($%, Cs, SoFar, Pos) ->
174
pre_comment(Cs, SoFar, Pos);
175
pre_scan('%', Cs, SoFar, Pos) -> %Re-entering comment
176
pre_comment(Cs, SoFar, Pos);
177
pre_scan($\n, Cs, SoFar, Pos) ->
178
pre_scan(Cs, [$\n|SoFar], Pos+1);
179
pre_scan(C, Cs, SoFar, Pos) ->
180
pre_scan(Cs, [C|SoFar], Pos).
182
%% pre_string([Char], Quote, Reent, StartPos, SoFar, Pos)
184
pre_string([Q|Cs], Q, _, _, SoFar, Pos) ->
185
pre_scan(Cs, [Q|SoFar], Pos);
186
pre_string([$\n|Cs], Q, Reent, Sp, SoFar, Pos) ->
187
pre_string(Cs, Q, Reent, Sp, [$\n|SoFar], Pos+1);
188
pre_string([$\\|Cs0], Q, Reent, Sp, SoFar0, Pos) ->
189
case pre_escape(Cs0, SoFar0) of
191
pre_string(Cs, Q, Reent, Sp, SoFar, Pos);
193
{more,[{Reent,Sp},$\\|Cs0],SoFar0,Pos};
195
pre_string_error(Q, Sp, SoFar0, Pos)
197
pre_string([C|Cs], Q, Reent, Sp, SoFar, Pos) ->
198
pre_string(Cs, Q, Reent, Sp, [C|SoFar], Pos);
199
pre_string([], _, Reent, Sp, SoFar, Pos) ->
200
{more,[{Reent,Sp}],SoFar,Pos};
201
pre_string(eof, Q, _, Sp, SoFar, Pos) ->
202
pre_string_error(Q, Sp, SoFar, Pos).
204
pre_string_error(Q, Sp, SoFar, Pos) ->
205
S = reverse(string:substr(SoFar, 1, string:chr(SoFar, Q)-1)),
206
pre_error({string,Q,string:substr(S, 1, 16)}, Sp, Pos).
208
pre_char([C|Cs], SoFar) -> pre_char(C, Cs, SoFar);
209
pre_char([], _) -> more;
210
pre_char(eof, _) -> error.
212
pre_char($\\, Cs, SoFar) ->
213
pre_escape(Cs, SoFar);
214
pre_char(C, Cs, SoFar) ->
217
pre_escape([$^|Cs0], SoFar) ->
220
{Cs,[C3,$^,$\\|SoFar]};
224
pre_escape([C|Cs], SoFar) ->
226
pre_escape([], _) -> more;
227
pre_escape(eof, _) -> error.
229
%% pre_comment([Char], SoFar, Pos)
230
%% Comments are replaced by one SPACE.
232
pre_comment([$\n|Cs], SoFar, Pos) ->
233
pre_scan(Cs, [$\n,$\s|SoFar], Pos+1); %Terminate comment
234
pre_comment([_|Cs], SoFar, Pos) ->
235
pre_comment(Cs, SoFar, Pos);
236
pre_comment([], SoFar, Pos) ->
237
{more,['%'],SoFar,Pos};
238
pre_comment(eof, Sofar, Pos) ->
239
pre_scan(eof, [$\s|Sofar], Pos).
241
pre_error(E, Epos, Pos) ->
242
{error,{Epos,core_scan,E}, Pos}.
244
%% scan(CharList, StartPos)
245
%% This takes a list of characters and tries to tokenise them.
247
%% The token list is built in reverse order (in a stack) to save appending
248
%% and then reversed when all the tokens have been collected. Most tokens
249
%% are built in the same way.
253
%% {error,{ErrorPos,core_scan,What}}
258
%% scan1(Characters, TokenStack, Position)
259
%% Scan a list of characters into tokens.
261
scan1([$\n|Cs], Toks, Pos) -> %Skip newline
262
scan1(Cs, Toks, Pos+1);
263
scan1([C|Cs], Toks, Pos) when C >= $\000, C =< $\s -> %Skip control chars
264
scan1(Cs, Toks, Pos);
265
scan1([C|Cs], Toks, Pos) when C >= $\200, C =< $\240 ->
266
scan1(Cs, Toks, Pos);
267
scan1([C|Cs], Toks, Pos) when C >= $a, C =< $z -> %Keywords
268
scan_key_word(C, Cs, Toks, Pos);
269
scan1([C|Cs], Toks, Pos) when C >= $�, C =< $�, C /= $� ->
270
scan_key_word(C, Cs, Toks, Pos);
271
scan1([C|Cs], Toks, Pos) when C >= $A, C =< $Z -> %Variables
272
scan_variable(C, Cs, Toks, Pos);
273
scan1([C|Cs], Toks, Pos) when C >= $�, C =< $�, C /= $� ->
274
scan_variable(C, Cs, Toks, Pos);
275
scan1([C|Cs], Toks, Pos) when C >= $0, C =< $9 -> %Numbers
276
scan_number(C, Cs, Toks, Pos);
277
scan1([$-,C|Cs], Toks, Pos) when C >= $0, C =< $9 -> %Signed numbers
278
scan_signed_number($-, C, Cs, Toks, Pos);
279
scan1([$+,C|Cs], Toks, Pos) when C >= $0, C =< $9 -> %Signed numbers
280
scan_signed_number($+, C, Cs, Toks, Pos);
281
scan1([$_|Cs], Toks, Pos) -> %_ variables
282
scan_variable($_, Cs, Toks, Pos);
283
scan1([$$|Cs0], Toks, Pos) -> %Character constant
284
{C,Cs,Pos1} = scan_char(Cs0, Pos),
285
scan1(Cs, [{char,Pos,C}|Toks], Pos1);
286
scan1([$'|Cs0], Toks, Pos) -> %Atom (always quoted)
287
{S,Cs1,Pos1} = scan_string(Cs0, $', Pos),
288
case catch list_to_atom(S) of
290
scan1(Cs1, [{atom,Pos,A}|Toks], Pos1);
291
_Error -> scan_error({illegal,atom}, Pos)
293
scan1([$"|Cs0], Toks, Pos) -> %String
294
{S,Cs1,Pos1} = scan_string(Cs0, $", Pos),
295
scan1(Cs1, [{string,Pos,S}|Toks], Pos1);
296
%% Punctuation characters and operators, first recognise multiples.
297
scan1("->" ++ Cs, Toks, Pos) ->
298
scan1(Cs, [{'->',Pos}|Toks], Pos);
299
scan1("-|" ++ Cs, Toks, Pos) ->
300
scan1(Cs, [{'-|',Pos}|Toks], Pos);
301
scan1([C|Cs], Toks, Pos) -> %Punctuation character
302
P = list_to_atom([C]),
303
scan1(Cs, [{P,Pos}|Toks], Pos);
304
scan1([], Toks0, _) ->
305
Toks = reverse(Toks0),
308
%% scan_key_word(FirstChar, CharList, Tokens, Pos)
309
%% scan_variable(FirstChar, CharList, Tokens, Pos)
311
scan_key_word(C, Cs0, Toks, Pos) ->
312
{Wcs,Cs} = scan_name(Cs0, []),
313
case catch list_to_atom([C|reverse(Wcs)]) of
314
Name when atom(Name) ->
315
scan1(Cs, [{Name,Pos}|Toks], Pos);
316
_Error -> scan_error({illegal,atom}, Pos)
319
scan_variable(C, Cs0, Toks, Pos) ->
320
{Wcs,Cs} = scan_name(Cs0, []),
321
case catch list_to_atom([C|reverse(Wcs)]) of
322
Name when atom(Name) ->
323
scan1(Cs, [{var,Pos,Name}|Toks], Pos);
324
_Error -> scan_error({illegal,var}, Pos)
327
%% scan_name(Cs) -> lists:splitwith(fun (C) -> name_char(C) end, Cs).
329
scan_name([C|Cs], Ncs) ->
331
true -> scan_name(Cs, [C|Ncs]);
332
false -> {Ncs,[C|Cs]} %Must rebuild here, sigh!
334
scan_name([], Ncs) ->
337
name_char(C) when C >= $a, C =< $z -> true;
338
name_char(C) when C >= $�, C =< $�, C /= $� -> true;
339
name_char(C) when C >= $A, C =< $Z -> true;
340
name_char(C) when C >= $�, C =< $�, C /= $� -> true;
341
name_char(C) when C >= $0, C =< $9 -> true;
342
name_char($_) -> true;
343
name_char($@) -> true;
344
name_char(_) -> false.
346
%% scan_string(CharList, QuoteChar, Pos) -> {StringChars,RestChars,NewPos}.
348
scan_string(Cs, Q, Pos) ->
349
scan_string(Cs, [], Q, Pos).
351
scan_string([Q|Cs], Scs, Q, Pos) ->
352
{reverse(Scs),Cs,Pos};
353
scan_string([$\n|Cs], Scs, Q, Pos) ->
354
scan_string(Cs, [$\n|Scs], Q, Pos+1);
355
scan_string([$\\|Cs0], Scs, Q, Pos) ->
356
{C,Cs,Pos1} = scan_escape(Cs0, Pos),
357
scan_string(Cs, [C|Scs], Q, Pos1);
358
scan_string([C|Cs], Scs, Q, Pos) ->
359
scan_string(Cs, [C|Scs], Q, Pos).
361
%% scan_char(Chars, Pos) -> {Char,RestChars,NewPos}.
362
%% Read a single character from a character constant. The pre-scan
363
%% phase has checked for errors here.
365
scan_char([$\\|Cs], Pos) ->
366
scan_escape(Cs, Pos);
367
scan_char([$\n|Cs], Pos) -> %Newline
369
scan_char([C|Cs], Pos) ->
372
scan_escape([O1,O2,O3|Cs], Pos) when %\<1-3> octal digits
373
O1 >= $0, O1 =< $7, O2 >= $0, O2 =< $7, O3 >= $0, O3 =< $7 ->
374
Val = (O1*8 + O2)*8 + O3 - 73*$0,
376
scan_escape([O1,O2|Cs], Pos) when
377
O1 >= $0, O1 =< $7, O2 >= $0, O2 =< $7 ->
378
Val = (O1*8 + O2) - 9*$0,
380
scan_escape([O1|Cs], Pos) when
381
O1 >= $0, O1 =< $7 ->
383
scan_escape([$^,C|Cs], Pos) -> %\^X -> CTL-X
386
%scan_escape([$\n,C1|Cs],Pos) ->
388
%scan_escape([C,C1|Cs],Pos) when C >= $\000, C =< $\s ->
390
scan_escape([$\n|Cs],Pos) ->
392
scan_escape([C0|Cs],Pos) ->
396
escape_char($n) -> $\n; %\n = LF
397
escape_char($r) -> $\r; %\r = CR
398
escape_char($t) -> $\t; %\t = TAB
399
escape_char($v) -> $\v; %\v = VT
400
escape_char($b) -> $\b; %\b = BS
401
escape_char($f) -> $\f; %\f = FF
402
escape_char($e) -> $\e; %\e = ESC
403
escape_char($s) -> $\s; %\s = SPC
404
escape_char($d) -> $\d; %\d = DEL
407
%% scan_number(Char, CharList, TokenStack, Pos)
408
%% We can handle simple radix notation:
409
%% <digit>#<digits> - the digits read in that base
410
%% <digits> - the digits in base 10
412
%% <digits>.<digits>E+-<digits>
414
%% Except for explicitly based integers we build a list of all the
415
%% characters and then use list_to_integer/1 or list_to_float/1 to
416
%% generate the value.
418
%% SPos == Start position
419
%% CPos == Current position
421
scan_number(C, Cs0, Toks, Pos) ->
422
{Ncs,Cs,Pos1} = scan_integer(Cs0, [C], Pos),
423
scan_after_int(Cs, Ncs, Toks, Pos, Pos1).
425
scan_signed_number(S, C, Cs0, Toks, Pos) ->
426
{Ncs,Cs,Pos1} = scan_integer(Cs0, [C,S], Pos),
427
scan_after_int(Cs, Ncs, Toks, Pos, Pos1).
429
scan_integer([C|Cs], Stack, Pos) when C >= $0, C =< $9 ->
430
scan_integer(Cs, [C|Stack], Pos);
431
scan_integer(Cs, Stack, Pos) ->
434
scan_after_int([$.,C|Cs0], Ncs0, Toks, SPos, CPos) when C >= $0, C =< $9 ->
435
{Ncs,Cs,CPos1} = scan_integer(Cs0, [C,$.|Ncs0], CPos),
436
scan_after_fraction(Cs, Ncs, Toks, SPos, CPos1);
437
scan_after_int([$#|Cs], Ncs, Toks, SPos, CPos) ->
438
case list_to_integer(reverse(Ncs)) of
439
Base when Base >= 2, Base =< 16 ->
440
scan_based_int(Cs, 0, Base, Toks, SPos, CPos);
442
scan_error({base,Base}, CPos)
444
scan_after_int(Cs, Ncs, Toks, SPos, CPos) ->
445
N = list_to_integer(reverse(Ncs)),
446
scan1(Cs, [{integer,SPos,N}|Toks], CPos).
448
scan_based_int([C|Cs], SoFar, Base, Toks, SPos, CPos) when
449
C >= $0, C =< $9, C < Base + $0 ->
450
Next = SoFar * Base + (C - $0),
451
scan_based_int(Cs, Next, Base, Toks, SPos, CPos);
452
scan_based_int([C|Cs], SoFar, Base, Toks, SPos, CPos) when
453
C >= $a, C =< $f, C < Base + $a - 10 ->
454
Next = SoFar * Base + (C - $a + 10),
455
scan_based_int(Cs, Next, Base, Toks, SPos, CPos);
456
scan_based_int([C|Cs], SoFar, Base, Toks, SPos, CPos) when
457
C >= $A, C =< $F, C < Base + $A - 10 ->
458
Next = SoFar * Base + (C - $A + 10),
459
scan_based_int(Cs, Next, Base, Toks, SPos, CPos);
460
scan_based_int(Cs, SoFar, _, Toks, SPos, CPos) ->
461
scan1(Cs, [{integer,SPos,SoFar}|Toks], CPos).
463
scan_after_fraction([$E|Cs], Ncs, Toks, SPos, CPos) ->
464
scan_exponent(Cs, [$E|Ncs], Toks, SPos, CPos);
465
scan_after_fraction([$e|Cs], Ncs, Toks, SPos, CPos) ->
466
scan_exponent(Cs, [$E|Ncs], Toks, SPos, CPos);
467
scan_after_fraction(Cs, Ncs, Toks, SPos, CPos) ->
468
case catch list_to_float(reverse(Ncs)) of
470
scan1(Cs, [{float,SPos,N}|Toks], CPos);
471
_Error -> scan_error({illegal,float}, SPos)
474
%% scan_exponent(CharList, NumberCharStack, TokenStack, StartPos, CurPos)
475
%% Generate an error here if E{+|-} not followed by any digits.
477
scan_exponent([$+|Cs], Ncs, Toks, SPos, CPos) ->
478
scan_exponent1(Cs, [$+|Ncs], Toks, SPos, CPos);
479
scan_exponent([$-|Cs], Ncs, Toks, SPos, CPos) ->
480
scan_exponent1(Cs, [$-|Ncs], Toks, SPos, CPos);
481
scan_exponent(Cs, Ncs, Toks, SPos, CPos) ->
482
scan_exponent1(Cs, Ncs, Toks, SPos, CPos).
484
scan_exponent1([C|Cs0], Ncs0, Toks, SPos, CPos) when C >= $0, C =< $9 ->
485
{Ncs,Cs,CPos1} = scan_integer(Cs0, [C|Ncs0], CPos),
486
case catch list_to_float(reverse(Ncs)) of
488
scan1(Cs, [{float,SPos,N}|Toks], CPos1);
489
_Error -> scan_error({illegal,float}, SPos)
491
scan_exponent1(_, _, _, _, CPos) ->
492
scan_error(float, CPos).
494
scan_error(In, Pos) ->
495
{error,{Pos,core_scan,In}}.