1
%% ``The contents of this file are subject to the Erlang Public License,
2
%% Version 1.1, (the "License"); you may not use this file except in
3
%% compliance with the License. You should have received a copy of the
4
%% Erlang Public License along with this software. If not, it can be
5
%% retrieved via the world wide web at http://www.erlang.org/.
7
%% Software distributed under the License is distributed on an "AS IS"
8
%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
9
%% the License for the specific language governing rights and
10
%% limitations under the License.
12
%% The Initial Developer of the Original Code is Ericsson Utvecklings
13
%% AB. Portions created by Ericsson are Copyright 1999, Ericsson
14
%% Utvecklings AB. All Rights Reserved.''
16
%% $Id: edoc_scanner.erl,v 1.8 2004/11/30 00:39:34 richardc Exp $
19
%% @copyright Richard Carlsson 2001-2003. Portions created by Ericsson
20
%% are Copyright 1999, Ericsson Utvecklings AB. All Rights Reserved.
21
%% @author Richard Carlsson <richardc@csd.uu.se>
25
%% @doc Tokeniser for EDoc. Based on the Erlang standard library module
26
%% {@link //stdlib/erl_scan}.
28
-module(edoc_scanner).
30
%% NOTE: the interface to this module is ancient and should be updated.
31
%% Please do not regard these exported functions as stable. Their
32
%% behaviour is described in the documentation of the module `erl_scan'.
34
%% Since there are no `full stop' tokens in EDoc specifications, the
35
%% `tokens' function *always* returns `{more, Continuation}' unless an
38
-export([string/1,string/2,format_error/1]).
40
-import(lists, [reverse/1]).
42
string(Cs) -> string(Cs, 1).
44
string(Cs, StartPos) ->
45
case scan(Cs, StartPos) of
46
{ok,Toks} -> {ok,Toks,StartPos};
47
{error,E} -> {error,E,StartPos}
50
%% format_error(Error)
51
%% Return a string describing the error.
53
format_error({string,Quote,Head}) ->
54
["unterminated string starting with " ++ io_lib:write_string(Head,Quote)];
55
format_error({illegal,Type}) -> io_lib:fwrite("illegal ~w", [Type]);
56
format_error(char) -> "unterminated character";
57
format_error(scan) -> "premature end";
58
format_error({base,Base}) -> io_lib:fwrite("illegal base '~w'", [Base]);
59
format_error(float) -> "bad float";
61
format_error(Other) -> io_lib:write(Other).
63
%% scan(CharList, StartPos)
64
%% This takes a list of characters and tries to tokenise them.
66
%% The token list is built in reverse order (in a stack) to save appending
67
%% and then reversed when all the tokens have been collected. Most tokens
68
%% are built in the same way.
72
%% {error,{ErrorPos,edoc_scanner,What}}
77
%% scan1(Characters, TokenStack, Position)
78
%% Scan a list of characters into tokens.
80
scan1([$\n|Cs], Toks, Pos) -> % Newline
81
scan1(Cs, Toks, Pos+1);
82
scan1([C|Cs], Toks, Pos) when C >= 0, C =< $ -> % Skip blanks
84
scan1([C|Cs], Toks, Pos) when C >= $a, C =< $z -> % Unquoted atom
85
scan_atom(C, Cs, Toks, Pos);
86
scan1([C|Cs], Toks, Pos) when C >= $0, C =< $9 -> % Numbers
87
scan_number(C, Cs, Toks, Pos);
88
scan1([$-,C| Cs], Toks, Pos) when C >= $0, C =< $9 -> % Signed numbers
89
scan_signed_number($-, C, Cs, Toks, Pos);
90
scan1([$+,C| Cs], Toks, Pos) when C >= $0, C =< $9 -> % Signed numbers
91
scan_signed_number($+, C, Cs, Toks, Pos);
92
scan1([C|Cs], Toks, Pos) when C >= $A, C =< $Z -> % Variables
93
scan_variable(C, Cs, Toks, Pos);
94
scan1([$_|Cs], Toks, Pos) -> % Variables
95
scan_variable($_, Cs, Toks, Pos);
96
scan1([$$|Cs], Toks, Pos) -> % Character constant
97
case scan_char_const(Cs, Toks, Pos) of
100
{error, truncated_char} ->
101
scan_error(char, Pos);
102
{error, illegal_character} ->
103
scan_error({illegal, char}, Pos)
105
scan1([$'|Cs0], Toks, Pos) -> % Quoted atom
106
case scan_string(Cs0, $', Pos) of
108
case catch list_to_atom(S) of
110
scan1(Cs1, [{atom,Pos,A}|Toks], Pos1);
111
_Error -> scan_error({illegal,atom}, Pos)
113
{error, premature_end} ->
114
scan_error({string,$',Cs0}, Pos);
115
{error, truncated_char} ->
116
scan_error(char, Pos);
117
{error, illegal_character} ->
118
scan_error({illegal, atom}, Pos)
120
scan1([$"|Cs0], Toks, Pos) -> % String
121
case scan_string(Cs0, $", Pos) of
124
[{string, Pos0, S0} | Toks1] ->
125
scan1(Cs1, [{string, Pos0, S0 ++ S} | Toks1],
128
scan1(Cs1, [{string,Pos,S}|Toks], Pos1)
130
{error, premature_end} ->
131
scan_error({string,$",Cs0}, Pos);
132
{error, truncated_char} ->
133
scan_error(char, Pos);
134
{error, illegal_character} ->
135
scan_error({illegal, string}, Pos)
137
%% Punctuation characters and operators, first recognise multiples.
138
scan1([$-,$>|Cs], Toks, Pos) ->
139
scan1(Cs, [{'->',Pos}|Toks], Pos);
140
scan1([$:,$:|Cs], Toks, Pos) ->
141
scan1(Cs, [{'::',Pos}|Toks], Pos);
142
scan1([$/,$/|Cs], Toks, Pos) ->
143
scan1(Cs, [{'//',Pos}|Toks], Pos);
144
scan1([C|Cs], Toks, Pos) -> % Punctuation character
145
P = list_to_atom([C]),
146
scan1(Cs, [{P,Pos}|Toks], Pos);
147
scan1([], Toks0, _Pos) ->
148
Toks = reverse(Toks0),
151
%% Note that `_' is not accepted as a variable token.
152
scan_variable(C, Cs, Toks, Pos) ->
153
{Wcs,Cs1} = scan_name(Cs, []),
154
W = [C|reverse(Wcs)],
157
scan_error({illegal,token}, Pos);
159
case catch list_to_atom(W) of
161
scan1(Cs1, [{var,Pos,A}|Toks], Pos);
163
scan_error({illegal,variable}, Pos)
167
scan_atom(C, Cs, Toks, Pos) ->
168
{Wcs,Cs1} = scan_name(Cs, []),
169
W = [C|reverse(Wcs)],
170
case catch list_to_atom(W) of
172
scan1(Cs1, [{atom,Pos,A}|Toks], Pos);
174
scan_error({illegal,token}, Pos)
177
%% scan_name(Cs) -> lists:splitwith(fun (C) -> name_char(C) end, Cs).
179
scan_name([C|Cs], Ncs) ->
182
scan_name(Cs, [C|Ncs]);
184
{Ncs,[C|Cs]} % Must rebuild here, sigh!
186
scan_name([], Ncs) ->
189
name_char(C) when C >= $a, C =< $z -> true;
190
name_char(C) when C >= $\337, C =< $\377, C /= $\367 -> true;
191
name_char(C) when C >= $A, C =< $Z -> true;
192
name_char(C) when C >= $\300, C =< $\336, C /= $\327 -> true;
193
name_char(C) when C >= $0, C =< $9 -> true;
194
name_char($_) -> true;
195
name_char($@) -> true;
196
name_char(_) -> false.
198
%% scan_string(CharList, QuoteChar, Pos) ->
199
%% {StringChars,RestChars, NewPos}
201
scan_string(Cs, Quote, Pos) ->
202
scan_string(Cs, [], Quote, Pos).
204
scan_string([Quote|Cs], Scs, Quote, Pos) ->
205
{reverse(Scs),Cs,Pos};
206
scan_string([], _Scs, _Quote, _Pos) ->
207
{error, premature_end};
208
scan_string(Cs0, Scs, Quote, Pos) ->
209
case scan_char(Cs0, Pos) of
211
%% Only build the string here
212
scan_string(Cs, [C|Scs], Quote, Pos1);
217
%% Note that space characters are not allowed
218
scan_char_const([$\040 | _Cs0], _Toks, _Pos) ->
219
{error, illegal_character};
220
scan_char_const(Cs0, Toks, Pos) ->
221
case scan_char(Cs0, Pos) of
223
scan1(Cs, [{char,Pos,C}|Toks], Pos1);
228
%% {Character,RestChars,NewPos} = scan_char(Chars, Pos)
229
%% Read a single character from a string or character constant. The
230
%% pre-scan phase has checked for errors here.
231
%% Note that control characters are not allowed.
233
scan_char([$\\|Cs], Pos) ->
234
scan_escape(Cs, Pos);
235
scan_char([C | _Cs], _Pos) when C =< 16#1f ->
236
{error, illegal_character};
237
scan_char([C|Cs], Pos) ->
239
scan_char([], _Pos) ->
240
{error, truncated_char}.
242
%% The following conforms to Standard Erlang escape sequences.
244
scan_escape([O1, O2, O3 | Cs], Pos) when % \<1-3> octal digits
245
O1 >= $0, O1 =< $3, O2 >= $0, O2 =< $7, O3 >= $0, O3 =< $7 ->
246
Val = (O1*8 + O2)*8 + O3 - 73*$0,
248
scan_escape([O1, O2 | Cs], Pos) when
249
O1 >= $0, O1 =< $7, O2 >= $0, O2 =< $7 ->
250
Val = (O1*8 + O2) - 9*$0,
252
scan_escape([O1 | Cs], Pos) when
253
O1 >= $0, O1 =< $7 ->
255
scan_escape([$^, C | Cs], Pos) -> % \^X -> CTL-X
256
if C >= $\100, C =< $\137 ->
258
true -> {error, illegal_control_character}
260
scan_escape([C | Cs], Pos) ->
261
case escape_char(C) of
262
C1 when C1 > $\000 -> {C1,Cs,Pos};
263
_ -> {error, undefined_escape_sequence}
265
scan_escape([], _Pos) ->
266
{error, truncated_char}.
268
%% Note that we return $\000 for undefined escapes.
269
escape_char($b) -> $\010; % \b = BS
270
escape_char($d) -> $\177; % \d = DEL
271
escape_char($e) -> $\033; % \e = ESC
272
escape_char($f) -> $\014; % \f = FF
273
escape_char($n) -> $\012; % \n = LF
274
escape_char($r) -> $\015; % \r = CR
275
escape_char($s) -> $\040; % \s = SPC
276
escape_char($t) -> $\011; % \t = HT
277
escape_char($v) -> $\013; % \v = VT
278
escape_char($\\) -> $\134; % \\ = \
279
escape_char($') -> $\047; % \' = '
280
escape_char($") -> $\042; % \" = "
281
escape_char(_C) -> $\000.
283
%% scan_number(Char, CharList, TokenStack, Pos)
284
%% We handle sign and radix notation:
285
%% [+-]<digits> - the digits in base [+-]10
286
%% [+-]<digits>.<digits>
287
%% [+-]<digits>.<digits>E+-<digits>
288
%% [+-]<digits>#<digits> - the digits read in base [+-]B
290
%% Except for explicitly based integers we build a list of all the
291
%% characters and then use list_to_integer/1 or list_to_float/1 to
292
%% generate the value.
294
%% SPos == Start position
295
%% CPos == Current position
297
scan_number(C, Cs0, Toks, Pos) ->
298
{Ncs,Cs,Pos1} = scan_integer(Cs0, [C], Pos),
299
scan_after_int(Cs, Ncs, Toks, Pos, Pos1).
301
scan_signed_number(S, C, Cs0, Toks, Pos) ->
302
{Ncs,Cs,Pos1} = scan_integer(Cs0, [C, S], Pos),
303
scan_after_int(Cs, Ncs, Toks, Pos, Pos1).
305
scan_integer([C|Cs], Stack, Pos) when C >= $0, C =< $9 ->
306
scan_integer(Cs, [C|Stack], Pos);
307
scan_integer(Cs, Stack, Pos) ->
310
scan_after_int([$.,C|Cs0], Ncs0, Toks, SPos, CPos) when C >= $0, C =< $9 ->
311
{Ncs,Cs,CPos1} = scan_integer(Cs0, [C,$.|Ncs0], CPos),
312
scan_after_fraction(Cs, Ncs, Toks, SPos, CPos1);
313
scan_after_int(Cs, Ncs, Toks, SPos, CPos) ->
314
N = list_to_integer(reverse(Ncs)),
315
scan1(Cs, [{integer,SPos,N}|Toks], CPos).
317
scan_after_fraction([$E|Cs], Ncs, Toks, SPos, CPos) ->
318
scan_exponent(Cs, [$E|Ncs], Toks, SPos, CPos);
319
scan_after_fraction([$e|Cs], Ncs, Toks, SPos, CPos) ->
320
scan_exponent(Cs, [$e|Ncs], Toks, SPos, CPos);
321
scan_after_fraction(Cs, Ncs, Toks, SPos, CPos) ->
322
case catch list_to_float(reverse(Ncs)) of
324
scan1(Cs, [{float,SPos,N}|Toks], CPos);
325
_Error -> scan_error({illegal,float}, SPos)
328
%% scan_exponent(CharList, NumberCharStack, TokenStack, StartPos, CurPos)
329
%% Generate an error here if E{+|-} not followed by any digits.
331
scan_exponent([$+|Cs], Ncs, Toks, SPos, CPos) ->
332
scan_exponent1(Cs, [$+|Ncs], Toks, SPos, CPos);
333
scan_exponent([$-|Cs], Ncs, Toks, SPos, CPos) ->
334
scan_exponent1(Cs, [$-|Ncs], Toks, SPos, CPos);
335
scan_exponent(Cs, Ncs, Toks, SPos, CPos) ->
336
scan_exponent1(Cs, Ncs, Toks, SPos, CPos).
338
scan_exponent1([C|Cs0], Ncs0, Toks, SPos, CPos) when C >= $0, C =< $9 ->
339
{Ncs,Cs,CPos1} = scan_integer(Cs0, [C|Ncs0], CPos),
340
case catch list_to_float(reverse(Ncs)) of
342
scan1(Cs, [{float,SPos,N}|Toks], CPos1);
343
_Error -> scan_error({illegal,float}, SPos)
345
scan_exponent1(_, _, _, _, CPos) ->
346
scan_error(float, CPos).
348
scan_error(In, Pos) ->
349
{error,{Pos,edoc_scanner,In}}.