3
(* xmllexer.ml provide a generic XML lexer for Camlp4. *)
4
(* Manuel.Maarek@macs.hw.ac.uk (Heriot-Watt University) *)
5
(* Manuel.Maarek@spi.lip6.fr (LIP6) *)
6
(* Modif by VB for xmlp4 *)
9
(* derived from the default lexer of Camlp4: plexer.ml *)
10
(***********************************************************************)
14
(* Daniel de Rauglaudre, projet Cristal, INRIA Rocquencourt *)
16
(* Copyright 2002 Institut National de Recherche en Informatique et *)
17
(* Automatique. Distributed only by permission. *)
19
(***********************************************************************)
20
(* based on: plexer.ml,v 1.10 2002/07/19 *)
26
(* The string buffering machinery *)
28
value buff = ref (String.create 80);
31
if len >= String.length buff.val then
32
buff.val := buff.val ^ String.create (String.length buff.val)
39
add_rec len 0 where rec add_rec len i =
40
if i == String.length s then len else add_rec (store len s.[i]) (succ i)
42
value get_buff len = String.sub buff.val 0 len;
45
value string_of_space_char = fun
55
[ [: `('A'..'Z' | 'a'..'z' | '\192'..'\214' | '\216'..'\246' |
56
'\248'..'\255' | '0'..'9' | '_' | ''' as
64
[ [: `('!' | '?' | '~' | '=' | '@' | '^' | '&' | '+' | '-' | '*' | '/' |
65
'%' | '.' | ':' | '<' | '>' | '|' | '$' as
69
ident2 (store len c) s
73
[ [: `('0'..'9' | 'A'..'Z' | 'a'..'z' | '\192'..'\214' | '\216'..'\246' |
74
'\248'..'\255' | '_' | '!' | '%' | '&' | '*' | '+' | '-' | '.' |
75
'/' | ':' | '<' | '=' | '>' | '?' | '@' | '^' | '|' | '~' | ''' |
80
ident3 (store len c) s
84
value error_on_unknown_keywords = ref False;
85
value err loc msg = raise_with_loc loc (Token.Error msg);
87
value ((begin_attr, end_attr), waiting_for_attr) =
89
(((fun () -> b.val := True),
90
(fun () -> b.val := False)),
94
value ((push_tag,pop_tag),(size_tag_stack,string_of_tag_stack)) =
95
let stack = Stack.create () in
97
(fun t -> Stack.push t stack)
99
(fun () -> try Stack.pop stack with [Stack.Empty -> "" ])
101
((fun () -> Stack.length stack)
107
(fun tag -> s.val := (if (s.val == "") then " " else s.val) ^ tag)
113
value next_token_fun find_kwd fname lnum bolpos =
116
{Lexing.pos_fname = fname.val; Lexing.pos_lnum = lnum.val;
117
Lexing.pos_bol = bolpos.val; Lexing.pos_cnum = p}} in
118
let mkloc (bp, ep) = (make_pos bp, make_pos ep) in
119
let keyword_or_error (bp,ep) s =
120
let loc = mkloc (bp, ep) in
121
try (("", find_kwd s), loc) with
123
if error_on_unknown_keywords.val then err loc ("illegal token: " ^ s)
124
else (("", s), loc) ]
128
if waiting_for_attr () then
131
next_token_normal s )
132
and next_token_attr =
134
[ [: `(' ' | '\t'); s :] -> next_token s
135
| [: `('\013' | '\010'); s :] ep ->
136
do {bolpos.val := ep; incr lnum; next_token s}
137
| [: `('a'..'z' | 'A'..'Z' | '_' | ':' as c); s
138
:] -> name_attribut bp (store 0 c) s
139
| [: `'=' :] ep -> (("","="),mkloc (bp,ep))
140
| [: `'"'; s :] ep ->
141
(("VALUE", get_buff (value_attribut_in_double_quote bp 0 s)),mkloc (bp,ep))
142
| [: `'''; s :] ep ->
143
(("VALUE", get_buff (value_attribut_in_quote bp 0 s)),mkloc (bp,ep))
144
| [: `'$'; s :] -> (* for caml expressions *) camlexprattr (bp+1) 0 s
145
| [: `'/'; `'>' :] ep ->
146
let name = pop_tag () in
149
(("GAT",name),mkloc (bp,ep)) }
150
| [: `'>'; s :] -> do { end_attr (); next_token s}
152
and next_token_normal =
154
[ [: `(' ' | '\t' | '\026' | '\012' as c); s :] ep ->
155
let c_string = string_of_space_char c in
157
get_buff (whitespaces bp (mstore 0 c_string) s)),mkloc (bp,ep))
158
| [: `('\013' | '\010' as c); s :] ep ->
159
let c_string = string_of_space_char c in
160
do {bolpos.val := ep; incr lnum;
162
get_buff (whitespaces bp (mstore 0 c_string) s)),mkloc (bp,ep))}
163
| [: `('A'..'Z' | '\192'..'\214' | '\216'..'\222' | 'a'..'z' |
164
'\223'..'\246' | '\248'..'\255' | '_' | '1'..'9' | '0' | '\'' | '"' |
166
'!' | '=' | '@' | '^' | '&' | '+' | '-' | '*' | '/' | '%' | '~' |
167
'?' | ':' | '|' | '[' | ']' | '{' | '}' | '.' | ';' | ',' | '\\' | '(' |
168
')' | '#' | '>' | '`'
170
(("DATA", get_buff (data bp (store 0 c) s)),mkloc (bp,ep))
171
| [: `'$'; s :] -> (* for caml expr *) camlexpr (bp+1) s
174
[ [: `'?'; `'x'; `'m'; `'l';
175
len = question_mark_tag bp 0
176
:] ep -> (("XMLDECL", get_buff len), mkloc (bp, ep))
179
[ [: `'-'; `'-'; ct :] ep ->
180
(("COMMENT", get_buff (comment_tag 0 bp 0 ct)),
182
| [: len = exclamation_mark_tag bp 0
183
:] ep -> (("DECL", get_buff len),mkloc (bp,ep)) ]
184
| [: `'/'; `('a'..'z' | 'A'..'Z' | '_' | ':' as c); s
185
:] -> end_tag bp (store 0 c) s
186
| [: `('a'..'z' | 'A'..'Z' | '_' (* | ':'*) as c); s
187
:] -> start_tag bp (store 0 c) s ]
188
| [: `c :] ep -> keyword_or_error (bp,ep) (String.make 1 c)
189
| [: _ = Stream.empty :] ep ->
190
let size = size_tag_stack () in
194
(("EOI", ""), mkloc (bp, succ bp))}
196
err (mkloc (bp, ep)) ( (string_of_int size) ^ " tag" ^ (if size == 1 then "" else "s") ^ " not terminated (" ^ (string_of_tag_stack ()) ^ ").")
200
and question_mark_tag bp len =
202
[ [: `'?'; `'>' :] -> len
203
| [: `c; s :] -> question_mark_tag bp (store len c) s
204
| [: :] ep -> err (mkloc (bp, ep)) "XMLDecl tag (<? ?>) not terminated" ]
205
and comment_tag prof bp len =
207
[ [: `'-'; s :] -> dash_in_comment_tag prof bp len s
208
| [: `'<'; s :] -> comment_tag2 prof bp len s
209
| [: `c; s :] -> comment_tag prof bp (store len c) s
210
| [: :] ep -> err (mkloc (bp, ep)) "comment (<!-- -->) not terminated" ]
211
and comment_tag2 prof bp len =
213
[ [: `'!'; s :] -> comment_tag3 prof bp len s
214
| [: s :] -> comment_tag prof bp (store len '<') s ]
215
and comment_tag3 prof bp len =
217
[ [: `'-'; s :] -> comment_tag4 prof bp len s
218
| [: s :] -> comment_tag prof bp (mstore len "<!") s ]
219
and comment_tag4 prof bp len =
221
[ [: `'-'; s :] -> comment_tag (prof+1) bp (mstore len "<!--") s
222
| [: s :] -> comment_tag prof bp (mstore len "<!-") s ]
223
and dash_in_comment_tag prof bp len =
225
[ [: `'-'; s :] -> dash_in_comment_tag2 prof bp len s
226
| [: a = comment_tag prof bp (store len '-') :] -> a ]
227
and dash_in_comment_tag2 prof bp len =
229
[ [: `'>'; s :] -> if prof = 0
231
else comment_tag (prof-1) bp (mstore len "-->") s
232
| [: a = comment_tag prof bp (mstore len "--") :] -> a ]
233
and exclamation_mark_tag bp len =
236
| [: `('[' as c); s :] ->
237
sq_bracket_in_exclamation_mark_tag bp_sb (store len c) s
238
| [: `c; s :] -> exclamation_mark_tag bp (store len c) s
239
| [: :] ep -> err (mkloc (bp, ep)) "Decl tag (<! >) not terminated" ]
240
and sq_bracket_in_exclamation_mark_tag bp len =
242
[ [: `(']' as c ); s :] ->
243
exclamation_mark_tag bp (store len c) s
244
| [: `c; s :] -> sq_bracket_in_exclamation_mark_tag bp (store len c) s
245
| [: :] ep -> err (mkloc (bp, ep))
246
"Decl tag with open square bracket (<! ... [) not terminated" ]
250
let name = get_buff len in
251
let last_open = pop_tag () in
252
if last_open = name then
253
(("GAT",name), mkloc (bp,ep))
255
err (mkloc (bp, ep)) ("bad end tag: </"
259
if last_open = "" then
262
" (</" ^last_open ^ "> expected)"
264
| [: `('a'..'z' | 'A'..'Z' | '1'..'9' | '0' |
265
'.' | '-' | '_' | ':' as c); s :] -> end_tag bp (store len c) s
266
| [: :] ep -> err (mkloc (bp, ep)) "end tag (</ >) not terminated" ]
267
and start_tag bp len =
270
let name = get_buff len in
272
(("TAG", name), mkloc (bp,ep)) }
273
| [: `('\013' | ' ' | '\t') :] ep ->
274
let name = get_buff len in
277
(("TAG", name), mkloc (bp,ep)) }
278
| [: `('\010') :] ep ->
279
let name = get_buff len in
280
do { bolpos.val := ep; incr lnum;
283
(("TAG", name), mkloc (bp,ep)) }
284
| [: `( 'a'..'z' | 'A'..'Z' | '1'..'9' | '0' |
285
'.' | '-' | '_' | ':' as c); s :] ->
286
start_tag bp (store len c) s
288
let name = get_buff len in
291
(("TAG", name), mkloc (bp,ep)) } ]
292
(* | [: :] ep -> err (mkloc (bp, ep)) "start tag (< > or < />) not terminated" ] *)
293
and name_attribut bp len =
295
[ [: `('\013' | ' ' | '\t') :] ep ->
296
(("ATTR",get_buff len), mkloc (bp,ep))
297
| [: `('\010') :] ep ->
298
do { bolpos.val := ep; incr lnum ; (("ATTR",get_buff len), mkloc (bp,ep))}
299
| [: `( 'a'..'z' | 'A'..'Z' | '1'..'9' | '0' |
300
'.' | '-' | '_' | ':'
301
as c); s :] -> name_attribut bp (store len c) s
302
| [: s :] ep -> (("ATTR",get_buff len), mkloc (bp,ep))
303
| [: :] ep -> err (mkloc (bp, ep)) "start tag (< > or < />) not terminated" ]
304
and value_attribut_in_double_quote bp len =
307
(* | [: `'\\'; `'"'; s :] ->
308
value_attribut_in_double_quote bp (store (store len '\\') '"') s *)
310
value_attribut_in_double_quote bp (store len c) s
311
| [: :] ep -> err (mkloc (bp, ep)) "attribut value not terminated" ]
314
[ [: `('a'..'z' | '\223'..'\246' | '\248'..'\255' | '_' as c); s :] ->
315
antiquotname bp ((ident (store 0 c)) s) s
316
| [: `'$'; s :] ep -> (("DATA", "$"), mkloc (bp,ep))
317
| [: s :] ep -> (("CAMLEXPRXML", get_buff (camlexpr2 bp 0 s)),
319
and antiquotname bp len =
321
[ [: `':'; s :] ep -> let buff = get_buff len in
324
then (("CAMLEXPRXMLL", get_buff (camlexpr2 bp2 0 s)), mkloc (bp2,ep))
326
then (("CAMLEXPRXMLS", get_buff (camlexpr2 bp2 0 s)), mkloc (bp2,ep))
327
else err (mkloc (bp, ep)) "unknown antiquotation"
328
| [: `'$'; s :] ep -> (("CAMLEXPRXML", (get_buff len)), mkloc (bp,ep))
329
| [: s :] ep -> (("CAMLEXPRXML", get_buff (camlexpr2 bp len s)),
331
and camlexpr2 bp len =
333
[ [: `'$'; s :] -> len
334
| [: `('\010' as c) ; s :] ep ->
335
do { bolpos.val := ep; incr lnum;
336
camlexpr2 bp (store len c) s}
337
| [: `(_ as c) ; s :] -> camlexpr2 bp (store len c) s ]
341
and camlexprattr bp len =
343
[ [: `('a'..'z' | '\223'..'\246' | '\248'..'\255' | '_' as c); s :] ->
344
antiquotname_attr bp ((ident (store 0 c)) s) s
345
| [: `'$'; s :] ep -> (("DATA", "$"), mkloc (bp,ep))
346
| [: s :] ep -> (("CAMLEXPR", get_buff (camlexpr2 bp 0 s)),
348
and antiquotname_attr bp len =
350
[ [: `':'; s :] ep -> let buff = get_buff len in
352
then (("CAMLEXPRL", get_buff (camlexpr2 bp 0 s)), mkloc (bp,ep))
353
else err (mkloc (bp, ep)) "unknown antiquotation"
354
| [: `'$'; s :] ep -> (("CAMLEXPR", get_buff len), mkloc (bp,ep))
355
| [: s :] ep -> (("CAMLEXPR", get_buff (camlexpr2 bp len s)), mkloc (bp,ep)) ]
357
and value_attribut_in_quote bp len =
360
| [: `'\\'; `'''; s :] ->
361
value_attribut_in_quote bp (store (store len '\\') ''') s
363
value_attribut_in_quote bp (store len c) s
364
| [: :] ep -> err (mkloc (bp, ep)) "attribut value not terminated" ]
367
[ (* [: `(' ' | '\n' | '\t' as c); s :] ->
368
let c_string = string_of_space_char c in
369
spaces_in_data bp len c_string s
370
| *) [: `('A'..'Z' | '\192'..'\214' | '\216'..'\222' | 'a'..'z' |
371
'\223'..'\246' | '\248'..'\255' | '_' | '1'..'9' | '0' | '\'' | '"' |
373
'!' | '=' | '@' | '^' | '&' | '+' | '-' | '*' | '/' | '%' | '~' |
374
'?' | ':' | '|' | '[' | ']' | '{' | '}' | '.' | ';' | ',' | '\\' | '(' |
375
')' | '#' | '>' | '`'
376
as c) ; s :] -> data bp (store len c) s
378
(* and spaces_in_data bp len spaces =
380
[ [: `('\n' as c); s :] ep ->
381
let c_string = string_of_space_char c in
382
do { bolpos.val := ep; incr lnum ; spaces_in_data bp len (spaces ^ c_string) s}
383
| [: `(' ' | '\t' as c); s :] ->
384
let c_string = string_of_space_char c in
385
spaces_in_data bp len (spaces ^ c_string) s
386
| [: `('A'..'Z' | '\192'..'\214' | '\216'..'\222' | 'a'..'z' |
387
'\223'..'\246' | '\248'..'\255' | '_' | '1'..'9' | '0' | '\'' | '"' |
389
'!' | '=' | '@' | '^' | '&' | '+' | '-' | '*' | '/' | '%' | '~' |
390
'?' | ':' | '|' | '[' | ']' | '{' | '}' | '.' | ';' | ',' | '\\' | '(' |
393
data bp (store (mstore len spaces) c) s
394
| [: s :] -> len ] *)
395
and whitespaces bp len =
397
[ [: `('\013' | ' ' | '\t' | '\026' | '\012' as c); s :] ->
398
let c_string = string_of_space_char c in
399
whitespaces bp (mstore len c_string) s
400
| [: `('\010' as c); s :] ep ->
401
let c_string = string_of_space_char c in
402
do { bolpos.val := ep; incr lnum;
403
whitespaces bp (mstore len c_string) s}
412
[ Stream.Error str ->
413
err (mkloc (Stream.count cstrm, Stream.count cstrm + 1)) str ]
416
value func kwd_table =
417
let bolpos = ref 0 in
419
let fname = ref "" in
420
let find = Hashtbl.find kwd_table in
421
Token.lexer_func_of_parser (next_token_fun find fname lnum bolpos)
424
value rec check_keyword_stream =
425
parser [: _ = check; _ = Stream.empty :] -> True
428
[ [: `'A'..'Z' | 'a'..'z' | '\192'..'\214' | '\216'..'\246' | '\248'..'\255'
432
| [: `'!' | '?' | '~' | '=' | '@' | '^' | '&' | '+' | '-' | '*' | '/' |
438
match Stream.npeek 1 s with
440
| _ -> check_ident2 s ]
444
[ [: `']' | ':' | '=' | '>' :] -> ()
445
| [: :] -> () ] :] ->
450
[ [: `']' | '}' :] -> ()
451
| [: a = check_ident2 :] -> a ] :] ->
453
| [: `'[' | '{'; s :] ->
454
match Stream.npeek 2 s with
455
[ ['<'; '<' | ':'] -> ()
458
[ [: `'|' | '<' | ':' :] -> ()
464
| [: :] -> () ] :] ->
469
[ [: `'A'..'Z' | 'a'..'z' | '\192'..'\214' | '\216'..'\246' |
470
'\248'..'\255' | '0'..'9' | '_' | '''
477
[ [: `'!' | '?' | '~' | '=' | '@' | '^' | '&' | '+' | '-' | '*' | '/' |
478
'%' | '.' | ':' | '<' | '>' | '|'
485
value check_keyword s =
486
try check_keyword_stream (Stream.of_string s) with _ -> False
489
value error_no_respect_rules p_con p_prm =
493
(if p_con = "" then "\"" ^ p_prm ^ "\""
494
else if p_prm = "" then p_con
495
else p_con ^ " \"" ^ p_prm ^ "\"") ^
496
" does not respect Plexer rules"))
499
value error_ident_and_keyword p_con p_prm =
502
("the token \"" ^ p_prm ^ "\" is used as " ^ p_con ^
506
value using_token kwd_table ident_table (p_con, p_prm) =
509
if not (Hashtbl.mem kwd_table p_prm) then
510
if check_keyword p_prm then
511
if Hashtbl.mem ident_table p_prm then
512
error_ident_and_keyword (Hashtbl.find ident_table p_prm) p_prm
513
else Hashtbl.add kwd_table p_prm p_prm
514
else error_no_respect_rules p_con p_prm
517
"CAMLEXPR" | "CAMLEXPRXML" |
518
"CAMLEXPRL" | "CAMLEXPRXMLL" |
519
"CAMLEXPRXMLS" | "COMMENT" |
520
"TAG" | "GAT" | "ATTR" | "VALUE" | "XMLDECL" |
521
"DECL" | "DATA" | "WHITESPACE" | "EOI" ->
526
("the constructor \"" ^ p_con ^
527
"\" is not recognized by Plexer")) ]
530
value removing_token kwd_table ident_table (p_con, p_prm) =
532
[ "" -> Hashtbl.remove kwd_table p_prm
533
| "LIDENT" | "UIDENT" ->
534
if p_prm <> "" then Hashtbl.remove ident_table p_prm
541
[ ("", t) -> "'" ^ t ^ "'"
542
| ("CAMLEXPR", k) -> "camlexpr \"" ^ k ^ "\""
543
| ("CAMLEXPRXML", k) -> "camlvarxml \"" ^ k ^ "\""
544
| ("CAMLEXPRL", k) -> "camlexprl \"" ^ k ^ "\""
545
| ("CAMLEXPRXMLL", k) -> "camlvarxmll \"" ^ k ^ "\""
546
| ("COMMENT", k) -> "comment \"" ^ k ^ "\""
548
| ("TAG","") -> "tag"
549
| ("TAG",t) -> "tag \"" ^ t ^ "\""
550
| ("GAT","") -> "end tag"
551
| ("ATTR","") -> "attribut"
552
| ("ATTR",a) -> "attribut \"" ^ a ^ "\""
553
| ("VALUE","") -> "value"
554
| ("VALUE",v) -> "value \"" ^ v ^ "\""
555
| ("XMLDECL","") -> "XML declaration"
556
| ("DECL","") -> "declaration"
557
| ("DATA","") -> "data"
558
| ("WHITESPACE","") -> "whitespace"
560
| ("EOI", "") -> "end of input"
562
| (con, prm) -> con ^ " \"" ^ prm ^ "\"" ]
566
fun tok -> Token.default_match tok
570
let kwd_table = Hashtbl.create 301 in
571
let id_table = Hashtbl.create 301 in
572
{tok_func = func kwd_table; tok_using = using_token kwd_table id_table;
573
tok_removing = removing_token kwd_table id_table;
574
tok_match = tok_match; tok_text = text;
583
let kwd_table = Hashtbl.create 301 in
584
let id_table = Hashtbl.create 301 in
585
{func = func kwd_table; using = using_token kwd_table id_table;
586
removing = removing_token kwd_table id_table; tparse = tparse;
590
(* $Id: xmllexer.ml,v 1.4 2005/06/04 21:02:26 balat Exp $ *)