1
(* $Id: link_wlex.src 675 2004-06-13 16:03:16Z gerd $
2
* ----------------------------------------------------------------------
6
(* This file is divided up into sections, marked by (* [SECTION] *).
7
* Sections are processed by lexpp.
10
(* ---------------------------------------------------------------------- *)
13
(* This section contains code going into the link module. The patterns
14
* $ {name} (w/o space) are substituted by generated strings:
15
* - $ {encoding} is replaced by the name of the character encoding
16
* - all other names must be names of lexical rules, and $ {name} is
17
* replaced by Module.name, i.e. the correct module prefix is prepended
21
(* Alias for the Lex_engines module. *)
module L = Lex_engines;;
22
(* P is the module that lexpp substitutes for the scan_document
 * placeholder. The class-number constants used to fill the character
 * table (P.otherChar, P.invalid, P.unicode_baseChar, ...) are read from it.
 *)
module P = ${scan_document};;
24
(* Note: Support for ISO-8859-1 has been removed from Pxp_wlex:
25
* - because it is not necessary and the ocamllex-based lexer for ISO-8859-1
27
* - now only one encoding is possible per lexer module (because the lexer
28
* knows the lexer record)
34
let a = String.make 256 (Char.chr Pxp_wlex.otherChar) in
36
a.[code] <- Char.chr Pxp_wlex.invalid
39
(fun (code, classnr) ->
40
a.[code] <- Char.chr classnr;
42
Pxp_wlex.one_char_classes;
43
for code = 192 to 255 do
44
if code <> 215 && code <> 247 then
45
a.[code] <- Char.chr Pxp_wlex.unicode_baseChar;
47
a.[183] <- Char.chr Pxp_wlex.extender;
48
for code = 48 to 57 do
49
a.[code] <- Char.chr Pxp_wlex.ascii_digit
56
L.engine_tiny_8bit table_iso88591
59
--- END REMOVED --- *)
63
(* Character class table: one byte per code point 0x0000..0x312C, each
 * byte holding a class number converted with Char.chr. Every entry starts
 * out as P.otherChar and is overwritten by the assignments that follow.
 *)
let a = String.make 0x312D (Char.chr P.otherChar) in
65
a.[code] <- Char.chr P.invalid
68
(* Entries given as explicit (code, classnr) pairs. *)
(fun (code, classnr) ->
69
a.[code] <- Char.chr classnr;
74
(* Every code point of each inclusive (fromcode, tocode) range in the
 * list that follows gets class P.unicode_baseChar.
 *)
(fun (fromcode, tocode) ->
75
for code = fromcode to tocode do
76
a.[code] <- Char.chr P.unicode_baseChar
79
[ 0x00C0,0x00D6; 0x00D8,0x00F6;
80
0x00F8,0x00FF; 0x0100,0x0131; 0x0134,0x013E; 0x0141,0x0148;
81
0x014A,0x017E; 0x0180,0x01C3; 0x01CD,0x01F0; 0x01F4,0x01F5;
82
0x01FA,0x0217; 0x0250,0x02A8; 0x02BB,0x02C1; 0x0386,0x0386;
83
0x0388,0x038A; 0x038C,0x038C; 0x038E,0x03A1; 0x03A3,0x03CE;
84
0x03D0,0x03D6; 0x03DA,0x03DA; 0x03DC,0x03DC; 0x03DE,0x03DE;
85
0x03E0,0x03E0; 0x03E2,0x03F3;
86
0x0401,0x040C; 0x040E,0x044F; 0x0451,0x045C; 0x045E,0x0481;
87
0x0490,0x04C4; 0x04C7,0x04C8; 0x04CB,0x04CC; 0x04D0,0x04EB;
88
0x04EE,0x04F5; 0x04F8,0x04F9; 0x0531,0x0556; 0x0559,0x0559;
89
0x0561,0x0586; 0x05D0,0x05EA; 0x05F0,0x05F2; 0x0621,0x063A;
90
0x0641,0x064A; 0x0671,0x06B7; 0x06BA,0x06BE; 0x06C0,0x06CE;
91
0x06D0,0x06D3; 0x06D5,0x06D5; 0x06E5,0x06E6; 0x0905,0x0939; 0x093D,0x093D;
92
0x0958,0x0961; 0x0985,0x098C; 0x098F,0x0990; 0x0993,0x09A8;
93
0x09AA,0x09B0; 0x09B2,0x09B2; 0x09B6,0x09B9; 0x09DC,0x09DD;
94
0x09DF,0x09E1; 0x09F0,0x09F1; 0x0A05,0x0A0A; 0x0A0F,0x0A10;
95
0x0A13,0x0A28; 0x0A2A,0x0A30; 0x0A32,0x0A33; 0x0A35,0x0A36;
96
0x0A38,0x0A39; 0x0A59,0x0A5C; 0x0A5E,0x0A5E; 0x0A72,0x0A74;
97
0x0A85,0x0A8B; 0x0A8D,0x0A8D; 0x0A8F,0x0A91; 0x0A93,0x0AA8;
98
0x0AAA,0x0AB0; 0x0AB2,0x0AB3; 0x0AB5,0x0AB9; 0x0ABD,0x0ABD; 0x0AE0,0x0AE0;
99
0x0B05,0x0B0C; 0x0B0F,0x0B10; 0x0B13,0x0B28; 0x0B2A,0x0B30;
100
0x0B32,0x0B33; 0x0B36,0x0B39; 0x0B3D,0x0B3D; 0x0B5C,0x0B5D;
101
0x0B5F,0x0B61; 0x0B85,0x0B8A; 0x0B8E,0x0B90; 0x0B92,0x0B95;
102
0x0B99,0x0B9A; 0x0B9C,0x0B9C; 0x0B9E,0x0B9F; 0x0BA3,0x0BA4;
103
0x0BA8,0x0BAA; 0x0BAE,0x0BB5; 0x0BB7,0x0BB9; 0x0C05,0x0C0C;
104
0x0C0E,0x0C10; 0x0C12,0x0C28; 0x0C2A,0x0C33; 0x0C35,0x0C39;
105
0x0C60,0x0C61; 0x0C85,0x0C8C; 0x0C8E,0x0C90; 0x0C92,0x0CA8;
106
0x0CAA,0x0CB3; 0x0CB5,0x0CB9; 0x0CDE,0x0CDE; 0x0CE0,0x0CE1;
107
0x0D05,0x0D0C; 0x0D0E,0x0D10; 0x0D12,0x0D28; 0x0D2A,0x0D39;
108
0x0D60,0x0D61; 0x0E01,0x0E2E; 0x0E30,0x0E30; 0x0E32,0x0E33;
109
0x0E40,0x0E45; 0x0E81,0x0E82; 0x0E84,0x0E84; 0x0E87,0x0E88; 0x0E8A,0x0E8A;
110
0x0E8D,0x0E8D; 0x0E94,0x0E97; 0x0E99,0x0E9F; 0x0EA1,0x0EA3; 0x0EA5,0x0EA5;
111
0x0EA7,0x0EA7; 0x0EAA,0x0EAB; 0x0EAD,0x0EAE; 0x0EB0,0x0EB0; 0x0EB2,0x0EB3;
112
0x0EBD,0x0EBD; 0x0EC0,0x0EC4; 0x0F40,0x0F47; 0x0F49,0x0F69;
113
0x10A0,0x10C5; 0x10D0,0x10F6; 0x1100,0x1100; 0x1102,0x1103;
114
0x1105,0x1107; 0x1109,0x1109; 0x110B,0x110C; 0x110E,0x1112; 0x113C,0x113C;
115
0x113E,0x113E; 0x1140,0x1140; 0x114C,0x114C; 0x114E,0x114E;
116
0x1150,0x1150; 0x1154,0x1155; 0x1159,0x1159;
117
0x115F,0x1161; 0x1163,0x1163; 0x1165,0x1165; 0x1167,0x1167;
118
0x1169,0x1169; 0x116D,0x116E;
119
0x1172,0x1173; 0x1175,0x1175; 0x119E,0x119E; 0x11A8,0x11A8;
120
0x11AB,0x11AB; 0x11AE,0x11AF;
121
0x11B7,0x11B8; 0x11BA,0x11BA; 0x11BC,0x11C2; 0x11EB,0x11EB;
122
0x11F0,0x11F0; 0x11F9,0x11F9;
123
0x1E00,0x1E9B; 0x1EA0,0x1EF9; 0x1F00,0x1F15; 0x1F18,0x1F1D;
124
0x1F20,0x1F45; 0x1F48,0x1F4D; 0x1F50,0x1F57; 0x1F59,0x1F59; 0x1F5B,0x1F5B;
125
0x1F5D,0x1F5D; 0x1F5F,0x1F7D; 0x1F80,0x1FB4; 0x1FB6,0x1FBC; 0x1FBE,0x1FBE;
126
0x1FC2,0x1FC4; 0x1FC6,0x1FCC; 0x1FD0,0x1FD3; 0x1FD6,0x1FDB;
127
0x1FE0,0x1FEC; 0x1FF2,0x1FF4; 0x1FF6,0x1FFC; 0x2126,0x2126;
128
0x212A,0x212B; 0x212E,0x212E; 0x2180,0x2182; 0x3041,0x3094;
129
0x30A1,0x30FA; 0x3105,0x312C; (* 0xAC00,0xD7A3 *) ];
132
(* Every code point of each inclusive (fromcode, tocode) range in the
 * list gets class P.ideographic. The range 0x4E00-0x9FA5 is only mentioned
 * in a comment inside the list: it lies beyond the last table index
 * (0x312C) and therefore cannot be stored here.
 *)
(fun (fromcode, tocode) ->
133
for code = fromcode to tocode do
134
a.[code] <- Char.chr P.ideographic
137
[ 0x3007,0x3007; 0x3021,0x3029 (* 0x4E00-0x9FA5 *) ];
140
(* Every code point of each inclusive (fromcode, tocode) range in the
 * list that follows gets class P.combiningChar.
 *)
(fun (fromcode, tocode) ->
141
for code = fromcode to tocode do
142
a.[code] <- Char.chr P.combiningChar
145
[ 0x0300,0x0345; 0x0360,0x0361; 0x0483,0x0486; 0x0591,0x05A1;
146
0x05A3,0x05B9; 0x05BB,0x05BD; 0x05BF,0x05BF; 0x05C1,0x05C2;
147
0x05C4,0x05C4; 0x064B,0x0652; 0x0670,0x0670; 0x06D6,0x06DC;
148
0x06DD,0x06DF; 0x06E0,0x06E4; 0x06E7,0x06E8; 0x06EA,0x06ED;
149
0x0901,0x0903; 0x093C,0x093C; 0x093E,0x094C; 0x094D,0x094D;
150
0x0951,0x0954; 0x0962,0x0963; 0x0981,0x0983; 0x09BC,0x09BC;
151
0x09BE,0x09BE; 0x09BF,0x09BF; 0x09C0,0x09C4; 0x09C7,0x09C8;
152
0x09CB,0x09CD; 0x09D7,0x09D7; 0x09E2,0x09E3; 0x0A02,0x0A02;
153
0x0A3C,0x0A3C; 0x0A3E,0x0A3E; 0x0A3F,0x0A3F; 0x0A40,0x0A42;
154
0x0A47,0x0A48; 0x0A4B,0x0A4D; 0x0A70,0x0A71; 0x0A81,0x0A83;
155
0x0ABC,0x0ABC; 0x0ABE,0x0AC5; 0x0AC7,0x0AC9; 0x0ACB,0x0ACD;
156
0x0B01,0x0B03; 0x0B3C,0x0B3C; 0x0B3E,0x0B43; 0x0B47,0x0B48;
157
0x0B4B,0x0B4D; 0x0B56,0x0B57; 0x0B82,0x0B83; 0x0BBE,0x0BC2;
158
0x0BC6,0x0BC8; 0x0BCA,0x0BCD; 0x0BD7,0x0BD7; 0x0C01,0x0C03;
159
0x0C3E,0x0C44; 0x0C46,0x0C48; 0x0C4A,0x0C4D; 0x0C55,0x0C56;
160
0x0C82,0x0C83; 0x0CBE,0x0CC4; 0x0CC6,0x0CC8; 0x0CCA,0x0CCD;
161
0x0CD5,0x0CD6; 0x0D02,0x0D03; 0x0D3E,0x0D43; 0x0D46,0x0D48;
162
0x0D4A,0x0D4D; 0x0D57,0x0D57; 0x0E31,0x0E31; 0x0E34,0x0E3A;
163
0x0E47,0x0E4E; 0x0EB1,0x0EB1; 0x0EB4,0x0EB9; 0x0EBB,0x0EBC;
164
0x0EC8,0x0ECD; 0x0F18,0x0F19; 0x0F35,0x0F35; 0x0F37,0x0F37;
165
0x0F39,0x0F39; 0x0F3E,0x0F3E; 0x0F3F,0x0F3F; 0x0F71,0x0F84;
166
0x0F86,0x0F8B; 0x0F90,0x0F95; 0x0F97,0x0F97; 0x0F99,0x0FAD;
167
0x0FB1,0x0FB7; 0x0FB9,0x0FB9; 0x20D0,0x20DC; 0x20E1,0x20E1;
168
0x302A,0x302F; 0x3099,0x3099; 0x309A,0x309A ];
171
(* Every code point of each inclusive (fromcode, tocode) range in the
 * list gets class P.unicode_digit.
 *)
(fun (fromcode, tocode) ->
172
for code = fromcode to tocode do
173
a.[code] <- Char.chr P.unicode_digit
176
[ 0x0660,0x0669; 0x06F0,0x06F9; 0x0966,0x096F; 0x09E6,0x09EF;
177
0x0A66,0x0A6F; 0x0AE6,0x0AEF; 0x0B66,0x0B6F; 0x0BE7,0x0BEF;
178
0x0C66,0x0C6F; 0x0CE6,0x0CEF; 0x0D66,0x0D6F; 0x0E50,0x0E59;
179
0x0ED0,0x0ED9; 0x0F20,0x0F29 ];
181
(* Codes 48..57 (the ASCII digits 0-9) get their own class,
 * P.ascii_digit, distinct from P.unicode_digit.
 *)
for code = 48 to 57 do
182
a.[code] <- Char.chr P.ascii_digit
186
(* Every code point of each inclusive (fromcode, tocode) range in the
 * list gets class P.extender.
 *)
(fun (fromcode, tocode) ->
187
for code = fromcode to tocode do
188
a.[code] <- Char.chr P.extender
191
[ 0x00B7,0x00B7; 0x02D0,0x02D1; 0x0387,0x0387; 0x0640,0x0640;
192
0x0E46,0x0E46; 0x0EC6,0x0EC6; 0x3005,0x3005; 0x3031,0x3035;
193
0x309D,0x309E; 0x30FC,0x30FE ];
203
(* Range tests on the numeric code point. The first two ranges lie beyond
 * the last table index (0x312C) and are the same ranges that appear only
 * as comments in the ideographic and baseChar lists.
 * NOTE(review): presumably they are classified here instead of through
 * the table - confirm against the branch results.
 *)
if code >= 0x4E00 && code <= 0x9FA5 then
205
else if code >= 0xAC00 && code <= 0xD7A3 then
207
(* Any other code point up to 0x10FFFF, excluding 0xD800-0xDFFF,
 * 0xFFFE and 0xFFFF.
 *)
else if code <= 0xD7FF || (code >= 0xE000 && code <= 0xFFFD) ||
208
(code >= 0x10000 && code <= 0x10FFFF) then
222
(* ----------- ISO-8859-1 ----------- *)
226
let lexer_set_iso88591 =
227
{ lex_encoding = `Enc_iso88591;
228
scan_document = Pxp_wlex.scan_document engine_iso88591;
229
scan_content = Pxp_wlex.scan_content engine_iso88591;
230
scan_within_tag = Pxp_wlex.scan_within_tag engine_iso88591;
231
scan_document_type = Pxp_wlex.
232
scan_document_type engine_iso88591;
233
scan_declaration = Pxp_wlex.scan_declaration engine_iso88591;
234
scan_content_comment = Pxp_wlex.scan_content_comment engine_iso88591;
235
scan_decl_comment = Pxp_wlex.scan_decl_comment engine_iso88591;
236
scan_document_comment = Pxp_wlex.scan_document_comment engine_iso88591;
237
scan_ignored_section = Pxp_wlex.
238
scan_ignored_section engine_iso88591;
239
scan_xml_pi = Pxp_wlex.scan_xml_pi engine_iso88591;
240
scan_dtd_string = Pxp_wlex.scan_dtd_string engine_iso88591;
241
scan_content_string = Pxp_wlex.
242
scan_content_string engine_iso88591;
243
scan_name_string = Pxp_wlex.scan_name_string engine_iso88591;
244
scan_only_xml_decl = Pxp_wlex.scan_only_xml_decl engine_iso88591;
245
scan_for_crlf = Pxp_wlex.scan_for_crlf engine_iso88591;
246
scan_tag_eb = Pxp_wlex.scan_tag_eb engine_iso88591;
247
scan_tag_eb_att = Pxp_wlex.scan_tag_eb_att engine_iso88591;
252
Pxp_lexers.init lexer_set_iso88591
255
--- END REMOVED --- *)
258
(* ---------- UTF8 --------- *)
260
(* This version works only for UTF-8: the assertion below fails when the
 * template is instantiated for any other encoding.
 *)
261
assert(`Enc_${encoding} = `Enc_utf8);;
263
(* Factory class, constrained to the lexer_factory type. Each open_*
 * method supplies three values: this factory coerced to lexer_factory,
 * a lexbuf, and a boolean that is false for a source and true for a
 * string. These line up with the parameters of lobj
 * (factory, _lexbuf, _lexbuf_from_string).
 *)
class lfactory : lexer_factory =
265
(* Encoding tag this module was generated for. *)
method encoding = `Enc_${encoding}
267
(* Uses the lexbuf stored lazily in src.lsrc_lexbuf, forcing it here. *)
method open_source src =
269
(self : #lexer_factory :> lexer_factory)
270
(Lazy.force src.lsrc_lexbuf) false
272
(* Uses a lexbuf created by Lexing.from_string over s. *)
method open_string s =
274
(self : #lexer_factory :> lexer_factory)
275
(Lexing.from_string s) true
277
(* Uses a lexbuf created by Pxp_lexing.from_string_inplace over s. *)
method open_string_inplace s =
279
(self : #lexer_factory :> lexer_factory)
280
(Pxp_lexing.from_string_inplace s) true
283
(* Lexer object class, constrained to the lexer_obj type. It is created
 * from the owning factory, an initial lexbuf, and a flag saying whether
 * that lexbuf was made from a string.
 *)
and lobj factory _lexbuf _lexbuf_from_string : lexer_obj =
285
(* Lexbuf currently read from; replaced by the open_* methods. *)
val mutable lexbuf = _lexbuf
286
(* True when lexbuf was created from a string; consulted by
 * open_string_inplace.
 *)
val mutable lexbuf_from_string = _lexbuf_from_string
288
(* The factory passed at construction time. *)
method factory = factory
290
(* Encoding tag this module was generated for. *)
method encoding = `Enc_${encoding}
292
method open_source src =
  (* Switch input to the lexbuf of src (forcing the lazy value) and
   * record that this lexbuf was not created from a string.
   *)
  let source_buf = Lazy.force src.lsrc_lexbuf in
  lexbuf <- source_buf;
  lexbuf_from_string <- false
296
method open_string s =
  (* Switch input to a new lexbuf over s and record that the lexbuf is
   * string-based.
   *)
  let string_buf = Lexing.from_string s in
  lexbuf <- string_buf;
  lexbuf_from_string <- true
300
(* Make s the input. When the current lexbuf already came from a string,
 * it is handed to Pxp_lexing.from_another_string_inplace together with s
 * (presumably this re-targets the existing lexbuf instead of allocating
 * a new one - confirm in Pxp_lexing). In the other case a new lexbuf is
 * made with Pxp_lexing.from_string_inplace and the flag is set.
 *)
method open_string_inplace s =
301
if lexbuf_from_string then (
302
Pxp_lexing.from_another_string_inplace lexbuf s
305
lexbuf <- Pxp_lexing.from_string_inplace s;
306
lexbuf_from_string <- true
309
method lexeme_length =
  (* Length of the current lexeme as reported by
   * Netconversion.ustring_length for this encoding. Very inefficient:
   * the lexeme is first copied out of the buffer with Lexing.lexeme.
   *)
  Netconversion.ustring_length `Enc_${encoding} (Lexing.lexeme lexbuf)
314
method lexeme_char pos =
  (* Character found after moving a Netconversion cursor pos steps from
   * the start of the current lexeme. Very inefficient: the lexeme is
   * copied and the cursor is walked from the beginning on every call.
   *)
  let cursor =
    Netconversion.create_cursor `Enc_${encoding} (Lexing.lexeme lexbuf) in
  Netconversion.move ~num:pos cursor;
  Netconversion.uchar_at cursor
324
(* Length of the current lexeme as reported by Pxp_lexing.lexeme_len.
 * NOTE(review): presumably this is the length in bytes, in contrast to
 * lexeme_length which goes through Netconversion.ustring_length - confirm.
 *)
method lexeme_strlen =
325
Pxp_lexing.lexeme_len lexbuf
327
method sub_lexeme pos len =
  (* Slice of the current lexeme selected by pos and len on the decoded
   * array, returned as a string in this encoding. Very inefficient: the
   * whole lexeme is decoded with uarray_of_ustring before the requested
   * part is re-encoded.
   *)
  let decoded =
    Netconversion.uarray_of_ustring `Enc_${encoding} (Lexing.lexeme lexbuf) in
  Netconversion.ustring_of_uarray `Enc_${encoding} ~pos ~len decoded
333
(* Scanner entry points. Every method in this run has the same shape: it
 * calls the rule of the same name in the module that lexpp substitutes
 * for the placeholder, passing this object coerced to lexer_obj, the
 * engine engine_utf8 and the current lexbuf.
 *)
method scan_document =
335
${scan_document}.scan_document
336
(self : #lexer_obj :> lexer_obj) engine_utf8 lexbuf
338
method scan_content =
340
${scan_content}.scan_content
341
(self : #lexer_obj :> lexer_obj) engine_utf8 lexbuf
343
method scan_within_tag =
345
${scan_within_tag}.scan_within_tag
346
(self : #lexer_obj :> lexer_obj) engine_utf8 lexbuf
348
method scan_document_type =
350
${scan_document_type}.scan_document_type
351
(self : #lexer_obj :> lexer_obj) engine_utf8 lexbuf
353
method scan_declaration =
355
${scan_declaration}.scan_declaration
356
(self : #lexer_obj :> lexer_obj) engine_utf8 lexbuf
358
method scan_comment =
360
${scan_comment}.scan_comment
361
(self : #lexer_obj :> lexer_obj) engine_utf8 lexbuf
363
method scan_ignored_section =
365
${scan_ignored_section}.scan_ignored_section
366
(self : #lexer_obj :> lexer_obj) engine_utf8 lexbuf
368
method detect_xml_pi =
370
${detect_xml_pi}.detect_xml_pi
371
(self : #lexer_obj :> lexer_obj) engine_utf8 lexbuf
375
${scan_xml_pi}.scan_xml_pi
376
(self : #lexer_obj :> lexer_obj) engine_utf8 lexbuf
378
method scan_pi_string =
380
${scan_pi_string}.scan_pi_string
381
(self : #lexer_obj :> lexer_obj) engine_utf8 lexbuf
383
method scan_dtd_string =
385
${scan_dtd_string}.scan_dtd_string
386
(self : #lexer_obj :> lexer_obj) engine_utf8 lexbuf
388
method scan_content_string =
390
${scan_content_string}.scan_content_string
391
(self : #lexer_obj :> lexer_obj) engine_utf8 lexbuf
393
method scan_name_string =
395
${scan_name_string}.scan_name_string
396
(self : #lexer_obj :> lexer_obj) engine_utf8 lexbuf
398
method scan_for_crlf =
400
${scan_for_crlf}.scan_for_crlf
401
(self : #lexer_obj :> lexer_obj) engine_utf8 lexbuf
403
method scan_characters =
405
${scan_characters}.scan_characters
406
(self : #lexer_obj :> lexer_obj) engine_utf8 lexbuf
408
method scan_character =
410
${scan_character}.scan_character
411
(self : #lexer_obj :> lexer_obj) engine_utf8 lexbuf
415
${scan_tag_eb}.scan_tag_eb
416
(self : #lexer_obj :> lexer_obj) engine_utf8 lexbuf
418
method scan_tag_eb_att =
420
${scan_tag_eb_att}.scan_tag_eb_att
421
(self : #lexer_obj :> lexer_obj) engine_utf8 lexbuf
427
(* Hand a newly created lfactory instance to Pxp_lexers.init. *)
Pxp_lexers.init (new lfactory)