1
(* $Id: uTF8.ml,v 1.11 2004/09/04 16:07:38 yori Exp $ *)
2
(* Copyright 2002, 2003 Yamagata Yoriyuki. distributed with LGPL *)
9
let n = Char.code s.[i] in
10
if n < 0x80 then n else
12
(n - 0xc0) lsl 6 lor (0x7f land (Char.code s.[i + 1]))
13
else if n <= 0xef then
15
let m0 = Char.code s.[i + 2] in
16
let m = Char.code (String.unsafe_get s (i + 1)) in
17
let n' = n' lsl 6 lor (0x7f land m) in
18
n' lsl 6 lor (0x7f land m0)
19
else if n <= 0xf7 then
21
let m0 = Char.code s.[i + 3] in
22
let m = Char.code (String.unsafe_get s (i + 1)) in
23
let n' = n' lsl 6 lor (0x7f land m) in
24
let m = Char.code (String.unsafe_get s (i + 2)) in
25
let n' = n' lsl 6 lor (0x7f land m) in
26
n' lsl 6 lor (0x7f land m0)
27
else if n <= 0xfb then
29
let m0 = Char.code s.[i + 4] in
30
let m = Char.code (String.unsafe_get s (i + 1)) in
31
let n' = n' lsl 6 lor (0x7f land m) in
32
let m = Char.code (String.unsafe_get s (i + 2)) in
33
let n' = n' lsl 6 lor (0x7f land m) in
34
let m = Char.code (String.unsafe_get s (i + 3)) in
35
let n' = n' lsl 6 lor (0x7f land m) in
36
n' lsl 6 lor (0x7f land m0)
37
else if n <= 0xfd then
39
let m0 = Char.code s.[i + 5] in
40
let m = Char.code (String.unsafe_get s (i + 1)) in
41
let n' = n' lsl 6 lor (0x7f land m) in
42
let m = Char.code (String.unsafe_get s (i + 2)) in
43
let n' = n' lsl 6 lor (0x7f land m) in
44
let m = Char.code (String.unsafe_get s (i + 3)) in
45
let n' = n' lsl 6 lor (0x7f land m) in
46
let m = Char.code (String.unsafe_get s (i + 4)) in
47
let n' = n' lsl 6 lor (0x7f land m) in
48
n' lsl 6 lor (0x7f land m0)
49
else invalid_arg "UTF8.look"
53
let rec search_head s i =
54
if i >= String.length s then i else
55
let n = Char.code (String.unsafe_get s i) in
56
if n < 0x80 || n >= 0xc2 then i else
60
let n = Char.code s.[i] in
61
if n < 0x80 then i + 1 else
62
if n < 0xc0 then search_head s (i + 1) else
63
if n <= 0xdf then i + 2
64
else if n <= 0xef then i + 3
65
else if n <= 0xf7 then i + 4
66
else if n <= 0xfb then i + 5
67
else if n <= 0xfd then i + 6
68
else invalid_arg "UTF8.next"
70
let rec search_head_backward s i =
72
let n = Char.code s.[i] in
73
if n < 0x80 || n >= 0xc2 then i else
74
search_head_backward s (i - 1)
76
let prev s i = search_head_backward s (i - 1)
80
let rec loop i n = if n <= 0 then i else loop (next s i) (n - 1) in
83
let rec loop i n = if n >= 0 then i else loop (prev s i) (n + 1) in
86
let rec nth_aux s i n =
88
nth_aux s (next s i) (n - 1)
90
let nth s n = nth_aux s 0 n
94
let last s = search_head_backward s (String.length s - 1)
96
let out_of_range s i = i < 0 || i >= String.length s
98
let compare_index _ i j = i - j
100
let get s n = look s (nth s n)
102
let add_uchar buf u =
103
let masq = 0b111111 in
104
let k = UChar.uint_code u in
105
if k < 0 || k >= 0x4000000 then begin
106
Buffer.add_char buf (Char.chr (0xfc + (k lsr 30)));
107
Buffer.add_char buf (Char.unsafe_chr (0x80 lor ((k lsr 24) land masq)));
108
Buffer.add_char buf (Char.unsafe_chr (0x80 lor ((k lsr 18) land masq)));
109
Buffer.add_char buf (Char.unsafe_chr (0x80 lor ((k lsr 12) land masq)));
110
Buffer.add_char buf (Char.unsafe_chr (0x80 lor ((k lsr 6) land masq)));
111
Buffer.add_char buf (Char.unsafe_chr (0x80 lor (k land masq)));
112
end else if k <= 0x7f then
113
Buffer.add_char buf (Char.unsafe_chr k)
114
else if k <= 0x7ff then begin
115
Buffer.add_char buf (Char.unsafe_chr (0xc0 lor (k lsr 6)));
116
Buffer.add_char buf (Char.unsafe_chr (0x80 lor (k land masq)))
117
end else if k <= 0xffff then begin
118
Buffer.add_char buf (Char.unsafe_chr (0xe0 lor (k lsr 12)));
119
Buffer.add_char buf (Char.unsafe_chr (0x80 lor ((k lsr 6) land masq)));
120
Buffer.add_char buf (Char.unsafe_chr (0x80 lor (k land masq)));
121
end else if k <= 0x1fffff then begin
122
Buffer.add_char buf (Char.unsafe_chr (0xf0 + (k lsr 18)));
123
Buffer.add_char buf (Char.unsafe_chr (0x80 lor ((k lsr 12) land masq)));
124
Buffer.add_char buf (Char.unsafe_chr (0x80 lor ((k lsr 6) land masq)));
125
Buffer.add_char buf (Char.unsafe_chr (0x80 lor (k land masq)));
127
Buffer.add_char buf (Char.unsafe_chr (0xf8 + (k lsr 24)));
128
Buffer.add_char buf (Char.unsafe_chr (0x80 lor ((k lsr 18) land masq)));
129
Buffer.add_char buf (Char.unsafe_chr (0x80 lor ((k lsr 12) land masq)));
130
Buffer.add_char buf (Char.unsafe_chr (0x80 lor ((k lsr 6) land masq)));
131
Buffer.add_char buf (Char.unsafe_chr (0x80 lor (k land masq)));
135
let buf = Buffer.create len in
136
for c = 0 to len - 1 do add_uchar buf (f c) done;
140
let rec length_aux s c i =
141
if i >= String.length s then c else
142
let n = Char.code (String.unsafe_get s i) in
144
if n < 0x80 then 1 else
145
if n < 0xc0 then invalid_arg "UTF8.length" else
146
if n < 0xe0 then 2 else
147
if n < 0xf0 then 3 else
148
if n < 0xf8 then 4 else
149
if n < 0xfc then 5 else
150
if n < 0xfe then 6 else
151
invalid_arg "UTF8.length" in
152
length_aux s (c + 1) (i + k)
154
let length s = length_aux s 0 0
156
let rec iter_aux proc s i =
157
if i >= String.length s then () else
160
iter_aux proc s (next s i)
162
let iter proc s = iter_aux proc s 0
164
let compare s1 s2 = Pervasives.compare s1 s2
166
exception Malformed_code
169
let rec trail c i a =
171
if i >= String.length s then raise Malformed_code else
172
let n = Char.code (String.unsafe_get s i) in
173
if n < 0x80 || n >= 0xc0 then raise Malformed_code else
174
trail (c - 1) (i + 1) (a lsl 6 lor (n - 0x80)) in
176
if i >= String.length s then () else
177
let n = Char.code (String.unsafe_get s i) in
178
if n < 0x80 then main (i + 1) else
179
if n < 0xc2 then raise Malformed_code else
181
if trail 1 (i + 1) (n - 0xc0) < 0x80 then raise Malformed_code else
183
else if n <= 0xef then
184
if trail 2 (i + 1) (n - 0xe0) < 0x800 then raise Malformed_code else
186
else if n <= 0xf7 then
187
if trail 3 (i + 1) (n - 0xf0) < 0x10000 then raise Malformed_code else
189
else if n <= 0xfb then
190
if trail 4 (i + 1) (n - 0xf8) < 0x200000 then raise Malformed_code else
192
else if n <= 0xfd then
193
let n = trail 5 (i + 1) (n - 0xfc) in
194
if n lsr 16 < 0x400 then raise Malformed_code else
196
else raise Malformed_code in
203
let add_char = add_uchar