1
(* $Id: parse_unidata.ml,v 1.18 2003/12/19 17:24:34 yori Exp $ *)
2
(* Copyright 2002 Yamagata Yoriyuki *)
6
(* It seems that the default value of combined class is 0 *)
7
type combined_class = int
9
let num_of_combined_class cc = cc
11
let combined_class_of_num i = i
13
let null = UChar.chr_of_uint 0
14
let max_uchar = UChar.chr_of_uint 0x7fffffff
16
let cat_tbl = ref (UMap.add_range null max_uchar 0 UMap.empty)
18
let combcl_tbl = ref UMap.empty
20
let decomp_tbl : decomposition_info UMap.t ref = ref UMap.empty
22
let to_lower1 = ref UMap.empty
23
let to_title1 = ref UMap.empty
24
let to_upper1 = ref UMap.empty
26
let scolon_pat = Str.regexp ";"
27
let blank_pat = Str.regexp "[ \t]+"
28
let mark_pat = Str.regexp "<.*>"
30
let int_of_code code = int_of_string ("0x"^code)
31
let uchar_of_code code = UChar.chr_of_uint (int_of_code code)
32
let option_uchar_of_code code =
33
if code = "" then None else Some (uchar_of_code code)
37
let s = read_line () in
38
let tokens = Str.split_delim scolon_pat s in
40
[code; name; catname; comb_cl_str; bidi_str; decomp_str;
41
dec_digit_str; digit_str; num_str; mirrored_str; old_name; comment;
42
upper_str; lower_str; title_str] ->
43
let i0 = int_of_code code in
44
if i0 >= 0xf0000 && i0 <= 0xffffd then () else
45
if i0 >= 0x100000 && i0 <= 0x10fffd then () else
46
let i1 = (*continous region*)
47
if i0 = 0x3400 then 0x4db4 (*CJK Ideographic Extension A*)
48
else if i0 = 0x4e00 then 0x9fa4 (*CJK Ideographic*)
49
else if i0 = 0xac00 then 0xd7a3 (*Hangul Syllable*)
50
else if i0 = 0xd800 then 0xdbfe (*High Surrogate*)
51
else if i0 = 0xdc00 then 0xdefe (*Low Surrogate*)
52
else if i0 = 0xe000 then 0xf8fe (*Private Zone*)
53
else if i0 = 0x20000 then 0x2a6d5(*CJK Ideographic Extension B*)
56
let cat_num = num_of_cat (cat_of_name catname) in
57
let comb_cl = int_of_string comb_cl_str in
59
let char_str = Str.split blank_pat decomp_str in
61
if 0xac00 <= i0 && i0 <= 0xd7a3 then `HangulSyllable
63
else if Str.string_match mark_pat (List.hd char_str) 0 then
66
UChar.chr_of_uint (int_of_string ("0x"^s))) (List.tl char_str)
68
match Str.matched_string (List.hd char_str) with
69
"<font>" -> `Composite (`Font, us)
70
| "<noBreak>" -> `Composite (`NoBreak, us)
71
| "<initial>" -> `Composite (`Initial, us)
72
| "<medial>" -> `Composite (`Medial, us)
73
| "<final>" -> `Composite (`Final, us)
74
| "<isolated>" -> `Composite (`Isolated, us)
75
| "<circle>" -> `Composite (`Circle, us)
76
| "<super>" -> `Composite (`Super, us)
77
| "<sub>" -> `Composite (`Sub, us)
78
| "<vertical>" -> `Composite (`Vertical, us)
79
| "<wide>" -> `Composite (`Wide, us)
80
| "<narrow>" -> `Composite (`Narrow, us)
81
| "<small>" -> `Composite (`Small, us)
82
| "<square>" -> `Composite (`Square, us)
83
| "<fraction>" -> `Composite (`Fraction, us)
84
| "<compat>" -> `Composite (`Compat, us)
85
| _ -> failwith ("Malformed Table"^s)
89
UChar.chr_of_uint (int_of_string ("0x"^s)))
92
`Composite(`Canon, us)
94
let upper_us = option_uchar_of_code upper_str in
95
let title_us = option_uchar_of_code title_str in
96
let lower_us = option_uchar_of_code lower_str in
97
let u0 = UChar.chr_of_uint i0 in
98
let u1 = UChar.chr_of_uint i1 in
99
if cat_num <> 0 then cat_tbl := UMap.add_range u0 u1 cat_num !cat_tbl;
101
combcl_tbl := UMap.add_range u0 u1 comb_cl !combcl_tbl;
102
if decomp <> `Canonform then
103
decomp_tbl := UMap.add_range u0 u1 decomp !decomp_tbl;
104
(match upper_us with None -> () | Some u' ->
105
to_upper1 := UMap.add_range u0 u1 u' !to_upper1);
106
(match title_us with None -> () | Some u' ->
107
to_title1 := UMap.add_range u0 u1 u' !to_title1);
108
(match lower_us with None -> () | Some u' ->
109
to_lower1 := UMap.add_range u0 u1 u' !to_lower1);
110
| _ -> failwith ("Malformed Table "^s)
111
done with End_of_file -> ()
113
let rec decompose decomp_tbl u =
114
try match UMap.find u decomp_tbl with
115
`Composite (`Canon, us) ->
116
List.fold_right (fun u a -> (decompose decomp_tbl u) @ a) us []
117
| `HangulSyllable -> Hangul.decompose u
122
module CompositeTbl =
123
UCharTbl.Make (struct
124
type t = (UChar.t * UChar.t) list
126
let hash = Hashtbl.hash
130
UCharTbl.Make (struct
131
type t = Unidata.decomposition_info
133
let hash = Hashtbl.hash
137
UCharTbl.Make (struct
140
let hash u = UChar.uint_code u
146
Arg.parse [] (fun s -> dir := s) "Parse the Unicode data file";
151
`Composite (`Canon, [u1; u2]) ->
152
let l = try UMap.find u1 tbl with Not_found -> [] in
153
(* Printf.printf "\\u%04x : [" (int_of_uchar u1);
154
List.iter (fun (u2, u) ->
155
Printf.printf "(\\u%04x, \\u%04x);"
159
print_string "] \n"; *)
160
UMap.add u1 ((u2, u) :: l) tbl
163
UMap.fold f !decomp_tbl UMap.empty in
164
let comp_tbl_ro = CompositeTbl.of_map [] comp_tbl in
166
UMap.fold (fun u d decomps ->
168
`Composite (`Canon, us) ->
171
(fun u a -> (decompose !decomp_tbl u) @ a)
175
UMap.add u (`Composite (`Canon, d)) decomps
176
| x -> UMap.add u x decomps)
180
let cat_tbl_ro = UCharTbl.Bits.of_map 0 !cat_tbl in
184
(UMap.map Char.chr !combcl_tbl) in
185
let decomp_tbl_ro = DecompTbl.of_map `Canonform decomps in
186
let null = UChar.chr_of_uint 0 in
187
let to_lower1_ro = UTbl.of_map null !to_lower1 in
188
let to_title1_ro = UTbl.of_map null !to_title1 in
189
let to_upper1_ro = UTbl.of_map null !to_upper1 in
192
let filename = Filename.concat !dir "/general_category_map.mar" in
193
open_out_bin filename in
194
let gen_cat = UMap.map cat_of_num !cat_tbl in
195
output_value c gen_cat; close_out c;
196
let c = open_out_bin (Filename.concat !dir "/general_category.mar") in
197
output_value c cat_tbl_ro; close_out c;
199
let filename = Filename.concat !dir "/combined_class_map.mar" in
200
open_out_bin filename in
201
output_value c !combcl_tbl; close_out c;
202
let c = open_out_bin (Filename.concat !dir "/combined_class.mar") in
203
output_value c combcl_tbl_ro; close_out c;
204
let c = open_out_bin (Filename.concat !dir "/decomposition.mar") in
205
output_value c decomp_tbl_ro; close_out c;
206
let c = open_out_bin (Filename.concat !dir "/composition.mar") in
207
output_value c comp_tbl_ro; close_out c;
208
let c = open_out_bin (Filename.concat !dir "/to_lower1.mar") in
209
output_value c to_lower1_ro; close_out c;
210
let c = open_out_bin (Filename.concat !dir "/to_title1.mar") in
211
output_value c to_title1_ro; close_out c;
212
let c = open_out_bin (Filename.concat !dir "/to_upper1.mar") in
213
output_value c to_upper1_ro; close_out c;