2
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
4
* Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
6
* The contents of this file are subject to the terms of either the GNU Lesser
7
* General Public License Version 2.1 only ("LGPL") or the Common Development and
8
* Distribution License ("CDDL")(collectively, the "License"). You may not use this
9
* file except in compliance with the License. You can obtain a copy of the CDDL at
10
* http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
11
* http://www.opensource.org/licenses/lgpl-license.php. See the License for the
12
* specific language governing permissions and limitations under the License. When
13
* distributing the software, include this License Header Notice in each file and
14
* include the full text of the License in the License file as well as the
17
* NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
19
* For Covered Software in this distribution, this License shall be governed by the
20
* laws of the State of California (excluding conflict-of-law provisions).
21
* Any litigation relating to this License shall be subject to the jurisdiction of
22
* the Federal Courts of the Northern District of California and the state courts
23
* of the State of California, with venue lying in Santa Clara County, California.
27
* If you wish your version of this file to be governed by only the CDDL or only
28
* the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
29
* include this software in this distribution under the [CDDL or LGPL Version 2.1]
30
* license." If you don't indicate a single choice of license, a recipient has the
31
* option to distribute your version of this file under either the CDDL or the LGPL
32
* Version 2.1, or to extend the choice of license to its licensees as provided
33
* above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
34
* Version 2 license, then the option applies only if the new code is made subject
35
* to such option by the copyright holder.
41
#include "pinyin_data.h"
43
/* Pinyin initials. decodeSyllable() looks a syllable's `initial` field up
 * in this table, so the position of each string is significant; slot 0 is
 * the empty string. */
static const char *initials[] = {
    "",
    "b", "p", "m", "f",
    "d", "t", "n", "l",
    "g", "k", "h",
    "j", "q", "x",
    "zh", "ch", "sh", "r",
    "z", "c", "s",
    "y", "w",
};
44
/* Element count of the initials table. */
static const unsigned num_initials = sizeof initials / sizeof initials[0];
46
/* Pinyin finals. decodeSyllable() looks a syllable's `final` field up in
 * this table, so the position of each string is significant; slot 0 is the
 * empty string. */
static const char *finals[] = {
    "",
    "a", "o", "e", "ai", "ei", "ao", "ou", "an", "en", "ang", "eng", "er",
    "i", "ia", "ie", "iao", "iu", "ian", "in", "iang", "ing",
    "u", "ua", "uo", "uai", "ui", "uan", "un", "uang", "ong",
    "v", "ue", "iong",
};
47
/* Element count of the finals table. */
static const unsigned num_finals = sizeof finals / sizeof finals[0];
49
/* Finals that have a shorter counterpart in fuzzy_finals_map, listed in the
 * same order as that map's rows (i-prefixed first, then u-prefixed). */
static const char *fuzzy_finals[] = {
    "ia", "iao", "ian", "iang", "ie",
    "ua", "uai", "uan", "uang", "ue"
};
50
/* Element count of fuzzy_finals; getInnerFuzzyFinalMap() reports this value
 * as the size that accompanies fuzzy_finals_map. */
static const unsigned num_fuzzy_finals = sizeof fuzzy_finals / sizeof fuzzy_finals[0];
52
static const unsigned fuzzy_finals_map[] = {
53
0x0e, 0x10, 1, /* ia -> a len 1 */
54
0x10, 0x60, 2, /* iao -> ao len 2 */
55
0x12, 0x80, 2, /* ian -> an len 2 */
56
0x14, 0xa0, 3, /* iang -> ang len 3 */
57
0x0f, 0x30, 1, /* ie -> e len 1 */
58
0x17, 0x10, 1, /* ua -> a len 1 */
59
0x19, 0x40, 2, /* uai -> ai len 2 */
60
0x1b, 0x80, 2, /* uan -> an len 2 */
61
0x1d, 0xa0, 3, /* uang -> ang len 3 */
62
0x20, 0x30, 1, /* ue -> e len 1 */
65
static const unsigned fuzzy_pre_syllables [] = {
66
0x0d0e0, 'n', 0x0d120, /* qian */
67
0x09080, 'g', 0x090a0, /* gang */
68
0x080e0, 'n', 0x08120, /* lian */
69
0x15090, 'g', 0x150b0, /* seng */
70
0x04010, 'n', 0x04080, /* fan */
71
0x10030, 'n', 0x10090, /* chen */
72
0x050e0, 'n', 0x05120, /* dian */
73
0x15160, 'n', 0x151c0, /* sun */
74
0x07080, 'g', 0x070a0, /* nang */
75
0x0a160, 'n', 0x0a1c0, /* kun */
76
0x05030, 'n', 0x05090, /* den */
77
0x07090, 'g', 0x070b0, /* neng */
78
0x03030, 'n', 0x03090, /* men */
79
0x09090, 'g', 0x090b0, /* geng */
80
0x10080, 'g', 0x100a0, /* chang */
81
0x0f010, 'n', 0x0f080, /* zhan */
82
0x14010, 'n', 0x14080, /* can */
83
0x07130, 'g', 0x07150, /* ning */
84
0x17080, 'g', 0x170a0, /* wang */
85
0x01090, 'g', 0x010b0, /* beng */
86
0x0f1b0, 'g', 0x0f1d0, /* zhuang */
87
0x06010, 'n', 0x06080, /* tan */
88
0x00090, 'g', 0x000b0, /* eng */
89
0x0f080, 'g', 0x0f0a0, /* zhang */
90
0x02130, 'g', 0x02150, /* ping */
91
0x08010, 'n', 0x08080, /* lan */
92
0x0e160, 'n', 0x0e1c0, /* xun */
93
0x03010, 'n', 0x03080, /* man */
94
0x0c120, 'g', 0x0c140, /* jiang */
95
0x0a1b0, 'g', 0x0a1d0, /* kuang */
96
0x01130, 'g', 0x01150, /* bing */
97
0x13010, 'n', 0x13080, /* zan */
98
0x13030, 'n', 0x13090, /* zen */
99
0x02080, 'g', 0x020a0, /* pang */
100
0x0c0d0, 'n', 0x0c130, /* jin */
101
0x14030, 'n', 0x14090, /* cen */
102
0x05010, 'n', 0x05080, /* dan */
103
0x0f030, 'n', 0x0f090, /* zhen */
104
0x01080, 'g', 0x010a0, /* bang */
105
0x17090, 'g', 0x170b0, /* weng */
106
0x00030, 'n', 0x00090, /* en */
107
0x0a080, 'g', 0x0a0a0, /* kang */
108
0x09160, 'n', 0x091c0, /* gun */
109
0x00030, 'r', 0x000c0, /* er */
110
0x0a090, 'g', 0x0a0b0, /* keng */
111
0x15080, 'g', 0x150a0, /* sang */
112
0x12030, 'n', 0x12090, /* ren */
113
0x11160, 'n', 0x111c0, /* shun */
114
0x0d160, 'n', 0x0d1c0, /* qun */
115
0x16160, 'n', 0x161c0, /* yun */
116
0x0e120, 'g', 0x0e140, /* xiang */
117
0x12080, 'g', 0x120a0, /* rang */
118
0x09170, 'n', 0x091b0, /* guan */
119
0x16130, 'g', 0x16150, /* ying */
120
0x0a170, 'n', 0x0a1b0, /* kuan */
121
0x10010, 'n', 0x10080, /* chan */
122
0x160d0, 'n', 0x16130, /* yin */
123
0x0e0d0, 'n', 0x0e130, /* xin */
124
0x07120, 'g', 0x07140, /* niang */
125
0x0b160, 'n', 0x0b1c0, /* hun */
126
0x11170, 'n', 0x111b0, /* shuan */
127
0x05080, 'g', 0x050a0, /* dang */
128
0x00080, 'g', 0x000a0, /* ang */
129
0x15010, 'n', 0x15080, /* san */
130
0x12090, 'g', 0x120b0, /* reng */
131
0x03130, 'g', 0x03150, /* ming */
132
0x030d0, 'n', 0x03130, /* min */
133
0x07030, 'n', 0x07090, /* nen */
134
0x0a010, 'n', 0x0a080, /* kan */
135
0x16080, 'g', 0x160a0, /* yang */
136
0x05090, 'g', 0x050b0, /* deng */
137
0x101b0, 'g', 0x101d0, /* chuang */
138
0x04090, 'g', 0x040b0, /* feng */
139
0x03090, 'g', 0x030b0, /* meng */
140
0x10090, 'g', 0x100b0, /* cheng */
141
0x09030, 'n', 0x09090, /* gen */
142
0x01010, 'n', 0x01080, /* ban */
143
0x07160, 'n', 0x071c0, /* nun */
144
0x15030, 'n', 0x15090, /* sen */
145
0x04080, 'g', 0x040a0, /* fang */
146
0x08160, 'n', 0x081c0, /* lun */
147
0x0a030, 'n', 0x0a090, /* ken */
148
0x0b1b0, 'g', 0x0b1d0, /* huang */
149
0x03080, 'g', 0x030a0, /* mang */
150
0x06160, 'n', 0x061c0, /* tun */
151
0x0d0d0, 'n', 0x0d130, /* qin */
152
0x02090, 'g', 0x020b0, /* peng */
153
0x05160, 'n', 0x051c0, /* dun */
154
0x10160, 'n', 0x101c0, /* chun */
155
0x09010, 'n', 0x09080, /* gan */
156
0x13090, 'g', 0x130b0, /* zeng */
157
0x06080, 'g', 0x060a0, /* tang */
158
0x14080, 'g', 0x140a0, /* cang */
159
0x0b090, 'g', 0x0b0b0, /* heng */
160
0x0e0e0, 'n', 0x0e120, /* xian */
161
0x0f160, 'n', 0x0f1c0, /* zhun */
162
0x111b0, 'g', 0x111d0, /* shuang */
163
0x11010, 'n', 0x11080, /* shan */
164
0x02010, 'n', 0x02080, /* pan */
165
0x070d0, 'n', 0x07130, /* nin */
166
0x0b080, 'g', 0x0b0a0, /* hang */
167
0x0f170, 'n', 0x0f1b0, /* zhuan */
168
0x080d0, 'n', 0x08130, /* lin */
169
0x091b0, 'g', 0x091d0, /* guang */
170
0x0b010, 'n', 0x0b080, /* han */
171
0x14160, 'n', 0x141c0, /* cun */
172
0x010d0, 'n', 0x01130, /* bin */
173
0x11030, 'n', 0x11090, /* shen */
174
0x0e130, 'g', 0x0e150, /* xing */
175
0x0d120, 'g', 0x0d140, /* qiang */
176
0x12160, 'n', 0x121c0, /* run */
177
0x11090, 'g', 0x110b0, /* sheng */
178
0x10170, 'n', 0x101b0, /* chuan */
179
0x0d130, 'g', 0x0d150, /* qing */
180
0x0c0e0, 'n', 0x0c120, /* jian */
181
0x17010, 'n', 0x17080, /* wan */
182
0x0c130, 'g', 0x0c150, /* jing */
183
0x16010, 'n', 0x16080, /* yan */
184
0x08120, 'g', 0x08140, /* liang */
185
0x0b170, 'n', 0x0b1b0, /* huan */
186
0x0b030, 'n', 0x0b090, /* hen */
187
0x11080, 'g', 0x110a0, /* shang */
188
0x0c160, 'n', 0x0c1c0, /* jun */
189
0x08130, 'g', 0x08150, /* ling */
190
0x14090, 'g', 0x140b0, /* ceng */
191
0x020d0, 'n', 0x02130, /* pin */
192
0x00010, 'n', 0x00080, /* an */
193
0x13080, 'g', 0x130a0, /* zang */
194
0x07010, 'n', 0x07080, /* nan */
195
0x0f090, 'g', 0x0f0b0, /* zheng */
196
0x13160, 'n', 0x131c0, /* zun */
197
0x08080, 'g', 0x080a0, /* lang */
201
static const unsigned fuzzy_pro_syllables [] = {
202
0x09030, 'g', 0x00030, /* ge */
203
0x090a0, 'g', 0x000a0, /* gang */
204
0x09010, 'g', 0x00010, /* ga */
205
0x12070, 'r', 0x00070, /* rou */
206
0x07050, 'n', 0x00050, /* nei */
207
0x070a0, 'n', 0x000a0, /* nang */
208
0x070b0, 'n', 0x000b0, /* neng */
209
0x090b0, 'g', 0x000b0, /* geng */
210
0x07070, 'n', 0x00070, /* nou */
211
0x12030, 'r', 0x00030, /* re */
212
0x12090, 'r', 0x00090, /* ren */
213
0x09070, 'g', 0x00070, /* gou */
214
0x120a0, 'r', 0x000a0, /* rang */
215
0x120b0, 'r', 0x000b0, /* reng */
216
0x12080, 'r', 0x00080, /* ran */
217
0x12060, 'r', 0x00060, /* rao */
218
0x07090, 'n', 0x00090, /* nen */
219
0x09050, 'g', 0x00050, /* gei */
220
0x09090, 'g', 0x00090, /* gen */
221
0x09060, 'g', 0x00060, /* gao */
222
0x09080, 'g', 0x00080, /* gan */
223
0x09040, 'g', 0x00040, /* gai */
224
0x07060, 'n', 0x00060, /* nao */
225
0x07010, 'n', 0x00010, /* na */
226
0x07040, 'n', 0x00040, /* nai */
227
0x07080, 'n', 0x00080, /* nan */
228
0x07030, 'n', 0x00030, /* ne */
232
static const char * fuzzy_pairs[] = {
248
/* Number of pairs in fuzzy_pairs: the element count halved, since each pair
 * takes two consecutive strings. */
static const unsigned num_fuzzy_pairs = (sizeof fuzzy_pairs / sizeof fuzzy_pairs[0]) / 2;
250
static const char * auto_correction_pairs[] = {
257
/* Number of pairs in auto_correction_pairs: the element count halved, since
 * each pair takes two consecutive strings. */
static const unsigned num_auto_correction_pairs = (sizeof auto_correction_pairs / sizeof auto_correction_pairs[0]) / 2;
259
static const TPyTabEntry
701
pytab_entry_compare (const char *s, TPyTabEntry *v)
702
{return strcmp (s, v->pystr);}
705
CPinyinData::encodeSyllable (const char *pinyin)
707
typedef int (*bsearch_compare) (const void*, const void*);
708
TPyTabEntry *e = (TPyTabEntry*) bsearch (pinyin, pinyin_table,
709
sizeof(pinyin_table)/sizeof(pinyin_table[0]),
710
sizeof(pinyin_table[0]),
711
(bsearch_compare) pytab_entry_compare);
719
CPinyinData::decodeSyllable (TSyllable s, const char **i, const char **f)
721
if (i) *i = initials[s.initial];
722
if (f) *f = finals[s.final];
724
static char buf[128];
725
snprintf (buf, sizeof(buf), "%s%s", initials[s.initial], finals[s.final]);
727
typedef int (*bsearch_compare) (const void*, const void*);
728
TPyTabEntry *e = (TPyTabEntry*) bsearch (buf, pinyin_table,
729
sizeof(pinyin_table)/sizeof(pinyin_table[0]),
730
sizeof(pinyin_table[0]),
731
(bsearch_compare) pytab_entry_compare);
740
CPinyinData::getAutoCorrectionPairs (unsigned &num)
742
num = num_auto_correction_pairs;
743
return auto_correction_pairs;
747
CPinyinData::getFuzzyPairs (unsigned &num)
749
num = num_fuzzy_pairs;
754
CPinyinData::getInitials (unsigned &num)
761
CPinyinData::getFinals (unsigned &num)
768
CPinyinData::getPinyinTable(unsigned &num)
770
num = sizeof(pinyin_table) / sizeof(TPyTabEntry);
775
CPinyinData::getInnerFuzzyFinalMap (unsigned &num)
777
num = num_fuzzy_finals;
778
return fuzzy_finals_map;
782
CPinyinData::getFuzzyPreProSyllables (const unsigned **pre_syls, const unsigned **pro_syls)
784
*pre_syls = fuzzy_pre_syllables;
785
*pro_syls = fuzzy_pro_syllables;