50
53
struct TSyllableInfo {
54
TSyllableInfo(const char* py=NULL, int cost=0) : m_py(py), m_cost(cost) {}
55
bool operator< (const TSyllableInfo& b) const { return m_py < b.m_py; }
57
TSyllableInfo(const char* py = NULL, int cost = 0) : m_py(py), m_cost(cost)
61
operator<(const TSyllableInfo& b) const
58
67
#ifdef HAVE_ICONV_H
59
bool isCorrectConverted(const char* utf8, iconv_t ic, iconv_t ric)
69
isCorrectConverted(const char* utf8, iconv_t ic, iconv_t ric)
61
71
static char gbstr[256];
62
72
static char utstr[256];
64
74
TIConvSrcPtr src = (TIConvSrcPtr)utf8;
65
size_t srclen = strlen((char*)src)+1;
66
char* dst = (char *)gbstr;
75
size_t srclen = strlen((char*)src) + 1;
76
char* dst = (char*)gbstr;
67
77
size_t dstlen = 256;
68
78
size_t res = iconv(ic, &src, &srclen, &dst, &dstlen);
70
80
if (res != size_t(-1) && srclen == 0) {
71
81
// do the reverse conversion and compare the results
72
82
src = (TIConvSrcPtr)gbstr;
73
srclen = strlen((char*)src)+1;
83
srclen = strlen((char*)src) + 1;
76
86
res = iconv(ric, &src, &srclen, &dst, &dstlen);
77
87
if (res != size_t(-1) && srclen == 0)
78
return (strcmp(utf8, utstr) == 0);
88
return(strcmp(utf8, utstr) == 0);
83
93
// return: bit 0x1: contains some GBK chars outside of GB2312; bit 0x2: contains some GB18030 chars outside of GBK
84
unsigned getPureGBEncoding(const char* utf8str)
95
getPureGBEncoding(const char* utf8str)
86
97
static iconv_t ic_gb = iconv_open("GB2312", "UTF-8");
87
98
static iconv_t ic_gbk = iconv_open("GBK", "UTF-8");
93
104
if (!isCorrectConverted(utf8str, ic_gb, ric_gb)) {
94
105
ret = 1; // at least it is contains some GBK char
95
106
if (!isCorrectConverted(utf8str, ic_gbk, ric_gbk))
96
ret = 3; //contains some GB18030-only char
107
ret = 3; //contains some GB18030-only char
99
fprintf(stderr, "==> GB category of (%s) is (0x%x)\n ", utf8str, ret);
110
fprintf(stderr, "==> GB category of (%s) is (0x%x)\n ", utf8str, ret);
105
116
#else // !HAVE_ICONV_H
106
unsigned getPureGBEncoding(const char* utf8str)
118
getPureGBEncoding(const char* utf8str)
138
153
while ((*p >= 'a' && *p <= 'z') || (*p == '\''))
140
155
if ((p > s) && ((*p == 0) || (*p == ':'))) {
159
cost = -log2(atof(p)/100);
146
161
pyset.insert(TSyllableInfo(s, cost));
189
193
while (fgets(buf, sizeof(buf), fp) != NULL) {
190
194
if (!parseLine(buf, word_buf, id, pyset)) {
191
195
if (word_buf[0] != L'<' && word_buf[0] != 0) {
192
if (m_Lexicon.size() < id+1) m_Lexicon.resize(id+1);
196
if (m_Lexicon.size() < id + 1) m_Lexicon.resize(id + 1);
193
197
m_Lexicon[id] = std::string(word_buf);
200
204
std::set<TSyllableInfo>::const_iterator ite = pyset.end();
201
205
for (; its != ite; ++its) {
202
206
const char *pystr = its->m_py.c_str();
203
int cost = its->m_cost;
206
if (m_Lexicon.size() < id+1) m_Lexicon.resize(id+1);
207
m_Lexicon[id] = std::string(word_buf);
209
CPinyinTrieMaker::TWordId wid(id, cost, false, gbcategory);
210
suc = insertFullPinyinPair(pystr, wid) && suc;
211
} else { // cache the rarely seen phonetic
213
record.word_buf = word_buf;
214
record.pystr = pystr;
216
record.gbcategory = gbcategory;
218
cached_words.push_back (record);
207
if (m_Lexicon.size() < id + 1) m_Lexicon.resize(id + 1);
208
m_Lexicon[id] = std::string(word_buf);
210
CPinyinTrieMaker::TWordId wid(id, its->m_cost, false, gbcategory);
211
suc = insertFullPinyinPair(pystr, wid) && suc;
224
// insert the cached words that have rarely seen phonetics
225
// FIXME: may use the pinlv information in unicode data later
227
for (CWordCache::iterator it = cached_words.begin(); it != cached_words.end(); ++it, ++id) {
228
if (m_Lexicon.size() < id+1) m_Lexicon.resize(id+1);
229
m_Lexicon[id] = it->word_buf;
230
int cost = 30 / (-it->cost);
231
CPinyinTrieMaker::TWordId wid(id, cost, true, it->gbcategory);
232
suc = insertFullPinyinPair(it->pystr.c_str(), wid) && suc;
235
216
printf("\n %zd primitive nodes", TNode::m_AllNodes.size()); fflush(stdout);
237
218
threadNonCompletePinyin();
238
219
printf("\n %zd total nodes", TNode::m_AllNodes.size()); fflush(stdout);
240
221
std::string pyPrefix = "";
241
//print(stderr, &m_RootNode, pyPrefix);
242
222
printf("\n"); fflush(stdout);
247
227
CPinyinTrieMaker::CNodeList CPinyinTrieMaker::TNode::m_AllNodes;
248
228
CPinyinTrieMaker::TNode::TNode()
249
: m_bFullSyllableTransfer(false), m_bExpanded(false), m_WordIdSet(),
250
m_Trans(), m_cmbNodes()
229
: m_bExpanded(false), m_bFullSyllableTransfer(false)
252
231
m_AllNodes.push_back(this);
256
CPinyinTrieMaker::PNodeSet::operator< (const PNodeSet& another) const
235
CPinyinTrieMaker::PNodeSet::operator<(const PNodeSet& another) const
258
237
CNodeSet::const_iterator t1 = m_pns->begin();
259
238
CNodeSet::const_iterator t2 = m_pns->end();
276
255
for (; t1 != t2 && a1 != a2; ++t1, ++a1) {
277
256
if (*t1 != *a1) return false;
279
return (a1 == a2 && t1 != t2);
258
return(a1 == a2 && t1 != t2);
283
parseFullPinyin (const char *pinyin, std::vector<TSyllable> &ret)
262
parseFullPinyin(const char *pinyin, std::vector<TSyllable> &ret)
285
char *buf = strdup (pinyin);
264
char *buf = strdup(pinyin);
286
265
char *p = buf, *q = buf;
292
271
unsigned s = CPinyinData::encodeSyllable(q);
294
ret.push_back (TSyllable(s));
273
ret.push_back(TSyllable(s));
296
printf ("\nWarning! unrecognized syllable %s", q);
275
printf("\nWarning! unrecognized syllable %s", q);
303
unsigned s = CPinyinData::encodeSyllable(q);
305
ret.push_back (TSyllable(s));
307
printf ("\nWarning! unrecognized syllable %s", q);
282
unsigned s = CPinyinData::encodeSyllable(q);
284
ret.push_back(TSyllable(s));
286
printf("\nWarning! unrecognized syllable %s", q);
314
CPinyinTrieMaker::print(FILE* fp, TNode* root, std::string& pinyin)
316
if (root && root->m_WordIdSet.size() > 0) {
317
fprintf(fp, "%s", pinyin.c_str());
318
CWordSet::const_iterator itId = root->m_WordIdSet.begin();
319
CWordSet::const_iterator itIdLast = root->m_WordIdSet.end();
320
for (; itId != itIdLast; ++itId) {
321
fprintf(fp, " %s", m_Lexicon[itId->anony.m_id].c_str());
326
CTrans::const_iterator itTrans = root->m_Trans.begin();
327
CTrans::const_iterator itTransLast = root->m_Trans.end();
328
for (; itTrans != itTransLast; ++itTrans) {
329
const char *str = CPinyinData::decodeSyllable(itTrans->first);
330
pinyin = pinyin + str + '\'';
331
print(fp, itTrans->second, pinyin);
332
pinyin.resize(pinyin.size()-strlen(str)-1);
337
292
CPinyinTrieMaker::TNode*
338
293
CPinyinTrieMaker::insertTransfer(TNode* pnode, unsigned s)
355
310
TNode *pnode = &m_RootNode;
356
311
std::vector<TSyllable> syllables;
357
parseFullPinyin (pinyin, syllables);
312
parseFullPinyin(pinyin, syllables);
359
314
if (syllables.empty())
385
342
CNodeSet::const_iterator it = nodes.begin();
386
343
CNodeSet::const_iterator ite = nodes.end();
387
for (; it != ite; ++it)
388
p->m_WordIdSet.insert ((*it)->m_WordIdSet.begin(), (*it)->m_WordIdSet.end());
344
for (; it != ite; ++it) {
345
CWordSet::const_iterator wit = (*it)->m_WordIdSet.begin();
346
CWordSet::const_iterator wite = (*it)->m_WordIdSet.end();
348
for (; wit != wite; ++wit) {
349
CWordSet::iterator tmp = p->m_WordIdSet.find (*wit);
351
if (tmp == p->m_WordIdSet.end()) {
352
p->m_WordIdSet.insert (*wit);
353
} else if (tmp->anony.m_cost > wit->anony.m_cost) {
354
p->m_WordIdSet.erase (tmp);
355
p->m_WordIdSet.insert (*wit);
391
361
pnode->m_Trans[s] = p;
396
CPinyinTrieMaker::combineInitialTrans (TNode *pnode)
366
CPinyinTrieMaker::combineInitialTrans(TNode *pnode)
398
368
std::map<unsigned, CNodeSet> combTrans;
400
370
CTrans::const_iterator itTrans = pnode->m_Trans.begin();
401
371
CTrans::const_iterator itTransLast = pnode->m_Trans.end();
402
372
for (; itTrans != itTransLast; ++itTrans) {
403
TSyllable s = (TSyllable) itTrans->first;
373
TSyllable s = (TSyllable)itTrans->first;
405
375
s.final = s.tone = 0;
406
376
combTrans[s].insert(itTrans->second);
410
380
std::map<unsigned, CNodeSet>::const_iterator itCombTrans = combTrans.begin();
411
381
for (; itCombTrans != combTrans.end(); ++itCombTrans)
412
addCombinedTransfers (pnode, itCombTrans->first, itCombTrans->second);
382
addCombinedTransfers(pnode, itCombTrans->first, itCombTrans->second);
416
CPinyinTrieMaker::expandCombinedNode (TNode *pnode)
386
CPinyinTrieMaker::expandCombinedNode(TNode *pnode)
418
assert (pnode->m_cmbNodes.size() >= 1);
388
assert(pnode->m_cmbNodes.size() >= 1);
420
390
std::map<unsigned, CNodeSet> combTrans;
421
391
CNodeSet::const_iterator itNode = pnode->m_cmbNodes.begin();
430
400
std::map<unsigned, CNodeSet>::const_iterator itCombTrans = combTrans.begin();
431
for (; itCombTrans != combTrans.end(); ++itCombTrans) {
401
for (; itCombTrans != combTrans.end(); ++itCombTrans) {
433
403
unsigned s = itCombTrans->first;
434
404
CNodeSet nodes = itCombTrans->second;
437
407
if (itStateMap != m_StateMap.end())
438
408
p = itStateMap->second;
440
p = addCombinedTransfers (pnode, s, nodes);
410
p = addCombinedTransfers(pnode, s, nodes);
442
412
pnode->m_Trans[s] = p;
452
422
for (; itNode != TNode::m_AllNodes.end(); ++itNode) {
453
423
TNode* pnode = *itNode;
454
424
if (pnode->m_bExpanded)
455
combineInitialTrans (pnode);
425
combineInitialTrans(pnode);
457
expandCombinedNode (pnode);
427
expandCombinedNode(pnode);
498
468
for (; itWordStr != itWordStrLast; ++itWordStr) {
499
469
MBSTOWCS(wbuf, itWordStr->c_str(), 1024);
500
470
int sz = WCSLEN(wbuf);
501
offset += (sz+1)*sizeof(TWCHAR);
471
offset += (sz + 1) * sizeof(TWCHAR);
504
474
Writer f(fp, revert_endian);
506
476
suc = f.write(nWord);
507
477
suc = f.write(nNode);
508
478
suc = f.write(lexiconOffset);
510
480
itNode = TNode::m_AllNodes.begin();
511
481
itNodeLast = TNode::m_AllNodes.end();
513
483
for (; itNode != itNodeLast && suc; ++itNode) {
514
484
CPinyinTrie::TNode outNode;
515
485
TNode *pnode = *itNode;
522
492
CWordSet::const_iterator itId = pnode->m_WordIdSet.begin();
523
493
CWordSet::const_iterator itIdLast = pnode->m_WordIdSet.end();
524
for (; itId != itIdLast && outNode.m_csLevel<3; ++itId) {
494
for (; itId != itIdLast && outNode.m_csLevel < 3; ++itId) {
525
495
if (outNode.m_csLevel < itId->anony.m_csLevel)
526
496
outNode.m_csLevel = itId->anony.m_csLevel;
542
512
itId = pnode->m_WordIdSet.begin();
543
513
itIdLast = pnode->m_WordIdSet.end();
544
514
for (; itId != itIdLast; ++itId)
545
vec.push_back(TWordInfo(*itId, psrt->getCost(*itId), psrt->isSeen(*itId)));
515
vec.push_back(TWordInfo(*itId, psrt->getCost(*itId) + itId->anony.m_cost,
516
psrt->isSeen(*itId)));
546
517
std::make_heap(vec.begin(), vec.end());
547
518
std::sort_heap(vec.begin(), vec.end());
551
522
for (; itv != itve && suc; ++itv) {
552
523
CPinyinTrie::TWordIdInfo wi;
553
524
wi.m_id = itv->m_id.anony.m_id;
554
assert (wi.m_id < nWord);
525
assert(wi.m_id < nWord);
555
526
wi.m_csLevel = itv->m_id.anony.m_csLevel;
556
wi.m_bSeen = ((itv->m_bSeen)?(1):(0));
527
wi.m_bSeen = ((itv->m_bSeen) ? (1) : (0));
557
528
wi.m_cost = itv->m_id.anony.m_cost;
558
529
suc = f.write(wi);
564
535
for (; itWordStr != itWordStrLast && suc; ++itWordStr) {
565
536
MBSTOWCS(wbuf, itWordStr->c_str(), 1024);
566
537
int sz = WCSLEN(wbuf);
567
suc = f.write(wbuf, (sz+1));
538
suc = f.write(wbuf, (sz + 1));