63
// Fills m_fuzzyFinalMap, m_fuzzyPreMap and m_fuzzyProMap from tables obtained
// from CPinyinData.
// NOTE(review): this span reads like a side-by-side diff dump: bare numbers are
// old/new line numbers and most statements appear twice (old spacing, then new
// spacing). Braces and some continuation lines are not visible here.
void CGetFuzzySegmentsOp::_initMaps ()
65
CGetFuzzySegmentsOp::_initMaps()
65
67
unsigned num_of_fuzzy_finals;
66
// getInnerFuzzyFinalMap receives num_of_fuzzy_finals, which is then used as
// the loop bound below.
const unsigned * fuzzy_final_map = CPinyinData::getInnerFuzzyFinalMap (num_of_fuzzy_finals);
68
// This revision compares a signed int index against an unsigned count; the
// later revision of the same loop below uses size_t.
for (int i = 0; i < num_of_fuzzy_finals; ++i)
70
unsigned f = *(fuzzy_final_map++);
68
const unsigned * fuzzy_final_map = CPinyinData::getInnerFuzzyFinalMap(
71
for (size_t i = 0; i < num_of_fuzzy_finals; ++i) {
72
// Three consecutive table entries are consumed: f (map key), then _f and l
// (stored as the mapped pair).
unsigned f = *(fuzzy_final_map++);
71
73
unsigned _f = *(fuzzy_final_map++);
72
unsigned l = *(fuzzy_final_map++);
74
m_fuzzyFinalMap.insert (std::make_pair(f, std::make_pair(_f, l)));
74
unsigned l = *(fuzzy_final_map++);
76
m_fuzzyFinalMap.insert(std::make_pair(f, std::make_pair(_f, l)));
77
79
const unsigned *fuzzy_pre_syls, *fuzzy_pro_syls;
78
CPinyinData::getFuzzyPreProSyllables (&fuzzy_pre_syls, &fuzzy_pro_syls);
80
CPinyinData::getFuzzyPreProSyllables(&fuzzy_pre_syls, &fuzzy_pro_syls);
80
82
// The "pre" table is walked until a zero entry is met at the loop test; each
// record is read as s (key), c (narrowed to char) and _s.
while (*fuzzy_pre_syls) {
81
unsigned s = *(fuzzy_pre_syls++);
82
char c = *(fuzzy_pre_syls++);
83
unsigned s = *(fuzzy_pre_syls++);
84
char c = *(fuzzy_pre_syls++);
83
85
unsigned _s = *(fuzzy_pre_syls++);
84
m_fuzzyPreMap.insert (std::make_pair(s, std::make_pair(c, _s)));
86
m_fuzzyPreMap.insert(std::make_pair(s, std::make_pair(c, _s)));
87
89
// Same record layout and termination test for the "pro" table.
while (*fuzzy_pro_syls) {
88
unsigned s = *(fuzzy_pro_syls++);
89
char c = *(fuzzy_pro_syls++);
90
unsigned s = *(fuzzy_pro_syls++);
91
char c = *(fuzzy_pro_syls++);
90
92
unsigned _s = *(fuzzy_pro_syls++);
91
m_fuzzyProMap.insert (std::make_pair(s, std::make_pair(c, _s)));
93
m_fuzzyProMap.insert(std::make_pair(s, std::make_pair(c, _s)));
95
// Removes trailing entries from fuzzy_segs with respect to seg and returns
// invalidatedFrom, which starts as UINT_MAX and is assigned seg1.m_start.
// NOTE(review): braces, and any break/continue statements, are not visible in
// this view, so which statements are governed by the `if` and where the loop
// exits cannot be determined from here -- confirm against the full file.
unsigned CGetFuzzySegmentsOp::_invalidateSegments (IPySegmentor::TSegmentVec& fuzzy_segs, IPySegmentor::TSegment& seg)
98
CGetFuzzySegmentsOp::_invalidateSegments(IPySegmentor::TSegmentVec& fuzzy_segs,
99
IPySegmentor::TSegment& seg)
97
101
unsigned invalidatedFrom = UINT_MAX;
99
IPySegmentor::TSegmentVec::reverse_iterator it = fuzzy_segs.rbegin();
103
IPySegmentor::TSegmentVec::reverse_iterator it = fuzzy_segs.rbegin();
100
104
IPySegmentor::TSegmentVec::reverse_iterator ite = fuzzy_segs.rend();
102
// The vector is walked backwards two entries per step.
// NOTE(review): it += 2 presumes fuzzy_segs holds an even number of entries;
// verify against the code that fills it.
for (; it != ite; it+=2)
104
IPySegmentor::TSegment& seg1 = *(it+1);
106
for (; it != ite; it += 2) {
107
// In reverse order *it is the later entry of the pair (seg2) and *(it + 1)
// the entry stored just before it (seg1).
IPySegmentor::TSegment& seg1 = *(it + 1);
105
108
IPySegmentor::TSegment& seg2 = *it;
107
unsigned r = seg2.m_start+seg2.m_len;
110
// r is the offset one past the end of seg2.
unsigned r = seg2.m_start + seg2.m_len;
108
111
if (r <= seg.m_start)
111
114
invalidatedFrom = seg1.m_start;
114
fuzzy_segs.erase (it.base(), fuzzy_segs.end());
117
// it.base() is the forward iterator just past the element `it` refers to, so
// everything after that element is erased.
fuzzy_segs.erase(it.base(), fuzzy_segs.end());
116
119
return invalidatedFrom;
119
// Works on the last segment of segs: first calls _invalidateSegments on
// fuzzy_segs, then, when m_bInnerFuzzyEnabled is set and the segment's final is
// a key of m_fuzzyFinalMap, appends two derived segments (xi, an) to fuzzy_segs.
// Returns the smaller of updatedFrom and invalidatedFrom.
// NOTE(review): this view is a diff dump; the line numbers jump between the
// last visible statement of the split and the return, so further logic may
// exist that is not shown here.
unsigned CGetFuzzySegmentsOp::operator () (IPySegmentor::TSegmentVec& segs, IPySegmentor::TSegmentVec& fuzzy_segs, wstring& input)
123
CGetFuzzySegmentsOp::operator ()(IPySegmentor::TSegmentVec& segs,
124
IPySegmentor::TSegmentVec& fuzzy_segs,
121
127
IPySegmentor::TSegment& seg = segs.back();
122
unsigned invalidatedFrom = _invalidateSegments (fuzzy_segs, seg);
128
unsigned invalidatedFrom = _invalidateSegments(fuzzy_segs, seg);
124
130
unsigned updatedFrom = UINT_MAX;
125
TSyllable syl = (TSyllable) seg.m_syllables[0];
127
if (m_bInnerFuzzyEnabled)
128
{ // xian -> xian, xi'an
129
CInnerFuzzyFinalMap::iterator it = m_fuzzyFinalMap.find (syl.final);
131
if (it != m_fuzzyFinalMap.end())
133
unsigned an_syl = it->second.first;
134
unsigned an_len = it->second.second;
136
unsigned xi_len = seg.m_len - an_len;
137
wstring wstr = input.substr (seg.m_start, xi_len);
131
TSyllable syl = (TSyllable)seg.m_syllables[0];
133
if (m_bInnerFuzzyEnabled) { // xian -> xian, xi'an
134
CInnerFuzzyFinalMap::iterator it = m_fuzzyFinalMap.find(syl.final);
136
if (it != m_fuzzyFinalMap.end()) {
137
// The mapped pair supplies the syllable value (an_syl) and the length
// (an_len) of the trailing part.
unsigned an_syl = it->second.first;
138
unsigned an_len = it->second.second;
140
// The leading part takes whatever remains of the segment length.
unsigned xi_len = seg.m_len - an_len;
141
wstring wstr = input.substr(seg.m_start, xi_len);
139
143
// The wide substring is narrowed to a std::string. Two ways of doing that
// follow; presumably they are alternatives chosen by _RW_STD_STL, but the
// #else/#endif lines are not visible in this view.
#ifndef _RW_STD_STL
140
std::string xi_str (wstr.begin(), wstr.end());
144
std::string xi_str(wstr.begin(), wstr.end());
142
146
std::string xi_str;
143
147
for (wstring::iterator it = wstr.begin(); it != wstr.end(); ++it)
144
xi_str.push_back (*it);
148
xi_str.push_back(*it);
147
unsigned xi_syl = CPinyinData::encodeSyllable (xi_str.c_str());
151
unsigned xi_syl = CPinyinData::encodeSyllable(xi_str.c_str());
152
156
// xi: copy of the last segment, shortened to xi_len and given the re-encoded
// syllable.
IPySegmentor::TSegment xi = segs.back();
153
157
xi.m_len = xi_len;
154
158
xi.m_syllables[0] = xi_syl;
156
160
// an: copy of the last segment, shifted past xi, carrying an_syl and flagged
// as an inner-fuzzy segment.
IPySegmentor::TSegment an = segs.back();
158
162
an.m_start += xi_len;
159
163
an.m_syllables[0] = an_syl;
160
164
an.m_inner_fuzzy = true;
162
fuzzy_segs.push_back (xi);
163
fuzzy_segs.push_back (an);
166
// The two halves are appended together, xi first.
fuzzy_segs.push_back(xi);
167
fuzzy_segs.push_back(an);
165
169
updatedFrom = xi.m_start;
198
return std::min (updatedFrom, invalidatedFrom);
202
return std::min(updatedFrom, invalidatedFrom);
202
// Default constructor: sets the three operator pointers to NULL and builds
// m_pytrie from the base/check/value arrays, passing the element count of
// base (sizeof(base) / sizeof(*base)).
// NOTE(review): the later revision's last visible initializer ends with a
// comma, so further initializers follow that are not shown in this view.
CQuanpinSegmentor::CQuanpinSegmentor ()
204
m_pGetFuzzySyllablesOp(NULL),
206
CQuanpinSegmentor::CQuanpinSegmentor ()
207
: m_pGetFuzzySyllablesOp(NULL),
205
208
m_pGetCorrectionPairOp(NULL),
206
209
m_pGetFuzzySegmentsOp(NULL),
207
m_pytrie(base, check, value, sizeof(base)/sizeof(*base))
210
m_pytrie(base, check, value, sizeof(base) / sizeof(*base)),
212
// Forwards pyTrieFileName to m_pytrie.load and returns that call's result.
bool CQuanpinSegmentor::load(const char * pyTrieFileName)
217
CQuanpinSegmentor::load(const char * pyTrieFileName)
214
return m_pytrie.load (pyTrieFileName);
219
return m_pytrie.load(pyTrieFileName);
218
// Writes pystr to stdout one character at a time via printf, clearing bit 7 of
// each character (& 0x7f) before printing. The string is taken by value.
void print_pystr(const std::string pystr)
224
print_pystr(const std::string pystr)
220
for (const char* c = pystr.c_str(); c != pystr.c_str() + pystr.length(); ++c)
226
for (const char* c = pystr.c_str();
227
c != pystr.c_str() + pystr.length();
222
229
printf("%c", *c & 0x7f);
228
// Appends ch to m_inputBuf and updates the segmentation, storing the result in
// m_updatedFrom and returning it. When a correction-pair operator is present
// and enabled, a corrected tail is tried first; the final visible statement
// falls back to _push(ch).
// NOTE(review): this view is a diff dump; the declaration of l, the test on v
// and all braces are not visible, so the exact branch structure must be
// confirmed against the full file.
unsigned CQuanpinSegmentor::push (unsigned ch)
236
CQuanpinSegmentor::push(unsigned ch)
230
m_inputBuf.push_back (ch);
238
m_inputBuf.push_back(ch);
232
240
if (m_pGetCorrectionPairOp && m_pGetCorrectionPairOp->isEnabled()) {
233
m_pystr.push_back (ch);
241
m_pystr.push_back(ch);
235
const char * v = (*m_pGetCorrectionPairOp) (m_pystr, l);
243
// The operator returns a C string v and is also handed l; presumably l is
// filled with the length of the tail of m_pystr that v replaces -- confirm.
const char * v = (*m_pGetCorrectionPairOp)(m_pystr, l);
238
246
unsigned orig_size = m_segs.size();
239
_clear (m_pystr.size() - l);
240
m_updatedFrom = _updateWith (v);
242
if (m_segs.size () >= orig_size) {
247
// Drop the last l characters' worth of segmentation and re-segment with v.
_clear(m_pystr.size() - l);
248
m_updatedFrom = _updateWith(v);
250
if (m_segs.size() >= orig_size) {
243
251
// does not get better segmentation, revert to original
244
_clear (m_pystr.size() - strlen(v));
252
_clear(m_pystr.size() - strlen(v));
245
253
std::string new_pystr;
246
std::copy(m_inputBuf.end() - l, m_inputBuf.end(), back_inserter(new_pystr));
247
m_updatedFrom = _updateWith (new_pystr);
254
// Rebuild from the last l characters of the raw input buffer.
std::copy(m_inputBuf.end() - l, m_inputBuf.end(),
255
back_inserter(new_pystr));
256
m_updatedFrom = _updateWith(new_pystr);
249
258
if (l != strlen(v)) {
250
259
// e.g. uen -> un
251
260
// The last segment's length is adjusted by the difference between the
// typed length l and the length of v, and m_pystr is resized to the
// length of m_inputBuf.
m_segs.back().m_len += l - strlen(v);
252
261
m_pystr.resize(m_inputBuf.length());
254
std::copy(m_inputBuf.end() - l, m_inputBuf.end(), m_pystr.end() - l);
263
std::copy(m_inputBuf.end() - l, m_inputBuf.end(),
256
266
return m_updatedFrom;
259
m_pystr.resize (m_pystr.size() - 1);
269
// Removes the character appended to m_pystr above; _push below appends ch to
// m_pystr itself.
m_pystr.resize(m_pystr.size() - 1);
262
return m_updatedFrom = _push (ch);
272
return m_updatedFrom = _push(ch);
265
// Removes the last character from m_inputBuf and m_pystr and re-segments the
// affected tail. Returns (and stores in m_updatedFrom) 0 when m_pystr is empty.
// NOTE(review): this view is a diff dump; the statements between reading l and
// the `return m_updatedFrom = size - 1;` line (including its condition) are
// not visible.
// NOTE(review): emptiness is tested on m_pystr but size is taken from
// m_inputBuf and applied to both; this assumes the two always have equal
// length -- confirm.
unsigned CQuanpinSegmentor::pop ()
276
CQuanpinSegmentor::pop()
267
278
if (m_pystr.empty())
268
279
return m_updatedFrom = 0;
270
unsigned size = m_inputBuf.size ();
271
m_inputBuf.resize (size - 1);
272
m_pystr.resize (size - 1);
281
unsigned size = m_inputBuf.size();
282
m_inputBuf.resize(size - 1);
283
m_pystr.resize(size - 1);
274
285
// l is the length of the last segment, taken before the segment list is
// modified.
unsigned l = m_segs.back().m_len;
278
289
return m_updatedFrom = size - 1;
280
std::string new_pystr = m_pystr.substr (size-l);
281
m_pystr.resize (size-l);
291
// Cut m_pystr back to offset size - l and feed the removed tail through
// _updateWith again.
std::string new_pystr = m_pystr.substr(size - l);
292
m_pystr.resize(size - l);
283
294
m_updatedFrom = _updateWith(new_pystr);
285
296
return m_updatedFrom;
288
// Inserts ch at position idx of m_inputBuf and m_pystr, discards the segments
// from index j onward and re-segments by passing m_pystr.substr(i) to
// _updateWith. Returns the resulting m_updatedFrom.
// i and j are produced by _locateSegment(idx, i, j); its parameters are named
// strIdx and segIdx.
// NOTE(review): this view is a diff dump; the declarations of i and j and any
// statement that trims m_pystr before re-segmenting are not visible.
unsigned CQuanpinSegmentor::insertAt (unsigned idx, unsigned ch)
300
CQuanpinSegmentor::insertAt(unsigned idx, unsigned ch)
291
_locateSegment (idx, i, j);
293
m_inputBuf.insert (idx, 1, ch);
294
m_pystr.insert (idx, 1, ch);
296
std::string new_pystr = m_pystr.substr (i);
298
m_segs.erase (m_segs.begin()+j, m_segs.end());
300
m_updatedFrom = _updateWith (new_pystr);
303
_locateSegment(idx, i, j);
305
m_inputBuf.insert(idx, 1, ch);
306
m_pystr.insert(idx, 1, ch);
308
std::string new_pystr = m_pystr.substr(i);
310
m_segs.erase(m_segs.begin() + j, m_segs.end());
312
m_updatedFrom = _updateWith(new_pystr);
302
314
return m_updatedFrom;
305
// Erases one character at idx from m_inputBuf and m_pystr (idx is first
// incremented when backward is false), discards the segments from index j
// onward and re-segments by passing m_pystr.substr(i) to _updateWith.
// Returns the resulting m_updatedFrom.
// NOTE(review): this view is a diff dump; the declarations of i and j and any
// statement that trims m_pystr before re-segmenting are not visible.
unsigned CQuanpinSegmentor::deleteAt (unsigned idx, bool backward)
318
CQuanpinSegmentor::deleteAt(unsigned idx, bool backward)
308
321
if (!backward) idx += 1;
309
_locateSegment (idx, i, j);
311
m_inputBuf.erase (idx, 1);
312
m_pystr.erase (idx, 1);
314
std::string new_pystr = m_pystr.substr (i);
316
m_segs.erase (m_segs.begin()+j, m_segs.end());
318
m_updatedFrom = _updateWith (new_pystr);
322
_locateSegment(idx, i, j);
324
m_inputBuf.erase(idx, 1);
325
m_pystr.erase(idx, 1);
327
std::string new_pystr = m_pystr.substr(i);
329
m_segs.erase(m_segs.begin() + j, m_segs.end());
331
m_updatedFrom = _updateWith(new_pystr);
320
333
return m_updatedFrom;
323
// Resizes m_inputBuf to `from` characters and returns the result of
// _clear(from).
// NOTE(review): this view is a diff dump; only the older revision's return
// line is visible.
unsigned CQuanpinSegmentor::clear (unsigned from)
337
CQuanpinSegmentor::clear(unsigned from)
325
m_inputBuf.resize (from);
326
return _clear (from);
339
m_inputBuf.resize(from);
329
// Re-segments the part of m_pystr between offset i and `from`, where i and j
// come from _locateSegment(from, i, j): segments from index j onward are
// erased and m_pystr.substr(i, from - i) is passed to _updateWith together
// with `from`. Returns the resulting m_updatedFrom.
// NOTE(review): this view is a diff dump; the declarations of i and j and any
// statement that shortens m_pystr are not visible.
unsigned CQuanpinSegmentor::_clear (unsigned from)
344
CQuanpinSegmentor::_clear(unsigned from)
332
_locateSegment (from, i, j);
335
std::string new_pystr = m_pystr.substr (i, from-i);
337
m_segs.erase (m_segs.begin()+j, m_segs.end());
339
m_updatedFrom = _updateWith (new_pystr, from);
347
_locateSegment(from, i, j);
350
std::string new_pystr = m_pystr.substr(i, from - i);
352
m_segs.erase(m_segs.begin() + j, m_segs.end());
354
m_updatedFrom = _updateWith(new_pystr, from);
341
356
return m_updatedFrom;
344
// Scans m_segs from the front, accumulating segment lengths into strIdx, and
// tests whether the current segment extends past idx
// (strIdx + m_len > idx). Both outputs start at 0.
// NOTE(review): this view is a diff dump; the statement executed when the test
// succeeds (presumably a break) and the update of segIdx are not visible --
// confirm against the full file.
void CQuanpinSegmentor::_locateSegment (unsigned idx, unsigned &strIdx, unsigned &segIdx)
360
CQuanpinSegmentor::_locateSegment(unsigned idx,
346
364
strIdx = segIdx = 0;
348
TSegmentVec::iterator it = m_segs.begin();
366
TSegmentVec::iterator it = m_segs.begin();
349
367
TSegmentVec::iterator ite = m_segs.end();
351
369
for (; it != ite; ++it) {
352
370
if (strIdx + (*it).m_len > idx)
355
373
strIdx += (*it).m_len;
360
// Appends ch to m_pystr, asks m_pytrie.match_longest for a match over m_pystr
// read backwards (result in v, l passed alongside), and updates m_segs
// according to l:
//   l == 0                 -> a one-character non-syllable segment is pushed
//   l == 1                 -> possibly a new one-character segment
//   l == last length + 1   -> the last segment is extended by one
//   otherwise              -> trailing segments are rebuilt from new matches
// ret is set in every branch; afterwards the fuzzy-segment and fuzzy-syllable
// operators are applied when present and enabled.
// NOTE(review): this view is a diff dump (bare numbers are old/new line
// numbers, most statements appear twice). The declarations of l and ret, many
// braces, parts of the rebuild loop and the final return are not visible.
unsigned CQuanpinSegmentor::_push (unsigned ch)
379
CQuanpinSegmentor::_push(unsigned ch)
363
m_pystr.push_back (ch);
364
int v = m_pytrie.match_longest (m_pystr.rbegin(), m_pystr.rend(), l);
382
m_pystr.push_back(ch);
383
int v = m_pytrie.match_longest(m_pystr.rbegin(), m_pystr.rend(), l);
366
385
if (l == 0) { // not a valid syllable character, e.g., \', i, u, or A-Z
367
386
IPySegmentor::ESegmentType seg_type;
368
387
// An apostrophe counts as a syllable separator only when m_inputBuf holds
// more than one character.
if (ch == '\'' && m_inputBuf.size() > 1)
369
388
seg_type = IPySegmentor::SYLLABLE_SEP;
370
else if (islower (ch))
389
else if (islower(ch))
371
390
seg_type = IPySegmentor::INVALID;
373
392
seg_type = IPySegmentor::STRING;
375
ret = m_pystr.size () - 1;
376
m_segs.push_back (TSegment (ch, ret, 1, seg_type));
379
else if (l == 1) { // possible a new segment
380
int last_idx = m_pystr.size () - 2;
381
if ( last_idx >= 0 && (m_pystr[last_idx] & 0x80)) {
394
ret = m_pystr.size() - 1;
395
m_segs.push_back(TSegment(ch, ret, 1, seg_type));
396
} else if (l == 1) { // possible a new segment
397
// Index of the character just before the one that was appended.
int last_idx = m_pystr.size() - 2;
398
if (last_idx >= 0 && (m_pystr[last_idx] & 0x80)) {
382
399
// check if the last syllable character's highest bitmask is set
383
400
// e.g., feN, so [feN] + g -> [feng]
384
401
// Clear the marker bit and match again with the restored character.
m_pystr[last_idx] &= 0x7f;
386
int v = m_pytrie.match_longest (m_pystr.rbegin(), m_pystr.rend(), l);
403
int v = m_pytrie.match_longest(m_pystr.rbegin(), m_pystr.rend(), l);
388
405
TSegment &last_seg = m_segs.back();
389
if (l == last_seg.m_len + 1) {
406
if (l == (unsigned) last_seg.m_len + 1) {
390
407
last_seg.m_len += 1;
391
408
last_seg.m_syllables[0] = v;
392
409
ret = m_pystr.size() - l;
400
417
// push the new 1-length segment
401
ret = m_pystr.size () - 1;
402
m_segs.push_back (TSegment (v, ret, 1));
405
else if (l == m_segs.back().m_len + 1) { // current segment is extensible, e.g., [xia] + n -> [xian]
406
TSegment &last_seg = m_segs.back ();
418
ret = m_pystr.size() - 1;
419
m_segs.push_back(TSegment(v, ret, 1));
420
} else if (l == (unsigned) m_segs.back().m_len + 1) {
421
// current segment is extensible, e.g., [xia] + n -> [xian]
422
TSegment &last_seg = m_segs.back();
407
423
last_seg.m_len += 1;
408
424
last_seg.m_syllables[0] = v;
409
425
ret = m_pystr.size() - l;
412
else { // other cases
413
TSegment &last_seg = m_segs.back ();
426
} else { // other cases
427
TSegment &last_seg = m_segs.back();
414
428
// isum starts as the last segment's length plus the new character; lsum
// starts as the matched length. new_segs begins with the fresh match.
int i = 0, isum = last_seg.m_len + 1, lsum = l;
415
TSegmentVec new_segs(1, TSegment(v, m_pystr.size()-l, l));
429
TSegmentVec new_segs(1, TSegment(v, m_pystr.size() - l, l));
417
// e.g., [zh] [o] [n] + g -> [zhonG],
431
// e.g., [zh] [o] [n] + g -> [zhonG],
418
432
if (isum < lsum) {
419
unsigned end_idx = m_pystr.size () - 1;
433
unsigned end_idx = m_pystr.size() - 1;
420
434
// Set the high bit on the last character of m_pystr; the l == 1 branch
// above tests and clears this bit on a later call.
m_pystr[end_idx] |= 0x80;
423
437
// Loop until the two running sums agree.
// NOTE(review): the branch that advances isum/i is not visible here.
while (isum != lsum) {
424
438
if (lsum < isum) { // e.g., [die] + r -> [di] [er]
425
v = m_pytrie.match_longest (m_pystr.rbegin()+lsum, m_pystr.rend(), l);
426
TSegment &last_seg = new_segs.back ();
427
new_segs.push_back (TSegment(v, last_seg.m_start-l, l));
428
_addFuzzySyllables (new_segs.back ());
439
// Match again, skipping the lsum characters already covered, and queue
// the new segment immediately before the previously queued one.
v = m_pytrie.match_longest(
440
m_pystr.rbegin() + lsum, m_pystr.rend(), l);
441
TSegment &last_seg = new_segs.back();
442
new_segs.push_back(TSegment(v, last_seg.m_start - l, l));
443
_addFuzzySyllables(new_segs.back());
436
m_segs.erase (m_segs.end()-(i+1), m_segs.end());
437
std::copy (new_segs.rbegin(), new_segs.rend(), back_inserter (m_segs));
438
ret = m_pystr.size()-lsum;
451
// Replace the last i + 1 segments with the rebuilt ones. new_segs was filled
// from the end of the string backwards, so it is appended in reverse.
m_segs.erase(m_segs.end() - (i + 1), m_segs.end());
452
std::copy(new_segs.rbegin(), new_segs.rend(), back_inserter(m_segs));
453
ret = m_pystr.size() - lsum;
443
458
if (m_pGetFuzzySegmentsOp && m_pGetFuzzySegmentsOp->isEnabled())
444
ret = std::min (ret, (*m_pGetFuzzySegmentsOp) (m_segs, m_fuzzy_segs, m_inputBuf));
461
(*m_pGetFuzzySegmentsOp)(m_segs, m_fuzzy_segs, m_inputBuf));
446
463
if (m_pGetFuzzySyllablesOp && m_pGetFuzzySyllablesOp->isEnabled()) {
448
464
if (m_segs.back().m_type == SYLLABLE)
449
_addFuzzySyllables (m_segs.back ());
465
_addFuzzySyllables(m_segs.back());
451
467
// The last two entries of m_fuzzy_segs are refreshed whenever it is
// non-empty.
// NOTE(review): end() - 2 assumes at least two entries -- confirm.
if (m_fuzzy_segs.size()) {
452
_addFuzzySyllables (*(m_fuzzy_segs.end()-1));
453
_addFuzzySyllables (*(m_fuzzy_segs.end()-2));
468
_addFuzzySyllables(*(m_fuzzy_segs.end() - 1));
469
_addFuzzySyllables(*(m_fuzzy_segs.end() - 2));
461
// Rebuilds seg.m_fuzzy_syllables: asserts that seg is of type SYLLABLE, clears
// the list, then appends every entry of the set returned by
// *m_pGetFuzzySyllablesOp for seg.m_syllables.front().
// NOTE(review): m_pGetFuzzySyllablesOp is dereferenced without a null check in
// the visible lines; confirm that every caller guarantees it is set.
void CQuanpinSegmentor::_addFuzzySyllables (TSegment& seg)
477
CQuanpinSegmentor::_addFuzzySyllables(TSegment& seg)
463
assert (seg.m_type == SYLLABLE);
479
assert(seg.m_type == SYLLABLE);
465
481
seg.m_fuzzy_syllables.clear();
467
CSyllables fuzzy_set = (*m_pGetFuzzySyllablesOp) (seg.m_syllables.front());
468
CSyllables::const_iterator it = fuzzy_set.begin ();
469
CSyllables::const_iterator ite = fuzzy_set.end ();
483
CSyllables fuzzy_set = (*m_pGetFuzzySyllablesOp)(seg.m_syllables.front());
484
CSyllables::const_iterator it = fuzzy_set.begin();
485
CSyllables::const_iterator ite = fuzzy_set.end();
471
487
for (; it != ite; ++it)
472
seg.m_fuzzy_syllables.push_back (*it);
488
seg.m_fuzzy_syllables.push_back(*it);
475
// Feeds every character of new_pystr, with bit 7 cleared (& 0x7f), through
// _push and returns the smallest value seen among `from` and the values
// returned by those _push calls.
// NOTE(review): this view is a diff dump; callers elsewhere invoke this with a
// single argument, so `from` presumably has a default declared outside this
// view.
unsigned CQuanpinSegmentor::_updateWith (const std::string& new_pystr, unsigned from)
492
CQuanpinSegmentor::_updateWith(const std::string& new_pystr, unsigned from)
477
494
unsigned minUpdatedFrom = from;
478
495
std::string::const_iterator it = new_pystr.begin();
479
496
for (; it != new_pystr.end(); ++it) {
480
497
unsigned updatedFrom = _push(*it & 0x7f);
482
499
if (updatedFrom < minUpdatedFrom) minUpdatedFrom = updatedFrom;
484
501
return minUpdatedFrom;