59
60
else if (ds < -32768.0)
61
unsigned cost = unsigned((ds+32768.0)*256.0);
62
unsigned cost = unsigned((ds + 32768.0) * 256.0);
62
63
anony.m_cost = cost;
65
66
TCandiRank::TCandiRank(bool user, bool best, unsigned len,
66
67
bool fromLattice, unsigned rank)
68
anony.m_user = (user)?0:1;
69
anony.m_best = (best)?0:1;
70
anony.m_len = (len > 31)?(0):(31-len);
71
anony.m_lattice = (fromLattice)?0:1;
69
anony.m_user = (user) ? 0 : 1;
70
anony.m_best = (best) ? 0 : 1;
71
anony.m_len = (len > 31) ? (0) : (31 - len);
72
anony.m_lattice = (fromLattice) ? 0 : 1;
72
73
anony.m_cost = rank;
75
void CLatticeFrame::print (std::string prefix)
77
CLatticeFrame::print(std::string prefix)
77
if (m_bwType & BESTWORD) printf ("B");
78
if (m_bwType & USER_SELECTED) printf ("U");
79
if (m_bwType & BESTWORD) printf("B");
80
if (m_bwType & USER_SELECTED) printf("U");
82
printf (" Lexicon States:\n");
83
for_each (m_lexiconStates.begin (), m_lexiconStates.end (),
84
bind2nd (mem_fun_ref (&TLexiconState::print), prefix));
84
printf(" Lexicon States:\n");
85
for_each(m_lexiconStates.begin(), m_lexiconStates.end(),
86
bind2nd(mem_fun_ref(&TLexiconState::print), prefix));
86
printf (" Lattice States:\n");
87
for_each (m_latticeStates.begin (), m_latticeStates.end (),
88
bind2nd (mem_fun_ref (&TLatticeState::print), prefix));
88
printf(" Lattice States:\n");
89
for_each(m_latticeStates.begin(), m_latticeStates.end(),
90
bind2nd(mem_fun_ref(&TLatticeState::print), prefix));
92
void CIMIContext::printLattice ()
95
CIMIContext::printLattice()
94
97
std::string prefix;
96
for (int i=0; i<=m_tailIdx; ++i) {
99
for (size_t i = 0; i <= m_tailIdx; ++i) {
97
100
if (m_lattice[i].m_type == CLatticeFrame::UNUSED)
100
printf ("Lattice Frame [%d]:", i);
101
m_lattice[i].print (prefix);
103
printf("Lattice Frame [%lu]:", i);
104
m_lattice[i].print(prefix);
105
CIMIContext::CIMIContext ()
106
: m_tailIdx(1), m_pModel(NULL), m_pPinyinTrie(NULL), m_pUserDict(NULL), m_pHistory(NULL),
107
m_historyPower(5), m_bFullSymbolForwarding(false), m_pGetFullSymbolOp(NULL),
108
m_bFullPunctForwarding(true), m_pGetFullPunctOp(NULL), m_bDynaCandiOrder(true),
109
m_candiStarts(0), m_candiEnds(0), m_csLevel(0), m_bNonCompleteSyllable(true),
110
m_pPySegmentor(0), m_bOmitPunct(false)
108
CIMIContext::CIMIContext()
109
: m_tailIdx(1), m_nBest(0), m_maxBest(1), m_maxTailCandidateNum(0),
110
m_pModel(NULL), m_pPinyinTrie(NULL), m_pUserDict(NULL), m_pHistory(NULL),
111
m_historyPower(5), m_csLevel(0), m_bFullSymbolForwarding(false),
112
m_bOmitPunct(false), m_pGetFullSymbolOp(NULL),
113
m_bFullPunctForwarding(true), m_pGetFullPunctOp(NULL),
114
m_pPySegmentor(NULL), m_bNonCompleteSyllable(true),
115
m_bDynaCandiOrder(true), m_candiStarts(0), m_candiEnds(0)
112
m_lattice.resize (MAX_LATTICE_LENGTH);
113
m_lattice[0].m_latticeStates.push_back (TLatticeState (-1.0, 0));
117
m_lattice.resize(MAX_LATTICE_LENGTH);
118
m_lattice[0].m_latticeStates.add(TLatticeState(-1.0, 0));
119
setMaxBest(m_maxBest);
116
void CIMIContext::setCoreData (CIMIData *pCoreData)
123
CIMIContext::setCoreData(CIMIData *pCoreData)
118
125
m_pModel = pCoreData->getSlm();
119
126
m_pPinyinTrie = pCoreData->getPinyinTrie();
122
void CIMIContext::clear ()
127
135
m_candiStarts = m_candiEnds = 0;
130
void CIMIContext::_clearFrom (unsigned idx)
139
CIMIContext::_clearFrom(unsigned idx)
132
for (int i=idx; i<m_tailIdx+1; ++i)
141
for (size_t i = idx; i < m_tailIdx + 1; i++)
133
142
m_lattice[i].clear();
136
bool CIMIContext::buildLattice (IPySegmentor *segmentor, bool doSearch)
146
CIMIContext::buildLattice(IPySegmentor *segmentor, bool doSearch)
138
148
m_pPySegmentor = segmentor;
139
return _buildLattice (segmentor->getSegments(), segmentor->updatedFrom()+1, doSearch);
149
return _buildLattice(segmentor->getSegments(),
150
segmentor->updatedFrom() + 1, doSearch);
142
bool CIMIContext::_buildLattice (IPySegmentor::TSegmentVec &segments, unsigned rebuildFrom, bool doSearch)
154
CIMIContext::_buildLattice(IPySegmentor::TSegmentVec &segments,
155
unsigned rebuildFrom,
144
_clearFrom (rebuildFrom);
146
IPySegmentor::TSegmentVec::const_iterator it = segments.begin ();
147
IPySegmentor::TSegmentVec::const_iterator ite = segments.end ();
158
_clearFrom(rebuildFrom);
160
IPySegmentor::TSegmentVec::const_iterator it = segments.begin();
161
IPySegmentor::TSegmentVec::const_iterator ite = segments.end();
150
164
for (; it != ite; ++it) {
152
166
j = i + it->m_len;
154
if (i < rebuildFrom-1)
168
if (i < rebuildFrom - 1)
157
if (j >= m_lattice.capacity()-1)
171
if (j >= m_lattice.capacity() - 1)
160
174
if (it->m_type == IPySegmentor::SYLLABLE)
161
_forwardSyllables (i, j, *it);
175
_forwardSyllables(i, j, *it);
162
176
else if (it->m_type == IPySegmentor::SYLLABLE_SEP)
163
_forwardSyllableSep (i, j);
177
_forwardSyllableSep(i, j);
165
_forwardString (i, j, it->m_syllables);
179
_forwardString(i, j, it->m_syllables);
166
180
m_bOmitPunct = false;
169
_forwardTail (j, j+1);
183
_forwardTail(j, j + 1);
172
return doSearch && searchFrom (rebuildFrom);
186
return doSearch && searchFrom(rebuildFrom);
175
void CIMIContext::_forwardSyllables (unsigned i, unsigned j, const IPySegmentor::TSegment& seg)
190
CIMIContext::_forwardSyllables(unsigned i,
192
const IPySegmentor::TSegment& seg)
177
std::vector<unsigned>::const_iterator it = seg.m_syllables.begin ();
178
std::vector<unsigned>::const_iterator ite = seg.m_syllables.end ();
180
for (; it != ite; ++it)
181
_forwardSingleSyllable (i, j, *it, seg);
183
it = seg.m_fuzzy_syllables.begin ();
184
ite = seg.m_fuzzy_syllables.end ();
186
for (; it != ite; ++it)
187
_forwardSingleSyllable (i, j, *it, seg, true);
194
std::vector<unsigned>::const_iterator it = seg.m_syllables.begin();
195
std::vector<unsigned>::const_iterator ite = seg.m_syllables.end();
197
for (; it != ite; ++it)
198
_forwardSingleSyllable(i, j, *it, seg);
200
it = seg.m_fuzzy_syllables.begin();
201
ite = seg.m_fuzzy_syllables.end();
203
for (; it != ite; ++it)
204
_forwardSingleSyllable(i, j, *it, seg, true);
191
void CIMIContext::_forwardString (unsigned i, unsigned j, const std::vector<unsigned>& strbuf)
209
CIMIContext::_forwardString(unsigned i,
211
const std::vector<unsigned>& strbuf)
193
213
if (strbuf.size() == 1) {
194
214
unsigned ch = strbuf[0];
195
ispunct(ch)? _forwardPunctChar (i, j, ch): _forwardOrdinaryChar (i, j, ch);
216
_forwardPunctChar(i, j, ch);
218
_forwardOrdinaryChar(i, j, ch);
197
221
CLatticeFrame &fr = m_lattice[j];
198
fr.m_wstr.assign (strbuf.begin(), strbuf.end());
199
fr.m_lexiconStates.push_back (TLexiconState(i, 0));
222
fr.m_wstr.assign(strbuf.begin(), strbuf.end());
223
fr.m_lexiconStates.push_back(TLexiconState(i, 0));
203
void CIMIContext::_forwardSingleSyllable (unsigned i, unsigned j, TSyllable syllable, const IPySegmentor::TSegment& seg, bool fuzzy)
228
CIMIContext::_forwardSingleSyllable(unsigned i,
231
const IPySegmentor::TSegment& seg,
205
234
const CPinyinTrie::TNode * pn = NULL;
207
236
CLatticeFrame &fr = m_lattice[j];
208
237
fr.m_type = CLatticeFrame::SYLLABLE;
210
CLexiconStates::iterator it = m_lattice[i].m_lexiconStates.begin ();
211
CLexiconStates::iterator ite = m_lattice[i].m_lexiconStates.end ();
239
CLexiconStates::iterator it = m_lattice[i].m_lexiconStates.begin();
240
CLexiconStates::iterator ite = m_lattice[i].m_lexiconStates.end();
212
241
for (; it != ite; ++it) {
213
242
TLexiconState &lxst = *it;
214
243
bool added_from_sysdict = false;
216
245
if (lxst.m_pPYNode) {
217
246
// try to match a word from lattice i to lattice j
218
247
// and if match, we'll count it as a new lexicon on lattice j
219
pn = m_pPinyinTrie->transfer (lxst.m_pPYNode, syllable);
248
pn = m_pPinyinTrie->transfer(lxst.m_pPYNode, syllable);
221
250
added_from_sysdict = true;
222
TLexiconState new_lxst = TLexiconState (lxst.m_start, pn, lxst.m_syls, lxst.m_seg_path, fuzzy);
223
new_lxst.m_syls.push_back (syllable);
224
new_lxst.m_num_of_inner_fuzzies = lxst.m_num_of_inner_fuzzies + (seg.m_inner_fuzzy? 1: 0);
225
new_lxst.m_seg_path.push_back (seg.m_start+seg.m_len);
226
fr.m_lexiconStates.push_back (new_lxst);
251
TLexiconState new_lxst = TLexiconState(lxst.m_start,
256
new_lxst.m_syls.push_back(syllable);
257
new_lxst.m_num_of_inner_fuzzies = lxst.m_num_of_inner_fuzzies +
258
(seg.m_inner_fuzzy ? 1 : 0);
259
new_lxst.m_seg_path.push_back(seg.m_start + seg.m_len);
260
fr.m_lexiconStates.push_back(new_lxst);
230
264
if (m_pUserDict && lxst.m_syls.size() < MAX_USRDEF_WORD_LEN) {
231
265
// try to match a word from user dict
232
266
CSyllables syls = lxst.m_syls;
233
syls.push_back (syllable);
267
syls.push_back(syllable);
234
268
std::vector<CPinyinTrie::TWordIdInfo> words;
235
m_pUserDict->getWords (syls, words);
269
m_pUserDict->getWords(syls, words);
236
270
if (!words.empty() || !added_from_sysdict) {
237
271
// even if the words is empty we'll add a fake lexicon
238
272
// here. This helps _saveUserDict detect new words.
239
TLexiconState new_lxst = TLexiconState (lxst.m_start, words, lxst.m_syls, lxst.m_seg_path, fuzzy);
240
new_lxst.m_syls.push_back (syllable);
241
new_lxst.m_num_of_inner_fuzzies = lxst.m_num_of_inner_fuzzies + (seg.m_inner_fuzzy? 1: 0);
242
new_lxst.m_seg_path.push_back (seg.m_start+seg.m_len);
243
fr.m_lexiconStates.push_back (new_lxst);
273
TLexiconState new_lxst = TLexiconState(lxst.m_start,
278
new_lxst.m_syls.push_back(syllable);
279
new_lxst.m_num_of_inner_fuzzies = lxst.m_num_of_inner_fuzzies +
280
(seg.m_inner_fuzzy ? 1 : 0);
281
new_lxst.m_seg_path.push_back(seg.m_start + seg.m_len);
282
fr.m_lexiconStates.push_back(new_lxst);
248
287
// last, create a lexicon for single character with only one syllable
249
pn = m_pPinyinTrie->transfer (syllable);
288
pn = m_pPinyinTrie->transfer(syllable);
252
syls.push_back (syllable);
291
syls.push_back(syllable);
253
292
std::vector<unsigned> seg_path;
254
seg_path.push_back (seg.m_start);
255
seg_path.push_back (seg.m_start+seg.m_len);
256
TLexiconState new_lxst = TLexiconState (i, pn, syls, seg_path, fuzzy);
257
new_lxst.m_num_of_inner_fuzzies = seg.m_inner_fuzzy? 1: 0;
258
fr.m_lexiconStates.push_back (new_lxst);
293
seg_path.push_back(seg.m_start);
294
seg_path.push_back(seg.m_start + seg.m_len);
295
TLexiconState new_lxst = TLexiconState(i, pn, syls, seg_path, fuzzy);
296
new_lxst.m_num_of_inner_fuzzies = seg.m_inner_fuzzy ? 1 : 0;
297
fr.m_lexiconStates.push_back(new_lxst);
262
void CIMIContext::_forwardSyllableSep (unsigned i, unsigned j)
302
CIMIContext::_forwardSyllableSep(unsigned i, unsigned j)
264
304
CLatticeFrame &fr = m_lattice[j];
265
305
fr.m_type = CLatticeFrame::SYLLABLE | CLatticeFrame::SYLLABLE_SEP;
266
306
fr.m_lexiconStates = m_lattice[i].m_lexiconStates;
268
CLexiconStates::iterator it = fr.m_lexiconStates.begin();
308
CLexiconStates::iterator it = fr.m_lexiconStates.begin();
269
309
CLexiconStates::iterator ite = fr.m_lexiconStates.end();
270
310
for (; it != ite; ++it) {
271
311
it->m_seg_path.back() = j;
275
void CIMIContext::_forwardPunctChar (unsigned i, unsigned j, unsigned ch)
316
CIMIContext::_forwardPunctChar(unsigned i, unsigned j, unsigned ch)
277
318
CLatticeFrame &fr = m_lattice[j];
280
321
unsigned wid = 0;
282
323
if (m_pGetFullPunctOp) {
283
324
if (m_bFullPunctForwarding && !m_bOmitPunct) {
284
wstr = (*m_pGetFullPunctOp) (ch);
285
wid = m_pPinyinTrie->getSymbolId (wstr);
325
wstr = (*m_pGetFullPunctOp)(ch);
326
wid = m_pPinyinTrie->getSymbolId(wstr);
291
332
if (!wstr.empty())
292
333
fr.m_wstr = wstr;
294
fr.m_wstr.push_back (ch);
335
fr.m_wstr.push_back(ch);
296
fr.m_lexiconStates.push_back (TLexiconState(i, wid));
337
fr.m_lexiconStates.push_back(TLexiconState(i, wid));
299
void CIMIContext::_forwardOrdinaryChar (unsigned i, unsigned j, unsigned ch)
341
CIMIContext::_forwardOrdinaryChar(unsigned i, unsigned j, unsigned ch)
301
343
CLatticeFrame &fr = m_lattice[j];
304
346
unsigned wid = 0;
306
348
if (m_pGetFullSymbolOp) {
307
wstr = (*m_pGetFullSymbolOp) (ch);
308
wid = m_pPinyinTrie->getSymbolId (wstr);
349
wstr = (*m_pGetFullSymbolOp)(ch);
350
wid = m_pPinyinTrie->getSymbolId(wstr);
310
352
if (!m_bFullSymbolForwarding)
314
fr.m_type = wid? CLatticeFrame::SYMBOL: CLatticeFrame::ASCII;
356
fr.m_type = wid ? CLatticeFrame::SYMBOL : CLatticeFrame::ASCII;
317
359
fr.m_wstr = wstr;
319
fr.m_wstr.push_back (ch);
361
fr.m_wstr.push_back(ch);
321
fr.m_lexiconStates.push_back (TLexiconState(i, wid));
363
fr.m_lexiconStates.push_back(TLexiconState(i, wid));
324
void CIMIContext::_forwardTail (unsigned i, unsigned j)
367
CIMIContext::_forwardTail(unsigned i, unsigned j)
326
369
CLatticeFrame &fr = m_lattice[j];
327
370
fr.m_type = CLatticeFrame::TAIL;
329
fr.m_lexiconStates.push_back (TLexiconState (i, ENDING_WORD_ID));
372
fr.m_lexiconStates.push_back(TLexiconState(i, ENDING_WORD_ID));
332
bool CIMIContext::searchFrom (unsigned idx)
375
// Lookup table of powers of two: entry k is initialized with exp2(k),
// for k = 0 .. 31. searchFrom() indexes it with the negated word cost
// (exp2_tbl[-(words[i].m_cost)]) to scale the initial score of a word.
// NOTE(review): this assumes -(m_cost) always falls within [0, 31];
// confirm against the definition of TWordIdInfo::m_cost.
double exp2_tbl[32] = {exp2(0), exp2(1), exp2(2), exp2(3), exp2(4), exp2(5), exp2(6), exp2(7),
376
exp2(8), exp2(9), exp2(10), exp2(11), exp2(12), exp2(13), exp2(14), exp2(15),
377
exp2(16), exp2(17), exp2(18), exp2(19), exp2(20), exp2(21), exp2(22), exp2(23),
378
exp2(24), exp2(25), exp2(26), exp2(27), exp2(28), exp2(29), exp2(30), exp2(31),};
381
CIMIContext::searchFrom(unsigned idx)
334
383
bool affectCandidates = (idx <= m_candiEnds);
338
for (; idx<=m_tailIdx; ++idx) {
385
for (; idx <= m_tailIdx; ++idx) {
339
386
CLatticeFrame &fr = m_lattice[idx];
341
388
if (fr.m_type == CLatticeFrame::UNUSED)
344
fr.m_latticeStates.clear ();
391
fr.m_latticeStates.clear();
346
393
/* user selected word might be cut in next step */
347
if (fr.m_bwType & CLatticeFrame::USER_SELECTED)
348
_transferBetween (fr.m_bestWord.m_start, idx, fr.m_bestWord.m_pLexiconState, fr.m_bestWord.m_wordId);
394
if (fr.m_bwType & CLatticeFrame::USER_SELECTED) {
395
_transferBetween(fr.m_selWord.m_start, idx,
396
fr.m_selWord.m_pLexiconState,
397
fr.m_selWord.m_wordId);
350
CLexiconStates::iterator it = fr.m_lexiconStates.begin ();
351
CLexiconStates::iterator ite = fr.m_lexiconStates.end ();
400
CLexiconStates::iterator it = fr.m_lexiconStates.begin();
401
CLexiconStates::iterator ite = fr.m_lexiconStates.end();
352
402
for (; it != ite; ++it) {
353
403
unsigned word_num = 0;
354
404
TLexiconState &lxst = *it;
355
const CPinyinTrie::TWordIdInfo *words = lxst.getWords (word_num);
405
const CPinyinTrie::TWordIdInfo *words = lxst.getWords(word_num);
360
410
if (lxst.m_start == m_candiStarts && idx > m_candiEnds)
361
411
affectCandidates = true;
363
// only select the words with higher unigram probabilities, and
413
// only select the words with higher unigram probabilities, and
364
414
// narrow the search depth and lower the initial score for fuzzy
366
int maxsz = it->m_bFuzzy? MAX_LEXICON_TRIES/2: MAX_LEXICON_TRIES;
367
double ic = it->m_bFuzzy? 0.5: 1.0;
369
int sz = word_num<maxsz? word_num: maxsz;
416
int maxsz = it->m_bFuzzy ? MAX_LEXICON_TRIES /
417
2 : MAX_LEXICON_TRIES;
419
double ic = it->m_bFuzzy ? 0.5 : 1.0;
421
int sz = (int) word_num < maxsz ? (int) word_num : maxsz;
370
422
int i = 0, count = 0;
371
for (i = 0; count < sz && i < sz && (words[i].m_bSeen || count < 2); ++i) {
424
while (count < sz && i < sz && (words[i].m_bSeen || count < 2)) {
372
425
if (m_csLevel >= words[i].m_csLevel) {
373
_transferBetween (lxst.m_start, idx, &lxst, words[i].m_id, ic);
426
_transferBetween(lxst.m_start, idx, &lxst, words[i].m_id,
427
ic * exp2_tbl[-(words[i].m_cost)]);
378
433
/* try extra words in history cache */
379
434
if (m_pHistory) {
380
for (; i < word_num; ++i) {
381
if (m_csLevel >= words[i].m_csLevel && m_pHistory->seenBefore (words[i].m_id))
382
_transferBetween (lxst.m_start, idx, &lxst, words[i].m_id);
435
while (i < (int) word_num) {
436
if (m_csLevel >= words[i].m_csLevel
437
&& m_pHistory->seenBefore(words[i].m_id))
438
_transferBetween(lxst.m_start, idx, &lxst,
440
ic * exp2_tbl[-(words[i].m_cost)]);
388
_backTraceBestPaths ();
452
std::vector<TLatticeState> tail_states =
453
m_lattice[m_tailIdx].m_latticeStates.getFilteredResult();
456
for (int i = 0; i < tail_states.size(); i++) {
458
tail_states[i].m_score.toString(score);
459
printf("score[%d]: %s\n", i, score.c_str());
463
for (size_t i = 0; i < m_maxBest; i++) {
465
if (_backTracePaths(tail_states, m_nBest, path, segpath)) {
466
m_path.push_back(path);
467
m_segPath.push_back(segpath);
472
if (m_pPySegmentor && m_nBest > 0 && !m_segPath[0].empty())
473
m_pPySegmentor->notify_best_segpath(m_segPath[0]);
390
475
return affectCandidates;
393
void CIMIContext::_transferBetween (unsigned start, unsigned end, TLexiconState* plxst, unsigned wid, double ic)
479
CIMIContext::_transferBetween(unsigned start, unsigned end,
480
TLexiconState* plxst, unsigned wid,
395
483
CLatticeFrame &start_fr = m_lattice[start];
396
CLatticeFrame &end_fr = m_lattice[end];
398
TLatticeState node (-1.0, end, plxst);
399
TSentenceScore efic (ic);
401
if ((end_fr.m_bwType & CLatticeFrame::USER_SELECTED) && end_fr.m_bestWord.m_wordId == wid)
402
efic = TSentenceScore (30000, 1.0);
404
static double s_history_distribution[11] = {0.0, 0.05, 0.10, 0.15, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50};
484
CLatticeFrame &end_fr = m_lattice[end];
486
TLatticeState node(-1.0, end, plxst);
487
TSentenceScore efic(ic);
489
if ((end_fr.m_bwType & CLatticeFrame::USER_SELECTED)
490
&& end_fr.m_selWord.m_wordId == wid)
491
efic = TSentenceScore(30000, 1.0);
493
static double s_history_distribution[] = {
494
0.0, 0.05, 0.10, 0.15, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50
405
497
double weight_h = s_history_distribution[m_historyPower];
406
498
double weight_s = 1.0 - weight_h;
408
CLatticeStates::iterator it = start_fr.m_latticeStates.begin();
500
CLatticeStates::iterator it = start_fr.m_latticeStates.begin();
409
501
CLatticeStates::iterator ite = start_fr.m_latticeStates.end();
411
// for 1-length lattice states, replace ending_word_id (comma) with none_word_id (recognized by CThreadSlm)
412
if (wid == ENDING_WORD_ID && it != ite && it->m_pBackTraceNode && it->m_pBackTraceNode->m_frIdx == 0)
415
503
for (; it != ite; ++it) {
504
// for 1-length lattice states, replace ending_word_id (comma)
505
// with none_word_id (recognized by CThreadSlm)
507
if (wid == ENDING_WORD_ID && it->m_pBackTraceNode && it->m_pBackTraceNode->m_frIdx == 0)
416
510
node.m_pBackTraceNode = &(*it);
417
511
node.m_backTraceWordId = wid;
419
double ts = m_pModel->transfer(it->m_slmState, wid, node.m_slmState);
513
double ts = m_pModel->transfer(it->m_slmState, _wid, node.m_slmState);
420
514
m_pModel->historify(node.m_slmState);
422
// backward to pseudo root, so wid is probably a user word, save the wid in idx field,
423
// so that later we could get it via CThreadSlm::lastWordId, to calculate p_{cache} correctly.
424
if (node.m_slmState.getLevel() == 0 && m_pHistory && m_pHistory->seenBefore(wid))
516
// backward to pseudo root, so wid is probably a user word,
517
// save the wid in idx field, so that later we could get it via
518
// CThreadSlm::lastWordId, to calculate p_{cache} correctly.
519
if (node.m_slmState.getLevel() == 0
520
&& m_pHistory && m_pHistory->seenBefore(wid))
425
521
node.m_slmState.setIdx(wid); // an psuedo unigram node state
427
523
if (m_pHistory) {
428
unsigned history[2] = {m_pModel->lastWordId(it->m_slmState), wid};
429
double hpr = m_pHistory->pr(history, history+2);
430
ts = weight_s * ts + weight_h*hpr;
524
unsigned history[2] = { m_pModel->lastWordId(it->m_slmState), _wid };
525
double hpr = m_pHistory->pr(history, history + 2);
526
ts = weight_s * ts + weight_h * hpr;
433
529
node.m_score = it->m_score * efic * TSentenceScore(ts);
434
end_fr.m_latticeStates.push_back (node);
530
end_fr.m_latticeStates.add(node);
438
void CIMIContext::_backTraceBestPaths ()
535
CIMIContext::_backTracePaths(const std::vector<TLatticeState>& tail_states,
536
int rank, TPath& path, TPath& segmentPath)
440
CLatticeStates& tail_states = m_lattice[m_tailIdx].m_latticeStates;
442
// there must be some transfer errors
443
if (!tail_states.size())
446
TLatticeState *bs = &(tail_states[0]);
541
if (rank >= (int) tail_states.size()) {
542
// rank out of bounds, only return the segment path
546
const TLatticeState *bs = &(tail_states[rank]);
448
548
while (bs->m_pBackTraceNode) {
449
549
unsigned start = bs->m_pBackTraceNode->m_frIdx;
450
unsigned end = bs->m_frIdx;
550
unsigned end = bs->m_frIdx;
451
551
CLatticeFrame & end_fr = m_lattice[end];
453
if (! (end_fr.m_bwType & CLatticeFrame::USER_SELECTED)) {
553
if (!(end_fr.m_bwType & CLatticeFrame::USER_SELECTED)) {
554
const TWCHAR* cwstr = NULL;
555
if (end_fr.m_wstr.empty()) {
556
cwstr = _getWstr(bs->m_backTraceWordId);
558
cwstr = end_fr.m_wstr.c_str();
561
CCandidate candi(start, end, bs->m_pLexiconState, cwstr,
562
bs->m_backTraceWordId);
454
564
end_fr.m_bwType |= CLatticeFrame::BESTWORD;
456
end_fr.m_bestWord.m_start = start;
457
end_fr.m_bestWord.m_end = end;
458
end_fr.m_bestWord.m_pLexiconState = bs->m_pLexiconState;
459
end_fr.m_bestWord.m_wordId = bs->m_backTraceWordId;
460
end_fr.m_bestWord.m_cwstr = end_fr.m_wstr.empty()?
461
_getWstr (bs->m_backTraceWordId):
462
end_fr.m_wstr.c_str();
565
end_fr.m_bestWords[rank] = candi;
567
end_fr.m_selWord = candi; // select the first by default.
465
571
if (bs->m_pBackTraceNode->m_pLexiconState) {
466
std::vector<unsigned> seg_path = bs->m_pBackTraceNode->m_pLexiconState->m_seg_path;
467
std::vector<unsigned>::reverse_iterator it = seg_path.rbegin();
468
std::vector<unsigned>::reverse_iterator ite = seg_path.rend();
572
std::vector<unsigned> seg_path =
573
bs->m_pBackTraceNode->m_pLexiconState->m_seg_path;
574
std::vector<unsigned>::reverse_iterator it = seg_path.rbegin();
470
576
for (; it != seg_path.rend(); ++it) {
471
if (m_bestSegPath.empty() || m_bestSegPath.back() != *it)
472
m_bestSegPath.push_back (*it);
577
if (segmentPath.empty() || segmentPath.back() != *it)
578
segmentPath.push_back(*it);
476
m_bestPath.push_back (end);
477
583
bs = bs->m_pBackTraceNode;
480
std::reverse (m_bestPath.begin(), m_bestPath.end());
481
std::reverse (m_bestSegPath.begin(), m_bestSegPath.end());
484
m_pPySegmentor->notify_best_segpath (m_bestSegPath);
586
std::reverse(path.begin(), path.end());
587
std::reverse(segmentPath.begin(), segmentPath.end());
487
590
std::vector<unsigned>::iterator it;
489
printf ("best lattice path: ");
490
for (it = m_bestPath.begin(); it != m_bestPath.end(); ++it)
592
printf("trace lattice path[%d]: ", rank);
593
for (it = path.begin(); it != path.end(); ++it)
493
printf ("best segments path: ");
494
for (it = m_bestSegPath.begin(); it != m_bestSegPath.end(); ++it)
597
printf("trace segments path[%d]: ", rank);
598
for (it = segmentPath.begin(); it != segmentPath.end(); ++it)
501
void CIMIContext::_clearBestPaths ()
504
m_bestSegPath.clear ();
507
unsigned CIMIContext::getBestSentence (wstring& result, unsigned start, unsigned end)
511
if (UINT_MAX == end) end = m_tailIdx - 1;
513
while (end > start && m_lattice[end].m_bwType == CLatticeFrame::NO_BESTWORD)
516
unsigned i = end, nWordConverted = 0;
518
CLatticeFrame &fr = m_lattice[i];
519
result.insert (0, fr.m_bestWord.m_cwstr);
520
i = fr.m_bestWord.m_start;
524
return nWordConverted;
527
unsigned CIMIContext::getBestSentence (std::vector<unsigned>& result, unsigned start, unsigned end)
531
if (UINT_MAX == end) end = m_tailIdx - 1;
533
while (end > start && m_lattice[end].m_bwType == CLatticeFrame::NO_BESTWORD)
536
unsigned i = end, nWordConverted = 0;
538
CLatticeFrame &fr = m_lattice[i];
539
result.insert (result.begin(), fr.m_bestWord.m_wordId);
540
i = fr.m_bestWord.m_start;
544
return nWordConverted;
607
CIMIContext::_clearPaths()
613
std::vector<CCandidates>
614
CIMIContext::getBestSentenceTails(int rank, unsigned start, unsigned end)
616
std::vector<CCandidates> result;
621
CCandidates sentence;
622
unsigned word_num = getBestSentence(sentence, rank, start, end);
623
unsigned tail_word_num = word_num;
625
while (tail_word_num > 1) {
626
unsigned dec = tail_word_num / (m_maxTailCandidateNum + 1) + 1;
627
tail_word_num -= std::min(dec, tail_word_num);
628
if (tail_word_num <= 1) {
631
CCandidates tail(sentence.begin(), sentence.begin() + tail_word_num);
632
result.push_back(tail);
638
CIMIContext::getBestSentence(CCandidates& result, int rank,
639
unsigned start, unsigned end)
641
// -1 means selected sentence
642
if (rank < -1 || rank >= (int) m_nBest)
650
while (end > start && m_lattice[end].m_bwType == CLatticeFrame::NO_BESTWORD)
653
unsigned i = end, nWordConverted = 0;
655
CLatticeFrame& fr = m_lattice[i];
657
result.insert(result.begin(), fr.m_selWord);
658
i = fr.m_selWord.m_start;
660
result.insert(result.begin(), fr.m_bestWords[rank]);
661
i = fr.m_bestWords[rank].m_start;
665
return nWordConverted;
669
CIMIContext::getBestSentence(wstring& result, int rank,
670
unsigned start, unsigned end)
672
CCandidates sentence;
673
unsigned nWordConverted = getBestSentence(sentence, rank, start, end);
675
for (size_t i = 0; i < sentence.size(); i++) {
676
result += sentence[i].m_cwstr;
678
return nWordConverted;
682
CIMIContext::getBestSentence(std::vector<unsigned>& result, int rank,
683
unsigned start, unsigned end)
685
CCandidates sentence;
686
unsigned nWordConverted = getBestSentence(sentence, rank, start, end);
688
for (size_t i = 0; i < sentence.size(); i++) {
689
result.push_back(sentence[i].m_wordId);
691
return nWordConverted;
696
CIMIContext::getSelectedSentence(wstring& result,
697
unsigned start, unsigned end)
699
return getBestSentence(result, -1, start, end);
704
CIMIContext::getSelectedSentence(std::vector<unsigned>& result,
705
unsigned start, unsigned end)
707
return getBestSentence(result, -1, start, end);
547
710
struct TCandiPair {
551
TCandiPair() : m_candi(), m_Rank() { }
714
TCandiPair() : m_candi(), m_Rank()
554
719
struct TCandiPairPtr {
555
720
TCandiPair* m_Ptr;
557
TCandiPairPtr(TCandiPair* p=NULL) : m_Ptr(p)
722
TCandiPairPtr(TCandiPair* p = NULL) : m_Ptr(p)
561
operator< (const TCandiPairPtr& b) const
562
{ return m_Ptr->m_Rank < b.m_Ptr->m_Rank; }
727
operator<(const TCandiPairPtr& b) const
729
return m_Ptr->m_Rank < b.m_Ptr->m_Rank;
565
const TWCHAR *CIMIContext::_getWstr (unsigned wid)
734
CIMIContext::_getWstr(unsigned wid)
567
736
if (wid < m_pPinyinTrie->getWordCount())
568
737
return (*m_pPinyinTrie)[wid];
695
888
fr = m_lattice[frIdx];
698
if (fr.m_bwType & (CLatticeFrame::USER_SELECTED | CLatticeFrame::BESTWORD)) {
699
ret = fr.m_bestWord.m_start;
892
(CLatticeFrame::USER_SELECTED | CLatticeFrame::BESTWORD)) {
893
ret = fr.m_selWord.m_start;
700
894
fr.m_bwType = CLatticeFrame::NO_BESTWORD;
701
if (doSearch) searchFrom (frIdx);
895
if (doSearch) searchFrom(frIdx);
707
void CIMIContext::makeSelection (CCandidate &candi, bool doSearch)
902
CIMIContext::makeSelection(CCandidate &candi, bool doSearch)
709
904
CLatticeFrame &fr = m_lattice[candi.m_end];
710
905
fr.m_bwType = fr.m_bwType | CLatticeFrame::USER_SELECTED;
711
fr.m_bestWord = candi;
712
if (doSearch) searchFrom (candi.m_end);
715
void CIMIContext::memorize ()
718
_saveHistoryCache ();
721
void CIMIContext::_saveUserDict ()
906
fr.m_selWord = candi;
907
// make best sentence word consistent as well
908
for (size_t i = 0; i < m_nBest; i++) {
909
fr.m_bestWords[i] = candi;
912
if (doSearch) searchFrom(candi.m_end);
916
CIMIContext::selectSentence(int idx)
918
unsigned i = m_tailIdx - 1;
919
while (i > 0 && m_lattice[i].m_bwType == CLatticeFrame::NO_BESTWORD)
923
CLatticeFrame &fr = m_lattice[i];
924
fr.m_selWord = fr.m_bestWords[idx];
925
i = fr.m_selWord.m_start;
930
CIMIContext::memorize()
937
CIMIContext::_saveUserDict()
723
939
if (!m_pUserDict)
726
if (m_bestPath.empty())
731
943
bool has_user_selected = false;
732
std::vector<unsigned>::iterator it = m_bestPath.begin();
733
std::vector<unsigned>::iterator ite = m_bestPath.end();
734
for (; it != ite; ++it, ++s) {
735
CLatticeFrame &fr = m_lattice[*it];
736
if (!fr.isSyllableFrame ()) {
741
CSyllables &tmp = fr.m_bestWord.m_pLexiconState->m_syls;
742
if (syls.size() + tmp.size() > MAX_USRDEF_WORD_LEN) {
944
unsigned i = m_tailIdx - 1;
947
while (i > 0 && m_lattice[i].m_bwType == CLatticeFrame::NO_BESTWORD)
951
CLatticeFrame &fr = m_lattice[i];
952
if (!fr.isSyllableFrame()) {
953
i = fr.m_selWord.m_start;
957
TLexiconState* state = fr.m_selWord.m_pLexiconState;
959
i = fr.m_selWord.m_start;
963
if (syls.size() + state->m_syls.size() > MAX_USRDEF_WORD_LEN) {
964
i = fr.m_selWord.m_start;
968
if (!e_pos) e_pos = i;
747
970
has_user_selected |= (fr.m_bwType & CLatticeFrame::USER_SELECTED);
748
std::copy (tmp.begin(), tmp.end(), back_inserter(syls));
971
std::copy(state->m_syls.begin(), state->m_syls.end(), inserter(syls, syls.begin()));
972
i = fr.m_selWord.m_start;
751
if (s >= 2 && has_user_selected && !syls.empty()) {
975
if (has_user_selected && syls.size() > 1) {
753
getBestSentence (phrase, 0, *it);
977
getSelectedSentence (phrase, 0, e_pos);
754
978
m_pUserDict->addWord (syls, phrase);
758
void CIMIContext::_saveHistoryCache ()
983
CIMIContext::_saveHistoryCache()
763
if (m_bestPath.empty())
766
988
std::vector<unsigned> result;
767
std::vector<unsigned>::const_iterator it = m_bestPath.begin();
768
std::vector<unsigned>::const_iterator ite = m_bestPath.end() - 1;
769
for (; it != ite; ++it) {
770
CLatticeFrame &fr = m_lattice[*it];
771
if (fr.isSyllableFrame ())
772
result.push_back (fr.m_bestWord.m_wordId);
774
result.push_back (0);
989
unsigned i = m_tailIdx - 1;
990
while (i > 0 && m_lattice[i].m_bwType == CLatticeFrame::NO_BESTWORD)
994
CLatticeFrame &fr = m_lattice[i];
995
if (fr.isSyllableFrame()) {
996
result.insert(result.begin(), fr.m_selWord.m_wordId);
998
result.insert(result.begin(), 0);
1000
i = fr.m_selWord.m_start;
777
1003
if (!result.empty())
778
m_pHistory->memorize (&(result[0]), &(result[0]) + result.size());
1004
m_pHistory->memorize(&(result[0]), &(result[0]) + result.size());
782
void CIMIContext::deleteCandidate (CCandidate &candi)
1008
CIMIContext::deleteCandidate(CCandidate &candi)
784
1010
unsigned wid = candi.m_wordId;
786
1012
if (wid > INI_USRDEF_WID) {
787
m_pHistory->forget (wid);
788
m_pUserDict->removeWord (wid);
789
_buildLattice (m_pPySegmentor->getSegments(), candi.m_start+1);
1013
m_pHistory->forget(wid);
1014
m_pUserDict->removeWord(wid);
1015
_buildLattice(m_pPySegmentor->getSegments(), candi.m_start + 1);
793
void CIMIContext::removeFromHistoryCache (std::vector<unsigned>& wids)
1020
CIMIContext::removeFromHistoryCache(std::vector<unsigned>& wids)
795
1022
if (!m_pHistory)
798
m_pHistory->forget (&(wids[0]), &(wids[0]) + wids.size());
799
buildLattice (m_pPySegmentor);
1025
m_pHistory->forget(&(wids[0]), &(wids[0]) + wids.size());
1026
buildLattice(m_pPySegmentor);