2
2
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
4
4
* Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
6
6
* The contents of this file are subject to the terms of either the GNU Lesser
7
7
* General Public License Version 2.1 only ("LGPL") or the Common Development and
8
8
* Distribution License ("CDDL")(collectively, the "License"). You may not use this
9
9
* file except in compliance with the License. You can obtain a copy of the CDDL at
10
10
* http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
11
* http://www.opensource.org/licenses/lgpl-license.php. See the License for the
11
* http://www.opensource.org/licenses/lgpl-license.php. See the License for the
12
12
* specific language governing permissions and limitations under the License. When
13
13
* distributing the software, include this License Header Notice in each file and
14
14
* include the full text of the License in the License file as well as the
15
15
* following notice:
17
17
* NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
19
19
* For Covered Software in this distribution, this License shall be governed by the
21
21
* Any litigation relating to this License shall be subject to the jurisdiction of
22
22
* the Federal Courts of the Northern District of California and the state courts
23
23
* of the State of California, with venue lying in Santa Clara County, California.
27
27
* If you wish your version of this file to be governed by only the CDDL or only
28
28
* the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
29
29
* include this software in this distribution under the [CDDL or LGPL Version 2.1]
32
32
* Version 2.1, or to extend the choice of license to its licensees as provided
33
33
* above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
34
34
* Version 2 license, then the option applies only if the new code is made subject
35
* to such option by the copyright holder.
35
* to such option by the copyright holder.
38
38
#ifndef SUNPY_LATTICE_STATES_H
55
55
* StateNode from language model implemetation to replace this
58
typedef CThreadSlm::TState CSlmState;
58
typedef CThreadSlm::TState CSlmState;
61
61
* A WordKey could represent a word. Define this use the unsigned int
62
62
* directly. Because in the future, we may adopt word class, such as
63
63
* Digital Word Class.
65
typedef unsigned CWordId;
65
typedef unsigned CWordId;
68
68
* This class is used to record lexicon state (pinyin trie nodes)
74
74
typedef std::vector<CPinyinTrie::TWordIdInfo> TWordIdInfoVec;
76
76
const CPinyinTrie::TNode *m_pPYNode;
77
TWordIdInfoVec m_words;
78
CSyllables m_syls; // accumulated syllables, may contain fuzzy syllables
79
std::vector<unsigned> m_seg_path; // accumulated segments, may contain fuzzy segments
81
unsigned m_num_of_inner_fuzzies :14;
85
TLexiconState (unsigned start, const CPinyinTrie::TNode *pnode, CSyllables& syls, std::vector<unsigned>& seg_path, bool fuzzy=false):
86
m_start(start), m_pPYNode(pnode), m_syls(syls), m_seg_path(seg_path), m_bPinyin(true), m_bFuzzy(fuzzy), m_num_of_inner_fuzzies(0) {}
88
TLexiconState (unsigned start, TWordIdInfoVec &words, CSyllables &syls, std::vector<unsigned>& seg_path, bool fuzzy=false):
89
m_start(start), m_pPYNode(NULL), m_words(words), m_syls(syls), m_seg_path(seg_path), m_bPinyin(true), m_bFuzzy(fuzzy), m_num_of_inner_fuzzies(0) {}
91
TLexiconState (unsigned start, unsigned wid):
92
m_start(start), m_pPYNode(NULL), m_bPinyin(false)
94
m_words.push_back(wid);
95
m_seg_path.push_back (start);
96
m_seg_path.push_back (start+1);
99
const CPinyinTrie::TWordIdInfo *getWords (unsigned &num);
100
void print (std::string prefix) const;
77
TWordIdInfoVec m_words;
78
// accumulated syllables, may contain fuzzy syllables
80
// accumulated segments, may contain fuzzy segments
81
std::vector<unsigned> m_seg_path;
83
unsigned m_start : 16;
84
unsigned m_num_of_inner_fuzzies : 14;
88
TLexiconState (unsigned start,
89
const CPinyinTrie::TNode *pnode,
91
std::vector<unsigned>& seg_path,
93
: m_pPYNode(pnode), m_syls(syls), m_seg_path(seg_path), m_start(start),
94
m_num_of_inner_fuzzies(0), m_bFuzzy(fuzzy), m_bPinyin(true) {}
96
TLexiconState (unsigned start,
97
TWordIdInfoVec &words,
99
std::vector<unsigned>& seg_path,
101
: m_pPYNode(NULL), m_words(words), m_syls(syls), m_seg_path(seg_path),
102
m_start(start), m_num_of_inner_fuzzies(0), m_bFuzzy(fuzzy),
105
TLexiconState (unsigned start, unsigned wid)
106
: m_pPYNode(NULL), m_start(start), m_bPinyin(false) {
107
m_words.push_back(wid);
108
m_seg_path.push_back(start);
109
m_seg_path.push_back(start + 1);
112
const CPinyinTrie::TWordIdInfo *getWords(unsigned &num);
113
void print(std::string prefix) const;
112
125
* The basic static unit used in the lattice searching
114
127
struct TLatticeState {
115
TSentenceScore m_score;
117
TLexiconState *m_pLexiconState;
118
TLatticeState *m_pBackTraceNode;
119
CSlmState m_slmState;
120
CWordId m_backTraceWordId;
128
TSentenceScore m_score;
130
TLexiconState* m_pLexiconState;
131
TLatticeState* m_pBackTraceNode;
132
CSlmState m_slmState;
133
CWordId m_backTraceWordId;
122
135
TLatticeState(double score = -1.0,
124
137
TLexiconState* lxstPtr = NULL,
125
138
TLatticeState* btNodePtr = NULL,
126
CSlmState sk= CSlmState(),
139
CSlmState sk = CSlmState(),
127
140
CWordId wk = CWordId())
128
: m_score(score), m_frIdx(frIdx), m_pBackTraceNode(btNodePtr),
129
m_pLexiconState(lxstPtr), m_slmState(sk), m_backTraceWordId(wk) {}
141
: m_score(score), m_frIdx(frIdx), m_pLexiconState(lxstPtr),
142
m_pBackTraceNode(btNodePtr), m_slmState(sk), m_backTraceWordId(wk) {}
131
144
/** for debug printing... */
133
print(std::string prefix) const;
145
void print(std::string prefix) const;
147
bool operator<(const TLatticeState& rhs) const {
148
return m_score < rhs.m_score;
151
bool operator==(const TLatticeState& rhs) const {
152
return m_score == rhs.m_score;
155
bool operator>(const TLatticeState& rhs) const {
156
return !((*this) < rhs || (*this) == rhs);
136
160
typedef std::vector<TLatticeState> CLatticeStateVec;
163
* A container that keeps the top K elements.
165
class CTopLatticeStates {
166
std::vector<TLatticeState> m_heap;
169
CTopLatticeStates(size_t threshold) : m_threshold(threshold) {}
171
bool push(const TLatticeState& state);
174
const TLatticeState& top() const { return m_heap[0]; }
176
size_t size() const { return m_heap.size(); }
178
typedef std::vector<TLatticeState>::iterator iterator;
180
iterator begin() { return m_heap.begin(); }
181
iterator end() { return m_heap.end(); }
139
185
* All lattice node on a lattice frame. This class provide beam pruning
140
186
* while push_back, which means at most the best MAX states are reserved,
141
187
* ie, weak state will may be discard while new better state are inserted,
144
190
class CLatticeStates {
146
static const unsigned beam_width = 32;
149
/** just use the CLatticeStateVec's iterator */
150
typedef CLatticeStateVec::iterator iterator;
152
/** just use the CLatticeStateVec's iterator */
153
typedef CLatticeStateVec::const_iterator const_iterator;
155
typedef CLatticeStateVec::reference reference;
156
typedef CLatticeStateVec::const_reference const_reference;
157
typedef CLatticeStateVec::size_type size_type;
164
push_back(const TLatticeState& node);
167
/** return the first iterator of m_vec. */
170
{ return m_vec.size(); }
174
{ return m_vec.begin(); }
176
/** return the first iterator of m_vec. */
179
{ return m_vec.begin(); }
184
/** return the last iterator of m_vec. */
187
{ return m_vec.end(); }
189
/** return the last iterator of m_vec. */
192
{ return m_vec.end(); }
196
operator[] (size_type index)
197
{return m_vec[index];}
200
operator[] (size_type index) const
201
{return m_vec[index];}
205
bubbleUp(int idxInHeap);
208
ironDown(int idxInHeap);
211
std::vector<TLatticeState> m_vec;
212
std::vector<int> m_vecIdxInHeap;
213
std::map<CSlmState, int> m_map;
214
std::vector<int> m_heap;
192
static const unsigned beam_width;
193
static const TSentenceScore filter_ratio_l1;
194
static const TSentenceScore filter_ratio_l2;
195
static const TSentenceScore filter_threshold_exp;
198
CLatticeStates() : m_size(0), m_maxBest(2) {}
200
void setMaxBest(size_t maxBest) { m_maxBest = maxBest; }
203
void add(const TLatticeState& state);
205
std::vector<TLatticeState> getSortedResult();
206
std::vector<TLatticeState> getFilteredResult();
208
typedef std::map<CSlmState, CTopLatticeStates> state_map;
210
friend class CLatticeStates;
211
state_map::iterator m_mainIt;
212
state_map::iterator m_mainEnd;
213
CTopLatticeStates::iterator m_childIt;
215
iterator(state_map::iterator mit, state_map::iterator mend,
216
CTopLatticeStates::iterator cit)
217
: m_mainIt(mit), m_mainEnd(mend), m_childIt(cit) {}
222
bool operator!=(const CLatticeStates::iterator& rhs);
223
TLatticeState& operator*();
224
TLatticeState* operator->();
230
void _pushScoreHeap(TSentenceScore score, CSlmState slmState);
231
void _popScoreHeap();
232
void _refreshHeapIdx(int heapIdx);
233
void _adjustUp(int node);
234
void _adjustDown(int node);
237
state_map m_stateMap;
241
std::map<CSlmState, int> m_heapIdx;
242
std::vector<std::pair<TSentenceScore, CSlmState> > m_scoreHeap;