2
2
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
4
4
* Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
6
6
* The contents of this file are subject to the terms of either the GNU Lesser
7
7
* General Public License Version 2.1 only ("LGPL") or the Common Development and
8
8
* Distribution License ("CDDL")(collectively, the "License"). You may not use this
9
9
* file except in compliance with the License. You can obtain a copy of the CDDL at
10
10
* http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
11
* http://www.opensource.org/licenses/lgpl-license.php. See the License for the
11
* http://www.opensource.org/licenses/lgpl-license.php. See the License for the
12
12
* specific language governing permissions and limitations under the License. When
13
13
* distributing the software, include this License Header Notice in each file and
14
14
* include the full text of the License in the License file as well as the
15
15
* following notice:
17
17
* NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
19
19
* For Covered Software in this distribution, this License shall be governed by the
84
84
fprintf(stderr, "\nUsage:\n");
85
fprintf(stderr, "slmseg -d dict_file [-f (text|bin)] [-i] [-s STOK_ID] [-m lm_file]\n\n");
87
"slmseg -d dict_file [-f (text|bin)] [-i] [-s STOK_ID] [-m lm_file]\n\n");
86
88
fprintf(stderr, " -f --format:\n");
87
fprintf(stderr, " Output Format, can be 'text' or 'bin'. default 'bin'\n");
88
fprintf(stderr, " Normally, in text mode, word text are output, while in binary mode,\n");
89
fprintf(stderr, " binary short integer of the word-ids are writed to stdout.\n");
90
" Output Format, can be 'text' or 'bin'. default 'bin'\n");
93
" Normally, in text mode, word text are output, while in binary mode,\n");
95
" binary short integer of the word-ids are writed to stdout.\n");
90
96
fprintf(stderr, " -s --stok:\n");
91
97
fprintf(stderr, " Sentence token id. Default 10.\n");
92
fprintf(stderr, " It will be write to output in binary mode after every sentence.\n");
100
" It will be write to output in binary mode after every sentence.\n");
93
101
fprintf(stderr, " -i --show-id:\n");
94
fprintf(stderr, " Show Id info. Under text output format mode, Attach id after known-words.\n");
104
" Show Id info. Under text output format mode, Attach id after known-words.\n");
95
105
fprintf(stderr, " Under binary mode, print id in text.\n");
96
106
fprintf(stderr, " -m --model:\n");
97
107
fprintf(stderr, " Language model file name");
98
108
fprintf(stderr, "\n");
99
109
fprintf(stderr, "Notes:\n");
100
fprintf(stderr, " Under binary mode, consecutive id of 0 are merged into one 0.\n");
101
fprintf(stderr, " Under text mode, no space are insert between unknown-words. \n");
111
" Under binary mode, consecutive id of 0 are merged into one 0.\n");
113
" Under text mode, no space are insert between unknown-words. \n");
102
114
fprintf(stderr, "\n");
103
115
fprintf(stderr, "\n");
152
output(int len, const TWCHAR* p, TSIMWordId idprev, TSIMWordId idcur, int& nWords)
154
171
static char mbword[1024];
155
172
static TWCHAR wcword[1024];
157
174
bool bRealGap = (idcur != SIM_ID_NOT_WORD || idprev != SIM_ID_NOT_WORD);
158
175
if (s_bTextOut) {
159
for (int i=0; i < len; ++i, ++p)
176
for (int i = 0; i < len; ++i, ++p)
162
179
WCSTOMBS(mbword, wcword, sizeof(mbword));
184
201
struct TLatticeWord {
189
TLatticeWord(int left=0, int right=0, int wid=0)
190
: m_left(left), m_right(right), m_wordId(wid) { }
206
TLatticeWord(int left = 0, int right = 0, int wid = 0)
207
: m_left(left), m_right(right), m_wordId(wid)
193
212
typedef std::vector<TLatticeWord> TLatticeWordVec;
195
214
struct TLatticeStateValue {
197
TLatticeWord* mp_btword;
198
CThreadSlm::TState m_btstate;
216
TLatticeWord* mp_btword;
217
CThreadSlm::TState m_btstate;
200
TLatticeStateValue(double pr=0.0, TLatticeWord* btword=NULL, CThreadSlm::TState btstate = CThreadSlm::TState())
201
: m_pr(pr), mp_btword(btword), m_btstate(btstate) { }
219
TLatticeStateValue(double pr = 0.0,
220
TLatticeWord* btword = NULL,
221
CThreadSlm::TState btstate = CThreadSlm::TState())
222
: m_pr(pr), mp_btword(btword), m_btstate(btstate)
204
227
typedef std::map<CThreadSlm::TState, TLatticeStateValue> TLatticeColumnStates;
206
229
struct TLatticeColumn {
207
TLatticeWordVec m_wordstarting;
208
TLatticeColumnStates m_states;
230
TLatticeWordVec m_wordstarting;
231
TLatticeColumnStates m_states;
211
234
typedef std::vector<TLatticeColumn> CLattice;
213
inline void insertLatticeWord(CLattice& lattice, TLatticeWord word)
237
insertLatticeWord(CLattice& lattice, TLatticeWord word)
215
239
lattice[word.m_left].m_wordstarting.push_back(word);
221
245
const CSIMDict::TState* pstate;
223
for (int i=1; (i<word_len) && *(p+i) != WCH_NULL; ++i) {
224
int len = s_dict->matchLongest(s_dict->getRoot(), pstate, p+i);
225
if (word_len < i+len)
247
for (int i = 1; (i < word_len) && *(p + i) != WCH_NULL; ++i) {
248
int len = s_dict->matchLongest(s_dict->getRoot(), pstate, p + i);
249
if (word_len < i + len)
232
void fullSegBuildLattice(wstring& sntnc, int left, int len, CLattice& lattice)
257
fullSegBuildLattice(wstring& sntnc, int left, int len, CLattice& lattice)
234
for (int right=left+len; left < right; ++left) {
259
for (int right = left + len; left < right; ++left) {
235
260
bool found = false;
237
const TWCHAR* p = sntnc.c_str()+left;
262
const TWCHAR* p = sntnc.c_str() + left;
238
263
const CSIMDict::TState* pds = s_dict->getRoot();
239
for (len = 0; left+len < right; ++len) {
264
for (len = 0; left + len < right; ++len) {
240
265
if ((pds = s_dict->step(pds, *p++)) == NULL)
242
267
if (pds->word_id != SIM_ID_NOT_WORD) {
244
insertLatticeWord(lattice, TLatticeWord(left, left+len+1, pds->word_id));
269
insertLatticeWord(lattice,
270
TLatticeWord(left, left + len + 1,
248
insertLatticeWord(lattice, TLatticeWord(left, left+1, SIM_ID_NOT_WORD));
275
insertLatticeWord(lattice,
276
TLatticeWord(left, left + 1, SIM_ID_NOT_WORD));
253
* Lattice head should have one state, with its TState using slm's root. its
254
* pr = 0 and its mp_btword == NULL;
255
* Lattice tail must contain no word, and it previous node contain only one word
256
* with its right = left+1, right == tail.
257
* The lattice should ensure the lattice path existing
259
void buildLattice(wstring &sntnc, CLattice& lattice)
281
* Lattice head should have one state, with its TState using slm's root. its
282
* pr = 0 and its mp_btword == NULL;
283
* Lattice tail must contain no word, and it previous node contain only one word
284
* with its right = left+1, right == tail.
285
* The lattice should ensure the lattice path existing
288
buildLattice(wstring &sntnc, CLattice& lattice)
262
lattice.resize(sntnc.size()+2);
291
lattice.resize(sntnc.size() + 2);
264
293
unsigned int idcur = SIM_ID_NOT_WORD;
265
lattice[0].m_states[CThreadSlm::TState()] = TLatticeStateValue(0.0, NULL, CThreadSlm::TState());
294
lattice[0].m_states[CThreadSlm::TState()] = TLatticeStateValue(
267
for (int i=0, sz=sntnc.size(); i < sz; ) {
300
for (int i = 0, sz = sntnc.size(); i < sz; ) {
268
301
const CSIMDict::TState* pstate;
269
const TWCHAR* p = sntnc.c_str()+i;
302
const TWCHAR* p = sntnc.c_str() + i;
270
303
int len = s_dict->matchLongest(s_dict->getRoot(), pstate, p);
272
305
idcur = SIM_ID_NOT_WORD;
277
310
int ambilen = getAmbiLen(p, len);
279
312
if (ambilen <= len) {
280
insertLatticeWord(lattice, TLatticeWord(i, i+len, idcur));
313
insertLatticeWord(lattice, TLatticeWord(i, i + len, idcur));
283
316
fullSegBuildLattice(sntnc, i, ambilen, lattice);
287
lattice[sntnc.size()].m_wordstarting.push_back(TLatticeWord(sntnc.size(), sntnc.size()+1, s_iSTOKID));
320
lattice[sntnc.size()].m_wordstarting.push_back(TLatticeWord(sntnc.size(),
290
void searchBest(CLattice& lattice)
326
searchBest(CLattice& lattice)
292
for (int i=0, sz=lattice.size(); i < sz; ++i) {
328
for (int i = 0, sz = lattice.size(); i < sz; ++i) {
293
329
TLatticeColumnStates & states = lattice[i].m_states;
294
330
TLatticeColumnStates::iterator itss = states.begin();
295
331
TLatticeColumnStates::iterator itse = states.end();
371
408
TLatticeWordVec segResult;
372
409
getBestPath(lattice, segResult);
374
for (int i=0, sz=segResult.size(); i < sz; ++i) {
375
const TWCHAR *p = sntnc.c_str()+segResult[i].m_left;
411
for (int i = 0, sz = segResult.size(); i < sz; ++i) {
412
const TWCHAR *p = sntnc.c_str() + segResult[i].m_left;
376
413
int len = segResult[i].m_right - segResult[i].m_left;
377
414
idcur = segResult[i].m_wordId;
424
461
fprintf(stderr, "%d words, %d ambiguious. Done!\n", nWords, nAmbis);
427
for (int i=0; i < argc; ++i) {
464
for (int i = 0; i < argc; ++i) {
428
465
fprintf(stderr, "\nProcessing %s...", argv[i]); fflush(stderr);
429
466
FILE *fp = fopen(argv[i], "r");
430
467
if (fp != NULL) {
431
468
processSingleFile(fp, nWords, nAmbis);
432
469
fprintf(stderr, "@Offset %ld, %d words, %d ambiguious. Done!\n",
433
ftell(fp), nWords, nAmbis);
470
ftell(fp), nWords, nAmbis);
436
473
fprintf(stderr, "Can not Open!!!!!!!\n");