2
2
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
4
4
* Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
6
6
* The contents of this file are subject to the terms of either the GNU Lesser
7
7
* General Public License Version 2.1 only ("LGPL") or the Common Development and
8
8
* Distribution License ("CDDL")(collectively, the "License"). You may not use this
9
9
* file except in compliance with the License. You can obtain a copy of the CDDL at
10
10
* http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
11
* http://www.opensource.org/licenses/lgpl-license.php. See the License for the
11
* http://www.opensource.org/licenses/lgpl-license.php. See the License for the
12
12
* specific language governing permissions and limitations under the License. When
13
13
* distributing the software, include this License Header Notice in each file and
14
14
* include the full text of the License in the License file as well as the
15
15
* following notice:
17
17
* NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
19
19
* For Covered Software in this distribution, this License shall be governed by the
21
21
* Any litigation relating to this License shall be subject to the jurisdiction of
22
22
* the Federal Courts of the Northern District of California and the state courts
23
23
* of the State of California, with venue lying in Santa Clara County, California.
27
27
* If you wish your version of this file to be governed by only the CDDL or only
28
28
* the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
29
29
* include this software in this distribution under the [CDDL or LGPL Version 2.1]
57
57
static struct option long_options[] =
60
{"format", 1, 0, 'f'},
61
{"show-id", 0, 0, 'i'},
63
{"ambiguious-id", 1, 0, 'a'},
59
{ "dict", 1, 0, 'd' },
60
{ "format", 1, 0, 'f' },
61
{ "show-id", 0, 0, 'i' },
62
{ "s-tok", 1, 0, 's' },
63
{ "ambiguious-id", 1, 0, 'a' },
67
67
/* Path of the lexicon (dictionary) file. Assigned a strdup() copy of optarg
 * during option parsing and later handed to CSIMDict::parseText(). */
static char* s_strDictFile = NULL;
78
78
fprintf(stderr, "\nUsage:\n");
79
fprintf(stderr, "mmseg -d dict_file [-f (text|bin)] [-i] [-s STOK_ID] [-a AMBI_ID]\n\n");
81
"mmseg -d dict_file [-f (text|bin)] [-i] [-s STOK_ID] [-a AMBI_ID]\n\n");
80
82
fprintf(stderr, " -f --format:\n");
81
fprintf(stderr, " Output Format, can be 'text' or 'bin'. default 'bin'\n");
82
fprintf(stderr, " Normally, in text mode, word text are output, while in binary mode,\n");
83
fprintf(stderr, " binary short integer of the word-ids are written to stdout.\n");
84
" Output Format, can be 'text' or 'bin'. default 'bin'\n");
87
" Normally, in text mode, word text are output, while in binary mode,\n");
89
" binary short integer of the word-ids are written to stdout.\n");
84
90
fprintf(stderr, " -s --stok:\n");
85
91
fprintf(stderr, " Sentence token id. Default 10.\n");
86
fprintf(stderr, " It will be written to output in binary mode after every sentence.\n");
94
" It will be written to output in binary mode after every sentence.\n");
87
95
fprintf(stderr, " -i --show-id:\n");
88
fprintf(stderr, " Show Id info. Under text output format mode, attach id after known.\n");
98
" Show Id info. Under text output format mode, attach id after known.\n");
89
99
fprintf(stderr, " words. If under binary mode, print id(s) in text.\n");
90
100
fprintf(stderr, " -a --ambiguious-id:\n");
91
fprintf(stderr, " Ambiguious means ABC => A BC or AB C. If specified (AMBI-ID != 0), \n");
92
fprintf(stderr, " The sequence ABC will not be segmented, in binary mode, the AMBI-ID \n");
93
fprintf(stderr, " is written out; in text mode, <ambi>ABC</ambi> will be output. Default \n");
103
" Ambiguious means ABC => A BC or AB C. If specified (AMBI-ID != 0), \n");
106
" The sequence ABC will not be segmented, in binary mode, the AMBI-ID \n");
109
" is written out; in text mode, <ambi>ABC</ambi> will be output. Default \n");
94
110
fprintf(stderr, " is 0.\n");
95
111
fprintf(stderr, "\n");
96
112
fprintf(stderr, "Notes:\n");
97
fprintf(stderr, " Under binary mode, consecutive id of 0 are merged into one 0.\n");
98
fprintf(stderr, " Under text mode, no space are inserted between unknown-words. \n");
114
" Under binary mode, consecutive id of 0 are merged into one 0.\n");
117
" Under text mode, no space are inserted between unknown-words. \n");
99
118
fprintf(stderr, "\n");
100
119
fprintf(stderr, "\n");
105
124
getParameters(int argc, char* argv[])
108
while ((c=getopt_long(argc, argv, "d:if:s:a:", long_options, NULL)) != -1)
128
getopt_long(argc, argv, "d:if:s:a:", long_options,
112
132
s_strDictFile = strdup(optarg);
149
output(int len, const TWCHAR* p, TSIMWordId idprev, TSIMWordId idcur, int& nWords)
151
175
static char mbword[1024];
152
176
static TWCHAR wcword[1024];
154
178
bool bRealGap = (idcur != SIM_ID_NOT_WORD || idprev != SIM_ID_NOT_WORD);
155
179
if (s_bTextOut) {
156
for (int i=0; i < len; ++i, ++p)
180
for (int i = 0; i < len; ++i, ++p)
159
183
WCSTOMBS(mbword, wcword, sizeof(mbword));
161
185
printf("(%d)", unsigned(idprev));
162
186
if (bRealGap && (nWords > 0))
164
(s_iAmbiID && idcur == s_iAmbiID)? printf ("<ambi>%s</ambi>", mbword):
165
printf("%s", mbword);
188
(s_iAmbiID && idcur == s_iAmbiID) ? printf("<ambi>%s</ambi>", mbword) :
189
printf("%s", mbword);
166
190
if (s_bShowId && idcur != SIM_ID_NOT_WORD)
167
191
printf("(%d)", unsigned(idcur));
183
* Return the maximum crossing-ambiguity length. For example, ABCDEF if ABC, CD and DEF are words.
184
* If the returned len > word_len, then an ambiguity exists over the span [p, p+len)...
207
* Return the maximum crossing-ambiguity length. For example, ABCDEF if ABC, CD and DEF are words.
208
* If the returned len > word_len, then an ambiguity exists over the span [p, p+len)...
187
211
getAmbiLen(const TWCHAR* p, int word_len)
189
213
const CSIMDict::TState* pstate;
191
for (int i=1; i<word_len && *(p+i) != WCH_NULL; ++i) {
192
int len = s_dict->matchLongest(s_dict->getRoot(), pstate, p+i);
193
if (word_len < i+len)
215
for (int i = 1; i < word_len && *(p + i) != WCH_NULL; ++i) {
216
int len = s_dict->matchLongest(s_dict->getRoot(), pstate, p + i);
217
if (word_len < i + len)
263
286
fprintf(stderr, "Loading lexicon..."); fflush(stderr);
264
287
s_dict = new CSIMDict();
265
288
if (!s_dict->parseText(s_strDictFile)) {
266
fprintf(stderr, "fail\n"); fflush(stderr);
289
fprintf(stderr, "fail\n"); fflush(stderr);
269
292
fprintf(stderr, "done"); fflush(stderr);
272
295
fprintf(stderr, "\nProcessing from stdin..."); fflush(stderr);
273
296
processSingleFile(stdin, nWords, nAmbis);
274
fprintf(stderr, "%d words, %d ambiguious. Done!\n", nWords, nAmbis); fflush(stderr);
297
fprintf(stderr, "%d words, %d ambiguious. Done!\n", nWords, nAmbis);
276
for (int i=0; i < argc; ++i) {
300
for (int i = 0; i < argc; ++i) {
277
301
fprintf(stderr, "\nProcessing %s...", argv[i]); fflush(stderr);
278
302
FILE *fp = fopen(argv[i], "r");
279
303
if (fp != NULL) {
280
304
processSingleFile(fp, nWords, nAmbis);
281
fprintf(stderr, "@Offset %ld, %d words, %d ambiguious. Done!\n", ftell(fp), nWords, nAmbis); fflush(stderr);
306
"@Offset %ld, %d words, %d ambiguious. Done!\n",
309
nAmbis); fflush(stderr);
283
311
fprintf(stderr, "Can not Open!!!!!!!\n"); fflush(stderr);