2
2
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
4
4
* Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
6
6
* The contents of this file are subject to the terms of either the GNU Lesser
7
7
* General Public License Version 2.1 only ("LGPL") or the Common Development and
8
8
* Distribution License ("CDDL")(collectively, the "License"). You may not use this
9
9
* file except in compliance with the License. You can obtain a copy of the CDDL at
10
10
* http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
11
* http://www.opensource.org/licenses/lgpl-license.php. See the License for the
11
* http://www.opensource.org/licenses/lgpl-license.php. See the License for the
12
12
* specific language governing permissions and limitations under the License. When
13
13
* distributing the software, include this License Header Notice in each file and
14
14
* include the full text of the License in the License file as well as the
15
15
* following notice:
17
17
* NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
19
19
* For Covered Software in this distribution, this License shall be governed by the
42
42
CShuangpinData CShuangpinSegmentor::s_shpData;
44
CShuangpinSegmentor::CShuangpinSegmentor (EShuangpinType shpType)
44
CShuangpinSegmentor::CShuangpinSegmentor (EShuangpinType shpType)
45
45
: m_updatedFrom(0), m_nAlpha(0), m_hasInvalid(false), m_nLastValidPos(0)
49
49
s_shpData.setShuangpinType(shpType);
52
unsigned CShuangpinSegmentor::push (unsigned ch)
53
CShuangpinSegmentor::push(unsigned ch)
54
m_inputBuf.push_back (ch);
55
m_inputBuf.push_back(ch);
56
return m_updatedFrom = _push (ch);
57
return m_updatedFrom = _push(ch);
59
unsigned CShuangpinSegmentor::pop ()
61
CShuangpinSegmentor::pop()
61
63
if (m_pystr.empty())
62
64
return m_updatedFrom = 0;
64
unsigned size = m_inputBuf.size ();
66
unsigned size = m_inputBuf.size();
66
68
EShuangpinType shpType = s_shpData.getShuangpinType();
67
bool isInputPy = ( islower(m_pystr[size-1]) ||
68
(m_pystr[size-1] == ';' && (shpType == MS2003 || shpType == ZIGUANG)) );
69
bool isInputPy = (islower(m_pystr[size - 1]) ||
70
(m_pystr[size - 1] == ';' &&
71
(shpType == MS2003 || shpType == ZIGUANG)));
73
m_inputBuf.resize (size - 1);
74
m_pystr.resize (size - 1);
76
m_inputBuf.resize(size - 1);
77
m_pystr.resize(size - 1);
76
79
unsigned l = m_segs.back().m_len;
79
if (size==1 || m_segs.back().m_type!=IPySegmentor::INVALID) {
82
if (size == 1 || m_segs.back().m_type != IPySegmentor::INVALID) {
80
83
m_hasInvalid = false;
83
86
return m_updatedFrom = size - 1;
85
std::string new_pystr = m_pystr.substr (size-l);
86
m_pystr.resize (size-l);
88
std::string new_pystr = m_pystr.substr(size - l);
89
m_pystr.resize(size - l);
88
91
m_updatedFrom = UINT_MAX;
89
92
std::string::const_iterator it = new_pystr.begin();
90
for (; it!= new_pystr.end(); ++it) {
91
unsigned tmp = _push ((*it) & 0x7f);
93
for (; it != new_pystr.end(); ++it) {
94
unsigned tmp = _push((*it) & 0x7f);
92
95
if (tmp < m_updatedFrom) m_updatedFrom = tmp;
95
98
return m_updatedFrom;
98
unsigned CShuangpinSegmentor::insertAt (unsigned idx, unsigned ch)
102
CShuangpinSegmentor::insertAt(unsigned idx, unsigned ch)
100
104
unsigned pyIdx, segIdx;
101
_locateSegment (idx, pyIdx, segIdx);
103
m_inputBuf.insert (idx, 1, ch);
104
m_pystr.insert (idx, 1, ch);
106
std::string new_pystr = m_pystr.substr (pyIdx);
107
m_pystr.resize (pyIdx);
108
m_segs.erase (m_segs.begin()+segIdx, m_segs.end());
105
_locateSegment(idx, pyIdx, segIdx);
107
m_inputBuf.insert(idx, 1, ch);
108
m_pystr.insert(idx, 1, ch);
110
std::string new_pystr = m_pystr.substr(pyIdx);
111
m_pystr.resize(pyIdx);
112
m_segs.erase(m_segs.begin() + segIdx, m_segs.end());
110
114
if (m_nLastValidPos == idx) {
111
115
m_hasInvalid = false;
113
116
} else if (m_nLastValidPos + 1 == idx) {
114
117
m_hasInvalid = false;
115
118
int nSize = m_pystr.size();
116
if (islower(m_pystr[nSize-1])) {
119
if (islower(m_pystr[nSize - 1])) {
117
120
m_nLastValidPos = idx - 1;
118
new_pystr.insert((size_t)0, 1, m_pystr[nSize-1]);
119
m_pystr.erase(nSize-1, 1);
120
m_segs.erase (m_segs.begin()+segIdx-1);
121
new_pystr.insert((size_t)0, 1, m_pystr[nSize - 1]);
122
m_pystr.erase(nSize - 1, 1);
123
m_segs.erase(m_segs.begin() + segIdx - 1);
123
125
} else if (m_nLastValidPos + 1 > idx) {
124
126
m_hasInvalid = false;
125
127
m_nLastValidPos = idx;
129
131
m_updatedFrom = UINT_MAX;
130
132
std::string::const_iterator it = new_pystr.begin();
131
for (; it!= new_pystr.end(); ++it) {
132
unsigned tmp = _push ((*it) & 0x7f);
133
for (; it != new_pystr.end(); ++it) {
134
unsigned tmp = _push((*it) & 0x7f);
133
135
if (tmp < m_updatedFrom) m_updatedFrom = tmp;
136
138
return m_updatedFrom;
139
unsigned CShuangpinSegmentor::deleteAt (unsigned idx, bool backward)
142
CShuangpinSegmentor::deleteAt(unsigned idx, bool backward)
141
144
unsigned pyIdx, segIdx;
142
145
if (!backward) idx += 1;
143
_locateSegment (idx, pyIdx, segIdx);
145
m_inputBuf.erase (idx, 1);
146
m_pystr.erase (idx, 1);
148
std::string new_pystr = m_pystr.substr (pyIdx);
149
m_pystr.resize (pyIdx);
150
TSegmentVec tmp_segs (m_segs.begin()+segIdx+1, m_segs.end());
151
m_segs.erase (m_segs.begin()+segIdx, m_segs.end());
146
_locateSegment(idx, pyIdx, segIdx);
148
m_inputBuf.erase(idx, 1);
149
m_pystr.erase(idx, 1);
151
std::string new_pystr = m_pystr.substr(pyIdx);
152
m_pystr.resize(pyIdx);
153
TSegmentVec tmp_segs(m_segs.begin() + segIdx + 1, m_segs.end());
154
m_segs.erase(m_segs.begin() + segIdx, m_segs.end());
153
156
if (m_nLastValidPos + 1 < idx) {
154
//del invalid ch, and do not effect current status.
157
//del invalid ch, and do not effect current status.
155
158
m_pystr.insert(idx, new_pystr);
156
m_segs.insert (m_segs.end(), tmp_segs.begin(), tmp_segs.end());
157
return m_inputBuf.size() -1;
159
m_segs.insert(m_segs.end(), tmp_segs.begin(), tmp_segs.end());
160
return m_inputBuf.size() - 1;
159
162
m_hasInvalid = false;
160
163
m_nAlpha = _getNumberOfNonAlpha();
163
166
m_updatedFrom = UINT_MAX;
164
167
std::string::const_iterator it = new_pystr.begin();
165
for (; it!= new_pystr.end(); ++it) {
166
unsigned tmp = _push ((*it) & 0x7f);
168
for (; it != new_pystr.end(); ++it) {
169
unsigned tmp = _push((*it) & 0x7f);
167
170
if (tmp < m_updatedFrom) m_updatedFrom = tmp;
170
173
return m_updatedFrom;
173
unsigned CShuangpinSegmentor::clear (unsigned from)
177
CShuangpinSegmentor::clear(unsigned from)
175
m_inputBuf.resize (from);
176
return _clear (from);
179
m_inputBuf.resize(from);
179
unsigned CShuangpinSegmentor::_clear (unsigned from)
184
CShuangpinSegmentor::_clear(unsigned from)
182
_locateSegment (from, i, j);
187
_locateSegment(from, i, j);
184
std::string new_pystr = m_pystr.substr (i, from-i);
189
std::string new_pystr = m_pystr.substr(i, from - i);
186
191
m_nAlpha = _getNumberOfNonAlpha();
188
m_segs.erase (m_segs.begin()+j, m_segs.end());
193
m_segs.erase(m_segs.begin() + j, m_segs.end());
190
195
if (m_nLastValidPos + 1 >= from) {
191
196
m_hasInvalid = false;
194
199
m_updatedFrom = from;
196
201
for (std::string::const_iterator it = new_pystr.begin();
197
it!= new_pystr.end(); ++it) {
198
unsigned tmp = _push ((*it) & 0x7f);
202
it != new_pystr.end(); ++it) {
203
unsigned tmp = _push((*it) & 0x7f);
199
204
if (tmp < m_updatedFrom) m_updatedFrom = tmp;
202
207
return m_updatedFrom;
205
int CShuangpinSegmentor::_getNumberOfNonAlpha() const
211
CShuangpinSegmentor::_getNumberOfNonAlpha() const
207
213
int nNonAlpha = 0;
208
214
for (const char* c = m_pystr.c_str(); *c != 0; ++c) {
212
218
return nNonAlpha;
215
void CShuangpinSegmentor::_locateSegment (unsigned idx, unsigned &strIdx, unsigned &segIdx)
222
CShuangpinSegmentor::_locateSegment(unsigned idx,
217
226
strIdx = segIdx = 0;
219
TSegmentVec::const_iterator it = m_segs.begin();
228
TSegmentVec::const_iterator it = m_segs.begin();
220
229
TSegmentVec::const_iterator ite = m_segs.end();
222
231
for (; it != ite; ++it) {
223
232
if (strIdx + it->m_len > idx)
226
235
strIdx += it->m_len;
231
int CShuangpinSegmentor::_encode(const char* buf, char ch, bool isComplete)
241
CShuangpinSegmentor::_encode(const char* buf, char ch, bool isComplete)
261
271
TSyllable syl = s_shpData.encodeSyllable(iter->c_str());
262
272
if ((int)syl != 0) {
263
273
s.m_syllables.push_back(syl);
264
m_segs.push_back (s);
266
m_segs.push_back (TSegment (ch, s.m_start, 1, IPySegmentor::STRING));
276
m_segs.push_back(TSegment(ch, s.m_start, 1,
277
IPySegmentor::STRING));
269
280
return s.m_start;
273
unsigned CShuangpinSegmentor::_push (unsigned ch)
285
CShuangpinSegmentor::_push(unsigned ch)
275
287
int startFrom = 0;
277
289
EShuangpinType shpType;
279
m_pystr.push_back (ch);
291
m_pystr.push_back(ch);
280
292
const int len = m_pystr.size();
281
293
if (m_hasInvalid) {
282
294
startFrom = len - 1;
283
m_segs.push_back (TSegment (ch, startFrom, 1, IPySegmentor::INVALID));
295
m_segs.push_back(TSegment(ch, startFrom, 1, IPySegmentor::INVALID));
287
299
shpType = s_shpData.getShuangpinType();
288
isInputPy = ( islower(ch) ||
289
(ch == ';' && (shpType == MS2003 || shpType == ZIGUANG)) );
300
isInputPy = (islower(ch) ||
301
(ch == ';' && (shpType == MS2003 || shpType == ZIGUANG)));
292
304
startFrom = len - 1;
294
306
IPySegmentor::ESegmentType seg_type;
295
307
if (ch == '\'' && m_inputBuf.size() > 1)
296
308
seg_type = IPySegmentor::SYLLABLE_SEP;
298
310
seg_type = IPySegmentor::STRING;
299
m_segs.push_back (TSegment (ch, startFrom, 1, seg_type));
311
m_segs.push_back(TSegment(ch, startFrom, 1, seg_type));
301
313
m_nLastValidPos += 1;
303
bool bCompleted = !((len - m_nAlpha)%2) && isInputPy;
315
bool bCompleted = !((len - m_nAlpha) % 2) && isInputPy;
305
317
if (bCompleted) {
306
sprintf(buf, "%c%c", m_pystr[len-2], ch);
318
sprintf(buf, "%c%c", m_pystr[len - 2], ch);
308
320
sprintf(buf, "%c", ch);
311
323
if (startFrom < 0) {
312
324
m_hasInvalid = true;
313
325
startFrom = m_pystr.size() - 1;
314
m_segs.push_back (TSegment (ch, startFrom, 1, IPySegmentor::INVALID));
326
m_segs.push_back(TSegment(ch, startFrom, 1, IPySegmentor::INVALID));
320
332
if (m_pGetFuzzySyllablesOp && m_pGetFuzzySyllablesOp->isEnabled())
321
if ( m_segs.back().m_type == SYLLABLE)
322
_addFuzzySyllables (m_segs.back ());
333
if (m_segs.back().m_type == SYLLABLE)
334
_addFuzzySyllables(m_segs.back());
324
336
return startFrom;
327
void CShuangpinSegmentor::_addFuzzySyllables (TSegment& seg)
340
CShuangpinSegmentor::_addFuzzySyllables(TSegment& seg)
329
assert (seg.m_type == SYLLABLE);
342
assert(seg.m_type == SYLLABLE);
331
344
seg.m_fuzzy_syllables.clear();
333
std::vector<unsigned>::iterator it = seg.m_syllables.begin();
346
std::vector<unsigned>::iterator it = seg.m_syllables.begin();
334
347
std::vector<unsigned>::iterator ite = seg.m_syllables.end();
335
for (; it != ite; ++it)
337
CSyllables fuzzy_set = (*m_pGetFuzzySyllablesOp) (*it);
348
for (; it != ite; ++it) {
349
CSyllables fuzzy_set = (*m_pGetFuzzySyllablesOp)(*it);
339
CSyllables::const_iterator _it = fuzzy_set.begin ();
340
CSyllables::const_iterator _ite = fuzzy_set.end ();
351
CSyllables::const_iterator _it = fuzzy_set.begin();
352
CSyllables::const_iterator _ite = fuzzy_set.end();
341
353
for (; _it != _ite; ++_it)
342
seg.m_fuzzy_syllables.push_back (*_it);
354
seg.m_fuzzy_syllables.push_back(*_it);