4
* Copyright 2010 BYVoid <byvoid1@gmail.com>
6
* Licensed under the Apache License, Version 2.0 (the "License");
7
* you may not use this file except in compliance with the License.
8
* You may obtain a copy of the License at
10
* http://www.apache.org/licenses/LICENSE-2.0
12
* Unless required by applicable law or agreed to in writing, software
13
* distributed under the License is distributed on an "AS IS" BASIS,
14
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
* See the License for the specific language governing permissions and
16
* limitations under the License.
19
#include "opencc_utils.h"
20
#include "opencc_converter.h"
21
#include "opencc_encoding.h"
22
#include "opencc_dictionary.h"
24
#define OPENCC_SP_SEG_DEFAULT_BUFFER_SIZE 1024
30
/* Scratch array that sp_seg hands to dict_get_all_match_lengths and then
 * reads back as the candidate match lengths at one input position. */
size_t * match_length;
34
} opencc_sp_seg_buffer;
38
/* Work arrays used by sp_seg; sized through sp_seg_set_buffer_size. */
opencc_sp_seg_buffer sp_seg_buffer;
39
/* Dictionary handle passed to the dict_* lookups; converter_convert sets
 * CONVERTER_ERROR_NODICT when this is NULL. */
opencc_dictionary_t dicts;
40
} opencc_converter_description;
42
/* File-scope error code of this module. Starts as CONVERTER_ERROR_VOID and is
 * overwritten with CONVERTER_ERROR_NODICT or CONVERTER_ERROR_OUTBUF by the
 * conversion routines in this file. */
static converter_error errnum = CONVERTER_ERROR_VOID;
44
/* Releases heap storage held by a segmentation buffer.
 * ossb: buffer whose arrays were allocated by sp_seg_set_buffer_size. */
static void sp_seg_buffer_free(opencc_sp_seg_buffer * ossb)
46
free(ossb->match_length);
52
/* (Re)allocates the work arrays of a segmentation buffer.
 * ossb:        buffer to size.
 * buffer_size: number of entries for min_len, parent and path; match_length
 *              receives one extra entry (buffer_size + 1).
 * NOTE(review): the malloc results are stored without a NULL check in the
 * lines visible here - confirm how allocation failure is meant to be handled. */
static void sp_seg_set_buffer_size(opencc_sp_seg_buffer * ossb, size_t buffer_size)
54
/* Drop the previous arrays before allocating replacements. */
if (ossb->initialized == TRUE)
55
sp_seg_buffer_free(ossb);
57
ossb->buffer_size = buffer_size;
58
ossb->match_length = (size_t *) malloc((buffer_size + 1) * sizeof(size_t));
59
ossb->min_len = (size_t *) malloc(buffer_size * sizeof(size_t));
60
ossb->parent = (size_t *) malloc(buffer_size * sizeof(size_t));
61
ossb->path = (size_t *) malloc(buffer_size * sizeof(size_t));
63
/* From here on sp_seg treats the arrays as usable. */
ossb->initialized = TRUE;
66
/* Segments `length` characters starting at *inbuf by minimising the number of
 * segments (min_len holds the best segment count per position, parent the
 * predecessor position), then writes the dictionary replacement of each
 * segment to *outbuf.
 * cd:          converter state providing the dictionary and work buffers.
 * inbuf:       input cursor, advanced as characters are consumed.
 * inbuf_left:  remaining input count, decremented alongside inbuf.
 * outbuf:      output cursor, advanced as characters are written.
 * outbuf_left: remaining output capacity, decremented alongside outbuf.
 * length:      number of input characters to segment.
 * Returns the number of input characters consumed, computed as the drop in
 * *inbuf_left since the output phase started. */
static size_t sp_seg(opencc_converter_description * cd, ucs4_t ** inbuf, size_t * inbuf_left,
67
ucs4_t ** outbuf, size_t * outbuf_left, size_t length)
74
/* Lookup with a third argument of 1.
 * NOTE(review): presumably the third argument caps the match length - confirm
 * against dict_match_longest. */
const ucs4_t * match_rs = dict_match_longest(cd->dicts, *inbuf, 1);
81
/* Advance both cursors by one character. */
(*outbuf) ++,(*outbuf_left) --;
82
(*inbuf) ++,(*inbuf_left) --;
89
opencc_sp_seg_buffer * ossb = &(cd->sp_seg_buffer);
90
/* Positions 0..length are indexed below, hence length + 1 entries. */
size_t buffer_size_need = length + 1;
91
if (ossb->initialized == FALSE || ossb->buffer_size < buffer_size_need)
92
sp_seg_set_buffer_size(&(cd->sp_seg_buffer), buffer_size_need);
96
/* Mark every position as unreached. */
for (i = 0; i <= length; i ++)
97
ossb->min_len[i] = INFINITY_INT;
99
/* Position 0 is the origin: zero segments, parent is itself. */
ossb->min_len[0] = ossb->parent[0] = 0;
101
/* For each start position, relax every candidate segment that begins there. */
for (i = 0; i < length; i ++)
105
= dict_get_all_match_lengths(cd->dicts, (*inbuf) + i, ossb->match_length);
107
/* Make sure a single-character segment is among the candidates. */
if (ossb->match_length[0] != 1)
108
ossb->match_length[match_count ++] = 1;
111
for (j = 0; j < match_count; j ++)
113
size_t k = ossb->match_length[j];
114
/* Reset the slot once its value has been read. */
ossb->match_length[j] = 0;
116
/* Multi-character segments win ties (<=) ... */
if (k > 1 && ossb->min_len[i] + 1 <= ossb->min_len[i + k])
118
ossb->min_len[i + k] = ossb->min_len[i] + 1;
119
ossb->parent[i + k] = i;
121
/* ... while a single character must be strictly better (<). */
else if (k == 1 && ossb->min_len[i] + 1 < ossb->min_len[i + k])
123
ossb->min_len[i + k] = ossb->min_len[i] + 1;
124
ossb->parent[i + k] = i;
130
/* Walk the parent chain backwards from the end; j starts at the segment count. */
for (i = length, j = ossb->min_len[length]; i != 0; i = ossb->parent[i])
133
/* Baseline used to compute the return value. */
size_t inbuf_left_start = *inbuf_left;
137
/* One iteration per segment on the chosen path. */
for (i = begin = 0; i < ossb->min_len[length]; i ++)
141
/* Look up the replacement for the segment of end - begin characters. */
const ucs4_t * match_rs = dict_match_longest(cd->dicts, *inbuf, end - begin);
143
if (match_rs == NULL)
146
(*outbuf) ++, (*outbuf_left) --;
147
(*inbuf) ++, (*inbuf_left) --;
151
/* The remaining output buffer space is smaller than the segment length */
size_t match_len = ucs4len(match_rs);
153
if (match_len > *outbuf_left)
155
/* Copy the zero-terminated replacement into the output. */
for (; *match_rs; match_rs ++)
157
**outbuf = *match_rs;
158
(*outbuf) ++,(*outbuf_left) --;
159
(*inbuf) ++,(*inbuf_left) --;
166
return inbuf_left_start - *inbuf_left;
169
/* Scans the input while tracking `bound`, the furthest end of any dictionary
 * match seen so far, and hands each span [start, bound) to sp_seg.
 * cd:          converter state.
 * inbuf:       input cursor (advanced by sp_seg).
 * inbuf_left:  remaining input count.
 * outbuf:      output cursor (advanced by sp_seg).
 * outbuf_left: remaining output capacity.
 * Returns the number of input characters consumed; sets errnum to
 * CONVERTER_ERROR_OUTBUF when sp_seg makes no progress and nothing has been
 * consumed yet. */
static size_t agspseg(opencc_converter_description * cd,
170
ucs4_t ** inbuf, size_t * inbuf_left,
171
ucs4_t ** outbuf, size_t * outbuf_left)
174
size_t i, start, bound;
175
/* Scanning indexes from this fixed origin while *inbuf itself is advanced by sp_seg. */
const ucs4_t * inbuf_start = *inbuf;
176
size_t inbuf_left_start = *inbuf_left;
177
size_t sp_seg_length;
181
/* Stop at the terminating zero or when either buffer is exhausted. */
for (i = start = 0; inbuf_start[i] && *inbuf_left > 0 && *outbuf_left > 0; i ++)
183
/* Position i has reached the furthest match end seen so far: flush the span. */
if (i != 0 && i == bound)
186
sp_seg_length = sp_seg(cd, inbuf, inbuf_left, outbuf, outbuf_left, bound - start);
187
if (sp_seg_length == (size_t) -1)
189
/* sp_seg consumed nothing. */
if (sp_seg_length == 0)
191
/* Report partial progress if any input was consumed earlier. */
if (inbuf_left_start - *inbuf_left > 0)
192
return inbuf_left_start - *inbuf_left;
194
errnum = CONVERTER_ERROR_OUTBUF;
200
/* NOTE(review): the third argument is 0 here - presumably "no length limit";
 * confirm against dict_match_longest. */
const ucs4_t * match_rs = dict_match_longest(cd->dicts, inbuf_start + i, 0);
202
/* Without a match the position counts as a single character. */
size_t match_len = 1;
203
if (match_rs != NULL)
204
match_len = ucs4len(match_rs);
206
/* Extend the span so it covers this match. */
if (i + match_len > bound)
207
bound = i + match_len;
210
/* After the scan: flush the final span if both buffers still have room. */
if (*inbuf_left > 0 && *outbuf_left > 0)
212
sp_seg_length = sp_seg(cd, inbuf, inbuf_left, outbuf, outbuf_left, bound - start);
213
if (sp_seg_length == (size_t) -1)
215
if (sp_seg_length == 0)
217
if (inbuf_left_start - *inbuf_left > 0)
218
return inbuf_left_start - *inbuf_left;
220
errnum = CONVERTER_ERROR_OUTBUF;
225
return inbuf_left_start - *inbuf_left;
229
/* Converts input by repeatedly taking the longest dictionary match at the
 * current input position.
 * cd:          converter state.
 * inbuf:       input cursor, advanced as characters are consumed.
 * inbuf_left:  remaining input count.
 * outbuf:      output cursor, advanced as characters are written.
 * outbuf_left: remaining output capacity.
 * Returns the number of input characters consumed; sets errnum to
 * CONVERTER_ERROR_OUTBUF when a match does not fit in the output. */
static size_t mmseg(opencc_converter_description * cd,
230
ucs4_t ** inbuf, size_t * inbuf_left,
231
ucs4_t ** outbuf, size_t * outbuf_left)
234
/* Baseline used to compute the return value. */
size_t inbuf_left_start = *inbuf_left;
236
/* Continue until the terminating zero or until either buffer is exhausted. */
for (; **inbuf && *inbuf_left > 0 && *outbuf_left > 0;)
238
const ucs4_t * match_rs = dict_match_longest(cd->dicts, *inbuf, *inbuf_left);
240
if (match_rs == NULL)
243
(*outbuf) ++, (*outbuf_left) --;
244
(*inbuf) ++, (*inbuf_left) --;
248
/* The remaining output buffer space is smaller than the segment length */
size_t match_len = ucs4len(match_rs);
250
if (match_len > *outbuf_left)
252
/* Distinguish "some input already converted" from "no progress at all". */
if (inbuf_left_start - *inbuf_left > 0)
254
errnum = CONVERTER_ERROR_OUTBUF;
258
/* Copy the zero-terminated replacement into the output. */
for (; *match_rs; match_rs ++)
260
**outbuf = *match_rs;
261
(*outbuf) ++,(*outbuf_left) --;
262
(*inbuf) ++,(*inbuf_left) --;
267
return inbuf_left_start - *inbuf_left;
271
/* Public conversion entry point.
 * cdt:         opaque converter handle; cast to opencc_converter_description.
 * inbuf / inbuf_left:   input cursor and remaining input count.
 * outbuf / outbuf_left: output cursor and remaining output capacity.
 * Sets errnum to CONVERTER_ERROR_NODICT when no dictionary is assigned. */
size_t converter_convert(opencc_converter_t cdt, ucs4_t ** inbuf, size_t * inbuf_left,
272
ucs4_t ** outbuf, size_t * outbuf_left)
274
opencc_converter_description * cd = (opencc_converter_description *) cdt;
276
/* A converter without a dictionary cannot convert anything. */
if (cd->dicts == NULL)
278
errnum = CONVERTER_ERROR_NODICT;
292
/* Associates a dictionary with a converter.
 * cdt:   opaque converter handle; cast to opencc_converter_description.
 * dicts: dictionary handle to associate with the converter. */
void converter_assign_dicts(opencc_converter_t cdt, opencc_dictionary_t dicts)
294
opencc_converter_description * cd = (opencc_converter_description *) cdt;
298
/* Creates a converter: allocates the description, resets the segmentation
 * buffer fields and allocates work arrays of
 * OPENCC_SP_SEG_DEFAULT_BUFFER_SIZE entries.
 * Returns the new converter as an opaque handle.
 * NOTE(review): the malloc result is dereferenced without a NULL check in the
 * lines visible here - confirm the intended out-of-memory behaviour. */
opencc_converter_t converter_open()
300
opencc_converter_description * cd = (opencc_converter_description *)
301
malloc(sizeof(opencc_converter_description));
303
/* Mark the buffer as not yet allocated so the sizing call below does not free
 * garbage pointers. */
cd->sp_seg_buffer.initialized = FALSE;
304
cd->sp_seg_buffer.match_length = cd->sp_seg_buffer.min_len
305
= cd->sp_seg_buffer.parent = cd->sp_seg_buffer.path = NULL;
307
sp_seg_set_buffer_size(&cd->sp_seg_buffer, OPENCC_SP_SEG_DEFAULT_BUFFER_SIZE);
311
return (opencc_converter_t) cd;
314
/* Tears down a converter; the visible step releases its segmentation buffers.
 * cdt: opaque converter handle; cast to opencc_converter_description. */
void converter_close(opencc_converter_t cdt)
316
opencc_converter_description * cd = (opencc_converter_description *) cdt;
318
sp_seg_buffer_free(&(cd->sp_seg_buffer));
323
/* Accessor yielding a converter_error value.
 * NOTE(review): presumably returns the file-scope errnum - confirm against
 * the function body. */
converter_error converter_errnum(void)
328
/* Reports a converter error through perr, choosing a translated message per
 * error code.
 * spec: caller-supplied text.
 * NOTE(review): presumably spec is printed as a prefix before the message -
 * confirm against the function body. */
void converter_perror(const char * spec)
334
case CONVERTER_ERROR_VOID:
336
case CONVERTER_ERROR_NODICT:
337
perr(_("No dictionary loaded"));
339
case CONVERTER_ERROR_OUTBUF:
340
perr(_("Output buffer not enough for one segment"));