~ubuntu-branches/ubuntu/trusty/opencc/trusty-proposed

« back to all changes in this revision

Viewing changes to src/opencc_converter.c

  • Committer: Bazaar Package Importer
  • Author(s): LI Daobing, Asias He, LI Daobing
  • Date: 2010-08-11 22:04:12 UTC
  • mfrom: (1.1.5 upstream)
  • Revision ID: james.westby@ubuntu.com-20100811220412-ojbzhlzxphjs1340
Tags: 0.1.1-1
[ Asias He ]
* New upstream release (closes: #591076).
* debian/control: bump standards version to 3.9.1.
* debian/watch: use googlecode.debian.net.

[ LI Daobing ]
* debian/clean: added. 

Show diffs side-by-side

added added

removed removed

Lines of Context:
1
 
/*
2
 
* Open Chinese Convert
3
 
*
4
 
* Copyright 2010 BYVoid <byvoid1@gmail.com>
5
 
*
6
 
* Licensed under the Apache License, Version 2.0 (the "License");
7
 
* you may not use this file except in compliance with the License.
8
 
* You may obtain a copy of the License at
9
 
*
10
 
*      http://www.apache.org/licenses/LICENSE-2.0
11
 
*
12
 
* Unless required by applicable law or agreed to in writing, software
13
 
* distributed under the License is distributed on an "AS IS" BASIS,
14
 
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
 
* See the License for the specific language governing permissions and
16
 
* limitations under the License.
17
 
*/
18
 
 
19
 
#include "opencc_utils.h"
20
 
#include "opencc_converter.h"
21
 
#include "opencc_encoding.h"
22
 
#include "opencc_dictionary.h"
23
 
 
24
 
/* Capacity, in elements, requested for the segmentation work arrays when a converter is opened. */
#define OPENCC_SP_SEG_DEFAULT_BUFFER_SIZE 1024
25
 
 
26
 
/* Work arrays reused across calls by the shortest-path segmenter (sp_seg). */
typedef struct
{
        int initialized;        /* TRUE once the four arrays below have been allocated */
        size_t buffer_size;     /* capacity requested at the last (re)allocation */
        size_t *match_length;   /* candidate match lengths at the position being examined */
        size_t *min_len;        /* min_len[i]: fewest segments found so far that cover the first i characters */
        size_t *parent;         /* parent[i]: start offset of the segment that ends at offset i */
        size_t *path;           /* end offsets of the chosen segments, in input order */
} opencc_sp_seg_buffer;
35
 
 
36
 
typedef struct
37
 
{
38
 
        opencc_sp_seg_buffer sp_seg_buffer;
39
 
        opencc_dictionary_t dicts;
40
 
} opencc_converter_description;
41
 
 
42
 
/* Most recent failure recorded by the functions in this file; file-wide, not per converter. */
static converter_error errnum = CONVERTER_ERROR_VOID;
43
 
 
44
 
/*
 * Release the four work arrays owned by a segmentation buffer.
 * The struct itself is not freed and none of its fields are modified.
 */
static void sp_seg_buffer_free(opencc_sp_seg_buffer * ossb)
{
        size_t * const owned[] = {
                ossb->match_length,
                ossb->min_len,
                ossb->parent,
                ossb->path,
        };
        size_t idx;

        for (idx = 0; idx < sizeof(owned) / sizeof(owned[0]); idx ++)
                free(owned[idx]);
}
51
 
 
52
 
/*
 * (Re)allocate the work arrays of `ossb` for `buffer_size` elements.
 *
 * Arrays from a previous allocation are released first; their contents are
 * not carried over. match_length receives one element more than the others.
 * Allocation results are not checked here.
 */
static void sp_seg_set_buffer_size(opencc_sp_seg_buffer * ossb, size_t buffer_size)
{
        const size_t array_bytes = buffer_size * sizeof(size_t);

        /* Discard the old arrays before replacing them. */
        if (ossb->initialized == TRUE)
        {
                sp_seg_buffer_free(ossb);
        }

        ossb->match_length = (size_t *) malloc(array_bytes + sizeof(size_t));
        ossb->min_len = (size_t *) malloc(array_bytes);
        ossb->parent = (size_t *) malloc(array_bytes);
        ossb->path = (size_t *) malloc(array_bytes);

        ossb->buffer_size = buffer_size;
        ossb->initialized = TRUE;
}
65
 
 
66
 
/*
 * Shortest-path segmentation of the next `length` characters of *inbuf.
 *
 * The window is split into the fewest possible segments, where a segment is
 * either a dictionary match or a single character, and each segment is then
 * converted into *outbuf. The four cursor/counter arguments are advanced as
 * characters are consumed and produced.
 *
 * Returns the number of input characters consumed. This is less than
 * `length` (possibly 0) when the output buffer cannot hold the next segment.
 */
static size_t sp_seg(opencc_converter_description * cd, ucs4_t ** inbuf, size_t * inbuf_left,
                ucs4_t ** outbuf, size_t * outbuf_left, size_t length)
{
        /* Fast path: a window of one character needs no path search. */
        if (length == 1)
        {
                const ucs4_t * match_rs;

                /* No room for even one character: report that nothing was consumed. */
                if (*outbuf_left == 0)
                        return 0;

                match_rs = dict_match_longest(cd->dicts, *inbuf, 1);

                if (match_rs == NULL)
                        **outbuf = **inbuf;
                else
                        **outbuf = *match_rs;

                (*outbuf) ++,(*outbuf_left) --;
                (*inbuf) ++,(*inbuf_left) --;

                return 1;
        }

        /* Make sure the work arrays can hold offsets 0..length. */
        opencc_sp_seg_buffer * ossb = &(cd->sp_seg_buffer);
        size_t buffer_size_need = length + 1;
        if (ossb->initialized == FALSE || ossb->buffer_size < buffer_size_need)
                sp_seg_set_buffer_size(&(cd->sp_seg_buffer), buffer_size_need);

        size_t i, j;

        for (i = 0; i <= length; i ++)
                ossb->min_len[i] = INFINITY_INT;

        ossb->min_len[0] = ossb->parent[0] = 0;

        for (i = 0; i < length; i ++)
        {
                /* Collect every match length available at offset i. */
                size_t match_count
                        = dict_get_all_match_lengths(cd->dicts, (*inbuf) + i, ossb->match_length);

                /*
                 * A single-character step must always be available. match_length[0]
                 * is only meaningful when at least one match was reported, so it is
                 * not consulted when match_count is 0.
                 */
                if (match_count == 0 || ossb->match_length[0] != 1)
                        ossb->match_length[match_count ++] = 1;

                /* Dynamic programming over segment end offsets. */
                for (j = 0; j < match_count; j ++)
                {
                        size_t k = ossb->match_length[j];
                        ossb->match_length[j] = 0;

                        /* Ignore empty candidates and candidates that run past the window. */
                        if (k == 0 || k > length - i)
                                continue;

                        /* A multi-character match wins ties; a single character must be strictly better. */
                        if (k > 1 && ossb->min_len[i] + 1 <= ossb->min_len[i + k])
                        {
                                ossb->min_len[i + k] = ossb->min_len[i] + 1;
                                ossb->parent[i + k] = i;
                        }
                        else if (k == 1 && ossb->min_len[i] + 1 < ossb->min_len[i + k])
                        {
                                ossb->min_len[i + k] = ossb->min_len[i] + 1;
                                ossb->parent[i + k] = i;
                        }
                }
        }

        /* Walk the parent links backwards to list the segment end offsets in order. */
        for (i = length, j = ossb->min_len[length]; i != 0; i = ossb->parent[i])
                ossb->path[--j] = i;

        size_t inbuf_left_start = *inbuf_left;
        size_t begin, end;

        /* Convert segment by segment along the chosen path. */
        for (i = begin = 0; i < ossb->min_len[length]; i ++)
        {
                end = ossb->path[i];

                const ucs4_t * match_rs = dict_match_longest(cd->dicts, *inbuf, end - begin);

                if (match_rs == NULL)
                {
                        /* Output is full: stop before writing past the buffer. */
                        if (*outbuf_left == 0)
                                break;

                        /* No dictionary entry: copy the character through unchanged. */
                        **outbuf = **inbuf;
                        (*outbuf) ++, (*outbuf_left) --;
                        (*inbuf) ++, (*inbuf_left) --;
                }
                else
                {
                        /* Stop if the remaining output space cannot hold the whole replacement. */
                        size_t match_len = ucs4len(match_rs);
                        if (match_len > *outbuf_left)
                                break;

                        /*
                         * NOTE(review): the input cursor advances once per output
                         * character, which is only right if a replacement is as long
                         * as the text it replaces -- confirm against the dictionary format.
                         */
                        for (; *match_rs; match_rs ++)
                        {
                                **outbuf = *match_rs;
                                (*outbuf) ++,(*outbuf_left) --;
                                (*inbuf) ++,(*inbuf_left) --;
                        }
                }

                begin = end;
        }

        return inbuf_left_start - *inbuf_left;
}
168
 
 
169
 
static size_t agspseg(opencc_converter_description * cd,
170
 
                ucs4_t ** inbuf, size_t * inbuf_left,
171
 
                ucs4_t ** outbuf, size_t * outbuf_left)
172
 
{
173
 
        /* 歧義分割最短路徑分詞 */
174
 
        size_t i, start, bound;
175
 
        const ucs4_t * inbuf_start = *inbuf;
176
 
        size_t inbuf_left_start = *inbuf_left;
177
 
        size_t sp_seg_length;
178
 
        
179
 
        bound = 0;
180
 
        
181
 
        for (i = start = 0; inbuf_start[i] && *inbuf_left > 0 && *outbuf_left > 0; i ++)
182
 
        {
183
 
                if (i != 0 && i == bound)
184
 
                {
185
 
                        /* 對歧義部分進行最短路徑分詞 */
186
 
                        sp_seg_length = sp_seg(cd, inbuf, inbuf_left, outbuf, outbuf_left, bound - start);
187
 
                        if (sp_seg_length ==  (size_t) -1)
188
 
                                return (size_t) -1;
189
 
                        if (sp_seg_length == 0)
190
 
                        {
191
 
                                if (inbuf_left_start - *inbuf_left > 0)
192
 
                                        return inbuf_left_start - *inbuf_left;
193
 
                                /* 空間不足 */
194
 
                                errnum = CONVERTER_ERROR_OUTBUF;
195
 
                                return (size_t) -1;
196
 
                        }
197
 
                        start = i;
198
 
                }
199
 
        
200
 
                const ucs4_t * match_rs = dict_match_longest(cd->dicts, inbuf_start + i, 0);
201
 
                
202
 
                size_t match_len = 1;
203
 
                if (match_rs != NULL)
204
 
                        match_len = ucs4len(match_rs);
205
 
                
206
 
                if (i + match_len > bound)
207
 
                        bound = i + match_len;
208
 
        }
209
 
        
210
 
        if (*inbuf_left > 0 && *outbuf_left > 0)
211
 
        {
212
 
                sp_seg_length = sp_seg(cd, inbuf, inbuf_left, outbuf, outbuf_left, bound - start);
213
 
                if (sp_seg_length ==  (size_t) -1)
214
 
                        return (size_t) -1;
215
 
                if (sp_seg_length == 0)
216
 
                {
217
 
                        if (inbuf_left_start - *inbuf_left > 0)
218
 
                                return inbuf_left_start - *inbuf_left;
219
 
                        /* 空間不足 */
220
 
                        errnum = CONVERTER_ERROR_OUTBUF;
221
 
                        return (size_t) -1;
222
 
                }
223
 
        }
224
 
 
225
 
        return inbuf_left_start - *inbuf_left;
226
 
}
227
 
 
228
 
#if 0
/*
 * Forward maximum matching: repeatedly convert the longest dictionary match
 * at the input cursor, copying unmatched characters through unchanged.
 * Compiled out; converter_convert() uses agspseg() instead.
 *
 * Returns the number of input characters consumed, or (size_t) -1 with
 * errnum set to CONVERTER_ERROR_OUTBUF when the first replacement does not
 * fit in the output buffer.
 */
static size_t mmseg(opencc_converter_description * cd,
                ucs4_t ** inbuf, size_t * inbuf_left,
                ucs4_t ** outbuf, size_t * outbuf_left)
{
        const size_t initial_left = *inbuf_left;

        while (**inbuf && *inbuf_left > 0 && *outbuf_left > 0)
        {
                const ucs4_t * replacement = dict_match_longest(cd->dicts, *inbuf, *inbuf_left);
                size_t replacement_len;

                if (replacement == NULL)
                {
                        /* No dictionary entry: copy one character as is. */
                        **outbuf = **inbuf;
                        (*outbuf) ++;
                        (*outbuf_left) --;
                        (*inbuf) ++;
                        (*inbuf_left) --;
                        continue;
                }

                /* The remaining output space must hold the whole replacement. */
                replacement_len = ucs4len(replacement);
                if (replacement_len > *outbuf_left)
                {
                        if (initial_left - *inbuf_left > 0)
                                break;
                        errnum = CONVERTER_ERROR_OUTBUF;
                        return (size_t) -1;
                }

                while (*replacement)
                {
                        **outbuf = *replacement;
                        replacement ++;
                        (*outbuf) ++;
                        (*outbuf_left) --;
                        (*inbuf) ++;
                        (*inbuf_left) --;
                }
        }

        return initial_left - *inbuf_left;
}
#endif
270
 
 
271
 
/*
 * Convert text from *inbuf into *outbuf using the dictionaries assigned to
 * the converter.
 *
 * Returns whatever agspseg() returns, or (size_t) -1 with errnum set to
 * CONVERTER_ERROR_NODICT when no dictionaries have been assigned.
 */
size_t converter_convert(opencc_converter_t cdt, ucs4_t ** inbuf, size_t * inbuf_left,
                ucs4_t ** outbuf, size_t * outbuf_left)
{
        opencc_converter_description * cd = (opencc_converter_description *) cdt;

        if (cd->dicts != NULL)
                return agspseg(cd, inbuf, inbuf_left, outbuf, outbuf_left);

        errnum = CONVERTER_ERROR_NODICT;
        return (size_t) -1;
}
291
 
 
292
 
/* Store `dicts` in the converter as the dictionaries used by later conversions. */
void converter_assign_dicts(opencc_converter_t cdt, opencc_dictionary_t dicts)
{
        ((opencc_converter_description *) cdt)->dicts = dicts;
}
297
 
 
298
 
opencc_converter_t converter_open()
299
 
{
300
 
        opencc_converter_description * cd = (opencc_converter_description *)
301
 
                        malloc(sizeof(opencc_converter_description));
302
 
 
303
 
        cd->sp_seg_buffer.initialized = FALSE;
304
 
        cd->sp_seg_buffer.match_length = cd->sp_seg_buffer.min_len
305
 
                        = cd->sp_seg_buffer.parent = cd->sp_seg_buffer.path = NULL;
306
 
 
307
 
        sp_seg_set_buffer_size(&cd->sp_seg_buffer, OPENCC_SP_SEG_DEFAULT_BUFFER_SIZE);
308
 
 
309
 
        cd->dicts = NULL;
310
 
 
311
 
        return (opencc_converter_t) cd;
312
 
}
313
 
 
314
 
/*
 * Destroy a converter: release its work arrays and the converter itself.
 * The assigned dictionaries are not released here. A NULL handle is
 * accepted and ignored, as with free().
 */
void converter_close(opencc_converter_t cdt)
{
        opencc_converter_description * cd = (opencc_converter_description *) cdt;

        if (cd == NULL)
                return;

        sp_seg_buffer_free(&(cd->sp_seg_buffer));

        free(cd);
}
322
 
 
323
 
converter_error converter_errnum(void)
324
 
{
325
 
        return errnum;
326
 
}
327
 
 
328
 
void converter_perror(const char * spec)
329
 
{
330
 
        perr(spec);
331
 
        perr("\n");
332
 
        switch(errnum)
333
 
        {
334
 
        case CONVERTER_ERROR_VOID:
335
 
                break;
336
 
        case CONVERTER_ERROR_NODICT:
337
 
                perr(_("No dictionary loaded"));
338
 
                break;
339
 
        case CONVERTER_ERROR_OUTBUF:
340
 
                perr(_("Output buffer not enough for one segment"));
341
 
                break;
342
 
        default:
343
 
                perr(_("Unknown"));
344
 
        }
345
 
}