~ubuntu-branches/ubuntu/maverick/python3.1/maverick

« back to all changes in this revision

Viewing changes to Modules/cjkcodecs/_codecs_cn.c

  • Committer: Bazaar Package Importer
  • Author(s): Matthias Klose
  • Date: 2009-03-23 00:01:27 UTC
  • Revision ID: james.westby@ubuntu.com-20090323000127-5fstfxju4ufrhthq
Tags: upstream-3.1~a1+20090322
Import upstream version 3.1~a1+20090322

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
/*
 
2
 * _codecs_cn.c: Codecs collection for Mainland Chinese encodings
 
3
 *
 
4
 * Written by Hye-Shik Chang <perky@FreeBSD.org>
 
5
 */
 
6
 
 
7
#include "cjkcodecs.h"
 
8
#include "mappings_cn.h"
 
9
 
 
10
/**
 
11
 * hz is predefined as 100 on AIX. So we undefine it to avoid
 
12
 * a conflict with the hz codec's identifiers.
 
13
 */
 
14
#ifdef _AIX
 
15
#undef hz
 
16
#endif
 
17
 
 
18
/* GBK and GB2312 map a few code points differently; they are listed below:
 
19
 *
 
20
 *              gb2312                          gbk
 
21
 * A1A4         U+30FB KATAKANA MIDDLE DOT      U+00B7 MIDDLE DOT
 
22
 * A1AA         U+2015 HORIZONTAL BAR           U+2014 EM DASH
 
23
 * A844         undefined                       U+2015 HORIZONTAL BAR
 
24
 */
 
25
 
 
26
/*
 * GBK_DECODE(dc1, dc2, assi): decode the two-byte GBK sequence (dc1, dc2)
 * and store the resulting code point in `assi`.
 *
 * The three code points on which GBK and GB2312 disagree are handled
 * explicitly first.  Everything else is looked up in the gb2312 table (with
 * the high bit of each byte cleared) and then in the gbkext table.
 *
 * The expansion is an if/else-if chain whose final `if` has an empty body,
 * so the caller can append its own `else` for the "no mapping" case.
 * Arguments are evaluated several times and must be free of side effects.
 * They are parenthesized before `^` so that an argument containing a
 * lower-precedence operator (e.g. `a | b`) is not mis-parsed.
 */
#define GBK_DECODE(dc1, dc2, assi) \
    if ((dc1) == 0xa1 && (dc2) == 0xaa) (assi) = 0x2014; \
    else if ((dc1) == 0xa8 && (dc2) == 0x44) (assi) = 0x2015; \
    else if ((dc1) == 0xa1 && (dc2) == 0xa4) (assi) = 0x00b7; \
    else TRYMAP_DEC(gb2312, assi, (dc1) ^ 0x80, (dc2) ^ 0x80); \
    else TRYMAP_DEC(gbkext, assi, dc1, dc2);
 
32
 
 
33
/*
 * GBK_ENCODE(code, assi): map the Unicode code point `code` to its GBK
 * double-byte value in `assi`.
 *
 * U+2014, U+2015 and U+00B7 get the GBK-specific assignments; U+30FB is
 * deliberately never encoded; anything else goes through the gbcommon table.
 *
 * The expansion ends in an `if` with an empty body so that the caller can
 * attach an `else` branch that runs when no mapping was found.
 */
#define GBK_ENCODE(code, assi)                                  \
    if ((code) == 0x2014)                                       \
        (assi) = 0xa1aa;                                        \
    else if ((code) == 0x2015)                                  \
        (assi) = 0xa844;                                        \
    else if ((code) == 0x00b7)                                  \
        (assi) = 0xa1a4;                                        \
    else if ((code) != 0x30fb &&                                \
             TRYMAP_ENC_COND(gbcommon, assi, code))             \
        ;
 
38
 
 
39
/*
 
40
 * GB2312 codec
 
41
 */
 
42
 
 
43
ENCODER(gb2312)
 
44
{
 
45
        while (inleft > 0) {
 
46
                Py_UNICODE c = IN1;
 
47
                DBCHAR code;
 
48
 
 
49
                if (c < 0x80) {
 
50
                        WRITE1((unsigned char)c)
 
51
                        NEXT(1, 1)
 
52
                        continue;
 
53
                }
 
54
                UCS4INVALID(c)
 
55
 
 
56
                REQUIRE_OUTBUF(2)
 
57
                TRYMAP_ENC(gbcommon, code, c);
 
58
                else return 1;
 
59
 
 
60
                if (code & 0x8000) /* MSB set: GBK */
 
61
                        return 1;
 
62
 
 
63
                OUT1((code >> 8) | 0x80)
 
64
                OUT2((code & 0xFF) | 0x80)
 
65
                NEXT(1, 2)
 
66
        }
 
67
 
 
68
        return 0;
 
69
}
 
70
 
 
71
DECODER(gb2312)
 
72
{
 
73
        while (inleft > 0) {
 
74
                unsigned char c = **inbuf;
 
75
 
 
76
                REQUIRE_OUTBUF(1)
 
77
 
 
78
                if (c < 0x80) {
 
79
                        OUT1(c)
 
80
                        NEXT(1, 1)
 
81
                        continue;
 
82
                }
 
83
 
 
84
                REQUIRE_INBUF(2)
 
85
                TRYMAP_DEC(gb2312, **outbuf, c ^ 0x80, IN2 ^ 0x80) {
 
86
                        NEXT(2, 1)
 
87
                }
 
88
                else return 2;
 
89
        }
 
90
 
 
91
        return 0;
 
92
}
 
93
 
 
94
 
 
95
/*
 
96
 * GBK codec
 
97
 */
 
98
 
 
99
ENCODER(gbk)
 
100
{
 
101
        while (inleft > 0) {
 
102
                Py_UNICODE c = IN1;
 
103
                DBCHAR code;
 
104
 
 
105
                if (c < 0x80) {
 
106
                        WRITE1((unsigned char)c)
 
107
                        NEXT(1, 1)
 
108
                        continue;
 
109
                }
 
110
                UCS4INVALID(c)
 
111
 
 
112
                REQUIRE_OUTBUF(2)
 
113
 
 
114
                GBK_ENCODE(c, code)
 
115
                else return 1;
 
116
 
 
117
                OUT1((code >> 8) | 0x80)
 
118
                if (code & 0x8000)
 
119
                        OUT2((code & 0xFF)) /* MSB set: GBK */
 
120
                else
 
121
                        OUT2((code & 0xFF) | 0x80) /* MSB unset: GB2312 */
 
122
                NEXT(1, 2)
 
123
        }
 
124
 
 
125
        return 0;
 
126
}
 
127
 
 
128
DECODER(gbk)
 
129
{
 
130
        while (inleft > 0) {
 
131
                unsigned char c = IN1;
 
132
 
 
133
                REQUIRE_OUTBUF(1)
 
134
 
 
135
                if (c < 0x80) {
 
136
                        OUT1(c)
 
137
                        NEXT(1, 1)
 
138
                        continue;
 
139
                }
 
140
 
 
141
                REQUIRE_INBUF(2)
 
142
 
 
143
                GBK_DECODE(c, IN2, **outbuf)
 
144
                else return 2;
 
145
 
 
146
                NEXT(2, 1)
 
147
        }
 
148
 
 
149
        return 0;
 
150
}
 
151
 
 
152
 
 
153
/*
 
154
 * GB18030 codec
 
155
 */
 
156
 
 
157
ENCODER(gb18030)
 
158
{
 
159
        while (inleft > 0) {
 
160
                ucs4_t c = IN1;
 
161
                DBCHAR code;
 
162
 
 
163
                if (c < 0x80) {
 
164
                        WRITE1(c)
 
165
                        NEXT(1, 1)
 
166
                        continue;
 
167
                }
 
168
 
 
169
                DECODE_SURROGATE(c)
 
170
                if (c > 0x10FFFF)
 
171
#if Py_UNICODE_SIZE == 2
 
172
                        return 2; /* surrogates pair */
 
173
#else
 
174
                        return 1;
 
175
#endif
 
176
                else if (c >= 0x10000) {
 
177
                        ucs4_t tc = c - 0x10000;
 
178
 
 
179
                        REQUIRE_OUTBUF(4)
 
180
 
 
181
                        OUT4((unsigned char)(tc % 10) + 0x30)
 
182
                        tc /= 10;
 
183
                        OUT3((unsigned char)(tc % 126) + 0x81)
 
184
                        tc /= 126;
 
185
                        OUT2((unsigned char)(tc % 10) + 0x30)
 
186
                        tc /= 10;
 
187
                        OUT1((unsigned char)(tc + 0x90))
 
188
 
 
189
#if Py_UNICODE_SIZE == 2
 
190
                        NEXT(2, 4) /* surrogates pair */
 
191
#else
 
192
                        NEXT(1, 4)
 
193
#endif
 
194
                        continue;
 
195
                }
 
196
 
 
197
                REQUIRE_OUTBUF(2)
 
198
 
 
199
                GBK_ENCODE(c, code)
 
200
                else TRYMAP_ENC(gb18030ext, code, c);
 
201
                else {
 
202
                        const struct _gb18030_to_unibmp_ranges *utrrange;
 
203
 
 
204
                        REQUIRE_OUTBUF(4)
 
205
 
 
206
                        for (utrrange = gb18030_to_unibmp_ranges;
 
207
                             utrrange->first != 0;
 
208
                             utrrange++)
 
209
                                if (utrrange->first <= c &&
 
210
                                    c <= utrrange->last) {
 
211
                                        Py_UNICODE tc;
 
212
 
 
213
                                        tc = c - utrrange->first +
 
214
                                             utrrange->base;
 
215
 
 
216
                                        OUT4((unsigned char)(tc % 10) + 0x30)
 
217
                                        tc /= 10;
 
218
                                        OUT3((unsigned char)(tc % 126) + 0x81)
 
219
                                        tc /= 126;
 
220
                                        OUT2((unsigned char)(tc % 10) + 0x30)
 
221
                                        tc /= 10;
 
222
                                        OUT1((unsigned char)tc + 0x81)
 
223
 
 
224
                                        NEXT(1, 4)
 
225
                                        break;
 
226
                                }
 
227
 
 
228
                        if (utrrange->first == 0)
 
229
                                return 1;
 
230
                        continue;
 
231
                }
 
232
 
 
233
                OUT1((code >> 8) | 0x80)
 
234
                if (code & 0x8000)
 
235
                        OUT2((code & 0xFF)) /* MSB set: GBK or GB18030ext */
 
236
                else
 
237
                        OUT2((code & 0xFF) | 0x80) /* MSB unset: GB2312 */
 
238
 
 
239
                NEXT(1, 2)
 
240
        }
 
241
 
 
242
        return 0;
 
243
}
 
244
 
 
245
DECODER(gb18030)
 
246
{
 
247
        while (inleft > 0) {
 
248
                unsigned char c = IN1, c2;
 
249
 
 
250
                REQUIRE_OUTBUF(1)
 
251
 
 
252
                if (c < 0x80) {
 
253
                        OUT1(c)
 
254
                        NEXT(1, 1)
 
255
                        continue;
 
256
                }
 
257
 
 
258
                REQUIRE_INBUF(2)
 
259
 
 
260
                c2 = IN2;
 
261
                if (c2 >= 0x30 && c2 <= 0x39) { /* 4 bytes seq */
 
262
                        const struct _gb18030_to_unibmp_ranges *utr;
 
263
                        unsigned char c3, c4;
 
264
                        ucs4_t lseq;
 
265
 
 
266
                        REQUIRE_INBUF(4)
 
267
                        c3 = IN3;
 
268
                        c4 = IN4;
 
269
                        if (c < 0x81 || c3 < 0x81 || c4 < 0x30 || c4 > 0x39)
 
270
                                return 4;
 
271
                        c -= 0x81;  c2 -= 0x30;
 
272
                        c3 -= 0x81; c4 -= 0x30;
 
273
 
 
274
                        if (c < 4) { /* U+0080 - U+FFFF */
 
275
                                lseq = ((ucs4_t)c * 10 + c2) * 1260 +
 
276
                                        (ucs4_t)c3 * 10 + c4;
 
277
                                if (lseq < 39420) {
 
278
                                        for (utr = gb18030_to_unibmp_ranges;
 
279
                                             lseq >= (utr + 1)->base;
 
280
                                             utr++) ;
 
281
                                        OUT1(utr->first - utr->base + lseq)
 
282
                                        NEXT(4, 1)
 
283
                                        continue;
 
284
                                }
 
285
                        }
 
286
                        else if (c >= 15) { /* U+10000 - U+10FFFF */
 
287
                                lseq = 0x10000 + (((ucs4_t)c-15) * 10 + c2)
 
288
                                        * 1260 + (ucs4_t)c3 * 10 + c4;
 
289
                                if (lseq <= 0x10FFFF) {
 
290
                                        WRITEUCS4(lseq);
 
291
                                        NEXT_IN(4)
 
292
                                        continue;
 
293
                                }
 
294
                        }
 
295
                        return 4;
 
296
                }
 
297
 
 
298
                GBK_DECODE(c, c2, **outbuf)
 
299
                else TRYMAP_DEC(gb18030ext, **outbuf, c, c2);
 
300
                else return 2;
 
301
 
 
302
                NEXT(2, 1)
 
303
        }
 
304
 
 
305
        return 0;
 
306
}
 
307
 
 
308
 
 
309
/*
 
310
 * HZ codec
 
311
 */
 
312
 
 
313
ENCODER_INIT(hz)
 
314
{
 
315
        state->i = 0;
 
316
        return 0;
 
317
}
 
318
 
 
319
ENCODER_RESET(hz)
 
320
{
 
321
        if (state->i != 0) {
 
322
                WRITE2('~', '}')
 
323
                state->i = 0;
 
324
                NEXT_OUT(2)
 
325
        }
 
326
        return 0;
 
327
}
 
328
 
 
329
ENCODER(hz)
 
330
{
 
331
        while (inleft > 0) {
 
332
                Py_UNICODE c = IN1;
 
333
                DBCHAR code;
 
334
 
 
335
                if (c < 0x80) {
 
336
                        if (state->i == 0) {
 
337
                                WRITE1((unsigned char)c)
 
338
                                NEXT(1, 1)
 
339
                        }
 
340
                        else {
 
341
                                WRITE3('~', '}', (unsigned char)c)
 
342
                                NEXT(1, 3)
 
343
                                state->i = 0;
 
344
                        }
 
345
                        continue;
 
346
                }
 
347
 
 
348
                UCS4INVALID(c)
 
349
 
 
350
                TRYMAP_ENC(gbcommon, code, c);
 
351
                else return 1;
 
352
 
 
353
                if (code & 0x8000) /* MSB set: GBK */
 
354
                        return 1;
 
355
 
 
356
                if (state->i == 0) {
 
357
                        WRITE4('~', '{', code >> 8, code & 0xff)
 
358
                        NEXT(1, 4)
 
359
                        state->i = 1;
 
360
                }
 
361
                else {
 
362
                        WRITE2(code >> 8, code & 0xff)
 
363
                        NEXT(1, 2)
 
364
                }
 
365
        }
 
366
 
 
367
        return 0;
 
368
}
 
369
 
 
370
DECODER_INIT(hz)
 
371
{
 
372
        state->i = 0;
 
373
        return 0;
 
374
}
 
375
 
 
376
DECODER_RESET(hz)
 
377
{
 
378
        state->i = 0;
 
379
        return 0;
 
380
}
 
381
 
 
382
DECODER(hz)
 
383
{
 
384
        while (inleft > 0) {
 
385
                unsigned char c = IN1;
 
386
 
 
387
                if (c == '~') {
 
388
                        unsigned char c2 = IN2;
 
389
 
 
390
                        REQUIRE_INBUF(2)
 
391
                        if (c2 == '~') {
 
392
                                WRITE1('~')
 
393
                                NEXT(2, 1)
 
394
                                continue;
 
395
                        }
 
396
                        else if (c2 == '{' && state->i == 0)
 
397
                                state->i = 1; /* set GB */
 
398
                        else if (c2 == '}' && state->i == 1)
 
399
                                state->i = 0; /* set ASCII */
 
400
                        else if (c2 == '\n')
 
401
                                ; /* line-continuation */
 
402
                        else
 
403
                                return 2;
 
404
                        NEXT(2, 0);
 
405
                        continue;
 
406
                }
 
407
 
 
408
                if (c & 0x80)
 
409
                        return 1;
 
410
 
 
411
                if (state->i == 0) { /* ASCII mode */
 
412
                        WRITE1(c)
 
413
                        NEXT(1, 1)
 
414
                }
 
415
                else { /* GB mode */
 
416
                        REQUIRE_INBUF(2)
 
417
                        REQUIRE_OUTBUF(1)
 
418
                        TRYMAP_DEC(gb2312, **outbuf, c, IN2) {
 
419
                                NEXT(2, 1)
 
420
                        }
 
421
                        else
 
422
                                return 2;
 
423
                }
 
424
        }
 
425
 
 
426
        return 0;
 
427
}
 
428
 
 
429
 
 
430
BEGIN_MAPPINGS_LIST
 
431
  MAPPING_DECONLY(gb2312)
 
432
  MAPPING_DECONLY(gbkext)
 
433
  MAPPING_ENCONLY(gbcommon)
 
434
  MAPPING_ENCDEC(gb18030ext)
 
435
END_MAPPINGS_LIST
 
436
 
 
437
BEGIN_CODECS_LIST
 
438
  CODEC_STATELESS(gb2312)
 
439
  CODEC_STATELESS(gbk)
 
440
  CODEC_STATELESS(gb18030)
 
441
  CODEC_STATEFUL(hz)
 
442
END_CODECS_LIST
 
443
 
 
444
/* Module boilerplate for the "cn" codec collection.
 * NOTE(review): the macro is defined outside this file (presumably in
 * cjkcodecs.h) -- confirm what it expands to. */
I_AM_A_MODULE_FOR(cn)