2
* _codecs_cn.c: Codecs collection for Mainland Chinese encodings
4
* Written by Hye-Shik Chang <perky@FreeBSD.org>
8
#include "mappings_cn.h"
11
* hz is predefined as 100 on AIX. So we undefine it to avoid
12
* a conflict with the hz codec.
18
/* GBK and GB2312 map a few code points differently; they are listed below:
21
* A1A4 U+30FB KATAKANA MIDDLE DOT U+00B7 MIDDLE DOT
22
* A1AA U+2015 HORIZONTAL BAR U+2014 EM DASH
23
* A844 undefined U+2015 HORIZONTAL BAR
26
/*
 * GBK_DECODE(dc1, dc2, assi): decode the two-byte sequence dc1,dc2 as GBK
 * and store the resulting code point in assi.
 *
 * The three byte pairs that GBK maps differently from GB2312 (A1AA, A844
 * and A1A4) are assigned directly.  Any other pair is tried against the
 * gb2312 table, with bit 0x80 of both bytes flipped, and then against the
 * gbkext table with the bytes unchanged.
 *
 * The expansion is an if / else-if chain with no final `else` and is
 * deliberately not wrapped in do { } while (0): call sites continue the
 * chain with their own `else` branch.
 * NOTE(review): this relies on TRYMAP_DEC expanding to an `if (...)`
 * header -- confirm against its definition.
 *
 * dc1 and dc2 are evaluated more than once; pass expressions without side
 * effects.
 */
#define GBK_DECODE(dc1, dc2, assi)                                      \
    if ((dc1) == 0xa1 && (dc2) == 0xaa) (assi) = 0x2014;                \
    else if ((dc1) == 0xa8 && (dc2) == 0x44) (assi) = 0x2015;           \
    else if ((dc1) == 0xa1 && (dc2) == 0xa4) (assi) = 0x00b7;           \
    else TRYMAP_DEC(gb2312, assi, (dc1) ^ 0x80, (dc2) ^ 0x80);          \
    else TRYMAP_DEC(gbkext, assi, (dc1), (dc2));
33
/*
 * GBK_ENCODE(code, assi): encode the code point `code` as GBK and store
 * the two-byte value in assi.
 *
 * U+2014, U+2015 and U+00B7 are assigned their GBK values directly.
 * U+30FB is excluded from the table lookup; every other code point is
 * handed to TRYMAP_ENC_COND on the gbcommon table.
 *
 * The expansion is an if / else-if chain whose last branch has an empty
 * body and no `else`, so a call site can continue the chain with its own
 * `else` branch.  `code` is evaluated more than once; pass an expression
 * without side effects.
 */
#define GBK_ENCODE(code, assi)                                          \
    if ((code) == 0x2014)                                               \
        (assi) = 0xa1aa;                                                \
    else if ((code) == 0x2015)                                          \
        (assi) = 0xa844;                                                \
    else if ((code) == 0x00b7)                                          \
        (assi) = 0xa1a4;                                                \
    else if ((code) != 0x30fb && TRYMAP_ENC_COND(gbcommon, assi, code));
50
WRITE1((unsigned char)c)
57
TRYMAP_ENC(gbcommon, code, c);
60
if (code & 0x8000) /* MSB set: GBK */
63
OUT1((code >> 8) | 0x80)
64
OUT2((code & 0xFF) | 0x80)
74
unsigned char c = **inbuf;
85
TRYMAP_DEC(gb2312, **outbuf, c ^ 0x80, IN2 ^ 0x80) {
106
WRITE1((unsigned char)c)
117
OUT1((code >> 8) | 0x80)
119
OUT2((code & 0xFF)) /* MSB set: GBK */
121
OUT2((code & 0xFF) | 0x80) /* MSB unset: GB2312 */
131
unsigned char c = IN1;
143
GBK_DECODE(c, IN2, **outbuf)
171
#if Py_UNICODE_SIZE == 2
172
return 2; /* surrogates pair */
176
else if (c >= 0x10000) {
177
ucs4_t tc = c - 0x10000;
181
OUT4((unsigned char)(tc % 10) + 0x30)
183
OUT3((unsigned char)(tc % 126) + 0x81)
185
OUT2((unsigned char)(tc % 10) + 0x30)
187
OUT1((unsigned char)(tc + 0x90))
189
#if Py_UNICODE_SIZE == 2
190
NEXT(2, 4) /* surrogates pair */
200
else TRYMAP_ENC(gb18030ext, code, c);
202
const struct _gb18030_to_unibmp_ranges *utrrange;
206
for (utrrange = gb18030_to_unibmp_ranges;
207
utrrange->first != 0;
209
if (utrrange->first <= c &&
210
c <= utrrange->last) {
213
tc = c - utrrange->first +
216
OUT4((unsigned char)(tc % 10) + 0x30)
218
OUT3((unsigned char)(tc % 126) + 0x81)
220
OUT2((unsigned char)(tc % 10) + 0x30)
222
OUT1((unsigned char)tc + 0x81)
228
if (utrrange->first == 0)
233
OUT1((code >> 8) | 0x80)
235
OUT2((code & 0xFF)) /* MSB set: GBK or GB18030ext */
237
OUT2((code & 0xFF) | 0x80) /* MSB unset: GB2312 */
248
unsigned char c = IN1, c2;
261
if (c2 >= 0x30 && c2 <= 0x39) { /* 4 bytes seq */
262
const struct _gb18030_to_unibmp_ranges *utr;
263
unsigned char c3, c4;
269
if (c < 0x81 || c3 < 0x81 || c4 < 0x30 || c4 > 0x39)
271
c -= 0x81; c2 -= 0x30;
272
c3 -= 0x81; c4 -= 0x30;
274
if (c < 4) { /* U+0080 - U+FFFF */
275
lseq = ((ucs4_t)c * 10 + c2) * 1260 +
276
(ucs4_t)c3 * 10 + c4;
278
for (utr = gb18030_to_unibmp_ranges;
279
lseq >= (utr + 1)->base;
281
OUT1(utr->first - utr->base + lseq)
286
else if (c >= 15) { /* U+10000 - U+10FFFF */
287
lseq = 0x10000 + (((ucs4_t)c-15) * 10 + c2)
288
* 1260 + (ucs4_t)c3 * 10 + c4;
289
if (lseq <= 0x10FFFF) {
298
GBK_DECODE(c, c2, **outbuf)
299
else TRYMAP_DEC(gb18030ext, **outbuf, c, c2);
337
WRITE1((unsigned char)c)
341
WRITE3('~', '}', (unsigned char)c)
350
TRYMAP_ENC(gbcommon, code, c);
353
if (code & 0x8000) /* MSB set: GBK */
357
WRITE4('~', '{', code >> 8, code & 0xff)
362
WRITE2(code >> 8, code & 0xff)
385
unsigned char c = IN1;
388
unsigned char c2 = IN2;
396
else if (c2 == '{' && state->i == 0)
397
state->i = 1; /* set GB */
398
else if (c2 == '}' && state->i == 1)
399
state->i = 0; /* set ASCII */
401
; /* line-continuation */
411
if (state->i == 0) { /* ASCII mode */
418
TRYMAP_DEC(gb2312, **outbuf, c, IN2) {
431
MAPPING_DECONLY(gb2312)
432
MAPPING_DECONLY(gbkext)
433
MAPPING_ENCONLY(gbcommon)
434
MAPPING_ENCDEC(gb18030ext)
438
CODEC_STATELESS(gb2312)
440
CODEC_STATELESS(gb18030)
444
I_AM_A_MODULE_FOR(cn)