~ubuntu-branches/ubuntu/trusty/musl/trusty-proposed

« back to all changes in this revision

Viewing changes to src/locale/iconv.c

  • Committer: Package Import Robot
  • Author(s): Kevin Bortis
  • Date: 2013-09-20 20:54:14 UTC
  • Revision ID: package-import@ubuntu.com-20130920205414-5b61trtmma18w58o
Tags: upstream-0.9.13
Import upstream version 0.9.13

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
#include <iconv.h>
 
2
#include <errno.h>
 
3
#include <wchar.h>
 
4
#include <string.h>
 
5
#include <stdlib.h>
 
6
#include <limits.h>
 
7
#include <stdint.h>
 
8
 
 
9
/* Charmap type codes. Each value is the octal byte stored after a
 * charmap's alias list in charmaps[] (e.g. "\310" for UTF-8). The low
 * bits of the UTF-16/UTF-32/UCS-2 codes double as the byte-order
 * selector passed to get_16/put_16/get_32/put_32. */
enum {
	UTF_32BE  = 0300,
	UTF_16LE  = 0301,
	UTF_16BE  = 0302,
	UTF_32LE  = 0303,
	UCS2BE    = 0304,
	UCS2LE    = 0305,
	WCHAR_T   = 0306,
	US_ASCII  = 0307,
	UTF_8     = 0310,
	EUC_JP    = 0320,
	SHIFT_JIS = 0321,
	GB18030   = 0330,
	GBK       = 0331,
	GB2312    = 0332,
	BIG5      = 0340,
	EUC_KR    = 0350
};
 
25
 
 
26
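/* FIXME: encoding *to* the following multibyte charsets is not
 * implemented yet (iconv_open() rejects them as conversion targets);
 * only decoding from them is supported.
 * Byte ranges (lead byte, then trail byte):
 * EUC:   A1-FE A1-FE
 * GBK:   81-FE 40-7E,80-FE
 * Big5:  A1-FE 40-7E,A1-FE
 */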
 
31
 
 
32
/* Definitions of charmaps. Each charmap consists of:
 
33
 * 1. Empty-string-terminated list of null-terminated aliases.
 
34
 * 2. Special type code or number of elided entries.
 
35
 * 3. Character table (size determined by field 2). */
 
36
 
 
37
static const unsigned char charmaps[] =
 
38
"utf8\0\0\310"
 
39
"wchart\0\0\306"
 
40
"ucs2\0ucs2be\0\0\304"
 
41
"ucs2le\0\0\305"
 
42
"utf16\0utf16be\0\0\302"
 
43
"utf16le\0\0\301"
 
44
"ucs4\0ucs4be\0utf32\0utf32be\0\0\300"
 
45
"ucs4le\0utf32le\0\0\303"
 
46
"ascii\0usascii\0iso646\0iso646us\0\0\307"
 
47
"eucjp\0\0\320"
 
48
"shiftjis\0sjis\0\0\321"
 
49
"gb18030\0\0\330"
 
50
"gbk\0\0\331"
 
51
"gb2312\0\0\332"
 
52
"big5\0bigfive\0cp950\0big5hkscs\0\0\340"
 
53
"euckr\0ksc5601\0ksx1001\0cp949\0\0\350"
 
54
#include "codepages.h"
 
55
;
 
56
 
 
57
static const unsigned short legacy_chars[] = {
 
58
#include "legacychars.h"
 
59
};
 
60
 
 
61
static const unsigned short jis0208[84][94] = {
 
62
#include "jis0208.h"
 
63
};
 
64
 
 
65
static const unsigned short gb18030[126][190] = {
 
66
#include "gb18030.h"
 
67
};
 
68
 
 
69
static const unsigned short big5[89][157] = {
 
70
#include "big5.h"
 
71
};
 
72
 
 
73
static const unsigned short hkscs[] = {
 
74
#include "hkscs.h"
 
75
};
 
76
 
 
77
static const unsigned short ksc[93][94] = {
 
78
#include "ksc.h"
 
79
};
 
80
 
 
81
/* Loosely compare a user-supplied charset name with a canonical alias.
 *
 * a: caller's name; ASCII letters are compared case-insensitively and
 *    any byte that is neither an ASCII letter nor an ASCII digit is
 *    skipped when it occurs before a character being compared.
 * b: canonical alias, expected to consist of lowercase letters and
 *    digits only.
 *
 * Returns 0 when the names match, nonzero otherwise. */
static int fuzzycmp(const unsigned char *a, const unsigned char *b)
{
	for (; *a && *b; a++, b++) {
		/* Skip non-alphanumerics. The unsigned subtractions wrap for
		 * bytes below 'a'/'0', so one comparison tests each range.
		 * The bounds must be >= 26 and >= 10: with a strict '>' the
		 * bytes just past 'z' and '9' ('{', '[' after case folding,
		 * and ':') were wrongly treated as significant. */
		while (*a && (*a|32U)-'a' >= 26 && *a-'0' >= 10U) a++;
		if ((*a|32U) != *b) return 1;
	}
	return *a != *b;
}
 
89
 
 
90
static size_t find_charmap(const void *name)
 
91
{
 
92
        const unsigned char *s;
 
93
        for (s=charmaps; *s; ) {
 
94
                if (!fuzzycmp(name, s)) {
 
95
                        for (; *s; s+=strlen((void *)s)+1);
 
96
                        return s+1-charmaps;
 
97
                }
 
98
                s += strlen((void *)s)+1;
 
99
                if (!*s) {
 
100
                        if (s[1] > 0200) s+=2;
 
101
                        else s+=2+(128U-s[1])/4*5;
 
102
                }
 
103
        }
 
104
        return -1;
 
105
}
 
106
 
 
107
iconv_t iconv_open(const char *to, const char *from)
 
108
{
 
109
        size_t f, t;
 
110
 
 
111
        if ((t = find_charmap(to))==-1
 
112
         || (f = find_charmap(from))==-1
 
113
         || (charmaps[t] >= 0320)) {
 
114
                errno = EINVAL;
 
115
                return (iconv_t)-1;
 
116
        }
 
117
 
 
118
        return (void *)(f<<16 | t);
 
119
}
 
120
 
 
121
/* Release a conversion descriptor. Descriptors carry no allocated
 * state, so there is nothing to free; always returns 0. */
int iconv_close(iconv_t cd)
{
	(void)cd;
	return 0;
}
 
125
 
 
126
/* Read a 16-bit value from s. Bit 0 of e selects the byte order:
 * 0 = big-endian, 1 = little-endian. */
static unsigned get_16(const unsigned char *s, int e)
{
	const int hi = e & 1;
	const int lo = 1 - hi;

	return s[hi]<<8 | s[lo];
}
 
131
 
 
132
/* Store the low 16 bits of c at s. Bit 0 of e selects the byte order:
 * 0 = big-endian, 1 = little-endian. */
static void put_16(unsigned char *s, unsigned c, int e)
{
	const int hi = e & 1;

	s[hi]   = (unsigned char)(c >> 8);
	s[hi^1] = (unsigned char)c;
}
 
138
 
 
139
/* Read a 32-bit value from s. The low two bits of e are XORed into
 * each byte index: 0 reads big-endian, 3 reads little-endian. */
static unsigned get_32(const unsigned char *s, int e)
{
	unsigned v = 0;

	e &= 3;
	for (int i = 0; i < 4; i++)
		v = v<<8 | s[e^i];
	return v;
}
 
144
 
 
145
/* Store the 32-bit value c at s. The low two bits of e are XORed into
 * each byte index: 0 writes big-endian, 3 writes little-endian. */
static void put_32(unsigned char *s, unsigned c, int e)
{
	e &= 3;
	for (int i = 0; i < 4; i++)
		s[e^i] = (unsigned char)(c >> (24 - 8*i));
}
 
153
 
 
154
/* UTF-8 codec hooks used by iconv() for the UTF_8 charmap. They are
 * routed to the C library's multibyte functions; adapt as needed on
 * systems where those do not operate on UTF-8. */
#define mbrtowc_utf8(pwc, src, n, ps) mbrtowc((pwc), (src), (n), (ps))
#define wctomb_utf8(dst, ch) wctomb((dst), (ch))
 
157
 
 
158
size_t iconv(iconv_t cd0, char **restrict in, size_t *restrict inb, char **restrict out, size_t *restrict outb)
 
159
{
 
160
        size_t x=0;
 
161
        unsigned long cd = (unsigned long)cd0;
 
162
        unsigned to = cd & 0xffff;
 
163
        unsigned from = cd >> 16;
 
164
        const unsigned char *map = charmaps+from+1;
 
165
        const unsigned char *tomap = charmaps+to+1;
 
166
        mbstate_t st = {0};
 
167
        wchar_t wc;
 
168
        unsigned c, d;
 
169
        size_t k, l;
 
170
        int err;
 
171
        unsigned char type = map[-1];
 
172
        unsigned char totype = tomap[-1];
 
173
 
 
174
        if (!in || !*in || !*inb) return 0;
 
175
 
 
176
        for (; *inb; *in+=l, *inb-=l) {
 
177
                c = *(unsigned char *)*in;
 
178
                l = 1;
 
179
 
 
180
                if (c >= 128 || type-UTF_32BE < 7U) switch (type) {
 
181
                case UTF_8:
 
182
                        l = mbrtowc_utf8(&wc, *in, *inb, &st);
 
183
                        if (!l) l++;
 
184
                        else if (l == (size_t)-1) goto ilseq;
 
185
                        else if (l == (size_t)-2) goto starved;
 
186
                        c = wc;
 
187
                        break;
 
188
                case US_ASCII:
 
189
                        goto ilseq;
 
190
                case WCHAR_T:
 
191
                        l = sizeof(wchar_t);
 
192
                        if (*inb < l) goto starved;
 
193
                        c = *(wchar_t *)*in;
 
194
                        if (0) {
 
195
                case UTF_32BE:
 
196
                case UTF_32LE:
 
197
                        l = 4;
 
198
                        if (*inb < 4) goto starved;
 
199
                        c = get_32((void *)*in, type);
 
200
                        }
 
201
                        if (c-0xd800u < 0x800u || c >= 0x110000u) goto ilseq;
 
202
                        break;
 
203
                case UCS2BE:
 
204
                case UCS2LE:
 
205
                case UTF_16BE:
 
206
                case UTF_16LE:
 
207
                        l = 2;
 
208
                        if (*inb < 2) goto starved;
 
209
                        c = get_16((void *)*in, type);
 
210
                        if ((unsigned)(c-0xdc00) < 0x400) goto ilseq;
 
211
                        if ((unsigned)(c-0xd800) < 0x400) {
 
212
                                if (type-UCS2BE < 2U) goto ilseq;
 
213
                                l = 4;
 
214
                                if (*inb < 4) goto starved;
 
215
                                d = get_16((void *)(*in + 2), type);
 
216
                                if ((unsigned)(d-0xdc00) >= 0x400) goto ilseq;
 
217
                                c = ((c-0xd7c0)<<10) + (d-0xdc00);
 
218
                        }
 
219
                        break;
 
220
                case SHIFT_JIS:
 
221
                        if (c-0xa1 <= 0xdf-0xa1) {
 
222
                                c += 0xff61-0xa1;
 
223
                                break;
 
224
                        }
 
225
                        l = 2;
 
226
                        if (*inb < 2) goto starved;
 
227
                        d = *((unsigned char *)*in + 1);
 
228
                        if (c-129 <= 159-129) c -= 129;
 
229
                        else if (c-224 <= 239-224) c -= 193;
 
230
                        else goto ilseq;
 
231
                        c *= 2;
 
232
                        if (d-64 <= 158-64) {
 
233
                                if (d==127) goto ilseq;
 
234
                                if (d>127) d--;
 
235
                                d -= 64;
 
236
                        } else if (d-159 <= 252-159) {
 
237
                                c++;
 
238
                                d -= 159;
 
239
                        }
 
240
                        c = jis0208[c][d];
 
241
                        if (!c) goto ilseq;
 
242
                        break;
 
243
                case EUC_JP:
 
244
                        l = 2;
 
245
                        if (*inb < 2) goto starved;
 
246
                        d = *((unsigned char *)*in + 1);
 
247
                        if (c==0x8e) {
 
248
                                c = d;
 
249
                                if (c-0xa1 > 0xdf-0xa1) goto ilseq;
 
250
                                c += 0xff61 - 0xa1;
 
251
                                break;
 
252
                        }
 
253
                        c -= 0xa1;
 
254
                        d -= 0xa1;
 
255
                        if (c >= 84 || d >= 94) goto ilseq;
 
256
                        c = jis0208[c][d];
 
257
                        if (!c) goto ilseq;
 
258
                        break;
 
259
                case GB2312:
 
260
                        if (c < 0xa1) goto ilseq;
 
261
                case GBK:
 
262
                case GB18030:
 
263
                        c -= 0x81;
 
264
                        if (c >= 126) goto ilseq;
 
265
                        l = 2;
 
266
                        if (*inb < 2) goto starved;
 
267
                        d = *((unsigned char *)*in + 1);
 
268
                        if (d < 0xa1 && type == GB2312) goto ilseq;
 
269
                        if (d-0x40>=191 || d==127) {
 
270
                                if (d-'0'>9 || type != GB18030)
 
271
                                        goto ilseq;
 
272
                                l = 4;
 
273
                                if (*inb < 4) goto starved;
 
274
                                c = (10*c + d-'0') * 1260;
 
275
                                d = *((unsigned char *)*in + 2);
 
276
                                if (d-0x81>126) goto ilseq;
 
277
                                c += 10*(d-0x81);
 
278
                                d = *((unsigned char *)*in + 3);
 
279
                                if (d-'0'>9) goto ilseq;
 
280
                                c += d-'0';
 
281
                                c += 128;
 
282
                                for (d=0; d<=c; ) {
 
283
                                        k = 0;
 
284
                                        for (int i=0; i<126; i++)
 
285
                                                for (int j=0; j<190; j++)
 
286
                                                        if (gb18030[i][j]-d <= c-d)
 
287
                                                                k++;
 
288
                                        d = c+1;
 
289
                                        c += k;
 
290
                                }
 
291
                                break;
 
292
                        }
 
293
                        d -= 0x40;
 
294
                        if (d>63) d--;
 
295
                        c = gb18030[c][d];
 
296
                        break;
 
297
                case BIG5:
 
298
                        l = 2;
 
299
                        if (*inb < 2) goto starved;
 
300
                        d = *((unsigned char *)*in + 1);
 
301
                        if (d-0x40>=0xff-0x40 || d-0x7f<0xa1-0x7f) goto ilseq;
 
302
                        d -= 0x40;
 
303
                        if (d > 0x3e) d -= 0x22;
 
304
                        if (c-0xa1>=0xfa-0xa1) {
 
305
                                if (c-0x87>=0xff-0x87) goto ilseq;
 
306
                                if (c < 0xa1) c -= 0x87;
 
307
                                else c -= 0x87 + (0xfa-0xa1);
 
308
                                c = (hkscs[4867+(c*157+d)/16]>>(c*157+d)%16)%2<<17
 
309
                                        | hkscs[c*157+d];
 
310
                                /* A few HKSCS characters map to pairs of UCS
 
311
                                 * characters. These are mapped to surrogate
 
312
                                 * range in the hkscs table then hard-coded
 
313
                                 * here. Ugly, yes. */
 
314
                                if (c/256 == 0xdc) {
 
315
                                        if (totype-0300U > 8) k = 2;
 
316
                                        else k = "\10\4\4\10\4\4\10\2\4"[totype-0300];
 
317
                                        if (k > *outb) goto toobig;
 
318
                                        x += iconv((iconv_t)(uintptr_t)to,
 
319
                                                &(char *){"\303\212\314\204"
 
320
                                                "\303\212\314\214"
 
321
                                                "\303\252\314\204"
 
322
                                                "\303\252\314\214"
 
323
                                                +c%256}, &(size_t){4},
 
324
                                                out, outb);
 
325
                                        continue;
 
326
                                }
 
327
                                if (!c) goto ilseq;
 
328
                                break;
 
329
                        }
 
330
                        c -= 0xa1;
 
331
                        c = big5[c][d]|(c==0x27&&(d==0x3a||d==0x3c||d==0x42))<<17;
 
332
                        if (!c) goto ilseq;
 
333
                        break;
 
334
                case EUC_KR:
 
335
                        l = 2;
 
336
                        if (*inb < 2) goto starved;
 
337
                        d = *((unsigned char *)*in + 1);
 
338
                        c -= 0xa1;
 
339
                        d -= 0xa1;
 
340
                        if (c >= 93 || d >= 94) {
 
341
                                c += (0xa1-0x81);
 
342
                                d += 0xa1;
 
343
                                if (c >= 93 || c>=0xc6-0x81 && d>0x52)
 
344
                                        goto ilseq;
 
345
                                if (d-'A'<26) d = d-'A';
 
346
                                else if (d-'a'<26) d = d-'a'+26;
 
347
                                else if (d-0x81<0xff-0x81) d = d-0x81+52;
 
348
                                else goto ilseq;
 
349
                                if (c < 0x20) c = 178*c + d;
 
350
                                else c = 178*0x20 + 84*(c-0x20) + d;
 
351
                                c += 0xac00;
 
352
                                for (d=0xac00; d<=c; ) {
 
353
                                        k = 0;
 
354
                                        for (int i=0; i<93; i++)
 
355
                                                for (int j=0; j<94; j++)
 
356
                                                        if (ksc[i][j]-d <= c-d)
 
357
                                                                k++;
 
358
                                        d = c+1;
 
359
                                        c += k;
 
360
                                }
 
361
                                break;
 
362
                        }
 
363
                        c = ksc[c][d];
 
364
                        if (!c) goto ilseq;
 
365
                        break;
 
366
                default:
 
367
                        if (c < 128+type) break;
 
368
                        c -= 128+type;
 
369
                        c = legacy_chars[ map[c*5/4]>>2*c%8 |
 
370
                                map[c*5/4+1]<<8-2*c%8 & 1023 ];
 
371
                        if (!c) c = *(unsigned char *)*in;
 
372
                        if (c==1) goto ilseq;
 
373
                }
 
374
 
 
375
                switch (totype) {
 
376
                case WCHAR_T:
 
377
                        if (*outb < sizeof(wchar_t)) goto toobig;
 
378
                        *(wchar_t *)*out = c;
 
379
                        *out += sizeof(wchar_t);
 
380
                        *outb -= sizeof(wchar_t);
 
381
                        break;
 
382
                case UTF_8:
 
383
                        if (*outb < 4) {
 
384
                                char tmp[4];
 
385
                                k = wctomb_utf8(tmp, c);
 
386
                                if (*outb < k) goto toobig;
 
387
                                memcpy(*out, tmp, k);
 
388
                        } else k = wctomb_utf8(*out, c);
 
389
                        *out += k;
 
390
                        *outb -= k;
 
391
                        break;
 
392
                case US_ASCII:
 
393
                        if (c > 0x7f) subst: x++, c='*';
 
394
                default:
 
395
                        if (*outb < 1) goto toobig;
 
396
                        if (c < 128+totype) {
 
397
                        revout:
 
398
                                *(*out)++ = c;
 
399
                                *outb -= 1;
 
400
                                break;
 
401
                        }
 
402
                        d = c;
 
403
                        for (c=0; c<128-totype; c++) {
 
404
                                if (d == legacy_chars[ tomap[c*5/4]>>2*c%8 |
 
405
                                        tomap[c*5/4+1]<<8-2*c%8 & 1023 ]) {
 
406
                                        c += 128;
 
407
                                        goto revout;
 
408
                                }
 
409
                        }
 
410
                        goto subst;
 
411
                case UCS2BE:
 
412
                case UCS2LE:
 
413
                case UTF_16BE:
 
414
                case UTF_16LE:
 
415
                        if (c < 0x10000 || type-UCS2BE < 2U) {
 
416
                                if (c >= 0x10000) c = 0xFFFD;
 
417
                                if (*outb < 2) goto toobig;
 
418
                                put_16((void *)*out, c, totype);
 
419
                                *out += 2;
 
420
                                *outb -= 2;
 
421
                                break;
 
422
                        }
 
423
                        if (*outb < 4) goto toobig;
 
424
                        c -= 0x10000;
 
425
                        put_16((void *)*out, (c>>10)|0xd800, totype);
 
426
                        put_16((void *)(*out + 2), (c&0x3ff)|0xdc00, totype);
 
427
                        *out += 4;
 
428
                        *outb -= 4;
 
429
                        break;
 
430
                case UTF_32BE:
 
431
                case UTF_32LE:
 
432
                        if (*outb < 4) goto toobig;
 
433
                        put_32((void *)*out, c, totype);
 
434
                        *out += 4;
 
435
                        *outb -= 4;
 
436
                        break;
 
437
                }
 
438
        }
 
439
        return x;
 
440
ilseq:
 
441
        err = EILSEQ;
 
442
        x = -1;
 
443
        goto end;
 
444
toobig:
 
445
        err = E2BIG;
 
446
        x = -1;
 
447
        goto end;
 
448
starved:
 
449
        err = EINVAL;
 
450
        x = -1;
 
451
end:
 
452
        errno = err;
 
453
        return x;
 
454
}