2
* Copyright (C) 2001 Edmund Grimley Evans <edmundo@rano.org>
4
* This program is free software; you can redistribute it and/or modify
5
* it under the terms of the GNU General Public License as published by
6
* the Free Software Foundation; either version 2 of the License, or
7
* (at your option) any later version.
9
* This program is distributed in the hope that it will be useful,
10
* but WITHOUT ANY WARRANTY; without even the implied warranty of
11
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12
* GNU General Public License for more details.
14
* You should have received a copy of the GNU General Public License
15
* along with this program; if not, write to the Free Software
16
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20
* See the corresponding header file for a description of the functions
21
* that this file provides.
23
* This was first written for Ogg Vorbis but could be of general use.
25
* The only deliberate assumption about data sizes is that a short has
26
* at least 16 bits, but this code has only been tested on systems with
27
* 8-bit char, 16-bit short and 32-bit int.
34
#ifndef HAVE_ICONV /* should be ifdef USE_CHARSET_CONVERT */
43
* This is like the standard strcasecmp, but it does not depend
44
* on the locale. Locale-dependent functions can be dangerous:
45
* we once had a bug involving strcasecmp("iso", "ISO") in a
48
* (I'm not really sure what the official standard says
49
* about the sign of strcasecmp("Z", "["), but usually
50
* we're only interested in whether it's zero.)
53
/*
 * Case-insensitive comparison of s1 and s2 that uses ASCII rules only
 * and never consults the locale.
 * NOTE(review): only part of the body is visible in this excerpt.
 */
static int ascii_strcasecmp(const char *s1, const char *s2)
63
/* Only the range 'a'..'z' is treated as cased; the folding statement itself is not visible here. */
if ('a' <= c1 && c1 <= 'z')
66
/* Same ASCII-only range test for the second character. */
if ('a' <= c2 && c2 <= 'z')
71
/* Bytes are read as unsigned char so the sign of the result does not depend on whether plain char is signed. */
return (unsigned char)*s1 - (unsigned char)*s2;
75
* UTF-8 equivalents of the C library's wctomb() and mbtowc().
78
/*
 * Decode one UTF-8 sequence from at most n bytes at s and store the
 * resulting code point through pwc.
 * NOTE(review): the return values and the single-byte path are not
 * visible in this excerpt.
 */
int utf8_mbtowc(int *pwc, const char *s, size_t n)
95
/* Two-byte form: a second byte must exist and be a continuation byte (10xxxxxx). */
if (n >= 2 && (s[1] & 0xc0) == 0x80) {
97
/* Five payload bits come from the lead byte, six from the continuation byte. */
*pwc = ((c & 0x1f) << 6) | (s[1] & 0x3f);
116
/* General k-byte form: the lead byte contributes its low 7 - k bits. */
wc = *s++ & ((1 << (7 - k)) - 1);
117
for (i = 1; i < k; i++) {
118
/* Every trailing byte has to match the 10xxxxxx continuation pattern. */
if ((*s & 0xc0) != 0x80)
120
/* Append the six payload bits of this continuation byte. */
wc = (wc << 6) | (*s++ & 0x3f);
122
/*
 * A k-byte sequence carries 5k + 1 payload bits and a (k-1)-byte one
 * carries 5k - 4, so a value below 2^(5k-4) did not need k bytes:
 * this test detects an overlong encoding.
 */
if (wc < (1 << (5 * k - 4)))
129
/*
 * Encode wc1 as a UTF-8 byte sequence written to s.
 * The branches below cover the 1- to 6-byte forms, i.e. values
 * below 2^31.
 * NOTE(review): the return statements and the handling of values that
 * match no branch are not visible in this excerpt.
 */
int utf8_wctomb(char *s, int wc1)
131
/* Work on an unsigned copy so the shifts and range tests below operate on an unsigned value. */
unsigned int wc = wc1;
135
/* Fits in 7 bits: single-byte form (its body is not visible here). */
if (wc < (1u << 7)) {
139
/* Up to 11 bits: 110xxxxx 10xxxxxx. */
else if (wc < (1u << 11)) {
140
*s++ = 0xc0 | (wc >> 6);
141
*s++ = 0x80 | (wc & 0x3f);
144
/* Up to 16 bits: 1110xxxx followed by two continuation bytes. */
else if (wc < (1u << 16)) {
145
*s++ = 0xe0 | (wc >> 12);
146
*s++ = 0x80 | ((wc >> 6) & 0x3f);
147
*s++ = 0x80 | (wc & 0x3f);
150
/* Up to 21 bits: 11110xxx followed by three continuation bytes. */
else if (wc < (1u << 21)) {
151
*s++ = 0xf0 | (wc >> 18);
152
*s++ = 0x80 | ((wc >> 12) & 0x3f);
153
*s++ = 0x80 | ((wc >> 6) & 0x3f);
154
*s++ = 0x80 | (wc & 0x3f);
157
/* Up to 26 bits: 111110xx followed by four continuation bytes. */
else if (wc < (1u << 26)) {
158
*s++ = 0xf8 | (wc >> 24);
159
*s++ = 0x80 | ((wc >> 18) & 0x3f);
160
*s++ = 0x80 | ((wc >> 12) & 0x3f);
161
*s++ = 0x80 | ((wc >> 6) & 0x3f);
162
*s++ = 0x80 | (wc & 0x3f);
165
/* Up to 31 bits: 1111110x followed by five continuation bytes. */
else if (wc < (1u << 31)) {
166
*s++ = 0xfc | (wc >> 30);
167
*s++ = 0x80 | ((wc >> 24) & 0x3f);
168
*s++ = 0x80 | ((wc >> 18) & 0x3f);
169
*s++ = 0x80 | ((wc >> 12) & 0x3f);
170
*s++ = 0x80 | ((wc >> 6) & 0x3f);
171
*s++ = 0x80 | (wc & 0x3f);
179
* The charset "object" and methods.
184
int (*mbtowc)(void *table, int *pwc, const char *s, size_t n);
185
int (*wctomb)(void *table, char *s, int wc);
189
/*
 * Decode one character from s (at most n bytes) into *pwc by calling the
 * charset's own mbtowc method; charset->map is passed as its table argument.
 * The method's return value is returned unchanged.
 */
int charset_mbtowc(struct charset *charset, int *pwc, const char *s, size_t n)
191
return (*charset->mbtowc)(charset->map, pwc, s, n);
194
/*
 * Encode wc into s by calling the charset's own wctomb method;
 * charset->map is passed as its table argument.
 * The method's return value is returned unchanged.
 */
int charset_wctomb(struct charset *charset, char *s, int wc)
196
return (*charset->wctomb)(charset->map, s, wc);
199
int charset_max(struct charset *charset)
205
* Implementation of UTF-8.
208
/*
 * UTF-8 decoder with the parameter list of the charset mbtowc method.
 * The map argument is not forwarded: utf8_mbtowc takes no table.
 */
static int mbtowc_utf8(void *map, int *pwc, const char *s, size_t n)
211
return utf8_mbtowc(pwc, s, n);
214
/*
 * UTF-8 encoder with the parameter list of the charset wctomb method.
 * The map argument is not forwarded: utf8_wctomb takes no table.
 */
static int wctomb_utf8(void *map, char *s, int wc)
217
return utf8_wctomb(s, wc);
221
* Implementation of US-ASCII.
222
* Probably on most architectures this compiles to less than 256 bytes
223
* of code, so we can save space by not having a table for this one.
226
static int mbtowc_ascii(void *map, int *pwc, const char *s, size_t n)
233
wc = (unsigned char)*s;
241
static int wctomb_ascii(void *map, char *s, int wc)
253
* Implementation of ISO-8859-1.
254
* Probably on most architectures this compiles to less than 256 bytes
255
* of code, so we can save space by not having a table for this one.
258
static int mbtowc_iso1(void *map, int *pwc, const char *s, size_t n)
265
wc = (unsigned char)*s;
273
static int wctomb_iso1(void *map, char *s, int wc)
285
* Implementation of any 8-bit charset.
289
const unsigned short *from;
290
struct inverse_map *to;
293
static int mbtowc_8bit(void *map1, int *pwc, const char *s, size_t n)
295
struct map *map = map1;
300
wc = map->from[(unsigned char)*s];
309
* For the inverse map we use a hash table, which has the advantages
310
* of small constant memory requirement and simple memory allocation,
311
* but the disadvantage of slow conversion in the worst case.
312
* If you need real-time performance while letting a potentially
313
* malicious user define their own map, then the method used in
314
* linux/drivers/char/consolemap.c would be more appropriate.
318
unsigned char first[256];
319
unsigned char next[256];
323
* The simple hash is good enough for this application.
324
* Use the alternative trivial hashes for testing.
326
#define HASH(i) ((i) & 0xff)
327
/* #define HASH(i) 0 */
328
/* #define HASH(i) 99 */
330
/*
 * Build the reverse lookup for a 256-entry byte -> wide-character table.
 * Entries are chained per hash bucket through first[] and next[].
 * NOTE(review): the allocation-failure check, the computation of the
 * bucket index k and the return statement are not visible in this excerpt.
 */
static struct inverse_map *make_inverse_map(const unsigned short *from)
332
struct inverse_map *to;
336
to = (struct inverse_map *)malloc(sizeof(struct inverse_map));
339
/* Start with every bucket head, chain link and "used" flag cleared. */
for (i = 0; i < 256; i++)
340
to->first[i] = to->next[i] = used[i] = 0;
341
/* Visit the bytes from 255 down to 0; table entries equal to 0xffff are skipped. */
for (i = 255; i >= 0; i--)
342
if (from[i] != 0xffff) {
344
/* Byte i's link takes over the current head of bucket k. */
to->next[i] = to->first[k];
349
/* Point the empty buckets at an empty list. */
350
for (i = 0; i < 256; i++)
354
for (j = 0; j < 256; j++)
361
/*
 * Encode wide character wc1 as one byte of a table-driven 8-bit charset.
 * map1 is the struct map of that charset.
 * NOTE(review): this function is not declared static, unlike
 * mbtowc_8bit; the return statements are not visible in this excerpt.
 */
int wctomb_8bit(void *map1, char *s, int wc1)
363
struct map *map = map1;
/* Narrowed to unsigned short, the element type of the from[] table it is compared against. */
364
unsigned short wc = wc1;
373
if (1) /* Change 1 to 0 to test the case where malloc fails. */
375
/* The inverse map is created here, on demand, rather than when the charset is set up. */
map->to = make_inverse_map(map->from);
378
/* Use the inverse map. */
379
/* Start at the head of the bucket selected by the hash of wc. */
i = map->to->first[HASH(wc)];
381
/* A candidate byte matches when the forward table maps it back to wc. */
if (map->from[i] == wc) {
385
/* Follow the chain; a next link of 0 ends the walk. */
if (!(i = map->to->next[i]))
390
/* We don't have an inverse map, so do a linear search. */
391
for (i = 0; i < 256; i++)
392
if (map->from[i] == wc) {
402
* The "constructor" charset_find().
405
struct charset charset_utf8 = {
412
struct charset charset_iso1 = {
419
struct charset charset_ascii = {
426
/*
 * Look up the charset descriptor for the name in code.
 * Names are compared with ascii_strcasecmp, so case is ignored.
 * UTF-8, US-ASCII and ISO-8859-1 return the address of a file-scope
 * descriptor; a table-driven 8-bit charset gets a descriptor that is
 * allocated on first request and then kept in maps[i].charset.
 * NOTE(review): the result for an unknown name is not visible in this
 * excerpt; charset_convert tests the result against null.
 */
struct charset *charset_find(const char *code)
430
/* Find good (MIME) name. */
431
/* names[] pairs an alias (bad) with the preferred spelling (good); a bad entry that tests false ends the list. */
for (i = 0; names[i].bad; i++)
432
if (!ascii_strcasecmp(code, names[i].bad)) {
433
code = names[i].good;
437
/* Recognise some charsets for which we avoid using a table. */
438
if (!ascii_strcasecmp(code, "UTF-8"))
439
return &charset_utf8;
440
if (!ascii_strcasecmp(code, "US-ASCII"))
441
return &charset_ascii;
442
if (!ascii_strcasecmp(code, "ISO-8859-1"))
443
return &charset_iso1;
445
/* Look for a mapping for a simple 8-bit encoding. */
446
for (i = 0; maps[i].name; i++)
447
if (!ascii_strcasecmp(code, maps[i].name)) {
448
/* First request for this charset: build its descriptor now. */
if (!maps[i].charset) {
449
maps[i].charset = (struct charset *)malloc(sizeof(struct charset));
450
if (maps[i].charset) {
451
struct map *map = (struct map *)malloc(sizeof(struct map));
453
/* Release the descriptor again; presumably the struct map allocation failed (the guarding test is not visible here). */
free(maps[i].charset);
457
/* An 8-bit charset never needs more than one byte per character. */
maps[i].charset->max = 1;
458
maps[i].charset->mbtowc = &mbtowc_8bit;
459
maps[i].charset->wctomb = &wctomb_8bit;
460
maps[i].charset->map = map;
461
map->from = maps[i].map;
462
map->to = 0; /* inverse mapping is created when required */
466
return maps[i].charset;
473
* Function to convert a buffer from one encoding to another.
474
* Invalid bytes are replaced by '#', and characters that are
475
* not available in the target encoding are replaced by '?'.
476
* Each of TO and TOLEN may be zero, if the result is not needed.
477
* The output buffer is null-terminated, so it is all right to
478
* use charset_convert(fromcode, tocode, s, strlen(s), &t, 0).
481
/*
 * Convert the fromlen bytes at from out of charset fromcode into
 * charset tocode, building the result in a malloc'ed buffer.
 * NOTE(review): the return codes, the allocation-failure check and the
 * terminator / tolen handling are not visible in this excerpt.
 */
int charset_convert(const char *fromcode, const char *tocode,
482
const char *from, size_t fromlen,
483
char **to, size_t *tolen)
486
struct charset *charset1, *charset2;
487
char *tobuf, *p, *newbuf;
490
charset1 = charset_find(fromcode);
491
charset2 = charset_find(tocode);
492
/* Either charset name was not resolved by charset_find. */
if (!charset1 || !charset2 )
495
/*
 * Size for fromlen output characters of charset2->max bytes each,
 * plus one extra byte.
 * NOTE(review): no overflow check on this multiplication is visible.
 */
tobuf = (char *)malloc(fromlen * charset2->max + 1);
499
/*
 * Each pass decodes one character (i input bytes) and encodes it
 * (j output bytes); the for-header then advances both cursors.
 */
for (p = tobuf; fromlen; from += i, fromlen -= i, p += j) {
500
i = charset_mbtowc(charset1, &wc, from, fromlen);
508
j = charset_wctomb(charset2, p, wc);
512
/* Substitute '?' for the character; presumably reached when the target charset cannot encode it. */
j = charset_wctomb(charset2, p, '?');
522
/* Resize the buffer to the bytes actually used. */
newbuf = realloc(tobuf, p - tobuf);
523
/* If realloc failed, tobuf is still valid, so hand that back instead. */
*to = newbuf ? newbuf : tobuf;
531
#endif /* !HAVE_ICONV (see the #ifndef near the top of the file) */