52
#define IS_ALNUM(C) ( ((C) >= 'A' && (C) <= 'Z') || ((C) >= 'a' && (C) <= 'z') || ((C) >= '0' && (C) <= '9') )
53
#define CHAR_LOWER(C) ( ((C) >= 'A' && (C) <= 'Z') ? ((C) + 'a' - 'A') : (C) )
54
static int hts_equalsAlphanum(const char *a, const char *b) {
56
for(i = 0, j = 0;; i++, j++) {
58
for(; a[i] != '\0' && !IS_ALNUM(a[i]); i++) ;
59
for(; b[j] != '\0' && !IS_ALNUM(b[j]); j++) ;
61
if (CHAR_LOWER(a[i]) != CHAR_LOWER(b[j])) {
64
/* End of string ? (note: a[i] == b[j]) */
65
else if (a[i] == '\0') {
74
/* Copy the memory region [s .. s + size - 1 ] as a \0-terminated string. */
75
static char *hts_stringMemCopy(const char *s, size_t size) {
76
char *dest = malloc(size + 1);
79
memcpy(dest, s, size);
54
88
typedef struct wincodepage_t wincodepage_t;
207
241
UINT hts_getCodepage(const char *name) {
210
#define IS_ALNUM(C) ( ((C) >= 'A' && (C) <= 'Z') || ((C) >= 'a' && (C) <= 'z') || ((C) >= '0' && (C) <= '9') )
211
#define CHAR_LOWER(C) ( ((C) >= 'A' && (C) <= 'Z') ? ((C) + 'a' - 'A') : (C) )
212
244
for(id = 0; codepages[id].name != NULL; id++) {
215
245
/* Compare the two strings, lowercase and alphanum only (ISO88591 == iso-8859-1) */
216
const char *a = name, *b = codepages[id].name;
218
for(i = 0, j = 0;; i++, j++) {
220
for(; a[i] != '\0' && !IS_ALNUM(a[i]); i++) ;
221
for(; b[j] != '\0' && !IS_ALNUM(b[j]); j++) ;
223
if (CHAR_LOWER(a[i]) != CHAR_LOWER(b[j])) {
226
/* End of string ? (note: a[i] == b[j]) */
227
else if (a[i] == '\0') {
228
return codepages[id].codepage;
246
if (hts_equalsAlphanum(name, codepages[id].name)) {
247
return codepages[id].codepage;
238
static char *strndup(const char *s, size_t size) {
239
char *dest = malloc(size + 1);
242
memcpy(dest, s, size);
249
255
LPWSTR hts_convertStringToUCS2(const char *s, int size, UINT cp, int *pwsize) {
250
256
/* Size in wide chars of the output */
251
257
const int wsize = MultiByteToWideChar(cp, 0, (LPCSTR) s, size, NULL, 0);
302
308
char *hts_convertStringCPToUTF8(const char *s, size_t size, UINT cp) {
303
309
/* Empty string ? */
305
return strndup(s, size);
311
return hts_stringMemCopy(s, size);
307
313
/* Already UTF-8 ? */
308
314
if (cp == CP_UTF8 || hts_isStringAscii(s, size)) {
309
return strndup(s, size);
315
return hts_stringMemCopy(s, size);
311
317
/* Other (valid) charset */
312
318
else if (cp != 0) {
329
335
char *hts_convertStringCPFromUTF8(const char *s, size_t size, UINT cp) {
330
336
/* Empty string ? */
332
return strndup(s, size);
338
return hts_stringMemCopy(s, size);
334
340
/* Already UTF-8 ? */
335
341
if (cp == CP_UTF8 || hts_isStringAscii(s, size)) {
336
return strndup(s, size);
342
return hts_stringMemCopy(s, size);
338
344
/* Other (valid) charset */
339
345
else if (cp != 0) {
374
380
#include <errno.h>
382
#if ( defined(HTS_USEICONV) && ( HTS_USEICONV == 0 ) )
383
#define DISABLE_ICONV
386
#ifndef DISABLE_ICONV
375
387
#include <iconv.h>
377
static char *hts_convertStringToUTF8_(const char *s, size_t size,
389
#include "htscodepages.h"
391
/* decode from a codepage to UTF-8 */
392
static char* hts_codepageToUTF8(const char *codepage, const char *s) {
393
/* find the given codepage */
395
for(i = 0 ; table_mappings[i].name != NULL
396
&& !hts_equalsAlphanum(table_mappings[i].name, codepage) ; i++) ;
399
if (table_mappings[i].name != NULL) {
404
for(j = 0, k = 0 ; s[j] != '\0' ; j++) {
405
const unsigned char c = (unsigned char) s[j];
406
const hts_UCS4 uc = table_mappings[i].table[c];
407
const size_t max = k + MAX_UTF;
409
for(capa = 16 ; capa < max ; capa <<= 1) ;
410
dest = realloc(dest, capa);
416
const size_t len = hts_writeUTF8(uc, &dest[k], MAX_UTF);
429
static char *hts_convertStringCharset(const char *s, size_t size,
378
430
const char *to, const char *from) {
379
431
/* Empty string ? */
381
433
return strdup("");
383
435
/* Already on correct charset ? */
384
if (strcasecmp(from, to) == 0) {
385
return strndup(s, size);
436
if (hts_equalsAlphanum(from, to)) {
437
return hts_stringMemCopy(s, size);
439
#ifndef DISABLE_ICONV
387
440
/* Find codepage */
389
442
const iconv_t cp = iconv_open(to, from);
499
/* Limited codepage decoding support only. */
500
if (hts_isCharsetUTF8(to)) {
501
return hts_codepageToUTF8(from, s);
446
505
/* Error, charset not found! */
455
514
/* Already UTF-8 ? */
456
515
if (hts_isCharsetUTF8(charset) || hts_isStringAscii(s, size)) {
457
return strndup(s, size);
516
return hts_stringMemCopy(s, size);
459
518
/* Find codepage */
461
return hts_convertStringToUTF8_(s, size, "utf-8", charset);
520
return hts_convertStringCharset(s, size, "utf-8", charset);
470
529
/* Already UTF-8 ? */
471
530
if (hts_isCharsetUTF8(charset) || hts_isStringAscii(s, size)) {
472
return strndup(s, size);
531
return hts_stringMemCopy(s, size);
474
533
/* Find codepage */
476
return hts_convertStringToUTF8_(s, size, charset, "utf-8");
535
return hts_convertStringCharset(s, size, charset, "utf-8");
1195
int hts_isStringUTF8(const char *s, size_t size) {
1196
const unsigned char *const data = (const unsigned char*) s;
1199
for(i = 0 ; i < size ; ) {
1200
/* Reader: yield the next byte while i < size, or -1 once the end is reached */
1201
#define RD ( i < size ? data[i++] : -1 )
1203
/* Writer: upon error (C == -1), stop and return 0 */
1204
#define WR(C) if ((C) == -1) { return 0; }
1206
/* Read Unicode character. */
1207
READ_UNICODE(RD, WR);
1136
1215
char *hts_convertUCS4StringToUTF8(const hts_UCS4 *s, size_t nChars) {
1138
1217
char *dest = NULL;