2
* Copyright 2010 Inalogic Inc.
4
* This program is free software: you can redistribute it and/or modify it
5
* under the terms of the GNU Lesser General Public License version 3, as
6
* published by the Free Software Foundation.
8
* This program is distributed in the hope that it will be useful, but
9
* WITHOUT ANY WARRANTY; without even the implied warranties of
10
* MERCHANTABILITY, SATISFACTORY QUALITY or FITNESS FOR A PARTICULAR
11
* PURPOSE. See the applicable version of the GNU Lesser General Public
12
* License for more details.
14
* You should have received a copy of both the GNU Lesser General Public
15
* License version 3 along with this program. If not, see
16
* <http://www.gnu.org/licenses/>
18
* Authored by: Jay Taoko <jay.taoko_AT_gmail_DOT_com>
31
// Converts a single codepoint in the specified UTF-8 stream of text
32
// into a UTF-32 value
34
// Illegal sequences are converted to the unicode replacement character
36
// utf8str - [in] buffer containing UTF-8 text
37
// utf8len - [in] number of code-units (bytes) available in buffer
38
// pch32 - [out] single UTF-32 value
40
// Returns number of bytes processed from utf8str
42
// Decodes the single code point that starts at utf8str[0] and stores it in
// *pch32; malformed input is reported as UNI_REPLACEMENT_CHAR.
// NOTE(review): several lines of this body are not visible in this view; the
// comments below describe only the statements that are shown.
size_t utf8_to_utf32(t_UTF8 *utf8str, size_t utf8len, t_UTF32 *pch32)
44
// NOTE(review): the lead byte is read here, ahead of the parameter
// validation further down, so a null utf8str (or an empty buffer) has
// already been dereferenced by the time that check runs.
t_UTF8 ch = *utf8str++;
50
// Lower bounds consulted as nonshortest[trailing] in the final check: a
// decoded value below its entry is rejected. With the 0xffffffff entries
// every value other than 0xffffffff itself fails that check.
static t_UTF32 nonshortest[] =
52
0, 0x80, 0x800, 0x10000, 0xffffffff, 0xffffffff
55
// validate parameters
// (utf8len is a size_t, so "<= 0" can only ever mean "== 0")
56
if(utf8str == 0 || utf8len <= 0 || pch32 == 0)
59
// look for plain ASCII first as this is most likely
65
// LEAD-byte of 2-byte seq: 110xxxxx 10xxxxxx
66
else if((ch & 0xE0) == 0xC0)
71
// LEAD-byte of 3-byte seq: 1110xxxx 10xxxxxx 10xxxxxx
72
else if((ch & 0xF0) == 0xE0)
77
// LEAD-byte of 4-byte seq: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
78
else if((ch & 0xF8) == 0xF0)
83
// ILLEGAL 5-byte seq: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
84
else if((ch & 0xFC) == 0xF8)
86
// range-checking the t_UTF32 result will catch this
90
// ILLEGAL 6-byte seq: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
91
else if((ch & 0xFE) == 0xFC)
93
// range-checking the t_UTF32 result will catch this
97
// ILLEGAL continuation (trailing) byte by itself
98
else if((ch & 0xC0) == 0x80)
100
*pch32 = UNI_REPLACEMENT_CHAR;
103
// any other ILLEGAL form.
106
*pch32 = UNI_REPLACEMENT_CHAR;
110
// process trailing bytes
// The loop also ends once len reaches utf8len, i.e. possibly before all
// expected trail bytes were seen; the final check notices that through
// i != trailing.
111
for(i = 0; i < trailing && len < utf8len; i++)
115
// Valid trail-byte: 10xxxxxx
116
if((ch & 0xC0) == 0x80)
118
// The test above guarantees bit 6 is clear, so masking with 0x7f keeps
// exactly the six payload bits (same result as 0x3f).
val32 = (val32 << 6) + (ch & 0x7f);
121
// Anything else is an error
124
*pch32 = UNI_REPLACEMENT_CHAR;
129
// did we decode a full utf-8 sequence?
// Two failure modes: the value is below the minimum for this sequence
// length (overlong / non-shortest form), or fewer than `trailing` trail
// bytes were consumed.
130
if(val32 < nonshortest[trailing] || i != trailing)
131
*pch32 = UNI_REPLACEMENT_CHAR;
141
// Converts the specified UTF-32 value to UTF-8
143
// ch32 - [in] single utf-32 value
144
// utf8str - [out] buffer to receive UTF-8 text
145
// utf8len - [in] size of utf8 buffer in bytes
147
// Returns number of bytes stored in utf8str
149
// Encodes the single UTF-32 value ch32 as UTF-8 into utf8str, which has room
// for utf8len bytes.
// NOTE(review): the return statements and the ASCII range test are not
// visible in this view; comments describe only the statements shown.
size_t utf32_to_utf8(t_UTF8 *utf8str, size_t utf8len, t_UTF32 ch32)
153
// validate parameters
154
if(utf8str == 0 || utf8len == 0)
157
// ASCII is the easiest
160
*utf8str = (t_UTF8)ch32;
164
// make sure we have a legal utf32 char
165
if(ch32 > UNI_MAX_LEGAL_UTF32)
166
ch32 = UNI_REPLACEMENT_CHAR;
168
// cannot encode the surrogate range
169
if(ch32 >= UNI_SUR_HIGH_START && ch32 <= UNI_SUR_LOW_END)
170
ch32 = UNI_REPLACEMENT_CHAR;
173
// 2-byte form: 110xxxxx 10xxxxxx. Every multi-byte branch also requires
// enough room in the output buffer for the whole sequence.
if(ch32 < 0x800 && utf8len >= 2)
175
*utf8str++ = (t_UTF8)((ch32 >> 6) | 0xC0);
176
*utf8str++ = (t_UTF8)((ch32 & 0x3f) | 0x80);
180
// 3-byte form: 1110xxxx 10xxxxxx 10xxxxxx
else if(ch32 < 0x10000 && utf8len >= 3)
182
*utf8str++ = (t_UTF8)((ch32 >> 12) | 0xE0);
183
// '&' binds tighter than '|', so this is ((ch32 >> 6) & 0x3f) | 0x80.
*utf8str++ = (t_UTF8)((ch32 >> 6) & 0x3f | 0x80);
184
*utf8str++ = (t_UTF8)((ch32 & 0x3f) | 0x80);
188
// 4-byte form: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
// NOTE(review): the range test looks redundant here, since larger values
// appear to have been replaced above (assumes UNI_REPLACEMENT_CHAR is itself
// <= UNI_MAX_LEGAL_UTF32 -- confirm).
else if(ch32 <= UNI_MAX_LEGAL_UTF32 && utf8len >= 4)
190
*utf8str++ = (t_UTF8)((ch32 >> 18) | 0xF0);
191
*utf8str++ = (t_UTF8)((ch32 >> 12) & 0x3f | 0x80);
192
*utf8str++ = (t_UTF8)((ch32 >> 6) & 0x3f | 0x80);
193
*utf8str++ = (t_UTF8)((ch32 & 0x3f) | 0x80);
197
// 5/6 byte sequences never occur because we limit using UNI_MAX_LEGAL_UTF32
205
// Convert the specified UTF-8 stream of text to UTF-16
207
// 1. The maximum number possible of whole UTF-16 characters are stored in wstr
208
// 2. Illegal sequences are converted to the unicode replacement character
209
// 3. Returns the number of bytes processed from utf8str
211
// utf8str - [in] buffer containing utf-8 text
212
// utf8len - [in] number of code-units (bytes) in buffer
213
// utf16str - [out] receives resulting utf-16 text
214
// utf16len - [in/out] on input, specifies the size (in UTF16s) of utf16str
215
// on output, holds actual number of UTF16s stored in utf16str
217
// Returns the number of bytes processed from utf8str
219
// Converts UTF-8 to UTF-16 one code point at a time by chaining
// utf8_to_utf32 and utf32_to_utf16.
// NOTE(review): the local declarations and the statements that advance the
// pointers/counters are not visible in this view.
size_t utf8_to_utf16(t_UTF8 *utf8str, size_t utf8len, t_UTF16 *utf16str, size_t *utf16len)
221
// Remember both starting positions so the amounts consumed/produced can be
// computed as pointer differences at the end.
t_UTF16 *utf16start = utf16str;
222
t_UTF8 *utf8start = utf8str;
228
// Continue while input bytes remain and the output still has room.
while(utf8len > 0 && *utf16len > 0)
231
// Decode one code point; len receives the value returned by utf8_to_utf32.
len = utf8_to_utf32(utf8str, utf8len, &ch32);
236
// utf32_to_utf16 overwrites its length argument, so it is handed a scratch
// copy rather than the caller's *utf16len. Note that len is reused for this
// second call's return value.
tmp16len = *utf16len;
237
len = utf32_to_utf16(&ch32, 1, utf16str, &tmp16len);
242
// Report UTF-16 units written and return UTF-8 bytes consumed (relies on
// both pointers having been advanced inside the loop).
*utf16len = utf16str - utf16start;
243
return utf8str - utf8start;
249
// Convert the specified UTF-16 stream of text to UTF-8
251
// 1. As many whole codepoints as possible are stored in utf8str
252
// 2. Illegal sequences are converted to the unicode replacement character
254
// utf16str - [in] buffer containing utf-16 text
255
// utf16len - [in] number of code-units (UTF16s) in buffer
256
// utf8str - [out] receives resulting utf-8 text
257
// utf8len - [in/out] on input, specifies the size (in bytes) of utf8str
258
// on output, holds actual number of bytes stored in utf8str
260
// Returns the number of characters (UTF16s) processed from utf16str
262
// Converts UTF-16 to UTF-8 one code point at a time by chaining
// utf16_to_utf32 and utf32_to_utf8.
// NOTE(review): the local declarations and the statements that advance the
// pointers/counters are not visible in this view.
size_t utf16_to_utf8(t_UTF16 *utf16str, size_t utf16len, t_UTF8 *utf8str, size_t *utf8len)
264
// Remember both starting positions so the amounts consumed/produced can be
// computed as pointer differences at the end.
t_UTF16 * utf16start = utf16str;
265
t_UTF8 * utf8start = utf8str;
270
// Continue while input units remain and the output still has room.
while(utf16len > 0 && *utf8len > 0)
274
// ch32len is passed as utf16_to_utf32's in/out capacity; its initialisation
// is not visible here -- presumably 1 so that a single code point is
// decoded per iteration (TODO confirm).
len = utf16_to_utf32(utf16str, utf16len, &ch32, &ch32len);
279
// Encode that code point; len is reused for utf32_to_utf8's return value.
len = utf32_to_utf8(utf8str, *utf8len, ch32);
284
// Report UTF-8 bytes written and return UTF-16 units consumed (relies on
// both pointers having been advanced inside the loop).
*utf8len = utf8str - utf8start;
285
return utf16str - utf16start;
291
// Converts plain ASCII string to UTF-16
293
// asciistr - [in] buffer containing ASCII characters
294
// asciilen - [in] number of characters in buffer
295
// utf16str - [out] receives the resulting UTF-16 text
296
// utf16len - [in/out] on input, specifies length of utf16 buffer,
297
// on output, holds number of chars stored in utf16str
299
// Returns number of characters processed from asciistr
301
// Widens a byte string to UTF-16 through the Win32 MultiByteToWideChar API.
// NOTE(review): the statements after the conversion (updating *utf16len and
// returning) are not visible in this view.
size_t ascii_to_utf16(t_UTF8 *asciistr, size_t asciilen, t_UTF16 *utf16str, size_t *utf16len)
303
// Convert no more characters than fit in either buffer.
size_t len = Min(*utf16len, asciilen);
305
// len is used both as the source byte count and as the destination capacity.
// NOTE(review): the API takes int lengths, so the size_t len is narrowed
// here, and the call's return value is not captured on this line. CP_ACP
// selects the system ANSI code page, so bytes >= 0x80 are mapped by that
// code page rather than rejected as non-ASCII.
MultiByteToWideChar(CP_ACP, 0, (CCHAR*)asciistr, len, (WCHAR *)utf16str, len);
313
// Converts UTF-16 to plain ASCII (lossy)
315
// utf16str - [in] buffer containing t_UTF16 characters
316
// utf16len - [in] number of WCHARs in buffer
317
// asciistr - [out] receives the resulting ASCII text
318
// asciilen - [in/out] on input, specifies length of ascii buffer,
319
// on output, holds number of chars stored in asciistr
321
// Returns number of characters processed from utf16str
323
// Narrows UTF-16 text to a byte string through the Win32 WideCharToMultiByte
// API (lossy).
// NOTE(review): the statements after the conversion (updating *asciilen and
// returning) are not visible in this view.
size_t utf16_to_ascii(t_UTF16 *utf16str, size_t utf16len, t_UTF8 *asciistr, size_t *asciilen)
325
// Convert no more characters than fit in either buffer.
size_t len = Min(utf16len, *asciilen);
327
// Source length is len, but the destination capacity passed is the full
// *asciilen (ascii_to_utf16 passes len for both). The two trailing zeros are
// the default-character and used-default-flag arguments, i.e. neither is
// supplied.
// NOTE(review): the API takes int lengths, so both size_t values are
// narrowed, and the return value is not captured on this line.
WideCharToMultiByte(CP_ACP, 0, INL_REINTERPRET_CAST(LPCWSTR, utf16str), len, (LPSTR)asciistr, *asciilen, 0, 0);
335
// Copies UTF-8 string from src to dest
337
// src - [in] buffer containing utf-8 text
338
// srclen - [in] number of code-units in src
339
// dest - [out] receives resulting string
340
// destlen - [in/out] on input, specifies length of dest buffer
341
// on output, holds number of UTF8s stored in dest
343
// returns number of CHARs processed from src
345
// Copies as many UTF-8 code units from src to dest as fit in both buffers.
// NOTE(review): the statements that update *destlen and return are not
// visible in this view.
size_t copy_utf8(t_UTF8 *src, size_t srclen, t_UTF8 *dest, size_t *destlen)
347
// Number of code units to copy: limited by the smaller of the two buffers.
size_t len = Min(*destlen, srclen);
348
// Raw copy of len code units; a multi-byte sequence straddling the cut-off
// is not treated specially here. memcpy requires that src and dest do not
// overlap.
memcpy(dest, src, len * sizeof(t_UTF8));
357
// Copies UTF-16 string from src to dest
359
// src - [in] buffer containing utf-16 text
360
// srclen - [in] number of code-units in src
361
// dest - [out] receives resulting string
362
// destlen - [in/out] on input, specifies length of dest buffer
363
// on output, holds number of UTF16s stored in dest
365
// returns number of WCHARs processed from src
367
// Copies as many UTF-16 code units from src to dest as fit in both buffers.
// NOTE(review): the statements that update *destlen and return are not
// visible in this view.
size_t copy_utf16(t_UTF16 *src, size_t srclen, t_UTF16 *dest, size_t *destlen)
369
// Number of code units to copy: limited by the smaller of the two buffers.
size_t len = Min(*destlen, srclen);
370
// Raw copy of len code units (len * sizeof(t_UTF16) bytes); a surrogate pair
// straddling the cut-off is not treated specially here. memcpy requires that
// src and dest do not overlap.
memcpy(dest, src, len * sizeof(t_UTF16));
379
// Copies UTF-16 string from src to dest, performing endianness swap
380
// for each code-unit
382
// src - [in] buffer containing utf-16 text
383
// srclen - [in] number of code-units in src
384
// dest - [out] receives resulting word-swapped string
385
// destlen - [in/out] on input, specifies length of dest buffer
386
// on output, holds number of UTF16s stored in dest
388
// Returns number of WCHARs processed from src
390
// Copies UTF-16 code units from src to dest, passing each one through
// SWAPWORD on the way.
// NOTE(review): the declaration of i and the statements that update *destlen
// and return are not visible in this view.
size_t swap_utf16(t_UTF16 *src, size_t srclen, t_UTF16 *dest, size_t *destlen)
392
// Number of code units to process: limited by the smaller of the two buffers.
size_t len = Min(*destlen, srclen);
395
for(i = 0; i < len; i++)
396
// SWAPWORD is defined elsewhere; presumably it exchanges the two bytes of
// the 16-bit unit (TODO confirm).
dest[i] = SWAPWORD(src[i]);
405
// Converts the specified UTF-32 stream of text to UTF-16
407
// utf32str - [in] buffer containing utf-32 text
408
// utf32len - [in] number of characters (UTF32s) in utf32str
409
// utf16str - [out] receives resulting utf-16 text
410
// utf16len - [in/out] on input, specifies the size (in UTF16s) of utf16str
411
// on output, holds actual number of t_UTF16 values stored in utf16str
413
// returns number of UTF32s processed from utf32str
415
// Converts UTF-32 values to UTF-16, substituting UNI_REPLACEMENT_CHAR for
// values that cannot be represented.
// NOTE(review): the first range test, the counter updates and the
// out-of-room handling are not visible in this view.
size_t utf32_to_utf16(t_UTF32 *utf32str, size_t utf32len, t_UTF16 *utf16str, size_t *utf16len)
417
// Remember both starting positions so the amounts consumed/produced can be
// computed as pointer differences at the end.
t_UTF16 *utf16start = utf16str;
418
t_UTF32 *utf32start = utf32str;
420
// Continue while input values remain and the output still has room.
while(utf32len > 0 && *utf16len > 0)
422
// Fetch the next value; the input pointer is advanced before it is known
// whether the output has room for it.
t_UTF32 ch32 = *utf32str++;
425
// target is a character <= 0xffff
428
// make sure we don't represent anything in t_UTF16 surrogate range
// (this helps protect against non-shortest forms)
430
if(ch32 >= UNI_SUR_HIGH_START && ch32 <= UNI_SUR_LOW_END)
432
*utf16str++ = UNI_REPLACEMENT_CHAR;
437
// Stored directly as a single 16-bit unit.
*utf16str++ = (WORD)ch32;
441
// FFFE and FFFF are illegal mid-stream
442
else if(ch32 == 0xfffe || ch32 == 0xffff)
444
*utf16str++ = UNI_REPLACEMENT_CHAR;
447
// target is illegal Unicode value
448
else if(ch32 > UNI_MAX_UTF16)
450
*utf16str++ = UNI_REPLACEMENT_CHAR;
453
// target is in range 0xffff - 0x10ffff
// (0xfffe/0xffff and values above UNI_MAX_UTF16 were taken by the branches
// above; a surrogate pair needs room for two output units)
454
else if(*utf16len >= 2)
458
// NOTE(review): surrogate encoding requires 0x10000 to be subtracted from
// ch32 before these two stores; that statement is not visible in this view
// -- confirm it is present.
*utf16str++ = (WORD)((ch32 >> 10) + UNI_SUR_HIGH_START);
459
*utf16str++ = (WORD)((ch32 & 0x3ff) + UNI_SUR_LOW_START);
465
// no room to store result
470
// Report UTF-16 units written and return UTF-32 values consumed.
*utf16len = utf16str - utf16start;
471
return utf32str - utf32start;
477
// Converts the specified UTF-16 stream of text to UTF-32
479
// utf16str - [in] buffer containing utf-16 text
480
// utf16len - [in] number of code-units (UTF16s) in utf16str
481
// utf32str - [out] receives resulting utf-32 text
482
// utf32len - [in/out] on input, specifies the size (in UTF32s) of utf32str
483
// on output, holds actual number of t_UTF32 values stored in utf32str
485
// returns number of UTF16s processed from utf16str
487
// Converts native-endian UTF-16 to UTF-32, combining surrogate pairs into a
// single value.
// NOTE(review): the statements that store ch, advance the pointers and update
// the counters are not visible in this view.
size_t utf16_to_utf32(t_UTF16 *utf16str, size_t utf16len, t_UTF32 *utf32str, size_t *utf32len)
489
// Remember both starting positions so the amounts consumed/produced can be
// computed as pointer differences at the end.
t_UTF16 *utf16start = utf16str;
490
t_UTF32 *utf32start = utf32str;
492
// Continue while input units remain and the output still has room.
while(utf16len > 0 && *utf32len > 0)
494
// Read the current unit without advancing yet.
t_UTF32 ch = *utf16str;
496
// first of a surrogate pair?
// (the utf16len >= 2 test ensures the look-ahead below stays inside the
// buffer; a high surrogate in the very last unit does not enter this branch)
497
if(ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END && utf16len >= 2)
499
// get the second half of the pair
500
t_UTF32 ch2 = *(utf16str + 1);
502
// valid trailing surrogate unit?
503
if(ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
505
// High surrogate supplies the upper 10 bits, low surrogate the lower 10;
// 0x10000 rebases the result above the 16-bit range.
ch = ((ch - UNI_SUR_HIGH_START) << 10) +
506
((ch2 - UNI_SUR_LOW_START) + 0x00010000);
514
// NOTE(review): the branch this belongs to is not visible; presumably the
// case of a high surrogate without a valid low surrogate.
ch = UNI_REPLACEMENT_CHAR;
525
// Report UTF-32 values written and return UTF-16 units consumed.
*utf32len = utf32str - utf32start;
526
return utf16str - utf16start;
532
// Converts the specified big-endian UTF-16 stream of text to UTF-32
534
// utf16str - [in] buffer containing utf-16 big-endian text
535
// utf16len - [in] number of code-units (UTF16s) in utf16str
536
// utf32str - [out] receives resulting utf-32 text
537
// utf32len - [in/out] on input, specifies the size (in UTF32s) of utf32str
538
// on output, holds actual number of t_UTF32 values stored in utf32str
540
// returns number of UTF16s processed from utf16str
542
// Same conversion as utf16_to_utf32, except every input unit is passed
// through SWAPWORD before it is examined.
// NOTE(review): the statements that store ch, advance the pointers and update
// the counters are not visible in this view.
size_t utf16be_to_utf32(t_UTF16 *utf16str, size_t utf16len, t_UTF32 *utf32str, size_t *utf32len)
544
// Remember both starting positions so the amounts consumed/produced can be
// computed as pointer differences at the end.
t_UTF16 *utf16start = utf16str;
545
t_UTF32 *utf32start = utf32str;
547
// Continue while input units remain and the output still has room.
while(utf16len > 0 && *utf32len > 0)
549
// Read the current unit (byte-swapped) without advancing yet.
t_UTF32 ch = SWAPWORD(*utf16str);
551
// first of a surrogate pair?
// (the utf16len >= 2 test ensures the look-ahead below stays inside the
// buffer)
552
if(ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END && utf16len >= 2)
554
// get the second half of the pair, byte-swapped as well
t_UTF32 ch2 = SWAPWORD(*(utf16str + 1));
556
// valid trailing surrogate unit?
557
if(ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
559
// High surrogate supplies the upper 10 bits, low surrogate the lower 10;
// 0x10000 rebases the result above the 16-bit range.
ch = ((ch - UNI_SUR_HIGH_START) << 10) +
560
((ch2 - UNI_SUR_LOW_START) + 0x00010000);
568
// NOTE(review): the branch this belongs to is not visible; presumably the
// case of a high surrogate without a valid low surrogate.
ch = UNI_REPLACEMENT_CHAR;
579
// Report UTF-32 values written and return UTF-16 units consumed.
*utf32len = utf32str - utf32start;
580
return utf16str - utf16start;