/*
 * Copyright 2001-2004 Unicode, Inc.
 *
 * Disclaimer
 *
 * This source code is provided as is by Unicode, Inc. No claims are
 * made as to fitness for any particular purpose. No warranties of any
 * kind are expressed or implied. The recipient agrees to determine
 * applicability of information provided. If this file has been
 * purchased on magnetic or optical media from Unicode, Inc., the
 * sole remedy for any claim will be exchange of defective media
 * within 90 days of receipt.
 *
 * Limitations on Rights to Redistribute This Code
 *
 * Unicode, Inc. hereby grants the right to freely use the information
 * supplied in this file in the creation of products supporting the
 * Unicode Standard, and to make copies of this file in any form
 * for internal or external distribution as long as this notice
 * remains attached.
 */
/* ---------------------------------------------------------------------

    Conversions between UTF32, UTF-16, and UTF-8. Source code file.
    Author: Mark E. Davis, 1994.
    Rev History: Rick McGowan, fixes & updates May 2001.
    Sept 2001: fixed const & error conditions per
        mods suggested by S. Parent & A. Lillich.
    June 2002: Tim Dodd added detection and handling of incomplete
        source sequences, enhanced error detection, added casts
        to eliminate compiler warnings.
    July 2003: slight mods to back out aggressive FFFE detection.
    Jan 2004: updated switches in from-UTF8 conversions.
    Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
    May 2006: updated isLegalUTF8Sequence.

    See the header file "ConvertUTF.h" for complete documentation.

------------------------------------------------------------------------ */
47
static const int halfShift = 10; /* used for shifting by 10 bits */
49
static const t_UTF32 halfBase = 0x0010000UL;
50
static const t_UTF32 halfMask = 0x3FFUL;
52
#define UNI_SUR_HIGH_START (t_UTF32)0xD800
53
#define UNI_SUR_HIGH_END (t_UTF32)0xDBFF
54
#define UNI_SUR_LOW_START (t_UTF32)0xDC00
55
#define UNI_SUR_LOW_END (t_UTF32)0xDFFF
58
ConversionResult ConvertUTF32toUTF16 (const t_UTF32** sourceStart, const t_UTF32* sourceEnd, t_UTF16** targetStart, t_UTF16* targetEnd, ConversionFlags flags)
60
ConversionResult result = conversionOK;
61
const t_UTF32* source = *sourceStart;
62
t_UTF16* target = *targetStart;
63
while (source < sourceEnd) {
65
if (target >= targetEnd) {
66
result = targetExhausted; break;
69
if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
70
/* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
71
if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
72
if (flags == strictConversion) {
73
--source; /* return to the illegal value itself */
74
result = sourceIllegal;
77
*target++ = UNI_REPLACEMENT_CHAR;
80
*target++ = (t_UTF16)ch; /* normal case */
82
} else if (ch > UNI_MAX_LEGAL_UTF32) {
83
if (flags == strictConversion) {
84
result = sourceIllegal;
86
*target++ = UNI_REPLACEMENT_CHAR;
89
/* target is a character in range 0xFFFF - 0x10FFFF. */
90
if (target + 1 >= targetEnd) {
91
--source; /* Back up source pointer! */
92
result = targetExhausted; break;
95
*target++ = (t_UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
96
*target++ = (t_UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
99
*sourceStart = source;
100
*targetStart = target;
104
/* --------------------------------------------------------------------- */
106
ConversionResult ConvertUTF16toUTF32 (const t_UTF16** sourceStart, const t_UTF16* sourceEnd, t_UTF32** targetStart, t_UTF32* targetEnd, ConversionFlags flags)
108
ConversionResult result = conversionOK;
109
const t_UTF16* source = *sourceStart;
110
t_UTF32* target = *targetStart;
112
while (source < sourceEnd) {
113
const t_UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
115
/* If we have a surrogate pair, convert to UTF32 first. */
116
if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
117
/* If the 16 bits following the high surrogate are in the source buffer... */
118
if (source < sourceEnd) {
120
/* If it's a low surrogate, convert to UTF32. */
121
if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
122
ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
123
+ (ch2 - UNI_SUR_LOW_START) + halfBase;
125
} else if (flags == strictConversion) { /* it's an unpaired high surrogate */
126
--source; /* return to the illegal value itself */
127
result = sourceIllegal;
130
} else { /* We don't have the 16 bits following the high surrogate. */
131
--source; /* return to the high surrogate */
132
result = sourceExhausted;
135
} else if (flags == strictConversion) {
136
/* UTF-16 surrogate values are illegal in UTF-32 */
137
if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
138
--source; /* return to the illegal value itself */
139
result = sourceIllegal;
143
if (target >= targetEnd) {
144
source = oldSource; /* Back up source pointer! */
145
result = targetExhausted; break;
149
*sourceStart = source;
150
*targetStart = target;
152
if (result == sourceIllegal)
154
fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
/* --------------------------------------------------------------------- */

/*
 * Index into the table below with the first byte of a UTF-8 sequence to
 * get the number of trailing bytes that are supposed to follow it.
 * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is
 * left as-is for anyone who may want to do such conversion, which was
 * allowed in earlier algorithms.
 */
static const char trailingBytesForUTF8[256] =
{
    /* 0x00 - 0xBF: no trailing bytes */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xC0 - 0xDF: one trailing byte */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xE0 - 0xEF: two; 0xF0 - 0xF7: three; 0xF8 - 0xFB: four; 0xFC - 0xFF: five */
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};
183
* Magic values subtracted from a buffer value during UTF8 conversion.
184
* This table contains as many values as there might be trailing bytes
185
* in a UTF-8 sequence.
187
static const t_UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
188
0x03C82080UL, 0xFA082080UL, 0x82082080UL };
191
* Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
192
* into the first byte, depending on how many bytes follow. There are
193
* as many entries in this table as there are UTF-8 sequence types.
194
* (I.e., one byte sequence, two byte... etc.). Remember that sequencs
195
* for *legal* UTF-8 will be 4 or fewer bytes total.
197
static const t_UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
/* --------------------------------------------------------------------- */

/* The interface converts a whole buffer to avoid function-call overhead.
 * Constants have been gathered. Loops & conditionals have been removed as
 * much as possible for efficiency, in favor of drop-through switches.
 * (See "Note A" at the bottom of the file for equivalent code.)
 * If your compiler supports it, the "isLegalUTF8" call can be turned
 * into an inline function.
 */

/* --------------------------------------------------------------------- */
211
ConversionResult ConvertUTF16toUTF8 (const t_UTF16** sourceStart, const t_UTF16* sourceEnd, t_UTF8** targetStart, t_UTF8* targetEnd, ConversionFlags flags)
213
ConversionResult result = conversionOK;
214
const t_UTF16* source = *sourceStart;
215
t_UTF8* target = *targetStart;
216
while (source < sourceEnd) {
218
unsigned short bytesToWrite = 0;
219
const t_UTF32 byteMask = 0xBF;
220
const t_UTF32 byteMark = 0x80;
221
const t_UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
223
/* If we have a surrogate pair, convert to UTF32 first. */
224
if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
225
/* If the 16 bits following the high surrogate are in the source buffer... */
226
if (source < sourceEnd) {
227
t_UTF32 ch2 = *source;
228
/* If it's a low surrogate, convert to UTF32. */
229
if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
230
ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
231
+ (ch2 - UNI_SUR_LOW_START) + halfBase;
233
} else if (flags == strictConversion) { /* it's an unpaired high surrogate */
234
--source; /* return to the illegal value itself */
235
result = sourceIllegal;
238
} else { /* We don't have the 16 bits following the high surrogate. */
239
--source; /* return to the high surrogate */
240
result = sourceExhausted;
243
} else if (flags == strictConversion) {
244
/* UTF-16 surrogate values are illegal in UTF-32 */
245
if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
246
--source; /* return to the illegal value itself */
247
result = sourceIllegal;
251
/* Figure out how many bytes the result will require */
252
if (ch < (t_UTF32)0x80) { bytesToWrite = 1;
253
} else if (ch < (t_UTF32)0x800) { bytesToWrite = 2;
254
} else if (ch < (t_UTF32)0x10000) { bytesToWrite = 3;
255
} else if (ch < (t_UTF32)0x110000) { bytesToWrite = 4;
256
} else { bytesToWrite = 3;
257
ch = UNI_REPLACEMENT_CHAR;
260
target += bytesToWrite;
261
if (target > targetEnd) {
262
source = oldSource; /* Back up source pointer! */
263
target -= bytesToWrite; result = targetExhausted; break;
265
switch (bytesToWrite) { /* note: everything falls through. */
266
case 4: *--target = (t_UTF8)((ch | byteMark) & byteMask); ch >>= 6;
267
case 3: *--target = (t_UTF8)((ch | byteMark) & byteMask); ch >>= 6;
268
case 2: *--target = (t_UTF8)((ch | byteMark) & byteMask); ch >>= 6;
269
case 1: *--target = (t_UTF8)(ch | firstByteMark[bytesToWrite]);
271
target += bytesToWrite;
273
*sourceStart = source;
274
*targetStart = target;
278
/* --------------------------------------------------------------------- */
281
* Utility routine to tell whether a sequence of bytes is legal UTF-8.
282
* This must be called with the length pre-determined by the first byte.
283
* If not calling this from ConvertUTF8to*, then the length can be set by:
284
* length = trailingBytesForUTF8[*source]+1;
285
* and the sequence is illegal right away if there aren't that many bytes
287
* If presented with a length > 4, this returns false. The Unicode
288
* definition of UTF-8 goes up to 4-byte sequences.
291
static bool isLegalUTF8(const t_UTF8 *source, int length)
294
const t_UTF8 *srcptr = source+length;
296
default: return false;
297
/* Everything else falls through when "true"... */
298
case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
299
case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
300
case 2: if ((a = (*--srcptr)) > 0xBF) return false;
303
/* no fall-through in this inner switch */
304
case 0xE0: if (a < 0xA0) return false; break;
305
case 0xED: if ((a < 0x80) || (a > 0x9F)) return false; break;
306
case 0xF0: if (a < 0x90) return false; break;
307
case 0xF4: if (a > 0x8F) return false; break;
308
default: if (a < 0x80) return false;
311
case 1: if (*source >= 0x80 && *source < 0xC2) return false;
313
if (*source > 0xF4) return false;
317
/* --------------------------------------------------------------------- */
320
* Exported function to return whether a UTF-8 sequence is legal or not.
321
* This is not used here; it's just exported.
324
bool isLegalUTF8Sequence(const t_UTF8 *source, const t_UTF8 *sourceEnd)
327
if (source == sourceEnd) {
331
length = trailingBytesForUTF8[*source]+1;
332
if (source+length > sourceEnd) {
335
if (!isLegalUTF8(source, length)) {
339
if (source >= sourceEnd) {
346
* This is a variation of isLegalUTF8Sequence() that behaves like g_utf8_validate().
347
* In addition to knowing if the sequence is legal, it also tells you the last good character.
350
tr_utf8_validate( const char * str, int max_len, const char ** end )
352
const t_UTF8* source = (const t_UTF8*) str;
353
const t_UTF8* sourceEnd;
361
sourceEnd = source + ((max_len < 0) ? strlen(str) : (size_t)max_len);
363
if( source == sourceEnd )
366
*end = (const char*) source;
372
const int length = trailingBytesForUTF8[*source] + 1;
373
if (source + length > sourceEnd) {
375
*end = (const char*) source;
378
if (!isLegalUTF8(source, length)) {
380
*end = (const char*) source;
384
if (source >= sourceEnd) {
386
*end = (const char*) source;
395
/* --------------------------------------------------------------------- */
397
ConversionResult ConvertUTF8toUTF16 (const t_UTF8** sourceStart, const t_UTF8* sourceEnd, t_UTF16** targetStart, t_UTF16* targetEnd, ConversionFlags flags)
399
ConversionResult result = conversionOK;
400
const t_UTF8* source = *sourceStart;
401
t_UTF16* target = *targetStart;
402
while (source < sourceEnd) {
404
unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
405
if (source + extraBytesToRead >= sourceEnd) {
406
result = sourceExhausted; break;
408
/* Do this check whether lenient or strict */
409
if (! isLegalUTF8(source, extraBytesToRead+1)) {
410
result = sourceIllegal;
414
* The cases all fall through. See "Note A" below.
416
switch (extraBytesToRead) {
417
case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
418
case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
419
case 3: ch += *source++; ch <<= 6;
420
case 2: ch += *source++; ch <<= 6;
421
case 1: ch += *source++; ch <<= 6;
422
case 0: ch += *source++;
424
ch -= offsetsFromUTF8[extraBytesToRead];
426
if (target >= targetEnd) {
427
source -= (extraBytesToRead+1); /* Back up source pointer! */
428
result = targetExhausted; break;
430
if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
431
/* UTF-16 surrogate values are illegal in UTF-32 */
432
if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
433
if (flags == strictConversion) {
434
source -= (extraBytesToRead+1); /* return to the illegal value itself */
435
result = sourceIllegal;
438
*target++ = UNI_REPLACEMENT_CHAR;
441
*target++ = (t_UTF16)ch; /* normal case */
443
} else if (ch > UNI_MAX_UTF16) {
444
if (flags == strictConversion) {
445
result = sourceIllegal;
446
source -= (extraBytesToRead+1); /* return to the start */
447
break; /* Bail out; shouldn't continue */
449
*target++ = UNI_REPLACEMENT_CHAR;
452
/* target is a character in range 0xFFFF - 0x10FFFF. */
453
if (target + 1 >= targetEnd) {
454
source -= (extraBytesToRead+1); /* Back up source pointer! */
455
result = targetExhausted; break;
458
*target++ = (t_UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
459
*target++ = (t_UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
462
*sourceStart = source;
463
*targetStart = target;
467
/* --------------------------------------------------------------------- */
469
ConversionResult ConvertUTF32toUTF8 (
470
const t_UTF32** sourceStart, const t_UTF32* sourceEnd,
471
t_UTF8** targetStart, t_UTF8* targetEnd, ConversionFlags flags)
473
ConversionResult result = conversionOK;
474
const t_UTF32* source = *sourceStart;
475
t_UTF8* target = *targetStart;
476
while (source < sourceEnd) {
478
unsigned short bytesToWrite = 0;
479
const t_UTF32 byteMask = 0xBF;
480
const t_UTF32 byteMark = 0x80;
482
if (flags == strictConversion ) {
483
/* UTF-16 surrogate values are illegal in UTF-32 */
484
if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
485
--source; /* return to the illegal value itself */
486
result = sourceIllegal;
491
* Figure out how many bytes the result will require. Turn any
492
* illegally large UTF32 things (> Plane 17) into replacement chars.
494
if (ch < (t_UTF32)0x80) { bytesToWrite = 1;
495
} else if (ch < (t_UTF32)0x800) { bytesToWrite = 2;
496
} else if (ch < (t_UTF32)0x10000) { bytesToWrite = 3;
497
} else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4;
498
} else { bytesToWrite = 3;
499
ch = UNI_REPLACEMENT_CHAR;
500
result = sourceIllegal;
503
target += bytesToWrite;
504
if (target > targetEnd) {
505
--source; /* Back up source pointer! */
506
target -= bytesToWrite; result = targetExhausted; break;
508
switch (bytesToWrite) { /* note: everything falls through. */
509
case 4: *--target = (t_UTF8)((ch | byteMark) & byteMask); ch >>= 6;
510
case 3: *--target = (t_UTF8)((ch | byteMark) & byteMask); ch >>= 6;
511
case 2: *--target = (t_UTF8)((ch | byteMark) & byteMask); ch >>= 6;
512
case 1: *--target = (t_UTF8) (ch | firstByteMark[bytesToWrite]);
514
target += bytesToWrite;
516
*sourceStart = source;
517
*targetStart = target;
521
/* --------------------------------------------------------------------- */
523
ConversionResult ConvertUTF8toUTF32 (
524
const t_UTF8** sourceStart, const t_UTF8* sourceEnd,
525
t_UTF32** targetStart, t_UTF32* targetEnd, ConversionFlags flags)
527
ConversionResult result = conversionOK;
528
const t_UTF8* source = *sourceStart;
529
t_UTF32* target = *targetStart;
530
while (source < sourceEnd) {
532
unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
533
if (source + extraBytesToRead >= sourceEnd) {
534
result = sourceExhausted; break;
536
/* Do this check whether lenient or strict */
537
if (! isLegalUTF8(source, extraBytesToRead+1)) {
538
result = sourceIllegal;
542
* The cases all fall through. See "Note A" below.
544
switch (extraBytesToRead) {
545
case 5: ch += *source++; ch <<= 6;
546
case 4: ch += *source++; ch <<= 6;
547
case 3: ch += *source++; ch <<= 6;
548
case 2: ch += *source++; ch <<= 6;
549
case 1: ch += *source++; ch <<= 6;
550
case 0: ch += *source++;
552
ch -= offsetsFromUTF8[extraBytesToRead];
554
if (target >= targetEnd) {
555
source -= (extraBytesToRead+1); /* Back up the source pointer! */
556
result = targetExhausted; break;
558
if (ch <= UNI_MAX_LEGAL_UTF32) {
560
* UTF-16 surrogate values are illegal in UTF-32, and anything
561
* over Plane 17 (> 0x10FFFF) is illegal.
563
if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
564
if (flags == strictConversion) {
565
source -= (extraBytesToRead+1); /* return to the illegal value itself */
566
result = sourceIllegal;
569
*target++ = UNI_REPLACEMENT_CHAR;
574
} else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
575
result = sourceIllegal;
576
*target++ = UNI_REPLACEMENT_CHAR;
579
*sourceStart = source;
580
*targetStart = target;
/* ---------------------------------------------------------------------

    Note A.
    The fall-through switches in UTF-8 reading code save a
    temp variable, some decrements & conditionals.  The switches
    are equivalent to the following loop:
        {
            int tmpBytesToRead = extraBytesToRead+1;
            do {
                ch += *source++;
                --tmpBytesToRead;
                if (tmpBytesToRead) ch <<= 6;
            } while (tmpBytesToRead > 0);
        }
    In UTF-8 writing code, the switches on "bytesToWrite" are
    similarly unrolled loops.

   --------------------------------------------------------------------- */