1
/*********************************************************
2
* Copyright (C) 2010 VMware, Inc. All rights reserved.
4
* This program is free software; you can redistribute it and/or modify it
5
* under the terms of the GNU Lesser General Public License as published
6
* by the Free Software Foundation version 2.1 and no later version.
8
* This program is distributed in the hope that it will be useful, but
9
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
10
* or FITNESS FOR A PARTICULAR PURPOSE. See the Lesser GNU General Public
11
* License for more details.
13
* You should have received a copy of the GNU Lesser General Public License
14
* along with this program; if not, write to the Free Software Foundation, Inc.,
15
* 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
17
*********************************************************/
22
* Character set and encoding conversion functions, without ICU.
31
*-----------------------------------------------------------------------------
35
* Parse the next UTF-8 sequence.
39
* Length of sequence and Unicode character in *uchar on success.
44
*-----------------------------------------------------------------------------
48
CodeSet_GetUtf8(const char *string, // IN: string
49
const char *end, // IN: end of string
50
uint32 *uchar) // OUT: the Unicode character
52
uint8 *p = (uint8 *) string;
61
// ASCII: U+0000 - U+007F: 1 byte of UTF-8.
66
if ((c < 0xc2) || (c > 0xf4)) {
67
// 0x80 to 0xbf are continuation bytes, not valid first bytes
68
// 0xc0 and 0xc1 cannot appear in UTF-8, see below
69
// a leading byte > 0xf4 would encode a value above U+10FFFF, illegal as well
74
// U+0080 - U+07FF: 2 bytes of UTF-8.
77
} else if (c < 0xf0) {
78
// U+0800 - U+FFFF: 3 bytes of UTF-8.
82
// U+10000 - U+10FFFF: 4 bytes of UTF-8.
87
if ((e = p + len) > (uint8 *) end) {
93
if ((*p & 0xc0) != 0x80) {
102
* Enforce shortest encoding.
103
* UTF-8 mandates that shortest possible encoding is used,
104
* as otherwise doing UTF-8 => anything => UTF-8 could bypass some
105
* important tests, like '/' for path separator or \0 for string
108
* This test does not work for len == 2, but that case is handled
109
* by requiring the first byte to be 0xc2 or greater (see above).
112
if (c < 1U << (len * 5 - 4)) {
126
*-----------------------------------------------------------------------------
128
* CodeSet_LengthInCodePoints --
130
* Return the length of a UTF8 string in code points (the number of
131
* Unicode characters present in the string, not the length of the
134
* Like strlen, the length returned does not include the terminating NUL.
142
*-----------------------------------------------------------------------------
146
CodeSet_LengthInCodePoints(const char *utf8) // IN:
150
uint32 codePoints = 0;
155
end = p + strlen(utf8);
159
uint32 len = CodeSet_GetUtf8(p, end, &utf32);
174
*-----------------------------------------------------------------------------
176
* CodeSet_UTF8ToUTF32 --
178
* Convert a UTF8 string into a UTF32 string. The result is returned as a
179
* dynamically allocated string that the caller is responsible for.
182
* TRUE Input string was valid, converted string in *utf32
183
* FALSE Input string was invalid or internal error
188
*-----------------------------------------------------------------------------
192
CodeSet_UTF8ToUTF32(const char *utf8, // IN:
193
char **utf32) // OUT:
202
if (utf8 == NULL) { // NULL is not an error
208
codePoints = CodeSet_LengthInCodePoints(utf8);
209
if (codePoints == -1) {
216
end = p + strlen(utf8);
218
ptr = Util_SafeMalloc(sizeof(*ptr) * (codePoints + 1));
219
*utf32 = (char *) ptr;
222
p += CodeSet_GetUtf8(p, end, ptr++);
232
*-----------------------------------------------------------------------------
234
* CodeSet_UTF32ToUTF8 --
236
* Convert a UTF32 string into a UTF8 string. The result is returned as a
237
* dynamically allocated string that the caller is responsible for.
240
* TRUE Input string was valid, converted string in *utf8
241
* FALSE Input string was invalid or internal error
246
*-----------------------------------------------------------------------------
251
CodeSet_UTF32ToUTF8(const char *utf32, // IN:
265
if (utf32 == NULL) { // NULL is not an error
272
* Determine the length of the UTF32 string. A UTF32 string terminates
273
* with four (4) bytes of zero (0).
280
value.bytes[0] = *p++;
281
value.bytes[1] = *p++;
282
value.bytes[2] = *p++;
283
value.bytes[3] = *p++;
285
if (value.word == 0) {
293
* Now that we know the length, allocate the memory for the UTF8 string.
294
* The UTF8 string length calculation ensures that there will always be
295
* sufficient space to represent the UTF32 string. Most of the time this
296
* will involve allocating too much memory; however, the memory wastage
297
* will be very short lived and very small.
300
*utf8 = Util_SafeMalloc((4 * len) + 1); // cover the NUL byte
303
* Process the UTF32 string, converting each code point into its
310
for (i = 0; i < len; i++) {
311
value.bytes[0] = *p++;
312
value.bytes[1] = *p++;
313
value.bytes[2] = *p++;
314
value.bytes[3] = *p++;
316
if (value.word < 0x80) { // One byte case (ASCII)
318
} else if (value.word < 0x800) { // Two byte case
319
*q++ = 0xC0 | (value.word >> 6);
320
*q++ = 0x80 | (value.word & 0x3F);
321
} else if (value.word < 0x10000) { // Three byte case
322
*q++ = 0xE0 | (value.word >> 12);
323
*q++ = 0x80 | ((value.word >> 6) & 0x3F);
324
*q++ = 0x80 | (value.word & 0x3F);
325
} else if (value.word < 0x110000) { // Four byte case
326
*q++ = 0xF0 | (value.word >> 18);
327
*q++ = 0x80 | ((value.word >> 12) & 0x3F);
328
*q++ = 0x80 | ((value.word >> 6) & 0x3F);
329
*q++ = 0x80 | (value.word & 0x3F);
330
} else { // INVALID VALUE!