43
43
#include "unicodeBase.h"
44
44
#include "unicodeInt.h"
47
48
* Padding for initial and final bytes used by an encoding. The value
48
49
* comes from ICU's UCNV_GET_MAX_BYTES_FOR_STRING macro and accounts
49
50
* for leading and trailing bytes and NUL.
51
53
static const size_t UNICODE_UTF16_CODE_UNITS_PADDING = 10;
54
57
*-----------------------------------------------------------------------------
62
65
* Otherwise, buffer must be of the specified length, but does
63
66
* not need to be NUL-terminated.
65
* Note that regardless of the encoding of the buffer passed to this
66
* function, the returned string can hold any Unicode characters.
68
* If the buffer contains an invalid sequence of the specified
69
* encoding or memory could not be allocated, returns NULL.
68
* Return NULL on memory allocation failure.
70
* Return NULL if strict is true and the buffer contains an invalid
71
* sequence in the specified encoding.
73
* If strict is false, then an invalid sequence is replaced by
74
* a substitution character, which is either the Unicode
75
* substitution character (U+FFFD or \xef\xbf\xbd in UTF-8)
76
* or subchar1 (ASCII SUB or control-z, value 0x1a).
72
79
* An allocated Unicode string containing the decoded characters
83
90
UnicodeAllocInternal(const void *buffer, // IN
84
91
ssize_t lengthInBytes, // IN
85
StringEncoding encoding) // IN
92
StringEncoding encoding, // IN
87
95
char *utf8Result = NULL;
89
97
ASSERT(buffer != NULL);
98
ASSERT(lengthInBytes >= 0);
99
ASSERT(Unicode_IsEncodingValid(encoding));
91
encoding = Unicode_ResolveEncoding(encoding);
102
CodeSet_GenericToGeneric(Unicode_EncodingEnumToName(encoding),
103
buffer, lengthInBytes,
104
"UTF-8", CSGTG_TRANSLIT, &utf8Result, NULL);
93
108
switch (encoding) {
94
109
case STRING_ENCODING_US_ASCII:
96
* Fall through and treat as a special case of UTF-8.
97
* Unicode_AllocWithLength() has already ensured we've gotten
98
* only 7-bit bytes in 'buffer'.
100
110
case STRING_ENCODING_UTF8:
103
const char *utf8Str = (const char *)buffer;
105
if (lengthInBytes == -1) {
106
lengthInBytes = strlen(utf8Str);
109
// Ensure the input is valid UTF-8.
110
if (CodeSet_Utf8ToUtf16le(utf8Str,
115
utf8Result = Util_SafeStrndup(utf8Str, lengthInBytes);
111
if (Unicode_IsBufferValid(buffer, lengthInBytes, encoding)) {
112
utf8Result = Util_SafeStrndup(buffer, lengthInBytes);
119
case STRING_ENCODING_UTF16:
120
115
case STRING_ENCODING_UTF16_LE:
121
if (lengthInBytes == -1) {
122
lengthInBytes = Unicode_UTF16Strlen((const utf16_t *)buffer) * 2;
125
116
// utf8Result will be left NULL on failure.
126
117
CodeSet_Utf16leToUtf8((const char *)buffer,
132
if (lengthInBytes == -1) {
134
* TODO: This doesn't work for UTF-16 BE, UTF-32, and other
135
* encodings with embedded NULs.
137
lengthInBytes = strlen((const char *)buffer);
139
123
CodeSet_GenericToGeneric(Unicode_EncodingEnumToName(encoding),
140
124
buffer, lengthInBytes,
141
"UTF-8", CSGTG_NORMAL, &utf8Result, NULL);
125
"UTF-8", 0, &utf8Result, NULL);
150
134
*-----------------------------------------------------------------------------
136
* Unicode_IsBufferValid --
138
* Tests if the given buffer is valid in the specified encoding.
140
* If lengthInBytes is -1, then buffer must be NUL-terminated.
141
* Otherwise, buffer must be of the specified length, but does
142
* not need to be NUL-terminated.
145
* TRUE if the buffer is valid, FALSE if it's not.
150
*-----------------------------------------------------------------------------
154
Unicode_IsBufferValid(const void *buffer, // IN
155
ssize_t lengthInBytes, // IN
156
StringEncoding encoding) // IN
158
if (buffer == NULL) {
159
ASSERT(lengthInBytes <= 0);
163
encoding = Unicode_ResolveEncoding(encoding);
165
if (encoding == STRING_ENCODING_US_ASCII) {
166
return UnicodeSanityCheck(buffer, lengthInBytes, encoding);
169
if (lengthInBytes == -1) {
170
lengthInBytes = Unicode_LengthInBytes(buffer, encoding);
173
return CodeSet_Validate(buffer, lengthInBytes,
174
Unicode_EncodingEnumToName(encoding));
179
*-----------------------------------------------------------------------------
152
181
* Unicode_Duplicate --
154
183
* Allocates and returns a copy of the passed-in Unicode string.
436
465
// TODO: Lots more encodings can be added here.
437
466
basicCodePointSize = supplementaryCodePointSize = 1;
439
case STRING_ENCODING_UTF16:
440
468
case STRING_ENCODING_UTF16_LE:
441
469
case STRING_ENCODING_UTF16_BE:
470
case STRING_ENCODING_UTF16_XE:
442
471
basicCodePointSize = 2;
443
472
supplementaryCodePointSize = 4;
445
case STRING_ENCODING_UTF32:
446
474
case STRING_ENCODING_UTF32_LE:
447
475
case STRING_ENCODING_UTF32_BE:
476
case STRING_ENCODING_UTF32_XE:
448
477
basicCodePointSize = 4;
449
478
supplementaryCodePointSize = 4;
667
return UnicodeGetAllocBytesInternal(str, encoding, NULL);
695
return UnicodeGetAllocBytesInternal(str, encoding, -1, NULL);
700
*-----------------------------------------------------------------------------
702
* Unicode_GetAllocBytesWithLength --
704
* Allocates and returns a buffer into which the contents of the unicode
705
* string of the specified length are extracted using the specified
708
* NOTE: The buffer returned is always NUL terminated. The length of
709
* the NUL can depend on the encoding. UTF-16 NUL is "\0\0";
710
* UTF-32 NUL is "\0\0\0\0".
712
* NULL is returned for NULL argument.
715
* NULL if argument is NULL.
716
* Otherwise, pointer to the dynamically allocated memory
717
* or NULL on conversion failure.
718
* The caller is responsible to free the memory allocated
724
*-----------------------------------------------------------------------------
728
Unicode_GetAllocBytesWithLength(ConstUnicode str, // IN:
729
StringEncoding encoding, // IN:
730
ssize_t lengthInBytes) // IN:
735
ASSERT(lengthInBytes >= 0);
737
return UnicodeGetAllocBytesInternal(str, encoding, lengthInBytes, NULL);
693
763
UnicodeGetAllocBytesInternal(ConstUnicode ustr, // IN
694
764
StringEncoding encoding, // IN
765
ssize_t lengthInBytes, // IN
695
766
size_t *retLength) // OUT: optional
697
768
const char *utf8Str = ustr;
699
769
char *result = NULL;
701
771
ASSERT(ustr != NULL);
703
773
encoding = Unicode_ResolveEncoding(encoding);
705
len = strlen(utf8Str);
775
if (lengthInBytes == -1) {
776
lengthInBytes = Unicode_LengthInBytes(ustr, STRING_ENCODING_UTF8);
707
779
switch (encoding) {
708
780
case STRING_ENCODING_US_ASCII:
709
if (!UnicodeSanityCheck(utf8Str, len, encoding)) {
781
if (!UnicodeSanityCheck(utf8Str, lengthInBytes, encoding)) {
713
785
case STRING_ENCODING_UTF8:
714
result = Util_SafeMalloc(len + 1);
715
memcpy(result, utf8Str, len + 1);
786
result = Util_SafeMalloc(lengthInBytes + 1);
787
memcpy(result, utf8Str, lengthInBytes + 1);
716
788
if (retLength != NULL) {
789
*retLength = lengthInBytes;
721
case STRING_ENCODING_UTF16:
722
793
case STRING_ENCODING_UTF16_LE:
723
if (!CodeSet_Utf8ToUtf16le(utf8Str, len, &result, retLength)) {
794
if (!CodeSet_Utf8ToUtf16le(utf8Str, lengthInBytes, &result, retLength)) {
724
795
// input should be valid UTF-8, no conversion error possible
725
796
ASSERT_MEM_ALLOC(FALSE);
730
if (!CodeSet_GenericToGeneric("UTF-8", utf8Str, len,
801
if (!CodeSet_GenericToGeneric("UTF-8", utf8Str, lengthInBytes,
731
802
Unicode_EncodingEnumToName(encoding),
733
804
&result, retLength)) {