2
******************************************************************************
4
* Copyright (C) 1998-2001, International Business Machines
5
* Corporation and others. All Rights Reserved.
7
******************************************************************************
11
* Modification History:
13
* Date Name Description
14
* 12/07/98 bertrand Creation.
15
******************************************************************************
18
#include "unicode/utypes.h"
19
#include "unicode/ustring.h"
20
#include "unicode/putil.h"
21
#include "unicode/ucnv.h"
29
/* forward declarations of definitions for the shared default converter */
31
static UConverter *gDefaultConverter = NULL;
33
/* ANSI string.h - style functions ------------------------------------------ */
35
#define MAX_STRLEN 0x0FFFFFFF
37
/* ---- String searching functions ---- */
39
U_CAPI UChar* U_EXPORT2
40
u_strchr(const UChar *s, UChar c)
42
while (*s && *s != c) {
50
/* A Boyer-Moore algorithm would be better, but that would require a hashtable
51
because UChar is so big. This algorithm doesn't use a lot of extra memory.
53
U_CAPI UChar * U_EXPORT2
54
u_strstr(const UChar *s, const UChar *substring) {
56
UChar *strItr, *subItr;
58
if (*substring == 0) {
64
subItr = (UChar *)substring;
66
/* Only one string iterator needs checking for null terminator */
67
while ((*strItr != 0) && (*strItr == *subItr)) {
72
if (*subItr == 0) { /* Was the end of the substring reached? */
77
} while (*strItr != 0); /* Was the end of the string reached? */
79
return NULL; /* No match */
82
U_CAPI UChar * U_EXPORT2
83
u_strchr32(const UChar *s, UChar32 c) {
85
/* non-surrogate BMP code point */
86
return u_strchr(s, (UChar)c);
87
} else if(c <= 0xdfff) {
88
/* surrogate code point */
92
t = u_strchr(s, (UChar)c);
97
UTF_IS_SURROGATE_FIRST(*t) ?
98
UTF_IS_TRAIL(*(t+1)) :
99
(s<t && UTF_IS_LEAD(*(t-1)))
101
/* matched surrogate, not a surrogate code point, continue searching */
108
} else if(c <= 0xffff) {
109
/* non-surrogate BMP code point */
110
return u_strchr(s, (UChar)c);
112
/* supplementary code point, search for string */
115
buffer[0] = UTF16_LEAD(c);
116
buffer[1] = UTF16_TRAIL(c);
118
return u_strstr(s, buffer);
122
/* Search for a codepoint in a string that matches one of the matchSet codepoints. */
123
U_CAPI UChar * U_EXPORT2
124
u_strpbrk(const UChar *string, const UChar *matchSet)
129
for (matchLen = 0; matchSet[matchLen]; matchLen++)
131
if (!UTF_IS_SINGLE(matchSet[matchLen]))
139
const UChar *matchItr;
142
for (strItr = string; *strItr; strItr++)
144
for (matchItr = matchSet; *matchItr; matchItr++)
146
if (*matchItr == *strItr)
148
return (UChar *)strItr;
157
UChar32 stringCh, matchSetCh;
158
int32_t stringLen = u_strlen(string);
160
for (strItr = 0; strItr < stringLen; strItr++)
162
UTF_GET_CHAR_SAFE(string, 0, strItr, stringLen, stringCh, TRUE);
163
for (matchItr = 0; matchItr < matchLen; matchItr++)
165
UTF_GET_CHAR_SAFE(matchSet, 0, matchItr, matchLen, matchSetCh, TRUE);
166
if (stringCh == matchSetCh && (stringCh != UTF_ERROR_VALUE
167
|| string[strItr] == UTF_ERROR_VALUE
168
|| (matchSetCh == UTF_ERROR_VALUE && !UTF_IS_SINGLE(matchSet[matchItr]))))
170
return (UChar *)string + strItr;
176
/* Didn't find it. */
180
/* Return the offset of the first codepoint in string that matches one of the matchSet codepoints, or the length of string if none matches. */
181
U_CAPI int32_t U_EXPORT2
182
u_strcspn(const UChar *string, const UChar *matchSet)
184
const UChar *foundStr = u_strpbrk(string, matchSet);
185
if (foundStr == NULL)
187
return u_strlen(string);
189
return foundStr - string;
192
/* Search for a codepoint in a string that does not match one of the matchSet codepoints. */
193
U_CAPI int32_t U_EXPORT2
194
u_strspn(const UChar *string, const UChar *matchSet)
201
for (matchLen = 0; matchSet[matchLen]; matchLen++)
203
if (!UTF_IS_SINGLE(matchSet[matchLen]))
211
const UChar *matchItr;
214
for (strItr = string; *strItr && match; strItr++)
217
for (matchItr = matchSet; *matchItr; matchItr++)
219
if (*matchItr == *strItr)
226
retValue = strItr - string - (match == FALSE);
232
UChar32 stringCh, matchSetCh;
233
int32_t stringLen = u_strlen(string);
235
for (strItr = 0; strItr < stringLen && match; strItr++)
238
UTF_GET_CHAR_SAFE(string, 0, strItr, stringLen, stringCh, TRUE);
239
for (matchItr = 0; matchItr < matchLen; matchItr++)
241
UTF_GET_CHAR_SAFE(matchSet, 0, matchItr, matchLen, matchSetCh, TRUE);
242
if (stringCh == matchSetCh && (stringCh != UTF_ERROR_VALUE
243
|| string[strItr] == UTF_ERROR_VALUE
244
|| (matchSetCh == UTF_ERROR_VALUE && !UTF_IS_SINGLE(matchSet[matchItr]))))
251
retValue = strItr - (match == FALSE);
254
/* Found a mismatch or didn't find it. */
258
/* ----- Text manipulation functions --- */
260
U_CAPI UChar* U_EXPORT2
261
u_strtok_r(UChar *src,
267
uint32_t nonDelimIdx;
271
*saveState = src; /* Set to "src" in case there are no delimiters */
273
else if (saveState && *saveState) {
274
tokSource = *saveState;
280
/* Skip initial delimiters */
281
nonDelimIdx = u_strspn(tokSource, delim);
282
tokSource = &tokSource[nonDelimIdx];
285
nextToken = u_strpbrk(tokSource, delim);
286
if (nextToken != NULL) {
289
*saveState = nextToken;
292
else if (saveState && *saveState) {
293
/* Return the last token */
299
/* No tokens were found. Only delimiters were left. */
305
U_CAPI UChar* U_EXPORT2
309
UChar *anchor = dst; /* save a pointer to start of dst */
311
while(*dst != 0) { /* To end of first string */
314
while((*(dst++) = *(src++)) != 0) { /* copy string 2 over */
320
U_CAPI UChar* U_EXPORT2
321
u_strncat(UChar *dst,
326
UChar *anchor = dst; /* save a pointer to start of dst */
328
while(*dst != 0) { /* To end of first string */
331
while((*dst = *src) != 0) { /* copy string 2 over */
346
/* ----- Text property functions --- */
348
U_CAPI int32_t U_EXPORT2
349
u_strcmp(const UChar *s1,
357
if (c1 != c2 || c1 == 0) {
361
return (int32_t)c1 - (int32_t)c2;
364
/* rotate surrogates to the top to get code point order; assume c>=0xd800 */
365
#define UTF16FIXUP(c) { \
366
if ((c) >= 0xe000) { \
374
/* String compare in code point order - u_strcmp() compares in code unit order. */
375
U_CAPI int32_t U_EXPORT2
376
u_strcmpCodePointOrder(const UChar *s1, const UChar *s2) {
379
/* compare identical prefixes - they do not need to be fixed up */
391
/* if both values are in or above the surrogate range, Fix them up. */
392
if (c1 >= 0xD800 && c2 >= 0xD800) {
397
/* now c1 and c2 are in UTF-32-compatible order */
398
return (int32_t)c1-(int32_t)c2;
401
U_CAPI int32_t U_EXPORT2
402
u_strncmp(const UChar *s1,
409
rc = (int32_t)*s1 - (int32_t)*s2;
410
if(rc != 0 || *s1 == 0 || --n == 0) {
421
U_CAPI int32_t U_EXPORT2
422
u_strncmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t n) {
429
/* compare identical prefixes - they do not need to be fixed up */
434
if(c1==0 || --n==0) {
444
/* c1!=c2, fix up each one if they're both in or above the surrogate range, then compare them */
445
if (c1 >= 0xD800 && c2 >= 0xD800) {
450
/* now c1 and c2 are in UTF-32-compatible order */
451
return (int32_t)c1-(int32_t)c2;
454
U_CAPI UChar* U_EXPORT2
458
UChar *anchor = dst; /* save a pointer to start of dst */
460
while((*(dst++) = *(src++)) != 0) { /* copy string 2 over */
466
U_CAPI UChar* U_EXPORT2
467
u_strncpy(UChar *dst,
471
UChar *anchor = dst; /* save a pointer to start of dst */
473
/* copy string 2 over */
474
while(n > 0 && (*(dst++) = *(src++)) != 0) {
481
U_CAPI int32_t U_EXPORT2
482
u_strlen(const UChar *s)
484
#if U_SIZEOF_WCHAR_T == U_SIZEOF_UCHAR
485
return uprv_wcslen(s);
495
U_CAPI int32_t U_EXPORT2
496
u_countChar32(const UChar *s, int32_t length) {
499
if(s==NULL || length<-1) {
507
if(UTF_IS_LEAD(*s) && length>=2 && UTF_IS_TRAIL(*(s+1))) {
515
} else /* length==-1 */ {
525
* sufficient to look ahead one because of UTF-16;
526
* safe to look ahead one because at worst that would be the terminating NUL
528
if(UTF_IS_LEAD(c) && UTF_IS_TRAIL(*s)) {
536
U_CAPI UChar * U_EXPORT2
537
u_memcpy(UChar *dest, const UChar *src, int32_t count) {
538
return (UChar *)uprv_memcpy(dest, src, count*U_SIZEOF_UCHAR);
541
U_CAPI UChar * U_EXPORT2
542
u_memmove(UChar *dest, const UChar *src, int32_t count) {
543
return (UChar *)uprv_memmove(dest, src, count*U_SIZEOF_UCHAR);
546
U_CAPI UChar * U_EXPORT2
547
u_memset(UChar *dest, UChar c, int32_t count) {
550
UChar *limit = dest + count;
552
while (ptr < limit) {
559
U_CAPI int32_t U_EXPORT2
560
u_memcmp(UChar *buf1, UChar *buf2, int32_t count) {
562
UChar *limit = buf1 + count;
565
while (buf1 < limit) {
566
result = (int32_t)(uint16_t)*buf1 - (int32_t)(uint16_t)*buf2;
577
U_CAPI int32_t U_EXPORT2
578
u_memcmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t count) {
588
/* compare identical prefixes - they do not need to be fixed up */
602
/* c1!=c2, fix up each one if they're both in or above the surrogate range, then compare them */
603
if (c1 >= 0xD800 && c2 >= 0xD800) {
608
/* now c1 and c2 are in UTF-32-compatible order */
609
return (int32_t)c1-(int32_t)c2;
612
U_CAPI UChar * U_EXPORT2
613
u_memchr(UChar *src, UChar ch, int32_t count) {
616
UChar *limit = src + count;
622
} while (++ptr < limit);
627
U_CAPI UChar * U_EXPORT2
628
u_memchr32(UChar *src, UChar32 ch, int32_t count) {
629
if(count<=0 || (uint32_t)ch>0x10ffff) {
630
return NULL; /* no string, or illegal arguments */
634
return u_memchr(src, (UChar)ch, count); /* BMP, single UChar */
636
return NULL; /* too short for a surrogate pair */
638
const UChar *limit=src+count-1; /* -1 so that we do not need a separate check for the trail unit */
639
UChar lead=UTF16_LEAD(ch), trail=UTF16_TRAIL(ch);
642
if(*src==lead && *(src+1)==trail) {
645
} while(++src<limit);
650
/* conversions between char* and UChar* ------------------------------------- */
653
returns the minimum of (the length of the null-terminated string) and n.
655
static int32_t u_astrnlen(const char *s1, int32_t n)
661
while (*(s1++) && n--)
669
U_CAPI UChar* U_EXPORT2
670
u_uastrncpy(UChar *ucs1,
674
UChar *target = ucs1;
675
UErrorCode err = U_ZERO_ERROR;
676
UConverter *cnv = u_getDefaultConverter(&err);
677
if(U_SUCCESS(err) && cnv != NULL) {
683
s2+u_astrnlen(s2, n),
687
ucnv_reset(cnv); /* be good citizens */
688
u_releaseDefaultConverter(cnv);
689
if(U_FAILURE(err) && (err != U_BUFFER_OVERFLOW_ERROR) ) {
690
*ucs1 = 0; /* failure */
692
if(target < (ucs1+n)) { /* U_BUFFER_OVERFLOW_ERROR isn't an err, just means no termination will happen. */
693
*target = 0; /* terminate */
701
U_CAPI UChar* U_EXPORT2
702
u_uastrcpy(UChar *ucs1,
705
UErrorCode err = U_ZERO_ERROR;
706
UConverter *cnv = u_getDefaultConverter(&err);
707
if(U_SUCCESS(err) && cnv != NULL) {
714
u_releaseDefaultConverter(cnv);
725
returns the minimum of (the length of the null-terminated string) and n.
727
static int32_t u_ustrnlen(const UChar *ucs1, int32_t n)
733
while (*(ucs1++) && n--)
741
U_CAPI char* U_EXPORT2
742
u_austrncpy(char *s1,
747
UErrorCode err = U_ZERO_ERROR;
748
UConverter *cnv = u_getDefaultConverter(&err);
749
if(U_SUCCESS(err) && cnv != NULL) {
751
ucnv_fromUnicode(cnv,
755
ucs2+u_ustrnlen(ucs2, n),
759
ucnv_reset(cnv); /* be good citizens */
760
u_releaseDefaultConverter(cnv);
761
if(U_FAILURE(err) && (err != U_BUFFER_OVERFLOW_ERROR) ) {
762
*s1 = 0; /* failure */
764
if(target < (s1+n)) { /* U_BUFFER_OVERFLOW_ERROR isn't an err, just means no termination will happen. */
765
*target = 0; /* terminate */
773
U_CAPI char* U_EXPORT2
777
UErrorCode err = U_ZERO_ERROR;
778
UConverter *cnv = u_getDefaultConverter(&err);
779
if(U_SUCCESS(err) && cnv != NULL) {
780
int32_t len = ucnv_fromUChars(cnv,
786
u_releaseDefaultConverter(cnv);
794
/* mutexed access to a shared default converter ----------------------------- */
796
UBool ustring_cleanup(void) {
797
if (gDefaultConverter) {
798
ucnv_close(gDefaultConverter);
799
gDefaultConverter = NULL;
802
/* it's safe to close a 0 converter */
806
U_CAPI UConverter* U_EXPORT2
807
u_getDefaultConverter(UErrorCode *status)
809
UConverter *converter = NULL;
811
if (gDefaultConverter != NULL) {
814
/* need to check to make sure it wasn't taken out from under us */
815
if (gDefaultConverter != NULL) {
816
converter = gDefaultConverter;
817
gDefaultConverter = NULL;
822
/* if the cache was empty, create a converter */
823
if(converter == NULL) {
824
converter = ucnv_open(NULL, status);
825
if(U_FAILURE(*status)) {
833
U_CAPI void U_EXPORT2
834
u_releaseDefaultConverter(UConverter *converter)
836
if(gDefaultConverter == NULL) {
837
if (converter != NULL) {
838
ucnv_reset(converter);
842
if(gDefaultConverter == NULL) {
843
gDefaultConverter = converter;
849
if(converter != NULL) {
850
ucnv_close(converter);
854
/* u_unescape & support fns ------------------------------------------------- */
856
/* This map must be in ASCENDING ORDER OF THE ESCAPE CODE */
857
static const UChar UNESCAPE_MAP[] = {
870
enum { UNESCAPE_MAP_LENGTH = sizeof(UNESCAPE_MAP) / sizeof(UNESCAPE_MAP[0]) };
872
/* Convert one octal digit to a numeric value 0..7, or -1 on failure */
873
static int8_t _digit8(UChar c) {
874
if (c >= 0x0030 && c <= 0x0037) {
875
return (int8_t)(c - 0x0030);
880
/* Convert one hex digit to a numeric value 0..F, or -1 on failure */
881
static int8_t _digit16(UChar c) {
882
if (c >= 0x0030 && c <= 0x0039) {
883
return (int8_t)(c - 0x0030);
885
if (c >= 0x0041 && c <= 0x0046) {
886
return (int8_t)(c - (0x0041 - 10));
888
if (c >= 0x0061 && c <= 0x0066) {
889
return (int8_t)(c - (0x0061 - 10));
894
/* Parse a single escape sequence. Although this method deals in
895
* UChars, it does not use C++ or UnicodeString. This allows it to
896
* be used from C contexts. */
897
U_CAPI UChar32 U_EXPORT2
898
u_unescapeAt(UNESCAPE_CHAR_AT charAt,
903
int32_t start = *offset;
909
int8_t bitsPerDigit = 4;
913
/* Check that offset is in range */
914
if (*offset < 0 || *offset >= length) {
918
/* Fetch first UChar after '\\' */
919
c = charAt((*offset)++, context);
921
/* Convert hexadecimal and octal escapes */
938
n = 1; /* Already have first octal digit */
945
while (*offset < length && n < maxDig) {
946
c = charAt(*offset, context);
947
dig = (int8_t)((bitsPerDigit == 3) ? _digit8(c) : _digit16(c));
951
result = (result << bitsPerDigit) | dig;
961
/* Convert C-style escapes in table */
962
for (i=0; i<UNESCAPE_MAP_LENGTH; i+=2) {
963
if (c == UNESCAPE_MAP[i]) {
964
return UNESCAPE_MAP[i+1];
965
} else if (c < UNESCAPE_MAP[i]) {
970
/* If no special forms are recognized, then consider
971
* the backslash to generically escape the next character.
972
* Deal with surrogate pairs. */
973
if (UTF_IS_FIRST_SURROGATE(c) && *offset < length) {
974
UChar c2 = charAt(*offset, context);
975
if (UTF_IS_SECOND_SURROGATE(c2)) {
977
return UTF16_GET_PAIR_VALUE(c, c2);
983
/* Invalid escape sequence */
984
*offset = start; /* Reset to initial value */
985
return (UChar32)0xFFFFFFFF;
988
/* u_unescapeAt() callback to return a UChar from a char* */
989
static UChar _charPtr_charAt(int32_t offset, void *context) {
991
/* It would be more efficient to access the invariant tables
992
* directly but there is no API for that. */
993
u_charsToUChars(((char*) context) + offset, &c16, 1);
997
/* Append an escape-free segment of the text; used by u_unescape() */
998
static void _appendUChars(UChar *dest, int32_t destCapacity,
999
const char *src, int32_t srcLen) {
1000
if (destCapacity < 0) {
1003
if (srcLen > destCapacity) {
1004
srcLen = destCapacity;
1006
u_charsToUChars(src, dest, srcLen);
1009
/* Do an invariant conversion of char* -> UChar*, with escape parsing */
1010
U_CAPI int32_t U_EXPORT2
1011
u_unescape(const char *src, UChar *dest, int32_t destCapacity) {
1012
const char *segment = src;
1016
while ((c=*src) != 0) {
1017
/* '\\' intentionally written as compiler-specific
1018
* character constant to correspond to compiler-specific
1019
* char* constants. */
1021
int32_t lenParsed = 0;
1023
if (src != segment) {
1025
_appendUChars(dest + i, destCapacity - i,
1026
segment, src - segment);
1030
++src; /* advance past '\\' */
1031
c32 = u_unescapeAt(_charPtr_charAt, &lenParsed, uprv_strlen(src), (void*)src);
1032
if (lenParsed == 0) {
1035
src += lenParsed; /* advance past escape seq. */
1036
if (dest != NULL && UTF_CHAR_LENGTH(c32) <= (destCapacity - i)) {
1037
UTF_APPEND_CHAR_UNSAFE(dest, i, c32);
1039
i += UTF_CHAR_LENGTH(c32);
1046
if (src != segment) {
1048
_appendUChars(dest + i, destCapacity - i,
1049
segment, src - segment);
1053
if (dest != NULL && i < destCapacity) {
1056
return i + 1; /* add 1 for zero term */
1059
if (dest != NULL && destCapacity > 0) {
1065
/* C UGrowBuffer implementation --------------------------------------------- */
1067
U_CAPI UBool /* U_CALLCONV U_EXPORT2 */
1068
u_growBufferFromStatic(void *context,
1069
UChar **pBuffer, int32_t *pCapacity, int32_t reqCapacity,
1071
UChar *newBuffer=(UChar *)uprv_malloc(reqCapacity*U_SIZEOF_UCHAR);
1072
if(newBuffer!=NULL) {
1074
uprv_memcpy(newBuffer, *pBuffer, length*U_SIZEOF_UCHAR);
1076
*pCapacity=reqCapacity;
1081
/* release the old pBuffer if it was not statically allocated */
1082
if(*pBuffer!=(UChar *)context) {
1083
uprv_free(*pBuffer);
1087
return (UBool)(newBuffer!=NULL);
1090
/* NUL-termination of strings ----------------------------------------------- */
1093
* NUL-terminate a string no matter what its type.
1094
* Set warning and error codes accordingly.
1096
#define __TERMINATE_STRING(dest, destCapacity, length, pErrorCode) \
1097
if(pErrorCode!=NULL && U_SUCCESS(*pErrorCode)) { \
1098
/* not a public function, so no complete argument checking */ \
1101
/* assume that the caller handles this */ \
1102
} else if(length<destCapacity) { \
1103
/* NUL-terminate the string, the NUL fits */ \
1105
/* unset the not-terminated warning but leave all others */ \
1106
if(*pErrorCode==U_STRING_NOT_TERMINATED_WARNING) { \
1107
*pErrorCode=U_ZERO_ERROR; \
1109
} else if(length==destCapacity) { \
1110
/* unable to NUL-terminate, but the string itself fit - set a warning code */ \
1111
*pErrorCode=U_STRING_NOT_TERMINATED_WARNING; \
1112
} else /* length>destCapacity */ { \
1113
/* even the string itself did not fit - set an error code */ \
1114
*pErrorCode=U_BUFFER_OVERFLOW_ERROR; \
1118
U_CAPI int32_t U_EXPORT2
1119
u_terminateUChars(UChar *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) {
1120
__TERMINATE_STRING(dest, destCapacity, length, pErrorCode);
1124
U_CAPI int32_t U_EXPORT2
1125
u_terminateChars(char *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) {
1126
__TERMINATE_STRING(dest, destCapacity, length, pErrorCode);
1130
U_CAPI int32_t U_EXPORT2
1131
u_terminateUChar32s(UChar32 *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) {
1132
__TERMINATE_STRING(dest, destCapacity, length, pErrorCode);
1136
U_CAPI int32_t U_EXPORT2
1137
u_terminateWChars(wchar_t *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) {
1138
__TERMINATE_STRING(dest, destCapacity, length, pErrorCode);