/********************************************************* * Copyright (C) 2007 VMware, Inc. All rights reserved. * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation version 2.1 and no later version. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the Lesser GNU General Public * License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program; if not, write to the Free Software Foundation, Inc., * 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. * *********************************************************/ /* * unicodeSimpleOperations.c -- * * Simple UTF-8 implementation of unicodeOperations.h interface. */ #include "vmware.h" #include "util.h" #include "str.h" #include "unicodeBase.h" #include "unicodeInt.h" #include "unicodeOperations.h" #include "codeset.h" /* *----------------------------------------------------------------------------- * * Unicode_LengthInCodePoints -- * * Returns the length of the unicode string in code points * ("unicode characters"). * * Results: * As above * * Side effects: * None * *----------------------------------------------------------------------------- */ UnicodeIndex Unicode_LengthInCodePoints(ConstUnicode str) // IN: { return CodeSet_LengthInCodePoints(str); } /* *----------------------------------------------------------------------------- * * Unicode_CompareRange -- * * Compares ranges of two Unicode strings for canonical * equivalence in code point order. * * Canonical equivalence means the two strings represent the same * Unicode code points, regardless of the order of combining * characters or use of compatibility singletons. * * See Unicode Standard Annex #15 (Unicode Normalization Forms) * for more on canonical equivalence and composition. * * If ignoreCase is TRUE, then the two strings are case-folded * (converted to upper-case, then converted to lower-case) in a * locale-agnostic manner before comparing. * * Indices and lengths that are out of bounds are pinned to the * edges of the string. * * Pass -1 for any length parameter to indicate "from start until * end of string". * * The start and length arguments are in code points - unicode * "characters" - not bytes! * * Results: * -1 if str1 < str2, 0 if str1 == str2, 1 if str1 > str2. * * Side effects: * None * *----------------------------------------------------------------------------- */ int Unicode_CompareRange(ConstUnicode str1, // IN: UnicodeIndex str1Start, // IN: UnicodeIndex str1Length, // IN: ConstUnicode str2, // IN: UnicodeIndex str2Start, // IN: UnicodeIndex str2Length, // IN: Bool ignoreCase) // IN: { int result = -1; Unicode substr1 = NULL; Unicode substr2 = NULL; utf16_t *substr1UTF16 = NULL; utf16_t *substr2UTF16 = NULL; UnicodeIndex i = 0; UnicodeIndex utf16Index; utf16_t codeUnit1; utf16_t codeUnit2; uint32 codePoint1; uint32 codePoint2; /* * TODO: Allocating substrings is a performance hit. We should do this * search in-place. (However, searching UTF-8 requires tender loving * care, and it's just easier to search UTF-16.) */ substr1 = Unicode_Substr(str1, str1Start, str1Length); if (!substr1) { goto out; } substr2 = Unicode_Substr(str2, str2Start, str2Length); if (!substr2) { goto out; } /* * XXX TODO: Need to normalize the incoming strings to NFC or NFD. */ substr1UTF16 = Unicode_GetAllocUTF16(substr1); if (!substr1UTF16) { goto out; } substr2UTF16 = Unicode_GetAllocUTF16(substr2); if (!substr2UTF16) { goto out; } /* * TODO: This is the naive string search algorithm, which is O(n * m). We * can do better with KMP or Boyer-Moore if this proves to be a bottleneck. */ while (TRUE) { codeUnit1 = *(substr1UTF16 + i); codeUnit2 = *(substr2UTF16 + i); /* * TODO: Simple case folding doesn't handle the situation where more * than one code unit is needed to store the result of the case folding. * * This means that German "straBe" (where B = sharp S, U+00DF) will not * match "STRASSE", even though the two strings are the same. */ if (ignoreCase) { codeUnit1 = UnicodeSimpleCaseFold(codeUnit1); codeUnit2 = UnicodeSimpleCaseFold(codeUnit2); } if (codeUnit1 != codeUnit2) { break; } if (codeUnit1 == 0) { // End of both strings reached: strings are equal. result = 0; goto out; } i++; } /* * The two UTF-16 code units differ. If they're the first code unit of a * surrogate pair (for Unicode values past U+FFFF), decode the surrogate * pair into a full Unicode code point. */ if (U16_IS_SURROGATE(codeUnit1)) { ssize_t substrUTF16Len = Unicode_UTF16Strlen(substr1UTF16); // U16_NEXT modifies the index, so let it work on a copy. utf16Index = i; // Decode the surrogate if needed. U16_NEXT(substr1UTF16, utf16Index, substrUTF16Len, codePoint1); } else { // Not a surrogate? Then the code point value is the code unit. codePoint1 = codeUnit1; } if (U16_IS_SURROGATE(codeUnit2)) { ssize_t substrUTF16Len = Unicode_UTF16Strlen(substr2UTF16); utf16Index = i; U16_NEXT(substr2UTF16, utf16Index, substrUTF16Len, codePoint2); } else { codePoint2 = codeUnit2; } if (codePoint1 < codePoint2) { result = -1; } else if (codePoint1 > codePoint2) { result = 1; } else { // If we hit the end of the string, we've already gone to 'out'. NOT_REACHED(); } out: free(substr1UTF16); free(substr2UTF16); Unicode_Free(substr1); Unicode_Free(substr2); return result; } /* *----------------------------------------------------------------------------- * * Unicode_FindSubstrInRange -- * * Searches the string 'str' in the range [strStart, strStart+strLength) * for the first occurrence of the code units of 'strToFind' * in the range [strToFindStart, strToFindStart+strToFindLength). * * Indices and lengths that are out of bounds are pinned to the * edges of the string. * * Pass -1 for any length parameter to indicate "from start until * end of string". * * The start and length arguments are in code points - unicode * "characters" - not bytes! * * Results: * If 'strToFind' exists inside 'str' in the specified range, * returns the first starting index of 'strToFind' in that range. * * Otherwise, returns UNICODE_INDEX_NOT_FOUND. * * Side effects: * None * *----------------------------------------------------------------------------- */ UnicodeIndex Unicode_FindSubstrInRange(ConstUnicode str, // IN: UnicodeIndex strStart, // IN: UnicodeIndex strLength, // IN: ConstUnicode strToFind, // IN: UnicodeIndex strToFindStart, // IN: UnicodeIndex strToFindLength) // IN: { UnicodeIndex index; uint32 *utf32Source = NULL; uint32 *utf32Search = NULL; ASSERT(str); ASSERT(strStart >= 0); ASSERT((strLength >= 0) || (strLength == -1)); ASSERT(strToFind); ASSERT(strToFindStart >= 0); ASSERT((strToFindLength >= 0) || (strToFindLength == -1)); /* * Convert the string to be searched and the search string to UTF32. */ if (!CodeSet_UTF8ToUTF32(str, (char **) &utf32Source)) { Panic("%s: invalid UTF8 string @ %p\n", __FUNCTION__, str); } if (!CodeSet_UTF8ToUTF32(strToFind, (char **) &utf32Search)) { Panic("%s: invalid UTF8 string @ %p\n", __FUNCTION__, strToFind); } /* * Do any bounds cleanup and checking that is necessary... */ if (strLength < 0) { strLength = Unicode_LengthInCodePoints(str) - strStart; } if (strToFindLength < 0) { strToFindLength = Unicode_LengthInCodePoints(strToFind) - strToFindStart; } if (strLength < strToFindLength) { index = UNICODE_INDEX_NOT_FOUND; goto bail; } /* * Yes, this may be viewed as a bit strange but this is what strstr does. */ if (strToFindLength == 0) { index = strStart; goto bail; } /* * Attempt to find the first occurence of the search string in the string * to be searched. */ for (index = strStart; index <= (strStart + strLength - strToFindLength); index++) { UnicodeIndex i; Bool match = FALSE; UnicodeIndex indexSrc = index; UnicodeIndex indexSrch = strToFindStart; for (i = 0; i < strToFindLength; i++) { match = (utf32Source[indexSrc++] == utf32Search[indexSrch++]); if (!match) { break; } } if (match) { goto bail; } } index = UNICODE_INDEX_NOT_FOUND; bail: free(utf32Source); free(utf32Search); return index; } /* *----------------------------------------------------------------------------- * * Unicode_FindLastSubstrInRange -- * * Searches the string 'str' in the range [strStart, strStart+strLength) * for the last occurrence of the code units of 'strToFind' * in the range [strToFindStart, strToFindStart+strToFindLength). * * Indices and lengths that are out of bounds are pinned to the * edges of the string. * * Pass -1 for any length parameter to indicate "from start until * end of string". * * The start and length arguments are in code points - unicode * "characters" - not bytes! * * Results: * If 'strToFind' exists inside 'str' in the specified range, * returns the last starting index of 'strToFind' in that range. * * Otherwise, returns UNICODE_INDEX_NOT_FOUND. * * Side effects: * None * *----------------------------------------------------------------------------- */ UnicodeIndex Unicode_FindLastSubstrInRange(ConstUnicode str, // IN: UnicodeIndex strStart, // IN: UnicodeIndex strLength, // IN: ConstUnicode strToFind, // IN: UnicodeIndex strToFindStart, // IN: UnicodeIndex strToFindLength) // IN: { UnicodeIndex index; uint32 *utf32Source = NULL; uint32 *utf32Search = NULL; ASSERT(str); ASSERT(strStart >= 0); ASSERT((strLength >= 0) || (strLength == -1)); ASSERT(strToFind); ASSERT(strToFindStart >= 0); ASSERT((strToFindLength >= 0) || (strToFindLength == -1)); /* * Convert the string to be searched and the search string to UTF32. */ if (!CodeSet_UTF8ToUTF32(str, (char **) &utf32Source)) { Panic("%s: invalid UTF8 string @ %p\n", __FUNCTION__, str); } if (!CodeSet_UTF8ToUTF32(strToFind, (char **) &utf32Search)) { Panic("%s: invalid UTF8 string @ %p\n", __FUNCTION__, strToFind); } /* * Do any bounds cleanup and checking that is necessary... */ if (strLength < 0) { strLength = Unicode_LengthInCodePoints(str) - strStart; } if (strToFindLength < 0) { strToFindLength = Unicode_LengthInCodePoints(strToFind) - strToFindStart; } if (strLength < strToFindLength) { index = UNICODE_INDEX_NOT_FOUND; goto bail; } /* * Yes, this may be viewed as a bit strange but this is what strstr does. */ if (strToFindLength == 0) { index = strStart; goto bail; } /* * Attempt to find the last occurence of the search string in the string * to be searched. */ for (index = strStart + strLength - strToFindLength; index >= strStart; index--) { UnicodeIndex i; Bool match = FALSE; UnicodeIndex indexSrc = index; UnicodeIndex indexSrch = strToFindStart; for (i = 0; i < strToFindLength; i++) { match = (utf32Source[indexSrc++] == utf32Search[indexSrch++]); if (!match) { break; } } if (match) { goto bail; } } index = UNICODE_INDEX_NOT_FOUND; bail: free(utf32Source); free(utf32Search); return index; } /* *----------------------------------------------------------------------------- * * Unicode_Substr -- * * Allocates and returns a substring of 'str'. * * Indices and lengths that are out of bounds are pinned to the * edges of the string. * * Pass -1 for the length parameter to indicate "from start until * end of string". * * The start and length arguments are in code points - unicode * "characters" - not bytes! * * Results: * The newly-allocated substring of 'str' in the range [index, * index + length). Caller must free with Unicode_Free. * * Side effects: * None * *----------------------------------------------------------------------------- */ Unicode Unicode_Substr(ConstUnicode str, // IN: UnicodeIndex start, // IN: UnicodeIndex length) // IN: { char *substr; uint32 codePointLen; uint32 *utf32 = NULL; ASSERT(str); ASSERT((start >= 0) || (start == -1)); ASSERT((length >= 0) || (length == -1)); if (!CodeSet_UTF8ToUTF32(str, (char **) &utf32)) { Panic("%s: invalid UTF8 string @ %p\n", __FUNCTION__, str); } codePointLen = 0; while (utf32[codePointLen] != 0) { codePointLen++; } if ((start < 0) || (start > codePointLen)) { start = codePointLen; } if ((length < 0) || ((start + length) > codePointLen)) { length = codePointLen - start; } utf32[start + length] = 0; CodeSet_UTF32ToUTF8((char *) &utf32[start], &substr); free(utf32); return substr; } /* *----------------------------------------------------------------------------- * * Unicode_ReplaceRange -- * * Core operation upon which append, insert, replace, and remove * are based. * * Replaces the code units of destination in the range * [destinationStart, destinationLength) with the code units of * source in the range [sourceStart, sourceLength). * * Indices and lengths that are out of bounds are pinned to the * edges of the string. * * Pass -1 for any length parameter to indicate "from start until * end of string". * * The start and length arguments are in code points - unicode * "characters" - not bytes! * * Results: * A newly-allocated string containing the results of the replace * operation. Caller must free with Unicode_Free. * * Side effects: * None * *----------------------------------------------------------------------------- */ Unicode Unicode_ReplaceRange(ConstUnicode dest, // IN: UnicodeIndex destStart, // IN: UnicodeIndex destLength, // IN: ConstUnicode src, // IN: UnicodeIndex srcStart, // IN: UnicodeIndex srcLength) // IN: { Unicode result; Unicode stringOne; Unicode stringTwo; Unicode stringThree; ASSERT(dest); ASSERT((destStart >= 0) || (destStart == -1)); ASSERT((destLength >= 0) || (destLength == -1)); ASSERT(src); ASSERT((srcStart >= 0) || (srcStart == -1)); ASSERT((srcLength >= 0) || (srcLength == -1)); stringOne = Unicode_Substr(dest, 0, destStart); stringTwo = Unicode_Substr(src, srcStart, srcLength); stringThree = Unicode_Substr(dest, destStart + destLength, -1); result = Unicode_Join(stringOne, stringTwo, stringThree, NULL); Unicode_Free(stringOne); Unicode_Free(stringTwo); Unicode_Free(stringThree); return result; } /* *----------------------------------------------------------------------------- * * Unicode_Join -- * * Allocates and returns a new string containing the 'first' followed by * all the unicode strings specified as optional arguments (which must * be of type ConstUnicode). Appending ceases when a NULL pointer is * detected. * * Results: * The newly-allocated string. Caller must free with Unicode_Free. * * Side effects: * None * *----------------------------------------------------------------------------- */ Unicode Unicode_Join(ConstUnicode first, // IN: ...) // IN: { Unicode result; if (first == NULL) { result = NULL; } else { va_list args; ConstUnicode cur; result = Unicode_Duplicate(first); va_start(args, first); while ((cur = va_arg(args, ConstUnicode)) != NULL) { Unicode temp; temp = Unicode_Format("%s%s", result, cur); Unicode_Free(result); result = temp; } va_end(args); } return result; } /* *----------------------------------------------------------------------------- * * Unicode_Format -- * * Format a Unicode string (roughly equivalent to Str_Asprintf()). * * Results: * The formatted string. * * Side effects: * None. * *----------------------------------------------------------------------------- */ Unicode Unicode_Format(const char *fmt, // IN: the format ...) // IN: the arguments { va_list args; char *p; va_start(args, fmt); p = Str_Vasprintf(NULL, fmt, args); va_end(args); return p; }