2
* PROGRAM: JRD International support
3
* MODULE: unicode_util.h
4
* DESCRIPTION: Unicode functions
6
* The contents of this file are subject to the Initial
7
* Developer's Public License Version 1.0 (the "License");
8
* you may not use this file except in compliance with the
9
* License. You may obtain a copy of the License at
10
* http://www.ibphoenix.com/main.nfs?a=ibphoenix&page=ibp_idpl.
12
* Software distributed under the License is distributed AS IS,
13
* WITHOUT WARRANTY OF ANY KIND, either express or implied.
14
* See the License for the specific language governing rights
15
* and limitations under the License.
17
* The Original Code was created by Adriano dos Santos Fernandes
18
* for the Firebird Open Source RDBMS project.
20
* Copyright (c) 2004 Adriano dos Santos Fernandes <adrianosf@uol.com.br>
21
* and all contributors signed below.
23
* All Rights Reserved.
24
* Contributor(s): ______________________________________.
28
#include "../common/classes/alloc.h"
29
#include "../jrd/constants.h"
30
#include "../jrd/unicode_util.h"
31
#include "../jrd/CharSet.h"
32
#include "../jrd/IntlUtil.h"
33
#include "../jrd/gdsassert.h"
34
#include "../common/classes/auto.h"
35
#include "../common/classes/GenericMap.h"
36
#include "../common/classes/init.h"
37
#include "../common/classes/objects_array.h"
38
#include "../common/classes/rwlock.h"
39
#include "unicode/ustring.h"
40
#include "unicode/utrans.h"
41
#include "unicode/uchar.h"
42
#include "unicode/ucnv.h"
43
#include "unicode/ucol.h"
46
using namespace Firebird;
52
const char* const UnicodeUtil::DEFAULT_ICU_VERSION =
53
STRINGIZE(U_ICU_VERSION_MAJOR_NUM)"."STRINGIZE(U_ICU_VERSION_MINOR_NUM);
56
// Encapsulates one dynamically loaded ICU version: the two module handles,
// the collator version and pointers to the ICU entry points used in this file.
57
struct UnicodeUtil::ICU
60
// Copy construction and assignment are declared but left unimplemented.
ICU(const ICU&); // not implemented
61
ICU& operator =(const ICU&); // not implemented
65
: inModule(NULL), ucModule(NULL)
75
// Handles of the loaded libraries; both start out as NULL.
ModuleLoader::Module* inModule;
76
ModuleLoader::Module* ucModule;
77
// Filled through ucolGetVersion() in UnicodeUtil::loadICU().
UVersionInfo collVersion;
79
// Entry points that UnicodeUtil::loadICU() looks up in ucModule.
void (U_EXPORT2 *uInit)(UErrorCode* status);
80
void (U_EXPORT2 *uVersionToString)(UVersionInfo versionArray, char* versionString);
82
int32_t (U_EXPORT2 *ulocCountAvailable)();
83
const char* (U_EXPORT2 *ulocGetAvailable)(int32_t n);
85
void (U_EXPORT2 *usetClose)(USet* set);
86
int32_t (U_EXPORT2 *usetGetItem)(const USet* set, int32_t itemIndex,
87
UChar32* start, UChar32* end, UChar* str, int32_t strCapacity, UErrorCode* ec);
88
int32_t (U_EXPORT2 *usetGetItemCount)(const USet* set);
89
USet* (U_EXPORT2 *usetOpen)(UChar32 start, UChar32 end);
91
// Entry points that UnicodeUtil::loadICU() looks up in inModule.
void (U_EXPORT2 *ucolClose)(UCollator* coll);
92
int32_t (U_EXPORT2 *ucolGetContractions)(const UCollator* coll, USet* conts, UErrorCode* status);
93
int32_t (U_EXPORT2 *ucolGetSortKey)(const UCollator* coll, const UChar* source,
94
int32_t sourceLength, uint8_t* result, int32_t resultLength);
95
UCollator* (U_EXPORT2 *ucolOpen)(const char* loc, UErrorCode* status);
96
void (U_EXPORT2 *ucolSetAttribute)(UCollator* coll, UColAttribute attr,
97
UColAttributeValue value, UErrorCode* status);
98
UCollationResult (U_EXPORT2 *ucolStrColl)(const UCollator* coll, const UChar* source,
99
int32_t sourceLength, const UChar* target, int32_t targetLength);
100
void (U_EXPORT2 *ucolGetVersion)(const UCollator* coll, UVersionInfo info);
102
void (U_EXPORT2 *utransClose)(UTransliterator* trans);
103
UTransliterator* (U_EXPORT2 *utransOpen)(
106
const UChar* rules, /* may be Null */
107
int32_t rulesLength, /* -1 if null-terminated */
108
UParseError* parseError, /* may be Null */
110
void (U_EXPORT2 *utransTransUChars)(
111
const UTransliterator* trans,
114
int32_t textCapacity,
121
// Caches loaded ICU instances so that the libraries are not loaded and
// unloaded many times.
122
class UnicodeUtil::ICUModules
124
// Maps a string key (UnicodeUtil::loadICU() uses the version string) to the
// cached ICU instance.
typedef GenericMap<Pair<Left<string, ICU*> > > ModulesMap;
127
explicit ICUModules(MemoryPool&)
133
// Walk every cached entry and delete the ICU object it points to.
ModulesMap::Accessor modulesAccessor(&modules());
134
for (bool found = modulesAccessor.getFirst(); found; found = modulesAccessor.getNext())
135
delete modulesAccessor.current()->second;
138
InitInstance<ModulesMap> modules;
143
// File-level cache instance consulted and filled by UnicodeUtil::loadICU().
GlobalPtr<UnicodeUtil::ICUModules> icuModules;
147
// Collator version string that getCollVersion() compares against and that
// Utf16Collation::create() substitutes when no COLL-VERSION is supplied.
static const char* const COLL_30_VERSION = "41.128.4.4"; // ICU 3.0 collator version
150
// Collects into 'versions' the entries of the "icu_versions" attribute found
// in configInfo.
//   configInfo - attribute text parsed with IntlUtil::parseSpecificAttributes()
//   versions   - output array; receives the tokens of the versions string,
//                which is split at ' ' characters
static void getVersions(const string& configInfo, ObjectsArray<string>& versions)
153
// configInfo is parsed through an ASCII charset instance created here.
IntlUtil::initAsciiCharset(&cs);
155
AutoPtr<CharSet> ascii(Jrd::CharSet::createInstance(*getDefaultMemoryPool(), 0, &cs));
157
IntlUtil::SpecificAttributesMap config;
158
IntlUtil::parseSpecificAttributes(ascii, configInfo.length(),
159
(const UCHAR*) configInfo.c_str(), &config);
162
if (config.get("icu_versions", versionsStr))
165
// NOTE(review): presumably the fallback used when "icu_versions" is absent - confirm.
versionsStr = "default";
172
// Each iteration handles the token that ends at the space found at index i.
for (size_t i = versionsStr.find(' '); i != versionsStr.npos;
173
start = i + 1, i = versionsStr.find(' ', start))
175
if ((n = versionsStr.find_first_not_of(' ', start)) != versionsStr.npos)
177
versions.add(versionsStr.substr(start, i - start));
180
// Last token: whatever follows the final space.
if ((n = versionsStr.find_first_not_of(' ', start)) != versionsStr.npos)
182
versions.add(versionsStr.substr(start));
187
// Returns the key buffer size, in bytes, to reserve for a UTF-16 string of
// 'len' bytes: 4 key bytes for every 2-byte UTF-16 code unit.
USHORT UnicodeUtil::utf16KeyLength(USHORT len)
{
	const int codeUnits = len / 2;
	const int keyBytesPerUnit = 4;

	return codeUnits * keyBytesPerUnit;
}
194
// Converts a UTF-16 string into key bytes with ICU's "BOCU-1" converter.
//   srcLen - source length in bytes (asserted to be a multiple of sizeof(*src))
//   src    - source UTF-16 code units; must not be NULL
//   dstLen - capacity of dst in bytes
//   dst    - receives the converted bytes; must not be NULL
// Returns INTL_BAD_KEY_LENGTH when dstLen is below 4 bytes per source code unit.
USHORT UnicodeUtil::utf16ToKey(USHORT srcLen, const USHORT* src, USHORT dstLen, UCHAR* dst)
196
fb_assert(srcLen % sizeof(*src) == 0);
197
fb_assert(src != NULL && dst != NULL);
199
// Same bound as utf16KeyLength(): 4 key bytes per UTF-16 code unit.
if (dstLen < srcLen / sizeof(*src) * 4)
200
return INTL_BAD_KEY_LENGTH;
202
UErrorCode status = U_ZERO_ERROR;
203
// NOTE(review): a converter is opened on every call and a failure is only
// checked through fb_assert - confirm how a failed open behaves in builds
// where fb_assert is compiled out.
UConverter* conv = ucnv_open("BOCU-1", &status);
204
fb_assert(U_SUCCESS(status));
206
const int32_t len = ucnv_fromUChars(conv, reinterpret_cast<char*>(dst), dstLen,
207
// safe cast - alignment not changed
208
reinterpret_cast<const UChar*>(src), srcLen / sizeof(*src), &status);
209
fb_assert(U_SUCCESS(status));
217
// Lower-cases a UTF-16 string from src into dst.
//   srcLen     - source length in bytes (asserted multiple of sizeof(*src))
//   dstLen     - destination capacity in bytes
//   exceptions - list of code points scanned until a 0 entry or a match with
//                the current character; presumably the matched characters are
//                left unconverted - confirm
// The per-character path returns the number of bytes written (n * sizeof(*dst)).
ULONG UnicodeUtil::utf16LowerCase(ULONG srcLen, const USHORT* src, ULONG dstLen, USHORT* dst,
218
const ULONG* exceptions)
220
// this is more correct but we don't support completely yet
222
fb_assert(srcLen % sizeof(*src) == 0);
223
fb_assert(src != NULL && dst != NULL);
225
// Transliterator variant: works in place on a copy of the source held in dst.
memcpy(dst, src, srcLen);
227
UErrorCode errorCode = U_ZERO_ERROR;
228
UTransliterator* trans = utrans_open("Any-Lower", UTRANS_FORWARD, NULL, 0, NULL, &errorCode);
229
//// TODO: add exceptions in this way: Any-Lower[^\\u03BC] - for U+03BC
233
int32_t capacity = dstLen;
234
// len is counted in UTF-16 code units here and scaled back to bytes below.
int32_t len = srcLen / sizeof(USHORT);
237
utrans_transUChars(trans, reinterpret_cast<UChar*>(dst), &len, capacity, 0, &limit, &errorCode);
240
len *= sizeof(USHORT);
242
len = INTL_BAD_STR_LENGTH;
247
return INTL_BAD_STR_LENGTH;
250
// Per-character variant.
fb_assert(srcLen % sizeof(*src) == 0);
251
fb_assert(src != NULL && dst != NULL);
253
// From here on both lengths are counted in UTF-16 code units.
srcLen /= sizeof(*src);
254
dstLen /= sizeof(*dst);
258
for (ULONG i = 0; i < srcLen;)
261
// Decode the next code point (U16_NEXT advances i).
U16_NEXT(src, i, srcLen, c);
267
const ULONG* p = exceptions;
268
while (*p && *p != c)
276
// Append the code point to dst at index n; 'error' reports a failed append.
U16_APPEND(dst, n, dstLen, c, error);
279
return n * sizeof(*dst);
283
// Upper-cases a UTF-16 string from src into dst.
//   srcLen     - source length in bytes (asserted multiple of sizeof(*src))
//   dstLen     - destination capacity in bytes
//   exceptions - list of code points scanned until a 0 entry or a match with
//                the current character; presumably the matched characters are
//                left unconverted - confirm
// The per-character path returns the number of bytes written (n * sizeof(*dst)).
ULONG UnicodeUtil::utf16UpperCase(ULONG srcLen, const USHORT* src, ULONG dstLen, USHORT* dst,
284
const ULONG* exceptions)
286
// this is more correct but we don't support completely yet
288
fb_assert(srcLen % sizeof(*src) == 0);
289
fb_assert(src != NULL && dst != NULL);
291
// Transliterator variant: works in place on a copy of the source held in dst.
memcpy(dst, src, srcLen);
293
UErrorCode errorCode = U_ZERO_ERROR;
294
UTransliterator* trans = utrans_open("Any-Upper", UTRANS_FORWARD, NULL, 0, NULL, &errorCode);
295
//// TODO: add exceptions in this way: Any-Upper[^\\u03BC] - for U+03BC
299
int32_t capacity = dstLen;
300
// len is counted in UTF-16 code units here and scaled back to bytes below.
int32_t len = srcLen / sizeof(USHORT);
303
utrans_transUChars(trans, reinterpret_cast<UChar*>(dst), &len, capacity, 0, &limit, &errorCode);
306
len *= sizeof(USHORT);
308
len = INTL_BAD_STR_LENGTH;
313
return INTL_BAD_STR_LENGTH;
316
// Per-character variant.
fb_assert(srcLen % sizeof(*src) == 0);
317
fb_assert(src != NULL && dst != NULL);
319
// From here on both lengths are counted in UTF-16 code units.
srcLen /= sizeof(*src);
320
dstLen /= sizeof(*dst);
324
for (ULONG i = 0; i < srcLen;)
327
// Decode the next code point (U16_NEXT advances i).
U16_NEXT(src, i, srcLen, c);
333
const ULONG* p = exceptions;
334
while (*p && *p != c)
342
// Append the code point to dst at index n; 'error' reports a failed append.
U16_APPEND(dst, n, dstLen, c, error);
345
return n * sizeof(*dst);
349
// Converts UTF-16 to UTF-8.
//   srcLen       - source length in bytes (asserted multiple of sizeof(*src))
//   dstLen       - destination capacity in bytes
//   err_code     - receives CS_TRUNCATION_ERROR or CS_BAD_INPUT on failure
//   err_position - receives a byte offset into src
// Returns the number of bytes written to dst.
ULONG UnicodeUtil::utf16ToUtf8(ULONG srcLen, const USHORT* src, ULONG dstLen, UCHAR* dst,
350
USHORT* err_code, ULONG* err_position)
352
fb_assert(srcLen % sizeof(*src) == 0);
353
fb_assert(src != NULL || dst == NULL);
354
fb_assert(err_code != NULL);
355
fb_assert(err_position != NULL);
360
// Size estimate of 4 bytes per UTF-16 code unit.
// NOTE(review): presumably returned when dst is NULL - confirm.
return srcLen / sizeof(*src) * 4;
362
// srcLen is counted in UTF-16 code units from here on.
srcLen /= sizeof(*src);
364
const UCHAR* const dstStart = dst;
365
const UCHAR* const dstEnd = dst + dstLen;
367
for (ULONG i = 0; i < srcLen; )
369
// No room left in dst.
if (dstEnd - dst == 0)
371
*err_code = CS_TRUNCATION_ERROR;
372
*err_position = i * sizeof(*src);
376
UChar32 c = src[i++];
382
// Byte offset of the code unit just read.
*err_position = (i - 1) * sizeof(*src);
384
if (UTF_IS_SURROGATE(c))
388
// A lead surrogate must be followed by a trail surrogate to form a pair.
if (UTF_IS_SURROGATE_FIRST(c) && i < srcLen && UTF_IS_TRAIL(c2 = src[i]))
391
c = UTF16_GET_PAIR_VALUE(c, c2);
395
*err_code = CS_BAD_INPUT;
400
// Write the character only if its whole UTF-8 encoding fits.
if (U8_LENGTH(c) <= dstEnd - dst)
403
U8_APPEND_UNSAFE(dst, j, c);
408
*err_code = CS_TRUNCATION_ERROR;
414
return (dst - dstStart) * sizeof(*dst);
418
// Converts UTF-8 to UTF-16.
//   srcLen       - source length in bytes
//   dstLen       - destination capacity in bytes
//   err_code     - receives CS_TRUNCATION_ERROR or CS_BAD_INPUT on failure
//   err_position - receives a byte offset into src
// Returns the number of bytes written to dst.
ULONG UnicodeUtil::utf8ToUtf16(ULONG srcLen, const UCHAR* src, ULONG dstLen, USHORT* dst,
419
USHORT* err_code, ULONG* err_position)
421
fb_assert(src != NULL || dst == NULL);
422
fb_assert(err_code != NULL);
423
fb_assert(err_position != NULL);
428
// Size estimate of one UTF-16 code unit per source byte.
// NOTE(review): presumably returned when dst is NULL - confirm.
return srcLen * sizeof(*dst);
430
const USHORT* const dstStart = dst;
431
const USHORT* const dstEnd = dst + dstLen / sizeof(*dst);
433
for (ULONG i = 0; i < srcLen; )
435
// No room left in dst.
if (dstEnd - dst == 0)
437
*err_code = CS_TRUNCATION_ERROR;
442
UChar32 c = src[i++];
448
// Offset of the byte just read.
*err_position = i - 1;
450
// Decode the rest of the sequence; the cast relies on ULONG and int32_t
// having the same size.
c = utf8_nextCharSafeBody(src, reinterpret_cast<int32_t*>(&i), srcLen, c, -1);
454
*err_code = CS_BAD_INPUT;
457
else if (c <= 0xFFFF)
461
// A surrogate pair needs two free code units in dst.
if (dstEnd - dst > 1)
463
*dst++ = UTF16_LEAD(c);
464
*dst++ = UTF16_TRAIL(c);
468
*err_code = CS_TRUNCATION_ERROR;
475
return (dst - dstStart) * sizeof(*dst);
479
// Converts UTF-16 to UTF-32.
//   srcLen       - source length in bytes (asserted multiple of sizeof(*src))
//   dstLen       - destination capacity in bytes
//   err_code     - receives CS_BAD_INPUT or CS_TRUNCATION_ERROR on failure
//   err_position - receives the byte offset reached in src
// Returns the number of bytes written to dst.
ULONG UnicodeUtil::utf16ToUtf32(ULONG srcLen, const USHORT* src, ULONG dstLen, ULONG* dst,
480
USHORT* err_code, ULONG* err_position)
482
fb_assert(srcLen % sizeof(*src) == 0);
483
fb_assert(src != NULL || dst == NULL);
484
fb_assert(err_code != NULL);
485
fb_assert(err_position != NULL);
490
// Size estimate of one UTF-32 unit per UTF-16 code unit.
// NOTE(review): presumably returned when dst is NULL - confirm.
return srcLen / sizeof(*src) * sizeof(*dst);
492
// based on u_strToUTF32 from ICU
493
const USHORT* const srcStart = src;
494
const ULONG* const dstStart = dst;
495
const USHORT* const srcEnd = src + srcLen / sizeof(*src);
496
const ULONG* const dstEnd = dst + dstLen / sizeof(*dst);
498
while (src < srcEnd && dst < dstEnd)
505
// Combine with the following trail surrogate when there is one.
if (src < srcEnd && UTF_IS_TRAIL(ch2 = *src))
507
ch = UTF16_GET_PAIR_VALUE(ch, ch2);
512
*err_code = CS_BAD_INPUT;
521
*err_position = (src - srcStart) * sizeof(*src);
523
// Source left over with no earlier error: dst was too small.
if (*err_code == 0 && src < srcEnd)
524
*err_code = CS_TRUNCATION_ERROR;
526
return (dst - dstStart) * sizeof(*dst);
530
// Converts UTF-32 to UTF-16.
//   srcLen       - source length in bytes (asserted multiple of sizeof(*src))
//   dstLen       - destination capacity in bytes
//   err_code     - receives CS_TRUNCATION_ERROR or CS_BAD_INPUT on failure
//   err_position - receives the byte offset reached in src
// Returns the number of bytes written to dst.
ULONG UnicodeUtil::utf32ToUtf16(ULONG srcLen, const ULONG* src, ULONG dstLen, USHORT* dst,
531
USHORT* err_code, ULONG* err_position)
533
fb_assert(srcLen % sizeof(*src) == 0);
534
fb_assert(src != NULL || dst == NULL);
535
fb_assert(err_code != NULL);
536
fb_assert(err_position != NULL);
543
// based on u_strFromUTF32 from ICU
544
const ULONG* const srcStart = src;
545
const USHORT* const dstStart = dst;
546
const ULONG* const srcEnd = src + srcLen / sizeof(*src);
547
const USHORT* const dstEnd = dst + dstLen / sizeof(*dst);
549
while (src < srcEnd && dst < dstEnd)
551
const ULONG ch = *src++;
555
// Code points up to U+10FFFF are written as a lead/trail surrogate pair here.
else if (ch <= 0x10FFFF)
557
*(dst++) = UTF16_LEAD(ch);
560
*(dst++) = UTF16_TRAIL(ch);
563
*err_code = CS_TRUNCATION_ERROR;
570
*err_code = CS_BAD_INPUT;
576
*err_position = (src - srcStart) * sizeof(*src);
578
// Source left over with no earlier error: dst was too small.
if (*err_code == 0 && src < srcEnd)
579
*err_code = CS_TRUNCATION_ERROR;
581
return (dst - dstStart) * sizeof(*dst);
585
// Compares two UTF-16 strings with ICU's u_strCompare().
//   len1, len2 - lengths in bytes (asserted multiples of the code unit size)
//   str1, str2 - the strings; must not be NULL
//   error_flag - must not be NULL
// Returns -1, 0 or 1.
SSHORT UnicodeUtil::utf16Compare(ULONG len1, const USHORT* str1, ULONG len2, const USHORT* str2,
586
INTL_BOOL* error_flag)
588
fb_assert(len1 % sizeof(*str1) == 0);
589
fb_assert(len2 % sizeof(*str2) == 0);
590
fb_assert(str1 != NULL);
591
fb_assert(str2 != NULL);
592
fb_assert(error_flag != NULL);
596
// safe casts - alignment not changed
597
// The final argument (true) selects comparison in code point order.
int32_t cmp = u_strCompare(reinterpret_cast<const UChar*>(str1), len1 / sizeof(*str1),
598
reinterpret_cast<const UChar*>(str2), len2 / sizeof(*str2), true);
600
// Normalise the ICU result to -1 / 0 / 1.
return (cmp < 0 ? -1 : (cmp > 0 ? 1 : 0));
604
// Counts the code points of a UTF-16 string via ICU's u_countChar32().
//   len - length of str in bytes; asserted to be a multiple of sizeof(*str)
//   str - the UTF-16 code units
ULONG UnicodeUtil::utf16Length(ULONG len, const USHORT* str)
{
	fb_assert(len % sizeof(*str) == 0);

	// safe cast - alignment not changed
	const UChar* const text = reinterpret_cast<const UChar*>(str);

	return u_countChar32(text, len / sizeof(*str));
}
612
// Copies part of a UTF-16 string from src into dst.
//   srcLen   - source length in bytes (asserted multiple of sizeof(*src))
//   dstLen   - destination capacity in bytes
//   startPos - position at which the second loop takes over from the first
//   length   - number of positions covered by the second loop
// Returns the number of bytes written to dst.
ULONG UnicodeUtil::utf16Substring(ULONG srcLen, const USHORT* src, ULONG dstLen, USHORT* dst,
613
ULONG startPos, ULONG length)
615
fb_assert(srcLen % sizeof(*src) == 0);
616
fb_assert(src != NULL && dst != NULL);
621
const USHORT* const dstStart = dst;
622
const USHORT* const srcEnd = src + srcLen / sizeof(*src);
623
const USHORT* const dstEnd = dst + dstLen / sizeof(*dst);
626
// First loop: advance through the source until pos reaches startPos.
while (src < srcEnd && dst < dstEnd && pos < startPos)
628
const ULONG ch = *src++;
632
// A following trail surrogate belongs to the same character.
if (src < srcEnd && UTF_IS_TRAIL(*src))
639
// Second loop: runs until pos reaches startPos + length.
while (src < srcEnd && dst < dstEnd && pos < startPos + length)
641
const ULONG ch = *src++;
648
if (src < srcEnd && UTF_IS_TRAIL(ch2 = *src))
658
return (dst - dstStart) * sizeof(*dst);
662
// Checks whether a byte string is well-formed UTF-8.
//   len                - length of str in bytes
//   str                - the bytes to check; must not be NULL
//   offending_position - optional; receives the index of the first byte of the
//                        malformed sequence
// Returns true when well-formed, false otherwise.
INTL_BOOL UnicodeUtil::utf8WellFormed(ULONG len, const UCHAR* str, ULONG* offending_position)
664
fb_assert(str != NULL);
666
for (ULONG i = 0; i < len; )
668
UChar32 c = str[i++];
672
// Remember where this sequence started for error reporting.
const ULONG save_i = i - 1;
674
// Decode the rest of the sequence; the cast relies on ULONG and int32_t
// having the same size.
c = utf8_nextCharSafeBody(str, reinterpret_cast<int32_t*>(&i), len, c, -1);
678
if (offending_position)
679
*offending_position = save_i;
680
return false; // malformed
685
return true; // well-formed
689
// Checks whether a string is well-formed UTF-16.
//   len                - length of str in bytes (asserted multiple of sizeof(*str))
//   str                - the code units to check; must not be NULL
//   offending_position - optional; receives the byte offset of the bad code unit
// Returns true when well-formed, false otherwise.
INTL_BOOL UnicodeUtil::utf16WellFormed(ULONG len, const USHORT* str, ULONG* offending_position)
691
fb_assert(str != NULL);
692
fb_assert(len % sizeof(*str) == 0);
696
for (ULONG i = 0; i < len;)
698
// Remember where this character started for error reporting.
const ULONG save_i = i;
701
U16_NEXT(str, i, len, c);
703
// A lead or trail value that did not combine into a supplementary code
// point is an unpaired surrogate.
if (!U_IS_SUPPLEMENTARY(c) && (U16_IS_LEAD(c) || U16_IS_TRAIL(c)))
705
if (offending_position)
706
*offending_position = save_i * sizeof(*str);
707
return false; // malformed
711
return true; // well-formed
715
// Checks whether a string is well-formed UTF-32.
//   len                - length of str in bytes (asserted multiple of sizeof(*str))
//   str                - the code points to check; must not be NULL
//   offending_position - optional; receives the byte offset of the bad element
// Returns true when well-formed, false otherwise.
INTL_BOOL UnicodeUtil::utf32WellFormed(ULONG len, const ULONG* str, ULONG* offending_position)
717
fb_assert(str != NULL);
718
fb_assert(len % sizeof(*str) == 0);
720
// Kept to turn the failing position into an offset.
const ULONG* strStart = str;
724
if (!U_IS_UNICODE_CHAR(*str))
726
if (offending_position)
727
*offending_position = (str - strStart) * sizeof(*str);
728
return false; // malformed
735
return true; // well-formed
739
// Loads the ICU libraries of one version, resolves the entry points this file
// uses and caches the result in icuModules.
//   icuVersion - requested version; when empty the first entry returned by
//                getVersions() is used
//   configInfo - configuration text handed to getVersions()
UnicodeUtil::ICU* UnicodeUtil::loadICU(const Firebird::string& icuVersion,
740
const Firebird::string& configInfo)
743
// Library file name templates; filename.printf() below supplies majorVersion
// and minorVersion for the two %s slots.
const char* const inTemplate = "icuin%s%s.dll";
744
const char* const ucTemplate = "icuuc%s%s.dll";
745
#elif defined(DARWIN)
746
// NOTE(review): the two paths differ in case ("Versions" vs "versions");
// confirm the second one resolves on a case-sensitive file system.
const char* const inTemplate = "/Library/Frameworks/Firebird.framework/Versions/A/Libraries/libicui18n.dylib";
747
const char* const ucTemplate = "/Library/Frameworks/Firebird.framework/versions/A/Libraries/libicuuc.dylib";
749
const char* const inTemplate = "libicui18n.sl.%s%s";
750
const char* const ucTemplate = "libicuuc.sl.%s%s";
752
const char* const inTemplate = "libicui18n.so.%s%s";
753
const char* const ucTemplate = "libicuuc.so.%s%s";
756
ObjectsArray<string> versions;
757
getVersions(configInfo, versions);
759
string version = icuVersion.isEmpty() ? versions[0] : icuVersion;
760
// "default" stands for the ICU version this file was compiled against.
if (version == "default")
762
version.printf("%d.%d", U_ICU_VERSION_MAJOR_NUM, U_ICU_VERSION_MINOR_NUM);
765
for (ObjectsArray<string>::const_iterator i(versions.begin()); i != versions.end(); ++i)
772
majorVersion = STRINGIZE(U_ICU_VERSION_MAJOR_NUM);
773
minorVersion = STRINGIZE(U_ICU_VERSION_MINOR_NUM);
777
// Split a "major.minor" entry at the dot.
const size_t pos = i->find('.');
781
majorVersion = i->substr(0, pos);
782
minorVersion = i->substr(pos + 1);
785
if (version != majorVersion + "." + minorVersion)
790
// Look in the cache first, under a read lock.
ReadLockGuard readGuard(icuModules->lock);
793
if (icuModules->modules().get(version, icu))
799
filename.printf(ucTemplate, majorVersion.c_str(), minorVersion.c_str());
801
icu = FB_NEW(*getDefaultMemoryPool()) ICU();
803
icu->ucModule = ModuleLoader::loadModule(filename);
806
// Second load attempt after ModuleLoader adjusts the file name.
ModuleLoader::doctorModuleExtention(filename);
807
icu->ucModule = ModuleLoader::loadModule(filename);
816
filename.printf(inTemplate, majorVersion.c_str(), minorVersion.c_str());
818
icu->inModule = ModuleLoader::loadModule(filename);
821
ModuleLoader::doctorModuleExtention(filename);
822
icu->inModule = ModuleLoader::loadModule(filename);
833
// Symbols are looked up under versioned names: <name>_<major>_<minor>.
symbol.printf("u_init_%s_%s", majorVersion.c_str(), minorVersion.c_str());
834
icu->ucModule->findSymbol(symbol, icu->uInit);
836
symbol.printf("u_versionToString_%s_%s", majorVersion.c_str(), minorVersion.c_str());
837
icu->ucModule->findSymbol(symbol, icu->uVersionToString);
839
symbol.printf("uloc_countAvailable_%s_%s", majorVersion.c_str(), minorVersion.c_str());
840
icu->ucModule->findSymbol(symbol, icu->ulocCountAvailable);
842
symbol.printf("uloc_getAvailable_%s_%s", majorVersion.c_str(), minorVersion.c_str());
843
icu->ucModule->findSymbol(symbol, icu->ulocGetAvailable);
845
symbol.printf("uset_close_%s_%s", majorVersion.c_str(), minorVersion.c_str());
846
icu->ucModule->findSymbol(symbol, icu->usetClose);
848
symbol.printf("uset_getItem_%s_%s", majorVersion.c_str(), minorVersion.c_str());
849
icu->ucModule->findSymbol(symbol, icu->usetGetItem);
851
symbol.printf("uset_getItemCount_%s_%s", majorVersion.c_str(), minorVersion.c_str());
852
icu->ucModule->findSymbol(symbol, icu->usetGetItemCount);
854
symbol.printf("uset_open_%s_%s", majorVersion.c_str(), minorVersion.c_str());
855
icu->ucModule->findSymbol(symbol, icu->usetOpen);
857
// The collation and transliteration entry points come from inModule.
symbol.printf("ucol_close_%s_%s", majorVersion.c_str(), minorVersion.c_str());
858
icu->inModule->findSymbol(symbol, icu->ucolClose);
860
symbol.printf("ucol_getContractions_%s_%s", majorVersion.c_str(), minorVersion.c_str());
861
icu->inModule->findSymbol(symbol, icu->ucolGetContractions);
863
symbol.printf("ucol_getSortKey_%s_%s", majorVersion.c_str(), minorVersion.c_str());
864
icu->inModule->findSymbol(symbol, icu->ucolGetSortKey);
866
symbol.printf("ucol_open_%s_%s", majorVersion.c_str(), minorVersion.c_str());
867
icu->inModule->findSymbol(symbol, icu->ucolOpen);
869
symbol.printf("ucol_setAttribute_%s_%s", majorVersion.c_str(), minorVersion.c_str());
870
icu->inModule->findSymbol(symbol, icu->ucolSetAttribute);
872
symbol.printf("ucol_strcoll_%s_%s", majorVersion.c_str(), minorVersion.c_str());
873
icu->inModule->findSymbol(symbol, icu->ucolStrColl);
875
symbol.printf("ucol_getVersion_%s_%s", majorVersion.c_str(), minorVersion.c_str());
876
icu->inModule->findSymbol(symbol, icu->ucolGetVersion);
878
symbol.printf("utrans_open_%s_%s", majorVersion.c_str(), minorVersion.c_str());
879
icu->inModule->findSymbol(symbol, icu->utransOpen);
881
symbol.printf("utrans_close_%s_%s", majorVersion.c_str(), minorVersion.c_str());
882
icu->inModule->findSymbol(symbol, icu->utransClose);
884
symbol.printf("utrans_transUChars_%s_%s", majorVersion.c_str(), minorVersion.c_str());
885
icu->inModule->findSymbol(symbol, icu->utransTransUChars);
887
// Every entry point except uInit (its check is commented out) must be present.
if (/*!icu->uInit ||*/ !icu->uVersionToString || !icu->ulocCountAvailable ||
888
!icu->ulocGetAvailable || !icu->usetClose || !icu->usetGetItem ||
889
!icu->usetGetItemCount || !icu->usetOpen || !icu->ucolClose ||
890
!icu->ucolGetContractions || !icu->ucolGetSortKey || !icu->ucolOpen ||
891
!icu->ucolSetAttribute || !icu->ucolStrColl || !icu->ucolGetVersion ||
892
!icu->utransOpen || !icu->utransClose || !icu->utransTransUChars)
898
UErrorCode status = U_ZERO_ERROR;
903
if (status != U_ZERO_ERROR)
910
// Open a collator for the "" locale only to read the collator version.
UCollator* collator = icu->ucolOpen("", &status);
917
icu->ucolGetVersion(collator, icu->collVersion);
918
icu->ucolClose(collator);
920
// RWLock doesn't allow a lock upgrade (read->write), so we
921
// release the read lock and acquire a write lock.
923
WriteLockGuard writeGuard(icuModules->lock);
925
// In this small amount of time, another thread may have already loaded
926
// the same version, so within the write lock we verify again.
928
if (icuModules->modules().get(version, icu2))
934
icuModules->modules().put(version, icu);
942
// Reports the collator version of an ICU version.
//   icuVersion  - ICU version to load (passed to loadICU())
//   configInfo  - configuration text (passed to loadICU())
//   collVersion - output; receives the collator version string
bool UnicodeUtil::getCollVersion(const Firebird::string& icuVersion,
943
const Firebird::string& configInfo, Firebird::string& collVersion)
945
ICU* icu = loadICU(icuVersion, configInfo);
950
// Render the stored collator version as text.
char version[U_MAX_VERSION_STRING_LENGTH];
951
icu->uVersionToString(icu->collVersion, version);
953
// The ICU 3.0 collator version is tested for separately.
if (string(COLL_30_VERSION) == version)
956
collVersion = version;
961
// Builds a Utf16Collation for a text type.
//   tt                 - text type; its pad option, flags and canonical width
//                        are set here
//   attributes         - TEXTTYPE_ATTR_* flags
//   specificAttributes - may carry LOCALE, COLL-VERSION and NUMERIC-SORT
//   configInfo         - configuration text passed on to loadICU()
UnicodeUtil::Utf16Collation* UnicodeUtil::Utf16Collation::create(
962
texttype* tt, USHORT attributes,
963
Firebird::IntlUtil::SpecificAttributesMap& specificAttributes, const Firebird::string& configInfo)
965
// Compared below with specificAttributes.count() to detect unexpected entries.
int attributeCount = 0;
969
if (specificAttributes.get(IntlUtil::convertAsciiToUtf16("LOCALE"), locale))
973
if (specificAttributes.get(IntlUtil::convertAsciiToUtf16("COLL-VERSION"), collVersion))
977
collVersion = IntlUtil::convertUtf16ToAscii(collVersion, &error);
983
if (specificAttributes.get(IntlUtil::convertAsciiToUtf16("NUMERIC-SORT"), numericSort))
987
numericSort = IntlUtil::convertUtf16ToAscii(numericSort, &error);
988
// NUMERIC-SORT accepts only "0" or "1".
if (error || !(numericSort == "0" || numericSort == "1"))
992
locale = IntlUtil::convertUtf16ToAscii(locale, &error);
996
// Checks for: attribute bits other than pad-space / case-insensitive /
// accent-insensitive, accent-insensitive without case-insensitive, or
// specific attributes beyond the ones counted above.
if ((attributes & ~(TEXTTYPE_ATTR_PAD_SPACE | TEXTTYPE_ATTR_CASE_INSENSITIVE |
997
TEXTTYPE_ATTR_ACCENT_INSENSITIVE)) ||
998
((attributes & (TEXTTYPE_ATTR_CASE_INSENSITIVE | TEXTTYPE_ATTR_ACCENT_INSENSITIVE)) ==
999
TEXTTYPE_ATTR_ACCENT_INSENSITIVE) ||
1000
(specificAttributes.count() - attributeCount) != 0)
1005
// Without an explicit COLL-VERSION the ICU 3.0 collator version is assumed.
if (collVersion.isEmpty())
1006
collVersion = COLL_30_VERSION;
1008
tt->texttype_pad_option = (attributes & TEXTTYPE_ATTR_PAD_SPACE) ? true : false;
1010
ICU* icu = loadICU(collVersion, locale, configInfo);
1014
UErrorCode status = U_ZERO_ERROR;
1016
// Three collators are opened for the same locale: compare, partial and sort.
UCollator* compareCollator = icu->ucolOpen(locale.c_str(), &status);
1017
if (!compareCollator)
1020
UCollator* partialCollator = icu->ucolOpen(locale.c_str(), &status);
1021
if (!partialCollator)
1023
// Close what was already opened.
icu->ucolClose(compareCollator);
1027
UCollator* sortCollator = icu->ucolOpen(locale.c_str(), &status);
1030
icu->ucolClose(compareCollator);
1031
icu->ucolClose(partialCollator);
1035
// The partial collator always works at primary strength.
icu->ucolSetAttribute(partialCollator, UCOL_STRENGTH, UCOL_PRIMARY, &status);
1037
// Case- and accent-insensitive: compare at primary strength.
if ((attributes & (TEXTTYPE_ATTR_CASE_INSENSITIVE | TEXTTYPE_ATTR_ACCENT_INSENSITIVE)) ==
1038
(TEXTTYPE_ATTR_CASE_INSENSITIVE | TEXTTYPE_ATTR_ACCENT_INSENSITIVE))
1040
icu->ucolSetAttribute(compareCollator, UCOL_STRENGTH, UCOL_PRIMARY, &status);
1041
tt->texttype_flags |= TEXTTYPE_SEPARATE_UNIQUE;
1042
tt->texttype_canonical_width = 4; // UTF-32
1044
// Case-insensitive only: compare at secondary strength.
else if (attributes & TEXTTYPE_ATTR_CASE_INSENSITIVE)
1046
icu->ucolSetAttribute(compareCollator, UCOL_STRENGTH, UCOL_SECONDARY, &status);
1047
tt->texttype_flags |= TEXTTYPE_SEPARATE_UNIQUE;
1048
tt->texttype_canonical_width = 4; // UTF-32
1051
tt->texttype_flags = TEXTTYPE_DIRECT_MATCH;
1053
const bool isNumericSort = numericSort == "1";
1056
// Switch numeric collation on for all three collators.
icu->ucolSetAttribute(compareCollator, UCOL_NUMERIC_COLLATION, UCOL_ON, &status);
1057
icu->ucolSetAttribute(partialCollator, UCOL_NUMERIC_COLLATION, UCOL_ON, &status);
1058
icu->ucolSetAttribute(sortCollator, UCOL_NUMERIC_COLLATION, UCOL_ON, &status);
1061
// Collect the partial collator's contractions; stringToKey() consults them
// for INTL_KEY_PARTIAL keys.
USet* contractions = icu->usetOpen(0, 0);
1062
// status not verified here.
1063
icu->ucolGetContractions(partialCollator, contractions, &status);
1065
Utf16Collation* obj = new Utf16Collation();
1068
obj->attributes = attributes;
1069
obj->compareCollator = compareCollator;
1070
obj->partialCollator = partialCollator;
1071
obj->sortCollator = sortCollator;
1072
obj->contractions = contractions;
1073
obj->contractionsCount = icu->usetGetItemCount(contractions);
1074
obj->numericSort = isNumericSort;
1080
// Releases the ICU objects held by this collation: the contraction set and
// the three collators.
UnicodeUtil::Utf16Collation::~Utf16Collation()
1082
icu->usetClose(contractions);
1084
icu->ucolClose(compareCollator);
1085
icu->ucolClose(partialCollator);
1086
icu->ucolClose(sortCollator);
1088
// ASF: we should not "delete icu"
1092
// Returns the key buffer size, in bytes, that stringToKey() requires for a
// source of 'len' bytes: 6 key bytes for every 4 source bytes.
USHORT UnicodeUtil::Utf16Collation::keyLength(USHORT len) const
{
	const int groupsOfFourBytes = len / 4;
	const int keyBytesPerGroup = 6;

	return groupsOfFourBytes * keyBytesPerGroup;
}
1098
// Builds an ICU sort key for a UTF-16 string.
//   srcLen   - source length in bytes (asserted multiple of sizeof(*src))
//   src      - source UTF-16 code units; must not be NULL
//   dstLen   - capacity of dst in bytes; must be at least keyLength(srcLen)
//   dst      - receives the key; must not be NULL
//   key_type - INTL_KEY_PARTIAL or INTL_KEY_UNIQUE select the partial or the
//              compare collator; the sort collator is used as well
// Returns the value of ucolGetSortKey(), or INTL_BAD_KEY_LENGTH.
USHORT UnicodeUtil::Utf16Collation::stringToKey(USHORT srcLen, const USHORT* src,
1099
USHORT dstLen, UCHAR* dst,
1100
USHORT key_type) const
1102
fb_assert(src != NULL && dst != NULL);
1103
fb_assert(srcLen % sizeof(*src) == 0);
1105
if (dstLen < keyLength(srcLen))
1108
return INTL_BAD_KEY_LENGTH;
1111
// srcLen is counted in UTF-16 code units from here on.
srcLen /= sizeof(*src);
1113
if (tt->texttype_pad_option)
1117
// Scan backwards from the last code unit; srcLen is cut to where the scan stops.
for (pad = src + srcLen - 1; pad >= src; --pad)
1123
srcLen = pad - src + 1;
1126
const UCollator* coll = NULL;
1130
case INTL_KEY_PARTIAL:
1132
coll = partialCollator;
1134
// Remove last bytes of key if they are start of a contraction
1135
// to correctly find in the index.
1136
for (int i = 0; i < contractionsCount; ++i)
1139
UErrorCode status = U_ZERO_ERROR;
1140
// NOTE(review): sizeof(str) is a size in bytes, while the usetGetItem
// parameter is named strCapacity and str is a UChar buffer - confirm the
// capacity should not be the element count instead.
int len = icu->usetGetItem(contractions, i, NULL, NULL, str, sizeof(str), &status);
1147
// safe cast - alignment not changed
1148
// Compare the contraction prefix with the last len code units of the source.
if (u_strCompare(str, len, reinterpret_cast<const UChar*>(src) + srcLen - len, len, true) == 0)
1157
// ASF: We need to remove trailing numbers to return sub key that
1158
// matches full key. Example: "abc1" becomes "abc" to match "abc10".
1159
const USHORT* p = src + srcLen - 1;
1161
for (; p >= src; --p)
1163
if (!(*p >= '0' && *p <= '9'))
1167
srcLen = p - src + 1;
1173
case INTL_KEY_UNIQUE:
1174
coll = compareCollator;
1178
coll = sortCollator;
1183
return INTL_BAD_KEY_LENGTH;
1189
return icu->ucolGetSortKey(coll,
1190
reinterpret_cast<const UChar*>(src), srcLen, dst, dstLen);
1194
// Compares two UTF-16 strings with the compare collator.
//   len1, len2 - lengths in bytes (asserted multiples of the code unit size)
//   str1, str2 - the strings; must not be NULL
//   error_flag - must not be NULL; set to false
// Returns the result of ucolStrColl() cast to SSHORT.
SSHORT UnicodeUtil::Utf16Collation::compare(ULONG len1, const USHORT* str1,
1195
ULONG len2, const USHORT* str2,
1196
INTL_BOOL* error_flag) const
1198
fb_assert(len1 % sizeof(*str1) == 0 && len2 % sizeof(*str2) == 0);
1199
fb_assert(str1 != NULL && str2 != NULL);
1200
fb_assert(error_flag != NULL);
1202
*error_flag = false;
1204
// Lengths are counted in UTF-16 code units from here on.
len1 /= sizeof(*str1);
1205
len2 /= sizeof(*str2);
1207
if (tt->texttype_pad_option)
1211
// Scan each string backwards from its last code unit; the length is cut to
// where the scan stops.
for (pad = str1 + len1 - 1; pad >= str1; --pad)
1217
len1 = pad - str1 + 1;
1219
for (pad = str2 + len2 - 1; pad >= str2; --pad)
1225
len2 = pad - str2 + 1;
1228
return (SSHORT)icu->ucolStrColl(compareCollator,
1229
// safe casts - alignment not changed
1230
reinterpret_cast<const UChar*>(str1), len1,
1231
reinterpret_cast<const UChar*>(str2), len2);
1235
// Produces the canonical (UTF-32) form of a UTF-16 string.
//   srcLen     - source length in bytes
//   dstLen     - destination capacity in bytes
//   dst        - receives the UTF-32 output
//   exceptions - passed on to utf16UpperCase() in the case-insensitive branch
// Returns the number of UTF-32 elements written (byte count / sizeof(ULONG)).
ULONG UnicodeUtil::Utf16Collation::canonical(ULONG srcLen, const USHORT* src, ULONG dstLen, ULONG* dst,
1236
const ULONG* exceptions)
1238
// Scratch buffer for the upper-cased copy of the source.
HalfStaticArray<USHORT, BUFFER_SMALL / 2> upperStr;
1240
// Case- and accent-insensitive: upper-case and strip nonspacing marks.
if ((attributes & (TEXTTYPE_ATTR_CASE_INSENSITIVE | TEXTTYPE_ATTR_ACCENT_INSENSITIVE)) ==
1241
(TEXTTYPE_ATTR_CASE_INSENSITIVE | TEXTTYPE_ATTR_ACCENT_INSENSITIVE))
1243
fb_assert(srcLen % sizeof(*src) == 0);
1245
memcpy(upperStr.getBuffer(srcLen / sizeof(USHORT)), src, srcLen);
1247
UErrorCode errorCode = U_ZERO_ERROR;
1248
UTransliterator* trans = icu->utransOpen("Any-Upper; NFD; [:Nonspacing Mark:] Remove; NFC",
1249
UTRANS_FORWARD, NULL, 0, NULL, &errorCode);
1253
// NOTE(review): capacity is taken from dstLen, while the buffer being
// transliterated (upperStr) was sized from srcLen above - confirm the
// transliterator cannot write past the end of upperStr.
const int32_t capacity = dstLen;
1254
int32_t len = srcLen / sizeof(USHORT);
1255
int32_t limit = len;
1257
icu->utransTransUChars(trans, reinterpret_cast<UChar*>(upperStr.begin()),
1258
&len, capacity, 0, &limit, &errorCode);
1259
icu->utransClose(trans);
1261
// Back from code units to bytes.
len *= sizeof(USHORT);
1262
if (ULONG(len) > dstLen)
1263
len = INTL_BAD_STR_LENGTH;
1266
src = upperStr.begin();
1269
return INTL_BAD_STR_LENGTH;
1271
// Case-insensitive only: upper-case into the scratch buffer.
else if (attributes & TEXTTYPE_ATTR_CASE_INSENSITIVE)
1273
srcLen = utf16UpperCase(srcLen, src,
1274
srcLen, upperStr.getBuffer(srcLen / sizeof(USHORT)), exceptions);
1275
src = upperStr.begin();
1278
// convert UTF-16 to UTF-32
1281
return utf16ToUtf32(srcLen, src, dstLen, dst, &errCode, &errPosition) / sizeof(ULONG);
1285
UnicodeUtil::ICU* UnicodeUtil::Utf16Collation::loadICU(
1286
const Firebird::string& collVersion, const Firebird::string& locale,
1287
const Firebird::string& configInfo)
1289
ObjectsArray<string> versions;
1290
getVersions(configInfo, versions);
1292
for (ObjectsArray<string>::const_iterator i(versions.begin()); i != versions.end(); ++i)
1294
ICU* icu = UnicodeUtil::loadICU(*i, configInfo);
1298
if (locale.hasData())
1300
int avail = icu->ulocCountAvailable();
1302
while (--avail >= 0)
1304
if (locale == icu->ulocGetAvailable(avail))
1312
char version[U_MAX_VERSION_STRING_LENGTH];
1313
icu->uVersionToString(icu->collVersion, version);
1315
if (collVersion != version)