2
* PROGRAM: JRD International support
4
* DESCRIPTION: INTL Utility functions
6
* The contents of this file are subject to the Initial
7
* Developer's Public License Version 1.0 (the "License");
8
* you may not use this file except in compliance with the
9
* License. You may obtain a copy of the License at
10
* http://www.ibphoenix.com/main.nfs?a=ibphoenix&page=ibp_idpl.
12
* Software distributed under the License is distributed AS IS,
13
* WITHOUT WARRANTY OF ANY KIND, either express or implied.
14
* See the License for the specific language governing rights
15
* and limitations under the License.
17
* The Original Code was created by Adriano dos Santos Fernandes
18
* for the Firebird Open Source RDBMS project.
20
* Copyright (c) 2006 Adriano dos Santos Fernandes <adrianosf@uol.com.br>
21
* and all contributors signed below.
23
* All Rights Reserved.
24
* Contributor(s): ______________________________________.
28
#include "../jrd/IntlUtil.h"
29
#include "../jrd/unicode_util.h"
30
#include "../jrd/intl_classes.h"
31
#include "../intl/country_codes.h"
32
#include "../common/classes/auto.h"
33
#include "../common/classes/Aligner.h"
36
using Jrd::UnicodeUtil;
43
// Constructor: takes a charset pointer and a UTF-16 collation pointer.
TextTypeImpl(charset* a_cs, UnicodeUtil::Utf16Collation* a_collation)
45
// Member initializer: keeps the collation pointer handed in by the caller.
collation(a_collation)
51
// Invoke the charset's destroy callback, but only when one is installed.
// NOTE(review): presumably the destructor body - its header is not visible here.
if (cs->charset_fn_destroy)
52
cs->charset_fn_destroy(cs);
59
// UTF-16 collation object; the unicode* texttype callbacks in this file
// reach it through tt->texttype_impl->collation.
UnicodeUtil::Utf16Collation* collation;
67
// Forward declarations of the file-local texttype callbacks.
// IntlUtil::initUnicodeCollation() stores them in the texttype function slots
// (destroy, key_length, string_to_key, compare and, conditionally, canonical).
static void unicodeDestroy(texttype* tt);
68
static USHORT unicodeKeyLength(texttype* tt, USHORT len);
69
static USHORT unicodeStrToKey(texttype* tt, USHORT srcLen, const UCHAR* src,
70
USHORT dstLen, UCHAR* dst, USHORT keyType);
71
static SSHORT unicodeCompare(texttype* tt, ULONG len1, const UCHAR* str1,
72
ULONG len2, const UCHAR* str2, INTL_BOOL* errorFlag);
73
static ULONG unicodeCanonical(texttype* tt, ULONG srcLen, const UCHAR* src,
74
ULONG dstLen, UCHAR* dst);
77
// Serializes the entries of `map` into a string encoded in character set `cs`.
// For each entry the visible code appends: escaped name, '=', escaped value;
// a ';' is appended after advancing to the next entry.
// Both '=' and ';' are converted from a single UTF-16 unit into `cs`.
string IntlUtil::generateSpecificAttributes(
78
Jrd::CharSet* cs, SpecificAttributesMap& map)
80
bool found = map.getFirst();
85
// Scratch buffer that receives one converted character in the target charset.
UCHAR c[sizeof(ULONG)];
88
SpecificAttribute* attribute = map.current();
90
// Attribute name, with special characters escaped.
s += escapeAttribute(cs, attribute->first);
92
// '=' is held as a UTF-16 unit and converted into `cs` before being appended.
const USHORT equalChar = '=';
94
size = cs->getConvFromUnicode().convert(
95
sizeof(equalChar), (const UCHAR*)&equalChar,
98
s += string((const char*)&c, size);
100
// Attribute value, with special characters escaped.
s += escapeAttribute(cs, attribute->second);
102
found = map.getNext();
106
// Entry separator.
// NOTE(review): presumably emitted only when another entry follows - the
// guarding condition is not visible here; confirm.
const USHORT semiColonChar = ';';
107
size = cs->getConvFromUnicode().convert(
108
sizeof(semiColonChar), (const UCHAR*)&semiColonChar, sizeof(c), c);
110
s += string((const char*)&c, size);
118
// Parses `len` bytes at `s`, encoded in character set `cs`, as a sequence of
// attribute definitions and stores each (name, value) pair in `map`.
// From the visible checks the expected shape is: name '=' value ';' with
// optional spaces around the tokens; names consist of A-Z, a-z, '-' and '_'.
// Characters are read with readAttributeChar(), so escape sequences are
// consumed as one unit, and name/value are passed through unescapeAttribute().
// NOTE(review): the return statements are not visible here; presumably false
// is returned on malformed input - confirm.
bool IntlUtil::parseSpecificAttributes(
119
Jrd::CharSet* cs, ULONG len, const UCHAR* s, SpecificAttributesMap* map)
121
// Note that the map isn't cleared.
122
// Old attributes will be combined with the new ones.
125
const UCHAR* const end = s + len;
128
// Read the first character; `p` and `size` describe the current character.
readAttributeChar(cs, &p, end, &size, true);
132
// Skip spaces that precede the attribute name.
while (p < end && size == cs->getSpaceLength() &&
133
memcmp(p, cs->getSpace(), cs->getSpaceLength()) == 0)
135
if (!readAttributeChar(cs, &p, end, &size, true))
139
// Start of the attribute name.
const UCHAR* start = p;
141
// Holds the current character converted to UTF-16 for classification.
UCHAR uc[sizeof(ULONG)];
146
uSize = cs->getConvToUnicode().convert(size, p, sizeof(uc), uc);
149
((*(USHORT*)uc >= 'A' && *(USHORT*)uc <= 'Z') ||
150
(*(USHORT*)uc >= 'a' && *(USHORT*)uc <= 'z') ||
151
*(USHORT*)uc == '-' || *(USHORT*)uc == '_'))
153
if (!readAttributeChar(cs, &p, end, &size, true))
163
// The name is the byte range [start, p), unescaped afterwards.
string name = string((const char*)start, p - start);
164
name = unescapeAttribute(cs, name);
166
// Skip spaces between the name and the '='.
while (p < end && size == cs->getSpaceLength() &&
167
memcmp(p, cs->getSpace(), cs->getSpaceLength()) == 0)
169
if (!readAttributeChar(cs, &p, end, &size, true))
173
uSize = cs->getConvToUnicode().convert(size, p, sizeof(uc), uc);
175
// The current character must convert to exactly one UTF-16 unit equal to '='.
if (uSize != 2 || *(USHORT*)uc != '=')
180
if (readAttributeChar(cs, &p, end, &size, true))
182
// Skip spaces between the '=' and the value.
while (p < end && size == cs->getSpaceLength() &&
183
memcmp(p, cs->getSpace(), cs->getSpaceLength()) == 0)
185
if (!readAttributeChar(cs, &p, end, &size, true))
189
// Start of the value; endNoSpace tracks the end of the last non-space
// character so that trailing spaces are left out of the value.
const UCHAR* endNoSpace = start = p;
193
uSize = cs->getConvToUnicode().convert(size, p, sizeof(uc), uc);
195
// Anything other than a single UTF-16 ';' is treated as part of the value.
if (uSize != 2 || *(USHORT*)uc != ';')
197
if (!(size == cs->getSpaceLength() &&
198
memcmp(p, cs->getSpace(), cs->getSpaceLength()) == 0))
200
endNoSpace = p + size;
203
if (!readAttributeChar(cs, &p, end, &size, true))
210
// The value is the byte range [start, endNoSpace), unescaped.
value = unescapeAttribute(cs,
211
string((const char*)start, endNoSpace - start));
214
readAttributeChar(cs, &p, end, &size, true); // skip the semicolon
220
// Store the pair in the caller's map.
map->put(name, value);
227
// Widens every byte of `ascii` to one 16-bit unit and appends its raw bytes
// (native byte order of USHORT) to the result string.
string IntlUtil::convertAsciiToUtf16(const string& ascii)
230
const char* end = ascii.c_str() + ascii.length();
232
for (const char* p = ascii.c_str(); p < end; ++p)
234
// Read the byte as unsigned so values >= 0x80 are not sign-extended.
USHORT c = *(UCHAR*) p;
235
s.append((char*) &c, sizeof(c));
242
// Narrows a string holding 16-bit units to one byte per unit.
// The input length must be a multiple of sizeof(USHORT) (asserted).
// NOTE(review): the range check on each unit and the writes to `*error` are
// not visible here; confirm how out-of-range units are reported.
string IntlUtil::convertUtf16ToAscii(const string& utf16, bool* error)
244
fb_assert(utf16.length() % sizeof(USHORT) == 0);
247
const USHORT* end = (const USHORT*) (utf16.c_str() + utf16.length());
249
for (const USHORT* p = (const USHORT*) utf16.c_str(); p < end; ++p)
252
// Keep only the low byte of the unit.
s.append((UCHAR) *p);
266
// csconvert callback: converts nSrc bytes of ASCII at pSrc into UTF-16 at ppDest.
// Parameters: obj - converter object (must not be NULL);
//   nSrc/pSrc - source length in bytes and source buffer;
//   nDest/ppDest - destination capacity in bytes and destination buffer,
//   ppDest == NULL requests a length estimate;
//   err_code - receives CS_BAD_INPUT or CS_TRUNCATION_ERROR on failure;
//   err_position - receives the number of source bytes consumed.
// Returns the number of bytes written to the destination.
ULONG IntlUtil::cvtAsciiToUtf16(csconvert* obj, ULONG nSrc, const UCHAR* pSrc,
267
ULONG nDest, UCHAR* ppDest, USHORT* err_code, ULONG* err_position)
269
/**************************************
271
* c v t A s c i i T o U t f 1 6
273
**************************************
275
* Functional description
276
* Convert CHARACTER SET ASCII to UTF-16.
277
* Byte values below 128 treated as ASCII.
278
* Byte values >= 128 create BAD_INPUT
280
*************************************/
281
fb_assert(obj != NULL);
282
fb_assert((pSrc != NULL) || (ppDest == NULL));
283
fb_assert(err_code != NULL);
286
if (ppDest == NULL) /* length estimate needed? */
289
// Output goes through OutAligner<USHORT>, so ppDest is not written through a
// raw USHORT* cast.
Firebird::OutAligner<USHORT> d(ppDest, nDest);
292
// Remember both starting positions to compute bytes written / consumed.
const USHORT* const pStart = pDest;
293
const UCHAR* const pStart_src = pSrc;
294
// Continue while there is room for one output unit and one input byte.
while (nDest >= sizeof(*pDest) && nSrc >= sizeof(*pSrc)) {
296
*err_code = CS_BAD_INPUT;
300
nDest -= sizeof(*pDest);
301
nSrc -= sizeof(*pSrc);
303
// Source bytes left over without an earlier error: destination was too small.
if (!*err_code && nSrc) {
304
*err_code = CS_TRUNCATION_ERROR;
306
// Source offset, in bytes, where conversion stopped.
*err_position = (pSrc - pStart_src) * sizeof(*pSrc);
308
// Number of bytes produced.
return ((pDest - pStart) * sizeof(*pDest));
312
// csconvert callback: converts nSrc bytes of UTF-16 at ppSrc into ASCII at pDest.
// Parameters: obj - converter object (must not be NULL);
//   nSrc/ppSrc - source length in bytes and source buffer;
//   nDest/pDest - destination capacity in bytes and destination buffer,
//   pDest == NULL requests a length estimate;
//   err_code - receives CS_CONVERT_ERROR or CS_TRUNCATION_ERROR on failure;
//   err_position - receives the number of source bytes consumed.
// Returns the number of bytes written to the destination.
ULONG IntlUtil::cvtUtf16ToAscii(csconvert* obj, ULONG nSrc, const UCHAR* ppSrc,
313
ULONG nDest, UCHAR* pDest, USHORT* err_code, ULONG* err_position)
315
/**************************************
317
* c v t U t f 1 6 T o A s c i i
319
**************************************
321
* Functional description
322
* Convert UTF16 to CHARACTER SET ASCII.
323
* Byte values below 128 treated as ASCII.
324
* Byte values >= 128 create CONVERT_ERROR
326
*************************************/
327
fb_assert(obj != NULL);
328
fb_assert((ppSrc != NULL) || (pDest == NULL));
329
fb_assert(err_code != NULL);
332
if (pDest == NULL) /* length estimate needed? */
335
// Input is read through Aligner<USHORT> instead of casting ppSrc directly.
Firebird::Aligner<USHORT> s(ppSrc, nSrc);
336
const USHORT* pSrc = s;
338
// Remember both starting positions to compute bytes written / consumed.
const UCHAR* const pStart = pDest;
339
const USHORT* const pStart_src = pSrc;
340
// Continue while there is room for one output byte and one input unit.
while (nDest >= sizeof(*pDest) && nSrc >= sizeof(*pSrc)) {
342
*err_code = CS_CONVERT_ERROR;
346
nDest -= sizeof(*pDest);
347
nSrc -= sizeof(*pSrc);
349
// Source bytes left over without an earlier error: destination was too small.
if (!*err_code && nSrc) {
350
*err_code = CS_TRUNCATION_ERROR;
352
// Source offset, in bytes, where conversion stopped.
*err_position = (pSrc - pStart_src) * sizeof(*pSrc);
354
// Number of bytes produced.
return ((pDest - pStart) * sizeof(*pDest));
358
// Fills `cs` as a narrow charset named "ASCII" and installs the
// ASCII <-> UTF-16 converters defined in this file.
void IntlUtil::initAsciiCharset(charset* cs)
360
initNarrowCharset(cs, "ASCII");
361
initConvert(&cs->charset_to_unicode, cvtAsciiToUtf16);
362
initConvert(&cs->charset_from_unicode, cvtUtf16ToAscii);
366
// Resets `cvt` to zero, then sets its version, the name "DIRECT" and the
// conversion function `func`.
void IntlUtil::initConvert(csconvert* cvt, pfn_INTL_convert func)
368
// Zero the whole structure first so every field not set below is 0/NULL.
memset(cvt, 0, sizeof(*cvt));
369
cvt->csconvert_version = CSCONVERT_VERSION_1;
370
cvt->csconvert_name = (const ASCII*) "DIRECT";
371
cvt->csconvert_fn_convert = func;
375
// Resets `cs` to zero and fills it as a single-byte, ASCII-based charset:
// one byte per character, a one-byte space character " ", and no
// well-formedness callback. `name` is stored as given, not copied.
void IntlUtil::initNarrowCharset(charset* cs, const ASCII* name)
377
// Zero the whole structure first so every field not set below is 0/NULL.
memset(cs, 0, sizeof(*cs));
378
cs->charset_version = CHARSET_VERSION_1;
379
cs->charset_name = name;
380
cs->charset_flags |= CHARSET_ASCII_BASED;
381
cs->charset_min_bytes_per_char = 1;
382
cs->charset_max_bytes_per_char = 1;
383
cs->charset_space_length = 1;
384
cs->charset_space_character = (const BYTE*) " ";
385
cs->charset_fn_well_formed = NULL;
389
// Initializes texttype `tt` as a Unicode collation on top of charset `cs`.
// Visible steps: copy the name, install the unicode* callbacks, parse the
// specific attributes (encoded in `cs`), convert each name/value to UTF-16,
// create a Utf16Collation from them and attach it via a new TextTypeImpl.
// NOTE(review): the return statements and any error handling are not visible
// here; confirm that nameCopy and charSet are released on every failure path.
bool IntlUtil::initUnicodeCollation(texttype* tt, charset* cs, const ASCII* name,
390
USHORT attributes, const UCharBuffer& specificAttributes, const string& configInfo)
392
// name comes from stack. Copy it.
393
// The copy is released by unicodeDestroy() with delete [].
ASCII* nameCopy = new ASCII[strlen(name) + 1];
394
strcpy(nameCopy, name);
395
tt->texttype_name = nameCopy;
397
tt->texttype_version = TEXTTYPE_VERSION_1;
398
tt->texttype_country = CC_INTL;
399
// Install the callbacks implemented further down in this file.
tt->texttype_fn_destroy = unicodeDestroy;
400
tt->texttype_fn_compare = unicodeCompare;
401
tt->texttype_fn_key_length = unicodeKeyLength;
402
tt->texttype_fn_string_to_key = unicodeStrToKey;
404
// Attributes as parsed, still encoded in `cs`.
IntlUtil::SpecificAttributesMap map;
406
Jrd::CharSet* charSet = NULL;
410
charSet = Jrd::CharSet::createInstance(*getDefaultMemoryPool(), 0, cs);
411
IntlUtil::parseSpecificAttributes(charSet, specificAttributes.getCount(),
412
specificAttributes.begin(), &map);
421
// Same attributes with names and values converted to UTF-16.
IntlUtil::SpecificAttributesMap map16;
423
bool found = map.getFirst();
431
// Each conversion is done in two calls: the first passes a NULL destination
// (presumably to obtain the required length), the second converts into the
// buffer sized by the first.
s1.resize(cs->charset_to_unicode.csconvert_fn_convert(
432
&cs->charset_to_unicode, map.current()->first.length(), NULL, 0, NULL, &errCode, &errPosition));
433
s1.resize(cs->charset_to_unicode.csconvert_fn_convert(
434
&cs->charset_to_unicode, map.current()->first.length(), (UCHAR*) map.current()->first.c_str(),
435
s1.getCapacity(), s1.begin(), &errCode, &errPosition));
437
s2.resize(cs->charset_to_unicode.csconvert_fn_convert(
438
&cs->charset_to_unicode, map.current()->second.length(), NULL, 0, NULL, &errCode, &errPosition));
439
s2.resize(cs->charset_to_unicode.csconvert_fn_convert(
440
&cs->charset_to_unicode, map.current()->second.length(), (UCHAR*) map.current()->second.c_str(),
441
s2.getCapacity(), s2.begin(), &errCode, &errPosition));
443
map16.put(string((char*) s1.begin(), s1.getCount()), string((char*) s2.begin(), s2.getCount()));
445
found = map.getNext();
448
UnicodeUtil::Utf16Collation* collation =
449
UnicodeUtil::Utf16Collation::create(tt, attributes, map16, configInfo);
454
// Released by unicodeDestroy() with delete.
tt->texttype_impl = new TextTypeImpl(cs, collation);
456
// The canonical callback is installed only when a canonical width is set.
if (tt->texttype_canonical_width != 0)
457
tt->texttype_fn_canonical = unicodeCanonical;
463
// Lower-cases `src` (srcLen bytes in charset `cs`) into `dst` (dstLen bytes).
// The text is converted to UTF-16, lower-cased with
// UnicodeUtil::utf16LowerCase, and converted back to `cs`.
// Returns the value returned by the final conversion into `dst`.
ULONG IntlUtil::toLower(Jrd::CharSet* cs, ULONG srcLen, const UCHAR* src, ULONG dstLen, UCHAR* dst,
464
const ULONG* exceptions)
466
// Size to reserve for the UTF-16 form, as reported by the converter.
const ULONG utf16_length = cs->getConvToUnicode().convertLength(srcLen);
467
Firebird::HalfStaticArray<UCHAR, BUFFER_SMALL> utf16_str;
470
if (dstLen >= utf16_length) // if dst buffer is sufficiently large, use it as intermediate
473
utf16_ptr = utf16_str.getBuffer(utf16_length);
476
// From here on srcLen holds the length of the UTF-16 text.
srcLen = cs->getConvToUnicode().convert(srcLen, src, utf16_length, utf16_ptr);
478
// convert to lowercase
479
Firebird::HalfStaticArray<UCHAR, BUFFER_SMALL> lower_str;
480
srcLen = UnicodeUtil::utf16LowerCase(srcLen, Firebird::Aligner<USHORT>(utf16_ptr, srcLen),
481
utf16_length, Firebird::OutAligner<USHORT>(lower_str.getBuffer(utf16_length), utf16_length),
484
// convert to original character set
485
return cs->getConvFromUnicode().convert(srcLen, lower_str.begin(), dstLen, dst);
489
// Upper-cases `src` (srcLen bytes in charset `cs`) into `dst` (dstLen bytes).
// The text is converted to UTF-16, upper-cased with
// UnicodeUtil::utf16UpperCase, and converted back to `cs`.
// Returns the value returned by the final conversion into `dst`.
ULONG IntlUtil::toUpper(Jrd::CharSet* cs, ULONG srcLen, const UCHAR* src, ULONG dstLen, UCHAR* dst,
490
const ULONG* exceptions)
492
// Size to reserve for the UTF-16 form, as reported by the converter.
const ULONG utf16_length = cs->getConvToUnicode().convertLength(srcLen);
493
Firebird::HalfStaticArray<UCHAR, BUFFER_SMALL> utf16_str;
496
if (dstLen >= utf16_length) // if dst buffer is sufficiently large, use it as intermediate
499
utf16_ptr = utf16_str.getBuffer(utf16_length);
502
// From here on srcLen holds the length of the UTF-16 text.
srcLen = cs->getConvToUnicode().convert(srcLen, src, utf16_length, utf16_ptr);
504
// convert to uppercase
505
Firebird::HalfStaticArray<UCHAR, BUFFER_SMALL> upper_str;
506
srcLen = UnicodeUtil::utf16UpperCase(srcLen, Firebird::Aligner<USHORT>(utf16_ptr, srcLen),
507
utf16_length, Firebird::OutAligner<USHORT>(upper_str.getBuffer(utf16_length), utf16_length),
510
// convert to original character set
511
return cs->getConvFromUnicode().convert(srcLen, upper_str.begin(), dstLen, dst);
515
// Reads one character of charset `cs` from the range [*s, end) and stores its
// byte length in `*size`.
// NOTE(review): how `*s` is advanced and what is returned at end of input is
// not visible here; callers loop on the boolean result.
bool IntlUtil::readOneChar(Jrd::CharSet* cs, const UCHAR** s, const UCHAR* end, ULONG* size)
526
// Scratch buffer for the extracted character; only its length is kept.
UCHAR c[sizeof(ULONG)];
527
// Take a substring of length 1 starting at position 0 of the remaining input.
*size = cs->substring(end - *s, *s, sizeof(c), c, 0, 1);
533
// Transform the ICU-VERSION attribute (given by the user) into COLL-VERSION (to be stored).
// Parses `specificAttributes` (encoded in `cs`), asks UnicodeUtil::getCollVersion
// for the collation version, drops ICU-VERSION and any old COLL-VERSION, stores
// the new COLL-VERSION when it is non-empty, and writes the regenerated
// attribute string to `newSpecificAttributes`.
// NOTE(review): the return statements are not visible here; presumably false
// is returned when parsing or the version lookup fails - confirm.
534
bool IntlUtil::setupIcuAttributes(charset* cs, const string& specificAttributes,
535
const string& configInfo, string& newSpecificAttributes)
537
// The CharSet wrapper is held in an AutoPtr.
AutoPtr<Jrd::CharSet> charSet(Jrd::CharSet::createInstance(*getDefaultMemoryPool(), 0, cs));
539
IntlUtil::SpecificAttributesMap map;
540
if (!IntlUtil::parseSpecificAttributes(charSet, specificAttributes.length(),
541
(const UCHAR*) specificAttributes.begin(), &map))
547
// The result of get() is not checked on this line.
map.get("ICU-VERSION", icuVersion);
550
if (!UnicodeUtil::getCollVersion(icuVersion, configInfo, collVersion))
553
// Remove both keys before (possibly) re-adding COLL-VERSION below.
map.remove("ICU-VERSION");
554
map.remove("COLL-VERSION");
556
if (collVersion.hasData())
557
map.put("COLL-VERSION", collVersion);
559
newSpecificAttributes = IntlUtil::generateSpecificAttributes(charSet, map);
564
// Returns a copy of `s` (encoded in charset `cs`) prepared for the attribute
// syntax: characters that convert to '\\', '=' or ';' get extra bytes
// inserted before them; every character is then copied unchanged.
string IntlUtil::escapeAttribute(Jrd::CharSet* cs, const string& s)
567
const UCHAR* p = (const UCHAR*)s.begin();
568
const UCHAR* end = (const UCHAR*)s.end();
571
// Walk the string one character (not one byte) at a time.
while (readOneChar(cs, &p, end, &size))
573
// Current character converted to UTF-16 for comparison.
UCHAR uc[sizeof(ULONG)];
575
ULONG uSize = cs->getConvToUnicode().convert(size, p, sizeof(uc), uc);
579
if (*(USHORT*)uc == '\\' || *(USHORT*)uc == '=' || *(USHORT*)uc == ';')
582
// One UTF-16 unit from `uc` is converted back to `cs` and appended first.
// NOTE(review): presumably `uc` is overwritten with the escape character by a
// line not visible here - confirm.
UCHAR bytes[sizeof(ULONG)];
584
ULONG bytesSize = cs->getConvFromUnicode().convert(
585
sizeof(USHORT), uc, sizeof(bytes), bytes);
587
ret.append(string((const char*)bytes, bytesSize));
591
// Append the original character bytes.
ret.append(string((const char*)p, size));
598
// Returns a copy of `s` (encoded in charset `cs`) rebuilt from the characters
// yielded by readAttributeChar() with returnEscape == false.
string IntlUtil::unescapeAttribute(Jrd::CharSet* cs, const string& s)
601
const UCHAR* p = (const UCHAR*)s.begin();
602
const UCHAR* end = (const UCHAR*)s.end();
605
// Append each character returned by the reader.
while (readAttributeChar(cs, &p, end, &size, false))
606
ret.append(string((const char*)p, size));
612
// Tests whether the `size` bytes at `s` (encoded in charset `cs`) convert to
// exactly one UTF-16 unit equal to the backslash character.
bool IntlUtil::isAttributeEscape(Jrd::CharSet* cs, const UCHAR* s, ULONG size)
614
UCHAR uc[sizeof(ULONG)];
615
ULONG uSize = cs->getConvToUnicode().convert(size, s, sizeof(uc), uc);
617
// uSize is in bytes: 2 means a single 16-bit unit.
if (uSize == 2 && *(USHORT*)uc == '\\')
624
// Reads the next attribute character from [*s, end) in charset `cs`.
// When the character read is the escape character, a second character is read
// as well, so an escape sequence is consumed as one unit.
// NOTE(review): how `returnEscape` changes `*s` / `*size` is not visible here.
bool IntlUtil::readAttributeChar(Jrd::CharSet* cs, const UCHAR** s, const UCHAR* end, ULONG* size, bool returnEscape)
626
if (readOneChar(cs, s, end, size))
628
if (isAttributeEscape(cs, *s, *size))
631
// Remember the length of the escape character before reading the next one.
ULONG firstSize = *size;
633
if (readOneChar(cs, s, end, size))
652
// texttype destroy callback: frees what initUnicodeCollation() allocated for
// `tt` - the name copy (new[]) and the TextTypeImpl object (new).
static void unicodeDestroy(texttype* tt)
654
// texttype_name is declared const; the cast allows delete [] on the copy.
delete [] const_cast<ASCII*>(tt->texttype_name);
655
delete tt->texttype_impl;
659
// texttype key-length callback: returns the collation's key length for a
// string of `len` bytes in the texttype's charset.
static USHORT unicodeKeyLength(texttype* tt, USHORT len)
661
// The byte length is divided by the charset's maximum bytes per character and
// multiplied by 4 before being handed to the collation.
return tt->texttype_impl->collation->keyLength(
662
len / tt->texttype_impl->cs->charset_max_bytes_per_char * 4);
666
// texttype string-to-key callback: converts `src` (srcLen bytes in the
// texttype's charset) to UTF-16 and asks the collation to build a key of type
// `keyType` into `dst` (dstLen bytes). Returns the collation's result.
static USHORT unicodeStrToKey(texttype* tt, USHORT srcLen, const UCHAR* src,
667
USHORT dstLen, UCHAR* dst, USHORT keyType)
671
charset* cs = tt->texttype_impl->cs;
673
// Buffer for the UTF-16 form of `src`. The converter is invoked twice below;
// the first call presumably sizes this buffer - its arguments are not visible.
HalfStaticArray<UCHAR, BUFFER_SMALL> utf16Str;
678
cs->charset_to_unicode.csconvert_fn_convert(
679
&cs->charset_to_unicode,
687
// Actual conversion into utf16Str; the result is the UTF-16 length in bytes.
ULONG utf16Len = cs->charset_to_unicode.csconvert_fn_convert(
688
&cs->charset_to_unicode,
691
utf16Str.getCapacity(),
696
// NOTE(review): raw cast from a UCHAR buffer to USHORT*; unicodeCanonical
// goes through Firebird::Aligner<USHORT> instead - confirm alignment here.
return tt->texttype_impl->collation->stringToKey(
697
utf16Len, (USHORT*)utf16Str.begin(), dstLen, dst, keyType);
702
// NOTE(review): presumably the error path (handler not visible here).
return INTL_BAD_KEY_LENGTH;
707
// texttype compare callback: converts both strings (in the texttype's
// charset) to UTF-16 and returns the collation's comparison result.
// `errorFlag` is passed through to the collation.
static SSHORT unicodeCompare(texttype* tt, ULONG len1, const UCHAR* str1,
708
ULONG len2, const UCHAR* str2, INTL_BOOL* errorFlag)
714
charset* cs = tt->texttype_impl->cs;
716
// Buffers for the UTF-16 forms of str1 and str2. For each string the
// converter is invoked twice; the first call presumably sizes the buffer -
// its arguments are not visible here.
HalfStaticArray<UCHAR, BUFFER_SMALL> utf16Str1;
717
HalfStaticArray<UCHAR, BUFFER_SMALL> utf16Str2;
722
cs->charset_to_unicode.csconvert_fn_convert(
723
&cs->charset_to_unicode,
731
// Actual conversion of the first string.
ULONG utf16Len1 = cs->charset_to_unicode.csconvert_fn_convert(
732
&cs->charset_to_unicode,
735
utf16Str1.getCapacity(),
741
cs->charset_to_unicode.csconvert_fn_convert(
742
&cs->charset_to_unicode,
750
// Actual conversion of the second string.
ULONG utf16Len2 = cs->charset_to_unicode.csconvert_fn_convert(
751
&cs->charset_to_unicode,
754
utf16Str2.getCapacity(),
759
// NOTE(review): raw casts from UCHAR buffers to USHORT*; unicodeCanonical
// goes through Firebird::Aligner<USHORT> instead - confirm alignment here.
return tt->texttype_impl->collation->compare(
760
utf16Len1, (USHORT*)utf16Str1.begin(),
761
utf16Len2, (USHORT*)utf16Str2.begin(), errorFlag);
771
// texttype canonical callback: converts `src` (srcLen bytes in the texttype's
// charset) to UTF-16 and returns the collation's canonical() result for it,
// written to `dst` (dstLen bytes).
static ULONG unicodeCanonical(texttype* tt, ULONG srcLen, const UCHAR* src, ULONG dstLen, UCHAR* dst)
775
charset* cs = tt->texttype_impl->cs;
777
// Buffer for the UTF-16 form of `src`. The converter is invoked twice below;
// the first call presumably sizes this buffer - its arguments are not visible.
HalfStaticArray<UCHAR, BUFFER_SMALL> utf16Str;
782
cs->charset_to_unicode.csconvert_fn_convert(
783
&cs->charset_to_unicode,
791
// Actual conversion into utf16Str; the result is the UTF-16 length in bytes.
ULONG utf16Len = cs->charset_to_unicode.csconvert_fn_convert(
792
&cs->charset_to_unicode,
795
utf16Str.getCapacity(),
800
// Input and output are wrapped in Aligner / OutAligner rather than cast.
return tt->texttype_impl->collation->canonical(
801
utf16Len, Firebird::Aligner<USHORT>(utf16Str.begin(), utf16Len),
802
dstLen, Firebird::OutAligner<ULONG>(dst, dstLen), NULL);
807
// NOTE(review): presumably the error path (handler not visible here).
return INTL_BAD_KEY_LENGTH;
812
} // namespace Firebird