~ubuntu-branches/ubuntu/oneiric/firebird2.5/oneiric-security : revision 19

1

/*

2

* PROGRAM: JRD International support

3

* MODULE: unicode_util.cpp

4

* DESCRIPTION: Unicode functions

5

*

6

* The contents of this file are subject to the Initial

7

* Developer's Public License Version 1.0 (the "License");

8

* you may not use this file except in compliance with the

9

* License. You may obtain a copy of the License at

10

* http://www.ibphoenix.com/main.nfs?a=ibphoenix&page=ibp_idpl.

11

*

12

* Software distributed under the License is distributed AS IS,

13

* WITHOUT WARRANTY OF ANY KIND, either express or implied.

14

* See the License for the specific language governing rights

15

* and limitations under the License.

16

*

17

* The Original Code was created by Adriano dos Santos Fernandes

18

* for the Firebird Open Source RDBMS project.

19

*

20

21

* and all contributors signed below.

22

*

23

24

* Contributor(s): ______________________________________.

25

*/

26

27

#include "firebird.h"

28

#include "../common/classes/alloc.h"

29

#include "../jrd/constants.h"

30

#include "../jrd/unicode_util.h"

31

#include "../jrd/CharSet.h"

32

#include "../jrd/IntlUtil.h"

33

#include "../jrd/gdsassert.h"

34

#include "../common/classes/auto.h"

35

#include "../common/classes/GenericMap.h"

36

#include "../common/classes/init.h"

37

#include "../common/classes/objects_array.h"

38

#include "../common/classes/rwlock.h"

39

#include "unicode/ustring.h"

40

#include "unicode/utrans.h"

41

#include "unicode/uchar.h"

42

#include "unicode/ucnv.h"

43

#include "unicode/ucol.h"

44

45

46

using namespace Firebird;

47

48

49

namespace Jrd {

50

51

52

const char* const UnicodeUtil::DEFAULT_ICU_VERSION =

53

STRINGIZE(U_ICU_VERSION_MAJOR_NUM)"."STRINGIZE(U_ICU_VERSION_MINOR_NUM);

54

55

56

// encapsulate ICU collations libraries

57

struct UnicodeUtil::ICU

58

{

59

private:

60

ICU(const ICU&); // not implemented

61

ICU& operator =(const ICU&); // not implemented

62

63

public:

64

ICU()

65

: inModule(NULL), ucModule(NULL)

66

{

67

}

68

69

~ICU()

70

{

71

delete ucModule;

72

delete inModule;

73

}

74

75

ModuleLoader::Module* inModule;

76

ModuleLoader::Module* ucModule;

77

UVersionInfo collVersion;

78

79

void (U_EXPORT2 *uInit)(UErrorCode* status);

80

void (U_EXPORT2 *uVersionToString)(UVersionInfo versionArray, char* versionString);

81

82

int32_t (U_EXPORT2 *ulocCountAvailable)();

83

const char* (U_EXPORT2 *ulocGetAvailable)(int32_t n);

84

85

void (U_EXPORT2 *usetClose)(USet* set);

86

int32_t (U_EXPORT2 *usetGetItem)(const USet* set, int32_t itemIndex,

87

UChar32* start, UChar32* end, UChar* str, int32_t strCapacity, UErrorCode* ec);

88

int32_t (U_EXPORT2 *usetGetItemCount)(const USet* set);

89

USet* (U_EXPORT2 *usetOpen)(UChar32 start, UChar32 end);

90

91

void (U_EXPORT2 *ucolClose)(UCollator* coll);

92

int32_t (U_EXPORT2 *ucolGetContractions)(const UCollator* coll, USet* conts, UErrorCode* status);

93

int32_t (U_EXPORT2 *ucolGetSortKey)(const UCollator* coll, const UChar* source,

94

int32_t sourceLength, uint8_t* result, int32_t resultLength);

95

UCollator* (U_EXPORT2 *ucolOpen)(const char* loc, UErrorCode* status);

96

void (U_EXPORT2 *ucolSetAttribute)(UCollator* coll, UColAttribute attr,

97

UColAttributeValue value, UErrorCode* status);

98

UCollationResult (U_EXPORT2 *ucolStrColl)(const UCollator* coll, const UChar* source,

99

int32_t sourceLength, const UChar* target, int32_t targetLength);

100

void (U_EXPORT2 *ucolGetVersion)(const UCollator* coll, UVersionInfo info);

101

102

void (U_EXPORT2 *utransClose)(UTransliterator* trans);

103

UTransliterator* (U_EXPORT2 *utransOpen)(

104

const char* id,

105

UTransDirection dir,

106

const UChar* rules, /* may be Null */

107

int32_t rulesLength, /* -1 if null-terminated */

108

UParseError* parseError, /* may be Null */

109

UErrorCode* status);

110

void (U_EXPORT2 *utransTransUChars)(

111

const UTransliterator* trans,

112

UChar* text,

113

int32_t* textLength,

114

int32_t textCapacity,

115

int32_t start,

116

int32_t* limit,

117

UErrorCode* status);

118

};

119

120

121

// cache ICU module instances to not load and unload many times

122

class UnicodeUtil::ICUModules

123

{

124

typedef GenericMap<Pair<Left<string, ICU*> > > ModulesMap;

125

126

public:

127

explicit ICUModules(MemoryPool&)

128

{

129

}

130

131

~ICUModules()

132

{

133

ModulesMap::Accessor modulesAccessor(&modules());

134

for (bool found = modulesAccessor.getFirst(); found; found = modulesAccessor.getNext())

135

delete modulesAccessor.current()->second;

136

}

137

138

InitInstance<ModulesMap> modules;

139

RWLock lock;

140

};

141

142

namespace {

143

GlobalPtr<UnicodeUtil::ICUModules> icuModules;

144

}

145

146

147

// Collator version string treated as the implicit default: getCollVersion()
// reports it as an empty string, and Utf16Collation::create() substitutes it
// when no COLL-VERSION attribute is supplied.
static const char* const COLL_30_VERSION = "41.128.4.4"; // ICU 3.0 collator version

148

149

150

static void getVersions(const string& configInfo, ObjectsArray<string>& versions)

151

{

152

charset cs;

153

IntlUtil::initAsciiCharset(&cs);

154

155

AutoPtr<CharSet> ascii(Jrd::CharSet::createInstance(*getDefaultMemoryPool(), 0, &cs));

156

157

IntlUtil::SpecificAttributesMap config;

158

IntlUtil::parseSpecificAttributes(ascii, configInfo.length(),

159

(const UCHAR*) configInfo.c_str(), &config);

160

161

string versionsStr;

162

if (config.get("icu_versions", versionsStr))

163

versionsStr.trim();

164

else

165

versionsStr = "default";

166

167

versions.clear();

168

169

size_t start = 0;

170

size_t n;

171

172

for (size_t i = versionsStr.find(' '); i != versionsStr.npos;

173

start = i + 1, i = versionsStr.find(' ', start))

174

{

175

if ((n = versionsStr.find_first_not_of(' ', start)) != versionsStr.npos)

176

start = n;

177

versions.add(versionsStr.substr(start, i - start));

178

}

179

180

if ((n = versionsStr.find_first_not_of(' ', start)) != versionsStr.npos)

181

start = n;

182

versions.add(versionsStr.substr(start));

183

}

184

185

186

// BOCU-1

187

// Key buffer size for a UTF-16 string of `len` bytes: 4 key bytes per
// 2-byte UTF-16 code unit (the same bound utf16ToKey() checks against).
USHORT UnicodeUtil::utf16KeyLength(USHORT len)
{
	const USHORT codeUnits = len / 2;
	return codeUnits * 4;
}

191

192

193

// BOCU-1

194

// Converts a UTF-16 string into a BOCU-1 encoded key.
//
// srcLen - length of src in bytes (must be a multiple of sizeof(USHORT))
// src    - UTF-16 input
// dstLen - capacity of dst in bytes; must be at least 4 bytes per input code unit
// dst    - receives the key
//
// Returns the key length in bytes, or INTL_BAD_KEY_LENGTH when dst is too
// small or ICU reports a failure. Failures used to be caught by fb_assert
// only, so release builds could return a meaningless or truncated length.
USHORT UnicodeUtil::utf16ToKey(USHORT srcLen, const USHORT* src, USHORT dstLen, UCHAR* dst)
{
	fb_assert(srcLen % sizeof(*src) == 0);
	fb_assert(src != NULL && dst != NULL);

	const ULONG srcUnits = srcLen / sizeof(*src);

	if (dstLen < srcUnits * 4)
		return INTL_BAD_KEY_LENGTH;

	UErrorCode status = U_ZERO_ERROR;
	UConverter* conv = ucnv_open("BOCU-1", &status);
	fb_assert(U_SUCCESS(status));

	if (U_FAILURE(status) || !conv)
	{
		ucnv_close(conv);	// accepts NULL
		return INTL_BAD_KEY_LENGTH;
	}

	const int32_t len = ucnv_fromUChars(conv, reinterpret_cast<char*>(dst), dstLen,
		// safe cast - alignment not changed
		reinterpret_cast<const UChar*>(src), srcUnits, &status);
	fb_assert(U_SUCCESS(status));

	ucnv_close(conv);

	// On overflow ICU returns the length it would have needed, which is not
	// the number of bytes actually present in dst.
	if (U_FAILURE(status) || len < 0 || len > dstLen)
		return INTL_BAD_KEY_LENGTH;

	return static_cast<USHORT>(len);
}

215

216

217

// Lower-cases a UTF-16 string code point by code point.
//
// srcLen     - length of src in bytes (multiple of sizeof(USHORT))
// src        - UTF-16 input
// dstLen     - capacity of dst in bytes
// dst        - receives the converted string
// exceptions - optional zero-terminated list of code points that must be
//              copied unchanged; may be NULL
//
// Returns the number of bytes written, or INTL_BAD_STR_LENGTH when dst is too
// small. U16_APPEND checks the capacity only when it writes a surrogate pair,
// and its error flag used to be ignored, so a short dst was overrun.
ULONG UnicodeUtil::utf16LowerCase(ULONG srcLen, const USHORT* src, ULONG dstLen, USHORT* dst,
	const ULONG* exceptions)
{
	// this is more correct but we don't support completely yet
	/***
	fb_assert(srcLen % sizeof(*src) == 0);
	fb_assert(src != NULL && dst != NULL);

	memcpy(dst, src, srcLen);

	UErrorCode errorCode = U_ZERO_ERROR;
	UTransliterator* trans = utrans_open("Any-Lower", UTRANS_FORWARD, NULL, 0, NULL, &errorCode);
	//// TODO: add exceptions in this way: Any-Lower[^\\u03BC] - for U+03BC

	if (errorCode <= 0)
	{
		int32_t capacity = dstLen;
		int32_t len = srcLen / sizeof(USHORT);
		int32_t limit = len;

		utrans_transUChars(trans, reinterpret_cast<UChar*>(dst), &len, capacity, 0, &limit, &errorCode);
		utrans_close(trans);

		len *= sizeof(USHORT);
		if (len > dstLen)
			len = INTL_BAD_STR_LENGTH;

		return len;
	}
	else
		return INTL_BAD_STR_LENGTH;
	***/

	fb_assert(srcLen % sizeof(*src) == 0);
	fb_assert(src != NULL && dst != NULL);

	// From here on both lengths are counted in UTF-16 code units.
	srcLen /= sizeof(*src);
	dstLen /= sizeof(*dst);

	ULONG n = 0;

	for (ULONG i = 0; i < srcLen;)
	{
		uint32_t c;
		U16_NEXT(src, i, srcLen, c);

		bool keepAsIs = false;

		if (exceptions)
		{
			for (const ULONG* p = exceptions; *p; ++p)
			{
				if (*p == c)
				{
					keepAsIs = true;
					break;
				}
			}
		}

		if (!keepAsIs)
			c = u_tolower(c);

		// Single-unit writes are not bounds-checked by U16_APPEND.
		if (n >= dstLen)
			return INTL_BAD_STR_LENGTH;

		bool error = false;
		U16_APPEND(dst, n, dstLen, c, error);

		if (error)	// a surrogate pair did not fit
			return INTL_BAD_STR_LENGTH;
	}

	return n * sizeof(*dst);
}

281

282

283

// Upper-cases a UTF-16 string code point by code point.
//
// srcLen     - length of src in bytes (multiple of sizeof(USHORT))
// src        - UTF-16 input
// dstLen     - capacity of dst in bytes
// dst        - receives the converted string
// exceptions - optional zero-terminated list of code points that must be
//              copied unchanged; may be NULL
//
// Returns the number of bytes written, or INTL_BAD_STR_LENGTH when dst is too
// small. U16_APPEND checks the capacity only when it writes a surrogate pair,
// and its error flag used to be ignored, so a short dst was overrun.
ULONG UnicodeUtil::utf16UpperCase(ULONG srcLen, const USHORT* src, ULONG dstLen, USHORT* dst,
	const ULONG* exceptions)
{
	// this is more correct but we don't support completely yet
	/***
	fb_assert(srcLen % sizeof(*src) == 0);
	fb_assert(src != NULL && dst != NULL);

	memcpy(dst, src, srcLen);

	UErrorCode errorCode = U_ZERO_ERROR;
	UTransliterator* trans = utrans_open("Any-Upper", UTRANS_FORWARD, NULL, 0, NULL, &errorCode);
	//// TODO: add exceptions in this way: Any-Upper[^\\u03BC] - for U+03BC

	if (errorCode <= 0)
	{
		int32_t capacity = dstLen;
		int32_t len = srcLen / sizeof(USHORT);
		int32_t limit = len;

		utrans_transUChars(trans, reinterpret_cast<UChar*>(dst), &len, capacity, 0, &limit, &errorCode);
		utrans_close(trans);

		len *= sizeof(USHORT);
		if (len > dstLen)
			len = INTL_BAD_STR_LENGTH;

		return len;
	}
	else
		return INTL_BAD_STR_LENGTH;
	***/

	fb_assert(srcLen % sizeof(*src) == 0);
	fb_assert(src != NULL && dst != NULL);

	// From here on both lengths are counted in UTF-16 code units.
	srcLen /= sizeof(*src);
	dstLen /= sizeof(*dst);

	ULONG n = 0;

	for (ULONG i = 0; i < srcLen;)
	{
		uint32_t c;
		U16_NEXT(src, i, srcLen, c);

		bool keepAsIs = false;

		if (exceptions)
		{
			for (const ULONG* p = exceptions; *p; ++p)
			{
				if (*p == c)
				{
					keepAsIs = true;
					break;
				}
			}
		}

		if (!keepAsIs)
			c = u_toupper(c);

		// Single-unit writes are not bounds-checked by U16_APPEND.
		if (n >= dstLen)
			return INTL_BAD_STR_LENGTH;

		bool error = false;
		U16_APPEND(dst, n, dstLen, c, error);

		if (error)	// a surrogate pair did not fit
			return INTL_BAD_STR_LENGTH;
	}

	return n * sizeof(*dst);
}

347

348

349

// Converts UTF-16 to UTF-8.
//
// srcLen/dstLen are byte counts. With dst == NULL nothing is converted and a
// buffer size estimate (4 bytes per input code unit) is returned. Otherwise
// the number of bytes written is returned; *err_code is 0 on success, or
// CS_TRUNCATION_ERROR / CS_BAD_INPUT, in which case *err_position holds the
// byte offset in src of the character that could not be processed.
ULONG UnicodeUtil::utf16ToUtf8(ULONG srcLen, const USHORT* src, ULONG dstLen, UCHAR* dst,
	USHORT* err_code, ULONG* err_position)
{
	fb_assert(srcLen % sizeof(*src) == 0);
	fb_assert(src != NULL || dst == NULL);
	fb_assert(err_code != NULL);
	fb_assert(err_position != NULL);

	*err_code = 0;

	const ULONG unitCount = srcLen / sizeof(*src);

	if (!dst)
		return unitCount * 4;

	UCHAR* out = dst;
	const UCHAR* const outEnd = dst + dstLen;

	ULONG pos = 0;

	while (pos < unitCount)
	{
		if (out == outEnd)
		{
			*err_code = CS_TRUNCATION_ERROR;
			*err_position = pos * sizeof(*src);
			break;
		}

		UChar32 c = src[pos++];

		// ASCII fast path
		if (c <= 0x7F)
		{
			*out++ = c;
			continue;
		}

		// Remember where this character starts in case it fails below.
		*err_position = (pos - 1) * sizeof(*src);

		if (UTF_IS_SURROGATE(c))
		{
			UChar32 c2;

			if (!(UTF_IS_SURROGATE_FIRST(c) && pos < unitCount && UTF_IS_TRAIL(c2 = src[pos])))
			{
				*err_code = CS_BAD_INPUT;
				break;
			}

			++pos;
			c = UTF16_GET_PAIR_VALUE(c, c2);
		}

		if (U8_LENGTH(c) > outEnd - out)
		{
			*err_code = CS_TRUNCATION_ERROR;
			break;
		}

		int written = 0;
		U8_APPEND_UNSAFE(out, written, c);
		out += written;
	}

	return (out - dst) * sizeof(*dst);
}

416

417

418

// Converts UTF-8 to UTF-16.
//
// srcLen/dstLen are byte counts. With dst == NULL nothing is converted and a
// buffer size estimate (one UTF-16 unit per input byte) is returned.
// Otherwise the number of bytes written is returned; *err_code is 0 on
// success, or CS_TRUNCATION_ERROR / CS_BAD_INPUT, in which case
// *err_position holds the offset in src of the offending character.
ULONG UnicodeUtil::utf8ToUtf16(ULONG srcLen, const UCHAR* src, ULONG dstLen, USHORT* dst,
	USHORT* err_code, ULONG* err_position)
{
	fb_assert(src != NULL || dst == NULL);
	fb_assert(err_code != NULL);
	fb_assert(err_position != NULL);

	*err_code = 0;

	if (!dst)
		return srcLen * sizeof(*dst);

	USHORT* out = dst;
	const USHORT* const outEnd = dst + dstLen / sizeof(*dst);

	ULONG i = 0;

	while (i < srcLen)
	{
		if (out == outEnd)
		{
			*err_code = CS_TRUNCATION_ERROR;
			*err_position = i;
			break;
		}

		UChar32 c = src[i++];

		// ASCII fast path
		if (c <= 0x7F)
		{
			*out++ = c;
			continue;
		}

		// Remember where this character starts in case it fails below.
		*err_position = i - 1;

		c = utf8_nextCharSafeBody(src, reinterpret_cast<int32_t*>(&i), srcLen, c, -1);

		if (c < 0)
		{
			*err_code = CS_BAD_INPUT;
			break;
		}

		if (c <= 0xFFFF)
		{
			*out++ = c;
			continue;
		}

		// Supplementary code point: needs room for a surrogate pair.
		if (outEnd - out <= 1)
		{
			*err_code = CS_TRUNCATION_ERROR;
			break;
		}

		*out++ = UTF16_LEAD(c);
		*out++ = UTF16_TRAIL(c);
	}

	return (out - dst) * sizeof(*dst);
}

477

478

479

// Converts UTF-16 to UTF-32.
//
// srcLen/dstLen are byte counts. With dst == NULL nothing is converted and a
// buffer size estimate (one UTF-32 unit per UTF-16 unit) is returned.
// Otherwise the number of bytes written is returned, *err_position receives
// the byte offset in src where processing stopped, and *err_code is 0,
// CS_BAD_INPUT (lead surrogate without trail) or CS_TRUNCATION_ERROR (input
// left over because dst is full).
ULONG UnicodeUtil::utf16ToUtf32(ULONG srcLen, const USHORT* src, ULONG dstLen, ULONG* dst,
	USHORT* err_code, ULONG* err_position)
{
	fb_assert(srcLen % sizeof(*src) == 0);
	fb_assert(src != NULL || dst == NULL);
	fb_assert(err_code != NULL);
	fb_assert(err_position != NULL);

	*err_code = 0;

	if (!dst)
		return srcLen / sizeof(*src) * sizeof(*dst);

	// based on u_strToUTF32 from ICU
	const USHORT* in = src;
	const USHORT* const inEnd = src + srcLen / sizeof(*src);
	ULONG* out = dst;
	const ULONG* const outEnd = dst + dstLen / sizeof(*dst);

	while (in < inEnd && out < outEnd)
	{
		ULONG ch = *in++;

		if (UTF_IS_LEAD(ch))
		{
			ULONG ch2;

			if (!(in < inEnd && UTF_IS_TRAIL(ch2 = *in)))
			{
				*err_code = CS_BAD_INPUT;
				--in;	// report the position of the unpaired lead
				break;
			}

			ch = UTF16_GET_PAIR_VALUE(ch, ch2);
			++in;
		}

		*out++ = ch;
	}

	*err_position = (in - src) * sizeof(*src);

	if (in < inEnd && *err_code == 0)
		*err_code = CS_TRUNCATION_ERROR;

	return (out - dst) * sizeof(*dst);
}

528

529

530

// Converts UTF-32 to UTF-16.
//
// srcLen/dstLen are byte counts. With dst == NULL nothing is converted and
// srcLen is returned as the buffer size estimate. Otherwise the number of
// bytes written is returned, *err_position receives the byte offset in src
// where processing stopped, and *err_code is 0, CS_BAD_INPUT (value above
// 0x10FFFF) or CS_TRUNCATION_ERROR (dst too small).
ULONG UnicodeUtil::utf32ToUtf16(ULONG srcLen, const ULONG* src, ULONG dstLen, USHORT* dst,
	USHORT* err_code, ULONG* err_position)
{
	fb_assert(srcLen % sizeof(*src) == 0);
	fb_assert(src != NULL || dst == NULL);
	fb_assert(err_code != NULL);
	fb_assert(err_position != NULL);

	*err_code = 0;

	if (!dst)
		return srcLen;

	// based on u_strFromUTF32 from ICU
	const ULONG* in = src;
	const ULONG* const inEnd = src + srcLen / sizeof(*src);
	USHORT* out = dst;
	const USHORT* const outEnd = dst + dstLen / sizeof(*dst);

	while (in < inEnd && out < outEnd)
	{
		const ULONG ch = *in++;

		if (ch > 0x10FFFF)
		{
			*err_code = CS_BAD_INPUT;
			--in;	// report the position of the bad value
			break;
		}

		if (ch <= 0xFFFF)
		{
			*out++ = ch;
			continue;
		}

		// Supplementary code point: lead first, trail only if it still fits.
		*out++ = UTF16_LEAD(ch);

		if (out >= outEnd)
		{
			*err_code = CS_TRUNCATION_ERROR;
			--out;	// take the orphaned lead back
			break;
		}

		*out++ = UTF16_TRAIL(ch);
	}

	*err_position = (in - src) * sizeof(*src);

	if (in < inEnd && *err_code == 0)
		*err_code = CS_TRUNCATION_ERROR;

	return (out - dst) * sizeof(*dst);
}

583

584

585

// Compares two UTF-16 strings (lengths in bytes) in code point order.
// Returns -1, 0 or 1; *error_flag is always set to false.
SSHORT UnicodeUtil::utf16Compare(ULONG len1, const USHORT* str1, ULONG len2, const USHORT* str2,
	INTL_BOOL* error_flag)
{
	fb_assert(len1 % sizeof(*str1) == 0);
	fb_assert(len2 % sizeof(*str2) == 0);
	fb_assert(str1 != NULL);
	fb_assert(str2 != NULL);
	fb_assert(error_flag != NULL);

	*error_flag = false;

	// safe casts - alignment not changed
	const UChar* const lhs = reinterpret_cast<const UChar*>(str1);
	const UChar* const rhs = reinterpret_cast<const UChar*>(str2);
	const int32_t lhsUnits = len1 / sizeof(*str1);
	const int32_t rhsUnits = len2 / sizeof(*str2);

	const int32_t cmp = u_strCompare(lhs, lhsUnits, rhs, rhsUnits, true);

	if (cmp < 0)
		return -1;

	return cmp > 0 ? 1 : 0;
}

602

603

604

// Number of code points in a UTF-16 string whose length is given in bytes.
ULONG UnicodeUtil::utf16Length(ULONG len, const USHORT* str)
{
	fb_assert(len % sizeof(*str) == 0);

	// safe cast - alignment not changed
	const UChar* const text = reinterpret_cast<const UChar*>(str);
	const int32_t units = len / sizeof(*str);

	return u_countChar32(text, units);
}

610

611

612

// Copies `length` characters of src, starting at character index startPos,
// into dst. A surrogate pair counts as one character.
//
// srcLen/dstLen are byte counts. Returns the number of bytes written to dst;
// the copy stops early when src or dst is exhausted.
//
// Changes from the previous version:
//  - a surrogate pair is copied only when both units fit, so the trail unit
//    can no longer be written one element past the end of dst;
//  - the copy is bounded by a counter instead of `startPos + length`, which
//    could wrap around for large lengths and yield an empty result.
ULONG UnicodeUtil::utf16Substring(ULONG srcLen, const USHORT* src, ULONG dstLen, USHORT* dst,
	ULONG startPos, ULONG length)
{
	fb_assert(srcLen % sizeof(*src) == 0);
	fb_assert(src != NULL && dst != NULL);

	if (length == 0)
		return 0;

	const USHORT* const srcEnd = src + srcLen / sizeof(*src);
	USHORT* out = dst;
	const USHORT* const dstEnd = dst + dstLen / sizeof(*dst);

	// Skip the first startPos characters.
	for (ULONG skipped = 0; src < srcEnd && skipped < startPos; ++skipped)
	{
		const ULONG ch = *src++;

		if (UTF_IS_LEAD(ch) && src < srcEnd && UTF_IS_TRAIL(*src))
			++src;
	}

	// Copy up to `length` characters.
	for (ULONG copied = 0; src < srcEnd && out < dstEnd && copied < length; ++copied)
	{
		const ULONG ch = *src++;
		const bool isPair = UTF_IS_LEAD(ch) && src < srcEnd && UTF_IS_TRAIL(*src);

		if (isPair && dstEnd - out < 2)
			break;	// no room for both halves: stop on a character boundary

		*out++ = ch;

		if (isPair)
			*out++ = *src++;
	}

	return (out - dst) * sizeof(*dst);
}

660

661

662

// Checks that the `len` bytes at str form valid UTF-8. On failure returns
// false and, if offending_position is not NULL, stores the offset of the
// first byte of the malformed sequence there.
INTL_BOOL UnicodeUtil::utf8WellFormed(ULONG len, const UCHAR* str, ULONG* offending_position)
{
	fb_assert(str != NULL);

	ULONG i = 0;

	while (i < len)
	{
		const ULONG sequenceStart = i;
		UChar32 c = str[i++];

		if (c <= 0x7F)
			continue;	// plain ASCII

		c = utf8_nextCharSafeBody(str, reinterpret_cast<int32_t*>(&i), len, c, -1);

		if (c >= 0)
			continue;

		if (offending_position)
			*offending_position = sequenceStart;

		return false;	// malformed
	}

	return true;	// well-formed
}

687

688

689

// Checks that a UTF-16 string (length in bytes) contains no unpaired
// surrogates. On failure returns false and, if offending_position is not
// NULL, stores the byte offset of the offending code unit there.
INTL_BOOL UnicodeUtil::utf16WellFormed(ULONG len, const USHORT* str, ULONG* offending_position)
{
	fb_assert(str != NULL);
	fb_assert(len % sizeof(*str) == 0);

	const ULONG units = len / sizeof(*str);
	ULONG i = 0;

	while (i < units)
	{
		const ULONG unitStart = i;

		uint32_t c;
		U16_NEXT(str, i, units, c);

		// U16_NEXT hands back a lone surrogate unchanged when it cannot pair it.
		const bool loneSurrogate =
			!U_IS_SUPPLEMENTARY(c) && (U16_IS_LEAD(c) || U16_IS_TRAIL(c));

		if (!loneSurrogate)
			continue;

		if (offending_position)
			*offending_position = unitStart * sizeof(*str);

		return false;	// malformed
	}

	return true;	// well-formed
}

713

714

715

// Checks that every element of a UTF-32 string (length in bytes) satisfies
// U_IS_UNICODE_CHAR. On failure returns false and, if offending_position is
// not NULL, stores the byte offset of the first offending element there.
//
// The loop is bounded by an element count. The previous `len -= 4` countdown
// wrapped around, and kept reading past the buffer, if len was ever not a
// multiple of 4 in a build where fb_assert is compiled out.
INTL_BOOL UnicodeUtil::utf32WellFormed(ULONG len, const ULONG* str, ULONG* offending_position)
{
	fb_assert(str != NULL);
	fb_assert(len % sizeof(*str) == 0);

	const ULONG count = len / sizeof(*str);

	for (ULONG i = 0; i < count; ++i)
	{
		if (!U_IS_UNICODE_CHAR(str[i]))
		{
			if (offending_position)
				*offending_position = i * sizeof(*str);

			return false;	// malformed
		}
	}

	return true;	// well-formed
}

737

738

739

UnicodeUtil::ICU* UnicodeUtil::loadICU(const Firebird::string& icuVersion,

740

const Firebird::string& configInfo)

741

{

742

#if defined(WIN_NT)

743

const char* const inTemplate = "icuin%s%s.dll";

744

const char* const ucTemplate = "icuuc%s%s.dll";

745

#elif defined(DARWIN)

746

const char* const inTemplate = "/Library/Frameworks/Firebird.framework/Versions/A/Libraries/libicui18n.dylib";

747

const char* const ucTemplate = "/Library/Frameworks/Firebird.framework/versions/A/Libraries/libicuuc.dylib";

748

#elif defined(HPUX)

749

const char* const inTemplate = "libicui18n.sl.%s%s";

750

const char* const ucTemplate = "libicuuc.sl.%s%s";

751

#else

752

const char* const inTemplate = "libicui18n.so.%s%s";

753

const char* const ucTemplate = "libicuuc.so.%s%s";

754

#endif

755

756

ObjectsArray<string> versions;

757

getVersions(configInfo, versions);

758

759

string version = icuVersion.isEmpty() ? versions[0] : icuVersion;

760

if (version == "default")

761

{

762

version.printf("%d.%d", U_ICU_VERSION_MAJOR_NUM, U_ICU_VERSION_MINOR_NUM);

763

}

764

765

for (ObjectsArray<string>::const_iterator i(versions.begin()); i != versions.end(); ++i)

766

{

767

string majorVersion;

768

string minorVersion;

769

770

if (*i == "default")

771

{

772

majorVersion = STRINGIZE(U_ICU_VERSION_MAJOR_NUM);

773

minorVersion = STRINGIZE(U_ICU_VERSION_MINOR_NUM);

774

}

775

else

776

{

777

const size_t pos = i->find('.');

778

if (pos == i->npos)

779

continue;

780

781

majorVersion = i->substr(0, pos);

782

minorVersion = i->substr(pos + 1);

783

}

784

785

if (version != majorVersion + "." + minorVersion)

786

{

787

continue;

788

}

789

790

ReadLockGuard readGuard(icuModules->lock);

791

792

ICU* icu;

793

if (icuModules->modules().get(version, icu))

794

{

795

return icu;

796

}

797

798

PathName filename;

799

filename.printf(ucTemplate, majorVersion.c_str(), minorVersion.c_str());

800

801

icu = FB_NEW(*getDefaultMemoryPool()) ICU();

802

803

icu->ucModule = ModuleLoader::loadModule(filename);

804

if (!icu->ucModule)

805

{

806

ModuleLoader::doctorModuleExtention(filename);

807

icu->ucModule = ModuleLoader::loadModule(filename);

808

}

809

810

if (!icu->ucModule)

811

{

812

delete icu;

813

continue;

814

}

815

816

filename.printf(inTemplate, majorVersion.c_str(), minorVersion.c_str());

817

818

icu->inModule = ModuleLoader::loadModule(filename);

819

if (!icu->inModule)

820

{

821

ModuleLoader::doctorModuleExtention(filename);

822

icu->inModule = ModuleLoader::loadModule(filename);

823

}

824

825

if (!icu->inModule)

826

{

827

delete icu;

828

continue;

829

}

830

831

string symbol;

832

833

symbol.printf("u_init_%s_%s", majorVersion.c_str(), minorVersion.c_str());

834

icu->ucModule->findSymbol(symbol, icu->uInit);

835

836

symbol.printf("u_versionToString_%s_%s", majorVersion.c_str(), minorVersion.c_str());

837

icu->ucModule->findSymbol(symbol, icu->uVersionToString);

838

839

symbol.printf("uloc_countAvailable_%s_%s", majorVersion.c_str(), minorVersion.c_str());

840

icu->ucModule->findSymbol(symbol, icu->ulocCountAvailable);

841

842

symbol.printf("uloc_getAvailable_%s_%s", majorVersion.c_str(), minorVersion.c_str());

843

icu->ucModule->findSymbol(symbol, icu->ulocGetAvailable);

844

845

symbol.printf("uset_close_%s_%s", majorVersion.c_str(), minorVersion.c_str());

846

icu->ucModule->findSymbol(symbol, icu->usetClose);

847

848

symbol.printf("uset_getItem_%s_%s", majorVersion.c_str(), minorVersion.c_str());

849

icu->ucModule->findSymbol(symbol, icu->usetGetItem);

850

851

symbol.printf("uset_getItemCount_%s_%s", majorVersion.c_str(), minorVersion.c_str());

852

icu->ucModule->findSymbol(symbol, icu->usetGetItemCount);

853

854

symbol.printf("uset_open_%s_%s", majorVersion.c_str(), minorVersion.c_str());

855

icu->ucModule->findSymbol(symbol, icu->usetOpen);

856

857

symbol.printf("ucol_close_%s_%s", majorVersion.c_str(), minorVersion.c_str());

858

icu->inModule->findSymbol(symbol, icu->ucolClose);

859

860

symbol.printf("ucol_getContractions_%s_%s", majorVersion.c_str(), minorVersion.c_str());

861

icu->inModule->findSymbol(symbol, icu->ucolGetContractions);

862

863

symbol.printf("ucol_getSortKey_%s_%s", majorVersion.c_str(), minorVersion.c_str());

864

icu->inModule->findSymbol(symbol, icu->ucolGetSortKey);

865

866

symbol.printf("ucol_open_%s_%s", majorVersion.c_str(), minorVersion.c_str());

867

icu->inModule->findSymbol(symbol, icu->ucolOpen);

868

869

symbol.printf("ucol_setAttribute_%s_%s", majorVersion.c_str(), minorVersion.c_str());

870

icu->inModule->findSymbol(symbol, icu->ucolSetAttribute);

871

872

symbol.printf("ucol_strcoll_%s_%s", majorVersion.c_str(), minorVersion.c_str());

873

icu->inModule->findSymbol(symbol, icu->ucolStrColl);

874

875

symbol.printf("ucol_getVersion_%s_%s", majorVersion.c_str(), minorVersion.c_str());

876

icu->inModule->findSymbol(symbol, icu->ucolGetVersion);

877

878

symbol.printf("utrans_open_%s_%s", majorVersion.c_str(), minorVersion.c_str());

879

icu->inModule->findSymbol(symbol, icu->utransOpen);

880

881

symbol.printf("utrans_close_%s_%s", majorVersion.c_str(), minorVersion.c_str());

882

icu->inModule->findSymbol(symbol, icu->utransClose);

883

884

symbol.printf("utrans_transUChars_%s_%s", majorVersion.c_str(), minorVersion.c_str());

885

icu->inModule->findSymbol(symbol, icu->utransTransUChars);

886

887

if (/*!icu->uInit ||*/ !icu->uVersionToString || !icu->ulocCountAvailable ||

888

!icu->ulocGetAvailable || !icu->usetClose || !icu->usetGetItem ||

889

!icu->usetGetItemCount || !icu->usetOpen || !icu->ucolClose ||

890

!icu->ucolGetContractions || !icu->ucolGetSortKey || !icu->ucolOpen ||

891

!icu->ucolSetAttribute || !icu->ucolStrColl || !icu->ucolGetVersion ||

892

!icu->utransOpen || !icu->utransClose || !icu->utransTransUChars)

893

{

894

delete icu;

895

continue;

896

}

897

898

UErrorCode status = U_ZERO_ERROR;

899

900

if (icu->uInit)

901

{

902

icu->uInit(&status);

903

if (status != U_ZERO_ERROR)

904

{

905

delete icu;

906

continue;

907

}

908

}

909

910

UCollator* collator = icu->ucolOpen("", &status);

911

if (!collator)

912

{

913

delete icu;

914

continue;

915

}

916

917

icu->ucolGetVersion(collator, icu->collVersion);

918

icu->ucolClose(collator);

919

920

// RWLock don't allow lock upgrade (read->write) so we

921

// release read and acquire a write lock.

922

readGuard.release();

923

WriteLockGuard writeGuard(icuModules->lock);

924

925

// In this small amount of time, one may already loaded the

926

// same version, so within the write lock we verify again.

927

ICU* icu2;

928

if (icuModules->modules().get(version, icu2))

929

{

930

delete icu;

931

return icu2;

932

}

933

934

icuModules->modules().put(version, icu);

935

return icu;

936

}

937

938

return NULL;

939

}

940

941

942

// Reports the collator version of the given ICU version in collVersion.
// Returns false when that ICU cannot be loaded. The version equal to
// COLL_30_VERSION is reported as an empty string.
bool UnicodeUtil::getCollVersion(const Firebird::string& icuVersion,
	const Firebird::string& configInfo, Firebird::string& collVersion)
{
	ICU* const icu = loadICU(icuVersion, configInfo);

	if (icu == NULL)
		return false;

	char buffer[U_MAX_VERSION_STRING_LENGTH];
	icu->uVersionToString(icu->collVersion, buffer);

	const bool isColl30 = (string(COLL_30_VERSION) == buffer);

	if (!isColl30)
	{
		collVersion = buffer;
		return true;
	}

	collVersion = "";
	return true;
}

960

961

// Builds a Utf16Collation for texttype `tt`.
//
// attributes         - TEXTTYPE_ATTR_* flags; only PAD_SPACE, CASE_INSENSITIVE
//                      and ACCENT_INSENSITIVE are accepted, and accent
//                      insensitivity requires case insensitivity
// specificAttributes - may contain LOCALE, COLL-VERSION and NUMERIC-SORT
//                      ("0" or "1"); any other entry is rejected
// configInfo         - passed on to loadICU()
//
// Returns NULL on any validation, conversion, ICU loading or collator
// opening failure. On success `tt` has its pad option, flags and (for the
// insensitive variants) canonical width set.
UnicodeUtil::Utf16Collation* UnicodeUtil::Utf16Collation::create(
	texttype* tt, USHORT attributes,
	Firebird::IntlUtil::SpecificAttributesMap& specificAttributes, const Firebird::string& configInfo)
{
	int recognized = 0;		// how many specific attributes were understood
	bool error;

	string locale;
	if (specificAttributes.get(IntlUtil::convertAsciiToUtf16("LOCALE"), locale))
		++recognized;

	string collVersion;
	if (specificAttributes.get(IntlUtil::convertAsciiToUtf16("COLL-VERSION"), collVersion))
	{
		++recognized;

		collVersion = IntlUtil::convertUtf16ToAscii(collVersion, &error);
		if (error)
			return NULL;
	}

	string numericSort;
	if (specificAttributes.get(IntlUtil::convertAsciiToUtf16("NUMERIC-SORT"), numericSort))
	{
		++recognized;

		numericSort = IntlUtil::convertUtf16ToAscii(numericSort, &error);
		if (error)
			return NULL;

		if (!(numericSort == "0" || numericSort == "1"))
			return NULL;
	}

	locale = IntlUtil::convertUtf16ToAscii(locale, &error);
	if (error)
		return NULL;

	const bool hasUnsupportedFlag = (attributes & ~(TEXTTYPE_ATTR_PAD_SPACE |
		TEXTTYPE_ATTR_CASE_INSENSITIVE | TEXTTYPE_ATTR_ACCENT_INSENSITIVE)) != 0;

	const bool accentInsensitiveOnly =
		(attributes & (TEXTTYPE_ATTR_CASE_INSENSITIVE | TEXTTYPE_ATTR_ACCENT_INSENSITIVE)) ==
			TEXTTYPE_ATTR_ACCENT_INSENSITIVE;

	if (hasUnsupportedFlag || accentInsensitiveOnly ||
		(specificAttributes.count() - recognized) != 0)
	{
		return NULL;
	}

	if (collVersion.isEmpty())
		collVersion = COLL_30_VERSION;

	tt->texttype_pad_option = (attributes & TEXTTYPE_ATTR_PAD_SPACE) != 0;

	ICU* const icu = loadICU(collVersion, locale, configInfo);
	if (icu == NULL)
		return NULL;

	UErrorCode status = U_ZERO_ERROR;

	// Three collators for the same locale: [0] compare, [1] partial, [2] sort.
	UCollator* opened[3] = {NULL, NULL, NULL};

	for (int n = 0; n < 3; ++n)
	{
		opened[n] = icu->ucolOpen(locale.c_str(), &status);

		if (!opened[n])
		{
			// Close the ones opened so far, in creation order.
			for (int k = 0; k < n; ++k)
				icu->ucolClose(opened[k]);

			return NULL;
		}
	}

	UCollator* const compareCollator = opened[0];
	UCollator* const partialCollator = opened[1];
	UCollator* const sortCollator = opened[2];

	icu->ucolSetAttribute(partialCollator, UCOL_STRENGTH, UCOL_PRIMARY, &status);

	const bool caseAndAccentInsensitive =
		(attributes & (TEXTTYPE_ATTR_CASE_INSENSITIVE | TEXTTYPE_ATTR_ACCENT_INSENSITIVE)) ==
			(TEXTTYPE_ATTR_CASE_INSENSITIVE | TEXTTYPE_ATTR_ACCENT_INSENSITIVE);

	if (caseAndAccentInsensitive || (attributes & TEXTTYPE_ATTR_CASE_INSENSITIVE))
	{
		const UColAttributeValue strength =
			caseAndAccentInsensitive ? UCOL_PRIMARY : UCOL_SECONDARY;

		icu->ucolSetAttribute(compareCollator, UCOL_STRENGTH, strength, &status);
		tt->texttype_flags |= TEXTTYPE_SEPARATE_UNIQUE;
		tt->texttype_canonical_width = 4;	// UTF-32
	}
	else
		tt->texttype_flags = TEXTTYPE_DIRECT_MATCH;

	const bool isNumericSort = numericSort == "1";

	if (isNumericSort)
	{
		for (int n = 0; n < 3; ++n)
			icu->ucolSetAttribute(opened[n], UCOL_NUMERIC_COLLATION, UCOL_ON, &status);
	}

	USet* contractions = icu->usetOpen(0, 0);
	// status not verified here.
	icu->ucolGetContractions(partialCollator, contractions, &status);

	Utf16Collation* const result = new Utf16Collation();
	result->icu = icu;
	result->tt = tt;
	result->attributes = attributes;
	result->compareCollator = compareCollator;
	result->partialCollator = partialCollator;
	result->sortCollator = sortCollator;
	result->contractions = contractions;
	result->contractionsCount = icu->usetGetItemCount(contractions);
	result->numericSort = isNumericSort;

	return result;
}

1078

1079

1080

// Releases the contraction set and the three collators. The ICU instance
// itself stays alive: it is not deleted here.
UnicodeUtil::Utf16Collation::~Utf16Collation()
{
	icu->usetClose(contractions);

	UCollator* const collators[] = {compareCollator, partialCollator, sortCollator};

	for (size_t n = 0; n < sizeof(collators) / sizeof(collators[0]); ++n)
		icu->ucolClose(collators[n]);

	// ASF: we should not "delete icu"
}

1090

1091

1092

// Minimum key buffer size that stringToKey() accepts for a source of `len`
// bytes: 6 key bytes for every 4 source bytes.
USHORT UnicodeUtil::Utf16Collation::keyLength(USHORT len) const
{
	const USHORT groups = len / 4;
	return groups * 6;
}

1096

1097

1098

// Builds an ICU sort key for a UTF-16 string.
//
// srcLen   - length of src in bytes (multiple of sizeof(USHORT))
// src      - UTF-16 input
// dstLen   - capacity of dst in bytes; must be at least keyLength(srcLen)
// dst      - receives the key
// key_type - INTL_KEY_PARTIAL, INTL_KEY_UNIQUE or INTL_KEY_SORT; selects the
//            partial, compare or sort collator respectively
//
// Returns the value of ucol_getSortKey(), 0 for an empty (or fully trimmed)
// source, or INTL_BAD_KEY_LENGTH for a short dst or an unknown key_type.
//
// Fixes in the contraction scan:
//  - uset_getItem() takes its capacity in UChars; sizeof(str) is a byte
//    count and told ICU the buffer was twice as large as it is;
//  - start/end are now real variables, because uset_getItem() stores through
//    them for range items;
//  - items that are not strings, or whose retrieval failed, are skipped
//    instead of being compared from an unfilled buffer.
USHORT UnicodeUtil::Utf16Collation::stringToKey(USHORT srcLen, const USHORT* src,
	USHORT dstLen, UCHAR* dst,
	USHORT key_type) const
{
	fb_assert(src != NULL && dst != NULL);
	fb_assert(srcLen % sizeof(*src) == 0);

	if (dstLen < keyLength(srcLen))
	{
		fb_assert(false);
		return INTL_BAD_KEY_LENGTH;
	}

	// From here on srcLen is counted in UTF-16 code units.
	srcLen /= sizeof(*src);

	if (tt->texttype_pad_option)
	{
		// Trailing spaces are not significant.
		while (srcLen > 0 && src[srcLen - 1] == 32)
			--srcLen;
	}

	const UCollator* coll = NULL;

	switch (key_type)
	{
		case INTL_KEY_PARTIAL:
		{
			coll = partialCollator;

			// Remove last bytes of key if they are start of a contraction
			// to correctly find in the index.
			for (int i = 0; i < contractionsCount; ++i)
			{
				UChar str[10];
				const int32_t strCapacity = sizeof(str) / sizeof(str[0]);
				UChar32 rangeStart = 0;
				UChar32 rangeEnd = 0;
				UErrorCode status = U_ZERO_ERROR;

				int len = icu->usetGetItem(contractions, i, &rangeStart, &rangeEnd,
					str, strCapacity, &status);

				if (U_FAILURE(status) || len <= 0)
					continue;	// range item, or string that does not fit in str

				if (len > srcLen)
					len = srcLen;
				else
					--len;

				// safe cast - alignment not changed
				if (u_strCompare(str, len, reinterpret_cast<const UChar*>(src) + srcLen - len, len, true) == 0)
				{
					srcLen -= len;
					break;
				}
			}

			if (numericSort)
			{
				// ASF: We need to remove trailing numbers to return sub key that
				// matches full key. Example: "abc1" becomes "abc" to match "abc10".
				while (srcLen > 0 && src[srcLen - 1] >= '0' && src[srcLen - 1] <= '9')
					--srcLen;
			}

			break;
		}

		case INTL_KEY_UNIQUE:
			coll = compareCollator;
			break;

		case INTL_KEY_SORT:
			coll = sortCollator;
			break;

		default:
			fb_assert(false);
			return INTL_BAD_KEY_LENGTH;
	}

	if (srcLen == 0)
		return 0;

	return icu->ucolGetSortKey(coll,
		reinterpret_cast<const UChar*>(src), srcLen, dst, dstLen);
}

1192

1193

1194

// Compares two UTF-16 strings with the compare collator.
//
// len1/len2  - lengths of str1/str2 in bytes (multiples of sizeof(USHORT)).
// error_flag - always reset to false by this function.
//
// When the text type has the pad option set, trailing spaces (U+0020) of both
// operands are ignored. Returns the ucolStrColl result converted to SSHORT.
SSHORT UnicodeUtil::Utf16Collation::compare(ULONG len1, const USHORT* str1,
											ULONG len2, const USHORT* str2,
											INTL_BOOL* error_flag) const
{
	fb_assert(len1 % sizeof(*str1) == 0 && len2 % sizeof(*str2) == 0);
	fb_assert(str1 != NULL && str2 != NULL);
	fb_assert(error_flag != NULL);

	*error_flag = false;

	// Work in UTF-16 code units rather than bytes.
	ULONG units1 = len1 / sizeof(*str1);
	ULONG units2 = len2 / sizeof(*str2);

	if (tt->texttype_pad_option)
	{
		while (units1 > 0 && str1[units1 - 1] == 32)
			--units1;

		while (units2 > 0 && str2[units2 - 1] == 32)
			--units2;
	}

	// safe casts - alignment not changed
	const UChar* const left = reinterpret_cast<const UChar*>(str1);
	const UChar* const right = reinterpret_cast<const UChar*>(str2);

	return static_cast<SSHORT>(icu->ucolStrColl(compareCollator, left, units1, right, units2));
}

1233

1234

1235

// Converts a UTF-16 string to its canonical UTF-32 form.
//
// srcLen     - length of src in bytes (a multiple of sizeof(USHORT)).
// src        - UTF-16 code units.
// dstLen     - size of dst in bytes.
// dst        - receives the UTF-32 output.
// exceptions - forwarded to utf16UpperCase for case-insensitive collations.
//
// Case- and accent-insensitive collations first run the text through the ICU
// transliterator "Any-Upper; NFD; [:Nonspacing Mark:] Remove; NFC";
// collations that are only case-insensitive are upper-cased with
// utf16UpperCase; anything else is converted as is.
//
// Returns the utf16ToUtf32 result divided by sizeof(ULONG), or
// INTL_BAD_STR_LENGTH when the transliterator cannot be opened.
ULONG UnicodeUtil::Utf16Collation::canonical(ULONG srcLen, const USHORT* src, ULONG dstLen, ULONG* dst,
	const ULONG* exceptions)
{
	HalfStaticArray<USHORT, BUFFER_SMALL / 2> upperStr;

	if ((attributes & (TEXTTYPE_ATTR_CASE_INSENSITIVE | TEXTTYPE_ATTR_ACCENT_INSENSITIVE)) ==
			(TEXTTYPE_ATTR_CASE_INSENSITIVE | TEXTTYPE_ATTR_ACCENT_INSENSITIVE))
	{
		fb_assert(srcLen % sizeof(*src) == 0);

		// The transliteration happens in place and may lengthen the text, so
		// the work buffer must really be as large as the capacity handed to
		// ICU. A result is only accepted below when its byte length does not
		// exceed dstLen, hence dstLen / sizeof(USHORT) code units is the
		// largest useful size; the buffer must also hold the source itself.
		const ULONG srcUnits = srcLen / sizeof(USHORT);
		const ULONG dstUnits = dstLen / sizeof(USHORT);
		const ULONG bufferUnits = (srcUnits > dstUnits) ? srcUnits : dstUnits;

		memcpy(upperStr.getBuffer(bufferUnits), src, srcLen);

		UErrorCode errorCode = U_ZERO_ERROR;
		UTransliterator* trans = icu->utransOpen("Any-Upper; NFD; [:Nonspacing Mark:] Remove; NFC",
			UTRANS_FORWARD, NULL, 0, NULL, &errorCode);

		// Zero is success and negative codes are ICU warnings.
		if (errorCode <= 0)
		{
			// Capacity in code units; equal to the size of upperStr.
			const int32_t capacity = bufferUnits;
			int32_t len = srcUnits;
			int32_t limit = len;

			icu->utransTransUChars(trans, reinterpret_cast<UChar*>(upperStr.begin()),
				&len, capacity, 0, &limit, &errorCode);
			icu->utransClose(trans);

			// Back to a byte length.
			len *= sizeof(USHORT);

			// NOTE(review): an over-long result stores the error marker as the
			// source length for the conversion below instead of returning it;
			// kept as is to preserve what callers currently observe - confirm
			// whether an early return is wanted.
			if (ULONG(len) > dstLen)
				len = INTL_BAD_STR_LENGTH;

			srcLen = len;
			src = upperStr.begin();
		}
		else
			return INTL_BAD_STR_LENGTH;
	}
	else if (attributes & TEXTTYPE_ATTR_CASE_INSENSITIVE)
	{
		srcLen = utf16UpperCase(srcLen, src,
			srcLen, upperStr.getBuffer(srcLen / sizeof(USHORT)), exceptions);
		src = upperStr.begin();
	}

	// convert UTF-16 to UTF-32
	USHORT errCode;
	ULONG errPosition;
	return utf16ToUtf32(srcLen, src, dstLen, dst, &errCode, &errPosition) / sizeof(ULONG);
}

1283

1284

1285

// Looks for an ICU library suitable for a collation.
//
// Every version produced by getVersions(configInfo) is tried in order. A
// library is accepted when it loads, when - if locale is not empty - locale
// is one of its available locales, and when its collation version string
// equals collVersion. Returns the first accepted library, or NULL if no
// candidate qualifies.
UnicodeUtil::ICU* UnicodeUtil::Utf16Collation::loadICU(
	const Firebird::string& collVersion, const Firebird::string& locale,
	const Firebird::string& configInfo)
{
	ObjectsArray<string> candidates;
	getVersions(configInfo, candidates);

	ObjectsArray<string>::const_iterator candidate(candidates.begin());

	for (; candidate != candidates.end(); ++candidate)
	{
		ICU* const library = UnicodeUtil::loadICU(*candidate, configInfo);

		if (!library)
			continue;

		if (locale.hasData())
		{
			// Scan the available locales from the last one down to the first.
			bool localeFound = false;

			for (int pos = library->ulocCountAvailable() - 1; pos >= 0; --pos)
			{
				if (locale == library->ulocGetAvailable(pos))
				{
					localeFound = true;
					break;
				}
			}

			if (!localeFound)
				continue;
		}

		char libraryCollVersion[U_MAX_VERSION_STRING_LENGTH];
		library->uVersionToString(library->collVersion, libraryCollVersion);

		if (collVersion == libraryCollVersion)
			return library;
	}

	return NULL;
}

1323

1324

1325

} // namespace Jrd