~ubuntu-branches/ubuntu/gutsy/icu/gutsy

UnicodeString* CanonicalIterator::getEquivalents(UnicodeString segment, int32_t &result_len, UErrorCode status) { //private String[] getEquivalents(String segment)

303

UnicodeString* CanonicalIterator::getEquivalents(const UnicodeString &segment, int32_t &result_len, UErrorCode &status) {

304

//private String[] getEquivalents(String segment)

305

369

306

Hashtable *result = new Hashtable(FALSE, status);

370

Hashtable *basic = getEquivalents2(segment, status);

371

307

if (U_SUCCESS(status)) {

308

result->setValueDeleter(uhash_deleteUnicodeString);

309

}

310

UChar USeg[256];

311

int32_t segLen = segment.extract(USeg, 256, status);

312

Hashtable *basic = getEquivalents2(USeg, segLen, status);

313

//Hashtable *basic = getEquivalents2(segment, segLen, status);

314

372

315

// now get all the permutations

373

316

// add only the ones that are canonically equivalent

374

317

// TODO: optimize by not permuting any class zero.

318

319

Hashtable *permutations = new Hashtable(FALSE, status);

320

if (U_SUCCESS(status)) {

321

permutations->setValueDeleter(uhash_deleteUnicodeString);

322

}

323

375

324

const UHashElement *ne = NULL;

376

325

int32_t el = -1;

377

326

//Iterator it = basic.iterator();

378

327

ne = basic->nextElement(el);

379

//while (it.hasNext())

328

//while (it.hasNext())

380

329

while (ne != NULL) {

381

330

//String item = (String) it.next();

382

331

UnicodeString item = *((UnicodeString *)(ne->value.pointer));

383

Hashtable *permutations = permute(item, status);

332

333

permutations->removeAll();

334

permute(item, SKIP_ZEROES, permutations, status);

384

335

const UHashElement *ne2 = NULL;

385

336

int32_t el2 = -1;

386

337

//Iterator it2 = permutations.iterator();

387

338

ne2 = permutations->nextElement(el2);

388

//while (it2.hasNext())

339

//while (it2.hasNext())

389

340

while (ne2 != NULL) {

390

341

//String possible = (String) it2.next();

391

UnicodeString *possible = new UnicodeString(*((UnicodeString *)(ne2->value.pointer)));

342

//UnicodeString *possible = new UnicodeString(*((UnicodeString *)(ne2->value.pointer)));

343

UnicodeString possible(*((UnicodeString *)(ne2->value.pointer)));

392

344

UnicodeString attempt;

393

Normalizer::normalize(*possible, UNORM_NFD, 0, attempt, status);

345

Normalizer::normalize(possible, UNORM_NFD, 0, attempt, status);

394

346

395

347

// TODO: check if operator == is semantically the same as attempt.equals(segment)

396

348

if (attempt==segment) {

397

349

//if (PROGRESS) printf("Adding Permutation: %s\n", UToS(Tr(*possible)));

398

350

// TODO: use the hashtable just to catch duplicates - store strings directly (somehow).

399

result->put(*possible, possible, status); //add(possible);

351

result->put(possible, new UnicodeString(possible), status); //add(possible);

400

352

} else {

401

353

//if (PROGRESS) printf("-Skipping Permutation: %s\n", UToS(Tr(*possible)));

402

354

}

403

355

404

356

ne2 = permutations->nextElement(el2);

405

357

}

406

delete permutations;

407

358

ne = basic->nextElement(el);

408

359

}

409

360

410

361

// convert into a String[] to clean up storage

411

362

//String[] finalResult = new String[result.size()];

412

363

UnicodeString *finalResult = new UnicodeString[result->count()];

421

372

}

422

373

423

374

375

delete permutations;

376

delete basic;

424

377

delete result;

425

378

return finalResult;

426

379

}

427

380

428

Hashtable *CanonicalIterator::getEquivalents2(UnicodeString segment, UErrorCode status) {

429

//Set result = new TreeSet();

381

Hashtable *CanonicalIterator::getEquivalents2(const UChar *segment, int32_t segLen, UErrorCode &status) {

382

//Hashtable *CanonicalIterator::getEquivalents2(const UnicodeString &segment, int32_t segLen, UErrorCode &status) {

383

430

384

Hashtable *result = new Hashtable(FALSE, status);

431

result->setValueDeleter(uhash_deleteUnicodeString);

385

if (U_SUCCESS(status)) {

386

result->setValueDeleter(uhash_deleteUnicodeString);

387

}

432

388

433

389

//if (PROGRESS) printf("Adding: %s\n", UToS(Tr(segment)));

434

390

435

//result.add(segment);

436

result->put(segment, new UnicodeString(segment), status);

437

438

//StringBuffer workingBuffer = new StringBuffer();

439

UnicodeString workingBuffer;

440

441

391

UnicodeString toPut(segment, segLen);

392

393

result->put(toPut, new UnicodeString(toPut), status);

394

395

USerializedSet starts;

396

442

397

// cycle through all the characters

443

UChar32 cp;

444

int32_t i = 0, j = 0;

445

for (i = 0; i < segment.length(); i += UTF16_CHAR_LENGTH(cp)) {

398

UChar32 cp, limit = 0;

399

int32_t i = 0, j;

400

for (i = 0; i < segLen; i += UTF16_CHAR_LENGTH(cp)) {

446

401

// see if any character is at the start of some decomposition

447

cp = segment.char32At(i);

448

UnicodeSet *starts = (UnicodeSet *)AT_START->get(cp);

449

if (starts == NULL) continue;

450

//UnicodeSetIterator usi = new UnicodeSetIterator(starts);

451

int32_t setSize = starts->size();

402

UTF_GET_CHAR(segment, 0, i, segLen, cp);

403

if (!unorm_getCanonStartSet(cp, &starts)) {

404

continue;

405

}

452

406

// if so, see which decompositions match

453

//while (TRUE) {

454

for(j = 0; j < setSize; j++) {

455

//UChar32 cp2 = usi.next();

456

UChar32 cp2 = starts->charAt(j);

457

//if (cp2 < 0) break; // done

458

const Hashtable *remainder = extract(cp2, segment, i, workingBuffer, status);

407

for(j = 0, cp = limit; cp < limit || uset_getSerializedRange(&starts, j++, &cp, &limit); ++cp) {

408

//Hashtable *remainder = extract(cp, segment, segLen, i, status);

409

Hashtable *remainder = extract(cp, segment, segLen, i, status);

459

410

if (remainder == NULL) continue;

460

411

461

412

// there were some matches, so add all the possibilities to the set.

462

//UnicodeString prefix = segment.substring(0, i) + UTF16.valueOf(cp2);

463

UnicodeString *prefix = new UnicodeString;

464

segment.extract(0, i, *prefix);

465

*prefix += cp2;

413

UnicodeString prefix(segment, i);

414

prefix += cp;

466

415

467

416

const UHashElement *ne = NULL;

468

417

int32_t el = -1;

469

//Iterator it = remainder.iterator();

470

418

ne = remainder->nextElement(el);

471

419

while (ne != NULL) {

472

//String item = (String) it.next();

473

420

UnicodeString item = *((UnicodeString *)(ne->value.pointer));

474

//result.add(prefix + item);

475

*prefix += item;

476

result->put(*prefix, prefix, status);

421

UnicodeString *toAdd = new UnicodeString(prefix);

422

*toAdd += item;

423

result->put(*toAdd, toAdd, status);

477

424

478

//if (PROGRESS) printf("Adding: %s\n", UToS(Tr(*prefix)));

425

//if (PROGRESS) printf("Adding: %s\n", UToS(Tr(*toAdd)));

479

426

480

427

ne = remainder->nextElement(el);

481

428

}

491

438

* (with canonical rearrangement!)

492

439

* If so, take the remainder, and return the equivalents

493

440

494

const Hashtable *CanonicalIterator::extract(UChar32 comp, UnicodeString segment, int32_t segmentPos, UnicodeString buffer, UErrorCode status) {

441

Hashtable *CanonicalIterator::extract(UChar32 comp, const UChar *segment, int32_t segLen, int32_t segmentPos, UErrorCode &status) {

442

//Hashtable *CanonicalIterator::extract(UChar32 comp, const UnicodeString &segment, int32_t segLen, int32_t segmentPos, UErrorCode &status) {

495

443

//if (PROGRESS) printf(" extract: %s, ", UToS(Tr(UnicodeString(comp))));

496

444

//if (PROGRESS) printf("%s, %i\n", UToS(Tr(segment)), segmentPos);

497

445

498

//String decomp = Normalizer.normalize(UTF16.valueOf(comp), Normalizer.DECOMP, 0);

499

UnicodeString decomp;

500

Normalizer::normalize(comp, UNORM_NFD, 0, decomp, status);

501

446

const int32_t bufSize = 256;

447

int32_t bufLen = 0;

448

UChar temp[bufSize];

449

450

const int32_t decompSize = 64;

451

int32_t inputLen = 0;

452

UChar decomp[decompSize];

453

454

UTF_APPEND_CHAR(temp, inputLen, bufSize, comp);

455

int32_t decompLen = unorm_getDecomposition(comp, FALSE, decomp, decompSize);

456

if(decompLen < 0) {

457

decompLen = -decompLen;

458

}

459

460

UChar *buff = temp+inputLen;

461

502

462

// See if it matches the start of segment (at segmentPos)

503

463

UBool ok = FALSE;

504

464

UChar32 cp;

505

465

int32_t decompPos = 0;

506

UChar32 decompCp = decomp.char32At(0);

507

decompPos += UTF16_CHAR_LENGTH(decompCp); // adjust position to skip first char

508

//int decompClass = getClass(decompCp);

509

buffer.truncate(0); // initialize working buffer, shared among callees

510

466

UChar32 decompCp;

467

UTF_NEXT_CHAR(decomp, decompPos, decompLen, decompCp);

468

511

469

int32_t i = 0;

512

for (i = segmentPos; i < segment.length(); i += UTF16_CHAR_LENGTH(cp)) {

513

cp = segment.char32At(i);

470

i = segmentPos;

471

while(i < segLen) {

472

UTF_NEXT_CHAR(segment, i, segLen, cp);

473

514

474

if (cp == decompCp) { // if equal, eat another cp from decomp

515

475

516

476

//if (PROGRESS) printf(" matches: %s\n", UToS(Tr(UnicodeString(cp))));

517

477

518

if (decompPos == decomp.length()) { // done, have all decomp characters!

519

//buffer.append(segment.substring(i + UTF16.getCharCount(cp))); // add remaining segment chars

520

buffer.append(segment, i+UTF16_CHAR_LENGTH(cp), segment.length()-i-UTF16_CHAR_LENGTH(cp));

478

if (decompPos == decompLen) { // done, have all decomp characters!

479

//u_strcat(buff+bufLen, segment+i);

480

memcpy(buff+bufLen, segment+i, (segLen-i)*sizeof(UChar));

481

bufLen+=segLen-i;

482

521

483

ok = TRUE;

522

484

break;

523

485

}

524

decompCp = decomp.char32At(decompPos);

525

decompPos += UTF16_CHAR_LENGTH(decompCp);

526

//decompClass = getClass(decompCp);

486

UTF_NEXT_CHAR(decomp, decompPos, decompLen, decompCp);

527

487

} else {

528

488

//if (PROGRESS) printf(" buffer: %s\n", UToS(Tr(UnicodeString(cp))));

529

489

530

490

// brute force approach

531

491

532

533

//UTF16.append(buffer, cp);

534

buffer.append(cp);

492

UTF_APPEND_CHAR(buff, bufLen, bufSize, cp);

535

493

536

494

/* TODO: optimize

537

495

// since we know that the classes are monotonically increasing, after zero

540

498

// there are only a few cases that work: zero, less, same, greater

541

499

// if both classes are the same, we fail

542

500

// if the decomp class < the segment class, we fail

543

501

544

502

segClass = getClass(cp);

545

503

if (decompClass <= segClass) return null;

546

504

550

508

551

509

//if (PROGRESS) printf("Matches\n");

552

510

553

if (buffer.length() == 0) {

511

if (bufLen == 0) {

554

512

Hashtable *result = new Hashtable(FALSE, status);

555

513

result->setValueDeleter(uhash_deleteUnicodeString);

556

514

result->put("", new UnicodeString(""), status);

557

515

return result; // succeed, but no remainder

558

516

}

559

517

560

//String remainder = buffer.toString();

561

UnicodeString remainder = buffer;

562

563

518

// brute force approach

564

519

// check to make sure result is canonically equivalent

565

//String trial = Normalizer.normalize(UTF16.valueOf(comp) + remainder, Normalizer.DECOMP, 0);

566

UnicodeString trial;

567

UnicodeString temp = remainder;

568

temp.insert(0, comp);

569

Normalizer::normalize(temp, UNORM_NFD, 0, trial, status);

570

571

//if (!segment.regionMatches(segmentPos, trial, 0, segment.length() - segmentPos)) return null;

572

if (segment.indexOf(trial, 0, segment.length() - segmentPos, segmentPos, segment.length() - segmentPos)==-1) {

520

int32_t tempLen = inputLen + bufLen;

521

522

UChar trial[bufSize];

523

unorm_decompose(trial, bufSize, temp, tempLen, FALSE, FALSE, &status);

524

525

if(uprv_memcmp(segment+segmentPos, trial, (segLen - segmentPos)*sizeof(UChar)) != 0) {

573

526

return NULL;

574

527

}

575

576

// get the remaining combinations

577

return getEquivalents2(remainder, status);

528

529

return getEquivalents2(buff, bufLen, status);

578

530

}

579

531

580

532

Older »