189
203
// on the NFD form - see above).
190
204
for (; i < source.length(); i += UTF16_CHAR_LENGTH(cp)) {
191
205
cp = source.char32At(i);
192
if (SAFE_START->contains(cp)) {
193
source.extract(start, i, list[list_length++]); // add up to i
206
if (unorm_isCanonSafeStart(cp)) {
207
source.extract(start, i-start, list[list_length++]); // add up to i
197
source.extract(start, i, list[list_length++]); // add last one
211
source.extract(start, i-start, list[list_length++]); // add last one
200
214
// allocate the arrays, and find the strings that are CE to each segment
201
215
pieces = new UnicodeString*[list_length];
202
216
pieces_length = list_length;
203
pieces_lengths = new int32_t[list_length];
217
pieces_lengths = (int32_t*)uprv_malloc(list_length * sizeof(int32_t));
205
current = new int32_t[list_length];
206
219
current_length = list_length;
220
current = (int32_t*)uprv_malloc(list_length * sizeof(int32_t));
207
221
for (i = 0; i < current_length; i++) {
221
* Dumb recursive implementation of permutation.
235
* Dumb recursive implementation of permutation.
223
237
* @param source the string to find permutations for
224
238
* @return the results in a set.
226
Hashtable *CanonicalIterator::permute(UnicodeString &source, UErrorCode status) {
240
void CanonicalIterator::permute(UnicodeString &source, UBool skipZeros, Hashtable *result, UErrorCode &status) {
241
if(U_FAILURE(status)) {
227
244
//if (PROGRESS) printf("Permute: %s\n", UToS(Tr(source)));
230
Hashtable *result = new Hashtable(FALSE, status);
231
result->setValueDeleter(uhash_deleteUnicodeString);
234
248
// if zero or one character, just return a set with it
235
249
// we check for length < 2 to keep from counting code points all the time
236
//if (source.length() <= 2 && UTF16_CHAR_LENGTH(source.char32At(0)) <= 1) {
237
if (source.length() < 2 || (source.length() == 2 && UTF16_CHAR_LENGTH(source.char32At(0)) > 1)) {
250
if (source.length() <= 2 && source.countChar32() <= 1) {
238
251
UnicodeString *toPut = new UnicodeString(source);
239
result->put(source, toPut, status);
252
result->put(source, toPut, status);
243
256
// otherwise iterate through the string, and recursively permute all the other characters
258
Hashtable *subpermute = new Hashtable(FALSE, status);
259
if (U_SUCCESS(status)) {
260
subpermute->setValueDeleter(uhash_deleteUnicodeString);
245
263
for (i = 0; i < source.length(); i += UTF16_CHAR_LENGTH(cp)) {
246
264
cp = source.char32At(i);
247
265
const UHashElement *ne = NULL;
249
267
UnicodeString subPermuteString = source;
270
// if the character is canonical combining class zero,
272
if (skipZeros && i != 0 && u_getCombiningClass(cp) == 0) {
273
//System.out.println("Skipping " + Utility.hex(UTF16.valueOf(source, i)));
277
subpermute->removeAll();
251
279
// see what the permutations of the characters before and after this one are
252
280
//Hashtable *subpermute = permute(source.substring(0,i) + source.substring(i + UTF16.getCharCount(cp)));
253
Hashtable *subpermute = permute(subPermuteString.replace(i, UTF16_CHAR_LENGTH(cp), NULL, 0), status);
281
permute(subPermuteString.replace(i, UTF16_CHAR_LENGTH(cp), NULL, 0), skipZeros, subpermute, status);
254
282
// The upper replace is destructive. The question is do we have to make a copy, or we don't care about the contents
255
283
// of source at this point.
257
285
// prefix this character to all of them
258
286
ne = subpermute->nextElement(el);
259
287
while (ne != NULL) {
264
292
result->put(*chStr, chStr, status);
265
293
ne = subpermute->nextElement(el);
272
static UBool U_CALLCONV
273
_enumCategoryRangeSAFE_STARTsetup(const void *context, UChar32 start, UChar32 limit, UCharCategory type) {
275
// TODO: use a switch that will automatically add all the unassigned, lead surrogates, tail surrogates and privates
276
//fprintf(stdout, "SAFE_START:%08X - %08X, %i\n", start, limit, type);
278
for(; start < limit; start++) {
279
cc = u_getCombiningClass(start);
281
int32_t lowerLimit = start;
282
while(cc == 0 && start <= limit) {
283
cc = u_getCombiningClass(++start);
285
SAFE_START->add(lowerLimit, start-1);
289
SAFE_START->add(start, limit-1);
294
static UBool U_CALLCONV
295
_enumCategoryRangeAT_STARTsetup(const void *context, UChar32 start, UChar32 limit, UCharCategory type) {
296
UErrorCode status = *(UErrorCode *)context;
298
//fprintf(stdout, "AT_START:%08X - %08X, %i\n", start, limit, type);
301
for(cp = start; cp < limit; cp++) {
302
UnicodeString istr(cp);
303
UnicodeString decomp;
304
Normalizer::normalize(istr, UNORM_NFD, 0, decomp, status);
305
if (decomp==istr) continue;
307
// add each character in the decomposition to canBeIn
308
UChar32 component = 0;
310
for (i = 0; i < decomp.length(); i += UTF16_CHAR_LENGTH(component)) {
311
component = decomp.char32At(i);
313
UnicodeSet *isIn = (UnicodeSet *)AT_START->get(component);
315
isIn = new UnicodeSet();
318
AT_START->put(component, isIn, status);
319
} else if (u_getCombiningClass(component) == 0) {
320
SAFE_START->remove(component);
328
void CanonicalIterator::initStaticData(UErrorCode status) {
329
if(SAFE_START == NULL && AT_START == NULL) {
330
SAFE_START = new UnicodeSet();
331
// TODO: have value deleter for UnicodeSets
332
AT_START = new Hashtable(FALSE, status);
335
//if (PROGRESS) printf("Getting Safe Start");
337
// TODO: use u_enumCharType() instead
338
// the fastest with current, public apis is to
339
// enumerate with u_enumCharType() for all categories !=0 and then
340
// getCombiningClass(start..limit-1) that cuts it down by a factor of about 11...
341
u_enumCharTypes(_enumCategoryRangeSAFE_STARTsetup, 0);
343
//if (PROGRESS) printf("Getting Containment\n");
344
u_enumCharTypes(_enumCategoryRangeAT_STARTsetup, &status);
349
*@return the set of "safe starts", characters that are class zero AND are never non-initial in a decomposition.
351
UnicodeSet *CanonicalIterator::getSafeStart(UErrorCode status) {
352
initStaticData(status);
357
*@return the set of characters whose decompositions start with the given character
359
UnicodeSet *CanonicalIterator::getStarts(UChar32 cp, UErrorCode status) {
360
initStaticData(status);
361
UnicodeSet *result = (UnicodeSet *)AT_START->get(cp);
367
302
// we have a segment, in NFD. Find all the strings that are canonically equivalent to it.
368
UnicodeString* CanonicalIterator::getEquivalents(UnicodeString segment, int32_t &result_len, UErrorCode status) { //private String[] getEquivalents(String segment)
303
UnicodeString* CanonicalIterator::getEquivalents(const UnicodeString &segment, int32_t &result_len, UErrorCode &status) {
304
//private String[] getEquivalents(String segment)
369
306
Hashtable *result = new Hashtable(FALSE, status);
370
Hashtable *basic = getEquivalents2(segment, status);
307
if (U_SUCCESS(status)) {
308
result->setValueDeleter(uhash_deleteUnicodeString);
311
int32_t segLen = segment.extract(USeg, 256, status);
312
Hashtable *basic = getEquivalents2(USeg, segLen, status);
313
//Hashtable *basic = getEquivalents2(segment, segLen, status);
372
315
// now get all the permutations
373
316
// add only the ones that are canonically equivalent
374
317
// TODO: optimize by not permuting any class zero.
319
Hashtable *permutations = new Hashtable(FALSE, status);
320
if (U_SUCCESS(status)) {
321
permutations->setValueDeleter(uhash_deleteUnicodeString);
375
324
const UHashElement *ne = NULL;
377
326
//Iterator it = basic.iterator();
378
327
ne = basic->nextElement(el);
379
//while (it.hasNext())
328
//while (it.hasNext())
380
329
while (ne != NULL) {
381
330
//String item = (String) it.next();
382
331
UnicodeString item = *((UnicodeString *)(ne->value.pointer));
383
Hashtable *permutations = permute(item, status);
333
permutations->removeAll();
334
permute(item, SKIP_ZEROES, permutations, status);
384
335
const UHashElement *ne2 = NULL;
385
336
int32_t el2 = -1;
386
337
//Iterator it2 = permutations.iterator();
387
338
ne2 = permutations->nextElement(el2);
388
//while (it2.hasNext())
339
//while (it2.hasNext())
389
340
while (ne2 != NULL) {
390
341
//String possible = (String) it2.next();
391
UnicodeString *possible = new UnicodeString(*((UnicodeString *)(ne2->value.pointer)));
342
//UnicodeString *possible = new UnicodeString(*((UnicodeString *)(ne2->value.pointer)));
343
UnicodeString possible(*((UnicodeString *)(ne2->value.pointer)));
392
344
UnicodeString attempt;
393
Normalizer::normalize(*possible, UNORM_NFD, 0, attempt, status);
345
Normalizer::normalize(possible, UNORM_NFD, 0, attempt, status);
395
347
// TODO: check if operator == is semanticaly the same as attempt.equals(segment)
396
348
if (attempt==segment) {
397
349
//if (PROGRESS) printf("Adding Permutation: %s\n", UToS(Tr(*possible)));
398
350
// TODO: use the hashtable just to catch duplicates - store strings directly (somehow).
399
result->put(*possible, possible, status); //add(possible);
351
result->put(possible, new UnicodeString(possible), status); //add(possible);
401
353
//if (PROGRESS) printf("-Skipping Permutation: %s\n", UToS(Tr(*possible)));
404
356
ne2 = permutations->nextElement(el2);
407
358
ne = basic->nextElement(el);
410
361
// convert into a String[] to clean up storage
411
362
//String[] finalResult = new String[result.size()];
412
363
UnicodeString *finalResult = new UnicodeString[result->count()];
425
378
return finalResult;
428
Hashtable *CanonicalIterator::getEquivalents2(UnicodeString segment, UErrorCode status) {
429
//Set result = new TreeSet();
381
Hashtable *CanonicalIterator::getEquivalents2(const UChar *segment, int32_t segLen, UErrorCode &status) {
382
//Hashtable *CanonicalIterator::getEquivalents2(const UnicodeString &segment, int32_t segLen, UErrorCode &status) {
430
384
Hashtable *result = new Hashtable(FALSE, status);
431
result->setValueDeleter(uhash_deleteUnicodeString);
385
if (U_SUCCESS(status)) {
386
result->setValueDeleter(uhash_deleteUnicodeString);
433
389
//if (PROGRESS) printf("Adding: %s\n", UToS(Tr(segment)));
435
//result.add(segment);
436
result->put(segment, new UnicodeString(segment), status);
438
//StringBuffer workingBuffer = new StringBuffer();
439
UnicodeString workingBuffer;
391
UnicodeString toPut(segment, segLen);
393
result->put(toPut, new UnicodeString(toPut), status);
395
USerializedSet starts;
442
397
// cycle through all the characters
444
int32_t i = 0, j = 0;
445
for (i = 0; i < segment.length(); i += UTF16_CHAR_LENGTH(cp)) {
398
UChar32 cp, limit = 0;
400
for (i = 0; i < segLen; i += UTF16_CHAR_LENGTH(cp)) {
446
401
// see if any character is at the start of some decomposition
447
cp = segment.char32At(i);
448
UnicodeSet *starts = (UnicodeSet *)AT_START->get(cp);
449
if (starts == NULL) continue;
450
//UnicodeSetIterator usi = new UnicodeSetIterator(starts);
451
int32_t setSize = starts->size();
402
UTF_GET_CHAR(segment, 0, i, segLen, cp);
403
if (!unorm_getCanonStartSet(cp, &starts)) {
452
406
// if so, see which decompositions match
454
for(j = 0; j < setSize; j++) {
455
//UChar32 cp2 = usi.next();
456
UChar32 cp2 = starts->charAt(j);
457
//if (cp2 < 0) break; // done
458
const Hashtable *remainder = extract(cp2, segment, i, workingBuffer, status);
407
for(j = 0, cp = limit; cp < limit || uset_getSerializedRange(&starts, j++, &cp, &limit); ++cp) {
408
//Hashtable *remainder = extract(cp, segment, segLen, i, status);
409
Hashtable *remainder = extract(cp, segment, segLen, i, status);
459
410
if (remainder == NULL) continue;
461
412
// there were some matches, so add all the possibilities to the set.
462
//UnicodeString prefix = segment.substring(0, i) + UTF16.valueOf(cp2);
463
UnicodeString *prefix = new UnicodeString;
464
segment.extract(0, i, *prefix);
413
UnicodeString prefix(segment, i);
467
416
const UHashElement *ne = NULL;
469
//Iterator it = remainder.iterator();
470
418
ne = remainder->nextElement(el);
471
419
while (ne != NULL) {
472
//String item = (String) it.next();
473
420
UnicodeString item = *((UnicodeString *)(ne->value.pointer));
474
//result.add(prefix + item);
476
result->put(*prefix, prefix, status);
421
UnicodeString *toAdd = new UnicodeString(prefix);
423
result->put(*toAdd, toAdd, status);
478
//if (PROGRESS) printf("Adding: %s\n", UToS(Tr(*prefix)));
425
//if (PROGRESS) printf("Adding: %s\n", UToS(Tr(*toAdd)));
480
427
ne = remainder->nextElement(el);
491
438
* (with canonical rearrangment!)
492
439
* If so, take the remainder, and return the equivalents
494
const Hashtable *CanonicalIterator::extract(UChar32 comp, UnicodeString segment, int32_t segmentPos, UnicodeString buffer, UErrorCode status) {
441
Hashtable *CanonicalIterator::extract(UChar32 comp, const UChar *segment, int32_t segLen, int32_t segmentPos, UErrorCode &status) {
442
//Hashtable *CanonicalIterator::extract(UChar32 comp, const UnicodeString &segment, int32_t segLen, int32_t segmentPos, UErrorCode &status) {
495
443
//if (PROGRESS) printf(" extract: %s, ", UToS(Tr(UnicodeString(comp))));
496
444
//if (PROGRESS) printf("%s, %i\n", UToS(Tr(segment)), segmentPos);
498
//String decomp = Normalizer.normalize(UTF16.valueOf(comp), Normalizer.DECOMP, 0);
499
UnicodeString decomp;
500
Normalizer::normalize(comp, UNORM_NFD, 0, decomp, status);
446
const int32_t bufSize = 256;
450
const int32_t decompSize = 64;
451
int32_t inputLen = 0;
452
UChar decomp[decompSize];
454
UTF_APPEND_CHAR(temp, inputLen, bufSize, comp);
455
int32_t decompLen = unorm_getDecomposition(comp, FALSE, decomp, decompSize);
457
decompLen = -decompLen;
460
UChar *buff = temp+inputLen;
502
462
// See if it matches the start of segment (at segmentPos)
503
463
UBool ok = FALSE;
505
465
int32_t decompPos = 0;
506
UChar32 decompCp = decomp.char32At(0);
507
decompPos += UTF16_CHAR_LENGTH(decompCp); // adjust position to skip first char
508
//int decompClass = getClass(decompCp);
509
buffer.truncate(0); // initialize working buffer, shared among callees
467
UTF_NEXT_CHAR(decomp, decompPos, decompLen, decompCp);
512
for (i = segmentPos; i < segment.length(); i += UTF16_CHAR_LENGTH(cp)) {
513
cp = segment.char32At(i);
472
UTF_NEXT_CHAR(segment, i, segLen, cp);
514
474
if (cp == decompCp) { // if equal, eat another cp from decomp
516
476
//if (PROGRESS) printf(" matches: %s\n", UToS(Tr(UnicodeString(cp))));
518
if (decompPos == decomp.length()) { // done, have all decomp characters!
519
//buffer.append(segment.substring(i + UTF16.getCharCount(cp))); // add remaining segment chars
520
buffer.append(segment, i+UTF16_CHAR_LENGTH(cp), segment.length()-i-UTF16_CHAR_LENGTH(cp));
478
if (decompPos == decompLen) { // done, have all decomp characters!
479
//u_strcat(buff+bufLen, segment+i);
480
memcpy(buff+bufLen, segment+i, (segLen-i)*sizeof(UChar));
524
decompCp = decomp.char32At(decompPos);
525
decompPos += UTF16_CHAR_LENGTH(decompCp);
526
//decompClass = getClass(decompCp);
486
UTF_NEXT_CHAR(decomp, decompPos, decompLen, decompCp);
528
488
//if (PROGRESS) printf(" buffer: %s\n", UToS(Tr(UnicodeString(cp))));
530
490
// brute force approach
533
//UTF16.append(buffer, cp);
492
UTF_APPEND_CHAR(buff, bufLen, bufSize, cp);
536
494
/* TODO: optimize
537
495
// since we know that the classes are monotonically increasing, after zero
551
509
//if (PROGRESS) printf("Matches\n");
553
if (buffer.length() == 0) {
554
512
Hashtable *result = new Hashtable(FALSE, status);
555
513
result->setValueDeleter(uhash_deleteUnicodeString);
556
514
result->put("", new UnicodeString(""), status);
557
515
return result; // succeed, but no remainder
560
//String remainder = buffer.toString();
561
UnicodeString remainder = buffer;
563
518
// brute force approach
564
519
// check to make sure result is canonically equivalent
565
//String trial = Normalizer.normalize(UTF16.valueOf(comp) + remainder, Normalizer.DECOMP, 0);
567
UnicodeString temp = remainder;
568
temp.insert(0, comp);
569
Normalizer::normalize(temp, UNORM_NFD, 0, trial, status);
571
//if (!segment.regionMatches(segmentPos, trial, 0, segment.length() - segmentPos)) return null;
572
if (segment.indexOf(trial, 0, segment.length() - segmentPos, segmentPos, segment.length() - segmentPos)==-1) {
520
int32_t tempLen = inputLen + bufLen;
522
UChar trial[bufSize];
523
unorm_decompose(trial, bufSize, temp, tempLen, FALSE, FALSE, &status);
525
if(uprv_memcmp(segment+segmentPos, trial, (segLen - segmentPos)*sizeof(UChar)) != 0) {
576
// get the remaining combinations
577
return getEquivalents2(remainder, status);
529
return getEquivalents2(buff, bufLen, status);