2
******************************************************************************
4
* Copyright (C) 2009-2010, International Business Machines
5
* Corporation and others. All Rights Reserved.
7
******************************************************************************
10
package com.ibm.icu.impl;
12
import java.util.ArrayList;
14
import com.ibm.icu.text.UnicodeSet;
15
import com.ibm.icu.text.UnicodeSet.SpanCondition;
18
* Implement span() etc. for a set with strings.
19
* Avoid recursion because of its exponential complexity.
20
* Instead, try multiple paths at once and track them with an IndexList.
22
public class UnicodeSetStringSpan {
25
* Which span() variant will be used? The object is either built for one variant and used once, or built for all and
26
* may be used many times.
28
public static final int FWD = 0x20;
29
public static final int BACK = 0x10;
30
public static final int UTF16 = 8;
31
public static final int CONTAINED = 2;
32
public static final int NOT_CONTAINED = 1;
34
public static final int ALL = 0x3f;
36
public static final int FWD_UTF16_CONTAINED = FWD | UTF16 | CONTAINED;
37
public static final int FWD_UTF16_NOT_CONTAINED = FWD | UTF16 | NOT_CONTAINED;
38
public static final int BACK_UTF16_CONTAINED = BACK | UTF16 | CONTAINED;
39
public static final int BACK_UTF16_NOT_CONTAINED = BACK | UTF16 | NOT_CONTAINED;
41
// Special spanLength short values. (since Java has not unsigned byte type)
42
// All code points in the string are contained in the parent set.
43
static final short ALL_CP_CONTAINED = 0xff;
44
// The spanLength is >=0xfe.
45
static final short LONG_SPAN = ALL_CP_CONTAINED - 1;
47
// Set for span(). Same as parent but without strings.
48
private UnicodeSet spanSet;
50
// Set for span(not contained).
51
// Same as spanSet, plus characters that start or end strings.
52
private UnicodeSet spanNotSet;
54
// The strings of the parent set.
55
private ArrayList<String> strings;
57
// the lengths of span(), spanBack() etc. for each string.
58
private short[] spanLengths;
60
// Maximum lengths of relevant strings.
61
private int maxLength16;
63
// Set up for all variants of span()?
67
private OffsetList offsets;
69
// Construct for all variants of span(), or only for any one variant.
70
// Initialize as little as possible, for single use.
71
public UnicodeSetStringSpan(final UnicodeSet set, final ArrayList<String> setStrings, int which) {
72
spanSet = new UnicodeSet(0, 0x10ffff);
75
spanSet.retainAll(set);
76
if (0 != (which & NOT_CONTAINED)) {
77
// Default to the same sets.
78
// addToSpanNotSet() will create a separate set if necessary.
81
offsets = new OffsetList();
83
// Determine if the strings even need to be taken into account at all for span() etc.
84
// If any string is relevant, then all strings need to be used for
85
// span(longest match) but only the relevant ones for span(while contained).
86
// TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH
87
// and do not store UTF-8 strings if !thisRelevant and CONTAINED.
88
// (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.)
89
// Also count the lengths of the UTF-8 versions of the strings for memory allocation.
90
int stringsLength = strings.size();
93
boolean someRelevant = false;
94
for (i = 0; i < stringsLength; ++i) {
95
String string = strings.get(i);
96
int length16 = string.length();
97
spanLength = spanSet.span(string, SpanCondition.CONTAINED);
98
if (spanLength < length16) { // Relevant string.
101
if ((0 != (which & UTF16)) && length16 > maxLength16) {
102
maxLength16 = length16;
110
// Freeze after checking for the need to use strings at all because freezing
111
// a set takes some time and memory which are wasted if there are no relevant strings.
116
int spanBackLengthsOffset;
118
// Allocate a block of meta data.
121
// 2 sets of span lengths
122
allocSize = stringsLength * (2);
124
allocSize = stringsLength; // One set of span lengths.
126
spanLengths = new short[allocSize];
129
// Store span lengths for all span() variants.
130
spanBackLengthsOffset = stringsLength;
132
// Store span lengths for only one span() variant.
133
spanBackLengthsOffset = 0;
136
// Set the meta data and spanNotSet and write the UTF-8 strings.
138
for (i = 0; i < stringsLength; ++i) {
139
String string = strings.get(i);
140
int length16 = string.length();
141
spanLength = spanSet.span(string, SpanCondition.CONTAINED);
142
if (spanLength < length16) { // Relevant string.
143
if (0 != (which & UTF16)) {
144
if (0 != (which & CONTAINED)) {
145
if (0 != (which & FWD)) {
146
spanLengths[i] = makeSpanLengthByte(spanLength);
148
if (0 != (which & BACK)) {
149
spanLength = length16
150
- spanSet.spanBack(string, length16, SpanCondition.CONTAINED);
151
spanLengths[spanBackLengthsOffset + i] = makeSpanLengthByte(spanLength);
153
} else /* not CONTAINED, not all, but NOT_CONTAINED */{
154
spanLengths[i] = spanLengths[spanBackLengthsOffset + i] = 0; // Only store a relevant/irrelevant
158
if (0 != (which & NOT_CONTAINED)) {
159
// Add string start and end code points to the spanNotSet so that
160
// a span(while not contained) stops before any string.
162
if (0 != (which & FWD)) {
163
c = string.codePointAt(0);
166
if (0 != (which & BACK)) {
167
c = string.codePointBefore(length16);
171
} else { // Irrelevant string.
173
spanLengths[i] = spanLengths[spanBackLengthsOffset + i] = ALL_CP_CONTAINED;
175
// All spanXYZLengths pointers contain the same address.
176
spanLengths[i] = ALL_CP_CONTAINED;
188
* Constructs a copy of an existing UnicodeSetStringSpan.
189
* Assumes which==ALL for a frozen set.
191
public UnicodeSetStringSpan(final UnicodeSetStringSpan otherStringSpan, final ArrayList<String> newParentSetStrings) {
192
spanSet = otherStringSpan.spanSet;
193
strings = newParentSetStrings;
194
maxLength16 = otherStringSpan.maxLength16;
196
if (otherStringSpan.spanNotSet == otherStringSpan.spanSet) {
197
spanNotSet = spanSet;
199
spanNotSet = (UnicodeSet) otherStringSpan.spanNotSet.clone();
201
offsets = new OffsetList();
203
spanLengths = otherStringSpan.spanLengths.clone();
207
* Do the strings need to be checked in span() etc.?
209
* @return TRUE if strings need to be checked (call span() here), FALSE if not (use a BMPSet for best performance).
211
public boolean needsStringSpanUTF16() {
212
return (maxLength16 != 0);
215
// For fast UnicodeSet::contains(c).
216
public boolean contains(int c) {
217
return spanSet.contains(c);
220
// Add a starting or ending string character to the spanNotSet
221
// so that a character span ends before any string.
222
private void addToSpanNotSet(int c) {
223
if (spanNotSet == null || spanNotSet == spanSet) {
224
if (spanSet.contains(c)) {
225
return; // Nothing to do.
227
spanNotSet = spanSet.cloneAsThawed();
233
* Note: In span() when spanLength==0 (after a string match, or at the beginning after an empty code point span) and
234
* in spanNot() and spanNotUTF8(), string matching could use a binary search because all string matches are done
235
* from the same start index.
237
* For UTF-8, this would require a comparison function that returns UTF-16 order.
239
* This optimization should not be necessary for normal UnicodeSets because most sets have no strings, and most sets
240
* with strings have very few very short strings. For cases with many strings, it might be better to use a different
241
* API and implementation with a DFA (state machine).
245
* Algorithm for span(SpanCondition.CONTAINED)
247
* Theoretical algorithm: - Iterate through the string, and at each code point boundary: + If the code point there
248
* is in the set, then remember to continue after it. + If a set string matches at the current position, then
249
* remember to continue after it. + Either recursively span for each code point or string match, or recursively span
250
* for all but the shortest one and iteratively continue the span with the shortest local match. + Remember the
251
* longest recursive span (the farthest end point). + If there is no match at the current position, neither for the
252
* code point there nor for any set string, then stop and return the longest recursive span length.
254
* Optimized implementation:
256
* (We assume that most sets will have very few very short strings. A span using a string-less set is extremely
259
* Create and cache a spanSet which contains all of the single code points of the original set but none of its
262
* - Start with spanLength=spanSet.span(SpanCondition.CONTAINED). - Loop: + Try to match each set
263
* string at the end of the spanLength. ~ Set strings that start with set-contained code points must be matched with
264
* a partial overlap because the recursive algorithm would have tried to match them at every position. ~ Set strings
265
* that entirely consist of set-contained code points are irrelevant for span(SpanCondition.CONTAINED)
266
* because the recursive algorithm would continue after them anyway and find the longest recursive match from their
267
* end. ~ Rather than recursing, note each end point of a set string match. + If no set string matched after
268
* spanSet.span(), then return with where the spanSet.span() ended. + If at least one set string matched after
269
* spanSet.span(), then pop the shortest string match end point and continue the loop, trying to match all set
270
* strings from there. + If at least one more set string matched after a previous string match, then test if the
271
* code point after the previous string match is also contained in the set. Continue the loop with the shortest end
272
* point of either this code point or a matching set string. + If no more set string matched after a previous string
273
* match, then try another spanLength=spanSet.span(SpanCondition.CONTAINED). Stop if spanLength==0,
274
* otherwise continue the loop.
276
* By noting each end point of a set string match, the function visits each string position at most once and
277
* finishes in linear time.
279
* The recursive algorithm may visit the same string position many times if multiple paths lead to it and finishes
280
* in exponential time.
284
* Algorithm for span(SIMPLE)
286
* Theoretical algorithm: - Iterate through the string, and at each code point boundary: + If the code point there
287
* is in the set, then remember to continue after it. + If a set string matches at the current position, then
288
* remember to continue after it. + Continue from the farthest match position and ignore all others. + If there is
289
* no match at the current position, then stop and return the current position.
291
* Optimized implementation:
293
* (Same assumption and spanSet as above.)
295
* - Start with spanLength=spanSet.span(SpanCondition.CONTAINED). - Loop: + Try to match each set
296
* string at the end of the spanLength. ~ Set strings that start with set-contained code points must be matched with
297
* a partial overlap because the standard algorithm would have tried to match them earlier. ~ Set strings that
298
* entirely consist of set-contained code points must be matched with a full overlap because the longest-match
299
* algorithm would hide set string matches that end earlier. Such set strings need not be matched earlier inside the
300
* code point span because the standard algorithm would then have continued after the set string match anyway. ~
301
* Remember the longest set string match (farthest end point) from the earliest starting point. + If no set string
302
* matched after spanSet.span(), then return with where the spanSet.span() ended. + If at least one set string
303
* matched, then continue the loop after the longest match from the earliest position. + If no more set string
304
* matched after a previous string match, then try another
305
* spanLength=spanSet.span(SpanCondition.CONTAINED). Stop if spanLength==0, otherwise continue the
311
* @param s The string to be spanned
312
* @param start The start index that the span begins
313
* @param spanCondition The span condition
314
* @return the length of the span
317
public synchronized int span(CharSequence s, int start, int length, SpanCondition spanCondition) {
318
if (spanCondition == SpanCondition.NOT_CONTAINED) {
319
return spanNot(s, start, length);
321
int spanLength = spanSet.span(s.subSequence(start, start + length), SpanCondition.CONTAINED);
322
if (spanLength == length) {
326
// Consider strings; they may overlap with the span.
328
if (spanCondition == SpanCondition.CONTAINED) {
329
// Use offset list to try all possibilities.
330
initSize = maxLength16;
332
offsets.setMaxLength(initSize);
333
int pos = start + spanLength, rest = length - spanLength;
334
int i, stringsLength = strings.size();
336
if (spanCondition == SpanCondition.CONTAINED) {
337
for (i = 0; i < stringsLength; ++i) {
338
int overlap = spanLengths[i];
339
if (overlap == ALL_CP_CONTAINED) {
340
continue; // Irrelevant string.
342
String string = strings.get(i);
344
int length16 = string.length();
346
// Try to match this string at pos-overlap..pos.
347
if (overlap >= LONG_SPAN) {
349
// While contained: No point matching fully inside the code point span.
350
overlap = string.offsetByCodePoints(overlap, -1); // Length of the string minus the last code
353
if (overlap > spanLength) {
354
overlap = spanLength;
356
int inc = length16 - overlap; // Keep overlap+inc==length16.
361
// Try to match if the increment is not listed already.
362
if (!offsets.containsOffset(inc) && matches16CPB(s, pos - overlap, length, string, length16)) {
364
return length; // Reached the end of the string.
366
offsets.addOffset(inc);
376
int maxInc = 0, maxOverlap = 0;
377
for (i = 0; i < stringsLength; ++i) {
378
int overlap = spanLengths[i];
379
// For longest match, we do need to try to match even an all-contained string
380
// to find the match from the earliest start.
382
String string = strings.get(i);
384
int length16 = string.length();
386
// Try to match this string at pos-overlap..pos.
387
if (overlap >= LONG_SPAN) {
389
// Longest match: Need to match fully inside the code point span
390
// to find the match from the earliest start.
392
if (overlap > spanLength) {
393
overlap = spanLength;
395
int inc = length16 - overlap; // Keep overlap+inc==length16.
397
if (inc > rest || overlap < maxOverlap) {
400
// Try to match if the string is longer or starts earlier.
401
if ((overlap > maxOverlap || /* redundant overlap==maxOverlap && */inc > maxInc)
402
&& matches16CPB(s, pos - overlap, length, string, length16)) {
403
maxInc = inc; // Longest match from earliest start.
404
maxOverlap = overlap;
412
if (maxInc != 0 || maxOverlap != 0) {
413
// Longest-match algorithm, and there was a string match.
414
// Simply continue after it.
418
return length; // Reached the end of the string.
420
spanLength = 0; // Match strings from after a string match.
424
// Finished trying to match all strings at pos.
426
if (spanLength != 0 || pos == 0) {
427
// The position is after an unlimited code point span (spanLength!=0),
428
// not after a string match.
429
// The only position where spanLength==0 after a span is pos==0.
430
// Otherwise, an unlimited code point span is only tried again when no
431
// strings match, and if such a non-initial span fails we stop.
432
if (offsets.isEmpty()) {
433
return pos - start; // No strings matched after a span.
435
// Match strings from after the next string match.
437
// The position is after a string match (or a single code point).
438
if (offsets.isEmpty()) {
439
// No more strings matched after a previous string match.
440
// Try another code point span from after the last string match.
441
spanLength = spanSet.span(s.subSequence(pos, pos + rest), SpanCondition.CONTAINED);
442
if (spanLength == rest || // Reached the end of the string, or
443
spanLength == 0 // neither strings nor span progressed.
445
return pos + spanLength - start;
449
continue; // spanLength>0: Match strings from after a span.
451
// Try to match only one code point from after a string match if some
452
// string matched beyond it, so that we try all possible positions
453
// and don't overshoot.
454
spanLength = spanOne(spanSet, s, pos, rest);
455
if (spanLength > 0) {
456
if (spanLength == rest) {
457
return length; // Reached the end of the string.
459
// Match strings after this code point.
460
// There cannot be any increments below it because UnicodeSet strings
461
// contain multiple code points.
464
offsets.shift(spanLength);
466
continue; // Match strings from after a single code point.
468
// Match strings from after the next string match.
471
int minOffset = offsets.popMinimum();
474
spanLength = 0; // Match strings from after a string match.
479
* Span a string backwards.
481
* @param s The string to be spanned
482
* @param spanCondition The span condition
483
* @return The string index which starts the span (i.e. inclusive).
486
public synchronized int spanBack(CharSequence s, int length, SpanCondition spanCondition) {
487
if (spanCondition == SpanCondition.NOT_CONTAINED) {
488
return spanNotBack(s, length);
490
int pos = spanSet.spanBack(s, length, SpanCondition.CONTAINED);
494
int spanLength = length - pos;
496
// Consider strings; they may overlap with the span.
498
if (spanCondition == SpanCondition.CONTAINED) {
499
// Use offset list to try all possibilities.
500
initSize = maxLength16;
502
offsets.setMaxLength(initSize);
503
int i, stringsLength = strings.size();
504
int spanBackLengthsOffset = 0;
506
spanBackLengthsOffset = stringsLength;
509
if (spanCondition == SpanCondition.CONTAINED) {
510
for (i = 0; i < stringsLength; ++i) {
511
int overlap = spanLengths[spanBackLengthsOffset + i];
512
if (overlap == ALL_CP_CONTAINED) {
513
continue; // Irrelevant string.
515
String string = strings.get(i);
517
int length16 = string.length();
519
// Try to match this string at pos-(length16-overlap)..pos-length16.
520
if (overlap >= LONG_SPAN) {
522
// While contained: No point matching fully inside the code point span.
524
len1 = string.offsetByCodePoints(0, 1);
525
overlap -= len1; // Length of the string minus the first code point.
527
if (overlap > spanLength) {
528
overlap = spanLength;
530
int dec = length16 - overlap; // Keep dec+overlap==length16.
535
// Try to match if the decrement is not listed already.
536
if (!offsets.containsOffset(dec) && matches16CPB(s, pos - dec, length, string, length16)) {
538
return 0; // Reached the start of the string.
540
offsets.addOffset(dec);
550
int maxDec = 0, maxOverlap = 0;
551
for (i = 0; i < stringsLength; ++i) {
552
int overlap = spanLengths[spanBackLengthsOffset + i];
553
// For longest match, we do need to try to match even an all-contained string
554
// to find the match from the latest end.
556
String string = strings.get(i);
558
int length16 = string.length();
560
// Try to match this string at pos-(length16-overlap)..pos-length16.
561
if (overlap >= LONG_SPAN) {
563
// Longest match: Need to match fully inside the code point span
564
// to find the match from the latest end.
566
if (overlap > spanLength) {
567
overlap = spanLength;
569
int dec = length16 - overlap; // Keep dec+overlap==length16.
571
if (dec > pos || overlap < maxOverlap) {
574
// Try to match if the string is longer or ends later.
575
if ((overlap > maxOverlap || /* redundant overlap==maxOverlap && */dec > maxDec)
576
&& matches16CPB(s, pos - dec, length, string, length16)) {
577
maxDec = dec; // Longest match from latest end.
578
maxOverlap = overlap;
586
if (maxDec != 0 || maxOverlap != 0) {
587
// Longest-match algorithm, and there was a string match.
588
// Simply continue before it.
591
return 0; // Reached the start of the string.
593
spanLength = 0; // Match strings from before a string match.
597
// Finished trying to match all strings at pos.
599
if (spanLength != 0 || pos == length) {
600
// The position is before an unlimited code point span (spanLength!=0),
601
// not before a string match.
602
// The only position where spanLength==0 before a span is pos==length.
603
// Otherwise, an unlimited code point span is only tried again when no
604
// strings match, and if such a non-initial span fails we stop.
605
if (offsets.isEmpty()) {
606
return pos; // No strings matched before a span.
608
// Match strings from before the next string match.
610
// The position is before a string match (or a single code point).
611
if (offsets.isEmpty()) {
612
// No more strings matched before a previous string match.
613
// Try another code point span from before the last string match.
615
pos = spanSet.spanBack(s, oldPos, SpanCondition.CONTAINED);
616
spanLength = oldPos - pos;
617
if (pos == 0 || // Reached the start of the string, or
618
spanLength == 0 // neither strings nor span progressed.
622
continue; // spanLength>0: Match strings from before a span.
624
// Try to match only one code point from before a string match if some
625
// string matched beyond it, so that we try all possible positions
626
// and don't overshoot.
627
spanLength = spanOneBack(spanSet, s, pos);
628
if (spanLength > 0) {
629
if (spanLength == pos) {
630
return 0; // Reached the start of the string.
632
// Match strings before this code point.
633
// There cannot be any decrements below it because UnicodeSet strings
634
// contain multiple code points.
636
offsets.shift(spanLength);
638
continue; // Match strings from before a single code point.
640
// Match strings from before the next string match.
643
pos -= offsets.popMinimum();
644
spanLength = 0; // Match strings from before a string match.
649
* Algorithm for spanNot()==span(SpanCondition.NOT_CONTAINED)
651
* Theoretical algorithm: - Iterate through the string, and at each code point boundary: + If the code point there
652
* is in the set, then return with the current position. + If a set string matches at the current position, then
653
* return with the current position.
655
* Optimized implementation:
657
* (Same assumption as for span() above.)
659
* Create and cache a spanNotSet which contains all of the single code points of the original set but none of its
660
* strings. For each set string add its initial code point to the spanNotSet. (Also add its final code point for
664
* + Do spanLength=spanNotSet.span(SpanCondition.NOT_CONTAINED).
665
* + If the current code point is in the original set, then return the current position.
666
* + If any set string matches at the current position, then return the current position.
667
* + If there is no match at the current position, neither for the code point
668
* there nor for any set string, then skip this code point and continue the loop. This happens for
669
* set-string-initial code points that were added to spanNotSet when there is not actually a match for such a set
672
* @return the length of the span
674
private int spanNot(CharSequence s, int start, int length) {
675
int pos = start, rest = length;
676
int i, stringsLength = strings.size();
678
// Span until we find a code point from the set,
679
// or a code point that starts or ends some string.
680
i = spanNotSet.span(s.subSequence(pos, pos + rest), SpanCondition.NOT_CONTAINED);
682
return length; // Reached the end of the string.
687
// Check whether the current code point is in the original set,
688
// without the string starts and ends.
689
int cpLength = spanOne(spanSet, s, pos, rest);
691
return pos - start; // There is a set element at pos.
694
// Try to match the strings at pos.
695
for (i = 0; i < stringsLength; ++i) {
696
if (spanLengths[i] == ALL_CP_CONTAINED) {
697
continue; // Irrelevant string.
699
String string = strings.get(i);
701
int length16 = string.length();
702
if (length16 <= rest && matches16CPB(s, pos, length, string, length16)) {
703
return pos - start; // There is a set element at pos.
707
// The span(while not contained) ended on a string start/end which is
708
// not in the original set. Skip this code point and continue.
713
return length; // Reached the end of the string.
716
private int spanNotBack(CharSequence s, int length) {
718
int i, stringsLength = strings.size();
720
// Span until we find a code point from the set,
721
// or a code point that starts or ends some string.
722
pos = spanNotSet.spanBack(s, pos, SpanCondition.NOT_CONTAINED);
724
return 0; // Reached the start of the string.
727
// Check whether the current code point is in the original set,
728
// without the string starts and ends.
729
int cpLength = spanOneBack(spanSet, s, pos);
731
return pos; // There is a set element at pos.
734
// Try to match the strings at pos.
735
for (i = 0; i < stringsLength; ++i) {
736
// Use spanLengths rather than a spanLengths pointer because
737
// it is easier and we only need to know whether the string is irrelevant
738
// which is the same in either array.
739
if (spanLengths[i] == ALL_CP_CONTAINED) {
740
continue; // Irrelevant string.
742
String string = strings.get(i);
744
int length16 = string.length();
745
if (length16 <= pos && matches16CPB(s, pos - length16, length, string, length16)) {
746
return pos; // There is a set element at pos.
750
// The span(while not contained) ended on a string start/end which is
751
// not in the original set. Skip this code point and continue.
755
return 0; // Reached the start of the string.
758
static short makeSpanLengthByte(int spanLength) {
759
// 0xfe==UnicodeSetStringSpan::LONG_SPAN
760
return spanLength < LONG_SPAN ? (short) spanLength : LONG_SPAN;
763
// Compare strings without any argument checks. Requires length>0.
764
private static boolean matches16(CharSequence s, int start, final String t, int length) {
765
int end = start + length;
766
while (length-- > 0) {
767
if (s.charAt(--end) != t.charAt(length)) {
775
* Compare 16-bit Unicode strings (which may be malformed UTF-16)
776
* at code point boundaries.
777
* That is, each edge of a match must not be in the middle of a surrogate pair.
778
* @param start The start index of s.
779
* @param slength The length of s from start.
780
* @param tlength The length of t.
782
static boolean matches16CPB(CharSequence s, int start, int slength, final String t, int tlength) {
783
return !(0 < start && com.ibm.icu.text.UTF16.isLeadSurrogate (s.charAt(start - 1)) &&
784
com.ibm.icu.text.UTF16.isTrailSurrogate(s.charAt(start + 0)))
785
&& !(tlength < slength && com.ibm.icu.text.UTF16.isLeadSurrogate (s.charAt(start + tlength - 1)) &&
786
com.ibm.icu.text.UTF16.isTrailSurrogate(s.charAt(start + tlength)))
787
&& matches16(s, start, t, tlength);
790
// Does the set contain the next code point?
791
// If so, return its length; otherwise return its negative length.
792
static int spanOne(final UnicodeSet set, CharSequence s, int start, int length) {
793
char c = s.charAt(start);
794
if (c >= 0xd800 && c <= 0xdbff && length >= 2) {
795
char c2 = s.charAt(start + 1);
796
if (com.ibm.icu.text.UTF16.isTrailSurrogate(c2)) {
797
int supplementary = UCharacterProperty.getRawSupplementary(c, c2);
798
return set.contains(supplementary) ? 2 : -2;
801
return set.contains(c) ? 1 : -1;
804
static int spanOneBack(final UnicodeSet set, CharSequence s, int length) {
805
char c = s.charAt(length - 1);
806
if (c >= 0xdc00 && c <= 0xdfff && length >= 2) {
807
char c2 = s.charAt(length - 2);
808
if (com.ibm.icu.text.UTF16.isLeadSurrogate(c2)) {
809
int supplementary = UCharacterProperty.getRawSupplementary(c2, c);
810
return set.contains(supplementary) ? 2 : -2;
813
return set.contains(c) ? 1 : -1;
818
* Helper class for UnicodeSetStringSpan.
820
* List of offsets from the current position from where to try matching a code point or a string. Store offsets rather
821
* than indexes to simplify the code and use the same list for both increments (in span()) and decrements (in
824
* Assumption: The maximum offset is limited, and the offsets that are stored at any one time are relatively dense, that
825
* is, there are normally no gaps of hundreds or thousands of offset values.
827
* The implementation uses a circular buffer of byte flags, each indicating whether the corresponding offset is in the
828
* list. This avoids inserting into a sorted list of offsets (or absolute indexes) and physically moving part of the
831
* Note: In principle, the caller should setMaxLength() to the maximum of the max string length and U16_LENGTH/U8_LENGTH
832
* to account for "long" single code points.
834
* Note: If maxLength were guaranteed to be no more than 32 or 64, the list could be stored as bit flags in a single
835
* integer. Rather than handling a circular buffer with a start list index, the integer would simply be shifted when
836
* lower offsets are removed. UnicodeSet does not have a limit on the lengths of strings.
838
static class OffsetList {
839
private boolean[] list;
843
public OffsetList() {
844
list = new boolean[16]; // default size
847
public void setMaxLength(int maxLength) {
848
if (maxLength > list.length) {
849
list = new boolean[maxLength];
854
public void clear() {
855
for (int i = list.length; i-- > 0;) {
861
public boolean isEmpty() {
862
return (length == 0);
865
// Reduce all stored offsets by delta, used when the current position
867
// There must not be any offsets lower than delta.
868
// If there is an offset equal to delta, it is removed.
869
// delta=[1..maxLength]
870
public void shift(int delta) {
871
int i = start + delta;
872
if (i >= list.length) {
882
// Add an offset. The list must not contain it yet.
883
// offset=[1..maxLength]
884
public void addOffset(int offset) {
885
int i = start + offset;
886
if (i >= list.length) {
893
// offset=[1..maxLength]
894
public boolean containsOffset(int offset) {
895
int i = start + offset;
896
if (i >= list.length) {
902
// Find the lowest stored offset from a non-empty list, remove it,
903
// and reduce all other offsets by this minimum.
904
// Returns [1..maxLength].
905
public int popMinimum() {
906
// Look for the next offset in list[start+1..list.length-1].
907
int i = start, result;
908
while (++i < list.length) {
919
// Wrap around and look for the next offset in list[0..start].
920
// Since the list is not empty, there will be one.
921
result = list.length - start;