2
*******************************************************************************
3
* Copyright (C) 1996-2010, International Business Machines Corporation and *
4
* others. All Rights Reserved. *
5
*******************************************************************************
8
*******************************************************************************
10
package com.ibm.icu.text;
13
* import java.text.StringCharacterIterator;
14
* import java.text.CharacterIterator;
16
import java.text.CharacterIterator;
17
import java.util.MissingResourceException;
19
import com.ibm.icu.impl.CharacterIteratorWrapper;
20
import com.ibm.icu.impl.ICUDebug;
21
import com.ibm.icu.impl.Norm2AllModes;
22
import com.ibm.icu.impl.Normalizer2Impl;
23
import com.ibm.icu.impl.StringUCharacterIterator;
24
import com.ibm.icu.impl.UCharacterProperty;
25
import com.ibm.icu.lang.UCharacter;
28
* <p><code>CollationElementIterator</code> is an iterator created by
29
* a RuleBasedCollator to walk through a string. The return result of
30
* each iteration is a 32-bit collation element that defines the
31
* ordering priority of the next character or sequence of characters
32
* in the source string.</p>
34
* <p>For illustration, consider the following in Spanish:
37
* "ca" -> the first collation element is collation_element('c') and second
38
* collation element is collation_element('a').
40
* Since "ch" in Spanish sorts as one entity, the below example returns one
41
* collation element for the two characters 'c' and 'h'
43
* "cha" -> the first collation element is collation_element('ch') and second
44
* collation element is collation_element('a').
50
* Since the character 'æ' is a composed character of 'a' and 'e', the
51
* iterator returns two collation elements for the single character 'æ'
53
* "æb" -> the first collation element is collation_element('a'), the
54
* second collation element is collation_element('e'), and the
55
* third collation element is collation_element('b').
60
* <p>For collation ordering comparison, the collation element results
61
* cannot be compared simply by using basic arithmetic operators,
62
* e.g. <, == or >, further processing has to be done. Details
63
* can be found in the ICU
64
* <a href="http://www.icu-project.org/userguide/Collate_ServiceArchitecture.html">
65
* user guide</a>. An example of using the CollationElementIterator
66
* for collation ordering comparison is the class
67
* <a href="StringSearch.html">com.ibm.icu.text.StringSearch</a>.</p>
69
* <p>To construct a CollationElementIterator object, users
70
* call the method getCollationElementIterator() on a
71
* RuleBasedCollator that defines the desired sorting order.</p>
76
* String testString = "This is a test";
77
* RuleBasedCollator rbc = new RuleBasedCollator("&a<b");
78
* CollationElementIterator iterator = rbc.getCollationElementIterator(testString);
79
* int primaryOrder = iterator.IGNORABLE;
80
* while (primaryOrder != iterator.NULLORDER) {
81
* int order = iterator.next();
82
* if (order != iterator.IGNORABLE &&
83
* order != iterator.NULLORDER) {
84
* // order is valid, not ignorable and we have not passed the end
85
* // of the iteration, we do something
86
* primaryOrder = CollationElementIterator.primaryOrder(order);
87
* System.out.println("Next primary order 0x" +
88
* Integer.toHexString(primaryOrder));
95
* This class is not subclassable
98
* @see RuleBasedCollator
100
* @author Syn Wee Quek
103
public final class CollationElementIterator
107
// public data members --------------------------------------------------
110
* <p>This constant is returned by the iterator in the methods
111
* next() and previous() when the end or the beginning of the
112
* source string has been reached, and there are no more valid
113
* collation elements to return.</p>
115
* <p>See class documentation for an example of use.</p>
119
public final static int NULLORDER = 0xffffffff;
122
* <p>This constant is returned by the iterator in the methods
123
* next() and previous() when a collation element result is to be
126
* <p>See class documentation for an example of use.</p>
130
public static final int IGNORABLE = 0;
132
// public methods -------------------------------------------------------
134
// public getters -------------------------------------------------------
137
* <p>Returns the character offset in the source string
138
* corresponding to the next collation element. I.e., getOffset()
139
* returns the position in the source string corresponding to the
140
* collation element that will be returned by the next call to
141
* next(). This value could be any of:
143
* <li> The index of the <b>first</b> character corresponding to
144
* the next collation element. (This means that if
145
* <code>setOffset(offset)</code> sets the index in the middle of
146
* a contraction, <code>getOffset()</code> returns the index of
147
* the first character in the contraction, which may not be equal
148
* to the original offset that was set. Hence calling getOffset()
149
* immediately after setOffset(offset) does not guarantee that the
150
* original offset set will be returned.)
151
* <li> If normalization is on, the index of the <b>immediate</b>
152
* subsequent character, or composite character with the first
153
* character, having a combining class of 0.
154
* <li> The length of the source string, if iteration has reached
158
* @return The character offset in the source string corresponding to the
159
* collation element that will be returned by the next call to
163
public int getOffset()
165
if (m_bufferOffset_ != -1) {
171
return m_source_.getIndex();
176
* <p> Returns the maximum length of any expansion sequence that ends with
177
* the specified collation element. If there is no expansion with this
178
* collation element as the last element, returns 1.
180
* @param ce a collation element returned by previous() or next().
181
* @return the maximum length of any expansion sequence ending
182
* with the specified collation element.
185
public int getMaxExpansion(int ce)
188
int limit = m_collator_.m_expansionEndCE_.length;
189
long unsignedce = ce & 0xFFFFFFFFl;
190
while (start < limit - 1) {
191
int mid = start + ((limit - start) >> 1);
192
long midce = m_collator_.m_expansionEndCE_[mid] & 0xFFFFFFFFl;
193
if (unsignedce <= midce) {
201
if (m_collator_.m_expansionEndCE_[start] == ce) {
202
result = m_collator_.m_expansionEndCEMaxSize_[start];
204
else if (limit < m_collator_.m_expansionEndCE_.length &&
205
m_collator_.m_expansionEndCE_[limit] == ce) {
206
result = m_collator_.m_expansionEndCEMaxSize_[limit];
208
else if ((ce & 0xFFFF) == 0x00C0) {
214
// public other methods -------------------------------------------------
217
* <p> Resets the cursor to the beginning of the string. The next
218
* call to next() or previous() will return the first and last
219
* collation element in the string, respectively.</p>
221
* <p>If the RuleBasedCollator used by this iterator has had its
222
* attributes changed, calling reset() will reinitialize the
223
* iterator to use the new attributes.</p>
229
m_source_.setToStart();
230
updateInternalState();
234
* <p>Get the next collation element in the source string.</p>
236
* <p>This iterator iterates over a sequence of collation elements
237
* that were built from the string. Because there isn't
238
* necessarily a one-to-one mapping from characters to collation
239
* elements, this doesn't mean the same thing as "return the
240
* collation element [or ordering priority] of the next character
241
* in the string".</p>
243
* <p>This function returns the collation element that the
244
* iterator is currently pointing to, and then updates the
245
* internal pointer to point to the next element. Previous()
246
* updates the pointer first, and then returns the element. This
247
* means that when you change direction while iterating (i.e.,
248
* call next() and then call previous(), or call previous() and
249
* then call next()), you'll get back the same element twice.</p>
251
* @return the next collation element or NULLORDER if the end of the
252
* iteration has been reached.
257
m_isForwards_ = true;
258
if (m_CEBufferSize_ > 0) {
259
if (m_CEBufferOffset_ < m_CEBufferSize_) {
260
// if there are expansions left in the buffer, we return it
261
return m_CEBuffer_[m_CEBufferOffset_ ++];
264
m_CEBufferOffset_ = 0;
267
int ch_int = nextChar();
269
if (ch_int == UCharacterIterator.DONE) {
272
char ch = (char)ch_int;
273
if (m_collator_.m_isHiragana4_) {
274
/* Codepoints \u3099-\u309C are both Hiragana and Katakana. Set the flag
275
* based on whether the previous codepoint was Hiragana or Katakana.
277
m_isCodePointHiragana_ = (m_isCodePointHiragana_ && (ch >= 0x3099 && ch <= 0x309C)) ||
278
((ch >= 0x3040 && ch <= 0x309e) && !(ch > 0x3094 && ch < 0x309d));
281
int result = NULLORDER;
283
// For latin-1 characters we never need to fall back to the UCA
284
// table because all of the UCA data is replicated in the
285
// latinOneMapping array
286
result = m_collator_.m_trie_.getLatin1LinearValue(ch);
287
if (RuleBasedCollator.isSpecial(result)) {
288
result = nextSpecial(m_collator_, result, ch);
292
result = m_collator_.m_trie_.getLeadValue(ch);
293
//System.out.println(Integer.toHexString(result));
294
if (RuleBasedCollator.isSpecial(result)) {
295
// surrogate leads are handled as special ces
296
result = nextSpecial(m_collator_, result, ch);
298
if (result == CE_NOT_FOUND_ && RuleBasedCollator.UCA_ != null) {
299
// couldn't find a good CE in the tailoring
300
// if we got here, the codepoint MUST be over 0xFF - so we look
301
// directly in the UCA
302
result = RuleBasedCollator.UCA_.m_trie_.getLeadValue(ch);
303
if (RuleBasedCollator.isSpecial(result)) {
304
// UCA also gives us a special CE
305
result = nextSpecial(RuleBasedCollator.UCA_, result, ch);
309
if(result == CE_NOT_FOUND_) {
310
// maybe there is no UCA, unlikely in Java, but ported for consistency
311
result = nextImplicit(ch);
317
* <p>Get the previous collation element in the source string.</p>
319
* <p>This iterator iterates over a sequence of collation elements
320
* that were built from the string. Because there isn't
321
* necessarily a one-to-one mapping from characters to collation
322
* elements, this doesn't mean the same thing as "return the
323
* collation element [or ordering priority] of the previous
324
* character in the string".</p>
326
* <p>This function updates the iterator's internal pointer to
327
* point to the collation element preceding the one it's currently
328
* pointing to and then returns that element, while next() returns
329
* the current element and then updates the pointer. This means
330
* that when you change direction while iterating (i.e., call
331
* next() and then call previous(), or call previous() and then
332
* call next()), you'll get back the same element twice.</p>
334
* @return the previous collation element, or NULLORDER when the start of
335
* the iteration has been reached.
338
public int previous()
340
if (m_source_.getIndex() <= 0 && m_isForwards_) {
341
// if iterator is new or reset, we can immediately perform backwards
342
// iteration even when the offset is not right.
343
m_source_.setToLimit();
344
updateInternalState();
346
m_isForwards_ = false;
347
int result = NULLORDER;
348
if (m_CEBufferSize_ > 0) {
349
if (m_CEBufferOffset_ > 0) {
350
return m_CEBuffer_[-- m_CEBufferOffset_];
353
m_CEBufferOffset_ = 0;
355
int ch_int = previousChar();
356
if (ch_int == UCharacterIterator.DONE) {
359
char ch = (char)ch_int;
360
if (m_collator_.m_isHiragana4_) {
361
m_isCodePointHiragana_ = (ch >= 0x3040 && ch <= 0x309f);
363
if (m_collator_.isContractionEnd(ch) && !isBackwardsStart()) {
364
result = previousSpecial(m_collator_, CE_CONTRACTION_, ch);
368
result = m_collator_.m_trie_.getLatin1LinearValue(ch);
371
result = m_collator_.m_trie_.getLeadValue(ch);
373
if (RuleBasedCollator.isSpecial(result)) {
374
result = previousSpecial(m_collator_, result, ch);
376
if (result == CE_NOT_FOUND_) {
377
if (!isBackwardsStart()
378
&& m_collator_.isContractionEnd(ch)) {
379
result = CE_CONTRACTION_;
382
if(RuleBasedCollator.UCA_ != null) {
383
result = RuleBasedCollator.UCA_.m_trie_.getLeadValue(ch);
387
if (RuleBasedCollator.isSpecial(result)) {
388
if(RuleBasedCollator.UCA_ != null) {
389
result = previousSpecial(RuleBasedCollator.UCA_, result, ch);
394
if(result == CE_NOT_FOUND_) {
395
result = previousImplicit(ch);
401
* Return the primary order of the specified collation element,
402
* i.e. the first 16 bits. This value is unsigned.
403
* @param ce the collation element
404
* @return the element's 16 bits primary order.
407
public final static int primaryOrder(int ce)
409
return (ce & RuleBasedCollator.CE_PRIMARY_MASK_)
410
>>> RuleBasedCollator.CE_PRIMARY_SHIFT_;
413
* Return the secondary order of the specified collation element,
414
* i.e. the 16th to 23th bits, inclusive. This value is unsigned.
415
* @param ce the collation element
416
* @return the element's 8 bits secondary order
419
public final static int secondaryOrder(int ce)
421
return (ce & RuleBasedCollator.CE_SECONDARY_MASK_)
422
>> RuleBasedCollator.CE_SECONDARY_SHIFT_;
426
* Return the tertiary order of the specified collation element, i.e. the last
427
* 8 bits. This value is unsigned.
428
* @param ce the collation element
429
* @return the element's 8 bits tertiary order
432
public final static int tertiaryOrder(int ce)
434
return ce & RuleBasedCollator.CE_TERTIARY_MASK_;
438
* <p> Sets the iterator to point to the collation element
439
* corresponding to the character at the specified offset. The
440
* value returned by the next call to next() will be the collation
441
* element corresponding to the characters at offset.</p>
443
* <p>If offset is in the middle of a contracting character
444
* sequence, the iterator is adjusted to the start of the
445
* contracting sequence. This means that getOffset() is not
446
* guaranteed to return the same value set by this method.</p>
448
* <p>If the decomposition mode is on, and offset is in the middle
449
* of a decomposable range of source text, the iterator may not
450
* return a correct result for the next forwards or backwards
451
* iteration. The user must ensure that the offset is not in the
452
* middle of a decomposable range.</p>
454
* @param offset the character offset into the original source string to
455
* set. Note that this is not an offset into the corresponding
456
* sequence of collation elements.
459
public void setOffset(int offset)
461
m_source_.setIndex(offset);
462
int ch_int = m_source_.current();
463
char ch = (char)ch_int;
464
if (ch_int != UCharacterIterator.DONE && m_collator_.isUnsafe(ch)) {
465
// if it is unsafe we need to check if it is part of a contraction
466
// or a surrogate character
467
if (UTF16.isTrailSurrogate(ch)) {
468
// if it is a surrogate pair we move up one character
469
char prevch = (char)m_source_.previous();
470
if (!UTF16.isLeadSurrogate(prevch)) {
471
m_source_.setIndex(offset); // go back to the same index
475
// could be part of a contraction
476
// backup to a safe point and iterate till we pass offset
477
while (m_source_.getIndex() > 0) {
478
if (!m_collator_.isUnsafe(ch)) {
481
ch = (char)m_source_.previous();
483
updateInternalState();
485
while (m_source_.getIndex() <= offset) {
486
prevoffset = m_source_.getIndex();
489
m_source_.setIndex(prevoffset);
492
updateInternalState();
493
// direction code to prevent next and previous from returning a
494
// character if we are already at the ends
495
offset = m_source_.getIndex();
496
if (offset == 0/* m_source_.getBeginIndex() */) {
497
// preventing previous() from returning characters from the end of
498
// the string again if we are at the beginning
499
m_isForwards_ = false;
501
else if (offset == m_source_.getLength()) {
502
// preventing next() from returning characters from the start of
503
// the string again if we are at the end
504
m_isForwards_ = true;
509
* <p>Set a new source string for iteration, and reset the offset
510
* to the beginning of the text.</p>
512
* @param source the new source string for iteration.
515
public void setText(String source)
517
m_srcUtilIter_.setText(source);
518
m_source_ = m_srcUtilIter_;
519
updateInternalState();
523
* <p>Set a new source string iterator for iteration, and reset the
524
* offset to the beginning of the text.
526
* <p>The source iterator's integrity will be preserved since a new copy
527
* will be created for use.</p>
528
* @param source the new source string iterator for iteration.
531
public void setText(UCharacterIterator source)
533
m_srcUtilIter_.setText(source.getText());
534
m_source_ = m_srcUtilIter_;
535
updateInternalState();
539
* <p>Set a new source string iterator for iteration, and reset the
540
* offset to the beginning of the text.
542
* @param source the new source string iterator for iteration.
545
public void setText(CharacterIterator source)
547
m_source_ = new CharacterIteratorWrapper(source);
548
m_source_.setToStart();
549
updateInternalState();
552
// public miscellaneous methods -----------------------------------------
555
* Tests that the argument object is equal to this CollationElementIterator.
556
* Iterators are equal if the objects use the same RuleBasedCollator,
557
* the same source text and have the same current position in iteration.
558
* @param that object to test if it is equals to this
559
* CollationElementIterator
562
public boolean equals(Object that)
567
if (that instanceof CollationElementIterator) {
568
CollationElementIterator thatceiter
569
= (CollationElementIterator)that;
570
if (!m_collator_.equals(thatceiter.m_collator_)) {
574
return m_source_.getIndex() == thatceiter.m_source_.getIndex()
575
&& m_source_.getText().equals(
576
thatceiter.m_source_.getText());
581
// package private constructors ------------------------------------------
583
/**
 * Shared private constructor: stores the collator and allocates the
 * working buffers (the CE buffer, the character buffer, the utility
 * string buffer and the special-processing backup object). It does not
 * set a source text.
 * @param collator the RuleBasedCollator this iterator is based on
 */
private CollationElementIterator(RuleBasedCollator collator) {
    m_collator_ = collator;
    // working storage
    m_buffer_ = new StringBuilder();
    m_utilStringBuffer_ = new StringBuilder();
    m_CEBuffer_ = new int[CE_BUFFER_INIT_SIZE_];
    m_utilSpecialBackUp_ = new Backup();
    // ensure the FCD data is initialized
    m_nfcImpl_.getFCDTrie();
}
593
* <p>CollationElementIterator constructor. This takes a source
594
* string and a RuleBasedCollator. The iterator will walk through
595
* the source string based on the rules defined by the
596
* collator. If the source string is empty, NULLORDER will be
597
* returned on the first call to next().</p>
599
* @param source the source string.
600
* @param collator the RuleBasedCollator
603
CollationElementIterator(String source, RuleBasedCollator collator)
606
m_source_ = m_srcUtilIter_ = new StringUCharacterIterator(source);
607
updateInternalState();
611
* <p>CollationElementIterator constructor. This takes a source
612
* character iterator and a RuleBasedCollator. The iterator will
613
* walk through the source string based on the rules defined by
614
* the collator. If the source string is empty, NULLORDER will be
615
* returned on the first call to next().</p>
617
* @param source the source string iterator.
618
* @param collator the RuleBasedCollator
621
CollationElementIterator(CharacterIterator source,
622
RuleBasedCollator collator)
625
m_srcUtilIter_ = new StringUCharacterIterator();
626
m_source_ = new CharacterIteratorWrapper(source);
627
updateInternalState();
631
* <p>CollationElementIterator constructor. This takes a source
632
* character iterator and a RuleBasedCollator. The iterator will
633
* walk through the source string based on the rules defined by
634
* the collator. If the source string is empty, NULLORDER will be
635
* returned on the first call to next().</p>
637
* @param source the source string iterator.
638
* @param collator the RuleBasedCollator
641
CollationElementIterator(UCharacterIterator source,
642
RuleBasedCollator collator)
645
m_srcUtilIter_ = new StringUCharacterIterator();
646
m_srcUtilIter_.setText(source.getText());
647
m_source_ = m_srcUtilIter_;
648
updateInternalState();
651
// package private data members -----------------------------------------
654
* true if current codepoint was Hiragana
656
boolean m_isCodePointHiragana_;
658
* Position in the original string that starts with a non-FCD sequence
662
* This is the CE from CEs buffer that should be returned.
663
* Initial value is 0.
664
* Forwards iteration will end with m_CEBufferOffset_ == m_CEBufferSize_,
665
* backwards will end with m_CEBufferOffset_ == 0.
666
* The next/previous after we reach the end/beginning of the m_CEBuffer_
667
* will cause this value to be reset to 0.
669
int m_CEBufferOffset_;
672
* This is the position to which we have stored processed CEs.
673
* Initial value is 0.
674
* The next/previous after we reach the end/beginning of the m_CEBuffer_
675
* will cause this value to be reset to 0.
678
static final int CE_NOT_FOUND_ = 0xF0000000;
679
static final int CE_EXPANSION_TAG_ = 1;
680
static final int CE_CONTRACTION_TAG_ = 2;
682
* Collate Digits As Numbers (CODAN) implementation
684
static final int CE_DIGIT_TAG_ = 13;
686
// package private methods ----------------------------------------------
689
* Sets the collator used.
690
* Internal use, all data members will be reset to the default values
691
* @param collator to set
693
void setCollator(RuleBasedCollator collator)
695
m_collator_ = collator;
696
updateInternalState();
700
* <p>Sets the iterator to point to the collation element corresponding to
701
* the specified character (the parameter is a CHARACTER offset in the
702
* original string, not an offset into its corresponding sequence of
703
* collation elements). The value returned by the next call to next()
704
* will be the collation element corresponding to the specified position
705
* in the text. Unlike the public method setOffset(int), this method does
706
* not try to readjust the offset to the start of a contracting sequence.
707
* getOffset() is guaranteed to return the same value as was passed to a
708
* preceding call to setOffset().</p>
709
* @param offset new character offset into the original text to set.
711
void setExactOffset(int offset)
713
m_source_.setIndex(offset);
714
updateInternalState();
718
* Checks if iterator is in the buffer zone
719
* @return true if iterator is in buffer zone, false otherwise
723
return m_bufferOffset_ > 0;
728
* <p>Sets the iterator to point to the collation element corresponding to
729
* the specified character (the parameter is a CHARACTER offset in the
730
* original string, not an offset into its corresponding sequence of
731
* collation elements). The value returned by the next call to next()
732
* will be the collation element corresponding to the specified position
733
* in the text. Unlike the public method setOffset(int), this method does
734
* not try to readjust the offset to the start of a contracting sequence.
735
* getOffset() is guaranteed to return the same value as was passed to a
736
* preceding call to setOffset().</p>
738
* @param source the new source string iterator for iteration.
739
* @param offset to the source
741
void setText(UCharacterIterator source, int offset)
743
m_srcUtilIter_.setText(source.getText());
744
m_source_ = m_srcUtilIter_;
745
m_source_.setIndex(offset);
746
updateInternalState();
749
// private inner class --------------------------------------------------
754
private static final class Backup
756
// protected data members -------------------------------------------
759
* Backup non FCD sequence limit
761
protected int m_FCDLimit_;
763
* Backup non FCD sequence start
765
protected int m_FCDStart_;
767
* Backup if previous Codepoint is Hiragana quaternary
769
protected boolean m_isCodePointHiragana_;
771
* Backup buffer position
773
protected int m_bufferOffset_;
775
* Backup source iterator offset
777
protected int m_offset_;
779
* Backup buffer contents
781
protected StringBuffer m_buffer_;
783
// protected constructor --------------------------------------------
790
m_buffer_ = new StringBuffer();
793
// end inner class ------------------------------------------------------
796
* Direction of travel
798
private boolean m_isForwards_;
800
* Source string iterator
802
private UCharacterIterator m_source_;
804
* This is position to the m_buffer_, -1 if iterator is not in m_buffer_
806
private int m_bufferOffset_;
808
* Buffer for temporary storage of normalized characters, discontiguous
809
* characters and Thai characters
811
private StringBuilder m_buffer_;
813
* Position in the original string to continue forward FCD check from.
815
private int m_FCDLimit_;
817
* The collator this iterator is based on
819
private RuleBasedCollator m_collator_;
821
* true if Hiragana quaternary is on
823
//private boolean m_isHiragana4_;
827
private int m_CEBuffer_[];
829
* In reality we should not have to deal with expansion sequences longer
830
* than 16. However this value can be changed if a bigger buffer is needed.
831
* Note, if the size is changed to too small a number, BIG trouble.
832
* Reasonable small value is around 10, if there's no Arabic or other
833
* funky collations that have long expansion sequence. This is the longest
834
* expansion sequence this can handle without bombing out.
836
private static final int CE_BUFFER_INIT_SIZE_ = 512;
838
* Backup storage for special processing inner cases
840
private Backup m_utilSpecialBackUp_;
842
* Backup storage in special processing entry state
844
private Backup m_utilSpecialEntryBackUp_;
846
* Backup storage in special processing discontiguous state
848
private Backup m_utilSpecialDiscontiguousBackUp_;
852
private StringUCharacterIterator m_srcUtilIter_;
853
private StringBuilder m_utilStringBuffer_;
854
private StringBuilder m_utilSkippedBuffer_;
855
private CollationElementIterator m_utilColEIter_;
856
private static final Normalizer2Impl m_nfcImpl_ = Norm2AllModes.getNFCInstance().impl;
857
private StringBuilder m_unnormalized_;
858
private Normalizer2Impl.ReorderingBuffer m_n2Buffer_;
860
* The first non-zero combining class character
862
private static final int FULL_ZERO_COMBINING_CLASS_FAST_LIMIT_ = 0xC0;
864
* One character before the first character with leading non-zero combining
867
private static final int LEAD_ZERO_COMBINING_CLASS_FAST_LIMIT_ = 0x300;
869
* Mask for the last byte
871
private static final int LAST_BYTE_MASK_ = 0xFF;
873
* Shift value for the second last byte
875
private static final int SECOND_LAST_BYTE_SHIFT_ = 8;
877
// special ce values and tags -------------------------------------------
879
// private static final int CE_EXPANSION_ = 0xF1000000;
880
private static final int CE_CONTRACTION_ = 0xF2000000;
882
* Indicates the last ce has been consumed. Compare with NULLORDER.
883
* NULLORDER is returned if error occurs.
885
/* private static final int CE_NO_MORE_CES_ = 0x00010101;
886
private static final int CE_NO_MORE_CES_PRIMARY_ = 0x00010000;
887
private static final int CE_NO_MORE_CES_SECONDARY_ = 0x00000100;
888
private static final int CE_NO_MORE_CES_TERTIARY_ = 0x00000001;
890
private static final int CE_NOT_FOUND_TAG_ = 0;
892
* Charset processing, not yet implemented
894
private static final int CE_CHARSET_TAG_ = 4;
898
private static final int CE_HANGUL_SYLLABLE_TAG_ = 6;
902
private static final int CE_LEAD_SURROGATE_TAG_ = 7;
906
private static final int CE_TRAIL_SURROGATE_TAG_ = 8;
908
* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D
910
private static final int CE_CJK_IMPLICIT_TAG_ = 9;
911
private static final int CE_IMPLICIT_TAG_ = 10;
912
static final int CE_SPEC_PROC_TAG_ = 11;
914
* This is a 3 byte primary with starting secondaries and tertiaries.
915
* It fits in a single 32 bit CE and is used instead of expansion to save
916
* space without affecting the performance (hopefully).
918
private static final int CE_LONG_PRIMARY_TAG_ = 12;
920
// private static final int CE_CE_TAGS_COUNT = 14;
921
private static final int CE_BYTE_COMMON_ = 0x05;
923
// end special ce values and tags ---------------------------------------
925
private static final int HANGUL_SBASE_ = 0xAC00;
926
private static final int HANGUL_LBASE_ = 0x1100;
927
private static final int HANGUL_VBASE_ = 0x1161;
928
private static final int HANGUL_TBASE_ = 0x11A7;
929
private static final int HANGUL_VCOUNT_ = 21;
930
private static final int HANGUL_TCOUNT_ = 28;
932
// CJK stuff ------------------------------------------------------------
934
/* private static final int CJK_BASE_ = 0x4E00;
935
private static final int CJK_LIMIT_ = 0x9FFF+1;
936
private static final int CJK_COMPAT_USED_BASE_ = 0xFA0E;
937
private static final int CJK_COMPAT_USED_LIMIT_ = 0xFA2F + 1;
938
private static final int CJK_A_BASE_ = 0x3400;
939
private static final int CJK_A_LIMIT_ = 0x4DBF + 1;
940
private static final int CJK_B_BASE_ = 0x20000;
941
private static final int CJK_B_LIMIT_ = 0x2A6DF + 1;
942
private static final int NON_CJK_OFFSET_ = 0x110000;
944
private static final boolean DEBUG = ICUDebug.enabled("collator");
946
// private methods ------------------------------------------------------
949
* Reset the iterator internally
951
private void updateInternalState()
953
m_isCodePointHiragana_ = false;
954
m_buffer_.setLength(0);
955
m_bufferOffset_ = -1;
956
m_CEBufferOffset_ = 0;
959
m_FCDStart_ = m_source_.getLength();
960
//m_isHiragana4_ = m_collator_.m_isHiragana4_;
961
m_isForwards_ = true;
965
* Backup the current internal state
966
* @param backup object to store the data
968
private void backupInternalState(Backup backup)
970
backup.m_offset_ = m_source_.getIndex();
971
backup.m_FCDLimit_ = m_FCDLimit_;
972
backup.m_FCDStart_ = m_FCDStart_;
973
backup.m_isCodePointHiragana_ = m_isCodePointHiragana_;
974
backup.m_bufferOffset_ = m_bufferOffset_;
975
backup.m_buffer_.setLength(0);
976
if (m_bufferOffset_ >= 0) {
977
backup.m_buffer_.append(m_buffer_);
982
* Update the iterator internally with backed-up state
983
* @param backup object that stored the data
985
private void updateInternalState(Backup backup)
987
m_source_.setIndex(backup.m_offset_);
988
m_isCodePointHiragana_ = backup.m_isCodePointHiragana_;
989
m_bufferOffset_ = backup.m_bufferOffset_;
990
m_FCDLimit_ = backup.m_FCDLimit_;
991
m_FCDStart_ = backup.m_FCDStart_;
992
m_buffer_.setLength(0);
993
if (m_bufferOffset_ >= 0) {
994
m_buffer_.append(backup.m_buffer_);
999
* A fast combining class retrieval system.
1000
* @param ch UTF16 character
1001
* @return combining class of ch
1003
private int getCombiningClass(int ch)
1005
if (ch >= LEAD_ZERO_COMBINING_CLASS_FAST_LIMIT_ &&
1006
m_collator_.isUnsafe((char)ch) || ch > 0xFFFF
1008
return m_nfcImpl_.getCC(m_nfcImpl_.getNorm16(ch));
1014
* <p>Incremental normalization, this is an essential optimization.
1015
* Assuming FCD checks has been done, normalize the non-FCD characters into
1017
* Source offsets points to the current processing character.
1020
private void normalize()
1022
if (m_unnormalized_ == null) {
1023
m_unnormalized_ = new StringBuilder();
1024
m_n2Buffer_ = new Normalizer2Impl.ReorderingBuffer(m_nfcImpl_, m_buffer_, 10);
1026
m_unnormalized_.setLength(0);
1027
m_n2Buffer_.remove();
1029
int size = m_FCDLimit_ - m_FCDStart_;
1030
m_source_.setIndex(m_FCDStart_);
1031
for (int i = 0; i < size; i ++) {
1032
m_unnormalized_.append((char)m_source_.next());
1034
m_nfcImpl_.decomposeShort(m_unnormalized_, 0, size, m_n2Buffer_);
1038
* <p>Incremental FCD check and normalization. Gets the next base character
1039
* position and determines if the in-between characters needs normalization.
1041
* <p>When entering, the state is known to be this:
1043
* <li>We are working on source string, not the buffer.
1044
* <li>The leading combining class from the current character is 0 or the
1045
* trailing combining class of the previous char was zero.
1047
* Incoming source offsets points to the current processing character.
1048
* Return source offsets points to the current processing character.
1050
* @param ch current character (lead unit)
1051
* @param offset offset of ch +1
1052
* @return true if FCDCheck passes, false otherwise
1054
// NOTE(review): the bare numeric lines in this block, and the absent braces,
// loop header and short statements, appear to be extraction residue; they
// are left untouched here.
// Side effects visible below: m_FCDStart_ and m_FCDLimit_ are overwritten to
// bracket the examined segment, and the source index is finally set to
// m_FCDStart_ + 1, i.e. just after the lead unit passed in as ch.
private boolean FCDCheck(int ch, int offset)
1056
boolean result = true;
1058
// Get the trailing combining class of the current character.
1059
// If it's zero, we are OK.
1060
// offset is one past ch, so the segment under test starts at offset - 1.
m_FCDStart_ = offset - 1;
1061
m_source_.setIndex(offset);
1063
// Start from the FCD data of the lead unit alone; for a high surrogate with
// non-zero data the following unit is read and, when it is a low surrogate,
// the data of the combined code point is used instead.
int fcd = m_nfcImpl_.getFCD16FromSingleLead((char)ch);
1064
if (fcd != 0 && Character.isHighSurrogate((char)ch)) {
1065
int c2 = m_source_.next();
1067
fcd = 0; // end of input
1068
} else if (Character.isLowSurrogate((char)c2)) {
1069
fcd = m_nfcImpl_.getFCD16(Character.toCodePoint((char)ch, (char)c2));
1071
// undo the look-ahead read of c2
m_source_.moveIndex(-1);
1076
// The bits selected by LAST_BYTE_MASK_ are used as the trailing combining class.
int prevTrailCC = fcd & LAST_BYTE_MASK_;
1078
if (prevTrailCC == 0) {
1079
offset = m_source_.getIndex();
1081
// The current char has a non-zero trailing CC. Scan forward until
1082
// we find a char with a leading cc of zero.
1084
ch = m_source_.nextCodePoint();
1086
offset = m_source_.getIndex();
1090
fcd = m_nfcImpl_.getFCD16(ch);
1091
// The bits above SECOND_LAST_BYTE_SHIFT_ are used as the leading combining class.
int leadCC = fcd >> SECOND_LAST_BYTE_SHIFT_;
1093
// this is a base character, we stop the FCD checks
1094
// step back over the code point just read (one or two code units)
offset = m_source_.getIndex() - Character.charCount(ch);
1098
// A leading class lower than the preceding trailing class is out of
// canonical order. NOTE(review): the statement taken in this branch is not
// visible in this extract - presumably it clears result; confirm.
if (leadCC < prevTrailCC) {
1102
prevTrailCC = fcd & LAST_BYTE_MASK_;
1105
m_FCDLimit_ = offset;
1106
m_source_.setIndex(m_FCDStart_ + 1);
1111
* <p>Method tries to fetch the next character that is in fcd form.</p>
1112
* <p>Normalization is done if required.</p>
1113
* <p>Offsets are returned at the next character.</p>
1114
* @return next fcd character
1116
// NOTE(review): the bare numeric lines in this block, and the absent braces,
// declarations and short statements, appear to be extraction residue; they
// are left untouched here.
// Two sources are read below: m_source_ while m_bufferOffset_ is negative,
// otherwise m_buffer_ at position m_bufferOffset_.
private int nextChar()
1120
// loop handles the next character whether it is in the buffer or not.
1121
if (m_bufferOffset_ < 0) {
1122
// we're working on the source and not normalizing. fast path.
1123
// note Thai pre-vowel reordering uses buffer too
1124
result = m_source_.next();
1127
// we are in the buffer, buffer offset will never be 0 here
1128
if (m_bufferOffset_ >= m_buffer_.length()) {
1129
// Null marked end of buffer, revert to the source string and
1130
// loop back to top to try again to get a character.
1131
m_source_.setIndex(m_FCDLimit_);
1132
m_bufferOffset_ = -1;
1133
m_buffer_.setLength(0);
1136
return m_buffer_.charAt(m_bufferOffset_ ++);
1138
// Index just after the character read from the source; this is the value
// handed to FCDCheck as its offset argument further down.
int startoffset = m_source_.getIndex();
1139
if (result < FULL_ZERO_COMBINING_CLASS_FAST_LIMIT_
1140
// Fast fcd safe path. trail combining class == 0.
1141
|| m_collator_.getDecomposition() == Collator.NO_DECOMPOSITION
1142
|| m_bufferOffset_ >= 0 || m_FCDLimit_ >= startoffset) {
1143
// skip the fcd checks
1147
if (result < LEAD_ZERO_COMBINING_CLASS_FAST_LIMIT_) {
1148
// We need to peek at the next character in order to tell if we are
1150
// current() inspects the following unit without advancing the source.
int next = m_source_.current();
1151
if (next == UCharacterIterator.DONE
1152
|| next < LEAD_ZERO_COMBINING_CLASS_FAST_LIMIT_) {
1153
return result; // end of source string and if next character
1154
// starts with a base character is always fcd.
1158
// Need a more complete FCD check and possible normalization.
1159
if (!FCDCheck(result, startoffset)) {
1161
// After a failed check the character is served from the start of m_buffer_
// and the buffer cursor is placed after it.
// NOTE(review): the call that fills m_buffer_ is not visible in this
// extract - presumably normalize(); confirm.
result = m_buffer_.charAt(0);
1162
m_bufferOffset_ = 1;
1168
* <p>Incremental normalization, this is an essential optimization.
1169
* Assuming FCD checks has been done, normalize the non-FCD characters into
1171
* Source offsets points to the current processing character.</p>
1173
private void normalizeBackwards()
1176
m_bufferOffset_ = m_buffer_.length();
1180
* <p>Incremental backwards FCD check and normalization. Gets the previous
1181
* base character position and determines if the in-between characters
1182
* needs normalization.
1184
* <p>When entering, the state is known to be this:
1186
* <li>We are working on source string, not the buffer.
1187
* <li>The trailing combining class from the current character is 0 or the
1188
* leading combining class of the next char was zero.
1190
* Input source offsets points to the previous character.
1191
* Return source offsets points to the current processing character.
1193
* @param ch current character
1194
* @param offset current character offset
1195
* @return true if FCDCheck passes, false otherwise
1197
// NOTE(review): the bare numeric lines in this block, and the absent braces,
// declarations, loop header and short statements, appear to be extraction
// residue; they are left untouched here.
// Side effects visible below: m_FCDLimit_ is set to offset + 1, m_FCDStart_
// to the position where the backward scan stopped, and the source index is
// finally set to m_FCDLimit_.
private boolean FCDCheckBackwards(int ch, int offset)
1200
m_FCDLimit_ = offset + 1;
1201
m_source_.setIndex(offset);
1202
// A non-surrogate unit can be looked up directly.
if (!UTF16.isSurrogate((char)ch)) {
1203
fcd = m_nfcImpl_.getFCD16FromSingleLead((char)ch);
1206
// For a surrogate that is not a lead (i.e. a trail), look at the unit
// before it; when that is a high surrogate the pair is combined and the
// supplementary code point is looked up.
if (!Normalizer2Impl.UTF16Plus.isSurrogateLead(ch)) {
1207
int c2 = m_source_.previous();
1210
} else if (Character.isHighSurrogate((char)c2)) {
1211
ch = Character.toCodePoint((char)c2, (char)ch);
1212
fcd = m_nfcImpl_.getFCD16(ch);
1215
// undo the look-behind read of c2
m_source_.moveIndex(1);
1220
// Scan backward until we find a char with a leading cc of zero.
1221
boolean result = true;
1225
leadCC = fcd >> SECOND_LAST_BYTE_SHIFT_;
1226
// Stop when the current character has a zero leading class or when
// previousCodePoint() returns a negative value.
if (leadCC == 0 || (ch = m_source_.previousCodePoint()) < 0) {
1227
offset = m_source_.getIndex();
1230
fcd = m_nfcImpl_.getFCD16(ch);
1231
int prevTrailCC = fcd & LAST_BYTE_MASK_;
1232
// NOTE(review): the statement taken when the order is violated is not
// visible in this extract - presumably it clears result; confirm.
if (leadCC < prevTrailCC) {
1234
} else if (fcd == 0) {
1235
// skip forward again over the code point just read (one or two code units)
offset = m_source_.getIndex() + Character.charCount(ch);
1241
// storing character with 0 lead fcd or the 1st accent with a base
1242
// character before it
1243
m_FCDStart_ = offset;
1244
m_source_.setIndex(m_FCDLimit_);
1249
* <p>Method tries to fetch the previous character that is in fcd form.</p>
1250
* <p>Normalization is done if required.</p>
1251
* <p>Offsets are returned at the current character.</p>
1252
* @return previous fcd character
1254
// NOTE(review): the bare numeric lines in this block, and the absent braces
// and short statements, appear to be extraction residue; they are left
// untouched here.
private int previousChar()
1256
// Buffer mode: characters are served from m_buffer_.
// NOTE(review): the same test appears twice in a row; a statement between
// the two (line 1257 of the original file) is not visible in this extract -
// presumably a decrement of m_bufferOffset_; confirm.
if (m_bufferOffset_ >= 0) {
1258
if (m_bufferOffset_ >= 0) {
1259
return m_buffer_.charAt(m_bufferOffset_);
1262
// At the start of buffer, route back to string.
1263
m_buffer_.setLength(0);
1264
if (m_FCDStart_ == 0) {
1266
// the buffered segment began at the start of the source: nothing precedes it
m_source_.setIndex(0);
1267
return UCharacterIterator.DONE;
1270
// Continue in the source immediately before the segment that was buffered,
// then retry through the source path.
m_FCDLimit_ = m_FCDStart_;
1271
m_source_.setIndex(m_FCDStart_);
1272
return previousChar();
1276
int result = m_source_.previous();
1277
// index of the character just read
int startoffset = m_source_.getIndex();
1278
if (result < LEAD_ZERO_COMBINING_CLASS_FAST_LIMIT_
1279
|| m_collator_.getDecomposition() == Collator.NO_DECOMPOSITION
1280
|| m_FCDStart_ <= startoffset || m_source_.getIndex() == 0) {
1283
// look at the unit before the one just read
int ch = m_source_.previous();
1284
if (ch < FULL_ZERO_COMBINING_CLASS_FAST_LIMIT_) {
1285
// if previous character is FCD
1289
// Need a more complete FCD check and possible normalization.
1290
if (!FCDCheckBackwards(result, startoffset)) {
1291
normalizeBackwards();
1293
result = m_buffer_.charAt(m_bufferOffset_);
1296
// fcd checks always reset m_source_ to the limit of the FCD
1297
m_source_.setIndex(startoffset);
1303
* Determines if it is at the start of source iteration
1304
* @return true if iterator at the start, false otherwise
1306
private final boolean isBackwardsStart()
1308
return (m_bufferOffset_ < 0 && m_source_.getIndex() == 0)
1309
|| (m_bufferOffset_ == 0 && m_FCDStart_ <= 0);
1313
* Checks if iterator is at the end of its source string.
1314
* @return true if it is at the end, false otherwise
1316
private final boolean isEnd()
1318
if (m_bufferOffset_ >= 0) {
1319
if (m_bufferOffset_ != m_buffer_.length()) {
1323
// at end of buffer. check if fcd is at the end
1324
return m_FCDLimit_ == m_source_.getLength();
1327
return m_source_.getLength() == m_source_.getIndex();
1331
* <p>Special CE management for surrogates</p>
1332
* <p>Lead surrogate is encountered. CE to be retrieved by using the
1333
* following code unit. If next character is a trail surrogate, both
1334
* characters will be combined to retrieve the CE, otherwise completely
1335
* ignorable (UCA specification) is returned.</p>
1336
* @param collator collator to use
1337
* @param ce current CE
1338
* @param trail character
1339
* @return next CE for the surrogate characters
1341
private final int nextSurrogate(RuleBasedCollator collator, int ce,
1344
if (!UTF16.isTrailSurrogate(trail)) {
1345
updateInternalState(m_utilSpecialBackUp_);
1348
// TODO: CE contain the data from the previous CE + the mask.
1349
// It should at least be unmasked
1350
int result = collator.m_trie_.getTrailValue(ce, trail);
1351
if (result == CE_NOT_FOUND_) {
1352
updateInternalState(m_utilSpecialBackUp_);
1358
* Gets the CE expansion offset
1359
* @param collator current collator
1360
* @param ce ce to test
1361
* @return expansion offset
1363
private int getExpansionOffset(RuleBasedCollator collator, int ce)
1365
return ((ce & 0xFFFFF0) >> 4) - collator.m_expansionOffset_;
1370
* Gets the contraction ce offset
1371
* @param collator current collator
1372
* @param ce current ce
1373
* @return contraction offset
1375
private int getContractionOffset(RuleBasedCollator collator, int ce)
1377
return (ce & 0xFFFFFF) - collator.m_contractionOffset_;
1381
* Checks if CE is a special tag CE
1382
* @param ce to check
1383
* @return true if CE is a special tag CE, false otherwise
1385
private boolean isSpecialPrefixTag(int ce)
1387
return RuleBasedCollator.isSpecial(ce) &&
1388
RuleBasedCollator.getTag(ce) == CE_SPEC_PROC_TAG_;
1392
* <p>Special processing getting a CE that is preceded by a certain
1394
* <p>Used for optimizing Japanese length and iteration marks. When a
1395
* special processing tag is encountered, iterate backwards to see if
1396
* there's a match.</p>
1397
* <p>Contraction tables are used, prefix data is stored backwards in the
1399
* @param collator collator to use
1400
* @param ce current ce
1401
* @param entrybackup entry backup iterator status
1402
* @return next collation element
1404
// NOTE(review): the bare numeric lines in this block, and the absent braces,
// loop header, signature tail and short statements, appear to be extraction
// residue; they are left untouched here.
private int nextSpecialPrefix(RuleBasedCollator collator, int ce,
1407
// Remember where forward iteration currently stands, then rewind to the
// state captured on entry so that the prefix is matched by reading backwards
// from there.
backupInternalState(m_utilSpecialBackUp_);
1408
updateInternalState(entrybackup);
1410
// We want to look at the character where we entered
1413
// This loop will run once per source string character, for as
1414
// long as we are matching a potential contraction sequence
1415
// First we position ourselves at the begining of contraction
1417
int entryoffset = getContractionOffset(collator, ce);
1418
int offset = entryoffset;
1419
// Nothing precedes the current position: take the CE stored at the entry
// slot of the table.
if (isBackwardsStart()) {
1420
ce = collator.m_contractionCE_[offset];
1423
char previous = (char)previousChar();
1424
while (previous > collator.m_contractionIndex_[offset]) {
1425
// contraction characters are ordered, skip smaller characters
1429
if (previous == collator.m_contractionIndex_[offset]) {
1430
// Found the source string char in the table.
1431
// Pick up the corresponding CE from the table.
1432
ce = collator.m_contractionCE_[offset];
1435
// Source string char was not in the table, prefix not found
1436
ce = collator.m_contractionCE_[entryoffset];
1439
if (!isSpecialPrefixTag(ce)) {
1440
// The source string char was in the contraction table, and
1441
// the corresponding CE is not a prefix CE. We found the
1442
// prefix, break out of loop, this CE will end up being
1443
// returned. This is the normal way out of prefix handling
1444
// when the source actually contained the prefix.
1448
// On success forward iteration resumes from the state saved at the top of
// this method; on failure the entry state is restored instead.
if (ce != CE_NOT_FOUND_) {
1449
// we found something and we can merilly continue
1450
updateInternalState(m_utilSpecialBackUp_);
1452
else { // prefix search was a failure, we have to backup all the way to
1454
updateInternalState(entrybackup);
1460
* Checks if the ce is a contraction tag
1461
* @param ce ce to check
1462
* @return true if ce is a contraction tag, false otherwise
1464
private boolean isContractionTag(int ce)
1466
return RuleBasedCollator.isSpecial(ce) &&
1467
RuleBasedCollator.getTag(ce) == CE_CONTRACTION_TAG_;
1471
* Method to copy skipped characters into the buffer and sets the fcd
1472
* position. To ensure that the skipped characters are considered later,
1473
* we need to place it in the appropriate position in the buffer and
1474
* reassign the source index. simple case if index reside in string,
1475
* simply copy to buffer and fcdposition = pos, pos = start of buffer.
1476
* if pos in normalization buffer, we'll insert the copy infront of pos
1477
* and point pos to the start of the buffer. why am i doing these copies?
1478
* well, so that the whole chunk of codes in the getNextCE,
1479
* ucol_prv_getSpecialCE does not require any changes, which will be
1481
* @param skipped character buffer
1483
private void setDiscontiguous(StringBuilder skipped)
1485
if (m_bufferOffset_ >= 0) {
1486
m_buffer_.replace(0, m_bufferOffset_, skipped.toString());
1489
m_FCDLimit_ = m_source_.getIndex();
1490
m_buffer_.setLength(0);
1491
m_buffer_.append(skipped.toString());
1494
m_bufferOffset_ = 0;
1498
* Returns the current character for forward iteration
1499
* @return current character
1501
private int currentChar()
1503
if (m_bufferOffset_ < 0) {
1504
m_source_.previous();
1505
return m_source_.next();
1508
// m_bufferOffset_ is never 0 in normal circumstances except after a
1509
// discontiguous contraction since it is always returned and moved
1510
// by 1 when we do nextChar()
1511
return m_buffer_.charAt(m_bufferOffset_ - 1);
1515
* Method to get the discontiguous collation element within the source.
1516
* Note this function will set the position to the appropriate places.
1517
* Passed in character offset points to the second combining character
1518
* after the start character.
1519
* @param collator current collator used
1520
* @param entryoffset index to the start character in the contraction table
1521
* @return discontiguous collation element offset
1523
// NOTE(review): the bare numeric lines in this block, and the absent braces,
// loop header, declarations and short statements, appear to be extraction
// residue; they are left untouched here.
// State touched below: m_utilSkippedBuffer_ collects characters that did not
// match, m_utilSpecialDiscontiguousBackUp_ holds the iterator state to fall
// back to, and setDiscontiguous() re-queues the skipped characters.
private int nextDiscontiguous(RuleBasedCollator collator, int entryoffset)
1525
int offset = entryoffset;
1526
boolean multicontraction = false;
1527
// since it will be stuffed into this iterator and ran over again
1528
if (m_utilSkippedBuffer_ == null) {
1529
m_utilSkippedBuffer_ = new StringBuilder();
1532
m_utilSkippedBuffer_.setLength(0);
1534
// The character most recently read is the first skipped character; its
// combining class is compared against later characters below.
char ch = (char)currentChar();
1535
m_utilSkippedBuffer_.append((char)currentChar());
1536
// accent after the first character
1537
if (m_utilSpecialDiscontiguousBackUp_ == null) {
1538
m_utilSpecialDiscontiguousBackUp_ = new Backup();
1540
backupInternalState(m_utilSpecialDiscontiguousBackUp_);
1544
int ch_int = nextChar();
1545
nextch = (char)ch_int;
1546
// End of input or a character with combining class 0 ends the search.
if (ch_int == UCharacterIterator.DONE
1547
|| getCombiningClass(nextch) == 0) {
1548
// if there are no more accents to move around
1549
// we don't have to shift previousChar, since we are resetting
1551
if (multicontraction) {
1552
if (ch_int != UCharacterIterator.DONE) {
1553
previousChar(); // backtrack
1555
setDiscontiguous(m_utilSkippedBuffer_);
1556
return collator.m_contractionCE_[offset];
1561
offset ++; // skip the combining class offset
1562
// The index table is searched linearly; the bounds test guards against
// running off the end of the table.
while ((offset < collator.m_contractionIndex_.length) &&
1563
(nextch > collator.m_contractionIndex_[offset])) {
1567
int ce = CE_NOT_FOUND_;
1568
if ( offset >= collator.m_contractionIndex_.length) {
1571
// A character that is not in the table, or that has the same combining
// class as ch, cannot take part in the contraction and is skipped.
if ( nextch != collator.m_contractionIndex_[offset]
1572
|| getCombiningClass(nextch) == getCombiningClass(ch)) {
1573
// unmatched or blocked character
1574
if ( (m_utilSkippedBuffer_.length()!= 1) ||
1575
((m_utilSkippedBuffer_.charAt(0)!= nextch) &&
1576
(m_bufferOffset_<0) )) { // avoid push to skipped buffer twice
1577
m_utilSkippedBuffer_.append(nextch);
1579
offset = entryoffset; // Restore the offset before checking next character.
1583
ce = collator.m_contractionCE_[offset];
1586
if (ce == CE_NOT_FOUND_) {
1589
else if (isContractionTag(ce)) {
1590
// this is a multi-contraction
1591
offset = getContractionOffset(collator, ce);
1592
// A usable CE exists at the nested entry: remember the current position as
// the new fall-back point.
if (collator.m_contractionCE_[offset] != CE_NOT_FOUND_) {
1593
multicontraction = true;
1594
backupInternalState(m_utilSpecialDiscontiguousBackUp_);
1598
setDiscontiguous(m_utilSkippedBuffer_);
1603
// No discontiguous match: restore the saved state and answer with the CE of
// the entry slot.
updateInternalState(m_utilSpecialDiscontiguousBackUp_);
1604
// backup is one forward of the base character, we need to move back
1607
return collator.m_contractionCE_[entryoffset];
1611
* Gets the next contraction ce
1612
* @param collator collator to use
1613
* @param ce current ce
1614
* @return ce of the next contraction
1616
// NOTE(review): the bare numeric lines in this block, and the absent braces,
// loop header, declarations and short statements, appear to be extraction
// residue; they are left untouched here.
private int nextContraction(RuleBasedCollator collator, int ce)
1618
// Saved so the source can be rewound if the contraction does not match.
backupInternalState(m_utilSpecialBackUp_);
1619
// CE stored at the entry slot; it is updated further down whenever a
// completed intermediate contraction is stored.
int entryce = collator.m_contractionCE_[getContractionOffset(collator, ce)]; //CE_NOT_FOUND_;
1621
int entryoffset = getContractionOffset(collator, ce);
1622
int offset = entryoffset;
1625
ce = collator.m_contractionCE_[offset];
1626
if (ce == CE_NOT_FOUND_) {
1627
// back up the source over all the chars we scanned going
1628
// into this contraction.
1630
updateInternalState(m_utilSpecialBackUp_);
1635
// The index slot at the entry offset packs two values: the low byte is read
// as the maximum combining class, the high byte as the all-same flag.
// get the discontiguos maximum combining class
1636
int maxCC = (collator.m_contractionIndex_[offset] & 0xFF);
1637
// checks if all characters have the same combining class
1638
byte allSame = (byte)(collator.m_contractionIndex_[offset] >> 8);
1639
char ch = (char)nextChar();
1641
while (ch > collator.m_contractionIndex_[offset]) {
1642
// contraction characters are ordered, skip all smaller
1646
if (ch == collator.m_contractionIndex_[offset]) {
1647
// Found the source string char in the contraction table.
1648
// Pick up the corresponding CE from the table.
1649
ce = collator.m_contractionCE_[offset];
1652
// Source string char was not in contraction table.
1653
// Unless it is a discontiguous contraction, we are done
1655
if(UTF16.isLeadSurrogate(ch)) { // in order to do the proper detection, we
1656
// need to see if we're dealing with a supplementary
1657
miss = UCharacterProperty.getRawSupplementary(ch, (char) nextChar());
1660
// A discontiguous match is ruled out when the table allows no combining
// marks (maxCC == 0), when the unmatched character has class 0 or a class
// above maxCC, or when all table characters share one class and this one
// has exactly that class. NOTE(review): the last operand of this condition
// is not visible in this extract.
if (maxCC == 0 || (sCC = getCombiningClass(miss)) == 0
1661
|| sCC > maxCC || (allSame != 0 && sCC == maxCC) ||
1663
// Contraction can not be discontiguous, back up by one
1668
ce = collator.m_contractionCE_[entryoffset];
1671
// Contraction is possibly discontiguous.
1672
// find the next character if ch is not a base character
1673
int ch_int = nextChar();
1674
if (ch_int != UCharacterIterator.DONE) {
1677
char nextch = (char)ch_int;
1678
if (getCombiningClass(nextch) == 0) {
1683
// base character not part of discontiguous contraction
1684
ce = collator.m_contractionCE_[entryoffset];
1687
ce = nextDiscontiguous(collator, entryoffset);
1692
if (ce == CE_NOT_FOUND_) {
1693
// source did not match the contraction, revert back original
1694
updateInternalState(m_utilSpecialBackUp_);
1699
// source was a contraction
1700
if (!isContractionTag(ce)) {
1704
// ccontinue looping to check for the remaining contraction.
1705
if (collator.m_contractionCE_[entryoffset] != CE_NOT_FOUND_) {
1706
// there are further contractions to be performed, so we store
1707
// the so-far completed ce, so that if we fail in the next
1708
// round we just return this one.
1709
entryce = collator.m_contractionCE_[entryoffset];
1710
backupInternalState(m_utilSpecialBackUp_);
1711
// The saved position is moved back by one: the buffer cursor when the state
// was captured in buffer mode, otherwise the saved source offset.
if (m_utilSpecialBackUp_.m_bufferOffset_ >= 0) {
1712
m_utilSpecialBackUp_.m_bufferOffset_ --;
1715
m_utilSpecialBackUp_.m_offset_ --;
1723
* Gets the next ce for long primaries, stuffs the rest of the collation
1724
* elements into the ce buffer
1725
* @param ce current ce
1728
// NOTE(review): the bare numeric lines in this block, and the absent braces
// and short lines, appear to be extraction residue; they are left untouched
// here.
// Produces two CEs in m_CEBuffer_: slot 0 is returned directly, slot 1 is a
// continuation CE that stays queued (buffer offset 1, size 2).
private int nextLongPrimary(int ce)
1730
// The low byte of the CE becomes the top byte of the continuation CE.
m_CEBuffer_[1] = ((ce & 0xFF) << 24)
1731
| RuleBasedCollator.CE_CONTINUATION_MARKER_;
1732
m_CEBufferOffset_ = 1;
1733
m_CEBufferSize_ = 2;
1734
// Bits 8..23 of the CE are moved into the top 16 bits of the first CE and
// combined with CE_BYTE_COMMON_ in bits 8..15.
// NOTE(review): the final operand of this expression is not visible in this
// extract - presumably the common byte for the lowest 8 bits; confirm.
m_CEBuffer_[0] = ((ce & 0xFFFF00) << 8) | (CE_BYTE_COMMON_ << 8) |
1736
return m_CEBuffer_[0];
1740
* Gets the number of expansion
1741
* @param ce current ce
1742
* @return number of expansion
1744
// NOTE(review): only the signature of this method is visible in this
// extract; the body (and the bare numeric lines around it) appear to have
// been lost or added during extraction. nextExpansion() treats a result of
// 0 as "the expansion is zero-terminated" and describes non-zero results as
// "less than 16 elements", and getExpansionOffset() reads bits 4..23 of the
// same CE - so presumably the count is the low 4 bits of ce; confirm
// against the original file before restoring the body.
private int getExpansionCount(int ce)
1750
* Gets the next expansion ce and stuffs the rest of the collation elements
1751
* into the ce buffer
1752
* @param collator current collator
1753
* @param ce current ce
1754
* @return next expansion ce
1756
private int nextExpansion(RuleBasedCollator collator, int ce)
1758
// NOTE: we can encounter both continuations and expansions in an
1760
// I have to decide where continuations are going to be dealt with
1761
int offset = getExpansionOffset(collator, ce);
1762
m_CEBufferSize_ = getExpansionCount(ce);
1763
m_CEBufferOffset_ = 1;
1764
m_CEBuffer_[0] = collator.m_expansion_[offset];
1765
if (m_CEBufferSize_ != 0) {
1766
// if there are less than 16 elements in expansion
1767
for (int i = 1; i < m_CEBufferSize_; i ++) {
1768
m_CEBuffer_[i] = collator.m_expansion_[offset + i];
1772
// ce are terminated
1773
m_CEBufferSize_ = 1;
1774
while (collator.m_expansion_[offset] != 0) {
1775
m_CEBuffer_[m_CEBufferSize_ ++] =
1776
collator.m_expansion_[++ offset];
1779
// in case of one element expansion, we
1780
// want to immediately return CEpos
1781
if (m_CEBufferSize_ == 1) {
1782
m_CEBufferSize_ = 0;
1783
m_CEBufferOffset_ = 0;
1785
return m_CEBuffer_[0];
1789
* Gets the next digit ce
1790
* @param collator current collator
1791
* @param ce current collation element
1792
* @param cp current codepoint
1793
* @return next digit ce
1795
// NOTE(review): the bare numeric lines in this block, and the absent braces,
// loop header, declarations (e.g. digIndx, collateVal) and short statements,
// appear to be extraction residue; they are left untouched here.
// Overview of what is visible below: in numeric mode a run of digits is
// packed, two decimal digits per char, into m_utilStringBuffer_ (slots 0 and
// 1 are header slots filled in at the end) and then emitted as a first CE
// plus continuation CEs in m_CEBuffer_. Outside numeric mode the CE stored
// in the expansion table is returned.
private int nextDigit(RuleBasedCollator collator, int ce, int cp)
1797
// We do a check to see if we want to collate digits as numbers;
1798
// if so we generate a custom collation key. Otherwise we pull out
1799
// the value stored in the expansion table.
1801
// Note: the flag is read from m_collator_, not from the collator argument.
if (m_collator_.m_isNumericCollation_){
1803
int trailingZeroIndex = 0;
1804
boolean nonZeroValReached = false;
1806
// I just need a temporary place to store my generated CEs.
1807
// icu4c uses a unsigned byte array, i'll use a stringbuffer here
1808
// to avoid dealing with the sign problems and array allocation
1809
// clear and set initial string buffer length
1810
m_utilStringBuffer_.setLength(3);
1812
// We parse the source string until we hit a char that's NOT a
1814
// Use this u_charDigitValue. This might be slow because we have
1815
// to handle surrogates...
1816
int digVal = UCharacter.digit(cp);
1817
// if we have arrived here, we have already processed possible
1818
// supplementaries that trigered the digit tag -
1819
// all supplementaries are marked in the UCA.
1820
// We pad a zero in front of the first element anyways.
1821
// This takes care of the (probably) most common case where
1822
// people are sorting things followed by a single digit
1825
// Make sure we have enough space.
1826
if (digIndx >= ((m_utilStringBuffer_.length() - 2) << 1)) {
1827
m_utilStringBuffer_.setLength(m_utilStringBuffer_.length()
1830
// Skipping over leading zeroes.
1831
if (digVal != 0 || nonZeroValReached) {
1832
if (digVal != 0 && !nonZeroValReached) {
1833
nonZeroValReached = true;
1835
// We parse the digit string into base 100 numbers
1836
// (this fits into a byte).
1837
// We only add to the buffer in twos, thus if we are
1838
// parsing an odd character, that serves as the
1839
// 'tens' digit while the if we are parsing an even
1840
// one, that is the 'ones' digit. We dumped the
1841
// parsed base 100 value (collateVal) into a buffer.
1842
// We multiply each collateVal by 2 (to give us room)
1843
// and add 5 (to avoid overlapping magic CE byte
1844
// values). The last byte we subtract 1 to ensure it is
1845
// less than all the other bytes.
1846
// NOTE(review): the comment above says "add 5", but the stores below add 6
// and the front-patch loop later subtracts 6 - confirm which is intended.
if (digIndx % 2 == 1) {
1847
collateVal += digVal;
1848
// This removes trailing zeroes.
1849
if (collateVal == 0 && trailingZeroIndex == 0) {
1850
trailingZeroIndex = ((digIndx - 1) >>> 1) + 2;
1852
else if (trailingZeroIndex != 0) {
1853
trailingZeroIndex = 0;
1855
m_utilStringBuffer_.setCharAt(
1856
((digIndx - 1) >>> 1) + 2,
1857
(char)((collateVal << 1) + 6));
1861
// We drop the collation value into the buffer so if
1862
// we need to do a "front patch" we don't have to
1863
// check to see if we're hitting the last element.
1864
collateVal = digVal * 10;
1865
m_utilStringBuffer_.setCharAt((digIndx >>> 1) + 2,
1866
(char)((collateVal << 1) + 6));
1871
// Get next character.
1873
// The state is saved before each look-ahead so that a non-digit can be
// given back via updateInternalState() below.
backupInternalState(m_utilSpecialBackUp_);
1874
int char32 = nextChar();
1875
char ch = (char)char32;
1876
if (UTF16.isLeadSurrogate(ch)){
1878
char trail = (char)nextChar();
1879
if (UTF16.isTrailSurrogate(trail)) {
1880
char32 = UCharacterProperty.getRawSupplementary(
1889
digVal = UCharacter.digit(char32);
1891
// Resetting position to point to the next unprocessed
1892
// char. We overshot it when doing our test/set for
1894
updateInternalState(m_utilSpecialBackUp_);
1903
// Only zeroes were seen: slot 2 receives the encoding of the value 0.
if (nonZeroValReached == false){
1905
m_utilStringBuffer_.setCharAt(2, (char)6);
1908
// One past the last buffer slot that takes part in the key; trailing
// all-zero pairs are cut off when trailingZeroIndex was recorded.
int endIndex = trailingZeroIndex != 0 ? trailingZeroIndex
1909
: (digIndx >>> 1) + 2;
1910
if (digIndx % 2 != 0){
1911
// We missed a value. Since digIndx isn't even, stuck too many
1912
// values into the buffer (this is what we get for padding the
1913
// first byte with a zero). "Front-patch" now by pushing all
1915
// Doing it this way ensures that at least 50% of the time
1916
// (statistically speaking) we'll only be doing a single pass
1917
// and optimizes for strings with single digits. I'm just
1918
// assuming that's the more common case.
1919
for (int i = 2; i < endIndex; i ++){
1920
m_utilStringBuffer_.setCharAt(i,
1921
(char)((((((m_utilStringBuffer_.charAt(i) - 6) >>> 1)
1923
+ (((m_utilStringBuffer_.charAt(i + 1) - 6)
1924
>>> 1) / 10) << 1) + 6));
1929
// Subtract one off of the last byte.
1930
m_utilStringBuffer_.setCharAt(endIndex - 1,
1931
(char)(m_utilStringBuffer_.charAt(endIndex - 1) - 1));
1933
// We want to skip over the first two slots in the buffer.
1934
// The first slot is reserved for the header byte CODAN_PLACEHOLDER.
1935
// The second slot is for the sign/exponent byte:
1936
// 0x80 + (decimalPos/2) & 7f.
1937
m_utilStringBuffer_.setCharAt(0, (char)RuleBasedCollator.CODAN_PLACEHOLDER);
1938
m_utilStringBuffer_.setCharAt(1,
1939
(char)(0x80 + ((digIndx >>> 1) & 0x7F)));
1941
// Now transfer the collation key to our collIterate struct.
1942
// The total size for our collation key is endIndx bumped up to the next largest even value divided by two.
1943
// First CE: slots 0 and 1 form the primary weight, with the common byte as
// secondary and tertiary weight.
ce = (((m_utilStringBuffer_.charAt(0) << 8)
1945
| m_utilStringBuffer_.charAt(1))
1946
<< RuleBasedCollator.CE_PRIMARY_SHIFT_)
1948
| (RuleBasedCollator.BYTE_COMMON_
1949
<< RuleBasedCollator.CE_SECONDARY_SHIFT_)
1950
| RuleBasedCollator.BYTE_COMMON_; // Tertiary weight.
1951
int i = 2; // Reset the index into the buffer.
1953
m_CEBuffer_[0] = ce;
1954
m_CEBufferSize_ = 1;
1955
m_CEBufferOffset_ = 1;
1956
// Remaining slots are emitted as continuation CEs, with the buffer chars
// shifted into the primary weight.
while (i < endIndex)
1958
int primWeight = m_utilStringBuffer_.charAt(i ++) << 8;
1960
primWeight |= m_utilStringBuffer_.charAt(i ++);
1962
m_CEBuffer_[m_CEBufferSize_ ++]
1963
= (primWeight << RuleBasedCollator.CE_PRIMARY_SHIFT_)
1964
| RuleBasedCollator.CE_CONTINUATION_MARKER_;
1969
// no numeric mode, we'll just switch to whatever we stashed and
1971
// find the offset to expansion table
1972
return collator.m_expansion_[getExpansionOffset(collator, ce)];
1976
* Gets the next implicit ce for codepoints
1977
* @param codepoint current codepoint
1978
* @return implicit ce
1980
// NOTE(review): the bare numeric lines in this block, and the absent braces
// and short lines, appear to be extraction residue; they are left untouched
// here.
// Produces two CEs in m_CEBuffer_: slot 0 is returned directly, slot 1 stays
// queued (buffer offset 1, size 2).
private int nextImplicit(int codepoint)
1982
// NOTE(review): the statement taken for an illegal code point is not visible
// in this extract; per the comments below it yields a completely ignorable
// result.
if (!UCharacter.isLegal(codepoint)) {
1983
// synwee to check with vladimir on the range of isNonChar()
1984
// illegal code value, use completely ignoreable!
1987
int result = RuleBasedCollator.impCEGen_.getImplicitFromCodePoint(codepoint);
1988
// First CE: the primary-weight part of the generated implicit value.
// NOTE(review): the operand OR-ed in on the following (absent) line is not
// visible in this extract.
m_CEBuffer_[0] = (result & RuleBasedCollator.CE_PRIMARY_MASK_)
1990
// Second CE: the low 16 bits of the generated value moved into the top half,
// with 0xC0 in the lowest byte.
m_CEBuffer_[1] = ((result & 0x0000FFFF) << 16) | 0x000000C0;
1991
m_CEBufferOffset_ = 1;
1992
m_CEBufferSize_ = 2;
1993
return m_CEBuffer_[0];
1997
* Returns the next ce associated with the following surrogate characters
1998
* @param ch current character
2001
private int nextSurrogate(char ch)
2003
int ch_int = nextChar();
2004
char nextch = (char)ch_int;
2005
if (ch_int != CharacterIterator.DONE &&
2006
UTF16.isTrailSurrogate(nextch)) {
2007
int codepoint = UCharacterProperty.getRawSupplementary(ch, nextch);
2008
return nextImplicit(codepoint);
2010
if (nextch != CharacterIterator.DONE) {
2011
previousChar(); // reverts back to the original position
2013
return IGNORABLE; // completely ignorable
2017
* Returns the next ce for a hangul character, this is an implicit
2019
* @param collator current collator
2020
* @param ch current character
2023
// NOTE(review): the bare numeric lines in this block, and the absent braces,
// else branch header and short statements, appear to be extraction residue;
// they are left untouched here.
// Two paths are visible below: when the collator does not treat Jamo
// specially, the CEs of the three parts are looked up and queued in
// m_CEBuffer_; otherwise the parts are appended to the normalization buffer
// m_buffer_ so that they are read as ordinary characters.
private int nextHangul(RuleBasedCollator collator, char ch)
2025
// index of the syllable relative to HANGUL_SBASE_
char L = (char)(ch - HANGUL_SBASE_);
2027
// divide into pieces
2028
// do it in this order since some compilers can do % and / in one
2030
char T = (char)(L % HANGUL_TCOUNT_);
2031
L /= HANGUL_TCOUNT_;
2032
char V = (char)(L % HANGUL_VCOUNT_);
2033
L /= HANGUL_VCOUNT_;
2040
// NOTE(review): T is compared with HANGUL_TBASE_ below, which implies the
// three indices are rebased to Jamo code units between the division above
// and this point; those statements are not visible in this extract.
// return the first CE, but first put the rest into the expansion
2042
m_CEBufferSize_ = 0;
2043
if (!collator.m_isJamoSpecial_) { // FAST PATH
2044
m_CEBuffer_[m_CEBufferSize_ ++] =
2045
collator.m_trie_.getLeadValue(L);
2046
m_CEBuffer_[m_CEBufferSize_ ++] =
2047
collator.m_trie_.getLeadValue(V);
2049
// the third part is skipped when T equals HANGUL_TBASE_
if (T != HANGUL_TBASE_) {
2050
m_CEBuffer_[m_CEBufferSize_ ++] =
2051
collator.m_trie_.getLeadValue(T);
2053
// slot 0 is returned now; the remaining slots stay queued
m_CEBufferOffset_ = 1;
2054
return m_CEBuffer_[0];
2058
// Since Hanguls pass the FCD check, it is guaranteed that we
2059
// won't be in the normalization buffer if something like this
2061
// Move Jamos into normalization buffer
2062
m_buffer_.append(L);
2063
m_buffer_.append(V);
2064
if (T != HANGUL_TBASE_) {
2065
m_buffer_.append(T);
2067
// The buffered segment stands for the single source unit just read: it ends
// at the current source index and starts one unit before it.
m_FCDLimit_ = m_source_.getIndex();
2068
m_FCDStart_ = m_FCDLimit_ - 1;
2069
// Indicate where to continue in main input string after
2070
// exhausting the buffer
2076
* <p>Special CE management. Expansions, contractions etc...</p>
2077
* @param collator can be plain UCA
2078
* @param ce current ce
2079
* @param ch current character
2080
* @return next special ce
2082
// NOTE(review): the bare numeric lines in this block, and the absent braces,
// loop header, declarations (e.g. codepoint), break statements and some case
// labels, appear to be extraction residue; they are left untouched here.
// Dispatches on the tag of a special CE. Branches that assign to ce may
// yield another special CE; branches that return end the processing.
private int nextSpecial(RuleBasedCollator collator, int ce, char ch)
2085
// A cached Backup object is taken out of the field while this call is
// running and put back at the end (see the last statement); when the field
// is empty a fresh Backup is allocated.
Backup entrybackup = m_utilSpecialEntryBackUp_;
2086
// this is to handle recursive looping
2087
if (entrybackup != null) {
2088
m_utilSpecialEntryBackUp_ = null;
2091
entrybackup = new Backup();
2093
backupInternalState(entrybackup);
2094
try { // forces it to assign m_utilSpecialEntryBackup_
2096
// This loop will repeat only in the case of contractions,
2098
switch(RuleBasedCollator.getTag(ce)) {
2099
case CE_NOT_FOUND_TAG_:
2100
// impossible case for icu4j
2102
case RuleBasedCollator.CE_SURROGATE_TAG_:
2106
// state saved here is what nextSurrogate() restores on a failed pair lookup
backupInternalState(m_utilSpecialBackUp_);
2107
char trail = (char)nextChar();
2108
ce = nextSurrogate(collator, ce, trail);
2109
// calculate the supplementary code point value,
2110
// if surrogate was not tailored we go one more round
2112
UCharacterProperty.getRawSupplementary(ch, trail);
2114
case CE_SPEC_PROC_TAG_:
2115
ce = nextSpecialPrefix(collator, ce, entrybackup);
2117
case CE_CONTRACTION_TAG_:
2118
ce = nextContraction(collator, ce);
2120
case CE_LONG_PRIMARY_TAG_:
2121
return nextLongPrimary(ce);
2122
case CE_EXPANSION_TAG_:
2123
return nextExpansion(collator, ce);
2125
// NOTE(review): the case label for this branch is not visible in this
// extract - presumably the digit tag; confirm.
ce = nextDigit(collator, ce, codepoint);
2127
// various implicits optimization
2128
case CE_CJK_IMPLICIT_TAG_:
2129
// 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D
2130
return nextImplicit(codepoint);
2131
case CE_IMPLICIT_TAG_: // everything that is not defined
2132
return nextImplicit(codepoint);
2133
case CE_TRAIL_SURROGATE_TAG_:
2134
return IGNORABLE; // DC00-DFFF broken surrogate
2135
case CE_LEAD_SURROGATE_TAG_: // D800-DBFF
2136
return nextSurrogate(ch);
2137
case CE_HANGUL_SYLLABLE_TAG_: // AC00-D7AF
2138
return nextHangul(collator, ch);
2139
case CE_CHARSET_TAG_:
2140
// not yet implemented probably after 1.8
2141
return CE_NOT_FOUND_;
2144
// synwee todo, throw exception or something here.
2146
// a CE that is no longer special is final
if (!RuleBasedCollator.isSpecial(ce)) {
2152
// hand the Backup object back to the cache field for the next call
m_utilSpecialEntryBackUp_ = entrybackup;
2158
* Special processing is getting a CE that is preceded by a certain prefix.
2159
* Currently this is only needed for optimizing Japanese length and
2160
* iteration marks. When we encouter a special processing tag, we go
2161
* backwards and try to see if we have a match. Contraction tables are used
2162
* - so the whole process is not unlike contraction. prefix data is stored
2163
* backwards in the table.
2164
* @param collator current collator
2165
* @param ce current ce
2166
* @return previous ce
2168
// Backward-iteration lookup for a prefix-tagged CE: walks the preceding
// characters via previousChar() and compares them with the collator's
// m_contractionIndex_ entries, picking CEs out of m_contractionCE_.
private int previousSpecialPrefix(RuleBasedCollator collator, int ce)
2170
// snapshot the iterator position; the same snapshot object is passed to
// updateInternalState() after the lookup (presumably restoring it)
backupInternalState(m_utilSpecialBackUp_);
2172
// position ourselves at the beginning of contraction sequence
2173
int offset = getContractionOffset(collator, ce);
2174
// keep the table entry point: its CE is the fallback used when the
// character is not found in the table (see "prefix not found" below)
int entryoffset = offset;
2175
if (isBackwardsStart()) {
2176
ce = collator.m_contractionCE_[offset];
2179
char prevch = (char)previousChar();
2180
while (prevch > collator.m_contractionIndex_[offset]) {
2181
// since contraction codepoints are ordered, we skip all that
2185
if (prevch == collator.m_contractionIndex_[offset]) {
2186
ce = collator.m_contractionCE_[offset];
2189
// if there is a completely ignorable code point in the middle
2190
// of a prefix, we need to act as if it's not there. Assumption:
2191
// 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to
2193
// lone surrogates cannot be set to zero as it would break
2195
// CE the trie stores for prevch; a value of 0 is what the checks
// below treat as "completely ignorable"
int isZeroCE = collator.m_trie_.getLeadValue(prevch);
2196
// it's easy for BMP code points
2197
if (isZeroCE == 0) {
2200
else if (UTF16.isTrailSurrogate(prevch)
2201
|| UTF16.isLeadSurrogate(prevch)) {
2202
// for supplementary code points, we have to check the next one
2203
// situations where we are going to ignore
2204
// 1. beginning of the string: schar is a lone surrogate
2205
// 2. schar is a lone surrogate
2206
// 3. schar is a trail surrogate in a valid surrogate
2207
// sequence that is explicitly set to zero.
2208
if (!isBackwardsStart()) {
2209
char lead = (char)previousChar();
2210
if (UTF16.isLeadSurrogate(lead)) {
2211
// re-read the CE for the lead surrogate; only a surrogate-tagged
// value is resolved further through the trail lookup
isZeroCE = collator.m_trie_.getLeadValue(lead);
2212
if (RuleBasedCollator.getTag(isZeroCE)
2213
== RuleBasedCollator.CE_SURROGATE_TAG_) {
2214
int finalCE = collator.m_trie_.getTrailValue(
2218
// this is a real, assigned completely
2219
// ignorable code point
2225
nextChar(); // revert to original offset
2226
// lone surrogate, completely ignorable
2229
nextChar(); // revert to original offset
2232
// lone surrogate at the beginning, completely ignorable
2237
// char was not in the table. prefix not found
2238
ce = collator.m_contractionCE_[entryoffset];
2241
if (!isSpecialPrefixTag(ce)) {
2242
// char was in the contraction table, and the corresponding ce
2243
// is not a prefix ce. We found the prefix, break out of loop,
2244
// this ce will end up being returned.
2248
updateInternalState(m_utilSpecialBackUp_);
2253
* Retrieves the previous contraction ce. To ensure that the backwards and
2254
* forwards iteration matches, we take the current region of most possible
2255
* match and pass it through the forward iteration. This will ensure that
2256
* the obstinate problem of overlapping contractions will not occur.
2257
* @param collator current collator
2258
* @param ce current ce
2259
* @param ch current character
2260
* @return previous contraction ce
2262
// Backward-iteration handling of a contraction: gathers characters into
// m_utilStringBuffer_ while collator.isUnsafe(ch) holds, runs a forward
// CollationElementIterator over that text, copies the resulting CEs into
// m_CEBuffer_ and returns the last one.
private int previousContraction(RuleBasedCollator collator, int ce, char ch)
2264
m_utilStringBuffer_.setLength(0);
2265
// since we might encounter normalized characters (from the thai
2266
// processing) we can't use peekCharacter() here.
2267
char prevch = (char)previousChar();
2268
boolean atStart = false;
2269
// TODO: address the comment above - maybe now we *can* use peekCharacter
2270
//while (collator.isUnsafe(ch) || isThaiPreVowel(prevch)) {
2271
while (collator.isUnsafe(ch)) {
2272
// prepend, so the buffer ends up in source order although the text is
// being read backwards
m_utilStringBuffer_.insert(0, ch);
2274
if (isBackwardsStart()) {
2278
prevch = (char)previousChar();
2281
// undo the previousChar() if we didn't reach the beginning
2284
// adds the initial base character to the string
2285
m_utilStringBuffer_.insert(0, ch);
2287
// a new collation element iterator is used to simplify things, since
2288
// using the current collation element iterator will mean that the
2289
// forward and backwards iteration will share and change the same
2290
// buffers. it is going to be painful.
2291
// the collator's decomposition mode is switched off for the helper
// iterator and set back to originaldecomp after the CEs are collected
// NOTE(review): confirm the mode is also restored on every early exit
int originaldecomp = collator.getDecomposition();
2292
// for faster access, since string would have been normalized above
2293
collator.setDecomposition(Collator.NO_DECOMPOSITION);
2294
// the helper iterator is created once and reused on later calls
if (m_utilColEIter_ == null) {
2295
m_utilColEIter_ = new CollationElementIterator(
2296
m_utilStringBuffer_.toString(),
2300
m_utilColEIter_.m_collator_ = collator;
2301
m_utilColEIter_.setText(m_utilStringBuffer_.toString());
2303
ce = m_utilColEIter_.next();
2304
m_CEBufferSize_ = 0;
2305
// drain the helper iterator into m_CEBuffer_ until it reports NULLORDER
while (ce != NULLORDER) {
2306
if (m_CEBufferSize_ == m_CEBuffer_.length) {
2308
// increasing cebuffer size
2309
// the buffer is full: grow it by 50 slots, keeping the existing CEs
int tempbuffer[] = new int[m_CEBuffer_.length + 50];
2310
System.arraycopy(m_CEBuffer_, 0, tempbuffer, 0,
2311
m_CEBuffer_.length);
2312
m_CEBuffer_ = tempbuffer;
2314
catch( MissingResourceException e)
2318
catch (Exception e) {
2320
// dumps the stack trace to stderr
e.printStackTrace();
2325
m_CEBuffer_[m_CEBufferSize_ ++] = ce;
2326
ce = m_utilColEIter_.next();
2328
collator.setDecomposition(originaldecomp);
2329
// point at the last buffered CE and return it
m_CEBufferOffset_ = m_CEBufferSize_ - 1;
2330
return m_CEBuffer_[m_CEBufferOffset_];
2334
* Returns the previous long primary ces
2335
* @param ce long primary ce
2336
* @return previous long primary ces
2338
// Splits a long-primary CE into two buffered CEs and returns the second
// (last) one, leaving m_CEBufferOffset_ pointing at it.
private int previousLongPrimary(int ce)
2340
m_CEBufferSize_ = 0;
2341
// first CE: bits 8..23 of ce moved up into bits 16..31, with
// CE_BYTE_COMMON_ OR-ed in at bit 8 and at bit 0
m_CEBuffer_[m_CEBufferSize_ ++] =
2342
((ce & 0xFFFF00) << 8) | (CE_BYTE_COMMON_ << 8) | CE_BYTE_COMMON_;
2343
// second CE: low byte of ce moved into the top byte and flagged with
// the continuation marker
m_CEBuffer_[m_CEBufferSize_ ++] = ((ce & 0xFF) << 24)
2344
| RuleBasedCollator.CE_CONTINUATION_MARKER_;
2345
// point at the last buffered CE and return it
m_CEBufferOffset_ = m_CEBufferSize_ - 1;
2346
return m_CEBuffer_[m_CEBufferOffset_];
2350
* Returns the previous expansion ces
2351
* @param collator current collator
2352
* @param ce current ce
2353
* @return previous expansion ce
2355
// Copies the expansion CEs for ce from collator.m_expansion_ into
// m_CEBuffer_ and returns the last one.
// NOTE(review): no check against m_CEBuffer_.length is visible in either
// copy loop - confirm the buffer is always large enough.
private int previousExpansion(RuleBasedCollator collator, int ce)
2357
// find the offset to expansion table
2358
int offset = getExpansionOffset(collator, ce);
2359
// a non-zero count means the length is encoded in ce itself; zero means
// the list in the table is terminated by a 0 entry
m_CEBufferSize_ = getExpansionCount(ce);
2360
if (m_CEBufferSize_ != 0) {
2361
// less than 16 elements in expansion
2362
for (int i = 0; i < m_CEBufferSize_; i ++) {
2363
m_CEBuffer_[i] = collator.m_expansion_[offset + i];
2368
// null terminated ces
2369
while (collator.m_expansion_[offset + m_CEBufferSize_] != 0) {
2370
m_CEBuffer_[m_CEBufferSize_] =
2371
collator.m_expansion_[offset + m_CEBufferSize_];
2375
// point at the last buffered CE and return it
m_CEBufferOffset_ = m_CEBufferSize_ - 1;
2376
return m_CEBuffer_[m_CEBufferOffset_];
2380
* Getting the digit collation elements
2382
* @param ce current collation element
2383
* @param ch current code point
2384
* @return digit collation element
2386
// Backward-iteration handling of a digit CE. With numeric collation enabled
// the digits are read backwards with previousChar(), packed two digits per
// char into m_utilStringBuffer_, and turned into CEs in m_CEBuffer_;
// otherwise the first expansion-table entry for ce is returned.
private int previousDigit(RuleBasedCollator collator, int ce, char ch)
2388
// We do a check to see if we want to collate digits as numbers; if so we generate
2389
// a custom collation key. Otherwise we pull out the value stored in the expansion table.
2390
// NOTE(review): this reads the m_collator_ field while the rest of the
// method uses the collator parameter - confirm they are always the same object
if (m_collator_.m_isNumericCollation_){
2391
int leadingZeroIndex = 0;
2393
boolean nonZeroValReached = false;
2395
// clear and set initial string buffer length
2396
// slots 0 and 1 are filled in later (header and sign/exponent bytes);
// slot 2 is the first digit-pair slot
m_utilStringBuffer_.setLength(3);
2398
// We parse the source string until we hit a char that's NOT a digit
2399
// Use this u_charDigitValue. This might be slow because we have to
2400
// handle surrogates...
2402
// the starting char may be the trail half of a supplementary digit:
// look one char further back for its lead surrogate
if (UTF16.isTrailSurrogate(ch)) {
2403
if (!isBackwardsStart()){
2404
char lead = (char)previousChar();
2405
if (UTF16.isLeadSurrogate(lead)) {
2406
char32 = UCharacterProperty.getRawSupplementary(lead,
2414
int digVal = UCharacter.digit(char32);
2417
// Make sure we have enough space.
2418
if (digIndx >= ((m_utilStringBuffer_.length() - 2) << 1)) {
2419
m_utilStringBuffer_.setLength(m_utilStringBuffer_.length()
2422
// Skipping over "trailing" zeroes but we still add to digIndx.
2423
if (digVal != 0 || nonZeroValReached) {
2424
if (digVal != 0 && !nonZeroValReached) {
2425
nonZeroValReached = true;
2428
// We parse the digit string into base 100 numbers (this
2429
// fits into a byte).
2430
// We only add to the buffer in twos, thus if we are
2431
// parsing an odd character, that serves as the 'tens'
2432
// digit, while if we are parsing an even one, that is
2433
// the 'ones' digit. We dumped the parsed base 100 value
2434
// (collateVal) into a buffer. We multiply each collateVal
2435
// by 2 (to give us room) and add 5 (to avoid overlapping
2436
// magic CE byte values). The last byte we subtract 1 to
2437
// ensure it is less than all the other bytes.
2438
// Since we're doing this in reverse we want to put the
2439
// first digit encountered into the ones place and the
2440
// second digit encountered into the tens place.
2442
// NOTE(review): the code below adds 6, not 5 as the comment above
// states - confirm which one is intended
if (digIndx % 2 == 1){
2443
collateVal += digVal * 10;
2445
// This removes leading zeroes.
2446
if (collateVal == 0 && leadingZeroIndex == 0) {
2447
leadingZeroIndex = ((digIndx - 1) >>> 1) + 2;
2449
else if (leadingZeroIndex != 0) {
2450
leadingZeroIndex = 0;
2453
m_utilStringBuffer_.setCharAt(((digIndx - 1) >>> 1) + 2,
2454
(char)((collateVal << 1) + 6));
2458
collateVal = digVal;
2463
if (!isBackwardsStart()){
2464
// snapshot the position before reading the next candidate digit;
// the same snapshot is passed to updateInternalState() below
backupInternalState(m_utilSpecialBackUp_);
2465
char32 = previousChar();
2466
// NOTE(review): this tests the method argument ch, not the char32
// just read by previousChar() - confirm this is intended
if (UTF16.isTrailSurrogate(ch)){
2467
if (!isBackwardsStart()) {
2468
char lead = (char)previousChar();
2469
if (UTF16.isLeadSurrogate(lead)) {
2471
= UCharacterProperty.getRawSupplementary(
2475
updateInternalState(m_utilSpecialBackUp_);
2480
digVal = UCharacter.digit(char32);
2482
updateInternalState(m_utilSpecialBackUp_);
2491
if (nonZeroValReached == false) {
2493
// only zeros were seen: store the encoding of the value 0,
// i.e. (0 << 1) + 6
m_utilStringBuffer_.setCharAt(2, (char)6);
2496
if (digIndx % 2 != 0) {
2497
if (collateVal == 0 && leadingZeroIndex == 0) {
2498
// This removes the leading 0 in a odd number sequence of
2499
// numbers e.g. avery001
2500
leadingZeroIndex = ((digIndx - 1) >>> 1) + 2;
2503
// this is not a leading 0, we add it in
2504
m_utilStringBuffer_.setCharAt((digIndx >>> 1) + 2,
2505
(char)((collateVal << 1) + 6));
2510
// one past the last buffer slot that carries a significant digit pair
int endIndex = leadingZeroIndex != 0 ? leadingZeroIndex
2511
: ((digIndx >>> 1) + 2) ;
2512
digIndx = ((endIndex - 2) << 1) + 1; // removing initial zeros
2513
// Subtract one off of the last byte.
2514
// Really the first byte here, but it's reversed...
2515
m_utilStringBuffer_.setCharAt(2,
2516
(char)(m_utilStringBuffer_.charAt(2) - 1));
2517
// We want to skip over the first two slots in the buffer.
2518
// The first slot is reserved for the header byte CODAN_PLACEHOLDER.
2519
// The second slot is for the sign/exponent byte:
2520
// 0x80 + (decimalPos/2) & 7f.
2521
m_utilStringBuffer_.setCharAt(0, (char)RuleBasedCollator.CODAN_PLACEHOLDER);
2522
m_utilStringBuffer_.setCharAt(1,
2523
(char)(0x80 + ((digIndx >>> 1) & 0x7F)));
2525
// Now transfer the collation key to our collIterate struct.
2526
// The total size for our collation key is endIndx bumped up to the
2527
// next largest even value divided by two.
2528
m_CEBufferSize_ = 0;
2529
// first CE: header and sign/exponent chars form the primary weight,
// BYTE_COMMON_ fills the secondary-shifted and the lowest position
m_CEBuffer_[m_CEBufferSize_ ++]
2530
= (((m_utilStringBuffer_.charAt(0) << 8)
2532
| m_utilStringBuffer_.charAt(1))
2533
<< RuleBasedCollator.CE_PRIMARY_SHIFT_)
2535
| (RuleBasedCollator.BYTE_COMMON_
2536
<< RuleBasedCollator.CE_SECONDARY_SHIFT_)
2538
| RuleBasedCollator.BYTE_COMMON_;
2539
int i = endIndex - 1; // Reset the index into the buffer.
2541
// pack two buffer chars into one primary weight, walking the buffer
// downwards from endIndex - 1
int primWeight = m_utilStringBuffer_.charAt(i --) << 8;
2543
primWeight |= m_utilStringBuffer_.charAt(i --);
2545
m_CEBuffer_[m_CEBufferSize_ ++]
2546
= (primWeight << RuleBasedCollator.CE_PRIMARY_SHIFT_)
2547
| RuleBasedCollator.CE_CONTINUATION_MARKER_;
2549
// point at the last buffered CE and return it
m_CEBufferOffset_ = m_CEBufferSize_ - 1;
2550
return m_CEBuffer_[m_CEBufferOffset_];
2553
// non-numeric path: the CE stored at the expansion offset for ce
return collator.m_expansion_[getExpansionOffset(collator, ce)];
2558
* Returns previous hangul ces
2559
* @param collator current collator
2560
* @param ch current character
2561
* @return previous hangul ce
2563
// Backward-iteration handling of a precomposed Hangul syllable: splits ch
// arithmetically into L, V and T parts, then either buffers the trie CEs of
// the parts or appends the parts to m_buffer_.
private int previousHangul(RuleBasedCollator collator, char ch)
2565
// index of the syllable relative to HANGUL_SBASE_
char L = (char)(ch - HANGUL_SBASE_);
2566
// we do it in this order since some compilers can do % and / in one
2568
char T = (char)(L % HANGUL_TCOUNT_);
2569
L /= HANGUL_TCOUNT_;
2570
char V = (char)(L % HANGUL_VCOUNT_);
2571
L /= HANGUL_VCOUNT_;
2578
m_CEBufferSize_ = 0;
2579
// jamo need no special handling: take each part's CE straight from the
// trie and return the last one buffered
if (!collator.m_isJamoSpecial_) {
2580
m_CEBuffer_[m_CEBufferSize_ ++] =
2581
collator.m_trie_.getLeadValue(L);
2582
m_CEBuffer_[m_CEBufferSize_ ++] =
2583
collator.m_trie_.getLeadValue(V);
2584
// a T equal to HANGUL_TBASE_ is skipped, i.e. treated as "no trailing part"
if (T != HANGUL_TBASE_) {
2585
m_CEBuffer_[m_CEBufferSize_ ++] =
2586
collator.m_trie_.getLeadValue(T);
2588
m_CEBufferOffset_ = m_CEBufferSize_ - 1;
2589
return m_CEBuffer_[m_CEBufferOffset_];
2592
// Since Hanguls pass the FCD check, it is guaranteed that we won't
2593
// be in the normalization buffer if something like this happens
2594
// Move Jamos into normalization buffer
2595
m_buffer_.append(L);
2596
m_buffer_.append(V);
2597
if (T != HANGUL_TBASE_) {
2598
m_buffer_.append(T);
2601
// mark a one-char window in the source that starts at the current index
m_FCDStart_ = m_source_.getIndex();
2602
m_FCDLimit_ = m_FCDStart_ + 1;
2608
* Gets implicit codepoint ces
2609
* @param codepoint current codepoint
2610
* @return implicit codepoint ces
2612
// Builds the two implicit CEs for codepoint in m_CEBuffer_ and returns the
// second one; illegal code points yield IGNORABLE.
private int previousImplicit(int codepoint)
2614
if (!UCharacter.isLegal(codepoint)) {
2615
return IGNORABLE; // illegal code value, completely ignorable!
2617
int result = RuleBasedCollator.impCEGen_.getImplicitFromCodePoint(codepoint);
2618
// exactly two CEs are buffered; the offset points at the second (index 1)
m_CEBufferSize_ = 2;
2619
m_CEBufferOffset_ = 1;
2620
m_CEBuffer_[0] = (result & RuleBasedCollator.CE_PRIMARY_MASK_)
2622
// the low 16 bits of the implicit weight move to the top half; 0xC0 is
// presumably the continuation marker - TODO confirm
m_CEBuffer_[1] = ((result & 0x0000FFFF) << 16) | 0x000000C0;
2623
return m_CEBuffer_[1];
2627
* Gets the previous surrogate ce
2628
* @param ch current character
2629
* @return previous surrogate ce
2631
// Handles a trail surrogate met while iterating backwards: when the
// preceding char is a lead surrogate the pair is combined into one code
// point and resolved through previousImplicit(); otherwise the method ends
// with IGNORABLE.
private int previousSurrogate(char ch)
2633
if (isBackwardsStart()) {
2634
// we are at the start of the string, wrong place to be at
2637
char prevch = (char)previousChar();
2638
// Handles Han and Supplementary characters here.
2639
if (UTF16.isLeadSurrogate(prevch)) {
2640
return previousImplicit(
2641
UCharacterProperty.getRawSupplementary(prevch, ch));
2643
// something other than the CharacterIterator.DONE sentinel was read
if (prevch != CharacterIterator.DONE) {
2646
return IGNORABLE; // completely ignorable
2650
* <p>Special CE management. Expansions, contractions etc...</p>
2651
* @param collator can be plain UCA
2652
* @param ce current ce
2653
* @param ch current character
2654
* @return previous special ce
2656
// Dispatches on the tag of a special CE while iterating backwards and
// delegates to the matching previousXxx() helper.
private int previousSpecial(RuleBasedCollator collator, int ce, char ch)
2659
// the only ces that loops are thai, special prefix and
2661
switch (RuleBasedCollator.getTag(ce)) {
2662
case CE_NOT_FOUND_TAG_: // this tag always returns
2664
case RuleBasedCollator.CE_SURROGATE_TAG_:
2665
// essentially a disengaged lead surrogate. a broken
2666
// sequence was encountered and this is an error
2668
case CE_SPEC_PROC_TAG_:
2669
// ce is replaced by the prefix lookup result rather than returned here
ce = previousSpecialPrefix(collator, ce);
2671
case CE_CONTRACTION_TAG_:
2672
// may loop for first character e.g. "0x0f71" for english
2673
if (isBackwardsStart()) {
2674
// start of string or this is not the end of any contraction
2675
ce = collator.m_contractionCE_[
2676
getContractionOffset(collator, ce)];
2679
return previousContraction(collator, ce, ch); // else
2680
case CE_LONG_PRIMARY_TAG_:
2681
return previousLongPrimary(ce);
2682
case CE_EXPANSION_TAG_: // always returns
2683
return previousExpansion(collator, ce);
2685
// ce is replaced by the digit handling result rather than returned here
ce = previousDigit(collator, ce, ch);
2687
case CE_HANGUL_SYLLABLE_TAG_: // AC00-D7AF
2688
return previousHangul(collator, ch);
2689
case CE_LEAD_SURROGATE_TAG_: // D800-DBFF
2690
return IGNORABLE; // broken surrogate sequence
2691
case CE_TRAIL_SURROGATE_TAG_: // DC00-DFFF
2692
return previousSurrogate(ch);
2693
case CE_CJK_IMPLICIT_TAG_:
2694
// 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D
2695
// ch is a char, so it is widened to previousImplicit's int parameter
return previousImplicit(ch);
2696
case CE_IMPLICIT_TAG_: // everything that is not defined
2697
// UCA is filled with these. Tailorings are NOT_FOUND
2698
return previousImplicit(ch);
2699
case CE_CHARSET_TAG_: // this tag always returns
2700
return CE_NOT_FOUND_;
2701
default: // this tag always returns
2704
// checks whether the (possibly replaced) ce is no longer a special CE
if (!RuleBasedCollator.isSpecial(ce)) {
2712
* GET IMPLICIT PRIMARY WEIGHTS
2713
* @param cp codepoint
2714
* @param value is left justified primary key
2716
// private static final int getImplicitPrimary(int cp)
2718
// cp = swapCJK(cp);
2720
// //if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp));
2721
// // we now have a range of numbers from 0 to 21FFFF.
2722
// // we must skip all 00, 01, 02 bytes, so most bytes have 253 values
2723
// // we must leave a gap of 01 between all values of the last byte, so
2724
// // the last byte has 126 values (3 byte case)
2725
// // we shift so that HAN all has the same first primary, for
2727
// // for the 4 byte case, we make the gap as large as we can fit.
2728
// // Three byte forms are EC xx xx, ED xx xx, EE xx xx (with a gap of 1)
2729
// // Four byte forms (most supplementaries) are EF xx xx xx (with a gap
2730
// // of LAST2_MULTIPLIER == 14)
2732
// int last0 = cp - RuleBasedCollator.IMPLICIT_4BYTE_BOUNDARY_;
2734
// int last1 = cp / RuleBasedCollator.LAST_COUNT_;
2735
// last0 = cp % RuleBasedCollator.LAST_COUNT_;
2737
// int last2 = last1 / RuleBasedCollator.OTHER_COUNT_;
2738
// last1 %= RuleBasedCollator.OTHER_COUNT_;
2739
// return RuleBasedCollator.IMPLICIT_BASE_3BYTE_ + (last2 << 24)
2741
// + ((last0 * RuleBasedCollator.LAST_MULTIPLIER_) << 8);
2744
// int last1 = last0 / RuleBasedCollator.LAST_COUNT2_;
2745
// last0 %= RuleBasedCollator.LAST_COUNT2_;
2747
// int last2 = last1 / RuleBasedCollator.OTHER_COUNT_;
2748
// last1 %= RuleBasedCollator.OTHER_COUNT_;
2750
// int last3 = last2 / RuleBasedCollator.OTHER_COUNT_;
2751
// last2 %= RuleBasedCollator.OTHER_COUNT_;
2752
// return RuleBasedCollator.IMPLICIT_BASE_4BYTE_ + (last3 << 24)
2753
// + (last2 << 16) + (last1 << 8)
2754
// + (last0 * RuleBasedCollator.LAST2_MULTIPLIER_);
2759
// * Swapping CJK characters for implicit ces
2760
// * @param cp codepoint CJK
2761
// * @return swapped result
2763
// private static final int swapCJK(int cp)
2765
// if (cp >= CJK_BASE_) {
2766
// if (cp < CJK_LIMIT_) {
2767
// return cp - CJK_BASE_;
2769
// if (cp < CJK_COMPAT_USED_BASE_) {
2770
// return cp + NON_CJK_OFFSET_;
2772
// if (cp < CJK_COMPAT_USED_LIMIT_) {
2773
// return cp - CJK_COMPAT_USED_BASE_ + (CJK_LIMIT_ - CJK_BASE_);
2775
// if (cp < CJK_B_BASE_) {
2776
// return cp + NON_CJK_OFFSET_;
2778
// if (cp < CJK_B_LIMIT_) {
2779
// return cp; // non-BMP-CJK
2781
// return cp + NON_CJK_OFFSET_; // non-CJK
2783
// if (cp < CJK_A_BASE_) {
2784
// return cp + NON_CJK_OFFSET_;
2786
// if (cp < CJK_A_LIMIT_) {
2787
// return cp - CJK_A_BASE_ + (CJK_LIMIT_ - CJK_BASE_)
2788
// + (CJK_COMPAT_USED_LIMIT_ - CJK_COMPAT_USED_BASE_);
2790
// return cp + NON_CJK_OFFSET_; // non-CJK
2794
// * Gets a character from the source string at a given offset.
2795
// * Handles both normal and iterative cases.
2796
// * No error checking and does not access the normalization buffer
2797
// * - caller beware!
2798
// * @param offset offset from current position which character is to be
2800
// * @return character at current position + offset
2802
// private char peekCharacter(int offset)
2804
// if (offset != 0) {
2805
// int currentoffset = m_source_.getIndex();
2806
// m_source_.setIndex(currentoffset + offset);
2807
// char result = (char)m_source_.current();
2808
// m_source_.setIndex(currentoffset);
2812
// return (char)m_source_.current();
2817
* Moves back 1 position in the source string. This is slightly less
2818
* complicated than previousChar in that it doesn't normalize while
2819
* moving back. Boundary checks are not performed.
2820
* This method is to be used with caution, with the assumption that
2821
* moving back one position will not exceed the source limits.
2822
* Use only with nextChar() and never call this API twice in a row without
2823
* nextChar() in the middle.
2825
// Steps one position back without normalizing.
private void goBackOne()
2827
// m_bufferOffset_ >= 0 presumably means the iterator is positioned inside
// the normalization buffer - TODO confirm
if (m_bufferOffset_ >= 0) {
2831
// move the source index back by one
m_source_.setIndex(m_source_.getIndex() - 1);
2836
* Moves forward 1 position in the source string. This is slightly less
2837
* complicated than nextChar in that it doesn't normalize while
2838
* moving forward. Boundary checks are not performed.
2839
* This method is to be used with caution, with the assumption that
2840
* moving forward one position will not exceed the source limits.
2841
* Use only with previousChar() and never call this API twice in a row
2842
* without previousChar() in the middle.
2844
private void goForwardOne()
2846
if (m_bufferOffset_ < 0) {
2847
// we're working on the source and not normalizing. fast path.
2848
// note Thai pre-vowel reordering uses buffer too
2849
m_source_.setIndex(m_source_.getIndex() + 1);
2852
// we are in the buffer, buffer offset will never be 0 here