2
*******************************************************************************
3
* Copyright (C) 1996-2010, International Business Machines Corporation and *
4
* others. All Rights Reserved. *
5
*******************************************************************************
7
package com.ibm.icu.text;
9
import java.text.ParseException;
10
import java.util.Arrays;
11
import java.util.Hashtable;
13
import com.ibm.icu.impl.UCharacterProperty;
14
import com.ibm.icu.lang.UCharacter;
17
* Class for parsing collation rules, produces a list of tokens that will be
18
* turned into collation elements
19
* @author Syn Wee Quek
20
* @since release 2.2, June 7 2002
22
final class CollationRuleParser
24
// public data members ---------------------------------------------------
26
// package private constructors ------------------------------------------
29
* <p>RuleBasedCollator constructor that takes the rules.
30
* Please see RuleBasedCollator class description for more details on the
31
* collation rule syntax.</p>
32
* @see java.util.Locale
33
* @param rules the collation rules to build the collation table from.
34
* @exception ParseException thrown when argument rules have an invalid
37
// Builds a parser over the given rule string. Only state is initialised here;
// tokens are not assembled until assembleTokenList() is called by the client.
CollationRuleParser(String rules) throws ParseException
39
// NOTE(review): presumably populates m_copySet_ / m_removeSet_ from the raw
// rules before normalization; the helper is not visible here - confirm.
extractSetsFromRules(rules);
40
// Working buffer: the decomposed, trimmed rules. It is a StringBuilder
// because later parsing appends extra characters to it.
m_source_ = new StringBuilder(Normalizer.decompose(rules, false).trim());
41
// Immutable snapshot of the normalized rules as they are at construction.
m_rules_ = m_source_.toString();
43
// Extra characters start right after the end of the normalized rules.
m_extraCurrent_ = m_source_.length();
44
m_variableTop_ = null;
45
m_parsedToken_ = new ParsedToken();
46
m_hashTable_ = new Hashtable<Token, Token>();
47
// Options start out as a copy of the UCA collator's settings.
m_options_ = new OptionSet(RuleBasedCollator.UCA_);
48
// Fixed capacity of 512 token list headers.
m_listHeader_ = new TokenListHeader[512];
50
m_prevStrength_ = TOKEN_UNSET_;
51
// call assembleTokenList() manually, so that we can
52
// init a parser and manually parse tokens
53
//assembleTokenList();
56
// package private inner classes -----------------------------------------
59
* Collation options set
61
static class OptionSet
63
// package private constructor ---------------------------------------
66
* Initializes the option set with the argument collators
67
* @param collator option to use
69
// Snapshots the argument collator's current attribute values into this
// option set. Some values are read through accessors and others directly
// from the collator's fields.
OptionSet(RuleBasedCollator collator)
71
m_variableTopValue_ = collator.m_variableTopValue_;
72
m_isFrenchCollation_ = collator.isFrenchCollation();
73
m_isAlternateHandlingShifted_
74
= collator.isAlternateHandlingShifted();
75
m_caseFirst_ = collator.m_caseFirst_;
76
m_isCaseLevel_ = collator.isCaseLevel();
77
m_decomposition_ = collator.getDecomposition();
78
m_strength_ = collator.getStrength();
79
m_isHiragana4_ = collator.m_isHiragana4_;
82
// package private data members --------------------------------------
84
int m_variableTopValue_;
85
boolean m_isFrenchCollation_;
87
* Attribute for handling variable elements
89
boolean m_isAlternateHandlingShifted_;
91
* who goes first, lower case or uppercase
95
* do we have an extra case level
97
boolean m_isCaseLevel_;
99
* attribute for normalization
101
int m_decomposition_;
103
* attribute for strength
107
* attribute for special Hiragana
109
boolean m_isHiragana4_;
113
* List of tokens used by the collation rules
115
static class TokenListHeader
126
int m_previousContCE_;
127
int m_pos_[] = new int[Collator.IDENTICAL + 1];
128
int m_gapsLo_[] = new int[3 * (Collator.TERTIARY + 1)];
129
int m_gapsHi_[] = new int[3 * (Collator.TERTIARY + 1)];
130
int m_numStr_[] = new int[3 * (Collator.TERTIARY + 1)];
131
Token m_fStrToken_[] = new Token[Collator.TERTIARY + 1];
132
Token m_lStrToken_[] = new Token[Collator.TERTIARY + 1];
136
* Token wrapper for collation rules
140
// package private data members ---------------------------------------
151
int m_polarity_; // 1 for <, <<, <<<, , ; and 0 for >, >>, >>>
152
TokenListHeader m_listHeader_;
155
StringBuilder m_rules_;
158
// package private constructors ---------------------------------------
162
m_CE_ = new int[128];
163
m_expCE_ = new int[128];
164
// TODO: this should also handle reverse
165
m_polarity_ = TOKEN_POLARITY_POSITIVE_;
172
// package private methods --------------------------------------------
175
* Hashcode calculation for token
176
* @return the hashcode
178
// Hashes the characters of m_rules_ that this token refers to.
// m_source_ packs the token as (length << 24) | offset.
public int hashCode()
181
// Top 8 bits: token length. Unsigned shift keeps it in 0..255.
int len = (m_source_ & 0xFF000000) >>> 24;
182
// Integer division truncates toward zero, so inc is 1 for lengths 1..63
// and grows by one for each further 32 characters.
// NOTE(review): presumably the step by which start advances; the
// increment statement is not visible in this view - confirm.
int inc = ((len - 32) / 32) + 1;
184
// Low 24 bits: offset of the token's first character in m_rules_.
int start = m_source_ & 0x00FFFFFF;
185
int limit = start + len;
187
while (start < limit) {
188
// Multiplicative hash with factor 37 over the visited characters.
result = (result * 37) + m_rules_.charAt(start);
196
* @param target object to compare
197
* @return true if target is the same as this object
199
public boolean equals(Object target)
201
if (target == this) {
204
if (target instanceof Token) {
205
Token t = (Token)target;
206
int sstart = m_source_ & 0x00FFFFFF;
207
int tstart = t.m_source_ & 0x00FFFFFF;
208
int slimit = (m_source_ & 0xFF000000) >> 24;
209
int tlimit = (m_source_ & 0xFF000000) >> 24;
211
int end = sstart + slimit - 1;
213
if (m_source_ == 0 || t.m_source_ == 0) {
216
if (slimit != tlimit) {
219
if (m_source_ == t.m_source_) {
224
&& m_rules_.charAt(sstart) == t.m_rules_.charAt(tstart))
229
if (m_rules_.charAt(sstart) == t.m_rules_.charAt(tstart)) {
237
// package private data member -------------------------------------------
240
* Indicator that the token is a reset, i.e. & in the rules
242
static final int TOKEN_RESET_ = 0xDEADBEEF;
245
* Size of the number of tokens
249
* List of parsed tokens
251
TokenListHeader m_listHeader_[];
255
Token m_variableTop_;
259
OptionSet m_options_;
261
* Normalized collation rules with some extra characters
263
StringBuilder m_source_;
265
* Hash table to keep all tokens
267
Hashtable<Token, Token> m_hashTable_;
269
// package private method ------------------------------------------------
271
// Copies every value held in m_options_ into the matching m_default*
// field of the argument collator.
void setDefaultOptionsInCollator(RuleBasedCollator collator)
273
collator.m_defaultStrength_ = m_options_.m_strength_;
274
collator.m_defaultDecomposition_ = m_options_.m_decomposition_;
275
collator.m_defaultIsFrenchCollation_ = m_options_.m_isFrenchCollation_;
276
collator.m_defaultIsAlternateHandlingShifted_
277
= m_options_.m_isAlternateHandlingShifted_;
278
collator.m_defaultIsCaseLevel_ = m_options_.m_isCaseLevel_;
279
collator.m_defaultCaseFirst_ = m_options_.m_caseFirst_;
280
collator.m_defaultIsHiragana4_ = m_options_.m_isHiragana4_;
281
collator.m_defaultVariableTopValue_ = m_options_.m_variableTopValue_;
284
// private inner classes -------------------------------------------------
287
* This is a token that has been parsed but not yet processed. Used to
288
* reduce the number of arguments in the parser
290
private static class ParsedToken
292
// private constructor ----------------------------------------------
302
m_extensionOffset_ = 0;
306
m_strength_ = TOKEN_UNSET_;
309
// private data members ---------------------------------------------
314
int m_extensionOffset_;
319
char m_indirectIndex_;
325
private static class IndirectBoundaries
327
// package private constructor ---------------------------------------
329
// Builds a boundary from two-element CE arrays: index 0 is the CE and
// index 1 its continuation CE. limitce may be null, in which case the
// limit fields are not assigned here.
IndirectBoundaries(int startce[], int limitce[])
331
// Set values for the top - TODO: once we have values for all the
332
// indirects, we are going to initialize here.
333
m_startCE_ = startce[0];
334
m_startContCE_ = startce[1];
335
if (limitce != null) {
336
m_limitCE_ = limitce[0];
337
m_limitContCE_ = limitce[1];
345
// package private data members --------------------------------------
354
* Collation option rule tag
356
private static class TokenOption
358
// package private constructor ---------------------------------------
360
// Describes one rule option tag: its name, the collator attribute it maps
// to, and the parallel arrays of accepted sub-option names and the
// attribute values they stand for. The arrays are stored by reference,
// not copied.
TokenOption(String name, int attribute, String suboptions[],
361
int suboptionattributevalue[])
364
m_attribute_ = attribute;
365
m_subOptions_ = suboptions;
366
m_subOptionAttributeValues_ = suboptionattributevalue;
369
// package private data member ---------------------------------------
371
private String m_name_;
372
private int m_attribute_;
373
private String m_subOptions_[];
374
private int m_subOptionAttributeValues_[];
377
// private variables -----------------------------------------------------
380
* Current parsed token
382
private ParsedToken m_parsedToken_;
386
private String m_rules_;
387
private int m_current_;
389
* End of the option while reading.
390
* Need it for UnicodeSet reading support.
392
private int m_optionEnd_;
394
* Current offset in m_source
396
//private int m_sourceLimit_;
398
* Offset to m_source_ for the extra expansion characters
400
private int m_extraCurrent_;
403
* UnicodeSet that contains code points to be copied from the UCA
405
UnicodeSet m_copySet_;
408
* UnicodeSet that contains code points for which we want to remove
409
* UCA contractions. It implies copying of these code points from
412
UnicodeSet m_removeSet_;
414
* Stores the previous token's strength when making a list of same level
417
private int m_prevStrength_;
420
* This is space for the extra strings that need to be unquoted during the
421
* parsing of the rules
423
//private static final int TOKEN_EXTRA_RULE_SPACE_SIZE_ = 2048;
425
* Indicator that the token is not set yet
427
private static final int TOKEN_UNSET_ = 0xFFFFFFFF;
429
* Indicator that the rule is in the > polarity, ie everything on the
430
* right of the rule is less than
432
//private static final int TOKEN_POLARITY_NEGATIVE_ = 0;
434
* Indicator that the rule is in the < polarity, ie everything on the
435
* right of the rule is greater than
437
private static final int TOKEN_POLARITY_POSITIVE_ = 1;
439
* Flag mask to determine if top is set
441
private static final int TOKEN_TOP_MASK_ = 0x04;
443
* Flag mask to determine if variable top is set
445
private static final int TOKEN_VARIABLE_TOP_MASK_ = 0x08;
447
* Flag mask to determine if a before attribute is set
449
private static final int TOKEN_BEFORE_ = 0x03;
451
* For use in parsing token options
453
private static final int TOKEN_SUCCESS_MASK_ = 0x10;
456
* These values are used for finding CE values for indirect positioning.
457
* Indirect positioning is a mechanism for allowing resets on symbolic
458
* values. It only works for resets and you cannot tailor indirect names.
459
* An indirect name can define either an anchor point or a range. An anchor
460
* point behaves in exactly the same way as a code point in reset would,
461
* except that it cannot be tailored. A range (we currently only know of
462
* the [top] range) will explicitly set the upper bound for generated CEs,
463
* thus allowing for better control over how many CEs can be squeezed
464
* between in the range without performance penalty. In that respect, we use
465
* [top] for tailoring of locales that use CJK characters. Other indirect
466
* values are currently a pure convenience, they can be used to assure that
467
* the CEs will be always positioned in the same place relative to a point
468
* with known properties (e.g. first primary ignorable).
470
private static final IndirectBoundaries INDIRECT_BOUNDARIES_[];
473
// * Inverse UCA constants
475
// private static final int INVERSE_SIZE_MASK_ = 0xFFF00000;
476
// private static final int INVERSE_OFFSET_MASK_ = 0x000FFFFF;
477
// private static final int INVERSE_SHIFT_VALUE_ = 20;
480
* Collation option tags
481
* [last variable] last variable value
482
* [last primary ignorable] largest CE for primary ignorable
483
* [last secondary ignorable] largest CE for secondary ignorable
484
* [last tertiary ignorable] largest CE for tertiary ignorable
485
* [top] guaranteed to be above all implicit CEs, for now and in the future (in 1.8)
487
private static final TokenOption RULES_OPTIONS_[];
491
INDIRECT_BOUNDARIES_ = new IndirectBoundaries[15];
492
// UCOL_RESET_TOP_VALUE
493
INDIRECT_BOUNDARIES_[0] = new IndirectBoundaries(
494
RuleBasedCollator.UCA_CONSTANTS_.LAST_NON_VARIABLE_,
495
RuleBasedCollator.UCA_CONSTANTS_.FIRST_IMPLICIT_);
496
// UCOL_FIRST_PRIMARY_IGNORABLE
497
INDIRECT_BOUNDARIES_[1] = new IndirectBoundaries(
498
RuleBasedCollator.UCA_CONSTANTS_.FIRST_PRIMARY_IGNORABLE_,
500
// UCOL_LAST_PRIMARY_IGNORABLE
501
INDIRECT_BOUNDARIES_[2] = new IndirectBoundaries(
502
RuleBasedCollator.UCA_CONSTANTS_.LAST_PRIMARY_IGNORABLE_,
505
// UCOL_FIRST_SECONDARY_IGNORABLE
506
INDIRECT_BOUNDARIES_[3] = new IndirectBoundaries(
507
RuleBasedCollator.UCA_CONSTANTS_.FIRST_SECONDARY_IGNORABLE_,
509
// UCOL_LAST_SECONDARY_IGNORABLE
510
INDIRECT_BOUNDARIES_[4] = new IndirectBoundaries(
511
RuleBasedCollator.UCA_CONSTANTS_.LAST_SECONDARY_IGNORABLE_,
513
// UCOL_FIRST_TERTIARY_IGNORABLE
514
INDIRECT_BOUNDARIES_[5] = new IndirectBoundaries(
515
RuleBasedCollator.UCA_CONSTANTS_.FIRST_TERTIARY_IGNORABLE_,
517
// UCOL_LAST_TERTIARY_IGNORABLE
518
INDIRECT_BOUNDARIES_[6] = new IndirectBoundaries(
519
RuleBasedCollator.UCA_CONSTANTS_.LAST_TERTIARY_IGNORABLE_,
521
// UCOL_FIRST_VARIABLE;
522
INDIRECT_BOUNDARIES_[7] = new IndirectBoundaries(
523
RuleBasedCollator.UCA_CONSTANTS_.FIRST_VARIABLE_,
525
// UCOL_LAST_VARIABLE
526
INDIRECT_BOUNDARIES_[8] = new IndirectBoundaries(
527
RuleBasedCollator.UCA_CONSTANTS_.LAST_VARIABLE_,
529
// UCOL_FIRST_NON_VARIABLE
530
INDIRECT_BOUNDARIES_[9] = new IndirectBoundaries(
531
RuleBasedCollator.UCA_CONSTANTS_.FIRST_NON_VARIABLE_,
533
// UCOL_LAST_NON_VARIABLE
534
INDIRECT_BOUNDARIES_[10] = new IndirectBoundaries(
535
RuleBasedCollator.UCA_CONSTANTS_.LAST_NON_VARIABLE_,
536
RuleBasedCollator.UCA_CONSTANTS_.FIRST_IMPLICIT_);
537
// UCOL_FIRST_IMPLICIT
538
INDIRECT_BOUNDARIES_[11] = new IndirectBoundaries(
539
RuleBasedCollator.UCA_CONSTANTS_.FIRST_IMPLICIT_,
541
// UCOL_LAST_IMPLICIT
542
INDIRECT_BOUNDARIES_[12] = new IndirectBoundaries(
543
RuleBasedCollator.UCA_CONSTANTS_.LAST_IMPLICIT_,
544
RuleBasedCollator.UCA_CONSTANTS_.FIRST_TRAILING_);
545
// UCOL_FIRST_TRAILING
546
INDIRECT_BOUNDARIES_[13] = new IndirectBoundaries(
547
RuleBasedCollator.UCA_CONSTANTS_.FIRST_TRAILING_,
549
// UCOL_LAST_TRAILING
550
INDIRECT_BOUNDARIES_[14] = new IndirectBoundaries(
551
RuleBasedCollator.UCA_CONSTANTS_.LAST_TRAILING_,
553
INDIRECT_BOUNDARIES_[14].m_limitCE_
554
= RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_SPECIAL_MIN_ << 24;
556
RULES_OPTIONS_ = new TokenOption[19];
557
String option[] = {"non-ignorable", "shifted"};
558
int value[] = {RuleBasedCollator.AttributeValue.NON_IGNORABLE_,
559
RuleBasedCollator.AttributeValue.SHIFTED_};
560
RULES_OPTIONS_[0] = new TokenOption("alternate",
561
RuleBasedCollator.Attribute.ALTERNATE_HANDLING_,
563
option = new String[1];
566
value[0] = RuleBasedCollator.AttributeValue.ON_;
567
RULES_OPTIONS_[1] = new TokenOption("backwards",
568
RuleBasedCollator.Attribute.FRENCH_COLLATION_,
570
String offonoption[] = new String[2];
571
offonoption[0] = "off";
572
offonoption[1] = "on";
573
int offonvalue[] = new int[2];
574
offonvalue[0] = RuleBasedCollator.AttributeValue.OFF_;
575
offonvalue[1] = RuleBasedCollator.AttributeValue.ON_;
576
RULES_OPTIONS_[2] = new TokenOption("caseLevel",
577
RuleBasedCollator.Attribute.CASE_LEVEL_,
578
offonoption, offonvalue);
579
option = new String[3];
584
value[0] = RuleBasedCollator.AttributeValue.LOWER_FIRST_;
585
value[1] = RuleBasedCollator.AttributeValue.UPPER_FIRST_;
586
value[2] = RuleBasedCollator.AttributeValue.OFF_;
587
RULES_OPTIONS_[3] = new TokenOption("caseFirst",
588
RuleBasedCollator.Attribute.CASE_FIRST_,
590
RULES_OPTIONS_[4] = new TokenOption("normalization",
591
RuleBasedCollator.Attribute.NORMALIZATION_MODE_,
592
offonoption, offonvalue);
593
RULES_OPTIONS_[5] = new TokenOption("hiraganaQ",
594
RuleBasedCollator.Attribute.HIRAGANA_QUATERNARY_MODE_,
595
offonoption, offonvalue);
596
option = new String[5];
603
value[0] = RuleBasedCollator.AttributeValue.PRIMARY_;
604
value[1] = RuleBasedCollator.AttributeValue.SECONDARY_;
605
value[2] = RuleBasedCollator.AttributeValue.TERTIARY_;
606
value[3] = RuleBasedCollator.AttributeValue.QUATERNARY_;
607
value[4] = RuleBasedCollator.AttributeValue.IDENTICAL_;
608
RULES_OPTIONS_[6] = new TokenOption("strength",
609
RuleBasedCollator.Attribute.STRENGTH_,
611
RULES_OPTIONS_[7] = new TokenOption("variable top",
612
RuleBasedCollator.Attribute.LIMIT_,
614
RULES_OPTIONS_[8] = new TokenOption("rearrange",
615
RuleBasedCollator.Attribute.LIMIT_,
617
option = new String[3];
622
value[0] = RuleBasedCollator.AttributeValue.PRIMARY_;
623
value[1] = RuleBasedCollator.AttributeValue.SECONDARY_;
624
value[2] = RuleBasedCollator.AttributeValue.TERTIARY_;
625
RULES_OPTIONS_[9] = new TokenOption("before",
626
RuleBasedCollator.Attribute.LIMIT_,
628
RULES_OPTIONS_[10] = new TokenOption("top",
629
RuleBasedCollator.Attribute.LIMIT_,
631
String firstlastoption[] = new String[7];
632
firstlastoption[0] = "primary";
633
firstlastoption[1] = "secondary";
634
firstlastoption[2] = "tertiary";
635
firstlastoption[3] = "variable";
636
firstlastoption[4] = "regular";
637
firstlastoption[5] = "implicit";
638
firstlastoption[6] = "trailing";
640
int firstlastvalue[] = new int[7];
641
Arrays.fill(firstlastvalue, RuleBasedCollator.AttributeValue.PRIMARY_);
643
RULES_OPTIONS_[11] = new TokenOption("first",
644
RuleBasedCollator.Attribute.LIMIT_,
645
firstlastoption, firstlastvalue);
646
RULES_OPTIONS_[12] = new TokenOption("last",
647
RuleBasedCollator.Attribute.LIMIT_,
648
firstlastoption, firstlastvalue);
649
RULES_OPTIONS_[13] = new TokenOption("optimize",
650
RuleBasedCollator.Attribute.LIMIT_,
652
RULES_OPTIONS_[14] = new TokenOption("suppressContractions",
653
RuleBasedCollator.Attribute.LIMIT_,
655
RULES_OPTIONS_[15] = new TokenOption("undefined",
656
RuleBasedCollator.Attribute.LIMIT_,
658
RULES_OPTIONS_[16] = new TokenOption("scriptOrder",
659
RuleBasedCollator.Attribute.LIMIT_,
661
RULES_OPTIONS_[17] = new TokenOption("charsetname",
662
RuleBasedCollator.Attribute.LIMIT_,
664
RULES_OPTIONS_[18] = new TokenOption("charset",
665
RuleBasedCollator.Attribute.LIMIT_,
670
* Utility data members
672
private Token m_utilToken_ = new Token();
673
private CollationElementIterator m_UCAColEIter_
674
= RuleBasedCollator.UCA_.getCollationElementIterator("");
675
private int m_utilCEBuffer_[] = new int[2];
677
// private methods -------------------------------------------------------
680
* Assembles the token list
681
* @exception ParseException thrown when rules syntax fails
683
int assembleTokenList() throws ParseException
685
Token lastToken = null;
686
m_parsedToken_.m_strength_ = TOKEN_UNSET_;
687
int sourcelimit = m_source_.length();
690
while (m_current_ < sourcelimit) {
691
m_parsedToken_.m_prefixOffset_ = 0;
692
if (parseNextToken(lastToken == null) < 0) {
693
// we have reached the end
696
char specs = m_parsedToken_.m_flags_;
697
boolean variableTop = ((specs & TOKEN_VARIABLE_TOP_MASK_) != 0);
698
boolean top = ((specs & TOKEN_TOP_MASK_) != 0);
699
int lastStrength = TOKEN_UNSET_;
700
if (lastToken != null) {
701
lastStrength = lastToken.m_strength_;
703
m_utilToken_.m_source_ = m_parsedToken_.m_charsLen_ << 24
704
| m_parsedToken_.m_charsOffset_;
705
m_utilToken_.m_rules_ = m_source_;
706
// 4 Lookup each source in the CharsToToken map, and find a
708
Token sourceToken = m_hashTable_.get(m_utilToken_);
709
if (m_parsedToken_.m_strength_ != TOKEN_RESET_) {
710
if (lastToken == null) {
711
// this means that rules haven't started properly
712
throwParseException(m_source_.toString(), 0);
714
// 6 Otherwise (when relation != reset)
715
if (sourceToken == null) {
716
// If sourceToken is null, create new one
717
sourceToken = new Token();
718
sourceToken.m_rules_ = m_source_;
719
sourceToken.m_source_ = m_parsedToken_.m_charsLen_ << 24
720
| m_parsedToken_.m_charsOffset_;
721
sourceToken.m_prefix_ = m_parsedToken_.m_prefixLen_ << 24
722
| m_parsedToken_.m_prefixOffset_;
723
// TODO: this should also handle reverse
724
sourceToken.m_polarity_ = TOKEN_POLARITY_POSITIVE_;
725
sourceToken.m_next_ = null;
726
sourceToken.m_previous_ = null;
727
sourceToken.m_CELength_ = 0;
728
sourceToken.m_expCELength_ = 0;
729
m_hashTable_.put(sourceToken, sourceToken);
732
// we could have fished out a reset here
733
if (sourceToken.m_strength_ != TOKEN_RESET_
734
&& lastToken != sourceToken) {
735
// otherwise remove sourceToken from where it was.
736
if (sourceToken.m_next_ != null) {
737
if (sourceToken.m_next_.m_strength_
738
> sourceToken.m_strength_) {
739
sourceToken.m_next_.m_strength_
740
= sourceToken.m_strength_;
742
sourceToken.m_next_.m_previous_
743
= sourceToken.m_previous_;
746
sourceToken.m_listHeader_.m_last_
747
= sourceToken.m_previous_;
749
if (sourceToken.m_previous_ != null) {
750
sourceToken.m_previous_.m_next_
751
= sourceToken.m_next_;
754
sourceToken.m_listHeader_.m_first_
755
= sourceToken.m_next_;
757
sourceToken.m_next_ = null;
758
sourceToken.m_previous_ = null;
761
sourceToken.m_strength_ = m_parsedToken_.m_strength_;
762
sourceToken.m_listHeader_ = lastToken.m_listHeader_;
764
// 1. Find the strongest strength in each list, and set
765
// strongestP and strongestN accordingly in the headers.
766
if (lastStrength == TOKEN_RESET_
767
|| sourceToken.m_listHeader_.m_first_ == null) {
768
// If LAST is a reset insert sourceToken in the list.
769
if (sourceToken.m_listHeader_.m_first_ == null) {
770
sourceToken.m_listHeader_.m_first_ = sourceToken;
771
sourceToken.m_listHeader_.m_last_ = sourceToken;
773
else { // we need to find a place for us
774
// and we'll get in front of the same strength
775
if (sourceToken.m_listHeader_.m_first_.m_strength_
776
<= sourceToken.m_strength_) {
778
= sourceToken.m_listHeader_.m_first_;
779
sourceToken.m_next_.m_previous_ = sourceToken;
780
sourceToken.m_listHeader_.m_first_ = sourceToken;
781
sourceToken.m_previous_ = null;
784
lastToken = sourceToken.m_listHeader_.m_first_;
785
while (lastToken.m_next_ != null
786
&& lastToken.m_next_.m_strength_
787
> sourceToken.m_strength_) {
788
lastToken = lastToken.m_next_;
790
if (lastToken.m_next_ != null) {
791
lastToken.m_next_.m_previous_ = sourceToken;
794
sourceToken.m_listHeader_.m_last_
797
sourceToken.m_previous_ = lastToken;
798
sourceToken.m_next_ = lastToken.m_next_;
799
lastToken.m_next_ = sourceToken;
804
// Otherwise (when LAST is not a reset)
805
// if polarity (LAST) == polarity(relation), insert
806
// sourceToken after LAST, otherwise insert before.
807
// when inserting after or before, search to the next
808
// position with the same strength in that direction.
809
// (This is called postpone insertion).
810
if (sourceToken != lastToken) {
811
if (lastToken.m_polarity_ == sourceToken.m_polarity_) {
812
while (lastToken.m_next_ != null
813
&& lastToken.m_next_.m_strength_
814
> sourceToken.m_strength_) {
815
lastToken = lastToken.m_next_;
817
sourceToken.m_previous_ = lastToken;
818
if (lastToken.m_next_ != null) {
819
lastToken.m_next_.m_previous_ = sourceToken;
822
sourceToken.m_listHeader_.m_last_ = sourceToken;
824
sourceToken.m_next_ = lastToken.m_next_;
825
lastToken.m_next_ = sourceToken;
828
while (lastToken.m_previous_ != null
829
&& lastToken.m_previous_.m_strength_
830
> sourceToken.m_strength_) {
831
lastToken = lastToken.m_previous_;
833
sourceToken.m_next_ = lastToken;
834
if (lastToken.m_previous_ != null) {
835
lastToken.m_previous_.m_next_ = sourceToken;
838
sourceToken.m_listHeader_.m_first_
841
sourceToken.m_previous_ = lastToken.m_previous_;
842
lastToken.m_previous_ = sourceToken;
845
else { // repeated one thing twice in rules, stay with the
847
if (lastStrength < sourceToken.m_strength_) {
848
sourceToken.m_strength_ = lastStrength;
852
// if the token was a variable top, we're gonna put it in
853
if (variableTop == true && m_variableTop_ == null) {
855
m_variableTop_ = sourceToken;
857
// Treat the expansions.
858
// There are two types of expansions: explicit (x / y) and
859
// reset based propagating expansions
860
// (&abc * d * e <=> &ab * d / c * e / c)
861
// if both of them are in effect for a token, they are combined.
862
sourceToken.m_expansion_ = m_parsedToken_.m_extensionLen_ << 24
863
| m_parsedToken_.m_extensionOffset_;
864
if (expandNext != 0) {
865
if (sourceToken.m_strength_ == RuleBasedCollator.PRIMARY) {
866
// primary strength kills off the implicit expansion
869
else if (sourceToken.m_expansion_ == 0) {
870
// if there is no expansion, implicit is just added to
872
sourceToken.m_expansion_ = expandNext;
875
// there is both explicit and implicit expansion.
876
// We need to make a combination
877
int start = expandNext & 0xFFFFFF;
878
int size = expandNext >>> 24;
880
m_source_.append(m_source_.substring(start,
883
start = m_parsedToken_.m_extensionOffset_;
884
m_source_.append(m_source_.substring(start,
885
start + m_parsedToken_.m_extensionLen_));
886
sourceToken.m_expansion_ = (size
887
+ m_parsedToken_.m_extensionLen_) << 24
889
m_extraCurrent_ += size + m_parsedToken_.m_extensionLen_;
892
// if the previous token was a reset before, the strength of this
893
// token must match the strength of before. Otherwise we have an
894
// undefined situation.
895
// In other words, we currently have a kludge which we use to
896
// represent &a >> x. This is written as &[before 2]a << x.
897
if((lastToken.m_flags_ & TOKEN_BEFORE_) != 0) {
898
int beforeStrength = (lastToken.m_flags_ & TOKEN_BEFORE_) - 1;
899
if(beforeStrength != sourceToken.m_strength_) {
900
throwParseException(m_source_.toString(), m_current_);
906
if (lastToken != null && lastStrength == TOKEN_RESET_) {
907
// if the previous token was also a reset, this means that
908
// we have two consecutive resets and we want to remove the
909
// previous one if empty
910
if (m_resultLength_ > 0 && m_listHeader_[m_resultLength_ - 1].m_first_ == null) {
914
if (sourceToken == null) {
915
// this is a reset, but it might still be somewhere in the
916
// tailoring, in shorter form
917
int searchCharsLen = m_parsedToken_.m_charsLen_;
918
while (searchCharsLen > 1 && sourceToken == null) {
920
// key = searchCharsLen << 24 | charsOffset;
921
m_utilToken_.m_source_ = searchCharsLen << 24
922
| m_parsedToken_.m_charsOffset_;
923
m_utilToken_.m_rules_ = m_source_;
924
sourceToken = m_hashTable_.get(m_utilToken_);
926
if (sourceToken != null) {
927
expandNext = (m_parsedToken_.m_charsLen_
928
- searchCharsLen) << 24
929
| (m_parsedToken_.m_charsOffset_
933
if ((specs & TOKEN_BEFORE_) != 0) {
935
// we're doing before & there is no indirection
936
int strength = (specs & TOKEN_BEFORE_) - 1;
937
if (sourceToken != null
938
&& sourceToken.m_strength_ != TOKEN_RESET_) {
939
// this is a before that is already ordered in the UCA
940
// - so we need to get the previous with good strength
941
while (sourceToken.m_strength_ > strength
942
&& sourceToken.m_previous_ != null) {
943
sourceToken = sourceToken.m_previous_;
945
// here, either we hit the strength or NULL
946
if (sourceToken.m_strength_ == strength) {
947
if (sourceToken.m_previous_ != null) {
948
sourceToken = sourceToken.m_previous_;
950
else { // start of list
952
= sourceToken.m_listHeader_.m_reset_;
955
else { // we hit NULL, we should be doing the else part
957
= sourceToken.m_listHeader_.m_reset_;
958
sourceToken = getVirginBefore(sourceToken,
964
= getVirginBefore(sourceToken, strength);
968
// this is both before and indirection
970
m_listHeader_[m_resultLength_] = new TokenListHeader();
971
m_listHeader_[m_resultLength_].m_previousCE_ = 0;
972
m_listHeader_[m_resultLength_].m_previousContCE_ = 0;
973
m_listHeader_[m_resultLength_].m_indirect_ = true;
974
// we need to do slightly more work. we need to get the
975
// baseCE using the inverse UCA & getPrevious. The next
976
// bound is not set, and will be decided in ucol_bld
977
int strength = (specs & TOKEN_BEFORE_) - 1;
978
int baseCE = INDIRECT_BOUNDARIES_[
979
m_parsedToken_.m_indirectIndex_].m_startCE_;
980
int baseContCE = INDIRECT_BOUNDARIES_[
981
m_parsedToken_.m_indirectIndex_].m_startContCE_;
982
int ce[] = new int[2];
983
if((baseCE >>> 24 >= RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_IMPLICIT_MIN_)
984
&& (baseCE >>> 24 <= RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_IMPLICIT_MAX_)) { /* implicits - */
985
int primary = baseCE & RuleBasedCollator.CE_PRIMARY_MASK_ | (baseContCE & RuleBasedCollator.CE_PRIMARY_MASK_) >> 16;
986
int raw = RuleBasedCollator.impCEGen_.getRawFromImplicit(primary);
987
int primaryCE = RuleBasedCollator.impCEGen_.getImplicitFromRaw(raw-1);
988
ce[0] = primaryCE & RuleBasedCollator.CE_PRIMARY_MASK_ | 0x0505;
989
ce[1] = (primaryCE << 16) & RuleBasedCollator.CE_PRIMARY_MASK_ | RuleBasedCollator.CE_CONTINUATION_MARKER_;
991
CollationParsedRuleBuilder.InverseUCA invuca
992
= CollationParsedRuleBuilder.INVERSE_UCA_;
993
invuca.getInversePrevCE(baseCE, baseContCE, strength,
996
m_listHeader_[m_resultLength_].m_baseCE_ = ce[0];
997
m_listHeader_[m_resultLength_].m_baseContCE_ = ce[1];
998
m_listHeader_[m_resultLength_].m_nextCE_ = 0;
999
m_listHeader_[m_resultLength_].m_nextContCE_ = 0;
1001
sourceToken = new Token();
1002
expandNext = initAReset(0, sourceToken);
1005
// 5 If the relation is a reset:
1006
// If sourceToken is null
1007
// Create new list, create new sourceToken, make the baseCE
1008
// from source, put the sourceToken in ListHeader of the new
1010
if (sourceToken == null) {
1011
if (m_listHeader_[m_resultLength_] == null) {
1012
m_listHeader_[m_resultLength_] = new TokenListHeader();
1014
// 3 Consider each item: relation, source, and expansion:
1015
// e.g. ...< x / y ...
1016
// First convert all expansions into normal form.
1018
// If "xy" doesn't occur earlier in the list or in the UCA,
1019
// convert &xy * c * d * ... into &x * c/y * d * ...
1020
// Note: reset values can never have expansions, although
1021
// they can cause the very next item to have one. They may
1022
// be contractions, if they are found earlier in the list.
1024
CollationElementIterator coleiter
1025
= RuleBasedCollator.UCA_.getCollationElementIterator(
1026
m_source_.substring(m_parsedToken_.m_charsOffset_,
1027
m_parsedToken_.m_charsOffset_
1028
+ m_parsedToken_.m_charsLen_));
1030
int CE = coleiter.next();
1031
// offset to the character in the full rule string
1032
int expand = coleiter.getOffset()
1033
+ m_parsedToken_.m_charsOffset_;
1034
int SecondCE = coleiter.next();
1036
m_listHeader_[m_resultLength_].m_baseCE_
1038
if (RuleBasedCollator.isContinuation(SecondCE)) {
1039
m_listHeader_[m_resultLength_].m_baseContCE_
1043
m_listHeader_[m_resultLength_].m_baseContCE_ = 0;
1045
m_listHeader_[m_resultLength_].m_nextCE_ = 0;
1046
m_listHeader_[m_resultLength_].m_nextContCE_ = 0;
1047
m_listHeader_[m_resultLength_].m_previousCE_ = 0;
1048
m_listHeader_[m_resultLength_].m_previousContCE_ = 0;
1049
m_listHeader_[m_resultLength_].m_indirect_ = false;
1050
sourceToken = new Token();
1051
expandNext = initAReset(expand, sourceToken);
1053
else { // top == TRUE
1055
m_listHeader_[m_resultLength_].m_previousCE_ = 0;
1056
m_listHeader_[m_resultLength_].m_previousContCE_ = 0;
1057
m_listHeader_[m_resultLength_].m_indirect_ = true;
1058
IndirectBoundaries ib = INDIRECT_BOUNDARIES_[
1059
m_parsedToken_.m_indirectIndex_];
1060
m_listHeader_[m_resultLength_].m_baseCE_
1062
m_listHeader_[m_resultLength_].m_baseContCE_
1063
= ib.m_startContCE_;
1064
m_listHeader_[m_resultLength_].m_nextCE_
1066
m_listHeader_[m_resultLength_].m_nextContCE_
1067
= ib.m_limitContCE_;
1068
sourceToken = new Token();
1069
expandNext = initAReset(0, sourceToken);
1072
else { // reset to something already in rules
1076
// 7 After all this, set LAST to point to sourceToken, and goto
1078
lastToken = sourceToken;
1081
if (m_resultLength_ > 0
1082
&& m_listHeader_[m_resultLength_ - 1].m_first_ == null) {
1085
return m_resultLength_;
1089
* Formats and throws a ParseException
1090
* @param rules collation rule that failed
1091
* @param offset failed offset in rules
1092
* @throws ParseException with failure information
1094
private static final void throwParseException(String rules, int offset)
1095
throws ParseException
1098
String precontext = rules.substring(0, offset);
1099
String postcontext = rules.substring(offset, rules.length());
1100
StringBuilder error = new StringBuilder(
1101
"Parse error occurred in rule at offset ");
1102
error.append(offset);
1103
error.append("\n after the prefix \"");
1104
error.append(precontext);
1105
error.append("\" before the suffix \"");
1106
error.append(postcontext);
1107
throw new ParseException(error.toString(), offset);
1110
private final boolean doSetTop() {
1111
m_parsedToken_.m_charsOffset_ = m_extraCurrent_;
1112
m_source_.append((char)0xFFFE);
1113
IndirectBoundaries ib =
1114
INDIRECT_BOUNDARIES_[m_parsedToken_.m_indirectIndex_];
1115
m_source_.append((char)(ib.m_startCE_ >> 16));
1116
m_source_.append((char)(ib.m_startCE_ & 0xFFFF));
1117
m_extraCurrent_ += 3;
1118
if (INDIRECT_BOUNDARIES_[m_parsedToken_.m_indirectIndex_
1119
].m_startContCE_ == 0) {
1120
m_parsedToken_.m_charsLen_ = 3;
1123
m_source_.append((char)(INDIRECT_BOUNDARIES_[
1124
m_parsedToken_.m_indirectIndex_
1125
].m_startContCE_ >> 16));
1126
m_source_.append((char)(INDIRECT_BOUNDARIES_[
1127
m_parsedToken_.m_indirectIndex_
1128
].m_startContCE_ & 0xFFFF));
1129
m_extraCurrent_ += 2;
1130
m_parsedToken_.m_charsLen_ = 5;
1135
private static boolean isCharNewLine(char c) {
1137
case 0x000A: /* LF */
1138
case 0x000D: /* CR */
1139
case 0x000C: /* FF */
1140
case 0x0085: /* NEL */
1141
case 0x2028: /* LS */
1142
case 0x2029: /* PS */
/**
 * Gets the next token.
 *
 * @param startofrules
 *        flag indicating if we are at the start of rules
 * @return the offset of the rules
 * @exception ParseException
 *            thrown when rule parsing fails
 */
// Tokenizes the rule text: walks m_source_ from m_current_ up to
// m_rules_.length(), records the token's character range, prefix range and
// indirect index in m_parsedToken_, and hands strength, extension range and
// flags to doEndParseNextToken(...), whose result is returned.
// Quoted/escaped text and synthetic tokens are appended to the end of
// m_source_, with m_extraCurrent_ tracking that append position.
// NOTE(review): this listing appears to be a lossy extraction. The bare
// integer lines look like original line numbers, and lines such as braces,
// `break`s and some declarations are absent (e.g. `before` is used below but
// never declared in the visible text). Restore the method from the upstream
// file before compiling or refactoring; the control flow shown here is
// incomplete.
@SuppressWarnings("fallthrough")
1159
private int parseNextToken(boolean startofrules) throws ParseException
1162
boolean variabletop = false;
1163
boolean top = false;
1164
boolean inchars = true;
1165
boolean inquote = false;
1166
boolean wasinquote = false;
1168
boolean isescaped = false;
1169
int /*newcharslen = 0,*/ newextensionlen = 0;
1170
int /*charsoffset = 0,*/ extensionoffset = 0;
1171
int newstrength = TOKEN_UNSET_;
1173
// reset the shared parsed-token state before scanning
m_parsedToken_.m_charsLen_ = 0;
1174
m_parsedToken_.m_charsOffset_ = 0;
1175
m_parsedToken_.m_prefixOffset_ = 0;
1176
m_parsedToken_.m_prefixLen_ = 0;
1177
m_parsedToken_.m_indirectIndex_ = 0;
1179
// only the original rule text is scanned; text appended to m_source_ later
// lies beyond this limit
int limit = m_rules_.length();
1180
while (m_current_ < limit) {
1181
char ch = m_source_.charAt(m_current_);
1183
if (ch == 0x0027) { // '\''
1187
if ((m_parsedToken_.m_charsLen_ == 0) || inchars) {
1188
if (m_parsedToken_.m_charsLen_ == 0) {
1189
m_parsedToken_.m_charsOffset_ = m_extraCurrent_;
1191
m_parsedToken_.m_charsLen_ ++;
1194
if (newextensionlen == 0) {
1195
extensionoffset = m_extraCurrent_;
1201
else if (isescaped) {
1203
// an escaped character is only legal once a strength has been seen
if (newstrength == TOKEN_UNSET_) {
1204
throwParseException(m_rules_, m_current_);
1206
if (ch != 0 && m_current_ != limit) {
1208
if (m_parsedToken_.m_charsLen_ == 0) {
1209
m_parsedToken_.m_charsOffset_ = m_current_;
1211
m_parsedToken_.m_charsLen_ ++;
1214
if (newextensionlen == 0) {
1215
extensionoffset = m_current_;
1222
if (!UCharacterProperty.isRuleWhiteSpace(ch)) {
1223
// Sets the strength for this entry
1225
// For each relation operator below: if a strength is already pending, the
// operator ends the current token (doEndParseNextToken is returned).
case 0x003D : // '='
1226
if (newstrength != TOKEN_UNSET_) {
1227
return doEndParseNextToken(newstrength,
1231
variabletop, before);
1233
// if we start with strength, we'll reset to top
1234
if (startofrules == true) {
1235
// NOTE(review): 5 presumably indexes the "top" entry of
// INDIRECT_BOUNDARIES_ - confirm against that table.
m_parsedToken_.m_indirectIndex_ = 5;
1237
return doEndParseNextToken(TOKEN_RESET_,
1241
variabletop, before);
1243
newstrength = Collator.IDENTICAL;
1244
// A '*' directly after the operator: the strength is remembered in
// m_prevStrength_; otherwise m_prevStrength_ is cleared.
// NOTE(review): charAt(m_current_ + 1) is not bounds-checked in the visible
// lines; confirm an operator as the last character cannot overrun m_source_.
if(m_source_.charAt(m_current_ + 1) == 0x002A) { // '*'
1246
m_prevStrength_ = newstrength;
1248
m_prevStrength_ = TOKEN_UNSET_;
1251
case 0x002C : // ','
1252
if (newstrength != TOKEN_UNSET_) {
1253
return doEndParseNextToken(newstrength,
1257
variabletop, before);
1259
// if we start with strength, we'll reset to top
1260
if (startofrules == true) {
1261
m_parsedToken_.m_indirectIndex_ = 5;
1263
return doEndParseNextToken(TOKEN_RESET_,
1267
variabletop, before);
1269
newstrength = Collator.TERTIARY;
1270
m_prevStrength_ = TOKEN_UNSET_;
1272
case 0x003B : // ';'
1273
if (newstrength != TOKEN_UNSET_) {
1274
return doEndParseNextToken(newstrength,
1278
variabletop, before);
1280
// if we start with strength, we'll reset to top
1281
if (startofrules == true) {
1282
m_parsedToken_.m_indirectIndex_ = 5;
1284
return doEndParseNextToken(TOKEN_RESET_,
1288
variabletop, before);
1290
newstrength = Collator.SECONDARY;
1291
m_prevStrength_ = TOKEN_UNSET_;
1293
case 0x003C : // '<'
1294
if (newstrength != TOKEN_UNSET_) {
1295
return doEndParseNextToken(newstrength,
1299
variabletop, before);
1301
// if we start with strength, we'll reset to top
1302
if (startofrules == true) {
1303
m_parsedToken_.m_indirectIndex_ = 5;
1305
return doEndParseNextToken(TOKEN_RESET_,
1309
variabletop, before);
1311
// before this, do a scan to verify whether this is
1313
// '<', '<<' and '<<<' select primary, secondary and tertiary strength
if (m_source_.charAt(m_current_ + 1) == 0x003C) {
1315
if (m_source_.charAt(m_current_ + 1) == 0x003C) {
1316
m_current_ ++; // three in a row!
1317
newstrength = Collator.TERTIARY;
1319
else { // two in a row
1320
newstrength = Collator.SECONDARY;
1324
newstrength = Collator.PRIMARY;
1327
if(m_source_.charAt(m_current_ + 1) == 0x002A) { // '*'
1329
m_prevStrength_ = newstrength;
1331
m_prevStrength_ = TOKEN_UNSET_;
1334
case 0x0026 : // '&'
1335
if (newstrength != TOKEN_UNSET_) {
1336
return doEndParseNextToken(newstrength,
1340
variabletop, before);
1342
newstrength = TOKEN_RESET_; // PatternEntry::RESET = 0
1343
m_prevStrength_ = TOKEN_UNSET_;
1345
case 0x005b : // '['
1346
// options - read an option, analyze it
1347
// find the closing ']' in the rule string; -1 means the option is unterminated
m_optionEnd_ = m_rules_.indexOf(0x005d, m_current_);
1348
if (m_optionEnd_ != -1) { // ']'
1349
byte result = readAndSetOption();
1350
m_current_ = m_optionEnd_;
1351
if ((result & TOKEN_TOP_MASK_) != 0) {
1352
if (newstrength == TOKEN_RESET_) {
1355
// This is a combination of before and
1357
// '&[before 2][first regular]<b'
1358
// append '-' (0x2d) and the before value to the synthetic token text
m_source_.append((char)0x002d);
1359
m_source_.append((char)before);
1360
m_extraCurrent_ += 2;
1361
m_parsedToken_.m_charsLen_ += 2;
1364
return doEndParseNextToken(newstrength,
1368
variabletop, before);
1371
throwParseException(m_rules_, m_current_);
1374
else if ((result & TOKEN_VARIABLE_TOP_MASK_) != 0) {
1375
if (newstrength != TOKEN_RESET_
1376
&& newstrength != TOKEN_UNSET_) {
1378
m_parsedToken_.m_charsOffset_
1380
// variable top is represented by a single appended 0xFFFF char
m_source_.append((char)0xFFFF);
1383
m_parsedToken_.m_charsLen_ = 1;
1384
return doEndParseNextToken(newstrength,
1388
variabletop, before);
1391
throwParseException(m_rules_, m_current_);
1394
else if ((result & TOKEN_BEFORE_) != 0){
1395
// [before n] is only accepted directly after a reset ('&')
if (newstrength == TOKEN_RESET_) {
1396
before = (byte)(result & TOKEN_BEFORE_);
1399
throwParseException(m_rules_, m_current_);
1404
case 0x002F : // '/'
1405
wasinquote = false; // if we were copying source
1406
// characters, we want to stop now
1407
inchars = false; // we're now processing expansion
1409
case 0x005C : // back slash for escaped chars
1412
// found a quote, we're gonna start copying
1413
case 0x0027 : //'\''
1414
if (newstrength == TOKEN_UNSET_) {
1415
if (m_prevStrength_ == TOKEN_UNSET_) {
1416
// quote is illegal until we have a strength
1417
throwParseException(m_rules_, m_current_);
1419
newstrength = m_prevStrength_;
1423
if (inchars) { // we're doing characters
1424
if (wasinquote == false) {
1425
m_parsedToken_.m_charsOffset_ = m_extraCurrent_;
1427
if (m_parsedToken_.m_charsLen_ != 0) {
1428
// copy the characters read so far to the end of m_source_
m_source_.append(m_source_.substring(
1429
m_current_ - m_parsedToken_.m_charsLen_,
1431
m_extraCurrent_ += m_parsedToken_.m_charsLen_;
1433
m_parsedToken_.m_charsLen_ ++;
1435
else { // we're doing an expansion
1436
if (wasinquote == false) {
1437
extensionoffset = m_extraCurrent_;
1439
if (newextensionlen != 0) {
1440
m_source_.append(m_source_.substring(
1441
m_current_ - newextensionlen,
1443
m_extraCurrent_ += newextensionlen;
1449
ch = m_source_.charAt(m_current_);
1450
if (ch == 0x0027) { // copy the double quote
1451
m_source_.append(ch);
1456
// '@' is french only if the strength is not currently set
1457
// if it is, it's just a regular character in collation
1458
case 0x0040 : // '@'
1459
if (newstrength == TOKEN_UNSET_) {
1460
m_options_.m_isFrenchCollation_ = true;
1465
// this means we have actually been reading prefix part
1466
// we want to store read characters to the prefix part
1467
// and continue reading the characters (proper way
1468
// would be to restart reading the chars, but in that
1469
// case we would have to complicate the token hasher,
1470
// which I do not intend to play with. Instead, we will
1471
// do prefixes when prefixes are due (before adding the
1473
// what has been read so far becomes the prefix range
m_parsedToken_.m_prefixOffset_
1474
= m_parsedToken_.m_charsOffset_;
1475
m_parsedToken_.m_prefixLen_
1476
= m_parsedToken_.m_charsLen_;
1477
if (inchars) { // we're doing characters
1478
if (wasinquote == false) {
1479
m_parsedToken_.m_charsOffset_ = m_extraCurrent_;
1481
if (m_parsedToken_.m_charsLen_ != 0) {
1482
String prefix = m_source_.substring(
1483
m_current_ - m_parsedToken_.m_charsLen_,
1485
m_source_.append(prefix);
1486
m_extraCurrent_ += m_parsedToken_.m_charsLen_;
1488
m_parsedToken_.m_charsLen_ ++;
1493
ch = m_source_.charAt(m_current_);
1494
// skip whitespace between '|' and the character
1495
} while (UCharacterProperty.isRuleWhiteSpace(ch));
1497
case 0x0023: // '#' // this is a comment, skip everything through the end of line
1500
ch = m_source_.charAt(m_current_);
1501
} while (!isCharNewLine(ch));
1503
case 0x0021: // '!' // ignoring java set thai reordering
1506
if (newstrength == TOKEN_UNSET_) {
1507
if(m_prevStrength_ == TOKEN_UNSET_){
1508
throwParseException(m_rules_, m_current_);
1510
newstrength = m_prevStrength_;
1513
// an unquoted special character is a syntax error
if (isSpecialChar(ch) && (inquote == false)) {
1514
throwParseException(m_rules_, m_current_);
1516
if (ch == 0x0000 && m_current_ + 1 == limit) {
1520
if (m_parsedToken_.m_charsLen_ == 0) {
1521
m_parsedToken_.m_charsOffset_ = m_current_;
1523
m_parsedToken_.m_charsLen_++;
1524
if(m_prevStrength_ != TOKEN_UNSET_){
1525
// take the whole code point (one or two chars) at m_current_ as the token
char[] fullchar = Character.toChars(Character.codePointAt(m_source_, m_current_));
1526
m_current_ += fullchar.length;
1527
m_parsedToken_.m_charsLen_ += fullchar.length - 1;
1528
return doEndParseNextToken(newstrength,
1532
variabletop, before);
1536
if (newextensionlen == 0) {
1537
extensionoffset = m_current_;
1547
m_source_.append(ch);
1553
return doEndParseNextToken(newstrength, top,
1554
extensionoffset, newextensionlen,
1555
variabletop, before);
1559
* End the next parse token
1560
* @param newstrength new strength
1561
* @return offset in rules, -1 for end of rules
1563
private int doEndParseNextToken(int newstrength, /*int newcharslen,*/
1564
boolean top, /*int charsoffset,*/
1565
int extensionoffset, int newextensionlen,
1566
boolean variabletop, int before)
1567
throws ParseException
1569
if (newstrength == TOKEN_UNSET_) {
1572
if (m_parsedToken_.m_charsLen_ == 0 && top == false) {
1573
throwParseException(m_rules_, m_current_);
1576
m_parsedToken_.m_strength_ = newstrength;
1577
//m_parsedToken_.m_charsOffset_ = charsoffset;
1578
//m_parsedToken_.m_charsLen_ = newcharslen;
1579
m_parsedToken_.m_extensionOffset_ = extensionoffset;
1580
m_parsedToken_.m_extensionLen_ = newextensionlen;
1581
m_parsedToken_.m_flags_ = (char)
1582
((variabletop ? TOKEN_VARIABLE_TOP_MASK_ : 0)
1583
| (top ? TOKEN_TOP_MASK_ : 0) | before);
/**
 * Token before this element
 * @param sourcetoken
 * @param strength collation strength
 * @return the token before source token
 * @exception ParseException thrown when rules have the wrong syntax
 */
// Builds the reset token for a [before n] anchor that is not yet tailored:
// the anchor's collation elements are read from the UCA iterator
// (m_UCAColEIter_), the preceding CE is looked up through
// CollationParsedRuleBuilder.INVERSE_UCA_, and a new TokenListHeader plus
// reset Token are created via initAReset(-1, ...).
// NOTE(review): this listing appears to be a lossy extraction. The bare
// integer lines look like original line numbers and several lines (braces,
// `else`, declarations such as `ch`, and the final return) are absent.
// Restore the method from the upstream file before compiling or refactoring.
private Token getVirginBefore(Token sourcetoken, int strength)
1595
throws ParseException
1597
// this is a virgin before - we need to fish the anchor from the UCA
1598
if (sourcetoken != null) {
1599
// the low 24 bits of m_source_ hold the offset (the length sits in the top 8 bits)
int offset = sourcetoken.m_source_ & 0xFFFFFF;
1600
m_UCAColEIter_.setText(m_source_.substring(offset, offset + 1));
1603
m_UCAColEIter_.setText(
1604
m_source_.substring(m_parsedToken_.m_charsOffset_,
1605
m_parsedToken_.m_charsOffset_ + 1));
1608
// 0xFFFFFF3F clears bits 6-7 of the low byte
// NOTE(review): presumably these are the case bits - confirm.
int basece = m_UCAColEIter_.next() & 0xFFFFFF3F;
1609
int basecontce = m_UCAColEIter_.next();
1610
if (basecontce == CollationElementIterator.NULLORDER) {
1617
// base CE whose lead primary byte lies in the implicit range
if((basece >>> 24 >= RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_IMPLICIT_MIN_)
1618
&& (basece >>> 24 <= RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_IMPLICIT_MAX_)) { /* implicits - */
1620
int primary = basece & RuleBasedCollator.CE_PRIMARY_MASK_ | (basecontce & RuleBasedCollator.CE_PRIMARY_MASK_) >> 16;
1621
int raw = RuleBasedCollator.impCEGen_.getRawFromImplicit(primary);
1622
// raw-1: step to the implicit value immediately before the anchor
ch = RuleBasedCollator.impCEGen_.getCodePointFromRaw(raw-1);
1623
int primaryCE = RuleBasedCollator.impCEGen_.getImplicitFromRaw(raw-1);
1624
m_utilCEBuffer_[0] = primaryCE & RuleBasedCollator.CE_PRIMARY_MASK_ | 0x0505;
1625
m_utilCEBuffer_[1] = (primaryCE << 16) & RuleBasedCollator.CE_PRIMARY_MASK_ | RuleBasedCollator.CE_CONTINUATION_MARKER_;
1627
// synthetic token text: 0xFFFE marker followed by the preceding code point
m_parsedToken_.m_charsOffset_ = m_extraCurrent_;
1628
m_source_.append('\uFFFE');
1629
m_source_.append((char)ch);
1630
m_extraCurrent_ += 2;
1631
m_parsedToken_.m_charsLen_++;
1633
// look the synthetic token up using the same length<<24 | offset key encoding
m_utilToken_.m_source_ = (m_parsedToken_.m_charsLen_ << 24)
1634
| m_parsedToken_.m_charsOffset_;
1635
m_utilToken_.m_rules_ = m_source_;
1636
sourcetoken = m_hashTable_.get(m_utilToken_);
1638
if(sourcetoken == null) {
1639
m_listHeader_[m_resultLength_] = new TokenListHeader();
1640
m_listHeader_[m_resultLength_].m_baseCE_
1641
= m_utilCEBuffer_[0] & 0xFFFFFF3F;
1642
if (RuleBasedCollator.isContinuation(m_utilCEBuffer_[1])) {
1643
m_listHeader_[m_resultLength_].m_baseContCE_
1644
= m_utilCEBuffer_[1];
1647
m_listHeader_[m_resultLength_].m_baseContCE_ = 0;
1649
m_listHeader_[m_resultLength_].m_nextCE_ = 0;
1650
m_listHeader_[m_resultLength_].m_nextContCE_ = 0;
1651
m_listHeader_[m_resultLength_].m_previousCE_ = 0;
1652
m_listHeader_[m_resultLength_].m_previousContCE_ = 0;
1653
m_listHeader_[m_resultLength_].m_indirect_ = false;
1655
sourcetoken = new Token();
1656
initAReset(-1, sourcetoken);
1661
// first ce and second ce m_utilCEBuffer_
1662
/*int invpos = */CollationParsedRuleBuilder.INVERSE_UCA_.getInversePrevCE(
1664
strength, m_utilCEBuffer_);
1665
// we got the previous CE. Now we need to see if the difference between
1666
// the two CEs is really of the requested strength.
1667
// if it's a bigger difference (we asked for secondary and got primary), we
1668
// need to modify the CE.
1669
if(CollationParsedRuleBuilder.INVERSE_UCA_.getCEStrengthDifference(basece, basecontce, m_utilCEBuffer_[0], m_utilCEBuffer_[1]) < strength) {
1670
// adjust the strength
1671
// now we are in the situation where our baseCE should actually be modified in
1672
// order to get the CE in the right position.
1673
if(strength == Collator.SECONDARY) {
1674
m_utilCEBuffer_[0] = basece - 0x0200;
1675
} else { // strength == UCOL_TERTIARY
1676
m_utilCEBuffer_[0] = basece - 0x02;
1678
if(RuleBasedCollator.isContinuation(basecontce)) {
1679
if(strength == Collator.SECONDARY) {
1680
m_utilCEBuffer_[1] = basecontce - 0x0200;
1681
} else { // strength == UCOL_TERTIARY
1682
m_utilCEBuffer_[1] = basecontce - 0x02;
1688
// the code below relies on getting a code point from the inverse table, in order to be
1689
// able to merge the situations like &x < 9 &[before 1]a < d. This won't work:
1690
// 1. There are many code points that have the same CE
1691
// 2. The CE to codepoint table (things pointed to by CETable[3*invPos+2] are broken.
1692
// Also, in case when there is no equivalent strength before an element, we have to actually
1693
// construct one. For example, &[before 2]a << x won't result in x << a, because the element
1694
// before a is a primary difference.
1695
// NOTE(review): `invpos` is only assigned inside a comment above, so the
// lookup below was presumably commented out upstream - confirm.
ch = CollationParsedRuleBuilder.INVERSE_UCA_.m_table_[3 * invpos
1697
if ((ch & INVERSE_SIZE_MASK_) != 0) {
1698
int offset = ch & INVERSE_OFFSET_MASK_;
1699
ch = CollationParsedRuleBuilder.INVERSE_UCA_.m_continuations_[
1702
m_source_.append((char)ch);
1704
m_parsedToken_.m_charsOffset_ = m_extraCurrent_ - 1;
1705
m_parsedToken_.m_charsLen_ = 1;
1707
// We got an UCA before. However, this might have been tailored.
1710
// &[before 3]\u306a<<<\u306a|\u309d
1712
m_utilToken_.m_source_ = (m_parsedToken_.m_charsLen_ << 24)
1713
| m_parsedToken_.m_charsOffset_;
1714
m_utilToken_.m_rules_ = m_source_;
1715
sourcetoken = (Token)m_hashTable_.get(m_utilToken_);
1718
// here is how it should be. The situation such as &[before 1]a < x, should be
1719
// resolved exactly as if we wrote &a > x.
1720
// therefore, I don't really care if the UCA value before a has been changed.
1721
// However, I do care if the strength between my element and the previous element
1722
// is bigger then I wanted. So, if CE < baseCE and I wanted &[before 2], then i'll
1723
// have to construct the base CE.
1725
// if we found a tailored thing, we have to use the UCA value and
1726
// construct a new reset token with constructed name
1727
//if (sourcetoken != null && sourcetoken.m_strength_ != TOKEN_RESET_) {
1728
// character to which we want to anchor is already tailored.
1729
// We need to construct a new token which will be the anchor point
1730
//m_source_.setCharAt(m_extraCurrent_ - 1, '\uFFFE');
1731
//m_source_.append(ch);
1732
//m_extraCurrent_ ++;
1733
//m_parsedToken_.m_charsLen_ ++;
1735
// NOTE(review): the constant 10 is unexplained in the visible code;
// presumably it widens the synthetic token's range so that its key differs
// from the anchor's - confirm.
m_parsedToken_.m_charsOffset_ -= 10;
1736
m_parsedToken_.m_charsLen_ += 10;
1737
m_listHeader_[m_resultLength_] = new TokenListHeader();
1738
m_listHeader_[m_resultLength_].m_baseCE_
1739
= m_utilCEBuffer_[0] & 0xFFFFFF3F;
1740
if (RuleBasedCollator.isContinuation(m_utilCEBuffer_[1])) {
1741
m_listHeader_[m_resultLength_].m_baseContCE_
1742
= m_utilCEBuffer_[1];
1745
m_listHeader_[m_resultLength_].m_baseContCE_ = 0;
1747
m_listHeader_[m_resultLength_].m_nextCE_ = 0;
1748
m_listHeader_[m_resultLength_].m_nextContCE_ = 0;
1749
m_listHeader_[m_resultLength_].m_previousCE_ = 0;
1750
m_listHeader_[m_resultLength_].m_previousContCE_ = 0;
1751
m_listHeader_[m_resultLength_].m_indirect_ = false;
1752
sourcetoken = new Token();
1753
initAReset(-1, sourcetoken);
/**
 * Processing Description.
 * 1. Build a m_listHeader_. Each list has a header, which contains two lists
 *    (positive and negative), a reset token, a baseCE, nextCE, and
 *    previousCE. The lists and reset may be null.
 * 2. As you process, you keep a LAST pointer that points to the last token
 *    you handled.
 * @param expand string offset, -1 for null strings
 * @param targetToken token to update
 * @return expandnext offset
 * @throws ParseException thrown when rules syntax failed
 */
// Initializes targetToken as the reset token of the list header at
// m_listHeader_[m_resultLength_], using the character and extension ranges
// currently held in m_parsedToken_, and registers it in m_hashTable_.
// NOTE(review): this listing appears to be a lossy extraction. The bare
// integer lines look like original line numbers and several lines are
// absent (braces, the declaration of `result`, the tail of two expressions,
// and the return). Restore from the upstream file before compiling.
private int initAReset(int expand, Token targetToken) throws ParseException
1773
// grow the header array when the last slot is reached
if (m_resultLength_ == m_listHeader_.length - 1) {
1774
// Unfortunately, this won't work, as we store addresses of lhs in
1776
TokenListHeader temp[] = new TokenListHeader[m_resultLength_ << 1];
1777
System.arraycopy(m_listHeader_, 0, temp, 0, m_resultLength_ + 1);
1778
m_listHeader_ = temp;
1780
// do the reset thing
1781
targetToken.m_rules_ = m_source_;
1782
// ranges are packed as (length << 24) | offset
targetToken.m_source_ = m_parsedToken_.m_charsLen_ << 24
1783
| m_parsedToken_.m_charsOffset_;
1784
targetToken.m_expansion_ = m_parsedToken_.m_extensionLen_ << 24
1785
| m_parsedToken_.m_extensionOffset_;
1786
// keep the flags around so that we know about before
1787
targetToken.m_flags_ = m_parsedToken_.m_flags_;
1789
// a reset with a non-zero prefix offset is rejected as a syntax error
if (m_parsedToken_.m_prefixOffset_ != 0) {
1790
throwParseException(m_rules_, m_parsedToken_.m_charsOffset_ - 1);
1793
targetToken.m_prefix_ = 0;
1794
// TODO: this should also handle reverse
1795
targetToken.m_polarity_ = TOKEN_POLARITY_POSITIVE_;
1796
targetToken.m_strength_ = TOKEN_RESET_;
1797
targetToken.m_next_ = null;
1798
targetToken.m_previous_ = null;
1799
targetToken.m_CELength_ = 0;
1800
targetToken.m_expCELength_ = 0;
1801
targetToken.m_listHeader_ = m_listHeader_[m_resultLength_];
1802
// NOTE(review): m_first_ / m_last_ are cleared twice below; the second
// pair is redundant as written.
m_listHeader_[m_resultLength_].m_first_ = null;
1803
m_listHeader_[m_resultLength_].m_last_ = null;
1804
m_listHeader_[m_resultLength_].m_first_ = null;
1805
m_listHeader_[m_resultLength_].m_last_ = null;
1806
m_listHeader_[m_resultLength_].m_reset_ = targetToken;
1808
// NOTE(review): the block comment opened on the next line has no closing
// delimiter in this listing.
/* 3 Consider each item: relation, source, and expansion:
1809
* e.g. ...< x / y ...
1810
* First convert all expansions into normal form. Examples:
1811
* If "xy" doesn't occur earlier in the list or in the UCA, convert
1812
* &xy * c * d * ... into &x * c/y * d * ...
1813
* Note: reset values can never have expansions, although they can
1814
* cause the very next item to have one. They may be contractions, if
1815
* they are found earlier in the list.
1819
// check to see if there is an expansion
1820
if (m_parsedToken_.m_charsLen_ > 1) {
1821
targetToken.m_source_ = ((expand
1822
- m_parsedToken_.m_charsOffset_ )
1824
| m_parsedToken_.m_charsOffset_;
1825
result = ((m_parsedToken_.m_charsLen_
1826
+ m_parsedToken_.m_charsOffset_ - expand) << 24)
1832
m_hashTable_.put(targetToken, targetToken);
1837
* Checks if an character is special
1838
* @param ch character to test
1839
* @return true if the character is special
1841
private static final boolean isSpecialChar(char ch)
1843
return (ch <= 0x002F && ch >= 0x0020) || (ch <= 0x003F && ch >= 0x003A)
1844
|| (ch <= 0x0060 && ch >= 0x005B)
1845
|| (ch <= 0x007E && ch >= 0x007D) || ch == 0x007B;
1849
UnicodeSet readAndSetUnicodeSet(String source, int start) throws ParseException
1851
while(source.charAt(start) != '[') { /* advance while we find the first '[' */
1854
// now we need to get a balanced set of '[]'. The problem is that a set can have
1855
// many, and *end point to the first closing '['
1856
int noOpenBraces = 1;
1857
int current = 1; // skip the opening brace
1858
while(start+current < source.length() && noOpenBraces != 0) {
1859
if(source.charAt(start+current) == '[') {
1861
} else if(source.charAt(start+current) == ']') { // closing brace
1866
//int nextBrace = -1;
1868
if(noOpenBraces != 0 || (/*nextBrace =*/ source.indexOf("]", start+current) /*']'*/) == -1) {
1869
throwParseException(m_rules_, start);
1871
return new UnicodeSet(source.substring(start, start+current)); //uset_openPattern(start, current);
/**
 * In C, optionarg is passed by reference to the function.
 * We use a private int to simulate this.
 */
private int m_optionarg_ = 0;
1880
// Matches the text at rules[start...] case-insensitively against the names
// in RULES_OPTIONS_. When the matched option is followed by more text
// before optionend, m_optionarg_ is set to the index just after the option
// name and then moved past whitespace, so it points at the option argument.
// NOTE(review): this listing appears to be a lossy extraction. The bare
// integer lines look like original line numbers and several lines are
// absent (the declaration of `i`, loop increments, braces, the return), so
// the returned value cannot be read off the visible lines.
private int readOption(String rules, int start, int optionend)
1884
while (i < RULES_OPTIONS_.length) {
1885
String option = RULES_OPTIONS_[i].m_name_;
1886
int optionlength = option.length();
1887
// the rule text must be strictly longer than start + optionlength
if (rules.length() > start + optionlength
1888
&& option.equalsIgnoreCase(rules.substring(start,
1889
start + optionlength))) {
1890
if (optionend - start > optionlength) {
1891
m_optionarg_ = start + optionlength;
1892
// start of the options, skip space
1893
while (m_optionarg_ < optionend && (UCharacter.isWhitespace(rules.charAt(m_optionarg_)) || UCharacterProperty.isRuleWhiteSpace(rules.charAt(m_optionarg_))))
1902
// i reached the end of the table: no option name matched
if(i == RULES_OPTIONS_.length) {
/**
 * Reads and sets collation options
 * @return TOKEN_SUCCESS if option is set correctly, 0 otherwise
 * @exception ParseException thrown when options in rules are wrong
 */
// Reads the option that starts right after the '[' at m_current_ and acts on
// it according to its index in RULES_OPTIONS_ (as returned by readOption):
// attribute options are applied through setOptions; other options are
// reported to the caller through mask bits in the returned byte.
// Unknown options or sub-options raise a ParseException.
// NOTE(review): this listing appears to be a lossy extraction. The bare
// integer lines look like original line numbers and several lines are
// absent (some `if` headers, braces, loop increments, expression tails).
// Restore from the upstream file before compiling or refactoring.
private byte readAndSetOption() throws ParseException
1914
int start = m_current_ + 1; // skip opening '['
1915
int i = readOption(m_rules_, start, m_optionEnd_);
1917
// readOption leaves the argument position in m_optionarg_
int optionarg = m_optionarg_;
1920
throwParseException(m_rules_, start);
1924
if (optionarg != 0) {
1925
// find the sub-option named at optionarg and apply its attribute value
for (int j = 0; j < RULES_OPTIONS_[i].m_subOptions_.length;
1927
String subname = RULES_OPTIONS_[i].m_subOptions_[j];
1928
int size = optionarg + subname.length();
1929
if (m_rules_.length() > size
1930
&& subname.equalsIgnoreCase(m_rules_.substring(
1931
optionarg, size))) {
1932
setOptions(m_options_, RULES_OPTIONS_[i].m_attribute_,
1933
RULES_OPTIONS_[i].m_subOptionAttributeValues_[j]);
1934
return TOKEN_SUCCESS_MASK_;
1938
throwParseException(m_rules_, optionarg);
1940
else if (i == 7) { // variable top
1941
return TOKEN_SUCCESS_MASK_ | TOKEN_VARIABLE_TOP_MASK_;
1943
else if (i == 8) { // rearange
1944
return TOKEN_SUCCESS_MASK_;
1946
else if (i == 9) { // before
1947
if (optionarg != 0) {
1948
for (int j = 0; j < RULES_OPTIONS_[i].m_subOptions_.length;
1950
String subname = RULES_OPTIONS_[i].m_subOptions_[j];
1951
int size = optionarg + subname.length();
1952
if (m_rules_.length() > size
1953
&& subname.equalsIgnoreCase(
1954
m_rules_.substring(optionarg,
1955
optionarg + subname.length()))) {
1956
// the matched sub-option's value is folded into the returned byte
return (byte)(TOKEN_SUCCESS_MASK_
1957
| RULES_OPTIONS_[i].m_subOptionAttributeValues_[j]
1962
throwParseException(m_rules_, optionarg);
1964
else if (i == 10) { // top, we are going to have an array with
1965
// structures of limit CEs index to this array will be
1966
// src->parsedToken.indirectIndex
1967
m_parsedToken_.m_indirectIndex_ = 0;
1968
return TOKEN_SUCCESS_MASK_ | TOKEN_TOP_MASK_;
1970
else if (i < 13) { // first, last
1971
for (int j = 0; j < RULES_OPTIONS_[i].m_subOptions_.length; j ++) {
1972
String subname = RULES_OPTIONS_[i].m_subOptions_[j];
1973
int size = optionarg + subname.length();
1974
if (m_rules_.length() > size
1975
&& subname.equalsIgnoreCase(m_rules_.substring(optionarg,
1977
// indirect index derived from the option index and the sub-option index
m_parsedToken_.m_indirectIndex_ = (char)(i - 10 + (j << 1));
1978
return TOKEN_SUCCESS_MASK_ | TOKEN_TOP_MASK_;
1981
throwParseException(m_rules_, optionarg);
1983
else if(i == 13 || i == 14) { // copy and remove are handled before normalization
1984
// we need to move end here
1985
// scan for the ']' that balances the option's '[' (sets may nest brackets)
int noOpenBraces = 1;
1986
m_current_++; // skip opening brace
1987
while(m_current_ < m_source_.length() && noOpenBraces != 0) {
1988
if(m_source_.charAt(m_current_) == '[') {
1990
} else if(m_source_.charAt(m_current_) == ']') { // closing brace
1995
m_optionEnd_ = m_current_-1;
1996
return TOKEN_SUCCESS_MASK_;
1999
throwParseException(m_rules_, optionarg);
2001
return TOKEN_SUCCESS_MASK_; // we will never reach here.
2005
* Set collation option
2006
* @param optionset option set to set
2007
* @param attribute type to set
2008
* @param value attribute value
2010
private void setOptions(OptionSet optionset, int attribute, int value)
2012
switch (attribute) {
2013
case RuleBasedCollator.Attribute.HIRAGANA_QUATERNARY_MODE_ :
2014
optionset.m_isHiragana4_
2015
= (value == RuleBasedCollator.AttributeValue.ON_);
2017
case RuleBasedCollator.Attribute.FRENCH_COLLATION_ :
2018
optionset.m_isFrenchCollation_
2019
= (value == RuleBasedCollator.AttributeValue.ON_);
2021
case RuleBasedCollator.Attribute.ALTERNATE_HANDLING_ :
2022
optionset.m_isAlternateHandlingShifted_
2024
== RuleBasedCollator.AttributeValue.SHIFTED_);
2026
case RuleBasedCollator.Attribute.CASE_FIRST_ :
2027
optionset.m_caseFirst_ = value;
2029
case RuleBasedCollator.Attribute.CASE_LEVEL_ :
2030
optionset.m_isCaseLevel_
2031
= (value == RuleBasedCollator.AttributeValue.ON_);
2033
case RuleBasedCollator.Attribute.NORMALIZATION_MODE_ :
2034
if (value == RuleBasedCollator.AttributeValue.ON_) {
2035
value = Collator.CANONICAL_DECOMPOSITION;
2037
optionset.m_decomposition_ = value;
2039
case RuleBasedCollator.Attribute.STRENGTH_ :
2040
optionset.m_strength_ = value;
2047
// Tokenizes the rules with parseNextToken and, for every token that is not
// a reset, adds the canonically equivalent forms of the token's characters
// (as produced by CanonicalIterator) to a UnicodeSet, skipping forms for
// which the FCD quick check answers NO.
// NOTE(review): this listing appears to be a lossy extraction. The bare
// integer lines look like original line numbers and several lines are
// absent (the declaration of `pattern`, the statement taken at end of
// rules, braces, the return). Restore from the upstream file before
// compiling or refactoring.
UnicodeSet getTailoredSet() throws ParseException
2049
boolean startOfRules = true;
2050
UnicodeSet tailored = new UnicodeSet();
2052
// one iterator is reused for every token; its source is replaced per token
CanonicalIterator it = new CanonicalIterator("");
2054
m_parsedToken_.m_strength_ = TOKEN_UNSET_;
2055
int sourcelimit = m_source_.length();
2056
//int expandNext = 0;
2058
while (m_current_ < sourcelimit) {
2059
m_parsedToken_.m_prefixOffset_ = 0;
2060
if (parseNextToken(startOfRules) < 0) {
2061
// we have reached the end
2064
startOfRules = false;
2065
// The idea is to tokenize the rule set. For each non-reset token,
2066
// we add all the canonicaly equivalent FCD sequences
2067
if(m_parsedToken_.m_strength_ != TOKEN_RESET_) {
2068
it.setSource(m_source_.substring(
2069
m_parsedToken_.m_charsOffset_,
2070
m_parsedToken_.m_charsOffset_+m_parsedToken_.m_charsLen_));
2071
pattern = it.next();
2072
// it.next() yields null once all equivalent forms have been produced
while(pattern != null) {
2073
if(Normalizer.quickCheck(pattern, Normalizer.FCD,0) != Normalizer.NO) {
2074
tailored.add(pattern);
2076
pattern = it.next();
2083
final private void extractSetsFromRules(String rules) throws ParseException {
2084
int optionNumber = -1;
2087
while(i < rules.length()) {
2088
if(rules.charAt(i) == 0x005B) {
2089
optionNumber = readOption(rules, i+1, rules.length());
2090
setStart = m_optionarg_;
2091
if(optionNumber == 13) { /* copy - parts of UCA to tailoring */
2092
UnicodeSet newSet = readAndSetUnicodeSet(rules, setStart);
2093
if(m_copySet_ == null) {
2094
m_copySet_ = newSet;
2096
m_copySet_.addAll(newSet);
2098
} else if(optionNumber == 14) {
2099
UnicodeSet newSet = readAndSetUnicodeSet(rules, setStart);
2100
if(m_removeSet_ == null) {
2101
m_removeSet_ = newSet;
2103
m_removeSet_.addAll(newSet);