~ubuntu-branches/ubuntu/vivid/icu4j-4.4/vivid

« back to all changes in this revision

Viewing changes to main/classes/collate/src/com/ibm/icu/text/CollationRuleParser.java

  • Committer: Bazaar Package Importer
  • Author(s): Niels Thykier
  • Date: 2011-08-02 15:50:33 UTC
  • Revision ID: james.westby@ubuntu.com-20110802155033-itjzsl21y2lqdonn
Tags: upstream-4.4.2
Import upstream version 4.4.2

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
/**
 
2
*******************************************************************************
 
3
* Copyright (C) 1996-2010, International Business Machines Corporation and    *
 
4
* others. All Rights Reserved.                                                *
 
5
*******************************************************************************
 
6
*/
 
7
package com.ibm.icu.text;
 
8
 
 
9
import java.text.ParseException;
 
10
import java.util.Arrays;
 
11
import java.util.Hashtable;
 
12
 
 
13
import com.ibm.icu.impl.UCharacterProperty;
 
14
import com.ibm.icu.lang.UCharacter;
 
15
 
 
16
/**
 
17
* Class for parsing collation rules, produces a list of tokens that will be
 
18
* turned into collation elements
 
19
* @author Syn Wee Quek
 
20
* @since release 2.2, June 7 2002
 
21
*/
 
22
final class CollationRuleParser
 
23
{
 
24
    // public data members ---------------------------------------------------
 
25
 
 
26
    // package private constructors ------------------------------------------
 
27
 
 
28
    /**
 
29
     * <p>RuleBasedCollator constructor that takes the rules.
 
30
     * Please see RuleBasedCollator class description for more details on the
 
31
     * collation rule syntax.</p>
 
32
     * @see java.util.Locale
 
33
     * @param rules the collation rules to build the collation table from.
 
34
     * @exception ParseException thrown when argument rules have an invalid
 
35
     *            syntax.
 
36
     */
 
37
    CollationRuleParser(String rules) throws ParseException
 
38
    {
 
39
        extractSetsFromRules(rules);
 
40
        m_source_ = new StringBuilder(Normalizer.decompose(rules, false).trim());
 
41
        m_rules_ = m_source_.toString();
 
42
        m_current_ = 0;
 
43
        m_extraCurrent_ = m_source_.length();
 
44
        m_variableTop_ = null;
 
45
        m_parsedToken_ = new ParsedToken();
 
46
        m_hashTable_ = new Hashtable<Token, Token>();
 
47
        m_options_ = new OptionSet(RuleBasedCollator.UCA_);
 
48
        m_listHeader_ = new TokenListHeader[512];
 
49
        m_resultLength_ = 0;
 
50
        m_prevStrength_ = TOKEN_UNSET_;
 
51
        // call assembleTokenList() manually, so that we can
 
52
        // init a parser and manually parse tokens
 
53
        //assembleTokenList();
 
54
    }
 
55
 
 
56
    // package private inner classes -----------------------------------------
 
57
 
 
58
    /**
 
59
     * Collation options set
 
60
     */
 
61
    static class OptionSet
 
62
    {
 
63
        // package private constructor ---------------------------------------
 
64
 
 
65
        /**
 
66
         * Initializes the option set with the argument collators
 
67
         * @param collator option to use
 
68
         */
 
69
        OptionSet(RuleBasedCollator collator)
 
70
        {
 
71
            m_variableTopValue_ = collator.m_variableTopValue_;
 
72
            m_isFrenchCollation_ = collator.isFrenchCollation();
 
73
            m_isAlternateHandlingShifted_
 
74
                                   = collator.isAlternateHandlingShifted();
 
75
            m_caseFirst_ = collator.m_caseFirst_;
 
76
            m_isCaseLevel_ = collator.isCaseLevel();
 
77
            m_decomposition_ = collator.getDecomposition();
 
78
            m_strength_ = collator.getStrength();
 
79
            m_isHiragana4_ = collator.m_isHiragana4_;
 
80
        }
 
81
 
 
82
        // package private data members --------------------------------------
 
83
 
 
84
        int m_variableTopValue_;
 
85
        boolean m_isFrenchCollation_;
 
86
        /**
 
87
         * Attribute for handling variable elements
 
88
         */
 
89
        boolean m_isAlternateHandlingShifted_;
 
90
        /**
 
91
         * who goes first, lower case or uppercase
 
92
         */
 
93
        int m_caseFirst_;
 
94
        /**
 
95
         * do we have an extra case level
 
96
         */
 
97
        boolean m_isCaseLevel_;
 
98
        /**
 
99
         * attribute for normalization
 
100
         */
 
101
        int m_decomposition_;
 
102
        /**
 
103
         * attribute for strength
 
104
         */
 
105
        int m_strength_;
 
106
        /**
 
107
         * attribute for special Hiragana
 
108
         */
 
109
        boolean m_isHiragana4_;
 
110
    }
 
111
 
 
112
    /**
 
113
     * List of tokens used by the collation rules
 
114
     */
 
115
    static class TokenListHeader
    {
        // Tokens form a doubly-linked list (see Token.m_previous_/m_next_);
        // this header tracks its end points and the reset it hangs off.
        Token m_first_;            // first token in the list
        Token m_last_;             // last token in the list
        Token m_reset_;            // token for the reset position of this list
        boolean m_indirect_;       // true when the reset is an indirect position, e.g. [top]
        // CE / continuation-CE values bracketing this list.
        // NOTE(review): exact semantics of base/next/previous pairs are set
        // by code outside this view — confirm against the full parser.
        int m_baseCE_;
        int m_baseContCE_;
        int m_nextCE_;
        int m_nextContCE_;
        int m_previousCE_;
        int m_previousContCE_;
        // Per-strength bookkeeping arrays, sized by collation strength.
        int m_pos_[] = new int[Collator.IDENTICAL + 1];
        int m_gapsLo_[] = new int[3 * (Collator.TERTIARY + 1)];
        int m_gapsHi_[] = new int[3 * (Collator.TERTIARY + 1)];
        int m_numStr_[] = new int[3 * (Collator.TERTIARY + 1)];
        Token m_fStrToken_[] = new Token[Collator.TERTIARY + 1];
        Token m_lStrToken_[] = new Token[Collator.TERTIARY + 1];
    }
 
134
 
 
135
    /**
 
136
     * Token wrapper for collation rules
 
137
     */
 
138
    static class Token
 
139
    {
 
140
       // package private data members ---------------------------------------
 
141
 
 
142
       int m_CE_[];
 
143
       int m_CELength_;
 
144
       int m_expCE_[];
 
145
       int m_expCELength_;
 
146
       int m_source_;
 
147
       int m_expansion_;
 
148
       int m_prefix_;
 
149
       int m_strength_;
 
150
       int m_toInsert_;
 
151
       int m_polarity_; // 1 for <, <<, <<<, , ; and 0 for >, >>, >>>
 
152
       TokenListHeader m_listHeader_;
 
153
       Token m_previous_;
 
154
       Token m_next_;
 
155
       StringBuilder m_rules_;
 
156
       char m_flags_;
 
157
 
 
158
       // package private constructors ---------------------------------------
 
159
 
 
160
       Token()
 
161
       {
 
162
           m_CE_ = new int[128];
 
163
           m_expCE_ = new int[128];
 
164
           // TODO: this should also handle reverse
 
165
           m_polarity_ = TOKEN_POLARITY_POSITIVE_;
 
166
           m_next_ = null;
 
167
           m_previous_ = null;
 
168
           m_CELength_ = 0;
 
169
           m_expCELength_ = 0;
 
170
       }
 
171
 
 
172
       // package private methods --------------------------------------------
 
173
 
 
174
       /**
 
175
        * Hashcode calculation for token
 
176
        * @return the hashcode
 
177
        */
 
178
       public int hashCode()
 
179
       {
 
180
           int result = 0;
 
181
           int len = (m_source_ & 0xFF000000) >>> 24;
 
182
           int inc = ((len - 32) / 32) + 1;
 
183
 
 
184
           int start = m_source_ & 0x00FFFFFF;
 
185
           int limit = start + len;
 
186
 
 
187
           while (start < limit) {
 
188
               result = (result * 37) + m_rules_.charAt(start);
 
189
               start += inc;
 
190
           }
 
191
           return result;
 
192
       }
 
193
 
 
194
       /**
 
195
        * Equals calculation
 
196
        * @param target object to compare
 
197
        * @return true if target is the same as this object
 
198
        */
 
199
       public boolean equals(Object target)
 
200
       {
 
201
           if (target == this) {
 
202
               return true;
 
203
           }
 
204
           if (target instanceof Token) {
 
205
               Token t = (Token)target;
 
206
               int sstart = m_source_ & 0x00FFFFFF;
 
207
               int tstart = t.m_source_ & 0x00FFFFFF;
 
208
               int slimit = (m_source_ & 0xFF000000) >> 24;
 
209
               int tlimit = (m_source_ & 0xFF000000) >> 24;
 
210
 
 
211
               int end = sstart + slimit - 1;
 
212
 
 
213
               if (m_source_ == 0 || t.m_source_ == 0) {
 
214
                   return false;
 
215
               }
 
216
               if (slimit != tlimit) {
 
217
                   return false;
 
218
               }
 
219
               if (m_source_ == t.m_source_) {
 
220
                   return true;
 
221
               }
 
222
 
 
223
               while (sstart < end
 
224
                      && m_rules_.charAt(sstart) == t.m_rules_.charAt(tstart))
 
225
               {
 
226
                   ++ sstart;
 
227
                   ++ tstart;
 
228
               }
 
229
               if (m_rules_.charAt(sstart) == t.m_rules_.charAt(tstart)) {
 
230
                   return true;
 
231
               }
 
232
           }
 
233
           return false;
 
234
        }
 
235
    }
 
236
 
 
237
    // package private data member -------------------------------------------
 
238
 
 
239
    /**
     * Strength value marking a token as a reset, ie &amp; in the rules
     */
    static final int TOKEN_RESET_ = 0xDEADBEEF;

    /**
     * Number of token list headers in use in m_listHeader_
     */
    int m_resultLength_;
    /**
     * List of parsed tokens, one header per reset point
     */
    TokenListHeader m_listHeader_[];
    /**
     * Variable top token
     */
    Token m_variableTop_;
    /**
     * Collation options
     */
    OptionSet m_options_;
    /**
     * Normalized collation rules with some extra characters appended
     */
    StringBuilder m_source_;
    /**
     * Hash table to keep all tokens; maps each token to itself so that an
     * equal lookup token retrieves the canonical instance
     */
    Hashtable<Token, Token> m_hashTable_;
 
268
 
 
269
    // package private method ------------------------------------------------
 
270
 
 
271
    void setDefaultOptionsInCollator(RuleBasedCollator collator)
 
272
    {
 
273
        collator.m_defaultStrength_ = m_options_.m_strength_;
 
274
        collator.m_defaultDecomposition_ = m_options_.m_decomposition_;
 
275
        collator.m_defaultIsFrenchCollation_ = m_options_.m_isFrenchCollation_;
 
276
        collator.m_defaultIsAlternateHandlingShifted_
 
277
                                    = m_options_.m_isAlternateHandlingShifted_;
 
278
        collator.m_defaultIsCaseLevel_ = m_options_.m_isCaseLevel_;
 
279
        collator.m_defaultCaseFirst_ = m_options_.m_caseFirst_;
 
280
        collator.m_defaultIsHiragana4_ = m_options_.m_isHiragana4_;
 
281
        collator.m_defaultVariableTopValue_ = m_options_.m_variableTopValue_;
 
282
    }
 
283
 
 
284
    // private inner classes -------------------------------------------------
 
285
 
 
286
    /**
 
287
     * This is a token that has been parsed but not yet processed. Used to
 
288
     * reduce the number of arguments in the parser
 
289
     */
 
290
    private static class ParsedToken
 
291
    {
 
292
        // private constructor ----------------------------------------------
 
293
 
 
294
        /**
 
295
         * Empty constructor
 
296
         */
 
297
        ParsedToken()
 
298
        {
 
299
            m_charsLen_ = 0;
 
300
            m_charsOffset_ = 0;
 
301
            m_extensionLen_ = 0;
 
302
            m_extensionOffset_ = 0;
 
303
            m_prefixLen_ = 0;
 
304
            m_prefixOffset_ = 0;
 
305
            m_flags_ = 0;
 
306
            m_strength_ = TOKEN_UNSET_;
 
307
        }
 
308
 
 
309
        // private data members ---------------------------------------------
 
310
 
 
311
        int m_strength_;
 
312
        int m_charsOffset_;
 
313
        int m_charsLen_;
 
314
        int m_extensionOffset_;
 
315
        int m_extensionLen_;
 
316
        int m_prefixOffset_;
 
317
        int m_prefixLen_;
 
318
        char m_flags_;
 
319
        char m_indirectIndex_;
 
320
    }
 
321
 
 
322
    /**
 
323
     * Boundary wrappers
 
324
     */
 
325
    private static class IndirectBoundaries
 
326
    {
 
327
        // package private constructor ---------------------------------------
 
328
 
 
329
        IndirectBoundaries(int startce[], int limitce[])
 
330
        {
 
331
            // Set values for the top - TODO: once we have values for all the
 
332
            // indirects, we are going to initalize here.
 
333
            m_startCE_ = startce[0];
 
334
            m_startContCE_ = startce[1];
 
335
            if (limitce != null) {
 
336
                m_limitCE_ = limitce[0];
 
337
                m_limitContCE_ = limitce[1];
 
338
            }
 
339
            else {
 
340
                m_limitCE_ = 0;
 
341
                m_limitContCE_ = 0;
 
342
            }
 
343
        }
 
344
 
 
345
        // package private data members --------------------------------------
 
346
 
 
347
        int m_startCE_;
 
348
        int m_startContCE_;
 
349
        int m_limitCE_;
 
350
        int m_limitContCE_;
 
351
    }
 
352
 
 
353
    /**
 
354
     * Collation option rule tag
 
355
     */
 
356
    private static class TokenOption
 
357
    {
 
358
        // package private constructor ---------------------------------------
 
359
 
 
360
        TokenOption(String name, int attribute, String suboptions[],
 
361
                    int suboptionattributevalue[])
 
362
        {
 
363
            m_name_ = name;
 
364
            m_attribute_ = attribute;
 
365
            m_subOptions_ = suboptions;
 
366
            m_subOptionAttributeValues_ = suboptionattributevalue;
 
367
        }
 
368
 
 
369
        // package private data member ---------------------------------------
 
370
 
 
371
        private String m_name_;
 
372
        private int m_attribute_;
 
373
        private String m_subOptions_[];
 
374
        private int m_subOptionAttributeValues_[];
 
375
    }
 
376
 
 
377
    // private variables -----------------------------------------------------
 
378
 
 
379
    /**
     * Current parsed token
     */
    private ParsedToken m_parsedToken_;
    /**
     * Collation rule string; immutable snapshot of m_source_
     */
    private String m_rules_;
    /** Current parse position within the rules */
    private int m_current_;
    /**
     * End of the option while reading.
     * Need it for UnicodeSet reading support.
     */
    private int m_optionEnd_;
    /*
     * Current offset in m_source
     */
    //private int m_sourceLimit_;
    /**
     * Offset to m_source_ for the extra expansion characters
     */
    private int m_extraCurrent_;

    /**
     * UnicodeSet that contains code points to be copied from the UCA
     */
    UnicodeSet m_copySet_;

    /**
     * UnicodeSet that contains code points for which we want to remove
     * UCA contractions. It implies copying of these code points from
     * the UCA.
     */
    UnicodeSet m_removeSet_;
    /**
     * Stores the previous token's strength when making a list of same level
     * differences.
     */
    private int m_prevStrength_;

    /*
     * This is space for the extra strings that need to be unquoted during the
     * parsing of the rules
     */
    //private static final int TOKEN_EXTRA_RULE_SPACE_SIZE_ = 2048;
    /**
     * Indicator that the token is not set yet
     */
    private static final int TOKEN_UNSET_ = 0xFFFFFFFF;
    /*
     * Indicator that the rule is in the > polarity, ie everything on the
     * right of the rule is less than
     */
    //private static final int TOKEN_POLARITY_NEGATIVE_ = 0;
    /**
     * Indicator that the rule is in the < polarity, ie everything on the
     * right of the rule is greater than
     */
    private static final int TOKEN_POLARITY_POSITIVE_ = 1;
    /**
     * Flag mask to determine if top is set
     */
    private static final int TOKEN_TOP_MASK_ = 0x04;
    /**
     * Flag mask to determine if variable top is set
     */
    private static final int TOKEN_VARIABLE_TOP_MASK_ = 0x08;
    /**
     * Flag mask to determine if a before attribute is set
     */
    private static final int TOKEN_BEFORE_ = 0x03;
    /**
     * Flag mask used while parsing token options to mark success
     */
    private static final int TOKEN_SUCCESS_MASK_ = 0x10;

    /**
     * These values are used for finding CE values for indirect positioning.
     * Indirect positioning is a mechanism for allowing resets on symbolic
     * values. It only works for resets and you cannot tailor indirect names.
     * An indirect name can define either an anchor point or a range. An anchor
     * point behaves in exactly the same way as a code point in reset would,
     * except that it cannot be tailored. A range (we currently only know of
     * the [top] range) will explicitly set the upper bound for generated CEs,
     * thus allowing for better control over how many CEs can be squeezed
     * between in the range without performance penalty. In that respect, we use
     * [top] for tailoring of locales that use CJK characters. Other indirect
     * values are currently a pure convenience, they can be used to assure that
     * the CEs will be always positioned in the same place relative to a point
     * with known properties (e.g. first primary ignorable).
     */
    private static final IndirectBoundaries INDIRECT_BOUNDARIES_[];

//    /**
//     * Inverse UCA constants
//     */
//    private static final int INVERSE_SIZE_MASK_ = 0xFFF00000;
//    private static final int INVERSE_OFFSET_MASK_ = 0x000FFFFF;
//    private static final int INVERSE_SHIFT_VALUE_ = 20;

    /**
     * Collation option tags
     * [last variable] last variable value
     * [last primary ignorable] largest CE for primary ignorable
     * [last secondary ignorable] largest CE for secondary ignorable
     * [last tertiary ignorable] largest CE for tertiary ignorable
     * [top] guaranteed to be above all implicit CEs, for now and in the future (in 1.8)
     */
    private static final TokenOption RULES_OPTIONS_[];
 
488
 
 
489
    static
 
490
    {
 
491
        INDIRECT_BOUNDARIES_ = new IndirectBoundaries[15];
 
492
        // UCOL_RESET_TOP_VALUE
 
493
        INDIRECT_BOUNDARIES_[0] = new IndirectBoundaries(
 
494
                        RuleBasedCollator.UCA_CONSTANTS_.LAST_NON_VARIABLE_,
 
495
                        RuleBasedCollator.UCA_CONSTANTS_.FIRST_IMPLICIT_);
 
496
        // UCOL_FIRST_PRIMARY_IGNORABLE
 
497
        INDIRECT_BOUNDARIES_[1] = new IndirectBoundaries(
 
498
                    RuleBasedCollator.UCA_CONSTANTS_.FIRST_PRIMARY_IGNORABLE_,
 
499
                    null);
 
500
        // UCOL_LAST_PRIMARY_IGNORABLE
 
501
        INDIRECT_BOUNDARIES_[2] = new IndirectBoundaries(
 
502
                    RuleBasedCollator.UCA_CONSTANTS_.LAST_PRIMARY_IGNORABLE_,
 
503
                    null);
 
504
 
 
505
        // UCOL_FIRST_SECONDARY_IGNORABLE
 
506
        INDIRECT_BOUNDARIES_[3] = new IndirectBoundaries(
 
507
                   RuleBasedCollator.UCA_CONSTANTS_.FIRST_SECONDARY_IGNORABLE_,
 
508
                   null);
 
509
        // UCOL_LAST_SECONDARY_IGNORABLE
 
510
        INDIRECT_BOUNDARIES_[4] = new IndirectBoundaries(
 
511
                   RuleBasedCollator.UCA_CONSTANTS_.LAST_SECONDARY_IGNORABLE_,
 
512
                   null);
 
513
        // UCOL_FIRST_TERTIARY_IGNORABLE
 
514
        INDIRECT_BOUNDARIES_[5] = new IndirectBoundaries(
 
515
                   RuleBasedCollator.UCA_CONSTANTS_.FIRST_TERTIARY_IGNORABLE_,
 
516
                   null);
 
517
        // UCOL_LAST_TERTIARY_IGNORABLE
 
518
        INDIRECT_BOUNDARIES_[6] = new IndirectBoundaries(
 
519
                   RuleBasedCollator.UCA_CONSTANTS_.LAST_TERTIARY_IGNORABLE_,
 
520
                   null);
 
521
        // UCOL_FIRST_VARIABLE;
 
522
        INDIRECT_BOUNDARIES_[7] = new IndirectBoundaries(
 
523
                   RuleBasedCollator.UCA_CONSTANTS_.FIRST_VARIABLE_,
 
524
                   null);
 
525
        // UCOL_LAST_VARIABLE
 
526
        INDIRECT_BOUNDARIES_[8] = new IndirectBoundaries(
 
527
                   RuleBasedCollator.UCA_CONSTANTS_.LAST_VARIABLE_,
 
528
                   null);
 
529
        // UCOL_FIRST_NON_VARIABLE
 
530
        INDIRECT_BOUNDARIES_[9] = new IndirectBoundaries(
 
531
                   RuleBasedCollator.UCA_CONSTANTS_.FIRST_NON_VARIABLE_,
 
532
                   null);
 
533
        // UCOL_LAST_NON_VARIABLE
 
534
        INDIRECT_BOUNDARIES_[10] = new IndirectBoundaries(
 
535
                   RuleBasedCollator.UCA_CONSTANTS_.LAST_NON_VARIABLE_,
 
536
                   RuleBasedCollator.UCA_CONSTANTS_.FIRST_IMPLICIT_);
 
537
        // UCOL_FIRST_IMPLICIT
 
538
        INDIRECT_BOUNDARIES_[11] = new IndirectBoundaries(
 
539
                   RuleBasedCollator.UCA_CONSTANTS_.FIRST_IMPLICIT_,
 
540
                   null);
 
541
        // UCOL_LAST_IMPLICIT
 
542
        INDIRECT_BOUNDARIES_[12] = new IndirectBoundaries(
 
543
                   RuleBasedCollator.UCA_CONSTANTS_.LAST_IMPLICIT_,
 
544
                   RuleBasedCollator.UCA_CONSTANTS_.FIRST_TRAILING_);
 
545
        // UCOL_FIRST_TRAILING
 
546
        INDIRECT_BOUNDARIES_[13] = new IndirectBoundaries(
 
547
                   RuleBasedCollator.UCA_CONSTANTS_.FIRST_TRAILING_,
 
548
                   null);
 
549
        // UCOL_LAST_TRAILING
 
550
        INDIRECT_BOUNDARIES_[14] = new IndirectBoundaries(
 
551
                   RuleBasedCollator.UCA_CONSTANTS_.LAST_TRAILING_,
 
552
                   null);
 
553
        INDIRECT_BOUNDARIES_[14].m_limitCE_
 
554
                 = RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_SPECIAL_MIN_ << 24;
 
555
 
 
556
        RULES_OPTIONS_ = new TokenOption[19];
 
557
        String option[] = {"non-ignorable", "shifted"};
 
558
        int value[] = {RuleBasedCollator.AttributeValue.NON_IGNORABLE_,
 
559
                       RuleBasedCollator.AttributeValue.SHIFTED_};
 
560
        RULES_OPTIONS_[0] = new TokenOption("alternate",
 
561
                              RuleBasedCollator.Attribute.ALTERNATE_HANDLING_,
 
562
                              option, value);
 
563
        option = new String[1];
 
564
        option[0] = "2";
 
565
        value = new int[1];
 
566
        value[0] = RuleBasedCollator.AttributeValue.ON_;
 
567
        RULES_OPTIONS_[1] = new TokenOption("backwards",
 
568
                                 RuleBasedCollator.Attribute.FRENCH_COLLATION_,
 
569
                                 option, value);
 
570
        String offonoption[] = new String[2];
 
571
        offonoption[0] = "off";
 
572
        offonoption[1] = "on";
 
573
        int offonvalue[] = new int[2];
 
574
        offonvalue[0] = RuleBasedCollator.AttributeValue.OFF_;
 
575
        offonvalue[1] = RuleBasedCollator.AttributeValue.ON_;
 
576
        RULES_OPTIONS_[2] = new TokenOption("caseLevel",
 
577
                                       RuleBasedCollator.Attribute.CASE_LEVEL_,
 
578
                                       offonoption, offonvalue);
 
579
        option = new String[3];
 
580
        option[0] = "lower";
 
581
        option[1] = "upper";
 
582
        option[2] = "off";
 
583
        value = new int[3];
 
584
        value[0] = RuleBasedCollator.AttributeValue.LOWER_FIRST_;
 
585
        value[1] = RuleBasedCollator.AttributeValue.UPPER_FIRST_;
 
586
        value[2] = RuleBasedCollator.AttributeValue.OFF_;
 
587
        RULES_OPTIONS_[3] = new TokenOption("caseFirst",
 
588
                                       RuleBasedCollator.Attribute.CASE_FIRST_,
 
589
                                       option, value);
 
590
        RULES_OPTIONS_[4] = new TokenOption("normalization",
 
591
                               RuleBasedCollator.Attribute.NORMALIZATION_MODE_,
 
592
                               offonoption, offonvalue);
 
593
        RULES_OPTIONS_[5] = new TokenOption("hiraganaQ",
 
594
                         RuleBasedCollator.Attribute.HIRAGANA_QUATERNARY_MODE_,
 
595
                         offonoption, offonvalue);
 
596
        option = new String[5];
 
597
        option[0] = "1";
 
598
        option[1] = "2";
 
599
        option[2] = "3";
 
600
        option[3] = "4";
 
601
        option[4] = "I";
 
602
        value = new int[5];
 
603
        value[0] = RuleBasedCollator.AttributeValue.PRIMARY_;
 
604
        value[1] = RuleBasedCollator.AttributeValue.SECONDARY_;
 
605
        value[2] = RuleBasedCollator.AttributeValue.TERTIARY_;
 
606
        value[3] = RuleBasedCollator.AttributeValue.QUATERNARY_;
 
607
        value[4] = RuleBasedCollator.AttributeValue.IDENTICAL_;
 
608
        RULES_OPTIONS_[6] = new TokenOption("strength",
 
609
                                         RuleBasedCollator.Attribute.STRENGTH_,
 
610
                                         option, value);
 
611
        RULES_OPTIONS_[7] = new TokenOption("variable top",
 
612
                                  RuleBasedCollator.Attribute.LIMIT_,
 
613
                                  null, null);
 
614
        RULES_OPTIONS_[8] = new TokenOption("rearrange",
 
615
                                  RuleBasedCollator.Attribute.LIMIT_,
 
616
                                  null, null);
 
617
        option = new String[3];
 
618
        option[0] = "1";
 
619
        option[1] = "2";
 
620
        option[2] = "3";
 
621
        value = new int[3];
 
622
        value[0] = RuleBasedCollator.AttributeValue.PRIMARY_;
 
623
        value[1] = RuleBasedCollator.AttributeValue.SECONDARY_;
 
624
        value[2] = RuleBasedCollator.AttributeValue.TERTIARY_;
 
625
        RULES_OPTIONS_[9] = new TokenOption("before",
 
626
                                  RuleBasedCollator.Attribute.LIMIT_,
 
627
                                  option, value);
 
628
        RULES_OPTIONS_[10] = new TokenOption("top",
 
629
                                  RuleBasedCollator.Attribute.LIMIT_,
 
630
                                  null, null);
 
631
        String firstlastoption[] = new String[7];
 
632
        firstlastoption[0] = "primary";
 
633
        firstlastoption[1] = "secondary";
 
634
        firstlastoption[2] = "tertiary";
 
635
        firstlastoption[3] = "variable";
 
636
        firstlastoption[4] = "regular";
 
637
        firstlastoption[5] = "implicit";
 
638
        firstlastoption[6] = "trailing";
 
639
 
 
640
        int firstlastvalue[] = new int[7];
 
641
        Arrays.fill(firstlastvalue, RuleBasedCollator.AttributeValue.PRIMARY_);
 
642
 
 
643
        RULES_OPTIONS_[11] = new TokenOption("first",
 
644
                                  RuleBasedCollator.Attribute.LIMIT_,
 
645
                                  firstlastoption, firstlastvalue);
 
646
        RULES_OPTIONS_[12] = new TokenOption("last",
 
647
                                  RuleBasedCollator.Attribute.LIMIT_,
 
648
                                  firstlastoption, firstlastvalue);
 
649
        RULES_OPTIONS_[13] = new TokenOption("optimize",
 
650
                                  RuleBasedCollator.Attribute.LIMIT_,
 
651
                                  null, null);
 
652
        RULES_OPTIONS_[14] = new TokenOption("suppressContractions",
 
653
                                  RuleBasedCollator.Attribute.LIMIT_,
 
654
                                  null, null);
 
655
        RULES_OPTIONS_[15] = new TokenOption("undefined",
 
656
                                  RuleBasedCollator.Attribute.LIMIT_,
 
657
                                  null, null);
 
658
        RULES_OPTIONS_[16] = new TokenOption("scriptOrder",
 
659
                                  RuleBasedCollator.Attribute.LIMIT_,
 
660
                                  null, null);
 
661
        RULES_OPTIONS_[17] = new TokenOption("charsetname",
 
662
                                  RuleBasedCollator.Attribute.LIMIT_,
 
663
                                  null, null);
 
664
        RULES_OPTIONS_[18] = new TokenOption("charset",
 
665
                                  RuleBasedCollator.Attribute.LIMIT_,
 
666
                                  null, null);
 
667
    }
 
668
 
 
669
    /**
     * Utility data members
     */
    // Scratch token reused as the lookup key for m_hashTable_.
    private Token m_utilToken_ = new Token();
    // Collation element iterator over the UCA, reset and reused for CE
    // lookups during parsing.
    private CollationElementIterator m_UCAColEIter_
                      = RuleBasedCollator.UCA_.getCollationElementIterator("");
    // Two-entry scratch buffer — presumably a CE/continuation-CE pair;
    // confirm against the code that fills it (outside this view).
    private int m_utilCEBuffer_[] = new int[2];
 
676
 
 
677
    // private methods -------------------------------------------------------
 
678
 
 
679
    /**
 
680
     * Assembles the token list
 
681
     * @exception ParseException thrown when rules syntax fails
 
682
     */
 
683
    int assembleTokenList() throws ParseException
 
684
    {
 
685
        Token lastToken = null;
 
686
        m_parsedToken_.m_strength_ = TOKEN_UNSET_;
 
687
        int sourcelimit = m_source_.length();
 
688
        int expandNext = 0;
 
689
 
 
690
        while (m_current_ < sourcelimit) {
 
691
            m_parsedToken_.m_prefixOffset_ = 0;
 
692
            if (parseNextToken(lastToken == null) < 0) {
 
693
                // we have reached the end
 
694
                continue;
 
695
            }
 
696
            char specs = m_parsedToken_.m_flags_;
 
697
            boolean variableTop = ((specs & TOKEN_VARIABLE_TOP_MASK_) != 0);
 
698
            boolean top = ((specs & TOKEN_TOP_MASK_) != 0);
 
699
            int lastStrength = TOKEN_UNSET_;
 
700
            if (lastToken != null) {
 
701
                lastStrength = lastToken.m_strength_;
 
702
            }
 
703
            m_utilToken_.m_source_ = m_parsedToken_.m_charsLen_ << 24
 
704
                                             | m_parsedToken_.m_charsOffset_;
 
705
            m_utilToken_.m_rules_ = m_source_;
 
706
            // 4 Lookup each source in the CharsToToken map, and find a
 
707
            // sourcetoken
 
708
            Token sourceToken = m_hashTable_.get(m_utilToken_);
 
709
            if (m_parsedToken_.m_strength_ != TOKEN_RESET_) {
 
710
                if (lastToken == null) {
 
711
                    // this means that rules haven't started properly
 
712
                    throwParseException(m_source_.toString(), 0);
 
713
                }
 
714
                //  6 Otherwise (when relation != reset)
 
715
                if (sourceToken == null) {
 
716
                    // If sourceToken is null, create new one
 
717
                    sourceToken = new Token();
 
718
                     sourceToken.m_rules_ = m_source_;
 
719
                    sourceToken.m_source_ = m_parsedToken_.m_charsLen_ << 24
 
720
                                           | m_parsedToken_.m_charsOffset_;
 
721
                    sourceToken.m_prefix_ = m_parsedToken_.m_prefixLen_ << 24
 
722
                                           | m_parsedToken_.m_prefixOffset_;
 
723
                    // TODO: this should also handle reverse
 
724
                    sourceToken.m_polarity_ = TOKEN_POLARITY_POSITIVE_;
 
725
                    sourceToken.m_next_ = null;
 
726
                     sourceToken.m_previous_ = null;
 
727
                    sourceToken.m_CELength_ = 0;
 
728
                    sourceToken.m_expCELength_ = 0;
 
729
                    m_hashTable_.put(sourceToken, sourceToken);
 
730
                }
 
731
                else {
 
732
                    // we could have fished out a reset here
 
733
                    if (sourceToken.m_strength_ != TOKEN_RESET_
 
734
                        && lastToken != sourceToken) {
 
735
                        // otherwise remove sourceToken from where it was.
 
736
                        if (sourceToken.m_next_ != null) {
 
737
                            if (sourceToken.m_next_.m_strength_
 
738
                                                   > sourceToken.m_strength_) {
 
739
                                sourceToken.m_next_.m_strength_
 
740
                                                   = sourceToken.m_strength_;
 
741
                            }
 
742
                            sourceToken.m_next_.m_previous_
 
743
                                                    = sourceToken.m_previous_;
 
744
                        }
 
745
                        else {
 
746
                            sourceToken.m_listHeader_.m_last_
 
747
                                                    = sourceToken.m_previous_;
 
748
                        }
 
749
                        if (sourceToken.m_previous_ != null) {
 
750
                            sourceToken.m_previous_.m_next_
 
751
                                                        = sourceToken.m_next_;
 
752
                        }
 
753
                        else {
 
754
                            sourceToken.m_listHeader_.m_first_
 
755
                                                        = sourceToken.m_next_;
 
756
                        }
 
757
                        sourceToken.m_next_ = null;
 
758
                        sourceToken.m_previous_ = null;
 
759
                    }
 
760
                }
 
761
                sourceToken.m_strength_ = m_parsedToken_.m_strength_;
 
762
                sourceToken.m_listHeader_ = lastToken.m_listHeader_;
 
763
 
 
764
                // 1.  Find the strongest strength in each list, and set
 
765
                // strongestP and strongestN accordingly in the headers.
 
766
                if (lastStrength == TOKEN_RESET_
 
767
                    || sourceToken.m_listHeader_.m_first_ == null) {
 
768
                    // If LAST is a reset insert sourceToken in the list.
 
769
                    if (sourceToken.m_listHeader_.m_first_ == null) {
 
770
                        sourceToken.m_listHeader_.m_first_ = sourceToken;
 
771
                        sourceToken.m_listHeader_.m_last_ = sourceToken;
 
772
                    }
 
773
                    else { // we need to find a place for us
 
774
                           // and we'll get in front of the same strength
 
775
                        if (sourceToken.m_listHeader_.m_first_.m_strength_
 
776
                                                 <= sourceToken.m_strength_) {
 
777
                            sourceToken.m_next_
 
778
                                          = sourceToken.m_listHeader_.m_first_;
 
779
                            sourceToken.m_next_.m_previous_ = sourceToken;
 
780
                            sourceToken.m_listHeader_.m_first_ = sourceToken;
 
781
                            sourceToken.m_previous_ = null;
 
782
                        }
 
783
                        else {
 
784
                            lastToken = sourceToken.m_listHeader_.m_first_;
 
785
                            while (lastToken.m_next_ != null
 
786
                                   && lastToken.m_next_.m_strength_
 
787
                                                 > sourceToken.m_strength_) {
 
788
                                lastToken = lastToken.m_next_;
 
789
                            }
 
790
                            if (lastToken.m_next_ != null) {
 
791
                                lastToken.m_next_.m_previous_ = sourceToken;
 
792
                            }
 
793
                            else {
 
794
                                sourceToken.m_listHeader_.m_last_
 
795
                                                               = sourceToken;
 
796
                            }
 
797
                            sourceToken.m_previous_ = lastToken;
 
798
                            sourceToken.m_next_ = lastToken.m_next_;
 
799
                            lastToken.m_next_ = sourceToken;
 
800
                        }
 
801
                    }
 
802
                }
 
803
                else {
 
804
                    // Otherwise (when LAST is not a reset)
 
805
                    // if polarity (LAST) == polarity(relation), insert
 
806
                    // sourceToken after LAST, otherwise insert before.
 
807
                    // when inserting after or before, search to the next
 
808
                    // position with the same strength in that direction.
 
809
                    // (This is called postpone insertion).
 
810
                    if (sourceToken != lastToken) {
 
811
                        if (lastToken.m_polarity_ == sourceToken.m_polarity_) {
 
812
                            while (lastToken.m_next_ != null
 
813
                                   && lastToken.m_next_.m_strength_
 
814
                                                   > sourceToken.m_strength_) {
 
815
                                lastToken = lastToken.m_next_;
 
816
                            }
 
817
                            sourceToken.m_previous_ = lastToken;
 
818
                            if (lastToken.m_next_ != null) {
 
819
                                lastToken.m_next_.m_previous_ = sourceToken;
 
820
                            }
 
821
                            else {
 
822
                                sourceToken.m_listHeader_.m_last_ = sourceToken;
 
823
                            }
 
824
                            sourceToken.m_next_ = lastToken.m_next_;
 
825
                            lastToken.m_next_ = sourceToken;
 
826
                        }
 
827
                        else {
 
828
                            while (lastToken.m_previous_ != null
 
829
                                   && lastToken.m_previous_.m_strength_
 
830
                                                > sourceToken.m_strength_) {
 
831
                                lastToken = lastToken.m_previous_;
 
832
                            }
 
833
                            sourceToken.m_next_ = lastToken;
 
834
                            if (lastToken.m_previous_ != null) {
 
835
                                lastToken.m_previous_.m_next_ = sourceToken;
 
836
                            }
 
837
                            else {
 
838
                                sourceToken.m_listHeader_.m_first_
 
839
                                                                 = sourceToken;
 
840
                            }
 
841
                            sourceToken.m_previous_ = lastToken.m_previous_;
 
842
                            lastToken.m_previous_ = sourceToken;
 
843
                        }
 
844
                    }
 
845
                    else { // repeated one thing twice in rules, stay with the
 
846
                           // stronger strength
 
847
                        if (lastStrength < sourceToken.m_strength_) {
 
848
                            sourceToken.m_strength_ = lastStrength;
 
849
                        }
 
850
                    }
 
851
                }
 
852
                // if the token was a variable top, we're gonna put it in
 
853
                if (variableTop == true && m_variableTop_ == null) {
 
854
                    variableTop = false;
 
855
                    m_variableTop_ = sourceToken;
 
856
                }
 
857
                // Treat the expansions.
 
858
                // There are two types of expansions: explicit (x / y) and
 
859
                // reset based propagating expansions
 
860
                // (&abc * d * e <=> &ab * d / c * e / c)
 
861
                // if both of them are in effect for a token, they are combined.
 
862
               sourceToken.m_expansion_ = m_parsedToken_.m_extensionLen_ << 24
 
863
                                          | m_parsedToken_.m_extensionOffset_;
 
864
               if (expandNext != 0) {
 
865
                   if (sourceToken.m_strength_ == RuleBasedCollator.PRIMARY) {
 
866
                       // primary strength kills off the implicit expansion
 
867
                       expandNext = 0;
 
868
                   }
 
869
                   else if (sourceToken.m_expansion_ == 0) {
 
870
                       // if there is no expansion, implicit is just added to
 
871
                       // the token
 
872
                       sourceToken.m_expansion_ = expandNext;
 
873
                   }
 
874
                   else {
 
875
                       // there is both explicit and implicit expansion.
 
876
                       // We need to make a combination
 
877
                       int start = expandNext & 0xFFFFFF;
 
878
                       int size = expandNext >>> 24;
 
879
                       if (size > 0) {
 
880
                          m_source_.append(m_source_.substring(start,
 
881
                                                               start + size));
 
882
                       }
 
883
                          start = m_parsedToken_.m_extensionOffset_;
 
884
                       m_source_.append(m_source_.substring(start,
 
885
                                      start + m_parsedToken_.m_extensionLen_));
 
886
                       sourceToken.m_expansion_ = (size
 
887
                                       + m_parsedToken_.m_extensionLen_) << 24
 
888
                                       | m_extraCurrent_;
 
889
                       m_extraCurrent_ += size + m_parsedToken_.m_extensionLen_;
 
890
                   }
 
891
                }
 
892
               // if the previous token was a reset before, the strength of this
 
893
               // token must match the strength of before. Otherwise we have an
 
894
               // undefined situation.
 
895
               // In other words, we currently have a cludge which we use to
 
896
               // represent &a >> x. This is written as &[before 2]a << x.
 
897
               if((lastToken.m_flags_ & TOKEN_BEFORE_) != 0) {
 
898
                   int beforeStrength = (lastToken.m_flags_ & TOKEN_BEFORE_) - 1;
 
899
                   if(beforeStrength != sourceToken.m_strength_) {
 
900
                          throwParseException(m_source_.toString(), m_current_);
 
901
                   }
 
902
               }
 
903
 
 
904
            }
 
905
            else {
 
906
                if (lastToken != null && lastStrength == TOKEN_RESET_) {
 
907
                    // if the previous token was also a reset, this means that
 
908
                    // we have two consecutive resets and we want to remove the
 
909
                    // previous one if empty
 
910
                    if (m_resultLength_ > 0 && m_listHeader_[m_resultLength_ - 1].m_first_ == null) {
 
911
                        m_resultLength_ --;
 
912
                    }
 
913
                }
 
914
                if (sourceToken == null) {
 
915
                    // this is a reset, but it might still be somewhere in the
 
916
                    // tailoring, in shorter form
 
917
                    int searchCharsLen = m_parsedToken_.m_charsLen_;
 
918
                    while (searchCharsLen > 1 && sourceToken == null) {
 
919
                        searchCharsLen --;
 
920
                        // key = searchCharsLen << 24 | charsOffset;
 
921
                        m_utilToken_.m_source_ = searchCharsLen << 24
 
922
                                             | m_parsedToken_.m_charsOffset_;
 
923
                        m_utilToken_.m_rules_ = m_source_;
 
924
                        sourceToken = m_hashTable_.get(m_utilToken_);
 
925
                    }
 
926
                    if (sourceToken != null) {
 
927
                        expandNext = (m_parsedToken_.m_charsLen_
 
928
                                                      - searchCharsLen) << 24
 
929
                                        | (m_parsedToken_.m_charsOffset_
 
930
                                           + searchCharsLen);
 
931
                    }
 
932
                }
 
933
                if ((specs & TOKEN_BEFORE_) != 0) {
 
934
                    if (top == false) {
 
935
                        // we're doing before & there is no indirection
 
936
                        int strength = (specs & TOKEN_BEFORE_) - 1;
 
937
                        if (sourceToken != null
 
938
                            && sourceToken.m_strength_ != TOKEN_RESET_) {
 
939
                            // this is a before that is already ordered in the UCA
 
940
                            // - so we need to get the previous with good strength
 
941
                            while (sourceToken.m_strength_ > strength
 
942
                                   && sourceToken.m_previous_ != null) {
 
943
                                sourceToken = sourceToken.m_previous_;
 
944
                            }
 
945
                            // here, either we hit the strength or NULL
 
946
                            if (sourceToken.m_strength_ == strength) {
 
947
                                if (sourceToken.m_previous_ != null) {
 
948
                                    sourceToken = sourceToken.m_previous_;
 
949
                                }
 
950
                                else { // start of list
 
951
                                    sourceToken
 
952
                                         = sourceToken.m_listHeader_.m_reset_;
 
953
                                }
 
954
                            }
 
955
                            else { // we hit NULL, we should be doing the else part
 
956
                                sourceToken
 
957
                                         = sourceToken.m_listHeader_.m_reset_;
 
958
                                sourceToken = getVirginBefore(sourceToken,
 
959
                                                              strength);
 
960
                            }
 
961
                        }
 
962
                        else {
 
963
                            sourceToken
 
964
                                      = getVirginBefore(sourceToken, strength);
 
965
                        }
 
966
                    }
 
967
                    else {
 
968
                        // this is both before and indirection
 
969
                        top = false;
 
970
                        m_listHeader_[m_resultLength_] = new TokenListHeader();
 
971
                        m_listHeader_[m_resultLength_].m_previousCE_ = 0;
 
972
                        m_listHeader_[m_resultLength_].m_previousContCE_ = 0;
 
973
                        m_listHeader_[m_resultLength_].m_indirect_ = true;
 
974
                        // we need to do slightly more work. we need to get the
 
975
                        // baseCE using the inverse UCA & getPrevious. The next
 
976
                        // bound is not set, and will be decided in ucol_bld
 
977
                        int strength = (specs & TOKEN_BEFORE_) - 1;
 
978
                        int baseCE = INDIRECT_BOUNDARIES_[
 
979
                                   m_parsedToken_.m_indirectIndex_].m_startCE_;
 
980
                        int baseContCE = INDIRECT_BOUNDARIES_[
 
981
                               m_parsedToken_.m_indirectIndex_].m_startContCE_;
 
982
                        int ce[] = new int[2];
 
983
                        if((baseCE >>> 24 >= RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_IMPLICIT_MIN_)
 
984
                        && (baseCE >>> 24 <=  RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_IMPLICIT_MAX_)) { /* implicits - */
 
985
                            int primary = baseCE & RuleBasedCollator.CE_PRIMARY_MASK_ | (baseContCE & RuleBasedCollator.CE_PRIMARY_MASK_) >> 16;
 
986
                            int raw = RuleBasedCollator.impCEGen_.getRawFromImplicit(primary);
 
987
                            int primaryCE = RuleBasedCollator.impCEGen_.getImplicitFromRaw(raw-1);
 
988
                            ce[0] = primaryCE & RuleBasedCollator.CE_PRIMARY_MASK_ | 0x0505;
 
989
                            ce[1] = (primaryCE << 16) & RuleBasedCollator.CE_PRIMARY_MASK_ | RuleBasedCollator.CE_CONTINUATION_MARKER_;
 
990
                        } else {
 
991
                            CollationParsedRuleBuilder.InverseUCA invuca
 
992
                                = CollationParsedRuleBuilder.INVERSE_UCA_;
 
993
                            invuca.getInversePrevCE(baseCE, baseContCE, strength,
 
994
                                    ce);
 
995
                        }
 
996
                        m_listHeader_[m_resultLength_].m_baseCE_ = ce[0];
 
997
                        m_listHeader_[m_resultLength_].m_baseContCE_ = ce[1];
 
998
                        m_listHeader_[m_resultLength_].m_nextCE_ = 0;
 
999
                        m_listHeader_[m_resultLength_].m_nextContCE_ = 0;
 
1000
 
 
1001
                        sourceToken = new Token();
 
1002
                        expandNext = initAReset(0, sourceToken);
 
1003
                    }
 
1004
                }
 
1005
                // 5 If the relation is a reset:
 
1006
                // If sourceToken is null
 
1007
                // Create new list, create new sourceToken, make the baseCE
 
1008
                // from source, put the sourceToken in ListHeader of the new
 
1009
                // list
 
1010
                if (sourceToken == null) {
 
1011
                    if (m_listHeader_[m_resultLength_] == null) {
 
1012
                        m_listHeader_[m_resultLength_] = new TokenListHeader();
 
1013
                    }
 
1014
                    // 3 Consider each item: relation, source, and expansion:
 
1015
                    // e.g. ...< x / y ...
 
1016
                    // First convert all expansions into normal form.
 
1017
                    // Examples:
 
1018
                    // If "xy" doesn't occur earlier in the list or in the UCA,
 
1019
                    // convert &xy * c * d * ... into &x * c/y * d * ...
 
1020
                    // Note: reset values can never have expansions, although
 
1021
                    // they can cause the very next item to have one. They may
 
1022
                    // be contractions, if they are found earlier in the list.
 
1023
                    if (top == false) {
 
1024
                        CollationElementIterator coleiter
 
1025
                        = RuleBasedCollator.UCA_.getCollationElementIterator(
 
1026
                            m_source_.substring(m_parsedToken_.m_charsOffset_,
 
1027
                                                m_parsedToken_.m_charsOffset_
 
1028
                                                + m_parsedToken_.m_charsLen_));
 
1029
 
 
1030
                        int CE = coleiter.next();
 
1031
                        // offset to the character in the full rule string
 
1032
                        int expand = coleiter.getOffset()
 
1033
                                     + m_parsedToken_.m_charsOffset_;
 
1034
                        int SecondCE = coleiter.next();
 
1035
 
 
1036
                        m_listHeader_[m_resultLength_].m_baseCE_
 
1037
                                                             = CE & 0xFFFFFF3F;
 
1038
                        if (RuleBasedCollator.isContinuation(SecondCE)) {
 
1039
                            m_listHeader_[m_resultLength_].m_baseContCE_
 
1040
                                                                    = SecondCE;
 
1041
                        }
 
1042
                        else {
 
1043
                            m_listHeader_[m_resultLength_].m_baseContCE_ = 0;
 
1044
                        }
 
1045
                        m_listHeader_[m_resultLength_].m_nextCE_ = 0;
 
1046
                        m_listHeader_[m_resultLength_].m_nextContCE_ = 0;
 
1047
                        m_listHeader_[m_resultLength_].m_previousCE_ = 0;
 
1048
                        m_listHeader_[m_resultLength_].m_previousContCE_ = 0;
 
1049
                        m_listHeader_[m_resultLength_].m_indirect_ = false;
 
1050
                        sourceToken = new Token();
 
1051
                        expandNext = initAReset(expand, sourceToken);
 
1052
                    }
 
1053
                    else { // top == TRUE
 
1054
                        top = false;
 
1055
                        m_listHeader_[m_resultLength_].m_previousCE_ = 0;
 
1056
                        m_listHeader_[m_resultLength_].m_previousContCE_ = 0;
 
1057
                        m_listHeader_[m_resultLength_].m_indirect_ = true;
 
1058
                        IndirectBoundaries ib = INDIRECT_BOUNDARIES_[
 
1059
                                              m_parsedToken_.m_indirectIndex_];
 
1060
                        m_listHeader_[m_resultLength_].m_baseCE_
 
1061
                                                               = ib.m_startCE_;
 
1062
                        m_listHeader_[m_resultLength_].m_baseContCE_
 
1063
                                                           = ib.m_startContCE_;
 
1064
                        m_listHeader_[m_resultLength_].m_nextCE_
 
1065
                                                               = ib.m_limitCE_;
 
1066
                        m_listHeader_[m_resultLength_].m_nextContCE_
 
1067
                                                           = ib.m_limitContCE_;
 
1068
                        sourceToken = new Token();
 
1069
                        expandNext = initAReset(0, sourceToken);
 
1070
                    }
 
1071
                }
 
1072
                else { // reset to something already in rules
 
1073
                    top = false;
 
1074
                }
 
1075
            }
 
1076
            // 7 After all this, set LAST to point to sourceToken, and goto
 
1077
            // step 3.
 
1078
            lastToken = sourceToken;
 
1079
        }
 
1080
 
 
1081
        if (m_resultLength_ > 0
 
1082
            && m_listHeader_[m_resultLength_ - 1].m_first_ == null) {
 
1083
            m_resultLength_ --;
 
1084
        }
 
1085
        return m_resultLength_;
 
1086
    }
 
1087
 
 
1088
    /**
 
1089
     * Formats and throws a ParseException
 
1090
     * @param rules collation rule that failed
 
1091
     * @param offset failed offset in rules
 
1092
     * @throws ParseException with failure information
 
1093
     */
 
1094
    private static final void throwParseException(String rules, int offset)
 
1095
                                                          throws ParseException
 
1096
    {
 
1097
        // for pre-context
 
1098
        String precontext = rules.substring(0, offset);
 
1099
        String postcontext = rules.substring(offset, rules.length());
 
1100
        StringBuilder error = new StringBuilder(
 
1101
                                    "Parse error occurred in rule at offset ");
 
1102
        error.append(offset);
 
1103
        error.append("\n after the prefix \"");
 
1104
        error.append(precontext);
 
1105
        error.append("\" before the suffix \"");
 
1106
        error.append(postcontext);
 
1107
        throw new ParseException(error.toString(), offset);
 
1108
    }
 
1109
 
 
1110
    private final boolean doSetTop() {
 
1111
        m_parsedToken_.m_charsOffset_ = m_extraCurrent_;
 
1112
        m_source_.append((char)0xFFFE);
 
1113
        IndirectBoundaries ib =
 
1114
                  INDIRECT_BOUNDARIES_[m_parsedToken_.m_indirectIndex_];
 
1115
        m_source_.append((char)(ib.m_startCE_ >> 16));
 
1116
        m_source_.append((char)(ib.m_startCE_ & 0xFFFF));
 
1117
        m_extraCurrent_ += 3;
 
1118
        if (INDIRECT_BOUNDARIES_[m_parsedToken_.m_indirectIndex_
 
1119
                                                       ].m_startContCE_ == 0) {
 
1120
            m_parsedToken_.m_charsLen_ = 3;
 
1121
        }
 
1122
        else {
 
1123
            m_source_.append((char)(INDIRECT_BOUNDARIES_[
 
1124
                                        m_parsedToken_.m_indirectIndex_
 
1125
                                    ].m_startContCE_ >> 16));
 
1126
            m_source_.append((char)(INDIRECT_BOUNDARIES_[
 
1127
                                        m_parsedToken_.m_indirectIndex_
 
1128
                                    ].m_startContCE_ & 0xFFFF));
 
1129
            m_extraCurrent_ += 2;
 
1130
            m_parsedToken_.m_charsLen_ = 5;
 
1131
        }
 
1132
        return true;
 
1133
    }
 
1134
 
 
1135
    private static boolean isCharNewLine(char c) {
 
1136
        switch (c) {
 
1137
        case 0x000A: /* LF */
 
1138
        case 0x000D: /* CR */
 
1139
        case 0x000C: /* FF */
 
1140
        case 0x0085: /* NEL */
 
1141
        case 0x2028: /* LS */
 
1142
        case 0x2029: /* PS */
 
1143
            return true;
 
1144
        default:
 
1145
            return false;
 
1146
        }
 
1147
    }
 
1148
 
 
1149
    /**
 
1150
     * Getting the next token
 
1151
     *
 
1152
     * @param startofrules
 
1153
     *            flag indicating if we are at the start of rules
 
1154
     * @return the offset of the rules
 
1155
     * @exception ParseException
 
1156
     *                thrown when rule parsing fails
 
1157
     */
 
1158
    @SuppressWarnings("fallthrough")
 
1159
    private int parseNextToken(boolean startofrules) throws ParseException
 
1160
    {
 
1161
        // parsing part
 
1162
        boolean variabletop = false;
 
1163
        boolean top = false;
 
1164
        boolean inchars = true;
 
1165
        boolean inquote = false;
 
1166
        boolean wasinquote = false;
 
1167
        byte before = 0;
 
1168
        boolean isescaped = false;
 
1169
        int /*newcharslen = 0,*/ newextensionlen = 0;
 
1170
        int /*charsoffset = 0,*/ extensionoffset = 0;
 
1171
        int newstrength = TOKEN_UNSET_;
 
1172
 
 
1173
        m_parsedToken_.m_charsLen_ = 0;
 
1174
        m_parsedToken_.m_charsOffset_ = 0;
 
1175
        m_parsedToken_.m_prefixOffset_ = 0;
 
1176
        m_parsedToken_.m_prefixLen_ = 0;
 
1177
        m_parsedToken_.m_indirectIndex_ = 0;
 
1178
 
 
1179
        int limit = m_rules_.length();
 
1180
        while (m_current_ < limit) {
 
1181
            char ch = m_source_.charAt(m_current_);
 
1182
            if (inquote) {
 
1183
                if (ch == 0x0027) { // '\''
 
1184
                    inquote = false;
 
1185
                }
 
1186
                else {
 
1187
                    if ((m_parsedToken_.m_charsLen_ == 0) || inchars) {
 
1188
                         if (m_parsedToken_.m_charsLen_ == 0) {
 
1189
                             m_parsedToken_.m_charsOffset_ = m_extraCurrent_;
 
1190
                         }
 
1191
                         m_parsedToken_.m_charsLen_ ++;
 
1192
                    }
 
1193
                    else {
 
1194
                        if (newextensionlen == 0) {
 
1195
                            extensionoffset = m_extraCurrent_;
 
1196
                        }
 
1197
                        newextensionlen ++;
 
1198
                    }
 
1199
                }
 
1200
            }
 
1201
            else if (isescaped) {
 
1202
                isescaped = false;
 
1203
                if (newstrength == TOKEN_UNSET_) {
 
1204
                    throwParseException(m_rules_, m_current_);
 
1205
                }
 
1206
                if (ch != 0 && m_current_ != limit) {
 
1207
                    if (inchars) {
 
1208
                        if (m_parsedToken_.m_charsLen_ == 0) {
 
1209
                            m_parsedToken_.m_charsOffset_ = m_current_;
 
1210
                        }
 
1211
                        m_parsedToken_.m_charsLen_ ++;
 
1212
                    }
 
1213
                    else {
 
1214
                        if (newextensionlen == 0) {
 
1215
                            extensionoffset = m_current_;
 
1216
                        }
 
1217
                        newextensionlen ++;
 
1218
                    }
 
1219
                }
 
1220
            }
 
1221
            else {
 
1222
                if (!UCharacterProperty.isRuleWhiteSpace(ch)) {
 
1223
                    // Sets the strength for this entry
 
1224
                    switch (ch) {
 
1225
                    case 0x003D : // '='
 
1226
                        if (newstrength != TOKEN_UNSET_) {
 
1227
                            return doEndParseNextToken(newstrength,
 
1228
                                                       top,
 
1229
                                                       extensionoffset,
 
1230
                                                       newextensionlen,
 
1231
                                                       variabletop, before);
 
1232
                        }
 
1233
                        // if we start with strength, we'll reset to top
 
1234
                        if (startofrules == true) {
 
1235
                            m_parsedToken_.m_indirectIndex_ = 5;
 
1236
                            top = doSetTop();
 
1237
                            return doEndParseNextToken(TOKEN_RESET_,
 
1238
                                                       top,
 
1239
                                                       extensionoffset,
 
1240
                                                       newextensionlen,
 
1241
                                                       variabletop, before);
 
1242
                        }
 
1243
                        newstrength = Collator.IDENTICAL;
 
1244
                        if(m_source_.charAt(m_current_ + 1) == 0x002A) { // '*'
 
1245
                            m_current_++;
 
1246
                            m_prevStrength_ = newstrength;
 
1247
                        }else{
 
1248
                            m_prevStrength_ = TOKEN_UNSET_;
 
1249
                        }
 
1250
                        break;
 
1251
                    case 0x002C : // ','
 
1252
                        if (newstrength != TOKEN_UNSET_) {
 
1253
                            return doEndParseNextToken(newstrength,
 
1254
                                                       top,
 
1255
                                                       extensionoffset,
 
1256
                                                       newextensionlen,
 
1257
                                                       variabletop, before);
 
1258
                        }
 
1259
                        // if we start with strength, we'll reset to top
 
1260
                        if (startofrules == true) {
 
1261
                            m_parsedToken_.m_indirectIndex_ = 5;
 
1262
                            top = doSetTop();
 
1263
                            return doEndParseNextToken(TOKEN_RESET_,
 
1264
                                                       top,
 
1265
                                                       extensionoffset,
 
1266
                                                       newextensionlen,
 
1267
                                                       variabletop, before);
 
1268
                        }
 
1269
                        newstrength = Collator.TERTIARY;
 
1270
                        m_prevStrength_ = TOKEN_UNSET_;
 
1271
                        break;
 
1272
                    case 0x003B : // ';'
 
1273
                        if (newstrength != TOKEN_UNSET_) {
 
1274
                            return doEndParseNextToken(newstrength,
 
1275
                                                       top,
 
1276
                                                       extensionoffset,
 
1277
                                                       newextensionlen,
 
1278
                                                       variabletop, before);
 
1279
                        }
 
1280
                        // if we start with strength, we'll reset to top
 
1281
                        if (startofrules == true) {
 
1282
                            m_parsedToken_.m_indirectIndex_ = 5;
 
1283
                            top = doSetTop();
 
1284
                            return doEndParseNextToken(TOKEN_RESET_,
 
1285
                                                       top,
 
1286
                                                       extensionoffset,
 
1287
                                                       newextensionlen,
 
1288
                                                       variabletop, before);
 
1289
                        }
 
1290
                        newstrength = Collator.SECONDARY;
 
1291
                        m_prevStrength_ = TOKEN_UNSET_;
 
1292
                        break;
 
1293
                    case 0x003C : // '<'
 
1294
                        if (newstrength != TOKEN_UNSET_) {
 
1295
                            return doEndParseNextToken(newstrength,
 
1296
                                                       top,
 
1297
                                                       extensionoffset,
 
1298
                                                       newextensionlen,
 
1299
                                                       variabletop, before);
 
1300
                        }
 
1301
                        // if we start with strength, we'll reset to top
 
1302
                        if (startofrules == true) {
 
1303
                            m_parsedToken_.m_indirectIndex_ = 5;
 
1304
                            top = doSetTop();
 
1305
                            return doEndParseNextToken(TOKEN_RESET_,
 
1306
                                                       top,
 
1307
                                                       extensionoffset,
 
1308
                                                       newextensionlen,
 
1309
                                                       variabletop, before);
 
1310
                        }
 
1311
                        // before this, do a scan to verify whether this is
 
1312
                        // another strength
 
1313
                        if (m_source_.charAt(m_current_ + 1) == 0x003C) {
 
1314
                            m_current_ ++;
 
1315
                            if (m_source_.charAt(m_current_ + 1) == 0x003C) {
 
1316
                                m_current_ ++; // three in a row!
 
1317
                                newstrength = Collator.TERTIARY;
 
1318
                            }
 
1319
                            else { // two in a row
 
1320
                                newstrength = Collator.SECONDARY;
 
1321
                            }
 
1322
                        }
 
1323
                        else { // just one
 
1324
                            newstrength = Collator.PRIMARY;
 
1325
                        }
 
1326
 
 
1327
                        if(m_source_.charAt(m_current_ + 1) == 0x002A) { // '*'
 
1328
                            m_current_++;
 
1329
                            m_prevStrength_ = newstrength;
 
1330
                        }else{
 
1331
                            m_prevStrength_ = TOKEN_UNSET_;
 
1332
                        }
 
1333
                        break;
 
1334
                    case 0x0026 : // '&'
 
1335
                        if (newstrength != TOKEN_UNSET_) {
 
1336
                            return doEndParseNextToken(newstrength,
 
1337
                                                       top,
 
1338
                                                       extensionoffset,
 
1339
                                                       newextensionlen,
 
1340
                                                       variabletop, before);
 
1341
                        }
 
1342
                        newstrength = TOKEN_RESET_; // PatternEntry::RESET = 0
 
1343
                        m_prevStrength_ = TOKEN_UNSET_;
 
1344
                        break;
 
1345
                    case 0x005b : // '['
 
1346
                        // options - read an option, analyze it
 
1347
                        m_optionEnd_ = m_rules_.indexOf(0x005d, m_current_);
 
1348
                        if (m_optionEnd_ != -1) { // ']'
 
1349
                            byte result = readAndSetOption();
 
1350
                            m_current_ = m_optionEnd_;
 
1351
                            if ((result & TOKEN_TOP_MASK_) != 0) {
 
1352
                                if (newstrength == TOKEN_RESET_) {
 
1353
                                    top = doSetTop();
 
1354
                                    if (before != 0) {
 
1355
                                        // This is a combination of before and
 
1356
                                        // indirection like
 
1357
                                        // '&[before 2][first regular]<b'
 
1358
                                        m_source_.append((char)0x002d);
 
1359
                                        m_source_.append((char)before);
 
1360
                                        m_extraCurrent_ += 2;
 
1361
                                        m_parsedToken_.m_charsLen_ += 2;
 
1362
                                    }
 
1363
                                    m_current_ ++;
 
1364
                                    return doEndParseNextToken(newstrength,
 
1365
                                                       true,
 
1366
                                                       extensionoffset,
 
1367
                                                       newextensionlen,
 
1368
                                                       variabletop, before);
 
1369
                                }
 
1370
                                else {
 
1371
                                    throwParseException(m_rules_, m_current_);
 
1372
                                }
 
1373
                            }
 
1374
                            else if ((result & TOKEN_VARIABLE_TOP_MASK_) != 0) {
 
1375
                                if (newstrength != TOKEN_RESET_
 
1376
                                    && newstrength != TOKEN_UNSET_) {
 
1377
                                    variabletop = true;
 
1378
                                    m_parsedToken_.m_charsOffset_
 
1379
                                                             = m_extraCurrent_;
 
1380
                                    m_source_.append((char)0xFFFF);
 
1381
                                    m_extraCurrent_ ++;
 
1382
                                    m_current_ ++;
 
1383
                                    m_parsedToken_.m_charsLen_ = 1;
 
1384
                                    return doEndParseNextToken(newstrength,
 
1385
                                                       top,
 
1386
                                                       extensionoffset,
 
1387
                                                       newextensionlen,
 
1388
                                                       variabletop, before);
 
1389
                                }
 
1390
                                else {
 
1391
                                    throwParseException(m_rules_, m_current_);
 
1392
                                }
 
1393
                            }
 
1394
                            else if ((result & TOKEN_BEFORE_) != 0){
 
1395
                                if (newstrength == TOKEN_RESET_) {
 
1396
                                    before = (byte)(result & TOKEN_BEFORE_);
 
1397
                                }
 
1398
                                else {
 
1399
                                    throwParseException(m_rules_, m_current_);
 
1400
                                }
 
1401
                            }
 
1402
                        }
 
1403
                        break;
 
1404
                    case 0x002F : // '/'
 
1405
                        wasinquote = false; // if we were copying source
 
1406
                                            // characters, we want to stop now
 
1407
                        inchars = false; // we're now processing expansion
 
1408
                        break;
 
1409
                    case 0x005C : // back slash for escaped chars
 
1410
                        isescaped = true;
 
1411
                        break;
 
1412
                    // found a quote, we're gonna start copying
 
1413
                    case 0x0027 : //'\''
 
1414
                        if (newstrength == TOKEN_UNSET_) {
 
1415
                            if (m_prevStrength_ == TOKEN_UNSET_) {
 
1416
                                // quote is illegal until we have a strength
 
1417
                                throwParseException(m_rules_, m_current_);
 
1418
                            }else{
 
1419
                                newstrength = m_prevStrength_;
 
1420
                            }
 
1421
                        }
 
1422
                        inquote = true;
 
1423
                        if (inchars) { // we're doing characters
 
1424
                            if (wasinquote == false) {
 
1425
                                m_parsedToken_.m_charsOffset_ = m_extraCurrent_;
 
1426
                            }
 
1427
                            if (m_parsedToken_.m_charsLen_ != 0) {
 
1428
                                m_source_.append(m_source_.substring(
 
1429
                                       m_current_ - m_parsedToken_.m_charsLen_,
 
1430
                                       m_current_));
 
1431
                                m_extraCurrent_ += m_parsedToken_.m_charsLen_;
 
1432
                            }
 
1433
                            m_parsedToken_.m_charsLen_ ++;
 
1434
                        }
 
1435
                        else { // we're doing an expansion
 
1436
                            if (wasinquote == false) {
 
1437
                                extensionoffset = m_extraCurrent_;
 
1438
                            }
 
1439
                            if (newextensionlen != 0) {
 
1440
                                m_source_.append(m_source_.substring(
 
1441
                                                   m_current_ - newextensionlen,
 
1442
                                                   m_current_));
 
1443
                                m_extraCurrent_ += newextensionlen;
 
1444
                            }
 
1445
                            newextensionlen ++;
 
1446
                        }
 
1447
                        wasinquote = true;
 
1448
                        m_current_ ++;
 
1449
                        ch = m_source_.charAt(m_current_);
 
1450
                        if (ch == 0x0027) { // copy the double quote
 
1451
                            m_source_.append(ch);
 
1452
                            m_extraCurrent_ ++;
 
1453
                            inquote = false;
 
1454
                        }
 
1455
                        break;
 
1456
                    // '@' is french only if the strength is not currently set
 
1457
                    // if it is, it's just a regular character in collation
 
1458
                    case 0x0040 : // '@'
 
1459
                        if (newstrength == TOKEN_UNSET_) {
 
1460
                            m_options_.m_isFrenchCollation_ = true;
 
1461
                            break;
 
1462
                        }
 
1463
                        // fall through
 
1464
                    case 0x007C : //|
 
1465
                        // this means we have actually been reading prefix part
 
1466
                        // we want to store read characters to the prefix part
 
1467
                        // and continue reading the characters (proper way
 
1468
                        // would be to restart reading the chars, but in that
 
1469
                        // case we would have to complicate the token hasher,
 
1470
                        // which I do not intend to play with. Instead, we will
 
1471
                        // do prefixes when prefixes are due (before adding the
 
1472
                        // elements).
 
1473
                        m_parsedToken_.m_prefixOffset_
 
1474
                                                = m_parsedToken_.m_charsOffset_;
 
1475
                        m_parsedToken_.m_prefixLen_
 
1476
                                                = m_parsedToken_.m_charsLen_;
 
1477
                        if (inchars) { // we're doing characters
 
1478
                            if (wasinquote == false) {
 
1479
                                m_parsedToken_.m_charsOffset_ = m_extraCurrent_;
 
1480
                            }
 
1481
                            if (m_parsedToken_.m_charsLen_ != 0) {
 
1482
                                String prefix = m_source_.substring(
 
1483
                                       m_current_ - m_parsedToken_.m_charsLen_,
 
1484
                                       m_current_);
 
1485
                                m_source_.append(prefix);
 
1486
                                m_extraCurrent_ += m_parsedToken_.m_charsLen_;
 
1487
                            }
 
1488
                            m_parsedToken_.m_charsLen_ ++;
 
1489
                        }
 
1490
                        wasinquote = true;
 
1491
                        do {
 
1492
                            m_current_ ++;
 
1493
                            ch = m_source_.charAt(m_current_);
 
1494
                            // skip whitespace between '|' and the character
 
1495
                        } while (UCharacterProperty.isRuleWhiteSpace(ch));
 
1496
                        break;
 
1497
                    case 0x0023: // '#' // this is a comment, skip everything through the end of line
 
1498
                        do {
 
1499
                            m_current_ ++;
 
1500
                            ch = m_source_.charAt(m_current_);
 
1501
                        } while (!isCharNewLine(ch));
 
1502
                        break;
 
1503
                    case 0x0021: // '!' // ignoring java set thai reordering
 
1504
                        break;
 
1505
                    default :
 
1506
                        if (newstrength == TOKEN_UNSET_) {
 
1507
                            if(m_prevStrength_ == TOKEN_UNSET_){
 
1508
                                throwParseException(m_rules_, m_current_);
 
1509
                            }else{
 
1510
                                newstrength = m_prevStrength_;
 
1511
                            }
 
1512
                        }
 
1513
                        if (isSpecialChar(ch) && (inquote == false)) {
 
1514
                            throwParseException(m_rules_, m_current_);
 
1515
                        }
 
1516
                        if (ch == 0x0000 && m_current_ + 1 == limit) {
 
1517
                            break;
 
1518
                        }
 
1519
                        if (inchars) {
 
1520
                            if (m_parsedToken_.m_charsLen_ == 0) {
 
1521
                                m_parsedToken_.m_charsOffset_ = m_current_;
 
1522
                            }
 
1523
                            m_parsedToken_.m_charsLen_++;
 
1524
                            if(m_prevStrength_ != TOKEN_UNSET_){
 
1525
                                char[] fullchar = Character.toChars(Character.codePointAt(m_source_, m_current_));
 
1526
                                m_current_ += fullchar.length;
 
1527
                                m_parsedToken_.m_charsLen_ += fullchar.length - 1;
 
1528
                                return doEndParseNextToken(newstrength,
 
1529
                                                           top,
 
1530
                                                           extensionoffset,
 
1531
                                                           newextensionlen,
 
1532
                                                           variabletop, before);
 
1533
                            }
 
1534
                        }
 
1535
                        else {
 
1536
                            if (newextensionlen == 0) {
 
1537
                                extensionoffset = m_current_;
 
1538
                            }
 
1539
                            newextensionlen ++;
 
1540
                        }
 
1541
                        break;
 
1542
                    }
 
1543
                }
 
1544
            }
 
1545
            if (wasinquote) {
 
1546
                if (ch != 0x27) {
 
1547
                      m_source_.append(ch);
 
1548
                    m_extraCurrent_ ++;
 
1549
                }
 
1550
            }
 
1551
            m_current_ ++;
 
1552
        }
 
1553
        return doEndParseNextToken(newstrength, top,
 
1554
                                   extensionoffset, newextensionlen,
 
1555
                                   variabletop, before);
 
1556
    }
 
1557
 
 
1558
    /**
 
1559
     * End the next parse token
 
1560
     * @param newstrength new strength
 
1561
     * @return offset in rules, -1 for end of rules
 
1562
     */
 
1563
    private int doEndParseNextToken(int newstrength, /*int newcharslen,*/
 
1564
                                    boolean top, /*int charsoffset,*/
 
1565
                                    int extensionoffset, int newextensionlen,
 
1566
                                    boolean variabletop, int before)
 
1567
                                    throws ParseException
 
1568
    {
 
1569
        if (newstrength == TOKEN_UNSET_) {
 
1570
            return -1;
 
1571
        }
 
1572
        if (m_parsedToken_.m_charsLen_ == 0 && top == false) {
 
1573
            throwParseException(m_rules_, m_current_);
 
1574
        }
 
1575
 
 
1576
        m_parsedToken_.m_strength_ = newstrength;
 
1577
        //m_parsedToken_.m_charsOffset_ = charsoffset;
 
1578
        //m_parsedToken_.m_charsLen_ = newcharslen;
 
1579
        m_parsedToken_.m_extensionOffset_ = extensionoffset;
 
1580
        m_parsedToken_.m_extensionLen_ = newextensionlen;
 
1581
        m_parsedToken_.m_flags_ = (char)
 
1582
                                  ((variabletop ? TOKEN_VARIABLE_TOP_MASK_ : 0)
 
1583
                                  | (top ? TOKEN_TOP_MASK_ : 0) | before);
 
1584
        return m_current_;
 
1585
    }
 
1586
 
 
1587
    /**
     * Returns the token that sorts immediately before the given element,
     * fishing the anchor out of the UCA when it has not been tailored yet.
     * @param sourcetoken tailored token to anchor before, or null to use the
     *        character described by m_parsedToken_
     * @param strength collation strength of the requested "before" difference
     * @return the token before the source token
     * @exception ParseException thrown when rules have the wrong syntax
     */
    private Token getVirginBefore(Token sourcetoken, int strength)
                                                          throws ParseException
    {
        // this is a virgin before - we need to fish the anchor from the UCA
        // point the UCA collation-element iterator at the single anchor char
        if (sourcetoken != null) {
            // low 24 bits of m_source_ hold the character offset
            int offset = sourcetoken.m_source_ & 0xFFFFFF;
            m_UCAColEIter_.setText(m_source_.substring(offset, offset + 1));
        }
        else {
            m_UCAColEIter_.setText(
                             m_source_.substring(m_parsedToken_.m_charsOffset_,
                             m_parsedToken_.m_charsOffset_ + 1));
        }

        // first CE of the anchor (continuation bits masked off) and its
        // continuation CE, if any
        int basece = m_UCAColEIter_.next() & 0xFFFFFF3F;
        int basecontce = m_UCAColEIter_.next();
        if (basecontce == CollationElementIterator.NULLORDER) {
            basecontce = 0;
        }

        int ch = 0;

        if((basece >>> 24 >= RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_IMPLICIT_MIN_)
                && (basece >>> 24 <=  RuleBasedCollator.UCA_CONSTANTS_.PRIMARY_IMPLICIT_MAX_)) { /* implicits - */
            // the anchor's primary is a generated (implicit) weight: step one
            // raw value back in the implicit generator to get the previous
            // code point and build its two CEs by hand
            int primary = basece & RuleBasedCollator.CE_PRIMARY_MASK_ | (basecontce & RuleBasedCollator.CE_PRIMARY_MASK_) >> 16;
            int raw = RuleBasedCollator.impCEGen_.getRawFromImplicit(primary);
            ch = RuleBasedCollator.impCEGen_.getCodePointFromRaw(raw-1);
            int primaryCE = RuleBasedCollator.impCEGen_.getImplicitFromRaw(raw-1);
            // 0x0505 supplies the common secondary/tertiary weights
            m_utilCEBuffer_[0] = primaryCE & RuleBasedCollator.CE_PRIMARY_MASK_ | 0x0505;
            m_utilCEBuffer_[1] = (primaryCE << 16) & RuleBasedCollator.CE_PRIMARY_MASK_ | RuleBasedCollator.CE_CONTINUATION_MARKER_;

            // record a synthetic two-char anchor "\uFFFE<ch>" in the scratch
            // area appended to m_source_
            m_parsedToken_.m_charsOffset_ = m_extraCurrent_;
            m_source_.append('\uFFFE');
            m_source_.append((char)ch);
            m_extraCurrent_ += 2;
            m_parsedToken_.m_charsLen_++;

            // has this synthetic anchor already been tailored?
            m_utilToken_.m_source_ = (m_parsedToken_.m_charsLen_ << 24)
            | m_parsedToken_.m_charsOffset_;
            m_utilToken_.m_rules_ = m_source_;
            sourcetoken = m_hashTable_.get(m_utilToken_);

            if(sourcetoken == null) {
                // not tailored: open a new token list anchored on the CEs we
                // just constructed and register a fresh reset token for it
                m_listHeader_[m_resultLength_] = new TokenListHeader();
                m_listHeader_[m_resultLength_].m_baseCE_
                    = m_utilCEBuffer_[0] & 0xFFFFFF3F;
                if (RuleBasedCollator.isContinuation(m_utilCEBuffer_[1])) {
                    m_listHeader_[m_resultLength_].m_baseContCE_
                    = m_utilCEBuffer_[1];
                }
                else {
                    m_listHeader_[m_resultLength_].m_baseContCE_ = 0;
                }
                m_listHeader_[m_resultLength_].m_nextCE_ = 0;
                m_listHeader_[m_resultLength_].m_nextContCE_ = 0;
                m_listHeader_[m_resultLength_].m_previousCE_ = 0;
                m_listHeader_[m_resultLength_].m_previousContCE_ = 0;
                m_listHeader_[m_resultLength_].m_indirect_ = false;

                sourcetoken = new Token();
                initAReset(-1, sourcetoken);
            }

        } else {

            // first ce and second ce m_utilCEBuffer_
            /*int invpos = */CollationParsedRuleBuilder.INVERSE_UCA_.getInversePrevCE(
                                                         basece, basecontce,
                                                         strength, m_utilCEBuffer_);
            // we got the previous CE. Now we need to see if the difference between
            // the two CEs is really of the requested strength.
            // if it's a bigger difference (we asked for secondary and got primary), we
            // need to modify the CE.
            if(CollationParsedRuleBuilder.INVERSE_UCA_.getCEStrengthDifference(basece, basecontce, m_utilCEBuffer_[0], m_utilCEBuffer_[1]) < strength) {
                // adjust the strength
                // now we are in the situation where our baseCE should actually be modified in
                // order to get the CE in the right position.
                // NOTE(review): 0x0200 / 0x02 appear to be one step in the
                // secondary / tertiary weight byte respectively - confirm
                // against the CE layout before touching these constants.
                if(strength == Collator.SECONDARY) {
                    m_utilCEBuffer_[0] = basece - 0x0200;
                } else { // strength == UCOL_TERTIARY
                    m_utilCEBuffer_[0] = basece - 0x02;
                }
                if(RuleBasedCollator.isContinuation(basecontce)) {
                    if(strength == Collator.SECONDARY) {
                        m_utilCEBuffer_[1] = basecontce - 0x0200;
                    } else { // strength == UCOL_TERTIARY
                        m_utilCEBuffer_[1] = basecontce - 0x02;
                    }
                }
            }

/*
            // the code below relies on getting a code point from the inverse table, in order to be
            // able to merge the situations like &x < 9 &[before 1]a < d. This won't work:
            // 1. There are many code points that have the same CE
            // 2. The CE to codepoint table (things pointed to by CETable[3*invPos+2] are broken.
            // Also, in case when there is no equivalent strength before an element, we have to actually
            // construct one. For example, &[before 2]a << x won't result in x << a, because the element
            // before a is a primary difference.
            ch = CollationParsedRuleBuilder.INVERSE_UCA_.m_table_[3 * invpos
                                                                      + 2];
            if ((ch &  INVERSE_SIZE_MASK_) != 0) {
                int offset = ch & INVERSE_OFFSET_MASK_;
                ch = CollationParsedRuleBuilder.INVERSE_UCA_.m_continuations_[
                                                                           offset];
            }
            m_source_.append((char)ch);
            m_extraCurrent_ ++;
            m_parsedToken_.m_charsOffset_ = m_extraCurrent_ - 1;
            m_parsedToken_.m_charsLen_ = 1;

            // We got an UCA before. However, this might have been tailored.
            // example:
            // &\u30ca = \u306a
            // &[before 3]\u306a<<<\u306a|\u309d

            m_utilToken_.m_source_ = (m_parsedToken_.m_charsLen_ << 24)
                                                 | m_parsedToken_.m_charsOffset_;
            m_utilToken_.m_rules_ = m_source_;
            sourcetoken = (Token)m_hashTable_.get(m_utilToken_);
*/

            // here is how it should be. The situation such as &[before 1]a < x, should be
            // resolved exactly as if we wrote &a > x.
            // therefore, I don't really care if the UCA value before a has been changed.
            // However, I do care if the strength between my element and the previous element
            // is bigger then I wanted. So, if CE < baseCE and I wanted &[before 2], then i'll
            // have to construct the base CE.

            // if we found a tailored thing, we have to use the UCA value and
            // construct a new reset token with constructed name
            //if (sourcetoken != null && sourcetoken.m_strength_ != TOKEN_RESET_) {
                // character to which we want to anchor is already tailored.
                // We need to construct a new token which will be the anchor point
                //m_source_.setCharAt(m_extraCurrent_ - 1, '\uFFFE');
                //m_source_.append(ch);
                //m_extraCurrent_ ++;
                //m_parsedToken_.m_charsLen_ ++;
                // grab before
                // NOTE(review): the 10-char shift presumably widens the anchor
                // to include preceding scratch characters - verify against the
                // callers that set m_charsOffset_ before trusting this.
                m_parsedToken_.m_charsOffset_ -= 10;
                m_parsedToken_.m_charsLen_ += 10;
                // open a new token list anchored on the adjusted CEs and
                // register a fresh reset token for it
                m_listHeader_[m_resultLength_] = new TokenListHeader();
                m_listHeader_[m_resultLength_].m_baseCE_
                                                 = m_utilCEBuffer_[0] & 0xFFFFFF3F;
                if (RuleBasedCollator.isContinuation(m_utilCEBuffer_[1])) {
                    m_listHeader_[m_resultLength_].m_baseContCE_
                                                              = m_utilCEBuffer_[1];
                }
                else {
                    m_listHeader_[m_resultLength_].m_baseContCE_ = 0;
                }
                m_listHeader_[m_resultLength_].m_nextCE_ = 0;
                m_listHeader_[m_resultLength_].m_nextContCE_ = 0;
                m_listHeader_[m_resultLength_].m_previousCE_ = 0;
                m_listHeader_[m_resultLength_].m_previousContCE_ = 0;
                m_listHeader_[m_resultLength_].m_indirect_ = false;
                sourcetoken = new Token();
                initAReset(-1, sourcetoken);
            //}
        }
        return sourcetoken;
    }
 
1758
 
 
1759
    /**
 
1760
     * Processing Description.
 
1761
     * 1. Build a m_listHeader_. Each list has a header, which contains two lists
 
1762
     * (positive and negative), a reset token, a baseCE, nextCE, and
 
1763
     * previousCE. The lists and reset may be null.
 
1764
     * 2. As you process, you keep a LAST pointer that points to the last token
 
1765
     * you handled.
 
1766
     * @param expand string offset, -1 for null strings
 
1767
     * @param targetToken token to update
 
1768
     * @return expandnext offset
 
1769
     * @throws ParseException thrown when rules syntax failed
 
1770
     */
 
1771
    private int initAReset(int expand, Token targetToken) throws ParseException
 
1772
    {
 
1773
        if (m_resultLength_ == m_listHeader_.length - 1) {
 
1774
            // Unfortunately, this won't work, as we store addresses of lhs in
 
1775
            // token
 
1776
            TokenListHeader temp[] = new TokenListHeader[m_resultLength_ << 1];
 
1777
            System.arraycopy(m_listHeader_, 0, temp, 0, m_resultLength_ + 1);
 
1778
            m_listHeader_ = temp;
 
1779
        }
 
1780
        // do the reset thing
 
1781
        targetToken.m_rules_ = m_source_;
 
1782
        targetToken.m_source_ = m_parsedToken_.m_charsLen_ << 24
 
1783
                                | m_parsedToken_.m_charsOffset_;
 
1784
        targetToken.m_expansion_ = m_parsedToken_.m_extensionLen_ << 24
 
1785
                                   | m_parsedToken_.m_extensionOffset_;
 
1786
        // keep the flags around so that we know about before
 
1787
        targetToken.m_flags_ = m_parsedToken_.m_flags_;
 
1788
 
 
1789
        if (m_parsedToken_.m_prefixOffset_ != 0) {
 
1790
            throwParseException(m_rules_, m_parsedToken_.m_charsOffset_ - 1);
 
1791
        }
 
1792
 
 
1793
        targetToken.m_prefix_ = 0;
 
1794
        // TODO: this should also handle reverse
 
1795
        targetToken.m_polarity_ = TOKEN_POLARITY_POSITIVE_;
 
1796
        targetToken.m_strength_ = TOKEN_RESET_;
 
1797
        targetToken.m_next_ = null;
 
1798
        targetToken.m_previous_ = null;
 
1799
        targetToken.m_CELength_ = 0;
 
1800
        targetToken.m_expCELength_ = 0;
 
1801
        targetToken.m_listHeader_ = m_listHeader_[m_resultLength_];
 
1802
        m_listHeader_[m_resultLength_].m_first_ = null;
 
1803
        m_listHeader_[m_resultLength_].m_last_ = null;
 
1804
        m_listHeader_[m_resultLength_].m_first_ = null;
 
1805
        m_listHeader_[m_resultLength_].m_last_ = null;
 
1806
        m_listHeader_[m_resultLength_].m_reset_ = targetToken;
 
1807
 
 
1808
        /* 3 Consider each item: relation, source, and expansion:
 
1809
         * e.g. ...< x / y ...
 
1810
         * First convert all expansions into normal form. Examples:
 
1811
         * If "xy" doesn't occur earlier in the list or in the UCA, convert
 
1812
         * &xy * c * d * ... into &x * c/y * d * ...
 
1813
         * Note: reset values can never have expansions, although they can
 
1814
         * cause the very next item to have one. They may be contractions, if
 
1815
         * they are found earlier in the list.
 
1816
         */
 
1817
        int result = 0;
 
1818
        if (expand > 0) {
 
1819
            // check to see if there is an expansion
 
1820
            if (m_parsedToken_.m_charsLen_ > 1) {
 
1821
                targetToken.m_source_ = ((expand
 
1822
                                          - m_parsedToken_.m_charsOffset_ )
 
1823
                                          << 24)
 
1824
                                          | m_parsedToken_.m_charsOffset_;
 
1825
                result = ((m_parsedToken_.m_charsLen_
 
1826
                               + m_parsedToken_.m_charsOffset_ - expand) << 24)
 
1827
                               | expand;
 
1828
            }
 
1829
        }
 
1830
 
 
1831
        m_resultLength_ ++;
 
1832
        m_hashTable_.put(targetToken, targetToken);
 
1833
        return result;
 
1834
    }
 
1835
 
 
1836
    /**
 
1837
     * Checks if an character is special
 
1838
     * @param ch character to test
 
1839
     * @return true if the character is special
 
1840
     */
 
1841
    private static final boolean isSpecialChar(char ch)
 
1842
    {
 
1843
        return (ch <= 0x002F && ch >= 0x0020) || (ch <= 0x003F && ch >= 0x003A)
 
1844
               || (ch <= 0x0060 && ch >= 0x005B)
 
1845
               || (ch <= 0x007E && ch >= 0x007D) || ch == 0x007B;
 
1846
    }
 
1847
 
 
1848
    /**
     * Parses a UnicodeSet pattern embedded in a rule string.
     * Scans forward from {@code start} to the first '[', then consumes a
     * brace-balanced span of '[' / ']' and builds a UnicodeSet from it.
     * @param source rule string containing the set pattern
     * @param start index at (or before) the opening '[' of the set
     * @return the parsed UnicodeSet
     * @throws ParseException via throwParseException when the braces are
     *         unbalanced or no further ']' exists after the balanced span
     */
    private
    UnicodeSet readAndSetUnicodeSet(String source, int start) throws ParseException
    {
      while(source.charAt(start) != '[') { /* advance while we find the first '[' */
        start++;
      }
      // now we need to get a balanced set of '[]'. The problem is that a set can have
      // many, and *end point to the first closing '['
      int noOpenBraces = 1;
      int current = 1; // skip the opening brace
      while(start+current < source.length() && noOpenBraces != 0) {
        if(source.charAt(start+current) == '[') {
          noOpenBraces++;
        } else if(source.charAt(start+current) == ']') { // closing brace
          noOpenBraces--;
        }
        current++;
      }
      //int nextBrace = -1;

      // Fail if the braces never balanced, or if no ']' follows the balanced
      // span. NOTE(review): the second check appears to require the enclosing
      // option's own closing ']' after the set — confirm against callers.
      if(noOpenBraces != 0 || (/*nextBrace =*/ source.indexOf("]", start+current) /*']'*/) == -1) {
        throwParseException(m_rules_, start);
      }
      return new UnicodeSet(source.substring(start, start+current)); //uset_openPattern(start, current);
    }
 
1873
 
 
1874
 
 
1875
    /** In C, optionarg is passed by reference to the readOption function;
     *  Java has no out-parameters, so this private field simulates one.
     *  After readOption() returns, it holds the index of the option's
     *  argument within the rule string, or 0 when the matched option has
     *  no argument (readOption resets it to 0 on entry).
     */
    private int m_optionarg_ = 0;
 
1879
 
 
1880
    private int readOption(String rules, int start, int optionend)
 
1881
    {
 
1882
        m_optionarg_ = 0;
 
1883
        int i = 0;
 
1884
        while (i < RULES_OPTIONS_.length) {
 
1885
            String option = RULES_OPTIONS_[i].m_name_;
 
1886
            int optionlength = option.length();
 
1887
            if (rules.length() > start + optionlength
 
1888
                && option.equalsIgnoreCase(rules.substring(start,
 
1889
                                                      start + optionlength))) {
 
1890
                if (optionend - start > optionlength) {
 
1891
                    m_optionarg_ = start + optionlength;
 
1892
                    // start of the options, skip space
 
1893
                    while (m_optionarg_ < optionend && (UCharacter.isWhitespace(rules.charAt(m_optionarg_)) || UCharacterProperty.isRuleWhiteSpace(rules.charAt(m_optionarg_))))
 
1894
                    {   // eat whitespace
 
1895
                        m_optionarg_ ++;
 
1896
                    }
 
1897
                }
 
1898
                break;
 
1899
            }
 
1900
            i ++;
 
1901
        }
 
1902
        if(i == RULES_OPTIONS_.length) {
 
1903
            i = -1;
 
1904
        }
 
1905
        return i;
 
1906
    }
 
1907
    /**
     * Reads and set collation options.
     * Dispatches on the index returned by readOption(); the numeric branch
     * values below are positions in the RULES_OPTIONS_ table (declared
     * elsewhere in this class — the branch comments reflect the apparent
     * table layout; confirm against RULES_OPTIONS_).
     * @return TOKEN_SUCCESS if option is set correct, 0 otherwise
     * @exception ParseException thrown when options in rules are wrong
     */
    private byte readAndSetOption() throws ParseException
    {
        int start = m_current_ + 1; // skip opening '['
        int i = readOption(m_rules_, start, m_optionEnd_);

        int optionarg = m_optionarg_; // argument index set by readOption()

        if (i < 0) {
            throwParseException(m_rules_, start);
        }

        // indices 0..6: simple attribute options with named sub-option values
        if (i < 7) {
            if (optionarg != 0) {
                for (int j = 0; j < RULES_OPTIONS_[i].m_subOptions_.length;
                                                                        j ++) {
                     String subname = RULES_OPTIONS_[i].m_subOptions_[j];
                     int size = optionarg + subname.length();
                     if (m_rules_.length() > size
                         && subname.equalsIgnoreCase(m_rules_.substring(
                                                           optionarg, size))) {
                         setOptions(m_options_, RULES_OPTIONS_[i].m_attribute_,
                             RULES_OPTIONS_[i].m_subOptionAttributeValues_[j]);
                         return TOKEN_SUCCESS_MASK_;
                     }
                }
            }
            // no sub-option matched (or no argument at all)
            throwParseException(m_rules_, optionarg);
        }
        else if (i == 7) { // variable top
            return TOKEN_SUCCESS_MASK_ | TOKEN_VARIABLE_TOP_MASK_;
        }
        else if (i == 8) { // rearange
            // accepted but has no effect here
            return TOKEN_SUCCESS_MASK_;
        }
        else if (i == 9) { // before
            if (optionarg != 0) {
                for (int j = 0; j < RULES_OPTIONS_[i].m_subOptions_.length;
                                                                        j ++) {
                     String subname = RULES_OPTIONS_[i].m_subOptions_[j];
                     int size = optionarg + subname.length();
                     if (m_rules_.length() > size
                         && subname.equalsIgnoreCase(
                                               m_rules_.substring(optionarg,
                                              optionarg + subname.length()))) {
                         // encode the strength of the [before n] option
                         // (value + 1) into the returned flag byte
                         return (byte)(TOKEN_SUCCESS_MASK_
                            | RULES_OPTIONS_[i].m_subOptionAttributeValues_[j]
                            + 1);
                     }
                }
            }
            throwParseException(m_rules_, optionarg);
        }
        else if (i == 10) {  // top, we are going to have an array with
            // structures of limit CEs index to this array will be
            // src->parsedToken.indirectIndex
            m_parsedToken_.m_indirectIndex_ = 0;
            return TOKEN_SUCCESS_MASK_ | TOKEN_TOP_MASK_;
        }
        else if (i < 13) { // first, last
            for (int j = 0; j < RULES_OPTIONS_[i].m_subOptions_.length; j ++) {
                String subname = RULES_OPTIONS_[i].m_subOptions_[j];
                int size = optionarg + subname.length();
                if (m_rules_.length() > size
                    && subname.equalsIgnoreCase(m_rules_.substring(optionarg,
                                                                   size))) {
                    // indirect index encodes both the option (i - 10) and
                    // the matched sub-option (j << 1)
                    m_parsedToken_.m_indirectIndex_ = (char)(i - 10 + (j << 1));
                    return TOKEN_SUCCESS_MASK_ | TOKEN_TOP_MASK_;
                }
            }
            throwParseException(m_rules_, optionarg);
        }
        else if(i == 13 || i == 14) { // copy and remove are handled before normalization
            // we need to move end here
            int noOpenBraces = 1;
            m_current_++; // skip opening brace
            // advance m_current_ past the brace-balanced set pattern
            while(m_current_ < m_source_.length() && noOpenBraces != 0) {
                if(m_source_.charAt(m_current_) == '[') {
                  noOpenBraces++;
                } else if(m_source_.charAt(m_current_) == ']') { // closing brace
                  noOpenBraces--;
                }
                m_current_++;
            }
            m_optionEnd_ = m_current_-1;
            return TOKEN_SUCCESS_MASK_;
        }
        else {
            throwParseException(m_rules_, optionarg);
        }
        return TOKEN_SUCCESS_MASK_; // we will never reach here.
    }
 
2003
 
 
2004
    /**
 
2005
     * Set collation option
 
2006
     * @param optionset option set to set
 
2007
     * @param attribute type to set
 
2008
     * @param value attribute value
 
2009
     */
 
2010
    private void setOptions(OptionSet optionset, int attribute, int value)
 
2011
    {
 
2012
        switch (attribute) {
 
2013
            case RuleBasedCollator.Attribute.HIRAGANA_QUATERNARY_MODE_ :
 
2014
                optionset.m_isHiragana4_
 
2015
                            = (value == RuleBasedCollator.AttributeValue.ON_);
 
2016
                break;
 
2017
            case RuleBasedCollator.Attribute.FRENCH_COLLATION_ :
 
2018
                optionset.m_isFrenchCollation_
 
2019
                             = (value == RuleBasedCollator.AttributeValue.ON_);
 
2020
                break;
 
2021
            case RuleBasedCollator.Attribute.ALTERNATE_HANDLING_ :
 
2022
                optionset.m_isAlternateHandlingShifted_
 
2023
                             = (value
 
2024
                                == RuleBasedCollator.AttributeValue.SHIFTED_);
 
2025
                break;
 
2026
            case RuleBasedCollator.Attribute.CASE_FIRST_ :
 
2027
                optionset.m_caseFirst_ = value;
 
2028
                break;
 
2029
            case RuleBasedCollator.Attribute.CASE_LEVEL_ :
 
2030
                optionset.m_isCaseLevel_
 
2031
                             = (value == RuleBasedCollator.AttributeValue.ON_);
 
2032
                break;
 
2033
            case RuleBasedCollator.Attribute.NORMALIZATION_MODE_ :
 
2034
                if (value == RuleBasedCollator.AttributeValue.ON_) {
 
2035
                    value = Collator.CANONICAL_DECOMPOSITION;
 
2036
                }
 
2037
                optionset.m_decomposition_ = value;
 
2038
                break;
 
2039
            case RuleBasedCollator.Attribute.STRENGTH_ :
 
2040
                optionset.m_strength_ = value;
 
2041
                break;
 
2042
            default :
 
2043
                break;
 
2044
        }
 
2045
      }
 
2046
 
 
2047
    /**
     * Builds the set of strings tailored by these rules.
     * Tokenizes the rule string with parseNextToken(); for each non-reset
     * token, adds every canonically equivalent sequence of the token's
     * source characters that passes the FCD quick check.
     * @return UnicodeSet of tailored strings
     * @throws ParseException propagated from parseNextToken()
     */
    UnicodeSet getTailoredSet() throws ParseException
    {
        boolean startOfRules = true;
        UnicodeSet tailored = new UnicodeSet();
        String pattern;
        CanonicalIterator it = new CanonicalIterator("");

        m_parsedToken_.m_strength_ = TOKEN_UNSET_;
        int sourcelimit = m_source_.length();
        //int expandNext = 0;

        while (m_current_ < sourcelimit) {
        m_parsedToken_.m_prefixOffset_ = 0;
        if (parseNextToken(startOfRules) < 0) {
            // we have reached the end
            continue;
        }
        startOfRules = false;
        // The idea is to tokenize the rule set. For each non-reset token,
        // we add all the canonicaly equivalent FCD sequences
            if(m_parsedToken_.m_strength_ != TOKEN_RESET_) {
                // iterate all canonical equivalents of the token's chars
                it.setSource(m_source_.substring(
                      m_parsedToken_.m_charsOffset_,
                      m_parsedToken_.m_charsOffset_+m_parsedToken_.m_charsLen_));
                pattern = it.next();
                while(pattern != null) {
                      // keep only sequences that are (or may be) in FCD form
                      if(Normalizer.quickCheck(pattern, Normalizer.FCD,0) != Normalizer.NO) {
                        tailored.add(pattern);
                    }
                    pattern = it.next();
                }
            }
        }
        return tailored;
    }
 
2082
 
 
2083
    final private void extractSetsFromRules(String rules) throws ParseException {
 
2084
      int optionNumber = -1;
 
2085
      int setStart = 0;
 
2086
      int i = 0;
 
2087
      while(i < rules.length()) {
 
2088
        if(rules.charAt(i) == 0x005B) {
 
2089
          optionNumber = readOption(rules, i+1, rules.length());
 
2090
          setStart = m_optionarg_;
 
2091
          if(optionNumber == 13) { /* copy - parts of UCA to tailoring */
 
2092
            UnicodeSet newSet = readAndSetUnicodeSet(rules, setStart);
 
2093
              if(m_copySet_ == null) {
 
2094
                m_copySet_ = newSet;
 
2095
              } else {
 
2096
                m_copySet_.addAll(newSet);
 
2097
              }
 
2098
          } else if(optionNumber == 14) {
 
2099
            UnicodeSet newSet = readAndSetUnicodeSet(rules, setStart);
 
2100
              if(m_removeSet_ == null) {
 
2101
                m_removeSet_ = newSet;
 
2102
              } else {
 
2103
                m_removeSet_.addAll(newSet);
 
2104
              }
 
2105
          }
 
2106
        }
 
2107
        i++;
 
2108
      }
 
2109
    }
 
2110
}