4
**********************************************************************
5
* Copyright (c) 2001-2005, International Business Machines
6
* Corporation and others. All Rights Reserved.
7
**********************************************************************
13
#include "unicode/utypes.h"
14
#include "unicode/uobject.h"
23
// RBBISetBuilder Derives the character categories used by the runtime RBBI engine
24
// from the Unicode Sets appearing in the source RBBI rules, and
25
// creates the TRIE table used to map from Unicode to the
26
// character categories.
33
// Each of the non-overlapping character ranges gets one of these descriptors.
34
// All of them are strung together in a linked list, which is kept in order
37
// RangeDescriptor: one node of a linked list describing a character range
// [fStartChar, fEndChar], together with the original Unicode sets that
// include the range and the input value assigned to it.
class RangeDescriptor : public UMemory {
39
UChar32 fStartChar; // Start of range, unicode 32 bit value.
40
UChar32 fEndChar; // End of range, unicode 32 bit value.
41
int32_t fNum; // runtime-mapped input value for this range.
42
UVector *fIncludesSets; // vector of the original
43
// Unicode sets that include this range.
44
// (Contains ptrs to uset nodes)
45
RangeDescriptor *fNext; // Next RangeDescriptor in the linked list.
47
// Construct a descriptor; takes a UErrorCode reference.
// NOTE(review): presumably failures are reported through status - confirm in the implementation.
RangeDescriptor(UErrorCode &status);
48
// Construct a descriptor from an existing one; takes a UErrorCode reference.
// NOTE(review): which fields are copied (and how deeply) is not visible here - confirm in the implementation.
RangeDescriptor(const RangeDescriptor &other, UErrorCode &status);
50
void split(UChar32 where, UErrorCode &status); // Split this range in two at "where", with
51
// where appearing in the second (higher) part.
52
void setDictionaryFlag(); // Check whether this range appears as part of
53
// the Unicode set named "dictionary"
56
// The two declarations below exist only to forbid ordinary copying.
// NOTE(review): presumably declared private and never defined; the access specifier is not visible here.
RangeDescriptor(const RangeDescriptor &other); // forbid copying of this class
57
RangeDescriptor &operator=(const RangeDescriptor &other); // forbid copying of this class
62
// RBBISetBuilder Handles processing of Unicode Sets from RBBI rules.
64
// Starting with the rules parse tree from the scanner,
66
// - Enumerate the set of UnicodeSets that are referenced
68
// - compute a derived set of non-overlapping UnicodeSets
69
// that will correspond to columns in the state table for
70
// the RBBI execution engine.
71
// - construct the trie table that maps input characters
72
// to set numbers in the non-overlapping set of sets.
76
// RBBISetBuilder: processes the Unicode sets from the RBBI rules into
// character categories and the trie that maps characters to them.
class RBBISetBuilder : public UMemory {
78
// Construct a set builder for the given rule builder.
// NOTE(review): presumably rb is stored in fRB and not owned by this object - confirm in the implementation.
RBBISetBuilder(RBBIRuleBuilder *rb);
82
// NOTE(review): presumably applies addValToSet() to each uset node in "sets" - confirm in the implementation.
void addValToSets(UVector *sets, uint32_t val);
83
// NOTE(review): presumably records "val" against the given uset node - confirm in the implementation.
void addValToSet (RBBINode *usetNode, uint32_t val);
84
int32_t getNumCharCategories() const; // CharCategories are the same as input symbol set to the
85
// runtime state machine, which are the same as
86
// columns in the DFA state table
87
int32_t getTrieSize() /*const*/; // Size in bytes of the serialized Trie.
88
void serializeTrie(uint8_t *where); // write out the serialized Trie.
89
// NOTE(review): presumably returns the first code point that maps to category "val" - confirm in the implementation.
UChar32 getFirstChar(int32_t val) const;
90
UBool sawBOF() const; // Indicate whether any references to the {bof} pseudo
91
// character were encountered.
95
void printRangeGroups();
99
// Function-like macro with an empty expansion: wherever it is in effect,
// a call written as printRangeGroups() expands to nothing.
// NOTE(review): presumably this is the non-debug alternative to the declaration
// above; the conditional that selects between them is not visible here.
#define printRangeGroups()
105
RBBIRuleBuilder *fRB; // The RBBI Rule Compiler that owns us.
108
RangeDescriptor *fRangeList; // Head of the linked list of RangeDescriptors
110
UNewTrie *fTrie; // The mapping TRIE that is the end result of processing
111
uint32_t fTrieSize; // the Unicode Sets.
113
// Groups correspond to character categories -
114
// groups of ranges that are in the same original UnicodeSets.
115
// fGroupCount is the index of the last used group.
116
// fGroupCount+1 is also the number of columns in the RBBI state table being compiled.
117
// State table column 0 is not used. Column 1 is for end-of-input.
118
// column 2 is for group 0. Funny counting.
123
// The two declarations below exist only to forbid copying.
// NOTE(review): presumably declared private and never defined; the access specifier is not visible here.
RBBISetBuilder(const RBBISetBuilder &other); // forbid copying of this class
124
RBBISetBuilder &operator=(const RBBISetBuilder &other); // forbid copying of this class