2
**********************************************************************
3
* Copyright (C) 1999-2000 IBM and others. All rights reserved.
4
**********************************************************************
5
* Date Name Description
6
* 03/22/2000 helena Creation.
7
**********************************************************************
12
#include "unicode/utypes.h"
13
#include "unicode/unistr.h"
14
#include "unicode/chariter.h"
15
#include "unicode/tblcoll.h"
16
#include "unicode/brkiter.h"
22
* <code>StringSearch</code> is a <code>SearchIterator</code> that provides
23
* language-sensitive text searching based on the comparison rules defined
24
* in a {@link RuleBasedCollator} object.
25
* Instances of <code>StringSearch</code> function as iterators
26
* maintain a current position and scan over text returning the index of
27
* characters where the pattern occurs and the length of each match.
29
* <code>StringSearch</code> uses a version of the fast Boyer-Moore search
30
* algorithm that has been adapted to work with the large character set of
31
* Unicode. See "Efficient Text Searching in Java", to be published in
32
* <i>Java Report</i> in February, 1999, for further information on the algorithm.
34
* Consult the <code>SearchIterator</code> documentation for information on
35
* and examples of how to use instances of this class to implement text
36
* searching. <code>SearchIterator</code> provides all of the necessary
37
* API; this class only provides constructors and internal implementation
41
* @see RuleBasedCollator
43
* @author Laura Werner
47
class StringSearch : public SearchIterator
51
* Construct a <code>StringSearch</code> object using a specific collator and set
52
* of boundary-detection rules.
54
* @param pat The text for which this object will search.
56
* @param target The text in which to search for the pattern.
58
* @param coll A <code>RuleBasedCollator</code> object which defines the
59
* language-sensitive comparison rules used to determine
60
* whether text in the pattern and target matches.
62
* @param breaker A <code>BreakIterator</code> object used to constrain the matches
63
* that are found. Matches whose start and end indices
64
* in the target text are not boundaries as determined
65
* by the <code>BreakIterator</code> are ignored. If this behavior
66
* is not desired, <code>null</code> can be passed in instead.
68
StringSearch(const UnicodeString& pat,
69
CharacterIterator* target,
70
RuleBasedCollator* coll,
71
BreakIterator* breaker,
75
* Construct a <code>StringSearch</code> object using a specific collator.
77
* @param pattern The text for which this object will search.
79
* @param target The text in which to search for the pattern.
81
* @param collator A <code>RuleBasedCollator</code> object which defines the
82
* language-sensitive comparison rules used to determine
83
* whether text in the pattern and target matches.
85
StringSearch(const UnicodeString& pattern,
86
CharacterIterator* target,
87
RuleBasedCollator* collator,
93
StringSearch(const StringSearch& that);
96
* Construct a <code>StringSearch</code> object using the collator and
97
* character boundary detection rules for a given locale
99
* @param pattern The text for which this object will search.
101
* @param target The text in which to search for the pattern.
103
* @param loc The locale whose collation and break-detection rules
106
* @exception ClassCastException thrown if the collator for the specified
107
* locale is not a RuleBasedCollator.
109
StringSearch(const UnicodeString& pattern,
110
CharacterIterator* target,
114
* Construct a <code>StringSearch</code> object using the collator for the default
117
* @param pattern The text for which this object will search.
119
* @param target The text in which to search for the pattern.
121
* @param collator A <code>RuleBasedCollator</code> object which defines the
122
* language-sensitive comparison rules used to determine
123
* whether text in the pattern and target matches.
125
StringSearch(const UnicodeString& pattern,
126
const UnicodeString& target,
129
virtual ~StringSearch(void);
131
* Assignment operator. Sets this iterator to have the same behavior,
132
* and iterate over the same text, as the one passed in.
134
StringSearch& operator=(const StringSearch& that);
137
* Equality operator. Returns TRUE if both BreakIterators are of the
138
* same class, have the same behavior, and iterate over the same text.
140
virtual UBool operator==(const SearchIterator& that) const;
143
* Not-equal operator. If operator== returns TRUE, this returns FALSE,
146
UBool operator!=(const SearchIterator& that) const;
149
* Returns a newly-constructed RuleBasedBreakIterator with the same
150
* behavior, and iterating over the same text, as this one.
152
virtual SearchIterator* clone(void) const;
154
//-------------------------------------------------------------------
155
// Getters and Setters
156
//-------------------------------------------------------------------
159
* Sets this object's strength property. The strength determines the
160
* minimum level of difference considered significant during a
161
* search. Generally, {@link Collator#TERTIARY} and
162
* {@link Collator#IDENTICAL} indicate that all differences are
163
* considered significant, {@link Collator#SECONDARY} indicates
164
* that upper/lower case distinctions should be ignored, and
165
* {@link Collator#PRIMARY} indicates that both case and accents
166
* should be ignored. However, the exact meanings of these constants
167
* are determined by individual Collator objects.
169
* @see Collator#PRIMARY
170
* @see Collator#SECONDARY
171
* @see Collator#TERTIARY
172
* @see Collator#IDENTICAL
174
void setStrength(Collator::ECollationStrength newStrength, UErrorCode& status);
178
* Returns this object's strength property, which indicates what level
179
* of differences are considered significant during a search.
183
Collator::ECollationStrength getStrength(void) const{ return strength; }
186
* Set the collator to be used for this string search. Also changes
187
* the search strength to match that of the new collator.
189
* This method causes internal data such as Boyer-Moore shift tables
190
* to be recalculated, but the iterator's position is unchanged.
194
void setCollator(const RuleBasedCollator* coll, UErrorCode& status);
197
* Return the RuleBasedCollator being used for this string search.
199
const RuleBasedCollator& getCollator() const;
202
* Set the pattern for which to search.
203
* This method causes internal data such as Boyer-Moore shift tables
204
* to be recalculated, but the iterator's position is unchanged.
206
void setPattern(const UnicodeString& pat, UErrorCode& status);
209
* Returns the pattern for which this object is searching.
211
const UnicodeString& getPattern() const;
214
* Set the target text which should be searched and resets the
215
* iterator's position to point before the start of the new text.
216
* This method is useful if you want to re-use an iterator to
217
* search for the same pattern within a different body of text.
219
virtual void setTarget(const UnicodeString& newText);
222
* Set the target text which should be searched and resets the
223
* iterator's position to point before the start of the target text.
224
* This method is useful if you want to re-use an iterator to
225
* search for the same pattern within a different body of text.
229
virtual void adoptTarget(CharacterIterator* iterator);
233
virtual void reset(void);
235
* Returns a unique class ID POLYMORPHICALLY. Pure virtual override.
236
* This method is to implement a simple version of RTTI, since not all
237
* C++ compilers support genuine RTTI. Polymorphic operator==() and
238
* clone() methods call this method.
240
* @return The class ID for this object. All objects of a
241
* given class have the same class ID. Objects of
242
* other classes have different class IDs.
244
inline virtual UClassID getDynamicClassID(void) const;
247
* Returns the class ID for this class. This is useful only for
248
* comparing to a return value from getDynamicClassID(). For example:
250
* Base* polymorphic_pointer = createPolymorphicObject();
251
* if (polymorphic_pointer->getDynamicClassID() ==
252
* Derived::getStaticClassID()) ...
254
* @return The class ID for all objects of this class.
256
inline static UClassID getStaticClassID(void);
259
//-------------------------------------------------------------------
261
//-------------------------------------------------------------------
264
* Search forward for matching text, starting at a given location.
265
* Clients should not call this method directly; instead they should call
266
* {@link SearchIterator#next}.
268
* If a match is found, this method returns the index at which the match
269
* starts and calls {@link SearchIterator#setMatchLength}
270
* with the number of characters in the target
271
* text that make up the match. If no match is found, the method returns
272
* <code>DONE</code> and does not call <tt>setMatchLength</tt>.
274
* @param start The index in the target text at which the search starts.
276
* @return The index at which the matched text in the target starts, or DONE
277
* if no match was found.
279
* @see SearchIterator#next
280
* @see SearchIterator#DONE
282
virtual int32_t handleNext(int32_t start, UErrorCode& status);
284
* Search backward for matching text ,starting at a given location.
285
* Clients should not call this method directly; instead they should call
286
* <code>SearchIterator.previous()</code>, which this method overrides.
288
* If a match is found, this method returns the index at which the match
289
* starts and calls {@link SearchIterator#setMatchLength}
290
* with the number of characters in the target
291
* text that make up the match. If no match is found, the method returns
292
* <code>DONE</code> and does not call <tt>setMatchLength</tt>.
294
* @param start The index in the target text at which the search starts.
296
* @return The index at which the matched text in the target starts, or DONE
297
* if no match was found.
299
* @see SearchIterator#previous
300
* @see SearchIterator#DONE
302
virtual int32_t handlePrev(int32_t start, UErrorCode& status);
305
* Return a bitmask that will select only the portions of a collation
306
* element that are significant at the given strength level.
308
static int32_t getMask(Collator::ECollationStrength strength);
311
void initialize(UErrorCode& status);
313
* Method used by StringSearch to determine how far to the right to
314
* shift the pattern during a Boyer-Moore search.
316
* @param curValue The current value in the target text
317
* @param curIndex The index in the pattern at which we failed to match
318
* curValue in the target text.
320
int32_t getShift( int32_t curValue, int32_t curIndex ) const;
323
* Method used by StringSearch to determine how far to the left to
324
* shift the pattern during a reverse Boyer-Moore search.
326
* @param curValue The current value in the target text
327
* @param curIndex The index in the pattern at which we failed to match
328
* curValue in the target text.
330
int32_t getBackShift( int32_t curValue, int32_t curIndex ) const;
333
* Hash a collation element from its full size (32 bits) down into a
334
* value that can be used as an index into the shift tables. Right
335
* now we do a modulus by the size of the hash table.
337
* TODO: At some point I should experiment to see whether a slightly
338
* more complicated hash function gives us a better distribution
339
* on multilingual text. I doubt it will have much effect on
340
* performance, though.
342
static int32_t hash(int32_t order);
344
//------------------------------------------------------------------------
347
CollationElementIterator *iter;
348
RuleBasedCollator *collator;
349
/* HSYS ? Why? Changes to this will not affect collator. no changes to the comparsion result */
350
Collator::ECollationStrength strength;
352
//------------------------------------------------------------------------
353
// Everything from here on down is the data used to represent the
354
// Boyer-Moore shift tables and the code that generates and manipulates
358
int32_t valueListLen;
359
int32_t shiftTable[256];
360
int32_t backShiftTable[256];
362
UnicodeString pattern; // The pattern string
363
int32_t normLen; // num. of collation elements in pattern.
364
int32_t minLen; // Min of composed, decomposed versions
365
int32_t maxLen; // Max
366
CollationElementIterator *it; // to be removed
374
static char fgClassID;
377
inline UBool ::StringSearch::operator!=(const SearchIterator& that) const
379
return !operator==(that);
382
inline UClassID ::StringSearch::getDynamicClassID(void) const
384
return ::StringSearch::getStaticClassID();
387
inline UClassID ::StringSearch::getStaticClassID(void)
389
return (UClassID)(&fgClassID);