2
**********************************************************************
3
* Copyright (C) 2001 IBM and others. All rights reserved.
4
**********************************************************************
5
* Date Name Description
6
* 03/22/2000 helena Creation.
7
**********************************************************************
13
#include "unicode/unistr.h"
14
#include "unicode/chariter.h"
15
#include "unicode/brkiter.h"
16
#include "unicode/usearch.h"
19
typedef struct USearch USearch;
24
* <tt>SearchIterator</tt> is an abstract base class that provides
25
* methods to search for a pattern within a text string. Instances of
26
* <tt>SearchIterator</tt> maintain a current position and scan over the
27
* target text, returning the indices where the pattern is matched and the length
30
* <tt>SearchIterator</tt> defines a protocol for text searching.
31
* Subclasses provide concrete implementations of various search algorithms.
32
* For example, {@link StringSearch} implements language-sensitive pattern
33
* matching based on the comparison rules defined in a
34
* {@link RuleBasedCollator} object.
36
* Other options for searching include using a BreakIterator to restrict
37
* the points at which matches are detected.
39
* <tt>SearchIterator</tt> provides an API that is similar to that of
40
* other text iteration classes such as <tt>BreakIterator</tt>. Using
41
* this class, it is easy to scan through text looking for all occurrences of
42
* a given pattern. The following example uses a <tt>StringSearch</tt>
43
* object to find all instances of "fox" in the target string. Any other
44
* subclass of <tt>SearchIterator</tt> can be used in an identical
47
* UnicodeString target("The quick brown fox jumped over the lazy fox");
48
* UnicodeString pattern("fox");
50
* SearchIterator *iter = new StringSearch(pattern, target);
52
* for (int pos = iter->first(); pos != USEARCH_DONE;
53
* pos = iter->next()) {
54
* printf("Found match at %d pos, length is %d\n", pos,
55
* iter->getMatchedLength());
61
class U_I18N_API SearchIterator {
65
// public constructors and destructors -------------------------------
68
* Copy constructor that creates a SearchIterator instance with the same
69
* behavior, and iterating over the same text.
70
* @param other the SearchIterator instance to be copied.
73
SearchIterator(const SearchIterator &other);
76
* Destructor. Cleans up the search iterator data struct.
79
virtual ~SearchIterator();
81
// public get and set methods ----------------------------------------
84
* Sets the index to point to the given position, and clears any state
87
* This method takes the argument index and sets the position in the text
88
* string accordingly without checking if the index is pointing to a
89
* valid starting point to begin searching.
90
* @param position within the text to be set
91
* @param status for errors if it occurs
94
virtual void setOffset(UTextOffset position, UErrorCode &status) = 0;
97
* Return the current index in the text being searched.
98
* If the iteration has gone past the end of the text
99
* (or past the beginning for a backwards search), {@link #USEARCH_DONE}
101
* @return current index in the text being searched.
104
virtual UTextOffset getOffset(void) const = 0;
107
* Sets the text searching attributes located in the enum
108
* USearchAttribute with values from the enum USearchAttributeValue.
109
* USEARCH_DEFAULT can be used for all attributes for resetting.
110
* @param attribute text attribute (enum USearchAttribute) to be set
111
* @param value text attribute value
112
* @param status for errors if it occurs
115
void setAttribute(USearchAttribute attribute,
116
USearchAttributeValue value,
120
* Gets the text searching attributes
121
* @param attribute text attribute (enum USearchAttribute) to be retrieved
122
* @return text attribute value
125
USearchAttributeValue getAttribute(USearchAttribute attribute) const;
128
* Returns the index to the match in the text string that was searched.
129
* This call returns a valid result only after a successful call to
130
* {@link #first}, {@link #next}, {@link #previous}, or {@link #last}.
131
* Just after construction, or after a searching method returns
132
* <tt>USEARCH_DONE</tt>, this method will return <tt>USEARCH_DONE</tt>.
134
* Use getMatchedLength to get the matched string length.
135
* @return index of a substring within the text string that is being
139
UTextOffset getMatchedStart(void) const;
142
* Returns the length of text in the string which matches the search
143
* pattern. This call returns a valid result only after a successful call
144
* to {@link #first}, {@link #next}, {@link #previous}, or {@link #last}.
145
* Just after construction, or after a searching method returns
146
* <tt>USEARCH_DONE</tt>, this method will return 0.
147
* @return The length of the match in the target text, or 0 if there
148
* is no match currently.
151
int32_t getMatchedLength(void) const;
154
* Returns the text that was matched by the most recent call to
155
* {@link #first}, {@link #next}, {@link #previous}, or {@link #last}.
156
* If the iterator is not pointing at a valid match (e.g. just after
157
* construction or after <tt>USEARCH_DONE</tt> has been returned),
158
* returns an empty string.
159
* @param result stores the matched string or an empty string if a match
163
void getMatchedText(UnicodeString &result) const;
166
* Set the BreakIterator that will be used to restrict the points
167
* at which matches are detected. The user is responsible for deleting
169
* @param breakiter A BreakIterator that will be used to restrict the
170
* points at which matches are detected. If a match is
171
* found, but the match's start or end index is not a
172
* boundary as determined by the <tt>BreakIterator</tt>,
173
* the match will be rejected and another will be searched
174
* for. If this parameter is <tt>NULL</tt>, no break
175
* detection is attempted.
176
* @param status for errors if it occurs
179
void setBreakIterator(BreakIterator *breakiter, UErrorCode &status);
182
* Returns the BreakIterator that is used to restrict the points at
183
* which matches are detected. This will be the same object that was
184
* passed to the constructor or to <tt>setBreakIterator</tt>.
185
* Note that <tt>NULL</tt> is a legal value; it means that break
186
* detection should not be attempted.
187
* @return BreakIterator used to restrict matchings.
190
const BreakIterator * getBreakIterator(void) const;
193
* Set the string text to be searched. Text iteration will hence begin at
194
* the start of the text string. This method is useful if you want to
195
* re-use an iterator to search for the same pattern within a different
196
* body of text. The user is responsible for deleting the text.
197
* @param text string to be searched.
198
* @param status for errors. If the text length is 0,
199
* an U_ILLEGAL_ARGUMENT_ERROR is returned.
202
virtual void setText(const UnicodeString &text, UErrorCode &status);
205
* Set the string text to be searched. Text iteration will hence begin at
206
* the start of the text string. This method is useful if you want to
207
* re-use an iterator to search for the same pattern within a different
210
* Note: No parsing of the text within the <tt>CharacterIterator</tt>
211
* will be done during searching for this version. The block of text
212
* in <tt>CharacterIterator</tt> will be used as it is.
213
* The user is responsible for deleting the text.
214
* @param text string iterator to be searched.
215
* @param status for errors if any. If the text length is 0 then an
216
* U_ILLEGAL_ARGUMENT_ERROR is returned.
219
virtual void setText(CharacterIterator &text, UErrorCode &status);
222
* Return the string text to be searched.
223
* @return text string to be searched.
226
const UnicodeString & getText(void) const;
228
// operator overloading ----------------------------------------------
232
* @param that SearchIterator instance to be compared.
233
* @return TRUE if both SearchIterators are of the same class, have the
234
* same behavior, iterates over the same text and have the same
235
* attributes. FALSE otherwise.
238
virtual UBool operator==(const SearchIterator &that) const;
241
* Not-equal operator.
242
* @param that SearchIterator instance to be compared.
243
* @return FALSE if operator== returns TRUE, and vice versa.
246
UBool operator!=(const SearchIterator &that) const;
248
// public methods ----------------------------------------------------
251
* Returns a copy of SearchIterator with the same behavior, and
252
* iterating over the same text, as this one. Note that all data will be
253
* replicated, except for the text string to be searched.
254
* @return cloned object
257
virtual SearchIterator* safeClone(void) const = 0;
260
* Returns the first index at which the string text matches the search
261
* pattern. The iterator is adjusted so that its current index (as
262
* returned by {@link #getOffset}) is the match position if one
264
* If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
265
* the iterator will be adjusted to the index USEARCH_DONE
266
* @param status for errors if it occurs
267
* @return The character index of the first match, or
268
* <tt>USEARCH_DONE</tt> if there are no matches.
271
UTextOffset first(UErrorCode &status);
274
* Returns the first index greater than <tt>position</tt> at which the
275
* string text matches the search pattern. The iterator is adjusted so
276
* that its current index (as returned by {@link #getOffset}) is the
277
* match position if one was found. If a match is not found,
278
* <tt>USEARCH_DONE</tt> will be returned and the iterator will be
279
* adjusted to the index USEARCH_DONE
280
* @param position where search is to start from
281
* @param status for errors if it occurs
282
* @return The character index of the first match following
283
* <tt>position</tt>, or <tt>USEARCH_DONE</tt> if there are no
287
UTextOffset following(UTextOffset position, UErrorCode &status);
290
* Returns the last index in the target text at which it matches the
291
* search pattern. The iterator is adjusted so that its current index
292
* (as returned by {@link #getOffset}) is the match position if one was
294
* If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
295
* the iterator will be adjusted to the index USEARCH_DONE.
296
* @param status for errors if it occurs
297
* @return The index of the last match, or <tt>USEARCH_DONE</tt> if
298
* there are no matches.
301
UTextOffset last(UErrorCode &status);
304
* Returns the first index less than <tt>position</tt> at which the string
305
* text matches the search pattern. The iterator is adjusted so that its
306
* current index (as returned by {@link #getOffset}) is the match
307
* position if one was found. If a match is not found,
308
* <tt>USEARCH_DONE</tt> will be returned and the iterator will be
309
* adjusted to the index USEARCH_DONE
310
* @param position where search is to start from
311
* @param status for errors if it occurs
312
* @return The character index of the first match preceding
313
* <tt>position</tt>, or <tt>USEARCH_DONE</tt> if there are
317
UTextOffset preceding(UTextOffset position, UErrorCode &status);
320
* Returns the index of the next point at which the text matches the
321
* search pattern, starting from the current position
322
* The iterator is adjusted so that its current index (as returned by
323
* {@link #getOffset}) is the match position if one was found.
324
* If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
325
* the iterator will be adjusted to a position after the end of the text
327
* @param status for errors if it occurs
328
* @return The index of the next match after the current position,
329
* or <tt>USEARCH_DONE</tt> if there are no more matches.
332
UTextOffset next(UErrorCode &status);
335
* Returns the index of the previous point at which the string text
336
* matches the search pattern, starting at the current position.
337
* The iterator is adjusted so that its current index (as returned by
338
* {@link #getOffset}) is the match position if one was found.
339
* If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
340
* the iterator will be adjusted to the index USEARCH_DONE
341
* @param status for errors if it occurs
342
* @return The index of the previous match before the current position,
343
* or <tt>USEARCH_DONE</tt> if there are no more matches.
346
UTextOffset previous(UErrorCode &status);
349
* Resets the iteration.
350
* Search will begin at the start of the text string if a forward
351
* iteration is initiated before a backwards iteration. Otherwise if a
352
* backwards iteration is initiated before a forwards iteration, the
353
* search will begin at the end of the text string.
356
virtual void reset();
359
// protected data members ---------------------------------------------
362
* C search data struct
368
* Currently the C++ breakiterator does not have getRules etc to reproduce
369
* another in C. Hence we keep the original around and do the verification
370
* at the end of the match. The user is responsible for deleting this
373
BreakIterator *m_breakiterator_;
376
* Unicode string version of the search text
378
UnicodeString m_text_;
380
// protected constructors and destructors -----------------------------
383
* Default constructor.
384
* Initializes data to the default values.
389
* Constructor for use by subclasses.
390
* @param text The target text to be searched.
391
* @param breakiter A {@link BreakIterator} that is used to restrict the
392
* points at which matches are detected. If
393
* <tt>handleNext</tt> or <tt>handlePrev</tt> finds a
394
* match, but the match's start or end index is not a
395
* boundary as determined by the <tt>BreakIterator</tt>,
396
* the match is rejected and <tt>handleNext</tt> or
397
* <tt>handlePrev</tt> is called again. If this parameter
398
* is <tt>NULL</tt>, no break detection is attempted.
400
SearchIterator(const UnicodeString &text,
401
BreakIterator *breakiter = NULL);
404
* Constructor for use by subclasses.
406
* Note: No parsing of the text within the <tt>CharacterIterator</tt>
407
* will be done during searching for this version. The block of text
408
* in <tt>CharacterIterator</tt> will be used as it is.
409
* @param text The target text to be searched.
410
* @param breakiter A {@link BreakIterator} that is used to restrict the
411
* points at which matches are detected. If
412
* <tt>handleNext</tt> or <tt>handlePrev</tt> finds a
413
* match, but the match's start or end index is not a
414
* boundary as determined by the <tt>BreakIterator</tt>,
415
* the match is rejected and <tt>handleNext</tt> or
416
* <tt>handlePrev</tt> is called again. If this parameter
417
* is <tt>NULL</tt>, no break detection is attempted.
419
SearchIterator(CharacterIterator &text, BreakIterator *breakiter = NULL);
421
// protected methods --------------------------------------------------
424
* Assignment operator. Sets this iterator to have the same behavior,
425
* and iterate over the same text, as the one passed in.
426
* @param that instance to be copied.
428
SearchIterator & operator=(const SearchIterator &that);
431
* Abstract method which subclasses override to provide the mechanism
432
* for finding the next match in the target text. This allows different
433
* subclasses to provide different search algorithms.
435
* If a match is found, the implementation should return the index at
436
* which the match starts and should call
437
* {@link #setMatchLength setMatchLength} with the number of characters
438
* in the target text that make up the match. If no match is found, the
439
* method should return USEARCH_DONE.
441
* @param position The index in the target text at which the search
443
* @param status for error codes if it occurs.
444
* @return index at which the match starts, else if match is not found
445
* USEARCH_DONE is returned
447
virtual UTextOffset handleNext(UTextOffset position, UErrorCode &status)
451
* Abstract method which subclasses override to provide the mechanism for
452
* finding the previous match in the target text. This allows different
453
* subclasses to provide different search algorithms.
455
* If a match is found, the implementation should return the index at
456
* which the match starts and should call
457
* {@link #setMatchLength setMatchLength} with the number of characters
458
* in the target text that make up the match. If no match is found, the
459
* method should return USEARCH_DONE.
461
* @param position The index in the target text at which the search
463
* @param status for error codes if it occurs.
464
* @return index at which the match starts, else if match is not found
465
* USEARCH_DONE is returned
467
virtual UTextOffset handlePrev(UTextOffset position, UErrorCode &status)
471
* Sets the length of the currently matched string in the text string to
473
* Subclasses' <tt>handleNext</tt> and <tt>handlePrev</tt>
474
* methods should call this when they find a match in the target text.
475
* @param length length of the matched text.
477
virtual void setMatchLength(int32_t length);
480
* Sets the offset of the currently matched string in the text string to
482
* Subclasses' <tt>handleNext</tt> and <tt>handlePrev</tt>
483
* methods should call this when they find a match in the target text.
484
* @param position start offset of the matched text.
486
virtual void setMatchStart(UTextOffset position);
489
* sets match not found
491
void setMatchNotFound();
494
// Inline not-equal operator: returns the logical negation of operator==(that).
inline UBool SearchIterator::operator!=(const SearchIterator &that) const
496
return !operator==(that);