2
**********************************************************************
3
* Copyright (C) 2001-2008 IBM and others. All rights reserved.
4
**********************************************************************
5
* Date Name Description
6
* 03/22/2000 helena Creation.
7
**********************************************************************
13
#include "unicode/utypes.h"
17
* \brief C++ API: SearchIterator object.
20
#if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
22
#include "unicode/uobject.h"
23
#include "unicode/unistr.h"
24
#include "unicode/chariter.h"
25
#include "unicode/brkiter.h"
26
#include "unicode/usearch.h"
35
typedef struct USearch USearch;
41
* <tt>SearchIterator</tt> is an abstract base class that provides
42
* methods to search for a pattern within a text string. Instances of
43
* <tt>SearchIterator</tt> maintain a current position and scan over the
44
* target text, returning the indices at which the pattern is matched and the length
47
* <tt>SearchIterator</tt> defines a protocol for text searching.
48
* Subclasses provide concrete implementations of various search algorithms.
49
* For example, <tt>StringSearch</tt> implements language-sensitive pattern
50
* matching based on the comparison rules defined in a
51
* <tt>RuleBasedCollator</tt> object.
53
* Other options for searching include using a BreakIterator to restrict
54
* the points at which matches are detected.
56
* <tt>SearchIterator</tt> provides an API that is similar to that of
57
* other text iteration classes such as <tt>BreakIterator</tt>. Using
58
* this class, it is easy to scan through text looking for all occurrences of
59
* a given pattern. The following example uses a <tt>StringSearch</tt>
60
* object to find all instances of "fox" in the target string. Any other
61
* subclass of <tt>SearchIterator</tt> can be used in an identical
64
* UnicodeString target("The quick brown fox jumped over the lazy fox");
65
* UnicodeString pattern("fox");
67
* SearchIterator *iter = new StringSearch(pattern, target);
68
* UErrorCode error = U_ZERO_ERROR;
69
* for (int pos = iter->first(error); pos != USEARCH_DONE;
70
* pos = iter->next(error)) {
71
* printf("Found match at %d pos, length is %d\n", pos,
72
* iter->getMatchedLength());
77
* @see RuleBasedCollator
79
class U_I18N_API SearchIterator : public UObject {
83
// public constructors and destructors -------------------------------
86
* Copy constructor that creates a SearchIterator instance with the same
87
* behavior, and iterating over the same text.
88
* @param other the SearchIterator instance to be copied.
91
SearchIterator(const SearchIterator &other);
94
* Destructor. Cleans up the search iterator data struct.
97
virtual ~SearchIterator();
99
// public get and set methods ----------------------------------------
102
* Sets the index to point to the given position, and clears any state
105
* This method takes the argument index and sets the position in the text
106
* string accordingly without checking if the index is pointing to a
107
* valid starting point to begin searching.
108
* @param position within the text to be set. If position is less
109
* than or greater than the text range for searching,
110
* an U_INDEX_OUTOFBOUNDS_ERROR will be returned
111
* @param status for errors if it occurs
114
virtual void setOffset(int32_t position, UErrorCode &status) = 0;
117
* Return the current index in the text being searched.
118
* If the iteration has gone past the end of the text
119
* (or past the beginning for a backwards search), USEARCH_DONE
121
* @return current index in the text being searched.
124
virtual int32_t getOffset(void) const = 0;
127
* Sets the text searching attributes located in the enum
128
* USearchAttribute with values from the enum USearchAttributeValue.
129
* USEARCH_DEFAULT can be used for all attributes for resetting.
130
* @param attribute text attribute (enum USearchAttribute) to be set
131
* @param value text attribute value
132
* @param status for errors if it occurs
135
void setAttribute(USearchAttribute attribute,
136
USearchAttributeValue value,
140
* Gets the text searching attributes
141
* @param attribute text attribute (enum USearchAttribute) to be retrieved
142
* @return text attribute value
145
USearchAttributeValue getAttribute(USearchAttribute attribute) const;
148
* Returns the index to the match in the text string that was searched.
149
* This call returns a valid result only after a successful call to
150
* <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>.
151
* Just after construction, or after a searching method returns
152
* <tt>USEARCH_DONE</tt>, this method will return <tt>USEARCH_DONE</tt>.
154
* Use getMatchedLength to get the matched string length.
155
* @return index of a substring within the text string that is being
163
int32_t getMatchedStart(void) const;
166
* Returns the length of text in the string which matches the search
167
* pattern. This call returns a valid result only after a successful call
168
* to <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>.
169
* Just after construction, or after a searching method returns
170
* <tt>USEARCH_DONE</tt>, this method will return 0.
171
* @return The length of the match in the target text, or 0 if there
172
* is no match currently.
179
int32_t getMatchedLength(void) const;
182
* Returns the text that was matched by the most recent call to
183
* <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>.
184
* If the iterator is not pointing at a valid match (e.g. just after
185
* construction or after <tt>USEARCH_DONE</tt> has been returned),
186
* returns an empty string.
187
* @param result stores the matched string or an empty string if a match
195
void getMatchedText(UnicodeString &result) const;
198
* Set the BreakIterator that will be used to restrict the points
199
* at which matches are detected. The user is responsible for deleting
201
* @param breakiter A BreakIterator that will be used to restrict the
202
* points at which matches are detected. If a match is
203
* found, but the match's start or end index is not a
204
* boundary as determined by the <tt>BreakIterator</tt>,
205
* the match will be rejected and another will be searched
206
* for. If this parameter is <tt>NULL</tt>, no break
207
* detection is attempted.
208
* @param status for errors if it occurs
212
void setBreakIterator(BreakIterator *breakiter, UErrorCode &status);
215
* Returns the BreakIterator that is used to restrict the points at
216
* which matches are detected. This will be the same object that was
217
* passed to the constructor or to <tt>setBreakIterator</tt>.
218
* Note that <tt>NULL</tt> is a legal value; it means that break
219
* detection should not be attempted.
220
* @return BreakIterator used to restrict matchings.
221
* @see #setBreakIterator
224
const BreakIterator * getBreakIterator(void) const;
227
* Set the string text to be searched. Text iteration will hence begin at
228
* the start of the text string. This method is useful if you want to
229
* re-use an iterator to search for the same pattern within a different
230
* body of text. The user is responsible for deleting the text.
231
* @param text string to be searched.
232
* @param status for errors. If the text length is 0,
233
* an U_ILLEGAL_ARGUMENT_ERROR is returned.
236
virtual void setText(const UnicodeString &text, UErrorCode &status);
239
* Set the string text to be searched. Text iteration will hence begin at
240
* the start of the text string. This method is useful if you want to
241
* re-use an iterator to search for the same pattern within a different
244
* Note: No parsing of the text within the <tt>CharacterIterator</tt>
245
* will be done during searching for this version. The block of text
246
* in <tt>CharacterIterator</tt> will be used as it is.
247
* The user is responsible for deleting the text.
248
* @param text string iterator to be searched.
249
* @param status for errors if any. If the text length is 0 then an
250
* U_ILLEGAL_ARGUMENT_ERROR is returned.
253
virtual void setText(CharacterIterator &text, UErrorCode &status);
256
* Return the string text to be searched.
257
* @return text string to be searched.
260
const UnicodeString & getText(void) const;
262
// operator overloading ----------------------------------------------
266
* @param that SearchIterator instance to be compared.
267
* @return TRUE if both SearchIterators are of the same class, have the
268
* same behavior, iterate over the same text and have the same
269
* attributes. FALSE otherwise.
272
virtual UBool operator==(const SearchIterator &that) const;
275
* Not-equal operator.
276
* @param that SearchIterator instance to be compared.
277
* @return FALSE if operator== returns TRUE, and vice versa.
280
UBool operator!=(const SearchIterator &that) const;
282
// public methods ----------------------------------------------------
285
* Returns a copy of SearchIterator with the same behavior, and
286
* iterating over the same text, as this one. Note that all data will be
287
* replicated, except for the text string to be searched.
288
* @return cloned object
291
virtual SearchIterator* safeClone(void) const = 0;
294
* Returns the first index at which the string text matches the search
295
* pattern. The iterator is adjusted so that its current index (as
296
* returned by <tt>getOffset</tt>) is the match position if one
298
* If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
299
* the iterator will be adjusted to the index USEARCH_DONE
300
* @param status for errors if it occurs
301
* @return The character index of the first match, or
302
* <tt>USEARCH_DONE</tt> if there are no matches.
306
int32_t first(UErrorCode &status);
309
* Returns the first index greater than <tt>position</tt> at which the
310
* string text matches the search pattern. The iterator is adjusted so
311
* that its current index (as returned by <tt>getOffset</tt>) is the
312
* match position if one was found. If a match is not found,
313
* <tt>USEARCH_DONE</tt> will be returned and the iterator will be
314
* adjusted to the index USEARCH_DONE
315
* @param position where search is to start from. If position is less
316
* than or greater than the text range for searching,
317
* an U_INDEX_OUTOFBOUNDS_ERROR will be returned
318
* @param status for errors if it occurs
319
* @return The character index of the first match following
320
* <tt>position</tt>, or <tt>USEARCH_DONE</tt> if there are no
325
int32_t following(int32_t position, UErrorCode &status);
328
* Returns the last index in the target text at which it matches the
329
* search pattern. The iterator is adjusted so that its current index
330
* (as returned by <tt>getOffset</tt>) is the match position if one was
332
* If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
333
* the iterator will be adjusted to the index USEARCH_DONE.
334
* @param status for errors if it occurs
335
* @return The index of the last match, or <tt>USEARCH_DONE</tt> if
336
* there are no matches.
340
int32_t last(UErrorCode &status);
343
* Returns the first index less than <tt>position</tt> at which the string
344
* text matches the search pattern. The iterator is adjusted so that its
345
* current index (as returned by <tt>getOffset</tt>) is the match
346
* position if one was found. If a match is not found,
347
* <tt>USEARCH_DONE</tt> will be returned and the iterator will be
348
* adjusted to the index USEARCH_DONE
349
* @param position where search is to start from. If position is less
350
* than or greater than the text range for searching,
351
* an U_INDEX_OUTOFBOUNDS_ERROR will be returned
352
* @param status for errors if it occurs
353
* @return The character index of the first match preceding
354
* <tt>position</tt>, or <tt>USEARCH_DONE</tt> if there are
359
int32_t preceding(int32_t position, UErrorCode &status);
362
* Returns the index of the next point at which the text matches the
363
* search pattern, starting from the current position.
364
* The iterator is adjusted so that its current index (as returned by
365
* <tt>getOffset</tt>) is the match position if one was found.
366
* If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
367
* the iterator will be adjusted to a position after the end of the text
369
* @param status for errors if it occurs
370
* @return The index of the next match after the current position,
371
* or <tt>USEARCH_DONE</tt> if there are no more matches.
375
int32_t next(UErrorCode &status);
378
* Returns the index of the previous point at which the string text
379
* matches the search pattern, starting at the current position.
380
* The iterator is adjusted so that its current index (as returned by
381
* <tt>getOffset</tt>) is the match position if one was found.
382
* If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
383
* the iterator will be adjusted to the index USEARCH_DONE
384
* @param status for errors if it occurs
385
* @return The index of the previous match before the current position,
386
* or <tt>USEARCH_DONE</tt> if there are no more matches.
390
int32_t previous(UErrorCode &status);
393
* Resets the iteration.
394
* Search will begin at the start of the text string if a forward
395
* iteration is initiated before a backwards iteration. Otherwise if a
396
* backwards iteration is initiated before a forwards iteration, the
397
* search will begin at the end of the text string.
400
virtual void reset();
403
// protected data members ---------------------------------------------
406
* C search data struct
413
* Currently the C++ breakiterator does not have getRules etc to reproduce
414
* another in C. Hence we keep the original around and do the verification
415
* at the end of the match. The user is responsible for deleting this
419
BreakIterator *m_breakiterator_;
422
* Unicode string version of the search text
425
UnicodeString m_text_;
427
// protected constructors and destructors -----------------------------
430
* Default constructor.
431
* Initializes data to the default values.
437
* Constructor for use by subclasses.
438
* @param text The target text to be searched.
439
* @param breakiter A {@link BreakIterator} that is used to restrict the
440
* points at which matches are detected. If
441
* <tt>handleNext</tt> or <tt>handlePrev</tt> finds a
442
* match, but the match's start or end index is not a
443
* boundary as determined by the <tt>BreakIterator</tt>,
444
* the match is rejected and <tt>handleNext</tt> or
445
* <tt>handlePrev</tt> is called again. If this parameter
446
* is <tt>NULL</tt>, no break detection is attempted.
451
SearchIterator(const UnicodeString &text,
452
BreakIterator *breakiter = NULL);
455
* Constructor for use by subclasses.
457
* Note: No parsing of the text within the <tt>CharacterIterator</tt>
458
* will be done during searching for this version. The block of text
459
* in <tt>CharacterIterator</tt> will be used as it is.
460
* @param text The target text to be searched.
461
* @param breakiter A {@link BreakIterator} that is used to restrict the
462
* points at which matches are detected. If
463
* <tt>handleNext</tt> or <tt>handlePrev</tt> finds a
464
* match, but the match's start or end index is not a
465
* boundary as determined by the <tt>BreakIterator</tt>,
466
* the match is rejected and <tt>handleNext</tt> or
467
* <tt>handlePrev</tt> is called again. If this parameter
468
* is <tt>NULL</tt>, no break detection is attempted.
473
SearchIterator(CharacterIterator &text, BreakIterator *breakiter = NULL);
475
// protected methods --------------------------------------------------
478
* Assignment operator. Sets this iterator to have the same behavior,
479
* and iterate over the same text, as the one passed in.
480
* @param that instance to be copied.
483
SearchIterator & operator=(const SearchIterator &that);
486
* Abstract method which subclasses override to provide the mechanism
487
* for finding the next match in the target text. This allows different
488
* subclasses to provide different search algorithms.
490
* If a match is found, the implementation should return the index at
491
* which the match starts and should call
492
* <tt>setMatchLength</tt> with the number of characters
493
* in the target text that make up the match. If no match is found, the
494
* method should return USEARCH_DONE.
496
* @param position The index in the target text at which the search
498
* @param status for error codes if it occurs.
499
* @return index at which the match starts, else if match is not found
500
* USEARCH_DONE is returned
501
* @see #setMatchLength
504
virtual int32_t handleNext(int32_t position, UErrorCode &status)
508
* Abstract method which subclasses override to provide the mechanism for
509
* finding the previous match in the target text. This allows different
510
* subclasses to provide different search algorithms.
512
* If a match is found, the implementation should return the index at
513
* which the match starts and should call
514
* <tt>setMatchLength</tt> with the number of characters
515
* in the target text that make up the match. If no match is found, the
516
* method should return USEARCH_DONE.
518
* @param position The index in the target text at which the search
520
* @param status for error codes if it occurs.
521
* @return index at which the match starts, else if match is not found
522
* USEARCH_DONE is returned
523
* @see #setMatchLength
526
virtual int32_t handlePrev(int32_t position, UErrorCode &status)
530
* Sets the length of the currently matched string in the text string to
532
* Subclasses' <tt>handleNext</tt> and <tt>handlePrev</tt>
533
* methods should call this when they find a match in the target text.
534
* @param length length of the matched text.
539
virtual void setMatchLength(int32_t length);
542
* Sets the offset of the currently matched string in the text string to
544
* Subclasses' <tt>handleNext</tt> and <tt>handlePrev</tt>
545
* methods should call this when they find a match in the target text.
546
* @param position start offset of the matched text.
551
virtual void setMatchStart(int32_t position);
554
* sets match not found
557
void setMatchNotFound();
560
// Not-equal operator: yields the logical negation of operator== applied to
// the same argument, so the two operators always give opposite answers.
inline UBool SearchIterator::operator!=(const SearchIterator &that) const
562
return !operator==(that);
566
#endif /* #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION */