2
********************************************************************
4
* Copyright (C) 1997-2005, International Business Machines
5
* Corporation and others. All Rights Reserved.
7
********************************************************************
13
#include "unicode/utypes.h"
14
#include "unicode/uobject.h"
15
#include "unicode/unistr.h"
18
* \brief C++ API: Character Iterator
23
* Abstract class that defines an API for forward-only iteration
25
* This is a minimal interface for iteration without random access
26
* or backwards iteration. It is especially useful for wrapping
27
* streams with converters into an object for collation or
30
* <p>Characters can be accessed in two ways: as code units or as
32
* Unicode code points are 21-bit integers and are the scalar values
33
* of Unicode characters. ICU uses the type UChar32 for them.
34
* Unicode code units are the storage units of a given
35
* Unicode/UCS Transformation Format (a character encoding scheme).
36
* With UTF-16, all code points can be represented with either one
37
* or two code units ("surrogates").
38
* String storage is typically based on code units, while properties
39
* of characters are typically determined using code point values.
40
* Some processes may be designed to work with sequences of code units,
41
* or it may be known that all characters that are important to an
42
* algorithm can be represented with single code units.
43
* Other processes will need to use the code point access functions.</p>
45
* <p>ForwardCharacterIterator provides nextPostInc() to access
46
* a code unit and advance an internal position into the text object,
47
* similar to a <code>return text[position++]</code>.<br>
48
* It provides next32PostInc() to access a code point and advance an internal
51
* <p>next32PostInc() assumes that the current position is that of
52
* the beginning of a code point, i.e., of its first code unit.
53
* After next32PostInc(), this will be true again.
54
* In general, access to code units and code points in the same
55
* iteration loop should not be mixed. In UTF-16, if the current position
56
* is on a second code unit (Low Surrogate), then only that code unit
57
* is returned even by next32PostInc().</p>
59
* <p>For iteration with either function, there are two ways to
60
* check for the end of the iteration. When there are no more
61
* characters in the text object:
63
* <li>The hasNext() function returns FALSE.</li>
64
* <li>nextPostInc() and next32PostInc() return DONE
65
* when one attempts to read beyond the end of the text object.</li>
70
* void function1(ForwardCharacterIterator &it) {
72
* while(it.hasNext()) {
73
* c=it.next32PostInc();
78
* void function2(ForwardCharacterIterator &it) {
80
* while((c=it.nextPostInc())!=ForwardCharacterIterator::DONE) {
89
class U_COMMON_API ForwardCharacterIterator : public UObject {
92
* Value returned by most of ForwardCharacterIterator's functions
93
* when the iterator has reached the limits of its iteration.
96
enum { DONE = 0xffff };
102
virtual ~ForwardCharacterIterator();
105
* Returns true when both iterators refer to the same
106
* character in the same character-storage object.
107
* @param that The ForwardCharacterIterator to be compared for equality
108
* @return true when both iterators refer to the same
109
* character in the same character-storage object
112
virtual UBool operator==(const ForwardCharacterIterator& that) const = 0;
115
* Returns true when the iterators refer to different
116
* text-storage objects, or to different characters in the
117
* same text-storage object.
118
* @param that The ForwardCharacterIterator to be compared for inequality
119
* @return true when the iterators refer to different
120
* text-storage objects, or to different characters in the
121
* same text-storage object
124
inline UBool operator!=(const ForwardCharacterIterator& that) const;
127
* Generates a hash code for this iterator.
128
* @return the hash code.
131
virtual int32_t hashCode(void) const = 0;
134
* Returns a UClassID for this ForwardCharacterIterator ("poor man's
135
* RTTI").<P> Despite the fact that this function is public,
136
* DO NOT CONSIDER IT PART OF CHARACTERITERATOR'S API!
137
* @return a UClassID for this ForwardCharacterIterator
140
virtual UClassID getDynamicClassID(void) const = 0;
143
* Gets the current code unit for returning and advances to the next code unit
144
* in the iteration range
145
* (toward endIndex()). If there are
146
* no more code units to return, returns DONE.
147
* @return the current code unit.
150
virtual UChar nextPostInc(void) = 0;
153
* Gets the current code point for returning and advances to the next code point
154
* in the iteration range
155
* (toward endIndex()). If there are
156
* no more code points to return, returns DONE.
157
* @return the current code point.
160
virtual UChar32 next32PostInc(void) = 0;
163
* Returns FALSE if there are no more code units or code points
164
* at or after the current position in the iteration range.
165
* This is used with nextPostInc() or next32PostInc() in forward
167
* @return FALSE if there are no more code units or code points
168
* at or after the current position in the iteration range.
171
virtual UBool hasNext() = 0;
174
/** Default constructor to be overridden in the implementing class. @stable ICU 2.0*/
175
ForwardCharacterIterator();
177
/** Copy constructor to be overridden in the implementing class. @stable ICU 2.0*/
178
ForwardCharacterIterator(const ForwardCharacterIterator &other);
181
* Assignment operator to be overridden in the implementing class.
184
// Assignment at this base level: the right-hand side is ignored and
// the body only hands back a reference to this object.
ForwardCharacterIterator &operator=(const ForwardCharacterIterator & /*unused*/) {
    return *this;
}
188
* Abstract class that defines an API for iteration
190
* This is an interface for forward and backward iteration
191
* and random access into a text object.
193
* <p>The API provides backward compatibility to the Java and older ICU
194
* CharacterIterator classes but extends them significantly:
196
* <li>CharacterIterator is now a subclass of ForwardCharacterIterator.</li>
197
* <li>While the old API functions provided forward iteration with
198
* "pre-increment" semantics, the new one also provides functions
199
* with "post-increment" semantics. They are more efficient and should
200
* be the preferred iterator functions for new implementations.
201
* The backward iteration always had "pre-decrement" semantics, which
202
* are efficient.</li>
203
* <li>Just like ForwardCharacterIterator, it provides access to
204
* both code units and code points. Code point access versions are available
205
* for the old and the new iteration semantics.</li>
206
* <li>There are new functions for setting and moving the current position
207
* without returning a character, for efficiency.</li>
210
* See ForwardCharacterIterator for examples for using the new forward iteration
211
* functions. For backward iteration, there is also a hasPrevious() function
212
* that can be used analogously to hasNext().
213
* The old functions work as before and are shown below.</p>
215
* <p>Examples for some of the new functions:</p>
217
* Forward iteration with hasNext():
219
* void forward1(CharacterIterator &it) {
221
* for(it.setToStart(); it.hasNext();) {
222
* c=it.next32PostInc();
227
* Forward iteration more similar to loops with the old forward iteration,
228
* showing a way to convert simple for() loops:
230
* void forward2(CharacterIterator &it) {
232
* for(c=it.firstPostInc(); c!=CharacterIterator::DONE; c=it.nextPostInc()) {
237
* Backward iteration with setToEnd() and hasPrevious():
239
* void backward1(CharacterIterator &it) {
241
* for(it.setToEnd(); it.hasPrevious();) {
247
* Backward iteration with a more traditional for() loop:
249
* void backward2(CharacterIterator &it) {
251
* for(c=it.last(); c!=CharacterIterator::DONE; c=it.previous()) {
257
* Example for random access:
259
* void random(CharacterIterator &it) {
260
* // set to the third code point from the beginning
261
* it.move32(3, CharacterIterator::kStart);
262
* // get a code point from here without moving the position
263
* UChar32 c=it.current32();
264
* // get the position
265
* int32_t pos=it.getIndex();
266
* // get the previous code unit
267
* UChar u=it.previous();
268
* // move back one more code unit
269
* it.move(-1, CharacterIterator::kCurrent);
270
* // set the position back to where it was
271
* // and read the same code point c and move beyond it
273
* if(c!=it.next32PostInc()) {
274
* exit(1); // CharacterIterator inconsistent
279
* <p>Examples, especially for the old API:</p>
281
* Function processing characters, in this example simple output
284
* void processChar( UChar c )
290
* Traverse the text from start to finish
293
* void traverseForward(CharacterIterator& iter)
295
* for(UChar c = iter.first(); c != CharacterIterator::DONE; c = iter.next()) {
301
* Traverse the text backwards, from end to start
304
* void traverseBackward(CharacterIterator& iter)
306
* for(UChar c = iter.last(); c != CharacterIterator::DONE; c = iter.previous()) {
312
* Traverse both forward and backward from a given position in the text.
313
* Calls to notBoundary() in this example represent some additional stopping criteria.
316
* void traverseOut(CharacterIterator& iter, int32_t pos)
319
* for (c = iter.setIndex(pos);
320
* c != CharacterIterator::DONE && (Unicode::isLetter(c) || Unicode::isDigit(c));
321
* c = iter.next()) {}
322
* int32_t end = iter.getIndex();
323
* for (c = iter.setIndex(pos);
324
* c != CharacterIterator::DONE && (Unicode::isLetter(c) || Unicode::isDigit(c));
325
* c = iter.previous()) {}
326
* int32_t start = iter.getIndex() + 1;
328
* cout << "start: " << start << " end: " << end << endl;
329
* for (c = iter.setIndex(start); iter.getIndex() < end; c = iter.next() ) {
335
* Creating a StringCharacterIterator and calling the test functions
338
* void CharacterIterator_Example( void )
340
* cout << endl << "===== CharacterIterator_Example: =====" << endl;
341
* UnicodeString text("Ein kleiner Satz.");
342
* StringCharacterIterator iterator(text);
343
* cout << "----- traverseForward: -----------" << endl;
344
* traverseForward( iterator );
345
* cout << endl << endl << "----- traverseBackward: ----------" << endl;
346
* traverseBackward( iterator );
347
* cout << endl << endl << "----- traverseOut: ---------------" << endl;
348
* traverseOut( iterator, 7 );
349
* cout << endl << endl << "-----" << endl;
356
class U_COMMON_API CharacterIterator : public ForwardCharacterIterator {
359
* Origin enumeration for the move() and move32() functions.
362
enum EOrigin { kStart, kCurrent, kEnd };
365
* Returns a pointer to a new CharacterIterator of the same
366
* concrete class as this one, and referring to the same
367
* character in the same text-storage object as this one. The
368
* caller is responsible for deleting the new clone.
369
* @return a pointer to a new CharacterIterator
372
virtual CharacterIterator* clone(void) const = 0;
375
* Sets the iterator to refer to the first code unit in its
376
* iteration range, and returns that code unit.
377
* This can be used to begin an iteration with next().
378
* @return the first code unit in its iteration range.
381
virtual UChar first(void) = 0;
384
* Sets the iterator to refer to the first code unit in its
385
* iteration range, returns that code unit, and moves the position
386
* to the second code unit. This is an alternative to setToStart()
387
* for forward iteration with nextPostInc().
388
* @return the first code unit in its iteration range.
391
virtual UChar firstPostInc(void);
394
* Sets the iterator to refer to the first code point in its
395
* iteration range, and returns that code point.
396
* This can be used to begin an iteration with next32().
397
* Note that an iteration with next32PostInc(), beginning with,
398
* e.g., setToStart() or first32PostInc(), is more efficient.
399
* @return the first code point in its iteration range.
402
virtual UChar32 first32(void) = 0;
405
* Sets the iterator to refer to the first code point in its
406
* iteration range, returns that code point, and moves the position
407
* to the second code point. This is an alternative to setToStart()
408
* for forward iteration with next32PostInc().
409
* @return the first code point in its iteration range.
412
virtual UChar32 first32PostInc(void);
415
* Sets the iterator to refer to the first code unit or code point in its
416
* iteration range. This can be used to begin a forward
417
* iteration with nextPostInc() or next32PostInc().
418
* @return the start position of the iteration range
421
inline int32_t setToStart();
424
* Sets the iterator to refer to the last code unit in its
425
* iteration range, and returns that code unit.
426
* This can be used to begin an iteration with previous().
427
* @return the last code unit.
430
virtual UChar last(void) = 0;
433
* Sets the iterator to refer to the last code point in its
434
* iteration range, and returns that code point.
435
* This can be used to begin an iteration with previous32().
436
* @return the last code point.
439
virtual UChar32 last32(void) = 0;
442
* Sets the iterator to the end of its iteration range, just behind
443
* the last code unit or code point. This can be used to begin a backward
444
* iteration with previous() or previous32().
445
* @return the end position of the iteration range
448
inline int32_t setToEnd();
451
* Sets the iterator to refer to the "position"-th code unit
452
* in the text-storage object the iterator refers to, and
453
* returns that code unit.
454
* @param position the "position"-th code unit in the text-storage object
455
* @return the "position"-th code unit.
458
virtual UChar setIndex(int32_t position) = 0;
461
* Sets the iterator to refer to the beginning of the code point
462
* that contains the "position"-th code unit
463
* in the text-storage object the iterator refers to, and
464
* returns that code point.
465
* The current position is adjusted to the beginning of the code point
466
* (its first code unit).
467
* @param position the "position"-th code unit in the text-storage object
468
* @return the code point that contains the "position"-th code unit.
471
virtual UChar32 setIndex32(int32_t position) = 0;
474
* Returns the code unit the iterator currently refers to.
475
* @return the current code unit.
478
virtual UChar current(void) const = 0;
481
* Returns the code point the iterator currently refers to.
482
* @return the current code point.
485
virtual UChar32 current32(void) const = 0;
488
* Advances to the next code unit in the iteration range
489
* (toward endIndex()), and returns that code unit. If there are
490
* no more code units to return, returns DONE.
491
* @return the next code unit.
494
virtual UChar next(void) = 0;
497
* Advances to the next code point in the iteration range
498
* (toward endIndex()), and returns that code point. If there are
499
* no more code points to return, returns DONE.
500
* Note that iteration with "pre-increment" semantics is less
501
* efficient than iteration with "post-increment" semantics
502
* that is provided by next32PostInc().
503
* @return the next code point.
506
virtual UChar32 next32(void) = 0;
509
* Advances to the previous code unit in the iteration range
510
* (toward startIndex()), and returns that code unit. If there are
511
* no more code units to return, returns DONE.
512
* @return the previous code unit.
515
virtual UChar previous(void) = 0;
518
* Advances to the previous code point in the iteration range
519
* (toward startIndex()), and returns that code point. If there are
520
* no more code points to return, returns DONE.
521
* @return the previous code point.
524
virtual UChar32 previous32(void) = 0;
527
* Returns FALSE if there are no more code units or code points
528
* before the current position in the iteration range.
529
* This is used with previous() or previous32() in backward
531
* @return FALSE if there are no more code units or code points
532
* before the current position in the iteration range, return TRUE otherwise.
535
virtual UBool hasPrevious() = 0;
538
* Returns the numeric index in the underlying text-storage
539
* object of the character returned by first(). Since it's
540
* possible to create an iterator that iterates across only
541
* part of a text-storage object, this number isn't
543
* @return the numeric index in the underlying text-storage
544
* object of the character returned by first().
547
inline int32_t startIndex(void) const;
550
* Returns the numeric index in the underlying text-storage
551
* object of the position immediately BEYOND the character
552
* returned by last().
553
* @return the numeric index in the underlying text-storage
554
* object of the position immediately BEYOND the character
555
* returned by last().
558
inline int32_t endIndex(void) const;
561
* Returns the numeric index in the underlying text-storage
562
* object of the character the iterator currently refers to
563
* (i.e., the character returned by current()).
564
* @return the numeric index in the text-storage object of
565
* the character the iterator currently refers to
568
inline int32_t getIndex(void) const;
571
* Returns the length of the entire text in the underlying
572
* text-storage object.
573
* @return the length of the entire text in the text-storage object
576
inline int32_t getLength() const;
579
* Moves the current position relative to the start or end of the
580
* iteration range, or relative to the current position itself.
581
* The movement is expressed in numbers of code units forward
582
* or backward by specifying a positive or negative delta.
583
* @param delta the position relative to origin. A positive delta means forward;
584
* a negative delta means backward.
585
* @param origin Origin enumeration {kStart, kCurrent, kEnd}
586
* @return the new position
589
virtual int32_t move(int32_t delta, EOrigin origin) = 0;
592
* Moves the current position relative to the start or end of the
593
* iteration range, or relative to the current position itself.
594
* The movement is expressed in numbers of code points forward
595
* or backward by specifying a positive or negative delta.
596
* @param delta the position relative to origin. A positive delta means forward;
597
* a negative delta means backward.
598
* @param origin Origin enumeration {kStart, kCurrent, kEnd}
599
* @return the new position
602
virtual int32_t move32(int32_t delta, EOrigin origin) = 0;
605
* Copies the text under iteration into the UnicodeString
606
* referred to by "result".
607
* @param result Receives a copy of the text under iteration.
610
virtual void getText(UnicodeString& result) = 0;
620
* Constructor, just setting the length field in this base class.
623
CharacterIterator(int32_t length);
626
* Constructor, just setting the length and position fields in this base class.
629
CharacterIterator(int32_t length, int32_t position);
632
* Constructor, just setting the length, start, end, and position fields in this base class.
635
CharacterIterator(int32_t length, int32_t textBegin, int32_t textEnd, int32_t position);
640
* @param that The CharacterIterator to be copied
643
CharacterIterator(const CharacterIterator &that);
646
* Assignment operator. Sets this CharacterIterator to have the same behavior
647
* as the one passed in.
648
* @param that The CharacterIterator passed in.
649
* @return the newly set CharacterIterator.
652
CharacterIterator &operator=(const CharacterIterator &that);
655
* Base class text length field.
656
* This is necessary for correct getText() and hashCode().
662
* Base class field for the current position.
668
* Base class field for the start of the iteration range.
674
* Base class field for the end of the iteration range.
681
ForwardCharacterIterator::operator!=(const ForwardCharacterIterator& that) const {
682
return !operator==(that);
686
CharacterIterator::setToStart() {
687
return move(0, kStart);
691
CharacterIterator::setToEnd() {
692
return move(0, kEnd);
696
CharacterIterator::startIndex(void) const {
701
CharacterIterator::endIndex(void) const {
706
CharacterIterator::getIndex(void) const {
711
CharacterIterator::getLength(void) const {