2
*******************************************************************************
4
* Copyright (C) 2002-2004, International Business Machines
5
* Corporation and others. All Rights Reserved.
7
*******************************************************************************
10
* tab size: 8 (not used)
13
* created on: 2002jan18
14
* created by: Markus W. Scherer
22
* \brief C API: Unicode Character Iteration
27
#include "unicode/utypes.h"
32
class CharacterIterator;
41
typedef struct UCharIterator UCharIterator; /**< C typedef for struct UCharIterator. @stable ICU 2.1 */
/**
 * Origin constants for UCharIterator.getIndex() and UCharIterator.move().
 * The enumerators keep their implicit values 0..4 in declaration order.
 * @see UCharIteratorMove
 * @see UCharIteratorGetIndex
 */
typedef enum UCharIteratorOrigin {
    UITER_START,    /**< Relative to the start of the iteration range. */
    UITER_CURRENT,  /**< Relative to the current position. */
    UITER_LIMIT,    /**< Relative to the limit of the iteration range. */
    UITER_ZERO,     /**< Relative to index 0. */
    UITER_LENGTH    /**< Relative to the length. */
} UCharIteratorOrigin;
53
/** Constants for UCharIterator. @stable ICU 2.6 */
56
* Constant value that may be returned by UCharIteratorMove
57
* indicating that the final UTF-16 index is not known, but that the move succeeded.
58
* This can occur when moving relative to limit or length, or
59
* when moving relative to the current index after a setState()
60
* when the current UTF-16 index is not known.
62
* It would be very inefficient to have to count from the beginning of the text
63
* just to get the current/limit/length index after moving relative to it.
64
* The actual index can be determined with getIndex(UITER_CURRENT)
65
* which will count the UChars if necessary.
69
UITER_UNKNOWN_INDEX=-2
/**
 * Constant for UCharIterator getState() indicating an error or
 * an unknown state.
 * Returned by uiter_getState()/UCharIteratorGetState
 * when an error occurs.
 * Also, some UCharIterator implementations may not be able to return
 * a valid state for each position. This will be clearly documented
 * for each such iterator (none of the public ones here).
 */
#define UITER_NO_STATE ((uint32_t)0xffffffff)
87
* Function type declaration for UCharIterator.getIndex().
89
* Gets the current position, or the start or limit of the
92
* This function may perform slowly for UITER_CURRENT after setState() was called,
93
* or for UITER_LENGTH, because an iterator implementation may have to count
94
* UChars if the underlying storage is not UTF-16.
96
* @param iter the UCharIterator structure ("this pointer")
97
* @param origin get the 0, start, limit, length, or current index
98
* @return the requested index, or U_SENTINEL in an error condition
100
* @see UCharIteratorOrigin
104
typedef int32_t U_CALLCONV
105
UCharIteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin);
108
* Function type declaration for UCharIterator.move().
110
* Use iter->move(iter, index, UITER_ZERO) like CharacterIterator::setIndex(index).
112
* Moves the current position relative to the start or limit of the
113
* iteration range, or relative to the current position itself.
114
* The movement is expressed in numbers of code units forward
115
* or backward by specifying a positive or negative delta.
116
* Out of bounds movement will be pinned to the start or limit.
118
* This function may perform slowly for moving relative to UITER_LENGTH
119
* because an iterator implementation may have to count the rest of the
120
* UChars if the native storage is not UTF-16.
122
* When moving relative to the limit or length, or
123
* relative to the current position after setState() was called,
124
* move() may return UITER_UNKNOWN_INDEX (-2) to avoid an inefficient
125
* determination of the actual UTF-16 index.
126
* The actual index can be determined with getIndex(UITER_CURRENT)
127
* which will count the UChars if necessary.
128
* See UITER_UNKNOWN_INDEX for details.
130
* @param iter the UCharIterator structure ("this pointer")
131
* @param delta can be positive, zero, or negative
132
* @param origin move relative to the 0, start, limit, length, or current index
133
* @return the new index, or U_SENTINEL on an error condition,
134
* or UITER_UNKNOWN_INDEX when the index is not known.
136
* @see UCharIteratorOrigin
138
* @see UITER_UNKNOWN_INDEX
141
typedef int32_t U_CALLCONV
142
UCharIteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin);
145
* Function type declaration for UCharIterator.hasNext().
147
* Check if current() and next() can still
148
* return another code unit.
150
* @param iter the UCharIterator structure ("this pointer")
151
* @return boolean value for whether current() and next() can still return another code unit
156
typedef UBool U_CALLCONV
157
UCharIteratorHasNext(UCharIterator *iter);
160
* Function type declaration for UCharIterator.hasPrevious().
162
* Check if previous() can still return another code unit.
164
* @param iter the UCharIterator structure ("this pointer")
165
* @return boolean value for whether previous() can still return another code unit
170
typedef UBool U_CALLCONV
171
UCharIteratorHasPrevious(UCharIterator *iter);
174
* Function type declaration for UCharIterator.current().
176
* Return the code unit at the current position,
177
* or U_SENTINEL if there is none (index is at the limit).
179
* @param iter the UCharIterator structure ("this pointer")
180
* @return the current code unit
185
typedef UChar32 U_CALLCONV
186
UCharIteratorCurrent(UCharIterator *iter);
189
* Function type declaration for UCharIterator.next().
191
* Return the code unit at the current index and increment
192
* the index (post-increment, like s[i++]),
193
* or return U_SENTINEL if there is none (index is at the limit).
195
* @param iter the UCharIterator structure ("this pointer")
196
* @return the current code unit (and post-increment the current index)
201
typedef UChar32 U_CALLCONV
202
UCharIteratorNext(UCharIterator *iter);
205
* Function type declaration for UCharIterator.previous().
207
* Decrement the index and return the code unit from there
208
* (pre-decrement, like s[--i]),
209
* or return U_SENTINEL if there is none (index is at the start).
211
* @param iter the UCharIterator structure ("this pointer")
212
* @return the previous code unit (after pre-decrementing the current index)
217
typedef UChar32 U_CALLCONV
218
UCharIteratorPrevious(UCharIterator *iter);
221
* Function type declaration for UCharIterator.reservedFn().
222
* Reserved for future use.
224
* @param iter the UCharIterator structure ("this pointer")
225
* @param something some integer argument
226
* @return some integer
231
typedef int32_t U_CALLCONV
232
UCharIteratorReserved(UCharIterator *iter, int32_t something);
235
* Function type declaration for UCharIterator.getState().
237
* Get the "state" of the iterator in the form of a single 32-bit word.
238
* It is recommended that the state value be calculated to be as small as
239
* is feasible. For strings with limited lengths, fewer than 32 bits may
242
* This is used together with setState()/UCharIteratorSetState
243
* to save and restore the iterator position more efficiently than with
246
* The iterator state is defined as a uint32_t value because it is designed
247
* for use in ucol_nextSortKeyPart() which provides 32 bits to store the state
248
* of the character iterator.
250
* With some UCharIterator implementations (e.g., UTF-8),
251
* getting and setting the UTF-16 index with existing functions
252
* (getIndex(UITER_CURRENT) followed by move(pos, UITER_ZERO)) is possible but
253
* relatively slow because the iterator has to "walk" from a known index
254
* to the requested one.
255
* This takes more time the farther it needs to go.
257
* An opaque state value allows an iterator implementation to provide
258
* an internal index (UTF-8: the source byte array index) for
259
* fast, constant-time restoration.
261
* After calling setState(), a getIndex(UITER_CURRENT) may be slow because
262
* the UTF-16 index may not be restored as well, but the iterator can deliver
263
* the correct text contents and move relative to the current position
264
* without performance degradation.
266
* Some UCharIterator implementations may not be able to return
267
* a valid state for each position, in which case they return UITER_NO_STATE instead.
268
* This will be clearly documented for each such iterator (none of the public ones here).
270
* @param iter the UCharIterator structure ("this pointer")
271
* @return the state word
274
* @see UCharIteratorSetState
275
* @see UITER_NO_STATE
278
typedef uint32_t U_CALLCONV
279
UCharIteratorGetState(const UCharIterator *iter);
282
* Function type declaration for UCharIterator.setState().
284
* Restore the "state" of the iterator using a state word from a getState() call.
285
* The iterator object need not be the same one as for which getState() was called,
286
* but it must be of the same type (set up using the same uiter_setXYZ function)
287
* and it must iterate over the same string
288
* (binary identical regardless of memory address).
289
* For more about the state word see UCharIteratorGetState.
291
* After calling setState(), a getIndex(UITER_CURRENT) may be slow because
292
* the UTF-16 index may not be restored as well, but the iterator can deliver
293
* the correct text contents and move relative to the current position
294
* without performance degradation.
296
* @param iter the UCharIterator structure ("this pointer")
297
* @param state the state word from a getState() call
298
* on a same-type, same-string iterator
299
* @param pErrorCode Must be a valid pointer to an error code value,
300
* which must not indicate a failure before the function call.
303
* @see UCharIteratorGetState
306
typedef void U_CALLCONV
307
UCharIteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode);
311
* C API for code unit iteration.
312
* This can be used as a C wrapper around
313
* CharacterIterator, Replaceable, or implemented using simple strings, etc.
315
* There are two roles for using UCharIterator:
317
* A "provider" sets the necessary function pointers and controls the "protected"
318
* fields of the UCharIterator structure. A "provider" passes a UCharIterator
319
* into C APIs that need a UCharIterator as an abstract, flexible string interface.
321
* Implementations of such C APIs are "callers" of UCharIterator functions;
322
* they only use the "public" function pointers and never access the "protected" fields directly.
325
* The current() and next() functions only check the current index against the
326
* limit, and previous() only checks the current index against the start,
327
* to see if the iterator already reached the end of the iteration range.
329
* The assumption - in all iterators - is that the index is moved via the API,
330
* which means it won't go out of bounds, or the index is modified by
331
* user code that knows enough about the iterator implementation to set valid index values.
334
* UCharIterator functions return code unit values 0..0xffff,
335
* or U_SENTINEL if the iteration bounds are reached.
339
struct UCharIterator {
341
* (protected) Pointer to string or wrapped object or similar.
342
* Not used by caller.
348
* (protected) Length of string or similar.
349
* Not used by caller.
355
* (protected) Start index or similar.
356
* Not used by caller.
362
* (protected) Current index or similar.
363
* Not used by caller.
369
* (protected) Limit index or similar.
370
* Not used by caller.
376
* (protected) Used by UTF-8 iterators and possibly others.
379
int32_t reservedField;
382
* (public) Returns the current position or the
383
* start or limit index of the iteration range.
385
* @see UCharIteratorGetIndex
388
UCharIteratorGetIndex *getIndex;
391
* (public) Moves the current position relative to the start or limit of the
392
* iteration range, or relative to the current position itself.
393
* The movement is expressed in numbers of code units forward
394
* or backward by specifying a positive or negative delta.
396
* @see UCharIteratorMove
399
UCharIteratorMove *move;
402
* (public) Check if current() and next() can still
403
* return another code unit.
405
* @see UCharIteratorHasNext
408
UCharIteratorHasNext *hasNext;
411
* (public) Check if previous() can still return another code unit.
413
* @see UCharIteratorHasPrevious
416
UCharIteratorHasPrevious *hasPrevious;
419
* (public) Return the code unit at the current position,
420
* or U_SENTINEL if there is none (index is at the limit).
422
* @see UCharIteratorCurrent
425
UCharIteratorCurrent *current;
428
* (public) Return the code unit at the current index and increment
429
* the index (post-increment, like s[i++]),
430
* or return U_SENTINEL if there is none (index is at the limit).
432
* @see UCharIteratorNext
435
UCharIteratorNext *next;
438
* (public) Decrement the index and return the code unit from there
439
* (pre-decrement, like s[--i]),
440
* or return U_SENTINEL if there is none (index is at the start).
442
* @see UCharIteratorPrevious
445
UCharIteratorPrevious *previous;
448
* (public) Reserved for future use. Currently NULL.
450
* @see UCharIteratorReserved
453
UCharIteratorReserved *reservedFn;
456
* (public) Return the state of the iterator, to be restored later with setState().
457
* This function pointer is NULL if the iterator does not implement it.
459
* @see UCharIteratorGetState
462
UCharIteratorGetState *getState;
465
* (public) Restore the iterator state from the state word from a call to getState().
467
* This function pointer is NULL if the iterator does not implement it.
469
* @see UCharIteratorSetState
472
UCharIteratorSetState *setState;
476
* Helper function for UCharIterator to get the code point
477
* at the current index.
479
* Return the code point that includes the code unit at the current position,
480
* or U_SENTINEL if there is none (index is at the limit).
481
* If the current code unit is a lead or trail surrogate,
482
* then the following or preceding surrogate is used to form
483
* the code point value.
485
* @param iter the UCharIterator structure ("this pointer")
486
* @return the current code point
490
* @see UnicodeString::char32At()
493
U_STABLE UChar32 U_EXPORT2
494
uiter_current32(UCharIterator *iter);
497
* Helper function for UCharIterator to get the next code point.
499
* Return the code point at the current index and increment
500
* the index (post-increment, like s[i++]),
501
* or return U_SENTINEL if there is none (index is at the limit).
503
* @param iter the UCharIterator structure ("this pointer")
504
* @return the current code point (and post-increment the current index)
510
U_STABLE UChar32 U_EXPORT2
511
uiter_next32(UCharIterator *iter);
514
* Helper function for UCharIterator to get the previous code point.
516
* Decrement the index and return the code point from there
517
* (pre-decrement, like s[--i]),
518
* or return U_SENTINEL if there is none (index is at the start).
520
* @param iter the UCharIterator structure ("this pointer")
521
* @return the previous code point (after pre-decrementing the current index)
527
U_STABLE UChar32 U_EXPORT2
528
uiter_previous32(UCharIterator *iter);
531
* Get the "state" of the iterator in the form of a single 32-bit word.
532
* This is a convenience function that calls iter->getState(iter)
533
* if iter->getState is not NULL;
534
* if it is NULL or any other error occurs, then UITER_NO_STATE is returned.
536
* Some UCharIterator implementations may not be able to return
537
* a valid state for each position, in which case they return UITER_NO_STATE instead.
538
* This will be clearly documented for each such iterator (none of the public ones here).
540
* @param iter the UCharIterator structure ("this pointer")
541
* @return the state word
544
* @see UCharIteratorGetState
545
* @see UITER_NO_STATE
548
U_STABLE uint32_t U_EXPORT2
549
uiter_getState(const UCharIterator *iter);
552
* Restore the "state" of the iterator using a state word from a getState() call.
553
* This is a convenience function that calls iter->setState(iter, state, pErrorCode)
554
* if iter->setState is not NULL; if it is NULL, then U_UNSUPPORTED_ERROR is set.
556
* @param iter the UCharIterator structure ("this pointer")
557
* @param state the state word from a getState() call
558
* on a same-type, same-string iterator
559
* @param pErrorCode Must be a valid pointer to an error code value,
560
* which must not indicate a failure before the function call.
563
* @see UCharIteratorSetState
566
U_STABLE void U_EXPORT2
567
uiter_setState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode);
570
* Set up a UCharIterator to iterate over a string.
572
* Sets the UCharIterator function pointers for iteration over the string s
573
* with iteration boundaries start=index=0 and length=limit=string length.
574
* The "provider" may set the start, index, and limit values at any time
575
* within the range 0..length.
576
* The length field will be ignored.
578
* The string pointer s is set into UCharIterator.context without copying
579
* or reallocating the string contents.
581
* getState() simply returns the current index.
582
* move() will always return the final index.
584
* @param iter UCharIterator structure to be set for iteration
585
* @param s String to iterate over
586
* @param length Length of s, or -1 if NUL-terminated
591
U_STABLE void U_EXPORT2
592
uiter_setString(UCharIterator *iter, const UChar *s, int32_t length);
595
* Set up a UCharIterator to iterate over a UTF-16BE string
596
* (byte vector with a big-endian pair of bytes per UChar).
598
* Everything works just like with a normal UChar iterator (uiter_setString),
599
* except that UChars are assembled from byte pairs,
600
* and that the length argument here indicates an even number of bytes.
602
* getState() simply returns the current index.
603
* move() will always return the final index.
605
* @param iter UCharIterator structure to be set for iteration
606
* @param s UTF-16BE string to iterate over
607
* @param length Length of s as an even number of bytes, or -1 if NUL-terminated
608
* (NUL means pair of 0 bytes at even index from s)
611
* @see uiter_setString
614
U_STABLE void U_EXPORT2
615
uiter_setUTF16BE(UCharIterator *iter, const char *s, int32_t length);
618
* Set up a UCharIterator to iterate over a UTF-8 string.
620
* Sets the UCharIterator function pointers for iteration over the UTF-8 string s
621
* with UTF-8 iteration boundaries 0 and length.
622
* The implementation counts the UTF-16 index on the fly and
623
* lazily evaluates the UTF-16 length of the text.
625
* The start field is used as the UTF-8 offset, the limit field as the UTF-8 length.
626
* When the reservedField is not 0, then it contains a supplementary code point
627
* and the UTF-16 index is between the two corresponding surrogates.
628
* At that point, the UTF-8 index is behind that code point.
630
* The UTF-8 string pointer s is set into UCharIterator.context without copying
631
* or reallocating the string contents.
633
* getState() returns a state value consisting of
634
* - the current UTF-8 source byte index (bits 31..1)
635
* - a flag (bit 0) that indicates whether the UChar position is in the middle
636
* of a surrogate pair
637
* (from a 4-byte UTF-8 sequence for the corresponding supplementary code point)
639
* getState() cannot also encode the UTF-16 index in the state value.
640
* move(relative to limit or length), or
641
* move(relative to current) after setState(), may return UITER_UNKNOWN_INDEX.
643
* @param iter UCharIterator structure to be set for iteration
644
* @param s UTF-8 string to iterate over
645
* @param length Length of s in bytes, or -1 if NUL-terminated
650
U_STABLE void U_EXPORT2
651
uiter_setUTF8(UCharIterator *iter, const char *s, int32_t length);
656
* Set up a UCharIterator to wrap around a C++ CharacterIterator.
658
* Sets the UCharIterator function pointers for iteration using the
659
* CharacterIterator charIter.
661
* The CharacterIterator pointer charIter is set into UCharIterator.context
662
* without copying or cloning the CharacterIterator object.
663
* The other "protected" UCharIterator fields are set to 0 and will be ignored.
664
* The iteration index and boundaries are controlled by the CharacterIterator.
666
* getState() simply returns the current index.
667
* move() will always return the final index.
669
* @param iter UCharIterator structure to be set for iteration
670
* @param charIter CharacterIterator to wrap
675
U_STABLE void U_EXPORT2
676
uiter_setCharacterIterator(UCharIterator *iter, CharacterIterator *charIter);
679
* Set up a UCharIterator to iterate over a C++ Replaceable.
681
* Sets the UCharIterator function pointers for iteration over the
682
* Replaceable rep with iteration boundaries start=index=0 and
683
* length=limit=rep->length().
684
* The "provider" may set the start, index, and limit values at any time
685
* within the range 0..length=rep->length().
686
* The length field will be ignored.
688
* The Replaceable pointer rep is set into UCharIterator.context without copying
689
* or cloning/reallocating the Replaceable object.
691
* getState() simply returns the current index.
692
* move() will always return the final index.
694
* @param iter UCharIterator structure to be set for iteration
695
* @param rep Replaceable to iterate over
700
U_STABLE void U_EXPORT2
701
uiter_setReplaceable(UCharIterator *iter, const Replaceable *rep);