1
/*********************************************************
2
* Copyright (C) 2008 VMware, Inc. All rights reserved.
4
* This file is part of VMware View Open Client.
5
*********************************************************/
7
*******************************************************************************
9
* Copyright (C) 2002-2006, International Business Machines
10
* Corporation and others. All Rights Reserved.
12
*******************************************************************************
15
* tab size: 8 (not used)
18
* created on: 2002jan18
19
* created by: Markus W. Scherer
27
* \brief C API: Unicode Character Iteration
32
#include "unicode/utypes.h"
37
/* Forward declaration of the C++ CharacterIterator class; only a pointer to it is needed by uiter_setCharacterIterator(). */
class CharacterIterator;
46
/* Lets C code refer to the iterator structure as plain UCharIterator, without the struct keyword. */
typedef struct UCharIterator UCharIterator; /**< C typedef for struct UCharIterator. @stable ICU 2.1 */
49
* Origin constants for UCharIterator.getIndex() and UCharIterator.move().
50
* @see UCharIteratorMove
54
/*
 * Origin selector passed to UCharIterator.getIndex() and UCharIterator.move():
 * it names the reference point (start, current, limit, zero, or length)
 * that the requested index or the movement delta is relative to.
 */
typedef enum UCharIteratorOrigin {
55
UITER_START, UITER_CURRENT, UITER_LIMIT, UITER_ZERO, UITER_LENGTH
56
} UCharIteratorOrigin;
58
/** Constants for UCharIterator. @stable ICU 2.6 */
61
* Constant value that may be returned by UCharIteratorMove
62
* indicating that the final UTF-16 index is not known, but that the move succeeded.
63
* This can occur when moving relative to limit or length, or
64
* when moving relative to the current index after a setState()
65
* when the current UTF-16 index is not known.
67
* It would be very inefficient to have to count from the beginning of the text
68
* just to get the current/limit/length index after moving relative to it.
69
* The actual index can be determined with getIndex(UITER_CURRENT)
70
* which will count the UChars if necessary.
74
UITER_UNKNOWN_INDEX=-2
79
* Constant for UCharIterator getState() indicating an error or an unknown state.
81
* Returned by uiter_getState()/UCharIteratorGetState
82
* when an error occurs.
83
* Also, some UCharIterator implementations may not be able to return
84
* a valid state for each position. This will be clearly documented
85
* for each such iterator (none of the public ones here).
89
/*
 * All-ones 32-bit state word meaning "no valid state".
 * uiter_getState() returns it when iter->getState is NULL or an error occurs,
 * and an iterator may return it for positions it cannot encode.
 */
#define UITER_NO_STATE ((uint32_t)0xffffffff)
92
* Function type declaration for UCharIterator.getIndex().
94
* Gets the current position, or the start or limit of the iteration range.
97
* This function may perform slowly for UITER_CURRENT after setState() was called,
98
* or for UITER_LENGTH, because an iterator implementation may have to count
99
* UChars if the underlying storage is not UTF-16.
101
* @param iter the UCharIterator structure ("this pointer")
102
* @param origin get the 0, start, limit, length, or current index
103
* @return the requested index, or U_SENTINEL in an error condition
105
* @see UCharIteratorOrigin
109
/*
 * Function type for the UCharIterator.getIndex slot.
 * Returns the index selected by origin, or U_SENTINEL in an error condition.
 */
typedef int32_t U_CALLCONV
110
UCharIteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin);
113
* Function type declaration for UCharIterator.move().
115
* Use iter->move(iter, index, UITER_ZERO) like CharacterIterator::setIndex(index).
117
* Moves the current position relative to the start or limit of the
118
* iteration range, or relative to the current position itself.
119
* The movement is expressed in numbers of code units forward
120
* or backward by specifying a positive or negative delta.
121
* Out of bounds movement will be pinned to the start or limit.
123
* This function may perform slowly for moving relative to UITER_LENGTH
124
* because an iterator implementation may have to count the rest of the
125
* UChars if the native storage is not UTF-16.
127
* When moving relative to the limit or length, or
128
* relative to the current position after setState() was called,
129
* move() may return UITER_UNKNOWN_INDEX (-2) to avoid an inefficient
130
* determination of the actual UTF-16 index.
131
* The actual index can be determined with getIndex(UITER_CURRENT)
132
* which will count the UChars if necessary.
133
* See UITER_UNKNOWN_INDEX for details.
135
* @param iter the UCharIterator structure ("this pointer")
136
* @param delta can be positive, zero, or negative
137
* @param origin move relative to the 0, start, limit, length, or current index
138
* @return the new index, or U_SENTINEL on an error condition,
139
* or UITER_UNKNOWN_INDEX when the index is not known.
141
* @see UCharIteratorOrigin
143
* @see UITER_UNKNOWN_INDEX
146
/*
 * Function type for the UCharIterator.move slot.
 * Moves the current position by delta code units relative to origin,
 * pinning out-of-bounds movement to the start or limit.
 * Returns the new index, U_SENTINEL on an error condition,
 * or UITER_UNKNOWN_INDEX when the resulting index is not known.
 */
typedef int32_t U_CALLCONV
147
UCharIteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin);
150
* Function type declaration for UCharIterator.hasNext().
152
* Check if current() and next() can still
153
* return another code unit.
155
* @param iter the UCharIterator structure ("this pointer")
156
* @return boolean value for whether current() and next() can still return another code unit
161
/*
 * Function type for the UCharIterator.hasNext slot.
 * Reports whether current() and next() can still return another code unit.
 */
typedef UBool U_CALLCONV
162
UCharIteratorHasNext(UCharIterator *iter);
165
* Function type declaration for UCharIterator.hasPrevious().
167
* Check if previous() can still return another code unit.
169
* @param iter the UCharIterator structure ("this pointer")
170
* @return boolean value for whether previous() can still return another code unit
175
/*
 * Function type for the UCharIterator.hasPrevious slot.
 * Reports whether previous() can still return another code unit.
 */
typedef UBool U_CALLCONV
176
UCharIteratorHasPrevious(UCharIterator *iter);
179
* Function type declaration for UCharIterator.current().
181
* Return the code unit at the current position,
182
* or U_SENTINEL if there is none (index is at the limit).
184
* @param iter the UCharIterator structure ("this pointer")
185
* @return the current code unit
190
/*
 * Function type for the UCharIterator.current slot.
 * Returns the code unit at the current position without moving,
 * or U_SENTINEL if there is none (index is at the limit).
 */
typedef UChar32 U_CALLCONV
191
UCharIteratorCurrent(UCharIterator *iter);
194
* Function type declaration for UCharIterator.next().
196
* Return the code unit at the current index and increment
197
* the index (post-increment, like s[i++]),
198
* or return U_SENTINEL if there is none (index is at the limit).
200
* @param iter the UCharIterator structure ("this pointer")
201
* @return the current code unit (and post-increment the current index)
206
/*
 * Function type for the UCharIterator.next slot.
 * Returns the code unit at the current index and then increments the index
 * (post-increment), or returns U_SENTINEL if the index is at the limit.
 */
typedef UChar32 U_CALLCONV
207
UCharIteratorNext(UCharIterator *iter);
210
* Function type declaration for UCharIterator.previous().
212
* Decrement the index and return the code unit from there
213
* (pre-decrement, like s[--i]),
214
* or return U_SENTINEL if there is none (index is at the start).
216
* @param iter the UCharIterator structure ("this pointer")
217
* @return the previous code unit (after pre-decrementing the current index)
222
/*
 * Function type for the UCharIterator.previous slot.
 * Decrements the index and returns the code unit found there
 * (pre-decrement), or returns U_SENTINEL if the index is at the start.
 */
typedef UChar32 U_CALLCONV
223
UCharIteratorPrevious(UCharIterator *iter);
226
* Function type declaration for UCharIterator.reservedFn().
227
* Reserved for future use.
229
* @param iter the UCharIterator structure ("this pointer")
230
* @param something some integer argument
231
* @return some integer
236
/*
 * Function type for the UCharIterator.reservedFn slot.
 * Reserved for future use; the argument and result have no defined meaning yet.
 */
typedef int32_t U_CALLCONV
237
UCharIteratorReserved(UCharIterator *iter, int32_t something);
240
* Function type declaration for UCharIterator.getState().
242
* Get the "state" of the iterator in the form of a single 32-bit word.
243
* It is recommended that the state value be calculated to be as small as
244
* is feasible. For strings with limited lengths, fewer than 32 bits may be sufficient.
247
* This is used together with setState()/UCharIteratorSetState
248
* to save and restore the iterator position more efficiently than with getIndex()/move().
251
* The iterator state is defined as a uint32_t value because it is designed
252
* for use in ucol_nextSortKeyPart() which provides 32 bits to store the state
253
* of the character iterator.
255
* With some UCharIterator implementations (e.g., UTF-8),
256
* getting and setting the UTF-16 index with existing functions
257
* (getIndex(UITER_CURRENT) followed by move(pos, UITER_ZERO)) is possible but
258
* relatively slow because the iterator has to "walk" from a known index
259
* to the requested one.
260
* This takes more time the farther it needs to go.
262
* An opaque state value allows an iterator implementation to provide
263
* an internal index (UTF-8: the source byte array index) for
264
* fast, constant-time restoration.
266
* After calling setState(), a getIndex(UITER_CURRENT) may be slow because
267
* the UTF-16 index may not be restored as well, but the iterator can deliver
268
* the correct text contents and move relative to the current position
269
* without performance degradation.
271
* Some UCharIterator implementations may not be able to return
272
* a valid state for each position, in which case they return UITER_NO_STATE instead.
273
* This will be clearly documented for each such iterator (none of the public ones here).
275
* @param iter the UCharIterator structure ("this pointer")
276
* @return the state word
279
* @see UCharIteratorSetState
280
* @see UITER_NO_STATE
283
/*
 * Function type for the UCharIterator.getState slot.
 * Returns an opaque 32-bit state word describing the current position,
 * to be restored later with setState(); an implementation that cannot
 * encode a position returns UITER_NO_STATE. Does not modify the iterator.
 */
typedef uint32_t U_CALLCONV
284
UCharIteratorGetState(const UCharIterator *iter);
287
* Function type declaration for UCharIterator.setState().
289
* Restore the "state" of the iterator using a state word from a getState() call.
290
* The iterator object need not be the same one as for which getState() was called,
291
* but it must be of the same type (set up using the same uiter_setXYZ function)
292
* and it must iterate over the same string
293
* (binary identical regardless of memory address).
294
* For more about the state word see UCharIteratorGetState.
296
* After calling setState(), a getIndex(UITER_CURRENT) may be slow because
297
* the UTF-16 index may not be restored as well, but the iterator can deliver
298
* the correct text contents and move relative to the current position
299
* without performance degradation.
301
* @param iter the UCharIterator structure ("this pointer")
302
* @param state the state word from a getState() call
303
* on a same-type, same-string iterator
304
* @param pErrorCode Must be a valid pointer to an error code value,
305
* which must not indicate a failure before the function call.
308
* @see UCharIteratorGetState
311
/*
 * Function type for the UCharIterator.setState slot.
 * Restores a position from a state word obtained via getState() on an
 * iterator of the same type over the same string.
 * pErrorCode must point to a valid error code that does not already
 * indicate a failure.
 */
typedef void U_CALLCONV
312
UCharIteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode);
316
* C API for code unit iteration.
317
* This can be used as a C wrapper around
318
* CharacterIterator, Replaceable, or implemented using simple strings, etc.
320
* There are two roles for using UCharIterator:
322
* A "provider" sets the necessary function pointers and controls the "protected"
323
* fields of the UCharIterator structure. A "provider" passes a UCharIterator
324
* into C APIs that need a UCharIterator as an abstract, flexible string interface.
326
* Implementations of such C APIs are "callers" of UCharIterator functions;
327
* they only use the "public" function pointers and never access the "protected" fields directly.
330
* The current() and next() functions only check the current index against the
331
* limit, and previous() only checks the current index against the start,
332
* to see if the iterator already reached the end of the iteration range.
334
* The assumption - in all iterators - is that the index is moved via the API,
335
* which means it won't go out of bounds, or the index is modified by
336
* user code that knows enough about the iterator implementation to set valid index values.
339
* UCharIterator functions return code unit values 0..0xffff,
340
* or U_SENTINEL if the iteration bounds are reached.
344
struct UCharIterator {
346
* (protected) Pointer to string or wrapped object or similar.
347
* Not used by caller.
353
* (protected) Length of string or similar.
354
* Not used by caller.
360
* (protected) Start index or similar.
361
* Not used by caller.
367
* (protected) Current index or similar.
368
* Not used by caller.
374
* (protected) Limit index or similar.
375
* Not used by caller.
381
* (protected) Used by UTF-8 iterators and possibly others.
384
int32_t reservedField;
387
* (public) Returns the current position or the
388
* start or limit index of the iteration range.
390
* @see UCharIteratorGetIndex
393
UCharIteratorGetIndex *getIndex;
396
* (public) Moves the current position relative to the start or limit of the
397
* iteration range, or relative to the current position itself.
398
* The movement is expressed in numbers of code units forward
399
* or backward by specifying a positive or negative delta.
401
* @see UCharIteratorMove
404
UCharIteratorMove *move;
407
* (public) Check if current() and next() can still
408
* return another code unit.
410
* @see UCharIteratorHasNext
413
UCharIteratorHasNext *hasNext;
416
* (public) Check if previous() can still return another code unit.
418
* @see UCharIteratorHasPrevious
421
UCharIteratorHasPrevious *hasPrevious;
424
* (public) Return the code unit at the current position,
425
* or U_SENTINEL if there is none (index is at the limit).
427
* @see UCharIteratorCurrent
430
UCharIteratorCurrent *current;
433
* (public) Return the code unit at the current index and increment
434
* the index (post-increment, like s[i++]),
435
* or return U_SENTINEL if there is none (index is at the limit).
437
* @see UCharIteratorNext
440
UCharIteratorNext *next;
443
* (public) Decrement the index and return the code unit from there
444
* (pre-decrement, like s[--i]),
445
* or return U_SENTINEL if there is none (index is at the start).
447
* @see UCharIteratorPrevious
450
UCharIteratorPrevious *previous;
453
* (public) Reserved for future use. Currently NULL.
455
* @see UCharIteratorReserved
458
UCharIteratorReserved *reservedFn;
461
* (public) Return the state of the iterator, to be restored later with setState().
462
* This function pointer is NULL if the iterator does not implement it.
464
* @see UCharIteratorGetState
467
UCharIteratorGetState *getState;
470
* (public) Restore the iterator state from the state word from a call to getState().
472
* This function pointer is NULL if the iterator does not implement it.
474
* @see UCharIteratorSetState
477
UCharIteratorSetState *setState;
481
* Helper function for UCharIterator to get the code point
482
* at the current index.
484
* Return the code point that includes the code unit at the current position,
485
* or U_SENTINEL if there is none (index is at the limit).
486
* If the current code unit is a lead or trail surrogate,
487
* then the following or preceding surrogate is used to form
488
* the code point value.
490
* @param iter the UCharIterator structure ("this pointer")
491
* @return the current code point
495
* @see UnicodeString::char32At()
498
/*
 * Returns the code point that includes the code unit at the current
 * position, pairing a lead or trail surrogate with its neighbor,
 * or U_SENTINEL if the index is at the limit.
 */
U_STABLE UChar32 U_EXPORT2
499
uiter_current32(UCharIterator *iter);
502
* Helper function for UCharIterator to get the next code point.
504
* Return the code point at the current index and increment
505
* the index (post-increment, like s[i++]),
506
* or return U_SENTINEL if there is none (index is at the limit).
508
* @param iter the UCharIterator structure ("this pointer")
509
* @return the current code point (and post-increment the current index)
515
/*
 * Returns the code point at the current index and then increments the index
 * (post-increment), or returns U_SENTINEL if the index is at the limit.
 */
U_STABLE UChar32 U_EXPORT2
516
uiter_next32(UCharIterator *iter);
519
* Helper function for UCharIterator to get the previous code point.
521
* Decrement the index and return the code point from there
522
* (pre-decrement, like s[--i]),
523
* or return U_SENTINEL if there is none (index is at the start).
525
* @param iter the UCharIterator structure ("this pointer")
526
* @return the previous code point (after pre-decrementing the current index)
532
/*
 * Decrements the index and returns the code point found there
 * (pre-decrement), or returns U_SENTINEL if the index is at the start.
 */
U_STABLE UChar32 U_EXPORT2
533
uiter_previous32(UCharIterator *iter);
536
* Get the "state" of the iterator in the form of a single 32-bit word.
537
* This is a convenience function that calls iter->getState(iter)
538
* if iter->getState is not NULL;
539
* if it is NULL or any other error occurs, then UITER_NO_STATE is returned.
541
* Some UCharIterator implementations may not be able to return
542
* a valid state for each position, in which case they return UITER_NO_STATE instead.
543
* This will be clearly documented for each such iterator (none of the public ones here).
545
* @param iter the UCharIterator structure ("this pointer")
546
* @return the state word
549
* @see UCharIteratorGetState
550
* @see UITER_NO_STATE
553
/*
 * Convenience wrapper around iter->getState(iter).
 * Returns UITER_NO_STATE when iter->getState is NULL or any other error
 * occurs, so callers need not test the function pointer themselves.
 */
U_STABLE uint32_t U_EXPORT2
554
uiter_getState(const UCharIterator *iter);
557
* Restore the "state" of the iterator using a state word from a getState() call.
558
* This is a convenience function that calls iter->setState(iter, state, pErrorCode)
559
* if iter->setState is not NULL; if it is NULL, then U_UNSUPPORTED_ERROR is set.
561
* @param iter the UCharIterator structure ("this pointer")
562
* @param state the state word from a getState() call
563
* on a same-type, same-string iterator
564
* @param pErrorCode Must be a valid pointer to an error code value,
565
* which must not indicate a failure before the function call.
568
* @see UCharIteratorSetState
571
/*
 * Convenience wrapper around iter->setState(iter, state, pErrorCode).
 * Sets U_UNSUPPORTED_ERROR when iter->setState is NULL.
 * pErrorCode must point to a valid error code that does not already
 * indicate a failure.
 */
U_STABLE void U_EXPORT2
572
uiter_setState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode);
575
* Set up a UCharIterator to iterate over a string.
577
* Sets the UCharIterator function pointers for iteration over the string s
578
* with iteration boundaries start=index=0 and length=limit=string length.
579
* The "provider" may set the start, index, and limit values at any time
580
* within the range 0..length.
581
* The length field will be ignored.
583
* The string pointer s is set into UCharIterator.context without copying
584
* or reallocating the string contents.
586
* getState() simply returns the current index.
587
* move() will always return the final index.
589
* @param iter UCharIterator structure to be set for iteration
590
* @param s String to iterate over
591
* @param length Length of s, or -1 if NUL-terminated
596
/*
 * Sets up iter to iterate over the UChar string s.
 * length is the length of s, or -1 if s is NUL-terminated.
 * s is stored in UCharIterator.context without copying, so the caller
 * keeps ownership of the string contents.
 */
U_STABLE void U_EXPORT2
597
uiter_setString(UCharIterator *iter, const UChar *s, int32_t length);
600
* Set up a UCharIterator to iterate over a UTF-16BE string
601
* (byte vector with a big-endian pair of bytes per UChar).
603
* Everything works just like with a normal UChar iterator (uiter_setString),
604
* except that UChars are assembled from byte pairs,
605
* and that the length argument here indicates an even number of bytes.
607
* getState() simply returns the current index.
608
* move() will always return the final index.
610
* @param iter UCharIterator structure to be set for iteration
611
* @param s UTF-16BE string to iterate over
612
* @param length Length of s as an even number of bytes, or -1 if NUL-terminated
613
* (NUL means pair of 0 bytes at even index from s)
616
* @see uiter_setString
619
/*
 * Sets up iter to iterate over a UTF-16BE byte vector, assembling each
 * UChar from a big-endian pair of bytes.
 * length is an even number of bytes, or -1 if s is terminated by a pair
 * of 0 bytes at an even index.
 */
U_STABLE void U_EXPORT2
620
uiter_setUTF16BE(UCharIterator *iter, const char *s, int32_t length);
623
* Set up a UCharIterator to iterate over a UTF-8 string.
625
* Sets the UCharIterator function pointers for iteration over the UTF-8 string s
626
* with UTF-8 iteration boundaries 0 and length.
627
* The implementation counts the UTF-16 index on the fly and
628
* lazily evaluates the UTF-16 length of the text.
630
* The start field is used as the UTF-8 offset, the limit field as the UTF-8 length.
631
* When the reservedField is not 0, then it contains a supplementary code point
632
* and the UTF-16 index is between the two corresponding surrogates.
633
* At that point, the UTF-8 index is behind that code point.
635
* The UTF-8 string pointer s is set into UCharIterator.context without copying
636
* or reallocating the string contents.
638
* getState() returns a state value consisting of
639
* - the current UTF-8 source byte index (bits 31..1)
640
* - a flag (bit 0) that indicates whether the UChar position is in the middle
641
* of a surrogate pair
642
* (from a 4-byte UTF-8 sequence for the corresponding supplementary code point)
644
* getState() cannot also encode the UTF-16 index in the state value.
645
* move(relative to limit or length), or
646
* move(relative to current) after setState(), may return UITER_UNKNOWN_INDEX.
648
* @param iter UCharIterator structure to be set for iteration
649
* @param s UTF-8 string to iterate over
650
* @param length Length of s in bytes, or -1 if NUL-terminated
655
/*
 * Sets up iter to iterate over the UTF-8 string s.
 * length is the length of s in bytes, or -1 if s is NUL-terminated.
 * s is stored in UCharIterator.context without copying.
 * With this iterator, move() relative to limit or length, or relative to
 * current after setState(), may return UITER_UNKNOWN_INDEX.
 */
U_STABLE void U_EXPORT2
656
uiter_setUTF8(UCharIterator *iter, const char *s, int32_t length);
661
* Set up a UCharIterator to wrap around a C++ CharacterIterator.
663
* Sets the UCharIterator function pointers for iteration using the
664
* CharacterIterator charIter.
666
* The CharacterIterator pointer charIter is set into UCharIterator.context
667
* without copying or cloning the CharacterIterator object.
668
* The other "protected" UCharIterator fields are set to 0 and will be ignored.
669
* The iteration index and boundaries are controlled by the CharacterIterator.
671
* getState() simply returns the current index.
672
* move() will always return the final index.
674
* @param iter UCharIterator structure to be set for iteration
675
* @param charIter CharacterIterator to wrap
680
/*
 * Sets up iter as a wrapper around the C++ CharacterIterator charIter.
 * charIter is stored in UCharIterator.context without copying or cloning;
 * the iteration index and boundaries are controlled by the CharacterIterator.
 */
U_STABLE void U_EXPORT2
681
uiter_setCharacterIterator(UCharIterator *iter, U_NAMESPACE_QUALIFIER CharacterIterator *charIter);
684
* Set up a UCharIterator to iterate over a C++ Replaceable.
686
* Sets the UCharIterator function pointers for iteration over the
687
* Replaceable rep with iteration boundaries start=index=0 and
688
* length=limit=rep->length().
689
* The "provider" may set the start, index, and limit values at any time
690
* within the range 0..length=rep->length().
691
* The length field will be ignored.
693
* The Replaceable pointer rep is set into UCharIterator.context without copying
694
* or cloning/reallocating the Replaceable object.
696
* getState() simply returns the current index.
697
* move() will always return the final index.
699
* @param iter UCharIterator structure to be set for iteration
700
* @param rep Replaceable to iterate over
705
/*
 * Sets up iter to iterate over the C++ Replaceable rep, with boundaries
 * start=index=0 and length=limit=rep->length().
 * rep is stored in UCharIterator.context without copying or cloning.
 */
U_STABLE void U_EXPORT2
706
uiter_setReplaceable(UCharIterator *iter, const U_NAMESPACE_QUALIFIER Replaceable *rep);