2
*************************************************************************
4
* Copyright (c) 1996-2001, International Business Machines Corporation and
5
* others. All Rights Reserved.
6
*************************************************************************
9
#include "unicode/utypes.h"
10
#include "unicode/unistr.h"
11
#include "unicode/chariter.h"
12
#include "unicode/schriter.h"
13
#include "unicode/uchriter.h"
14
#include "unicode/uiter.h"
15
#include "unicode/normlzr.h"
21
//-------------------------------------------------------------------------
22
// Constructors and other boilerplate
23
//-------------------------------------------------------------------------
25
// Builds a Normalizer that iterates over the contents of a UnicodeString.
// The requested mode is stored as-is, no options are set (fOptions = 0), both
// input indices and the buffer position start at 0, and buffer is
// default-constructed.
Normalizer::Normalizer(const UnicodeString& str, UNormalizationMode mode) :
26
fUMode(mode), fOptions(0),
27
currentIndex(0), nextIndex(0),
28
buffer(), bufferPos(0)
30
// Hands init() a newly allocated StringCharacterIterator over str.
init(new StringCharacterIterator(str));
33
// Builds a Normalizer over a UChar array of the given length.
// Member initialization is the same as for the UnicodeString constructor:
// mode stored as-is, fOptions = 0, indices and buffer position at 0.
Normalizer::Normalizer(const UChar *str, int32_t length, UNormalizationMode mode) :
34
fUMode(mode), fOptions(0),
35
currentIndex(0), nextIndex(0),
36
buffer(), bufferPos(0)
38
// Hands init() a newly allocated UCharCharacterIterator over (str, length).
init(new UCharCharacterIterator(str, length));
41
// Builds a Normalizer from an existing CharacterIterator.
// Only the member-initializer list is visible in this view; the statement
// that consumes `iter` is not shown, so how it is used is not documented here.
Normalizer::Normalizer(const CharacterIterator& iter, UNormalizationMode mode) :
42
fUMode(mode), fOptions(0),
43
currentIndex(0), nextIndex(0),
44
buffer(), bufferPos(0)
49
// deprecated constructors
51
// Deprecated-API constructor over a UnicodeString (see the "deprecated
// constructors" banner above). The rest of the parameter list is not visible
// in this view.
Normalizer::Normalizer(const UnicodeString& str,
53
// `mode` is passed through getUMode() before being stored in fUMode;
// no options are set.
fUMode(getUMode(mode)), fOptions(0),
54
currentIndex(0), nextIndex(0),
55
buffer(), bufferPos(0)
57
// Hands init() a newly allocated StringCharacterIterator over str.
init(new StringCharacterIterator(str));
60
// Deprecated-API constructor over a UnicodeString that also takes options.
// The rest of the parameter list is not visible in this view.
Normalizer::Normalizer(const UnicodeString& str,
63
// `mode` goes through getUMode(); `options` is stored unchanged in fOptions.
fUMode(getUMode(mode)), fOptions(options),
64
currentIndex(0), nextIndex(0),
65
buffer(), bufferPos(0)
67
// Hands init() a newly allocated StringCharacterIterator over str.
init(new StringCharacterIterator(str));
70
// Deprecated-API constructor over a UChar array of the given length, taking
// an EMode instead of a UNormalizationMode.
Normalizer::Normalizer(const UChar *str, int32_t length, EMode mode) :
71
// The EMode is converted with getUMode() before being stored; no options set.
fUMode(getUMode(mode)), fOptions(0),
72
currentIndex(0), nextIndex(0),
73
buffer(), bufferPos(0)
75
// Hands init() a newly allocated UCharCharacterIterator over (str, length).
init(new UCharCharacterIterator(str, length));
78
// Deprecated-API constructor from an existing CharacterIterator.
// Only the first signature line and the initializer list are visible here;
// the remaining parameters and the body are not shown.
Normalizer::Normalizer(const CharacterIterator& iter,
80
// `mode` goes through getUMode(); no options are set.
fUMode(getUMode(mode)), fOptions(0),
81
currentIndex(0), nextIndex(0),
82
buffer(), bufferPos(0)
87
// Deprecated-API constructor from an existing CharacterIterator that also
// takes options. Only the first signature line and the initializer list are
// visible here; the remaining parameters and the body are not shown.
Normalizer::Normalizer(const CharacterIterator& iter,
90
// `mode` goes through getUMode(); `options` is stored unchanged in fOptions.
fUMode(getUMode(mode)), fOptions(options),
91
currentIndex(0), nextIndex(0),
92
buffer(), bufferPos(0)
97
// Copy constructor: copies mode, options, both input indices, the buffer and
// the buffer position from `copy`.
// NOTE(review): the parameter is spelled with a copyright sign below; the
// initializers all refer to `copy`, so this looks like a mis-decoded "&copy"
// (reference parameter named copy) -- confirm and repair in the real file.
Normalizer::Normalizer(const Normalizer ©) :
98
fUMode(copy.fUMode), fOptions(copy.fOptions),
99
currentIndex(copy.currentIndex), nextIndex(copy.nextIndex),
100
buffer(copy.buffer), bufferPos(copy.bufferPos)
102
// The source's underlying CharacterIterator (stored in text->context) is
// cloned, so this object gets its own iterator instance via init().
init(((CharacterIterator *)(copy.text->context))->clone());
105
// A single zero UChar. init() passes its address with length 0 to build an
// empty UCharCharacterIterator.
static const UChar _NUL=0;
108
// Shared constructor helper: allocates the UCharIterator `text` and binds a
// CharacterIterator to it with uiter_setCharacterIterator().
Normalizer::init(CharacterIterator *iter) {
109
UErrorCode errorCode=U_ZERO_ERROR;
111
// NOTE(review): the result of this allocation is not checked in the visible code.
text=new UCharIterator;
113
// When unorm_haveData() reports true, wrap the iterator supplied by the caller.
if(unorm_haveData(&errorCode)) {
114
uiter_setCharacterIterator(text, iter);
117
// Presumably the alternative branch (its `else` line and anything else between
// the two calls are not visible in this view): wrap an empty iterator over
// _NUL with length 0 instead.
uiter_setCharacterIterator(text, new UCharCharacterIterator(&_NUL, 0));
121
// Destructor. The visible statement deletes the CharacterIterator stored in
// text->context; any further cleanup is not visible in this view.
Normalizer::~Normalizer()
123
delete (CharacterIterator *)text->context;
128
// Returns a heap-allocated copy of this object made with the copy constructor.
Normalizer::clone() const
131
return new Normalizer(*this);
138
* Generates a hash code for this iterator.
140
// The hash is the plain sum of the underlying CharacterIterator's hash, the
// mode, the options, the buffer's hash, the buffer position and both input
// indices.
int32_t Normalizer::hashCode() const
142
return ((CharacterIterator *)(text->context))->hashCode() + fUMode + fOptions + buffer.hashCode() + bufferPos + currentIndex + nextIndex;
145
// Equality: the visible comparisons require equal mode, options, underlying
// CharacterIterator (compared by value through text->context), buffer
// contents, buffer position and nextIndex. The start of the expression is not
// visible in this view.
// NOTE(review): currentIndex contributes to hashCode() but is not among the
// comparisons visible here -- confirm that is intentional.
UBool Normalizer::operator==(const Normalizer& that) const
149
fUMode==that.fUMode &&
150
fOptions==that.fOptions &&
151
*((CharacterIterator *)(text->context))==*((CharacterIterator *)(that.text->context)) &&
152
buffer==that.buffer &&
153
bufferPos==that.bufferPos &&
154
nextIndex==that.nextIndex;
157
//-------------------------------------------------------------------------
158
// Static utility methods
159
//-------------------------------------------------------------------------
162
// Normalizes `source` into `result` with unorm_internalNormalize().
// `status` is both an input (a failure code short-circuits the call) and the
// output error code. Several lines of this function (trailing call arguments,
// else/closing braces, some statements) are not visible in this view.
Normalizer::normalize(const UnicodeString& source,
163
UNormalizationMode mode, int32_t options,
164
UnicodeString& result,
165
UErrorCode &status) {
166
// Reject a bogus source string or an already-failed status.
if(source.isBogus() || U_FAILURE(status)) {
168
// Only a bogus source reaches this assignment: a success status is replaced
// by U_ILLEGAL_ARGUMENT_ERROR, an existing failure code is left untouched.
if(U_SUCCESS(status)) {
169
status=U_ILLEGAL_ARGUMENT_ERROR;
172
// First attempt: ask result for a writable buffer of at least source.length().
UChar *buffer=result.getBuffer(source.length());
173
int32_t length=unorm_internalNormalize(buffer, result.getCapacity(),
174
source.getBuffer(), source.length(),
175
// Of `options`, only the IGNORE_HANGUL bit is consulted, reduced to a boolean.
mode, (options&IGNORE_HANGUL)!=0,
177
result.releaseBuffer(length);
178
// Overflow: retry once with a buffer sized to the length the first call reported.
if(status==U_BUFFER_OVERFLOW_ERROR) {
180
buffer=result.getBuffer(length);
181
length=unorm_internalNormalize(buffer, result.getCapacity(),
182
source.getBuffer(), source.length(),
183
mode, (options&IGNORE_HANGUL)!=0,
185
result.releaseBuffer(length);
188
// Final failure check; the statement it guards is not visible in this view.
if(U_FAILURE(status)) {
194
// Thin wrapper around unorm_quickCheck() for a UnicodeString source.
// The early-exit value for a failed `status` and the trailing call arguments
// are not visible in this view.
UNormalizationCheckResult
195
Normalizer::quickCheck(const UnicodeString& source,
196
UNormalizationMode mode,
197
UErrorCode &status) {
198
// An incoming failure code takes the early-exit path.
if(U_FAILURE(status)) {
202
return unorm_quickCheck(source.getBuffer(), source.length(),
207
// Composes `source` into `result` with unorm_compose(); `compat` is forwarded
// to that call unchanged. Same two-pass buffer pattern as normalize().
// Several lines (trailing call arguments, else/closing braces, some
// statements) are not visible in this view.
Normalizer::compose(const UnicodeString& source,
208
UBool compat, int32_t options,
209
UnicodeString& result,
210
UErrorCode &status) {
211
// Reject a bogus source string or an already-failed status.
if(source.isBogus() || U_FAILURE(status)) {
213
// A success status here means the source was bogus: report an illegal argument.
if(U_SUCCESS(status)) {
214
status=U_ILLEGAL_ARGUMENT_ERROR;
217
// First attempt with a buffer of at least source.length().
UChar *buffer=result.getBuffer(source.length());
218
int32_t length=unorm_compose(buffer, result.getCapacity(),
219
source.getBuffer(), source.length(),
220
// Of `options`, only the IGNORE_HANGUL bit is consulted, reduced to a boolean.
compat, (options&IGNORE_HANGUL)!=0,
222
result.releaseBuffer(length);
223
// Overflow: retry once with a buffer sized to the reported length.
if(status==U_BUFFER_OVERFLOW_ERROR) {
225
buffer=result.getBuffer(length);
226
length=unorm_compose(buffer, result.getCapacity(),
227
source.getBuffer(), source.length(),
228
compat, (options&IGNORE_HANGUL)!=0,
230
result.releaseBuffer(length);
233
// Final failure check; the statement it guards is not visible in this view.
if(U_FAILURE(status)) {
240
// Decomposes `source` into `result` with unorm_decompose(); `compat` is
// forwarded to that call unchanged. Same two-pass buffer pattern as compose().
// Several lines (trailing call arguments, else/closing braces, some
// statements) are not visible in this view.
Normalizer::decompose(const UnicodeString& source,
241
UBool compat, int32_t options,
242
UnicodeString& result,
243
UErrorCode &status) {
244
// Reject a bogus source string or an already-failed status.
if(source.isBogus() || U_FAILURE(status)) {
246
// A success status here means the source was bogus: report an illegal argument.
if(U_SUCCESS(status)) {
247
status=U_ILLEGAL_ARGUMENT_ERROR;
250
// First attempt with a buffer of at least source.length().
UChar *buffer=result.getBuffer(source.length());
251
int32_t length=unorm_decompose(buffer, result.getCapacity(),
252
source.getBuffer(), source.length(),
253
// Of `options`, only the IGNORE_HANGUL bit is consulted, reduced to a boolean.
compat, (options&IGNORE_HANGUL)!=0,
255
result.releaseBuffer(length);
256
// Overflow: retry once with a buffer sized to the reported length.
if(status==U_BUFFER_OVERFLOW_ERROR) {
258
buffer=result.getBuffer(length);
259
length=unorm_decompose(buffer, result.getCapacity(),
260
source.getBuffer(), source.length(),
261
compat, (options&IGNORE_HANGUL)!=0,
263
result.releaseBuffer(length);
266
// Final failure check; the statement it guards is not visible in this view.
if(U_FAILURE(status)) {
273
// Concatenates `left` and `right` into `result` through unorm_concatenate().
// `errorCode` is both an input (a failure code short-circuits the call) and
// the output error code. Trailing call arguments, else/closing braces and the
// return statement are not visible in this view.
Normalizer::concatenate(UnicodeString &left, UnicodeString &right,
274
UnicodeString &result,
275
UNormalizationMode mode, int32_t options,
276
UErrorCode &errorCode) {
277
// Reject bogus inputs or an already-failed error code.
if(left.isBogus() || right.isBogus() || U_FAILURE(errorCode)) {
279
// A success code here means one of the inputs was bogus.
if(U_SUCCESS(errorCode)) {
280
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
283
// First attempt: size the destination to the sum of both input lengths.
UChar *buffer=result.getBuffer(left.length()+right.length());
284
int32_t length=unorm_concatenate(left.getBuffer(), left.length(),
285
right.getBuffer(), right.length(),
286
buffer, result.getCapacity(),
289
result.releaseBuffer(length);
// Overflow: clear the error and retry with the length the first call reported.
290
if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
291
errorCode=U_ZERO_ERROR;
292
buffer=result.getBuffer(length);
293
// NOTE(review): this declares a second `length` inside the `if` opened above,
// shadowing the earlier one; the getBuffer(length) call just before it still
// uses the first value. A plain assignment would match normalize()/compose().
int32_t length=unorm_concatenate(left.getBuffer(), left.length(),
294
right.getBuffer(), right.length(),
295
buffer, result.getCapacity(),
298
result.releaseBuffer(length);
301
// Final failure check; the statement it guards is not visible in this view.
if(U_FAILURE(errorCode)) {
308
//-------------------------------------------------------------------------
310
//-------------------------------------------------------------------------
313
* Return the current character in the normalized text.
315
// Returns the code point at the current buffer position without advancing.
// If the buffer has been fully consumed, nextNormalize() is called first to
// refill it. The path taken when that also fails is not visible in this view.
UChar32 Normalizer::current() {
316
if(bufferPos<buffer.length() || nextNormalize()) {
317
return buffer.char32At(bufferPos);
324
* Return the next character in the normalized text and advance
325
* the iteration position by one. If the end
326
* of the text has already been reached, {@link #DONE} is returned.
328
// Reads the code point at the buffer position and advances past it.
// If the buffer has been fully consumed, nextNormalize() is called first to
// refill it. The return statements are not visible in this view.
UChar32 Normalizer::next() {
329
if(bufferPos<buffer.length() || nextNormalize()) {
330
UChar32 c=buffer.char32At(bufferPos);
331
// Advance by the number of UTF-16 code units that c occupies.
bufferPos+=UTF_CHAR_LENGTH(c);
339
* Return the previous character in the normalized text and decrement
340
* the iteration position by one. If the beginning
341
* of the text has already been reached, {@link #DONE} is returned.
343
// Reads the code point that ends just before the buffer position and steps
// back over it. If the position is already at the start of the buffer,
// previousNormalize() is called first to refill it. The return statements are
// not visible in this view.
UChar32 Normalizer::previous() {
344
if(bufferPos>0 || previousNormalize()) {
345
UChar32 c=buffer.char32At(bufferPos-1);
346
// Step back by the number of UTF-16 code units that c occupies.
bufferPos-=UTF_CHAR_LENGTH(c);
353
// Moves the input iterator to offset 0 relative to UITER_START and stores the
// resulting position in both currentIndex and nextIndex. Any further
// statements are not visible in this view.
void Normalizer::reset() {
354
currentIndex=nextIndex=text->move(text, 0, UITER_START);
359
// Positions the input iterator at `index` (relative to UITER_ZERO) and stores
// the position returned by move() in both currentIndex and nextIndex. Any
// further statements are not visible in this view.
Normalizer::setIndexOnly(int32_t index) {
360
currentIndex=nextIndex=text->move(text, index, UITER_ZERO); // validates index
365
* Set the iteration position in the input text that is being normalized
366
* and return the first normalized character at that position.
368
* <b>Note:</b> This method sets the position in the <em>input</em> text,
369
* while {@link #next} and {@link #previous} iterate through characters
370
* in the normalized <em>output</em>. This means that there is not
371
* necessarily a one-to-one correspondence between characters returned
372
* by <tt>next</tt> and <tt>previous</tt> and the indices passed to and
373
* returned from <tt>setIndex</tt> and {@link #getIndex}.
375
* @param index the desired index in the input text.
377
* @return the first normalized character that is the result of iterating
378
* forward starting at the given index.
380
* @throws IllegalArgumentException if the given index is less than
381
* {@link #getBeginIndex} or greater than {@link #getEndIndex}.
383
UChar32 Normalizer::setIndex(int32_t index) {
389
* Return the first character in the normalized text. This resets
390
* the <tt>Normalizer's</tt> position to the beginning of the text.
392
UChar32 Normalizer::first() {
398
* Return the last character in the normalized text. This resets
399
* the <tt>Normalizer's</tt> position to be just before the
400
* input text corresponding to that normalized character.
402
// Moves the input iterator to offset 0 relative to UITER_LIMIT and stores the
// resulting position in both currentIndex and nextIndex. The rest of the
// function, including the returned value, is not visible in this view.
UChar32 Normalizer::last() {
403
currentIndex=nextIndex=text->move(text, 0, UITER_LIMIT);
409
* Retrieve the current iteration position in the input text that is
410
* being normalized. This method is useful in applications such as
411
* searching, where you need to be able to determine the position in
412
* the input text that corresponds to a given normalized output character.
414
* <b>Note:</b> This method returns the position in the <em>input</em>, while
415
* {@link #next} and {@link #previous} iterate through characters in the
416
* <em>output</em>. This means that there is not necessarily a one-to-one
417
* correspondence between characters returned by <tt>next</tt> and
418
* <tt>previous</tt> and the indices passed to and returned from
419
* <tt>setIndex</tt> and {@link #getIndex}.
422
// Only the branch condition is visible in this view: it tests whether the
// buffer position is still short of the buffer length. The values returned on
// either path are not shown.
int32_t Normalizer::getIndex() const {
423
if(bufferPos<buffer.length()) {
431
* Retrieve the index of the start of the input text. This is the begin index
432
* of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt>
433
* over which this <tt>Normalizer</tt> is iterating
435
// Returns whatever the input iterator reports for UITER_START.
int32_t Normalizer::startIndex() const {
436
return text->getIndex(text, UITER_START);
440
* Retrieve the index of the end of the input text. This is the end index
441
* of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
442
* over which this <tt>Normalizer</tt> is iterating
444
// Returns whatever the input iterator reports for UITER_LIMIT.
int32_t Normalizer::endIndex() const {
445
return text->getIndex(text, UITER_LIMIT);
448
//-------------------------------------------------------------------------
449
// Property access methods
450
//-------------------------------------------------------------------------
453
Normalizer::setMode(UNormalizationMode newMode)
459
Normalizer::getUMode() const
465
// Modifies the option bits in fOptions. Only part of the function is visible
// in this view: the remaining parameter(s) and the condition that selects the
// statement below are not shown.
Normalizer::setOption(int32_t option,
471
// Clears every bit of `option` from fOptions.
fOptions &= (~option);
476
// Reports whether any bit of `option` is currently set in fOptions.
Normalizer::getOption(int32_t option) const
478
return (fOptions & option) != 0;
482
* Set the input text over which this <tt>Normalizer</tt> will iterate.
483
* The iteration position is set to the beginning of the input text.
486
// Replaces the input text with a new StringCharacterIterator over newText.
// The `status` parameter declaration, the early returns and anything after
// the context swap are not visible in this view.
Normalizer::setText(const UnicodeString& newText,
489
// An incoming failure code takes the early-exit path.
if (U_FAILURE(status)) {
492
CharacterIterator *newIter = new StringCharacterIterator(newText);
493
// Allocation failure is reported through status.
if (newIter == NULL) {
494
status = U_MEMORY_ALLOCATION_ERROR;
497
// The old iterator is deleted only after the new one exists, then the
// UCharIterator's context pointer is switched to the new iterator.
delete (CharacterIterator *)(text->context);
498
text->context = newIter;
503
* Set the input text over which this <tt>Normalizer</tt> will iterate.
504
* The iteration position is set to the beginning of the string.
507
// Replaces the input text with a clone of the given CharacterIterator.
// The `status` parameter declaration, the early returns and anything after
// the context swap are not visible in this view.
Normalizer::setText(const CharacterIterator& newText,
510
// An incoming failure code takes the early-exit path.
if (U_FAILURE(status)) {
513
CharacterIterator *newIter = newText.clone();
514
// A NULL clone is reported as a memory allocation error.
if (newIter == NULL) {
515
status = U_MEMORY_ALLOCATION_ERROR;
518
// The old iterator is deleted only after the new one exists, then the
// UCharIterator's context pointer is switched to the new iterator.
delete (CharacterIterator *)(text->context);
519
text->context = newIter;
524
// Replaces the input text with a new UCharCharacterIterator over
// (newText, length). The remaining parameter declarations, the early returns
// and anything after the context swap are not visible in this view.
Normalizer::setText(const UChar* newText,
528
// An incoming failure code takes the early-exit path.
if (U_FAILURE(status)) {
531
CharacterIterator *newIter = new UCharCharacterIterator(newText, length);
532
// Allocation failure is reported through status.
if (newIter == NULL) {
533
status = U_MEMORY_ALLOCATION_ERROR;
536
// The old iterator is deleted only after the new one exists, then the
// UCharIterator's context pointer is switched to the new iterator.
delete (CharacterIterator *)(text->context);
537
text->context = newIter;
542
* Copies the text under iteration into the UnicodeString referred to by "result".
543
* @param result Receives a copy of the text under iteration.
546
// Delegates to the underlying CharacterIterator (text->context), which fills
// `result` through its own getText().
Normalizer::getText(UnicodeString& result)
548
((CharacterIterator *)(text->context))->getText(result);
551
//-------------------------------------------------------------------------
552
// Private utility methods
553
//-------------------------------------------------------------------------
555
void Normalizer::clearBuffer() {
561
// Refills `buffer` with the next piece of normalized text, reading forward
// from nextIndex with unorm_next(). Returns true only when no error occurred
// and the buffer ended up non-empty.
// The declarations of `p` and `length`, the trailing unorm_next() arguments
// and the early-exit body are not visible in this view.
Normalizer::nextNormalize() {
564
UErrorCode errorCode;
567
// The piece about to be produced starts where the previous one ended.
currentIndex=nextIndex;
568
text->move(text, nextIndex, UITER_ZERO);
569
// Nothing left to read in the input: early-exit path.
if(!text->hasNext(text)) {
573
errorCode=U_ZERO_ERROR;
574
// First attempt: getBuffer(-1) is called with no specific capacity request;
// whatever capacity the buffer has is passed on to unorm_next().
p=buffer.getBuffer(-1);
575
length=unorm_next(text, p, buffer.getCapacity(),
579
buffer.releaseBuffer(length);
580
// Overflow: clear the error, rewind the input to nextIndex and retry with a
// buffer sized to the length the first call reported.
if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
581
errorCode=U_ZERO_ERROR;
582
text->move(text, nextIndex, UITER_ZERO);
583
p=buffer.getBuffer(length);
584
length=unorm_next(text, p, buffer.getCapacity(),
588
buffer.releaseBuffer(length);
591
// Record how far the input iterator advanced.
nextIndex=text->getIndex(text, UITER_CURRENT);
592
return U_SUCCESS(errorCode) && !buffer.isEmpty();
596
// Refills `buffer` with the preceding piece of normalized text, reading
// backward from currentIndex with unorm_previous(). Returns true only when no
// error occurred and the buffer ended up non-empty.
// The declarations of `p` and `length`, the trailing unorm_previous()
// arguments and the early-exit body are not visible in this view.
Normalizer::previousNormalize() {
599
UErrorCode errorCode;
602
// The piece about to be produced ends where the current one starts.
nextIndex=currentIndex;
603
text->move(text, currentIndex, UITER_ZERO);
604
// Nothing before this point in the input: early-exit path.
if(!text->hasPrevious(text)) {
608
errorCode=U_ZERO_ERROR;
609
// First attempt: getBuffer(-1) is called with no specific capacity request;
// whatever capacity the buffer has is passed on to unorm_previous().
p=buffer.getBuffer(-1);
610
length=unorm_previous(text, p, buffer.getCapacity(),
614
buffer.releaseBuffer(length);
615
// Overflow: clear the error, rewind the input to currentIndex and retry with
// a buffer sized to the length the first call reported.
if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
616
errorCode=U_ZERO_ERROR;
617
text->move(text, currentIndex, UITER_ZERO);
618
p=buffer.getBuffer(length);
619
length=unorm_previous(text, p, buffer.getCapacity(),
623
buffer.releaseBuffer(length);
626
// Backward iteration consumes the buffer from its end.
bufferPos=buffer.length();
627
// Record how far back the input iterator moved.
currentIndex=text->getIndex(text, UITER_CURRENT);
628
return U_SUCCESS(errorCode) && !buffer.isEmpty();