2
**********************************************************************
3
* Copyright (C) 2001-2007, International Business Machines
4
* Corporation and others. All Rights Reserved.
5
**********************************************************************
6
* Date Name Description
7
* 07/03/01 aliu Creation.
8
**********************************************************************
11
#include "unicode/utypes.h"
13
#if !UCONFIG_NO_TRANSLITERATION
15
#include "unicode/uniset.h"
16
#include "unicode/uiter.h"
23
// Invokes the UOBJECT_DEFINE_RTTI_IMPLEMENTATION macro for
// NormalizationTransliterator (presumably this supplies the class-ID / RTTI
// boilerplate; the macro is defined outside this file -- confirm).
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NormalizationTransliterator)
26
* System registration hook.
28
// Registers the normalization transliterators with the system:
//  - the IDs "Any-NFC", "Any-NFKC", "Any-NFD" and "Any-NFKD" are each
//    registered through Transliterator::_registerFactory, with _create as the
//    factory function and an integerToken wrapping the matching UNORM_* mode;
//  - "NFC"/"NFD" and "NFKC"/"NFKD" are registered as special-inverse pairs
//    through Transliterator::_registerSpecialInverse.
// NOTE(review): this listing skips lines (the embedded line numbers jump from
// 30 to 34 and from 45 to 51), so the body of the unorm_haveData() check and
// the closing braces are not visible here -- confirm against the full file.
void NormalizationTransliterator::registerIDs() {
29
UErrorCode errorCode = U_ZERO_ERROR;
30
// Check whether normalization data is available. What happens when it is
// not is on the lines missing from this listing (presumably an early
// return, so that nothing is registered -- confirm).
if(!unorm_haveData(&errorCode)) {
34
// One factory registration per normalization form; the token carries the
// mode that _create later reads back from its context argument.
Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFC"),
35
_create, integerToken(UNORM_NFC));
36
Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKC"),
37
_create, integerToken(UNORM_NFKC));
38
Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFD"),
39
_create, integerToken(UNORM_NFD));
40
Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKD"),
41
_create, integerToken(UNORM_NFKD));
42
// Pair each composing form with its decomposing counterpart as inverses.
// (The TRUE argument presumably makes the relation bidirectional -- confirm
// against _registerSpecialInverse's declaration.)
Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFC"),
43
UNICODE_STRING_SIMPLE("NFD"), TRUE);
44
Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFKC"),
45
UNICODE_STRING_SIMPLE("NFKD"), TRUE);
51
// Factory function passed to Transliterator::_registerFactory by
// registerIDs(). Allocates a new NormalizationTransliterator for the given
// ID, taking the normalization mode from context.integer (cast to
// UNormalizationMode) and passing 0 as the third (opt) constructor argument.
// NOTE(review): the line declaring the `context` parameter and the braces of
// this function are missing from this listing.
Transliterator* NormalizationTransliterator::_create(const UnicodeString& ID,
53
return new NormalizationTransliterator(ID, (UNormalizationMode) context.integer, 0);
57
* Constructs a transliterator.
59
// Constructs a normalization transliterator from an ID, a normalization mode
// and an integer `opt` value. The Transliterator base class is initialized
// with the ID and 0 as its second argument.
// NOTE(review): any further member initializers and the constructor body are
// on lines missing from this listing, so how `mode` and `opt` are stored is
// not visible here.
NormalizationTransliterator::NormalizationTransliterator(
60
const UnicodeString& id,
61
UNormalizationMode mode, int32_t opt) :
62
Transliterator(id, 0) {
70
// Destructor.
// NOTE(review): the body is on lines missing from this listing.
NormalizationTransliterator::~NormalizationTransliterator() {
76
// Copy constructor; clone() uses it to duplicate this object.
// NOTE(review): the initializer list and body are on lines missing from this
// listing.
NormalizationTransliterator::NormalizationTransliterator(const NormalizationTransliterator& o) :
83
* Assignment operator.
85
/*NormalizationTransliterator& NormalizationTransliterator::operator=(const NormalizationTransliterator& o) {
86
Transliterator::operator=(o);
95
// Returns a newly allocated copy of this transliterator, created with the
// copy constructor. The object comes from `new`, so presumably the caller is
// responsible for deleting it -- confirm against the Transliterator API.
Transliterator* NormalizationTransliterator::clone(void) const {
96
return new NormalizationTransliterator(*this);
100
* Implements {@link Transliterator#handleTransliterate}.
102
// Normalizes the text between offsets.start and offsets.limit in place.
//
// The range is processed one small chunk at a time: unorm_next() reads a chunk
// through a UCharIterator wrapped around `text` and writes its normalized form
// into `output`. When unorm_next() reports that the chunk needed normalizing,
// the chunk is replaced in `text` and offsets.limit / offsets.contextLimit are
// shifted by the resulting length change. On the way out, offsets.start is set
// to the position reached.
//
// text          - the Replaceable being modified.
// offsets       - in/out positions; start, limit and contextLimit are updated.
// isIncremental - when true, a chunk that reaches the input limit gets an
//                 extra NF-skippable check before being processed (see below).
//
// NOTE(review): this listing skips lines (the embedded line numbers are not
// contiguous). The declarations of `iter`, `buffer`, `input` and `c`, some
// arguments of the unorm_next() calls, several statements and all closing
// braces are not visible here; comments below hedge wherever the missing
// lines matter.
void NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
103
UBool isIncremental) const {
104
// start and limit of the input range
105
int32_t start = offsets.start;
106
int32_t limit = offsets.limit;
107
// length: size of the normalized chunk; delta: length change caused by a replacement
int32_t length, delta;
113
// a C code unit iterator, implemented around the Replaceable
115
uiter_setReplaceable(&iter, &text);
117
// the output string and buffer pointer
118
UnicodeString output;
120
// set by unorm_next(): whether the chunk differs from its normalized form
UBool neededToNormalize;
122
// reset to U_ZERO_ERROR before each unorm_next() call below
UErrorCode errorCode;
125
// NOTE(review): the text below, and the statements that follow it up to the
// while loop, read as a block comment whose delimiters are missing from this
// listing. The statements look like the illustrative "bulk mode" example the
// text announces rather than live code: they use `input` and `options`, which
// are not declared in the visible code, and they declare `delta` a second
// time. Confirm against the full file.
* Normalize as short chunks at a time as possible even in
126
* bulk mode, so that styled text is minimally disrupted.
127
* In incremental mode, a chunk that ends with offsets.limit
128
* must not be normalized.
130
* If it was known that the input text is not styled, then
131
* a bulk mode normalization could look like this:
134
UChar staticChars[256];
137
length = limit - start;
138
input.setTo(staticChars, 0, sizeof(staticChars)/U_SIZEOF_UCHAR); // writable alias
140
_Replaceable_extractBetween(text, start, limit, input.getBuffer(length));
141
input.releaseBuffer(length);
143
UErrorCode status = U_ZERO_ERROR;
144
Normalizer::normalize(input, fMode, options, output, status);
146
text.handleReplaceBetween(start, limit, output);
148
int32_t delta = output.length() - length;
149
offsets.contextLimit += delta;
150
offsets.limit += delta;
151
offsets.start = limit + delta;
155
// Main loop: one normalization chunk per iteration until the range is consumed.
while(start < limit) {
156
// set the iterator limits for the remaining input range
157
// this is a moving target because of the replacements in the text object
158
iter.start = iter.index = start;
161
// incrementally normalize a small chunk of the input
162
// Borrow output's internal buffer for unorm_next() to write into
// (presumably -1 requests the buffer at its current capacity -- confirm).
buffer = output.getBuffer(-1);
163
errorCode = U_ZERO_ERROR;
164
// NOTE(review): some arguments of this call are on lines missing from this listing.
length = unorm_next(&iter, buffer, output.getCapacity(),
166
TRUE, &neededToNormalize,
168
// Hand the buffer back: keep `length` units on success, otherwise empty the string.
output.releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
170
if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
171
// use a larger output string buffer and do it again from the start
173
// `length` now holds the value returned by the overflowing call and is used
// as the minimum capacity for the retry. (Rewinding the iterator for the
// retry is presumably done on a line missing from this listing -- confirm.)
buffer = output.getBuffer(length);
174
errorCode = U_ZERO_ERROR;
175
length = unorm_next(&iter, buffer, output.getCapacity(),
177
TRUE, &neededToNormalize,
179
output.releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
182
// Any remaining failure; the handling is on lines missing from this listing.
if(U_FAILURE(errorCode)) {
187
// NOTE(review): for this test to be meaningful, `limit` must have been set to
// the end of the chunk just read; that assignment is not visible in this
// listing -- confirm.
if(isIncremental && limit == iter.limit) {
188
// stop in incremental mode when we reach the input limit
189
// in case there are additional characters that could change the
190
// normalization result
192
// UNLESS all characters in the result of the normalization of
193
// the last run are in the skippable set
194
const UChar *s=output.getBuffer();
195
int32_t i=0, outLength=output.length();
199
// Read the next code point of the normalized output into c
// (the enclosing loop header is on a line missing from this listing).
U16_NEXT(s, i, outLength, c);
200
if(!unorm_isNFSkippable(c, fMode)) {
201
outLength=-1; // I wish C++ had labeled loops and break outer; ...
210
if(neededToNormalize) {
211
// replace the input chunk with its normalized form
212
text.handleReplaceBetween(start, limit, output);
214
// update all necessary indexes accordingly
215
delta = length - (limit - start); // length change in the text object
216
start = limit += delta; // the next chunk starts where this one ends, with adjustment
217
limit = offsets.limit += delta; // set the iteration limit to the adjusted end of the input range
218
offsets.contextLimit += delta;
222
// NOTE(review): presumably the branch taken when no replacement was needed;
// the surrounding else and any companion statement are not visible in this
// listing -- confirm.
limit = offsets.limit;
226
// Report how far processing got back to the caller.
offsets.start = start;
231
#endif /* #if !UCONFIG_NO_TRANSLITERATION */