1
/**********************************************************************
3
* Description: Utilities to normalize and manipulate UTF-32 and UTF-8 strings.
5
* Author: Ranjith Unnikrishnan
6
* Created: Thu July 4 2013
8
* (C) Copyright 2013, Google Inc.
9
* Licensed under the Apache License, Version 2.0 (the "License");
10
* you may not use this file except in compliance with the License.
11
* You may obtain a copy of the License at
12
* http://www.apache.org/licenses/LICENSE-2.0
13
* Unless required by applicable law or agreed to in writing, software
14
* distributed under the License is distributed on an "AS IS" BASIS,
15
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
* See the License for the specific language governing permissions and
17
* limitations under the License.
19
**********************************************************************/
21
#include "normstrngs.h"
23
#include "icuerrorcode.h"
25
#include "unicode/normalizer2.h" // From libicu
26
#include "unicode/translit.h" // From libicu
27
#include "unicode/unorm2.h" // From libicu
31
void UTF8ToUTF32(const char* utf8_str, GenericVector<char32>* str32) {
33
str32->reserve(strlen(utf8_str));
34
int len = strlen(utf8_str);
36
for (int ch = 0; ch < len; ch += step) {
37
step = UNICHAR::utf8_step(utf8_str + ch);
39
UNICHAR uni_ch(utf8_str + ch, step);
40
(*str32) += uni_ch.first_uni();
45
void UTF32ToUTF8(const GenericVector<char32>& str32, STRING* utf8_str) {
46
utf8_str->ensure(str32.length());
47
utf8_str->assign("", 0);
48
for (int i = 0; i < str32.length(); ++i) {
49
UNICHAR uni_ch(str32[i]);
50
char *utf8 = uni_ch.utf8_str();
58
bool is_hyphen_punc(const char32 ch) {
59
static const int kNumHyphenPuncUnicodes = 13;
60
static const char32 kHyphenPuncUnicodes[kNumHyphenPuncUnicodes] = {
62
0x2010, 0x2011, 0x2012, 0x2013, 0x2014, 0x2015, // hyphen..horizontal bar
63
0x207b, // superscript minus
64
0x208b, // subscript minus
66
0xfe58, // small em dash
67
0xfe63, // small hyphen-minus
68
0xff0d, // fullwidth hyphen-minus
70
for (int i = 0; i < kNumHyphenPuncUnicodes; ++i) {
71
if (kHyphenPuncUnicodes[i] == ch)
77
bool is_single_quote(const char32 ch) {
78
static const int kNumSingleQuoteUnicodes = 8;
79
static const char32 kSingleQuoteUnicodes[kNumSingleQuoteUnicodes] = {
82
0x2018, // left single quotation mark (English, others)
83
0x2019, // right single quotation mark (Danish, Finnish, Swedish, Norw.)
84
// We may have to introduce a comma set with 0x201a
85
0x201B, // single high-reveresed-9 quotation mark (PropList.txt)
87
0x300C, // left corner bracket (East Asian languages)
88
0xFF07, // fullwidth apostrophe
90
for (int i = 0; i < kNumSingleQuoteUnicodes; ++i) {
91
if (kSingleQuoteUnicodes[i] == ch)
97
bool is_double_quote(const char32 ch) {
98
static const int kNumDoubleQuoteUnicodes = 8;
99
static const char32 kDoubleQuoteUnicodes[kNumDoubleQuoteUnicodes] = {
101
0x201C, // left double quotation mark (English, others)
102
0x201D, // right double quotation mark (Danish, Finnish, Swedish, Norw.)
103
0x201F, // double high-reversed-9 quotation mark (PropList.txt)
104
0x2033, // double prime
105
0x301D, // reversed double prime quotation mark (East Asian langs, horiz.)
106
0x301E, // close double prime (East Asian languages written horizontally)
107
0xFF02, // fullwidth quotation mark
109
for (int i = 0; i < kNumDoubleQuoteUnicodes; ++i) {
110
if (kDoubleQuoteUnicodes[i] == ch)
116
// Returns a normalized copy of the NUL-terminated UTF-8 string str8.
// The input is decoded to UTF-32, every code point is passed through
// NormalizeChar32(), and the concatenated results are re-encoded as UTF-8.
STRING NormalizeUTF8String(const char* str8) {
  GenericVector<char32> str32, out_str32, norm_str;
  UTF8ToUTF32(str8, &str32);
  for (int i = 0; i < str32.length(); ++i) {
    // Start each code point from an empty scratch vector so nothing from the
    // previous iteration can leak into this one's output.
    norm_str.clear();
    NormalizeChar32(str32[i], &norm_str);
    for (int j = 0; j < norm_str.length(); ++j) {
      out_str32.push_back(norm_str[j]);
    }
  }
  STRING out_str8;
  UTF32ToUTF8(out_str32, &out_str8);
  return out_str8;
}
131
// Normalizes the single code point ch and stores the resulting code point(s)
// in *str, replacing its previous contents.
// ch is first NFKC-normalized with ICU; each resulting code point is then
// passed through OCRNormalize(). If the NFKC form contains a space, the
// normalization is treated as a no-op and *str receives just ch.
void NormalizeChar32(char32 ch, GenericVector<char32>* str) {
  IcuErrorCode error_code;
  // The instance returned by getInstance() is shared and is not deleted here.
  const icu::Normalizer2* nfkc = icu::Normalizer2::getInstance(
      NULL, "nfkc", UNORM2_COMPOSE, error_code);
  error_code.assertSuccess();
  error_code.reset();

  icu::UnicodeString uch_str(static_cast<UChar32>(ch));
  icu::UnicodeString norm_str = nfkc->normalize(uch_str, error_code);
  error_code.assertSuccess();

  str->clear();
  // UnicodeString stores UTF-16, so walk it by code point rather than by code
  // unit; otherwise a supplementary-plane character would be emitted as two
  // separate surrogate halves.
  for (int i = 0; i < norm_str.length(); i = norm_str.moveIndex32(i, 1)) {
    const UChar32 norm_ch = norm_str.char32At(i);
    // If any spaces were added by NFKC, pretend normalization is a nop.
    if (norm_ch == ' ') {
      str->clear();
      str->push_back(ch);
      break;
    }
    str->push_back(OCRNormalize(static_cast<char32>(norm_ch)));
  }
}
155
// Apply just the OCR-specific normalizations and return the normalized char.
156
char32 OCRNormalize(char32 ch) {
157
if (is_hyphen_punc(ch))
159
else if (is_single_quote(ch))
161
else if (is_double_quote(ch))
166
// Returns true if ch1 and ch2 map to the same code point under
// OCRNormalize().
bool IsOCREquivalent(char32 ch1, char32 ch2) {
  return OCRNormalize(ch1) == OCRNormalize(ch2);
}
170
bool IsValidCodepoint(const char32 ch) {
171
// In the range [0, 0xD800) or [0xE000, 0x10FFFF]
172
return (static_cast<uinT32>(ch) < 0xD800)
173
|| (ch >= 0xE000 && ch <= 0x10FFFF);
176
bool IsWhitespace(const char32 ch) {
177
ASSERT_HOST_MSG(IsValidCodepoint(ch),
178
"Invalid Unicode codepoint: 0x%x\n", ch);
179
return u_isUWhiteSpace(static_cast<UChar32>(ch));
182
bool IsUTF8Whitespace(const char* text) {
183
return SpanUTF8Whitespace(text) == strlen(text);
186
int SpanUTF8Whitespace(const char* text) {
188
for (UNICHAR::const_iterator it = UNICHAR::begin(text, strlen(text));
189
it != UNICHAR::end(text, strlen(text));
191
if (!IsWhitespace(*it)) break;
192
n_white += it.utf8_len();
197
int SpanUTF8NotWhitespace(const char* text) {
199
for (UNICHAR::const_iterator it = UNICHAR::begin(text, strlen(text));
200
it != UNICHAR::end(text, strlen(text));
202
if (IsWhitespace(*it)) break;
203
n_notwhite += it.utf8_len();
208
bool IsInterchangeValid(const char32 ch) {
209
return IsValidCodepoint(ch) &&
210
!(ch >= 0xFDD0 && ch <= 0xFDEF) && // Noncharacters.
211
!(ch >= 0xFFFE && ch <= 0xFFFF) &&
212
!(ch >= 0x1FFFE && ch <= 0x1FFFF) &&
213
!(ch >= 0x2FFFE && ch <= 0x2FFFF) &&
214
!(ch >= 0x3FFFE && ch <= 0x3FFFF) &&
215
!(ch >= 0x4FFFE && ch <= 0x4FFFF) &&
216
!(ch >= 0x5FFFE && ch <= 0x5FFFF) &&
217
!(ch >= 0x6FFFE && ch <= 0x6FFFF) &&
218
!(ch >= 0x7FFFE && ch <= 0x7FFFF) &&
219
!(ch >= 0x8FFFE && ch <= 0x8FFFF) &&
220
!(ch >= 0x9FFFE && ch <= 0x9FFFF) &&
221
!(ch >= 0xAFFFE && ch <= 0xAFFFF) &&
222
!(ch >= 0xBFFFE && ch <= 0xBFFFF) &&
223
!(ch >= 0xCFFFE && ch <= 0xCFFFF) &&
224
!(ch >= 0xDFFFE && ch <= 0xDFFFF) &&
225
!(ch >= 0xEFFFE && ch <= 0xEFFFF) &&
226
!(ch >= 0xFFFFE && ch <= 0xFFFFF) &&
227
!(ch >= 0x10FFFE && ch <= 0x10FFFF) &&
228
(!u_isISOControl(static_cast<UChar32>(ch)) ||
229
ch == '\n' || ch == '\f' || ch == '\t' || ch == '\r');
232
bool IsInterchangeValid7BitAscii(const char32 ch) {
233
return IsValidCodepoint(ch) &&
235
(!u_isISOControl(static_cast<UChar32>(ch)) ||
236
ch == '\n' || ch == '\f' || ch == '\t' || ch == '\r');
239
// Converts a fullwidth code point to its halfwidth counterpart.
// Code points outside the fullwidth/halfwidth block (U+FF00..U+FFEF) are
// returned unchanged, except U+3000, which is also run through the
// transliterator. U+FF5F and U+FF60 are mapped explicitly to U+2985 and
// U+2986; everything else in the block is converted with ICU's
// "Fullwidth-Halfwidth" transliterator.
char32 FullwidthToHalfwidth(const char32 ch) {
  // Return unchanged if not in the fullwidth-halfwidth Unicode block.
  if (ch < 0xFF00 || ch > 0xFFEF || !IsValidCodepoint(ch)) {
    if (ch != 0x3000) return ch;
  }
  // Special case for fullwidth left and right "white parentheses".
  if (ch == 0xFF5F) return 0x2985;
  if (ch == 0xFF60) return 0x2986;
  // Construct a full-to-half width transliterator.
  IcuErrorCode error_code;
  icu::UnicodeString uch_str(static_cast<UChar32>(ch));
  const icu::Transliterator* fulltohalf = icu::Transliterator::createInstance(
      "Fullwidth-Halfwidth", UTRANS_FORWARD, error_code);
  error_code.assertSuccess();
  error_code.reset();

  fulltohalf->transliterate(uch_str);
  // createInstance() hands ownership to the caller, so release the
  // transliterator here to avoid leaking one object per call.
  delete fulltohalf;
  ASSERT_HOST(uch_str.length() != 0);
  // Every input that reaches this point is in the BMP, so the first UTF-16
  // code unit is the whole result.
  return uch_str[0];
}
261
} // namespace tesseract