2
******************************************************************************
3
* Copyright (C) 1998-2001, International Business Machines Corporation and *
4
* others. All Rights Reserved. *
5
******************************************************************************
12
#include "unicode/utypes.h"
13
#include "unicode/unicode.h"
14
#include "unicode/uchar.h"
15
#include "unicode/uchriter.h"
16
#include "unicode/brkiter.h"
17
#include "unicode/locid.h"
18
#include "unicode/unistr.h"
21
* This program takes a Unicode text file containing Thai text with
22
* spaces inserted where the word breaks are. It computes a copy of
23
* the text without spaces and uses a word instance of a Thai BreakIterator
24
* to compute the word breaks. The program reports any differences in the
27
* NOTE: by its very nature, Thai word breaking is not exact, so it is
28
* expected that this program will always report some differences.
32
* This class is a break iterator that counts words and spaces.
34
class SpaceBreakIterator
38
// text - pointer to an array of UChars to iterate over
39
// count - the number of UChars in text
40
SpaceBreakIterator(const UChar *text, int32_t count);
43
~SpaceBreakIterator();
45
// return next break position
48
// return current word count
49
int32_t getWordCount();
51
// return current space count
52
int32_t getSpaceCount();
55
// No arg constructor: private so clients can't call it.
58
// The underlying BreakIterator
59
BreakIterator *fBreakIter;
61
// address of the UChar array
64
// number of UChars in fText
70
// current space count
73
// true when fBreakIter has returned DONE
78
* This is the main class. It compares word breaks and reports the differences.
80
class ThaiWordbreakTest
83
// The main constructor:
84
// spaces - pointer to a UChar array for the text with spaces
85
// spaceCount - the number of characters in the spaces array
86
// noSpaces - pointer to a UChar array for the text without spaces
87
// noSpaceCount - the number of characters in the noSpaces array
88
// verbose - report all breaks if true, otherwise just report differences
89
ThaiWordbreakTest(const UChar *spaces, int32_t spaceCount, const UChar *noSpaces, int32_t noSpaceCount, UBool verbose);
92
// returns the number of breaks that are in the spaces array
93
// but aren't found in the noSpaces array
94
int32_t getBreaksNotFound();
96
// returns the number of breaks which are found in the noSpaces
97
// array but aren't in the spaces array
98
int32_t getInvalidBreaks();
100
// returns the number of words found in the spaces array
101
int32_t getWordCount();
103
// reads the input Unicode text file:
104
// fileName - the path name of the file
105
// charCount - set to the number of UChars read from the file
106
// returns - the address of the UChar array containing the characters
107
static const UChar *readFile(char *fileName, int32_t &charCount);
109
// removes spaces from the input UChar array:
110
// spaces - pointer to the input UChar array
111
// count - number of UChars in the spaces array
112
// nonSpaceCount - the number of UChars in the result array
113
// returns - the address of the UChar array with spaces removed
114
static const UChar *crunchSpaces(const UChar *spaces, int32_t count, int32_t &nonSpaceCount);
117
// The no arg constructor - private so clients can't call it
120
// This does the actual comparison:
121
// spaces - the address of the UChar array for the text with spaces
122
// spaceCount - the number of UChars in the spaces array
123
// noSpaces - the address of the UChar array for the text without spaces
124
// noSpaceCount - the number of UChars in the noSpaces array
125
// returns - true if all breaks match, false otherwise
126
UBool compareWordBreaks(const UChar *spaces, int32_t spaceCount,
127
const UChar *noSpaces, int32_t noSpaceCount);
129
// helper method to report a break in the spaces
130
// array that's not found in the noSpaces array
131
void breakNotFound(int32_t br);
133
// helper method to report a break that's found in
134
// the noSpaces array that's not in the spaces array
135
void foundInvalidBreak(int32_t br);
137
// count of breaks in the spaces array that
138
// aren't found in the noSpaces array
139
int32_t fBreaksNotFound;
141
// count of breaks found in the noSpaces array
142
// that aren't in the spaces array
143
int32_t fInvalidBreaks;
145
// number of words found in the spaces array
148
// report all breaks if true, otherwise just report differences
153
* The main constructor: it calls compareWordBreaks and reports any differences
155
ThaiWordbreakTest::ThaiWordbreakTest(const UChar *spaces, int32_t spaceCount,
156
const UChar *noSpaces, int32_t noSpaceCount, UBool verbose)
157
: fBreaksNotFound(0), fInvalidBreaks(0), fWordCount(0), fVerbose(verbose)
159
compareWordBreaks(spaces, spaceCount, noSpaces, noSpaceCount);
163
* The no arg constructor
165
ThaiWordbreakTest::ThaiWordbreakTest()
173
ThaiWordbreakTest::~ThaiWordbreakTest()
179
* returns the number of breaks in the spaces array
180
* that aren't found in the noSpaces array
182
inline int32_t ThaiWordbreakTest::getBreaksNotFound()
184
return fBreaksNotFound;
188
* Returns the number of breaks found in the noSpaces
189
* array that aren't in the spaces array
191
inline int32_t ThaiWordbreakTest::getInvalidBreaks()
193
return fInvalidBreaks;
197
* Returns the number of words found in the spaces array
199
inline int32_t ThaiWordbreakTest::getWordCount()
205
* This method does the actual break comparison and reports the results.
206
* It uses a SpaceBreakIterator to iterate over the text with spaces,
207
* and a word instance of a Thai BreakIterator to iterate over the text
210
UBool ThaiWordbreakTest::compareWordBreaks(const UChar *spaces, int32_t spaceCount,
211
const UChar *noSpaces, int32_t noSpaceCount)
215
UCharCharacterIterator *noSpaceIter = new UCharCharacterIterator(noSpaces, noSpaceCount);
216
UErrorCode status = U_ZERO_ERROR;
218
BreakIterator *breakIter = BreakIterator::createWordInstance(thai, status);
219
breakIter->adoptText(noSpaceIter);
221
SpaceBreakIterator spaceIter(spaces, spaceCount);
223
int32_t nextBreak = 0;
224
int32_t nextSpaceBreak = 0;
225
int32_t iterCount = 0;
228
nextSpaceBreak = spaceIter.next();
229
nextBreak = breakIter->next();
231
if (nextSpaceBreak == BreakIterator::DONE || nextBreak == BreakIterator::DONE) {
232
if (nextBreak != BreakIterator::DONE) {
233
fprintf(stderr, "break iterator didn't end.\n");
234
} else if (nextSpaceBreak != BreakIterator::DONE) {
235
fprintf(stderr, "premature break iterator end.\n");
241
while (nextSpaceBreak != nextBreak &&
242
nextSpaceBreak != BreakIterator::DONE && nextBreak != BreakIterator::DONE) {
243
if (nextSpaceBreak < nextBreak) {
244
breakNotFound(nextSpaceBreak);
246
nextSpaceBreak = spaceIter.next();
247
} else if (nextSpaceBreak > nextBreak) {
248
foundInvalidBreak(nextBreak);
250
nextBreak = breakIter->next();
255
printf("%d %d\n", nextSpaceBreak, nextBreak);
260
fWordCount = spaceIter.getWordCount();
268
* Report a break that's in the text with spaces but
269
* not found in the text without spaces.
271
void ThaiWordbreakTest::breakNotFound(int32_t br)
274
printf("%d ****\n", br);
276
fprintf(stderr, "break not found: %d\n", br);
279
fBreaksNotFound += 1;
283
* Report a break that's found in the text without spaces
284
* that isn't in the text with spaces.
286
void ThaiWordbreakTest::foundInvalidBreak(int32_t br)
289
printf("**** %d\n", br);
291
fprintf(stderr, "found invalid break: %d\n", br);
298
* Read the text from a file. The text must start with a Unicode Byte
299
* Order Mark (BOM), which is skipped; the file's bytes are decoded as UTF-8.
301
const UChar *ThaiWordbreakTest::readFile(char *fileName, int32_t &charCount)
309
f = fopen(fileName, "rb");
312
fprintf(stderr,"Couldn't open %s reason: %s \n", fileName, strerror(errno));
316
fseek(f, 0, SEEK_END);
319
fseek(f, 0, SEEK_SET);
320
bufferChars = new char[fileSize];
322
if(bufferChars == 0) {
323
fprintf(stderr,"Couldn't get memory for reading %s reason: %s \n", fileName, strerror(errno));
328
fread(bufferChars, sizeof(char), fileSize, f);
330
fprintf(stderr,"Couldn't read %s reason: %s \n", fileName, strerror(errno));
332
delete[] bufferChars;
337
UnicodeString myText(bufferChars, fileSize, "UTF-8");
339
delete[] bufferChars;
341
charCount = myText.length();
342
buffer = new UChar[charCount];
344
fprintf(stderr,"Couldn't get memory for reading %s reason: %s \n", fileName, strerror(errno));
348
myText.extract(1, myText.length(), buffer);
349
charCount--; // skip the BOM
350
buffer[charCount] = 0; // NULL terminate for easier reading in the debugger
356
* Remove spaces from the input UChar array.
358
* We check explicitly for a Unicode code value of 0x0020
359
* because Unicode::isSpaceChar returns true for CR, LF, etc.
362
const UChar *ThaiWordbreakTest::crunchSpaces(const UChar *spaces, int32_t count, int32_t &nonSpaceCount)
364
int32_t i, out, spaceCount;
367
for (i = 0; i < count; i += 1) {
368
if (spaces[i] == 0x0020 /*Unicode::isSpaceChar(spaces[i])*/) {
373
nonSpaceCount = count - spaceCount;
374
UChar *noSpaces = new UChar[nonSpaceCount];
377
fprintf(stderr, "Couldn't allocate memory for the space stripped text.\n");
381
for (out = 0, i = 0; i < count; i += 1) {
382
if (spaces[i] != 0x0020 /*! Unicode::isSpaceChar(spaces[i])*/) {
383
noSpaces[out++] = spaces[i];
391
* The main routine. Read the command line arguments, read the text file,
392
* remove the spaces, do the comparison and report the final results
394
int main(int argc, char **argv)
396
char *fileName = "space.txt";
398
UBool verbose = false;
400
if (argc >= 2 && strcmp(argv[1], "-verbose") == 0) {
405
if (arg == argc - 1) {
406
fileName = argv[arg++];
410
fprintf(stderr, "Usage: %s [-verbose] [<file>]\n", argv[0]);
414
int32_t spaceCount, nonSpaceCount;
415
const UChar *spaces, *noSpaces;
417
spaces = ThaiWordbreakTest::readFile(fileName, spaceCount);
423
noSpaces = ThaiWordbreakTest::crunchSpaces(spaces, spaceCount, nonSpaceCount);
429
ThaiWordbreakTest test(spaces, spaceCount, noSpaces, nonSpaceCount, verbose);
431
printf("word count: %d\n", test.getWordCount());
432
printf("breaks not found: %d\n", test.getBreaksNotFound());
433
printf("invalid breaks found: %d\n", test.getInvalidBreaks());
439
* The main constructor. Clear all the counts and construct a default
440
* word instance of a BreakIterator.
442
SpaceBreakIterator::SpaceBreakIterator(const UChar *text, int32_t count)
443
: fBreakIter(0), fText(text), fTextCount(count), fWordCount(0), fSpaceCount(0), fDone(false)
445
UCharCharacterIterator *iter = new UCharCharacterIterator(text, count);
446
UErrorCode status = U_ZERO_ERROR;
449
fBreakIter = BreakIterator::createWordInstance(us, status);
450
fBreakIter->adoptText(iter);
453
SpaceBreakIterator::SpaceBreakIterator()
459
* The destructor. Deletes the underlying BreakIterator.
461
SpaceBreakIterator::~SpaceBreakIterator()
467
* Return the next break, counting words and spaces.
469
int32_t SpaceBreakIterator::next()
472
return BreakIterator::DONE;
475
int32_t nextBreak = fBreakIter->next();
477
if (nextBreak == BreakIterator::DONE) {
479
return BreakIterator::DONE;
482
int32_t result = nextBreak - fSpaceCount;
484
if (nextBreak < fTextCount) {
485
if (fText[nextBreak] == 0x0020 /*Unicode::isSpaceChar(fText[nextBreak])*/) {
486
fSpaceCount += fBreakIter->next() - nextBreak;
496
* Returns the current space count
498
int32_t SpaceBreakIterator::getSpaceCount()
504
* Returns the current word count
506
int32_t SpaceBreakIterator::getWordCount()