1
/********************************************************************
3
* Copyright (c) 1997-2001, International Business Machines Corporation and
4
* others. All Rights Reserved.
5
********************************************************************/
8
#include "unicode/brkiter.h"
9
#include "unicode/unicode.h"
11
//#include "txbdapi.h" // BreakIteratorAPIC
13
//--------------------------------------------------------------------------------------
15
* "Vector" class for holding test tables
16
* (this class is actually a linked list, but we use the name and API of the
17
* java.util.Vector class to keep as much of our test code as possible the same.)
19
class Enumeration { // text enumeration
21
virtual UBool hasMoreElements() = 0;
22
virtual UnicodeString nextElement() = 0;
25
class Vector { // text vector
30
TextLink() : fLink(0), fText() {}
31
TextLink(TextLink* link, UnicodeString text) : fLink(link), fText(text) {}
43
class VectorEnumeration : public Enumeration {
45
VectorEnumeration(Vector* vector) : fVector(vector), fPos(&vector->fBase) {}
47
UBool hasMoreElements() { return fPos->fLink != &fVector->fBase; }
48
UnicodeString nextElement() { fPos = fPos->fLink; return fPos->fText; }
54
Vector() : fBase(), fEnd(&fBase), fSize(0) { fBase.fLink = &fBase; }
57
while (fBase.fLink != &fBase) {
58
TextLink* link = fBase.fLink;
59
fBase.fLink = link->fLink;
64
void addElement(UnicodeString text) { fEnd->fLink = new TextLink(&fBase, text); fEnd = fEnd->fLink; ++fSize; }
65
void insertElementAt(UnicodeString text, int pos) {
66
if(pos >= fSize || pos < 0)
69
TextLink* insert = new TextLink(&fBase, text);
70
insert->fLink=fBase.fLink;
75
TextLink* link = fBase.fLink;
78
TextLink* insert = new TextLink(&fBase, text);
79
insert->fLink =link->fLink;
86
UnicodeString elementAt(int32_t pos) {
88
return UnicodeString();
90
TextLink* link = fBase.fLink;
91
while (pos-- > 0) link = link->fLink;
94
UnicodeString lastElement() { return fEnd == &fBase ? UnicodeString() : fEnd->fText; }
95
int32_t size() { return fSize; }
97
Enumeration* elements() { return new VectorEnumeration(this); }
101
//--------------------------------------------------------------------------------------
103
* IntlTestTextBoundary is medium top level test class for everything in the directory "findword".
106
#include "unicode/utypes.h"
110
#include "unicode/schriter.h"
112
const UChar IntlTestTextBoundary::cannedTestArray[] = {
113
0x0001, 0x0002, 0x0003, 0x0004, 0x0020, 0x0021, '\\', 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0028, 0x0029, 0x002b, 0x002d, 0x0030, 0x0031,
114
0x0032, 0x0033, 0x0034, 0x003c, 0x003d, 0x003e, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x005b, 0x005d, 0x005e, 0x005f, 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x007b,
115
0x007d, 0x007c, 0x002c, 0x00a0, 0x00a2,
116
0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x00ab, 0x00ad, 0x00ae, 0x00af, 0x00b0, 0x00b2, 0x00b3,
117
0x00b4, 0x00b9, 0x00bb, 0x00bc, 0x00bd, 0x02b0, 0x02b1, 0x02b2, 0x02b3, 0x02b4, 0x0300, 0x0301, 0x0302, 0x0303,
118
0x0304, 0x05d0, 0x05d1, 0x05d2, 0x05d3, 0x05d4, 0x0903, 0x093e, 0x093f, 0x0940, 0x0949, 0x0f3a, 0x0f3b, 0x2000,
119
0x2001, 0x2002, 0x200c, 0x200d, 0x200e, 0x200f, 0x2010, 0x2011, 0x2012, 0x2028, 0x2029, 0x202a, 0x203e, 0x203f,
120
0x2040, 0x20dd, 0x20de, 0x20df, 0x20e0, 0x2160, 0x2161, 0x2162, 0x2163, 0x2164, 0x0000
123
UnicodeString* IntlTestTextBoundary::cannedTestChars = 0;
125
//---------------------------------------------
127
//---------------------------------------------
129
IntlTestTextBoundary::IntlTestTextBoundary()
131
UnicodeString temp(cannedTestArray);
132
cannedTestChars = new UnicodeString();
133
*cannedTestChars += (UChar)0x0000;
134
*cannedTestChars += temp;
136
addTestSentenceData();
138
addTestCharacterData();
141
IntlTestTextBoundary::~IntlTestTextBoundary()
143
delete wordSelectionData;
144
delete sentenceSelectionData;
145
delete lineSelectionData;
146
delete characterSelectionData;
147
delete cannedTestChars;
151
* @bug 4097779 4098467 4117554
153
void IntlTestTextBoundary::addTestWordData()
155
wordSelectionData = new Vector();
157
wordSelectionData->addElement("12,34");
159
wordSelectionData->addElement(" ");
160
wordSelectionData->addElement(UCharToUnicodeString((UChar)(0x00A2))); //cent sign
161
wordSelectionData->addElement(UCharToUnicodeString((UChar)(0x00A3))); //pound sign
162
wordSelectionData->addElement(UCharToUnicodeString((UChar)(0x00A4))); //currency sign
163
wordSelectionData->addElement(UCharToUnicodeString((UChar)(0x00A5))); //yen sign
164
wordSelectionData->addElement("alpha-beta-gamma");
165
wordSelectionData->addElement(".");
166
wordSelectionData->addElement(" ");
167
wordSelectionData->addElement("Badges");
168
wordSelectionData->addElement("?");
169
wordSelectionData->addElement(" ");
170
wordSelectionData->addElement("BADGES");
171
wordSelectionData->addElement("!");
172
wordSelectionData->addElement("?");
173
wordSelectionData->addElement("!");
174
wordSelectionData->addElement(" ");
175
wordSelectionData->addElement("We");
176
wordSelectionData->addElement(" ");
177
wordSelectionData->addElement("don't");
178
wordSelectionData->addElement(" ");
179
wordSelectionData->addElement("need");
180
wordSelectionData->addElement(" ");
181
wordSelectionData->addElement("no");
182
wordSelectionData->addElement(" ");
183
wordSelectionData->addElement("STINKING");
184
wordSelectionData->addElement(" ");
185
wordSelectionData->addElement("BADGES");
186
wordSelectionData->addElement("!");
187
wordSelectionData->addElement("!");
188
wordSelectionData->addElement("!");
190
wordSelectionData->addElement("012.566,5");
191
wordSelectionData->addElement(" ");
192
wordSelectionData->addElement("123.3434,900");
193
wordSelectionData->addElement(" ");
194
wordSelectionData->addElement("1000,233,456.000");
195
wordSelectionData->addElement(" ");
196
wordSelectionData->addElement("1,23.322%");
197
wordSelectionData->addElement(" ");
198
wordSelectionData->addElement("123.1222");
200
wordSelectionData->addElement(" ");
201
wordSelectionData->addElement("$123,000.20");
203
wordSelectionData->addElement(" ");
204
wordSelectionData->addElement("179.01%");
206
wordSelectionData->addElement("Hello");
207
wordSelectionData->addElement(",");
208
wordSelectionData->addElement(" ");
209
wordSelectionData->addElement("how");
210
wordSelectionData->addElement(" ");
211
wordSelectionData->addElement("are");
212
wordSelectionData->addElement(" ");
213
wordSelectionData->addElement("you");
214
wordSelectionData->addElement(" ");
215
wordSelectionData->addElement("X");
216
wordSelectionData->addElement(" ");
218
wordSelectionData->addElement("Now");
219
wordSelectionData->addElement("\r");
220
wordSelectionData->addElement("is");
221
wordSelectionData->addElement("\n");
222
wordSelectionData->addElement("the");
223
wordSelectionData->addElement("\r\n");
224
wordSelectionData->addElement("time");
225
wordSelectionData->addElement("\n");
226
wordSelectionData->addElement("\r");
227
wordSelectionData->addElement("for");
228
wordSelectionData->addElement("\r");
229
wordSelectionData->addElement("\r");
230
wordSelectionData->addElement("all");
231
wordSelectionData->addElement(" ");
233
// to test for bug #4097779
234
wordSelectionData->addElement(CharsToUnicodeString("aa\\u0300a"));
235
wordSelectionData->addElement(" ");
237
// to test for bug #4098467
238
// What follows is a string of Korean characters (I found it in the Yellow Pages
239
// ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed
240
// it correctly), first as precomposed syllables, and then as conjoining jamo.
241
// Both sequences should be semantically identical and break the same way.
242
// precomposed syllables...
243
wordSelectionData->addElement(CharsToUnicodeString("\\uc0c1\\ud56d"));
244
wordSelectionData->addElement(" ");
245
wordSelectionData->addElement(CharsToUnicodeString("\\ud55c\\uc778"));
246
wordSelectionData->addElement(" ");
247
wordSelectionData->addElement(CharsToUnicodeString("\\uc5f0\\ud569"));
248
wordSelectionData->addElement(" ");
249
wordSelectionData->addElement(CharsToUnicodeString("\\uc7a5\\ub85c\\uad50\\ud68c"));
250
wordSelectionData->addElement(" ");
251
// conjoining jamo...
252
wordSelectionData->addElement(CharsToUnicodeString("\\u1109\\u1161\\u11bc\\u1112\\u1161\\u11bc"));
253
wordSelectionData->addElement(" ");
254
wordSelectionData->addElement(CharsToUnicodeString("\\u1112\\u1161\\u11ab\\u110b\\u1175\\u11ab"));
255
wordSelectionData->addElement(" ");
256
wordSelectionData->addElement(CharsToUnicodeString("\\u110b\\u1167\\u11ab\\u1112\\u1161\\u11b8"));
257
wordSelectionData->addElement(" ");
258
wordSelectionData->addElement(CharsToUnicodeString("\\u110c\\u1161\\u11bc\\u1105\\u1169\\u1100\\u116d\\u1112\\u116c"));
259
wordSelectionData->addElement(" ");
261
// this is a test for bug #4117554: the ideographic iteration mark (U+3005) should
262
// count as a Kanji character for the purposes of word breaking
263
wordSelectionData->addElement("abc");
264
wordSelectionData->addElement(CharsToUnicodeString("\\u4e01\\u4e02\\u3005\\u4e03\\u4e03"));
265
wordSelectionData->addElement("abc");
270
// Unicode separator code points appended to test strings below:
// kParagraphSeparator (U+2029) terminates entries in the sentence test data,
// kLineSeparator (U+2028) terminates an entry in the line test data.
const UChar kParagraphSeparator = 0x2029;
271
const UChar kLineSeparator = 0x2028;
274
* @bug 4111338 4117554 4113835
276
void IntlTestTextBoundary::addTestSentenceData()
278
sentenceSelectionData = new Vector();
279
sentenceSelectionData->addElement("This is a simple sample sentence. ");
280
sentenceSelectionData->addElement("(This is it.) ");
281
sentenceSelectionData->addElement("This is a simple sample sentence. ");
282
sentenceSelectionData->addElement("\"This isn\'t it.\" ");
283
sentenceSelectionData->addElement("Hi! ");
284
sentenceSelectionData->addElement("This is a simple sample sentence. ");
285
sentenceSelectionData->addElement("It does not have to make any sense as you can see. ");
286
sentenceSelectionData->addElement("Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ");
287
sentenceSelectionData->addElement("Che la dritta via aveo smarrita. ");
288
sentenceSelectionData->addElement("He said, that I said, that you said!! ");
290
sentenceSelectionData->addElement("Don't rock the boat." + UCharToUnicodeString(kParagraphSeparator));
292
sentenceSelectionData->addElement("Because I am the daddy, that is why. ");
293
sentenceSelectionData->addElement("Not on my time (el timo.)! ");
295
sentenceSelectionData->addElement("So what!!" + UCharToUnicodeString(kParagraphSeparator));
297
sentenceSelectionData->addElement("\"But now,\" he said, \"I know!\" ");
298
sentenceSelectionData->addElement("Harris thumbed down several, including \"Away We Go\" (which became the huge success Oklahoma!). ");
299
sentenceSelectionData->addElement("One species, B. anthracis, is highly virulent.\n");
300
sentenceSelectionData->addElement("Wolf said about Sounder:\"Beautifully thought-out and directed.\" ");
301
sentenceSelectionData->addElement("Have you ever said, \"This is where\tI shall live\"? ");
302
sentenceSelectionData->addElement("He answered, \"You may not!\" ");
303
sentenceSelectionData->addElement("Another popular saying is: \"How do you do?\". ");
304
sentenceSelectionData->addElement("Yet another popular saying is: \'I\'m fine thanks.\' ");
305
sentenceSelectionData->addElement("What is the proper use of the abbreviation pp.? ");
306
sentenceSelectionData->addElement("Yes, I am definatelly 12\" tall!!");
308
// test for bug #4113835: \n and \r count as spaces, not as paragraph breaks
309
sentenceSelectionData->addElement(CharsToUnicodeString("Now\ris\nthe\r\ntime\n\rfor\r\rall\\u2029"));
311
// test for bug #4111338: Don't break sentences at the boundary between CJK
313
sentenceSelectionData->addElement(CharsToUnicodeString("\\u5487\\u67ff\\ue591\\u5017\\u61b3\\u60a1\\u9510\\u8165:\"JAVA\\u821c")
314
+ CharsToUnicodeString("\\u8165\\u7fc8\\u51ce\\u306d,\\u2494\\u56d8\\u4ec0\\u60b1\\u8560\\u51ba")
315
+ CharsToUnicodeString("\\u611d\\u57b6\\u2510\\u5d46\".\\u2029"));
316
sentenceSelectionData->addElement(CharsToUnicodeString("\\u5487\\u67ff\\ue591\\u5017\\u61b3\\u60a1\\u9510\\u8165\\u9de8")
317
+ CharsToUnicodeString("\\u97e4JAVA\\u821c\\u8165\\u7fc8\\u51ce\\u306d\\ue30b\\u2494\\u56d8\\u4ec0")
318
+ CharsToUnicodeString("\\u60b1\\u8560\\u51ba\\u611d\\u57b6\\u2510\\u5d46\\u97e5\\u7751\\u2029"));
319
sentenceSelectionData->addElement(CharsToUnicodeString("\\u5487\\u67ff\\ue591\\u5017\\u61b3\\u60a1\\u9510\\u8165\\u9de8\\u97e4")
320
+ CharsToUnicodeString("\\u6470\\u8790JAVA\\u821c\\u8165\\u7fc8\\u51ce\\u306d\\ue30b\\u2494\\u56d8")
321
+ CharsToUnicodeString("\\u4ec0\\u60b1\\u8560\\u51ba\\u611d\\u57b6\\u2510\\u5d46\\u97e5\\u7751\\u2029"));
322
sentenceSelectionData->addElement(CharsToUnicodeString("He said, \"I can go there.\"\\u2029"));
324
// test for bug #4117554: Treat fullwidth variants of .!? the same as their
325
// normal counterparts
326
sentenceSelectionData->addElement(CharsToUnicodeString("I know I'm right\\uff0e "));
327
sentenceSelectionData->addElement(CharsToUnicodeString("Right\\uff1f "));
328
sentenceSelectionData->addElement(CharsToUnicodeString("Right\\uff01 "));
330
// test for bug #4117554: Don't break sentences at boundary between CJK and digits
331
sentenceSelectionData->addElement(CharsToUnicodeString("\\u5487\\u67ff\\ue591\\u5017\\u61b3\\u60a1\\u9510\\u8165\\u9de8")
332
+ CharsToUnicodeString("\\u97e48888\\u821c\\u8165\\u7fc8\\u51ce\\u306d\\ue30b\\u2494\\u56d8\\u4ec0")
333
+ CharsToUnicodeString("\\u60b1\\u8560\\u51ba\\u611d\\u57b6\\u2510\\u5d46\\u97e5\\u7751\\u2029"));
335
// test for bug #4117554: Break sentence between a sentence terminator and
336
// opening punctuation
337
sentenceSelectionData->addElement("no?");
338
sentenceSelectionData->addElement("(yes)" + CharsToUnicodeString("\\u2029"));
340
// test for bug #4158381: Don't break sentence after period if it isn't
341
// followed by a space
342
sentenceSelectionData->addElement("Test <code>Flags.Flag</code> class. ");
343
sentenceSelectionData->addElement("Another test." + CharsToUnicodeString("\\u2029"));
345
// test for bug #4158381: No breaks when there are no terminators around
346
// The quotation marks around "lightweight" are spelled as the HTML entity
// &quot; (the test text is an HTML fragment); a bare '"' here would terminate
// the C++ string literal early and break compilation.
sentenceSelectionData->addElement("<P>Provides a set of &quot;lightweight&quot; (all-java<FONT SIZE=\"-2\"><SUP>TM</SUP></FONT> language) components that, to the maximum degree possible, work the same on all platforms. ");
347
sentenceSelectionData->addElement("Another test." + CharsToUnicodeString("\\u2029"));
349
// test for bug #4143071: Make sure sentences that end with digits
351
sentenceSelectionData->addElement("Today is the 27th of May, 1998. ");
352
sentenceSelectionData->addElement("Tomorrow with be 28 May 1998. ");
353
sentenceSelectionData->addElement("The day after will be the 30th."
354
+ CharsToUnicodeString("\\u2029"));
356
// test for bug #4152416: Make sure sentences ending with a capital
357
// letter are treated correctly
358
sentenceSelectionData->addElement("The type of all primitive <code>boolean</code> values accessed in the target VM. ");
359
sentenceSelectionData->addElement("Calls to xxx will return an implementor of this interface." + CharsToUnicodeString("\\u2029"));
361
// test for bug #4152117: Make sure sentence breaking is handling
362
// punctuation correctly [COULD NOT REPRODUCE THIS BUG, BUT TEST IS
363
// HERE TO MAKE SURE IT DOESN'T CROP UP]
364
sentenceSelectionData->addElement("Constructs a randomly generated BigInteger, uniformly distributed over the range <tt>0</tt> to <tt>(2<sup>numBits</sup> - 1)</tt>, inclusive. ");
365
sentenceSelectionData->addElement("The uniformity of the distribution assumes that a fair source of random bits is provided in <tt>rnd</tt>. ");
366
sentenceSelectionData->addElement("Note that this constructor always constructs a non-negative BigInteger." + CharsToUnicodeString("\\u2029"));
371
* @bug 4068133 4086052 4035266 4097920 4098467 4117554
373
void IntlTestTextBoundary::addTestLineData()
375
lineSelectionData = new Vector();
376
lineSelectionData->addElement("Multi-");
377
lineSelectionData->addElement("Level ");
378
lineSelectionData->addElement("example ");
379
lineSelectionData->addElement("of ");
380
lineSelectionData->addElement("a ");
381
lineSelectionData->addElement("semi-");
382
lineSelectionData->addElement("idiotic ");
383
lineSelectionData->addElement("non-");
384
lineSelectionData->addElement("sensical ");
385
lineSelectionData->addElement("(non-");
386
lineSelectionData->addElement("important) ");
387
lineSelectionData->addElement("sentence. ");
389
lineSelectionData->addElement("Hi ");
390
lineSelectionData->addElement("Hello ");
391
lineSelectionData->addElement("How\n");
392
lineSelectionData->addElement("are\r");
393
lineSelectionData->addElement("you" + UCharToUnicodeString(kLineSeparator));
394
lineSelectionData->addElement("fine.\t");
395
lineSelectionData->addElement("good. ");
397
lineSelectionData->addElement("Now\r");
398
lineSelectionData->addElement("is\n");
399
lineSelectionData->addElement("the\r\n");
400
lineSelectionData->addElement("time\n");
401
lineSelectionData->addElement("\r");
402
lineSelectionData->addElement("for\r");
403
lineSelectionData->addElement("\r");
404
lineSelectionData->addElement("all");
406
// to test for bug #4068133
407
lineSelectionData->addElement(CharsToUnicodeString("\\u96f6"));
408
lineSelectionData->addElement(CharsToUnicodeString("\\u4e00\\u3002"));
409
lineSelectionData->addElement(CharsToUnicodeString("\\u4e8c\\u3001"));
410
lineSelectionData->addElement(CharsToUnicodeString("\\u4e09\\u3002\\u3001"));
411
lineSelectionData->addElement(CharsToUnicodeString("\\u56db\\u3001\\u3002\\u3001"));
412
lineSelectionData->addElement(CharsToUnicodeString("\\u4e94,"));
413
lineSelectionData->addElement(CharsToUnicodeString("\\u516d."));
414
lineSelectionData->addElement(CharsToUnicodeString("\\u4e03.\\u3001,\\u3002"));
415
lineSelectionData->addElement(CharsToUnicodeString("\\u516b"));
417
// to test for bug #4086052
418
lineSelectionData->addElement(CharsToUnicodeString("foo\\u00a0bar "));
419
// lineSelectionData->addElement("foo\\ufeffbar");
421
// to test for bug #4097920
422
lineSelectionData->addElement("dog,");
423
lineSelectionData->addElement("cat,");
424
lineSelectionData->addElement("mouse ");
425
lineSelectionData->addElement("(one)");
426
lineSelectionData->addElement("(two)\n");
428
// to test for bug #4035266
429
lineSelectionData->addElement("The ");
430
lineSelectionData->addElement("balance ");
431
lineSelectionData->addElement("is ");
432
lineSelectionData->addElement("$-23,456.78, ");
433
lineSelectionData->addElement("not ");
434
lineSelectionData->addElement("-$32,456.78!\n");
436
// to test for bug #4098467
437
// What follows is a string of Korean characters (I found it in the Yellow Pages
438
// ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed
439
// it correctly), first as precomposed syllables, and then as conjoining jamo.
440
// Both sequences should be semantically identical and break the same way.
441
// precomposed syllables...
442
lineSelectionData->addElement(CharsToUnicodeString("\\uc0c1\\ud56d "));
443
lineSelectionData->addElement(CharsToUnicodeString("\\ud55c\\uc778 "));
444
lineSelectionData->addElement(CharsToUnicodeString("\\uc5f0\\ud569 "));
445
lineSelectionData->addElement(CharsToUnicodeString("\\uc7a5\\ub85c\\uad50\\ud68c "));
446
// conjoining jamo...
447
lineSelectionData->addElement(CharsToUnicodeString("\\u1109\\u1161\\u11bc\\u1112\\u1161\\u11bc "));
448
lineSelectionData->addElement(CharsToUnicodeString("\\u1112\\u1161\\u11ab\\u110b\\u1175\\u11ab "));
449
lineSelectionData->addElement(CharsToUnicodeString("\\u110b\\u1167\\u11ab\\u1112\\u1161\\u11b8 "));
450
lineSelectionData->addElement(CharsToUnicodeString("\\u110c\\u1161\\u11bc\\u1105\\u1169\\u1100\\u116d\\u1112\\u116c"));
452
// to test for bug #4117554: Fullwidth .!? should be treated as postJwrd
453
lineSelectionData->addElement(CharsToUnicodeString("\\u4e01\\uff0e"));
454
lineSelectionData->addElement(CharsToUnicodeString("\\u4e02\\uff01"));
455
lineSelectionData->addElement(CharsToUnicodeString("\\u4e03\\uff1f"));
460
// "S" followed by U+0300 (combining grave accent). The accent must be turned
// into a UnicodeString before concatenation: adding a UChar directly to the
// string literal would be pointer arithmetic on a const char*, not an append.
const UnicodeString graveS = "S" + UCharToUnicodeString(0x0300);
461
const UnicodeString acuteBelowI = "i" + UCharToUnicodeString(0x0317);
462
const UnicodeString acuteE = "e" + UCharToUnicodeString(0x0301);
463
const UnicodeString circumflexA = "a" + UCharToUnicodeString(0x0302);
464
const UnicodeString tildeE = "e" + UCharToUnicodeString(0x0303);
470
void IntlTestTextBoundary::addTestCharacterData()
472
characterSelectionData = new Vector();
473
characterSelectionData->addElement("S" + UCharToUnicodeString(0x0300)); //graveS
474
characterSelectionData->addElement("i" + UCharToUnicodeString(0x0301)); // acuteBelowI
475
characterSelectionData->addElement("m");
476
characterSelectionData->addElement("p");
477
characterSelectionData->addElement("l");
478
characterSelectionData->addElement("e" + UCharToUnicodeString(0x0301)); // acuteE
479
characterSelectionData->addElement(" ");
480
characterSelectionData->addElement("s");
481
characterSelectionData->addElement("a" + UCharToUnicodeString(0x0302)); // circumflexA
482
characterSelectionData->addElement("m");
483
characterSelectionData->addElement("p");
484
characterSelectionData->addElement("l");
485
characterSelectionData->addElement("e" + UCharToUnicodeString(0x0303)); // tildeE
486
characterSelectionData->addElement(".");
487
characterSelectionData->addElement("w");
488
characterSelectionData->addElement("a" + UCharToUnicodeString(0x0302)); // circumflexA
489
characterSelectionData->addElement("w");
490
characterSelectionData->addElement("a");
491
characterSelectionData->addElement("f");
492
characterSelectionData->addElement("q");
493
characterSelectionData->addElement("\n");
494
characterSelectionData->addElement("\r");
495
characterSelectionData->addElement("\r\n");
496
characterSelectionData->addElement("\n");
498
// to test for bug #4098467
499
// What follows is a string of Korean characters (I found it in the Yellow Pages
500
// ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed
501
// it correctly), first as precomposed syllables, and then as conjoining jamo.
502
// Both sequences should be semantically identical and break the same way.
503
// precomposed syllables...
504
characterSelectionData->addElement(CharsToUnicodeString("\\uc0c1"));
505
characterSelectionData->addElement(CharsToUnicodeString("\\ud56d"));
506
characterSelectionData->addElement(" ");
507
characterSelectionData->addElement(CharsToUnicodeString("\\ud55c"));
508
characterSelectionData->addElement(CharsToUnicodeString("\\uc778"));
509
characterSelectionData->addElement(" ");
510
characterSelectionData->addElement(CharsToUnicodeString("\\uc5f0"));
511
characterSelectionData->addElement(CharsToUnicodeString("\\ud569"));
512
characterSelectionData->addElement(" ");
513
characterSelectionData->addElement(CharsToUnicodeString("\\uc7a5"));
514
characterSelectionData->addElement(CharsToUnicodeString("\\ub85c"));
515
characterSelectionData->addElement(CharsToUnicodeString("\\uad50"));
516
characterSelectionData->addElement(CharsToUnicodeString("\\ud68c"));
517
characterSelectionData->addElement(" ");
518
// conjoining jamo...
519
characterSelectionData->addElement(CharsToUnicodeString("\\u1109\\u1161\\u11bc"));
520
characterSelectionData->addElement(CharsToUnicodeString("\\u1112\\u1161\\u11bc"));
521
characterSelectionData->addElement(" ");
522
characterSelectionData->addElement(CharsToUnicodeString("\\u1112\\u1161\\u11ab"));
523
characterSelectionData->addElement(CharsToUnicodeString("\\u110b\\u1175\\u11ab"));
524
characterSelectionData->addElement(" ");
525
characterSelectionData->addElement(CharsToUnicodeString("\\u110b\\u1167\\u11ab"));
526
characterSelectionData->addElement(CharsToUnicodeString("\\u1112\\u1161\\u11b8"));
527
characterSelectionData->addElement(" ");
528
characterSelectionData->addElement(CharsToUnicodeString("\\u110c\\u1161\\u11bc"));
529
characterSelectionData->addElement(CharsToUnicodeString("\\u1105\\u1169"));
530
characterSelectionData->addElement(CharsToUnicodeString("\\u1100\\u116d"));
531
characterSelectionData->addElement(CharsToUnicodeString("\\u1112\\u116c"));
535
UnicodeString IntlTestTextBoundary::createTestData(Enumeration* e)
537
UnicodeString result = "";
539
while (e->hasMoreElements()) {
540
result += e->nextElement();
545
//---------------------------------------------
546
// SentenceBreak tests
547
//---------------------------------------------
549
void IntlTestTextBoundary::TestSentenceIteration()
551
UErrorCode status = U_ZERO_ERROR;
552
BreakIterator* e = BreakIterator::createSentenceInstance(Locale::getDefault(), status);
553
if (U_FAILURE(status))
555
errln("Failed to create the BreakIterator for default locale in TestSentenceIteration.\n");
558
generalIteratorTest(*e, sentenceSelectionData);
562
void IntlTestTextBoundary::TestSentenceInvariants()
564
UErrorCode status = U_ZERO_ERROR;
565
BreakIterator *e = BreakIterator::createSentenceInstance(Locale::getDefault(), status);
566
if (U_FAILURE(status))
568
// Report the failure under the exact name of this test (TestSentenceInvariants).
errln("Failed to create the BreakIterator for default locale in TestSentenceInvariants.\n");
571
UnicodeString s = *cannedTestChars + CharsToUnicodeString(".,\\u3001\\u3002\\u3041\\u3042\\u3043\\ufeff");
572
doOtherInvariantTest(*e, s);
575
//---------------------------------------------
577
//---------------------------------------------
578
void IntlTestTextBoundary::TestWordIteration()
580
UErrorCode status = U_ZERO_ERROR;
581
BreakIterator* e = BreakIterator::createWordInstance(Locale::getDefault(), status);
582
if (U_FAILURE(status))
584
errln("Failed to create the BreakIterator for default locale in TestWordIteration.\n");
587
generalIteratorTest(*e, wordSelectionData);
590
void IntlTestTextBoundary::TestWordInvariants()
592
UErrorCode status = U_ZERO_ERROR;
593
BreakIterator *e = BreakIterator::createWordInstance(Locale::getDefault(), status);
594
if (U_FAILURE(status))
596
errln("Failed to create the BreakIterator for default locale in TestWordInvariants.\n");
599
UnicodeString s = *cannedTestChars + CharsToUnicodeString("\',.\\u3041\\u3042\\u3043\\u309b\\u309c\\u30a1\\u30a2\\u30a3\\u4e00\\u4e01\\u4e02");
600
doBreakInvariantTest(*e, s);
601
s = *cannedTestChars + CharsToUnicodeString("\',.\\u3041\\u3042\\u3043\\u309b\\u309c\\u30a1\\u30a2\\u30a3\\u4e00\\u4e01\\u4e02");
602
doOtherInvariantTest(*e, s);
605
//---------------------------------------------
606
// CharacterBreak tests
607
//---------------------------------------------
608
void IntlTestTextBoundary::TestCharacterIteration()
610
UErrorCode status = U_ZERO_ERROR;
611
BreakIterator* e = BreakIterator::createCharacterInstance(Locale::getDefault(), status);
612
if (U_FAILURE(status))
614
errln("Failed to create the BreakIterator for default locale in TestCharacterIteration.\n");
617
// generalIteratorTest(*e, testCharacterText, characterSelectionData);
618
generalIteratorTest(*e, characterSelectionData);
621
void IntlTestTextBoundary::TestCharacterInvariants()
623
UErrorCode status = U_ZERO_ERROR;
624
BreakIterator *e = BreakIterator::createCharacterInstance(Locale::getDefault(), status);
625
if (U_FAILURE(status))
627
errln("Failed to create the BreakIterator for default locale in TestCharacterInvariants.\n");
630
UnicodeString s = *cannedTestChars + CharsToUnicodeString("\\u1100\\u1101\\u1102\\u1160\\u1161\\u1162\\u11a8\\u11a9\\u11aa");
631
doBreakInvariantTest(*e, s);
632
s = *cannedTestChars + CharsToUnicodeString("\\u1100\\u1101\\u1102\\u1160\\u1161\\u1162\\u11a8\\u11a9\\u11aa");
633
doOtherInvariantTest(*e, s);
636
//---------------------------------------------
638
//---------------------------------------------
639
void IntlTestTextBoundary::TestLineIteration()
641
UErrorCode status = U_ZERO_ERROR;
642
BreakIterator* e = BreakIterator::createLineInstance(Locale::getDefault(), status);
643
if (U_FAILURE(status))
645
errln("Failed to create the BreakIterator for default locale in TestLineIteration.\n");
648
generalIteratorTest(*e, lineSelectionData);
651
void IntlTestTextBoundary::TestLineInvariants()
653
UErrorCode status = U_ZERO_ERROR;
654
BreakIterator *e = BreakIterator::createLineInstance(Locale::US, status);
655
if (U_FAILURE(status))
657
errln("Failed to create the BreakIterator for default locale in TestLineInvariants.\n");
660
UnicodeString s = CharsToUnicodeString(".,;:\\u3001\\u3002\\u3041\\u3042\\u3043\\u3044\\u3045\\u30a3\\u4e00\\u4e01\\u4e02");
661
UnicodeString testChars = *cannedTestChars + s;
662
doBreakInvariantTest(*e, testChars);
663
doOtherInvariantTest(*e, testChars);
665
int32_t errCount = 0, testCharsLen, noBreakLen, dashesLen;
668
// in addition to the other invariants, a line-break iterator should make sure that:
669
// it doesn't break around the non-breaking characters
670
UnicodeString noBreak = CharsToUnicodeString("\\u00a0\\u2007\\u2011\\ufeff");
671
UnicodeString work("aaa");
672
testCharsLen = testChars.length();
673
noBreakLen = noBreak.length();
674
for (i = 0; i < testCharsLen; i++) {
675
UChar c = testChars[i];
676
if (c == '\r' || c == '\n' || c == 0x2029 || c == 0x2028 || c == 0x0003)
679
for (j = 0; j < noBreakLen; j++) {
680
work[1] = noBreak[j];
681
for (k = 0; k < testCharsLen; k++) {
682
work[2] = testChars[k];
684
for (int l = e->first(); l != BreakIterator::DONE; l = e->next())
685
if (l == 1 || l == 2) {
686
errln("Got break between U+" + UCharToUnicodeString(work[l - 1]) +
687
" and U+" + UCharToUnicodeString(work[l]));
696
// it does break after hyphens (unless they're followed by a digit, a non-spacing mark,
697
// a currency symbol, a non-breaking space, or a line or paragraph separator)
698
UnicodeString dashes = CharsToUnicodeString("-\\u00ad\\u2010\\u2012\\u2013\\u2014");
699
dashesLen = dashes.length();
700
for (i = 0; i < testCharsLen; i++) {
701
work[0] = testChars[i];
702
for (j = 0; j < dashesLen; j++) {
704
for (k = 0; k < testCharsLen; k++) {
705
UChar c = testChars[k];
706
int8_t type = Unicode::getType(c);
707
if (type == Unicode::DECIMAL_DIGIT_NUMBER ||
708
type == Unicode::OTHER_NUMBER ||
709
type == Unicode::NON_SPACING_MARK ||
710
type == Unicode::ENCLOSING_MARK ||
711
type == Unicode::CURRENCY_SYMBOL ||
712
type == Unicode::SPACE_SEPARATOR ||
713
type == Unicode::DASH_PUNCTUATION ||
714
type == Unicode::CONTROL ||
715
type == Unicode::FORMAT ||
716
c == '\n' || c == '\r' || c == 0x2028 || c == 0x2029 ||
717
c == 0x0003 || c == 0x00a0 || c == 0x2007 || c == 0x2011 ||
725
for (int l = e->first(); l != BreakIterator::DONE; l = e->next()) {
732
errln("Didn't get break between U+" + UCharToUnicodeString(work[1]) +
733
" and U+" + UCharToUnicodeString(work[2]));
744
// Checks Thai line breaking. Builds the expected sequence of line fragments,
// creates a line BreakIterator for Locale("th"), and hands both to
// generalIteratorTest(). An error is logged if the iterator cannot be created.
void IntlTestTextBoundary::TestThaiLineBreak() {
745
Vector* thaiLineSelection = new Vector();
746
UErrorCode status = U_ZERO_ERROR;
748
// \u0e2f-- the Thai paiyannoi character-- isn't a letter. It's a symbol that
749
// represents elided letters at the end of a long word. It should be bound to
750
// the end of the word and not treated as an independent punctuation mark.
753
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e2a\\u0e16\\u0e32\\u0e19\\u0e35\\u0e2f"));
754
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e08\\u0e30"));
755
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e23\\u0e30\\u0e14\\u0e21"));
756
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e40\\u0e08\\u0e49\\u0e32"));
757
// thaiLineSelection->addElement(CharsToUnicodeString("\\u0e2b\\u0e19\\u0e49\\u0e32"));
758
// thaiLineSelection->addElement(CharsToUnicodeString("\\u0e17\\u0e35\\u0e48"));
759
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e2b\\u0e19\\u0e49\\u0e32\\u0e17\\u0e35\\u0e48"));
760
// The two commented-out lines above are (I think) the preferred result; the
// single combined fragment is what our current dictionary is giving us.
761
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e2d\\u0e2d\\u0e01"));
762
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e21\\u0e32"));
763
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e40\\u0e23\\u0e48\\u0e07"));
764
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e23\\u0e30\\u0e1a\\u0e32\\u0e22"));
765
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e2d\\u0e22\\u0e48\\u0e32\\u0e07"));
766
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e40\\u0e15\\u0e47\\u0e21"));
768
// the one time where the paiyannoi occurs somewhere other than at the end
769
// of a word is in the Thai abbreviation for "etc.", which both begins and
770
// ends with a paiyannoi
771
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e2f\\u0e25\\u0e2f"));
772
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e17\\u0e35\\u0e48"));
773
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e19\\u0e31\\u0e49\\u0e19"));
775
// Line-break iterator for the Thai locale; creation failure is reported via status.
BreakIterator* e = BreakIterator::createLineInstance(
776
Locale("th"), status);
777
if (U_FAILURE(status))
779
errln("Failed to create the BreakIterator for Thai locale in TestThaiLineBreak.\n");
783
// Run the shared forward/backward/following/preceding checks against the expected fragments.
generalIteratorTest(*e, thaiLineSelection);
785
delete thaiLineSelection;
788
// Checks line breaking of Thai text mixed with other content: Arabic digits,
// Latin-script words, Thai digits, and punctuation/symbols. The expected
// fragments are collected in a Vector and verified with generalIteratorTest()
// using a line BreakIterator created for Locale("th").
void IntlTestTextBoundary::TestMixedThaiLineBreak()
790
UErrorCode status = U_ZERO_ERROR;
791
Vector* thaiLineSelection= new Vector();
793
// Arabic numerals should always be separated from surrounding Thai text
795
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e04\\u0e48\\u0e32"));
796
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e40\\u0e07\\u0e34\\u0e19"));
797
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e1a\\u0e32\\u0e17"));
798
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e41\\u0e15\\u0e30"));
799
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e23\\u0e30\\u0e14\\u0e31\\u0e1a"));
800
thaiLineSelection->addElement("39");
801
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e1a\\u0e32\\u0e17 "));
803
// words in non-Thai scripts should always be separated from surrounding Thai text
804
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e17\\u0e14"));
805
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e2a\\u0e2d\\u0e1a"));
806
thaiLineSelection->addElement("Java");
807
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e1a\\u0e19"));
808
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e40\\u0e04\\u0e23\\u0e37\\u0e48\\u0e2d\\u0e07"));
809
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e44\\u0e2d\\u0e1a\\u0e35\\u0e40\\u0e2d\\u0e47\\u0e21 "));
811
// Thai numerals should always be separated from the text surrounding them
812
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e04\\u0e48\\u0e32"));
813
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e40\\u0e07\\u0e34\\u0e19"));
814
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e1a\\u0e32\\u0e17"));
815
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e41\\u0e15\\u0e30"));
816
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e23\\u0e30\\u0e14\\u0e31\\u0e1a"));
817
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e53\\u0e59"));
818
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e1a\\u0e32\\u0e17 "));
820
// Thai text should interact correctly with punctuation and symbols
821
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e44\\u0e2d\\u0e1a\\u0e35\\u0e40\\u0e2d\\u0e47\\u0e21"));
822
// thaiLineSelection->addElement(CharsToUnicodeString("(\\u0e1b\\u0e23\\u0e30\\u0e40\\u0e17\\u0e28"));
823
// thaiLineSelection->addElement(CharsToUnicodeString("\\u0e44\\u0e17\\u0e22)"));
824
thaiLineSelection->addElement(CharsToUnicodeString("(\\u0e1b\\u0e23\\u0e30\\u0e40\\u0e17\\u0e28\\u0e44\\u0e17\\u0e22)"));
825
// I believe the commented-out reading above to be the correct one, but this is what passes with our current dictionary
826
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e08\\u0e33\\u0e01\\u0e31\\u0e14"));
827
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e40\\u0e1b\\u0e34\\u0e14"));
828
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e15\\u0e31\\u0e27\""));
830
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e2e\\u0e32\\u0e23\\u0e4c\\u0e14\\u0e14\\u0e34\\u0e2a\\u0e01\\u0e4c\""));
831
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e23\\u0e38\\u0e48\\u0e19"));
832
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e43\\u0e2b\\u0e21\\u0e48"));
833
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e40\\u0e14\\u0e37\\u0e2d\\u0e19\\u0e21\\u0e34."));
834
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e22."));
835
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e19\\u0e35\\u0e49"));
836
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e23\\u0e32\\u0e04\\u0e32"));
837
thaiLineSelection->addElement("$200");
838
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e40\\u0e17\\u0e48\\u0e32"));
839
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e19\\u0e31\\u0e49\\u0e19 "));
840
thaiLineSelection->addElement(CharsToUnicodeString("(\"\\u0e2e\\u0e32\\u0e23\\u0e4c\\u0e14\\u0e14\\u0e34\\u0e2a\\u0e01\\u0e4c\")."));
842
// Line-break iterator for the Thai locale; creation failure is reported via status.
BreakIterator* e = BreakIterator::createLineInstance(Locale("th"), status);
843
if (U_FAILURE(status))
845
errln("Failed to create the BreakIterator for Thai locale in TestMixedThaiLineBreak.\n");
850
// Run the shared iterator checks against the expected fragments.
generalIteratorTest(*e, thaiLineSelection);
852
delete thaiLineSelection;
856
// Checks that the Thai maiyamok character (\u0e46) stays attached to the word
// before it when line breaking. The expected fragments are verified with
// generalIteratorTest() using a line BreakIterator created for Locale("th").
void IntlTestTextBoundary::TestMaiyamok()
858
Vector* thaiLineSelection= new Vector();
859
UErrorCode status = U_ZERO_ERROR;
860
// the Thai maiyamok character is a shorthand symbol that means "repeat the previous
861
// word". Instead of appearing as a word unto itself, however, it's kept together
862
// with the word before it
863
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e44\\u0e1b\\u0e46"));
864
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e21\\u0e32\\u0e46"));
865
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e23\\u0e30\\u0e2b\\u0e27\\u0e48\\u0e32\\u0e07"));
866
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e01\\u0e23\\u0e38\\u0e07\\u0e40\\u0e17\\u0e1e"));
867
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e41\\u0e25\\u0e30"));
868
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e40\\u0e03\\u0e35\\u0e22\\u0e07"));
869
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e43\\u0e2b\\u0e21\\u0e48"));
871
// Line-break iterator for the Thai locale; creation failure is reported via status.
BreakIterator* e = BreakIterator::createLineInstance(
872
Locale("th"), status);
874
if (U_FAILURE(status))
876
errln("Failed to create the BreakIterator for Thai locale in TestMaiyamok.\n");
879
// Run the shared iterator checks against the expected fragments.
generalIteratorTest(*e, thaiLineSelection);
881
delete thaiLineSelection;
884
// Checks Thai word breaking. The expected word fragments are verified with
// generalIteratorTest() using a word BreakIterator created for Locale("th").
// The trailing numeric comment on each addElement() line is the running total
// length of the text so far, i.e. the offset at which that fragment ends.
// Where the dictionary-based result differs from the linguistically correct
// one, the correct fragments are kept as commented-out lines for reference.
void IntlTestTextBoundary::TestThaiWordBreak() {
885
Vector* thaiWordSelection = new Vector();
886
UErrorCode status = U_ZERO_ERROR;
888
thaiWordSelection->addElement(CharsToUnicodeString("\\u0E1A\\u0E17")); //2
889
thaiWordSelection->addElement(CharsToUnicodeString("\\u0E17\\u0E35\\u0E48")); //5
890
thaiWordSelection->addElement(CharsToUnicodeString("\\u0E51")); //6
891
thaiWordSelection->addElement(CharsToUnicodeString("\\u0E1E\\u0E32\\u0E22\\u0E38")); //10
892
thaiWordSelection->addElement(CharsToUnicodeString("\\u0E44\\u0E0B\\u0E42\\u0E04\\u0E25\\u0E19")); //16
893
thaiWordSelection->addElement(CharsToUnicodeString("\\u000D\\u000A")); //18
895
// This is the correct result
896
//thaiWordSelection->addElement(CharsToUnicodeString("\\u0E42\\u0E14\\u0E42\\u0E23\\u0E18\\u0E35")); //24
897
//thaiWordSelection->addElement(CharsToUnicodeString("\\u0E2D\\u0E32\\u0E28\\u0E31\\u0E22")); //29
899
// and this is what the dictionary does...
900
thaiWordSelection->addElement(CharsToUnicodeString("\\u0E42\\u0E14")); // 20
901
thaiWordSelection->addElement(CharsToUnicodeString("\\u0E42\\u0E23\\u0E18\\u0E35\\u0E2D\\u0E32\\u0E28\\u0E31\\u0E22")); //29
903
thaiWordSelection->addElement(CharsToUnicodeString("\\u0E2D\\u0E22\\u0E39\\u0E48")); //33
905
// This is the correct result
906
//thaiWordSelection->addElement(CharsToUnicodeString("\\u0E17\\u0E48\\u0E32\\u0E21")); //37
907
//thaiWordSelection->addElement(CharsToUnicodeString("\\u0E01\\u0E25\\u0E32\\u0E07")); //41
909
// and this is what the dictionary does
910
thaiWordSelection->addElement(CharsToUnicodeString("\\u0E17\\u0E48\\u0E32\\u0E21\\u0E01\\u0E25\\u0E32\\u0E07")); //41
912
thaiWordSelection->addElement(CharsToUnicodeString("\\u0E17\\u0E38\\u0E48\\u0E07")); //45
913
thaiWordSelection->addElement(CharsToUnicodeString("\\u0E43\\u0E2B\\u0E0D\\u0E48")); //49
914
thaiWordSelection->addElement(CharsToUnicodeString("\\u0E43\\u0E19")); //51
916
// This is the correct result
917
//thaiWordSelection->addElement(CharsToUnicodeString("\\u0E41\\u0E04\\u0E19\\u0E0B\\u0E31\\u0E2A")); //57
918
//thaiWordSelection->addElement(CharsToUnicodeString("\\u0E01\\u0E31\\u0E1A")); //60
920
// and this is what the dictionary does
921
thaiWordSelection->addElement(CharsToUnicodeString("\\u0E41\\u0E04\\u0E19")); // 54
922
thaiWordSelection->addElement(CharsToUnicodeString("\\u0E0B\\u0E31\\u0E2A\\u0E01\\u0E31\\u0E1A")); //60
924
thaiWordSelection->addElement(CharsToUnicodeString("\\u0E25\\u0E38\\u0E07")); //63
926
// This is the correct result
927
//thaiWordSelection->addElement(CharsToUnicodeString("\\u0E40\\u0E2E\\u0E19\\u0E23\\u0E35")); //68
928
//thaiWordSelection->addElement(CharsToUnicodeString("\\u0E0A\\u0E32\\u0E27")); //71
929
//thaiWordSelection->addElement(CharsToUnicodeString("\\u0E44\\u0E23\\u0E48")); //74
930
//thaiWordSelection->addElement(CharsToUnicodeString("\\u0E41\\u0E25\\u0E30")); //77
932
// and this is what the dictionary does
933
thaiWordSelection->addElement(CharsToUnicodeString("\\u0E40\\u0E2E")); // 65
934
thaiWordSelection->addElement(CharsToUnicodeString("\\u0E19\\u0E23\\u0E35\\u0E0A\\u0E32\\u0E27\\u0E44\\u0E23\\u0E48\\u0E41\\u0E25\\u0E30")); //77
936
// Word-break iterator for the Thai locale; creation failure is reported via status.
BreakIterator* e = BreakIterator::createWordInstance(
937
Locale("th"), status);
938
if (U_FAILURE(status))
940
errln("Failed to create the BreakIterator for Thai locale in TestThaiWordBreak.\n");
944
// Run the shared iterator checks against the expected fragments.
generalIteratorTest(*e, thaiWordSelection);
946
delete thaiWordSelection;
950
* Test Japanese Line Break
953
// Checks Japanese line-break behavior around characters that must stay with
// the following text (precedingChars: opening brackets, currency signs, opening
// quotes) and characters that must stay with the preceding text
// (followingChars: closing brackets, punctuation, small kana, marks, ...).
// The middle character of the three-character testString is replaced by each
// candidate in turn and the line iterator for Locale::JAPAN is walked from
// first(). Per the error messages, the iterator is expected to break before
// (and not after) a "preceding" character, and after (and not before) a
// "following" character.
void IntlTestTextBoundary::TestJapaneseLineBreak()
955
UErrorCode status = U_ZERO_ERROR;
956
// Template text: two ideographs with a placeholder 'x' at index 1.
UnicodeString testString = CharsToUnicodeString("\\u4e00x\\u4e8c");
957
UnicodeString precedingChars = CharsToUnicodeString("([{\\u00ab$\\u00a5\\u00a3\\u00a4\\u2018\\u201a\\u201c\\u201e\\u201b\\u201f");
958
UnicodeString followingChars = CharsToUnicodeString(")]}\\u00bb!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7\\u30fc:;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u30fe\\u2019\\u201d\\u00b0\\u2032\\u2033\\u2034\\u2030\\u2031\\u2103\\u2109\\u00a2\\u0300\\u0301\\u0302");
959
BreakIterator *iter = BreakIterator::createLineInstance(Locale::JAPAN, status);
962
if (U_FAILURE(status))
964
errln("Failed to create the BreakIterator for Japanese locale in TestJapaneseLineBreak.\n");
968
// Pass 1: characters that bind to the text that follows them.
for (i = 0; i < precedingChars.length(); i++) {
969
testString[1] = precedingChars[i];
970
iter->setText(testString);
971
int32_t j = iter->first();
973
errln("ja line break failure: failed to start at 0");
976
errln("ja line break failure: failed to stop before '" + UCharToUnicodeString(precedingChars[i])
977
+ "' (" + ((int)(precedingChars[i])) + ")");
980
errln("ja line break failure: failed to skip position after '" + UCharToUnicodeString(precedingChars[i])
981
+ "' (" + ((int)(precedingChars[i])) + ")");
984
// Pass 2: characters that bind to the text that precedes them.
for (i = 0; i < followingChars.length(); i++) {
985
testString[1] = followingChars[i];
986
iter->setText(testString);
987
int j = iter->first();
989
errln("ja line break failure: failed to start at 0");
992
errln("ja line break failure: failed to skip position before '" + UCharToUnicodeString(followingChars[i])
993
+ "' (" + ((int)(followingChars[i])) + ")");
996
errln("ja line break failure: failed to stop after '" + UCharToUnicodeString(followingChars[i])
997
+ "' (" + ((int)(followingChars[i])) + ")");
1002
//---------------------------------------------
1004
//---------------------------------------------/
1006
// Runs the general iterator checks with a line BreakIterator for the default
// locale on empty input. An error is logged if the iterator cannot be created.
// NOTE(review): `x` is not declared in the lines shown here -- presumably a
// Vector holding the empty `text`; confirm against the full file.
void IntlTestTextBoundary::TestEmptyString()
1008
UnicodeString text = "";
1010
UErrorCode status = U_ZERO_ERROR;
1012
BreakIterator* bi = BreakIterator::createLineInstance(Locale::getDefault(), status);
1013
if (U_FAILURE(status))
1015
errln("Failed to create the BreakIterator for default locale in TestEmptyString.\n");
1018
generalIteratorTest(*bi, &x);
1023
// Calls BreakIterator::getAvailableLocales() and logs the name of every locale
// it returns. An empty list is reported as an error (per the error message).
void IntlTestTextBoundary::TestGetAvailableLocales()
1025
int32_t locCount = 0;
1026
// locCount is filled in by the call with the number of entries in locList.
const Locale* locList = BreakIterator::getAvailableLocales(locCount);
1029
errln("getAvailableLocales() returned an empty list!");
1030
// Just make sure that it's returning good memory.
1031
for (int32_t i = 0; i < locCount; ++i) {
1032
logln(locList[i].getName());
1036
// Tests both overloads of BreakIterator::getDisplayName(): the one that
// displays in the default locale and the one that takes an explicit display
// locale.
1037
void IntlTestTextBoundary::TestGetDisplayName()
1039
UnicodeString result;
1041
// Default-locale overload: the expected text is only checked when the
// default locale is US, since the output is localized.
BreakIterator::getDisplayName(Locale::getUS(), result);
1042
if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
1043
errln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
1046
// Explicit display locale (US): the result must be English regardless of the default locale.
BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
1047
if (result != "French (France)")
1048
errln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
1052
* Test End Behaviour
1055
// Checks word-boundary behavior at the end of the string "boo." using a word
// BreakIterator for the default locale: boundaries are expected at 0, at 3
// (before the period), and at 4 (end of string).
void IntlTestTextBoundary::TestEndBehaviour()
1057
UErrorCode status = U_ZERO_ERROR;
1058
UnicodeString testString("boo.");
1059
BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
1060
if (U_FAILURE(status))
1062
errln("Failed to create the BreakIterator for default locale in TestEndBehaviour.\n");
1065
wb->setText(testString);
1067
if (wb->first() != 0)
1068
errln("Didn't get break at beginning of string.");
1069
if (wb->next() != 3)
1070
errln("Didn't get break before period in \"boo.\"");
1071
// Fails only if the current position is not 4 AND the next boundary is not 4
// either; next() is called (and advances the iterator) only when current() != 4.
if (wb->current() != 4 && wb->next() != 4)
1072
errln("Didn't get break at end of string.");
1078
// Regression test for bug 4153072: isBoundary() is called with offsets that
// lie before the start of the sub-range covered by the adopted
// StringCharacterIterator. Reporting TRUE for any offset below `begin` is an
// error.
// NOTE(review): `begin` and `dummy` are declared on lines not shown here.
void IntlTestTextBoundary::TestBug4153072() {
1079
UErrorCode status = U_ZERO_ERROR;
1080
BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
1081
if (U_FAILURE(status))
1083
errln("Failed to create the BreakIterator for default locale in TestBug4153072\n");
1086
UnicodeString str("...Hello, World!...");
1088
// The iterated range stops three characters short of the end of str.
int32_t end = str.length() - 3;
1091
// Iterator over [begin, end) of str, positioned at begin; ownership passes to iter.
StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
1092
iter->adoptText(textIterator);
1093
// Probe every offset from -1 up to and including begin.
for (int index = -1; index < begin + 1; ++index) {
1094
dummy = iter->isBoundary(index);
1095
if (index < begin && dummy == TRUE) {
1096
errln((UnicodeString)"Didn't handle preceeding correctly with offset = " + index +
1097
" and begin index = " + begin);
1106
// Checks that following(), preceding() and isBoundary() agree with the
// boundaries produced by next() on the text "aaa bbb ccc", using a word
// BreakIterator for the default locale. Per the error messages, the expected
// relations are f == p3, p == p2, p1 + 1 == p2 and p3 + 1 == p4.
void IntlTestTextBoundary::TestPreceding()
1108
UErrorCode status = U_ZERO_ERROR;
1109
UnicodeString words3("aaa bbb ccc");
1110
BreakIterator* e = BreakIterator::createWordInstance(Locale::getDefault(), status);
1111
if (U_FAILURE(status))
1113
errln("Failed to create the BreakIterator for default locale in TestPreceeding.\n");
1117
e->setText( words3 );
1119
// Record four successive boundaries returned by next().
int32_t p1 = e->next();
1120
int32_t p2 = e->next();
1121
int32_t p3 = e->next();
1122
int32_t p4 = e->next();
1124
// Query from one position past p2: following() and preceding() are compared
// against the recorded boundaries below.
int32_t f = e->following(p2+1);
1125
int32_t p = e->preceding(p2+1);
1127
errln("IntlTestTextBoundary::TestPreceding: f!=p3");
1129
errln("IntlTestTextBoundary::TestPreceding: p!=p2");
1132
errln("IntlTestTextBoundary::TestPreceding: p1+1!=p2");
1135
errln("IntlTestTextBoundary::TestPreceding: p3+1!=p4");
1137
// p2 and p3 must be boundaries; the position right after p2 must not be.
if (!e->isBoundary(p2) || e->isBoundary(p2+1) || !e->isBoundary(p3))
1139
errln("IntlTestTextBoundary::TestPreceding: isBoundary err");
1143
//---------------------------------------------
1145
//---------------------------------------------
1147
// Test dispatcher: maps `index` to a test case.
//   index - zero-based number of the test case to select
//   exec  - when true, the selected test is run; when false, only `name` is set
//   name  - receives the selected test's name, or "" for an unknown index
//           (the empty name is what terminates the caller's enumeration loop)
//   par   - unused
void IntlTestTextBoundary::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
1149
if (exec) logln("TestSuite TextBoundary: ");
1151
case 0: name = "TestSentenceIteration"; if(exec) TestSentenceIteration(); break;
1152
case 1: name = "TestWordIteration"; if(exec) TestWordIteration(); break;
1153
case 2: name = "TestLineIteration"; if(exec) TestLineIteration(); break;
1154
case 3: name = "TestCharacterIteration"; if(exec) TestCharacterIteration(); break;
1155
case 4: name = "TestSentenceInvariants"; if(exec) TestSentenceInvariants();break;
1156
case 5: name = "TestWordInvariants"; if(exec) TestWordInvariants();break;
1157
case 6: name = "TestLineInvariants"; if(exec) TestLineInvariants();break;
1158
case 7: name = "TestCharacterInvariants"; if(exec) TestCharacterInvariants();break;
1160
case 8: name = "TestEmptyString"; if (exec) TestEmptyString(); break;
1161
case 9: name = "TestGetAvailableLocales"; if (exec) TestGetAvailableLocales(); break;
1162
case 10: name = "TestGetDisplayName"; if (exec) TestGetDisplayName(); break;
1163
case 11: name = "TestPreceding"; if (exec) TestPreceding(); break;
1164
case 12: name = "TestBug4153072"; if (exec) TestBug4153072(); break;
1165
case 13: name = "TestEndBehaviour"; if (exec) TestEndBehaviour(); break;
1167
case 14: name = "TestJapaneseLineBreak"; if (exec) TestJapaneseLineBreak(); break;
1168
case 15: name = "TestThaiLineBreak"; if(exec) TestThaiLineBreak(); break;
1169
case 16: name = "TestMixedThaiLineBreak"; if(exec) TestMixedThaiLineBreak(); break;
1170
case 17: name = "TestMaiyamok"; if(exec) TestMaiyamok(); break;
1171
case 18: name = "TestThaiWordBreak"; if(exec) TestThaiWordBreak(); break;
1174
default: name = ""; break; //needed to end loop
1178
//---------------------------------------------
1179
// Test implementation routines
1180
//---------------------------------------------
1182
// general test Implementation subroutines
1183
// Shared driver used by the individual tests.
//   bi             - the break iterator under test
//   expectedResult - the fragments the iterator is expected to produce, in order
// Steps:
//   1. builds the test text from the expected fragments (createTestData);
//   2. iterates forward and backward and compares the two fragment lists;
//   3. if that added no errors, compares the expected list with an actual one;
//   4. derives a boundaries[] table from the fragment lengths, bracketed by
//      BreakIterator::DONE at both ends, and uses it to check following(),
//      preceding() and isBoundary();
//   5. runs doMultipleSelectionTest().
void IntlTestTextBoundary::generalIteratorTest(BreakIterator& bi, Vector* expectedResult)
1185
Enumeration *elems = expectedResult->elements();
1186
UnicodeString text = createTestData(elems);
1189
logln("comparing forward and backward...");
1192
Vector *nextResults = testFirstAndNext(bi, text);
1193
if (nextResults == NULL) {
1194
errln("Couldn't get nextResults!");
1198
Vector *previousResults = testLastAndPrevious(bi, text);
1200
if (previousResults == NULL) {
1201
errln("Couldn't get previousResults!");
1205
// Snapshot the error count so the expected-vs-actual comparison only runs
// when the forward/backward comparison was clean.
int errs = getErrors();
1206
UnicodeString str1="forward iteration";
1207
UnicodeString str2="backward iteration";
1208
compareFragmentLists(str1, str2, nextResults,
1210
if (getErrors() == errs) {
1211
logln("comparing expected and actual...");
1212
str1="expected result";
1213
str2="actual result";
1214
compareFragmentLists(str1, str2, expectedResult,
1218
// size() fragments yield size()+1 boundary offsets; two more slots hold the
// DONE sentinels at the front and back.
int32_t *boundaries = new int32_t[expectedResult->size() + 3];
1219
boundaries[0] = BreakIterator::DONE;
1221
// Each boundary is the previous one plus the length of the next fragment.
for (int i = 0; i < expectedResult->size(); i++)
1222
boundaries[i + 2] = boundaries[i + 1] + ((UnicodeString)expectedResult->elementAt(i)).
1225
int len = expectedResult->size() + 3 -1;
1226
boundaries[len] = BreakIterator::DONE;
1228
testFollowing(bi, text, boundaries);
1229
testPreceding(bi, text, boundaries);
1230
testIsBoundary(bi, text, boundaries);
1232
doMultipleSelectionTest(bi, text);
1235
delete previousResults;
1236
delete []boundaries;
1239
// Walks `bi` forward with first()/next() and collects the text between each
// pair of successive boundaries into a newly allocated Vector.
// Errors are logged when first() is not 0 (per the message), when next() fails
// to advance, or when DONE is returned before the end of `text` is reached.
// NOTE(review): `lastP` is declared on a line not shown here; it holds the
// boundary preceding `p`.
Vector* IntlTestTextBoundary::testFirstAndNext(BreakIterator& bi, UnicodeString& text)
1241
int32_t p = bi.first();
1243
Vector *result = new Vector();
1244
UnicodeString selection;
1247
errln((UnicodeString)"first() returned " + p + (UnicodeString)" instead of 0");
1248
while (p != BreakIterator::DONE) {
1250
if (p != BreakIterator::DONE) {
1252
errln((UnicodeString)"next() failed to move forward: next() on position "
1253
+ lastP + (UnicodeString)" yielded " + p);
1254
errln("Are the *.brk files corrupt?");
1258
// Fragment between the previous boundary and the current one.
text.extractBetween(lastP, p, selection);
1259
result->addElement(selection);
1262
// After the loop the last real boundary must be the end of the text.
if (lastP != text.length())
1263
errln((UnicodeString)"next() returned DONE prematurely: offset was "
1264
+ lastP + (UnicodeString)" instead of " + text.length());
1271
// Walks `bi` backward with last()/previous() and collects the text between
// each pair of successive boundaries into a newly allocated Vector. Every
// fragment is inserted at position 0, so the list ends up in forward order.
// Errors are logged when last() is not text.length(), when previous() fails to
// move backward, or when DONE is returned before offset 0 is reached.
// NOTE(review): `lastP` is declared on a line not shown here; it holds the
// boundary visited just before `p`.
Vector* IntlTestTextBoundary::testLastAndPrevious(BreakIterator& bi, UnicodeString& text)
1273
int32_t p = bi.last();
1275
Vector *result = new Vector();
1276
UnicodeString selection;
1278
if (p != text.length())
1279
errln((UnicodeString)"last() returned " + p + (UnicodeString)" instead of " + text.length());
1280
while (p != BreakIterator::DONE) {
1282
if (p != BreakIterator::DONE) {
1284
errln((UnicodeString)"previous() failed to move backward: previous() on position "
1285
+ lastP + (UnicodeString)" yielded " + p);
1286
// Going backward, p is the start and lastP the end of the fragment.
text.extractBetween(p, lastP, selection);
1287
result->insertElementAt(selection, 0);
1291
errln((UnicodeString)"previous() returned DONE prematurely: offset was "
1292
+ lastP + (UnicodeString)" instead of 0");
1299
// Compares two fragment lists element by element.
//   f1Name, f2Name - labels used for the two lists in log and error output
//   f1, f2         - the fragment lists to compare
// Matching fragments are logged. On a mismatch, the code scans ahead in both
// lists, accumulating fragment lengths (tempT1/tempT2) until the two running
// totals agree again or a list runs out, logs the differing fragments from
// each list, and reports a discrepancy error.
// NOTE(review): p1/p2 (list positions), t1/t2 (running offsets) and s1/s2 are
// declared on lines not shown here.
void IntlTestTextBoundary::compareFragmentLists(UnicodeString& f1Name, UnicodeString& f2Name, Vector* f1, Vector* f2)
1307
UnicodeString target;
1309
while (p1 < f1->size() && p2 < f2->size()) {
1310
s1 = (UnicodeString)f1->elementAt(p1);
1311
s2 = (UnicodeString)f2->elementAt(p2);
1315
if (s1.compare(s2) == 0) {
1316
logln(prettify((UnicodeString)" >" + s1 + (UnicodeString)"<", target));
1321
// Mismatch: work on copies so the original positions can be used to print
// the differing ranges afterwards.
int32_t tempT1 = t1;
1322
int32_t tempT2 = t2;
1323
int32_t tempP1 = p1;
1324
int32_t tempP2 = p2;
1326
// Advance whichever side is behind until both running totals line up.
while (tempT1 != tempT2 && tempP1 < f1->size() && tempP2 < f2->size()) {
1327
while (tempT1 < tempT2 && tempP1 < f1->size()) {
1328
tempT1 += ((UnicodeString)f1->elementAt(tempP1)).length();
1331
while (tempT2 < tempT1 && tempP2 < f2->size()) {
1332
tempT2 += ((UnicodeString)f2->elementAt(tempP2)).length();
1336
logln((UnicodeString)"*** " + f1Name + (UnicodeString)" has:");
1337
while (p1 <= tempP1 && p1 < f1->size()) {
1338
s1 = (UnicodeString)f1->elementAt(p1);
1340
logln(prettify((UnicodeString)" *** >" + s1 + (UnicodeString)"<", target));
1343
logln("***** " + f2Name + " has:");
1344
while (p2 <= tempP2 && p2 < f2->size()) {
1345
s2 = (UnicodeString)f2->elementAt(p2);
1347
logln(prettify(" ***** >" + s2 + "<", target));
1350
errln((UnicodeString)"Discrepancy between " + f1Name + (UnicodeString)" and " + f2Name);
1355
// Calls bi.following(i) for every offset i in [0, text.length()] and checks
// the result against the expected entry boundaries[p].
//   boundaries - expected boundary offsets, with BreakIterator::DONE sentinels
//                (see generalIteratorTest, which builds the table)
// NOTE(review): `p` is declared on a line not shown here; presumably it is
// advanced when i reaches boundaries[p] -- confirm against the full file.
void IntlTestTextBoundary::testFollowing(BreakIterator& bi, UnicodeString& text, int32_t *boundaries)
1357
logln("testFollowing():");
1359
int32_t textLen = text.length();
1360
for (int i = 0; i <= textLen; i++) {
1361
if (i == boundaries[p])
1364
int32_t b = bi.following(i);
1365
logln((UnicodeString)"bi.following(" + i + ") -> " + b);
1366
if (b != boundaries[p])
1367
errln((UnicodeString)"Wrong result from following() for " + i + (UnicodeString)": expected " + boundaries[p]
1368
+ (UnicodeString)", got " + b);
1372
// Calls bi.preceding(i) for every offset i in [0, text.length()] and checks
// the result against the expected entry boundaries[p].
//   boundaries - expected boundary offsets, with BreakIterator::DONE sentinels
//                (see generalIteratorTest, which builds the table)
// NOTE(review): `p` is declared on a line not shown here; presumably it is
// advanced after the check when i reaches boundaries[p + 1] -- confirm.
void IntlTestTextBoundary::testPreceding(BreakIterator& bi, UnicodeString& text, int32_t *boundaries) {
1373
logln("testPreceding():");
1375
int32_t textLen = text.length();
1376
for (int i = 0; i <= textLen; i++) {
1377
int32_t b = bi.preceding(i);
1378
logln((UnicodeString)"bi.preceding(" + i + ") -> " + b);
1379
if (b != boundaries[p])
1380
errln((UnicodeString)"Wrong result from preceding() for " + i + (UnicodeString)": expected " + boundaries[p]
1381
+ (UnicodeString)", got " + b);
1383
if (i == boundaries[p + 1])
1388
// Calls bi.isBoundary(i) for every offset i in [0, text.length()) and checks
// that it reports true exactly at the offsets listed in `boundaries`.
// Note that the end-of-text offset itself is not probed (loop uses i < textLen).
// NOTE(review): `p` and `isB` are declared on lines not shown here.
void IntlTestTextBoundary::testIsBoundary(BreakIterator& bi, UnicodeString& text, int32_t *boundaries) {
1389
logln("testIsBoundary():");
1392
int32_t textLen = text.length();
1393
for (int i = 0; i < textLen; i++) {
1394
isB = bi.isBoundary(i);
1395
logln((UnicodeString)"bi.isBoundary(" + i + ") -> " + isB);
1397
// Offsets that match the current expected boundary must be reported as
// boundaries; all others must not (per the two error messages).
if (i == boundaries[p]) {
1399
errln((UnicodeString)"Wrong result from isBoundary() for " + i + (UnicodeString)": expected true, got false");
1404
errln((UnicodeString)"Wrong result from isBoundary() for " + i + (UnicodeString)": expected false, got true");
1409
// Cross-checks single-step iteration against multi-step next(n), and exercises
// clone() and the equality operators.
//   iterator - the break iterator under test; its text is set to testText
//   testText - the text to iterate over
// A clone of `iterator` is repeatedly reset and moved `count` steps with
// next(count); the offset it lands on must equal the offset reached by
// stepping `iterator` one boundary at a time, first forward from first() and
// then backward from last().
// NOTE(review): `testOffset` and `count` are declared and updated on lines not
// shown here; presumably count is negative in the backward pass -- confirm.
void IntlTestTextBoundary::doMultipleSelectionTest(BreakIterator& iterator,
1410
UnicodeString& testText)
1412
iterator.setText(testText);
1414
BreakIterator* testIterator = iterator.clone();
1415
int32_t offset = iterator.first();
1419
logln("doMultipleSelectionTest text of length: %d", testText.length());
1421
// A fresh clone must compare equal to the original.
if (*testIterator != iterator)
1422
errln("clone() or operator!= failed: two clones compared unequal");
1425
// Forward pass: restart the clone and jump `count` boundaries in one call.
testOffset = testIterator->first();
1426
testOffset = testIterator->next(count);
1427
if (offset != testOffset)
1428
errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
1430
if (offset != BreakIterator::DONE) {
1432
offset = iterator.next();
1434
// Once the original has advanced, the two iterators are at different
// positions and must no longer compare equal.
if (offset != BreakIterator::DONE && *testIterator == iterator)
1435
errln("operator== failed: Two unequal iterators compared equal.");
1437
} while (offset != BreakIterator::DONE);
1439
// now do it backwards...
1440
offset = iterator.last();
1444
testOffset = testIterator->last();
1445
testOffset = testIterator->next(count);
1446
if (offset != testOffset)
1447
// NOTE(review): this pass steps `iterator` with previous(), but the message
// below still says "next()"; the wording looks copied from the forward pass.
errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
1449
if (offset != BreakIterator::DONE) {
1451
offset = iterator.previous();
1453
} while (offset != BreakIterator::DONE);
1454
delete testIterator;
1457
// Invariant check: a break must always occur after CR (unless followed by LF),
// LF, PS (U+2029) and LS (U+2028).
//   tb        - the break iterator under test
//   testChars - characters to place before and after each break character
// For every break character and every (before, after) pair drawn from
// testChars, a three-character string is built in `work` and `tb` is walked
// over it; a missing break between work[1] and work[2] is reported as an error.
// NOTE(review): `i` and `j` are declared on lines not shown here, as are the
// statements that fill work[2] and inspect the boundaries.
void IntlTestTextBoundary::doBreakInvariantTest(BreakIterator& tb, UnicodeString& testChars)
1459
UnicodeString work("aaa");
1460
int32_t errCount = 0, testCharsLen = testChars.length(), breaksLen;
1462
// a break should always occur after CR (unless followed by LF), LF, PS, and LS
1463
UnicodeString breaks = CharsToUnicodeString("\r\n\\u2029\\u2028");
1466
breaksLen = breaks.length();
1467
for (i = 0; i < breaksLen; i++) {
1468
// Middle character: the break character being tested.
work[1] = breaks[i];
1469
for (j = 0; j < testCharsLen; j++) {
1470
work[0] = testChars[j];
1471
for (int k = 0; k < testCharsLen; k++) {
1472
UChar c = testChars[k];
1474
// if a cr is followed by lf, ps, ls or etx, don't do the check (that's
1475
// not supposed to work)
1476
if (work[1] == '\r' && (c == '\n' || c == 0x2029
1477
|| c == 0x2028 || c == 0x0003))
1482
UBool seen2 = FALSE;
1483
for (int l = tb.first(); l != BreakIterator::DONE; l = tb.next()) {
1490
errln("No break between U+" + UCharToUnicodeString(work[1])
1491
+ " and U+" + UCharToUnicodeString(work[2]));
1501
// Invariant checks for positions where a break must never occur:
//   1. between CR and LF;
//   2. before a non-spacing or enclosing mark, unless the preceding character
//      is CR, LF, PS, or LS.
//   tb        - the break iterator under test
//   testChars - characters substituted into the `work` string around the
//               position being tested
// NOTE(review): `i`, `j` and `type` are declared on lines not shown here, as
// are the statements that set the iterator text and test each boundary.
void IntlTestTextBoundary::doOtherInvariantTest(BreakIterator& tb, UnicodeString& testChars)
1503
// Template: one character, CR, LF, one character; indices 0 and 3 are varied.
UnicodeString work("a\r\na");
1504
int32_t errCount = 0, testCharsLen = testChars.length();
1508
// a break should never occur between CR and LF
1509
for (i = 0; i < testCharsLen; i++) {
1510
work[0] = testChars[i];
1511
for (j = 0; j < testCharsLen; j++) {
1512
work[3] = testChars[j];
1514
for (int32_t k = tb.first(); k != BreakIterator::DONE; k = tb.next())
1516
errln("Break between CR and LF in string U+" + UCharToUnicodeString(work[0]) +
1517
", U+d U+a U+" + UCharToUnicodeString(work[3]));
1525
// a break should never occur before a non-spacing mark, unless the preceding
1526
// character is CR, LF, PS, or LS
1529
for (i = 0; i < testCharsLen; i++) {
1530
UChar c = testChars[i];
1531
// Line/paragraph terminators (and ETX) are exempt from this rule.
if (c == '\n' || c == '\r' || c == 0x2029 || c == 0x2028 || c == 0x0003)
1534
for (j = 0; j < testCharsLen; j++) {
1536
type = Unicode::getType(c);
1537
// Only non-spacing and enclosing marks are subject to the check.
if ((type != Unicode::NON_SPACING_MARK) &&
1538
(type != Unicode::ENCLOSING_MARK))
1542
for (int k = tb.first(); k != BreakIterator::DONE; k = tb.next())
1544
errln("Break between U+" + UCharToUnicodeString(work[1])
1545
+ " and U+" + UCharToUnicodeString(work[2]));
1554
void IntlTestTextBoundary::sample(BreakIterator& tb,
1555
UnicodeString& text,
1556
UnicodeString& title)
1558
UnicodeString substring;
1559
UBool verboseWas = verbose;
1561
logln("-------------------------"+title+" length = "+text.length());
1563
int32_t start = tb.first();
1565
for (end = tb.next(); end != BreakIterator::DONE; end = tb.next()) {
1566
text.extractBetween(start, end, substring);
1567
logln(UnicodeString("[")+start+","+end+"] \""+substring+"\"");
1570
verbose = verboseWas;