~ubuntu-branches/ubuntu/gutsy/icu/gutsy-updates

0x0032, 0x0033, 0x0034, 0x003c, 0x003d, 0x003e, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x005b, 0x005d, 0x005e, 0x005f, 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x007b,

115

0x007d, 0x007c, 0x002c, 0x00a0, 0x00a2,

116

0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x00ab, 0x00ad, 0x00ae, 0x00af, 0x00b0, 0x00b2, 0x00b3,

117

0x00b4, 0x00b9, 0x00bb, 0x00bc, 0x00bd, 0x02b0, 0x02b1, 0x02b2, 0x02b3, 0x02b4, 0x0300, 0x0301, 0x0302, 0x0303,

118

0x0304, 0x05d0, 0x05d1, 0x05d2, 0x05d3, 0x05d4, 0x0903, 0x093e, 0x093f, 0x0940, 0x0949, 0x0f3a, 0x0f3b, 0x2000,

119

0x2001, 0x2002, 0x200c, 0x200d, 0x200e, 0x200f, 0x2010, 0x2011, 0x2012, 0x2028, 0x2029, 0x202a, 0x203e, 0x203f,

120

0x2040, 0x20dd, 0x20de, 0x20df, 0x20e0, 0x2160, 0x2161, 0x2162, 0x2163, 0x2164, 0x0000

121

};

122

123

// Static pool of canned test characters. It starts out null, is allocated by
// the IntlTestTextBoundary constructor and is deleted by the destructor.
UnicodeString* IntlTestTextBoundary::cannedTestChars = 0;

124

125

//---------------------------------------------

126

// setup methods

127

//---------------------------------------------

128

129

IntlTestTextBoundary::IntlTestTextBoundary()
{
    // Build the canned character pool: a leading U+0000 followed by the
    // contents of cannedTestArray.
    UnicodeString* pool = new UnicodeString();
    *pool += (UChar)0x0000;
    *pool += UnicodeString(cannedTestArray);
    cannedTestChars = pool;

    // Fill the four selection-data vectors used by the iteration tests.
    addTestWordData();
    addTestSentenceData();
    addTestLineData();
    addTestCharacterData();
}

140

141

/**
 * Releases the four selection-data vectors and the canned character pool
 * allocated by the constructor.
 */
IntlTestTextBoundary::~IntlTestTextBoundary()
{
    delete wordSelectionData;
    delete sentenceSelectionData;
    delete lineSelectionData;
    delete characterSelectionData;

    // cannedTestChars is a static member, so it outlives this object. Clear it
    // after deleting so that no dangling pointer is left behind and a later
    // destructor run cannot delete the same buffer twice.
    delete cannedTestChars;
    cannedTestChars = 0;
}

149

150

/**

151

* @bug 4097779 4098467 4117554

152

153

void IntlTestTextBoundary::addTestWordData()

154

{

155

wordSelectionData = new Vector();

156

157

wordSelectionData->addElement("12,34");

158

159

wordSelectionData->addElement(" ");

160

wordSelectionData->addElement(UCharToUnicodeString((UChar)(0x00A2))); //cent sign

161

wordSelectionData->addElement(UCharToUnicodeString((UChar)(0x00A3))); //pound sign

162

wordSelectionData->addElement(UCharToUnicodeString((UChar)(0x00A4))); //currency sign

163

wordSelectionData->addElement(UCharToUnicodeString((UChar)(0x00A5))); //yen sign

164

wordSelectionData->addElement("alpha-beta-gamma");

165

wordSelectionData->addElement(".");

166

wordSelectionData->addElement(" ");

167

wordSelectionData->addElement("Badges");

168

wordSelectionData->addElement("?");

169

wordSelectionData->addElement(" ");

170

wordSelectionData->addElement("BADGES");

171

wordSelectionData->addElement("!");

172

wordSelectionData->addElement("?");

173

wordSelectionData->addElement("!");

174

wordSelectionData->addElement(" ");

175

wordSelectionData->addElement("We");

176

wordSelectionData->addElement(" ");

177

wordSelectionData->addElement("don't");

178

wordSelectionData->addElement(" ");

179

wordSelectionData->addElement("need");

180

wordSelectionData->addElement(" ");

181

wordSelectionData->addElement("no");

182

wordSelectionData->addElement(" ");

183

wordSelectionData->addElement("STINKING");

184

wordSelectionData->addElement(" ");

185

wordSelectionData->addElement("BADGES");

186

wordSelectionData->addElement("!");

187

wordSelectionData->addElement("!");

188

wordSelectionData->addElement("!");

189

190

wordSelectionData->addElement("012.566,5");

191

wordSelectionData->addElement(" ");

192

wordSelectionData->addElement("123.3434,900");

193

wordSelectionData->addElement(" ");

194

wordSelectionData->addElement("1000,233,456.000");

195

wordSelectionData->addElement(" ");

196

wordSelectionData->addElement("1,23.322%");

197

wordSelectionData->addElement(" ");

198

wordSelectionData->addElement("123.1222");

199

200

wordSelectionData->addElement(" ");

201

wordSelectionData->addElement("$123,000.20");

202

203

wordSelectionData->addElement(" ");

204

wordSelectionData->addElement("179.01%");

205

206

wordSelectionData->addElement("Hello");

207

wordSelectionData->addElement(",");

208

wordSelectionData->addElement(" ");

209

wordSelectionData->addElement("how");

210

wordSelectionData->addElement(" ");

211

wordSelectionData->addElement("are");

212

wordSelectionData->addElement(" ");

213

wordSelectionData->addElement("you");

214

wordSelectionData->addElement(" ");

215

wordSelectionData->addElement("X");

216

wordSelectionData->addElement(" ");

217

218

wordSelectionData->addElement("Now");

219

wordSelectionData->addElement("\r");

220

wordSelectionData->addElement("is");

221

wordSelectionData->addElement("\n");

222

wordSelectionData->addElement("the");

223

wordSelectionData->addElement("\r\n");

224

wordSelectionData->addElement("time");

225

wordSelectionData->addElement("\n");

226

wordSelectionData->addElement("\r");

227

wordSelectionData->addElement("for");

228

wordSelectionData->addElement("\r");

229

wordSelectionData->addElement("\r");

230

wordSelectionData->addElement("all");

231

wordSelectionData->addElement(" ");

232

233

// to test for bug #4097779

234

wordSelectionData->addElement(CharsToUnicodeString("aa\\u0300a"));

235

wordSelectionData->addElement(" ");

236

237

// to test for bug #4098467

238

// What follows is a string of Korean characters (I found it in the Yellow Pages

239

// ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed

240

// it correctly), first as precomposed syllables, and then as conjoining jamo.

241

// Both sequences should be semantically identical and break the same way.

242

// precomposed syllables...

243

wordSelectionData->addElement(CharsToUnicodeString("\\uc0c1\\ud56d"));

244

wordSelectionData->addElement(" ");

245

wordSelectionData->addElement(CharsToUnicodeString("\\ud55c\\uc778"));

246

wordSelectionData->addElement(" ");

247

wordSelectionData->addElement(CharsToUnicodeString("\\uc5f0\\ud569"));

248

wordSelectionData->addElement(" ");

249

wordSelectionData->addElement(CharsToUnicodeString("\\uc7a5\\ub85c\\uad50\\ud68c"));

250

wordSelectionData->addElement(" ");

251

// conjoining jamo...

252

wordSelectionData->addElement(CharsToUnicodeString("\\u1109\\u1161\\u11bc\\u1112\\u1161\\u11bc"));

253

wordSelectionData->addElement(" ");

254

wordSelectionData->addElement(CharsToUnicodeString("\\u1112\\u1161\\u11ab\\u110b\\u1175\\u11ab"));

255

wordSelectionData->addElement(" ");

256

wordSelectionData->addElement(CharsToUnicodeString("\\u110b\\u1167\\u11ab\\u1112\\u1161\\u11b8"));

257

wordSelectionData->addElement(" ");

258

wordSelectionData->addElement(CharsToUnicodeString("\\u110c\\u1161\\u11bc\\u1105\\u1169\\u1100\\u116d\\u1112\\u116c"));

259

wordSelectionData->addElement(" ");

260

261

// this is a test for bug #4117554: the ideographic iteration mark (U+3005) should

262

// count as a Kanji character for the purposes of word breaking

263

wordSelectionData->addElement("abc");

264

wordSelectionData->addElement(CharsToUnicodeString("\\u4e01\\u4e02\\u3005\\u4e03\\u4e03"));

265

wordSelectionData->addElement("abc");

266

267

268

}

269

270

// U+2028 and U+2029, appended to some sentence and line test data below.
const UChar kLineSeparator = 0x2028;
const UChar kParagraphSeparator = 0x2029;

272

273

/**

274

* @bug 4111338 4117554 4113835

275

276

void IntlTestTextBoundary::addTestSentenceData()

277

{

278

sentenceSelectionData = new Vector();

279

sentenceSelectionData->addElement("This is a simple sample sentence. ");

280

sentenceSelectionData->addElement("(This is it.) ");

281

sentenceSelectionData->addElement("This is a simple sample sentence. ");

282

sentenceSelectionData->addElement("\"This isn\'t it.\" ");

283

sentenceSelectionData->addElement("Hi! ");

284

sentenceSelectionData->addElement("This is a simple sample sentence. ");

285

sentenceSelectionData->addElement("It does not have to make any sense as you can see. ");

286

sentenceSelectionData->addElement("Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ");

287

sentenceSelectionData->addElement("Che la dritta via aveo smarrita. ");

288

sentenceSelectionData->addElement("He said, that I said, that you said!! ");

289

290

sentenceSelectionData->addElement("Don't rock the boat." + UCharToUnicodeString(kParagraphSeparator));

291

292

sentenceSelectionData->addElement("Because I am the daddy, that is why. ");

293

sentenceSelectionData->addElement("Not on my time (el timo.)! ");

294

295

sentenceSelectionData->addElement("So what!!" + UCharToUnicodeString(kParagraphSeparator));

296

297

sentenceSelectionData->addElement("\"But now,\" he said, \"I know!\" ");

298

sentenceSelectionData->addElement("Harris thumbed down several, including \"Away We Go\" (which became the huge success Oklahoma!). ");

299

sentenceSelectionData->addElement("One species, B. anthracis, is highly virulent.\n");

300

sentenceSelectionData->addElement("Wolf said about Sounder:\"Beautifully thought-out and directed.\" ");

301

sentenceSelectionData->addElement("Have you ever said, \"This is where\tI shall live\"? ");

302

sentenceSelectionData->addElement("He answered, \"You may not!\" ");

303

sentenceSelectionData->addElement("Another popular saying is: \"How do you do?\". ");

304

sentenceSelectionData->addElement("Yet another popular saying is: \'I\'m fine thanks.\' ");

305

sentenceSelectionData->addElement("What is the proper use of the abbreviation pp.? ");

306

sentenceSelectionData->addElement("Yes, I am definatelly 12\" tall!!");

307

308

// test for bug #4113835: \n and \r count as spaces, not as paragraph breaks

309

sentenceSelectionData->addElement(CharsToUnicodeString("Now\ris\nthe\r\ntime\n\rfor\r\rall\\u2029"));

310

311

// test for bug #4111338: Don't break sentences at the boundary between CJK

312

// and other letters

313

sentenceSelectionData->addElement(CharsToUnicodeString("\\u5487\\u67ff\\ue591\\u5017\\u61b3\\u60a1\\u9510\\u8165:\"JAVA\\u821c")

314

+ CharsToUnicodeString("\\u8165\\u7fc8\\u51ce\\u306d,\\u2494\\u56d8\\u4ec0\\u60b1\\u8560\\u51ba")

315

+ CharsToUnicodeString("\\u611d\\u57b6\\u2510\\u5d46\".\\u2029"));

316

sentenceSelectionData->addElement(CharsToUnicodeString("\\u5487\\u67ff\\ue591\\u5017\\u61b3\\u60a1\\u9510\\u8165\\u9de8")

317

+ CharsToUnicodeString("\\u97e4JAVA\\u821c\\u8165\\u7fc8\\u51ce\\u306d\\ue30b\\u2494\\u56d8\\u4ec0")

318

+ CharsToUnicodeString("\\u60b1\\u8560\\u51ba\\u611d\\u57b6\\u2510\\u5d46\\u97e5\\u7751\\u2029"));

319

sentenceSelectionData->addElement(CharsToUnicodeString("\\u5487\\u67ff\\ue591\\u5017\\u61b3\\u60a1\\u9510\\u8165\\u9de8\\u97e4")

320

+ CharsToUnicodeString("\\u6470\\u8790JAVA\\u821c\\u8165\\u7fc8\\u51ce\\u306d\\ue30b\\u2494\\u56d8")

321

+ CharsToUnicodeString("\\u4ec0\\u60b1\\u8560\\u51ba\\u611d\\u57b6\\u2510\\u5d46\\u97e5\\u7751\\u2029"));

322

sentenceSelectionData->addElement(CharsToUnicodeString("He said, \"I can go there.\"\\u2029"));

323

324

// test for bug #4117554: Treat fullwidth variants of .!? the same as their

325

// normal counterparts

326

sentenceSelectionData->addElement(CharsToUnicodeString("I know I'm right\\uff0e "));

327

sentenceSelectionData->addElement(CharsToUnicodeString("Right\\uff1f "));

328

sentenceSelectionData->addElement(CharsToUnicodeString("Right\\uff01 "));

329

330

// test for bug #4117554: Don't break sentences at boundary between CJK and digits

331

sentenceSelectionData->addElement(CharsToUnicodeString("\\u5487\\u67ff\\ue591\\u5017\\u61b3\\u60a1\\u9510\\u8165\\u9de8")

332

+ CharsToUnicodeString("\\u97e48888\\u821c\\u8165\\u7fc8\\u51ce\\u306d\\ue30b\\u2494\\u56d8\\u4ec0")

333

+ CharsToUnicodeString("\\u60b1\\u8560\\u51ba\\u611d\\u57b6\\u2510\\u5d46\\u97e5\\u7751\\u2029"));

334

335

// test for bug #4117554: Break sentence between a sentence terminator and

336

// opening punctuation

337

sentenceSelectionData->addElement("no?");

338

sentenceSelectionData->addElement("(yes)" + CharsToUnicodeString("\\u2029"));

339

340

// test for bug #4158381: Don't break sentence after period if it isn't

341

// followed by a space

342

sentenceSelectionData->addElement("Test <code>Flags.Flag</code> class. ");

343

sentenceSelectionData->addElement("Another test." + CharsToUnicodeString("\\u2029"));

344

345

// test for bug #4158381: No breaks when there are no terminators around

346

sentenceSelectionData->addElement("Provides a set of "lightweight" (all-javaTM language) components that, to the maximum degree possible, work the same on all platforms. ");

347

sentenceSelectionData->addElement("Another test." + CharsToUnicodeString("\\u2029"));

348

349

// test for bug #4143071: Make sure sentences that end with digits

350

// work right

351

sentenceSelectionData->addElement("Today is the 27th of May, 1998. ");

352

sentenceSelectionData->addElement("Tomorrow with be 28 May 1998. ");

353

sentenceSelectionData->addElement("The day after will be the 30th."

354

+ CharsToUnicodeString("\\u2029"));

355

356

// test for bug #4152416: Make sure sentences ending with a capital

357

// letter are treated correctly

358

sentenceSelectionData->addElement("The type of all primitive <code>boolean</code> values accessed in the target VM. ");

359

sentenceSelectionData->addElement("Calls to xxx will return an implementor of this interface." + CharsToUnicodeString("\\u2029"));

360

361

// test for bug #4152117: Make sure sentence breaking is handling

362

// punctuation correctly [COULD NOT REPRODUCE THIS BUG, BUT TEST IS

363

// HERE TO MAKE SURE IT DOESN'T CROP UP]

364

sentenceSelectionData->addElement("Constructs a randomly generated BigInteger, uniformly distributed over the range <tt>0</tt> to <tt>(2numBits - 1)</tt>, inclusive. ");

365

sentenceSelectionData->addElement("The uniformity of the distribution assumes that a fair source of random bits is provided in <tt>rnd</tt>. ");

366

sentenceSelectionData->addElement("Note that this constructor always constructs a non-negative BigInteger." + CharsToUnicodeString("\\u2029"));

367

368

}

369

370

/**

371

* @bug 4068133 4086052 4035266 4097920 4098467 4117554

372

373

void IntlTestTextBoundary::addTestLineData()

374

{

375

lineSelectionData = new Vector();

376

lineSelectionData->addElement("Multi-");

377

lineSelectionData->addElement("Level ");

378

lineSelectionData->addElement("example ");

379

lineSelectionData->addElement("of ");

380

lineSelectionData->addElement("a ");

381

lineSelectionData->addElement("semi-");

382

lineSelectionData->addElement("idiotic ");

383

lineSelectionData->addElement("non-");

384

lineSelectionData->addElement("sensical ");

385

lineSelectionData->addElement("(non-");

386

lineSelectionData->addElement("important) ");

387

lineSelectionData->addElement("sentence. ");

388

389

lineSelectionData->addElement("Hi ");

390

lineSelectionData->addElement("Hello ");

391

lineSelectionData->addElement("How\n");

392

lineSelectionData->addElement("are\r");

393

lineSelectionData->addElement("you" + UCharToUnicodeString(kLineSeparator));

394

lineSelectionData->addElement("fine.\t");

395

lineSelectionData->addElement("good. ");

396

397

lineSelectionData->addElement("Now\r");

398

lineSelectionData->addElement("is\n");

399

lineSelectionData->addElement("the\r\n");

400

lineSelectionData->addElement("time\n");

401

lineSelectionData->addElement("\r");

402

lineSelectionData->addElement("for\r");

403

lineSelectionData->addElement("\r");

404

lineSelectionData->addElement("all");

405

406

// to test for bug #4068133

407

lineSelectionData->addElement(CharsToUnicodeString("\\u96f6"));

408

lineSelectionData->addElement(CharsToUnicodeString("\\u4e00\\u3002"));

409

lineSelectionData->addElement(CharsToUnicodeString("\\u4e8c\\u3001"));

410

lineSelectionData->addElement(CharsToUnicodeString("\\u4e09\\u3002\\u3001"));

411

lineSelectionData->addElement(CharsToUnicodeString("\\u56db\\u3001\\u3002\\u3001"));

412

lineSelectionData->addElement(CharsToUnicodeString("\\u4e94,"));

413

lineSelectionData->addElement(CharsToUnicodeString("\\u516d."));

414

lineSelectionData->addElement(CharsToUnicodeString("\\u4e03.\\u3001,\\u3002"));

415

lineSelectionData->addElement(CharsToUnicodeString("\\u516b"));

416

417

// to test for bug #4086052

418

lineSelectionData->addElement(CharsToUnicodeString("foo\\u00a0bar "));

419

// lineSelectionData->addElement("foo\\ufeffbar");

420

421

// to test for bug #4097920

422

lineSelectionData->addElement("dog,");

423

lineSelectionData->addElement("cat,");

424

lineSelectionData->addElement("mouse ");

425

lineSelectionData->addElement("(one)");

426

lineSelectionData->addElement("(two)\n");

427

428

// to test for bug #4035266

429

lineSelectionData->addElement("The ");

430

lineSelectionData->addElement("balance ");

431

lineSelectionData->addElement("is ");

432

lineSelectionData->addElement("$-23,456.78, ");

433

lineSelectionData->addElement("not ");

434

lineSelectionData->addElement("-$32,456.78!\n");

435

436

// to test for bug #4098467

437

// What follows is a string of Korean characters (I found it in the Yellow Pages

438

// ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed

439

// it correctly), first as precomposed syllables, and then as conjoining jamo.

440

// Both sequences should be semantically identical and break the same way.

441

// precomposed syllables...

442

lineSelectionData->addElement(CharsToUnicodeString("\\uc0c1\\ud56d "));

443

lineSelectionData->addElement(CharsToUnicodeString("\\ud55c\\uc778 "));

444

lineSelectionData->addElement(CharsToUnicodeString("\\uc5f0\\ud569 "));

445

lineSelectionData->addElement(CharsToUnicodeString("\\uc7a5\\ub85c\\uad50\\ud68c "));

446

// conjoining jamo...

447

lineSelectionData->addElement(CharsToUnicodeString("\\u1109\\u1161\\u11bc\\u1112\\u1161\\u11bc "));

448

lineSelectionData->addElement(CharsToUnicodeString("\\u1112\\u1161\\u11ab\\u110b\\u1175\\u11ab "));

449

lineSelectionData->addElement(CharsToUnicodeString("\\u110b\\u1167\\u11ab\\u1112\\u1161\\u11b8 "));

450

lineSelectionData->addElement(CharsToUnicodeString("\\u110c\\u1161\\u11bc\\u1105\\u1169\\u1100\\u116d\\u1112\\u116c"));

451

452

// to test for bug #4117554: Fullwidth .!? should be treated as postJwrd

453

lineSelectionData->addElement(CharsToUnicodeString("\\u4e01\\uff0e"));

454

lineSelectionData->addElement(CharsToUnicodeString("\\u4e02\\uff01"));

455

lineSelectionData->addElement(CharsToUnicodeString("\\u4e03\\uff1f"));

456

457

}

458

459

460

// Base letters followed by a single combining mark (U+0300, U+0317, U+0301,
// U+0302, U+0303 respectively).
// graveS used to be written as "S" + (UChar)0x0300, which is pointer
// arithmetic on the string literal (an out-of-range pointer), not string
// concatenation; it now goes through UCharToUnicodeString like its siblings.
const UnicodeString graveS = "S" + UCharToUnicodeString(0x0300);
const UnicodeString acuteBelowI = "i" + UCharToUnicodeString(0x0317);
const UnicodeString acuteE = "e" + UCharToUnicodeString(0x0301);
const UnicodeString circumflexA = "a" + UCharToUnicodeString(0x0302);
const UnicodeString tildeE = "e" + UCharToUnicodeString(0x0303);

465

466

467

/**

468

* @bug 4098467

469

470

void IntlTestTextBoundary::addTestCharacterData()

471

{

472

characterSelectionData = new Vector();

473

characterSelectionData->addElement("S" + UCharToUnicodeString(0x0300)); //graveS

474

characterSelectionData->addElement("i" + UCharToUnicodeString(0x0301)); // acuteBelowI

475

characterSelectionData->addElement("m");

476

characterSelectionData->addElement("p");

477

characterSelectionData->addElement("l");

478

characterSelectionData->addElement("e" + UCharToUnicodeString(0x0301)); // acuteE

479

characterSelectionData->addElement(" ");

480

characterSelectionData->addElement("s");

481

characterSelectionData->addElement("a" + UCharToUnicodeString(0x0302)); // circumflexA

482

characterSelectionData->addElement("m");

483

characterSelectionData->addElement("p");

484

characterSelectionData->addElement("l");

485

characterSelectionData->addElement("e" + UCharToUnicodeString(0x0303)); // tildeE

486

characterSelectionData->addElement(".");

487

characterSelectionData->addElement("w");

488

characterSelectionData->addElement("a" + UCharToUnicodeString(0x0302)); // circumflexA

489

characterSelectionData->addElement("w");

490

characterSelectionData->addElement("a");

491

characterSelectionData->addElement("f");

492

characterSelectionData->addElement("q");

493

characterSelectionData->addElement("\n");

494

characterSelectionData->addElement("\r");

495

characterSelectionData->addElement("\r\n");

496

characterSelectionData->addElement("\n");

497

498

// to test for bug #4098467

499

// What follows is a string of Korean characters (I found it in the Yellow Pages

500

// ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed

501

// it correctly), first as precomposed syllables, and then as conjoining jamo.

502

// Both sequences should be semantically identical and break the same way.

503

// precomposed syllables...

504

characterSelectionData->addElement(CharsToUnicodeString("\\uc0c1"));

505

characterSelectionData->addElement(CharsToUnicodeString("\\ud56d"));

506

characterSelectionData->addElement(" ");

507

characterSelectionData->addElement(CharsToUnicodeString("\\ud55c"));

508

characterSelectionData->addElement(CharsToUnicodeString("\\uc778"));

509

characterSelectionData->addElement(" ");

510

characterSelectionData->addElement(CharsToUnicodeString("\\uc5f0"));

511

characterSelectionData->addElement(CharsToUnicodeString("\\ud569"));

512

characterSelectionData->addElement(" ");

513

characterSelectionData->addElement(CharsToUnicodeString("\\uc7a5"));

514

characterSelectionData->addElement(CharsToUnicodeString("\\ub85c"));

515

characterSelectionData->addElement(CharsToUnicodeString("\\uad50"));

516

characterSelectionData->addElement(CharsToUnicodeString("\\ud68c"));

517

characterSelectionData->addElement(" ");

518

// conjoining jamo...

519

characterSelectionData->addElement(CharsToUnicodeString("\\u1109\\u1161\\u11bc"));

520

characterSelectionData->addElement(CharsToUnicodeString("\\u1112\\u1161\\u11bc"));

521

characterSelectionData->addElement(" ");

522

characterSelectionData->addElement(CharsToUnicodeString("\\u1112\\u1161\\u11ab"));

523

characterSelectionData->addElement(CharsToUnicodeString("\\u110b\\u1175\\u11ab"));

524

characterSelectionData->addElement(" ");

525

characterSelectionData->addElement(CharsToUnicodeString("\\u110b\\u1167\\u11ab"));

526

characterSelectionData->addElement(CharsToUnicodeString("\\u1112\\u1161\\u11b8"));

527

characterSelectionData->addElement(" ");

528

characterSelectionData->addElement(CharsToUnicodeString("\\u110c\\u1161\\u11bc"));

529

characterSelectionData->addElement(CharsToUnicodeString("\\u1105\\u1169"));

530

characterSelectionData->addElement(CharsToUnicodeString("\\u1100\\u116d"));

531

characterSelectionData->addElement(CharsToUnicodeString("\\u1112\\u116c"));

532

533

}

534

535

// Concatenates every remaining element of the enumeration, in order, into a
// single string and returns it.
UnicodeString IntlTestTextBoundary::createTestData(Enumeration* e)
{
    UnicodeString joined = "";
    for (; e->hasMoreElements(); ) {
        joined += e->nextElement();
    }
    return joined;
}

544

545

//---------------------------------------------

546

// SentenceBreak tests

547

//---------------------------------------------

548

549

void IntlTestTextBoundary::TestSentenceIteration()

550

{

551

UErrorCode status = U_ZERO_ERROR;

552

BreakIterator* e = BreakIterator::createSentenceInstance(Locale::getDefault(), status);

553

if (U_FAILURE(status))

554

{

555

errln("Failed to create the BreakIterator for default locale in TestSentenceIteration.\n");

556

return;

557

}

558

generalIteratorTest(*e, sentenceSelectionData);

559

delete e;

560

}

561

562

void IntlTestTextBoundary::TestSentenceInvariants()

563

{

564

UErrorCode status = U_ZERO_ERROR;

565

BreakIterator *e = BreakIterator::createSentenceInstance(Locale::getDefault(), status);

566

if (U_FAILURE(status))

567

{

568

errln("Failed to create the BreakIterator for default locale in TestSentenceInvariant.\n");

569

return;

570

}

571

UnicodeString s = *cannedTestChars + CharsToUnicodeString(".,\\u3001\\u3002\\u3041\\u3042\\u3043\\ufeff");

572

doOtherInvariantTest(*e, s);

573

delete e;

574

}

575

//---------------------------------------------

576

// WordBreak tests

577

//---------------------------------------------

578

void IntlTestTextBoundary::TestWordIteration()

579

{

580

UErrorCode status = U_ZERO_ERROR;

581

BreakIterator* e = BreakIterator::createWordInstance(Locale::getDefault(), status);

582

if (U_FAILURE(status))

583

{

584

errln("Failed to create the BreakIterator for default locale in TestWordIteration.\n");

585

return;

586

}

587

generalIteratorTest(*e, wordSelectionData);

588

delete e;

589

}

590

void IntlTestTextBoundary::TestWordInvariants()

591

{

592

UErrorCode status = U_ZERO_ERROR;

593

BreakIterator *e = BreakIterator::createWordInstance(Locale::getDefault(), status);

594

if (U_FAILURE(status))

595

{

596

errln("Failed to create the BreakIterator for default locale in TestWordInvariants.\n");

597

return;

598

}

599

UnicodeString s = *cannedTestChars + CharsToUnicodeString("\',.\\u3041\\u3042\\u3043\\u309b\\u309c\\u30a1\\u30a2\\u30a3\\u4e00\\u4e01\\u4e02");

600

doBreakInvariantTest(*e, s);

601

s = *cannedTestChars + CharsToUnicodeString("\',.\\u3041\\u3042\\u3043\\u309b\\u309c\\u30a1\\u30a2\\u30a3\\u4e00\\u4e01\\u4e02");

602

doOtherInvariantTest(*e, s);

603

delete e;

604

}

605

//---------------------------------------------

606

// CharacterBreak tests

607

//---------------------------------------------

608

void IntlTestTextBoundary::TestCharacterIteration()

609

{

610

UErrorCode status = U_ZERO_ERROR;

611

BreakIterator* e = BreakIterator::createCharacterInstance(Locale::getDefault(), status);

612

if (U_FAILURE(status))

613

{

614

errln("Failed to create the BreakIterator for default locale in TestCharacterIteration.\n");

615

return;

616

}

617

// generalIteratorTest(*e, testCharacterText, characterSelectionData);

618

generalIteratorTest(*e, characterSelectionData);

619

delete e;

620

}

621

void IntlTestTextBoundary::TestCharacterInvariants()

622

{

623

UErrorCode status = U_ZERO_ERROR;

624

BreakIterator *e = BreakIterator::createCharacterInstance(Locale::getDefault(), status);

625

if (U_FAILURE(status))

626

{

627

errln("Failed to create the BreakIterator for default locale in TestCharacterInvariants.\n");

628

return;

629

}

630

UnicodeString s = *cannedTestChars + CharsToUnicodeString("\\u1100\\u1101\\u1102\\u1160\\u1161\\u1162\\u11a8\\u11a9\\u11aa");

631

doBreakInvariantTest(*e, s);

632

s = *cannedTestChars + CharsToUnicodeString("\\u1100\\u1101\\u1102\\u1160\\u1161\\u1162\\u11a8\\u11a9\\u11aa");

633

doOtherInvariantTest(*e, s);

634

delete e;

635

}

636

//---------------------------------------------

637

// LineBreak tests

638

//---------------------------------------------

639

void IntlTestTextBoundary::TestLineIteration()

640

{

641

UErrorCode status = U_ZERO_ERROR;

642

BreakIterator* e = BreakIterator::createLineInstance(Locale::getDefault(), status);

643

if (U_FAILURE(status))

644

{

645

errln("Failed to create the BreakIterator for default locale in TestLineIteration.\n");

646

return;

647

}

648

generalIteratorTest(*e, lineSelectionData);

649

delete e;

650

}

651

void IntlTestTextBoundary::TestLineInvariants()

652

{

653

UErrorCode status = U_ZERO_ERROR;

654

BreakIterator *e = BreakIterator::createLineInstance(Locale::US, status);

655

if (U_FAILURE(status))

656

{

657

errln("Failed to create the BreakIterator for default locale in TestLineInvariants.\n");

658

return;

659

}

660

UnicodeString s = CharsToUnicodeString(".,;:\\u3001\\u3002\\u3041\\u3042\\u3043\\u3044\\u3045\\u30a3\\u4e00\\u4e01\\u4e02");

661

UnicodeString testChars = *cannedTestChars + s;

662

doBreakInvariantTest(*e, testChars);

663

doOtherInvariantTest(*e, testChars);

664

665

int32_t errCount = 0, testCharsLen, noBreakLen, dashesLen;

666

int32_t i, j, k;

667

668

// in addition to the other invariants, a line-break iterator should make sure that:

669

// it doesn't break around the non-breaking characters

670

UnicodeString noBreak = CharsToUnicodeString("\\u00a0\\u2007\\u2011\\ufeff");

671

UnicodeString work("aaa");

672

testCharsLen = testChars.length();

673

noBreakLen = noBreak.length();

674

for (i = 0; i < testCharsLen; i++) {

675

UChar c = testChars[i];

676

if (c == '\r' || c == '\n' || c == 0x2029 || c == 0x2028 || c == 0x0003)

677

continue;

678

work[0] = c;

679

for (j = 0; j < noBreakLen; j++) {

680

work[1] = noBreak[j];

681

for (k = 0; k < testCharsLen; k++) {

682

work[2] = testChars[k];

683

e->setText(work);

684

for (int l = e->first(); l != BreakIterator::DONE; l = e->next())

685

if (l == 1 || l == 2) {

686

errln("Got break between U+" + UCharToUnicodeString(work[l - 1]) +

687

" and U+" + UCharToUnicodeString(work[l]));

688

errCount++;

689

if (errCount >= 75)

690

return;

691

}

692

}

693

}

694

}

695

696

// it does break after hyphens (unless they're followed by a digit, a non-spacing mark,

697

// a currency symbol, a non-breaking space, or a line or paragraph separator)

698

UnicodeString dashes = CharsToUnicodeString("-\\u00ad\\u2010\\u2012\\u2013\\u2014");

699

dashesLen = dashes.length();

700

for (i = 0; i < testCharsLen; i++) {

701

work[0] = testChars[i];

702

for (j = 0; j < dashesLen; j++) {

703

work[1] = dashes[j];

704

for (k = 0; k < testCharsLen; k++) {

705

UChar c = testChars[k];

706

int8_t type = Unicode::getType(c);

707

if (type == Unicode::DECIMAL_DIGIT_NUMBER ||

708

type == Unicode::OTHER_NUMBER ||

709

type == Unicode::NON_SPACING_MARK ||

710

type == Unicode::ENCLOSING_MARK ||

711

type == Unicode::CURRENCY_SYMBOL ||

712

type == Unicode::SPACE_SEPARATOR ||

713

type == Unicode::DASH_PUNCTUATION ||

714

type == Unicode::CONTROL ||

715

type == Unicode::FORMAT ||

716

c == '\n' || c == '\r' || c == 0x2028 || c == 0x2029 ||

717

c == 0x0003 || c == 0x00a0 || c == 0x2007 || c == 0x2011 ||

718

c == 0xfeff)

719

{

720

continue;

721

}

722

work[2] = c;

723

e->setText(work);

724

UBool saw2 = FALSE;

725

for (int l = e->first(); l != BreakIterator::DONE; l = e->next()) {

726

if (l == 2) {

727

saw2 = TRUE;

728

break;

729

}

730

}

731

if (!saw2) {

732

errln("Didn't get break between U+" + UCharToUnicodeString(work[1]) +

733

" and U+" + UCharToUnicodeString(work[2]));

734

errCount++;

735

if (errCount >= 75)

736

return;

737

}

738

}

739

}

740

}

741

delete e;

742

}

743

744

void IntlTestTextBoundary::TestThaiLineBreak() {

745

Vector* thaiLineSelection = new Vector();

746

UErrorCode status = U_ZERO_ERROR;

747

748

// \u0e2f-- the Thai paiyannoi character-- isn't a letter. It's a symbol that

749

// represents elided letters at the end of a long word. It should be bound to

750

// the end of the word and not treated as an independent punctuation mark.

751

752

753

thaiLineSelection->addElement(CharsToUnicodeString("\\u0e2a\\u0e16\\u0e32\\u0e19\\u0e35\\u0e2f"));

754

thaiLineSelection->addElement(CharsToUnicodeString("\\u0e08\\u0e30"));

755

thaiLineSelection->addElement(CharsToUnicodeString("\\u0e23\\u0e30\\u0e14\\u0e21"));

756

thaiLineSelection->addElement(CharsToUnicodeString("\\u0e40\\u0e08\\u0e49\\u0e32"));

757

// thaiLineSelection->addElement(CharsToUnicodeString("\\u0e2b\\u0e19\\u0e49\\u0e32"));

758

// thaiLineSelection->addElement(CharsToUnicodeString("\\u0e17\\u0e35\\u0e48"));

759

thaiLineSelection->addElement(CharsToUnicodeString("\\u0e2b\\u0e19\\u0e49\\u0e32\\u0e17\\u0e35\\u0e48"));

760

// the commented-out lines (I think) are the preferred result; this line is what our current dictionary is giving us

761

thaiLineSelection->addElement(CharsToUnicodeString("\\u0e2d\\u0e2d\\u0e01"));

762

thaiLineSelection->addElement(CharsToUnicodeString("\\u0e21\\u0e32"));

763

thaiLineSelection->addElement(CharsToUnicodeString("\\u0e40\\u0e23\\u0e48\\u0e07"));

764

thaiLineSelection->addElement(CharsToUnicodeString("\\u0e23\\u0e30\\u0e1a\\u0e32\\u0e22"));

765

thaiLineSelection->addElement(CharsToUnicodeString("\\u0e2d\\u0e22\\u0e48\\u0e32\\u0e07"));

766

thaiLineSelection->addElement(CharsToUnicodeString("\\u0e40\\u0e15\\u0e47\\u0e21"));

767

768

// the one time where the paiyannoi occurs somewhere other than at the end

769

// of a word is in the Thai abbrevation for "etc.", which both begins and

770

// ends with a paiyannoi

771

thaiLineSelection->addElement(CharsToUnicodeString("\\u0e2f\\u0e25\\u0e2f"));

772

thaiLineSelection->addElement(CharsToUnicodeString("\\u0e17\\u0e35\\u0e48"));

773

thaiLineSelection->addElement(CharsToUnicodeString("\\u0e19\\u0e31\\u0e49\\u0e19"));

774

775

BreakIterator* e = BreakIterator::createLineInstance(

776

Locale("th"), status);

777

if (U_FAILURE(status))

778

{

779

errln("Failed to create the BreakIterator for Thai locale in TestThaiLineBreak.\n");

780

return;

781

}

782

783

generalIteratorTest(*e, thaiLineSelection);

784

delete e;

785

delete thaiLineSelection;

786

}

787

788

void IntlTestTextBoundary::TestMixedThaiLineBreak()

789

{

790

UErrorCode status = U_ZERO_ERROR;

791

Vector* thaiLineSelection= new Vector();

792

793

// Arabic numerals should always be separated from surrounding Thai text

794

795

thaiLineSelection->addElement(CharsToUnicodeString("\\u0e04\\u0e48\\u0e32"));

796

thaiLineSelection->addElement(CharsToUnicodeString("\\u0e40\\u0e07\\u0e34\\u0e19"));

797

thaiLineSelection->addElement(CharsToUnicodeString("\\u0e1a\\u0e32\\u0e17"));

798

thaiLineSelection->addElement(CharsToUnicodeString("\\u0e41\\u0e15\\u0e30"));

799

thaiLineSelection->addElement(CharsToUnicodeString("\\u0e23\\u0e30\\u0e14\\u0e31\\u0e1a"));

800

thaiLineSelection->addElement("39");

801

thaiLineSelection->addElement(CharsToUnicodeString("\\u0e1a\\u0e32\\u0e17 "));

802

803

// words in non-Thai scripts should always be separated from surrounding Thai text

804

thaiLineSelection->addElement(CharsToUnicodeString("\\u0e17\\u0e14"));

805

thaiLineSelection->addElement(CharsToUnicodeString("\\u0e2a\\u0e2d\\u0e1a"));

806

thaiLineSelection->addElement("Java");

807

thaiLineSelection->addElement(CharsToUnicodeString("\\u0e1a\\u0e19"));

808

thaiLineSelection->addElement(CharsToUnicodeString("\\u0e40\\u0e04\\u0e23\\u0e37\\u0e48\\u0e2d\\u0e07"));

809

thaiLineSelection->addElement(CharsToUnicodeString("\\u0e44\\u0e2d\\u0e1a\\u0e35\\u0e40\\u0e2d\\u0e47\\u0e21 "));

810

811

// Thai numerals should always be separated from the text surrounding them

812

thaiLineSelection->addElement(CharsToUnicodeString("\\u0e04\\u0e48\\u0e32"));

813

thaiLineSelection->addElement(CharsToUnicodeString("\\u0e40\\u0e07\\u0e34\\u0e19"));

814

thaiLineSelection->addElement(CharsToUnicodeString("\\u0e1a\\u0e32\\u0e17"));

815

thaiLineSelection->addElement(CharsToUnicodeString("\\u0e41\\u0e15\\u0e30"));

816

thaiLineSelection->addElement(CharsToUnicodeString("\\u0e23\\u0e30\\u0e14\\u0e31\\u0e1a"));

817

thaiLineSelection->addElement(CharsToUnicodeString("\\u0e53\\u0e59"));

818

thaiLineSelection->addElement(CharsToUnicodeString("\\u0e1a\\u0e32\\u0e17 "));

819

820

// Thai text should interact correctly with punctuation and symbols

821

thaiLineSelection->addElement(CharsToUnicodeString("\\u0e44\\u0e2d\\u0e1a\\u0e35\\u0e40\\u0e2d\\u0e47\\u0e21"));

822

// thaiLineSelection->addElement(CharsToUnicodeString("(\\u0e1b\\u0e23\\u0e30\\u0e40\\u0e17\\u0e28"));

823

// thaiLineSelection->addElement(CharsToUnicodeString("\\u0e44\\u0e17\\u0e22)"));

824

thaiLineSelection->addElement(CharsToUnicodeString("(\\u0e1b\\u0e23\\u0e30\\u0e40\\u0e17\\u0e28\\u0e44\\u0e17\\u0e22)"));

825

// I believe the commented-out reading above to be the correct one, but this is what passes with our current dictionary

826

thaiLineSelection->addElement(CharsToUnicodeString("\\u0e08\\u0e33\\u0e01\\u0e31\\u0e14"));

827

thaiLineSelection->addElement(CharsToUnicodeString("\\u0e40\\u0e1b\\u0e34\\u0e14"));

828

thaiLineSelection->addElement(CharsToUnicodeString("\\u0e15\\u0e31\\u0e27\""));

829

830

thaiLineSelection->addElement(CharsToUnicodeString("\\u0e2e\\u0e32\\u0e23\\u0e4c\\u0e14\\u0e14\\u0e34\\u0e2a\\u0e01\\u0e4c\""));

831

thaiLineSelection->addElement(CharsToUnicodeString("\\u0e23\\u0e38\\u0e48\\u0e19"));

832

thaiLineSelection->addElement(CharsToUnicodeString("\\u0e43\\u0e2b\\u0e21\\u0e48"));

833

thaiLineSelection->addElement(CharsToUnicodeString("\\u0e40\\u0e14\\u0e37\\u0e2d\\u0e19\\u0e21\\u0e34."));

834

thaiLineSelection->addElement(CharsToUnicodeString("\\u0e22."));

835

thaiLineSelection->addElement(CharsToUnicodeString("\\u0e19\\u0e35\\u0e49"));

836

thaiLineSelection->addElement(CharsToUnicodeString("\\u0e23\\u0e32\\u0e04\\u0e32"));

837

thaiLineSelection->addElement("$200");

838

thaiLineSelection->addElement(CharsToUnicodeString("\\u0e40\\u0e17\\u0e48\\u0e32"));

839

thaiLineSelection->addElement(CharsToUnicodeString("\\u0e19\\u0e31\\u0e49\\u0e19 "));

840

thaiLineSelection->addElement(CharsToUnicodeString("(\"\\u0e2e\\u0e32\\u0e23\\u0e4c\\u0e14\\u0e14\\u0e34\\u0e2a\\u0e01\\u0e4c\")."));

841

842

BreakIterator* e = BreakIterator::createLineInstance(Locale("th"), status);

843

if (U_FAILURE(status))

844

{

845

errln("Failed to create the BreakIterator for Thai locale in TestMixedThaiLineBreak.\n");

846

return;

847

}

848

849

850

generalIteratorTest(*e, thaiLineSelection);

851

delete e;

852

delete thaiLineSelection;

853

}

854

855

856

void IntlTestTextBoundary::TestMaiyamok()

857

{

858

Vector* thaiLineSelection= new Vector();

859

UErrorCode status = U_ZERO_ERROR;

860

// the Thai maiyamok character is a shorthand symbol that means "repeat the previous

861

// word". Instead of appearing as a word unto itself, however, it's kept together

862

// with the word before it

863

thaiLineSelection->addElement(CharsToUnicodeString("\\u0e44\\u0e1b\\u0e46"));

864

thaiLineSelection->addElement(CharsToUnicodeString("\\u0e21\\u0e32\\u0e46"));

865

thaiLineSelection->addElement(CharsToUnicodeString("\\u0e23\\u0e30\\u0e2b\\u0e27\\u0e48\\u0e32\\u0e07"));

866

thaiLineSelection->addElement(CharsToUnicodeString("\\u0e01\\u0e23\\u0e38\\u0e07\\u0e40\\u0e17\\u0e1e"));

867

thaiLineSelection->addElement(CharsToUnicodeString("\\u0e41\\u0e25\\u0e30"));

868

thaiLineSelection->addElement(CharsToUnicodeString("\\u0e40\\u0e03\\u0e35\\u0e22\\u0e07"));

869

thaiLineSelection->addElement(CharsToUnicodeString("\\u0e43\\u0e2b\\u0e21\\u0e48"));

870

871

BreakIterator* e = BreakIterator::createLineInstance(

872

Locale("th"), status);

873

874

if (U_FAILURE(status))

875

{

876

errln("Failed to create the BreakIterator for Thai locale in TestMaiyamok.\n");

877

return;

878

}

879

generalIteratorTest(*e, thaiLineSelection);

880

delete e;

881

delete thaiLineSelection;

882

}

883

884

void IntlTestTextBoundary::TestThaiWordBreak() {

885

Vector* thaiWordSelection = new Vector();

886

UErrorCode status = U_ZERO_ERROR;

887

888

thaiWordSelection->addElement(CharsToUnicodeString("\\u0E1A\\u0E17")); //2

889

thaiWordSelection->addElement(CharsToUnicodeString("\\u0E17\\u0E35\\u0E48")); //5

890

thaiWordSelection->addElement(CharsToUnicodeString("\\u0E51")); //6

891

thaiWordSelection->addElement(CharsToUnicodeString("\\u0E1E\\u0E32\\u0E22\\u0E38")); //10

892

thaiWordSelection->addElement(CharsToUnicodeString("\\u0E44\\u0E0B\\u0E42\\u0E04\\u0E25\\u0E19")); //16

893

thaiWordSelection->addElement(CharsToUnicodeString("\\u000D\\u000A")); //18

894

895

// This is the correct result

896

//thaiWordSelection->addElement(CharsToUnicodeString("\\u0E42\\u0E14\\u0E42\\u0E23\\u0E18\\u0E35")); //24

897

//thaiWordSelection->addElement(CharsToUnicodeString("\\u0E2D\\u0E32\\u0E28\\u0E31\\u0E22")); //29

898

899

// and this is what the dictionary does...

900

thaiWordSelection->addElement(CharsToUnicodeString("\\u0E42\\u0E14")); // 20

901

thaiWordSelection->addElement(CharsToUnicodeString("\\u0E42\\u0E23\\u0E18\\u0E35\\u0E2D\\u0E32\\u0E28\\u0E31\\u0E22")); //29

902

903

thaiWordSelection->addElement(CharsToUnicodeString("\\u0E2D\\u0E22\\u0E39\\u0E48")); //33

904

905

// This is the correct result

906

//thaiWordSelection->addElement(CharsToUnicodeString("\\u0E17\\u0E48\\u0E32\\u0E21")); //37

907

//thaiWordSelection->addElement(CharsToUnicodeString("\\u0E01\\u0E25\\u0E32\\u0E07")); //41

908

909

// and this is what the dictionary does

910

thaiWordSelection->addElement(CharsToUnicodeString("\\u0E17\\u0E48\\u0E32\\u0E21\\u0E01\\u0E25\\u0E32\\u0E07")); //41

911

912

thaiWordSelection->addElement(CharsToUnicodeString("\\u0E17\\u0E38\\u0E48\\u0E07")); //45

913

thaiWordSelection->addElement(CharsToUnicodeString("\\u0E43\\u0E2B\\u0E0D\\u0E48")); //49

914

thaiWordSelection->addElement(CharsToUnicodeString("\\u0E43\\u0E19")); //51

915

916

// This is the correct result

917

//thaiWordSelection->addElement(CharsToUnicodeString("\\u0E41\\u0E04\\u0E19\\u0E0B\\u0E31\\u0E2A")); //57

918

//thaiWordSelection->addElement(CharsToUnicodeString("\\u0E01\\u0E31\\u0E1A")); //60

919

920

// and this is what the dictionary does

921

thaiWordSelection->addElement(CharsToUnicodeString("\\u0E41\\u0E04\\u0E19")); // 54

922

thaiWordSelection->addElement(CharsToUnicodeString("\\u0E0B\\u0E31\\u0E2A\\u0E01\\u0E31\\u0E1A")); //60

923

924

thaiWordSelection->addElement(CharsToUnicodeString("\\u0E25\\u0E38\\u0E07")); //63

925

926

// This is the correct result

927

//thaiWordSelection->addElement(CharsToUnicodeString("\\u0E40\\u0E2E\\u0E19\\u0E23\\u0E35")); //68

928

//thaiWordSelection->addElement(CharsToUnicodeString("\\u0E0A\\u0E32\\u0E27")); //71

929

//thaiWordSelection->addElement(CharsToUnicodeString("\\u0E44\\u0E23\\u0E48")); //74

930

//thaiWordSelection->addElement(CharsToUnicodeString("\\u0E41\\u0E25\\u0E30")); //77

931

932

// and this is what the dictionary does

933

thaiWordSelection->addElement(CharsToUnicodeString("\\u0E40\\u0E2E")); // 65

934

thaiWordSelection->addElement(CharsToUnicodeString("\\u0E19\\u0E23\\u0E35\\u0E0A\\u0E32\\u0E27\\u0E44\\u0E23\\u0E48\\u0E41\\u0E25\\u0E30")); //77

935

936

BreakIterator* e = BreakIterator::createWordInstance(

937

Locale("th"), status);

938

if (U_FAILURE(status))

939

{

940

errln("Failed to create the BreakIterator for Thai locale in TestThaiWordBreak.\n");

941

return;

942

}

943

944

generalIteratorTest(*e, thaiWordSelection);

945

delete e;

946

delete thaiWordSelection;

947

}

948

949

/**

950

* Test Japanese Line Break

951

* @bug 4095322

952

953

void IntlTestTextBoundary::TestJapaneseLineBreak()

954

{

955

UErrorCode status = U_ZERO_ERROR;

956

UnicodeString testString = CharsToUnicodeString("\\u4e00x\\u4e8c");

957

UnicodeString precedingChars = CharsToUnicodeString("([{\\u00ab$\\u00a5\\u00a3\\u00a4\\u2018\\u201a\\u201c\\u201e\\u201b\\u201f");

958

UnicodeString followingChars = CharsToUnicodeString(")]}\\u00bb!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7\\u30fc:;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u30fe\\u2019\\u201d\\u00b0\\u2032\\u2033\\u2034\\u2030\\u2031\\u2103\\u2109\\u00a2\\u0300\\u0301\\u0302");

959

BreakIterator *iter = BreakIterator::createLineInstance(Locale::JAPAN, status);

960

961

int32_t i;

962

if (U_FAILURE(status))

963

{

964

errln("Failed to create the BreakIterator for Japanese locale in TestJapaneseLineBreak.\n");

965

return;

966

}

967

968

for (i = 0; i < precedingChars.length(); i++) {

969

testString[1] = precedingChars[i];

970

iter->setText(testString);

971

int32_t j = iter->first();

972

if (j != 0)

973

errln("ja line break failure: failed to start at 0");

974

j = iter->next();

975

if (j != 1)

976

errln("ja line break failure: failed to stop before '" + UCharToUnicodeString(precedingChars[i])

977

+ "' (" + ((int)(precedingChars[i])) + ")");

978

j = iter->next();

979

if (j != 3)

980

errln("ja line break failure: failed to skip position after '" + UCharToUnicodeString(precedingChars[i])

981

+ "' (" + ((int)(precedingChars[i])) + ")");

982

}

983

984

for (i = 0; i < followingChars.length(); i++) {

985

testString[1] = followingChars[i];

986

iter->setText(testString);

987

int j = iter->first();

988

if (j != 0)

989

errln("ja line break failure: failed to start at 0");

990

j = iter->next();

991

if (j != 2)

992

errln("ja line break failure: failed to skip position before '" + UCharToUnicodeString(followingChars[i])

993

+ "' (" + ((int)(followingChars[i])) + ")");

994

j = iter->next();

995

if (j != 3)

996

errln("ja line break failure: failed to stop after '" + UCharToUnicodeString(followingChars[i])

997

+ "' (" + ((int)(followingChars[i])) + ")");

998

}

999

delete iter;

1000

}

1001

1002

//---------------------------------------------

1003

// other tests

1004

//---------------------------------------------/

1005

1006

void IntlTestTextBoundary::TestEmptyString()

1007

{

1008

UnicodeString text = "";

1009

Vector x;

1010

UErrorCode status = U_ZERO_ERROR;

1011

x.addElement(text);

1012

BreakIterator* bi = BreakIterator::createLineInstance(Locale::getDefault(), status);

1013

if (U_FAILURE(status))

1014

{

1015

errln("Failed to create the BreakIterator for default locale in TestEmptyString.\n");

1016

return;

1017

}

1018

generalIteratorTest(*bi, &x);

1019

1020

delete bi;

1021

}

1022

1023

void IntlTestTextBoundary::TestGetAvailableLocales()

1024

{

1025

int32_t locCount = 0;

1026

const Locale* locList = BreakIterator::getAvailableLocales(locCount);

1027

1028

if (locCount == 0)

1029

errln("getAvailableLocales() returned an empty list!");

1030

// Just make sure that it's returning good memory.

1031

for (int32_t i = 0; i < locCount; ++i) {

1032

logln(locList[i].getName());

1033

}

1034

}

1035

1036

//Testing the BreakIterator::getDisplayName() function

1037

void IntlTestTextBoundary::TestGetDisplayName()

1038

{

1039

UnicodeString result;

1040

1041

BreakIterator::getDisplayName(Locale::getUS(), result);

1042

if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")

1043

errln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""

1044

+ result);

1045

1046

BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);

1047

if (result != "French (France)")

1048

errln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""

1049

+ result);

1050

}

1051

/**

1052

* Test End Behaviour

1053

* @bug 4068137

1054

1055

void IntlTestTextBoundary::TestEndBehaviour()

1056

{

1057

UErrorCode status = U_ZERO_ERROR;

1058

UnicodeString testString("boo.");

1059

BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);

1060

if (U_FAILURE(status))

1061

{

1062

errln("Failed to create the BreakIterator for default locale in TestEndBehaviour.\n");

1063

return;

1064

}

1065

wb->setText(testString);

1066

1067

if (wb->first() != 0)

1068

errln("Didn't get break at beginning of string.");

1069

if (wb->next() != 3)

1070

errln("Didn't get break before period in \"boo.\"");

1071

if (wb->current() != 4 && wb->next() != 4)

1072

errln("Didn't get break at end of string.");

1073

delete wb;

1074

}

1075

1076

* @bug 4153072

1077

1078

void IntlTestTextBoundary::TestBug4153072() {

1079

UErrorCode status = U_ZERO_ERROR;

1080

BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);

1081

if (U_FAILURE(status))

1082

{

1083

errln("Failed to create the BreakIterator for default locale in TestBug4153072\n");

1084

return;

1085

}

1086

UnicodeString str("...Hello, World!...");

1087

int32_t begin = 3;

1088

int32_t end = str.length() - 3;

1089

UBool dummy;

1090

1091

StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);

1092

iter->adoptText(textIterator);

1093

for (int index = -1; index < begin + 1; ++index) {

1094

dummy = iter->isBoundary(index);

1095

if (index < begin && dummy == TRUE) {

1096

errln((UnicodeString)"Didn't handle preceeding correctly with offset = " + index +

1097

" and begin index = " + begin);

1098

}

1099

}

1100

delete iter;

1101

}

1102

1103

1104

* Test Preceding()

1105

1106

void IntlTestTextBoundary::TestPreceding()

1107

{

1108

UErrorCode status = U_ZERO_ERROR;

1109

UnicodeString words3("aaa bbb ccc");

1110

BreakIterator* e = BreakIterator::createWordInstance(Locale::getDefault(), status);

1111

if (U_FAILURE(status))

1112

{

1113

errln("Failed to create the BreakIterator for default locale in TestPreceeding.\n");

1114

return;

1115

}

1116

1117

e->setText( words3 );

1118

e->first();

1119

int32_t p1 = e->next();

1120

int32_t p2 = e->next();

1121

int32_t p3 = e->next();

1122

int32_t p4 = e->next();

1123

1124

int32_t f = e->following(p2+1);

1125

int32_t p = e->preceding(p2+1);

1126

if (f!=p3)

1127

errln("IntlTestTextBoundary::TestPreceding: f!=p3");

1128

if (p!=p2)

1129

errln("IntlTestTextBoundary::TestPreceding: p!=p2");

1130

1131

if (p1+1!=p2)

1132

errln("IntlTestTextBoundary::TestPreceding: p1+1!=p2");

1133

1134

if (p3+1!=p4)

1135

errln("IntlTestTextBoundary::TestPreceding: p3+1!=p4");

1136

1137

if (!e->isBoundary(p2) || e->isBoundary(p2+1) || !e->isBoundary(p3))

1138

{

1139

errln("IntlTestTextBoundary::TestPreceding: isBoundary err");

1140

}

1141

delete e;

1142

}

1143

//---------------------------------------------

1144

// runIndexedTest

1145

//---------------------------------------------

1146

1147

/**
 * Maps a test index to its name and, when exec is true, runs that test.
 *
 * @param index  zero-based test number
 * @param exec   when true the selected test is executed; the name is set either way
 * @param name   receives the test name, or "" for an unknown index
 */
void IntlTestTextBoundary::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
{
    if (exec) {
        logln("TestSuite TextBoundary: ");
    }

    switch (index) {
        case 0:
            name = "TestSentenceIteration";
            if (exec) { TestSentenceIteration(); }
            break;
        case 1:
            name = "TestWordIteration";
            if (exec) { TestWordIteration(); }
            break;
        case 2:
            name = "TestLineIteration";
            if (exec) { TestLineIteration(); }
            break;
        case 3:
            name = "TestCharacterIteration";
            if (exec) { TestCharacterIteration(); }
            break;
        case 4:
            name = "TestSentenceInvariants";
            if (exec) { TestSentenceInvariants(); }
            break;
        case 5:
            name = "TestWordInvariants";
            if (exec) { TestWordInvariants(); }
            break;
        case 6:
            name = "TestLineInvariants";
            if (exec) { TestLineInvariants(); }
            break;
        case 7:
            name = "TestCharacterInvariants";
            if (exec) { TestCharacterInvariants(); }
            break;

        case 8:
            name = "TestEmptyString";
            if (exec) { TestEmptyString(); }
            break;
        case 9:
            name = "TestGetAvailableLocales";
            if (exec) { TestGetAvailableLocales(); }
            break;
        case 10:
            name = "TestGetDisplayName";
            if (exec) { TestGetDisplayName(); }
            break;
        case 11:
            name = "TestPreceding";
            if (exec) { TestPreceding(); }
            break;
        case 12:
            name = "TestBug4153072";
            if (exec) { TestBug4153072(); }
            break;
        case 13:
            name = "TestEndBehaviour";
            if (exec) { TestEndBehaviour(); }
            break;

        case 14:
            name = "TestJapaneseLineBreak";
            if (exec) { TestJapaneseLineBreak(); }
            break;
        case 15:
            name = "TestThaiLineBreak";
            if (exec) { TestThaiLineBreak(); }
            break;
        case 16:
            name = "TestMixedThaiLineBreak";
            if (exec) { TestMixedThaiLineBreak(); }
            break;
        case 17:
            name = "TestMaiyamok";
            if (exec) { TestMaiyamok(); }
            break;
        case 18:
            name = "TestThaiWordBreak";
            if (exec) { TestThaiWordBreak(); }
            break;

        default:
            name = "";   // needed to end loop
            break;
    }
}

1177

1178

//---------------------------------------------

1179

// Test implementation routines

1180

//---------------------------------------------

1181

1182

// general test Implementation subroutines

1183

/**
 * Runs the full battery of iterator checks against one expected segmentation.
 *
 * The test text is produced by createTestData() from the elements of
 * expectedResult. The text is then iterated forward and backward, the two
 * fragment lists are compared with each other and (if they agree) with the
 * expected list, and finally following(), preceding(), isBoundary() and
 * next(n) are checked.
 *
 * @param bi              the iterator under test
 * @param expectedResult  the fragments the iterator is expected to produce, in order
 */
void IntlTestTextBoundary::generalIteratorTest(BreakIterator& bi, Vector* expectedResult)
{
    Enumeration *elems = expectedResult->elements();
    UnicodeString text = createTestData(elems);
    delete elems;

    logln("comparing forward and backward...");
    bi.setText(text);

    Vector *nextResults = testFirstAndNext(bi, text);
    if (nextResults == NULL) {
        errln("Couldn't get nextResults!");
        return;
    }

    Vector *previousResults = testLastAndPrevious(bi, text);

    if (previousResults == NULL) {
        errln("Couldn't get previousResults!");
        delete nextResults;   // was leaked by the early return
        return;
    }

    int errs = getErrors();
    UnicodeString str1="forward iteration";
    UnicodeString str2="backward iteration";
    compareFragmentLists(str1, str2, nextResults,
                    previousResults);
    // Only compare against the expected list when forward and backward
    // iteration produced no new errors.
    if (getErrors() == errs) {
        logln("comparing expected and actual...");
        str1="expected result";
        str2="actual result";
        compareFragmentLists(str1, str2, expectedResult,
                        nextResults);
    }

    // Boundary table layout: [DONE, 0, end of fragment 0, ..., end of last fragment, DONE].
    int32_t *boundaries = new int32_t[expectedResult->size() + 3];
    boundaries[0] = BreakIterator::DONE;
    boundaries[1] = 0;
    for (int i = 0; i < expectedResult->size(); i++)
        boundaries[i + 2] = boundaries[i + 1] + ((UnicodeString)expectedResult->elementAt(i)).
                        length();

    int len = expectedResult->size() + 3 -1;
    boundaries[len] = BreakIterator::DONE;

    testFollowing(bi, text, boundaries);
    testPreceding(bi, text, boundaries);
    testIsBoundary(bi, text, boundaries);

    doMultipleSelectionTest(bi, text);

    delete nextResults;
    delete previousResults;
    delete []boundaries;
}

1238

1239

/**
 * Iterates forward with first()/next() and collects the text between
 * consecutive boundaries.
 *
 * @param bi    the iterator under test; its text must already be set to `text`
 * @param text  the text being iterated over
 * @return a newly allocated Vector of fragments that the caller must delete,
 *         or NULL if next() failed to move forward
 */
Vector* IntlTestTextBoundary::testFirstAndNext(BreakIterator& bi, UnicodeString& text)
{
    int32_t p = bi.first();
    int32_t lastP = p;
    Vector *result = new Vector();
    UnicodeString selection;

    if (p != 0)
        errln((UnicodeString)"first() returned " + p + (UnicodeString)" instead of 0");
    while (p != BreakIterator::DONE) {
        p = bi.next();
        if (p != BreakIterator::DONE) {
            if (p <= lastP) {
                errln((UnicodeString)"next() failed to move forward: next() on position "
                                + lastP + (UnicodeString)" yielded " + p);
                errln("Are the *.brk files corrupt?");
                delete result;   // was leaked on this bail-out path
                return NULL;
            }

            text.extractBetween(lastP, p, selection);
            result->addElement(selection);
        }
        else {
            // DONE must only be reported once the end of the text was reached.
            if (lastP != text.length())
                errln((UnicodeString)"next() returned DONE prematurely: offset was "
                                + lastP + (UnicodeString)" instead of " + text.length());
        }
        lastP = p;
    }
    return result;
}

1270

1271

/**
 * Iterates backward with last()/previous() and collects the text between
 * consecutive boundaries, in text order.
 *
 * @param bi    the iterator under test; its text must already be set to `text`
 * @param text  the text being iterated over
 * @return a newly allocated Vector of fragments that the caller must delete,
 *         or NULL if previous() failed to move backward
 */
Vector* IntlTestTextBoundary::testLastAndPrevious(BreakIterator& bi, UnicodeString& text)
{
    int32_t p = bi.last();
    int32_t lastP = p;
    Vector *result = new Vector();
    UnicodeString selection;

    if (p != text.length())
        errln((UnicodeString)"last() returned " + p + (UnicodeString)" instead of " + text.length());
    while (p != BreakIterator::DONE) {
        p = bi.previous();
        if (p != BreakIterator::DONE) {
            if (p >= lastP) {
                errln((UnicodeString)"previous() failed to move backward: previous() on position "
                                + lastP + (UnicodeString)" yielded " + p);
                // Bail out the same way testFirstAndNext does: continuing
                // with an iterator that does not move backward may never
                // reach DONE.
                delete result;
                return NULL;
            }

            text.extractBetween(p, lastP, selection);
            // Insert at the front so the fragments end up in text order.
            result->insertElementAt(selection, 0);
        }
        else {
            // DONE must only be reported once the start of the text was reached.
            if (lastP != 0)
                errln((UnicodeString)"previous() returned DONE prematurely: offset was "
                                + lastP + (UnicodeString)" instead of 0");
        }
        lastP = p;
    }
    return result;
}

1298

1299

/**
 * Walks two fragment lists in parallel. Matching fragments are logged; on a
 * mismatch the function scans ahead in both lists, logs the differing
 * fragments from each list, and reports one error.
 *
 * @param f1Name  label used for the first list in log and error output
 * @param f2Name  label used for the second list in log and error output
 * @param f1      first fragment list
 * @param f2      second fragment list
 */
void IntlTestTextBoundary::compareFragmentLists(UnicodeString& f1Name, UnicodeString& f2Name, Vector* f1, Vector* f2)
{
    int32_t idx1 = 0;       // current position in f1
    int32_t idx2 = 0;       // current position in f2
    int32_t total1 = 0;     // fragment lengths accumulated from f1
    int32_t total2 = 0;     // fragment lengths accumulated from f2
    UnicodeString target;   // scratch buffer handed to prettify()

    while (idx1 < f1->size() && idx2 < f2->size()) {
        UnicodeString frag1 = (UnicodeString)f1->elementAt(idx1);
        UnicodeString frag2 = (UnicodeString)f2->elementAt(idx2);
        total1 += frag1.length();
        total2 += frag2.length();

        if (frag1.compare(frag2) == 0) {
            logln(prettify((UnicodeString)" >" + frag1 + (UnicodeString)"<", target));
            ++idx1;
            ++idx2;
            continue;
        }

        // Mismatch: scan ahead, always advancing the side whose accumulated
        // length is smaller, until the lengths agree or a list runs out.
        int32_t aheadLen1 = total1;
        int32_t aheadLen2 = total2;
        int32_t ahead1 = idx1;
        int32_t ahead2 = idx2;

        while (aheadLen1 != aheadLen2 && ahead1 < f1->size() && ahead2 < f2->size()) {
            for (; aheadLen1 < aheadLen2 && ahead1 < f1->size(); ++ahead1) {
                aheadLen1 += ((UnicodeString)f1->elementAt(ahead1)).length();
            }
            for (; aheadLen2 < aheadLen1 && ahead2 < f2->size(); ++ahead2) {
                aheadLen2 += ((UnicodeString)f2->elementAt(ahead2)).length();
            }
        }

        // Log the differing fragments from the first list...
        logln((UnicodeString)"*** " + f1Name + (UnicodeString)" has:");
        for (; idx1 <= ahead1 && idx1 < f1->size(); ++idx1) {
            frag1 = (UnicodeString)f1->elementAt(idx1);
            total1 += frag1.length();
            logln(prettify((UnicodeString)" *** >" + frag1 + (UnicodeString)"<", target));
        }

        // ...and from the second list.
        logln("***** " + f2Name + " has:");
        for (; idx2 <= ahead2 && idx2 < f2->size(); ++idx2) {
            frag2 = (UnicodeString)f2->elementAt(idx2);
            total2 += frag2.length();
            logln(prettify(" ***** >" + frag2 + "<", target));
        }

        errln((UnicodeString)"Discrepancy between " + f1Name + (UnicodeString)" and " + f2Name);
    }
}

1354

1355

/**
 * Calls following() for every offset from 0 through the text length and
 * compares each result with the next entry of the boundary table.
 *
 * @param bi          the iterator under test
 * @param text        the text the iterator is set to
 * @param boundaries  boundary table; scanning starts at index 2
 */
void IntlTestTextBoundary::testFollowing(BreakIterator& bi, UnicodeString& text, int32_t *boundaries)
{
    logln("testFollowing():");

    const int32_t textLen = text.length();
    int expectedIdx = 2;   // index of the boundary expected from following()

    int i = 0;
    while (i <= textLen) {
        // Once the offset reaches the expected boundary, the boundary after
        // it becomes the expected answer.
        if (boundaries[expectedIdx] == i) {
            ++expectedIdx;
        }

        int32_t b = bi.following(i);
        logln((UnicodeString)"bi.following(" + i + ") -> " + b);
        if (boundaries[expectedIdx] != b) {
            errln((UnicodeString)"Wrong result from following() for " + i + (UnicodeString)": expected " + boundaries[expectedIdx]
                            + (UnicodeString)", got " + b);
        }
        ++i;
    }
}

1371

1372

/**
 * Calls preceding() for every offset from 0 through the text length and
 * compares each result with the previous entry of the boundary table.
 *
 * @param bi          the iterator under test
 * @param text        the text the iterator is set to
 * @param boundaries  boundary table; scanning starts at index 0
 */
void IntlTestTextBoundary::testPreceding(BreakIterator& bi, UnicodeString& text, int32_t *boundaries) {
    logln("testPreceding():");

    const int32_t textLen = text.length();
    int expectedIdx = 0;   // index of the boundary expected from preceding()

    int i = 0;
    while (i <= textLen) {
        int32_t b = bi.preceding(i);
        logln((UnicodeString)"bi.preceding(" + i + ") -> " + b);
        if (boundaries[expectedIdx] != b) {
            errln((UnicodeString)"Wrong result from preceding() for " + i + (UnicodeString)": expected " + boundaries[expectedIdx]
                            + (UnicodeString)", got " + b);
        }

        // After the offset passes the next boundary, that boundary becomes
        // the expected answer for later offsets.
        if (boundaries[expectedIdx + 1] == i) {
            ++expectedIdx;
        }
        ++i;
    }
}

1387

1388

/**
 * Calls isBoundary() for every offset from 0 up to (but not including) the
 * text length and checks that it is true exactly at the offsets listed in
 * the boundary table.
 *
 * @param bi          the iterator under test
 * @param text        the text the iterator is set to
 * @param boundaries  boundary table; scanning starts at index 1
 */
void IntlTestTextBoundary::testIsBoundary(BreakIterator& bi, UnicodeString& text, int32_t *boundaries) {
    logln("testIsBoundary():");

    const int32_t textLen = text.length();
    int nextIdx = 1;   // index of the next boundary not yet reached

    int i = 0;
    while (i < textLen) {
        UBool isB = bi.isBoundary(i);
        logln((UnicodeString)"bi.isBoundary(" + i + ") -> " + isB);

        if (boundaries[nextIdx] != i) {
            // Not a listed boundary: isBoundary() must say no.
            if (isB) {
                errln((UnicodeString)"Wrong result from isBoundary() for " + i + (UnicodeString)": expected false, got true");
            }
        }
        else {
            // A listed boundary: isBoundary() must say yes.
            if (!isB) {
                errln((UnicodeString)"Wrong result from isBoundary() for " + i + (UnicodeString)": expected true, got false");
            }
            nextIdx++;
        }
        ++i;
    }
}

1408

1409

void IntlTestTextBoundary::doMultipleSelectionTest(BreakIterator& iterator,

1410

UnicodeString& testText)

1411

{

1412

iterator.setText(testText);

1413

1414

BreakIterator* testIterator = iterator.clone();

1415

int32_t offset = iterator.first();

1416

int32_t testOffset;

1417

int32_t count = 0;

1418

1419

logln("doMultipleSelectionTest text of length: %d", testText.length());

1420

1421

if (*testIterator != iterator)

1422

errln("clone() or operator!= failed: two clones compared unequal");

1423

1424

do {

1425

testOffset = testIterator->first();

1426

testOffset = testIterator->next(count);

1427

if (offset != testOffset)

1428

errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);

1429

1430

if (offset != BreakIterator::DONE) {

1431

count++;

1432

offset = iterator.next();

1433

1434

if (offset != BreakIterator::DONE && *testIterator == iterator)

1435

errln("operator== failed: Two unequal iterators compared equal.");

1436

}

1437

} while (offset != BreakIterator::DONE);

1438

1439

// now do it backwards...

1440

offset = iterator.last();

1441

count = 0;

1442

1443

do {

1444

testOffset = testIterator->last();

1445

testOffset = testIterator->next(count);

1446

if (offset != testOffset)

1447

errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);

1448

1449

if (offset != BreakIterator::DONE) {

1450

count--;

1451

offset = iterator.previous();

1452

}

1453

} while (offset != BreakIterator::DONE);

1454

delete testIterator;

1455

}

1456

1457

/**
 * Invariant check: a break should always occur after CR (unless followed by
 * LF), LF, PS, and LS.
 *
 * Builds every three-character string "X B Y" with B one of the break
 * characters and X, Y drawn from testChars, and expects offset 2 among the
 * iterator's boundaries. Gives up after 75 failures.
 *
 * @param tb         the iterator under test
 * @param testChars  the characters to combine with each break character
 */
void IntlTestTextBoundary::doBreakInvariantTest(BreakIterator& tb, UnicodeString& testChars)
{
    UnicodeString work("aaa");
    int32_t errCount = 0;
    const int32_t testCharsLen = testChars.length();

    UnicodeString breaks = CharsToUnicodeString("\r\n\\u2029\\u2028");
    const int32_t breaksLen = breaks.length();

    for (int32_t b = 0; b < breaksLen; b++) {
        work[1] = breaks[b];

        for (int32_t lead = 0; lead < testCharsLen; lead++) {
            work[0] = testChars[lead];

            for (int trail = 0; trail < testCharsLen; trail++) {
                UChar c = testChars[trail];

                // if a cr is followed by lf, ps, ls or etx, don't do the check (that's
                // not supposed to work)
                if (work[1] == '\r') {
                    if (c == '\n' || c == 0x2029 || c == 0x2028 || c == 0x0003) {
                        continue;
                    }
                }

                work[2] = c;
                tb.setText(work);

                // Walk the boundaries until offset 2 shows up or they run out.
                UBool seen2 = FALSE;
                int pos = tb.first();
                while (pos != BreakIterator::DONE) {
                    if (pos == 2) {
                        seen2 = TRUE;
                        break;
                    }
                    pos = tb.next();
                }

                if (seen2) {
                    continue;
                }

                errln("No break between U+" + UCharToUnicodeString(work[1])
                            + " and U+" + UCharToUnicodeString(work[2]));
                errCount++;
                if (errCount >= 75) {
                    return;
                }
            }
        }
    }
}

1500

1501

/**
 * Two invariant checks, each abandoned after 75 accumulated failures:
 *  1. a break should never occur between CR and LF;
 *  2. a break should never occur before a non-spacing or enclosing mark,
 *     unless the preceding character is CR, LF, PS, LS (or U+0003).
 *
 * @param tb         the iterator under test
 * @param testChars  the characters to place around the probed position
 */
void IntlTestTextBoundary::doOtherInvariantTest(BreakIterator& tb, UnicodeString& testChars)
{
    UnicodeString work("a\r\na");
    int32_t errCount = 0;
    const int32_t testCharsLen = testChars.length();

    // a break should never occur between CR and LF
    for (int32_t first = 0; first < testCharsLen; first++) {
        work[0] = testChars[first];
        for (int32_t last = 0; last < testCharsLen; last++) {
            work[3] = testChars[last];
            tb.setText(work);

            int32_t pos = tb.first();
            while (pos != BreakIterator::DONE) {
                if (pos == 2) {
                    errln("Break between CR and LF in string U+" + UCharToUnicodeString(work[0]) +
                        ", U+d U+a U+" + UCharToUnicodeString(work[3]));
                    errCount++;
                    if (errCount >= 75) {
                        return;
                    }
                }
                pos = tb.next();
            }
        }
    }

    // a break should never occur before a non-spacing mark, unless the preceding
    // character is CR, LF, PS, or LS
    work.remove();
    work += "aaaa";
    for (int32_t base = 0; base < testCharsLen; base++) {
        UChar c = testChars[base];
        if (c == '\n' || c == '\r' || c == 0x2029 || c == 0x2028 || c == 0x0003) {
            continue;
        }
        work[1] = c;

        for (int32_t mark = 0; mark < testCharsLen; mark++) {
            c = testChars[mark];

            // Only non-spacing and enclosing marks are probed in slot 2.
            const int8_t type = Unicode::getType(c);
            if (!(type == Unicode::NON_SPACING_MARK || type == Unicode::ENCLOSING_MARK)) {
                continue;
            }

            work[2] = c;
            tb.setText(work);

            int pos = tb.first();
            while (pos != BreakIterator::DONE) {
                if (pos == 2) {
                    errln("Break between U+" + UCharToUnicodeString(work[1])
                            + " and U+" + UCharToUnicodeString(work[2]));
                    errCount++;
                    if (errCount >= 75) {
                        return;
                    }
                }
                pos = tb.next();
            }
        }
    }
}

1553

1554

void IntlTestTextBoundary::sample(BreakIterator& tb,

1555

UnicodeString& text,

1556

UnicodeString& title)

1557

{

1558

UnicodeString substring;

1559

UBool verboseWas = verbose;

1560

verbose = TRUE;

1561

logln("-------------------------"+title+" length = "+text.length());

1562

tb.setText(text);

1563

int32_t start = tb.first();

1564

int32_t end;

1565

for (end = tb.next(); end != BreakIterator::DONE; end = tb.next()) {

1566

text.extractBetween(start, end, substring);

1567

logln(UnicodeString("[")+start+","+end+"] \""+substring+"\"");

1568

start = end;

1569

}

1570

verbose = verboseWas;

1571

}

1572

1573

1574

Older »