~ubuntu-branches/ubuntu/vivid/icu4j-4.4/vivid

UnicodeSet extraSamples = new UnicodeSet("[\uCE20{\uAD6C\uB514}{\uAD73\uC774}{\uBB34\uB837}{\uBB3C\uC5FF}{\uC544\uAE4C}{\uC544\uB530}{\uC544\uBE60}{\uC544\uC2F8}{\uC544\uC9DC}{\uC544\uCC28}{\uC545\uC0AC}{\uC545\uC2F8}{\uC546\uCE74}{\uC548\uAC00}{\uC548\uC790}{\uC548\uC9DC}{\uC548\uD558}{\uC54C\uAC00}{\uC54C\uB530}{\uC54C\uB9C8}{\uC54C\uBC14}{\uC54C\uBE60}{\uC54C\uC0AC}{\uC54C\uC2F8}{\uC54C\uD0C0}{\uC54C\uD30C}{\uC54C\uD558}{\uC555\uC0AC}{\uC555\uC2F8}{\uC558\uC0AC}{\uC5C5\uC12F\uC501}{\uC5C6\uC5C8\uC2B5}]");

194

UnicodeSet sourceSet = new UnicodeSet();

195

addRepresentativeHangul(sourceSet, 2, false);

196

addRepresentativeHangul(sourceSet, 3, false);

197

addRepresentativeHangul(sourceSet, 2, true);

198

addRepresentativeHangul(sourceSet, 3, true);

199

// add the boundary cases; we want an example of each case of V + L and one example of each case of T+L

200

201

UnicodeSet more = getRepresentativeBoundaryHangul();

202

sourceSet.addAll(more);

203

sourceSet.addAll(extraSamples);

204

return sourceSet;

205

}

206

207

private static UnicodeSet getRepresentativeBoundaryHangul() {

208

UnicodeSet resultToAddTo = new UnicodeSet();

209

// U+1100 HANGUL CHOSEONG KIYEOK

210

// U+1161 HANGUL JUNGSEONG A

211

UnicodeSet L = new UnicodeSet("[:hst=L:]");

212

UnicodeSet V = new UnicodeSet("[:hst=V:]");

213

UnicodeSet T = new UnicodeSet("[:hst=T:]");

214

215

String prefixLV = "\u1100\u1161";

216

String prefixL = "\u1100";

217

String suffixV = "\u1161";

218

String nullL = "\u110B"; // HANGUL CHOSEONG IEUNG

219

220

UnicodeSet L0 = new UnicodeSet("[\u1100\u110B]");

221

222

// do all combinations of L0 + V + nullL + V

223

224

for (UnicodeSetIterator iL0 = new UnicodeSetIterator(L0); iL0.next();) {

225

for (UnicodeSetIterator iV = new UnicodeSetIterator(V); iV.next();) {

226

for (UnicodeSetIterator iV2 = new UnicodeSetIterator(V); iV2.next();) {

227

String sample = iL0.getString() + iV.getString() + nullL + iV2.getString();

228

String trial = Normalizer.compose(sample, false);

229

if (trial.length() == 2) {

230

resultToAddTo.add(trial);

231

}

232

}

233

}

234

}

235

236

for (UnicodeSetIterator iL = new UnicodeSetIterator(L); iL.next();) {

237

// do all combinations of "g" + V + L + "a"

238

final String suffix = iL.getString() + suffixV;

239

for (UnicodeSetIterator iV = new UnicodeSetIterator(V); iV.next();) {

240

String sample = prefixL + iV.getString() + suffix;

241

String trial = Normalizer.compose(sample, false);

242

if (trial.length() == 2) {

243

resultToAddTo.add(trial);

244

}

245

}

246

// do all combinations of "ga" + T + L + "a"

247

for (UnicodeSetIterator iT = new UnicodeSetIterator(T); iT.next();) {

248

String sample = prefixLV + iT.getString() + suffix;

249

String trial = Normalizer.compose(sample, false);

250

if (trial.length() == 2) {

251

resultToAddTo.add(trial);

252

}

253

}

254

}

255

return resultToAddTo;

256

}

257

258

private static void addRepresentativeHangul(UnicodeSet resultToAddTo, int leng, boolean noFirstConsonant) {

259

UnicodeSet notYetSeen = new UnicodeSet();

260

for (char c = '\uAC00'; c < '\uD7AF'; ++c) {

261

String charStr = String.valueOf(c);

262

String decomp = Normalizer.decompose(charStr, false);

263

if (decomp.length() != leng) {

264

continue; // only take one length at a time

265

}

266

if (decomp.startsWith("\u110B ") != noFirstConsonant) {

267

continue;

268

}

269

if (!notYetSeen.containsAll(decomp)) {

270

resultToAddTo.add(c);

271

notYetSeen.addAll(decomp);

272

}

273

}

274

}

275

276

277

public void TestHan() throws UnsupportedEncodingException, FileNotFoundException {

278

try{

279

UnicodeSet exemplars = LocaleData.getExemplarSet(new ULocale("zh"),0);

280

// create string with all chars

281

StringBuffer b = new StringBuffer();

282

for (UnicodeSetIterator it = new UnicodeSetIterator(exemplars); it.next();) {

283

UTF16.append(b,it.codepoint);

284

}

285

String source = b.toString();

286

// transform with Han translit

287

Transliterator han = Transliterator.getInstance("Han-Latin");

288

String target = han.transliterate(source);

289

// now verify that there are no Han characters left

290

UnicodeSet allHan = new UnicodeSet("[:han:]");

291

assertFalse("No Han must be left after Han-Latin transliteration",allHan.containsSome(target));

292

// check the pinyin translit

293

Transliterator pn = Transliterator.getInstance("Latin-NumericPinyin");

294

String target2 = pn.transliterate(target);

295

// verify that there are no marks

296

Transliterator nfc = Transliterator.getInstance("nfc");

297

String nfced = nfc.transliterate(target2);

298

UnicodeSet allMarks = new UnicodeSet("[:mark:]");

299

assertFalse("NumericPinyin must contain no marks", allMarks.containsSome(nfced));

300

// verify roundtrip

301

Transliterator np = pn.getInverse();

302

String target3 = np.transliterate(target);

303

boolean roundtripOK = target3.equals(target);

304

assertTrue("NumericPinyin must roundtrip", roundtripOK);

305

if (!roundtripOK) {

306

String filename = "numeric-pinyin.log.txt";

307

PrintWriter out = new PrintWriter(

308

new BufferedWriter(

309

new OutputStreamWriter(

310

new FileOutputStream(filename), "UTF8"), 4*1024));

311

errln("Creating log file " + new File(filename).getAbsoluteFile());

312

out.println("Pinyin: " + target);

313

out.println("Pinyin-Numeric-Pinyin: " + target2);

314

out.close();

315

}

316

}catch(MissingResourceException ex){

317

warnln("Could not load the locale data for fetching the exemplar characters.");

318

}

319

}

320

321

public void TestSingle() {

322

Transliterator t = Transliterator.getInstance("Latin-Greek");

323

t.transliterate("\u0061\u0101\u0069");

324

}

325

326

String getGreekSet() {

327

// Time bomb

328

if (skipIfBeforeICU(4,5,0)) {

329

// We temporarily filter against Unicode 4.1, but we only do this

330

// before version 3.5.

331

logln("TestGreek needs to be updated to remove delete the section marked [:Age=4.0:] filter");

332

} else {

333

errln("TestGreek needs to be updated to remove delete the [:Age=4.0:] filter ");

334

}

335

return

336

// isICU28() ? "[[\u003B\u00B7[:Greek:]-[\u03D7-\u03EF]]&[:Age=3.2:]]" :

337

"[\u003B\u00B7[[:Greek:]&[:Letter:]]-[" +

338

"\u1D26-\u1D2A" + // L& [5] GREEK LETTER SMALL CAPITAL GAMMA..GREEK LETTER SMALL CAPITAL PSI

339

"\u1D5D-\u1D61" + // Lm [5] MODIFIER LETTER SMALL BETA..MODIFIER LETTER SMALL CHI

340

"\u1D66-\u1D6A" + // L& [5] GREEK SUBSCRIPT SMALL LETTER BETA..GREEK SUBSCRIPT SMALL LETTER CHI

341

"\u03D7-\u03EF" + // \N{GREEK KAI SYMBOL}..\N{COPTIC SMALL LETTER DEI}

342

"] & [:Age=4.0:]]";

343

}

344

345

public void TestGreek() throws IOException {

346

long start = System.currentTimeMillis();

347

new Test("Latin-Greek", 50)

348

.test("[a-zA-Z]", getGreekSet(),

349

"[\u00B5\u037A\u03D0-\u03F5\u03F9]", /* roundtrip exclusions */

350

this, new LegalGreek(true));

351

showElapsed(start, "TestGreek");

352

}

353

354

public void TestGreekUNGEGN() throws IOException {

355

long start = System.currentTimeMillis();

356

new Test("Latin-Greek/UNGEGN")

357

.test("[a-zA-Z]", getGreekSet(),

358

"[\u00B5\u037A\u03D0-\uFFFF{\u039C\u03C0}]", /* roundtrip exclusions */

359

this, new LegalGreek(false));

360

showElapsed(start, "TestGreekUNGEGN");

361

}

362

363

public void Testel() throws IOException {

364

long start = System.currentTimeMillis();

365

new Test("Latin-el")

366

.test("[a-zA-Z]", getGreekSet(),

367

"[\u00B5\u037A\u03D0-\uFFFF{\u039C\u03C0}]", /* roundtrip exclusions */

368

this, new LegalGreek(false));

369

showElapsed(start, "Testel");

370

}

371

372

public void TestCyrillic() throws IOException {

373

long start = System.currentTimeMillis();

374

new Test("Latin-Cyrillic")

375

.test("[a-zA-Z\u0110\u0111\u02BA\u02B9]", "[\u0400-\u045F]", null, this, new Legal());

376

showElapsed(start, "TestCyrillic");

377

}

378

379

/** UnicodeSet pattern of Arabic-block code points; passed as the second set argument by TestArabic. */
static final String ARABIC = "[\u06A9\u060C\u061B\u061F\u0621\u0627-\u063A\u0641-\u0655\u0660-\u066C\u067E\u0686\u0698\u06A4\u06AD\u06AF\u06CB-\u06CC\u06F0-\u06F9]";

380

381

public void TestArabic() throws IOException {

382

long start = System.currentTimeMillis();

383

new Test("Latin-Arabic")

384

.test("[a-zA-Z\u02BE\u02BF]", ARABIC, "[a-zA-Z\u02BE\u02BF\u207F]", null, this, new Legal()); //

385

showElapsed(start, "TestArabic");

386

}

387

388

public void TestHebrew() throws IOException {

389

// Time bomb

390

if (skipIfBeforeICU(4,5,0)) {

391

// We temporarily filter against Unicode 4.1, but we only do this

392

// before version 3.5.

393

logln("TestHebrew needs to be updated to remove delete the section marked [:Age=4.0:] filter");

394

} else {

395

errln("TestHebrew needs to be updated to remove delete the [:Age=4.0:] filter ");

396

}

397

long start = System.currentTimeMillis();

398

new Test("Latin-Hebrew")

399

.test("[a-zA-Z\u02BC\u02BB]", "[[[:hebrew:]-[\u05BD\uFB00-\uFBFF]]& [:Age=4.0:]]", "[\u05F0\u05F1\u05F2]", this, new LegalHebrew());

400

showElapsed(start, "TestHebrew");

401

}

402

403

public void TestThai() throws IOException {

404

long start = System.currentTimeMillis();

405

if(skipIfBeforeICU(4,5,0)){

406

new Test("Latin-Thai")

407

.test("[a-zA-Z\u0142\u1ECD\u00E6\u0131\u0268\u02CC]",

408

"[\u0E01-\u0E3A\u0E40-\u0E5B]",

409

"[a-zA-Z\u0142\u1ECD\u00E6\u0131\u0268\u02B9\u02CC]",

410

"[\u0E4F]", this, new LegalThai());

411

}else{

412

new Test("Latin-Thai")

413

.test("[a-zA-Z\u0142\u1ECD\u00E6\u0131\u0268\u02CC]",

414

"[\u0E01-\u0E3A\u0E40-\u0E5B]",

415

"[a-zA-Z\u0142\u1ECD\u00E6\u0131\u0268\u02B9\u02CC]",

416

null, this, new LegalThai());

417

}

418

419

showElapsed(start, "TestThai");

420

}

421

422

//----------------------------------

423

// Inter-Indic Tests

424

//----------------------------------

425

public static class LegalIndic extends Legal{

426

UnicodeSet vowelSignSet = new UnicodeSet();

427

428

public LegalIndic(){

429

vowelSignSet.addAll(new UnicodeSet("[\u0901\u0902\u0903\u0904\u093e-\u094c\u0962\u0963]")); /* Devanagari */

430

vowelSignSet.addAll(new UnicodeSet("[\u0981\u0982\u0983\u09be-\u09cc\u09e2\u09e3\u09D7]")); /* Bengali */

431

vowelSignSet.addAll(new UnicodeSet("[\u0a01\u0a02\u0a03\u0a3e-\u0a4c\u0a62\u0a63\u0a70\u0a71]")); /* Gurmukhi */

432

vowelSignSet.addAll(new UnicodeSet("[\u0a81\u0a82\u0a83\u0abe-\u0acc\u0ae2\u0ae3]")); /* Gujarati */

433

vowelSignSet.addAll(new UnicodeSet("[\u0b01\u0b02\u0b03\u0b3e-\u0b4c\u0b62\u0b63\u0b56\u0b57]")); /* Oriya */

434

vowelSignSet.addAll(new UnicodeSet("[\u0b81\u0b82\u0b83\u0bbe-\u0bcc\u0be2\u0be3\u0bd7]")); /* Tamil */

435

vowelSignSet.addAll(new UnicodeSet("[\u0c01\u0c02\u0c03\u0c3e-\u0c4c\u0c62\u0c63\u0c55\u0c56]")); /* Telugu */

436

vowelSignSet.addAll(new UnicodeSet("[\u0c81\u0c82\u0c83\u0cbe-\u0ccc\u0ce2\u0ce3\u0cd5\u0cd6]")); /* Kannada */

437

vowelSignSet.addAll(new UnicodeSet("[\u0d01\u0d02\u0d03\u0d3e-\u0d4c\u0d62\u0d63\u0d57]")); /* Malayalam */

438

}

439

440

String avagraha = "\u093d\u09bd\u0abd\u0b3d\u0cbd";

441

String nukta = "\u093c\u09bc\u0a3c\u0abc\u0b3c\u0cbc";

442

String virama = "\u094d\u09cd\u0a4d\u0acd\u0b4d\u0bcd\u0c4d\u0ccd\u0d4d";

443

String sanskritStressSigns = "\u0951\u0952\u0953\u0954\u097d";

444

String chandrabindu = "\u0901\u0981\u0A81\u0b01\u0c01";

445

public boolean is(String sourceString){

446

int cp=sourceString.charAt(0);

447

448

// A vowel sign cannot be the first char

449

if(vowelSignSet.contains(cp)){

450

return false;

451

}else if(avagraha.indexOf(cp)!=-1){

452

return false;

453

}else if(virama.indexOf(cp)!=-1){

454

return false;

455

}else if(nukta.indexOf(cp)!=-1){

456

return false;

457

}else if(sanskritStressSigns.indexOf(cp)!=-1){

458

return false;

459

}else if((chandrabindu.indexOf(cp)!=-1) &&

460

(sourceString.length() >1 &&

461

vowelSignSet.contains(sourceString.charAt(1)))){

462

return false;

463

}

464

return true;

465

}

466

}

467

/**
 * UnicodeSet pattern for the Latin side of the Latin/Indic tests: an explicit list
 * of characters and ranges, minus a private-use range and two letters, intersected
 * with Latin-script characters and marks. Passed as the first set argument by
 * TestDevanagariLatin and used as the first set of the "Latin-*" rows of interIndicArray.
 */
static String latinForIndic = "[['.0-9A-Za-z~\u00C0-\u00C5\u00C7-\u00CF\u00D1-\u00D6\u00D9-\u00DD"+

468

"\u00E0-\u00E5\u00E7-\u00EF\u00F1-\u00F6\u00F9-\u00FD\u00FF-\u010F"+

469

"\u0112-\u0125\u0128-\u0130\u0134-\u0137\u0139-\u013E\u0143-\u0148"+

470

"\u014C-\u0151\u0154-\u0165\u0168-\u017E\u01A0-\u01A1\u01AF-\u01B0"+

471

"\u01CD-\u01DC\u01DE-\u01E3\u01E6-\u01ED\u01F0\u01F4-\u01F5\u01F8-\u01FB"+

472

"\u0200-\u021B\u021E-\u021F\u0226-\u0233\u0294\u0303-\u0304\u0306\u0314-\u0315"+

473

"\u0325\u040E\u0419\u0439\u045E\u04C1-\u04C2\u04D0-\u04D1\u04D6-\u04D7"+

474

"\u04E2-\u04E3\u04EE-\u04EF\u1E00-\u1E99\u1EA0-\u1EF9\u1F01\u1F03\u1F05"+

475

"\u1F07\u1F09\u1F0B\u1F0D\u1F0F\u1F11\u1F13\u1F15\u1F19\u1F1B\u1F1D\u1F21"+

476

"\u1F23\u1F25\u1F27\u1F29\u1F2B\u1F2D\u1F2F\u1F31\u1F33\u1F35\u1F37\u1F39"+

477

"\u1F3B\u1F3D\u1F3F\u1F41\u1F43\u1F45\u1F49\u1F4B\u1F4D\u1F51\u1F53\u1F55"+

478

"\u1F57\u1F59\u1F5B\u1F5D\u1F5F\u1F61\u1F63\u1F65\u1F67\u1F69\u1F6B\u1F6D"+

479

"\u1F6F\u1F81\u1F83\u1F85\u1F87\u1F89\u1F8B\u1F8D\u1F8F\u1F91\u1F93\u1F95"+

480

"\u1F97\u1F99\u1F9B\u1F9D\u1F9F\u1FA1\u1FA3\u1FA5\u1FA7\u1FA9\u1FAB\u1FAD"+

481

"\u1FAF-\u1FB1\u1FB8-\u1FB9\u1FD0-\u1FD1\u1FD8-\u1FD9\u1FE0-\u1FE1\u1FE5"+

482

"\u1FE8-\u1FE9\u1FEC\u212A-\u212B\uE04D\uE064]"+

483

"-[\uE000-\uE080 \u01E2\u01E3]& [[:latin:][:mark:]]]";

484

485

public void TestDevanagariLatin() throws IOException {

486

long start = System.currentTimeMillis();

487

if (skipIfBeforeICU(4,5,0)) {

488

logln("Warning: TestDevanagariLatin needs to be updated to remove delete the section marked [:Age=4.1:] filter");

489

} else {

490

// We temporarily filter against Unicode 4.1, but we only do this

491

// before version 3.4.

492

errln("FAIL: TestDevanagariLatin needs to be updated to remove delete the [:Age=4.1:] filter ");

493

return;

494

}

495

new Test("Latin-DEVANAGARI", 50)

496

.test(latinForIndic, "[[[:Devanagari:][\u094d][\u0964\u0965]]&[:Age=4.1:]]", "[\u0965\u0904]", this, new LegalIndic());

497

showElapsed(start, "TestDevanagariLatin");

498

}

499

500

private static final String [][] interIndicArray= new String[][]{

501

new String [] { "BENGALI-DEVANAGARI",

502

"[:BENGALI:]", "[:Devanagari:]",

503

"[\u0904\u0951-\u0954\u0943-\u0949\u094a\u0962\u0963\u090D\u090e\u0911\u0912\u0929\u0933\u0934\u0935\u0950\u0958\u0959\u095a\u095b\u095e\u097d]", /*roundtrip exclusions*/

504

505

new String [] { "DEVANAGARI-BENGALI",

506

"[:Devanagari:]", "[:BENGALI:]",

507

"[\u09D7\u090D\u090e\u0911\u0912\u0929\u0933\u0934\u0935\u0950\u0958\u0959\u095a\u095b\u095e\u09f0\u09f1\u09f2-\u09fa\u09ce]", /*roundtrip exclusions*/

508

509

510

new String [] { "GURMUKHI-DEVANAGARI",

511

"[:GURMUKHI:]", "[:Devanagari:]",

512

"[\u0904\u0902\u0936\u0933\u0951-\u0954\u0902\u0903\u0943-\u0949\u094a\u0962\u0963\u090B\u090C\u090D\u090e\u0911\u0912\u0934\u0937\u093D\u0950\u0960\u0961\u097d]", /*roundtrip exclusions*/

513

514

new String [] { "DEVANAGARI-GURMUKHI",

515

"[:Devanagari:]", "[:GURMUKHI:]",

516

"[\u0A02\u0946\u0A5C\u0951-\u0954\u0A70\u0A71\u090B\u090C\u090D\u090e\u0911\u0912\u0934\u0937\u093D\u0950\u0960\u0961\u0a72\u0a73\u0a74]", /*roundtrip exclusions*/

517

518

519

new String [] { "GUJARATI-DEVANAGARI",

520

"[:GUJARATI:]", "[:Devanagari:]",

521

"[\u0904\u0946\u094A\u0962\u0963\u0951-\u0954\u0961\u090c\u090e\u0912\u097d]", /*roundtrip exclusions*/

522

523

new String [] { "DEVANAGARI-GUJARATI",

524

"[:Devanagari:]", "[:GUJARATI:]",

525

"[\u0951-\u0954\u0961\u090c\u090e\u0912]", /*roundtrip exclusions*/

526

527

528

new String [] { "ORIYA-DEVANAGARI",

529

"[:ORIYA:]", "[:Devanagari:]",

530

"[\u0904\u0912\u0911\u090D\u090e\u0931\u0943-\u094a\u0962\u0963\u0951-\u0954\u0950\u097d]", /*roundtrip exclusions*/

531

532

new String [] { "DEVANAGARI-ORIYA",

533

"[:Devanagari:]", "[:ORIYA:]",

534

"[\u0b5f\u0b56\u0b57\u0b70\u0b71\u0950\u090D\u090e\u0912\u0911\u0931]", /*roundtrip exclusions*/

535

536

537

new String [] { "Tamil-DEVANAGARI",

538

"[:tamil:]", "[:Devanagari:]",

539

"[\u0901\u0904\u093c\u0943-\u094a\u0951-\u0954\u0962\u0963\u090B\u090C\u090D\u0911\u0916\u0917\u0918\u091B\u091D\u0920\u0921\u0922\u0925\u0926\u0927\u092B\u092C\u092D\u0936\u093d\u0950[\u0958-\u0961]\u097d]", /*roundtrip exclusions*/

540

541

new String [] { "DEVANAGARI-Tamil",

542

"[:Devanagari:]", "[:tamil:]",

543

"[\u0bd7\u0BF0\u0BF1\u0BF2]", /*roundtrip exclusions*/

544

545

546

new String [] { "Telugu-DEVANAGARI",

547

"[:telugu:]", "[:Devanagari:]",

548

"[\u0904\u093c\u0950\u0945\u0949\u0951-\u0954\u0962\u0963\u090D\u0911\u093d\u0929\u0934[\u0958-\u095f]\u097d]", /*roundtrip exclusions*/

549

550

new String [] { "DEVANAGARI-TELUGU",

551

"[:Devanagari:]", "[:TELUGU:]",

552

"[\u0c55\u0c56\u0950\u090D\u0911\u093d\u0929\u0934[\u0958-\u095f]]", /*roundtrip exclusions*/

553

554

555

new String [] { "KANNADA-DEVANAGARI",

556

"[:KANNADA:]", "[:Devanagari:]",

557

"[\u0901\u0904\u0946\u0950\u0945\u0949\u0951-\u0954\u0962\u0963\u0950\u090D\u0911\u093d\u0929\u0934[\u0958-\u095f]\u097d]", /*roundtrip exclusions*/

558

559

new String [] { "DEVANAGARI-KANNADA",

560

"[:Devanagari:]", "[:KANNADA:]",

561

"[{\u0cb0\u0cbc}{\u0cb3\u0cbc}\u0cde\u0cd5\u0cd6\u0950\u090D\u0911\u093d\u0929\u0934[\u0958-\u095f]]", /*roundtrip exclusions*/

562

563

564

new String [] { "MALAYALAM-DEVANAGARI",

565

"[:MALAYALAM:]", "[:Devanagari:]",

566

"[\u0901\u0904\u094a\u094b\u094c\u093c\u0950\u0944\u0945\u0949\u0951-\u0954\u0962\u0963\u090D\u0911\u093d\u0929\u0934[\u0958-\u095f]\u097d]", /*roundtrip exclusions*/

567

568

new String [] { "DEVANAGARI-MALAYALAM",

569

"[:Devanagari:]", "[:MALAYALAM:]",

570

"[\u0d4c\u0d57\u0950\u090D\u0911\u093d\u0929\u0934[\u0958-\u095f]]", /*roundtrip exclusions*/

571

572

573

new String [] { "GURMUKHI-BENGALI",

574

"[:GURMUKHI:]", "[:BENGALI:]",

575

"[\u0982\u09b6\u09e2\u09e3\u09c3\u09c4\u09d7\u098B\u098C\u09B7\u09E0\u09E1\u09F0\u09F1\u09f2-\u09fa\u09ce]", /*roundtrip exclusions*/

576

577

new String [] { "BENGALI-GURMUKHI",

578

"[:BENGALI:]", "[:GURMUKHI:]",

579

"[\u0A02\u0a5c\u0a47\u0a70\u0a71\u0A33\u0A35\u0A59\u0A5A\u0A5B\u0A5E\u0A72\u0A73\u0A74]", /*roundtrip exclusions*/

580

581

582

new String [] { "GUJARATI-BENGALI",

583

"[:GUJARATI:]", "[:BENGALI:]",

584

"[\u09d7\u09e2\u09e3\u098c\u09e1\u09f0\u09f1\u09f2-\u09fa\u09ce]", /*roundtrip exclusions*/

585

586

new String [] { "BENGALI-GUJARATI",

587

"[:BENGALI:]", "[:GUJARATI:]",

588

"[\u0A82\u0a83\u0Ac9\u0Ac5\u0ac7\u0A8D\u0A91\u0AB3\u0AB5\u0ABD\u0AD0]", /*roundtrip exclusions*/

589

590

591

new String [] { "ORIYA-BENGALI",

592

"[:ORIYA:]", "[:BENGALI:]",

593

"[\u09c4\u09e2\u09e3\u09f0\u09f1\u09f2-\u09fa\u09ce]", /*roundtrip exclusions*/

594

595

new String [] { "BENGALI-ORIYA",

596

"[:BENGALI:]", "[:ORIYA:]",

597

"[\u0b35\u0b71\u0b5f\u0b56\u0b33\u0b3d]", /*roundtrip exclusions*/

598

599

600

new String [] { "Tamil-BENGALI",

601

"[:tamil:]", "[:BENGALI:]",

602

"[\u0981\u09bc\u09c3\u09c4\u09e2\u09e3\u09f0\u09f1\u098B\u098C\u0996\u0997\u0998\u099B\u099D\u09A0\u09A1\u09A2\u09A5\u09A6\u09A7\u09AB\u09AC\u09AD\u09B6\u09DC\u09DD\u09DF\u09E0\u09E1\u09f2-\u09fa\u09ce]", /*roundtrip exclusions*/

603

604

new String [] { "BENGALI-Tamil",

605

"[:BENGALI:]", "[:tamil:]",

606

"[\u0bc6\u0bc7\u0bca\u0B8E\u0B92\u0BA9\u0BB1\u0BB3\u0BB4\u0BB5\u0BF0\u0BF1\u0BF2]", /*roundtrip exclusions*/

607

608

609

new String [] { "Telugu-BENGALI",

610

"[:telugu:]", "[:BENGALI:]",

611

"[\u09e2\u09e3\u09bc\u09d7\u09f0\u09f1\u09dc\u09dd\u09df\u09f2-\u09fa\u09ce]", /*roundtrip exclusions*/

612

613

new String [] { "BENGALI-TELUGU",

614

"[:BENGALI:]", "[:TELUGU:]",

615

"[\u0c55\u0c56\u0c47\u0c46\u0c4a\u0C0E\u0C12\u0C31\u0C33\u0C35]", /*roundtrip exclusions*/

616

617

618

new String [] { "KANNADA-BENGALI",

619

"[:KANNADA:]", "[:BENGALI:]",

620

"[\u0981\u09e2\u09e3\u09bc\u09d7\u09f0\u09f1\u09dc\u09dd\u09df\u09f2-\u09fa\u09ce]", /*roundtrip exclusions*/

621

622

new String [] { "BENGALI-KANNADA",

623

"[:BENGALI:]", "[:KANNADA:]",

624

"[{\u0cb0\u0cbc}{\u0cb3\u0cbc}\u0cc6\u0cca\u0cd5\u0cd6\u0cc7\u0C8E\u0C92\u0CB1\u0cb3\u0cb5\u0cde]", /*roundtrip exclusions*/

625

626

627

new String [] { "MALAYALAM-BENGALI",

628

"[:MALAYALAM:]", "[:BENGALI:]",

629

"[\u0981\u09e2\u09e3\u09bc\u09c4\u09f0\u09f1\u09dc\u09dd\u09df\u09f2-\u09fa\u09ce]", /*roundtrip exclusions*/

630

631

new String [] { "BENGALI-MALAYALAM",

632

"[:BENGALI:]", "[:MALAYALAM:]",

633

"[\u0d46\u0d4a\u0d47\u0d31-\u0d35\u0d0e\u0d12]", /*roundtrip exclusions*/

634

635

636

new String [] { "GUJARATI-GURMUKHI",

637

"[:GUJARATI:]", "[:GURMUKHI:]",

638

"[\u0A02\u0ab3\u0ab6\u0A70\u0a71\u0a82\u0a83\u0ac3\u0ac4\u0ac5\u0ac9\u0a5c\u0a72\u0a73\u0a74\u0a8b\u0a8d\u0a91\u0abd]", /*roundtrip exclusions*/

639

640

new String [] { "GURMUKHI-GUJARATI",

641

"[:GURMUKHI:]", "[:GUJARATI:]",

642

"[\u0a5c\u0A70\u0a71\u0a72\u0a73\u0a74\u0a82\u0a83\u0a8b\u0a8c\u0a8d\u0a91\u0ab3\u0ab6\u0ab7\u0abd\u0ac3\u0ac4\u0ac5\u0ac9\u0ad0\u0ae0\u0ae1]", /*roundtrip exclusions*/

643

644

645

new String [] { "ORIYA-GURMUKHI",

646

"[:ORIYA:]", "[:GURMUKHI:]",

647

"[\u0A02\u0a5c\u0a21\u0a47\u0a71\u0b02\u0b03\u0b33\u0b36\u0b43\u0b56\u0b57\u0B0B\u0B0C\u0B37\u0B3D\u0B5F\u0B60\u0B61\u0a35\u0a72\u0a73\u0a74]", /*roundtrip exclusions*/

648

649

new String [] { "GURMUKHI-ORIYA",

650

"[:GURMUKHI:]", "[:ORIYA:]",

651

"[\u0a71\u0b02\u0b03\u0b33\u0b36\u0b43\u0b56\u0b57\u0B0B\u0B0C\u0B37\u0B3D\u0B5F\u0B60\u0B61\u0b70\u0b71]", /*roundtrip exclusions*/

652

653

654

new String [] { "TAMIL-GURMUKHI",

655

"[:TAMIL:]", "[:GURMUKHI:]",

656

"[\u0A01\u0A02\u0a33\u0a36\u0a3c\u0a70\u0a71\u0a47\u0A16\u0A17\u0A18\u0A1B\u0A1D\u0A20\u0A21\u0A22\u0A25\u0A26\u0A27\u0A2B\u0A2C\u0A2D\u0A59\u0A5A\u0A5B\u0A5C\u0A5E\u0A72\u0A73\u0A74]", /*roundtrip exclusions*/

657

658

new String [] { "GURMUKHI-TAMIL",

659

"[:GURMUKHI:]", "[:TAMIL:]",

660

"[\u0b82\u0bc6\u0bca\u0bd7\u0bb7\u0bb3\u0b83\u0B8E\u0B92\u0BA9\u0BB1\u0BB4\u0bb6\u0BF0\u0BF1\u0BF2]", /*roundtrip exclusions*/

661

662

663

new String [] { "TELUGU-GURMUKHI",

664

"[:TELUGU:]", "[:GURMUKHI:]",

665

"[\u0A02\u0a33\u0a36\u0a3c\u0a70\u0a71\u0A59\u0A5A\u0A5B\u0A5C\u0A5E\u0A72\u0A73\u0A74]", /*roundtrip exclusions*/

666

667

new String [] { "GURMUKHI-TELUGU",

668

"[:GURMUKHI:]", "[:TELUGU:]",

669

"[\u0c02\u0c03\u0c33\u0c36\u0c44\u0c43\u0c46\u0c4a\u0c56\u0c55\u0C0B\u0C0C\u0C0E\u0C12\u0C31\u0C37\u0C60\u0C61]", /*roundtrip exclusions*/

670

671

new String [] { "KANNADA-GURMUKHI",

672

"[:KANNADA:]", "[:GURMUKHI:]",

673

"[\u0A01\u0A02\u0a33\u0a36\u0a3c\u0a70\u0a71\u0A59\u0A5A\u0A5B\u0A5C\u0A5E\u0A72\u0A73\u0A74]", /*roundtrip exclusions*/

674

675

new String [] { "GURMUKHI-KANNADA",

676

"[:GURMUKHI:]", "[:KANNADA:]",

677

"[{\u0cb0\u0cbc}{\u0cb3\u0cbc}\u0c82\u0c83\u0cb3\u0cb6\u0cc4\u0cc3\u0cc6\u0cca\u0cd5\u0cd6\u0C8B\u0C8C\u0C8E\u0C92\u0CB1\u0CB7\u0cbd\u0CE0\u0CE1\u0cde]", /*roundtrip exclusions*/

678

679

680

new String [] { "MALAYALAM-GURMUKHI",

681

"[:MALAYALAM:]", "[:GURMUKHI:]",

682

"[\u0A01\u0A02\u0a4b\u0a4c\u0a33\u0a36\u0a3c\u0a70\u0a71\u0A59\u0A5A\u0A5B\u0A5C\u0A5E\u0A72\u0A73\u0A74]", /*roundtrip exclusions*/

683

684

new String [] { "GURMUKHI-MALAYALAM",

685

"[:GURMUKHI:]", "[:MALAYALAM:]",

686

"[\u0d02\u0d03\u0d33\u0d36\u0d43\u0d46\u0d4a\u0d4c\u0d57\u0D0B\u0D0C\u0D0E\u0D12\u0D31\u0D34\u0D37\u0D60\u0D61]", /*roundtrip exclusions*/

687

688

689

new String [] { "GUJARATI-ORIYA",

690

"[:GUJARATI:]", "[:ORIYA:]",

691

"[\u0b56\u0b57\u0B0C\u0B5F\u0B61\u0b70\u0b71]", /*roundtrip exclusions*/

692

693

new String [] { "ORIYA-GUJARATI",

694

"[:ORIYA:]", "[:GUJARATI:]",

695

"[\u0Ac4\u0Ac5\u0Ac9\u0Ac7\u0A8D\u0A91\u0AB5\u0Ad0]", /*roundtrip exclusions*/

696

697

698

new String [] { "TAMIL-GUJARATI",

699

"[:TAMIL:]", "[:GUJARATI:]",

700

"[\u0A81\u0a8c\u0abc\u0ac3\u0Ac4\u0Ac5\u0Ac9\u0Ac7\u0A8B\u0A8D\u0A91\u0A96\u0A97\u0A98\u0A9B\u0A9D\u0AA0\u0AA1\u0AA2\u0AA5\u0AA6\u0AA7\u0AAB\u0AAC\u0AAD\u0AB6\u0ABD\u0AD0\u0AE0\u0AE1]", /*roundtrip exclusions*/

701

702

new String [] { "GUJARATI-TAMIL",

703

"[:GUJARATI:]", "[:TAMIL:]",

704

"[\u0Bc6\u0Bca\u0Bd7\u0B8E\u0B92\u0BA9\u0BB1\u0BB4\u0BF0\u0BF1\u0BF2]", /*roundtrip exclusions*/

705

706

707

new String [] { "TELUGU-GUJARATI",

708

"[:TELUGU:]", "[:GUJARATI:]",

709

"[\u0abc\u0Ac5\u0Ac9\u0A8D\u0A91\u0ABD\u0Ad0]", /*roundtrip exclusions*/

710

711

new String [] { "GUJARATI-TELUGU",

712

"[:GUJARATI:]", "[:TELUGU:]",

713

"[\u0c46\u0c4a\u0c55\u0c56\u0C0C\u0C0E\u0C12\u0C31\u0C61]", /*roundtrip exclusions*/

714

715

716

new String [] { "KANNADA-GUJARATI",

717

"[:KANNADA:]", "[:GUJARATI:]",

718

"[\u0A81\u0abc\u0Ac5\u0Ac9\u0A8D\u0A91\u0ABD\u0Ad0]", /*roundtrip exclusions*/

719

720

new String [] { "GUJARATI-KANNADA",

721

"[:GUJARATI:]", "[:KANNADA:]",

722

"[{\u0cb0\u0cbc}{\u0cb3\u0cbc}\u0cc6\u0cca\u0cd5\u0cd6\u0C8C\u0C8E\u0C92\u0CB1\u0CDE\u0CE1]", /*roundtrip exclusions*/

723

724

725

new String [] { "MALAYALAM-GUJARATI",

726

"[:MALAYALAM:]", "[:GUJARATI:]",

727

"[\u0A81\u0ac4\u0acb\u0acc\u0abc\u0Ac5\u0Ac9\u0A8D\u0A91\u0ABD\u0Ad0]", /*roundtrip exclusions*/

728

729

new String [] { "GUJARATI-MALAYALAM",

730

"[:GUJARATI:]", "[:MALAYALAM:]",

731

"[\u0d46\u0d4a\u0d4c\u0d55\u0d57\u0D0C\u0D0E\u0D12\u0D31\u0D34\u0D61]", /*roundtrip exclusions*/

732

733

734

new String [] { "TAMIL-ORIYA",

735

"[:TAMIL:]", "[:ORIYA:]",

736

"[\u0B01\u0b3c\u0b43\u0b56\u0B0B\u0B0C\u0B16\u0B17\u0B18\u0B1B\u0B1D\u0B20\u0B21\u0B22\u0B25\u0B26\u0B27\u0B2B\u0B2C\u0B2D\u0B36\u0B3D\u0B5C\u0B5D\u0B5F\u0B60\u0B61\u0b70\u0b71]", /*roundtrip exclusions*/

737

738

new String [] { "ORIYA-TAMIL",

739

"[:ORIYA:]", "[:TAMIL:]",

740

"[\u0bc6\u0bca\u0bc7\u0B8E\u0B92\u0BA9\u0BB1\u0BB4\u0BB5\u0BF0\u0BF1\u0BF2]", /*roundtrip exclusions*/

741

742

743

new String [] { "TELUGU-ORIYA",

744

"[:TELUGU:]", "[:ORIYA:]",

745

"[\u0b3c\u0b57\u0b56\u0B3D\u0B5C\u0B5D\u0B5F\u0b70\u0b71]", /*roundtrip exclusions*/

746

747

new String [] { "ORIYA-TELUGU",

748

"[:ORIYA:]", "[:TELUGU:]",

749

"[\u0c44\u0c46\u0c4a\u0c55\u0c47\u0C0E\u0C12\u0C31\u0C35]", /*roundtrip exclusions*/

750

751

752

new String [] { "KANNADA-ORIYA",

753

"[:KANNADA:]", "[:ORIYA:]",

754

"[\u0B01\u0b3c\u0b57\u0B3D\u0B5C\u0B5D\u0B5F\u0b70\u0b71]", /*roundtrip exclusions*/

755

756

new String [] { "ORIYA-KANNADA",

757

"[:ORIYA:]", "[:KANNADA:]",

758

"[{\u0cb0\u0cbc}{\u0cb3\u0cbc}\u0cc4\u0cc6\u0cca\u0cd5\u0cc7\u0C8E\u0C92\u0CB1\u0CB5\u0CDE]", /*roundtrip exclusions*/

759

760

761

new String [] { "MALAYALAM-ORIYA",

762

"[:MALAYALAM:]", "[:ORIYA:]",

763

"[\u0B01\u0b3c\u0b56\u0B3D\u0B5C\u0B5D\u0B5F\u0b70\u0b71]", /*roundtrip exclusions*/

764

765

new String [] { "ORIYA-MALAYALAM",

766

"[:ORIYA:]", "[:MALAYALAM:]",

767

"[\u0D47\u0D46\u0D4a\u0D0E\u0D12\u0D31\u0D34\u0D35]", /*roundtrip exclusions*/

768

769

770

new String [] { "TELUGU-TAMIL",

771

"[:TELUGU:]", "[:TAMIL:]",

772

"[\u0bd7\u0ba9\u0bb4\u0BF0\u0BF1\u0BF2\u0BF0\u0BF1\u0BF2]", /*roundtrip exclusions*/

773

774

new String [] { "TAMIL-TELUGU",

775

"[:TAMIL:]", "[:TELUGU:]",

776

"[\u0C01\u0c43\u0c44\u0c46\u0c47\u0c55\u0c56\u0c66\u0C0B\u0C0C\u0C16\u0C17\u0C18\u0C1B\u0C1D\u0C20\u0C21\u0C22\u0C25\u0C26\u0C27\u0C2B\u0C2C\u0C2D\u0C36\u0C60\u0C61]", /*roundtrip exclusions*/

777

778

779

new String [] { "KANNADA-TAMIL",

780

"[:KANNADA:]", "[:TAMIL:]",

781

"[\u0bd7\u0bc6\u0ba9\u0bb4\u0BF0\u0BF1\u0BF2]", /*roundtrip exclusions*/

782

783

new String [] { "TAMIL-KANNADA",

784

"[:TAMIL:]", "[:KANNADA:]",

785

"[\u0cc3\u0cc4\u0cc6\u0cc7\u0cd5\u0cd6\u0C8B\u0C8C\u0C96\u0C97\u0C98\u0C9B\u0C9D\u0CA0\u0CA1\u0CA2\u0CA5\u0CA6\u0CA7\u0CAB\u0CAC\u0CAD\u0CB6\u0cbc\u0cbd\u0CDE\u0CE0\u0CE1]", /*roundtrip exclusions*/

786

787

788

new String [] { "MALAYALAM-TAMIL",

789

"[:MALAYALAM:]", "[:TAMIL:]",

790

"[\u0ba9\u0BF0\u0BF1\u0BF2]", /*roundtrip exclusions*/

791

792

new String [] { "TAMIL-MALAYALAM",

793

"[:TAMIL:]", "[:MALAYALAM:]",

794

"[\u0d43\u0d12\u0D0B\u0D0C\u0D16\u0D17\u0D18\u0D1B\u0D1D\u0D20\u0D21\u0D22\u0D25\u0D26\u0D27\u0D2B\u0D2C\u0D2D\u0D36\u0D60\u0D61]", /*roundtrip exclusions*/

795

796

797

new String [] { "KANNADA-TELUGU",

798

"[:KANNADA:]", "[:TELUGU:]",

799

"[\u0C01\u0c3f\u0c46\u0c48\u0c4a]", /*roundtrip exclusions*/

800

801

new String [] { "TELUGU-KANNADA",

802

"[:TELUGU:]", "[:KANNADA:]",

803

"[\u0cc8\u0cd5\u0cd6\u0CDE\u0cbc\u0cbd]", /*roundtrip exclusions*/

804

805

806

new String [] { "MALAYALAM-TELUGU",

807

"[:MALAYALAM:]", "[:TELUGU:]",

808

"[\u0C01\u0c44\u0c4a\u0c4c\u0c4b\u0c55\u0c56]", /*roundtrip exclusions*/

809

810

new String [] { "TELUGU-MALAYALAM",

811

"[:TELUGU:]", "[:MALAYALAM:]",

812

"[\u0d4c\u0d57\u0D34]", /*roundtrip exclusions*/

813

814

815

new String [] { "MALAYALAM-KANNADA",

816

"[:MALAYALAM:]", "[:KANNADA:]",

817

"[\u0cbc\u0cbd\u0cc4\u0cc6\u0cca\u0ccc\u0ccb\u0cd5\u0cd6\u0cDe]", /*roundtrip exclusions*/

818

819

new String [] { "Latin-Bengali",

820

latinForIndic, "[[:Bengali:][\u0964\u0965]]",

821

"[\u0965\u09f0-\u09fa\u09ce]", /*roundtrip exclusions*/

822

823

new String [] { "Latin-Gurmukhi",

824

latinForIndic, "[[:Gurmukhi:][\u0964\u0965]]",

825

"[\u0a01\u0a02\u0965\u0a72\u0a73\u0a74]", /*roundtrip exclusions*/

826

827

new String [] { "Latin-Gujarati",

828

latinForIndic, "[[:Gujarati:][\u0964\u0965]]",

829

"[\u0965]", /*roundtrip exclusions*/

830

831

new String [] { "Latin-Oriya",

832

latinForIndic, "[[:Oriya:][\u0964\u0965]]",

833

"[\u0965\u0b70]", /*roundtrip exclusions*/

834

835

new String [] { "Latin-Tamil",

836

latinForIndic, "[:Tamil:]",

837

"[\u0BF0\u0BF1\u0BF2]", /*roundtrip exclusions*/

838

839

new String [] { "Latin-Telugu",

840

latinForIndic, "[:Telugu:]",

841

null, /*roundtrip exclusions*/

842

843

new String [] { "Latin-Kannada",

844

latinForIndic, "[:Kannada:]",

845

null, /*roundtrip exclusions*/

846

847

new String [] { "Latin-Malayalam",

848

latinForIndic, "[:Malayalam:]",

849

null, /*roundtrip exclusions*/

850

851

};

852

853

public void TestInterIndic() throws Exception{

854

long start = System.currentTimeMillis();

855

int num = interIndicArray.length;

856

if (isQuick()) {

857

logln("Testing only 5 of "+ interIndicArray.length+" Skipping rest (use -e for exhaustive)");

858

num = 5;

859

}

860

if (skipIfBeforeICU(4,5,0)) {

861

logln("Warning: TestInterIndic needs to be updated to remove delete the section marked [:Age=4.1:] filter");

862

} else {

863

// We temporarily filter against Unicode 4.1, but we only do this

864

// before version 3.4.

865

errln("FAIL: TestInterIndic needs to be updated to remove delete the [:Age=4.1:] filter ");

866

return;

867

}

868

for(int i=0; i<num;i++){

869

logln("Testing " + interIndicArray[i][0] + " at index " + i );

870

/*TODO: uncomment the line below when the transliterator is fixed

871

new Test(interIndicArray[i][0], 50)

872

.test(interIndicArray[i][1],

873

interIndicArray[i][2],

874

interIndicArray[i][3],

875

this, new LegalIndic());

876

877

/* comment lines below when transliterator is fixed */

878

// start

879

new Test(interIndicArray[i][0], 50)

880

.test("["+interIndicArray[i][1]+" &[:Age=4.1:]]",

881

"["+interIndicArray[i][2]+" &[:Age=4.1:]]",

882

interIndicArray[i][3],

883

this, new LegalIndic());

884

//end

885

}

886

showElapsed(start, "TestInterIndic");

887

}

888

889

//---------------

890

// End Indic

891

//---------------

892

893

public static class Legal {

894

public boolean is(String sourceString) {return true;}

895

}

896

897

public static class LegalJamo extends Legal {

898

// any initial must be followed by a medial (or initial)

899

// any medial must follow an initial (or medial)

900

// any final must follow a medial (or final)

901

902

public boolean is(String sourceString) {

903

try {

904

int t;

905

String decomp = Normalizer.normalize(sourceString, Normalizer.NFD);

906

for (int i = 0; i < decomp.length(); ++i) { // don't worry about surrogates

907

switch (getType(decomp.charAt(i))) {

908

case 0:

909

t = getType(decomp.charAt(i+1));

910

if (t != 0 && t != 1) return false;

911

break;

912

case 1:

913

t = getType(decomp.charAt(i-1));

914

if (t != 0 && t != 1) return false;

915

break;

916

case 2:

917

t = getType(decomp.charAt(i-1));

918

if (t != 1 && t != 2) return false;

919

break;

920

}

921

}

922

return true;

923

} catch (StringIndexOutOfBoundsException e) {

924

return false;

925

}

926

}

927

928

public int getType(char c) {

929

if ('\u1100' <= c && c <= '\u1112') return 0;

930

else if ('\u1161' <= c && c <= '\u1175') return 1;

931

else if ('\u11A8' <= c && c <= '\u11C2') return 2;

932

return -1; // other

933

}

934

}

935

936

//static BreakIterator thaiBreak = BreakIterator.getWordInstance(new Locale("th", "TH"));

937

// anything is legal except word ending with Logical-order-exception

938

public static class LegalThai extends Legal {

939

public boolean is(String sourceString) {

940

if (sourceString.length() == 0) return true;

941

char ch = sourceString.charAt(sourceString.length() - 1); // don't worry about surrogates.

942

if (UCharacter.hasBinaryProperty(ch, UProperty.LOGICAL_ORDER_EXCEPTION)) return false;

943

944

945

// disallow anything with a wordbreak between

946

947

if (UTF16.countCodePoint(sourceString) <= 1) return true;

948

thaiBreak.setText(sourceString);

949

for (int pos = thaiBreak.first(); pos != BreakIterator.DONE; pos = thaiBreak.next()) {

950

if (pos > 0 && pos < sourceString.length()) {

951

System.out.println("Skipping " + Utility.escape(sourceString));

952

return false;

953

}

954

}

955

956

return true;

957

}

958

}

959

960

// anything is legal except that Final letters can't be followed by letter; NonFinal must be

961

public static class LegalHebrew extends Legal {

962

static UnicodeSet FINAL = new UnicodeSet("[\u05DA\u05DD\u05DF\u05E3\u05E5]");

963

static UnicodeSet NON_FINAL = new UnicodeSet("[\u05DB\u05DE\u05E0\u05E4\u05E6]");

964

static UnicodeSet LETTER = new UnicodeSet("[:letter:]");

965

public boolean is(String sourceString) {

966

if (sourceString.length() == 0) return true;

967

// don't worry about surrogates.

968

for (int i = 0; i < sourceString.length(); ++i) {

969

char ch = sourceString.charAt(i);

970

char next = i+1 == sourceString.length() ? '\u0000' : sourceString.charAt(i);

971

if (FINAL.contains(ch)) {

972

if (LETTER.contains(next)) return false;

973

} else if (NON_FINAL.contains(ch)) {

974

if (!LETTER.contains(next)) return false;

975

}

976

}

977

return true;

978

}

979

}

980

981

982

public static class LegalGreek extends Legal {

983

984

boolean full;

985

986

public LegalGreek(boolean full) {

987

this.full = full;

988

}

989

990

static final char IOTA_SUBSCRIPT = '\u0345';

991

static final UnicodeSet breathing = new UnicodeSet("[\\u0313\\u0314']");

992

static final UnicodeSet validSecondVowel = new UnicodeSet("[\\u03C5\\u03B9\\u03A5\\u0399]");

993

994

public static boolean isVowel(char c) {

995

return "\u03B1\u03B5\u03B7\u03B9\u03BF\u03C5\u03C9\u0391\u0395\u0397\u0399\u039F\u03A5\u03A9".indexOf(c) >= 0;

996

}

997

998

public static boolean isRho(char c) {

999

return "\u03C1\u03A1".indexOf(c) >= 0;

1000

}

1001

1002

public boolean is(String sourceString) {

1003

try {

1004

String decomp = Normalizer.normalize(sourceString, Normalizer.NFD);

1005

1006

// modern is simpler: don't care about anything but a grave

1007

if (!full) {

1008

//if (sourceString.equals("\u039C\u03C0")) return false;

1009

for (int i = 0; i < decomp.length(); ++i) {

1010

char c = decomp.charAt(i);

1011

// exclude all the accents

1012

if (c == '\u0313' || c == '\u0314' || c == '\u0300' || c == '\u0302'

1013

|| c == '\u0342' || c == '\u0345'

1014

) return false;

1015

}

1016

return true;

1017

}

1018

1019

// Legal full Greek has breathing marks IFF there is a vowel or RHO at the start

1020

// IF it has them, it has exactly one.

1021

// IF it starts with a RHO, then the breathing mark must come before the second letter.

1022

// IF it starts with a vowel, then it must before the third letter.

1023

// it will only come after the second if of the format [vowel] [no iota subscript!] [upsilon or iota]

1024

// Since there are no surrogates in greek, don't worry about them

1025

1026

boolean firstIsVowel = false;

1027

boolean firstIsRho = false;

1028

boolean noLetterYet = true;

1029

int breathingCount = 0;

1030

int letterCount = 0;

1031

//int breathingPosition = -1;

1032

1033

for (int i = 0; i < decomp.length(); ++i) {

1034

char c = decomp.charAt(i);

1035

if (UCharacter.isLetter(c)) {

1036

++letterCount;

1037

if (firstIsVowel && !validSecondVowel.contains(c) && breathingCount == 0) return false;

1038

if (noLetterYet) {

1039

noLetterYet = false;

1040

firstIsVowel = isVowel(c);

1041

firstIsRho = isRho(c);

1042

}

1043

if (firstIsRho && letterCount == 2 && breathingCount == 0) return false;

1044

}

1045

if (c == IOTA_SUBSCRIPT && firstIsVowel && breathingCount == 0) return false;

1046

if (breathing.contains(c)) {

1047

// breathingPosition = i;

1048

++breathingCount;

1049

}

1050

}

1051

1052

if (firstIsVowel || firstIsRho) return breathingCount == 1;

1053

return breathingCount == 0;

1054

} catch (Throwable t) {

1055

System.out.println(t.getClass().getName() + " " + t.getMessage());

1056

return true;

1057

}

1058

}

1059

}

1060

1061

static class Test {

1062

1063

PrintWriter out;

1064

1065

private String transliteratorID;

1066

private int errorLimit = 500;

1067

private int errorCount = 0;

1068

private long pairLimit = 1000000; // make default be 1M.

1069

private int density = 100;

1070

UnicodeSet sourceRange;

1071

UnicodeSet targetRange;

1072

UnicodeSet toSource;

1073

UnicodeSet toTarget;

1074

UnicodeSet roundtripExclusions;

1075

1076

RoundTripTest log;

1077

Legal legalSource;

1078

UnicodeSet badCharacters;

1079

1080

1081

/**
 * Creates a test for the given script transliterator, using a density of 100.
 *
 * @param transliteratorID ID of the transliterator under test
 */
Test(String transliteratorID) {
    this(transliteratorID, 100);
}

1086

1087

/**
 * Creates a test for the given script transliterator.
 *
 * @param transliteratorID ID of the transliterator under test
 * @param dens             density value stored for the doubles checks
 */
Test(String transliteratorID, int dens) {
    density = dens;
    this.transliteratorID = transliteratorID;
}

1091

1092

public void setErrorLimit(int limit) {

1093

errorLimit = limit;

1094

}

1095

1096

public void setPairLimit(int limit) {

1097

pairLimit = limit;

1098

}

1099

1100

// Added to do better equality check.

1101

1102

public static boolean isSame(String a, String b) {

1103

if (a.equals(b)) return true;

1104

if (a.equalsIgnoreCase(b) && isCamel(a)) return true;

1105

a = Normalizer.normalize(a, Normalizer.NFD);

1106

b = Normalizer.normalize(b, Normalizer.NFD);

1107

if (a.equals(b)) return true;

1108

if (a.equalsIgnoreCase(b) && isCamel(a)) return true;

1109

return false;

1110

}

1111

1112

1113

public boolean includesSome(UnicodeSet set, String a) {

1114

int cp;

1115

for (int i = 0; i < a.length(); i += UTF16.getCharCount(cp)) {

1116

cp = UTF16.charAt(a, i);

1117

if (set.contains(cp)) return true;

1118

}

1119

return false;

1120

}

1121

1122

1123

public static boolean isCamel(String a) {

1124

//System.out.println("CamelTest");

1125

// see if string is of the form aB; e.g. lower, then upper or title

1126

int cp;

1127

boolean haveLower = false;

1128

for (int i = 0; i < a.length(); i += UTF16.getCharCount(cp)) {

1129

cp = UTF16.charAt(a, i);

1130

int t = UCharacter.getType(cp);

1131

//System.out.println("\t" + t + " " + Integer.toString(cp,16) + " " + UCharacter.getName(cp));

1132

switch (t) {

1133

case Character.UPPERCASE_LETTER:

1134

if (haveLower) return true;

1135

break;

1136

case Character.TITLECASE_LETTER:

1137

if (haveLower) return true;

1138

// drop through, since second letter is lower.

1139

case Character.LOWERCASE_LETTER:

1140

haveLower = true;

1141

break;

1142

}

1143

}

1144

//System.out.println("FALSE");

1145

return false;

1146

}

1147

1148

static final UnicodeSet okAnyway = new UnicodeSet("[^[:Letter:]]");

1149

static final UnicodeSet neverOk = new UnicodeSet("[:Other:]");

1150

1151

public void test(String srcRange, String trgtRange,

1152

String rdtripExclusions, RoundTripTest logger, Legal legalSrc)

1153

throws java.io.IOException {

1154

test(srcRange, trgtRange, srcRange, rdtripExclusions, logger, legalSrc);

1155

}

1156

1157

/**

1158

* Will test

1159

* that everything in sourceRange maps to targetRange,

1160

* that everything in targetRange maps to backtoSourceRange

1161

* that everything roundtrips from target -> source -> target, except roundtripExceptions

1162

1163

public void test(String srcRange, String trgtRange, String backtoSourceRange,

1164

String rdtripExclusions, RoundTripTest logger, Legal legalSrc)

1165

throws java.io.IOException {

1166

1167

legalSource = legalSrc;

1168

sourceRange = new UnicodeSet(srcRange);

1169

sourceRange.removeAll(neverOk);

1170

1171

targetRange = new UnicodeSet(trgtRange);

1172

targetRange.removeAll(neverOk);

1173

1174

toSource = new UnicodeSet(backtoSourceRange);

1175

toSource.addAll(okAnyway);

1176

1177

toTarget = new UnicodeSet(trgtRange);

1178

toTarget.addAll(okAnyway);

1179

1180

if (rdtripExclusions != null && rdtripExclusions.length() > 0) {

1181

roundtripExclusions = new UnicodeSet(rdtripExclusions);

1182

}else{

1183

roundtripExclusions = new UnicodeSet(); // empty

1184

}

1185

1186

log = logger;

1187

1188

log.logln(Utility.escape("Source: " + sourceRange));

1189

log.logln(Utility.escape("Target: " + targetRange));

1190

log.logln(Utility.escape("Exclude: " + roundtripExclusions));

1191

if (log.isQuick()) log.logln("Abbreviated Test");

1192

1193

badCharacters = new UnicodeSet("[:other:]");

1194

1195

// make a UTF-8 output file we can read with a browser

1196

1197

// note: check that every transliterator transliterates the null string correctly!

1198

1199

// {dlf} reorganize so can run test in protected security environment

1200

// String logFileName = "test_" + transliteratorID.replace('/', '_') + ".html";

1201

1202

// File lf = new File(logFileName);

1203

// log.logln("Creating log file " + lf.getAbsoluteFile());

1204

1205

// out = new PrintWriter(new BufferedWriter(new OutputStreamWriter(

1206

// new FileOutputStream(logFileName), "UTF8"), 4*1024));

1207

1208

ByteArrayOutputStream bast = new ByteArrayOutputStream();

1209

out = new PrintWriter(new BufferedWriter(new OutputStreamWriter(

1210

bast, "UTF8"), 4*1024));

1211

//out.write('\uFFEF'); // BOM

1212

out.println("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\">");

1213

out.println("<HTML><HEAD>");

1214

out.println("<META content=\"text/html; charset=utf-8\" http-equiv=Content-Type></HEAD>");

1215

out.println("<BODY bgcolor='#FFFFFF' style='font-family: Arial Unicode MS'>");

1216

1217

try {

1218

test2();

1219

} catch (TestTruncated e) {

1220

out.println(e.getMessage());

1221

}

1222

out.println("</BODY></HTML>");

1223

out.close();

1224

1225

if (errorCount > 0) {

1226

try {

1227

File translitErrorDirectory = new File("translitErrorLogs");

1228

if (!translitErrorDirectory.exists()) {

1229

translitErrorDirectory.mkdir();

1230

}

1231

String logFileName = "translitErrorLogs/test_" + transliteratorID.replace('/', '_') + ".html";

1232

File lf = new File(logFileName);

1233

logger.logln("Creating log file " + lf.getAbsoluteFile());

1234

FileOutputStream fos = new FileOutputStream(lf);

1235

fos.write(bast.toByteArray());

1236

fos.close();

1237

logger.errln(transliteratorID + " errors: "

1238

+ errorCount + (errorCount > errorLimit ? " (at least!)" : "")

1239

+ ", see " + lf.getAbsoluteFile());

1240

}

1241

catch (SecurityException e) {

1242

logger.errln(transliteratorID + " errors: "

1243

+ errorCount + (errorCount > errorLimit ? " (at least!)" : "")

1244

+ ", no log provided due to protected test domain");

1245

}

1246

} else {

1247

logger.logln(transliteratorID + " ok");

1248

// new File(logFileName).delete();

1249

}

1250

}

1251

1252

// ok if at least one is not equal

1253

public boolean checkIrrelevants(Transliterator t, String irrelevants) {

1254

for (int i = 0; i < irrelevants.length(); ++i) {

1255

char c = irrelevants.charAt(i);

1256

String cs = UTF16.valueOf(c);

1257

String targ = t.transliterate(cs);

1258

if (cs.equals(targ)) return true;

1259

}

1260

return false;

1261

}

1262

1263

AbbreviatedUnicodeSetIterator usi = new AbbreviatedUnicodeSetIterator();

1264

AbbreviatedUnicodeSetIterator usi2 = new AbbreviatedUnicodeSetIterator();

1265

1266

Transliterator sourceToTarget;

1267

Transliterator targetToSource;

1268

1269

public void test2() {

1270

1271

sourceToTarget = Transliterator.getInstance(transliteratorID);

1272

targetToSource = sourceToTarget.getInverse();

1273

1274

log.logln("Checking that at least one irrevant characters is not NFC'ed");

1275

out.println("<h3>Checking that at least one irrevant characters is not NFC'ed</h3>");

1276

1277

String irrelevants = "\u2000\u2001\u2126\u212A\u212B\u2329"; // string is from NFC_NO in the UCD

1278

1279

if (!checkIrrelevants(sourceToTarget, irrelevants)) {

1280

logFails("" + getSourceTarget(transliteratorID) + ", Must not NFC everything");

1281

}

1282

if (!checkIrrelevants(targetToSource, irrelevants)) {

1283

logFails("" + getTargetSource(transliteratorID) + ", irrelevants");

1284

}

1285

1286

if (EXTRA_TESTS) {

1287

log.logln("Checking that toRules works");

1288

String rules = "";

1289

Transliterator sourceToTarget2;

1290

Transliterator targetToSource2;

1291

try {

1292

rules = sourceToTarget.toRules(false);

1293

sourceToTarget2 = Transliterator.createFromRules("s2t2", rules, Transliterator.FORWARD);

1294

if (PRINT_RULES) {

1295

out.println("<h3>Forward Rules:</h3><p>");

1296

out.println(TestUtility.replace(rules, "\n", "\u200E<br>\n\u200E"));

1297

out.println("</p>");

1298

}

1299

rules = targetToSource.toRules(false);

1300

targetToSource2 = Transliterator.createFromRules("t2s2", rules, Transliterator.FORWARD);

1301

if (PRINT_RULES) {

1302

out.println("<h3>Backward Rules:</h3><p>");

1303

out.println(TestUtility.replace(rules, "\n", "\u200E<br>\n\u200E"));

1304

out.println("</p>");

1305

}

1306

} catch (RuntimeException e) {

1307

out.println("<h3>Broken Rules:</h3><p>");

1308

out.println(TestUtility.replace(rules, "\n", "<br>\n"));

1309

out.println("</p>");

1310

out.flush();

1311

throw e;

1312

}

1313

1314

out.println("<h3>Roundtrip Exclusions: " + new UnicodeSet(roundtripExclusions) + "</h3>");

1315

out.flush();

1316

1317

checkSourceTargetSource(sourceToTarget2);

1318

1319

checkTargetSourceTarget(targetToSource2);

1320

}

1321

1322

UnicodeSet failSourceTarg = new UnicodeSet();

1323

1324

1325

checkSourceTargetSingles(failSourceTarg);

1326

1327

boolean quickRt = checkSourceTargetDoubles(failSourceTarg);

1328

1329

UnicodeSet failTargSource = new UnicodeSet();

1330

UnicodeSet failRound = new UnicodeSet();

1331

1332

checkTargetSourceSingles(failTargSource, failRound);

1333

checkTargetSourceDoubles(quickRt, failTargSource, failRound);

1334

}

1335

1336

private void checkSourceTargetSource(Transliterator sourceToTarget2) {

1337

log.logln("Checking that source -> target -> source");

1338

out.println("<h3>Checking that source -> target -> source</h3>");

1339

1340

usi.reset(sourceRange);

1341

while (usi.next()) {

1342

int c = usi.codepoint;

1343

1344

String cs = UTF16.valueOf(c);

1345

String targ = sourceToTarget.transliterate(cs);

1346

String targ2 = sourceToTarget2.transliterate(cs);

1347

if (!targ.equals(targ2)) {

1348

logToRulesFails("" + getSourceTarget(transliteratorID) + ", toRules", cs, targ, targ2);

1349

}

1350

}

1351

}

1352

1353

private void checkTargetSourceTarget(Transliterator targetToSource2) {

1354

log.logln("Checking that target -> source -> target");

1355

out.println("<h3>Checking that target -> source -> target</h3>");

1356

usi.reset(targetRange);

1357

while (usi.next()) {

1358

int c = usi.codepoint;

1359

1360

String cs = UTF16.valueOf(c);

1361

String targ = targetToSource.transliterate(cs);

1362

String targ2 = targetToSource2.transliterate(cs);

1363

if (!targ.equals(targ2)) {

1364

logToRulesFails("" + getTargetSource(transliteratorID) + ", toRules", cs, targ, targ2);

1365

}

1366

}

1367

}

1368

1369

private void checkSourceTargetSingles(UnicodeSet failSourceTarg) {

1370

log.logln("Checking that source characters convert to target - Singles");

1371

out.println("<h3>Checking that source characters convert to target - Singles</h3>");

1372

1373

1374

1375

for (char c = 0; c < 0xFFFF; ++c) {

1376

if (!sourceRange.contains(c)) continue;

1377

1378

usi.reset(sourceRange);

1379

while (usi.next()) {

1380

int c = usi.codepoint;

1381

1382

String cs = UTF16.valueOf(c);

1383

String targ = sourceToTarget.transliterate(cs);

1384

if (!toTarget.containsAll(targ)

1385

|| badCharacters.containsSome(targ)) {

1386

String targD = Normalizer.normalize(targ, Normalizer.NFD);

1387

if (!toTarget.containsAll(targD)

1388

|| badCharacters.containsSome(targD)) {

1389

logWrongScript("" + getSourceTarget(transliteratorID) + "", cs, targ, toTarget, badCharacters);

1390

failSourceTarg.add(c);

1391

continue;

1392

}

1393

}

1394

1395

String cs2 = Normalizer.normalize(cs, Normalizer.NFD);

1396

String targ2 = sourceToTarget.transliterate(cs2);

1397

if (!targ.equals(targ2)) {

1398

logNotCanonical("" + getSourceTarget(transliteratorID) + "", cs, targ, cs2, targ2);

1399

}

1400

}

1401

}

1402

1403

private boolean checkSourceTargetDoubles(UnicodeSet failSourceTarg) {

1404

log.logln("Checking that source characters convert to target - Doubles");

1405

out.println("<h3>Checking that source characters convert to target - Doubles</h3>");

1406

long count = 0;

1407

1408

1409

for (char c = 0; c < 0xFFFF; ++c) {

1410

if (TestUtility.isUnassigned(c) ||

1411

!sourceRange.contains(c)) continue;

1412

if (failSourceTarg.get(c)) continue;

1413

1414

1415

1416

UnicodeSet sourceRangeMinusFailures = new UnicodeSet(sourceRange);

1417

sourceRangeMinusFailures.removeAll(failSourceTarg);

1418

1419

boolean quickRt = log.getInclusion() < 10;

1420

1421

usi.reset(sourceRangeMinusFailures, quickRt, density);

1422

1423

while (usi.next()) {

1424

int c = usi.codepoint;

1425

1426

1427

for (char d = 0; d < 0xFFFF; ++d) {

1428

if (TestUtility.isUnassigned(d) ||

1429

!sourceRange.contains(d)) continue;

1430

if (failSourceTarg.get(d)) continue;

1431

1432

log.logln(count + "/" + pairLimit + " Checking starting with " + UTF16.valueOf(c));

1433

usi2.reset(sourceRangeMinusFailures, quickRt, density);

1434

1435

while (usi2.next()) {

1436

int d = usi2.codepoint;

1437

++count;

1438

1439

String cs = UTF16.valueOf(c) + UTF16.valueOf(d);

1440

String targ = sourceToTarget.transliterate(cs);

1441

if (!toTarget.containsAll(targ)

1442

|| badCharacters.containsSome(targ)) {

1443

String targD = Normalizer.normalize(targ, Normalizer.NFD);

1444

if (!toTarget.containsAll(targD)

1445

|| badCharacters.containsSome(targD)) {

1446

logWrongScript("" + getSourceTarget(transliteratorID) + "", cs, targ, toTarget, badCharacters);

1447

continue;

1448

}

1449

}

1450

String cs2 = Normalizer.normalize(cs, Normalizer.NFD);

1451

String targ2 = sourceToTarget.transliterate(cs2);

1452

if (!targ.equals(targ2)) {

1453

logNotCanonical("" + getSourceTarget(transliteratorID) + "", cs, targ, cs2, targ2);

1454

}

1455

}

1456

}

1457

return quickRt;

1458

}

1459

1460

void checkTargetSourceSingles(UnicodeSet failTargSource, UnicodeSet failRound) {

1461

log.logln("Checking that target characters convert to source and back - Singles");

1462

out.println("<h3>Checking that target characters convert to source and back - Singles</h3>");

1463

1464

1465

/*for (char c = 0; c < 0xFFFF; ++c) {

1466

if (TestUtility.isUnassigned(c) ||

1467

!targetRange.contains(c)) continue;

1468

1469

1470

usi.reset(targetRange);

1471

while (usi.next()) {

1472

String cs;

1473

int c;

1474

if(usi.codepoint == UnicodeSetIterator.IS_STRING){

1475

cs = usi.string;

1476

c = UTF16.charAt(cs,0);

1477

}else{

1478

c = usi.codepoint;

1479

cs =UTF16.valueOf(c);

1480

}

1481

1482

String targ = targetToSource.transliterate(cs);

1483

String reverse = sourceToTarget.transliterate(targ);

1484

1485

if (!toSource.containsAll(targ)

1486

|| badCharacters.containsSome(targ)) {

1487

String targD = Normalizer.normalize(targ, Normalizer.NFD);

1488

if (!toSource.containsAll(targD)

1489

|| badCharacters.containsSome(targD)) {

1490

/*UnicodeSet temp = */new UnicodeSet().addAll(targD);

1491

logWrongScript("" + getTargetSource(transliteratorID) + "", cs, targ, toSource, badCharacters);

1492

failTargSource.add(cs);

1493

continue;

1494

}

1495

}

1496

if (!isSame(cs, reverse) && !roundtripExclusions.contains(c)

1497

&& !roundtripExclusions.contains(cs)) {

1498

logRoundTripFailure(cs,targetToSource.getID(), targ,sourceToTarget.getID(), reverse);

1499

failRound.add(c);

1500

continue;

1501

}

1502

String targ2 = Normalizer.normalize(targ, Normalizer.NFD);

1503

String reverse2 = sourceToTarget.transliterate(targ2);

1504

if (!reverse.equals(reverse2)) {

1505

logNotCanonical("" + getTargetSource(transliteratorID) + "", targ, reverse, targ2, reverse2);

1506

}

1507

}

1508

1509

}

1510

1511

private void checkTargetSourceDoubles(boolean quickRt, UnicodeSet failTargSource,

1512

UnicodeSet failRound) {

1513

log.logln("Checking that target characters convert to source and back - Doubles");

1514

out.println("<h3>Checking that target characters convert to source and back - Doubles</h3>");

1515

long count = 0;

1516

1517

UnicodeSet targetRangeMinusFailures = new UnicodeSet(targetRange);

1518

targetRangeMinusFailures.removeAll(failTargSource);

1519

targetRangeMinusFailures.removeAll(failRound);

1520

1521

//char[] buf = new char[4]; // maximum we can have with 2 code points

1522

1523

for (char c = 0; c < 0xFFFF; ++c) {

1524

if (TestUtility.isUnassigned(c) ||

1525

!targetRange.contains(c)) continue;

1526

1527

1528

usi.reset(targetRangeMinusFailures, quickRt, density);

1529

1530

while (usi.next()) {

1531

int c = usi.codepoint;

1532

1533

//log.log(TestUtility.hex(c));

1534

1535

1536

for (char d = 0; d < 0xFFFF; ++d) {

1537

if (TestUtility.isUnassigned(d) ||

1538

!targetRange.contains(d)) continue;

1539

1540

log.logln(count + "/" + pairLimit + " Checking starting with " + UTF16.valueOf(c));

1541

usi2.reset(targetRangeMinusFailures, quickRt, density);

1542

1543

while (usi2.next()) {

1544

1545

int d = usi2.codepoint;

1546

if (d < 0) break;

1547

1548

if (++count > pairLimit) {

1549

throw new TestTruncated("Test truncated at " + pairLimit);

1550

}

1551

1552

String cs = UTF16.valueOf(c) + UTF16.valueOf(d);

1553

String targ = targetToSource.transliterate(cs);

1554

String reverse = sourceToTarget.transliterate(targ);

1555

1556

if (!toSource.containsAll(targ) /*&& !failTargSource.contains(c) && !failTargSource.contains(d)*/

1557

|| badCharacters.containsSome(targ)) {

1558

String targD = Normalizer.normalize(targ, Normalizer.NFD);

1559

if (!toSource.containsAll(targD) /*&& !failTargSource.contains(c) && !failTargSource.contains(d)*/

1560

|| badCharacters.containsSome(targD)) {

1561

logWrongScript("" + getTargetSource(transliteratorID) + "", cs, targ, toSource, badCharacters);

1562

continue;

1563

}

1564

}

1565

if (!isSame(cs, reverse) /*&& !failRound.contains(c) && !failRound.contains(d)*/

1566

&& !roundtripExclusions.contains(c)

1567

&& !roundtripExclusions.contains(d)

1568

&& !roundtripExclusions.contains(cs)) {

1569

logRoundTripFailure(cs,targetToSource.getID(), targ,sourceToTarget.getID(), reverse);

1570

continue;

1571

}

1572

String targ2 = Normalizer.normalize(targ, Normalizer.NFD);

1573

String reverse2 = sourceToTarget.transliterate(targ2);

1574

if (!reverse.equals(reverse2)) {

1575

logNotCanonical("" + getTargetSource(transliteratorID) + "", targ, reverse, targ2, reverse2);

1576

}

1577

}

1578

}

1579

log.logln("");

1580

}

1581

1582

/**

1583

* @param transliteratorID2

1584

* @return

1585

1586

private String getTargetSource(String transliteratorID2) {

1587

return "Target-Source [" + transliteratorID2 + "]";

1588

}

1589

1590

/**

1591

* @param transliteratorID2

1592

* @return

1593

1594

private String getSourceTarget(String transliteratorID2) {

1595

return "Source-Target [" + transliteratorID2 + "]";

1596

}

1597

1598

final String info(String s) {

1599

StringBuffer result = new StringBuffer();

1600

result.append("\u200E").append(s).append("\u200E (").append(TestUtility.hex(s)).append("/");

1601

if (false) { // append age, as a check

1602

int cp = 0;

1603

for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {

1604

cp = UTF16.charAt(s, i);

1605

if (i > 0) result.append(", ");

1606

result.append(UCharacter.getAge(cp));

1607

}

1608

}

1609

result.append(")");

1610

return result.toString();

1611

}

1612

1613

final void logWrongScript(String label, String from, String to,

1614

UnicodeSet shouldContainAll, UnicodeSet shouldNotContainAny) {

1615

if (++errorCount > errorLimit) {

1616

throw new TestTruncated("Test truncated; too many failures");

1617

}

1618

String toD = Normalizer.normalize(to, Normalizer.NFD);

1619

UnicodeSet temp = new UnicodeSet().addAll(toD);

1620

UnicodeSet bad = new UnicodeSet(shouldNotContainAny).retainAll(temp)

1621

.addAll(new UnicodeSet(temp).removeAll(shouldContainAll));

1622

1623

out.println("<br>Fail " + label + ": " +

1624

info(from) + " => " + info(to) + " " + bad

1625

);

1626

}

1627

1628

final void logNotCanonical(String label, String from, String to, String fromCan, String toCan) {

1629

if (++errorCount > errorLimit) {

1630

throw new TestTruncated("Test truncated; too many failures");

1631

}

1632

out.println("<br>Fail (can.equiv) " + label + ": " +

1633

info(from) + " => " + info(to) +

1634

" -- " +

1635

info(fromCan) + " => " + info(toCan) + ")"

1636

);

1637

}

1638

1639

final void logFails(String label) {

1640

if (++errorCount > errorLimit) {

1641

throw new TestTruncated("Test truncated; too many failures");

1642

}

1643

out.println("<br>Fail (can.equiv)" + label);

1644

}

1645

1646

final void logToRulesFails(String label, String from, String to, String toCan) {

1647

if (++errorCount > errorLimit) {

1648

throw new TestTruncated("Test truncated; too many failures");

1649

}

1650

out.println("<br>Fail " + label + ": " +

1651

info(from) + " => " + info(to) + ", " + info(toCan)

1652

);

1653

}

1654

1655

final void logRoundTripFailure(String from,String toID, String to,String backID, String back) {

1656

if (!legalSource.is(from)) return; // skip illegals

1657

1658

if (++errorCount > errorLimit) {

1659

throw new TestTruncated("Test truncated; too many failures");

1660

}

1661

out.println("<br>Fail Roundtrip: " +

1662

info(from) + " "+toID+" => " + info(to) + " " + backID+" => " + info(back)

1663

);

1664

}

1665

1666

1667

* Characters to filter for source-target mapping completeness

1668

* Typically is base alphabet, minus extended characters

1669

* Default is ASCII letters for Latin

1670

1671

1672

public boolean isSource(char c) {

1673

if (!sourceRange.contains(c)) return false;

1674

return true;

1675

}

1676

1677

1678

1679

* Characters to check for target back to source mapping.

1680

* Typically the same as the target script, plus punctuation

1681

1682

1683

public boolean isReceivingSource(char c) {

1684

if (!targetRange.contains(c)) return false;

1685

return true;

1686

}

1687

1688

1689

* Characters to filter for target-source mapping

1690

* Typically is base alphabet, minus extended characters

1691

1692

1693

public boolean isTarget(char c) {

1694

byte script = TestUtility.getScript(c);

1695

if (script != targetScript) return false;

1696

if (!TestUtility.isLetter(c)) return false;

1697

if (targetRange != null && !targetRange.contains(c)) return false;

1698

return true;

1699

}

1700

1701

1702

1703

* Characters to check for target-source mapping

1704

* Typically the same as the source script, plus punctuation

1705

1706

1707

public boolean isReceivingTarget(char c) {

1708

byte script = TestUtility.getScript(c);

1709

return (script == targetScript || script == TestUtility.COMMON_SCRIPT);

1710

}

1711

1712

final boolean isSource(String s) {

1713

for (int i = 0; i < s.length(); ++i) {

1714

if (!isSource(s.charAt(i))) return false;

1715

}

1716

return true;

1717

}

1718

1719

final boolean isTarget(String s) {

1720

for (int i = 0; i < s.length(); ++i) {

1721

if (!isTarget(s.charAt(i))) return false;

1722

}

1723

return true;

1724

}

1725

1726

final boolean isReceivingSource(String s) {

1727

for (int i = 0; i < s.length(); ++i) {

1728

if (!isReceivingSource(s.charAt(i))) return false;

1729

}

1730

return true;

1731

}

1732

1733

final boolean isReceivingTarget(String s) {

1734

for (int i = 0; i < s.length(); ++i) {

1735

if (!isReceivingTarget(s.charAt(i))) return false;

1736

}

1737

return true;

1738

}

1739

1740

1741

static class TestTruncated extends RuntimeException {

1742

/**

1743

* For serialization

1744

1745

private static final long serialVersionUID = 3361828190488168323L;

1746

1747

TestTruncated(String msg) {

1748

super(msg);

1749

}

1750

}

1751

}

1752

1753

// static class TestHangul extends Test {

1754

// TestHangul () {

1755

// super("Jamo-Hangul", TestUtility.JAMO_SCRIPT, TestUtility.HANGUL_SCRIPT);

1756

// }

1757

1758

// public boolean isSource(char c) {

1759

// if (0x1113 <= c && c <= 0x1160) return false;

1760

// if (0x1176 <= c && c <= 0x11F9) return false;

1761

// if (0x3131 <= c && c <= 0x318E) return false;

1762

// return super.isSource(c);

1763

// }

1764

// }

1765

1766

1767

}

Older »