1
/* This Source Code Form is subject to the terms of the Mozilla Public
2
* License, v. 2.0. If a copy of the MPL was not distributed with this
3
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6
static const char CVS_ID[] = "@(#) $RCSfile: utf8.c,v $ $Revision: 1.14 $ $Date: 2012/04/25 14:50:16 $";
15
#define PORT_Assert assert
21
* UCS-4 range (hex.) UTF-8 octet sequence (binary)
22
* 0000 0000-0000 007F 0xxxxxxx
23
* 0000 0080-0000 07FF 110xxxxx 10xxxxxx
24
* 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
25
* 0001 0000-001F FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
26
* 0020 0000-03FF FFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
27
* 0400 0000-7FFF FFFF 1111110x 10xxxxxx ... 10xxxxxx
31
* From http://www.imc.org/draft-hoffman-utf16
33
* For U on [0x00010000,0x0010FFFF]: Let U' = U - 0x00010000
35
* U' = yyyyyyyyyyxxxxxxxxxx
36
* W1 = 110110yyyyyyyyyy
37
* W2 = 110111xxxxxxxxxx
41
* This code is assuming NETWORK BYTE ORDER for the 16- and 32-bit
42
* character values. If you wish to use this code for working with
43
* host byte order values, define the following:
52
* #else / * not everyone has elif * /
53
* #if IS_LITTLE_ENDIAN
61
* #error "PDP and NUXI support deferred"
62
* #endif / * IS_LITTLE_ENDIAN * /
63
* #endif / * IS_BIG_ENDIAN * /
73
#define BAD_UTF8 ((PRUint32)-1)
76
* Parse a single UTF-8 character per the spec. in section 3.9 (D36)
80
* index - Points to the byte offset in inBuf of character to read. On success,
81
* updated to the offset of the following character.
82
* inBuf - Input buffer, UTF-8 encoded
83
* inBufLen - Length of input buffer, in bytes.
86
* Success - The UCS4 encoded character
90
sec_port_read_utf8(unsigned int *index, unsigned char *inBuf, unsigned int inBufLen)
93
unsigned int i = *index;
97
PORT_Assert(i < inBufLen);
99
if ( (inBuf[i] & 0x80) == 0x00 ) {
103
} else if ( (inBuf[i] & 0xE0) == 0xC0 ) {
104
result = inBuf[i++] & 0x1F;
107
} else if ( (inBuf[i] & 0xF0) == 0xE0) {
108
result = inBuf[i++] & 0x0F;
111
} else if ( (inBuf[i] & 0xF8) == 0xF0) {
112
result = inBuf[i++] & 0x07;
119
while (bytes_left--) {
120
if (i >= inBufLen || (inBuf[i] & 0xC0) != 0x80) return BAD_UTF8;
121
result = (result << 6) | (inBuf[i++] & 0x3F);
124
/* Check for overlong sequences, surrogates, and outside unicode range */
125
if (result < min_value || (result & 0xFFFFF800) == 0xD800 || result > 0x10FFFF) {
134
sec_port_ucs4_utf8_conversion_function
137
unsigned char *inBuf,
138
unsigned int inBufLen,
139
unsigned char *outBuf,
140
unsigned int maxOutBufLen,
141
unsigned int *outBufLen
144
PORT_Assert((unsigned int *)NULL != outBufLen);
147
unsigned int i, len = 0;
149
for( i = 0; i < inBufLen; ) {
150
if( (inBuf[i] & 0x80) == 0x00 ) i += 1;
151
else if( (inBuf[i] & 0xE0) == 0xC0 ) i += 2;
152
else if( (inBuf[i] & 0xF0) == 0xE0 ) i += 3;
153
else if( (inBuf[i] & 0xF8) == 0xF0 ) i += 4;
154
else return PR_FALSE;
159
if( len > maxOutBufLen ) {
166
for( i = 0; i < inBufLen; ) {
167
PRUint32 ucs4 = sec_port_read_utf8(&i, inBuf, inBufLen);
169
if (ucs4 == BAD_UTF8) return PR_FALSE;
171
outBuf[len+L_0] = 0x00;
172
outBuf[len+L_1] = (unsigned char)(ucs4 >> 16);
173
outBuf[len+L_2] = (unsigned char)(ucs4 >> 8);
174
outBuf[len+L_3] = (unsigned char)ucs4;
182
unsigned int i, len = 0;
183
PORT_Assert((inBufLen % 4) == 0);
184
if ((inBufLen % 4) != 0) {
189
for( i = 0; i < inBufLen; i += 4 ) {
190
if( (inBuf[i+L_0] > 0x00) || (inBuf[i+L_1] > 0x10) ) {
193
} else if( inBuf[i+L_1] >= 0x01 ) len += 4;
194
else if( inBuf[i+L_2] >= 0x08 ) len += 3;
195
else if( (inBuf[i+L_2] > 0x00) || (inBuf[i+L_3] >= 0x80) ) len += 2;
199
if( len > maxOutBufLen ) {
206
for( i = 0; i < inBufLen; i += 4 ) {
207
if( inBuf[i+L_1] >= 0x01 ) {
208
/* 0001 0000-001F FFFF -> 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
209
/* 00000000 000abcde fghijklm nopqrstu ->
210
11110abc 10defghi 10jklmno 10pqrstu */
212
outBuf[len+0] = 0xF0 | ((inBuf[i+L_1] & 0x1C) >> 2);
213
outBuf[len+1] = 0x80 | ((inBuf[i+L_1] & 0x03) << 4)
214
| ((inBuf[i+L_2] & 0xF0) >> 4);
215
outBuf[len+2] = 0x80 | ((inBuf[i+L_2] & 0x0F) << 2)
216
| ((inBuf[i+L_3] & 0xC0) >> 6);
217
outBuf[len+3] = 0x80 | ((inBuf[i+L_3] & 0x3F) >> 0);
220
} else if( inBuf[i+L_2] >= 0x08 ) {
221
/* 0000 0800-0000 FFFF -> 1110xxxx 10xxxxxx 10xxxxxx */
222
/* 00000000 00000000 abcdefgh ijklmnop ->
223
1110abcd 10efghij 10klmnop */
225
outBuf[len+0] = 0xE0 | ((inBuf[i+L_2] & 0xF0) >> 4);
226
outBuf[len+1] = 0x80 | ((inBuf[i+L_2] & 0x0F) << 2)
227
| ((inBuf[i+L_3] & 0xC0) >> 6);
228
outBuf[len+2] = 0x80 | ((inBuf[i+L_3] & 0x3F) >> 0);
231
} else if( (inBuf[i+L_2] > 0x00) || (inBuf[i+L_3] >= 0x80) ) {
232
/* 0000 0080-0000 07FF -> 110xxxxx 10xxxxxx */
233
/* 00000000 00000000 00000abc defghijk ->
236
outBuf[len+0] = 0xC0 | ((inBuf[i+L_2] & 0x07) << 2)
237
| ((inBuf[i+L_3] & 0xC0) >> 6);
238
outBuf[len+1] = 0x80 | ((inBuf[i+L_3] & 0x3F) >> 0);
242
/* 0000 0000-0000 007F -> 0xxxxxxx */
243
/* 00000000 00000000 00000000 0abcdefg ->
246
outBuf[len+0] = (inBuf[i+L_3] & 0x7F);
258
sec_port_ucs2_utf8_conversion_function
261
unsigned char *inBuf,
262
unsigned int inBufLen,
263
unsigned char *outBuf,
264
unsigned int maxOutBufLen,
265
unsigned int *outBufLen
268
PORT_Assert((unsigned int *)NULL != outBufLen);
271
unsigned int i, len = 0;
273
for( i = 0; i < inBufLen; ) {
274
if( (inBuf[i] & 0x80) == 0x00 ) {
277
} else if( (inBuf[i] & 0xE0) == 0xC0 ) {
280
} else if( (inBuf[i] & 0xF0) == 0xE0 ) {
283
} else if( (inBuf[i] & 0xF8) == 0xF0 ) {
286
} else return PR_FALSE;
289
if( len > maxOutBufLen ) {
296
for( i = 0; i < inBufLen; ) {
297
PRUint32 ucs4 = sec_port_read_utf8(&i, inBuf, inBufLen);
299
if (ucs4 == BAD_UTF8) return PR_FALSE;
301
if( ucs4 < 0x10000) {
302
outBuf[len+H_0] = (unsigned char)(ucs4 >> 8);
303
outBuf[len+H_1] = (unsigned char)ucs4;
307
outBuf[len+0+H_0] = (unsigned char)(0xD8 | ((ucs4 >> 18) & 0x3));
308
outBuf[len+0+H_1] = (unsigned char)(ucs4 >> 10);
309
outBuf[len+2+H_0] = (unsigned char)(0xDC | ((ucs4 >> 8) & 0x3));
310
outBuf[len+2+H_1] = (unsigned char)ucs4;
318
unsigned int i, len = 0;
319
PORT_Assert((inBufLen % 2) == 0);
320
if ((inBufLen % 2) != 0) {
325
for( i = 0; i < inBufLen; i += 2 ) {
326
/* Sizing pass: a 16-bit unit needs a single UTF-8 byte only when its high
 * byte (H_0) is zero AND its low byte (H_1) is below 0x80.  The low byte
 * must be the one tested here; testing H_0 twice counts U+0080-U+00FF as
 * one byte although the write pass emits two bytes for them, which
 * under-estimates the required output length. */
if( (inBuf[i+H_0] == 0x00) && ((inBuf[i+H_1] & 0x80) == 0x00) ) len += 1;
327
else if( inBuf[i+H_0] < 0x08 ) len += 2;
328
else if( ((inBuf[i+0+H_0] & 0xDC) == 0xD8) ) {
329
/* Confirm that a second 16-bit unit is present BEFORE reading its high
 * byte, so a lead surrogate at the very end of the input cannot cause a
 * read past the end of inBuf.
 * NOTE(review): the 0xDC mask leaves bit 0x20 unchecked, so high bytes
 * 0xF8-0xFF also satisfy the surrogate tests in this function; a 0xFC
 * mask looks intended -- confirm, and change every surrogate test in
 * both passes together. */
if( ((inBufLen - i) > 2) && ((inBuf[i+2+H_0] & 0xDC) == 0xDC) ) {
339
if( len > maxOutBufLen ) {
346
for( i = 0; i < inBufLen; i += 2 ) {
347
if( (inBuf[i+H_0] == 0x00) && ((inBuf[i+H_1] & 0x80) == 0x00) ) {
348
/* 0000-007F -> 0xxxxxxx */
349
/* 00000000 0abcdefg -> 0abcdefg */
351
outBuf[len] = inBuf[i+H_1] & 0x7F;
354
} else if( inBuf[i+H_0] < 0x08 ) {
355
/* 0080-07FF -> 110xxxxx 10xxxxxx */
356
/* 00000abc defghijk -> 110abcde 10fghijk */
358
outBuf[len+0] = 0xC0 | ((inBuf[i+H_0] & 0x07) << 2)
359
| ((inBuf[i+H_1] & 0xC0) >> 6);
360
outBuf[len+1] = 0x80 | ((inBuf[i+H_1] & 0x3F) >> 0);
363
} else if( (inBuf[i+H_0] & 0xDC) == 0xD8 ) {
366
/* A lead surrogate must be followed by a trail surrogate.  The remaining
 * length is tested first so that the assertion itself never reads
 * beyond the end of inBuf. */
PORT_Assert(((inBufLen - i) > 2) && ((inBuf[i+2+H_0] & 0xDC) == 0xDC));
368
/* D800-DBFF DC00-DFFF -> 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
369
/* 110110BC DEfghijk 110111lm nopqrstu ->
370
{ Let abcde = BCDE + 1 }
371
11110abc 10defghi 10jklmno 10pqrstu */
373
BCDE = ((inBuf[i+H_0] & 0x03) << 2) | ((inBuf[i+H_1] & 0xC0) >> 6);
376
outBuf[len+0] = 0xF0 | ((abcde & 0x1C) >> 2);
377
outBuf[len+1] = 0x80 | ((abcde & 0x03) << 4)
378
| ((inBuf[i+0+H_1] & 0x3C) >> 2);
379
outBuf[len+2] = 0x80 | ((inBuf[i+0+H_1] & 0x03) << 4)
380
| ((inBuf[i+2+H_0] & 0x03) << 2)
381
| ((inBuf[i+2+H_1] & 0xC0) >> 6);
382
outBuf[len+3] = 0x80 | ((inBuf[i+2+H_1] & 0x3F) >> 0);
387
/* 0800-FFFF -> 1110xxxx 10xxxxxx 10xxxxxx */
388
/* abcdefgh ijklmnop -> 1110abcd 10efghij 10klmnop */
390
outBuf[len+0] = 0xE0 | ((inBuf[i+H_0] & 0xF0) >> 4);
391
outBuf[len+1] = 0x80 | ((inBuf[i+H_0] & 0x0F) << 2)
392
| ((inBuf[i+H_1] & 0xC0) >> 6);
393
outBuf[len+2] = 0x80 | ((inBuf[i+H_1] & 0x3F) >> 0);
405
sec_port_iso88591_utf8_conversion_function
407
const unsigned char *inBuf,
408
unsigned int inBufLen,
409
unsigned char *outBuf,
410
unsigned int maxOutBufLen,
411
unsigned int *outBufLen
414
unsigned int i, len = 0;
416
PORT_Assert((unsigned int *)NULL != outBufLen);
418
for( i = 0; i < inBufLen; i++) {
419
if( (inBuf[i] & 0x80) == 0x00 ) len += 1;
423
if( len > maxOutBufLen ) {
430
for( i = 0; i < inBufLen; i++) {
431
if( (inBuf[i] & 0x80) == 0x00 ) {
432
/* 00-7F -> 0xxxxxxx */
433
/* 0abcdefg -> 0abcdefg */
435
outBuf[len] = inBuf[i];
438
/* 80-FF -> 110xxxxx 10xxxxxx */
439
/* 00000000 abcdefgh -> 110000ab 10cdefgh */
441
outBuf[len+0] = 0xC0 | ((inBuf[i] & 0xC0) >> 6);
442
outBuf[len+1] = 0x80 | ((inBuf[i] & 0x3F) >> 0);
457
#include <netinet/in.h> /* for htonl and htons */
491
struct ucs4 ucs4[] = {
492
{ 0x00000001, "\x01" },
493
{ 0x00000002, "\x02" },
494
{ 0x00000003, "\x03" },
495
{ 0x00000004, "\x04" },
496
{ 0x00000007, "\x07" },
497
{ 0x00000008, "\x08" },
498
{ 0x0000000F, "\x0F" },
499
{ 0x00000010, "\x10" },
500
{ 0x0000001F, "\x1F" },
501
{ 0x00000020, "\x20" },
502
{ 0x0000003F, "\x3F" },
503
{ 0x00000040, "\x40" },
504
{ 0x0000007F, "\x7F" },
506
{ 0x00000080, "\xC2\x80" },
507
{ 0x00000081, "\xC2\x81" },
508
{ 0x00000082, "\xC2\x82" },
509
{ 0x00000084, "\xC2\x84" },
510
{ 0x00000088, "\xC2\x88" },
511
{ 0x00000090, "\xC2\x90" },
512
{ 0x000000A0, "\xC2\xA0" },
513
{ 0x000000C0, "\xC3\x80" },
514
{ 0x000000FF, "\xC3\xBF" },
515
{ 0x00000100, "\xC4\x80" },
516
{ 0x00000101, "\xC4\x81" },
517
{ 0x00000102, "\xC4\x82" },
518
{ 0x00000104, "\xC4\x84" },
519
{ 0x00000108, "\xC4\x88" },
520
{ 0x00000110, "\xC4\x90" },
521
{ 0x00000120, "\xC4\xA0" },
522
{ 0x00000140, "\xC5\x80" },
523
{ 0x00000180, "\xC6\x80" },
524
{ 0x000001FF, "\xC7\xBF" },
525
{ 0x00000200, "\xC8\x80" },
526
{ 0x00000201, "\xC8\x81" },
527
{ 0x00000202, "\xC8\x82" },
528
{ 0x00000204, "\xC8\x84" },
529
{ 0x00000208, "\xC8\x88" },
530
{ 0x00000210, "\xC8\x90" },
531
{ 0x00000220, "\xC8\xA0" },
532
{ 0x00000240, "\xC9\x80" },
533
{ 0x00000280, "\xCA\x80" },
534
{ 0x00000300, "\xCC\x80" },
535
{ 0x000003FF, "\xCF\xBF" },
536
{ 0x00000400, "\xD0\x80" },
537
{ 0x00000401, "\xD0\x81" },
538
{ 0x00000402, "\xD0\x82" },
539
{ 0x00000404, "\xD0\x84" },
540
{ 0x00000408, "\xD0\x88" },
541
{ 0x00000410, "\xD0\x90" },
542
{ 0x00000420, "\xD0\xA0" },
543
{ 0x00000440, "\xD1\x80" },
544
{ 0x00000480, "\xD2\x80" },
545
{ 0x00000500, "\xD4\x80" },
546
{ 0x00000600, "\xD8\x80" },
547
{ 0x000007FF, "\xDF\xBF" },
549
{ 0x00000800, "\xE0\xA0\x80" },
550
{ 0x00000801, "\xE0\xA0\x81" },
551
{ 0x00000802, "\xE0\xA0\x82" },
552
{ 0x00000804, "\xE0\xA0\x84" },
553
{ 0x00000808, "\xE0\xA0\x88" },
554
{ 0x00000810, "\xE0\xA0\x90" },
555
{ 0x00000820, "\xE0\xA0\xA0" },
556
{ 0x00000840, "\xE0\xA1\x80" },
557
{ 0x00000880, "\xE0\xA2\x80" },
558
{ 0x00000900, "\xE0\xA4\x80" },
559
{ 0x00000A00, "\xE0\xA8\x80" },
560
{ 0x00000C00, "\xE0\xB0\x80" },
561
{ 0x00000FFF, "\xE0\xBF\xBF" },
562
{ 0x00001000, "\xE1\x80\x80" },
563
{ 0x00001001, "\xE1\x80\x81" },
564
{ 0x00001002, "\xE1\x80\x82" },
565
{ 0x00001004, "\xE1\x80\x84" },
566
{ 0x00001008, "\xE1\x80\x88" },
567
{ 0x00001010, "\xE1\x80\x90" },
568
{ 0x00001020, "\xE1\x80\xA0" },
569
{ 0x00001040, "\xE1\x81\x80" },
570
{ 0x00001080, "\xE1\x82\x80" },
571
{ 0x00001100, "\xE1\x84\x80" },
572
{ 0x00001200, "\xE1\x88\x80" },
573
{ 0x00001400, "\xE1\x90\x80" },
574
{ 0x00001800, "\xE1\xA0\x80" },
575
{ 0x00001FFF, "\xE1\xBF\xBF" },
576
{ 0x00002000, "\xE2\x80\x80" },
577
{ 0x00002001, "\xE2\x80\x81" },
578
{ 0x00002002, "\xE2\x80\x82" },
579
{ 0x00002004, "\xE2\x80\x84" },
580
{ 0x00002008, "\xE2\x80\x88" },
581
{ 0x00002010, "\xE2\x80\x90" },
582
{ 0x00002020, "\xE2\x80\xA0" },
583
{ 0x00002040, "\xE2\x81\x80" },
584
{ 0x00002080, "\xE2\x82\x80" },
585
{ 0x00002100, "\xE2\x84\x80" },
586
{ 0x00002200, "\xE2\x88\x80" },
587
{ 0x00002400, "\xE2\x90\x80" },
588
{ 0x00002800, "\xE2\xA0\x80" },
589
{ 0x00003000, "\xE3\x80\x80" },
590
{ 0x00003FFF, "\xE3\xBF\xBF" },
591
{ 0x00004000, "\xE4\x80\x80" },
592
{ 0x00004001, "\xE4\x80\x81" },
593
{ 0x00004002, "\xE4\x80\x82" },
594
{ 0x00004004, "\xE4\x80\x84" },
595
{ 0x00004008, "\xE4\x80\x88" },
596
{ 0x00004010, "\xE4\x80\x90" },
597
{ 0x00004020, "\xE4\x80\xA0" },
598
{ 0x00004040, "\xE4\x81\x80" },
599
{ 0x00004080, "\xE4\x82\x80" },
600
{ 0x00004100, "\xE4\x84\x80" },
601
{ 0x00004200, "\xE4\x88\x80" },
602
{ 0x00004400, "\xE4\x90\x80" },
603
{ 0x00004800, "\xE4\xA0\x80" },
604
{ 0x00005000, "\xE5\x80\x80" },
605
{ 0x00006000, "\xE6\x80\x80" },
606
{ 0x00007FFF, "\xE7\xBF\xBF" },
607
{ 0x00008000, "\xE8\x80\x80" },
608
{ 0x00008001, "\xE8\x80\x81" },
609
{ 0x00008002, "\xE8\x80\x82" },
610
{ 0x00008004, "\xE8\x80\x84" },
611
{ 0x00008008, "\xE8\x80\x88" },
612
{ 0x00008010, "\xE8\x80\x90" },
613
{ 0x00008020, "\xE8\x80\xA0" },
614
{ 0x00008040, "\xE8\x81\x80" },
615
{ 0x00008080, "\xE8\x82\x80" },
616
{ 0x00008100, "\xE8\x84\x80" },
617
{ 0x00008200, "\xE8\x88\x80" },
618
{ 0x00008400, "\xE8\x90\x80" },
619
{ 0x00008800, "\xE8\xA0\x80" },
620
{ 0x00009000, "\xE9\x80\x80" },
621
{ 0x0000A000, "\xEA\x80\x80" },
622
{ 0x0000C000, "\xEC\x80\x80" },
623
{ 0x0000FFFF, "\xEF\xBF\xBF" },
625
{ 0x00010000, "\xF0\x90\x80\x80" },
626
{ 0x00010001, "\xF0\x90\x80\x81" },
627
{ 0x00010002, "\xF0\x90\x80\x82" },
628
{ 0x00010004, "\xF0\x90\x80\x84" },
629
{ 0x00010008, "\xF0\x90\x80\x88" },
630
{ 0x00010010, "\xF0\x90\x80\x90" },
631
{ 0x00010020, "\xF0\x90\x80\xA0" },
632
{ 0x00010040, "\xF0\x90\x81\x80" },
633
{ 0x00010080, "\xF0\x90\x82\x80" },
634
{ 0x00010100, "\xF0\x90\x84\x80" },
635
{ 0x00010200, "\xF0\x90\x88\x80" },
636
{ 0x00010400, "\xF0\x90\x90\x80" },
637
{ 0x00010800, "\xF0\x90\xA0\x80" },
638
{ 0x00011000, "\xF0\x91\x80\x80" },
639
{ 0x00012000, "\xF0\x92\x80\x80" },
640
{ 0x00014000, "\xF0\x94\x80\x80" },
641
{ 0x00018000, "\xF0\x98\x80\x80" },
642
{ 0x0001FFFF, "\xF0\x9F\xBF\xBF" },
643
{ 0x00020000, "\xF0\xA0\x80\x80" },
644
{ 0x00020001, "\xF0\xA0\x80\x81" },
645
{ 0x00020002, "\xF0\xA0\x80\x82" },
646
{ 0x00020004, "\xF0\xA0\x80\x84" },
647
{ 0x00020008, "\xF0\xA0\x80\x88" },
648
{ 0x00020010, "\xF0\xA0\x80\x90" },
649
{ 0x00020020, "\xF0\xA0\x80\xA0" },
650
{ 0x00020040, "\xF0\xA0\x81\x80" },
651
{ 0x00020080, "\xF0\xA0\x82\x80" },
652
{ 0x00020100, "\xF0\xA0\x84\x80" },
653
{ 0x00020200, "\xF0\xA0\x88\x80" },
654
{ 0x00020400, "\xF0\xA0\x90\x80" },
655
{ 0x00020800, "\xF0\xA0\xA0\x80" },
656
{ 0x00021000, "\xF0\xA1\x80\x80" },
657
{ 0x00022000, "\xF0\xA2\x80\x80" },
658
{ 0x00024000, "\xF0\xA4\x80\x80" },
659
{ 0x00028000, "\xF0\xA8\x80\x80" },
660
{ 0x00030000, "\xF0\xB0\x80\x80" },
661
{ 0x0003FFFF, "\xF0\xBF\xBF\xBF" },
662
{ 0x00040000, "\xF1\x80\x80\x80" },
663
{ 0x00040001, "\xF1\x80\x80\x81" },
664
{ 0x00040002, "\xF1\x80\x80\x82" },
665
{ 0x00040004, "\xF1\x80\x80\x84" },
666
{ 0x00040008, "\xF1\x80\x80\x88" },
667
{ 0x00040010, "\xF1\x80\x80\x90" },
668
{ 0x00040020, "\xF1\x80\x80\xA0" },
669
{ 0x00040040, "\xF1\x80\x81\x80" },
670
{ 0x00040080, "\xF1\x80\x82\x80" },
671
{ 0x00040100, "\xF1\x80\x84\x80" },
672
{ 0x00040200, "\xF1\x80\x88\x80" },
673
{ 0x00040400, "\xF1\x80\x90\x80" },
674
{ 0x00040800, "\xF1\x80\xA0\x80" },
675
{ 0x00041000, "\xF1\x81\x80\x80" },
676
{ 0x00042000, "\xF1\x82\x80\x80" },
677
{ 0x00044000, "\xF1\x84\x80\x80" },
678
{ 0x00048000, "\xF1\x88\x80\x80" },
679
{ 0x00050000, "\xF1\x90\x80\x80" },
680
{ 0x00060000, "\xF1\xA0\x80\x80" },
681
{ 0x0007FFFF, "\xF1\xBF\xBF\xBF" },
682
{ 0x00080000, "\xF2\x80\x80\x80" },
683
{ 0x00080001, "\xF2\x80\x80\x81" },
684
{ 0x00080002, "\xF2\x80\x80\x82" },
685
{ 0x00080004, "\xF2\x80\x80\x84" },
686
{ 0x00080008, "\xF2\x80\x80\x88" },
687
{ 0x00080010, "\xF2\x80\x80\x90" },
688
{ 0x00080020, "\xF2\x80\x80\xA0" },
689
{ 0x00080040, "\xF2\x80\x81\x80" },
690
{ 0x00080080, "\xF2\x80\x82\x80" },
691
{ 0x00080100, "\xF2\x80\x84\x80" },
692
{ 0x00080200, "\xF2\x80\x88\x80" },
693
{ 0x00080400, "\xF2\x80\x90\x80" },
694
{ 0x00080800, "\xF2\x80\xA0\x80" },
695
{ 0x00081000, "\xF2\x81\x80\x80" },
696
{ 0x00082000, "\xF2\x82\x80\x80" },
697
{ 0x00084000, "\xF2\x84\x80\x80" },
698
{ 0x00088000, "\xF2\x88\x80\x80" },
699
{ 0x00090000, "\xF2\x90\x80\x80" },
700
{ 0x000A0000, "\xF2\xA0\x80\x80" },
701
{ 0x000C0000, "\xF3\x80\x80\x80" },
702
{ 0x000FFFFF, "\xF3\xBF\xBF\xBF" },
703
{ 0x00100000, "\xF4\x80\x80\x80" },
704
{ 0x00100001, "\xF4\x80\x80\x81" },
705
{ 0x00100002, "\xF4\x80\x80\x82" },
706
{ 0x00100004, "\xF4\x80\x80\x84" },
707
{ 0x00100008, "\xF4\x80\x80\x88" },
708
{ 0x00100010, "\xF4\x80\x80\x90" },
709
{ 0x00100020, "\xF4\x80\x80\xA0" },
710
{ 0x00100040, "\xF4\x80\x81\x80" },
711
{ 0x00100080, "\xF4\x80\x82\x80" },
712
{ 0x00100100, "\xF4\x80\x84\x80" },
713
{ 0x00100200, "\xF4\x80\x88\x80" },
714
{ 0x00100400, "\xF4\x80\x90\x80" },
715
{ 0x00100800, "\xF4\x80\xA0\x80" },
716
{ 0x00101000, "\xF4\x81\x80\x80" },
717
{ 0x00102000, "\xF4\x82\x80\x80" },
718
{ 0x00104000, "\xF4\x84\x80\x80" },
719
{ 0x00108000, "\xF4\x88\x80\x80" },
720
{ 0x0010FFFF, "\xF4\x8F\xBF\xBF" },
727
struct ucs2 ucs2[] = {
742
{ 0x0080, "\xC2\x80" },
743
{ 0x0081, "\xC2\x81" },
744
{ 0x0082, "\xC2\x82" },
745
{ 0x0084, "\xC2\x84" },
746
{ 0x0088, "\xC2\x88" },
747
{ 0x0090, "\xC2\x90" },
748
{ 0x00A0, "\xC2\xA0" },
749
{ 0x00C0, "\xC3\x80" },
750
{ 0x00FF, "\xC3\xBF" },
751
{ 0x0100, "\xC4\x80" },
752
{ 0x0101, "\xC4\x81" },
753
{ 0x0102, "\xC4\x82" },
754
{ 0x0104, "\xC4\x84" },
755
{ 0x0108, "\xC4\x88" },
756
{ 0x0110, "\xC4\x90" },
757
{ 0x0120, "\xC4\xA0" },
758
{ 0x0140, "\xC5\x80" },
759
{ 0x0180, "\xC6\x80" },
760
{ 0x01FF, "\xC7\xBF" },
761
{ 0x0200, "\xC8\x80" },
762
{ 0x0201, "\xC8\x81" },
763
{ 0x0202, "\xC8\x82" },
764
{ 0x0204, "\xC8\x84" },
765
{ 0x0208, "\xC8\x88" },
766
{ 0x0210, "\xC8\x90" },
767
{ 0x0220, "\xC8\xA0" },
768
{ 0x0240, "\xC9\x80" },
769
{ 0x0280, "\xCA\x80" },
770
{ 0x0300, "\xCC\x80" },
771
{ 0x03FF, "\xCF\xBF" },
772
{ 0x0400, "\xD0\x80" },
773
{ 0x0401, "\xD0\x81" },
774
{ 0x0402, "\xD0\x82" },
775
{ 0x0404, "\xD0\x84" },
776
{ 0x0408, "\xD0\x88" },
777
{ 0x0410, "\xD0\x90" },
778
{ 0x0420, "\xD0\xA0" },
779
{ 0x0440, "\xD1\x80" },
780
{ 0x0480, "\xD2\x80" },
781
{ 0x0500, "\xD4\x80" },
782
{ 0x0600, "\xD8\x80" },
783
{ 0x07FF, "\xDF\xBF" },
785
{ 0x0800, "\xE0\xA0\x80" },
786
{ 0x0801, "\xE0\xA0\x81" },
787
{ 0x0802, "\xE0\xA0\x82" },
788
{ 0x0804, "\xE0\xA0\x84" },
789
{ 0x0808, "\xE0\xA0\x88" },
790
{ 0x0810, "\xE0\xA0\x90" },
791
{ 0x0820, "\xE0\xA0\xA0" },
792
{ 0x0840, "\xE0\xA1\x80" },
793
{ 0x0880, "\xE0\xA2\x80" },
794
{ 0x0900, "\xE0\xA4\x80" },
795
{ 0x0A00, "\xE0\xA8\x80" },
796
{ 0x0C00, "\xE0\xB0\x80" },
797
{ 0x0FFF, "\xE0\xBF\xBF" },
798
{ 0x1000, "\xE1\x80\x80" },
799
{ 0x1001, "\xE1\x80\x81" },
800
{ 0x1002, "\xE1\x80\x82" },
801
{ 0x1004, "\xE1\x80\x84" },
802
{ 0x1008, "\xE1\x80\x88" },
803
{ 0x1010, "\xE1\x80\x90" },
804
{ 0x1020, "\xE1\x80\xA0" },
805
{ 0x1040, "\xE1\x81\x80" },
806
{ 0x1080, "\xE1\x82\x80" },
807
{ 0x1100, "\xE1\x84\x80" },
808
{ 0x1200, "\xE1\x88\x80" },
809
{ 0x1400, "\xE1\x90\x80" },
810
{ 0x1800, "\xE1\xA0\x80" },
811
{ 0x1FFF, "\xE1\xBF\xBF" },
812
{ 0x2000, "\xE2\x80\x80" },
813
{ 0x2001, "\xE2\x80\x81" },
814
{ 0x2002, "\xE2\x80\x82" },
815
{ 0x2004, "\xE2\x80\x84" },
816
{ 0x2008, "\xE2\x80\x88" },
817
{ 0x2010, "\xE2\x80\x90" },
818
{ 0x2020, "\xE2\x80\xA0" },
819
{ 0x2040, "\xE2\x81\x80" },
820
{ 0x2080, "\xE2\x82\x80" },
821
{ 0x2100, "\xE2\x84\x80" },
822
{ 0x2200, "\xE2\x88\x80" },
823
{ 0x2400, "\xE2\x90\x80" },
824
{ 0x2800, "\xE2\xA0\x80" },
825
{ 0x3000, "\xE3\x80\x80" },
826
{ 0x3FFF, "\xE3\xBF\xBF" },
827
{ 0x4000, "\xE4\x80\x80" },
828
{ 0x4001, "\xE4\x80\x81" },
829
{ 0x4002, "\xE4\x80\x82" },
830
{ 0x4004, "\xE4\x80\x84" },
831
{ 0x4008, "\xE4\x80\x88" },
832
{ 0x4010, "\xE4\x80\x90" },
833
{ 0x4020, "\xE4\x80\xA0" },
834
{ 0x4040, "\xE4\x81\x80" },
835
{ 0x4080, "\xE4\x82\x80" },
836
{ 0x4100, "\xE4\x84\x80" },
837
{ 0x4200, "\xE4\x88\x80" },
838
{ 0x4400, "\xE4\x90\x80" },
839
{ 0x4800, "\xE4\xA0\x80" },
840
{ 0x5000, "\xE5\x80\x80" },
841
{ 0x6000, "\xE6\x80\x80" },
842
{ 0x7FFF, "\xE7\xBF\xBF" },
843
{ 0x8000, "\xE8\x80\x80" },
844
{ 0x8001, "\xE8\x80\x81" },
845
{ 0x8002, "\xE8\x80\x82" },
846
{ 0x8004, "\xE8\x80\x84" },
847
{ 0x8008, "\xE8\x80\x88" },
848
{ 0x8010, "\xE8\x80\x90" },
849
{ 0x8020, "\xE8\x80\xA0" },
850
{ 0x8040, "\xE8\x81\x80" },
851
{ 0x8080, "\xE8\x82\x80" },
852
{ 0x8100, "\xE8\x84\x80" },
853
{ 0x8200, "\xE8\x88\x80" },
854
{ 0x8400, "\xE8\x90\x80" },
855
{ 0x8800, "\xE8\xA0\x80" },
856
{ 0x9000, "\xE9\x80\x80" },
857
{ 0xA000, "\xEA\x80\x80" },
858
{ 0xC000, "\xEC\x80\x80" },
859
{ 0xFFFF, "\xEF\xBF\xBF" }
867
struct utf16 utf16[] = {
868
{ 0x00010000, { 0xD800, 0xDC00 } },
869
{ 0x00010001, { 0xD800, 0xDC01 } },
870
{ 0x00010002, { 0xD800, 0xDC02 } },
871
{ 0x00010003, { 0xD800, 0xDC03 } },
872
{ 0x00010004, { 0xD800, 0xDC04 } },
873
{ 0x00010007, { 0xD800, 0xDC07 } },
874
{ 0x00010008, { 0xD800, 0xDC08 } },
875
{ 0x0001000F, { 0xD800, 0xDC0F } },
876
{ 0x00010010, { 0xD800, 0xDC10 } },
877
{ 0x0001001F, { 0xD800, 0xDC1F } },
878
{ 0x00010020, { 0xD800, 0xDC20 } },
879
{ 0x0001003F, { 0xD800, 0xDC3F } },
880
{ 0x00010040, { 0xD800, 0xDC40 } },
881
{ 0x0001007F, { 0xD800, 0xDC7F } },
882
{ 0x00010080, { 0xD800, 0xDC80 } },
883
{ 0x00010081, { 0xD800, 0xDC81 } },
884
{ 0x00010082, { 0xD800, 0xDC82 } },
885
{ 0x00010084, { 0xD800, 0xDC84 } },
886
{ 0x00010088, { 0xD800, 0xDC88 } },
887
{ 0x00010090, { 0xD800, 0xDC90 } },
888
{ 0x000100A0, { 0xD800, 0xDCA0 } },
889
{ 0x000100C0, { 0xD800, 0xDCC0 } },
890
{ 0x000100FF, { 0xD800, 0xDCFF } },
891
{ 0x00010100, { 0xD800, 0xDD00 } },
892
{ 0x00010101, { 0xD800, 0xDD01 } },
893
{ 0x00010102, { 0xD800, 0xDD02 } },
894
{ 0x00010104, { 0xD800, 0xDD04 } },
895
{ 0x00010108, { 0xD800, 0xDD08 } },
896
{ 0x00010110, { 0xD800, 0xDD10 } },
897
{ 0x00010120, { 0xD800, 0xDD20 } },
898
{ 0x00010140, { 0xD800, 0xDD40 } },
899
{ 0x00010180, { 0xD800, 0xDD80 } },
900
{ 0x000101FF, { 0xD800, 0xDDFF } },
901
{ 0x00010200, { 0xD800, 0xDE00 } },
902
{ 0x00010201, { 0xD800, 0xDE01 } },
903
{ 0x00010202, { 0xD800, 0xDE02 } },
904
{ 0x00010204, { 0xD800, 0xDE04 } },
905
{ 0x00010208, { 0xD800, 0xDE08 } },
906
{ 0x00010210, { 0xD800, 0xDE10 } },
907
{ 0x00010220, { 0xD800, 0xDE20 } },
908
{ 0x00010240, { 0xD800, 0xDE40 } },
909
{ 0x00010280, { 0xD800, 0xDE80 } },
910
{ 0x00010300, { 0xD800, 0xDF00 } },
911
{ 0x000103FF, { 0xD800, 0xDFFF } },
912
{ 0x00010400, { 0xD801, 0xDC00 } },
913
{ 0x00010401, { 0xD801, 0xDC01 } },
914
{ 0x00010402, { 0xD801, 0xDC02 } },
915
{ 0x00010404, { 0xD801, 0xDC04 } },
916
{ 0x00010408, { 0xD801, 0xDC08 } },
917
{ 0x00010410, { 0xD801, 0xDC10 } },
918
{ 0x00010420, { 0xD801, 0xDC20 } },
919
{ 0x00010440, { 0xD801, 0xDC40 } },
920
{ 0x00010480, { 0xD801, 0xDC80 } },
921
{ 0x00010500, { 0xD801, 0xDD00 } },
922
{ 0x00010600, { 0xD801, 0xDE00 } },
923
{ 0x000107FF, { 0xD801, 0xDFFF } },
924
{ 0x00010800, { 0xD802, 0xDC00 } },
925
{ 0x00010801, { 0xD802, 0xDC01 } },
926
{ 0x00010802, { 0xD802, 0xDC02 } },
927
{ 0x00010804, { 0xD802, 0xDC04 } },
928
{ 0x00010808, { 0xD802, 0xDC08 } },
929
{ 0x00010810, { 0xD802, 0xDC10 } },
930
{ 0x00010820, { 0xD802, 0xDC20 } },
931
{ 0x00010840, { 0xD802, 0xDC40 } },
932
{ 0x00010880, { 0xD802, 0xDC80 } },
933
{ 0x00010900, { 0xD802, 0xDD00 } },
934
{ 0x00010A00, { 0xD802, 0xDE00 } },
935
{ 0x00010C00, { 0xD803, 0xDC00 } },
936
{ 0x00010FFF, { 0xD803, 0xDFFF } },
937
{ 0x00011000, { 0xD804, 0xDC00 } },
938
{ 0x00011001, { 0xD804, 0xDC01 } },
939
{ 0x00011002, { 0xD804, 0xDC02 } },
940
{ 0x00011004, { 0xD804, 0xDC04 } },
941
{ 0x00011008, { 0xD804, 0xDC08 } },
942
{ 0x00011010, { 0xD804, 0xDC10 } },
943
{ 0x00011020, { 0xD804, 0xDC20 } },
944
{ 0x00011040, { 0xD804, 0xDC40 } },
945
{ 0x00011080, { 0xD804, 0xDC80 } },
946
{ 0x00011100, { 0xD804, 0xDD00 } },
947
{ 0x00011200, { 0xD804, 0xDE00 } },
948
{ 0x00011400, { 0xD805, 0xDC00 } },
949
{ 0x00011800, { 0xD806, 0xDC00 } },
950
{ 0x00011FFF, { 0xD807, 0xDFFF } },
951
{ 0x00012000, { 0xD808, 0xDC00 } },
952
{ 0x00012001, { 0xD808, 0xDC01 } },
953
{ 0x00012002, { 0xD808, 0xDC02 } },
954
{ 0x00012004, { 0xD808, 0xDC04 } },
955
{ 0x00012008, { 0xD808, 0xDC08 } },
956
{ 0x00012010, { 0xD808, 0xDC10 } },
957
{ 0x00012020, { 0xD808, 0xDC20 } },
958
{ 0x00012040, { 0xD808, 0xDC40 } },
959
{ 0x00012080, { 0xD808, 0xDC80 } },
960
{ 0x00012100, { 0xD808, 0xDD00 } },
961
{ 0x00012200, { 0xD808, 0xDE00 } },
962
{ 0x00012400, { 0xD809, 0xDC00 } },
963
{ 0x00012800, { 0xD80A, 0xDC00 } },
964
{ 0x00013000, { 0xD80C, 0xDC00 } },
965
{ 0x00013FFF, { 0xD80F, 0xDFFF } },
966
{ 0x00014000, { 0xD810, 0xDC00 } },
967
{ 0x00014001, { 0xD810, 0xDC01 } },
968
{ 0x00014002, { 0xD810, 0xDC02 } },
969
{ 0x00014004, { 0xD810, 0xDC04 } },
970
{ 0x00014008, { 0xD810, 0xDC08 } },
971
{ 0x00014010, { 0xD810, 0xDC10 } },
972
{ 0x00014020, { 0xD810, 0xDC20 } },
973
{ 0x00014040, { 0xD810, 0xDC40 } },
974
{ 0x00014080, { 0xD810, 0xDC80 } },
975
{ 0x00014100, { 0xD810, 0xDD00 } },
976
{ 0x00014200, { 0xD810, 0xDE00 } },
977
{ 0x00014400, { 0xD811, 0xDC00 } },
978
{ 0x00014800, { 0xD812, 0xDC00 } },
979
{ 0x00015000, { 0xD814, 0xDC00 } },
980
{ 0x00016000, { 0xD818, 0xDC00 } },
981
{ 0x00017FFF, { 0xD81F, 0xDFFF } },
982
{ 0x00018000, { 0xD820, 0xDC00 } },
983
{ 0x00018001, { 0xD820, 0xDC01 } },
984
{ 0x00018002, { 0xD820, 0xDC02 } },
985
{ 0x00018004, { 0xD820, 0xDC04 } },
986
{ 0x00018008, { 0xD820, 0xDC08 } },
987
{ 0x00018010, { 0xD820, 0xDC10 } },
988
{ 0x00018020, { 0xD820, 0xDC20 } },
989
{ 0x00018040, { 0xD820, 0xDC40 } },
990
{ 0x00018080, { 0xD820, 0xDC80 } },
991
{ 0x00018100, { 0xD820, 0xDD00 } },
992
{ 0x00018200, { 0xD820, 0xDE00 } },
993
{ 0x00018400, { 0xD821, 0xDC00 } },
994
{ 0x00018800, { 0xD822, 0xDC00 } },
995
{ 0x00019000, { 0xD824, 0xDC00 } },
996
{ 0x0001A000, { 0xD828, 0xDC00 } },
997
{ 0x0001C000, { 0xD830, 0xDC00 } },
998
{ 0x0001FFFF, { 0xD83F, 0xDFFF } },
999
{ 0x00020000, { 0xD840, 0xDC00 } },
1000
{ 0x00020001, { 0xD840, 0xDC01 } },
1001
{ 0x00020002, { 0xD840, 0xDC02 } },
1002
{ 0x00020004, { 0xD840, 0xDC04 } },
1003
{ 0x00020008, { 0xD840, 0xDC08 } },
1004
{ 0x00020010, { 0xD840, 0xDC10 } },
1005
{ 0x00020020, { 0xD840, 0xDC20 } },
1006
{ 0x00020040, { 0xD840, 0xDC40 } },
1007
{ 0x00020080, { 0xD840, 0xDC80 } },
1008
{ 0x00020100, { 0xD840, 0xDD00 } },
1009
{ 0x00020200, { 0xD840, 0xDE00 } },
1010
{ 0x00020400, { 0xD841, 0xDC00 } },
1011
{ 0x00020800, { 0xD842, 0xDC00 } },
1012
{ 0x00021000, { 0xD844, 0xDC00 } },
1013
{ 0x00022000, { 0xD848, 0xDC00 } },
1014
{ 0x00024000, { 0xD850, 0xDC00 } },
1015
{ 0x00028000, { 0xD860, 0xDC00 } },
1016
{ 0x0002FFFF, { 0xD87F, 0xDFFF } },
1017
{ 0x00030000, { 0xD880, 0xDC00 } },
1018
{ 0x00030001, { 0xD880, 0xDC01 } },
1019
{ 0x00030002, { 0xD880, 0xDC02 } },
1020
{ 0x00030004, { 0xD880, 0xDC04 } },
1021
{ 0x00030008, { 0xD880, 0xDC08 } },
1022
{ 0x00030010, { 0xD880, 0xDC10 } },
1023
{ 0x00030020, { 0xD880, 0xDC20 } },
1024
{ 0x00030040, { 0xD880, 0xDC40 } },
1025
{ 0x00030080, { 0xD880, 0xDC80 } },
1026
{ 0x00030100, { 0xD880, 0xDD00 } },
1027
{ 0x00030200, { 0xD880, 0xDE00 } },
1028
{ 0x00030400, { 0xD881, 0xDC00 } },
1029
{ 0x00030800, { 0xD882, 0xDC00 } },
1030
{ 0x00031000, { 0xD884, 0xDC00 } },
1031
{ 0x00032000, { 0xD888, 0xDC00 } },
1032
{ 0x00034000, { 0xD890, 0xDC00 } },
1033
{ 0x00038000, { 0xD8A0, 0xDC00 } },
1034
{ 0x0003FFFF, { 0xD8BF, 0xDFFF } },
1035
{ 0x00040000, { 0xD8C0, 0xDC00 } },
1036
{ 0x00040001, { 0xD8C0, 0xDC01 } },
1037
{ 0x00040002, { 0xD8C0, 0xDC02 } },
1038
{ 0x00040004, { 0xD8C0, 0xDC04 } },
1039
{ 0x00040008, { 0xD8C0, 0xDC08 } },
1040
{ 0x00040010, { 0xD8C0, 0xDC10 } },
1041
{ 0x00040020, { 0xD8C0, 0xDC20 } },
1042
{ 0x00040040, { 0xD8C0, 0xDC40 } },
1043
{ 0x00040080, { 0xD8C0, 0xDC80 } },
1044
{ 0x00040100, { 0xD8C0, 0xDD00 } },
1045
{ 0x00040200, { 0xD8C0, 0xDE00 } },
1046
{ 0x00040400, { 0xD8C1, 0xDC00 } },
1047
{ 0x00040800, { 0xD8C2, 0xDC00 } },
1048
{ 0x00041000, { 0xD8C4, 0xDC00 } },
1049
{ 0x00042000, { 0xD8C8, 0xDC00 } },
1050
{ 0x00044000, { 0xD8D0, 0xDC00 } },
1051
{ 0x00048000, { 0xD8E0, 0xDC00 } },
1052
{ 0x0004FFFF, { 0xD8FF, 0xDFFF } },
1053
{ 0x00050000, { 0xD900, 0xDC00 } },
1054
{ 0x00050001, { 0xD900, 0xDC01 } },
1055
{ 0x00050002, { 0xD900, 0xDC02 } },
1056
{ 0x00050004, { 0xD900, 0xDC04 } },
1057
{ 0x00050008, { 0xD900, 0xDC08 } },
1058
{ 0x00050010, { 0xD900, 0xDC10 } },
1059
{ 0x00050020, { 0xD900, 0xDC20 } },
1060
{ 0x00050040, { 0xD900, 0xDC40 } },
1061
{ 0x00050080, { 0xD900, 0xDC80 } },
1062
{ 0x00050100, { 0xD900, 0xDD00 } },
1063
{ 0x00050200, { 0xD900, 0xDE00 } },
1064
{ 0x00050400, { 0xD901, 0xDC00 } },
1065
{ 0x00050800, { 0xD902, 0xDC00 } },
1066
{ 0x00051000, { 0xD904, 0xDC00 } },
1067
{ 0x00052000, { 0xD908, 0xDC00 } },
1068
{ 0x00054000, { 0xD910, 0xDC00 } },
1069
{ 0x00058000, { 0xD920, 0xDC00 } },
1070
{ 0x00060000, { 0xD940, 0xDC00 } },
1071
{ 0x00070000, { 0xD980, 0xDC00 } },
1072
{ 0x0007FFFF, { 0xD9BF, 0xDFFF } },
1073
{ 0x00080000, { 0xD9C0, 0xDC00 } },
1074
{ 0x00080001, { 0xD9C0, 0xDC01 } },
1075
{ 0x00080002, { 0xD9C0, 0xDC02 } },
1076
{ 0x00080004, { 0xD9C0, 0xDC04 } },
1077
{ 0x00080008, { 0xD9C0, 0xDC08 } },
1078
{ 0x00080010, { 0xD9C0, 0xDC10 } },
1079
{ 0x00080020, { 0xD9C0, 0xDC20 } },
1080
{ 0x00080040, { 0xD9C0, 0xDC40 } },
1081
{ 0x00080080, { 0xD9C0, 0xDC80 } },
1082
{ 0x00080100, { 0xD9C0, 0xDD00 } },
1083
{ 0x00080200, { 0xD9C0, 0xDE00 } },
1084
{ 0x00080400, { 0xD9C1, 0xDC00 } },
1085
{ 0x00080800, { 0xD9C2, 0xDC00 } },
1086
{ 0x00081000, { 0xD9C4, 0xDC00 } },
1087
{ 0x00082000, { 0xD9C8, 0xDC00 } },
1088
{ 0x00084000, { 0xD9D0, 0xDC00 } },
1089
{ 0x00088000, { 0xD9E0, 0xDC00 } },
1090
{ 0x0008FFFF, { 0xD9FF, 0xDFFF } },
1091
{ 0x00090000, { 0xDA00, 0xDC00 } },
1092
{ 0x00090001, { 0xDA00, 0xDC01 } },
1093
{ 0x00090002, { 0xDA00, 0xDC02 } },
1094
{ 0x00090004, { 0xDA00, 0xDC04 } },
1095
{ 0x00090008, { 0xDA00, 0xDC08 } },
1096
{ 0x00090010, { 0xDA00, 0xDC10 } },
1097
{ 0x00090020, { 0xDA00, 0xDC20 } },
1098
{ 0x00090040, { 0xDA00, 0xDC40 } },
1099
{ 0x00090080, { 0xDA00, 0xDC80 } },
1100
{ 0x00090100, { 0xDA00, 0xDD00 } },
1101
{ 0x00090200, { 0xDA00, 0xDE00 } },
1102
{ 0x00090400, { 0xDA01, 0xDC00 } },
1103
{ 0x00090800, { 0xDA02, 0xDC00 } },
1104
{ 0x00091000, { 0xDA04, 0xDC00 } },
1105
{ 0x00092000, { 0xDA08, 0xDC00 } },
1106
{ 0x00094000, { 0xDA10, 0xDC00 } },
1107
{ 0x00098000, { 0xDA20, 0xDC00 } },
1108
{ 0x000A0000, { 0xDA40, 0xDC00 } },
1109
{ 0x000B0000, { 0xDA80, 0xDC00 } },
1110
{ 0x000C0000, { 0xDAC0, 0xDC00 } },
1111
{ 0x000D0000, { 0xDB00, 0xDC00 } },
1112
{ 0x000FFFFF, { 0xDBBF, 0xDFFF } },
1113
{ 0x0010FFFF, { 0xDBFF, 0xDFFF } }
1117
/* illegal utf8 sequences */
1118
char *utf8_bad[] = {
1127
"\xF8\x80\x80\x80\x80",
1128
"\xF8\x88\x80\x80\x80",
1129
"\xF8\x92\x80\x80\x80",
1130
"\xF8\x9F\xBF\xBF\xBF",
1131
"\xF8\xA0\x80\x80\x80",
1132
"\xF8\xA8\x80\x80\x80",
1133
"\xF8\xB0\x80\x80\x80",
1134
"\xF8\xBF\xBF\xBF\xBF",
1135
"\xF9\x80\x80\x80\x88",
1136
"\xF9\x84\x80\x80\x80",
1137
"\xF9\xBF\xBF\xBF\xBF",
1138
"\xFA\x80\x80\x80\x80",
1139
"\xFA\x90\x80\x80\x80",
1140
"\xFB\xBF\xBF\xBF\xBF",
1141
"\xFC\x84\x80\x80\x80\x81",
1142
"\xFC\x85\x80\x80\x80\x80",
1143
"\xFC\x86\x80\x80\x80\x80",
1144
"\xFC\x87\xBF\xBF\xBF\xBF",
1145
"\xFC\x88\xA0\x80\x80\x80",
1146
"\xFC\x89\x80\x80\x80\x80",
1147
"\xFC\x8A\x80\x80\x80\x80",
1148
"\xFC\x90\x80\x80\x80\x82",
1149
"\xFD\x80\x80\x80\x80\x80",
1150
"\xFD\xBF\xBF\xBF\xBF\xBF",
1157
"\xED\xA0\x80\xE0\xBF\xBF",
1164
unsigned char *utf8,
1168
fprintf(stdout, "%s ", word);
1169
for( ; *utf8; utf8++ ) {
1170
fprintf(stdout, "%02.2x ", (unsigned int)*utf8);
1172
fprintf(stdout, "%s", end);
1181
/*
 * Per-entry round trip over the ucs4[] table (presumably the body of
 * test_ucs4_chars(); the function header is not visible in this excerpt).
 * For each entry: convert e->c to UTF-8 and compare the bytes with e->utf8,
 * then convert the produced UTF-8 back and compare the value with e->c.
 * Problems are reported on stdout.
 */
PRBool rv = PR_TRUE;
1184
for( i = 0; i < sizeof(ucs4)/sizeof(ucs4[0]); i++ ) {
1185
struct ucs4 *e = &ucs4[i];
1187
unsigned char utf8[8];
1188
unsigned int len = 0;
1191
(void)memset(utf8, 0, sizeof(utf8));
1193
/* UCS-4 -> UTF-8: the table value is the input, utf8/len receive the output. */
result = sec_port_ucs4_utf8_conversion_function(PR_FALSE,
1194
(unsigned char *)&e->c, sizeof(e->c), utf8, sizeof(utf8), &len);
1197
fprintf(stdout, "Failed to convert UCS-4 0x%08.8x to UTF-8\n", e->c);
1202
/*
 * The bounds test is evaluated first, so the utf8[len] = '\0' store in the
 * last operand only happens when len indexes inside the buffer.
 */
if( (len >= sizeof(utf8)) ||
1203
(strlen(e->utf8) != len) ||
1204
(utf8[len] = '\0', 0 != strcmp(e->utf8, utf8)) ) {
1205
fprintf(stdout, "Wrong conversion of UCS-4 0x%08.8x to UTF-8: ", e->c);
1206
dump_utf8("expected", e->utf8, ", ");
1207
dump_utf8("received", utf8, "\n");
1212
/*
 * UTF-8 -> UCS-4: len is passed as the input length and &len as the place
 * for the result length, which is compared with sizeof(back) below.
 */
result = sec_port_ucs4_utf8_conversion_function(PR_TRUE,
1213
utf8, len, (unsigned char *)&back, sizeof(back), &len);
1216
dump_utf8("Failed to convert UTF-8", utf8, "to UCS-4\n");
1221
if( (sizeof(back) != len) || (e->c != back) ) {
1222
dump_utf8("Wrong conversion of UTF-8", utf8, " to UCS-4:");
1223
fprintf(stdout, "expected 0x%08.8x, received 0x%08.8x\n", e->c, back);
1238
/*
 * Per-entry round trip over the ucs2[] table (presumably the body of
 * test_ucs2_chars(); the function header is not visible in this excerpt).
 * For each entry: convert e->c to UTF-8 and compare the bytes with e->utf8,
 * then convert the produced UTF-8 back and compare the value with e->c.
 * Problems are reported on stdout.
 */
PRBool rv = PR_TRUE;
1241
for( i = 0; i < sizeof(ucs2)/sizeof(ucs2[0]); i++ ) {
1242
struct ucs2 *e = &ucs2[i];
1244
unsigned char utf8[8];
1245
unsigned int len = 0;
1248
(void)memset(utf8, 0, sizeof(utf8));
1250
/* UCS-2 -> UTF-8: the table value is the input, utf8/len receive the output. */
result = sec_port_ucs2_utf8_conversion_function(PR_FALSE,
1251
(unsigned char *)&e->c, sizeof(e->c), utf8, sizeof(utf8), &len);
1254
fprintf(stdout, "Failed to convert UCS-2 0x%04.4x to UTF-8\n", e->c);
1259
/*
 * The bounds test is evaluated first, so the utf8[len] = '\0' store in the
 * last operand only happens when len indexes inside the buffer.
 */
if( (len >= sizeof(utf8)) ||
1260
(strlen(e->utf8) != len) ||
1261
(utf8[len] = '\0', 0 != strcmp(e->utf8, utf8)) ) {
1262
fprintf(stdout, "Wrong conversion of UCS-2 0x%04.4x to UTF-8: ", e->c);
1263
dump_utf8("expected", e->utf8, ", ");
1264
dump_utf8("received", utf8, "\n");
1269
/*
 * UTF-8 -> UCS-2: len is passed as the input length and &len as the place
 * for the result length, which is compared with sizeof(back) below.
 */
result = sec_port_ucs2_utf8_conversion_function(PR_TRUE,
1270
utf8, len, (unsigned char *)&back, sizeof(back), &len);
1273
dump_utf8("Failed to convert UTF-8", utf8, "to UCS-2\n");
1278
if( (sizeof(back) != len) || (e->c != back) ) {
1279
dump_utf8("Wrong conversion of UTF-8", utf8, "to UCS-2:");
1280
/*
 * NOTE(review): this message pads to eight hex digits although the other
 * UCS-2 messages in this loop print four; looks copied from the UCS-4 test.
 */
fprintf(stdout, "expected 0x%08.8x, received 0x%08.8x\n", e->c, back);
1295
/*
 * Cross-check over the utf16[] table (presumably the body of
 * test_utf16_chars(); the function header is not visible in this excerpt).
 * For each entry, two chains are exercised:
 *   1. UTF-16 pair e->w  -> UTF-8 (UCS-2 converter) -> UCS-4, compared with e->c;
 *   2. UCS-4 value e->c  -> UTF-8 -> UTF-16 (UCS-2 converter), compared with e->w.
 * Problems are reported on stdout.
 */
PRBool rv = PR_TRUE;
1298
for( i = 0; i < sizeof(utf16)/sizeof(utf16[0]); i++ ) {
1299
struct utf16 *e = &utf16[i];
1301
unsigned char utf8[8];
1302
unsigned int len = 0;
1303
PRUint32 back32 = 0;
1306
(void)memset(utf8, 0, sizeof(utf8));
1308
/* Chain 1, step 1: both 16-bit words of the pair are handed to the UCS-2 converter. */
result = sec_port_ucs2_utf8_conversion_function(PR_FALSE,
1309
(unsigned char *)&e->w[0], sizeof(e->w), utf8, sizeof(utf8), &len);
1312
fprintf(stdout, "Failed to convert UTF-16 0x%04.4x 0x%04.4x to UTF-8\n",
1318
/* Chain 1, step 2: decode the UTF-8 just produced as UCS-4; &len receives the result length. */
result = sec_port_ucs4_utf8_conversion_function(PR_TRUE,
1319
utf8, len, (unsigned char *)&back32, sizeof(back32), &len);
1322
fprintf(stdout, "Failed to convert UTF-16 0x%04.4x 0x%04.4x to UTF-8: "
1323
"unexpected len %u\n", e->w[0], e->w[1], len); /* %u: len is unsigned int */
1328
utf8[len] = '\0'; /* null-terminate for printing */
1331
dump_utf8("Failed to convert UTF-8", utf8, "to UCS-4 (utf-16 test)\n");
1336
if( (sizeof(back32) != len) || (e->c != back32) ) {
1337
fprintf(stdout, "Wrong conversion of UTF-16 0x%04.4x 0x%04.4x ",
1339
dump_utf8("to UTF-8", utf8, "and then to UCS-4: ");
1340
if( sizeof(back32) != len ) {
1341
fprintf(stdout, "len is %u\n", len);
1343
fprintf(stdout, "expected 0x%08.8x, received 0x%08.8x\n", e->c, back32);
1349
(void)memset(utf8, 0, sizeof(utf8));
1350
back[0] = back[1] = 0;
1352
/* Chain 2, step 1: encode the UCS-4 value as UTF-8. */
result = sec_port_ucs4_utf8_conversion_function(PR_FALSE,
1353
(unsigned char *)&e->c, sizeof(e->c), utf8, sizeof(utf8), &len);
1356
fprintf(stdout, "Failed to convert UCS-4 0x%08.8x to UTF-8 (utf-16 test)\n",
1362
/* Chain 2, step 2: decode that UTF-8 with the UCS-2 converter into the two-word back[] buffer. */
result = sec_port_ucs2_utf8_conversion_function(PR_TRUE,
1363
utf8, len, (unsigned char *)&back[0], sizeof(back), &len);
1366
fprintf(stdout, "Failed to convert UCS-4 0x%08.8x to UTF-8: "
1367
"unexpected len %u\n", e->c, len);
1372
utf8[len] = '\0'; /* null-terminate for printing */
1375
dump_utf8("Failed to convert UTF-8", utf8, "to UTF-16\n");
1380
if( (sizeof(back) != len) || (e->w[0] != back[0]) || (e->w[1] != back[1]) ) {
1381
fprintf(stdout, "Wrong conversion of UCS-4 0x%08.8x to UTF-8", e->c);
1382
dump_utf8("", utf8, "and then to UTF-16:");
1383
if( sizeof(back) != len ) {
1384
fprintf(stdout, "len is %u\n", len);
1386
/* Four values, four conversions; the last one previously ended in a stray literal 'x'. */
fprintf(stdout, "expected 0x%04.4x 0x%04.4x, received 0x%04.4x 0x%04.4x\n",
1387
e->w[0], e->w[1], back[0], back[1]);
1403
/*
 * Negative test over the utf8_bad[] table (presumably the body of
 * test_utf8_bad_chars(); the function header is not visible in this
 * excerpt).  Every entry is run through the UTF-8 -> UCS-2 and then the
 * UTF-8 -> UCS-4 conversion; the messages below are for entries that were
 * not detected as bad.
 */
PRBool rv = PR_TRUE;
1406
for( i = 0; i < sizeof(utf8_bad)/sizeof(utf8_bad[0]); i++ ) {
1408
unsigned char destbuf[30];
1409
unsigned int len = 0;
1411
/* The input length is strlen() of the entry, so the terminating NUL is not part of the input. */
result = sec_port_ucs2_utf8_conversion_function(PR_TRUE,
1412
(unsigned char *)utf8_bad[i], strlen(utf8_bad[i]), destbuf, sizeof(destbuf), &len);
1415
dump_utf8("Failed to detect bad UTF-8 string converting to UCS2: ", utf8_bad[i], "\n");
1419
/* Same input again, this time through the UCS-4 converter; destbuf is reused. */
result = sec_port_ucs4_utf8_conversion_function(PR_TRUE,
1420
(unsigned char *)utf8_bad[i], strlen(utf8_bad[i]), destbuf, sizeof(destbuf), &len);
1423
dump_utf8("Failed to detect bad UTF-8 string converting to UCS4: ", utf8_bad[i], "\n");
1439
/*
 * ISO-8859-1 -> UTF-8 check driven by the ucs2[] table (presumably the body
 * of test_iso88591_chars(); the function header is not visible in this
 * excerpt).  Each table value that fits in one byte is converted and the
 * result compared with the entry's expected UTF-8 string.
 */
PRBool rv = PR_TRUE;
1442
for( i = 0; i < sizeof(ucs2)/sizeof(ucs2[0]); i++ ) {
1443
struct ucs2 *e = &ucs2[i];
1445
unsigned char iso88591;
1446
/* The check below rejects len >= sizeof(utf8), so at most two output bytes plus the NUL are accepted. */
unsigned char utf8[3];
1447
unsigned int len = 0;
1449
/* Values above 0xFF cannot be stored in the single-byte input; skip them. */
if (ntohs(e->c) > 0xFF) continue;
1451
(void)memset(utf8, 0, sizeof(utf8));
1452
iso88591 = ntohs(e->c);
1454
/* Convert exactly one input byte. */
result = sec_port_iso88591_utf8_conversion_function(&iso88591,
1455
1, utf8, sizeof(utf8), &len);
1458
fprintf(stdout, "Failed to convert ISO-8859-1 0x%02.2x to UTF-8\n", iso88591);
1463
/*
 * The bounds test is evaluated first, so the utf8[len] = '\0' store in the
 * last operand only happens when len indexes inside the buffer.
 */
if( (len >= sizeof(utf8)) ||
1464
(strlen(e->utf8) != len) ||
1465
(utf8[len] = '\0', 0 != strcmp(e->utf8, utf8)) ) {
1466
fprintf(stdout, "Wrong conversion of ISO-8859-1 0x%02.2x to UTF-8: ", iso88591);
1467
dump_utf8("expected", e->utf8, ", ");
1468
dump_utf8("received", utf8, "\n");
1484
/*
 * Zero-value checks (function header not visible in this excerpt).  Four
 * conversions are exercised: UCS-4 zero -> UTF-8, a single 0x00 byte ->
 * UCS-4, UCS-2 zero -> UTF-8, and a single 0x00 byte -> UCS-2.  For each,
 * the result length and the produced value are checked and problems are
 * reported on stdout.
 */
PRBool rv = PR_TRUE;
1488
unsigned char utf8[8];
1489
unsigned int len = 0;
1493
/* Pre-fill with a non-zero byte so that a written 0x00 can be told apart from untouched memory. */
(void)memset(utf8, 1, sizeof(utf8));
1495
result = sec_port_ucs4_utf8_conversion_function(PR_FALSE,
1496
(unsigned char *)&lzero, sizeof(lzero), utf8, sizeof(utf8), &len);
1499
fprintf(stdout, "Failed to convert UCS-4 0x00000000 to UTF-8\n");
1501
} else if( 1 != len ) {
1502
fprintf(stdout, "Wrong conversion of UCS-4 0x00000000: len = %u\n", len); /* %u: len is unsigned int */
1504
} else if( '\0' != *utf8 ) {
1505
fprintf(stdout, "Wrong conversion of UCS-4 0x00000000: expected 00, "
1506
"received %02.2x\n", (unsigned int)*utf8);
1510
/* The input is the one-byte string consisting only of the NUL of "" (length 1). */
result = sec_port_ucs4_utf8_conversion_function(PR_TRUE,
1511
"", 1, (unsigned char *)&lback, sizeof(lback), &len);
1514
fprintf(stdout, "Failed to convert UTF-8 00 to UCS-4\n");
1516
} else if( 4 != len ) {
1517
fprintf(stdout, "Wrong conversion of UTF-8 00 to UCS-4: len = %u\n", len);
1519
} else if( 0 != lback ) {
1520
fprintf(stdout, "Wrong conversion of UTF-8 00 to UCS-4: "
1521
"expected 0x00000000, received 0x%08.8x\n", lback);
1525
/* Same pre-fill before the UCS-2 variant of the test. */
(void)memset(utf8, 1, sizeof(utf8));
1527
result = sec_port_ucs2_utf8_conversion_function(PR_FALSE,
1528
(unsigned char *)&szero, sizeof(szero), utf8, sizeof(utf8), &len);
1531
fprintf(stdout, "Failed to convert UCS-2 0x0000 to UTF-8\n");
1533
} else if( 1 != len ) {
1534
fprintf(stdout, "Wrong conversion of UCS-2 0x0000: len = %u\n", len);
1536
} else if( '\0' != *utf8 ) {
1537
fprintf(stdout, "Wrong conversion of UCS-2 0x0000: expected 00, "
1538
"received %02.2x\n", (unsigned int)*utf8);
1542
result = sec_port_ucs2_utf8_conversion_function(PR_TRUE,
1543
"", 1, (unsigned char *)&sback, sizeof(sback), &len);
1546
fprintf(stdout, "Failed to convert UTF-8 00 to UCS-2\n");
1548
} else if( 2 != len ) {
1549
fprintf(stdout, "Wrong conversion of UTF-8 00 to UCS-2: len = %u\n", len);
1551
} else if( 0 != sback ) {
1552
fprintf(stdout, "Wrong conversion of UTF-8 00 to UCS-2: "
1553
"expected 0x0000, received 0x%04.4x\n", sback);
1567
/*
 * Bulk conversion test (presumably the body of test_multichars(); the
 * function header is not visible in this excerpt).  The values of the
 * ucs4[] and ucs2[] tables are copied into flat arrays and their expected
 * UTF-8 strings are concatenated; each whole buffer is then converted in a
 * single call, in both directions, and compared with its counterpart.
 * Allocation failures are reported on stderr, test failures on stdout.
 */
unsigned int len, lenout;
1575
ucs4s = (PRUint32 *)calloc(sizeof(ucs4)/sizeof(ucs4[0]), sizeof(PRUint32));
1576
ucs2s = (PRUint16 *)calloc(sizeof(ucs2)/sizeof(ucs2[0]), sizeof(PRUint16));
1578
if( ((PRUint32 *)NULL == ucs4s) || ((PRUint16 *)NULL == ucs2s) ) {
1579
fprintf(stderr, "out of memory\n");
1584
/* Copy the values and add up the UTF-8 content length (len is assumed to be reset on a line not visible here). */
for( i = 0; i < sizeof(ucs4)/sizeof(ucs4[0]); i++ ) {
1585
ucs4s[i] = ucs4[i].c;
1586
len += strlen(ucs4[i].utf8);
1589
/* +1: the buffer is filled with strcat() and measured with strlen(), so it needs room for the terminating NUL. */
ucs4_utf8 = (char *)malloc(len + 1);
1592
for( i = 0; i < sizeof(ucs2)/sizeof(ucs2[0]); i++ ) {
1593
ucs2s[i] = ucs2[i].c;
1594
len += strlen(ucs2[i].utf8);
1597
/* +1 for the terminating NUL, as above. */
ucs2_utf8 = (char *)malloc(len + 1);
1599
if( ((char *)NULL == ucs4_utf8) || ((char *)NULL == ucs2_utf8) ) {
1600
fprintf(stderr, "out of memory\n");
1605
/* Build the concatenated expected UTF-8 strings. */
for( i = 0; i < sizeof(ucs4)/sizeof(ucs4[0]); i++ ) {
1606
strcat(ucs4_utf8, ucs4[i].utf8);
1610
for( i = 0; i < sizeof(ucs2)/sizeof(ucs2[0]); i++ ) {
1611
strcat(ucs2_utf8, ucs2[i].utf8);
1614
/* UTF-8 -> UCS-4: the output must be exactly as large as the ucs4s array and equal to it bytewise. */
1615
len = sizeof(ucs4)/sizeof(ucs4[0]) * sizeof(PRUint32);
1616
tmp = calloc(len, 1);
1617
if( (void *)NULL == tmp ) {
1618
fprintf(stderr, "out of memory\n");
1622
result = sec_port_ucs4_utf8_conversion_function(PR_TRUE,
1623
ucs4_utf8, strlen(ucs4_utf8), tmp, len, &lenout);
1625
fprintf(stdout, "Failed to convert much UTF-8 to UCS-4\n");
1629
if( lenout != len ) {
1630
fprintf(stdout, "Unexpected length converting much UTF-8 to UCS-4\n");
1634
if( 0 != memcmp(ucs4s, tmp, len) ) {
1635
fprintf(stdout, "Wrong conversion of much UTF-8 to UCS-4\n");
1639
free(tmp); tmp = (void *)NULL;
1641
/* UCS-4 -> UTF-8: the output must have the length of the concatenated string and match it. */
1642
len = strlen(ucs4_utf8);
1643
tmp = calloc(len, 1);
1644
if( (void *)NULL == tmp ) {
1645
fprintf(stderr, "out of memory\n");
1649
result = sec_port_ucs4_utf8_conversion_function(PR_FALSE,
1650
(unsigned char *)ucs4s, sizeof(ucs4)/sizeof(ucs4[0]) * sizeof(PRUint32),
1653
fprintf(stdout, "Failed to convert much UCS-4 to UTF-8\n");
1657
if( lenout != len ) {
1658
fprintf(stdout, "Unexpected length converting much UCS-4 to UTF-8\n");
1662
if( 0 != strncmp(ucs4_utf8, tmp, len) ) {
1663
fprintf(stdout, "Wrong conversion of much UCS-4 to UTF-8\n");
1667
free(tmp); tmp = (void *)NULL;
1669
/* UTF-8 -> UCS-2: same check as for UCS-4, against the ucs2s array. */
1670
len = sizeof(ucs2)/sizeof(ucs2[0]) * sizeof(PRUint16);
1671
tmp = calloc(len, 1);
1672
if( (void *)NULL == tmp ) {
1673
fprintf(stderr, "out of memory\n");
1677
result = sec_port_ucs2_utf8_conversion_function(PR_TRUE,
1678
ucs2_utf8, strlen(ucs2_utf8), tmp, len, &lenout);
1680
fprintf(stdout, "Failed to convert much UTF-8 to UCS-2\n");
1684
if( lenout != len ) {
1685
fprintf(stdout, "Unexpected length converting much UTF-8 to UCS-2\n");
1689
if( 0 != memcmp(ucs2s, tmp, len) ) {
1690
fprintf(stdout, "Wrong conversion of much UTF-8 to UCS-2\n");
1694
free(tmp); tmp = (void *)NULL;
1696
/* UCS-2 -> UTF-8: the output must have the length of the concatenated string and match it. */
1697
len = strlen(ucs2_utf8);
1698
tmp = calloc(len, 1);
1699
if( (void *)NULL == tmp ) {
1700
fprintf(stderr, "out of memory\n");
1704
result = sec_port_ucs2_utf8_conversion_function(PR_FALSE,
1705
(unsigned char *)ucs2s, sizeof(ucs2)/sizeof(ucs2[0]) * sizeof(PRUint16),
1708
fprintf(stdout, "Failed to convert much UCS-2 to UTF-8\n");
1712
if( lenout != len ) {
1713
fprintf(stdout, "Unexpected length converting much UCS-2 to UTF-8\n");
1717
if( 0 != strncmp(ucs2_utf8, tmp, len) ) {
1718
fprintf(stdout, "Wrong conversion of much UCS-2 to UTF-8\n");
1722
/* implement UTF16 */
1734
/* tmp is reset to NULL after each successful section, so this releases only a buffer left over from an early exit. */
if( (void *)NULL != tmp ) free(tmp);
1745
* The implementation (now) expects the 16- and 32-bit characters
1746
* to be in network byte order, not host byte order. Therefore I
1747
* have to byteswap all those test vectors above. hton[ls] may be
1748
* functions, so I have to do this dynamically. If you want to
1749
* use this code to do host byte order conversions, just remove
1750
* the call in main() to this function.
1755
/*
 * Walk the three test-vector tables (ucs4[], ucs2[], utf16[]) in turn.
 * Only the utf16[] loop body is fully visible here: it rewrites both 16-bit
 * words of each entry with htons().  The bodies of the ucs4[] and ucs2[]
 * loops are on lines not visible in this excerpt (presumably the matching
 * htonl()/htons() calls on e->c; confirm against the full file).
 */
for( i = 0; i < sizeof(ucs4)/sizeof(ucs4[0]); i++ ) {
1756
struct ucs4 *e = &ucs4[i];
1760
for( i = 0; i < sizeof(ucs2)/sizeof(ucs2[0]); i++ ) {
1761
struct ucs2 *e = &ucs2[i];
1765
for( i = 0; i < sizeof(utf16)/sizeof(utf16[0]); i++ ) {
1766
struct utf16 *e = &utf16[i];
1768
e->w[0] = htons(e->w[0]);
1769
e->w[1] = htons(e->w[1]);
1784
/*
 * Run the test functions in order.  They are chained with &&, so once one
 * of them returns false the remaining ones are not called.  The verdict is
 * written to stderr.  The last operand of the chain and the branch structure
 * around the PASS/FAIL lines are not visible in this excerpt.
 */
if( test_ucs4_chars() &&
1785
test_ucs2_chars() &&
1786
test_utf16_chars() &&
1787
test_utf8_bad_chars() &&
1788
test_iso88591_chars() &&
1790
test_multichars() &&
1792
fprintf(stderr, "PASS\n");
1795
fprintf(stderr, "FAIL\n");
1800
#endif /* TEST_UTF8 */