1
/* Generate Unicode conforming Line Break Properties tables from a
3
Written by Bruno Haible <haible@clisp.cons.org>, 2000-2001.
5
This program is free software; you can redistribute it and/or modify
6
it under the terms of the GNU General Public License as published by
7
the Free Software Foundation; either version 2, or (at your option)
10
This program is distributed in the hope that it will be useful,
11
but WITHOUT ANY WARRANTY; without even the implied warranty of
12
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
GNU General Public License for more details.
15
You should have received a copy of the GNU General Public License
16
along with this program; if not, write to the Free Software
17
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
20
$ gen-lbrkprop /usr/local/share/Unidata/UnicodeData.txt \
21
/usr/local/share/Unidata/PropList.txt \
22
/usr/local/share/Unidata/EastAsianWidth.txt \
33
/* This structure represents one line in the UnicodeData.txt file. */
34
struct unicode_attribute
36
const char *name; /* Character name */
37
const char *category; /* General category */
38
const char *combining; /* Canonical combining classes */
39
const char *bidi; /* Bidirectional category */
40
const char *decomposition; /* Character decomposition mapping */
41
const char *decdigit; /* Decimal digit value */
42
const char *digit; /* Digit value */
43
const char *numeric; /* Numeric value */
44
int mirrored; /* mirrored */
45
const char *oldname; /* Old Unicode 1.0 name */
46
const char *comment; /* Comment */
47
unsigned int upper; /* Uppercase mapping */
48
unsigned int lower; /* Lowercase mapping */
49
unsigned int title; /* Titlecase mapping */
52
/* Missing fields are represented with "" for strings, and NONE for
54
#define NONE (~(unsigned int)0)
56
/* The entire contents of the UnicodeData.txt file. */
57
struct unicode_attribute unicode_attributes [0x10000];
59
/* Stores in unicode_attributes[i] the values from the given fields. */
61
fill_attribute (unsigned int i,
62
const char *field1, const char *field2,
63
const char *field3, const char *field4,
64
const char *field5, const char *field6,
65
const char *field7, const char *field8,
66
const char *field9, const char *field10,
67
const char *field11, const char *field12,
68
const char *field13, const char *field14)
70
struct unicode_attribute * uni;
74
fprintf (stderr, "index too large\n");
77
uni = &unicode_attributes[i];
78
/* Copy the strings. */
79
uni->name = strdup (field1);
80
uni->category = (field2[0] == '\0' ? "" : strdup (field2));
81
uni->combining = (field3[0] == '\0' ? "" : strdup (field3));
82
uni->bidi = (field4[0] == '\0' ? "" : strdup (field4));
83
uni->decomposition = (field5[0] == '\0' ? "" : strdup (field5));
84
uni->decdigit = (field6[0] == '\0' ? "" : strdup (field6));
85
uni->digit = (field7[0] == '\0' ? "" : strdup (field7));
86
uni->numeric = (field8[0] == '\0' ? "" : strdup (field8));
87
uni->mirrored = (field9[0] == 'Y');
88
uni->oldname = (field10[0] == '\0' ? "" : strdup (field10));
89
uni->comment = (field11[0] == '\0' ? "" : strdup (field11));
90
uni->upper = (field12[0] =='\0' ? NONE : strtoul (field12, NULL, 16));
91
uni->lower = (field13[0] =='\0' ? NONE : strtoul (field13, NULL, 16));
92
uni->title = (field14[0] =='\0' ? NONE : strtoul (field14, NULL, 16));
95
/* Maximum length of a field in the UnicodeData.txt file. */
98
/* Reads the next field from STREAM. The buffer BUFFER has size FIELDLEN.
99
Reads up to (but excluding) DELIM.
100
Returns 1 when a field was successfully read, otherwise 0. */
102
getfield (FILE *stream, char *buffer, int delim)
107
for (; (c = getc (stream)), (c != EOF && c != delim); )
109
/* The original unicode.org UnicodeData.txt file happens to have
110
CR/LF line terminators. Silently convert to LF. */
114
/* Put c into the buffer. */
115
if (++count >= FIELDLEN - 1)
117
fprintf (stderr, "field too long\n");
130
/* Stores in unicode_attributes[] the entire contents of the UnicodeData.txt
133
fill_attributes (const char *unicodedata_filename)
137
char field0[FIELDLEN];
138
char field1[FIELDLEN];
139
char field2[FIELDLEN];
140
char field3[FIELDLEN];
141
char field4[FIELDLEN];
142
char field5[FIELDLEN];
143
char field6[FIELDLEN];
144
char field7[FIELDLEN];
145
char field8[FIELDLEN];
146
char field9[FIELDLEN];
147
char field10[FIELDLEN];
148
char field11[FIELDLEN];
149
char field12[FIELDLEN];
150
char field13[FIELDLEN];
151
char field14[FIELDLEN];
154
for (i = 0; i < 0x10000; i++)
155
unicode_attributes[i].name = NULL;
157
stream = fopen (unicodedata_filename, "r");
160
fprintf (stderr, "error during fopen of '%s'\n", unicodedata_filename);
169
n = getfield (stream, field0, ';');
170
n += getfield (stream, field1, ';');
171
n += getfield (stream, field2, ';');
172
n += getfield (stream, field3, ';');
173
n += getfield (stream, field4, ';');
174
n += getfield (stream, field5, ';');
175
n += getfield (stream, field6, ';');
176
n += getfield (stream, field7, ';');
177
n += getfield (stream, field8, ';');
178
n += getfield (stream, field9, ';');
179
n += getfield (stream, field10, ';');
180
n += getfield (stream, field11, ';');
181
n += getfield (stream, field12, ';');
182
n += getfield (stream, field13, ';');
183
n += getfield (stream, field14, '\n');
188
/* Report a record with fewer fields than expected; the message wording
   matches the equivalent diagnostic emitted for the width file. */
fprintf (stderr, "short line in '%s':%d\n",
         unicodedata_filename, lineno);
192
i = strtoul (field0, NULL, 16);
194
&& strlen (field1) >= 9
195
&& !strcmp (field1 + strlen(field1) - 8, ", First>"))
197
/* Deal with a range. */
199
n = getfield (stream, field0, ';');
200
n += getfield (stream, field1, ';');
201
n += getfield (stream, field2, ';');
202
n += getfield (stream, field3, ';');
203
n += getfield (stream, field4, ';');
204
n += getfield (stream, field5, ';');
205
n += getfield (stream, field6, ';');
206
n += getfield (stream, field7, ';');
207
n += getfield (stream, field8, ';');
208
n += getfield (stream, field9, ';');
209
n += getfield (stream, field10, ';');
210
n += getfield (stream, field11, ';');
211
n += getfield (stream, field12, ';');
212
n += getfield (stream, field13, ';');
213
n += getfield (stream, field14, '\n');
216
fprintf (stderr, "missing end range in '%s':%d\n",
217
unicodedata_filename, lineno);
220
if (!(field1[0] == '<'
221
&& strlen (field1) >= 8
222
&& !strcmp (field1 + strlen (field1) - 7, ", Last>")))
224
fprintf (stderr, "missing end range in '%s':%d\n",
225
unicodedata_filename, lineno);
228
field1[strlen (field1) - 7] = '\0';
229
j = strtoul (field0, NULL, 16);
231
fill_attribute (i, field1+1, field2, field3, field4, field5,
232
field6, field7, field8, field9, field10,
233
field11, field12, field13, field14);
237
/* Single character line */
238
fill_attribute (i, field1, field2, field3, field4, field5,
239
field6, field7, field8, field9, field10,
240
field11, field12, field13, field14);
243
if (ferror (stream) || fclose (stream))
245
fprintf (stderr, "error reading from '%s'\n", unicodedata_filename);
250
/* The combining property from the PropList.txt file. */
251
char unicode_combining[0x10000];
253
/* Stores in unicode_combining[] the Combining property from the
254
PropList.txt file. */
256
fill_combining (const char *proplist_filename)
262
for (i = 0; i < 0x10000; i++)
263
unicode_combining[i] = 0;
265
stream = fopen (proplist_filename, "r");
268
fprintf (stderr, "error during fopen of '%s'\n", proplist_filename);
272
/* Search for the "Property dump for: 0x20000004 (Combining)" line. */
275
if (fscanf (stream, "%100[^\n]\n", buf) < 1)
277
fprintf (stderr, "no combining property found in '%s'\n",
282
while (strstr (buf, "(Combining)") == NULL);
288
if (fscanf (stream, "%100[^\n]\n", buf) < 1)
290
fprintf (stderr, "premature end of combining property in '%s'\n",
296
if (strlen (buf) >= 10 && buf[4] == '.' && buf[5] == '.')
298
if (sscanf (buf, "%4X..%4X", &i1, &i2) < 2)
300
fprintf (stderr, "parse error in combining property in '%s'\n",
305
else if (strlen (buf) >= 4)
307
if (sscanf (buf, "%4X", &i1) < 1)
309
fprintf (stderr, "parse error in combining property in '%s'\n",
317
fprintf (stderr, "parse error in combining property in '%s'\n",
321
for (i = i1; i <= i2; i++)
322
unicode_combining[i] = 1;
324
if (ferror (stream) || fclose (stream))
326
fprintf (stderr, "error reading from '%s'\n", proplist_filename);
331
/* The width property from the EastAsianWidth.txt file.
332
Each is NULL (unassigned) or "N", "A", "H", "W", "F", "Na". */
333
const char * unicode_width[0x10000];
335
/* Stores in unicode_width[] the width property from the EastAsianWidth.txt
338
fill_width (const char *width_filename)
342
char field0[FIELDLEN];
343
char field1[FIELDLEN];
344
char field2[FIELDLEN];
347
for (i = 0; i < 0x10000; i++)
348
unicode_width[i] = (unicode_attributes[i].name != NULL ? "N" : NULL);
350
stream = fopen (width_filename, "r");
353
fprintf (stderr, "error during fopen of '%s'\n", width_filename);
368
do c = getc (stream); while (c != EOF && c != '\n');
372
n = getfield (stream, field0, ';');
373
n += getfield (stream, field1, ';');
374
n += getfield (stream, field2, '\n');
379
fprintf (stderr, "short line in '%s':%d\n", width_filename, lineno);
382
i = strtoul (field0, NULL, 16);
384
&& strlen (field2) >= 9
385
&& !strcmp (field2 + strlen(field2) - 8, ", First>"))
387
/* Deal with a range. */
389
n = getfield (stream, field0, ';');
390
n += getfield (stream, field1, ';');
391
n += getfield (stream, field2, '\n');
394
fprintf (stderr, "missing end range in '%s':%d\n",
395
width_filename, lineno);
398
if (!(field2[0] == '<'
399
&& strlen (field2) >= 8
400
&& !strcmp (field2 + strlen (field2) - 7, ", Last>")))
402
fprintf (stderr, "missing end range in '%s':%d\n",
403
width_filename, lineno);
406
field2[strlen (field2) - 7] = '\0';
407
j = strtoul (field0, NULL, 16);
409
unicode_width[i] = strdup (field1);
413
/* Single character line */
414
unicode_width[i] = strdup (field1);
417
if (ferror (stream) || fclose (stream))
419
fprintf (stderr, "error reading from '%s'\n", width_filename);
424
/* Line breaking classification. */
428
/* Values >= 20 are resolved at run time. */
429
LBP_BK = 0, /* mandatory break */
430
/*LBP_CR, carriage return - not used here because it's a DOSism */
431
/*LBP_LF, line feed - not used here because it's a DOSism */
432
LBP_CM = 20, /* attached characters and combining marks */
433
/*LBP_SG, surrogates - not used here because they are not characters */
434
LBP_ZW = 1, /* zero width space */
435
LBP_IN = 2, /* inseparable */
436
LBP_GL = 3, /* non-breaking (glue) */
437
LBP_CB = 22, /* contingent break opportunity */
438
LBP_SP = 21, /* space */
439
LBP_BA = 4, /* break opportunity after */
440
LBP_BB = 5, /* break opportunity before */
441
LBP_B2 = 6, /* break opportunity before and after */
442
LBP_HY = 7, /* hyphen */
443
LBP_NS = 8, /* non starter */
444
LBP_OP = 9, /* opening punctuation */
445
LBP_CL = 10, /* closing punctuation */
446
LBP_QU = 11, /* ambiguous quotation */
447
LBP_EX = 12, /* exclamation/interrogation */
448
LBP_ID = 13, /* ideographic */
449
LBP_NU = 14, /* numeric */
450
LBP_IS = 15, /* infix separator (numeric) */
451
LBP_SY = 16, /* symbols allowing breaks */
452
LBP_AL = 17, /* ordinary alphabetic and symbol characters */
453
LBP_PR = 18, /* prefix (numeric) */
454
LBP_PO = 19, /* postfix (numeric) */
455
LBP_SA = 23, /* complex context (South East Asian) */
456
LBP_AI = 24, /* ambiguous (alphabetic or ideograph) */
457
LBP_XX = 25 /* unknown */
460
/* Returns the line breaking classification for ch, as a bit mask. */
462
get_lbp (unsigned int ch)
466
if (unicode_attributes[ch].name != NULL)
468
/* mandatory break */
469
if (ch == 0x000A || ch == 0x000D || ch == 0x0085 /* newline */
470
|| ch == 0x000C /* form feed */
471
|| ch == 0x2028 /* LINE SEPARATOR */
472
|| ch == 0x2029 /* PARAGRAPH SEPARATOR */)
475
/* zero width space */
476
if (ch == 0x200B /* ZERO WIDTH SPACE */)
480
if (ch == 0x2024 /* ONE DOT LEADER */
481
|| ch == 0x2025 /* TWO DOT LEADER */
482
|| ch == 0x2026 /* HORIZONTAL ELLIPSIS */)
485
/* non-breaking (glue) */
486
if (ch == 0xFEFF /* ZERO WIDTH NO-BREAK SPACE */
487
|| ch == 0x00A0 /* NO-BREAK SPACE */
488
|| ch == 0x202F /* NARROW NO-BREAK SPACE */
489
|| ch == 0x2007 /* FIGURE SPACE */
490
|| ch == 0x2011 /* NON-BREAKING HYPHEN */
491
|| ch == 0x0F0C /* TIBETAN MARK DELIMITER TSHEG BSTAR */)
494
/* contingent break opportunity */
495
if (ch == 0xFFFC /* OBJECT REPLACEMENT CHARACTER */)
499
if (ch == 0x0020 /* SPACE */)
502
/* break opportunity after */
503
if (ch == 0x2000 /* EN QUAD */
504
|| ch == 0x2001 /* EM QUAD */
505
|| ch == 0x2002 /* EN SPACE */
506
|| ch == 0x2003 /* EM SPACE */
507
|| ch == 0x2004 /* THREE-PER-EM SPACE */
508
|| ch == 0x2005 /* FOUR-PER-EM SPACE */
509
|| ch == 0x2006 /* SIX-PER-EM SPACE */
510
|| ch == 0x2008 /* PUNCTUATION SPACE */
511
|| ch == 0x2009 /* THIN SPACE */
512
|| ch == 0x200A /* HAIR SPACE */
513
|| ch == 0x0009 /* tab */
514
|| ch == 0x2010 /* HYPHEN */
515
|| ch == 0x058A /* ARMENIAN HYPHEN */
516
|| ch == 0x00AD /* SOFT HYPHEN */
517
|| ch == 0x0F0B /* TIBETAN MARK INTERSYLLABIC TSHEG */
518
|| ch == 0x1361 /* ETHIOPIC WORDSPACE */
519
|| ch == 0x1680 /* OGHAM SPACE MARK */
520
|| ch == 0x17D5 /* KHMER SIGN BARIYOOSAN */
521
|| ch == 0x2027 /* HYPHENATION POINT */
522
|| ch == 0x007C /* VERTICAL LINE */)
525
/* break opportunity before */
526
if (ch == 0x00B4 /* ACUTE ACCENT */
527
|| ch == 0x02C8 /* MODIFIER LETTER VERTICAL LINE */
528
|| ch == 0x02CC /* MODIFIER LETTER LOW VERTICAL LINE */
529
|| ch == 0x1806 /* MONGOLIAN TODO SOFT HYPHEN */)
532
/* break opportunity before and after */
533
if (ch == 0x2014 /* EM DASH */)
537
if (ch == 0x002D /* HYPHEN-MINUS */)
540
/* exclamation/interrogation */
541
if (ch == 0x0021 /* EXCLAMATION MARK */
542
|| ch == 0x003F /* QUESTION MARK */
543
|| ch == 0xFE56 /* SMALL QUESTION MARK */
544
|| ch == 0xFE57 /* SMALL EXCLAMATION MARK */
545
|| ch == 0xFF01 /* FULLWIDTH EXCLAMATION MARK */
546
|| ch == 0xFF1F /* FULLWIDTH QUESTION MARK */)
549
/* opening punctuation */
550
if (unicode_attributes[ch].category[0] == 'P'
551
&& unicode_attributes[ch].category[1] == 's')
554
/* closing punctuation */
555
if (ch == 0x3001 /* IDEOGRAPHIC COMMA */
556
|| ch == 0x3002 /* IDEOGRAPHIC FULL STOP */
557
|| ch == 0xFF0C /* FULLWIDTH COMMA */
558
|| ch == 0xFF0E /* FULLWIDTH FULL STOP */
559
|| ch == 0xFE50 /* SMALL COMMA */
560
|| ch == 0xFE52 /* SMALL FULL STOP */
561
|| ch == 0xFF61 /* HALFWIDTH IDEOGRAPHIC FULL STOP */
562
|| ch == 0xFF64 /* HALFWIDTH IDEOGRAPHIC COMMA */
563
|| (unicode_attributes[ch].category[0] == 'P'
564
&& unicode_attributes[ch].category[1] == 'e'))
567
/* ambiguous quotation */
568
if (ch == 0x0022 /* QUOTATION MARK */
569
|| ch == 0x0027 /* APOSTROPHE */
570
|| (unicode_attributes[ch].category[0] == 'P'
571
&& (unicode_attributes[ch].category[1] == 'f'
572
|| unicode_attributes[ch].category[1] == 'i')))
575
/* attached characters and combining marks */
576
if ((unicode_attributes[ch].category[0] == 'M'
577
&& (unicode_attributes[ch].category[1] == 'n'
578
|| unicode_attributes[ch].category[1] == 'c'
579
|| unicode_attributes[ch].category[1] == 'e'))
580
|| (ch >= 0x1160 && ch <= 0x11F9)
581
|| (unicode_attributes[ch].category[0] == 'C'
582
&& (unicode_attributes[ch].category[1] == 'c'
583
|| unicode_attributes[ch].category[1] == 'f')))
584
if (!(attr & ((1 << LBP_BK) | (1 << LBP_BA) | (1 << LBP_GL))))
588
if (ch == 0x0E5A /* THAI CHARACTER ANGKHANKHU */
589
|| ch == 0x0E5B /* THAI CHARACTER KHOMUT */
590
|| ch == 0x17D4 /* KHMER SIGN KHAN */
591
|| ch == 0x17D6 /* KHMER SIGN CAMNUC PII KUUH */
592
|| ch == 0x17D7 /* KHMER SIGN LEK TOO */
593
|| ch == 0x17D8 /* KHMER SIGN BEYYAL */
594
|| ch == 0x17D9 /* KHMER SIGN PHNAEK MUAN */
595
|| ch == 0x17DA /* KHMER SIGN KOOMUUT */
596
|| ch == 0x203C /* DOUBLE EXCLAMATION MARK */
597
|| ch == 0x2044 /* FRACTION SLASH */
598
|| ch == 0x301C /* WAVE DASH */
599
|| ch == 0x30FB /* KATAKANA MIDDLE DOT */
600
|| ch == 0x3005 /* IDEOGRAPHIC ITERATION MARK */
601
|| ch == 0x309B /* KATAKANA-HIRAGANA VOICED SOUND MARK */
602
|| ch == 0x309C /* KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK */
603
|| ch == 0x309D /* HIRAGANA ITERATION MARK */
604
|| ch == 0x309E /* HIRAGANA VOICED ITERATION MARK */
605
|| ch == 0x30FD /* KATAKANA ITERATION MARK */
606
|| ch == 0xFE54 /* SMALL SEMICOLON */
607
|| ch == 0xFE55 /* SMALL COLON */
608
|| ch == 0xFF1A /* FULLWIDTH COLON */
609
|| ch == 0xFF1B /* FULLWIDTH SEMICOLON */
610
|| ch == 0xFF65 /* HALFWIDTH KATAKANA MIDDLE DOT */
611
|| ch == 0xFF70 /* HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */
612
|| (unicode_attributes[ch].category[0] == 'L'
613
&& unicode_attributes[ch].category[1] == 'm'
614
&& (unicode_width[ch][0] == 'W'
615
|| unicode_width[ch][0] == 'H'))
616
|| (unicode_attributes[ch].category[0] == 'S'
617
&& unicode_attributes[ch].category[1] == 'k'
618
&& unicode_width[ch][0] == 'W')
619
|| strstr (unicode_attributes[ch].name, "HIRAGANA LETTER SMALL ") != NULL
620
|| strstr (unicode_attributes[ch].name, "KATAKANA LETTER SMALL ") != NULL)
624
if (unicode_attributes[ch].category[0] == 'N'
625
&& unicode_attributes[ch].category[1] == 'd'
626
&& strstr (unicode_attributes[ch].name, "FULLWIDTH") == NULL)
629
/* infix separator (numeric) */
630
if (ch == 0x002C /* COMMA */
631
|| ch == 0x002E /* FULL STOP */
632
|| ch == 0x003A /* COLON */
633
|| ch == 0x003B /* SEMICOLON */
634
|| ch == 0x0589 /* ARMENIAN FULL STOP */)
637
/* symbols allowing breaks */
638
if (ch == 0x002F /* SOLIDUS */)
641
/* postfix (numeric) */
642
if (ch == 0x0025 /* PERCENT SIGN */
643
|| ch == 0x00A2 /* CENT SIGN */
644
|| ch == 0x00B0 /* DEGREE SIGN */
645
|| ch == 0x2030 /* PER MILLE SIGN */
646
|| ch == 0x2031 /* PER TEN THOUSAND SIGN */
647
|| ch == 0x2032 /* PRIME */
648
|| ch == 0x2033 /* DOUBLE PRIME */
649
|| ch == 0x2034 /* TRIPLE PRIME */
650
|| ch == 0x2035 /* REVERSED PRIME */
651
|| ch == 0x20A7 /* PESETA SIGN */
652
|| ch == 0x2103 /* DEGREE CELSIUS */
653
|| ch == 0x2109 /* DEGREE FAHRENHEIT */
654
|| ch == 0x2126 /* OHM SIGN */
655
|| ch == 0xFE6A /* SMALL PERCENT SIGN */
656
|| ch == 0xFF05 /* FULLWIDTH PERCENT SIGN */
657
|| ch == 0xFFE0 /* FULLWIDTH CENT SIGN */)
660
/* prefix (numeric) */
661
if (ch == 0x002B /* PLUS SIGN */
662
|| ch == 0x005C /* REVERSE SOLIDUS */
663
|| ch == 0x00B1 /* PLUS-MINUS SIGN */
664
|| ch == 0x2212 /* MINUS SIGN */
665
|| ch == 0x2116 /* NUMERO SIGN */
666
|| ch == 0x2213 /* MINUS-OR-PLUS SIGN */
667
|| (unicode_attributes[ch].category[0] == 'S'
668
&& unicode_attributes[ch].category[1] == 'c'))
669
if (!(attr & (1 << LBP_PO)))
672
/* complex context (South East Asian) */
673
if ((ch >= 0x0E00 && ch <= 0x0EFF)
674
|| (ch >= 0x1000 && ch <= 0x109F)
675
|| (ch >= 0x1780 && ch <= 0x17FF))
676
if (!(attr & ((1 << LBP_CM) | (1 << LBP_NS) | (1 << LBP_NU) | (1 << LBP_BA) | (1 << LBP_PR))))
680
if ((ch >= 0x4E00 && ch <= 0x9FAF) /* CJK Ideograph */
681
|| (ch >= 0x3400 && ch <= 0x4DBF) /* CJK Ideograph Extension A */
682
|| (ch >= 0xF900 && ch <= 0xFAFF) /* CJK COMPATIBILITY IDEOGRAPH */
683
|| ch == 0x3000 /* IDEOGRAPHIC SPACE */
684
|| (ch >= 0xAC00 && ch <= 0xD7AF) /* HANGUL SYLLABLE */
685
|| (ch >= 0x3130 && ch <= 0x318F) /* HANGUL LETTER */
686
|| (ch >= 0x1100 && ch <= 0x115F) /* HANGUL CHOSEONG */
687
|| (ch >= 0xA000 && ch <= 0xA48C) /* YI SYLLABLE */
688
|| (ch >= 0xA490 && ch <= 0xACFF) /* YI RADICAL */
689
|| (ch >= 0x2E80 && ch <= 0x2FFF) /* CJK RADICAL, KANGXI RADICAL, IDEOGRAPHIC DESCRIPTION */
690
|| ch == 0xFE62 /* SMALL PLUS SIGN */
691
|| ch == 0xFE63 /* SMALL HYPHEN-MINUS */
692
|| ch == 0xFE64 /* SMALL LESS-THAN SIGN */
693
|| ch == 0xFE65 /* SMALL GREATER-THAN SIGN */
694
|| ch == 0xFE66 /* SMALL EQUALS SIGN */
695
|| (ch >= 0xFF10 && ch <= 0xFF19) /* FULLWIDTH DIGIT */
696
|| strstr (unicode_attributes[ch].name, "FULLWIDTH LATIN ") != NULL
697
|| (ch >= 0x3000 && ch <= 0x33FF
698
&& !(attr & ((1 << LBP_CM) | (1 << LBP_NS) | (1 << LBP_OP) | (1 << LBP_CL)))))
700
/* ambiguous (ideograph) ? */
701
if (unicode_width[ch] != NULL
702
&& unicode_width[ch][0] == 'A')
708
/* ordinary alphabetic and symbol characters */
709
if ((unicode_attributes[ch].category[0] == 'L'
710
&& (unicode_attributes[ch].category[1] == 'u'
711
|| unicode_attributes[ch].category[1] == 'l'
712
|| unicode_attributes[ch].category[1] == 't'
713
|| unicode_attributes[ch].category[1] == 'm'
714
|| unicode_attributes[ch].category[1] == 'o'))
715
|| (unicode_attributes[ch].category[0] == 'S'
716
&& (unicode_attributes[ch].category[1] == 'm'
717
|| unicode_attributes[ch].category[1] == 'c'
718
|| unicode_attributes[ch].category[1] == 'k'
719
|| unicode_attributes[ch].category[1] == 'o')))
720
if (!(attr & ((1 << LBP_CM) | (1 << LBP_NS) | (1 << LBP_ID) | (1 << LBP_BA) | (1 << LBP_BB) | (1 << LBP_PO) | (1 << LBP_PR) | (1 << LBP_SA) | (1 << LBP_CB))))
722
/* ambiguous (alphabetic) ? */
723
if (unicode_width[ch] != NULL
724
&& unicode_width[ch][0] == 'A')
738
/* Output the line breaking properties in a human readable format. */
740
debug_output_lbp (FILE *stream)
744
for (i = 0; i < 0x10000; i++)
746
int attr = get_lbp (i);
747
if (attr != 1 << LBP_XX)
749
fprintf (stream, "0x%04X", i);
750
/* Writes " NAME" to the `stream` variable of the enclosing scope when the
   bit numbered BIT is set in ATTR.  NAME is BIT stringized exactly as it
   is spelled at the call site.  The leading blank and the stringized name
   are joined by ordinary adjacent-string-literal concatenation; pasting
   them with ## would not form a valid preprocessing token.  The
   do/while (0) wrapper makes the expansion a single statement. */
#define PRINT_BIT(attr,bit) \
  do { if ((attr) & (1 << (bit))) fprintf (stream, " " #bit); } while (0)
752
PRINT_BIT(attr,LBP_BK);
753
PRINT_BIT(attr,LBP_CM);
754
PRINT_BIT(attr,LBP_ZW);
755
PRINT_BIT(attr,LBP_IN);
756
PRINT_BIT(attr,LBP_GL);
757
PRINT_BIT(attr,LBP_CB);
758
PRINT_BIT(attr,LBP_SP);
759
PRINT_BIT(attr,LBP_BA);
760
PRINT_BIT(attr,LBP_BB);
761
PRINT_BIT(attr,LBP_B2);
762
PRINT_BIT(attr,LBP_HY);
763
PRINT_BIT(attr,LBP_NS);
764
PRINT_BIT(attr,LBP_OP);
765
PRINT_BIT(attr,LBP_CL);
766
PRINT_BIT(attr,LBP_QU);
767
PRINT_BIT(attr,LBP_EX);
768
PRINT_BIT(attr,LBP_ID);
769
PRINT_BIT(attr,LBP_NU);
770
PRINT_BIT(attr,LBP_IS);
771
PRINT_BIT(attr,LBP_SY);
772
PRINT_BIT(attr,LBP_AL);
773
PRINT_BIT(attr,LBP_PR);
774
PRINT_BIT(attr,LBP_PO);
775
PRINT_BIT(attr,LBP_SA);
776
PRINT_BIT(attr,LBP_XX);
777
PRINT_BIT(attr,LBP_AI);
779
fprintf (stream, "\n");
784
/* Construction of sparse 3-level tables. */
785
#define TABLE lbp_table
786
#define ELEMENT unsigned char
787
#define DEFAULT LBP_XX
788
#define xmalloc malloc
789
#define xrealloc realloc
793
output_lbp (FILE *stream)
797
unsigned int level1_offset, level2_offset, level3_offset;
803
for (i = 0; i < 0x10000; i++)
805
int attr = get_lbp (i);
807
/* Now attr should contain exactly one bit. */
808
if (attr == 0 || ((attr & (attr - 1)) != 0))
811
if (attr != 1 << LBP_XX)
813
unsigned int log2_attr;
814
for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++);
816
lbp_table_add (&t, i, log2_attr);
820
lbp_table_finalize (&t);
823
5 * sizeof (uint32_t);
825
5 * sizeof (uint32_t)
826
+ t.level1_size * sizeof (uint32_t);
828
5 * sizeof (uint32_t)
829
+ t.level1_size * sizeof (uint32_t)
830
+ (t.level2_size << t.q) * sizeof (uint32_t);
832
for (i = 0; i < 5; i++)
833
fprintf (stream, "#define lbrkprop_header_%d %d\n", i,
834
((uint32_t *) t.result)[i]);
835
fprintf (stream, "static const\n");
836
fprintf (stream, "struct\n");
837
fprintf (stream, " {\n");
838
fprintf (stream, " int level1[%d];\n", t.level1_size);
839
fprintf (stream, " int level2[%d << %d];\n", t.level2_size, t.q);
840
fprintf (stream, " unsigned char level3[%d << %d];\n", t.level3_size, t.p);
841
fprintf (stream, " }\n");
842
fprintf (stream, "lbrkprop =\n");
843
fprintf (stream, "{\n");
844
fprintf (stream, " { ");
845
for (i = 0; i < t.level1_size; i++)
846
fprintf (stream, "%d%s ",
847
(((uint32_t *) (t.result + level1_offset))[i] - level2_offset) / sizeof (uint32_t),
848
(i+1 < t.level1_size ? "," : ""));
849
fprintf (stream, "},\n");
850
fprintf (stream, " {");
851
if (t.level2_size << t.q > 8)
852
fprintf (stream, "\n ");
853
for (i = 0; i < t.level2_size << t.q; i++)
855
if (i > 0 && (i % 8) == 0)
856
fprintf (stream, "\n ");
857
fprintf (stream, " %5d%s",
858
(((uint32_t *) (t.result + level2_offset))[i] - level3_offset) / sizeof (uint8_t),
859
(i+1 < t.level2_size << t.q ? "," : ""));
861
if (t.level2_size << t.q > 8)
862
fprintf (stream, "\n ");
863
fprintf (stream, " },\n");
864
fprintf (stream, " {");
865
if (t.level3_size << t.p > 8)
866
fprintf (stream, "\n ");
867
for (i = 0; i < t.level3_size << t.p; i++)
869
unsigned char value = ((unsigned char *) (t.result + level3_offset))[i];
870
const char *value_string;
873
#define CASE(x) case x: value_string = #x; break;
904
if (i > 0 && (i % 8) == 0)
905
fprintf (stream, "\n ");
906
fprintf (stream, " %s%s", value_string,
907
(i+1 < t.level3_size << t.p ? "," : ""));
909
if (t.level3_size << t.p > 8)
910
fprintf (stream, "\n ");
911
fprintf (stream, " }\n");
912
fprintf (stream, "};\n");
916
debug_output_tables (const char *filename)
920
stream = fopen (filename, "w");
923
fprintf (stderr, "cannot open '%s' for writing\n", filename);
927
debug_output_lbp (stream);
929
if (ferror (stream) || fclose (stream))
931
fprintf (stderr, "error writing to '%s'\n", filename);
937
output_tables (const char *filename, const char *version)
941
stream = fopen (filename, "w");
944
fprintf (stderr, "cannot open '%s' for writing\n", filename);
948
fprintf (stream, "/* Line breaking properties of Unicode characters. */\n");
949
fprintf (stream, "/* Generated automatically by gen-lbrkprop for Unicode %s. */\n",
951
fprintf (stream, "\n");
955
if (ferror (stream) || fclose (stream))
957
fprintf (stderr, "error writing to '%s'\n", filename);
963
main (int argc, char * argv[])
967
fprintf (stderr, "Usage: %s UnicodeData.txt PropList.txt EastAsianWidth.txt version\n",
972
fill_attributes (argv[1]);
973
fill_combining (argv[2]);
974
fill_width (argv[3]);
976
debug_output_tables ("lbrkprop.txt");
978
output_tables ("lbrkprop.h", argv[4]);