2
*******************************************************************************
4
* Copyright (C) 1999-2002, International Business Machines
5
* Corporation and others. All Rights Reserved.
7
*******************************************************************************
10
* tab size: 8 (not used)
13
* created on: 1999dec11
14
* created by: Markus W. Scherer
16
* Store Unicode character properties efficiently for
22
#include "unicode/utypes.h"
23
#include "unicode/uchar.h"
28
#include "unicode/udata.h"
33
#define DO_DEBUG_OUT 0
35
/* Unicode character properties file format ------------------------------------
37
The file format prepared and written here contains several data
38
structures that store indexes or data.
40
Before the data contents described below, there are the headers required by
41
the udata API for loading ICU data. Especially, a UDataInfo structure
42
precedes the actual data. It contains platform properties values and the
45
The following is a description of format version 2.1 .
49
The contents is a parsed, binary form of several Unicode character
50
database files, most prominently UnicodeData.txt.
52
Any Unicode code point from 0 to 0x10ffff can be looked up to get
53
the properties, if any, for that code point. This means that the input
54
to the lookup are 21-bit unsigned integers, with not all of the
57
It is assumed that client code keeps a uint32_t pointer
58
to the beginning of the data:
62
Formally, the file contains the following structures:
64
const int32_t indexes[16] with values i0..i15:
66
i0 propsIndex; -- 32-bit unit index to the table of 32-bit properties words
67
i1 exceptionsIndex; -- 32-bit unit index to the table of 32-bit exception words
68
i2 exceptionsTopIndex; -- 32-bit unit index to the array of UChars for special mappings
70
i3 additionalTrieIndex; -- 32-bit unit index to the additional trie for more properties
71
i4 additionalVectorsIndex; -- 32-bit unit index to the table of properties vectors
72
i5 additionalVectorsColumns; -- number of 32-bit words per properties vector
74
i6 reservedItemIndex; -- 32-bit unit index to the top of the properties vectors table
75
i7..i15 reservedIndexes; -- reserved values; 0 for now
77
PT serialized properties trie, see utrie.h (byte size: 4*(i0-16))
79
P const uint32_t props32[i1-i0];
80
E const uint32_t exceptions[i2-i1];
81
U const UChar uchars[2*(i3-i2)];
83
AT serialized trie for additional properties (byte size: 4*(i4-i3))
84
PV const uint32_t propsVectors[(i6-i4)/i5][i5]==uint32_t propsVectors[i6-i4];
86
Trie lookup and properties:
88
In order to condense the data for the 21-bit code space, several properties of
89
the Unicode code assignment are exploited:
90
- The code space is sparse.
91
- There are several 10k of consecutive codes with the same properties.
92
- Characters and scripts are allocated in groups of 16 code points.
93
- Inside blocks for scripts the properties are often repetitive.
94
- The 21-bit space is not fully used for Unicode.
96
The lookup of properties for a given code point is done with a trie lookup,
97
using the UTrie implementation.
98
The trie lookup result is a 16-bit index in the props32[] table where the
99
actual 32-bit properties word is stored. This is done to save space.
101
(There are thousands of 16-bit entries in the trie data table, but
102
only a few hundred unique 32-bit properties words.
103
If the trie data table contained 32-bit words directly, then that would be
104
larger because the length of the table would be the same as now but the
105
width would be 32 bits instead of 16. This saves more than 10kB.)
107
With a given Unicode code point
111
and 0<=c<0x110000, the lookup is done like this:
115
uint32_t props=p32[i];
117
For some characters, not all of the properties can be efficiently encoded
118
using 32 bits. For them, the 32-bit word contains an index into the exceptions[]
121
if(props&EXCEPTION_BIT)) {
122
uint16_t e=(uint16_t)(props>>VALUE_SHIFT);
126
The exception values are a variable number of uint32_t starting at
128
const uint32_t *pe=p32+exceptionsIndex+e;
130
The first uint32_t there contains flags about what values actually follow it.
131
Some of the exception values are UChar32 code points for the case mappings,
132
others are numeric values etc.
134
32-bit properties sets:
136
Each 32-bit properties word contains:
138
0.. 4 general category
139
5 has exception values
142
12..13 numericType (new in format version 2.1):
144
1 decimal digit value
148
20..31 value according to bits 0..5:
151
} else switch(general category) {
152
case Ll: delta to uppercase; -- same as titlecase
153
case Lu: -delta to lowercase; -- titlecase is same as c
154
case Lt: -delta to lowercase; -- uppercase is same as c
155
case Mn: combining class;
156
case Nd: value=numeric value==decimal digit value=digit value;
158
case No: value=numeric value - but decimal digit value and digit value are not defined;
169
In the first uint32_t exception word for a code point,
172
23..16 combining class
173
15..0 flags that indicate which values follow:
176
0 has uppercase mapping
177
1 has lowercase mapping
178
2 has titlecase mapping
180
4 has numeric value (numerator)
181
5 has denominator value
182
6 has a mirror-image Unicode code point
183
7 has SpecialCasing.txt entries
184
8 has CaseFolding.txt entries
186
According to the flags in this word, one or more uint32_t words follow it
187
in the sequence of the bit flags in the flags word; if a flag is not set,
188
then the value is missing or 0:
190
For the case mappings and the mirror-image Unicode code point,
191
one uint32_t or UChar32 each is the code point.
192
If the titlecase mapping is missing, then it is the same as the uppercase mapping.
194
For the digit values, bits 31..16 contain the decimal digit value, and
195
bits 15..0 contain the digit value. A value of -1 indicates that
196
this value is missing.
198
For the numeric/numerator value, an int32_t word contains the value directly,
199
except for when there is no numerator but a denominator, then the numerator
200
is implicitly 1. This means:
201
numerator denominator result
207
For the denominator value, a uint32_t word contains the value directly.
209
For special casing mappings, the 32-bit exception word contains:
210
31 if set, this character has complex, conditional mappings
212
otherwise, the mappings are stored according to the following bits
213
30..24 number of UChars used for mappings
215
15.. 0 UChar offset from the beginning of the UChars array where the
216
UChars for the special case mappings are stored in the following format:
218
Format of special casing UChars:
219
One UChar value with lengths as follows:
220
14..10 number of UChars for titlecase mapping
221
9.. 5 number of UChars for uppercase mapping
222
4.. 0 number of UChars for lowercase mapping
224
Followed by the UChars for lowercase, uppercase, titlecase mappings in this order.
226
For case folding mappings, the 32-bit exception word contains:
227
31..24 number of UChars used for the full mapping
229
15.. 0 UChar offset from the beginning of the UChars array where the
230
UChars for the special case mappings are stored in the following format:
232
Format of case folding UChars:
233
Two UChars contain the simple mapping as follows:
234
0, 0 no simple mapping
235
BMP,0 a simple mapping to a BMP code point
236
s1, s2 a simple mapping to a supplementary code point stored as two surrogates
237
This is followed by the UChars for the full case folding mappings.
240
U+2160, ROMAN NUMERAL ONE, needs an exception because it has a lowercase
241
mapping and a numeric value.
242
Its exception values would be stored as 3 uint32_t words:
244
- flags=0x0a (see above) with combining class 0
245
- lowercase mapping 0x2170
248
--- Additional properties (new in format version 2.1) ---
250
The second trie for additional properties (AT) is also a UTrie with 16-bit data.
251
The data words consist of 32-bit unit indexes (not row indexes!) into the
252
table of unique properties vectors (PV).
253
Each vector contains a set of properties. The width of a vector may change
254
with the formatVersion, it is stored in i5.
256
Current properties: (see also icu/source/common/uprops.h)
259
0 31..24 age of the code point designation/assignment, a Unicode version
260
bits 31..28 major version number
261
bits 27..24 minor version number
264
----------------------------------------------------------------------------- */
266
/* UDataInfo cf. udata.h */
267
static UDataInfo dataInfo={
276
{ 0x55, 0x50, 0x72, 0x6f }, /* dataFormat="UPro" */
277
{ 2, 1, UTRIE_SHIFT, UTRIE_INDEX_SHIFT }, /* formatVersion */
278
{ 3, 0, 0, 0 } /* dataVersion */
281
/* definitions of expected data size limits */
283
MAX_PROPS_COUNT=25000,
284
MAX_UCHAR_COUNT=10000,
285
MAX_EXCEPTIONS_COUNT=4096
288
/* definitions for the properties words */
290
/* general category shift==0 0 (5 bits) */
291
EXCEPTION_SHIFT=5, /* 5 (1 bit) */
292
BIDI_SHIFT, /* 6 (5 bits) */
293
MIRROR_SHIFT=BIDI_SHIFT+5, /* 11 (1 bit) */
294
NUMERIC_TYPE_SHIFT, /* 12 (2 bits) */
295
RESERVED_SHIFT=NUMERIC_TYPE_SHIFT+2, /* 14 (6 bits) */
296
VALUE_SHIFT=20, /* 20 */
298
EXCEPTION_BIT=1UL<<EXCEPTION_SHIFT,
299
VALUE_BITS=32-VALUE_SHIFT
302
static const int32_t MAX_VALUE=(1L<<(VALUE_BITS-1))-1;
303
static const int32_t MIN_VALUE=-(1L<<(VALUE_BITS-1));
305
static UNewTrie *pTrie=NULL;
307
/* props32[] contains unique properties words after compacting the array of properties */
308
static uint32_t props32[MAX_PROPS_COUNT];
310
/* context pointer for compareProps() - temporarily holds a pointer to the trie data */
311
static uint32_t *props;
313
/* length of props32[] after compaction */
314
static int32_t propsTop;
316
/* exceptions values */
317
static uint32_t exceptions[MAX_EXCEPTIONS_COUNT+20];
318
static uint16_t exceptionsTop=0;
320
/* Unicode characters, e.g. for special casing or decomposition */
321
static UChar uchars[MAX_UCHAR_COUNT+20];
322
static uint32_t ucharsTop=0;
325
static uint16_t exceptionsCount=0;
327
/* prototypes --------------------------------------------------------------- */
330
compareProps(const void *l, const void *r);
333
addUChars(const UChar *s, uint32_t length);
335
/* -------------------------------------------------------------------------- */
338
setUnicodeVersion(const char *v) {
339
UVersionInfo version;
340
u_versionFromString(version, v);
341
uprv_memcpy(dataInfo.dataVersion, version, 4);
346
pTrie=utrie_open(NULL, NULL, MAX_PROPS_COUNT, 0, FALSE);
348
fprintf(stderr, "error: unable to create a UNewTrie\n");
349
exit(U_MEMORY_ALLOCATION_ERROR);
352
uprv_memset(props32, 0, sizeof(props32));
355
/* store a character's properties ------------------------------------------- */
358
makeProps(Props *p) {
365
* Simple ideas for reducing the number of bits for one character's
368
* Some fields are only used for characters of certain
369
* general categories:
370
* - casing fields for letters and others, not for
372
* + uppercase not for uppercase letters
373
* + lowercase not for lowercase letters
374
* + titlecase not for titlecase letters
376
* * most of the time, uppercase=titlecase
377
* - numeric fields for various digit & other types
378
* - canonical combining classes for non-spacing marks (Mn)
379
* * the above is not always true, for all three cases
381
* Using the same bits for alternate fields saves some space.
383
* For the canonical categories, there are only few actually used
385
* They can be stored using 5 bits.
387
* In the BiDi categories, the 5 explicit codes are only ever
388
* assigned 1:1 to 5 well-known code points. Storing only one
389
* value for all "explicit codes" gets this down to 4 bits.
390
* Client code then needs to check for this special value
391
* and replace it by the real one using a 5-element table.
393
* The general categories Mn & Me, non-spacing & enclosing marks,
394
* are always NSM, and NSM are always of those categories.
396
* Digit values can often be derived from the code point value
397
* itself in a simple way.
401
/* count the case mappings and other values competing for the value bit field */
405
isNumber= (UBool)(genCategoryNames[p->generalCategory][0]=='N');
407
if(p->upperCase!=0) {
408
/* verify that no numbers and no Mn have case mappings */
409
if(p->generalCategory==U_LOWERCASE_LETTER) {
410
value=(int32_t)p->code-(int32_t)p->upperCase;
416
if(p->lowerCase!=0) {
417
/* verify that no numbers and no Mn have case mappings */
418
if(p->generalCategory==U_UPPERCASE_LETTER || p->generalCategory==U_TITLECASE_LETTER) {
419
value=(int32_t)p->lowerCase-(int32_t)p->code;
425
if(p->upperCase!=p->titleCase) {
429
if(p->canonicalCombining>0) {
430
/* verify that only Mn has a canonical combining class */
431
if(p->generalCategory==U_NON_SPACING_MARK) {
432
value=p->canonicalCombining;
438
if(p->generalCategory==U_DECIMAL_DIGIT_NUMBER) {
439
/* verify that all numeric fields contain the same value */
440
if(p->decimalDigitValue!=-1 && p->digitValue==p->decimalDigitValue &&
441
p->numericType==1 && p->numericValue==p->decimalDigitValue &&
444
value=p->decimalDigitValue;
449
} else if(p->generalCategory==U_LETTER_NUMBER || p->generalCategory==U_OTHER_NUMBER) {
450
if(p->numericType==3) {
451
value=p->numericValue;
456
} else if(p->numericType!=0) {
460
if(p->denominator!=0) {
461
/* verification for numeric category covered by the above */
466
if(p->mirrorMapping!=0) {
467
value=(int32_t)p->mirrorMapping-(int32_t)p->code;
471
if(p->specialCasing!=NULL) {
475
if(p->caseFolding!=NULL) {
480
/* handle exceptions */
481
if(count>1 || x!=0 || value<MIN_VALUE || MAX_VALUE<value) {
482
/* this code point needs exception values */
485
printf("*** code 0x%06x needs an exception because it is irregular\n", p->code);
486
} else if(count==1) {
487
printf("*** code 0x%06x needs an exception because its value would be %ld\n",
488
p->code, (long)value);
489
} else if(value<MIN_VALUE || MAX_VALUE<value) {
490
printf("*** code 0x%06x needs an exception because its value is out-of-bounds at %ld (not [%ld..%ld]\n",
491
p->code, (long)value, (long)MIN_VALUE, (long)MAX_VALUE);
493
printf("*** code 0x%06x needs an exception because it has %u values\n", p->code, count);
500
/* allocate and create exception values */
503
fprintf(stderr, "genprops: out of exceptions memory at U+%06x. (%d exceeds allocated space)\n",
505
exit(U_MEMORY_ALLOCATION_ERROR);
507
uint32_t first=(uint32_t)p->canonicalCombining<<16;
510
if(p->upperCase!=0) {
512
exceptions[value+length++]=p->upperCase;
514
if(p->lowerCase!=0) {
516
exceptions[value+length++]=p->lowerCase;
518
if(p->upperCase!=p->titleCase) {
520
if(p->titleCase!=0) {
521
exceptions[value+length++]=p->titleCase;
523
exceptions[value+length++]=p->code;
526
if(p->decimalDigitValue!=-1 || p->digitValue!=-1) {
528
exceptions[value+length++]=
529
(uint32_t)p->decimalDigitValue<<16|
530
(uint16_t)p->digitValue;
532
if(p->numericType==3) {
533
if(p->denominator==0) {
535
exceptions[value+length++]=(uint32_t)p->numericValue;
537
if(p->numericValue!=1) {
539
exceptions[value+length++]=(uint32_t)p->numericValue;
542
exceptions[value+length++]=p->denominator;
547
exceptions[value+length++]=p->mirrorMapping;
549
if(p->specialCasing!=NULL) {
551
if(p->specialCasing->isComplex) {
552
/* complex special casing */
553
exceptions[value+length++]=0x80000000;
555
/* unconditional special casing */
562
j=p->specialCasing->lowerCase[0];
564
uprv_memcpy(u+1, p->specialCasing->lowerCase+1, 2*j);
568
j=p->specialCasing->upperCase[0];
570
uprv_memcpy(u+i, p->specialCasing->upperCase+1, 2*j);
574
j=p->specialCasing->titleCase[0];
576
uprv_memcpy(u+i, p->specialCasing->titleCase+1, 2*j);
582
exceptions[value+length++]=(i<<24)|addUChars(u, i);
585
if(p->caseFolding!=NULL) {
587
if(p->caseFolding->simple==0 && p->caseFolding->full[0]==0) {
588
/* special case folding, store only a marker */
589
exceptions[value+length++]=0;
591
/* normal case folding with a simple and a full mapping */
595
/* store the simple mapping into the first two UChars */
598
UTF_APPEND_CHAR_UNSAFE(u, i, p->caseFolding->simple);
600
/* store the full mapping after that */
601
i=p->caseFolding->full[0];
603
uprv_memcpy(u+2, p->caseFolding->full+1, 2*i);
606
exceptions[value+length++]=(i<<24)|addUChars(u, 2+i);
609
exceptions[value]=first;
610
exceptionsTop+=length;
614
/* put together the 32-bit word of encoded properties */
616
(uint32_t)p->generalCategory |
617
(uint32_t)p->bidi<<BIDI_SHIFT |
618
(uint32_t)p->isMirrored<<MIRROR_SHIFT |
619
(uint32_t)p->numericType<<NUMERIC_TYPE_SHIFT |
620
(uint32_t)value<<VALUE_SHIFT;
625
* "Higher-hanging fruit" (not implemented):
627
* For some sets of fields, there are fewer sets of values
628
* than the product of the numbers of values per field.
629
* This means that storing one single value for more than
630
* one field and later looking up both field values in a table
633
* - general category & BiDi
635
* There are only few common displacements between a code point
636
* and its case mappings. Store deltas. Store codes for few
642
addProps(uint32_t c, uint32_t x) {
643
if(!utrie_set32(pTrie, (UChar32)c, x)) {
644
fprintf(stderr, "error: too many entries for the properties trie\n");
645
exit(U_BUFFER_OVERFLOW_ERROR);
649
/* areas of same properties ------------------------------------------------- */
652
repeatProps(uint32_t first, uint32_t last, uint32_t x) {
653
if(!utrie_setRange32(pTrie, (UChar32)first, (UChar32)(last+1), x, FALSE)) {
654
fprintf(stderr, "error: too many entries for the properties trie\n");
655
exit(U_BUFFER_OVERFLOW_ERROR);
659
/* compacting --------------------------------------------------------------- */
664
* At this point, all the propsTop properties are in props[], but they
665
* are not all unique.
666
* Now we sort them, reduce them to unique ones in props32[], and
667
* build an index in stage3[] from the old to the new indexes.
668
* (The quick sort averages at N*log(N) with N=propsTop. The inverting
669
* yields linear performance.)
673
* We are going to sort only an index table in map[] because we need this
674
* index table anyway and qsort() does not allow to sort two tables together
675
* directly. This will thus also reduce the amount of data moved around.
678
int32_t i, oldIndex, newIndex;
680
static uint16_t map[MAX_PROPS_COUNT];
687
for(c=0; c<0xffff; c+=307) {
688
printf("properties(0x%06x)=0x%06x\n", c, getProps(c, &i1, &i2, &i3));
693
props=utrie_getData(pTrie, &propsTop);
695
/* build the index table */
696
for(i=propsTop; i>0;) {
702
qsort(map, propsTop, 2, compareProps);
705
* Now invert the reordered table and compact it in the same step.
706
* The result will be props32[] having only unique properties words
707
* and stage3[] having indexes to them.
710
for(i=0; i<propsTop;) {
711
/* set the first of a possible series of the same properties */
713
props32[newIndex]=x=props[oldIndex];
714
props[oldIndex]=newIndex;
716
/* set the following same properties only in stage3 */
717
while(++i<propsTop && x==props[map[i]]) {
718
props[map[i]]=newIndex;
724
/* we saved some space */
726
printf("compactProps() reduced propsTop from %u to %u\n", propsTop, newIndex);
733
uint16_t i1, i2, i3, i4;
735
for(c=0; c<0xffff; c+=307) {
736
printf("properties(0x%06x)=0x%06x\n", c, getProps2(c, &i1, &i2, &i3, &i4));
743
compareProps(const void *l, const void *r) {
744
uint32_t left=props[*(const uint16_t *)l], right=props[*(const uint16_t *)r];
746
/* compare general categories first */
747
int rc=(int)(left&0x1f)-(int)(right&0x1f);
748
if(rc==0 && left!=right) {
749
rc= left<right ? -1 : 1;
754
/* generate output data ----------------------------------------------------- */
756
/* folding value: just store the offset (16 bits) if there is any non-0 entry */
757
U_CAPI uint32_t U_EXPORT2
758
getFoldedPropsValue(UNewTrie *trie, UChar32 start, int32_t offset) {
765
value=utrie_get32(trie, start, &inBlockZero);
767
start+=UTRIE_DATA_BLOCK_LENGTH;
768
} else if(value!=0) {
769
return (uint32_t)(offset|0x8000);
778
generateData(const char *dataDir) {
779
static int32_t indexes[UPROPS_INDEX_COUNT]={
785
static uint8_t trieBlock[40000];
786
static uint8_t additionalProps[40000];
788
UNewDataMemory *pData;
789
UErrorCode errorCode=U_ZERO_ERROR;
791
int32_t trieSize, additionalPropsSize, offset;
796
trieSize=utrie_serialize(pTrie, trieBlock, sizeof(trieBlock), getFoldedPropsValue, TRUE, &errorCode);
797
if(U_FAILURE(errorCode)) {
798
fprintf(stderr, "error: utrie_serialize failed: %s (length %ld)\n", u_errorName(errorCode), (long)trieSize);
802
offset=sizeof(indexes)/4; /* uint32_t offset to the properties trie */
804
/* round up trie size to 4-alignement */
805
trieSize=(trieSize+3)&~3;
807
indexes[UPROPS_PROPS32_INDEX]=offset; /* uint32_t offset to props[] */
810
indexes[UPROPS_EXCEPTIONS_INDEX]=offset;/* uint32_t offset to exceptions[] */
812
offset+=exceptionsTop; /* uint32_t offset to the first unit after exceptions[] */
813
indexes[UPROPS_EXCEPTIONS_TOP_INDEX]=offset;
815
/* round up UChar count to 4-alignement */
816
ucharsTop=(ucharsTop+1)&~1;
817
offset+=(uint16_t)(ucharsTop/2); /* uint32_t offset to the first unit after uchars[] */
818
indexes[UPROPS_ADDITIONAL_TRIE_INDEX]=offset;
821
printf("trie size in bytes: %5u\n", trieSize);
822
printf("number of unique properties values: %5u\n", propsTop);
823
printf("number of code points with exceptions: %5u\n", exceptionsCount);
824
printf("size in bytes of exceptions: %5u\n", 4*exceptionsTop);
825
printf("number of UChars for special mappings: %5u\n", ucharsTop);
828
additionalPropsSize=writeAdditionalData(additionalProps, sizeof(additionalProps), indexes);
830
size=4*offset+additionalPropsSize; /* total size of data */
832
printf("data size: %6lu\n", (unsigned long)size);
836
pData=udata_create(dataDir, DATA_TYPE, DATA_NAME, &dataInfo,
837
haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
838
if(U_FAILURE(errorCode)) {
839
fprintf(stderr, "genprops: unable to create data memory, %s\n", u_errorName(errorCode));
843
udata_writeBlock(pData, indexes, sizeof(indexes));
844
udata_writeBlock(pData, trieBlock, trieSize);
845
udata_writeBlock(pData, props32, 4*propsTop);
846
udata_writeBlock(pData, exceptions, 4*exceptionsTop);
847
udata_writeBlock(pData, uchars, 2*ucharsTop);
848
udata_writeBlock(pData, additionalProps, additionalPropsSize);
851
dataLength=udata_finish(pData, &errorCode);
852
if(U_FAILURE(errorCode)) {
853
fprintf(stderr, "genprops: error %d writing the output file\n", errorCode);
857
if(dataLength!=(long)size) {
858
fprintf(stderr, "genprops: data length %ld != calculated size %lu\n",
859
dataLength, (unsigned long)size);
860
exit(U_INTERNAL_PROGRAM_ERROR);
866
/* helpers ------------------------------------------------------------------ */
869
addUChars(const UChar *s, uint32_t length) {
870
uint32_t top=(uint16_t)(ucharsTop+length);
873
if(top>=MAX_UCHAR_COUNT) {
874
fprintf(stderr, "genprops: out of UChars memory\n");
875
exit(U_MEMORY_ALLOCATION_ERROR);
878
uprv_memcpy(p, s, 2*length);
880
return (uint32_t)(p-uchars);
884
* Hey, Emacs, please set the following:
887
* indent-tabs-mode: nil