2
*******************************************************************************
4
* Copyright (C) 1999-2008, International Business Machines
5
* Corporation and others. All Rights Reserved.
7
*******************************************************************************
8
* file name: gennames.c
10
* tab size: 8 (not used)
13
* created on: 1999sep30
14
* created by: Markus W. Scherer
16
* This program reads the Unicode character database text file,
17
* parses it, and extracts the character code,
18
* the "modern" character name, and optionally the
19
* Unicode 1.0 character name, and (starting with ICU 2.2) the ISO 10646 comment.
20
* It then tokenizes and compresses the names and builds
21
* compact binary tables for random-access lookup
22
* in a u_charName() API function.
24
* unames.icu file format (after UDataInfo header etc. - see udata.c)
25
* (all data is static const)
30
* dataVersion = Unicode version from -u or --unicode command line option, defaults to 4.1
33
* uint32_t tokenStringOffset,
38
* uint16_t tokenCount;
39
* uint16_t tokenTable[tokenCount];
41
* char tokenStrings[]; -- padded to even count
43
* -- strings (groupStrings) are tokenized as follows:
44
* for each character c
45
* if(c>=tokenCount) write that character c directly
47
* token=tokenTable[c];
48
* if(token==0xfffe) -- lead byte of double-byte token
49
* token=tokenTable[c<<8|next character];
53
* tokenString=tokenStrings+token; (tokenStrings=start of names data + tokenStringOffset;)
54
* append zero-terminated tokenString;
56
* Different strings for a code point - normal name, 1.0 name, and ISO comment -
57
* are separated by ';'.
59
* uint16_t groupCount;
61
* uint16_t groupMSB; -- for a group of 32 character names stored, this is code point>>5
62
* uint16_t offsetHigh; -- group strings are at start of names data + groupStringsOffset + this 32 bit-offset
64
* } groupTable[groupCount];
66
* char groupStrings[]; -- padded to 4-count
68
* -- The actual, tokenized group strings are not zero-terminated because
69
* that would take up too much space.
70
* Instead, they are preceded by their length, written in a variable-length sequence:
71
* For each of the 32 group strings, one or two nibbles are stored for its length.
72
* Nibbles (4-bit values, half-bytes) are read MSB first.
73
* A nibble with a value of 0..11 directly indicates the length of the name string.
74
* A nibble n with a value of 12..15 is a lead nibble and forms a value with the following nibble m
75
* by (((n-12)<<4)|m)+12, reaching values of 12..75.
76
* These lengths are stored sequentially for each tokenized string, not for the de-tokenized result.
77
* For the de-tokenizing, see token description above; the strings immediately follow the
80
* -- algorithmic names
82
* typedef struct AlgorithmicRange {
83
* uint32_t rangeStart, rangeEnd;
84
* uint8_t algorithmType, algorithmVariant;
88
* uint32_t algRangesCount; -- number of data blocks for ranges of
89
* algorithmic names (Unicode 3.0.0: 3, hardcoded in gennames)
92
* AlgorithmicRange algRange;
93
* uint8_t algRangeData[]; -- padded to 4-count except in last range
94
* } algRanges[algRangesCount];
95
* -- not a real array because each part has a different size
96
* of algRange.rangeSize (including AlgorithmicRange)
98
* -- algorithmic range types:
100
* 0 Names are formed from a string prefix that is stored in
101
* the algRangeData (zero-terminated), followed by the Unicode code point
102
* of the character in hexadecimal digits;
103
* algRange.algorithmVariant digits are written
105
* 1 Names are formed by calculating modulo-factors of the code point value as follows:
106
* algRange.algorithmVariant is the count of modulo factors
107
* algRangeData contains
108
* uint16_t factors[algRange.algorithmVariant];
110
* the first zero-terminated string is written as the prefix; then:
112
* The rangeStart is subtracted; with the difference, here "code":
113
* for(i=algRange.algorithmVariant-1 to 0 step -1)
114
* index[i]=code%factor[i];
117
* The strings after the prefix are short pieces that are then appended to the result
118
* according to index[0..algRange.algorithmVariant-1].
122
#include "unicode/utypes.h"
123
#include "unicode/putil.h"
124
#include "unicode/uclean.h"
125
#include "unicode/udata.h"
128
#include "uarrsort.h"
129
#include "unewdata.h"
130
#include "uoptions.h"
133
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
135
#define STRING_STORE_SIZE 1000000
136
#define GROUP_STORE_SIZE 5000
138
#define GROUP_SHIFT 5
139
#define LINES_PER_GROUP (1UL<<GROUP_SHIFT)
140
#define GROUP_MASK (LINES_PER_GROUP-1)
142
#define MAX_LINE_COUNT 50000
143
#define MAX_WORD_COUNT 20000
144
#define MAX_GROUP_COUNT 5000
146
#define DATA_NAME "unames"
147
#define DATA_TYPE "icu"
148
#define VERSION_STRING "unam"
149
#define NAME_SEPARATOR_CHAR ';'
151
#define ISO_DATA_NAME "ucomment"
153
/* Unicode versions --------------------------------------------------------- */
170
static const UVersionInfo
185
static int32_t ucdVersion=UNI_5_1;
188
findUnicodeVersion(const UVersionInfo version) {
191
for(i=0; /* while(version>unicodeVersions[i]) {} */
192
i<UNI_VER_COUNT && uprv_memcmp(version, unicodeVersions[i], 4)>0;
194
if(0<i && i<UNI_VER_COUNT && uprv_memcmp(version, unicodeVersions[i], 4)<0) {
195
--i; /* fix 4.0.2 to land before 4.1, for valid x>=ucdVersion comparisons */
197
return i; /* version>=unicodeVersions[i] && version<unicodeVersions[i+1]; possible: i==UNI_VER_COUNT */
200
/* generator data ----------------------------------------------------------- */
202
/* UDataInfo cf. udata.h */
203
static UDataInfo dataInfo={
212
{0x75, 0x6e, 0x61, 0x6d}, /* dataFormat="unam" */
213
{1, 0, 0, 0}, /* formatVersion */
214
{3, 0, 0, 0} /* dataVersion */
217
static UBool beVerbose=FALSE, beQuiet=FALSE, haveCopyright=TRUE;
219
typedef struct Options {
222
UBool storeISOComments;
225
static uint8_t stringStore[STRING_STORE_SIZE],
226
groupStore[GROUP_STORE_SIZE],
227
lineLengths[LINES_PER_GROUP];
229
static uint32_t lineTop=0, groupBottom, wordBottom=STRING_STORE_SIZE, lineLengthsTop;
238
int32_t weight; /* -(cost for token) + (number of occurrences) * (length-1) */
244
static Line lines[MAX_LINE_COUNT];
245
static Word words[MAX_WORD_COUNT];
247
static uint32_t lineCount=0, wordCount=0;
249
static int16_t leadByteCount;
251
#define LEADBYTE_LIMIT 16
253
static int16_t tokens[LEADBYTE_LIMIT*256];
254
static uint32_t tokenCount;
256
/* prototypes --------------------------------------------------------------- */
262
parseDB(const char *filename, Options *options);
265
parseName(char *name, int16_t length);
268
skipNoise(char *line, int16_t start, int16_t limit);
271
getWord(char *line, int16_t start, int16_t limit);
280
compressLine(uint8_t *s, int16_t length, int16_t *pGroupTop);
283
compareWords(const void *context, const void *word1, const void *word2);
286
generateData(const char *dataDir, Options *options);
289
generateAlgorithmicData(UNewDataMemory *pData, Options *options);
292
findToken(uint8_t *s, int16_t length);
295
findWord(char *s, int16_t length);
298
addWord(char *s, int16_t length);
301
countWord(Word *word);
304
addLine(uint32_t code, char *names[], int16_t lengths[], int16_t count);
307
addGroup(uint32_t groupMSB, uint8_t *strings, int16_t length);
310
addToken(uint8_t *s, int16_t length);
313
appendLineLength(int16_t length);
316
appendLineLengthNibble(uint8_t nibble);
319
allocLine(int32_t length);
322
allocWord(uint32_t length);
324
/* -------------------------------------------------------------------------- */
339
static UOption options[]={
341
UOPTION_HELP_QUESTION_MARK,
346
{ "unicode", NULL, NULL, NULL, 'u', UOPT_REQUIRES_ARG, 0 },
347
{ "unicode1-names", NULL, NULL, NULL, '1', UOPT_NO_ARG, 0 },
348
{ "no-iso-comments", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0 },
349
{ "only-iso-comments", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0 }
353
main(int argc, char* argv[]) {
354
UVersionInfo version;
355
Options moreOptions={ TRUE, FALSE, TRUE };
356
UErrorCode errorCode = U_ZERO_ERROR;
358
U_MAIN_INIT_ARGS(argc, argv);
362
if (U_FAILURE(errorCode) && errorCode != U_FILE_ACCESS_ERROR) {
363
/* Note: u_init() will try to open ICU property data.
364
* failures here are expected when building ICU from scratch.
367
fprintf(stderr, "%s: can not initialize ICU. errorCode = %s\n",
368
argv[0], u_errorName(errorCode));
372
/* preset then read command line options */
373
options[DESTDIR].value=u_getDataDirectory();
374
options[UNICODE].value="4.1";
375
argc=u_parseArgs(argc, argv, LENGTHOF(options), options);
377
/* error handling, printing usage message */
380
"error in command line argument \"%s\"\n",
385
if(argc<0 || options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur) {
387
* Broken into chunks because the C89 standard says the minimum
388
* required supported string length is 509 bytes.
391
"Usage: %s [-1[+|-]] [-v[+|-]] [-c[+|-]] filename\n"
393
"Read the UnicodeData.txt file and \n"
394
"create a binary file " DATA_NAME "." DATA_TYPE " with the character names\n"
396
"\tfilename absolute path/filename for the Unicode database text file\n"
397
"\t\t(default: standard input)\n"
402
"\t-h or -? or --help this usage text\n"
403
"\t-v or --verbose verbose output\n"
404
"\t-q or --quiet no output\n"
405
"\t-c or --copyright include a copyright notice\n"
406
"\t-d or --destdir destination directory, followed by the path\n"
407
"\t-u or --unicode Unicode version, followed by the version like 3.0.0\n");
409
"\t-1 or --unicode1-names store Unicode 1.0 character names\n"
410
"\t --no-iso-comments do not store ISO comments\n"
411
"\t --only-iso-comments write ucomment.icu with only ISO comments\n");
412
return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
415
/* get the options values */
416
beVerbose=options[VERBOSE].doesOccur;
417
beQuiet=options[QUIET].doesOccur;
418
haveCopyright=options[COPYRIGHT].doesOccur;
419
moreOptions.store10Names=options[UNICODE1_NAMES].doesOccur;
420
moreOptions.storeISOComments=!options[NO_ISO_COMMENTS].doesOccur;
421
if(options[ONLY_ISO_COMMENTS].doesOccur) {
422
moreOptions.storeNames=moreOptions.store10Names=FALSE;
423
moreOptions.storeISOComments=TRUE;
426
/* set the Unicode version */
427
u_versionFromString(version, options[UNICODE].value);
428
uprv_memcpy(dataInfo.dataVersion, version, 4);
429
ucdVersion=findUnicodeVersion(version);
432
parseDB(argc>=2 ? argv[1] : "-", &moreOptions);
434
generateData(options[DESTDIR].value, &moreOptions);
444
for(i=0; i<256; ++i) {
449
/* parsing ------------------------------------------------------------------ */
451
/* get a name, strip leading and trailing whitespace */
453
getName(char **pStart, char *limit) {
454
/* strip leading whitespace */
455
char *start=(char *)u_skipWhitespace(*pStart);
457
/* strip trailing whitespace */
458
while(start<limit && (*(limit-1)==' ' || *(limit-1)=='\t')) {
464
return (int16_t)(limit-start);
467
static void U_CALLCONV
468
lineFn(void *context,
469
char *fields[][2], int32_t fieldCount,
470
UErrorCode *pErrorCode) {
471
Options *storeOptions=(Options *)context;
473
int16_t lengths[3]={ 0, 0, 0 };
474
static uint32_t prevCode=0;
477
if(U_FAILURE(*pErrorCode)) {
480
/* get the character code */
481
code=uprv_strtoul(fields[0][0], NULL, 16);
483
/* get the character name */
484
if(storeOptions->storeNames) {
485
names[0]=fields[1][0];
486
lengths[0]=getName(names+0, fields[1][1]);
487
if(names[0][0]=='<') {
488
/* do not store pseudo-names in <> brackets */
493
/* store 1.0 names */
494
/* get the second character name, the one from Unicode 1.0 */
495
if(storeOptions->store10Names) {
496
names[1]=fields[10][0];
497
lengths[1]=getName(names+1, fields[10][1]);
498
if(names[1][0]=='<') {
499
/* do not store pseudo-names in <> brackets */
504
/* get the ISO 10646 comment */
505
if(storeOptions->storeISOComments) {
506
names[2]=fields[11][0];
507
lengths[2]=getName(names+2, fields[11][1]);
510
if(lengths[0]+lengths[1]+lengths[2]==0) {
514
/* check for non-character code points */
515
if(!U_IS_UNICODE_CHAR(code)) {
516
fprintf(stderr, "gennames: error - properties for non-character code point U+%04lx\n",
517
(unsigned long)code);
518
*pErrorCode=U_PARSE_ERROR;
522
/* check that the code points (code) are in ascending order */
523
if(code<=prevCode && code>0) {
524
fprintf(stderr, "gennames: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n",
525
(unsigned long)code, (unsigned long)prevCode);
526
*pErrorCode=U_PARSE_ERROR;
531
parseName(names[0], lengths[0]);
532
parseName(names[1], lengths[1]);
533
parseName(names[2], lengths[2]);
536
* set the count argument to
537
* 1: only store regular names, or only store ISO 10646 comments
538
* 2: store regular and 1.0 names
539
* 3: store names and ISO 10646 comment
541
* addLine() will ignore empty trailing names
543
if(storeOptions->storeNames) {
544
/* store names and comments as parsed according to storeOptions */
545
addLine(code, names, lengths, 3);
547
/* store only ISO 10646 comments */
548
addLine(code, names+2, lengths+2, 1);
553
parseDB(const char *filename, Options *storeOptions) {
555
UErrorCode errorCode=U_ZERO_ERROR;
557
u_parseDelimitedFile(filename, ';', fields, 15, lineFn, storeOptions, &errorCode);
558
if(U_FAILURE(errorCode)) {
559
fprintf(stderr, "gennames parse error: %s\n", u_errorName(errorCode));
564
printf("size of all names in the database: %lu\n",
565
(unsigned long)lineTop);
566
printf("number of named Unicode characters: %lu\n",
567
(unsigned long)lineCount);
568
printf("number of words in the dictionary from these names: %lu\n",
569
(unsigned long)wordCount);
574
parseName(char *name, int16_t length) {
575
int16_t start=0, limit, wordLength/*, prevStart=-1*/;
578
while(start<length) {
579
/* skip any "noise" characters */
580
limit=skipNoise(name, start, length);
589
/* get a word and add it if it is longer than 1 */
590
limit=getWord(name, start, length);
591
wordLength=(int16_t)(limit-start);
593
word=findWord(name+start, wordLength);
595
word=addWord(name+start, wordLength);
602
* if there was a word before this
603
* (with no noise in between), then add the pair of words, too
606
wordLength=limit-prevStart;
607
word=findWord(name+prevStart, wordLength);
609
word=addWord(name+prevStart, wordLength);
620
static UBool U_INLINE
622
return ('A'<=c && c<='I') || /* EBCDIC-safe check for letters */
623
('J'<=c && c<='R') ||
624
('S'<=c && c<='Z') ||
626
('a'<=c && c<='i') || /* lowercase letters for ISO comments */
627
('j'<=c && c<='r') ||
628
('s'<=c && c<='z') ||
634
skipNoise(char *line, int16_t start, int16_t limit) {
635
/* skip anything that is not part of a word in this sense */
636
while(start<limit && !isWordChar(line[start])) {
644
getWord(char *line, int16_t start, int16_t limit) {
645
char c=0; /* initialize to avoid a compiler warning although the code was safe */
647
/* a unicode character name word consists of A-Z0-9 */
648
while(start<limit && isWordChar(line[start])) {
652
/* include a following space or dash */
653
if(start<limit && ((c=line[start])==' ' || c=='-')) {
660
/* compressing -------------------------------------------------------------- */
664
uint32_t i, letterCount;
666
UErrorCode errorCode;
668
/* sort the words in reverse order by weight */
669
errorCode=U_ZERO_ERROR;
670
uprv_sortArray(words, wordCount, sizeof(Word),
671
compareWords, NULL, FALSE, &errorCode);
673
/* remove the words that do not save anything */
674
while(wordCount>0 && words[wordCount-1].weight<1) {
678
/* count the letters in the token range */
680
for(i=LEADBYTE_LIMIT; i<256; ++i) {
686
printf("number of letters used in the names: %d\n", (int)letterCount);
689
/* do we need double-byte tokens? */
690
if(wordCount+letterCount<=256) {
691
/* no, single-byte tokens are enough */
693
for(i=0, wordNumber=0; wordNumber<(int16_t)wordCount; ++i) {
695
tokens[i]=wordNumber;
697
printf("tokens[0x%03x]: word%8ld \"%.*s\"\n",
698
(int)i, (long)words[wordNumber].weight,
699
words[wordNumber].length, words[wordNumber].s);
707
* The tokens that need two token bytes
708
* get their weight reduced by their count
709
* because they save less.
711
tokenCount=256-letterCount;
712
for(i=tokenCount; i<wordCount; ++i) {
713
words[i].weight-=words[i].count;
716
/* sort these words in reverse order by weight */
717
errorCode=U_ZERO_ERROR;
718
uprv_sortArray(words+tokenCount, wordCount-tokenCount, sizeof(Word),
719
compareWords, NULL, FALSE, &errorCode);
721
/* remove the words that do not save anything */
722
while(wordCount>0 && words[wordCount-1].weight<1) {
726
/* how many tokens and lead bytes do we have now? */
727
tokenCount=wordCount+letterCount+(LEADBYTE_LIMIT-1);
729
* adjust upwards to take into account that
730
* double-byte tokens must not
731
* use NAME_SEPARATOR_CHAR as a second byte
733
tokenCount+=(tokenCount-256+254)/255;
735
leadByteCount=(int16_t)(tokenCount>>8);
736
if(leadByteCount<LEADBYTE_LIMIT) {
737
/* adjust for the real number of lead bytes */
738
tokenCount-=(LEADBYTE_LIMIT-1)-leadByteCount;
740
/* limit the number of lead bytes */
741
leadByteCount=LEADBYTE_LIMIT-1;
742
tokenCount=LEADBYTE_LIMIT*256;
743
wordCount=tokenCount-letterCount-(LEADBYTE_LIMIT-1);
744
/* adjust again to skip double-byte tokens with ';' */
745
wordCount-=(tokenCount-256+254)/255;
748
/* set token 0 to word 0 */
751
printf("tokens[0x000]: word%8ld \"%.*s\"\n",
752
(long)words[0].weight,
753
words[0].length, words[0].s);
757
/* set the lead byte tokens */
758
for(i=1; (int16_t)i<=leadByteCount; ++i) {
764
/* if store10Names then the parser set tokens[NAME_SEPARATOR_CHAR]=-1 */
766
tokens[i]=wordNumber;
768
printf("tokens[0x%03x]: word%8ld \"%.*s\"\n",
769
(int)i, (long)words[wordNumber].weight,
770
words[wordNumber].length, words[wordNumber].s);
776
/* continue above 255 where there are no letters */
777
for(; (uint32_t)wordNumber<wordCount; ++i) {
778
if((i&0xff)==NAME_SEPARATOR_CHAR) {
779
tokens[i]=-1; /* do not use NAME_SEPARATOR_CHAR as a second token byte */
781
tokens[i]=wordNumber;
783
printf("tokens[0x%03x]: word%8ld \"%.*s\"\n",
784
(int)i, (long)words[wordNumber].weight,
785
words[wordNumber].length, words[wordNumber].s);
790
tokenCount=i; /* should be already tokenCount={i or i+1} */
794
printf("number of lead bytes: %d\n", leadByteCount);
795
printf("number of single-byte tokens: %lu\n",
796
(unsigned long)256-letterCount-leadByteCount);
797
printf("number of tokens: %lu\n", (unsigned long)tokenCount);
806
uint32_t i=0, inLine, outLine=0xffffffff /* (uint32_t)(-1) */,
807
groupMSB=0xffff, lineCount2;
810
/* store the groups like lines, with compressed data after raw strings */
812
lineCount2=lineCount;
815
/* loop over all lines */
816
while(i<lineCount2) {
820
/* segment the lines to groups of 32 */
821
if(inLine>>GROUP_SHIFT!=groupMSB) {
822
/* finish the current group with empty lines */
823
while((++outLine&GROUP_MASK)!=0) {
827
/* store the group like a line */
829
if(groupTop>GROUP_STORE_SIZE) {
830
fprintf(stderr, "gennames: group store overflow\n");
831
exit(U_BUFFER_OVERFLOW_ERROR);
833
addGroup(groupMSB, groupStore, groupTop);
836
/* start the new group */
839
groupMSB=inLine>>GROUP_SHIFT;
840
outLine=(inLine&~GROUP_MASK)-1;
843
/* write empty lines between the previous line in the group and this one */
844
while(++outLine<inLine) {
848
/* write characters and tokens for this line */
849
appendLineLength(compressLine(line->s, line->length, &groupTop));
852
/* finish and store the last group */
853
if(line && groupMSB!=0xffff) {
854
/* finish the current group with empty lines */
855
while((++outLine&GROUP_MASK)!=0) {
859
/* store the group like a line */
861
if(groupTop>GROUP_STORE_SIZE) {
862
fprintf(stderr, "gennames: group store overflow\n");
863
exit(U_BUFFER_OVERFLOW_ERROR);
865
addGroup(groupMSB, groupStore, groupTop);
870
printf("number of groups: %lu\n", (unsigned long)lineCount);
875
compressLine(uint8_t *s, int16_t length, int16_t *pGroupTop) {
876
int16_t start, limit, token, groupTop=*pGroupTop;
880
/* write any "noise" characters */
881
limit=skipNoise((char *)s, start, length);
883
groupStore[groupTop++]=s[start++];
890
/* write a word, as token or directly */
891
limit=getWord((char *)s, start, length);
893
groupStore[groupTop++]=s[start++];
895
token=findToken(s+start, (int16_t)(limit-start));
898
groupStore[groupTop++]=(uint8_t)(token>>8);
900
groupStore[groupTop++]=(uint8_t)token;
904
groupStore[groupTop++]=s[start++];
908
} while(start<length);
910
length=(int16_t)(groupTop-*pGroupTop);
916
compareWords(const void *context, const void *word1, const void *word2) {
917
/* reverse sort by word weight */
918
return ((Word *)word2)->weight-((Word *)word1)->weight;
921
/* generate output data ----------------------------------------------------- */
924
generateData(const char *dataDir, Options *storeOptions) {
925
UNewDataMemory *pData;
926
UErrorCode errorCode=U_ZERO_ERROR;
927
uint16_t groupWords[3];
928
uint32_t i, groupTop=lineTop, offset, size,
929
tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset;
933
pData=udata_create(dataDir,
934
DATA_TYPE, storeOptions->storeNames ? DATA_NAME : ISO_DATA_NAME,
936
haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
937
if(U_FAILURE(errorCode)) {
938
fprintf(stderr, "gennames: unable to create data memory, error %d\n", errorCode);
942
/* first, see how much space we need, and prepare the token strings */
943
for(i=0; i<tokenCount; ++i) {
945
if(token!=-1 && token!=-2) {
946
tokens[i]=(int16_t)(addToken(words[token].s, words[token].length)-groupTop);
951
* Required padding for data swapping:
952
* The token table undergoes a permutation during data swapping when the
953
* input and output charsets are different.
954
* The token table cannot grow during swapping, so we need to make sure that
955
* the table is long enough for successful in-place permutation.
957
* We simply round up tokenCount to the next multiple of 256 to account for
958
* all possible permutations.
960
* An optimization is possible if we only ever swap between ASCII and EBCDIC:
962
* If tokenCount>256, then a semicolon (NAME_SEPARATOR_CHAR) is used
963
* and will be swapped between ASCII and EBCDIC between
964
* positions 0x3b (ASCII semicolon) and 0x5e (EBCDIC semicolon).
965
* This should be the only -1 entry in tokens[256..511] on which the data
966
* swapper bases its trail byte permutation map (trailMap[]).
968
* It would be sufficient to increase tokenCount so that its lower 8 bits
969
* are at least 0x5e+1 to make room for swapping between the two semicolons.
970
* For values higher than 0x5e, the trail byte permutation map (trailMap[])
971
* should always be an identity map, where we do not need additional room.
974
tokenCount=(tokenCount+0xff)&~0xff;
975
if(!beQuiet && i<tokenCount) {
976
printf("number of tokens[] padding entries for data swapping: %lu\n", (unsigned long)(tokenCount-i));
978
for(; i<tokenCount; ++i) {
979
if((i&0xff)==NAME_SEPARATOR_CHAR) {
980
tokens[i]=-1; /* do not use NAME_SEPARATOR_CHAR as a second token byte */
982
tokens[i]=0; /* unused token for padding */
987
* Calculate the total size in bytes of the data including:
988
* - the offset to the token strings, uint32_t (4)
989
* - the offset to the group table, uint32_t (4)
990
* - the offset to the group strings, uint32_t (4)
991
* - the offset to the algorithmic names, uint32_t (4)
993
* - the number of tokens, uint16_t (2)
994
* - the token table, uint16_t[tokenCount] (2*tokenCount)
996
* - the token strings, each zero-terminated (tokenSize=(lineTop-groupTop)), 2-padded
998
* - the number of groups, uint16_t (2)
999
* - the group table, { uint16_t groupMSB, uint16_t offsetHigh, uint16_t offsetLow }[6*groupCount]
1001
* - the group strings (groupTop-groupBottom), 4-padded
1003
* - the size of the data for the algorithmic names
1005
tokenStringOffset=4+4+4+4+2+2*tokenCount;
1006
groupsOffset=(tokenStringOffset+(lineTop-groupTop)+1)&~1;
1007
groupStringOffset=groupsOffset+2+6*lineCount;
1008
algNamesOffset=(groupStringOffset+(groupTop-groupBottom)+3)&~3;
1010
offset=generateAlgorithmicData(NULL, storeOptions);
1011
size=algNamesOffset+offset;
1014
printf("size of the Unicode Names data:\n"
1015
"total data length %lu, token strings %lu, compressed strings %lu, algorithmic names %lu\n",
1016
(unsigned long)size, (unsigned long)(lineTop-groupTop),
1017
(unsigned long)(groupTop-groupBottom), (unsigned long)offset);
1020
/* write the data to the file */
1022
udata_write32(pData, tokenStringOffset);
1023
udata_write32(pData, groupsOffset);
1024
udata_write32(pData, groupStringOffset);
1025
udata_write32(pData, algNamesOffset);
1028
udata_write16(pData, (uint16_t)tokenCount);
1029
udata_writeBlock(pData, tokens, 2*tokenCount);
1032
udata_writeBlock(pData, stringStore+groupTop, lineTop-groupTop);
1033
if((lineTop-groupTop)&1) {
1035
udata_writePadding(pData, 1);
1039
udata_write16(pData, (uint16_t)lineCount);
1040
for(i=0; i<lineCount; ++i) {
1042
groupWords[0]=(uint16_t)lines[i].code;
1045
offset = (uint32_t)((lines[i].s - stringStore)-groupBottom);
1046
groupWords[1]=(uint16_t)(offset>>16);
1047
groupWords[2]=(uint16_t)(offset);
1048
udata_writeBlock(pData, groupWords, 6);
1052
udata_writeBlock(pData, stringStore+groupBottom, groupTop-groupBottom);
1054
/* 4-align the algorithmic names data */
1055
udata_writePadding(pData, algNamesOffset-(groupStringOffset+(groupTop-groupBottom)));
1057
generateAlgorithmicData(pData, storeOptions);
1060
dataLength=udata_finish(pData, &errorCode);
1061
if(U_FAILURE(errorCode)) {
1062
fprintf(stderr, "gennames: error %d writing the output file\n", errorCode);
1066
if(dataLength!=(long)size) {
1067
fprintf(stderr, "gennames: data length %ld != calculated size %lu\n",
1068
dataLength, (unsigned long)size);
1069
exit(U_INTERNAL_PROGRAM_ERROR);
1073
/* the structure for algorithmic names needs to be 4-aligned */
1074
typedef struct AlgorithmicRange {
1075
uint32_t rangeStart, rangeEnd;
1076
uint8_t algorithmType, algorithmVariant;
1081
generateAlgorithmicData(UNewDataMemory *pData, Options *storeOptions) {
1082
static char prefix[] = "CJK UNIFIED IDEOGRAPH-";
1083
# define PREFIX_LENGTH 23
1084
# define PREFIX_LENGTH_4 24
1085
uint32_t countAlgRanges;
1087
static AlgorithmicRange cjkExtA={
1090
sizeof(AlgorithmicRange)+PREFIX_LENGTH_4
1092
static AlgorithmicRange cjk={
1095
sizeof(AlgorithmicRange)+PREFIX_LENGTH_4
1097
static AlgorithmicRange cjkExtB={
1100
sizeof(AlgorithmicRange)+PREFIX_LENGTH_4
1104
"HANGUL SYLLABLE \0"
1106
"G\0GG\0N\0D\0DD\0R\0M\0B\0BB\0"
1107
"S\0SS\0\0J\0JJ\0C\0K\0T\0P\0H\0"
1109
"A\0AE\0YA\0YAE\0EO\0E\0YEO\0YE\0O\0"
1110
"WA\0WAE\0OE\0YO\0U\0WEO\0WE\0WI\0"
1113
"\0G\0GG\0GS\0N\0NJ\0NH\0D\0L\0LG\0LM\0"
1114
"LB\0LS\0LT\0LP\0LH\0M\0B\0BS\0"
1115
"S\0SS\0NG\0J\0C\0K\0T\0P\0H"
1118
static AlgorithmicRange hangul={
1121
sizeof(AlgorithmicRange)+6+sizeof(jamo)
1124
/* modulo factors, maximum 8 */
1125
/* 3 factors: 19, 21, 28, most-to-least-significant */
1126
static uint16_t hangulFactors[3]={
1134
if(ucdVersion>=UNI_5_1) {
1135
/* Unicode 5.1 and up has a longer CJK Unihan range than before */
1136
cjk.rangeEnd=0x9FC3;
1137
} else if(ucdVersion>=UNI_4_1) {
1138
/* Unicode 4.1 and up has a longer CJK Unihan range than before */
1139
cjk.rangeEnd=0x9FBB;
1142
/* number of ranges of algorithmic names */
1143
if(!storeOptions->storeNames) {
1145
} else if(ucdVersion>=UNI_3_1) {
1146
/* Unicode 3.1 and up has 4 ranges including CJK Extension B */
1148
} else if(ucdVersion>=UNI_3_0) {
1149
/* Unicode 3.0 has 3 ranges including CJK Extension A */
1152
/* Unicode 2.0 has 2 ranges including Hangul and CJK Unihan */
1157
udata_write32(pData, countAlgRanges);
1161
if(countAlgRanges==0) {
1167
* uint32_t rangeStart
1169
* uint8_t algorithmType
1170
* uint8_t algorithmVariant
1171
* uint16_t size of range data
1172
* uint8_t[size] data
1175
/* range 0: cjk extension a */
1176
if(countAlgRanges>=3) {
1178
udata_writeBlock(pData, &cjkExtA, sizeof(AlgorithmicRange));
1179
udata_writeString(pData, prefix, PREFIX_LENGTH);
1180
if(PREFIX_LENGTH<PREFIX_LENGTH_4) {
1181
udata_writePadding(pData, PREFIX_LENGTH_4-PREFIX_LENGTH);
1184
size+=sizeof(AlgorithmicRange)+PREFIX_LENGTH_4;
1190
udata_writeBlock(pData, &cjk, sizeof(AlgorithmicRange));
1191
udata_writeString(pData, prefix, PREFIX_LENGTH);
1192
if(PREFIX_LENGTH<PREFIX_LENGTH_4) {
1193
udata_writePadding(pData, PREFIX_LENGTH_4-PREFIX_LENGTH);
1196
size+=sizeof(AlgorithmicRange)+PREFIX_LENGTH_4;
1199
/* range 2: hangul syllables */
1201
udata_writeBlock(pData, &hangul, sizeof(AlgorithmicRange));
1202
udata_writeBlock(pData, hangulFactors, 6);
1203
udata_writeString(pData, jamo, sizeof(jamo));
1205
size+=sizeof(AlgorithmicRange)+6+sizeof(jamo);
1208
/* range 3: cjk extension b */
1209
if(countAlgRanges>=4) {
1211
udata_writeBlock(pData, &cjkExtB, sizeof(AlgorithmicRange));
1212
udata_writeString(pData, prefix, PREFIX_LENGTH);
1213
if(PREFIX_LENGTH<PREFIX_LENGTH_4) {
1214
udata_writePadding(pData, PREFIX_LENGTH_4-PREFIX_LENGTH);
1217
size+=sizeof(AlgorithmicRange)+PREFIX_LENGTH_4;
1224
/* helpers ------------------------------------------------------------------ */
1227
findToken(uint8_t *s, int16_t length) {
1230
for(i=0; i<(int16_t)tokenCount; ++i) {
1232
if(token>=0 && length==words[token].length && 0==uprv_memcmp(s, words[token].s, length)) {
1241
findWord(char *s, int16_t length) {
1244
for(i=0; i<wordCount; ++i) {
1245
if(length==words[i].length && 0==uprv_memcmp(s, words[i].s, length)) {
1254
addWord(char *s, int16_t length) {
1255
uint8_t *stringStart;
1258
if(wordCount==MAX_WORD_COUNT) {
1259
fprintf(stderr, "gennames: too many words\n");
1260
exit(U_BUFFER_OVERFLOW_ERROR);
1263
stringStart=allocWord(length);
1264
uprv_memcpy(stringStart, s, length);
1266
word=words+wordCount;
1269
* Initialize the weight with the costs for this token:
1270
* a zero-terminated string and a 16-bit offset.
1272
word->weight=-(length+1+2);
1274
word->length=length;
1275
word->s=stringStart;
1283
countWord(Word *word) {
1284
/* add to the weight the savings: the length of the word minus 1 byte for the token */
1285
word->weight+=word->length-1;
1290
addLine(uint32_t code, char *names[], int16_t lengths[], int16_t count) {
1291
uint8_t *stringStart;
1295
if(lineCount==MAX_LINE_COUNT) {
1296
fprintf(stderr, "gennames: too many lines\n");
1297
exit(U_BUFFER_OVERFLOW_ERROR);
1300
/* find the last non-empty name */
1301
while(count>0 && lengths[count-1]==0) {
1305
return; /* should not occur: caller should not have called */
1308
/* there will be (count-1) separator characters */
1312
/* add lengths of strings */
1314
length+=lengths[--i];
1317
/* allocate line memory */
1318
stringStart=allocLine(length);
1320
/* copy all strings into the line memory */
1321
length=0; /* number of chars copied so far */
1322
for(i=0; i<count; ++i) {
1324
stringStart[length++]=NAME_SEPARATOR_CHAR;
1327
uprv_memcpy(stringStart+length, names[i], lengths[i]);
1332
line=lines+lineCount;
1335
line->length=length;
1336
line->s=stringStart;
1340
/* prevent a character value that is actually in a name from becoming a token */
1342
tokens[stringStart[--length]]=-1;
1347
addGroup(uint32_t groupMSB, uint8_t *strings, int16_t length) {
1348
uint8_t *stringStart;
1351
if(lineCount==MAX_LINE_COUNT) {
1352
fprintf(stderr, "gennames: too many groups\n");
1353
exit(U_BUFFER_OVERFLOW_ERROR);
1356
/* store the line lengths first, then the strings */
1357
lineLengthsTop=(lineLengthsTop+1)/2;
1358
stringStart=allocLine(lineLengthsTop+length);
1359
uprv_memcpy(stringStart, lineLengths, lineLengthsTop);
1360
uprv_memcpy(stringStart+lineLengthsTop, strings, length);
1362
line=lines+lineCount;
1364
line->code=groupMSB;
1365
line->length=length;
1366
line->s=stringStart;
1372
addToken(uint8_t *s, int16_t length) {
1373
uint8_t *stringStart;
1375
stringStart=allocLine(length+1);
1376
uprv_memcpy(stringStart, s, length);
1377
stringStart[length]=0;
1379
return (uint32_t)(stringStart - stringStore);
1383
appendLineLength(int16_t length) {
1385
fprintf(stderr, "gennames: compressed line too long\n");
1386
exit(U_BUFFER_OVERFLOW_ERROR);
1390
appendLineLengthNibble((uint8_t)((length>>4)|12));
1392
appendLineLengthNibble((uint8_t)length);
1396
appendLineLengthNibble(uint8_t nibble) {
1397
if((lineLengthsTop&1)==0) {
1398
lineLengths[lineLengthsTop/2]=(uint8_t)(nibble<<4);
1400
lineLengths[lineLengthsTop/2]|=nibble&0xf;
1406
allocLine(int32_t length) {
1407
uint32_t top=lineTop+length;
1410
if(top>wordBottom) {
1411
fprintf(stderr, "gennames: out of memory\n");
1412
exit(U_MEMORY_ALLOCATION_ERROR);
1414
p=stringStore+lineTop;
1420
allocWord(uint32_t length) {
1421
uint32_t bottom=wordBottom-length;
1423
if(lineTop>bottom) {
1424
fprintf(stderr, "gennames: out of memory\n");
1425
exit(U_MEMORY_ALLOCATION_ERROR);
1428
return stringStore+bottom;
1432
* Hey, Emacs, please set the following:
1435
* indent-tabs-mode: nil