2
*******************************************************************************
4
* Copyright (C) 2000-2001, International Business Machines
5
* Corporation and others. All Rights Reserved.
7
*******************************************************************************
10
* tab size: 8 (not used)
13
* created on: 2000jul06
14
* created by: Markus W. Scherer
18
#include "unicode/utypes.h"
28
/*
 * Constants for the MBCS table builder.
 * NOTE(review): the enclosing enum declaration lines are not part of this listing.
 *
 * State flags are stored in MBCSData.stateFlags[]. The code compares (flags&0xf)
 * against MBCS_STATE_FLAG_DIRECT, so the low 4 bits carry the state type and
 * MBCS_STATE_FLAG_READY (16) is a separate bit above them.
 */
/* DIRECT: set for state 0 by MBCSInit() and by parseState() for rows that start with "initial" */
MBCS_STATE_FLAG_DIRECT=1,
29
/* SURROGATES: set by parseState() for rows that start with "surrogates" */
MBCS_STATE_FLAG_SURROGATES,
31
/* READY: or'ed in by sumUpStates() once stateOffsetSum[] has been computed for a state */
MBCS_STATE_FLAG_READY=16
35
MBCS_STAGE_2_BLOCK_SIZE=0x40, /* 64; 64=1<<6 for 6 bits in stage 2 */
36
MBCS_STAGE_2_BLOCK_SIZE_SHIFT=6, /* log2(MBCS_STAGE_2_BLOCK_SIZE) */
37
MBCS_STAGE_1_SIZE=0x440, /* 0x110000>>10, or 17*64 for one entry per 1k code points */
38
MBCS_STAGE_2_SIZE=0xfbc0, /* 0x10000-MBCS_STAGE_1_SIZE */
39
/* the fromUnicode functions refuse to allocate a new stage 2 block once stage2Top reaches this value */
MBCS_MAX_STAGE_2_TOP=MBCS_STAGE_2_SIZE,
40
/* number of whole stage 2 blocks that fit into MBCS_STAGE_2_SIZE entries */
MBCS_STAGE_2_MAX_BLOCKS=MBCS_STAGE_2_SIZE>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT,
42
MBCS_STAGE_2_ALL_UNASSIGNED_INDEX=0, /* stage 1 entry for the all-unassigned stage 2 block */
43
MBCS_STAGE_2_FIRST_ASSIGNED=MBCS_STAGE_2_BLOCK_SIZE, /* start of the first stage 2 block after the all-unassigned one */
45
/* capacity of MBCSData.stateTable[], stateFlags[] and stateOffsetSum[] */
MBCS_MAX_STATE_COUNT=128,
46
/* capacity of MBCSData.toUFallbacks[]; setFallback() fails once this many entries exist */
MBCS_MAX_FALLBACK_COUNT=1000
49
/*
 * Working data for building one MBCS converter table.
 * NOTE(review): this listing omits some members that the functions below use
 * (for example header and fromUBytes) as well as the closing line of the typedef.
 */
typedef struct MBCSData {
50
/* first member: the functions below cast the NewConverter pointer back to MBCSData */
NewConverter newConverter;
53
/* one row of 256 entries per state, indexed as stateTable[state][byte] */
int32_t stateTable[MBCS_MAX_STATE_COUNT][256];
54
/* per-state MBCS_STATE_FLAG_ values, and per-state offset totals computed by sumUpStates() */
uint32_t stateFlags[MBCS_MAX_STATE_COUNT],
55
stateOffsetSum[MBCS_MAX_STATE_COUNT];
56
/* unsorted list of toUnicode fallbacks (offset, codePoint); see findFallback()/setFallback() */
_MBCSToUFallback toUFallbacks[MBCS_MAX_FALLBACK_COUNT];
57
/* toUnicode results, allocated in MBCSProcessStates() and prefilled with 0xfffe ("unassigned") */
uint16_t *unicodeCodeUnits;
59
/* total returned by sumUpStates(); MBCSProcessStates() allocates that many code units */
int32_t countToUCodeUnits;
62
/* fromUnicode stage 1; every entry starts as MBCS_STAGE_2_ALL_UNASSIGNED_INDEX */
uint16_t stage1[MBCS_STAGE_1_SIZE];
63
uint16_t stage2Single[MBCS_STAGE_2_SIZE]; /* stage 2 for single-byte codepages */
64
uint32_t stage2[MBCS_STAGE_2_SIZE]; /* stage 2 for MBCS */
66
/* next free positions in stage 2 and stage 3, and the maximum bytes per character given to MBCSInit() */
uint32_t stage2Top, stage3Top, maxCharLength;
71
/*
 * Forward declarations of the NewConverter callbacks that MBCSInit() installs.
 * NOTE(review): the return-type lines and the last parameter lines of several
 * prototypes are missing from this listing.
 */
/* installed as newConverter.close */
MBCSClose(NewConverter *cnvData);
74
/* installed as newConverter.startMappings */
MBCSProcessStates(NewConverter *cnvData);
77
/* installed as newConverter.addToUnicode */
MBCSAddToUnicode(NewConverter *cnvData,
78
const uint8_t *bytes, int32_t length,
79
UChar32 c, uint32_t b,
83
/* installed as newConverter.isValid */
MBCSIsValid(NewConverter *cnvData,
84
const uint8_t *bytes, int32_t length,
88
/* installed as newConverter.addFromUnicode when maxCharLength==1 */
MBCSSingleAddFromUnicode(NewConverter *cnvData,
89
const uint8_t *bytes, int32_t length,
90
UChar32 c, uint32_t b,
94
/* the other newConverter.addFromUnicode implementation, presumably for maxCharLength>1 (see MBCSInit) */
MBCSAddFromUnicode(NewConverter *cnvData,
95
const uint8_t *bytes, int32_t length,
96
UChar32 c, uint32_t b,
100
/* installed as newConverter.finishMappings */
MBCSPostprocess(NewConverter *cnvData, const UConverterStaticData *staticData);
103
/* installed as newConverter.write */
MBCSWrite(NewConverter *cnvData, const UConverterStaticData *staticData, UNewDataMemory *pData);
105
/* implementation ----------------------------------------------------------- */
108
/*
 * Set up *mbcsData as an empty MBCS table builder.
 * mbcsData      - storage to initialize; it is zeroed first
 * maxCharLength - maximum number of bytes per character of the codepage
 * The function installs the NewConverter callbacks, marks state 0 as an initial
 * ("direct") state, and reserves the all-unassigned first blocks of the
 * fromUnicode stages.
 * NOTE(review): the declaration of i and the closing braces are not in this listing.
 */
MBCSInit(MBCSData *mbcsData, uint8_t maxCharLength) {
111
uprv_memset(mbcsData, 0, sizeof(MBCSData));
113
mbcsData->newConverter.close=MBCSClose;
114
mbcsData->newConverter.startMappings=MBCSProcessStates;
115
mbcsData->newConverter.isValid=MBCSIsValid;
116
mbcsData->newConverter.addToUnicode=MBCSAddToUnicode;
117
/* single-byte codepages get their own fromUnicode function */
if(maxCharLength==1) {
118
mbcsData->newConverter.addFromUnicode=MBCSSingleAddFromUnicode;
120
/* NOTE(review): presumably the else branch; the line between these two assignments is missing here */
mbcsData->newConverter.addFromUnicode=MBCSAddFromUnicode;
122
mbcsData->newConverter.finishMappings=MBCSPostprocess;
123
mbcsData->newConverter.write=MBCSWrite;
125
mbcsData->header.version[0]=4;
126
mbcsData->stateFlags[0]=MBCS_STATE_FLAG_DIRECT;
127
mbcsData->stage2Top=MBCS_STAGE_2_FIRST_ASSIGNED; /* after stage 1 and one all-unassigned stage 2 block */
128
mbcsData->stage3Top=16*maxCharLength; /* after one all-unassigned stage 3 block */
129
mbcsData->maxCharLength=maxCharLength;
130
mbcsData->header.flags=maxCharLength-1; /* outputType */
132
/* point all entries in stage 1 to the "all-unassigned" first block in stage 2 */
133
for(i=0; i<MBCS_STAGE_1_SIZE; ++i) {
134
mbcsData->stage1[i]=MBCS_STAGE_2_ALL_UNASSIGNED_INDEX;
139
/*
 * Allocate an MBCSData with uprv_malloc(), initialize it with MBCSInit(), and
 * return a pointer to its embedded NewConverter.
 * maxCharLength - maximum number of bytes per character, passed on to MBCSInit()
 * NOTE(review): no NULL check on the allocation is visible in this listing (one
 * line is missing between the allocation and the MBCSInit() call). Confirm that
 * MBCSInit() is never reached with a NULL pointer.
 */
MBCSOpen(uint8_t maxCharLength) {
140
MBCSData *mbcsData=(MBCSData *)uprv_malloc(sizeof(MBCSData));
142
MBCSInit(mbcsData, maxCharLength);
144
/* newConverter is the first member of MBCSData, so callers can hand this pointer back to the callbacks */
return &mbcsData->newConverter;
148
/*
 * NewConverter.close callback: release the buffers owned by the MBCSData behind cnvData.
 * unicodeCodeUnits and fromUBytes are freed with uprv_free() when they are not NULL.
 * NOTE(review): whether the MBCSData object itself is freed is not visible in this listing.
 */
MBCSClose(NewConverter *cnvData) {
149
MBCSData *mbcsData=(MBCSData *)cnvData;
151
if(mbcsData->unicodeCodeUnits!=NULL) {
152
uprv_free(mbcsData->unicodeCodeUnits);
154
if(mbcsData->fromUBytes!=NULL) {
155
uprv_free(mbcsData->fromUBytes);
162
/*
 * Scan past leading spaces and tabs in s.
 * Callers assign the result back to their parse position, e.g. s=skipWhitespace(s+1).
 * NOTE(review): the loop body and the return statement are missing from this listing.
 */
skipWhitespace(const char *s) {
163
while(*s==' ' || *s=='\t') {
170
* state table row grammar (ebnf-style):
171
* (whitespace is allowed between all tokens)
173
* row=[[firstentry ','] entry (',' entry)*]
174
* firstentry="initial" | "surrogates"
175
* (initial state (default for state 0), output is all surrogate pairs)
176
* entry=range [':' nextstate] ['.' action]
177
* range=number ['-' number]
180
* action='u' | 's' | 'p' | 'i'
181
* (unassigned, state change only, surrogate pair, illegal)
182
* number=(1- or 2-digit hexadecimal number)
185
/*
 * Parse one state table row (see the row grammar) from s into state[256].
 * All 256 entries start out as final/illegal; each parsed range overwrites its cells.
 * *pFlags receives MBCS_STATE_FLAG_DIRECT for an "initial" row, or
 * MBCS_STATE_FLAG_SURROGATES for a "surrogates" row (the latter only if *pFlags was 0).
 * The last visible statement returns NULL when *s is the terminating NUL and s
 * otherwise; MBCSAddState() prints a non-NULL result as the place of a parse error.
 * NOTE(review): many lines (the declarations of t and entry, the loop and branch
 * structure, closing braces) are missing from this listing, so the control flow
 * between the visible statements cannot be confirmed here.
 */
parseState(const char *s, int32_t state[256], uint32_t *pFlags) {
187
uint32_t start, end, i;
190
/* initialize the state: all illegal with U+ffff */
191
for(i=0; i<256; ++i) {
192
state[i]=MBCS_ENTRY_FINAL(0, MBCS_STATE_ILLEGAL, 0xffff);
195
/* skip leading white space */
198
/* is there an "initial" or "surrogates" directive? */
199
if(uprv_strncmp("initial", s, 7)==0) {
200
*pFlags=MBCS_STATE_FLAG_DIRECT;
201
s=skipWhitespace(s+7);
205
} else if(*pFlags==0 && uprv_strncmp("surrogates", s, 10)==0) {
206
*pFlags=MBCS_STATE_FLAG_SURROGATES;
207
s=skipWhitespace(s+10);
212
/* empty state row: all-illegal */
217
/* read an entry, the start of the range first */
219
/* byte values are hexadecimal; s==t below means that no digits were consumed */
start=uprv_strtoul(s, (char **)&t, 16);
220
if(s==t || 0xff<start) {
225
/* read the end of the range if there is one */
227
s=skipWhitespace(s+1);
228
end=uprv_strtoul(s, (char **)&t, 16);
229
if(s==t || end<start || 0xff<end) {
237
/* determine the state entries for this range */
238
if(*s!=':' && *s!='.') {
239
/* the default is: final state with valid entries */
240
entry=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_16, 0);
242
entry=MBCS_ENTRY_TRANSITION(0, 0);
244
/* get the next state, default to 0 */
245
s=skipWhitespace(s+1);
246
i=uprv_strtoul(s, (char **)&t, 16);
252
entry=MBCS_ENTRY_SET_STATE(entry, i);
256
/* get the state action, default to valid */
258
/* this is a final state */
259
entry=MBCS_ENTRY_SET_FINAL(entry);
261
s=skipWhitespace(s+1);
263
/* unassigned set U+fffe */
264
entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_UNASSIGNED, 0xfffe);
265
s=skipWhitespace(s+1);
267
/* NOTE(review): presumably the 'p' (surrogate pair) action; its test line is missing here */
if(*pFlags!=MBCS_STATE_FLAG_DIRECT) {
268
entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_VALID_16_PAIR);
270
entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_VALID_16);
272
s=skipWhitespace(s+1);
274
entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_CHANGE_ONLY);
275
s=skipWhitespace(s+1);
277
/* illegal set U+ffff */
278
entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_ILLEGAL, 0xffff);
279
s=skipWhitespace(s+1);
281
/* default to valid */
282
entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_VALID_16);
285
/* this is an intermediate state, nothing to do */
289
/* adjust "final valid" states according to the state flags */
290
if(MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16) {
295
case MBCS_STATE_FLAG_DIRECT:
296
/* set the valid-direct code point to "unassigned"==0xfffe */
297
entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_VALID_DIRECT_16, 0xfffe);
299
case MBCS_STATE_FLAG_SURROGATES:
300
entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_VALID_16_PAIR, 0);
307
/* set this entry for the range */
308
for(i=start; i<=end; ++i) {
315
/* NULL: the whole row was consumed; otherwise return the position where parsing stopped */
return *s==0 ? NULL : s;
321
/*
 * Parse the state table row in s into the next free row of the builder's state
 * table and count it.
 * cnvData - the NewConverter embedded in an MBCSData
 * s       - text of one state table row (see parseState())
 * An error is printed to stderr when MBCS_MAX_STATE_COUNT rows already exist,
 * and when parseState() reports a parse error position.
 * NOTE(review): the declaration of error, the test of parseState()'s result and
 * the return statements are missing from this listing.
 */
MBCSAddState(NewConverter *cnvData, const char *s) {
322
MBCSData *mbcsData=(MBCSData *)cnvData;
325
if(mbcsData->header.countStates==MBCS_MAX_STATE_COUNT) {
326
fprintf(stderr, "error: too many states (maximum %u)\n", MBCS_MAX_STATE_COUNT);
330
/* parse directly into row number countStates and its flags word */
error=parseState(s, mbcsData->stateTable[mbcsData->header.countStates],
331
&mbcsData->stateFlags[mbcsData->header.countStates]);
333
fprintf(stderr, "parse error in state definition at '%s'\n", error);
337
++mbcsData->header.countStates;
342
/*
 * Compute the toUnicode offsets for the whole state table.
 * For each state that is not yet flagged MBCS_STATE_FLAG_READY, the running sum
 * is stored into its VALID_16 and VALID_16_PAIR final entries and into its
 * transition entries; the state's total goes into stateOffsetSum[] and the state
 * is flagged READY. Afterwards the transition offsets of initial states other
 * than state 0 are raised by the totals of the preceding initial states.
 * Returns the grand total, which is also stored in mbcsData->countToUCodeUnits.
 * Prints "the state table contains loops" when some state never becomes ready.
 * NOTE(review): the declarations of entry, sum and sum2, the increments of sum
 * for final entries, and the error return are missing from this listing.
 */
sumUpStates(MBCSData *mbcsData) {
344
int state, cell, count;
345
UBool allStatesReady;
348
* Sum up the offsets for all states.
349
* In each final state (where there are only final entries),
350
* the offsets add up directly.
351
* In all other state table rows, for each transition entry to another state,
352
* the offsets sum of that state needs to be added.
353
* This is achieved in at most countStates iterations.
355
allStatesReady=FALSE;
356
/* repeat passes over all states until every state is ready or the pass budget is used up */
for(count=mbcsData->header.countStates; !allStatesReady && count>=0; --count) {
358
for(state=mbcsData->header.countStates-1; state>=0; --state) {
359
if(!(mbcsData->stateFlags[state]&MBCS_STATE_FLAG_READY)) {
360
allStatesReady=FALSE;
363
/* at first, add up only the final delta offsets to keep them <512 */
364
for(cell=0; cell<256; ++cell) {
365
entry=mbcsData->stateTable[state][cell];
366
if(MBCS_ENTRY_IS_FINAL(entry)) {
367
switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
368
case MBCS_STATE_VALID_16:
369
mbcsData->stateTable[state][cell]=MBCS_ENTRY_FINAL_SET_VALUE(entry, sum);
372
case MBCS_STATE_VALID_16_PAIR:
373
mbcsData->stateTable[state][cell]=MBCS_ENTRY_FINAL_SET_VALUE(entry, sum);
383
/* now, add up the delta offsets for the transitional entries */
384
for(cell=0; cell<256; ++cell) {
385
entry=mbcsData->stateTable[state][cell];
386
if(MBCS_ENTRY_IS_TRANSITION(entry)) {
387
/* a transition can only be resolved once its target state has its own sum */
if(mbcsData->stateFlags[MBCS_ENTRY_TRANSITION_STATE(entry)]&MBCS_STATE_FLAG_READY) {
388
mbcsData->stateTable[state][cell]=MBCS_ENTRY_TRANSITION_SET_OFFSET(entry, sum);
389
sum+=mbcsData->stateOffsetSum[MBCS_ENTRY_TRANSITION_STATE(entry)];
391
/* that next state does not have a sum yet, we cannot finish the one for this state */
399
mbcsData->stateOffsetSum[state]=sum;
400
mbcsData->stateFlags[state]|=MBCS_STATE_FLAG_READY;
406
if(!allStatesReady) {
407
fprintf(stderr, "error: the state table contains loops\n");
412
* For all "direct" (i.e., initial) states>0,
413
* the offsets need to be increased by the sum of
414
* the previous initial states.
416
sum=mbcsData->stateOffsetSum[0];
417
for(state=1; state<(int)mbcsData->header.countStates; ++state) {
418
if((mbcsData->stateFlags[state]&0xf)==MBCS_STATE_FLAG_DIRECT) {
420
sum+=mbcsData->stateOffsetSum[state];
421
for(cell=0; cell<256; ++cell) {
422
entry=mbcsData->stateTable[state][cell];
423
if(MBCS_ENTRY_IS_TRANSITION(entry)) {
424
/* NOTE(review): sum2 is presumably the value of sum before this state was added; its definition is missing here */
mbcsData->stateTable[state][cell]=MBCS_ENTRY_TRANSITION_ADD_OFFSET(entry, sum2);
430
printf("the total number of offsets is 0x%lx=%ld\n",
431
(unsigned long)sum, (long)sum);
434
/* round up to the next even number to have the following data 32-bit-aligned */
436
return mbcsData->countToUCodeUnits=sum;
440
/*
 * NewConverter.startMappings callback: validate the finished state table and
 * allocate the mapping arrays.
 * Steps visible in this listing:
 *  - check every entry's next state against countStates and against the
 *    initial-state ("direct") rules, printing an error for each violation;
 *  - detect SI/SO codepages (state 1 is an initial state), check their
 *    requirements and set header.flags to MBCS_OUTPUT_2_SISO;
 *  - report any other unexpected initial state;
 *  - call sumUpStates(), then allocate unicodeCodeUnits with that many 16-bit
 *    units and prefill them with 0xfffe;
 *  - allocate fromUBytes and zero its first 64 bytes.
 * NOTE(review): the declarations of state and cell, the return statements, the
 * allocation size for single-byte codepages and several branch lines are
 * missing from this listing.
 */
MBCSProcessStates(NewConverter *cnvData) {
441
MBCSData *mbcsData=(MBCSData *)cnvData;
442
int32_t i, entry, sum;
446
* first make sure that all "next state" values are within limits
447
* and that all next states after final ones have the "direct"
448
* flag of initial states
450
for(state=mbcsData->header.countStates-1; state>=0; --state) {
451
for(cell=0; cell<256; ++cell) {
452
entry=mbcsData->stateTable[state][cell];
453
if((uint8_t)MBCS_ENTRY_STATE(entry)>=mbcsData->header.countStates) {
454
fprintf(stderr, "error: state table entry [%x][%x] has a next state of %x that is too high\n",
455
state, cell, MBCS_ENTRY_STATE(entry));
458
if(MBCS_ENTRY_IS_FINAL(entry) && (mbcsData->stateFlags[MBCS_ENTRY_STATE(entry)]&0xf)!=MBCS_STATE_FLAG_DIRECT) {
459
fprintf(stderr, "error: state table entry [%x][%x] is final but has a non-initial next state of %x\n",
460
state, cell, MBCS_ENTRY_STATE(entry));
462
} else if(MBCS_ENTRY_IS_TRANSITION(entry) && (mbcsData->stateFlags[MBCS_ENTRY_STATE(entry)]&0xf)==MBCS_STATE_FLAG_DIRECT) {
463
fprintf(stderr, "error: state table entry [%x][%x] is not final but has an initial next state of %x\n",
464
state, cell, MBCS_ENTRY_STATE(entry));
470
/* is this an SI/SO (like EBCDIC-stateful) state table? */
471
if(mbcsData->header.countStates>=2 && (mbcsData->stateFlags[1]&0xf)==MBCS_STATE_FLAG_DIRECT) {
472
if(mbcsData->maxCharLength!=2) {
473
fprintf(stderr, "error: SI/SO codepages must have max 2 bytes/char (not %x)\n", mbcsData->maxCharLength);
476
if(mbcsData->header.countStates<3) {
477
fprintf(stderr, "error: SI/SO codepages must have at least 3 states (not %x)\n", mbcsData->header.countStates);
480
/* are the SI/SO all in the right places? */
481
/* bytes 0xe and 0xf must be state-change-only entries to state 1 and state 0, in both state 0 and state 1 */
if( mbcsData->stateTable[0][0xe]==MBCS_ENTRY_FINAL(1, MBCS_STATE_CHANGE_ONLY, 0) &&
482
mbcsData->stateTable[0][0xf]==MBCS_ENTRY_FINAL(0, MBCS_STATE_CHANGE_ONLY, 0) &&
483
mbcsData->stateTable[1][0xe]==MBCS_ENTRY_FINAL(1, MBCS_STATE_CHANGE_ONLY, 0) &&
484
mbcsData->stateTable[1][0xf]==MBCS_ENTRY_FINAL(0, MBCS_STATE_CHANGE_ONLY, 0)
486
mbcsData->header.flags=MBCS_OUTPUT_2_SISO;
488
fprintf(stderr, "error: SI/SO codepages must have in states 0 and 1 transitions e:1.s, f:0.s\n");
496
/* check that no unexpected state is a "direct" one */
497
while(state<(int)mbcsData->header.countStates) {
498
if((mbcsData->stateFlags[state]&0xf)==MBCS_STATE_FLAG_DIRECT) {
499
fprintf(stderr, "error: state %d is 'initial' - not supported except for SI/SO codepages\n", state);
505
sum=sumUpStates(mbcsData);
510
/* allocate the code unit array and prefill it with "unassigned" values */
512
mbcsData->unicodeCodeUnits=(uint16_t *)uprv_malloc(sum*sizeof(uint16_t));
513
if(mbcsData->unicodeCodeUnits==NULL) {
514
fprintf(stderr, "error: out of memory allocating %ld 16-bit code units\n",
518
for(i=0; i<sum; ++i) {
519
mbcsData->unicodeCodeUnits[i]=0xfffe;
523
/* allocate the codepage mappings and preset the first 16 characters to 0 */
524
if(mbcsData->maxCharLength==1) {
525
/* allocate 64k 16-bit results for single-byte codepages */
528
/* allocate 1M * maxCharLength bytes for at most 1M mappings */
529
sum=0x100000*mbcsData->maxCharLength;
531
mbcsData->fromUBytes=(uint8_t *)uprv_malloc(sum);
532
if(mbcsData->fromUBytes==NULL) {
533
fprintf(stderr, "error: out of memory allocating %ldMB for target mappings\n",
537
/* initialize the all-unassigned first stage 3 block */
538
uprv_memset(mbcsData->fromUBytes, 0, 64);
543
/*
 * Find the toUnicode fallback recorded for this offset.
 * mbcsData - builder whose toUFallbacks[] list is searched
 * offset   - offset to look for in the list
 * Returns the index into toUFallbacks[], or -1 if there is no such fallback.
 * The search covers the first header.countToUFallbacks entries.
 * NOTE(review): the declarations of i and limit and the return statements are
 * missing from this listing.
 */
545
findFallback(MBCSData *mbcsData, uint32_t offset) {
546
_MBCSToUFallback *toUFallbacks;
549
limit=mbcsData->header.countToUFallbacks;
551
/* shortcut: most codepages do not have fallbacks from codepage to Unicode */
555
/* do a linear search for the fallback mapping (the table is not yet sorted) */
556
toUFallbacks=mbcsData->toUFallbacks;
557
for(i=0; i<limit; ++i) {
558
if(offset==toUFallbacks[i].offset) {
565
/*
 * Record c as the toUnicode fallback for this offset.
 * mbcsData - builder whose toUFallbacks[] list is updated
 * offset   - offset that the fallback belongs to
 * c        - fallback code point
 * An existing fallback for the same offset is overwritten; otherwise a new entry
 * is appended, unless MBCS_MAX_FALLBACK_COUNT entries already exist, in which
 * case an error is printed.
 * Returns TRUE for success.
 * NOTE(review): the branch lines around the visible statements and the return
 * statements are missing from this listing.
 */
567
setFallback(MBCSData *mbcsData, uint32_t offset, UChar32 c) {
568
int32_t i=findFallback(mbcsData, offset);
570
/* if there is already a fallback for this offset, then overwrite it */
571
mbcsData->toUFallbacks[i].codePoint=c;
574
/* if there is no fallback for this offset, then add one */
575
i=mbcsData->header.countToUFallbacks;
576
if(i>=MBCS_MAX_FALLBACK_COUNT) {
577
/* %lx requires an unsigned long argument, so c (a 32-bit UChar32) must be cast */
fprintf(stderr, "error: too many toUnicode fallbacks, currently at: U+%lx\n", (unsigned long)c);
580
mbcsData->toUFallbacks[i].offset=offset;
581
mbcsData->toUFallbacks[i].codePoint=c;
582
mbcsData->header.countToUFallbacks=i+1;
588
/*
 * Remove the toUnicode fallback for this offset, if there is one.
 * mbcsData - builder whose toUFallbacks[] list is updated
 * offset   - offset whose fallback is to be removed
 * Returns the code point of the removed fallback, or -1 if there was none.
 * The vacated slot is filled with the last list entry and the count is
 * decremented, so the order of the list is not preserved.
 * NOTE(review): the declarations of limit and old, the test of findFallback()'s
 * result and the return statements are missing from this listing.
 */
590
removeFallback(MBCSData *mbcsData, uint32_t offset) {
591
int32_t i=findFallback(mbcsData, offset);
593
_MBCSToUFallback *toUFallbacks;
596
toUFallbacks=mbcsData->toUFallbacks;
597
limit=mbcsData->header.countToUFallbacks;
598
old=(int32_t)toUFallbacks[i].codePoint;
600
/* copy the last fallback entry here to keep the list contiguous */
601
toUFallbacks[i].offset=toUFallbacks[limit-1].offset;
602
toUFallbacks[i].codePoint=toUFallbacks[limit-1].codePoint;
603
mbcsData->header.countToUFallbacks=limit-1;
611
* isFallback is almost a boolean:
612
* 1 (TRUE) this is a fallback mapping
613
* 0 (FALSE) this is a precise mapping
614
* -1 the precision of this mapping is not specified
617
/*
 * NewConverter.addToUnicode callback: record the mapping from a codepage byte
 * sequence to the code point c.
 * cnvData - the NewConverter embedded in an MBCSData
 * bytes   - the byte sequence; length is the number of bytes
 * c       - Unicode code point, assumed to be <=0x10ffff
 * b       - the same bytes as one integer, used only in messages
 * The function walks the state table with the bytes. Depending on the action of
 * the final entry it stores c in the state table entry itself ("direct"
 * actions) or in unicodeCodeUnits[] at the accumulated offset (VALID_16 and
 * VALID_16_PAIR), and it reports illegal, unassigned, state-change-only and
 * duplicate byte sequences on stderr.
 * Every fprintf() call casts c to unsigned long: the %lx conversions require
 * that type, and c is a 32-bit UChar32.
 * NOTE(review): the isFallback parameter line, the declarations of state,
 * offset and real, the loop and branch structure and the return statements are
 * missing from this listing. The %hu conversions print state, whose declaration
 * is not visible here, so confirm its type.
 */
MBCSAddToUnicode(NewConverter *cnvData,
618
const uint8_t *bytes, int32_t length,
619
UChar32 c, uint32_t b,
621
MBCSData *mbcsData=(MBCSData *)cnvData;
623
int32_t i=0, entry, old;
626
if(mbcsData->header.countStates==0) {
627
fprintf(stderr, "error: there is no state information!\n");
631
/* for SI/SO (like EBCDIC-stateful), double-byte sequences start in state 1 */
632
if(length==2 && (mbcsData->header.flags&0xff)==MBCS_OUTPUT_2_SISO) {
637
* Walk down the state table like in conversion,
638
* much like getNextUChar().
639
* We assume that c<=0x10ffff.
642
entry=mbcsData->stateTable[state][bytes[i++]];
643
if(MBCS_ENTRY_IS_TRANSITION(entry)) {
645
fprintf(stderr, "error: byte sequence too short, ends in non-final state %hu: 0x%02lx (U+%lx)\n",
646
state, (unsigned long)b, (unsigned long)c);
649
state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
650
offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
653
fprintf(stderr, "error: byte sequence too long by %d bytes, final state %hu: 0x%02lx (U+%lx)\n",
654
(length-i), state, (unsigned long)b, (unsigned long)c);
657
switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
658
case MBCS_STATE_ILLEGAL:
659
fprintf(stderr, "error: byte sequence ends in illegal state at U+%04lx<->0x%02lx\n",
660
(unsigned long)c, (unsigned long)b);
662
case MBCS_STATE_CHANGE_ONLY:
663
fprintf(stderr, "error: byte sequence ends in state-change-only at U+%04lx<->0x%02lx\n",
664
(unsigned long)c, (unsigned long)b);
666
case MBCS_STATE_UNASSIGNED:
667
fprintf(stderr, "error: byte sequence ends in unassigned state at U+%04lx<->0x%02lx\n",
668
(unsigned long)c, (unsigned long)b);
670
case MBCS_STATE_FALLBACK_DIRECT_16:
671
case MBCS_STATE_VALID_DIRECT_16:
672
case MBCS_STATE_FALLBACK_DIRECT_20:
673
case MBCS_STATE_VALID_DIRECT_20:
674
if(MBCS_ENTRY_SET_STATE(entry, 0)!=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, 0xfffe)) {
675
/* the "direct" action's value is not "valid-direct-16-unassigned" any more */
676
if(MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_DIRECT_16 || MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_FALLBACK_DIRECT_16) {
677
old=MBCS_ENTRY_FINAL_VALUE(entry);
679
old=0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
682
fprintf(stderr, "error: duplicate codepage byte sequence at U+%04lx<->0x%02lx see U+%04lx\n",
683
(unsigned long)c, (unsigned long)b, (long)old);
686
fprintf(stderr, "duplicate codepage byte sequence at U+%04lx<->0x%02lx see U+%04lx\n",
687
(unsigned long)c, (unsigned long)b, (long)old);
690
* Continue after the above warning
691
* if the precision of the mapping is unspecified.
694
/* reassign the correct action code */
695
entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, (MBCS_STATE_VALID_DIRECT_16+(isFallback>0 ? 2 : 0)+(c>=0x10000 ? 1 : 0)));
697
/* put the code point into bits 22..7 for BMP, c-0x10000 into 26..7 for others */
699
entry=MBCS_ENTRY_FINAL_SET_VALUE(entry, c);
701
entry=MBCS_ENTRY_FINAL_SET_VALUE(entry, c-0x10000);
703
mbcsData->stateTable[state][bytes[i-1]]=entry;
705
case MBCS_STATE_VALID_16:
706
/* bits 26..16 are not used, 0 */
707
/* bits 15..7 contain the final offset delta to one 16-bit code unit */
708
offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
709
/* check that this byte sequence is still unassigned */
710
if((old=mbcsData->unicodeCodeUnits[offset])!=0xfffe || (old=removeFallback(mbcsData, offset))!=-1) {
712
fprintf(stderr, "error: duplicate codepage byte sequence at U+%04lx<->0x%02lx see U+%04lx\n",
713
(unsigned long)c, (unsigned long)b, (long)old);
716
fprintf(stderr, "duplicate codepage byte sequence at U+%04lx<->0x%02lx see U+%04lx\n",
717
(unsigned long)c, (unsigned long)b, (long)old);
721
fprintf(stderr, "error: code point does not fit into valid-16-bit state at U+%04lx<->0x%02lx\n",
722
(unsigned long)c, (unsigned long)b);
726
/* assign only if there is no precise mapping */
727
if(mbcsData->unicodeCodeUnits[offset]==0xfffe) {
728
return setFallback(mbcsData, offset, c);
731
mbcsData->unicodeCodeUnits[offset]=(uint16_t)c;
734
case MBCS_STATE_VALID_16_PAIR:
735
/* bits 26..16 are not used, 0 */
736
/* bits 15..7 contain the final offset delta to two 16-bit code units */
737
offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
738
/* check that this byte sequence is still unassigned */
739
old=mbcsData->unicodeCodeUnits[offset];
744
} else if(old<=0xdfff) {
745
real=0x10000+((old&0x3ff)<<10)+((mbcsData->unicodeCodeUnits[offset+1])&0x3ff);
746
} else /* old<=0xe001 */ {
747
real=mbcsData->unicodeCodeUnits[offset+1];
750
fprintf(stderr, "error: duplicate codepage byte sequence at U+%04lx<->0x%02lx see U+%04lx\n",
751
(unsigned long)c, (unsigned long)b, (long)real);
754
fprintf(stderr, "duplicate codepage byte sequence at U+%04lx<->0x%02lx see U+%04lx\n",
755
(unsigned long)c, (unsigned long)b, (long)real);
759
/* assign only if there is no precise mapping */
760
if(old<=0xdbff || old==0xe000) {
762
} else if(c<=0xffff) {
763
/* set a BMP fallback code point as a pair with 0xe001 */
764
mbcsData->unicodeCodeUnits[offset++]=0xe001;
765
mbcsData->unicodeCodeUnits[offset]=(uint16_t)c;
767
/* set a fallback surrogate pair with two second surrogates */
768
mbcsData->unicodeCodeUnits[offset++]=(uint16_t)(0xdbc0+(c>>10));
769
mbcsData->unicodeCodeUnits[offset]=(uint16_t)(0xdc00+(c&0x3ff));
773
/* set a BMP code point */
774
mbcsData->unicodeCodeUnits[offset]=(uint16_t)c;
775
} else if(c<=0xffff) {
776
/* set a BMP code point above 0xd800 as a pair with 0xe000 */
777
mbcsData->unicodeCodeUnits[offset++]=0xe000;
778
mbcsData->unicodeCodeUnits[offset]=(uint16_t)c;
780
/* set a surrogate pair */
781
mbcsData->unicodeCodeUnits[offset++]=(uint16_t)(0xd7c0+(c>>10));
782
mbcsData->unicodeCodeUnits[offset]=(uint16_t)(0xdc00+(c&0x3ff));
787
/* reserved, must never occur */
788
fprintf(stderr, "internal error: byte sequence reached reserved action code, entry0x%02lx: 0x%02lx (U+%lx)\n",
789
(unsigned long)entry, (unsigned long)b, (unsigned long)c);
798
/*
 * NewConverter.isValid callback: is this byte sequence valid?
 * (this is almost the same as MBCSAddToUnicode())
 * cnvData - the NewConverter embedded in an MBCSData
 * bytes   - the byte sequence; length is the number of bytes
 * The function walks the state table with the bytes and prints an error to
 * stderr when the sequence is too short or too long, or when it ends in an
 * illegal, state-change-only, unassigned or reserved entry.
 * NOTE(review): the last parameter line (the messages print a value b), the
 * declarations of state, offset, i and entry, the loop structure and the return
 * statements are missing from this listing.
 */
800
MBCSIsValid(NewConverter *cnvData,
801
const uint8_t *bytes, int32_t length,
803
MBCSData *mbcsData=(MBCSData *)cnvData;
808
if(mbcsData->header.countStates==0) {
809
fprintf(stderr, "error: there is no state information!\n");
813
/* for SI/SO (like EBCDIC-stateful), double-byte sequences start in state 1 */
814
if(length==2 && (mbcsData->header.flags&0xff)==MBCS_OUTPUT_2_SISO) {
819
* Walk down the state table like in conversion,
820
* much like getNextUChar().
821
* We assume that c<=0x10ffff.
824
entry=mbcsData->stateTable[state][bytes[i++]];
825
if(MBCS_ENTRY_IS_TRANSITION(entry)) {
827
fprintf(stderr, "error: byte sequence too short, ends in non-final state %hu: 0x%02lx\n",
828
state, (unsigned long)b);
831
state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
832
offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
835
fprintf(stderr, "error: byte sequence too long by %d bytes, final state %hu: 0x%02lx\n",
836
(length-i), state, (unsigned long)b);
839
switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
840
case MBCS_STATE_ILLEGAL:
841
fprintf(stderr, "error: byte sequence ends in illegal state: 0x%02lx\n",
844
case MBCS_STATE_CHANGE_ONLY:
845
fprintf(stderr, "error: byte sequence ends in state-change-only: 0x%02lx\n",
848
case MBCS_STATE_UNASSIGNED:
849
fprintf(stderr, "error: byte sequence ends in unassigned state: 0x%02lx\n",
852
/* all six "direct" and "valid" actions share one case body, which is missing from this listing */
case MBCS_STATE_FALLBACK_DIRECT_16:
853
case MBCS_STATE_VALID_DIRECT_16:
854
case MBCS_STATE_FALLBACK_DIRECT_20:
855
case MBCS_STATE_VALID_DIRECT_20:
856
case MBCS_STATE_VALID_16:
857
case MBCS_STATE_VALID_16_PAIR:
860
/* reserved, must never occur */
861
fprintf(stderr, "internal error: byte sequence reached reserved action code, entry0x%02lx: 0x%02lx\n",
862
(long)entry, (unsigned long)b);
870
/*
 * NewConverter.addFromUnicode callback for single-byte codepages: record the
 * mapping from code point c to the byte b.
 * cnvData - the NewConverter embedded in an MBCSData
 * bytes   - the byte sequence; length is the number of bytes
 * c       - Unicode code point, assumed to be <=0x10ffff
 * b       - the codepage byte value
 * The function allocates stage 2 and stage 3 blocks on demand, writes a 16-bit
 * result (b combined with one of the flag values 0xf00, 0xc00 or 0x800) into
 * stage 3, and reports overflow and duplicate code points on stderr.
 * Every fprintf() call casts c to unsigned long: the %04lx conversions require
 * that type, and c is a 32-bit UChar32.
 * NOTE(review): the isFallback parameter line, the declarations of p, index and
 * old, the computation of the stage 1 index, several branch lines and the
 * return statements are missing from this listing.
 */
MBCSSingleAddFromUnicode(NewConverter *cnvData,
871
const uint8_t *bytes, int32_t length,
872
UChar32 c, uint32_t b,
874
MBCSData *mbcsData=(MBCSData *)cnvData;
880
* Walk down the triple-stage compact array ("trie") and
881
* allocate parts as necessary.
882
* Note that the first stage 2 and 3 blocks are reserved for all-unassigned mappings.
883
* We assume that length<=maxCharLength and that c<=0x10ffff.
886
/* inspect stage 1 */
888
if(mbcsData->stage1[index]==MBCS_STAGE_2_ALL_UNASSIGNED_INDEX) {
889
/* allocate another block in stage 2 */
890
if(mbcsData->stage2Top>=MBCS_MAX_STAGE_2_TOP) {
891
fprintf(stderr, "error: too many stage 2 entries at U+%04lx<->0x%02lx\n",
892
(unsigned long)c, (unsigned long)b);
897
* each stage 2 block contains 64 16-bit words:
898
* 6 code point bits 9..4 with 1 stage 3 index
900
mbcsData->stage1[index]=(uint16_t)mbcsData->stage2Top;
901
mbcsData->stage2Top+=MBCS_STAGE_2_BLOCK_SIZE;
904
/* inspect stage 2 */
905
index=(uint32_t)mbcsData->stage1[index]+((c>>4)&0x3f);
906
if(mbcsData->stage2Single[index]==0) {
907
/* allocate another block in stage 3 */
908
if(mbcsData->stage3Top>=0x10000) {
909
fprintf(stderr, "error: too many code points at U+%04lx<->0x%02lx\n",
910
(unsigned long)c, (unsigned long)b);
913
/* each block has 16 uint16_t entries */
914
mbcsData->stage2Single[index]=(uint16_t)mbcsData->stage3Top;
915
uprv_memset(mbcsData->fromUBytes+2*mbcsData->stage3Top, 0, 32);
916
mbcsData->stage3Top+=16;
919
/* write the codepage entry into stage 3 and get the previous entry */
920
p=(uint16_t *)mbcsData->fromUBytes+mbcsData->stage2Single[index]+(c&0xf);
923
*p=(uint16_t)(0xf00|b);
924
} else if(IS_PRIVATE_USE(c)) {
925
*p=(uint16_t)(0xc00|b);
927
*p=(uint16_t)(0x800|b);
930
/* check that this Unicode code point was still unassigned */
933
fprintf(stderr, "error: duplicate Unicode code point at U+%04lx<->0x%02lx see 0x%02x\n",
934
(unsigned long)c, (unsigned long)b, old);
937
fprintf(stderr, "duplicate Unicode code point at U+%04lx<->0x%02lx see 0x%02x\n",
938
(unsigned long)c, (unsigned long)b, old);
940
/* continue after the above warning if the precision of the mapping is unspecified */
947
/*
 * NewConverter.addFromUnicode callback for multi-byte codepages: record the
 * mapping from code point c to the codepage bytes b.
 * cnvData - the NewConverter embedded in an MBCSData
 * bytes   - the byte sequence; length is the number of bytes
 * c       - Unicode code point, assumed to be <=0x10ffff
 * b       - the same bytes as one integer
 * The function rejects mappings to SI (0xf) or SO (0xe) for SI/SO codepages,
 * allocates stage 2 and stage 3 blocks on demand, writes b into stage 3 with
 * maxCharLength bytes per entry, reports duplicates on stderr, and sets the
 * "assigned" flag for c in the upper 16 bits of the stage 2 entry.
 * Every fprintf() call casts c to unsigned long: the %04lx conversions require
 * that type, and c is a 32-bit UChar32.
 * NOTE(review): the isFallback parameter line, the declarations of p, index and
 * old, the computation of the stage 1 index, most of the switch cases and the
 * return statements are missing from this listing.
 */
MBCSAddFromUnicode(NewConverter *cnvData,
948
const uint8_t *bytes, int32_t length,
949
UChar32 c, uint32_t b,
951
MBCSData *mbcsData=(MBCSData *)cnvData;
955
if( (mbcsData->header.flags&0xff)==MBCS_OUTPUT_2_SISO &&
956
(*bytes==0xe || *bytes==0xf)
958
fprintf(stderr, "error: illegal mapping to SI or SO for SI/SO codepage: U+%04lx<->0x%02lx\n",
959
(unsigned long)c, (unsigned long)b);
963
* Walk down the triple-stage compact array ("trie") and
964
* allocate parts as necessary.
965
* Note that the first stage 2 and 3 blocks are reserved for
966
* all-unassigned mappings.
967
* We assume that length<=maxCharLength and that c<=0x10ffff.
970
/* inspect stage 1 */
972
if(mbcsData->stage1[index]==MBCS_STAGE_2_ALL_UNASSIGNED_INDEX) {
973
/* allocate another block in stage 2 */
974
if(mbcsData->stage2Top>=MBCS_MAX_STAGE_2_TOP) {
975
fprintf(stderr, "error: too many stage 2 entries at U+%04lx<->0x%02lx\n",
976
(unsigned long)c, (unsigned long)b);
981
* each stage 2 block contains 64 32-bit words:
982
* 6 code point bits 9..4 with value with bits 31..16 "assigned" flags and bits 15..0 stage 3 index
984
mbcsData->stage1[index]=(uint16_t)mbcsData->stage2Top;
985
mbcsData->stage2Top+=MBCS_STAGE_2_BLOCK_SIZE;
988
/* inspect stage 2 */
989
index=mbcsData->stage1[index]+((c>>4)&0x3f);
990
if(mbcsData->stage2[index]==0) {
991
/* allocate another block in stage 3 */
992
if(mbcsData->stage3Top>=0x100000*mbcsData->maxCharLength) {
993
fprintf(stderr, "error: too many code points at U+%04lx<->0x%02lx\n",
994
(unsigned long)c, (unsigned long)b);
997
/* each block has 16*maxCharLength bytes */
998
mbcsData->stage2[index]=(mbcsData->stage3Top/16)/mbcsData->maxCharLength;
999
uprv_memset(mbcsData->fromUBytes+mbcsData->stage3Top, 0, 16*mbcsData->maxCharLength);
1000
mbcsData->stage3Top+=16*mbcsData->maxCharLength;
1003
/* write the codepage bytes into stage 3 and get the previous bytes */
1005
/* the low 16 bits of the stage 2 entry are the stage 3 block index; each block holds 16 entries */
p=mbcsData->fromUBytes+(16*(uint32_t)(uint16_t)mbcsData->stage2[index]+(c&0xf))*mbcsData->maxCharLength;
1006
switch(mbcsData->maxCharLength) {
1009
*(uint16_t *)p=(uint16_t)b;
1012
old=(uint32_t)*p<<16;
1013
*p++=(uint8_t)(b>>16);
1014
old|=(uint32_t)*p<<8;
1015
*p++=(uint8_t)(b>>8);
1024
/* will never occur */
1028
/* check that this Unicode code point was still unassigned */
1029
if((mbcsData->stage2[index]&(1UL<<(16+(c&0xf))))!=0 || old!=0) {
1031
fprintf(stderr, "error: duplicate Unicode code point at U+%04lx<->0x%02lx see 0x%02lx\n",
1032
(unsigned long)c, (unsigned long)b, (unsigned long)old);
1034
} else if(VERBOSE) {
1035
fprintf(stderr, "duplicate Unicode code point at U+%04lx<->0x%02lx see 0x%02lx\n",
1036
(unsigned long)c, (unsigned long)b, (unsigned long)old);
1038
/* continue after the above warning if the precision of the mapping is unspecified */
1042
/* set the "assigned" flag */
1043
mbcsData->stage2[index]|=(1UL<<(16+(c&0xf)));
1050
/*
 * Comparison function for two _MBCSToUFallback entries, ordering them by offset.
 * fb1, fb2 - pointers to the entries to compare
 * Returns a negative value, 0 or a positive value when the first offset is
 * less than, equal to or greater than the second.
 * The offsets are compared instead of subtracted: the stored offsets come from
 * uint32_t values (see setFallback()), so a difference can wrap around and
 * yield the wrong sign after conversion to the return type.
 * NOTE(review): the return-type line and the closing brace are missing from this listing.
 */
compareFallbacks(const void *fb1, const void *fb2) {
1051
return (((const _MBCSToUFallback *)fb1)->offset>((const _MBCSToUFallback *)fb2)->offset)-
(((const _MBCSToUFallback *)fb1)->offset<((const _MBCSToUFallback *)fb2)->offset);
1055
* This function tries to compact toUnicode tables for 2-byte codepages
1056
* by finding lead bytes with all-unassigned trail bytes and adding another state
1060
compactToUnicode2(MBCSData *mbcsData) {
1061
int32_t (*oldStateTable)[256];
1062
uint16_t count[256];
1063
uint16_t *oldUnicodeCodeUnits;
1064
int32_t entry, offset, oldOffset, trailOffset, oldTrailOffset, savings, sum;
1065
int32_t i, j, leadState, trailState, newState, fallback;
1068
/* find the lead state */
1069
if((mbcsData->header.flags&0xff)==MBCS_OUTPUT_2_SISO) {
1070
/* use the DBCS lead state for SI/SO codepages */
1076
/* find the main trail state: the most used target state */
1077
uprv_memset(count, 0, sizeof(count));
1078
for(i=0; i<256; ++i) {
1079
entry=mbcsData->stateTable[leadState][i];
1080
if(MBCS_ENTRY_IS_TRANSITION(entry)) {
1081
++count[MBCS_ENTRY_TRANSITION_STATE(entry)];
1085
for(i=1; i<(int)mbcsData->header.countStates; ++i) {
1086
if(count[i]>count[trailState]) {
1091
/* count possible savings from lead bytes with all-unassigned results in all trail bytes */
1092
uprv_memset(count, 0, sizeof(count));
1094
/* for each lead byte */
1095
for(i=0; i<256; ++i) {
1096
entry=mbcsData->stateTable[leadState][i];
1097
if(MBCS_ENTRY_IS_TRANSITION(entry) && (MBCS_ENTRY_TRANSITION_STATE(entry))==trailState) {
1098
/* the offset is different for each lead byte */
1099
offset=MBCS_ENTRY_TRANSITION_OFFSET(entry);
1100
/* for each trail byte for this lead byte */
1101
for(j=0; j<256; ++j) {
1102
entry=mbcsData->stateTable[trailState][j];
1103
switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
1104
case MBCS_STATE_VALID_16:
1105
entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
1106
if(mbcsData->unicodeCodeUnits[entry]==0xfffe && findFallback(mbcsData, entry)<0) {
1109
j=999; /* do not count for this lead byte because there are assignments */
1112
case MBCS_STATE_VALID_16_PAIR:
1113
entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
1114
if(mbcsData->unicodeCodeUnits[entry]==0xfffe) {
1117
j=999; /* do not count for this lead byte because there are assignments */
1125
/* all trail bytes for this lead byte are unassigned */
1132
/* subtract from the possible savings the cost of an additional state */
1133
savings=savings*2-1024; /* count bytes, not 16-bit words */
1138
printf("compacting toUnicode data saves %ld bytes\n", (long)savings);
1140
if(mbcsData->header.countStates>=MBCS_MAX_STATE_COUNT) {
1141
fprintf(stderr, "cannot compact toUnicode because the maximum number of states is reached\n");
1145
/* make a copy of the state table */
1146
oldStateTable=(int32_t (*)[256])uprv_malloc(mbcsData->header.countStates*1024);
1147
if(oldStateTable==NULL) {
1148
fprintf(stderr, "cannot compact toUnicode: out of memory\n");
1151
uprv_memcpy(oldStateTable, mbcsData->stateTable, mbcsData->header.countStates*1024);
1153
/* add the new state */
1155
* this function does not catch the degenerate case where all lead bytes
1156
* have all-unassigned trail bytes and the lead state could be removed
1158
newState=mbcsData->header.countStates++;
1159
mbcsData->stateFlags[newState]=0;
1160
/* copy the old trail state, turning all assigned states into unassigned ones */
1161
for(i=0; i<256; ++i) {
1162
entry=mbcsData->stateTable[trailState][i];
1163
switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
1164
case MBCS_STATE_VALID_16:
1165
case MBCS_STATE_VALID_16_PAIR:
1166
mbcsData->stateTable[newState][i]=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_UNASSIGNED, 0xfffe);
1169
mbcsData->stateTable[newState][i]=entry;
1174
/* in the lead state, redirect all lead bytes with all-unassigned trail bytes to the new state */
1175
for(i=0; i<256; ++i) {
1177
mbcsData->stateTable[leadState][i]=MBCS_ENTRY_SET_STATE(mbcsData->stateTable[leadState][i], newState);
1181
/* sum up the new state table */
1182
for(i=0; i<(int)mbcsData->header.countStates; ++i) {
1183
mbcsData->stateFlags[i]&=~MBCS_STATE_FLAG_READY;
1185
sum=sumUpStates(mbcsData);
1187
/* allocate a new, smaller code units array */
1188
oldUnicodeCodeUnits=mbcsData->unicodeCodeUnits;
1190
mbcsData->unicodeCodeUnits=NULL;
1191
if(oldUnicodeCodeUnits!=NULL) {
1192
uprv_free(oldUnicodeCodeUnits);
1194
uprv_free(oldStateTable);
1197
mbcsData->unicodeCodeUnits=(uint16_t *)uprv_malloc(sum*sizeof(uint16_t));
1198
if(mbcsData->unicodeCodeUnits==NULL) {
1199
fprintf(stderr, "cannot compact toUnicode: out of memory allocating %ld 16-bit code units\n",
1201
/* revert to the old state table */
1202
mbcsData->unicodeCodeUnits=oldUnicodeCodeUnits;
1203
--mbcsData->header.countStates;
1204
uprv_memcpy(mbcsData->stateTable, oldStateTable, mbcsData->header.countStates*1024);
1205
uprv_free(oldStateTable);
1208
for(i=0; i<sum; ++i) {
1209
mbcsData->unicodeCodeUnits[i]=0xfffe;
1212
/* copy the code units for all assigned characters */
1214
* The old state table has the same lead _and_ trail states for assigned characters!
1215
* The differences are in the offsets, and in the trail states for some unassigned characters.
1216
* For each character with an assigned state in the new table, it was assigned in the old one.
1217
* Only still-assigned characters are copied.
1218
* Note that fallback mappings need to get their offset values adjusted.
1221
/* for each initial state */
1222
for(leadState=0; leadState<(int)mbcsData->header.countStates; ++leadState) {
1223
if((mbcsData->stateFlags[leadState]&0xf)==MBCS_STATE_FLAG_DIRECT) {
1224
/* for each lead byte from there */
1225
for(i=0; i<256; ++i) {
1226
entry=mbcsData->stateTable[leadState][i];
1227
if(MBCS_ENTRY_IS_TRANSITION(entry)) {
1228
trailState=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
1229
/* the new state does not have assigned states */
1230
if(trailState!=newState) {
1231
trailOffset=MBCS_ENTRY_TRANSITION_OFFSET(entry);
1232
oldTrailOffset=MBCS_ENTRY_TRANSITION_OFFSET(oldStateTable[leadState][i]);
1233
/* for each trail byte */
1234
for(j=0; j<256; ++j) {
1235
entry=mbcsData->stateTable[trailState][j];
1236
/* copy assigned-character code units and adjust fallback offsets */
1237
switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
1238
case MBCS_STATE_VALID_16:
1239
offset=trailOffset+MBCS_ENTRY_FINAL_VALUE_16(entry);
1240
/* find the old offset according to the old state table */
1241
oldOffset=oldTrailOffset+MBCS_ENTRY_FINAL_VALUE_16(oldStateTable[trailState][j]);
1242
unit=mbcsData->unicodeCodeUnits[offset]=oldUnicodeCodeUnits[oldOffset];
1243
if(unit==0xfffe && (fallback=findFallback(mbcsData, oldOffset))>=0) {
1244
mbcsData->toUFallbacks[fallback].offset=0x80000000|offset;
1247
case MBCS_STATE_VALID_16_PAIR:
1248
offset=trailOffset+MBCS_ENTRY_FINAL_VALUE_16(entry);
1249
/* find the old offset according to the old state table */
1250
oldOffset=oldTrailOffset+MBCS_ENTRY_FINAL_VALUE_16(oldStateTable[trailState][j]);
1251
mbcsData->unicodeCodeUnits[offset++]=oldUnicodeCodeUnits[oldOffset++];
1252
mbcsData->unicodeCodeUnits[offset]=oldUnicodeCodeUnits[oldOffset];
1264
/* remove temporary flags from fallback offsets that protected them from being modified twice */
1265
sum=mbcsData->header.countToUFallbacks;
1266
for(i=0; i<sum; ++i) {
1267
mbcsData->toUFallbacks[i].offset&=0x7fffffff;
1270
/* free temporary memory */
1271
uprv_free(oldUnicodeCodeUnits);
1272
uprv_free(oldStateTable);
1276
* recursive sub-function of compactToUnicodeHelper()
1278
* >0 number of bytes that are used in unicodeCodeUnits[] that could be saved,
1279
* if all sequences from this state are unassigned
1280
* <0 there are assignments in unicodeCodeUnits[]
1281
* 0 no use of unicodeCodeUnits[]
1284
/*
 * Scans all 256 entries of stateTable[state] and accumulates a savings count
 * for the byte sequences that continue from this state.
 *
 * mbcsData: converter data; stateTable and unicodeCodeUnits are read here.
 * state:    index of the state table row to scan.
 * offset:   offset accumulated from the transitions taken so far; it is added
 *           to each entry value before indexing unicodeCodeUnits.
 * b:        the bytes consumed so far, shifted in 8 bits at a time; it is used
 *           for the diagnostic output and extended for the recursive call.
 *
 * The visible return path yields localSavings+belowSavings.
 * NOTE(review): the declaration and updates of haveAssigned and the updates
 * of localSavings are not visible in this view; confirm them before relying
 * on the sign conventions of the result.
 */
findUnassigned(MBCSData *mbcsData, int32_t state, int32_t offset, uint32_t b) {
1285
int32_t i, entry, savings, localSavings, belowSavings;
1288
localSavings=belowSavings=0;
1290
/* look at every possible byte value in this state */
for(i=0; i<256; ++i) {
1291
entry=mbcsData->stateTable[state][i];
1292
if(MBCS_ENTRY_IS_TRANSITION(entry)) {
1293
/* transition entry: recurse into the target state with the extended offset and byte prefix */
savings=findUnassigned(mbcsData, MBCS_ENTRY_TRANSITION_STATE(entry), offset+MBCS_ENTRY_TRANSITION_OFFSET(entry), (b<<8)|(uint32_t)i);
1296
} else if(savings>0) {
/* the sub-state reported a positive count: print it and add it to belowSavings */
1297
printf(" all-unassigned sequences from prefix 0x%02lx state %ld use %ld bytes\n",
1298
(unsigned long)((b<<8)|i), (long)state, (long)savings);
1299
belowSavings+=savings;
1301
} else if(!haveAssigned) {
/* final entry: only examined while haveAssigned is still false */
1302
switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
1303
case MBCS_STATE_VALID_16:
1304
/* entry is reused as the index into unicodeCodeUnits */
entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
1305
/* a single code unit is tested for 0xfffe together with the absence of a fallback at this index */
if(mbcsData->unicodeCodeUnits[entry]==0xfffe && findFallback(mbcsData, entry)<0) {
1311
case MBCS_STATE_VALID_16_PAIR:
1312
entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
1313
/* for a pair, only the first code unit is compared with 0xfffe here */
if(mbcsData->unicodeCodeUnits[entry]==0xfffe) {
1327
return localSavings+belowSavings;
1331
/* helper function for finding compaction opportunities */
1333
/*
 * Reporting helper: for every state whose low four flag bits equal
 * MBCS_STATE_FLAG_DIRECT, calls findUnassigned() starting at offset 0 with
 * an empty byte prefix and prints the count that it returns.
 *
 * mbcsData: converter data; header.countStates and stateFlags are read here.
 *
 * NOTE(review): the lines between the findUnassigned() call and the printf
 * are not visible in this view; presumably the printf is conditional on the
 * value of savings - confirm.
 */
compactToUnicodeHelper(MBCSData *mbcsData) {
1334
int32_t state, savings;
1340
/* for each initial state */
1341
for(state=0; state<(int)mbcsData->header.countStates; ++state) {
1342
/* only the low 4 bits of the flags are compared with MBCS_STATE_FLAG_DIRECT */
if((mbcsData->stateFlags[state]&0xf)==MBCS_STATE_FLAG_DIRECT) {
1343
savings=findUnassigned(mbcsData, state, 0, 0);
1345
printf(" all-unassigned sequences from initial state %ld use %ld bytes\n",
1346
(long)state, (long)savings);
1353
/*
 * Shortens the fromUnicode byte results by one byte each when every stored
 * result starts with 0, 0x8e or 0x8f.
 *
 * mbcsData: converter data; fromUBytes is rewritten in place, and
 *           header.flags and stage3Top are updated.
 *
 * The old per-result length is taken from maxCharLength and the old total
 * from stage3Top. After the first-byte test passes, header.flags becomes
 * MBCS_OUTPUT_3_EUC+oldLength-3 and stage3Top is scaled by
 * (oldLength-1)/oldLength. The first byte is then dropped from each result;
 * a 0x8e lead is encoded by clearing bit 7 of the first remaining byte and
 * a 0x8f lead by clearing bit 7 of the second remaining byte.
 *
 * NOTE(review): the declarations of p8, q and b, the loads of b and value,
 * and the condition that selects between the two rewrite loops are not
 * visible in this view - confirm against the full file.
 */
transformEUC(MBCSData *mbcsData) {
1355
uint32_t i, value, oldLength=mbcsData->maxCharLength, old3Top=mbcsData->stage3Top, new3Top;
1362
/* careful: 2-byte and 4-byte codes are stored in platform endianness! */
1364
/* test if all first bytes are in {0, 0x8e, 0x8f} */
1365
p8=mbcsData->fromUBytes;
1367
#if !U_IS_BIG_ENDIAN
1373
/* step through the results, one every oldLength bytes */
for(i=0; i<old3Top; i+=oldLength) {
1375
if(b!=0 && b!=0x8e && b!=0x8f) {
1376
/* some first byte does not fit the EUC pattern, nothing to be done */
1380
/* restore p if it was modified above */
1381
p8=mbcsData->fromUBytes;
1383
/* modify outputType and adjust stage3Top */
1384
mbcsData->header.flags=MBCS_OUTPUT_3_EUC+oldLength-3;
1385
/* each result loses one byte, so the total shrinks by the factor (oldLength-1)/oldLength */
mbcsData->stage3Top=new3Top=(old3Top*(oldLength-1))/oldLength;
1388
* EUC-encode all byte sequences;
1389
* see "CJKV Information Processing" (1st ed. 1999) from Ken Lunde, O'Reilly,
1390
* p. 161 in chapter 4 "Encoding Methods"
1392
* This also must reverse the byte order if the platform is little-endian!
1395
/* in this loop the shortened results are written as 16-bit units over the same buffer */
uint16_t *q=(uint16_t *)p8;
1396
for(i=0; i<old3Top; i+=oldLength) {
1399
/* short sequences are stored directly */
1400
/* code set 0 or 1 */
1401
(*q++)=(uint16_t)((p8[1]<<8)|p8[2]);
1402
} else if(b==0x8e) {
1404
/* bit 7 of the first remaining byte is cleared */
(*q++)=(uint16_t)(((p8[1]&0x7f)<<8)|p8[2]);
1405
} else /* b==0x8f */ {
1407
/* bit 7 of the second remaining byte is cleared */
(*q++)=(uint16_t)((p8[1]<<8)|(p8[2]&0x7f));
1411
} else /* oldLength==4 */ {
1413
/* the old results are addressed as 32-bit values and rewritten as three bytes each */
uint32_t *p32=(uint32_t *)p8;
1414
for(i=0; i<old3Top; i+=4) {
1416
if(value<=0xffffff) {
1417
/* short sequences are stored directly */
1418
/* code set 0 or 1 */
1419
(*q++)=(uint8_t)(value>>16);
1420
(*q++)=(uint8_t)(value>>8);
1421
(*q++)=(uint8_t)value;
1422
} else if(value<=0x8effffff) {
1424
/* bit 7 of the first remaining byte is cleared */
(*q++)=(uint8_t)((value>>16)&0x7f);
1425
(*q++)=(uint8_t)(value>>8);
1426
(*q++)=(uint8_t)value;
1427
} else /* first byte is 0x8f */ {
1429
/* bit 7 of the second remaining byte is cleared */
(*q++)=(uint8_t)(value>>16);
1430
(*q++)=(uint8_t)((value>>8)&0x7f);
1431
(*q++)=(uint8_t)value;
1440
* Compact stage 2 for SBCS by overlapping adjacent stage 2 blocks as far
1441
* as possible. Overlapping is done on unassigned head and tail
1442
* parts of blocks in steps of MBCS_STAGE_2_MULTIPLIER.
1443
* Stage 1 indexes need to be adjusted accordingly.
1444
* This function is very similar to genprops/store.c/compactStage().
1447
/*
 * Compacts stage2Single in place by moving each block of
 * MBCS_STAGE_2_BLOCK_SIZE entries down, overlapping leading zero entries of
 * a block with trailing zero entries of the already compacted data.
 *
 * mbcsData: converter data; stage2Single, stage2Top and stage1 are modified.
 *
 * map[] records, per original block number, the new start index of that
 * block. After the move, stage2Top becomes the new end index and each
 * stage1 entry is replaced through map[].
 *
 * NOTE(review): the condition that opens the overlap branch and any
 * adjustment of start before the copy loop are not visible in this view.
 */
singleCompactStage2(MBCSData *mbcsData) {
1448
/* this array maps the ordinal number of a stage 2 block to its new stage 1 index */
1449
uint16_t map[MBCS_STAGE_2_MAX_BLOCKS];
1450
uint16_t i, start, prevEnd, newStart;
1452
/* enter the all-unassigned first stage 2 block into the map */
1453
map[0]=MBCS_STAGE_2_ALL_UNASSIGNED_INDEX;
1455
/* begin with the first block after the all-unassigned one */
1456
/* start is the read position, newStart the write position */
start=newStart=MBCS_STAGE_2_FIRST_ASSIGNED;
1457
while(start<mbcsData->stage2Top) {
1458
/* index of the last entry written so far */
prevEnd=(uint16_t)(newStart-1);
1460
/* find the size of the overlap */
1461
/* counts how many leading zeros of this block pair up with trailing zeros before newStart */
for(i=0; i<MBCS_STAGE_2_BLOCK_SIZE && mbcsData->stage2Single[start+i]==0 && mbcsData->stage2Single[prevEnd-i]==0; ++i) {}
1464
/* the block now begins i entries before the write position */
map[start>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT]=(uint16_t)(newStart-i);
1466
/* move the non-overlapping indexes to their new positions */
1468
for(i=(uint16_t)(MBCS_STAGE_2_BLOCK_SIZE-i); i>0; --i) {
1469
mbcsData->stage2Single[newStart++]=mbcsData->stage2Single[start++];
1471
} else if(newStart<start) {
1472
/* move the indexes to their new positions */
1473
map[start>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT]=newStart;
1474
for(i=MBCS_STAGE_2_BLOCK_SIZE; i>0; --i) {
1475
mbcsData->stage2Single[newStart++]=mbcsData->stage2Single[start++];
1477
} else /* no overlap && newStart==start */ {
1478
/* nothing has moved yet, so the block keeps its index and both positions skip over it */
map[start>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT]=start;
1479
start=newStart+=MBCS_STAGE_2_BLOCK_SIZE;
1483
/* adjust stage2Top */
1484
if(VERBOSE && newStart<mbcsData->stage2Top) {
1485
/* the difference is multiplied by 2 because stage2Single holds 16-bit entries */
printf("compacting stage 2 from stage2Top=0x%lx to 0x%lx, saving %ld bytes\n",
1486
(unsigned long)mbcsData->stage2Top, (unsigned long)newStart,
1487
(long)(mbcsData->stage2Top-newStart)*2);
1489
mbcsData->stage2Top=newStart;
1491
/* now adjust stage 1 */
1492
for(i=0; i<MBCS_STAGE_1_SIZE; ++i) {
1493
/* the old stage 1 value is turned into a block number and looked up in map[] */
mbcsData->stage1[i]=map[mbcsData->stage1[i]>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT];
1497
/* Compact stage 3 for SBCS - same algorithm as above. */
1499
/*
 * Compacts the stage 3 data, addressed as 16-bit entries through fromUBytes,
 * by moving blocks of 16 entries down and overlapping leading zero entries
 * of a block with trailing zero entries of the already compacted data.
 *
 * mbcsData: converter data; the fromUBytes contents, stage3Top and
 *           stage2Single are modified.
 *
 * map[] records, per original 16-entry block number, the new start index of
 * that block. After the move, stage3Top becomes the new end index and the
 * first stage2Top entries of stage2Single are replaced through map[].
 *
 * NOTE(review): the initialisation of map[0], start and newStart, and the
 * condition that opens the overlap branch, are not visible in this view.
 */
singleCompactStage3(MBCSData *mbcsData) {
1500
/* the byte buffer is viewed as an array of 16-bit stage 3 entries */
uint16_t *stage3=(uint16_t *)mbcsData->fromUBytes;
1502
/* this array maps the ordinal number of a stage 3 block to its new stage 2 index */
1503
uint16_t map[0x1000];
1504
uint16_t i, start, prevEnd, newStart;
1506
/* enter the all-unassigned first stage 3 block into the map */
1509
/* begin with the first block after the all-unassigned one */
1511
while(start<mbcsData->stage3Top) {
1512
/* index of the last entry written so far */
prevEnd=(uint16_t)(newStart-1);
1514
/* find the size of the overlap */
1515
/* counts how many leading zeros of this block pair up with trailing zeros before newStart */
for(i=0; i<16 && stage3[start+i]==0 && stage3[prevEnd-i]==0; ++i) {}
1518
/* the block now begins i entries before the write position */
map[start>>4]=(uint16_t)(newStart-i);
1520
/* move the non-overlapping indexes to their new positions */
1522
for(i=(uint16_t)(16-i); i>0; --i) {
1523
stage3[newStart++]=stage3[start++];
1525
} else if(newStart<start) {
1526
/* move the indexes to their new positions */
1527
map[start>>4]=newStart;
1528
for(i=16; i>0; --i) {
1529
stage3[newStart++]=stage3[start++];
1531
} else /* no overlap && newStart==start */ {
1532
map[start>>4]=start;
1537
/* adjust stage3Top */
1538
if(VERBOSE && newStart<mbcsData->stage3Top) {
1539
/* the difference is multiplied by 2 because the entries are 16 bits wide */
printf("compacting stage 3 from stage3Top=0x%lx to 0x%lx, saving %ld bytes\n",
1540
(unsigned long)mbcsData->stage3Top, (unsigned long)newStart,
1541
(long)(mbcsData->stage3Top-newStart)*2);
1543
mbcsData->stage3Top=newStart;
1545
/* now adjust stage 2 */
1546
for(i=0; i<mbcsData->stage2Top; ++i) {
1547
/* the old stage 2 value is turned into a 16-entry block number and looked up in map[] */
mbcsData->stage2Single[i]=map[mbcsData->stage2Single[i]>>4];
1552
* Compact stage 2 by overlapping adjacent stage 2 blocks as far
1553
* as possible. Overlapping is done on unassigned head and tail
1554
* parts of blocks in steps of MBCS_STAGE_2_MULTIPLIER.
1555
* Stage 1 indexes need to be adjusted accordingly.
1556
* This function is very similar to genprops/store.c/compactStage().
1559
/*
 * Compacts the 32-bit stage2 array in place by moving each block of
 * MBCS_STAGE_2_BLOCK_SIZE entries down, overlapping leading zero entries of
 * a block with trailing zero entries of the already compacted data.
 *
 * mbcsData: converter data; stage2, stage2Top and stage1 are modified.
 *
 * map[] records, per original block number, the new start index of that
 * block. After the move, stage2Top becomes the new end index and each
 * stage1 entry is replaced through map[].
 *
 * NOTE(review): the condition that opens the overlap branch and any
 * adjustment of start before the copy loop are not visible in this view.
 */
compactStage2(MBCSData *mbcsData) {
1560
/* this array maps the ordinal number of a stage 2 block to its new stage 1 index */
1561
uint16_t map[MBCS_STAGE_2_MAX_BLOCKS];
1562
uint16_t i, start, prevEnd, newStart;
1564
/* enter the all-unassigned first stage 2 block into the map */
1565
map[0]=MBCS_STAGE_2_ALL_UNASSIGNED_INDEX;
1567
/* begin with the first block after the all-unassigned one */
1568
/* start is the read position, newStart the write position */
start=newStart=MBCS_STAGE_2_FIRST_ASSIGNED;
1569
while(start<mbcsData->stage2Top) {
1570
/* index of the last entry written so far */
prevEnd=(uint16_t)(newStart-1);
1572
/* find the size of the overlap */
1573
/* counts how many leading zeros of this block pair up with trailing zeros before newStart */
for(i=0; i<MBCS_STAGE_2_BLOCK_SIZE && mbcsData->stage2[start+i]==0 && mbcsData->stage2[prevEnd-i]==0; ++i) {}
1576
/* the block now begins i entries before the write position */
map[start>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT]=(uint16_t)(newStart-i);
1578
/* move the non-overlapping indexes to their new positions */
1580
for(i=(uint16_t)(MBCS_STAGE_2_BLOCK_SIZE-i); i>0; --i) {
1581
mbcsData->stage2[newStart++]=mbcsData->stage2[start++];
1583
} else if(newStart<start) {
1584
/* move the indexes to their new positions */
1585
map[start>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT]=newStart;
1586
for(i=MBCS_STAGE_2_BLOCK_SIZE; i>0; --i) {
1587
mbcsData->stage2[newStart++]=mbcsData->stage2[start++];
1589
} else /* no overlap && newStart==start */ {
1590
/* nothing has moved yet, so the block keeps its index and both positions skip over it */
map[start>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT]=start;
1591
start=newStart+=MBCS_STAGE_2_BLOCK_SIZE;
1595
/* adjust stage2Top */
1596
if(VERBOSE && newStart<mbcsData->stage2Top) {
1597
/* the difference is multiplied by 4 because stage2 holds 32-bit entries */
printf("compacting stage 2 from stage2Top=0x%lx to 0x%lx, saving %ld bytes\n",
1598
(unsigned long)mbcsData->stage2Top, (unsigned long)newStart,
1599
(long)(mbcsData->stage2Top-newStart)*4);
1601
mbcsData->stage2Top=newStart;
1603
/* now adjust stage 1 */
1604
for(i=0; i<MBCS_STAGE_1_SIZE; ++i) {
1605
/* the old stage 1 value is turned into a block number and looked up in map[] */
mbcsData->stage1[i]=map[mbcsData->stage1[i]>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT];
1610
/*
 * Post-processing pass over the collected MBCS data, in this order:
 *  1. print the number of codepage characters (stage3Top/maxCharLength),
 *  2. rewrite state table entries that are final VALID_DIRECT_16 entries
 *     with value 0xfffe so that they carry the MBCS_STATE_UNASSIGNED action,
 *  3. run compactToUnicode2() when maxCharLength is 2, or
 *     compactToUnicodeHelper() when it is greater than 2,
 *  4. sort toUFallbacks with qsort() and compareFallbacks,
 *  5. run transformEUC() and then the stage compaction functions.
 *
 * cnvData:    cast to MBCSData * and used as the converter data.
 * staticData: not referenced in the lines visible here.
 *
 * NOTE(review): the local declarations, the condition guarding the first
 * printf, and the else branch around compactStage2() are not visible in
 * this view.
 */
MBCSPostprocess(NewConverter *cnvData, const UConverterStaticData *staticData) {
1611
MBCSData *mbcsData=(MBCSData *)cnvData;
1615
/* this needs to be printed before the EUC transformation because later maxCharLength might not be correct */
1617
printf("number of codepage characters in 16-blocks: 0x%lx=%lu\n",
1618
(unsigned long)mbcsData->stage3Top/mbcsData->maxCharLength,
1619
(unsigned long)mbcsData->stage3Top/mbcsData->maxCharLength);
1622
/* test each state table entry */
1623
for(state=0; state<(int)mbcsData->header.countStates; ++state) {
1624
for(cell=0; cell<256; ++cell) {
1625
entry=mbcsData->stateTable[state][cell];
1627
* if the entry is a final one with an MBCS_STATE_VALID_DIRECT_16 action code
1628
* and the code point is "unassigned" (0xfffe), then change it to
1629
* the "unassigned" action code with bits 26..23 set to zero and U+fffe.
1631
/* the state field of the entry is set to 0 for the comparison */
if(MBCS_ENTRY_SET_STATE(entry, 0)==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, 0xfffe)) {
1632
mbcsData->stateTable[state][cell]=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_UNASSIGNED);
1637
/* try to compact the toUnicode tables */
1638
if(mbcsData->maxCharLength==2) {
1639
compactToUnicode2(mbcsData);
1640
} else if(mbcsData->maxCharLength>2) {
1641
compactToUnicodeHelper(mbcsData);
1644
/* sort toUFallbacks */
1646
* It should be safe to sort them before compactToUnicode2() is called,
1647
* because it should not change the relative order of the offset values
1648
* that it adjusts, but they need to be sorted at some point, and
1649
* it is safest here.
1651
if(mbcsData->header.countToUFallbacks>0) {
1652
qsort(mbcsData->toUFallbacks, mbcsData->header.countToUFallbacks, sizeof(_MBCSToUFallback), compareFallbacks);
1655
/* try to compact the fromUnicode tables */
1656
transformEUC(mbcsData);
1657
if(mbcsData->maxCharLength==1) {
1658
/* single-byte codepages: stage 3 is compacted before stage 2 */
singleCompactStage3(mbcsData);
1659
singleCompactStage2(mbcsData);
1661
compactStage2(mbcsData);
1666
/*
 * Finalises the table sizes and offsets and writes the MBCS data blocks.
 *
 * cnvData:    cast to MBCSData * and used as the converter data.
 * staticData: its unicodeMask selects the stage 1 length
 *             (MBCS_STAGE_1_SIZE with UCNV_HAS_SUPPLEMENTARY, else 0x40).
 * pData:      destination passed to udata_writeBlock().
 *
 * Steps: bias the stage 1 entries, convert stage2Top and stage3Top to byte
 * counts, round both up to multiples of 4, fill the header offsets, then
 * write header, state table, toUFallbacks, unicodeCodeUnits, stage 1,
 * stage 2 and fromUBytes in that order.
 *
 * Returns header.offsetFromUBytes+stage3Top.
 *
 * NOTE(review): the else branches and one line inside the offsetFromUBytes
 * sum are not visible in this view - confirm against the full file.
 */
MBCSWrite(NewConverter *cnvData, const UConverterStaticData *staticData, UNewDataMemory *pData) {
1667
MBCSData *mbcsData=(MBCSData *)cnvData;
1668
int32_t i, stage1Top;
1670
/* adjust stage 1 entries to include the size of stage 1 in the offsets to stage 2 */
1671
if(mbcsData->maxCharLength==1) {
1672
if(staticData->unicodeMask&UCNV_HAS_SUPPLEMENTARY) {
1673
stage1Top=MBCS_STAGE_1_SIZE; /* 0x440==1088 */
1675
stage1Top=0x40; /* 0x40==64 */
1677
for(i=0; i<stage1Top; ++i) {
1678
/* here the full stage 1 length is added to every stage 1 entry */
mbcsData->stage1[i]+=(uint16_t)stage1Top;
1681
/* stage2Top has counted 16-bit results, now we need to count bytes */
1682
mbcsData->stage2Top*=2;
1684
/* stage3Top has counted 16-bit results, now we need to count bytes */
1685
mbcsData->stage3Top*=2;
1687
if(staticData->unicodeMask&UCNV_HAS_SUPPLEMENTARY) {
1688
stage1Top=MBCS_STAGE_1_SIZE; /* 0x440==1088 */
1690
stage1Top=0x40; /* 0x40==64 */
1692
for(i=0; i<stage1Top; ++i) {
1693
mbcsData->stage1[i]+=(uint16_t)stage1Top/2; /* stage 2 contains 32-bit entries, stage 1 16-bit entries */
1696
/* stage2Top has counted 32-bit results, now we need to count bytes */
1697
mbcsData->stage2Top*=4;
1699
/* stage3Top has already counted bytes */
1702
/* round up stage2Top and stage3Top so that the sizes of all data blocks are multiples of 4 */
1703
mbcsData->stage2Top=(mbcsData->stage2Top+3)&~3;
1704
mbcsData->stage3Top=(mbcsData->stage3Top+3)&~3;
1706
/* fill the header */
1707
/* each offset is the previous offset plus the byte size of the block written before it */
mbcsData->header.offsetToUCodeUnits=
1708
sizeof(_MBCSHeader)+
1709
mbcsData->header.countStates*1024+
1710
mbcsData->header.countToUFallbacks*sizeof(_MBCSToUFallback);
1711
mbcsData->header.offsetFromUTable=
1712
mbcsData->header.offsetToUCodeUnits+
1713
mbcsData->countToUCodeUnits*2;
1714
mbcsData->header.offsetFromUBytes=
1715
mbcsData->header.offsetFromUTable+
1717
mbcsData->stage2Top;
1719
/* write the MBCS data */
1720
udata_writeBlock(pData, &mbcsData->header, sizeof(_MBCSHeader));
1721
/* 1024 bytes are written per state */
udata_writeBlock(pData, mbcsData->stateTable, mbcsData->header.countStates*1024);
1722
udata_writeBlock(pData, mbcsData->toUFallbacks, mbcsData->header.countToUFallbacks*sizeof(_MBCSToUFallback));
1723
udata_writeBlock(pData, mbcsData->unicodeCodeUnits, mbcsData->countToUCodeUnits*2);
1724
udata_writeBlock(pData, mbcsData->stage1, stage1Top*2);
1725
if(mbcsData->maxCharLength==1) {
1726
/* single-byte codepages use the 16-bit stage 2 array */
udata_writeBlock(pData, mbcsData->stage2Single, mbcsData->stage2Top);
1728
udata_writeBlock(pData, mbcsData->stage2, mbcsData->stage2Top);
1730
udata_writeBlock(pData, mbcsData->fromUBytes, mbcsData->stage3Top);
1732
/* return the number of bytes that should have been written */
1733
return mbcsData->header.offsetFromUBytes+mbcsData->stage3Top;