2
**********************************************************************
3
* Copyright (C) 2000-2001, International Business Machines
4
* Corporation and others. All Rights Reserved.
5
**********************************************************************
6
* file name: ucnvlat1.cpp
8
* tab size: 8 (not used)
11
* created on: 2000feb07
12
* created by: Markus W. Scherer
15
#include "unicode/utypes.h"
16
#include "unicode/ucnv.h"
17
#include "unicode/ucnv_err.h"
21
/* control optimizations according to the platform */
22
#define LATIN1_UNROLL_TO_UNICODE 1
23
#define LATIN1_UNROLL_FROM_UNICODE 1
24
#define ASCII_UNROLL_TO_UNICODE 1
26
/* ISO 8859-1 --------------------------------------------------------------- */
28
/* This is a table-less and callback-less version of _MBCSSingleToBMPWithOffsets(). */
30
/*
 * ISO 8859-1 to Unicode conversion with offsets.
 * Reads bytes starting at pArgs->source, and at the end writes the advanced
 * source, target and offsets pointers back into pArgs.
 * Sets *pErrorCode to U_BUFFER_OVERFLOW_ERROR on the "target will be full" path.
 *
 * NOTE(review): this extract interleaves bare numbers with the code and is
 * missing lines (e.g. the declarations of target, offsets, sourceIndex, count
 * and loops, the loop bodies and the closing braces) -- restore them from the
 * upstream file before building.
 */
_Latin1ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
31
UErrorCode *pErrorCode) {
32
const uint8_t *source;
34
int32_t targetCapacity, length;
39
/* set up the local pointers */
40
source=(const uint8_t *)pArgs->source;
42
targetCapacity=pArgs->targetLimit-pArgs->target;
43
offsets=pArgs->offsets;
48
/* NOTE(review): the two lines below are the body of a block comment whose opening and closing lines are not in this extract */
* since the conversion here is 1:1 UChar:uint8_t, we need only one counter
49
* for the minimum of the sourceLength and targetCapacity
51
/* length = number of source bytes available */
length=(const uint8_t *)pArgs->sourceLimit-source;
52
/* all of the input fits: limit the unit counter to the input length */
if(length<=targetCapacity) {
53
targetCapacity=length;
55
/* presumably the else branch; its "} else {" line is not in this extract */
/* target will be full */
56
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
57
length=targetCapacity;
60
#if LATIN1_UNROLL_TO_UNICODE
61
if(targetCapacity>=16) {
64
/* count = number of complete groups of 16 units */
loops=count=targetCapacity>>4;
65
/* keep only the remainder (0..15 units) in both counters */
length=targetCapacity&=0xf;
87
/* sixteen offset stores: each records the current sourceIndex and advances it by one */
*offsets++=sourceIndex++;
88
*offsets++=sourceIndex++;
89
*offsets++=sourceIndex++;
90
*offsets++=sourceIndex++;
91
*offsets++=sourceIndex++;
92
*offsets++=sourceIndex++;
93
*offsets++=sourceIndex++;
94
*offsets++=sourceIndex++;
95
*offsets++=sourceIndex++;
96
*offsets++=sourceIndex++;
97
*offsets++=sourceIndex++;
98
*offsets++=sourceIndex++;
99
*offsets++=sourceIndex++;
100
*offsets++=sourceIndex++;
101
*offsets++=sourceIndex++;
102
*offsets++=sourceIndex++;
108
/* conversion loop */
109
/* NOTE(review): the loop body is not in this extract */
while(targetCapacity>0) {
114
/* write back the updated pointers */
115
pArgs->source=(const char *)source;
116
pArgs->target=target;
121
*offsets++=sourceIndex++;
124
pArgs->offsets=offsets;
128
/* This is a table-less and callback-less version of _MBCSSingleGetNextUChar(). */
130
/*
 * ISO 8859-1 "get next character" entry point.
 * If at least one byte is available before pArgs->sourceLimit, pArgs->source
 * is advanced by one byte; otherwise *pErrorCode is set to
 * U_INDEX_OUTOFBOUNDS_ERROR.
 *
 * NOTE(review): the return statements and closing braces are not in this
 * extract; presumably the consumed byte is returned as the code point --
 * confirm against the upstream file.
 */
_Latin1GetNextUChar(UConverterToUnicodeArgs *pArgs,
131
UErrorCode *pErrorCode) {
132
const uint8_t *source=(const uint8_t *)pArgs->source;
133
/* is there at least one input byte? */
if(source<(const uint8_t *)pArgs->sourceLimit) {
134
/* consume exactly one byte */
pArgs->source=(const char *)(source+1);
138
/* no output because of empty input */
139
*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
143
/* This is a table-less version of _MBCSSingleFromBMPWithOffsets(). */
145
/*
 * Unicode to ISO 8859-1 / US-ASCII conversion with offsets.
 * The same function is installed in both _Latin1Impl and _ASCIIImpl; the
 * highest encodable code unit (max) is chosen from cnv->sharedData.
 * Reads UChars from pArgs->source, stores one byte per unit into
 * pArgs->target, invokes cnv->fromUCharErrorBehaviour for unassigned or
 * illegal input, and at the end writes the source, target and offsets
 * pointers and the pending lead surrogate back.
 * Error codes set in the visible code: U_INVALID_CHAR_FOUND,
 * U_ILLEGAL_CHAR_FOUND, U_BUFFER_OVERFLOW_ERROR, U_TRUNCATED_CHAR_FOUND.
 *
 * NOTE(review): this extract interleaves bare numbers with the code and is
 * missing lines (several declarations, "} else {" lines, labels/gotos,
 * loop tails and closing braces) -- restore them from the upstream file
 * before building.
 */
_Latin1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
146
UErrorCode *pErrorCode) {
148
const UChar *source, *sourceLimit, *lastSource;
150
int32_t targetCapacity, length;
157
UConverterCallbackReason reason;
160
/* set up the local pointers */
161
cnv=pArgs->converter;
162
source=pArgs->source;
163
sourceLimit=pArgs->sourceLimit;
164
target=(uint8_t *)pArgs->target;
165
targetCapacity=pArgs->targetLimit-pArgs->target;
166
offsets=pArgs->offsets;
168
/* pick the upper bound for max; the "} else {" between the two assignments is not in this extract */
if(cnv->sharedData==&_Latin1Data) {
169
max=0xff; /* Latin-1 */
171
max=0x7f; /* US-ASCII */
174
/* get the converter state from UConverter */
175
c=cnv->fromUSurrogateLead;
177
/* sourceIndex=-1 if the current character began in the previous buffer */
178
sourceIndex= c==0 ? 0 : -1;
182
/* NOTE(review): the two lines below are the body of a block comment whose opening and closing lines are not in this extract */
* since the conversion here is 1:1 UChar:uint8_t, we need only one counter
183
* for the minimum of the sourceLength and targetCapacity
185
length=sourceLimit-source;
186
/* limit the unit counter to the shorter of input length and target capacity */
if(length<targetCapacity) {
187
targetCapacity=length;
190
/* conversion loop */
191
/* a lead surrogate is pending from the previous call; the branch body is not in this extract */
if(c!=0 && targetCapacity>0) {
195
#if LATIN1_UNROLL_FROM_UNICODE
196
/* unroll the loop with the most common case */
198
if(targetCapacity>=16) {
199
int32_t count, loops;
202
/* count = number of complete groups of 16 units */
loops=count=targetCapacity>>4;
204
/*
 * Sixteen read/store pairs: each reads one UChar, ORs it into oredChars
 * and stores its low byte. oredChars collects the bits of all 16 units,
 * presumably so that one comparison validates the whole group (that
 * check is not in this extract).
 */
oredChars=u=*source++;
205
*target++=(uint8_t)u;
206
oredChars|=u=*source++;
207
*target++=(uint8_t)u;
208
oredChars|=u=*source++;
209
*target++=(uint8_t)u;
210
oredChars|=u=*source++;
211
*target++=(uint8_t)u;
212
oredChars|=u=*source++;
213
*target++=(uint8_t)u;
214
oredChars|=u=*source++;
215
*target++=(uint8_t)u;
216
oredChars|=u=*source++;
217
*target++=(uint8_t)u;
218
oredChars|=u=*source++;
219
*target++=(uint8_t)u;
220
oredChars|=u=*source++;
221
*target++=(uint8_t)u;
222
oredChars|=u=*source++;
223
*target++=(uint8_t)u;
224
oredChars|=u=*source++;
225
*target++=(uint8_t)u;
226
oredChars|=u=*source++;
227
*target++=(uint8_t)u;
228
oredChars|=u=*source++;
229
*target++=(uint8_t)u;
230
oredChars|=u=*source++;
231
*target++=(uint8_t)u;
232
oredChars|=u=*source++;
233
*target++=(uint8_t)u;
234
oredChars|=u=*source++;
235
*target++=(uint8_t)u;
237
/* were all 16 entries really valid? */
239
/* no, return to the first of these 16 */
246
/* account for the 16-unit groups counted in count */
targetCapacity-=16*count;
249
lastSource+=16*count;
251
/* sixteen offset stores: each records the current sourceIndex and advances it by one */
*offsets++=sourceIndex++;
252
*offsets++=sourceIndex++;
253
*offsets++=sourceIndex++;
254
*offsets++=sourceIndex++;
255
*offsets++=sourceIndex++;
256
*offsets++=sourceIndex++;
257
*offsets++=sourceIndex++;
258
*offsets++=sourceIndex++;
259
*offsets++=sourceIndex++;
260
*offsets++=sourceIndex++;
261
*offsets++=sourceIndex++;
262
*offsets++=sourceIndex++;
263
*offsets++=sourceIndex++;
264
*offsets++=sourceIndex++;
265
*offsets++=sourceIndex++;
266
*offsets++=sourceIndex++;
275
while(targetCapacity>0) {
277
/* NOTE(review): the three lines below are the body of a block comment whose opening and closing lines are not in this extract */
* Get a correct Unicode code point:
278
* a single UChar for a BMP code point or
279
* a matched surrogate pair for a "surrogate code point".
283
/* convert the Unicode code point */
284
*target++=(uint8_t)c;
287
/* normal end of conversion: prepare for a new character */
290
/* classify the code unit to select the callback reason and error code */
if(!UTF_IS_SURROGATE(c)) {
291
/* callback(unassigned) */
292
reason=UCNV_UNASSIGNED;
293
*pErrorCode=U_INVALID_CHAR_FOUND;
294
} else if(UTF_IS_SURROGATE_FIRST(c)) {
296
if(source<sourceLimit) {
297
/* test the following code unit */
299
if(UTF_IS_SECOND_SURROGATE(trail)) {
301
/* combine lead and trail into one supplementary code point */
c=UTF16_GET_PAIR_VALUE(c, trail);
302
/* this codepage does not map supplementary code points */
303
/* callback(unassigned) */
304
reason=UCNV_UNASSIGNED;
305
*pErrorCode=U_INVALID_CHAR_FOUND;
307
/* this is an unmatched lead code unit (1st surrogate) */
308
/* callback(illegal) */
310
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
317
/* this is an unmatched trail code unit (2nd surrogate) */
318
/* callback(illegal) */
320
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
323
/* call the callback function with all the preparations and post-processing */
324
/* get the number of code units for c to correctly advance sourceIndex after the callback call */
325
length=UTF_CHAR_LENGTH(c);
327
/* set offsets since the start or the last callback */
329
/* number of source units consumed since lastSource */
int32_t count=(int32_t)(source-lastSource);
331
/* do not set the offset for the callback-causing character */
335
*offsets++=sourceIndex++;
338
/* offset and sourceIndex are now set for the current character */
341
/* update the arguments structure */
342
pArgs->source=source;
343
pArgs->target=(char *)target;
344
pArgs->offsets=offsets;
346
/* set the converter state in UConverter to deal with the next character */
347
cnv->fromUSurrogateLead=0;
349
/* write the code point as code units */
351
/* NOTE(review): the initialization of i is not in this extract */
UTF_APPEND_CHAR_UNSAFE(cnv->invalidUCharBuffer, i, c);
352
cnv->invalidUCharLength=(int8_t)i;
355
/* call the callback function */
356
cnv->fromUCharErrorBehaviour(cnv->fromUContext, pArgs, cnv->invalidUCharBuffer, i, c, reason, pErrorCode);
358
/* get the converter state from UConverter */
359
c=cnv->fromUSurrogateLead;
361
/* update target and deal with offsets if necessary */
362
/* the second argument is the number of bytes by which the callback advanced pArgs->target */
offsets=ucnv_updateCallbackOffsets(offsets, ((uint8_t *)pArgs->target)-target, sourceIndex);
363
target=(uint8_t *)pArgs->target;
365
/* update the source pointer and index */
366
/* advance by the units of c plus however far the callback moved pArgs->source */
sourceIndex+=length+(pArgs->source-source);
367
source=lastSource=pArgs->source;
368
/* recompute the unit counter from the pointers as they are after the callback */
targetCapacity=(uint8_t *)pArgs->targetLimit-target;
369
length=sourceLimit-source;
370
if(length<targetCapacity) {
371
targetCapacity=length;
375
/* NOTE(review): the two lines below are the body of a block comment whose opening and closing lines are not in this extract */
* If the callback overflowed the target, then we need to
376
* stop here with an overflow indication.
378
/* NOTE(review): the bodies of the first two branches are not in this extract */
if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
380
} else if(U_FAILURE(*pErrorCode)) {
384
} else if(cnv->charErrorBufferLength>0) {
386
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
390
#if LATIN1_UNROLL_FROM_UNICODE
396
/* input remains but the target has reached pArgs->targetLimit: report overflow */
if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=(uint8_t *)pArgs->targetLimit) {
398
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
401
/* set offsets since the start or the last callback */
403
/* NOTE(review): this count is size_t, while the equivalent count before the callback is int32_t */
size_t count=source-lastSource;
405
*offsets++=sourceIndex++;
410
/* flushing and all input consumed */
if(pArgs->flush && source>=sourceLimit) {
411
/* reset the state for the next conversion */
412
if(c!=0 && U_SUCCESS(*pErrorCode)) {
413
/* a Unicode code point remains incomplete (only a first surrogate) */
414
*pErrorCode=U_TRUNCATED_CHAR_FOUND;
416
cnv->fromUSurrogateLead=0;
418
/* presumably the else branch of the flush test; its "} else {" line is not in this extract */
/* set the converter state back into UConverter */
419
cnv->fromUSurrogateLead=(UChar)c;
422
/* write back the updated pointers */
423
pArgs->source=source;
424
pArgs->target=(char *)target;
425
pArgs->offsets=offsets;
428
/*
 * Function table for the ISO 8859-1 converter.
 * Each conversion function is listed twice in a row; presumably these fill
 * the plain and the with-offsets slots of UConverterImpl -- TODO confirm
 * against the struct declaration.
 * NOTE(review): the other initializer entries and the closing "};" are not
 * in this extract.
 */
static const UConverterImpl _Latin1Impl={
438
_Latin1ToUnicodeWithOffsets,
439
_Latin1ToUnicodeWithOffsets,
440
_Latin1FromUnicodeWithOffsets,
441
_Latin1FromUnicodeWithOffsets,
448
/*
 * Static descriptor for the ISO 8859-1 converter; _Latin1Data points to it.
 * NOTE(review): field meanings are not shown in this file. Presumably 819 is
 * the codepage number, the "1, 1" pair the min/max bytes per character and
 * { 0x1a, ... }, 1 the substitution character and its length -- confirm
 * against the UConverterStaticData declaration. Some initializer lines and
 * the closing "};" are not in this extract.
 */
static const UConverterStaticData _Latin1StaticData={
449
sizeof(UConverterStaticData),
451
819, UCNV_IBM, UCNV_LATIN_1, 1, 1,
452
{ 0x1a, 0, 0, 0 }, 1, FALSE, FALSE,
455
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
458
/*
 * Shared data object for the ISO 8859-1 converter; ties together
 * _Latin1StaticData and _Latin1Impl. _Latin1FromUnicodeWithOffsets compares
 * cnv->sharedData against the address of this object to select the
 * Latin-1 range.
 * NOTE(review): the remaining initializer entries and the closing "};" are
 * not in this extract.
 */
const UConverterSharedData _Latin1Data={
459
sizeof(UConverterSharedData), ~((uint32_t) 0),
460
NULL, NULL, &_Latin1StaticData, FALSE, &_Latin1Impl,
464
/* US-ASCII ----------------------------------------------------------------- */
466
/* This is a table-less version of _MBCSSingleToBMPWithOffsets(). */
468
/*
 * US-ASCII to Unicode conversion with offsets.
 * Copies bytes from pArgs->source to pArgs->target one unit per byte; the
 * single-unit loop tests each copied byte against 0x7f, and
 * cnv->fromCharErrorBehaviour is invoked with UCNV_ILLEGAL after
 * *pErrorCode is set to U_ILLEGAL_CHAR_FOUND. At the end the source, target
 * and offsets pointers are written back into pArgs.
 * Error codes set in the visible code: U_ILLEGAL_CHAR_FOUND,
 * U_BUFFER_OVERFLOW_ERROR.
 *
 * NOTE(review): this extract interleaves bare numbers with the code and is
 * missing lines (several declarations, "} else {" lines, loop tails,
 * gotos and closing braces) -- restore them from the upstream file before
 * building.
 */
_ASCIIToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
469
UErrorCode *pErrorCode) {
470
const uint8_t *source, *sourceLimit, *lastSource;
472
int32_t targetCapacity, length;
477
/* set up the local pointers */
478
source=(const uint8_t *)pArgs->source;
479
sourceLimit=(const uint8_t *)pArgs->sourceLimit;
480
target=pArgs->target;
481
targetCapacity=pArgs->targetLimit-pArgs->target;
482
offsets=pArgs->offsets;
484
/* sourceIndex=-1 if the current character began in the previous buffer */
489
/* NOTE(review): the two lines below are the body of a block comment whose opening and closing lines are not in this extract */
* since the conversion here is 1:1 UChar:uint8_t, we need only one counter
490
* for the minimum of the sourceLength and targetCapacity
492
length=sourceLimit-source;
493
/* limit the unit counter to the shorter of input length and target capacity */
if(length<targetCapacity) {
494
targetCapacity=length;
497
#if ASCII_UNROLL_TO_UNICODE
498
/* unroll the loop with the most common case */
500
if(targetCapacity>=16) {
501
int32_t count, loops;
504
/* count = number of complete groups of 16 units */
loops=count=targetCapacity>>4;
506
/*
 * Sixteen copies: each copies one byte to the target and ORs the stored
 * value into oredChars, presumably so that one comparison validates the
 * whole group (that check is not in this extract).
 */
oredChars=*target++=*source++;
507
oredChars|=*target++=*source++;
508
oredChars|=*target++=*source++;
509
oredChars|=*target++=*source++;
510
oredChars|=*target++=*source++;
511
oredChars|=*target++=*source++;
512
oredChars|=*target++=*source++;
513
oredChars|=*target++=*source++;
514
oredChars|=*target++=*source++;
515
oredChars|=*target++=*source++;
516
oredChars|=*target++=*source++;
517
oredChars|=*target++=*source++;
518
oredChars|=*target++=*source++;
519
oredChars|=*target++=*source++;
520
oredChars|=*target++=*source++;
521
oredChars|=*target++=*source++;
523
/* were all 16 entries really valid? */
525
/* no, return to the first of these 16 */
532
/* account for the 16-unit groups counted in count */
targetCapacity-=16*count;
535
lastSource+=16*count;
537
/* sixteen offset stores: each records the current sourceIndex and advances it by one */
*offsets++=sourceIndex++;
538
*offsets++=sourceIndex++;
539
*offsets++=sourceIndex++;
540
*offsets++=sourceIndex++;
541
*offsets++=sourceIndex++;
542
*offsets++=sourceIndex++;
543
*offsets++=sourceIndex++;
544
*offsets++=sourceIndex++;
545
*offsets++=sourceIndex++;
546
*offsets++=sourceIndex++;
547
*offsets++=sourceIndex++;
548
*offsets++=sourceIndex++;
549
*offsets++=sourceIndex++;
550
*offsets++=sourceIndex++;
551
*offsets++=sourceIndex++;
552
*offsets++=sourceIndex++;
559
/* conversion loop */
560
while(targetCapacity>0) {
561
/* copy one byte, then test whether the stored value is within 0..0x7f */
if((*target++=*source++)<=0x7f) {
566
/* back out the illegal character */
569
/* call the callback function with all the preparations and post-processing */
570
cnv=pArgs->converter;
572
/* callback(illegal) */
573
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
575
/* set offsets since the start or the last callback */
577
/* number of source bytes consumed since lastSource */
int32_t count=(int32_t)(source-lastSource);
579
/* predecrement: do not set the offset for the callback-causing character */
581
*offsets++=sourceIndex++;
583
/* offset and sourceIndex are now set for the current character */
586
/* update the arguments structure */
587
pArgs->source=(const char *)source;
588
pArgs->target=target;
589
pArgs->offsets=offsets;
591
/* copy the current bytes to invalidCharBuffer */
592
/* source was already advanced past the byte, hence source-1 */
cnv->invalidCharBuffer[0]=*(source-1);
593
cnv->invalidCharLength=1;
595
/* call the callback function */
596
cnv->fromCharErrorBehaviour(cnv->toUContext, pArgs, cnv->invalidCharBuffer, 1, UCNV_ILLEGAL, pErrorCode);
598
/* update target and deal with offsets if necessary */
599
/* the second argument is the number of units by which the callback advanced pArgs->target */
offsets=ucnv_updateCallbackOffsets(offsets, pArgs->target-target, sourceIndex);
600
target=pArgs->target;
602
/* update the source pointer and index */
603
/* advance by the one illegal byte plus however far the callback moved pArgs->source */
sourceIndex+=1+((const uint8_t *)pArgs->source-source);
604
source=lastSource=(const uint8_t *)pArgs->source;
605
/* recompute the unit counter from the pointers as they are after the callback */
targetCapacity=pArgs->targetLimit-target;
606
length=sourceLimit-source;
607
if(length<targetCapacity) {
608
targetCapacity=length;
612
/* NOTE(review): the two lines below are the body of a block comment whose opening and closing lines are not in this extract */
* If the callback overflowed the target, then we need to
613
* stop here with an overflow indication.
615
/* NOTE(review): the bodies of the first two branches are not in this extract */
if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
617
} else if(U_FAILURE(*pErrorCode)) {
620
} else if(cnv->UCharErrorBufferLength>0) {
622
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
626
#if ASCII_UNROLL_TO_UNICODE
632
/* input remains but the target has reached pArgs->targetLimit: report overflow */
if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=pArgs->targetLimit) {
634
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
637
/* set offsets since the start or the last callback */
639
/* NOTE(review): this count is size_t, while the equivalent count before the callback is int32_t */
size_t count=source-lastSource;
641
*offsets++=sourceIndex++;
646
/* write back the updated pointers */
647
pArgs->source=(const char *)source;
648
pArgs->target=target;
649
pArgs->offsets=offsets;
652
/* This is a table-less version of _MBCSSingleGetNextUChar(). */
654
/*
 * US-ASCII "get next character" entry point.
 * Loops over the input while bytes remain before pArgs->sourceLimit. In the
 * visible illegal-byte path it sets U_ILLEGAL_CHAR_FOUND, redirects
 * pArgs->target to a local buffer, calls cnv->fromCharErrorBehaviour and,
 * depending on the resulting error code, returns what the callback wrote
 * via ucnv_getUChar32KeepOverflow(). If nothing is produced, *pErrorCode is
 * set to U_INDEX_OUTOFBOUNDS_ERROR.
 *
 * NOTE(review): this extract interleaves bare numbers with the code and is
 * missing lines (the declaration and read of b, the test that selects the
 * illegal path, the length check, return statements and closing braces) --
 * restore them from the upstream file before building.
 */
_ASCIIGetNextUChar(UConverterToUnicodeArgs *pArgs,
655
UErrorCode *pErrorCode) {
656
/* scratch target for callback output */
UChar buffer[UTF_MAX_CHAR_LENGTH];
657
const uint8_t *source;
660
/* set up the local pointers */
661
source=(const uint8_t *)pArgs->source;
663
/* conversion loop */
664
while(source<(const uint8_t *)pArgs->sourceLimit) {
666
pArgs->source=(const char *)source;
670
/* call the callback function with all the preparations and post-processing */
671
UConverter *cnv=pArgs->converter;
673
/* callback(illegal) */
674
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
676
/* update the arguments structure */
677
/* the callback writes into the local buffer */
pArgs->target=buffer;
678
pArgs->targetLimit=buffer+UTF_MAX_CHAR_LENGTH;
680
/* copy the current byte to invalidCharBuffer */
681
cnv->invalidCharBuffer[0]=(char)b;
682
cnv->invalidCharLength=1;
684
/* call the callback function */
685
cnv->fromCharErrorBehaviour(cnv->toUContext, pArgs, cnv->invalidCharBuffer, 1, UCNV_ILLEGAL, pErrorCode);
687
/* update the source pointer */
688
source=(const uint8_t *)pArgs->source;
691
/* NOTE(review): the two lines below are the body of a block comment whose opening and closing lines are not in this extract */
* return the first character if the callback wrote some
692
* we do not need to goto finish because the converter state is already set
694
if(U_SUCCESS(*pErrorCode)) {
695
/* number of UChars the callback wrote into buffer */
int32_t length=pArgs->target-buffer;
697
return ucnv_getUChar32KeepOverflow(cnv, buffer, length);
699
/* else (callback did not write anything) continue */
700
} else if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
701
/* clear the overflow error and return from the full buffer */
*pErrorCode=U_ZERO_ERROR;
702
return ucnv_getUChar32KeepOverflow(cnv, buffer, UTF_MAX_CHAR_LENGTH);
705
/* ### what if a callback set an error but _also_ generated output?! */
711
/* no output because of empty input or only skipping callbacks */
712
*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
716
/*
 * Function table for the US-ASCII converter.
 * The to-Unicode entries are ASCII-specific; the from-Unicode entries reuse
 * _Latin1FromUnicodeWithOffsets. Each function is listed twice in a row;
 * presumably these fill the plain and the with-offsets slots of
 * UConverterImpl -- TODO confirm against the struct declaration.
 * NOTE(review): the other initializer entries and the closing "};" are not
 * in this extract.
 */
static const UConverterImpl _ASCIIImpl={
726
_ASCIIToUnicodeWithOffsets,
727
_ASCIIToUnicodeWithOffsets,
728
_Latin1FromUnicodeWithOffsets,
729
_Latin1FromUnicodeWithOffsets,
736
/*
 * Static descriptor for the US-ASCII converter; _ASCIIData points to it.
 * NOTE(review): field meanings are not shown in this file. Presumably 367 is
 * the codepage number, the "1, 1" pair the min/max bytes per character and
 * { 0x1a, ... }, 1 the substitution character and its length -- confirm
 * against the UConverterStaticData declaration. Some initializer lines and
 * the closing "};" are not in this extract.
 */
static const UConverterStaticData _ASCIIStaticData={
737
sizeof(UConverterStaticData),
739
367, UCNV_IBM, UCNV_US_ASCII, 1, 1,
740
{ 0x1a, 0, 0, 0 }, 1, FALSE, FALSE,
743
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
746
/*
 * Shared data object for the US-ASCII converter; ties together
 * _ASCIIStaticData and _ASCIIImpl.
 * NOTE(review): the remaining initializer entries and the closing "};" are
 * not in this extract.
 */
const UConverterSharedData _ASCIIData={
747
sizeof(UConverterSharedData), ~((uint32_t) 0),
748
NULL, NULL, &_ASCIIStaticData, FALSE, &_ASCIIImpl,