1
/*********************************************************
2
* Copyright (C) 2008 VMware, Inc. All rights reserved.
4
* This file is part of VMware View Open Client.
5
*********************************************************/
7
**********************************************************************
8
* Copyright (C) 2000-2007, International Business Machines
9
* Corporation and others. All Rights Reserved.
10
**********************************************************************
11
* file name: ucnv2022.c
13
* tab size: 8 (not used)
16
* created on: 2000feb03
17
* created by: Markus W. Scherer
21
* 06/29/2000 helena Major rewrite of the callback APIs.
22
* 08/08/2000 Ram Included support for ISO-2022-JP-2
23
* Changed implementation of toUnicode
25
* 08/21/2000 Ram Added support for ISO-2022-KR
26
* 08/29/2000 Ram Separated implementation of EBCDIC to
28
* 09/20/2000 Ram Added support for ISO-2022-CN
29
* Added implementations for getNextUChar()
30
* for specific 2022 country variants.
31
* 10/31/2000 Ram Implemented offsets logic functions
34
#include "unicode/utypes.h"
36
#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
38
#include "unicode/ucnv.h"
39
#include "unicode/uset.h"
40
#include "unicode/ucnv_err.h"
41
#include "unicode/ucnv_cb.h"
49
/*
 * Number of elements in a true array (not a pointer), as an int32_t.
 * The whole expansion is parenthesized so that the cast cannot be split off
 * by a neighboring operator (for example sizeof or a postfix operator).
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
51
#ifdef U_ENABLE_GENERIC_ISO_2022
53
* I am disabling the generic ISO-2022 converter after proposing to do so on
54
* the icu mailing list two days ago.
57
* 1. It does not fully support the ISO-2022/ECMA-35 specification with all of
58
* its designation sequences, single shifts with return to the previous state,
59
* switch-with-no-return to UTF-16BE or similar, etc.
60
* This is unlike the language-specific variants like ISO-2022-JP which
61
* require a much smaller repertoire of ISO-2022 features.
62
* These variants continue to be supported.
63
* 2. I believe that no one is really using the generic ISO-2022 converter
64
* but rather always one of the language-specific variants.
65
* Note that ICU's generic ISO-2022 converter has always output one escape
66
* sequence followed by UTF-8 for the whole stream.
67
* 3. Switching between subcharsets is extremely slow, because each time
68
* the previous converter is closed and a new one opened,
69
* without any kind of caching, least-recently-used list, etc.
70
* 4. The code is currently buggy, and given the above it does not seem
71
* reasonable to spend the time on maintenance.
72
* 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings.
73
* This means, for example, that when ISO-8859-7 is designated, the following
74
* ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff.
75
* The ICU ISO-2022 converter does not handle this - and has no information
76
* about which subconverter would have to be shifted vs. which is designed
79
* Markus Scherer 2003-dec-03
83
static const char SHIFT_IN_STR[] = "\x0F";
84
static const char SHIFT_OUT_STR[] = "\x0E";
98
* 94-character sets with native byte values A1..FE are encoded in ISO 2022
99
* as bytes 21..7E. (Subtract 0x80.)
100
* 96-character sets with native byte values A0..FF are encoded in ISO 2022
101
* as bytes 20..7F. (Subtract 0x80.)
102
* Do not encode C1 control codes with native bytes 80..9F
103
* as bytes 00..1F (C0 control codes).
113
* ISO 2022 control codes must not be converted from Unicode
114
* because they would mess up the byte stream.
115
* The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b
116
* corresponding to SO, SI, and ESC.
118
/*
 * True if c is one of the ISO 2022 control codes SO (0x0e), SI (0x0f) or ESC (0x1b).
 * The mask 0x0800c000 is shifted right by c so that the bit for c lands in bit 0;
 * the range check comes first so the shift count is always below 32.
 * Note: c is evaluated twice.
 */
#define IS_2022_CONTROL(c) ((c)<0x20 && (((uint32_t)0x0800c000>>(c))&1)!=0)
120
/* for ISO-2022-JP and -CN implementations */
137
HWKANA_7BIT=8, /* Halfwidth Katakana 7 bit */
140
/* the first few enum constants must keep their values because they correspond to myConverterArray[] */
146
* these are used in StateEnum and ISO2022State variables,
147
* but CNS_11643 must be used to index into myConverterArray[]
159
/* is the StateEnum charset value for a DBCS charset? */
160
/* true if cs lies in the StateEnum range JISX208..KSC5601 (inclusive); cs is evaluated twice */
#define IS_JP_DBCS(cs) ((cs)>=JISX208 && (cs)<=KSC5601)
162
#define CSM(cs) ((uint16_t)1<<(cs)) /* charset mask: a value with only bit number cs set; used to build and to test the jpCharsetMasks[] entries */
165
* Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
166
* to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x
168
* Note: The converter uses some leniency:
169
* - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
170
* all versions, not just JIS7 and JIS8.
171
* - ICU does not distinguish between different versions of JIS X 0208.
173
static const uint16_t jpCharsetMasks[5]={
174
CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT),
175
CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212),
176
CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
177
CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
178
CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)
190
typedef struct ISO2022State {
191
int8_t cs[4]; /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
192
int8_t g; /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
193
int8_t prevG; /* g before single shift (SS2 or SS3) */
196
#define UCNV_OPTIONS_VERSION_MASK 0xf /* low bits of the open options that carry the variant version. NOTE(review): this admits versions 0..15 while jpCharsetMasks[] has only 5 entries and is indexed by the version - confirm the version is range-checked before that lookup */
197
#define UCNV_2022_MAX_CONVERTERS 10 /* number of slots in UConverterDataISO2022.myConverterArray[]; _ISO2022Close() iterates over all of them */
200
UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS];
201
UConverter *currentConverter;
202
Cnv2022Type currentType;
203
ISO2022State toU2022State, fromU2022State;
206
#ifdef U_ENABLE_GENERIC_ISO_2022
211
}UConverterDataISO2022;
214
/* ISO-2022 ----------------------------------------------------------------- */
216
/*Forward declaration */
218
ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,
221
ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,
224
#define ESC_2022 0x1B /* ESC: the byte that getEndOfBuffer_2022() looks for as the start of an escape sequence */
228
INVALID_2022 = -1, /*Doesn't correspond to a valid iso 2022 escape sequence*/
229
VALID_NON_TERMINAL_2022 = 0, /*so far corresponds to a valid iso 2022 escape sequence*/
230
VALID_TERMINAL_2022 = 1, /*corresponds to a valid iso 2022 escape sequence*/
231
VALID_MAYBE_TERMINAL_2022 = 2 /*so far matches one iso 2022 escape sequence, but by adding more characters might match another escape sequence*/
232
} UCNV_TableStates_2022;
235
* The way these state transition arrays work is:
236
* ex : ESC$B is the sequence for JISX208
237
* a) First Iteration: char is ESC
238
* i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
239
* int x = normalize_esq_chars_2022[27] which is equal to 1
240
* ii) Search for this value in escSeqStateTable_Key_2022[]
241
* value of x is stored at escSeqStateTable_Key_2022[0]
242
* iii) Save this index as offset
243
* iv) Get state of this sequence from escSeqStateTable_Value_2022[]
244
* escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
245
* b) Switch on this state and continue to next char
246
* i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
247
* which is normalize_esq_chars_2022[36] == 4
248
* ii) x is currently 1(from above)
249
* x<<=5 -- x is now 32
250
* x+=normalize_esq_chars_2022[36]
252
* iii) Search for this value in escSeqStateTable_Key_2022[]
253
* value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
254
* iv) Get state of this sequence from escSeqStateTable_Value_2022[]
255
* escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
256
* c) Switch on this state and continue to next char
257
* i) Get the value of B from normalize_esq_chars_2022[] with int value of B as index
258
* ii) x is currently 36 (from above)
259
* x<<=5 -- x is now 1152
260
* x+=normalize_esq_chars_2022[66]
262
* iii) Search for this value in escSeqStateTable_Key_2022[]
263
* value of x (1161) is stored at escSeqStateTable_Key_2022[24], so offset is 24
264
* iv) Get state of this sequence from escSeqStateTable_Value_2022[24]
265
* escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
266
* v) Get the converter name from escSeqStateTable_Result_2022[24] which is JISX208
270
/*Below are the 3 arrays depicting a state transition table*/
271
static const int8_t normalize_esq_chars_2022[256] = {
272
/* 0 1 2 3 4 5 6 7 8 9 */
274
0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
275
,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
276
,0 ,0 ,0 ,0 ,0 ,0 ,0 ,1 ,0 ,0
277
,0 ,0 ,0 ,0 ,0 ,0 ,4 ,7 ,29 ,0
278
,2 ,24 ,26 ,27 ,0 ,3 ,23 ,6 ,0 ,0
279
,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
280
,0 ,0 ,0 ,0 ,5 ,8 ,9 ,10 ,11 ,12
281
,13 ,14 ,15 ,16 ,17 ,18 ,19 ,20 ,25 ,28
282
,0 ,0 ,21 ,0 ,0 ,0 ,0 ,0 ,0 ,0
283
,22 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
284
,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
285
,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
286
,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
287
,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
288
,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
289
,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
290
,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
291
,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
292
,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
293
,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
294
,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
295
,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
296
,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
297
,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
298
,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
302
#ifdef U_ENABLE_GENERIC_ISO_2022
304
* When the generic ISO-2022 converter is completely removed, not just disabled
305
* per #ifdef, then the following state table and the associated tables that are
306
* dimensioned with MAX_STATES_2022 should be trimmed.
308
* Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of
309
* the associated escape sequences starting with ESC ( B should be removed.
310
* This includes the ones with key values 1097 and all of the ones above 1000000.
312
* For the latter, the tables can simply be truncated.
313
* For the former, since the tables must be kept parallel, it is probably best
314
* to simply duplicate an adjacent table cell, parallel in all tables.
316
* It may make sense to restructure the tables, especially by using small search
317
* tables for the variants instead of indexing them parallel to the table here.
321
#define MAX_STATES_2022 74 /* number of entries in escSeqStateTable_Key_2022[] and in every table that is indexed in parallel with it */
322
static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = {
323
/* 0 1 2 3 4 5 6 7 8 9 */
325
1 ,34 ,36 ,39 ,55 ,57 ,60 ,61 ,1093 ,1096
326
,1097 ,1098 ,1099 ,1100 ,1101 ,1102 ,1103 ,1104 ,1105 ,1106
327
,1109 ,1154 ,1157 ,1160 ,1161 ,1176 ,1178 ,1179 ,1254 ,1257
328
,1768 ,1773 ,1957 ,35105 ,36933 ,36936 ,36937 ,36938 ,36939 ,36940
329
,36942 ,36943 ,36944 ,36945 ,36946 ,36947 ,36948 ,37640 ,37642 ,37644
330
,37646 ,37711 ,37744 ,37745 ,37746 ,37747 ,37748 ,40133 ,40136 ,40138
331
,40139 ,40140 ,40141 ,1123363 ,35947624 ,35947625 ,35947626 ,35947627 ,35947629 ,35947630
332
,35947631 ,35947635 ,35947636 ,35947638
335
#ifdef U_ENABLE_GENERIC_ISO_2022
337
static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = {
338
/* 0 1 2 3 4 5 6 7 8 9 */
340
NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,NULL ,"latin1" ,"latin1"
341
,"latin1" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"JISX0201" ,"JISX0201" ,"latin1"
342
,"latin1" ,NULL ,"JISX-208" ,"ibm-5478" ,"JISX-208" ,NULL ,NULL ,NULL ,NULL ,"UTF8"
343
,"ISO-8859-1" ,"ISO-8859-7" ,"JIS-X-208" ,NULL ,"ibm-955" ,"ibm-367" ,"ibm-952" ,"ibm-949" ,"JISX-212" ,"ibm-1383"
344
,"ibm-952" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-5478" ,"ibm-949" ,"ISO-IR-165"
345
,"CNS-11643-1992,1" ,"CNS-11643-1992,2" ,"CNS-11643-1992,3" ,"CNS-11643-1992,4" ,"CNS-11643-1992,5" ,"CNS-11643-1992,6" ,"CNS-11643-1992,7" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian"
346
,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,NULL ,"latin1" ,"ibm-912" ,"ibm-913" ,"ibm-914" ,"ibm-813" ,"ibm-1089"
347
,"ibm-920" ,"ibm-915" ,"ibm-915" ,"latin1"
352
static const UCNV_TableStates_2022 escSeqStateTable_Value_2022[MAX_STATES_2022] = {
353
/* 0 1 2 3 4 5 6 7 8 9 */
354
VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
355
,VALID_MAYBE_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
356
,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022
357
,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
358
,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
359
,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
360
,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
361
,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
365
/* Type def for refactoring changeState_2022 code*/
367
#ifdef U_ENABLE_GENERIC_ISO_2022
375
/*********** ISO 2022 Converter Protos ***********/
377
_ISO2022Open(UConverter *cnv, const char *name, const char *locale,uint32_t options, UErrorCode *errorCode);
380
_ISO2022Close(UConverter *converter);
383
_ISO2022Reset(UConverter *converter, UConverterResetChoice choice);
386
_ISO2022getName(const UConverter* cnv);
389
_ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err);
392
_ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status);
394
#ifdef U_ENABLE_GENERIC_ISO_2022
396
T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err);
399
/*const UConverterSharedData _ISO2022Data;*/
400
static const UConverterSharedData _ISO2022JPData;
401
static const UConverterSharedData _ISO2022KRData;
402
static const UConverterSharedData _ISO2022CNData;
404
/*************** Converter implementations ******************/
406
/* The purpose of this function is to get around gcc compiler warnings. */
408
fromUWriteUInt8(UConverter *cnv,
409
const char *bytes, int32_t length,
410
uint8_t **target, const char *targetLimit,
413
UErrorCode *pErrorCode)
415
char *targetChars = (char *)*target;
416
ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit,
417
offsets, sourceIndex, pErrorCode);
418
*target = (uint8_t*)targetChars;
423
setInitialStateToUnicodeKR(UConverter* converter, UConverterDataISO2022 *myConverterData){
424
if(myConverterData->version == 1) {
425
UConverter *cnv = myConverterData->currentConverter;
427
cnv->toUnicodeStatus=0; /* offset */
428
cnv->mode=0; /* state */
429
cnv->toULength=0; /* byteIndex */
434
setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){
435
/* in ISO-2022-KR the designator sequence appears only once
436
* in a file so we append it only once
438
if( converter->charErrorBufferLength==0){
440
converter->charErrorBufferLength = 4;
441
converter->charErrorBuffer[0] = 0x1b;
442
converter->charErrorBuffer[1] = 0x24;
443
converter->charErrorBuffer[2] = 0x29;
444
converter->charErrorBuffer[3] = 0x43;
446
if(myConverterData->version == 1) {
447
UConverter *cnv = myConverterData->currentConverter;
450
cnv->fromUnicodeStatus=1; /* prevLength */
455
_ISO2022Open(UConverter *cnv, const char *name, const char *locale,uint32_t options, UErrorCode *errorCode){
457
char myLocale[6]={' ',' ',' ',' ',' ',' '};
459
cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022));
460
if(cnv->extraInfo != NULL) {
461
UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
464
uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022));
465
myConverterData->currentType = ASCII1;
466
cnv->fromUnicodeStatus =FALSE;
468
uprv_strncpy(myLocale, locale, sizeof(myLocale));
470
version = options & UCNV_OPTIONS_VERSION_MASK;
471
myConverterData->version = version;
472
if(myLocale[0]=='j' && (myLocale[1]=='a'|| myLocale[1]=='p') &&
473
(myLocale[2]=='_' || myLocale[2]=='\0'))
476
/* open the required converters and cache them */
477
if(jpCharsetMasks[version]&CSM(ISO8859_7)) {
478
myConverterData->myConverterArray[ISO8859_7]= ucnv_loadSharedData("ISO8859_7", NULL, errorCode);
480
myConverterData->myConverterArray[JISX201] = ucnv_loadSharedData("JISX0201", NULL, errorCode);
481
myConverterData->myConverterArray[JISX208] = ucnv_loadSharedData("jisx-208", NULL, errorCode);
482
if(jpCharsetMasks[version]&CSM(JISX212)) {
483
myConverterData->myConverterArray[JISX212] = ucnv_loadSharedData("jisx-212", NULL, errorCode);
485
if(jpCharsetMasks[version]&CSM(GB2312)) {
486
myConverterData->myConverterArray[GB2312] = ucnv_loadSharedData("ibm-5478", NULL, errorCode); /* gb_2312_80-1 */
488
if(jpCharsetMasks[version]&CSM(KSC5601)) {
489
myConverterData->myConverterArray[KSC5601] = ucnv_loadSharedData("ksc_5601", NULL, errorCode);
492
/* set the function pointers to appropriate functions */
493
cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData);
494
uprv_strcpy(myConverterData->locale,"ja");
496
(void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version=");
497
len = uprv_strlen(myConverterData->name);
498
myConverterData->name[len]=(char)(myConverterData->version+(int)'0');
499
myConverterData->name[len+1]='\0';
501
else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') &&
502
(myLocale[2]=='_' || myLocale[2]=='\0'))
505
myConverterData->currentConverter=
506
ucnv_open("icu-internal-25546",errorCode);
508
if (U_FAILURE(*errorCode)) {
513
(void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1");
514
uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4);
515
cnv->subCharLen = myConverterData->currentConverter->subCharLen;
517
myConverterData->currentConverter=ucnv_open("ibm-949",errorCode);
519
if (U_FAILURE(*errorCode)) {
524
myConverterData->version = 0;
525
(void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0");
528
/* initialize the state variables */
529
setInitialStateToUnicodeKR(cnv, myConverterData);
530
setInitialStateFromUnicodeKR(cnv, myConverterData);
532
/* set the function pointers to appropriate functions */
533
cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData;
534
uprv_strcpy(myConverterData->locale,"ko");
536
else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&&
537
(myLocale[2]=='_' || myLocale[2]=='\0'))
540
/* open the required converters and cache them */
541
myConverterData->myConverterArray[GB2312_1] = ucnv_loadSharedData("ibm-5478", NULL, errorCode);
543
myConverterData->myConverterArray[ISO_IR_165] = ucnv_loadSharedData("iso-ir-165", NULL, errorCode);
545
myConverterData->myConverterArray[CNS_11643] = ucnv_loadSharedData("cns-11643-1992", NULL, errorCode);
548
/* set the function pointers to appropriate functions */
549
cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData;
550
uprv_strcpy(myConverterData->locale,"cn");
553
(void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1");
555
myConverterData->version = 0;
556
(void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0");
560
#ifdef U_ENABLE_GENERIC_ISO_2022
561
myConverterData->isFirstBuffer = TRUE;
563
/* append the UTF-8 escape sequence */
564
cnv->charErrorBufferLength = 3;
565
cnv->charErrorBuffer[0] = 0x1b;
566
cnv->charErrorBuffer[1] = 0x25;
567
cnv->charErrorBuffer[2] = 0x42;
569
cnv->sharedData=(UConverterSharedData*)&_ISO2022Data;
570
/* initialize the state variables */
571
uprv_strcpy(myConverterData->name,"ISO_2022");
573
*errorCode = U_UNSUPPORTED_ERROR;
578
cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar;
580
if(U_FAILURE(*errorCode)) {
584
*errorCode = U_MEMORY_ALLOCATION_ERROR;
590
_ISO2022Close(UConverter *converter) {
591
UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo);
592
UConverterSharedData **array = myData->myConverterArray;
595
if (converter->extraInfo != NULL) {
596
/*close the array of converter pointers and free the memory*/
597
for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
599
ucnv_unloadSharedDataIfReady(array[i]);
603
ucnv_close(myData->currentConverter);
605
if(!converter->isExtraLocal){
606
uprv_free (converter->extraInfo);
607
converter->extraInfo = NULL;
613
_ISO2022Reset(UConverter *converter, UConverterResetChoice choice) {
614
UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo);
615
if(choice<=UCNV_RESET_TO_UNICODE) {
616
uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State));
617
myConverterData->key = 0;
619
if(choice!=UCNV_RESET_TO_UNICODE) {
620
uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State));
622
#ifdef U_ENABLE_GENERIC_ISO_2022
623
if(myConverterData->locale[0] == 0){
624
if(choice<=UCNV_RESET_TO_UNICODE) {
625
myConverterData->isFirstBuffer = TRUE;
626
myConverterData->key = 0;
627
if (converter->mode == UCNV_SO){
628
ucnv_close (myConverterData->currentConverter);
629
myConverterData->currentConverter=NULL;
631
converter->mode = UCNV_SI;
633
if(choice!=UCNV_RESET_TO_UNICODE) {
634
/* re-append UTF-8 escape sequence */
635
converter->charErrorBufferLength = 3;
636
converter->charErrorBuffer[0] = 0x1b;
637
converter->charErrorBuffer[1] = 0x28;
638
converter->charErrorBuffer[2] = 0x42;
644
/* reset the state variables */
645
if(myConverterData->locale[0] == 'k'){
646
if(choice<=UCNV_RESET_TO_UNICODE) {
647
setInitialStateToUnicodeKR(converter, myConverterData);
649
if(choice!=UCNV_RESET_TO_UNICODE) {
650
setInitialStateFromUnicodeKR(converter, myConverterData);
657
_ISO2022getName(const UConverter* cnv){
659
UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo;
666
/*************** to unicode *******************/
667
/****************************************************************************
668
* Recognized escape sequences are
680
static const StateEnum nextStateToUnicodeJP[MAX_STATES_2022]= {
681
/* 0 1 2 3 4 5 6 7 8 9 */
682
INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
683
,ASCII ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,JISX201 ,HWKANA_7BIT ,JISX201 ,INVALID_STATE
684
,INVALID_STATE ,INVALID_STATE ,JISX208 ,GB2312 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
685
,ISO8859_1 ,ISO8859_7 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,KSC5601 ,JISX212 ,INVALID_STATE
686
,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
687
,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
688
,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
689
,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
692
/*************** to unicode *******************/
693
static const StateEnum nextStateToUnicodeCN[MAX_STATES_2022]= {
694
/* 0 1 2 3 4 5 6 7 8 9 */
695
INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,SS3_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
696
,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
697
,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
698
,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
699
,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,GB2312_1 ,INVALID_STATE ,ISO_IR_165
700
,CNS_11643_1 ,CNS_11643_2 ,CNS_11643_3 ,CNS_11643_4 ,CNS_11643_5 ,CNS_11643_6 ,CNS_11643_7 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
701
,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
702
,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
706
static UCNV_TableStates_2022
707
getKey_2022(char c,int32_t* key,int32_t* offset){
710
int32_t hi = MAX_STATES_2022;
713
togo = normalize_esq_chars_2022[(uint8_t)c];
715
/* not a valid character anywhere in an escape sequence */
720
togo = (*key << 5) + togo;
722
while (hi != low) /*binary search*/{
724
register int32_t mid = (hi+low) >> 1; /*Finds median*/
729
if (escSeqStateTable_Key_2022[mid] > togo){
732
else if (escSeqStateTable_Key_2022[mid] < togo){
735
else /*we found it*/{
738
return escSeqStateTable_Value_2022[mid];
749
/*runs through a state machine to determine the escape sequence - codepage correspondence
752
changeState_2022(UConverter* _this,
754
const char* sourceLimit,
757
UCNV_TableStates_2022 value;
758
UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo);
759
uint32_t key = myData2022->key;
763
value = VALID_NON_TERMINAL_2022;
764
while (*source < sourceLimit) {
766
_this->toUBytes[_this->toULength++]=(uint8_t)c;
767
value = getKey_2022(c,(int32_t *) &key, &offset);
771
case VALID_NON_TERMINAL_2022 :
772
/* continue with the loop */
775
case VALID_TERMINAL_2022:
782
case VALID_MAYBE_TERMINAL_2022:
783
#ifdef U_ENABLE_GENERIC_ISO_2022
784
/* ESC ( B is ambiguous only for ISO_2022 itself */
785
if(var == ISO_2022) {
786
/* discard toUBytes[] for ESC ( B because this sequence is correct and complete */
787
_this->toULength = 0;
789
/* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */
791
/* continue with the loop */
792
value = VALID_NON_TERMINAL_2022;
797
/* not ISO_2022 itself, finish here */
798
value = VALID_TERMINAL_2022;
806
myData2022->key = key;
808
if (value == VALID_NON_TERMINAL_2022) {
809
/* indicate that the escape sequence is incomplete: key!=0 */
811
} else if (value == INVALID_2022 ) {
812
*err = U_ILLEGAL_ESCAPE_SEQUENCE;
814
} else /* value == VALID_TERMINAL_2022 */ {
816
#ifdef U_ENABLE_GENERIC_ISO_2022
819
const char *chosenConverterName = escSeqStateTable_Result_2022[offset];
820
if(chosenConverterName == NULL) {
822
*err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
826
_this->mode = UCNV_SI;
827
ucnv_close(myData2022->currentConverter);
828
myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err);
829
if(U_SUCCESS(*err)) {
830
myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
831
_this->mode = UCNV_SO;
838
StateEnum tempState=nextStateToUnicodeJP[offset];
841
*err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
844
if(myData2022->toU2022State.cs[2]!=0) {
845
if(myData2022->toU2022State.g<2) {
846
myData2022->toU2022State.prevG=myData2022->toU2022State.g;
848
myData2022->toU2022State.g=2;
850
/* illegal to have SS2 before a matching designator */
851
*err = U_ILLEGAL_ESCAPE_SEQUENCE;
854
/* case SS3_STATE: not used in ISO-2022-JP-x */
857
if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
858
*err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
860
/* G2 charset for SS2 */
861
myData2022->toU2022State.cs[2]=(int8_t)tempState;
865
if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
866
*err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
869
myData2022->toU2022State.cs[0]=(int8_t)tempState;
877
StateEnum tempState=nextStateToUnicodeCN[offset];
880
*err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
883
if(myData2022->toU2022State.cs[2]!=0) {
884
if(myData2022->toU2022State.g<2) {
885
myData2022->toU2022State.prevG=myData2022->toU2022State.g;
887
myData2022->toU2022State.g=2;
889
/* illegal to have SS2 before a matching designator */
890
*err = U_ILLEGAL_ESCAPE_SEQUENCE;
894
if(myData2022->toU2022State.cs[3]!=0) {
895
if(myData2022->toU2022State.g<2) {
896
myData2022->toU2022State.prevG=myData2022->toU2022State.g;
898
myData2022->toU2022State.g=3;
900
/* illegal to have SS3 before a matching designator */
901
*err = U_ILLEGAL_ESCAPE_SEQUENCE;
905
if(myData2022->version==0) {
906
*err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
913
myData2022->toU2022State.cs[1]=(int8_t)tempState;
916
myData2022->toU2022State.cs[2]=(int8_t)tempState;
919
/* other CNS 11643 planes */
920
if(myData2022->version==0) {
921
*err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
923
myData2022->toU2022State.cs[3]=(int8_t)tempState;
931
/* nothing to be done, just accept this one escape sequence */
933
*err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
938
*err = U_ILLEGAL_ESCAPE_SEQUENCE;
942
if(U_SUCCESS(*err)) {
943
_this->toULength = 0;
947
/*Checks the characters of the buffer against valid 2022 escape sequences
948
*if they match we return a pointer to the initial start of the sequence otherwise
949
*we return sourceLimit
951
/*for 2022 looks ahead in the stream
952
*to determine the longest possible convertible
955
static U_INLINE const char*
956
getEndOfBuffer_2022(const char** source,
957
const char* sourceLimit,
960
const char* mySource = *source;
962
#ifdef U_ENABLE_GENERIC_ISO_2022
963
if (*source >= sourceLimit)
968
if (*mySource == ESC_2022){
972
UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022;
974
/* Kludge: I could not
975
* figure out the reason for validating an escape sequence
976
* twice - once here and once in changeState_2022().
977
* is it possible to have an ESC character in a ISO2022
978
* byte stream which is valid in a code page? Is it legal?
981
(mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022);
983
value = getKey_2022(*(mySource+i), &key, &offset);
985
if (value > 0 || *mySource==ESC_2022)
988
if ((value == VALID_NON_TERMINAL_2022)&&(!flush) )
991
}while (++mySource < sourceLimit);
995
while(mySource < sourceLimit && *mySource != ESC_2022) {
1003
/* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c
1004
* any future change in _MBCSFromUChar32() function should be reflected here.
1005
* @return number of bytes in *value; negative number if fallback; 0 if no mapping
1007
static U_INLINE int32_t
1008
MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData,
1015
const uint16_t *table;
1016
uint32_t stage2Entry;
1021
* TODO(markus): Use and require new, faster MBCS conversion table structures.
1022
* Use internal version of ucnv_open() that verifies that the new structures are available,
1023
* else U_INTERNAL_PROGRAM_ERROR.
1025
/* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1026
if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1027
table=sharedData->mbcs.fromUnicodeTable;
1028
stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
1029
/* get the bytes and the length for the output */
1030
if(outputType==MBCS_OUTPUT_2){
1031
myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1037
} else /* outputType==MBCS_OUTPUT_3 */ {
1038
p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
1039
myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
1042
} else if(myValue<=0xffff) {
1049
* TODO(markus): Use Shift-JIS table for JIS X 0208, to save mapping table space.
1050
* Pass in parameter for type of output bytes, for validation and shifting:
1051
* - Direct: Pass bytes through, but forbid control codes 00-1F (except SI/SO/ESC) and space 20?
1052
* (Need to allow some (TAB/LF/CR) or most of them for ASCII and maybe JIS X 0201.)
1053
* - A1-FE: Subtract 80 after range check.
1054
* - SJIS: Shift DBCS result to 21-7E x 21-7E.
1056
/* is this code point assigned, or do we use fallbacks? */
1057
if((stage2Entry&(1<<(16+(c&0xf))))!=0) {
1061
} else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) {
1063
* We allow a 0 byte output if the "assigned" bit is set for this entry.
1064
* There is no way with this data structure for fallback output
1065
* to be a zero byte.
1072
cx=sharedData->mbcs.extIndexes;
1074
return ucnv_extSimpleMatchFromU(cx, c, value, useFallback);
1081
/* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c
1082
* any future change in _MBCSSingleFromUChar32() function should be reflected here.
1083
* @param retval pointer to output byte
1084
* @return 1 for a roundtrip byte, 0 for no mapping, -1 for a fallback byte
1086
/*
 * Look up code point c in a single-byte from-Unicode table of sharedData.
 *
 * The low 8 bits of the looked-up table value are stored in *retval.
 * Return codes visible in this listing: 1 (roundtrip), -1 (fallback taken),
 * 0 (no mapping).  With useFallback set, the fallback threshold on the table
 * value is 0x800, otherwise 0xc00.
 *
 * NOTE(review): this listing is damaged: bare numeric lines are interleaved
 * with the code, and the remaining parameters, the declaration of `value`,
 * the roundtrip test and the closing braces are not present.  Restore them
 * from the upstream file before building.
 */
static U_INLINE int32_t
1087
MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData,
1092
const uint16_t *table;
1094
/* BMP-only codepages are stored without stage 1 entries for supplementary code points */
1095
if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
1098
/* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
1099
table=sharedData->mbcs.fromUnicodeTable;
1100
/* get the byte for the output */
1101
value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
1102
/* is this code point assigned, or do we use fallbacks? */
/* only the low byte of the table value is the output byte */
1103
*retval=(uint32_t)(value&0xff);
1105
return 1; /* roundtrip */
1106
} else if(useFallback ? value>=0x800 : value>=0xc00) {
1107
return -1; /* fallback taken */
1109
return 0; /* no mapping */
1113
#ifdef U_ENABLE_GENERIC_ISO_2022
1115
/**********************************************************************************
1116
* ISO-2022 Converter
1122
/*
 * Generic ISO-2022 to-Unicode conversion (compiled only under
 * U_ENABLE_GENERIC_ISO_2022, see the #ifdef preceding this function).
 *
 * Visible flow: while input remains, and no escape sequence is pending
 * (myData->key == 0), the run of bytes up to the next escape sequence or the
 * end of the buffer is located with getEndOfBuffer_2022() and converted with
 * ucnv_toUnicode() through myData->currentConverter (an "ASCII" converter is
 * opened on demand when none is set).  Overflow output and partial/invalid
 * input bytes are then copied from the sub-converter to the outer converter
 * (saveThis), which is the one the caller inspects.  changeState_2022() is
 * called to process an escape sequence.
 *
 * NOTE(review): this listing is damaged: bare numeric lines are interleaved
 * with the code and several statements, braces and the rest of the signature
 * are missing.  The exact nesting of the branches cannot be confirmed from
 * here.
 */
T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args,
1124
const char* mySourceLimit, *realSourceLimit;
1125
const char* sourceStart;
1126
const UChar* myTargetStart;
1127
UConverter* saveThis;
1128
UConverterDataISO2022* myData;
1131
saveThis = args->converter;
1132
myData=((UConverterDataISO2022*)(saveThis->extraInfo));
1134
realSourceLimit = args->sourceLimit;
1135
while (args->source < realSourceLimit) {
1136
if(myData->key == 0) { /* are we in the middle of an escape sequence? */
1137
/*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
1138
mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush);
1140
if(args->source < mySourceLimit) {
1141
if(myData->currentConverter==NULL) {
1142
myData->currentConverter = ucnv_open("ASCII",err);
1143
if(U_FAILURE(*err)){
1147
myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
1148
saveThis->mode = UCNV_SO;
1151
/* convert to before the ESC or until the end of the buffer */
1152
myData->isFirstBuffer=FALSE;
1153
sourceStart = args->source;
1154
myTargetStart = args->target;
/* temporarily point args at the sub-converter; restored to saveThis after the call */
1155
args->converter = myData->currentConverter;
1156
ucnv_toUnicode(args->converter,
1162
(UBool)(args->flush && mySourceLimit == realSourceLimit),
1164
args->converter = saveThis;
1166
if (*err == U_BUFFER_OVERFLOW_ERROR) {
1167
/* move the overflow buffer */
1168
length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength;
1169
myData->currentConverter->UCharErrorBufferLength = 0;
1171
uprv_memcpy(saveThis->UCharErrorBuffer,
1172
myData->currentConverter->UCharErrorBuffer,
1173
length*U_SIZEOF_UCHAR);
1180
* -Error while converting
1181
* -Done with entire buffer
1182
* -Need to write offsets or update the current offset
1183
* (leave that up to the code in ucnv.c)
1185
* or else we just stopped at an ESC byte and continue with changeState_2022()
1187
if (U_FAILURE(*err) ||
1188
(args->source == realSourceLimit) ||
1189
(args->offsets != NULL && (args->target != myTargetStart || args->source != sourceStart) ||
1190
(mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0))
1192
/* copy partial or error input for truncated detection and error handling */
1193
if(U_FAILURE(*err)) {
1194
length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength;
1196
uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length);
1199
length = saveThis->toULength = myData->currentConverter->toULength;
1201
uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length);
1202
if(args->source < mySourceLimit) {
1203
*err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */
1212
sourceStart = args->source;
1213
changeState_2022(args->converter,
1218
if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != NULL)) {
1219
/* let the ucnv.c code update its current offset */
1228
* To Unicode Callback helper function
1231
/*
 * Record the offending input bytes in cnv->toUBytes[] and set the error code
 * for a to-Unicode conversion failure.
 *
 * A sourceChar above 0xff is stored as two bytes (high byte first); otherwise
 * a single byte is stored.  A targetUniChar equal to missingCharMarker-1
 * (0xfffe per the inline comment) yields U_INVALID_CHAR_FOUND; the other
 * visible assignment yields U_ILLEGAL_CHAR_FOUND.
 *
 * NOTE(review): this listing is damaged: bare numeric lines are interleaved
 * with the code, and the return type, the last parameter, the else keywords,
 * the toULength updates and the closing braces are not present.
 */
toUnicodeCallback(UConverter *cnv,
1232
const uint32_t sourceChar, const uint32_t targetUniChar,
1234
if(sourceChar>0xff){
1235
cnv->toUBytes[0] = (uint8_t)(sourceChar>>8);
1236
cnv->toUBytes[1] = (uint8_t)sourceChar;
1240
cnv->toUBytes[0] =(char) sourceChar;
1244
if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){
1245
*err = U_INVALID_CHAR_FOUND;
1248
*err = U_ILLEGAL_CHAR_FOUND;
1252
/**************************************ISO-2022-JP*************************************************/
1254
/************************************** IMPORTANT **************************************************
1255
* The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and
1256
* MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32().
1257
* The converter iterates over each Unicode codepoint
1258
* to obtain the equivalent codepoints from the codepages supported. Since the source buffer is
1259
* processed one char at a time it would make sense to reduce the extra processing a canned converter
1260
* would do as far as possible.
1262
* If the implementation of these macros or structure of sharedData struct change in the future, make
1263
* sure that ISO-2022 is also changed.
1264
***************************************************************************************************
1267
/***************************************************************************************************
1268
* Rules for ISO-2022-jp encoding
1269
* (i) Escape sequences must be fully contained within a line they should not
1270
* span new lines or CRs
1271
* (ii) If the last character on a line is represented by two bytes then an ASCII or
1272
* JIS-Roman character escape sequence should follow before the line terminates
1273
* (iii) If the first character on the line is represented by two bytes then a two
1274
* byte character escape sequence should precede it
1275
* (iv) If no escape sequence is encountered then the characters are ASCII
1276
* (v) Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
1277
* and invoked with SS2 (ESC N).
1278
* (vi) If there is any G0 designation in text, there must be a switch to
1279
* ASCII or to JIS X 0201-Roman before a space character (but not
1280
* necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
1281
* characters such as tab or CRLF.
1282
* (vii) Supported encodings:
1283
* ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
1287
* JISX201, JISX208,JISX212 : new .cnv data files created
1288
* KSC5601 : alias to ibm-949 mapping table
1289
* GB2312 : alias to ibm-1386 mapping table
1290
* ISO-8859-1 : Algorithmic implemented as LATIN1 case
1291
* ISO-8859-7 : alias to ibm-9409 mapping table
1294
/* preference order of JP charsets */
1295
static const StateEnum jpCharsetPref[]={
1308
* The escape sequences must be in order of the enum constants like JISX201 = 3,
1309
* not in order of jpCharsetPref[]!
1311
/*
 * Designation escape sequences for the charsets usable in ISO-2022-JP-x,
 * one NUL-terminated string per charset.
 *
 * The rows are indexed by the charset enum value (the converter writes
 * escSeqChars[cs]), so they must stay in enum order, e.g. JISX201 = 3,
 * and NOT in the order of jpCharsetPref[].  The matching byte counts are
 * kept in escSeqCharsLen[]; update both tables together.
 *
 * Each row is 6 bytes wide: the longest sequence is 4 bytes plus the NUL.
 */
static const char escSeqChars[][6] ={
    "\x1B\x28\x42",         /* <ESC>(B  ASCII       */
    "\x1B\x2E\x41",         /* <ESC>.A  ISO-8859-1  */
    "\x1B\x2E\x46",         /* <ESC>.F  ISO-8859-7  */
    "\x1B\x28\x4A",         /* <ESC>(J  JISX-201    */
    "\x1B\x24\x42",         /* <ESC>$B  JISX-208    */
    "\x1B\x24\x28\x44",     /* <ESC>$(D JISX-212    */
    "\x1B\x24\x41",         /* <ESC>$A  GB2312      */
    "\x1B\x24\x28\x43",     /* <ESC>$(C KSC5601     */
    "\x1B\x28\x49"          /* <ESC>(I  HWKANA_7BIT */
};
1323
/*
 * Byte length of each designation escape sequence, in the same order as
 * the escape-sequence table (indexed by charset enum value, not by
 * preference order).  The converter copies exactly this many bytes when it
 * writes a designation, so every entry must match its sequence.
 */
static const int32_t escSeqCharsLen[] ={
    3, /* length of <ESC>(B  ASCII       */
    3, /* length of <ESC>.A  ISO-8859-1  */
    3, /* length of <ESC>.F  ISO-8859-7  */
    3, /* length of <ESC>(J  JISX-201    */
    3, /* length of <ESC>$B  JISX-208    */
    4, /* length of <ESC>$(D JISX-212    */
    3, /* length of <ESC>$A  GB2312      */
    4, /* length of <ESC>$(C KSC5601     */
    3  /* length of <ESC>(I  HWKANA_7BIT */
};
1336
* The iteration over various code pages works this way:
1337
* i) Get the currentState from myConverterData->currentState
1338
* ii) Check if the character is mapped to a valid character in the currentState
1339
* Yes -> a) set the initIterState to currentState
1340
* b) remain in this state until an invalid character is found
1341
* No -> a) go to the next code page and find the character
1342
* iii) Before changing the state increment the current state check if the current state
1343
* is equal to the initIterState
1344
* Yes -> A character that cannot be represented in any of the supported encodings
1345
* break and return a U_INVALID_CHAR_FOUND error
1346
* No -> Continue and find the character in next code page
1349
* TODO: Implement a priority technique where the users are allowed to set the priority of code pages
1353
/*
 * Convert UTF-16 input to ISO-2022-JP (all variants selected by
 * converterData->version), optionally writing source offsets.
 *
 * Visible flow per input character:
 *  - surrogate pairs are combined into one code point; unmatched surrogates
 *    and the ISO 2022 control characters (IS_2022_CONTROL) are reported as
 *    U_ILLEGAL_CHAR_FOUND with the code point saved in cnv->fromUChar32;
 *  - a list of candidate charsets, choices[], is built when it is empty
 *    (current G0, current G2, then jpCharsetPref[] order; half-width
 *    Katakana first for versions 3 and 4);
 *  - each candidate is tried until a roundtrip mapping is found (len > 0);
 *    a fallback (len < 0) is remembered while the search continues;
 *  - SI/SO, a designation sequence from escSeqChars[] and/or ESC N are
 *    written into buffer[] as needed, followed by the 1 or 2 output bytes;
 *  - a code point with no mapping is reported as U_INVALID_CHAR_FOUND;
 *  - CR or LF resets the G2 designation.
 * At the end of flushed input the state is returned to G0/ASCII.
 * On exit args->source and args->target are updated.
 *
 * NOTE(review): this listing is damaged: bare numeric lines are interleaved
 * with the code, and many declarations, case labels, braces and call heads
 * are missing.  Branch nesting cannot be fully confirmed from here.
 */
UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) {
1354
UConverter *cnv = args->converter;
1355
UConverterDataISO2022 *converterData;
1356
ISO2022State *pFromU2022State;
1357
uint8_t *target = (uint8_t *) args->target;
1358
const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
1359
const UChar* source = args->source;
1360
const UChar* sourceLimit = args->sourceLimit;
1361
int32_t* offsets = args->offsets;
1364
int32_t len, outLen;
1366
int32_t choiceCount;
1367
uint32_t targetValue = 0;
1373
/* set up the state */
1374
converterData = (UConverterDataISO2022*)cnv->extraInfo;
1375
pFromU2022State = &converterData->fromU2022State;
1379
/* check if the last codepoint of previous buffer was a lead surrogate*/
1380
if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
1384
while(source < sourceLimit) {
1385
if(target < targetLimit) {
1387
sourceChar = *(source++);
1388
/*check if the char is a First surrogate*/
1389
if(UTF_IS_SURROGATE(sourceChar)) {
1390
if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
1392
/*look ahead to find the trail surrogate*/
1393
if(source < sourceLimit) {
1394
/* test the following code unit */
1395
UChar trail=(UChar) *source;
1396
if(UTF_IS_SECOND_SURROGATE(trail)) {
1398
sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
1399
cnv->fromUChar32=0x00;
1400
/* convert this supplementary code point */
1401
/* exit this condition tree */
1403
/* this is an unmatched lead code unit (1st surrogate) */
1404
/* callback(illegal) */
1405
*err=U_ILLEGAL_CHAR_FOUND;
1406
cnv->fromUChar32=sourceChar;
1411
cnv->fromUChar32=sourceChar;
1415
/* this is an unmatched trail code unit (2nd surrogate) */
1416
/* callback(illegal) */
1417
*err=U_ILLEGAL_CHAR_FOUND;
1418
cnv->fromUChar32=sourceChar;
1423
/* do not convert SO/SI/ESC */
1424
if(IS_2022_CONTROL(sourceChar)) {
1425
/* callback(illegal) */
1426
*err=U_ILLEGAL_CHAR_FOUND;
1427
cnv->fromUChar32=sourceChar;
1431
/* do the conversion */
/* choices[] is rebuilt only when empty; see "invalidate the choices[]" further down */
1433
if(choiceCount == 0) {
1437
* The csm variable keeps track of which charsets are allowed
1438
* and not used yet while building the choices[].
1440
csm = jpCharsetMasks[converterData->version];
1443
/* JIS7/8: try single-byte half-width Katakana before JISX208 */
1444
if(converterData->version == 3 || converterData->version == 4) {
1445
choices[choiceCount++] = (int8_t)HWKANA_7BIT;
1447
/* Do not try single-byte half-width Katakana for other versions. */
1448
csm &= ~CSM(HWKANA_7BIT);
1450
/* try the current G0 charset */
1451
choices[choiceCount++] = cs = pFromU2022State->cs[0];
1454
/* try the current G2 charset */
1455
if((cs = pFromU2022State->cs[2]) != 0) {
1456
choices[choiceCount++] = cs;
1460
/* try all the other possible charsets */
1461
for(i = 0; i < LENGTHOF(jpCharsetPref); ++i) {
1462
cs = (int8_t)jpCharsetPref[i];
1464
choices[choiceCount++] = cs;
1472
* len==0: no mapping found yet
1473
* len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
1474
* len>0: found a roundtrip result, done
1478
* We will turn off useFallback after finding a fallback,
1479
* but we still get fallbacks from PUA code points as usual.
1480
* Therefore, we will also need to check that we don't overwrite
1481
* an early fallback with a later one.
1483
useFallback = cnv->useFallback;
1485
for(i = 0; i < choiceCount && len <= 0; ++i) {
1488
int8_t cs0 = choices[i];
1491
if(sourceChar <= 0x7f) {
1492
targetValue = (uint32_t)sourceChar;
1499
if(GR96_START <= sourceChar && sourceChar <= GR96_END) {
1500
targetValue = (uint32_t)sourceChar - 0x80;
1507
if((uint32_t)(HWKANA_END-sourceChar)<=(HWKANA_END-HWKANA_START)) {
1508
if(converterData->version==3) {
1509
/* JIS7: use G1 (SO) */
1510
/* Shift U+FF61..U+FF9F to bytes 21..5F. */
1511
targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21));
1513
pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */
1515
} else if(converterData->version==4) {
1516
/* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */
1517
/* Shift U+FF61..U+FF9F to bytes A1..DF. */
1518
targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1));
1521
cs = pFromU2022State->cs[0];
1522
if(IS_JP_DBCS(cs)) {
1523
/* switch from a DBCS charset to JISX201 */
1524
cs = (int8_t)JISX201;
1526
/* else stay in the current G0 charset */
1529
/* else do not use HWKANA_7BIT with other versions */
1534
len2 = MBCS_SINGLE_FROM_UCHAR32(
1535
converterData->myConverterArray[cs0],
1538
if(len2 != 0 && !(len2 < 0 && len != 0) && value <= 0x7f) {
1539
targetValue = value;
1543
useFallback = FALSE;
1547
/* G0 SBCS forced to 7-bit output */
1548
len2 = MBCS_SINGLE_FROM_UCHAR32(
1549
converterData->myConverterArray[cs0],
1552
if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) {
1553
targetValue = value - 0x80;
1557
useFallback = FALSE;
1562
len2 = MBCS_FROM_UCHAR32_ISO2022(
1563
converterData->myConverterArray[cs0],
1565
useFallback, MBCS_OUTPUT_2);
1566
if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */
1567
if(cs0 == KSC5601) {
1569
* Check for valid bytes for the encoding scheme.
1570
* This is necessary because the sub-converter (windows-949)
1571
* has a broader encoding scheme than is valid for 2022.
1573
* Check that the result is a 2-byte value with each byte in the range A1..FE
1574
* (strict EUC-KR DBCS) before accepting it and subtracting 0x80 from each byte
1575
* to move it to the ISO 2022 range 21..7E.
1577
if( (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) &&
1578
(uint8_t)(value - 0xa1) <= (0xfe - 0xa1)
1580
value -= 0x8080; /* shift down to 21..7e byte range */
1582
break; /* not valid for ISO 2022 */
1585
targetValue = value;
1589
useFallback = FALSE;
1597
len = -len; /* fallback */
1599
outLen = 0; /* count output bytes */
1601
/* write SI if necessary (only for JIS7) */
1602
if(pFromU2022State->g == 1 && g == 0) {
1603
buffer[outLen++] = UCNV_SI;
1604
pFromU2022State->g = 0;
1607
/* write the designation sequence if necessary */
1608
if(cs != pFromU2022State->cs[g]) {
1609
int32_t escLen = escSeqCharsLen[cs];
1610
uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen);
1612
pFromU2022State->cs[g] = cs;
1614
/* invalidate the choices[] */
1618
/* write the shift sequence if necessary */
1619
if(g != pFromU2022State->g) {
1621
/* case 0 handled before writing escapes */
1623
buffer[outLen++] = UCNV_SO;
1624
pFromU2022State->g = 1;
1626
default: /* case 2 */
/* 0x1b 0x4e is ESC N, the SS2 single shift named in the rules comment above this function */
1627
buffer[outLen++] = 0x1b;
1628
buffer[outLen++] = 0x4e;
1630
/* no case 3: no SS3 in ISO-2022-JP-x */
1634
/* write the output bytes */
1636
buffer[outLen++] = (char)targetValue;
1637
} else /* len == 2 */ {
1638
buffer[outLen++] = (char)(targetValue >> 8);
1639
buffer[outLen++] = (char)targetValue;
1643
* if we cannot find the character after checking all codepages
1644
* then this is an error
1646
*err = U_INVALID_CHAR_FOUND;
1647
cnv->fromUChar32=sourceChar;
1651
if(sourceChar == CR || sourceChar == LF) {
1652
/* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */
1653
pFromU2022State->cs[2] = 0;
1657
/* output outLen>0 bytes in buffer[] */
1659
*target++ = buffer[0];
1661
*offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
1663
} else if(outLen == 2 && (target + 2) <= targetLimit) {
1664
*target++ = buffer[0];
1665
*target++ = buffer[1];
1667
int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
1668
*offsets++ = sourceIndex;
1669
*offsets++ = sourceIndex;
1675
&target, (const char *)targetLimit,
1676
&offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
1678
if(U_FAILURE(*err)) {
1682
} /* end if(myTargetIndex<myTargetLength) */
1684
*err =U_BUFFER_OVERFLOW_ERROR;
1688
}/* end while(mySourceIndex<mySourceLength) */
1691
* the end of the input stream and detection of truncated input
1692
* are handled by the framework, but for ISO-2022-JP conversion
1693
* we need to be in ASCII mode at the very end
1697
* in SO mode or not in ASCII mode
1698
* end of input and no truncated input
1700
if( U_SUCCESS(*err) &&
1701
(pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) &&
1702
args->flush && source>=sourceLimit && cnv->fromUChar32==0
1704
int32_t sourceIndex;
1708
if(pFromU2022State->g != 0) {
1709
buffer[outLen++] = UCNV_SI;
1710
pFromU2022State->g = 0;
1713
if(pFromU2022State->cs[0] != ASCII) {
1714
int32_t escLen = escSeqCharsLen[ASCII];
1715
uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen);
1717
pFromU2022State->cs[0] = (int8_t)ASCII;
1720
/* get the source index of the last input character */
1722
* TODO this would be simpler and more reliable if we used a pair
1723
* of sourceIndex/prevSourceIndex like in ucnvmbcs.c
1724
* so that we could simply use the prevSourceIndex here;
1725
* this code gives an incorrect result for the rare case of an unmatched
1726
* trail surrogate that is alone in the last buffer of the text stream
1728
sourceIndex=(int32_t)(source-args->source);
1731
if( U16_IS_TRAIL(args->source[sourceIndex]) &&
1732
(sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
1743
&target, (const char *)targetLimit,
1744
&offsets, sourceIndex,
1748
/*save the state and return */
1749
args->source = source;
1750
args->target = (char*)target;
1753
/*************** to unicode *******************/
1756
/*
 * Convert ISO-2022-JP input (all variants selected by myData->version) to
 * UTF-16, optionally writing source offsets.
 *
 * Visible flow: a pending partial escape sequence (myData->key != 0) or a
 * saved lead byte of a double-byte character (toULength == 1) is resumed
 * first.  Then each input byte is dispatched: SI/SO are honoured only for
 * version 3 (JIS7), escape sequences go to changeState_2022() with the
 * ISO_2022_JP variant, and the line-end handling resets G0 to ASCII (unless
 * it is ASCII or JISX201), clears G2 and selects G0.  Other bytes are
 * converted according to the charset currently invoked,
 * cs = cs[pToU2022State->g]; double-byte charsets read a trail byte, or save
 * the lead byte in toUBytes[] when the input ends.  Results below
 * missingCharMarker-1 are written directly, results above missingCharMarker
 * are written as a surrogate pair (the trail unit going to UCharErrorBuffer
 * when the target is full), anything else goes to toUnicodeCallback().
 * On exit args->target and args->source are updated.
 *
 * NOTE(review): this listing is damaged: bare numeric lines are interleaved
 * with the code, and case labels, braces and several statements are missing.
 * Which branch belongs to which charset cannot be fully confirmed from here.
 */
UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
1759
const char *mySource = (char *) args->source;
1760
UChar *myTarget = args->target;
1761
const char *mySourceLimit = args->sourceLimit;
1762
uint32_t targetUniChar = 0x0000;
1763
uint32_t mySourceChar = 0x0000;
1764
UConverterDataISO2022* myData;
1765
ISO2022State *pToU2022State;
1768
myData=(UConverterDataISO2022*)(args->converter->extraInfo);
1769
pToU2022State = &myData->toU2022State;
1771
if(myData->key != 0) {
1772
/* continue with a partial escape sequence */
1774
} else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
1775
/* continue with a partial double-byte character */
1776
mySourceChar = args->converter->toUBytes[0];
1777
args->converter->toULength = 0;
1778
cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
1782
while(mySource < mySourceLimit){
1784
targetUniChar =missingCharMarker;
1786
if(myTarget < args->targetLimit){
1788
mySourceChar= (unsigned char) *mySource++;
1790
switch(mySourceChar) {
1792
if(myData->version==3) {
1796
/* only JIS7 uses SI/SO, not ISO-2022-JP-x */
1801
if(myData->version==3) {
1802
/* JIS7: switch to G1 half-width Katakana */
1803
pToU2022State->cs[1] = (int8_t)HWKANA_7BIT;
1807
/* only JIS7 uses SI/SO, not ISO-2022-JP-x */
1814
changeState_2022(args->converter,&(mySource),
1815
mySourceLimit, ISO_2022_JP,err);
1817
/* invalid or illegal escape sequence */
1818
if(U_FAILURE(*err)){
1819
args->target = myTarget;
1820
args->source = mySource;
1825
/* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
1830
/* automatically reset to single-byte mode */
1831
if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) {
1832
pToU2022State->cs[0] = (int8_t)ASCII;
1834
pToU2022State->cs[2] = 0;
1835
pToU2022State->g = 0;
1838
/* convert one or two bytes */
1839
cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
1840
if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
1843
/* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
1844
targetUniChar = mySourceChar + (HWKANA_START - 0xa1);
1846
/* return from a single-shift state to the previous one */
1847
if(pToU2022State->g >= 2) {
1848
pToU2022State->g=pToU2022State->prevG;
1852
if(mySourceChar <= 0x7f) {
1853
targetUniChar = mySourceChar;
1857
if(mySourceChar <= 0x7f) {
1858
targetUniChar = mySourceChar + 0x80;
1860
/* return from a single-shift state to the previous one */
1861
pToU2022State->g=pToU2022State->prevG;
1864
if(mySourceChar <= 0x7f) {
1865
/* convert mySourceChar+0x80 to use a normal 8-bit table */
1867
_MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
1868
myData->myConverterArray[cs],
1869
mySourceChar + 0x80);
1871
/* return from a single-shift state to the previous one */
1872
pToU2022State->g=pToU2022State->prevG;
1875
if(mySourceChar <= 0x7f) {
1877
_MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
1878
myData->myConverterArray[cs],
1883
if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) {
1884
/* 7-bit halfwidth Katakana */
1885
targetUniChar = mySourceChar + (HWKANA_START - 0x21);
1890
if(mySource < mySourceLimit) {
1893
tempBuf[0] = (char) (mySourceChar);
1894
tempBuf[1] = trailByte = *mySource++;
1895
mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte);
1896
targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
/* input ended after the lead byte: save it so the next call can resume (see toULength == 1 above) */
1898
args->converter->toUBytes[0] = (uint8_t)mySourceChar;
1899
args->converter->toULength = 1;
1902
} /* End of inner switch */
1904
} /* End of outer switch */
1905
if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
1907
args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
1909
*(myTarget++)=(UChar)targetUniChar;
1911
else if(targetUniChar > missingCharMarker){
1912
/* disassemble the surrogate pair and write to output*/
1913
targetUniChar-=0x0010000;
1914
*myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
1916
args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
1919
if(myTarget< args->targetLimit){
1920
*myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
1922
args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
1926
args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
1927
(UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
1932
/* Call the callback function*/
1933
toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
1937
else{ /* goes with "if(myTarget < args->targetLimit)" way up near top of function */
1938
*err =U_BUFFER_OVERFLOW_ERROR;
1943
args->target = myTarget;
1944
args->source = mySource;
1948
/***************************************************************
1949
* Rules for ISO-2022-KR encoding
1950
* i) The KSC5601 designator sequence should appear only once in a file,
1951
* at the beginning of a line before any KSC5601 characters. This usually
1952
* means that it appears by itself on the first line of the file
1953
* ii) There are only 2 shifting sequences SO to shift into double byte mode
1954
* and SI to shift into single byte mode
1957
/*
 * ISO-2022-KR from-Unicode conversion for the variant that delegates to a
 * sub-converter (the caller in this file selects it when version == 1).
 *
 * The public converter in args is temporarily replaced by
 * myConverterData->currentConverter, ucnv_MBCSFromUnicodeWithOffsets() does
 * the conversion, and the public converter is put back afterwards.  The
 * pending code point (fromUChar32) is passed to the sub-converter before the
 * call and copied back after it.  On U_BUFFER_OVERFLOW_ERROR the
 * sub-converter's overflow bytes (charErrorBuffer) are moved to the public
 * converter and the sub-converter's count is cleared.
 *
 * NOTE(review): this listing is damaged: bare numeric lines are interleaved
 * with the code, and the return type, the head of the copy call and the
 * closing braces are not present.
 */
UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){
1959
UConverter* saveConv = args->converter;
1960
UConverterDataISO2022 *myConverterData=(UConverterDataISO2022*)saveConv->extraInfo;
1961
args->converter=myConverterData->currentConverter;
1963
myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32;
1964
ucnv_MBCSFromUnicodeWithOffsets(args,err);
1965
saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
1967
if(*err == U_BUFFER_OVERFLOW_ERROR) {
1968
if(myConverterData->currentConverter->charErrorBufferLength > 0) {
1970
saveConv->charErrorBuffer,
1971
myConverterData->currentConverter->charErrorBuffer,
1972
myConverterData->currentConverter->charErrorBufferLength);
1974
saveConv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
1975
myConverterData->currentConverter->charErrorBufferLength = 0;
1977
args->converter=saveConv;
1981
/*
 * Convert UTF-16 input to ISO-2022-KR, optionally writing source offsets.
 *
 * Version 1 is handed to UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM().
 * Otherwise each input unit is looked up with MBCS_FROM_UCHAR32_ISO2022() in
 * the shared data of converterData->currentConverter.  Only single-byte
 * results, or double-byte results with the high bit set in both bytes, are
 * accepted.  Whether the previous output was double-byte is carried in
 * args->converter->fromUnicodeStatus; when that changes, SO (to double-byte)
 * or SI (to single-byte) is written first.  Double-byte results are written
 * with 0x80 subtracted from each byte.  Bytes that do not fit in the target
 * go to charErrorBuffer with U_BUFFER_OVERFLOW_ERROR.  SO/SI/ESC in the input
 * and unmatched surrogates are U_ILLEGAL_CHAR_FOUND; unmapped code points are
 * U_INVALID_CHAR_FOUND.  At the end of flushed input the converter switches
 * back to single-byte (ASCII) mode.  On exit args->source, args->target and
 * fromUnicodeStatus are updated.
 *
 * NOTE(review): this listing is damaged: bare numeric lines are interleaved
 * with the code, and several declarations, else keywords, braces and call
 * heads are missing.  The closing of the comment that introduces the
 * version==1 check is also missing, so do not edit around it without
 * restoring the original text.
 */
UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
1983
const UChar *source = args->source;
1984
const UChar *sourceLimit = args->sourceLimit;
1985
unsigned char *target = (unsigned char *) args->target;
1986
unsigned char *targetLimit = (unsigned char *) args->targetLimit;
1987
int32_t* offsets = args->offsets;
1988
uint32_t targetByteUnit = 0x0000;
1989
UChar32 sourceChar = 0x0000;
1990
UBool isTargetByteDBCS;
1991
UBool oldIsTargetByteDBCS;
1992
UConverterDataISO2022 *converterData;
1993
UConverterSharedData* sharedData;
1997
converterData=(UConverterDataISO2022*)args->converter->extraInfo;
1998
/* if the version is 1 then the user is requesting
1999
* conversion with ibm-25546 pass the arguments to
2000
* MBCS converter and return
2002
if(converterData->version==1){
2003
UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2007
/* initialize data */
2008
sharedData = converterData->currentConverter->sharedData;
2009
useFallback = args->converter->useFallback;
2010
isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus;
2011
oldIsTargetByteDBCS = isTargetByteDBCS;
2013
isTargetByteDBCS = (UBool) args->converter->fromUnicodeStatus;
2014
if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) {
2017
while(source < sourceLimit){
2019
targetByteUnit = missingCharMarker;
2021
if(target < (unsigned char*) args->targetLimit){
2022
sourceChar = *source++;
2024
/* do not convert SO/SI/ESC */
2025
if(IS_2022_CONTROL(sourceChar)) {
2026
/* callback(illegal) */
2027
*err=U_ILLEGAL_CHAR_FOUND;
2028
args->converter->fromUChar32=sourceChar;
2032
length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2);
2034
length = -length; /* fallback */
2036
/* only DBCS or SBCS characters are expected*/
2037
/* DB characters with high bit set to 1 are expected */
2038
if(length > 2 || length==0 ||(((targetByteUnit & 0x8080) != 0x8080)&& length==2)){
2039
targetByteUnit=missingCharMarker;
2041
if (targetByteUnit != missingCharMarker){
2043
oldIsTargetByteDBCS = isTargetByteDBCS;
2044
isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF);
2045
/* append the shift sequence */
2046
if (oldIsTargetByteDBCS != isTargetByteDBCS ){
2048
if (isTargetByteDBCS)
2049
*target++ = UCNV_SO;
2051
*target++ = UCNV_SI;
2053
*(offsets++) = (int32_t)(source - args->source-1);
2055
/* write the targetUniChar to target */
2056
if(targetByteUnit <= 0x00FF){
2057
if( target < targetLimit){
2058
*(target++) = (unsigned char) targetByteUnit;
2060
*(offsets++) = (int32_t)(source - args->source-1);
2064
args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit);
2065
*err = U_BUFFER_OVERFLOW_ERROR;
2068
if(target < targetLimit){
2069
*(target++) =(unsigned char) ((targetByteUnit>>8) -0x80);
2071
*(offsets++) = (int32_t)(source - args->source-1);
2073
if(target < targetLimit){
2074
*(target++) =(unsigned char) (targetByteUnit -0x80);
2076
*(offsets++) = (int32_t)(source - args->source-1);
2079
args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80);
2080
*err = U_BUFFER_OVERFLOW_ERROR;
2083
args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80);
2084
args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80);
2085
*err = U_BUFFER_OVERFLOW_ERROR;
2091
/* oops.. the code point is unassigned
2092
* set the error and reason
2095
/*check if the char is a First surrogate*/
2096
if(UTF_IS_SURROGATE(sourceChar)) {
2097
if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
2099
/*look ahead to find the trail surrogate*/
2100
if(source < sourceLimit) {
2101
/* test the following code unit */
2102
UChar trail=(UChar) *source;
2103
if(UTF_IS_SECOND_SURROGATE(trail)) {
2105
sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
2106
*err = U_INVALID_CHAR_FOUND;
2107
/* convert this surrogate code point */
2108
/* exit this condition tree */
2110
/* this is an unmatched lead code unit (1st surrogate) */
2111
/* callback(illegal) */
2112
*err=U_ILLEGAL_CHAR_FOUND;
2116
*err = U_ZERO_ERROR;
2119
/* this is an unmatched trail code unit (2nd surrogate) */
2120
/* callback(illegal) */
2121
*err=U_ILLEGAL_CHAR_FOUND;
2124
/* callback(unassigned) for a BMP code point */
2125
*err = U_INVALID_CHAR_FOUND;
2128
args->converter->fromUChar32=sourceChar;
2131
} /* end if(myTargetIndex<myTargetLength) */
2133
*err =U_BUFFER_OVERFLOW_ERROR;
2137
}/* end while(mySourceIndex<mySourceLength) */
2140
* the end of the input stream and detection of truncated input
2141
* are handled by the framework, but for ISO-2022-KR conversion
2142
* we need to be in ASCII mode at the very end
2147
* end of input and no truncated input
2149
if( U_SUCCESS(*err) &&
2151
args->flush && source>=sourceLimit && args->converter->fromUChar32==0
2153
int32_t sourceIndex;
2155
/* we are switching to ASCII */
2156
isTargetByteDBCS=FALSE;
2158
/* get the source index of the last input character */
2160
* TODO this would be simpler and more reliable if we used a pair
2161
* of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2162
* so that we could simply use the prevSourceIndex here;
2163
* this code gives an incorrect result for the rare case of an unmatched
2164
* trail surrogate that is alone in the last buffer of the text stream
2166
sourceIndex=(int32_t)(source-args->source);
2169
if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2170
(sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2181
&target, (const char *)targetLimit,
2182
&offsets, sourceIndex,
2186
/*save the state and return */
2187
args->source = source;
2188
args->target = (char*)target;
2189
args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS;
2192
/************************ To Unicode ***************************************/
2195
/*
 * ISO-2022-KR to-Unicode conversion for the variant that delegates to a
 * sub-converter (the caller in this file selects it when version == 1).
 *
 * A private copy of the arguments, subArgs, is made (at most
 * sizeof(UConverterToUnicodeArgs) bytes, fewer if args->size is smaller) and
 * pointed at myData->currentConverter.  While input remains and no error is
 * set, the run of bytes up to the next escape sequence or the end of the
 * buffer (getEndOfBuffer_2022) is converted with
 * ucnv_MBCSToUnicodeWithOffsets().  Partial input bytes (toUBytes/toULength)
 * are moved into the sub-converter before the call and back afterwards, so
 * that the framework, which only sees the public converter, can handle
 * truncated and illegal input.  Offsets are rebased onto the original start
 * of the input, and overflow output is copied back on
 * U_BUFFER_OVERFLOW_ERROR.  changeState_2022() is called to process an
 * escape sequence.
 *
 * NOTE(review): this listing is damaged: bare numeric lines are interleaved
 * with the code, and else keywords, braces, the offset-adjustment loop body
 * and the tail of the function are missing.
 */
UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args,
2197
char const* sourceStart;
2198
UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2200
UConverterToUnicodeArgs subArgs;
2201
int32_t minArgsSize;
2203
/* set up the subconverter arguments */
2204
if(args->size<sizeof(UConverterToUnicodeArgs)) {
2205
minArgsSize = args->size;
2207
minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs);
2210
uprv_memcpy(&subArgs, args, minArgsSize);
2211
subArgs.size = (uint16_t)minArgsSize;
2212
subArgs.converter = myData->currentConverter;
2214
/* remember the original start of the input for offsets */
2215
sourceStart = args->source;
2217
if(myData->key != 0) {
2218
/* continue with a partial escape sequence */
2222
while(U_SUCCESS(*err) && args->source < args->sourceLimit) {
2223
/*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
2224
subArgs.source = args->source;
2225
subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush);
2226
if(subArgs.source != subArgs.sourceLimit) {
2228
* get the current partial byte sequence
2230
* it needs to be moved between the public and the subconverter
2231
* so that the conversion framework, which only sees the public
2232
* converter, can handle truncated and illegal input etc.
2234
if(args->converter->toULength > 0) {
2235
uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength);
2237
subArgs.converter->toULength = args->converter->toULength;
2240
* Convert up to the end of the input, or to before the next escape character.
2241
* Does not handle conversion extensions because the preToU[] state etc.
2244
ucnv_MBCSToUnicodeWithOffsets(&subArgs, err);
2246
if(args->offsets != NULL && sourceStart != args->source) {
2247
/* update offsets to base them on the actual start of the input */
2248
int32_t *offsets = args->offsets;
2249
UChar *target = args->target;
2250
int32_t delta = (int32_t)(args->source - sourceStart);
2251
while(target < subArgs.target) {
/* publish the sub-converter's progress through the caller-visible args */
2259
args->source = subArgs.source;
2260
args->target = subArgs.target;
2261
args->offsets = subArgs.offsets;
2263
/* copy input/error/overflow buffers */
2264
if(subArgs.converter->toULength > 0) {
2265
uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength);
2267
args->converter->toULength = subArgs.converter->toULength;
2269
if(*err == U_BUFFER_OVERFLOW_ERROR) {
2270
if(subArgs.converter->UCharErrorBufferLength > 0) {
2271
uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer,
2272
subArgs.converter->UCharErrorBufferLength);
2274
args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength;
2275
subArgs.converter->UCharErrorBufferLength = 0;
2279
if (U_FAILURE(*err) || (args->source == args->sourceLimit)) {
2284
changeState_2022(args->converter,
2293
/*
 * ISO-2022-KR toUnicode conversion that also records source offsets.
 * Reads bytes from args->source up to args->sourceLimit, writes UChars to
 * args->target up to args->targetLimit, and stores the advanced source and
 * target positions back into args.
 * The myData->version==1 branch calls the _IBM variant of this function.
 * Errors are reported through *err (U_BUFFER_OVERFLOW_ERROR is set here;
 * escape-sequence failures come back from changeState_2022()).
 */
UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2296
const char *mySource = ( char *) args->source;
2297
UChar *myTarget = args->target;
2298
const char *mySourceLimit = args->sourceLimit;
2299
UChar32 targetUniChar = 0x0000;
2300
UChar mySourceChar = 0x0000;
2301
UConverterDataISO2022* myData;
2302
UConverterSharedData* sharedData ;
2305
myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2306
if(myData->version==1){
2307
UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
2311
/* initialize state */
2312
sharedData = myData->currentConverter->sharedData;
2313
useFallback = args->converter->useFallback;
2315
if(myData->key != 0) {
2316
/* continue with a partial escape sequence */
2318
} else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2319
/* continue with a partial double-byte character */
/* the lead byte was parked in toUBytes[0] by an earlier call (see the toULength = 1 store below) */
2320
mySourceChar = args->converter->toUBytes[0];
2321
args->converter->toULength = 0;
2325
while(mySource< mySourceLimit){
2327
if(myTarget < args->targetLimit){
2329
mySourceChar= (unsigned char) *mySource++;
2331
/* SI selects g = 0, SO selects g = 1; neither produces output */
if(mySourceChar==UCNV_SI){
2332
myData->toU2022State.g = 0;
2333
/*consume the source */
2335
}else if(mySourceChar==UCNV_SO){
2336
myData->toU2022State.g = 1;
2337
/*consume the source */
2339
}else if(mySourceChar==ESC_2022){
2342
changeState_2022(args->converter,&(mySource),
2343
mySourceLimit, ISO_2022_KR, err);
2344
if(U_FAILURE(*err)){
2345
args->target = myTarget;
2346
args->source = mySource;
2352
/* g == 1: the current byte is the lead byte of a two-byte character */
if(myData->toU2022State.g == 1) {
2353
if(mySource < mySourceLimit) {
2356
trailByte = *mySource++;
/* the table lookup is done on both bytes with 0x80 added */
2357
tempBuf[0] = (char)(mySourceChar + 0x80);
2358
tempBuf[1] = (char)(trailByte + 0x80);
/* from here on mySourceChar holds the 16-bit lead/trail pair */
2359
mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte);
2360
/* only look up the pair when neither input byte has its high bit set */
if((mySourceChar & 0x8080) == 0) {
2361
targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback);
2363
/* illegal bytes > 0x7f */
2364
targetUniChar = missingCharMarker;
2367
/* park the lead byte so the next call can resume this character */
args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2368
args->converter->toULength = 1;
2373
/* single-byte lookup on the byte that was just consumed */
targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback);
2375
if(targetUniChar < 0xfffe){
2377
/* offset of the character's first byte: back up 1 for a single byte, 2 for a pair (> 0xff) */
args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2379
*(myTarget++)=(UChar)targetUniChar;
2382
/* Call the callback function*/
2383
toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2388
*err =U_BUFFER_OVERFLOW_ERROR;
2392
/* hand the advanced positions back to the caller */
args->target = myTarget;
2393
args->source = mySource;
2396
/*************************** END ISO2022-KR *********************************/
2398
/*************************** ISO-2022-CN *********************************
2400
* Rules for ISO-2022-CN Encoding:
2401
* i) The designator sequence must appear once on a line before any instance
2402
* of character set it designates.
2403
* ii) If two lines contain characters from the same character set, both lines
2404
* must include the designator sequence.
2405
* iii) Once the designator sequence is known, a shifting sequence has to be found
2406
* to invoke the shifting
2407
* iv) All lines start in ASCII and end in ASCII.
2408
* v) Four shifting sequences are employed for this purpose:
2410
* Sequence ASCII Eq Charsets
2411
* ---------- ------- ---------
2413
* SO <SO> CNS-11643-1992 Plane 1, GB2312, ISO-IR-165
2414
* SS2 <ESC>N CNS-11643-1992 Plane 2
2415
* SS3 <ESC>O CNS-11643-1992 Planes 3-7
2418
* SOdesignator : ESC "$" ")" finalchar_for_SO
2419
* SS2designator : ESC "$" "*" finalchar_for_SS2
2420
* SS3designator : ESC "$" "+" finalchar_for_SS3
2422
* ESC $ ) A Indicates the bytes following SO are Chinese
2423
* characters as defined in GB 2312-80, until
2424
* another SOdesignation appears
2427
* ESC $ ) E Indicates the bytes following SO are as defined
2428
* in ISO-IR-165 (for details, see section 2.1),
2429
* until another SOdesignation appears
2431
* ESC $ ) G Indicates the bytes following SO are as defined
2432
* in CNS 11643-plane-1, until another
2433
* SOdesignation appears
2435
* ESC $ * H Indicates the two bytes immediately following
2436
* SS2 is a Chinese character as defined in CNS
2437
* 11643-plane-2, until another SS2designation
2439
* (Meaning <ESC>N must precede every 2 byte
2442
* ESC $ + I Indicates the immediate two bytes following SS3
2443
* is a Chinese character as defined in CNS
2444
* 11643-plane-3, until another SS3designation
2446
* (Meaning <ESC>O must precede every 2 byte
2449
* ESC $ + J Indicates the immediate two bytes following SS3
2450
* is a Chinese character as defined in CNS
2451
* 11643-plane-4, until another SS3designation
2453
* (In English: <ESC>O must precede every 2 byte
2456
* ESC $ + K Indicates the immediate two bytes following SS3
2457
* is a Chinese character as defined in CNS
2458
* 11643-plane-5, until another SS3designation
2461
* ESC $ + L Indicates the immediate two bytes following SS3
2462
* is a Chinese character as defined in CNS
2463
* 11643-plane-6, until another SS3designation
2466
* ESC $ + M Indicates the immediate two bytes following SS3
2467
* is a Chinese character as defined in CNS
2468
* 11643-plane-7, until another SS3designation
2471
* As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and
2472
* has its own designation information before any Chinese characters
2477
/* The following are defined this way to make the strings truly readonly */
/* Each string is a 4-byte designator: ESC '$', then ')' (SO), '*' (SS2) or '+' (SS3), then the final byte. */
2478
static const char GB_2312_80_STR[] = "\x1B\x24\x29\x41"; /* ESC $ ) A */
2479
static const char ISO_IR_165_STR[] = "\x1B\x24\x29\x45"; /* ESC $ ) E */
2480
static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47"; /* ESC $ ) G */
2481
static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48"; /* ESC $ * H */
2482
static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49"; /* ESC $ + I */
2483
static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A"; /* ESC $ + J */
2484
static const char CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B"; /* ESC $ + K */
2485
static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C"; /* ESC $ + L */
2486
static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D"; /* ESC $ + M */
2488
/********************** ISO2022-CN Data **************************/
2489
/*
 * Designation sequences written by the ISO-2022-CN fromUnicode code.
 * That code copies 4 bytes from escSeqCharsCN[cs] when cs < CNS_11643 and
 * from escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)] for the CNS planes,
 * so the order of the entries must follow that indexing.
 */
static const char* const escSeqCharsCN[10] ={
2490
SHIFT_IN_STR, /* ASCII */
2493
CNS_11643_1992_Plane_1_STR,
2494
CNS_11643_1992_Plane_2_STR,
2495
CNS_11643_1992_Plane_3_STR,
2496
CNS_11643_1992_Plane_4_STR,
2497
CNS_11643_1992_Plane_5_STR,
2498
CNS_11643_1992_Plane_6_STR,
2499
CNS_11643_1992_Plane_7_STR
2503
/*
 * ISO-2022-CN / ISO-2022-CN-EXT fromUnicode conversion with offsets.
 * Converts UChars from args->source (up to args->sourceLimit) into bytes at
 * args->target (up to args->targetLimit). For each character it assembles the
 * needed designation sequence, shift and data bytes in a small local buffer
 * and then writes them out, updating the fromU2022State as it goes.
 * converterData->version picks the variant: the version == 0 path tries only
 * the two SO charsets, the other path is labelled ISO-2022-CN-EXT.
 * A pending lead surrogate or offending code point is kept in
 * cnv->fromUChar32; errors are reported through *err.
 */
UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
2504
UConverter *cnv = args->converter;
2505
UConverterDataISO2022 *converterData;
2506
ISO2022State *pFromU2022State;
2507
uint8_t *target = (uint8_t *) args->target;
2508
const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
2509
const UChar* source = args->source;
2510
const UChar* sourceLimit = args->sourceLimit;
2511
int32_t* offsets = args->offsets;
2516
int32_t choiceCount;
2517
uint32_t targetValue = 0;
2520
/* set up the state */
2521
converterData = (UConverterDataISO2022*)cnv->extraInfo;
2522
pFromU2022State = &converterData->fromU2022State;
2526
/* check if the last codepoint of previous buffer was a lead surrogate*/
2527
if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
2531
while( source < sourceLimit){
2532
if(target < targetLimit){
2534
sourceChar = *(source++);
2535
/*check if the char is a First surrogate*/
2536
if(UTF_IS_SURROGATE(sourceChar)) {
2537
if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
2539
/*look ahead to find the trail surrogate*/
2540
if(source < sourceLimit) {
2541
/* test the following code unit */
2542
UChar trail=(UChar) *source;
2543
if(UTF_IS_SECOND_SURROGATE(trail)) {
2545
sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail);
2546
cnv->fromUChar32=0x00;
2547
/* convert this supplementary code point */
2548
/* exit this condition tree */
2550
/* this is an unmatched lead code unit (1st surrogate) */
2551
/* callback(illegal) */
2552
*err=U_ILLEGAL_CHAR_FOUND;
2553
cnv->fromUChar32=sourceChar;
2558
/* NOTE(review): presumably the no-more-input case; the lead surrogate is kept for the next call */
cnv->fromUChar32=sourceChar;
2562
/* this is an unmatched trail code unit (2nd surrogate) */
2563
/* callback(illegal) */
2564
*err=U_ILLEGAL_CHAR_FOUND;
2565
cnv->fromUChar32=sourceChar;
2570
/* do the conversion */
2571
if(sourceChar <= 0x007f ){
2572
/* do not convert SO/SI/ESC */
2573
if(IS_2022_CONTROL(sourceChar)) {
2574
/* callback(illegal) */
2575
*err=U_ILLEGAL_CHAR_FOUND;
2576
cnv->fromUChar32=sourceChar;
2581
/* already in g == 0: the ASCII byte goes out as is */
if(pFromU2022State->g == 0) {
2582
buffer[0] = (char)sourceChar;
2585
/* otherwise emit SI first and record the switch back to g = 0 */
buffer[0] = UCNV_SI;
2586
buffer[1] = (char)sourceChar;
2588
pFromU2022State->g = 0;
2591
if(sourceChar == CR || sourceChar == LF) {
2592
/* reset the state at the end of a line */
2593
uprv_memset(pFromU2022State, 0, sizeof(ISO2022State));
2598
/* convert U+0080..U+10ffff */
2602
/* choiceCount == 0 means the candidate list in choices[] has to be (re)built */
if(choiceCount == 0) {
2603
/* try the current SO/G1 converter first */
2604
choices[0] = pFromU2022State->cs[1];
2606
/* default to GB2312_1 if none is designated yet */
2607
if(choices[0] == 0) {
2608
choices[0] = GB2312_1;
2611
if(converterData->version == 0) {
2614
/* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */
2615
if(choices[0] == GB2312_1) {
2616
choices[1] = (int8_t)CNS_11643_1;
2618
choices[1] = (int8_t)GB2312_1;
2623
/* ISO-2022-CN-EXT */
2625
/* try one of the other converters */
2626
switch(choices[0]) {
2628
choices[1] = (int8_t)CNS_11643_1;
2629
choices[2] = (int8_t)ISO_IR_165;
2632
choices[1] = (int8_t)GB2312_1;
2633
choices[2] = (int8_t)CNS_11643_1;
2635
default: /* CNS_11643_x */
2636
choices[1] = (int8_t)GB2312_1;
2637
choices[2] = (int8_t)ISO_IR_165;
2647
* len==0: no mapping found yet
2648
* len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
2649
* len>0: found a roundtrip result, done
2653
* We will turn off useFallback after finding a fallback,
2654
* but we still get fallbacks from PUA code points as usual.
2655
* Therefore, we will also need to check that we don't overwrite
2656
* an early fallback with a later one.
2658
useFallback = cnv->useFallback;
2660
/* walk the candidates until one yields a roundtrip mapping (len > 0) */
for(i = 0; i < choiceCount && len <= 0; ++i) {
2661
int8_t cs0 = choices[i];
2665
/* every CNS plane is looked up through the single CNS_11643 table */
if(cs0 > CNS_11643_0) {
2666
len2 = MBCS_FROM_UCHAR32_ISO2022(
2667
converterData->myConverterArray[CNS_11643],
2672
/* accept a 3-byte roundtrip, or a 3-byte fallback only if nothing was found before */
if(len2 == 3 || (len2 == -3 && len == 0)) {
2673
targetValue = value;
2674
/* the byte above the two low bytes of value, minus 0x80, is added to CNS_11643_0 to get the charset */
cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80);
2679
useFallback = FALSE;
2681
if(cs == CNS_11643_1) {
2683
} else if(cs == CNS_11643_2) {
2685
} else /* plane 3..7 */ if(converterData->version == 1) {
2688
/* ISO-2022-CN (without -EXT) does not support plane 3..7 */
2693
/* GB2312_1 or ISO-IR-165 */
2694
len2 = MBCS_FROM_UCHAR32_ISO2022(
2695
converterData->myConverterArray[cs0],
2700
/* accept a 2-byte roundtrip, or a 2-byte fallback only if nothing was found before */
if(len2 == 2 || (len2 == -2 && len == 0)) {
2701
targetValue = value;
2705
useFallback = FALSE;
2712
len = 0; /* count output bytes; it must have been abs(len) == 2 */
2714
/* write the designation sequence if necessary */
2715
if(cs != pFromU2022State->cs[g]) {
2716
if(cs < CNS_11643) {
2717
uprv_memcpy(buffer, escSeqCharsCN[cs], 4);
2719
uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4);
2722
pFromU2022State->cs[g] = cs;
2724
/* changing the SO/G1 charset invalidates the choices[] */
2729
/* write the shift sequence if necessary */
2730
if(g != pFromU2022State->g) {
2733
buffer[len++] = UCNV_SO;
2735
/* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */
2736
pFromU2022State->g = 1;
2739
buffer[len++] = 0x1b; /* ESC */
2740
buffer[len++] = 0x4e; /* 'N': ESC N is SS2 */
2742
default: /* case 3 */
2743
buffer[len++] = 0x1b; /* ESC */
2744
buffer[len++] = 0x4f; /* 'O': ESC O is SS3 */
2749
/* write the two output bytes */
2750
buffer[len++] = (char)(targetValue >> 8);
2751
buffer[len++] = (char)targetValue;
2753
/* if we cannot find the character after checking all codepages
2754
* then this is an error
2756
*err = U_INVALID_CHAR_FOUND;
2757
cnv->fromUChar32=sourceChar;
2762
/* output len>0 bytes in buffer[] */
2764
*target++ = buffer[0];
2766
*offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
2768
} else if(len == 2 && (target + 2) <= targetLimit) {
2769
*target++ = buffer[0];
2770
*target++ = buffer[1];
2772
/* both output bytes get the index of the first code unit of sourceChar */
int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
2773
*offsets++ = sourceIndex;
2774
*offsets++ = sourceIndex;
2780
&target, (const char *)targetLimit,
2781
&offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
2783
if(U_FAILURE(*err)) {
2787
} /* end if(myTargetIndex<myTargetLength) */
2789
*err =U_BUFFER_OVERFLOW_ERROR;
2793
}/* end while(mySourceIndex<mySourceLength) */
2796
* the end of the input stream and detection of truncated input
2797
* are handled by the framework, but for ISO-2022-CN conversion
2798
* we need to be in ASCII mode at the very end
2803
* end of input and no truncated input
2805
if( U_SUCCESS(*err) &&
2806
pFromU2022State->g!=0 &&
2807
args->flush && source>=sourceLimit && cnv->fromUChar32==0
2809
int32_t sourceIndex;
2811
/* we are switching to ASCII */
2812
pFromU2022State->g=0;
2814
/* get the source index of the last input character */
2816
* TODO this would be simpler and more reliable if we used a pair
2817
* of sourceIndex/prevSourceIndex like in ucnvmbcs.c
2818
* so that we could simply use the prevSourceIndex here;
2819
* this code gives an incorrect result for the rare case of an unmatched
2820
* trail surrogate that is alone in the last buffer of the text stream
2822
sourceIndex=(int32_t)(source-args->source);
2825
if( U16_IS_TRAIL(args->source[sourceIndex]) &&
2826
(sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
2837
&target, (const char *)targetLimit,
2838
&offsets, sourceIndex,
2842
/*save the state and return */
2843
args->source = source;
2844
args->target = (char*)target;
2849
/*
 * ISO-2022-CN toUnicode conversion that also records source offsets.
 * Reads bytes from args->source up to args->sourceLimit and writes UChars to
 * args->target up to args->targetLimit, tracking the active charset per
 * graphic set in myData->toU2022State. Escape sequences are handed to
 * changeState_2022(). The advanced source/target positions are stored back
 * into args; errors are reported through *err.
 */
UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
2852
const char *mySource = (char *) args->source;
2853
UChar *myTarget = args->target;
2854
const char *mySourceLimit = args->sourceLimit;
2855
uint32_t targetUniChar = 0x0000;
2856
uint32_t mySourceChar = 0x0000;
2857
UConverterDataISO2022* myData;
2858
ISO2022State *pToU2022State;
2860
myData=(UConverterDataISO2022*)(args->converter->extraInfo);
2861
pToU2022State = &myData->toU2022State;
2863
if(myData->key != 0) {
2864
/* continue with a partial escape sequence */
2866
} else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
2867
/* continue with a partial double-byte character */
/* the lead byte was parked in toUBytes[0] by an earlier call (see the toULength = 1 store below) */
2868
mySourceChar = args->converter->toUBytes[0];
2869
args->converter->toULength = 0;
2873
while(mySource < mySourceLimit){
2875
targetUniChar =missingCharMarker;
2877
if(myTarget < args->targetLimit){
2879
mySourceChar= (unsigned char) *mySource++;
2881
switch(mySourceChar){
2887
/* cs[1] != 0 means a charset has been designated for SO */
if(pToU2022State->cs[1] != 0) {
2891
/* illegal to have SO before a matching designator */
2898
changeState_2022(args->converter,&(mySource),
2899
mySourceLimit, ISO_2022_CN,err);
2901
/* invalid or illegal escape sequence */
2902
if(U_FAILURE(*err)){
2903
args->target = myTarget;
2904
args->source = mySource;
2909
/* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */
2914
/* clears the whole toUnicode state (all designations and the shift state) */
uprv_memset(pToU2022State, 0, sizeof(ISO2022State));
2917
/* convert one or two bytes */
2918
if(pToU2022State->g != 0) {
2919
if(mySource < mySourceLimit) {
2920
UConverterSharedData *cnv;
2921
StateEnum tempState;
2925
trailByte = *mySource++;
2926
tempState = (StateEnum)pToU2022State->cs[pToU2022State->g];
2927
/* all CNS planes share one table; a leading byte 0x80 + (tempState - CNS_11643_0) selects the plane */
if(tempState > CNS_11643_0) {
2928
cnv = myData->myConverterArray[CNS_11643];
2929
tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0));
2930
tempBuf[1] = (char) (mySourceChar);
2931
tempBuf[2] = trailByte;
2935
cnv = myData->myConverterArray[tempState];
2936
tempBuf[0] = (char) (mySourceChar);
2937
tempBuf[1] = trailByte;
2940
/* from here on mySourceChar holds the 16-bit lead/trail pair */
mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte);
2941
if(pToU2022State->g>=2) {
2942
/* return from a single-shift state to the previous one */
2943
pToU2022State->g=pToU2022State->prevG;
2945
targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, FALSE);
2947
/* park the lead byte so the next call can resume this character */
args->converter->toUBytes[0] = (uint8_t)mySourceChar;
2948
args->converter->toULength = 1;
2953
if(mySourceChar <= 0x7f) {
2954
targetUniChar = (UChar) mySourceChar;
2959
if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
2961
/* offset of the character's first byte: back up 1 for a single byte, 2 for a pair (> 0xff) */
args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2963
*(myTarget++)=(UChar)targetUniChar;
2965
else if(targetUniChar > missingCharMarker){
2966
/* disassemble the surrogate pair and write to output*/
2967
targetUniChar-=0x0010000;
2968
*myTarget = (UChar)(0xd800+(UChar)(targetUniChar>>10));
2970
args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2973
if(myTarget< args->targetLimit){
2974
*myTarget = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2976
args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
2980
/* NOTE(review): presumably the no-room branch; the trail surrogate is queued in UCharErrorBuffer */
args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
2981
(UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
2986
/* Call the callback function*/
2987
toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
2992
*err =U_BUFFER_OVERFLOW_ERROR;
2997
/* hand the advanced positions back to the caller */
args->target = myTarget;
2998
args->source = mySource;
3002
/*
 * Writes the substitution character (cnv->subChars, cnv->subCharLen) for an
 * ISO-2022 converter. Before writing, the fromUnicode state is switched as
 * required; the handling is selected by myConverterData->locale[0].
 * One path temporarily installs the substitution string into
 * currentConverter and lets ucnv_cbFromUWriteSub() do the write; the final
 * path writes the bytes collected in a local buffer with
 * ucnv_cbFromUWriteBytes(). Errors are reported through *err.
 */
_ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) {
3003
UConverter *cnv = args->converter;
3004
UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
3005
ISO2022State *pFromU2022State=&myConverterData->fromU2022State;
3010
subchar=(char *)cnv->subChars;
3011
length=cnv->subCharLen; /* assume length==1 for most variants */
3014
switch(myConverterData->locale[0]){
3019
if(pFromU2022State->g == 1) {
3020
/* JIS7: switch from G1 to G0 */
3021
pFromU2022State->g = 0;
3025
cs = pFromU2022State->cs[0];
3026
if(cs != ASCII && cs != JISX201) {
3027
/* not in ASCII or JIS X 0201: switch to ASCII */
3028
pFromU2022State->cs[0] = (int8_t)ASCII;
3038
if(pFromU2022State->g != 0) {
3039
/* not in ASCII mode: switch to ASCII */
3040
pFromU2022State->g = 0;
3046
if(myConverterData->version == 0) {
3048
/* fromUnicodeStatus is used as a flag here: non-zero means DBCS mode */
if((UBool)args->converter->fromUnicodeStatus) {
3049
/* in DBCS mode: switch to SBCS */
3050
args->converter->fromUnicodeStatus = 0;
3054
} else /* length == 2*/ {
3055
if(!(UBool)args->converter->fromUnicodeStatus) {
3056
/* in SBCS mode: switch to DBCS */
3057
args->converter->fromUnicodeStatus = 1;
3065
/* save the subconverter's substitution string */
3066
uint8_t *currentSubChars = myConverterData->currentConverter->subChars;
3067
int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen;
3069
/* set our substitution string into the subconverter */
3070
myConverterData->currentConverter->subChars = (uint8_t *)subchar;
3071
myConverterData->currentConverter->subCharLen = (int8_t)length;
3073
/* let the subconverter write the subchar, set/retrieve fromUChar32 state */
3074
args->converter = myConverterData->currentConverter;
3075
myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32;
3076
ucnv_cbFromUWriteSub(args, 0, err);
3077
cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
3078
args->converter = cnv;
3080
/* restore the subconverter's substitution string */
3081
myConverterData->currentConverter->subChars = currentSubChars;
3082
myConverterData->currentConverter->subCharLen = currentSubCharLen;
3084
/* bytes that did not fit were queued in the subconverter; move them to the public converter */
if(*err == U_BUFFER_OVERFLOW_ERROR) {
3085
if(myConverterData->currentConverter->charErrorBufferLength > 0) {
3087
cnv->charErrorBuffer,
3088
myConverterData->currentConverter->charErrorBuffer,
3089
myConverterData->currentConverter->charErrorBufferLength);
3091
cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
3092
myConverterData->currentConverter->charErrorBufferLength = 0;
3100
ucnv_cbFromUWriteBytes(args,
3101
buffer, (int32_t)(p - buffer),
3106
* Structure for cloning an ISO 2022 converter into a single memory block.
3107
* ucnv_safeClone() of the converter will align the entire cloneStruct,
3108
* and then ucnv_safeClone() of the sub-converter may additionally align
3109
* currentConverter inside the cloneStruct, for which we need the deadSpace
3110
* after currentConverter.
3111
* This is because UAlignedMemory may be larger than the actually
3112
* necessary alignment size for the platform.
3113
* The other cloneStruct fields will not be moved around,
3114
* and are aligned properly with cloneStruct's alignment.
3119
UConverter currentConverter; /* storage passed to ucnv_safeClone() for the sub-converter clone */
3120
UAlignedMemory deadSpace; /* slack after currentConverter in case that clone gets re-aligned */
3121
UConverterDataISO2022 mydata; /* receives a copy of the source converter's extraInfo */
3126
/*
 * Clone hook for the ISO-2022 converters.
 * If *pBufferSize is 0 this is a preflight request and the needed size,
 * sizeof(struct cloneStruct), is stored into *pBufferSize.
 * Otherwise stackBuffer is treated as a struct cloneStruct: the extra data
 * is copied into it, currentConverter (if any) is cloned into the embedded
 * storage, and every non-NULL entry of myConverterArray[] gets its reference
 * count incremented because the clone shares those entries.
 * Returns a pointer to the cnv member of the clone.
 */
_ISO_2022_SafeClone(
3127
const UConverter *cnv,
3129
int32_t *pBufferSize,
3132
struct cloneStruct * localClone;
3133
UConverterDataISO2022 *cnvData;
3136
if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */
3137
*pBufferSize = (int32_t)sizeof(struct cloneStruct);
3141
cnvData = (UConverterDataISO2022 *)cnv->extraInfo;
3142
localClone = (struct cloneStruct *)stackBuffer;
3144
/* ucnv.c/ucnv_safeClone() copied the main UConverter already */
3146
uprv_memcpy(&localClone->mydata, cnvData, sizeof(UConverterDataISO2022));
3147
localClone->cnv.extraInfo = &localClone->mydata; /* set pointer to extra data */
3148
localClone->cnv.isExtraLocal = TRUE;
3150
/* share the subconverters */
3152
if(cnvData->currentConverter != NULL) {
3153
size = (int32_t)(sizeof(UConverter) + sizeof(UAlignedMemory)); /* include size of padding */
3154
localClone->mydata.currentConverter =
3155
ucnv_safeClone(cnvData->currentConverter,
3156
&localClone->currentConverter,
3158
if(U_FAILURE(*status)) {
3163
/* the memcpy above duplicated the myConverterArray[] pointers, so add one reference per shared entry */
for(i=0; i<UCNV_2022_MAX_CONVERTERS; ++i) {
3164
if(cnvData->myConverterArray[i] != NULL) {
3165
ucnv_incrementRefCount(cnvData->myConverterArray[i]);
3169
return &localClone->cnv;
3173
/*
 * Adds to the set behind sa the code points this ISO-2022 converter handles.
 * First adds the ranges that are handled without a table (chosen by
 * cnvData->locale[0] and, for JP, by jpCharsetMasks[version]), then adds the
 * contents of every non-NULL entry of myConverterArray[], and finally removes
 * SO, SI and ESC. Nothing further is done if *pErrorCode already holds a
 * failure on entry.
 */
_ISO_2022_GetUnicodeSet(const UConverter *cnv,
3174
const USetAdder *sa,
3175
UConverterUnicodeSet which,
3176
UErrorCode *pErrorCode)
3179
UConverterDataISO2022* cnvData;
3181
if (U_FAILURE(*pErrorCode)) {
3184
#ifdef U_ENABLE_GENERIC_ISO_2022
3185
if (cnv->sharedData == &_ISO2022Data) {
3186
/* We use UTF-8 in this case */
/* the two ranges together cover U+0000..U+10FFFF except U+D800..U+DFFF */
3187
sa->addRange(sa->set, 0, 0xd7FF);
3188
sa->addRange(sa->set, 0xE000, 0x10FFFF);
3193
cnvData = (UConverterDataISO2022*)cnv->extraInfo;
3195
/* open a set and initialize it with code points that are algorithmically round-tripped */
3196
switch(cnvData->locale[0]){
3198
if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) {
3199
/* include Latin-1 for some variants of JP */
3200
sa->addRange(sa->set, 0, 0xff);
3202
/* include ASCII for JP */
3203
sa->addRange(sa->set, 0, 0x7f);
3205
if(jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT)) {
3206
/* include half-width Katakana for JP */
3207
sa->addRange(sa->set, HWKANA_START, HWKANA_END);
3212
/* include ASCII for CN */
3213
sa->addRange(sa->set, 0, 0x7f);
3216
/* there is only one converter for KR, and it is not in the myConverterArray[] */
3217
cnvData->currentConverter->sharedData->impl->getUnicodeSet(
3218
cnvData->currentConverter, sa, which, pErrorCode);
3219
/* the loop over myConverterArray[] will simply not find another converter */
3226
* Version-specific for CN:
3227
* CN version 0 does not map CNS planes 3..7 although
3228
* they are all available in the CNS conversion table;
3229
* CN version 1 does map them all.
3230
* The two versions create different Unicode sets.
3232
for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
3233
if(cnvData->myConverterArray[i]!=NULL) {
3234
if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
3235
cnvData->version==0 && i==CNS_11643
3237
/* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */
3238
ucnv_MBCSGetUnicodeSetForBytes(
3239
cnvData->myConverterArray[i],
3240
sa, UCNV_ROUNDTRIP_SET,
3244
ucnv_MBCSGetUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, pErrorCode);
3250
* ISO 2022 converters must not convert SO/SI/ESC despite what
3251
* sub-converters do by themselves.
3252
* Remove these characters from the set.
3254
sa->remove(sa->set, 0x0e); /* SO */
3255
sa->remove(sa->set, 0x0f); /* SI */
3256
sa->remove(sa->set, 0x1b); /* ESC */
3259
/*
 * Function table, static data and shared data for the generic ISO-2022
 * converter. With U_ENABLE_GENERIC_ISO_2022 defined, the table lists the
 * generic toUnicode routine twice and the UTF-8 fromUnicode routines.
 * NOTE(review): the doubled entry presumably fills the plain and the
 * with-offsets slot of UConverterImpl; confirm against its declaration.
 */
static const UConverterImpl _ISO2022Impl={
3269
#ifdef U_ENABLE_GENERIC_ISO_2022
3270
T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
3271
T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
3272
ucnv_fromUnicode_UTF8,
3273
ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
3285
_ISO_2022_SafeClone,
3286
_ISO_2022_GetUnicodeSet,
3288
static const UConverterStaticData _ISO2022StaticData={
3289
sizeof(UConverterStaticData),
3295
3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
3302
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3304
/* not static: _ISO_2022_GetUnicodeSet() compares cnv->sharedData against this object */
const UConverterSharedData _ISO2022Data={
3305
sizeof(UConverterSharedData),
3309
&_ISO2022StaticData,
3315
/*************JP****************/
3316
/*
 * Function table, static data and shared data for the ISO-2022-JP converter.
 * The JP toUnicode and fromUnicode routines are each listed twice in a row.
 * NOTE(review): presumably the plain and the with-offsets slots of
 * UConverterImpl; confirm against its declaration.
 * Cloning and Unicode-set enumeration use the routines shared by all
 * ISO-2022 variants in this file.
 */
static const UConverterImpl _ISO2022JPImpl={
3326
UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3327
UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3328
UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3329
UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
3335
_ISO_2022_SafeClone,
3336
_ISO_2022_GetUnicodeSet
3338
static const UConverterStaticData _ISO2022JPStaticData={
3339
sizeof(UConverterStaticData),
3345
6, /* max 6 bytes per UChar: 4-byte escape sequence + DBCS */
3352
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3354
static const UConverterSharedData _ISO2022JPData={
3355
sizeof(UConverterSharedData),
3359
&_ISO2022JPStaticData,
3365
/************* KR ***************/
3366
/*
 * Function table, static data and shared data for the ISO-2022-KR converter.
 * The KR toUnicode and fromUnicode routines are each listed twice in a row.
 * NOTE(review): presumably the plain and the with-offsets slots of
 * UConverterImpl; confirm against its declaration.
 * Cloning and Unicode-set enumeration use the routines shared by all
 * ISO-2022 variants in this file.
 */
static const UConverterImpl _ISO2022KRImpl={
3376
UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3377
UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3378
UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3379
UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
3385
_ISO_2022_SafeClone,
3386
_ISO_2022_GetUnicodeSet
3388
static const UConverterStaticData _ISO2022KRStaticData={
3389
sizeof(UConverterStaticData),
3395
3, /* max 3 bytes per UChar: SO+DBCS */
3402
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3404
static const UConverterSharedData _ISO2022KRData={
3405
sizeof(UConverterSharedData),
3409
&_ISO2022KRStaticData,
3415
/*************** CN ***************/
3416
/*
 * Function table, static data and shared data for the ISO-2022-CN converter.
 * The CN toUnicode and fromUnicode routines are each listed twice in a row.
 * NOTE(review): presumably the plain and the with-offsets slots of
 * UConverterImpl; confirm against its declaration.
 * Cloning and Unicode-set enumeration use the routines shared by all
 * ISO-2022 variants in this file.
 */
static const UConverterImpl _ISO2022CNImpl={
3427
UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3428
UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3429
UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3430
UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
3436
_ISO_2022_SafeClone,
3437
_ISO_2022_GetUnicodeSet
3439
static const UConverterStaticData _ISO2022CNStaticData={
3440
sizeof(UConverterStaticData),
3446
8, /* max 8 bytes per UChar: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */
3453
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
3455
static const UConverterSharedData _ISO2022CNData={
3456
sizeof(UConverterSharedData),
3460
&_ISO2022CNStaticData,
3468
#endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */