1109
1110
/* MBCS with three byte codepage data test4.ucm*/
1110
const UChar unicodeInput[] = { 0x20ac, 0x0005, 0x0006, 0x000b, 0xdbc4, 0xde34, 0xd84d, 0xdc56, 0x000e};
1111
const uint8_t expectedtest4[] = { 0x00, 0x05, 0xff, 0x01, 0x02, 0x03, 0x0b, 0x07, 0x01, 0x02, 0x03, 0x0a, 0xff,};
1112
int32_t totest4Offs[] = { 0, 1, 2, 3, 3, 3, 3, 4, 6, 6, 6, 6, 8,};
1111
static const UChar unicodeInput[] = { 0x20ac, 0x0005, 0x0006, 0x000b, 0xdbc4, 0xde34, 0xd84d, 0xdc56, 0x000e};
1112
static const uint8_t expectedtest4[] = { 0x00, 0x05, 0xff, 0x01, 0x02, 0x03, 0x0b, 0x07, 0x01, 0x02, 0x03, 0x0a, 0xff,};
1113
static const int32_t totest4Offs[] = { 0, 1, 2, 3, 3, 3, 3, 4, 6, 6, 6, 6, 8,};
1114
const uint8_t test4input[] = { 0x00, 0x05, 0x06, 0x01, 0x02, 0x03, 0x0b, 0x07, 0x01, 0x02, 0x03, 0x0a, 0x01, 0x02, 0x03, 0x0c,};
1115
const UChar expectedUnicode[] = { 0x20ac, 0x0005, 0x0006, 0x000b, 0xdbc4, 0xde34, 0xd84d, 0xdc56, 0xfffd};
1116
int32_t fromtest4Offs[] = { 0, 1, 2, 3, 7, 7, 8, 8, 12,};
1115
static const uint8_t test4input[] = { 0x00, 0x05, 0x06, 0x01, 0x02, 0x03, 0x0b, 0x07, 0x01, 0x02, 0x03, 0x0a, 0x01, 0x02, 0x03, 0x0c,};
1116
static const UChar expectedUnicode[] = { 0x20ac, 0x0005, 0x0006, 0x000b, 0xdbc4, 0xde34, 0xd84d, 0xdc56, 0xfffd};
1117
static const int32_t fromtest4Offs[] = { 0, 1, 2, 3, 7, 7, 8, 8, 12,};
1118
1119
/*from Unicode*/
1119
1120
if(!testConvertFromU(unicodeInput, sizeof(unicodeInput)/sizeof(unicodeInput[0]),
1170
1192
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE};*/
1173
log_verbose("Testing KSC, ibm-930, ibm-878 for starters and their conversion types.");
1195
log_verbose("Testing KSC, ibm-930, ibm-878 for starters and their conversion types.");
1175
myConverter[0] = ucnv_open("ksc", &err);
1197
myConverter = ucnv_open("ksc", &err);
1176
1198
if (U_FAILURE(err)) {
1177
1199
log_err("Failed to create an ibm-ksc converter\n");
1182
if (ucnv_getType(myConverter[0])!=UCNV_MBCS)
1204
if (ucnv_getType(myConverter)!=UCNV_MBCS)
1183
1205
log_err("ucnv_getType Failed for ibm-949\n");
1185
1207
log_verbose("ucnv_getType ibm-949 ok\n");
1187
if(myConverter[0]!=NULL)
1188
ucnv_getStarters(myConverter[0], mystarters, &err);
1209
if(myConverter!=NULL)
1210
ucnv_getStarters(myConverter, mystarters, &err);
1190
1212
/*if (memcmp(expectedKSCstarters, mystarters, sizeof(expectedKSCstarters)))
1191
1213
log_err("Failed ucnv_getStarters for ksc\n");
1193
1215
log_verbose("ucnv_getStarters ok\n");*/
1197
myConverter[1] = ucnv_open("ibm-930", &err);
1198
if (U_FAILURE(err)) {
1199
log_err("Failed to create an ibm-930 converter\n");
1204
if (ucnv_getType(myConverter[1])!=UCNV_EBCDIC_STATEFUL)
1205
log_err("ucnv_getType Failed for ibm-930\n");
1207
log_verbose("ucnv_getType ibm-930 ok\n");
1210
myConverter[2] = ucnv_open("ibm-878", &err);
1211
if (U_FAILURE(err)) {
1212
log_err("Failed to create an ibm-815 converter\n");
1217
if (ucnv_getType(myConverter[2])!=UCNV_SBCS) log_err("ucnv_getType Failed for ibm-815\n");
1218
else log_verbose("ucnv_getType ibm-815 ok\n");
1222
ucnv_close(myConverter[0]);
1223
ucnv_close(myConverter[1]);
1224
ucnv_close(myConverter[2]);
1218
ucnv_close(myConverter);
1220
TestConverterType("ibm-930", UCNV_EBCDIC_STATEFUL);
1221
TestConverterType("ibm-878", UCNV_SBCS);
1222
TestConverterType("iso-8859-1", UCNV_LATIN_1);
1223
TestConverterType("ibm-1208", UCNV_UTF8);
1224
TestConverterType("utf-8", UCNV_UTF8);
1225
TestConverterType("UTF-16BE", UCNV_UTF16_BigEndian);
1226
TestConverterType("UTF-16LE", UCNV_UTF16_LittleEndian);
1227
TestConverterType("UTF-32BE", UCNV_UTF32_BigEndian);
1228
TestConverterType("UTF-32LE", UCNV_UTF32_LittleEndian);
1229
TestConverterType("iso-2022", UCNV_ISO_2022);
1230
TestConverterType("hz", UCNV_HZ);
1231
TestConverterType("scsu", UCNV_SCSU);
1232
TestConverterType("x-iscii-de", UCNV_ISCII);
1233
TestConverterType("ascii", UCNV_US_ASCII);
1234
TestConverterType("utf-7", UCNV_UTF7);
2577
2591
0x095F,0x0930,0x0931,0x0932,0x0933,0x0934,0x0935,0x0936,0x0937,
2578
2592
0x0938,0x0939,0x200D,0x093E,0x093F,0x0940,0x0941,0x0942,0x0943,
2579
2593
0x0946,0x0947,0x0948,0x0945,0x094A,0x094B,0x094C,0x0949,0x094D,
2580
0x0964,0x093C,0x0966,0x0967,0x0968,0x0969,0x096A,0x096B,0x096C,
2594
0x093d,0x0966,0x0967,0x0968,0x0969,0x096A,0x096B,0x096C,
2581
2595
0x096D,0x096E,0x096F,
2582
2596
/* test Soft halant*/
2583
2597
0x0915,0x094d, 0x200D,
2643
2677
/* kannada range */
2644
2678
0xEF, 0x48,0xa4, 0xa2, 0xa3,
2645
2679
/* anudatta and abbreviation sign */
2646
0xEF, 0x42, 0xF0, 0xBF, 0xF0, 0xB8
2680
0xEF, 0x42, 0xF0, 0xBF, 0xF0, 0xB8,
2683
0xAA, 0xE9,/* RI + NUKTA 0x0960*/
2685
0xDF, 0xE9,/* Vowel sign RI + NUKTA 0x0944*/
2687
0xa6, 0xE9,/* Vowel I + NUKTA 0x090C*/
2689
0xdb, 0xE9,/* Vowel sign I + Nukta 0x0962*/
2691
0xa7, 0xE9,/* Vowel II + NUKTA 0x0961*/
2693
0xdc, 0xE9,/* Vowel sign II + Nukta 0x0963*/
2695
0xa1, 0xE9,/* chandrabindu + Nukta 0x0950*/
2697
0xEA, 0xE9, /* Danda + Nukta 0x093D*/
2699
0xB3, 0xE9, /* Ka + NUKTA */
2701
0xB4, 0xE9, /* Kha + NUKTA */
2703
0xB5, 0xE9, /* Ga + NUKTA */
2649
2715
TestConv(in,(sizeof(in)/2),"ISCII,version=0","hindi", (char *)byteArr,sizeof(byteArr));
2752
uBuf = (UChar*)malloc(uBufSize * sizeof(UChar)*5);
2753
cBuf =(char*)malloc(uBufSize * sizeof(char) * 5);
2818
uBuf = (UChar*)malloc(uBufSize * sizeof(UChar));
2819
cBuf =(char*)malloc(uBufSize * sizeof(char));
2754
2820
uSource = (const UChar*)&in[0];
2755
2821
uSourceLimit=uSource+len;
2756
2822
cTarget = cBuf;
2757
cTargetLimit = cBuf +uBufSize*5;
2823
cTargetLimit = cBuf +uBufSize;
2758
2824
uTarget = uBuf;
2759
uTargetLimit = uBuf+ uBufSize*5;
2825
uTargetLimit = uBuf+ uBufSize;
2760
2826
ucnv_fromUnicode( cnv , &cTarget, cTargetLimit,&uSource,uSourceLimit,myOff,TRUE, &errorCode);
2761
2827
if(U_FAILURE(errorCode)){
2762
2828
log_err("ucnv_fromUnicode conversion failed reason %s\n", u_errorName(errorCode));
2765
log_verbose("length of compressed string for language %s using %s:%i \n",conv,lang,(cTarget-cBuf));
2831
/*log_verbose("length of compressed string for language %s using %s:%i \n",conv,lang,(cTarget-cBuf));*/
2766
2832
cSource = cBuf;
2767
2833
cSourceLimit =cTarget;
2884
/*
 * Character-access callback passed to u_unescapeAt() by unescape().
 * Reinterprets `context` as a char buffer and returns the element at
 * index `offset`. No bounds checking is performed here.
 */
_charAt(int32_t offset, void *context) {
2885
return ((char*)context)[offset];
2889
/*
 * Copies the char string `src` into the UTF-16 buffer `dst`, expanding
 * backslash escapes through u_unescapeAt().
 *
 *   dst, dstLen - output buffer and its capacity in UChars
 *   src, srcLen - input chars and their count
 *   status      - in/out error code; set to U_ILLEGAL_ARGUMENT_ERROR,
 *                 U_INVALID_CHAR_FOUND or U_BUFFER_OVERFLOW_ERROR below
 *
 * NOTE(review): callers assign the result to a length variable, so this
 * presumably returns the number of UChars required - confirm against the
 * return statement.
 */
unescape(UChar* dst, int32_t dstLen,const char* src,int32_t srcLen,UErrorCode *status){
2892
/* Check whether the caller already has a failure pending. */
if(U_FAILURE(*status)){
2895
/* Argument validation: a NULL dst is only tolerated with dstLen <= 0, src must be non-NULL, lengths may not be below -1. */
if((dst==NULL && dstLen>0) || (src==NULL ) || dstLen < -1 || srcLen <-1 ){
2896
*status = U_ILLEGAL_ARGUMENT_ERROR;
2900
/* Measure src as a NUL-terminated string. NOTE(review): presumably only done when srcLen was passed as -1 - confirm. */
srcLen = uprv_strlen(src);
2903
/* Consume src one char at a time; srcIndex is advanced inside the body. */
for (; srcIndex<srcLen; ) {
2904
UChar32 c = src[srcIndex++];
2905
if (c == 0x005C /*'\\'*/) {
2906
c = u_unescapeAt(_charAt,&srcIndex,srcLen,(void*)src); /* advances srcIndex past the escape sequence */
2907
/* (UChar32)0xFFFFFFFF is the value compared against to detect a failed unescape. */
if (c == (UChar32)0xFFFFFFFF) {
2908
*status=U_INVALID_CHAR_FOUND; /* return empty string */
2909
break; /* invalid escape sequence */
2912
/* Store only while there is room left in dst. */
if(dstIndex < dstLen){
2914
/* Two-unit case: lead surrogate first, then the trail surrogate if it still fits. */
dst[dstIndex++] = UTF16_LEAD(c);
2915
if(dstIndex<dstLen){
2916
dst[dstIndex]=UTF16_TRAIL(c);
2918
*status=U_BUFFER_OVERFLOW_ERROR;
2921
/* Single-unit case: the code point is stored as one UChar. */
dst[dstIndex]=(UChar)c;
2925
*status = U_BUFFER_OVERFLOW_ERROR;
2927
dstIndex++; /* for preflighting */
2933
/*
 * Exhaustive round-trip driver for the converter named by `cp`.
 * Loops i up to 0x10FFFF, builds short UTF-16 strings from each value
 * and hands each one to TestConv() with no expected byte sequence
 * (NULL, 0).
 */
TestFullRoundtrip(const char* cp){
2934
/* Scratch buffers, zero-initialised so u_strlen() sees a terminator. */
UChar usource[10] ={0};
2935
UChar nsrc[10] = {0};
2939
/* Test codepoint 0 */
2940
TestConv(usource,1,cp,"",NULL,0);
2941
TestConv(usource,2,cp,"",NULL,0);
2943
TestConv(nsrc,3,cp,"",NULL,0);
2945
/* Walk i up to and including U+10FFFF. */
for(;i<=0x10FFFF;i++){
2946
/* Surrogate code points U+D800..U+DFFF get special treatment (the branch body is on lines not shown here). */
if(i>=0xD800 && i<=0xDFFF){
2950
/* Stored as a single UTF-16 code unit. */
usource[0] =(UChar) i;
2953
/* Stored as a lead/trail surrogate pair. */
usource[0]=UTF16_LEAD(i);
2954
usource[1]=UTF16_TRAIL(i);
2957
/* Test only single code points */
2958
TestConv(usource,u_strlen(usource),cp,"",NULL,0);
2959
/* Test codepoint repeated twice */
2960
/* NOTE(review): source and destination are the same buffer here - confirm u_strncat() tolerates overlapping arguments. */
u_strncat(usource,usource,len);
2961
TestConv(usource,u_strlen(usource),cp,"",NULL,0);
2962
/* Test codepoint repeated 3 times */
2963
u_strncat(usource,usource,len);
2964
TestConv(usource,u_strlen(usource),cp,"",NULL,0);
2965
/* Test codepoint in between 2 codepoints */
2967
u_strncat(nsrc,usource,len);
2969
TestConv(nsrc,len+2,cp,"",NULL,0);
2970
/* Clear the scratch buffer before the next iteration. */
uprv_memset(usource,0,sizeof(UChar)*10);
2975
/*
 * Runs the exhaustive TestFullRoundtrip() driver once per converter name
 * listed below, logging a verbose banner before each run.
 * The BOCU-1 run is kept commented out until that converter is available.
 */
TestRoundTrippingAllUTF(void){
2977
log_verbose("Running exhaustive round trip test for SCSU\n");
2978
TestFullRoundtrip("SCSU");
2979
log_verbose("Running exhaustive round trip test for UTF-8\n");
2980
TestFullRoundtrip("UTF-8");
2981
log_verbose("Running exhaustive round trip test for UTF-16BE\n");
2982
TestFullRoundtrip("UTF-16BE");
2983
log_verbose("Running exhaustive round trip test for UTF-16LE\n");
2984
TestFullRoundtrip("UTF-16LE");
2985
log_verbose("Running exhaustive round trip test for UTF-32BE\n");
2986
TestFullRoundtrip("UTF-32BE");
2987
log_verbose("Running exhaustive round trip test for UTF-32LE\n");
2988
TestFullRoundtrip("UTF-32LE");
2989
log_verbose("Running exhaustive round trip test for UTF-7\n");
2990
TestFullRoundtrip("UTF-7");
2991
/* Name the variant in the banner so this run can be told apart from the plain UTF-7 run above. */
log_verbose("Running exhaustive round trip test for UTF-7,version=1\n");
2992
TestFullRoundtrip("UTF-7,version=1");
2993
/*#### TODO: Enable this test when BOCU-1 is available */
2994
/*log_verbose("Running exhaustive round trip test for BOCU-1");*/
2995
/*TestFullRoundtrip("BOCU-1");*/
2996
log_verbose("Running exhaustive round trip test for GB18030\n");
2997
TestFullRoundtrip("GB18030");
2820
uint16_t germanUTF16[]={
3004
static const uint16_t germanUTF16[]={
2821
3005
0x00d6, 0x006c, 0x0020, 0x0066, 0x006c, 0x0069, 0x0065, 0x00df, 0x0074
2824
uint8_t germanSCSU[]={
3008
static const uint8_t germanSCSU[]={
2825
3009
0xd6, 0x6c, 0x20, 0x66, 0x6c, 0x69, 0x65, 0xdf, 0x74
2828
uint16_t russianUTF16[]={
3012
static const uint16_t russianUTF16[]={
2829
3013
0x041c, 0x043e, 0x0441, 0x043a, 0x0432, 0x0430
2832
uint8_t russianSCSU[]={
3016
static const uint8_t russianSCSU[]={
2833
3017
0x12, 0x9c, 0xbe, 0xc1, 0xba, 0xb2, 0xb0
2836
uint16_t japaneseUTF16[]={
3020
static const uint16_t japaneseUTF16[]={
2837
3021
0x3000, 0x266a, 0x30ea, 0x30f3, 0x30b4, 0x53ef, 0x611b,
2838
3022
0x3044, 0x3084, 0x53ef, 0x611b, 0x3044, 0x3084, 0x30ea, 0x30f3,
2839
3023
0x30b4, 0x3002, 0x534a, 0x4e16, 0x7d00, 0x3082, 0x524d, 0x306b,
2854
3038
/* SCSUEncoder produces a slightly longer result (179B vs. 178B) because of one different choice:
2855
3039
it uses an SQn once where a longer look-ahead could have shown that SCn is more efficient */
2856
uint8_t japaneseSCSU[]={
3040
static const uint8_t japaneseSCSU[]={
2857
3041
0x08, 0x00, 0x1b, 0x4c, 0xea, 0x16, 0xca, 0xd3, 0x94, 0x0f, 0x53, 0xef, 0x61, 0x1b, 0xe5, 0x84,
2858
3042
0xc4, 0x0f, 0x53, 0xef, 0x61, 0x1b, 0xe5, 0x84, 0xc4, 0x16, 0xca, 0xd3, 0x94, 0x08, 0x02, 0x0f,
2859
3043
0x53, 0x4a, 0x4e, 0x16, 0x7d, 0x00, 0x30, 0x82, 0x52, 0x4d, 0x30, 0x6b, 0x6d, 0x41, 0x88, 0x4c,
2913
3097
0xD869, 0xDEC1, 0xD869, 0xDEC2, 0xD869, 0xDEC3, 0xD869, 0xDEC4, 0xD869, 0xDEC8,
2914
3098
0xD869, 0xDECA, 0xD869, 0xDECB, 0xD869, 0xDECD, 0xD869, 0xDECE, 0xD869, 0xDECF,
2915
3099
0xD869, 0xDED0, 0xD869, 0xDED1, 0xD869, 0xDED2, 0xD869, 0xDED3, 0xD869, 0xDED4,
3100
0xD869, 0xDED5, 0xD800, 0xDC00, 0xD800, 0xDC00, 0xD800, 0xDC00, 0xDBFF, 0xDFFF,
3101
0xDBFF, 0xDFFF, 0xDBFF, 0xDFFF,
2918
3104
0x4DB3, 0x4DB4, 0x4DB5, 0x4E00, 0x4E00, 0x4E01, 0x4E02, 0x4E03, 0x000D, 0x000A,
2919
3105
0x0392, 0x0393, 0x0394, 0x0395, 0x0396, 0x0397, 0x33E0, 0x33E6, 0x000D, 0x000A,
2921
3107
0x4E0C, 0x0021, 0x0022, 0x0023, 0x0024, 0xFF40, 0xFF41, 0xFF42, 0x000D, 0x000A,
2922
3108
0xFF43, 0xFF44, 0xFF45, 0xFF46, 0xFF47, 0xFF48, 0xFF49, 0xFF4A, 0x000D, 0x000A,
3110
static const char *fTestCases [] = {
3111
"\\ud800\\udc00", /* smallest surrogate*/
3113
"\\udBff\\udFff", /* largest surrogate pair*/
3116
"Hello \\u9292 \\u9192 World!",
3117
"Hell\\u0429o \\u9292 \\u9192 W\\u00e4rld!",
3118
"Hell\\u0429o \\u9292 \\u9292W\\u00e4rld!",
3120
"\\u0648\\u06c8", /* catch missing reset*/
3123
"\\u4444\\uE001", /* lowest quotable*/
3124
"\\u4444\\uf2FF", /* highest quotable*/
3125
"\\u4444\\uf188\\u4444",
3126
"\\u4444\\uf188\\uf288",
3127
"\\u4444\\uf188abc\\u0429\\uf288",
3129
"Hell\\u0429\\u04230o \\u9292 \\u9292W\\u00e4\\u0192rld!",
3130
"Hell\\u0429o \\u9292 \\u9292W\\u00e4rld!",
3131
"Hello World!123456",
3132
"Hello W\\u0081\\u011f\\u0082!", /* Latin 1 run*/
3134
"abc\\u0301\\u0302", /* uses SQn for u301 u302*/
3135
"abc\\u4411d", /* uses SQU*/
3136
"abc\\u4411\\u4412d",/* uses SCU*/
3137
"abc\\u0401\\u0402\\u047f\\u00a5\\u0405", /* uses SQn for ua5*/
3138
"\\u9191\\u9191\\u3041\\u9191\\u3041\\u3041\\u3000", /* SJIS like data*/
3140
"\\u9191\\u9191\\u3041\\u9191\\u3041\\u3041\\u3000",
3141
"\\u9999\\u3051\\u300c\\u9999\\u9999\\u3060\\u9999\\u3065\\u3065\\u3065\\u300c",
3142
"\\u3000\\u266a\\u30ea\\u30f3\\u30b4\\u53ef\\u611b\\u3044\\u3084\\u53ef\\u611b\\u3044\\u3084\\u30ea\\u30f3\\u30b4\\u3002",
3144
"", /* empty input*/
3145
"\\u0000", /* smallest BMP character*/
3146
"\\uFFFF", /* largest BMP character*/
3148
/* regression tests*/
3149
"\\u6441\\ub413\\ua733\\uf8fe\\ueedb\\u587f\\u195f\\u4899\\uf23d\\u49fd\\u0aac\\u5792\\ufc22\\ufc3c\\ufc46\\u00aa",
3150
"\\u00df\\u01df\\uf000\\udbff\\udfff\\u000d\n\\u0041\\u00df\\u0401\\u015f\\u00df\\u01df\\uf000\\udbff\\udfff",
3151
"\\u30f9\\u8321\\u05e5\\u181c\\ud72b\\u2019\\u99c9\\u2f2f\\uc10c\\u82e1\\u2c4d\\u1ebc\\u6013\\u66dc\\ubbde\\u94a5\\u4726\\u74af\\u3083\\u55b9\\u000c",
3152
"\\u0041\\u00df\\u0401\\u015f",
3153
"\\u9066\\u2123abc",
3154
"\\ud266\\u43d7\\u\\ue386\\uc9c0\\u4a6b\\u9222\\u901f\\u7410\\ua63f\\u539b\\u9596\\u482e\\u9d47\\ucfe4\\u7b71\\uc280\\uf26a\\u982f\\u862a\\u4edd\\uf513\\ufda6\\u869d\\u2ee0\\ua216\\u3ff6\\u3c70\\u89c0\\u9576\\ud5ec\\ubfda\\u6cca\\u5bb3\\ubcea\\u554c\\u914e\\ufa4a\\uede3\\u2990\\ud2f5\\u2729\\u5141\\u0f26\\uccd8\\u5413\\ud196\\ubbe2\\u51b9\\u9b48\\u0dc8\\u2195\\u21a2\\u21e9\\u00e4\\u9d92\\u0bc0\\u06c5",
3155
"\\uf95b\\u2458\\u2468\\u0e20\\uf51b\\ue36e\\ubfc1\\u0080\\u02dd\\uf1b5\\u0cf3\\u6059\\u7489",
3158
/*
 * SCSU coverage loop: for every escaped string in fTestCases, unescape it
 * into a freshly allocated UChar buffer and round-trip it through
 * TestConv() with no expected byte sequence (NULL, 0).
 */
for(;i<sizeof(fTestCases)/sizeof(*fTestCases);i++){
3159
const char* cSrc = fTestCases[i];
3160
UErrorCode status = U_ZERO_ERROR;
3161
int32_t cSrcLen,srcLen;
3163
/* UConverter* cnv = ucnv_open("SCSU",&status); */
3164
/* Both lengths start as the char length of the escaped text. */
cSrcLen= srcLen = uprv_strlen(fTestCases[i]);
3165
/* Allocate srcLen UChars plus one extra UChar. */
src = (UChar*) uprv_malloc((sizeof(UChar) * srcLen) + sizeof(UChar));
3166
/* srcLen is overwritten with unescape()'s result. NOTE(review): `status` is not inspected on the lines visible here - confirm whether a failed unescape should skip the TestConv() call. */
srcLen=unescape(src,srcLen,cSrc,cSrcLen,&status);
3167
log_verbose("Testing roundtrip for src: %s at index :%d\n",cSrc,i);
3168
TestConv(src,srcLen,"SCSU","Coverage",NULL,0);
2924
3171
TestConv(allFeaturesUTF16,(sizeof(allFeaturesUTF16)/2),"SCSU","all features", (char *)allFeaturesSCSU,sizeof(allFeaturesSCSU));
2925
3172
TestConv(allFeaturesUTF16,(sizeof(allFeaturesUTF16)/2),"SCSU","all features",(char *)allFeaturesSCSU,sizeof(allFeaturesSCSU));
2926
3173
TestConv(japaneseUTF16,(sizeof(japaneseUTF16)/2),"SCSU","japaneese",(char *)japaneseSCSU,sizeof(japaneseSCSU));