/******************************************************************

Copyright 1993 by SunSoft, Inc.
Copyright 1999-2000 by Bruno Haible

Permission to use, copy, modify, distribute, and sell this software
and its documentation for any purpose is hereby granted without fee,
provided that the above copyright notice appear in all copies and
that both that copyright notice and this permission notice appear
in supporting documentation, and that the names of SunSoft, Inc. and
Bruno Haible not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission. SunSoft, Inc. and Bruno Haible make no representations
about the suitability of this software for any purpose. It is
provided "as is" without express or implied warranty.

SunSoft Inc. AND Bruno Haible DISCLAIM ALL WARRANTIES WITH REGARD
TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
AND FITNESS, IN NO EVENT SHALL SunSoft, Inc. OR Bruno Haible BE LIABLE
FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

******************************************************************/
/* $XFree86: xc/lib/X11/lcUTF8.c,v 1.15 2002/10/08 23:31:36 dawes Exp $ */
32
* I. Conversion routines CompoundText/CharSet <--> Unicode/UTF-8.
34
* Used for three purposes:
35
* 1. The UTF-8 locales, see below.
36
* 2. Unicode aware applications for which the use of 8-bit character
37
* sets is an anachronism.
38
* 3. For conversion from keysym to locale encoding.
40
* II. Conversion files for a UTF-8 locale loader.
41
* Supports: all locales with codeset UTF-8.
42
* How: Provides converters for UTF-8.
43
* Platforms: all systems.
45
* The loader itself is located in lcUTF8.c.
49
* The conversion from UTF-8 to CompoundText is realized in a very
50
* conservative way. Recall that CompoundText data is used for inter-client
51
* communication purposes. We distinguish three classes of clients:
52
* - Clients which accept only those pieces of CompoundText which belong to
53
* the character set understood by the current locale.
54
* (Example: clients which are linked to an older X11 library.)
55
* - Clients which accept CompoundText with multiple character sets and parse it themselves.
57
* (Example: emacs, xemacs.)
58
* - Clients which rely entirely on the X{mb,wc}TextPropertyToTextList
59
* functions for the conversion of CompoundText to their current locale's
60
* multi-byte/wide-character format.
61
* For best interoperation, the UTF-8 to CompoundText conversion proceeds as
62
* follows. For every character, it first tests whether the character is
63
* representable in the current locale's original (non-UTF-8) character set.
64
* If not, it goes through the list of predefined character sets for
65
* CompoundText and tests if the character is representable in that character
66
* set. If so, it encodes the character using its code within that character
67
* set. If not, it uses a UTF-8-in-CompoundText encapsulation. Since
68
* clients of the first and second kind ignore such encapsulated text,
69
* this encapsulation is kept to a minimum and terminated as early as possible.
71
* In the distant future, once clients of the first and second kind have
72
* disappeared, we will be able to stuff UTF-8 data directly in CompoundText
73
* without first going through the list of predefined character sets.
79
#include "XlcGeneric.h"
84
XlcConvMethods methods)
88
conv = (XlcConv) Xmalloc(sizeof(XlcConvRec));
89
if (conv == (XlcConv) NULL)
90
return (XlcConv) NULL;
92
conv->methods = methods;
102
Xfree((char *) conv);
105
/* Replacement character for invalid multibyte sequence or wide character. */
106
#define BAD_WCHAR ((ucs4_t) 0xfffd)	/* U+FFFD REPLACEMENT CHARACTER */
109
/***************************************************************************/
110
/* Part I: Conversion routines CompoundText/CharSet <--> Unicode/UTF-8.
112
* Note that this code works in any locale. We store Unicode values in
113
* `ucs4_t' variables, but don't pass them to the user.
115
* This code has to support all character sets that are used for CompoundText,
116
* nothing more, nothing less. See the table in lcCT.c.
117
* Since the conversion _to_ CompoundText is likely to need the tables for all
118
* character sets at once, we don't use dynamic loading (of tables or shared
119
* libraries through iconv()). Use a fixed set of tables instead.
121
* We use statically computed tables, not dynamically allocated arrays,
122
* because it's more memory efficient: Different processes using the same
123
* libX11 shared library share the "text" and read-only "data" sections.
126
/* Integer type that holds a single Unicode (UCS-4) value. */
typedef unsigned int ucs4_t;
127
/* Alias for XlcConv; presumably the name the lcUniConv/ headers included
   below expect for the converter argument -- TODO confirm. */
#define conv_t XlcConv
129
typedef struct _Utf8ConvRec {
132
#if NeedFunctionPrototypes
133
int (* cstowc) (XlcConv, ucs4_t *, unsigned char const *, int);
137
#if NeedFunctionPrototypes
138
int (* wctocs) (XlcConv, unsigned char *, ucs4_t, int);
142
} Utf8ConvRec, *Utf8Conv;
145
* int xxx_cstowc (XlcConv conv, ucs4_t *pwc, unsigned char const *s, int n)
146
* converts the byte sequence starting at s to a wide character. Up to n bytes
147
* are available at s. n is >= 1.
148
* Result is number of bytes consumed (if a wide character was read),
149
* or 0 if invalid, or -1 if n too small.
151
* int xxx_wctocs (XlcConv conv, unsigned char *r, ucs4_t wc, int n)
152
* converts the wide character wc to the character set xxx, and stores the
153
* result beginning at r. Up to n bytes may be written at r. n is >= 1.
154
* Result is number of bytes written, or 0 if invalid, or -1 if n too small.
157
/* Return code if invalid. (xxx_mbtowc, xxx_wctomb) */
159
/*
 * Return code if only a shift sequence of n bytes was read. (xxx_mbtowc)
 * Negative for every n >= 0; RET_TOOFEW(0) evaluates to -1.
 */
#define RET_TOOFEW(n)       (-1-(n))
/*
 * Return code if output buffer is too small. (xxx_wctomb, xxx_reset)
 * Parenthesized so the negative constant stays a single operand wherever
 * the macro is expanded.
 */
#define RET_TOOSMALL        (-1)
165
* The tables below are bijective. It would be possible to extend the
166
* xxx_wctocs tables to do some transliteration (e.g. U+201C,U+201D -> 0x22)
167
* but *only* with characters not contained in any other table, and *only*
168
* when the current locale is not a UTF-8 locale.
171
#include "lcUniConv/utf8.h"
172
#include "lcUniConv/ucs2be.h"
174
#include "lcUniConv/ascii.h"
176
#include "lcUniConv/iso8859_1.h"
177
#include "lcUniConv/iso8859_2.h"
178
#include "lcUniConv/iso8859_3.h"
179
#include "lcUniConv/iso8859_4.h"
180
#include "lcUniConv/iso8859_5.h"
181
#include "lcUniConv/iso8859_6.h"
182
#include "lcUniConv/iso8859_7.h"
183
#include "lcUniConv/iso8859_8.h"
184
#include "lcUniConv/iso8859_9.h"
185
#include "lcUniConv/iso8859_10.h"
186
#include "lcUniConv/iso8859_11.h"
187
#include "lcUniConv/iso8859_13.h"
188
#include "lcUniConv/iso8859_14.h"
189
#include "lcUniConv/iso8859_15.h"
190
#include "lcUniConv/iso8859_16.h"
191
#include "lcUniConv/iso8859_9e.h"
192
#include "lcUniConv/jisx0201.h"
193
#include "lcUniConv/tis620.h"
194
#include "lcUniConv/koi8_r.h"
195
#include "lcUniConv/koi8_u.h"
196
#include "lcUniConv/koi8_c.h"
197
#include "lcUniConv/armscii_8.h"
198
#include "lcUniConv/cp1133.h"
199
#include "lcUniConv/mulelao.h"
200
#include "lcUniConv/viscii.h"
201
#include "lcUniConv/tcvn.h"
202
#include "lcUniConv/georgian_academy.h"
203
#include "lcUniConv/georgian_ps.h"
204
#include "lcUniConv/cp1251.h"
205
#include "lcUniConv/cp1255.h"
206
#include "lcUniConv/cp1256.h"
207
#include "lcUniConv/tatar_cyr.h"
210
unsigned short indx; /* index into big table */
211
unsigned short used; /* bitmask of used entries */
214
#include "lcUniConv/gb2312.h"
215
#include "lcUniConv/jisx0208.h"
216
#include "lcUniConv/jisx0212.h"
217
#include "lcUniConv/ksc5601.h"
218
#include "lcUniConv/big5.h"
219
#include "lcUniConv/big5_emacs.h"
221
static Utf8ConvRec all_charsets[] = {
222
/* The ISO10646-1/UTF-8 entry occurs twice, once at the beginning
223
(for lookup speed), once at the end (as a fallback). */
224
{ "ISO10646-1", NULLQUARK,
225
utf8_mbtowc, utf8_wctomb
228
{ "ISO8859-1", NULLQUARK,
229
iso8859_1_mbtowc, iso8859_1_wctomb
231
{ "ISO8859-2", NULLQUARK,
232
iso8859_2_mbtowc, iso8859_2_wctomb
234
{ "ISO8859-3", NULLQUARK,
235
iso8859_3_mbtowc, iso8859_3_wctomb
237
{ "ISO8859-4", NULLQUARK,
238
iso8859_4_mbtowc, iso8859_4_wctomb
240
{ "ISO8859-5", NULLQUARK,
241
iso8859_5_mbtowc, iso8859_5_wctomb
243
{ "ISO8859-6", NULLQUARK,
244
iso8859_6_mbtowc, iso8859_6_wctomb
246
{ "ISO8859-7", NULLQUARK,
247
iso8859_7_mbtowc, iso8859_7_wctomb
249
{ "ISO8859-8", NULLQUARK,
250
iso8859_8_mbtowc, iso8859_8_wctomb
252
{ "ISO8859-9", NULLQUARK,
253
iso8859_9_mbtowc, iso8859_9_wctomb
255
{ "ISO8859-10", NULLQUARK,
256
iso8859_10_mbtowc, iso8859_10_wctomb
258
{ "ISO8859-11", NULLQUARK,
259
iso8859_11_mbtowc, iso8859_11_wctomb
261
{ "ISO8859-13", NULLQUARK,
262
iso8859_13_mbtowc, iso8859_13_wctomb
264
{ "ISO8859-14", NULLQUARK,
265
iso8859_14_mbtowc, iso8859_14_wctomb
267
{ "ISO8859-15", NULLQUARK,
268
iso8859_15_mbtowc, iso8859_15_wctomb
270
{ "ISO8859-16", NULLQUARK,
271
iso8859_16_mbtowc, iso8859_16_wctomb
273
{ "JISX0201.1976-0", NULLQUARK,
274
jisx0201_mbtowc, jisx0201_wctomb
276
{ "TIS620-0", NULLQUARK,
277
tis620_mbtowc, tis620_wctomb
279
{ "GB2312.1980-0", NULLQUARK,
280
gb2312_mbtowc, gb2312_wctomb
282
{ "JISX0208.1983-0", NULLQUARK,
283
jisx0208_mbtowc, jisx0208_wctomb
285
{ "JISX0208.1990-0", NULLQUARK,
286
jisx0208_mbtowc, jisx0208_wctomb
288
{ "JISX0212.1990-0", NULLQUARK,
289
jisx0212_mbtowc, jisx0212_wctomb
291
{ "KSC5601.1987-0", NULLQUARK,
292
ksc5601_mbtowc, ksc5601_wctomb
294
{ "KOI8-R", NULLQUARK,
295
koi8_r_mbtowc, koi8_r_wctomb
297
{ "KOI8-U", NULLQUARK,
298
koi8_u_mbtowc, koi8_u_wctomb
300
{ "KOI8-C", NULLQUARK,
301
koi8_c_mbtowc, koi8_c_wctomb
303
{ "TATAR-CYR", NULLQUARK,
304
tatar_cyr_mbtowc, tatar_cyr_wctomb
306
{ "ARMSCII-8", NULLQUARK,
307
armscii_8_mbtowc, armscii_8_wctomb
309
{ "IBM-CP1133", NULLQUARK,
310
cp1133_mbtowc, cp1133_wctomb
312
{ "MULELAO-1", NULLQUARK,
313
mulelao_mbtowc, mulelao_wctomb
315
{ "VISCII1.1-1", NULLQUARK,
316
viscii_mbtowc, viscii_wctomb
318
{ "TCVN-5712", NULLQUARK,
319
tcvn_mbtowc, tcvn_wctomb
321
{ "GEORGIAN-ACADEMY", NULLQUARK,
322
georgian_academy_mbtowc, georgian_academy_wctomb
324
{ "GEORGIAN-PS", NULLQUARK,
325
georgian_ps_mbtowc, georgian_ps_wctomb
327
{ "ISO8859-9E", NULLQUARK,
328
iso8859_9e_mbtowc, iso8859_9e_wctomb
330
{ "MICROSOFT-CP1251", NULLQUARK,
331
cp1251_mbtowc, cp1251_wctomb
333
{ "MICROSOFT-CP1255", NULLQUARK,
334
cp1255_mbtowc, cp1255_wctomb
336
{ "MICROSOFT-CP1256", NULLQUARK,
337
cp1256_mbtowc, cp1256_wctomb
339
{ "BIG5-0", NULLQUARK,
340
big5_mbtowc, big5_wctomb
342
{ "BIG5-E0", NULLQUARK,
343
big5_0_mbtowc, big5_0_wctomb
345
{ "BIG5-E1", NULLQUARK,
346
big5_1_mbtowc, big5_1_wctomb
349
/* The ISO10646-1/UTF-8 entry occurs twice, once at the beginning
350
(for lookup speed), once at the end (as a fallback). */
351
{ "ISO10646-1", NULLQUARK,
352
utf8_mbtowc, utf8_wctomb
355
/* Encoding ISO10646-1 for fonts means UCS2-like encoding
356
so for conversion to FontCharSet we need this record */
357
{ "ISO10646-1", NULLQUARK,
358
ucs2be_mbtowc, ucs2be_wctomb
362
/*
 * Sizes and indices derived from all_charsets[].
 *
 * charsets_table_size: number of entries in all_charsets[].
 * all_charsets_count:  that number minus one, i.e. not counting the
 *                      final entry of the table.
 * ucs2_conv_index:     index of the final entry of the table.
 */
#define charsets_table_size (sizeof(all_charsets)/sizeof(all_charsets[0]))
#define all_charsets_count  (charsets_table_size - 1)
#define ucs2_conv_index     (charsets_table_size - 1)
367
init_all_charsets (void)
372
for (convptr = all_charsets, i = charsets_table_size; i > 0; convptr++, i--)
373
convptr->xrm_name = XrmStringToQuark(convptr->name);
376
/*
 * Calls init_all_charsets() if the first entry of all_charsets[] still has
 * xrm_name == NULLQUARK; otherwise does nothing.
 *
 * Wrapped in do { ... } while (0) so the expansion is exactly one statement:
 * it can be followed by ';' and used as the body of an if/else without
 * capturing the caller's else branch.
 */
#define lazy_init_all_charsets() \
    do { \
	if (all_charsets[0].xrm_name == NULLQUARK) \
	    init_all_charsets(); \
    } while (0)
382
/* from XlcNCharSet to XlcNUtf8String */
398
unsigned char const *src;
399
unsigned char const *srcend;
401
unsigned char *dstend;
404
if (from == NULL || *from == NULL)
410
charset = (XlcCharSet) args[0];
411
name = charset->encoding_name;
412
/* not charset->name because the latter has a ":GL"/":GR" suffix */
414
for (convptr = all_charsets, i = all_charsets_count-1; i > 0; convptr++, i--)
415
if (!strcmp(convptr->name, name))
420
src = (unsigned char const *) *from;
421
srcend = src + *from_left;
422
dst = (unsigned char *) *to;
423
dstend = dst + *to_left;
426
while (src < srcend) {
431
consumed = convptr->cstowc(conv, &wc, src, srcend-src);
432
if (consumed == RET_ILSEQ)
434
if (consumed == RET_TOOFEW(0))
437
count = utf8_wctomb(NULL, dst, wc, dstend-dst);
438
if (count == RET_TOOSMALL)
440
if (count == RET_ILSEQ) {
441
count = utf8_wctomb(NULL, dst, BAD_WCHAR, dstend-dst);
442
if (count == RET_TOOSMALL)
450
*from = (XPointer) src;
451
*from_left = srcend - src;
452
*to = (XPointer) dst;
453
*to_left = dstend - dst;
458
static XlcConvMethodsRec methods_cstoutf8 = {
467
const char *from_type,
471
lazy_init_all_charsets();
472
return create_conv(from_lcd, &methods_cstoutf8);
475
/* from XlcNUtf8String to XlcNCharSet */
480
XlcConvMethods methods)
483
CodeSet *codeset_list;
489
lazy_init_all_charsets();
491
codeset_list = XLC_GENERIC(lcd, codeset_list);
492
codeset_num = XLC_GENERIC(lcd, codeset_num);
495
for (i = 0; i < codeset_num; i++)
496
charset_num += codeset_list[i]->num_charsets;
497
if (charset_num > all_charsets_count-1)
498
charset_num = all_charsets_count-1;
500
conv = (XlcConv) Xmalloc(sizeof(XlcConvRec)
501
+ (charset_num + 1) * sizeof(Utf8Conv));
502
if (conv == (XlcConv) NULL)
503
return (XlcConv) NULL;
504
preferred = (Utf8Conv *) ((char *) conv + sizeof(XlcConvRec));
506
/* Loop through all codesets mentioned in the locale. */
508
for (i = 0; i < codeset_num; i++) {
509
XlcCharSet *charsets = codeset_list[i]->charset_list;
510
int num_charsets = codeset_list[i]->num_charsets;
511
for (j = 0; j < num_charsets; j++) {
512
const char *name = charsets[j]->encoding_name;
513
/* If it wasn't already encountered... */
514
for (k = charset_num-1; k >= 0; k--)
515
if (!strcmp(preferred[k]->name, name))
518
/* Look it up in all_charsets[]. */
519
for (k = 0; k < all_charsets_count-1; k++)
520
if (!strcmp(all_charsets[k].name, name)) {
521
/* Add it to the preferred set. */
522
preferred[charset_num++] = &all_charsets[k];
528
preferred[charset_num] = (Utf8Conv) NULL;
530
conv->methods = methods;
531
conv->state = (XPointer) preferred;
537
close_tocs_converter(
540
/* conv->state is allocated together with conv, free both at once. */
541
Xfree((char *) conv);
545
* Converts a Unicode character to an appropriate character set. The NULL
546
* terminated array of preferred character sets is passed as first argument.
547
* If successful, *charsetp is set to the character set that was used, and
548
* *sidep is set to the character set side (XlcGL or XlcGR).
564
for (; *preferred != (Utf8Conv) NULL; preferred++) {
565
convptr = *preferred;
566
count = convptr->wctocs(conv, r, wc, n);
567
if (count == RET_TOOSMALL)
569
if (count != RET_ILSEQ) {
571
*sidep = (*r < 0x80 ? XlcGL : XlcGR);
575
for (convptr = all_charsets+1, i = all_charsets_count-1; i > 0; convptr++, i--) {
576
count = convptr->wctocs(conv, r, wc, n);
577
if (count == RET_TOOSMALL)
579
if (count != RET_ILSEQ) {
581
*sidep = (*r < 0x80 ? XlcGL : XlcGR);
598
Utf8Conv *preferred_charsets;
599
XlcCharSet last_charset = NULL;
600
unsigned char const *src;
601
unsigned char const *srcend;
603
unsigned char *dstend;
606
if (from == NULL || *from == NULL)
609
preferred_charsets = (Utf8Conv *) conv->state;
610
src = (unsigned char const *) *from;
611
srcend = src + *from_left;
612
dst = (unsigned char *) *to;
613
dstend = dst + *to_left;
616
while (src < srcend && dst < dstend) {
617
Utf8Conv chosen_charset = NULL;
618
XlcSide chosen_side = XlcNONE;
623
consumed = utf8_mbtowc(NULL, &wc, src, srcend-src);
624
if (consumed == RET_TOOFEW(0))
626
if (consumed == RET_ILSEQ) {
632
count = charset_wctocs(preferred_charsets, &chosen_charset, &chosen_side, conv, dst, wc, dstend-dst);
633
if (count == RET_TOOSMALL)
635
if (count == RET_ILSEQ) {
641
if (last_charset == NULL) {
643
_XlcGetCharSetWithSide(chosen_charset->name, chosen_side);
644
if (last_charset == NULL) {
650
if (!(last_charset->xrm_encoding_name == chosen_charset->xrm_name
651
&& (last_charset->side == XlcGLGR
652
|| last_charset->side == chosen_side)))
659
if (last_charset == NULL)
662
*from = (XPointer) src;
663
*from_left = srcend - src;
664
*to = (XPointer) dst;
665
*to_left = dstend - dst;
668
*((XlcCharSet *)args[0]) = last_charset;
673
static XlcConvMethodsRec methods_utf8tocs = {
674
close_tocs_converter,
682
const char *from_type,
686
return create_tocs_conv(from_lcd, &methods_utf8tocs);
689
/* from XlcNUtf8String to XlcNChar */
701
Utf8Conv *preferred_charsets;
702
XlcCharSet last_charset = NULL;
703
unsigned char const *src;
704
unsigned char const *srcend;
706
unsigned char *dstend;
709
if (from == NULL || *from == NULL)
712
preferred_charsets = (Utf8Conv *) conv->state;
713
src = (unsigned char const *) *from;
714
srcend = src + *from_left;
715
dst = (unsigned char *) *to;
716
dstend = dst + *to_left;
719
while (src < srcend && dst < dstend) {
720
Utf8Conv chosen_charset = NULL;
721
XlcSide chosen_side = XlcNONE;
726
consumed = utf8_mbtowc(NULL, &wc, src, srcend-src);
727
if (consumed == RET_TOOFEW(0))
729
if (consumed == RET_ILSEQ) {
735
count = charset_wctocs(preferred_charsets, &chosen_charset, &chosen_side, conv, dst, wc, dstend-dst);
736
if (count == RET_TOOSMALL)
738
if (count == RET_ILSEQ) {
744
if (last_charset == NULL) {
746
_XlcGetCharSetWithSide(chosen_charset->name, chosen_side);
747
if (last_charset == NULL) {
753
if (!(last_charset->xrm_encoding_name == chosen_charset->xrm_name
754
&& (last_charset->side == XlcGLGR
755
|| last_charset->side == chosen_side)))
763
if (last_charset == NULL)
766
*from = (XPointer) src;
767
*from_left = srcend - src;
768
*to = (XPointer) dst;
769
*to_left = dstend - dst;
772
*((XlcCharSet *)args[0]) = last_charset;
777
static XlcConvMethodsRec methods_utf8tocs1 = {
778
close_tocs_converter,
786
const char *from_type,
790
return create_tocs_conv(from_lcd, &methods_utf8tocs1);
793
/* from XlcNUtf8String to XlcNString */
805
unsigned char const *src;
806
unsigned char const *srcend;
808
unsigned char *dstend;
811
if (from == NULL || *from == NULL)
814
src = (unsigned char const *) *from;
815
srcend = src + *from_left;
816
dst = (unsigned char *) *to;
817
dstend = dst + *to_left;
820
while (src < srcend) {
825
consumed = utf8_mbtowc(NULL, &wc, src, srcend-src);
826
if (consumed == RET_TOOFEW(0))
830
if (consumed == RET_ILSEQ) {
835
if ((wc & ~(ucs4_t)0xff) != 0) {
839
c = (unsigned char) wc;
845
*from = (XPointer) src;
846
*from_left = srcend - src;
847
*to = (XPointer) dst;
848
*to_left = dstend - dst;
853
static XlcConvMethodsRec methods_utf8tostr = {
862
const char *from_type,
866
return create_conv(from_lcd, &methods_utf8tostr);
869
/* from XlcNString to XlcNUtf8String */
881
unsigned char const *src;
882
unsigned char const *srcend;
884
unsigned char *dstend;
886
if (from == NULL || *from == NULL)
889
src = (unsigned char const *) *from;
890
srcend = src + *from_left;
891
dst = (unsigned char *) *to;
892
dstend = dst + *to_left;
894
while (src < srcend) {
895
int count = utf8_wctomb(NULL, dst, *src, dstend-dst);
896
if (count == RET_TOOSMALL)
902
*from = (XPointer) src;
903
*from_left = srcend - src;
904
*to = (XPointer) dst;
905
*to_left = dstend - dst;
910
static XlcConvMethodsRec methods_strtoutf8 = {
919
const char *from_type,
923
return create_conv(from_lcd, &methods_strtoutf8);
926
/* Support for the input methods. */
937
return (XPointer) NULL;
939
lazy_init_all_charsets();
940
xrm_name = XrmStringToQuark(name);
942
for (convptr = all_charsets, i = all_charsets_count-1; i > 0; convptr++, i--)
943
if (convptr->xrm_name == xrm_name)
944
return (XPointer) convptr->wctocs;
945
return (XPointer) NULL;
948
/* from XlcNUcsChar to XlcNChar, needed for input methods */
953
XlcConvMethods methods)
956
if (XLC_PUBLIC_PART(lcd)->codeset
957
&& _XlcCompareISOLatin1(XLC_PUBLIC_PART(lcd)->codeset, "UTF-8") == 0) {
961
lazy_init_all_charsets();
963
conv = (XlcConv) Xmalloc(sizeof(XlcConvRec) + 2 * sizeof(Utf8Conv));
964
if (conv == (XlcConv) NULL)
965
return (XlcConv) NULL;
966
preferred = (Utf8Conv *) ((char *) conv + sizeof(XlcConvRec));
968
preferred[0] = &all_charsets[0]; /* ISO10646 */
969
preferred[1] = (Utf8Conv) NULL;
971
conv->methods = methods;
972
conv->state = (XPointer) preferred;
976
return create_tocs_conv(lcd, methods);
981
charset_wctocs_exactly(
993
for (; *preferred != (Utf8Conv) NULL; preferred++) {
994
convptr = *preferred;
995
count = convptr->wctocs(conv, r, wc, n);
996
if (count == RET_TOOSMALL)
998
if (count != RET_ILSEQ) {
1000
*sidep = (*r < 0x80 ? XlcGL : XlcGR);
1017
ucs4_t const *src = (ucs4_t const *) *from;
1018
unsigned char *dst = (unsigned char *) *to;
1020
Utf8Conv *preferred_charsets = (Utf8Conv *) conv->state;
1021
Utf8Conv chosen_charset = NULL;
1022
XlcSide chosen_side = XlcNONE;
1023
XlcCharSet charset = NULL;
1026
if (from == NULL || *from == NULL)
1029
count = charset_wctocs_exactly(preferred_charsets, &chosen_charset,
1030
&chosen_side, conv, dst, *src, *to_left);
1035
charset = _XlcGetCharSetWithSide(chosen_charset->name, chosen_side);
1037
if (charset == NULL)
1040
*from = (XPointer) ++src;
1042
*to = (XPointer) dst;
1046
*((XlcCharSet *)args[0]) = charset;
1051
static XlcConvMethodsRec methods_ucstocs1 = {
1052
close_tocs_converter,
1060
const char *from_type,
1062
const char *to_type)
1064
return create_ucstocs_conv(from_lcd, &methods_ucstocs1);
1067
/* from XlcNUcsChar to XlcNUtf8String, needed for input methods */
1080
const ucs4_t *srcend;
1082
unsigned char *dstend;
1085
if (from == NULL || *from == NULL)
1088
src = (const ucs4_t *) *from;
1089
srcend = src + *from_left;
1090
dst = (unsigned char *) *to;
1091
dstend = dst + *to_left;
1094
while (src < srcend) {
1095
int count = utf8_wctomb(NULL, dst, *src, dstend-dst);
1096
if (count == RET_TOOSMALL)
1098
if (count == RET_ILSEQ)
1104
*from = (XPointer) src;
1105
*from_left = srcend - src;
1106
*to = (XPointer) dst;
1107
*to_left = dstend - dst;
1112
static XlcConvMethodsRec methods_ucstoutf8 = {
1121
const char *from_type,
1123
const char *to_type)
1125
return create_conv(from_lcd, &methods_ucstoutf8);
1128
/* Registers UTF-8 converters for a non-UTF-8 locale. */
1130
_XlcAddUtf8Converters(
1133
_XlcSetConverter(lcd, XlcNCharSet, lcd, XlcNUtf8String, open_cstoutf8);
1134
_XlcSetConverter(lcd, XlcNUtf8String, lcd, XlcNCharSet, open_utf8tocs);
1135
_XlcSetConverter(lcd, XlcNUtf8String, lcd, XlcNChar, open_utf8tocs1);
1136
_XlcSetConverter(lcd, XlcNString, lcd, XlcNUtf8String, open_strtoutf8);
1137
_XlcSetConverter(lcd, XlcNUtf8String, lcd, XlcNString, open_utf8tostr);
1138
_XlcSetConverter(lcd, XlcNUcsChar, lcd, XlcNChar, open_ucstocs1);
1139
_XlcSetConverter(lcd, XlcNUcsChar, lcd, XlcNUtf8String, open_ucstoutf8);
1142
/***************************************************************************/
1143
/* Part II: UTF-8 locale loader conversion files
1145
* Here we can assume that "multi-byte" is UTF-8 and that `wchar_t' is Unicode.
1148
/* from XlcNMultiByte to XlcNWideChar */
1160
unsigned char const *src;
1161
unsigned char const *srcend;
1166
if (from == NULL || *from == NULL)
1169
src = (unsigned char const *) *from;
1170
srcend = src + *from_left;
1171
dst = (wchar_t *) *to;
1172
dstend = dst + *to_left;
1175
while (src < srcend && dst < dstend) {
1177
int consumed = utf8_mbtowc(NULL, &wc, src, srcend-src);
1178
if (consumed == RET_TOOFEW(0))
1180
if (consumed == RET_ILSEQ) {
1191
*from = (XPointer) src;
1192
*from_left = srcend - src;
1193
*to = (XPointer) dst;
1194
*to_left = dstend - dst;
1199
static XlcConvMethodsRec methods_utf8towcs = {
1208
const char *from_type,
1210
const char *to_type)
1212
return create_conv(from_lcd, &methods_utf8towcs);
1215
/* from XlcNWideChar to XlcNMultiByte */
1228
wchar_t const *srcend;
1230
unsigned char *dstend;
1233
if (from == NULL || *from == NULL)
1236
src = (wchar_t const *) *from;
1237
srcend = src + *from_left;
1238
dst = (unsigned char *) *to;
1239
dstend = dst + *to_left;
1242
while (src < srcend) {
1243
int count = utf8_wctomb(NULL, dst, *src, dstend-dst);
1244
if (count == RET_TOOSMALL)
1246
if (count == RET_ILSEQ) {
1247
count = utf8_wctomb(NULL, dst, BAD_WCHAR, dstend-dst);
1248
if (count == RET_TOOSMALL)
1256
*from = (XPointer) src;
1257
*from_left = srcend - src;
1258
*to = (XPointer) dst;
1259
*to_left = dstend - dst;
1264
static XlcConvMethodsRec methods_wcstoutf8 = {
1273
const char *from_type,
1275
const char *to_type)
1277
return create_conv(from_lcd, &methods_wcstoutf8);
1280
/* from XlcNString to XlcNWideChar */
1292
unsigned char const *src;
1293
unsigned char const *srcend;
1297
if (from == NULL || *from == NULL)
1300
src = (unsigned char const *) *from;
1301
srcend = src + *from_left;
1302
dst = (wchar_t *) *to;
1303
dstend = dst + *to_left;
1305
while (src < srcend && dst < dstend)
1306
*dst++ = (wchar_t) *src++;
1308
*from = (XPointer) src;
1309
*from_left = srcend - src;
1310
*to = (XPointer) dst;
1311
*to_left = dstend - dst;
1316
static XlcConvMethodsRec methods_strtowcs = {
1325
const char *from_type,
1327
const char *to_type)
1329
return create_conv(from_lcd, &methods_strtowcs);
1332
/* from XlcNWideChar to XlcNString */
1345
wchar_t const *srcend;
1347
unsigned char *dstend;
1350
if (from == NULL || *from == NULL)
1353
src = (wchar_t const *) *from;
1354
srcend = src + *from_left;
1355
dst = (unsigned char *) *to;
1356
dstend = dst + *to_left;
1359
while (src < srcend && dst < dstend) {
1360
unsigned int wc = *src++;
1370
*from = (XPointer) src;
1371
*from_left = srcend - src;
1372
*to = (XPointer) dst;
1373
*to_left = dstend - dst;
1378
static XlcConvMethodsRec methods_wcstostr = {
1387
const char *from_type,
1389
const char *to_type)
1391
return create_conv(from_lcd, &methods_wcstostr);
1394
/* from XlcNCharSet to XlcNWideChar */
1410
unsigned char const *src;
1411
unsigned char const *srcend;
1416
if (from == NULL || *from == NULL)
1422
charset = (XlcCharSet) args[0];
1423
name = charset->encoding_name;
1424
/* not charset->name because the latter has a ":GL"/":GR" suffix */
1426
for (convptr = all_charsets, i = all_charsets_count-1; i > 0; convptr++, i--)
1427
if (!strcmp(convptr->name, name))
1432
src = (unsigned char const *) *from;
1433
srcend = src + *from_left;
1434
dst = (wchar_t *) *to;
1435
dstend = dst + *to_left;
1438
while (src < srcend && dst < dstend) {
1442
consumed = convptr->cstowc(conv, &wc, src, srcend-src);
1443
if (consumed == RET_ILSEQ)
1445
if (consumed == RET_TOOFEW(0))
1452
*from = (XPointer) src;
1453
*from_left = srcend - src;
1454
*to = (XPointer) dst;
1455
*to_left = dstend - dst;
1460
static XlcConvMethodsRec methods_cstowcs = {
1469
const char *from_type,
1471
const char *to_type)
1473
lazy_init_all_charsets();
1474
return create_conv(from_lcd, &methods_cstowcs);
1477
/* from XlcNWideChar to XlcNCharSet */
1489
Utf8Conv *preferred_charsets;
1490
XlcCharSet last_charset = NULL;
1492
wchar_t const *srcend;
1494
unsigned char *dstend;
1497
if (from == NULL || *from == NULL)
1500
preferred_charsets = (Utf8Conv *) conv->state;
1501
src = (wchar_t const *) *from;
1502
srcend = src + *from_left;
1503
dst = (unsigned char *) *to;
1504
dstend = dst + *to_left;
1507
while (src < srcend && dst < dstend) {
1508
Utf8Conv chosen_charset = NULL;
1509
XlcSide chosen_side = XlcNONE;
1513
count = charset_wctocs(preferred_charsets, &chosen_charset, &chosen_side, conv, dst, wc, dstend-dst);
1514
if (count == RET_TOOSMALL)
1516
if (count == RET_ILSEQ) {
1522
if (last_charset == NULL) {
1524
_XlcGetCharSetWithSide(chosen_charset->name, chosen_side);
1525
if (last_charset == NULL) {
1531
if (!(last_charset->xrm_encoding_name == chosen_charset->xrm_name
1532
&& (last_charset->side == XlcGLGR
1533
|| last_charset->side == chosen_side)))
1540
if (last_charset == NULL)
1543
*from = (XPointer) src;
1544
*from_left = srcend - src;
1545
*to = (XPointer) dst;
1546
*to_left = dstend - dst;
1549
*((XlcCharSet *)args[0]) = last_charset;
1554
static XlcConvMethodsRec methods_wcstocs = {
1555
close_tocs_converter,
1563
const char *from_type,
1565
const char *to_type)
1567
return create_tocs_conv(from_lcd, &methods_wcstocs);
1570
/* from XlcNWideChar to XlcNChar */
1582
Utf8Conv *preferred_charsets;
1583
XlcCharSet last_charset = NULL;
1585
wchar_t const *srcend;
1587
unsigned char *dstend;
1590
if (from == NULL || *from == NULL)
1593
preferred_charsets = (Utf8Conv *) conv->state;
1594
src = (wchar_t const *) *from;
1595
srcend = src + *from_left;
1596
dst = (unsigned char *) *to;
1597
dstend = dst + *to_left;
1600
while (src < srcend && dst < dstend) {
1601
Utf8Conv chosen_charset = NULL;
1602
XlcSide chosen_side = XlcNONE;
1606
count = charset_wctocs(preferred_charsets, &chosen_charset, &chosen_side, conv, dst, wc, dstend-dst);
1607
if (count == RET_TOOSMALL)
1609
if (count == RET_ILSEQ) {
1615
if (last_charset == NULL) {
1617
_XlcGetCharSetWithSide(chosen_charset->name, chosen_side);
1618
if (last_charset == NULL) {
1624
if (!(last_charset->xrm_encoding_name == chosen_charset->xrm_name
1625
&& (last_charset->side == XlcGLGR
1626
|| last_charset->side == chosen_side)))
1634
if (last_charset == NULL)
1637
*from = (XPointer) src;
1638
*from_left = srcend - src;
1639
*to = (XPointer) dst;
1640
*to_left = dstend - dst;
1643
*((XlcCharSet *)args[0]) = last_charset;
1648
static XlcConvMethodsRec methods_wcstocs1 = {
1649
close_tocs_converter,
1657
const char *from_type,
1659
const char *to_type)
1661
return create_tocs_conv(from_lcd, &methods_wcstocs1);
1664
/* trivial, no conversion */
1676
unsigned char const *src;
1677
unsigned char const *srcend;
1679
unsigned char *dstend;
1681
if (from == NULL || *from == NULL)
1684
src = (unsigned char const *) *from;
1685
srcend = src + *from_left;
1686
dst = (unsigned char *) *to;
1687
dstend = dst + *to_left;
1689
while (src < srcend && dst < dstend)
1692
*from = (XPointer) src;
1693
*from_left = srcend - src;
1694
*to = (XPointer) dst;
1695
*to_left = dstend - dst;
1700
static XlcConvMethodsRec methods_identity = {
1709
const char *from_type,
1711
const char *to_type)
1713
return create_conv(from_lcd, &methods_identity);
1716
/* from MultiByte/WideChar to FontCharSet. */
1717
/* They reuse the converters to CharSet,
 * but with a different create_conv procedure. */
1721
create_tofontcs_conv(
1723
XlcConvMethods methods)
1726
int i, num, k, count;
1727
char **value, buf[20];
1728
Utf8Conv *preferred;
1730
lazy_init_all_charsets();
1732
for (i = 0, num = 0;; i++) {
1733
sprintf(buf, "fs%d.charset.name", i);
1734
_XlcGetResource(lcd, "XLC_FONTSET", buf, &value, &count);
1736
sprintf(buf, "fs%d.charset", i);
1737
_XlcGetResource(lcd, "XLC_FONTSET", buf, &value, &count);
1744
conv = (XlcConv) Xmalloc(sizeof(XlcConvRec) + (num + 1) * sizeof(Utf8Conv));
1745
if (conv == (XlcConv) NULL)
1746
return (XlcConv) NULL;
1747
preferred = (Utf8Conv *) ((char *) conv + sizeof(XlcConvRec));
1749
/* Loop through all fontsets mentioned in the locale. */
1750
for (i = 0, num = 0;; i++) {
1751
sprintf(buf, "fs%d.charset.name", i);
1752
_XlcGetResource(lcd, "XLC_FONTSET", buf, &value, &count);
1754
sprintf(buf, "fs%d.charset", i);
1755
_XlcGetResource(lcd, "XLC_FONTSET", buf, &value, &count);
1759
while (count-- > 0) {
1760
XlcCharSet charset = _XlcGetCharSet(*value++);
1761
const char *name = charset->encoding_name;
1762
/* If it wasn't already encountered... */
1763
for (k = num - 1; k >= 0; k--)
1764
if (!strcmp(preferred[k]->name, name))
1767
/* For fonts, "ISO10646-1" means UCS-2, not UTF-8. */
1768
if (!strcmp("ISO10646-1", name)) {
1769
preferred[num++] = &all_charsets[ucs2_conv_index];
1772
/* Look it up in all_charsets[]. */
1773
for (k = 0; k < all_charsets_count-1; k++)
1774
if (!strcmp(all_charsets[k].name, name)) {
1775
/* Add it to the preferred set. */
1776
preferred[num++] = &all_charsets[k];
1782
preferred[num] = (Utf8Conv) NULL;
1784
conv->methods = methods;
1785
conv->state = (XPointer) preferred;
1793
const char *from_type,
1795
const char *to_type)
1797
return create_tofontcs_conv(from_lcd, &methods_wcstocs);
1803
const char *from_type,
1805
const char *to_type)
1807
return create_tofontcs_conv(from_lcd, &methods_utf8tocs);
1810
/* Registers UTF-8 converters for a UTF-8 locale. */
1813
_XlcAddUtf8LocaleConverters(
1816
/* Register elementary converters. */
1818
_XlcSetConverter(lcd, XlcNMultiByte, lcd, XlcNWideChar, open_utf8towcs);
1820
_XlcSetConverter(lcd, XlcNWideChar, lcd, XlcNMultiByte, open_wcstoutf8);
1821
_XlcSetConverter(lcd, XlcNWideChar, lcd, XlcNString, open_wcstostr);
1823
_XlcSetConverter(lcd, XlcNString, lcd, XlcNWideChar, open_strtowcs);
1825
/* Register converters for XlcNCharSet. This implicitly provides
1826
* converters from and to XlcNCompoundText. */
1828
_XlcSetConverter(lcd, XlcNCharSet, lcd, XlcNMultiByte, open_cstoutf8);
1829
_XlcSetConverter(lcd, XlcNMultiByte, lcd, XlcNCharSet, open_utf8tocs);
1830
_XlcSetConverter(lcd, XlcNMultiByte, lcd, XlcNChar, open_utf8tocs1);
1832
_XlcSetConverter(lcd, XlcNCharSet, lcd, XlcNWideChar, open_cstowcs);
1833
_XlcSetConverter(lcd, XlcNWideChar, lcd, XlcNCharSet, open_wcstocs);
1834
_XlcSetConverter(lcd, XlcNWideChar, lcd, XlcNChar, open_wcstocs1);
1836
_XlcSetConverter(lcd, XlcNString, lcd, XlcNMultiByte, open_strtoutf8);
1837
_XlcSetConverter(lcd, XlcNMultiByte, lcd, XlcNString, open_utf8tostr);
1838
_XlcSetConverter(lcd, XlcNUtf8String, lcd, XlcNMultiByte, open_identity);
1839
_XlcSetConverter(lcd, XlcNMultiByte, lcd, XlcNUtf8String, open_identity);
1841
/* Register converters for XlcNFontCharSet */
1842
_XlcSetConverter(lcd, XlcNMultiByte, lcd, XlcNFontCharSet, open_utf8tofcs);
1843
_XlcSetConverter(lcd, XlcNWideChar, lcd, XlcNFontCharSet, open_wcstofcs);