2
* Copyright (c) 2007-2009, Paul Mattes.
5
* Redistribution and use in source and binary forms, with or without
6
* modification, are permitted provided that the following conditions are met:
7
* * Redistributions of source code must retain the above copyright
8
* notice, this list of conditions and the following disclaimer.
9
* * Redistributions in binary form must reproduce the above copyright
10
* notice, this list of conditions and the following disclaimer in the
11
* documentation and/or other materials provided with the distribution.
12
* * Neither the names of Paul Mattes nor the names of his contributors
13
* may be used to endorse or promote products derived from this software
14
* without specific prior written permission.
16
* THIS SOFTWARE IS PROVIDED BY PAUL MATTES "AS IS" AND ANY EXPRESS OR IMPLIED
17
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
18
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
19
* EVENT SHALL PAUL MATTES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
20
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
22
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
23
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
24
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
25
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
* 3270 Terminal Emulator
39
char *locale_codeset = CN;
40
Boolean is_utf8 = False;
43
* Save the codeset from the locale, and set globals based on known values.
46
set_codeset(char *codeset_name)
48
#if !defined(TCL3270) /*[*/
49
is_utf8 = (!strcasecmp(codeset_name, "utf-8") ||
50
!strcasecmp(codeset_name, "utf8") ||
51
!strcasecmp(codeset_name, "utf_8"));
54
* tcl3270 is always in UTF-8 mode, because it needs to
55
* supply UTF-8 strings to libtcl and vice-versa.
60
Replace(locale_codeset, NewString(codeset_name));
64
* Convert from UCS-4 to UTF-8.
66
* >0: length of converted character
70
unicode_to_utf8(ucs4_t ucs4, char *utf8)
72
if (ucs4 & 0x80000000)
75
if (ucs4 <= 0x0000007f) {
76
utf8[0] = ucs4 & 0x7f; /* 7 bits */
78
} else if (ucs4 <= 0x000007ff) {
79
utf8[0] = 0xc0 | ((ucs4 >> 6) & 0x1f); /* upper 5 bits */
80
utf8[1] = 0x80 | (ucs4 & 0x3f); /* lower 6 bits */
82
} else if (ucs4 <= 0x0000ffff) {
83
utf8[0] = 0xe0 | ((ucs4 >> 12) & 0x0f); /* upper 4 bits */
84
utf8[1] = 0x80 | ((ucs4 >> 6) & 0x3f); /* next 6 bits */
85
utf8[2] = 0x80 | (ucs4 & 0x3f); /* last 6 bits */
87
} else if (ucs4 <= 0x001fffff) {
88
utf8[0] = 0xf0 | ((ucs4 >> 18) & 0x07); /* upper 3 bits */
89
utf8[1] = 0x80 | ((ucs4 >> 12) & 0x3f); /* next 6 bits */
90
utf8[2] = 0x80 | ((ucs4 >> 6) & 0x3f); /* next 6 bits */
91
utf8[3] = 0x80 | (ucs4 & 0x3f); /* last 6 bits */
93
} else if (ucs4 <= 0x03ffffff) {
94
utf8[0] = 0xf8 | ((ucs4 >> 24) & 0x03); /* upper 2 bits */
95
utf8[1] = 0x80 | ((ucs4 >> 18) & 0x3f); /* next 6 bits */
96
utf8[2] = 0x80 | ((ucs4 >> 12) & 0x3f); /* next 6 bits */
97
utf8[3] = 0x80 | ((ucs4 >> 6) & 0x3f); /* next 6 bits */
98
utf8[4] = 0x80 | (ucs4 & 0x3f); /* last 6 bits */
101
utf8[0] = 0xfc | ((ucs4 >> 30) & 0x01); /* upper 1 bit */
102
utf8[1] = 0x80 | ((ucs4 >> 24) & 0x3f); /* next 6 bits */
103
utf8[2] = 0x80 | ((ucs4 >> 18) & 0x3f); /* next 6 bits */
104
utf8[3] = 0x80 | ((ucs4 >> 12) & 0x3f); /* next 6 bits */
105
utf8[4] = 0x80 | ((ucs4 >> 6) & 0x3f); /* next 6 bits */
106
utf8[5] = 0x80 | (ucs4 & 0x3f); /* last 6 bits */
112
* Convert at most 'len' bytes from a UTF-8 string to one UCS-4 character.
114
* >0: Number of bytes consumed.
115
* 0: Incomplete sequence.
116
* -1: Invalid sequence.
117
* -2: Illegal (too-long) encoding.
118
* -3: Invalid lead byte.
120
* An invalid sequence can be either improperly composed, or using the wrong
121
* encoding length (often used to get past spam filters and such).
124
utf8_to_unicode(const char *utf8, int len, ucs4_t *ucs4)
126
/* No input is by definition incomplete. */
130
/* See if it's ASCII-7. */
131
if ((utf8[0] & 0xff) < 0x80) {
132
*ucs4 = utf8[0] & 0x7f;
136
/* Now check for specific UTF-8 leading bytes. */
137
if ((utf8[0] & 0xe0) == 0xc0) {
139
* 0x00000080-0x000007ff */
142
if ((utf8[1] & 0xc0) != 0x80)
144
*ucs4 = ((utf8[0] << 6) & 0x7c0) |
146
if (*ucs4 < 0x00000080)
151
if ((utf8[0] & 0xf0) == 0xe0) {
152
/* 1110xxxx 10xxxxxx 10xxxxxx
153
* 0x00000800-0x0000ffff */
156
if (((utf8[1] & 0xc0) != 0x80) ||
157
((utf8[2] & 0xc0) != 0x80))
159
*ucs4 = ((utf8[0] << 12) & 0xf000) |
160
((utf8[1] << 6) & 0x0fc0) |
161
((utf8[2]) & 0x003f);
162
if (*ucs4 < 0x00000800)
167
if ((utf8[0] & 0xf8) == 0xf0) {
168
/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
169
* 0x00010000-0x001fffff */
172
if (((utf8[1] & 0xc0) != 0x80) ||
173
((utf8[2] & 0xc0) != 0x80) ||
174
((utf8[3] & 0xc0) != 0x80))
176
*ucs4 = ((utf8[0] << 18) & 0x1c0000) |
177
((utf8[1] << 12) & 0x03f000) |
178
((utf8[2] << 6) & 0x000fc0) |
179
((utf8[3]) & 0x00003f);
180
if (*ucs4 < 0x00010000)
185
if ((utf8[0] & 0xfc) == 0xf8) {
186
/* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
187
* 0x00200000-0x03ffffff */
190
if (((utf8[1] & 0xc0) != 0x80) ||
191
((utf8[2] & 0xc0) != 0x80) ||
192
((utf8[3] & 0xc0) != 0x80) ||
193
((utf8[4] & 0xc0) != 0x80))
195
*ucs4 = ((utf8[0] << 24) & 0x3000000) |
196
((utf8[1] << 18) & 0x0fc0000) |
197
((utf8[2] << 12) & 0x003f000) |
198
((utf8[3] << 6) & 0x0000fc0) |
199
((utf8[4]) & 0x000003f);
200
if (*ucs4 < 0x00200000)
205
if ((utf8[0] & 0xfe) == 0xfc) {
206
/* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
207
* 0x04000000-0x7fffffff */
210
if (((utf8[1] & 0xc0) != 0x80) ||
211
((utf8[2] & 0xc0) != 0x80) ||
212
((utf8[3] & 0xc0) != 0x80) ||
213
((utf8[4] & 0xc0) != 0x80) ||
214
((utf8[5] & 0xc0) != 0x80))
216
*ucs4 = ((utf8[0] << 30) & 0x40000000) |
217
((utf8[1] << 24) & 0x3f000000) |
218
((utf8[2] << 18) & 0x00fc0000) |
219
((utf8[3] << 12) & 0x0003f000) |
220
((utf8[4] << 6) & 0x00000fc0) |
221
((utf8[5]) & 0x0000003f);
222
if (*ucs4 < 0x04000000)