1
/*===========================================================================
2
* FileName : encoding.c
3
* About : handling encoding
5
* Copyright (C) 2005 by Kazuki Ohta (mover@hct.zaq.ne.jp)
9
* Redistribution and use in source and binary forms, with or without
10
* modification, are permitted provided that the following conditions
13
* 1. Redistributions of source code must retain the above copyright
14
* notice, this list of conditions and the following disclaimer.
15
* 2. Redistributions in binary form must reproduce the above copyright
16
* notice, this list of conditions and the following disclaimer in the
17
* documentation and/or other materials provided with the distribution.
18
* 3. Neither the name of authors nor the names of its contributors
19
* may be used to endorse or promote products derived from this software
20
* without specific prior written permission.
22
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS''
23
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
26
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
27
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
28
* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33
===========================================================================*/
35
/* Acknowledgement: much information was gained from the
36
* i18n-introduction of the Debian project. Many thanks to its
37
* authors, Tomohiro KUBOTA, et al. */
40
/*=======================================
42
=======================================*/
44
/*=======================================
46
=======================================*/
47
#include "sigscheme.h"
48
#include "sigschemeinternal.h"
50
/*=======================================
52
=======================================*/
54
static ScmMultibyteCharInfo eucjp_scan_char(ScmMultibyteString mbs);
58
static ScmMultibyteCharInfo iso2022kr_scan_char(ScmMultibyteString mbs);
59
static ScmMultibyteCharInfo iso2022kr_scan_input_char(ScmMultibyteString mbs);
63
static ScmMultibyteCharInfo iso2022jp_scan_char(ScmMultibyteString mbs);
64
static ScmMultibyteCharInfo iso2022jp_scan_input_char(ScmMultibyteString mbs);
68
static ScmMultibyteCharInfo sjis_scan_char(ScmMultibyteString mbs);
72
static ScmMultibyteCharInfo euccn_scan_char(ScmMultibyteString mbs);
76
static ScmMultibyteCharInfo euckr_scan_char(ScmMultibyteString mbs);
80
static ScmMultibyteCharInfo utf8_scan_char(ScmMultibyteString mbs);
83
static ScmMultibyteCharInfo unibyte_scan_char(ScmMultibyteString mbs);
85
/* Unsigned byte type. The byte-range macros in this file cast their
 * argument through it, so the comparisons are done on an unsigned value
 * regardless of the signedness of plain char. */
typedef unsigned char uchar;
87
/*=======================================
89
=======================================*/
90
/* TODO: add some mechanism to dynamically switch between encodings. */
91
ScmMultibyteCharInfo (*Scm_mb_scan_char)(ScmMultibyteString mbs)
94
/*=======================================
96
=======================================*/
98
int Scm_mb_strlen(ScmMultibyteString mbs)
101
ScmMultibyteCharInfo c;
103
CDBG((SCM_DBG_ENCODING, "mb_strlen: size = %d; str = %s;",
104
SCM_MBS_GET_SIZE(mbs), SCM_MBS_GET_STR(mbs)));
106
while (SCM_MBS_GET_SIZE(mbs)) {
107
c = Scm_mb_scan_char(mbs);
108
CDBG((SCM_DBG_ENCODING, "%d, %d;", SCM_MBCINFO_GET_SIZE(c), c.flag));
109
SCM_MBS_SKIP_CHAR(mbs, c);
113
CDBG((SCM_DBG_ENCODING, "len=%d\n", len));
117
/* FIXME: pick a better name. */
118
int Scm_mb_bare_c_strlen(const char *s)
120
ScmMultibyteString mbs;
122
SCM_MBS_SET_STR(mbs, s);
123
SCM_MBS_SET_SIZE(mbs, strlen(s));
124
return Scm_mb_strlen(mbs);
127
ScmMultibyteString Scm_mb_substring(ScmMultibyteString mbs, int i, int len)
129
ScmMultibyteString ret;
130
ScmMultibyteString end;
131
ScmMultibyteCharInfo c;
136
c = Scm_mb_scan_char(ret);
137
SCM_MBS_SKIP_CHAR(ret, c);
143
c = Scm_mb_scan_char(end);
144
SCM_MBS_SKIP_CHAR(end, c);
147
SCM_MBS_SET_SIZE(ret, SCM_MBS_GET_STR(end) - SCM_MBS_GET_STR(ret));
152
/*=======================================
153
Encoding-specific functions
154
=======================================*/
156
/* Every encoding implements the <encoding name>_scan_char()
157
* primitive. Its job is to determine the length of the first
158
* character in the given string. Stateful encodings should save
159
* their state *at exit*, that is, the state right after reading the
160
* first character (so don't omit it). */
162
/* Convenience macros. Start with ENTER and return with RETURN*.
163
* EXPECT_SIZE() declares the expected length of the character. We'll
164
* use it to return information on how many octets are missing. It
165
* also serves as documentation. */
166
/* Opens a scanner body: declares the local result record `_ret` and
 * initialises it.  The RETURN*() and SAVE_STATE() macros operate on it. */
#define ENTER                                                                \
    ScmMultibyteCharInfo _ret;                                               \
    SCM_MBCINFO_INIT(_ret)
167
/* Records n as the size in `_ret` and returns `_ret` from the
 * enclosing function. */
#define RETURN(n)                                                            \
    do {                                                                     \
        SCM_MBCINFO_SET_SIZE(_ret, (n));                                     \
        return _ret;                                                         \
    } while (0)
168
/* Marks `_ret` as an error and returns it with a size of 1. */
#define RETURN_ERROR()                                                       \
    do {                                                                     \
        SCM_MBCINFO_SET_ERROR(_ret);                                         \
        RETURN(1);                                                           \
    } while (0)
169
/* Marks `_ret` as incomplete and returns it with a size of n. */
#define RETURN_INCOMPLETE(n)                                                 \
    do {                                                                     \
        SCM_MBCINFO_SET_INCOMPLETE(_ret);                                    \
        RETURN(n);                                                           \
    } while (0)
170
/* Stores the given encoding state in `_ret`. */
#define SAVE_STATE(state) (SCM_MBCINFO_SET_STATE(_ret, (state)))
171
/* Declares the expected octet count of the character being scanned.
 * Expands to nothing. */
#define EXPECT_SIZE(octets) /* Currently ignored. */
173
/* Encodings based on ISO/IEC 2022. */
175
/* Control regions. */
176
/* True when the byte lies in the CL control region, 0x00..0x1F. */
#define IN_CL(c) ((uchar)(c) <= 0x1F)
177
/* True when the byte lies in the CR control region, 0x80..0x9F. */
#define IN_CR(c) ((uchar)(c) >= 0x80 && (uchar)(c) <= 0x9F)
179
/* General purpose regions. */
180
/* True when the byte lies in the 94-character GL region, 0x21..0x7E. */
#define IN_GL94(c) ((uchar)(c) >= 0x21 && (uchar)(c) <= 0x7E)
181
/* True when the byte lies in the 96-character GL region, 0x20..0x7F. */
#define IN_GL96(c) ((uchar)(c) >= 0x20 && (uchar)(c) <= 0x7F)
182
/* True when the byte lies in the 94-character GR region, 0xA1..0xFE. */
#define IN_GR94(c) ((uchar)(c) >= 0xA1 && (uchar)(c) <= 0xFE)
183
/* True when the byte lies in the 96-character GR region, 0xA0..0xFF. */
#define IN_GR96(c) ((uchar)(c) >= 0xA0 && (uchar)(c) <= 0xFF)
185
/* True when the byte, taken as unsigned, is in 0x00..0x7F. */
#define IS_ASCII(c) ((uchar)(c) < 0x80)
186
/* True for the two bytes that IN_GR96 accepts but IN_GR94 does not:
 * 0xA0 and 0xFF. */
#define IS_GR_SPC_OR_DEL(c) (0xA0 == (uchar)(c) || 0xFF == (uchar)(c))
196
/* G0 <- (96) ASCII (or was it JIS X 0201 Roman?)
197
* G1 <- (94x94) JIS X 0208 kanji/kana
198
* G2 <- (94) JIS X 0201 Katakana ("half-width katakana")
199
* G3 <- (94x94) JIS X 0212 kanji, or JIS X 0213 kanji plane 2
202
* GR <- G1 (JIS X 0208)
203
* CL <- JIS X 0211 C0
204
* CR <- JIS X 0211 C1 */
205
static ScmMultibyteCharInfo eucjp_scan_char(ScmMultibyteString mbs)
207
const char *str = SCM_MBS_GET_STR(mbs);
208
const int size = SCM_MBS_GET_SIZE(mbs);
214
if (IN_CL(str[0]) || IN_GL96(str[0]))
216
else if (IN_GR94(str[0]) || (uchar)str[0] == SS2) {
218
if (size < 2) RETURN_INCOMPLETE(1);
219
#if SCM_STRICT_ENCODING_CHECK
220
if (!IN_GR96(str[1])) RETURN_ERROR();
223
} else if ((uchar)str[0] == SS3) {
225
#if SCM_STRICT_ENCODING_CHECK
226
if (size < 2) RETURN_INCOMPLETE(size);
227
if (IS_GR_SPC_OR_DEL(str[1]))
229
if (!IN_GR94(str[1])) RETURN_ERROR();
230
if (size < 3) RETURN_INCOMPLETE(size);
231
if (!IN_GR94(str[2])) RETURN_ERROR();
233
#else /* not SCM_STRICT_ENCODING_CHECK */
235
RETURN_INCOMPLETE(size);
237
#endif /* not SCM_STRICT_ENCODING_CHECK */
242
#endif /* SCM_USE_EUCJP */
245
/* FIXME: NOT TESTED!
247
* G0 <- ASCII (or GB 1988?)
251
* GR <- G1 (GB2312) */
252
static ScmMultibyteCharInfo euccn_scan_char(ScmMultibyteString mbs)
254
/* TODO: maybe we can make this an alias of eucjp_scan_char()? */
255
const char *str = SCM_MBS_GET_STR(mbs);
256
const int size = SCM_MBS_GET_SIZE(mbs);
261
if (IS_ASCII(str[0]))
263
if (IN_GR94(str[0])) {
266
RETURN_INCOMPLETE(size);
267
#if SCM_STRICT_ENCODING_CHECK
268
if (!IN_GR96(str[1]))
278
/* FIXME: NOT TESTED! I'm not sure about this encoding. There's also
279
* a Microsoft variant called CP949, which is not supported (yet).
280
* RFC 1557 says KS X 1001 is 94x94.
283
* G1 <- KS X 1001 (aka KSC 5601)
287
static ScmMultibyteCharInfo euckr_scan_char(ScmMultibyteString mbs)
289
const char *str = SCM_MBS_GET_STR(mbs);
290
const int size = SCM_MBS_GET_SIZE(mbs);
295
if (IS_ASCII(str[0]))
297
if (IN_GR94(str[0])) {
300
RETURN_INCOMPLETE(size);
301
#if SCM_STRICT_ENCODING_CHECK
302
if (!IN_GR96(str[1]))
309
#endif /* SCM_USE_EUCKR */
311
/*==== Encodings for Unicode ====*/
314
/* Bit mask used by IS_LEN(): LEN_CODE(n) shifted right by one, with the
 * top bit (0x80) set. */
#define MASK(n) (0x80 | (LEN_CODE(n) >> 1))
315
/* Bit pattern with the top n bits of an octet set, e.g. n = 2 gives
 * 0xC0.  Every use of the parameter is parenthesized so that an
 * expression argument (such as `k + 1`) yields the correct shift count. */
#define LEN_CODE(n) (((1 << (n)) - 1) << (8 - (n)))
316
/* True when c, masked with MASK(n), equals LEN_CODE(n). */
#define IS_LEN(c, n) (((c) & MASK(n)) == LEN_CODE(n))
317
/* True when c passes the IS_LEN() test with n = 1. */
#define IS_TRAILING(c) IS_LEN((c), 1)
319
static ScmMultibyteCharInfo utf8_scan_char(ScmMultibyteString mbs)
321
const char *str = SCM_MBS_GET_STR(mbs);
322
const int size = SCM_MBS_GET_SIZE(mbs);
328
if (IS_ASCII(str[0]))
331
if (IS_LEN(str[0], 2)) len = 2;
332
else if (IS_LEN(str[0], 3)) len = 3;
333
else if (IS_LEN(str[0], 4)) len = 4;
336
#if SCM_STRICT_ENCODING_CHECK
339
for (i=1; i < len; i++) {
341
RETURN_INCOMPLETE(size);
342
if (!IS_TRAILING(str[i]))
346
#else /* not SCM_STRICT_ENCODING_CHECK */
348
RETURN_INCOMPLETE(size);
349
#endif /* not SCM_STRICT_ENCODING_CHECK */
359
#endif /* SCM_USE_UTF8 */
361
/*==== Other encodings ====*/
364
/* The cwazy Japanese encoding. This function implements the JIS X
369
* 0x81 .. 0x9F: lead byte of 2-byte char
371
* 0xA1 .. 0xDF: JIS X 0201 katakana (1 byte)
372
* 0xE0 .. 0xEF: lead byte of 2-byte char
373
* 0xF0 .. 0xFC: lead byte of 2-byte char if JIS X 0213 is used
374
* 0xFD .. 0xFF: undefined
376
* 0x40 .. 0x7E: trailing byte of 2-byte char
377
* 0x80 .. 0xFC: trailing byte of 2-byte char
379
static ScmMultibyteCharInfo sjis_scan_char(ScmMultibyteString mbs)
381
/* True for the single-byte JIS X 0201 katakana range, 0xA1..0xDF. */
#define IS_KANA(c) ((uchar)(c) >= 0xA1 && (uchar)(c) <= 0xDF)
383
(0x81 <= (uchar)(c) \
385
&& (uchar)(c) <= 0xFC \
386
&& (uchar)(c) != 0xA0)
387
/* True when c is a valid trailing byte of a 2-byte character:
 * 0x40..0x7E or 0x80..0xFC.  0x7F is the one value inside 0x40..0xFC
 * that is not a trailing byte; 0x7E is valid.  The comparison is done on
 * the unsigned value so it does not depend on the signedness of char. */
#define IS_TRAIL(c) \
    (0x40 <= (uchar)(c) && (uchar)(c) <= 0xFC && (uchar)(c) != 0x7F)
389
const char *str = SCM_MBS_GET_STR(mbs);
390
const int size = SCM_MBS_GET_SIZE(mbs);
394
if (IS_LEAD(str[0])) {
397
RETURN_INCOMPLETE(size);
398
#if SCM_STRICT_ENCODING_CHECK
399
if (!IS_TRAIL(str[1]))
410
#endif /* SCM_USE_SJIS */
412
/* Single-byte encodings. Please add any that you know are missing.
413
* Sorted alphabetically.
420
static ScmMultibyteCharInfo unibyte_scan_char(ScmMultibyteString mbs)
423
if (SCM_MBS_GET_SIZE(mbs))