1
/*===========================================================================
2
* Filename : encoding.c
3
* About : Character encoding handling
5
* Copyright (C) 2005 Kazuki Ohta <mover AT hct.zaq.ne.jp>
6
* Copyright (C) 2005 Jun Inoue <jun.lambda AT gmail.com>
7
* Copyright (C) 2005-2006 YAMAMOTO Kengo <yamaken AT bp.iij4u.or.jp>
8
* Copyright (c) 2007 SigScheme Project <uim AT freedesktop.org>
10
* All rights reserved.
12
* Redistribution and use in source and binary forms, with or without
13
* modification, are permitted provided that the following conditions
16
* 1. Redistributions of source code must retain the above copyright
17
* notice, this list of conditions and the following disclaimer.
18
* 2. Redistributions in binary form must reproduce the above copyright
19
* notice, this list of conditions and the following disclaimer in the
20
* documentation and/or other materials provided with the distribution.
21
* 3. Neither the name of authors nor the names of its contributors
22
* may be used to endorse or promote products derived from this software
23
* without specific prior written permission.
25
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS
26
* IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
27
* THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR
29
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
32
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
33
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
34
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
35
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36
===========================================================================*/
38
/* Acknowledgement: much information was gained from the
39
* i18n-introduction of the debian project. Many thanks to its
40
* authors, Tomohiro KUBOTA, et al. */
43
* This file is intended to be portable. Don't depend on SigScheme and don't
44
* merge into another file.
53
#include "encoding-config.h"
56
/*=======================================
57
File Local Macro Definitions
58
=======================================*/
60
/*=======================================
61
File Local Type Definitions
62
=======================================*/
63
/* Shorthand for a raw octet.  The byte-range macros in this file cast their
 * argument to this type so that values >= 0x80 compare as unsigned. */
typedef unsigned char uchar;
65
/*=======================================
67
=======================================*/
68
static scm_bool pred_always_true(void) SCM_UNUSED;
69
static scm_bool pred_always_false(void) SCM_UNUSED;
72
static const char *eucjp_encoding(void);
73
static enum ScmCodedCharSet eucjp_ccs(void);
74
static int eucjp_char_len(scm_ichar_t ch);
75
static ScmMultibyteCharInfo eucjp_scan_char(ScmMultibyteString mbs);
76
static scm_ichar_t eucjp_str2int(const uchar *src, size_t len,
77
ScmMultibyteState state);
78
static uchar *eucjp_int2str(uchar *dst, scm_ichar_t ch,
79
ScmMultibyteState state);
83
static ScmMultibyteCharInfo iso2022kr_scan_char(ScmMultibyteString mbs);
84
static ScmMultibyteCharInfo iso2022kr_scan_input_char(ScmMultibyteString mbs);
88
static ScmMultibyteCharInfo iso2022jp_scan_char(ScmMultibyteString mbs);
89
static ScmMultibyteCharInfo iso2022jp_scan_input_char(ScmMultibyteString mbs);
93
static const char *sjis_encoding(void);
94
static enum ScmCodedCharSet sjis_ccs(void);
95
static int sjis_char_len(scm_ichar_t ch);
96
static ScmMultibyteCharInfo sjis_scan_char(ScmMultibyteString mbs);
97
static uchar *sjis_int2str(uchar *dst, scm_ichar_t ch,
98
ScmMultibyteState state);
101
#if (SCM_USE_EUCCN || SCM_USE_EUCKR || SCM_USE_SJIS)
102
/* generic double-byte char */
103
static scm_ichar_t dbc_str2int(const uchar *src, size_t len,
104
ScmMultibyteState state);
107
#if (SCM_USE_EUCCN || SCM_USE_EUCKR)
108
/* shared by EUCCN and EUCKR */
109
static int euc_char_len(scm_ichar_t ch);
110
static uchar *euc_int2str(uchar *dst, scm_ichar_t ch, ScmMultibyteState state);
114
static const char *euccn_encoding(void);
115
static enum ScmCodedCharSet euccn_ccs(void);
116
static ScmMultibyteCharInfo euccn_scan_char(ScmMultibyteString mbs);
120
static const char *euckr_encoding(void);
121
static enum ScmCodedCharSet euckr_ccs(void);
122
static ScmMultibyteCharInfo euckr_scan_char(ScmMultibyteString mbs);
126
static const char *utf8_encoding(void);
127
static enum ScmCodedCharSet utf8_ccs(void);
128
static int utf8_char_len(scm_ichar_t ch);
129
static ScmMultibyteCharInfo utf8_scan_char(ScmMultibyteString mbs);
130
static scm_ichar_t utf8_str2int(const uchar *src, size_t len,
131
ScmMultibyteState state);
132
static uchar *utf8_int2str(uchar *dst, scm_ichar_t ch,
133
ScmMultibyteState state);
136
static const char *unibyte_encoding(void);
137
static enum ScmCodedCharSet unibyte_ccs(void);
138
static int unibyte_char_len(scm_ichar_t ch);
139
static ScmMultibyteCharInfo unibyte_scan_char(ScmMultibyteString mbs);
140
static scm_ichar_t unibyte_str2int(const uchar *src, size_t len,
141
ScmMultibyteState state);
142
static uchar *unibyte_int2str(uchar *dst, scm_ichar_t ch,
143
ScmMultibyteState state);
145
/*=======================================
147
=======================================*/
149
static const ScmCharCodecVTbl utf8_codec_vtbl = {
155
(ScmCharCodecMethod_str2int)&utf8_str2int,
156
(ScmCharCodecMethod_int2str)&utf8_int2str
158
#define utf8_codec (&utf8_codec_vtbl)
162
static const ScmCharCodecVTbl euccn_codec_vtbl = {
168
(ScmCharCodecMethod_str2int)&dbc_str2int,
169
(ScmCharCodecMethod_int2str)&euc_int2str
171
#define euccn_codec (&euccn_codec_vtbl)
175
static const ScmCharCodecVTbl eucjp_codec_vtbl = {
181
(ScmCharCodecMethod_str2int)&eucjp_str2int,
182
(ScmCharCodecMethod_int2str)&eucjp_int2str
184
#define eucjp_codec (&eucjp_codec_vtbl)
188
static const ScmCharCodecVTbl euckr_codec_vtbl = {
194
(ScmCharCodecMethod_str2int)&dbc_str2int,
195
(ScmCharCodecMethod_int2str)&euc_int2str
197
#define euckr_codec (&euckr_codec_vtbl)
201
static const ScmCharCodecVTbl sjis_codec_vtbl = {
207
(ScmCharCodecMethod_str2int)&dbc_str2int,
208
(ScmCharCodecMethod_int2str)&sjis_int2str
210
#define sjis_codec (&sjis_codec_vtbl)
213
static const ScmCharCodecVTbl unibyte_codec_vtbl = {
219
(ScmCharCodecMethod_str2int)&unibyte_str2int,
220
(ScmCharCodecMethod_int2str)&unibyte_int2str
222
#define unibyte_codec (&unibyte_codec_vtbl)
224
static ScmCharCodec *const available_codecs[] = {
244
/*=======================================
246
=======================================*/
247
SCM_DEFINE_EXPORTED_VARS(encoding);
249
/*=======================================
251
=======================================*/
253
scm_encoding_init(void)
255
SCM_GLOBAL_VARS_INIT(encoding);
257
/* To allow re-initialization of the interpreter, these variables must be
258
* initialized by assignment. Initialized .data section does not work for
259
* such a situation. -- YamaKen 2006-03-31 */
261
/* temporary solution */
262
scm_current_char_codec
263
#if SCM_USE_UTF8_AS_DEFAULT
265
#elif SCM_USE_EUCJP_AS_DEFAULT
267
#elif SCM_USE_EUCCN_AS_DEFAULT
269
#elif SCM_USE_EUCKR_AS_DEFAULT
271
#elif SCM_USE_SJIS_AS_DEFAULT
279
scm_mb_strlen(ScmCharCodec *codec, ScmMultibyteString mbs)
282
ScmMultibyteCharInfo c;
284
SCM_ENCODING_CDBG((SCM_DBG_ENCODING, "mb_strlen: size = ~ZU; str = ~S;",
285
SCM_MBS_GET_SIZE(mbs), SCM_MBS_GET_STR(mbs)));
287
for (len = 0; SCM_MBS_GET_SIZE(mbs); len++) {
288
c = SCM_CHARCODEC_SCAN_CHAR(codec, mbs);
289
SCM_ENCODING_CDBG((SCM_DBG_ENCODING, "~ZU, ~D;",
290
SCM_MBCINFO_GET_SIZE(c), c.flag));
291
SCM_MBS_SKIP_CHAR(mbs, c);
294
SCM_ENCODING_CDBG((SCM_DBG_ENCODING, "len=~ZU\n", len));
298
/* FIXME: pick a better name. */
300
scm_mb_bare_c_strlen(ScmCharCodec *codec, const char *s)
302
ScmMultibyteString mbs;
304
SCM_MBS_INIT2(mbs, s, strlen(s));
305
return scm_mb_strlen(codec, mbs);
308
SCM_EXPORT ScmMultibyteString
309
scm_mb_substring(ScmCharCodec *codec,
310
ScmMultibyteString mbs, size_t i, size_t len)
312
ScmMultibyteString ret, end;
313
ScmMultibyteCharInfo c;
318
c = SCM_CHARCODEC_SCAN_CHAR(codec, ret);
319
SCM_MBS_SKIP_CHAR(ret, c);
325
c = SCM_CHARCODEC_SCAN_CHAR(codec, end);
326
SCM_MBS_SKIP_CHAR(end, c);
329
SCM_MBS_SET_SIZE(ret, SCM_MBS_GET_STR(end) - SCM_MBS_GET_STR(ret));
333
/* TODO: support encoding name canonicalization */
334
SCM_EXPORT ScmCharCodec *
335
scm_mb_find_codec(const char *encoding)
337
ScmCharCodec *const *codecp;
339
for (codecp = &available_codecs[0]; *codecp; codecp++) {
340
if (strcmp(SCM_CHARCODEC_ENCODING(*codecp), encoding) == 0)
347
SCM_EXPORT scm_ichar_t
348
scm_charcodec_read_char(ScmCharCodec *codec, ScmMultibyteString *mbs,
351
ScmMultibyteCharInfo mbc;
352
ScmMultibyteState state;
355
SCM_ENCODING_ASSERT(SCM_MBS_GET_SIZE(*mbs));
357
state = SCM_MBS_GET_STATE(*mbs);
358
mbc = SCM_CHARCODEC_SCAN_CHAR(codec, *mbs);
359
if (SCM_MBCINFO_ERRORP(mbc) || SCM_MBCINFO_INCOMPLETEP(mbc))
360
SCM_ENCODING_ERROR("scm_charcodec_read_char: invalid char sequence");
361
ch = SCM_CHARCODEC_STR2INT(codec,
362
SCM_MBS_GET_STR(*mbs),
363
SCM_MBCINFO_GET_SIZE(mbc),
365
if (ch == SCM_ICHAR_EOF)
366
SCM_ENCODING_ERROR("scm_charcodec_read_char: invalid char sequence");
368
SCM_MBS_SKIP_CHAR(*mbs, mbc);
373
/*=======================================
374
Encoding-specific functions
375
=======================================*/
378
pred_always_true(void)
384
pred_always_false(void)
389
/* Every encoding implements the <encoding name>_scan_char()
390
* primitive. Its job is to determine the length of the first
391
* character in the given string. Stateful encodings should save
392
* their state *at exit*, that is, the state right after reading the
393
* first character (so don't omit it). */
395
/* Convenience macros. Start with ENTER and return with RETURN*.
396
* EXPECT_SIZE() declares the expected length of the character. We'll
397
* use it to return information on how many octets are missing. It
398
* also serves as documentation. */
399
/* ENTER: declares the local result object `_ret' and initializes it with
 * SCM_MBCINFO_INIT().  It expands to a declaration followed by a statement,
 * and the other convenience macros here refer to `_ret' by name. */
#define ENTER ScmMultibyteCharInfo _ret; SCM_MBCINFO_INIT(_ret)
402
SCM_MBCINFO_SET_SIZE(_ret, n); \
404
} while (/* CONSTCOND */ 0)
405
#define RETURN_ERROR() \
407
SCM_MBCINFO_SET_ERROR(_ret); \
409
} while (/* CONSTCOND */ 0)
410
#define RETURN_INCOMPLETE(n) \
412
SCM_MBCINFO_SET_INCOMPLETE(_ret); \
414
} while (/* CONSTCOND */ 0)
415
/* SAVE_STATE(stat): stores `stat' into the pending result `_ret' via
 * SCM_MBCINFO_SET_STATE(); requires that ENTER has declared `_ret'. */
#define SAVE_STATE(stat) SCM_MBCINFO_SET_STATE(_ret, (stat))
416
/* EXPECT_SIZE(size): expands to SCM_EMPTY_EXPR, so `size' is not evaluated;
 * at present it only documents the expected character length. */
#define EXPECT_SIZE(size) SCM_EMPTY_EXPR /* Currently ignored. */
418
/* Encodings based on ISO/IEC 2022. */
420
/* Control regions. */
421
/* IN_CL(c): true when octet c is below 0x20 (the CL control region). */
#define IN_CL(c) ((uchar)(c) < 0x20)
422
/* IN_CR(c): true when octet c is in 0x80..0x9F (the CR control region).
 * Note that c is evaluated twice. */
#define IN_CR(c) (0x80 <= (uchar)(c) && (uchar)(c) <= 0x9F)
424
/* General purpose regions. */
425
/* IN_GL94(c): true when octet c is in 0x21..0x7E (94-character GL area).
 * Note that c is evaluated twice. */
#define IN_GL94(c) (0x21 <= (uchar)(c) && (uchar)(c) <= 0x7E)
426
/* IN_GL96(c): true when octet c is in 0x20..0x7F (96-character GL area,
 * i.e. GL94 plus 0x20 and 0x7F).  Note that c is evaluated twice. */
#define IN_GL96(c) (0x20 <= (uchar)(c) && (uchar)(c) <= 0x7F)
427
/* IN_GR94(c): true when octet c is in 0xA1..0xFE (94-character GR area).
 * Note that c is evaluated twice. */
#define IN_GR94(c) (0xA1 <= (uchar)(c) && (uchar)(c) <= 0xFE)
428
/* IN_GR96(c): true when octet c is 0xA0 or above (96-character GR area).
 * The upper-bound test is left commented out; presumably it is redundant
 * because the value has already been narrowed to uchar (assumes an 8-bit
 * uchar -- TODO confirm). */
#define IN_GR96(c) (0xA0 <= (uchar)(c) /* && (uchar)(c) <= 0xFF */)
430
/* IS_ASCII(c): true when c, converted to scm_ichar_t, is at most 0x7F.
 * NOTE(review): there is no lower-bound test; if scm_ichar_t is signed, a
 * negative argument also satisfies this -- confirm callers never pass one. */
#define IS_ASCII(c) ((scm_ichar_t)(c) <= 0x7F)
431
/* IS_GR_SPC_OR_DEL(c): true when octet c is 0xA0 or 0xFF, the two values that
 * IN_GR96() accepts but IN_GR94() rejects.  Note that c is evaluated twice. */
#define IS_GR_SPC_OR_DEL(c) ((uchar)(c) == 0xA0 || (uchar)(c) == 0xFF)
434
/* BYTE_MASK: selects the low eight bits of a packed character code. */
#define BYTE_MASK 0xFF
435
/* IS_1BYTE(e): true when packed code e is at most 0x7F.  Only an upper bound
 * is tested. */
#define IS_1BYTE(e) ((scm_ichar_t)(e) <= 0x7F)
436
/* IS_2BYTES(e): true when packed code e is at most 0xFFFF.  Only an upper
 * bound is tested, so smaller codes also match; the int2str routines use it
 * in an `else if' chain after the narrower case has been handled. */
#define IS_2BYTES(e) ((scm_ichar_t)(e) <= 0xFFFF)
437
/* IS_3BYTES(e): true when packed code e does not exceed the largest value
 * whose third-lowest byte is SS3, i.e. (SS3 << 2 bytes) | 0xFFFF.  Only an
 * upper bound is tested; it is used in an `else if' chain after IS_2BYTES(). */
#define IS_3BYTES(e) ((scm_ichar_t)(e) <= ((SS3 << CHAR_BITS * 2) | 0xFFFF))
452
static enum ScmCodedCharSet
458
/* FIXME: Optimize */
460
eucjp_char_len(scm_ichar_t ch)
463
uchar buf[SCM_MB_CHAR_BUF_SIZE];
465
end = eucjp_int2str(buf, ch, SCM_MB_STATELESS);
467
return (end) ? end - buf : 0;
470
/* G0 <- (96) ASCII (or was it JIS X 0201 Roman?)
471
* G1 <- (94x94) JIS X 0208 kanji/kana
472
* G2 <- (94) JIS X 0201 Katakana ("half-width katakana")
473
* G3 <- (94x94) JIS X 0212 kanji, or JIS X 0213 kanji plane 2
476
* GR <- G1 (JIS X 0208)
477
* CL <- JIS X 0211 C0
478
* CR <- JIS X 0211 C1 */
479
static ScmMultibyteCharInfo
480
eucjp_scan_char(ScmMultibyteString mbs)
482
const uchar *str = (const uchar *)SCM_MBS_GET_STR(mbs);
483
const size_t size = SCM_MBS_GET_SIZE(mbs);
489
if (IN_CL(str[0]) || IN_GL96(str[0]))
491
else if (IN_GR94(str[0]) || (uchar)str[0] == SS2) {
493
if (size < 2) RETURN_INCOMPLETE(1);
494
#if SCM_STRICT_ENCODING_CHECK
495
if (!IN_GR96(str[1])) RETURN_ERROR();
498
} else if ((uchar)str[0] == SS3) {
500
#if SCM_STRICT_ENCODING_CHECK
501
if (size < 2) RETURN_INCOMPLETE(size);
502
if (IS_GR_SPC_OR_DEL(str[1]))
504
if (!IN_GR94(str[1])) RETURN_ERROR();
505
if (size < 3) RETURN_INCOMPLETE(size);
506
if (!IN_GR94(str[2])) RETURN_ERROR();
508
#else /* not SCM_STRICT_ENCODING_CHECK */
510
RETURN_INCOMPLETE(size);
512
#endif /* not SCM_STRICT_ENCODING_CHECK */
519
eucjp_str2int(const uchar *src, size_t len, ScmMultibyteState state)
529
ch = src[0] << CHAR_BITS;
534
ch = src[0] << CHAR_BITS * 2;
535
ch |= src[1] << CHAR_BITS;
540
return SCM_ICHAR_EOF;
546
/* TODO: migrate to a canonical form shared with ISO-2022 variants that contain
547
absolute character set identifier instead of raw encoding-dependent
550
eucjp_int2str(uchar *dst, scm_ichar_t ch, ScmMultibyteState state)
552
#if SCM_STRICT_ENCODING_CHECK
558
} else if (IS_2BYTES(ch)) {
559
#if SCM_STRICT_ENCODING_CHECK
560
seq[0] = ch >> CHAR_BITS;
561
seq[1] = ch & BYTE_MASK;
562
if ((!IN_GR94(seq[0]) && seq[0] != SS2)
566
*dst++ = ch >> CHAR_BITS;
567
*dst++ = ch & BYTE_MASK;
568
} else if (IS_3BYTES(ch)) {
569
#if SCM_STRICT_ENCODING_CHECK
570
seq[0] = ch >> CHAR_BITS * 2;
571
seq[1] = (ch >> CHAR_BITS) & BYTE_MASK;
572
seq[2] = ch & BYTE_MASK;
573
if (seq[0] != SS3 || !IN_GR94(seq[1]) || !IN_GR94(seq[2]))
576
*dst++ = ch >> CHAR_BITS * 2;
577
*dst++ = (ch >> CHAR_BITS) & BYTE_MASK;
578
*dst++ = ch & BYTE_MASK;
586
#endif /* SCM_USE_EUCJP */
588
#if (SCM_USE_EUCCN || SCM_USE_EUCKR || SCM_USE_SJIS)
589
/* generic double-byte char */
591
dbc_str2int(const uchar *src, size_t len, ScmMultibyteState state)
601
ch = src[0] << CHAR_BITS;
606
return SCM_ICHAR_EOF;
611
#endif /* (SCM_USE_EUCCN || SCM_USE_EUCKR || SCM_USE_SJIS) */
613
#if (SCM_USE_EUCCN || SCM_USE_EUCKR)
614
/* FIXME: Optimize */
616
euc_char_len(scm_ichar_t ch)
619
uchar buf[SCM_MB_CHAR_BUF_SIZE];
621
end = euc_int2str(buf, ch, SCM_MB_STATELESS);
623
return (end) ? end - buf : 0;
627
euc_int2str(uchar *dst, scm_ichar_t ch, ScmMultibyteState state)
629
#if SCM_STRICT_ENCODING_CHECK
635
} else if (IS_2BYTES(ch)) {
636
#if SCM_STRICT_ENCODING_CHECK
637
seq[0] = ch >> CHAR_BITS;
638
seq[1] = ch & BYTE_MASK;
639
if (!IN_GR94(seq[0]) || !IN_GR96(seq[1]))
642
*dst++ = ch >> CHAR_BITS;
643
*dst++ = ch & BYTE_MASK;
651
#endif /* (SCM_USE_EUCCN || SCM_USE_EUCKR) */
660
static enum ScmCodedCharSet
663
return SCM_CCS_UNKNOWN;
666
/* FIXME: NOT TESTED!
668
* G0 <- ASCII (or GB 1988?)
672
* GR <- G1 (GB2312) */
673
static ScmMultibyteCharInfo
674
euccn_scan_char(ScmMultibyteString mbs)
676
/* TODO: maybe we can make this an alias of eucjp_scan_char()? */
677
const uchar *str = (const uchar *)SCM_MBS_GET_STR(mbs);
678
const size_t size = SCM_MBS_GET_SIZE(mbs);
683
if (IS_ASCII(str[0]))
685
if (IN_GR94(str[0])) {
688
RETURN_INCOMPLETE(size);
689
#if SCM_STRICT_ENCODING_CHECK
690
if (!IN_GR96(str[1]))
700
static enum ScmCodedCharSet
703
return SCM_CCS_UNKNOWN;
712
/* FIXME: NOT TESTED! I'm not sure about this encoding. There's also
713
* a Microsoft variant called CP949, which is not supported (yet).
714
* RFC 1557 says KS X 1001 is 94x94.
717
* G1 <- KS X 1001 (aka KSC 5601)
721
static ScmMultibyteCharInfo
722
euckr_scan_char(ScmMultibyteString mbs)
724
const uchar *str = (const uchar *)SCM_MBS_GET_STR(mbs);
725
const size_t size = SCM_MBS_GET_SIZE(mbs);
730
if (IS_ASCII(str[0]))
732
if (IN_GR94(str[0])) {
735
RETURN_INCOMPLETE(size);
736
#if SCM_STRICT_ENCODING_CHECK
737
if (!IN_GR96(str[1]))
744
#endif /* SCM_USE_EUCKR */
746
/*==== Encodings for Unicode ====*/
747
/* IN_OCT_BMP(u): true when code point u is at most 0x7FF; utf8_int2str()
 * emits a two-byte sequence for values selected by this test.  Only an upper
 * bound is checked. */
#define IN_OCT_BMP(u) ((scm_ichar_t)(u) <= 0x7ff)
748
/* IN_BMP(u): true when code point u is at most 0xFFFF.  Only an upper bound
 * is checked. */
#define IN_BMP(u) ((scm_ichar_t)(u) <= 0xffff)
749
/* IN_SMP(u): true when code point u is above 0xFFFF and at most 0x10FFFF.
 * Despite the name, the range covers every code point beyond the BMP up to
 * 0x10FFFF.  Note that u is evaluated twice. */
#define IN_SMP(u) ((scm_ichar_t)(u) <= 0x10ffff && !IN_BMP(u))
753
/* MASK(n): bit mask covering the top (n + 1) bits of an octet, i.e. the n
 * one-bits of a length prefix plus the zero bit that terminates it
 * (MASK(1) == 0xC0, MASK(2) == 0xE0, ...). */
#define MASK(n) ((LEN_CODE(n) >> 1) | 0x80)
754
/* LEN_CODE(n): an octet whose top n bits are set and whose remaining bits are
 * clear (LEN_CODE(1) == 0x80, LEN_CODE(2) == 0xC0, LEN_CODE(3) == 0xE0,
 * LEN_CODE(4) == 0xF0).  Every use of the parameter is parenthesized so that
 * an expression argument such as `k + 1' expands correctly. */
#define LEN_CODE(n) (((1 << (n)) - 1) << (8 - (n)))
755
/* IS_LEN(c, n): true when the top (n + 1) bits of c are n one-bits followed
 * by a zero bit, i.e. c carries the length prefix LEN_CODE(n). */
#define IS_LEN(c, n) ((MASK(n) & (c)) == LEN_CODE(n))
756
/* IS_TRAILING(c): true when c has the bit pattern 10xxxxxx (prefix length 1),
 * which marks a continuation byte. */
#define IS_TRAILING(c) (IS_LEN((c), 1))
758
/* LEN_CODE_BITS(n): number of bits occupied by a length prefix of n one-bits
 * plus its terminating zero bit.  The parameter is parenthesized so that an
 * expression argument (e.g. one using `&' or `?:') expands correctly. */
#define LEN_CODE_BITS(n) ((n) + 1)
759
/* TRAILING_CODE_BITS: width of the marker at the top of a continuation byte
 * (the prefix of length 1 plus its terminating zero bit). */
#define TRAILING_CODE_BITS LEN_CODE_BITS(1)
760
/* TRAILING_VAL_BITS: number of payload bits in a continuation byte, i.e.
 * CHAR_BITS minus the marker width (6 if CHAR_BITS is 8 -- CHAR_BITS is
 * defined outside this view). */
#define TRAILING_VAL_BITS (CHAR_BITS - TRAILING_CODE_BITS)
761
/* LEADING_VAL_BITS(n): number of payload bits left in the leading byte of an
 * n-byte sequence after its length prefix. */
#define LEADING_VAL_BITS(n) (CHAR_BITS - LEN_CODE_BITS(n))
762
/* LEADING_VAL(u, n): the bits of code point u that belong in the leading byte
 * of an n-byte sequence, obtained by shifting out the payload of the (n - 1)
 * continuation bytes.  No masking is applied here. */
#define LEADING_VAL(u, n) ((u) >> TRAILING_VAL_BITS * ((n) - 1))
763
/* TRAILING_VAL(u, i): payload for the continuation byte that lies i positions
 * before the end of the sequence (i == 0 is the last byte).  ~MASK(1) clears
 * the two marker bits; bits above the low octet are not cleared here and are
 * dropped when utf8_int2str() stores the result through its uchar pointer. */
#define TRAILING_VAL(u, i) (~MASK(1) & ((u) >> TRAILING_VAL_BITS * (i)))
771
static enum ScmCodedCharSet
777
/* FIXME: Optimize */
779
utf8_char_len(scm_ichar_t ch)
782
uchar buf[SCM_MB_CHAR_BUF_SIZE];
784
end = utf8_int2str(buf, ch, SCM_MB_STATELESS);
786
return (end) ? end - buf : 0;
789
static ScmMultibyteCharInfo
790
utf8_scan_char(ScmMultibyteString mbs)
792
const uchar *str = (const uchar *)SCM_MBS_GET_STR(mbs);
793
const size_t size = SCM_MBS_GET_SIZE(mbs);
799
if (IS_ASCII(str[0]))
802
if (IS_LEN(str[0], 2)) len = 2;
803
else if (IS_LEN(str[0], 3)) len = 3;
804
else if (IS_LEN(str[0], 4)) len = 4;
807
#if SCM_STRICT_ENCODING_CHECK
810
for (i = 1; i < len; i++) {
812
RETURN_INCOMPLETE(size);
813
if (!IS_TRAILING(str[i]))
817
#else /* not SCM_STRICT_ENCODING_CHECK */
819
RETURN_INCOMPLETE(size);
820
#endif /* not SCM_STRICT_ENCODING_CHECK */
827
utf8_str2int(const uchar *src, size_t len, ScmMultibyteState state)
837
ch = (~MASK(2) & src[0]) << TRAILING_VAL_BITS;
838
ch |= (~MASK(1) & src[1]);
842
ch = (~MASK(3) & src[0]) << TRAILING_VAL_BITS * 2;
843
ch |= (~MASK(1) & src[1]) << TRAILING_VAL_BITS;
844
ch |= (~MASK(1) & src[2]);
848
ch = (~MASK(4) & src[0]) << TRAILING_VAL_BITS * 3;
849
ch |= (~MASK(1) & src[1]) << TRAILING_VAL_BITS * 2;
850
ch |= (~MASK(1) & src[2]) << TRAILING_VAL_BITS;
851
ch |= (~MASK(1) & src[3]);
855
return SCM_ICHAR_EOF;
862
utf8_int2str(uchar *dst, scm_ichar_t ch, ScmMultibyteState state)
866
} else if (IN_OCT_BMP(ch)) {
867
*dst++ = LEN_CODE(2) | LEADING_VAL(ch, 2);
868
*dst++ = LEN_CODE(1) | TRAILING_VAL(ch, 0);
869
} else if (IN_BMP(ch)) {
870
*dst++ = LEN_CODE(3) | LEADING_VAL(ch, 3);
871
*dst++ = LEN_CODE(1) | TRAILING_VAL(ch, 1);
872
*dst++ = LEN_CODE(1) | TRAILING_VAL(ch, 0);
873
} else if (IN_SMP(ch)) {
874
*dst++ = LEN_CODE(4) | LEADING_VAL(ch, 4);
875
*dst++ = LEN_CODE(1) | TRAILING_VAL(ch, 2);
876
*dst++ = LEN_CODE(1) | TRAILING_VAL(ch, 1);
877
*dst++ = LEN_CODE(1) | TRAILING_VAL(ch, 0);
890
#undef TRAILING_CODE_BITS
891
#undef TRAILING_VAL_BITS
892
#undef LEADING_VAL_BITS
895
#endif /* SCM_USE_UTF8 */
897
/*==== Other encodings ====*/
900
/* The cwazy Japanese encoding. This function implements the JIS X
905
* 0x81 .. 0x9F: lead byte of 2-byte char
907
* 0xA1 .. 0xDF: JIS X 0201 katakana (1 byte)
908
* 0xE0 .. 0xEF: lead byte of 2-byte char
909
* 0xF0 .. 0xFC: lead byte of 2-byte char if JIS X 0213 is used
910
* 0xFD .. 0xFF: undefined
912
* 0x40 .. 0x7E: trailing byte of 2-byte char
913
* 0x80 .. 0xFC: trailing byte of 2-byte char
915
/* IS_KANA(c): true when octet c is in 0xA1..0xDF, the single-byte JIS X 0201
 * katakana range listed in the table above.  Note that c is evaluated twice. */
#define IS_KANA(c) (0xA1 <= (uchar)(c) && (uchar)(c) <= 0xDF)
917
(0x81 <= (uchar)(c) \
919
&& (uchar)(c) <= 0xFC \
920
&& (uchar)(c) != 0xA0)
921
/* IS_TRAIL(c): true when octet c is a valid second byte of a 2-byte char,
 * i.e. in 0x40..0x7E or 0x80..0xFC.  The single excluded value inside the
 * overall 0x40..0xFC span is 0x7F (DEL); 0x7E is a legal trailing byte.  All
 * comparisons are done on the value narrowed to uchar so that a plain (possibly
 * signed) char argument behaves the same.  Note that c is evaluated more than
 * once. */
#define IS_TRAIL(c) (0x40 <= (uchar)(c) && (uchar)(c) <= 0xFC && (uchar)(c) != 0x7F)
929
static enum ScmCodedCharSet
932
return SCM_CCS_UNKNOWN;
935
/* FIXME: Optimize */
937
sjis_char_len(scm_ichar_t ch)
940
uchar buf[SCM_MB_CHAR_BUF_SIZE];
942
end = sjis_int2str(buf, ch, SCM_MB_STATELESS);
944
return (end) ? end - buf : 0;
947
static ScmMultibyteCharInfo
948
sjis_scan_char(ScmMultibyteString mbs)
950
const uchar *str = (const uchar *)SCM_MBS_GET_STR(mbs);
951
const size_t size = SCM_MBS_GET_SIZE(mbs);
956
if (IS_LEAD(str[0])) {
959
RETURN_INCOMPLETE(size);
960
#if SCM_STRICT_ENCODING_CHECK
961
if (!IS_TRAIL(str[1]))
970
sjis_int2str(uchar *dst, scm_ichar_t ch, ScmMultibyteState state)
974
#if SCM_STRICT_ENCODING_CHECK
975
if (ch >> CHAR_BITS * 2)
978
high = ch >> CHAR_BITS;
979
low = ch & BYTE_MASK;
982
#if SCM_STRICT_ENCODING_CHECK
996
#endif /* SCM_USE_SJIS */
998
/* Single-byte encodings. Please add any that you know are missing.
999
* Sorted alphabetically.
1007
unibyte_encoding(void)
1009
/* conventional assumption */
1010
return "ISO-8859-1";
1013
static enum ScmCodedCharSet
1016
/* conventional assumption */
1017
return SCM_CCS_ISO8859_1;
1021
unibyte_char_len(scm_ichar_t ch)
1023
return (0 < ch && ch <= 0xff) ? 1 : 0;
1026
static ScmMultibyteCharInfo
1027
unibyte_scan_char(ScmMultibyteString mbs)
1031
if (SCM_MBS_GET_SIZE(mbs))
1037
unibyte_str2int(const uchar *src, size_t len, ScmMultibyteState state)
1039
#if SCM_STRICT_ENCODING_CHECK
1041
return SCM_ICHAR_EOF;
1047
unibyte_int2str(uchar *dst, scm_ichar_t ch, ScmMultibyteState state)
1049
#if SCM_STRICT_ENCODING_CHECK
1050
if (ch & ~BYTE_MASK)