1
/* Convert multibyte character to wide character.
2
Copyright (C) 1999-2002, 2005-2010 Free Software Foundation, Inc.
3
Written by Bruno Haible <bruno@clisp.org>, 2008.
5
This program is free software: you can redistribute it and/or modify
6
it under the terms of the GNU General Public License as published by
7
the Free Software Foundation; either version 3 of the License, or
8
(at your option) any later version.
10
This program is distributed in the hope that it will be useful,
11
but WITHOUT ANY WARRANTY; without even the implied warranty of
12
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
GNU General Public License for more details.
15
You should have received a copy of the GNU General Public License
16
along with this program. If not, see <http://www.gnu.org/licenses/>. */
23
#if GNULIB_defined_mbstate_t
24
/* Implement mbrtowc() on top of mbtowc(). */
29
# include "localcharset.h"
34
verify (sizeof (mbstate_t) >= 4);
36
static char internal_state[4];
39
mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
41
char *pstate = (char *)ps;
44
pstate = internal_state;
58
size_t nstate = pstate[0];
95
/* Work around bug <http://sourceware.org/bugzilla/show_bug.cgi?id=9674> */
96
mbtowc (NULL, NULL, 0);
99
int res = mbtowc (pwc, p, m);
103
if (pwc != NULL && ((*pwc == 0) != (res == 0)))
105
if (nstate >= (res > 0 ? res : 1))
112
/* mbtowc does not distinguish between invalid and incomplete multibyte
113
sequences. But mbrtowc needs to make this distinction.
114
There are two possible approaches:
115
- Use iconv() and its return value.
116
- Use built-in knowledge about the possible encodings.
117
Given the low quality of implementation of iconv() on the systems that
118
lack mbrtowc(), we use the second approach.
119
The possible encodings are:
121
- EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, GB18030, SJIS, and UTF-8.
123
Use specialized code for each. */
124
if (m >= 4 || m >= MB_CUR_MAX)
126
/* Here MB_CUR_MAX > 1 and 0 < m < 4. */
128
const char *encoding = locale_charset ();
130
if (STREQ (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
132
/* Cf. unistr/u8-mblen.c. */
133
unsigned char c = (unsigned char) p[0];
148
unsigned char c2 = (unsigned char) p[1];
150
if ((c2 ^ 0x80) < 0x40
151
&& (c >= 0xe1 || c2 >= 0xa0)
152
&& (c != 0xed || c2 < 0xa0))
160
else /* m == 2 || m == 3 */
162
unsigned char c2 = (unsigned char) p[1];
164
if ((c2 ^ 0x80) < 0x40
165
&& (c >= 0xf1 || c2 >= 0x90)
166
&& (c < 0xf4 || (c == 0xf4 && c2 < 0x90)))
172
unsigned char c3 = (unsigned char) p[2];
174
if ((c3 ^ 0x80) < 0x40)
184
/* As a reference for this code, you can use the GNU libiconv
185
implementation. Look for uses of the RET_TOOFEW macro. */
187
if (STREQ (encoding, "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0))
191
unsigned char c = (unsigned char) p[0];
193
if ((c >= 0xa1 && c < 0xff) || c == 0x8e || c == 0x8f)
198
unsigned char c = (unsigned char) p[0];
202
unsigned char c2 = (unsigned char) p[1];
204
if (c2 >= 0xa1 && c2 < 0xff)
210
if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
211
|| STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
212
|| STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0))
216
unsigned char c = (unsigned char) p[0];
218
if (c >= 0xa1 && c < 0xff)
223
if (STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0))
227
unsigned char c = (unsigned char) p[0];
229
if ((c >= 0xa1 && c < 0xff) || c == 0x8e)
232
else /* m == 2 || m == 3 */
234
unsigned char c = (unsigned char) p[0];
241
if (STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
245
unsigned char c = (unsigned char) p[0];
247
if ((c >= 0x90 && c <= 0xe3) || (c >= 0xf8 && c <= 0xfe))
250
else /* m == 2 || m == 3 */
252
unsigned char c = (unsigned char) p[0];
254
if (c >= 0x90 && c <= 0xe3)
256
unsigned char c2 = (unsigned char) p[1];
258
if (c2 >= 0x30 && c2 <= 0x39)
264
unsigned char c3 = (unsigned char) p[2];
266
if (c3 >= 0x81 && c3 <= 0xfe)
274
if (STREQ (encoding, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0))
278
unsigned char c = (unsigned char) p[0];
280
if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)
281
|| (c >= 0xf0 && c <= 0xf9))
287
/* An unknown multibyte encoding. */
294
/* Here 0 <= k < m < 4. */
310
/* The conversion state is undefined, says POSIX. */
317
/* Override the system's mbrtowc() function. */
322
rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
324
# if MBRTOWC_NULL_ARG_BUG || MBRTOWC_RETVAL_BUG
333
# if MBRTOWC_RETVAL_BUG
335
static mbstate_t internal_state;
337
/* Override mbrtowc's internal state. We cannot call mbsinit() on the
338
hidden internal state, but we can call it on our variable. */
340
ps = &internal_state;
344
/* Parse the rest of the multibyte character byte for byte. */
346
for (; n > 0; s++, n--)
349
size_t ret = mbrtowc (&wc, s, 1, ps);
351
if (ret == (size_t)(-1))
354
if (ret != (size_t)(-2))
356
/* The multibyte character has been completed. */
359
return (wc == 0 ? 0 : count);
367
# if MBRTOWC_NUL_RETVAL_BUG
370
size_t ret = mbrtowc (&wc, s, n, ps);
372
if (ret != (size_t)(-1) && ret != (size_t)(-2))
382
return mbrtowc (pwc, s, n, ps);