1
/* Convert multibyte character to wide character.
2
Copyright (C) 1999-2002, 2005-2008 Free Software Foundation, Inc.
3
Written by Bruno Haible <bruno@clisp.org>, 2008.
5
This program is free software: you can redistribute it and/or modify
6
it under the terms of the GNU General Public License as published by
7
the Free Software Foundation; either version 3 of the License, or
8
(at your option) any later version.
10
This program is distributed in the hope that it will be useful,
11
but WITHOUT ANY WARRANTY; without even the implied warranty of
12
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
GNU General Public License for more details.
15
You should have received a copy of the GNU General Public License
16
along with this program. If not, see <http://www.gnu.org/licenses/>. */
23
#if GNULIB_defined_mbstate_t
24
/* Implement mbrtowc() on top of mbtowc(). */
29
# include "localcharset.h"
34
verify (sizeof (mbstate_t) >= 4);
36
static char internal_state[4];
39
mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
41
char *pstate = (char *)ps;
44
pstate = internal_state;
58
size_t nstate = pstate[0];
95
/* Work around bug <http://sourceware.org/bugzilla/show_bug.cgi?id=9674> */
96
mbtowc (NULL, NULL, 0);
99
int res = mbtowc (pwc, p, m);
103
if (pwc != NULL && ((*pwc == 0) != (res == 0)))
105
if (nstate >= (res > 0 ? res : 1))
112
/* mbtowc does not distinguish between invalid and incomplete multibyte
113
sequences. But mbrtowc needs to make this distinction.
114
There are two possible approaches:
115
- Use iconv() and its return value.
116
- Use built-in knowledge about the possible encodings.
117
Given the low quality of implementation of iconv() on the systems that
118
lack mbrtowc(), we use the second approach.
119
The possible encodings are:
121
- EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, SJIS, UTF-8.
123
Use specialized code for each. */
124
if (m >= 4 || m >= MB_CUR_MAX)
126
/* Here MB_CUR_MAX > 1 and 0 < m < 4. */
128
const char *encoding = locale_charset ();
130
if (STREQ (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
132
/* Cf. unistr/u8-mblen.c. */
133
unsigned char c = (unsigned char) p[0];
148
unsigned char c2 = (unsigned char) p[1];
150
if ((c2 ^ 0x80) < 0x40
151
&& (c >= 0xe1 || c2 >= 0xa0)
152
&& (c != 0xed || c2 < 0xa0))
160
else /* m == 2 || m == 3 */
162
unsigned char c2 = (unsigned char) p[1];
164
if ((c2 ^ 0x80) < 0x40
165
&& (c >= 0xf1 || c2 >= 0x90)
166
&& (c < 0xf4 || (c == 0xf4 && c2 < 0x90)))
172
unsigned char c3 = (unsigned char) p[2];
174
if ((c3 ^ 0x80) < 0x40)
184
/* As a reference for this code, you can use the GNU libiconv
185
implementation. Look for uses of the RET_TOOFEW macro. */
187
if (STREQ (encoding, "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0))
191
unsigned char c = (unsigned char) p[0];
193
if ((c >= 0xa1 && c < 0xff) || c == 0x8e || c == 0x8f)
198
unsigned char c = (unsigned char) p[0];
202
unsigned char c2 = (unsigned char) p[1];
204
if (c2 >= 0xa1 && c2 < 0xff)
210
if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
211
|| STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
212
|| STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0))
216
unsigned char c = (unsigned char) p[0];
218
if (c >= 0xa1 && c < 0xff)
223
if (STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0))
227
unsigned char c = (unsigned char) p[0];
229
if ((c >= 0xa1 && c < 0xff) || c == 0x8e)
232
else /* m == 2 || m == 3 */
234
unsigned char c = (unsigned char) p[0];
241
if (STREQ (encoding, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0))
245
unsigned char c = (unsigned char) p[0];
247
if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)
248
|| (c >= 0xf0 && c <= 0xf9))
254
/* An unknown multibyte encoding. */
261
/* Here 0 < k < m < 4. */
273
/* The conversion state is undefined, says POSIX. */
280
/* Override the system's mbrtowc() function. */
285
rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
287
# if MBRTOWC_NULL_ARG_BUG || MBRTOWC_RETVAL_BUG
296
# if MBRTOWC_RETVAL_BUG
298
static mbstate_t internal_state;
300
/* Override mbrtowc's internal state. We cannot call mbsinit() on the
301
hidden internal state, but we can call it on our variable. */
303
ps = &internal_state;
307
/* Parse the rest of the multibyte character byte for byte. */
309
for (; n > 0; s++, n--)
312
size_t ret = mbrtowc (&wc, s, 1, ps);
314
if (ret == (size_t)(-1))
317
if (ret != (size_t)(-2))
319
/* The multibyte character has been completed. */
322
return (wc == 0 ? 0 : count);
330
# if MBRTOWC_NUL_RETVAL_BUG
333
size_t ret = mbrtowc (&wc, s, n, ps);
335
if (ret != (size_t)(-1) && ret != (size_t)(-2))
345
return mbrtowc (pwc, s, n, ps);