1
/* Determine a canonical name for the current locale's character encoding.
3
Copyright (C) 2000-2006, 2008-2014 Free Software Foundation, Inc.
5
This program is free software; you can redistribute it and/or modify
6
it under the terms of the GNU General Public License as published by
7
the Free Software Foundation; either version 3, or (at your option)
any later version.
10
This program is distributed in the hope that it will be useful,
11
but WITHOUT ANY WARRANTY; without even the implied warranty of
12
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
GNU General Public License for more details.
15
You should have received a copy of the GNU General Public License along
16
with this program; if not, see <http://www.gnu.org/licenses/>. */
18
/* Written by Bruno Haible <bruno@clisp.org>. */
23
#include "localcharset.h"
31
#if defined __APPLE__ && defined __MACH__ && HAVE_LANGINFO_CODESET
32
# define DARWIN7 /* Darwin 7 or newer, i.e. Mac OS X 10.3 or newer */
35
#if defined _WIN32 || defined __WIN32__
36
# define WINDOWS_NATIVE
41
/* Assume EMX program runs on OS/2, even if compiled under DOS. */
47
#if !defined WINDOWS_NATIVE
49
# if HAVE_LANGINFO_CODESET
50
# include <langinfo.h>
52
# if 0 /* see comment below */
57
# define WIN32_LEAN_AND_MEAN
60
#elif defined WINDOWS_NATIVE
61
# define WIN32_LEAN_AND_MEAN
69
/* For MB_CUR_MAX_L */
74
#if ENABLE_RELOCATABLE
75
# include "relocatable.h"
77
# define relocate(pathname) (pathname)
82
# include "configmake.h"
85
/* Define O_NOFOLLOW to 0 on platforms where it does not exist. */
90
#if defined _WIN32 || defined __WIN32__ || defined __CYGWIN__ || defined __EMX__ || defined __DJGPP__
91
/* Native Windows, Cygwin, OS/2, DOS */
92
# define ISSLASH(C) ((C) == '/' || (C) == '\\')
95
#ifndef DIRECTORY_SEPARATOR
96
# define DIRECTORY_SEPARATOR '/'
100
# define ISSLASH(C) ((C) == DIRECTORY_SEPARATOR)
103
#if HAVE_DECL_GETC_UNLOCKED
105
# define getc getc_unlocked
108
/* The following static variable is declared 'volatile' to avoid a
109
possible multithread problem in the function get_charset_aliases. If we
110
are running in a threaded environment, and if two threads initialize
111
'charset_aliases' simultaneously, both will produce the same value,
112
and everything will be ok if the two assignments to 'charset_aliases'
113
are atomic. But I don't know what will happen if the two assignments mix. */
115
# define volatile /* empty */
117
/* Pointer to the contents of the charset.alias file, if it has already been
118
read, else NULL. Its format is:
119
ALIAS_1 '\0' CANONICAL_1 '\0' ... ALIAS_n '\0' CANONICAL_n '\0' '\0' */
120
static const char * volatile charset_aliases;
122
/* Return a pointer to the contents of the charset.alias file. */
124
get_charset_aliases (void)
128
cp = charset_aliases;
131
#if !(defined DARWIN7 || defined VMS || defined WINDOWS_NATIVE || defined __CYGWIN__)
133
const char *base = "charset.alias";
136
/* Make it possible to override the charset.alias location. This is
137
necessary for running the testsuite before "make install". */
138
dir = getenv ("CHARSETALIASDIR");
139
if (dir == NULL || dir[0] == '\0')
140
dir = relocate (LIBDIR);
142
/* Concatenate dir and base into freshly allocated file_name. */
144
size_t dir_len = strlen (dir);
145
size_t base_len = strlen (base);
146
int add_slash = (dir_len > 0 && !ISSLASH (dir[dir_len - 1]));
147
file_name = (char *) malloc (dir_len + add_slash + base_len + 1);
148
if (file_name != NULL)
150
memcpy (file_name, dir, dir_len);
152
file_name[dir_len] = DIRECTORY_SEPARATOR;
153
memcpy (file_name + dir_len + add_slash, base, base_len + 1);
157
if (file_name == NULL)
158
/* Out of memory. Treat the file as empty. */
164
/* Open the file. Reject symbolic links on platforms that support
165
O_NOFOLLOW. This is a security feature. Without it, an attacker
166
could retrieve parts of the contents (namely, the tail of the
167
first line that starts with "* ") of an arbitrary file by placing
168
a symbolic link to that file under the name "charset.alias" in
169
some writable directory and defining the environment variable
170
CHARSETALIASDIR to point to that directory. */
171
fd = open (file_name,
172
O_RDONLY | (HAVE_WORKING_O_NOFOLLOW ? O_NOFOLLOW : 0));
174
/* File not found. Treat it as empty. */
180
fp = fdopen (fd, "r");
183
/* Out of memory. Treat the file as empty. */
189
/* Parse the file's contents. */
190
char *res_ptr = NULL;
204
if (c == '\n' || c == ' ' || c == '\t')
208
/* Skip comment, to end of line. */
211
while (!(c == EOF || c == '\n'));
217
if (fscanf (fp, "%50s %50s", buf1, buf2) < 2)
221
old_res_ptr = res_ptr;
224
res_size = l1 + 1 + l2 + 1;
225
res_ptr = (char *) malloc (res_size + 1);
229
res_size += l1 + 1 + l2 + 1;
230
res_ptr = (char *) realloc (res_ptr, res_size + 1);
239
strcpy (res_ptr + res_size - (l2 + 1) - (l1 + 1), buf1);
240
strcpy (res_ptr + res_size - (l2 + 1), buf2);
247
*(res_ptr + res_size) = '\0';
259
/* To avoid the trouble of installing a file that is shared by many
260
GNU packages -- many packaging systems have problems with this --,
261
simply inline the aliases here. */
262
cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
263
"ISO8859-2" "\0" "ISO-8859-2" "\0"
264
"ISO8859-4" "\0" "ISO-8859-4" "\0"
265
"ISO8859-5" "\0" "ISO-8859-5" "\0"
266
"ISO8859-7" "\0" "ISO-8859-7" "\0"
267
"ISO8859-9" "\0" "ISO-8859-9" "\0"
268
"ISO8859-13" "\0" "ISO-8859-13" "\0"
269
"ISO8859-15" "\0" "ISO-8859-15" "\0"
270
"KOI8-R" "\0" "KOI8-R" "\0"
271
"KOI8-U" "\0" "KOI8-U" "\0"
272
"CP866" "\0" "CP866" "\0"
273
"CP949" "\0" "CP949" "\0"
274
"CP1131" "\0" "CP1131" "\0"
275
"CP1251" "\0" "CP1251" "\0"
276
"eucCN" "\0" "GB2312" "\0"
277
"GB2312" "\0" "GB2312" "\0"
278
"eucJP" "\0" "EUC-JP" "\0"
279
"eucKR" "\0" "EUC-KR" "\0"
280
"Big5" "\0" "BIG5" "\0"
281
"Big5HKSCS" "\0" "BIG5-HKSCS" "\0"
282
"GBK" "\0" "GBK" "\0"
283
"GB18030" "\0" "GB18030" "\0"
284
"SJIS" "\0" "SHIFT_JIS" "\0"
285
"ARMSCII-8" "\0" "ARMSCII-8" "\0"
286
"PT154" "\0" "PT154" "\0"
287
/*"ISCII-DEV" "\0" "?" "\0"*/
288
"*" "\0" "UTF-8" "\0";
292
/* To avoid the troubles of an extra file charset.alias_vms in the
293
sources of many GNU packages, simply inline the aliases here. */
294
/* The list of encodings is taken from the OpenVMS 7.3-1 documentation
295
"Compaq C Run-Time Library Reference Manual for OpenVMS systems"
296
section 10.7 "Handling Different Character Sets". */
297
cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
298
"ISO8859-2" "\0" "ISO-8859-2" "\0"
299
"ISO8859-5" "\0" "ISO-8859-5" "\0"
300
"ISO8859-7" "\0" "ISO-8859-7" "\0"
301
"ISO8859-8" "\0" "ISO-8859-8" "\0"
302
"ISO8859-9" "\0" "ISO-8859-9" "\0"
304
"eucJP" "\0" "EUC-JP" "\0"
305
"SJIS" "\0" "SHIFT_JIS" "\0"
306
"DECKANJI" "\0" "DEC-KANJI" "\0"
307
"SDECKANJI" "\0" "EUC-JP" "\0"
309
"eucTW" "\0" "EUC-TW" "\0"
310
"DECHANYU" "\0" "DEC-HANYU" "\0"
311
"DECHANZI" "\0" "GB2312" "\0"
313
"DECKOREAN" "\0" "EUC-KR" "\0";
316
# if defined WINDOWS_NATIVE || defined __CYGWIN__
317
/* To avoid the troubles of installing a separate file in the same
318
directory as the DLL and of retrieving the DLL's directory at
319
runtime, simply inline the aliases here. */
321
cp = "CP936" "\0" "GBK" "\0"
322
"CP1361" "\0" "JOHAB" "\0"
323
"CP20127" "\0" "ASCII" "\0"
324
"CP20866" "\0" "KOI8-R" "\0"
325
"CP20936" "\0" "GB2312" "\0"
326
"CP21866" "\0" "KOI8-RU" "\0"
327
"CP28591" "\0" "ISO-8859-1" "\0"
328
"CP28592" "\0" "ISO-8859-2" "\0"
329
"CP28593" "\0" "ISO-8859-3" "\0"
330
"CP28594" "\0" "ISO-8859-4" "\0"
331
"CP28595" "\0" "ISO-8859-5" "\0"
332
"CP28596" "\0" "ISO-8859-6" "\0"
333
"CP28597" "\0" "ISO-8859-7" "\0"
334
"CP28598" "\0" "ISO-8859-8" "\0"
335
"CP28599" "\0" "ISO-8859-9" "\0"
336
"CP28605" "\0" "ISO-8859-15" "\0"
337
"CP38598" "\0" "ISO-8859-8" "\0"
338
"CP51932" "\0" "EUC-JP" "\0"
339
"CP51936" "\0" "GB2312" "\0"
340
"CP51949" "\0" "EUC-KR" "\0"
341
"CP51950" "\0" "EUC-TW" "\0"
342
"CP54936" "\0" "GB18030" "\0"
343
"CP65001" "\0" "UTF-8" "\0";
347
charset_aliases = cp;
353
/* Determine the current locale's character encoding, and canonicalize it
354
into one of the canonical names listed in config.charset.
355
The result must not be freed; it is statically allocated.
356
If the canonical name cannot be determined, the result is a non-canonical
name. */
363
locale_charset (void)
368
#if !(defined WINDOWS_NATIVE || defined OS2)
370
# if HAVE_LANGINFO_CODESET
372
/* Most systems support nl_langinfo (CODESET) nowadays. */
373
codeset = nl_langinfo (CODESET);
376
/* Cygwin < 1.7 does not have locales. nl_langinfo (CODESET) always
377
returns "US-ASCII". Return the suffix of the locale name from the
378
environment variables (if present) or the codepage as a number. */
379
if (codeset != NULL && strcmp (codeset, "US-ASCII") == 0)
382
static char buf[2 + 10 + 1];
384
locale = getenv ("LC_ALL");
385
if (locale == NULL || locale[0] == '\0')
387
locale = getenv ("LC_CTYPE");
388
if (locale == NULL || locale[0] == '\0')
389
locale = getenv ("LANG");
391
if (locale != NULL && locale[0] != '\0')
393
/* If the locale name contains an encoding after the dot, return
it. */
395
const char *dot = strchr (locale, '.');
399
const char *modifier;
402
/* Look for the possible @... trailer and remove it, if any. */
403
modifier = strchr (dot, '@');
404
if (modifier == NULL)
406
if (modifier - dot < sizeof (buf))
408
memcpy (buf, dot, modifier - dot);
409
buf [modifier - dot] = '\0';
415
/* The Windows API has a function returning the locale's codepage as a
416
number: GetACP(). This encoding is used by Cygwin, unless the user
417
has set the environment variable CYGWIN=codepage:oem (which very few
people do).
419
Output directed to console windows needs to be converted (to
420
GetOEMCP() if the console is using a raster font, or to
421
GetConsoleOutputCP() if it is using a TrueType font). Cygwin does
422
this conversion transparently (see winsup/cygwin/fhandler_console.cc),
423
converting to GetConsoleOutputCP(). This leads to correct results,
424
except when SetConsoleOutputCP has been called and a raster font is
in use. */
426
sprintf (buf, "CP%u", GetACP ());
433
/* On old systems which lack it, use setlocale or getenv. */
434
const char *locale = NULL;
436
/* But most old systems don't have a complete set of locales. Some
437
(like SunOS 4 or DJGPP) have only the C locale. Therefore we don't
438
use setlocale here; it would return "C" when it doesn't support the
439
locale name the user has set. */
441
locale = setlocale (LC_CTYPE, NULL);
443
if (locale == NULL || locale[0] == '\0')
445
locale = getenv ("LC_ALL");
446
if (locale == NULL || locale[0] == '\0')
448
locale = getenv ("LC_CTYPE");
449
if (locale == NULL || locale[0] == '\0')
450
locale = getenv ("LANG");
454
/* On some old systems, one used to set locale = "iso8859_1". On others,
455
you set it to "language_COUNTRY.charset". In any case, we resolve it
456
through the charset.alias file. */
461
#elif defined WINDOWS_NATIVE
463
static char buf[2 + 10 + 1];
465
/* The Windows API has a function returning the locale's codepage as
466
a number, but the value doesn't change according to what the
467
'setlocale' call specified. So we use it as a last resort, in
468
case the string returned by 'setlocale' doesn't specify the
codepage. */
470
char *current_locale = setlocale (LC_ALL, NULL);
473
/* If they set different locales for different categories,
474
'setlocale' will return a semi-colon separated list of locale
475
values. To make sure we use the correct one, we choose LC_CTYPE. */
476
if (strchr (current_locale, ';'))
477
current_locale = setlocale (LC_CTYPE, NULL);
479
pdot = strrchr (current_locale, '.');
481
sprintf (buf, "CP%s", pdot + 1);
484
/* The Windows API has a function returning the locale's codepage as a
number: GetACP().
486
When the output goes to a console window, it needs to be provided in
487
GetOEMCP() encoding if the console is using a raster font, or in
488
GetConsoleOutputCP() encoding if it is using a TrueType font.
489
But in GUI programs and for output sent to files and pipes, GetACP()
490
encoding is the best bet. */
491
sprintf (buf, "CP%u", GetACP ());
498
static char buf[2 + 10 + 1];
502
/* Allow user to override the codeset, as set in the operating system,
503
with standard language environment variables. */
504
locale = getenv ("LC_ALL");
505
if (locale == NULL || locale[0] == '\0')
507
locale = getenv ("LC_CTYPE");
508
if (locale == NULL || locale[0] == '\0')
509
locale = getenv ("LANG");
511
if (locale != NULL && locale[0] != '\0')
513
/* If the locale name contains an encoding after the dot, return it. */
514
const char *dot = strchr (locale, '.');
518
const char *modifier;
521
/* Look for the possible @... trailer and remove it, if any. */
522
modifier = strchr (dot, '@');
523
if (modifier == NULL)
525
if (modifier - dot < sizeof (buf))
527
memcpy (buf, dot, modifier - dot);
528
buf [modifier - dot] = '\0';
533
/* Resolve through the charset.alias file. */
538
/* OS/2 has a function returning the locale's codepage as a number. */
539
if (DosQueryCp (sizeof (cp), cp, &cplen))
543
sprintf (buf, "CP%u", cp[0]);
551
/* The canonical name cannot be determined. */
555
for (aliases = get_charset_aliases ();
557
aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1)
558
if (strcmp (codeset, aliases) == 0
559
|| (aliases[0] == '*' && aliases[1] == '\0'))
561
codeset = aliases + strlen (aliases) + 1;
565
/* Don't return an empty string. GNU libc and GNU libiconv interpret
566
the empty string as denoting "the locale's character encoding",
567
thus GNU libiconv would call this function a second time. */
568
if (codeset[0] == '\0')
572
/* Mac OS X sets MB_CUR_MAX to 1 when LC_ALL=C, and "UTF-8"
573
(the default codeset) does not work when MB_CUR_MAX is 1. */
574
if (strcmp (codeset, "UTF-8") == 0 && MB_CUR_MAX_L (uselocale (NULL)) <= 1)