1
/* Determine a canonical name for the current locale's character encoding.
3
Copyright (C) 2000-2006, 2008-2012 Free Software Foundation, Inc.
5
This program is free software; you can redistribute it and/or modify
6
it under the terms of the GNU General Public License as published by
7
the Free Software Foundation; either version 3, or (at your option)
any later version.
10
This program is distributed in the hope that it will be useful,
11
but WITHOUT ANY WARRANTY; without even the implied warranty of
12
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
GNU General Public License for more details.
15
You should have received a copy of the GNU General Public License along
16
with this program; if not, see <http://www.gnu.org/licenses/>. */
18
/* Written by Bruno Haible <bruno@clisp.org>. */
23
#include "localcharset.h"
31
#if defined __APPLE__ && defined __MACH__ && HAVE_LANGINFO_CODESET
32
# define DARWIN7 /* Darwin 7 or newer, i.e. MacOS X 10.3 or newer */
35
#if defined _WIN32 || defined __WIN32__
36
# define WINDOWS_NATIVE
40
/* Assume EMX program runs on OS/2, even if compiled under DOS. */
46
#if !defined WINDOWS_NATIVE
48
# if HAVE_LANGINFO_CODESET
49
# include <langinfo.h>
51
# if 0 /* see comment below */
56
# define WIN32_LEAN_AND_MEAN
59
#elif defined WINDOWS_NATIVE
60
# define WIN32_LEAN_AND_MEAN
68
#if ENABLE_RELOCATABLE
69
# include "relocatable.h"
71
# define relocate(pathname) (pathname)
76
# include "configmake.h"
79
/* Define O_NOFOLLOW to 0 on platforms where it does not exist. */
84
#if defined _WIN32 || defined __WIN32__ || defined __CYGWIN__ || defined __EMX__ || defined __DJGPP__
85
/* Native Windows, Cygwin, OS/2, DOS */
86
# define ISSLASH(C) ((C) == '/' || (C) == '\\')
89
#ifndef DIRECTORY_SEPARATOR
90
# define DIRECTORY_SEPARATOR '/'
94
# define ISSLASH(C) ((C) == DIRECTORY_SEPARATOR)
97
#if HAVE_DECL_GETC_UNLOCKED
99
# define getc getc_unlocked
102
/* The following static variable is declared 'volatile' to avoid a
103
possible multithread problem in the function get_charset_aliases. If we
104
are running in a threaded environment, and if two threads initialize
105
'charset_aliases' simultaneously, both will produce the same value,
106
and everything will be ok if the two assignments to 'charset_aliases'
107
are atomic. But I don't know what will happen if the two assignments mix. */
109
# define volatile /* empty */
111
/* Pointer to the contents of the charset.alias file, if it has already been
112
read, else NULL. Its format is:
113
ALIAS_1 '\0' CANONICAL_1 '\0' ... ALIAS_n '\0' CANONICAL_n '\0' '\0' */
114
static const char * volatile charset_aliases;
116
/* Return a pointer to the contents of the charset.alias file. */
118
get_charset_aliases (void)
122
cp = charset_aliases;
125
#if !(defined DARWIN7 || defined VMS || defined WINDOWS_NATIVE || defined __CYGWIN__)
127
const char *base = "charset.alias";
130
/* Make it possible to override the charset.alias location. This is
131
necessary for running the testsuite before "make install". */
132
dir = getenv ("CHARSETALIASDIR");
133
if (dir == NULL || dir[0] == '\0')
134
dir = relocate (LIBDIR);
136
/* Concatenate dir and base into freshly allocated file_name. */
138
size_t dir_len = strlen (dir);
139
size_t base_len = strlen (base);
140
int add_slash = (dir_len > 0 && !ISSLASH (dir[dir_len - 1]));
141
file_name = (char *) malloc (dir_len + add_slash + base_len + 1);
142
if (file_name != NULL)
144
memcpy (file_name, dir, dir_len);
146
file_name[dir_len] = DIRECTORY_SEPARATOR;
147
memcpy (file_name + dir_len + add_slash, base, base_len + 1);
151
if (file_name == NULL)
152
/* Out of memory. Treat the file as empty. */
158
/* Open the file. Reject symbolic links on platforms that support
159
O_NOFOLLOW. This is a security feature. Without it, an attacker
160
could retrieve parts of the contents (namely, the tail of the
161
first line that starts with "* ") of an arbitrary file by placing
162
a symbolic link to that file under the name "charset.alias" in
163
some writable directory and defining the environment variable
164
CHARSETALIASDIR to point to that directory. */
165
fd = open (file_name,
166
O_RDONLY | (HAVE_WORKING_O_NOFOLLOW ? O_NOFOLLOW : 0));
168
/* File not found. Treat it as empty. */
174
fp = fdopen (fd, "r");
177
/* Out of memory. Treat the file as empty. */
183
/* Parse the file's contents. */
184
char *res_ptr = NULL;
198
if (c == '\n' || c == ' ' || c == '\t')
202
/* Skip comment, to end of line. */
205
while (!(c == EOF || c == '\n'));
211
if (fscanf (fp, "%50s %50s", buf1, buf2) < 2)
215
old_res_ptr = res_ptr;
218
res_size = l1 + 1 + l2 + 1;
219
res_ptr = (char *) malloc (res_size + 1);
223
res_size += l1 + 1 + l2 + 1;
224
res_ptr = (char *) realloc (res_ptr, res_size + 1);
233
strcpy (res_ptr + res_size - (l2 + 1) - (l1 + 1), buf1);
234
strcpy (res_ptr + res_size - (l2 + 1), buf2);
241
*(res_ptr + res_size) = '\0';
253
/* To avoid the trouble of installing a file that is shared by many
254
GNU packages -- many packaging systems have problems with this --,
255
simply inline the aliases here. */
256
cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
257
"ISO8859-2" "\0" "ISO-8859-2" "\0"
258
"ISO8859-4" "\0" "ISO-8859-4" "\0"
259
"ISO8859-5" "\0" "ISO-8859-5" "\0"
260
"ISO8859-7" "\0" "ISO-8859-7" "\0"
261
"ISO8859-9" "\0" "ISO-8859-9" "\0"
262
"ISO8859-13" "\0" "ISO-8859-13" "\0"
263
"ISO8859-15" "\0" "ISO-8859-15" "\0"
264
"KOI8-R" "\0" "KOI8-R" "\0"
265
"KOI8-U" "\0" "KOI8-U" "\0"
266
"CP866" "\0" "CP866" "\0"
267
"CP949" "\0" "CP949" "\0"
268
"CP1131" "\0" "CP1131" "\0"
269
"CP1251" "\0" "CP1251" "\0"
270
"eucCN" "\0" "GB2312" "\0"
271
"GB2312" "\0" "GB2312" "\0"
272
"eucJP" "\0" "EUC-JP" "\0"
273
"eucKR" "\0" "EUC-KR" "\0"
274
"Big5" "\0" "BIG5" "\0"
275
"Big5HKSCS" "\0" "BIG5-HKSCS" "\0"
276
"GBK" "\0" "GBK" "\0"
277
"GB18030" "\0" "GB18030" "\0"
278
"SJIS" "\0" "SHIFT_JIS" "\0"
279
"ARMSCII-8" "\0" "ARMSCII-8" "\0"
280
"PT154" "\0" "PT154" "\0"
281
/*"ISCII-DEV" "\0" "?" "\0"*/
282
"*" "\0" "UTF-8" "\0";
286
/* To avoid the troubles of an extra file charset.alias_vms in the
287
sources of many GNU packages, simply inline the aliases here. */
288
/* The list of encodings is taken from the OpenVMS 7.3-1 documentation
289
"Compaq C Run-Time Library Reference Manual for OpenVMS systems"
290
section 10.7 "Handling Different Character Sets". */
291
cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
292
"ISO8859-2" "\0" "ISO-8859-2" "\0"
293
"ISO8859-5" "\0" "ISO-8859-5" "\0"
294
"ISO8859-7" "\0" "ISO-8859-7" "\0"
295
"ISO8859-8" "\0" "ISO-8859-8" "\0"
296
"ISO8859-9" "\0" "ISO-8859-9" "\0"
298
"eucJP" "\0" "EUC-JP" "\0"
299
"SJIS" "\0" "SHIFT_JIS" "\0"
300
"DECKANJI" "\0" "DEC-KANJI" "\0"
301
"SDECKANJI" "\0" "EUC-JP" "\0"
303
"eucTW" "\0" "EUC-TW" "\0"
304
"DECHANYU" "\0" "DEC-HANYU" "\0"
305
"DECHANZI" "\0" "GB2312" "\0"
307
"DECKOREAN" "\0" "EUC-KR" "\0";
310
# if defined WINDOWS_NATIVE || defined __CYGWIN__
311
/* To avoid the troubles of installing a separate file in the same
312
directory as the DLL and of retrieving the DLL's directory at
313
runtime, simply inline the aliases here. */
315
cp = "CP936" "\0" "GBK" "\0"
316
"CP1361" "\0" "JOHAB" "\0"
317
"CP20127" "\0" "ASCII" "\0"
318
"CP20866" "\0" "KOI8-R" "\0"
319
"CP20936" "\0" "GB2312" "\0"
320
"CP21866" "\0" "KOI8-RU" "\0"
321
"CP28591" "\0" "ISO-8859-1" "\0"
322
"CP28592" "\0" "ISO-8859-2" "\0"
323
"CP28593" "\0" "ISO-8859-3" "\0"
324
"CP28594" "\0" "ISO-8859-4" "\0"
325
"CP28595" "\0" "ISO-8859-5" "\0"
326
"CP28596" "\0" "ISO-8859-6" "\0"
327
"CP28597" "\0" "ISO-8859-7" "\0"
328
"CP28598" "\0" "ISO-8859-8" "\0"
329
"CP28599" "\0" "ISO-8859-9" "\0"
330
"CP28605" "\0" "ISO-8859-15" "\0"
331
"CP38598" "\0" "ISO-8859-8" "\0"
332
"CP51932" "\0" "EUC-JP" "\0"
333
"CP51936" "\0" "GB2312" "\0"
334
"CP51949" "\0" "EUC-KR" "\0"
335
"CP51950" "\0" "EUC-TW" "\0"
336
"CP54936" "\0" "GB18030" "\0"
337
"CP65001" "\0" "UTF-8" "\0";
341
charset_aliases = cp;
347
/* Determine the current locale's character encoding, and canonicalize it
348
into one of the canonical names listed in config.charset.
349
The result must not be freed; it is statically allocated.
350
If the canonical name cannot be determined, the result is a non-canonical
name. */
357
locale_charset (void)
362
#if !(defined WINDOWS_NATIVE || defined OS2)
364
# if HAVE_LANGINFO_CODESET
366
/* Most systems support nl_langinfo (CODESET) nowadays. */
367
codeset = nl_langinfo (CODESET);
370
/* Cygwin < 1.7 does not have locales. nl_langinfo (CODESET) always
371
returns "US-ASCII". Return the suffix of the locale name from the
372
environment variables (if present) or the codepage as a number. */
373
if (codeset != NULL && strcmp (codeset, "US-ASCII") == 0)
376
static char buf[2 + 10 + 1];
378
locale = getenv ("LC_ALL");
379
if (locale == NULL || locale[0] == '\0')
381
locale = getenv ("LC_CTYPE");
382
if (locale == NULL || locale[0] == '\0')
383
locale = getenv ("LANG");
385
if (locale != NULL && locale[0] != '\0')
387
/* If the locale name contains an encoding after the dot, return it. */
389
const char *dot = strchr (locale, '.');
393
const char *modifier;
396
/* Look for the possible @... trailer and remove it, if any. */
397
modifier = strchr (dot, '@');
398
if (modifier == NULL)
400
if (modifier - dot < sizeof (buf))
402
memcpy (buf, dot, modifier - dot);
403
buf [modifier - dot] = '\0';
409
/* The Windows API has a function returning the locale's codepage as a
410
number: GetACP(). This encoding is used by Cygwin, unless the user
411
has set the environment variable CYGWIN=codepage:oem (which very few
people do).
413
Output directed to console windows needs to be converted (to
414
GetOEMCP() if the console is using a raster font, or to
415
GetConsoleOutputCP() if it is using a TrueType font). Cygwin does
416
this conversion transparently (see winsup/cygwin/fhandler_console.cc),
417
converting to GetConsoleOutputCP(). This leads to correct results,
418
except when SetConsoleOutputCP has been called and a raster font is
in use. */
420
sprintf (buf, "CP%u", GetACP ());
427
/* On old systems which lack it, use setlocale or getenv. */
428
const char *locale = NULL;
430
/* But most old systems don't have a complete set of locales. Some
431
(like SunOS 4 or DJGPP) have only the C locale. Therefore we don't
432
use setlocale here; it would return "C" when it doesn't support the
433
locale name the user has set. */
435
locale = setlocale (LC_CTYPE, NULL);
437
if (locale == NULL || locale[0] == '\0')
439
locale = getenv ("LC_ALL");
440
if (locale == NULL || locale[0] == '\0')
442
locale = getenv ("LC_CTYPE");
443
if (locale == NULL || locale[0] == '\0')
444
locale = getenv ("LANG");
448
/* On some old systems, one used to set locale = "iso8859_1". On others,
449
you set it to "language_COUNTRY.charset". In any case, we resolve it
450
through the charset.alias file. */
455
#elif defined WINDOWS_NATIVE
457
static char buf[2 + 10 + 1];
459
/* The Windows API has a function returning the locale's codepage as a
number: GetACP().
461
When the output goes to a console window, it needs to be provided in
462
GetOEMCP() encoding if the console is using a raster font, or in
463
GetConsoleOutputCP() encoding if it is using a TrueType font.
464
But in GUI programs and for output sent to files and pipes, GetACP()
465
encoding is the best bet. */
466
sprintf (buf, "CP%u", GetACP ());
472
static char buf[2 + 10 + 1];
476
/* Allow user to override the codeset, as set in the operating system,
477
with standard language environment variables. */
478
locale = getenv ("LC_ALL");
479
if (locale == NULL || locale[0] == '\0')
481
locale = getenv ("LC_CTYPE");
482
if (locale == NULL || locale[0] == '\0')
483
locale = getenv ("LANG");
485
if (locale != NULL && locale[0] != '\0')
487
/* If the locale name contains an encoding after the dot, return it. */
488
const char *dot = strchr (locale, '.');
492
const char *modifier;
495
/* Look for the possible @... trailer and remove it, if any. */
496
modifier = strchr (dot, '@');
497
if (modifier == NULL)
499
if (modifier - dot < sizeof (buf))
501
memcpy (buf, dot, modifier - dot);
502
buf [modifier - dot] = '\0';
507
/* Resolve through the charset.alias file. */
512
/* OS/2 has a function returning the locale's codepage as a number. */
513
if (DosQueryCp (sizeof (cp), cp, &cplen))
517
sprintf (buf, "CP%u", cp[0]);
525
/* The canonical name cannot be determined. */
529
for (aliases = get_charset_aliases ();
531
aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1)
532
if (strcmp (codeset, aliases) == 0
533
|| (aliases[0] == '*' && aliases[1] == '\0'))
535
codeset = aliases + strlen (aliases) + 1;
539
/* Don't return an empty string. GNU libc and GNU libiconv interpret
540
the empty string as denoting "the locale's character encoding",
541
thus GNU libiconv would call this function a second time. */
542
if (codeset[0] == '\0')