1
/* Determine a canonical name for the current locale's character encoding.
3
Copyright (C) 2000-2006, 2008-2010 Free Software Foundation, Inc.
5
This program is free software; you can redistribute it and/or modify
6
it under the terms of the GNU General Public License as published by
7
the Free Software Foundation; either version 3, or (at your option)
any later version.
10
This program is distributed in the hope that it will be useful,
11
but WITHOUT ANY WARRANTY; without even the implied warranty of
12
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
GNU General Public License for more details.
15
You should have received a copy of the GNU General Public License along
16
with this program; if not, write to the Free Software Foundation,
17
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
19
/* Written by Bruno Haible <bruno@clisp.org>. */
24
#include "localcharset.h"
32
#if defined __APPLE__ && defined __MACH__ && HAVE_LANGINFO_CODESET
33
# define DARWIN7 /* Darwin 7 or newer, i.e. MacOS X 10.3 or newer */
36
#if defined _WIN32 || defined __WIN32__
41
/* Assume EMX program runs on OS/2, even if compiled under DOS. */
47
#if !defined WIN32_NATIVE
49
# if HAVE_LANGINFO_CODESET
50
# include <langinfo.h>
52
# if 0 /* see comment below */
57
# define WIN32_LEAN_AND_MEAN
60
#elif defined WIN32_NATIVE
61
# define WIN32_LEAN_AND_MEAN
69
#if ENABLE_RELOCATABLE
70
# include "relocatable.h"
72
# define relocate(pathname) (pathname)
77
# include "configmake.h"
80
/* Define O_NOFOLLOW to 0 on platforms where it does not exist. */
85
#if defined _WIN32 || defined __WIN32__ || defined __CYGWIN__ || defined __EMX__ || defined __DJGPP__
86
/* Win32, Cygwin, OS/2, DOS */
87
# define ISSLASH(C) ((C) == '/' || (C) == '\\')
90
#ifndef DIRECTORY_SEPARATOR
91
# define DIRECTORY_SEPARATOR '/'
95
# define ISSLASH(C) ((C) == DIRECTORY_SEPARATOR)
98
#if HAVE_DECL_GETC_UNLOCKED
100
# define getc getc_unlocked
103
/* The following static variable is declared 'volatile' to avoid a
104
possible multithread problem in the function get_charset_aliases. If we
105
are running in a threaded environment, and if two threads initialize
106
'charset_aliases' simultaneously, both will produce the same value,
107
and everything will be ok if the two assignments to 'charset_aliases'
108
are atomic. But I don't know what will happen if the two assignments mix. */
110
# define volatile /* empty */
112
/* Pointer to the contents of the charset.alias file, if it has already been
113
read, else NULL. Its format is:
114
ALIAS_1 '\0' CANONICAL_1 '\0' ... ALIAS_n '\0' CANONICAL_n '\0' '\0' */
115
static const char * volatile charset_aliases;
117
/* Return a pointer to the contents of the charset.alias file. */
119
get_charset_aliases (void)
123
cp = charset_aliases;
126
#if !(defined DARWIN7 || defined VMS || defined WIN32_NATIVE || defined __CYGWIN__)
128
const char *base = "charset.alias";
131
/* Make it possible to override the charset.alias location. This is
132
necessary for running the testsuite before "make install". */
133
dir = getenv ("CHARSETALIASDIR");
134
if (dir == NULL || dir[0] == '\0')
135
dir = relocate (LIBDIR);
137
/* Concatenate dir and base into freshly allocated file_name. */
139
size_t dir_len = strlen (dir);
140
size_t base_len = strlen (base);
141
int add_slash = (dir_len > 0 && !ISSLASH (dir[dir_len - 1]));
142
file_name = (char *) malloc (dir_len + add_slash + base_len + 1);
143
if (file_name != NULL)
145
memcpy (file_name, dir, dir_len);
147
file_name[dir_len] = DIRECTORY_SEPARATOR;
148
memcpy (file_name + dir_len + add_slash, base, base_len + 1);
152
if (file_name == NULL)
153
/* Out of memory. Treat the file as empty. */
159
/* Open the file. Reject symbolic links on platforms that support
160
O_NOFOLLOW. This is a security feature. Without it, an attacker
161
could retrieve parts of the contents (namely, the tail of the
162
first line that starts with "* ") of an arbitrary file by placing
163
a symbolic link to that file under the name "charset.alias" in
164
some writable directory and defining the environment variable
165
CHARSETALIASDIR to point to that directory. */
166
fd = open (file_name,
167
O_RDONLY | (HAVE_WORKING_O_NOFOLLOW ? O_NOFOLLOW : 0));
169
/* File not found. Treat it as empty. */
175
fp = fdopen (fd, "r");
178
/* Out of memory. Treat the file as empty. */
184
/* Parse the file's contents. */
185
char *res_ptr = NULL;
199
if (c == '\n' || c == ' ' || c == '\t')
203
/* Skip comment, to end of line. */
206
while (!(c == EOF || c == '\n'));
212
if (fscanf (fp, "%50s %50s", buf1, buf2) < 2)
216
old_res_ptr = res_ptr;
219
res_size = l1 + 1 + l2 + 1;
220
res_ptr = (char *) malloc (res_size + 1);
224
res_size += l1 + 1 + l2 + 1;
225
res_ptr = (char *) realloc (res_ptr, res_size + 1);
231
if (old_res_ptr != NULL)
235
strcpy (res_ptr + res_size - (l2 + 1) - (l1 + 1), buf1);
236
strcpy (res_ptr + res_size - (l2 + 1), buf2);
243
*(res_ptr + res_size) = '\0';
255
/* To avoid the trouble of installing a file that is shared by many
256
GNU packages -- many packaging systems have problems with this --,
257
simply inline the aliases here. */
258
cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
259
"ISO8859-2" "\0" "ISO-8859-2" "\0"
260
"ISO8859-4" "\0" "ISO-8859-4" "\0"
261
"ISO8859-5" "\0" "ISO-8859-5" "\0"
262
"ISO8859-7" "\0" "ISO-8859-7" "\0"
263
"ISO8859-9" "\0" "ISO-8859-9" "\0"
264
"ISO8859-13" "\0" "ISO-8859-13" "\0"
265
"ISO8859-15" "\0" "ISO-8859-15" "\0"
266
"KOI8-R" "\0" "KOI8-R" "\0"
267
"KOI8-U" "\0" "KOI8-U" "\0"
268
"CP866" "\0" "CP866" "\0"
269
"CP949" "\0" "CP949" "\0"
270
"CP1131" "\0" "CP1131" "\0"
271
"CP1251" "\0" "CP1251" "\0"
272
"eucCN" "\0" "GB2312" "\0"
273
"GB2312" "\0" "GB2312" "\0"
274
"eucJP" "\0" "EUC-JP" "\0"
275
"eucKR" "\0" "EUC-KR" "\0"
276
"Big5" "\0" "BIG5" "\0"
277
"Big5HKSCS" "\0" "BIG5-HKSCS" "\0"
278
"GBK" "\0" "GBK" "\0"
279
"GB18030" "\0" "GB18030" "\0"
280
"SJIS" "\0" "SHIFT_JIS" "\0"
281
"ARMSCII-8" "\0" "ARMSCII-8" "\0"
282
"PT154" "\0" "PT154" "\0"
283
/*"ISCII-DEV" "\0" "?" "\0"*/
284
"*" "\0" "UTF-8" "\0";
288
/* To avoid the troubles of an extra file charset.alias_vms in the
289
sources of many GNU packages, simply inline the aliases here. */
290
/* The list of encodings is taken from the OpenVMS 7.3-1 documentation
291
"Compaq C Run-Time Library Reference Manual for OpenVMS systems"
292
section 10.7 "Handling Different Character Sets". */
293
cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
294
"ISO8859-2" "\0" "ISO-8859-2" "\0"
295
"ISO8859-5" "\0" "ISO-8859-5" "\0"
296
"ISO8859-7" "\0" "ISO-8859-7" "\0"
297
"ISO8859-8" "\0" "ISO-8859-8" "\0"
298
"ISO8859-9" "\0" "ISO-8859-9" "\0"
300
"eucJP" "\0" "EUC-JP" "\0"
301
"SJIS" "\0" "SHIFT_JIS" "\0"
302
"DECKANJI" "\0" "DEC-KANJI" "\0"
303
"SDECKANJI" "\0" "EUC-JP" "\0"
305
"eucTW" "\0" "EUC-TW" "\0"
306
"DECHANYU" "\0" "DEC-HANYU" "\0"
307
"DECHANZI" "\0" "GB2312" "\0"
309
"DECKOREAN" "\0" "EUC-KR" "\0";
312
# if defined WIN32_NATIVE || defined __CYGWIN__
313
/* To avoid the troubles of installing a separate file in the same
314
directory as the DLL and of retrieving the DLL's directory at
315
runtime, simply inline the aliases here. */
317
cp = "CP936" "\0" "GBK" "\0"
318
"CP1361" "\0" "JOHAB" "\0"
319
"CP20127" "\0" "ASCII" "\0"
320
"CP20866" "\0" "KOI8-R" "\0"
321
"CP20936" "\0" "GB2312" "\0"
322
"CP21866" "\0" "KOI8-RU" "\0"
323
"CP28591" "\0" "ISO-8859-1" "\0"
324
"CP28592" "\0" "ISO-8859-2" "\0"
325
"CP28593" "\0" "ISO-8859-3" "\0"
326
"CP28594" "\0" "ISO-8859-4" "\0"
327
"CP28595" "\0" "ISO-8859-5" "\0"
328
"CP28596" "\0" "ISO-8859-6" "\0"
329
"CP28597" "\0" "ISO-8859-7" "\0"
330
"CP28598" "\0" "ISO-8859-8" "\0"
331
"CP28599" "\0" "ISO-8859-9" "\0"
332
"CP28605" "\0" "ISO-8859-15" "\0"
333
"CP38598" "\0" "ISO-8859-8" "\0"
334
"CP51932" "\0" "EUC-JP" "\0"
335
"CP51936" "\0" "GB2312" "\0"
336
"CP51949" "\0" "EUC-KR" "\0"
337
"CP51950" "\0" "EUC-TW" "\0"
338
"CP54936" "\0" "GB18030" "\0"
339
"CP65001" "\0" "UTF-8" "\0";
343
charset_aliases = cp;
349
/* Determine the current locale's character encoding, and canonicalize it
350
into one of the canonical names listed in config.charset.
351
The result must not be freed; it is statically allocated.
352
If the canonical name cannot be determined, the result is a non-canonical
name. */
359
locale_charset (void)
364
#if !(defined WIN32_NATIVE || defined OS2)
366
# if HAVE_LANGINFO_CODESET
368
/* Most systems support nl_langinfo (CODESET) nowadays. */
369
codeset = nl_langinfo (CODESET);
372
/* Cygwin < 1.7 does not have locales. nl_langinfo (CODESET) always
373
returns "US-ASCII". Return the suffix of the locale name from the
374
environment variables (if present) or the codepage as a number. */
375
if (codeset != NULL && strcmp (codeset, "US-ASCII") == 0)
378
static char buf[2 + 10 + 1];
380
locale = getenv ("LC_ALL");
381
if (locale == NULL || locale[0] == '\0')
383
locale = getenv ("LC_CTYPE");
384
if (locale == NULL || locale[0] == '\0')
385
locale = getenv ("LANG");
387
if (locale != NULL && locale[0] != '\0')
389
/* If the locale name contains an encoding after the dot, return it. */
391
const char *dot = strchr (locale, '.');
395
const char *modifier;
398
/* Look for the possible @... trailer and remove it, if any. */
399
modifier = strchr (dot, '@');
400
if (modifier == NULL)
402
if (modifier - dot < sizeof (buf))
404
memcpy (buf, dot, modifier - dot);
405
buf [modifier - dot] = '\0';
411
/* Woe32 has a function returning the locale's codepage as a number:
412
GetACP(). This encoding is used by Cygwin, unless the user has set
413
the environment variable CYGWIN=codepage:oem (which very few people do).
415
Output directed to console windows needs to be converted (to
416
GetOEMCP() if the console is using a raster font, or to
417
GetConsoleOutputCP() if it is using a TrueType font). Cygwin does
418
this conversion transparently (see winsup/cygwin/fhandler_console.cc),
419
converting to GetConsoleOutputCP(). This leads to correct results,
420
except when SetConsoleOutputCP has been called and a raster font is in use. */
422
sprintf (buf, "CP%u", GetACP ());
429
/* On old systems which lack it, use setlocale or getenv. */
430
const char *locale = NULL;
432
/* But most old systems don't have a complete set of locales. Some
433
(like SunOS 4 or DJGPP) have only the C locale. Therefore we don't
434
use setlocale here; it would return "C" when it doesn't support the
435
locale name the user has set. */
437
locale = setlocale (LC_CTYPE, NULL);
439
if (locale == NULL || locale[0] == '\0')
441
locale = getenv ("LC_ALL");
442
if (locale == NULL || locale[0] == '\0')
444
locale = getenv ("LC_CTYPE");
445
if (locale == NULL || locale[0] == '\0')
446
locale = getenv ("LANG");
450
/* On some old systems, one used to set locale = "iso8859_1". On others,
451
you set it to "language_COUNTRY.charset". In any case, we resolve it
452
through the charset.alias file. */
457
#elif defined WIN32_NATIVE
459
static char buf[2 + 10 + 1];
461
/* Woe32 has a function returning the locale's codepage as a number:
GetACP().
463
When the output goes to a console window, it needs to be provided in
464
GetOEMCP() encoding if the console is using a raster font, or in
465
GetConsoleOutputCP() encoding if it is using a TrueType font.
466
But in GUI programs and for output sent to files and pipes, GetACP()
467
encoding is the best bet. */
468
sprintf (buf, "CP%u", GetACP ());
474
static char buf[2 + 10 + 1];
478
/* Allow user to override the codeset, as set in the operating system,
479
with standard language environment variables. */
480
locale = getenv ("LC_ALL");
481
if (locale == NULL || locale[0] == '\0')
483
locale = getenv ("LC_CTYPE");
484
if (locale == NULL || locale[0] == '\0')
485
locale = getenv ("LANG");
487
if (locale != NULL && locale[0] != '\0')
489
/* If the locale name contains an encoding after the dot, return it. */
490
const char *dot = strchr (locale, '.');
494
const char *modifier;
497
/* Look for the possible @... trailer and remove it, if any. */
498
modifier = strchr (dot, '@');
499
if (modifier == NULL)
501
if (modifier - dot < sizeof (buf))
503
memcpy (buf, dot, modifier - dot);
504
buf [modifier - dot] = '\0';
509
/* Resolve through the charset.alias file. */
514
/* OS/2 has a function returning the locale's codepage as a number. */
515
if (DosQueryCp (sizeof (cp), cp, &cplen))
519
sprintf (buf, "CP%u", cp[0]);
527
/* The canonical name cannot be determined. */
531
for (aliases = get_charset_aliases ();
533
aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1)
534
if (strcmp (codeset, aliases) == 0
535
|| (aliases[0] == '*' && aliases[1] == '\0'))
537
codeset = aliases + strlen (aliases) + 1;
541
/* Don't return an empty string. GNU libc and GNU libiconv interpret
542
the empty string as denoting "the locale's character encoding",
543
thus GNU libiconv would call this function a second time. */
544
if (codeset[0] == '\0')